ring-0.17.14/.cargo_vcs_info.json0000644000000001610000000000100121430ustar { "git": { "sha1": "2723abbca9e83347d82b056d5b239c6604f786df", "dirty": true }, "path_in_vcs": "" }ring-0.17.14/Cargo.lock0000644000000225650000000000100101320ustar # This file is automatically @generated by Cargo. # It is not intended for manual editing. version = 3 [[package]] name = "bumpalo" version = "3.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c59e7af012c713f529e7a3ee57ce9b31ddd858d4b512923602f74608b009631" [[package]] name = "cc" version = "1.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c7777341816418c02e033934a09f20dc0ccaf65a5201ef8a450ae0105a573fda" dependencies = [ "shlex", ] [[package]] name = "cfg-if" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "getrandom" version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ "cfg-if", "js-sys", "libc", "wasi", "wasm-bindgen", ] [[package]] name = "js-sys" version = "0.3.74" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a865e038f7f6ed956f788f0d7d60c541fff74c7bd74272c5d4cf15c63743e705" dependencies = [ "once_cell", "wasm-bindgen", ] [[package]] name = "libc" version = "0.2.169" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" [[package]] name = "log" version = "0.4.25" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04cbf5b083de1c7e0222a7a51dbfdba1cbe1c6ab0b15e29fff3f6c077fd9cd9f" [[package]] name = "minicov" version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f27fe9f1cc3c22e1687f9446c2083c4c5fc7f0bcf1c7a86bdbded14985895b4b" dependencies = [ "cc", "walkdir", ] [[package]] name = "once_cell" version = "1.20.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "945462a4b81e43c4e3ba96bd7b49d834c6f61198356aa858733bc4acf3cbe62e" [[package]] name = "proc-macro2" version = "1.0.93" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60946a68e5f9d28b0dc1c21bb8a97ee7d018a8b322fa57838ba31cc878e22d99" dependencies = [ "unicode-ident", ] [[package]] name = "quote" version = "1.0.38" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc" dependencies = [ "proc-macro2", ] [[package]] name = "ring" version = "0.17.14" dependencies = [ "cc", "cfg-if", "getrandom", "libc", "untrusted", "wasm-bindgen-test", "windows-sys 0.52.0", ] [[package]] name = "same-file" version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" dependencies = [ "winapi-util", ] [[package]] name = "scoped-tls" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e1cf6437eb19a8f4a6cc0f7dca544973b0b78843adbfeb3683d1a94a0024a294" [[package]] name = "shlex" version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "syn" version = "2.0.98" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "36147f1a48ae0ec2b5b3bc5b537d267457555a10dc06f3dbc8cb11ba3006d3b1" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] [[package]] name = "unicode-ident" version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a210d160f08b701c8721ba1c726c11662f877ea6b7094007e1ca9a1041945034" [[package]] name = "untrusted" version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "walkdir" version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" dependencies = [ "same-file", "winapi-util", ] [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" version = "0.2.97" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d15e63b4482863c109d70a7b8706c1e364eb6ea449b201a76c5b89cedcec2d5c" dependencies = [ "cfg-if", "once_cell", "wasm-bindgen-macro", ] [[package]] name = "wasm-bindgen-backend" version = "0.2.97" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8d36ef12e3aaca16ddd3f67922bc63e48e953f126de60bd33ccc0101ef9998cd" dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", "syn", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" version = "0.4.47" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9dfaf8f50e5f293737ee323940c7d8b08a66a95a419223d9f41610ca08b0833d" dependencies = [ "cfg-if", "js-sys", "once_cell", "wasm-bindgen", "web-sys", ] [[package]] name = "wasm-bindgen-macro" version = "0.2.97" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "705440e08b42d3e4b36de7d66c944be628d579796b8090bfa3471478a2260051" dependencies = [ "quote", "wasm-bindgen-macro-support", ] [[package]] name = "wasm-bindgen-macro-support" version = "0.2.97" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "98c9ae5a76e46f4deecd0f0255cc223cfa18dc9b261213b8aa0c7b36f61b3f1d" dependencies = [ "proc-macro2", "quote", "syn", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" version = "0.2.97" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ee99da9c5ba11bd675621338ef6fa52296b76b83305e9b6e5c77d4c286d6d49" [[package]] name = "wasm-bindgen-test" version = "0.3.47" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3d919bb60ebcecb9160afee6c71b43a58a4f0517a2de0054cd050d02cec08201" dependencies = [ "js-sys", "minicov", "once_cell", "scoped-tls", "wasm-bindgen", "wasm-bindgen-futures", "wasm-bindgen-test-macro", ] [[package]] name = "wasm-bindgen-test-macro" version = "0.3.47" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "222ebde6ea87fbfa6bdd2e9f1fd8a91d60aee5db68792632176c4e16a74fc7d8" dependencies = [ "proc-macro2", "quote", "syn", ] [[package]] name = "web-sys" version = "0.3.74" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a98bc3c33f0fe7e59ad7cd041b89034fa82a7c2d4365ca538dda6cdaf513863c" dependencies = [ "js-sys", "wasm-bindgen", ] [[package]] name = "winapi-util" version = "0.1.9" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ "windows-sys 0.59.0", ] [[package]] name = "windows-sys" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ "windows-targets", ] [[package]] name = "windows-sys" version = "0.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" dependencies = [ "windows-targets", ] [[package]] name = "windows-targets" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ "windows_aarch64_gnullvm", "windows_aarch64_msvc", "windows_i686_gnu", "windows_i686_gnullvm", "windows_i686_msvc", "windows_x86_64_gnu", "windows_x86_64_gnullvm", "windows_x86_64_msvc", ] [[package]] name = "windows_aarch64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" [[package]] name = "windows_aarch64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" [[package]] name = "windows_i686_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" [[package]] name = "windows_i686_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" [[package]] name = "windows_i686_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" [[package]] name = "windows_x86_64_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" [[package]] name = "windows_x86_64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" ring-0.17.14/Cargo.toml0000644000000207500000000000100101470ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. 
[package] edition = "2021" rust-version = "1.66.0" name = "ring" version = "0.17.14" build = "build.rs" links = "ring_core_0_17_14_" include = [ "LICENSE", "LICENSE-other-bits", "LICENSE-BoringSSL", "src/polyfill/once_cell/LICENSE-APACHE", "src/polyfill/once_cell/LICENSE-MIT", "Cargo.toml", "pregenerated/*", "benches/*.rs", "build.rs", "crypto/chacha/asm/chacha-armv4.pl", "crypto/chacha/asm/chacha-armv8.pl", "crypto/chacha/asm/chacha-x86.pl", "crypto/chacha/asm/chacha-x86_64.pl", "crypto/constant_time_test.c", "crypto/cpu_intel.c", "crypto/crypto.c", "crypto/curve25519/asm/x25519-asm-arm.S", "crypto/curve25519/curve25519.c", "crypto/curve25519/curve25519_64_adx.c", "crypto/curve25519/curve25519_tables.h", "crypto/curve25519/internal.h", "crypto/fipsmodule/aes/aes_nohw.c", "crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl", "crypto/fipsmodule/aes/asm/aesni-x86.pl", "crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl", "crypto/fipsmodule/aes/asm/aesni-x86_64.pl", "crypto/fipsmodule/aes/asm/aesv8-armx.pl", "crypto/fipsmodule/aes/asm/aesv8-gcm-armv8.pl", "crypto/fipsmodule/aes/asm/ghash-armv4.pl", "crypto/fipsmodule/aes/asm/ghash-neon-armv8.pl", "crypto/fipsmodule/aes/asm/ghash-x86.pl", "crypto/fipsmodule/aes/asm/ghash-x86_64.pl", "crypto/fipsmodule/aes/asm/ghashv8-armx.pl", "crypto/fipsmodule/aes/asm/bsaes-armv7.pl", "crypto/fipsmodule/aes/asm/bsaes-x86_64.pl", "crypto/fipsmodule/aes/asm/vsaes-armv7.pl", "crypto/fipsmodule/aes/asm/vpaes-armv7.pl", "crypto/fipsmodule/aes/asm/vpaes-armv8.pl", "crypto/fipsmodule/aes/asm/vpaes-x86.pl", "crypto/fipsmodule/aes/asm/vpaes-x86_64.pl", "crypto/fipsmodule/bn/asm/armv4-mont.pl", "crypto/fipsmodule/bn/asm/armv8-mont.pl", "crypto/fipsmodule/bn/asm/x86-mont.pl", "crypto/fipsmodule/bn/asm/x86_64-mont.pl", "crypto/fipsmodule/bn/asm/x86_64-mont5.pl", "crypto/fipsmodule/bn/internal.h", "crypto/fipsmodule/bn/montgomery.c", "crypto/fipsmodule/bn/montgomery_inv.c", "crypto/fipsmodule/ec/asm/p256-armv8-asm.pl", "crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl", "crypto/fipsmodule/ec/ecp_nistz.c", "crypto/fipsmodule/ec/ecp_nistz.h", "crypto/fipsmodule/ec/ecp_nistz384.h", "crypto/fipsmodule/ec/ecp_nistz384.inl", "crypto/fipsmodule/ec/gfp_p256.c", "crypto/fipsmodule/ec/gfp_p384.c", "crypto/fipsmodule/ec/p256.c", "crypto/fipsmodule/ec/p256-nistz-table.h", "crypto/fipsmodule/ec/p256-nistz.c", "crypto/fipsmodule/ec/p256-nistz.h", "crypto/fipsmodule/ec/p256_shared.h", "crypto/fipsmodule/ec/p256_table.h", "crypto/fipsmodule/ec/util.h", "crypto/fipsmodule/ecdsa/ecdsa_verify_tests.txt", "crypto/fipsmodule/sha/asm/sha256-armv4.pl", "crypto/fipsmodule/sha/asm/sha512-armv4.pl", "crypto/fipsmodule/sha/asm/sha512-armv8.pl", "crypto/fipsmodule/sha/asm/sha512-x86_64.pl", "crypto/internal.h", "crypto/limbs/limbs.c", "crypto/limbs/limbs.h", "crypto/limbs/limbs.inl", "crypto/mem.c", "crypto/perlasm/arm-xlate.pl", "crypto/perlasm/x86asm.pl", "crypto/perlasm/x86gas.pl", "crypto/perlasm/x86nasm.pl", "crypto/perlasm/x86_64-xlate.pl", "crypto/poly1305/poly1305.c", "crypto/poly1305/poly1305_arm.c", "crypto/poly1305/poly1305_arm_asm.S", "crypto/cipher/asm/chacha20_poly1305_armv8.pl", "crypto/cipher/asm/chacha20_poly1305_x86_64.pl", "examples/**/*.rs", "include/ring-core/aes.h", "include/ring-core/asm_base.h", "include/ring-core/base.h", "include/ring-core/check.h", "include/ring-core/mem.h", "include/ring-core/target.h", "include/ring-core/type_check.h", "src/**/*.rs", "src/aead/poly1305_test.txt", "src/data/alg-rsa-encryption.der", "src/ec/curve25519/ed25519/ed25519_pkcs8_v2_template.der", 
"src/ec/suite_b/ecdsa/ecPublicKey_p256_pkcs8_v1_template.der", "src/ec/suite_b/ecdsa/ecPublicKey_p384_pkcs8_v1_template.der", "src/rsa/signature_rsa_example_private_key.der", "src/rsa/signature_rsa_example_public_key.der", "tests/**/*.rs", "tests/ecdsa_test_private_key_p256.p8", "tests/ecdsa_test_public_key_p256.der", "tests/ecdsa_test_public_key_p256_debug.txt", "tests/ed25519_test_private_key.bin", "tests/ed25519_test_private_key.p8", "tests/ed25519_test_public_key.bin", "tests/ed25519_test_public_key.der", "tests/rsa_test_private_key_2048.p8", "tests/rsa_test_public_key_2048.der", "tests/rsa_test_public_key_2048_debug.txt", "tests/rsa_test_public_modulus.bin", "third_party/fiat/asm/fiat_curve25519_adx_mul.S", "third_party/fiat/asm/fiat_curve25519_adx_square.S", "third_party/fiat/curve25519_32.h", "third_party/fiat/curve25519_64.h", "third_party/fiat/curve25519_64_adx.h", "third_party/fiat/curve25519_64_msvc.h", "third_party/fiat/p256_32.h", "third_party/fiat/p256_64.h", "third_party/fiat/p256_64_msvc.h", "third_party/fiat/LICENSE", ] autolib = false autobins = false autoexamples = false autotests = false autobenches = false description = "An experiment." readme = "README.md" keywords = [ "crypto", "cryptography", "rand", "ECC", "RSA", ] categories = [ "cryptography", "no-std", ] license = "Apache-2.0 AND ISC" repository = "https://github.com/briansmith/ring" [package.metadata.docs.rs] all-features = true [features] alloc = [] default = [ "alloc", "dev_urandom_fallback", ] dev_urandom_fallback = [] less-safe-getrandom-custom-or-rdrand = [] less-safe-getrandom-espidf = [] slow_tests = [] std = ["alloc"] test_logging = [] unstable-testing-arm-no-hw = [] unstable-testing-arm-no-neon = [] wasm32_unknown_unknown_js = ["getrandom/js"] [lib] name = "ring" path = "src/lib.rs" [[test]] name = "aead_tests" path = "tests/aead_tests.rs" [[test]] name = "agreement_tests" path = "tests/agreement_tests.rs" [[test]] name = "constant_time_tests" path = "tests/constant_time_tests.rs" [[test]] name = "digest_tests" path = "tests/digest_tests.rs" [[test]] name = "ecdsa_tests" path = "tests/ecdsa_tests.rs" [[test]] name = "ed25519_tests" path = "tests/ed25519_tests.rs" [[test]] name = "error_tests" path = "tests/error_tests.rs" [[test]] name = "hkdf_tests" path = "tests/hkdf_tests.rs" [[test]] name = "hmac_tests" path = "tests/hmac_tests.rs" [[test]] name = "pbkdf2_tests" path = "tests/pbkdf2_tests.rs" [[test]] name = "quic_tests" path = "tests/quic_tests.rs" [[test]] name = "rand_tests" path = "tests/rand_tests.rs" [[test]] name = "rsa_tests" path = "tests/rsa_tests.rs" [[test]] name = "signature_tests" path = "tests/signature_tests.rs" [dependencies.cfg-if] version = "1.0.0" default-features = false [dependencies.getrandom] version = "0.2.10" [dependencies.untrusted] version = "0.9" [build-dependencies.cc] version = "1.2.8" default-features = false [target.'cfg(all(all(target_arch = "aarch64", target_endian = "little"), target_os = "windows"))'.dependencies.windows-sys] version = "0.52" features = [ "Win32_Foundation", "Win32_System_Threading", ] [target.'cfg(all(all(target_arch = "aarch64", target_endian = "little"), target_vendor = "apple", any(target_os = "ios", target_os = "macos", target_os = "tvos", target_os = "visionos", target_os = "watchos")))'.dependencies.libc] version = "0.2.155" default-features = false [target.'cfg(all(any(all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little")), any(target_os = "android", target_os = 
"linux")))'.dependencies.libc] version = "0.2.148" default-features = false [target.'cfg(all(target_arch = "wasm32", target_os = "unknown"))'.dev-dependencies.wasm-bindgen-test] version = "0.3.37" features = ["std"] default-features = false [target.'cfg(any(unix, windows, target_os = "wasi"))'.dev-dependencies.libc] version = "0.2.148" default-features = false [profile.bench] opt-level = 3 lto = true codegen-units = 1 debug = 0 debug-assertions = false rpath = false [profile.release] opt-level = 3 lto = true codegen-units = 1 debug = 0 debug-assertions = false rpath = false ring-0.17.14/Cargo.toml.orig000064400000000000000000000175571046102023000136430ustar 00000000000000[package] build = "build.rs" categories = ["cryptography", "no-std"] description = "An experiment." edition = "2021" keywords = ["crypto", "cryptography", "rand", "ECC", "RSA"] license = "Apache-2.0 AND ISC" name = "ring" repository = "https://github.com/briansmith/ring" # Keep in sync with .github/workflows/ci.yml ("MSRV") and see the MSRV note # in cpu/arm.rs. # 1.66 is required on x86/x86_64 for https://github.com/rust-lang/rust/pull/101861. rust-version = "1.66.0" # Keep in sync with `links` below. version = "0.17.14" # Keep in sync with `version` above. # # build.rs verifies that this equals "ring_core_{major}_{minor}_{patch}_{pre}" # as keeping this in sync with the symbol prefixing is crucial for ensuring # the safety of multiple versions of *ring* being used in a program. links = "ring_core_0_17_14_" include = [ "LICENSE", "LICENSE-other-bits", "LICENSE-BoringSSL", "src/polyfill/once_cell/LICENSE-APACHE", "src/polyfill/once_cell/LICENSE-MIT", "Cargo.toml", "pregenerated/*", "benches/*.rs", "build.rs", "crypto/chacha/asm/chacha-armv4.pl", "crypto/chacha/asm/chacha-armv8.pl", "crypto/chacha/asm/chacha-x86.pl", "crypto/chacha/asm/chacha-x86_64.pl", "crypto/constant_time_test.c", "crypto/cpu_intel.c", "crypto/crypto.c", "crypto/curve25519/asm/x25519-asm-arm.S", "crypto/curve25519/curve25519.c", "crypto/curve25519/curve25519_64_adx.c", "crypto/curve25519/curve25519_tables.h", "crypto/curve25519/internal.h", "crypto/fipsmodule/aes/aes_nohw.c", "crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl", "crypto/fipsmodule/aes/asm/aesni-x86.pl", "crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl", "crypto/fipsmodule/aes/asm/aesni-x86_64.pl", "crypto/fipsmodule/aes/asm/aesv8-armx.pl", "crypto/fipsmodule/aes/asm/aesv8-gcm-armv8.pl", "crypto/fipsmodule/aes/asm/ghash-armv4.pl", "crypto/fipsmodule/aes/asm/ghash-neon-armv8.pl", "crypto/fipsmodule/aes/asm/ghash-x86.pl", "crypto/fipsmodule/aes/asm/ghash-x86_64.pl", "crypto/fipsmodule/aes/asm/ghashv8-armx.pl", "crypto/fipsmodule/aes/asm/bsaes-armv7.pl", "crypto/fipsmodule/aes/asm/bsaes-x86_64.pl", "crypto/fipsmodule/aes/asm/vsaes-armv7.pl", "crypto/fipsmodule/aes/asm/vpaes-armv7.pl", "crypto/fipsmodule/aes/asm/vpaes-armv8.pl", "crypto/fipsmodule/aes/asm/vpaes-x86.pl", "crypto/fipsmodule/aes/asm/vpaes-x86_64.pl", "crypto/fipsmodule/bn/asm/armv4-mont.pl", "crypto/fipsmodule/bn/asm/armv8-mont.pl", "crypto/fipsmodule/bn/asm/x86-mont.pl", "crypto/fipsmodule/bn/asm/x86_64-mont.pl", "crypto/fipsmodule/bn/asm/x86_64-mont5.pl", "crypto/fipsmodule/bn/internal.h", "crypto/fipsmodule/bn/montgomery.c", "crypto/fipsmodule/bn/montgomery_inv.c", "crypto/fipsmodule/ec/asm/p256-armv8-asm.pl", "crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl", "crypto/fipsmodule/ec/ecp_nistz.c", "crypto/fipsmodule/ec/ecp_nistz.h", "crypto/fipsmodule/ec/ecp_nistz384.h", "crypto/fipsmodule/ec/ecp_nistz384.inl", 
"crypto/fipsmodule/ec/gfp_p256.c", "crypto/fipsmodule/ec/gfp_p384.c", "crypto/fipsmodule/ec/p256.c", "crypto/fipsmodule/ec/p256-nistz-table.h", "crypto/fipsmodule/ec/p256-nistz.c", "crypto/fipsmodule/ec/p256-nistz.h", "crypto/fipsmodule/ec/p256_shared.h", "crypto/fipsmodule/ec/p256_table.h", "crypto/fipsmodule/ec/util.h", "crypto/fipsmodule/ecdsa/ecdsa_verify_tests.txt", "crypto/fipsmodule/sha/asm/sha256-armv4.pl", "crypto/fipsmodule/sha/asm/sha512-armv4.pl", "crypto/fipsmodule/sha/asm/sha512-armv8.pl", "crypto/fipsmodule/sha/asm/sha512-x86_64.pl", "crypto/internal.h", "crypto/limbs/limbs.c", "crypto/limbs/limbs.h", "crypto/limbs/limbs.inl", "crypto/mem.c", "crypto/perlasm/arm-xlate.pl", "crypto/perlasm/x86asm.pl", "crypto/perlasm/x86gas.pl", "crypto/perlasm/x86nasm.pl", "crypto/perlasm/x86_64-xlate.pl", "crypto/poly1305/poly1305.c", "crypto/poly1305/poly1305_arm.c", "crypto/poly1305/poly1305_arm_asm.S", "crypto/cipher/asm/chacha20_poly1305_armv8.pl", "crypto/cipher/asm/chacha20_poly1305_x86_64.pl", "examples/**/*.rs", "include/ring-core/aes.h", "include/ring-core/asm_base.h", "include/ring-core/base.h", "include/ring-core/check.h", "include/ring-core/mem.h", "include/ring-core/target.h", "include/ring-core/type_check.h", "src/**/*.rs", "src/aead/poly1305_test.txt", "src/data/alg-rsa-encryption.der", "src/ec/curve25519/ed25519/ed25519_pkcs8_v2_template.der", "src/ec/suite_b/ecdsa/ecPublicKey_p256_pkcs8_v1_template.der", "src/ec/suite_b/ecdsa/ecPublicKey_p384_pkcs8_v1_template.der", "src/rsa/signature_rsa_example_private_key.der", "src/rsa/signature_rsa_example_public_key.der", "tests/**/*.rs", "tests/ecdsa_test_private_key_p256.p8", "tests/ecdsa_test_public_key_p256.der", "tests/ecdsa_test_public_key_p256_debug.txt", "tests/ed25519_test_private_key.bin", "tests/ed25519_test_private_key.p8", "tests/ed25519_test_public_key.bin", "tests/ed25519_test_public_key.der", "tests/rsa_test_private_key_2048.p8", "tests/rsa_test_public_key_2048.der", "tests/rsa_test_public_key_2048_debug.txt", "tests/rsa_test_public_modulus.bin", "third_party/fiat/asm/fiat_curve25519_adx_mul.S", "third_party/fiat/asm/fiat_curve25519_adx_square.S", "third_party/fiat/curve25519_32.h", "third_party/fiat/curve25519_64.h", "third_party/fiat/curve25519_64_adx.h", "third_party/fiat/curve25519_64_msvc.h", "third_party/fiat/p256_32.h", "third_party/fiat/p256_64.h", "third_party/fiat/p256_64_msvc.h", "third_party/fiat/LICENSE", ] [package.metadata.docs.rs] all-features = true [lib] name = "ring" [dependencies] cfg-if = { version = "1.0.0", default-features = false } getrandom = { version = "0.2.10" } untrusted = { version = "0.9" } [target.'cfg(all(any(all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little")), any(target_os = "android", target_os = "linux")))'.dependencies] libc = { version = "0.2.148", default-features = false } [target.'cfg(all(all(target_arch = "aarch64", target_endian = "little"), target_vendor = "apple", any(target_os = "ios", target_os = "macos", target_os = "tvos", target_os = "visionos", target_os = "watchos")))'.dependencies] libc = { version = "0.2.155", default-features = false } [target.'cfg(all(all(target_arch = "aarch64", target_endian = "little"), target_os = "windows"))'.dependencies] windows-sys = { version = "0.52", features = ["Win32_Foundation", "Win32_System_Threading"] } [target.'cfg(all(target_arch = "wasm32", target_os = "unknown"))'.dev-dependencies] wasm-bindgen-test = { version = "0.3.37", default-features = false, features = ["std"] } 
[target.'cfg(any(unix, windows, target_os = "wasi"))'.dev-dependencies] libc = { version = "0.2.148", default-features = false } [build-dependencies] cc = { version = "1.2.8", default-features = false } [features] # These features are documented in the top-level module's documentation. default = ["alloc", "dev_urandom_fallback"] alloc = [] dev_urandom_fallback = [] less-safe-getrandom-custom-or-rdrand = [] less-safe-getrandom-espidf = [] slow_tests = [] std = ["alloc"] unstable-testing-arm-no-hw = [] unstable-testing-arm-no-neon = [] test_logging = [] wasm32_unknown_unknown_js = ["getrandom/js"] # XXX: debug = false because of https://github.com/rust-lang/rust/issues/34122 [profile.bench] opt-level = 3 debug = false rpath = false lto = true debug-assertions = false codegen-units = 1 [profile.release] opt-level = 3 debug = false rpath = false lto = true debug-assertions = false codegen-units = 1 [workspace] members = [ # intentionally not a default member so that `cargo test` doesn't cause criterion.rs and all its # dependencies to get built. "bench", "cavp", ] default-members = [ ".", "cavp" ] ring-0.17.14/LICENSE000064400000000000000000000007631046102023000117500ustar 00000000000000*ring* uses an "ISC" license, like BoringSSL used to use, for new code files. See LICENSE-other-bits for the text of that license. See LICENSE-BoringSSL for code that was sourced from BoringSSL under the Apache 2.0 license. Some code that was sourced from BoringSSL under the ISC license. In each case, the license info is at the top of the file. See src/polyfill/once_cell/LICENSE-APACHE and src/polyfill/once_cell/LICENSE-MIT for the license to code that was sourced from the once_cell project. ring-0.17.14/LICENSE-BoringSSL000064400000000000000000000350261046102023000135500ustar 00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). 
"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. Licenses for support code ------------------------- Parts of the TLS test suite are under the Go license. This code is not included in BoringSSL (i.e. libcrypto and libssl) when compiled, however, so distributing code linked against BoringSSL does not trigger this license: Copyright (c) 2009 The Go Authors. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Google Inc. nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. BoringSSL uses the Chromium test infrastructure to run a continuous build, trybots etc. The scripts which manage this, and the script for generating build metadata, are under the Chromium license. Distributing code linked against BoringSSL does not trigger this license. Copyright 2015 The Chromium Authors. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Google Inc. nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ring-0.17.14/LICENSE-other-bits000064400000000000000000000013331046102023000140200ustar 00000000000000Copyright 2015-2025 Brian Smith. Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies. THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
ring-0.17.14/README.md

THE SOFTWARE IS PROVIDED "AS IS" AND BRIAN SMITH AND THE AUTHORS DISCLAIM
ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL BRIAN SMITH OR THE AUTHORS
BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY
DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

Most of the C and assembly language code in *ring* comes from BoringSSL.
BoringSSL is a fork of OpenSSL.

This quote from the BoringSSL README.md discouraging you from using it applies
to this project:

> BoringSSL is a fork of OpenSSL that is designed to meet Google's needs.
>
> Although BoringSSL is an open source project, it is not intended for general
> use, as OpenSSL is. We don't recommend that third parties depend upon it.

This project was originally shared on GitHub in 2015 as an experiment. It was
put on crates.io shortly afterward to help other people with their
experiments. It is an experiment.

Side Channels
-------------

See [SIDE-CHANNELS.md](SIDE-CHANNELS.md) for important information regarding
the limitations of the side channel mitigations in this project.

Toolchains & Targets
--------------------

Be especially wary about using toolchains (C compilers, etc.) or targets that
aren't supported by other projects, especially BoringSSL. The further you are
from using the same version of Clang that Chrome uses, the more wary you
should be.

Bug Reporting
-------------

For security vulnerabilities, see
https://github.com/briansmith/ring/security/policy.

Please report bugs that aren't security vulnerabilities either as pull
requests or as issues in
[the issue tracker](https://github.com/briansmith/ring/issues).

Release Notes
-------------

It is recommended that you review every commit in this project. Some
particularly noteworthy changes are noted in [RELEASES.md](RELEASES.md). We
could use some help in making this better.

ring-0.17.14/build.rs

// Copyright 2015-2016 Brian Smith.
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

//! Build the non-Rust components.

// It seems like it would be a good idea to use `log!` for logging, but it
// isn't worth having the external dependencies (one for the `log` crate, and
// another for the concrete logging implementation). Instead we use `eprintln!`
// to log everything to stderr.

use std::{
    ffi::{OsStr, OsString},
    fs::{self, DirEntry},
    io::Write,
    path::{Path, PathBuf},
    process::{Command, Stdio},
};

mod env {
    use std::ffi::OsString;

    /// Read an environment variable and tell Cargo that we depend on it.
/// /// The name is static since we intend to only read a static set of environment /// variables. pub fn var_os(name: &'static str) -> Option { println!("cargo:rerun-if-env-changed={}", name); std::env::var_os(name) } pub fn var(name: &'static str) -> Option { var_os(name).and_then(|value| value.into_string().ok()) } } const X86: &str = "x86"; const X86_64: &str = "x86_64"; const AARCH64: &str = "aarch64"; const ARM: &str = "arm"; const WASM32: &str = "wasm32"; #[rustfmt::skip] const RING_SRCS: &[(&[&str], &str)] = &[ (&[], "crypto/curve25519/curve25519.c"), (&[], "crypto/fipsmodule/aes/aes_nohw.c"), (&[], "crypto/fipsmodule/bn/montgomery.c"), (&[], "crypto/fipsmodule/bn/montgomery_inv.c"), (&[], "crypto/fipsmodule/ec/ecp_nistz.c"), (&[], "crypto/fipsmodule/ec/gfp_p256.c"), (&[], "crypto/fipsmodule/ec/gfp_p384.c"), (&[], "crypto/fipsmodule/ec/p256.c"), (&[], "crypto/limbs/limbs.c"), (&[], "crypto/mem.c"), (&[], "crypto/poly1305/poly1305.c"), (&[ARM, X86_64, X86], "crypto/crypto.c"), (&[X86_64, X86], "crypto/cpu_intel.c"), (&[X86], "crypto/fipsmodule/aes/asm/aesni-x86.pl"), (&[X86], "crypto/fipsmodule/aes/asm/ghash-x86.pl"), (&[X86], "crypto/fipsmodule/aes/asm/vpaes-x86.pl"), (&[X86], "crypto/fipsmodule/bn/asm/x86-mont.pl"), (&[X86], "crypto/chacha/asm/chacha-x86.pl"), (&[X86_64], "crypto/chacha/asm/chacha-x86_64.pl"), (&[X86_64], "crypto/curve25519/curve25519_64_adx.c"), (&[X86_64], "crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl"), (&[X86_64], "crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl"), (&[X86_64], "crypto/fipsmodule/aes/asm/aesni-x86_64.pl"), (&[X86_64], "crypto/fipsmodule/aes/asm/ghash-x86_64.pl"), (&[X86_64], "crypto/fipsmodule/aes/asm/vpaes-x86_64.pl"), (&[X86_64], "crypto/fipsmodule/bn/asm/x86_64-mont.pl"), (&[X86_64], "crypto/fipsmodule/bn/asm/x86_64-mont5.pl"), (&[X86_64], "crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl"), (&[X86_64], SHA512_X86_64), (&[X86_64], "crypto/cipher/asm/chacha20_poly1305_x86_64.pl"), (&[X86_64], "third_party/fiat/asm/fiat_curve25519_adx_mul.S"), (&[X86_64], "third_party/fiat/asm/fiat_curve25519_adx_square.S"), (&[AARCH64, X86_64], "crypto/fipsmodule/ec/p256-nistz.c"), (&[ARM], "crypto/fipsmodule/aes/asm/bsaes-armv7.pl"), (&[ARM], "crypto/fipsmodule/aes/asm/ghash-armv4.pl"), (&[ARM], "crypto/fipsmodule/aes/asm/vpaes-armv7.pl"), (&[ARM], "crypto/fipsmodule/bn/asm/armv4-mont.pl"), (&[ARM], "crypto/chacha/asm/chacha-armv4.pl"), (&[ARM], "crypto/curve25519/asm/x25519-asm-arm.S"), (&[ARM], "crypto/poly1305/poly1305_arm.c"), (&[ARM], "crypto/poly1305/poly1305_arm_asm.S"), (&[ARM], "crypto/fipsmodule/sha/asm/sha256-armv4.pl"), (&[ARM], "crypto/fipsmodule/sha/asm/sha512-armv4.pl"), (&[AARCH64], "crypto/chacha/asm/chacha-armv8.pl"), (&[AARCH64], "crypto/cipher/asm/chacha20_poly1305_armv8.pl"), (&[AARCH64], "crypto/fipsmodule/aes/asm/aesv8-armx.pl"), (&[AARCH64], "crypto/fipsmodule/aes/asm/aesv8-gcm-armv8.pl"), (&[AARCH64], "crypto/fipsmodule/aes/asm/ghash-neon-armv8.pl"), (&[AARCH64], "crypto/fipsmodule/aes/asm/ghashv8-armx.pl"), (&[AARCH64], "crypto/fipsmodule/aes/asm/vpaes-armv8.pl"), (&[AARCH64], "crypto/fipsmodule/bn/asm/armv8-mont.pl"), (&[AARCH64], "crypto/fipsmodule/ec/asm/p256-armv8-asm.pl"), (&[AARCH64], SHA512_ARMV8), ]; const SHA256_X86_64: &str = "crypto/fipsmodule/sha/asm/sha256-x86_64.pl"; const SHA512_X86_64: &str = "crypto/fipsmodule/sha/asm/sha512-x86_64.pl"; const SHA256_ARMV8: &str = "crypto/fipsmodule/sha/asm/sha256-armv8.pl"; const SHA512_ARMV8: &str = "crypto/fipsmodule/sha/asm/sha512-armv8.pl"; const RING_TEST_SRCS: &[&str] = 
&[("crypto/constant_time_test.c")]; const PREGENERATED: &str = "pregenerated"; fn cpp_flags(compiler: &cc::Tool) -> &'static [&'static str] { if !compiler.is_like_msvc() { static NON_MSVC_FLAGS: &[&str] = &[ "-fvisibility=hidden", "-std=c1x", // GCC 4.6 requires "c1x" instead of "c11" "-Wall", "-Wbad-function-cast", "-Wcast-align", "-Wcast-qual", "-Wconversion", "-Wmissing-field-initializers", "-Wmissing-include-dirs", "-Wnested-externs", "-Wredundant-decls", "-Wshadow", "-Wsign-compare", "-Wsign-conversion", "-Wstrict-prototypes", "-Wundef", "-Wuninitialized", ]; NON_MSVC_FLAGS } else { static MSVC_FLAGS: &[&str] = &[ "/Gy", // Enable function-level linking. "/Zc:wchar_t", "/Zc:forScope", "/Zc:inline", // Warnings. "/Wall", "/wd4127", // C4127: conditional expression is constant "/wd4464", // C4464: relative include path contains '..' "/wd4514", // C4514: : unreferenced inline function has be "/wd4710", // C4710: function not inlined "/wd4711", // C4711: function 'function' selected for inline expansion "/wd4820", // C4820: : bytes padding added after "/wd5045", /* C5045: Compiler will insert Spectre mitigation for memory load if * /Qspectre switch specified */ ]; MSVC_FLAGS } } // None means "any OS" or "any target". The first match in sequence order is // taken. const ASM_TARGETS: &[AsmTarget] = &[ AsmTarget { oss: LINUX_ABI, arch: AARCH64, perlasm_format: "linux64", }, AsmTarget { oss: LINUX_ABI, arch: ARM, perlasm_format: "linux32", }, AsmTarget { oss: LINUX_ABI, arch: X86, perlasm_format: "elf", }, AsmTarget { oss: LINUX_ABI, arch: X86_64, perlasm_format: "elf", }, AsmTarget { oss: &["horizon"], arch: ARM, perlasm_format: "linux32", }, AsmTarget { oss: APPLE_ABI, arch: AARCH64, perlasm_format: "ios64", }, AsmTarget { oss: APPLE_ABI, arch: X86_64, perlasm_format: "macosx", }, AsmTarget { oss: &[WINDOWS], arch: X86, perlasm_format: WIN32N, }, AsmTarget { oss: &[WINDOWS], arch: X86_64, perlasm_format: NASM, }, AsmTarget { oss: &[WINDOWS], arch: AARCH64, perlasm_format: "win64", }, ]; struct AsmTarget { /// Operating systems. oss: &'static [&'static str], /// Architectures. arch: &'static str, /// The PerlAsm format name. perlasm_format: &'static str, } impl AsmTarget { fn use_nasm(&self) -> bool { [WIN32N, NASM].contains(&self.perlasm_format) } } /// Operating systems that have the same ABI as Linux on every architecture /// mentioned in `ASM_TARGETS`. const LINUX_ABI: &[&str] = &[ "android", "dragonfly", "freebsd", "fuchsia", "haiku", "hurd", "illumos", "netbsd", "openbsd", "linux", "redox", "solaris", ]; const WIN32N: &str = "win32n"; const NASM: &str = "nasm"; /// Operating systems that have the same ABI as macOS on every architecture /// mentioned in `ASM_TARGETS`. const APPLE_ABI: &[&str] = &["ios", "macos", "tvos", "visionos", "watchos"]; const WINDOWS: &str = "windows"; fn main() { // Avoid assuming the working directory is the same is the $CARGO_MANIFEST_DIR so that toolchains // which may assume other working directories can still build this code. let c_root_dir = PathBuf::from( env::var_os("CARGO_MANIFEST_DIR").expect("CARGO_MANIFEST_DIR should always be set"), ); // Keep in sync with `core_name_and_version!` in prefixed.rs. let core_name_and_version = [ &env::var("CARGO_PKG_NAME").unwrap(), "core", &env::var("CARGO_PKG_VERSION_MAJOR").unwrap(), &env::var("CARGO_PKG_VERSION_MINOR").unwrap(), &env::var("CARGO_PKG_VERSION_PATCH").unwrap(), &env::var("CARGO_PKG_VERSION_PRE").unwrap(), // Often empty ] .join("_"); // Ensure `links` in Cargo.toml is consistent with the version. 
assert_eq!( &env::var("CARGO_MANIFEST_LINKS").unwrap(), &core_name_and_version ); const RING_PREGENERATE_ASM: &str = "RING_PREGENERATE_ASM"; match env::var_os(RING_PREGENERATE_ASM).as_deref() { Some(s) if s == "1" => { pregenerate_asm_main(&c_root_dir, &core_name_and_version); } None => ring_build_rs_main(&c_root_dir, &core_name_and_version), _ => { panic!("${} has an invalid value", RING_PREGENERATE_ASM); } } } fn ring_build_rs_main(c_root_dir: &Path, core_name_and_version: &str) { let out_dir = env::var_os("OUT_DIR").unwrap(); let out_dir = PathBuf::from(out_dir); let arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap(); let os = env::var("CARGO_CFG_TARGET_OS").unwrap(); let env = env::var("CARGO_CFG_TARGET_ENV").unwrap(); let endian = env::var("CARGO_CFG_TARGET_ENDIAN").unwrap(); let is_little_endian = endian == "little"; let is_git = fs::metadata(c_root_dir.join(".git")).is_ok(); // Published builds are always built in release mode. let is_debug = is_git && env::var("DEBUG").unwrap() != "false"; // During local development, force warnings in non-Rust code to be treated // as errors. Since warnings are highly compiler-dependent and compilers // don't maintain backward compatibility w.r.t. which warnings they issue, // don't do this for packaged builds. let force_warnings_into_errors = is_git; let target = Target { arch, os, env, is_debug, force_warnings_into_errors, }; let asm_target = if is_little_endian { ASM_TARGETS.iter().find(|asm_target| { asm_target.arch == target.arch && asm_target.oss.contains(&target.os.as_ref()) }) } else { None }; // If `.git` exists then assume this is the "local hacking" case where // we want to make it easy to build *ring* using `cargo build`/`cargo test` // without a prerequisite `package` step, at the cost of needing additional // tools like `Perl` and/or `nasm`. // // If `.git` doesn't exist then assume that this is a packaged build where // we want to optimize for minimizing the build tools required: No Perl, // no nasm, etc. let generated_dir = if !is_git { c_root_dir.join(PREGENERATED) } else { generate_sources_and_preassemble( &out_dir, asm_target.into_iter(), c_root_dir, core_name_and_version, ); out_dir.clone() }; build_c_code( asm_target, &target, &generated_dir, c_root_dir, &out_dir, core_name_and_version, ); emit_rerun_if_changed() } fn pregenerate_asm_main(c_root_dir: &Path, core_name_and_version: &str) { let pregenerated = c_root_dir.join(PREGENERATED); fs::create_dir(&pregenerated).unwrap(); generate_sources_and_preassemble( &pregenerated, ASM_TARGETS.iter(), c_root_dir, core_name_and_version, ); } fn generate_sources_and_preassemble<'a>( out_dir: &Path, asm_targets: impl Iterator, c_root_dir: &Path, core_name_and_version: &str, ) { generate_prefix_symbols_headers(out_dir, core_name_and_version).unwrap(); let perl_exe = get_perl_exe(); for asm_target in asm_targets { let perlasm_src_dsts = perlasm_src_dsts(out_dir, asm_target); perlasm(&perl_exe, &perlasm_src_dsts, asm_target, c_root_dir); if asm_target.use_nasm() { // Package pregenerated object files in addition to pregenerated // assembly language source files, so that the user doesn't need // to install the assembler. let srcs = asm_srcs(perlasm_src_dsts); for src in srcs { nasm(&src, asm_target.arch, out_dir, out_dir, c_root_dir); } } } } struct Target { arch: String, os: String, env: String, /// Is this a debug build? This affects whether assertions might be enabled /// in the C code. For packaged builds, this should always be `false`. 
is_debug: bool, /// true: Force warnings to be treated as errors. /// false: Use the default behavior (perhaps determined by `$CFLAGS`, etc.) force_warnings_into_errors: bool, } fn build_c_code( asm_target: Option<&AsmTarget>, target: &Target, generated_dir: &Path, c_root_dir: &Path, out_dir: &Path, core_name_and_version: &str, ) { let (asm_srcs, obj_srcs) = if let Some(asm_target) = asm_target { let perlasm_src_dsts = perlasm_src_dsts(generated_dir, asm_target); let asm_srcs = asm_srcs(perlasm_src_dsts); if asm_target.use_nasm() { // Nasm was already used to generate the object files, so use them instead of // assembling. let obj_srcs = asm_srcs .iter() .map(|src| obj_path(generated_dir, src.as_path())) .collect::>(); (vec![], obj_srcs) } else { (asm_srcs, vec![]) } } else { (vec![], vec![]) }; let core_srcs = sources_for_arch(&target.arch) .into_iter() .filter(|p| !is_perlasm(p)) .filter(|p| { if let Some(extension) = p.extension() { // We don't (and can't) use any .S on Windows since MSVC and NASM can't assemble // them. if extension == "S" && (target.arch == X86_64 || target.arch == X86) && target.os == WINDOWS { return false; } } true }) .collect::>(); let test_srcs = RING_TEST_SRCS.iter().map(PathBuf::from).collect::>(); let libs = [ ( core_name_and_version, &core_srcs[..], &asm_srcs[..], &obj_srcs[..], ), ( &(String::from(core_name_and_version) + "_test"), &test_srcs[..], &[], &[], ), ]; // XXX: Ideally, ring-test would only be built for `cargo test`, but Cargo // can't do that yet. libs.iter() .for_each(|&(lib_name, srcs, asm_srcs, obj_srcs)| { let srcs = srcs.iter().chain(asm_srcs); build_library( target, c_root_dir, out_dir, lib_name, srcs, generated_dir, obj_srcs, ) }); println!( "cargo:rustc-link-search=native={}", out_dir.to_str().expect("Invalid path") ); } fn new_build(target: &Target, c_root_dir: &Path, include_dir: &Path) -> cc::Build { let mut b = cc::Build::new(); configure_cc(&mut b, target, c_root_dir, include_dir); b } fn build_library<'a>( target: &Target, c_root_dir: &Path, out_dir: &Path, lib_name: &str, srcs: impl Iterator, include_dir: &Path, preassembled_objs: &[PathBuf], ) { let mut c = new_build(target, c_root_dir, include_dir); // Compile all the (dirty) source files into object files. srcs.for_each(|src| { c.file(c_root_dir.join(src)); }); preassembled_objs.iter().for_each(|obj| { c.object(obj); }); // Rebuild the library if necessary. let lib_path = PathBuf::from(out_dir).join(format!("lib{}.a", lib_name)); // Handled below. let _ = c.cargo_metadata(false); c.compile( lib_path .file_name() .and_then(|f| f.to_str()) .expect("No filename"), ); // Link the library. This works even when the library doesn't need to be // rebuilt. println!("cargo:rustc-link-lib=static={}", lib_name); } fn obj_path(out_dir: &Path, src: &Path) -> PathBuf { let mut out_path = out_dir.join(src.file_name().unwrap()); // To eliminate unnecessary conditional logic, use ".o" as the extension, // even when the compiler (e.g. MSVC) would normally use something else // (e.g. ".obj"). cc-rs seems to do the same. 
assert!(out_path.set_extension("o")); out_path } fn configure_cc(c: &mut cc::Build, target: &Target, c_root_dir: &Path, include_dir: &Path) { let compiler = c.get_compiler(); // FIXME: On Windows AArch64 we currently must use Clang to compile C code let compiler = if target.os == WINDOWS && target.arch == AARCH64 && !compiler.is_like_clang() { let _ = c.compiler("clang"); c.get_compiler() } else { compiler }; let _ = c.include(c_root_dir.join("include")); let _ = c.include(include_dir); for f in cpp_flags(&compiler) { let _ = c.flag(f); } if APPLE_ABI.contains(&target.os.as_str()) { // ``-gfull`` is required for Darwin's |-dead_strip|. let _ = c.flag("-gfull"); } else if !compiler.is_like_msvc() { let _ = c.flag("-g3"); }; if !target.is_debug { let _ = c.define("NDEBUG", None); } if target.arch == X86 { let is_msvc_not_clang_cl = compiler.is_like_msvc() && !compiler.is_like_clang_cl(); if !is_msvc_not_clang_cl { let _ = c.flag("-msse2"); } } // Allow cross-compiling without a target sysroot for these targets. if (target.arch == WASM32) || (target.os == "linux" && target.env == "musl" && target.arch != X86_64) { // TODO: Expand this to non-clang compilers in 0.17.0 if practical. if compiler.is_like_clang() { let _ = c.flag("-nostdlibinc"); let _ = c.define("RING_CORE_NOSTDLIBINC", "1"); } } if target.force_warnings_into_errors { c.warnings_into_errors(true); } } fn nasm(file: &Path, arch: &str, include_dir: &Path, out_dir: &Path, c_root_dir: &Path) { let out_file = obj_path(out_dir, file); let oformat = match arch { x if x == X86_64 => "win64", x if x == X86 => "win32", _ => panic!("unsupported arch: {}", arch), }; // Nasm requires that the path end in a path separator. let mut include_dir = include_dir.as_os_str().to_os_string(); include_dir.push(OsString::from(String::from(std::path::MAIN_SEPARATOR))); let mut c = Command::new("./target/tools/windows/nasm/nasm"); let _ = c .arg("-o") .arg(out_file.to_str().expect("Invalid path")) .arg("-f") .arg(oformat) .arg("-i") .arg("include/") .arg("-i") .arg(include_dir) .arg("-Xgnu") .arg("-gcv8") .arg(c_root_dir.join(file)); run_command(c); } fn run_command_with_args(command_name: &Path, args: &[OsString]) { let mut cmd = Command::new(command_name); let _ = cmd.args(args); run_command(cmd) } fn run_command(mut cmd: Command) { eprintln!("running {:?}", cmd); cmd.stderr(Stdio::inherit()); let status = cmd.status().unwrap_or_else(|e| { panic!("failed to execute [{:?}]: {}", cmd, e); }); if !status.success() { panic!("execution failed"); } } fn sources_for_arch(arch: &str) -> Vec { RING_SRCS .iter() .filter(|&&(archs, _)| archs.is_empty() || archs.contains(&arch)) .map(|&(_, p)| PathBuf::from(p)) .collect::>() } fn perlasm_src_dsts(out_dir: &Path, asm_target: &AsmTarget) -> Vec<(PathBuf, PathBuf)> { let srcs = sources_for_arch(asm_target.arch); let mut src_dsts = srcs .iter() .filter(|p| is_perlasm(p)) .map(|src| (src.clone(), asm_path(out_dir, src, asm_target))) .collect::>(); // Some PerlAsm source files need to be run multiple times with different // output paths. { // Appease the borrow checker. 
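        // For example, the SHA-512 PerlAsm scripts can also emit the SHA-256
        // code; which variant they generate is selected by the output file
        // name they are asked to produce. So a single concrete source (e.g.
        // sha512-x86_64.pl) is pushed a second time below with a synthesized
        // sha256-* destination path.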
let mut maybe_synthesize = |concrete, synthesized| { let concrete_path = PathBuf::from(concrete); if srcs.contains(&concrete_path) { let synthesized_path = PathBuf::from(synthesized); src_dsts.push(( concrete_path, asm_path(out_dir, &synthesized_path, asm_target), )) } }; maybe_synthesize(SHA512_X86_64, SHA256_X86_64); maybe_synthesize(SHA512_ARMV8, SHA256_ARMV8); } src_dsts } fn asm_srcs(perlasm_src_dsts: Vec<(PathBuf, PathBuf)>) -> Vec { perlasm_src_dsts .into_iter() .map(|(_src, dst)| dst) .collect::>() } fn is_perlasm(path: &Path) -> bool { path.extension().unwrap().to_str().unwrap() == "pl" } fn asm_path(out_dir: &Path, src: &Path, asm_target: &AsmTarget) -> PathBuf { let src_stem = src.file_stem().expect("source file without basename"); let dst_stem = src_stem.to_str().unwrap(); let dst_filename = format!("{}-{}", dst_stem, asm_target.perlasm_format); let extension = if asm_target.use_nasm() { "asm" } else { "S" }; out_dir.join(dst_filename).with_extension(extension) } fn perlasm( perl_exe: &Path, src_dst: &[(PathBuf, PathBuf)], asm_target: &AsmTarget, c_root_dir: &Path, ) { for (src, dst) in src_dst { let mut args = vec![ join_components_with_forward_slashes(&c_root_dir.join(src)), asm_target.perlasm_format.into(), ]; if asm_target.arch == X86 { args.push("-fPIC".into()); } // Work around PerlAsm issue for ARM and AAarch64 targets by replacing // back slashes with forward slashes. args.push(join_components_with_forward_slashes(dst)); run_command_with_args(perl_exe, &args); } } fn join_components_with_forward_slashes(path: &Path) -> OsString { let parts = path.components().map(|c| c.as_os_str()).collect::>(); parts.join(OsStr::new("/")) } fn get_perl_exe() -> PathBuf { get_command("PERL_EXECUTABLE", "perl") } fn get_command(var: &'static str, default: &str) -> PathBuf { PathBuf::from(env::var_os(var).unwrap_or_else(|| default.into())) } // TODO: We should emit `cargo:rerun-if-changed-env` for the various // environment variables that affect the build. fn emit_rerun_if_changed() { for path in &["crypto", "include", "third_party/fiat"] { walk_dir(&PathBuf::from(path), &|entry| { let path = entry.path(); match path.extension().and_then(|ext| ext.to_str()) { Some("c") | Some("S") | Some("h") | Some("inl") | Some("pl") | None => { println!("cargo:rerun-if-changed={}", path.to_str().unwrap()); } _ => { // Ignore other types of files. } } }) } } fn walk_dir(dir: &Path, cb: &impl Fn(&DirEntry)) { if dir.is_dir() { for entry in fs::read_dir(dir).unwrap() { let entry = entry.unwrap(); let path = entry.path(); if path.is_dir() { walk_dir(&path, cb); } else { cb(&entry); } } } } /// Creates the necessary header files for symbol renaming. /// /// For simplicity, both non-Nasm- and Nasm- style headers are always /// generated, even though local non-packaged builds need only one of them. 
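// A sketch of the generated content, assuming the prefix works out to
// something like "ring_core_0_17_14_" (it is derived from the `links`
// key via `core_name_and_version`): the plain C header gets lines like
//
//     #define CRYPTO_memcmp ring_core_0_17_14_CRYPTO_memcmp
//
// the asm header emits the same definitions but with a leading underscore
// on both sides when __APPLE__ is defined, and the NASM include uses
// "%define", adding the underscore only for the win32 output format.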
fn generate_prefix_symbols_headers( out_dir: &Path, core_name_and_version: &str, ) -> Result<(), std::io::Error> { let prefix = &(String::from(core_name_and_version) + "_"); generate_prefix_symbols_header(out_dir, "prefix_symbols.h", '#', None, prefix)?; generate_prefix_symbols_header( out_dir, "prefix_symbols_asm.h", '#', Some("#if defined(__APPLE__)"), prefix, )?; generate_prefix_symbols_header( out_dir, "prefix_symbols_nasm.inc", '%', Some("%ifidn __OUTPUT_FORMAT__,win32"), prefix, )?; Ok(()) } fn generate_prefix_symbols_header( out_dir: &Path, filename: &str, pp: char, prefix_condition: Option<&str>, prefix: &str, ) -> Result<(), std::io::Error> { let dir = out_dir.join("ring_core_generated"); fs::create_dir_all(&dir)?; let path = dir.join(filename); let mut file = fs::File::create(path)?; let filename_ident = filename.replace('.', "_").to_uppercase(); writeln!( file, r#" {pp}ifndef ring_core_generated_{filename_ident} {pp}define ring_core_generated_{filename_ident} "#, pp = pp, filename_ident = filename_ident )?; if let Some(prefix_condition) = prefix_condition { writeln!(file, "{}", prefix_condition)?; writeln!(file, "{}", prefix_all_symbols(pp, "_", prefix))?; writeln!(file, "{pp}else", pp = pp)?; }; writeln!(file, "{}", prefix_all_symbols(pp, "", prefix))?; if prefix_condition.is_some() { writeln!(file, "{pp}endif", pp = pp)? } writeln!(file, "{pp}endif", pp = pp)?; Ok(()) } fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String { // Rename some nistz256 assembly functions to match the names of their // polyfills. static SYMBOLS_TO_RENAME: &[(&str, &str)] = &[ ("ecp_nistz256_point_double", "p256_point_double"), ("ecp_nistz256_point_add", "p256_point_add"), ("ecp_nistz256_point_add_affine", "p256_point_add_affine"), ("ecp_nistz256_ord_mul_mont", "p256_scalar_mul_mont"), ("ecp_nistz256_ord_sqr_mont", "p256_scalar_sqr_rep_mont"), ("ecp_nistz256_mul_mont", "p256_mul_mont"), ("ecp_nistz256_sqr_mont", "p256_sqr_mont"), ]; static SYMBOLS_TO_PREFIX: &[&str] = &[ "adx_bmi2_available", "avx2_available", "CRYPTO_memcmp", "CRYPTO_poly1305_finish", "CRYPTO_poly1305_finish_neon", "CRYPTO_poly1305_init", "CRYPTO_poly1305_init_neon", "CRYPTO_poly1305_update", "CRYPTO_poly1305_update_neon", "ChaCha20_ctr32", "ChaCha20_ctr32_avx2", "ChaCha20_ctr32_neon", "ChaCha20_ctr32_nohw", "ChaCha20_ctr32_ssse3", "ChaCha20_ctr32_ssse3_4x", "LIMB_is_zero", "LIMBS_add_mod", "LIMBS_are_zero", "LIMBS_equal", "LIMBS_less_than", "LIMBS_reduce_once", "LIMBS_select_512_32", "LIMBS_shl_mod", "LIMBS_sub_mod", "LIMBS_window5_split_window", "LIMBS_window5_unsplit_window", "LIMB_shr", "OPENSSL_cpuid_setup", "aes_gcm_dec_kernel", "aes_gcm_dec_update_vaes_avx2", "aes_gcm_enc_kernel", "aes_gcm_enc_update_vaes_avx2", "aes_hw_ctr32_encrypt_blocks", "aes_hw_set_encrypt_key", "aes_hw_set_encrypt_key_alt", "aes_hw_set_encrypt_key_base", "aes_nohw_ctr32_encrypt_blocks", "aes_nohw_encrypt", "aes_nohw_set_encrypt_key", "aesni_gcm_decrypt", "aesni_gcm_encrypt", "bn_from_montgomery_in_place", "bn_gather5", "bn_mul_mont", "bn_mul_mont_nohw", "bn_mul4x_mont", "bn_mulx4x_mont", "bn_mul8x_mont_neon", "bn_mul4x_mont_gather5", "bn_mulx4x_mont_gather5", "bn_neg_inv_mod_r_u64", "bn_power5_nohw", "bn_powerx5", "bn_scatter5", "bn_sqr8x_internal", "bn_sqr8x_mont", "bn_sqrx8x_internal", "bsaes_ctr32_encrypt_blocks", "bssl_constant_time_test_conditional_memcpy", "bssl_constant_time_test_conditional_memxor", "bssl_constant_time_test_main", "chacha20_poly1305_open", "chacha20_poly1305_open_avx2", "chacha20_poly1305_open_sse41", 
"chacha20_poly1305_seal", "chacha20_poly1305_seal_avx2", "chacha20_poly1305_seal_sse41", "ecp_nistz256_mul_mont_adx", "ecp_nistz256_mul_mont_nohw", "ecp_nistz256_ord_mul_mont_adx", "ecp_nistz256_ord_mul_mont_nohw", "ecp_nistz256_ord_sqr_mont_adx", "ecp_nistz256_ord_sqr_mont_nohw", "ecp_nistz256_point_add_adx", "ecp_nistz256_point_add_nohw", "ecp_nistz256_point_add_affine_adx", "ecp_nistz256_point_add_affine_nohw", "ecp_nistz256_point_double_adx", "ecp_nistz256_point_double_nohw", "ecp_nistz256_select_w5_avx2", "ecp_nistz256_select_w5_nohw", "ecp_nistz256_select_w7_avx2", "ecp_nistz256_select_w7_nohw", "ecp_nistz256_sqr_mont_adx", "ecp_nistz256_sqr_mont_nohw", "fiat_curve25519_adx_mul", "fiat_curve25519_adx_square", "gcm_ghash_avx", "gcm_ghash_clmul", "gcm_ghash_neon", "gcm_ghash_vpclmulqdq_avx2_1", "gcm_gmult_clmul", "gcm_gmult_neon", "gcm_init_avx", "gcm_init_clmul", "gcm_init_neon", "gcm_init_vpclmulqdq_avx2", "k25519Precomp", "limbs_mul_add_limb", "little_endian_bytes_from_scalar", "ecp_nistz256_neg", "ecp_nistz256_select_w5", "ecp_nistz256_select_w7", "neon_available", "p256_mul_mont", "p256_point_add", "p256_point_add_affine", "p256_point_double", "p256_point_mul", "p256_point_mul_base", "p256_point_mul_base_vartime", "p256_scalar_mul_mont", "p256_scalar_sqr_rep_mont", "p256_sqr_mont", "p384_elem_div_by_2", "p384_elem_mul_mont", "p384_elem_neg", "p384_elem_sub", "p384_point_add", "p384_point_double", "p384_point_mul", "p384_scalar_mul_mont", "openssl_poly1305_neon2_addmulmod", "openssl_poly1305_neon2_blocks", "sha256_block_data_order", "sha256_block_data_order_avx", "sha256_block_data_order_ssse3", "sha256_block_data_order_hw", "sha256_block_data_order_neon", "sha256_block_data_order_nohw", "sha512_block_data_order", "sha512_block_data_order_avx", "sha512_block_data_order_hw", "sha512_block_data_order_neon", "sha512_block_data_order_nohw", "vpaes_ctr32_encrypt_blocks", "vpaes_encrypt", "vpaes_encrypt_key_to_bsaes", "vpaes_set_encrypt_key", "x25519_NEON", "x25519_fe_invert", "x25519_fe_isnegative", "x25519_fe_mul_ttt", "x25519_fe_neg", "x25519_fe_tobytes", "x25519_ge_double_scalarmult_vartime", "x25519_ge_frombytes_vartime", "x25519_ge_scalarmult_base", "x25519_ge_scalarmult_base_adx", "x25519_public_from_private_generic_masked", "x25519_sc_mask", "x25519_sc_muladd", "x25519_sc_reduce", "x25519_scalar_mult_adx", "x25519_scalar_mult_generic_masked", ]; let mut out = String::new(); for (old, new) in SYMBOLS_TO_RENAME { let line = format!( "{pp}define {prefix_prefix}{old} {prefix_prefix}{new}\n", pp = pp, prefix_prefix = prefix_prefix, old = old, new = new ); out += &line; } for symbol in SYMBOLS_TO_PREFIX { let line = format!( "{pp}define {prefix_prefix}{symbol} {prefix_prefix}{prefix}{symbol}\n", pp = pp, prefix_prefix = prefix_prefix, prefix = prefix, symbol = symbol ); out += &line; } out } ring-0.17.14/crypto/chacha/asm/chacha-armv4.pl000064400000000000000000000654171046102023000170540ustar 00000000000000#! /usr/bin/env perl # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. # ==================================================================== # # December 2014 # # ChaCha20 for ARMv4. # # Performance in cycles per byte out of large buffer. # # IALU/gcc-4.4 1xNEON 3xNEON+1xIALU # # Cortex-A5 19.3(*)/+95% 21.8 14.1 # Cortex-A8 10.5(*)/+160% 13.9 6.35 # Cortex-A9 12.9(**)/+110% 14.3 6.50 # Cortex-A15 11.0/+40% 16.0 5.00 # Snapdragon S4 11.5/+125% 13.6 4.90 # # (*) most "favourable" result for aligned data on little-endian # processor, result for misaligned data is 10-15% lower; # (**) this result is a trade-off: it can be improved by 20%, # but then Snapdragon S4 and Cortex-A8 results get # 20-25% worse; $flavour = shift; if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } if ($flavour && $flavour ne "void") { $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; } else { open OUT,">$output"; *STDOUT=*OUT; } sub AUTOLOAD() # thunk [simplified] x86-style perlasm { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; my $arg = pop; $arg = "#$arg" if ($arg*1 eq $arg); $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; } my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x")); my @t=map("r$_",(8..11)); sub ROUND { my ($a0,$b0,$c0,$d0)=@_; my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); my $odd = $d0&1; my ($xc,$xc_) = (@t[0..1]); my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]); my @ret; # Consider order in which variables are addressed by their # index: # # a b c d # # 0 4 8 12 < even round # 1 5 9 13 # 2 6 10 14 # 3 7 11 15 # 0 5 10 15 < odd round # 1 6 11 12 # 2 7 8 13 # 3 4 9 14 # # 'a', 'b' are permanently allocated in registers, @x[0..7], # while 'c's and pair of 'd's are maintained in memory. If # you observe 'c' column, you'll notice that pair of 'c's is # invariant between rounds. This means that we have to reload # them once per round, in the middle. This is why you'll see # bunch of 'c' stores and loads in the middle, but none in # the beginning or end. If you observe 'd' column, you'll # notice that 15 and 13 are reused in next pair of rounds. # This is why these two are chosen for offloading to memory, # to make loads count more. 
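	# The instruction pairs pushed below interleave two columns of the
	# standard ChaCha quarter-round:
	#
	#	a += b; d ^= a; d <<<= 16;
	#	c += d; b ^= c; b <<<= 12;
	#	a += b; d ^= a; d <<<=  8;
	#	c += d; b ^= c; b <<<=  7;
	#
	# The left-rotates are realized through ARM's barrel shifter as
	# rotate-rights by 32-n ('ror#16/#20/#24/#25'), folded into the
	# following mov/eor instructions rather than issued separately.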
push @ret,( "&add (@x[$a0],@x[$a0],@x[$b0])", "&mov ($xd,$xd,'ror#16')", "&add (@x[$a1],@x[$a1],@x[$b1])", "&mov ($xd_,$xd_,'ror#16')", "&eor ($xd,$xd,@x[$a0],'ror#16')", "&eor ($xd_,$xd_,@x[$a1],'ror#16')", "&add ($xc,$xc,$xd)", "&mov (@x[$b0],@x[$b0],'ror#20')", "&add ($xc_,$xc_,$xd_)", "&mov (@x[$b1],@x[$b1],'ror#20')", "&eor (@x[$b0],@x[$b0],$xc,'ror#20')", "&eor (@x[$b1],@x[$b1],$xc_,'ror#20')", "&add (@x[$a0],@x[$a0],@x[$b0])", "&mov ($xd,$xd,'ror#24')", "&add (@x[$a1],@x[$a1],@x[$b1])", "&mov ($xd_,$xd_,'ror#24')", "&eor ($xd,$xd,@x[$a0],'ror#24')", "&eor ($xd_,$xd_,@x[$a1],'ror#24')", "&add ($xc,$xc,$xd)", "&mov (@x[$b0],@x[$b0],'ror#25')" ); push @ret,( "&str ($xd,'[sp,#4*(16+$d0)]')", "&ldr ($xd,'[sp,#4*(16+$d2)]')" ) if ($odd); push @ret,( "&add ($xc_,$xc_,$xd_)", "&mov (@x[$b1],@x[$b1],'ror#25')" ); push @ret,( "&str ($xd_,'[sp,#4*(16+$d1)]')", "&ldr ($xd_,'[sp,#4*(16+$d3)]')" ) if (!$odd); push @ret,( "&eor (@x[$b0],@x[$b0],$xc,'ror#25')", "&eor (@x[$b1],@x[$b1],$xc_,'ror#25')" ); $xd=@x[$d2] if (!$odd); $xd_=@x[$d3] if ($odd); push @ret,( "&str ($xc,'[sp,#4*(16+$c0)]')", "&ldr ($xc,'[sp,#4*(16+$c2)]')", "&add (@x[$a2],@x[$a2],@x[$b2])", "&mov ($xd,$xd,'ror#16')", "&str ($xc_,'[sp,#4*(16+$c1)]')", "&ldr ($xc_,'[sp,#4*(16+$c3)]')", "&add (@x[$a3],@x[$a3],@x[$b3])", "&mov ($xd_,$xd_,'ror#16')", "&eor ($xd,$xd,@x[$a2],'ror#16')", "&eor ($xd_,$xd_,@x[$a3],'ror#16')", "&add ($xc,$xc,$xd)", "&mov (@x[$b2],@x[$b2],'ror#20')", "&add ($xc_,$xc_,$xd_)", "&mov (@x[$b3],@x[$b3],'ror#20')", "&eor (@x[$b2],@x[$b2],$xc,'ror#20')", "&eor (@x[$b3],@x[$b3],$xc_,'ror#20')", "&add (@x[$a2],@x[$a2],@x[$b2])", "&mov ($xd,$xd,'ror#24')", "&add (@x[$a3],@x[$a3],@x[$b3])", "&mov ($xd_,$xd_,'ror#24')", "&eor ($xd,$xd,@x[$a2],'ror#24')", "&eor ($xd_,$xd_,@x[$a3],'ror#24')", "&add ($xc,$xc,$xd)", "&mov (@x[$b2],@x[$b2],'ror#25')", "&add ($xc_,$xc_,$xd_)", "&mov (@x[$b3],@x[$b3],'ror#25')", "&eor (@x[$b2],@x[$b2],$xc,'ror#25')", "&eor (@x[$b3],@x[$b3],$xc_,'ror#25')" ); @ret; } $code.=<<___; @ Silence ARMv8 deprecated IT instruction warnings. This file is used by both @ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. 
.arch armv7-a .text #if defined(__thumb2__) || defined(__clang__) .syntax unified #endif #if defined(__thumb2__) .thumb #else .code 32 #endif #if defined(__thumb2__) || defined(__clang__) #define ldrhsb ldrbhs #endif .align 5 .Lsigma: .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral .Lone: .long 1,0,0,0 .globl ChaCha20_ctr32_nohw .type ChaCha20_ctr32_nohw,%function .align 5 ChaCha20_ctr32_nohw: ldr r12,[sp,#0] @ pull pointer to counter and nonce stmdb sp!,{r0-r2,r4-r11,lr} adr r14,.Lsigma ldmia r12,{r4-r7} @ load counter and nonce sub sp,sp,#4*(16) @ off-load area stmdb sp!,{r4-r7} @ copy counter and nonce ldmia r3,{r4-r11} @ load key ldmia r14,{r0-r3} @ load sigma stmdb sp!,{r4-r11} @ copy key stmdb sp!,{r0-r3} @ copy sigma str r10,[sp,#4*(16+10)] @ off-load "@x[10]" str r11,[sp,#4*(16+11)] @ off-load "@x[11]" b .Loop_outer_enter .align 4 .Loop_outer: ldmia sp,{r0-r9} @ load key material str @t[3],[sp,#4*(32+2)] @ save len str r12, [sp,#4*(32+1)] @ save inp str r14, [sp,#4*(32+0)] @ save out .Loop_outer_enter: ldr @t[3], [sp,#4*(15)] ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load ldr @t[2], [sp,#4*(13)] ldr @x[14],[sp,#4*(14)] str @t[3], [sp,#4*(16+15)] mov @t[3],#10 b .Loop .align 4 .Loop: subs @t[3],@t[3],#1 ___ foreach (&ROUND(0, 4, 8,12)) { eval; } foreach (&ROUND(0, 5,10,15)) { eval; } $code.=<<___; bne .Loop ldr @t[3],[sp,#4*(32+2)] @ load len str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store str @t[1], [sp,#4*(16+9)] str @x[12],[sp,#4*(16+12)] str @t[2], [sp,#4*(16+13)] str @x[14],[sp,#4*(16+14)] @ at this point we have first half of 512-bit result in @ @x[0-7] and second half at sp+4*(16+8) cmp @t[3],#64 @ done yet? #ifdef __thumb2__ itete lo #endif addlo r12,sp,#4*(0) @ shortcut or ... ldrhs r12,[sp,#4*(32+1)] @ ... load inp addlo r14,sp,#4*(0) @ shortcut or ... ldrhs r14,[sp,#4*(32+0)] @ ... load out ldr @t[0],[sp,#4*(0)] @ load key material ldr @t[1],[sp,#4*(1)] #if __ARM_ARCH>=6 || !defined(__ARMEB__) # if __ARM_ARCH<7 orr @t[2],r12,r14 tst @t[2],#3 @ are input and output aligned? 
ldr @t[2],[sp,#4*(2)] bne .Lunaligned cmp @t[3],#64 @ restore flags # else ldr @t[2],[sp,#4*(2)] # endif ldr @t[3],[sp,#4*(3)] add @x[0],@x[0],@t[0] @ accumulate key material add @x[1],@x[1],@t[1] # ifdef __thumb2__ itt hs # endif ldrhs @t[0],[r12],#16 @ load input ldrhs @t[1],[r12,#-12] add @x[2],@x[2],@t[2] add @x[3],@x[3],@t[3] # ifdef __thumb2__ itt hs # endif ldrhs @t[2],[r12,#-8] ldrhs @t[3],[r12,#-4] # if __ARM_ARCH>=6 && defined(__ARMEB__) rev @x[0],@x[0] rev @x[1],@x[1] rev @x[2],@x[2] rev @x[3],@x[3] # endif # ifdef __thumb2__ itt hs # endif eorhs @x[0],@x[0],@t[0] @ xor with input eorhs @x[1],@x[1],@t[1] add @t[0],sp,#4*(4) str @x[0],[r14],#16 @ store output # ifdef __thumb2__ itt hs # endif eorhs @x[2],@x[2],@t[2] eorhs @x[3],@x[3],@t[3] ldmia @t[0],{@t[0]-@t[3]} @ load key material str @x[1],[r14,#-12] str @x[2],[r14,#-8] str @x[3],[r14,#-4] add @x[4],@x[4],@t[0] @ accumulate key material add @x[5],@x[5],@t[1] # ifdef __thumb2__ itt hs # endif ldrhs @t[0],[r12],#16 @ load input ldrhs @t[1],[r12,#-12] add @x[6],@x[6],@t[2] add @x[7],@x[7],@t[3] # ifdef __thumb2__ itt hs # endif ldrhs @t[2],[r12,#-8] ldrhs @t[3],[r12,#-4] # if __ARM_ARCH>=6 && defined(__ARMEB__) rev @x[4],@x[4] rev @x[5],@x[5] rev @x[6],@x[6] rev @x[7],@x[7] # endif # ifdef __thumb2__ itt hs # endif eorhs @x[4],@x[4],@t[0] eorhs @x[5],@x[5],@t[1] add @t[0],sp,#4*(8) str @x[4],[r14],#16 @ store output # ifdef __thumb2__ itt hs # endif eorhs @x[6],@x[6],@t[2] eorhs @x[7],@x[7],@t[3] str @x[5],[r14,#-12] ldmia @t[0],{@t[0]-@t[3]} @ load key material str @x[6],[r14,#-8] add @x[0],sp,#4*(16+8) str @x[7],[r14,#-4] ldmia @x[0],{@x[0]-@x[7]} @ load second half add @x[0],@x[0],@t[0] @ accumulate key material add @x[1],@x[1],@t[1] # ifdef __thumb2__ itt hs # endif ldrhs @t[0],[r12],#16 @ load input ldrhs @t[1],[r12,#-12] # ifdef __thumb2__ itt hi # endif strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it add @x[2],@x[2],@t[2] add @x[3],@x[3],@t[3] # ifdef __thumb2__ itt hs # endif ldrhs @t[2],[r12,#-8] ldrhs @t[3],[r12,#-4] # if __ARM_ARCH>=6 && defined(__ARMEB__) rev @x[0],@x[0] rev @x[1],@x[1] rev @x[2],@x[2] rev @x[3],@x[3] # endif # ifdef __thumb2__ itt hs # endif eorhs @x[0],@x[0],@t[0] eorhs @x[1],@x[1],@t[1] add @t[0],sp,#4*(12) str @x[0],[r14],#16 @ store output # ifdef __thumb2__ itt hs # endif eorhs @x[2],@x[2],@t[2] eorhs @x[3],@x[3],@t[3] str @x[1],[r14,#-12] ldmia @t[0],{@t[0]-@t[3]} @ load key material str @x[2],[r14,#-8] str @x[3],[r14,#-4] add @x[4],@x[4],@t[0] @ accumulate key material add @x[5],@x[5],@t[1] # ifdef __thumb2__ itt hi # endif addhi @t[0],@t[0],#1 @ next counter value strhi @t[0],[sp,#4*(12)] @ save next counter value # ifdef __thumb2__ itt hs # endif ldrhs @t[0],[r12],#16 @ load input ldrhs @t[1],[r12,#-12] add @x[6],@x[6],@t[2] add @x[7],@x[7],@t[3] # ifdef __thumb2__ itt hs # endif ldrhs @t[2],[r12,#-8] ldrhs @t[3],[r12,#-4] # if __ARM_ARCH>=6 && defined(__ARMEB__) rev @x[4],@x[4] rev @x[5],@x[5] rev @x[6],@x[6] rev @x[7],@x[7] # endif # ifdef __thumb2__ itt hs # endif eorhs @x[4],@x[4],@t[0] eorhs @x[5],@x[5],@t[1] # ifdef __thumb2__ it ne # endif ldrne @t[0],[sp,#4*(32+2)] @ re-load len # ifdef __thumb2__ itt hs # endif eorhs @x[6],@x[6],@t[2] eorhs @x[7],@x[7],@t[3] str @x[4],[r14],#16 @ store output str @x[5],[r14,#-12] # ifdef __thumb2__ it hs # endif subhs @t[3],@t[0],#64 @ len-=64 str @x[6],[r14,#-8] str @x[7],[r14,#-4] bhi .Loop_outer beq .Ldone # if __ARM_ARCH<7 b .Ltail .align 4 .Lunaligned: @ unaligned endian-neutral 
path cmp @t[3],#64 @ restore flags # endif #endif #if __ARM_ARCH<7 ldr @t[3],[sp,#4*(3)] ___ for ($i=0;$i<16;$i+=4) { my $j=$i&0x7; $code.=<<___ if ($i==4); add @x[0],sp,#4*(16+8) ___ $code.=<<___ if ($i==8); ldmia @x[0],{@x[0]-@x[7]} @ load second half # ifdef __thumb2__ itt hi # endif strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" ___ $code.=<<___; add @x[$j+0],@x[$j+0],@t[0] @ accumulate key material ___ $code.=<<___ if ($i==12); # ifdef __thumb2__ itt hi # endif addhi @t[0],@t[0],#1 @ next counter value strhi @t[0],[sp,#4*(12)] @ save next counter value ___ $code.=<<___; add @x[$j+1],@x[$j+1],@t[1] add @x[$j+2],@x[$j+2],@t[2] # ifdef __thumb2__ itete lo # endif eorlo @t[0],@t[0],@t[0] @ zero or ... ldrhsb @t[0],[r12],#16 @ ... load input eorlo @t[1],@t[1],@t[1] ldrhsb @t[1],[r12,#-12] add @x[$j+3],@x[$j+3],@t[3] # ifdef __thumb2__ itete lo # endif eorlo @t[2],@t[2],@t[2] ldrhsb @t[2],[r12,#-8] eorlo @t[3],@t[3],@t[3] ldrhsb @t[3],[r12,#-4] eor @x[$j+0],@t[0],@x[$j+0] @ xor with input (or zero) eor @x[$j+1],@t[1],@x[$j+1] # ifdef __thumb2__ itt hs # endif ldrhsb @t[0],[r12,#-15] @ load more input ldrhsb @t[1],[r12,#-11] eor @x[$j+2],@t[2],@x[$j+2] strb @x[$j+0],[r14],#16 @ store output eor @x[$j+3],@t[3],@x[$j+3] # ifdef __thumb2__ itt hs # endif ldrhsb @t[2],[r12,#-7] ldrhsb @t[3],[r12,#-3] strb @x[$j+1],[r14,#-12] eor @x[$j+0],@t[0],@x[$j+0],lsr#8 strb @x[$j+2],[r14,#-8] eor @x[$j+1],@t[1],@x[$j+1],lsr#8 # ifdef __thumb2__ itt hs # endif ldrhsb @t[0],[r12,#-14] @ load more input ldrhsb @t[1],[r12,#-10] strb @x[$j+3],[r14,#-4] eor @x[$j+2],@t[2],@x[$j+2],lsr#8 strb @x[$j+0],[r14,#-15] eor @x[$j+3],@t[3],@x[$j+3],lsr#8 # ifdef __thumb2__ itt hs # endif ldrhsb @t[2],[r12,#-6] ldrhsb @t[3],[r12,#-2] strb @x[$j+1],[r14,#-11] eor @x[$j+0],@t[0],@x[$j+0],lsr#8 strb @x[$j+2],[r14,#-7] eor @x[$j+1],@t[1],@x[$j+1],lsr#8 # ifdef __thumb2__ itt hs # endif ldrhsb @t[0],[r12,#-13] @ load more input ldrhsb @t[1],[r12,#-9] strb @x[$j+3],[r14,#-3] eor @x[$j+2],@t[2],@x[$j+2],lsr#8 strb @x[$j+0],[r14,#-14] eor @x[$j+3],@t[3],@x[$j+3],lsr#8 # ifdef __thumb2__ itt hs # endif ldrhsb @t[2],[r12,#-5] ldrhsb @t[3],[r12,#-1] strb @x[$j+1],[r14,#-10] strb @x[$j+2],[r14,#-6] eor @x[$j+0],@t[0],@x[$j+0],lsr#8 strb @x[$j+3],[r14,#-2] eor @x[$j+1],@t[1],@x[$j+1],lsr#8 strb @x[$j+0],[r14,#-13] eor @x[$j+2],@t[2],@x[$j+2],lsr#8 strb @x[$j+1],[r14,#-9] eor @x[$j+3],@t[3],@x[$j+3],lsr#8 strb @x[$j+2],[r14,#-5] strb @x[$j+3],[r14,#-1] ___ $code.=<<___ if ($i<12); add @t[0],sp,#4*(4+$i) ldmia @t[0],{@t[0]-@t[3]} @ load key material ___ } $code.=<<___; # ifdef __thumb2__ it ne # endif ldrne @t[0],[sp,#4*(32+2)] @ re-load len # ifdef __thumb2__ it hs # endif subhs @t[3],@t[0],#64 @ len-=64 bhi .Loop_outer beq .Ldone #endif .Ltail: ldr r12,[sp,#4*(32+1)] @ load inp add @t[1],sp,#4*(0) ldr r14,[sp,#4*(32+0)] @ load out .Loop_tail: ldrb @t[2],[@t[1]],#1 @ read buffer on stack ldrb @t[3],[r12],#1 @ read input subs @t[0],@t[0],#1 eor @t[3],@t[3],@t[2] strb @t[3],[r14],#1 @ store output bne .Loop_tail .Ldone: add sp,sp,#4*(32+3) ldmia sp!,{r4-r11,pc} .size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw ___ {{{ my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) = map("q$_",(0..15)); sub NEONROUND { my $odd = pop; my ($a,$b,$c,$d,$t)=@_; ( "&vadd_i32 ($a,$a,$b)", "&veor ($d,$d,$a)", "&vrev32_16 ($d,$d)", # vrot ($d,16) "&vadd_i32 ($c,$c,$d)", "&veor ($t,$b,$c)", "&vshr_u32 ($b,$t,20)", "&vsli_32 ($b,$t,12)", "&vadd_i32 ($a,$a,$b)", "&veor ($t,$d,$a)", "&vshr_u32 ($d,$t,24)", 
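	# NEON has no vector rotate instruction, so each vshr_u32/vsli_32
	# pair (shift right by 32-n, then shift-left-and-insert by n)
	# synthesizes a 32-bit rotate-left by n.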
"&vsli_32 ($d,$t,8)", "&vadd_i32 ($c,$c,$d)", "&veor ($t,$b,$c)", "&vshr_u32 ($b,$t,25)", "&vsli_32 ($b,$t,7)", "&vext_8 ($c,$c,$c,8)", "&vext_8 ($b,$b,$b,$odd?12:4)", "&vext_8 ($d,$d,$d,$odd?4:12)" ); } $code.=<<___; #if __ARM_MAX_ARCH__>=7 .arch armv7-a .fpu neon .globl ChaCha20_ctr32_neon .type ChaCha20_ctr32_neon,%function .align 5 ChaCha20_ctr32_neon: ldr r12,[sp,#0] @ pull pointer to counter and nonce stmdb sp!,{r0-r2,r4-r11,lr} adr r14,.Lsigma vstmdb sp!,{d8-d15} @ ABI spec says so stmdb sp!,{r0-r3} vld1.32 {$b0-$c0},[r3] @ load key ldmia r3,{r4-r11} @ load key sub sp,sp,#4*(16+16) vld1.32 {$d0},[r12] @ load counter and nonce add r12,sp,#4*8 ldmia r14,{r0-r3} @ load sigma vld1.32 {$a0},[r14]! @ load sigma vld1.32 {$t0},[r14] @ one vst1.32 {$c0-$d0},[r12] @ copy 1/2key|counter|nonce vst1.32 {$a0-$b0},[sp] @ copy sigma|1/2key str r10,[sp,#4*(16+10)] @ off-load "@x[10]" str r11,[sp,#4*(16+11)] @ off-load "@x[11]" vshl.i32 $t1#lo,$t0#lo,#1 @ two vstr $t0#lo,[sp,#4*(16+0)] vshl.i32 $t2#lo,$t0#lo,#2 @ four vstr $t1#lo,[sp,#4*(16+2)] vmov $a1,$a0 vstr $t2#lo,[sp,#4*(16+4)] vmov $a2,$a0 vmov $b1,$b0 vmov $b2,$b0 b .Loop_neon_enter .align 4 .Loop_neon_outer: ldmia sp,{r0-r9} @ load key material cmp @t[3],#64*2 @ if len<=64*2 bls .Lbreak_neon @ switch to integer-only vmov $a1,$a0 str @t[3],[sp,#4*(32+2)] @ save len vmov $a2,$a0 str r12, [sp,#4*(32+1)] @ save inp vmov $b1,$b0 str r14, [sp,#4*(32+0)] @ save out vmov $b2,$b0 .Loop_neon_enter: ldr @t[3], [sp,#4*(15)] vadd.i32 $d1,$d0,$t0 @ counter+1 ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load vmov $c1,$c0 ldr @t[2], [sp,#4*(13)] vmov $c2,$c0 ldr @x[14],[sp,#4*(14)] vadd.i32 $d2,$d1,$t0 @ counter+2 str @t[3], [sp,#4*(16+15)] mov @t[3],#10 add @x[12],@x[12],#3 @ counter+3 b .Loop_neon .align 4 .Loop_neon: subs @t[3],@t[3],#1 ___ my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0); my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0); my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0); my @thread3=&ROUND(0,4,8,12); foreach (@thread0) { eval; eval(shift(@thread3)); eval(shift(@thread1)); eval(shift(@thread3)); eval(shift(@thread2)); eval(shift(@thread3)); } @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1); @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1); @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1); @thread3=&ROUND(0,5,10,15); foreach (@thread0) { eval; eval(shift(@thread3)); eval(shift(@thread1)); eval(shift(@thread3)); eval(shift(@thread2)); eval(shift(@thread3)); } $code.=<<___; bne .Loop_neon add @t[3],sp,#32 vld1.32 {$t0-$t1},[sp] @ load key material vld1.32 {$t2-$t3},[@t[3]] ldr @t[3],[sp,#4*(32+2)] @ load len str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store str @t[1], [sp,#4*(16+9)] str @x[12],[sp,#4*(16+12)] str @t[2], [sp,#4*(16+13)] str @x[14],[sp,#4*(16+14)] @ at this point we have first half of 512-bit result in @ @x[0-7] and second half at sp+4*(16+8) ldr r12,[sp,#4*(32+1)] @ load inp ldr r14,[sp,#4*(32+0)] @ load out vadd.i32 $a0,$a0,$t0 @ accumulate key material vadd.i32 $a1,$a1,$t0 vadd.i32 $a2,$a2,$t0 vldr $t0#lo,[sp,#4*(16+0)] @ one vadd.i32 $b0,$b0,$t1 vadd.i32 $b1,$b1,$t1 vadd.i32 $b2,$b2,$t1 vldr $t1#lo,[sp,#4*(16+2)] @ two vadd.i32 $c0,$c0,$t2 vadd.i32 $c1,$c1,$t2 vadd.i32 $c2,$c2,$t2 vadd.i32 $d1#lo,$d1#lo,$t0#lo @ counter+1 vadd.i32 $d2#lo,$d2#lo,$t1#lo @ counter+2 vadd.i32 $d0,$d0,$t3 vadd.i32 $d1,$d1,$t3 vadd.i32 $d2,$d2,$t3 cmp @t[3],#64*4 blo .Ltail_neon vld1.8 {$t0-$t1},[r12]! @ load input mov @t[3],sp vld1.8 {$t2-$t3},[r12]! veor $a0,$a0,$t0 @ xor with input veor $b0,$b0,$t1 vld1.8 {$t0-$t1},[r12]! 
veor $c0,$c0,$t2 veor $d0,$d0,$t3 vld1.8 {$t2-$t3},[r12]! veor $a1,$a1,$t0 vst1.8 {$a0-$b0},[r14]! @ store output veor $b1,$b1,$t1 vld1.8 {$t0-$t1},[r12]! veor $c1,$c1,$t2 vst1.8 {$c0-$d0},[r14]! veor $d1,$d1,$t3 vld1.8 {$t2-$t3},[r12]! veor $a2,$a2,$t0 vld1.32 {$a0-$b0},[@t[3]]! @ load for next iteration veor $t0#hi,$t0#hi,$t0#hi vldr $t0#lo,[sp,#4*(16+4)] @ four veor $b2,$b2,$t1 vld1.32 {$c0-$d0},[@t[3]] veor $c2,$c2,$t2 vst1.8 {$a1-$b1},[r14]! veor $d2,$d2,$t3 vst1.8 {$c1-$d1},[r14]! vadd.i32 $d0#lo,$d0#lo,$t0#lo @ next counter value vldr $t0#lo,[sp,#4*(16+0)] @ one ldmia sp,{@t[0]-@t[3]} @ load key material add @x[0],@x[0],@t[0] @ accumulate key material ldr @t[0],[r12],#16 @ load input vst1.8 {$a2-$b2},[r14]! add @x[1],@x[1],@t[1] ldr @t[1],[r12,#-12] vst1.8 {$c2-$d2},[r14]! add @x[2],@x[2],@t[2] ldr @t[2],[r12,#-8] add @x[3],@x[3],@t[3] ldr @t[3],[r12,#-4] # ifdef __ARMEB__ rev @x[0],@x[0] rev @x[1],@x[1] rev @x[2],@x[2] rev @x[3],@x[3] # endif eor @x[0],@x[0],@t[0] @ xor with input add @t[0],sp,#4*(4) eor @x[1],@x[1],@t[1] str @x[0],[r14],#16 @ store output eor @x[2],@x[2],@t[2] str @x[1],[r14,#-12] eor @x[3],@x[3],@t[3] ldmia @t[0],{@t[0]-@t[3]} @ load key material str @x[2],[r14,#-8] str @x[3],[r14,#-4] add @x[4],@x[4],@t[0] @ accumulate key material ldr @t[0],[r12],#16 @ load input add @x[5],@x[5],@t[1] ldr @t[1],[r12,#-12] add @x[6],@x[6],@t[2] ldr @t[2],[r12,#-8] add @x[7],@x[7],@t[3] ldr @t[3],[r12,#-4] # ifdef __ARMEB__ rev @x[4],@x[4] rev @x[5],@x[5] rev @x[6],@x[6] rev @x[7],@x[7] # endif eor @x[4],@x[4],@t[0] add @t[0],sp,#4*(8) eor @x[5],@x[5],@t[1] str @x[4],[r14],#16 @ store output eor @x[6],@x[6],@t[2] str @x[5],[r14,#-12] eor @x[7],@x[7],@t[3] ldmia @t[0],{@t[0]-@t[3]} @ load key material str @x[6],[r14,#-8] add @x[0],sp,#4*(16+8) str @x[7],[r14,#-4] ldmia @x[0],{@x[0]-@x[7]} @ load second half add @x[0],@x[0],@t[0] @ accumulate key material ldr @t[0],[r12],#16 @ load input add @x[1],@x[1],@t[1] ldr @t[1],[r12,#-12] # ifdef __thumb2__ it hi # endif strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it add @x[2],@x[2],@t[2] ldr @t[2],[r12,#-8] # ifdef __thumb2__ it hi # endif strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it add @x[3],@x[3],@t[3] ldr @t[3],[r12,#-4] # ifdef __ARMEB__ rev @x[0],@x[0] rev @x[1],@x[1] rev @x[2],@x[2] rev @x[3],@x[3] # endif eor @x[0],@x[0],@t[0] add @t[0],sp,#4*(12) eor @x[1],@x[1],@t[1] str @x[0],[r14],#16 @ store output eor @x[2],@x[2],@t[2] str @x[1],[r14,#-12] eor @x[3],@x[3],@t[3] ldmia @t[0],{@t[0]-@t[3]} @ load key material str @x[2],[r14,#-8] str @x[3],[r14,#-4] add @x[4],@x[4],@t[0] @ accumulate key material add @t[0],@t[0],#4 @ next counter value add @x[5],@x[5],@t[1] str @t[0],[sp,#4*(12)] @ save next counter value ldr @t[0],[r12],#16 @ load input add @x[6],@x[6],@t[2] add @x[4],@x[4],#3 @ counter+3 ldr @t[1],[r12,#-12] add @x[7],@x[7],@t[3] ldr @t[2],[r12,#-8] ldr @t[3],[r12,#-4] # ifdef __ARMEB__ rev @x[4],@x[4] rev @x[5],@x[5] rev @x[6],@x[6] rev @x[7],@x[7] # endif eor @x[4],@x[4],@t[0] # ifdef __thumb2__ it hi # endif ldrhi @t[0],[sp,#4*(32+2)] @ re-load len eor @x[5],@x[5],@t[1] eor @x[6],@x[6],@t[2] str @x[4],[r14],#16 @ store output eor @x[7],@x[7],@t[3] str @x[5],[r14,#-12] sub @t[3],@t[0],#64*4 @ len-=64*4 str @x[6],[r14,#-8] str @x[7],[r14,#-4] bhi .Loop_neon_outer b .Ldone_neon .align 4 .Lbreak_neon: @ harmonize NEON and integer-only stack frames: load data @ from NEON frame, but save to integer-only one; distance @ between the two is 4*(32+4+16-32)=4*(20). 
str @t[3], [sp,#4*(20+32+2)] @ save len add @t[3],sp,#4*(32+4) str r12, [sp,#4*(20+32+1)] @ save inp str r14, [sp,#4*(20+32+0)] @ save out ldr @x[12],[sp,#4*(16+10)] ldr @x[14],[sp,#4*(16+11)] vldmia @t[3],{d8-d15} @ fulfill ABI requirement str @x[12],[sp,#4*(20+16+10)] @ copy "@x[10]" str @x[14],[sp,#4*(20+16+11)] @ copy "@x[11]" ldr @t[3], [sp,#4*(15)] ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load ldr @t[2], [sp,#4*(13)] ldr @x[14],[sp,#4*(14)] str @t[3], [sp,#4*(20+16+15)] add @t[3],sp,#4*(20) vst1.32 {$a0-$b0},[@t[3]]! @ copy key add sp,sp,#4*(20) @ switch frame vst1.32 {$c0-$d0},[@t[3]] mov @t[3],#10 b .Loop @ go integer-only .align 4 .Ltail_neon: cmp @t[3],#64*3 bhs .L192_or_more_neon cmp @t[3],#64*2 bhs .L128_or_more_neon cmp @t[3],#64*1 bhs .L64_or_more_neon add @t[0],sp,#4*(8) vst1.8 {$a0-$b0},[sp] add @t[2],sp,#4*(0) vst1.8 {$c0-$d0},[@t[0]] b .Loop_tail_neon .align 4 .L64_or_more_neon: vld1.8 {$t0-$t1},[r12]! vld1.8 {$t2-$t3},[r12]! veor $a0,$a0,$t0 veor $b0,$b0,$t1 veor $c0,$c0,$t2 veor $d0,$d0,$t3 vst1.8 {$a0-$b0},[r14]! vst1.8 {$c0-$d0},[r14]! beq .Ldone_neon add @t[0],sp,#4*(8) vst1.8 {$a1-$b1},[sp] add @t[2],sp,#4*(0) vst1.8 {$c1-$d1},[@t[0]] sub @t[3],@t[3],#64*1 @ len-=64*1 b .Loop_tail_neon .align 4 .L128_or_more_neon: vld1.8 {$t0-$t1},[r12]! vld1.8 {$t2-$t3},[r12]! veor $a0,$a0,$t0 veor $b0,$b0,$t1 vld1.8 {$t0-$t1},[r12]! veor $c0,$c0,$t2 veor $d0,$d0,$t3 vld1.8 {$t2-$t3},[r12]! veor $a1,$a1,$t0 veor $b1,$b1,$t1 vst1.8 {$a0-$b0},[r14]! veor $c1,$c1,$t2 vst1.8 {$c0-$d0},[r14]! veor $d1,$d1,$t3 vst1.8 {$a1-$b1},[r14]! vst1.8 {$c1-$d1},[r14]! beq .Ldone_neon add @t[0],sp,#4*(8) vst1.8 {$a2-$b2},[sp] add @t[2],sp,#4*(0) vst1.8 {$c2-$d2},[@t[0]] sub @t[3],@t[3],#64*2 @ len-=64*2 b .Loop_tail_neon .align 4 .L192_or_more_neon: vld1.8 {$t0-$t1},[r12]! vld1.8 {$t2-$t3},[r12]! veor $a0,$a0,$t0 veor $b0,$b0,$t1 vld1.8 {$t0-$t1},[r12]! veor $c0,$c0,$t2 veor $d0,$d0,$t3 vld1.8 {$t2-$t3},[r12]! veor $a1,$a1,$t0 veor $b1,$b1,$t1 vld1.8 {$t0-$t1},[r12]! veor $c1,$c1,$t2 vst1.8 {$a0-$b0},[r14]! veor $d1,$d1,$t3 vld1.8 {$t2-$t3},[r12]! veor $a2,$a2,$t0 vst1.8 {$c0-$d0},[r14]! veor $b2,$b2,$t1 vst1.8 {$a1-$b1},[r14]! veor $c2,$c2,$t2 vst1.8 {$c1-$d1},[r14]! veor $d2,$d2,$t3 vst1.8 {$a2-$b2},[r14]! vst1.8 {$c2-$d2},[r14]! 
beq .Ldone_neon ldmia sp,{@t[0]-@t[3]} @ load key material add @x[0],@x[0],@t[0] @ accumulate key material add @t[0],sp,#4*(4) add @x[1],@x[1],@t[1] add @x[2],@x[2],@t[2] add @x[3],@x[3],@t[3] ldmia @t[0],{@t[0]-@t[3]} @ load key material add @x[4],@x[4],@t[0] @ accumulate key material add @t[0],sp,#4*(8) add @x[5],@x[5],@t[1] add @x[6],@x[6],@t[2] add @x[7],@x[7],@t[3] ldmia @t[0],{@t[0]-@t[3]} @ load key material # ifdef __ARMEB__ rev @x[0],@x[0] rev @x[1],@x[1] rev @x[2],@x[2] rev @x[3],@x[3] rev @x[4],@x[4] rev @x[5],@x[5] rev @x[6],@x[6] rev @x[7],@x[7] # endif stmia sp,{@x[0]-@x[7]} add @x[0],sp,#4*(16+8) ldmia @x[0],{@x[0]-@x[7]} @ load second half add @x[0],@x[0],@t[0] @ accumulate key material add @t[0],sp,#4*(12) add @x[1],@x[1],@t[1] add @x[2],@x[2],@t[2] add @x[3],@x[3],@t[3] ldmia @t[0],{@t[0]-@t[3]} @ load key material add @x[4],@x[4],@t[0] @ accumulate key material add @t[0],sp,#4*(8) add @x[5],@x[5],@t[1] add @x[4],@x[4],#3 @ counter+3 add @x[6],@x[6],@t[2] add @x[7],@x[7],@t[3] ldr @t[3],[sp,#4*(32+2)] @ re-load len # ifdef __ARMEB__ rev @x[0],@x[0] rev @x[1],@x[1] rev @x[2],@x[2] rev @x[3],@x[3] rev @x[4],@x[4] rev @x[5],@x[5] rev @x[6],@x[6] rev @x[7],@x[7] # endif stmia @t[0],{@x[0]-@x[7]} add @t[2],sp,#4*(0) sub @t[3],@t[3],#64*3 @ len-=64*3 .Loop_tail_neon: ldrb @t[0],[@t[2]],#1 @ read buffer on stack ldrb @t[1],[r12],#1 @ read input subs @t[3],@t[3],#1 eor @t[0],@t[0],@t[1] strb @t[0],[r14],#1 @ store output bne .Loop_tail_neon .Ldone_neon: add sp,sp,#4*(32+4) vldmia sp,{d8-d15} add sp,sp,#4*(16+3) ldmia sp!,{r4-r11,pc} .size ChaCha20_ctr32_neon,.-ChaCha20_ctr32_neon #endif ___ }}} foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/geo; s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo; print $_,"\n"; } close STDOUT or die "error closing STDOUT: $!"; ring-0.17.14/crypto/chacha/asm/chacha-armv8.pl000064400000000000000000000650061046102023000170520ustar 00000000000000#! /usr/bin/env perl # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. # ==================================================================== # # June 2015 # # ChaCha20 for ARMv8. # # Performance in cycles per byte out of large buffer. 
# # IALU/gcc-4.9 3xNEON+1xIALU 6xNEON+2xIALU # # Apple A7 5.50/+49% 3.33 1.70 # Cortex-A53 8.40/+80% 4.72 4.72(*) # Cortex-A57 8.06/+43% 4.90 4.43(**) # Denver 4.50/+82% 2.63 2.67(*) # X-Gene 9.50/+46% 8.82 8.89(*) # Mongoose 8.00/+44% 3.64 3.25 # Kryo 8.17/+50% 4.83 4.65 # # (*) it's expected that doubling interleave factor doesn't help # all processors, only those with higher NEON latency and # higher instruction issue rate; # (**) expected improvement was actually higher; $flavour=shift; $output=shift; $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; sub AUTOLOAD() # thunk [simplified] x86-style perlasm { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; my $arg = pop; $arg = "#$arg" if ($arg*1 eq $arg); $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; } my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4)); my @x=map("x$_",(5..17,19..21)); my @d=map("x$_",(22..28,30)); sub ROUND { my ($a0,$b0,$c0,$d0)=@_; my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); ( "&add_32 (@x[$a0],@x[$a0],@x[$b0])", "&add_32 (@x[$a1],@x[$a1],@x[$b1])", "&add_32 (@x[$a2],@x[$a2],@x[$b2])", "&add_32 (@x[$a3],@x[$a3],@x[$b3])", "&eor_32 (@x[$d0],@x[$d0],@x[$a0])", "&eor_32 (@x[$d1],@x[$d1],@x[$a1])", "&eor_32 (@x[$d2],@x[$d2],@x[$a2])", "&eor_32 (@x[$d3],@x[$d3],@x[$a3])", "&ror_32 (@x[$d0],@x[$d0],16)", "&ror_32 (@x[$d1],@x[$d1],16)", "&ror_32 (@x[$d2],@x[$d2],16)", "&ror_32 (@x[$d3],@x[$d3],16)", "&add_32 (@x[$c0],@x[$c0],@x[$d0])", "&add_32 (@x[$c1],@x[$c1],@x[$d1])", "&add_32 (@x[$c2],@x[$c2],@x[$d2])", "&add_32 (@x[$c3],@x[$c3],@x[$d3])", "&eor_32 (@x[$b0],@x[$b0],@x[$c0])", "&eor_32 (@x[$b1],@x[$b1],@x[$c1])", "&eor_32 (@x[$b2],@x[$b2],@x[$c2])", "&eor_32 (@x[$b3],@x[$b3],@x[$c3])", "&ror_32 (@x[$b0],@x[$b0],20)", "&ror_32 (@x[$b1],@x[$b1],20)", "&ror_32 (@x[$b2],@x[$b2],20)", "&ror_32 (@x[$b3],@x[$b3],20)", "&add_32 (@x[$a0],@x[$a0],@x[$b0])", "&add_32 (@x[$a1],@x[$a1],@x[$b1])", "&add_32 (@x[$a2],@x[$a2],@x[$b2])", "&add_32 (@x[$a3],@x[$a3],@x[$b3])", "&eor_32 (@x[$d0],@x[$d0],@x[$a0])", "&eor_32 (@x[$d1],@x[$d1],@x[$a1])", "&eor_32 (@x[$d2],@x[$d2],@x[$a2])", "&eor_32 (@x[$d3],@x[$d3],@x[$a3])", "&ror_32 (@x[$d0],@x[$d0],24)", "&ror_32 (@x[$d1],@x[$d1],24)", "&ror_32 (@x[$d2],@x[$d2],24)", "&ror_32 (@x[$d3],@x[$d3],24)", "&add_32 (@x[$c0],@x[$c0],@x[$d0])", "&add_32 (@x[$c1],@x[$c1],@x[$d1])", "&add_32 (@x[$c2],@x[$c2],@x[$d2])", "&add_32 (@x[$c3],@x[$c3],@x[$d3])", "&eor_32 (@x[$b0],@x[$b0],@x[$c0])", "&eor_32 (@x[$b1],@x[$b1],@x[$c1])", "&eor_32 (@x[$b2],@x[$b2],@x[$c2])", "&eor_32 (@x[$b3],@x[$b3],@x[$c3])", "&ror_32 (@x[$b0],@x[$b0],25)", "&ror_32 (@x[$b1],@x[$b1],25)", "&ror_32 (@x[$b2],@x[$b2],25)", "&ror_32 (@x[$b3],@x[$b3],25)" ); } $code.=<<___; .section .rodata .align 5 .Lsigma: .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral .Lone: .long 1,0,0,0 .asciz "ChaCha20 for ARMv8, CRYPTOGAMS by " .text .globl ChaCha20_ctr32_nohw .type ChaCha20_ctr32_nohw,%function .align 5 ChaCha20_ctr32_nohw: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-96]! 
add x29,sp,#0 adrp @x[0],:pg_hi21:.Lsigma add @x[0],@x[0],:lo12:.Lsigma stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] sub sp,sp,#64 ldp @d[0],@d[1],[@x[0]] // load sigma ldp @d[2],@d[3],[$key] // load key ldp @d[4],@d[5],[$key,#16] ldp @d[6],@d[7],[$ctr] // load counter #ifdef __AARCH64EB__ ror @d[2],@d[2],#32 ror @d[3],@d[3],#32 ror @d[4],@d[4],#32 ror @d[5],@d[5],#32 ror @d[6],@d[6],#32 ror @d[7],@d[7],#32 #endif .Loop_outer: mov.32 @x[0],@d[0] // unpack key block lsr @x[1],@d[0],#32 mov.32 @x[2],@d[1] lsr @x[3],@d[1],#32 mov.32 @x[4],@d[2] lsr @x[5],@d[2],#32 mov.32 @x[6],@d[3] lsr @x[7],@d[3],#32 mov.32 @x[8],@d[4] lsr @x[9],@d[4],#32 mov.32 @x[10],@d[5] lsr @x[11],@d[5],#32 mov.32 @x[12],@d[6] lsr @x[13],@d[6],#32 mov.32 @x[14],@d[7] lsr @x[15],@d[7],#32 mov $ctr,#10 subs $len,$len,#64 .Loop: sub $ctr,$ctr,#1 ___ foreach (&ROUND(0, 4, 8,12)) { eval; } foreach (&ROUND(0, 5,10,15)) { eval; } $code.=<<___; cbnz $ctr,.Loop add.32 @x[0],@x[0],@d[0] // accumulate key block add @x[1],@x[1],@d[0],lsr#32 add.32 @x[2],@x[2],@d[1] add @x[3],@x[3],@d[1],lsr#32 add.32 @x[4],@x[4],@d[2] add @x[5],@x[5],@d[2],lsr#32 add.32 @x[6],@x[6],@d[3] add @x[7],@x[7],@d[3],lsr#32 add.32 @x[8],@x[8],@d[4] add @x[9],@x[9],@d[4],lsr#32 add.32 @x[10],@x[10],@d[5] add @x[11],@x[11],@d[5],lsr#32 add.32 @x[12],@x[12],@d[6] add @x[13],@x[13],@d[6],lsr#32 add.32 @x[14],@x[14],@d[7] add @x[15],@x[15],@d[7],lsr#32 b.lo .Ltail add @x[0],@x[0],@x[1],lsl#32 // pack add @x[2],@x[2],@x[3],lsl#32 ldp @x[1],@x[3],[$inp,#0] // load input add @x[4],@x[4],@x[5],lsl#32 add @x[6],@x[6],@x[7],lsl#32 ldp @x[5],@x[7],[$inp,#16] add @x[8],@x[8],@x[9],lsl#32 add @x[10],@x[10],@x[11],lsl#32 ldp @x[9],@x[11],[$inp,#32] add @x[12],@x[12],@x[13],lsl#32 add @x[14],@x[14],@x[15],lsl#32 ldp @x[13],@x[15],[$inp,#48] add $inp,$inp,#64 #ifdef __AARCH64EB__ rev @x[0],@x[0] rev @x[2],@x[2] rev @x[4],@x[4] rev @x[6],@x[6] rev @x[8],@x[8] rev @x[10],@x[10] rev @x[12],@x[12] rev @x[14],@x[14] #endif eor @x[0],@x[0],@x[1] eor @x[2],@x[2],@x[3] eor @x[4],@x[4],@x[5] eor @x[6],@x[6],@x[7] eor @x[8],@x[8],@x[9] eor @x[10],@x[10],@x[11] eor @x[12],@x[12],@x[13] eor @x[14],@x[14],@x[15] stp @x[0],@x[2],[$out,#0] // store output add @d[6],@d[6],#1 // increment counter stp @x[4],@x[6],[$out,#16] stp @x[8],@x[10],[$out,#32] stp @x[12],@x[14],[$out,#48] add $out,$out,#64 b.hi .Loop_outer ldp x19,x20,[x29,#16] add sp,sp,#64 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 AARCH64_VALIDATE_LINK_REGISTER ret .align 4 .Ltail: add $len,$len,#64 .Less_than_64: sub $out,$out,#1 add $inp,$inp,$len add $out,$out,$len add $ctr,sp,$len neg $len,$len add @x[0],@x[0],@x[1],lsl#32 // pack add @x[2],@x[2],@x[3],lsl#32 add @x[4],@x[4],@x[5],lsl#32 add @x[6],@x[6],@x[7],lsl#32 add @x[8],@x[8],@x[9],lsl#32 add @x[10],@x[10],@x[11],lsl#32 add @x[12],@x[12],@x[13],lsl#32 add @x[14],@x[14],@x[15],lsl#32 #ifdef __AARCH64EB__ rev @x[0],@x[0] rev @x[2],@x[2] rev @x[4],@x[4] rev @x[6],@x[6] rev @x[8],@x[8] rev @x[10],@x[10] rev @x[12],@x[12] rev @x[14],@x[14] #endif stp @x[0],@x[2],[sp,#0] stp @x[4],@x[6],[sp,#16] stp @x[8],@x[10],[sp,#32] stp @x[12],@x[14],[sp,#48] .Loop_tail: ldrb w10,[$inp,$len] ldrb w11,[$ctr,$len] add $len,$len,#1 eor w10,w10,w11 strb w10,[$out,$len] cbnz $len,.Loop_tail stp xzr,xzr,[sp,#0] stp xzr,xzr,[sp,#16] stp xzr,xzr,[sp,#32] stp xzr,xzr,[sp,#48] ldp x19,x20,[x29,#16] add sp,sp,#64 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp 
x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 AARCH64_VALIDATE_LINK_REGISTER ret .size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw ___ {{{ my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2,$T3) = map("v$_.4s",(0..7,16..23)); my (@K)=map("v$_.4s",(24..30)); my $ONE="v31.4s"; sub NEONROUND { my $odd = pop; my ($a,$b,$c,$d,$t)=@_; ( "&add ('$a','$a','$b')", "&eor ('$d','$d','$a')", "&rev32_16 ('$d','$d')", # vrot ($d,16) "&add ('$c','$c','$d')", "&eor ('$t','$b','$c')", "&ushr ('$b','$t',20)", "&sli ('$b','$t',12)", "&add ('$a','$a','$b')", "&eor ('$t','$d','$a')", "&ushr ('$d','$t',24)", "&sli ('$d','$t',8)", "&add ('$c','$c','$d')", "&eor ('$t','$b','$c')", "&ushr ('$b','$t',25)", "&sli ('$b','$t',7)", "&ext ('$c','$c','$c',8)", "&ext ('$d','$d','$d',$odd?4:12)", "&ext ('$b','$b','$b',$odd?12:4)" ); } $code.=<<___; .globl ChaCha20_ctr32_neon .type ChaCha20_ctr32_neon,%function .align 5 ChaCha20_ctr32_neon: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-96]! add x29,sp,#0 adrp @x[0],:pg_hi21:.Lsigma add @x[0],@x[0],:lo12:.Lsigma stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] cmp $len,#512 b.hs .L512_or_more_neon sub sp,sp,#64 ldp @d[0],@d[1],[@x[0]] // load sigma ld1 {@K[0]},[@x[0]],#16 ldp @d[2],@d[3],[$key] // load key ldp @d[4],@d[5],[$key,#16] ld1 {@K[1],@K[2]},[$key] ldp @d[6],@d[7],[$ctr] // load counter ld1 {@K[3]},[$ctr] ld1 {$ONE},[@x[0]] #ifdef __AARCH64EB__ rev64 @K[0],@K[0] ror @d[2],@d[2],#32 ror @d[3],@d[3],#32 ror @d[4],@d[4],#32 ror @d[5],@d[5],#32 ror @d[6],@d[6],#32 ror @d[7],@d[7],#32 #endif add @K[3],@K[3],$ONE // += 1 add @K[4],@K[3],$ONE add @K[5],@K[4],$ONE shl $ONE,$ONE,#2 // 1 -> 4 .Loop_outer_neon: mov.32 @x[0],@d[0] // unpack key block lsr @x[1],@d[0],#32 mov $A0,@K[0] mov.32 @x[2],@d[1] lsr @x[3],@d[1],#32 mov $A1,@K[0] mov.32 @x[4],@d[2] lsr @x[5],@d[2],#32 mov $A2,@K[0] mov.32 @x[6],@d[3] mov $B0,@K[1] lsr @x[7],@d[3],#32 mov $B1,@K[1] mov.32 @x[8],@d[4] mov $B2,@K[1] lsr @x[9],@d[4],#32 mov $D0,@K[3] mov.32 @x[10],@d[5] mov $D1,@K[4] lsr @x[11],@d[5],#32 mov $D2,@K[5] mov.32 @x[12],@d[6] mov $C0,@K[2] lsr @x[13],@d[6],#32 mov $C1,@K[2] mov.32 @x[14],@d[7] mov $C2,@K[2] lsr @x[15],@d[7],#32 mov $ctr,#10 subs $len,$len,#256 .Loop_neon: sub $ctr,$ctr,#1 ___ my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0); my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0); my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0); my @thread3=&ROUND(0,4,8,12); foreach (@thread0) { eval; eval(shift(@thread3)); eval(shift(@thread1)); eval(shift(@thread3)); eval(shift(@thread2)); eval(shift(@thread3)); } @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1); @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1); @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1); @thread3=&ROUND(0,5,10,15); foreach (@thread0) { eval; eval(shift(@thread3)); eval(shift(@thread1)); eval(shift(@thread3)); eval(shift(@thread2)); eval(shift(@thread3)); } $code.=<<___; cbnz $ctr,.Loop_neon add.32 @x[0],@x[0],@d[0] // accumulate key block add $A0,$A0,@K[0] add @x[1],@x[1],@d[0],lsr#32 add $A1,$A1,@K[0] add.32 @x[2],@x[2],@d[1] add $A2,$A2,@K[0] add @x[3],@x[3],@d[1],lsr#32 add $C0,$C0,@K[2] add.32 @x[4],@x[4],@d[2] add $C1,$C1,@K[2] add @x[5],@x[5],@d[2],lsr#32 add $C2,$C2,@K[2] add.32 @x[6],@x[6],@d[3] add $D0,$D0,@K[3] add @x[7],@x[7],@d[3],lsr#32 add.32 @x[8],@x[8],@d[4] add $D1,$D1,@K[4] add @x[9],@x[9],@d[4],lsr#32 add.32 @x[10],@x[10],@d[5] add $D2,$D2,@K[5] add @x[11],@x[11],@d[5],lsr#32 add.32 @x[12],@x[12],@d[6] add $B0,$B0,@K[1] add @x[13],@x[13],@d[6],lsr#32 
add.32 @x[14],@x[14],@d[7] add $B1,$B1,@K[1] add @x[15],@x[15],@d[7],lsr#32 add $B2,$B2,@K[1] b.lo .Ltail_neon add @x[0],@x[0],@x[1],lsl#32 // pack add @x[2],@x[2],@x[3],lsl#32 ldp @x[1],@x[3],[$inp,#0] // load input add @x[4],@x[4],@x[5],lsl#32 add @x[6],@x[6],@x[7],lsl#32 ldp @x[5],@x[7],[$inp,#16] add @x[8],@x[8],@x[9],lsl#32 add @x[10],@x[10],@x[11],lsl#32 ldp @x[9],@x[11],[$inp,#32] add @x[12],@x[12],@x[13],lsl#32 add @x[14],@x[14],@x[15],lsl#32 ldp @x[13],@x[15],[$inp,#48] add $inp,$inp,#64 #ifdef __AARCH64EB__ rev @x[0],@x[0] rev @x[2],@x[2] rev @x[4],@x[4] rev @x[6],@x[6] rev @x[8],@x[8] rev @x[10],@x[10] rev @x[12],@x[12] rev @x[14],@x[14] #endif ld1.8 {$T0-$T3},[$inp],#64 eor @x[0],@x[0],@x[1] eor @x[2],@x[2],@x[3] eor @x[4],@x[4],@x[5] eor @x[6],@x[6],@x[7] eor @x[8],@x[8],@x[9] eor $A0,$A0,$T0 eor @x[10],@x[10],@x[11] eor $B0,$B0,$T1 eor @x[12],@x[12],@x[13] eor $C0,$C0,$T2 eor @x[14],@x[14],@x[15] eor $D0,$D0,$T3 ld1.8 {$T0-$T3},[$inp],#64 stp @x[0],@x[2],[$out,#0] // store output add @d[6],@d[6],#4 // increment counter stp @x[4],@x[6],[$out,#16] add @K[3],@K[3],$ONE // += 4 stp @x[8],@x[10],[$out,#32] add @K[4],@K[4],$ONE stp @x[12],@x[14],[$out,#48] add @K[5],@K[5],$ONE add $out,$out,#64 st1.8 {$A0-$D0},[$out],#64 ld1.8 {$A0-$D0},[$inp],#64 eor $A1,$A1,$T0 eor $B1,$B1,$T1 eor $C1,$C1,$T2 eor $D1,$D1,$T3 st1.8 {$A1-$D1},[$out],#64 eor $A2,$A2,$A0 eor $B2,$B2,$B0 eor $C2,$C2,$C0 eor $D2,$D2,$D0 st1.8 {$A2-$D2},[$out],#64 b.hi .Loop_outer_neon ldp x19,x20,[x29,#16] add sp,sp,#64 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 AARCH64_VALIDATE_LINK_REGISTER ret .Ltail_neon: add $len,$len,#256 cmp $len,#64 b.lo .Less_than_64 add @x[0],@x[0],@x[1],lsl#32 // pack add @x[2],@x[2],@x[3],lsl#32 ldp @x[1],@x[3],[$inp,#0] // load input add @x[4],@x[4],@x[5],lsl#32 add @x[6],@x[6],@x[7],lsl#32 ldp @x[5],@x[7],[$inp,#16] add @x[8],@x[8],@x[9],lsl#32 add @x[10],@x[10],@x[11],lsl#32 ldp @x[9],@x[11],[$inp,#32] add @x[12],@x[12],@x[13],lsl#32 add @x[14],@x[14],@x[15],lsl#32 ldp @x[13],@x[15],[$inp,#48] add $inp,$inp,#64 #ifdef __AARCH64EB__ rev @x[0],@x[0] rev @x[2],@x[2] rev @x[4],@x[4] rev @x[6],@x[6] rev @x[8],@x[8] rev @x[10],@x[10] rev @x[12],@x[12] rev @x[14],@x[14] #endif eor @x[0],@x[0],@x[1] eor @x[2],@x[2],@x[3] eor @x[4],@x[4],@x[5] eor @x[6],@x[6],@x[7] eor @x[8],@x[8],@x[9] eor @x[10],@x[10],@x[11] eor @x[12],@x[12],@x[13] eor @x[14],@x[14],@x[15] stp @x[0],@x[2],[$out,#0] // store output add @d[6],@d[6],#4 // increment counter stp @x[4],@x[6],[$out,#16] stp @x[8],@x[10],[$out,#32] stp @x[12],@x[14],[$out,#48] add $out,$out,#64 b.eq .Ldone_neon sub $len,$len,#64 cmp $len,#64 b.lo .Less_than_128 ld1.8 {$T0-$T3},[$inp],#64 eor $A0,$A0,$T0 eor $B0,$B0,$T1 eor $C0,$C0,$T2 eor $D0,$D0,$T3 st1.8 {$A0-$D0},[$out],#64 b.eq .Ldone_neon sub $len,$len,#64 cmp $len,#64 b.lo .Less_than_192 ld1.8 {$T0-$T3},[$inp],#64 eor $A1,$A1,$T0 eor $B1,$B1,$T1 eor $C1,$C1,$T2 eor $D1,$D1,$T3 st1.8 {$A1-$D1},[$out],#64 b.eq .Ldone_neon sub $len,$len,#64 st1.8 {$A2-$D2},[sp] b .Last_neon .Less_than_128: st1.8 {$A0-$D0},[sp] b .Last_neon .Less_than_192: st1.8 {$A1-$D1},[sp] b .Last_neon .align 4 .Last_neon: sub $out,$out,#1 add $inp,$inp,$len add $out,$out,$len add $ctr,sp,$len neg $len,$len .Loop_tail_neon: ldrb w10,[$inp,$len] ldrb w11,[$ctr,$len] add $len,$len,#1 eor w10,w10,w11 strb w10,[$out,$len] cbnz $len,.Loop_tail_neon stp xzr,xzr,[sp,#0] stp xzr,xzr,[sp,#16] stp xzr,xzr,[sp,#32] stp xzr,xzr,[sp,#48] .Ldone_neon: ldp x19,x20,[x29,#16] add 
sp,sp,#64 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 AARCH64_VALIDATE_LINK_REGISTER ret .size ChaCha20_ctr32_neon,.-ChaCha20_ctr32_neon ___ { my ($T0,$T1,$T2,$T3,$T4,$T5)=@K; my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2, $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23)); $code.=<<___; .type ChaCha20_512_neon,%function .align 5 ChaCha20_512_neon: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-96]! add x29,sp,#0 adrp @x[0],:pg_hi21:.Lsigma add @x[0],@x[0],:lo12:.Lsigma stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] .L512_or_more_neon: sub sp,sp,#128+64 ldp @d[0],@d[1],[@x[0]] // load sigma ld1 {@K[0]},[@x[0]],#16 ldp @d[2],@d[3],[$key] // load key ldp @d[4],@d[5],[$key,#16] ld1 {@K[1],@K[2]},[$key] ldp @d[6],@d[7],[$ctr] // load counter ld1 {@K[3]},[$ctr] ld1 {$ONE},[@x[0]] #ifdef __AARCH64EB__ rev64 @K[0],@K[0] ror @d[2],@d[2],#32 ror @d[3],@d[3],#32 ror @d[4],@d[4],#32 ror @d[5],@d[5],#32 ror @d[6],@d[6],#32 ror @d[7],@d[7],#32 #endif add @K[3],@K[3],$ONE // += 1 stp @K[0],@K[1],[sp,#0] // off-load key block, invariant part add @K[3],@K[3],$ONE // not typo str @K[2],[sp,#32] add @K[4],@K[3],$ONE add @K[5],@K[4],$ONE add @K[6],@K[5],$ONE shl $ONE,$ONE,#2 // 1 -> 4 stp d8,d9,[sp,#128+0] // meet ABI requirements stp d10,d11,[sp,#128+16] stp d12,d13,[sp,#128+32] stp d14,d15,[sp,#128+48] sub $len,$len,#512 // not typo .Loop_outer_512_neon: mov $A0,@K[0] mov $A1,@K[0] mov $A2,@K[0] mov $A3,@K[0] mov $A4,@K[0] mov $A5,@K[0] mov $B0,@K[1] mov.32 @x[0],@d[0] // unpack key block mov $B1,@K[1] lsr @x[1],@d[0],#32 mov $B2,@K[1] mov.32 @x[2],@d[1] mov $B3,@K[1] lsr @x[3],@d[1],#32 mov $B4,@K[1] mov.32 @x[4],@d[2] mov $B5,@K[1] lsr @x[5],@d[2],#32 mov $D0,@K[3] mov.32 @x[6],@d[3] mov $D1,@K[4] lsr @x[7],@d[3],#32 mov $D2,@K[5] mov.32 @x[8],@d[4] mov $D3,@K[6] lsr @x[9],@d[4],#32 mov $C0,@K[2] mov.32 @x[10],@d[5] mov $C1,@K[2] lsr @x[11],@d[5],#32 add $D4,$D0,$ONE // +4 mov.32 @x[12],@d[6] add $D5,$D1,$ONE // +4 lsr @x[13],@d[6],#32 mov $C2,@K[2] mov.32 @x[14],@d[7] mov $C3,@K[2] lsr @x[15],@d[7],#32 mov $C4,@K[2] stp @K[3],@K[4],[sp,#48] // off-load key block, variable part mov $C5,@K[2] str @K[5],[sp,#80] mov $ctr,#5 subs $len,$len,#512 .Loop_upper_neon: sub $ctr,$ctr,#1 ___ my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0); my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0); my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0); my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0); my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0); my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0); my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); my $diff = ($#thread0+1)*6 - $#thread67 - 1; my $i = 0; foreach (@thread0) { eval; eval(shift(@thread67)); eval(shift(@thread1)); eval(shift(@thread67)); eval(shift(@thread2)); eval(shift(@thread67)); eval(shift(@thread3)); eval(shift(@thread67)); eval(shift(@thread4)); eval(shift(@thread67)); eval(shift(@thread5)); eval(shift(@thread67)); } @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1); @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1); @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1); @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1); @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1); @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1); @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); foreach (@thread0) { eval; eval(shift(@thread67)); eval(shift(@thread1)); eval(shift(@thread67)); eval(shift(@thread2)); eval(shift(@thread67)); eval(shift(@thread3)); eval(shift(@thread67)); 
eval(shift(@thread4)); eval(shift(@thread67)); eval(shift(@thread5)); eval(shift(@thread67)); } $code.=<<___; cbnz $ctr,.Loop_upper_neon add.32 @x[0],@x[0],@d[0] // accumulate key block add @x[1],@x[1],@d[0],lsr#32 add.32 @x[2],@x[2],@d[1] add @x[3],@x[3],@d[1],lsr#32 add.32 @x[4],@x[4],@d[2] add @x[5],@x[5],@d[2],lsr#32 add.32 @x[6],@x[6],@d[3] add @x[7],@x[7],@d[3],lsr#32 add.32 @x[8],@x[8],@d[4] add @x[9],@x[9],@d[4],lsr#32 add.32 @x[10],@x[10],@d[5] add @x[11],@x[11],@d[5],lsr#32 add.32 @x[12],@x[12],@d[6] add @x[13],@x[13],@d[6],lsr#32 add.32 @x[14],@x[14],@d[7] add @x[15],@x[15],@d[7],lsr#32 add @x[0],@x[0],@x[1],lsl#32 // pack add @x[2],@x[2],@x[3],lsl#32 ldp @x[1],@x[3],[$inp,#0] // load input add @x[4],@x[4],@x[5],lsl#32 add @x[6],@x[6],@x[7],lsl#32 ldp @x[5],@x[7],[$inp,#16] add @x[8],@x[8],@x[9],lsl#32 add @x[10],@x[10],@x[11],lsl#32 ldp @x[9],@x[11],[$inp,#32] add @x[12],@x[12],@x[13],lsl#32 add @x[14],@x[14],@x[15],lsl#32 ldp @x[13],@x[15],[$inp,#48] add $inp,$inp,#64 #ifdef __AARCH64EB__ rev @x[0],@x[0] rev @x[2],@x[2] rev @x[4],@x[4] rev @x[6],@x[6] rev @x[8],@x[8] rev @x[10],@x[10] rev @x[12],@x[12] rev @x[14],@x[14] #endif eor @x[0],@x[0],@x[1] eor @x[2],@x[2],@x[3] eor @x[4],@x[4],@x[5] eor @x[6],@x[6],@x[7] eor @x[8],@x[8],@x[9] eor @x[10],@x[10],@x[11] eor @x[12],@x[12],@x[13] eor @x[14],@x[14],@x[15] stp @x[0],@x[2],[$out,#0] // store output add @d[6],@d[6],#1 // increment counter mov.32 @x[0],@d[0] // unpack key block lsr @x[1],@d[0],#32 stp @x[4],@x[6],[$out,#16] mov.32 @x[2],@d[1] lsr @x[3],@d[1],#32 stp @x[8],@x[10],[$out,#32] mov.32 @x[4],@d[2] lsr @x[5],@d[2],#32 stp @x[12],@x[14],[$out,#48] add $out,$out,#64 mov.32 @x[6],@d[3] lsr @x[7],@d[3],#32 mov.32 @x[8],@d[4] lsr @x[9],@d[4],#32 mov.32 @x[10],@d[5] lsr @x[11],@d[5],#32 mov.32 @x[12],@d[6] lsr @x[13],@d[6],#32 mov.32 @x[14],@d[7] lsr @x[15],@d[7],#32 mov $ctr,#5 .Loop_lower_neon: sub $ctr,$ctr,#1 ___ @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0); @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0); @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0); @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0); @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0); @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0); @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); foreach (@thread0) { eval; eval(shift(@thread67)); eval(shift(@thread1)); eval(shift(@thread67)); eval(shift(@thread2)); eval(shift(@thread67)); eval(shift(@thread3)); eval(shift(@thread67)); eval(shift(@thread4)); eval(shift(@thread67)); eval(shift(@thread5)); eval(shift(@thread67)); } @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1); @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1); @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1); @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1); @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1); @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1); @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); foreach (@thread0) { eval; eval(shift(@thread67)); eval(shift(@thread1)); eval(shift(@thread67)); eval(shift(@thread2)); eval(shift(@thread67)); eval(shift(@thread3)); eval(shift(@thread67)); eval(shift(@thread4)); eval(shift(@thread67)); eval(shift(@thread5)); eval(shift(@thread67)); } $code.=<<___; cbnz $ctr,.Loop_lower_neon add.32 @x[0],@x[0],@d[0] // accumulate key block ldp @K[0],@K[1],[sp,#0] add @x[1],@x[1],@d[0],lsr#32 ldp @K[2],@K[3],[sp,#32] add.32 @x[2],@x[2],@d[1] ldp @K[4],@K[5],[sp,#64] add @x[3],@x[3],@d[1],lsr#32 add $A0,$A0,@K[0] add.32 @x[4],@x[4],@d[2] add $A1,$A1,@K[0] add @x[5],@x[5],@d[2],lsr#32 add $A2,$A2,@K[0] add.32 @x[6],@x[6],@d[3] add $A3,$A3,@K[0] add @x[7],@x[7],@d[3],lsr#32 add 
$A4,$A4,@K[0] add.32 @x[8],@x[8],@d[4] add $A5,$A5,@K[0] add @x[9],@x[9],@d[4],lsr#32 add $C0,$C0,@K[2] add.32 @x[10],@x[10],@d[5] add $C1,$C1,@K[2] add @x[11],@x[11],@d[5],lsr#32 add $C2,$C2,@K[2] add.32 @x[12],@x[12],@d[6] add $C3,$C3,@K[2] add @x[13],@x[13],@d[6],lsr#32 add $C4,$C4,@K[2] add.32 @x[14],@x[14],@d[7] add $C5,$C5,@K[2] add @x[15],@x[15],@d[7],lsr#32 add $D4,$D4,$ONE // +4 add @x[0],@x[0],@x[1],lsl#32 // pack add $D5,$D5,$ONE // +4 add @x[2],@x[2],@x[3],lsl#32 add $D0,$D0,@K[3] ldp @x[1],@x[3],[$inp,#0] // load input add $D1,$D1,@K[4] add @x[4],@x[4],@x[5],lsl#32 add $D2,$D2,@K[5] add @x[6],@x[6],@x[7],lsl#32 add $D3,$D3,@K[6] ldp @x[5],@x[7],[$inp,#16] add $D4,$D4,@K[3] add @x[8],@x[8],@x[9],lsl#32 add $D5,$D5,@K[4] add @x[10],@x[10],@x[11],lsl#32 add $B0,$B0,@K[1] ldp @x[9],@x[11],[$inp,#32] add $B1,$B1,@K[1] add @x[12],@x[12],@x[13],lsl#32 add $B2,$B2,@K[1] add @x[14],@x[14],@x[15],lsl#32 add $B3,$B3,@K[1] ldp @x[13],@x[15],[$inp,#48] add $B4,$B4,@K[1] add $inp,$inp,#64 add $B5,$B5,@K[1] #ifdef __AARCH64EB__ rev @x[0],@x[0] rev @x[2],@x[2] rev @x[4],@x[4] rev @x[6],@x[6] rev @x[8],@x[8] rev @x[10],@x[10] rev @x[12],@x[12] rev @x[14],@x[14] #endif ld1.8 {$T0-$T3},[$inp],#64 eor @x[0],@x[0],@x[1] eor @x[2],@x[2],@x[3] eor @x[4],@x[4],@x[5] eor @x[6],@x[6],@x[7] eor @x[8],@x[8],@x[9] eor $A0,$A0,$T0 eor @x[10],@x[10],@x[11] eor $B0,$B0,$T1 eor @x[12],@x[12],@x[13] eor $C0,$C0,$T2 eor @x[14],@x[14],@x[15] eor $D0,$D0,$T3 ld1.8 {$T0-$T3},[$inp],#64 stp @x[0],@x[2],[$out,#0] // store output add @d[6],@d[6],#7 // increment counter stp @x[4],@x[6],[$out,#16] stp @x[8],@x[10],[$out,#32] stp @x[12],@x[14],[$out,#48] add $out,$out,#64 st1.8 {$A0-$D0},[$out],#64 ld1.8 {$A0-$D0},[$inp],#64 eor $A1,$A1,$T0 eor $B1,$B1,$T1 eor $C1,$C1,$T2 eor $D1,$D1,$T3 st1.8 {$A1-$D1},[$out],#64 ld1.8 {$A1-$D1},[$inp],#64 eor $A2,$A2,$A0 ldp @K[0],@K[1],[sp,#0] eor $B2,$B2,$B0 ldp @K[2],@K[3],[sp,#32] eor $C2,$C2,$C0 eor $D2,$D2,$D0 st1.8 {$A2-$D2},[$out],#64 ld1.8 {$A2-$D2},[$inp],#64 eor $A3,$A3,$A1 eor $B3,$B3,$B1 eor $C3,$C3,$C1 eor $D3,$D3,$D1 st1.8 {$A3-$D3},[$out],#64 ld1.8 {$A3-$D3},[$inp],#64 eor $A4,$A4,$A2 eor $B4,$B4,$B2 eor $C4,$C4,$C2 eor $D4,$D4,$D2 st1.8 {$A4-$D4},[$out],#64 shl $A0,$ONE,#1 // 4 -> 8 eor $A5,$A5,$A3 eor $B5,$B5,$B3 eor $C5,$C5,$C3 eor $D5,$D5,$D3 st1.8 {$A5-$D5},[$out],#64 add @K[3],@K[3],$A0 // += 8 add @K[4],@K[4],$A0 add @K[5],@K[5],$A0 add @K[6],@K[6],$A0 b.hs .Loop_outer_512_neon adds $len,$len,#512 ushr $A0,$ONE,#2 // 4 -> 1 ldp d8,d9,[sp,#128+0] // meet ABI requirements ldp d10,d11,[sp,#128+16] ldp d12,d13,[sp,#128+32] ldp d14,d15,[sp,#128+48] stp @K[0],$ONE,[sp,#0] // wipe off-load area stp @K[0],$ONE,[sp,#32] stp @K[0],$ONE,[sp,#64] b.eq .Ldone_512_neon cmp $len,#192 sub @K[3],@K[3],$A0 // -= 1 sub @K[4],@K[4],$A0 sub @K[5],@K[5],$A0 add sp,sp,#128 b.hs .Loop_outer_neon eor @K[1],@K[1],@K[1] eor @K[2],@K[2],@K[2] eor @K[3],@K[3],@K[3] eor @K[4],@K[4],@K[4] eor @K[5],@K[5],@K[5] eor @K[6],@K[6],@K[6] b .Loop_outer .Ldone_512_neon: ldp x19,x20,[x29,#16] add sp,sp,#128+64 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 AARCH64_VALIDATE_LINK_REGISTER ret .size ChaCha20_512_neon,.-ChaCha20_512_neon ___ } }}} foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/geo; (s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1)) or (m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1)) or (s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1)) or (m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1)) or 
(s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1)); #s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo; print $_,"\n"; } close STDOUT or die "error closing STDOUT: $!"; # flush ring-0.17.14/crypto/chacha/asm/chacha-x86.pl000064400000000000000000000332141046102023000164360ustar 00000000000000#! /usr/bin/env perl # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. # ==================================================================== # # January 2015 # # ChaCha20 for x86. # # Performance in cycles per byte out of large buffer. # # 1xIALU/gcc 4xSSSE3 # Pentium 17.5/+80% # PIII 14.2/+60% # P4 18.6/+84% # Core2 9.56/+89% 4.83 # Westmere 9.50/+45% 3.35 # Sandy Bridge 10.5/+47% 3.20 # Haswell 8.15/+50% 2.83 # Skylake 7.53/+22% 2.75 # Silvermont 17.4/+36% 8.35 # Goldmont 13.4/+40% 4.36 # Sledgehammer 10.2/+54% # Bulldozer 13.4/+50% 4.38(*) # # (*) Bulldozer actually executes 4xXOP code path that delivers 3.55; # # Modified from upstream OpenSSL to remove the XOP code. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; $output=pop; open STDOUT,">$output"; &asm_init($ARGV[0]); $xmm=$ymm=1; $gasver=999; # enable everything $a="eax"; ($b,$b_)=("ebx","ebp"); ($c,$c_)=("ecx","esi"); ($d,$d_)=("edx","edi"); &static_label("ssse3_data"); &static_label("pic_point"); if ($xmm) { my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7)); my ($out,$inp,$len)=("edi","esi","ecx"); sub QUARTERROUND_SSSE3 { my ($ai,$bi,$ci,$di,$i)=@_; my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di)); # next my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di)); # previous # a b c d # # 0 4 8 12 < even round # 1 5 9 13 # 2 6 10 14 # 3 7 11 15 # 0 5 10 15 < odd round # 1 6 11 12 # 2 7 8 13 # 3 4 9 14 if ($i==0) { my $j=4; ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp)); } elsif ($i==3) { my $j=0; ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn)); } elsif ($i==4) { my $j=4; ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp)); } elsif ($i==7) { my $j=0; ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn)); } #&paddd ($xa,$xb); # see elsewhere #&pxor ($xd,$xa); # see elsewhere &movdqa(&QWP(16*$cp-128,"ebx"),$xc_) if ($ai>0 && $ai<3); &pshufb ($xd,&QWP(0,"eax")); # rot16 &movdqa(&QWP(16*$bp-128,"ebx"),$xb_) if ($i!=0); &paddd ($xc,$xd); &movdqa($xc_,&QWP(16*$cn-128,"ebx")) if ($ai>0 && $ai<3); &pxor ($xb,$xc); &movdqa($xb_,&QWP(16*$bn-128,"ebx")) if ($i<7); &movdqa ($xa_,$xb); # borrow as temporary &pslld ($xb,12); &psrld ($xa_,20); &por ($xb,$xa_); &movdqa($xa_,&QWP(16*$an-128,"ebx")); &paddd ($xa,$xb); &movdqa($xd_,&QWP(16*$dn-128,"ebx")) if ($di!=$dn); &pxor ($xd,$xa); &movdqa (&QWP(16*$ai-128,"ebx"),$xa); &pshufb ($xd,&QWP(16,"eax")); # rot8 &paddd ($xc,$xd); &movdqa (&QWP(16*$di-128,"ebx"),$xd) if ($di!=$dn); &movdqa ($xd_,$xd) 
if ($di==$dn); &pxor ($xb,$xc); &paddd ($xa_,$xb_) if ($i<7); # elsewhere &movdqa ($xa,$xb); # borrow as temporary &pslld ($xb,7); &psrld ($xa,25); &pxor ($xd_,$xa_) if ($i<7); # elsewhere &por ($xb,$xa); ($xa,$xa_)=($xa_,$xa); ($xb,$xb_)=($xb_,$xb); ($xc,$xc_)=($xc_,$xc); ($xd,$xd_)=($xd_,$xd); } &function_begin("ChaCha20_ctr32_ssse3"); &call (&label("pic_point")); &set_label("pic_point"); &blindpop("eax"); &mov ($out,&wparam(0)); &mov ($inp,&wparam(1)); &mov ($len,&wparam(2)); &mov ("edx",&wparam(3)); # key &mov ("ebx",&wparam(4)); # counter and nonce &mov ("ebp","esp"); &stack_push (131); &and ("esp",-64); &mov (&DWP(512,"esp"),"ebp"); &lea ("eax",&DWP(&label("ssse3_data")."-". &label("pic_point"),"eax")); &movdqu ("xmm3",&QWP(0,"ebx")); # counter and nonce if (defined($gasver) && $gasver>=2.17) { # even though we encode # pshufb manually, we # handle only register # operands, while this # segment uses memory # operand... &cmp ($len,64*4); &jb (&label("1x")); &mov (&DWP(512+4,"esp"),"edx"); # offload pointers &mov (&DWP(512+8,"esp"),"ebx"); &sub ($len,64*4); # bias len &lea ("ebp",&DWP(256+128,"esp")); # size optimization &movdqu ("xmm7",&QWP(0,"edx")); # key &pshufd ("xmm0","xmm3",0x00); &pshufd ("xmm1","xmm3",0x55); &pshufd ("xmm2","xmm3",0xaa); &pshufd ("xmm3","xmm3",0xff); &paddd ("xmm0",&QWP(16*3,"eax")); # fix counters &pshufd ("xmm4","xmm7",0x00); &pshufd ("xmm5","xmm7",0x55); &psubd ("xmm0",&QWP(16*4,"eax")); &pshufd ("xmm6","xmm7",0xaa); &pshufd ("xmm7","xmm7",0xff); &movdqa (&QWP(16*12-128,"ebp"),"xmm0"); &movdqa (&QWP(16*13-128,"ebp"),"xmm1"); &movdqa (&QWP(16*14-128,"ebp"),"xmm2"); &movdqa (&QWP(16*15-128,"ebp"),"xmm3"); &movdqu ("xmm3",&QWP(16,"edx")); # key &movdqa (&QWP(16*4-128,"ebp"),"xmm4"); &movdqa (&QWP(16*5-128,"ebp"),"xmm5"); &movdqa (&QWP(16*6-128,"ebp"),"xmm6"); &movdqa (&QWP(16*7-128,"ebp"),"xmm7"); &movdqa ("xmm7",&QWP(16*2,"eax")); # sigma &lea ("ebx",&DWP(128,"esp")); # size optimization &pshufd ("xmm0","xmm3",0x00); &pshufd ("xmm1","xmm3",0x55); &pshufd ("xmm2","xmm3",0xaa); &pshufd ("xmm3","xmm3",0xff); &pshufd ("xmm4","xmm7",0x00); &pshufd ("xmm5","xmm7",0x55); &pshufd ("xmm6","xmm7",0xaa); &pshufd ("xmm7","xmm7",0xff); &movdqa (&QWP(16*8-128,"ebp"),"xmm0"); &movdqa (&QWP(16*9-128,"ebp"),"xmm1"); &movdqa (&QWP(16*10-128,"ebp"),"xmm2"); &movdqa (&QWP(16*11-128,"ebp"),"xmm3"); &movdqa (&QWP(16*0-128,"ebp"),"xmm4"); &movdqa (&QWP(16*1-128,"ebp"),"xmm5"); &movdqa (&QWP(16*2-128,"ebp"),"xmm6"); &movdqa (&QWP(16*3-128,"ebp"),"xmm7"); &lea ($inp,&DWP(128,$inp)); # size optimization &lea ($out,&DWP(128,$out)); # size optimization &jmp (&label("outer_loop")); &set_label("outer_loop",16); #&movdqa ("xmm0",&QWP(16*0-128,"ebp")); # copy key material &movdqa ("xmm1",&QWP(16*1-128,"ebp")); &movdqa ("xmm2",&QWP(16*2-128,"ebp")); &movdqa ("xmm3",&QWP(16*3-128,"ebp")); #&movdqa ("xmm4",&QWP(16*4-128,"ebp")); &movdqa ("xmm5",&QWP(16*5-128,"ebp")); &movdqa ("xmm6",&QWP(16*6-128,"ebp")); &movdqa ("xmm7",&QWP(16*7-128,"ebp")); #&movdqa (&QWP(16*0-128,"ebx"),"xmm0"); &movdqa (&QWP(16*1-128,"ebx"),"xmm1"); &movdqa (&QWP(16*2-128,"ebx"),"xmm2"); &movdqa (&QWP(16*3-128,"ebx"),"xmm3"); #&movdqa (&QWP(16*4-128,"ebx"),"xmm4"); &movdqa (&QWP(16*5-128,"ebx"),"xmm5"); &movdqa (&QWP(16*6-128,"ebx"),"xmm6"); &movdqa (&QWP(16*7-128,"ebx"),"xmm7"); #&movdqa ("xmm0",&QWP(16*8-128,"ebp")); #&movdqa ("xmm1",&QWP(16*9-128,"ebp")); &movdqa ("xmm2",&QWP(16*10-128,"ebp")); &movdqa ("xmm3",&QWP(16*11-128,"ebp")); &movdqa ("xmm4",&QWP(16*12-128,"ebp")); &movdqa ("xmm5",&QWP(16*13-128,"ebp")); 
&movdqa ("xmm6",&QWP(16*14-128,"ebp")); &movdqa ("xmm7",&QWP(16*15-128,"ebp")); &paddd ("xmm4",&QWP(16*4,"eax")); # counter value #&movdqa (&QWP(16*8-128,"ebx"),"xmm0"); #&movdqa (&QWP(16*9-128,"ebx"),"xmm1"); &movdqa (&QWP(16*10-128,"ebx"),"xmm2"); &movdqa (&QWP(16*11-128,"ebx"),"xmm3"); &movdqa (&QWP(16*12-128,"ebx"),"xmm4"); &movdqa (&QWP(16*13-128,"ebx"),"xmm5"); &movdqa (&QWP(16*14-128,"ebx"),"xmm6"); &movdqa (&QWP(16*15-128,"ebx"),"xmm7"); &movdqa (&QWP(16*12-128,"ebp"),"xmm4"); # save counter value &movdqa ($xa, &QWP(16*0-128,"ebp")); &movdqa ($xd, "xmm4"); &movdqa ($xb_,&QWP(16*4-128,"ebp")); &movdqa ($xc, &QWP(16*8-128,"ebp")); &movdqa ($xc_,&QWP(16*9-128,"ebp")); &mov ("edx",10); # loop counter &nop (); &set_label("loop",16); &paddd ($xa,$xb_); # elsewhere &movdqa ($xb,$xb_); &pxor ($xd,$xa); # elsewhere &QUARTERROUND_SSSE3(0, 4, 8, 12, 0); &QUARTERROUND_SSSE3(1, 5, 9, 13, 1); &QUARTERROUND_SSSE3(2, 6,10, 14, 2); &QUARTERROUND_SSSE3(3, 7,11, 15, 3); &QUARTERROUND_SSSE3(0, 5,10, 15, 4); &QUARTERROUND_SSSE3(1, 6,11, 12, 5); &QUARTERROUND_SSSE3(2, 7, 8, 13, 6); &QUARTERROUND_SSSE3(3, 4, 9, 14, 7); &dec ("edx"); &jnz (&label("loop")); &movdqa (&QWP(16*4-128,"ebx"),$xb_); &movdqa (&QWP(16*8-128,"ebx"),$xc); &movdqa (&QWP(16*9-128,"ebx"),$xc_); &movdqa (&QWP(16*12-128,"ebx"),$xd); &movdqa (&QWP(16*14-128,"ebx"),$xd_); my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7)); #&movdqa ($xa0,&QWP(16*0-128,"ebx")); # it's there &movdqa ($xa1,&QWP(16*1-128,"ebx")); &movdqa ($xa2,&QWP(16*2-128,"ebx")); &movdqa ($xa3,&QWP(16*3-128,"ebx")); for($i=0;$i<256;$i+=64) { &paddd ($xa0,&QWP($i+16*0-128,"ebp")); # accumulate key material &paddd ($xa1,&QWP($i+16*1-128,"ebp")); &paddd ($xa2,&QWP($i+16*2-128,"ebp")); &paddd ($xa3,&QWP($i+16*3-128,"ebp")); &movdqa ($xt2,$xa0); # "de-interlace" data &punpckldq ($xa0,$xa1); &movdqa ($xt3,$xa2); &punpckldq ($xa2,$xa3); &punpckhdq ($xt2,$xa1); &punpckhdq ($xt3,$xa3); &movdqa ($xa1,$xa0); &punpcklqdq ($xa0,$xa2); # "a0" &movdqa ($xa3,$xt2); &punpcklqdq ($xt2,$xt3); # "a2" &punpckhqdq ($xa1,$xa2); # "a1" &punpckhqdq ($xa3,$xt3); # "a3" #($xa2,$xt2)=($xt2,$xa2); &movdqu ($xt0,&QWP(64*0-128,$inp)); # load input &movdqu ($xt1,&QWP(64*1-128,$inp)); &movdqu ($xa2,&QWP(64*2-128,$inp)); &movdqu ($xt3,&QWP(64*3-128,$inp)); &lea ($inp,&QWP($i<192?16:(64*4-16*3),$inp)); &pxor ($xt0,$xa0); &movdqa ($xa0,&QWP($i+16*4-128,"ebx")) if ($i<192); &pxor ($xt1,$xa1); &movdqa ($xa1,&QWP($i+16*5-128,"ebx")) if ($i<192); &pxor ($xt2,$xa2); &movdqa ($xa2,&QWP($i+16*6-128,"ebx")) if ($i<192); &pxor ($xt3,$xa3); &movdqa ($xa3,&QWP($i+16*7-128,"ebx")) if ($i<192); &movdqu (&QWP(64*0-128,$out),$xt0); # store output &movdqu (&QWP(64*1-128,$out),$xt1); &movdqu (&QWP(64*2-128,$out),$xt2); &movdqu (&QWP(64*3-128,$out),$xt3); &lea ($out,&QWP($i<192?16:(64*4-16*3),$out)); } &sub ($len,64*4); &jnc (&label("outer_loop")); &add ($len,64*4); &jz (&label("done")); &mov ("ebx",&DWP(512+8,"esp")); # restore pointers &lea ($inp,&DWP(-128,$inp)); &mov ("edx",&DWP(512+4,"esp")); &lea ($out,&DWP(-128,$out)); &movd ("xmm2",&DWP(16*12-128,"ebp")); # counter value &movdqu ("xmm3",&QWP(0,"ebx")); &paddd ("xmm2",&QWP(16*6,"eax")); # +four &pand ("xmm3",&QWP(16*7,"eax")); &por ("xmm3","xmm2"); # counter value } { my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7)); sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round &paddd ($a,$b); &pxor ($d,$a); &pshufb ($d,$rot16); &paddd ($c,$d); &pxor ($b,$c); &movdqa ($t,$b); &psrld ($b,20); &pslld ($t,12); &por ($b,$t); &paddd ($a,$b); &pxor 
($d,$a); &pshufb ($d,$rot24); &paddd ($c,$d); &pxor ($b,$c); &movdqa ($t,$b); &psrld ($b,25); &pslld ($t,7); &por ($b,$t); } &set_label("1x"); &movdqa ($a,&QWP(16*2,"eax")); # sigma &movdqu ($b,&QWP(0,"edx")); &movdqu ($c,&QWP(16,"edx")); #&movdqu ($d,&QWP(0,"ebx")); # already loaded &movdqa ($rot16,&QWP(0,"eax")); &movdqa ($rot24,&QWP(16,"eax")); &mov (&DWP(16*3,"esp"),"ebp"); &movdqa (&QWP(16*0,"esp"),$a); &movdqa (&QWP(16*1,"esp"),$b); &movdqa (&QWP(16*2,"esp"),$c); &movdqa (&QWP(16*3,"esp"),$d); &mov ("edx",10); &jmp (&label("loop1x")); &set_label("outer1x",16); &movdqa ($d,&QWP(16*5,"eax")); # one &movdqa ($a,&QWP(16*0,"esp")); &movdqa ($b,&QWP(16*1,"esp")); &movdqa ($c,&QWP(16*2,"esp")); &paddd ($d,&QWP(16*3,"esp")); &mov ("edx",10); &movdqa (&QWP(16*3,"esp"),$d); &jmp (&label("loop1x")); &set_label("loop1x",16); &SSSE3ROUND(); &pshufd ($c,$c,0b01001110); &pshufd ($b,$b,0b00111001); &pshufd ($d,$d,0b10010011); &nop (); &SSSE3ROUND(); &pshufd ($c,$c,0b01001110); &pshufd ($b,$b,0b10010011); &pshufd ($d,$d,0b00111001); &dec ("edx"); &jnz (&label("loop1x")); &paddd ($a,&QWP(16*0,"esp")); &paddd ($b,&QWP(16*1,"esp")); &paddd ($c,&QWP(16*2,"esp")); &paddd ($d,&QWP(16*3,"esp")); &cmp ($len,64); &jb (&label("tail")); &movdqu ($t,&QWP(16*0,$inp)); &movdqu ($t1,&QWP(16*1,$inp)); &pxor ($a,$t); # xor with input &movdqu ($t,&QWP(16*2,$inp)); &pxor ($b,$t1); &movdqu ($t1,&QWP(16*3,$inp)); &pxor ($c,$t); &pxor ($d,$t1); &lea ($inp,&DWP(16*4,$inp)); # inp+=64 &movdqu (&QWP(16*0,$out),$a); # write output &movdqu (&QWP(16*1,$out),$b); &movdqu (&QWP(16*2,$out),$c); &movdqu (&QWP(16*3,$out),$d); &lea ($out,&DWP(16*4,$out)); # inp+=64 &sub ($len,64); &jnz (&label("outer1x")); &jmp (&label("done")); &set_label("tail"); &movdqa (&QWP(16*0,"esp"),$a); &movdqa (&QWP(16*1,"esp"),$b); &movdqa (&QWP(16*2,"esp"),$c); &movdqa (&QWP(16*3,"esp"),$d); &xor ("eax","eax"); &xor ("edx","edx"); &xor ("ebp","ebp"); &set_label("tail_loop"); &movb ("al",&BP(0,"esp","ebp")); &movb ("dl",&BP(0,$inp,"ebp")); &lea ("ebp",&DWP(1,"ebp")); &xor ("al","dl"); &movb (&BP(-1,$out,"ebp"),"al"); &dec ($len); &jnz (&label("tail_loop")); } &set_label("done"); &mov ("esp",&DWP(512,"esp")); &function_end("ChaCha20_ctr32_ssse3"); &align (64); &set_label("ssse3_data"); &data_byte(0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd); &data_byte(0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe); &data_word(0x61707865,0x3320646e,0x79622d32,0x6b206574); &data_word(0,1,2,3); &data_word(4,4,4,4); &data_word(1,0,0,0); &data_word(4,0,0,0); &data_word(0,-1,-1,-1); &align (64); } &asciz ("ChaCha20 for x86, CRYPTOGAMS by "); &asm_finish(); close STDOUT or die "error closing STDOUT: $!"; ring-0.17.14/crypto/chacha/asm/chacha-x86_64.pl000064400000000000000000001331441046102023000167520ustar 00000000000000#! /usr/bin/env perl # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. # ==================================================================== # # November 2014 # # ChaCha20 for x86_64. # # December 2016 # # Add AVX512F code path. # # Performance in cycles per byte out of large buffer. # # IALU/gcc 4.8(i) 1xSSSE3/SSE2 4xSSSE3 NxAVX(v) # # P4 9.48/+99% -/22.7(ii) - # Core2 7.83/+55% 7.90/8.08 4.35 # Westmere 7.19/+50% 5.60/6.70 3.00 # Sandy Bridge 8.31/+42% 5.45/6.76 2.72 # Ivy Bridge 6.71/+46% 5.40/6.49 2.41 # Haswell 5.92/+43% 5.20/6.45 2.42 1.23 # Skylake[-X] 5.87/+39% 4.70/- 2.31 1.19[0.57] # Silvermont 12.0/+33% 7.75/7.40 7.03(iii) # Knights L 11.7/- - 9.60(iii) 0.80 # Goldmont 10.6/+17% 5.10/- 3.28 # Sledgehammer 7.28/+52% -/14.2(ii) - # Bulldozer 9.66/+28% 9.85/11.1 3.06(iv) # Ryzen 5.96/+50% 5.19/- 2.40 2.09 # VIA Nano 10.5/+46% 6.72/8.60 6.05 # # (i) compared to older gcc 3.x one can observe >2x improvement on # most platforms; # (ii) as it can be seen, SSE2 performance is too low on legacy # processors; NxSSE2 results are naturally better, but not # impressively better than IALU ones, which is why you won't # find SSE2 code below; # (iii) this is not optimal result for Atom because of MSROM # limitations, SSE2 can do better, but gain is considered too # low to justify the [maintenance] effort; # (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20; # # Modified from upstream OpenSSL to remove the XOP code. $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; $avx = 2; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; # input parameter block ($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8"); $code.=<<___; .text .section .rodata .align 64 .Lzero: .long 0,0,0,0 .Lone: .long 1,0,0,0 .Linc: .long 0,1,2,3 .Lfour: .long 4,4,4,4 .Lincy: .long 0,2,4,6,1,3,5,7 .Leight: .long 8,8,8,8,8,8,8,8 .Lrot16: .byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd .Lrot24: .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe .Lsigma: .asciz "expand 32-byte k" .align 64 .Lzeroz: .long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 .Lfourz: .long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 .Lincz: .long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 .Lsixteen: .long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 .asciz "ChaCha20 for x86_64, CRYPTOGAMS by " .text ___ sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; my $arg = pop; $arg = "\$$arg" if ($arg*1 eq $arg); $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; } @x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)), "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15))); @t=("%esi","%edi"); sub ROUND { # critical path is 24 cycles per round my ($a0,$b0,$c0,$d0)=@_; my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); my ($xc,$xc_)=map("\"$_\"",@t); my @x=map("\"$_\"",@x); # Consider order in which variables are addressed by their # index: # # a b c d # # 0 4 8 12 < even round # 1 5 9 13 # 2 6 10 14 # 3 7 11 15 # 0 5 10 15 < odd round # 
1 6 11 12 # 2 7 8 13 # 3 4 9 14 # # 'a', 'b' and 'd's are permanently allocated in registers, # @x[0..7,12..15], while 'c's are maintained in memory. If # you observe 'c' column, you'll notice that pair of 'c's is # invariant between rounds. This means that we have to reload # them once per round, in the middle. This is why you'll see # bunch of 'c' stores and loads in the middle, but none in # the beginning or end. # Normally instructions would be interleaved to favour in-order # execution. Generally out-of-order cores manage it gracefully, # but not this time for some reason. As in-order execution # cores are dying breed, old Atom is the only one around, # instructions are left uninterleaved. Besides, Atom is better # off executing 1xSSSE3 code anyway... ( "&add (@x[$a0],@x[$b0])", # Q1 "&xor (@x[$d0],@x[$a0])", "&rol (@x[$d0],16)", "&add (@x[$a1],@x[$b1])", # Q2 "&xor (@x[$d1],@x[$a1])", "&rol (@x[$d1],16)", "&add ($xc,@x[$d0])", "&xor (@x[$b0],$xc)", "&rol (@x[$b0],12)", "&add ($xc_,@x[$d1])", "&xor (@x[$b1],$xc_)", "&rol (@x[$b1],12)", "&add (@x[$a0],@x[$b0])", "&xor (@x[$d0],@x[$a0])", "&rol (@x[$d0],8)", "&add (@x[$a1],@x[$b1])", "&xor (@x[$d1],@x[$a1])", "&rol (@x[$d1],8)", "&add ($xc,@x[$d0])", "&xor (@x[$b0],$xc)", "&rol (@x[$b0],7)", "&add ($xc_,@x[$d1])", "&xor (@x[$b1],$xc_)", "&rol (@x[$b1],7)", "&mov (\"4*$c0(%rsp)\",$xc)", # reload pair of 'c's "&mov (\"4*$c1(%rsp)\",$xc_)", "&mov ($xc,\"4*$c2(%rsp)\")", "&mov ($xc_,\"4*$c3(%rsp)\")", "&add (@x[$a2],@x[$b2])", # Q3 "&xor (@x[$d2],@x[$a2])", "&rol (@x[$d2],16)", "&add (@x[$a3],@x[$b3])", # Q4 "&xor (@x[$d3],@x[$a3])", "&rol (@x[$d3],16)", "&add ($xc,@x[$d2])", "&xor (@x[$b2],$xc)", "&rol (@x[$b2],12)", "&add ($xc_,@x[$d3])", "&xor (@x[$b3],$xc_)", "&rol (@x[$b3],12)", "&add (@x[$a2],@x[$b2])", "&xor (@x[$d2],@x[$a2])", "&rol (@x[$d2],8)", "&add (@x[$a3],@x[$b3])", "&xor (@x[$d3],@x[$a3])", "&rol (@x[$d3],8)", "&add ($xc,@x[$d2])", "&xor (@x[$b2],$xc)", "&rol (@x[$b2],7)", "&add ($xc_,@x[$d3])", "&xor (@x[$b3],$xc_)", "&rol (@x[$b3],7)" ); } ######################################################################## # Generic code path that handles all lengths on pre-SSSE3 processors. 
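# Editor's note: the interleaved scalar ROUND above is easier to follow against a
# plain, single-lane reference. The sketch below is illustrative only; it is not
# part of the generated code and "ref_quarter_round" is a hypothetical helper. It
# performs one ChaCha20 quarter-round over a 16-word state with the same
# add/xor/rotate schedule (16, 12, 8, 7) that ROUND interleaves two lanes at a
# time across @x, and each .Loop iteration runs the four column quarter-rounds
# (&ROUND(0,4,8,12)) followed by the four diagonal ones (&ROUND(0,5,10,15)).
sub ref_quarter_round {
	my ($st, $a, $b, $c, $d) = @_;		# $st is an array ref of 16 u32 words
	my $rotl = sub { my ($v, $n) = @_; (($v << $n) | ($v >> (32 - $n))) & 0xffffffff };
	$st->[$a] = ($st->[$a] + $st->[$b]) & 0xffffffff; $st->[$d] = $rotl->($st->[$d] ^ $st->[$a], 16);
	$st->[$c] = ($st->[$c] + $st->[$d]) & 0xffffffff; $st->[$b] = $rotl->($st->[$b] ^ $st->[$c], 12);
	$st->[$a] = ($st->[$a] + $st->[$b]) & 0xffffffff; $st->[$d] = $rotl->($st->[$d] ^ $st->[$a], 8);
	$st->[$c] = ($st->[$c] + $st->[$d]) & 0xffffffff; $st->[$b] = $rotl->($st->[$b] ^ $st->[$c], 7);
}
# This helper is never called; it only documents what each interleaved group of
# scalar instructions in ROUND computes for a single lane.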
$code.=<<___; .globl ChaCha20_ctr32_nohw .type ChaCha20_ctr32_nohw,\@function,5 .align 64 ChaCha20_ctr32_nohw: .cfi_startproc _CET_ENDBR push %rbx .cfi_push rbx push %rbp .cfi_push rbp push %r12 .cfi_push r12 push %r13 .cfi_push r13 push %r14 .cfi_push r14 push %r15 .cfi_push r15 sub \$64+24,%rsp .cfi_adjust_cfa_offset `64+24` .Lctr32_body: #movdqa .Lsigma(%rip),%xmm0 movdqu ($key),%xmm1 movdqu 16($key),%xmm2 movdqu ($counter),%xmm3 movdqa .Lone(%rip),%xmm4 #movdqa %xmm0,4*0(%rsp) # key[0] movdqa %xmm1,4*4(%rsp) # key[1] movdqa %xmm2,4*8(%rsp) # key[2] movdqa %xmm3,4*12(%rsp) # key[3] mov $len,%rbp # reassign $len jmp .Loop_outer .align 32 .Loop_outer: mov \$0x61707865,@x[0] # 'expa' mov \$0x3320646e,@x[1] # 'nd 3' mov \$0x79622d32,@x[2] # '2-by' mov \$0x6b206574,@x[3] # 'te k' mov 4*4(%rsp),@x[4] mov 4*5(%rsp),@x[5] mov 4*6(%rsp),@x[6] mov 4*7(%rsp),@x[7] movd %xmm3,@x[12] mov 4*13(%rsp),@x[13] mov 4*14(%rsp),@x[14] mov 4*15(%rsp),@x[15] mov %rbp,64+0(%rsp) # save len mov \$10,%ebp mov $inp,64+8(%rsp) # save inp movq %xmm2,%rsi # "@x[8]" mov $out,64+16(%rsp) # save out mov %rsi,%rdi shr \$32,%rdi # "@x[9]" jmp .Loop .align 32 .Loop: ___ foreach (&ROUND (0, 4, 8,12)) { eval; } foreach (&ROUND (0, 5,10,15)) { eval; } &dec ("%ebp"); &jnz (".Loop"); $code.=<<___; mov @t[1],4*9(%rsp) # modulo-scheduled mov @t[0],4*8(%rsp) mov 64(%rsp),%rbp # load len movdqa %xmm2,%xmm1 mov 64+8(%rsp),$inp # load inp paddd %xmm4,%xmm3 # increment counter mov 64+16(%rsp),$out # load out add \$0x61707865,@x[0] # 'expa' add \$0x3320646e,@x[1] # 'nd 3' add \$0x79622d32,@x[2] # '2-by' add \$0x6b206574,@x[3] # 'te k' add 4*4(%rsp),@x[4] add 4*5(%rsp),@x[5] add 4*6(%rsp),@x[6] add 4*7(%rsp),@x[7] add 4*12(%rsp),@x[12] add 4*13(%rsp),@x[13] add 4*14(%rsp),@x[14] add 4*15(%rsp),@x[15] paddd 4*8(%rsp),%xmm1 cmp \$64,%rbp jb .Ltail xor 4*0($inp),@x[0] # xor with input xor 4*1($inp),@x[1] xor 4*2($inp),@x[2] xor 4*3($inp),@x[3] xor 4*4($inp),@x[4] xor 4*5($inp),@x[5] xor 4*6($inp),@x[6] xor 4*7($inp),@x[7] movdqu 4*8($inp),%xmm0 xor 4*12($inp),@x[12] xor 4*13($inp),@x[13] xor 4*14($inp),@x[14] xor 4*15($inp),@x[15] lea 4*16($inp),$inp # inp+=64 pxor %xmm1,%xmm0 movdqa %xmm2,4*8(%rsp) movd %xmm3,4*12(%rsp) mov @x[0],4*0($out) # write output mov @x[1],4*1($out) mov @x[2],4*2($out) mov @x[3],4*3($out) mov @x[4],4*4($out) mov @x[5],4*5($out) mov @x[6],4*6($out) mov @x[7],4*7($out) movdqu %xmm0,4*8($out) mov @x[12],4*12($out) mov @x[13],4*13($out) mov @x[14],4*14($out) mov @x[15],4*15($out) lea 4*16($out),$out # out+=64 sub \$64,%rbp jnz .Loop_outer jmp .Ldone .align 16 .Ltail: mov @x[0],4*0(%rsp) mov @x[1],4*1(%rsp) xor %rbx,%rbx mov @x[2],4*2(%rsp) mov @x[3],4*3(%rsp) mov @x[4],4*4(%rsp) mov @x[5],4*5(%rsp) mov @x[6],4*6(%rsp) mov @x[7],4*7(%rsp) movdqa %xmm1,4*8(%rsp) mov @x[12],4*12(%rsp) mov @x[13],4*13(%rsp) mov @x[14],4*14(%rsp) mov @x[15],4*15(%rsp) .Loop_tail: movzb ($inp,%rbx),%eax movzb (%rsp,%rbx),%edx lea 1(%rbx),%rbx xor %edx,%eax mov %al,-1($out,%rbx) dec %rbp jnz .Loop_tail .Ldone: lea 64+24+48(%rsp),%rsi mov -48(%rsi),%r15 .cfi_restore r15 mov -40(%rsi),%r14 .cfi_restore r14 mov -32(%rsi),%r13 .cfi_restore r13 mov -24(%rsi),%r12 .cfi_restore r12 mov -16(%rsi),%rbp .cfi_restore rbp mov -8(%rsi),%rbx .cfi_restore rbx lea (%rsi),%rsp .cfi_adjust_cfa_offset `-64-24-48` .Lno_data: ret .cfi_endproc .size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw ___ ######################################################################## # SSSE3 code path that handles longer messages. 
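# Editor's note: the 4x SSSE3 path below processes four ChaCha20 blocks at once in
# structure-of-arrays form: each %xmm register holds one state word taken from four
# consecutive blocks ("smash key by lanes"), and only the 32-bit counter word
# differs per lane (.Linc adds 0,1,2,3). The sketch below is illustrative only and
# not part of the generated code ("ref_smash_lanes" is a hypothetical helper); it
# shows the lane layout that the "de-interlace" transposition later undoes before
# the keystream is xor-ed with the input.
sub ref_smash_lanes {
	my ($state) = @_;			# array ref of 16 u32 words for block N
	my @lanes;
	for my $word (0 .. 15) {
		# lane vector for this word: the same word in blocks N..N+3;
		# only word 12 (the block counter) changes from lane to lane.
		$lanes[$word] = [ map {
			$word == 12 ? ($state->[12] + $_) & 0xffffffff : $state->[$word]
		} 0 .. 3 ];
	}
	return \@lanes;				# 16 "SIMD registers" of 4 lanes each
}
# This helper is never called; it only documents the lane layout built by the
# pshufd/paddd "smash key by lanes" sequence in ChaCha20_ctr32_ssse3_4x.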
{ # assign variables to favor Atom front-end my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3, $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15)); my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3); sub SSSE3_lane_ROUND { my ($a0,$b0,$c0,$d0)=@_; my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3); my @x=map("\"$_\"",@xx); # Consider order in which variables are addressed by their # index: # # a b c d # # 0 4 8 12 < even round # 1 5 9 13 # 2 6 10 14 # 3 7 11 15 # 0 5 10 15 < odd round # 1 6 11 12 # 2 7 8 13 # 3 4 9 14 # # 'a', 'b' and 'd's are permanently allocated in registers, # @x[0..7,12..15], while 'c's are maintained in memory. If # you observe 'c' column, you'll notice that pair of 'c's is # invariant between rounds. This means that we have to reload # them once per round, in the middle. This is why you'll see # bunch of 'c' stores and loads in the middle, but none in # the beginning or end. ( "&paddd (@x[$a0],@x[$b0])", # Q1 "&paddd (@x[$a1],@x[$b1])", # Q2 "&pxor (@x[$d0],@x[$a0])", "&pxor (@x[$d1],@x[$a1])", "&pshufb (@x[$d0],$t1)", "&pshufb (@x[$d1],$t1)", "&paddd ($xc,@x[$d0])", "&paddd ($xc_,@x[$d1])", "&pxor (@x[$b0],$xc)", "&pxor (@x[$b1],$xc_)", "&movdqa ($t0,@x[$b0])", "&pslld (@x[$b0],12)", "&psrld ($t0,20)", "&movdqa ($t1,@x[$b1])", "&pslld (@x[$b1],12)", "&por (@x[$b0],$t0)", "&psrld ($t1,20)", "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip) "&por (@x[$b1],$t1)", "&paddd (@x[$a0],@x[$b0])", "&paddd (@x[$a1],@x[$b1])", "&pxor (@x[$d0],@x[$a0])", "&pxor (@x[$d1],@x[$a1])", "&pshufb (@x[$d0],$t0)", "&pshufb (@x[$d1],$t0)", "&paddd ($xc,@x[$d0])", "&paddd ($xc_,@x[$d1])", "&pxor (@x[$b0],$xc)", "&pxor (@x[$b1],$xc_)", "&movdqa ($t1,@x[$b0])", "&pslld (@x[$b0],7)", "&psrld ($t1,25)", "&movdqa ($t0,@x[$b1])", "&pslld (@x[$b1],7)", "&por (@x[$b0],$t1)", "&psrld ($t0,25)", "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip) "&por (@x[$b1],$t0)", "&movdqa (\"`16*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's "&movdqa (\"`16*($c1-8)`(%rsp)\",$xc_)", "&movdqa ($xc,\"`16*($c2-8)`(%rsp)\")", "&movdqa ($xc_,\"`16*($c3-8)`(%rsp)\")", "&paddd (@x[$a2],@x[$b2])", # Q3 "&paddd (@x[$a3],@x[$b3])", # Q4 "&pxor (@x[$d2],@x[$a2])", "&pxor (@x[$d3],@x[$a3])", "&pshufb (@x[$d2],$t1)", "&pshufb (@x[$d3],$t1)", "&paddd ($xc,@x[$d2])", "&paddd ($xc_,@x[$d3])", "&pxor (@x[$b2],$xc)", "&pxor (@x[$b3],$xc_)", "&movdqa ($t0,@x[$b2])", "&pslld (@x[$b2],12)", "&psrld ($t0,20)", "&movdqa ($t1,@x[$b3])", "&pslld (@x[$b3],12)", "&por (@x[$b2],$t0)", "&psrld ($t1,20)", "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip) "&por (@x[$b3],$t1)", "&paddd (@x[$a2],@x[$b2])", "&paddd (@x[$a3],@x[$b3])", "&pxor (@x[$d2],@x[$a2])", "&pxor (@x[$d3],@x[$a3])", "&pshufb (@x[$d2],$t0)", "&pshufb (@x[$d3],$t0)", "&paddd ($xc,@x[$d2])", "&paddd ($xc_,@x[$d3])", "&pxor (@x[$b2],$xc)", "&pxor (@x[$b3],$xc_)", "&movdqa ($t1,@x[$b2])", "&pslld (@x[$b2],7)", "&psrld ($t1,25)", "&movdqa ($t0,@x[$b3])", "&pslld (@x[$b3],7)", "&por (@x[$b2],$t1)", "&psrld ($t0,25)", "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip) "&por (@x[$b3],$t0)" ); } my $xframe = $win64 ? 
0xa8 : 8; $code.=<<___; .globl ChaCha20_ctr32_ssse3_4x .type ChaCha20_ctr32_ssse3_4x,\@function,5 .align 32 ChaCha20_ctr32_ssse3_4x: .cfi_startproc _CET_ENDBR mov %rsp,%r9 # frame pointer .cfi_def_cfa_register r9 ___ $code.=<<___; sub \$0x140+$xframe,%rsp ___ ################ stack layout # +0x00 SIMD equivalent of @x[8-12] # ... # +0x40 constant copy of key[0-2] smashed by lanes # ... # +0x100 SIMD counters (with nonce smashed by lanes) # ... # +0x140 $code.=<<___ if ($win64); movaps %xmm6,-0xa8(%r9) movaps %xmm7,-0x98(%r9) movaps %xmm8,-0x88(%r9) movaps %xmm9,-0x78(%r9) movaps %xmm10,-0x68(%r9) movaps %xmm11,-0x58(%r9) movaps %xmm12,-0x48(%r9) movaps %xmm13,-0x38(%r9) movaps %xmm14,-0x28(%r9) movaps %xmm15,-0x18(%r9) .L4x_body: ___ $code.=<<___; movdqa .Lsigma(%rip),$xa3 # key[0] movdqu ($key),$xb3 # key[1] movdqu 16($key),$xt3 # key[2] movdqu ($counter),$xd3 # key[3] lea 0x100(%rsp),%rcx # size optimization lea .Lrot16(%rip),%r10 lea .Lrot24(%rip),%r11 pshufd \$0x00,$xa3,$xa0 # smash key by lanes... pshufd \$0x55,$xa3,$xa1 movdqa $xa0,0x40(%rsp) # ... and offload pshufd \$0xaa,$xa3,$xa2 movdqa $xa1,0x50(%rsp) pshufd \$0xff,$xa3,$xa3 movdqa $xa2,0x60(%rsp) movdqa $xa3,0x70(%rsp) pshufd \$0x00,$xb3,$xb0 pshufd \$0x55,$xb3,$xb1 movdqa $xb0,0x80-0x100(%rcx) pshufd \$0xaa,$xb3,$xb2 movdqa $xb1,0x90-0x100(%rcx) pshufd \$0xff,$xb3,$xb3 movdqa $xb2,0xa0-0x100(%rcx) movdqa $xb3,0xb0-0x100(%rcx) pshufd \$0x00,$xt3,$xt0 # "$xc0" pshufd \$0x55,$xt3,$xt1 # "$xc1" movdqa $xt0,0xc0-0x100(%rcx) pshufd \$0xaa,$xt3,$xt2 # "$xc2" movdqa $xt1,0xd0-0x100(%rcx) pshufd \$0xff,$xt3,$xt3 # "$xc3" movdqa $xt2,0xe0-0x100(%rcx) movdqa $xt3,0xf0-0x100(%rcx) pshufd \$0x00,$xd3,$xd0 pshufd \$0x55,$xd3,$xd1 paddd .Linc(%rip),$xd0 # don't save counters yet pshufd \$0xaa,$xd3,$xd2 movdqa $xd1,0x110-0x100(%rcx) pshufd \$0xff,$xd3,$xd3 movdqa $xd2,0x120-0x100(%rcx) movdqa $xd3,0x130-0x100(%rcx) jmp .Loop_enter4x .align 32 .Loop_outer4x: movdqa 0x40(%rsp),$xa0 # re-load smashed key movdqa 0x50(%rsp),$xa1 movdqa 0x60(%rsp),$xa2 movdqa 0x70(%rsp),$xa3 movdqa 0x80-0x100(%rcx),$xb0 movdqa 0x90-0x100(%rcx),$xb1 movdqa 0xa0-0x100(%rcx),$xb2 movdqa 0xb0-0x100(%rcx),$xb3 movdqa 0xc0-0x100(%rcx),$xt0 # "$xc0" movdqa 0xd0-0x100(%rcx),$xt1 # "$xc1" movdqa 0xe0-0x100(%rcx),$xt2 # "$xc2" movdqa 0xf0-0x100(%rcx),$xt3 # "$xc3" movdqa 0x100-0x100(%rcx),$xd0 movdqa 0x110-0x100(%rcx),$xd1 movdqa 0x120-0x100(%rcx),$xd2 movdqa 0x130-0x100(%rcx),$xd3 paddd .Lfour(%rip),$xd0 # next SIMD counters .Loop_enter4x: movdqa $xt2,0x20(%rsp) # SIMD equivalent of "@x[10]" movdqa $xt3,0x30(%rsp) # SIMD equivalent of "@x[11]" movdqa (%r10),$xt3 # .Lrot16(%rip) mov \$10,%eax movdqa $xd0,0x100-0x100(%rcx) # save SIMD counters jmp .Loop4x .align 32 .Loop4x: ___ foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; } foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; } $code.=<<___; dec %eax jnz .Loop4x paddd 0x40(%rsp),$xa0 # accumulate key material paddd 0x50(%rsp),$xa1 paddd 0x60(%rsp),$xa2 paddd 0x70(%rsp),$xa3 movdqa $xa0,$xt2 # "de-interlace" data punpckldq $xa1,$xa0 movdqa $xa2,$xt3 punpckldq $xa3,$xa2 punpckhdq $xa1,$xt2 punpckhdq $xa3,$xt3 movdqa $xa0,$xa1 punpcklqdq $xa2,$xa0 # "a0" movdqa $xt2,$xa3 punpcklqdq $xt3,$xt2 # "a2" punpckhqdq $xa2,$xa1 # "a1" punpckhqdq $xt3,$xa3 # "a3" ___ ($xa2,$xt2)=($xt2,$xa2); $code.=<<___; paddd 0x80-0x100(%rcx),$xb0 paddd 0x90-0x100(%rcx),$xb1 paddd 0xa0-0x100(%rcx),$xb2 paddd 0xb0-0x100(%rcx),$xb3 movdqa $xa0,0x00(%rsp) # offload $xaN movdqa $xa1,0x10(%rsp) movdqa 0x20(%rsp),$xa0 # "xc2" movdqa 0x30(%rsp),$xa1 # "xc3" 
movdqa $xb0,$xt2 punpckldq $xb1,$xb0 movdqa $xb2,$xt3 punpckldq $xb3,$xb2 punpckhdq $xb1,$xt2 punpckhdq $xb3,$xt3 movdqa $xb0,$xb1 punpcklqdq $xb2,$xb0 # "b0" movdqa $xt2,$xb3 punpcklqdq $xt3,$xt2 # "b2" punpckhqdq $xb2,$xb1 # "b1" punpckhqdq $xt3,$xb3 # "b3" ___ ($xb2,$xt2)=($xt2,$xb2); my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); $code.=<<___; paddd 0xc0-0x100(%rcx),$xc0 paddd 0xd0-0x100(%rcx),$xc1 paddd 0xe0-0x100(%rcx),$xc2 paddd 0xf0-0x100(%rcx),$xc3 movdqa $xa2,0x20(%rsp) # keep offloading $xaN movdqa $xa3,0x30(%rsp) movdqa $xc0,$xt2 punpckldq $xc1,$xc0 movdqa $xc2,$xt3 punpckldq $xc3,$xc2 punpckhdq $xc1,$xt2 punpckhdq $xc3,$xt3 movdqa $xc0,$xc1 punpcklqdq $xc2,$xc0 # "c0" movdqa $xt2,$xc3 punpcklqdq $xt3,$xt2 # "c2" punpckhqdq $xc2,$xc1 # "c1" punpckhqdq $xt3,$xc3 # "c3" ___ ($xc2,$xt2)=($xt2,$xc2); ($xt0,$xt1)=($xa2,$xa3); # use $xaN as temporary $code.=<<___; paddd 0x100-0x100(%rcx),$xd0 paddd 0x110-0x100(%rcx),$xd1 paddd 0x120-0x100(%rcx),$xd2 paddd 0x130-0x100(%rcx),$xd3 movdqa $xd0,$xt2 punpckldq $xd1,$xd0 movdqa $xd2,$xt3 punpckldq $xd3,$xd2 punpckhdq $xd1,$xt2 punpckhdq $xd3,$xt3 movdqa $xd0,$xd1 punpcklqdq $xd2,$xd0 # "d0" movdqa $xt2,$xd3 punpcklqdq $xt3,$xt2 # "d2" punpckhqdq $xd2,$xd1 # "d1" punpckhqdq $xt3,$xd3 # "d3" ___ ($xd2,$xt2)=($xt2,$xd2); $code.=<<___; cmp \$64*4,$len jb .Ltail4x movdqu 0x00($inp),$xt0 # xor with input movdqu 0x10($inp),$xt1 movdqu 0x20($inp),$xt2 movdqu 0x30($inp),$xt3 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? pxor $xb0,$xt1 pxor $xc0,$xt2 pxor $xd0,$xt3 movdqu $xt0,0x00($out) movdqu 0x40($inp),$xt0 movdqu $xt1,0x10($out) movdqu 0x50($inp),$xt1 movdqu $xt2,0x20($out) movdqu 0x60($inp),$xt2 movdqu $xt3,0x30($out) movdqu 0x70($inp),$xt3 lea 0x80($inp),$inp # size optimization pxor 0x10(%rsp),$xt0 pxor $xb1,$xt1 pxor $xc1,$xt2 pxor $xd1,$xt3 movdqu $xt0,0x40($out) movdqu 0x00($inp),$xt0 movdqu $xt1,0x50($out) movdqu 0x10($inp),$xt1 movdqu $xt2,0x60($out) movdqu 0x20($inp),$xt2 movdqu $xt3,0x70($out) lea 0x80($out),$out # size optimization movdqu 0x30($inp),$xt3 pxor 0x20(%rsp),$xt0 pxor $xb2,$xt1 pxor $xc2,$xt2 pxor $xd2,$xt3 movdqu $xt0,0x00($out) movdqu 0x40($inp),$xt0 movdqu $xt1,0x10($out) movdqu 0x50($inp),$xt1 movdqu $xt2,0x20($out) movdqu 0x60($inp),$xt2 movdqu $xt3,0x30($out) movdqu 0x70($inp),$xt3 lea 0x80($inp),$inp # inp+=64*4 pxor 0x30(%rsp),$xt0 pxor $xb3,$xt1 pxor $xc3,$xt2 pxor $xd3,$xt3 movdqu $xt0,0x40($out) movdqu $xt1,0x50($out) movdqu $xt2,0x60($out) movdqu $xt3,0x70($out) lea 0x80($out),$out # out+=64*4 sub \$64*4,$len jnz .Loop_outer4x jmp .Ldone4x .Ltail4x: cmp \$192,$len jae .L192_or_more4x cmp \$128,$len jae .L128_or_more4x cmp \$64,$len jae .L64_or_more4x #movdqa 0x00(%rsp),$xt0 # $xaN is offloaded, remember? xor %r10,%r10 #movdqa $xt0,0x00(%rsp) movdqa $xb0,0x10(%rsp) movdqa $xc0,0x20(%rsp) movdqa $xd0,0x30(%rsp) jmp .Loop_tail4x .align 32 .L64_or_more4x: movdqu 0x00($inp),$xt0 # xor with input movdqu 0x10($inp),$xt1 movdqu 0x20($inp),$xt2 movdqu 0x30($inp),$xt3 pxor 0x00(%rsp),$xt0 # $xaxN is offloaded, remember? pxor $xb0,$xt1 pxor $xc0,$xt2 pxor $xd0,$xt3 movdqu $xt0,0x00($out) movdqu $xt1,0x10($out) movdqu $xt2,0x20($out) movdqu $xt3,0x30($out) je .Ldone4x movdqa 0x10(%rsp),$xt0 # $xaN is offloaded, remember? 
lea 0x40($inp),$inp # inp+=64*1 xor %r10,%r10 movdqa $xt0,0x00(%rsp) movdqa $xb1,0x10(%rsp) lea 0x40($out),$out # out+=64*1 movdqa $xc1,0x20(%rsp) sub \$64,$len # len-=64*1 movdqa $xd1,0x30(%rsp) jmp .Loop_tail4x .align 32 .L128_or_more4x: movdqu 0x00($inp),$xt0 # xor with input movdqu 0x10($inp),$xt1 movdqu 0x20($inp),$xt2 movdqu 0x30($inp),$xt3 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? pxor $xb0,$xt1 pxor $xc0,$xt2 pxor $xd0,$xt3 movdqu $xt0,0x00($out) movdqu 0x40($inp),$xt0 movdqu $xt1,0x10($out) movdqu 0x50($inp),$xt1 movdqu $xt2,0x20($out) movdqu 0x60($inp),$xt2 movdqu $xt3,0x30($out) movdqu 0x70($inp),$xt3 pxor 0x10(%rsp),$xt0 pxor $xb1,$xt1 pxor $xc1,$xt2 pxor $xd1,$xt3 movdqu $xt0,0x40($out) movdqu $xt1,0x50($out) movdqu $xt2,0x60($out) movdqu $xt3,0x70($out) je .Ldone4x movdqa 0x20(%rsp),$xt0 # $xaN is offloaded, remember? lea 0x80($inp),$inp # inp+=64*2 xor %r10,%r10 movdqa $xt0,0x00(%rsp) movdqa $xb2,0x10(%rsp) lea 0x80($out),$out # out+=64*2 movdqa $xc2,0x20(%rsp) sub \$128,$len # len-=64*2 movdqa $xd2,0x30(%rsp) jmp .Loop_tail4x .align 32 .L192_or_more4x: movdqu 0x00($inp),$xt0 # xor with input movdqu 0x10($inp),$xt1 movdqu 0x20($inp),$xt2 movdqu 0x30($inp),$xt3 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? pxor $xb0,$xt1 pxor $xc0,$xt2 pxor $xd0,$xt3 movdqu $xt0,0x00($out) movdqu 0x40($inp),$xt0 movdqu $xt1,0x10($out) movdqu 0x50($inp),$xt1 movdqu $xt2,0x20($out) movdqu 0x60($inp),$xt2 movdqu $xt3,0x30($out) movdqu 0x70($inp),$xt3 lea 0x80($inp),$inp # size optimization pxor 0x10(%rsp),$xt0 pxor $xb1,$xt1 pxor $xc1,$xt2 pxor $xd1,$xt3 movdqu $xt0,0x40($out) movdqu 0x00($inp),$xt0 movdqu $xt1,0x50($out) movdqu 0x10($inp),$xt1 movdqu $xt2,0x60($out) movdqu 0x20($inp),$xt2 movdqu $xt3,0x70($out) lea 0x80($out),$out # size optimization movdqu 0x30($inp),$xt3 pxor 0x20(%rsp),$xt0 pxor $xb2,$xt1 pxor $xc2,$xt2 pxor $xd2,$xt3 movdqu $xt0,0x00($out) movdqu $xt1,0x10($out) movdqu $xt2,0x20($out) movdqu $xt3,0x30($out) je .Ldone4x movdqa 0x30(%rsp),$xt0 # $xaN is offloaded, remember? 
lea 0x40($inp),$inp # inp+=64*3 xor %r10,%r10 movdqa $xt0,0x00(%rsp) movdqa $xb3,0x10(%rsp) lea 0x40($out),$out # out+=64*3 movdqa $xc3,0x20(%rsp) sub \$192,$len # len-=64*3 movdqa $xd3,0x30(%rsp) .Loop_tail4x: movzb ($inp,%r10),%eax movzb (%rsp,%r10),%ecx lea 1(%r10),%r10 xor %ecx,%eax mov %al,-1($out,%r10) dec $len jnz .Loop_tail4x .Ldone4x: ___ $code.=<<___ if ($win64); movaps -0xa8(%r9),%xmm6 movaps -0x98(%r9),%xmm7 movaps -0x88(%r9),%xmm8 movaps -0x78(%r9),%xmm9 movaps -0x68(%r9),%xmm10 movaps -0x58(%r9),%xmm11 movaps -0x48(%r9),%xmm12 movaps -0x38(%r9),%xmm13 movaps -0x28(%r9),%xmm14 movaps -0x18(%r9),%xmm15 ___ $code.=<<___; lea (%r9),%rsp .cfi_def_cfa_register rsp .L4x_epilogue: ret .cfi_endproc .size ChaCha20_ctr32_ssse3_4x,.-ChaCha20_ctr32_ssse3_4x ___ } ######################################################################## # AVX2 code path if ($avx>1) { my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3, $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15)); my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3); sub AVX2_lane_ROUND { my ($a0,$b0,$c0,$d0)=@_; my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3); my @x=map("\"$_\"",@xx); # Consider order in which variables are addressed by their # index: # # a b c d # # 0 4 8 12 < even round # 1 5 9 13 # 2 6 10 14 # 3 7 11 15 # 0 5 10 15 < odd round # 1 6 11 12 # 2 7 8 13 # 3 4 9 14 # # 'a', 'b' and 'd's are permanently allocated in registers, # @x[0..7,12..15], while 'c's are maintained in memory. If # you observe 'c' column, you'll notice that pair of 'c's is # invariant between rounds. This means that we have to reload # them once per round, in the middle. This is why you'll see # bunch of 'c' stores and loads in the middle, but none in # the beginning or end. 
( "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1 "&vpxor (@x[$d0],@x[$a0],@x[$d0])", "&vpshufb (@x[$d0],@x[$d0],$t1)", "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2 "&vpxor (@x[$d1],@x[$a1],@x[$d1])", "&vpshufb (@x[$d1],@x[$d1],$t1)", "&vpaddd ($xc,$xc,@x[$d0])", "&vpxor (@x[$b0],$xc,@x[$b0])", "&vpslld ($t0,@x[$b0],12)", "&vpsrld (@x[$b0],@x[$b0],20)", "&vpor (@x[$b0],$t0,@x[$b0])", "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip) "&vpaddd ($xc_,$xc_,@x[$d1])", "&vpxor (@x[$b1],$xc_,@x[$b1])", "&vpslld ($t1,@x[$b1],12)", "&vpsrld (@x[$b1],@x[$b1],20)", "&vpor (@x[$b1],$t1,@x[$b1])", "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", "&vpxor (@x[$d0],@x[$a0],@x[$d0])", "&vpshufb (@x[$d0],@x[$d0],$t0)", "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", "&vpxor (@x[$d1],@x[$a1],@x[$d1])", "&vpshufb (@x[$d1],@x[$d1],$t0)", "&vpaddd ($xc,$xc,@x[$d0])", "&vpxor (@x[$b0],$xc,@x[$b0])", "&vpslld ($t1,@x[$b0],7)", "&vpsrld (@x[$b0],@x[$b0],25)", "&vpor (@x[$b0],$t1,@x[$b0])", "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip) "&vpaddd ($xc_,$xc_,@x[$d1])", "&vpxor (@x[$b1],$xc_,@x[$b1])", "&vpslld ($t0,@x[$b1],7)", "&vpsrld (@x[$b1],@x[$b1],25)", "&vpor (@x[$b1],$t0,@x[$b1])", "&vmovdqa (\"`32*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's "&vmovdqa (\"`32*($c1-8)`(%rsp)\",$xc_)", "&vmovdqa ($xc,\"`32*($c2-8)`(%rsp)\")", "&vmovdqa ($xc_,\"`32*($c3-8)`(%rsp)\")", "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3 "&vpxor (@x[$d2],@x[$a2],@x[$d2])", "&vpshufb (@x[$d2],@x[$d2],$t1)", "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4 "&vpxor (@x[$d3],@x[$a3],@x[$d3])", "&vpshufb (@x[$d3],@x[$d3],$t1)", "&vpaddd ($xc,$xc,@x[$d2])", "&vpxor (@x[$b2],$xc,@x[$b2])", "&vpslld ($t0,@x[$b2],12)", "&vpsrld (@x[$b2],@x[$b2],20)", "&vpor (@x[$b2],$t0,@x[$b2])", "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip) "&vpaddd ($xc_,$xc_,@x[$d3])", "&vpxor (@x[$b3],$xc_,@x[$b3])", "&vpslld ($t1,@x[$b3],12)", "&vpsrld (@x[$b3],@x[$b3],20)", "&vpor (@x[$b3],$t1,@x[$b3])", "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", "&vpxor (@x[$d2],@x[$a2],@x[$d2])", "&vpshufb (@x[$d2],@x[$d2],$t0)", "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", "&vpxor (@x[$d3],@x[$a3],@x[$d3])", "&vpshufb (@x[$d3],@x[$d3],$t0)", "&vpaddd ($xc,$xc,@x[$d2])", "&vpxor (@x[$b2],$xc,@x[$b2])", "&vpslld ($t1,@x[$b2],7)", "&vpsrld (@x[$b2],@x[$b2],25)", "&vpor (@x[$b2],$t1,@x[$b2])", "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip) "&vpaddd ($xc_,$xc_,@x[$d3])", "&vpxor (@x[$b3],$xc_,@x[$b3])", "&vpslld ($t0,@x[$b3],7)", "&vpsrld (@x[$b3],@x[$b3],25)", "&vpor (@x[$b3],$t0,@x[$b3])" ); } my $xframe = $win64 ? 0xa8 : 8; $code.=<<___; .globl ChaCha20_ctr32_avx2 .type ChaCha20_ctr32_avx2,\@function,5 .align 32 ChaCha20_ctr32_avx2: .cfi_startproc _CET_ENDBR mov %rsp,%r9 # frame register .cfi_def_cfa_register r9 sub \$0x280+$xframe,%rsp and \$-32,%rsp ___ $code.=<<___ if ($win64); movaps %xmm6,-0xa8(%r9) movaps %xmm7,-0x98(%r9) movaps %xmm8,-0x88(%r9) movaps %xmm9,-0x78(%r9) movaps %xmm10,-0x68(%r9) movaps %xmm11,-0x58(%r9) movaps %xmm12,-0x48(%r9) movaps %xmm13,-0x38(%r9) movaps %xmm14,-0x28(%r9) movaps %xmm15,-0x18(%r9) .L8x_body: ___ $code.=<<___; vzeroupper ################ stack layout # +0x00 SIMD equivalent of @x[8-12] # ... # +0x80 constant copy of key[0-2] smashed by lanes # ... # +0x200 SIMD counters (with nonce smashed by lanes) # ... 
# +0x280 vbroadcasti128 .Lsigma(%rip),$xa3 # key[0] vbroadcasti128 ($key),$xb3 # key[1] vbroadcasti128 16($key),$xt3 # key[2] vbroadcasti128 ($counter),$xd3 # key[3] lea 0x100(%rsp),%rcx # size optimization lea 0x200(%rsp),%rax # size optimization lea .Lrot16(%rip),%r10 lea .Lrot24(%rip),%r11 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... vpshufd \$0x55,$xa3,$xa1 vmovdqa $xa0,0x80-0x100(%rcx) # ... and offload vpshufd \$0xaa,$xa3,$xa2 vmovdqa $xa1,0xa0-0x100(%rcx) vpshufd \$0xff,$xa3,$xa3 vmovdqa $xa2,0xc0-0x100(%rcx) vmovdqa $xa3,0xe0-0x100(%rcx) vpshufd \$0x00,$xb3,$xb0 vpshufd \$0x55,$xb3,$xb1 vmovdqa $xb0,0x100-0x100(%rcx) vpshufd \$0xaa,$xb3,$xb2 vmovdqa $xb1,0x120-0x100(%rcx) vpshufd \$0xff,$xb3,$xb3 vmovdqa $xb2,0x140-0x100(%rcx) vmovdqa $xb3,0x160-0x100(%rcx) vpshufd \$0x00,$xt3,$xt0 # "xc0" vpshufd \$0x55,$xt3,$xt1 # "xc1" vmovdqa $xt0,0x180-0x200(%rax) vpshufd \$0xaa,$xt3,$xt2 # "xc2" vmovdqa $xt1,0x1a0-0x200(%rax) vpshufd \$0xff,$xt3,$xt3 # "xc3" vmovdqa $xt2,0x1c0-0x200(%rax) vmovdqa $xt3,0x1e0-0x200(%rax) vpshufd \$0x00,$xd3,$xd0 vpshufd \$0x55,$xd3,$xd1 vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet vpshufd \$0xaa,$xd3,$xd2 vmovdqa $xd1,0x220-0x200(%rax) vpshufd \$0xff,$xd3,$xd3 vmovdqa $xd2,0x240-0x200(%rax) vmovdqa $xd3,0x260-0x200(%rax) jmp .Loop_enter8x .align 32 .Loop_outer8x: vmovdqa 0x80-0x100(%rcx),$xa0 # re-load smashed key vmovdqa 0xa0-0x100(%rcx),$xa1 vmovdqa 0xc0-0x100(%rcx),$xa2 vmovdqa 0xe0-0x100(%rcx),$xa3 vmovdqa 0x100-0x100(%rcx),$xb0 vmovdqa 0x120-0x100(%rcx),$xb1 vmovdqa 0x140-0x100(%rcx),$xb2 vmovdqa 0x160-0x100(%rcx),$xb3 vmovdqa 0x180-0x200(%rax),$xt0 # "xc0" vmovdqa 0x1a0-0x200(%rax),$xt1 # "xc1" vmovdqa 0x1c0-0x200(%rax),$xt2 # "xc2" vmovdqa 0x1e0-0x200(%rax),$xt3 # "xc3" vmovdqa 0x200-0x200(%rax),$xd0 vmovdqa 0x220-0x200(%rax),$xd1 vmovdqa 0x240-0x200(%rax),$xd2 vmovdqa 0x260-0x200(%rax),$xd3 vpaddd .Leight(%rip),$xd0,$xd0 # next SIMD counters .Loop_enter8x: vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]" vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]" vbroadcasti128 (%r10),$xt3 vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters mov \$10,%eax jmp .Loop8x .align 32 .Loop8x: ___ foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; } foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; } $code.=<<___; dec %eax jnz .Loop8x lea 0x200(%rsp),%rax # size optimization vpaddd 0x80-0x100(%rcx),$xa0,$xa0 # accumulate key vpaddd 0xa0-0x100(%rcx),$xa1,$xa1 vpaddd 0xc0-0x100(%rcx),$xa2,$xa2 vpaddd 0xe0-0x100(%rcx),$xa3,$xa3 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data vpunpckldq $xa3,$xa2,$xt3 vpunpckhdq $xa1,$xa0,$xa0 vpunpckhdq $xa3,$xa2,$xa2 vpunpcklqdq $xt3,$xt2,$xa1 # "a0" vpunpckhqdq $xt3,$xt2,$xt2 # "a1" vpunpcklqdq $xa2,$xa0,$xa3 # "a2" vpunpckhqdq $xa2,$xa0,$xa0 # "a3" ___ ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); $code.=<<___; vpaddd 0x100-0x100(%rcx),$xb0,$xb0 vpaddd 0x120-0x100(%rcx),$xb1,$xb1 vpaddd 0x140-0x100(%rcx),$xb2,$xb2 vpaddd 0x160-0x100(%rcx),$xb3,$xb3 vpunpckldq $xb1,$xb0,$xt2 vpunpckldq $xb3,$xb2,$xt3 vpunpckhdq $xb1,$xb0,$xb0 vpunpckhdq $xb3,$xb2,$xb2 vpunpcklqdq $xt3,$xt2,$xb1 # "b0" vpunpckhqdq $xt3,$xt2,$xt2 # "b1" vpunpcklqdq $xb2,$xb0,$xb3 # "b2" vpunpckhqdq $xb2,$xb0,$xb0 # "b3" ___ ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); $code.=<<___; vperm2i128 \$0x20,$xb0,$xa0,$xt3 # "de-interlace" further vperm2i128 \$0x31,$xb0,$xa0,$xb0 vperm2i128 \$0x20,$xb1,$xa1,$xa0 vperm2i128 \$0x31,$xb1,$xa1,$xb1 vperm2i128 \$0x20,$xb2,$xa2,$xa1 vperm2i128 \$0x31,$xb2,$xa2,$xb2 vperm2i128 \$0x20,$xb3,$xa3,$xa2 
vperm2i128 \$0x31,$xb3,$xa3,$xb3 ___ ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); $code.=<<___; vmovdqa $xa0,0x00(%rsp) # offload $xaN vmovdqa $xa1,0x20(%rsp) vmovdqa 0x40(%rsp),$xc2 # $xa0 vmovdqa 0x60(%rsp),$xc3 # $xa1 vpaddd 0x180-0x200(%rax),$xc0,$xc0 vpaddd 0x1a0-0x200(%rax),$xc1,$xc1 vpaddd 0x1c0-0x200(%rax),$xc2,$xc2 vpaddd 0x1e0-0x200(%rax),$xc3,$xc3 vpunpckldq $xc1,$xc0,$xt2 vpunpckldq $xc3,$xc2,$xt3 vpunpckhdq $xc1,$xc0,$xc0 vpunpckhdq $xc3,$xc2,$xc2 vpunpcklqdq $xt3,$xt2,$xc1 # "c0" vpunpckhqdq $xt3,$xt2,$xt2 # "c1" vpunpcklqdq $xc2,$xc0,$xc3 # "c2" vpunpckhqdq $xc2,$xc0,$xc0 # "c3" ___ ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); $code.=<<___; vpaddd 0x200-0x200(%rax),$xd0,$xd0 vpaddd 0x220-0x200(%rax),$xd1,$xd1 vpaddd 0x240-0x200(%rax),$xd2,$xd2 vpaddd 0x260-0x200(%rax),$xd3,$xd3 vpunpckldq $xd1,$xd0,$xt2 vpunpckldq $xd3,$xd2,$xt3 vpunpckhdq $xd1,$xd0,$xd0 vpunpckhdq $xd3,$xd2,$xd2 vpunpcklqdq $xt3,$xt2,$xd1 # "d0" vpunpckhqdq $xt3,$xt2,$xt2 # "d1" vpunpcklqdq $xd2,$xd0,$xd3 # "d2" vpunpckhqdq $xd2,$xd0,$xd0 # "d3" ___ ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); $code.=<<___; vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further vperm2i128 \$0x31,$xd0,$xc0,$xd0 vperm2i128 \$0x20,$xd1,$xc1,$xc0 vperm2i128 \$0x31,$xd1,$xc1,$xd1 vperm2i128 \$0x20,$xd2,$xc2,$xc1 vperm2i128 \$0x31,$xd2,$xc2,$xd2 vperm2i128 \$0x20,$xd3,$xc3,$xc2 vperm2i128 \$0x31,$xd3,$xc3,$xd3 ___ ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)= ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3); ($xa0,$xa1)=($xt2,$xt3); $code.=<<___; vmovdqa 0x00(%rsp),$xa0 # $xaN was offloaded, remember? vmovdqa 0x20(%rsp),$xa1 cmp \$64*8,$len jb .Ltail8x vpxor 0x00($inp),$xa0,$xa0 # xor with input vpxor 0x20($inp),$xb0,$xb0 vpxor 0x40($inp),$xc0,$xc0 vpxor 0x60($inp),$xd0,$xd0 lea 0x80($inp),$inp # size optimization vmovdqu $xa0,0x00($out) vmovdqu $xb0,0x20($out) vmovdqu $xc0,0x40($out) vmovdqu $xd0,0x60($out) lea 0x80($out),$out # size optimization vpxor 0x00($inp),$xa1,$xa1 vpxor 0x20($inp),$xb1,$xb1 vpxor 0x40($inp),$xc1,$xc1 vpxor 0x60($inp),$xd1,$xd1 lea 0x80($inp),$inp # size optimization vmovdqu $xa1,0x00($out) vmovdqu $xb1,0x20($out) vmovdqu $xc1,0x40($out) vmovdqu $xd1,0x60($out) lea 0x80($out),$out # size optimization vpxor 0x00($inp),$xa2,$xa2 vpxor 0x20($inp),$xb2,$xb2 vpxor 0x40($inp),$xc2,$xc2 vpxor 0x60($inp),$xd2,$xd2 lea 0x80($inp),$inp # size optimization vmovdqu $xa2,0x00($out) vmovdqu $xb2,0x20($out) vmovdqu $xc2,0x40($out) vmovdqu $xd2,0x60($out) lea 0x80($out),$out # size optimization vpxor 0x00($inp),$xa3,$xa3 vpxor 0x20($inp),$xb3,$xb3 vpxor 0x40($inp),$xc3,$xc3 vpxor 0x60($inp),$xd3,$xd3 lea 0x80($inp),$inp # size optimization vmovdqu $xa3,0x00($out) vmovdqu $xb3,0x20($out) vmovdqu $xc3,0x40($out) vmovdqu $xd3,0x60($out) lea 0x80($out),$out # size optimization sub \$64*8,$len jnz .Loop_outer8x jmp .Ldone8x .Ltail8x: cmp \$448,$len jae .L448_or_more8x cmp \$384,$len jae .L384_or_more8x cmp \$320,$len jae .L320_or_more8x cmp \$256,$len jae .L256_or_more8x cmp \$192,$len jae .L192_or_more8x cmp \$128,$len jae .L128_or_more8x cmp \$64,$len jae .L64_or_more8x xor %r10,%r10 vmovdqa $xa0,0x00(%rsp) vmovdqa $xb0,0x20(%rsp) jmp .Loop_tail8x .align 32 .L64_or_more8x: vpxor 0x00($inp),$xa0,$xa0 # xor with input vpxor 0x20($inp),$xb0,$xb0 vmovdqu $xa0,0x00($out) vmovdqu $xb0,0x20($out) je .Ldone8x lea 0x40($inp),$inp # inp+=64*1 xor %r10,%r10 vmovdqa $xc0,0x00(%rsp) lea 0x40($out),$out # 
out+=64*1 sub \$64,$len # len-=64*1 vmovdqa $xd0,0x20(%rsp) jmp .Loop_tail8x .align 32 .L128_or_more8x: vpxor 0x00($inp),$xa0,$xa0 # xor with input vpxor 0x20($inp),$xb0,$xb0 vpxor 0x40($inp),$xc0,$xc0 vpxor 0x60($inp),$xd0,$xd0 vmovdqu $xa0,0x00($out) vmovdqu $xb0,0x20($out) vmovdqu $xc0,0x40($out) vmovdqu $xd0,0x60($out) je .Ldone8x lea 0x80($inp),$inp # inp+=64*2 xor %r10,%r10 vmovdqa $xa1,0x00(%rsp) lea 0x80($out),$out # out+=64*2 sub \$128,$len # len-=64*2 vmovdqa $xb1,0x20(%rsp) jmp .Loop_tail8x .align 32 .L192_or_more8x: vpxor 0x00($inp),$xa0,$xa0 # xor with input vpxor 0x20($inp),$xb0,$xb0 vpxor 0x40($inp),$xc0,$xc0 vpxor 0x60($inp),$xd0,$xd0 vpxor 0x80($inp),$xa1,$xa1 vpxor 0xa0($inp),$xb1,$xb1 vmovdqu $xa0,0x00($out) vmovdqu $xb0,0x20($out) vmovdqu $xc0,0x40($out) vmovdqu $xd0,0x60($out) vmovdqu $xa1,0x80($out) vmovdqu $xb1,0xa0($out) je .Ldone8x lea 0xc0($inp),$inp # inp+=64*3 xor %r10,%r10 vmovdqa $xc1,0x00(%rsp) lea 0xc0($out),$out # out+=64*3 sub \$192,$len # len-=64*3 vmovdqa $xd1,0x20(%rsp) jmp .Loop_tail8x .align 32 .L256_or_more8x: vpxor 0x00($inp),$xa0,$xa0 # xor with input vpxor 0x20($inp),$xb0,$xb0 vpxor 0x40($inp),$xc0,$xc0 vpxor 0x60($inp),$xd0,$xd0 vpxor 0x80($inp),$xa1,$xa1 vpxor 0xa0($inp),$xb1,$xb1 vpxor 0xc0($inp),$xc1,$xc1 vpxor 0xe0($inp),$xd1,$xd1 vmovdqu $xa0,0x00($out) vmovdqu $xb0,0x20($out) vmovdqu $xc0,0x40($out) vmovdqu $xd0,0x60($out) vmovdqu $xa1,0x80($out) vmovdqu $xb1,0xa0($out) vmovdqu $xc1,0xc0($out) vmovdqu $xd1,0xe0($out) je .Ldone8x lea 0x100($inp),$inp # inp+=64*4 xor %r10,%r10 vmovdqa $xa2,0x00(%rsp) lea 0x100($out),$out # out+=64*4 sub \$256,$len # len-=64*4 vmovdqa $xb2,0x20(%rsp) jmp .Loop_tail8x .align 32 .L320_or_more8x: vpxor 0x00($inp),$xa0,$xa0 # xor with input vpxor 0x20($inp),$xb0,$xb0 vpxor 0x40($inp),$xc0,$xc0 vpxor 0x60($inp),$xd0,$xd0 vpxor 0x80($inp),$xa1,$xa1 vpxor 0xa0($inp),$xb1,$xb1 vpxor 0xc0($inp),$xc1,$xc1 vpxor 0xe0($inp),$xd1,$xd1 vpxor 0x100($inp),$xa2,$xa2 vpxor 0x120($inp),$xb2,$xb2 vmovdqu $xa0,0x00($out) vmovdqu $xb0,0x20($out) vmovdqu $xc0,0x40($out) vmovdqu $xd0,0x60($out) vmovdqu $xa1,0x80($out) vmovdqu $xb1,0xa0($out) vmovdqu $xc1,0xc0($out) vmovdqu $xd1,0xe0($out) vmovdqu $xa2,0x100($out) vmovdqu $xb2,0x120($out) je .Ldone8x lea 0x140($inp),$inp # inp+=64*5 xor %r10,%r10 vmovdqa $xc2,0x00(%rsp) lea 0x140($out),$out # out+=64*5 sub \$320,$len # len-=64*5 vmovdqa $xd2,0x20(%rsp) jmp .Loop_tail8x .align 32 .L384_or_more8x: vpxor 0x00($inp),$xa0,$xa0 # xor with input vpxor 0x20($inp),$xb0,$xb0 vpxor 0x40($inp),$xc0,$xc0 vpxor 0x60($inp),$xd0,$xd0 vpxor 0x80($inp),$xa1,$xa1 vpxor 0xa0($inp),$xb1,$xb1 vpxor 0xc0($inp),$xc1,$xc1 vpxor 0xe0($inp),$xd1,$xd1 vpxor 0x100($inp),$xa2,$xa2 vpxor 0x120($inp),$xb2,$xb2 vpxor 0x140($inp),$xc2,$xc2 vpxor 0x160($inp),$xd2,$xd2 vmovdqu $xa0,0x00($out) vmovdqu $xb0,0x20($out) vmovdqu $xc0,0x40($out) vmovdqu $xd0,0x60($out) vmovdqu $xa1,0x80($out) vmovdqu $xb1,0xa0($out) vmovdqu $xc1,0xc0($out) vmovdqu $xd1,0xe0($out) vmovdqu $xa2,0x100($out) vmovdqu $xb2,0x120($out) vmovdqu $xc2,0x140($out) vmovdqu $xd2,0x160($out) je .Ldone8x lea 0x180($inp),$inp # inp+=64*6 xor %r10,%r10 vmovdqa $xa3,0x00(%rsp) lea 0x180($out),$out # out+=64*6 sub \$384,$len # len-=64*6 vmovdqa $xb3,0x20(%rsp) jmp .Loop_tail8x .align 32 .L448_or_more8x: vpxor 0x00($inp),$xa0,$xa0 # xor with input vpxor 0x20($inp),$xb0,$xb0 vpxor 0x40($inp),$xc0,$xc0 vpxor 0x60($inp),$xd0,$xd0 vpxor 0x80($inp),$xa1,$xa1 vpxor 0xa0($inp),$xb1,$xb1 vpxor 0xc0($inp),$xc1,$xc1 vpxor 0xe0($inp),$xd1,$xd1 vpxor 0x100($inp),$xa2,$xa2 
vpxor 0x120($inp),$xb2,$xb2 vpxor 0x140($inp),$xc2,$xc2 vpxor 0x160($inp),$xd2,$xd2 vpxor 0x180($inp),$xa3,$xa3 vpxor 0x1a0($inp),$xb3,$xb3 vmovdqu $xa0,0x00($out) vmovdqu $xb0,0x20($out) vmovdqu $xc0,0x40($out) vmovdqu $xd0,0x60($out) vmovdqu $xa1,0x80($out) vmovdqu $xb1,0xa0($out) vmovdqu $xc1,0xc0($out) vmovdqu $xd1,0xe0($out) vmovdqu $xa2,0x100($out) vmovdqu $xb2,0x120($out) vmovdqu $xc2,0x140($out) vmovdqu $xd2,0x160($out) vmovdqu $xa3,0x180($out) vmovdqu $xb3,0x1a0($out) je .Ldone8x lea 0x1c0($inp),$inp # inp+=64*7 xor %r10,%r10 vmovdqa $xc3,0x00(%rsp) lea 0x1c0($out),$out # out+=64*7 sub \$448,$len # len-=64*7 vmovdqa $xd3,0x20(%rsp) .Loop_tail8x: movzb ($inp,%r10),%eax movzb (%rsp,%r10),%ecx lea 1(%r10),%r10 xor %ecx,%eax mov %al,-1($out,%r10) dec $len jnz .Loop_tail8x .Ldone8x: vzeroall ___ $code.=<<___ if ($win64); movaps -0xa8(%r9),%xmm6 movaps -0x98(%r9),%xmm7 movaps -0x88(%r9),%xmm8 movaps -0x78(%r9),%xmm9 movaps -0x68(%r9),%xmm10 movaps -0x58(%r9),%xmm11 movaps -0x48(%r9),%xmm12 movaps -0x38(%r9),%xmm13 movaps -0x28(%r9),%xmm14 movaps -0x18(%r9),%xmm15 ___ $code.=<<___; lea (%r9),%rsp .cfi_def_cfa_register rsp .L8x_epilogue: ret .cfi_endproc .size ChaCha20_ctr32_avx2,.-ChaCha20_ctr32_avx2 ___ } ######################################################################## # AVX512 code paths were removed # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) if ($win64) { $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type se_handler,\@abi-omnipotent .align 16 se_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData lea .Lctr32_body(%rip),%r10 cmp %r10,%rbx # context->Rip<.Lprologue jb .Lcommon_seh_tail mov 152($context),%rax # pull context->Rsp lea .Lno_data(%rip),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=.Lepilogue jae .Lcommon_seh_tail lea 64+24+48(%rax),%rax mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 mov -32(%rax),%r13 mov -40(%rax),%r14 mov -48(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R14 .Lcommon_seh_tail: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size se_handler,.-se_handler .type ssse3_handler,\@abi-omnipotent .align 16 ssse3_handler: push %rsi push %rdi 
push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # prologue label cmp %r10,%rbx # context->RipR9 mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail lea -0x28(%rax),%rsi lea 512($context),%rdi # &context.Xmm6 mov \$4,%ecx .long 0xa548f3fc # cld; rep movsq jmp .Lcommon_seh_tail .size ssse3_handler,.-ssse3_handler .type full_handler,\@abi-omnipotent .align 16 full_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # prologue label cmp %r10,%rbx # context->RipR9 mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail lea -0xa8(%rax),%rsi lea 512($context),%rdi # &context.Xmm6 mov \$20,%ecx .long 0xa548f3fc # cld; rep movsq jmp .Lcommon_seh_tail .size full_handler,.-full_handler .section .pdata .align 4 .rva .LSEH_begin_ChaCha20_ctr32_nohw .rva .LSEH_end_ChaCha20_ctr32_nohw .rva .LSEH_info_ChaCha20_ctr32_nohw .rva .LSEH_begin_ChaCha20_ctr32_ssse3_4x .rva .LSEH_end_ChaCha20_ctr32_ssse3_4x .rva .LSEH_info_ChaCha20_ctr32_ssse3_4x ___ $code.=<<___ if ($avx>1); .rva .LSEH_begin_ChaCha20_ctr32_avx2 .rva .LSEH_end_ChaCha20_ctr32_avx2 .rva .LSEH_info_ChaCha20_ctr32_avx2 ___ $code.=<<___; .section .xdata .align 8 .LSEH_info_ChaCha20_ctr32_nohw: .byte 9,0,0,0 .rva se_handler .LSEH_info_ChaCha20_ctr32_ssse3_4x: .byte 9,0,0,0 .rva full_handler .rva .L4x_body,.L4x_epilogue ___ $code.=<<___ if ($avx>1); .LSEH_info_ChaCha20_ctr32_avx2: .byte 9,0,0,0 .rva full_handler .rva .L8x_body,.L8x_epilogue # HandlerData[] ___ } foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/ge; s/%x#%[yz]/%x/g; # "down-shift" print $_,"\n"; } close STDOUT or die "error closing STDOUT: $!"; ring-0.17.14/crypto/cipher/asm/chacha20_poly1305_armv8.pl000064400000000000000000001256451046102023000207230ustar 00000000000000#!/usr/bin/env perl # Copyright (c) 2020, CloudFlare Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
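# Note on the quarter-round helpers below: chacha_qr, chacha_qr_x3 and
# chacha_qr_x5 all emit the same standard ChaCha20 quarter round (over one,
# three and five blocks respectively), using rev32 for the 16-bit rotation,
# tbl against .Lrol8 for the 8-bit rotation, and ushr+sli pairs for the 12-
# and 7-bit rotations. As an illustrative scalar reference only -- this sub
# is never called by the generator -- the same quarter round in plain Perl:
sub chacha_qr_scalar_ref {
    my ($a, $b, $c, $d) = @_;
    my $rotl32 = sub {
        my ($x, $n) = @_;
        return (($x << $n) | ($x >> (32 - $n))) & 0xffffffff;
    };
    $a = ($a + $b) & 0xffffffff; $d = $rotl32->($d ^ $a, 16);
    $c = ($c + $d) & 0xffffffff; $b = $rotl32->($b ^ $c, 12);
    $a = ($a + $b) & 0xffffffff; $d = $rotl32->($d ^ $a, 8);
    $c = ($c + $d) & 0xffffffff; $b = $rotl32->($b ^ $c, 7);
    return ($a, $b, $c, $d);
}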
############################################################################## # # # Author: Vlad Krasnov # # # ############################################################################## $flavour = shift; while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open OUT,"| \"$^X\" $xlate $flavour $output"; *STDOUT=*OUT; my ($oup,$inp,$inl,$adp,$adl,$keyp,$itr1,$itr2) = ("x0","x1","x2","x3","x4","x5","x6","x7"); my ($acc0,$acc1,$acc2) = map("x$_",(8..10)); my ($t0,$t1,$t2,$t3) = map("x$_",(11..14)); my ($one, $r0, $r1) = ("x15","x16","x17"); my ($t0w) = $t0 =~ s/x/w/r; my ($A0,$A1,$A2,$A3,$A4,$B0,$B1,$B2,$B3,$B4,$C0,$C1,$C2,$C3,$C4,$D0,$D1,$D2,$D3,$D4) = map("v$_",(0..19)); my ($T0,$T1,$T2,$T3) = map("v$_",(20..23)); my $CONSTS = "v24"; my $INC = "v25"; my $ROL8 = "v26"; my $CLAMP = "v27"; my ($B_STORE, $C_STORE, $D_STORE) = map("v$_",(28..30)); my $S_STORE = $CLAMP; my $LEN_STORE = "v31"; sub chacha_qr { my ($a,$b,$c,$d,$t,$dir)=@_; my ($shift_b,$shift_d) = $dir =~ /left/ ? ("#4","#12") : ("#12","#4"); $code.=<<___; add $a.4s, $a.4s, $b.4s eor $d.16b, $d.16b, $a.16b rev32 $d.8h, $d.8h add $c.4s, $c.4s, $d.4s eor $b.16b, $b.16b, $c.16b ushr $t.4s, $b.4s, #20 sli $t.4s, $b.4s, #12 ___ ($t,$b) = ($b,$t); $code.=<<___; add $a.4s, $a.4s, $b.4s eor $d.16b, $d.16b, $a.16b tbl $d.16b, {$d.16b}, $ROL8.16b add $c.4s, $c.4s, $d.4s eor $b.16b, $b.16b, $c.16b ushr $t.4s, $b.4s, #25 sli $t.4s, $b.4s, #7 ___ ($t,$b) = ($b,$t); $code.=<<___; ext $b.16b, $b.16b, $b.16b, $shift_b ext $c.16b, $c.16b, $c.16b, #8 ext $d.16b, $d.16b, $d.16b, $shift_d ___ } sub poly_add { my ($src)=@_; $code.="ldp $t0, $t1, [$src], 16 adds $acc0, $acc0, $t0 adcs $acc1, $acc1, $t1 adc $acc2, $acc2, $one\n"; } sub poly_add_vec { my ($src)=@_; $code.="mov $t0, $src.d[0] mov $t1, $src.d[1] adds $acc0, $acc0, $t0 adcs $acc1, $acc1, $t1 adc $acc2, $acc2, $one\n"; } sub poly_stage1 { $code.="mul $t0, $acc0, $r0 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh $t1, $acc0, $r0 mul $t2, $acc1, $r0 umulh $t3, $acc1, $r0 adds $t1, $t1, $t2 mul $t2, $acc2, $r0 adc $t2, $t2, $t3\n"; } sub poly_stage2 { $code.="mul $t3, $acc0, $r1 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh $acc0, $acc0, $r1 adds $t1, $t1, $t3 mul $t3, $acc1, $r1 umulh $acc1, $acc1, $r1 adcs $t3, $t3, $acc0 mul $acc2, $acc2, $r1 adc $acc2, $acc2, $acc1 adds $t2, $t2, $t3 adc $t3, $acc2, xzr\n"; } # At the beginning of the reduce stage t = [t3:t2:t1:t0] is a product of # r = [r1:r0] and acc = [acc2:acc1:acc0] # r is 124 bits at most (due to clamping) and acc is 131 bits at most # (acc2 is at most 4 before the addition and can be at most 6 when we add in # the next block) therefore t is at most 255 bits big, and t3 is 63 bits. sub poly_reduce_stage { $code.="and $acc2, $t2, #3 // At this point acc2 is 2 bits at most (value of 3) and $acc0, $t2, #-4 extr $t2, $t3, $t2, #2 adds $acc0, $acc0, $t0 lsr $t0, $t3, #2 adc $acc1, $t3, $t0 // No carry out since t0 is 61 bits and t3 is 63 bits adds $acc0, $acc0, $t2 adcs $acc1, $acc1, $t1 adc $acc2, $acc2, xzr // At this point acc2 has the value of 4 at most \n"; } sub poly_mul { &poly_stage1(); &poly_stage2(); &poly_reduce_stage(); } sub chacha_qr_x3 { my ($dir)=@_; my ($shift_b,$shift_d) = $dir =~ /left/ ? 
("#4","#12") : ("#12","#4"); $code.=<<___; add $A0.4s, $A0.4s, $B0.4s add $A1.4s, $A1.4s, $B1.4s add $A2.4s, $A2.4s, $B2.4s eor $D0.16b, $D0.16b, $A0.16b eor $D1.16b, $D1.16b, $A1.16b eor $D2.16b, $D2.16b, $A2.16b rev32 $D0.8h, $D0.8h rev32 $D1.8h, $D1.8h rev32 $D2.8h, $D2.8h add $C0.4s, $C0.4s, $D0.4s add $C1.4s, $C1.4s, $D1.4s add $C2.4s, $C2.4s, $D2.4s eor $B0.16b, $B0.16b, $C0.16b eor $B1.16b, $B1.16b, $C1.16b eor $B2.16b, $B2.16b, $C2.16b ushr $T0.4s, $B0.4s, #20 sli $T0.4s, $B0.4s, #12 ushr $B0.4s, $B1.4s, #20 sli $B0.4s, $B1.4s, #12 ushr $B1.4s, $B2.4s, #20 sli $B1.4s, $B2.4s, #12 add $A0.4s, $A0.4s, $T0.4s add $A1.4s, $A1.4s, $B0.4s add $A2.4s, $A2.4s, $B1.4s eor $D0.16b, $D0.16b, $A0.16b eor $D1.16b, $D1.16b, $A1.16b eor $D2.16b, $D2.16b, $A2.16b tbl $D0.16b, {$D0.16b}, $ROL8.16b tbl $D1.16b, {$D1.16b}, $ROL8.16b tbl $D2.16b, {$D2.16b}, $ROL8.16b add $C0.4s, $C0.4s, $D0.4s add $C1.4s, $C1.4s, $D1.4s add $C2.4s, $C2.4s, $D2.4s eor $T0.16b, $T0.16b, $C0.16b eor $B0.16b, $B0.16b, $C1.16b eor $B1.16b, $B1.16b, $C2.16b ushr $B2.4s, $B1.4s, #25 sli $B2.4s, $B1.4s, #7 ushr $B1.4s, $B0.4s, #25 sli $B1.4s, $B0.4s, #7 ushr $B0.4s, $T0.4s, #25 sli $B0.4s, $T0.4s, #7 ext $B0.16b, $B0.16b, $B0.16b, $shift_b ext $B1.16b, $B1.16b, $B1.16b, $shift_b ext $B2.16b, $B2.16b, $B2.16b, $shift_b ext $C0.16b, $C0.16b, $C0.16b, #8 ext $C1.16b, $C1.16b, $C1.16b, #8 ext $C2.16b, $C2.16b, $C2.16b, #8 ext $D0.16b, $D0.16b, $D0.16b, $shift_d ext $D1.16b, $D1.16b, $D1.16b, $shift_d ext $D2.16b, $D2.16b, $D2.16b, $shift_d ___ } # When preparing 5 ChaCha20 blocks in parallel, we operate on 4 blocks vertically as introduced by Andrew Moon # the fifth block is done horizontally sub chacha_qr_x5 { my ($dir)=@_; my ($a0,$a1,$a2,$a3) = $dir =~ /left/ ? ($A0,$A1,$A2,$A3) : ($A0,$A1,$A2,$A3); my ($b0,$b1,$b2,$b3) = $dir =~ /left/ ? ($B0,$B1,$B2,$B3) : ($B1,$B2,$B3,$B0); my ($c0,$c1,$c2,$c3) = $dir =~ /left/ ? ($C0,$C1,$C2,$C3) : ($C2,$C3,$C0,$C1); my ($d0,$d1,$d2,$d3) = $dir =~ /left/ ? ($D0,$D1,$D2,$D3) : ($D3,$D0,$D1,$D2); my ($shift_b,$shift_d) = $dir =~ /left/ ? 
("#4","#12") : ("#12","#4"); $code.=<<___; add $a0.4s, $a0.4s, $b0.4s add $a1.4s, $a1.4s, $b1.4s add $a2.4s, $a2.4s, $b2.4s add $a3.4s, $a3.4s, $b3.4s add $A4.4s, $A4.4s, $B4.4s eor $d0.16b, $d0.16b, $a0.16b eor $d1.16b, $d1.16b, $a1.16b eor $d2.16b, $d2.16b, $a2.16b eor $d3.16b, $d3.16b, $a3.16b eor $D4.16b, $D4.16b, $A4.16b rev32 $d0.8h, $d0.8h rev32 $d1.8h, $d1.8h rev32 $d2.8h, $d2.8h rev32 $d3.8h, $d3.8h rev32 $D4.8h, $D4.8h add $c0.4s, $c0.4s, $d0.4s add $c1.4s, $c1.4s, $d1.4s add $c2.4s, $c2.4s, $d2.4s add $c3.4s, $c3.4s, $d3.4s add $C4.4s, $C4.4s, $D4.4s eor $b0.16b, $b0.16b, $c0.16b eor $b1.16b, $b1.16b, $c1.16b eor $b2.16b, $b2.16b, $c2.16b eor $b3.16b, $b3.16b, $c3.16b eor $B4.16b, $B4.16b, $C4.16b ushr $T0.4s, $b0.4s, #20 sli $T0.4s, $b0.4s, #12 ushr $b0.4s, $b1.4s, #20 sli $b0.4s, $b1.4s, #12 ushr $b1.4s, $b2.4s, #20 sli $b1.4s, $b2.4s, #12 ushr $b2.4s, $b3.4s, #20 sli $b2.4s, $b3.4s, #12 ushr $b3.4s, $B4.4s, #20 sli $b3.4s, $B4.4s, #12 add $a0.4s, $a0.4s, $T0.4s add $a1.4s, $a1.4s, $b0.4s add $a2.4s, $a2.4s, $b1.4s add $a3.4s, $a3.4s, $b2.4s add $A4.4s, $A4.4s, $b3.4s eor $d0.16b, $d0.16b, $a0.16b eor $d1.16b, $d1.16b, $a1.16b eor $d2.16b, $d2.16b, $a2.16b eor $d3.16b, $d3.16b, $a3.16b eor $D4.16b, $D4.16b, $A4.16b tbl $d0.16b, {$d0.16b}, $ROL8.16b tbl $d1.16b, {$d1.16b}, $ROL8.16b tbl $d2.16b, {$d2.16b}, $ROL8.16b tbl $d3.16b, {$d3.16b}, $ROL8.16b tbl $D4.16b, {$D4.16b}, $ROL8.16b add $c0.4s, $c0.4s, $d0.4s add $c1.4s, $c1.4s, $d1.4s add $c2.4s, $c2.4s, $d2.4s add $c3.4s, $c3.4s, $d3.4s add $C4.4s, $C4.4s, $D4.4s eor $T0.16b, $T0.16b, $c0.16b eor $b0.16b, $b0.16b, $c1.16b eor $b1.16b, $b1.16b, $c2.16b eor $b2.16b, $b2.16b, $c3.16b eor $b3.16b, $b3.16b, $C4.16b ushr $B4.4s, $b3.4s, #25 sli $B4.4s, $b3.4s, #7 ushr $b3.4s, $b2.4s, #25 sli $b3.4s, $b2.4s, #7 ushr $b2.4s, $b1.4s, #25 sli $b2.4s, $b1.4s, #7 ushr $b1.4s, $b0.4s, #25 sli $b1.4s, $b0.4s, #7 ushr $b0.4s, $T0.4s, #25 sli $b0.4s, $T0.4s, #7 ext $B4.16b, $B4.16b, $B4.16b, $shift_b ext $C4.16b, $C4.16b, $C4.16b, #8 ext $D4.16b, $D4.16b, $D4.16b, $shift_d ___ } { $code.=<<___; .section .rodata .align 7 .Lchacha20_consts: .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' .Linc: .long 1,2,3,4 .Lrol8: .byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 .Lclamp: .quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC .text .type .Lpoly_hash_ad_internal,%function .align 6 .Lpoly_hash_ad_internal: .cfi_startproc cbnz $adl, .Lpoly_hash_intro ret .Lpoly_hash_intro: cmp $adl, #16 b.lt .Lpoly_hash_ad_tail ___ &poly_add($adp); &poly_mul(); $code.=<<___; sub $adl, $adl, #16 b .Lpoly_hash_ad_internal .Lpoly_hash_ad_tail: cbz $adl, .Lpoly_hash_ad_ret eor $T0.16b, $T0.16b, $T0.16b // Use T0 to load the AAD sub $adl, $adl, #1 .Lpoly_hash_tail_16_compose: ext $T0.16b, $T0.16b, $T0.16b, #15 ldrb $t0w, [$adp, $adl] mov $T0.b[0], $t0w subs $adl, $adl, #1 b.ge .Lpoly_hash_tail_16_compose ___ &poly_add_vec($T0); &poly_mul(); $code.=<<___; .Lpoly_hash_ad_ret: ret .cfi_endproc .size .Lpoly_hash_ad_internal, .-.Lpoly_hash_ad_internal ///////////////////////////////// // // void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data); // .globl chacha20_poly1305_seal .type chacha20_poly1305_seal,%function .align 6 chacha20_poly1305_seal: AARCH64_SIGN_LINK_REGISTER .cfi_startproc stp x29, x30, [sp, #-80]! 
.cfi_def_cfa_offset 80 .cfi_offset w30, -72 .cfi_offset w29, -80 mov x29, sp // We probably could do .cfi_def_cfa w29, 80 at this point, but since // we don't actually use the frame pointer like that, it's probably not // worth bothering. stp d8, d9, [sp, #16] stp d10, d11, [sp, #32] stp d12, d13, [sp, #48] stp d14, d15, [sp, #64] .cfi_offset b15, -8 .cfi_offset b14, -16 .cfi_offset b13, -24 .cfi_offset b12, -32 .cfi_offset b11, -40 .cfi_offset b10, -48 .cfi_offset b9, -56 .cfi_offset b8, -64 adrp $t0, :pg_hi21:.Lchacha20_consts add $t0, $t0, :lo12:.Lchacha20_consts ld1 {$CONSTS.16b - $CLAMP.16b}, [$t0] // Load the CONSTS, INC, ROL8 and CLAMP values ld1 {$B_STORE.16b - $D_STORE.16b}, [$keyp] mov $one, #1 // Prepare the Poly1305 state mov $acc0, #0 mov $acc1, #0 mov $acc2, #0 ldr $t1, [$keyp, #56] // The total cipher text length includes extra_in_len add $t1, $t1, $inl mov $LEN_STORE.d[0], $adl // Store the input and aad lengths mov $LEN_STORE.d[1], $t1 cmp $inl, #128 b.le .Lseal_128 // Optimization for smaller buffers // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext, // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically, // the fifth block (A4-D4) horizontally. ld4r {$A0.4s-$A3.4s}, [$t0] mov $A4.16b, $CONSTS.16b ld4r {$B0.4s-$B3.4s}, [$keyp], #16 mov $B4.16b, $B_STORE.16b ld4r {$C0.4s-$C3.4s}, [$keyp], #16 mov $C4.16b, $C_STORE.16b ld4r {$D0.4s-$D3.4s}, [$keyp] add $D0.4s, $D0.4s, $INC.4s mov $D4.16b, $D_STORE.16b sub $keyp, $keyp, #32 mov $itr1, #10 .align 5 .Lseal_init_rounds: ___ &chacha_qr_x5("left"); &chacha_qr_x5("right"); $code.=<<___; subs $itr1, $itr1, #1 b.hi .Lseal_init_rounds add $D0.4s, $D0.4s, $INC.4s mov $t0, #4 dup $T0.4s, $t0w add $INC.4s, $INC.4s, $T0.4s zip1 $T0.4s, $A0.4s, $A1.4s zip2 $T1.4s, $A0.4s, $A1.4s zip1 $T2.4s, $A2.4s, $A3.4s zip2 $T3.4s, $A2.4s, $A3.4s zip1 $A0.2d, $T0.2d, $T2.2d zip2 $A1.2d, $T0.2d, $T2.2d zip1 $A2.2d, $T1.2d, $T3.2d zip2 $A3.2d, $T1.2d, $T3.2d zip1 $T0.4s, $B0.4s, $B1.4s zip2 $T1.4s, $B0.4s, $B1.4s zip1 $T2.4s, $B2.4s, $B3.4s zip2 $T3.4s, $B2.4s, $B3.4s zip1 $B0.2d, $T0.2d, $T2.2d zip2 $B1.2d, $T0.2d, $T2.2d zip1 $B2.2d, $T1.2d, $T3.2d zip2 $B3.2d, $T1.2d, $T3.2d zip1 $T0.4s, $C0.4s, $C1.4s zip2 $T1.4s, $C0.4s, $C1.4s zip1 $T2.4s, $C2.4s, $C3.4s zip2 $T3.4s, $C2.4s, $C3.4s zip1 $C0.2d, $T0.2d, $T2.2d zip2 $C1.2d, $T0.2d, $T2.2d zip1 $C2.2d, $T1.2d, $T3.2d zip2 $C3.2d, $T1.2d, $T3.2d zip1 $T0.4s, $D0.4s, $D1.4s zip2 $T1.4s, $D0.4s, $D1.4s zip1 $T2.4s, $D2.4s, $D3.4s zip2 $T3.4s, $D2.4s, $D3.4s zip1 $D0.2d, $T0.2d, $T2.2d zip2 $D1.2d, $T0.2d, $T2.2d zip1 $D2.2d, $T1.2d, $T3.2d zip2 $D3.2d, $T1.2d, $T3.2d add $A4.4s, $A4.4s, $CONSTS.4s add $B4.4s, $B4.4s, $B_STORE.4s and $A4.16b, $A4.16b, $CLAMP.16b add $A0.4s, $A0.4s, $CONSTS.4s add $B0.4s, $B0.4s, $B_STORE.4s add $C0.4s, $C0.4s, $C_STORE.4s add $D0.4s, $D0.4s, $D_STORE.4s add $A1.4s, $A1.4s, $CONSTS.4s add $B1.4s, $B1.4s, $B_STORE.4s add $C1.4s, $C1.4s, $C_STORE.4s add $D1.4s, $D1.4s, $D_STORE.4s add $A2.4s, $A2.4s, $CONSTS.4s add $B2.4s, $B2.4s, $B_STORE.4s add $C2.4s, $C2.4s, $C_STORE.4s add $D2.4s, $D2.4s, $D_STORE.4s add $A3.4s, $A3.4s, $CONSTS.4s add $B3.4s, $B3.4s, $B_STORE.4s add $C3.4s, $C3.4s, $C_STORE.4s add $D3.4s, $D3.4s, $D_STORE.4s mov $r0, $A4.d[0] // Move the R key to GPRs mov $r1, $A4.d[1] mov $S_STORE.16b, $B4.16b // Store the S key bl .Lpoly_hash_ad_internal mov $adp, $oup cmp $inl, #256 b.le .Lseal_tail ld1 {$T0.16b - $T3.16b}, [$inp], #64 eor $T0.16b, $T0.16b, $A0.16b eor 
$T1.16b, $T1.16b, $B0.16b eor $T2.16b, $T2.16b, $C0.16b eor $T3.16b, $T3.16b, $D0.16b st1 {$T0.16b - $T3.16b}, [$oup], #64 ld1 {$T0.16b - $T3.16b}, [$inp], #64 eor $T0.16b, $T0.16b, $A1.16b eor $T1.16b, $T1.16b, $B1.16b eor $T2.16b, $T2.16b, $C1.16b eor $T3.16b, $T3.16b, $D1.16b st1 {$T0.16b - $T3.16b}, [$oup], #64 ld1 {$T0.16b - $T3.16b}, [$inp], #64 eor $T0.16b, $T0.16b, $A2.16b eor $T1.16b, $T1.16b, $B2.16b eor $T2.16b, $T2.16b, $C2.16b eor $T3.16b, $T3.16b, $D2.16b st1 {$T0.16b - $T3.16b}, [$oup], #64 ld1 {$T0.16b - $T3.16b}, [$inp], #64 eor $T0.16b, $T0.16b, $A3.16b eor $T1.16b, $T1.16b, $B3.16b eor $T2.16b, $T2.16b, $C3.16b eor $T3.16b, $T3.16b, $D3.16b st1 {$T0.16b - $T3.16b}, [$oup], #64 sub $inl, $inl, #256 mov $itr1, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds mov $itr2, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256 .Lseal_main_loop: adrp $t0, :pg_hi21:.Lchacha20_consts add $t0, $t0, :lo12:.Lchacha20_consts ld4r {$A0.4s-$A3.4s}, [$t0] mov $A4.16b, $CONSTS.16b ld4r {$B0.4s-$B3.4s}, [$keyp], #16 mov $B4.16b, $B_STORE.16b ld4r {$C0.4s-$C3.4s}, [$keyp], #16 mov $C4.16b, $C_STORE.16b ld4r {$D0.4s-$D3.4s}, [$keyp] add $D0.4s, $D0.4s, $INC.4s mov $D4.16b, $D_STORE.16b eor $T0.16b, $T0.16b, $T0.16b //zero not $T1.16b, $T0.16b // -1 sub $T1.4s, $INC.4s, $T1.4s // Add +1 ext $T0.16b, $T1.16b, $T0.16b, #12 // Get the last element (counter) add $D4.4s, $D4.4s, $T0.4s sub $keyp, $keyp, #32 .align 5 .Lseal_main_loop_rounds: ___ &chacha_qr_x5("left"); &poly_add($adp); &poly_mul(); &chacha_qr_x5("right"); $code.=<<___; subs $itr1, $itr1, #1 b.ge .Lseal_main_loop_rounds ___ &poly_add($adp); &poly_mul(); $code.=<<___; subs $itr2, $itr2, #1 b.gt .Lseal_main_loop_rounds eor $T0.16b, $T0.16b, $T0.16b //zero not $T1.16b, $T0.16b // -1 sub $T1.4s, $INC.4s, $T1.4s // Add +1 ext $T0.16b, $T1.16b, $T0.16b, #12 // Get the last element (counter) add $D4.4s, $D4.4s, $T0.4s add $D0.4s, $D0.4s, $INC.4s mov $t0, #5 dup $T0.4s, $t0w add $INC.4s, $INC.4s, $T0.4s zip1 $T0.4s, $A0.4s, $A1.4s zip2 $T1.4s, $A0.4s, $A1.4s zip1 $T2.4s, $A2.4s, $A3.4s zip2 $T3.4s, $A2.4s, $A3.4s zip1 $A0.2d, $T0.2d, $T2.2d zip2 $A1.2d, $T0.2d, $T2.2d zip1 $A2.2d, $T1.2d, $T3.2d zip2 $A3.2d, $T1.2d, $T3.2d zip1 $T0.4s, $B0.4s, $B1.4s zip2 $T1.4s, $B0.4s, $B1.4s zip1 $T2.4s, $B2.4s, $B3.4s zip2 $T3.4s, $B2.4s, $B3.4s zip1 $B0.2d, $T0.2d, $T2.2d zip2 $B1.2d, $T0.2d, $T2.2d zip1 $B2.2d, $T1.2d, $T3.2d zip2 $B3.2d, $T1.2d, $T3.2d zip1 $T0.4s, $C0.4s, $C1.4s zip2 $T1.4s, $C0.4s, $C1.4s zip1 $T2.4s, $C2.4s, $C3.4s zip2 $T3.4s, $C2.4s, $C3.4s zip1 $C0.2d, $T0.2d, $T2.2d zip2 $C1.2d, $T0.2d, $T2.2d zip1 $C2.2d, $T1.2d, $T3.2d zip2 $C3.2d, $T1.2d, $T3.2d zip1 $T0.4s, $D0.4s, $D1.4s zip2 $T1.4s, $D0.4s, $D1.4s zip1 $T2.4s, $D2.4s, $D3.4s zip2 $T3.4s, $D2.4s, $D3.4s zip1 $D0.2d, $T0.2d, $T2.2d zip2 $D1.2d, $T0.2d, $T2.2d zip1 $D2.2d, $T1.2d, $T3.2d zip2 $D3.2d, $T1.2d, $T3.2d add $A0.4s, $A0.4s, $CONSTS.4s add $B0.4s, $B0.4s, $B_STORE.4s add $C0.4s, $C0.4s, $C_STORE.4s add $D0.4s, $D0.4s, $D_STORE.4s add $A1.4s, $A1.4s, $CONSTS.4s add $B1.4s, $B1.4s, $B_STORE.4s add $C1.4s, $C1.4s, $C_STORE.4s add $D1.4s, $D1.4s, $D_STORE.4s add $A2.4s, $A2.4s, $CONSTS.4s add $B2.4s, $B2.4s, $B_STORE.4s add $C2.4s, $C2.4s, $C_STORE.4s add $D2.4s, $D2.4s, $D_STORE.4s add $A3.4s, $A3.4s, $CONSTS.4s add $B3.4s, $B3.4s, $B_STORE.4s add $C3.4s, $C3.4s, $C_STORE.4s add $D3.4s, $D3.4s, $D_STORE.4s add $A4.4s, $A4.4s, $CONSTS.4s add $B4.4s, $B4.4s, $B_STORE.4s add 
$C4.4s, $C4.4s, $C_STORE.4s add $D4.4s, $D4.4s, $D_STORE.4s cmp $inl, #320 b.le .Lseal_tail ld1 {$T0.16b - $T3.16b}, [$inp], #64 eor $T0.16b, $T0.16b, $A0.16b eor $T1.16b, $T1.16b, $B0.16b eor $T2.16b, $T2.16b, $C0.16b eor $T3.16b, $T3.16b, $D0.16b st1 {$T0.16b - $T3.16b}, [$oup], #64 ld1 {$T0.16b - $T3.16b}, [$inp], #64 eor $T0.16b, $T0.16b, $A1.16b eor $T1.16b, $T1.16b, $B1.16b eor $T2.16b, $T2.16b, $C1.16b eor $T3.16b, $T3.16b, $D1.16b st1 {$T0.16b - $T3.16b}, [$oup], #64 ld1 {$T0.16b - $T3.16b}, [$inp], #64 eor $T0.16b, $T0.16b, $A2.16b eor $T1.16b, $T1.16b, $B2.16b eor $T2.16b, $T2.16b, $C2.16b eor $T3.16b, $T3.16b, $D2.16b st1 {$T0.16b - $T3.16b}, [$oup], #64 ld1 {$T0.16b - $T3.16b}, [$inp], #64 eor $T0.16b, $T0.16b, $A3.16b eor $T1.16b, $T1.16b, $B3.16b eor $T2.16b, $T2.16b, $C3.16b eor $T3.16b, $T3.16b, $D3.16b st1 {$T0.16b - $T3.16b}, [$oup], #64 ld1 {$T0.16b - $T3.16b}, [$inp], #64 eor $T0.16b, $T0.16b, $A4.16b eor $T1.16b, $T1.16b, $B4.16b eor $T2.16b, $T2.16b, $C4.16b eor $T3.16b, $T3.16b, $D4.16b st1 {$T0.16b - $T3.16b}, [$oup], #64 sub $inl, $inl, #320 mov $itr1, #0 mov $itr2, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration b .Lseal_main_loop .Lseal_tail: // This part of the function handles the storage and authentication of the last [0,320) bytes // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data. cmp $inl, #64 b.lt .Lseal_tail_64 // Store and authenticate 64B blocks per iteration ld1 {$T0.16b - $T3.16b}, [$inp], #64 eor $T0.16b, $T0.16b, $A0.16b eor $T1.16b, $T1.16b, $B0.16b eor $T2.16b, $T2.16b, $C0.16b eor $T3.16b, $T3.16b, $D0.16b ___ &poly_add_vec($T0); &poly_mul(); &poly_add_vec($T1); &poly_mul(); &poly_add_vec($T2); &poly_mul(); &poly_add_vec($T3); &poly_mul(); $code.=<<___; st1 {$T0.16b - $T3.16b}, [$oup], #64 sub $inl, $inl, #64 // Shift the state left by 64 bytes for the next iteration of the loop mov $A0.16b, $A1.16b mov $B0.16b, $B1.16b mov $C0.16b, $C1.16b mov $D0.16b, $D1.16b mov $A1.16b, $A2.16b mov $B1.16b, $B2.16b mov $C1.16b, $C2.16b mov $D1.16b, $D2.16b mov $A2.16b, $A3.16b mov $B2.16b, $B3.16b mov $C2.16b, $C3.16b mov $D2.16b, $D3.16b mov $A3.16b, $A4.16b mov $B3.16b, $B4.16b mov $C3.16b, $C4.16b mov $D3.16b, $D4.16b b .Lseal_tail .Lseal_tail_64: ldp $adp, $adl, [$keyp, #48] // extra_in_len and extra_in_ptr // Here we handle the last [0,64) bytes of plaintext cmp $inl, #16 b.lt .Lseal_tail_16 // Each iteration encrypt and authenticate a 16B block ld1 {$T0.16b}, [$inp], #16 eor $T0.16b, $T0.16b, $A0.16b ___ &poly_add_vec($T0); &poly_mul(); $code.=<<___; st1 {$T0.16b}, [$oup], #16 sub $inl, $inl, #16 // Shift the state left by 16 bytes for the next iteration of the loop mov $A0.16b, $B0.16b mov $B0.16b, $C0.16b mov $C0.16b, $D0.16b b .Lseal_tail_64 .Lseal_tail_16: // Here we handle the last [0,16) bytes of ciphertext that require a padded block cbz $inl, .Lseal_hash_extra eor $T0.16b, $T0.16b, $T0.16b // Use T0 to load the plaintext/extra in eor $T1.16b, $T1.16b, $T1.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes not $T2.16b, $T0.16b mov $itr1, $inl add $inp, $inp, $inl cbz $adl, .Lseal_tail_16_compose // No extra data to pad with, zero padding mov $itr2, #16 // We need to load some extra_in first for padding sub $itr2, $itr2, $inl cmp $adl, $itr2 csel $itr2, $adl, $itr2, lt // Load the minimum of extra_in_len and the amount needed to fill the register mov $t1, $itr2 add $adp, $adp, $itr2 sub $adl, $adl, $itr2 .Lseal_tail16_compose_extra_in: ext $T0.16b, 
$T0.16b, $T0.16b, #15 ldrb $t0w, [$adp, #-1]! mov $T0.b[0], $t0w subs $itr2, $itr2, #1 b.gt .Lseal_tail16_compose_extra_in add $adp, $adp, $t1 .Lseal_tail_16_compose: ext $T0.16b, $T0.16b, $T0.16b, #15 ldrb $t0w, [$inp, #-1]! mov $T0.b[0], $t0w ext $T1.16b, $T2.16b, $T1.16b, #15 subs $inl, $inl, #1 b.gt .Lseal_tail_16_compose and $A0.16b, $A0.16b, $T1.16b eor $T0.16b, $T0.16b, $A0.16b mov $T1.16b, $T0.16b .Lseal_tail_16_store: umov $t0w, $T0.b[0] strb $t0w, [$oup], #1 ext $T0.16b, $T0.16b, $T0.16b, #1 subs $itr1, $itr1, #1 b.gt .Lseal_tail_16_store // Hash in the final ct block concatenated with extra_in ___ &poly_add_vec($T1); &poly_mul(); $code.=<<___; .Lseal_hash_extra: cbz $adl, .Lseal_finalize .Lseal_hash_extra_loop: cmp $adl, #16 b.lt .Lseal_hash_extra_tail ld1 {$T0.16b}, [$adp], #16 ___ &poly_add_vec($T0); &poly_mul(); $code.=<<___; sub $adl, $adl, #16 b .Lseal_hash_extra_loop .Lseal_hash_extra_tail: cbz $adl, .Lseal_finalize eor $T0.16b, $T0.16b, $T0.16b // Use T0 to load the remaining extra ciphertext add $adp, $adp, $adl .Lseal_hash_extra_load: ext $T0.16b, $T0.16b, $T0.16b, #15 ldrb $t0w, [$adp, #-1]! mov $T0.b[0], $t0w subs $adl, $adl, #1 b.gt .Lseal_hash_extra_load // Hash in the final padded extra_in blcok ___ &poly_add_vec($T0); &poly_mul(); $code.=<<___; .Lseal_finalize: ___ &poly_add_vec($LEN_STORE); &poly_mul(); $code.=<<___; // Final reduction step sub $t1, xzr, $one orr $t2, xzr, #3 subs $t0, $acc0, #-5 sbcs $t1, $acc1, $t1 sbcs $t2, $acc2, $t2 csel $acc0, $t0, $acc0, cs csel $acc1, $t1, $acc1, cs csel $acc2, $t2, $acc2, cs ___ &poly_add_vec($S_STORE); $code.=<<___; stp $acc0, $acc1, [$keyp] ldp d8, d9, [sp, #16] ldp d10, d11, [sp, #32] ldp d12, d13, [sp, #48] ldp d14, d15, [sp, #64] .cfi_restore b15 .cfi_restore b14 .cfi_restore b13 .cfi_restore b12 .cfi_restore b11 .cfi_restore b10 .cfi_restore b9 .cfi_restore b8 ldp x29, x30, [sp], 80 .cfi_restore w29 .cfi_restore w30 .cfi_def_cfa_offset 0 AARCH64_VALIDATE_LINK_REGISTER ret .Lseal_128: // On some architectures preparing 5 blocks for small buffers is wasteful eor $INC.16b, $INC.16b, $INC.16b mov $t0, #1 mov $INC.s[0], $t0w mov $A0.16b, $CONSTS.16b mov $A1.16b, $CONSTS.16b mov $A2.16b, $CONSTS.16b mov $B0.16b, $B_STORE.16b mov $B1.16b, $B_STORE.16b mov $B2.16b, $B_STORE.16b mov $C0.16b, $C_STORE.16b mov $C1.16b, $C_STORE.16b mov $C2.16b, $C_STORE.16b mov $D2.16b, $D_STORE.16b add $D0.4s, $D2.4s, $INC.4s add $D1.4s, $D0.4s, $INC.4s mov $itr1, #10 .Lseal_128_rounds: ___ &chacha_qr_x3("left"); &chacha_qr_x3("right"); $code.=<<___; subs $itr1, $itr1, #1 b.hi .Lseal_128_rounds add $A0.4s, $A0.4s, $CONSTS.4s add $A1.4s, $A1.4s, $CONSTS.4s add $A2.4s, $A2.4s, $CONSTS.4s add $B0.4s, $B0.4s, $B_STORE.4s add $B1.4s, $B1.4s, $B_STORE.4s add $B2.4s, $B2.4s, $B_STORE.4s // Only the first 32 bytes of the third block (counter = 0) are needed, // so skip updating $C2 and $D2. 
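// (That block only feeds the Poly1305 key: its first 16 bytes are clamped
// into r and the next 16 bytes become s, just below.)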
add $C0.4s, $C0.4s, $C_STORE.4s add $C1.4s, $C1.4s, $C_STORE.4s add $D_STORE.4s, $D_STORE.4s, $INC.4s add $D0.4s, $D0.4s, $D_STORE.4s add $D_STORE.4s, $D_STORE.4s, $INC.4s add $D1.4s, $D1.4s, $D_STORE.4s and $A2.16b, $A2.16b, $CLAMP.16b mov $r0, $A2.d[0] // Move the R key to GPRs mov $r1, $A2.d[1] mov $S_STORE.16b, $B2.16b // Store the S key bl .Lpoly_hash_ad_internal b .Lseal_tail .cfi_endproc .size chacha20_poly1305_seal,.-chacha20_poly1305_seal ///////////////////////////////// // // void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data); // .globl chacha20_poly1305_open .type chacha20_poly1305_open,%function .align 6 chacha20_poly1305_open: AARCH64_SIGN_LINK_REGISTER .cfi_startproc stp x29, x30, [sp, #-80]! .cfi_def_cfa_offset 80 .cfi_offset w30, -72 .cfi_offset w29, -80 mov x29, sp // We probably could do .cfi_def_cfa w29, 80 at this point, but since // we don't actually use the frame pointer like that, it's probably not // worth bothering. stp d8, d9, [sp, #16] stp d10, d11, [sp, #32] stp d12, d13, [sp, #48] stp d14, d15, [sp, #64] .cfi_offset b15, -8 .cfi_offset b14, -16 .cfi_offset b13, -24 .cfi_offset b12, -32 .cfi_offset b11, -40 .cfi_offset b10, -48 .cfi_offset b9, -56 .cfi_offset b8, -64 adrp $t0, :pg_hi21:.Lchacha20_consts add $t0, $t0, :lo12:.Lchacha20_consts ld1 {$CONSTS.16b - $CLAMP.16b}, [$t0] // Load the CONSTS, INC, ROL8 and CLAMP values ld1 {$B_STORE.16b - $D_STORE.16b}, [$keyp] mov $one, #1 // Prepare the Poly1305 state mov $acc0, #0 mov $acc1, #0 mov $acc2, #0 mov $LEN_STORE.d[0], $adl // Store the input and aad lengths mov $LEN_STORE.d[1], $inl cmp $inl, #128 b.le .Lopen_128 // Optimization for smaller buffers // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys mov $A0.16b, $CONSTS.16b mov $B0.16b, $B_STORE.16b mov $C0.16b, $C_STORE.16b mov $D0.16b, $D_STORE.16b mov $itr1, #10 .align 5 .Lopen_init_rounds: ___ &chacha_qr($A0, $B0, $C0, $D0, $T0, "left"); &chacha_qr($A0, $B0, $C0, $D0, $T0, "right"); $code.=<<___; subs $itr1, $itr1, #1 b.hi .Lopen_init_rounds add $A0.4s, $A0.4s, $CONSTS.4s add $B0.4s, $B0.4s, $B_STORE.4s and $A0.16b, $A0.16b, $CLAMP.16b mov $r0, $A0.d[0] // Move the R key to GPRs mov $r1, $A0.d[1] mov $S_STORE.16b, $B0.16b // Store the S key bl .Lpoly_hash_ad_internal .Lopen_ad_done: mov $adp, $inp // Each iteration of the loop hash 320 bytes, and prepare stream for 320 bytes .Lopen_main_loop: cmp $inl, #192 b.lt .Lopen_tail adrp $t0, :pg_hi21:.Lchacha20_consts add $t0, $t0, :lo12:.Lchacha20_consts ld4r {$A0.4s-$A3.4s}, [$t0] mov $A4.16b, $CONSTS.16b ld4r {$B0.4s-$B3.4s}, [$keyp], #16 mov $B4.16b, $B_STORE.16b ld4r {$C0.4s-$C3.4s}, [$keyp], #16 mov $C4.16b, $C_STORE.16b ld4r {$D0.4s-$D3.4s}, [$keyp] sub $keyp, $keyp, #32 add $D0.4s, $D0.4s, $INC.4s mov $D4.16b, $D_STORE.16b eor $T0.16b, $T0.16b, $T0.16b //zero not $T1.16b, $T0.16b // -1 sub $T1.4s, $INC.4s, $T1.4s // Add +1 ext $T0.16b, $T1.16b, $T0.16b, #12 // Get the last element (counter) add $D4.4s, $D4.4s, $T0.4s lsr $adl, $inl, #4 // How many whole blocks we have to hash, will always be at least 12 sub $adl, $adl, #10 mov $itr2, #10 subs $itr1, $itr2, $adl subs $itr1, $itr2, $adl // itr1 can be negative if we have more than 320 bytes to hash csel $itr2, $itr2, $adl, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full cbz $itr2, .Lopen_main_loop_rounds_short .align 5 .Lopen_main_loop_rounds: ___ &poly_add($adp); &poly_mul(); $code.=<<___; .Lopen_main_loop_rounds_short: 
___ &chacha_qr_x5("left"); &poly_add($adp); &poly_mul(); &chacha_qr_x5("right"); $code.=<<___; subs $itr2, $itr2, #1 b.gt .Lopen_main_loop_rounds subs $itr1, $itr1, #1 b.ge .Lopen_main_loop_rounds_short ___ $code.=<<___; eor $T0.16b, $T0.16b, $T0.16b //zero not $T1.16b, $T0.16b // -1 sub $T1.4s, $INC.4s, $T1.4s // Add +1 ext $T0.16b, $T1.16b, $T0.16b, #12 // Get the last element (counter) add $D4.4s, $D4.4s, $T0.4s add $D0.4s, $D0.4s, $INC.4s mov $t0, #5 dup $T0.4s, $t0w add $INC.4s, $INC.4s, $T0.4s zip1 $T0.4s, $A0.4s, $A1.4s zip2 $T1.4s, $A0.4s, $A1.4s zip1 $T2.4s, $A2.4s, $A3.4s zip2 $T3.4s, $A2.4s, $A3.4s zip1 $A0.2d, $T0.2d, $T2.2d zip2 $A1.2d, $T0.2d, $T2.2d zip1 $A2.2d, $T1.2d, $T3.2d zip2 $A3.2d, $T1.2d, $T3.2d zip1 $T0.4s, $B0.4s, $B1.4s zip2 $T1.4s, $B0.4s, $B1.4s zip1 $T2.4s, $B2.4s, $B3.4s zip2 $T3.4s, $B2.4s, $B3.4s zip1 $B0.2d, $T0.2d, $T2.2d zip2 $B1.2d, $T0.2d, $T2.2d zip1 $B2.2d, $T1.2d, $T3.2d zip2 $B3.2d, $T1.2d, $T3.2d zip1 $T0.4s, $C0.4s, $C1.4s zip2 $T1.4s, $C0.4s, $C1.4s zip1 $T2.4s, $C2.4s, $C3.4s zip2 $T3.4s, $C2.4s, $C3.4s zip1 $C0.2d, $T0.2d, $T2.2d zip2 $C1.2d, $T0.2d, $T2.2d zip1 $C2.2d, $T1.2d, $T3.2d zip2 $C3.2d, $T1.2d, $T3.2d zip1 $T0.4s, $D0.4s, $D1.4s zip2 $T1.4s, $D0.4s, $D1.4s zip1 $T2.4s, $D2.4s, $D3.4s zip2 $T3.4s, $D2.4s, $D3.4s zip1 $D0.2d, $T0.2d, $T2.2d zip2 $D1.2d, $T0.2d, $T2.2d zip1 $D2.2d, $T1.2d, $T3.2d zip2 $D3.2d, $T1.2d, $T3.2d add $A0.4s, $A0.4s, $CONSTS.4s add $B0.4s, $B0.4s, $B_STORE.4s add $C0.4s, $C0.4s, $C_STORE.4s add $D0.4s, $D0.4s, $D_STORE.4s add $A1.4s, $A1.4s, $CONSTS.4s add $B1.4s, $B1.4s, $B_STORE.4s add $C1.4s, $C1.4s, $C_STORE.4s add $D1.4s, $D1.4s, $D_STORE.4s add $A2.4s, $A2.4s, $CONSTS.4s add $B2.4s, $B2.4s, $B_STORE.4s add $C2.4s, $C2.4s, $C_STORE.4s add $D2.4s, $D2.4s, $D_STORE.4s add $A3.4s, $A3.4s, $CONSTS.4s add $B3.4s, $B3.4s, $B_STORE.4s add $C3.4s, $C3.4s, $C_STORE.4s add $D3.4s, $D3.4s, $D_STORE.4s add $A4.4s, $A4.4s, $CONSTS.4s add $B4.4s, $B4.4s, $B_STORE.4s add $C4.4s, $C4.4s, $C_STORE.4s add $D4.4s, $D4.4s, $D_STORE.4s // We can always safely store 192 bytes ld1 {$T0.16b - $T3.16b}, [$inp], #64 eor $T0.16b, $T0.16b, $A0.16b eor $T1.16b, $T1.16b, $B0.16b eor $T2.16b, $T2.16b, $C0.16b eor $T3.16b, $T3.16b, $D0.16b st1 {$T0.16b - $T3.16b}, [$oup], #64 ld1 {$T0.16b - $T3.16b}, [$inp], #64 eor $T0.16b, $T0.16b, $A1.16b eor $T1.16b, $T1.16b, $B1.16b eor $T2.16b, $T2.16b, $C1.16b eor $T3.16b, $T3.16b, $D1.16b st1 {$T0.16b - $T3.16b}, [$oup], #64 ld1 {$T0.16b - $T3.16b}, [$inp], #64 eor $T0.16b, $T0.16b, $A2.16b eor $T1.16b, $T1.16b, $B2.16b eor $T2.16b, $T2.16b, $C2.16b eor $T3.16b, $T3.16b, $D2.16b st1 {$T0.16b - $T3.16b}, [$oup], #64 sub $inl, $inl, #192 mov $A0.16b, $A3.16b mov $B0.16b, $B3.16b mov $C0.16b, $C3.16b mov $D0.16b, $D3.16b cmp $inl, #64 b.lt .Lopen_tail_64_store ld1 {$T0.16b - $T3.16b}, [$inp], #64 eor $T0.16b, $T0.16b, $A3.16b eor $T1.16b, $T1.16b, $B3.16b eor $T2.16b, $T2.16b, $C3.16b eor $T3.16b, $T3.16b, $D3.16b st1 {$T0.16b - $T3.16b}, [$oup], #64 sub $inl, $inl, #64 mov $A0.16b, $A4.16b mov $B0.16b, $B4.16b mov $C0.16b, $C4.16b mov $D0.16b, $D4.16b cmp $inl, #64 b.lt .Lopen_tail_64_store ld1 {$T0.16b - $T3.16b}, [$inp], #64 eor $T0.16b, $T0.16b, $A4.16b eor $T1.16b, $T1.16b, $B4.16b eor $T2.16b, $T2.16b, $C4.16b eor $T3.16b, $T3.16b, $D4.16b st1 {$T0.16b - $T3.16b}, [$oup], #64 sub $inl, $inl, #64 b .Lopen_main_loop .Lopen_tail: cbz $inl, .Lopen_finalize lsr $adl, $inl, #4 // How many whole blocks we have to hash cmp $inl, #64 b.le .Lopen_tail_64 cmp $inl, #128 b.le .Lopen_tail_128 
.Lopen_tail_192: // We need three more blocks mov $A0.16b, $CONSTS.16b mov $A1.16b, $CONSTS.16b mov $A2.16b, $CONSTS.16b mov $B0.16b, $B_STORE.16b mov $B1.16b, $B_STORE.16b mov $B2.16b, $B_STORE.16b mov $C0.16b, $C_STORE.16b mov $C1.16b, $C_STORE.16b mov $C2.16b, $C_STORE.16b mov $D0.16b, $D_STORE.16b mov $D1.16b, $D_STORE.16b mov $D2.16b, $D_STORE.16b eor $T3.16b, $T3.16b, $T3.16b eor $T1.16b, $T1.16b, $T1.16b ins $T3.s[0], $INC.s[0] ins $T1.d[0], $one add $T2.4s, $T3.4s, $T1.4s add $T1.4s, $T2.4s, $T1.4s add $D0.4s, $D0.4s, $T1.4s add $D1.4s, $D1.4s, $T3.4s add $D2.4s, $D2.4s, $T2.4s mov $itr2, #10 subs $itr1, $itr2, $adl // itr1 can be negative if we have more than 160 bytes to hash csel $itr2, $itr2, $adl, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing sub $adl, $adl, $itr2 cbz $itr2, .Lopen_tail_192_rounds_no_hash .Lopen_tail_192_rounds: ___ &poly_add($adp); &poly_mul(); $code.=<<___; .Lopen_tail_192_rounds_no_hash: ___ &chacha_qr_x3("left"); &chacha_qr_x3("right"); $code.=<<___; subs $itr2, $itr2, #1 b.gt .Lopen_tail_192_rounds subs $itr1, $itr1, #1 b.ge .Lopen_tail_192_rounds_no_hash // We hashed 160 bytes at most, may still have 32 bytes left .Lopen_tail_192_hash: cbz $adl, .Lopen_tail_192_hash_done ___ &poly_add($adp); &poly_mul(); $code.=<<___; sub $adl, $adl, #1 b .Lopen_tail_192_hash .Lopen_tail_192_hash_done: add $A0.4s, $A0.4s, $CONSTS.4s add $A1.4s, $A1.4s, $CONSTS.4s add $A2.4s, $A2.4s, $CONSTS.4s add $B0.4s, $B0.4s, $B_STORE.4s add $B1.4s, $B1.4s, $B_STORE.4s add $B2.4s, $B2.4s, $B_STORE.4s add $C0.4s, $C0.4s, $C_STORE.4s add $C1.4s, $C1.4s, $C_STORE.4s add $C2.4s, $C2.4s, $C_STORE.4s add $D0.4s, $D0.4s, $D_STORE.4s add $D1.4s, $D1.4s, $D_STORE.4s add $D2.4s, $D2.4s, $D_STORE.4s add $D0.4s, $D0.4s, $T1.4s add $D1.4s, $D1.4s, $T3.4s add $D2.4s, $D2.4s, $T2.4s ld1 {$T0.16b - $T3.16b}, [$inp], #64 eor $T0.16b, $T0.16b, $A1.16b eor $T1.16b, $T1.16b, $B1.16b eor $T2.16b, $T2.16b, $C1.16b eor $T3.16b, $T3.16b, $D1.16b st1 {$T0.16b - $T3.16b}, [$oup], #64 ld1 {$T0.16b - $T3.16b}, [$inp], #64 eor $T0.16b, $T0.16b, $A2.16b eor $T1.16b, $T1.16b, $B2.16b eor $T2.16b, $T2.16b, $C2.16b eor $T3.16b, $T3.16b, $D2.16b st1 {$T0.16b - $T3.16b}, [$oup], #64 sub $inl, $inl, #128 b .Lopen_tail_64_store .Lopen_tail_128: // We need two more blocks mov $A0.16b, $CONSTS.16b mov $A1.16b, $CONSTS.16b mov $B0.16b, $B_STORE.16b mov $B1.16b, $B_STORE.16b mov $C0.16b, $C_STORE.16b mov $C1.16b, $C_STORE.16b mov $D0.16b, $D_STORE.16b mov $D1.16b, $D_STORE.16b eor $T3.16b, $T3.16b, $T3.16b eor $T2.16b, $T2.16b, $T2.16b ins $T3.s[0], $INC.s[0] ins $T2.d[0], $one add $T2.4s, $T2.4s, $T3.4s add $D0.4s, $D0.4s, $T2.4s add $D1.4s, $D1.4s, $T3.4s mov $itr1, #10 sub $itr1, $itr1, $adl .Lopen_tail_128_rounds: ___ &chacha_qr($A0, $B0, $C0, $D0, $T0, "left"); &chacha_qr($A1, $B1, $C1, $D1, $T0, "left"); &chacha_qr($A0, $B0, $C0, $D0, $T0, "right"); &chacha_qr($A1, $B1, $C1, $D1, $T0, "right"); $code.=<<___; subs $itr1, $itr1, #1 b.gt .Lopen_tail_128_rounds cbz $adl, .Lopen_tail_128_rounds_done subs $adl, $adl, #1 ___ &poly_add($adp); &poly_mul(); $code.=<<___; b .Lopen_tail_128_rounds .Lopen_tail_128_rounds_done: add $A0.4s, $A0.4s, $CONSTS.4s add $A1.4s, $A1.4s, $CONSTS.4s add $B0.4s, $B0.4s, $B_STORE.4s add $B1.4s, $B1.4s, $B_STORE.4s add $C0.4s, $C0.4s, $C_STORE.4s add $C1.4s, $C1.4s, $C_STORE.4s add $D0.4s, $D0.4s, $D_STORE.4s add $D1.4s, $D1.4s, $D_STORE.4s add $D0.4s, $D0.4s, $T2.4s add $D1.4s, $D1.4s, $T3.4s ld1 {$T0.16b - $T3.16b}, [$inp], #64 eor $T0.16b, $T0.16b, 
$A1.16b eor $T1.16b, $T1.16b, $B1.16b eor $T2.16b, $T2.16b, $C1.16b eor $T3.16b, $T3.16b, $D1.16b st1 {$T0.16b - $T3.16b}, [$oup], #64 sub $inl, $inl, #64 b .Lopen_tail_64_store .Lopen_tail_64: // We just need a single block mov $A0.16b, $CONSTS.16b mov $B0.16b, $B_STORE.16b mov $C0.16b, $C_STORE.16b mov $D0.16b, $D_STORE.16b eor $T3.16b, $T3.16b, $T3.16b ins $T3.s[0], $INC.s[0] add $D0.4s, $D0.4s, $T3.4s mov $itr1, #10 sub $itr1, $itr1, $adl .Lopen_tail_64_rounds: ___ &chacha_qr($A0, $B0, $C0, $D0, $T0, "left"); &chacha_qr($A0, $B0, $C0, $D0, $T0, "right"); $code.=<<___; subs $itr1, $itr1, #1 b.gt .Lopen_tail_64_rounds cbz $adl, .Lopen_tail_64_rounds_done subs $adl, $adl, #1 ___ &poly_add($adp); &poly_mul(); $code.=<<___; b .Lopen_tail_64_rounds .Lopen_tail_64_rounds_done: add $A0.4s, $A0.4s, $CONSTS.4s add $B0.4s, $B0.4s, $B_STORE.4s add $C0.4s, $C0.4s, $C_STORE.4s add $D0.4s, $D0.4s, $D_STORE.4s add $D0.4s, $D0.4s, $T3.4s .Lopen_tail_64_store: cmp $inl, #16 b.lt .Lopen_tail_16 ld1 {$T0.16b}, [$inp], #16 eor $T0.16b, $T0.16b, $A0.16b st1 {$T0.16b}, [$oup], #16 mov $A0.16b, $B0.16b mov $B0.16b, $C0.16b mov $C0.16b, $D0.16b sub $inl, $inl, #16 b .Lopen_tail_64_store .Lopen_tail_16: // Here we handle the last [0,16) bytes that require a padded block cbz $inl, .Lopen_finalize eor $T0.16b, $T0.16b, $T0.16b // Use T0 to load the ciphertext eor $T1.16b, $T1.16b, $T1.16b // Use T1 to generate an AND mask not $T2.16b, $T0.16b add $itr2, $inp, $inl mov $itr1, $inl .Lopen_tail_16_compose: ext $T0.16b, $T0.16b, $T0.16b, #15 ldrb $t0w, [$itr2, #-1]! mov $T0.b[0], $t0w ext $T1.16b, $T2.16b, $T1.16b, #15 subs $inl, $inl, #1 b.gt .Lopen_tail_16_compose and $T0.16b, $T0.16b, $T1.16b // Hash in the final padded block ___ &poly_add_vec($T0); &poly_mul(); $code.=<<___; eor $T0.16b, $T0.16b, $A0.16b .Lopen_tail_16_store: umov $t0w, $T0.b[0] strb $t0w, [$oup], #1 ext $T0.16b, $T0.16b, $T0.16b, #1 subs $itr1, $itr1, #1 b.gt .Lopen_tail_16_store .Lopen_finalize: ___ &poly_add_vec($LEN_STORE); &poly_mul(); $code.=<<___; // Final reduction step sub $t1, xzr, $one orr $t2, xzr, #3 subs $t0, $acc0, #-5 sbcs $t1, $acc1, $t1 sbcs $t2, $acc2, $t2 csel $acc0, $t0, $acc0, cs csel $acc1, $t1, $acc1, cs csel $acc2, $t2, $acc2, cs ___ &poly_add_vec($S_STORE); $code.=<<___; stp $acc0, $acc1, [$keyp] ldp d8, d9, [sp, #16] ldp d10, d11, [sp, #32] ldp d12, d13, [sp, #48] ldp d14, d15, [sp, #64] .cfi_restore b15 .cfi_restore b14 .cfi_restore b13 .cfi_restore b12 .cfi_restore b11 .cfi_restore b10 .cfi_restore b9 .cfi_restore b8 ldp x29, x30, [sp], 80 .cfi_restore w29 .cfi_restore w30 .cfi_def_cfa_offset 0 AARCH64_VALIDATE_LINK_REGISTER ret .Lopen_128: // On some architectures preparing 5 blocks for small buffers is wasteful eor $INC.16b, $INC.16b, $INC.16b mov $t0, #1 mov $INC.s[0], $t0w mov $A0.16b, $CONSTS.16b mov $A1.16b, $CONSTS.16b mov $A2.16b, $CONSTS.16b mov $B0.16b, $B_STORE.16b mov $B1.16b, $B_STORE.16b mov $B2.16b, $B_STORE.16b mov $C0.16b, $C_STORE.16b mov $C1.16b, $C_STORE.16b mov $C2.16b, $C_STORE.16b mov $D2.16b, $D_STORE.16b add $D0.4s, $D2.4s, $INC.4s add $D1.4s, $D0.4s, $INC.4s mov $itr1, #10 .Lopen_128_rounds: ___ &chacha_qr_x3("left"); &chacha_qr_x3("right"); $code.=<<___; subs $itr1, $itr1, #1 b.hi .Lopen_128_rounds add $A0.4s, $A0.4s, $CONSTS.4s add $A1.4s, $A1.4s, $CONSTS.4s add $A2.4s, $A2.4s, $CONSTS.4s add $B0.4s, $B0.4s, $B_STORE.4s add $B1.4s, $B1.4s, $B_STORE.4s add $B2.4s, $B2.4s, $B_STORE.4s add $C0.4s, $C0.4s, $C_STORE.4s add $C1.4s, $C1.4s, $C_STORE.4s add $D_STORE.4s, $D_STORE.4s, $INC.4s add 
$D0.4s, $D0.4s, $D_STORE.4s add $D_STORE.4s, $D_STORE.4s, $INC.4s add $D1.4s, $D1.4s, $D_STORE.4s and $A2.16b, $A2.16b, $CLAMP.16b mov $r0, $A2.d[0] // Move the R key to GPRs mov $r1, $A2.d[1] mov $S_STORE.16b, $B2.16b // Store the S key bl .Lpoly_hash_ad_internal .Lopen_128_store: cmp $inl, #64 b.lt .Lopen_128_store_64 ld1 {$T0.16b - $T3.16b}, [$inp], #64 ___ &poly_add_vec($T0); &poly_mul(); &poly_add_vec($T1); &poly_mul(); &poly_add_vec($T2); &poly_mul(); &poly_add_vec($T3); &poly_mul(); $code.=<<___; eor $T0.16b, $T0.16b, $A0.16b eor $T1.16b, $T1.16b, $B0.16b eor $T2.16b, $T2.16b, $C0.16b eor $T3.16b, $T3.16b, $D0.16b st1 {$T0.16b - $T3.16b}, [$oup], #64 sub $inl, $inl, #64 mov $A0.16b, $A1.16b mov $B0.16b, $B1.16b mov $C0.16b, $C1.16b mov $D0.16b, $D1.16b .Lopen_128_store_64: lsr $adl, $inl, #4 mov $adp, $inp .Lopen_128_hash_64: cbz $adl, .Lopen_tail_64_store ___ &poly_add($adp); &poly_mul(); $code.=<<___; sub $adl, $adl, #1 b .Lopen_128_hash_64 .cfi_endproc .size chacha20_poly1305_open,.-chacha20_poly1305_open ___ } foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/ge; print $_,"\n"; } close STDOUT or die "error closing STDOUT"; ring-0.17.14/crypto/cipher/asm/chacha20_poly1305_x86_64.pl000064400000000000000000002337261046102023000206240ustar 00000000000000#!/usr/bin/env perl # Copyright (c) 2015, CloudFlare Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
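# Note on the Poly1305 helpers below: poly_add absorbs a 16-byte block into
# the 3-limb accumulator, and poly_stage1..3 plus poly_reduce_stage multiply
# it by the clamped key r and partially reduce modulo 2^130 - 5, using the
# identity 2^130 mod (2^130 - 5) = 5: the ~255-bit product t is split as
# t = hi*2^130 + lo and folded as lo + 5*hi, which keeps the top limb small
# (the comments below bound it by 4); the single conditional subtraction
# happens once, at the end of seal/open. As an illustrative sketch only --
# this sub is never called by the generator -- one multiply-and-reduce step
# with big integers:
sub poly1305_mulmod_sketch {
    my ($acc, $r) = @_;                   # Math::BigInt accumulator and clamped r
    require Math::BigInt;
    my $t      = $acc->copy->bmul($r);    # product, at most ~255 bits
    my $two130 = Math::BigInt->new(1)->blsft(130);
    my $hi     = $t->copy->brsft(130);    # t >> 130
    my $lo     = $t->copy->bmod($two130); # t mod 2^130
    return $lo->badd($hi->bmul(5));       # partially reduced accumulator
}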
############################################################################## # # # Author: Vlad Krasnov # # # ############################################################################## $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; $avx = 2; $code.=<<___; .section .rodata .align 64 chacha20_poly1305_constants: .Lchacha20_consts: .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' .Lrol8: .byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 .byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 .Lrol16: .byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 .byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 .Lavx2_init: .long 0,0,0,0 .Lsse_inc: .long 1,0,0,0 .Lavx2_inc: .long 2,0,0,0,2,0,0,0 .Lclamp: .quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC .quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF .align 16 .Land_masks: .byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .text ___ my ($oup,$inp,$inl,$adp,$keyp,$itr1,$itr2,$adl)=("%rdi","%rsi","%rbx","%rcx","%r9","%rcx","%r8","%r8"); my ($acc0,$acc1,$acc2)=map("%r$_",(10..12)); my ($t0,$t1,$t2,$t3)=("%r13","%r14","%r15","%r9"); my ($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%xmm$_",(0..15)); my ($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3); my $xmm_storage = 0; if ($win64) { $xmm_storage = 10*16; } my $xmm_store="0*16(%rbp)"; my $r_store="$xmm_storage+0*16(%rbp)"; my $s_store="$xmm_storage+1*16(%rbp)"; my $len_store="$xmm_storage+2*16(%rbp)"; my $state1_store="$xmm_storage+3*16(%rbp)"; my $state2_store="$xmm_storage+4*16(%rbp)"; my $tmp_store="$xmm_storage+5*16(%rbp)"; my $ctr0_store="$xmm_storage+6*16(%rbp)"; my $ctr1_store="$xmm_storage+7*16(%rbp)"; my $ctr2_store="$xmm_storage+8*16(%rbp)"; my $ctr3_store="$xmm_storage+9*16(%rbp)"; sub chacha_qr { my ($a,$b,$c,$d,$t,$dir)=@_; 
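    # Emits one ChaCha20 quarter round over the four 32-bit lanes of
    # ($a,$b,$c,$d): pshufb against .Lrol16/.Lrol8 performs the 16- and 8-bit
    # rotations, pslld/psrld/pxor through the scratch register $t performs the
    # 12- and 7-bit ones, and palignr rotates the b/c/d rows to move between
    # the column-round ("left") and diagonal-round ("right") arrangements of
    # the state. A "store"/"load" marker in $dir spills the caller's live
    # value of $t to $tmp_store before it is clobbered and reloads it
    # afterwards.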
$code.="movdqa $t, $tmp_store\n" if ($dir =~ /store/); $code.="paddd $b, $a pxor $a, $d pshufb .Lrol16(%rip), $d paddd $d, $c pxor $c, $b movdqa $b, $t pslld \$12, $t psrld \$20, $b pxor $t, $b paddd $b, $a pxor $a, $d pshufb .Lrol8(%rip), $d paddd $d, $c pxor $c, $b movdqa $b, $t pslld \$7, $t psrld \$25, $b pxor $t, $b\n"; $code.="palignr \$4, $b, $b palignr \$8, $c, $c palignr \$12, $d, $d\n" if ($dir =~ /left/); $code.="palignr \$12, $b, $b palignr \$8, $c, $c palignr \$4, $d, $d\n" if ($dir =~ /right/); $code.="movdqa $tmp_store, $t\n" if ($dir =~ /load/); } sub poly_add { my ($src)=@_; $code.="add 0+$src, $acc0 adc 8+$src, $acc1 adc \$1, $acc2\n"; } sub poly_stage1 { $code.="mov 0+$r_store, %rax mov %rax, $t2 mul $acc0 mov %rax, $t0 mov %rdx, $t1 mov 0+$r_store, %rax mul $acc1 imulq $acc2, $t2 add %rax, $t1 adc %rdx, $t2\n"; } sub poly_stage2 { $code.="mov 8+$r_store, %rax mov %rax, $t3 mul $acc0 add %rax, $t1 adc \$0, %rdx mov %rdx, $acc0 mov 8+$r_store, %rax mul $acc1 add %rax, $t2 adc \$0, %rdx\n"; } sub poly_stage3 { $code.="imulq $acc2, $t3 add $acc0, $t2 adc %rdx, $t3\n"; } # At the beginning of the reduce stage t = [t3:t2:t1:t0] is a product of # r = [r1:r0] and acc = [acc2:acc1:acc0] # r is 124 bits at most (due to clamping) and acc is 131 bits at most # (acc2 is at most 4 before the addition and can be at most 6 when we add in # the next block) therefore t is at most 255 bits big, and t3 is 63 bits. sub poly_reduce_stage { $code.="mov $t0, $acc0 mov $t1, $acc1 mov $t2, $acc2 and \$3, $acc2 # At this point acc2 is 2 bits at most (value of 3) mov $t2, $t0 and \$-4, $t0 mov $t3, $t1 shrd \$2, $t3, $t2 shr \$2, $t3 add $t0, $t2 adc $t1, $t3 # No carry out since t3 is 61 bits and t1 is 63 bits add $t2, $acc0 adc $t3, $acc1 adc \$0, $acc2\n"; # At this point acc2 has the value of 4 at most } sub poly_mul { &poly_stage1(); &poly_stage2(); &poly_stage3(); &poly_reduce_stage(); } sub prep_state { my ($n)=@_; $code.="movdqa .Lchacha20_consts(%rip), $A0 movdqa $state1_store, $B0 movdqa $state2_store, $C0\n"; $code.="movdqa $A0, $A1 movdqa $B0, $B1 movdqa $C0, $C1\n" if ($n ge 2); $code.="movdqa $A0, $A2 movdqa $B0, $B2 movdqa $C0, $C2\n" if ($n ge 3); $code.="movdqa $A0, $A3 movdqa $B0, $B3 movdqa $C0, $C3\n" if ($n ge 4); $code.="movdqa $ctr0_store, $D0 paddd .Lsse_inc(%rip), $D0 movdqa $D0, $ctr0_store\n" if ($n eq 1); $code.="movdqa $ctr0_store, $D1 paddd .Lsse_inc(%rip), $D1 movdqa $D1, $D0 paddd .Lsse_inc(%rip), $D0 movdqa $D0, $ctr0_store movdqa $D1, $ctr1_store\n" if ($n eq 2); $code.="movdqa $ctr0_store, $D2 paddd .Lsse_inc(%rip), $D2 movdqa $D2, $D1 paddd .Lsse_inc(%rip), $D1 movdqa $D1, $D0 paddd .Lsse_inc(%rip), $D0 movdqa $D0, $ctr0_store movdqa $D1, $ctr1_store movdqa $D2, $ctr2_store\n" if ($n eq 3); $code.="movdqa $ctr0_store, $D3 paddd .Lsse_inc(%rip), $D3 movdqa $D3, $D2 paddd .Lsse_inc(%rip), $D2 movdqa $D2, $D1 paddd .Lsse_inc(%rip), $D1 movdqa $D1, $D0 paddd .Lsse_inc(%rip), $D0 movdqa $D0, $ctr0_store movdqa $D1, $ctr1_store movdqa $D2, $ctr2_store movdqa $D3, $ctr3_store\n" if ($n eq 4); } sub finalize_state { my ($n)=@_; $code.="paddd .Lchacha20_consts(%rip), $A3 paddd $state1_store, $B3 paddd $state2_store, $C3 paddd $ctr3_store, $D3\n" if ($n eq 4); $code.="paddd .Lchacha20_consts(%rip), $A2 paddd $state1_store, $B2 paddd $state2_store, $C2 paddd $ctr2_store, $D2\n" if ($n ge 3); $code.="paddd .Lchacha20_consts(%rip), $A1 paddd $state1_store, $B1 paddd $state2_store, $C1 paddd $ctr1_store, $D1\n" if ($n ge 2); $code.="paddd .Lchacha20_consts(%rip), $A0 paddd 
$state1_store, $B0 paddd $state2_store, $C0 paddd $ctr0_store, $D0\n"; } sub xor_stream { my ($A, $B, $C, $D, $offset)=@_; $code.="movdqu 0*16 + $offset($inp), $A3 movdqu 1*16 + $offset($inp), $B3 movdqu 2*16 + $offset($inp), $C3 movdqu 3*16 + $offset($inp), $D3 pxor $A3, $A pxor $B3, $B pxor $C3, $C pxor $D, $D3 movdqu $A, 0*16 + $offset($oup) movdqu $B, 1*16 + $offset($oup) movdqu $C, 2*16 + $offset($oup) movdqu $D3, 3*16 + $offset($oup)\n"; } sub xor_stream_using_temp { my ($A, $B, $C, $D, $offset, $temp)=@_; $code.="movdqa $temp, $tmp_store movdqu 0*16 + $offset($inp), $temp pxor $A, $temp movdqu $temp, 0*16 + $offset($oup) movdqu 1*16 + $offset($inp), $temp pxor $B, $temp movdqu $temp, 1*16 + $offset($oup) movdqu 2*16 + $offset($inp), $temp pxor $C, $temp movdqu $temp, 2*16 + $offset($oup) movdqu 3*16 + $offset($inp), $temp pxor $D, $temp movdqu $temp, 3*16 + $offset($oup)\n"; } sub gen_chacha_round { my ($rot1, $rot2, $shift)=@_; my $round=""; $round.="movdqa $C0, $tmp_store\n" if ($rot1 eq 20); $round.="movdqa $rot2, $C0 paddd $B3, $A3 paddd $B2, $A2 paddd $B1, $A1 paddd $B0, $A0 pxor $A3, $D3 pxor $A2, $D2 pxor $A1, $D1 pxor $A0, $D0 pshufb $C0, $D3 pshufb $C0, $D2 pshufb $C0, $D1 pshufb $C0, $D0 movdqa $tmp_store, $C0 paddd $D3, $C3 paddd $D2, $C2 paddd $D1, $C1 paddd $D0, $C0 pxor $C3, $B3 pxor $C2, $B2 pxor $C1, $B1 pxor $C0, $B0 movdqa $C0, $tmp_store movdqa $B3, $C0 psrld \$$rot1, $C0 pslld \$32-$rot1, $B3 pxor $C0, $B3 movdqa $B2, $C0 psrld \$$rot1, $C0 pslld \$32-$rot1, $B2 pxor $C0, $B2 movdqa $B1, $C0 psrld \$$rot1, $C0 pslld \$32-$rot1, $B1 pxor $C0, $B1 movdqa $B0, $C0 psrld \$$rot1, $C0 pslld \$32-$rot1, $B0 pxor $C0, $B0\n"; ($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/); ($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/); $round.="movdqa $tmp_store, $C0 palignr \$$s1, $B3, $B3 palignr \$$s2, $C3, $C3 palignr \$$s3, $D3, $D3 palignr \$$s1, $B2, $B2 palignr \$$s2, $C2, $C2 palignr \$$s3, $D2, $D2 palignr \$$s1, $B1, $B1 palignr \$$s2, $C1, $C1 palignr \$$s3, $D1, $D1 palignr \$$s1, $B0, $B0 palignr \$$s2, $C0, $C0 palignr \$$s3, $D0, $D0\n" if (($shift =~ /left/) || ($shift =~ /right/)); return $round; }; $chacha_body = &gen_chacha_round(20, ".Lrol16(%rip)") . &gen_chacha_round(25, ".Lrol8(%rip)", "left") . &gen_chacha_round(20, ".Lrol16(%rip)") . 
&gen_chacha_round(25, ".Lrol8(%rip)", "right"); my @loop_body = split /\n/, $chacha_body; sub emit_body { my ($n)=@_; for (my $i=0; $i < $n; $i++) { $code=$code.shift(@loop_body)."\n"; }; } { ################################################################################ # void poly_hash_ad_internal(); $code.=" .type poly_hash_ad_internal,\@abi-omnipotent .align 64 poly_hash_ad_internal: .cfi_startproc .cfi_def_cfa rsp, 8 xor $acc0, $acc0 xor $acc1, $acc1 xor $acc2, $acc2 cmp \$13, $itr2 jne .Lhash_ad_loop .Lpoly_fast_tls_ad: # Special treatment for the TLS case of 13 bytes mov ($adp), $acc0 mov 5($adp), $acc1 shr \$24, $acc1 mov \$1, $acc2\n"; &poly_mul(); $code.=" ret .Lhash_ad_loop: # Hash in 16 byte chunk cmp \$16, $itr2 jb .Lhash_ad_tail\n"; &poly_add("0($adp)"); &poly_mul(); $code.=" lea 1*16($adp), $adp sub \$16, $itr2 jmp .Lhash_ad_loop .Lhash_ad_tail: cmp \$0, $itr2 je .Lhash_ad_done # Hash last < 16 byte tail xor $t0, $t0 xor $t1, $t1 xor $t2, $t2 add $itr2, $adp .Lhash_ad_tail_loop: shld \$8, $t0, $t1 shl \$8, $t0 movzxb -1($adp), $t2 xor $t2, $t0 dec $adp dec $itr2 jne .Lhash_ad_tail_loop add $t0, $acc0 adc $t1, $acc1 adc \$1, $acc2\n"; &poly_mul(); $code.=" # Finished AD .Lhash_ad_done: ret .cfi_endproc .size poly_hash_ad_internal, .-poly_hash_ad_internal\n"; } { ################################################################################ # void chacha20_poly1305_open(uint8_t *out_plaintext, const uint8_t *ciphertext, # size_t plaintext_len, const uint8_t *ad, # size_t ad_len, # union chacha20_poly1305_open_data *aead_data) # $code.=" .globl chacha20_poly1305_open_sse41 .type chacha20_poly1305_open_sse41,\@function,6 .align 64 chacha20_poly1305_open_sse41: .cfi_startproc _CET_ENDBR push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 # We write the calculated authenticator back to keyp at the end, so save # the pointer on the stack too. 
push $keyp .cfi_push $keyp sub \$288 + $xmm_storage + 32, %rsp .cfi_adjust_cfa_offset 288 + 32 lea 32(%rsp), %rbp and \$-32, %rbp\n"; $code.=" movaps %xmm6,16*0+$xmm_store movaps %xmm7,16*1+$xmm_store movaps %xmm8,16*2+$xmm_store movaps %xmm9,16*3+$xmm_store movaps %xmm10,16*4+$xmm_store movaps %xmm11,16*5+$xmm_store movaps %xmm12,16*6+$xmm_store movaps %xmm13,16*7+$xmm_store movaps %xmm14,16*8+$xmm_store movaps %xmm15,16*9+$xmm_store\n" if ($win64); $code.=" mov %rdx, $inl mov $adl, 0+$len_store mov $inl, 8+$len_store cmp \$128, $inl jbe .Lopen_sse_128 # For long buffers, prepare the poly key first movdqa .Lchacha20_consts(%rip), $A0 movdqu 0*16($keyp), $B0 movdqu 1*16($keyp), $C0 movdqu 2*16($keyp), $D0 movdqa $D0, $T1 # Store on stack, to free keyp movdqa $B0, $state1_store movdqa $C0, $state2_store movdqa $D0, $ctr0_store mov \$10, $acc0 .Lopen_sse_init_rounds:\n"; &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.=" dec $acc0 jne .Lopen_sse_init_rounds # A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded paddd .Lchacha20_consts(%rip), $A0 paddd $state1_store, $B0 # Clamp and store the key pand .Lclamp(%rip), $A0 movdqa $A0, $r_store movdqa $B0, $s_store # Hash mov $adl, $itr2 call poly_hash_ad_internal .Lopen_sse_main_loop: cmp \$16*16, $inl jb .Lopen_sse_tail # Load state, increment counter blocks\n"; &prep_state(4); $code.=" # There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we # hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16 mov \$4, $itr1 mov $inp, $itr2 .Lopen_sse_main_loop_rounds:\n"; &emit_body(20); &poly_add("0($itr2)"); $code.=" lea 2*8($itr2), $itr2\n"; &emit_body(20); &poly_stage1(); &emit_body(20); &poly_stage2(); &emit_body(20); &poly_stage3(); &emit_body(20); &poly_reduce_stage(); foreach $l (@loop_body) {$code.=$l."\n";} @loop_body = split /\n/, $chacha_body; $code.=" dec $itr1 jge .Lopen_sse_main_loop_rounds\n"; &poly_add("0($itr2)"); &poly_mul(); $code.=" lea 2*8($itr2), $itr2 cmp \$-6, $itr1 jg .Lopen_sse_main_loop_rounds\n"; &finalize_state(4); &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0); &xor_stream($A2, $B2, $C2, $D2, "4*16"); &xor_stream($A1, $B1, $C1, $D1, "8*16"); &xor_stream($A0, $B0, $C0, $tmp_store, "12*16"); $code.=" lea 16*16($inp), $inp lea 16*16($oup), $oup sub \$16*16, $inl jmp .Lopen_sse_main_loop .Lopen_sse_tail: # Handle the various tail sizes efficiently test $inl, $inl jz .Lopen_sse_finalize cmp \$12*16, $inl ja .Lopen_sse_tail_256 cmp \$8*16, $inl ja .Lopen_sse_tail_192 cmp \$4*16, $inl ja .Lopen_sse_tail_128\n"; ############################################################################### # At most 64 bytes are left &prep_state(1); $code.=" xor $itr2, $itr2 mov $inl, $itr1 cmp \$16, $itr1 jb .Lopen_sse_tail_64_rounds .Lopen_sse_tail_64_rounds_and_x1hash: \n"; &poly_add("0($inp,$itr2)"); &poly_mul(); $code.=" sub \$16, $itr1 .Lopen_sse_tail_64_rounds: add \$16, $itr2\n"; &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.=" cmp \$16, $itr1 jae .Lopen_sse_tail_64_rounds_and_x1hash cmp \$10*16, $itr2 jne .Lopen_sse_tail_64_rounds\n"; &finalize_state(1); $code.=" jmp .Lopen_sse_tail_64_dec_loop ############################################################################### .Lopen_sse_tail_128:\n"; # 65 - 128 bytes are left &prep_state(2); $code.=" mov $inl, $itr1 and \$-16, $itr1 xor $itr2, $itr2 .Lopen_sse_tail_128_rounds_and_x1hash: \n"; &poly_add("0($inp,$itr2)"); &poly_mul(); $code.=" 
.Lopen_sse_tail_128_rounds: add \$16, $itr2\n"; &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");$code.=" cmp $itr1, $itr2 jb .Lopen_sse_tail_128_rounds_and_x1hash cmp \$10*16, $itr2 jne .Lopen_sse_tail_128_rounds\n"; &finalize_state(2); &xor_stream($A1, $B1, $C1, $D1, "0*16"); $code.=" sub \$4*16, $inl lea 4*16($inp), $inp lea 4*16($oup), $oup jmp .Lopen_sse_tail_64_dec_loop ############################################################################### .Lopen_sse_tail_192:\n"; # 129 - 192 bytes are left &prep_state(3); $code.=" mov $inl, $itr1 mov \$10*16, $itr2 cmp \$10*16, $itr1 cmovg $itr2, $itr1 and \$-16, $itr1 xor $itr2, $itr2 .Lopen_sse_tail_192_rounds_and_x1hash: \n"; &poly_add("0($inp,$itr2)"); &poly_mul(); $code.=" .Lopen_sse_tail_192_rounds: add \$16, $itr2\n"; &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" cmp $itr1, $itr2 jb .Lopen_sse_tail_192_rounds_and_x1hash cmp \$10*16, $itr2 jne .Lopen_sse_tail_192_rounds cmp \$11*16, $inl jb .Lopen_sse_tail_192_finish\n"; &poly_add("10*16($inp)"); &poly_mul(); $code.=" cmp \$12*16, $inl jb .Lopen_sse_tail_192_finish\n"; &poly_add("11*16($inp)"); &poly_mul(); $code.=" .Lopen_sse_tail_192_finish: \n"; &finalize_state(3); &xor_stream($A2, $B2, $C2, $D2, "0*16"); &xor_stream($A1, $B1, $C1, $D1, "4*16"); $code.=" sub \$8*16, $inl lea 8*16($inp), $inp lea 8*16($oup), $oup jmp .Lopen_sse_tail_64_dec_loop ############################################################################### .Lopen_sse_tail_256:\n"; # 193 - 255 bytes are left &prep_state(4); $code.=" xor $itr2, $itr2 .Lopen_sse_tail_256_rounds_and_x1hash: \n"; &poly_add("0($inp,$itr2)"); &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_left"); &chacha_qr($A1,$B1,$C1,$D1,$C3,"left"); &chacha_qr($A2,$B2,$C2,$D2,$C3,"left_load"); &poly_stage1(); &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_left_load"); &poly_stage2(); &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_right"); &chacha_qr($A1,$B1,$C1,$D1,$C3,"right"); &poly_stage3(); &chacha_qr($A2,$B2,$C2,$D2,$C3,"right_load"); &poly_reduce_stage(); &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_right_load"); $code.=" add \$16, $itr2 cmp \$10*16, $itr2 jb .Lopen_sse_tail_256_rounds_and_x1hash mov $inl, $itr1 and \$-16, $itr1 .Lopen_sse_tail_256_hash: \n"; &poly_add("0($inp,$itr2)"); &poly_mul(); $code.=" add \$16, $itr2 cmp $itr1, $itr2 jb .Lopen_sse_tail_256_hash\n"; &finalize_state(4); &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0); &xor_stream($A2, $B2, $C2, $D2, "4*16"); &xor_stream($A1, $B1, $C1, $D1, "8*16"); $code.=" movdqa $tmp_store, $D0 sub \$12*16, $inl lea 12*16($inp), $inp lea 12*16($oup), $oup ############################################################################### # Decrypt the remaining data, 16B at a time, using existing stream .Lopen_sse_tail_64_dec_loop: cmp \$16, $inl jb .Lopen_sse_tail_16_init sub \$16, $inl movdqu ($inp), $T0 pxor $T0, $A0 movdqu $A0, ($oup) lea 16($inp), $inp lea 16($oup), $oup movdqa $B0, $A0 movdqa $C0, $B0 movdqa $D0, $C0 jmp .Lopen_sse_tail_64_dec_loop .Lopen_sse_tail_16_init: movdqa $A0, $A1 # Decrypt up to 16 bytes at the end. .Lopen_sse_tail_16: test $inl, $inl jz .Lopen_sse_finalize # Read the final bytes into $T0. They need to be read in reverse order so # that they end up in the correct order in $T0. 
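	# Each pass shifts $T0 left one byte (pslldq) and inserts the next input
	# byte at lane 0 (pinsrb), walking $inp backwards from the last byte, so
	# input byte i lands in byte lane i of $T0; the register is then both
	# hashed and XORed with the leftover keystream in $A1.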
pxor $T0, $T0 lea -1($inp,$inl), $inp movq $inl, $itr2 .Lopen_sse_tail_16_compose: pslldq \$1, $T0 pinsrb \$0, ($inp), $T0 sub \$1, $inp sub \$1, $itr2 jnz .Lopen_sse_tail_16_compose movq $T0, $t0 pextrq \$1, $T0, $t1 # The final bytes of keystream are in $A1. pxor $A1, $T0 # Copy the plaintext bytes out. .Lopen_sse_tail_16_extract: pextrb \$0, $T0, ($oup) psrldq \$1, $T0 add \$1, $oup sub \$1, $inl jne .Lopen_sse_tail_16_extract add $t0, $acc0 adc $t1, $acc1 adc \$1, $acc2\n"; &poly_mul(); $code.=" .Lopen_sse_finalize:\n"; &poly_add($len_store); &poly_mul(); $code.=" # Final reduce mov $acc0, $t0 mov $acc1, $t1 mov $acc2, $t2 sub \$-5, $acc0 sbb \$-1, $acc1 sbb \$3, $acc2 cmovc $t0, $acc0 cmovc $t1, $acc1 cmovc $t2, $acc2 # Add in s part of the key add 0+$s_store, $acc0 adc 8+$s_store, $acc1\n"; $code.=" movaps 16*0+$xmm_store, %xmm6 movaps 16*1+$xmm_store, %xmm7 movaps 16*2+$xmm_store, %xmm8 movaps 16*3+$xmm_store, %xmm9 movaps 16*4+$xmm_store, %xmm10 movaps 16*5+$xmm_store, %xmm11 movaps 16*6+$xmm_store, %xmm12 movaps 16*7+$xmm_store, %xmm13 movaps 16*8+$xmm_store, %xmm14 movaps 16*9+$xmm_store, %xmm15\n" if ($win64); $code.=" .cfi_remember_state add \$288 + $xmm_storage + 32, %rsp .cfi_adjust_cfa_offset -(288 + 32) # The tag replaces the key on return pop $keyp .cfi_pop $keyp mov $acc0, ($keyp) mov $acc1, 8($keyp) pop %r15 .cfi_pop %r15 pop %r14 .cfi_pop %r14 pop %r13 .cfi_pop %r13 pop %r12 .cfi_pop %r12 pop %rbx .cfi_pop %rbx pop %rbp .cfi_pop %rbp ret ############################################################################### .Lopen_sse_128: .cfi_restore_state movdqu .Lchacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2 movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2 movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2 movdqu 2*16($keyp), $D0 movdqa $D0, $D1\npaddd .Lsse_inc(%rip), $D1 movdqa $D1, $D2\npaddd .Lsse_inc(%rip), $D2 movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D1, $T3 mov \$10, $acc0 .Lopen_sse_128_rounds: \n"; &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" dec $acc0 jnz .Lopen_sse_128_rounds paddd .Lchacha20_consts(%rip), $A0 paddd .Lchacha20_consts(%rip), $A1 paddd .Lchacha20_consts(%rip), $A2 paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2 paddd $T2, $C1\npaddd $T2, $C2 paddd $T3, $D1 paddd .Lsse_inc(%rip), $T3 paddd $T3, $D2 # Clamp and store the key pand .Lclamp(%rip), $A0 movdqa $A0, $r_store movdqa $B0, $s_store # Hash mov $adl, $itr2 call poly_hash_ad_internal .Lopen_sse_128_xor_hash: cmp \$16, $inl jb .Lopen_sse_tail_16 sub \$16, $inl\n"; # Load for hashing &poly_add("0*8($inp)"); $code.=" # Load for decryption movdqu 0*16($inp), $T0 pxor $T0, $A1 movdqu $A1, 0*16($oup) lea 1*16($inp), $inp lea 1*16($oup), $oup\n"; &poly_mul(); $code.=" # Shift the stream left movdqa $B1, $A1 movdqa $C1, $B1 movdqa $D1, $C1 movdqa $A2, $D1 movdqa $B2, $A2 movdqa $C2, $B2 movdqa $D2, $C2 jmp .Lopen_sse_128_xor_hash .size chacha20_poly1305_open_sse41, .-chacha20_poly1305_open_sse41 .cfi_endproc ################################################################################ ################################################################################ # void chacha20_poly1305_seal(uint8_t *out_ciphertext, const uint8_t *plaintext, # size_t plaintext_len, const uint8_t *ad, # size_t ad_len, # union chacha20_poly1305_seal_data *data); .globl 
chacha20_poly1305_seal_sse41 .type chacha20_poly1305_seal_sse41,\@function,6 .align 64 chacha20_poly1305_seal_sse41: .cfi_startproc _CET_ENDBR push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 # We write the calculated authenticator back to keyp at the end, so save # the pointer on the stack too. push $keyp .cfi_push $keyp sub \$288 + $xmm_storage + 32, %rsp .cfi_adjust_cfa_offset 288 + 32 lea 32(%rsp), %rbp and \$-32, %rbp\n"; $code.=" movaps %xmm6,16*0+$xmm_store movaps %xmm7,16*1+$xmm_store movaps %xmm8,16*2+$xmm_store movaps %xmm9,16*3+$xmm_store movaps %xmm10,16*4+$xmm_store movaps %xmm11,16*5+$xmm_store movaps %xmm12,16*6+$xmm_store movaps %xmm13,16*7+$xmm_store movaps %xmm14,16*8+$xmm_store movaps %xmm15,16*9+$xmm_store\n" if ($win64); $code.=" mov 56($keyp), $inl # extra_in_len addq %rdx, $inl mov $adl, 0+$len_store mov $inl, 8+$len_store mov %rdx, $inl cmp \$128, $inl jbe .Lseal_sse_128 # For longer buffers, prepare the poly key + some stream movdqa .Lchacha20_consts(%rip), $A0 movdqu 0*16($keyp), $B0 movdqu 1*16($keyp), $C0 movdqu 2*16($keyp), $D0 movdqa $A0, $A1 movdqa $A0, $A2 movdqa $A0, $A3 movdqa $B0, $B1 movdqa $B0, $B2 movdqa $B0, $B3 movdqa $C0, $C1 movdqa $C0, $C2 movdqa $C0, $C3 movdqa $D0, $D3 paddd .Lsse_inc(%rip), $D0 movdqa $D0, $D2 paddd .Lsse_inc(%rip), $D0 movdqa $D0, $D1 paddd .Lsse_inc(%rip), $D0 # Store on stack movdqa $B0, $state1_store movdqa $C0, $state2_store movdqa $D0, $ctr0_store movdqa $D1, $ctr1_store movdqa $D2, $ctr2_store movdqa $D3, $ctr3_store mov \$10, $acc0 .Lseal_sse_init_rounds: \n"; foreach $l (@loop_body) {$code.=$l."\n";} @loop_body = split /\n/, $chacha_body; $code.=" dec $acc0 jnz .Lseal_sse_init_rounds\n"; &finalize_state(4); $code.=" # Clamp and store the key pand .Lclamp(%rip), $A3 movdqa $A3, $r_store movdqa $B3, $s_store # Hash mov $adl, $itr2 call poly_hash_ad_internal\n"; &xor_stream($A2,$B2,$C2,$D2,"0*16"); &xor_stream($A1,$B1,$C1,$D1,"4*16"); $code.=" cmp \$12*16, $inl ja .Lseal_sse_main_init mov \$8*16, $itr1 sub \$8*16, $inl lea 8*16($inp), $inp jmp .Lseal_sse_128_tail_hash .Lseal_sse_main_init:\n"; &xor_stream($A0, $B0, $C0, $D0, "8*16"); $code.=" mov \$12*16, $itr1 sub \$12*16, $inl lea 12*16($inp), $inp mov \$2, $itr1 mov \$8, $itr2 cmp \$4*16, $inl jbe .Lseal_sse_tail_64 cmp \$8*16, $inl jbe .Lseal_sse_tail_128 cmp \$12*16, $inl jbe .Lseal_sse_tail_192 .Lseal_sse_main_loop: \n"; # The main loop &prep_state(4); $code.=" .align 32 .Lseal_sse_main_rounds: \n"; &emit_body(20); &poly_add("0($oup)"); &emit_body(20); &poly_stage1(); &emit_body(20); &poly_stage2(); &emit_body(20); &poly_stage3(); &emit_body(20); &poly_reduce_stage(); foreach $l (@loop_body) {$code.=$l."\n";} @loop_body = split /\n/, $chacha_body; $code.=" lea 16($oup), $oup dec $itr2 jge .Lseal_sse_main_rounds\n"; &poly_add("0*8($oup)"); &poly_mul(); $code.=" lea 16($oup), $oup dec $itr1 jg .Lseal_sse_main_rounds\n"; &finalize_state(4);$code.=" movdqa $D2, $tmp_store\n"; &xor_stream_using_temp($A3,$B3,$C3,$D3,0*16,$D2); $code.=" movdqa $tmp_store, $D2\n"; &xor_stream($A2,$B2,$C2,$D2, 4*16); &xor_stream($A1,$B1,$C1,$D1, 8*16); $code.=" cmp \$16*16, $inl ja .Lseal_sse_main_loop_xor mov \$12*16, $itr1 sub \$12*16, $inl lea 12*16($inp), $inp jmp .Lseal_sse_128_tail_hash .Lseal_sse_main_loop_xor: \n"; &xor_stream($A0,$B0,$C0,$D0,"12*16"); $code.=" lea 16*16($inp), $inp sub \$16*16, $inl mov \$6, $itr1 mov \$4, $itr2 cmp \$12*16, $inl jg .Lseal_sse_main_loop mov $inl, 
$itr1 test $inl, $inl je .Lseal_sse_128_tail_hash mov \$6, $itr1 cmp \$8*16, $inl ja .Lseal_sse_tail_192 cmp \$4*16, $inl ja .Lseal_sse_tail_128 ############################################################################### .Lseal_sse_tail_64: \n"; &prep_state(1); $code.=" .Lseal_sse_tail_64_rounds_and_x2hash: \n"; &poly_add("0($oup)"); &poly_mul(); $code.=" lea 16($oup), $oup .Lseal_sse_tail_64_rounds_and_x1hash: \n"; &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); &poly_add("0($oup)"); &poly_mul(); $code.=" lea 16($oup), $oup dec $itr1 jg .Lseal_sse_tail_64_rounds_and_x2hash dec $itr2 jge .Lseal_sse_tail_64_rounds_and_x1hash\n"; &finalize_state(1); $code.=" jmp .Lseal_sse_128_tail_xor ############################################################################### .Lseal_sse_tail_128:\n"; &prep_state(2); $code.=" .Lseal_sse_tail_128_rounds_and_x2hash: \n"; &poly_add("0($oup)"); &poly_mul(); $code.=" lea 16($oup), $oup .Lseal_sse_tail_128_rounds_and_x1hash: \n"; &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); &poly_add("0($oup)"); &poly_mul(); &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); $code.=" lea 16($oup), $oup dec $itr1 jg .Lseal_sse_tail_128_rounds_and_x2hash dec $itr2 jge .Lseal_sse_tail_128_rounds_and_x1hash\n"; &finalize_state(2); &xor_stream($A1,$B1,$C1,$D1,0*16); $code.=" mov \$4*16, $itr1 sub \$4*16, $inl lea 4*16($inp), $inp jmp .Lseal_sse_128_tail_hash ############################################################################### .Lseal_sse_tail_192:\n"; &prep_state(3); $code.=" .Lseal_sse_tail_192_rounds_and_x2hash: \n"; &poly_add("0($oup)"); &poly_mul(); $code.=" lea 16($oup), $oup .Lseal_sse_tail_192_rounds_and_x1hash: \n"; &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); &poly_add("0($oup)"); &poly_mul(); &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" lea 16($oup), $oup dec $itr1 jg .Lseal_sse_tail_192_rounds_and_x2hash dec $itr2 jge .Lseal_sse_tail_192_rounds_and_x1hash\n"; &finalize_state(3); &xor_stream($A2,$B2,$C2,$D2,0*16); &xor_stream($A1,$B1,$C1,$D1,4*16); $code.=" mov \$8*16, $itr1 sub \$8*16, $inl lea 8*16($inp), $inp ############################################################################### .Lseal_sse_128_tail_hash: cmp \$16, $itr1 jb .Lseal_sse_128_tail_xor\n"; &poly_add("0($oup)"); &poly_mul(); $code.=" sub \$16, $itr1 lea 16($oup), $oup jmp .Lseal_sse_128_tail_hash .Lseal_sse_128_tail_xor: cmp \$16, $inl jb .Lseal_sse_tail_16 sub \$16, $inl # Load for decryption movdqu 0*16($inp), $T0 pxor $T0, $A0 movdqu $A0, 0*16($oup) # Then hash add 0*8($oup), $acc0 adc 1*8($oup), $acc1 adc \$1, $acc2 lea 1*16($inp), $inp lea 1*16($oup), $oup\n"; &poly_mul(); $code.=" # Shift the stream left movdqa $B0, $A0 movdqa $C0, $B0 movdqa $D0, $C0 movdqa $A1, $D0 movdqa $B1, $A1 movdqa $C1, $B1 movdqa $D1, $C1 jmp .Lseal_sse_128_tail_xor .Lseal_sse_tail_16: test $inl, $inl jz .Lprocess_blocks_of_extra_in # We can only load the PT one byte at a time to avoid buffer overread mov $inl, $itr2 mov $inl, $itr1 lea -1($inp,$inl), $inp pxor $T3, $T3 .Lseal_sse_tail_16_compose: pslldq \$1, $T3 pinsrb \$0, ($inp), $T3 lea -1($inp), $inp dec $itr1 jne .Lseal_sse_tail_16_compose # XOR the keystream with the plaintext. pxor $A0, $T3 # Write ciphertext out, byte-by-byte. 
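# As with the byte-by-byte load above, the ciphertext is written out one byte at a
# time (pextrb/psrldq) so that exactly $inl bytes are stored and the output buffer
# is never overrun.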
movq $inl, $itr1 movdqu $T3, $A0 .Lseal_sse_tail_16_extract: pextrb \$0, $A0, ($oup) psrldq \$1, $A0 add \$1, $oup sub \$1, $itr1 jnz .Lseal_sse_tail_16_extract # $T3 contains the final (partial, non-empty) block of ciphertext which # needs to be fed into the Poly1305 state. The right-most $inl bytes of it # are valid. We need to fill it with extra_in bytes until full, or until we # run out of bytes. # # $keyp points to the tag output, which is actually a struct with the # extra_in pointer and length at offset 48. movq 288 + $xmm_storage + 32(%rsp), $keyp movq 56($keyp), $t1 # extra_in_len movq 48($keyp), $t0 # extra_in test $t1, $t1 jz .Lprocess_partial_block # Common case: no bytes of extra_in movq \$16, $t2 subq $inl, $t2 # 16-$inl is the number of bytes that fit into $T3. cmpq $t2, $t1 # if extra_in_len < 16-$inl, only copy extra_in_len # (note that AT&T syntax reverses the arguments) jge .Lload_extra_in movq $t1, $t2 .Lload_extra_in: # $t2 contains the number of bytes of extra_in (pointed to by $t0) to load # into $T3. They are loaded in reverse order. leaq -1($t0,$t2), $inp # Update extra_in and extra_in_len to reflect the bytes that are about to # be read. addq $t2, $t0 subq $t2, $t1 movq $t0, 48($keyp) movq $t1, 56($keyp) # Update $itr2, which is used to select the mask later on, to reflect the # extra bytes about to be added. addq $t2, $itr2 # Load $t2 bytes of extra_in into $T2. pxor $T2, $T2 .Lload_extra_load_loop: pslldq \$1, $T2 pinsrb \$0, ($inp), $T2 lea -1($inp), $inp sub \$1, $t2 jnz .Lload_extra_load_loop # Shift $T2 up the length of the remainder from the main encryption. Sadly, # the shift for an XMM register has to be a constant, thus we loop to do # this. movq $inl, $t2 .Lload_extra_shift_loop: pslldq \$1, $T2 sub \$1, $t2 jnz .Lload_extra_shift_loop # Mask $T3 (the remainder from the main encryption) so that superfluous # bytes are zero. This means that the non-zero bytes in $T2 and $T3 are # disjoint and so we can merge them with an OR. lea .Land_masks(%rip), $t2 shl \$4, $inl pand -16($t2,$inl), $T3 # Merge $T2 into $T3, forming the remainder block. por $T2, $T3 # The block of ciphertext + extra_in is ready to be included in the # Poly1305 state. movq $T3, $t0 pextrq \$1, $T3, $t1 add $t0, $acc0 adc $t1, $acc1 adc \$1, $acc2\n"; &poly_mul(); $code.=" .Lprocess_blocks_of_extra_in: # There may be additional bytes of extra_in to process. movq 288+32+$xmm_storage (%rsp), $keyp movq 48($keyp), $inp # extra_in movq 56($keyp), $itr2 # extra_in_len movq $itr2, $itr1 shr \$4, $itr2 # number of blocks .Lprocess_extra_hash_loop: jz process_extra_in_trailer\n"; &poly_add("0($inp)"); &poly_mul(); $code.=" leaq 16($inp), $inp subq \$1, $itr2 jmp .Lprocess_extra_hash_loop process_extra_in_trailer: andq \$15, $itr1 # remaining num bytes (<16) of extra_in movq $itr1, $inl jz .Ldo_length_block leaq -1($inp,$itr1), $inp .Lprocess_extra_in_trailer_load: pslldq \$1, $T3 pinsrb \$0, ($inp), $T3 lea -1($inp), $inp sub \$1, $itr1 jnz .Lprocess_extra_in_trailer_load .Lprocess_partial_block: # $T3 contains $inl bytes of data to be fed into Poly1305. 
$inl != 0 lea .Land_masks(%rip), $t2 shl \$4, $inl pand -16($t2,$inl), $T3 movq $T3, $t0 pextrq \$1, $T3, $t1 add $t0, $acc0 adc $t1, $acc1 adc \$1, $acc2\n"; &poly_mul(); $code.=" .Ldo_length_block:\n"; &poly_add($len_store); &poly_mul(); $code.=" # Final reduce mov $acc0, $t0 mov $acc1, $t1 mov $acc2, $t2 sub \$-5, $acc0 sbb \$-1, $acc1 sbb \$3, $acc2 cmovc $t0, $acc0 cmovc $t1, $acc1 cmovc $t2, $acc2 # Add in s part of the key add 0+$s_store, $acc0 adc 8+$s_store, $acc1\n"; $code.=" movaps 16*0+$xmm_store, %xmm6 movaps 16*1+$xmm_store, %xmm7 movaps 16*2+$xmm_store, %xmm8 movaps 16*3+$xmm_store, %xmm9 movaps 16*4+$xmm_store, %xmm10 movaps 16*5+$xmm_store, %xmm11 movaps 16*6+$xmm_store, %xmm12 movaps 16*7+$xmm_store, %xmm13 movaps 16*8+$xmm_store, %xmm14 movaps 16*9+$xmm_store, %xmm15\n" if ($win64); $code.=" .cfi_remember_state add \$288 + $xmm_storage + 32, %rsp .cfi_adjust_cfa_offset -(288 + 32) # The tag replaces the key on return pop $keyp .cfi_pop $keyp mov $acc0, ($keyp) mov $acc1, 8($keyp) pop %r15 .cfi_pop %r15 pop %r14 .cfi_pop %r14 pop %r13 .cfi_pop %r13 pop %r12 .cfi_pop %r12 pop %rbx .cfi_pop %rbx pop %rbp .cfi_pop %rbp ret ################################################################################ .Lseal_sse_128: .cfi_restore_state movdqu .Lchacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2 movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2 movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2 movdqu 2*16($keyp), $D2 movdqa $D2, $D0\npaddd .Lsse_inc(%rip), $D0 movdqa $D0, $D1\npaddd .Lsse_inc(%rip), $D1 movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D0, $T3 mov \$10, $acc0 .Lseal_sse_128_rounds:\n"; &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" dec $acc0 jnz .Lseal_sse_128_rounds paddd .Lchacha20_consts(%rip), $A0 paddd .Lchacha20_consts(%rip), $A1 paddd .Lchacha20_consts(%rip), $A2 paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2 paddd $T2, $C0\npaddd $T2, $C1 paddd $T3, $D0 paddd .Lsse_inc(%rip), $T3 paddd $T3, $D1 # Clamp and store the key pand .Lclamp(%rip), $A2 movdqa $A2, $r_store movdqa $B2, $s_store # Hash mov %r8, $itr2 call poly_hash_ad_internal jmp .Lseal_sse_128_tail_xor .size chacha20_poly1305_seal_sse41, .-chacha20_poly1305_seal_sse41 .cfi_endproc\n"; } if ($avx>1) { ($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%ymm$_",(0..15)); my ($A0x,$A1x,$A2x,$A3x,$B0x,$B1x,$B2x,$B3x,$C0x,$C1x,$C2x,$C3x,$D0x,$D1x,$D2x,$D3x)=map("%xmm$_",(0..15)); ($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3); $state1_store="$xmm_storage+2*32(%rbp)"; $state2_store="$xmm_storage+3*32(%rbp)"; $tmp_store="$xmm_storage+4*32(%rbp)"; $ctr0_store="$xmm_storage+5*32(%rbp)"; $ctr1_store="$xmm_storage+6*32(%rbp)"; $ctr2_store="$xmm_storage+7*32(%rbp)"; $ctr3_store="$xmm_storage+8*32(%rbp)"; sub chacha_qr_avx2 { my ($a,$b,$c,$d,$t,$dir)=@_; $code.=<<___ if ($dir =~ /store/); vmovdqa $t, $tmp_store ___ $code.=<<___; vpaddd $b, $a, $a vpxor $a, $d, $d vpshufb .Lrol16(%rip), $d, $d vpaddd $d, $c, $c vpxor $c, $b, $b vpsrld \$20, $b, $t vpslld \$12, $b, $b vpxor $t, $b, $b vpaddd $b, $a, $a vpxor $a, $d, $d vpshufb .Lrol8(%rip), $d, $d vpaddd $d, $c, $c vpxor $c, $b, $b vpslld \$7, $b, $t vpsrld \$25, $b, $b vpxor $t, $b, $b ___ $code.=<<___ if ($dir =~ /left/); vpalignr \$12, $d, $d, $d vpalignr \$8, $c, $c, $c vpalignr \$4, $b, $b, $b ___ $code.=<<___ if ($dir =~ 
/right/); vpalignr \$4, $d, $d, $d vpalignr \$8, $c, $c, $c vpalignr \$12, $b, $b, $b ___ $code.=<<___ if ($dir =~ /load/); vmovdqa $tmp_store, $t ___ } sub prep_state_avx2 { my ($n)=@_; $code.=<<___; vmovdqa .Lchacha20_consts(%rip), $A0 vmovdqa $state1_store, $B0 vmovdqa $state2_store, $C0 ___ $code.=<<___ if ($n ge 2); vmovdqa $A0, $A1 vmovdqa $B0, $B1 vmovdqa $C0, $C1 ___ $code.=<<___ if ($n ge 3); vmovdqa $A0, $A2 vmovdqa $B0, $B2 vmovdqa $C0, $C2 ___ $code.=<<___ if ($n ge 4); vmovdqa $A0, $A3 vmovdqa $B0, $B3 vmovdqa $C0, $C3 ___ $code.=<<___ if ($n eq 1); vmovdqa .Lavx2_inc(%rip), $D0 vpaddd $ctr0_store, $D0, $D0 vmovdqa $D0, $ctr0_store ___ $code.=<<___ if ($n eq 2); vmovdqa .Lavx2_inc(%rip), $D0 vpaddd $ctr0_store, $D0, $D1 vpaddd $D1, $D0, $D0 vmovdqa $D0, $ctr0_store vmovdqa $D1, $ctr1_store ___ $code.=<<___ if ($n eq 3); vmovdqa .Lavx2_inc(%rip), $D0 vpaddd $ctr0_store, $D0, $D2 vpaddd $D2, $D0, $D1 vpaddd $D1, $D0, $D0 vmovdqa $D0, $ctr0_store vmovdqa $D1, $ctr1_store vmovdqa $D2, $ctr2_store ___ $code.=<<___ if ($n eq 4); vmovdqa .Lavx2_inc(%rip), $D0 vpaddd $ctr0_store, $D0, $D3 vpaddd $D3, $D0, $D2 vpaddd $D2, $D0, $D1 vpaddd $D1, $D0, $D0 vmovdqa $D3, $ctr3_store vmovdqa $D2, $ctr2_store vmovdqa $D1, $ctr1_store vmovdqa $D0, $ctr0_store ___ } sub finalize_state_avx2 { my ($n)=@_; $code.=<<___ if ($n eq 4); vpaddd .Lchacha20_consts(%rip), $A3, $A3 vpaddd $state1_store, $B3, $B3 vpaddd $state2_store, $C3, $C3 vpaddd $ctr3_store, $D3, $D3 ___ $code.=<<___ if ($n ge 3); vpaddd .Lchacha20_consts(%rip), $A2, $A2 vpaddd $state1_store, $B2, $B2 vpaddd $state2_store, $C2, $C2 vpaddd $ctr2_store, $D2, $D2 ___ $code.=<<___ if ($n ge 2); vpaddd .Lchacha20_consts(%rip), $A1, $A1 vpaddd $state1_store, $B1, $B1 vpaddd $state2_store, $C1, $C1 vpaddd $ctr1_store, $D1, $D1 ___ $code.=<<___; vpaddd .Lchacha20_consts(%rip), $A0, $A0 vpaddd $state1_store, $B0, $B0 vpaddd $state2_store, $C0, $C0 vpaddd $ctr0_store, $D0, $D0 ___ } sub xor_stream_avx2 { my ($A, $B, $C, $D, $offset, $hlp)=@_; $code.=<<___; vperm2i128 \$0x02, $A, $B, $hlp vperm2i128 \$0x13, $A, $B, $B vperm2i128 \$0x02, $C, $D, $A vperm2i128 \$0x13, $C, $D, $C vpxor 0*32+$offset($inp), $hlp, $hlp vpxor 1*32+$offset($inp), $A, $A vpxor 2*32+$offset($inp), $B, $B vpxor 3*32+$offset($inp), $C, $C vmovdqu $hlp, 0*32+$offset($oup) vmovdqu $A, 1*32+$offset($oup) vmovdqu $B, 2*32+$offset($oup) vmovdqu $C, 3*32+$offset($oup) ___ } sub finish_stream_avx2 { my ($A, $B, $C, $D, $hlp)=@_; $code.=<<___; vperm2i128 \$0x13, $A, $B, $hlp vperm2i128 \$0x02, $A, $B, $A vperm2i128 \$0x02, $C, $D, $B vperm2i128 \$0x13, $C, $D, $D vmovdqa $hlp, $C ___ } sub poly_stage1_mulx { $code.=<<___; mov 0+$r_store, %rdx mov %rdx, $t2 mulx $acc0, $t0, $t1 mulx $acc1, %rax, %rdx imulq $acc2, $t2 add %rax, $t1 adc %rdx, $t2 ___ } sub poly_stage2_mulx { $code.=<<___; mov 8+$r_store, %rdx mulx $acc0, $acc0, %rax add $acc0, $t1 mulx $acc1, $acc1, $t3 adc $acc1, $t2 adc \$0, $t3 imulq $acc2, %rdx ___ } sub poly_stage3_mulx { $code.=<<___; add %rax, $t2 adc %rdx, $t3 ___ } sub poly_mul_mulx { &poly_stage1_mulx(); &poly_stage2_mulx(); &poly_stage3_mulx(); &poly_reduce_stage(); } sub gen_chacha_round_avx2 { my ($rot1, $rot2, $shift)=@_; my $round=""; $round=$round ."vmovdqa $C0, $tmp_store\n" if ($rot1 eq 20); $round=$round ."vmovdqa $rot2, $C0 vpaddd $B3, $A3, $A3 vpaddd $B2, $A2, $A2 vpaddd $B1, $A1, $A1 vpaddd $B0, $A0, $A0 vpxor $A3, $D3, $D3 vpxor $A2, $D2, $D2 vpxor $A1, $D1, $D1 vpxor $A0, $D0, $D0 vpshufb $C0, $D3, $D3 vpshufb $C0, $D2, $D2 vpshufb $C0, $D1, $D1 
vpshufb $C0, $D0, $D0 vpaddd $D3, $C3, $C3 vpaddd $D2, $C2, $C2 vpaddd $D1, $C1, $C1 vpaddd $tmp_store, $D0, $C0 vpxor $C3, $B3, $B3 vpxor $C2, $B2, $B2 vpxor $C1, $B1, $B1 vpxor $C0, $B0, $B0 vmovdqa $C0, $tmp_store vpsrld \$$rot1, $B3, $C0 vpslld \$32-$rot1, $B3, $B3 vpxor $C0, $B3, $B3 vpsrld \$$rot1, $B2, $C0 vpslld \$32-$rot1, $B2, $B2 vpxor $C0, $B2, $B2 vpsrld \$$rot1, $B1, $C0 vpslld \$32-$rot1, $B1, $B1 vpxor $C0, $B1, $B1 vpsrld \$$rot1, $B0, $C0 vpslld \$32-$rot1, $B0, $B0 vpxor $C0, $B0, $B0\n"; ($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/); ($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/); $round=$round ."vmovdqa $tmp_store, $C0 vpalignr \$$s1, $B3, $B3, $B3 vpalignr \$$s2, $C3, $C3, $C3 vpalignr \$$s3, $D3, $D3, $D3 vpalignr \$$s1, $B2, $B2, $B2 vpalignr \$$s2, $C2, $C2, $C2 vpalignr \$$s3, $D2, $D2, $D2 vpalignr \$$s1, $B1, $B1, $B1 vpalignr \$$s2, $C1, $C1, $C1 vpalignr \$$s3, $D1, $D1, $D1 vpalignr \$$s1, $B0, $B0, $B0 vpalignr \$$s2, $C0, $C0, $C0 vpalignr \$$s3, $D0, $D0, $D0\n" if (($shift =~ /left/) || ($shift =~ /right/)); return $round; }; $chacha_body = &gen_chacha_round_avx2(20, ".Lrol16(%rip)") . &gen_chacha_round_avx2(25, ".Lrol8(%rip)", "left") . &gen_chacha_round_avx2(20, ".Lrol16(%rip)") . &gen_chacha_round_avx2(25, ".Lrol8(%rip)", "right"); @loop_body = split /\n/, $chacha_body; $code.=" ############################################################################### .globl chacha20_poly1305_open_avx2 .type chacha20_poly1305_open_avx2,\@function,6 .align 64 chacha20_poly1305_open_avx2: .cfi_startproc _CET_ENDBR push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 # We write the calculated authenticator back to keyp at the end, so save # the pointer on the stack too. 
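# The frame reserved below gives 32-byte-aligned scratch at %rbp for the ChaCha
# state, counters and temporaries, plus space to preserve %xmm6-%xmm15 on Windows.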
push $keyp .cfi_push $keyp sub \$288 + $xmm_storage + 32, %rsp .cfi_adjust_cfa_offset 288 + 32 lea 32(%rsp), %rbp and \$-32, %rbp\n"; $code.=" movaps %xmm6,16*0+$xmm_store movaps %xmm7,16*1+$xmm_store movaps %xmm8,16*2+$xmm_store movaps %xmm9,16*3+$xmm_store movaps %xmm10,16*4+$xmm_store movaps %xmm11,16*5+$xmm_store movaps %xmm12,16*6+$xmm_store movaps %xmm13,16*7+$xmm_store movaps %xmm14,16*8+$xmm_store movaps %xmm15,16*9+$xmm_store\n" if ($win64); $code.=" mov %rdx, $inl mov $adl, 0+$len_store mov $inl, 8+$len_store vzeroupper vmovdqa .Lchacha20_consts(%rip), $A0 vbroadcasti128 0*16($keyp), $B0 vbroadcasti128 1*16($keyp), $C0 vbroadcasti128 2*16($keyp), $D0 vpaddd .Lavx2_init(%rip), $D0, $D0 cmp \$6*32, $inl jbe .Lopen_avx2_192 cmp \$10*32, $inl jbe .Lopen_avx2_320 vmovdqa $B0, $state1_store vmovdqa $C0, $state2_store vmovdqa $D0, $ctr0_store mov \$10, $acc0 .Lopen_avx2_init_rounds: \n"; &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.=" dec $acc0 jne .Lopen_avx2_init_rounds vpaddd .Lchacha20_consts(%rip), $A0, $A0 vpaddd $state1_store, $B0, $B0 vpaddd $state2_store, $C0, $C0 vpaddd $ctr0_store, $D0, $D0 vperm2i128 \$0x02, $A0, $B0, $T0 # Clamp and store key vpand .Lclamp(%rip), $T0, $T0 vmovdqa $T0, $r_store # Stream for the first 64 bytes vperm2i128 \$0x13, $A0, $B0, $A0 vperm2i128 \$0x13, $C0, $D0, $B0 # Hash AD + first 64 bytes mov $adl, $itr2 call poly_hash_ad_internal # Hash first 64 bytes xor $itr1, $itr1 .Lopen_avx2_init_hash: \n"; &poly_add("0($inp,$itr1)"); &poly_mul(); $code.=" add \$16, $itr1 cmp \$2*32, $itr1 jne .Lopen_avx2_init_hash # Decrypt first 64 bytes vpxor 0*32($inp), $A0, $A0 vpxor 1*32($inp), $B0, $B0 # Store first 64 bytes of decrypted data vmovdqu $A0, 0*32($oup) vmovdqu $B0, 1*32($oup) lea 2*32($inp), $inp lea 2*32($oup), $oup sub \$2*32, $inl .Lopen_avx2_main_loop: # Hash and decrypt 512 bytes each iteration cmp \$16*32, $inl jb .Lopen_avx2_main_loop_done\n"; &prep_state_avx2(4); $code.=" xor $itr1, $itr1 .Lopen_avx2_main_loop_rounds: \n"; &poly_add("0*8($inp,$itr1)"); &emit_body(10); &poly_stage1_mulx(); &emit_body(9); &poly_stage2_mulx(); &emit_body(12); &poly_stage3_mulx(); &emit_body(10); &poly_reduce_stage(); &emit_body(9); &poly_add("2*8($inp,$itr1)"); &emit_body(8); &poly_stage1_mulx(); &emit_body(18); &poly_stage2_mulx(); &emit_body(18); &poly_stage3_mulx(); &emit_body(9); &poly_reduce_stage(); &emit_body(8); &poly_add("4*8($inp,$itr1)"); $code.=" lea 6*8($itr1), $itr1\n"; &emit_body(18); &poly_stage1_mulx(); &emit_body(8); &poly_stage2_mulx(); &emit_body(8); &poly_stage3_mulx(); &emit_body(18); &poly_reduce_stage(); foreach $l (@loop_body) {$code.=$l."\n";} @loop_body = split /\n/, $chacha_body; $code.=" cmp \$10*6*8, $itr1 jne .Lopen_avx2_main_loop_rounds\n"; &finalize_state_avx2(4); $code.=" vmovdqa $A0, $tmp_store\n"; &poly_add("10*6*8($inp)"); &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.=" vmovdqa $tmp_store, $A0\n"; &poly_mul(); &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3); &poly_add("10*6*8+2*8($inp)"); &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3); &poly_mul(); &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.=" lea 16*32($inp), $inp lea 16*32($oup), $oup sub \$16*32, $inl jmp .Lopen_avx2_main_loop .Lopen_avx2_main_loop_done: test $inl, $inl vzeroupper je .Lopen_sse_finalize cmp \$12*32, $inl ja .Lopen_avx2_tail_512 cmp \$8*32, $inl ja .Lopen_avx2_tail_384 cmp \$4*32, $inl ja .Lopen_avx2_tail_256\n"; 
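# Fewer than 512 bytes remain. Four tail paths follow, chosen by how many 128-byte
# chunks of keystream are still required; each keeps hashing the input while running
# the ChaCha rounds, then falls into .Lopen_avx2_tail_128_xor to XOR out the
# leftover keystream 32 bytes at a time.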
############################################################################### # 1-128 bytes left &prep_state_avx2(1); $code.=" xor $itr2, $itr2 mov $inl, $itr1 and \$-16, $itr1 test $itr1, $itr1 je .Lopen_avx2_tail_128_rounds # Have nothing to hash .Lopen_avx2_tail_128_rounds_and_x1hash: \n"; &poly_add("0*8($inp,$itr2)"); &poly_mul(); $code.=" .Lopen_avx2_tail_128_rounds: add \$16, $itr2\n"; &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.=" cmp $itr1, $itr2 jb .Lopen_avx2_tail_128_rounds_and_x1hash cmp \$160, $itr2 jne .Lopen_avx2_tail_128_rounds\n"; &finalize_state_avx2(1); &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.=" jmp .Lopen_avx2_tail_128_xor ############################################################################### .Lopen_avx2_tail_256: \n"; # 129-256 bytes left &prep_state_avx2(2); $code.=" mov $inl, $tmp_store mov $inl, $itr1 sub \$4*32, $itr1 shr \$4, $itr1 mov \$10, $itr2 cmp \$10, $itr1 cmovg $itr2, $itr1 mov $inp, $inl xor $itr2, $itr2 .Lopen_avx2_tail_256_rounds_and_x1hash: \n"; &poly_add("0*8($inl)"); &poly_mul_mulx(); $code.=" lea 16($inl), $inl .Lopen_avx2_tail_256_rounds: \n"; &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); $code.=" inc $itr2\n"; &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.=" cmp $itr1, $itr2 jb .Lopen_avx2_tail_256_rounds_and_x1hash cmp \$10, $itr2 jne .Lopen_avx2_tail_256_rounds mov $inl, $itr2 sub $inp, $inl mov $inl, $itr1 mov $tmp_store, $inl .Lopen_avx2_tail_256_hash: add \$16, $itr1 cmp $inl, $itr1 jg .Lopen_avx2_tail_256_done\n"; &poly_add("0*8($itr2)"); &poly_mul_mulx(); $code.=" lea 16($itr2), $itr2 jmp .Lopen_avx2_tail_256_hash .Lopen_avx2_tail_256_done: \n"; &finalize_state_avx2(2); &xor_stream_avx2($A1, $B1, $C1, $D1, 0*32, $T0); &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.=" lea 4*32($inp), $inp lea 4*32($oup), $oup sub \$4*32, $inl jmp .Lopen_avx2_tail_128_xor ############################################################################### .Lopen_avx2_tail_384: \n"; # 257-383 bytes left &prep_state_avx2(3); $code.=" mov $inl, $tmp_store mov $inl, $itr1 sub \$8*32, $itr1 shr \$4, $itr1 add \$6, $itr1 mov \$10, $itr2 cmp \$10, $itr1 cmovg $itr2, $itr1 mov $inp, $inl xor $itr2, $itr2 .Lopen_avx2_tail_384_rounds_and_x2hash: \n"; &poly_add("0*8($inl)"); &poly_mul_mulx(); $code.=" lea 16($inl), $inl .Lopen_avx2_tail_384_rounds_and_x1hash: \n"; &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left"); &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); &poly_add("0*8($inl)"); &poly_mul(); $code.=" lea 16($inl), $inl inc $itr2\n"; &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.=" cmp $itr1, $itr2 jb .Lopen_avx2_tail_384_rounds_and_x2hash cmp \$10, $itr2 jne .Lopen_avx2_tail_384_rounds_and_x1hash mov $inl, $itr2 sub $inp, $inl mov $inl, $itr1 mov $tmp_store, $inl .Lopen_avx2_384_tail_hash: add \$16, $itr1 cmp $inl, $itr1 jg .Lopen_avx2_384_tail_done\n"; &poly_add("0*8($itr2)"); &poly_mul_mulx(); $code.=" lea 16($itr2), $itr2 jmp .Lopen_avx2_384_tail_hash .Lopen_avx2_384_tail_done: \n"; &finalize_state_avx2(3); &xor_stream_avx2($A2, $B2, $C2, $D2, 0*32, $T0); &xor_stream_avx2($A1, $B1, $C1, $D1, 4*32, $T0); &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.=" lea 8*32($inp), $inp lea 8*32($oup), $oup sub \$8*32, $inl jmp 
.Lopen_avx2_tail_128_xor ############################################################################### .Lopen_avx2_tail_512: \n"; # 384-512 bytes left &prep_state_avx2(4); $code.=" xor $itr1, $itr1 mov $inp, $itr2 .Lopen_avx2_tail_512_rounds_and_x2hash: \n"; &poly_add("0*8($itr2)"); &poly_mul(); $code.=" lea 2*8($itr2), $itr2 .Lopen_avx2_tail_512_rounds_and_x1hash: \n"; &emit_body(37); &poly_add("0*8($itr2)"); &poly_mul_mulx(); &emit_body(48); &poly_add("2*8($itr2)"); &poly_mul_mulx(); $code.=" lea 4*8($itr2), $itr2\n"; foreach $l (@loop_body) {$code.=$l."\n";} @loop_body = split /\n/, $chacha_body; $code.=" inc $itr1 cmp \$4, $itr1 jl .Lopen_avx2_tail_512_rounds_and_x2hash cmp \$10, $itr1 jne .Lopen_avx2_tail_512_rounds_and_x1hash mov $inl, $itr1 sub \$12*32, $itr1 and \$-16, $itr1 .Lopen_avx2_tail_512_hash: test $itr1, $itr1 je .Lopen_avx2_tail_512_done\n"; &poly_add("0*8($itr2)"); &poly_mul_mulx(); $code.=" lea 2*8($itr2), $itr2 sub \$2*8, $itr1 jmp .Lopen_avx2_tail_512_hash .Lopen_avx2_tail_512_done: \n"; &finalize_state_avx2(4); $code.=" vmovdqa $A0, $tmp_store\n"; &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.=" vmovdqa $tmp_store, $A0\n"; &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3); &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3); &finish_stream_avx2($A0, $B0, $C0, $D0, $A3); $code.=" lea 12*32($inp), $inp lea 12*32($oup), $oup sub \$12*32, $inl .Lopen_avx2_tail_128_xor: cmp \$32, $inl jb .Lopen_avx2_tail_32_xor sub \$32, $inl vpxor ($inp), $A0, $A0 vmovdqu $A0, ($oup) lea 1*32($inp), $inp lea 1*32($oup), $oup vmovdqa $B0, $A0 vmovdqa $C0, $B0 vmovdqa $D0, $C0 jmp .Lopen_avx2_tail_128_xor .Lopen_avx2_tail_32_xor: cmp \$16, $inl vmovdqa $A0x, $A1x jb .Lopen_avx2_exit sub \$16, $inl #load for decryption vpxor ($inp), $A0x, $A1x vmovdqu $A1x, ($oup) lea 1*16($inp), $inp lea 1*16($oup), $oup vperm2i128 \$0x11, $A0, $A0, $A0 vmovdqa $A0x, $A1x .Lopen_avx2_exit: vzeroupper jmp .Lopen_sse_tail_16 ############################################################################### .Lopen_avx2_192: vmovdqa $A0, $A1 vmovdqa $A0, $A2 vmovdqa $B0, $B1 vmovdqa $B0, $B2 vmovdqa $C0, $C1 vmovdqa $C0, $C2 vpaddd .Lavx2_inc(%rip), $D0, $D1 vmovdqa $D0, $T2 vmovdqa $D1, $T3 mov \$10, $acc0 .Lopen_avx2_192_rounds: \n"; &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.=" dec $acc0 jne .Lopen_avx2_192_rounds vpaddd $A2, $A0, $A0 vpaddd $A2, $A1, $A1 vpaddd $B2, $B0, $B0 vpaddd $B2, $B1, $B1 vpaddd $C2, $C0, $C0 vpaddd $C2, $C1, $C1 vpaddd $T2, $D0, $D0 vpaddd $T3, $D1, $D1 vperm2i128 \$0x02, $A0, $B0, $T0 # Clamp and store the key vpand .Lclamp(%rip), $T0, $T0 vmovdqa $T0, $r_store # Stream for up to 192 bytes vperm2i128 \$0x13, $A0, $B0, $A0 vperm2i128 \$0x13, $C0, $D0, $B0 vperm2i128 \$0x02, $A1, $B1, $C0 vperm2i128 \$0x02, $C1, $D1, $D0 vperm2i128 \$0x13, $A1, $B1, $A1 vperm2i128 \$0x13, $C1, $D1, $B1 .Lopen_avx2_short: mov $adl, $itr2 call poly_hash_ad_internal .Lopen_avx2_short_hash_and_xor_loop: cmp \$32, $inl jb .Lopen_avx2_short_tail_32 sub \$32, $inl\n"; # Load + hash &poly_add("0*8($inp)"); &poly_mul(); &poly_add("2*8($inp)"); &poly_mul(); $code.=" # Load + decrypt vpxor ($inp), $A0, $A0 vmovdqu $A0, ($oup) lea 1*32($inp), $inp lea 1*32($oup), $oup # Shift stream vmovdqa $B0, $A0 vmovdqa $C0, $B0 vmovdqa $D0, $C0 vmovdqa $A1, $D0 vmovdqa $B1, $A1 vmovdqa $C1, $B1 vmovdqa $D1, $C1 vmovdqa $A2, $D1 vmovdqa $B2, $A2 jmp 
.Lopen_avx2_short_hash_and_xor_loop .Lopen_avx2_short_tail_32: cmp \$16, $inl vmovdqa $A0x, $A1x jb .Lopen_avx2_short_tail_32_exit sub \$16, $inl\n"; &poly_add("0*8($inp)"); &poly_mul(); $code.=" vpxor ($inp), $A0x, $A3x vmovdqu $A3x, ($oup) lea 1*16($inp), $inp lea 1*16($oup), $oup vextracti128 \$1, $A0, $A1x .Lopen_avx2_short_tail_32_exit: vzeroupper jmp .Lopen_sse_tail_16 ############################################################################### .Lopen_avx2_320: vmovdqa $A0, $A1 vmovdqa $A0, $A2 vmovdqa $B0, $B1 vmovdqa $B0, $B2 vmovdqa $C0, $C1 vmovdqa $C0, $C2 vpaddd .Lavx2_inc(%rip), $D0, $D1 vpaddd .Lavx2_inc(%rip), $D1, $D2 vmovdqa $B0, $T1 vmovdqa $C0, $T2 vmovdqa $D0, $ctr0_store vmovdqa $D1, $ctr1_store vmovdqa $D2, $ctr2_store mov \$10, $acc0 .Lopen_avx2_320_rounds: \n"; &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left"); &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.=" dec $acc0 jne .Lopen_avx2_320_rounds vpaddd .Lchacha20_consts(%rip), $A0, $A0 vpaddd .Lchacha20_consts(%rip), $A1, $A1 vpaddd .Lchacha20_consts(%rip), $A2, $A2 vpaddd $T1, $B0, $B0 vpaddd $T1, $B1, $B1 vpaddd $T1, $B2, $B2 vpaddd $T2, $C0, $C0 vpaddd $T2, $C1, $C1 vpaddd $T2, $C2, $C2 vpaddd $ctr0_store, $D0, $D0 vpaddd $ctr1_store, $D1, $D1 vpaddd $ctr2_store, $D2, $D2 vperm2i128 \$0x02, $A0, $B0, $T0 # Clamp and store the key vpand .Lclamp(%rip), $T0, $T0 vmovdqa $T0, $r_store # Stream for up to 320 bytes vperm2i128 \$0x13, $A0, $B0, $A0 vperm2i128 \$0x13, $C0, $D0, $B0 vperm2i128 \$0x02, $A1, $B1, $C0 vperm2i128 \$0x02, $C1, $D1, $D0 vperm2i128 \$0x13, $A1, $B1, $A1 vperm2i128 \$0x13, $C1, $D1, $B1 vperm2i128 \$0x02, $A2, $B2, $C1 vperm2i128 \$0x02, $C2, $D2, $D1 vperm2i128 \$0x13, $A2, $B2, $A2 vperm2i128 \$0x13, $C2, $D2, $B2 jmp .Lopen_avx2_short .size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2 .cfi_endproc ############################################################################### ############################################################################### .globl chacha20_poly1305_seal_avx2 .type chacha20_poly1305_seal_avx2,\@function,6 .align 64 chacha20_poly1305_seal_avx2: .cfi_startproc _CET_ENDBR push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 # We write the calculated authenticator back to keyp at the end, so save # the pointer on the stack too. 
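# Note that the length block hashed at the end covers the plaintext plus any
# extra_in bytes: extra_in_len is loaded from offset 56 of the data struct below
# and folded into the stored length.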
push $keyp .cfi_push $keyp sub \$288 + $xmm_storage + 32, %rsp .cfi_adjust_cfa_offset 288 + 32 lea 32(%rsp), %rbp and \$-32, %rbp\n"; $code.=" movaps %xmm6,16*0+$xmm_store movaps %xmm7,16*1+$xmm_store movaps %xmm8,16*2+$xmm_store movaps %xmm9,16*3+$xmm_store movaps %xmm10,16*4+$xmm_store movaps %xmm11,16*5+$xmm_store movaps %xmm12,16*6+$xmm_store movaps %xmm13,16*7+$xmm_store movaps %xmm14,16*8+$xmm_store movaps %xmm15,16*9+$xmm_store\n" if ($win64); $code.=" mov 56($keyp), $inl # extra_in_len addq %rdx, $inl mov $adl, 0+$len_store mov $inl, 8+$len_store mov %rdx, $inl vzeroupper vmovdqa .Lchacha20_consts(%rip), $A0 vbroadcasti128 0*16($keyp), $B0 vbroadcasti128 1*16($keyp), $C0 vbroadcasti128 2*16($keyp), $D0 vpaddd .Lavx2_init(%rip), $D0, $D0 cmp \$6*32, $inl jbe .Lseal_avx2_192 cmp \$10*32, $inl jbe .Lseal_avx2_320 vmovdqa $A0, $A1 vmovdqa $A0, $A2 vmovdqa $A0, $A3 vmovdqa $B0, $B1 vmovdqa $B0, $B2 vmovdqa $B0, $B3 vmovdqa $B0, $state1_store vmovdqa $C0, $C1 vmovdqa $C0, $C2 vmovdqa $C0, $C3 vmovdqa $C0, $state2_store vmovdqa $D0, $D3 vpaddd .Lavx2_inc(%rip), $D3, $D2 vpaddd .Lavx2_inc(%rip), $D2, $D1 vpaddd .Lavx2_inc(%rip), $D1, $D0 vmovdqa $D0, $ctr0_store vmovdqa $D1, $ctr1_store vmovdqa $D2, $ctr2_store vmovdqa $D3, $ctr3_store mov \$10, $acc0 .Lseal_avx2_init_rounds: \n"; foreach $l (@loop_body) {$code.=$l."\n";} @loop_body = split /\n/, $chacha_body; $code.=" dec $acc0 jnz .Lseal_avx2_init_rounds\n"; &finalize_state_avx2(4); $code.=" vperm2i128 \$0x13, $C3, $D3, $C3 vperm2i128 \$0x02, $A3, $B3, $D3 vperm2i128 \$0x13, $A3, $B3, $A3 vpand .Lclamp(%rip), $D3, $D3 vmovdqa $D3, $r_store mov $adl, $itr2 call poly_hash_ad_internal # Safely store 320 bytes (otherwise would handle with optimized call) vpxor 0*32($inp), $A3, $A3 vpxor 1*32($inp), $C3, $C3 vmovdqu $A3, 0*32($oup) vmovdqu $C3, 1*32($oup)\n"; &xor_stream_avx2($A2,$B2,$C2,$D2,2*32,$T3); &xor_stream_avx2($A1,$B1,$C1,$D1,6*32,$T3); &finish_stream_avx2($A0,$B0,$C0,$D0,$T3); $code.=" lea 10*32($inp), $inp sub \$10*32, $inl mov \$10*32, $itr1 cmp \$4*32, $inl jbe .Lseal_avx2_short_hash_remainder vpxor 0*32($inp), $A0, $A0 vpxor 1*32($inp), $B0, $B0 vpxor 2*32($inp), $C0, $C0 vpxor 3*32($inp), $D0, $D0 vmovdqu $A0, 10*32($oup) vmovdqu $B0, 11*32($oup) vmovdqu $C0, 12*32($oup) vmovdqu $D0, 13*32($oup) lea 4*32($inp), $inp sub \$4*32, $inl mov \$8, $itr1 mov \$2, $itr2 cmp \$4*32, $inl jbe .Lseal_avx2_tail_128 cmp \$8*32, $inl jbe .Lseal_avx2_tail_256 cmp \$12*32, $inl jbe .Lseal_avx2_tail_384 cmp \$16*32, $inl jbe .Lseal_avx2_tail_512\n"; # We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop &prep_state_avx2(4); foreach $l (@loop_body) {$code.=$l."\n";} @loop_body = split /\n/, $chacha_body; &emit_body(41); @loop_body = split /\n/, $chacha_body; $code.=" sub \$16, $oup mov \$9, $itr1 jmp .Lseal_avx2_main_loop_rounds_entry .align 32 .Lseal_avx2_main_loop: \n"; &prep_state_avx2(4); $code.=" mov \$10, $itr1 .align 32 .Lseal_avx2_main_loop_rounds: \n"; &poly_add("0*8($oup)"); &emit_body(10); &poly_stage1_mulx(); &emit_body(9); &poly_stage2_mulx(); &emit_body(12); &poly_stage3_mulx(); &emit_body(10); &poly_reduce_stage(); $code.=" .Lseal_avx2_main_loop_rounds_entry: \n"; &emit_body(9); &poly_add("2*8($oup)"); &emit_body(8); &poly_stage1_mulx(); &emit_body(18); &poly_stage2_mulx(); &emit_body(18); &poly_stage3_mulx(); &emit_body(9); &poly_reduce_stage(); &emit_body(8); &poly_add("4*8($oup)"); $code.=" lea 6*8($oup), $oup\n"; &emit_body(18); &poly_stage1_mulx(); &emit_body(8); 
&poly_stage2_mulx(); &emit_body(8); &poly_stage3_mulx(); &emit_body(18); &poly_reduce_stage(); foreach $l (@loop_body) {$code.=$l."\n";} @loop_body = split /\n/, $chacha_body; $code.=" dec $itr1 jne .Lseal_avx2_main_loop_rounds\n"; &finalize_state_avx2(4); $code.=" vmovdqa $A0, $tmp_store\n"; &poly_add("0*8($oup)"); &poly_mul_mulx(); &poly_add("2*8($oup)"); &poly_mul_mulx(); $code.=" lea 4*8($oup), $oup\n"; &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.=" vmovdqa $tmp_store, $A0\n"; &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3); &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3); &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.=" lea 16*32($inp), $inp sub \$16*32, $inl cmp \$16*32, $inl jg .Lseal_avx2_main_loop \n"; &poly_add("0*8($oup)"); &poly_mul_mulx(); &poly_add("2*8($oup)"); &poly_mul_mulx(); $code.=" lea 4*8($oup), $oup mov \$10, $itr1 xor $itr2, $itr2 cmp \$12*32, $inl ja .Lseal_avx2_tail_512 cmp \$8*32, $inl ja .Lseal_avx2_tail_384 cmp \$4*32, $inl ja .Lseal_avx2_tail_256 ############################################################################### .Lseal_avx2_tail_128:\n"; &prep_state_avx2(1); $code.=" .Lseal_avx2_tail_128_rounds_and_3xhash: \n"; &poly_add("0($oup)"); &poly_mul_mulx(); $code.=" lea 2*8($oup), $oup .Lseal_avx2_tail_128_rounds_and_2xhash: \n"; &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); &poly_add("0*8($oup)"); &poly_mul_mulx(); &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); &poly_add("2*8($oup)"); &poly_mul_mulx(); $code.=" lea 4*8($oup), $oup dec $itr1 jg .Lseal_avx2_tail_128_rounds_and_3xhash dec $itr2 jge .Lseal_avx2_tail_128_rounds_and_2xhash\n"; &finalize_state_avx2(1); &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.=" jmp .Lseal_avx2_short_loop ############################################################################### .Lseal_avx2_tail_256:\n"; &prep_state_avx2(2); $code.=" .Lseal_avx2_tail_256_rounds_and_3xhash: \n"; &poly_add("0($oup)"); &poly_mul(); $code.=" lea 2*8($oup), $oup .Lseal_avx2_tail_256_rounds_and_2xhash: \n"; &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); &poly_add("0*8($oup)"); &poly_mul(); &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); &poly_add("2*8($oup)"); &poly_mul(); $code.=" lea 4*8($oup), $oup dec $itr1 jg .Lseal_avx2_tail_256_rounds_and_3xhash dec $itr2 jge .Lseal_avx2_tail_256_rounds_and_2xhash\n"; &finalize_state_avx2(2); &xor_stream_avx2($A1,$B1,$C1,$D1,0*32,$T0); &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.=" mov \$4*32, $itr1 lea 4*32($inp), $inp sub \$4*32, $inl jmp .Lseal_avx2_short_hash_remainder ############################################################################### .Lseal_avx2_tail_384:\n"; &prep_state_avx2(3); $code.=" .Lseal_avx2_tail_384_rounds_and_3xhash: \n"; &poly_add("0($oup)"); &poly_mul(); $code.=" lea 2*8($oup), $oup .Lseal_avx2_tail_384_rounds_and_2xhash: \n"; &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); &poly_add("0*8($oup)"); &poly_mul(); &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left"); &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); &poly_add("2*8($oup)"); &poly_mul(); &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.=" lea 4*8($oup), $oup dec $itr1 jg .Lseal_avx2_tail_384_rounds_and_3xhash dec $itr2 jge .Lseal_avx2_tail_384_rounds_and_2xhash\n"; &finalize_state_avx2(3); &xor_stream_avx2($A2,$B2,$C2,$D2,0*32,$T0); &xor_stream_avx2($A1,$B1,$C1,$D1,4*32,$T0); &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); 
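# finish_stream_avx2 only rearranges the remaining keystream into $A0..$D0 (no XOR
# or stores); the short loop that follows consumes it in 32- and then 16-byte steps.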
$code.=" mov \$8*32, $itr1 lea 8*32($inp), $inp sub \$8*32, $inl jmp .Lseal_avx2_short_hash_remainder ############################################################################### .Lseal_avx2_tail_512:\n"; &prep_state_avx2(4); $code.=" .Lseal_avx2_tail_512_rounds_and_3xhash: \n"; &poly_add("0($oup)"); &poly_mul_mulx(); $code.=" lea 2*8($oup), $oup .Lseal_avx2_tail_512_rounds_and_2xhash: \n"; &emit_body(20); &poly_add("0*8($oup)"); &emit_body(20); &poly_stage1_mulx(); &emit_body(20); &poly_stage2_mulx(); &emit_body(20); &poly_stage3_mulx(); &emit_body(20); &poly_reduce_stage(); &emit_body(20); &poly_add("2*8($oup)"); &emit_body(20); &poly_stage1_mulx(); &emit_body(20); &poly_stage2_mulx(); &emit_body(20); &poly_stage3_mulx(); &emit_body(20); &poly_reduce_stage(); foreach $l (@loop_body) {$code.=$l."\n";} @loop_body = split /\n/, $chacha_body; $code.=" lea 4*8($oup), $oup dec $itr1 jg .Lseal_avx2_tail_512_rounds_and_3xhash dec $itr2 jge .Lseal_avx2_tail_512_rounds_and_2xhash\n"; &finalize_state_avx2(4); $code.=" vmovdqa $A0, $tmp_store\n"; &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.=" vmovdqa $tmp_store, $A0\n"; &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3); &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3); &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.=" mov \$12*32, $itr1 lea 12*32($inp), $inp sub \$12*32, $inl jmp .Lseal_avx2_short_hash_remainder ################################################################################ .Lseal_avx2_320: vmovdqa $A0, $A1 vmovdqa $A0, $A2 vmovdqa $B0, $B1 vmovdqa $B0, $B2 vmovdqa $C0, $C1 vmovdqa $C0, $C2 vpaddd .Lavx2_inc(%rip), $D0, $D1 vpaddd .Lavx2_inc(%rip), $D1, $D2 vmovdqa $B0, $T1 vmovdqa $C0, $T2 vmovdqa $D0, $ctr0_store vmovdqa $D1, $ctr1_store vmovdqa $D2, $ctr2_store mov \$10, $acc0 .Lseal_avx2_320_rounds: \n"; &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left"); &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.=" dec $acc0 jne .Lseal_avx2_320_rounds vpaddd .Lchacha20_consts(%rip), $A0, $A0 vpaddd .Lchacha20_consts(%rip), $A1, $A1 vpaddd .Lchacha20_consts(%rip), $A2, $A2 vpaddd $T1, $B0, $B0 vpaddd $T1, $B1, $B1 vpaddd $T1, $B2, $B2 vpaddd $T2, $C0, $C0 vpaddd $T2, $C1, $C1 vpaddd $T2, $C2, $C2 vpaddd $ctr0_store, $D0, $D0 vpaddd $ctr1_store, $D1, $D1 vpaddd $ctr2_store, $D2, $D2 vperm2i128 \$0x02, $A0, $B0, $T0 # Clamp and store the key vpand .Lclamp(%rip), $T0, $T0 vmovdqa $T0, $r_store # Stream for up to 320 bytes vperm2i128 \$0x13, $A0, $B0, $A0 vperm2i128 \$0x13, $C0, $D0, $B0 vperm2i128 \$0x02, $A1, $B1, $C0 vperm2i128 \$0x02, $C1, $D1, $D0 vperm2i128 \$0x13, $A1, $B1, $A1 vperm2i128 \$0x13, $C1, $D1, $B1 vperm2i128 \$0x02, $A2, $B2, $C1 vperm2i128 \$0x02, $C2, $D2, $D1 vperm2i128 \$0x13, $A2, $B2, $A2 vperm2i128 \$0x13, $C2, $D2, $B2 jmp .Lseal_avx2_short ################################################################################ .Lseal_avx2_192: vmovdqa $A0, $A1 vmovdqa $A0, $A2 vmovdqa $B0, $B1 vmovdqa $B0, $B2 vmovdqa $C0, $C1 vmovdqa $C0, $C2 vpaddd .Lavx2_inc(%rip), $D0, $D1 vmovdqa $D0, $T2 vmovdqa $D1, $T3 mov \$10, $acc0 .Lseal_avx2_192_rounds: \n"; &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.=" dec $acc0 jne .Lseal_avx2_192_rounds vpaddd $A2, $A0, $A0 vpaddd $A2, $A1, $A1 vpaddd $B2, 
$B0, $B0 vpaddd $B2, $B1, $B1 vpaddd $C2, $C0, $C0 vpaddd $C2, $C1, $C1 vpaddd $T2, $D0, $D0 vpaddd $T3, $D1, $D1 vperm2i128 \$0x02, $A0, $B0, $T0 # Clamp and store the key vpand .Lclamp(%rip), $T0, $T0 vmovdqa $T0, $r_store # Stream for up to 192 bytes vperm2i128 \$0x13, $A0, $B0, $A0 vperm2i128 \$0x13, $C0, $D0, $B0 vperm2i128 \$0x02, $A1, $B1, $C0 vperm2i128 \$0x02, $C1, $D1, $D0 vperm2i128 \$0x13, $A1, $B1, $A1 vperm2i128 \$0x13, $C1, $D1, $B1 .Lseal_avx2_short: mov $adl, $itr2 call poly_hash_ad_internal xor $itr1, $itr1 .Lseal_avx2_short_hash_remainder: cmp \$16, $itr1 jb .Lseal_avx2_short_loop\n"; &poly_add("0($oup)"); &poly_mul(); $code.=" sub \$16, $itr1 add \$16, $oup jmp .Lseal_avx2_short_hash_remainder .Lseal_avx2_short_loop: cmp \$32, $inl jb .Lseal_avx2_short_tail sub \$32, $inl # Encrypt vpxor ($inp), $A0, $A0 vmovdqu $A0, ($oup) lea 1*32($inp), $inp # Load + hash\n"; &poly_add("0*8($oup)"); &poly_mul(); &poly_add("2*8($oup)"); &poly_mul(); $code.=" lea 1*32($oup), $oup # Shift stream vmovdqa $B0, $A0 vmovdqa $C0, $B0 vmovdqa $D0, $C0 vmovdqa $A1, $D0 vmovdqa $B1, $A1 vmovdqa $C1, $B1 vmovdqa $D1, $C1 vmovdqa $A2, $D1 vmovdqa $B2, $A2 jmp .Lseal_avx2_short_loop .Lseal_avx2_short_tail: cmp \$16, $inl jb .Lseal_avx2_exit sub \$16, $inl vpxor ($inp), $A0x, $A3x vmovdqu $A3x, ($oup) lea 1*16($inp), $inp\n"; &poly_add("0*8($oup)"); &poly_mul(); $code.=" lea 1*16($oup), $oup vextracti128 \$1, $A0, $A0x .Lseal_avx2_exit: vzeroupper jmp .Lseal_sse_tail_16 .cfi_endproc .size chacha20_poly1305_seal_avx2, .-chacha20_poly1305_seal_avx2 "; } $code =~ s/\`([^\`]*)\`/eval $1/gem; print $code; close STDOUT or die "error closing STDOUT: $!"; ring-0.17.14/crypto/constant_time_test.c000064400000000000000000000062541046102023000163360ustar 00000000000000// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
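// This test drives the constant-time primitives over a table of boundary values
// (0, 1, a few small integers, and values around SIZE_MAX/2 and SIZE_MAX): the
// comparison helpers must return CONSTTIME_TRUE_W or CONSTTIME_FALSE_W, and
// constant_time_select_w must return its second or third argument accordingly.
// The wrappers at the bottom exist only so the Rust tests can exercise
// constant_time_conditional_memcpy and constant_time_conditional_memxor.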
#include "internal.h" int bssl_constant_time_test_main(void); static int test_binary_op_w(crypto_word_t (*op)(crypto_word_t a, crypto_word_t b), crypto_word_t a, crypto_word_t b, int is_true) { crypto_word_t c = op(a, b); if (is_true && c != CONSTTIME_TRUE_W) { return 1; } else if (!is_true && c != CONSTTIME_FALSE_W) { return 1; } return 0; } static int test_is_zero_w(crypto_word_t a) { crypto_word_t c = constant_time_is_zero_w(a); if (a == 0 && c != CONSTTIME_TRUE_W) { return 1; } else if (a != 0 && c != CONSTTIME_FALSE_W) { return 1; } c = constant_time_is_nonzero_w(a); if (a == 0 && c != CONSTTIME_FALSE_W) { return 1; } else if (a != 0 && c != CONSTTIME_TRUE_W) { return 1; } return 0; } static int test_select_w(crypto_word_t a, crypto_word_t b) { crypto_word_t selected = constant_time_select_w(CONSTTIME_TRUE_W, a, b); if (selected != a) { return 1; } selected = constant_time_select_w(CONSTTIME_FALSE_W, a, b); if (selected != b) { return 1; } return 0; } static crypto_word_t test_values_w[] = { 0, 1, 1024, 12345, 32000, #if defined(OPENSSL_64_BIT) 0xffffffff / 2 - 1, 0xffffffff / 2, 0xffffffff / 2 + 1, 0xffffffff - 1, 0xffffffff, #endif SIZE_MAX / 2 - 1, SIZE_MAX / 2, SIZE_MAX / 2 + 1, SIZE_MAX - 1, SIZE_MAX }; int bssl_constant_time_test_main(void) { int num_failed = 0; for (size_t i = 0; i < sizeof(test_values_w) / sizeof(test_values_w[0]); ++i) { crypto_word_t a = test_values_w[i]; num_failed += test_is_zero_w(a); for (size_t j = 0; j < sizeof(test_values_w) / sizeof(test_values_w[0]); ++j) { crypto_word_t b = test_values_w[j]; num_failed += test_binary_op_w(&constant_time_eq_w, a, b, a == b); num_failed += test_binary_op_w(&constant_time_eq_w, b, a, b == a); num_failed += test_select_w(a, b); } } return num_failed == 0; } // Exposes `constant_time_conditional_memcpy` to Rust for tests only. void bssl_constant_time_test_conditional_memcpy(uint8_t dst[256], const uint8_t src[256], crypto_word_t b) { constant_time_conditional_memcpy(dst, src, 256, b); } // Exposes `constant_time_conditional_memxor` to Rust for tests only. void bssl_constant_time_test_conditional_memxor(uint8_t dst[256], const uint8_t src[256], crypto_word_t b) { constant_time_conditional_memxor(dst, src, 256, b); } ring-0.17.14/crypto/cpu_intel.c000064400000000000000000000171211046102023000144050ustar 00000000000000// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include #if !defined(OPENSSL_NO_ASM) && (defined(OPENSSL_X86) || defined(OPENSSL_X86_64)) #if defined(_MSC_VER) && !defined(__clang__) #pragma warning(push, 3) #include #include #pragma warning(pop) #endif #include "internal.h" // OPENSSL_cpuid runs the cpuid instruction. |leaf| is passed in as EAX and ECX // is set to zero. It writes EAX, EBX, ECX, and EDX to |*out_eax| through // |*out_edx|. 
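// (With |leaf| == 0 the vendor string is returned in EBX, EDX and ECX; the
// little-endian dwords 0x756e6547, 0x49656e69 and 0x6c65746e spell out
// "GenuineIntel", which is what OPENSSL_cpuid_setup matches against below.)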
static void OPENSSL_cpuid(uint32_t *out_eax, uint32_t *out_ebx, uint32_t *out_ecx, uint32_t *out_edx, uint32_t leaf) { #if defined(_MSC_VER) && !defined(__clang__) int tmp[4]; __cpuid(tmp, (int)leaf); *out_eax = (uint32_t)tmp[0]; *out_ebx = (uint32_t)tmp[1]; *out_ecx = (uint32_t)tmp[2]; *out_edx = (uint32_t)tmp[3]; #elif defined(__pic__) && defined(OPENSSL_32_BIT) // Inline assembly may not clobber the PIC register. For 32-bit, this is EBX. // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602. __asm__ volatile ( "xor %%ecx, %%ecx\n" "mov %%ebx, %%edi\n" "cpuid\n" "xchg %%edi, %%ebx\n" : "=a"(*out_eax), "=D"(*out_ebx), "=c"(*out_ecx), "=d"(*out_edx) : "a"(leaf) ); #else __asm__ volatile ( "xor %%ecx, %%ecx\n" "cpuid\n" : "=a"(*out_eax), "=b"(*out_ebx), "=c"(*out_ecx), "=d"(*out_edx) : "a"(leaf) ); #endif } // OPENSSL_xgetbv returns the value of an Intel Extended Control Register (XCR). // Currently only XCR0 is defined by Intel so |xcr| should always be zero. // // See https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family static uint64_t OPENSSL_xgetbv(uint32_t xcr) { #if defined(_MSC_VER) && !defined(__clang__) return (uint64_t)_xgetbv(xcr); #else uint32_t eax, edx; __asm__ volatile ("xgetbv" : "=a"(eax), "=d"(edx) : "c"(xcr)); return (((uint64_t)edx) << 32) | eax; #endif } void OPENSSL_cpuid_setup(uint32_t OPENSSL_ia32cap_P[4]) { // Determine the vendor and maximum input value. uint32_t eax, ebx, ecx, edx; OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 0); uint32_t num_ids = eax; int is_intel = ebx == 0x756e6547 /* Genu */ && edx == 0x49656e69 /* ineI */ && ecx == 0x6c65746e /* ntel */; uint32_t extended_features[2] = {0}; if (num_ids >= 7) { OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 7); extended_features[0] = ebx; extended_features[1] = ecx; } OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 1); const uint32_t base_family = (eax >> 8) & 15; const uint32_t base_model = (eax >> 4) & 15; uint32_t family = base_family; uint32_t model = base_model; if (base_family == 15) { const uint32_t ext_family = (eax >> 20) & 255; family += ext_family; } if (base_family == 6 || base_family == 15) { const uint32_t ext_model = (eax >> 16) & 15; model |= ext_model << 4; } // Reserved bit #30 is repurposed to signal an Intel CPU. if (is_intel) { edx |= (1u << 30); } else { edx &= ~(1u << 30); } uint64_t xcr0 = 0; if (ecx & (1u << 27)) { // XCR0 may only be queried if the OSXSAVE bit is set. xcr0 = OPENSSL_xgetbv(0); } // See Intel manual, volume 1, section 14.3. if ((xcr0 & 6) != 6) { // YMM registers cannot be used. ecx &= ~(1u << 28); // AVX ecx &= ~(1u << 12); // FMA ecx &= ~(1u << 11); // AMD XOP extended_features[0] &= ~(1u << 5); // AVX2 extended_features[1] &= ~(1u << 9); // VAES extended_features[1] &= ~(1u << 10); // VPCLMULQDQ } // See Intel manual, volume 1, sections 15.2 ("Detection of AVX-512 Foundation // Instructions") through 15.4 ("Detection of Intel AVX-512 Instruction Groups // Operating at 256 and 128-bit Vector Lengths"). if ((xcr0 & 0xe6) != 0xe6) { // Without XCR0.111xx11x, no AVX512 feature can be used. This includes ZMM // registers, masking, SIMD registers 16-31 (even if accessed as YMM or // XMM), and EVEX-coded instructions (even on YMM or XMM). Even if only // XCR0.ZMM_Hi256 is missing, it isn't valid to use AVX512 features on // shorter vectors, since AVX512 ties everything to the availability of // 512-bit vectors. 
See the above-mentioned sections of the Intel manual, // which say that *all* these XCR0 bits must be checked even when just using // 128-bit or 256-bit vectors, and also volume 2a section 2.7.11 ("#UD // Equations for EVEX") which says that all EVEX-coded instructions raise an // undefined-instruction exception if any of these XCR0 bits is zero. // // AVX10 fixes this by reorganizing the features that used to be part of // "AVX512" and allowing them to be used independently of 512-bit support. // TODO: add AVX10 detection. extended_features[0] &= ~(1u << 16); // AVX512F extended_features[0] &= ~(1u << 17); // AVX512DQ extended_features[0] &= ~(1u << 21); // AVX512IFMA extended_features[0] &= ~(1u << 26); // AVX512PF extended_features[0] &= ~(1u << 27); // AVX512ER extended_features[0] &= ~(1u << 28); // AVX512CD extended_features[0] &= ~(1u << 30); // AVX512BW extended_features[0] &= ~(1u << 31); // AVX512VL extended_features[1] &= ~(1u << 1); // AVX512VBMI extended_features[1] &= ~(1u << 6); // AVX512VBMI2 extended_features[1] &= ~(1u << 11); // AVX512VNNI extended_features[1] &= ~(1u << 12); // AVX512BITALG extended_features[1] &= ~(1u << 14); // AVX512VPOPCNTDQ } // Repurpose the bit for the removed MPX feature to indicate when using zmm // registers should be avoided even when they are supported. (When set, AVX512 // features can still be used, but only using ymm or xmm registers.) Skylake // suffered from severe downclocking when zmm registers were used, which // affected unrelated code running on the system, making zmm registers not too // useful outside of benchmarks. The situation improved significantly by Ice // Lake, but a small amount of downclocking remained. (See // https://lore.kernel.org/linux-crypto/e8ce1146-3952-6977-1d0e-a22758e58914@intel.com/) // We take a conservative approach of not allowing zmm registers until after // Ice Lake and Tiger Lake, i.e. until Sapphire Rapids on the server side. // // AMD CPUs, which support AVX512 starting with Zen 4, have not been reported // to have any downclocking problem when zmm registers are used. if (is_intel && family == 6 && (model == 85 || // Skylake, Cascade Lake, Cooper Lake (server) model == 106 || // Ice Lake (server) model == 108 || // Ice Lake (micro server) model == 125 || // Ice Lake (client) model == 126 || // Ice Lake (mobile) model == 140 || // Tiger Lake (mobile) model == 141)) { // Tiger Lake (client) extended_features[0] |= 1u << 14; } else { extended_features[0] &= ~(1u << 14); } OPENSSL_ia32cap_P[0] = edx; OPENSSL_ia32cap_P[1] = ecx; OPENSSL_ia32cap_P[2] = extended_features[0]; OPENSSL_ia32cap_P[3] = extended_features[1]; } #endif // !OPENSSL_NO_ASM && (OPENSSL_X86 || OPENSSL_X86_64) ring-0.17.14/crypto/crypto.c000064400000000000000000000024531046102023000137450ustar 00000000000000// Copyright 2014 The BoringSSL Authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include // Our assembly does not use the GOT to reference symbols, which means // references to visible symbols will often require a TEXTREL. 
This is // undesirable, so all assembly-referenced symbols should be hidden. CPU // capabilities are the only such symbols defined in C. Explicitly hide them, // rather than rely on being built with -fvisibility=hidden. #if defined(OPENSSL_WINDOWS) #define HIDDEN #else #define HIDDEN __attribute__((visibility("hidden"))) #endif #if defined(OPENSSL_X86_64) // These are declared as `AtomicU32` on the Rust side. HIDDEN uint32_t avx2_available = 0; HIDDEN uint32_t adx_bmi2_available = 0; #elif defined(OPENSSL_ARM) HIDDEN uint32_t neon_available = 0; #endif ring-0.17.14/crypto/curve25519/asm/x25519-asm-arm.S000064400000000000000000001207501046102023000171300ustar 00000000000000// Copyright 2015 The BoringSSL Authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. /* This file is taken from crypto_scalarmult/curve25519/neon2/scalarmult.s in * SUPERCOP 20141124 (http://bench.cr.yp.to/supercop.html). That code is public * domain licensed but the standard Apache 2.0 license is included above to keep * licensing simple. */ #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) .fpu neon .text .align 4 .global x25519_NEON .hidden x25519_NEON .type x25519_NEON, %function x25519_NEON: vpush {q4,q5,q6,q7} mov r12,sp sub sp,sp,#736 and sp,sp,#0xffffffe0 strd r4,[sp,#0] strd r6,[sp,#8] strd r8,[sp,#16] strd r10,[sp,#24] str r12,[sp,#480] str r14,[sp,#484] mov r0,r0 mov r1,r1 mov r2,r2 add r3,sp,#32 ldr r4,=0 ldr r5,=254 vmov.i32 q0,#1 vshr.u64 q1,q0,#7 vshr.u64 q0,q0,#8 vmov.i32 d4,#19 vmov.i32 d5,#38 add r6,sp,#512 vst1.8 {d2-d3},[r6,: 128] add r6,sp,#528 vst1.8 {d0-d1},[r6,: 128] add r6,sp,#544 vst1.8 {d4-d5},[r6,: 128] add r6,r3,#0 vmov.i32 q2,#0 vst1.8 {d4-d5},[r6,: 128]! vst1.8 {d4-d5},[r6,: 128]! vst1.8 d4,[r6,: 64] add r6,r3,#0 ldr r7,=960 sub r7,r7,#2 neg r7,r7 sub r7,r7,r7,LSL #7 str r7,[r6] add r6,sp,#704 vld1.8 {d4-d5},[r1]! vld1.8 {d6-d7},[r1] vst1.8 {d4-d5},[r6,: 128]! 
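// The 32 bytes loaded from the second argument (the scalar) are being copied
// to a stack buffer here; the ldrb/strb sequence that follows clamps that
// copy in the usual X25519 way: clear the low three bits of byte 0
// (and #248), clear the top bit of byte 31 (and #127) and set bit 254
// (orr #64).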
vst1.8 {d6-d7},[r6,: 128] sub r1,r6,#16 ldrb r6,[r1] and r6,r6,#248 strb r6,[r1] ldrb r6,[r1,#31] and r6,r6,#127 orr r6,r6,#64 strb r6,[r1,#31] vmov.i64 q2,#0xffffffff vshr.u64 q3,q2,#7 vshr.u64 q2,q2,#6 vld1.8 {d8},[r2] vld1.8 {d10},[r2] add r2,r2,#6 vld1.8 {d12},[r2] vld1.8 {d14},[r2] add r2,r2,#6 vld1.8 {d16},[r2] add r2,r2,#4 vld1.8 {d18},[r2] vld1.8 {d20},[r2] add r2,r2,#6 vld1.8 {d22},[r2] add r2,r2,#2 vld1.8 {d24},[r2] vld1.8 {d26},[r2] vshr.u64 q5,q5,#26 vshr.u64 q6,q6,#3 vshr.u64 q7,q7,#29 vshr.u64 q8,q8,#6 vshr.u64 q10,q10,#25 vshr.u64 q11,q11,#3 vshr.u64 q12,q12,#12 vshr.u64 q13,q13,#38 vand q4,q4,q2 vand q6,q6,q2 vand q8,q8,q2 vand q10,q10,q2 vand q2,q12,q2 vand q5,q5,q3 vand q7,q7,q3 vand q9,q9,q3 vand q11,q11,q3 vand q3,q13,q3 add r2,r3,#48 vadd.i64 q12,q4,q1 vadd.i64 q13,q10,q1 vshr.s64 q12,q12,#26 vshr.s64 q13,q13,#26 vadd.i64 q5,q5,q12 vshl.i64 q12,q12,#26 vadd.i64 q14,q5,q0 vadd.i64 q11,q11,q13 vshl.i64 q13,q13,#26 vadd.i64 q15,q11,q0 vsub.i64 q4,q4,q12 vshr.s64 q12,q14,#25 vsub.i64 q10,q10,q13 vshr.s64 q13,q15,#25 vadd.i64 q6,q6,q12 vshl.i64 q12,q12,#25 vadd.i64 q14,q6,q1 vadd.i64 q2,q2,q13 vsub.i64 q5,q5,q12 vshr.s64 q12,q14,#26 vshl.i64 q13,q13,#25 vadd.i64 q14,q2,q1 vadd.i64 q7,q7,q12 vshl.i64 q12,q12,#26 vadd.i64 q15,q7,q0 vsub.i64 q11,q11,q13 vshr.s64 q13,q14,#26 vsub.i64 q6,q6,q12 vshr.s64 q12,q15,#25 vadd.i64 q3,q3,q13 vshl.i64 q13,q13,#26 vadd.i64 q14,q3,q0 vadd.i64 q8,q8,q12 vshl.i64 q12,q12,#25 vadd.i64 q15,q8,q1 add r2,r2,#8 vsub.i64 q2,q2,q13 vshr.s64 q13,q14,#25 vsub.i64 q7,q7,q12 vshr.s64 q12,q15,#26 vadd.i64 q14,q13,q13 vadd.i64 q9,q9,q12 vtrn.32 d12,d14 vshl.i64 q12,q12,#26 vtrn.32 d13,d15 vadd.i64 q0,q9,q0 vadd.i64 q4,q4,q14 vst1.8 d12,[r2,: 64]! vshl.i64 q6,q13,#4 vsub.i64 q7,q8,q12 vshr.s64 q0,q0,#25 vadd.i64 q4,q4,q6 vadd.i64 q6,q10,q0 vshl.i64 q0,q0,#25 vadd.i64 q8,q6,q1 vadd.i64 q4,q4,q13 vshl.i64 q10,q13,#25 vadd.i64 q1,q4,q1 vsub.i64 q0,q9,q0 vshr.s64 q8,q8,#26 vsub.i64 q3,q3,q10 vtrn.32 d14,d0 vshr.s64 q1,q1,#26 vtrn.32 d15,d1 vadd.i64 q0,q11,q8 vst1.8 d14,[r2,: 64] vshl.i64 q7,q8,#26 vadd.i64 q5,q5,q1 vtrn.32 d4,d6 vshl.i64 q1,q1,#26 vtrn.32 d5,d7 vsub.i64 q3,q6,q7 add r2,r2,#16 vsub.i64 q1,q4,q1 vst1.8 d4,[r2,: 64] vtrn.32 d6,d0 vtrn.32 d7,d1 sub r2,r2,#8 vtrn.32 d2,d10 vtrn.32 d3,d11 vst1.8 d6,[r2,: 64] sub r2,r2,#24 vst1.8 d2,[r2,: 64] add r2,r3,#96 vmov.i32 q0,#0 vmov.i64 d2,#0xff vmov.i64 d3,#0 vshr.u32 q1,q1,#7 vst1.8 {d2-d3},[r2,: 128]! vst1.8 {d0-d1},[r2,: 128]! vst1.8 d0,[r2,: 64] add r2,r3,#144 vmov.i32 q0,#0 vst1.8 {d0-d1},[r2,: 128]! vst1.8 {d0-d1},[r2,: 128]! vst1.8 d0,[r2,: 64] add r2,r3,#240 vmov.i32 q0,#0 vmov.i64 d2,#0xff vmov.i64 d3,#0 vshr.u32 q1,q1,#7 vst1.8 {d2-d3},[r2,: 128]! vst1.8 {d0-d1},[r2,: 128]! vst1.8 d0,[r2,: 64] add r2,r3,#48 add r6,r3,#192 vld1.8 {d0-d1},[r2,: 128]! vld1.8 {d2-d3},[r2,: 128]! vld1.8 {d4},[r2,: 64] vst1.8 {d0-d1},[r6,: 128]! vst1.8 {d2-d3},[r6,: 128]! vst1.8 d4,[r6,: 64] ._mainloop: mov r2,r5,LSR #3 and r6,r5,#7 ldrb r2,[r1,r2] mov r2,r2,LSR r6 and r2,r2,#1 str r5,[sp,#488] eor r4,r4,r2 str r2,[sp,#492] neg r2,r4 add r4,r3,#96 add r5,r3,#192 add r6,r3,#144 vld1.8 {d8-d9},[r4,: 128]! add r7,r3,#240 vld1.8 {d10-d11},[r5,: 128]! veor q6,q4,q5 vld1.8 {d14-d15},[r6,: 128]! vdup.i32 q8,r2 vld1.8 {d18-d19},[r7,: 128]! veor q10,q7,q9 vld1.8 {d22-d23},[r4,: 128]! vand q6,q6,q8 vld1.8 {d24-d25},[r5,: 128]! vand q10,q10,q8 vld1.8 {d26-d27},[r6,: 128]! veor q4,q4,q6 vld1.8 {d28-d29},[r7,: 128]! 
veor q5,q5,q6 vld1.8 {d0},[r4,: 64] veor q6,q7,q10 vld1.8 {d2},[r5,: 64] veor q7,q9,q10 vld1.8 {d4},[r6,: 64] veor q9,q11,q12 vld1.8 {d6},[r7,: 64] veor q10,q0,q1 sub r2,r4,#32 vand q9,q9,q8 sub r4,r5,#32 vand q10,q10,q8 sub r5,r6,#32 veor q11,q11,q9 sub r6,r7,#32 veor q0,q0,q10 veor q9,q12,q9 veor q1,q1,q10 veor q10,q13,q14 veor q12,q2,q3 vand q10,q10,q8 vand q8,q12,q8 veor q12,q13,q10 veor q2,q2,q8 veor q10,q14,q10 veor q3,q3,q8 vadd.i32 q8,q4,q6 vsub.i32 q4,q4,q6 vst1.8 {d16-d17},[r2,: 128]! vadd.i32 q6,q11,q12 vst1.8 {d8-d9},[r5,: 128]! vsub.i32 q4,q11,q12 vst1.8 {d12-d13},[r2,: 128]! vadd.i32 q6,q0,q2 vst1.8 {d8-d9},[r5,: 128]! vsub.i32 q0,q0,q2 vst1.8 d12,[r2,: 64] vadd.i32 q2,q5,q7 vst1.8 d0,[r5,: 64] vsub.i32 q0,q5,q7 vst1.8 {d4-d5},[r4,: 128]! vadd.i32 q2,q9,q10 vst1.8 {d0-d1},[r6,: 128]! vsub.i32 q0,q9,q10 vst1.8 {d4-d5},[r4,: 128]! vadd.i32 q2,q1,q3 vst1.8 {d0-d1},[r6,: 128]! vsub.i32 q0,q1,q3 vst1.8 d4,[r4,: 64] vst1.8 d0,[r6,: 64] add r2,sp,#544 add r4,r3,#96 add r5,r3,#144 vld1.8 {d0-d1},[r2,: 128] vld1.8 {d2-d3},[r4,: 128]! vld1.8 {d4-d5},[r5,: 128]! vzip.i32 q1,q2 vld1.8 {d6-d7},[r4,: 128]! vld1.8 {d8-d9},[r5,: 128]! vshl.i32 q5,q1,#1 vzip.i32 q3,q4 vshl.i32 q6,q2,#1 vld1.8 {d14},[r4,: 64] vshl.i32 q8,q3,#1 vld1.8 {d15},[r5,: 64] vshl.i32 q9,q4,#1 vmul.i32 d21,d7,d1 vtrn.32 d14,d15 vmul.i32 q11,q4,q0 vmul.i32 q0,q7,q0 vmull.s32 q12,d2,d2 vmlal.s32 q12,d11,d1 vmlal.s32 q12,d12,d0 vmlal.s32 q12,d13,d23 vmlal.s32 q12,d16,d22 vmlal.s32 q12,d7,d21 vmull.s32 q10,d2,d11 vmlal.s32 q10,d4,d1 vmlal.s32 q10,d13,d0 vmlal.s32 q10,d6,d23 vmlal.s32 q10,d17,d22 vmull.s32 q13,d10,d4 vmlal.s32 q13,d11,d3 vmlal.s32 q13,d13,d1 vmlal.s32 q13,d16,d0 vmlal.s32 q13,d17,d23 vmlal.s32 q13,d8,d22 vmull.s32 q1,d10,d5 vmlal.s32 q1,d11,d4 vmlal.s32 q1,d6,d1 vmlal.s32 q1,d17,d0 vmlal.s32 q1,d8,d23 vmull.s32 q14,d10,d6 vmlal.s32 q14,d11,d13 vmlal.s32 q14,d4,d4 vmlal.s32 q14,d17,d1 vmlal.s32 q14,d18,d0 vmlal.s32 q14,d9,d23 vmull.s32 q11,d10,d7 vmlal.s32 q11,d11,d6 vmlal.s32 q11,d12,d5 vmlal.s32 q11,d8,d1 vmlal.s32 q11,d19,d0 vmull.s32 q15,d10,d8 vmlal.s32 q15,d11,d17 vmlal.s32 q15,d12,d6 vmlal.s32 q15,d13,d5 vmlal.s32 q15,d19,d1 vmlal.s32 q15,d14,d0 vmull.s32 q2,d10,d9 vmlal.s32 q2,d11,d8 vmlal.s32 q2,d12,d7 vmlal.s32 q2,d13,d6 vmlal.s32 q2,d14,d1 vmull.s32 q0,d15,d1 vmlal.s32 q0,d10,d14 vmlal.s32 q0,d11,d19 vmlal.s32 q0,d12,d8 vmlal.s32 q0,d13,d17 vmlal.s32 q0,d6,d6 add r2,sp,#512 vld1.8 {d18-d19},[r2,: 128] vmull.s32 q3,d16,d7 vmlal.s32 q3,d10,d15 vmlal.s32 q3,d11,d14 vmlal.s32 q3,d12,d9 vmlal.s32 q3,d13,d8 add r2,sp,#528 vld1.8 {d8-d9},[r2,: 128] vadd.i64 q5,q12,q9 vadd.i64 q6,q15,q9 vshr.s64 q5,q5,#26 vshr.s64 q6,q6,#26 vadd.i64 q7,q10,q5 vshl.i64 q5,q5,#26 vadd.i64 q8,q7,q4 vadd.i64 q2,q2,q6 vshl.i64 q6,q6,#26 vadd.i64 q10,q2,q4 vsub.i64 q5,q12,q5 vshr.s64 q8,q8,#25 vsub.i64 q6,q15,q6 vshr.s64 q10,q10,#25 vadd.i64 q12,q13,q8 vshl.i64 q8,q8,#25 vadd.i64 q13,q12,q9 vadd.i64 q0,q0,q10 vsub.i64 q7,q7,q8 vshr.s64 q8,q13,#26 vshl.i64 q10,q10,#25 vadd.i64 q13,q0,q9 vadd.i64 q1,q1,q8 vshl.i64 q8,q8,#26 vadd.i64 q15,q1,q4 vsub.i64 q2,q2,q10 vshr.s64 q10,q13,#26 vsub.i64 q8,q12,q8 vshr.s64 q12,q15,#25 vadd.i64 q3,q3,q10 vshl.i64 q10,q10,#26 vadd.i64 q13,q3,q4 vadd.i64 q14,q14,q12 add r2,r3,#288 vshl.i64 q12,q12,#25 add r4,r3,#336 vadd.i64 q15,q14,q9 add r2,r2,#8 vsub.i64 q0,q0,q10 add r4,r4,#8 vshr.s64 q10,q13,#25 vsub.i64 q1,q1,q12 vshr.s64 q12,q15,#26 vadd.i64 q13,q10,q10 vadd.i64 q11,q11,q12 vtrn.32 d16,d2 vshl.i64 q12,q12,#26 vtrn.32 d17,d3 vadd.i64 q1,q11,q4 vadd.i64 q4,q5,q13 vst1.8 d16,[r2,: 64]! 
vshl.i64 q5,q10,#4 vst1.8 d17,[r4,: 64]! vsub.i64 q8,q14,q12 vshr.s64 q1,q1,#25 vadd.i64 q4,q4,q5 vadd.i64 q5,q6,q1 vshl.i64 q1,q1,#25 vadd.i64 q6,q5,q9 vadd.i64 q4,q4,q10 vshl.i64 q10,q10,#25 vadd.i64 q9,q4,q9 vsub.i64 q1,q11,q1 vshr.s64 q6,q6,#26 vsub.i64 q3,q3,q10 vtrn.32 d16,d2 vshr.s64 q9,q9,#26 vtrn.32 d17,d3 vadd.i64 q1,q2,q6 vst1.8 d16,[r2,: 64] vshl.i64 q2,q6,#26 vst1.8 d17,[r4,: 64] vadd.i64 q6,q7,q9 vtrn.32 d0,d6 vshl.i64 q7,q9,#26 vtrn.32 d1,d7 vsub.i64 q2,q5,q2 add r2,r2,#16 vsub.i64 q3,q4,q7 vst1.8 d0,[r2,: 64] add r4,r4,#16 vst1.8 d1,[r4,: 64] vtrn.32 d4,d2 vtrn.32 d5,d3 sub r2,r2,#8 sub r4,r4,#8 vtrn.32 d6,d12 vtrn.32 d7,d13 vst1.8 d4,[r2,: 64] vst1.8 d5,[r4,: 64] sub r2,r2,#24 sub r4,r4,#24 vst1.8 d6,[r2,: 64] vst1.8 d7,[r4,: 64] add r2,r3,#240 add r4,r3,#96 vld1.8 {d0-d1},[r4,: 128]! vld1.8 {d2-d3},[r4,: 128]! vld1.8 {d4},[r4,: 64] add r4,r3,#144 vld1.8 {d6-d7},[r4,: 128]! vtrn.32 q0,q3 vld1.8 {d8-d9},[r4,: 128]! vshl.i32 q5,q0,#4 vtrn.32 q1,q4 vshl.i32 q6,q3,#4 vadd.i32 q5,q5,q0 vadd.i32 q6,q6,q3 vshl.i32 q7,q1,#4 vld1.8 {d5},[r4,: 64] vshl.i32 q8,q4,#4 vtrn.32 d4,d5 vadd.i32 q7,q7,q1 vadd.i32 q8,q8,q4 vld1.8 {d18-d19},[r2,: 128]! vshl.i32 q10,q2,#4 vld1.8 {d22-d23},[r2,: 128]! vadd.i32 q10,q10,q2 vld1.8 {d24},[r2,: 64] vadd.i32 q5,q5,q0 add r2,r3,#192 vld1.8 {d26-d27},[r2,: 128]! vadd.i32 q6,q6,q3 vld1.8 {d28-d29},[r2,: 128]! vadd.i32 q8,q8,q4 vld1.8 {d25},[r2,: 64] vadd.i32 q10,q10,q2 vtrn.32 q9,q13 vadd.i32 q7,q7,q1 vadd.i32 q5,q5,q0 vtrn.32 q11,q14 vadd.i32 q6,q6,q3 add r2,sp,#560 vadd.i32 q10,q10,q2 vtrn.32 d24,d25 vst1.8 {d12-d13},[r2,: 128] vshl.i32 q6,q13,#1 add r2,sp,#576 vst1.8 {d20-d21},[r2,: 128] vshl.i32 q10,q14,#1 add r2,sp,#592 vst1.8 {d12-d13},[r2,: 128] vshl.i32 q15,q12,#1 vadd.i32 q8,q8,q4 vext.32 d10,d31,d30,#0 vadd.i32 q7,q7,q1 add r2,sp,#608 vst1.8 {d16-d17},[r2,: 128] vmull.s32 q8,d18,d5 vmlal.s32 q8,d26,d4 vmlal.s32 q8,d19,d9 vmlal.s32 q8,d27,d3 vmlal.s32 q8,d22,d8 vmlal.s32 q8,d28,d2 vmlal.s32 q8,d23,d7 vmlal.s32 q8,d29,d1 vmlal.s32 q8,d24,d6 vmlal.s32 q8,d25,d0 add r2,sp,#624 vst1.8 {d14-d15},[r2,: 128] vmull.s32 q2,d18,d4 vmlal.s32 q2,d12,d9 vmlal.s32 q2,d13,d8 vmlal.s32 q2,d19,d3 vmlal.s32 q2,d22,d2 vmlal.s32 q2,d23,d1 vmlal.s32 q2,d24,d0 add r2,sp,#640 vst1.8 {d20-d21},[r2,: 128] vmull.s32 q7,d18,d9 vmlal.s32 q7,d26,d3 vmlal.s32 q7,d19,d8 vmlal.s32 q7,d27,d2 vmlal.s32 q7,d22,d7 vmlal.s32 q7,d28,d1 vmlal.s32 q7,d23,d6 vmlal.s32 q7,d29,d0 add r2,sp,#656 vst1.8 {d10-d11},[r2,: 128] vmull.s32 q5,d18,d3 vmlal.s32 q5,d19,d2 vmlal.s32 q5,d22,d1 vmlal.s32 q5,d23,d0 vmlal.s32 q5,d12,d8 add r2,sp,#672 vst1.8 {d16-d17},[r2,: 128] vmull.s32 q4,d18,d8 vmlal.s32 q4,d26,d2 vmlal.s32 q4,d19,d7 vmlal.s32 q4,d27,d1 vmlal.s32 q4,d22,d6 vmlal.s32 q4,d28,d0 vmull.s32 q8,d18,d7 vmlal.s32 q8,d26,d1 vmlal.s32 q8,d19,d6 vmlal.s32 q8,d27,d0 add r2,sp,#576 vld1.8 {d20-d21},[r2,: 128] vmlal.s32 q7,d24,d21 vmlal.s32 q7,d25,d20 vmlal.s32 q4,d23,d21 vmlal.s32 q4,d29,d20 vmlal.s32 q8,d22,d21 vmlal.s32 q8,d28,d20 vmlal.s32 q5,d24,d20 add r2,sp,#576 vst1.8 {d14-d15},[r2,: 128] vmull.s32 q7,d18,d6 vmlal.s32 q7,d26,d0 add r2,sp,#656 vld1.8 {d30-d31},[r2,: 128] vmlal.s32 q2,d30,d21 vmlal.s32 q7,d19,d21 vmlal.s32 q7,d27,d20 add r2,sp,#624 vld1.8 {d26-d27},[r2,: 128] vmlal.s32 q4,d25,d27 vmlal.s32 q8,d29,d27 vmlal.s32 q8,d25,d26 vmlal.s32 q7,d28,d27 vmlal.s32 q7,d29,d26 add r2,sp,#608 vld1.8 {d28-d29},[r2,: 128] vmlal.s32 q4,d24,d29 vmlal.s32 q8,d23,d29 vmlal.s32 q8,d24,d28 vmlal.s32 q7,d22,d29 vmlal.s32 q7,d23,d28 add r2,sp,#608 vst1.8 {d8-d9},[r2,: 128] add r2,sp,#560 vld1.8 
{d8-d9},[r2,: 128] vmlal.s32 q7,d24,d9 vmlal.s32 q7,d25,d31 vmull.s32 q1,d18,d2 vmlal.s32 q1,d19,d1 vmlal.s32 q1,d22,d0 vmlal.s32 q1,d24,d27 vmlal.s32 q1,d23,d20 vmlal.s32 q1,d12,d7 vmlal.s32 q1,d13,d6 vmull.s32 q6,d18,d1 vmlal.s32 q6,d19,d0 vmlal.s32 q6,d23,d27 vmlal.s32 q6,d22,d20 vmlal.s32 q6,d24,d26 vmull.s32 q0,d18,d0 vmlal.s32 q0,d22,d27 vmlal.s32 q0,d23,d26 vmlal.s32 q0,d24,d31 vmlal.s32 q0,d19,d20 add r2,sp,#640 vld1.8 {d18-d19},[r2,: 128] vmlal.s32 q2,d18,d7 vmlal.s32 q2,d19,d6 vmlal.s32 q5,d18,d6 vmlal.s32 q5,d19,d21 vmlal.s32 q1,d18,d21 vmlal.s32 q1,d19,d29 vmlal.s32 q0,d18,d28 vmlal.s32 q0,d19,d9 vmlal.s32 q6,d18,d29 vmlal.s32 q6,d19,d28 add r2,sp,#592 vld1.8 {d18-d19},[r2,: 128] add r2,sp,#512 vld1.8 {d22-d23},[r2,: 128] vmlal.s32 q5,d19,d7 vmlal.s32 q0,d18,d21 vmlal.s32 q0,d19,d29 vmlal.s32 q6,d18,d6 add r2,sp,#528 vld1.8 {d6-d7},[r2,: 128] vmlal.s32 q6,d19,d21 add r2,sp,#576 vld1.8 {d18-d19},[r2,: 128] vmlal.s32 q0,d30,d8 add r2,sp,#672 vld1.8 {d20-d21},[r2,: 128] vmlal.s32 q5,d30,d29 add r2,sp,#608 vld1.8 {d24-d25},[r2,: 128] vmlal.s32 q1,d30,d28 vadd.i64 q13,q0,q11 vadd.i64 q14,q5,q11 vmlal.s32 q6,d30,d9 vshr.s64 q4,q13,#26 vshr.s64 q13,q14,#26 vadd.i64 q7,q7,q4 vshl.i64 q4,q4,#26 vadd.i64 q14,q7,q3 vadd.i64 q9,q9,q13 vshl.i64 q13,q13,#26 vadd.i64 q15,q9,q3 vsub.i64 q0,q0,q4 vshr.s64 q4,q14,#25 vsub.i64 q5,q5,q13 vshr.s64 q13,q15,#25 vadd.i64 q6,q6,q4 vshl.i64 q4,q4,#25 vadd.i64 q14,q6,q11 vadd.i64 q2,q2,q13 vsub.i64 q4,q7,q4 vshr.s64 q7,q14,#26 vshl.i64 q13,q13,#25 vadd.i64 q14,q2,q11 vadd.i64 q8,q8,q7 vshl.i64 q7,q7,#26 vadd.i64 q15,q8,q3 vsub.i64 q9,q9,q13 vshr.s64 q13,q14,#26 vsub.i64 q6,q6,q7 vshr.s64 q7,q15,#25 vadd.i64 q10,q10,q13 vshl.i64 q13,q13,#26 vadd.i64 q14,q10,q3 vadd.i64 q1,q1,q7 add r2,r3,#144 vshl.i64 q7,q7,#25 add r4,r3,#96 vadd.i64 q15,q1,q11 add r2,r2,#8 vsub.i64 q2,q2,q13 add r4,r4,#8 vshr.s64 q13,q14,#25 vsub.i64 q7,q8,q7 vshr.s64 q8,q15,#26 vadd.i64 q14,q13,q13 vadd.i64 q12,q12,q8 vtrn.32 d12,d14 vshl.i64 q8,q8,#26 vtrn.32 d13,d15 vadd.i64 q3,q12,q3 vadd.i64 q0,q0,q14 vst1.8 d12,[r2,: 64]! vshl.i64 q7,q13,#4 vst1.8 d13,[r4,: 64]! vsub.i64 q1,q1,q8 vshr.s64 q3,q3,#25 vadd.i64 q0,q0,q7 vadd.i64 q5,q5,q3 vshl.i64 q3,q3,#25 vadd.i64 q6,q5,q11 vadd.i64 q0,q0,q13 vshl.i64 q7,q13,#25 vadd.i64 q8,q0,q11 vsub.i64 q3,q12,q3 vshr.s64 q6,q6,#26 vsub.i64 q7,q10,q7 vtrn.32 d2,d6 vshr.s64 q8,q8,#26 vtrn.32 d3,d7 vadd.i64 q3,q9,q6 vst1.8 d2,[r2,: 64] vshl.i64 q6,q6,#26 vst1.8 d3,[r4,: 64] vadd.i64 q1,q4,q8 vtrn.32 d4,d14 vshl.i64 q4,q8,#26 vtrn.32 d5,d15 vsub.i64 q5,q5,q6 add r2,r2,#16 vsub.i64 q0,q0,q4 vst1.8 d4,[r2,: 64] add r4,r4,#16 vst1.8 d5,[r4,: 64] vtrn.32 d10,d6 vtrn.32 d11,d7 sub r2,r2,#8 sub r4,r4,#8 vtrn.32 d0,d2 vtrn.32 d1,d3 vst1.8 d10,[r2,: 64] vst1.8 d11,[r4,: 64] sub r2,r2,#24 sub r4,r4,#24 vst1.8 d0,[r2,: 64] vst1.8 d1,[r4,: 64] add r2,r3,#288 add r4,r3,#336 vld1.8 {d0-d1},[r2,: 128]! vld1.8 {d2-d3},[r4,: 128]! vsub.i32 q0,q0,q1 vld1.8 {d2-d3},[r2,: 128]! vld1.8 {d4-d5},[r4,: 128]! vsub.i32 q1,q1,q2 add r5,r3,#240 vld1.8 {d4},[r2,: 64] vld1.8 {d6},[r4,: 64] vsub.i32 q2,q2,q3 vst1.8 {d0-d1},[r5,: 128]! vst1.8 {d2-d3},[r5,: 128]! vst1.8 d4,[r5,: 64] add r2,r3,#144 add r4,r3,#96 add r5,r3,#144 add r6,r3,#192 vld1.8 {d0-d1},[r2,: 128]! vld1.8 {d2-d3},[r4,: 128]! vsub.i32 q2,q0,q1 vadd.i32 q0,q0,q1 vld1.8 {d2-d3},[r2,: 128]! vld1.8 {d6-d7},[r4,: 128]! vsub.i32 q4,q1,q3 vadd.i32 q1,q1,q3 vld1.8 {d6},[r2,: 64] vld1.8 {d10},[r4,: 64] vsub.i32 q6,q3,q5 vadd.i32 q3,q3,q5 vst1.8 {d4-d5},[r5,: 128]! vst1.8 {d0-d1},[r6,: 128]! vst1.8 {d8-d9},[r5,: 128]! 
vst1.8 {d2-d3},[r6,: 128]! vst1.8 d12,[r5,: 64] vst1.8 d6,[r6,: 64] add r2,r3,#0 add r4,r3,#240 vld1.8 {d0-d1},[r4,: 128]! vld1.8 {d2-d3},[r4,: 128]! vld1.8 {d4},[r4,: 64] add r4,r3,#336 vld1.8 {d6-d7},[r4,: 128]! vtrn.32 q0,q3 vld1.8 {d8-d9},[r4,: 128]! vshl.i32 q5,q0,#4 vtrn.32 q1,q4 vshl.i32 q6,q3,#4 vadd.i32 q5,q5,q0 vadd.i32 q6,q6,q3 vshl.i32 q7,q1,#4 vld1.8 {d5},[r4,: 64] vshl.i32 q8,q4,#4 vtrn.32 d4,d5 vadd.i32 q7,q7,q1 vadd.i32 q8,q8,q4 vld1.8 {d18-d19},[r2,: 128]! vshl.i32 q10,q2,#4 vld1.8 {d22-d23},[r2,: 128]! vadd.i32 q10,q10,q2 vld1.8 {d24},[r2,: 64] vadd.i32 q5,q5,q0 add r2,r3,#288 vld1.8 {d26-d27},[r2,: 128]! vadd.i32 q6,q6,q3 vld1.8 {d28-d29},[r2,: 128]! vadd.i32 q8,q8,q4 vld1.8 {d25},[r2,: 64] vadd.i32 q10,q10,q2 vtrn.32 q9,q13 vadd.i32 q7,q7,q1 vadd.i32 q5,q5,q0 vtrn.32 q11,q14 vadd.i32 q6,q6,q3 add r2,sp,#560 vadd.i32 q10,q10,q2 vtrn.32 d24,d25 vst1.8 {d12-d13},[r2,: 128] vshl.i32 q6,q13,#1 add r2,sp,#576 vst1.8 {d20-d21},[r2,: 128] vshl.i32 q10,q14,#1 add r2,sp,#592 vst1.8 {d12-d13},[r2,: 128] vshl.i32 q15,q12,#1 vadd.i32 q8,q8,q4 vext.32 d10,d31,d30,#0 vadd.i32 q7,q7,q1 add r2,sp,#608 vst1.8 {d16-d17},[r2,: 128] vmull.s32 q8,d18,d5 vmlal.s32 q8,d26,d4 vmlal.s32 q8,d19,d9 vmlal.s32 q8,d27,d3 vmlal.s32 q8,d22,d8 vmlal.s32 q8,d28,d2 vmlal.s32 q8,d23,d7 vmlal.s32 q8,d29,d1 vmlal.s32 q8,d24,d6 vmlal.s32 q8,d25,d0 add r2,sp,#624 vst1.8 {d14-d15},[r2,: 128] vmull.s32 q2,d18,d4 vmlal.s32 q2,d12,d9 vmlal.s32 q2,d13,d8 vmlal.s32 q2,d19,d3 vmlal.s32 q2,d22,d2 vmlal.s32 q2,d23,d1 vmlal.s32 q2,d24,d0 add r2,sp,#640 vst1.8 {d20-d21},[r2,: 128] vmull.s32 q7,d18,d9 vmlal.s32 q7,d26,d3 vmlal.s32 q7,d19,d8 vmlal.s32 q7,d27,d2 vmlal.s32 q7,d22,d7 vmlal.s32 q7,d28,d1 vmlal.s32 q7,d23,d6 vmlal.s32 q7,d29,d0 add r2,sp,#656 vst1.8 {d10-d11},[r2,: 128] vmull.s32 q5,d18,d3 vmlal.s32 q5,d19,d2 vmlal.s32 q5,d22,d1 vmlal.s32 q5,d23,d0 vmlal.s32 q5,d12,d8 add r2,sp,#672 vst1.8 {d16-d17},[r2,: 128] vmull.s32 q4,d18,d8 vmlal.s32 q4,d26,d2 vmlal.s32 q4,d19,d7 vmlal.s32 q4,d27,d1 vmlal.s32 q4,d22,d6 vmlal.s32 q4,d28,d0 vmull.s32 q8,d18,d7 vmlal.s32 q8,d26,d1 vmlal.s32 q8,d19,d6 vmlal.s32 q8,d27,d0 add r2,sp,#576 vld1.8 {d20-d21},[r2,: 128] vmlal.s32 q7,d24,d21 vmlal.s32 q7,d25,d20 vmlal.s32 q4,d23,d21 vmlal.s32 q4,d29,d20 vmlal.s32 q8,d22,d21 vmlal.s32 q8,d28,d20 vmlal.s32 q5,d24,d20 add r2,sp,#576 vst1.8 {d14-d15},[r2,: 128] vmull.s32 q7,d18,d6 vmlal.s32 q7,d26,d0 add r2,sp,#656 vld1.8 {d30-d31},[r2,: 128] vmlal.s32 q2,d30,d21 vmlal.s32 q7,d19,d21 vmlal.s32 q7,d27,d20 add r2,sp,#624 vld1.8 {d26-d27},[r2,: 128] vmlal.s32 q4,d25,d27 vmlal.s32 q8,d29,d27 vmlal.s32 q8,d25,d26 vmlal.s32 q7,d28,d27 vmlal.s32 q7,d29,d26 add r2,sp,#608 vld1.8 {d28-d29},[r2,: 128] vmlal.s32 q4,d24,d29 vmlal.s32 q8,d23,d29 vmlal.s32 q8,d24,d28 vmlal.s32 q7,d22,d29 vmlal.s32 q7,d23,d28 add r2,sp,#608 vst1.8 {d8-d9},[r2,: 128] add r2,sp,#560 vld1.8 {d8-d9},[r2,: 128] vmlal.s32 q7,d24,d9 vmlal.s32 q7,d25,d31 vmull.s32 q1,d18,d2 vmlal.s32 q1,d19,d1 vmlal.s32 q1,d22,d0 vmlal.s32 q1,d24,d27 vmlal.s32 q1,d23,d20 vmlal.s32 q1,d12,d7 vmlal.s32 q1,d13,d6 vmull.s32 q6,d18,d1 vmlal.s32 q6,d19,d0 vmlal.s32 q6,d23,d27 vmlal.s32 q6,d22,d20 vmlal.s32 q6,d24,d26 vmull.s32 q0,d18,d0 vmlal.s32 q0,d22,d27 vmlal.s32 q0,d23,d26 vmlal.s32 q0,d24,d31 vmlal.s32 q0,d19,d20 add r2,sp,#640 vld1.8 {d18-d19},[r2,: 128] vmlal.s32 q2,d18,d7 vmlal.s32 q2,d19,d6 vmlal.s32 q5,d18,d6 vmlal.s32 q5,d19,d21 vmlal.s32 q1,d18,d21 vmlal.s32 q1,d19,d29 vmlal.s32 q0,d18,d28 vmlal.s32 q0,d19,d9 vmlal.s32 q6,d18,d29 vmlal.s32 q6,d19,d28 add r2,sp,#592 vld1.8 
{d18-d19},[r2,: 128] add r2,sp,#512 vld1.8 {d22-d23},[r2,: 128] vmlal.s32 q5,d19,d7 vmlal.s32 q0,d18,d21 vmlal.s32 q0,d19,d29 vmlal.s32 q6,d18,d6 add r2,sp,#528 vld1.8 {d6-d7},[r2,: 128] vmlal.s32 q6,d19,d21 add r2,sp,#576 vld1.8 {d18-d19},[r2,: 128] vmlal.s32 q0,d30,d8 add r2,sp,#672 vld1.8 {d20-d21},[r2,: 128] vmlal.s32 q5,d30,d29 add r2,sp,#608 vld1.8 {d24-d25},[r2,: 128] vmlal.s32 q1,d30,d28 vadd.i64 q13,q0,q11 vadd.i64 q14,q5,q11 vmlal.s32 q6,d30,d9 vshr.s64 q4,q13,#26 vshr.s64 q13,q14,#26 vadd.i64 q7,q7,q4 vshl.i64 q4,q4,#26 vadd.i64 q14,q7,q3 vadd.i64 q9,q9,q13 vshl.i64 q13,q13,#26 vadd.i64 q15,q9,q3 vsub.i64 q0,q0,q4 vshr.s64 q4,q14,#25 vsub.i64 q5,q5,q13 vshr.s64 q13,q15,#25 vadd.i64 q6,q6,q4 vshl.i64 q4,q4,#25 vadd.i64 q14,q6,q11 vadd.i64 q2,q2,q13 vsub.i64 q4,q7,q4 vshr.s64 q7,q14,#26 vshl.i64 q13,q13,#25 vadd.i64 q14,q2,q11 vadd.i64 q8,q8,q7 vshl.i64 q7,q7,#26 vadd.i64 q15,q8,q3 vsub.i64 q9,q9,q13 vshr.s64 q13,q14,#26 vsub.i64 q6,q6,q7 vshr.s64 q7,q15,#25 vadd.i64 q10,q10,q13 vshl.i64 q13,q13,#26 vadd.i64 q14,q10,q3 vadd.i64 q1,q1,q7 add r2,r3,#288 vshl.i64 q7,q7,#25 add r4,r3,#96 vadd.i64 q15,q1,q11 add r2,r2,#8 vsub.i64 q2,q2,q13 add r4,r4,#8 vshr.s64 q13,q14,#25 vsub.i64 q7,q8,q7 vshr.s64 q8,q15,#26 vadd.i64 q14,q13,q13 vadd.i64 q12,q12,q8 vtrn.32 d12,d14 vshl.i64 q8,q8,#26 vtrn.32 d13,d15 vadd.i64 q3,q12,q3 vadd.i64 q0,q0,q14 vst1.8 d12,[r2,: 64]! vshl.i64 q7,q13,#4 vst1.8 d13,[r4,: 64]! vsub.i64 q1,q1,q8 vshr.s64 q3,q3,#25 vadd.i64 q0,q0,q7 vadd.i64 q5,q5,q3 vshl.i64 q3,q3,#25 vadd.i64 q6,q5,q11 vadd.i64 q0,q0,q13 vshl.i64 q7,q13,#25 vadd.i64 q8,q0,q11 vsub.i64 q3,q12,q3 vshr.s64 q6,q6,#26 vsub.i64 q7,q10,q7 vtrn.32 d2,d6 vshr.s64 q8,q8,#26 vtrn.32 d3,d7 vadd.i64 q3,q9,q6 vst1.8 d2,[r2,: 64] vshl.i64 q6,q6,#26 vst1.8 d3,[r4,: 64] vadd.i64 q1,q4,q8 vtrn.32 d4,d14 vshl.i64 q4,q8,#26 vtrn.32 d5,d15 vsub.i64 q5,q5,q6 add r2,r2,#16 vsub.i64 q0,q0,q4 vst1.8 d4,[r2,: 64] add r4,r4,#16 vst1.8 d5,[r4,: 64] vtrn.32 d10,d6 vtrn.32 d11,d7 sub r2,r2,#8 sub r4,r4,#8 vtrn.32 d0,d2 vtrn.32 d1,d3 vst1.8 d10,[r2,: 64] vst1.8 d11,[r4,: 64] sub r2,r2,#24 sub r4,r4,#24 vst1.8 d0,[r2,: 64] vst1.8 d1,[r4,: 64] add r2,sp,#544 add r4,r3,#144 add r5,r3,#192 vld1.8 {d0-d1},[r2,: 128] vld1.8 {d2-d3},[r4,: 128]! vld1.8 {d4-d5},[r5,: 128]! vzip.i32 q1,q2 vld1.8 {d6-d7},[r4,: 128]! vld1.8 {d8-d9},[r5,: 128]! 
vshl.i32 q5,q1,#1 vzip.i32 q3,q4 vshl.i32 q6,q2,#1 vld1.8 {d14},[r4,: 64] vshl.i32 q8,q3,#1 vld1.8 {d15},[r5,: 64] vshl.i32 q9,q4,#1 vmul.i32 d21,d7,d1 vtrn.32 d14,d15 vmul.i32 q11,q4,q0 vmul.i32 q0,q7,q0 vmull.s32 q12,d2,d2 vmlal.s32 q12,d11,d1 vmlal.s32 q12,d12,d0 vmlal.s32 q12,d13,d23 vmlal.s32 q12,d16,d22 vmlal.s32 q12,d7,d21 vmull.s32 q10,d2,d11 vmlal.s32 q10,d4,d1 vmlal.s32 q10,d13,d0 vmlal.s32 q10,d6,d23 vmlal.s32 q10,d17,d22 vmull.s32 q13,d10,d4 vmlal.s32 q13,d11,d3 vmlal.s32 q13,d13,d1 vmlal.s32 q13,d16,d0 vmlal.s32 q13,d17,d23 vmlal.s32 q13,d8,d22 vmull.s32 q1,d10,d5 vmlal.s32 q1,d11,d4 vmlal.s32 q1,d6,d1 vmlal.s32 q1,d17,d0 vmlal.s32 q1,d8,d23 vmull.s32 q14,d10,d6 vmlal.s32 q14,d11,d13 vmlal.s32 q14,d4,d4 vmlal.s32 q14,d17,d1 vmlal.s32 q14,d18,d0 vmlal.s32 q14,d9,d23 vmull.s32 q11,d10,d7 vmlal.s32 q11,d11,d6 vmlal.s32 q11,d12,d5 vmlal.s32 q11,d8,d1 vmlal.s32 q11,d19,d0 vmull.s32 q15,d10,d8 vmlal.s32 q15,d11,d17 vmlal.s32 q15,d12,d6 vmlal.s32 q15,d13,d5 vmlal.s32 q15,d19,d1 vmlal.s32 q15,d14,d0 vmull.s32 q2,d10,d9 vmlal.s32 q2,d11,d8 vmlal.s32 q2,d12,d7 vmlal.s32 q2,d13,d6 vmlal.s32 q2,d14,d1 vmull.s32 q0,d15,d1 vmlal.s32 q0,d10,d14 vmlal.s32 q0,d11,d19 vmlal.s32 q0,d12,d8 vmlal.s32 q0,d13,d17 vmlal.s32 q0,d6,d6 add r2,sp,#512 vld1.8 {d18-d19},[r2,: 128] vmull.s32 q3,d16,d7 vmlal.s32 q3,d10,d15 vmlal.s32 q3,d11,d14 vmlal.s32 q3,d12,d9 vmlal.s32 q3,d13,d8 add r2,sp,#528 vld1.8 {d8-d9},[r2,: 128] vadd.i64 q5,q12,q9 vadd.i64 q6,q15,q9 vshr.s64 q5,q5,#26 vshr.s64 q6,q6,#26 vadd.i64 q7,q10,q5 vshl.i64 q5,q5,#26 vadd.i64 q8,q7,q4 vadd.i64 q2,q2,q6 vshl.i64 q6,q6,#26 vadd.i64 q10,q2,q4 vsub.i64 q5,q12,q5 vshr.s64 q8,q8,#25 vsub.i64 q6,q15,q6 vshr.s64 q10,q10,#25 vadd.i64 q12,q13,q8 vshl.i64 q8,q8,#25 vadd.i64 q13,q12,q9 vadd.i64 q0,q0,q10 vsub.i64 q7,q7,q8 vshr.s64 q8,q13,#26 vshl.i64 q10,q10,#25 vadd.i64 q13,q0,q9 vadd.i64 q1,q1,q8 vshl.i64 q8,q8,#26 vadd.i64 q15,q1,q4 vsub.i64 q2,q2,q10 vshr.s64 q10,q13,#26 vsub.i64 q8,q12,q8 vshr.s64 q12,q15,#25 vadd.i64 q3,q3,q10 vshl.i64 q10,q10,#26 vadd.i64 q13,q3,q4 vadd.i64 q14,q14,q12 add r2,r3,#144 vshl.i64 q12,q12,#25 add r4,r3,#192 vadd.i64 q15,q14,q9 add r2,r2,#8 vsub.i64 q0,q0,q10 add r4,r4,#8 vshr.s64 q10,q13,#25 vsub.i64 q1,q1,q12 vshr.s64 q12,q15,#26 vadd.i64 q13,q10,q10 vadd.i64 q11,q11,q12 vtrn.32 d16,d2 vshl.i64 q12,q12,#26 vtrn.32 d17,d3 vadd.i64 q1,q11,q4 vadd.i64 q4,q5,q13 vst1.8 d16,[r2,: 64]! vshl.i64 q5,q10,#4 vst1.8 d17,[r4,: 64]! vsub.i64 q8,q14,q12 vshr.s64 q1,q1,#25 vadd.i64 q4,q4,q5 vadd.i64 q5,q6,q1 vshl.i64 q1,q1,#25 vadd.i64 q6,q5,q9 vadd.i64 q4,q4,q10 vshl.i64 q10,q10,#25 vadd.i64 q9,q4,q9 vsub.i64 q1,q11,q1 vshr.s64 q6,q6,#26 vsub.i64 q3,q3,q10 vtrn.32 d16,d2 vshr.s64 q9,q9,#26 vtrn.32 d17,d3 vadd.i64 q1,q2,q6 vst1.8 d16,[r2,: 64] vshl.i64 q2,q6,#26 vst1.8 d17,[r4,: 64] vadd.i64 q6,q7,q9 vtrn.32 d0,d6 vshl.i64 q7,q9,#26 vtrn.32 d1,d7 vsub.i64 q2,q5,q2 add r2,r2,#16 vsub.i64 q3,q4,q7 vst1.8 d0,[r2,: 64] add r4,r4,#16 vst1.8 d1,[r4,: 64] vtrn.32 d4,d2 vtrn.32 d5,d3 sub r2,r2,#8 sub r4,r4,#8 vtrn.32 d6,d12 vtrn.32 d7,d13 vst1.8 d4,[r2,: 64] vst1.8 d5,[r4,: 64] sub r2,r2,#24 sub r4,r4,#24 vst1.8 d6,[r2,: 64] vst1.8 d7,[r4,: 64] add r2,r3,#336 add r4,r3,#288 vld1.8 {d0-d1},[r2,: 128]! vld1.8 {d2-d3},[r4,: 128]! vadd.i32 q0,q0,q1 vld1.8 {d2-d3},[r2,: 128]! vld1.8 {d4-d5},[r4,: 128]! vadd.i32 q1,q1,q2 add r5,r3,#288 vld1.8 {d4},[r2,: 64] vld1.8 {d6},[r4,: 64] vadd.i32 q2,q2,q3 vst1.8 {d0-d1},[r5,: 128]! vst1.8 {d2-d3},[r5,: 128]! vst1.8 d4,[r5,: 64] add r2,r3,#48 add r4,r3,#144 vld1.8 {d0-d1},[r4,: 128]! 
vld1.8 {d2-d3},[r4,: 128]! vld1.8 {d4},[r4,: 64] add r4,r3,#288 vld1.8 {d6-d7},[r4,: 128]! vtrn.32 q0,q3 vld1.8 {d8-d9},[r4,: 128]! vshl.i32 q5,q0,#4 vtrn.32 q1,q4 vshl.i32 q6,q3,#4 vadd.i32 q5,q5,q0 vadd.i32 q6,q6,q3 vshl.i32 q7,q1,#4 vld1.8 {d5},[r4,: 64] vshl.i32 q8,q4,#4 vtrn.32 d4,d5 vadd.i32 q7,q7,q1 vadd.i32 q8,q8,q4 vld1.8 {d18-d19},[r2,: 128]! vshl.i32 q10,q2,#4 vld1.8 {d22-d23},[r2,: 128]! vadd.i32 q10,q10,q2 vld1.8 {d24},[r2,: 64] vadd.i32 q5,q5,q0 add r2,r3,#240 vld1.8 {d26-d27},[r2,: 128]! vadd.i32 q6,q6,q3 vld1.8 {d28-d29},[r2,: 128]! vadd.i32 q8,q8,q4 vld1.8 {d25},[r2,: 64] vadd.i32 q10,q10,q2 vtrn.32 q9,q13 vadd.i32 q7,q7,q1 vadd.i32 q5,q5,q0 vtrn.32 q11,q14 vadd.i32 q6,q6,q3 add r2,sp,#560 vadd.i32 q10,q10,q2 vtrn.32 d24,d25 vst1.8 {d12-d13},[r2,: 128] vshl.i32 q6,q13,#1 add r2,sp,#576 vst1.8 {d20-d21},[r2,: 128] vshl.i32 q10,q14,#1 add r2,sp,#592 vst1.8 {d12-d13},[r2,: 128] vshl.i32 q15,q12,#1 vadd.i32 q8,q8,q4 vext.32 d10,d31,d30,#0 vadd.i32 q7,q7,q1 add r2,sp,#608 vst1.8 {d16-d17},[r2,: 128] vmull.s32 q8,d18,d5 vmlal.s32 q8,d26,d4 vmlal.s32 q8,d19,d9 vmlal.s32 q8,d27,d3 vmlal.s32 q8,d22,d8 vmlal.s32 q8,d28,d2 vmlal.s32 q8,d23,d7 vmlal.s32 q8,d29,d1 vmlal.s32 q8,d24,d6 vmlal.s32 q8,d25,d0 add r2,sp,#624 vst1.8 {d14-d15},[r2,: 128] vmull.s32 q2,d18,d4 vmlal.s32 q2,d12,d9 vmlal.s32 q2,d13,d8 vmlal.s32 q2,d19,d3 vmlal.s32 q2,d22,d2 vmlal.s32 q2,d23,d1 vmlal.s32 q2,d24,d0 add r2,sp,#640 vst1.8 {d20-d21},[r2,: 128] vmull.s32 q7,d18,d9 vmlal.s32 q7,d26,d3 vmlal.s32 q7,d19,d8 vmlal.s32 q7,d27,d2 vmlal.s32 q7,d22,d7 vmlal.s32 q7,d28,d1 vmlal.s32 q7,d23,d6 vmlal.s32 q7,d29,d0 add r2,sp,#656 vst1.8 {d10-d11},[r2,: 128] vmull.s32 q5,d18,d3 vmlal.s32 q5,d19,d2 vmlal.s32 q5,d22,d1 vmlal.s32 q5,d23,d0 vmlal.s32 q5,d12,d8 add r2,sp,#672 vst1.8 {d16-d17},[r2,: 128] vmull.s32 q4,d18,d8 vmlal.s32 q4,d26,d2 vmlal.s32 q4,d19,d7 vmlal.s32 q4,d27,d1 vmlal.s32 q4,d22,d6 vmlal.s32 q4,d28,d0 vmull.s32 q8,d18,d7 vmlal.s32 q8,d26,d1 vmlal.s32 q8,d19,d6 vmlal.s32 q8,d27,d0 add r2,sp,#576 vld1.8 {d20-d21},[r2,: 128] vmlal.s32 q7,d24,d21 vmlal.s32 q7,d25,d20 vmlal.s32 q4,d23,d21 vmlal.s32 q4,d29,d20 vmlal.s32 q8,d22,d21 vmlal.s32 q8,d28,d20 vmlal.s32 q5,d24,d20 add r2,sp,#576 vst1.8 {d14-d15},[r2,: 128] vmull.s32 q7,d18,d6 vmlal.s32 q7,d26,d0 add r2,sp,#656 vld1.8 {d30-d31},[r2,: 128] vmlal.s32 q2,d30,d21 vmlal.s32 q7,d19,d21 vmlal.s32 q7,d27,d20 add r2,sp,#624 vld1.8 {d26-d27},[r2,: 128] vmlal.s32 q4,d25,d27 vmlal.s32 q8,d29,d27 vmlal.s32 q8,d25,d26 vmlal.s32 q7,d28,d27 vmlal.s32 q7,d29,d26 add r2,sp,#608 vld1.8 {d28-d29},[r2,: 128] vmlal.s32 q4,d24,d29 vmlal.s32 q8,d23,d29 vmlal.s32 q8,d24,d28 vmlal.s32 q7,d22,d29 vmlal.s32 q7,d23,d28 add r2,sp,#608 vst1.8 {d8-d9},[r2,: 128] add r2,sp,#560 vld1.8 {d8-d9},[r2,: 128] vmlal.s32 q7,d24,d9 vmlal.s32 q7,d25,d31 vmull.s32 q1,d18,d2 vmlal.s32 q1,d19,d1 vmlal.s32 q1,d22,d0 vmlal.s32 q1,d24,d27 vmlal.s32 q1,d23,d20 vmlal.s32 q1,d12,d7 vmlal.s32 q1,d13,d6 vmull.s32 q6,d18,d1 vmlal.s32 q6,d19,d0 vmlal.s32 q6,d23,d27 vmlal.s32 q6,d22,d20 vmlal.s32 q6,d24,d26 vmull.s32 q0,d18,d0 vmlal.s32 q0,d22,d27 vmlal.s32 q0,d23,d26 vmlal.s32 q0,d24,d31 vmlal.s32 q0,d19,d20 add r2,sp,#640 vld1.8 {d18-d19},[r2,: 128] vmlal.s32 q2,d18,d7 vmlal.s32 q2,d19,d6 vmlal.s32 q5,d18,d6 vmlal.s32 q5,d19,d21 vmlal.s32 q1,d18,d21 vmlal.s32 q1,d19,d29 vmlal.s32 q0,d18,d28 vmlal.s32 q0,d19,d9 vmlal.s32 q6,d18,d29 vmlal.s32 q6,d19,d28 add r2,sp,#592 vld1.8 {d18-d19},[r2,: 128] add r2,sp,#512 vld1.8 {d22-d23},[r2,: 128] vmlal.s32 q5,d19,d7 vmlal.s32 q0,d18,d21 vmlal.s32 q0,d19,d29 vmlal.s32 
q6,d18,d6 add r2,sp,#528 vld1.8 {d6-d7},[r2,: 128] vmlal.s32 q6,d19,d21 add r2,sp,#576 vld1.8 {d18-d19},[r2,: 128] vmlal.s32 q0,d30,d8 add r2,sp,#672 vld1.8 {d20-d21},[r2,: 128] vmlal.s32 q5,d30,d29 add r2,sp,#608 vld1.8 {d24-d25},[r2,: 128] vmlal.s32 q1,d30,d28 vadd.i64 q13,q0,q11 vadd.i64 q14,q5,q11 vmlal.s32 q6,d30,d9 vshr.s64 q4,q13,#26 vshr.s64 q13,q14,#26 vadd.i64 q7,q7,q4 vshl.i64 q4,q4,#26 vadd.i64 q14,q7,q3 vadd.i64 q9,q9,q13 vshl.i64 q13,q13,#26 vadd.i64 q15,q9,q3 vsub.i64 q0,q0,q4 vshr.s64 q4,q14,#25 vsub.i64 q5,q5,q13 vshr.s64 q13,q15,#25 vadd.i64 q6,q6,q4 vshl.i64 q4,q4,#25 vadd.i64 q14,q6,q11 vadd.i64 q2,q2,q13 vsub.i64 q4,q7,q4 vshr.s64 q7,q14,#26 vshl.i64 q13,q13,#25 vadd.i64 q14,q2,q11 vadd.i64 q8,q8,q7 vshl.i64 q7,q7,#26 vadd.i64 q15,q8,q3 vsub.i64 q9,q9,q13 vshr.s64 q13,q14,#26 vsub.i64 q6,q6,q7 vshr.s64 q7,q15,#25 vadd.i64 q10,q10,q13 vshl.i64 q13,q13,#26 vadd.i64 q14,q10,q3 vadd.i64 q1,q1,q7 add r2,r3,#240 vshl.i64 q7,q7,#25 add r4,r3,#144 vadd.i64 q15,q1,q11 add r2,r2,#8 vsub.i64 q2,q2,q13 add r4,r4,#8 vshr.s64 q13,q14,#25 vsub.i64 q7,q8,q7 vshr.s64 q8,q15,#26 vadd.i64 q14,q13,q13 vadd.i64 q12,q12,q8 vtrn.32 d12,d14 vshl.i64 q8,q8,#26 vtrn.32 d13,d15 vadd.i64 q3,q12,q3 vadd.i64 q0,q0,q14 vst1.8 d12,[r2,: 64]! vshl.i64 q7,q13,#4 vst1.8 d13,[r4,: 64]! vsub.i64 q1,q1,q8 vshr.s64 q3,q3,#25 vadd.i64 q0,q0,q7 vadd.i64 q5,q5,q3 vshl.i64 q3,q3,#25 vadd.i64 q6,q5,q11 vadd.i64 q0,q0,q13 vshl.i64 q7,q13,#25 vadd.i64 q8,q0,q11 vsub.i64 q3,q12,q3 vshr.s64 q6,q6,#26 vsub.i64 q7,q10,q7 vtrn.32 d2,d6 vshr.s64 q8,q8,#26 vtrn.32 d3,d7 vadd.i64 q3,q9,q6 vst1.8 d2,[r2,: 64] vshl.i64 q6,q6,#26 vst1.8 d3,[r4,: 64] vadd.i64 q1,q4,q8 vtrn.32 d4,d14 vshl.i64 q4,q8,#26 vtrn.32 d5,d15 vsub.i64 q5,q5,q6 add r2,r2,#16 vsub.i64 q0,q0,q4 vst1.8 d4,[r2,: 64] add r4,r4,#16 vst1.8 d5,[r4,: 64] vtrn.32 d10,d6 vtrn.32 d11,d7 sub r2,r2,#8 sub r4,r4,#8 vtrn.32 d0,d2 vtrn.32 d1,d3 vst1.8 d10,[r2,: 64] vst1.8 d11,[r4,: 64] sub r2,r2,#24 sub r4,r4,#24 vst1.8 d0,[r2,: 64] vst1.8 d1,[r4,: 64] ldr r2,[sp,#488] ldr r4,[sp,#492] subs r5,r2,#1 bge ._mainloop add r1,r3,#144 add r2,r3,#336 vld1.8 {d0-d1},[r1,: 128]! vld1.8 {d2-d3},[r1,: 128]! vld1.8 {d4},[r1,: 64] vst1.8 {d0-d1},[r2,: 128]! vst1.8 {d2-d3},[r2,: 128]! vst1.8 d4,[r2,: 64] ldr r1,=0 ._invertloop: add r2,r3,#144 ldr r4,=0 ldr r5,=2 cmp r1,#1 ldreq r5,=1 addeq r2,r3,#336 addeq r4,r3,#48 cmp r1,#2 ldreq r5,=1 addeq r2,r3,#48 cmp r1,#3 ldreq r5,=5 addeq r4,r3,#336 cmp r1,#4 ldreq r5,=10 cmp r1,#5 ldreq r5,=20 cmp r1,#6 ldreq r5,=10 addeq r2,r3,#336 addeq r4,r3,#336 cmp r1,#7 ldreq r5,=50 cmp r1,#8 ldreq r5,=100 cmp r1,#9 ldreq r5,=50 addeq r2,r3,#336 cmp r1,#10 ldreq r5,=5 addeq r2,r3,#48 cmp r1,#11 ldreq r5,=0 addeq r2,r3,#96 add r6,r3,#144 add r7,r3,#288 vld1.8 {d0-d1},[r6,: 128]! vld1.8 {d2-d3},[r6,: 128]! vld1.8 {d4},[r6,: 64] vst1.8 {d0-d1},[r7,: 128]! vst1.8 {d2-d3},[r7,: 128]! vst1.8 d4,[r7,: 64] cmp r5,#0 beq ._skipsquaringloop ._squaringloop: add r6,r3,#288 add r7,r3,#288 add r8,r3,#288 vmov.i32 q0,#19 vmov.i32 q1,#0 vmov.i32 q2,#1 vzip.i32 q1,q2 vld1.8 {d4-d5},[r7,: 128]! vld1.8 {d6-d7},[r7,: 128]! vld1.8 {d9},[r7,: 64] vld1.8 {d10-d11},[r6,: 128]! add r7,sp,#416 vld1.8 {d12-d13},[r6,: 128]! 
vmul.i32 q7,q2,q0 vld1.8 {d8},[r6,: 64] vext.32 d17,d11,d10,#1 vmul.i32 q9,q3,q0 vext.32 d16,d10,d8,#1 vshl.u32 q10,q5,q1 vext.32 d22,d14,d4,#1 vext.32 d24,d18,d6,#1 vshl.u32 q13,q6,q1 vshl.u32 d28,d8,d2 vrev64.i32 d22,d22 vmul.i32 d1,d9,d1 vrev64.i32 d24,d24 vext.32 d29,d8,d13,#1 vext.32 d0,d1,d9,#1 vrev64.i32 d0,d0 vext.32 d2,d9,d1,#1 vext.32 d23,d15,d5,#1 vmull.s32 q4,d20,d4 vrev64.i32 d23,d23 vmlal.s32 q4,d21,d1 vrev64.i32 d2,d2 vmlal.s32 q4,d26,d19 vext.32 d3,d5,d15,#1 vmlal.s32 q4,d27,d18 vrev64.i32 d3,d3 vmlal.s32 q4,d28,d15 vext.32 d14,d12,d11,#1 vmull.s32 q5,d16,d23 vext.32 d15,d13,d12,#1 vmlal.s32 q5,d17,d4 vst1.8 d8,[r7,: 64]! vmlal.s32 q5,d14,d1 vext.32 d12,d9,d8,#0 vmlal.s32 q5,d15,d19 vmov.i64 d13,#0 vmlal.s32 q5,d29,d18 vext.32 d25,d19,d7,#1 vmlal.s32 q6,d20,d5 vrev64.i32 d25,d25 vmlal.s32 q6,d21,d4 vst1.8 d11,[r7,: 64]! vmlal.s32 q6,d26,d1 vext.32 d9,d10,d10,#0 vmlal.s32 q6,d27,d19 vmov.i64 d8,#0 vmlal.s32 q6,d28,d18 vmlal.s32 q4,d16,d24 vmlal.s32 q4,d17,d5 vmlal.s32 q4,d14,d4 vst1.8 d12,[r7,: 64]! vmlal.s32 q4,d15,d1 vext.32 d10,d13,d12,#0 vmlal.s32 q4,d29,d19 vmov.i64 d11,#0 vmlal.s32 q5,d20,d6 vmlal.s32 q5,d21,d5 vmlal.s32 q5,d26,d4 vext.32 d13,d8,d8,#0 vmlal.s32 q5,d27,d1 vmov.i64 d12,#0 vmlal.s32 q5,d28,d19 vst1.8 d9,[r7,: 64]! vmlal.s32 q6,d16,d25 vmlal.s32 q6,d17,d6 vst1.8 d10,[r7,: 64] vmlal.s32 q6,d14,d5 vext.32 d8,d11,d10,#0 vmlal.s32 q6,d15,d4 vmov.i64 d9,#0 vmlal.s32 q6,d29,d1 vmlal.s32 q4,d20,d7 vmlal.s32 q4,d21,d6 vmlal.s32 q4,d26,d5 vext.32 d11,d12,d12,#0 vmlal.s32 q4,d27,d4 vmov.i64 d10,#0 vmlal.s32 q4,d28,d1 vmlal.s32 q5,d16,d0 sub r6,r7,#32 vmlal.s32 q5,d17,d7 vmlal.s32 q5,d14,d6 vext.32 d30,d9,d8,#0 vmlal.s32 q5,d15,d5 vld1.8 {d31},[r6,: 64]! vmlal.s32 q5,d29,d4 vmlal.s32 q15,d20,d0 vext.32 d0,d6,d18,#1 vmlal.s32 q15,d21,d25 vrev64.i32 d0,d0 vmlal.s32 q15,d26,d24 vext.32 d1,d7,d19,#1 vext.32 d7,d10,d10,#0 vmlal.s32 q15,d27,d23 vrev64.i32 d1,d1 vld1.8 {d6},[r6,: 64] vmlal.s32 q15,d28,d22 vmlal.s32 q3,d16,d4 add r6,r6,#24 vmlal.s32 q3,d17,d2 vext.32 d4,d31,d30,#0 vmov d17,d11 vmlal.s32 q3,d14,d1 vext.32 d11,d13,d13,#0 vext.32 d13,d30,d30,#0 vmlal.s32 q3,d15,d0 vext.32 d1,d8,d8,#0 vmlal.s32 q3,d29,d3 vld1.8 {d5},[r6,: 64] sub r6,r6,#16 vext.32 d10,d6,d6,#0 vmov.i32 q1,#0xffffffff vshl.i64 q4,q1,#25 add r7,sp,#512 vld1.8 {d14-d15},[r7,: 128] vadd.i64 q9,q2,q7 vshl.i64 q1,q1,#26 vshr.s64 q10,q9,#26 vld1.8 {d0},[r6,: 64]! vadd.i64 q5,q5,q10 vand q9,q9,q1 vld1.8 {d16},[r6,: 64]! 
add r6,sp,#528 vld1.8 {d20-d21},[r6,: 128] vadd.i64 q11,q5,q10 vsub.i64 q2,q2,q9 vshr.s64 q9,q11,#25 vext.32 d12,d5,d4,#0 vand q11,q11,q4 vadd.i64 q0,q0,q9 vmov d19,d7 vadd.i64 q3,q0,q7 vsub.i64 q5,q5,q11 vshr.s64 q11,q3,#26 vext.32 d18,d11,d10,#0 vand q3,q3,q1 vadd.i64 q8,q8,q11 vadd.i64 q11,q8,q10 vsub.i64 q0,q0,q3 vshr.s64 q3,q11,#25 vand q11,q11,q4 vadd.i64 q3,q6,q3 vadd.i64 q6,q3,q7 vsub.i64 q8,q8,q11 vshr.s64 q11,q6,#26 vand q6,q6,q1 vadd.i64 q9,q9,q11 vadd.i64 d25,d19,d21 vsub.i64 q3,q3,q6 vshr.s64 d23,d25,#25 vand q4,q12,q4 vadd.i64 d21,d23,d23 vshl.i64 d25,d23,#4 vadd.i64 d21,d21,d23 vadd.i64 d25,d25,d21 vadd.i64 d4,d4,d25 vzip.i32 q0,q8 vadd.i64 d12,d4,d14 add r6,r8,#8 vst1.8 d0,[r6,: 64] vsub.i64 d19,d19,d9 add r6,r6,#16 vst1.8 d16,[r6,: 64] vshr.s64 d22,d12,#26 vand q0,q6,q1 vadd.i64 d10,d10,d22 vzip.i32 q3,q9 vsub.i64 d4,d4,d0 sub r6,r6,#8 vst1.8 d6,[r6,: 64] add r6,r6,#16 vst1.8 d18,[r6,: 64] vzip.i32 q2,q5 sub r6,r6,#32 vst1.8 d4,[r6,: 64] subs r5,r5,#1 bhi ._squaringloop ._skipsquaringloop: mov r2,r2 add r5,r3,#288 add r6,r3,#144 vmov.i32 q0,#19 vmov.i32 q1,#0 vmov.i32 q2,#1 vzip.i32 q1,q2 vld1.8 {d4-d5},[r5,: 128]! vld1.8 {d6-d7},[r5,: 128]! vld1.8 {d9},[r5,: 64] vld1.8 {d10-d11},[r2,: 128]! add r5,sp,#416 vld1.8 {d12-d13},[r2,: 128]! vmul.i32 q7,q2,q0 vld1.8 {d8},[r2,: 64] vext.32 d17,d11,d10,#1 vmul.i32 q9,q3,q0 vext.32 d16,d10,d8,#1 vshl.u32 q10,q5,q1 vext.32 d22,d14,d4,#1 vext.32 d24,d18,d6,#1 vshl.u32 q13,q6,q1 vshl.u32 d28,d8,d2 vrev64.i32 d22,d22 vmul.i32 d1,d9,d1 vrev64.i32 d24,d24 vext.32 d29,d8,d13,#1 vext.32 d0,d1,d9,#1 vrev64.i32 d0,d0 vext.32 d2,d9,d1,#1 vext.32 d23,d15,d5,#1 vmull.s32 q4,d20,d4 vrev64.i32 d23,d23 vmlal.s32 q4,d21,d1 vrev64.i32 d2,d2 vmlal.s32 q4,d26,d19 vext.32 d3,d5,d15,#1 vmlal.s32 q4,d27,d18 vrev64.i32 d3,d3 vmlal.s32 q4,d28,d15 vext.32 d14,d12,d11,#1 vmull.s32 q5,d16,d23 vext.32 d15,d13,d12,#1 vmlal.s32 q5,d17,d4 vst1.8 d8,[r5,: 64]! vmlal.s32 q5,d14,d1 vext.32 d12,d9,d8,#0 vmlal.s32 q5,d15,d19 vmov.i64 d13,#0 vmlal.s32 q5,d29,d18 vext.32 d25,d19,d7,#1 vmlal.s32 q6,d20,d5 vrev64.i32 d25,d25 vmlal.s32 q6,d21,d4 vst1.8 d11,[r5,: 64]! vmlal.s32 q6,d26,d1 vext.32 d9,d10,d10,#0 vmlal.s32 q6,d27,d19 vmov.i64 d8,#0 vmlal.s32 q6,d28,d18 vmlal.s32 q4,d16,d24 vmlal.s32 q4,d17,d5 vmlal.s32 q4,d14,d4 vst1.8 d12,[r5,: 64]! vmlal.s32 q4,d15,d1 vext.32 d10,d13,d12,#0 vmlal.s32 q4,d29,d19 vmov.i64 d11,#0 vmlal.s32 q5,d20,d6 vmlal.s32 q5,d21,d5 vmlal.s32 q5,d26,d4 vext.32 d13,d8,d8,#0 vmlal.s32 q5,d27,d1 vmov.i64 d12,#0 vmlal.s32 q5,d28,d19 vst1.8 d9,[r5,: 64]! vmlal.s32 q6,d16,d25 vmlal.s32 q6,d17,d6 vst1.8 d10,[r5,: 64] vmlal.s32 q6,d14,d5 vext.32 d8,d11,d10,#0 vmlal.s32 q6,d15,d4 vmov.i64 d9,#0 vmlal.s32 q6,d29,d1 vmlal.s32 q4,d20,d7 vmlal.s32 q4,d21,d6 vmlal.s32 q4,d26,d5 vext.32 d11,d12,d12,#0 vmlal.s32 q4,d27,d4 vmov.i64 d10,#0 vmlal.s32 q4,d28,d1 vmlal.s32 q5,d16,d0 sub r2,r5,#32 vmlal.s32 q5,d17,d7 vmlal.s32 q5,d14,d6 vext.32 d30,d9,d8,#0 vmlal.s32 q5,d15,d5 vld1.8 {d31},[r2,: 64]! 
vmlal.s32 q5,d29,d4 vmlal.s32 q15,d20,d0 vext.32 d0,d6,d18,#1 vmlal.s32 q15,d21,d25 vrev64.i32 d0,d0 vmlal.s32 q15,d26,d24 vext.32 d1,d7,d19,#1 vext.32 d7,d10,d10,#0 vmlal.s32 q15,d27,d23 vrev64.i32 d1,d1 vld1.8 {d6},[r2,: 64] vmlal.s32 q15,d28,d22 vmlal.s32 q3,d16,d4 add r2,r2,#24 vmlal.s32 q3,d17,d2 vext.32 d4,d31,d30,#0 vmov d17,d11 vmlal.s32 q3,d14,d1 vext.32 d11,d13,d13,#0 vext.32 d13,d30,d30,#0 vmlal.s32 q3,d15,d0 vext.32 d1,d8,d8,#0 vmlal.s32 q3,d29,d3 vld1.8 {d5},[r2,: 64] sub r2,r2,#16 vext.32 d10,d6,d6,#0 vmov.i32 q1,#0xffffffff vshl.i64 q4,q1,#25 add r5,sp,#512 vld1.8 {d14-d15},[r5,: 128] vadd.i64 q9,q2,q7 vshl.i64 q1,q1,#26 vshr.s64 q10,q9,#26 vld1.8 {d0},[r2,: 64]! vadd.i64 q5,q5,q10 vand q9,q9,q1 vld1.8 {d16},[r2,: 64]! add r2,sp,#528 vld1.8 {d20-d21},[r2,: 128] vadd.i64 q11,q5,q10 vsub.i64 q2,q2,q9 vshr.s64 q9,q11,#25 vext.32 d12,d5,d4,#0 vand q11,q11,q4 vadd.i64 q0,q0,q9 vmov d19,d7 vadd.i64 q3,q0,q7 vsub.i64 q5,q5,q11 vshr.s64 q11,q3,#26 vext.32 d18,d11,d10,#0 vand q3,q3,q1 vadd.i64 q8,q8,q11 vadd.i64 q11,q8,q10 vsub.i64 q0,q0,q3 vshr.s64 q3,q11,#25 vand q11,q11,q4 vadd.i64 q3,q6,q3 vadd.i64 q6,q3,q7 vsub.i64 q8,q8,q11 vshr.s64 q11,q6,#26 vand q6,q6,q1 vadd.i64 q9,q9,q11 vadd.i64 d25,d19,d21 vsub.i64 q3,q3,q6 vshr.s64 d23,d25,#25 vand q4,q12,q4 vadd.i64 d21,d23,d23 vshl.i64 d25,d23,#4 vadd.i64 d21,d21,d23 vadd.i64 d25,d25,d21 vadd.i64 d4,d4,d25 vzip.i32 q0,q8 vadd.i64 d12,d4,d14 add r2,r6,#8 vst1.8 d0,[r2,: 64] vsub.i64 d19,d19,d9 add r2,r2,#16 vst1.8 d16,[r2,: 64] vshr.s64 d22,d12,#26 vand q0,q6,q1 vadd.i64 d10,d10,d22 vzip.i32 q3,q9 vsub.i64 d4,d4,d0 sub r2,r2,#8 vst1.8 d6,[r2,: 64] add r2,r2,#16 vst1.8 d18,[r2,: 64] vzip.i32 q2,q5 sub r2,r2,#32 vst1.8 d4,[r2,: 64] cmp r4,#0 beq ._skippostcopy add r2,r3,#144 mov r4,r4 vld1.8 {d0-d1},[r2,: 128]! vld1.8 {d2-d3},[r2,: 128]! vld1.8 {d4},[r2,: 64] vst1.8 {d0-d1},[r4,: 128]! vst1.8 {d2-d3},[r4,: 128]! vst1.8 d4,[r4,: 64] ._skippostcopy: cmp r1,#1 bne ._skipfinalcopy add r2,r3,#288 add r4,r3,#144 vld1.8 {d0-d1},[r2,: 128]! vld1.8 {d2-d3},[r2,: 128]! vld1.8 {d4},[r2,: 64] vst1.8 {d0-d1},[r4,: 128]! vst1.8 {d2-d3},[r4,: 128]! 
vst1.8 d4,[r4,: 64] ._skipfinalcopy: add r1,r1,#1 cmp r1,#12 blo ._invertloop add r1,r3,#144 ldr r2,[r1],#4 ldr r3,[r1],#4 ldr r4,[r1],#4 ldr r5,[r1],#4 ldr r6,[r1],#4 ldr r7,[r1],#4 ldr r8,[r1],#4 ldr r9,[r1],#4 ldr r10,[r1],#4 ldr r1,[r1] add r11,r1,r1,LSL #4 add r11,r11,r1,LSL #1 add r11,r11,#16777216 mov r11,r11,ASR #25 add r11,r11,r2 mov r11,r11,ASR #26 add r11,r11,r3 mov r11,r11,ASR #25 add r11,r11,r4 mov r11,r11,ASR #26 add r11,r11,r5 mov r11,r11,ASR #25 add r11,r11,r6 mov r11,r11,ASR #26 add r11,r11,r7 mov r11,r11,ASR #25 add r11,r11,r8 mov r11,r11,ASR #26 add r11,r11,r9 mov r11,r11,ASR #25 add r11,r11,r10 mov r11,r11,ASR #26 add r11,r11,r1 mov r11,r11,ASR #25 add r2,r2,r11 add r2,r2,r11,LSL #1 add r2,r2,r11,LSL #4 mov r11,r2,ASR #26 add r3,r3,r11 sub r2,r2,r11,LSL #26 mov r11,r3,ASR #25 add r4,r4,r11 sub r3,r3,r11,LSL #25 mov r11,r4,ASR #26 add r5,r5,r11 sub r4,r4,r11,LSL #26 mov r11,r5,ASR #25 add r6,r6,r11 sub r5,r5,r11,LSL #25 mov r11,r6,ASR #26 add r7,r7,r11 sub r6,r6,r11,LSL #26 mov r11,r7,ASR #25 add r8,r8,r11 sub r7,r7,r11,LSL #25 mov r11,r8,ASR #26 add r9,r9,r11 sub r8,r8,r11,LSL #26 mov r11,r9,ASR #25 add r10,r10,r11 sub r9,r9,r11,LSL #25 mov r11,r10,ASR #26 add r1,r1,r11 sub r10,r10,r11,LSL #26 mov r11,r1,ASR #25 sub r1,r1,r11,LSL #25 add r2,r2,r3,LSL #26 mov r3,r3,LSR #6 add r3,r3,r4,LSL #19 mov r4,r4,LSR #13 add r4,r4,r5,LSL #13 mov r5,r5,LSR #19 add r5,r5,r6,LSL #6 add r6,r7,r8,LSL #25 mov r7,r8,LSR #7 add r7,r7,r9,LSL #19 mov r8,r9,LSR #13 add r8,r8,r10,LSL #12 mov r9,r10,LSR #20 add r1,r9,r1,LSL #6 str r2,[r0],#4 str r3,[r0],#4 str r4,[r0],#4 str r5,[r0],#4 str r6,[r0],#4 str r7,[r0],#4 str r8,[r0],#4 str r1,[r0] ldrd r4,[sp,#0] ldrd r6,[sp,#8] ldrd r8,[sp,#16] ldrd r10,[sp,#24] ldr r12,[sp,#480] ldr r14,[sp,#484] ldr r0,=0 mov sp,r12 vpop {q4,q5,q6,q7} bx lr #endif /* !OPENSSL_NO_ASM && OPENSSL_ARM && __ELF__ */ ring-0.17.14/crypto/curve25519/curve25519.c000064400000000000000000001436331046102023000157170ustar 00000000000000// Copyright 2020 The BoringSSL Authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Some of this code is taken from the ref10 version of Ed25519 in SUPERCOP // 20141124 (http://bench.cr.yp.to/supercop.html). That code is released as // public domain. Other parts have been replaced to call into code generated by // Fiat (https://github.com/mit-plv/fiat-crypto) in //third_party/fiat. // // The field functions are shared by Ed25519 and X25519 where possible. #include #include "internal.h" #include "../internal.h" #if defined(_MSC_VER) && !defined(__clang__) // '=': conversion from 'int64_t' to 'int32_t', possible loss of data #pragma warning(disable: 4242) // '=': conversion from 'int32_t' to 'uint8_t', possible loss of data #pragma warning(disable: 4244) #endif #if defined(__GNUC__) || defined(__clang__) #pragma GCC diagnostic ignored "-Wconversion" #pragma GCC diagnostic ignored "-Wsign-conversion" #endif #if defined(__GNUC__) && !defined(__clang__) #pragma GCC diagnostic ignored "-Winline" #endif // Various pre-computed constants. 
#include "./curve25519_tables.h" #if defined(BORINGSSL_HAS_UINT128) #if defined(__GNUC__) #pragma GCC diagnostic ignored "-Wpedantic" #endif #include "../../third_party/fiat/curve25519_64.h" #elif defined(OPENSSL_64_BIT) #include "../../third_party/fiat/curve25519_64_msvc.h" #else #include "../../third_party/fiat/curve25519_32.h" #endif // Low-level intrinsic operations static uint64_t load_3(const uint8_t *in) { uint64_t result; result = (uint64_t)in[0]; result |= ((uint64_t)in[1]) << 8; result |= ((uint64_t)in[2]) << 16; return result; } static uint64_t load_4(const uint8_t *in) { uint64_t result; result = (uint64_t)in[0]; result |= ((uint64_t)in[1]) << 8; result |= ((uint64_t)in[2]) << 16; result |= ((uint64_t)in[3]) << 24; return result; } // Field operations. #if defined(OPENSSL_64_BIT) // assert_fe asserts that |f| satisfies bounds: // // [[0x0 ~> 0x8cccccccccccc], // [0x0 ~> 0x8cccccccccccc], // [0x0 ~> 0x8cccccccccccc], // [0x0 ~> 0x8cccccccccccc], // [0x0 ~> 0x8cccccccccccc]] // // See comments in curve25519_64.h for which functions use these bounds for // inputs or outputs. #define assert_fe(f) \ do { \ for (unsigned _assert_fe_i = 0; _assert_fe_i < 5; _assert_fe_i++) { \ declassify_assert(f[_assert_fe_i] <= UINT64_C(0x8cccccccccccc)); \ } \ } while (0) // assert_fe_loose asserts that |f| satisfies bounds: // // [[0x0 ~> 0x1a666666666664], // [0x0 ~> 0x1a666666666664], // [0x0 ~> 0x1a666666666664], // [0x0 ~> 0x1a666666666664], // [0x0 ~> 0x1a666666666664]] // // See comments in curve25519_64.h for which functions use these bounds for // inputs or outputs. #define assert_fe_loose(f) \ do { \ for (unsigned _assert_fe_i = 0; _assert_fe_i < 5; _assert_fe_i++) { \ declassify_assert(f[_assert_fe_i] <= UINT64_C(0x1a666666666664)); \ } \ } while (0) #else // assert_fe asserts that |f| satisfies bounds: // // [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333], // [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], // [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], // [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], // [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]] // // See comments in curve25519_32.h for which functions use these bounds for // inputs or outputs. #define assert_fe(f) \ do { \ for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \ declassify_assert(f[_assert_fe_i] <= \ ((_assert_fe_i & 1) ? 0x2333333u : 0x4666666u)); \ } \ } while (0) // assert_fe_loose asserts that |f| satisfies bounds: // // [[0x0 ~> 0xd333332], [0x0 ~> 0x6999999], // [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], // [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], // [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], // [0x0 ~> 0xd333332], [0x0 ~> 0x6999999]] // // See comments in curve25519_32.h for which functions use these bounds for // inputs or outputs. #define assert_fe_loose(f) \ do { \ for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \ declassify_assert(f[_assert_fe_i] <= \ ((_assert_fe_i & 1) ? 0x6999999u : 0xd333332u)); \ } \ } while (0) #endif // OPENSSL_64_BIT OPENSSL_STATIC_ASSERT(sizeof(fe) == sizeof(fe_limb_t) * FE_NUM_LIMBS, "fe_limb_t[FE_NUM_LIMBS] is inconsistent with fe"); static void fe_frombytes_strict(fe *h, const uint8_t s[32]) { // |fiat_25519_from_bytes| requires the top-most bit be clear. 
declassify_assert((s[31] & 0x80) == 0); fiat_25519_from_bytes(h->v, s); assert_fe(h->v); } static void fe_frombytes(fe *h, const uint8_t s[32]) { uint8_t s_copy[32]; OPENSSL_memcpy(s_copy, s, 32); s_copy[31] &= 0x7f; fe_frombytes_strict(h, s_copy); } static void fe_tobytes(uint8_t s[32], const fe *f) { assert_fe(f->v); fiat_25519_to_bytes(s, f->v); } // h = 0 static void fe_0(fe *h) { OPENSSL_memset(h, 0, sizeof(fe)); } #if defined(OPENSSL_SMALL) static void fe_loose_0(fe_loose *h) { OPENSSL_memset(h, 0, sizeof(fe_loose)); } #endif // h = 1 static void fe_1(fe *h) { OPENSSL_memset(h, 0, sizeof(fe)); h->v[0] = 1; } #if defined(OPENSSL_SMALL) static void fe_loose_1(fe_loose *h) { OPENSSL_memset(h, 0, sizeof(fe_loose)); h->v[0] = 1; } #endif // h = f + g // Can overlap h with f or g. static void fe_add(fe_loose *h, const fe *f, const fe *g) { assert_fe(f->v); assert_fe(g->v); fiat_25519_add(h->v, f->v, g->v); assert_fe_loose(h->v); } // h = f - g // Can overlap h with f or g. static void fe_sub(fe_loose *h, const fe *f, const fe *g) { assert_fe(f->v); assert_fe(g->v); fiat_25519_sub(h->v, f->v, g->v); assert_fe_loose(h->v); } static void fe_carry(fe *h, const fe_loose* f) { assert_fe_loose(f->v); fiat_25519_carry(h->v, f->v); assert_fe(h->v); } static void fe_mul_impl(fe_limb_t out[FE_NUM_LIMBS], const fe_limb_t in1[FE_NUM_LIMBS], const fe_limb_t in2[FE_NUM_LIMBS]) { assert_fe_loose(in1); assert_fe_loose(in2); fiat_25519_carry_mul(out, in1, in2); assert_fe(out); } static void fe_mul_ltt(fe_loose *h, const fe *f, const fe *g) { fe_mul_impl(h->v, f->v, g->v); } #if defined(OPENSSL_SMALL) static void fe_mul_llt(fe_loose *h, const fe_loose *f, const fe *g) { fe_mul_impl(h->v, f->v, g->v); } #endif static void fe_mul_ttt(fe *h, const fe *f, const fe *g) { fe_mul_impl(h->v, f->v, g->v); } static void fe_mul_tlt(fe *h, const fe_loose *f, const fe *g) { fe_mul_impl(h->v, f->v, g->v); } static void fe_mul_ttl(fe *h, const fe *f, const fe_loose *g) { fe_mul_impl(h->v, f->v, g->v); } static void fe_mul_tll(fe *h, const fe_loose *f, const fe_loose *g) { fe_mul_impl(h->v, f->v, g->v); } static void fe_sq_tl(fe *h, const fe_loose *f) { assert_fe_loose(f->v); fiat_25519_carry_square(h->v, f->v); assert_fe(h->v); } static void fe_sq_tt(fe *h, const fe *f) { assert_fe_loose(f->v); fiat_25519_carry_square(h->v, f->v); assert_fe(h->v); } // Replace (f,g) with (g,f) if b == 1; // replace (f,g) with (f,g) if b == 0. // // Preconditions: b in {0,1}. static void fe_cswap(fe *f, fe *g, fe_limb_t b) { b = 0-b; for (unsigned i = 0; i < FE_NUM_LIMBS; i++) { fe_limb_t x = f->v[i] ^ g->v[i]; x &= b; f->v[i] ^= x; g->v[i] ^= x; } } static void fe_mul121666(fe *h, const fe_loose *f) { assert_fe_loose(f->v); fiat_25519_carry_scmul_121666(h->v, f->v); assert_fe(h->v); } // h = -f static void fe_neg(fe_loose *h, const fe *f) { assert_fe(f->v); fiat_25519_opp(h->v, f->v); assert_fe_loose(h->v); } // Replace (f,g) with (g,g) if b == 1; // replace (f,g) with (f,g) if b == 0. // // Preconditions: b in {0,1}. static void fe_cmov(fe_loose *f, const fe_loose *g, fe_limb_t b) { // TODO(davidben): Switch to fiat's calling convention, or ask fiat to emit a // different one. 
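// Turn b into an all-zero or all-one mask: |fe_limb_t| is unsigned, so 0-b
// is 0 when b == 0 and ~0 when b == 1. XORing (f ^ g) & mask into f either
// leaves it unchanged or overwrites it with g, with no secret-dependent
// branch. |fe_cswap| above uses the same mask but also updates g, turning
// the copy into a swap.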
b = 0-b; for (unsigned i = 0; i < FE_NUM_LIMBS; i++) { fe_limb_t x = f->v[i] ^ g->v[i]; x &= b; f->v[i] ^= x; } } // h = f static void fe_copy(fe *h, const fe *f) { fe_limbs_copy(h->v, f->v); } static void fe_copy_lt(fe_loose *h, const fe *f) { OPENSSL_STATIC_ASSERT(sizeof(fe_loose) == sizeof(fe), "fe and fe_loose mismatch"); fe_limbs_copy(h->v, f->v); } static void fe_loose_invert(fe *out, const fe_loose *z) { fe t0; fe t1; fe t2; fe t3; int i; fe_sq_tl(&t0, z); fe_sq_tt(&t1, &t0); for (i = 1; i < 2; ++i) { fe_sq_tt(&t1, &t1); } fe_mul_tlt(&t1, z, &t1); fe_mul_ttt(&t0, &t0, &t1); fe_sq_tt(&t2, &t0); fe_mul_ttt(&t1, &t1, &t2); fe_sq_tt(&t2, &t1); for (i = 1; i < 5; ++i) { fe_sq_tt(&t2, &t2); } fe_mul_ttt(&t1, &t2, &t1); fe_sq_tt(&t2, &t1); for (i = 1; i < 10; ++i) { fe_sq_tt(&t2, &t2); } fe_mul_ttt(&t2, &t2, &t1); fe_sq_tt(&t3, &t2); for (i = 1; i < 20; ++i) { fe_sq_tt(&t3, &t3); } fe_mul_ttt(&t2, &t3, &t2); fe_sq_tt(&t2, &t2); for (i = 1; i < 10; ++i) { fe_sq_tt(&t2, &t2); } fe_mul_ttt(&t1, &t2, &t1); fe_sq_tt(&t2, &t1); for (i = 1; i < 50; ++i) { fe_sq_tt(&t2, &t2); } fe_mul_ttt(&t2, &t2, &t1); fe_sq_tt(&t3, &t2); for (i = 1; i < 100; ++i) { fe_sq_tt(&t3, &t3); } fe_mul_ttt(&t2, &t3, &t2); fe_sq_tt(&t2, &t2); for (i = 1; i < 50; ++i) { fe_sq_tt(&t2, &t2); } fe_mul_ttt(&t1, &t2, &t1); fe_sq_tt(&t1, &t1); for (i = 1; i < 5; ++i) { fe_sq_tt(&t1, &t1); } fe_mul_ttt(out, &t1, &t0); } static void fe_invert(fe *out, const fe *z) { fe_loose l; fe_copy_lt(&l, z); fe_loose_invert(out, &l); } // return 0 if f == 0 // return 1 if f != 0 static int fe_isnonzero(const fe_loose *f) { fe tight; fe_carry(&tight, f); uint8_t s[32]; fe_tobytes(s, &tight); static const uint8_t zero[32] = {0}; return CRYPTO_memcmp(s, zero, sizeof(zero)) != 0; } // return 1 if f is in {1,3,5,...,q-2} // return 0 if f is in {0,2,4,...,q-1} static int fe_isnegative(const fe *f) { uint8_t s[32]; fe_tobytes(s, f); return s[0] & 1; } static void fe_sq2_tt(fe *h, const fe *f) { // h = f^2 fe_sq_tt(h, f); // h = h + h fe_loose tmp; fe_add(&tmp, h, h); fe_carry(h, &tmp); } static void fe_pow22523(fe *out, const fe *z) { fe t0; fe t1; fe t2; int i; fe_sq_tt(&t0, z); fe_sq_tt(&t1, &t0); for (i = 1; i < 2; ++i) { fe_sq_tt(&t1, &t1); } fe_mul_ttt(&t1, z, &t1); fe_mul_ttt(&t0, &t0, &t1); fe_sq_tt(&t0, &t0); fe_mul_ttt(&t0, &t1, &t0); fe_sq_tt(&t1, &t0); for (i = 1; i < 5; ++i) { fe_sq_tt(&t1, &t1); } fe_mul_ttt(&t0, &t1, &t0); fe_sq_tt(&t1, &t0); for (i = 1; i < 10; ++i) { fe_sq_tt(&t1, &t1); } fe_mul_ttt(&t1, &t1, &t0); fe_sq_tt(&t2, &t1); for (i = 1; i < 20; ++i) { fe_sq_tt(&t2, &t2); } fe_mul_ttt(&t1, &t2, &t1); fe_sq_tt(&t1, &t1); for (i = 1; i < 10; ++i) { fe_sq_tt(&t1, &t1); } fe_mul_ttt(&t0, &t1, &t0); fe_sq_tt(&t1, &t0); for (i = 1; i < 50; ++i) { fe_sq_tt(&t1, &t1); } fe_mul_ttt(&t1, &t1, &t0); fe_sq_tt(&t2, &t1); for (i = 1; i < 100; ++i) { fe_sq_tt(&t2, &t2); } fe_mul_ttt(&t1, &t2, &t1); fe_sq_tt(&t1, &t1); for (i = 1; i < 50; ++i) { fe_sq_tt(&t1, &t1); } fe_mul_ttt(&t0, &t1, &t0); fe_sq_tt(&t0, &t0); for (i = 1; i < 2; ++i) { fe_sq_tt(&t0, &t0); } fe_mul_ttt(out, &t0, z); } // Group operations. 
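// The ge_* types follow the ref10 representations: ge_p2 is projective
// (X:Y:Z) with x = X/Z, y = Y/Z; ge_p3 is extended (X:Y:Z:T), additionally
// keeping T with X*Y = Z*T; ge_p1p1 is the completed form ((X:Z),(Y:T)) with
// x = X/Z, y = Y/T; ge_precomp stores an affine point as (y+x, y-x, 2*d*x*y);
// and ge_cached stores (Y+X, Y-X, Z, 2*d*T), matching what
// |x25519_ge_p3_to_cached| computes.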
int x25519_ge_frombytes_vartime(ge_p3 *h, const uint8_t s[32]) { fe u; fe_loose v; fe w; fe vxx; fe_loose check; fe_frombytes(&h->Y, s); fe_1(&h->Z); fe_sq_tt(&w, &h->Y); fe_mul_ttt(&vxx, &w, &d); fe_sub(&v, &w, &h->Z); // u = y^2-1 fe_carry(&u, &v); fe_add(&v, &vxx, &h->Z); // v = dy^2+1 fe_mul_ttl(&w, &u, &v); // w = u*v fe_pow22523(&h->X, &w); // x = w^((q-5)/8) fe_mul_ttt(&h->X, &h->X, &u); // x = u*w^((q-5)/8) fe_sq_tt(&vxx, &h->X); fe_mul_ttl(&vxx, &vxx, &v); fe_sub(&check, &vxx, &u); if (fe_isnonzero(&check)) { fe_add(&check, &vxx, &u); if (fe_isnonzero(&check)) { return 0; } fe_mul_ttt(&h->X, &h->X, &sqrtm1); } if (fe_isnegative(&h->X) != (s[31] >> 7)) { fe_loose t; fe_neg(&t, &h->X); fe_carry(&h->X, &t); } fe_mul_ttt(&h->T, &h->X, &h->Y); return 1; } static void ge_p2_0(ge_p2 *h) { fe_0(&h->X); fe_1(&h->Y); fe_1(&h->Z); } static void ge_p3_0(ge_p3 *h) { fe_0(&h->X); fe_1(&h->Y); fe_1(&h->Z); fe_0(&h->T); } #if defined(OPENSSL_SMALL) static void ge_precomp_0(ge_precomp *h) { fe_loose_1(&h->yplusx); fe_loose_1(&h->yminusx); fe_loose_0(&h->xy2d); } #endif // r = p static void ge_p3_to_p2(ge_p2 *r, const ge_p3 *p) { fe_copy(&r->X, &p->X); fe_copy(&r->Y, &p->Y); fe_copy(&r->Z, &p->Z); } // r = p static void x25519_ge_p3_to_cached(ge_cached *r, const ge_p3 *p) { fe_add(&r->YplusX, &p->Y, &p->X); fe_sub(&r->YminusX, &p->Y, &p->X); fe_copy_lt(&r->Z, &p->Z); fe_mul_ltt(&r->T2d, &p->T, &d2); } // r = p static void x25519_ge_p1p1_to_p2(ge_p2 *r, const ge_p1p1 *p) { fe_mul_tll(&r->X, &p->X, &p->T); fe_mul_tll(&r->Y, &p->Y, &p->Z); fe_mul_tll(&r->Z, &p->Z, &p->T); } // r = p static void x25519_ge_p1p1_to_p3(ge_p3 *r, const ge_p1p1 *p) { fe_mul_tll(&r->X, &p->X, &p->T); fe_mul_tll(&r->Y, &p->Y, &p->Z); fe_mul_tll(&r->Z, &p->Z, &p->T); fe_mul_tll(&r->T, &p->X, &p->Y); } // r = 2 * p static void ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p) { fe trX, trZ, trT; fe t0; fe_sq_tt(&trX, &p->X); fe_sq_tt(&trZ, &p->Y); fe_sq2_tt(&trT, &p->Z); fe_add(&r->Y, &p->X, &p->Y); fe_sq_tl(&t0, &r->Y); fe_add(&r->Y, &trZ, &trX); fe_sub(&r->Z, &trZ, &trX); fe_carry(&trZ, &r->Y); fe_sub(&r->X, &t0, &trZ); fe_carry(&trZ, &r->Z); fe_sub(&r->T, &trT, &trZ); } // r = 2 * p static void ge_p3_dbl(ge_p1p1 *r, const ge_p3 *p) { ge_p2 q; ge_p3_to_p2(&q, p); ge_p2_dbl(r, &q); } // r = p + q static void ge_madd(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) { fe trY, trZ, trT; fe_add(&r->X, &p->Y, &p->X); fe_sub(&r->Y, &p->Y, &p->X); fe_mul_tll(&trZ, &r->X, &q->yplusx); fe_mul_tll(&trY, &r->Y, &q->yminusx); fe_mul_tlt(&trT, &q->xy2d, &p->T); fe_add(&r->T, &p->Z, &p->Z); fe_sub(&r->X, &trZ, &trY); fe_add(&r->Y, &trZ, &trY); fe_carry(&trZ, &r->T); fe_add(&r->Z, &trZ, &trT); fe_sub(&r->T, &trZ, &trT); } // r = p - q static void ge_msub(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) { fe trY, trZ, trT; fe_add(&r->X, &p->Y, &p->X); fe_sub(&r->Y, &p->Y, &p->X); fe_mul_tll(&trZ, &r->X, &q->yminusx); fe_mul_tll(&trY, &r->Y, &q->yplusx); fe_mul_tlt(&trT, &q->xy2d, &p->T); fe_add(&r->T, &p->Z, &p->Z); fe_sub(&r->X, &trZ, &trY); fe_add(&r->Y, &trZ, &trY); fe_carry(&trZ, &r->T); fe_sub(&r->Z, &trZ, &trT); fe_add(&r->T, &trZ, &trT); } // r = p + q static void x25519_ge_add(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) { fe trX, trY, trZ, trT; fe_add(&r->X, &p->Y, &p->X); fe_sub(&r->Y, &p->Y, &p->X); fe_mul_tll(&trZ, &r->X, &q->YplusX); fe_mul_tll(&trY, &r->Y, &q->YminusX); fe_mul_tlt(&trT, &q->T2d, &p->T); fe_mul_ttl(&trX, &p->Z, &q->Z); fe_add(&r->T, &trX, &trX); fe_sub(&r->X, &trZ, &trY); fe_add(&r->Y, &trZ, &trY); fe_carry(&trZ, &r->T); 
fe_add(&r->Z, &trZ, &trT); fe_sub(&r->T, &trZ, &trT); } // r = p - q static void x25519_ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) { fe trX, trY, trZ, trT; fe_add(&r->X, &p->Y, &p->X); fe_sub(&r->Y, &p->Y, &p->X); fe_mul_tll(&trZ, &r->X, &q->YminusX); fe_mul_tll(&trY, &r->Y, &q->YplusX); fe_mul_tlt(&trT, &q->T2d, &p->T); fe_mul_ttl(&trX, &p->Z, &q->Z); fe_add(&r->T, &trX, &trX); fe_sub(&r->X, &trZ, &trY); fe_add(&r->Y, &trZ, &trY); fe_carry(&trZ, &r->T); fe_sub(&r->Z, &trZ, &trT); fe_add(&r->T, &trZ, &trT); } static void cmov(ge_precomp *t, const ge_precomp *u, uint8_t b) { fe_cmov(&t->yplusx, &u->yplusx, b); fe_cmov(&t->yminusx, &u->yminusx, b); fe_cmov(&t->xy2d, &u->xy2d, b); } #if defined(OPENSSL_SMALL) static void x25519_ge_scalarmult_small_precomp( ge_p3 *h, const uint8_t a[32], const uint8_t precomp_table[15 * 2 * 32]) { // precomp_table is first expanded into matching |ge_precomp| // elements. ge_precomp multiples[15]; unsigned i; for (i = 0; i < 15; i++) { // The precomputed table is assumed to already clear the top bit, so // |fe_frombytes_strict| may be used directly. const uint8_t *bytes = &precomp_table[i*(2 * 32)]; fe x, y; fe_frombytes_strict(&x, bytes); fe_frombytes_strict(&y, bytes + 32); ge_precomp *out = &multiples[i]; fe_add(&out->yplusx, &y, &x); fe_sub(&out->yminusx, &y, &x); fe_mul_ltt(&out->xy2d, &x, &y); fe_mul_llt(&out->xy2d, &out->xy2d, &d2); } // See the comment above |k25519SmallPrecomp| about the structure of the // precomputed elements. This loop does 64 additions and 64 doublings to // calculate the result. ge_p3_0(h); for (i = 63; i < 64; i--) { unsigned j; signed char index = 0; for (j = 0; j < 4; j++) { const uint8_t bit = 1 & (a[(8 * j) + (i / 8)] >> (i & 7)); index |= (bit << j); } ge_precomp e; ge_precomp_0(&e); for (j = 1; j < 16; j++) { cmov(&e, &multiples[j-1], 1&constant_time_eq_w(index, j)); } ge_cached cached; ge_p1p1 r; x25519_ge_p3_to_cached(&cached, h); x25519_ge_add(&r, h, &cached); x25519_ge_p1p1_to_p3(h, &r); ge_madd(&r, h, &e); x25519_ge_p1p1_to_p3(h, &r); } } void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32], int use_adx) { (void)use_adx; x25519_ge_scalarmult_small_precomp(h, a, k25519SmallPrecomp); } #else static void table_select(ge_precomp *t, const int pos, const signed char b) { uint8_t bnegative = constant_time_msb_w(b); uint8_t babs = b - ((bnegative & b) << 1); uint8_t t_bytes[3][32] = { {constant_time_is_zero_w(b) & 1}, {constant_time_is_zero_w(b) & 1}, {0}}; #if defined(__clang__) // materialize for vectorization, 6% speedup __asm__("" : "+m" (t_bytes) : /*no inputs*/); #endif OPENSSL_STATIC_ASSERT(sizeof(t_bytes) == sizeof(k25519Precomp[pos][0]), ""); for (int i = 0; i < 8; i++) { constant_time_conditional_memxor(t_bytes, k25519Precomp[pos][i], sizeof(t_bytes), constant_time_eq_w(babs, 1 + i)); } fe yplusx, yminusx, xy2d; fe_frombytes_strict(&yplusx, t_bytes[0]); fe_frombytes_strict(&yminusx, t_bytes[1]); fe_frombytes_strict(&xy2d, t_bytes[2]); fe_copy_lt(&t->yplusx, &yplusx); fe_copy_lt(&t->yminusx, &yminusx); fe_copy_lt(&t->xy2d, &xy2d); ge_precomp minust; fe_copy_lt(&minust.yplusx, &yminusx); fe_copy_lt(&minust.yminusx, &yplusx); fe_neg(&minust.xy2d, &xy2d); cmov(t, &minust, bnegative>>7); } // h = a * B // where a = a[0]+256*a[1]+...+256^31 a[31] // B is the Ed25519 base point (x,4/5) with x positive. 
// // Preconditions: // a[31] <= 127 void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32], int use_adx) { #if defined(BORINGSSL_FE25519_ADX) if (use_adx) { uint8_t t[4][32]; x25519_ge_scalarmult_base_adx(t, a); fiat_25519_from_bytes(h->X.v, t[0]); fiat_25519_from_bytes(h->Y.v, t[1]); fiat_25519_from_bytes(h->Z.v, t[2]); fiat_25519_from_bytes(h->T.v, t[3]); return; } #else (void)use_adx; #endif signed char e[64]; signed char carry; ge_p1p1 r; ge_p2 s; ge_precomp t; int i; for (i = 0; i < 32; ++i) { e[2 * i + 0] = (a[i] >> 0) & 15; e[2 * i + 1] = (a[i] >> 4) & 15; } // each e[i] is between 0 and 15 // e[63] is between 0 and 7 carry = 0; for (i = 0; i < 63; ++i) { e[i] += carry; carry = e[i] + 8; carry >>= 4; e[i] -= carry << 4; } e[63] += carry; // each e[i] is between -8 and 8 ge_p3_0(h); for (i = 1; i < 64; i += 2) { table_select(&t, i / 2, e[i]); ge_madd(&r, h, &t); x25519_ge_p1p1_to_p3(h, &r); } ge_p3_dbl(&r, h); x25519_ge_p1p1_to_p2(&s, &r); ge_p2_dbl(&r, &s); x25519_ge_p1p1_to_p2(&s, &r); ge_p2_dbl(&r, &s); x25519_ge_p1p1_to_p2(&s, &r); ge_p2_dbl(&r, &s); x25519_ge_p1p1_to_p3(h, &r); for (i = 0; i < 64; i += 2) { table_select(&t, i / 2, e[i]); ge_madd(&r, h, &t); x25519_ge_p1p1_to_p3(h, &r); } } #endif static void slide(signed char *r, const uint8_t *a) { int i; int b; int k; for (i = 0; i < 256; ++i) { r[i] = 1 & (a[i >> 3] >> (i & 7)); } for (i = 0; i < 256; ++i) { if (r[i]) { for (b = 1; b <= 6 && i + b < 256; ++b) { if (r[i + b]) { if (r[i] + (r[i + b] << b) <= 15) { r[i] += r[i + b] << b; r[i + b] = 0; } else if (r[i] - (r[i + b] << b) >= -15) { r[i] -= r[i + b] << b; for (k = i + b; k < 256; ++k) { if (!r[k]) { r[k] = 1; break; } r[k] = 0; } } else { break; } } } } } } // r = a * A + b * B // where a = a[0]+256*a[1]+...+256^31 a[31]. // and b = b[0]+256*b[1]+...+256^31 b[31]. // B is the Ed25519 base point (x,4/5) with x positive. 
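// Note on the digit encoding consumed here: |slide| (above) rewrites each 256-bit
// scalar into signed digits of absolute value at most 15; every non-zero digit is
// odd, so a positive digit d selects the precomputed odd multiple Ai[d/2] (Ai holds
// A, 3A, 5A, ..., 15A) and a negative digit selects the same entry for subtraction.
//
// Illustrative caller sketch (an assumption about typical use, not part of this
// file): an Ed25519-style verifier that has decompressed a public-key point into
// |A| can evaluate h*A + s*B in one interleaved, variable-time pass through the
// exported wrapper |x25519_ge_double_scalarmult_vartime| defined later in this
// file:
//
//   ge_p2 r;
//   x25519_ge_double_scalarmult_vartime(&r, h /* 32-byte scalar */, &A,
//                                       s /* 32-byte scalar */);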
static void ge_double_scalarmult_vartime(ge_p2 *r, const uint8_t *a, const ge_p3 *A, const uint8_t *b) { signed char aslide[256]; signed char bslide[256]; ge_cached Ai[8]; // A,3A,5A,7A,9A,11A,13A,15A ge_p1p1 t; ge_p3 u; ge_p3 A2; int i; slide(aslide, a); slide(bslide, b); x25519_ge_p3_to_cached(&Ai[0], A); ge_p3_dbl(&t, A); x25519_ge_p1p1_to_p3(&A2, &t); x25519_ge_add(&t, &A2, &Ai[0]); x25519_ge_p1p1_to_p3(&u, &t); x25519_ge_p3_to_cached(&Ai[1], &u); x25519_ge_add(&t, &A2, &Ai[1]); x25519_ge_p1p1_to_p3(&u, &t); x25519_ge_p3_to_cached(&Ai[2], &u); x25519_ge_add(&t, &A2, &Ai[2]); x25519_ge_p1p1_to_p3(&u, &t); x25519_ge_p3_to_cached(&Ai[3], &u); x25519_ge_add(&t, &A2, &Ai[3]); x25519_ge_p1p1_to_p3(&u, &t); x25519_ge_p3_to_cached(&Ai[4], &u); x25519_ge_add(&t, &A2, &Ai[4]); x25519_ge_p1p1_to_p3(&u, &t); x25519_ge_p3_to_cached(&Ai[5], &u); x25519_ge_add(&t, &A2, &Ai[5]); x25519_ge_p1p1_to_p3(&u, &t); x25519_ge_p3_to_cached(&Ai[6], &u); x25519_ge_add(&t, &A2, &Ai[6]); x25519_ge_p1p1_to_p3(&u, &t); x25519_ge_p3_to_cached(&Ai[7], &u); ge_p2_0(r); for (i = 255; i >= 0; --i) { if (aslide[i] || bslide[i]) { break; } } for (; i >= 0; --i) { ge_p2_dbl(&t, r); if (aslide[i] > 0) { x25519_ge_p1p1_to_p3(&u, &t); x25519_ge_add(&t, &u, &Ai[aslide[i] / 2]); } else if (aslide[i] < 0) { x25519_ge_p1p1_to_p3(&u, &t); x25519_ge_sub(&t, &u, &Ai[(-aslide[i]) / 2]); } if (bslide[i] > 0) { x25519_ge_p1p1_to_p3(&u, &t); ge_madd(&t, &u, &Bi[bslide[i] / 2]); } else if (bslide[i] < 0) { x25519_ge_p1p1_to_p3(&u, &t); ge_msub(&t, &u, &Bi[(-bslide[i]) / 2]); } x25519_ge_p1p1_to_p2(r, &t); } } // int64_lshift21 returns |a << 21| but is defined when shifting bits into the // sign bit. This works around a language flaw in C. static inline int64_t int64_lshift21(int64_t a) { return (int64_t)((uint64_t)a << 21); } // The set of scalars is \Z/l // where l = 2^252 + 27742317777372353535851937790883648493. // Input: // s[0]+256*s[1]+...+256^63*s[63] = s // // Output: // s[0]+256*s[1]+...+256^31*s[31] = s mod l // where l = 2^252 + 27742317777372353535851937790883648493. // Overwrites s in place. 
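// How the reduction below is organised (explanatory note): the 64 input bytes are
// unpacked by load_3/load_4 plus the mask 2097151 = 2^21 - 1 into 24 limbs
// s0..s23 with s = s0 + s1*2^21 + ... + s23*2^(21*23). Limbs of weight 2^252 and
// above are folded down using 2^252 ≡ -(l - 2^252) (mod l); in effect the six
// constants 666643, 470296, 654183, 997805, 136657 and 683901 are the digit
// magnitudes of a signed base-2^21 expansion of l - 2^252, so folding limb s_k
// spreads its contribution over s_(k-12)..s_(k-7) with the signs written out in
// the code. The (s + (1 << 20)) >> 21 steps are rounding carries that keep every
// limb near 21 bits.
//
// Illustrative caller sketch (an assumption about typical use, not part of this
// file): reducing a 64-byte hash output to a scalar before handing it to
// |sc_muladd|, as an Ed25519-style signer or verifier would:
//
//   uint8_t k[64];               /* e.g. a little-endian SHA-512 digest */
//   x25519_sc_reduce(k);         /* k[0..31] now holds k mod l; k[32..63] is untouched */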
void x25519_sc_reduce(uint8_t s[64]) { int64_t s0 = 2097151 & load_3(s); int64_t s1 = 2097151 & (load_4(s + 2) >> 5); int64_t s2 = 2097151 & (load_3(s + 5) >> 2); int64_t s3 = 2097151 & (load_4(s + 7) >> 7); int64_t s4 = 2097151 & (load_4(s + 10) >> 4); int64_t s5 = 2097151 & (load_3(s + 13) >> 1); int64_t s6 = 2097151 & (load_4(s + 15) >> 6); int64_t s7 = 2097151 & (load_3(s + 18) >> 3); int64_t s8 = 2097151 & load_3(s + 21); int64_t s9 = 2097151 & (load_4(s + 23) >> 5); int64_t s10 = 2097151 & (load_3(s + 26) >> 2); int64_t s11 = 2097151 & (load_4(s + 28) >> 7); int64_t s12 = 2097151 & (load_4(s + 31) >> 4); int64_t s13 = 2097151 & (load_3(s + 34) >> 1); int64_t s14 = 2097151 & (load_4(s + 36) >> 6); int64_t s15 = 2097151 & (load_3(s + 39) >> 3); int64_t s16 = 2097151 & load_3(s + 42); int64_t s17 = 2097151 & (load_4(s + 44) >> 5); int64_t s18 = 2097151 & (load_3(s + 47) >> 2); int64_t s19 = 2097151 & (load_4(s + 49) >> 7); int64_t s20 = 2097151 & (load_4(s + 52) >> 4); int64_t s21 = 2097151 & (load_3(s + 55) >> 1); int64_t s22 = 2097151 & (load_4(s + 57) >> 6); int64_t s23 = (load_4(s + 60) >> 3); int64_t carry0; int64_t carry1; int64_t carry2; int64_t carry3; int64_t carry4; int64_t carry5; int64_t carry6; int64_t carry7; int64_t carry8; int64_t carry9; int64_t carry10; int64_t carry11; int64_t carry12; int64_t carry13; int64_t carry14; int64_t carry15; int64_t carry16; s11 += s23 * 666643; s12 += s23 * 470296; s13 += s23 * 654183; s14 -= s23 * 997805; s15 += s23 * 136657; s16 -= s23 * 683901; s23 = 0; s10 += s22 * 666643; s11 += s22 * 470296; s12 += s22 * 654183; s13 -= s22 * 997805; s14 += s22 * 136657; s15 -= s22 * 683901; s22 = 0; s9 += s21 * 666643; s10 += s21 * 470296; s11 += s21 * 654183; s12 -= s21 * 997805; s13 += s21 * 136657; s14 -= s21 * 683901; s21 = 0; s8 += s20 * 666643; s9 += s20 * 470296; s10 += s20 * 654183; s11 -= s20 * 997805; s12 += s20 * 136657; s13 -= s20 * 683901; s20 = 0; s7 += s19 * 666643; s8 += s19 * 470296; s9 += s19 * 654183; s10 -= s19 * 997805; s11 += s19 * 136657; s12 -= s19 * 683901; s19 = 0; s6 += s18 * 666643; s7 += s18 * 470296; s8 += s18 * 654183; s9 -= s18 * 997805; s10 += s18 * 136657; s11 -= s18 * 683901; s18 = 0; carry6 = (s6 + (1 << 20)) >> 21; s7 += carry6; s6 -= int64_lshift21(carry6); carry8 = (s8 + (1 << 20)) >> 21; s9 += carry8; s8 -= int64_lshift21(carry8); carry10 = (s10 + (1 << 20)) >> 21; s11 += carry10; s10 -= int64_lshift21(carry10); carry12 = (s12 + (1 << 20)) >> 21; s13 += carry12; s12 -= int64_lshift21(carry12); carry14 = (s14 + (1 << 20)) >> 21; s15 += carry14; s14 -= int64_lshift21(carry14); carry16 = (s16 + (1 << 20)) >> 21; s17 += carry16; s16 -= int64_lshift21(carry16); carry7 = (s7 + (1 << 20)) >> 21; s8 += carry7; s7 -= int64_lshift21(carry7); carry9 = (s9 + (1 << 20)) >> 21; s10 += carry9; s9 -= int64_lshift21(carry9); carry11 = (s11 + (1 << 20)) >> 21; s12 += carry11; s11 -= int64_lshift21(carry11); carry13 = (s13 + (1 << 20)) >> 21; s14 += carry13; s13 -= int64_lshift21(carry13); carry15 = (s15 + (1 << 20)) >> 21; s16 += carry15; s15 -= int64_lshift21(carry15); s5 += s17 * 666643; s6 += s17 * 470296; s7 += s17 * 654183; s8 -= s17 * 997805; s9 += s17 * 136657; s10 -= s17 * 683901; s17 = 0; s4 += s16 * 666643; s5 += s16 * 470296; s6 += s16 * 654183; s7 -= s16 * 997805; s8 += s16 * 136657; s9 -= s16 * 683901; s16 = 0; s3 += s15 * 666643; s4 += s15 * 470296; s5 += s15 * 654183; s6 -= s15 * 997805; s7 += s15 * 136657; s8 -= s15 * 683901; s15 = 0; s2 += s14 * 666643; s3 += s14 * 470296; s4 += s14 * 654183; s5 -= s14 * 997805; 
s6 += s14 * 136657; s7 -= s14 * 683901; s14 = 0; s1 += s13 * 666643; s2 += s13 * 470296; s3 += s13 * 654183; s4 -= s13 * 997805; s5 += s13 * 136657; s6 -= s13 * 683901; s13 = 0; s0 += s12 * 666643; s1 += s12 * 470296; s2 += s12 * 654183; s3 -= s12 * 997805; s4 += s12 * 136657; s5 -= s12 * 683901; s12 = 0; carry0 = (s0 + (1 << 20)) >> 21; s1 += carry0; s0 -= int64_lshift21(carry0); carry2 = (s2 + (1 << 20)) >> 21; s3 += carry2; s2 -= int64_lshift21(carry2); carry4 = (s4 + (1 << 20)) >> 21; s5 += carry4; s4 -= int64_lshift21(carry4); carry6 = (s6 + (1 << 20)) >> 21; s7 += carry6; s6 -= int64_lshift21(carry6); carry8 = (s8 + (1 << 20)) >> 21; s9 += carry8; s8 -= int64_lshift21(carry8); carry10 = (s10 + (1 << 20)) >> 21; s11 += carry10; s10 -= int64_lshift21(carry10); carry1 = (s1 + (1 << 20)) >> 21; s2 += carry1; s1 -= int64_lshift21(carry1); carry3 = (s3 + (1 << 20)) >> 21; s4 += carry3; s3 -= int64_lshift21(carry3); carry5 = (s5 + (1 << 20)) >> 21; s6 += carry5; s5 -= int64_lshift21(carry5); carry7 = (s7 + (1 << 20)) >> 21; s8 += carry7; s7 -= int64_lshift21(carry7); carry9 = (s9 + (1 << 20)) >> 21; s10 += carry9; s9 -= int64_lshift21(carry9); carry11 = (s11 + (1 << 20)) >> 21; s12 += carry11; s11 -= int64_lshift21(carry11); s0 += s12 * 666643; s1 += s12 * 470296; s2 += s12 * 654183; s3 -= s12 * 997805; s4 += s12 * 136657; s5 -= s12 * 683901; s12 = 0; carry0 = s0 >> 21; s1 += carry0; s0 -= int64_lshift21(carry0); carry1 = s1 >> 21; s2 += carry1; s1 -= int64_lshift21(carry1); carry2 = s2 >> 21; s3 += carry2; s2 -= int64_lshift21(carry2); carry3 = s3 >> 21; s4 += carry3; s3 -= int64_lshift21(carry3); carry4 = s4 >> 21; s5 += carry4; s4 -= int64_lshift21(carry4); carry5 = s5 >> 21; s6 += carry5; s5 -= int64_lshift21(carry5); carry6 = s6 >> 21; s7 += carry6; s6 -= int64_lshift21(carry6); carry7 = s7 >> 21; s8 += carry7; s7 -= int64_lshift21(carry7); carry8 = s8 >> 21; s9 += carry8; s8 -= int64_lshift21(carry8); carry9 = s9 >> 21; s10 += carry9; s9 -= int64_lshift21(carry9); carry10 = s10 >> 21; s11 += carry10; s10 -= int64_lshift21(carry10); carry11 = s11 >> 21; s12 += carry11; s11 -= int64_lshift21(carry11); s0 += s12 * 666643; s1 += s12 * 470296; s2 += s12 * 654183; s3 -= s12 * 997805; s4 += s12 * 136657; s5 -= s12 * 683901; s12 = 0; carry0 = s0 >> 21; s1 += carry0; s0 -= int64_lshift21(carry0); carry1 = s1 >> 21; s2 += carry1; s1 -= int64_lshift21(carry1); carry2 = s2 >> 21; s3 += carry2; s2 -= int64_lshift21(carry2); carry3 = s3 >> 21; s4 += carry3; s3 -= int64_lshift21(carry3); carry4 = s4 >> 21; s5 += carry4; s4 -= int64_lshift21(carry4); carry5 = s5 >> 21; s6 += carry5; s5 -= int64_lshift21(carry5); carry6 = s6 >> 21; s7 += carry6; s6 -= int64_lshift21(carry6); carry7 = s7 >> 21; s8 += carry7; s7 -= int64_lshift21(carry7); carry8 = s8 >> 21; s9 += carry8; s8 -= int64_lshift21(carry8); carry9 = s9 >> 21; s10 += carry9; s9 -= int64_lshift21(carry9); carry10 = s10 >> 21; s11 += carry10; s10 -= int64_lshift21(carry10); s[0] = s0 >> 0; s[1] = s0 >> 8; s[2] = (s0 >> 16) | (s1 << 5); s[3] = s1 >> 3; s[4] = s1 >> 11; s[5] = (s1 >> 19) | (s2 << 2); s[6] = s2 >> 6; s[7] = (s2 >> 14) | (s3 << 7); s[8] = s3 >> 1; s[9] = s3 >> 9; s[10] = (s3 >> 17) | (s4 << 4); s[11] = s4 >> 4; s[12] = s4 >> 12; s[13] = (s4 >> 20) | (s5 << 1); s[14] = s5 >> 7; s[15] = (s5 >> 15) | (s6 << 6); s[16] = s6 >> 2; s[17] = s6 >> 10; s[18] = (s6 >> 18) | (s7 << 3); s[19] = s7 >> 5; s[20] = s7 >> 13; s[21] = s8 >> 0; s[22] = s8 >> 8; s[23] = (s8 >> 16) | (s9 << 5); s[24] = s9 >> 3; s[25] = s9 >> 11; s[26] = (s9 >> 19) | (s10 
<< 2); s[27] = s10 >> 6; s[28] = (s10 >> 14) | (s11 << 7); s[29] = s11 >> 1; s[30] = s11 >> 9; s[31] = s11 >> 17; } // Input: // a[0]+256*a[1]+...+256^31*a[31] = a // b[0]+256*b[1]+...+256^31*b[31] = b // c[0]+256*c[1]+...+256^31*c[31] = c // // Output: // s[0]+256*s[1]+...+256^31*s[31] = (ab+c) mod l // where l = 2^252 + 27742317777372353535851937790883648493. static void sc_muladd(uint8_t *s, const uint8_t *a, const uint8_t *b, const uint8_t *c) { int64_t a0 = 2097151 & load_3(a); int64_t a1 = 2097151 & (load_4(a + 2) >> 5); int64_t a2 = 2097151 & (load_3(a + 5) >> 2); int64_t a3 = 2097151 & (load_4(a + 7) >> 7); int64_t a4 = 2097151 & (load_4(a + 10) >> 4); int64_t a5 = 2097151 & (load_3(a + 13) >> 1); int64_t a6 = 2097151 & (load_4(a + 15) >> 6); int64_t a7 = 2097151 & (load_3(a + 18) >> 3); int64_t a8 = 2097151 & load_3(a + 21); int64_t a9 = 2097151 & (load_4(a + 23) >> 5); int64_t a10 = 2097151 & (load_3(a + 26) >> 2); int64_t a11 = (load_4(a + 28) >> 7); int64_t b0 = 2097151 & load_3(b); int64_t b1 = 2097151 & (load_4(b + 2) >> 5); int64_t b2 = 2097151 & (load_3(b + 5) >> 2); int64_t b3 = 2097151 & (load_4(b + 7) >> 7); int64_t b4 = 2097151 & (load_4(b + 10) >> 4); int64_t b5 = 2097151 & (load_3(b + 13) >> 1); int64_t b6 = 2097151 & (load_4(b + 15) >> 6); int64_t b7 = 2097151 & (load_3(b + 18) >> 3); int64_t b8 = 2097151 & load_3(b + 21); int64_t b9 = 2097151 & (load_4(b + 23) >> 5); int64_t b10 = 2097151 & (load_3(b + 26) >> 2); int64_t b11 = (load_4(b + 28) >> 7); int64_t c0 = 2097151 & load_3(c); int64_t c1 = 2097151 & (load_4(c + 2) >> 5); int64_t c2 = 2097151 & (load_3(c + 5) >> 2); int64_t c3 = 2097151 & (load_4(c + 7) >> 7); int64_t c4 = 2097151 & (load_4(c + 10) >> 4); int64_t c5 = 2097151 & (load_3(c + 13) >> 1); int64_t c6 = 2097151 & (load_4(c + 15) >> 6); int64_t c7 = 2097151 & (load_3(c + 18) >> 3); int64_t c8 = 2097151 & load_3(c + 21); int64_t c9 = 2097151 & (load_4(c + 23) >> 5); int64_t c10 = 2097151 & (load_3(c + 26) >> 2); int64_t c11 = (load_4(c + 28) >> 7); int64_t s0; int64_t s1; int64_t s2; int64_t s3; int64_t s4; int64_t s5; int64_t s6; int64_t s7; int64_t s8; int64_t s9; int64_t s10; int64_t s11; int64_t s12; int64_t s13; int64_t s14; int64_t s15; int64_t s16; int64_t s17; int64_t s18; int64_t s19; int64_t s20; int64_t s21; int64_t s22; int64_t s23; int64_t carry0; int64_t carry1; int64_t carry2; int64_t carry3; int64_t carry4; int64_t carry5; int64_t carry6; int64_t carry7; int64_t carry8; int64_t carry9; int64_t carry10; int64_t carry11; int64_t carry12; int64_t carry13; int64_t carry14; int64_t carry15; int64_t carry16; int64_t carry17; int64_t carry18; int64_t carry19; int64_t carry20; int64_t carry21; int64_t carry22; s0 = c0 + a0 * b0; s1 = c1 + a0 * b1 + a1 * b0; s2 = c2 + a0 * b2 + a1 * b1 + a2 * b0; s3 = c3 + a0 * b3 + a1 * b2 + a2 * b1 + a3 * b0; s4 = c4 + a0 * b4 + a1 * b3 + a2 * b2 + a3 * b1 + a4 * b0; s5 = c5 + a0 * b5 + a1 * b4 + a2 * b3 + a3 * b2 + a4 * b1 + a5 * b0; s6 = c6 + a0 * b6 + a1 * b5 + a2 * b4 + a3 * b3 + a4 * b2 + a5 * b1 + a6 * b0; s7 = c7 + a0 * b7 + a1 * b6 + a2 * b5 + a3 * b4 + a4 * b3 + a5 * b2 + a6 * b1 + a7 * b0; s8 = c8 + a0 * b8 + a1 * b7 + a2 * b6 + a3 * b5 + a4 * b4 + a5 * b3 + a6 * b2 + a7 * b1 + a8 * b0; s9 = c9 + a0 * b9 + a1 * b8 + a2 * b7 + a3 * b6 + a4 * b5 + a5 * b4 + a6 * b3 + a7 * b2 + a8 * b1 + a9 * b0; s10 = c10 + a0 * b10 + a1 * b9 + a2 * b8 + a3 * b7 + a4 * b6 + a5 * b5 + a6 * b4 + a7 * b3 + a8 * b2 + a9 * b1 + a10 * b0; s11 = c11 + a0 * b11 + a1 * b10 + a2 * b9 + a3 * b8 + a4 * b7 + a5 * b6 + a6 * b5 + a7 * 
b4 + a8 * b3 + a9 * b2 + a10 * b1 + a11 * b0; s12 = a1 * b11 + a2 * b10 + a3 * b9 + a4 * b8 + a5 * b7 + a6 * b6 + a7 * b5 + a8 * b4 + a9 * b3 + a10 * b2 + a11 * b1; s13 = a2 * b11 + a3 * b10 + a4 * b9 + a5 * b8 + a6 * b7 + a7 * b6 + a8 * b5 + a9 * b4 + a10 * b3 + a11 * b2; s14 = a3 * b11 + a4 * b10 + a5 * b9 + a6 * b8 + a7 * b7 + a8 * b6 + a9 * b5 + a10 * b4 + a11 * b3; s15 = a4 * b11 + a5 * b10 + a6 * b9 + a7 * b8 + a8 * b7 + a9 * b6 + a10 * b5 + a11 * b4; s16 = a5 * b11 + a6 * b10 + a7 * b9 + a8 * b8 + a9 * b7 + a10 * b6 + a11 * b5; s17 = a6 * b11 + a7 * b10 + a8 * b9 + a9 * b8 + a10 * b7 + a11 * b6; s18 = a7 * b11 + a8 * b10 + a9 * b9 + a10 * b8 + a11 * b7; s19 = a8 * b11 + a9 * b10 + a10 * b9 + a11 * b8; s20 = a9 * b11 + a10 * b10 + a11 * b9; s21 = a10 * b11 + a11 * b10; s22 = a11 * b11; s23 = 0; carry0 = (s0 + (1 << 20)) >> 21; s1 += carry0; s0 -= int64_lshift21(carry0); carry2 = (s2 + (1 << 20)) >> 21; s3 += carry2; s2 -= int64_lshift21(carry2); carry4 = (s4 + (1 << 20)) >> 21; s5 += carry4; s4 -= int64_lshift21(carry4); carry6 = (s6 + (1 << 20)) >> 21; s7 += carry6; s6 -= int64_lshift21(carry6); carry8 = (s8 + (1 << 20)) >> 21; s9 += carry8; s8 -= int64_lshift21(carry8); carry10 = (s10 + (1 << 20)) >> 21; s11 += carry10; s10 -= int64_lshift21(carry10); carry12 = (s12 + (1 << 20)) >> 21; s13 += carry12; s12 -= int64_lshift21(carry12); carry14 = (s14 + (1 << 20)) >> 21; s15 += carry14; s14 -= int64_lshift21(carry14); carry16 = (s16 + (1 << 20)) >> 21; s17 += carry16; s16 -= int64_lshift21(carry16); carry18 = (s18 + (1 << 20)) >> 21; s19 += carry18; s18 -= int64_lshift21(carry18); carry20 = (s20 + (1 << 20)) >> 21; s21 += carry20; s20 -= int64_lshift21(carry20); carry22 = (s22 + (1 << 20)) >> 21; s23 += carry22; s22 -= int64_lshift21(carry22); carry1 = (s1 + (1 << 20)) >> 21; s2 += carry1; s1 -= int64_lshift21(carry1); carry3 = (s3 + (1 << 20)) >> 21; s4 += carry3; s3 -= int64_lshift21(carry3); carry5 = (s5 + (1 << 20)) >> 21; s6 += carry5; s5 -= int64_lshift21(carry5); carry7 = (s7 + (1 << 20)) >> 21; s8 += carry7; s7 -= int64_lshift21(carry7); carry9 = (s9 + (1 << 20)) >> 21; s10 += carry9; s9 -= int64_lshift21(carry9); carry11 = (s11 + (1 << 20)) >> 21; s12 += carry11; s11 -= int64_lshift21(carry11); carry13 = (s13 + (1 << 20)) >> 21; s14 += carry13; s13 -= int64_lshift21(carry13); carry15 = (s15 + (1 << 20)) >> 21; s16 += carry15; s15 -= int64_lshift21(carry15); carry17 = (s17 + (1 << 20)) >> 21; s18 += carry17; s17 -= int64_lshift21(carry17); carry19 = (s19 + (1 << 20)) >> 21; s20 += carry19; s19 -= int64_lshift21(carry19); carry21 = (s21 + (1 << 20)) >> 21; s22 += carry21; s21 -= int64_lshift21(carry21); s11 += s23 * 666643; s12 += s23 * 470296; s13 += s23 * 654183; s14 -= s23 * 997805; s15 += s23 * 136657; s16 -= s23 * 683901; s23 = 0; s10 += s22 * 666643; s11 += s22 * 470296; s12 += s22 * 654183; s13 -= s22 * 997805; s14 += s22 * 136657; s15 -= s22 * 683901; s22 = 0; s9 += s21 * 666643; s10 += s21 * 470296; s11 += s21 * 654183; s12 -= s21 * 997805; s13 += s21 * 136657; s14 -= s21 * 683901; s21 = 0; s8 += s20 * 666643; s9 += s20 * 470296; s10 += s20 * 654183; s11 -= s20 * 997805; s12 += s20 * 136657; s13 -= s20 * 683901; s20 = 0; s7 += s19 * 666643; s8 += s19 * 470296; s9 += s19 * 654183; s10 -= s19 * 997805; s11 += s19 * 136657; s12 -= s19 * 683901; s19 = 0; s6 += s18 * 666643; s7 += s18 * 470296; s8 += s18 * 654183; s9 -= s18 * 997805; s10 += s18 * 136657; s11 -= s18 * 683901; s18 = 0; carry6 = (s6 + (1 << 20)) >> 21; s7 += carry6; s6 -= int64_lshift21(carry6); carry8 = (s8 + 
(1 << 20)) >> 21; s9 += carry8; s8 -= int64_lshift21(carry8); carry10 = (s10 + (1 << 20)) >> 21; s11 += carry10; s10 -= int64_lshift21(carry10); carry12 = (s12 + (1 << 20)) >> 21; s13 += carry12; s12 -= int64_lshift21(carry12); carry14 = (s14 + (1 << 20)) >> 21; s15 += carry14; s14 -= int64_lshift21(carry14); carry16 = (s16 + (1 << 20)) >> 21; s17 += carry16; s16 -= int64_lshift21(carry16); carry7 = (s7 + (1 << 20)) >> 21; s8 += carry7; s7 -= int64_lshift21(carry7); carry9 = (s9 + (1 << 20)) >> 21; s10 += carry9; s9 -= int64_lshift21(carry9); carry11 = (s11 + (1 << 20)) >> 21; s12 += carry11; s11 -= int64_lshift21(carry11); carry13 = (s13 + (1 << 20)) >> 21; s14 += carry13; s13 -= int64_lshift21(carry13); carry15 = (s15 + (1 << 20)) >> 21; s16 += carry15; s15 -= int64_lshift21(carry15); s5 += s17 * 666643; s6 += s17 * 470296; s7 += s17 * 654183; s8 -= s17 * 997805; s9 += s17 * 136657; s10 -= s17 * 683901; s17 = 0; s4 += s16 * 666643; s5 += s16 * 470296; s6 += s16 * 654183; s7 -= s16 * 997805; s8 += s16 * 136657; s9 -= s16 * 683901; s16 = 0; s3 += s15 * 666643; s4 += s15 * 470296; s5 += s15 * 654183; s6 -= s15 * 997805; s7 += s15 * 136657; s8 -= s15 * 683901; s15 = 0; s2 += s14 * 666643; s3 += s14 * 470296; s4 += s14 * 654183; s5 -= s14 * 997805; s6 += s14 * 136657; s7 -= s14 * 683901; s14 = 0; s1 += s13 * 666643; s2 += s13 * 470296; s3 += s13 * 654183; s4 -= s13 * 997805; s5 += s13 * 136657; s6 -= s13 * 683901; s13 = 0; s0 += s12 * 666643; s1 += s12 * 470296; s2 += s12 * 654183; s3 -= s12 * 997805; s4 += s12 * 136657; s5 -= s12 * 683901; s12 = 0; carry0 = (s0 + (1 << 20)) >> 21; s1 += carry0; s0 -= int64_lshift21(carry0); carry2 = (s2 + (1 << 20)) >> 21; s3 += carry2; s2 -= int64_lshift21(carry2); carry4 = (s4 + (1 << 20)) >> 21; s5 += carry4; s4 -= int64_lshift21(carry4); carry6 = (s6 + (1 << 20)) >> 21; s7 += carry6; s6 -= int64_lshift21(carry6); carry8 = (s8 + (1 << 20)) >> 21; s9 += carry8; s8 -= int64_lshift21(carry8); carry10 = (s10 + (1 << 20)) >> 21; s11 += carry10; s10 -= int64_lshift21(carry10); carry1 = (s1 + (1 << 20)) >> 21; s2 += carry1; s1 -= int64_lshift21(carry1); carry3 = (s3 + (1 << 20)) >> 21; s4 += carry3; s3 -= int64_lshift21(carry3); carry5 = (s5 + (1 << 20)) >> 21; s6 += carry5; s5 -= int64_lshift21(carry5); carry7 = (s7 + (1 << 20)) >> 21; s8 += carry7; s7 -= int64_lshift21(carry7); carry9 = (s9 + (1 << 20)) >> 21; s10 += carry9; s9 -= int64_lshift21(carry9); carry11 = (s11 + (1 << 20)) >> 21; s12 += carry11; s11 -= int64_lshift21(carry11); s0 += s12 * 666643; s1 += s12 * 470296; s2 += s12 * 654183; s3 -= s12 * 997805; s4 += s12 * 136657; s5 -= s12 * 683901; s12 = 0; carry0 = s0 >> 21; s1 += carry0; s0 -= int64_lshift21(carry0); carry1 = s1 >> 21; s2 += carry1; s1 -= int64_lshift21(carry1); carry2 = s2 >> 21; s3 += carry2; s2 -= int64_lshift21(carry2); carry3 = s3 >> 21; s4 += carry3; s3 -= int64_lshift21(carry3); carry4 = s4 >> 21; s5 += carry4; s4 -= int64_lshift21(carry4); carry5 = s5 >> 21; s6 += carry5; s5 -= int64_lshift21(carry5); carry6 = s6 >> 21; s7 += carry6; s6 -= int64_lshift21(carry6); carry7 = s7 >> 21; s8 += carry7; s7 -= int64_lshift21(carry7); carry8 = s8 >> 21; s9 += carry8; s8 -= int64_lshift21(carry8); carry9 = s9 >> 21; s10 += carry9; s9 -= int64_lshift21(carry9); carry10 = s10 >> 21; s11 += carry10; s10 -= int64_lshift21(carry10); carry11 = s11 >> 21; s12 += carry11; s11 -= int64_lshift21(carry11); s0 += s12 * 666643; s1 += s12 * 470296; s2 += s12 * 654183; s3 -= s12 * 997805; s4 += s12 * 136657; s5 -= s12 * 683901; s12 = 0; carry0 = s0 >> 
21; s1 += carry0; s0 -= int64_lshift21(carry0); carry1 = s1 >> 21; s2 += carry1; s1 -= int64_lshift21(carry1); carry2 = s2 >> 21; s3 += carry2; s2 -= int64_lshift21(carry2); carry3 = s3 >> 21; s4 += carry3; s3 -= int64_lshift21(carry3); carry4 = s4 >> 21; s5 += carry4; s4 -= int64_lshift21(carry4); carry5 = s5 >> 21; s6 += carry5; s5 -= int64_lshift21(carry5); carry6 = s6 >> 21; s7 += carry6; s6 -= int64_lshift21(carry6); carry7 = s7 >> 21; s8 += carry7; s7 -= int64_lshift21(carry7); carry8 = s8 >> 21; s9 += carry8; s8 -= int64_lshift21(carry8); carry9 = s9 >> 21; s10 += carry9; s9 -= int64_lshift21(carry9); carry10 = s10 >> 21; s11 += carry10; s10 -= int64_lshift21(carry10); s[0] = s0 >> 0; s[1] = s0 >> 8; s[2] = (s0 >> 16) | (s1 << 5); s[3] = s1 >> 3; s[4] = s1 >> 11; s[5] = (s1 >> 19) | (s2 << 2); s[6] = s2 >> 6; s[7] = (s2 >> 14) | (s3 << 7); s[8] = s3 >> 1; s[9] = s3 >> 9; s[10] = (s3 >> 17) | (s4 << 4); s[11] = s4 >> 4; s[12] = s4 >> 12; s[13] = (s4 >> 20) | (s5 << 1); s[14] = s5 >> 7; s[15] = (s5 >> 15) | (s6 << 6); s[16] = s6 >> 2; s[17] = s6 >> 10; s[18] = (s6 >> 18) | (s7 << 3); s[19] = s7 >> 5; s[20] = s7 >> 13; s[21] = s8 >> 0; s[22] = s8 >> 8; s[23] = (s8 >> 16) | (s9 << 5); s[24] = s9 >> 3; s[25] = s9 >> 11; s[26] = (s9 >> 19) | (s10 << 2); s[27] = s10 >> 6; s[28] = (s10 >> 14) | (s11 << 7); s[29] = s11 >> 1; s[30] = s11 >> 9; s[31] = s11 >> 17; } void x25519_scalar_mult_generic_masked(uint8_t out[32], const uint8_t scalar_masked[32], const uint8_t point[32]) { fe x1, x2, z2, x3, z3, tmp0, tmp1; fe_loose x2l, z2l, x3l, tmp0l, tmp1l; uint8_t e[32]; OPENSSL_memcpy(e, scalar_masked, 32); // The following implementation was transcribed to Coq and proven to // correspond to unary scalar multiplication in affine coordinates given that // x1 != 0 is the x coordinate of some point on the curve. It was also checked // in Coq that doing a ladderstep with x1 = x3 = 0 gives z2' = z3' = 0, and z2 // = z3 = 0 gives z2' = z3' = 0. The statement was quantified over the // underlying field, so it applies to Curve25519 itself and the quadratic // twist of Curve25519. It was not proven in Coq that prime-field arithmetic // correctly simulates extension-field arithmetic on prime-field values. // The decoding of the byte array representation of e was not considered. 
// Specification of Montgomery curves in affine coordinates: // // Proof that these form a group that is isomorphic to a Weierstrass curve: // // Coq transcription and correctness proof of the loop (where scalarbits=255): // // // preconditions: 0 <= e < 2^255 (not necessarily e < order), fe_invert(0) = 0 fe_frombytes(&x1, point); fe_1(&x2); fe_0(&z2); fe_copy(&x3, &x1); fe_1(&z3); unsigned swap = 0; int pos; for (pos = 254; pos >= 0; --pos) { // loop invariant as of right before the test, for the case where x1 != 0: // pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3 is nonzero // let r := e >> (pos+1) in the following equalities of projective points: // to_xz (r*P) === if swap then (x3, z3) else (x2, z2) // to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3) // x1 is the nonzero x coordinate of the nonzero point (r*P-(r+1)*P) unsigned b = 1 & (e[pos / 8] >> (pos & 7)); swap ^= b; fe_cswap(&x2, &x3, swap); fe_cswap(&z2, &z3, swap); swap = b; // Coq transcription of ladderstep formula (called from transcribed loop): // // // x1 != 0 // x1 = 0 fe_sub(&tmp0l, &x3, &z3); fe_sub(&tmp1l, &x2, &z2); fe_add(&x2l, &x2, &z2); fe_add(&z2l, &x3, &z3); fe_mul_tll(&z3, &tmp0l, &x2l); fe_mul_tll(&z2, &z2l, &tmp1l); fe_sq_tl(&tmp0, &tmp1l); fe_sq_tl(&tmp1, &x2l); fe_add(&x3l, &z3, &z2); fe_sub(&z2l, &z3, &z2); fe_mul_ttt(&x2, &tmp1, &tmp0); fe_sub(&tmp1l, &tmp1, &tmp0); fe_sq_tl(&z2, &z2l); fe_mul121666(&z3, &tmp1l); fe_sq_tl(&x3, &x3l); fe_add(&tmp0l, &tmp0, &z3); fe_mul_ttt(&z3, &x1, &z2); fe_mul_tll(&z2, &tmp1l, &tmp0l); } // here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3) else (x2, z2) fe_cswap(&x2, &x3, swap); fe_cswap(&z2, &z3, swap); fe_invert(&z2, &z2); fe_mul_ttt(&x2, &x2, &z2); fe_tobytes(out, &x2); } void x25519_public_from_private_generic_masked(uint8_t out_public_value[32], const uint8_t private_key_masked[32], int use_adx) { uint8_t e[32]; OPENSSL_memcpy(e, private_key_masked, 32); ge_p3 A; x25519_ge_scalarmult_base(&A, e, use_adx); // We only need the u-coordinate of the curve25519 point. The map is // u=(y+1)/(1-y). Since y=Y/Z, this gives u=(Z+Y)/(Z-Y). fe_loose zplusy, zminusy; fe zminusy_inv; fe_add(&zplusy, &A.Z, &A.Y); fe_sub(&zminusy, &A.Z, &A.Y); fe_loose_invert(&zminusy_inv, &zminusy); fe_mul_tlt(&zminusy_inv, &zplusy, &zminusy_inv); fe_tobytes(out_public_value, &zminusy_inv); CONSTTIME_DECLASSIFY(out_public_value, 32); } void x25519_fe_invert(fe *out, const fe *z) { fe_invert(out, z); } uint8_t x25519_fe_isnegative(const fe *f) { return (uint8_t)fe_isnegative(f); } void x25519_fe_mul_ttt(fe *h, const fe *f, const fe *g) { fe_mul_ttt(h, f, g); } void x25519_fe_neg(fe *f) { fe_loose t; fe_neg(&t, f); fe_carry(f, &t); } void x25519_fe_tobytes(uint8_t s[32], const fe *h) { fe_tobytes(s, h); } void x25519_ge_double_scalarmult_vartime(ge_p2 *r, const uint8_t *a, const ge_p3 *A, const uint8_t *b) { ge_double_scalarmult_vartime(r, a, A, b); } void x25519_sc_mask(uint8_t a[32]) { a[0] &= 248; a[31] &= 127; a[31] |= 64; } void x25519_sc_muladd(uint8_t *s, const uint8_t *a, const uint8_t *b, const uint8_t *c) { sc_muladd(s, a, b, c); } ring-0.17.14/crypto/curve25519/curve25519_64_adx.c000064400000000000000000000015351046102023000170560ustar 00000000000000// Copyright 2023 The BoringSSL Authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "internal.h" #if defined(BORINGSSL_FE25519_ADX) #pragma GCC diagnostic ignored "-Wconversion" #pragma GCC diagnostic ignored "-Wpedantic" #pragma GCC diagnostic ignored "-Wsign-conversion" #include "../../third_party/fiat/curve25519_64_adx.h" #endif ring-0.17.14/crypto/curve25519/curve25519_tables.h000064400000000000000000006052461046102023000172610ustar 00000000000000// Copyright 2020 The BoringSSL Authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // This file is generated from // ./make_curve25519_tables.py > curve25519_tables.h static const fe d = {{ #if defined(OPENSSL_64_BIT) 929955233495203, 466365720129213, 1662059464998953, 2033849074728123, 1442794654840575 #else 56195235, 13857412, 51736253, 6949390, 114729, 24766616, 60832955, 30306712, 48412415, 21499315 #endif }}; static const fe sqrtm1 = {{ #if defined(OPENSSL_64_BIT) 1718705420411056, 234908883556509, 2233514472574048, 2117202627021982, 765476049583133 #else 34513072, 25610706, 9377949, 3500415, 12389472, 33281959, 41962654, 31548777, 326685, 11406482 #endif }}; static const fe d2 = {{ #if defined(OPENSSL_64_BIT) 1859910466990425, 932731440258426, 1072319116312658, 1815898335770999, 633789495995903 #else 45281625, 27714825, 36363642, 13898781, 229458, 15978800, 54557047, 27058993, 29715967, 9444199 #endif }}; #if defined(OPENSSL_SMALL) // This block of code replaces the standard base-point table with a much smaller // one. The standard table is 30,720 bytes while this one is just 960. // // This table contains 15 pairs of group elements, (x, y), where each field // element is serialised with |fe_tobytes|. If |i| is the index of the group // element then consider i+1 as a four-bit number: (i₀, i₁, i₂, i₃) (where i₀ // is the most significant bit). The value of the group element is then: // (i₀×2^192 + i₁×2^128 + i₂×2^64 + i₃)G, where G is the generator. 
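// Worked examples of the index mapping above (illustrative only): for i = 0,
// i+1 = 0b0001, so the first pair below is 1*G, i.e. the generator itself; its
// y coordinate 4/5 mod p serialises to 58 66 66 ... 66, which is visible as the
// second 32-byte half of the first entry. For i = 7, i+1 = 0b1000 gives 2^192*G,
// and the final entry (i = 14, i+1 = 0b1111) is (2^192 + 2^128 + 2^64 + 1)*G.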
static const uint8_t k25519SmallPrecomp[15 * 2 * 32] = { 0x1a, 0xd5, 0x25, 0x8f, 0x60, 0x2d, 0x56, 0xc9, 0xb2, 0xa7, 0x25, 0x95, 0x60, 0xc7, 0x2c, 0x69, 0x5c, 0xdc, 0xd6, 0xfd, 0x31, 0xe2, 0xa4, 0xc0, 0xfe, 0x53, 0x6e, 0xcd, 0xd3, 0x36, 0x69, 0x21, 0x58, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x02, 0xa2, 0xed, 0xf4, 0x8f, 0x6b, 0x0b, 0x3e, 0xeb, 0x35, 0x1a, 0xd5, 0x7e, 0xdb, 0x78, 0x00, 0x96, 0x8a, 0xa0, 0xb4, 0xcf, 0x60, 0x4b, 0xd4, 0xd5, 0xf9, 0x2d, 0xbf, 0x88, 0xbd, 0x22, 0x62, 0x13, 0x53, 0xe4, 0x82, 0x57, 0xfa, 0x1e, 0x8f, 0x06, 0x2b, 0x90, 0xba, 0x08, 0xb6, 0x10, 0x54, 0x4f, 0x7c, 0x1b, 0x26, 0xed, 0xda, 0x6b, 0xdd, 0x25, 0xd0, 0x4e, 0xea, 0x42, 0xbb, 0x25, 0x03, 0xa2, 0xfb, 0xcc, 0x61, 0x67, 0x06, 0x70, 0x1a, 0xc4, 0x78, 0x3a, 0xff, 0x32, 0x62, 0xdd, 0x2c, 0xab, 0x50, 0x19, 0x3b, 0xf2, 0x9b, 0x7d, 0xb8, 0xfd, 0x4f, 0x29, 0x9c, 0xa7, 0x91, 0xba, 0x0e, 0x46, 0x5e, 0x51, 0xfe, 0x1d, 0xbf, 0xe5, 0xe5, 0x9b, 0x95, 0x0d, 0x67, 0xf8, 0xd1, 0xb5, 0x5a, 0xa1, 0x93, 0x2c, 0xc3, 0xde, 0x0e, 0x97, 0x85, 0x2d, 0x7f, 0xea, 0xab, 0x3e, 0x47, 0x30, 0x18, 0x24, 0xe8, 0xb7, 0x60, 0xae, 0x47, 0x80, 0xfc, 0xe5, 0x23, 0xe7, 0xc2, 0xc9, 0x85, 0xe6, 0x98, 0xa0, 0x29, 0x4e, 0xe1, 0x84, 0x39, 0x2d, 0x95, 0x2c, 0xf3, 0x45, 0x3c, 0xff, 0xaf, 0x27, 0x4c, 0x6b, 0xa6, 0xf5, 0x4b, 0x11, 0xbd, 0xba, 0x5b, 0x9e, 0xc4, 0xa4, 0x51, 0x1e, 0xbe, 0xd0, 0x90, 0x3a, 0x9c, 0xc2, 0x26, 0xb6, 0x1e, 0xf1, 0x95, 0x7d, 0xc8, 0x6d, 0x52, 0xe6, 0x99, 0x2c, 0x5f, 0x9a, 0x96, 0x0c, 0x68, 0x29, 0xfd, 0xe2, 0xfb, 0xe6, 0xbc, 0xec, 0x31, 0x08, 0xec, 0xe6, 0xb0, 0x53, 0x60, 0xc3, 0x8c, 0xbe, 0xc1, 0xb3, 0x8a, 0x8f, 0xe4, 0x88, 0x2b, 0x55, 0xe5, 0x64, 0x6e, 0x9b, 0xd0, 0xaf, 0x7b, 0x64, 0x2a, 0x35, 0x25, 0x10, 0x52, 0xc5, 0x9e, 0x58, 0x11, 0x39, 0x36, 0x45, 0x51, 0xb8, 0x39, 0x93, 0xfc, 0x9d, 0x6a, 0xbe, 0x58, 0xcb, 0xa4, 0x0f, 0x51, 0x3c, 0x38, 0x05, 0xca, 0xab, 0x43, 0x63, 0x0e, 0xf3, 0x8b, 0x41, 0xa6, 0xf8, 0x9b, 0x53, 0x70, 0x80, 0x53, 0x86, 0x5e, 0x8f, 0xe3, 0xc3, 0x0d, 0x18, 0xc8, 0x4b, 0x34, 0x1f, 0xd8, 0x1d, 0xbc, 0xf2, 0x6d, 0x34, 0x3a, 0xbe, 0xdf, 0xd9, 0xf6, 0xf3, 0x89, 0xa1, 0xe1, 0x94, 0x9f, 0x5d, 0x4c, 0x5d, 0xe9, 0xa1, 0x49, 0x92, 0xef, 0x0e, 0x53, 0x81, 0x89, 0x58, 0x87, 0xa6, 0x37, 0xf1, 0xdd, 0x62, 0x60, 0x63, 0x5a, 0x9d, 0x1b, 0x8c, 0xc6, 0x7d, 0x52, 0xea, 0x70, 0x09, 0x6a, 0xe1, 0x32, 0xf3, 0x73, 0x21, 0x1f, 0x07, 0x7b, 0x7c, 0x9b, 0x49, 0xd8, 0xc0, 0xf3, 0x25, 0x72, 0x6f, 0x9d, 0xed, 0x31, 0x67, 0x36, 0x36, 0x54, 0x40, 0x92, 0x71, 0xe6, 0x11, 0x28, 0x11, 0xad, 0x93, 0x32, 0x85, 0x7b, 0x3e, 0xb7, 0x3b, 0x49, 0x13, 0x1c, 0x07, 0xb0, 0x2e, 0x93, 0xaa, 0xfd, 0xfd, 0x28, 0x47, 0x3d, 0x8d, 0xd2, 0xda, 0xc7, 0x44, 0xd6, 0x7a, 0xdb, 0x26, 0x7d, 0x1d, 0xb8, 0xe1, 0xde, 0x9d, 0x7a, 0x7d, 0x17, 0x7e, 0x1c, 0x37, 0x04, 0x8d, 0x2d, 0x7c, 0x5e, 0x18, 0x38, 0x1e, 0xaf, 0xc7, 0x1b, 0x33, 0x48, 0x31, 0x00, 0x59, 0xf6, 0xf2, 0xca, 0x0f, 0x27, 0x1b, 0x63, 0x12, 0x7e, 0x02, 0x1d, 0x49, 0xc0, 0x5d, 0x79, 0x87, 0xef, 0x5e, 0x7a, 0x2f, 0x1f, 0x66, 0x55, 0xd8, 0x09, 0xd9, 0x61, 0x38, 0x68, 0xb0, 0x07, 0xa3, 0xfc, 0xcc, 0x85, 0x10, 0x7f, 0x4c, 0x65, 0x65, 0xb3, 0xfa, 0xfa, 0xa5, 0x53, 0x6f, 0xdb, 0x74, 0x4c, 0x56, 0x46, 0x03, 0xe2, 0xd5, 0x7a, 0x29, 0x1c, 0xc6, 0x02, 0xbc, 0x59, 0xf2, 0x04, 0x75, 0x63, 0xc0, 0x84, 0x2f, 0x60, 0x1c, 0x67, 0x76, 0xfd, 0x63, 0x86, 0xf3, 0xfa, 0xbf, 0xdc, 0xd2, 0x2d, 0x90, 0x91, 0xbd, 0x33, 0xa9, 0xe5, 0x66, 0x0c, 0xda, 0x42, 0x27, 0xca, 0xf4, 0x66, 0xc2, 0xec, 0x92, 
0x14, 0x57, 0x06, 0x63, 0xd0, 0x4d, 0x15, 0x06, 0xeb, 0x69, 0x58, 0x4f, 0x77, 0xc5, 0x8b, 0xc7, 0xf0, 0x8e, 0xed, 0x64, 0xa0, 0xb3, 0x3c, 0x66, 0x71, 0xc6, 0x2d, 0xda, 0x0a, 0x0d, 0xfe, 0x70, 0x27, 0x64, 0xf8, 0x27, 0xfa, 0xf6, 0x5f, 0x30, 0xa5, 0x0d, 0x6c, 0xda, 0xf2, 0x62, 0x5e, 0x78, 0x47, 0xd3, 0x66, 0x00, 0x1c, 0xfd, 0x56, 0x1f, 0x5d, 0x3f, 0x6f, 0xf4, 0x4c, 0xd8, 0xfd, 0x0e, 0x27, 0xc9, 0x5c, 0x2b, 0xbc, 0xc0, 0xa4, 0xe7, 0x23, 0x29, 0x02, 0x9f, 0x31, 0xd6, 0xe9, 0xd7, 0x96, 0xf4, 0xe0, 0x5e, 0x0b, 0x0e, 0x13, 0xee, 0x3c, 0x09, 0xed, 0xf2, 0x3d, 0x76, 0x91, 0xc3, 0xa4, 0x97, 0xae, 0xd4, 0x87, 0xd0, 0x5d, 0xf6, 0x18, 0x47, 0x1f, 0x1d, 0x67, 0xf2, 0xcf, 0x63, 0xa0, 0x91, 0x27, 0xf8, 0x93, 0x45, 0x75, 0x23, 0x3f, 0xd1, 0xf1, 0xad, 0x23, 0xdd, 0x64, 0x93, 0x96, 0x41, 0x70, 0x7f, 0xf7, 0xf5, 0xa9, 0x89, 0xa2, 0x34, 0xb0, 0x8d, 0x1b, 0xae, 0x19, 0x15, 0x49, 0x58, 0x23, 0x6d, 0x87, 0x15, 0x4f, 0x81, 0x76, 0xfb, 0x23, 0xb5, 0xea, 0xcf, 0xac, 0x54, 0x8d, 0x4e, 0x42, 0x2f, 0xeb, 0x0f, 0x63, 0xdb, 0x68, 0x37, 0xa8, 0xcf, 0x8b, 0xab, 0xf5, 0xa4, 0x6e, 0x96, 0x2a, 0xb2, 0xd6, 0xbe, 0x9e, 0xbd, 0x0d, 0xb4, 0x42, 0xa9, 0xcf, 0x01, 0x83, 0x8a, 0x17, 0x47, 0x76, 0xc4, 0xc6, 0x83, 0x04, 0x95, 0x0b, 0xfc, 0x11, 0xc9, 0x62, 0xb8, 0x0c, 0x76, 0x84, 0xd9, 0xb9, 0x37, 0xfa, 0xfc, 0x7c, 0xc2, 0x6d, 0x58, 0x3e, 0xb3, 0x04, 0xbb, 0x8c, 0x8f, 0x48, 0xbc, 0x91, 0x27, 0xcc, 0xf9, 0xb7, 0x22, 0x19, 0x83, 0x2e, 0x09, 0xb5, 0x72, 0xd9, 0x54, 0x1c, 0x4d, 0xa1, 0xea, 0x0b, 0xf1, 0xc6, 0x08, 0x72, 0x46, 0x87, 0x7a, 0x6e, 0x80, 0x56, 0x0a, 0x8a, 0xc0, 0xdd, 0x11, 0x6b, 0xd6, 0xdd, 0x47, 0xdf, 0x10, 0xd9, 0xd8, 0xea, 0x7c, 0xb0, 0x8f, 0x03, 0x00, 0x2e, 0xc1, 0x8f, 0x44, 0xa8, 0xd3, 0x30, 0x06, 0x89, 0xa2, 0xf9, 0x34, 0xad, 0xdc, 0x03, 0x85, 0xed, 0x51, 0xa7, 0x82, 0x9c, 0xe7, 0x5d, 0x52, 0x93, 0x0c, 0x32, 0x9a, 0x5b, 0xe1, 0xaa, 0xca, 0xb8, 0x02, 0x6d, 0x3a, 0xd4, 0xb1, 0x3a, 0xf0, 0x5f, 0xbe, 0xb5, 0x0d, 0x10, 0x6b, 0x38, 0x32, 0xac, 0x76, 0x80, 0xbd, 0xca, 0x94, 0x71, 0x7a, 0xf2, 0xc9, 0x35, 0x2a, 0xde, 0x9f, 0x42, 0x49, 0x18, 0x01, 0xab, 0xbc, 0xef, 0x7c, 0x64, 0x3f, 0x58, 0x3d, 0x92, 0x59, 0xdb, 0x13, 0xdb, 0x58, 0x6e, 0x0a, 0xe0, 0xb7, 0x91, 0x4a, 0x08, 0x20, 0xd6, 0x2e, 0x3c, 0x45, 0xc9, 0x8b, 0x17, 0x79, 0xe7, 0xc7, 0x90, 0x99, 0x3a, 0x18, 0x25, }; #else // k25519Precomp[i][j] = (j+1)*256^i*B const uint8_t k25519Precomp[32][8][3][32] = { { { {0x85, 0x3b, 0x8c, 0xf5, 0xc6, 0x93, 0xbc, 0x2f, 0x19, 0xe, 0x8c, 0xfb, 0xc6, 0x2d, 0x93, 0xcf, 0xc2, 0x42, 0x3d, 0x64, 0x98, 0x48, 0xb, 0x27, 0x65, 0xba, 0xd4, 0x33, 0x3a, 0x9d, 0xcf, 0x7}, {0x3e, 0x91, 0x40, 0xd7, 0x5, 0x39, 0x10, 0x9d, 0xb3, 0xbe, 0x40, 0xd1, 0x5, 0x9f, 0x39, 0xfd, 0x9, 0x8a, 0x8f, 0x68, 0x34, 0x84, 0xc1, 0xa5, 0x67, 0x12, 0xf8, 0x98, 0x92, 0x2f, 0xfd, 0x44}, {0x68, 0xaa, 0x7a, 0x87, 0x5, 0x12, 0xc9, 0xab, 0x9e, 0xc4, 0xaa, 0xcc, 0x23, 0xe8, 0xd9, 0x26, 0x8c, 0x59, 0x43, 0xdd, 0xcb, 0x7d, 0x1b, 0x5a, 0xa8, 0x65, 0xc, 0x9f, 0x68, 0x7b, 0x11, 0x6f}, }, { {0xd7, 0x71, 0x3c, 0x93, 0xfc, 0xe7, 0x24, 0x92, 0xb5, 0xf5, 0xf, 0x7a, 0x96, 0x9d, 0x46, 0x9f, 0x2, 0x7, 0xd6, 0xe1, 0x65, 0x9a, 0xa6, 0x5a, 0x2e, 0x2e, 0x7d, 0xa8, 0x3f, 0x6, 0xc, 0x59}, {0xa8, 0xd5, 0xb4, 0x42, 0x60, 0xa5, 0x99, 0x8a, 0xf6, 0xac, 0x60, 0x4e, 0xc, 0x81, 0x2b, 0x8f, 0xaa, 0x37, 0x6e, 0xb1, 0x6b, 0x23, 0x9e, 0xe0, 0x55, 0x25, 0xc9, 0x69, 0xa6, 0x95, 0xb5, 0x6b}, {0x5f, 0x7a, 0x9b, 0xa5, 0xb3, 0xa8, 0xfa, 0x43, 0x78, 0xcf, 0x9a, 0x5d, 0xdd, 0x6b, 0xc1, 0x36, 0x31, 0x6a, 0x3d, 0xb, 0x84, 0xa0, 0xf, 0x50, 0x73, 0xb, 0xa5, 0x3e, 0xb1, 0xf5, 0x1a, 0x70}, }, { {0x30, 0x97, 0xee, 0x4c, 0xa8, 0xb0, 
0x25, 0xaf, 0x8a, 0x4b, 0x86, 0xe8, 0x30, 0x84, 0x5a, 0x2, 0x32, 0x67, 0x1, 0x9f, 0x2, 0x50, 0x1b, 0xc1, 0xf4, 0xf8, 0x80, 0x9a, 0x1b, 0x4e, 0x16, 0x7a}, {0x65, 0xd2, 0xfc, 0xa4, 0xe8, 0x1f, 0x61, 0x56, 0x7d, 0xba, 0xc1, 0xe5, 0xfd, 0x53, 0xd3, 0x3b, 0xbd, 0xd6, 0x4b, 0x21, 0x1a, 0xf3, 0x31, 0x81, 0x62, 0xda, 0x5b, 0x55, 0x87, 0x15, 0xb9, 0x2a}, {0x89, 0xd8, 0xd0, 0xd, 0x3f, 0x93, 0xae, 0x14, 0x62, 0xda, 0x35, 0x1c, 0x22, 0x23, 0x94, 0x58, 0x4c, 0xdb, 0xf2, 0x8c, 0x45, 0xe5, 0x70, 0xd1, 0xc6, 0xb4, 0xb9, 0x12, 0xaf, 0x26, 0x28, 0x5a}, }, { {0x9f, 0x9, 0xfc, 0x8e, 0xb9, 0x51, 0x73, 0x28, 0x38, 0x25, 0xfd, 0x7d, 0xf4, 0xc6, 0x65, 0x67, 0x65, 0x92, 0xa, 0xfb, 0x3d, 0x8d, 0x34, 0xca, 0x27, 0x87, 0xe5, 0x21, 0x3, 0x91, 0xe, 0x68}, {0xbf, 0x18, 0x68, 0x5, 0xa, 0x5, 0xfe, 0x95, 0xa9, 0xfa, 0x60, 0x56, 0x71, 0x89, 0x7e, 0x32, 0x73, 0x50, 0xa0, 0x6, 0xcd, 0xe3, 0xe8, 0xc3, 0x9a, 0xa4, 0x45, 0x74, 0x4c, 0x3f, 0x93, 0x27}, {0x9, 0xff, 0x76, 0xc4, 0xe9, 0xfb, 0x13, 0x5a, 0x72, 0xc1, 0x5c, 0x7b, 0x45, 0x39, 0x9e, 0x6e, 0x94, 0x44, 0x2b, 0x10, 0xf9, 0xdc, 0xdb, 0x5d, 0x2b, 0x3e, 0x55, 0x63, 0xbf, 0xc, 0x9d, 0x7f}, }, { {0x33, 0xbb, 0xa5, 0x8, 0x44, 0xbc, 0x12, 0xa2, 0x2, 0xed, 0x5e, 0xc7, 0xc3, 0x48, 0x50, 0x8d, 0x44, 0xec, 0xbf, 0x5a, 0xc, 0xeb, 0x1b, 0xdd, 0xeb, 0x6, 0xe2, 0x46, 0xf1, 0xcc, 0x45, 0x29}, {0xba, 0xd6, 0x47, 0xa4, 0xc3, 0x82, 0x91, 0x7f, 0xb7, 0x29, 0x27, 0x4b, 0xd1, 0x14, 0x0, 0xd5, 0x87, 0xa0, 0x64, 0xb8, 0x1c, 0xf1, 0x3c, 0xe3, 0xf3, 0x55, 0x1b, 0xeb, 0x73, 0x7e, 0x4a, 0x15}, {0x85, 0x82, 0x2a, 0x81, 0xf1, 0xdb, 0xbb, 0xbc, 0xfc, 0xd1, 0xbd, 0xd0, 0x7, 0x8, 0xe, 0x27, 0x2d, 0xa7, 0xbd, 0x1b, 0xb, 0x67, 0x1b, 0xb4, 0x9a, 0xb6, 0x3b, 0x6b, 0x69, 0xbe, 0xaa, 0x43}, }, { {0x31, 0x71, 0x15, 0x77, 0xeb, 0xee, 0xc, 0x3a, 0x88, 0xaf, 0xc8, 0x0, 0x89, 0x15, 0x27, 0x9b, 0x36, 0xa7, 0x59, 0xda, 0x68, 0xb6, 0x65, 0x80, 0xbd, 0x38, 0xcc, 0xa2, 0xb6, 0x7b, 0xe5, 0x51}, {0xa4, 0x8c, 0x7d, 0x7b, 0xb6, 0x6, 0x98, 0x49, 0x39, 0x27, 0xd2, 0x27, 0x84, 0xe2, 0x5b, 0x57, 0xb9, 0x53, 0x45, 0x20, 0xe7, 0x5c, 0x8, 0xbb, 0x84, 0x78, 0x41, 0xae, 0x41, 0x4c, 0xb6, 0x38}, {0x71, 0x4b, 0xea, 0x2, 0x67, 0x32, 0xac, 0x85, 0x1, 0xbb, 0xa1, 0x41, 0x3, 0xe0, 0x70, 0xbe, 0x44, 0xc1, 0x3b, 0x8, 0x4b, 0xa2, 0xe4, 0x53, 0xe3, 0x61, 0xd, 0x9f, 0x1a, 0xe9, 0xb8, 0x10}, }, { {0xbf, 0xa3, 0x4e, 0x94, 0xd0, 0x5c, 0x1a, 0x6b, 0xd2, 0xc0, 0x9d, 0xb3, 0x3a, 0x35, 0x70, 0x74, 0x49, 0x2e, 0x54, 0x28, 0x82, 0x52, 0xb2, 0x71, 0x7e, 0x92, 0x3c, 0x28, 0x69, 0xea, 0x1b, 0x46}, {0xb1, 0x21, 0x32, 0xaa, 0x9a, 0x2c, 0x6f, 0xba, 0xa7, 0x23, 0xba, 0x3b, 0x53, 0x21, 0xa0, 0x6c, 0x3a, 0x2c, 0x19, 0x92, 0x4f, 0x76, 0xea, 0x9d, 0xe0, 0x17, 0x53, 0x2e, 0x5d, 0xdd, 0x6e, 0x1d}, {0xa2, 0xb3, 0xb8, 0x1, 0xc8, 0x6d, 0x83, 0xf1, 0x9a, 0xa4, 0x3e, 0x5, 0x47, 0x5f, 0x3, 0xb3, 0xf3, 0xad, 0x77, 0x58, 0xba, 0x41, 0x9c, 0x52, 0xa7, 0x90, 0xf, 0x6a, 0x1c, 0xbb, 0x9f, 0x7a}, }, { {0x8f, 0x3e, 0xdd, 0x4, 0x66, 0x59, 0xb7, 0x59, 0x2c, 0x70, 0x88, 0xe2, 0x77, 0x3, 0xb3, 0x6c, 0x23, 0xc3, 0xd9, 0x5e, 0x66, 0x9c, 0x33, 0xb1, 0x2f, 0xe5, 0xbc, 0x61, 0x60, 0xe7, 0x15, 0x9}, {0xd9, 0x34, 0x92, 0xf3, 0xed, 0x5d, 0xa7, 0xe2, 0xf9, 0x58, 0xb5, 0xe1, 0x80, 0x76, 0x3d, 0x96, 0xfb, 0x23, 0x3c, 0x6e, 0xac, 0x41, 0x27, 0x2c, 0xc3, 0x1, 0xe, 0x32, 0xa1, 0x24, 0x90, 0x3a}, {0x1a, 0x91, 0xa2, 0xc9, 0xd9, 0xf5, 0xc1, 0xe7, 0xd7, 0xa7, 0xcc, 0x8b, 0x78, 0x71, 0xa3, 0xb8, 0x32, 0x2a, 0xb6, 0xe, 0x19, 0x12, 0x64, 0x63, 0x95, 0x4e, 0xcc, 0x2e, 0x5c, 0x7c, 0x90, 0x26}, }, }, { { {0x1d, 0x9c, 0x2f, 0x63, 0xe, 0xdd, 0xcc, 0x2e, 0x15, 0x31, 0x89, 0x76, 0x96, 0xb6, 0xd0, 0x51, 0x58, 
0x7a, 0x63, 0xa8, 0x6b, 0xb7, 0xdf, 0x52, 0x39, 0xef, 0xe, 0xa0, 0x49, 0x7d, 0xd3, 0x6d}, {0x5e, 0x51, 0xaa, 0x49, 0x54, 0x63, 0x5b, 0xed, 0x3a, 0x82, 0xc6, 0xb, 0x9f, 0xc4, 0x65, 0xa8, 0xc4, 0xd1, 0x42, 0x5b, 0xe9, 0x1f, 0xc, 0x85, 0xb9, 0x15, 0xd3, 0x3, 0x6f, 0x6d, 0xd7, 0x30}, {0xc7, 0xe4, 0x6, 0x21, 0x17, 0x44, 0x44, 0x6c, 0x69, 0x7f, 0x8d, 0x92, 0x80, 0xd6, 0x53, 0xfb, 0x26, 0x3f, 0x4d, 0x69, 0xa4, 0x9e, 0x73, 0xb4, 0xb0, 0x4b, 0x86, 0x2e, 0x11, 0x97, 0xc6, 0x10}, }, { {0x5, 0xc8, 0x58, 0x83, 0xa0, 0x2a, 0xa6, 0xc, 0x47, 0x42, 0x20, 0x7a, 0xe3, 0x4a, 0x3d, 0x6a, 0xdc, 0xed, 0x11, 0x3b, 0xa6, 0xd3, 0x64, 0x74, 0xef, 0x6, 0x8, 0x55, 0xaf, 0x9b, 0xbf, 0x3}, {0xde, 0x5f, 0xbe, 0x7d, 0x27, 0xc4, 0x93, 0x64, 0xa2, 0x7e, 0xad, 0x19, 0xad, 0x4f, 0x5d, 0x26, 0x90, 0x45, 0x30, 0x46, 0xc8, 0xdf, 0x0, 0xe, 0x9, 0xfe, 0x66, 0xed, 0xab, 0x1c, 0xe6, 0x25}, {0x4, 0x66, 0x58, 0xcc, 0x28, 0xe1, 0x13, 0x3f, 0x7e, 0x74, 0x59, 0xb4, 0xec, 0x73, 0x58, 0x6f, 0xf5, 0x68, 0x12, 0xcc, 0xed, 0x3d, 0xb6, 0xa0, 0x2c, 0xe2, 0x86, 0x45, 0x63, 0x78, 0x6d, 0x56}, }, { {0xd0, 0x2f, 0x5a, 0xc6, 0x85, 0x42, 0x5, 0xa1, 0xc3, 0x67, 0x16, 0xf3, 0x2a, 0x11, 0x64, 0x6c, 0x58, 0xee, 0x1a, 0x73, 0x40, 0xe2, 0xa, 0x68, 0x2a, 0xb2, 0x93, 0x47, 0xf3, 0xa5, 0xfb, 0x14}, {0x34, 0x8, 0xc1, 0x9c, 0x9f, 0xa4, 0x37, 0x16, 0x51, 0xc4, 0x9b, 0xa8, 0xd5, 0x56, 0x8e, 0xbc, 0xdb, 0xd2, 0x7f, 0x7f, 0xf, 0xec, 0xb5, 0x1c, 0xd9, 0x35, 0xcc, 0x5e, 0xca, 0x5b, 0x97, 0x33}, {0xd4, 0xf7, 0x85, 0x69, 0x16, 0x46, 0xd7, 0x3c, 0x57, 0x0, 0xc8, 0xc9, 0x84, 0x5e, 0x3e, 0x59, 0x1e, 0x13, 0x61, 0x7b, 0xb6, 0xf2, 0xc3, 0x2f, 0x6c, 0x52, 0xfc, 0x83, 0xea, 0x9c, 0x82, 0x14}, }, { {0xb8, 0xec, 0x71, 0x4e, 0x2f, 0xb, 0xe7, 0x21, 0xe3, 0x77, 0xa4, 0x40, 0xb9, 0xdd, 0x56, 0xe6, 0x80, 0x4f, 0x1d, 0xce, 0xce, 0x56, 0x65, 0xbf, 0x7e, 0x7b, 0x5d, 0x53, 0xc4, 0x3b, 0xfc, 0x5}, {0xc2, 0x95, 0xdd, 0x97, 0x84, 0x7b, 0x43, 0xff, 0xa7, 0xb5, 0x4e, 0xaa, 0x30, 0x4e, 0x74, 0x6c, 0x8b, 0xe8, 0x85, 0x3c, 0x61, 0x5d, 0xc, 0x9e, 0x73, 0x81, 0x75, 0x5f, 0x1e, 0xc7, 0xd9, 0x2f}, {0xdd, 0xde, 0xaf, 0x52, 0xae, 0xb3, 0xb8, 0x24, 0xcf, 0x30, 0x3b, 0xed, 0x8c, 0x63, 0x95, 0x34, 0x95, 0x81, 0xbe, 0xa9, 0x83, 0xbc, 0xa4, 0x33, 0x4, 0x1f, 0x65, 0x5c, 0x47, 0x67, 0x37, 0x37}, }, { {0x90, 0x65, 0x24, 0x14, 0xcb, 0x95, 0x40, 0x63, 0x35, 0x55, 0xc1, 0x16, 0x40, 0x14, 0x12, 0xef, 0x60, 0xbc, 0x10, 0x89, 0xc, 0x14, 0x38, 0x9e, 0x8c, 0x7c, 0x90, 0x30, 0x57, 0x90, 0xf5, 0x6b}, {0xd9, 0xad, 0xd1, 0x40, 0xfd, 0x99, 0xba, 0x2f, 0x27, 0xd0, 0xf4, 0x96, 0x6f, 0x16, 0x7, 0xb3, 0xae, 0x3b, 0xf0, 0x15, 0x52, 0xf0, 0x63, 0x43, 0x99, 0xf9, 0x18, 0x3b, 0x6c, 0xa5, 0xbe, 0x1f}, {0x8a, 0x5b, 0x41, 0xe1, 0xf1, 0x78, 0xa7, 0xf, 0x7e, 0xa7, 0xc3, 0xba, 0xf7, 0x9f, 0x40, 0x6, 0x50, 0x9a, 0xa2, 0x9a, 0xb8, 0xd7, 0x52, 0x6f, 0x56, 0x5a, 0x63, 0x7a, 0xf6, 0x1c, 0x52, 0x2}, }, { {0xe4, 0x5e, 0x2f, 0x77, 0x20, 0x67, 0x14, 0xb1, 0xce, 0x9a, 0x7, 0x96, 0xb1, 0x94, 0xf8, 0xe8, 0x4a, 0x82, 0xac, 0x0, 0x4d, 0x22, 0xf8, 0x4a, 0xc4, 0x6c, 0xcd, 0xf7, 0xd9, 0x53, 0x17, 0x0}, {0x94, 0x52, 0x9d, 0xa, 0xb, 0xee, 0x3f, 0x51, 0x66, 0x5a, 0xdf, 0xf, 0x5c, 0xe7, 0x98, 0x8f, 0xce, 0x7, 0xe1, 0xbf, 0x88, 0x86, 0x61, 0xd4, 0xed, 0x2c, 0x38, 0x71, 0x7e, 0xa, 0xa0, 0x3f}, {0x34, 0xdb, 0x3d, 0x96, 0x2d, 0x23, 0x69, 0x3c, 0x58, 0x38, 0x97, 0xb4, 0xda, 0x87, 0xde, 0x1d, 0x85, 0xf2, 0x91, 0xa0, 0xf9, 0xd1, 0xd7, 0xaa, 0xb6, 0xed, 0x48, 0xa0, 0x2f, 0xfe, 0xb5, 0x12}, }, { {0x92, 0x1e, 0x6f, 0xad, 0x26, 0x7c, 0x2b, 0xdf, 0x13, 0x89, 0x4b, 0x50, 0x23, 0xd3, 0x66, 0x4b, 0xc3, 0x8b, 0x1c, 0x75, 0xc0, 0x9d, 0x40, 0x8c, 0xb8, 0xc7, 0x96, 0x7, 
0xc2, 0x93, 0x7e, 0x6f}, {0x4d, 0xe3, 0xfc, 0x96, 0xc4, 0xfb, 0xf0, 0x71, 0xed, 0x5b, 0xf3, 0xad, 0x6b, 0x82, 0xb9, 0x73, 0x61, 0xc5, 0x28, 0xff, 0x61, 0x72, 0x4, 0xd2, 0x6f, 0x20, 0xb1, 0x6f, 0xf9, 0x76, 0x9b, 0x74}, {0x5, 0xae, 0xa6, 0xae, 0x4, 0xf6, 0x5a, 0x1f, 0x99, 0x9c, 0xe4, 0xbe, 0xf1, 0x51, 0x23, 0xc1, 0x66, 0x6b, 0xff, 0xee, 0xb5, 0x8, 0xa8, 0x61, 0x51, 0x21, 0xe0, 0x1, 0xf, 0xc1, 0xce, 0xf}, }, { {0x45, 0x4e, 0x24, 0xc4, 0x9d, 0xd2, 0xf2, 0x3d, 0xa, 0xde, 0xd8, 0x93, 0x74, 0xe, 0x2, 0x2b, 0x4d, 0x21, 0xc, 0x82, 0x7e, 0x6, 0xc8, 0x6c, 0xa, 0xb9, 0xea, 0x6f, 0x16, 0x79, 0x37, 0x41}, {0x44, 0x1e, 0xfe, 0x49, 0xa6, 0x58, 0x4d, 0x64, 0x7e, 0x77, 0xad, 0x31, 0xa2, 0xae, 0xfc, 0x21, 0xd2, 0xd0, 0x7f, 0x88, 0x5a, 0x1c, 0x44, 0x2, 0xf3, 0x11, 0xc5, 0x83, 0x71, 0xaa, 0x1, 0x49}, {0xf0, 0xf8, 0x1a, 0x8c, 0x54, 0xb7, 0xb1, 0x8, 0xb4, 0x99, 0x62, 0x24, 0x7c, 0x7a, 0xf, 0xce, 0x39, 0xd9, 0x6, 0x1e, 0xf9, 0xb0, 0x60, 0xf7, 0x13, 0x12, 0x6d, 0x72, 0x7b, 0x88, 0xbb, 0x41}, }, }, { { {0xae, 0x91, 0x66, 0x7c, 0x59, 0x4c, 0x23, 0x7e, 0xc8, 0xb4, 0x85, 0xa, 0x3d, 0x9d, 0x88, 0x64, 0xe7, 0xfa, 0x4a, 0x35, 0xc, 0xc9, 0xe2, 0xda, 0x1d, 0x9e, 0x6a, 0xc, 0x7, 0x1e, 0x87, 0xa}, {0xbe, 0x46, 0x43, 0x74, 0x44, 0x7d, 0xe8, 0x40, 0x25, 0x2b, 0xb5, 0x15, 0xd4, 0xda, 0x48, 0x1d, 0x3e, 0x60, 0x3b, 0xa1, 0x18, 0x8a, 0x3a, 0x7c, 0xf7, 0xbd, 0xcd, 0x2f, 0xc1, 0x28, 0xb7, 0x4e}, {0x89, 0x89, 0xbc, 0x4b, 0x99, 0xb5, 0x1, 0x33, 0x60, 0x42, 0xdd, 0x5b, 0x3a, 0xae, 0x6b, 0x73, 0x3c, 0x9e, 0xd5, 0x19, 0xe2, 0xad, 0x61, 0xd, 0x64, 0xd4, 0x85, 0x26, 0xf, 0x30, 0xe7, 0x3e}, }, { {0x18, 0x75, 0x1e, 0x84, 0x47, 0x79, 0xfa, 0x43, 0xd7, 0x46, 0x9c, 0x63, 0x59, 0xfa, 0xc6, 0xe5, 0x74, 0x2b, 0x5, 0xe3, 0x1d, 0x5e, 0x6, 0xa1, 0x30, 0x90, 0xb8, 0xcf, 0xa2, 0xc6, 0x47, 0x7d}, {0xb7, 0xd6, 0x7d, 0x9e, 0xe4, 0x55, 0xd2, 0xf5, 0xac, 0x1e, 0xb, 0x61, 0x5c, 0x11, 0x16, 0x80, 0xca, 0x87, 0xe1, 0x92, 0x5d, 0x97, 0x99, 0x3c, 0xc2, 0x25, 0x91, 0x97, 0x62, 0x57, 0x81, 0x13}, {0xe0, 0xd6, 0xf0, 0x8e, 0x14, 0xd0, 0xda, 0x3f, 0x3c, 0x6f, 0x54, 0x91, 0x9a, 0x74, 0x3e, 0x9d, 0x57, 0x81, 0xbb, 0x26, 0x10, 0x62, 0xec, 0x71, 0x80, 0xec, 0xc9, 0x34, 0x8d, 0xf5, 0x8c, 0x14}, }, { {0x6d, 0x75, 0xe4, 0x9a, 0x7d, 0x2f, 0x57, 0xe2, 0x7f, 0x48, 0xf3, 0x88, 0xbb, 0x45, 0xc3, 0x56, 0x8d, 0xa8, 0x60, 0x69, 0x6d, 0xb, 0xd1, 0x9f, 0xb9, 0xa1, 0xae, 0x4e, 0xad, 0xeb, 0x8f, 0x27}, {0x27, 0xf0, 0x34, 0x79, 0xf6, 0x92, 0xa4, 0x46, 0xa9, 0xa, 0x84, 0xf6, 0xbe, 0x84, 0x99, 0x46, 0x54, 0x18, 0x61, 0x89, 0x2a, 0xbc, 0xa1, 0x5c, 0xd4, 0xbb, 0x5d, 0xbd, 0x1e, 0xfa, 0xf2, 0x3f}, {0x66, 0x39, 0x93, 0x8c, 0x1f, 0x68, 0xaa, 0xb1, 0x98, 0xc, 0x29, 0x20, 0x9c, 0x94, 0x21, 0x8c, 0x52, 0x3c, 0x9d, 0x21, 0x91, 0x52, 0x11, 0x39, 0x7b, 0x67, 0x9c, 0xfe, 0x2, 0xdd, 0x4, 0x41}, }, { {0xb8, 0x6a, 0x9, 0xdb, 0x6, 0x4e, 0x21, 0x81, 0x35, 0x4f, 0xe4, 0xc, 0xc9, 0xb6, 0xa8, 0x21, 0xf5, 0x2a, 0x9e, 0x40, 0x2a, 0xc1, 0x24, 0x65, 0x81, 0xa4, 0xfc, 0x8e, 0xa4, 0xb5, 0x65, 0x1}, {0x2a, 0x42, 0x24, 0x11, 0x5e, 0xbf, 0xb2, 0x72, 0xb5, 0x3a, 0xa3, 0x98, 0x33, 0xc, 0xfa, 0xa1, 0x66, 0xb6, 0x52, 0xfa, 0x1, 0x61, 0xcb, 0x94, 0xd5, 0x53, 0xaf, 0xaf, 0x0, 0x3b, 0x86, 0x2c}, {0x76, 0x6a, 0x84, 0xa0, 0x74, 0xa4, 0x90, 0xf1, 0xc0, 0x7c, 0x2f, 0xcd, 0x84, 0xf9, 0xef, 0x12, 0x8f, 0x2b, 0xaa, 0x58, 0x6, 0x29, 0x5e, 0x69, 0xb8, 0xc8, 0xfe, 0xbf, 0xd9, 0x67, 0x1b, 0x59}, }, { {0x5d, 0xb5, 0x18, 0x9f, 0x71, 0xb3, 0xb9, 0x99, 0x1e, 0x64, 0x8c, 0xa1, 0xfa, 0xe5, 0x65, 0xe4, 0xed, 0x5, 0x9f, 0xc2, 0x36, 0x11, 0x8, 0x61, 0x8b, 0x12, 0x30, 0x70, 0x86, 0x4f, 0x9b, 0x48}, {0xfa, 0x9b, 0xb4, 0x80, 0x1c, 0xd, 0x2f, 
0x31, 0x8a, 0xec, 0xf3, 0xab, 0x5e, 0x51, 0x79, 0x59, 0x88, 0x1c, 0xf0, 0x9e, 0xc0, 0x33, 0x70, 0x72, 0xcb, 0x7b, 0x8f, 0xca, 0xc7, 0x2e, 0xe0, 0x3d}, {0xef, 0x92, 0xeb, 0x3a, 0x2d, 0x10, 0x32, 0xd2, 0x61, 0xa8, 0x16, 0x61, 0xb4, 0x53, 0x62, 0xe1, 0x24, 0xaa, 0xb, 0x19, 0xe7, 0xab, 0x7e, 0x3d, 0xbf, 0xbe, 0x6c, 0x49, 0xba, 0xfb, 0xf5, 0x49}, }, { {0x2e, 0x57, 0x9c, 0x1e, 0x8c, 0x62, 0x5d, 0x15, 0x41, 0x47, 0x88, 0xc5, 0xac, 0x86, 0x4d, 0x8a, 0xeb, 0x63, 0x57, 0x51, 0xf6, 0x52, 0xa3, 0x91, 0x5b, 0x51, 0x67, 0x88, 0xc2, 0xa6, 0xa1, 0x6}, {0xd4, 0xcf, 0x5b, 0x8a, 0x10, 0x9a, 0x94, 0x30, 0xeb, 0x73, 0x64, 0xbc, 0x70, 0xdd, 0x40, 0xdc, 0x1c, 0xd, 0x7c, 0x30, 0xc1, 0x94, 0xc2, 0x92, 0x74, 0x6e, 0xfa, 0xcb, 0x6d, 0xa8, 0x4, 0x56}, {0xb6, 0x64, 0x17, 0x7c, 0xd4, 0xd1, 0x88, 0x72, 0x51, 0x8b, 0x41, 0xe0, 0x40, 0x11, 0x54, 0x72, 0xd1, 0xf6, 0xac, 0x18, 0x60, 0x1a, 0x3, 0x9f, 0xc6, 0x42, 0x27, 0xfe, 0x89, 0x9e, 0x98, 0x20}, }, { {0x2e, 0xec, 0xea, 0x85, 0x8b, 0x27, 0x74, 0x16, 0xdf, 0x2b, 0xcb, 0x7a, 0x7, 0xdc, 0x21, 0x56, 0x5a, 0xf4, 0xcb, 0x61, 0x16, 0x4c, 0xa, 0x64, 0xd3, 0x95, 0x5, 0xf7, 0x50, 0x99, 0xb, 0x73}, {0x7f, 0xcc, 0x2d, 0x3a, 0xfd, 0x77, 0x97, 0x49, 0x92, 0xd8, 0x4f, 0xa5, 0x2c, 0x7c, 0x85, 0x32, 0xa0, 0xe3, 0x7, 0xd2, 0x64, 0xd8, 0x79, 0xa2, 0x29, 0x7e, 0xa6, 0xc, 0x1d, 0xed, 0x3, 0x4}, {0x52, 0xc5, 0x4e, 0x87, 0x35, 0x2d, 0x4b, 0xc9, 0x8d, 0x6f, 0x24, 0x98, 0xcf, 0xc8, 0xe6, 0xc5, 0xce, 0x35, 0xc0, 0x16, 0xfa, 0x46, 0xcb, 0xf7, 0xcc, 0x3d, 0x30, 0x8, 0x43, 0x45, 0xd7, 0x5b}, }, { {0x2a, 0x79, 0xe7, 0x15, 0x21, 0x93, 0xc4, 0x85, 0xc9, 0xdd, 0xcd, 0xbd, 0xa2, 0x89, 0x4c, 0xc6, 0x62, 0xd7, 0xa3, 0xad, 0xa8, 0x3d, 0x1e, 0x9d, 0x2c, 0xf8, 0x67, 0x30, 0x12, 0xdb, 0xb7, 0x5b}, {0xc2, 0x4c, 0xb2, 0x28, 0x95, 0xd1, 0x9a, 0x7f, 0x81, 0xc1, 0x35, 0x63, 0x65, 0x54, 0x6b, 0x7f, 0x36, 0x72, 0xc0, 0x4f, 0x6e, 0xb6, 0xb8, 0x66, 0x83, 0xad, 0x80, 0x73, 0x0, 0x78, 0x3a, 0x13}, {0xbe, 0x62, 0xca, 0xc6, 0x67, 0xf4, 0x61, 0x9, 0xee, 0x52, 0x19, 0x21, 0xd6, 0x21, 0xec, 0x4, 0x70, 0x47, 0xd5, 0x9b, 0x77, 0x60, 0x23, 0x18, 0xd2, 0xe0, 0xf0, 0x58, 0x6d, 0xca, 0xd, 0x74}, }, }, { { {0x3c, 0x43, 0x78, 0x4, 0x57, 0x8c, 0x1a, 0x23, 0x9d, 0x43, 0x81, 0xc2, 0xe, 0x27, 0xb5, 0xb7, 0x9f, 0x7, 0xd9, 0xe3, 0xea, 0x99, 0xaa, 0xdb, 0xd9, 0x3, 0x2b, 0x6c, 0x25, 0xf5, 0x3, 0x2c}, {0x4e, 0xce, 0xcf, 0x52, 0x7, 0xee, 0x48, 0xdf, 0xb7, 0x8, 0xec, 0x6, 0xf3, 0xfa, 0xff, 0xc3, 0xc4, 0x59, 0x54, 0xb9, 0x2a, 0xb, 0x71, 0x5, 0x8d, 0xa3, 0x3e, 0x96, 0xfa, 0x25, 0x1d, 0x16}, {0x7d, 0xa4, 0x53, 0x7b, 0x75, 0x18, 0xf, 0x79, 0x79, 0x58, 0xc, 0xcf, 0x30, 0x1, 0x7b, 0x30, 0xf9, 0xf7, 0x7e, 0x25, 0x77, 0x3d, 0x90, 0x31, 0xaf, 0xbb, 0x96, 0xbd, 0xbd, 0x68, 0x94, 0x69}, }, { {0x48, 0x19, 0xa9, 0x6a, 0xe6, 0x3d, 0xdd, 0xd8, 0xcc, 0xd2, 0xc0, 0x2f, 0xc2, 0x64, 0x50, 0x48, 0x2f, 0xea, 0xfd, 0x34, 0x66, 0x24, 0x48, 0x9b, 0x3a, 0x2e, 0x4a, 0x6c, 0x4e, 0x1c, 0x3e, 0x29}, {0xcf, 0xfe, 0xda, 0xf4, 0x46, 0x2f, 0x1f, 0xbd, 0xf7, 0xd6, 0x7f, 0xa4, 0x14, 0x1, 0xef, 0x7c, 0x7f, 0xb3, 0x47, 0x4a, 0xda, 0xfd, 0x1f, 0xd3, 0x85, 0x57, 0x90, 0x73, 0xa4, 0x19, 0x52, 0x52}, {0xe1, 0x12, 0x51, 0x92, 0x4b, 0x13, 0x6e, 0x37, 0xa0, 0x5d, 0xa1, 0xdc, 0xb5, 0x78, 0x37, 0x70, 0x11, 0x31, 0x1c, 0x46, 0xaf, 0x89, 0x45, 0xb0, 0x23, 0x28, 0x3, 0x7f, 0x44, 0x5c, 0x60, 0x5b}, }, { {0x4c, 0xf0, 0xe7, 0xf0, 0xc6, 0xfe, 0xe9, 0x3b, 0x62, 0x49, 0xe3, 0x75, 0x9e, 0x57, 0x6a, 0x86, 0x1a, 0xe6, 0x1d, 0x1e, 0x16, 0xef, 0x42, 0x55, 0xd5, 0xbd, 0x5a, 0xcc, 0xf4, 0xfe, 0x12, 0x2f}, {0x89, 0x7c, 0xc4, 0x20, 0x59, 0x80, 0x65, 0xb9, 0xcc, 0x8f, 0x3b, 0x92, 0xc, 0x10, 0xf0, 0xe7, 0x77, 
0xef, 0xe2, 0x2, 0x65, 0x25, 0x1, 0x0, 0xee, 0xb3, 0xae, 0xa8, 0xce, 0x6d, 0xa7, 0x24}, {0x40, 0xc7, 0xc0, 0xdf, 0xb2, 0x22, 0x45, 0xa, 0x7, 0xa4, 0xc9, 0x40, 0x7f, 0x6e, 0xd0, 0x10, 0x68, 0xf6, 0xcf, 0x78, 0x41, 0x14, 0xcf, 0xc6, 0x90, 0x37, 0xa4, 0x18, 0x25, 0x7b, 0x60, 0x5e}, }, { {0x14, 0xcf, 0x96, 0xa5, 0x1c, 0x43, 0x2c, 0xa0, 0x0, 0xe4, 0xd3, 0xae, 0x40, 0x2d, 0xc4, 0xe3, 0xdb, 0x26, 0xf, 0x2e, 0x80, 0x26, 0x45, 0xd2, 0x68, 0x70, 0x45, 0x9e, 0x13, 0x33, 0x1f, 0x20}, {0x18, 0x18, 0xdf, 0x6c, 0x8f, 0x1d, 0xb3, 0x58, 0xa2, 0x58, 0x62, 0xc3, 0x4f, 0xa7, 0xcf, 0x35, 0x6e, 0x1d, 0xe6, 0x66, 0x4f, 0xff, 0xb3, 0xe1, 0xf7, 0xd5, 0xcd, 0x6c, 0xab, 0xac, 0x67, 0x50}, {0x51, 0x9d, 0x3, 0x8, 0x6b, 0x7f, 0x52, 0xfd, 0x6, 0x0, 0x7c, 0x1, 0x64, 0x49, 0xb1, 0x18, 0xa8, 0xa4, 0x25, 0x2e, 0xb0, 0xe, 0x22, 0xd5, 0x75, 0x3, 0x46, 0x62, 0x88, 0xba, 0x7c, 0x39}, }, { {0xe7, 0x79, 0x13, 0xc8, 0xfb, 0xc3, 0x15, 0x78, 0xf1, 0x2a, 0xe1, 0xdd, 0x20, 0x94, 0x61, 0xa6, 0xd5, 0xfd, 0xa8, 0x85, 0xf8, 0xc0, 0xa9, 0xff, 0x52, 0xc2, 0xe1, 0xc1, 0x22, 0x40, 0x1b, 0x77}, {0xb2, 0x59, 0x59, 0xf0, 0x93, 0x30, 0xc1, 0x30, 0x76, 0x79, 0xa9, 0xe9, 0x8d, 0xa1, 0x3a, 0xe2, 0x26, 0x5e, 0x1d, 0x72, 0x91, 0xd4, 0x2f, 0x22, 0x3a, 0x6c, 0x6e, 0x76, 0x20, 0xd3, 0x39, 0x23}, {0xa7, 0x2f, 0x3a, 0x51, 0x86, 0xd9, 0x7d, 0xd8, 0x8, 0xcf, 0xd4, 0xf9, 0x71, 0x9b, 0xac, 0xf5, 0xb3, 0x83, 0xa2, 0x1e, 0x1b, 0xc3, 0x6b, 0xd0, 0x76, 0x1a, 0x97, 0x19, 0x92, 0x18, 0x1a, 0x33}, }, { {0xaf, 0x72, 0x75, 0x9d, 0x3a, 0x2f, 0x51, 0x26, 0x9e, 0x4a, 0x7, 0x68, 0x88, 0xe2, 0xcb, 0x5b, 0xc4, 0xf7, 0x80, 0x11, 0xc1, 0xc1, 0xed, 0x84, 0x7b, 0xa6, 0x49, 0xf6, 0x9f, 0x61, 0xc9, 0x1a}, {0xc6, 0x80, 0x4f, 0xfb, 0x45, 0x6f, 0x16, 0xf5, 0xcf, 0x75, 0xc7, 0x61, 0xde, 0xc7, 0x36, 0x9c, 0x1c, 0xd9, 0x41, 0x90, 0x1b, 0xe8, 0xd4, 0xe3, 0x21, 0xfe, 0xbd, 0x83, 0x6b, 0x7c, 0x16, 0x31}, {0x68, 0x10, 0x4b, 0x52, 0x42, 0x38, 0x2b, 0xf2, 0x87, 0xe9, 0x9c, 0xee, 0x3b, 0x34, 0x68, 0x50, 0xc8, 0x50, 0x62, 0x4a, 0x84, 0x71, 0x9d, 0xfc, 0x11, 0xb1, 0x8, 0x1f, 0x34, 0x36, 0x24, 0x61}, }, { {0x38, 0x26, 0x2d, 0x1a, 0xe3, 0x49, 0x63, 0x8b, 0x35, 0xfd, 0xd3, 0x9b, 0x0, 0xb7, 0xdf, 0x9d, 0xa4, 0x6b, 0xa0, 0xa3, 0xb8, 0xf1, 0x8b, 0x7f, 0x45, 0x4, 0xd9, 0x78, 0x31, 0xaa, 0x22, 0x15}, {0x8d, 0x89, 0x4e, 0x87, 0xdb, 0x41, 0x9d, 0xd9, 0x20, 0xdc, 0x7, 0x6c, 0xf1, 0xa5, 0xfe, 0x9, 0xbc, 0x9b, 0xf, 0xd0, 0x67, 0x2c, 0x3d, 0x79, 0x40, 0xff, 0x5e, 0x9e, 0x30, 0xe2, 0xeb, 0x46}, {0x38, 0x49, 0x61, 0x69, 0x53, 0x2f, 0x38, 0x2c, 0x10, 0x6d, 0x2d, 0xb7, 0x9a, 0x40, 0xfe, 0xda, 0x27, 0xf2, 0x46, 0xb6, 0x91, 0x33, 0xc8, 0xe8, 0x6c, 0x30, 0x24, 0x5, 0xf5, 0x70, 0xfe, 0x45}, }, { {0x91, 0x14, 0x95, 0xc8, 0x20, 0x49, 0xf2, 0x62, 0xa2, 0xc, 0x63, 0x3f, 0xc8, 0x7, 0xf0, 0x5, 0xb8, 0xd4, 0xc9, 0xf5, 0xd2, 0x45, 0xbb, 0x6f, 0x45, 0x22, 0x7a, 0xb5, 0x6d, 0x9f, 0x61, 0x16}, {0x8c, 0xb, 0xc, 0x96, 0xa6, 0x75, 0x48, 0xda, 0x20, 0x2f, 0xe, 0xef, 0x76, 0xd0, 0x68, 0x5b, 0xd4, 0x8f, 0xb, 0x3d, 0xcf, 0x51, 0xfb, 0x7, 0xd4, 0x92, 0xe3, 0xa0, 0x23, 0x16, 0x8d, 0x42}, {0xfd, 0x8, 0xa3, 0x1, 0x44, 0x4a, 0x4f, 0x8, 0xac, 0xca, 0xa5, 0x76, 0xc3, 0x19, 0x22, 0xa8, 0x7d, 0xbc, 0xd1, 0x43, 0x46, 0xde, 0xb8, 0xde, 0xc6, 0x38, 0xbd, 0x60, 0x2d, 0x59, 0x81, 0x1d}, }, }, { { {0xe8, 0xc5, 0x85, 0x7b, 0x9f, 0xb6, 0x65, 0x87, 0xb2, 0xba, 0x68, 0xd1, 0x8b, 0x67, 0xf0, 0x6f, 0x9b, 0xf, 0x33, 0x1d, 0x7c, 0xe7, 0x70, 0x3a, 0x7c, 0x8e, 0xaf, 0xb0, 0x51, 0x6d, 0x5f, 0x3a}, {0x5f, 0xac, 0xd, 0xa6, 0x56, 0x87, 0x36, 0x61, 0x57, 0xdc, 0xab, 0xeb, 0x6a, 0x2f, 0xe0, 0x17, 0x7d, 0xf, 0xce, 0x4c, 0x2d, 0x3f, 0x19, 0x7f, 0xf0, 0xdc, 0xec, 
0x89, 0x77, 0x4a, 0x23, 0x20}, {0x52, 0xb2, 0x78, 0x71, 0xb6, 0xd, 0xd2, 0x76, 0x60, 0xd1, 0x1e, 0xd5, 0xf9, 0x34, 0x1c, 0x7, 0x70, 0x11, 0xe4, 0xb3, 0x20, 0x4a, 0x2a, 0xf6, 0x66, 0xe3, 0xff, 0x3c, 0x35, 0x82, 0xd6, 0x7c}, }, { {0xf3, 0xf4, 0xac, 0x68, 0x60, 0xcd, 0x65, 0xa6, 0xd3, 0xe3, 0xd7, 0x3c, 0x18, 0x2d, 0xd9, 0x42, 0xd9, 0x25, 0x60, 0x33, 0x9d, 0x38, 0x59, 0x57, 0xff, 0xd8, 0x2c, 0x2b, 0x3b, 0x25, 0xf0, 0x3e}, {0xb6, 0xfa, 0x87, 0xd8, 0x5b, 0xa4, 0xe1, 0xb, 0x6e, 0x3b, 0x40, 0xba, 0x32, 0x6a, 0x84, 0x2a, 0x0, 0x60, 0x6e, 0xe9, 0x12, 0x10, 0x92, 0xd9, 0x43, 0x9, 0xdc, 0x3b, 0x86, 0xc8, 0x38, 0x28}, {0x30, 0x50, 0x46, 0x4a, 0xcf, 0xb0, 0x6b, 0xd1, 0xab, 0x77, 0xc5, 0x15, 0x41, 0x6b, 0x49, 0xfa, 0x9d, 0x41, 0xab, 0xf4, 0x8a, 0xae, 0xcf, 0x82, 0x12, 0x28, 0xa8, 0x6, 0xa6, 0xb8, 0xdc, 0x21}, }, { {0xba, 0x31, 0x77, 0xbe, 0xfa, 0x0, 0x8d, 0x9a, 0x89, 0x18, 0x9e, 0x62, 0x7e, 0x60, 0x3, 0x82, 0x7f, 0xd9, 0xf3, 0x43, 0x37, 0x2, 0xcc, 0xb2, 0x8b, 0x67, 0x6f, 0x6c, 0xbf, 0xd, 0x84, 0x5d}, {0xc8, 0x9f, 0x9d, 0x8c, 0x46, 0x4, 0x60, 0x5c, 0xcb, 0xa3, 0x2a, 0xd4, 0x6e, 0x9, 0x40, 0x25, 0x9c, 0x2f, 0xee, 0x12, 0x4c, 0x4d, 0x5b, 0x12, 0xab, 0x1d, 0xa3, 0x94, 0x81, 0xd0, 0xc3, 0xb}, {0x8b, 0xe1, 0x9f, 0x30, 0xd, 0x38, 0x6e, 0x70, 0xc7, 0x65, 0xe1, 0xb9, 0xa6, 0x2d, 0xb0, 0x6e, 0xab, 0x20, 0xae, 0x7d, 0x99, 0xba, 0xbb, 0x57, 0xdd, 0x96, 0xc1, 0x2a, 0x23, 0x76, 0x42, 0x3a}, }, { {0xcb, 0x7e, 0x44, 0xdb, 0x72, 0xc1, 0xf8, 0x3b, 0xbd, 0x2d, 0x28, 0xc6, 0x1f, 0xc4, 0xcf, 0x5f, 0xfe, 0x15, 0xaa, 0x75, 0xc0, 0xff, 0xac, 0x80, 0xf9, 0xa9, 0xe1, 0x24, 0xe8, 0xc9, 0x70, 0x7}, {0xfa, 0x84, 0x70, 0x8a, 0x2c, 0x43, 0x42, 0x4b, 0x45, 0xe5, 0xb9, 0xdf, 0xe3, 0x19, 0x8a, 0x89, 0x5d, 0xe4, 0x58, 0x9c, 0x21, 0x0, 0x9f, 0xbe, 0xd1, 0xeb, 0x6d, 0xa1, 0xce, 0x77, 0xf1, 0x1f}, {0xfd, 0xb5, 0xb5, 0x45, 0x9a, 0xd9, 0x61, 0xcf, 0x24, 0x79, 0x3a, 0x1b, 0xe9, 0x84, 0x9, 0x86, 0x89, 0x3e, 0x3e, 0x30, 0x19, 0x9, 0x30, 0xe7, 0x1e, 0xb, 0x50, 0x41, 0xfd, 0x64, 0xf2, 0x39}, }, { {0xe1, 0x7b, 0x9, 0xfe, 0xab, 0x4a, 0x9b, 0xd1, 0x29, 0x19, 0xe0, 0xdf, 0xe1, 0xfc, 0x6d, 0xa4, 0xff, 0xf1, 0xa6, 0x2c, 0x94, 0x8, 0xc9, 0xc3, 0x4e, 0xf1, 0x35, 0x2c, 0x27, 0x21, 0xc6, 0x65}, {0x9c, 0xe2, 0xe7, 0xdb, 0x17, 0x34, 0xad, 0xa7, 0x9c, 0x13, 0x9c, 0x2b, 0x6a, 0x37, 0x94, 0xbd, 0xa9, 0x7b, 0x59, 0x93, 0x8e, 0x1b, 0xe9, 0xa0, 0x40, 0x98, 0x88, 0x68, 0x34, 0xd7, 0x12, 0x17}, {0xdd, 0x93, 0x31, 0xce, 0xf8, 0x89, 0x2b, 0xe7, 0xbb, 0xc0, 0x25, 0xa1, 0x56, 0x33, 0x10, 0x4d, 0x83, 0xfe, 0x1c, 0x2e, 0x3d, 0xa9, 0x19, 0x4, 0x72, 0xe2, 0x9c, 0xb1, 0xa, 0x80, 0xf9, 0x22}, }, { {0xac, 0xfd, 0x6e, 0x9a, 0xdd, 0x9f, 0x2, 0x42, 0x41, 0x49, 0xa5, 0x34, 0xbe, 0xce, 0x12, 0xb9, 0x7b, 0xf3, 0xbd, 0x87, 0xb9, 0x64, 0xf, 0x64, 0xb4, 0xca, 0x98, 0x85, 0xd3, 0xa4, 0x71, 0x41}, {0xcb, 0xf8, 0x9e, 0x3e, 0x8a, 0x36, 0x5a, 0x60, 0x15, 0x47, 0x50, 0xa5, 0x22, 0xc0, 0xe9, 0xe3, 0x8f, 0x24, 0x24, 0x5f, 0xb0, 0x48, 0x3d, 0x55, 0xe5, 0x26, 0x76, 0x64, 0xcd, 0x16, 0xf4, 0x13}, {0x8c, 0x4c, 0xc9, 0x99, 0xaa, 0x58, 0x27, 0xfa, 0x7, 0xb8, 0x0, 0xb0, 0x6f, 0x6f, 0x0, 0x23, 0x92, 0x53, 0xda, 0xad, 0xdd, 0x91, 0xd2, 0xfb, 0xab, 0xd1, 0x4b, 0x57, 0xfa, 0x14, 0x82, 0x50}, }, { {0xd6, 0x3, 0xd0, 0x53, 0xbb, 0x15, 0x1a, 0x46, 0x65, 0xc9, 0xf3, 0xbc, 0x88, 0x28, 0x10, 0xb2, 0x5a, 0x3a, 0x68, 0x6c, 0x75, 0x76, 0xc5, 0x27, 0x47, 0xb4, 0x6c, 0xc8, 0xa4, 0x58, 0x77, 0x3a}, {0x4b, 0xfe, 0xd6, 0x3e, 0x15, 0x69, 0x2, 0xc2, 0xc4, 0x77, 0x1d, 0x51, 0x39, 0x67, 0x5a, 0xa6, 0x94, 0xaf, 0x14, 0x2c, 0x46, 0x26, 0xde, 0xcb, 0x4b, 0xa7, 0xab, 0x6f, 0xec, 0x60, 0xf9, 0x22}, {0x76, 0x50, 0xae, 0x93, 0xf6, 
0x11, 0x81, 0x54, 0xa6, 0x54, 0xfd, 0x1d, 0xdf, 0x21, 0xae, 0x1d, 0x65, 0x5e, 0x11, 0xf3, 0x90, 0x8c, 0x24, 0x12, 0x94, 0xf4, 0xe7, 0x8d, 0x5f, 0xd1, 0x9f, 0x5d}, }, { {0x1e, 0x52, 0xd7, 0xee, 0x2a, 0x4d, 0x24, 0x3f, 0x15, 0x96, 0x2e, 0x43, 0x28, 0x90, 0x3a, 0x8e, 0xd4, 0x16, 0x9c, 0x2e, 0x77, 0xba, 0x64, 0xe1, 0xd8, 0x98, 0xeb, 0x47, 0xfa, 0x87, 0xc1, 0x3b}, {0x7f, 0x72, 0x63, 0x6d, 0xd3, 0x8, 0x14, 0x3, 0x33, 0xb5, 0xc7, 0xd7, 0xef, 0x9a, 0x37, 0x6a, 0x4b, 0xe2, 0xae, 0xcc, 0xc5, 0x8f, 0xe1, 0xa9, 0xd3, 0xbe, 0x8f, 0x4f, 0x91, 0x35, 0x2f, 0x33}, {0xc, 0xc2, 0x86, 0xea, 0x15, 0x1, 0x47, 0x6d, 0x25, 0xd1, 0x46, 0x6c, 0xcb, 0xb7, 0x8a, 0x99, 0x88, 0x1, 0x66, 0x3a, 0xb5, 0x32, 0x78, 0xd7, 0x3, 0xba, 0x6f, 0x90, 0xce, 0x81, 0xd, 0x45}, }, }, { { {0x3f, 0x74, 0xae, 0x1c, 0x96, 0xd8, 0x74, 0xd0, 0xed, 0x63, 0x1c, 0xee, 0xf5, 0x18, 0x6d, 0xf8, 0x29, 0xed, 0xf4, 0xe7, 0x5b, 0xc5, 0xbd, 0x97, 0x8, 0xb1, 0x3a, 0x66, 0x79, 0xd2, 0xba, 0x4c}, {0x75, 0x52, 0x20, 0xa6, 0xa1, 0xb6, 0x7b, 0x6e, 0x83, 0x8e, 0x3c, 0x41, 0xd7, 0x21, 0x4f, 0xaa, 0xb2, 0x5c, 0x8f, 0xe8, 0x55, 0xd1, 0x56, 0x6f, 0xe1, 0x5b, 0x34, 0xa6, 0x4b, 0x5d, 0xe2, 0x2d}, {0xcd, 0x1f, 0xd7, 0xa0, 0x24, 0x90, 0xd1, 0x80, 0xf8, 0x8a, 0x28, 0xfb, 0xa, 0xc2, 0x25, 0xc5, 0x19, 0x64, 0x3a, 0x5f, 0x4b, 0x97, 0xa3, 0xb1, 0x33, 0x72, 0x0, 0xe2, 0xef, 0xbc, 0x7f, 0x7d}, }, { {0x94, 0x90, 0xc2, 0xf3, 0xc5, 0x5d, 0x7c, 0xcd, 0xab, 0x5, 0x91, 0x2a, 0x9a, 0xa2, 0x81, 0xc7, 0x58, 0x30, 0x1c, 0x42, 0x36, 0x1d, 0xc6, 0x80, 0xd7, 0xd4, 0xd8, 0xdc, 0x96, 0xd1, 0x9c, 0x4f}, {0x1, 0x28, 0x6b, 0x26, 0x6a, 0x1e, 0xef, 0xfa, 0x16, 0x9f, 0x73, 0xd5, 0xc4, 0x68, 0x6c, 0x86, 0x2c, 0x76, 0x3, 0x1b, 0xbc, 0x2f, 0x8a, 0xf6, 0x8d, 0x5a, 0xb7, 0x87, 0x5e, 0x43, 0x75, 0x59}, {0x68, 0x37, 0x7b, 0x6a, 0xd8, 0x97, 0x92, 0x19, 0x63, 0x7a, 0xd1, 0x1a, 0x24, 0x58, 0xd0, 0xd0, 0x17, 0xc, 0x1c, 0x5c, 0xad, 0x9c, 0x2, 0xba, 0x7, 0x3, 0x7a, 0x38, 0x84, 0xd0, 0xcd, 0x7c}, }, { {0x93, 0xcc, 0x60, 0x67, 0x18, 0x84, 0xc, 0x9b, 0x99, 0x2a, 0xb3, 0x1a, 0x7a, 0x0, 0xae, 0xcd, 0x18, 0xda, 0xb, 0x62, 0x86, 0xec, 0x8d, 0xa8, 0x44, 0xca, 0x90, 0x81, 0x84, 0xca, 0x93, 0x35}, {0x17, 0x4, 0x26, 0x6d, 0x2c, 0x42, 0xa6, 0xdc, 0xbd, 0x40, 0x82, 0x94, 0x50, 0x3d, 0x15, 0xae, 0x77, 0xc6, 0x68, 0xfb, 0xb4, 0xc1, 0xc0, 0xa9, 0x53, 0xcf, 0xd0, 0x61, 0xed, 0xd0, 0x8b, 0x42}, {0xa7, 0x9a, 0x84, 0x5e, 0x9a, 0x18, 0x13, 0x92, 0xcd, 0xfa, 0xd8, 0x65, 0x35, 0xc3, 0xd8, 0xd4, 0xd1, 0xbb, 0xfd, 0x53, 0x5b, 0x54, 0x52, 0x8c, 0xe6, 0x63, 0x2d, 0xda, 0x8, 0x83, 0x39, 0x27}, }, { {0x53, 0x24, 0x70, 0xa, 0x4c, 0xe, 0xa1, 0xb9, 0xde, 0x1b, 0x7d, 0xd5, 0x66, 0x58, 0xa2, 0xf, 0xf7, 0xda, 0x27, 0xcd, 0xb5, 0xd9, 0xb9, 0xff, 0xfd, 0x33, 0x2c, 0x49, 0x45, 0x29, 0x2c, 0x57}, {0x13, 0xd4, 0x5e, 0x43, 0x28, 0x8d, 0xc3, 0x42, 0xc9, 0xcc, 0x78, 0x32, 0x60, 0xf3, 0x50, 0xbd, 0xef, 0x3, 0xda, 0x79, 0x1a, 0xab, 0x7, 0xbb, 0x55, 0x33, 0x8c, 0xbe, 0xae, 0x97, 0x95, 0x26}, {0xbe, 0x30, 0xcd, 0xd6, 0x45, 0xc7, 0x7f, 0xc7, 0xfb, 0xae, 0xba, 0xe3, 0xd3, 0xe8, 0xdf, 0xe4, 0xc, 0xda, 0x5d, 0xaa, 0x30, 0x88, 0x2c, 0xa2, 0x80, 0xca, 0x5b, 0xc0, 0x98, 0x54, 0x98, 0x7f}, }, { {0x63, 0x63, 0xbf, 0xf, 0x52, 0x15, 0x56, 0xd3, 0xa6, 0xfb, 0x4d, 0xcf, 0x45, 0x5a, 0x4, 0x8, 0xc2, 0xa0, 0x3f, 0x87, 0xbc, 0x4f, 0xc2, 0xee, 0xe7, 0x12, 0x9b, 0xd6, 0x3c, 0x65, 0xf2, 0x30}, {0x17, 0xe1, 0xb, 0x9f, 0x88, 0xce, 0x49, 0x38, 0x88, 0xa2, 0x54, 0x7b, 0x1b, 0xad, 0x5, 0x80, 0x1c, 0x92, 0xfc, 0x23, 0x9f, 0xc3, 0xa3, 0x3d, 0x4, 0xf3, 0x31, 0xa, 0x47, 0xec, 0xc2, 0x76}, {0x85, 0xc, 0xc1, 0xaa, 0x38, 0xc9, 0x8, 0x8a, 0xcb, 0x6b, 0x27, 0xdb, 0x60, 0x9b, 0x17, 
0x46, 0x70, 0xac, 0x6f, 0xe, 0x1e, 0xc0, 0x20, 0xa9, 0xda, 0x73, 0x64, 0x59, 0xf1, 0x73, 0x12, 0x2f}, }, { {0xc0, 0xb, 0xa7, 0x55, 0xd7, 0x8b, 0x48, 0x30, 0xe7, 0x42, 0xd4, 0xf1, 0xa4, 0xb5, 0xd6, 0x6, 0x62, 0x61, 0x59, 0xbc, 0x9e, 0xa6, 0xd1, 0xea, 0x84, 0xf7, 0xc5, 0xed, 0x97, 0x19, 0xac, 0x38}, {0x11, 0x1e, 0xe0, 0x8a, 0x7c, 0xfc, 0x39, 0x47, 0x9f, 0xab, 0x6a, 0x4a, 0x90, 0x74, 0x52, 0xfd, 0x2e, 0x8f, 0x72, 0x87, 0x82, 0x8a, 0xd9, 0x41, 0xf2, 0x69, 0x5b, 0xd8, 0x2a, 0x57, 0x9e, 0x5d}, {0x3b, 0xb1, 0x51, 0xa7, 0x17, 0xb5, 0x66, 0x6, 0x8c, 0x85, 0x9b, 0x7e, 0x86, 0x6, 0x7d, 0x74, 0x49, 0xde, 0x4d, 0x45, 0x11, 0xc0, 0xac, 0xac, 0x9c, 0xe6, 0xe9, 0xbf, 0x9c, 0xcd, 0xdf, 0x22}, }, { {0xa1, 0xe0, 0x3b, 0x10, 0xb4, 0x59, 0xec, 0x56, 0x69, 0xf9, 0x59, 0xd2, 0xec, 0xba, 0xe3, 0x2e, 0x32, 0xcd, 0xf5, 0x13, 0x94, 0xb2, 0x7c, 0x79, 0x72, 0xe4, 0xcd, 0x24, 0x78, 0x87, 0xe9, 0xf}, {0xd9, 0xc, 0xd, 0xc3, 0xe0, 0xd2, 0xdb, 0x8d, 0x33, 0x43, 0xbb, 0xac, 0x5f, 0x66, 0x8e, 0xad, 0x1f, 0x96, 0x2a, 0x32, 0x8c, 0x25, 0x6b, 0x8f, 0xc7, 0xc1, 0x48, 0x54, 0xc0, 0x16, 0x29, 0x6b}, {0x3b, 0x91, 0xba, 0xa, 0xd1, 0x34, 0xdb, 0x7e, 0xe, 0xac, 0x6d, 0x2e, 0x82, 0xcd, 0xa3, 0x4e, 0x15, 0xf8, 0x78, 0x65, 0xff, 0x3d, 0x8, 0x66, 0x17, 0xa, 0xf0, 0x7f, 0x30, 0x3f, 0x30, 0x4c}, }, { {0x0, 0x45, 0xd9, 0xd, 0x58, 0x3, 0xfc, 0x29, 0x93, 0xec, 0xbb, 0x6f, 0xa4, 0x7a, 0xd2, 0xec, 0xf8, 0xa7, 0xe2, 0xc2, 0x5f, 0x15, 0xa, 0x13, 0xd5, 0xa1, 0x6, 0xb7, 0x1a, 0x15, 0x6b, 0x41}, {0x85, 0x8c, 0xb2, 0x17, 0xd6, 0x3b, 0xa, 0xd3, 0xea, 0x3b, 0x77, 0x39, 0xb7, 0x77, 0xd3, 0xc5, 0xbf, 0x5c, 0x6a, 0x1e, 0x8c, 0xe7, 0xc6, 0xc6, 0xc4, 0xb7, 0x2a, 0x8b, 0xf7, 0xb8, 0x61, 0xd}, {0xb0, 0x36, 0xc1, 0xe9, 0xef, 0xd7, 0xa8, 0x56, 0x20, 0x4b, 0xe4, 0x58, 0xcd, 0xe5, 0x7, 0xbd, 0xab, 0xe0, 0x57, 0x1b, 0xda, 0x2f, 0xe6, 0xaf, 0xd2, 0xe8, 0x77, 0x42, 0xf7, 0x2a, 0x1a, 0x19}, }, }, { { {0xfb, 0xe, 0x46, 0x4f, 0x43, 0x2b, 0xe6, 0x9f, 0xd6, 0x7, 0x36, 0xa6, 0xd4, 0x3, 0xd3, 0xde, 0x24, 0xda, 0xa0, 0xb7, 0xe, 0x21, 0x52, 0xf0, 0x93, 0x5b, 0x54, 0x0, 0xbe, 0x7d, 0x7e, 0x23}, {0x31, 0x14, 0x3c, 0xc5, 0x4b, 0xf7, 0x16, 0xce, 0xde, 0xed, 0x72, 0x20, 0xce, 0x25, 0x97, 0x2b, 0xe7, 0x3e, 0xb2, 0xb5, 0x6f, 0xc3, 0xb9, 0xb8, 0x8, 0xc9, 0x5c, 0xb, 0x45, 0xe, 0x2e, 0x7e}, {0x30, 0xb4, 0x1, 0x67, 0xed, 0x75, 0x35, 0x1, 0x10, 0xfd, 0xb, 0x9f, 0xe6, 0x94, 0x10, 0x23, 0x22, 0x7f, 0xe4, 0x83, 0x15, 0xf, 0x32, 0x75, 0xe3, 0x55, 0x11, 0xb1, 0x99, 0xa6, 0xaf, 0x71}, }, { {0xd6, 0x50, 0x3b, 0x47, 0x1c, 0x3c, 0x42, 0xea, 0x10, 0xef, 0x38, 0x3b, 0x1f, 0x7a, 0xe8, 0x51, 0x95, 0xbe, 0xc9, 0xb2, 0x5f, 0xbf, 0x84, 0x9b, 0x1c, 0x9a, 0xf8, 0x78, 0xbc, 0x1f, 0x73, 0x0}, {0x1d, 0xb6, 0x53, 0x39, 0x9b, 0x6f, 0xce, 0x65, 0xe6, 0x41, 0xa1, 0xaf, 0xea, 0x39, 0x58, 0xc6, 0xfe, 0x59, 0xf7, 0xa9, 0xfd, 0x5f, 0x43, 0xf, 0x8e, 0xc2, 0xb1, 0xc2, 0xe9, 0x42, 0x11, 0x2}, {0x80, 0x18, 0xf8, 0x48, 0x18, 0xc7, 0x30, 0xe4, 0x19, 0xc1, 0xce, 0x5e, 0x22, 0xc, 0x96, 0xbf, 0xe3, 0x15, 0xba, 0x6b, 0x83, 0xe0, 0xda, 0xb6, 0x8, 0x58, 0xe1, 0x47, 0x33, 0x6f, 0x4d, 0x4c}, }, { {0x70, 0x19, 0x8f, 0x98, 0xfc, 0xdd, 0xc, 0x2f, 0x1b, 0xf5, 0xb9, 0xb0, 0x27, 0x62, 0x91, 0x6b, 0xbe, 0x76, 0x91, 0x77, 0xc4, 0xb6, 0xc7, 0x6e, 0xa8, 0x9f, 0x8f, 0xa8, 0x0, 0x95, 0xbf, 0x38}, {0xc9, 0x1f, 0x7d, 0xc1, 0xcf, 0xec, 0xf7, 0x18, 0x14, 0x3c, 0x40, 0x51, 0xa6, 0xf5, 0x75, 0x6c, 0xdf, 0xc, 0xee, 0xf7, 0x2b, 0x71, 0xde, 0xdb, 0x22, 0x7a, 0xe4, 0xa7, 0xaa, 0xdd, 0x3f, 0x19}, {0x6f, 0x87, 0xe8, 0x37, 0x3c, 0xc9, 0xd2, 0x1f, 0x2c, 0x46, 0xd1, 0x18, 0x5a, 0x1e, 0xf6, 0xa2, 0x76, 0x12, 0x24, 0x39, 0x82, 0xf5, 0x80, 0x50, 0x69, 0x49, 
0xd, 0xbf, 0x9e, 0xb9, 0x6f, 0x6a}, }, { {0xc6, 0x23, 0xe4, 0xb6, 0xb5, 0x22, 0xb1, 0xee, 0x8e, 0xff, 0x86, 0xf2, 0x10, 0x70, 0x9d, 0x93, 0x8c, 0x5d, 0xcf, 0x1d, 0x83, 0x2a, 0xa9, 0x90, 0x10, 0xeb, 0xc5, 0x42, 0x9f, 0xda, 0x6f, 0x13}, {0xeb, 0x55, 0x8, 0x56, 0xbb, 0xc1, 0x46, 0x6a, 0x9d, 0xf0, 0x93, 0xf8, 0x38, 0xbb, 0x16, 0x24, 0xc1, 0xac, 0x71, 0x8f, 0x37, 0x11, 0x1d, 0xd7, 0xea, 0x96, 0x18, 0xa3, 0x14, 0x69, 0xf7, 0x75}, {0xd1, 0xbd, 0x5, 0xa3, 0xb1, 0xdf, 0x4c, 0xf9, 0x8, 0x2c, 0xf8, 0x9f, 0x9d, 0x4b, 0x36, 0xf, 0x8a, 0x58, 0xbb, 0xc3, 0xa5, 0xd8, 0x87, 0x2a, 0xba, 0xdc, 0xe8, 0xb, 0x51, 0x83, 0x21, 0x2}, }, { {0x7f, 0x7a, 0x30, 0x43, 0x1, 0x71, 0x5a, 0x9d, 0x5f, 0xa4, 0x7d, 0xc4, 0x9e, 0xde, 0x63, 0xb0, 0xd3, 0x7a, 0x92, 0xbe, 0x52, 0xfe, 0xbb, 0x22, 0x6c, 0x42, 0x40, 0xfd, 0x41, 0xc4, 0x87, 0x13}, {0x14, 0x2d, 0xad, 0x5e, 0x38, 0x66, 0xf7, 0x4a, 0x30, 0x58, 0x7c, 0xca, 0x80, 0xd8, 0x8e, 0xa0, 0x3d, 0x1e, 0x21, 0x10, 0xe6, 0xa6, 0x13, 0xd, 0x3, 0x6c, 0x80, 0x7b, 0xe1, 0x1c, 0x7, 0x6a}, {0xf8, 0x8a, 0x97, 0x87, 0xd1, 0xc3, 0xd3, 0xb5, 0x13, 0x44, 0xe, 0x7f, 0x3d, 0x5a, 0x2b, 0x72, 0xa0, 0x7c, 0x47, 0xbb, 0x48, 0x48, 0x7b, 0xd, 0x92, 0xdc, 0x1e, 0xaf, 0x6a, 0xb2, 0x71, 0x31}, }, { {0xd1, 0x47, 0x8a, 0xb2, 0xd8, 0xb7, 0xd, 0xa6, 0xf1, 0xa4, 0x70, 0x17, 0xd6, 0x14, 0xbf, 0xa6, 0x58, 0xbd, 0xdd, 0x53, 0x93, 0xf8, 0xa1, 0xd4, 0xe9, 0x43, 0x42, 0x34, 0x63, 0x4a, 0x51, 0x6c}, {0xa8, 0x4c, 0x56, 0x97, 0x90, 0x31, 0x2f, 0xa9, 0x19, 0xe1, 0x75, 0x22, 0x4c, 0xb8, 0x7b, 0xff, 0x50, 0x51, 0x87, 0xa4, 0x37, 0xfe, 0x55, 0x4f, 0x5a, 0x83, 0xf0, 0x3c, 0x87, 0xd4, 0x1f, 0x22}, {0x41, 0x63, 0x15, 0x3a, 0x4f, 0x20, 0x22, 0x23, 0x2d, 0x3, 0xa, 0xba, 0xe9, 0xe0, 0x73, 0xfb, 0xe, 0x3, 0xf, 0x41, 0x4c, 0xdd, 0xe0, 0xfc, 0xaa, 0x4a, 0x92, 0xfb, 0x96, 0xa5, 0xda, 0x48}, }, { {0x93, 0x97, 0x4c, 0xc8, 0x5d, 0x1d, 0xf6, 0x14, 0x6, 0x82, 0x41, 0xef, 0xe3, 0xf9, 0x41, 0x99, 0xac, 0x77, 0x62, 0x34, 0x8f, 0xb8, 0xf5, 0xcd, 0xa9, 0x79, 0x8a, 0xe, 0xfa, 0x37, 0xc8, 0x58}, {0xc7, 0x9c, 0xa5, 0x5c, 0x66, 0x8e, 0xca, 0x6e, 0xa0, 0xac, 0x38, 0x2e, 0x4b, 0x25, 0x47, 0xa8, 0xce, 0x17, 0x1e, 0xd2, 0x8, 0xc7, 0xaf, 0x31, 0xf7, 0x4a, 0xd8, 0xca, 0xfc, 0xd6, 0x6d, 0x67}, {0x58, 0x90, 0xfc, 0x96, 0x85, 0x68, 0xf9, 0xc, 0x1b, 0xa0, 0x56, 0x7b, 0xf3, 0xbb, 0xdc, 0x1d, 0x6a, 0xd6, 0x35, 0x49, 0x7d, 0xe7, 0xc2, 0xdc, 0xa, 0x7f, 0xa5, 0xc6, 0xf2, 0x73, 0x4f, 0x1c}, }, { {0x84, 0x34, 0x7c, 0xfc, 0x6e, 0x70, 0x6e, 0xb3, 0x61, 0xcf, 0xc1, 0xc3, 0xb4, 0xc9, 0xdf, 0x73, 0xe5, 0xc7, 0x1c, 0x78, 0xc9, 0x79, 0x1d, 0xeb, 0x5c, 0x67, 0xaf, 0x7d, 0xdb, 0x9a, 0x45, 0x70}, {0xbb, 0xa0, 0x5f, 0x30, 0xbd, 0x4f, 0x7a, 0xe, 0xad, 0x63, 0xc6, 0x54, 0xe0, 0x4c, 0x9d, 0x82, 0x48, 0x38, 0xe3, 0x2f, 0x83, 0xc3, 0x21, 0xf4, 0x42, 0x4c, 0xf6, 0x1b, 0xd, 0xc8, 0x5a, 0x79}, {0xb3, 0x2b, 0xb4, 0x91, 0x49, 0xdb, 0x91, 0x1b, 0xca, 0xdc, 0x2, 0x4b, 0x23, 0x96, 0x26, 0x57, 0xdc, 0x78, 0x8c, 0x1f, 0xe5, 0x9e, 0xdf, 0x9f, 0xd3, 0x1f, 0xe2, 0x8c, 0x84, 0x62, 0xe1, 0x5f}, }, }, { { {0x8, 0xb2, 0x7c, 0x5d, 0x2d, 0x85, 0x79, 0x28, 0xe7, 0xf2, 0x7d, 0x68, 0x70, 0xdd, 0xde, 0xb8, 0x91, 0x78, 0x68, 0x21, 0xab, 0xff, 0xb, 0xdc, 0x35, 0xaa, 0x7d, 0x67, 0x43, 0xc0, 0x44, 0x2b}, {0x1a, 0x96, 0x94, 0xe1, 0x4f, 0x21, 0x59, 0x4e, 0x4f, 0xcd, 0x71, 0xd, 0xc7, 0x7d, 0xbe, 0x49, 0x2d, 0xf2, 0x50, 0x3b, 0xd2, 0xcf, 0x0, 0x93, 0x32, 0x72, 0x91, 0xfc, 0x46, 0xd4, 0x89, 0x47}, {0x8e, 0xb7, 0x4e, 0x7, 0xab, 0x87, 0x1c, 0x1a, 0x67, 0xf4, 0xda, 0x99, 0x8e, 0xd1, 0xc6, 0xfa, 0x67, 0x90, 0x4f, 0x48, 0xcd, 0xbb, 0xac, 0x3e, 0xe4, 0xa4, 0xb9, 0x2b, 0xef, 0x2e, 0xc5, 0x60}, }, { {0x11, 0x6d, 
0xae, 0x7c, 0xc2, 0xc5, 0x2b, 0x70, 0xab, 0x8c, 0xa4, 0x54, 0x9b, 0x69, 0xc7, 0x44, 0xb2, 0x2e, 0x49, 0xba, 0x56, 0x40, 0xbc, 0xef, 0x6d, 0x67, 0xb6, 0xd9, 0x48, 0x72, 0xd7, 0x70}, {0xf1, 0x8b, 0xfd, 0x3b, 0xbc, 0x89, 0x5d, 0xb, 0x1a, 0x55, 0xf3, 0xc9, 0x37, 0x92, 0x6b, 0xb0, 0xf5, 0x28, 0x30, 0xd5, 0xb0, 0x16, 0x4c, 0xe, 0xab, 0xca, 0xcf, 0x2c, 0x31, 0x9c, 0xbc, 0x10}, {0x5b, 0xa0, 0xc2, 0x3e, 0x4b, 0xe8, 0x8a, 0xaa, 0xe0, 0x81, 0x17, 0xed, 0xf4, 0x9e, 0x69, 0x98, 0xd1, 0x85, 0x8e, 0x70, 0xe4, 0x13, 0x45, 0x79, 0x13, 0xf4, 0x76, 0xa9, 0xd3, 0x5b, 0x75, 0x63}, }, { {0xb7, 0xac, 0xf1, 0x97, 0x18, 0x10, 0xc7, 0x3d, 0xd8, 0xbb, 0x65, 0xc1, 0x5e, 0x7d, 0xda, 0x5d, 0xf, 0x2, 0xa1, 0xf, 0x9c, 0x5b, 0x8e, 0x50, 0x56, 0x2a, 0xc5, 0x37, 0x17, 0x75, 0x63, 0x27}, {0x53, 0x8, 0xd1, 0x2a, 0x3e, 0xa0, 0x5f, 0xb5, 0x69, 0x35, 0xe6, 0x9e, 0x90, 0x75, 0x6f, 0x35, 0x90, 0xb8, 0x69, 0xbe, 0xfd, 0xf1, 0xf9, 0x9f, 0x84, 0x6f, 0xc1, 0x8b, 0xc4, 0xc1, 0x8c, 0xd}, {0xa9, 0x19, 0xb4, 0x6e, 0xd3, 0x2, 0x94, 0x2, 0xa5, 0x60, 0xb4, 0x77, 0x7e, 0x4e, 0xb4, 0xf0, 0x56, 0x49, 0x3c, 0xd4, 0x30, 0x62, 0xa8, 0xcf, 0xe7, 0x66, 0xd1, 0x7a, 0x8a, 0xdd, 0xc2, 0x70}, }, { {0x13, 0x7e, 0xed, 0xb8, 0x7d, 0x96, 0xd4, 0x91, 0x7a, 0x81, 0x76, 0xd7, 0xa, 0x2f, 0x25, 0x74, 0x64, 0x25, 0x85, 0xd, 0xe0, 0x82, 0x9, 0xe4, 0xe5, 0x3c, 0xa5, 0x16, 0x38, 0x61, 0xb8, 0x32}, {0xe, 0xec, 0x6f, 0x9f, 0x50, 0x94, 0x61, 0x65, 0x8d, 0x51, 0xc6, 0x46, 0xa9, 0x7e, 0x2e, 0xee, 0x5c, 0x9b, 0xe0, 0x67, 0xf3, 0xc1, 0x33, 0x97, 0x95, 0x84, 0x94, 0x63, 0x63, 0xac, 0xf, 0x2e}, {0x64, 0xcd, 0x48, 0xe4, 0xbe, 0xf7, 0xe7, 0x79, 0xd0, 0x86, 0x78, 0x8, 0x67, 0x3a, 0xc8, 0x6a, 0x2e, 0xdb, 0xe4, 0xa0, 0xd9, 0xd4, 0x9f, 0xf8, 0x41, 0x4f, 0x5a, 0x73, 0x5c, 0x21, 0x79, 0x41}, }, { {0x34, 0xcd, 0x6b, 0x28, 0xb9, 0x33, 0xae, 0xe4, 0xdc, 0xd6, 0x9d, 0x55, 0xb6, 0x7e, 0xef, 0xb7, 0x1f, 0x8e, 0xd3, 0xb3, 0x1f, 0x14, 0x8b, 0x27, 0x86, 0xc2, 0x41, 0x22, 0x66, 0x85, 0xfa, 0x31}, {0x2a, 0xed, 0xdc, 0xd7, 0xe7, 0x94, 0x70, 0x8c, 0x70, 0x9c, 0xd3, 0x47, 0xc3, 0x8a, 0xfb, 0x97, 0x2, 0xd9, 0x6, 0xa9, 0x33, 0xe0, 0x3b, 0xe1, 0x76, 0x9d, 0xd9, 0xc, 0xa3, 0x44, 0x3, 0x70}, {0xf4, 0x22, 0x36, 0x2e, 0x42, 0x6c, 0x82, 0xaf, 0x2d, 0x50, 0x33, 0x98, 0x87, 0x29, 0x20, 0xc1, 0x23, 0x91, 0x38, 0x2b, 0xe1, 0xb7, 0xc1, 0x9b, 0x89, 0x24, 0x95, 0xa9, 0x12, 0x23, 0xbb, 0x24}, }, { {0x6b, 0x5c, 0xf8, 0xf5, 0x2a, 0xc, 0xf8, 0x41, 0x94, 0x67, 0xfa, 0x4, 0xc3, 0x84, 0x72, 0x68, 0xad, 0x1b, 0xba, 0xa3, 0x99, 0xdf, 0x45, 0x89, 0x16, 0x5d, 0xeb, 0xff, 0xf9, 0x2a, 0x1d, 0xd}, {0xc3, 0x67, 0xde, 0x32, 0x17, 0xed, 0xa8, 0xb1, 0x48, 0x49, 0x1b, 0x46, 0x18, 0x94, 0xb4, 0x3c, 0xd2, 0xbc, 0xcf, 0x76, 0x43, 0x43, 0xbd, 0x8e, 0x8, 0x80, 0x18, 0x1e, 0x87, 0x3e, 0xee, 0xf}, {0xdf, 0x1e, 0x62, 0x32, 0xa1, 0x8a, 0xda, 0xa9, 0x79, 0x65, 0x22, 0x59, 0xa1, 0x22, 0xb8, 0x30, 0x93, 0xc1, 0x9a, 0xa7, 0x7b, 0x19, 0x4, 0x40, 0x76, 0x1d, 0x53, 0x18, 0x97, 0xd7, 0xac, 0x16}, }, { {0xad, 0xb6, 0x87, 0x78, 0xc5, 0xc6, 0x59, 0xc9, 0xba, 0xfe, 0x90, 0x5f, 0xad, 0x9e, 0xe1, 0x94, 0x4, 0xf5, 0x42, 0xa3, 0x62, 0x4e, 0xe2, 0x16, 0x0, 0x17, 0x16, 0x18, 0x4b, 0xd3, 0x4e, 0x16}, {0x3d, 0x1d, 0x9b, 0x2d, 0xaf, 0x72, 0xdf, 0x72, 0x5a, 0x24, 0x32, 0xa4, 0x36, 0x2a, 0x46, 0x63, 0x37, 0x96, 0xb3, 0x16, 0x79, 0xa0, 0xce, 0x3e, 0x9, 0x23, 0x30, 0xb9, 0xf6, 0xe, 0x3e, 0x12}, {0x9a, 0xe6, 0x2f, 0x19, 0x4c, 0xd9, 0x7e, 0x48, 0x13, 0x15, 0x91, 0x3a, 0xea, 0x2c, 0xae, 0x61, 0x27, 0xde, 0xa4, 0xb9, 0xd3, 0xf6, 0x7b, 0x87, 0xeb, 0xf3, 0x73, 0x10, 0xc6, 0xf, 0xda, 0x78}, }, { {0x94, 0x3a, 0xc, 0x68, 0xf1, 0x80, 0x9f, 0xa2, 0xe6, 0xe7, 0xe9, 0x1a, 
0x15, 0x7e, 0xf7, 0x71, 0x73, 0x79, 0x1, 0x48, 0x58, 0xf1, 0x0, 0x11, 0xdd, 0x8d, 0xb3, 0x16, 0xb3, 0xa4, 0x4a, 0x5}, {0x6a, 0xc6, 0x2b, 0xe5, 0x28, 0x5d, 0xf1, 0x5b, 0x8e, 0x1a, 0xf0, 0x70, 0x18, 0xe3, 0x47, 0x2c, 0xdd, 0x8b, 0xc2, 0x6, 0xbc, 0xaf, 0x19, 0x24, 0x3a, 0x17, 0x6b, 0x25, 0xeb, 0xde, 0x25, 0x2d}, {0xb8, 0x7c, 0x26, 0x19, 0x8d, 0x46, 0xc8, 0xdf, 0xaf, 0x4d, 0xe5, 0x66, 0x9c, 0x78, 0x28, 0xb, 0x17, 0xec, 0x6e, 0x66, 0x2a, 0x1d, 0xeb, 0x2a, 0x60, 0xa7, 0x7d, 0xab, 0xa6, 0x10, 0x46, 0x13}, }, }, { { {0x15, 0xf5, 0xd1, 0x77, 0xe7, 0x65, 0x2a, 0xcd, 0xf1, 0x60, 0xaa, 0x8f, 0x87, 0x91, 0x89, 0x54, 0xe5, 0x6, 0xbc, 0xda, 0xbc, 0x3b, 0xb7, 0xb1, 0xfb, 0xc9, 0x7c, 0xa9, 0xcb, 0x78, 0x48, 0x65}, {0xfe, 0xb0, 0xf6, 0x8d, 0xc7, 0x8e, 0x13, 0x51, 0x1b, 0xf5, 0x75, 0xe5, 0x89, 0xda, 0x97, 0x53, 0xb9, 0xf1, 0x7a, 0x71, 0x1d, 0x7a, 0x20, 0x9, 0x50, 0xd6, 0x20, 0x2b, 0xba, 0xfd, 0x2, 0x21}, {0xa1, 0xe6, 0x5c, 0x5, 0x5, 0xe4, 0x9e, 0x96, 0x29, 0xad, 0x51, 0x12, 0x68, 0xa7, 0xbc, 0x36, 0x15, 0xa4, 0x7d, 0xaa, 0x17, 0xf5, 0x1a, 0x3a, 0xba, 0xb2, 0xec, 0x29, 0xdb, 0x25, 0xd7, 0xa}, }, { {0x85, 0x6f, 0x5, 0x9b, 0xc, 0xbc, 0xc7, 0xfe, 0xd7, 0xff, 0xf5, 0xe7, 0x68, 0x52, 0x7d, 0x53, 0xfa, 0xae, 0x12, 0x43, 0x62, 0xc6, 0xaf, 0x77, 0xd9, 0x9f, 0x39, 0x2, 0x53, 0x5f, 0x67, 0x4f}, {0x57, 0x24, 0x4e, 0x83, 0xb1, 0x67, 0x42, 0xdc, 0xc5, 0x1b, 0xce, 0x70, 0xb5, 0x44, 0x75, 0xb6, 0xd7, 0x5e, 0xd1, 0xf7, 0xb, 0x7a, 0xf0, 0x1a, 0x50, 0x36, 0xa0, 0x71, 0xfb, 0xcf, 0xef, 0x4a}, {0x1e, 0x17, 0x15, 0x4, 0x36, 0x36, 0x2d, 0xc3, 0x3b, 0x48, 0x98, 0x89, 0x11, 0xef, 0x2b, 0xcd, 0x10, 0x51, 0x94, 0xd0, 0xad, 0x6e, 0xa, 0x87, 0x61, 0x65, 0xa8, 0xa2, 0x72, 0xbb, 0xcc, 0xb}, }, { {0x96, 0x12, 0xfe, 0x50, 0x4c, 0x5e, 0x6d, 0x18, 0x7e, 0x9f, 0xe8, 0xfe, 0x82, 0x7b, 0x39, 0xe0, 0xb0, 0x31, 0x70, 0x50, 0xc5, 0xf6, 0xc7, 0x3b, 0xc2, 0x37, 0x8f, 0x10, 0x69, 0xfd, 0x78, 0x66}, {0xc8, 0xa9, 0xb1, 0xea, 0x2f, 0x96, 0x5e, 0x18, 0xcd, 0x7d, 0x14, 0x65, 0x35, 0xe6, 0xe7, 0x86, 0xf2, 0x6d, 0x5b, 0xbb, 0x31, 0xe0, 0x92, 0xb0, 0x3e, 0xb7, 0xd6, 0x59, 0xab, 0xf0, 0x24, 0x40}, {0xc2, 0x63, 0x68, 0x63, 0x31, 0xfa, 0x86, 0x15, 0xf2, 0x33, 0x2d, 0x57, 0x48, 0x8c, 0xf6, 0x7, 0xfc, 0xae, 0x9e, 0x78, 0x9f, 0xcc, 0x73, 0x4f, 0x1, 0x47, 0xad, 0x8e, 0x10, 0xe2, 0x42, 0x2d}, }, { {0x93, 0x75, 0x53, 0xf, 0xd, 0x7b, 0x71, 0x21, 0x4c, 0x6, 0x1e, 0x13, 0xb, 0x69, 0x4e, 0x91, 0x9f, 0xe0, 0x2a, 0x75, 0xae, 0x87, 0xb6, 0x1b, 0x6e, 0x3c, 0x42, 0x9b, 0xa7, 0xf3, 0xb, 0x42}, {0x9b, 0xd2, 0xdf, 0x94, 0x15, 0x13, 0xf5, 0x97, 0x6a, 0x4c, 0x3f, 0x31, 0x5d, 0x98, 0x55, 0x61, 0x10, 0x50, 0x45, 0x8, 0x7, 0x3f, 0xa1, 0xeb, 0x22, 0xd3, 0xd2, 0xb8, 0x8, 0x26, 0x6b, 0x67}, {0x47, 0x2b, 0x5b, 0x1c, 0x65, 0xba, 0x38, 0x81, 0x80, 0x1b, 0x1b, 0x31, 0xec, 0xb6, 0x71, 0x86, 0xb0, 0x35, 0x31, 0xbc, 0xb1, 0xc, 0xff, 0x7b, 0xe0, 0xf1, 0xc, 0x9c, 0xfa, 0x2f, 0x5d, 0x74}, }, { {0x6a, 0x4e, 0xd3, 0x21, 0x57, 0xdf, 0x36, 0x60, 0xd0, 0xb3, 0x7b, 0x99, 0x27, 0x88, 0xdb, 0xb1, 0xfa, 0x6a, 0x75, 0xc8, 0xc3, 0x9, 0xc2, 0xd3, 0x39, 0xc8, 0x1d, 0x4c, 0xe5, 0x5b, 0xe1, 0x6}, {0xbd, 0xc8, 0xc9, 0x2b, 0x1e, 0x5a, 0x52, 0xbf, 0x81, 0x9d, 0x47, 0x26, 0x8, 0x26, 0x5b, 0xea, 0xdb, 0x55, 0x1, 0xdf, 0xe, 0xc7, 0x11, 0xd5, 0xd0, 0xf5, 0xc, 0x96, 0xeb, 0x3c, 0xe2, 0x1a}, {0x4a, 0x99, 0x32, 0x19, 0x87, 0x5d, 0x72, 0x5b, 0xb0, 0xda, 0xb1, 0xce, 0xb5, 0x1c, 0x35, 0x32, 0x5, 0xca, 0xb7, 0xda, 0x49, 0x15, 0xc4, 0x7d, 0xf7, 0xc1, 0x8e, 0x27, 0x61, 0xd8, 0xde, 0x58}, }, { {0xa8, 0xc9, 0xc2, 0xb6, 0xa8, 0x5b, 0xfb, 0x2d, 0x8c, 0x59, 0x2c, 0xf5, 0x8e, 0xef, 0xee, 0x48, 0x73, 0x15, 0x2d, 0xf1, 0x7, 0x91, 0x80, 
0x33, 0xd8, 0x5b, 0x1d, 0x53, 0x6b, 0x69, 0xba, 0x8}, {0x5c, 0xc5, 0x66, 0xf2, 0x93, 0x37, 0x17, 0xd8, 0x49, 0x4e, 0x45, 0xcc, 0xc5, 0x76, 0xc9, 0xc8, 0xa8, 0xc3, 0x26, 0xbc, 0xf8, 0x82, 0xe3, 0x5c, 0xf9, 0xf6, 0x85, 0x54, 0xe8, 0x9d, 0xf3, 0x2f}, {0x7a, 0xc5, 0xef, 0xc3, 0xee, 0x3e, 0xed, 0x77, 0x11, 0x48, 0xff, 0xd4, 0x17, 0x55, 0xe0, 0x4, 0xcb, 0x71, 0xa6, 0xf1, 0x3f, 0x7a, 0x3d, 0xea, 0x54, 0xfe, 0x7c, 0x94, 0xb4, 0x33, 0x6, 0x12}, }, { {0xa, 0x10, 0x12, 0x49, 0x47, 0x31, 0xbd, 0x82, 0x6, 0xbe, 0x6f, 0x7e, 0x6d, 0x7b, 0x23, 0xde, 0xc6, 0x79, 0xea, 0x11, 0x19, 0x76, 0x1e, 0xe1, 0xde, 0x3b, 0x39, 0xcb, 0xe3, 0x3b, 0x43, 0x7}, {0x42, 0x0, 0x61, 0x91, 0x78, 0x98, 0x94, 0xb, 0xe8, 0xfa, 0xeb, 0xec, 0x3c, 0xb1, 0xe7, 0x4e, 0xc0, 0xa4, 0xf0, 0x94, 0x95, 0x73, 0xbe, 0x70, 0x85, 0x91, 0xd5, 0xb4, 0x99, 0xa, 0xd3, 0x35}, {0xf4, 0x97, 0xe9, 0x5c, 0xc0, 0x44, 0x79, 0xff, 0xa3, 0x51, 0x5c, 0xb0, 0xe4, 0x3d, 0x5d, 0x57, 0x7c, 0x84, 0x76, 0x5a, 0xfd, 0x81, 0x33, 0x58, 0x9f, 0xda, 0xf6, 0x7a, 0xde, 0x3e, 0x87, 0x2d}, }, { {0x81, 0xf9, 0x5d, 0x4e, 0xe1, 0x2, 0x62, 0xaa, 0xf5, 0xe1, 0x15, 0x50, 0x17, 0x59, 0xd, 0xa2, 0x6c, 0x1d, 0xe2, 0xba, 0xd3, 0x75, 0xa2, 0x18, 0x53, 0x2, 0x60, 0x1, 0x8a, 0x61, 0x43, 0x5}, {0x9, 0x34, 0x37, 0x43, 0x64, 0x31, 0x7a, 0x15, 0xd9, 0x81, 0xaa, 0xf4, 0xee, 0xb7, 0xb8, 0xfa, 0x6, 0x48, 0xa6, 0xf5, 0xe6, 0xfe, 0x93, 0xb0, 0xb6, 0xa7, 0x7f, 0x70, 0x54, 0x36, 0x77, 0x2e}, {0xc1, 0x23, 0x4c, 0x97, 0xf4, 0xbd, 0xea, 0xd, 0x93, 0x46, 0xce, 0x9d, 0x25, 0xa, 0x6f, 0xaa, 0x2c, 0xba, 0x9a, 0xa2, 0xb8, 0x2c, 0x20, 0x4, 0xd, 0x96, 0x7, 0x2d, 0x36, 0x43, 0x14, 0x4b}, }, }, { { {0xcb, 0x9c, 0x52, 0x1c, 0xe9, 0x54, 0x7c, 0x96, 0xfb, 0x35, 0xc6, 0x64, 0x92, 0x26, 0xf6, 0x30, 0x65, 0x19, 0x12, 0x78, 0xf4, 0xaf, 0x47, 0x27, 0x5c, 0x6f, 0xf6, 0xea, 0x18, 0x84, 0x3, 0x17}, {0x7a, 0x1f, 0x6e, 0xb6, 0xc7, 0xb7, 0xc4, 0xcc, 0x7e, 0x2f, 0xc, 0xf5, 0x25, 0x7e, 0x15, 0x44, 0x1c, 0xaf, 0x3e, 0x71, 0xfc, 0x6d, 0xf0, 0x3e, 0xf7, 0x63, 0xda, 0x52, 0x67, 0x44, 0x2f, 0x58}, {0xe4, 0x4c, 0x32, 0x20, 0xd3, 0x7b, 0x31, 0xc6, 0xc4, 0x8b, 0x48, 0xa4, 0xe8, 0x42, 0x10, 0xa8, 0x64, 0x13, 0x5a, 0x4e, 0x8b, 0xf1, 0x1e, 0xb2, 0xc9, 0x8d, 0xa2, 0xcd, 0x4b, 0x1c, 0x2a, 0xc}, }, { {0x45, 0x69, 0xbd, 0x69, 0x48, 0x81, 0xc4, 0xed, 0x22, 0x8d, 0x1c, 0xbe, 0x7d, 0x90, 0x6d, 0xd, 0xab, 0xc5, 0x5c, 0xd5, 0x12, 0xd2, 0x3b, 0xc6, 0x83, 0xdc, 0x14, 0xa3, 0x30, 0x9b, 0x6a, 0x5a}, {0x47, 0x4, 0x1f, 0x6f, 0xd0, 0xc7, 0x4d, 0xd2, 0x59, 0xc0, 0x87, 0xdb, 0x3e, 0x9e, 0x26, 0xb2, 0x8f, 0xd2, 0xb2, 0xfb, 0x72, 0x2, 0x5b, 0xd1, 0x77, 0x48, 0xf6, 0xc6, 0xd1, 0x8b, 0x55, 0x7c}, {0x3d, 0x46, 0x96, 0xd3, 0x24, 0x15, 0xec, 0xd0, 0xf0, 0x24, 0x5a, 0xc3, 0x8a, 0x62, 0xbb, 0x12, 0xa4, 0x5f, 0xbc, 0x1c, 0x79, 0x3a, 0xc, 0xa5, 0xc3, 0xaf, 0xfb, 0xa, 0xca, 0xa5, 0x4, 0x4}, }, { {0xd1, 0x6f, 0x41, 0x2a, 0x1b, 0x9e, 0xbc, 0x62, 0x8b, 0x59, 0x50, 0xe3, 0x28, 0xf7, 0xc6, 0xb5, 0x67, 0x69, 0x5d, 0x3d, 0xd8, 0x3f, 0x34, 0x4, 0x98, 0xee, 0xf8, 0xe7, 0x16, 0x75, 0x52, 0x39}, {0xd6, 0x43, 0xa7, 0xa, 0x7, 0x40, 0x1f, 0x8c, 0xe8, 0x5e, 0x26, 0x5b, 0xcb, 0xd0, 0xba, 0xcc, 0xde, 0xd2, 0x8f, 0x66, 0x6b, 0x4, 0x4b, 0x57, 0x33, 0x96, 0xdd, 0xca, 0xfd, 0x5b, 0x39, 0x46}, {0x9c, 0x9a, 0x5d, 0x1a, 0x2d, 0xdb, 0x7f, 0x11, 0x2a, 0x5c, 0x0, 0xd1, 0xbc, 0x45, 0x77, 0x9c, 0xea, 0x6f, 0xd5, 0x54, 0xf1, 0xbe, 0xd4, 0xef, 0x16, 0xd0, 0x22, 0xe8, 0x29, 0x9a, 0x57, 0x76}, }, { {0xf2, 0x34, 0xb4, 0x52, 0x13, 0xb5, 0x3c, 0x33, 0xe1, 0x80, 0xde, 0x93, 0x49, 0x28, 0x32, 0xd8, 0xce, 0x35, 0xd, 0x75, 0x87, 0x28, 0x51, 0xb5, 0xc1, 0x77, 0x27, 0x2a, 0xbb, 0x14, 0xc5, 0x2}, {0x17, 
0x2a, 0xc0, 0x49, 0x7e, 0x8e, 0xb6, 0x45, 0x7f, 0xa3, 0xa9, 0xbc, 0xa2, 0x51, 0xcd, 0x23, 0x1b, 0x4c, 0x22, 0xec, 0x11, 0x5f, 0xd6, 0x3e, 0xb1, 0xbd, 0x5, 0x9e, 0xdc, 0x84, 0xa3, 0x43}, {0x45, 0xb6, 0xf1, 0x8b, 0xda, 0xd5, 0x4b, 0x68, 0x53, 0x4b, 0xb5, 0xf6, 0x7e, 0xd3, 0x8b, 0xfb, 0x53, 0xd2, 0xb0, 0xa9, 0xd7, 0x16, 0x39, 0x31, 0x59, 0x80, 0x54, 0x61, 0x9, 0x92, 0x60, 0x11}, }, { {0xcd, 0x4d, 0x9b, 0x36, 0x16, 0x56, 0x38, 0x7a, 0x63, 0x35, 0x5c, 0x65, 0xa7, 0x2c, 0xc0, 0x75, 0x21, 0x80, 0xf1, 0xd4, 0xf9, 0x1b, 0xc2, 0x7d, 0x42, 0xe0, 0xe6, 0x91, 0x74, 0x7d, 0x63, 0x2f}, {0xaa, 0xcf, 0xda, 0x29, 0x69, 0x16, 0x4d, 0xb4, 0x8f, 0x59, 0x13, 0x84, 0x4c, 0x9f, 0x52, 0xda, 0x59, 0x55, 0x3d, 0x45, 0xca, 0x63, 0xef, 0xe9, 0xb, 0x8e, 0x69, 0xc5, 0x5b, 0x12, 0x1e, 0x35}, {0xbe, 0x7b, 0xf6, 0x1a, 0x46, 0x9b, 0xb4, 0xd4, 0x61, 0x89, 0xab, 0xc8, 0x7a, 0x3, 0x3, 0xd6, 0xfb, 0x99, 0xa6, 0xf9, 0x9f, 0xe1, 0xde, 0x71, 0x9a, 0x2a, 0xce, 0xe7, 0x6, 0x2d, 0x18, 0x7f}, }, { {0x22, 0x75, 0x21, 0x8e, 0x72, 0x4b, 0x45, 0x9, 0xd8, 0xb8, 0x84, 0xd4, 0xf4, 0xe8, 0x58, 0xaa, 0x3c, 0x90, 0x46, 0x7f, 0x4d, 0x25, 0x58, 0xd3, 0x17, 0x52, 0x1c, 0x24, 0x43, 0xc0, 0xac, 0x44}, {0xec, 0x68, 0x1, 0xab, 0x64, 0x8e, 0x7c, 0x7a, 0x43, 0xc5, 0xed, 0x15, 0x55, 0x4a, 0x5a, 0xcb, 0xda, 0xe, 0xcd, 0x47, 0xd3, 0x19, 0x55, 0x9, 0xb0, 0x93, 0x3e, 0x34, 0x8c, 0xac, 0xd4, 0x67}, {0x77, 0x57, 0x7a, 0x4f, 0xbb, 0x6b, 0x7d, 0x1c, 0xe1, 0x13, 0x83, 0x91, 0xd4, 0xfe, 0x35, 0x8b, 0x84, 0x46, 0x6b, 0xc9, 0xc6, 0xa1, 0xdc, 0x4a, 0xbd, 0x71, 0xad, 0x12, 0x83, 0x1c, 0x6d, 0x55}, }, { {0x21, 0xe8, 0x1b, 0xb1, 0x56, 0x67, 0xf0, 0x81, 0xdd, 0xf3, 0xa3, 0x10, 0x23, 0xf8, 0xaf, 0xf, 0x5d, 0x46, 0x99, 0x6a, 0x55, 0xd0, 0xb2, 0xf8, 0x5, 0x7f, 0x8c, 0xcc, 0x38, 0xbe, 0x7a, 0x9}, {0x82, 0x39, 0x8d, 0xc, 0xe3, 0x40, 0xef, 0x17, 0x34, 0xfa, 0xa3, 0x15, 0x3e, 0x7, 0xf7, 0x31, 0x6e, 0x64, 0x73, 0x7, 0xcb, 0xf3, 0x21, 0x4f, 0xff, 0x4e, 0x82, 0x1d, 0x6d, 0x6c, 0x6c, 0x74}, {0xa4, 0x2d, 0xa5, 0x7e, 0x87, 0xc9, 0x49, 0xc, 0x43, 0x1d, 0xdc, 0x9b, 0x55, 0x69, 0x43, 0x4c, 0xd2, 0xeb, 0xcc, 0xf7, 0x9, 0x38, 0x2c, 0x2, 0xbd, 0x84, 0xee, 0x4b, 0xa3, 0x14, 0x7e, 0x57}, }, { {0x2b, 0xd7, 0x4d, 0xbd, 0xbe, 0xce, 0xfe, 0x94, 0x11, 0x22, 0xf, 0x6, 0xda, 0x4f, 0x6a, 0xf4, 0xff, 0xd1, 0xc8, 0xc0, 0x77, 0x59, 0x4a, 0x12, 0x95, 0x92, 0x0, 0xfb, 0xb8, 0x4, 0x53, 0x70}, {0xa, 0x3b, 0xa7, 0x61, 0xac, 0x68, 0xe2, 0xf0, 0xf5, 0xa5, 0x91, 0x37, 0x10, 0xfa, 0xfa, 0xf2, 0xe9, 0x0, 0x6d, 0x6b, 0x82, 0x3e, 0xe1, 0xc1, 0x42, 0x8f, 0xd7, 0x6f, 0xe9, 0x7e, 0xfa, 0x60}, {0xc6, 0x6e, 0x29, 0x4d, 0x35, 0x1d, 0x3d, 0xb6, 0xd8, 0x31, 0xad, 0x5f, 0x3e, 0x5, 0xc3, 0xf3, 0xec, 0x42, 0xbd, 0xb4, 0x8c, 0x95, 0xb, 0x67, 0xfd, 0x53, 0x63, 0xa1, 0xc, 0x8e, 0x39, 0x21}, }, }, { { {0x1, 0x56, 0xb7, 0xb4, 0xf9, 0xaa, 0x98, 0x27, 0x72, 0xad, 0x8d, 0x5c, 0x13, 0x72, 0xac, 0x5e, 0x23, 0xa0, 0xb7, 0x61, 0x61, 0xaa, 0xce, 0xd2, 0x4e, 0x7d, 0x8f, 0xe9, 0x84, 0xb2, 0xbf, 0x1b}, {0xf3, 0x33, 0x2b, 0x38, 0x8a, 0x5, 0xf5, 0x89, 0xb4, 0xc0, 0x48, 0xad, 0xb, 0xba, 0xe2, 0x5a, 0x6e, 0xb3, 0x3d, 0xa5, 0x3, 0xb5, 0x93, 0x8f, 0xe6, 0x32, 0xa2, 0x95, 0x9d, 0xed, 0xa3, 0x5a}, {0x61, 0x65, 0xd9, 0xc7, 0xe9, 0x77, 0x67, 0x65, 0x36, 0x80, 0xc7, 0x72, 0x54, 0x12, 0x2b, 0xcb, 0xee, 0x6e, 0x50, 0xd9, 0x99, 0x32, 0x5, 0x65, 0xcc, 0x57, 0x89, 0x5e, 0x4e, 0xe1, 0x7, 0x4a}, }, { {0x9b, 0xa4, 0x77, 0xc4, 0xcd, 0x58, 0xb, 0x24, 0x17, 0xf0, 0x47, 0x64, 0xde, 0xda, 0x38, 0xfd, 0xad, 0x6a, 0xc8, 0xa7, 0x32, 0x8d, 0x92, 0x19, 0x81, 0xa0, 0xaf, 0x84, 0xed, 0x7a, 0xaf, 0x50}, {0x99, 0xf9, 0xd, 0x98, 0xcb, 0x12, 0xe4, 0x4e, 0x71, 0xc7, 0x6e, 
0x3c, 0x6f, 0xd7, 0x15, 0xa3, 0xfd, 0x77, 0x5c, 0x92, 0xde, 0xed, 0xa5, 0xbb, 0x2, 0x34, 0x31, 0x1d, 0x39, 0xac, 0xb, 0x3f}, {0xe5, 0x5b, 0xf6, 0x15, 0x1, 0xde, 0x4f, 0x6e, 0xb2, 0x9, 0x61, 0x21, 0x21, 0x26, 0x98, 0x29, 0xd9, 0xd6, 0xad, 0xb, 0x81, 0x5, 0x2, 0x78, 0x6, 0xd0, 0xeb, 0xba, 0x16, 0xa3, 0x21, 0x19}, }, { {0x8b, 0xc1, 0xf3, 0xd9, 0x9a, 0xad, 0x5a, 0xd7, 0x9c, 0xc1, 0xb1, 0x60, 0xef, 0xe, 0x6a, 0x56, 0xd9, 0xe, 0x5c, 0x25, 0xac, 0xb, 0x9a, 0x3e, 0xf5, 0xc7, 0x62, 0xa0, 0xec, 0x9d, 0x4, 0x7b}, {0xfc, 0x70, 0xb8, 0xdf, 0x7e, 0x2f, 0x42, 0x89, 0xbd, 0xb3, 0x76, 0x4f, 0xeb, 0x6b, 0x29, 0x2c, 0xf7, 0x4d, 0xc2, 0x36, 0xd4, 0xf1, 0x38, 0x7, 0xb0, 0xae, 0x73, 0xe2, 0x41, 0xdf, 0x58, 0x64}, {0x83, 0x44, 0x44, 0x35, 0x7a, 0xe3, 0xcb, 0xdc, 0x93, 0xbe, 0xed, 0xf, 0x33, 0x79, 0x88, 0x75, 0x87, 0xdd, 0xc5, 0x12, 0xc3, 0x4, 0x60, 0x78, 0x64, 0xe, 0x95, 0xc2, 0xcb, 0xdc, 0x93, 0x60}, }, { {0x4b, 0x3, 0x84, 0x60, 0xbe, 0xee, 0xde, 0x6b, 0x54, 0xb8, 0xf, 0x78, 0xb6, 0xc2, 0x99, 0x31, 0x95, 0x6, 0x2d, 0xb6, 0xab, 0x76, 0x33, 0x97, 0x90, 0x7d, 0x64, 0x8b, 0xc9, 0x80, 0x31, 0x6e}, {0x6d, 0x70, 0xe0, 0x85, 0x85, 0x9a, 0xf3, 0x1f, 0x33, 0x39, 0xe7, 0xb3, 0xd8, 0xa5, 0xd0, 0x36, 0x3b, 0x45, 0x8f, 0x71, 0xe1, 0xf2, 0xb9, 0x43, 0x7c, 0xa9, 0x27, 0x48, 0x8, 0xea, 0xd1, 0x57}, {0x71, 0xb0, 0x28, 0xa1, 0xe7, 0xb6, 0x7a, 0xee, 0xaa, 0x8b, 0xa8, 0x93, 0x6d, 0x59, 0xc1, 0xa4, 0x30, 0x61, 0x21, 0xb2, 0x82, 0xde, 0xb4, 0xf7, 0x18, 0xbd, 0x97, 0xdd, 0x9d, 0x99, 0x3e, 0x36}, }, { {0xc6, 0xae, 0x4b, 0xe2, 0xdc, 0x48, 0x18, 0x2f, 0x60, 0xaf, 0xbc, 0xba, 0x55, 0x72, 0x9b, 0x76, 0x31, 0xe9, 0xef, 0x3c, 0x6e, 0x3c, 0xcb, 0x90, 0x55, 0xb3, 0xf9, 0xc6, 0x9b, 0x97, 0x1f, 0x23}, {0xc4, 0x1f, 0xee, 0x35, 0xc1, 0x43, 0xa8, 0x96, 0xcf, 0xc8, 0xe4, 0x8, 0x55, 0xb3, 0x6e, 0x97, 0x30, 0xd3, 0x8c, 0xb5, 0x1, 0x68, 0x2f, 0xb4, 0x2b, 0x5, 0x3a, 0x69, 0x78, 0x9b, 0xee, 0x48}, {0xc6, 0xf3, 0x2a, 0xcc, 0x4b, 0xde, 0x31, 0x5c, 0x1f, 0x8d, 0x20, 0xfe, 0x30, 0xb0, 0x4b, 0xb0, 0x66, 0xb4, 0x4f, 0xc1, 0x9, 0x70, 0x8d, 0xb7, 0x13, 0x24, 0x79, 0x8, 0x9b, 0xfa, 0x9b, 0x7}, }, { {0x45, 0x42, 0xd5, 0xa2, 0x80, 0xed, 0xc9, 0xf3, 0x52, 0x39, 0xf6, 0x77, 0x78, 0x8b, 0xa0, 0xa, 0x75, 0x54, 0x8, 0xd1, 0x63, 0xac, 0x6d, 0xd7, 0x6b, 0x63, 0x70, 0x94, 0x15, 0xfb, 0xf4, 0x1e}, {0xf4, 0xd, 0x30, 0xda, 0x51, 0x3a, 0x90, 0xe3, 0xb0, 0x5a, 0xa9, 0x3d, 0x23, 0x64, 0x39, 0x84, 0x80, 0x64, 0x35, 0xb, 0x2d, 0xf1, 0x3c, 0xed, 0x94, 0x71, 0x81, 0x84, 0xf6, 0x77, 0x8c, 0x3}, {0xec, 0x7b, 0x16, 0x5b, 0xe6, 0x5e, 0x4e, 0x85, 0xc2, 0xcd, 0xd0, 0x96, 0x42, 0xa, 0x59, 0x59, 0x99, 0x21, 0x10, 0x98, 0x34, 0xdf, 0xb2, 0x72, 0x56, 0xff, 0xb, 0x4a, 0x2a, 0xe9, 0x5e, 0x57}, }, { {0x1, 0xd8, 0xa4, 0xa, 0x45, 0xbc, 0x46, 0x5d, 0xd8, 0xb9, 0x33, 0xa5, 0x27, 0x12, 0xaf, 0xc3, 0xc2, 0x6, 0x89, 0x2b, 0x26, 0x3b, 0x9e, 0x38, 0x1b, 0x58, 0x2f, 0x38, 0x7e, 0x1e, 0xa, 0x20}, {0xcf, 0x2f, 0x18, 0x8a, 0x90, 0x80, 0xc0, 0xd4, 0xbd, 0x9d, 0x48, 0x99, 0xc2, 0x70, 0xe1, 0x30, 0xde, 0x33, 0xf7, 0x52, 0x57, 0xbd, 0xba, 0x5, 0x0, 0xfd, 0xd3, 0x2c, 0x11, 0xe7, 0xd4, 0x43}, {0xc5, 0x3a, 0xf9, 0xea, 0x67, 0xb9, 0x8d, 0x51, 0xc0, 0x52, 0x66, 0x5, 0x9b, 0x98, 0xbc, 0x71, 0xf5, 0x97, 0x71, 0x56, 0xd9, 0x85, 0x2b, 0xfe, 0x38, 0x4e, 0x1e, 0x65, 0x52, 0xca, 0xe, 0x5}, }, { {0xea, 0x68, 0xe6, 0x60, 0x76, 0x39, 0xac, 0x97, 0x97, 0xb4, 0x3a, 0x15, 0xfe, 0xbb, 0x19, 0x9b, 0x9f, 0xa7, 0xec, 0x34, 0xb5, 0x79, 0xb1, 0x4c, 0x57, 0xae, 0x31, 0xa1, 0x9f, 0xc0, 0x51, 0x61}, {0x9c, 0xc, 0x3f, 0x45, 0xde, 0x1a, 0x43, 0xc3, 0x9b, 0x3b, 0x70, 0xff, 0x5e, 0x4, 0xf5, 0xe9, 0x3d, 0x7b, 0x84, 0xed, 0xc9, 0x7a, 0xd9, 
0xfc, 0xc6, 0xf4, 0x58, 0x1c, 0xc2, 0xe6, 0xe, 0x4b}, {0x96, 0x5d, 0xf0, 0xfd, 0xd, 0x5c, 0xf5, 0x3a, 0x7a, 0xee, 0xb4, 0x2a, 0xe0, 0x2e, 0x26, 0xdd, 0x9, 0x17, 0x17, 0x12, 0x87, 0xbb, 0xb2, 0x11, 0xb, 0x3, 0xf, 0x80, 0xfa, 0x24, 0xef, 0x1f}, }, }, { { {0x86, 0x6b, 0x97, 0x30, 0xf5, 0xaf, 0xd2, 0x22, 0x4, 0x46, 0xd2, 0xc2, 0x6, 0xb8, 0x90, 0x8d, 0xe5, 0xba, 0xe5, 0x4d, 0x6c, 0x89, 0xa1, 0xdc, 0x17, 0xc, 0x34, 0xc8, 0xe6, 0x5f, 0x0, 0x28}, {0x96, 0x31, 0xa7, 0x1a, 0xfb, 0x53, 0xd6, 0x37, 0x18, 0x64, 0xd7, 0x3f, 0x30, 0x95, 0x94, 0xf, 0xb2, 0x17, 0x3a, 0xfb, 0x9, 0xb, 0x20, 0xad, 0x3e, 0x61, 0xc8, 0x2f, 0x29, 0x49, 0x4d, 0x54}, {0x88, 0x86, 0x52, 0x34, 0x9f, 0xba, 0xef, 0x6a, 0xa1, 0x7d, 0x10, 0x25, 0x94, 0xff, 0x1b, 0x5c, 0x36, 0x4b, 0xd9, 0x66, 0xcd, 0xbb, 0x5b, 0xf7, 0xfa, 0x6d, 0x31, 0xf, 0x93, 0x72, 0xe4, 0x72}, }, { {0x27, 0x76, 0x2a, 0xd3, 0x35, 0xf6, 0xf3, 0x7, 0xf0, 0x66, 0x65, 0x5f, 0x86, 0x4d, 0xaa, 0x7a, 0x50, 0x44, 0xd0, 0x28, 0x97, 0xe7, 0x85, 0x3c, 0x38, 0x64, 0xe0, 0xf, 0x0, 0x7f, 0xee, 0x1f}, {0x4f, 0x8, 0x81, 0x97, 0x8c, 0x20, 0x95, 0x26, 0xe1, 0xe, 0x45, 0x23, 0xb, 0x2a, 0x50, 0xb1, 0x2, 0xde, 0xef, 0x3, 0xa6, 0xae, 0x9d, 0xfd, 0x4c, 0xa3, 0x33, 0x27, 0x8c, 0x2e, 0x9d, 0x5a}, {0xe5, 0xf7, 0xdb, 0x3, 0xda, 0x5, 0x53, 0x76, 0xbd, 0xcd, 0x34, 0x14, 0x49, 0xf2, 0xda, 0xa4, 0xec, 0x88, 0x4a, 0xd2, 0xcd, 0xd5, 0x4a, 0x7b, 0x43, 0x5, 0x4, 0xee, 0x51, 0x40, 0xf9, 0x0}, }, { {0x53, 0x97, 0xaf, 0x7, 0xbb, 0x93, 0xef, 0xd7, 0xa7, 0x66, 0xb7, 0x3d, 0xcf, 0xd0, 0x3e, 0x58, 0xc5, 0x1e, 0xb, 0x6e, 0xbf, 0x98, 0x69, 0xce, 0x52, 0x4, 0xd4, 0x5d, 0xd2, 0xff, 0xb7, 0x47}, {0xb2, 0x30, 0xd3, 0xc3, 0x23, 0x6b, 0x35, 0x8d, 0x6, 0x1b, 0x47, 0xb0, 0x9b, 0x8b, 0x1c, 0xf2, 0x3c, 0xb8, 0x42, 0x6e, 0x6c, 0x31, 0x6c, 0xb3, 0xd, 0xb1, 0xea, 0x8b, 0x7e, 0x9c, 0xd7, 0x7}, {0x12, 0xdd, 0x8, 0xbc, 0x9c, 0xfb, 0xfb, 0x87, 0x9b, 0xc2, 0xee, 0xe1, 0x3a, 0x6b, 0x6, 0x8a, 0xbf, 0xc1, 0x1f, 0xdb, 0x2b, 0x24, 0x57, 0xd, 0xb6, 0x4b, 0xa6, 0x5e, 0xa3, 0x20, 0x35, 0x1c}, }, { {0x59, 0xc0, 0x6b, 0x21, 0x40, 0x6f, 0xa8, 0xcd, 0x7e, 0xd8, 0xbc, 0x12, 0x1d, 0x23, 0xbb, 0x1f, 0x90, 0x9, 0xc7, 0x17, 0x9e, 0x6a, 0x95, 0xb4, 0x55, 0x2e, 0xd1, 0x66, 0x3b, 0xc, 0x75, 0x38}, {0x4a, 0xa3, 0xcb, 0xbc, 0xa6, 0x53, 0xd2, 0x80, 0x9b, 0x21, 0x38, 0x38, 0xa1, 0xc3, 0x61, 0x3e, 0x96, 0xe3, 0x82, 0x98, 0x1, 0xb6, 0xc3, 0x90, 0x6f, 0xe6, 0xe, 0x5d, 0x77, 0x5, 0x3d, 0x1c}, {0x1a, 0xe5, 0x22, 0x94, 0x40, 0xf1, 0x2e, 0x69, 0x71, 0xf6, 0x5d, 0x2b, 0x3c, 0xc7, 0xc0, 0xcb, 0x29, 0xe0, 0x4c, 0x74, 0xe7, 0x4f, 0x1, 0x21, 0x7c, 0x48, 0x30, 0xd3, 0xc7, 0xe2, 0x21, 0x6}, }, { {0xf3, 0xf0, 0xdb, 0xb0, 0x96, 0x17, 0xae, 0xb7, 0x96, 0xe1, 0x7c, 0xe1, 0xb9, 0xaf, 0xdf, 0x54, 0xb4, 0xa3, 0xaa, 0xe9, 0x71, 0x30, 0x92, 0x25, 0x9d, 0x2e, 0x0, 0xa1, 0x9c, 0x58, 0x8e, 0x5d}, {0x8d, 0x83, 0x59, 0x82, 0xcc, 0x60, 0x98, 0xaf, 0xdc, 0x9a, 0x9f, 0xc6, 0xc1, 0x48, 0xea, 0x90, 0x30, 0x1e, 0x58, 0x65, 0x37, 0x48, 0x26, 0x65, 0xbc, 0xa5, 0xd3, 0x7b, 0x9, 0xd6, 0x7, 0x0}, {0x4b, 0xa9, 0x42, 0x8, 0x95, 0x1d, 0xbf, 0xc0, 0x3e, 0x2e, 0x8f, 0x58, 0x63, 0xc3, 0xd3, 0xb2, 0xef, 0xe2, 0x51, 0xbb, 0x38, 0x14, 0x96, 0xa, 0x86, 0xbf, 0x1c, 0x3c, 0x78, 0xd7, 0x83, 0x15}, }, { {0xc7, 0x28, 0x9d, 0xcc, 0x4, 0x47, 0x3, 0x90, 0x8f, 0xc5, 0x2c, 0xf7, 0x9e, 0x67, 0x1b, 0x1d, 0x26, 0x87, 0x5b, 0xbe, 0x5f, 0x2b, 0xe1, 0x16, 0xa, 0x58, 0xc5, 0x83, 0x4e, 0x6, 0x58, 0x49}, {0xe1, 0x7a, 0xa2, 0x5d, 0xef, 0xa2, 0xee, 0xec, 0x74, 0x1, 0x67, 0x55, 0x14, 0x3a, 0x7c, 0x59, 0x7a, 0x16, 0x9, 0x66, 0x12, 0x2a, 0xa6, 0xc9, 0x70, 0x8f, 0xed, 0x81, 0x2e, 0x5f, 0x2a, 0x25}, {0xd, 0xe8, 0x66, 0x50, 
0x26, 0x94, 0x28, 0xd, 0x6b, 0x8c, 0x7c, 0x30, 0x85, 0xf7, 0xc3, 0xfc, 0xfd, 0x12, 0x11, 0xc, 0x78, 0xda, 0x53, 0x1b, 0x88, 0xb3, 0x43, 0xd8, 0xb, 0x17, 0x9c, 0x7}, }, { {0x56, 0xd0, 0xd5, 0xc0, 0x50, 0xcd, 0xd6, 0xcd, 0x3b, 0x57, 0x3, 0xbb, 0x6d, 0x68, 0xf7, 0x9a, 0x48, 0xef, 0xc3, 0xf3, 0x3f, 0x72, 0xa6, 0x3c, 0xcc, 0x8a, 0x7b, 0x31, 0xd7, 0xc0, 0x68, 0x67}, {0xff, 0x6f, 0xfa, 0x64, 0xe4, 0xec, 0x6, 0x5, 0x23, 0xe5, 0x5, 0x62, 0x1e, 0x43, 0xe3, 0xbe, 0x42, 0xea, 0xb8, 0x51, 0x24, 0x42, 0x79, 0x35, 0x0, 0xfb, 0xc9, 0x4a, 0xe3, 0x5, 0xec, 0x6d}, {0xb3, 0xc1, 0x55, 0xf1, 0xe5, 0x25, 0xb6, 0x94, 0x91, 0x7b, 0x7b, 0x99, 0xa7, 0xf3, 0x7b, 0x41, 0x0, 0x26, 0x6b, 0x6d, 0xdc, 0xbd, 0x2c, 0xc2, 0xf4, 0x52, 0xcd, 0xdd, 0x14, 0x5e, 0x44, 0x51}, }, { {0x55, 0xa4, 0xbe, 0x2b, 0xab, 0x47, 0x31, 0x89, 0x29, 0x91, 0x7, 0x92, 0x4f, 0xa2, 0x53, 0x8c, 0xa7, 0xf7, 0x30, 0xbe, 0x48, 0xf9, 0x49, 0x4b, 0x3d, 0xd4, 0x4f, 0x6e, 0x8, 0x90, 0xe9, 0x12}, {0x51, 0x49, 0x14, 0x3b, 0x4b, 0x2b, 0x50, 0x57, 0xb3, 0xbc, 0x4b, 0x44, 0x6b, 0xff, 0x67, 0x8e, 0xdb, 0x85, 0x63, 0x16, 0x27, 0x69, 0xbd, 0xb8, 0xc8, 0x95, 0x92, 0xe3, 0x31, 0x6f, 0x18, 0x13}, {0x2e, 0xbb, 0xdf, 0x7f, 0xb3, 0x96, 0xc, 0xf1, 0xf9, 0xea, 0x1c, 0x12, 0x5e, 0x93, 0x9a, 0x9f, 0x3f, 0x98, 0x5b, 0x3a, 0xc4, 0x36, 0x11, 0xdf, 0xaf, 0x99, 0x3e, 0x5d, 0xf0, 0xe3, 0xb2, 0x77}, }, }, { { {0xa4, 0xb0, 0xdd, 0x12, 0x9c, 0x63, 0x98, 0xd5, 0x6b, 0x86, 0x24, 0xc0, 0x30, 0x9f, 0xd1, 0xa5, 0x60, 0xe4, 0xfc, 0x58, 0x3, 0x2f, 0x7c, 0xd1, 0x8a, 0x5e, 0x9, 0x2e, 0x15, 0x95, 0xa1, 0x7}, {0xde, 0xc4, 0x2e, 0x9c, 0xc5, 0xa9, 0x6f, 0x29, 0xcb, 0xf3, 0x84, 0x4f, 0xbf, 0x61, 0x8b, 0xbc, 0x8, 0xf9, 0xa8, 0x17, 0xd9, 0x6, 0x77, 0x1c, 0x5d, 0x25, 0xd3, 0x7a, 0xfc, 0x95, 0xb7, 0x63}, {0xc8, 0x5f, 0x9e, 0x38, 0x2, 0x8f, 0x36, 0xa8, 0x3b, 0xe4, 0x8d, 0xcf, 0x2, 0x3b, 0x43, 0x90, 0x43, 0x26, 0x41, 0xc5, 0x5d, 0xfd, 0xa1, 0xaf, 0x37, 0x1, 0x2f, 0x3, 0x3d, 0xe8, 0x8f, 0x3e}, }, { {0x3c, 0xd1, 0xef, 0xe8, 0x8d, 0x4c, 0x70, 0x8, 0x31, 0x37, 0xe0, 0x33, 0x8e, 0x1a, 0xc5, 0xdf, 0xe3, 0xcd, 0x60, 0x12, 0xa5, 0x5d, 0x9d, 0xa5, 0x86, 0x8c, 0x25, 0xa6, 0x99, 0x8, 0xd6, 0x22}, {0x94, 0xa2, 0x70, 0x5, 0xb9, 0x15, 0x8b, 0x2f, 0x49, 0x45, 0x8, 0x67, 0x70, 0x42, 0xf2, 0x94, 0x84, 0xfd, 0xbb, 0x61, 0xe1, 0x5a, 0x1c, 0xde, 0x7, 0x40, 0xac, 0x7f, 0x79, 0x3b, 0xba, 0x75}, {0x96, 0xd1, 0xcd, 0x70, 0xc0, 0xdb, 0x39, 0x62, 0x9a, 0x8a, 0x7d, 0x6c, 0x8b, 0x8a, 0xfe, 0x60, 0x60, 0x12, 0x40, 0xeb, 0xbc, 0x47, 0x88, 0xb3, 0x5e, 0x9e, 0x77, 0x87, 0x7b, 0xd0, 0x4, 0x9}, }, { {0xb9, 0x40, 0xf9, 0x48, 0x66, 0x2d, 0x32, 0xf4, 0x39, 0xc, 0x2d, 0xbd, 0xc, 0x2f, 0x95, 0x6, 0x31, 0xf9, 0x81, 0xa0, 0xad, 0x97, 0x76, 0x16, 0x6c, 0x2a, 0xf7, 0xba, 0xce, 0xaa, 0x40, 0x62}, {0x9c, 0x91, 0xba, 0xdd, 0xd4, 0x1f, 0xce, 0xb4, 0xaa, 0x8d, 0x4c, 0xc7, 0x3e, 0xdb, 0x31, 0xcf, 0x51, 0xcc, 0x86, 0xad, 0x63, 0xcc, 0x63, 0x2c, 0x7, 0xde, 0x1d, 0xbc, 0x3f, 0x14, 0xe2, 0x43}, {0xa0, 0x95, 0xa2, 0x5b, 0x9c, 0x74, 0x34, 0xf8, 0x5a, 0xd2, 0x37, 0xca, 0x5b, 0x7c, 0x94, 0xd6, 0x6a, 0x31, 0xc9, 0xe7, 0xa7, 0x3b, 0xf1, 0x66, 0xac, 0xc, 0xb4, 0x8d, 0x23, 0xaf, 0xbd, 0x56}, }, { {0xb2, 0x3b, 0x9d, 0xc1, 0x6c, 0xd3, 0x10, 0x13, 0xb9, 0x86, 0x23, 0x62, 0xb7, 0x6b, 0x2a, 0x6, 0x5c, 0x4f, 0xa1, 0xd7, 0x91, 0x85, 0x9b, 0x7c, 0x54, 0x57, 0x1e, 0x7e, 0x50, 0x31, 0xaa, 0x3}, {0xeb, 0x33, 0x35, 0xf5, 0xe3, 0xb9, 0x2a, 0x36, 0x40, 0x3d, 0xb9, 0x6e, 0xd5, 0x68, 0x85, 0x33, 0x72, 0x55, 0x5a, 0x1d, 0x52, 0x14, 0xe, 0x9e, 0x18, 0x13, 0x74, 0x83, 0x6d, 0xa8, 0x24, 0x1d}, {0x1f, 0xce, 0xd4, 0xff, 0x48, 0x76, 0xec, 0xf4, 0x1c, 0x8c, 0xac, 0x54, 0xf0, 0xea, 0x45, 
0xe0, 0x7c, 0x35, 0x9, 0x1d, 0x82, 0x25, 0xd2, 0x88, 0x59, 0x48, 0xeb, 0x9a, 0xdc, 0x61, 0xb2, 0x43}, }, { {0x64, 0x13, 0x95, 0x6c, 0x8b, 0x3d, 0x51, 0x19, 0x7b, 0xf4, 0xb, 0x0, 0x26, 0x71, 0xfe, 0x94, 0x67, 0x95, 0x4f, 0xd5, 0xdd, 0x10, 0x8d, 0x2, 0x64, 0x9, 0x94, 0x42, 0xe2, 0xd5, 0xb4, 0x2}, {0xbb, 0x79, 0xbb, 0x88, 0x19, 0x1e, 0x5b, 0xe5, 0x9d, 0x35, 0x7a, 0xc1, 0x7d, 0xd0, 0x9e, 0xa0, 0x33, 0xea, 0x3d, 0x60, 0xe2, 0x2e, 0x2c, 0xb0, 0xc2, 0x6b, 0x27, 0x5b, 0xcf, 0x55, 0x60, 0x32}, {0xf2, 0x8d, 0xd1, 0x28, 0xcb, 0x55, 0xa1, 0xb4, 0x8, 0xe5, 0x6c, 0x18, 0x46, 0x46, 0xcc, 0xea, 0x89, 0x43, 0x82, 0x6c, 0x93, 0xf4, 0x9c, 0xc4, 0x10, 0x34, 0x5d, 0xae, 0x9, 0xc8, 0xa6, 0x27}, }, { {0x54, 0x69, 0x3d, 0xc4, 0xa, 0x27, 0x2c, 0xcd, 0xb2, 0xca, 0x66, 0x6a, 0x57, 0x3e, 0x4a, 0xdd, 0x6c, 0x3, 0xd7, 0x69, 0x24, 0x59, 0xfa, 0x79, 0x99, 0x25, 0x8c, 0x3d, 0x60, 0x3, 0x15, 0x22}, {0x88, 0xb1, 0xd, 0x1f, 0xcd, 0xeb, 0xa6, 0x8b, 0xe8, 0x5b, 0x5a, 0x67, 0x3a, 0xd7, 0xd3, 0x37, 0x5a, 0x58, 0xf5, 0x15, 0xa3, 0xdf, 0x2e, 0xf2, 0x7e, 0xa1, 0x60, 0xff, 0x74, 0x71, 0xb6, 0x2c}, {0xd0, 0xe1, 0xb, 0x39, 0xf9, 0xcd, 0xee, 0x59, 0xf1, 0xe3, 0x8c, 0x72, 0x44, 0x20, 0x42, 0xa9, 0xf4, 0xf0, 0x94, 0x7a, 0x66, 0x1c, 0x89, 0x82, 0x36, 0xf4, 0x90, 0x38, 0xb7, 0xf4, 0x1d, 0x7b}, }, { {0x8c, 0xf5, 0xf8, 0x7, 0x18, 0x22, 0x2e, 0x5f, 0xd4, 0x9, 0x94, 0xd4, 0x9f, 0x5c, 0x55, 0xe3, 0x30, 0xa6, 0xb6, 0x1f, 0x8d, 0xa8, 0xaa, 0xb2, 0x3d, 0xe0, 0x52, 0xd3, 0x45, 0x82, 0x69, 0x68}, {0x24, 0xa2, 0xb2, 0xb3, 0xe0, 0xf2, 0x92, 0xe4, 0x60, 0x11, 0x55, 0x2b, 0x6, 0x9e, 0x6c, 0x7c, 0xe, 0x7b, 0x7f, 0xd, 0xe2, 0x8f, 0xeb, 0x15, 0x92, 0x59, 0xfc, 0x58, 0x26, 0xef, 0xfc, 0x61}, {0x7a, 0x18, 0x18, 0x2a, 0x85, 0x5d, 0xb1, 0xdb, 0xd7, 0xac, 0xdd, 0x86, 0xd3, 0xaa, 0xe4, 0xf3, 0x82, 0xc4, 0xf6, 0xf, 0x81, 0xe2, 0xba, 0x44, 0xcf, 0x1, 0xaf, 0x3d, 0x47, 0x4c, 0xcf, 0x46}, }, { {0x40, 0x81, 0x49, 0xf1, 0xa7, 0x6e, 0x3c, 0x21, 0x54, 0x48, 0x2b, 0x39, 0xf8, 0x7e, 0x1e, 0x7c, 0xba, 0xce, 0x29, 0x56, 0x8c, 0xc3, 0x88, 0x24, 0xbb, 0xc5, 0x8c, 0xd, 0xe5, 0xaa, 0x65, 0x10}, {0xf9, 0xe5, 0xc4, 0x9e, 0xed, 0x25, 0x65, 0x42, 0x3, 0x33, 0x90, 0x16, 0x1, 0xda, 0x5e, 0xe, 0xdc, 0xca, 0xe5, 0xcb, 0xf2, 0xa7, 0xb1, 0x72, 0x40, 0x5f, 0xeb, 0x14, 0xcd, 0x7b, 0x38, 0x29}, {0x57, 0xd, 0x20, 0xdf, 0x25, 0x45, 0x2c, 0x1c, 0x4a, 0x67, 0xca, 0xbf, 0xd6, 0x2d, 0x3b, 0x5c, 0x30, 0x40, 0x83, 0xe1, 0xb1, 0xe7, 0x7, 0xa, 0x16, 0xe7, 0x1c, 0x4f, 0xe6, 0x98, 0xa1, 0x69}, }, }, { { {0xed, 0xca, 0xc5, 0xdc, 0x34, 0x44, 0x1, 0xe1, 0x33, 0xfb, 0x84, 0x3c, 0x96, 0x5d, 0xed, 0x47, 0xe7, 0xa0, 0x86, 0xed, 0x76, 0x95, 0x1, 0x70, 0xe4, 0xf9, 0x67, 0xd2, 0x7b, 0x69, 0xb2, 0x25}, {0xbc, 0x78, 0x1a, 0xd9, 0xe0, 0xb2, 0x62, 0x90, 0x67, 0x96, 0x50, 0xc8, 0x9c, 0x88, 0xc9, 0x47, 0xb8, 0x70, 0x50, 0x40, 0x66, 0x4a, 0xf5, 0x9d, 0xbf, 0xa1, 0x93, 0x24, 0xa9, 0xe6, 0x69, 0x73}, {0x64, 0x68, 0x98, 0x13, 0xfb, 0x3f, 0x67, 0x9d, 0xb8, 0xc7, 0x5d, 0x41, 0xd9, 0xfb, 0xa5, 0x3c, 0x5e, 0x3b, 0x27, 0xdf, 0x3b, 0xcc, 0x4e, 0xe0, 0xd2, 0x4c, 0x4e, 0xb5, 0x3d, 0x68, 0x20, 0x14}, }, { {0xd0, 0x5a, 0xcc, 0xc1, 0x6f, 0xbb, 0xee, 0x34, 0x8b, 0xac, 0x46, 0x96, 0xe9, 0xc, 0x1b, 0x6a, 0x53, 0xde, 0x6b, 0xa6, 0x49, 0xda, 0xb0, 0xd3, 0xc1, 0x81, 0xd0, 0x61, 0x41, 0x3b, 0xe8, 0x31}, {0x97, 0xd1, 0x9d, 0x24, 0x1e, 0xbd, 0x78, 0xb4, 0x2, 0xc1, 0x58, 0x5e, 0x0, 0x35, 0xc, 0x62, 0x5c, 0xac, 0xba, 0xcc, 0x2f, 0xd3, 0x2, 0xfb, 0x2d, 0xa7, 0x8, 0xf5, 0xeb, 0x3b, 0xb6, 0x60}, {0x4f, 0x2b, 0x6, 0x9e, 0x12, 0xc7, 0xe8, 0x97, 0xd8, 0xa, 0x32, 0x29, 0x4f, 0x8f, 0xe4, 0x49, 0x3f, 0x68, 0x18, 0x6f, 0x4b, 0xe1, 0xec, 0x5b, 0x17, 0x3, 
0x55, 0x2d, 0xb6, 0x1e, 0xcf, 0x55}, }, { {0x52, 0x8c, 0xf5, 0x7d, 0xe3, 0xb5, 0x76, 0x30, 0x36, 0xcc, 0x99, 0xe7, 0xdd, 0xb9, 0x3a, 0xd7, 0x20, 0xee, 0x13, 0x49, 0xe3, 0x1c, 0x83, 0xbd, 0x33, 0x1, 0xba, 0x62, 0xaa, 0xfb, 0x56, 0x1a}, {0x58, 0x3d, 0xc2, 0x65, 0x10, 0x10, 0x79, 0x58, 0x9c, 0x81, 0x94, 0x50, 0x6d, 0x8, 0x9d, 0x8b, 0xa7, 0x5f, 0xc5, 0x12, 0xa9, 0x2f, 0x40, 0xe2, 0xd4, 0x91, 0x8, 0x57, 0x64, 0x65, 0x9a, 0x66}, {0xec, 0xc9, 0x9d, 0x5c, 0x50, 0x6b, 0x3e, 0x94, 0x1a, 0x37, 0x7c, 0xa7, 0xbb, 0x57, 0x25, 0x30, 0x51, 0x76, 0x34, 0x41, 0x56, 0xae, 0x73, 0x98, 0x5c, 0x8a, 0xc5, 0x99, 0x67, 0x83, 0xc4, 0x13}, }, { {0x80, 0xd0, 0x8b, 0x5d, 0x6a, 0xfb, 0xdc, 0xc4, 0x42, 0x48, 0x1a, 0x57, 0xec, 0xc4, 0xeb, 0xde, 0x65, 0x53, 0xe5, 0xb8, 0x83, 0xe8, 0xb2, 0xd4, 0x27, 0xb8, 0xe5, 0xc8, 0x7d, 0xc8, 0xbd, 0x50}, {0xb9, 0xe1, 0xb3, 0x5a, 0x46, 0x5d, 0x3a, 0x42, 0x61, 0x3f, 0xf1, 0xc7, 0x87, 0xc1, 0x13, 0xfc, 0xb6, 0xb9, 0xb5, 0xec, 0x64, 0x36, 0xf8, 0x19, 0x7, 0xb6, 0x37, 0xa6, 0x93, 0xc, 0xf8, 0x66}, {0x11, 0xe1, 0xdf, 0x6e, 0x83, 0x37, 0x6d, 0x60, 0xd9, 0xab, 0x11, 0xf0, 0x15, 0x3e, 0x35, 0x32, 0x96, 0x3b, 0xb7, 0x25, 0xc3, 0x3a, 0xb0, 0x64, 0xae, 0xd5, 0x5f, 0x72, 0x44, 0x64, 0xd5, 0x1d}, }, { {0x9a, 0xc8, 0xba, 0x8, 0x0, 0xe6, 0x97, 0xc2, 0xe0, 0xc3, 0xe1, 0xea, 0x11, 0xea, 0x4c, 0x7d, 0x7c, 0x97, 0xe7, 0x9f, 0xe1, 0x8b, 0xe3, 0xf3, 0xcd, 0x5, 0xa3, 0x63, 0xf, 0x45, 0x3a, 0x3a}, {0x7d, 0x12, 0x62, 0x33, 0xf8, 0x7f, 0xa4, 0x8f, 0x15, 0x7c, 0xcd, 0x71, 0xc4, 0x6a, 0x9f, 0xbc, 0x8b, 0xc, 0x22, 0x49, 0x43, 0x45, 0x71, 0x6e, 0x2e, 0x73, 0x9f, 0x21, 0x12, 0x59, 0x64, 0xe}, {0x27, 0x46, 0x39, 0xd8, 0x31, 0x2f, 0x8f, 0x7, 0x10, 0xa5, 0x94, 0xde, 0x83, 0x31, 0x9d, 0x38, 0x80, 0x6f, 0x99, 0x17, 0x6d, 0x6c, 0xe3, 0xd1, 0x7b, 0xa8, 0xa9, 0x93, 0x93, 0x8d, 0x8c, 0x31}, }, { {0x98, 0xd3, 0x1d, 0xab, 0x29, 0x9e, 0x66, 0x5d, 0x3b, 0x9e, 0x2d, 0x34, 0x58, 0x16, 0x92, 0xfc, 0xcd, 0x73, 0x59, 0xf3, 0xfd, 0x1d, 0x85, 0x55, 0xf6, 0xa, 0x95, 0x25, 0xc3, 0x41, 0x9a, 0x50}, {0x19, 0xfe, 0xff, 0x2a, 0x3, 0x5d, 0x74, 0xf2, 0x66, 0xdb, 0x24, 0x7f, 0x49, 0x3c, 0x9f, 0xc, 0xef, 0x98, 0x85, 0xba, 0xe3, 0xd3, 0x98, 0xbc, 0x14, 0x53, 0x1d, 0x9a, 0x67, 0x7c, 0x4c, 0x22}, {0xe9, 0x25, 0xf9, 0xa6, 0xdc, 0x6e, 0xc0, 0xbd, 0x33, 0x1f, 0x1b, 0x64, 0xf4, 0xf3, 0x3e, 0x79, 0x89, 0x3e, 0x83, 0x9d, 0x80, 0x12, 0xec, 0x82, 0x89, 0x13, 0xa1, 0x28, 0x23, 0xf0, 0xbf, 0x5}, }, { {0xe4, 0x12, 0xc5, 0xd, 0xdd, 0xa0, 0x81, 0x68, 0xfe, 0xfa, 0xa5, 0x44, 0xc8, 0xd, 0xe7, 0x4f, 0x40, 0x52, 0x4a, 0x8f, 0x6b, 0x8e, 0x74, 0x1f, 0xea, 0xa3, 0x1, 0xee, 0xcd, 0x77, 0x62, 0x57}, {0xb, 0xe0, 0xca, 0x23, 0x70, 0x13, 0x32, 0x36, 0x59, 0xcf, 0xac, 0xd1, 0xa, 0xcf, 0x4a, 0x54, 0x88, 0x1c, 0x1a, 0xd2, 0x49, 0x10, 0x74, 0x96, 0xa7, 0x44, 0x2a, 0xfa, 0xc3, 0x8c, 0xb, 0x78}, {0x5f, 0x30, 0x4f, 0x23, 0xbc, 0x8a, 0xf3, 0x1e, 0x8, 0xde, 0x5, 0x14, 0xbd, 0x7f, 0x57, 0x9a, 0xd, 0x2a, 0xe6, 0x34, 0x14, 0xa5, 0x82, 0x5e, 0xa1, 0xb7, 0x71, 0x62, 0x72, 0x18, 0xf4, 0x5f}, }, { {0x40, 0x95, 0xb6, 0x13, 0xe8, 0x47, 0xdb, 0xe5, 0xe1, 0x10, 0x26, 0x43, 0x3b, 0x2a, 0x5d, 0xf3, 0x76, 0x12, 0x78, 0x38, 0xe9, 0x26, 0x1f, 0xac, 0x69, 0xcb, 0xa0, 0xa0, 0x8c, 0xdb, 0xd4, 0x29}, {0x9d, 0xdb, 0x89, 0x17, 0xc, 0x8, 0x8e, 0x39, 0xf5, 0x78, 0xe7, 0xf3, 0x25, 0x20, 0x60, 0xa7, 0x5d, 0x3, 0xbd, 0x6, 0x4c, 0x89, 0x98, 0xfa, 0xbe, 0x66, 0xa9, 0x25, 0xdc, 0x3, 0x6a, 0x10}, {0xd0, 0x53, 0x33, 0x33, 0xaf, 0xa, 0xad, 0xd9, 0xe5, 0x9, 0xd3, 0xac, 0xa5, 0x9d, 0x66, 0x38, 0xf0, 0xf7, 0x88, 0xc8, 0x8a, 0x65, 0x57, 0x3c, 0xfa, 0xbe, 0x2c, 0x5, 0x51, 0x8a, 0xb3, 0x4a}, }, }, { { {0x9c, 0xc0, 0xdd, 
0x5f, 0xef, 0xd1, 0xcf, 0xd6, 0xce, 0x5d, 0x57, 0xf7, 0xfd, 0x3e, 0x2b, 0xe8, 0xc2, 0x34, 0x16, 0x20, 0x5d, 0x6b, 0xd5, 0x25, 0x9b, 0x2b, 0xed, 0x4, 0xbb, 0xc6, 0x41, 0x30}, {0x93, 0xd5, 0x68, 0x67, 0x25, 0x2b, 0x7c, 0xda, 0x13, 0xca, 0x22, 0x44, 0x57, 0xc0, 0xc1, 0x98, 0x1d, 0xce, 0xa, 0xca, 0xd5, 0xb, 0xa8, 0xf1, 0x90, 0xa6, 0x88, 0xc0, 0xad, 0xd1, 0xcd, 0x29}, {0x48, 0xe1, 0x56, 0xd9, 0xf9, 0xf2, 0xf2, 0xf, 0x2e, 0x6b, 0x35, 0x9f, 0x75, 0x97, 0xe7, 0xad, 0x5c, 0x2, 0x6c, 0x5f, 0xbb, 0x98, 0x46, 0x1a, 0x7b, 0x9a, 0x4, 0x14, 0x68, 0xbd, 0x4b, 0x10}, }, { {0x63, 0xf1, 0x7f, 0xd6, 0x5f, 0x9a, 0x5d, 0xa9, 0x81, 0x56, 0xc7, 0x4c, 0x9d, 0xe6, 0x2b, 0xe9, 0x57, 0xf2, 0x20, 0xde, 0x4c, 0x2, 0xf8, 0xb7, 0xf5, 0x2d, 0x7, 0xfb, 0x20, 0x2a, 0x4f, 0x20}, {0x67, 0xed, 0xf1, 0x68, 0x31, 0xfd, 0xf0, 0x51, 0xc2, 0x3b, 0x6f, 0xd8, 0xcd, 0x1d, 0x81, 0x2c, 0xde, 0xf2, 0xd2, 0x4, 0x43, 0x5c, 0xdc, 0x44, 0x49, 0x71, 0x2a, 0x9, 0x57, 0xcc, 0xe8, 0x5b}, {0x79, 0xb0, 0xeb, 0x30, 0x3d, 0x3b, 0x14, 0xc8, 0x30, 0x2e, 0x65, 0xbd, 0x5a, 0x15, 0x89, 0x75, 0x31, 0x5c, 0x6d, 0x8f, 0x31, 0x3c, 0x3c, 0x65, 0x1f, 0x16, 0x79, 0xc2, 0x17, 0xfb, 0x70, 0x25}, }, { {0x5a, 0x24, 0xb8, 0xb, 0x55, 0xa9, 0x2e, 0x19, 0xd1, 0x50, 0x90, 0x8f, 0xa8, 0xfb, 0xe6, 0xc8, 0x35, 0xc9, 0xa4, 0x88, 0x2d, 0xea, 0x86, 0x79, 0x68, 0x86, 0x1, 0xde, 0x91, 0x5f, 0x1c, 0x24}, {0x75, 0x15, 0xb6, 0x2c, 0x7f, 0x36, 0xfa, 0x3e, 0x6c, 0x2, 0xd6, 0x1c, 0x76, 0x6f, 0xf9, 0xf5, 0x62, 0x25, 0xb5, 0x65, 0x2a, 0x14, 0xc7, 0xe8, 0xcd, 0xa, 0x3, 0x53, 0xea, 0x65, 0xcb, 0x3d}, {0xaa, 0x6c, 0xde, 0x40, 0x29, 0x17, 0xd8, 0x28, 0x3a, 0x73, 0xd9, 0x22, 0xf0, 0x2c, 0xbf, 0x8f, 0xd1, 0x1, 0x5b, 0x23, 0xdd, 0xfc, 0xd7, 0x16, 0xe5, 0xf0, 0xcd, 0x5f, 0xdd, 0xe, 0x42, 0x8}, }, { {0xce, 0x10, 0xf4, 0x4, 0x4e, 0xc3, 0x58, 0x3, 0x85, 0x6, 0x6e, 0x27, 0x5a, 0x5b, 0x13, 0xb6, 0x21, 0x15, 0xb9, 0xeb, 0xc7, 0x70, 0x96, 0x5d, 0x9c, 0x88, 0xdb, 0x21, 0xf3, 0x54, 0xd6, 0x4}, {0x4a, 0xfa, 0x62, 0x83, 0xab, 0x20, 0xff, 0xcd, 0x6e, 0x3e, 0x1a, 0xe2, 0xd4, 0x18, 0xe1, 0x57, 0x2b, 0xe6, 0x39, 0xfc, 0x17, 0x96, 0x17, 0xe3, 0xfd, 0x69, 0x17, 0xbc, 0xef, 0x53, 0x9a, 0xd}, {0xd5, 0xb5, 0xbd, 0xdd, 0x16, 0xc1, 0x7d, 0x5e, 0x2d, 0xdd, 0xa5, 0x8d, 0xb6, 0xde, 0x54, 0x29, 0x92, 0xa2, 0x34, 0x33, 0x17, 0x8, 0xb6, 0x1c, 0xd7, 0x1a, 0x99, 0x18, 0x26, 0x4f, 0x7a, 0x4a}, }, { {0x4b, 0x2a, 0x37, 0xaf, 0x91, 0xb2, 0xc3, 0x24, 0xf2, 0x47, 0x81, 0x71, 0x70, 0x82, 0xda, 0x93, 0xf2, 0x9e, 0x89, 0x86, 0x64, 0x85, 0x84, 0xdd, 0x33, 0xee, 0xe0, 0x23, 0x42, 0x31, 0x96, 0x4a}, {0x95, 0x5f, 0xb1, 0x5f, 0x2, 0x18, 0xa7, 0xf4, 0x8f, 0x1b, 0x5c, 0x6b, 0x34, 0x5f, 0xf6, 0x3d, 0x12, 0x11, 0xe0, 0x0, 0x85, 0xf0, 0xfc, 0xcd, 0x48, 0x18, 0xd3, 0xdd, 0x4c, 0xc, 0xb5, 0x11}, {0xd6, 0xff, 0xa4, 0x8, 0x44, 0x27, 0xe8, 0xa6, 0xd9, 0x76, 0x15, 0x9c, 0x7e, 0x17, 0x8e, 0x73, 0xf2, 0xb3, 0x2, 0x3d, 0xb6, 0x48, 0x33, 0x77, 0x51, 0xcc, 0x6b, 0xce, 0x4d, 0xce, 0x4b, 0x4f}, }, { {0x6f, 0xb, 0x9d, 0xc4, 0x6e, 0x61, 0xe2, 0x30, 0x17, 0x23, 0xec, 0xca, 0x8f, 0x71, 0x56, 0xe4, 0xa6, 0x4f, 0x6b, 0xf2, 0x9b, 0x40, 0xeb, 0x48, 0x37, 0x5f, 0x59, 0x61, 0xe5, 0xce, 0x42, 0x30}, {0x84, 0x25, 0x24, 0xe2, 0x5a, 0xce, 0x1f, 0xa7, 0x9e, 0x8a, 0xf5, 0x92, 0x56, 0x72, 0xea, 0x26, 0xf4, 0x3c, 0xea, 0x1c, 0xd7, 0x9, 0x1a, 0xd2, 0xe6, 0x1, 0x1c, 0xb7, 0x14, 0xdd, 0xfc, 0x73}, {0x41, 0xac, 0x9b, 0x44, 0x79, 0x70, 0x7e, 0x42, 0xa, 0x31, 0xe2, 0xbc, 0x6d, 0xe3, 0x5a, 0x85, 0x7c, 0x1a, 0x84, 0x5f, 0x21, 0x76, 0xae, 0x4c, 0xd6, 0xe1, 0x9c, 0x9a, 0xc, 0x74, 0x9e, 0x38}, }, { {0x28, 0xac, 0xe, 0x57, 0xf6, 0x78, 0xbd, 0xc9, 0xe1, 0x9c, 0x91, 0x27, 0x32, 0xb, 
0x5b, 0xe5, 0xed, 0x91, 0x9b, 0xa1, 0xab, 0x3e, 0xfc, 0x65, 0x90, 0x36, 0x26, 0xd6, 0xe5, 0x25, 0xc4, 0x25}, {0xce, 0xb9, 0xdc, 0x34, 0xae, 0xb3, 0xfc, 0x64, 0xad, 0xd0, 0x48, 0xe3, 0x23, 0x3, 0x50, 0x97, 0x1b, 0x38, 0xc6, 0x62, 0x7d, 0xf0, 0xb3, 0x45, 0x88, 0x67, 0x5a, 0x46, 0x79, 0x53, 0x54, 0x61}, {0x6e, 0xde, 0xd7, 0xf1, 0xa6, 0x6, 0x3e, 0x3f, 0x8, 0x23, 0x6, 0x8e, 0x27, 0x76, 0xf9, 0x3e, 0x77, 0x6c, 0x8a, 0x4e, 0x26, 0xf6, 0x14, 0x8c, 0x59, 0x47, 0x48, 0x15, 0x89, 0xa0, 0x39, 0x65}, }, { {0x19, 0x4a, 0xbb, 0x14, 0xd4, 0xdb, 0xc4, 0xdd, 0x8e, 0x4f, 0x42, 0x98, 0x3c, 0xbc, 0xb2, 0x19, 0x69, 0x71, 0xca, 0x36, 0xd7, 0x9f, 0xa8, 0x48, 0x90, 0xbd, 0x19, 0xf0, 0xe, 0x32, 0x65, 0xf}, {0x73, 0xf7, 0xd2, 0xc3, 0x74, 0x1f, 0xd2, 0xe9, 0x45, 0x68, 0xc4, 0x25, 0x41, 0x54, 0x50, 0xc1, 0x33, 0x9e, 0xb9, 0xf9, 0xe8, 0x5c, 0x4e, 0x62, 0x6c, 0x18, 0xcd, 0xc5, 0xaa, 0xe4, 0xc5, 0x11}, {0xc6, 0xe0, 0xfd, 0xca, 0xb1, 0xd1, 0x86, 0xd4, 0x81, 0x51, 0x3b, 0x16, 0xe3, 0xe6, 0x3f, 0x4f, 0x9a, 0x93, 0xf2, 0xfa, 0xd, 0xaf, 0xa8, 0x59, 0x2a, 0x7, 0x33, 0xec, 0xbd, 0xc7, 0xab, 0x4c}, }, }, { { {0x89, 0xd2, 0x78, 0x3f, 0x8f, 0x78, 0x8f, 0xc0, 0x9f, 0x4d, 0x40, 0xa1, 0x2c, 0xa7, 0x30, 0xfe, 0x9d, 0xcc, 0x65, 0xcf, 0xfc, 0x8b, 0x77, 0xf2, 0x21, 0x20, 0xcb, 0x5a, 0x16, 0x98, 0xe4, 0x7e}, {0x2e, 0xa, 0x9c, 0x8, 0x24, 0x96, 0x9e, 0x23, 0x38, 0x47, 0xfe, 0x3a, 0xc0, 0xc4, 0x48, 0xc7, 0x2a, 0xa1, 0x4f, 0x76, 0x2a, 0xed, 0xdb, 0x17, 0x82, 0x85, 0x1c, 0x32, 0xf0, 0x93, 0x9b, 0x63}, {0xc3, 0xa1, 0x11, 0x91, 0xe3, 0x8, 0xd5, 0x7b, 0x89, 0x74, 0x90, 0x80, 0xd4, 0x90, 0x2b, 0x2b, 0x19, 0xfd, 0x72, 0xae, 0xc2, 0xae, 0xd2, 0xe7, 0xa6, 0x2, 0xb6, 0x85, 0x3c, 0x49, 0xdf, 0xe}, }, { {0x13, 0x41, 0x76, 0x84, 0xd2, 0xc4, 0x67, 0x67, 0x35, 0xf8, 0xf5, 0xf7, 0x3f, 0x40, 0x90, 0xa0, 0xde, 0xbe, 0xe6, 0xca, 0xfa, 0xcf, 0x8f, 0x1c, 0x69, 0xa3, 0xdf, 0xd1, 0x54, 0xc, 0xc0, 0x4}, {0x68, 0x5a, 0x9b, 0x59, 0x58, 0x81, 0xcc, 0xae, 0xe, 0xe2, 0xad, 0xeb, 0xf, 0x4f, 0x57, 0xea, 0x7, 0x7f, 0xb6, 0x22, 0x74, 0x1d, 0xe4, 0x4f, 0xb4, 0x4f, 0x9d, 0x1, 0xe3, 0x92, 0x3b, 0x40}, {0xf8, 0x5c, 0x46, 0x8b, 0x81, 0x2f, 0xc2, 0x4d, 0xf8, 0xef, 0x80, 0x14, 0x5a, 0xf3, 0xa0, 0x71, 0x57, 0xd6, 0xc7, 0x4, 0xad, 0xbf, 0xe8, 0xae, 0xf4, 0x76, 0x61, 0xb2, 0x2a, 0xb1, 0x5b, 0x35}, }, { {0x18, 0x73, 0x8c, 0x5a, 0xc7, 0xda, 0x1, 0xa3, 0x11, 0xaa, 0xce, 0xb3, 0x9d, 0x3, 0x90, 0xed, 0x2d, 0x3f, 0xae, 0x3b, 0xbf, 0x7c, 0x7, 0x6f, 0x8e, 0xad, 0x52, 0xe0, 0xf8, 0xea, 0x18, 0x75}, {0xf4, 0xbb, 0x93, 0x74, 0xcc, 0x64, 0x1e, 0xa7, 0xc3, 0xb0, 0xa3, 0xec, 0xd9, 0x84, 0xbd, 0xe5, 0x85, 0xe7, 0x5, 0xfa, 0xc, 0xc5, 0x6b, 0xa, 0x12, 0xc3, 0x2e, 0x18, 0x32, 0x81, 0x9b, 0xf}, {0x32, 0x6c, 0x7f, 0x1b, 0xc4, 0x59, 0x88, 0xa4, 0x98, 0x32, 0x38, 0xf4, 0xbc, 0x60, 0x2d, 0xf, 0xd9, 0xd1, 0xb1, 0xc9, 0x29, 0xa9, 0x15, 0x18, 0xc4, 0x55, 0x17, 0xbb, 0x1b, 0x87, 0xc3, 0x47}, }, { {0xb0, 0x66, 0x50, 0xc8, 0x50, 0x5d, 0xe6, 0xfb, 0xb0, 0x99, 0xa2, 0xb3, 0xb0, 0xc4, 0xec, 0x62, 0xe0, 0xe8, 0x1a, 0x44, 0xea, 0x54, 0x37, 0xe5, 0x5f, 0x8d, 0xd4, 0xe8, 0x2c, 0xa0, 0xfe, 0x8}, {0x48, 0x4f, 0xec, 0x71, 0x97, 0x53, 0x44, 0x51, 0x6e, 0x5d, 0x8c, 0xc9, 0x7d, 0xb1, 0x5, 0xf8, 0x6b, 0xc6, 0xc3, 0x47, 0x1a, 0xc1, 0x62, 0xf7, 0xdc, 0x99, 0x46, 0x76, 0x85, 0x9b, 0xb8, 0x0}, {0xd0, 0xea, 0xde, 0x68, 0x76, 0xdd, 0x4d, 0x82, 0x23, 0x5d, 0x68, 0x4b, 0x20, 0x45, 0x64, 0xc8, 0x65, 0xd6, 0x89, 0x5d, 0xcd, 0xcf, 0x14, 0xb5, 0x37, 0xd5, 0x75, 0x4f, 0xa7, 0x29, 0x38, 0x47}, }, { {0xc9, 0x2, 0x39, 0xad, 0x3a, 0x53, 0xd9, 0x23, 0x8f, 0x58, 0x3, 0xef, 0xce, 0xdd, 0xc2, 0x64, 0xb4, 0x2f, 0xe1, 0xcf, 0x90, 0x73, 0x25, 0x15, 
0x90, 0xd3, 0xe4, 0x44, 0x4d, 0x8b, 0x66, 0x6c}, {0x18, 0xc4, 0x79, 0x46, 0x75, 0xda, 0xd2, 0x82, 0xf0, 0x8d, 0x61, 0xb2, 0xd8, 0xd7, 0x3b, 0xe6, 0xa, 0xeb, 0x47, 0xac, 0x24, 0xef, 0x5e, 0x35, 0xb4, 0xc6, 0x33, 0x48, 0x4c, 0x68, 0x78, 0x20}, {0xc, 0x82, 0x78, 0x7a, 0x21, 0xcf, 0x48, 0x3b, 0x97, 0x3e, 0x27, 0x81, 0xb2, 0xa, 0x6a, 0xf7, 0x7b, 0xed, 0x8e, 0x8c, 0xa7, 0x65, 0x6c, 0xa9, 0x3f, 0x43, 0x8a, 0x4f, 0x5, 0xa6, 0x11, 0x74}, }, { {0xb4, 0x75, 0xb1, 0x18, 0x3d, 0xe5, 0x9a, 0x57, 0x2, 0xa1, 0x92, 0xf3, 0x59, 0x31, 0x71, 0x68, 0xf5, 0x35, 0xef, 0x1e, 0xba, 0xec, 0x55, 0x84, 0x8f, 0x39, 0x8c, 0x45, 0x72, 0xa8, 0xc9, 0x1e}, {0x6d, 0xc8, 0x9d, 0xb9, 0x32, 0x9d, 0x65, 0x4d, 0x15, 0xf1, 0x3a, 0x60, 0x75, 0xdc, 0x4c, 0x4, 0x88, 0xe4, 0xc2, 0xdc, 0x2c, 0x71, 0x4c, 0xb3, 0xff, 0x34, 0x81, 0xfb, 0x74, 0x65, 0x13, 0x7c}, {0x9b, 0x50, 0xa2, 0x0, 0xd4, 0xa4, 0xe6, 0xb8, 0xb4, 0x82, 0xc8, 0xb, 0x2, 0xd7, 0x81, 0x9b, 0x61, 0x75, 0x95, 0xf1, 0x9b, 0xcc, 0xe7, 0x57, 0x60, 0x64, 0xcd, 0xc7, 0xa5, 0x88, 0xdd, 0x3a}, }, { {0x46, 0x30, 0x39, 0x59, 0xd4, 0x98, 0xc2, 0x85, 0xec, 0x59, 0xf6, 0x5f, 0x98, 0x35, 0x7e, 0x8f, 0x3a, 0x6e, 0xf6, 0xf2, 0x2a, 0xa2, 0x2c, 0x1d, 0x20, 0xa7, 0x6, 0xa4, 0x31, 0x11, 0xba, 0x61}, {0xf2, 0xdc, 0x35, 0xb6, 0x70, 0x57, 0x89, 0xab, 0xbc, 0x1f, 0x6c, 0xf6, 0x6c, 0xef, 0xdf, 0x2, 0x87, 0xd1, 0xb6, 0xbe, 0x68, 0x2, 0x53, 0x85, 0x74, 0x9e, 0x87, 0xcc, 0xfc, 0x29, 0x99, 0x24}, {0x29, 0x90, 0x95, 0x16, 0xf1, 0xa0, 0xd0, 0xa3, 0x89, 0xbd, 0x7e, 0xba, 0x6c, 0x6b, 0x3b, 0x2, 0x7, 0x33, 0x78, 0x26, 0x3e, 0x5a, 0xf1, 0x7b, 0xe7, 0xec, 0xd8, 0xbb, 0xc, 0x31, 0x20, 0x56}, }, { {0xd6, 0x85, 0xe2, 0x77, 0xf4, 0xb5, 0x46, 0x66, 0x93, 0x61, 0x8f, 0x6c, 0x67, 0xff, 0xe8, 0x40, 0xdd, 0x94, 0xb5, 0xab, 0x11, 0x73, 0xec, 0xa6, 0x4d, 0xec, 0x8c, 0x65, 0xf3, 0x46, 0xc8, 0x7e}, {0x43, 0xd6, 0x34, 0x49, 0x43, 0x93, 0x89, 0x52, 0xf5, 0x22, 0x12, 0xa5, 0x6, 0xf8, 0xdb, 0xb9, 0x22, 0x1c, 0xf4, 0xc3, 0x8f, 0x87, 0x6d, 0x8f, 0x30, 0x97, 0x9d, 0x4d, 0x2a, 0x6a, 0x67, 0x37}, {0xc7, 0x2e, 0xa2, 0x1d, 0x3f, 0x8f, 0x5e, 0x9b, 0x13, 0xcd, 0x1, 0x6c, 0x77, 0x1d, 0xf, 0x13, 0xb8, 0x9f, 0x98, 0xa2, 0xcf, 0x8f, 0x4c, 0x21, 0xd5, 0x9d, 0x9b, 0x39, 0x23, 0xf7, 0xaa, 0x6d}, }, }, { { {0xa2, 0x8e, 0xad, 0xac, 0xbf, 0x4, 0x3b, 0x58, 0x84, 0xe8, 0x8b, 0x14, 0xe8, 0x43, 0xb7, 0x29, 0xdb, 0xc5, 0x10, 0x8, 0x3b, 0x58, 0x1e, 0x2b, 0xaa, 0xbb, 0xb3, 0x8e, 0xe5, 0x49, 0x54, 0x2b}, {0x47, 0xbe, 0x3d, 0xeb, 0x62, 0x75, 0x3a, 0x5f, 0xb8, 0xa0, 0xbd, 0x8e, 0x54, 0x38, 0xea, 0xf7, 0x99, 0x72, 0x74, 0x45, 0x31, 0xe5, 0xc3, 0x0, 0x51, 0xd5, 0x27, 0x16, 0xe7, 0xe9, 0x4, 0x13}, {0xfe, 0x9c, 0xdc, 0x6a, 0xd2, 0x14, 0x98, 0x78, 0xb, 0xdd, 0x48, 0x8b, 0x3f, 0xab, 0x1b, 0x3c, 0xa, 0xc6, 0x79, 0xf9, 0xff, 0xe1, 0xf, 0xda, 0x93, 0xd6, 0x2d, 0x7c, 0x2d, 0xde, 0x68, 0x44}, }, { {0xce, 0x7, 0x63, 0xf8, 0xc6, 0xd8, 0x9a, 0x4b, 0x28, 0xc, 0x5d, 0x43, 0x31, 0x35, 0x11, 0x21, 0x2c, 0x77, 0x7a, 0x65, 0xc5, 0x66, 0xa8, 0xd4, 0x52, 0x73, 0x24, 0x63, 0x7e, 0x42, 0xa6, 0x5d}, {0x9e, 0x46, 0x19, 0x94, 0x5e, 0x35, 0xbb, 0x51, 0x54, 0xc7, 0xdd, 0x23, 0x4c, 0xdc, 0xe6, 0x33, 0x62, 0x99, 0x7f, 0x44, 0xd6, 0xb6, 0xa5, 0x93, 0x63, 0xbd, 0x44, 0xfb, 0x6f, 0x7c, 0xce, 0x6c}, {0xca, 0x22, 0xac, 0xde, 0x88, 0xc6, 0x94, 0x1a, 0xf8, 0x1f, 0xae, 0xbb, 0xf7, 0x6e, 0x6, 0xb9, 0xf, 0x58, 0x59, 0x8d, 0x38, 0x8c, 0xad, 0x88, 0xa8, 0x2c, 0x9f, 0xe7, 0xbf, 0x9a, 0xf2, 0x58}, }, { {0xf6, 0xcd, 0xe, 0x71, 0xbf, 0x64, 0x5a, 0x4b, 0x3c, 0x29, 0x2c, 0x46, 0x38, 0xe5, 0x4c, 0xb1, 0xb9, 0x3a, 0xb, 0xd5, 0x56, 0xd0, 0x43, 0x36, 0x70, 0x48, 0x5b, 0x18, 0x24, 0x37, 0xf9, 0x6a}, {0x68, 
0x3e, 0xe7, 0x8d, 0xab, 0xcf, 0xe, 0xe9, 0xa5, 0x76, 0x7e, 0x37, 0x9f, 0x6f, 0x3, 0x54, 0x82, 0x59, 0x1, 0xbe, 0xb, 0x5b, 0x49, 0xf0, 0x36, 0x1e, 0xf4, 0xa7, 0xc4, 0x29, 0x76, 0x57}, {0x88, 0xa8, 0xc6, 0x9, 0x45, 0x2, 0x20, 0x32, 0x73, 0x89, 0x55, 0x4b, 0x13, 0x36, 0xe0, 0xd2, 0x9f, 0x28, 0x33, 0x3c, 0x23, 0x36, 0xe2, 0x83, 0x8f, 0xc1, 0xae, 0xc, 0xbb, 0x25, 0x1f, 0x70}, }, { {0x13, 0xc1, 0xbe, 0x7c, 0xd9, 0xf6, 0x18, 0x9d, 0xe4, 0xdb, 0xbf, 0x74, 0xe6, 0x6, 0x4a, 0x84, 0xd6, 0x60, 0x4e, 0xac, 0x22, 0xb5, 0xf5, 0x20, 0x51, 0x5e, 0x95, 0x50, 0xc0, 0x5b, 0xa, 0x72}, {0xed, 0x6c, 0x61, 0xe4, 0xf8, 0xb0, 0xa8, 0xc3, 0x7d, 0xa8, 0x25, 0x9e, 0xe, 0x66, 0x0, 0xf7, 0x9c, 0xa5, 0xbc, 0xf4, 0x1f, 0x6, 0xe3, 0x61, 0xe9, 0xb, 0xc4, 0xbd, 0xbf, 0x92, 0xc, 0x2e}, {0x35, 0x5a, 0x80, 0x9b, 0x43, 0x9, 0x3f, 0xc, 0xfc, 0xab, 0x42, 0x62, 0x37, 0x8b, 0x4e, 0xe8, 0x46, 0x93, 0x22, 0x5c, 0xf3, 0x17, 0x14, 0x69, 0xec, 0xf0, 0x4e, 0x14, 0xbb, 0x9c, 0x9b, 0xe}, }, { {0xee, 0xbe, 0xb1, 0x5d, 0xd5, 0x9b, 0xee, 0x8d, 0xb9, 0x3f, 0x72, 0xa, 0x37, 0xab, 0xc3, 0xc9, 0x91, 0xd7, 0x68, 0x1c, 0xbf, 0xf1, 0xa8, 0x44, 0xde, 0x3c, 0xfd, 0x1c, 0x19, 0x44, 0x6d, 0x36}, {0xad, 0x20, 0x57, 0xfb, 0x8f, 0xd4, 0xba, 0xfb, 0xe, 0xd, 0xf9, 0xdb, 0x6b, 0x91, 0x81, 0xee, 0xbf, 0x43, 0x55, 0x63, 0x52, 0x31, 0x81, 0xd4, 0xd8, 0x7b, 0x33, 0x3f, 0xeb, 0x4, 0x11, 0x22}, {0x14, 0x8c, 0xbc, 0xf2, 0x43, 0x17, 0x3c, 0x9e, 0x3b, 0x6c, 0x85, 0xb5, 0xfc, 0x26, 0xda, 0x2e, 0x97, 0xfb, 0xa7, 0x68, 0xe, 0x2f, 0xb8, 0xcc, 0x44, 0x32, 0x59, 0xbc, 0xe6, 0xa4, 0x67, 0x41}, }, { {0xee, 0x8f, 0xce, 0xf8, 0x65, 0x26, 0xbe, 0xc2, 0x2c, 0xd6, 0x80, 0xe8, 0x14, 0xff, 0x67, 0xe9, 0xee, 0x4e, 0x36, 0x2f, 0x7e, 0x6e, 0x2e, 0xf1, 0xf6, 0xd2, 0x7e, 0xcb, 0x70, 0x33, 0xb3, 0x34}, {0x0, 0x27, 0xf6, 0x76, 0x28, 0x9d, 0x3b, 0x64, 0xeb, 0x68, 0x76, 0xe, 0x40, 0x9d, 0x1d, 0x5d, 0x84, 0x6, 0xfc, 0x21, 0x3, 0x43, 0x4b, 0x1b, 0x6a, 0x24, 0x55, 0x22, 0x7e, 0xbb, 0x38, 0x79}, {0xcc, 0xd6, 0x81, 0x86, 0xee, 0x91, 0xc5, 0xcd, 0x53, 0xa7, 0x85, 0xed, 0x9c, 0x10, 0x2, 0xce, 0x83, 0x88, 0x80, 0x58, 0xc1, 0x85, 0x74, 0xed, 0xe4, 0x65, 0xfe, 0x2d, 0x6e, 0xfc, 0x76, 0x11}, }, { {0xb8, 0xe, 0x77, 0x49, 0x89, 0xe2, 0x90, 0xdb, 0xa3, 0x40, 0xf4, 0xac, 0x2a, 0xcc, 0xfb, 0x98, 0x9b, 0x87, 0xd7, 0xde, 0xfe, 0x4f, 0x35, 0x21, 0xb6, 0x6, 0x69, 0xf2, 0x54, 0x3e, 0x6a, 0x1f}, {0x9b, 0x61, 0x9c, 0x5b, 0xd0, 0x6c, 0xaf, 0xb4, 0x80, 0x84, 0xa5, 0xb2, 0xf4, 0xc9, 0xdf, 0x2d, 0xc4, 0x4d, 0xe9, 0xeb, 0x2, 0xa5, 0x4f, 0x3d, 0x34, 0x5f, 0x7d, 0x67, 0x4c, 0x3a, 0xfc, 0x8}, {0xea, 0x34, 0x7, 0xd3, 0x99, 0xc1, 0xa4, 0x60, 0xd6, 0x5c, 0x16, 0x31, 0xb6, 0x85, 0xc0, 0x40, 0x95, 0x82, 0x59, 0xf7, 0x23, 0x3e, 0x33, 0xe2, 0xd1, 0x0, 0xb9, 0x16, 0x1, 0xad, 0x2f, 0x4f}, }, { {0x38, 0xb6, 0x3b, 0xb7, 0x1d, 0xd9, 0x2c, 0x96, 0x8, 0x9c, 0x12, 0xfc, 0xaa, 0x77, 0x5, 0xe6, 0x89, 0x16, 0xb6, 0xf3, 0x39, 0x9b, 0x61, 0x6f, 0x81, 0xee, 0x44, 0x29, 0x5f, 0x99, 0x51, 0x34}, {0x54, 0x4e, 0xae, 0x94, 0x41, 0xb2, 0xbe, 0x44, 0x6c, 0xef, 0x57, 0x18, 0x51, 0x1c, 0x54, 0x5f, 0x98, 0x4, 0x8d, 0x36, 0x2d, 0x6b, 0x1e, 0xa6, 0xab, 0xf7, 0x2e, 0x97, 0xa4, 0x84, 0x54, 0x44}, {0x7c, 0x7d, 0xea, 0x9f, 0xd0, 0xfc, 0x52, 0x91, 0xf6, 0x5c, 0x93, 0xb0, 0x94, 0x6c, 0x81, 0x4a, 0x40, 0x5c, 0x28, 0x47, 0xaa, 0x9a, 0x8e, 0x25, 0xb7, 0x93, 0x28, 0x4, 0xa6, 0x9c, 0xb8, 0x10}, }, }, { { {0x6e, 0xf0, 0x45, 0x5a, 0xbe, 0x41, 0x39, 0x75, 0x65, 0x5f, 0x9c, 0x6d, 0xed, 0xae, 0x7c, 0xd0, 0xb6, 0x51, 0xff, 0x72, 0x9c, 0x6b, 0x77, 0x11, 0xa9, 0x4d, 0xd, 0xef, 0xd9, 0xd1, 0xd2, 0x17}, {0x9c, 0x28, 0x18, 0x97, 0x49, 0x47, 0x59, 0x3d, 0x26, 0x3f, 0x53, 0x24, 
0xc5, 0xf8, 0xeb, 0x12, 0x15, 0xef, 0xc3, 0x14, 0xcb, 0xbf, 0x62, 0x2, 0x8e, 0x51, 0xb7, 0x77, 0xd5, 0x78, 0xb8, 0x20}, {0x6a, 0x3e, 0x3f, 0x7, 0x18, 0xaf, 0xf2, 0x27, 0x69, 0x10, 0x52, 0xd7, 0x19, 0xe5, 0x3f, 0xfd, 0x22, 0x0, 0xa6, 0x3c, 0x2c, 0xb7, 0xe3, 0x22, 0xa7, 0xc6, 0x65, 0xcc, 0x63, 0x4f, 0x21, 0x72}, }, { {0xc9, 0x29, 0x3b, 0xf4, 0xb9, 0xb7, 0x9d, 0x1d, 0x75, 0x8f, 0x51, 0x4f, 0x4a, 0x82, 0x5, 0xd6, 0xc4, 0x9d, 0x2f, 0x31, 0xbd, 0x72, 0xc0, 0xf2, 0xb0, 0x45, 0x15, 0x5a, 0x85, 0xac, 0x24, 0x1f}, {0x93, 0xa6, 0x7, 0x53, 0x40, 0x7f, 0xe3, 0xb4, 0x95, 0x67, 0x33, 0x2f, 0xd7, 0x14, 0xa7, 0xab, 0x99, 0x10, 0x76, 0x73, 0xa7, 0xd0, 0xfb, 0xd6, 0xc9, 0xcb, 0x71, 0x81, 0xc5, 0x48, 0xdf, 0x5f}, {0xaa, 0x5, 0x95, 0x8e, 0x32, 0x8, 0xd6, 0x24, 0xee, 0x20, 0x14, 0xc, 0xd1, 0xc1, 0x48, 0x47, 0xa2, 0x25, 0xfb, 0x6, 0x5c, 0xe4, 0xff, 0xc7, 0xe6, 0x95, 0xe3, 0x2a, 0x9e, 0x73, 0xba, 0x0}, }, { {0x26, 0xbb, 0x88, 0xea, 0xf5, 0x26, 0x44, 0xae, 0xfb, 0x3b, 0x97, 0x84, 0xd9, 0x79, 0x6, 0x36, 0x50, 0x4e, 0x69, 0x26, 0xc, 0x3, 0x9f, 0x5c, 0x26, 0xd2, 0x18, 0xd5, 0xe7, 0x7d, 0x29, 0x72}, {0xd6, 0x90, 0x87, 0x5c, 0xde, 0x98, 0x2e, 0x59, 0xdf, 0xa2, 0xc2, 0x45, 0xd3, 0xb7, 0xbf, 0xe5, 0x22, 0x99, 0xb4, 0xf9, 0x60, 0x3b, 0x5a, 0x11, 0xf3, 0x78, 0xad, 0x67, 0x3e, 0x3a, 0x28, 0x3}, {0x39, 0xb9, 0xc, 0xbe, 0xc7, 0x1d, 0x24, 0x48, 0x80, 0x30, 0x63, 0x8b, 0x4d, 0x9b, 0xf1, 0x32, 0x8, 0x93, 0x28, 0x2, 0xd, 0xc9, 0xdf, 0xd3, 0x45, 0x19, 0x27, 0x46, 0x68, 0x29, 0xe1, 0x5}, }, { {0x50, 0x45, 0x2c, 0x24, 0xc8, 0xbb, 0xbf, 0xad, 0xd9, 0x81, 0x30, 0xd0, 0xec, 0xc, 0xc8, 0xbc, 0x92, 0xdf, 0xc8, 0xf5, 0xa6, 0x66, 0x35, 0x84, 0x4c, 0xce, 0x58, 0x82, 0xd3, 0x25, 0xcf, 0x78}, {0x5a, 0x49, 0x9c, 0x2d, 0xb3, 0xee, 0x82, 0xba, 0x7c, 0xb9, 0x2b, 0xf1, 0xfc, 0xc8, 0xef, 0xce, 0xe0, 0xd1, 0xb5, 0x93, 0xae, 0xab, 0x2d, 0xb0, 0x9b, 0x8d, 0x69, 0x13, 0x9c, 0xc, 0xc0, 0x39}, {0x68, 0x9d, 0x48, 0x31, 0x8e, 0x6b, 0xae, 0x15, 0x87, 0xf0, 0x2b, 0x9c, 0xab, 0x1c, 0x85, 0xaa, 0x5, 0xfa, 0x4e, 0xf0, 0x97, 0x5a, 0xa7, 0xc9, 0x32, 0xf8, 0x3f, 0x6b, 0x7, 0x52, 0x6b, 0x0}, }, { {0x2d, 0x8, 0xce, 0xb9, 0x16, 0x7e, 0xcb, 0xf5, 0x29, 0xbc, 0x7a, 0x41, 0x4c, 0xf1, 0x7, 0x34, 0xab, 0xa7, 0xf4, 0x2b, 0xce, 0x6b, 0xb3, 0xd4, 0xce, 0x75, 0x9f, 0x1a, 0x56, 0xe9, 0xe2, 0x7d}, {0x1c, 0x78, 0x95, 0x9d, 0xe1, 0xcf, 0xe0, 0x29, 0xe2, 0x10, 0x63, 0x96, 0x18, 0xdf, 0x81, 0xb6, 0x39, 0x6b, 0x51, 0x70, 0xd3, 0x39, 0xdf, 0x57, 0x22, 0x61, 0xc7, 0x3b, 0x44, 0xe3, 0x57, 0x4d}, {0xcb, 0x5e, 0xa5, 0xb6, 0xf4, 0xd4, 0x70, 0xde, 0x99, 0xdb, 0x85, 0x5d, 0x7f, 0x52, 0x1, 0x48, 0x81, 0x9a, 0xee, 0xd3, 0x40, 0xc4, 0xc9, 0xdb, 0xed, 0x29, 0x60, 0x1a, 0xaf, 0x90, 0x2a, 0x6b}, }, { {0xa, 0xd8, 0xb2, 0x5b, 0x24, 0xf3, 0xeb, 0x77, 0x9b, 0x7, 0xb9, 0x2f, 0x47, 0x1b, 0x30, 0xd8, 0x33, 0x73, 0xee, 0x4c, 0xf2, 0xe6, 0x47, 0xc6, 0x9, 0x21, 0x6c, 0x27, 0xc8, 0x12, 0x58, 0x46}, {0x97, 0x1e, 0xe6, 0x9a, 0xfc, 0xf4, 0x23, 0x69, 0xd1, 0x5f, 0x3f, 0xe0, 0x1d, 0x28, 0x35, 0x57, 0x2d, 0xd1, 0xed, 0xe6, 0x43, 0xae, 0x64, 0xa7, 0x4a, 0x3e, 0x2d, 0xd1, 0xe9, 0xf4, 0xd8, 0x5f}, {0xd9, 0x62, 0x10, 0x2a, 0xb2, 0xbe, 0x43, 0x4d, 0x16, 0xdc, 0x31, 0x38, 0x75, 0xfb, 0x65, 0x70, 0xd7, 0x68, 0x29, 0xde, 0x7b, 0x4a, 0xd, 0x18, 0x90, 0x67, 0xb1, 0x1c, 0x2b, 0x2c, 0xb3, 0x5}, }, { {0x95, 0x81, 0xd5, 0x7a, 0x2c, 0xa4, 0xfc, 0xf7, 0xcc, 0xf3, 0x33, 0x43, 0x6e, 0x28, 0x14, 0x32, 0x9d, 0x97, 0xb, 0x34, 0xd, 0x9d, 0xc2, 0xb6, 0xe1, 0x7, 0x73, 0x56, 0x48, 0x1a, 0x77, 0x31}, {0xfd, 0xa8, 0x4d, 0xd2, 0xcc, 0x5e, 0xc0, 0xc8, 0x83, 0xef, 0xdf, 0x5, 0xac, 0x1a, 0xcf, 0xa1, 0x61, 0xcd, 0xf9, 0x7d, 0xf2, 0xef, 0xbe, 
0xdb, 0x99, 0x1e, 0x47, 0x7b, 0xa3, 0x56, 0x55, 0x3b}, {0x82, 0xd4, 0x4d, 0xe1, 0x24, 0xc5, 0xb0, 0x32, 0xb6, 0xa4, 0x2b, 0x1a, 0x54, 0x51, 0xb3, 0xed, 0xf3, 0x5a, 0x2b, 0x28, 0x48, 0x60, 0xd1, 0xa3, 0xeb, 0x36, 0x73, 0x7a, 0xd2, 0x79, 0xc0, 0x4f}, }, { {0xd, 0xc5, 0x86, 0xc, 0x44, 0x8b, 0x34, 0xdc, 0x51, 0xe6, 0x94, 0xcc, 0xc9, 0xcb, 0x37, 0x13, 0xb9, 0x3c, 0x3e, 0x64, 0x4d, 0xf7, 0x22, 0x64, 0x8, 0xcd, 0xe3, 0xba, 0xc2, 0x70, 0x11, 0x24}, {0x7f, 0x2f, 0xbf, 0x89, 0xb0, 0x38, 0xc9, 0x51, 0xa7, 0xe9, 0xdf, 0x2, 0x65, 0xbd, 0x97, 0x24, 0x53, 0xe4, 0x80, 0x78, 0x9c, 0xc0, 0xff, 0xff, 0x92, 0x8e, 0xf9, 0xca, 0xce, 0x67, 0x45, 0x12}, {0xb4, 0x73, 0xc4, 0xa, 0x86, 0xab, 0xf9, 0x3f, 0x35, 0xe4, 0x13, 0x1, 0xee, 0x1d, 0x91, 0xf0, 0xaf, 0xc4, 0xc6, 0xeb, 0x60, 0x50, 0xe7, 0x4a, 0xd, 0x0, 0x87, 0x6c, 0x96, 0x12, 0x86, 0x3f}, }, }, { { {0x13, 0x8d, 0x4, 0x36, 0xfa, 0xfc, 0x18, 0x9c, 0xdd, 0x9d, 0x89, 0x73, 0xb3, 0x9d, 0x15, 0x29, 0xaa, 0xd0, 0x92, 0x9f, 0xb, 0x35, 0x9f, 0xdc, 0xd4, 0x19, 0x8a, 0x87, 0xee, 0x7e, 0xf5, 0x26}, {0xde, 0xd, 0x2a, 0x78, 0xc9, 0xc, 0x9a, 0x55, 0x85, 0x83, 0x71, 0xea, 0xb2, 0xcd, 0x1d, 0x55, 0x8c, 0x23, 0xef, 0x31, 0x5b, 0x86, 0x62, 0x7f, 0x3d, 0x61, 0x73, 0x79, 0x76, 0xa7, 0x4a, 0x50}, {0xb1, 0xef, 0x87, 0x56, 0xd5, 0x2c, 0xab, 0xc, 0x7b, 0xf1, 0x7a, 0x24, 0x62, 0xd1, 0x80, 0x51, 0x67, 0x24, 0x5a, 0x4f, 0x34, 0x5a, 0xc1, 0x85, 0x69, 0x30, 0xba, 0x9d, 0x3d, 0x94, 0x41, 0x40}, }, { {0xdd, 0xaa, 0x6c, 0xa2, 0x43, 0x77, 0x21, 0x4b, 0xce, 0xb7, 0x8a, 0x64, 0x24, 0xb4, 0xa6, 0x47, 0xe3, 0xc9, 0xfb, 0x3, 0x7a, 0x4f, 0x1d, 0xcb, 0x19, 0xd0, 0x0, 0x98, 0x42, 0x31, 0xd9, 0x12}, {0x96, 0xcc, 0xeb, 0x43, 0xba, 0xee, 0xc0, 0xc3, 0xaf, 0x9c, 0xea, 0x26, 0x9c, 0x9c, 0x74, 0x8d, 0xc6, 0xcc, 0x77, 0x1c, 0xee, 0x95, 0xfa, 0xd9, 0xf, 0x34, 0x84, 0x76, 0xd9, 0xa1, 0x20, 0x14}, {0x4f, 0x59, 0x37, 0xd3, 0x99, 0x77, 0xc6, 0x0, 0x7b, 0xa4, 0x3a, 0xb2, 0x40, 0x51, 0x3c, 0x5e, 0x95, 0xf3, 0x5f, 0xe3, 0x54, 0x28, 0x18, 0x44, 0x12, 0xa0, 0x59, 0x43, 0x31, 0x92, 0x4f, 0x1b}, }, { {0xb1, 0x66, 0x98, 0xa4, 0x30, 0x30, 0xcf, 0x33, 0x59, 0x48, 0x5f, 0x21, 0xd2, 0x73, 0x1f, 0x25, 0xf6, 0xf4, 0xde, 0x51, 0x40, 0xaa, 0x82, 0xab, 0xf6, 0x23, 0x9a, 0x6f, 0xd5, 0x91, 0xf1, 0x5f}, {0x51, 0x9, 0x15, 0x89, 0x9d, 0x10, 0x5c, 0x3e, 0x6a, 0x69, 0xe9, 0x2d, 0x91, 0xfa, 0xce, 0x39, 0x20, 0x30, 0x5f, 0x97, 0x3f, 0xe4, 0xea, 0x20, 0xae, 0x2d, 0x13, 0x7f, 0x2a, 0x57, 0x9b, 0x23}, {0x68, 0x90, 0x2d, 0xac, 0x33, 0xd4, 0x9e, 0x81, 0x23, 0x85, 0xc9, 0x5f, 0x79, 0xab, 0x83, 0x28, 0x3d, 0xeb, 0x93, 0x55, 0x80, 0x72, 0x45, 0xef, 0xcb, 0x36, 0x8f, 0x75, 0x6a, 0x52, 0xc, 0x2}, }, { {0x89, 0xcc, 0x42, 0xf0, 0x59, 0xef, 0x31, 0xe9, 0xb6, 0x4b, 0x12, 0x8e, 0x9d, 0x9c, 0x58, 0x2c, 0x97, 0x59, 0xc7, 0xae, 0x8a, 0xe1, 0xc8, 0xad, 0xc, 0xc5, 0x2, 0x56, 0xa, 0xfe, 0x2c, 0x45}, {0xbc, 0xdb, 0xd8, 0x9e, 0xf8, 0x34, 0x98, 0x77, 0x6c, 0xa4, 0x7c, 0xdc, 0xf9, 0xaa, 0xf2, 0xc8, 0x74, 0xb0, 0xe1, 0xa3, 0xdc, 0x4c, 0x52, 0xa9, 0x77, 0x38, 0x31, 0x15, 0x46, 0xcc, 0xaa, 0x2}, {0xdf, 0x77, 0x78, 0x64, 0xa0, 0xf7, 0xa0, 0x86, 0x9f, 0x7c, 0x60, 0xe, 0x27, 0x64, 0xc4, 0xbb, 0xc9, 0x11, 0xfb, 0xf1, 0x25, 0xea, 0x17, 0xab, 0x7b, 0x87, 0x4b, 0x30, 0x7b, 0x7d, 0xfb, 0x4c}, }, { {0x12, 0xef, 0x89, 0x97, 0xc2, 0x99, 0x86, 0xe2, 0xd, 0x19, 0x57, 0xdf, 0x71, 0xcd, 0x6e, 0x2b, 0xd0, 0x70, 0xc9, 0xec, 0x57, 0xc8, 0x43, 0xc3, 0xc5, 0x3a, 0x4d, 0x43, 0xbc, 0x4c, 0x1d, 0x5b}, {0xfe, 0x75, 0x9b, 0xb8, 0x6c, 0x3d, 0xb4, 0x72, 0x80, 0xdc, 0x6a, 0x9c, 0xd9, 0x94, 0xc6, 0x54, 0x9f, 0x4c, 0xe3, 0x3e, 0x37, 0xaa, 0xc3, 0xb8, 0x64, 0x53, 0x7, 0x39, 0x2b, 0x62, 0xb4, 0x14}, 
{0x26, 0x9f, 0xa, 0xcc, 0x15, 0x26, 0xfb, 0xb6, 0xe5, 0xcc, 0x8d, 0xb8, 0x2b, 0xe, 0x4f, 0x3a, 0x5, 0xa7, 0x69, 0x33, 0x8b, 0x49, 0x1, 0x13, 0xd1, 0x2d, 0x59, 0x58, 0x12, 0xf7, 0x98, 0x2f}, }, { {0x1, 0xa7, 0x54, 0x4f, 0x44, 0xae, 0x12, 0x2e, 0xde, 0xd7, 0xcb, 0xa9, 0xf0, 0x3e, 0xfe, 0xfc, 0xe0, 0x5d, 0x83, 0x75, 0xd, 0x89, 0xbf, 0xce, 0x54, 0x45, 0x61, 0xe7, 0xe9, 0x62, 0x80, 0x1d}, {0x56, 0x9e, 0xf, 0xb5, 0x4c, 0xa7, 0x94, 0xc, 0x20, 0x13, 0x8e, 0x8e, 0xa9, 0xf4, 0x1f, 0x5b, 0x67, 0xf, 0x30, 0x82, 0x21, 0xcc, 0x2a, 0x9a, 0xf9, 0xaa, 0x6, 0xd8, 0x49, 0xe2, 0x6a, 0x3a}, {0x5a, 0x7c, 0x90, 0xa9, 0x85, 0xda, 0x7a, 0x65, 0x62, 0xf, 0xb9, 0x91, 0xb5, 0xa8, 0xe, 0x1a, 0xe9, 0xb4, 0x34, 0xdf, 0xfb, 0x1d, 0xe, 0x8d, 0xf3, 0x5f, 0xf2, 0xae, 0xe8, 0x8c, 0x8b, 0x29}, }, { {0xde, 0x65, 0x21, 0xa, 0xea, 0x72, 0x7a, 0x83, 0xf6, 0x79, 0xcf, 0xb, 0xb4, 0x7, 0xab, 0x3f, 0x70, 0xae, 0x38, 0x77, 0xc7, 0x36, 0x16, 0x52, 0xdc, 0xd7, 0xa7, 0x3, 0x18, 0x27, 0xa6, 0x6b}, {0xb2, 0xc, 0xf7, 0xef, 0x53, 0x79, 0x92, 0x2a, 0x76, 0x70, 0x15, 0x79, 0x2a, 0xc9, 0x89, 0x4b, 0x6a, 0xcf, 0xa7, 0x30, 0x7a, 0x45, 0x18, 0x94, 0x85, 0xe4, 0x5c, 0x4d, 0x40, 0xa8, 0xb8, 0x34}, {0x35, 0x33, 0x69, 0x83, 0xb5, 0xec, 0x6e, 0xc2, 0xfd, 0xfe, 0xb5, 0x63, 0xdf, 0x13, 0xa8, 0xd5, 0x73, 0x25, 0xb2, 0xa4, 0x9a, 0xaa, 0x93, 0xa2, 0x6a, 0x1c, 0x5e, 0x46, 0xdd, 0x2b, 0xd6, 0x71}, }, { {0xf5, 0x5e, 0xf7, 0xb1, 0xda, 0xb5, 0x2d, 0xcd, 0xf5, 0x65, 0xb0, 0x16, 0xcf, 0x95, 0x7f, 0xd7, 0x85, 0xf0, 0x49, 0x3f, 0xea, 0x1f, 0x57, 0x14, 0x3d, 0x2b, 0x2b, 0x26, 0x21, 0x36, 0x33, 0x1c}, {0x80, 0xdf, 0x78, 0xd3, 0x28, 0xcc, 0x33, 0x65, 0xb4, 0xa4, 0xf, 0xa, 0x79, 0x43, 0xdb, 0xf6, 0x5a, 0xda, 0x1, 0xf7, 0xf9, 0x5f, 0x64, 0xe3, 0xa4, 0x2b, 0x17, 0xf3, 0x17, 0xf3, 0xd5, 0x74}, {0x81, 0xca, 0xd9, 0x67, 0x54, 0xe5, 0x6f, 0xa8, 0x37, 0x8c, 0x29, 0x2b, 0x75, 0x7c, 0x8b, 0x39, 0x3b, 0x62, 0xac, 0xe3, 0x92, 0x8, 0x6d, 0xda, 0x8c, 0xd9, 0xe9, 0x47, 0x45, 0xcc, 0xeb, 0x4a}, }, }, { { {0x10, 0xb6, 0x54, 0x73, 0x9e, 0x8d, 0x40, 0xb, 0x6e, 0x5b, 0xa8, 0x5b, 0x53, 0x32, 0x6b, 0x80, 0x7, 0xa2, 0x58, 0x4a, 0x3, 0x3a, 0xe6, 0xdb, 0x2c, 0xdf, 0xa1, 0xc9, 0xdd, 0xd9, 0x3b, 0x17}, {0xc9, 0x1, 0x6d, 0x27, 0x1b, 0x7, 0xf0, 0x12, 0x70, 0x8c, 0xc4, 0x86, 0xc5, 0xba, 0xb8, 0xe7, 0xa9, 0xfb, 0xd6, 0x71, 0x9b, 0x12, 0x8, 0x53, 0x92, 0xb7, 0x3d, 0x5a, 0xf9, 0xfb, 0x88, 0x5d}, {0xdf, 0x72, 0x58, 0xfe, 0x1e, 0xf, 0x50, 0x2b, 0xc1, 0x18, 0x39, 0xd4, 0x2e, 0x58, 0xd6, 0x58, 0xe0, 0x3a, 0x67, 0xc9, 0x8e, 0x27, 0xed, 0xe6, 0x19, 0xa3, 0x9e, 0xb1, 0x13, 0xcd, 0xe1, 0x6}, }, { {0x53, 0x3, 0x5b, 0x9e, 0x62, 0xaf, 0x2b, 0x47, 0x47, 0x4, 0x8d, 0x27, 0x90, 0xb, 0xaa, 0x3b, 0x27, 0xbf, 0x43, 0x96, 0x46, 0x5f, 0x78, 0xc, 0x13, 0x7b, 0x83, 0x8d, 0x1a, 0x6a, 0x3a, 0x7f}, {0x23, 0x6f, 0x16, 0x6f, 0x51, 0xad, 0xd0, 0x40, 0xbe, 0x6a, 0xab, 0x1f, 0x93, 0x32, 0x8e, 0x11, 0x8e, 0x8, 0x4d, 0xa0, 0x14, 0x5e, 0xe3, 0x3f, 0x66, 0x62, 0xe1, 0x26, 0x35, 0x60, 0x80, 0x30}, {0xb, 0x80, 0x3d, 0x5d, 0x39, 0x44, 0xe6, 0xf7, 0xf6, 0xed, 0x1, 0xc9, 0x55, 0xd5, 0xa8, 0x95, 0x39, 0x63, 0x2c, 0x59, 0x30, 0x78, 0xcd, 0x68, 0x7e, 0x30, 0x51, 0x2e, 0xed, 0xfd, 0xd0, 0x30}, }, { {0x50, 0x47, 0xb8, 0x68, 0x1e, 0x97, 0xb4, 0x9c, 0xcf, 0xbb, 0x64, 0x66, 0x29, 0x72, 0x95, 0xa0, 0x2b, 0x41, 0xfa, 0x72, 0x26, 0xe7, 0x8d, 0x5c, 0xd9, 0x89, 0xc5, 0x51, 0x43, 0x8, 0x15, 0x46}, {0xb3, 0x33, 0x12, 0xf2, 0x1a, 0x4d, 0x59, 0xe0, 0x9c, 0x4d, 0xcc, 0xf0, 0x8e, 0xe7, 0xdb, 0x1b, 0x77, 0x9a, 0x49, 0x8f, 0x7f, 0x18, 0x65, 0x69, 0x68, 0x98, 0x9, 0x2c, 0x20, 0x14, 0x92, 0xa}, {0x2e, 0xa0, 0xb9, 0xae, 0xc0, 0x19, 0x90, 0xbc, 0xae, 0x4c, 0x3, 
0x16, 0xd, 0x11, 0xc7, 0x55, 0xec, 0x32, 0x99, 0x65, 0x1, 0xf5, 0x6d, 0xe, 0xfe, 0x5d, 0xca, 0x95, 0x28, 0xd, 0xca, 0x3b}, }, { {0xbf, 0x1, 0xcc, 0x9e, 0xb6, 0x8e, 0x68, 0x9c, 0x6f, 0x89, 0x44, 0xa6, 0xad, 0x83, 0xbc, 0xf0, 0xe2, 0x9f, 0x7a, 0x5f, 0x5f, 0x95, 0x2d, 0xca, 0x41, 0x82, 0xf2, 0x8d, 0x3, 0xb4, 0xa8, 0x4e}, {0xa4, 0x62, 0x5d, 0x3c, 0xbc, 0x31, 0xf0, 0x40, 0x60, 0x7a, 0xf0, 0xcf, 0x3e, 0x8b, 0xfc, 0x19, 0x45, 0xb5, 0xf, 0x13, 0xa2, 0x3d, 0x18, 0x98, 0xcd, 0x13, 0x8f, 0xae, 0xdd, 0xde, 0x31, 0x56}, {0x2, 0xd2, 0xca, 0xf1, 0xa, 0x46, 0xed, 0x2a, 0x83, 0xee, 0x8c, 0xa4, 0x5, 0x53, 0x30, 0x46, 0x5f, 0x1a, 0xf1, 0x49, 0x45, 0x77, 0x21, 0x91, 0x63, 0xa4, 0x2c, 0x54, 0x30, 0x9, 0xce, 0x24}, }, { {0x85, 0xb, 0xf3, 0xfd, 0x55, 0xa1, 0xcf, 0x3f, 0xa4, 0x2e, 0x37, 0x36, 0x8e, 0x16, 0xf7, 0xd2, 0x44, 0xf8, 0x92, 0x64, 0xde, 0x64, 0xe0, 0xb2, 0x80, 0x42, 0x4f, 0x32, 0xa7, 0x28, 0x99, 0x54}, {0x6, 0xc1, 0x6, 0xfd, 0xf5, 0x90, 0xe8, 0x1f, 0xf2, 0x10, 0x88, 0x5d, 0x35, 0x68, 0xc4, 0xb5, 0x3e, 0xaf, 0x8c, 0x6e, 0xfe, 0x8, 0x78, 0x82, 0x4b, 0xd7, 0x6, 0x8a, 0xc2, 0xe3, 0xd4, 0x41}, {0x2e, 0x1a, 0xee, 0x63, 0xa7, 0x32, 0x6e, 0xf2, 0xea, 0xfd, 0x5f, 0xd2, 0xb7, 0xe4, 0x91, 0xae, 0x69, 0x4d, 0x7f, 0xd1, 0x3b, 0xd3, 0x3b, 0xbc, 0x6a, 0xff, 0xdc, 0xc0, 0xde, 0x66, 0x1b, 0x49}, }, { {0xa1, 0x64, 0xda, 0xd0, 0x8e, 0x4a, 0xf0, 0x75, 0x4b, 0x28, 0xe2, 0x67, 0xaf, 0x2c, 0x22, 0xed, 0xa4, 0x7b, 0x7b, 0x1f, 0x79, 0xa3, 0x34, 0x82, 0x67, 0x8b, 0x1, 0xb7, 0xb0, 0xb8, 0xf6, 0x4c}, {0xa7, 0x32, 0xea, 0xc7, 0x3d, 0xb1, 0xf5, 0x98, 0x98, 0xdb, 0x16, 0x7e, 0xcc, 0xf8, 0xd5, 0xe3, 0x47, 0xd9, 0xf8, 0xcb, 0x52, 0xbf, 0xa, 0xac, 0xac, 0xe4, 0x5e, 0xc8, 0xd0, 0x38, 0xf3, 0x8}, {0xbd, 0x73, 0x1a, 0x99, 0x21, 0xa8, 0x83, 0xc3, 0x7a, 0xc, 0x32, 0xdf, 0x1, 0xbc, 0x27, 0xab, 0x63, 0x70, 0x77, 0x84, 0x1b, 0x33, 0x3d, 0xc1, 0x99, 0x8a, 0x7, 0xeb, 0x82, 0x4a, 0xd, 0x53}, }, { {0x9e, 0xbf, 0x9a, 0x6c, 0x45, 0x73, 0x69, 0x6d, 0x80, 0xa8, 0x0, 0x49, 0xfc, 0xb2, 0x7f, 0x25, 0x50, 0xb8, 0xcf, 0xc8, 0x12, 0xf4, 0xac, 0x2b, 0x5b, 0xbd, 0xbf, 0xc, 0xe0, 0xe7, 0xb3, 0xd}, {0x25, 0x48, 0xf9, 0xe1, 0x30, 0x36, 0x4c, 0x0, 0x5a, 0x53, 0xab, 0x8c, 0x26, 0x78, 0x2d, 0x7e, 0x8b, 0xff, 0x84, 0xcc, 0x23, 0x23, 0x48, 0xc7, 0xb9, 0x70, 0x17, 0x10, 0x3f, 0x75, 0xea, 0x65}, {0x63, 0x63, 0x9, 0xe2, 0x3e, 0xfc, 0x66, 0x3d, 0x6b, 0xcb, 0xb5, 0x61, 0x7f, 0x2c, 0xd6, 0x81, 0x1a, 0x3b, 0x44, 0x13, 0x42, 0x4, 0xbe, 0xf, 0xdb, 0xa1, 0xe1, 0x21, 0x19, 0xec, 0xa4, 0x2}, }, { {0x5f, 0x79, 0xcf, 0xf1, 0x62, 0x61, 0xc8, 0xf5, 0xf2, 0x57, 0xee, 0x26, 0x19, 0x86, 0x8c, 0x11, 0x78, 0x35, 0x6, 0x1c, 0x85, 0x24, 0x21, 0x17, 0xcf, 0x7f, 0x6, 0xec, 0x5d, 0x2b, 0xd1, 0x36}, {0xa2, 0xb8, 0x24, 0x3b, 0x9a, 0x25, 0xe6, 0x5c, 0xb8, 0xa0, 0xaf, 0x45, 0xcc, 0x7a, 0x57, 0xb8, 0x37, 0x70, 0xa0, 0x8b, 0xe8, 0xe6, 0xcb, 0xcc, 0xbf, 0x9, 0x78, 0x12, 0x51, 0x3c, 0x14, 0x3d}, {0x57, 0x45, 0x15, 0x79, 0x91, 0x27, 0x6d, 0x12, 0xa, 0x3a, 0x78, 0xfc, 0x5c, 0x8f, 0xe4, 0xd5, 0xac, 0x9b, 0x17, 0xdf, 0xe8, 0xb6, 0xbd, 0x36, 0x59, 0x28, 0xa8, 0x5b, 0x88, 0x17, 0xf5, 0x2e}, }, }, { { {0x51, 0x2f, 0x5b, 0x30, 0xfb, 0xbf, 0xee, 0x96, 0xb8, 0x96, 0x95, 0x88, 0xad, 0x38, 0xf9, 0xd3, 0x25, 0xdd, 0xd5, 0x46, 0xc7, 0x2d, 0xf5, 0xf0, 0x95, 0x0, 0x3a, 0xbb, 0x90, 0x82, 0x96, 0x57}, {0xdc, 0xae, 0x58, 0x8c, 0x4e, 0x97, 0x37, 0x46, 0xa4, 0x41, 0xf0, 0xab, 0xfb, 0x22, 0xef, 0xb9, 0x8a, 0x71, 0x80, 0xe9, 0x56, 0xd9, 0x85, 0xe1, 0xa6, 0xa8, 0x43, 0xb1, 0xfa, 0x78, 0x1b, 0x2f}, {0x1, 0xe1, 0x20, 0xa, 0x43, 0xb8, 0x1a, 0xf7, 0x47, 0xec, 0xf0, 0x24, 0x8d, 0x65, 0x93, 0xf3, 0xd1, 0xee, 0xe2, 0x6e, 0xa8, 0x9, 
0x75, 0xcf, 0xe1, 0xa3, 0x2a, 0xdc, 0x35, 0x3e, 0xc4, 0x7d}, }, { {0x18, 0x97, 0x3e, 0x27, 0x5c, 0x2a, 0x78, 0x5a, 0x94, 0xfd, 0x4e, 0x5e, 0x99, 0xc6, 0x76, 0x35, 0x3e, 0x7d, 0x23, 0x1f, 0x5, 0xd8, 0x2e, 0xf, 0x99, 0xa, 0xd5, 0x82, 0x1d, 0xb8, 0x4f, 0x4}, {0xc3, 0xd9, 0x7d, 0x88, 0x65, 0x66, 0x96, 0x85, 0x55, 0x53, 0xb0, 0x4b, 0x31, 0x9b, 0xf, 0xc9, 0xb1, 0x79, 0x20, 0xef, 0xf8, 0x8d, 0xe0, 0xc6, 0x2f, 0xc1, 0x8c, 0x75, 0x16, 0x20, 0xf7, 0x7e}, {0xd9, 0xe3, 0x7, 0xa9, 0xc5, 0x18, 0xdf, 0xc1, 0x59, 0x63, 0x4c, 0xce, 0x1d, 0x37, 0xb3, 0x57, 0x49, 0xbb, 0x1, 0xb2, 0x34, 0x45, 0x70, 0xca, 0x2e, 0xdd, 0x30, 0x9c, 0x3f, 0x82, 0x79, 0x7f}, }, { {0xba, 0x87, 0xf5, 0x68, 0xf0, 0x1f, 0x9c, 0x6a, 0xde, 0xc8, 0x50, 0x0, 0x4e, 0x89, 0x27, 0x8, 0xe7, 0x5b, 0xed, 0x7d, 0x55, 0x99, 0xbf, 0x3c, 0xf0, 0xd6, 0x6, 0x1c, 0x43, 0xb0, 0xa9, 0x64}, {0xe8, 0x13, 0xb5, 0xa3, 0x39, 0xd2, 0x34, 0x83, 0xd8, 0xa8, 0x1f, 0xb9, 0xd4, 0x70, 0x36, 0xc1, 0x33, 0xbd, 0x90, 0xf5, 0x36, 0x41, 0xb5, 0x12, 0xb4, 0xd9, 0x84, 0xd7, 0x73, 0x3, 0x4e, 0xa}, {0x19, 0x29, 0x7d, 0x5b, 0xa1, 0xd6, 0xb3, 0x2e, 0x35, 0x82, 0x3a, 0xd5, 0xa0, 0xf6, 0xb4, 0xb0, 0x47, 0x5d, 0xa4, 0x89, 0x43, 0xce, 0x56, 0x71, 0x6c, 0x34, 0x18, 0xce, 0xa, 0x7d, 0x1a, 0x7}, }, { {0x31, 0x44, 0xe1, 0x20, 0x52, 0x35, 0xc, 0xcc, 0x41, 0x51, 0xb1, 0x9, 0x7, 0x95, 0x65, 0xd, 0x36, 0x5f, 0x9d, 0x20, 0x1b, 0x62, 0xf5, 0x9a, 0xd3, 0x55, 0x77, 0x61, 0xf7, 0xbc, 0x69, 0x7c}, {0xb, 0xba, 0x87, 0xc8, 0xaa, 0x2d, 0x7, 0xd3, 0xee, 0x62, 0xa5, 0xbf, 0x5, 0x29, 0x26, 0x1, 0x8b, 0x76, 0xef, 0xc0, 0x2, 0x30, 0x54, 0xcf, 0x9c, 0x7e, 0xea, 0x46, 0x71, 0xcc, 0x3b, 0x2c}, {0x5f, 0x29, 0xe8, 0x4, 0xeb, 0xd7, 0xf0, 0x7, 0x7d, 0xf3, 0x50, 0x2f, 0x25, 0x18, 0xdb, 0x10, 0xd7, 0x98, 0x17, 0x17, 0xa3, 0xa9, 0x51, 0xe9, 0x1d, 0xa5, 0xac, 0x22, 0x73, 0x9a, 0x5a, 0x6f}, }, { {0xbe, 0x44, 0xd9, 0xa3, 0xeb, 0xd4, 0x29, 0xe7, 0x9e, 0xaf, 0x78, 0x80, 0x40, 0x9, 0x9e, 0x8d, 0x3, 0x9c, 0x86, 0x47, 0x7a, 0x56, 0x25, 0x45, 0x24, 0x3b, 0x8d, 0xee, 0x80, 0x96, 0xab, 0x2}, {0xc5, 0xc6, 0x41, 0x2f, 0xc, 0x0, 0xa1, 0x8b, 0x9b, 0xfb, 0xfe, 0xc, 0xc1, 0x79, 0x9f, 0xc4, 0x9f, 0x1c, 0xc5, 0x3c, 0x70, 0x47, 0xfa, 0x4e, 0xca, 0xaf, 0x47, 0xe1, 0xa2, 0x21, 0x4e, 0x49}, {0x9a, 0xd, 0xe5, 0xdd, 0x85, 0x8a, 0xa4, 0xef, 0x49, 0xa2, 0xb9, 0xf, 0x4e, 0x22, 0x9a, 0x21, 0xd9, 0xf6, 0x1e, 0xd9, 0x1d, 0x1f, 0x9, 0xfa, 0x34, 0xbb, 0x46, 0xea, 0xcb, 0x76, 0x5d, 0x6b}, }, { {0x22, 0x25, 0x78, 0x1e, 0x17, 0x41, 0xf9, 0xe0, 0xd3, 0x36, 0x69, 0x3, 0x74, 0xae, 0xe6, 0xf1, 0x46, 0xc7, 0xfc, 0xd0, 0xa2, 0x3e, 0x8b, 0x40, 0x3e, 0x31, 0xdd, 0x3, 0x9c, 0x86, 0xfb, 0x16}, {0x94, 0xd9, 0xc, 0xec, 0x6c, 0x55, 0x57, 0x88, 0xba, 0x1d, 0xd0, 0x5c, 0x6f, 0xdc, 0x72, 0x64, 0x77, 0xb4, 0x42, 0x8f, 0x14, 0x69, 0x1, 0xaf, 0x54, 0x73, 0x27, 0x85, 0xf6, 0x33, 0xe3, 0xa}, {0x62, 0x9, 0xb6, 0x33, 0x97, 0x19, 0x8e, 0x28, 0x33, 0xe1, 0xab, 0xd8, 0xb4, 0x72, 0xfc, 0x24, 0x3e, 0xd0, 0x91, 0x9, 0xed, 0xf7, 0x11, 0x48, 0x75, 0xd0, 0x70, 0x8f, 0x8b, 0xe3, 0x81, 0x3f}, }, { {0x24, 0xc8, 0x17, 0x5f, 0x35, 0x7f, 0xdb, 0xa, 0xa4, 0x99, 0x42, 0xd7, 0xc3, 0x23, 0xb9, 0x74, 0xf7, 0xea, 0xf8, 0xcb, 0x8b, 0x3e, 0x7c, 0xd5, 0x3d, 0xdc, 0xde, 0x4c, 0xd3, 0xe2, 0xd3, 0xa}, {0xfe, 0xaf, 0xd9, 0x7e, 0xcc, 0xf, 0x91, 0x7f, 0x4b, 0x87, 0x65, 0x24, 0xa1, 0xb8, 0x5c, 0x54, 0x4, 0x47, 0xc, 0x4b, 0xd2, 0x7e, 0x39, 0xa8, 0x93, 0x9, 0xf5, 0x4, 0xc1, 0xf, 0x51, 0x50}, {0x9d, 0x24, 0x6e, 0x33, 0xc5, 0xf, 0xc, 0x6f, 0xd9, 0xcf, 0x31, 0xc3, 0x19, 0xde, 0x5e, 0x74, 0x1c, 0xfe, 0xee, 0x9, 0x0, 0xfd, 0xd6, 0xf2, 0xbe, 0x1e, 0xfa, 0xf0, 0x8b, 0x15, 0x7c, 0x12}, }, { {0x74, 0xb9, 0x51, 
0xae, 0xc4, 0x8f, 0xa2, 0xde, 0x96, 0xfe, 0x4d, 0x74, 0xd3, 0x73, 0x99, 0x1d, 0xa8, 0x48, 0x38, 0x87, 0xb, 0x68, 0x40, 0x62, 0x95, 0xdf, 0x67, 0xd1, 0x79, 0x24, 0xd8, 0x4e}, {0xa2, 0x79, 0x98, 0x2e, 0x42, 0x7c, 0x19, 0xf6, 0x47, 0x36, 0xca, 0x52, 0xd4, 0xdd, 0x4a, 0xa4, 0xcb, 0xac, 0x4e, 0x4b, 0xc1, 0x3f, 0x41, 0x9b, 0x68, 0x4f, 0xef, 0x7, 0x7d, 0xf8, 0x4e, 0x35}, {0x75, 0xd9, 0xc5, 0x60, 0x22, 0xb5, 0xe3, 0xfe, 0xb8, 0xb0, 0x41, 0xeb, 0xfc, 0x2e, 0x35, 0x50, 0x3c, 0x65, 0xf6, 0xa9, 0x30, 0xac, 0x8, 0x88, 0x6d, 0x23, 0x39, 0x5, 0xd2, 0x92, 0x2d, 0x30}, }, }, { { {0x77, 0xf1, 0xe0, 0xe4, 0xb6, 0x6f, 0xbc, 0x2d, 0x93, 0x6a, 0xbd, 0xa4, 0x29, 0xbf, 0xe1, 0x4, 0xe8, 0xf6, 0x7a, 0x78, 0xd4, 0x66, 0x19, 0x5e, 0x60, 0xd0, 0x26, 0xb4, 0x5e, 0x5f, 0xdc, 0xe}, {0x3d, 0x28, 0xa4, 0xbc, 0xa2, 0xc1, 0x13, 0x78, 0xd9, 0x3d, 0x86, 0xa1, 0x91, 0xf0, 0x62, 0xed, 0x86, 0xfa, 0x68, 0xc2, 0xb8, 0xbc, 0xc7, 0xae, 0x4c, 0xae, 0x1c, 0x6f, 0xb7, 0xd3, 0xe5, 0x10}, {0x67, 0x8e, 0xda, 0x53, 0xd6, 0xbf, 0x53, 0x54, 0x41, 0xf6, 0xa9, 0x24, 0xec, 0x1e, 0xdc, 0xe9, 0x23, 0x8a, 0x57, 0x3, 0x3b, 0x26, 0x87, 0xbf, 0x72, 0xba, 0x1c, 0x36, 0x51, 0x6c, 0xb4, 0x45}, }, { {0xe4, 0xe3, 0x7f, 0x8a, 0xdd, 0x4d, 0x9d, 0xce, 0x30, 0xe, 0x62, 0x76, 0x56, 0x64, 0x13, 0xab, 0x58, 0x99, 0xe, 0xb3, 0x7b, 0x4f, 0x59, 0x4b, 0xdf, 0x29, 0x12, 0x32, 0xef, 0xa, 0x1c, 0x5c}, {0xa1, 0x7f, 0x4f, 0x31, 0xbf, 0x2a, 0x40, 0xa9, 0x50, 0xf4, 0x8c, 0x8e, 0xdc, 0xf1, 0x57, 0xe2, 0x84, 0xbe, 0xa8, 0x23, 0x4b, 0xd5, 0xbb, 0x1d, 0x3b, 0x71, 0xcb, 0x6d, 0xa3, 0xbf, 0x77, 0x21}, {0x8f, 0xdb, 0x79, 0xfa, 0xbc, 0x1b, 0x8, 0x37, 0xb3, 0x59, 0x5f, 0xc2, 0x1e, 0x81, 0x48, 0x60, 0x87, 0x24, 0x83, 0x9c, 0x65, 0x76, 0x7a, 0x8, 0xbb, 0xb5, 0x8a, 0x7d, 0x38, 0x19, 0xe6, 0x4a}, }, { {0x83, 0xfb, 0x5b, 0x98, 0x44, 0x7e, 0x11, 0x61, 0x36, 0x31, 0x96, 0x71, 0x2a, 0x46, 0xe0, 0xfc, 0x4b, 0x90, 0x25, 0xd4, 0x48, 0x34, 0xac, 0x83, 0x64, 0x3d, 0xa4, 0x5b, 0xbe, 0x5a, 0x68, 0x75}, {0x2e, 0xa3, 0x44, 0x53, 0xaa, 0xf6, 0xdb, 0x8d, 0x78, 0x40, 0x1b, 0xb4, 0xb4, 0xea, 0x88, 0x7d, 0x60, 0xd, 0x13, 0x4a, 0x97, 0xeb, 0xb0, 0x5e, 0x3, 0x3e, 0xbf, 0x17, 0x1b, 0xd9, 0x0, 0x1a}, {0xb2, 0xf2, 0x61, 0xeb, 0x33, 0x9, 0x96, 0x6e, 0x52, 0x49, 0xff, 0xc9, 0xa8, 0xf, 0x3d, 0x54, 0x69, 0x65, 0xf6, 0x7a, 0x10, 0x75, 0x72, 0xdf, 0xaa, 0xe6, 0xb0, 0x23, 0xb6, 0x29, 0x55, 0x13}, }, { {0xfe, 0x83, 0x2e, 0xe2, 0xbc, 0x16, 0xc7, 0xf5, 0xc1, 0x85, 0x9, 0xe8, 0x19, 0xeb, 0x2b, 0xb4, 0xae, 0x4a, 0x25, 0x14, 0x37, 0xa6, 0x9d, 0xec, 0x13, 0xa6, 0x90, 0x15, 0x5, 0xea, 0x72, 0x59}, {0x18, 0xd5, 0xd1, 0xad, 0xd7, 0xdb, 0xf0, 0x18, 0x11, 0x1f, 0xc1, 0xcf, 0x88, 0x78, 0x9f, 0x97, 0x9b, 0x75, 0x14, 0x71, 0xf0, 0xe1, 0x32, 0x87, 0x1, 0x3a, 0xca, 0x65, 0x1a, 0xb8, 0xb5, 0x79}, {0x11, 0x78, 0x8f, 0xdc, 0x20, 0xac, 0xd4, 0xf, 0xa8, 0x4f, 0x4d, 0xac, 0x94, 0xd2, 0x9a, 0x9a, 0x34, 0x4, 0x36, 0xb3, 0x64, 0x2d, 0x1b, 0xc0, 0xdb, 0x3b, 0x5f, 0x90, 0x95, 0x9c, 0x7e, 0x4f}, }, { {0xfe, 0x99, 0x52, 0x35, 0x3d, 0x44, 0xc8, 0x71, 0xd7, 0xea, 0xeb, 0xdb, 0x1c, 0x3b, 0xcd, 0x8b, 0x66, 0x94, 0xa4, 0xf1, 0x9e, 0x49, 0x92, 0x80, 0xc8, 0xad, 0x44, 0xa1, 0xc4, 0xee, 0x42, 0x19}, {0x2e, 0x30, 0x81, 0x57, 0xbc, 0x4b, 0x67, 0x62, 0xf, 0xdc, 0xad, 0x89, 0x39, 0xf, 0x52, 0xd8, 0xc6, 0xd9, 0xfb, 0x53, 0xae, 0x99, 0x29, 0x8c, 0x4c, 0x8e, 0x63, 0x2e, 0xd9, 0x3a, 0x99, 0x31}, {0x92, 0x49, 0x23, 0xae, 0x19, 0x53, 0xac, 0x7d, 0x92, 0x3e, 0xea, 0xc, 0x91, 0x3d, 0x1b, 0x2c, 0x22, 0x11, 0x3c, 0x25, 0x94, 0xe4, 0x3c, 0x55, 0x75, 0xca, 0xf9, 0x4e, 0x31, 0x65, 0xa, 0x2a}, }, { {0x3a, 0x79, 0x1c, 0x3c, 0xcd, 0x1a, 0x36, 0xcf, 0x3b, 0xbc, 0x35, 0x5a, 
0xac, 0xbc, 0x9e, 0x2f, 0xab, 0xa6, 0xcd, 0xa8, 0xe9, 0x60, 0xe8, 0x60, 0x13, 0x1a, 0xea, 0x6d, 0x9b, 0xc3, 0x5d, 0x5}, {0xc2, 0x27, 0xf9, 0xf7, 0x7f, 0x93, 0xb7, 0x2d, 0x35, 0xa6, 0xd0, 0x17, 0x6, 0x1f, 0x74, 0xdb, 0x76, 0xaf, 0x55, 0x11, 0xa2, 0xf3, 0x82, 0x59, 0xed, 0x2d, 0x7c, 0x64, 0x18, 0xe2, 0xf6, 0x4c}, {0xb6, 0x5b, 0x8d, 0xc2, 0x7c, 0x22, 0x19, 0xb1, 0xab, 0xff, 0x4d, 0x77, 0xbc, 0x4e, 0xe2, 0x7, 0x89, 0x2c, 0xa3, 0xe4, 0xce, 0x78, 0x3c, 0xa8, 0xb6, 0x24, 0xaa, 0x10, 0x77, 0x30, 0x1a, 0x12}, }, { {0xc9, 0x83, 0x74, 0xc7, 0x3e, 0x71, 0x59, 0xd6, 0xaf, 0x96, 0x2b, 0xb8, 0x77, 0xe0, 0xbf, 0x88, 0xd3, 0xbc, 0x97, 0x10, 0x23, 0x28, 0x9e, 0x28, 0x9b, 0x3a, 0xed, 0x6c, 0x4a, 0xb9, 0x7b, 0x52}, {0x97, 0x4a, 0x3, 0x9f, 0x5e, 0x5d, 0xdb, 0xe4, 0x2d, 0xbc, 0x34, 0x30, 0x9, 0xfc, 0x53, 0xe1, 0xb1, 0xd3, 0x51, 0x95, 0x91, 0x46, 0x5, 0x46, 0x2d, 0xe5, 0x40, 0x7a, 0x6c, 0xc7, 0x3f, 0x33}, {0x2e, 0x48, 0x5b, 0x99, 0x2a, 0x99, 0x3d, 0x56, 0x1, 0x38, 0x38, 0x6e, 0x7c, 0xd0, 0x5, 0x34, 0xe5, 0xd8, 0x64, 0x2f, 0xde, 0x35, 0x50, 0x48, 0xf7, 0xa9, 0xa7, 0x20, 0x9b, 0x6, 0x89, 0x6b}, }, { {0x77, 0xdb, 0xc7, 0xb5, 0x8c, 0xfa, 0x82, 0x40, 0x55, 0xc1, 0x34, 0xc7, 0xf8, 0x86, 0x86, 0x6, 0x7e, 0xa5, 0xe7, 0xf6, 0xd9, 0xc8, 0xe6, 0x29, 0xcf, 0x9b, 0x63, 0xa7, 0x8, 0xd3, 0x73, 0x4}, {0xd, 0x22, 0x70, 0x62, 0x41, 0xa0, 0x2a, 0x81, 0x4e, 0x5b, 0x24, 0xf9, 0xfa, 0x89, 0x5a, 0x99, 0x5, 0xef, 0x72, 0x50, 0xce, 0xc4, 0xad, 0xff, 0x73, 0xeb, 0x73, 0xaa, 0x3, 0x21, 0xbc, 0x23}, {0x5, 0x9e, 0x58, 0x3, 0x26, 0x79, 0xee, 0xca, 0x92, 0xc4, 0xdc, 0x46, 0x12, 0x42, 0x4b, 0x2b, 0x4f, 0xa9, 0x1, 0xe6, 0x74, 0xef, 0xa1, 0x2, 0x1a, 0x34, 0x4, 0xde, 0xbf, 0x73, 0x2f, 0x10}, }, }, { { {0x9a, 0x1c, 0x51, 0xb5, 0xe0, 0xda, 0xb4, 0xa2, 0x6, 0xff, 0xff, 0x2b, 0x29, 0x60, 0xc8, 0x7a, 0x34, 0x42, 0x50, 0xf5, 0x5d, 0x37, 0x1f, 0x98, 0x2d, 0xa1, 0x4e, 0xda, 0x25, 0xd7, 0x6b, 0x3f}, {0xc6, 0x45, 0x57, 0x7f, 0xab, 0xb9, 0x18, 0xeb, 0x90, 0xc6, 0x87, 0x57, 0xee, 0x8a, 0x3a, 0x2, 0xa9, 0xaf, 0xf7, 0x2d, 0xda, 0x12, 0x27, 0xb7, 0x3d, 0x1, 0x5c, 0xea, 0x25, 0x7d, 0x59, 0x36}, {0xac, 0x58, 0x60, 0x10, 0x7b, 0x8d, 0x4d, 0x73, 0x5f, 0x90, 0xc6, 0x6f, 0x9e, 0x57, 0x40, 0xd9, 0x2d, 0x93, 0x2, 0x92, 0xf9, 0xf8, 0x66, 0x64, 0xd0, 0xd6, 0x60, 0xda, 0x19, 0xcc, 0x7e, 0x7b}, }, { {0x9b, 0xfa, 0x7c, 0xa7, 0x51, 0x4a, 0xae, 0x6d, 0x50, 0x86, 0xa3, 0xe7, 0x54, 0x36, 0x26, 0x82, 0xdb, 0x82, 0x2d, 0x8f, 0xcd, 0xff, 0xbb, 0x9, 0xba, 0xca, 0xf5, 0x1b, 0x66, 0xdc, 0xbe, 0x3}, {0xd, 0x69, 0x5c, 0x69, 0x3c, 0x37, 0xc2, 0x78, 0x6e, 0x90, 0x42, 0x6, 0x66, 0x2e, 0x25, 0xdd, 0xd2, 0x2b, 0xe1, 0x4a, 0x44, 0x44, 0x1d, 0x95, 0x56, 0x39, 0x74, 0x1, 0x76, 0xad, 0x35, 0x42}, {0xf5, 0x75, 0x89, 0x7, 0xd, 0xcb, 0x58, 0x62, 0x98, 0xf2, 0x89, 0x91, 0x54, 0x42, 0x29, 0x49, 0xe4, 0x6e, 0xe3, 0xe2, 0x23, 0xb4, 0xca, 0xa0, 0xa1, 0x66, 0xf0, 0xcd, 0xb0, 0xe2, 0x7c, 0xe}, }, { {0xf9, 0x70, 0x4b, 0xd9, 0xdf, 0xfe, 0xa6, 0xfe, 0x2d, 0xba, 0xfc, 0xc1, 0x51, 0xc0, 0x30, 0xf1, 0x89, 0xab, 0x2f, 0x7f, 0x7e, 0xd4, 0x82, 0x48, 0xb5, 0xee, 0xec, 0x8a, 0x13, 0x56, 0x52, 0x61}, {0xa3, 0x85, 0x8c, 0xc4, 0x3a, 0x64, 0x94, 0xc4, 0xad, 0x39, 0x61, 0x3c, 0xf4, 0x1d, 0x36, 0xfd, 0x48, 0x4d, 0xe9, 0x3a, 0xdd, 0x17, 0xdb, 0x9, 0x4a, 0x67, 0xb4, 0x8f, 0x5d, 0xa, 0x6e, 0x66}, {0xd, 0xcb, 0x70, 0x48, 0x4e, 0xf6, 0xbb, 0x2a, 0x6b, 0x8b, 0x45, 0xaa, 0xf0, 0xbc, 0x65, 0xcd, 0x5d, 0x98, 0xe8, 0x75, 0xba, 0x4e, 0xbe, 0x9a, 0xe4, 0xde, 0x14, 0xd5, 0x10, 0xc8, 0xb, 0x7f}, }, { {0xa0, 0x13, 0x72, 0x73, 0xad, 0x9d, 0xac, 0x83, 0x98, 0x2e, 0xf7, 0x2e, 0xba, 0xf8, 0xf6, 0x9f, 0x57, 0x69, 0xec, 0x43, 0xdd, 0x2e, 
0x1e, 0x31, 0x75, 0xab, 0xc5, 0xde, 0x7d, 0x90, 0x3a, 0x1d}, {0x6f, 0x13, 0xf4, 0x26, 0xa4, 0x6b, 0x0, 0xb9, 0x35, 0x30, 0xe0, 0x57, 0x9e, 0x36, 0x67, 0x8d, 0x28, 0x3c, 0x46, 0x4f, 0xd9, 0xdf, 0xc8, 0xcb, 0xf5, 0xdb, 0xee, 0xf8, 0xbc, 0x8d, 0x1f, 0xd}, {0xdc, 0x81, 0xd0, 0x3e, 0x31, 0x93, 0x16, 0xba, 0x80, 0x34, 0x1b, 0x85, 0xad, 0x9f, 0x32, 0x29, 0xcb, 0x21, 0x3, 0x3, 0x3c, 0x1, 0x28, 0x1, 0xe3, 0xfd, 0x1b, 0xa3, 0x44, 0x1b, 0x1, 0x0}, }, { {0x5c, 0xa7, 0xa, 0x6a, 0x69, 0x1f, 0x56, 0x16, 0x6a, 0xbd, 0x52, 0x58, 0x5c, 0x72, 0xbf, 0xc1, 0xad, 0x66, 0x79, 0x9a, 0x7f, 0xdd, 0xa8, 0x11, 0x26, 0x10, 0x85, 0xd2, 0xa2, 0x88, 0xd9, 0x63}, {0xc, 0x6c, 0xc6, 0x3f, 0x6c, 0xa0, 0xdf, 0x3f, 0xd2, 0xd, 0xd6, 0x4d, 0x8e, 0xe3, 0x40, 0x5d, 0x71, 0x4d, 0x8e, 0x26, 0x38, 0x8b, 0xe3, 0x7a, 0xe1, 0x57, 0x83, 0x6e, 0x91, 0x8d, 0xc4, 0x3a}, {0x2e, 0x23, 0xbd, 0xaf, 0x53, 0x7, 0x12, 0x0, 0x83, 0xf6, 0xd8, 0xfd, 0xb8, 0xce, 0x2b, 0xe9, 0x91, 0x2b, 0xe7, 0x84, 0xb3, 0x69, 0x16, 0xf8, 0x66, 0xa0, 0x68, 0x23, 0x2b, 0xd5, 0xfa, 0x33}, }, { {0xe8, 0xcf, 0x22, 0xc4, 0xd0, 0xc8, 0x2c, 0x8d, 0xcb, 0x3a, 0xa1, 0x5, 0x7b, 0x4f, 0x2b, 0x7, 0x6f, 0xa5, 0xf6, 0xec, 0xe6, 0xb6, 0xfe, 0xa3, 0xe2, 0x71, 0xa, 0xb9, 0xcc, 0x55, 0xc3, 0x3c}, {0x16, 0x1e, 0xe4, 0xc5, 0xc6, 0x49, 0x6, 0x54, 0x35, 0x77, 0x3f, 0x33, 0x30, 0x64, 0xf8, 0xa, 0x46, 0xe7, 0x5, 0xf3, 0xd2, 0xfc, 0xac, 0xb2, 0xa7, 0xdc, 0x56, 0xa2, 0x29, 0xf4, 0xc0, 0x16}, {0x31, 0x91, 0x3e, 0x90, 0x43, 0x94, 0xb6, 0xe9, 0xce, 0x37, 0x56, 0x7a, 0xcb, 0x94, 0xa4, 0xb8, 0x44, 0x92, 0xba, 0xba, 0xa4, 0xd1, 0x7c, 0xc8, 0x68, 0x75, 0xae, 0x6b, 0x42, 0xaf, 0x1e, 0x63}, }, { {0xe8, 0xd, 0x70, 0xa3, 0xb9, 0x75, 0xd9, 0x47, 0x52, 0x5, 0xf8, 0xe2, 0xfb, 0xc5, 0x80, 0x72, 0xe1, 0x5d, 0xe4, 0x32, 0x27, 0x8f, 0x65, 0x53, 0xb5, 0x80, 0x5f, 0x66, 0x7f, 0x2c, 0x1f, 0x43}, {0x9f, 0xfe, 0x66, 0xda, 0x10, 0x4, 0xe9, 0xb3, 0xa6, 0xe5, 0x16, 0x6c, 0x52, 0x4b, 0xdd, 0x85, 0x83, 0xbf, 0xf9, 0x1e, 0x61, 0x97, 0x3d, 0xbc, 0xb5, 0x19, 0xa9, 0x1e, 0x8b, 0x64, 0x99, 0x55}, {0x19, 0x7b, 0x8f, 0x85, 0x44, 0x63, 0x2, 0xd6, 0x4a, 0x51, 0xea, 0xa1, 0x2f, 0x35, 0xab, 0x14, 0xd7, 0xa9, 0x90, 0x20, 0x1a, 0x44, 0x0, 0x89, 0x26, 0x3b, 0x25, 0x91, 0x5f, 0x71, 0x4, 0x7b}, }, { {0xc6, 0xba, 0xe6, 0xc4, 0x80, 0xc2, 0x76, 0xb3, 0xb, 0x9b, 0x1d, 0x6d, 0xdd, 0xd3, 0xe, 0x97, 0x44, 0xf9, 0xb, 0x45, 0x58, 0x95, 0x9a, 0xb0, 0x23, 0xe2, 0xcd, 0x57, 0xfa, 0xac, 0xd0, 0x48}, {0x43, 0xae, 0xf6, 0xac, 0x28, 0xbd, 0xed, 0x83, 0xb4, 0x7a, 0x5c, 0x7d, 0x8b, 0x7c, 0x35, 0x86, 0x44, 0x2c, 0xeb, 0xb7, 0x69, 0x47, 0x40, 0xc0, 0x3f, 0x58, 0xf6, 0xc2, 0xf5, 0x7b, 0xb3, 0x59}, {0x71, 0xe6, 0xab, 0x7d, 0xe4, 0x26, 0xf, 0xb6, 0x37, 0x3a, 0x2f, 0x62, 0x97, 0xa1, 0xd1, 0xf1, 0x94, 0x3, 0x96, 0xe9, 0x7e, 0xce, 0x8, 0x42, 0xdb, 0x3b, 0x6d, 0x33, 0x91, 0x41, 0x23, 0x16}, }, }, { { {0x40, 0x86, 0xf3, 0x1f, 0xd6, 0x9c, 0x49, 0xdd, 0xa0, 0x25, 0x36, 0x6, 0xc3, 0x9b, 0xcd, 0x29, 0xc3, 0x3d, 0xd7, 0x3d, 0x2, 0xd8, 0xe2, 0x51, 0x31, 0x92, 0x3b, 0x20, 0x7a, 0x70, 0x25, 0x4a}, {0xf6, 0x7f, 0x26, 0xf6, 0xde, 0x99, 0xe4, 0xb9, 0x43, 0x8, 0x2c, 0x74, 0x7b, 0xca, 0x72, 0x77, 0xb1, 0xf2, 0xa4, 0xe9, 0x3f, 0x15, 0xa0, 0x23, 0x6, 0x50, 0xd0, 0xd5, 0xec, 0xdf, 0xdf, 0x2c}, {0x6a, 0xed, 0xf6, 0x53, 0x8a, 0x66, 0xb7, 0x2a, 0xa1, 0x70, 0xd1, 0x1d, 0x58, 0x42, 0x42, 0x30, 0x61, 0x1, 0xe2, 0x3a, 0x4c, 0x14, 0x0, 0x40, 0xfc, 0x49, 0x8e, 0x24, 0x6d, 0x89, 0x21, 0x57}, }, { {0x4e, 0xda, 0xd0, 0xa1, 0x91, 0x50, 0x5d, 0x28, 0x8, 0x3e, 0xfe, 0xb5, 0xa7, 0x6f, 0xaa, 0x4b, 0xb3, 0x93, 0x93, 0xe1, 0x7c, 0x17, 0xe5, 0x63, 0xfd, 0x30, 0xb0, 0xc4, 0xaf, 0x35, 0xc9, 0x3}, 
{0xae, 0x1b, 0x18, 0xfd, 0x17, 0x55, 0x6e, 0xb, 0xb4, 0x63, 0xb9, 0x2b, 0x9f, 0x62, 0x22, 0x90, 0x25, 0x46, 0x6, 0x32, 0xe9, 0xbc, 0x9, 0x55, 0xda, 0x13, 0x3c, 0xf6, 0x74, 0xdd, 0x8e, 0x57}, {0x3d, 0xc, 0x2b, 0x49, 0xc6, 0x76, 0x72, 0x99, 0xfc, 0x5, 0xe2, 0xdf, 0xc4, 0xc2, 0xcc, 0x47, 0x3c, 0x3a, 0x62, 0xdd, 0x84, 0x9b, 0xd2, 0xdc, 0xa2, 0xc7, 0x88, 0x2, 0x59, 0xab, 0xc2, 0x3e}, }, { {0xcb, 0xd1, 0x32, 0xae, 0x9, 0x3a, 0x21, 0xa7, 0xd5, 0xc2, 0xf5, 0x40, 0xdf, 0x87, 0x2b, 0xf, 0x29, 0xab, 0x1e, 0xe8, 0xc6, 0xa4, 0xae, 0xb, 0x5e, 0xac, 0xdb, 0x6a, 0x6c, 0xf6, 0x1b, 0xe}, {0xb9, 0x7b, 0xd8, 0xe4, 0x7b, 0xd2, 0xa0, 0xa1, 0xed, 0x1a, 0x39, 0x61, 0xeb, 0x4d, 0x8b, 0xa9, 0x83, 0x9b, 0xcb, 0x73, 0xd0, 0xdd, 0xa0, 0x99, 0xce, 0xca, 0xf, 0x20, 0x5a, 0xc2, 0xd5, 0x2d}, {0x7e, 0x88, 0x2c, 0x79, 0xe9, 0xd5, 0xab, 0xe2, 0x5d, 0x6d, 0x92, 0xcb, 0x18, 0x0, 0x2, 0x1a, 0x1e, 0x5f, 0xae, 0xba, 0xcd, 0x69, 0xba, 0xbf, 0x5f, 0x8f, 0xe8, 0x5a, 0xb3, 0x48, 0x5, 0x73}, }, { {0x34, 0xe3, 0xd6, 0xa1, 0x4b, 0x9, 0x5b, 0x80, 0x19, 0x3f, 0x35, 0x9, 0x77, 0xf1, 0x3e, 0xbf, 0x2b, 0x70, 0x22, 0x6, 0xcb, 0x6, 0x3f, 0x42, 0xdd, 0x45, 0x78, 0xd8, 0x77, 0x22, 0x5a, 0x58}, {0xee, 0xb8, 0xa8, 0xcb, 0xa3, 0x51, 0x35, 0xc4, 0x16, 0x5f, 0x11, 0xb2, 0x1d, 0x6f, 0xa2, 0x65, 0x50, 0x38, 0x8c, 0xab, 0x52, 0x4f, 0xf, 0x76, 0xca, 0xb8, 0x1d, 0x41, 0x3b, 0x44, 0x43, 0x30}, {0x62, 0x89, 0xd4, 0x33, 0x82, 0x5f, 0x8a, 0xa1, 0x7f, 0x25, 0x78, 0xec, 0xb5, 0xc4, 0x98, 0x66, 0xff, 0x41, 0x3e, 0x37, 0xa5, 0x6f, 0x8e, 0xa7, 0x1f, 0x98, 0xef, 0x50, 0x89, 0x27, 0x56, 0x76}, }, { {0x9d, 0xcf, 0x86, 0xea, 0xa3, 0x73, 0x70, 0xe1, 0xdc, 0x5f, 0x15, 0x7, 0xb7, 0xfb, 0x8c, 0x3a, 0x8e, 0x8a, 0x83, 0x31, 0xfc, 0xe7, 0x53, 0x48, 0x16, 0xf6, 0x13, 0xb6, 0x84, 0xf4, 0xbb, 0x28}, {0xc0, 0xc8, 0x1f, 0xd5, 0x59, 0xcf, 0xc3, 0x38, 0xf2, 0xb6, 0x6, 0x5, 0xfd, 0xd2, 0xed, 0x9b, 0x8f, 0xe, 0x57, 0xab, 0x9f, 0x10, 0xbf, 0x26, 0xa6, 0x46, 0xb8, 0xc1, 0xa8, 0x60, 0x41, 0x3f}, {0x7c, 0x6c, 0x13, 0x6f, 0x5c, 0x2f, 0x61, 0xf2, 0xbe, 0x11, 0xdd, 0xf6, 0x7, 0xd1, 0xea, 0xaf, 0x33, 0x6f, 0xde, 0x13, 0xd2, 0x9a, 0x7e, 0x52, 0x5d, 0xf7, 0x88, 0x81, 0x35, 0xcb, 0x79, 0x1e}, }, { {0x81, 0x81, 0xe0, 0xf5, 0xd8, 0x53, 0xe9, 0x77, 0xd9, 0xde, 0x9d, 0x29, 0x44, 0xc, 0xa5, 0x84, 0xe5, 0x25, 0x45, 0x86, 0xc, 0x2d, 0x6c, 0xdc, 0xf4, 0xf2, 0xd1, 0x39, 0x2d, 0xb5, 0x8a, 0x47}, {0xf1, 0xe3, 0xf7, 0xee, 0xc3, 0x36, 0x34, 0x1, 0xf8, 0x10, 0x9e, 0xfe, 0x7f, 0x6a, 0x8b, 0x82, 0xfc, 0xde, 0xf9, 0xbc, 0xe5, 0x8, 0xf9, 0x7f, 0x31, 0x38, 0x3b, 0x3a, 0x1b, 0x95, 0xd7, 0x65}, {0x59, 0xd1, 0x52, 0x92, 0xd3, 0xa4, 0xa6, 0x66, 0x7, 0xc8, 0x1a, 0x87, 0xbc, 0xe1, 0xdd, 0xe5, 0x6f, 0xc9, 0xc1, 0xa6, 0x40, 0x6b, 0x2c, 0xb8, 0x14, 0x22, 0x21, 0x1a, 0x41, 0x7a, 0xd8, 0x16}, }, { {0x83, 0x5, 0x4e, 0xd5, 0xe2, 0xd5, 0xa4, 0xfb, 0xfa, 0x99, 0xbd, 0x2e, 0xd7, 0xaf, 0x1f, 0xe2, 0x8f, 0x77, 0xe9, 0x6e, 0x73, 0xc2, 0x7a, 0x49, 0xde, 0x6d, 0x5a, 0x7a, 0x57, 0xb, 0x99, 0x1f}, {0x15, 0x62, 0x6, 0x42, 0x5a, 0x7e, 0xbd, 0xb3, 0xc1, 0x24, 0x5a, 0xc, 0xcd, 0xe3, 0x9b, 0x87, 0xb7, 0x94, 0xf9, 0xd6, 0xb1, 0x5d, 0xc0, 0x57, 0xa6, 0x8c, 0xf3, 0x65, 0x81, 0x7c, 0xf8, 0x28}, {0xd6, 0xf7, 0xe8, 0x1b, 0xad, 0x4e, 0x34, 0xa3, 0x8f, 0x79, 0xea, 0xac, 0xeb, 0x50, 0x1e, 0x7d, 0x52, 0xe0, 0xd, 0x52, 0x9e, 0x56, 0xc6, 0x77, 0x3e, 0x6d, 0x4d, 0x53, 0xe1, 0x2f, 0x88, 0x45}, }, { {0xe4, 0x6f, 0x3c, 0x94, 0x29, 0x99, 0xac, 0xd8, 0xa2, 0x92, 0x83, 0xa3, 0x61, 0xf1, 0xf9, 0xb5, 0xf3, 0x9a, 0xc8, 0xbe, 0x13, 0xdb, 0x99, 0x26, 0x74, 0xf0, 0x5, 0xe4, 0x3c, 0x84, 0xcf, 0x7d}, {0xd6, 0x83, 0x79, 0x75, 0x5d, 0x34, 0x69, 0x66, 0xa6, 0x11, 0xaa, 
0x17, 0x11, 0xed, 0xb6, 0x62, 0x8f, 0x12, 0x5e, 0x98, 0x57, 0x18, 0xdd, 0x7d, 0xdd, 0xf6, 0x26, 0xf6, 0xb8, 0xe5, 0x8f, 0x68}, {0xc0, 0x32, 0x47, 0x4a, 0x48, 0xd6, 0x90, 0x6c, 0x99, 0x32, 0x56, 0xca, 0xfd, 0x43, 0x21, 0xd5, 0xe1, 0xc6, 0x5d, 0x91, 0xc3, 0x28, 0xbe, 0xb3, 0x1b, 0x19, 0x27, 0x73, 0x7e, 0x68, 0x39, 0x67}, }, }, { { {0xc0, 0x1a, 0xc, 0xc8, 0x9d, 0xcc, 0x6d, 0xa6, 0x36, 0xa4, 0x38, 0x1b, 0xf4, 0x5c, 0xa0, 0x97, 0xc6, 0xd7, 0xdb, 0x95, 0xbe, 0xf3, 0xeb, 0xa7, 0xab, 0x7d, 0x7e, 0x8d, 0xf6, 0xb8, 0xa0, 0x7d}, {0xa6, 0x75, 0x56, 0x38, 0x14, 0x20, 0x78, 0xef, 0xe8, 0xa9, 0xfd, 0xaa, 0x30, 0x9f, 0x64, 0xa2, 0xcb, 0xa8, 0xdf, 0x5c, 0x50, 0xeb, 0xd1, 0x4c, 0xb3, 0xc0, 0x4d, 0x1d, 0xba, 0x5a, 0x11, 0x46}, {0x76, 0xda, 0xb5, 0xc3, 0x53, 0x19, 0xf, 0xd4, 0x9b, 0x9e, 0x11, 0x21, 0x73, 0x6f, 0xac, 0x1d, 0x60, 0x59, 0xb2, 0xfe, 0x21, 0x60, 0xcc, 0x3, 0x4b, 0x4b, 0x67, 0x83, 0x7e, 0x88, 0x5f, 0x5a}, }, { {0xb9, 0x43, 0xa6, 0xa0, 0xd3, 0x28, 0x96, 0x9e, 0x64, 0x20, 0xc3, 0xe6, 0x0, 0xcb, 0xc3, 0xb5, 0x32, 0xec, 0x2d, 0x7c, 0x89, 0x2, 0x53, 0x9b, 0xc, 0xc7, 0xd1, 0xd5, 0xe2, 0x7a, 0xe3, 0x43}, {0x11, 0x3d, 0xa1, 0x70, 0xcf, 0x1, 0x63, 0x8f, 0xc4, 0xd0, 0xd, 0x35, 0x15, 0xb8, 0xce, 0xcf, 0x7e, 0xa4, 0xbc, 0xa4, 0xd4, 0x97, 0x2, 0xf7, 0x34, 0x14, 0x4d, 0xe4, 0x56, 0xb6, 0x69, 0x36}, {0x33, 0xe1, 0xa6, 0xed, 0x6, 0x3f, 0x7e, 0x38, 0xc0, 0x3a, 0xa1, 0x99, 0x51, 0x1d, 0x30, 0x67, 0x11, 0x38, 0x26, 0x36, 0xf8, 0xd8, 0x5a, 0xbd, 0xbe, 0xe9, 0xd5, 0x4f, 0xcd, 0xe6, 0x21, 0x6a}, }, { {0xe3, 0xb2, 0x99, 0x66, 0x12, 0x29, 0x41, 0xef, 0x1, 0x13, 0x8d, 0x70, 0x47, 0x8, 0xd3, 0x71, 0xbd, 0xb0, 0x82, 0x11, 0xd0, 0x32, 0x54, 0x32, 0x36, 0x8b, 0x1e, 0x0, 0x7, 0x1b, 0x37, 0x45}, {0x5f, 0xe6, 0x46, 0x30, 0xa, 0x17, 0xc6, 0xf1, 0x24, 0x35, 0xd2, 0x0, 0x2a, 0x2a, 0x71, 0x58, 0x55, 0xb7, 0x82, 0x8c, 0x3c, 0xbd, 0xdb, 0x69, 0x57, 0xff, 0x95, 0xa1, 0xf1, 0xf9, 0x6b, 0x58}, {0xb, 0x79, 0xf8, 0x5e, 0x8d, 0x8, 0xdb, 0xa6, 0xe5, 0x37, 0x9, 0x61, 0xdc, 0xf0, 0x78, 0x52, 0xb8, 0x6e, 0xa1, 0x61, 0xd2, 0x49, 0x3, 0xac, 0x79, 0x21, 0xe5, 0x90, 0x37, 0xb0, 0xaf, 0xe}, }, { {0x1d, 0xae, 0x75, 0xf, 0x5e, 0x80, 0x40, 0x51, 0x30, 0xcc, 0x62, 0x26, 0xe3, 0xfb, 0x2, 0xec, 0x6d, 0x39, 0x92, 0xea, 0x1e, 0xdf, 0xeb, 0x2c, 0xb3, 0x5b, 0x43, 0xc5, 0x44, 0x33, 0xae, 0x44}, {0x2f, 0x4, 0x48, 0x37, 0xc1, 0x55, 0x5, 0x96, 0x11, 0xaa, 0xb, 0x82, 0xe6, 0x41, 0x9a, 0x21, 0xc, 0x6d, 0x48, 0x73, 0x38, 0xf7, 0x81, 0x1c, 0x61, 0xc6, 0x2, 0x5a, 0x67, 0xcc, 0x9a, 0x30}, {0xee, 0x43, 0xa5, 0xbb, 0xb9, 0x89, 0xf2, 0x9c, 0x42, 0x71, 0xc9, 0x5a, 0x9d, 0xe, 0x76, 0xf3, 0xaa, 0x60, 0x93, 0x4f, 0xc6, 0xe5, 0x82, 0x1d, 0x8f, 0x67, 0x94, 0x7f, 0x1b, 0x22, 0xd5, 0x62}, }, { {0x3c, 0x7a, 0xf7, 0x3a, 0x26, 0xd4, 0x85, 0x75, 0x4d, 0x14, 0xe9, 0xfe, 0x11, 0x7b, 0xae, 0xdf, 0x3d, 0x19, 0xf7, 0x59, 0x80, 0x70, 0x6, 0xa5, 0x37, 0x20, 0x92, 0x83, 0x53, 0x9a, 0xf2, 0x14}, {0x6d, 0x93, 0xd0, 0x18, 0x9c, 0x29, 0x4c, 0x52, 0xc, 0x1a, 0xc, 0x8a, 0x6c, 0xb5, 0x6b, 0xc8, 0x31, 0x86, 0x4a, 0xdb, 0x2e, 0x5, 0x75, 0xa3, 0x62, 0x45, 0x75, 0xbc, 0xe4, 0xfd, 0xe, 0x5c}, {0xf5, 0xd7, 0xb2, 0x25, 0xdc, 0x7e, 0x71, 0xdf, 0x40, 0x30, 0xb5, 0x99, 0xdb, 0x70, 0xf9, 0x21, 0x62, 0x4c, 0xed, 0xc3, 0xb7, 0x34, 0x92, 0xda, 0x3e, 0x9, 0xee, 0x7b, 0x5c, 0x36, 0x72, 0x5e}, }, { {0x3e, 0xb3, 0x8, 0x2f, 0x6, 0x39, 0x93, 0x7d, 0xbe, 0x32, 0x9f, 0xdf, 0xe5, 0x59, 0x96, 0x5b, 0xfd, 0xbd, 0x9e, 0x1f, 0xad, 0x3d, 0xff, 0xac, 0xb7, 0x49, 0x73, 0xcb, 0x55, 0x5, 0xb2, 0x70}, {0x7f, 0x21, 0x71, 0x45, 0x7, 0xfc, 0x5b, 0x57, 0x5b, 0xd9, 0x94, 0x6, 0x5d, 0x67, 0x79, 0x37, 0x33, 0x1e, 0x19, 0xf4, 0xbb, 0x37, 
0xa, 0x9a, 0xbc, 0xea, 0xb4, 0x47, 0x4c, 0x10, 0xf1, 0x77}, {0x4c, 0x2c, 0x11, 0x55, 0xc5, 0x13, 0x51, 0xbe, 0xcd, 0x1f, 0x88, 0x9a, 0x3a, 0x42, 0x88, 0x66, 0x47, 0x3b, 0x50, 0x5e, 0x85, 0x77, 0x66, 0x44, 0x4a, 0x40, 0x6, 0x4a, 0x8f, 0x39, 0x34, 0xe}, }, { {0x28, 0x19, 0x4b, 0x3e, 0x9, 0xb, 0x93, 0x18, 0x40, 0xf6, 0xf3, 0x73, 0xe, 0xe1, 0xe3, 0x7d, 0x6f, 0x5d, 0x39, 0x73, 0xda, 0x17, 0x32, 0xf4, 0x3e, 0x9c, 0x37, 0xca, 0xd6, 0xde, 0x8a, 0x6f}, {0xe8, 0xbd, 0xce, 0x3e, 0xd9, 0x22, 0x7d, 0xb6, 0x7, 0x2f, 0x82, 0x27, 0x41, 0xe8, 0xb3, 0x9, 0x8d, 0x6d, 0x5b, 0xb0, 0x1f, 0xa6, 0x3f, 0x74, 0x72, 0x23, 0x36, 0x8a, 0x36, 0x5, 0x54, 0x5e}, {0x9a, 0xb2, 0xb7, 0xfd, 0x3d, 0x12, 0x40, 0xe3, 0x91, 0xb2, 0x1a, 0xa2, 0xe1, 0x97, 0x7b, 0x48, 0x9e, 0x94, 0xe6, 0xfd, 0x2, 0x7d, 0x96, 0xf9, 0x97, 0xde, 0xd3, 0xc8, 0x2e, 0xe7, 0xd, 0x78}, }, { {0x72, 0x27, 0xf4, 0x0, 0xf3, 0xea, 0x1f, 0x67, 0xaa, 0x41, 0x8c, 0x2a, 0x2a, 0xeb, 0x72, 0x8f, 0x92, 0x32, 0x37, 0x97, 0xd7, 0x7f, 0xa1, 0x29, 0xa6, 0x87, 0xb5, 0x32, 0xad, 0xc6, 0xef, 0x1d}, {0xbc, 0xe7, 0x9a, 0x8, 0x45, 0x85, 0xe2, 0xa, 0x6, 0x4d, 0x7f, 0x1c, 0xcf, 0xde, 0x8d, 0x38, 0xb8, 0x11, 0x48, 0xa, 0x51, 0x15, 0xac, 0x38, 0xe4, 0x8c, 0x92, 0x71, 0xf6, 0x8b, 0xb2, 0xe}, {0xa7, 0x95, 0x51, 0xef, 0x1a, 0xbe, 0x5b, 0xaf, 0xed, 0x15, 0x7b, 0x91, 0x77, 0x12, 0x8c, 0x14, 0x2e, 0xda, 0xe5, 0x7a, 0xfb, 0xf7, 0x91, 0x29, 0x67, 0x28, 0xdd, 0xf8, 0x1b, 0x20, 0x7d, 0x46}, }, }, { { {0xa9, 0xe7, 0x7a, 0x56, 0xbd, 0xf4, 0x1e, 0xbc, 0xbd, 0x98, 0x44, 0xd6, 0xb2, 0x4c, 0x62, 0x3f, 0xc8, 0x4e, 0x1f, 0x2c, 0xd2, 0x64, 0x10, 0xe4, 0x1, 0x40, 0x38, 0xba, 0xa5, 0xc5, 0xf9, 0x2e}, {0xad, 0x4f, 0xef, 0x74, 0x9a, 0x91, 0xfe, 0x95, 0xa2, 0x8, 0xa3, 0xf6, 0xec, 0x7b, 0x82, 0x3a, 0x1, 0x7b, 0xa4, 0x9, 0xd3, 0x1, 0x4e, 0x96, 0x97, 0xc7, 0xa3, 0x5b, 0x4f, 0x3c, 0xc4, 0x71}, {0xcd, 0x74, 0x9e, 0xfa, 0xf6, 0x6d, 0xfd, 0xb6, 0x7a, 0x26, 0xaf, 0xe4, 0xbc, 0x78, 0x82, 0xf1, 0xe, 0x99, 0xef, 0xf1, 0xd0, 0xb3, 0x55, 0x82, 0x93, 0xf2, 0xc5, 0x90, 0xa3, 0x8c, 0x75, 0x5a}, }, { {0x94, 0xdc, 0x61, 0x1d, 0x8b, 0x91, 0xe0, 0x8c, 0x66, 0x30, 0x81, 0x9a, 0x46, 0x36, 0xed, 0x8d, 0xd3, 0xaa, 0xe8, 0xaf, 0x29, 0xa8, 0xe6, 0xd4, 0x3f, 0xd4, 0x39, 0xf6, 0x27, 0x80, 0x73, 0xa}, {0x95, 0x24, 0x46, 0xd9, 0x10, 0x27, 0xb7, 0xa2, 0x3, 0x50, 0x7d, 0xd5, 0xd2, 0xc6, 0xa8, 0x3a, 0xca, 0x87, 0xb4, 0xa0, 0xbf, 0x0, 0xd4, 0xe3, 0xec, 0x72, 0xeb, 0xb3, 0x44, 0xe2, 0xba, 0x2d}, {0xcc, 0xe1, 0xff, 0x57, 0x2f, 0x4a, 0xf, 0x98, 0x43, 0x98, 0x83, 0xe1, 0xd, 0xd, 0x67, 0x0, 0xfd, 0x15, 0xfb, 0x49, 0x4a, 0x3f, 0x5c, 0x10, 0x9c, 0xa6, 0x26, 0x51, 0x63, 0xca, 0x98, 0x26}, }, { {0xe, 0xd9, 0x3d, 0x5e, 0x2f, 0x70, 0x3d, 0x2e, 0x86, 0x53, 0xd2, 0xe4, 0x18, 0x9, 0x3f, 0x9e, 0x6a, 0xa9, 0x4d, 0x2, 0xf6, 0x3e, 0x77, 0x5e, 0x32, 0x33, 0xfa, 0x4a, 0xc, 0x4b, 0x0, 0x3c}, {0x78, 0xba, 0xb0, 0x32, 0x88, 0x31, 0x65, 0xe7, 0x8b, 0xff, 0x5c, 0x92, 0xf7, 0x31, 0x18, 0x38, 0xcc, 0x1f, 0x29, 0xa0, 0x91, 0x1b, 0xa8, 0x8, 0x7, 0xeb, 0xca, 0x49, 0xcc, 0x3d, 0xb4, 0x1f}, {0x2b, 0xb8, 0xf4, 0x6, 0xac, 0x46, 0xa9, 0x9a, 0xf3, 0xc4, 0x6, 0xa8, 0xa5, 0x84, 0xa2, 0x1c, 0x87, 0x47, 0xcd, 0xc6, 0x5f, 0x26, 0xd3, 0x3e, 0x17, 0xd2, 0x1f, 0xcd, 0x1, 0xfd, 0x43, 0x6b}, }, { {0xf3, 0xe, 0x76, 0x3e, 0x58, 0x42, 0xc7, 0xb5, 0x90, 0xb9, 0xa, 0xee, 0xb9, 0x52, 0xdc, 0x75, 0x3f, 0x92, 0x2b, 0x7, 0xc2, 0x27, 0x14, 0xbf, 0xf0, 0xd9, 0xf0, 0x6f, 0x2d, 0xb, 0x42, 0x73}, {0x44, 0xc5, 0x97, 0x46, 0x4b, 0x5d, 0xa7, 0xc7, 0xbf, 0xff, 0xf, 0xdf, 0x48, 0xf8, 0xfd, 0x15, 0x5a, 0x78, 0x46, 0xaa, 0xeb, 0xb9, 0x68, 0x28, 0x14, 0xf7, 0x52, 0x5b, 0x10, 0xd7, 0x68, 0x5a}, {0x6, 0x1e, 
0x85, 0x9e, 0xcb, 0xf6, 0x2c, 0xaf, 0xc4, 0x38, 0x22, 0xc6, 0x13, 0x39, 0x59, 0x8f, 0x73, 0xf3, 0xfb, 0x99, 0x96, 0xb8, 0x8a, 0xda, 0x9e, 0xbc, 0x34, 0xea, 0x2f, 0x63, 0xb5, 0x3d}, }, { {0xd5, 0x25, 0x98, 0x82, 0xb1, 0x90, 0x49, 0x2e, 0x91, 0x89, 0x9a, 0x3e, 0x87, 0xeb, 0xea, 0xed, 0xf8, 0x4a, 0x70, 0x4c, 0x39, 0x3d, 0xf0, 0xee, 0xe, 0x2b, 0xdf, 0x95, 0xa4, 0x7e, 0x19, 0x59}, {0xd8, 0xd9, 0x5d, 0xf7, 0x2b, 0xee, 0x6e, 0xf4, 0xa5, 0x59, 0x67, 0x39, 0xf6, 0xb1, 0x17, 0xd, 0x73, 0x72, 0x9e, 0x49, 0x31, 0xd1, 0xf2, 0x1b, 0x13, 0x5f, 0xd7, 0x49, 0xdf, 0x1a, 0x32, 0x4}, {0xae, 0x5a, 0xe5, 0xe4, 0x19, 0x60, 0xe1, 0x4, 0xe9, 0x92, 0x2f, 0x7e, 0x7a, 0x43, 0x7b, 0xe7, 0xa4, 0x9a, 0x15, 0x6f, 0xc1, 0x2d, 0xce, 0xc7, 0xc0, 0xc, 0xd7, 0xf4, 0xc1, 0xfd, 0xea, 0x45}, }, { {0xed, 0xb1, 0xcc, 0xcf, 0x24, 0x46, 0xe, 0xb6, 0x95, 0x3, 0x5c, 0xbd, 0x92, 0xc2, 0xdb, 0x59, 0xc9, 0x81, 0x4, 0xdc, 0x1d, 0x9d, 0xa0, 0x31, 0x40, 0xd9, 0x56, 0x5d, 0xea, 0xce, 0x73, 0x3f}, {0x2b, 0xd7, 0x45, 0x80, 0x85, 0x1, 0x84, 0x69, 0x51, 0x6, 0x2f, 0xcf, 0xa2, 0xfa, 0x22, 0x4c, 0xc6, 0x2d, 0x22, 0x6b, 0x65, 0x36, 0x1a, 0x94, 0xde, 0xda, 0x62, 0x3, 0xc8, 0xeb, 0x5e, 0x5a}, {0xc6, 0x8d, 0x4e, 0xa, 0xd1, 0xbf, 0xa7, 0xb7, 0x39, 0xb3, 0xc9, 0x44, 0x7e, 0x0, 0x57, 0xbe, 0xfa, 0xae, 0x57, 0x15, 0x7f, 0x20, 0xc1, 0x60, 0xdb, 0x18, 0x62, 0x26, 0x91, 0x88, 0x5, 0x26}, }, { {0x42, 0xe5, 0x76, 0xc6, 0x3c, 0x8e, 0x81, 0x4c, 0xad, 0xcc, 0xce, 0x3, 0x93, 0x2c, 0x42, 0x5e, 0x8, 0x9f, 0x12, 0xb4, 0xca, 0xcc, 0x7, 0xec, 0xb8, 0x43, 0x44, 0xb2, 0x10, 0xfa, 0xed, 0xd}, {0x4, 0xff, 0x60, 0x83, 0xa6, 0x4, 0xf7, 0x59, 0xf4, 0xe6, 0x61, 0x76, 0xde, 0x3f, 0xd9, 0xc3, 0x51, 0x35, 0x87, 0x12, 0x73, 0x2a, 0x1b, 0x83, 0x57, 0x5d, 0x61, 0x4e, 0x2e, 0xc, 0xad, 0x54}, {0x2a, 0x52, 0x2b, 0xb8, 0xd5, 0x67, 0x3b, 0xee, 0xeb, 0xc1, 0xa5, 0x9f, 0x46, 0x63, 0xf1, 0x36, 0xd3, 0x9f, 0xc1, 0x6e, 0xf2, 0xd2, 0xb4, 0xa5, 0x8, 0x94, 0x7a, 0xa7, 0xba, 0xb2, 0xec, 0x62}, }, { {0x74, 0x28, 0xb6, 0xaf, 0x36, 0x28, 0x7, 0x92, 0xa5, 0x4, 0xe1, 0x79, 0x85, 0x5e, 0xcd, 0x5f, 0x4a, 0xa1, 0x30, 0xc6, 0xad, 0x1, 0xad, 0x5a, 0x98, 0x3f, 0x66, 0x75, 0x50, 0x3d, 0x91, 0x61}, {0x3d, 0x2b, 0x15, 0x61, 0x52, 0x79, 0xed, 0xe5, 0xd1, 0xd7, 0xdd, 0xe, 0x7d, 0x35, 0x62, 0x49, 0x71, 0x4c, 0x6b, 0xb9, 0xd0, 0xc8, 0x82, 0x74, 0xbe, 0xd8, 0x66, 0xa9, 0x19, 0xf9, 0x59, 0x2e}, {0xda, 0x31, 0x32, 0x1a, 0x36, 0x2d, 0xc6, 0xd, 0x70, 0x2, 0x20, 0x94, 0x32, 0x58, 0x47, 0xfa, 0xce, 0x94, 0x95, 0x3f, 0x51, 0x1, 0xd8, 0x2, 0x5c, 0x5d, 0xc0, 0x31, 0xa1, 0xc2, 0xdb, 0x3d}, }, }, { { {0x14, 0xbb, 0x96, 0x27, 0xa2, 0x57, 0xaa, 0xf3, 0x21, 0xda, 0x7, 0x9b, 0xb7, 0xba, 0x3a, 0x88, 0x1c, 0x39, 0xa0, 0x31, 0x18, 0xe2, 0x4b, 0xe5, 0xf9, 0x5, 0x32, 0xd8, 0x38, 0xfb, 0xe7, 0x5e}, {0x4b, 0xc5, 0x5e, 0xce, 0xf9, 0xf, 0xdc, 0x9a, 0xd, 0x13, 0x2f, 0x8c, 0x6b, 0x2a, 0x9c, 0x3, 0x15, 0x95, 0xf8, 0xf0, 0xc7, 0x7, 0x80, 0x2, 0x6b, 0xb3, 0x4, 0xac, 0x14, 0x83, 0x96, 0x78}, {0x8e, 0x6a, 0x44, 0x41, 0xcb, 0xfd, 0x8d, 0x53, 0xf9, 0x37, 0x49, 0x43, 0xa9, 0xfd, 0xac, 0xa5, 0x78, 0x8c, 0x3c, 0x26, 0x8d, 0x90, 0xaf, 0x46, 0x9, 0xd, 0xca, 0x9b, 0x3c, 0x63, 0xd0, 0x61}, }, { {0xdf, 0x73, 0xfc, 0xf8, 0xbc, 0x28, 0xa3, 0xad, 0xfc, 0x37, 0xf0, 0xa6, 0x5d, 0x69, 0x84, 0xee, 0x9, 0xa9, 0xc2, 0x38, 0xdb, 0xb4, 0x7f, 0x63, 0xdc, 0x7b, 0x6, 0xf8, 0x2d, 0xac, 0x23, 0x5b}, {0x66, 0x25, 0xdb, 0xff, 0x35, 0x49, 0x74, 0x63, 0xbb, 0x68, 0xb, 0x78, 0x89, 0x6b, 0xbd, 0xc5, 0x3, 0xec, 0x3e, 0x55, 0x80, 0x32, 0x1b, 0x6f, 0xf5, 0xd7, 0xae, 0x47, 0xd8, 0x5f, 0x96, 0x6e}, {0x7b, 0x52, 0x80, 0xee, 0x53, 0xb9, 0xd2, 0x9a, 0x8d, 0x6d, 0xde, 0xfa, 0xaa, 0x19, 
0x8f, 0xe8, 0xcf, 0x82, 0xe, 0x15, 0x4, 0x17, 0x71, 0xe, 0xdc, 0xde, 0x95, 0xdd, 0xb9, 0xbb, 0xb9, 0x79}, }, { {0x74, 0x73, 0x9f, 0x8e, 0xae, 0x7d, 0x99, 0xd1, 0x16, 0x8, 0xbb, 0xcf, 0xf8, 0xa2, 0x32, 0xa0, 0xa, 0x5f, 0x44, 0x6d, 0x12, 0xba, 0x6c, 0xcd, 0x34, 0xb8, 0xcc, 0xa, 0x46, 0x11, 0xa8, 0x1b}, {0xc2, 0x26, 0x31, 0x6a, 0x40, 0x55, 0xb3, 0xeb, 0x93, 0xc3, 0xc8, 0x68, 0xa8, 0x83, 0x63, 0xd2, 0x82, 0x7a, 0xb9, 0xe5, 0x29, 0x64, 0xc, 0x6c, 0x47, 0x21, 0xfd, 0xc9, 0x58, 0xf1, 0x65, 0x50}, {0x54, 0x99, 0x42, 0xc, 0xfb, 0x69, 0x81, 0x70, 0x67, 0xcf, 0x6e, 0xd7, 0xac, 0x0, 0x46, 0xe1, 0xba, 0x45, 0xe6, 0x70, 0x8a, 0xb9, 0xaa, 0x2e, 0xf2, 0xfa, 0xa4, 0x58, 0x9e, 0xf3, 0x81, 0x39}, }, { {0xde, 0x6f, 0xe6, 0x6d, 0xa5, 0xdf, 0x45, 0xc8, 0x3a, 0x48, 0x40, 0x2c, 0x0, 0xa5, 0x52, 0xe1, 0x32, 0xf6, 0xb4, 0xc7, 0x63, 0xe1, 0xd2, 0xe9, 0x65, 0x1b, 0xbc, 0xdc, 0x2e, 0x45, 0xf4, 0x30}, {0x93, 0xa, 0x23, 0x59, 0x75, 0x8a, 0xfb, 0x18, 0x5d, 0xf4, 0xe6, 0x60, 0x69, 0x8f, 0x16, 0x1d, 0xb5, 0x3c, 0xa9, 0x14, 0x45, 0xa9, 0x85, 0x3a, 0xfd, 0xd0, 0xac, 0x5, 0x37, 0x8, 0xdc, 0x38}, {0x40, 0x97, 0x75, 0xc5, 0x82, 0x27, 0x6d, 0x85, 0xcc, 0xbe, 0x9c, 0xf9, 0x69, 0x45, 0x13, 0xfa, 0x71, 0x4e, 0xea, 0xc0, 0x73, 0xfc, 0x44, 0x88, 0x69, 0x24, 0x3f, 0x59, 0x1a, 0x9a, 0x2d, 0x63}, }, { {0xa7, 0x84, 0xc, 0xed, 0x11, 0xfd, 0x9, 0xbf, 0x3a, 0x69, 0x9f, 0xd, 0x81, 0x71, 0xf0, 0x63, 0x79, 0x87, 0xcf, 0x57, 0x2d, 0x8c, 0x90, 0x21, 0xa2, 0x4b, 0xf6, 0x8a, 0xf2, 0x7d, 0x5a, 0x3a}, {0xa6, 0xcb, 0x7, 0xb8, 0x15, 0x6b, 0xbb, 0xf6, 0xd7, 0xf0, 0x54, 0xbc, 0xdf, 0xc7, 0x23, 0x18, 0xb, 0x67, 0x29, 0x6e, 0x3, 0x97, 0x1d, 0xbb, 0x57, 0x4a, 0xed, 0x47, 0x88, 0xf4, 0x24, 0xb}, {0xc7, 0xea, 0x1b, 0x51, 0xbe, 0xd4, 0xda, 0xdc, 0xf2, 0xcc, 0x26, 0xed, 0x75, 0x80, 0x53, 0xa4, 0x65, 0x9a, 0x5f, 0x0, 0x9f, 0xff, 0x9c, 0xe1, 0x63, 0x1f, 0x48, 0x75, 0x44, 0xf7, 0xfc, 0x34}, }, { {0x98, 0xaa, 0xcf, 0x78, 0xab, 0x1d, 0xbb, 0xa5, 0xf2, 0x72, 0xb, 0x19, 0x67, 0xa2, 0xed, 0x5c, 0x8e, 0x60, 0x92, 0xa, 0x11, 0xc9, 0x9, 0x93, 0xb0, 0x74, 0xb3, 0x2f, 0x4, 0xa3, 0x19, 0x1}, {0xca, 0x67, 0x97, 0x78, 0x4c, 0xe0, 0x97, 0xc1, 0x7d, 0x46, 0xd9, 0x38, 0xcb, 0x4d, 0x71, 0xb8, 0xa8, 0x5f, 0xf9, 0x83, 0x82, 0x88, 0xde, 0x55, 0xf7, 0x63, 0xfa, 0x4d, 0x16, 0xdc, 0x3b, 0x3d}, {0x7d, 0x17, 0xc2, 0xe8, 0x9c, 0xd8, 0xa2, 0x67, 0xc1, 0xd0, 0x95, 0x68, 0xf6, 0xa5, 0x9d, 0x66, 0xb0, 0xa2, 0x82, 0xb2, 0xe5, 0x98, 0x65, 0xf5, 0x73, 0xa, 0xe2, 0xed, 0xf1, 0x88, 0xc0, 0x56}, }, { {0x2, 0x8f, 0xf3, 0x24, 0xac, 0x5f, 0x1b, 0x58, 0xbd, 0xc, 0xe3, 0xba, 0xfe, 0xe9, 0xb, 0xa9, 0xf0, 0x92, 0xcf, 0x8a, 0x2, 0x69, 0x21, 0x9a, 0x8f, 0x3, 0x59, 0x83, 0xa4, 0x7e, 0x8b, 0x3}, {0x17, 0x6e, 0xa8, 0x10, 0x11, 0x3d, 0x6d, 0x33, 0xfa, 0xb2, 0x75, 0xb, 0x32, 0x88, 0xf3, 0xd7, 0x88, 0x29, 0x7, 0x25, 0x76, 0x33, 0x15, 0xf9, 0x87, 0x8b, 0x10, 0x99, 0x6b, 0x4c, 0x67, 0x9}, {0xf8, 0x6f, 0x31, 0x99, 0x21, 0xf8, 0x4e, 0x9f, 0x4f, 0x8d, 0xa7, 0xea, 0x82, 0xd2, 0x49, 0x2f, 0x74, 0x31, 0xef, 0x5a, 0xab, 0xa5, 0x71, 0x9, 0x65, 0xeb, 0x69, 0x59, 0x2, 0x31, 0x5e, 0x6e}, }, { {0x22, 0x62, 0x6, 0x63, 0xe, 0xfb, 0x4, 0x33, 0x3f, 0xba, 0xac, 0x87, 0x89, 0x6, 0x35, 0xfb, 0xa3, 0x61, 0x10, 0x8c, 0x77, 0x24, 0x19, 0xbd, 0x20, 0x86, 0x83, 0xd1, 0x43, 0xad, 0x58, 0x30}, {0xfb, 0x93, 0xe5, 0x87, 0xf5, 0x62, 0x6c, 0xb1, 0x71, 0x3e, 0x5d, 0xca, 0xde, 0xed, 0x99, 0x49, 0x6d, 0x3e, 0xcc, 0x14, 0xe0, 0xc1, 0x91, 0xb4, 0xa8, 0xdb, 0xa8, 0x89, 0x47, 0x11, 0xf5, 0x8}, {0xd0, 0x63, 0x76, 0xe5, 0xfd, 0xf, 0x3c, 0x32, 0x10, 0xa6, 0x2e, 0xa2, 0x38, 0xdf, 0xc3, 0x5, 0x9a, 0x4f, 0x99, 0xac, 0xbd, 0x8a, 0xc7, 0xbd, 0x99, 0xdc, 0xe3, 
0xef, 0xa4, 0x9f, 0x54, 0x26}, }, }, { { {0x6e, 0x66, 0x3f, 0xaf, 0x49, 0x85, 0x46, 0xdb, 0xa5, 0xe, 0x4a, 0xf1, 0x4, 0xcf, 0x7f, 0xd7, 0x47, 0xc, 0xba, 0xa4, 0xf7, 0x3f, 0xf2, 0x3d, 0x85, 0x3c, 0xce, 0x32, 0xe1, 0xdf, 0x10, 0x3a}, {0xd6, 0xf9, 0x6b, 0x1e, 0x46, 0x5a, 0x1d, 0x74, 0x81, 0xa5, 0x77, 0x77, 0xfc, 0xb3, 0x5, 0x23, 0xd9, 0xd3, 0x74, 0x64, 0xa2, 0x74, 0x55, 0xd4, 0xff, 0xe0, 0x1, 0x64, 0xdc, 0xe1, 0x26, 0x19}, {0xa0, 0xce, 0x17, 0xea, 0x8a, 0x4e, 0x7f, 0xe0, 0xfd, 0xc1, 0x1f, 0x3a, 0x46, 0x15, 0xd5, 0x2f, 0xf1, 0xc0, 0xf2, 0x31, 0xfd, 0x22, 0x53, 0x17, 0x15, 0x5d, 0x1e, 0x86, 0x1d, 0xd0, 0xa1, 0x1f}, }, { {0xab, 0x94, 0xdf, 0xd1, 0x0, 0xac, 0xdc, 0x38, 0xe9, 0xd, 0x8, 0xd1, 0xdd, 0x2b, 0x71, 0x2e, 0x62, 0xe2, 0xd5, 0xfd, 0x3e, 0xe9, 0x13, 0x7f, 0xe5, 0x1, 0x9a, 0xee, 0x18, 0xed, 0xfc, 0x73}, {0x32, 0x98, 0x59, 0x7d, 0x94, 0x55, 0x80, 0xcc, 0x20, 0x55, 0xf1, 0x37, 0xda, 0x56, 0x46, 0x1e, 0x20, 0x93, 0x5, 0x4e, 0x74, 0xf7, 0xf6, 0x99, 0x33, 0xcf, 0x75, 0x6a, 0xbc, 0x63, 0x35, 0x77}, {0xb3, 0x9c, 0x13, 0x63, 0x8, 0xe9, 0xb1, 0x6, 0xcd, 0x3e, 0xa0, 0xc5, 0x67, 0xda, 0x93, 0xa4, 0x32, 0x89, 0x63, 0xad, 0xc8, 0xce, 0x77, 0x8d, 0x44, 0x4f, 0x86, 0x1b, 0x70, 0x6b, 0x42, 0x1f}, }, { {0x52, 0x25, 0xa1, 0x91, 0xc8, 0x35, 0x7e, 0xf1, 0x76, 0x9c, 0x5e, 0x57, 0x53, 0x81, 0x6b, 0xb7, 0x3e, 0x72, 0x9b, 0xd, 0x6f, 0x40, 0x83, 0xfa, 0x38, 0xe4, 0xa7, 0x3f, 0x1b, 0xbb, 0x76, 0xb}, {0x1, 0x1c, 0x91, 0x41, 0x4c, 0x26, 0xc9, 0xef, 0x25, 0x2c, 0xa2, 0x17, 0xb8, 0xb7, 0xa3, 0xf1, 0x47, 0x14, 0xf, 0xf3, 0x6b, 0xda, 0x75, 0x58, 0x90, 0xb0, 0x31, 0x1d, 0x27, 0xf5, 0x1a, 0x4e}, {0x9b, 0x93, 0x92, 0x7f, 0xf9, 0xc1, 0xb8, 0x8, 0x6e, 0xab, 0x44, 0xd4, 0xcb, 0x71, 0x67, 0xbe, 0x17, 0x80, 0xbb, 0x99, 0x63, 0x64, 0xe5, 0x22, 0x55, 0xa9, 0x72, 0xb7, 0x1e, 0xd6, 0x6d, 0x7b}, }, { {0xc7, 0xd2, 0x1, 0xab, 0xf9, 0xab, 0x30, 0x57, 0x18, 0x3b, 0x14, 0x40, 0xdc, 0x76, 0xfb, 0x16, 0x81, 0xb2, 0xcb, 0xa0, 0x65, 0xbe, 0x6c, 0x86, 0xfe, 0x6a, 0xff, 0x9b, 0x65, 0x9b, 0xfa, 0x53}, {0x92, 0x3d, 0xf3, 0x50, 0xe8, 0xc1, 0xad, 0xb7, 0xcf, 0xd5, 0x8c, 0x60, 0x4f, 0xfa, 0x98, 0x79, 0xdb, 0x5b, 0xfc, 0x8d, 0xbd, 0x2d, 0x96, 0xad, 0x4f, 0x2f, 0x1d, 0xaf, 0xce, 0x9b, 0x3e, 0x70}, {0x55, 0x54, 0x88, 0x94, 0xe9, 0xc8, 0x14, 0x6c, 0xe5, 0xd4, 0xae, 0x65, 0x66, 0x5d, 0x3a, 0x84, 0xf1, 0x5a, 0xd6, 0xbc, 0x3e, 0xb7, 0x1b, 0x18, 0x50, 0x1f, 0xc6, 0xc4, 0xe5, 0x93, 0x8d, 0x39}, }, { {0xf2, 0xe3, 0xe7, 0xd2, 0x60, 0x7c, 0x87, 0xc3, 0xb1, 0x8b, 0x82, 0x30, 0xa0, 0xaa, 0x34, 0x3b, 0x38, 0xf1, 0x9e, 0x73, 0xe7, 0x26, 0x3e, 0x28, 0x77, 0x5, 0xc3, 0x2, 0x90, 0x9c, 0x9c, 0x69}, {0xf3, 0x48, 0xe2, 0x33, 0x67, 0xd1, 0x4b, 0x1c, 0x5f, 0xa, 0xbf, 0x15, 0x87, 0x12, 0x9e, 0xbd, 0x76, 0x3, 0xb, 0xa1, 0xf0, 0x8c, 0x3f, 0xd4, 0x13, 0x1b, 0x19, 0xdf, 0x5d, 0x9b, 0xb0, 0x53}, {0xcc, 0xf1, 0x46, 0x59, 0x23, 0xa7, 0x6, 0xf3, 0x7d, 0xd9, 0xe5, 0xcc, 0xb5, 0x18, 0x17, 0x92, 0x75, 0xe9, 0xb4, 0x81, 0x47, 0xd2, 0xcd, 0x28, 0x7, 0xd9, 0xcd, 0x6f, 0xc, 0xf3, 0xca, 0x51}, }, { {0xc7, 0x54, 0xac, 0x18, 0x9a, 0xf9, 0x7a, 0x73, 0xf, 0xb3, 0x1c, 0xc5, 0xdc, 0x78, 0x33, 0x90, 0xc7, 0xc, 0xe1, 0x4c, 0x33, 0xbc, 0x89, 0x2b, 0x9a, 0xe9, 0xf8, 0x89, 0xc1, 0x29, 0xae, 0x12}, {0xa, 0xe0, 0x74, 0x76, 0x42, 0xa7, 0xb, 0xa6, 0xf3, 0x7b, 0x7a, 0xa1, 0x70, 0x85, 0xe, 0x63, 0xcc, 0x24, 0x33, 0xcf, 0x3d, 0x56, 0x58, 0x37, 0xaa, 0xfd, 0x83, 0x23, 0x29, 0xaa, 0x4, 0x55}, {0xcf, 0x1, 0xd, 0x1f, 0xcb, 0xc0, 0x9e, 0xa9, 0xae, 0xf7, 0x34, 0x3a, 0xcc, 0xef, 0xd1, 0xd, 0x22, 0x4e, 0x9c, 0xd0, 0x21, 0x75, 0xca, 0x55, 0xea, 0xa5, 0xeb, 0x58, 0xe9, 0x4f, 0xd1, 0x5f}, }, { {0x8e, 0xcb, 0x93, 0xbf, 
0x5e, 0xfe, 0x42, 0x3c, 0x5f, 0x56, 0xd4, 0x36, 0x51, 0xa8, 0xdf, 0xbe, 0xe8, 0x20, 0x42, 0x88, 0x9e, 0x85, 0xf0, 0xe0, 0x28, 0xd1, 0x25, 0x7, 0x96, 0x3f, 0xd7, 0x7d}, {0x2c, 0xab, 0x45, 0x28, 0xdf, 0x2d, 0xdc, 0xb5, 0x93, 0xe9, 0x7f, 0xa, 0xb1, 0x91, 0x94, 0x6, 0x46, 0xe3, 0x2, 0x40, 0xd6, 0xf3, 0xaa, 0x4d, 0xd1, 0x74, 0x64, 0x58, 0x6e, 0xf2, 0x3f, 0x9}, {0x29, 0x98, 0x5, 0x68, 0xfe, 0x24, 0xd, 0xb1, 0xe5, 0x23, 0xaf, 0xdb, 0x72, 0x6, 0x73, 0x75, 0x29, 0xac, 0x57, 0xb4, 0x3a, 0x25, 0x67, 0x13, 0xa4, 0x70, 0xb4, 0x86, 0xbc, 0xbc, 0x59, 0x2f}, }, { {0x1, 0xc3, 0x91, 0xb6, 0x60, 0xd5, 0x41, 0x70, 0x1e, 0xe7, 0xd7, 0xad, 0x3f, 0x1b, 0x20, 0x85, 0x85, 0x55, 0x33, 0x11, 0x63, 0xe1, 0xc2, 0x16, 0xb1, 0x28, 0x8, 0x1, 0x3d, 0x5e, 0xa5, 0x2a}, {0x5f, 0x13, 0x17, 0x99, 0x42, 0x7d, 0x84, 0x83, 0xd7, 0x3, 0x7d, 0x56, 0x1f, 0x91, 0x1b, 0xad, 0xd1, 0xaa, 0x77, 0xbe, 0xd9, 0x48, 0x77, 0x7e, 0x4a, 0xaf, 0x51, 0x2e, 0x2e, 0xb4, 0x58, 0x54}, {0x4f, 0x44, 0x7, 0xc, 0xe6, 0x92, 0x51, 0xed, 0x10, 0x1d, 0x42, 0x74, 0x2d, 0x4e, 0xc5, 0x42, 0x64, 0xc8, 0xb5, 0xfd, 0x82, 0x4c, 0x2b, 0x35, 0x64, 0x86, 0x76, 0x8a, 0x4a, 0x0, 0xe9, 0x13}, }, }, { { {0x7f, 0x87, 0x3b, 0x19, 0xc9, 0x0, 0x2e, 0xbb, 0x6b, 0x50, 0xdc, 0xe0, 0x90, 0xa8, 0xe3, 0xec, 0x9f, 0x64, 0xde, 0x36, 0xc0, 0xb7, 0xf3, 0xec, 0x1a, 0x9e, 0xde, 0x98, 0x8, 0x4, 0x46, 0x5f}, {0xdb, 0xce, 0x2f, 0x83, 0x45, 0x88, 0x9d, 0x73, 0x63, 0xf8, 0x6b, 0xae, 0xc9, 0xd6, 0x38, 0xfa, 0xf7, 0xfe, 0x4f, 0xb7, 0xca, 0xd, 0xbc, 0x32, 0x5e, 0xe4, 0xbc, 0x14, 0x88, 0x7e, 0x93, 0x73}, {0x8d, 0xf4, 0x7b, 0x29, 0x16, 0x71, 0x3, 0xb9, 0x34, 0x68, 0xf0, 0xd4, 0x22, 0x3b, 0xd1, 0xa9, 0xc6, 0xbd, 0x96, 0x46, 0x57, 0x15, 0x97, 0xe1, 0x35, 0xe8, 0xd5, 0x91, 0xe8, 0xa4, 0xf8, 0x2c}, }, { {0xa2, 0x6b, 0xd0, 0x17, 0x7e, 0x48, 0xb5, 0x2c, 0x6b, 0x19, 0x50, 0x39, 0x1c, 0x38, 0xd2, 0x24, 0x30, 0x8a, 0x97, 0x85, 0x81, 0x9c, 0x65, 0xd7, 0xf6, 0xa4, 0xd6, 0x91, 0x28, 0x7f, 0x6f, 0x7a}, {0x67, 0xf, 0x11, 0x7, 0x87, 0xfd, 0x93, 0x6d, 0x49, 0xb5, 0x38, 0x7c, 0xd3, 0x9, 0x4c, 0xdd, 0x86, 0x6a, 0x73, 0xc2, 0x4c, 0x6a, 0xb1, 0x7c, 0x9, 0x2a, 0x25, 0x58, 0x6e, 0xbd, 0x49, 0x20}, {0x49, 0xef, 0x9a, 0x6a, 0x8d, 0xfd, 0x9, 0x7d, 0xb, 0xb9, 0x3d, 0x5b, 0xbe, 0x60, 0xee, 0xf0, 0xd4, 0xbf, 0x9e, 0x51, 0x2c, 0xb5, 0x21, 0x4c, 0x1d, 0x94, 0x45, 0xc5, 0xdf, 0xaa, 0x11, 0x60}, }, { {0x90, 0xf8, 0xcb, 0x2, 0xc8, 0xd0, 0xde, 0x63, 0xaa, 0x6a, 0xff, 0xd, 0xca, 0x98, 0xd0, 0xfb, 0x99, 0xed, 0xb6, 0xb9, 0xfd, 0xa, 0x4d, 0x62, 0x1e, 0xb, 0x34, 0x79, 0xb7, 0x18, 0xce, 0x69}, {0x3c, 0xf8, 0x95, 0xcf, 0x6d, 0x92, 0x67, 0x5f, 0x71, 0x90, 0x28, 0x71, 0x61, 0x85, 0x7e, 0x7c, 0x5b, 0x7a, 0x8f, 0x99, 0xf3, 0xe7, 0xa1, 0xd6, 0xe0, 0xf9, 0x62, 0xb, 0x1b, 0xcc, 0xc5, 0x6f}, {0xcb, 0x79, 0x98, 0xb2, 0x28, 0x55, 0xef, 0xd1, 0x92, 0x90, 0x7e, 0xd4, 0x3c, 0xae, 0x1a, 0xdd, 0x52, 0x23, 0x9f, 0x18, 0x42, 0x4, 0x7e, 0x12, 0xf1, 0x1, 0x71, 0xe5, 0x3a, 0x6b, 0x59, 0x15}, }, { {0xca, 0x24, 0x51, 0x7e, 0x16, 0x31, 0xff, 0x9, 0xdf, 0x45, 0xc7, 0xd9, 0x8b, 0x15, 0xe4, 0xb, 0xe5, 0x56, 0xf5, 0x7e, 0x22, 0x7d, 0x2b, 0x29, 0x38, 0xd1, 0xb6, 0xaf, 0x41, 0xe2, 0xa4, 0x3a}, {0xa2, 0x79, 0x91, 0x3f, 0xd2, 0x39, 0x27, 0x46, 0xcf, 0xdd, 0xd6, 0x97, 0x31, 0x12, 0x83, 0xff, 0x8a, 0x14, 0xf2, 0x53, 0xb5, 0xde, 0x7, 0x13, 0xda, 0x4d, 0x5f, 0x7b, 0x68, 0x37, 0x22, 0xd}, {0xf5, 0x5, 0x33, 0x2a, 0xbf, 0x38, 0xc1, 0x2c, 0xc3, 0x26, 0xe9, 0xa2, 0x8f, 0x3f, 0x58, 0x48, 0xeb, 0xd2, 0x49, 0x55, 0xa2, 0xb1, 0x3a, 0x8, 0x6c, 0xa3, 0x87, 0x46, 0x6e, 0xaa, 0xfc, 0x32}, }, { {0xdf, 0xcc, 0x87, 0x27, 0x73, 0xa4, 0x7, 0x32, 0xf8, 0xe3, 0x13, 0xf2, 0x8, 0x19, 0xe3, 
0x17, 0x4e, 0x96, 0xd, 0xf6, 0xd7, 0xec, 0xb2, 0xd5, 0xe9, 0xb, 0x60, 0xc2, 0x36, 0x63, 0x6f, 0x74}, {0xf5, 0x9a, 0x7d, 0xc5, 0x8d, 0x6e, 0xc5, 0x7b, 0xf2, 0xbd, 0xf0, 0x9d, 0xed, 0xd2, 0xb, 0x3e, 0xa3, 0xe4, 0xef, 0x22, 0xde, 0x14, 0xc0, 0xaa, 0x5c, 0x6a, 0xbd, 0xfe, 0xce, 0xe9, 0x27, 0x46}, {0x1c, 0x97, 0x6c, 0xab, 0x45, 0xf3, 0x4a, 0x3f, 0x1f, 0x73, 0x43, 0x99, 0x72, 0xeb, 0x88, 0xe2, 0x6d, 0x18, 0x44, 0x3, 0x8a, 0x6a, 0x59, 0x33, 0x93, 0x62, 0xd6, 0x7e, 0x0, 0x17, 0x49, 0x7b}, }, { {0xdd, 0xa2, 0x53, 0xdd, 0x28, 0x1b, 0x34, 0x54, 0x3f, 0xfc, 0x42, 0xdf, 0x5b, 0x90, 0x17, 0xaa, 0xf4, 0xf8, 0xd2, 0x4d, 0xd9, 0x92, 0xf5, 0xf, 0x7d, 0xd3, 0x8c, 0xe0, 0xf, 0x62, 0x3, 0x1d}, {0x64, 0xb0, 0x84, 0xab, 0x5c, 0xfb, 0x85, 0x2d, 0x14, 0xbc, 0xf3, 0x89, 0xd2, 0x10, 0x78, 0x49, 0xc, 0xce, 0x15, 0x7b, 0x44, 0xdc, 0x6a, 0x47, 0x7b, 0xfd, 0x44, 0xf8, 0x76, 0xa3, 0x2b, 0x12}, {0x54, 0xe5, 0xb4, 0xa2, 0xcd, 0x32, 0x2, 0xc2, 0x7f, 0x18, 0x5d, 0x11, 0x42, 0xfd, 0xd0, 0x9e, 0xd9, 0x79, 0xd4, 0x7d, 0xbe, 0xb4, 0xab, 0x2e, 0x4c, 0xec, 0x68, 0x2b, 0xf5, 0xb, 0xc7, 0x2}, }, { {0xe1, 0x72, 0x8d, 0x45, 0xbf, 0x32, 0xe5, 0xac, 0xb5, 0x3c, 0xb7, 0x7c, 0xe0, 0x68, 0xe7, 0x5b, 0xe7, 0xbd, 0x8b, 0xee, 0x94, 0x7d, 0xcf, 0x56, 0x3, 0x3a, 0xb4, 0xfe, 0xe3, 0x97, 0x6, 0x6b}, {0xbb, 0x2f, 0xb, 0x5d, 0x4b, 0xec, 0x87, 0xa2, 0xca, 0x82, 0x48, 0x7, 0x90, 0x57, 0x5c, 0x41, 0x5c, 0x81, 0xd0, 0xc1, 0x1e, 0xa6, 0x44, 0xe0, 0xe0, 0xf5, 0x9e, 0x40, 0xa, 0x4f, 0x33, 0x26}, {0xc0, 0xa3, 0x62, 0xdf, 0x4a, 0xf0, 0xc8, 0xb6, 0x5d, 0xa4, 0x6d, 0x7, 0xef, 0x0, 0xf0, 0x3e, 0xa9, 0xd2, 0xf0, 0x49, 0x58, 0xb9, 0x9c, 0x9c, 0xae, 0x2f, 0x1b, 0x44, 0x43, 0x7f, 0xc3, 0x1c}, }, { {0xb9, 0xae, 0xce, 0xc9, 0xf1, 0x56, 0x66, 0xd7, 0x6a, 0x65, 0xe5, 0x18, 0xf8, 0x15, 0x5b, 0x1c, 0x34, 0x23, 0x4c, 0x84, 0x32, 0x28, 0xe7, 0x26, 0x38, 0x68, 0x19, 0x2f, 0x77, 0x6f, 0x34, 0x3a}, {0x4f, 0x32, 0xc7, 0x5c, 0x5a, 0x56, 0x8f, 0x50, 0x22, 0xa9, 0x6, 0xe5, 0xc0, 0xc4, 0x61, 0xd0, 0x19, 0xac, 0x45, 0x5c, 0xdb, 0xab, 0x18, 0xfb, 0x4a, 0x31, 0x80, 0x3, 0xc1, 0x9, 0x68, 0x6c}, {0xc8, 0x6a, 0xda, 0xe2, 0x12, 0x51, 0xd5, 0xd2, 0xed, 0x51, 0xe8, 0xb1, 0x31, 0x3, 0xbd, 0xe9, 0x62, 0x72, 0xc6, 0x8e, 0xdd, 0x46, 0x7, 0x96, 0xd0, 0xc5, 0xf7, 0x6e, 0x9f, 0x1b, 0x91, 0x5}, }, }, { { {0xef, 0xea, 0x2e, 0x51, 0xf3, 0xac, 0x49, 0x53, 0x49, 0xcb, 0xc1, 0x1c, 0xd3, 0x41, 0xc1, 0x20, 0x8d, 0x68, 0x9a, 0xa9, 0x7, 0xc, 0x18, 0x24, 0x17, 0x2d, 0x4b, 0xc6, 0xd1, 0xf9, 0x5e, 0x55}, {0xbb, 0xe, 0xdf, 0xf5, 0x83, 0x99, 0x33, 0xc1, 0xac, 0x4c, 0x2c, 0x51, 0x8f, 0x75, 0xf3, 0xc0, 0xe1, 0x98, 0xb3, 0xb, 0xa, 0x13, 0xf1, 0x2c, 0x62, 0xc, 0x27, 0xaa, 0xf9, 0xec, 0x3c, 0x6b}, {0x8, 0xbd, 0x73, 0x3b, 0xba, 0x70, 0xa7, 0x36, 0xc, 0xbf, 0xaf, 0xa3, 0x8, 0xef, 0x4a, 0x62, 0xf2, 0x46, 0x9, 0xb4, 0x98, 0xff, 0x37, 0x57, 0x9d, 0x74, 0x81, 0x33, 0xe1, 0x4d, 0x5f, 0x67}, }, { {0x1d, 0xb3, 0xda, 0x3b, 0xd9, 0xf6, 0x2f, 0xa1, 0xfe, 0x2d, 0x65, 0x9d, 0xf, 0xd8, 0x25, 0x7, 0x87, 0x94, 0xbe, 0x9a, 0xf3, 0x4f, 0x9c, 0x1, 0x43, 0x3c, 0xcd, 0x82, 0xb8, 0x50, 0xf4, 0x60}, {0xfc, 0x82, 0x17, 0x6b, 0x3, 0x52, 0x2c, 0xe, 0xb4, 0x83, 0xad, 0x6c, 0x81, 0x6c, 0x81, 0x64, 0x3e, 0x7, 0x64, 0x69, 0xd9, 0xbd, 0xdc, 0xd0, 0x20, 0xc5, 0x64, 0x1, 0xf7, 0x9d, 0xd9, 0x13}, {0xca, 0xc0, 0xe5, 0x21, 0xc3, 0x5e, 0x4b, 0x1, 0xa2, 0xbf, 0x19, 0xd7, 0xc9, 0x69, 0xcb, 0x4f, 0xa0, 0x23, 0x0, 0x75, 0x18, 0x1c, 0x5f, 0x4e, 0x80, 0xac, 0xed, 0x55, 0x9e, 0xde, 0x6, 0x1c}, }, { {0xaa, 0x69, 0x6d, 0xff, 0x40, 0x2b, 0xd5, 0xff, 0xbb, 0x49, 0x40, 0xdc, 0x18, 0xb, 0x53, 0x34, 0x97, 0x98, 0x4d, 0xa3, 0x2f, 0x5c, 0x4a, 0x5e, 0x2d, 0xba, 0x32, 
0x7d, 0x8e, 0x6f, 0x9, 0x78}, {0xe2, 0xc4, 0x3e, 0xa3, 0xd6, 0x7a, 0xf, 0x99, 0x8e, 0xe0, 0x2e, 0xbe, 0x38, 0xf9, 0x8, 0x66, 0x15, 0x45, 0x28, 0x63, 0xc5, 0x43, 0xa1, 0x9c, 0xd, 0xb6, 0x2d, 0xec, 0x1f, 0x8a, 0xf3, 0x4c}, {0xe7, 0x5c, 0xfa, 0xd, 0x65, 0xaa, 0xaa, 0xa0, 0x8c, 0x47, 0xb5, 0x48, 0x2a, 0x9e, 0xc4, 0xf9, 0x5b, 0x72, 0x3, 0x70, 0x7d, 0xcc, 0x9, 0x4f, 0xbe, 0x1a, 0x9, 0x26, 0x3a, 0xad, 0x3c, 0x37}, }, { {0xad, 0xbb, 0xdd, 0x89, 0xfb, 0xa8, 0xbe, 0xf1, 0xcb, 0xae, 0xae, 0x61, 0xbc, 0x2c, 0xcb, 0x3b, 0x9d, 0x8d, 0x9b, 0x1f, 0xbb, 0xa7, 0x58, 0x8f, 0x86, 0xa6, 0x12, 0x51, 0xda, 0x7e, 0x54, 0x21}, {0x7c, 0xf5, 0xc9, 0x82, 0x4d, 0x63, 0x94, 0xb2, 0x36, 0x45, 0x93, 0x24, 0xe1, 0xfd, 0xcb, 0x1f, 0x5a, 0xdb, 0x8c, 0x41, 0xb3, 0x4d, 0x9c, 0x9e, 0xfc, 0x19, 0x44, 0x45, 0xd9, 0xf3, 0x40, 0x0}, {0xd3, 0x86, 0x59, 0xfd, 0x39, 0xe9, 0xfd, 0xde, 0xc, 0x38, 0xa, 0x51, 0x89, 0x2c, 0x27, 0xf4, 0xb9, 0x19, 0x31, 0xbb, 0x7, 0xa4, 0x2b, 0xb7, 0xf4, 0x4d, 0x25, 0x4a, 0x33, 0xa, 0x55, 0x63}, }, { {0x49, 0x7b, 0x54, 0x72, 0x45, 0x58, 0xba, 0x9b, 0xe0, 0x8, 0xc4, 0xe2, 0xfa, 0xc6, 0x5, 0xf3, 0x8d, 0xf1, 0x34, 0xc7, 0x69, 0xfa, 0xe8, 0x60, 0x7a, 0x76, 0x7d, 0xaa, 0xaf, 0x2b, 0xa9, 0x39}, {0x37, 0xcf, 0x69, 0xb5, 0xed, 0xd6, 0x7, 0x65, 0xe1, 0x2e, 0xa5, 0xc, 0xb0, 0x29, 0x84, 0x17, 0x5d, 0xd6, 0x6b, 0xeb, 0x90, 0x0, 0x7c, 0xea, 0x51, 0x8f, 0xf7, 0xda, 0xc7, 0x62, 0xea, 0x3e}, {0x4e, 0x27, 0x93, 0xe6, 0x13, 0xc7, 0x24, 0x9d, 0x75, 0xd3, 0xdb, 0x68, 0x77, 0x85, 0x63, 0x5f, 0x9a, 0xb3, 0x8a, 0xeb, 0x60, 0x55, 0x52, 0x70, 0xcd, 0xc4, 0xc9, 0x65, 0x6, 0x6a, 0x43, 0x68}, }, { {0x7c, 0x10, 0x20, 0xe8, 0x17, 0xd3, 0x56, 0x1e, 0x65, 0xe9, 0xa, 0x84, 0x44, 0x68, 0x26, 0xc5, 0x7a, 0xfc, 0xf, 0x32, 0xc6, 0xa1, 0xe0, 0xc1, 0x72, 0x14, 0x61, 0x91, 0x9c, 0x66, 0x73, 0x53}, {0x27, 0x3f, 0x2f, 0x20, 0xe8, 0x35, 0x2, 0xbc, 0xb0, 0x75, 0xf9, 0x64, 0xe2, 0x0, 0x5c, 0xc7, 0x16, 0x24, 0x8c, 0xa3, 0xd5, 0xe9, 0xa4, 0x91, 0xf9, 0x89, 0xb7, 0x8a, 0xf6, 0xe7, 0xb6, 0x17}, {0x57, 0x52, 0xe, 0x9a, 0xab, 0x14, 0x28, 0x5d, 0xfc, 0xb3, 0xca, 0xc9, 0x84, 0x20, 0x8f, 0x90, 0xca, 0x1e, 0x2d, 0x5b, 0x88, 0xf5, 0xca, 0xaf, 0x11, 0x7d, 0xf8, 0x78, 0xa6, 0xb5, 0xb4, 0x1c}, }, { {0xe7, 0x7, 0xa0, 0xa2, 0x62, 0xaa, 0x74, 0x6b, 0xb1, 0xc7, 0x71, 0xf0, 0xb0, 0xe0, 0x11, 0xf3, 0x23, 0xe2, 0xb, 0x0, 0x38, 0xe4, 0x7, 0x57, 0xac, 0x6e, 0xef, 0x82, 0x2d, 0xfd, 0xc0, 0x2d}, {0x6c, 0xfc, 0x4a, 0x39, 0x6b, 0xc0, 0x64, 0xb6, 0xb1, 0x5f, 0xda, 0x98, 0x24, 0xde, 0x88, 0xc, 0x34, 0xd8, 0xca, 0x4b, 0x16, 0x3, 0x8d, 0x4f, 0xa2, 0x34, 0x74, 0xde, 0x78, 0xca, 0xb, 0x33}, {0x4e, 0x74, 0x19, 0x11, 0x84, 0xff, 0x2e, 0x98, 0x24, 0x47, 0x7, 0x2b, 0x96, 0x5e, 0x69, 0xf9, 0xfb, 0x53, 0xc9, 0xbf, 0x4f, 0xc1, 0x8a, 0xc5, 0xf5, 0x1c, 0x9f, 0x36, 0x1b, 0xbe, 0x31, 0x3c}, }, { {0x72, 0x42, 0xcb, 0xf9, 0x93, 0xbc, 0x68, 0xc1, 0x98, 0xdb, 0xce, 0xc7, 0x1f, 0x71, 0xb8, 0xae, 0x7a, 0x8d, 0xac, 0x34, 0xaa, 0x52, 0xe, 0x7f, 0xbb, 0x55, 0x7d, 0x7e, 0x9, 0xc1, 0xce, 0x41}, {0xee, 0x8a, 0x94, 0x8, 0x4d, 0x86, 0xf4, 0xb0, 0x6f, 0x1c, 0xba, 0x91, 0xee, 0x19, 0xdc, 0x7, 0x58, 0xa1, 0xac, 0xa6, 0xae, 0xcd, 0x75, 0x79, 0xbb, 0xd4, 0x62, 0x42, 0x13, 0x61, 0xb, 0x33}, {0x8a, 0x80, 0x6d, 0xa2, 0xd7, 0x19, 0x96, 0xf7, 0x6d, 0x15, 0x9e, 0x1d, 0x9e, 0xd4, 0x1f, 0xbb, 0x27, 0xdf, 0xa1, 0xdb, 0x6c, 0xc3, 0xd7, 0x73, 0x7d, 0x77, 0x28, 0x1f, 0xd9, 0x4c, 0xb4, 0x26}, }, }, { { {0x83, 0x3, 0x73, 0x62, 0x93, 0xf2, 0xb7, 0xe1, 0x2c, 0x8a, 0xca, 0xeb, 0xff, 0x79, 0x52, 0x4b, 0x14, 0x13, 0xd4, 0xbf, 0x8a, 0x77, 0xfc, 0xda, 0xf, 0x61, 0x72, 0x9c, 0x14, 0x10, 0xeb, 0x7d}, {0x75, 0x74, 0x38, 0x8f, 0x47, 
0x48, 0xf0, 0x51, 0x3c, 0xcb, 0xbe, 0x9c, 0xf4, 0xbc, 0x5d, 0xb2, 0x55, 0x20, 0x9f, 0xd9, 0x44, 0x12, 0xab, 0x9a, 0xd6, 0xa5, 0x10, 0x1c, 0x6c, 0x9e, 0x70, 0x2c}, {0x7a, 0xee, 0x66, 0x87, 0x6a, 0xaf, 0x62, 0xcb, 0xe, 0xcd, 0x53, 0x55, 0x4, 0xec, 0xcb, 0x66, 0xb5, 0xe4, 0xb, 0xf, 0x38, 0x1, 0x80, 0x58, 0xea, 0xe2, 0x2c, 0xf6, 0x9f, 0x8e, 0xe6, 0x8}, }, { {0xf9, 0xf2, 0xb8, 0xa, 0xd5, 0x9, 0x2d, 0x2f, 0xdf, 0x23, 0x59, 0xc5, 0x8d, 0x21, 0xb9, 0xac, 0xb9, 0x6c, 0x76, 0x73, 0x26, 0x34, 0x8f, 0x4a, 0xf5, 0x19, 0xf7, 0x38, 0xd7, 0x3b, 0xb1, 0x4c}, {0xad, 0x30, 0xc1, 0x4b, 0xa, 0x50, 0xad, 0x34, 0x9c, 0xd4, 0xb, 0x3d, 0x49, 0xdb, 0x38, 0x8d, 0xbe, 0x89, 0xa, 0x50, 0x98, 0x3d, 0x5c, 0xa2, 0x9, 0x3b, 0xba, 0xee, 0x87, 0x3f, 0x1f, 0x2f}, {0x4a, 0xb6, 0x15, 0xe5, 0x75, 0x8c, 0x84, 0xf7, 0x38, 0x90, 0x4a, 0xdb, 0xba, 0x1, 0x95, 0xa5, 0x50, 0x1b, 0x75, 0x3f, 0x3f, 0x31, 0xd, 0xc2, 0xe8, 0x2e, 0xae, 0xc0, 0x53, 0xe3, 0xa1, 0x19}, }, { {0xbd, 0xbd, 0x96, 0xd5, 0xcd, 0x72, 0x21, 0xb4, 0x40, 0xfc, 0xee, 0x98, 0x43, 0x45, 0xe0, 0x93, 0xb5, 0x9, 0x41, 0xb4, 0x47, 0x53, 0xb1, 0x9f, 0x34, 0xae, 0x66, 0x2, 0x99, 0xd3, 0x6b, 0x73}, {0xc3, 0x5, 0xfa, 0xba, 0x60, 0x75, 0x1c, 0x7d, 0x61, 0x5e, 0xe5, 0xc6, 0xa0, 0xa0, 0xe1, 0xb3, 0x73, 0x64, 0xd6, 0xc0, 0x18, 0x97, 0x52, 0xe3, 0x86, 0x34, 0xc, 0xc2, 0x11, 0x6b, 0x54, 0x41}, {0xb4, 0xb3, 0x34, 0x93, 0x50, 0x2d, 0x53, 0x85, 0x73, 0x65, 0x81, 0x60, 0x4b, 0x11, 0xfd, 0x46, 0x75, 0x83, 0x5c, 0x42, 0x30, 0x5f, 0x5f, 0xcc, 0x5c, 0xab, 0x7f, 0xb8, 0xa2, 0x95, 0x22, 0x41}, }, { {0xc6, 0xea, 0x93, 0xe2, 0x61, 0x52, 0x65, 0x2e, 0xdb, 0xac, 0x33, 0x21, 0x3, 0x92, 0x5a, 0x84, 0x6b, 0x99, 0x0, 0x79, 0xcb, 0x75, 0x9, 0x46, 0x80, 0xdd, 0x5a, 0x19, 0x8d, 0xbb, 0x60, 0x7}, {0xe9, 0xd6, 0x7e, 0xf5, 0x88, 0x9b, 0xc9, 0x19, 0x25, 0xc8, 0xf8, 0x6d, 0x26, 0xcb, 0x93, 0x53, 0x73, 0xd2, 0xa, 0xb3, 0x13, 0x32, 0xee, 0x5c, 0x34, 0x2e, 0x2d, 0xb5, 0xeb, 0x53, 0xe1, 0x14}, {0x8a, 0x81, 0xe6, 0xcd, 0x17, 0x1a, 0x3e, 0x41, 0x84, 0xa0, 0x69, 0xed, 0xa9, 0x6d, 0x15, 0x57, 0xb1, 0xcc, 0xca, 0x46, 0x8f, 0x26, 0xbf, 0x2c, 0xf2, 0xc5, 0x3a, 0xc3, 0x9b, 0xbe, 0x34, 0x6b}, }, { {0xd3, 0xf2, 0x71, 0x65, 0x65, 0x69, 0xfc, 0x11, 0x7a, 0x73, 0xe, 0x53, 0x45, 0xe8, 0xc9, 0xc6, 0x35, 0x50, 0xfe, 0xd4, 0xa2, 0xe7, 0x3a, 0xe3, 0xb, 0xd3, 0x6d, 0x2e, 0xb6, 0xc7, 0xb9, 0x1}, {0xb2, 0xc0, 0x78, 0x3a, 0x64, 0x2f, 0xdf, 0xf3, 0x7c, 0x2, 0x2e, 0xf2, 0x1e, 0x97, 0x3e, 0x4c, 0xa3, 0xb5, 0xc1, 0x49, 0x5e, 0x1c, 0x7d, 0xec, 0x2d, 0xdd, 0x22, 0x9, 0x8f, 0xc1, 0x12, 0x20}, {0x29, 0x9d, 0xc8, 0x5a, 0xe5, 0x55, 0xb, 0x88, 0x63, 0xa7, 0xa0, 0x45, 0x1f, 0x24, 0x83, 0x14, 0x1f, 0x6c, 0xe7, 0xc2, 0xdf, 0xef, 0x36, 0x3d, 0xe8, 0xad, 0x4b, 0x4e, 0x78, 0x5b, 0xaf, 0x8}, }, { {0x4b, 0x2c, 0xcc, 0x89, 0xd2, 0x14, 0x73, 0xe2, 0x8d, 0x17, 0x87, 0xa2, 0x11, 0xbd, 0xe4, 0x4b, 0xce, 0x64, 0x33, 0xfa, 0xd6, 0x28, 0xd5, 0x18, 0x6e, 0x82, 0xd9, 0xaf, 0xd5, 0xc1, 0x23, 0x64}, {0x33, 0x25, 0x1f, 0x88, 0xdc, 0x99, 0x34, 0x28, 0xb6, 0x23, 0x93, 0x77, 0xda, 0x25, 0x5, 0x9d, 0xf4, 0x41, 0x34, 0x67, 0xfb, 0xdd, 0x7a, 0x89, 0x8d, 0x16, 0x3a, 0x16, 0x71, 0x9d, 0xb7, 0x32}, {0x6a, 0xb3, 0xfc, 0xed, 0xd9, 0xf8, 0x85, 0xcc, 0xf9, 0xe5, 0x46, 0x37, 0x8f, 0xc2, 0xbc, 0x22, 0xcd, 0xd3, 0xe5, 0xf9, 0x38, 0xe3, 0x9d, 0xe4, 0xcc, 0x2d, 0x3e, 0xc1, 0xfb, 0x5e, 0xa, 0x48}, }, { {0x1f, 0x22, 0xce, 0x42, 0xe4, 0x4c, 0x61, 0xb6, 0x28, 0x39, 0x5, 0x4c, 0xcc, 0x9d, 0x19, 0x6e, 0x3, 0xbe, 0x1c, 0xdc, 0xa4, 0xb4, 0x3f, 0x66, 0x6, 0x8e, 0x1c, 0x69, 0x47, 0x1d, 0xb3, 0x24}, {0x71, 0x20, 0x62, 0x1, 0xb, 0xe7, 0x51, 0xb, 0xc5, 0xaf, 0x1d, 0x8b, 0xcf, 0x5, 0xb5, 0x6, 0xcd, 
0xab, 0x5a, 0xef, 0x61, 0xb0, 0x6b, 0x2c, 0x31, 0xbf, 0xb7, 0xc, 0x60, 0x27, 0xaa, 0x47}, {0xc3, 0xf8, 0x15, 0xc0, 0xed, 0x1e, 0x54, 0x2a, 0x7c, 0x3f, 0x69, 0x7c, 0x7e, 0xfe, 0xa4, 0x11, 0xd6, 0x78, 0xa2, 0x4e, 0x13, 0x66, 0xaf, 0xf0, 0x94, 0xa0, 0xdd, 0x14, 0x5d, 0x58, 0x5b, 0x54}, }, { {0xe1, 0x21, 0xb3, 0xe3, 0xd0, 0xe4, 0x4, 0x62, 0x95, 0x1e, 0xff, 0x28, 0x7a, 0x63, 0xaa, 0x3b, 0x9e, 0xbd, 0x99, 0x5b, 0xfd, 0xcf, 0xc, 0xb, 0x71, 0xd0, 0xc8, 0x64, 0x3e, 0xdc, 0x22, 0x4d}, {0xf, 0x3a, 0xd4, 0xa0, 0x5e, 0x27, 0xbf, 0x67, 0xbe, 0xee, 0x9b, 0x8, 0x34, 0x8e, 0xe6, 0xad, 0x2e, 0xe7, 0x79, 0xd4, 0x4c, 0x13, 0x89, 0x42, 0x54, 0x54, 0xba, 0x32, 0xc3, 0xf9, 0x62, 0xf}, {0x39, 0x5f, 0x3b, 0xd6, 0x89, 0x65, 0xb4, 0xfc, 0x61, 0xcf, 0xcb, 0x57, 0x3f, 0x6a, 0xae, 0x5c, 0x5, 0xfa, 0x3a, 0x95, 0xd2, 0xc2, 0xba, 0xfe, 0x36, 0x14, 0x37, 0x36, 0x1a, 0xa0, 0xf, 0x1c}, }, }, { { {0x50, 0x6a, 0x93, 0x8c, 0xe, 0x2b, 0x8, 0x69, 0xb6, 0xc5, 0xda, 0xc1, 0x35, 0xa0, 0xc9, 0xf9, 0x34, 0xb6, 0xdf, 0xc4, 0x54, 0x3e, 0xb7, 0x6f, 0x40, 0xc1, 0x2b, 0x1d, 0x9b, 0x41, 0x5, 0x40}, {0xff, 0x3d, 0x94, 0x22, 0xb6, 0x4, 0xc6, 0xd2, 0xa0, 0xb3, 0xcf, 0x44, 0xce, 0xbe, 0x8c, 0xbc, 0x78, 0x86, 0x80, 0x97, 0xf3, 0x4f, 0x25, 0x5d, 0xbf, 0xa6, 0x1c, 0x3b, 0x4f, 0x61, 0xa3, 0xf}, {0xf0, 0x82, 0xbe, 0xb9, 0xbd, 0xfe, 0x3, 0xa0, 0x90, 0xac, 0x44, 0x3a, 0xaf, 0xc1, 0x89, 0x20, 0x8e, 0xfa, 0x54, 0x19, 0x91, 0x9f, 0x49, 0xf8, 0x42, 0xab, 0x40, 0xef, 0x8a, 0x21, 0xba, 0x1f}, }, { {0x94, 0x1, 0x7b, 0x3e, 0x4, 0x57, 0x3e, 0x4f, 0x7f, 0xaf, 0xda, 0x8, 0xee, 0x3e, 0x1d, 0xa8, 0xf1, 0xde, 0xdc, 0x99, 0xab, 0xc6, 0x39, 0xc8, 0xd5, 0x61, 0x77, 0xff, 0x13, 0x5d, 0x53, 0x6c}, {0x3e, 0xf5, 0xc8, 0xfa, 0x48, 0x94, 0x54, 0xab, 0x41, 0x37, 0xa6, 0x7b, 0x9a, 0xe8, 0xf6, 0x81, 0x1, 0x5e, 0x2b, 0x6c, 0x7d, 0x6c, 0xfd, 0x74, 0x42, 0x6e, 0xc8, 0xa8, 0xca, 0x3a, 0x2e, 0x39}, {0xaf, 0x35, 0x8a, 0x3e, 0xe9, 0x34, 0xbd, 0x4c, 0x16, 0xe8, 0x87, 0x58, 0x44, 0x81, 0x7, 0x2e, 0xab, 0xb0, 0x9a, 0xf2, 0x76, 0x9c, 0x31, 0x19, 0x3b, 0xc1, 0xa, 0xd5, 0xe4, 0x7f, 0xe1, 0x25}, }, { {0xa7, 0x21, 0xf1, 0x76, 0xf5, 0x7f, 0x5f, 0x91, 0xe3, 0x87, 0xcd, 0x2f, 0x27, 0x32, 0x4a, 0xc3, 0x26, 0xe5, 0x1b, 0x4d, 0xde, 0x2f, 0xba, 0xcc, 0x9b, 0x89, 0x69, 0x89, 0x8f, 0x82, 0xba, 0x6b}, {0x76, 0xf6, 0x4, 0x1e, 0xd7, 0x9b, 0x28, 0xa, 0x95, 0xf, 0x42, 0xd6, 0x52, 0x1c, 0x8e, 0x20, 0xab, 0x1f, 0x69, 0x34, 0xb0, 0xd8, 0x86, 0x51, 0x51, 0xb3, 0x9f, 0x2a, 0x44, 0x51, 0x57, 0x25}, {0x1, 0x39, 0xfe, 0x90, 0x66, 0xbc, 0xd1, 0xe2, 0xd5, 0x7a, 0x99, 0xa0, 0x18, 0x4a, 0xb5, 0x4c, 0xd4, 0x60, 0x84, 0xaf, 0x14, 0x69, 0x1d, 0x97, 0xe4, 0x7b, 0x6b, 0x7f, 0x4f, 0x50, 0x9d, 0x55}, }, { {0xfd, 0x66, 0xd2, 0xf6, 0xe7, 0x91, 0x48, 0x9c, 0x1b, 0x78, 0x7, 0x3, 0x9b, 0xa1, 0x44, 0x7, 0x3b, 0xe2, 0x61, 0x60, 0x1d, 0x8f, 0x38, 0x88, 0xe, 0xd5, 0x4b, 0x35, 0xa3, 0xa6, 0x3e, 0x12}, {0xd5, 0x54, 0xeb, 0xb3, 0x78, 0x83, 0x73, 0xa7, 0x7c, 0x3c, 0x55, 0xa5, 0x66, 0xd3, 0x69, 0x1d, 0xba, 0x0, 0x28, 0xf9, 0x62, 0xcf, 0x26, 0xa, 0x17, 0x32, 0x7e, 0x80, 0xd5, 0x12, 0xab, 0x1}, {0x96, 0x2d, 0xe3, 0x41, 0x90, 0x18, 0x8d, 0x11, 0x48, 0x58, 0x31, 0xd8, 0xc2, 0xe3, 0xed, 0xb9, 0xd9, 0x45, 0x32, 0xd8, 0x71, 0x42, 0xab, 0x1e, 0x54, 0xa1, 0x18, 0xc9, 0xe2, 0x61, 0x39, 0x4a}, }, { {0x1e, 0x3f, 0x23, 0xf3, 0x44, 0xd6, 0x27, 0x3, 0x16, 0xf0, 0xfc, 0x34, 0xe, 0x26, 0x9a, 0x49, 0x79, 0xb9, 0xda, 0xf2, 0x16, 0xa7, 0xb5, 0x83, 0x1f, 0x11, 0xd4, 0x9b, 0xad, 0xee, 0xac, 0x68}, {0xa0, 0xbb, 0xe6, 0xf8, 0xe0, 0x3b, 0xdc, 0x71, 0xa, 0xe3, 0xff, 0x7e, 0x34, 0xf8, 0xce, 0xd6, 0x6a, 0x47, 0x3a, 0xe1, 0x5f, 0x42, 0x92, 0xa9, 0x63, 0xb7, 0x1d, 
0xfb, 0xe3, 0xbc, 0xd6, 0x2c}, {0x10, 0xc2, 0xd7, 0xf3, 0xe, 0xc9, 0xb4, 0x38, 0xc, 0x4, 0xad, 0xb7, 0x24, 0x6e, 0x8e, 0x30, 0x23, 0x3e, 0xe7, 0xb7, 0xf1, 0xd9, 0x60, 0x38, 0x97, 0xf5, 0x8, 0xb5, 0xd5, 0x60, 0x57, 0x59}, }, { {0x90, 0x27, 0x2, 0xfd, 0xeb, 0xcb, 0x2a, 0x88, 0x60, 0x57, 0x11, 0xc4, 0x5, 0x33, 0xaf, 0x89, 0xf4, 0x73, 0x34, 0x7d, 0xe3, 0x92, 0xf4, 0x65, 0x2b, 0x5a, 0x51, 0x54, 0xdf, 0xc5, 0xb2, 0x2c}, {0x97, 0x63, 0xaa, 0x4, 0xe1, 0xbf, 0x29, 0x61, 0xcb, 0xfc, 0xa7, 0xa4, 0x8, 0x0, 0x96, 0x8f, 0x58, 0x94, 0x90, 0x7d, 0x89, 0xc0, 0x8b, 0x3f, 0xa9, 0x91, 0xb2, 0xdc, 0x3e, 0xa4, 0x9f, 0x70}, {0xca, 0x2a, 0xfd, 0x63, 0x8c, 0x5d, 0xa, 0xeb, 0xff, 0x4e, 0x69, 0x2e, 0x66, 0xc1, 0x2b, 0xd2, 0x3a, 0xb0, 0xcb, 0xf8, 0x6e, 0xf3, 0x23, 0x27, 0x1f, 0x13, 0xc8, 0xf0, 0xec, 0x29, 0xf0, 0x70}, }, { {0xb9, 0xb0, 0x10, 0x5e, 0xaa, 0xaf, 0x6a, 0x2a, 0xa9, 0x1a, 0x4, 0xef, 0x70, 0xa3, 0xf0, 0x78, 0x1f, 0xd6, 0x3a, 0xaa, 0x77, 0xfb, 0x3e, 0x77, 0xe1, 0xd9, 0x4b, 0xa7, 0xa2, 0xa5, 0xec, 0x44}, {0x33, 0x3e, 0xed, 0x2e, 0xb3, 0x7, 0x13, 0x46, 0xe7, 0x81, 0x55, 0xa4, 0x33, 0x2f, 0x4, 0xae, 0x66, 0x3, 0x5f, 0x19, 0xd3, 0x49, 0x44, 0xc9, 0x58, 0x48, 0x31, 0x6c, 0x8a, 0x5d, 0x7d, 0xb}, {0x43, 0xd5, 0x95, 0x7b, 0x32, 0x48, 0xd4, 0x25, 0x1d, 0xf, 0x34, 0xa3, 0x0, 0x83, 0xd3, 0x70, 0x2b, 0xc5, 0xe1, 0x60, 0x1c, 0x53, 0x1c, 0xde, 0xe4, 0xe9, 0x7d, 0x2c, 0x51, 0x24, 0x22, 0x27}, }, { {0xfc, 0x75, 0xa9, 0x42, 0x8a, 0xbb, 0x7b, 0xbf, 0x58, 0xa3, 0xad, 0x96, 0x77, 0x39, 0x5c, 0x8c, 0x48, 0xaa, 0xed, 0xcd, 0x6f, 0xc7, 0x7f, 0xe2, 0xa6, 0x20, 0xbc, 0xf6, 0xd7, 0x5f, 0x73, 0x19}, {0x2e, 0x34, 0xc5, 0x49, 0xaf, 0x92, 0xbc, 0x1a, 0xd0, 0xfa, 0xe6, 0xb2, 0x11, 0xd8, 0xee, 0xff, 0x29, 0x4e, 0xc8, 0xfc, 0x8d, 0x8c, 0xa2, 0xef, 0x43, 0xc5, 0x4c, 0xa4, 0x18, 0xdf, 0xb5, 0x11}, {0x66, 0x42, 0xc8, 0x42, 0xd0, 0x90, 0xab, 0xe3, 0x7e, 0x54, 0x19, 0x7f, 0xf, 0x8e, 0x84, 0xeb, 0xb9, 0x97, 0xa4, 0x65, 0xd0, 0xa1, 0x3, 0x25, 0x5f, 0x89, 0xdf, 0x91, 0x11, 0x91, 0xef, 0xf}, }, }, }; #endif // OPENSSL_SMALL // Bi[i] = (2*i+1)*B static const ge_precomp Bi[8] = { { {{ #if defined(OPENSSL_64_BIT) 1288382639258501, 245678601348599, 269427782077623, 1462984067271730, 137412439391563 #else 25967493, 19198397, 29566455, 3660896, 54414519, 4014786, 27544626, 21800161, 61029707, 2047604 #endif }}, {{ #if defined(OPENSSL_64_BIT) 62697248952638, 204681361388450, 631292143396476, 338455783676468, 1213667448819585 #else 54563134, 934261, 64385954, 3049989, 66381436, 9406985, 12720692, 5043384, 19500929, 18085054 #endif }}, {{ #if defined(OPENSSL_64_BIT) 301289933810280, 1259582250014073, 1422107436869536, 796239922652654, 1953934009299142 #else 58370664, 4489569, 9688441, 18769238, 10184608, 21191052, 29287918, 11864899, 42594502, 29115885 #endif }}, }, { {{ #if defined(OPENSSL_64_BIT) 1601611775252272, 1720807796594148, 1132070835939856, 1260455018889551, 2147779492816911 #else 15636272, 23865875, 24204772, 25642034, 616976, 16869170, 27787599, 18782243, 28944399, 32004408 #endif }}, {{ #if defined(OPENSSL_64_BIT) 316559037616741, 2177824224946892, 1459442586438991, 1461528397712656, 751590696113597 #else 16568933, 4717097, 55552716, 32452109, 15682895, 21747389, 16354576, 21778470, 7689661, 11199574 #endif }}, {{ #if defined(OPENSSL_64_BIT) 1850748884277385, 1200145853858453, 1068094770532492, 672251375690438, 1586055907191707 #else 30464137, 27578307, 55329429, 17883566, 23220364, 15915852, 7512774, 10017326, 49359771, 23634074 #endif }}, }, { {{ #if defined(OPENSSL_64_BIT) 769950342298419, 132954430919746, 844085933195555, 974092374476333, 
726076285546016 #else 10861363, 11473154, 27284546, 1981175, 37044515, 12577860, 32867885, 14515107, 51670560, 10819379 #endif }}, {{ #if defined(OPENSSL_64_BIT) 425251763115706, 608463272472562, 442562545713235, 837766094556764, 374555092627893 #else 4708026, 6336745, 20377586, 9066809, 55836755, 6594695, 41455196, 12483687, 54440373, 5581305 #endif }}, {{ #if defined(OPENSSL_64_BIT) 1086255230780037, 274979815921559, 1960002765731872, 929474102396301, 1190409889297339 #else 19563141, 16186464, 37722007, 4097518, 10237984, 29206317, 28542349, 13850243, 43430843, 17738489 #endif }}, }, { {{ #if defined(OPENSSL_64_BIT) 665000864555967, 2065379846933859, 370231110385876, 350988370788628, 1233371373142985 #else 5153727, 9909285, 1723747, 30776558, 30523604, 5516873, 19480852, 5230134, 43156425, 18378665 #endif }}, {{ #if defined(OPENSSL_64_BIT) 2019367628972465, 676711900706637, 110710997811333, 1108646842542025, 517791959672113 #else 36839857, 30090922, 7665485, 10083793, 28475525, 1649722, 20654025, 16520125, 30598449, 7715701 #endif }}, {{ #if defined(OPENSSL_64_BIT) 965130719900578, 247011430587952, 526356006571389, 91986625355052, 2157223321444601 #else 28881826, 14381568, 9657904, 3680757, 46927229, 7843315, 35708204, 1370707, 29794553, 32145132 #endif }}, }, { {{ #if defined(OPENSSL_64_BIT) 1802695059465007, 1664899123557221, 593559490740857, 2160434469266659, 927570450755031 #else 44589871, 26862249, 14201701, 24808930, 43598457, 8844725, 18474211, 32192982, 54046167, 13821876 #endif }}, {{ #if defined(OPENSSL_64_BIT) 1725674970513508, 1933645953859181, 1542344539275782, 1767788773573747, 1297447965928905 #else 60653668, 25714560, 3374701, 28813570, 40010246, 22982724, 31655027, 26342105, 18853321, 19333481 #endif }}, {{ #if defined(OPENSSL_64_BIT) 1381809363726107, 1430341051343062, 2061843536018959, 1551778050872521, 2036394857967624 #else 4566811, 20590564, 38133974, 21313742, 59506191, 30723862, 58594505, 23123294, 2207752, 30344648 #endif }}, }, { {{ #if defined(OPENSSL_64_BIT) 1970894096313054, 528066325833207, 1619374932191227, 2207306624415883, 1169170329061080 #else 41954014, 29368610, 29681143, 7868801, 60254203, 24130566, 54671499, 32891431, 35997400, 17421995 #endif }}, {{ #if defined(OPENSSL_64_BIT) 2070390218572616, 1458919061857835, 624171843017421, 1055332792707765, 433987520732508 #else 25576264, 30851218, 7349803, 21739588, 16472781, 9300885, 3844789, 15725684, 171356, 6466918 #endif }}, {{ #if defined(OPENSSL_64_BIT) 893653801273833, 1168026499324677, 1242553501121234, 1306366254304474, 1086752658510815 #else 23103977, 13316479, 9739013, 17404951, 817874, 18515490, 8965338, 19466374, 36393951, 16193876 #endif }}, }, { {{ #if defined(OPENSSL_64_BIT) 213454002618221, 939771523987438, 1159882208056014, 317388369627517, 621213314200687 #else 33587053, 3180712, 64714734, 14003686, 50205390, 17283591, 17238397, 4729455, 49034351, 9256799 #endif }}, {{ #if defined(OPENSSL_64_BIT) 1971678598905747, 338026507889165, 762398079972271, 655096486107477, 42299032696322 #else 41926547, 29380300, 32336397, 5036987, 45872047, 11360616, 22616405, 9761698, 47281666, 630304 #endif }}, {{ #if defined(OPENSSL_64_BIT) 177130678690680, 1754759263300204, 1864311296286618, 1180675631479880, 1292726903152791 #else 53388152, 2639452, 42871404, 26147950, 9494426, 27780403, 60554312, 17593437, 64659607, 19263131 #endif }}, }, { {{ #if defined(OPENSSL_64_BIT) 1913163449625248, 460779200291993, 2193883288642314, 1008900146920800, 1721983679009502 #else 63957664, 28508356, 9282713, 6866145, 
35201802, 32691408, 48168288, 15033783, 25105118, 25659556 #endif }}, {{ #if defined(OPENSSL_64_BIT) 1070401523076875, 1272492007800961, 1910153608563310, 2075579521696771, 1191169788841221 #else 42782475, 15950225, 35307649, 18961608, 55446126, 28463506, 1573891, 30928545, 2198789, 17749813 #endif }}, {{ #if defined(OPENSSL_64_BIT) 692896803108118, 500174642072499, 2068223309439677, 1162190621851337, 1426986007309901 #else 64009494, 10324966, 64867251, 7453182, 61661885, 30818928, 53296841, 17317989, 34647629, 21263748 #endif }}, }, }; ring-0.17.14/crypto/curve25519/internal.h000064400000000000000000000100161046102023000157720ustar 00000000000000// Copyright 2020 The BoringSSL Authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef OPENSSL_HEADER_CURVE25519_INTERNAL_H #define OPENSSL_HEADER_CURVE25519_INTERNAL_H #include #include "../internal.h" #if defined(OPENSSL_ARM) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_APPLE) #define BORINGSSL_X25519_NEON // x25519_NEON is defined in asm/x25519-arm.S. void x25519_NEON(uint8_t out[32], const uint8_t scalar[32], const uint8_t point[32]); #endif #if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_SMALL) && \ defined(__GNUC__) && defined(__x86_64__) && !defined(OPENSSL_WINDOWS) #define BORINGSSL_FE25519_ADX // fiat_curve25519_adx_mul is defined in // third_party/fiat/asm/fiat_curve25519_adx_mul.S void __attribute__((sysv_abi)) fiat_curve25519_adx_mul(uint64_t out[4], const uint64_t in1[4], const uint64_t in2[4]); // fiat_curve25519_adx_square is defined in // third_party/fiat/asm/fiat_curve25519_adx_square.S void __attribute__((sysv_abi)) fiat_curve25519_adx_square(uint64_t out[4], const uint64_t in[4]); // x25519_scalar_mult_adx is defined in third_party/fiat/curve25519_64_adx.h void x25519_scalar_mult_adx(uint8_t out[32], const uint8_t scalar[32], const uint8_t point[32]); void x25519_ge_scalarmult_base_adx(uint8_t h[4][32], const uint8_t a[32]); #endif #if defined(OPENSSL_64_BIT) // An element t, // entries t[0]...t[4], represents the integer t[0]+2^51 t[1]+2^102 t[2]+2^153 // t[3]+2^204 t[4]. // fe limbs are bounded by 1.125*2^51. // fe_loose limbs are bounded by 3.375*2^51. typedef uint64_t fe_limb_t; #define FE_NUM_LIMBS 5 #else // An element t, // entries t[0]...t[9], represents the integer t[0]+2^26 t[1]+2^51 t[2]+2^77 // t[3]+2^102 t[4]+...+2^230 t[9]. // fe limbs are bounded by 1.125*2^26,1.125*2^25,1.125*2^26,1.125*2^25,etc. // fe_loose limbs are bounded by 3.375*2^26,3.375*2^25,3.375*2^26,3.375*2^25,etc. typedef uint32_t fe_limb_t; #define FE_NUM_LIMBS 10 #endif // fe means field element. Here the field is \Z/(2^255-19). // Multiplication and carrying produce fe from fe_loose. // Keep in sync with `Elem` and `ELEM_LIMBS` in curve25519/ops.rs. typedef struct fe { fe_limb_t v[FE_NUM_LIMBS]; } fe; // Addition and subtraction produce fe_loose from (fe, fe). // Keep in sync with `Elem` and `ELEM_LIMBS` in curve25519/ops.rs. 
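// (Illustrative sketch added here; not part of the upstream header, and the
// helper name is hypothetical.) On the 64-bit build an fe with limbs
// t[0..4] represents t[0] + 2^51*t[1] + 2^102*t[2] + 2^153*t[3] + 2^204*t[4],
// so a value below 2^51 fits entirely in the first limb:
static inline void fe_from_small_example(fe *r, fe_limb_t x) {
  r->v[0] = x;  // assumes x < 2^51 (x < 2^26 on the 32-bit build)
  for (size_t i = 1; i < FE_NUM_LIMBS; ++i) {
    r->v[i] = 0;
  }
}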
typedef struct fe_loose { fe_limb_t v[FE_NUM_LIMBS]; } fe_loose; static inline void fe_limbs_copy(fe_limb_t r[], const fe_limb_t a[]) { for (size_t i = 0; i < FE_NUM_LIMBS; ++i) { r[i] = a[i]; } } // ge means group element. // // Here the group is the set of pairs (x,y) of field elements (see fe.h) // satisfying -x^2 + y^2 = 1 + d x^2y^2 // where d = -121665/121666. // // Representations: // ge_p2 (projective): (X:Y:Z) satisfying x=X/Z, y=Y/Z // ge_p3 (extended): (X:Y:Z:T) satisfying x=X/Z, y=Y/Z, XY=ZT // ge_p1p1 (completed): ((X:Z),(Y:T)) satisfying x=X/Z, y=Y/T // ge_precomp (Duif): (y+x,y-x,2dxy) // Keep in sync with `Point` in curve25519/ops.rs. typedef struct { fe X; fe Y; fe Z; } ge_p2; // Keep in sync with `ExtPoint` in curve25519/ops.rs. typedef struct { fe X; fe Y; fe Z; fe T; } ge_p3; typedef struct { fe_loose X; fe_loose Y; fe_loose Z; fe_loose T; } ge_p1p1; typedef struct { fe_loose yplusx; fe_loose yminusx; fe_loose xy2d; } ge_precomp; typedef struct { fe_loose YplusX; fe_loose YminusX; fe_loose Z; fe_loose T2d; } ge_cached; extern const uint8_t k25519Precomp[32][8][3][32]; #endif // OPENSSL_HEADER_CURVE25519_INTERNAL_H ring-0.17.14/crypto/fipsmodule/aes/aes_nohw.c000064400000000000000000001046451046102023000171550ustar 00000000000000/* Copyright (c) 2019, Google Inc. * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include #include "../../internal.h" // This file contains a constant-time implementation of AES, bitsliced with // 32-bit or 64-bit, operating on two-, four-, and eight-block // batches, respectively. // // This implementation is based on the algorithms described in the following // references: // - https://bearssl.org/constanttime.html#aes // - https://eprint.iacr.org/2009/129.pdf // - https://eprint.iacr.org/2009/191.pdf // Word operations. // // An aes_word_t is the word used for this AES implementation. Throughout this // file, bits and bytes are ordered little-endian, though "left" and "right" // shifts match the operations themselves, which makes them reversed in a // little-endian, left-to-right reading. // // Eight |aes_word_t|s contain |AES_NOHW_BATCH_SIZE| blocks. The bits in an // |aes_word_t| are divided into 16 consecutive groups of |AES_NOHW_BATCH_SIZE| // bits each, each corresponding to a byte in an AES block in column-major // order (AES's byte order). We refer to these as "logical bytes". Note, in the // 32-bit and 64-bit implementations, they are smaller than a byte. (The // contents of a logical byte will be described later.) // // MSVC does not support C bit operators on |__m128i|, so the wrapper functions // |aes_nohw_and|, etc., should be used instead. Note |aes_nohw_shift_left| and // |aes_nohw_shift_right| measure the shift in logical bytes. That is, the shift // value ranges from 0 to 15 independent of |aes_word_t| and // |AES_NOHW_BATCH_SIZE|. 
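// (Added sketch; not in the upstream comment, and the helper name is made
// up.) In the 64-bit build AES_NOHW_BATCH_SIZE is 4, so a logical byte is a
// 4-bit group and a shift by one logical byte is a shift by 4 bits. The
// helper below mirrors what aes_nohw_shift_left computes in that
// configuration, written against plain uint64_t so the unit is explicit:
static inline uint64_t aes_nohw_shift_left_u64_sketch(uint64_t a, uint64_t i) {
  return a << (i * 4);  // e.g. i == 1 turns 0x00ab into 0x0ab0
}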
// // This ordering is different from https://eprint.iacr.org/2009/129.pdf, which // uses row-major order. Matching the AES order was easier to reason about, and // we do not have PSHUFB available to arbitrarily permute bytes. #if defined(OPENSSL_64_BIT) typedef uint64_t aes_word_t; #define AES_NOHW_WORD_SIZE 8 #define AES_NOHW_BATCH_SIZE 4 #define AES_NOHW_ROW0_MASK UINT64_C(0x000f000f000f000f) #define AES_NOHW_ROW1_MASK UINT64_C(0x00f000f000f000f0) #define AES_NOHW_ROW2_MASK UINT64_C(0x0f000f000f000f00) #define AES_NOHW_ROW3_MASK UINT64_C(0xf000f000f000f000) #else // !OPENSSL_64_BIT typedef uint32_t aes_word_t; #define AES_NOHW_WORD_SIZE 4 #define AES_NOHW_BATCH_SIZE 2 #define AES_NOHW_ROW0_MASK 0x03030303 #define AES_NOHW_ROW1_MASK 0x0c0c0c0c #define AES_NOHW_ROW2_MASK 0x30303030 #define AES_NOHW_ROW3_MASK 0xc0c0c0c0 #endif // OPENSSL_64_BIT static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) { return a & b; } static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) { return a | b; } static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) { return a ^ b; } static inline aes_word_t aes_nohw_not(aes_word_t a) { return ~a; } static inline aes_word_t aes_nohw_shift_left(aes_word_t a, aes_word_t i) { return a << (i * AES_NOHW_BATCH_SIZE); } static inline aes_word_t aes_nohw_shift_right(aes_word_t a, aes_word_t i) { return a >> (i * AES_NOHW_BATCH_SIZE); } OPENSSL_STATIC_ASSERT(AES_NOHW_BATCH_SIZE * 128 == 8 * 8 * sizeof(aes_word_t), "batch size does not match word size"); OPENSSL_STATIC_ASSERT(AES_NOHW_WORD_SIZE == sizeof(aes_word_t), "AES_NOHW_WORD_SIZE is incorrect"); // Block representations. // // This implementation uses three representations for AES blocks. First, the // public API represents blocks as uint8_t[16] in the usual way. Second, most // AES steps are evaluated in bitsliced form, stored in an |AES_NOHW_BATCH|. // This stores |AES_NOHW_BATCH_SIZE| blocks in bitsliced order. For 64-bit words // containing bitsliced blocks a, b, c, d, this would be as follows (vertical // bars divide logical bytes): // // batch.w[0] = a0 b0 c0 d0 | a8 b8 c8 d8 | a16 b16 c16 d16 ... // batch.w[1] = a1 b1 c1 d1 | a9 b9 c9 d9 | a17 b17 c17 d17 ... // batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ... // batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ... // ... // // Finally, an individual block may be stored as an intermediate form in an // aes_word_t[AES_NOHW_BLOCK_WORDS]. In this form, we permute the bits in each // block, so that block[0]'s ith logical byte contains least-significant // |AES_NOHW_BATCH_SIZE| bits of byte i, block[1] contains the next group of // |AES_NOHW_BATCH_SIZE| bits, and so on. We refer to this transformation as // "compacting" the block. Note this is no-op with 128-bit words because then // |AES_NOHW_BLOCK_WORDS| is one and |AES_NOHW_BATCH_SIZE| is eight. For 64-bit // words, one block would be stored in two words: // // block[0] = a0 a1 a2 a3 | a8 a9 a10 a11 | a16 a17 a18 a19 ... // block[1] = a4 a5 a6 a7 | a12 a13 a14 a15 | a20 a21 a22 a23 ... // // Observe that the distances between corresponding bits in bitsliced and // compact bit orders match. If we line up corresponding words of each block, // the bitsliced and compact representations may be converted by tranposing bits // in corresponding logical bytes. Continuing the 64-bit example: // // block_a[0] = a0 a1 a2 a3 | a8 a9 a10 a11 | a16 a17 a18 a19 ... // block_b[0] = b0 b1 b2 b3 | b8 b9 b10 b11 | b16 b17 b18 b19 ... 
// block_c[0] = c0 c1 c2 c3 | c8 c9 c10 c11 | c16 c17 c18 c19 ... // block_d[0] = d0 d1 d2 d3 | d8 d9 d10 d11 | d16 d17 d18 d19 ... // // batch.w[0] = a0 b0 c0 d0 | a8 b8 c8 d8 | a16 b16 c16 d16 ... // batch.w[1] = a1 b1 c1 d1 | a9 b9 c9 d9 | a17 b17 c17 d17 ... // batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ... // batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ... // // Note also that bitwise operations and (logical) byte permutations on an // |aes_word_t| work equally for the bitsliced and compact words. // // We use the compact form in the |AES_KEY| representation to save work // inflating round keys into |AES_NOHW_BATCH|. The compact form also exists // temporarily while moving blocks in or out of an |AES_NOHW_BATCH|, immediately // before or after |aes_nohw_transpose|. #define AES_NOHW_BLOCK_WORDS (16 / sizeof(aes_word_t)) // An AES_NOHW_BATCH stores |AES_NOHW_BATCH_SIZE| blocks. Unless otherwise // specified, it is in bitsliced form. typedef struct { aes_word_t w[8]; } AES_NOHW_BATCH; // An AES_NOHW_SCHEDULE is an expanded bitsliced AES key schedule. It is // suitable for encryption or decryption. It is as large as |AES_NOHW_BATCH| // |AES_KEY|s so it should not be used as a long-term key representation. typedef struct { // keys is an array of batches, one for each round key. Each batch stores // |AES_NOHW_BATCH_SIZE| copies of the round key in bitsliced form. AES_NOHW_BATCH keys[AES_MAXNR + 1]; } AES_NOHW_SCHEDULE; // aes_nohw_batch_set sets the |i|th block of |batch| to |in|. |batch| is in // compact form. static inline void aes_nohw_batch_set(AES_NOHW_BATCH *batch, const aes_word_t in[AES_NOHW_BLOCK_WORDS], size_t i) { // Note the words are interleaved. The order comes from |aes_nohw_transpose|. // If |i| is zero and this is the 64-bit implementation, in[0] contains bits // 0-3 and in[1] contains bits 4-7. We place in[0] at w[0] and in[1] at // w[4] so that bits 0 and 4 are in the correct position. (In general, bits // along diagonals of |AES_NOHW_BATCH_SIZE| by |AES_NOHW_BATCH_SIZE| squares // will be correctly placed.) dev_assert_secret(i < AES_NOHW_BATCH_SIZE); #if defined(OPENSSL_64_BIT) batch->w[i] = in[0]; batch->w[i + 4] = in[1]; #else batch->w[i] = in[0]; batch->w[i + 2] = in[1]; batch->w[i + 4] = in[2]; batch->w[i + 6] = in[3]; #endif } // aes_nohw_batch_get writes the |i|th block of |batch| to |out|. |batch| is in // compact form. static inline void aes_nohw_batch_get(const AES_NOHW_BATCH *batch, aes_word_t out[AES_NOHW_BLOCK_WORDS], size_t i) { dev_assert_secret(i < AES_NOHW_BATCH_SIZE); #if defined(OPENSSL_64_BIT) out[0] = batch->w[i]; out[1] = batch->w[i + 4]; #else out[0] = batch->w[i]; out[1] = batch->w[i + 2]; out[2] = batch->w[i + 4]; out[3] = batch->w[i + 6]; #endif } // aes_nohw_delta_swap returns |a| with bits |a & mask| and // |a & (mask << shift)| swapped. |mask| and |mask << shift| may not overlap. static inline aes_word_t aes_nohw_delta_swap(aes_word_t a, aes_word_t mask, aes_word_t shift) { // See // https://reflectionsonsecurity.wordpress.com/2014/05/11/efficient-bit-permutation-using-delta-swaps/ aes_word_t b = (a ^ (a >> shift)) & mask; return a ^ b ^ (b << shift); } // In the 32-bit and 64-bit implementations, a block spans multiple words. // |aes_nohw_compact_block| must permute bits across different words. First we // implement |aes_nohw_compact_word| which performs a smaller version of the // transformation which stays within a single word. 
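// (Added self-check; not in the upstream source, and the function name is
// hypothetical.) A concrete instance of aes_nohw_delta_swap defined above:
// with mask = 0x0f and shift = 4 the two fields do not overlap, so the low
// nibble and the next nibble of the word are exchanged.
static inline int aes_nohw_delta_swap_selfcheck(void) {
  aes_word_t swapped =
      aes_nohw_delta_swap((aes_word_t)0xa5, (aes_word_t)0x0f, 4);
  return swapped == (aes_word_t)0x5a;  // 0xa5 -> 0x5a: nibbles swapped
}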
// // These transformations are generalizations of the output of // http://programming.sirrida.de/calcperm.php on smaller inputs. #if defined(OPENSSL_64_BIT) static inline uint64_t aes_nohw_compact_word(uint64_t a) { #if defined(RING_BIG_ENDIAN) a = CRYPTO_bswap8(a); #endif // Numbering the 64/2 = 16 4-bit chunks, least to most significant, we swap // quartets of those chunks: // 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 => // 0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15 a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4); // Swap quartets of 8-bit chunks (still numbering by 4-bit chunks): // 0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15 => // 0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15 a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8); // Swap quartets of 16-bit chunks (still numbering by 4-bit chunks): // 0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15 => // 0 2 4 6 | 8 10 12 14 | 1 3 5 7 | 9 11 13 15 a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16); return a; } static inline uint64_t aes_nohw_uncompact_word(uint64_t a) { // Reverse the steps of |aes_nohw_uncompact_word|. a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16); a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8); a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4); #if defined(RING_BIG_ENDIAN) a = CRYPTO_bswap8(a); #endif return a; } #else // !OPENSSL_64_BIT static inline uint32_t aes_nohw_compact_word(uint32_t a) { #if defined(RING_BIG_ENDIAN) a = CRYPTO_bswap4(a); #endif // Numbering the 32/2 = 16 pairs of bits, least to most significant, we swap: // 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 => // 0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15 // Note: 0x00cc = 0b0000_0000_1100_1100 // 0x00cc << 6 = 0b0011_0011_0000_0000 a = aes_nohw_delta_swap(a, 0x00cc00cc, 6); // Now we swap groups of four bits (still numbering by pairs): // 0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15 => // 0 4 8 12 | 1 5 9 13 | 2 6 10 14 | 3 7 11 15 // Note: 0x0000_f0f0 << 12 = 0x0f0f_0000 a = aes_nohw_delta_swap(a, 0x0000f0f0, 12); return a; } static inline uint32_t aes_nohw_uncompact_word(uint32_t a) { // Reverse the steps of |aes_nohw_uncompact_word|. a = aes_nohw_delta_swap(a, 0x0000f0f0, 12); a = aes_nohw_delta_swap(a, 0x00cc00cc, 6); #if defined(RING_BIG_ENDIAN) a = CRYPTO_bswap4(a); #endif return a; } static inline uint32_t aes_nohw_word_from_bytes(uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3) { return (uint32_t)a0 | ((uint32_t)a1 << 8) | ((uint32_t)a2 << 16) | ((uint32_t)a3 << 24); } static inline uint8_t lo(uint32_t a) { return (uint8_t)a; } #endif // OPENSSL_64_BIT static inline void aes_nohw_compact_block(aes_word_t out[AES_NOHW_BLOCK_WORDS], const uint8_t in[16]) { OPENSSL_memcpy(out, in, 16); #if defined(OPENSSL_64_BIT) uint64_t a0 = aes_nohw_compact_word(out[0]); uint64_t a1 = aes_nohw_compact_word(out[1]); out[0] = (a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32); out[1] = (a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32); #else uint32_t a0 = aes_nohw_compact_word(out[0]); uint32_t a1 = aes_nohw_compact_word(out[1]); uint32_t a2 = aes_nohw_compact_word(out[2]); uint32_t a3 = aes_nohw_compact_word(out[3]); // Note clang, when building for ARM Thumb2, will sometimes miscompile // expressions such as (a0 & 0x0000ff00) << 8, particularly when building // without optimizations. This bug was introduced in // https://reviews.llvm.org/rL340261 and fixed in // https://reviews.llvm.org/rL351310. The following is written to avoid this. 
out[0] = aes_nohw_word_from_bytes(lo(a0), lo(a1), lo(a2), lo(a3)); out[1] = aes_nohw_word_from_bytes(lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8), lo(a3 >> 8)); out[2] = aes_nohw_word_from_bytes(lo(a0 >> 16), lo(a1 >> 16), lo(a2 >> 16), lo(a3 >> 16)); out[3] = aes_nohw_word_from_bytes(lo(a0 >> 24), lo(a1 >> 24), lo(a2 >> 24), lo(a3 >> 24)); #endif } static inline void aes_nohw_uncompact_block( uint8_t out[16], const aes_word_t in[AES_NOHW_BLOCK_WORDS]) { #if defined(OPENSSL_64_BIT) uint64_t a0 = in[0]; uint64_t a1 = in[1]; uint64_t b0 = aes_nohw_uncompact_word((a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32)); uint64_t b1 = aes_nohw_uncompact_word((a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32)); OPENSSL_memcpy(out, &b0, 8); OPENSSL_memcpy(out + 8, &b1, 8); #else uint32_t a0 = in[0]; uint32_t a1 = in[1]; uint32_t a2 = in[2]; uint32_t a3 = in[3]; // Note clang, when building for ARM Thumb2, will sometimes miscompile // expressions such as (a0 & 0x0000ff00) << 8, particularly when building // without optimizations. This bug was introduced in // https://reviews.llvm.org/rL340261 and fixed in // https://reviews.llvm.org/rL351310. The following is written to avoid this. uint32_t b0 = aes_nohw_word_from_bytes(lo(a0), lo(a1), lo(a2), lo(a3)); uint32_t b1 = aes_nohw_word_from_bytes(lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8), lo(a3 >> 8)); uint32_t b2 = aes_nohw_word_from_bytes(lo(a0 >> 16), lo(a1 >> 16), lo(a2 >> 16), lo(a3 >> 16)); uint32_t b3 = aes_nohw_word_from_bytes(lo(a0 >> 24), lo(a1 >> 24), lo(a2 >> 24), lo(a3 >> 24)); b0 = aes_nohw_uncompact_word(b0); b1 = aes_nohw_uncompact_word(b1); b2 = aes_nohw_uncompact_word(b2); b3 = aes_nohw_uncompact_word(b3); OPENSSL_memcpy(out, &b0, 4); OPENSSL_memcpy(out + 4, &b1, 4); OPENSSL_memcpy(out + 8, &b2, 4); OPENSSL_memcpy(out + 12, &b3, 4); #endif } // aes_nohw_swap_bits is a variation on a delta swap. It swaps the bits in // |*a & (mask << shift)| with the bits in |*b & mask|. |mask| and // |mask << shift| must not overlap. |mask| is specified as a |uint32_t|, but it // is repeated to the full width of |aes_word_t|. static inline void aes_nohw_swap_bits(aes_word_t *a, aes_word_t *b, uint32_t mask, aes_word_t shift) { #if defined(OPENSSL_64_BIT) aes_word_t mask_w = (((uint64_t)mask) << 32) | mask; #else aes_word_t mask_w = mask; #endif // This is a variation on a delta swap. aes_word_t swap = ((*a >> shift) ^ *b) & mask_w; *a ^= swap << shift; *b ^= swap; } // aes_nohw_transpose converts |batch| to and from bitsliced form. It divides // the 8 × word_size bits into AES_NOHW_BATCH_SIZE × AES_NOHW_BATCH_SIZE squares // and transposes each square. static void aes_nohw_transpose(AES_NOHW_BATCH *batch) { // Swap bits with index 0 and 1 mod 2 (0x55 = 0b01010101). aes_nohw_swap_bits(&batch->w[0], &batch->w[1], 0x55555555, 1); aes_nohw_swap_bits(&batch->w[2], &batch->w[3], 0x55555555, 1); aes_nohw_swap_bits(&batch->w[4], &batch->w[5], 0x55555555, 1); aes_nohw_swap_bits(&batch->w[6], &batch->w[7], 0x55555555, 1); #if AES_NOHW_BATCH_SIZE >= 4 // Swap bits with index 0-1 and 2-3 mod 4 (0x33 = 0b00110011). aes_nohw_swap_bits(&batch->w[0], &batch->w[2], 0x33333333, 2); aes_nohw_swap_bits(&batch->w[1], &batch->w[3], 0x33333333, 2); aes_nohw_swap_bits(&batch->w[4], &batch->w[6], 0x33333333, 2); aes_nohw_swap_bits(&batch->w[5], &batch->w[7], 0x33333333, 2); #endif #if AES_NOHW_BATCH_SIZE >= 8 // Swap bits with index 0-3 and 4-7 mod 8 (0x0f = 0b00001111). 
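// (Added note.) With the 32-bit and 64-bit words used here,
// AES_NOHW_BATCH_SIZE is 2 or 4, so this third swap level is compiled out;
// it would only apply to a 128-bit, eight-block configuration.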
aes_nohw_swap_bits(&batch->w[0], &batch->w[4], 0x0f0f0f0f, 4); aes_nohw_swap_bits(&batch->w[1], &batch->w[5], 0x0f0f0f0f, 4); aes_nohw_swap_bits(&batch->w[2], &batch->w[6], 0x0f0f0f0f, 4); aes_nohw_swap_bits(&batch->w[3], &batch->w[7], 0x0f0f0f0f, 4); #endif } // aes_nohw_to_batch initializes |out| with the |num_blocks| blocks from |in|. // |num_blocks| must be at most |AES_NOHW_BATCH|. static void aes_nohw_to_batch(AES_NOHW_BATCH *out, const uint8_t *in, size_t num_blocks) { // Don't leave unused blocks uninitialized. OPENSSL_memset(out, 0, sizeof(AES_NOHW_BATCH)); debug_assert_nonsecret(num_blocks <= AES_NOHW_BATCH_SIZE); for (size_t i = 0; i < num_blocks; i++) { aes_word_t block[AES_NOHW_BLOCK_WORDS]; aes_nohw_compact_block(block, in + 16 * i); aes_nohw_batch_set(out, block, i); } aes_nohw_transpose(out); } // aes_nohw_to_batch writes the first |num_blocks| blocks in |batch| to |out|. // |num_blocks| must be at most |AES_NOHW_BATCH|. static void aes_nohw_from_batch(uint8_t *out, size_t num_blocks, const AES_NOHW_BATCH *batch) { AES_NOHW_BATCH copy = *batch; aes_nohw_transpose(©); debug_assert_nonsecret(num_blocks <= AES_NOHW_BATCH_SIZE); for (size_t i = 0; i < num_blocks; i++) { aes_word_t block[AES_NOHW_BLOCK_WORDS]; aes_nohw_batch_get(©, block, i); aes_nohw_uncompact_block(out + 16 * i, block); } } // AES round steps. static void aes_nohw_add_round_key(AES_NOHW_BATCH *batch, const AES_NOHW_BATCH *key) { for (size_t i = 0; i < 8; i++) { batch->w[i] = aes_nohw_xor(batch->w[i], key->w[i]); } } static void aes_nohw_sub_bytes(AES_NOHW_BATCH *batch) { // See https://eprint.iacr.org/2009/191.pdf, Appendix C. aes_word_t x0 = batch->w[7]; aes_word_t x1 = batch->w[6]; aes_word_t x2 = batch->w[5]; aes_word_t x3 = batch->w[4]; aes_word_t x4 = batch->w[3]; aes_word_t x5 = batch->w[2]; aes_word_t x6 = batch->w[1]; aes_word_t x7 = batch->w[0]; // Figure 2, the top linear transformation. aes_word_t y14 = aes_nohw_xor(x3, x5); aes_word_t y13 = aes_nohw_xor(x0, x6); aes_word_t y9 = aes_nohw_xor(x0, x3); aes_word_t y8 = aes_nohw_xor(x0, x5); aes_word_t t0 = aes_nohw_xor(x1, x2); aes_word_t y1 = aes_nohw_xor(t0, x7); aes_word_t y4 = aes_nohw_xor(y1, x3); aes_word_t y12 = aes_nohw_xor(y13, y14); aes_word_t y2 = aes_nohw_xor(y1, x0); aes_word_t y5 = aes_nohw_xor(y1, x6); aes_word_t y3 = aes_nohw_xor(y5, y8); aes_word_t t1 = aes_nohw_xor(x4, y12); aes_word_t y15 = aes_nohw_xor(t1, x5); aes_word_t y20 = aes_nohw_xor(t1, x1); aes_word_t y6 = aes_nohw_xor(y15, x7); aes_word_t y10 = aes_nohw_xor(y15, t0); aes_word_t y11 = aes_nohw_xor(y20, y9); aes_word_t y7 = aes_nohw_xor(x7, y11); aes_word_t y17 = aes_nohw_xor(y10, y11); aes_word_t y19 = aes_nohw_xor(y10, y8); aes_word_t y16 = aes_nohw_xor(t0, y11); aes_word_t y21 = aes_nohw_xor(y13, y16); aes_word_t y18 = aes_nohw_xor(x0, y16); // Figure 3, the middle non-linear section. 
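// (Added note.) The state is bitsliced, so each aes_word_t below carries one
// bit position of every byte in the batch; this single Boolean circuit
// therefore evaluates the S-box on all of those bytes in parallel, using only
// constant-time AND/XOR/NOT operations.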
aes_word_t t2 = aes_nohw_and(y12, y15); aes_word_t t3 = aes_nohw_and(y3, y6); aes_word_t t4 = aes_nohw_xor(t3, t2); aes_word_t t5 = aes_nohw_and(y4, x7); aes_word_t t6 = aes_nohw_xor(t5, t2); aes_word_t t7 = aes_nohw_and(y13, y16); aes_word_t t8 = aes_nohw_and(y5, y1); aes_word_t t9 = aes_nohw_xor(t8, t7); aes_word_t t10 = aes_nohw_and(y2, y7); aes_word_t t11 = aes_nohw_xor(t10, t7); aes_word_t t12 = aes_nohw_and(y9, y11); aes_word_t t13 = aes_nohw_and(y14, y17); aes_word_t t14 = aes_nohw_xor(t13, t12); aes_word_t t15 = aes_nohw_and(y8, y10); aes_word_t t16 = aes_nohw_xor(t15, t12); aes_word_t t17 = aes_nohw_xor(t4, t14); aes_word_t t18 = aes_nohw_xor(t6, t16); aes_word_t t19 = aes_nohw_xor(t9, t14); aes_word_t t20 = aes_nohw_xor(t11, t16); aes_word_t t21 = aes_nohw_xor(t17, y20); aes_word_t t22 = aes_nohw_xor(t18, y19); aes_word_t t23 = aes_nohw_xor(t19, y21); aes_word_t t24 = aes_nohw_xor(t20, y18); aes_word_t t25 = aes_nohw_xor(t21, t22); aes_word_t t26 = aes_nohw_and(t21, t23); aes_word_t t27 = aes_nohw_xor(t24, t26); aes_word_t t28 = aes_nohw_and(t25, t27); aes_word_t t29 = aes_nohw_xor(t28, t22); aes_word_t t30 = aes_nohw_xor(t23, t24); aes_word_t t31 = aes_nohw_xor(t22, t26); aes_word_t t32 = aes_nohw_and(t31, t30); aes_word_t t33 = aes_nohw_xor(t32, t24); aes_word_t t34 = aes_nohw_xor(t23, t33); aes_word_t t35 = aes_nohw_xor(t27, t33); aes_word_t t36 = aes_nohw_and(t24, t35); aes_word_t t37 = aes_nohw_xor(t36, t34); aes_word_t t38 = aes_nohw_xor(t27, t36); aes_word_t t39 = aes_nohw_and(t29, t38); aes_word_t t40 = aes_nohw_xor(t25, t39); aes_word_t t41 = aes_nohw_xor(t40, t37); aes_word_t t42 = aes_nohw_xor(t29, t33); aes_word_t t43 = aes_nohw_xor(t29, t40); aes_word_t t44 = aes_nohw_xor(t33, t37); aes_word_t t45 = aes_nohw_xor(t42, t41); aes_word_t z0 = aes_nohw_and(t44, y15); aes_word_t z1 = aes_nohw_and(t37, y6); aes_word_t z2 = aes_nohw_and(t33, x7); aes_word_t z3 = aes_nohw_and(t43, y16); aes_word_t z4 = aes_nohw_and(t40, y1); aes_word_t z5 = aes_nohw_and(t29, y7); aes_word_t z6 = aes_nohw_and(t42, y11); aes_word_t z7 = aes_nohw_and(t45, y17); aes_word_t z8 = aes_nohw_and(t41, y10); aes_word_t z9 = aes_nohw_and(t44, y12); aes_word_t z10 = aes_nohw_and(t37, y3); aes_word_t z11 = aes_nohw_and(t33, y4); aes_word_t z12 = aes_nohw_and(t43, y13); aes_word_t z13 = aes_nohw_and(t40, y5); aes_word_t z14 = aes_nohw_and(t29, y2); aes_word_t z15 = aes_nohw_and(t42, y9); aes_word_t z16 = aes_nohw_and(t45, y14); aes_word_t z17 = aes_nohw_and(t41, y8); // Figure 4, bottom linear transformation. 
aes_word_t t46 = aes_nohw_xor(z15, z16); aes_word_t t47 = aes_nohw_xor(z10, z11); aes_word_t t48 = aes_nohw_xor(z5, z13); aes_word_t t49 = aes_nohw_xor(z9, z10); aes_word_t t50 = aes_nohw_xor(z2, z12); aes_word_t t51 = aes_nohw_xor(z2, z5); aes_word_t t52 = aes_nohw_xor(z7, z8); aes_word_t t53 = aes_nohw_xor(z0, z3); aes_word_t t54 = aes_nohw_xor(z6, z7); aes_word_t t55 = aes_nohw_xor(z16, z17); aes_word_t t56 = aes_nohw_xor(z12, t48); aes_word_t t57 = aes_nohw_xor(t50, t53); aes_word_t t58 = aes_nohw_xor(z4, t46); aes_word_t t59 = aes_nohw_xor(z3, t54); aes_word_t t60 = aes_nohw_xor(t46, t57); aes_word_t t61 = aes_nohw_xor(z14, t57); aes_word_t t62 = aes_nohw_xor(t52, t58); aes_word_t t63 = aes_nohw_xor(t49, t58); aes_word_t t64 = aes_nohw_xor(z4, t59); aes_word_t t65 = aes_nohw_xor(t61, t62); aes_word_t t66 = aes_nohw_xor(z1, t63); aes_word_t s0 = aes_nohw_xor(t59, t63); aes_word_t s6 = aes_nohw_xor(t56, aes_nohw_not(t62)); aes_word_t s7 = aes_nohw_xor(t48, aes_nohw_not(t60)); aes_word_t t67 = aes_nohw_xor(t64, t65); aes_word_t s3 = aes_nohw_xor(t53, t66); aes_word_t s4 = aes_nohw_xor(t51, t66); aes_word_t s5 = aes_nohw_xor(t47, t65); aes_word_t s1 = aes_nohw_xor(t64, aes_nohw_not(s3)); aes_word_t s2 = aes_nohw_xor(t55, aes_nohw_not(t67)); batch->w[0] = s7; batch->w[1] = s6; batch->w[2] = s5; batch->w[3] = s4; batch->w[4] = s3; batch->w[5] = s2; batch->w[6] = s1; batch->w[7] = s0; } // aes_nohw_rotate_cols_right returns |v| with the columns in each row rotated // to the right by |n|. This is a macro because |aes_nohw_shift_*| require // constant shift counts in the SSE2 implementation. #define aes_nohw_rotate_cols_right(/* aes_word_t */ v, /* const */ n) \ (aes_nohw_or(aes_nohw_shift_right((v), (n)*4), \ aes_nohw_shift_left((v), 16 - (n)*4))) static void aes_nohw_shift_rows(AES_NOHW_BATCH *batch) { for (size_t i = 0; i < 8; i++) { aes_word_t row0 = aes_nohw_and(batch->w[i], AES_NOHW_ROW0_MASK); aes_word_t row1 = aes_nohw_and(batch->w[i], AES_NOHW_ROW1_MASK); aes_word_t row2 = aes_nohw_and(batch->w[i], AES_NOHW_ROW2_MASK); aes_word_t row3 = aes_nohw_and(batch->w[i], AES_NOHW_ROW3_MASK); row1 = aes_nohw_rotate_cols_right(row1, 1); row2 = aes_nohw_rotate_cols_right(row2, 2); row3 = aes_nohw_rotate_cols_right(row3, 3); batch->w[i] = aes_nohw_or(aes_nohw_or(row0, row1), aes_nohw_or(row2, row3)); } } // aes_nohw_rotate_rows_down returns |v| with the rows in each column rotated // down by one. static inline aes_word_t aes_nohw_rotate_rows_down(aes_word_t v) { #if defined(OPENSSL_64_BIT) return ((v >> 4) & UINT64_C(0x0fff0fff0fff0fff)) | ((v << 12) & UINT64_C(0xf000f000f000f000)); #else return ((v >> 2) & 0x3f3f3f3f) | ((v << 6) & 0xc0c0c0c0); #endif } // aes_nohw_rotate_rows_twice returns |v| with the rows in each column rotated // by two. static inline aes_word_t aes_nohw_rotate_rows_twice(aes_word_t v) { #if defined(OPENSSL_64_BIT) return ((v >> 8) & UINT64_C(0x00ff00ff00ff00ff)) | ((v << 8) & UINT64_C(0xff00ff00ff00ff00)); #else return ((v >> 4) & 0x0f0f0f0f) | ((v << 4) & 0xf0f0f0f0); #endif } static void aes_nohw_mix_columns(AES_NOHW_BATCH *batch) { // See https://eprint.iacr.org/2009/129.pdf, section 4.4 and appendix A. 
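// (Added note.) Per column, MixColumns computes
//   b_i = 2*a_i ^ 3*a_(i+1) ^ a_(i+2) ^ a_(i+3)   (indices mod 4)
// in GF(2^8) mod x^8 + x^4 + x^3 + x + 1, with 3*a = (2*a) ^ a. In bitsliced
// form, multiplying by 2 moves each bit plane up by one, and bit 7 folds back
// into bits 0, 1, 3 and 4 (since x^8 = x^4 + x^3 + x + 1), which is why
// w[0], w[1], w[3] and w[4] below include the extra a7_r7 term.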
aes_word_t a0 = batch->w[0]; aes_word_t a1 = batch->w[1]; aes_word_t a2 = batch->w[2]; aes_word_t a3 = batch->w[3]; aes_word_t a4 = batch->w[4]; aes_word_t a5 = batch->w[5]; aes_word_t a6 = batch->w[6]; aes_word_t a7 = batch->w[7]; aes_word_t r0 = aes_nohw_rotate_rows_down(a0); aes_word_t a0_r0 = aes_nohw_xor(a0, r0); aes_word_t r1 = aes_nohw_rotate_rows_down(a1); aes_word_t a1_r1 = aes_nohw_xor(a1, r1); aes_word_t r2 = aes_nohw_rotate_rows_down(a2); aes_word_t a2_r2 = aes_nohw_xor(a2, r2); aes_word_t r3 = aes_nohw_rotate_rows_down(a3); aes_word_t a3_r3 = aes_nohw_xor(a3, r3); aes_word_t r4 = aes_nohw_rotate_rows_down(a4); aes_word_t a4_r4 = aes_nohw_xor(a4, r4); aes_word_t r5 = aes_nohw_rotate_rows_down(a5); aes_word_t a5_r5 = aes_nohw_xor(a5, r5); aes_word_t r6 = aes_nohw_rotate_rows_down(a6); aes_word_t a6_r6 = aes_nohw_xor(a6, r6); aes_word_t r7 = aes_nohw_rotate_rows_down(a7); aes_word_t a7_r7 = aes_nohw_xor(a7, r7); batch->w[0] = aes_nohw_xor(aes_nohw_xor(a7_r7, r0), aes_nohw_rotate_rows_twice(a0_r0)); batch->w[1] = aes_nohw_xor(aes_nohw_xor(a0_r0, a7_r7), aes_nohw_xor(r1, aes_nohw_rotate_rows_twice(a1_r1))); batch->w[2] = aes_nohw_xor(aes_nohw_xor(a1_r1, r2), aes_nohw_rotate_rows_twice(a2_r2)); batch->w[3] = aes_nohw_xor(aes_nohw_xor(a2_r2, a7_r7), aes_nohw_xor(r3, aes_nohw_rotate_rows_twice(a3_r3))); batch->w[4] = aes_nohw_xor(aes_nohw_xor(a3_r3, a7_r7), aes_nohw_xor(r4, aes_nohw_rotate_rows_twice(a4_r4))); batch->w[5] = aes_nohw_xor(aes_nohw_xor(a4_r4, r5), aes_nohw_rotate_rows_twice(a5_r5)); batch->w[6] = aes_nohw_xor(aes_nohw_xor(a5_r5, r6), aes_nohw_rotate_rows_twice(a6_r6)); batch->w[7] = aes_nohw_xor(aes_nohw_xor(a6_r6, r7), aes_nohw_rotate_rows_twice(a7_r7)); } static void aes_nohw_encrypt_batch(const AES_NOHW_SCHEDULE *key, size_t num_rounds, AES_NOHW_BATCH *batch) { aes_nohw_add_round_key(batch, &key->keys[0]); for (size_t i = 1; i < num_rounds; i++) { aes_nohw_sub_bytes(batch); aes_nohw_shift_rows(batch); aes_nohw_mix_columns(batch); aes_nohw_add_round_key(batch, &key->keys[i]); } aes_nohw_sub_bytes(batch); aes_nohw_shift_rows(batch); aes_nohw_add_round_key(batch, &key->keys[num_rounds]); } // Key schedule. static void aes_nohw_expand_round_keys(AES_NOHW_SCHEDULE *out, const AES_KEY *key) { for (size_t i = 0; i <= key->rounds; i++) { // Copy the round key into each block in the batch. for (size_t j = 0; j < AES_NOHW_BATCH_SIZE; j++) { aes_word_t tmp[AES_NOHW_BLOCK_WORDS]; OPENSSL_memcpy(tmp, key->rd_key + 4 * i, 16); aes_nohw_batch_set(&out->keys[i], tmp, j); } aes_nohw_transpose(&out->keys[i]); } } static const uint8_t aes_nohw_rcon[10] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36}; // aes_nohw_rcon_slice returns the |i|th group of |AES_NOHW_BATCH_SIZE| bits in // |rcon|, stored in a |aes_word_t|. 
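// (Added example.) With 64-bit words AES_NOHW_BATCH_SIZE is 4, so the slices
// are nibbles: for rcon = 0x1b, aes_nohw_rcon_slice(0x1b, 0) == 0xb,
// aes_nohw_rcon_slice(0x1b, 1) == 0x1, and all higher slices are zero.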
static inline aes_word_t aes_nohw_rcon_slice(uint8_t rcon, size_t i) { rcon = (rcon >> (i * AES_NOHW_BATCH_SIZE)) & ((1 << AES_NOHW_BATCH_SIZE) - 1); return ((aes_word_t)rcon); } static void aes_nohw_sub_block(aes_word_t out[AES_NOHW_BLOCK_WORDS], const aes_word_t in[AES_NOHW_BLOCK_WORDS]) { AES_NOHW_BATCH batch; OPENSSL_memset(&batch, 0, sizeof(batch)); aes_nohw_batch_set(&batch, in, 0); aes_nohw_transpose(&batch); aes_nohw_sub_bytes(&batch); aes_nohw_transpose(&batch); aes_nohw_batch_get(&batch, out, 0); } static void aes_nohw_setup_key_128(AES_KEY *key, const uint8_t in[16]) { key->rounds = 10; aes_word_t block[AES_NOHW_BLOCK_WORDS]; aes_nohw_compact_block(block, in); OPENSSL_memcpy(key->rd_key, block, 16); for (size_t i = 1; i <= 10; i++) { aes_word_t sub[AES_NOHW_BLOCK_WORDS]; aes_nohw_sub_block(sub, block); uint8_t rcon = aes_nohw_rcon[i - 1]; for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { // Incorporate |rcon| and the transformed word into the first word. block[j] = aes_nohw_xor(block[j], aes_nohw_rcon_slice(rcon, j)); block[j] = aes_nohw_xor( block[j], aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12)); // Propagate to the remaining words. Note this is reordered from the usual // formulation to avoid needing masks. aes_word_t v = block[j]; block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 4)); block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 8)); block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 12)); } OPENSSL_memcpy(key->rd_key + 4 * i, block, 16); } } static void aes_nohw_setup_key_256(AES_KEY *key, const uint8_t in[32]) { key->rounds = 14; // Each key schedule iteration produces two round keys. aes_word_t block1[AES_NOHW_BLOCK_WORDS], block2[AES_NOHW_BLOCK_WORDS]; aes_nohw_compact_block(block1, in); OPENSSL_memcpy(key->rd_key, block1, 16); aes_nohw_compact_block(block2, in + 16); OPENSSL_memcpy(key->rd_key + 4, block2, 16); for (size_t i = 2; i <= 14; i += 2) { aes_word_t sub[AES_NOHW_BLOCK_WORDS]; aes_nohw_sub_block(sub, block2); uint8_t rcon = aes_nohw_rcon[i / 2 - 1]; for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { // Incorporate |rcon| and the transformed word into the first word. block1[j] = aes_nohw_xor(block1[j], aes_nohw_rcon_slice(rcon, j)); block1[j] = aes_nohw_xor( block1[j], aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12)); // Propagate to the remaining words. aes_word_t v = block1[j]; block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 4)); block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 8)); block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 12)); } OPENSSL_memcpy(key->rd_key + 4 * i, block1, 16); if (i == 14) { break; } aes_nohw_sub_block(sub, block1); for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { // Incorporate the transformed word into the first word. block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_right(sub[j], 12)); // Propagate to the remaining words. aes_word_t v = block2[j]; block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 4)); block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 8)); block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 12)); } OPENSSL_memcpy(key->rd_key + 4 * (i + 1), block2, 16); } } // External API. 
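// (Added usage sketch; not part of the upstream file. It assumes the
// aes_nohw_* declarations from the header included at the top are in scope.)
// Encrypting two blocks in CTR mode with a 128-bit key looks like:
//
//   AES_KEY key;
//   uint8_t key_bytes[16] = {0};   // 128-bit key
//   uint8_t iv[16] = {0};          // counter block; last 4 bytes are the
//                                  // big-endian 32-bit counter
//   uint8_t in[32] = {0}, out[32];
//   if (aes_nohw_set_encrypt_key(key_bytes, 128, &key) == 0) {
//     aes_nohw_ctr32_encrypt_blocks(in, out, 2, &key, iv);
//   }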
int aes_nohw_set_encrypt_key(const uint8_t *key, unsigned bits, AES_KEY *aeskey) { switch (bits) { case 128: aes_nohw_setup_key_128(aeskey, key); return 0; case 256: aes_nohw_setup_key_256(aeskey, key); return 0; } return 1; } void aes_nohw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) { AES_NOHW_SCHEDULE sched; aes_nohw_expand_round_keys(&sched, key); AES_NOHW_BATCH batch; aes_nohw_to_batch(&batch, in, /*num_blocks=*/1); aes_nohw_encrypt_batch(&sched, key->rounds, &batch); aes_nohw_from_batch(out, /*num_blocks=*/1, &batch); } static inline void aes_nohw_xor_block(uint8_t out[16], const uint8_t a[16], const uint8_t b[16]) { for (size_t i = 0; i < 16; i += sizeof(aes_word_t)) { aes_word_t x, y; OPENSSL_memcpy(&x, a + i, sizeof(aes_word_t)); OPENSSL_memcpy(&y, b + i, sizeof(aes_word_t)); x = aes_nohw_xor(x, y); OPENSSL_memcpy(out + i, &x, sizeof(aes_word_t)); } } void aes_nohw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t blocks, const AES_KEY *key, const uint8_t ivec[16]) { if (blocks == 0) { return; } AES_NOHW_SCHEDULE sched; aes_nohw_expand_round_keys(&sched, key); // Make |AES_NOHW_BATCH_SIZE| copies of |ivec|. alignas(AES_NOHW_WORD_SIZE) uint8_t ivs[AES_NOHW_BATCH_SIZE * 16]; alignas(AES_NOHW_WORD_SIZE) uint8_t enc_ivs[AES_NOHW_BATCH_SIZE * 16]; for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) { OPENSSL_memcpy(ivs + 16 * i, ivec, 16); } uint32_t ctr = CRYPTO_load_u32_be(ivs + 12); for (;;) { // Update counters. for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) { CRYPTO_store_u32_be(ivs + 16 * i + 12, ctr + (uint32_t)i); } size_t todo = blocks >= AES_NOHW_BATCH_SIZE ? AES_NOHW_BATCH_SIZE : blocks; AES_NOHW_BATCH batch; aes_nohw_to_batch(&batch, ivs, todo); aes_nohw_encrypt_batch(&sched, key->rounds, &batch); aes_nohw_from_batch(enc_ivs, todo, &batch); for (size_t i = 0; i < todo; i++) { aes_nohw_xor_block(out + 16 * i, in + 16 * i, enc_ivs + 16 * i); } blocks -= todo; if (blocks == 0) { break; } in += 16 * AES_NOHW_BATCH_SIZE; out += 16 * AES_NOHW_BATCH_SIZE; ctr += AES_NOHW_BATCH_SIZE; } } ring-0.17.14/crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl000064400000000000000000001224331046102023000214440ustar 00000000000000#!/usr/bin/env perl # Copyright 2024 The BoringSSL Authors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # #------------------------------------------------------------------------------ # # VAES and VPCLMULQDQ optimized AES-GCM for x86_64 (AVX2 version) # # This is similar to aes-gcm-avx10-x86_64.pl, but it uses AVX2 instead of AVX512 # / AVX10. This means it can only use 16 vector registers instead of 32, the # maximum vector length is 32 bytes, and some instructions such as vpternlogd # and masked loads/stores are unavailable. However, it is able to run on CPUs # that have VAES without AVX512 / AVX10, namely AMD Zen 3 (including "Milan" # server processors) and some Intel client CPUs such as Alder Lake. # # This implementation also uses Karatsuba multiplication instead of schoolbook # multiplication for GHASH in its main loop. 
This does not help much on Intel, # but it improves performance by ~5% on AMD Zen 3 which is the main target for # this implementation. Other factors weighing slightly in favor of Karatsuba # multiplication in this implementation are the lower maximum vector length # (which means there is space left in the Htable array to cache the halves of # the key powers XOR'd together) and the unavailability of the vpternlogd # instruction (which helped schoolbook a bit more than Karatsuba). use strict; my $flavour = shift; my $output = shift; if ( $flavour =~ /\./ ) { $output = $flavour; undef $flavour; } my $win64; my @argregs; if ( $flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/ ) { $win64 = 1; @argregs = ( "%rcx", "%rdx", "%r8", "%r9" ); } else { $win64 = 0; @argregs = ( "%rdi", "%rsi", "%rdx", "%rcx", "%r8", "%r9" ); } $0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir = $1; my $xlate; ( $xlate = "${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate = "${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate ) or die "can't locate x86_64-xlate.pl"; open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT = *OUT; my $g_cur_func_name; my $g_cur_func_uses_seh; my @g_cur_func_saved_gpregs; my @g_cur_func_saved_xmmregs; sub _begin_func { my ( $funcname, $uses_seh ) = @_; $g_cur_func_name = $funcname; $g_cur_func_uses_seh = $uses_seh; @g_cur_func_saved_gpregs = (); @g_cur_func_saved_xmmregs = (); return <<___; .globl $funcname .type $funcname,\@abi-omnipotent .align 32 $funcname: .cfi_startproc @{[ $uses_seh ? ".seh_startproc" : "" ]} _CET_ENDBR ___ } # Push a list of general purpose registers onto the stack. sub _save_gpregs { my @gpregs = @_; my $code = ""; die "_save_gpregs requires uses_seh" unless $g_cur_func_uses_seh; die "_save_gpregs can only be called once per function" if @g_cur_func_saved_gpregs; die "Order must be _save_gpregs, then _save_xmmregs" if @g_cur_func_saved_xmmregs; @g_cur_func_saved_gpregs = @gpregs; for my $reg (@gpregs) { $code .= "push $reg\n"; if ($win64) { $code .= ".seh_pushreg $reg\n"; } else { $code .= ".cfi_push $reg\n"; } } return $code; } # Push a list of xmm registers onto the stack if the target is Windows. sub _save_xmmregs { my @xmmregs = @_; my $num_xmmregs = scalar @xmmregs; my $code = ""; die "_save_xmmregs requires uses_seh" unless $g_cur_func_uses_seh; die "_save_xmmregs can only be called once per function" if @g_cur_func_saved_xmmregs; if ( $win64 and $num_xmmregs > 0 ) { @g_cur_func_saved_xmmregs = @xmmregs; my $is_misaligned = ( scalar @g_cur_func_saved_gpregs ) % 2 == 0; my $alloc_size = 16 * $num_xmmregs + ( $is_misaligned ? 8 : 0 ); $code .= "sub \$$alloc_size, %rsp\n"; $code .= ".seh_stackalloc $alloc_size\n"; for my $i ( 0 .. $num_xmmregs - 1 ) { my $reg_num = $xmmregs[$i]; my $pos = 16 * $i; $code .= "movdqa %xmm$reg_num, $pos(%rsp)\n"; $code .= ".seh_savexmm %xmm$reg_num, $pos\n"; } } return $code; } sub _end_func { my $code = ""; # Restore any xmm registers that were saved earlier. my $num_xmmregs = scalar @g_cur_func_saved_xmmregs; if ( $win64 and $num_xmmregs > 0 ) { my $need_alignment = ( scalar @g_cur_func_saved_gpregs ) % 2 == 0; my $alloc_size = 16 * $num_xmmregs + ( $need_alignment ? 8 : 0 ); for my $i ( 0 .. $num_xmmregs - 1 ) { my $reg_num = $g_cur_func_saved_xmmregs[$i]; my $pos = 16 * $i; $code .= "movdqa $pos(%rsp), %xmm$reg_num\n"; } $code .= "add \$$alloc_size, %rsp\n"; } # Restore any general purpose registers that were saved earlier. 
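# (Added illustration; not in the upstream script.) Concretely, for
# gcm_init_vpclmulqdq_avx2 below, which saves only %xmm6 and no GP registers,
# the Windows epilogue emitted by this sub is "movdqa 0(%rsp), %xmm6" then
# "add $24, %rsp" (16 bytes for the register plus 8 alignment bytes); on
# non-Windows nothing was saved, so only the final "ret" remains.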
for my $reg ( reverse @g_cur_func_saved_gpregs ) { $code .= "pop $reg\n"; if ( !$win64 ) { $code .= ".cfi_pop $reg\n"; } } $code .= <<___; ret @{[ $g_cur_func_uses_seh ? ".seh_endproc" : "" ]} .cfi_endproc .size $g_cur_func_name, . - $g_cur_func_name ___ return $code; } my $code = <<___; .section .rodata .align 16 # A shuffle mask that reflects the bytes of 16-byte blocks .Lbswap_mask: .quad 0x08090a0b0c0d0e0f, 0x0001020304050607 # This is the GHASH reducing polynomial without its constant term, i.e. # x^128 + x^7 + x^2 + x, represented using the backwards mapping # between bits and polynomial coefficients. # # Alternatively, it can be interpreted as the naturally-ordered # representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the # "reversed" GHASH reducing polynomial without its x^128 term. .Lgfpoly: .quad 1, 0xc200000000000000 # Same as above, but with the (1 << 64) bit set. .Lgfpoly_and_internal_carrybit: .quad 1, 0xc200000000000001 .align 32 # The below constants are used for incrementing the counter blocks. .Lctr_pattern: .quad 0, 0 .quad 1, 0 .Linc_2blocks: .quad 2, 0 .quad 2, 0 .text ___ # We use Htable[0..7] to store H^8 through H^1, and Htable[8..11] to store the # 64-bit halves of the key powers XOR'd together (for Karatsuba multiplication) # in the order 8,6,7,5,4,2,3,1. We do not use Htable[12..15]. my $NUM_H_POWERS = 8; my $OFFSETOFEND_H_POWERS = $NUM_H_POWERS * 16; my $OFFSETOF_H_POWERS_XORED = $OFFSETOFEND_H_POWERS; # Offset to 'rounds' in AES_KEY struct my $OFFSETOF_AES_ROUNDS = 240; # GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store # the reduced products in \dst. Uses schoolbook multiplication. sub _ghash_mul { my ( $a, $b, $dst, $gfpoly, $t0, $t1, $t2 ) = @_; return <<___; vpclmulqdq \$0x00, $a, $b, $t0 # LO = a_L * b_L vpclmulqdq \$0x01, $a, $b, $t1 # MI_0 = a_L * b_H vpclmulqdq \$0x10, $a, $b, $t2 # MI_1 = a_H * b_L vpxor $t2, $t1, $t1 # MI = MI_0 + MI_1 vpclmulqdq \$0x01, $t0, $gfpoly, $t2 # LO_L*(x^63 + x^62 + x^57) vpshufd \$0x4e, $t0, $t0 # Swap halves of LO vpxor $t0, $t1, $t1 # Fold LO into MI (part 1) vpxor $t2, $t1, $t1 # Fold LO into MI (part 2) vpclmulqdq \$0x11, $a, $b, $dst # HI = a_H * b_H vpclmulqdq \$0x01, $t1, $gfpoly, $t0 # MI_L*(x^63 + x^62 + x^57) vpshufd \$0x4e, $t1, $t1 # Swap halves of MI vpxor $t1, $dst, $dst # Fold MI into HI (part 1) vpxor $t0, $dst, $dst # Fold MI into HI (part 2) ___ } # void gcm_init_vpclmulqdq_avx2(u128 Htable[16], const uint64_t H[2]); # # Initialize |Htable| with powers of the GHASH subkey |H|. # # We use Htable[0..7] to store H^8 through H^1, and Htable[8..11] to store the # 64-bit halves of the key powers XOR'd together (for Karatsuba multiplication) # in the order 8,6,7,5,4,2,3,1. We do not use Htable[12..15]. $code .= _begin_func "gcm_init_vpclmulqdq_avx2", 1; { my ( $HTABLE, $H_PTR ) = @argregs[ 0 .. 1 ]; my ( $TMP0, $TMP0_XMM ) = ( "%ymm0", "%xmm0" ); my ( $TMP1, $TMP1_XMM ) = ( "%ymm1", "%xmm1" ); my ( $TMP2, $TMP2_XMM ) = ( "%ymm2", "%xmm2" ); my ( $H_CUR, $H_CUR_XMM ) = ( "%ymm3", "%xmm3" ); my ( $H_CUR2, $H_CUR2_XMM ) = ( "%ymm4", "%xmm4" ); my ( $H_INC, $H_INC_XMM ) = ( "%ymm5", "%xmm5" ); my ( $GFPOLY, $GFPOLY_XMM ) = ( "%ymm6", "%xmm6" ); $code .= <<___; @{[ _save_xmmregs (6) ]} .seh_endprologue # Load the byte-reflected hash subkey. BoringSSL provides it in # byte-reflected form except the two halves are in the wrong order. 
vpshufd \$0x4e, ($H_PTR), $H_CUR_XMM # Finish preprocessing the byte-reflected hash subkey by multiplying it by # x^-1 ("standard" interpretation of polynomial coefficients) or # equivalently x^1 (natural interpretation). This gets the key into a # format that avoids having to bit-reflect the data blocks later. vpshufd \$0xd3, $H_CUR_XMM, $TMP0_XMM vpsrad \$31, $TMP0_XMM, $TMP0_XMM vpaddq $H_CUR_XMM, $H_CUR_XMM, $H_CUR_XMM vpand .Lgfpoly_and_internal_carrybit(%rip), $TMP0_XMM, $TMP0_XMM vpxor $TMP0_XMM, $H_CUR_XMM, $H_CUR_XMM vbroadcasti128 .Lgfpoly(%rip), $GFPOLY # Square H^1 to get H^2. @{[ _ghash_mul $H_CUR_XMM, $H_CUR_XMM, $H_INC_XMM, $GFPOLY_XMM, $TMP0_XMM, $TMP1_XMM, $TMP2_XMM ]} # Create H_CUR = [H^2, H^1] and H_INC = [H^2, H^2]. vinserti128 \$1, $H_CUR_XMM, $H_INC, $H_CUR vinserti128 \$1, $H_INC_XMM, $H_INC, $H_INC # Compute H_CUR2 = [H^4, H^3]. @{[ _ghash_mul $H_INC, $H_CUR, $H_CUR2, $GFPOLY, $TMP0, $TMP1, $TMP2 ]} # Store [H^2, H^1] and [H^4, H^3]. vmovdqu $H_CUR, 3*32($HTABLE) vmovdqu $H_CUR2, 2*32($HTABLE) # For Karatsuba multiplication: compute and store the two 64-bit halves of # each key power XOR'd together. Order is 4,2,3,1. vpunpcklqdq $H_CUR, $H_CUR2, $TMP0 vpunpckhqdq $H_CUR, $H_CUR2, $TMP1 vpxor $TMP1, $TMP0, $TMP0 vmovdqu $TMP0, $OFFSETOF_H_POWERS_XORED+32($HTABLE) # Compute and store H_CUR = [H^6, H^5] and H_CUR2 = [H^8, H^7]. @{[ _ghash_mul $H_INC, $H_CUR2, $H_CUR, $GFPOLY, $TMP0, $TMP1, $TMP2 ]} @{[ _ghash_mul $H_INC, $H_CUR, $H_CUR2, $GFPOLY, $TMP0, $TMP1, $TMP2 ]} vmovdqu $H_CUR, 1*32($HTABLE) vmovdqu $H_CUR2, 0*32($HTABLE) # Again, compute and store the two 64-bit halves of each key power XOR'd # together. Order is 8,6,7,5. vpunpcklqdq $H_CUR, $H_CUR2, $TMP0 vpunpckhqdq $H_CUR, $H_CUR2, $TMP1 vpxor $TMP1, $TMP0, $TMP0 vmovdqu $TMP0, $OFFSETOF_H_POWERS_XORED($HTABLE) vzeroupper ___ } $code .= _end_func; # Do one step of the GHASH update of four vectors of data blocks. # $i: the step to do, 0 through 9 # $ghashdata_ptr: pointer to the data blocks (ciphertext or AAD) # $htable: pointer to the Htable for the key # $bswap_mask: mask for reflecting the bytes of blocks # $h_pow[2-1]_xored: XOR'd key powers cached from Htable # $tmp[0-2]: temporary registers. $tmp[1-2] must be preserved across steps. 
# $lo, $mi: working state for this macro that must be preserved across steps # $ghash_acc: the GHASH accumulator (input/output) sub _ghash_step_4x { my ( $i, $ghashdata_ptr, $htable, $bswap_mask, $h_pow2_xored, $h_pow1_xored, $tmp0, $tmp0_xmm, $tmp1, $tmp2, $lo, $mi, $ghash_acc, $ghash_acc_xmm ) = @_; my ( $hi, $hi_xmm ) = ( $ghash_acc, $ghash_acc_xmm ); # alias if ( $i == 0 ) { return <<___; # First vector vmovdqu 0*32($ghashdata_ptr), $tmp1 vpshufb $bswap_mask, $tmp1, $tmp1 vmovdqu 0*32($htable), $tmp2 vpxor $ghash_acc, $tmp1, $tmp1 vpclmulqdq \$0x00, $tmp2, $tmp1, $lo vpclmulqdq \$0x11, $tmp2, $tmp1, $hi vpunpckhqdq $tmp1, $tmp1, $tmp0 vpxor $tmp1, $tmp0, $tmp0 vpclmulqdq \$0x00, $h_pow2_xored, $tmp0, $mi ___ } elsif ( $i == 1 ) { return <<___; ___ } elsif ( $i == 2 ) { return <<___; # Second vector vmovdqu 1*32($ghashdata_ptr), $tmp1 vpshufb $bswap_mask, $tmp1, $tmp1 vmovdqu 1*32($htable), $tmp2 vpclmulqdq \$0x00, $tmp2, $tmp1, $tmp0 vpxor $tmp0, $lo, $lo vpclmulqdq \$0x11, $tmp2, $tmp1, $tmp0 vpxor $tmp0, $hi, $hi vpunpckhqdq $tmp1, $tmp1, $tmp0 vpxor $tmp1, $tmp0, $tmp0 vpclmulqdq \$0x10, $h_pow2_xored, $tmp0, $tmp0 vpxor $tmp0, $mi, $mi ___ } elsif ( $i == 3 ) { return <<___; # Third vector vmovdqu 2*32($ghashdata_ptr), $tmp1 vpshufb $bswap_mask, $tmp1, $tmp1 vmovdqu 2*32($htable), $tmp2 ___ } elsif ( $i == 4 ) { return <<___; vpclmulqdq \$0x00, $tmp2, $tmp1, $tmp0 vpxor $tmp0, $lo, $lo vpclmulqdq \$0x11, $tmp2, $tmp1, $tmp0 vpxor $tmp0, $hi, $hi ___ } elsif ( $i == 5 ) { return <<___; vpunpckhqdq $tmp1, $tmp1, $tmp0 vpxor $tmp1, $tmp0, $tmp0 vpclmulqdq \$0x00, $h_pow1_xored, $tmp0, $tmp0 vpxor $tmp0, $mi, $mi # Fourth vector vmovdqu 3*32($ghashdata_ptr), $tmp1 vpshufb $bswap_mask, $tmp1, $tmp1 ___ } elsif ( $i == 6 ) { return <<___; vmovdqu 3*32($htable), $tmp2 vpclmulqdq \$0x00, $tmp2, $tmp1, $tmp0 vpxor $tmp0, $lo, $lo vpclmulqdq \$0x11, $tmp2, $tmp1, $tmp0 vpxor $tmp0, $hi, $hi vpunpckhqdq $tmp1, $tmp1, $tmp0 vpxor $tmp1, $tmp0, $tmp0 vpclmulqdq \$0x10, $h_pow1_xored, $tmp0, $tmp0 vpxor $tmp0, $mi, $mi ___ } elsif ( $i == 7 ) { return <<___; # Finalize 'mi' following Karatsuba multiplication. vpxor $lo, $mi, $mi vpxor $hi, $mi, $mi # Fold lo into mi. vbroadcasti128 .Lgfpoly(%rip), $tmp2 vpclmulqdq \$0x01, $lo, $tmp2, $tmp0 vpshufd \$0x4e, $lo, $lo vpxor $lo, $mi, $mi vpxor $tmp0, $mi, $mi ___ } elsif ( $i == 8 ) { return <<___; # Fold mi into hi. vpclmulqdq \$0x01, $mi, $tmp2, $tmp0 vpshufd \$0x4e, $mi, $mi vpxor $mi, $hi, $hi vpxor $tmp0, $hi, $hi ___ } elsif ( $i == 9 ) { return <<___; vextracti128 \$1, $hi, $tmp0_xmm vpxor $tmp0_xmm, $hi_xmm, $ghash_acc_xmm ___ } } sub _ghash_4x { my $code = ""; for my $i ( 0 .. 9 ) { $code .= _ghash_step_4x $i, @_; } return $code; } # void gcm_ghash_vpclmulqdq_avx2(uint8_t Xi[16], const u128 Htable[16], # const uint8_t *in, size_t len); # # Using the key |Htable|, update the GHASH accumulator |Xi| with the data given # by |in| and |len|. |len| must be exactly 16. $code .= _begin_func "gcm_ghash_vpclmulqdq_avx2_1", 1; { # Function arguments my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AADLEN ) = @argregs[ 0 .. 
3 ]; # Additional local variables my ( $TMP0, $TMP0_XMM ) = ( "%ymm0", "%xmm0" ); my ( $TMP1, $TMP1_XMM ) = ( "%ymm1", "%xmm1" ); my ( $TMP2, $TMP2_XMM ) = ( "%ymm2", "%xmm2" ); my ( $LO, $LO_XMM ) = ( "%ymm3", "%xmm3" ); my ( $MI, $MI_XMM ) = ( "%ymm4", "%xmm4" ); my ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( "%ymm5", "%xmm5" ); my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( "%ymm6", "%xmm6" ); my ( $GFPOLY, $GFPOLY_XMM ) = ( "%ymm7", "%xmm7" ); my $H_POW2_XORED = "%ymm8"; my $H_POW1_XORED = "%ymm9"; $code .= <<___; @{[ _save_xmmregs (6 .. 9) ]} .seh_endprologue # Load the bswap_mask and gfpoly constants. Since AADLEN is usually small, # usually only 128-bit vectors will be used. So as an optimization, don't # broadcast these constants to both 128-bit lanes quite yet. vmovdqu .Lbswap_mask(%rip), $BSWAP_MASK_XMM vmovdqu .Lgfpoly(%rip), $GFPOLY_XMM # Load the GHASH accumulator. vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC_XMM vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM # Update GHASH with the remaining 16-byte block if any. .Lghash_lastblock: vmovdqu ($AAD), $TMP0_XMM vpshufb $BSWAP_MASK_XMM, $TMP0_XMM, $TMP0_XMM vpxor $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM vmovdqu $OFFSETOFEND_H_POWERS-16($HTABLE), $TMP0_XMM @{[ _ghash_mul $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GFPOLY_XMM, $TMP1_XMM, $TMP2_XMM, $LO_XMM ]} .Lghash_done: # Store the updated GHASH accumulator back to memory. vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM vmovdqu $GHASH_ACC_XMM, ($GHASH_ACC_PTR) vzeroupper ___ } $code .= _end_func; sub _vaesenc_4x { my ( $round_key, $aesdata0, $aesdata1, $aesdata2, $aesdata3 ) = @_; return <<___; vaesenc $round_key, $aesdata0, $aesdata0 vaesenc $round_key, $aesdata1, $aesdata1 vaesenc $round_key, $aesdata2, $aesdata2 vaesenc $round_key, $aesdata3, $aesdata3 ___ } sub _ctr_begin_4x { my ( $le_ctr, $bswap_mask, $rndkey0, $aesdata0, $aesdata1, $aesdata2, $aesdata3, $tmp ) = @_; return <<___; # Increment le_ctr four times to generate four vectors of little-endian # counter blocks, swap each to big-endian, and store them in aesdata[0-3]. vmovdqu .Linc_2blocks(%rip), $tmp vpshufb $bswap_mask, $le_ctr, $aesdata0 vpaddd $tmp, $le_ctr, $le_ctr vpshufb $bswap_mask, $le_ctr, $aesdata1 vpaddd $tmp, $le_ctr, $le_ctr vpshufb $bswap_mask, $le_ctr, $aesdata2 vpaddd $tmp, $le_ctr, $le_ctr vpshufb $bswap_mask, $le_ctr, $aesdata3 vpaddd $tmp, $le_ctr, $le_ctr # AES "round zero": XOR in the zero-th round key. vpxor $rndkey0, $aesdata0, $aesdata0 vpxor $rndkey0, $aesdata1, $aesdata1 vpxor $rndkey0, $aesdata2, $aesdata2 vpxor $rndkey0, $aesdata3, $aesdata3 ___ } # Do the last AES round for four vectors of counter blocks, XOR four vectors of # source data with the resulting keystream blocks, and write the result to the # destination buffer. The implementation differs slightly as it takes advantage # of the property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a) to reduce # latency, but it has the same effect. 
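#
# That identity holds because the last AES round is just SubBytes and
# ShiftRows followed by an XOR with the last round key: XOR is associative
# and commutative, so folding the source data into the round key ahead of
# time and doing a single AESENCLAST yields the same value, while taking the
# data XOR off the dependency chain that follows the final round.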
sub _aesenclast_and_xor_4x { my ( $src, $dst, $rndkeylast, $aesdata0, $aesdata1, $aesdata2, $aesdata3, $t0, $t1, $t2, $t3 ) = @_; return <<___; vpxor 0*32($src), $rndkeylast, $t0 vpxor 1*32($src), $rndkeylast, $t1 vpxor 2*32($src), $rndkeylast, $t2 vpxor 3*32($src), $rndkeylast, $t3 vaesenclast $t0, $aesdata0, $aesdata0 vaesenclast $t1, $aesdata1, $aesdata1 vaesenclast $t2, $aesdata2, $aesdata2 vaesenclast $t3, $aesdata3, $aesdata3 vmovdqu $aesdata0, 0*32($dst) vmovdqu $aesdata1, 1*32($dst) vmovdqu $aesdata2, 2*32($dst) vmovdqu $aesdata3, 3*32($dst) ___ } my $g_update_macro_expansion_count = 0; # void aes_gcm_{enc,dec}_update_vaes_avx2(const uint8_t *in, uint8_t *out, # size_t len, const AES_KEY *key, # const uint8_t ivec[16], # const u128 Htable[16], # uint8_t Xi[16]); # # This macro generates a GCM encryption or decryption update function with the # above prototype (with \enc selecting which one). The function computes the # next portion of the CTR keystream, XOR's it with |len| bytes from |in|, and # writes the resulting encrypted or decrypted data to |out|. It also updates # the GHASH accumulator |Xi| using the next |len| ciphertext bytes. # # |len| must be a multiple of 16. The caller must do any buffering needed to # ensure this. Both in-place and out-of-place en/decryption are supported. # # |ivec| must give the current counter in big-endian format. This function # loads the counter from |ivec| and increments the loaded counter as needed, but # it does *not* store the updated counter back to |ivec|. The caller must # update |ivec| if any more data segments follow. Internally, only the low # 32-bit word of the counter is incremented, following the GCM standard. sub _aes_gcm_update { my $local_label_suffix = "__func" . ++$g_update_macro_expansion_count; my ($enc) = @_; my $code = ""; # Function arguments my ( $SRC, $DST, $DATALEN, $AESKEY, $BE_CTR_PTR, $HTABLE, $GHASH_ACC_PTR ) = $win64 ? ( @argregs[ 0 .. 3 ], "%rsi", "%rdi", "%r12" ) : ( @argregs[ 0 .. 5 ], "%r12" ); # Additional local variables. # %rax is used as a temporary register. BE_CTR_PTR is also available as a # temporary register after the counter is loaded. # AES key length in bytes my ( $AESKEYLEN, $AESKEYLEN64 ) = ( "%r10d", "%r10" ); # Pointer to the last AES round key for the chosen AES variant my $RNDKEYLAST_PTR = "%r11"; # BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values # using vpshufb, copied to all 128-bit lanes. my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( "%ymm0", "%xmm0" ); # GHASH_ACC is the accumulator variable for GHASH. When fully reduced, # only the lowest 128-bit lane can be nonzero. When not fully reduced, # more than one lane may be used, and they need to be XOR'd together. my ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( "%ymm1", "%xmm1" ); # TMP[0-2] are temporary registers. my ( $TMP0, $TMP0_XMM ) = ( "%ymm2", "%xmm2" ); my ( $TMP1, $TMP1_XMM ) = ( "%ymm3", "%xmm3" ); my ( $TMP2, $TMP2_XMM ) = ( "%ymm4", "%xmm4" ); # LO and MI are used to accumulate unreduced GHASH products. my ( $LO, $LO_XMM ) = ( "%ymm5", "%xmm5" ); my ( $MI, $MI_XMM ) = ( "%ymm6", "%xmm6" ); # Cached key powers from Htable my ( $H_POW2_XORED, $H_POW2_XORED_XMM ) = ( "%ymm7", "%xmm7" ); my ( $H_POW1_XORED, $H_POW1_XORED_XMM ) = ( "%ymm8", "%xmm8" ); # RNDKEY0 caches the zero-th round key, and RNDKEYLAST the last one. my $RNDKEY0 = "%ymm9"; my $RNDKEYLAST = "%ymm10"; # LE_CTR contains the next set of little-endian counter blocks. my $LE_CTR = "%ymm11"; # AESDATA[0-3] hold the counter blocks that are being encrypted by AES. 
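    # Each AESDATA register is a 256-bit vector holding two 128-bit counter
    # blocks, so the four of them together cover the 128 bytes that the main
    # loop below processes per iteration.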
my ( $AESDATA0, $AESDATA0_XMM ) = ( "%ymm12", "%xmm12" ); my ( $AESDATA1, $AESDATA1_XMM ) = ( "%ymm13", "%xmm13" ); my ( $AESDATA2, $AESDATA2_XMM ) = ( "%ymm14", "%xmm14" ); my ( $AESDATA3, $AESDATA3_XMM ) = ( "%ymm15", "%xmm15" ); my @AESDATA = ( $AESDATA0, $AESDATA1, $AESDATA2, $AESDATA3 ); my @ghash_4x_args = ( $enc ? $DST : $SRC, $HTABLE, $BSWAP_MASK, $H_POW2_XORED, $H_POW1_XORED, $TMP0, $TMP0_XMM, $TMP1, $TMP2, $LO, $MI, $GHASH_ACC, $GHASH_ACC_XMM ); if ($win64) { $code .= <<___; @{[ _save_gpregs $BE_CTR_PTR, $HTABLE, $GHASH_ACC_PTR ]} mov 64(%rsp), $BE_CTR_PTR # arg5 mov 72(%rsp), $HTABLE # arg6 mov 80(%rsp), $GHASH_ACC_PTR # arg7 @{[ _save_xmmregs (6 .. 15) ]} .seh_endprologue ___ } else { $code .= <<___; @{[ _save_gpregs $GHASH_ACC_PTR ]} mov 16(%rsp), $GHASH_ACC_PTR # arg7 ___ } if ($enc) { $code .= <<___; #ifdef BORINGSSL_DISPATCH_TEST .extern BORINGSSL_function_hit movb \$1,BORINGSSL_function_hit+8(%rip) #endif ___ } $code .= <<___; vbroadcasti128 .Lbswap_mask(%rip), $BSWAP_MASK # Load the GHASH accumulator and the starting counter. # BoringSSL passes these values in big endian format. vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC_XMM vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM vbroadcasti128 ($BE_CTR_PTR), $LE_CTR vpshufb $BSWAP_MASK, $LE_CTR, $LE_CTR # Load the AES key length in bytes. BoringSSL stores number of rounds # minus 1, so convert using: AESKEYLEN = 4 * aeskey->rounds - 20. movl $OFFSETOF_AES_ROUNDS($AESKEY), $AESKEYLEN lea -20(,$AESKEYLEN,4), $AESKEYLEN # Make RNDKEYLAST_PTR point to the last AES round key. This is the # round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256 # respectively. Then load the zero-th and last round keys. lea 6*16($AESKEY,$AESKEYLEN64,4), $RNDKEYLAST_PTR vbroadcasti128 ($AESKEY), $RNDKEY0 vbroadcasti128 ($RNDKEYLAST_PTR), $RNDKEYLAST # Finish initializing LE_CTR by adding 1 to the second block. vpaddd .Lctr_pattern(%rip), $LE_CTR, $LE_CTR # If there are at least 128 bytes of data, then continue into the loop that # processes 128 bytes of data at a time. Otherwise skip it. cmp \$127, $DATALEN jbe .Lcrypt_loop_4x_done$local_label_suffix vmovdqu $OFFSETOF_H_POWERS_XORED($HTABLE), $H_POW2_XORED vmovdqu $OFFSETOF_H_POWERS_XORED+32($HTABLE), $H_POW1_XORED ___ # Main loop: en/decrypt and hash 4 vectors (128 bytes) at a time. if ($enc) { $code .= <<___; # Encrypt the first 4 vectors of plaintext blocks. @{[ _ctr_begin_4x $LE_CTR, $BSWAP_MASK, $RNDKEY0, @AESDATA, $TMP0 ]} lea 16($AESKEY), %rax .Lvaesenc_loop_first_4_vecs$local_label_suffix: vbroadcasti128 (%rax), $TMP0 @{[ _vaesenc_4x $TMP0, @AESDATA ]} add \$16, %rax cmp %rax, $RNDKEYLAST_PTR jne .Lvaesenc_loop_first_4_vecs$local_label_suffix @{[ _aesenclast_and_xor_4x $SRC, $DST, $RNDKEYLAST, @AESDATA, $TMP0, $TMP1, $LO, $MI ]} sub \$-128, $SRC # 128 is 4 bytes, -128 is 1 byte add \$-128, $DATALEN cmp \$127, $DATALEN jbe .Lghash_last_ciphertext_4x$local_label_suffix ___ } $code .= <<___; .align 16 .Lcrypt_loop_4x$local_label_suffix: # Start the AES encryption of the counter blocks. 
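	# Each iteration generates four vectors of counter blocks and XORs in the
	# zero-th round key, runs the extra leading rounds that AES-192/AES-256
	# need, then interleaves the remaining nine AES rounds with the ten GHASH
	# steps for one 128-byte span of ciphertext, and finishes with the
	# combined last round + XOR of the source data.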
@{[ _ctr_begin_4x $LE_CTR, $BSWAP_MASK, $RNDKEY0, @AESDATA, $TMP0 ]} cmp \$24, $AESKEYLEN jl .Laes128$local_label_suffix je .Laes192$local_label_suffix # AES-256 vbroadcasti128 -13*16($RNDKEYLAST_PTR), $TMP0 @{[ _vaesenc_4x $TMP0, @AESDATA ]} vbroadcasti128 -12*16($RNDKEYLAST_PTR), $TMP0 @{[ _vaesenc_4x $TMP0, @AESDATA ]} .Laes192$local_label_suffix: vbroadcasti128 -11*16($RNDKEYLAST_PTR), $TMP0 @{[ _vaesenc_4x $TMP0, @AESDATA ]} vbroadcasti128 -10*16($RNDKEYLAST_PTR), $TMP0 @{[ _vaesenc_4x $TMP0, @AESDATA ]} .Laes128$local_label_suffix: ___ # Prefetch the source data 512 bytes ahead into the L1 data cache, to # improve performance when the hardware prefetcher is disabled. Assumes the # L1 data cache line size is 64 bytes (de facto standard on x86_64). $code .= "prefetcht0 512($SRC)\n"; $code .= "prefetcht0 512+64($SRC)\n"; # Finish the AES encryption of the counter blocks in AESDATA[0-3], # interleaved with the GHASH update of the ciphertext blocks. for my $i ( reverse 1 .. 9 ) { $code .= <<___; @{[ _ghash_step_4x 9-$i, @ghash_4x_args ]} vbroadcasti128 -$i*16($RNDKEYLAST_PTR), $TMP0 @{[ _vaesenc_4x $TMP0, @AESDATA ]} ___ } $code .= <<___; @{[ _ghash_step_4x 9, @ghash_4x_args ]} @{[ $enc ? "sub \$-128, $DST" : "" ]} # 128 is 4 bytes, -128 is 1 byte @{[ _aesenclast_and_xor_4x $SRC, $DST, $RNDKEYLAST, @AESDATA, $TMP0, $TMP1, $LO, $MI ]} sub \$-128, $SRC @{[ !$enc ? "sub \$-128, $DST" : "" ]} add \$-128, $DATALEN cmp \$127, $DATALEN ja .Lcrypt_loop_4x$local_label_suffix ___ if ($enc) { # Update GHASH with the last set of ciphertext blocks. $code .= <<___; .Lghash_last_ciphertext_4x$local_label_suffix: @{[ _ghash_4x @ghash_4x_args ]} sub \$-128, $DST ___ } my $POWERS_PTR = $BE_CTR_PTR; # BE_CTR_PTR is free to be reused. my ( $HI, $HI_XMM ) = ( $H_POW2_XORED, $H_POW2_XORED_XMM ); # reuse $code .= <<___; .Lcrypt_loop_4x_done$local_label_suffix: # Check whether any data remains. test $DATALEN, $DATALEN jz .Ldone$local_label_suffix # DATALEN is in [16, 32, 48, 64, 80, 96, 112]. # Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N # is the number of blocks that remain. lea $OFFSETOFEND_H_POWERS($HTABLE), $POWERS_PTR sub $DATALEN, $POWERS_PTR # Start collecting the unreduced GHASH intermediate value LO, MI, HI. vpxor $LO_XMM, $LO_XMM, $LO_XMM vpxor $MI_XMM, $MI_XMM, $MI_XMM vpxor $HI_XMM, $HI_XMM, $HI_XMM cmp \$64, $DATALEN jb .Llessthan64bytes$local_label_suffix # DATALEN is in [64, 80, 96, 112]. Encrypt two vectors of counter blocks. vpshufb $BSWAP_MASK, $LE_CTR, $AESDATA0 vpaddd .Linc_2blocks(%rip), $LE_CTR, $LE_CTR vpshufb $BSWAP_MASK, $LE_CTR, $AESDATA1 vpaddd .Linc_2blocks(%rip), $LE_CTR, $LE_CTR vpxor $RNDKEY0, $AESDATA0, $AESDATA0 vpxor $RNDKEY0, $AESDATA1, $AESDATA1 lea 16($AESKEY), %rax .Lvaesenc_loop_tail_1$local_label_suffix: vbroadcasti128 (%rax), $TMP0 vaesenc $TMP0, $AESDATA0, $AESDATA0 vaesenc $TMP0, $AESDATA1, $AESDATA1 add \$16, %rax cmp %rax, $RNDKEYLAST_PTR jne .Lvaesenc_loop_tail_1$local_label_suffix vaesenclast $RNDKEYLAST, $AESDATA0, $AESDATA0 vaesenclast $RNDKEYLAST, $AESDATA1, $AESDATA1 # XOR the data with the two vectors of keystream blocks. vmovdqu 0($SRC), $TMP0 vmovdqu 32($SRC), $TMP1 vpxor $TMP0, $AESDATA0, $AESDATA0 vpxor $TMP1, $AESDATA1, $AESDATA1 vmovdqu $AESDATA0, 0($DST) vmovdqu $AESDATA1, 32($DST) # Update GHASH with two vectors of ciphertext blocks, without reducing. vpshufb $BSWAP_MASK, @{[ $enc ? $AESDATA0 : $TMP0 ]}, $AESDATA0 vpshufb $BSWAP_MASK, @{[ $enc ? 
$AESDATA1 : $TMP1 ]}, $AESDATA1 vpxor $GHASH_ACC, $AESDATA0, $AESDATA0 vmovdqu ($POWERS_PTR), $TMP0 vmovdqu 32($POWERS_PTR), $TMP1 vpclmulqdq \$0x00, $TMP0, $AESDATA0, $LO vpclmulqdq \$0x01, $TMP0, $AESDATA0, $MI vpclmulqdq \$0x10, $TMP0, $AESDATA0, $TMP2 vpxor $TMP2, $MI, $MI vpclmulqdq \$0x11, $TMP0, $AESDATA0, $HI vpclmulqdq \$0x00, $TMP1, $AESDATA1, $TMP2 vpxor $TMP2, $LO, $LO vpclmulqdq \$0x01, $TMP1, $AESDATA1, $TMP2 vpxor $TMP2, $MI, $MI vpclmulqdq \$0x10, $TMP1, $AESDATA1, $TMP2 vpxor $TMP2, $MI, $MI vpclmulqdq \$0x11, $TMP1, $AESDATA1, $TMP2 vpxor $TMP2, $HI, $HI add \$64, $POWERS_PTR add \$64, $SRC add \$64, $DST sub \$64, $DATALEN jz .Lreduce$local_label_suffix vpxor $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM # DATALEN is in [16, 32, 48]. Encrypt two last vectors of counter blocks. .Llessthan64bytes$local_label_suffix: vpshufb $BSWAP_MASK, $LE_CTR, $AESDATA0 vpaddd .Linc_2blocks(%rip), $LE_CTR, $LE_CTR vpshufb $BSWAP_MASK, $LE_CTR, $AESDATA1 vpxor $RNDKEY0, $AESDATA0, $AESDATA0 vpxor $RNDKEY0, $AESDATA1, $AESDATA1 lea 16($AESKEY), %rax .Lvaesenc_loop_tail_2$local_label_suffix: vbroadcasti128 (%rax), $TMP0 vaesenc $TMP0, $AESDATA0, $AESDATA0 vaesenc $TMP0, $AESDATA1, $AESDATA1 add \$16, %rax cmp %rax, $RNDKEYLAST_PTR jne .Lvaesenc_loop_tail_2$local_label_suffix vaesenclast $RNDKEYLAST, $AESDATA0, $AESDATA0 vaesenclast $RNDKEYLAST, $AESDATA1, $AESDATA1 # XOR the remaining data with the keystream blocks, and update GHASH with # the remaining ciphertext blocks without reducing. cmp \$32, $DATALEN jb .Lxor_one_block$local_label_suffix je .Lxor_two_blocks$local_label_suffix .Lxor_three_blocks$local_label_suffix: vmovdqu 0($SRC), $TMP0 vmovdqu 32($SRC), $TMP1_XMM vpxor $TMP0, $AESDATA0, $AESDATA0 vpxor $TMP1_XMM, $AESDATA1_XMM, $AESDATA1_XMM vmovdqu $AESDATA0, 0($DST) vmovdqu $AESDATA1_XMM, 32($DST) vpshufb $BSWAP_MASK, @{[ $enc ? $AESDATA0 : $TMP0 ]}, $AESDATA0 vpshufb $BSWAP_MASK_XMM, @{[ $enc ? $AESDATA1_XMM : $TMP1_XMM ]}, $AESDATA1_XMM vpxor $GHASH_ACC, $AESDATA0, $AESDATA0 vmovdqu ($POWERS_PTR), $TMP0 vmovdqu 32($POWERS_PTR), $TMP1_XMM vpclmulqdq \$0x00, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM vpxor $TMP2, $LO, $LO vpclmulqdq \$0x01, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM vpxor $TMP2, $MI, $MI vpclmulqdq \$0x10, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM vpxor $TMP2, $MI, $MI vpclmulqdq \$0x11, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM vpxor $TMP2, $HI, $HI jmp .Lghash_mul_one_vec_unreduced$local_label_suffix .Lxor_two_blocks$local_label_suffix: vmovdqu ($SRC), $TMP0 vpxor $TMP0, $AESDATA0, $AESDATA0 vmovdqu $AESDATA0, ($DST) vpshufb $BSWAP_MASK, @{[ $enc ? $AESDATA0 : $TMP0 ]}, $AESDATA0 vpxor $GHASH_ACC, $AESDATA0, $AESDATA0 vmovdqu ($POWERS_PTR), $TMP0 jmp .Lghash_mul_one_vec_unreduced$local_label_suffix .Lxor_one_block$local_label_suffix: vmovdqu ($SRC), $TMP0_XMM vpxor $TMP0_XMM, $AESDATA0_XMM, $AESDATA0_XMM vmovdqu $AESDATA0_XMM, ($DST) vpshufb $BSWAP_MASK_XMM, @{[ $enc ? $AESDATA0_XMM : $TMP0_XMM ]}, $AESDATA0_XMM vpxor $GHASH_ACC_XMM, $AESDATA0_XMM, $AESDATA0_XMM vmovdqu ($POWERS_PTR), $TMP0_XMM .Lghash_mul_one_vec_unreduced$local_label_suffix: vpclmulqdq \$0x00, $TMP0, $AESDATA0, $TMP2 vpxor $TMP2, $LO, $LO vpclmulqdq \$0x01, $TMP0, $AESDATA0, $TMP2 vpxor $TMP2, $MI, $MI vpclmulqdq \$0x10, $TMP0, $AESDATA0, $TMP2 vpxor $TMP2, $MI, $MI vpclmulqdq \$0x11, $TMP0, $AESDATA0, $TMP2 vpxor $TMP2, $HI, $HI .Lreduce$local_label_suffix: # Finally, do the GHASH reduction. 
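	# The unreduced product is held in three vectors: lo (low 128-bit halves
	# of the per-block products), mi (middle terms) and hi (high halves).
	# Each of the two folding steps below multiplies by the low half of
	# gfpoly and swaps 64-bit halves, first folding lo into mi and then mi
	# into hi; the two 128-bit lanes of hi are then XOR'd together to give
	# the reduced GHASH accumulator.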
vbroadcasti128 .Lgfpoly(%rip), $TMP0 vpclmulqdq \$0x01, $LO, $TMP0, $TMP1 vpshufd \$0x4e, $LO, $LO vpxor $LO, $MI, $MI vpxor $TMP1, $MI, $MI vpclmulqdq \$0x01, $MI, $TMP0, $TMP1 vpshufd \$0x4e, $MI, $MI vpxor $MI, $HI, $HI vpxor $TMP1, $HI, $HI vextracti128 \$1, $HI, $GHASH_ACC_XMM vpxor $HI_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM .Ldone$local_label_suffix: # Store the updated GHASH accumulator back to memory. vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM vmovdqu $GHASH_ACC_XMM, ($GHASH_ACC_PTR) vzeroupper ___ return $code; } $code .= _begin_func "aes_gcm_enc_update_vaes_avx2", 1; $code .= _aes_gcm_update 1; $code .= _end_func; $code .= _begin_func "aes_gcm_dec_update_vaes_avx2", 1; $code .= _aes_gcm_update 0; $code .= _end_func; sub filter_and_print { # This function replaces AVX2 assembly instructions with their assembled forms, # to allow the code to work on old versions of binutils (older than 2.30) that do # not support these instructions. my %asmMap = ( 'vaesenc %ymm2, %ymm12, %ymm12' => '.byte 0xc4,0x62,0x1d,0xdc,0xe2', 'vaesenc %ymm2, %ymm13, %ymm13' => '.byte 0xc4,0x62,0x15,0xdc,0xea', 'vaesenc %ymm2, %ymm14, %ymm14' => '.byte 0xc4,0x62,0x0d,0xdc,0xf2', 'vaesenc %ymm2, %ymm15, %ymm15' => '.byte 0xc4,0x62,0x05,0xdc,0xfa', 'vaesenclast %ymm10, %ymm12, %ymm12' => '.byte 0xc4,0x42,0x1d,0xdd,0xe2', 'vaesenclast %ymm10, %ymm13, %ymm13' => '.byte 0xc4,0x42,0x15,0xdd,0xea', 'vaesenclast %ymm2, %ymm12, %ymm12' => '.byte 0xc4,0x62,0x1d,0xdd,0xe2', 'vaesenclast %ymm3, %ymm13, %ymm13' => '.byte 0xc4,0x62,0x15,0xdd,0xeb', 'vaesenclast %ymm5, %ymm14, %ymm14' => '.byte 0xc4,0x62,0x0d,0xdd,0xf5', 'vaesenclast %ymm6, %ymm15, %ymm15' => '.byte 0xc4,0x62,0x05,0xdd,0xfe', 'vpclmulqdq $0x00, %ymm2, %ymm12, %ymm4' => '.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x00', 'vpclmulqdq $0x00, %ymm2, %ymm12, %ymm5' => '.byte 0xc4,0xe3,0x1d,0x44,0xea,0x00', 'vpclmulqdq $0x00, %ymm3, %ymm13, %ymm4' => '.byte 0xc4,0xe3,0x15,0x44,0xe3,0x00', 'vpclmulqdq $0x00, %ymm4, %ymm3, %ymm2' => '.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00', 'vpclmulqdq $0x00, %ymm4, %ymm3, %ymm5' => '.byte 0xc4,0xe3,0x65,0x44,0xec,0x00', 'vpclmulqdq $0x00, %ymm5, %ymm3, %ymm0' => '.byte 0xc4,0xe3,0x65,0x44,0xc5,0x00', 'vpclmulqdq $0x00, %ymm5, %ymm4, %ymm0' => '.byte 0xc4,0xe3,0x5d,0x44,0xc5,0x00', 'vpclmulqdq $0x00, %ymm7, %ymm2, %ymm6' => '.byte 0xc4,0xe3,0x6d,0x44,0xf7,0x00', 'vpclmulqdq $0x00, %ymm8, %ymm2, %ymm2' => '.byte 0xc4,0xc3,0x6d,0x44,0xd0,0x00', 'vpclmulqdq $0x01, %ymm0, %ymm6, %ymm2' => '.byte 0xc4,0xe3,0x4d,0x44,0xd0,0x01', 'vpclmulqdq $0x01, %ymm1, %ymm6, %ymm0' => '.byte 0xc4,0xe3,0x4d,0x44,0xc1,0x01', 'vpclmulqdq $0x01, %ymm2, %ymm12, %ymm4' => '.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x01', 'vpclmulqdq $0x01, %ymm2, %ymm12, %ymm6' => '.byte 0xc4,0xe3,0x1d,0x44,0xf2,0x01', 'vpclmulqdq $0x01, %ymm3, %ymm13, %ymm4' => '.byte 0xc4,0xe3,0x15,0x44,0xe3,0x01', 'vpclmulqdq $0x01, %ymm5, %ymm2, %ymm3' => '.byte 0xc4,0xe3,0x6d,0x44,0xdd,0x01', 'vpclmulqdq $0x01, %ymm5, %ymm3, %ymm1' => '.byte 0xc4,0xe3,0x65,0x44,0xcd,0x01', 'vpclmulqdq $0x01, %ymm5, %ymm4, %ymm1' => '.byte 0xc4,0xe3,0x5d,0x44,0xcd,0x01', 'vpclmulqdq $0x01, %ymm5, %ymm4, %ymm2' => '.byte 0xc4,0xe3,0x5d,0x44,0xd5,0x01', 'vpclmulqdq $0x01, %ymm6, %ymm2, %ymm3' => '.byte 0xc4,0xe3,0x6d,0x44,0xde,0x01', 'vpclmulqdq $0x01, %ymm6, %ymm4, %ymm2' => '.byte 0xc4,0xe3,0x5d,0x44,0xd6,0x01', 'vpclmulqdq $0x10, %ymm2, %ymm12, %ymm4' => '.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x10', 'vpclmulqdq $0x10, %ymm3, %ymm13, %ymm4' => '.byte 0xc4,0xe3,0x15,0x44,0xe3,0x10', 'vpclmulqdq $0x10, %ymm5, %ymm3, %ymm2' => 
'.byte 0xc4,0xe3,0x65,0x44,0xd5,0x10', 'vpclmulqdq $0x10, %ymm5, %ymm4, %ymm2' => '.byte 0xc4,0xe3,0x5d,0x44,0xd5,0x10', 'vpclmulqdq $0x10, %ymm7, %ymm2, %ymm2' => '.byte 0xc4,0xe3,0x6d,0x44,0xd7,0x10', 'vpclmulqdq $0x10, %ymm8, %ymm2, %ymm2' => '.byte 0xc4,0xc3,0x6d,0x44,0xd0,0x10', 'vpclmulqdq $0x11, %ymm2, %ymm12, %ymm4' => '.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x11', 'vpclmulqdq $0x11, %ymm2, %ymm12, %ymm7' => '.byte 0xc4,0xe3,0x1d,0x44,0xfa,0x11', 'vpclmulqdq $0x11, %ymm3, %ymm13, %ymm4' => '.byte 0xc4,0xe3,0x15,0x44,0xe3,0x11', 'vpclmulqdq $0x11, %ymm4, %ymm3, %ymm1' => '.byte 0xc4,0xe3,0x65,0x44,0xcc,0x11', 'vpclmulqdq $0x11, %ymm4, %ymm3, %ymm2' => '.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11', 'vpclmulqdq $0x11, %ymm5, %ymm3, %ymm4' => '.byte 0xc4,0xe3,0x65,0x44,0xe5,0x11', 'vpclmulqdq $0x11, %ymm5, %ymm4, %ymm3' => '.byte 0xc4,0xe3,0x5d,0x44,0xdd,0x11', ); for my $line (split("\n",$code)) { my $trimmed; $trimmed = $line; $trimmed =~ s/^\s+//; $trimmed =~ s/\s+(#.*)?$//; if (exists $asmMap{$trimmed}) { $line = $asmMap{$trimmed}; } else { if($trimmed =~ /(vpclmulqdq|vaes).*%[yz]mm/) { die ("found instruction not supported under old binutils, please update asmMap with the results of running\n" . 'find target -name "*aes-gcm-avx2*.o" -exec python3 crypto/fipsmodule/aes/asm/make-avx-map-for-old-binutils.py \{\} \; | LC_ALL=C sort | uniq'); } } print $line,"\n"; } } filter_and_print(); close STDOUT or die "error closing STDOUT: $!"; exit 0; ring-0.17.14/crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl000064400000000000000000000764411046102023000211240ustar 00000000000000#! /usr/bin/env perl # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. # ==================================================================== # # # AES-NI-CTR+GHASH stitch. # # February 2013 # # OpenSSL GCM implementation is organized in such way that its # performance is rather close to the sum of its streamed components, # in the context parallelized AES-NI CTR and modulo-scheduled # PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation # was observed to perform significantly better than the sum of the # components on contemporary CPUs, the effort was deemed impossible to # justify. This module is based on combination of Intel submissions, # [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max # Locktyukhin of Intel Corp. who verified that it reduces shuffles # pressure with notable relative improvement, achieving 1.0 cycle per # byte processed with 128-bit key on Haswell processor, 0.74 - on # Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled # measurements for favourable packet size, one divisible by 96. # Applications using the EVP interface will observe a few percent # worse performance.] # # Knights Landing processes 1 byte in 1.25 cycles (measured with EVP). 
# # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; # |$avx| in ghash-x86_64.pl must be set to at least 1; otherwise tags will # be computed incorrectly. # # In upstream, this is controlled by shelling out to the compiler to check # versions, but BoringSSL is intended to be used with pre-generated perlasm # output, so this isn't useful anyway. # # The upstream code uses the condition |$avx>1| even though no AVX2 # instructions are used, because it assumes MOVBE is supported by the assembler # if and only if AVX2 is also supported by the assembler; see # https://marc.info/?l=openssl-dev&m=146567589526984&w=2. $avx = 2; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; # See the comment above regarding why the condition is ($avx>1) when there are # no AVX2 instructions being used. if ($avx>1) {{{ # On Windows, only four parameters are passed in registers. The last two # parameters will be manually loaded into %rdi and %rsi. my ($inp, $out, $len, $key, $ivp, $Htable) = $win64 ? ("%rcx", "%rdx", "%r8", "%r9", "%rdi", "%rsi") : ("%rdi", "%rsi", "%rdx", "%rcx", "%r8", "%r9"); # The offset from %rbp to the Xip parameter. On Windows, all parameters have # corresponding stack positions, not just ones passed on the stack. # (0x40 = 6*8 + 0x10) # # Xip only needs to be accessed at the beginning and end of the function, and # this function is short on registers, so we make it the last parameter for # convenience. my $Xip_offset = $win64 ? 0x40 : 0x10; ($Ii,$T1,$T2,$Hkey, $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8)); ($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15)); ($counter,$rounds,$const,$in0,$end0)=("%ebx","%r10d","%r11","%r14","%r15"); $code=<<___; .text .type _aesni_ctr32_ghash_6x,\@abi-omnipotent .align 32 _aesni_ctr32_ghash_6x: .cfi_startproc vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb sub \$6,$len vpxor $Z0,$Z0,$Z0 # $Z0 = 0 vmovdqu 0x00-0x80($key),$rndkey vpaddb $T2,$T1,$inout1 vpaddb $T2,$inout1,$inout2 vpaddb $T2,$inout2,$inout3 vpaddb $T2,$inout3,$inout4 vpaddb $T2,$inout4,$inout5 vpxor $rndkey,$T1,$inout0 vmovdqu $Z0,16+8(%rsp) # "$Z3" = 0 jmp .Loop6x .align 32 .Loop6x: add \$`6<<24`,$counter jc .Lhandle_ctr32 # discard $inout[1-5]? vmovdqu 0x00-0x20($Htable),$Hkey # $Hkey^1 vpaddb $T2,$inout5,$T1 # next counter value vpxor $rndkey,$inout1,$inout1 vpxor $rndkey,$inout2,$inout2 .Lresume_ctr32: vmovdqu $T1,($ivp) # save next counter value vpclmulqdq \$0x10,$Hkey,$Z3,$Z1 vpxor $rndkey,$inout3,$inout3 vmovups 0x10-0x80($key),$T2 # borrow $T2 for $rndkey vpclmulqdq \$0x01,$Hkey,$Z3,$Z2 # At this point, the current block of 96 (0x60) bytes has already been # loaded into registers. Concurrently with processing it, we want to # load the next 96 bytes of input for the next round. Obviously, we can # only do this if there are at least 96 more bytes of input beyond the # input we're currently processing, or else we'd read past the end of # the input buffer. 
Here, we set |%r12| to 96 if there are at least 96 # bytes of input beyond the 96 bytes we're already processing, and we # set |%r12| to 0 otherwise. In the case where we set |%r12| to 96, # we'll read in the next block so that it is in registers for the next # loop iteration. In the case where we set |%r12| to 0, we'll re-read # the current block and then ignore what we re-read. # # At this point, |$in0| points to the current (already read into # registers) block, and |$end0| points to 2*96 bytes before the end of # the input. Thus, |$in0| > |$end0| means that we do not have the next # 96-byte block to read in, and |$in0| <= |$end0| means we do. xor %r12,%r12 cmp $in0,$end0 vaesenc $T2,$inout0,$inout0 vmovdqu 0x30+8(%rsp),$Ii # I[4] vpxor $rndkey,$inout4,$inout4 vpclmulqdq \$0x00,$Hkey,$Z3,$T1 vaesenc $T2,$inout1,$inout1 vpxor $rndkey,$inout5,$inout5 setnc %r12b vpclmulqdq \$0x11,$Hkey,$Z3,$Z3 vaesenc $T2,$inout2,$inout2 vmovdqu 0x10-0x20($Htable),$Hkey # $Hkey^2 neg %r12 vaesenc $T2,$inout3,$inout3 vpxor $Z1,$Z2,$Z2 vpclmulqdq \$0x00,$Hkey,$Ii,$Z1 vpxor $Z0,$Xi,$Xi # modulo-scheduled vaesenc $T2,$inout4,$inout4 vpxor $Z1,$T1,$Z0 and \$0x60,%r12 vmovups 0x20-0x80($key),$rndkey vpclmulqdq \$0x10,$Hkey,$Ii,$T1 vaesenc $T2,$inout5,$inout5 vpclmulqdq \$0x01,$Hkey,$Ii,$T2 lea ($in0,%r12),$in0 vaesenc $rndkey,$inout0,$inout0 vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled [vpxor $Z3,$Xi,$Xi] vpclmulqdq \$0x11,$Hkey,$Ii,$Hkey vmovdqu 0x40+8(%rsp),$Ii # I[3] vaesenc $rndkey,$inout1,$inout1 movbe 0x58($in0),%r13 vaesenc $rndkey,$inout2,$inout2 movbe 0x50($in0),%r12 vaesenc $rndkey,$inout3,$inout3 mov %r13,0x20+8(%rsp) vaesenc $rndkey,$inout4,$inout4 mov %r12,0x28+8(%rsp) vmovdqu 0x30-0x20($Htable),$Z1 # borrow $Z1 for $Hkey^3 vaesenc $rndkey,$inout5,$inout5 vmovups 0x30-0x80($key),$rndkey vpxor $T1,$Z2,$Z2 vpclmulqdq \$0x00,$Z1,$Ii,$T1 vaesenc $rndkey,$inout0,$inout0 vpxor $T2,$Z2,$Z2 vpclmulqdq \$0x10,$Z1,$Ii,$T2 vaesenc $rndkey,$inout1,$inout1 vpxor $Hkey,$Z3,$Z3 vpclmulqdq \$0x01,$Z1,$Ii,$Hkey vaesenc $rndkey,$inout2,$inout2 vpclmulqdq \$0x11,$Z1,$Ii,$Z1 vmovdqu 0x50+8(%rsp),$Ii # I[2] vaesenc $rndkey,$inout3,$inout3 vaesenc $rndkey,$inout4,$inout4 vpxor $T1,$Z0,$Z0 vmovdqu 0x40-0x20($Htable),$T1 # borrow $T1 for $Hkey^4 vaesenc $rndkey,$inout5,$inout5 vmovups 0x40-0x80($key),$rndkey vpxor $T2,$Z2,$Z2 vpclmulqdq \$0x00,$T1,$Ii,$T2 vaesenc $rndkey,$inout0,$inout0 vpxor $Hkey,$Z2,$Z2 vpclmulqdq \$0x10,$T1,$Ii,$Hkey vaesenc $rndkey,$inout1,$inout1 movbe 0x48($in0),%r13 vpxor $Z1,$Z3,$Z3 vpclmulqdq \$0x01,$T1,$Ii,$Z1 vaesenc $rndkey,$inout2,$inout2 movbe 0x40($in0),%r12 vpclmulqdq \$0x11,$T1,$Ii,$T1 vmovdqu 0x60+8(%rsp),$Ii # I[1] vaesenc $rndkey,$inout3,$inout3 mov %r13,0x30+8(%rsp) vaesenc $rndkey,$inout4,$inout4 mov %r12,0x38+8(%rsp) vpxor $T2,$Z0,$Z0 vmovdqu 0x60-0x20($Htable),$T2 # borrow $T2 for $Hkey^5 vaesenc $rndkey,$inout5,$inout5 vmovups 0x50-0x80($key),$rndkey vpxor $Hkey,$Z2,$Z2 vpclmulqdq \$0x00,$T2,$Ii,$Hkey vaesenc $rndkey,$inout0,$inout0 vpxor $Z1,$Z2,$Z2 vpclmulqdq \$0x10,$T2,$Ii,$Z1 vaesenc $rndkey,$inout1,$inout1 movbe 0x38($in0),%r13 vpxor $T1,$Z3,$Z3 vpclmulqdq \$0x01,$T2,$Ii,$T1 vpxor 0x70+8(%rsp),$Xi,$Xi # accumulate I[0] vaesenc $rndkey,$inout2,$inout2 movbe 0x30($in0),%r12 vpclmulqdq \$0x11,$T2,$Ii,$T2 vaesenc $rndkey,$inout3,$inout3 mov %r13,0x40+8(%rsp) vaesenc $rndkey,$inout4,$inout4 mov %r12,0x48+8(%rsp) vpxor $Hkey,$Z0,$Z0 vmovdqu 0x70-0x20($Htable),$Hkey # $Hkey^6 vaesenc $rndkey,$inout5,$inout5 vmovups 0x60-0x80($key),$rndkey vpxor $Z1,$Z2,$Z2 vpclmulqdq 
\$0x10,$Hkey,$Xi,$Z1 vaesenc $rndkey,$inout0,$inout0 vpxor $T1,$Z2,$Z2 vpclmulqdq \$0x01,$Hkey,$Xi,$T1 vaesenc $rndkey,$inout1,$inout1 movbe 0x28($in0),%r13 vpxor $T2,$Z3,$Z3 vpclmulqdq \$0x00,$Hkey,$Xi,$T2 vaesenc $rndkey,$inout2,$inout2 movbe 0x20($in0),%r12 vpclmulqdq \$0x11,$Hkey,$Xi,$Xi vaesenc $rndkey,$inout3,$inout3 mov %r13,0x50+8(%rsp) vaesenc $rndkey,$inout4,$inout4 mov %r12,0x58+8(%rsp) vpxor $Z1,$Z2,$Z2 vaesenc $rndkey,$inout5,$inout5 vpxor $T1,$Z2,$Z2 vmovups 0x70-0x80($key),$rndkey vpslldq \$8,$Z2,$Z1 vpxor $T2,$Z0,$Z0 vmovdqu 0x10($const),$Hkey # .Lpoly vaesenc $rndkey,$inout0,$inout0 vpxor $Xi,$Z3,$Z3 vaesenc $rndkey,$inout1,$inout1 vpxor $Z1,$Z0,$Z0 movbe 0x18($in0),%r13 vaesenc $rndkey,$inout2,$inout2 movbe 0x10($in0),%r12 vpalignr \$8,$Z0,$Z0,$Ii # 1st phase vpclmulqdq \$0x10,$Hkey,$Z0,$Z0 mov %r13,0x60+8(%rsp) vaesenc $rndkey,$inout3,$inout3 mov %r12,0x68+8(%rsp) vaesenc $rndkey,$inout4,$inout4 vmovups 0x80-0x80($key),$T1 # borrow $T1 for $rndkey vaesenc $rndkey,$inout5,$inout5 vaesenc $T1,$inout0,$inout0 vmovups 0x90-0x80($key),$rndkey vaesenc $T1,$inout1,$inout1 vpsrldq \$8,$Z2,$Z2 vaesenc $T1,$inout2,$inout2 vpxor $Z2,$Z3,$Z3 vaesenc $T1,$inout3,$inout3 vpxor $Ii,$Z0,$Z0 movbe 0x08($in0),%r13 vaesenc $T1,$inout4,$inout4 movbe 0x00($in0),%r12 vaesenc $T1,$inout5,$inout5 vmovups 0xa0-0x80($key),$T1 cmp \$11,$rounds jb .Lenc_tail # 128-bit key vaesenc $rndkey,$inout0,$inout0 vaesenc $rndkey,$inout1,$inout1 vaesenc $rndkey,$inout2,$inout2 vaesenc $rndkey,$inout3,$inout3 vaesenc $rndkey,$inout4,$inout4 vaesenc $rndkey,$inout5,$inout5 vaesenc $T1,$inout0,$inout0 vaesenc $T1,$inout1,$inout1 vaesenc $T1,$inout2,$inout2 vaesenc $T1,$inout3,$inout3 vaesenc $T1,$inout4,$inout4 vmovups 0xb0-0x80($key),$rndkey vaesenc $T1,$inout5,$inout5 vmovups 0xc0-0x80($key),$T1 # 192-bit key support was removed. 
vaesenc $rndkey,$inout0,$inout0 vaesenc $rndkey,$inout1,$inout1 vaesenc $rndkey,$inout2,$inout2 vaesenc $rndkey,$inout3,$inout3 vaesenc $rndkey,$inout4,$inout4 vaesenc $rndkey,$inout5,$inout5 vaesenc $T1,$inout0,$inout0 vaesenc $T1,$inout1,$inout1 vaesenc $T1,$inout2,$inout2 vaesenc $T1,$inout3,$inout3 vaesenc $T1,$inout4,$inout4 vmovups 0xd0-0x80($key),$rndkey vaesenc $T1,$inout5,$inout5 vmovups 0xe0-0x80($key),$T1 jmp .Lenc_tail # 256-bit key .align 32 .Lhandle_ctr32: vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask vpshufb $Ii,$T1,$Z2 # byte-swap counter vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb vpaddd $Z1,$Z2,$inout2 vmovdqu 0x00-0x20($Htable),$Hkey # $Hkey^1 vpaddd $Z1,$inout1,$inout3 vpshufb $Ii,$inout1,$inout1 vpaddd $Z1,$inout2,$inout4 vpshufb $Ii,$inout2,$inout2 vpxor $rndkey,$inout1,$inout1 vpaddd $Z1,$inout3,$inout5 vpshufb $Ii,$inout3,$inout3 vpxor $rndkey,$inout2,$inout2 vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value vpshufb $Ii,$inout4,$inout4 vpshufb $Ii,$inout5,$inout5 vpshufb $Ii,$T1,$T1 # next counter value jmp .Lresume_ctr32 .align 32 .Lenc_tail: vaesenc $rndkey,$inout0,$inout0 vmovdqu $Z3,16+8(%rsp) # postpone vpxor $Z3,$Xi,$Xi vpalignr \$8,$Z0,$Z0,$Xi # 2nd phase vaesenc $rndkey,$inout1,$inout1 vpclmulqdq \$0x10,$Hkey,$Z0,$Z0 vpxor 0x00($inp),$T1,$T2 vaesenc $rndkey,$inout2,$inout2 vpxor 0x10($inp),$T1,$Ii vaesenc $rndkey,$inout3,$inout3 vpxor 0x20($inp),$T1,$Z1 vaesenc $rndkey,$inout4,$inout4 vpxor 0x30($inp),$T1,$Z2 vaesenc $rndkey,$inout5,$inout5 vpxor 0x40($inp),$T1,$Z3 vpxor 0x50($inp),$T1,$Hkey vmovdqu ($ivp),$T1 # load next counter value vaesenclast $T2,$inout0,$inout0 vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb vaesenclast $Ii,$inout1,$inout1 vpaddb $T2,$T1,$Ii mov %r13,0x70+8(%rsp) lea 0x60($inp),$inp # These two prefetches were added in BoringSSL. See change that added them. prefetcht0 512($inp) # We use 96-byte block so prefetch 2 lines (128 bytes) prefetcht0 576($inp) vaesenclast $Z1,$inout2,$inout2 vpaddb $T2,$Ii,$Z1 mov %r12,0x78+8(%rsp) lea 0x60($out),$out vmovdqu 0x00-0x80($key),$rndkey vaesenclast $Z2,$inout3,$inout3 vpaddb $T2,$Z1,$Z2 vaesenclast $Z3, $inout4,$inout4 vpaddb $T2,$Z2,$Z3 vaesenclast $Hkey,$inout5,$inout5 vpaddb $T2,$Z3,$Hkey add \$0x60,%rax sub \$0x6,$len jc .L6x_done vmovups $inout0,-0x60($out) # save output vpxor $rndkey,$T1,$inout0 vmovups $inout1,-0x50($out) vmovdqa $Ii,$inout1 # 0 latency vmovups $inout2,-0x40($out) vmovdqa $Z1,$inout2 # 0 latency vmovups $inout3,-0x30($out) vmovdqa $Z2,$inout3 # 0 latency vmovups $inout4,-0x20($out) vmovdqa $Z3,$inout4 # 0 latency vmovups $inout5,-0x10($out) vmovdqa $Hkey,$inout5 # 0 latency vmovdqu 0x20+8(%rsp),$Z3 # I[5] jmp .Loop6x .L6x_done: vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled vpxor $Z0,$Xi,$Xi # modulo-scheduled ret .cfi_endproc .size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x ___ ###################################################################### # # size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len, # const AES_KEY *key, unsigned char iv[16], const u128 Htbl[9], # u128 *Xip); $code.=<<___; .globl aesni_gcm_decrypt .type aesni_gcm_decrypt,\@abi-omnipotent .align 32 aesni_gcm_decrypt: .cfi_startproc .seh_startproc _CET_ENDBR xor %rax,%rax # We call |_aesni_ctr32_ghash_6x|, which requires at least 96 (0x60) # bytes of input. 
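	# (96 bytes is six 16-byte AES blocks, i.e. one full pass of that
	# 6-way interleaved routine.)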
cmp \$0x60,$len # minimal accepted length jb .Lgcm_dec_abort push %rbp .cfi_push %rbp .seh_pushreg %rbp mov %rsp, %rbp # save stack pointer .cfi_def_cfa_register %rbp push %rbx .cfi_push %rbx .seh_pushreg %rbx push %r12 .cfi_push %r12 .seh_pushreg %r12 push %r13 .cfi_push %r13 .seh_pushreg %r13 push %r14 .cfi_push %r14 .seh_pushreg %r14 push %r15 .cfi_push %r15 .seh_pushreg %r15 ___ if ($win64) { $code.=<<___ lea -0xa8(%rsp),%rsp # 8 extra bytes to align the stack .seh_stackalloc 0xa8 .seh_setframe %rbp, 0xa8+5*8 # Load the last two parameters. These go into %rdi and %rsi, which are # non-volatile on Windows, so stash them in the parameter stack area # first. mov %rdi, 0x10(%rbp) .seh_savereg %rdi, 0xa8+5*8+0x10 mov %rsi, 0x18(%rbp) .seh_savereg %rsi, 0xa8+5*8+0x18 mov 0x30(%rbp), $ivp mov 0x38(%rbp), $Htable # Save non-volatile XMM registers. movaps %xmm6,-0xd0(%rbp) .seh_savexmm %xmm6, 0xa8+5*8-0xd0 movaps %xmm7,-0xc0(%rbp) .seh_savexmm %xmm7, 0xa8+5*8-0xc0 movaps %xmm8,-0xb0(%rbp) .seh_savexmm %xmm8, 0xa8+5*8-0xb0 movaps %xmm9,-0xa0(%rbp) .seh_savexmm %xmm9, 0xa8+5*8-0xa0 movaps %xmm10,-0x90(%rbp) .seh_savexmm %xmm10, 0xa8+5*8-0x90 movaps %xmm11,-0x80(%rbp) .seh_savexmm %xmm11, 0xa8+5*8-0x80 movaps %xmm12,-0x70(%rbp) .seh_savexmm %xmm12, 0xa8+5*8-0x70 movaps %xmm13,-0x60(%rbp) .seh_savexmm %xmm13, 0xa8+5*8-0x60 movaps %xmm14,-0x50(%rbp) .seh_savexmm %xmm14, 0xa8+5*8-0x50 movaps %xmm15,-0x40(%rbp) .seh_savexmm %xmm15, 0xa8+5*8-0x40 .seh_endprologue ___ } $code.=<<___; vzeroupper mov $Xip_offset(%rbp), %r12 vmovdqu ($ivp),$T1 # input counter value add \$-128,%rsp mov 12($ivp),$counter lea .Lbswap_mask(%rip),$const lea -0x80($key),$in0 # borrow $in0 mov \$0xf80,$end0 # borrow $end0 vmovdqu (%r12),$Xi # load Xi and \$-128,%rsp # ensure stack alignment vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask lea 0x80($key),$key # size optimization lea 0x20($Htable),$Htable # size optimization mov 0xf0-0x80($key),$rounds vpshufb $Ii,$Xi,$Xi and $end0,$in0 and %rsp,$end0 sub $in0,$end0 jc .Ldec_no_key_aliasing cmp \$768,$end0 jnc .Ldec_no_key_aliasing sub $end0,%rsp # avoid aliasing with key .Ldec_no_key_aliasing: vmovdqu 0x50($inp),$Z3 # I[5] mov $inp,$in0 vmovdqu 0x40($inp),$Z0 # |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0) # bytes before the end of the input. Note, in particular, that this is # correct even if |$len| is not an even multiple of 96 or 16. XXX: This # seems to require that |$inp| + |$len| >= 2*96 (0xc0); i.e. |$inp| must # not be near the very beginning of the address space when |$len| < 2*96 # (0xc0). 
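	# (Equivalently: the "in0 <= end0" check in _aesni_ctr32_ghash_6x passes
	# exactly when at least 2*96 bytes remain from the current read position
	# to the end of the input, so both the 96 bytes already in registers and
	# the next 96-byte read stay inside the buffer.)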
lea -0xc0($inp,$len),$end0 vmovdqu 0x30($inp),$Z1 shr \$4,$len xor %rax,%rax vmovdqu 0x20($inp),$Z2 vpshufb $Ii,$Z3,$Z3 # passed to _aesni_ctr32_ghash_6x vmovdqu 0x10($inp),$T2 vpshufb $Ii,$Z0,$Z0 vmovdqu ($inp),$Hkey vpshufb $Ii,$Z1,$Z1 vmovdqu $Z0,0x30(%rsp) vpshufb $Ii,$Z2,$Z2 vmovdqu $Z1,0x40(%rsp) vpshufb $Ii,$T2,$T2 vmovdqu $Z2,0x50(%rsp) vpshufb $Ii,$Hkey,$Hkey vmovdqu $T2,0x60(%rsp) vmovdqu $Hkey,0x70(%rsp) call _aesni_ctr32_ghash_6x mov $Xip_offset(%rbp), %r12 vmovups $inout0,-0x60($out) # save output vmovups $inout1,-0x50($out) vmovups $inout2,-0x40($out) vmovups $inout3,-0x30($out) vmovups $inout4,-0x20($out) vmovups $inout5,-0x10($out) vpshufb ($const),$Xi,$Xi # .Lbswap_mask vmovdqu $Xi,(%r12) # output Xi vzeroupper ___ $code.=<<___ if ($win64); movaps -0xd0(%rbp),%xmm6 movaps -0xc0(%rbp),%xmm7 movaps -0xb0(%rbp),%xmm8 movaps -0xa0(%rbp),%xmm9 movaps -0x90(%rbp),%xmm10 movaps -0x80(%rbp),%xmm11 movaps -0x70(%rbp),%xmm12 movaps -0x60(%rbp),%xmm13 movaps -0x50(%rbp),%xmm14 movaps -0x40(%rbp),%xmm15 mov 0x10(%rbp),%rdi mov 0x18(%rbp),%rsi ___ $code.=<<___; lea -0x28(%rbp), %rsp # restore %rsp to fixed allocation .cfi_def_cfa %rsp, 0x38 pop %r15 .cfi_pop %r15 pop %r14 .cfi_pop %r14 pop %r13 .cfi_pop %r13 pop %r12 .cfi_pop %r12 pop %rbx .cfi_pop %rbx pop %rbp .cfi_pop %rbp .Lgcm_dec_abort: ret .seh_endproc .cfi_endproc .size aesni_gcm_decrypt,.-aesni_gcm_decrypt ___ $code.=<<___; .type _aesni_ctr32_6x,\@abi-omnipotent .align 32 _aesni_ctr32_6x: .cfi_startproc vmovdqu 0x00-0x80($key),$Z0 # borrow $Z0 for $rndkey vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb lea -1($rounds),%r13 vmovups 0x10-0x80($key),$rndkey lea 0x20-0x80($key),%r12 vpxor $Z0,$T1,$inout0 add \$`6<<24`,$counter jc .Lhandle_ctr32_2 vpaddb $T2,$T1,$inout1 vpaddb $T2,$inout1,$inout2 vpxor $Z0,$inout1,$inout1 vpaddb $T2,$inout2,$inout3 vpxor $Z0,$inout2,$inout2 vpaddb $T2,$inout3,$inout4 vpxor $Z0,$inout3,$inout3 vpaddb $T2,$inout4,$inout5 vpxor $Z0,$inout4,$inout4 vpaddb $T2,$inout5,$T1 vpxor $Z0,$inout5,$inout5 jmp .Loop_ctr32 .align 16 .Loop_ctr32: vaesenc $rndkey,$inout0,$inout0 vaesenc $rndkey,$inout1,$inout1 vaesenc $rndkey,$inout2,$inout2 vaesenc $rndkey,$inout3,$inout3 vaesenc $rndkey,$inout4,$inout4 vaesenc $rndkey,$inout5,$inout5 vmovups (%r12),$rndkey lea 0x10(%r12),%r12 dec %r13d jnz .Loop_ctr32 vmovdqu (%r12),$Hkey # last round key vaesenc $rndkey,$inout0,$inout0 vpxor 0x00($inp),$Hkey,$Z0 vaesenc $rndkey,$inout1,$inout1 vpxor 0x10($inp),$Hkey,$Z1 vaesenc $rndkey,$inout2,$inout2 vpxor 0x20($inp),$Hkey,$Z2 vaesenc $rndkey,$inout3,$inout3 vpxor 0x30($inp),$Hkey,$Xi vaesenc $rndkey,$inout4,$inout4 vpxor 0x40($inp),$Hkey,$T2 vaesenc $rndkey,$inout5,$inout5 vpxor 0x50($inp),$Hkey,$Hkey lea 0x60($inp),$inp vaesenclast $Z0,$inout0,$inout0 vaesenclast $Z1,$inout1,$inout1 vaesenclast $Z2,$inout2,$inout2 vaesenclast $Xi,$inout3,$inout3 vaesenclast $T2,$inout4,$inout4 vaesenclast $Hkey,$inout5,$inout5 vmovups $inout0,0x00($out) vmovups $inout1,0x10($out) vmovups $inout2,0x20($out) vmovups $inout3,0x30($out) vmovups $inout4,0x40($out) vmovups $inout5,0x50($out) lea 0x60($out),$out ret .align 32 .Lhandle_ctr32_2: vpshufb $Ii,$T1,$Z2 # byte-swap counter vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb vpaddd $Z1,$Z2,$inout2 vpaddd $Z1,$inout1,$inout3 vpshufb $Ii,$inout1,$inout1 vpaddd $Z1,$inout2,$inout4 vpshufb $Ii,$inout2,$inout2 vpxor $Z0,$inout1,$inout1 vpaddd $Z1,$inout3,$inout5 vpshufb $Ii,$inout3,$inout3 vpxor $Z0,$inout2,$inout2 vpaddd $Z1,$inout4,$T1 # byte-swapped 
next counter value vpshufb $Ii,$inout4,$inout4 vpxor $Z0,$inout3,$inout3 vpshufb $Ii,$inout5,$inout5 vpxor $Z0,$inout4,$inout4 vpshufb $Ii,$T1,$T1 # next counter value vpxor $Z0,$inout5,$inout5 jmp .Loop_ctr32 .cfi_endproc .size _aesni_ctr32_6x,.-_aesni_ctr32_6x .globl aesni_gcm_encrypt .type aesni_gcm_encrypt,\@abi-omnipotent .align 32 aesni_gcm_encrypt: .cfi_startproc .seh_startproc _CET_ENDBR #ifdef BORINGSSL_DISPATCH_TEST .extern BORINGSSL_function_hit movb \$1,BORINGSSL_function_hit+2(%rip) #endif xor %rax,%rax # We call |_aesni_ctr32_6x| twice, each call consuming 96 bytes of # input. Then we call |_aesni_ctr32_ghash_6x|, which requires at # least 96 more bytes of input. cmp \$0x60*3,$len # minimal accepted length jb .Lgcm_enc_abort push %rbp .cfi_push %rbp .seh_pushreg %rbp mov %rsp, %rbp # save stack pointer .cfi_def_cfa_register %rbp push %rbx .cfi_push %rbx .seh_pushreg %rbx push %r12 .cfi_push %r12 .seh_pushreg %r12 push %r13 .cfi_push %r13 .seh_pushreg %r13 push %r14 .cfi_push %r14 .seh_pushreg %r14 push %r15 .cfi_push %r15 .seh_pushreg %r15 ___ if ($win64) { $code.=<<___ lea -0xa8(%rsp),%rsp # 8 extra bytes to align the stack .seh_stackalloc 0xa8 .seh_setframe %rbp, 0xa8+5*8 # Load the last two parameters. These go into %rdi and %rsi, which are # non-volatile on Windows, so stash them in the parameter stack area # first. mov %rdi, 0x10(%rbp) .seh_savereg %rdi, 0xa8+5*8+0x10 mov %rsi, 0x18(%rbp) .seh_savereg %rsi, 0xa8+5*8+0x18 mov 0x30(%rbp), $ivp mov 0x38(%rbp), $Htable # Save non-volatile XMM registers. movaps %xmm6,-0xd0(%rbp) .seh_savexmm %xmm6, 0xa8+5*8-0xd0 movaps %xmm7,-0xc0(%rbp) .seh_savexmm %xmm7, 0xa8+5*8-0xc0 movaps %xmm8,-0xb0(%rbp) .seh_savexmm %xmm8, 0xa8+5*8-0xb0 movaps %xmm9,-0xa0(%rbp) .seh_savexmm %xmm9, 0xa8+5*8-0xa0 movaps %xmm10,-0x90(%rbp) .seh_savexmm %xmm10, 0xa8+5*8-0x90 movaps %xmm11,-0x80(%rbp) .seh_savexmm %xmm11, 0xa8+5*8-0x80 movaps %xmm12,-0x70(%rbp) .seh_savexmm %xmm12, 0xa8+5*8-0x70 movaps %xmm13,-0x60(%rbp) .seh_savexmm %xmm13, 0xa8+5*8-0x60 movaps %xmm14,-0x50(%rbp) .seh_savexmm %xmm14, 0xa8+5*8-0x50 movaps %xmm15,-0x40(%rbp) .seh_savexmm %xmm15, 0xa8+5*8-0x40 .seh_endprologue ___ } $code.=<<___; vzeroupper vmovdqu ($ivp),$T1 # input counter value add \$-128,%rsp mov 12($ivp),$counter lea .Lbswap_mask(%rip),$const lea -0x80($key),$in0 # borrow $in0 mov \$0xf80,$end0 # borrow $end0 lea 0x80($key),$key # size optimization vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask and \$-128,%rsp # ensure stack alignment mov 0xf0-0x80($key),$rounds and $end0,$in0 and %rsp,$end0 sub $in0,$end0 jc .Lenc_no_key_aliasing cmp \$768,$end0 jnc .Lenc_no_key_aliasing sub $end0,%rsp # avoid aliasing with key .Lenc_no_key_aliasing: mov $out,$in0 # |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0) # bytes before the end of the input. Note, in particular, that this is # correct even if |$len| is not an even multiple of 96 or 16. Unlike in # the decryption case, there's no caveat that |$out| must not be near # the very beginning of the address space, because we know that # |$len| >= 3*96 from the check above, and so we know # |$out| + |$len| >= 2*96 (0xc0). 
lea -0xc0($out,$len),$end0 shr \$4,$len call _aesni_ctr32_6x vpshufb $Ii,$inout0,$Xi # save bswapped output on stack vpshufb $Ii,$inout1,$T2 vmovdqu $Xi,0x70(%rsp) vpshufb $Ii,$inout2,$Z0 vmovdqu $T2,0x60(%rsp) vpshufb $Ii,$inout3,$Z1 vmovdqu $Z0,0x50(%rsp) vpshufb $Ii,$inout4,$Z2 vmovdqu $Z1,0x40(%rsp) vpshufb $Ii,$inout5,$Z3 # passed to _aesni_ctr32_ghash_6x vmovdqu $Z2,0x30(%rsp) call _aesni_ctr32_6x mov $Xip_offset(%rbp), %r12 lea 0x20($Htable),$Htable # size optimization vmovdqu (%r12),$Xi # load Xi sub \$12,$len mov \$0x60*2,%rax vpshufb $Ii,$Xi,$Xi call _aesni_ctr32_ghash_6x vmovdqu 0x20(%rsp),$Z3 # I[5] vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask vmovdqu 0x00-0x20($Htable),$Hkey # $Hkey^1 vpunpckhqdq $Z3,$Z3,$T1 vmovdqu 0x20-0x20($Htable),$rndkey # borrow $rndkey for $HK vmovups $inout0,-0x60($out) # save output vpshufb $Ii,$inout0,$inout0 # but keep bswapped copy vpxor $Z3,$T1,$T1 vmovups $inout1,-0x50($out) vpshufb $Ii,$inout1,$inout1 vmovups $inout2,-0x40($out) vpshufb $Ii,$inout2,$inout2 vmovups $inout3,-0x30($out) vpshufb $Ii,$inout3,$inout3 vmovups $inout4,-0x20($out) vpshufb $Ii,$inout4,$inout4 vmovups $inout5,-0x10($out) vpshufb $Ii,$inout5,$inout5 vmovdqu $inout0,0x10(%rsp) # free $inout0 ___ { my ($HK,$T3)=($rndkey,$inout0); $code.=<<___; vmovdqu 0x30(%rsp),$Z2 # I[4] vmovdqu 0x10-0x20($Htable),$Ii # borrow $Ii for $Hkey^2 vpunpckhqdq $Z2,$Z2,$T2 vpclmulqdq \$0x00,$Hkey,$Z3,$Z1 vpxor $Z2,$T2,$T2 vpclmulqdq \$0x11,$Hkey,$Z3,$Z3 vpclmulqdq \$0x00,$HK,$T1,$T1 vmovdqu 0x40(%rsp),$T3 # I[3] vpclmulqdq \$0x00,$Ii,$Z2,$Z0 vmovdqu 0x30-0x20($Htable),$Hkey # $Hkey^3 vpxor $Z1,$Z0,$Z0 vpunpckhqdq $T3,$T3,$Z1 vpclmulqdq \$0x11,$Ii,$Z2,$Z2 vpxor $T3,$Z1,$Z1 vpxor $Z3,$Z2,$Z2 vpclmulqdq \$0x10,$HK,$T2,$T2 vmovdqu 0x50-0x20($Htable),$HK vpxor $T1,$T2,$T2 vmovdqu 0x50(%rsp),$T1 # I[2] vpclmulqdq \$0x00,$Hkey,$T3,$Z3 vmovdqu 0x40-0x20($Htable),$Ii # borrow $Ii for $Hkey^4 vpxor $Z0,$Z3,$Z3 vpunpckhqdq $T1,$T1,$Z0 vpclmulqdq \$0x11,$Hkey,$T3,$T3 vpxor $T1,$Z0,$Z0 vpxor $Z2,$T3,$T3 vpclmulqdq \$0x00,$HK,$Z1,$Z1 vpxor $T2,$Z1,$Z1 vmovdqu 0x60(%rsp),$T2 # I[1] vpclmulqdq \$0x00,$Ii,$T1,$Z2 vmovdqu 0x60-0x20($Htable),$Hkey # $Hkey^5 vpxor $Z3,$Z2,$Z2 vpunpckhqdq $T2,$T2,$Z3 vpclmulqdq \$0x11,$Ii,$T1,$T1 vpxor $T2,$Z3,$Z3 vpxor $T3,$T1,$T1 vpclmulqdq \$0x10,$HK,$Z0,$Z0 vmovdqu 0x80-0x20($Htable),$HK vpxor $Z1,$Z0,$Z0 vpxor 0x70(%rsp),$Xi,$Xi # accumulate I[0] vpclmulqdq \$0x00,$Hkey,$T2,$Z1 vmovdqu 0x70-0x20($Htable),$Ii # borrow $Ii for $Hkey^6 vpunpckhqdq $Xi,$Xi,$T3 vpxor $Z2,$Z1,$Z1 vpclmulqdq \$0x11,$Hkey,$T2,$T2 vpxor $Xi,$T3,$T3 vpxor $T1,$T2,$T2 vpclmulqdq \$0x00,$HK,$Z3,$Z3 vpxor $Z0,$Z3,$Z0 vpclmulqdq \$0x00,$Ii,$Xi,$Z2 vmovdqu 0x00-0x20($Htable),$Hkey # $Hkey^1 vpunpckhqdq $inout5,$inout5,$T1 vpclmulqdq \$0x11,$Ii,$Xi,$Xi vpxor $inout5,$T1,$T1 vpxor $Z1,$Z2,$Z1 vpclmulqdq \$0x10,$HK,$T3,$T3 vmovdqu 0x20-0x20($Htable),$HK vpxor $T2,$Xi,$Z3 vpxor $Z0,$T3,$Z2 vmovdqu 0x10-0x20($Htable),$Ii # borrow $Ii for $Hkey^2 vpxor $Z1,$Z3,$T3 # aggregated Karatsuba post-processing vpclmulqdq \$0x00,$Hkey,$inout5,$Z0 vpxor $T3,$Z2,$Z2 vpunpckhqdq $inout4,$inout4,$T2 vpclmulqdq \$0x11,$Hkey,$inout5,$inout5 vpxor $inout4,$T2,$T2 vpslldq \$8,$Z2,$T3 vpclmulqdq \$0x00,$HK,$T1,$T1 vpxor $T3,$Z1,$Xi vpsrldq \$8,$Z2,$Z2 vpxor $Z2,$Z3,$Z3 vpclmulqdq \$0x00,$Ii,$inout4,$Z1 vmovdqu 0x30-0x20($Htable),$Hkey # $Hkey^3 vpxor $Z0,$Z1,$Z1 vpunpckhqdq $inout3,$inout3,$T3 vpclmulqdq \$0x11,$Ii,$inout4,$inout4 vpxor $inout3,$T3,$T3 vpxor $inout5,$inout4,$inout4 vpalignr \$8,$Xi,$Xi,$inout5 # 1st phase 
vpclmulqdq \$0x10,$HK,$T2,$T2 vmovdqu 0x50-0x20($Htable),$HK vpxor $T1,$T2,$T2 vpclmulqdq \$0x00,$Hkey,$inout3,$Z0 vmovdqu 0x40-0x20($Htable),$Ii # borrow $Ii for $Hkey^4 vpxor $Z1,$Z0,$Z0 vpunpckhqdq $inout2,$inout2,$T1 vpclmulqdq \$0x11,$Hkey,$inout3,$inout3 vpxor $inout2,$T1,$T1 vpxor $inout4,$inout3,$inout3 vxorps 0x10(%rsp),$Z3,$Z3 # accumulate $inout0 vpclmulqdq \$0x00,$HK,$T3,$T3 vpxor $T2,$T3,$T3 vpclmulqdq \$0x10,0x10($const),$Xi,$Xi vxorps $inout5,$Xi,$Xi vpclmulqdq \$0x00,$Ii,$inout2,$Z1 vmovdqu 0x60-0x20($Htable),$Hkey # $Hkey^5 vpxor $Z0,$Z1,$Z1 vpunpckhqdq $inout1,$inout1,$T2 vpclmulqdq \$0x11,$Ii,$inout2,$inout2 vpxor $inout1,$T2,$T2 vpalignr \$8,$Xi,$Xi,$inout5 # 2nd phase vpxor $inout3,$inout2,$inout2 vpclmulqdq \$0x10,$HK,$T1,$T1 vmovdqu 0x80-0x20($Htable),$HK vpxor $T3,$T1,$T1 vxorps $Z3,$inout5,$inout5 vpclmulqdq \$0x10,0x10($const),$Xi,$Xi vxorps $inout5,$Xi,$Xi vpclmulqdq \$0x00,$Hkey,$inout1,$Z0 vmovdqu 0x70-0x20($Htable),$Ii # borrow $Ii for $Hkey^6 vpxor $Z1,$Z0,$Z0 vpunpckhqdq $Xi,$Xi,$T3 vpclmulqdq \$0x11,$Hkey,$inout1,$inout1 vpxor $Xi,$T3,$T3 vpxor $inout2,$inout1,$inout1 vpclmulqdq \$0x00,$HK,$T2,$T2 vpxor $T1,$T2,$T2 vpclmulqdq \$0x00,$Ii,$Xi,$Z1 vpclmulqdq \$0x11,$Ii,$Xi,$Z3 vpxor $Z0,$Z1,$Z1 vpclmulqdq \$0x10,$HK,$T3,$Z2 vpxor $inout1,$Z3,$Z3 vpxor $T2,$Z2,$Z2 vpxor $Z1,$Z3,$Z0 # aggregated Karatsuba post-processing vpxor $Z0,$Z2,$Z2 vpslldq \$8,$Z2,$T1 vmovdqu 0x10($const),$Hkey # .Lpoly vpsrldq \$8,$Z2,$Z2 vpxor $T1,$Z1,$Xi vpxor $Z2,$Z3,$Z3 vpalignr \$8,$Xi,$Xi,$T2 # 1st phase vpclmulqdq \$0x10,$Hkey,$Xi,$Xi vpxor $T2,$Xi,$Xi vpalignr \$8,$Xi,$Xi,$T2 # 2nd phase vpclmulqdq \$0x10,$Hkey,$Xi,$Xi vpxor $Z3,$T2,$T2 vpxor $T2,$Xi,$Xi ___ } $code.=<<___; mov $Xip_offset(%rbp), %r12 vpshufb ($const),$Xi,$Xi # .Lbswap_mask vmovdqu $Xi,(%r12) # output Xi vzeroupper ___ $code.=<<___ if ($win64); movaps -0xd0(%rbp),%xmm6 movaps -0xc0(%rbp),%xmm7 movaps -0xb0(%rbp),%xmm8 movaps -0xa0(%rbp),%xmm9 movaps -0x90(%rbp),%xmm10 movaps -0x80(%rbp),%xmm11 movaps -0x70(%rbp),%xmm12 movaps -0x60(%rbp),%xmm13 movaps -0x50(%rbp),%xmm14 movaps -0x40(%rbp),%xmm15 mov 0x10(%rbp),%rdi mov 0x18(%rbp),%rsi ___ $code.=<<___; lea -0x28(%rbp), %rsp # restore %rsp to fixed allocation .cfi_def_cfa %rsp, 0x38 pop %r15 .cfi_pop %r15 pop %r14 .cfi_pop %r14 pop %r13 .cfi_pop %r13 pop %r12 .cfi_pop %r12 pop %rbx .cfi_pop %rbx pop %rbp .cfi_pop %rbp .Lgcm_enc_abort: ret .seh_endproc .cfi_endproc .size aesni_gcm_encrypt,.-aesni_gcm_encrypt ___ $code.=<<___; .section .rodata .align 64 .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 .Lpoly: .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 .Lone_msb: .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 .Ltwo_lsb: .byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 .Lone_lsb: .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 .asciz "AES-NI GCM module for x86_64, CRYPTOGAMS by " .align 64 .text ___ }}} else {{{ $code=<<___; # assembler is too old .text .globl aesni_gcm_encrypt .type aesni_gcm_encrypt,\@abi-omnipotent aesni_gcm_encrypt: _CET_ENDBR xor %eax,%eax ret .size aesni_gcm_encrypt,.-aesni_gcm_encrypt .globl aesni_gcm_decrypt .type aesni_gcm_decrypt,\@abi-omnipotent aesni_gcm_decrypt: _CET_ENDBR xor %eax,%eax ret .size aesni_gcm_decrypt,.-aesni_gcm_decrypt ___ }}} $code =~ s/\`([^\`]*)\`/eval($1)/gem; print $code; close STDOUT or die "error closing STDOUT: $!"; ring-0.17.14/crypto/fipsmodule/aes/asm/aesni-x86.pl000064400000000000000000000713511046102023000200420ustar 00000000000000#! /usr/bin/env perl # Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. # ==================================================================== # # This module implements support for Intel AES-NI extension. In # OpenSSL context it's used with Intel engine, but can also be used as # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for # details]. # # Performance. # # To start with see corresponding paragraph in aesni-x86_64.pl... # Instead of filling table similar to one found there I've chosen to # summarize *comparison* results for raw ECB, CTR and CBC benchmarks. # The simplified table below represents 32-bit performance relative # to 64-bit one in every given point. Ratios vary for different # encryption modes, therefore interval values. # # 16-byte 64-byte 256-byte 1-KB 8-KB # 53-67% 67-84% 91-94% 95-98% 97-99.5% # # Lower ratios for smaller block sizes are perfectly understandable, # because function call overhead is higher in 32-bit mode. Largest # 8-KB block performance is virtually same: 32-bit code is less than # 1% slower for ECB, CBC and CCM, and ~3% slower otherwise. # January 2011 # # See aesni-x86_64.pl for details. Unlike x86_64 version this module # interleaves at most 6 aes[enc|dec] instructions, because there are # not enough registers for 8x interleave [which should be optimal for # Sandy Bridge]. Actually, performance results for 6x interleave # factor presented in aesni-x86_64.pl (except for CTR) are for this # module. # April 2011 # # Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing # one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09. # November 2015 # # Add aesni_ocb_[en|de]crypt. [Removed in BoringSSL] ###################################################################### # Current large-block performance in cycles per byte processed with # 128-bit key (less is better). 
# # CBC en-/decrypt CTR XTS ECB OCB # Westmere 3.77/1.37 1.37 1.52 1.27 # * Bridge 5.07/0.98 0.99 1.09 0.91 1.10 # Haswell 4.44/0.80 0.97 1.03 0.72 0.76 # Skylake 2.68/0.65 0.65 0.66 0.64 0.66 # Silvermont 5.77/3.56 3.67 4.03 3.46 4.03 # Goldmont 3.84/1.39 1.39 1.63 1.31 1.70 # Bulldozer 5.80/0.98 1.05 1.24 0.93 1.23 $PREFIX="aes_hw"; # if $PREFIX is set to "AES", the script # generates drop-in replacement for # crypto/aes/asm/aes-586.pl:-) $AESNI_PREFIX="aes_hw"; $inline=1; # inline _aesni_[en|de]crypt $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../../perlasm"); require "x86asm.pl"; $output = pop; open OUT,">$output"; *STDOUT=*OUT; &asm_init($ARGV[0]); &preprocessor_ifdef("BORINGSSL_DISPATCH_TEST") &external_label("BORINGSSL_function_hit"); &preprocessor_endif(); &static_label("key_const"); if ($PREFIX eq $AESNI_PREFIX) { $movekey=\&movups; } else { $movekey=\&movups; } $len="eax"; $rounds="ecx"; $key="edx"; $inp="esi"; $out="edi"; $rounds_="ebx"; # backup copy for $rounds $key_="ebp"; # backup copy for $key $rndkey0="xmm0"; $rndkey1="xmm1"; $inout0="xmm2"; $inout1="xmm3"; $inout2="xmm4"; $inout3="xmm5"; $in1="xmm5"; $inout4="xmm6"; $in0="xmm6"; $inout5="xmm7"; $ivec="xmm7"; # AESNI extension sub aeskeygenassist { my($dst,$src,$imm)=@_; if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) { &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); } } sub aescommon { my($opcodelet,$dst,$src)=@_; if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) { &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);} } sub aesimc { aescommon(0xdb,@_); } sub aesenc { aescommon(0xdc,@_); } sub aesenclast { aescommon(0xdd,@_); } # Inline version of internal aesni_[en|de]crypt1 { my $sn; sub aesni_inline_generate1 { my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout)); $sn++; &$movekey ($rndkey0,&QWP(0,$key)); &$movekey ($rndkey1,&QWP(16,$key)); &xorps ($ivec,$rndkey0) if (defined($ivec)); &lea ($key,&DWP(32,$key)); &xorps ($inout,$ivec) if (defined($ivec)); &xorps ($inout,$rndkey0) if (!defined($ivec)); &set_label("${p}1_loop_$sn"); eval"&aes${p} ($inout,$rndkey1)"; &dec ($rounds); &$movekey ($rndkey1,&QWP(0,$key)); &lea ($key,&DWP(16,$key)); &jnz (&label("${p}1_loop_$sn")); eval"&aes${p}last ($inout,$rndkey1)"; }} sub aesni_generate1 # fully unrolled loop { my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout)); &function_begin_B("_aesni_${p}rypt1"); &movups ($rndkey0,&QWP(0,$key)); &$movekey ($rndkey1,&QWP(0x10,$key)); &xorps ($inout,$rndkey0); &$movekey ($rndkey0,&QWP(0x20,$key)); &lea ($key,&DWP(0x30,$key)); &cmp ($rounds,11); &jb (&label("${p}128")); &lea ($key,&DWP(0x40,$key)); # 192-bit key support was removed. eval"&aes${p} ($inout,$rndkey1)"; &$movekey ($rndkey1,&QWP(-0x40,$key)); eval"&aes${p} ($inout,$rndkey0)"; &$movekey ($rndkey0,&QWP(-0x30,$key)); # 192-bit key support was removed. 
eval"&aes${p} ($inout,$rndkey1)"; &$movekey ($rndkey1,&QWP(-0x20,$key)); eval"&aes${p} ($inout,$rndkey0)"; &$movekey ($rndkey0,&QWP(-0x10,$key)); &set_label("${p}128"); eval"&aes${p} ($inout,$rndkey1)"; &$movekey ($rndkey1,&QWP(0,$key)); eval"&aes${p} ($inout,$rndkey0)"; &$movekey ($rndkey0,&QWP(0x10,$key)); eval"&aes${p} ($inout,$rndkey1)"; &$movekey ($rndkey1,&QWP(0x20,$key)); eval"&aes${p} ($inout,$rndkey0)"; &$movekey ($rndkey0,&QWP(0x30,$key)); eval"&aes${p} ($inout,$rndkey1)"; &$movekey ($rndkey1,&QWP(0x40,$key)); eval"&aes${p} ($inout,$rndkey0)"; &$movekey ($rndkey0,&QWP(0x50,$key)); eval"&aes${p} ($inout,$rndkey1)"; &$movekey ($rndkey1,&QWP(0x60,$key)); eval"&aes${p} ($inout,$rndkey0)"; &$movekey ($rndkey0,&QWP(0x70,$key)); eval"&aes${p} ($inout,$rndkey1)"; eval"&aes${p}last ($inout,$rndkey0)"; &ret(); &function_end_B("_aesni_${p}rypt1"); } # _aesni_[en|de]cryptN are private interfaces, N denotes interleave # factor. Why 3x subroutine were originally used in loops? Even though # aes[enc|dec] latency was originally 6, it could be scheduled only # every *2nd* cycle. Thus 3x interleave was the one providing optimal # utilization, i.e. when subroutine's throughput is virtually same as # of non-interleaved subroutine [for number of input blocks up to 3]. # This is why it originally made no sense to implement 2x subroutine. # But times change and it became appropriate to spend extra 192 bytes # on 2x subroutine on Atom Silvermont account. For processors that # can schedule aes[enc|dec] every cycle optimal interleave factor # equals to corresponding instructions latency. 8x is optimal for # * Bridge, but it's unfeasible to accommodate such implementation # in XMM registers addressable in 32-bit mode and therefore maximum # of 6x is used instead... 
sub aesni_generate2 { my $p=shift; &function_begin_B("_aesni_${p}rypt2"); &$movekey ($rndkey0,&QWP(0,$key)); &shl ($rounds,4); &$movekey ($rndkey1,&QWP(16,$key)); &xorps ($inout0,$rndkey0); &pxor ($inout1,$rndkey0); &$movekey ($rndkey0,&QWP(32,$key)); &lea ($key,&DWP(32,$key,$rounds)); &neg ($rounds); &add ($rounds,16); &set_label("${p}2_loop"); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout1,$rndkey1)"; &$movekey ($rndkey1,&QWP(0,$key,$rounds)); &add ($rounds,32); eval"&aes${p} ($inout0,$rndkey0)"; eval"&aes${p} ($inout1,$rndkey0)"; &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); &jnz (&label("${p}2_loop")); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout1,$rndkey1)"; eval"&aes${p}last ($inout0,$rndkey0)"; eval"&aes${p}last ($inout1,$rndkey0)"; &ret(); &function_end_B("_aesni_${p}rypt2"); } sub aesni_generate3 { my $p=shift; &function_begin_B("_aesni_${p}rypt3"); &$movekey ($rndkey0,&QWP(0,$key)); &shl ($rounds,4); &$movekey ($rndkey1,&QWP(16,$key)); &xorps ($inout0,$rndkey0); &pxor ($inout1,$rndkey0); &pxor ($inout2,$rndkey0); &$movekey ($rndkey0,&QWP(32,$key)); &lea ($key,&DWP(32,$key,$rounds)); &neg ($rounds); &add ($rounds,16); &set_label("${p}3_loop"); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout1,$rndkey1)"; eval"&aes${p} ($inout2,$rndkey1)"; &$movekey ($rndkey1,&QWP(0,$key,$rounds)); &add ($rounds,32); eval"&aes${p} ($inout0,$rndkey0)"; eval"&aes${p} ($inout1,$rndkey0)"; eval"&aes${p} ($inout2,$rndkey0)"; &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); &jnz (&label("${p}3_loop")); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout1,$rndkey1)"; eval"&aes${p} ($inout2,$rndkey1)"; eval"&aes${p}last ($inout0,$rndkey0)"; eval"&aes${p}last ($inout1,$rndkey0)"; eval"&aes${p}last ($inout2,$rndkey0)"; &ret(); &function_end_B("_aesni_${p}rypt3"); } # 4x interleave is implemented to improve small block performance, # most notably [and naturally] 4 block by ~30%. One can argue that one # should have implemented 5x as well, but improvement would be <20%, # so it's not worth it... 
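# Illustrative sketch (editorial, not part of the original code): the
# _aesni_[en|de]cryptN subroutines all walk the key schedule with the same
# biased-pointer trick -- shift $rounds left by 4, lea 32($key,$rounds) into
# $key, negate $rounds, add 16 -- so the loop counter doubles as a negative
# byte offset that reaches zero in the final iteration, just before the key
# used by aes[enc|dec]last is fetched (the 6x variant below merely jumps into
# the middle of its loop).  The hypothetical helper below is never called; it
# replays that arithmetic in plain Perl and returns the byte offsets of the
# round keys in the order they are consumed.
sub _round_key_walk_sketch {
	my ($rounds)=@_;		# 9 for AES-128, 13 for AES-256
	my @offs=(0,16,32);		# whitening key and first two round keys, loaded up front
	my $base = 32 + ($rounds<<4);	# where the lea leaves $key
	my $cnt  = 16 - ($rounds<<4);	# after "neg; add 16"
	do {
		push @offs, $base + $cnt;	# &QWP(0,$key,$rounds)
		$cnt += 32;			# the add whose flags the jnz tests
		push @offs, $base - 16 + $cnt;	# &QWP(-16,$key,$rounds)
	} while ($cnt != 0);
	return @offs;			# 0,16,...,16*($rounds+1): every round key exactly once
}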
sub aesni_generate4 { my $p=shift; &function_begin_B("_aesni_${p}rypt4"); &$movekey ($rndkey0,&QWP(0,$key)); &$movekey ($rndkey1,&QWP(16,$key)); &shl ($rounds,4); &xorps ($inout0,$rndkey0); &pxor ($inout1,$rndkey0); &pxor ($inout2,$rndkey0); &pxor ($inout3,$rndkey0); &$movekey ($rndkey0,&QWP(32,$key)); &lea ($key,&DWP(32,$key,$rounds)); &neg ($rounds); &data_byte (0x0f,0x1f,0x40,0x00); &add ($rounds,16); &set_label("${p}4_loop"); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout1,$rndkey1)"; eval"&aes${p} ($inout2,$rndkey1)"; eval"&aes${p} ($inout3,$rndkey1)"; &$movekey ($rndkey1,&QWP(0,$key,$rounds)); &add ($rounds,32); eval"&aes${p} ($inout0,$rndkey0)"; eval"&aes${p} ($inout1,$rndkey0)"; eval"&aes${p} ($inout2,$rndkey0)"; eval"&aes${p} ($inout3,$rndkey0)"; &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); &jnz (&label("${p}4_loop")); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout1,$rndkey1)"; eval"&aes${p} ($inout2,$rndkey1)"; eval"&aes${p} ($inout3,$rndkey1)"; eval"&aes${p}last ($inout0,$rndkey0)"; eval"&aes${p}last ($inout1,$rndkey0)"; eval"&aes${p}last ($inout2,$rndkey0)"; eval"&aes${p}last ($inout3,$rndkey0)"; &ret(); &function_end_B("_aesni_${p}rypt4"); } sub aesni_generate6 { my $p=shift; &function_begin_B("_aesni_${p}rypt6"); &static_label("_aesni_${p}rypt6_enter"); &$movekey ($rndkey0,&QWP(0,$key)); &shl ($rounds,4); &$movekey ($rndkey1,&QWP(16,$key)); &xorps ($inout0,$rndkey0); &pxor ($inout1,$rndkey0); # pxor does better here &pxor ($inout2,$rndkey0); eval"&aes${p} ($inout0,$rndkey1)"; &pxor ($inout3,$rndkey0); &pxor ($inout4,$rndkey0); eval"&aes${p} ($inout1,$rndkey1)"; &lea ($key,&DWP(32,$key,$rounds)); &neg ($rounds); eval"&aes${p} ($inout2,$rndkey1)"; &pxor ($inout5,$rndkey0); &$movekey ($rndkey0,&QWP(0,$key,$rounds)); &add ($rounds,16); &jmp (&label("_aesni_${p}rypt6_inner")); &set_label("${p}6_loop",16); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout1,$rndkey1)"; eval"&aes${p} ($inout2,$rndkey1)"; &set_label("_aesni_${p}rypt6_inner"); eval"&aes${p} ($inout3,$rndkey1)"; eval"&aes${p} ($inout4,$rndkey1)"; eval"&aes${p} ($inout5,$rndkey1)"; &set_label("_aesni_${p}rypt6_enter"); &$movekey ($rndkey1,&QWP(0,$key,$rounds)); &add ($rounds,32); eval"&aes${p} ($inout0,$rndkey0)"; eval"&aes${p} ($inout1,$rndkey0)"; eval"&aes${p} ($inout2,$rndkey0)"; eval"&aes${p} ($inout3,$rndkey0)"; eval"&aes${p} ($inout4,$rndkey0)"; eval"&aes${p} ($inout5,$rndkey0)"; &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); &jnz (&label("${p}6_loop")); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout1,$rndkey1)"; eval"&aes${p} ($inout2,$rndkey1)"; eval"&aes${p} ($inout3,$rndkey1)"; eval"&aes${p} ($inout4,$rndkey1)"; eval"&aes${p} ($inout5,$rndkey1)"; eval"&aes${p}last ($inout0,$rndkey0)"; eval"&aes${p}last ($inout1,$rndkey0)"; eval"&aes${p}last ($inout2,$rndkey0)"; eval"&aes${p}last ($inout3,$rndkey0)"; eval"&aes${p}last ($inout4,$rndkey0)"; eval"&aes${p}last ($inout5,$rndkey0)"; &ret(); &function_end_B("_aesni_${p}rypt6"); } &aesni_generate2("enc") if ($PREFIX eq $AESNI_PREFIX); &aesni_generate3("enc") if ($PREFIX eq $AESNI_PREFIX); &aesni_generate4("enc") if ($PREFIX eq $AESNI_PREFIX); &aesni_generate6("enc") if ($PREFIX eq $AESNI_PREFIX); if ($PREFIX eq $AESNI_PREFIX) { ###################################################################### # void aes_hw_ctr32_encrypt_blocks (const void *in, void *out, # size_t blocks, const AES_KEY *key, # const char *ivec); # # Handles only complete blocks, operates on 32-bit counter and # does not update *ivec! 
(see crypto/modes/ctr128.c for details) # # stack layout: # 0 pshufb mask # 16 vector addend: 0,6,6,6 # 32 counter-less ivec # 48 1st triplet of counter vector # 64 2nd triplet of counter vector # 80 saved %esp &function_begin("${PREFIX}_ctr32_encrypt_blocks"); &record_function_hit(0); &mov ($inp,&wparam(0)); &mov ($out,&wparam(1)); &mov ($len,&wparam(2)); &mov ($key,&wparam(3)); &mov ($rounds_,&wparam(4)); &mov ($key_,"esp"); &sub ("esp",88); &and ("esp",-16); # align stack &mov (&DWP(80,"esp"),$key_); &cmp ($len,1); &je (&label("ctr32_one_shortcut")); &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec # compose byte-swap control mask for pshufb on stack &mov (&DWP(0,"esp"),0x0c0d0e0f); &mov (&DWP(4,"esp"),0x08090a0b); &mov (&DWP(8,"esp"),0x04050607); &mov (&DWP(12,"esp"),0x00010203); # compose counter increment vector on stack &mov ($rounds,6); &xor ($key_,$key_); &mov (&DWP(16,"esp"),$rounds); &mov (&DWP(20,"esp"),$rounds); &mov (&DWP(24,"esp"),$rounds); &mov (&DWP(28,"esp"),$key_); &pextrd ($rounds_,$inout5,3); # pull 32-bit counter &pinsrd ($inout5,$key_,3); # wipe 32-bit counter &mov ($rounds,&DWP(240,$key)); # key->rounds # compose 2 vectors of 3x32-bit counters &bswap ($rounds_); &pxor ($rndkey0,$rndkey0); &pxor ($rndkey1,$rndkey1); &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask &pinsrd ($rndkey0,$rounds_,0); &lea ($key_,&DWP(3,$rounds_)); &pinsrd ($rndkey1,$key_,0); &inc ($rounds_); &pinsrd ($rndkey0,$rounds_,1); &inc ($key_); &pinsrd ($rndkey1,$key_,1); &inc ($rounds_); &pinsrd ($rndkey0,$rounds_,2); &inc ($key_); &pinsrd ($rndkey1,$key_,2); &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet &pshufb ($rndkey0,$inout0); # byte swap &movdqu ($inout4,&QWP(0,$key)); # key[0] &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet &pshufb ($rndkey1,$inout0); # byte swap &pshufd ($inout0,$rndkey0,3<<6); # place counter to upper dword &pshufd ($inout1,$rndkey0,2<<6); &cmp ($len,6); &jb (&label("ctr32_tail")); &pxor ($inout5,$inout4); # counter-less ivec^key[0] &shl ($rounds,4); &mov ($rounds_,16); &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec^key[0] &mov ($key_,$key); # backup $key &sub ($rounds_,$rounds); # backup twisted $rounds &lea ($key,&DWP(32,$key,$rounds)); &sub ($len,6); &jmp (&label("ctr32_loop6")); &set_label("ctr32_loop6",16); # inlining _aesni_encrypt6's prologue gives ~6% improvement... 
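# Illustrative sketch (editorial, not part of the original code): each 6-block
# batch encrypts counters c..c+5.  The two stack "triplets" hold the
# byte-swapped values {c,c+1,c+2} and {c+3,c+4,c+5} with a zero fourth lane,
# so the pshufd immediates 3<<6, 2<<6 and 1<<6 used around this loop broadcast
# that zero into dwords 0-2 and drop one swapped counter into dword 3; pxor
# with the saved counter-less ivec^key[0] then yields counter block i already
# xor-ed with round key 0.  The hypothetical helper below is never called; it
# only shows the value that ends up in the top dword of block i.
sub _ctr32_lane_sketch {
	my ($c,$i)=@_;			# host-order 32-bit counter, block index 0..5
	return unpack("N",pack("V",($c+$i) & 0xffffffff));	# byte-swapped counter for dword 3
}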
&pshufd ($inout2,$rndkey0,1<<6); &movdqa ($rndkey0,&QWP(32,"esp")); # pull counter-less ivec &pshufd ($inout3,$rndkey1,3<<6); &pxor ($inout0,$rndkey0); # merge counter-less ivec &pshufd ($inout4,$rndkey1,2<<6); &pxor ($inout1,$rndkey0); &pshufd ($inout5,$rndkey1,1<<6); &$movekey ($rndkey1,&QWP(16,$key_)); &pxor ($inout2,$rndkey0); &pxor ($inout3,$rndkey0); &aesenc ($inout0,$rndkey1); &pxor ($inout4,$rndkey0); &pxor ($inout5,$rndkey0); &aesenc ($inout1,$rndkey1); &$movekey ($rndkey0,&QWP(32,$key_)); &mov ($rounds,$rounds_); &aesenc ($inout2,$rndkey1); &aesenc ($inout3,$rndkey1); &aesenc ($inout4,$rndkey1); &aesenc ($inout5,$rndkey1); &call (&label("_aesni_encrypt6_enter")); &movups ($rndkey1,&QWP(0,$inp)); &movups ($rndkey0,&QWP(0x10,$inp)); &xorps ($inout0,$rndkey1); &movups ($rndkey1,&QWP(0x20,$inp)); &xorps ($inout1,$rndkey0); &movups (&QWP(0,$out),$inout0); &movdqa ($rndkey0,&QWP(16,"esp")); # load increment &xorps ($inout2,$rndkey1); &movdqa ($rndkey1,&QWP(64,"esp")); # load 2nd triplet &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); &paddd ($rndkey1,$rndkey0); # 2nd triplet increment &paddd ($rndkey0,&QWP(48,"esp")); # 1st triplet increment &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask &movups ($inout1,&QWP(0x30,$inp)); &movups ($inout2,&QWP(0x40,$inp)); &xorps ($inout3,$inout1); &movups ($inout1,&QWP(0x50,$inp)); &lea ($inp,&DWP(0x60,$inp)); &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet &pshufb ($rndkey0,$inout0); # byte swap &xorps ($inout4,$inout2); &movups (&QWP(0x30,$out),$inout3); &xorps ($inout5,$inout1); &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet &pshufb ($rndkey1,$inout0); # byte swap &movups (&QWP(0x40,$out),$inout4); &pshufd ($inout0,$rndkey0,3<<6); &movups (&QWP(0x50,$out),$inout5); &lea ($out,&DWP(0x60,$out)); &pshufd ($inout1,$rndkey0,2<<6); &sub ($len,6); &jnc (&label("ctr32_loop6")); &add ($len,6); &jz (&label("ctr32_ret")); &movdqu ($inout5,&QWP(0,$key_)); &mov ($key,$key_); &pxor ($inout5,&QWP(32,"esp")); # restore count-less ivec &mov ($rounds,&DWP(240,$key_)); # restore $rounds &set_label("ctr32_tail"); &por ($inout0,$inout5); &cmp ($len,2); &jb (&label("ctr32_one")); &pshufd ($inout2,$rndkey0,1<<6); &por ($inout1,$inout5); &je (&label("ctr32_two")); &pshufd ($inout3,$rndkey1,3<<6); &por ($inout2,$inout5); &cmp ($len,4); &jb (&label("ctr32_three")); &pshufd ($inout4,$rndkey1,2<<6); &por ($inout3,$inout5); &je (&label("ctr32_four")); &por ($inout4,$inout5); &call ("_aesni_encrypt6"); &movups ($rndkey1,&QWP(0,$inp)); &movups ($rndkey0,&QWP(0x10,$inp)); &xorps ($inout0,$rndkey1); &movups ($rndkey1,&QWP(0x20,$inp)); &xorps ($inout1,$rndkey0); &movups ($rndkey0,&QWP(0x30,$inp)); &xorps ($inout2,$rndkey1); &movups ($rndkey1,&QWP(0x40,$inp)); &xorps ($inout3,$rndkey0); &movups (&QWP(0,$out),$inout0); &xorps ($inout4,$rndkey1); &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); &movups (&QWP(0x30,$out),$inout3); &movups (&QWP(0x40,$out),$inout4); &jmp (&label("ctr32_ret")); &set_label("ctr32_one_shortcut",16); &movups ($inout0,&QWP(0,$rounds_)); # load ivec &mov ($rounds,&DWP(240,$key)); &set_label("ctr32_one"); if ($inline) { &aesni_inline_generate1("enc"); } else { &call ("_aesni_encrypt1"); } &movups ($in0,&QWP(0,$inp)); &xorps ($in0,$inout0); &movups (&QWP(0,$out),$in0); &jmp (&label("ctr32_ret")); &set_label("ctr32_two",16); &call ("_aesni_encrypt2"); &movups ($inout3,&QWP(0,$inp)); &movups ($inout4,&QWP(0x10,$inp)); &xorps ($inout0,$inout3); &xorps ($inout1,$inout4); &movups (&QWP(0,$out),$inout0); 
&movups (&QWP(0x10,$out),$inout1); &jmp (&label("ctr32_ret")); &set_label("ctr32_three",16); &call ("_aesni_encrypt3"); &movups ($inout3,&QWP(0,$inp)); &movups ($inout4,&QWP(0x10,$inp)); &xorps ($inout0,$inout3); &movups ($inout5,&QWP(0x20,$inp)); &xorps ($inout1,$inout4); &movups (&QWP(0,$out),$inout0); &xorps ($inout2,$inout5); &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); &jmp (&label("ctr32_ret")); &set_label("ctr32_four",16); &call ("_aesni_encrypt4"); &movups ($inout4,&QWP(0,$inp)); &movups ($inout5,&QWP(0x10,$inp)); &movups ($rndkey1,&QWP(0x20,$inp)); &xorps ($inout0,$inout4); &movups ($rndkey0,&QWP(0x30,$inp)); &xorps ($inout1,$inout5); &movups (&QWP(0,$out),$inout0); &xorps ($inout2,$rndkey1); &movups (&QWP(0x10,$out),$inout1); &xorps ($inout3,$rndkey0); &movups (&QWP(0x20,$out),$inout2); &movups (&QWP(0x30,$out),$inout3); &set_label("ctr32_ret"); &pxor ("xmm0","xmm0"); # clear register bank &pxor ("xmm1","xmm1"); &pxor ("xmm2","xmm2"); &pxor ("xmm3","xmm3"); &pxor ("xmm4","xmm4"); &movdqa (&QWP(32,"esp"),"xmm0"); # clear stack &pxor ("xmm5","xmm5"); &movdqa (&QWP(48,"esp"),"xmm0"); &pxor ("xmm6","xmm6"); &movdqa (&QWP(64,"esp"),"xmm0"); &pxor ("xmm7","xmm7"); &mov ("esp",&DWP(80,"esp")); &function_end("${PREFIX}_ctr32_encrypt_blocks"); } ###################################################################### # Mechanical port from aesni-x86_64.pl. # int $PREFIX_set_encrypt_key_base (const unsigned char *userKey, int bits, # AES_KEY *key) &function_begin_B("${PREFIX}_set_encrypt_key_base"); &record_function_hit(3); &mov ("eax",&wparam(0)); &mov ($rounds,&wparam(1)); &mov ($key,&wparam(2)); &push ("ebx"); &call (&label("pic")); &set_label("pic"); &blindpop("ebx"); &lea ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx")); &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0 &lea ($key,&DWP(16,$key)); &cmp ($rounds,256); &je (&label("14rounds")); # 192-bit key support was removed. 
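# Illustrative sketch (editorial, not part of the original code): the
# aeskeygenassist immediates used for rounds 1-10 of the 128-bit schedule
# below are the AES round constants, i.e. successive doublings of 0x01 in
# GF(2^8); the _alt variant further below derives the same constants by
# pslld-doubling the key_const words instead.  The hypothetical helper below
# is never called; it regenerates that sequence.
sub _rcon_sketch {
	my @rc=(0x01);
	push @rc, (($rc[-1]<<1) ^ (($rc[-1] & 0x80) ? 0x1b : 0)) & 0xff  while (@rc < 10);
	return @rc;			# 0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80,0x1b,0x36
}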
&cmp ($rounds,128); &jne (&label("bad_keybits")); &set_label("10rounds",16); &mov ($rounds,9); &$movekey (&QWP(-16,$key),"xmm0"); # round 0 &aeskeygenassist("xmm1","xmm0",0x01); # round 1 &call (&label("key_128_cold")); &aeskeygenassist("xmm1","xmm0",0x2); # round 2 &call (&label("key_128")); &aeskeygenassist("xmm1","xmm0",0x04); # round 3 &call (&label("key_128")); &aeskeygenassist("xmm1","xmm0",0x08); # round 4 &call (&label("key_128")); &aeskeygenassist("xmm1","xmm0",0x10); # round 5 &call (&label("key_128")); &aeskeygenassist("xmm1","xmm0",0x20); # round 6 &call (&label("key_128")); &aeskeygenassist("xmm1","xmm0",0x40); # round 7 &call (&label("key_128")); &aeskeygenassist("xmm1","xmm0",0x80); # round 8 &call (&label("key_128")); &aeskeygenassist("xmm1","xmm0",0x1b); # round 9 &call (&label("key_128")); &aeskeygenassist("xmm1","xmm0",0x36); # round 10 &call (&label("key_128")); &$movekey (&QWP(0,$key),"xmm0"); &mov (&DWP(80,$key),$rounds); &jmp (&label("good_key")); &set_label("key_128",16); &$movekey (&QWP(0,$key),"xmm0"); &lea ($key,&DWP(16,$key)); &set_label("key_128_cold"); &shufps ("xmm4","xmm0",0b00010000); &xorps ("xmm0","xmm4"); &shufps ("xmm4","xmm0",0b10001100); &xorps ("xmm0","xmm4"); &shufps ("xmm1","xmm1",0b11111111); # critical path &xorps ("xmm0","xmm1"); &ret(); &set_label("14rounds",16); &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey &lea ($key,&DWP(16,$key)); &mov ($rounds,13); &$movekey (&QWP(-32,$key),"xmm0"); # round 0 &$movekey (&QWP(-16,$key),"xmm2"); # round 1 &aeskeygenassist("xmm1","xmm2",0x01); # round 2 &call (&label("key_256a_cold")); &aeskeygenassist("xmm1","xmm0",0x01); # round 3 &call (&label("key_256b")); &aeskeygenassist("xmm1","xmm2",0x02); # round 4 &call (&label("key_256a")); &aeskeygenassist("xmm1","xmm0",0x02); # round 5 &call (&label("key_256b")); &aeskeygenassist("xmm1","xmm2",0x04); # round 6 &call (&label("key_256a")); &aeskeygenassist("xmm1","xmm0",0x04); # round 7 &call (&label("key_256b")); &aeskeygenassist("xmm1","xmm2",0x08); # round 8 &call (&label("key_256a")); &aeskeygenassist("xmm1","xmm0",0x08); # round 9 &call (&label("key_256b")); &aeskeygenassist("xmm1","xmm2",0x10); # round 10 &call (&label("key_256a")); &aeskeygenassist("xmm1","xmm0",0x10); # round 11 &call (&label("key_256b")); &aeskeygenassist("xmm1","xmm2",0x20); # round 12 &call (&label("key_256a")); &aeskeygenassist("xmm1","xmm0",0x20); # round 13 &call (&label("key_256b")); &aeskeygenassist("xmm1","xmm2",0x40); # round 14 &call (&label("key_256a")); &$movekey (&QWP(0,$key),"xmm0"); &mov (&DWP(16,$key),$rounds); &xor ("eax","eax"); &jmp (&label("good_key")); &set_label("key_256a",16); &$movekey (&QWP(0,$key),"xmm2"); &lea ($key,&DWP(16,$key)); &set_label("key_256a_cold"); &shufps ("xmm4","xmm0",0b00010000); &xorps ("xmm0","xmm4"); &shufps ("xmm4","xmm0",0b10001100); &xorps ("xmm0","xmm4"); &shufps ("xmm1","xmm1",0b11111111); # critical path &xorps ("xmm0","xmm1"); &ret(); &set_label("key_256b",16); &$movekey (&QWP(0,$key),"xmm0"); &lea ($key,&DWP(16,$key)); &shufps ("xmm4","xmm2",0b00010000); &xorps ("xmm2","xmm4"); &shufps ("xmm4","xmm2",0b10001100); &xorps ("xmm2","xmm4"); &shufps ("xmm1","xmm1",0b10101010); # critical path &xorps ("xmm2","xmm1"); &ret(); &set_label("good_key"); &pxor ("xmm0","xmm0"); &pxor ("xmm1","xmm1"); &pxor ("xmm2","xmm2"); &pxor ("xmm3","xmm3"); &pxor ("xmm4","xmm4"); &pxor ("xmm5","xmm5"); &xor ("eax","eax"); &pop ("ebx"); &ret (); &set_label("bad_keybits",4); &pxor ("xmm0","xmm0"); &mov ("eax",-2); &pop ("ebx"); &ret (); 
&function_end_B("${PREFIX}_set_encrypt_key_base"); # int $PREFIX_set_encrypt_key_alt (const unsigned char *userKey, int bits, # AES_KEY *key) &function_begin_B("${PREFIX}_set_encrypt_key_alt"); &record_function_hit(3); &mov ("eax",&wparam(0)); &mov ($rounds,&wparam(1)); &mov ($key,&wparam(2)); &push ("ebx"); &call (&label("pic")); &set_label("pic"); &blindpop("ebx"); &lea ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx")); &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0 &lea ($key,&DWP(16,$key)); &cmp ($rounds,256); &je (&label("14rounds_alt")); # 192-bit key support was removed. &cmp ($rounds,128); &jne (&label("bad_keybits")); &set_label("10rounds_alt",16); &movdqa ("xmm5",&QWP(0x00,"ebx")); &mov ($rounds,8); &movdqa ("xmm4",&QWP(0x20,"ebx")); &movdqa ("xmm2","xmm0"); &movdqu (&QWP(-16,$key),"xmm0"); &set_label("loop_key128"); &pshufb ("xmm0","xmm5"); &aesenclast ("xmm0","xmm4"); &pslld ("xmm4",1); &lea ($key,&DWP(16,$key)); &movdqa ("xmm3","xmm2"); &pslldq ("xmm2",4); &pxor ("xmm3","xmm2"); &pslldq ("xmm2",4); &pxor ("xmm3","xmm2"); &pslldq ("xmm2",4); &pxor ("xmm2","xmm3"); &pxor ("xmm0","xmm2"); &movdqu (&QWP(-16,$key),"xmm0"); &movdqa ("xmm2","xmm0"); &dec ($rounds); &jnz (&label("loop_key128")); &movdqa ("xmm4",&QWP(0x30,"ebx")); &pshufb ("xmm0","xmm5"); &aesenclast ("xmm0","xmm4"); &pslld ("xmm4",1); &movdqa ("xmm3","xmm2"); &pslldq ("xmm2",4); &pxor ("xmm3","xmm2"); &pslldq ("xmm2",4); &pxor ("xmm3","xmm2"); &pslldq ("xmm2",4); &pxor ("xmm2","xmm3"); &pxor ("xmm0","xmm2"); &movdqu (&QWP(0,$key),"xmm0"); &movdqa ("xmm2","xmm0"); &pshufb ("xmm0","xmm5"); &aesenclast ("xmm0","xmm4"); &movdqa ("xmm3","xmm2"); &pslldq ("xmm2",4); &pxor ("xmm3","xmm2"); &pslldq ("xmm2",4); &pxor ("xmm3","xmm2"); &pslldq ("xmm2",4); &pxor ("xmm2","xmm3"); &pxor ("xmm0","xmm2"); &movdqu (&QWP(16,$key),"xmm0"); &mov ($rounds,9); &mov (&DWP(96,$key),$rounds); &jmp (&label("good_key")); # 192-bit key support was removed. 
&set_label("14rounds_alt",16); &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey &lea ($key,&DWP(16,$key)); &movdqa ("xmm5",&QWP(0x00,"ebx")); &movdqa ("xmm4",&QWP(0x20,"ebx")); &mov ($rounds,7); &movdqu (&QWP(-32,$key),"xmm0"); &movdqa ("xmm1","xmm2"); &movdqu (&QWP(-16,$key),"xmm2"); &set_label("loop_key256"); &pshufb ("xmm2","xmm5"); &aesenclast ("xmm2","xmm4"); &movdqa ("xmm3","xmm0"); &pslldq ("xmm0",4); &pxor ("xmm3","xmm0"); &pslldq ("xmm0",4); &pxor ("xmm3","xmm0"); &pslldq ("xmm0",4); &pxor ("xmm0","xmm3"); &pslld ("xmm4",1); &pxor ("xmm0","xmm2"); &movdqu (&QWP(0,$key),"xmm0"); &dec ($rounds); &jz (&label("done_key256")); &pshufd ("xmm2","xmm0",0xff); &pxor ("xmm3","xmm3"); &aesenclast ("xmm2","xmm3"); &movdqa ("xmm3","xmm1"); &pslldq ("xmm1",4); &pxor ("xmm3","xmm1"); &pslldq ("xmm1",4); &pxor ("xmm3","xmm1"); &pslldq ("xmm1",4); &pxor ("xmm1","xmm3"); &pxor ("xmm2","xmm1"); &movdqu (&QWP(16,$key),"xmm2"); &lea ($key,&DWP(32,$key)); &movdqa ("xmm1","xmm2"); &jmp (&label("loop_key256")); &set_label("done_key256"); &mov ($rounds,13); &mov (&DWP(16,$key),$rounds); &set_label("good_key"); &pxor ("xmm0","xmm0"); &pxor ("xmm1","xmm1"); &pxor ("xmm2","xmm2"); &pxor ("xmm3","xmm3"); &pxor ("xmm4","xmm4"); &pxor ("xmm5","xmm5"); &xor ("eax","eax"); &pop ("ebx"); &ret (); &set_label("bad_keybits",4); &pxor ("xmm0","xmm0"); &mov ("eax",-2); &pop ("ebx"); &ret (); &function_end_B("${PREFIX}_set_encrypt_key_alt"); &set_label("key_const",64); &data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d); &data_word(0x04070605,0x04070605,0x04070605,0x04070605); &data_word(1,1,1,1); &data_word(0x1b,0x1b,0x1b,0x1b); &asciz("AES for Intel AES-NI, CRYPTOGAMS by "); &asm_finish(); close STDOUT or die "error closing STDOUT: $!"; ring-0.17.14/crypto/fipsmodule/aes/asm/aesni-x86_64.pl000064400000000000000000001271101046102023000203460ustar 00000000000000#! /usr/bin/env perl # Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. # ==================================================================== # # This module implements support for Intel AES-NI extension. In # OpenSSL context it's used with Intel engine, but can also be used as # drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for # details]. # # Performance. # # Given aes(enc|dec) instructions' latency asymptotic performance for # non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte # processed with 128-bit key. And given their throughput asymptotic # performance for parallelizable modes is 1.25 cycles per byte. Being # asymptotic limit it's not something you commonly achieve in reality, # but how close does one get? Below are results collected for # different modes and block sized. Pairs of numbers are for en-/ # decryption. 
# # 16-byte 64-byte 256-byte 1-KB 8-KB # ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26 # CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26 # CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28 # CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07 # OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38 # CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55 # # ECB, CTR, CBC and CCM results are free from EVP overhead. This means # that otherwise used 'openssl speed -evp aes-128-??? -engine aesni # [-decrypt]' will exhibit 10-15% worse results for smaller blocks. # The results were collected with specially crafted speed.c benchmark # in order to compare them with results reported in "Intel Advanced # Encryption Standard (AES) New Instruction Set" White Paper Revision # 3.0 dated May 2010. All above results are consistently better. This # module also provides better performance for block sizes smaller than # 128 bytes in points *not* represented in the above table. # # Looking at the results for 8-KB buffer. # # CFB and OFB results are far from the limit, because implementation # uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on # single-block aesni_encrypt, which is not the most optimal way to go. # CBC encrypt result is unexpectedly high and there is no documented # explanation for it. Seemingly there is a small penalty for feeding # the result back to AES unit the way it's done in CBC mode. There is # nothing one can do and the result appears optimal. CCM result is # identical to CBC, because CBC-MAC is essentially CBC encrypt without # saving output. CCM CTR "stays invisible," because it's neatly # interleaved wih CBC-MAC. This provides ~30% improvement over # "straightforward" CCM implementation with CTR and CBC-MAC performed # disjointly. Parallelizable modes practically achieve the theoretical # limit. # # Looking at how results vary with buffer size. # # Curves are practically saturated at 1-KB buffer size. In most cases # "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one. # CTR curve doesn't follow this pattern and is "slowest" changing one # with "256-byte" result being 87% of "8-KB." This is because overhead # in CTR mode is most computationally intensive. Small-block CCM # decrypt is slower than encrypt, because first CTR and last CBC-MAC # iterations can't be interleaved. # # Results for 192- and 256-bit keys. # # EVP-free results were observed to scale perfectly with number of # rounds for larger block sizes, i.e. 192-bit result being 10/12 times # lower and 256-bit one - 10/14. Well, in CBC encrypt case differences # are a tad smaller, because the above mentioned penalty biases all # results by same constant value. In similar way function call # overhead affects small-block performance, as well as OFB and CFB # results. Differences are not large, most common coefficients are # 10/11.7 and 10/13.4 (as opposite to 10/12.0 and 10/14.0), but one # observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)... # January 2011 # # While Westmere processor features 6 cycles latency for aes[enc|dec] # instructions, which can be scheduled every second cycle, Sandy # Bridge spends 8 cycles per instruction, but it can schedule them # every cycle. This means that code targeting Westmere would perform # suboptimally on Sandy Bridge. Therefore this update. # # In addition, non-parallelizable CBC encrypt (as well as CCM) is # optimized. 
Relative improvement might appear modest, 8% on Westmere, # but in absolute terms it's 3.77 cycles per byte encrypted with # 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers # should be compared to asymptotic limits of 3.75 for Westmere and # 5.00 for Sandy Bridge. Actually, the fact that they get this close # to asymptotic limits is quite amazing. Indeed, the limit is # calculated as latency times number of rounds, 10 for 128-bit key, # and divided by 16, the number of bytes in block, or in other words # it accounts *solely* for aesenc instructions. But there are extra # instructions, and numbers so close to the asymptotic limits mean # that it's as if it takes as little as *one* additional cycle to # execute all of them. How is it possible? It is possible thanks to # out-of-order execution logic, which manages to overlap post- # processing of previous block, things like saving the output, with # actual encryption of current block, as well as pre-processing of # current block, things like fetching input and xor-ing it with # 0-round element of the key schedule, with actual encryption of # previous block. Keep this in mind... # # For parallelizable modes, such as ECB, CBC decrypt, CTR, higher # performance is achieved by interleaving instructions working on # independent blocks. In which case asymptotic limit for such modes # can be obtained by dividing above mentioned numbers by AES # instructions' interleave factor. Westmere can execute at most 3 # instructions at a time, meaning that optimal interleave factor is 3, # and that's where the "magic" number of 1.25 come from. "Optimal # interleave factor" means that increase of interleave factor does # not improve performance. The formula has proven to reflect reality # pretty well on Westmere... Sandy Bridge on the other hand can # execute up to 8 AES instructions at a time, so how does varying # interleave factor affect the performance? Here is table for ECB # (numbers are cycles per byte processed with 128-bit key): # # instruction interleave factor 3x 6x 8x # theoretical asymptotic limit 1.67 0.83 0.625 # measured performance for 8KB block 1.05 0.86 0.84 # # "as if" interleave factor 4.7x 5.8x 6.0x # # Further data for other parallelizable modes: # # CBC decrypt 1.16 0.93 0.74 # CTR 1.14 0.91 0.74 # # Well, given 3x column it's probably inappropriate to call the limit # asymptotic, if it can be surpassed, isn't it? What happens there? # Rewind to CBC paragraph for the answer. Yes, out-of-order execution # magic is responsible for this. Processor overlaps not only the # additional instructions with AES ones, but even AES instructions # processing adjacent triplets of independent blocks. In the 6x case # additional instructions still claim disproportionally small amount # of additional cycles, but in 8x case number of instructions must be # a tad too high for out-of-order logic to cope with, and AES unit # remains underutilized... As you can see 8x interleave is hardly # justifiable, so there no need to feel bad that 32-bit aesni-x86.pl # utilizes 6x interleave because of limited register bank capacity. # # Higher interleave factors do have negative impact on Westmere # performance. While for ECB mode it's negligible ~1.5%, other # parallelizables perform ~5% worse, which is outweighed by ~25% # improvement on Sandy Bridge. To balance regression on Westmere # CTR mode was implemented with 6x aesenc interleave factor. # April 2011 # # Add aesni_xts_[en|de]crypt. 
Westmere spends 1.25 cycles processing # one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like # in CTR mode AES instruction interleave factor was chosen to be 6x. ###################################################################### # Current large-block performance in cycles per byte processed with # 128-bit key (less is better). # # CBC en-/decrypt CTR XTS ECB OCB # Westmere 3.77/1.25 1.25 1.25 1.26 # * Bridge 5.07/0.74 0.75 0.90 0.85 0.98 # Haswell 4.44/0.63 0.63 0.73 0.63 0.70 # Skylake 2.62/0.63 0.63 0.63 0.63 # Silvermont 5.75/3.54 3.56 4.12 3.87(*) 4.11 # Knights L 2.54/0.77 0.78 0.85 - 1.50 # Goldmont 3.82/1.26 1.26 1.29 1.29 1.50 # Bulldozer 5.77/0.70 0.72 0.90 0.70 0.95 # Ryzen 2.71/0.35 0.35 0.44 0.38 0.49 # # (*) Atom Silvermont ECB result is suboptimal because of penalties # incurred by operations on %xmm8-15. As ECB is not considered # critical, nothing was done to mitigate the problem. $PREFIX="aes_hw"; # if $PREFIX is set to "AES", the script # generates drop-in replacement for # crypto/aes/asm/aes-x86_64.pl:-) $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; $movkey = $PREFIX eq "aes_hw" ? "movups" : "movups"; @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order ("%rdi","%rsi","%rdx","%rcx"); # Unix order $code=".text\n"; $rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!! # this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ... $inp="%rdi"; $out="%rsi"; $len="%rdx"; $key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!! $ivp="%r8"; # cbc, ctr, ... $rnds_="%r10d"; # backup copy for $rounds $key_="%r11"; # backup copy for $key # %xmm register layout $rndkey0="%xmm0"; $rndkey1="%xmm1"; $inout0="%xmm2"; $inout1="%xmm3"; $inout2="%xmm4"; $inout3="%xmm5"; $inout4="%xmm6"; $inout5="%xmm7"; $inout6="%xmm8"; $inout7="%xmm9"; $in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ... $in0="%xmm8"; $iv="%xmm9"; # Inline version of internal aesni_[en|de]crypt1. # # Why folded loop? Because aes[enc|dec] is slow enough to accommodate # cycles which take care of loop variables... { my $sn; sub aesni_generate1 { my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout)); ++$sn; $code.=<<___; $movkey ($key),$rndkey0 $movkey 16($key),$rndkey1 ___ $code.=<<___ if (defined($ivec)); xorps $rndkey0,$ivec lea 32($key),$key xorps $ivec,$inout ___ $code.=<<___ if (!defined($ivec)); lea 32($key),$key xorps $rndkey0,$inout ___ $code.=<<___; .Loop_${p}1_$sn: aes${p} $rndkey1,$inout dec $rounds $movkey ($key),$rndkey1 lea 16($key),$key jnz .Loop_${p}1_$sn # loop body is 16 bytes aes${p}last $rndkey1,$inout ___ }} # _aesni_[en|de]cryptN are private interfaces, N denotes interleave # factor. Why 3x subroutine were originally used in loops? Even though # aes[enc|dec] latency was originally 6, it could be scheduled only # every *2nd* cycle. Thus 3x interleave was the one providing optimal # utilization, i.e. when subroutine's throughput is virtually same as # of non-interleaved subroutine [for number of input blocks up to 3]. # This is why it originally made no sense to implement 2x subroutine. 
# But times change and it became appropriate to spend extra 192 bytes # on 2x subroutine on Atom Silvermont account. For processors that # can schedule aes[enc|dec] every cycle optimal interleave factor # equals to corresponding instructions latency. 8x is optimal for # * Bridge and "super-optimal" for other Intel CPUs... sub aesni_generate2 { my $dir=shift; # As already mentioned it takes in $key and $rounds, which are *not* # preserved. $inout[0-1] is cipher/clear text... $code.=<<___; .type _aesni_${dir}rypt2,\@abi-omnipotent .align 16 _aesni_${dir}rypt2: .cfi_startproc $movkey ($key),$rndkey0 shl \$4,$rounds $movkey 16($key),$rndkey1 xorps $rndkey0,$inout0 xorps $rndkey0,$inout1 $movkey 32($key),$rndkey0 lea 32($key,$rounds),$key neg %rax # $rounds add \$16,%rax .L${dir}_loop2: aes${dir} $rndkey1,$inout0 aes${dir} $rndkey1,$inout1 $movkey ($key,%rax),$rndkey1 add \$32,%rax aes${dir} $rndkey0,$inout0 aes${dir} $rndkey0,$inout1 $movkey -16($key,%rax),$rndkey0 jnz .L${dir}_loop2 aes${dir} $rndkey1,$inout0 aes${dir} $rndkey1,$inout1 aes${dir}last $rndkey0,$inout0 aes${dir}last $rndkey0,$inout1 ret .cfi_endproc .size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2 ___ } sub aesni_generate3 { my $dir=shift; # As already mentioned it takes in $key and $rounds, which are *not* # preserved. $inout[0-2] is cipher/clear text... $code.=<<___; .type _aesni_${dir}rypt3,\@abi-omnipotent .align 16 _aesni_${dir}rypt3: .cfi_startproc $movkey ($key),$rndkey0 shl \$4,$rounds $movkey 16($key),$rndkey1 xorps $rndkey0,$inout0 xorps $rndkey0,$inout1 xorps $rndkey0,$inout2 $movkey 32($key),$rndkey0 lea 32($key,$rounds),$key neg %rax # $rounds add \$16,%rax .L${dir}_loop3: aes${dir} $rndkey1,$inout0 aes${dir} $rndkey1,$inout1 aes${dir} $rndkey1,$inout2 $movkey ($key,%rax),$rndkey1 add \$32,%rax aes${dir} $rndkey0,$inout0 aes${dir} $rndkey0,$inout1 aes${dir} $rndkey0,$inout2 $movkey -16($key,%rax),$rndkey0 jnz .L${dir}_loop3 aes${dir} $rndkey1,$inout0 aes${dir} $rndkey1,$inout1 aes${dir} $rndkey1,$inout2 aes${dir}last $rndkey0,$inout0 aes${dir}last $rndkey0,$inout1 aes${dir}last $rndkey0,$inout2 ret .cfi_endproc .size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3 ___ } # 4x interleave is implemented to improve small block performance, # most notably [and naturally] 4 block by ~30%. One can argue that one # should have implemented 5x as well, but improvement would be <20%, # so it's not worth it... sub aesni_generate4 { my $dir=shift; # As already mentioned it takes in $key and $rounds, which are *not* # preserved. $inout[0-3] is cipher/clear text... 
$code.=<<___; .type _aesni_${dir}rypt4,\@abi-omnipotent .align 16 _aesni_${dir}rypt4: .cfi_startproc $movkey ($key),$rndkey0 shl \$4,$rounds $movkey 16($key),$rndkey1 xorps $rndkey0,$inout0 xorps $rndkey0,$inout1 xorps $rndkey0,$inout2 xorps $rndkey0,$inout3 $movkey 32($key),$rndkey0 lea 32($key,$rounds),$key neg %rax # $rounds .byte 0x0f,0x1f,0x00 add \$16,%rax .L${dir}_loop4: aes${dir} $rndkey1,$inout0 aes${dir} $rndkey1,$inout1 aes${dir} $rndkey1,$inout2 aes${dir} $rndkey1,$inout3 $movkey ($key,%rax),$rndkey1 add \$32,%rax aes${dir} $rndkey0,$inout0 aes${dir} $rndkey0,$inout1 aes${dir} $rndkey0,$inout2 aes${dir} $rndkey0,$inout3 $movkey -16($key,%rax),$rndkey0 jnz .L${dir}_loop4 aes${dir} $rndkey1,$inout0 aes${dir} $rndkey1,$inout1 aes${dir} $rndkey1,$inout2 aes${dir} $rndkey1,$inout3 aes${dir}last $rndkey0,$inout0 aes${dir}last $rndkey0,$inout1 aes${dir}last $rndkey0,$inout2 aes${dir}last $rndkey0,$inout3 ret .cfi_endproc .size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4 ___ } sub aesni_generate6 { my $dir=shift; # As already mentioned it takes in $key and $rounds, which are *not* # preserved. $inout[0-5] is cipher/clear text... $code.=<<___; .type _aesni_${dir}rypt6,\@abi-omnipotent .align 16 _aesni_${dir}rypt6: .cfi_startproc $movkey ($key),$rndkey0 shl \$4,$rounds $movkey 16($key),$rndkey1 xorps $rndkey0,$inout0 pxor $rndkey0,$inout1 pxor $rndkey0,$inout2 aes${dir} $rndkey1,$inout0 lea 32($key,$rounds),$key neg %rax # $rounds aes${dir} $rndkey1,$inout1 pxor $rndkey0,$inout3 pxor $rndkey0,$inout4 aes${dir} $rndkey1,$inout2 pxor $rndkey0,$inout5 $movkey ($key,%rax),$rndkey0 add \$16,%rax jmp .L${dir}_loop6_enter .align 16 .L${dir}_loop6: aes${dir} $rndkey1,$inout0 aes${dir} $rndkey1,$inout1 aes${dir} $rndkey1,$inout2 .L${dir}_loop6_enter: aes${dir} $rndkey1,$inout3 aes${dir} $rndkey1,$inout4 aes${dir} $rndkey1,$inout5 $movkey ($key,%rax),$rndkey1 add \$32,%rax aes${dir} $rndkey0,$inout0 aes${dir} $rndkey0,$inout1 aes${dir} $rndkey0,$inout2 aes${dir} $rndkey0,$inout3 aes${dir} $rndkey0,$inout4 aes${dir} $rndkey0,$inout5 $movkey -16($key,%rax),$rndkey0 jnz .L${dir}_loop6 aes${dir} $rndkey1,$inout0 aes${dir} $rndkey1,$inout1 aes${dir} $rndkey1,$inout2 aes${dir} $rndkey1,$inout3 aes${dir} $rndkey1,$inout4 aes${dir} $rndkey1,$inout5 aes${dir}last $rndkey0,$inout0 aes${dir}last $rndkey0,$inout1 aes${dir}last $rndkey0,$inout2 aes${dir}last $rndkey0,$inout3 aes${dir}last $rndkey0,$inout4 aes${dir}last $rndkey0,$inout5 ret .cfi_endproc .size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6 ___ } sub aesni_generate8 { my $dir=shift; # As already mentioned it takes in $key and $rounds, which are *not* # preserved. $inout[0-7] is cipher/clear text... 
$code.=<<___; .type _aesni_${dir}rypt8,\@abi-omnipotent .align 16 _aesni_${dir}rypt8: .cfi_startproc $movkey ($key),$rndkey0 shl \$4,$rounds $movkey 16($key),$rndkey1 xorps $rndkey0,$inout0 xorps $rndkey0,$inout1 pxor $rndkey0,$inout2 pxor $rndkey0,$inout3 pxor $rndkey0,$inout4 lea 32($key,$rounds),$key neg %rax # $rounds aes${dir} $rndkey1,$inout0 pxor $rndkey0,$inout5 pxor $rndkey0,$inout6 aes${dir} $rndkey1,$inout1 pxor $rndkey0,$inout7 $movkey ($key,%rax),$rndkey0 add \$16,%rax jmp .L${dir}_loop8_inner .align 16 .L${dir}_loop8: aes${dir} $rndkey1,$inout0 aes${dir} $rndkey1,$inout1 .L${dir}_loop8_inner: aes${dir} $rndkey1,$inout2 aes${dir} $rndkey1,$inout3 aes${dir} $rndkey1,$inout4 aes${dir} $rndkey1,$inout5 aes${dir} $rndkey1,$inout6 aes${dir} $rndkey1,$inout7 .L${dir}_loop8_enter: $movkey ($key,%rax),$rndkey1 add \$32,%rax aes${dir} $rndkey0,$inout0 aes${dir} $rndkey0,$inout1 aes${dir} $rndkey0,$inout2 aes${dir} $rndkey0,$inout3 aes${dir} $rndkey0,$inout4 aes${dir} $rndkey0,$inout5 aes${dir} $rndkey0,$inout6 aes${dir} $rndkey0,$inout7 $movkey -16($key,%rax),$rndkey0 jnz .L${dir}_loop8 aes${dir} $rndkey1,$inout0 aes${dir} $rndkey1,$inout1 aes${dir} $rndkey1,$inout2 aes${dir} $rndkey1,$inout3 aes${dir} $rndkey1,$inout4 aes${dir} $rndkey1,$inout5 aes${dir} $rndkey1,$inout6 aes${dir} $rndkey1,$inout7 aes${dir}last $rndkey0,$inout0 aes${dir}last $rndkey0,$inout1 aes${dir}last $rndkey0,$inout2 aes${dir}last $rndkey0,$inout3 aes${dir}last $rndkey0,$inout4 aes${dir}last $rndkey0,$inout5 aes${dir}last $rndkey0,$inout6 aes${dir}last $rndkey0,$inout7 ret .cfi_endproc .size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8 ___ } &aesni_generate2("enc") if ($PREFIX eq "aes_hw"); &aesni_generate3("enc") if ($PREFIX eq "aes_hw"); &aesni_generate4("enc") if ($PREFIX eq "aes_hw"); &aesni_generate6("enc") if ($PREFIX eq "aes_hw"); &aesni_generate8("enc") if ($PREFIX eq "aes_hw"); if ($PREFIX eq "aes_hw") { { ###################################################################### # void aesni_ctr32_encrypt_blocks (const void *in, void *out, # size_t blocks, const AES_KEY *key, # const char *ivec); # # Handles only complete blocks, operates on 32-bit counter and # does not update *ivec! (see crypto/modes/ctr128.c for details) # # Overhaul based on suggestions from Shay Gueron and Vlad Krasnov, # http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest. # Keywords are full unroll and modulo-schedule counter calculations # with zero-round key xor. 
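# Illustrative sketch (editorial, not part of the original code): "zero-round
# key xor" above means the eight counter blocks kept on the stack are stored
# already xor-ed with round key 0, so each loop iteration only refreshes the
# 32-bit counter lane: the new counter is byte-swapped and xor-ed with the
# dword of round key 0 at offset 12 (held in $key0) before being written at
# offset 12 of its slot.  The hypothetical helper below is never called and
# its parameter names are illustrative; it computes the dword for that lane.
sub _ctr_lane_sketch {
	my ($ctr,$i,$key0)=@_;		# host-order counter, block index, dword 12..15 of round key 0
	my $be = unpack("N",pack("V",($ctr+$i) & 0xffffffff));	# the bswap
	return $be ^ $key0;		# stored at 0x10*$i+12(%rsp)
}
# The matching trick at the other end is visible in the code below: the
# last-round key is folded into the loaded input (pxor $rndkey0,$inN), so
# aesenclast $inN performs the final AddRoundKey and the CTR xor with the
# plaintext in a single instruction.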
{ my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15)); my ($key0,$ctr)=("%ebp","${ivp}d"); my $frame_size = 0x80 + ($win64?160:0); $code.=<<___; .globl ${PREFIX}_ctr32_encrypt_blocks .type ${PREFIX}_ctr32_encrypt_blocks,\@function,5 .align 16 ${PREFIX}_ctr32_encrypt_blocks: .cfi_startproc _CET_ENDBR #ifdef BORINGSSL_DISPATCH_TEST movb \$1,BORINGSSL_function_hit(%rip) #endif cmp \$1,$len jne .Lctr32_bulk # handle single block without allocating stack frame, # useful when handling edges movups ($ivp),$inout0 movups ($inp),$inout1 mov 240($key),%edx # key->rounds ___ &aesni_generate1("enc",$key,"%edx"); $code.=<<___; pxor $rndkey0,$rndkey0 # clear register bank pxor $rndkey1,$rndkey1 xorps $inout1,$inout0 pxor $inout1,$inout1 movups $inout0,($out) xorps $inout0,$inout0 jmp .Lctr32_epilogue .align 16 .Lctr32_bulk: lea (%rsp),$key_ # use $key_ as frame pointer .cfi_def_cfa_register $key_ push %rbp .cfi_push %rbp sub \$$frame_size,%rsp and \$-16,%rsp # Linux kernel stack can be incorrectly seeded ___ $code.=<<___ if ($win64); movaps %xmm6,-0xa8($key_) # offload everything movaps %xmm7,-0x98($key_) movaps %xmm8,-0x88($key_) movaps %xmm9,-0x78($key_) movaps %xmm10,-0x68($key_) movaps %xmm11,-0x58($key_) movaps %xmm12,-0x48($key_) movaps %xmm13,-0x38($key_) movaps %xmm14,-0x28($key_) movaps %xmm15,-0x18($key_) .Lctr32_body: ___ $code.=<<___; # 8 16-byte words on top of stack are counter values # xor-ed with zero-round key movdqu ($ivp),$inout0 movdqu ($key),$rndkey0 mov 12($ivp),$ctr # counter LSB pxor $rndkey0,$inout0 mov 12($key),$key0 # 0-round key LSB movdqa $inout0,0x00(%rsp) # populate counter block bswap $ctr movdqa $inout0,$inout1 movdqa $inout0,$inout2 movdqa $inout0,$inout3 movdqa $inout0,0x40(%rsp) movdqa $inout0,0x50(%rsp) movdqa $inout0,0x60(%rsp) mov %rdx,%r10 # about to borrow %rdx movdqa $inout0,0x70(%rsp) lea 1($ctr),%rax lea 2($ctr),%rdx bswap %eax bswap %edx xor $key0,%eax xor $key0,%edx pinsrd \$3,%eax,$inout1 lea 3($ctr),%rax movdqa $inout1,0x10(%rsp) pinsrd \$3,%edx,$inout2 bswap %eax mov %r10,%rdx # restore %rdx lea 4($ctr),%r10 movdqa $inout2,0x20(%rsp) xor $key0,%eax bswap %r10d pinsrd \$3,%eax,$inout3 xor $key0,%r10d movdqa $inout3,0x30(%rsp) lea 5($ctr),%r9 mov %r10d,0x40+12(%rsp) bswap %r9d lea 6($ctr),%r10 mov 240($key),$rounds # key->rounds xor $key0,%r9d bswap %r10d mov %r9d,0x50+12(%rsp) xor $key0,%r10d lea 7($ctr),%r9 mov %r10d,0x60+12(%rsp) bswap %r9d xor $key0,%r9d mov %r9d,0x70+12(%rsp) $movkey 0x10($key),$rndkey1 movdqa 0x40(%rsp),$inout4 movdqa 0x50(%rsp),$inout5 cmp \$8,$len # $len is in blocks jb .Lctr32_tail # short input if ($len<8) lea 0x80($key),$key # size optimization sub \$8,$len # $len is biased by -8 jmp .Lctr32_loop8 .align 32 .Lctr32_loop8: add \$8,$ctr # next counter value movdqa 0x60(%rsp),$inout6 aesenc $rndkey1,$inout0 mov $ctr,%r9d movdqa 0x70(%rsp),$inout7 aesenc $rndkey1,$inout1 bswap %r9d $movkey 0x20-0x80($key),$rndkey0 aesenc $rndkey1,$inout2 xor $key0,%r9d nop aesenc $rndkey1,$inout3 mov %r9d,0x00+12(%rsp) # store next counter value lea 1($ctr),%r9 aesenc $rndkey1,$inout4 aesenc $rndkey1,$inout5 aesenc $rndkey1,$inout6 aesenc $rndkey1,$inout7 $movkey 0x30-0x80($key),$rndkey1 ___ for($i=2;$i<8;$i++) { my $rndkeyx = ($i&1)?$rndkey1:$rndkey0; $code.=<<___; bswap %r9d aesenc $rndkeyx,$inout0 aesenc $rndkeyx,$inout1 xor $key0,%r9d .byte 0x66,0x90 aesenc $rndkeyx,$inout2 aesenc $rndkeyx,$inout3 mov %r9d,`0x10*($i-1)`+12(%rsp) lea $i($ctr),%r9 aesenc $rndkeyx,$inout4 aesenc $rndkeyx,$inout5 aesenc $rndkeyx,$inout6 aesenc $rndkeyx,$inout7 
$movkey `0x20+0x10*$i`-0x80($key),$rndkeyx ___ } $code.=<<___; bswap %r9d aesenc $rndkey0,$inout0 aesenc $rndkey0,$inout1 aesenc $rndkey0,$inout2 xor $key0,%r9d movdqu 0x00($inp),$in0 # start loading input aesenc $rndkey0,$inout3 mov %r9d,0x70+12(%rsp) cmp \$11,$rounds aesenc $rndkey0,$inout4 aesenc $rndkey0,$inout5 aesenc $rndkey0,$inout6 aesenc $rndkey0,$inout7 $movkey 0xa0-0x80($key),$rndkey0 jb .Lctr32_enc_done aesenc $rndkey1,$inout0 aesenc $rndkey1,$inout1 aesenc $rndkey1,$inout2 aesenc $rndkey1,$inout3 aesenc $rndkey1,$inout4 aesenc $rndkey1,$inout5 aesenc $rndkey1,$inout6 aesenc $rndkey1,$inout7 $movkey 0xb0-0x80($key),$rndkey1 aesenc $rndkey0,$inout0 aesenc $rndkey0,$inout1 aesenc $rndkey0,$inout2 aesenc $rndkey0,$inout3 aesenc $rndkey0,$inout4 aesenc $rndkey0,$inout5 aesenc $rndkey0,$inout6 aesenc $rndkey0,$inout7 $movkey 0xc0-0x80($key),$rndkey0 # 192-bit key support was removed. aesenc $rndkey1,$inout0 aesenc $rndkey1,$inout1 aesenc $rndkey1,$inout2 aesenc $rndkey1,$inout3 aesenc $rndkey1,$inout4 aesenc $rndkey1,$inout5 aesenc $rndkey1,$inout6 aesenc $rndkey1,$inout7 $movkey 0xd0-0x80($key),$rndkey1 aesenc $rndkey0,$inout0 aesenc $rndkey0,$inout1 aesenc $rndkey0,$inout2 aesenc $rndkey0,$inout3 aesenc $rndkey0,$inout4 aesenc $rndkey0,$inout5 aesenc $rndkey0,$inout6 aesenc $rndkey0,$inout7 $movkey 0xe0-0x80($key),$rndkey0 jmp .Lctr32_enc_done .align 16 .Lctr32_enc_done: movdqu 0x10($inp),$in1 pxor $rndkey0,$in0 # input^=round[last] movdqu 0x20($inp),$in2 pxor $rndkey0,$in1 movdqu 0x30($inp),$in3 pxor $rndkey0,$in2 movdqu 0x40($inp),$in4 pxor $rndkey0,$in3 movdqu 0x50($inp),$in5 pxor $rndkey0,$in4 prefetcht0 0x1c0($inp) # We process 128 bytes (8*16), so to prefetch 1 iteration prefetcht0 0x200($inp) # We need to prefetch 2 64 byte lines pxor $rndkey0,$in5 aesenc $rndkey1,$inout0 aesenc $rndkey1,$inout1 aesenc $rndkey1,$inout2 aesenc $rndkey1,$inout3 aesenc $rndkey1,$inout4 aesenc $rndkey1,$inout5 aesenc $rndkey1,$inout6 aesenc $rndkey1,$inout7 movdqu 0x60($inp),$rndkey1 # borrow $rndkey1 for inp[6] lea 0x80($inp),$inp # $inp+=8*16 aesenclast $in0,$inout0 # $inN is inp[N]^round[last] pxor $rndkey0,$rndkey1 # borrowed $rndkey movdqu 0x70-0x80($inp),$in0 aesenclast $in1,$inout1 pxor $rndkey0,$in0 movdqa 0x00(%rsp),$in1 # load next counter block aesenclast $in2,$inout2 aesenclast $in3,$inout3 movdqa 0x10(%rsp),$in2 movdqa 0x20(%rsp),$in3 aesenclast $in4,$inout4 aesenclast $in5,$inout5 movdqa 0x30(%rsp),$in4 movdqa 0x40(%rsp),$in5 aesenclast $rndkey1,$inout6 movdqa 0x50(%rsp),$rndkey0 $movkey 0x10-0x80($key),$rndkey1#real 1st-round key aesenclast $in0,$inout7 movups $inout0,($out) # store 8 output blocks movdqa $in1,$inout0 movups $inout1,0x10($out) movdqa $in2,$inout1 movups $inout2,0x20($out) movdqa $in3,$inout2 movups $inout3,0x30($out) movdqa $in4,$inout3 movups $inout4,0x40($out) movdqa $in5,$inout4 movups $inout5,0x50($out) movdqa $rndkey0,$inout5 movups $inout6,0x60($out) movups $inout7,0x70($out) lea 0x80($out),$out # $out+=8*16 sub \$8,$len jnc .Lctr32_loop8 # loop if $len-=8 didn't borrow add \$8,$len # restore real remaining $len jz .Lctr32_done # done if ($len==0) lea -0x80($key),$key .Lctr32_tail: # note that at this point $inout0..5 are populated with # counter values xor-ed with 0-round key lea 16($key),$key cmp \$4,$len jb .Lctr32_loop3 je .Lctr32_loop4 # if ($len>4) compute 7 E(counter) shl \$4,$rounds movdqa 0x60(%rsp),$inout6 pxor $inout7,$inout7 $movkey 16($key),$rndkey0 aesenc $rndkey1,$inout0 aesenc $rndkey1,$inout1 lea 32-16($key,$rounds),$key# prepare for 
.Lenc_loop8_enter neg %rax aesenc $rndkey1,$inout2 add \$16,%rax # prepare for .Lenc_loop8_enter movups ($inp),$in0 aesenc $rndkey1,$inout3 aesenc $rndkey1,$inout4 movups 0x10($inp),$in1 # pre-load input movups 0x20($inp),$in2 aesenc $rndkey1,$inout5 aesenc $rndkey1,$inout6 call .Lenc_loop8_enter movdqu 0x30($inp),$in3 pxor $in0,$inout0 movdqu 0x40($inp),$in0 pxor $in1,$inout1 movdqu $inout0,($out) # store output pxor $in2,$inout2 movdqu $inout1,0x10($out) pxor $in3,$inout3 movdqu $inout2,0x20($out) pxor $in0,$inout4 movdqu $inout3,0x30($out) movdqu $inout4,0x40($out) cmp \$6,$len jb .Lctr32_done # $len was 5, stop store movups 0x50($inp),$in1 xorps $in1,$inout5 movups $inout5,0x50($out) je .Lctr32_done # $len was 6, stop store movups 0x60($inp),$in2 xorps $in2,$inout6 movups $inout6,0x60($out) jmp .Lctr32_done # $len was 7, stop store .align 32 .Lctr32_loop4: aesenc $rndkey1,$inout0 lea 16($key),$key dec $rounds aesenc $rndkey1,$inout1 aesenc $rndkey1,$inout2 aesenc $rndkey1,$inout3 $movkey ($key),$rndkey1 jnz .Lctr32_loop4 aesenclast $rndkey1,$inout0 aesenclast $rndkey1,$inout1 movups ($inp),$in0 # load input movups 0x10($inp),$in1 aesenclast $rndkey1,$inout2 aesenclast $rndkey1,$inout3 movups 0x20($inp),$in2 movups 0x30($inp),$in3 xorps $in0,$inout0 movups $inout0,($out) # store output xorps $in1,$inout1 movups $inout1,0x10($out) pxor $in2,$inout2 movdqu $inout2,0x20($out) pxor $in3,$inout3 movdqu $inout3,0x30($out) jmp .Lctr32_done # $len was 4, stop store .align 32 .Lctr32_loop3: aesenc $rndkey1,$inout0 lea 16($key),$key dec $rounds aesenc $rndkey1,$inout1 aesenc $rndkey1,$inout2 $movkey ($key),$rndkey1 jnz .Lctr32_loop3 aesenclast $rndkey1,$inout0 aesenclast $rndkey1,$inout1 aesenclast $rndkey1,$inout2 movups ($inp),$in0 # load input xorps $in0,$inout0 movups $inout0,($out) # store output cmp \$2,$len jb .Lctr32_done # $len was 1, stop store movups 0x10($inp),$in1 xorps $in1,$inout1 movups $inout1,0x10($out) je .Lctr32_done # $len was 2, stop store movups 0x20($inp),$in2 xorps $in2,$inout2 movups $inout2,0x20($out) # $len was 3, stop store .Lctr32_done: xorps %xmm0,%xmm0 # clear register bank xor $key0,$key0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 pxor %xmm3,%xmm3 pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 ___ $code.=<<___ if (!$win64); pxor %xmm6,%xmm6 pxor %xmm7,%xmm7 movaps %xmm0,0x00(%rsp) # clear stack pxor %xmm8,%xmm8 movaps %xmm0,0x10(%rsp) pxor %xmm9,%xmm9 movaps %xmm0,0x20(%rsp) pxor %xmm10,%xmm10 movaps %xmm0,0x30(%rsp) pxor %xmm11,%xmm11 movaps %xmm0,0x40(%rsp) pxor %xmm12,%xmm12 movaps %xmm0,0x50(%rsp) pxor %xmm13,%xmm13 movaps %xmm0,0x60(%rsp) pxor %xmm14,%xmm14 movaps %xmm0,0x70(%rsp) pxor %xmm15,%xmm15 ___ $code.=<<___ if ($win64); movaps -0xa8($key_),%xmm6 movaps %xmm0,-0xa8($key_) # clear stack movaps -0x98($key_),%xmm7 movaps %xmm0,-0x98($key_) movaps -0x88($key_),%xmm8 movaps %xmm0,-0x88($key_) movaps -0x78($key_),%xmm9 movaps %xmm0,-0x78($key_) movaps -0x68($key_),%xmm10 movaps %xmm0,-0x68($key_) movaps -0x58($key_),%xmm11 movaps %xmm0,-0x58($key_) movaps -0x48($key_),%xmm12 movaps %xmm0,-0x48($key_) movaps -0x38($key_),%xmm13 movaps %xmm0,-0x38($key_) movaps -0x28($key_),%xmm14 movaps %xmm0,-0x28($key_) movaps -0x18($key_),%xmm15 movaps %xmm0,-0x18($key_) movaps %xmm0,0x00(%rsp) movaps %xmm0,0x10(%rsp) movaps %xmm0,0x20(%rsp) movaps %xmm0,0x30(%rsp) movaps %xmm0,0x40(%rsp) movaps %xmm0,0x50(%rsp) movaps %xmm0,0x60(%rsp) movaps %xmm0,0x70(%rsp) ___ $code.=<<___; mov -8($key_),%rbp .cfi_restore %rbp lea ($key_),%rsp .cfi_def_cfa_register %rsp .Lctr32_epilogue: ret .cfi_endproc 
.size ${PREFIX}_ctr32_encrypt_blocks,.-${PREFIX}_ctr32_encrypt_blocks ___ } }} { my ($inp,$bits,$key) = @_4args; $bits =~ s/%r/%e/; # This is based on submission from Intel by # Huang Ying # Vinodh Gopal # Kahraman Akdemir # # Aggressively optimized in respect to aeskeygenassist's critical path # and is contained in %xmm0-5 to meet Win64 ABI requirement. # # int ${PREFIX}_set_encrypt_key(const unsigned char *inp, # int bits, AES_KEY * const key); # # input: $inp user-supplied key # $bits $inp length in bits # $key pointer to key schedule # output: %eax 0 denoting success, -1 or -2 - failure (see C) # $bits rounds-1 (used in aesni_set_decrypt_key) # *$key key schedule # $key pointer to key schedule (used in # aesni_set_decrypt_key) # # Subroutine is frame-less, which means that only volatile registers # are used. Note that it's declared "abi-omnipotent", which means that # amount of volatile registers is smaller on Windows. # # There are two variants of this function, one which uses aeskeygenassist # ("base") and one which uses aesenclast + pshufb ("alt"). See aes/internal.h # for details. $code.=<<___; .globl ${PREFIX}_set_encrypt_key_base .type ${PREFIX}_set_encrypt_key_base,\@abi-omnipotent .align 16 ${PREFIX}_set_encrypt_key_base: .cfi_startproc .seh_startproc _CET_ENDBR #ifdef BORINGSSL_DISPATCH_TEST movb \$1,BORINGSSL_function_hit+3(%rip) #endif sub \$8,%rsp .cfi_adjust_cfa_offset 8 .seh_stackalloc 8 .seh_endprologue movups ($inp),%xmm0 # pull first 128 bits of *userKey xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0 lea 16($key),%rax # %rax is used as modifiable copy of $key cmp \$256,$bits je .L14rounds # 192-bit key support was removed. cmp \$128,$bits jne .Lbad_keybits .L10rounds: mov \$9,$bits # 10 rounds for 128-bit key $movkey %xmm0,($key) # round 0 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1 call .Lkey_expansion_128_cold aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2 call .Lkey_expansion_128 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3 call .Lkey_expansion_128 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4 call .Lkey_expansion_128 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5 call .Lkey_expansion_128 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6 call .Lkey_expansion_128 aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7 call .Lkey_expansion_128 aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8 call .Lkey_expansion_128 aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9 call .Lkey_expansion_128 aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10 call .Lkey_expansion_128 $movkey %xmm0,(%rax) mov $bits,80(%rax) # 240(%rdx) xor %eax,%eax jmp .Lenc_key_ret # 192-bit key support was removed. 
.align 16 .L14rounds: movups 16($inp),%xmm2 # remaining half of *userKey mov \$13,$bits # 14 rounds for 256 lea 16(%rax),%rax $movkey %xmm0,($key) # round 0 $movkey %xmm2,16($key) # round 1 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2 call .Lkey_expansion_256a_cold aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3 call .Lkey_expansion_256b aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4 call .Lkey_expansion_256a aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5 call .Lkey_expansion_256b aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6 call .Lkey_expansion_256a aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7 call .Lkey_expansion_256b aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8 call .Lkey_expansion_256a aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9 call .Lkey_expansion_256b aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10 call .Lkey_expansion_256a aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11 call .Lkey_expansion_256b aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12 call .Lkey_expansion_256a aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13 call .Lkey_expansion_256b aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14 call .Lkey_expansion_256a $movkey %xmm0,(%rax) mov $bits,16(%rax) # 240(%rdx) xor %rax,%rax jmp .Lenc_key_ret .align 16 .Lbad_keybits: mov \$-2,%rax .Lenc_key_ret: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 pxor %xmm3,%xmm3 pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 add \$8,%rsp .cfi_adjust_cfa_offset -8 ret .cfi_endproc .seh_endproc .align 16 .Lkey_expansion_128: .cfi_startproc $movkey %xmm0,(%rax) lea 16(%rax),%rax .Lkey_expansion_128_cold: shufps \$0b00010000,%xmm0,%xmm4 xorps %xmm4, %xmm0 shufps \$0b10001100,%xmm0,%xmm4 xorps %xmm4, %xmm0 shufps \$0b11111111,%xmm1,%xmm1 # critical path xorps %xmm1,%xmm0 ret .cfi_endproc .align 16 .Lkey_expansion_256a: .cfi_startproc $movkey %xmm2,(%rax) lea 16(%rax),%rax .Lkey_expansion_256a_cold: shufps \$0b00010000,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps \$0b10001100,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps \$0b11111111,%xmm1,%xmm1 # critical path xorps %xmm1,%xmm0 ret .cfi_endproc .align 16 .Lkey_expansion_256b: .cfi_startproc $movkey %xmm0,(%rax) lea 16(%rax),%rax shufps \$0b00010000,%xmm2,%xmm4 xorps %xmm4,%xmm2 shufps \$0b10001100,%xmm2,%xmm4 xorps %xmm4,%xmm2 shufps \$0b10101010,%xmm1,%xmm1 # critical path xorps %xmm1,%xmm2 ret .cfi_endproc .size ${PREFIX}_set_encrypt_key_base,.-${PREFIX}_set_encrypt_key_base .globl ${PREFIX}_set_encrypt_key_alt .type ${PREFIX}_set_encrypt_key_alt,\@abi-omnipotent .align 16 ${PREFIX}_set_encrypt_key_alt: .cfi_startproc .seh_startproc _CET_ENDBR #ifdef BORINGSSL_DISPATCH_TEST movb \$1,BORINGSSL_function_hit+3(%rip) #endif sub \$8,%rsp .cfi_adjust_cfa_offset 8 .seh_stackalloc 8 .seh_endprologue movups ($inp),%xmm0 # pull first 128 bits of *userKey xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0 lea 16($key),%rax # %rax is used as modifiable copy of $key cmp \$256,$bits je .L14rounds_alt # 192-bit key support was removed. 
cmp \$128,$bits jne .Lbad_keybits_alt mov \$9,$bits # 10 rounds for 128-bit key movdqa .Lkey_rotate(%rip),%xmm5 mov \$8,%r10d movdqa .Lkey_rcon1(%rip),%xmm4 movdqa %xmm0,%xmm2 movdqu %xmm0,($key) jmp .Loop_key128 .align 16 .Loop_key128: pshufb %xmm5,%xmm0 aesenclast %xmm4,%xmm0 pslld \$1,%xmm4 lea 16(%rax),%rax movdqa %xmm2,%xmm3 pslldq \$4,%xmm2 pxor %xmm2,%xmm3 pslldq \$4,%xmm2 pxor %xmm2,%xmm3 pslldq \$4,%xmm2 pxor %xmm3,%xmm2 pxor %xmm2,%xmm0 movdqu %xmm0,-16(%rax) movdqa %xmm0,%xmm2 dec %r10d jnz .Loop_key128 movdqa .Lkey_rcon1b(%rip),%xmm4 pshufb %xmm5,%xmm0 aesenclast %xmm4,%xmm0 pslld \$1,%xmm4 movdqa %xmm2,%xmm3 pslldq \$4,%xmm2 pxor %xmm2,%xmm3 pslldq \$4,%xmm2 pxor %xmm2,%xmm3 pslldq \$4,%xmm2 pxor %xmm3,%xmm2 pxor %xmm2,%xmm0 movdqu %xmm0,(%rax) movdqa %xmm0,%xmm2 pshufb %xmm5,%xmm0 aesenclast %xmm4,%xmm0 movdqa %xmm2,%xmm3 pslldq \$4,%xmm2 pxor %xmm2,%xmm3 pslldq \$4,%xmm2 pxor %xmm2,%xmm3 pslldq \$4,%xmm2 pxor %xmm3,%xmm2 pxor %xmm2,%xmm0 movdqu %xmm0,16(%rax) mov $bits,96(%rax) # 240($key) xor %eax,%eax jmp .Lenc_key_ret_alt # 192-bit key support was removed. .align 16 .L14rounds_alt: movups 16($inp),%xmm2 # remaining half of *userKey mov \$13,$bits # 14 rounds for 256 lea 16(%rax),%rax movdqa .Lkey_rotate(%rip),%xmm5 movdqa .Lkey_rcon1(%rip),%xmm4 mov \$7,%r10d movdqu %xmm0,0($key) movdqa %xmm2,%xmm1 movdqu %xmm2,16($key) jmp .Loop_key256 .align 16 .Loop_key256: pshufb %xmm5,%xmm2 aesenclast %xmm4,%xmm2 movdqa %xmm0,%xmm3 pslldq \$4,%xmm0 pxor %xmm0,%xmm3 pslldq \$4,%xmm0 pxor %xmm0,%xmm3 pslldq \$4,%xmm0 pxor %xmm3,%xmm0 pslld \$1,%xmm4 pxor %xmm2,%xmm0 movdqu %xmm0,(%rax) dec %r10d jz .Ldone_key256 pshufd \$0xff,%xmm0,%xmm2 pxor %xmm3,%xmm3 aesenclast %xmm3,%xmm2 movdqa %xmm1,%xmm3 pslldq \$4,%xmm1 pxor %xmm1,%xmm3 pslldq \$4,%xmm1 pxor %xmm1,%xmm3 pslldq \$4,%xmm1 pxor %xmm3,%xmm1 pxor %xmm1,%xmm2 movdqu %xmm2,16(%rax) lea 32(%rax),%rax movdqa %xmm2,%xmm1 jmp .Loop_key256 .Ldone_key256: mov $bits,16(%rax) # 240($key) xor %eax,%eax jmp .Lenc_key_ret_alt .align 16 .Lbad_keybits_alt: mov \$-2,%rax .Lenc_key_ret_alt: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 pxor %xmm3,%xmm3 pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 add \$8,%rsp .cfi_adjust_cfa_offset -8 ret .cfi_endproc .seh_endproc .size ${PREFIX}_set_encrypt_key_alt,.-${PREFIX}_set_encrypt_key_alt ___ } $code.=<<___; .section .rodata .align 64 .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 .Lincrement32: .long 6,6,6,0 .Lincrement64: .long 1,0,0,0 .Lincrement1: .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 .Lkey_rotate: .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d .Lkey_rotate192: .long 0x04070605,0x04070605,0x04070605,0x04070605 .Lkey_rcon1: .long 1,1,1,1 .Lkey_rcon1b: .long 0x1b,0x1b,0x1b,0x1b .asciz "AES for Intel AES-NI, CRYPTOGAMS by " .align 64 .text ___ # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) if ($win64) { $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind ___ $code.=<<___ if ($PREFIX eq "aes_hw"); .type ctr_xts_se_handler,\@abi-omnipotent .align 16 ctr_xts_se_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # prologue lable cmp %r10,%rbx # context->RipRsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 
# epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail mov 208($context),%rax # pull context->R11 lea -0xa8(%rax),%rsi # %xmm save area lea 512($context),%rdi # & context.Xmm6 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) .long 0xa548f3fc # cld; rep movsq mov -8(%rax),%rbp # restore saved %rbp mov %rbp,160($context) # restore context->Rbp .Lcommon_seh_tail: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size ctr_xts_se_handler,.-ctr_xts_se_handler .section .pdata .align 4 ___ $code.=<<___ if ($PREFIX eq "aes_hw"); .rva .LSEH_begin_${PREFIX}_ctr32_encrypt_blocks .rva .LSEH_end_${PREFIX}_ctr32_encrypt_blocks .rva .LSEH_info_ctr32 ___ $code.=<<___; .section .xdata .align 8 ___ $code.=<<___ if ($PREFIX eq "aes_hw"); .LSEH_info_ctr32: .byte 9,0,0,0 .rva ctr_xts_se_handler .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[] ___ } sub rex { local *opcode=shift; my ($dst,$src)=@_; my $rex=0; $rex|=0x04 if($dst>=8); $rex|=0x01 if($src>=8); push @opcode,$rex|0x40 if($rex); } sub aesni { my $line=shift; my @opcode=(0x66); if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { rex(\@opcode,$4,$3); push @opcode,0x0f,0x3a,0xdf; push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M my $c=$2; push @opcode,$c=~/^0/?oct($c):$c; return ".byte\t".join(',',@opcode); } elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) { my %opcodelet = ( "aesimc" => 0xdb, "aesenc" => 0xdc, "aesenclast" => 0xdd, "aesdec" => 0xde, "aesdeclast" => 0xdf ); return undef if (!defined($opcodelet{$1})); rex(\@opcode,$3,$2); push @opcode,0x0f,0x38,$opcodelet{$1}; push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M return ".byte\t".join(',',@opcode); } elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) { my %opcodelet = ( "aesenc" => 0xdc, "aesenclast" => 0xdd, "aesdec" => 0xde, "aesdeclast" => 0xdf ); return undef if (!defined($opcodelet{$1})); my $off = $2; push @opcode,0x44 if ($3>=8); push @opcode,0x0f,0x38,$opcodelet{$1}; push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M push @opcode,($off=~/^0/?oct($off):$off)&0xff; return ".byte\t".join(',',@opcode); } return $line; } sub movbe { ".byte 0x0f,0x38,0xf1,0x44,0x24,".shift; } $code =~ s/\`([^\`]*)\`/eval($1)/gem; $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem; #$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact $code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem; print $code; close STDOUT or die "error closing STDOUT: $!"; ring-0.17.14/crypto/fipsmodule/aes/asm/aesv8-armx.pl000064400000000000000000000347431046102023000203170ustar 00000000000000#! /usr/bin/env perl # Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. # ==================================================================== # # This module implements support for ARMv8 AES instructions. The # module is endian-agnostic in sense that it supports both big- and # little-endian cases. As does it support both 32- and 64-bit modes # of operation. Latter is achieved by limiting amount of utilized # registers to 16, which implies additional NEON load and integer # instructions. This has no effect on mighty Apple A7, where results # are literally equal to the theoretical estimates based on AES # instruction latencies and issue rates. On Cortex-A53, an in-order # execution core, this costs up to 10-15%, which is partially # compensated by implementing dedicated code path for 128-bit # CBC encrypt case. On Cortex-A57 parallelizable mode performance # seems to be limited by sheer amount of NEON instructions... # # Performance in cycles per byte processed with 128-bit key: # # CBC enc CBC dec CTR # Apple A7 2.39 1.20 1.20 # Cortex-A53 1.32 1.29 1.46 # Cortex-A57(*) 1.95 0.85 0.93 # Denver 1.96 0.86 0.80 # Mongoose 1.33 1.20 1.20 # # (*) original 3.64/1.34/1.32 results were for r0p0 revision # and are still same even for updated module; $flavour = shift; $output = shift; $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; $prefix="aes_hw"; $code=<<___; #if __ARM_MAX_ARCH__>=7 .text ___ $code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/); $code.=<<___ if ($flavour !~ /64/); .arch armv7-a // don't confuse not-so-latest binutils with argv8 :-) .fpu neon .code 32 #undef __thumb2__ ___ # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax, # NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to # maintain both 32- and 64-bit codes within single module and # transliterate common code to either flavour with regex vodoo. # {{{ my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12"); my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)= $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10)); # On AArch64, put the data .rodata and use adrp + add for compatibility with # execute-only memory. On AArch32, put it in .text and use adr. $code.= ".section .rodata\n" if ($flavour =~ /64/); $code.=<<___; .align 5 .Lrcon: .long 0x01,0x01,0x01,0x01 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat .long 0x1b,0x1b,0x1b,0x1b .text .globl ${prefix}_set_encrypt_key .type ${prefix}_set_encrypt_key,%function .align 5 ${prefix}_set_encrypt_key: .Lenc_key: ___ $code.=<<___ if ($flavour =~ /64/); // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. AARCH64_VALID_CALL_TARGET stp x29,x30,[sp,#-16]! 
add x29,sp,#0 ___ $code.=<<___; mov $ptr,#-2 cmp $bits,#128 b.lt .Lenc_key_abort cmp $bits,#256 b.gt .Lenc_key_abort tst $bits,#0x3f b.ne .Lenc_key_abort ___ $code.=<<___ if ($flavour =~ /64/); adrp $ptr,:pg_hi21:.Lrcon add $ptr,$ptr,:lo12:.Lrcon ___ $code.=<<___ if ($flavour !~ /64/); adr $ptr,.Lrcon ___ $code.=<<___; cmp $bits,#192 veor $zero,$zero,$zero vld1.8 {$in0},[$inp],#16 mov $bits,#8 // reuse $bits vld1.32 {$rcon,$mask},[$ptr],#32 b.lt .Loop128 // 192-bit key support was removed. b .L256 .align 4 .Loop128: vtbl.8 $key,{$in0},$mask vext.8 $tmp,$zero,$in0,#12 vst1.32 {$in0},[$out],#16 aese $key,$zero subs $bits,$bits,#1 veor $in0,$in0,$tmp vext.8 $tmp,$zero,$tmp,#12 veor $in0,$in0,$tmp vext.8 $tmp,$zero,$tmp,#12 veor $key,$key,$rcon veor $in0,$in0,$tmp vshl.u8 $rcon,$rcon,#1 veor $in0,$in0,$key b.ne .Loop128 vld1.32 {$rcon},[$ptr] vtbl.8 $key,{$in0},$mask vext.8 $tmp,$zero,$in0,#12 vst1.32 {$in0},[$out],#16 aese $key,$zero veor $in0,$in0,$tmp vext.8 $tmp,$zero,$tmp,#12 veor $in0,$in0,$tmp vext.8 $tmp,$zero,$tmp,#12 veor $key,$key,$rcon veor $in0,$in0,$tmp vshl.u8 $rcon,$rcon,#1 veor $in0,$in0,$key vtbl.8 $key,{$in0},$mask vext.8 $tmp,$zero,$in0,#12 vst1.32 {$in0},[$out],#16 aese $key,$zero veor $in0,$in0,$tmp vext.8 $tmp,$zero,$tmp,#12 veor $in0,$in0,$tmp vext.8 $tmp,$zero,$tmp,#12 veor $key,$key,$rcon veor $in0,$in0,$tmp veor $in0,$in0,$key vst1.32 {$in0},[$out] add $out,$out,#0x50 mov $rounds,#10 b .Ldone // 192-bit key support was removed. .align 4 .L256: vld1.8 {$in1},[$inp] mov $bits,#7 mov $rounds,#14 vst1.32 {$in0},[$out],#16 .Loop256: vtbl.8 $key,{$in1},$mask vext.8 $tmp,$zero,$in0,#12 vst1.32 {$in1},[$out],#16 aese $key,$zero subs $bits,$bits,#1 veor $in0,$in0,$tmp vext.8 $tmp,$zero,$tmp,#12 veor $in0,$in0,$tmp vext.8 $tmp,$zero,$tmp,#12 veor $key,$key,$rcon veor $in0,$in0,$tmp vshl.u8 $rcon,$rcon,#1 veor $in0,$in0,$key vst1.32 {$in0},[$out],#16 b.eq .Ldone vdup.32 $key,${in0}[3] // just splat vext.8 $tmp,$zero,$in1,#12 aese $key,$zero veor $in1,$in1,$tmp vext.8 $tmp,$zero,$tmp,#12 veor $in1,$in1,$tmp vext.8 $tmp,$zero,$tmp,#12 veor $in1,$in1,$tmp veor $in1,$in1,$key b .Loop256 .Ldone: str $rounds,[$out] mov $ptr,#0 .Lenc_key_abort: mov x0,$ptr // return value `"ldr x29,[sp],#16" if ($flavour =~ /64/)` ret .size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key ___ }}} {{{ my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my ($rounds,$cnt,$key_)=("w5","w6","x7"); my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12)); my $step="x12"; # aliases with $tctr2 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); my ($dat,$tmp)=($dat0,$tmp0); ### q8-q15 preloaded key schedule $code.=<<___; .globl ${prefix}_ctr32_encrypt_blocks .type ${prefix}_ctr32_encrypt_blocks,%function .align 5 ${prefix}_ctr32_encrypt_blocks: ___ $code.=<<___ if ($flavour =~ /64/); // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. AARCH64_VALID_CALL_TARGET stp x29,x30,[sp,#-16]! add x29,sp,#0 ___ $code.=<<___ if ($flavour !~ /64/); mov ip,sp stmdb sp!,{r4-r10,lr} vstmdb sp!,{d8-d15} @ ABI specification says so ldr r4, [ip] @ load remaining arg ___ $code.=<<___; ldr $rounds,[$key,#240] ldr $ctr, [$ivp, #12] vld1.32 {$dat0},[$ivp] vld1.32 {q8-q9},[$key] // load key schedule... 
sub $rounds,$rounds,#4 mov $step,#16 cmp $len,#2 add $key_,$key,x5,lsl#4 // pointer to last 5 round keys sub $rounds,$rounds,#2 vld1.32 {q12-q13},[$key_],#32 vld1.32 {q14-q15},[$key_],#32 vld1.32 {$rndlast},[$key_] add $key_,$key,#32 mov $cnt,$rounds cclr $step,lo // ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are // affected by silicon errata #1742098 [0] and #1655431 [1], // respectively, where the second instruction of an aese/aesmc // instruction pair may execute twice if an interrupt is taken right // after the first instruction consumes an input register of which a // single 32-bit lane has been updated the last time it was modified. // // This function uses a counter in one 32-bit lane. The vmov.32 lines // could write to $dat1 and $dat2 directly, but that trips this bugs. // We write to $ivec and copy to the final register as a workaround. // // [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice // [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice #ifndef __ARMEB__ rev $ctr, $ctr #endif add $tctr1, $ctr, #1 vorr $ivec,$dat0,$dat0 rev $tctr1, $tctr1 vmov.32 ${ivec}[3],$tctr1 add $ctr, $ctr, #2 vorr $dat1,$ivec,$ivec b.ls .Lctr32_tail rev $tctr2, $ctr vmov.32 ${ivec}[3],$tctr2 sub $len,$len,#3 // bias vorr $dat2,$ivec,$ivec b .Loop3x_ctr32 .align 4 .Loop3x_ctr32: aese $dat0,q8 aesmc $dat0,$dat0 aese $dat1,q8 aesmc $dat1,$dat1 aese $dat2,q8 aesmc $dat2,$dat2 vld1.32 {q8},[$key_],#16 subs $cnt,$cnt,#2 aese $dat0,q9 aesmc $dat0,$dat0 aese $dat1,q9 aesmc $dat1,$dat1 aese $dat2,q9 aesmc $dat2,$dat2 vld1.32 {q9},[$key_],#16 b.gt .Loop3x_ctr32 aese $dat0,q8 aesmc $tmp0,$dat0 aese $dat1,q8 aesmc $tmp1,$dat1 vld1.8 {$in0},[$inp],#16 add $tctr0,$ctr,#1 aese $dat2,q8 aesmc $dat2,$dat2 vld1.8 {$in1},[$inp],#16 rev $tctr0,$tctr0 aese $tmp0,q9 aesmc $tmp0,$tmp0 aese $tmp1,q9 aesmc $tmp1,$tmp1 vld1.8 {$in2},[$inp],#16 mov $key_,$key aese $dat2,q9 aesmc $tmp2,$dat2 aese $tmp0,q12 aesmc $tmp0,$tmp0 aese $tmp1,q12 aesmc $tmp1,$tmp1 veor $in0,$in0,$rndlast add $tctr1,$ctr,#2 aese $tmp2,q12 aesmc $tmp2,$tmp2 veor $in1,$in1,$rndlast add $ctr,$ctr,#3 aese $tmp0,q13 aesmc $tmp0,$tmp0 aese $tmp1,q13 aesmc $tmp1,$tmp1 // Note the logic to update $dat0, $dat1, and $dat1 is written to work // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in // 32-bit mode. See the comment above. 
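// A hedged illustration of that workaround (comment only; it adds no
// instructions).  The direct form would be a single-lane write straight into
// $dat1 or $dat2, after which an aese/aesmc pair reading that register could
// hit the errata.  Instead the loop always goes through the scratch counter
// register, as in the lines above:
//
//	vmov.32	${ivec}[3],$tctr1	// update only the counter lane
//	vorr	$dat1,$ivec,$ivec	// full-register copy into the AES input
//
// so no register fed to aese has a single 32-bit lane as its most recent
// write.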
veor $in2,$in2,$rndlast vmov.32 ${ivec}[3], $tctr0 aese $tmp2,q13 aesmc $tmp2,$tmp2 vorr $dat0,$ivec,$ivec rev $tctr1,$tctr1 aese $tmp0,q14 aesmc $tmp0,$tmp0 vmov.32 ${ivec}[3], $tctr1 rev $tctr2,$ctr aese $tmp1,q14 aesmc $tmp1,$tmp1 vorr $dat1,$ivec,$ivec vmov.32 ${ivec}[3], $tctr2 aese $tmp2,q14 aesmc $tmp2,$tmp2 vorr $dat2,$ivec,$ivec subs $len,$len,#3 aese $tmp0,q15 aese $tmp1,q15 aese $tmp2,q15 veor $in0,$in0,$tmp0 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] vst1.8 {$in0},[$out],#16 veor $in1,$in1,$tmp1 mov $cnt,$rounds vst1.8 {$in1},[$out],#16 veor $in2,$in2,$tmp2 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] vst1.8 {$in2},[$out],#16 b.hs .Loop3x_ctr32 adds $len,$len,#3 b.eq .Lctr32_done cmp $len,#1 mov $step,#16 cclr $step,eq .Lctr32_tail: aese $dat0,q8 aesmc $dat0,$dat0 aese $dat1,q8 aesmc $dat1,$dat1 vld1.32 {q8},[$key_],#16 subs $cnt,$cnt,#2 aese $dat0,q9 aesmc $dat0,$dat0 aese $dat1,q9 aesmc $dat1,$dat1 vld1.32 {q9},[$key_],#16 b.gt .Lctr32_tail aese $dat0,q8 aesmc $dat0,$dat0 aese $dat1,q8 aesmc $dat1,$dat1 aese $dat0,q9 aesmc $dat0,$dat0 aese $dat1,q9 aesmc $dat1,$dat1 vld1.8 {$in0},[$inp],$step aese $dat0,q12 aesmc $dat0,$dat0 aese $dat1,q12 aesmc $dat1,$dat1 vld1.8 {$in1},[$inp] aese $dat0,q13 aesmc $dat0,$dat0 aese $dat1,q13 aesmc $dat1,$dat1 veor $in0,$in0,$rndlast aese $dat0,q14 aesmc $dat0,$dat0 aese $dat1,q14 aesmc $dat1,$dat1 veor $in1,$in1,$rndlast aese $dat0,q15 aese $dat1,q15 cmp $len,#1 veor $in0,$in0,$dat0 veor $in1,$in1,$dat1 vst1.8 {$in0},[$out],#16 b.eq .Lctr32_done vst1.8 {$in1},[$out] .Lctr32_done: ___ $code.=<<___ if ($flavour !~ /64/); vldmia sp!,{d8-d15} ldmia sp!,{r4-r10,pc} ___ $code.=<<___ if ($flavour =~ /64/); ldr x29,[sp],#16 ret ___ $code.=<<___; .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks ___ }}} $code.=<<___; #endif ___ ######################################## if ($flavour =~ /64/) { ######## 64-bit code my %opcode = ( "aesd" => 0x4e285800, "aese" => 0x4e284800, "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 ); local *unaes = sub { my ($mnemonic,$arg)=@_; $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o && sprintf ".inst\t0x%08x\t//%s %s", $opcode{$mnemonic}|$1|($2<<5), $mnemonic,$arg; }; foreach(split("\n",$code)) { s/\`([^\`]*)\`/eval($1)/geo; s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers s/@\s/\/\//o; # old->new style commentary #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or s/vmov\.i8/movi/o or # fix up legacy mnemonics s/vext\.8/ext/o or s/vrev32\.8/rev32/o or s/vtst\.8/cmtst/o or s/vshr/ushr/o or s/^(\s+)v/$1/o or # strip off v prefix s/\bbx\s+lr\b/ret/o; # fix up remaining legacy suffixes s/\.[ui]?8//o; m/\],#8/o and s/\.16b/\.8b/go; s/\.[ui]?32//o and s/\.16b/\.4s/go; s/\.[ui]?64//o and s/\.16b/\.2d/go; s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o; # Switch preprocessor checks to aarch64 versions. s/__ARME([BL])__/__AARCH64E$1__/go; print $_,"\n"; } } else { ######## 32-bit code my %opcode = ( "aesd" => 0xf3b00340, "aese" => 0xf3b00300, "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 ); local *unaes = sub { my ($mnemonic,$arg)=@_; if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) { my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19) |(($2&7)<<1) |(($2&8)<<2); # since ARMv7 instructions are always encoded little-endian. 
# correct solution is to use .inst directive, but older # assemblers don't implement it:-( sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", $word&0xff,($word>>8)&0xff, ($word>>16)&0xff,($word>>24)&0xff, $mnemonic,$arg; } }; sub unvtbl { my $arg=shift; $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o && sprintf "vtbl.8 d%d,{q%d},d%d\n\t". "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1; } sub unvdup32 { my $arg=shift; $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o && sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1; } sub unvmov32 { my $arg=shift; $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o && sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3; } foreach(split("\n",$code)) { s/\`([^\`]*)\`/eval($1)/geo; s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers s/\/\/\s?/@ /o; # new->old style commentary # fix up remaining new-style suffixes s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or s/\],#[0-9]+/]!/o; s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or s/vtbl\.8\s+(.*)/unvtbl($1)/geo or s/vdup\.32\s+(.*)/unvdup32($1)/geo or s/vmov\.32\s+(.*)/unvmov32($1)/geo or s/^(\s+)b\./$1b/o or s/^(\s+)mov\./$1mov/o or s/^(\s+)ret/$1bx\tlr/o; print $_,"\n"; } } close STDOUT or die "error closing STDOUT: $!"; ring-0.17.14/crypto/fipsmodule/aes/asm/aesv8-gcm-armv8.pl000064400000000000000000003156141046102023000211500ustar 00000000000000#! /usr/bin/env perl # Copyright (c) 2022, ARM Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. #======================================================================== # Written by Fangming Fang for the OpenSSL project, # derived from https://github.com/ARM-software/AArch64cryptolib, original # author Samuel Lee . 
#======================================================================== # # Approach - assume we don't want to reload constants, so reserve ~half of # vector register file for constants # # main loop to act on 4 16B blocks per iteration, and then do modulo of the # accumulated intermediate hashes from the 4 blocks # # ____________________________________________________ # | | # | PRE | # |____________________________________________________| # | | | | # | CTR block 4k+8 | AES block 4k+4 | GHASH block 4k+0 | # |________________|________________|__________________| # | | | | # | CTR block 4k+9 | AES block 4k+5 | GHASH block 4k+1 | # |________________|________________|__________________| # | | | | # | CTR block 4k+10| AES block 4k+6 | GHASH block 4k+2 | # |________________|________________|__________________| # | | | | # | CTR block 4k+11| AES block 4k+7 | GHASH block 4k+3 | # |________________|____(mostly)____|__________________| # | | # | MODULO | # |____________________________________________________| # # PRE: Ensure previous generated intermediate hash is aligned and merged with # result for GHASH 4k+0 # # EXT low_acc, low_acc, low_acc, #8 # EOR res_curr (4k+0), res_curr (4k+0), low_acc # # CTR block: Increment and byte reverse counter in scalar registers and transfer # to SIMD registers # # REV ctr32, rev_ctr32 # ORR ctr64, constctr96_top32, ctr32, LSL #32 # // Keeping this in scalar registers to free up space in SIMD RF # INS ctr_next.d[0], constctr96_bottom64 # INS ctr_next.d[1], ctr64X # ADD rev_ctr32, #1 # # AES block: # # Do AES encryption/decryption on CTR block X and EOR it with input block X. # Take 256 bytes key below for example. Doing small trick here of loading input # in scalar registers, EORing with last key and then transferring Given we are # very constrained in our ASIMD registers this is quite important # # Encrypt: # LDR input_low, [ input_ptr ], #8 # LDR input_high, [ input_ptr ], #8 # EOR input_low, k14_low # EOR input_high, k14_high # INS res_curr.d[0], input_low # INS res_curr.d[1], input_high # AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k13 # EOR res_curr, res_curr, ctr_curr # ST1 { res_curr.16b }, [ output_ptr ], #16 # # Decrypt: # AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k13 # LDR res_curr, [ input_ptr ], #16 # EOR res_curr, res_curr, ctr_curr # MOV output_low, res_curr.d[0] # MOV output_high, res_curr.d[1] # EOR output_low, k14_low 
# EOR output_high, k14_high # STP output_low, output_high, [ output_ptr ], #16 # # GHASH block X: # Do 128b karatsuba polynomial multiplication on block. We only have # 64b->128b polynomial multipliers, naively that means we need to do 4 64b # multiplies to generate a 128b. # # multiplication: # Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ # (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64 # # The idea behind Karatsuba multiplication is that we can do just 3 64b # multiplies: # Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ # (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ # Pmull(Al,Bl))<<64 # # There is some complication here because the bit order of GHASH's PMULL is # reversed compared to elsewhere, so we are multiplying with "twisted" # powers of H # # Note: We can PMULL directly into the acc_x in first GHASH of the loop # # Note: For scheduling big cores we want to split the processing to happen over # two loop iterations - otherwise the critical path latency dominates the # performance. # # This has a knock on effect on register pressure, so we have to be a bit # more clever with our temporary registers than indicated here # # REV64 res_curr, res_curr # INS t_m.d[0], res_curr.d[1] # EOR t_m.8B, t_m.8B, res_curr.8B # PMULL2 t_h, res_curr, HX # PMULL t_l, res_curr, HX # PMULL t_m, t_m, HX_k # EOR acc_h, acc_h, t_h # EOR acc_l, acc_l, t_l # EOR acc_m, acc_m, t_m # # MODULO: take the partial accumulators (~representing sum of 256b # multiplication results), from GHASH and do modulo reduction on them # There is some complication here because the bit order of GHASH's # PMULL is reversed compared to elsewhere, so we are doing modulo with # a reversed constant # # EOR acc_m, acc_m, acc_h # EOR acc_m, acc_m, acc_l // Finish off karatsuba processing # PMULL t_mod, acc_h, mod_constant # EXT acc_h, acc_h, acc_h, #8 # EOR acc_m, acc_m, acc_h # EOR acc_m, acc_m, t_mod # PMULL acc_h, acc_m, mod_constant # EXT acc_m, acc_m, acc_m, #8 # EOR acc_l, acc_l, acc_h # EOR acc_l, acc_l, acc_m # # This code was then modified to merge the AES-128-GCM, AES-192-GCM, and # AES-256-GCM implementations into a single function to reduce size. We move the # last two round keys into consistent registers across all sizes, as they're # treated special. Then, after rounds 0 through 8, we added some branches to # conditionally run rounds 9-10 (AES-192 + AES-256) and 11-12 (AES-256), before # merging back into code which finishes up the last two rounds. # # There is a mostly decision to be made around how much parallel work goes # before or after the conditional part. We attempted to preserve the original # scheduling where possible, but it's possible other schedulings are more # optimal with the current ordering. 
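#
# A hedged recap of the GHASH arithmetic described above (comment-only
# sketch; clmul64() is a hypothetical name for one 64x64->128-bit carry-less
# multiply, i.e. a single PMULL or PMULL2, and is not a symbol in this file).
# Splitting A = Ah:Al and B = Bh:Bl into 64-bit halves over GF(2)[x]:
#
#     ph = clmul64(ah, bh);              // PMULL2 - contribution << 128
#     pl = clmul64(al, bl);              // PMULL  - contribution << 0
#     pm = clmul64(ah ^ al, bh ^ bl);    // PMULL  - Karatsuba middle term
#     pm ^= ph ^ pl;                     // "karatsuba tidy up"
#     // A*B = ph<<128 ^ pm<<64 ^ pl
#
# so each GHASH block costs three 64-bit carry-less multiplies instead of
# four; acc_h, acc_m and acc_l accumulate ph, pm and pl across the four
# blocks of an iteration.  The MODULO step then folds the 256-bit sum back
# below 2^128 with two further multiplies by the reversed constant
# (movi 0xc2, shl #56), which is how the GCM reduction polynomial
# x^128 + x^7 + x^2 + x + 1 is applied in GHASH's bit-reflected order.
#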
$flavour = shift; $output = shift; $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open OUT,"| \"$^X\" $xlate $flavour $output"; *STDOUT=*OUT; $code=<<___; #if __ARM_MAX_ARCH__ >= 8 .arch armv8-a+crypto .text ___ $input_ptr="x0"; #argument block $bit_length="x1"; $output_ptr="x2"; $current_tag="x3"; $Htable="x6"; $counter="x16"; $cc="x8"; { my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7)); my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24)); my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24)); my ($output_l0,$output_h0)=map("x$_",(6..7)); # rkN_l and rkN_h store the final round key, which is handled slightly # differently because it is EORed through general-purpose registers. my $ctr32w="w9"; my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rkN_l,$rkN_h,$len)=map("x$_",(9..15)); my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12)); my $rounds="x17"; my $roundsw="w17"; my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7)); my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7)); my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7)); my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7)); my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11)); my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11)); my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11)); my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17)); my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15)); my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15)); my $t0="v8"; my $t0d="d8"; my $t1="v4"; my $t1d="d4"; my $t2="v8"; my $t2d="d8"; my $t3="v4"; my $t3d="d4"; my $t4="v4"; my $t4d="d4"; my $t5="v5"; my $t5d="d5"; my $t6="v8"; my $t6d="d8"; my $t7="v5"; my $t7d="d5"; my $t8="v6"; my $t8d="d6"; my $t9="v4"; my $t9d="d4"; my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7)); my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7)); my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7)); my $mod_constantd="d8"; my $mod_constant="v8"; my $mod_t="v7"; # rkNm1 stores the second-to-last round key, which is handled slightly # differently because it uses plain AESE instead of an AESE + AESMC macro-op. my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11,$rk12,$rkNm1)=map("v$_.16b",(18..31)); my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q,$rk12q,$rkNm1q)=map("q$_",(18..31)); my $rk2q1="v20.1q"; my $rk3q1="v21.1q"; my $rk4v="v22"; my $rk4d="d22"; ################################################################################ # size_t aes_gcm_enc_kernel(const uint8_t *in, # size_t len_bits, # uint8_t *out, # u64 *Xi, # uint8_t ivec[16], # const void *key, # const void *Htable); # $code.=<<___; .global aes_gcm_enc_kernel .type aes_gcm_enc_kernel,%function .align 4 aes_gcm_enc_kernel: AARCH64_SIGN_LINK_REGISTER stp x29, x30, [sp, #-128]! 
mov x29, sp stp x19, x20, [sp, #16] mov $counter, x4 mov $cc, x5 stp x21, x22, [sp, #32] stp x23, x24, [sp, #48] stp d8, d9, [sp, #64] stp d10, d11, [sp, #80] stp d12, d13, [sp, #96] stp d14, d15, [sp, #112] ldr $roundsw, [$cc, #240] add $input_l1, $cc, $rounds, lsl #4 // borrow input_l1 for last key ldp $rkN_l, $rkN_h, [$input_l1] // load round N keys ldr $rkNm1q, [$input_l1, #-16] // load round N-1 keys add $end_input_ptr, $input_ptr, $bit_length, lsr #3 // end_input_ptr lsr $main_end_input_ptr, $bit_length, #3 // byte_len mov $len, $main_end_input_ptr ldp $ctr96_b64x, $ctr96_t32x, [$counter] // ctr96_b64, ctr96_t32 ld1 { $ctr0b}, [$counter] // special case vector load initial counter so we can start first AES block as quickly as possible sub $main_end_input_ptr, $main_end_input_ptr, #1 // byte_len - 1 ldr $rk0q, [$cc, #0] // load rk0 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) ldr $rk7q, [$cc, #112] // load rk7 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr lsr $rctr32x, $ctr96_t32x, #32 fmov $ctr2d, $ctr96_b64x // CTR block 2 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w rev $rctr32w, $rctr32w // rev_ctr32 fmov $ctr1d, $ctr96_b64x // CTR block 1 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 0 add $rctr32w, $rctr32w, #1 // increment rev_ctr32 rev $ctr32w, $rctr32w // CTR block 1 fmov $ctr3d, $ctr96_b64x // CTR block 3 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 1 add $rctr32w, $rctr32w, #1 // CTR block 1 ldr $rk1q, [$cc, #16] // load rk1 fmov $ctr1.d[1], $ctr32x // CTR block 1 rev $ctr32w, $rctr32w // CTR block 2 add $rctr32w, $rctr32w, #1 // CTR block 2 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 2 ldr $rk2q, [$cc, #32] // load rk2 fmov $ctr2.d[1], $ctr32x // CTR block 2 rev $ctr32w, $rctr32w // CTR block 3 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 1 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 3 fmov $ctr3.d[1], $ctr32x // CTR block 3 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 0 ldr $rk3q, [$cc, #48] // load rk3 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 2 ldr $rk6q, [$cc, #96] // load rk6 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 0 ldr $rk5q, [$cc, #80] // load rk5 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 1 ldr $h3q, [$Htable, #48] // load h3l | h3h ext $h3b, $h3b, $h3b, #8 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 0 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 1 ldr $rk4q, [$cc, #64] // load rk4 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 2 ldr $h2q, [$Htable, #32] // load h2l | h2h ext $h2b, $h2b, $h2b, #8 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 1 ldr $rk12q, [$cc, #192] // load rk12 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 2 ldr $h4q, [$Htable, #80] // load h4l | h4h ext $h4b, $h4b, $h4b, #8 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 3 ldr $rk11q, [$cc, #176] // load rk11 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 2 ldr $rk8q, [$cc, #128] // load rk8 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 3 add $rctr32w, $rctr32w, #1 // CTR block 3 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 3 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 3 ld1 { $acc_lb}, [$current_tag] ext $acc_lb, $acc_lb, 
$acc_lb, #8 rev64 $acc_lb, $acc_lb aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 4 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 4 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 4 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 4 cmp $rounds, #12 // setup flags for AES-128/192/256 check aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 5 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 5 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 5 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 5 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 6 trn2 $h34k.2d, $h3.2d, $h4.2d // h4l | h3l aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 6 ldr $rk9q, [$cc, #144] // load rk9 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 6 ldr $h1q, [$Htable] // load h1l | h1h ext $h1b, $h1b, $h1b, #8 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 6 ldr $rk10q, [$cc, #160] // load rk10 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 7 trn1 $acc_h.2d, $h3.2d, $h4.2d // h4h | h3h aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 7 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 7 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 7 trn2 $h12k.2d, $h1.2d, $h2.2d // h2l | h1l aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 8 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 8 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 8 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 8 b.lt .Lenc_finish_first_blocks // branch if AES-128 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 9 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 9 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 9 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 9 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 10 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 10 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 10 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 10 b.eq .Lenc_finish_first_blocks // branch if AES-192 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 11 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 11 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 11 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 11 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 12 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 12 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 12 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 12 .Lenc_finish_first_blocks: cmp $input_ptr, $main_end_input_ptr // check if we have <= 4 blocks eor $h34k.16b, $h34k.16b, $acc_h.16b // h4k | h3k aese $ctr2b, $rkNm1 // AES block 2 - round N-1 trn1 $t0.2d, $h1.2d, $h2.2d // h2h | h1h aese $ctr1b, $rkNm1 // AES block 1 - round N-1 aese $ctr0b, $rkNm1 // AES block 0 - round N-1 aese $ctr3b, $rkNm1 // AES block 3 - round N-1 eor $h12k.16b, $h12k.16b, $t0.16b // h2k | h1k b.ge .Lenc_tail // handle tail ldp $input_l1, $input_h1, [$input_ptr, #16] // AES block 1 - load plaintext rev $ctr32w, $rctr32w // CTR block 4 ldp $input_l0, $input_h0, [$input_ptr, #0] // AES block 0 - load plaintext ldp $input_l3, 
$input_h3, [$input_ptr, #48] // AES block 3 - load plaintext ldp $input_l2, $input_h2, [$input_ptr, #32] // AES block 2 - load plaintext add $input_ptr, $input_ptr, #64 // AES input_ptr update eor $input_l1, $input_l1, $rkN_l // AES block 1 - round N low eor $input_h1, $input_h1, $rkN_h // AES block 1 - round N high fmov $ctr_t1d, $input_l1 // AES block 1 - mov low eor $input_l0, $input_l0, $rkN_l // AES block 0 - round N low eor $input_h0, $input_h0, $rkN_h // AES block 0 - round N high eor $input_h3, $input_h3, $rkN_h // AES block 3 - round N high fmov $ctr_t0d, $input_l0 // AES block 0 - mov low cmp $input_ptr, $main_end_input_ptr // check if we have <= 8 blocks fmov $ctr_t0.d[1], $input_h0 // AES block 0 - mov high eor $input_l3, $input_l3, $rkN_l // AES block 3 - round N low eor $input_l2, $input_l2, $rkN_l // AES block 2 - round N low fmov $ctr_t1.d[1], $input_h1 // AES block 1 - mov high fmov $ctr_t2d, $input_l2 // AES block 2 - mov low add $rctr32w, $rctr32w, #1 // CTR block 4 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4 fmov $ctr_t3d, $input_l3 // AES block 3 - mov low eor $input_h2, $input_h2, $rkN_h // AES block 2 - round N high fmov $ctr_t2.d[1], $input_h2 // AES block 2 - mov high eor $res0b, $ctr_t0b, $ctr0b // AES block 0 - result fmov $ctr0d, $ctr96_b64x // CTR block 4 fmov $ctr0.d[1], $ctr32x // CTR block 4 rev $ctr32w, $rctr32w // CTR block 5 add $rctr32w, $rctr32w, #1 // CTR block 5 eor $res1b, $ctr_t1b, $ctr1b // AES block 1 - result fmov $ctr1d, $ctr96_b64x // CTR block 5 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 5 fmov $ctr1.d[1], $ctr32x // CTR block 5 rev $ctr32w, $rctr32w // CTR block 6 st1 { $res0b}, [$output_ptr], #16 // AES block 0 - store result fmov $ctr_t3.d[1], $input_h3 // AES block 3 - mov high orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 6 eor $res2b, $ctr_t2b, $ctr2b // AES block 2 - result st1 { $res1b}, [$output_ptr], #16 // AES block 1 - store result add $rctr32w, $rctr32w, #1 // CTR block 6 fmov $ctr2d, $ctr96_b64x // CTR block 6 fmov $ctr2.d[1], $ctr32x // CTR block 6 st1 { $res2b}, [$output_ptr], #16 // AES block 2 - store result rev $ctr32w, $rctr32w // CTR block 7 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 7 eor $res3b, $ctr_t3b, $ctr3b // AES block 3 - result st1 { $res3b}, [$output_ptr], #16 // AES block 3 - store result b.ge .Lenc_prepretail // do prepretail .Lenc_main_loop: // main loop start aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 0 rev64 $res0b, $res0b // GHASH block 4k (only t0 is free) aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 0 fmov $ctr3d, $ctr96_b64x // CTR block 4k+3 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 0 ext $acc_lb, $acc_lb, $acc_lb, #8 // PRE 0 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 1 fmov $ctr3.d[1], $ctr32x // CTR block 4k+3 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 1 ldp $input_l3, $input_h3, [$input_ptr, #48] // AES block 4k+7 - load plaintext aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 1 ldp $input_l2, $input_h2, [$input_ptr, #32] // AES block 4k+6 - load plaintext aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 2 eor $res0b, $res0b, $acc_lb // PRE 1 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 2 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 0 eor $input_l3, $input_l3, $rkN_l // AES block 4k+7 - round N low aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b // 
AES block 4k+4 - round 3 mov $acc_md, $h34k.d[1] // GHASH block 4k - mid pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH block 4k - high eor $input_h2, $input_h2, $rkN_h // AES block 4k+6 - round N high mov $t0d, $res0.d[1] // GHASH block 4k - mid aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 1 rev64 $res1b, $res1b // GHASH block 4k+1 (t0 and t1 free) aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 4 pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH block 4k - low eor $t0.8b, $t0.8b, $res0.8b // GHASH block 4k - mid aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 2 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 5 rev64 $res3b, $res3b // GHASH block 4k+3 (t0, t1, t2 and t3 free) pmull2 $t1.1q, $res1.2d, $h3.2d // GHASH block 4k+1 - high pmull $acc_m.1q, $t0.1d, $acc_m.1d // GHASH block 4k - mid rev64 $res2b, $res2b // GHASH block 4k+2 (t0, t1, and t2 free) pmull $t2.1q, $res1.1d, $h3.1d // GHASH block 4k+1 - low eor $acc_hb, $acc_hb, $t1.16b // GHASH block 4k+1 - high mov $t3d, $res1.d[1] // GHASH block 4k+1 - mid aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 3 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 2 eor $acc_lb, $acc_lb, $t2.16b // GHASH block 4k+1 - low aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 3 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 4 mov $t6d, $res2.d[1] // GHASH block 4k+2 - mid aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 3 eor $t3.8b, $t3.8b, $res1.8b // GHASH block 4k+1 - mid aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 4 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 6 eor $t6.8b, $t6.8b, $res2.8b // GHASH block 4k+2 - mid aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 4 pmull $t3.1q, $t3.1d, $h34k.1d // GHASH block 4k+1 - mid aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 7 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 5 ins $t6.d[1], $t6.d[0] // GHASH block 4k+2 - mid aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 5 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 8 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 5 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 6 eor $acc_mb, $acc_mb, $t3.16b // GHASH block 4k+1 - mid pmull2 $t4.1q, $res2.2d, $h2.2d // GHASH block 4k+2 - high pmull $t5.1q, $res2.1d, $h2.1d // GHASH block 4k+2 - low aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 7 pmull $t8.1q, $res3.1d, $h1.1d // GHASH block 4k+3 - low eor $acc_hb, $acc_hb, $t4.16b // GHASH block 4k+2 - high aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 6 ldp $input_l1, $input_h1, [$input_ptr, #16] // AES block 4k+5 - load plaintext aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 8 mov $t9d, $res3.d[1] // GHASH block 4k+3 - mid aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 6 eor $acc_lb, $acc_lb, $t5.16b // GHASH block 4k+2 - low pmull2 $t6.1q, $t6.2d, $h12k.2d // GHASH block 4k+2 - mid pmull2 $t7.1q, $res3.2d, $h1.2d // GHASH block 4k+3 - high eor $t9.8b, $t9.8b, $res3.8b // GHASH block 4k+3 - mid aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 7 eor $input_l1, $input_l1, $rkN_l // AES block 4k+5 - round N low aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 8 eor $acc_mb, $acc_mb, $t6.16b 
// GHASH block 4k+2 - mid aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 7 eor $input_l2, $input_l2, $rkN_l // AES block 4k+6 - round N low aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 8 movi $mod_constant.8b, #0xc2 pmull $t9.1q, $t9.1d, $h12k.1d // GHASH block 4k+3 - mid eor $acc_hb, $acc_hb, $t7.16b // GHASH block 4k+3 - high cmp $rounds, #12 // setup flags for AES-128/192/256 check fmov $ctr_t1d, $input_l1 // AES block 4k+5 - mov low ldp $input_l0, $input_h0, [$input_ptr, #0] // AES block 4k+4 - load plaintext b.lt .Lenc_main_loop_continue // branch if AES-128 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 9 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 9 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 9 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 9 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 10 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 10 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 10 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 10 b.eq .Lenc_main_loop_continue // branch if AES-192 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 11 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 11 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 11 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 11 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 12 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 12 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 12 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 12 .Lenc_main_loop_continue: shl $mod_constantd, $mod_constantd, #56 // mod_constant eor $acc_lb, $acc_lb, $t8.16b // GHASH block 4k+3 - low eor $acc_mb, $acc_mb, $t9.16b // GHASH block 4k+3 - mid add $rctr32w, $rctr32w, #1 // CTR block 4k+3 eor $t9.16b, $acc_lb, $acc_hb // MODULO - karatsuba tidy up add $input_ptr, $input_ptr, #64 // AES input_ptr update pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid rev $ctr32w, $rctr32w // CTR block 4k+8 ext $acc_hb, $acc_hb, $acc_hb, #8 // MODULO - other top alignment eor $input_l0, $input_l0, $rkN_l // AES block 4k+4 - round N low eor $acc_mb, $acc_mb, $t9.16b // MODULO - karatsuba tidy up eor $input_h0, $input_h0, $rkN_h // AES block 4k+4 - round N high fmov $ctr_t0d, $input_l0 // AES block 4k+4 - mov low orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+8 eor $mod_t.16b, $acc_hb, $mod_t.16b // MODULO - fold into mid eor $input_h1, $input_h1, $rkN_h // AES block 4k+5 - round N high eor $input_h3, $input_h3, $rkN_h // AES block 4k+7 - round N high add $rctr32w, $rctr32w, #1 // CTR block 4k+8 aese $ctr0b, $rkNm1 // AES block 4k+4 - round N-1 fmov $ctr_t0.d[1], $input_h0 // AES block 4k+4 - mov high eor $acc_mb, $acc_mb, $mod_t.16b // MODULO - fold into mid fmov $ctr_t3d, $input_l3 // AES block 4k+7 - mov low aese $ctr1b, $rkNm1 // AES block 4k+5 - round N-1 fmov $ctr_t1.d[1], $input_h1 // AES block 4k+5 - mov high fmov $ctr_t2d, $input_l2 // AES block 4k+6 - mov low cmp $input_ptr, $main_end_input_ptr // LOOP CONTROL fmov $ctr_t2.d[1], $input_h2 // AES block 4k+6 - mov high pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low eor $res0b, $ctr_t0b, $ctr0b // AES block 4k+4 - result fmov $ctr0d, $ctr96_b64x // CTR 
block 4k+8 fmov $ctr0.d[1], $ctr32x // CTR block 4k+8 rev $ctr32w, $rctr32w // CTR block 4k+9 add $rctr32w, $rctr32w, #1 // CTR block 4k+9 eor $res1b, $ctr_t1b, $ctr1b // AES block 4k+5 - result fmov $ctr1d, $ctr96_b64x // CTR block 4k+9 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+9 fmov $ctr1.d[1], $ctr32x // CTR block 4k+9 aese $ctr2b, $rkNm1 // AES block 4k+6 - round N-1 rev $ctr32w, $rctr32w // CTR block 4k+10 st1 { $res0b}, [$output_ptr], #16 // AES block 4k+4 - store result orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+10 eor $acc_lb, $acc_lb, $acc_hb // MODULO - fold into low fmov $ctr_t3.d[1], $input_h3 // AES block 4k+7 - mov high ext $acc_mb, $acc_mb, $acc_mb, #8 // MODULO - other mid alignment st1 { $res1b}, [$output_ptr], #16 // AES block 4k+5 - store result add $rctr32w, $rctr32w, #1 // CTR block 4k+10 aese $ctr3b, $rkNm1 // AES block 4k+7 - round N-1 eor $res2b, $ctr_t2b, $ctr2b // AES block 4k+6 - result fmov $ctr2d, $ctr96_b64x // CTR block 4k+10 st1 { $res2b}, [$output_ptr], #16 // AES block 4k+6 - store result fmov $ctr2.d[1], $ctr32x // CTR block 4k+10 rev $ctr32w, $rctr32w // CTR block 4k+11 eor $acc_lb, $acc_lb, $acc_mb // MODULO - fold into low orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+11 eor $res3b, $ctr_t3b, $ctr3b // AES block 4k+7 - result st1 { $res3b}, [$output_ptr], #16 // AES block 4k+7 - store result b.lt .Lenc_main_loop .Lenc_prepretail: // PREPRETAIL aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 0 rev64 $res2b, $res2b // GHASH block 4k+2 (t0, t1, and t2 free) aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 0 fmov $ctr3d, $ctr96_b64x // CTR block 4k+3 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 0 rev64 $res0b, $res0b // GHASH block 4k (only t0 is free) fmov $ctr3.d[1], $ctr32x // CTR block 4k+3 ext $acc_lb, $acc_lb, $acc_lb, #8 // PRE 0 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 1 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 1 eor $res0b, $res0b, $acc_lb // PRE 1 rev64 $res1b, $res1b // GHASH block 4k+1 (t0 and t1 free) aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 2 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 0 mov $acc_md, $h34k.d[1] // GHASH block 4k - mid aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 1 pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH block 4k - low mov $t0d, $res0.d[1] // GHASH block 4k - mid pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH block 4k - high aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 3 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 2 eor $t0.8b, $t0.8b, $res0.8b // GHASH block 4k - mid aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 2 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 1 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 3 pmull $acc_m.1q, $t0.1d, $acc_m.1d // GHASH block 4k - mid pmull2 $t1.1q, $res1.2d, $h3.2d // GHASH block 4k+1 - high pmull $t2.1q, $res1.1d, $h3.1d // GHASH block 4k+1 - low aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 2 eor $acc_hb, $acc_hb, $t1.16b // GHASH block 4k+1 - high mov $t3d, $res1.d[1] // GHASH block 4k+1 - mid aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 3 eor $acc_lb, $acc_lb, $t2.16b // GHASH block 4k+1 - low aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 3 eor $t3.8b, $t3.8b, $res1.8b // GHASH 
block 4k+1 - mid mov $t6d, $res2.d[1] // GHASH block 4k+2 - mid aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 4 rev64 $res3b, $res3b // GHASH block 4k+3 (t0, t1, t2 and t3 free) aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 4 pmull $t3.1q, $t3.1d, $h34k.1d // GHASH block 4k+1 - mid eor $t6.8b, $t6.8b, $res2.8b // GHASH block 4k+2 - mid add $rctr32w, $rctr32w, #1 // CTR block 4k+3 pmull $t5.1q, $res2.1d, $h2.1d // GHASH block 4k+2 - low aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 5 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 4 eor $acc_mb, $acc_mb, $t3.16b // GHASH block 4k+1 - mid pmull2 $t4.1q, $res2.2d, $h2.2d // GHASH block 4k+2 - high eor $acc_lb, $acc_lb, $t5.16b // GHASH block 4k+2 - low ins $t6.d[1], $t6.d[0] // GHASH block 4k+2 - mid aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 5 eor $acc_hb, $acc_hb, $t4.16b // GHASH block 4k+2 - high mov $t9d, $res3.d[1] // GHASH block 4k+3 - mid aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 4 pmull2 $t6.1q, $t6.2d, $h12k.2d // GHASH block 4k+2 - mid eor $t9.8b, $t9.8b, $res3.8b // GHASH block 4k+3 - mid pmull2 $t7.1q, $res3.2d, $h1.2d // GHASH block 4k+3 - high aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 5 pmull $t9.1q, $t9.1d, $h12k.1d // GHASH block 4k+3 - mid eor $acc_mb, $acc_mb, $t6.16b // GHASH block 4k+2 - mid aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 5 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 6 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 6 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 6 movi $mod_constant.8b, #0xc2 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 6 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 7 eor $acc_hb, $acc_hb, $t7.16b // GHASH block 4k+3 - high aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 7 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 7 shl $mod_constantd, $mod_constantd, #56 // mod_constant aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 8 eor $acc_mb, $acc_mb, $t9.16b // GHASH block 4k+3 - mid pmull $t8.1q, $res3.1d, $h1.1d // GHASH block 4k+3 - low aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 8 cmp $rounds, #12 // setup flags for AES-128/192/256 check aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 8 eor $acc_lb, $acc_lb, $t8.16b // GHASH block 4k+3 - low aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 7 eor $acc_mb, $acc_mb, $acc_hb // karatsuba tidy up aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 8 pmull $t1.1q, $acc_h.1d, $mod_constant.1d ext $acc_hb, $acc_hb, $acc_hb, #8 eor $acc_mb, $acc_mb, $acc_lb b.lt .Lenc_finish_prepretail // branch if AES-128 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 9 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 9 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 9 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 9 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 10 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 10 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 10 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 10 b.eq .Lenc_finish_prepretail // branch if AES-192 aese 
$ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 11 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 11 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 11 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 11 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 12 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 12 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 12 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 12 .Lenc_finish_prepretail: eor $acc_mb, $acc_mb, $t1.16b eor $acc_mb, $acc_mb, $acc_hb pmull $t1.1q, $acc_m.1d, $mod_constant.1d ext $acc_mb, $acc_mb, $acc_mb, #8 aese $ctr1b, $rkNm1 // AES block 4k+5 - round N-1 eor $acc_lb, $acc_lb, $t1.16b aese $ctr3b, $rkNm1 // AES block 4k+7 - round N-1 aese $ctr0b, $rkNm1 // AES block 4k+4 - round N-1 aese $ctr2b, $rkNm1 // AES block 4k+6 - round N-1 eor $acc_lb, $acc_lb, $acc_mb .Lenc_tail: // TAIL ext $t0.16b, $acc_lb, $acc_lb, #8 // prepare final partial tag sub $main_end_input_ptr, $end_input_ptr, $input_ptr // main_end_input_ptr is number of bytes left to process ldp $input_l0, $input_h0, [$input_ptr], #16 // AES block 4k+4 - load plaintext eor $input_l0, $input_l0, $rkN_l // AES block 4k+4 - round N low eor $input_h0, $input_h0, $rkN_h // AES block 4k+4 - round N high cmp $main_end_input_ptr, #48 fmov $ctr_t0d, $input_l0 // AES block 4k+4 - mov low fmov $ctr_t0.d[1], $input_h0 // AES block 4k+4 - mov high eor $res1b, $ctr_t0b, $ctr0b // AES block 4k+4 - result b.gt .Lenc_blocks_more_than_3 cmp $main_end_input_ptr, #32 mov $ctr3b, $ctr2b movi $acc_l.8b, #0 movi $acc_h.8b, #0 sub $rctr32w, $rctr32w, #1 mov $ctr2b, $ctr1b movi $acc_m.8b, #0 b.gt .Lenc_blocks_more_than_2 mov $ctr3b, $ctr1b sub $rctr32w, $rctr32w, #1 cmp $main_end_input_ptr, #16 b.gt .Lenc_blocks_more_than_1 sub $rctr32w, $rctr32w, #1 b .Lenc_blocks_less_than_1 .Lenc_blocks_more_than_3: // blocks left > 3 st1 { $res1b}, [$output_ptr], #16 // AES final-3 block - store result ldp $input_l0, $input_h0, [$input_ptr], #16 // AES final-2 block - load input low & high rev64 $res0b, $res1b // GHASH final-3 block eor $input_l0, $input_l0, $rkN_l // AES final-2 block - round N low eor $res0b, $res0b, $t0.16b // feed in partial tag eor $input_h0, $input_h0, $rkN_h // AES final-2 block - round N high mov $rk4d, $res0.d[1] // GHASH final-3 block - mid fmov $res1d, $input_l0 // AES final-2 block - mov low fmov $res1.d[1], $input_h0 // AES final-2 block - mov high eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-3 block - mid movi $t0.8b, #0 // suppress further partial tag feed in mov $acc_md, $h34k.d[1] // GHASH final-3 block - mid pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH final-3 block - low pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH final-3 block - high pmull $acc_m.1q, $rk4v.1d, $acc_m.1d // GHASH final-3 block - mid eor $res1b, $res1b, $ctr1b // AES final-2 block - result .Lenc_blocks_more_than_2: // blocks left > 2 st1 { $res1b}, [$output_ptr], #16 // AES final-2 block - store result ldp $input_l0, $input_h0, [$input_ptr], #16 // AES final-1 block - load input low & high rev64 $res0b, $res1b // GHASH final-2 block eor $input_l0, $input_l0, $rkN_l // AES final-1 block - round N low eor $res0b, $res0b, $t0.16b // feed in partial tag fmov $res1d, $input_l0 // AES final-1 block - mov low eor $input_h0, $input_h0, $rkN_h // AES final-1 block - round N high fmov $res1.d[1], $input_h0 // AES final-1 block - mov high movi $t0.8b, #0 // 
suppress further partial tag feed in pmull2 $rk2q1, $res0.2d, $h3.2d // GHASH final-2 block - high mov $rk4d, $res0.d[1] // GHASH final-2 block - mid pmull $rk3q1, $res0.1d, $h3.1d // GHASH final-2 block - low eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-2 block - mid eor $res1b, $res1b, $ctr2b // AES final-1 block - result eor $acc_hb, $acc_hb, $rk2 // GHASH final-2 block - high pmull $rk4v.1q, $rk4v.1d, $h34k.1d // GHASH final-2 block - mid eor $acc_lb, $acc_lb, $rk3 // GHASH final-2 block - low eor $acc_mb, $acc_mb, $rk4v.16b // GHASH final-2 block - mid .Lenc_blocks_more_than_1: // blocks left > 1 st1 { $res1b}, [$output_ptr], #16 // AES final-1 block - store result rev64 $res0b, $res1b // GHASH final-1 block ldp $input_l0, $input_h0, [$input_ptr], #16 // AES final block - load input low & high eor $res0b, $res0b, $t0.16b // feed in partial tag movi $t0.8b, #0 // suppress further partial tag feed in eor $input_l0, $input_l0, $rkN_l // AES final block - round N low mov $rk4d, $res0.d[1] // GHASH final-1 block - mid pmull2 $rk2q1, $res0.2d, $h2.2d // GHASH final-1 block - high eor $input_h0, $input_h0, $rkN_h // AES final block - round N high eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-1 block - mid eor $acc_hb, $acc_hb, $rk2 // GHASH final-1 block - high ins $rk4v.d[1], $rk4v.d[0] // GHASH final-1 block - mid fmov $res1d, $input_l0 // AES final block - mov low fmov $res1.d[1], $input_h0 // AES final block - mov high pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d // GHASH final-1 block - mid pmull $rk3q1, $res0.1d, $h2.1d // GHASH final-1 block - low eor $res1b, $res1b, $ctr3b // AES final block - result eor $acc_mb, $acc_mb, $rk4v.16b // GHASH final-1 block - mid eor $acc_lb, $acc_lb, $rk3 // GHASH final-1 block - low .Lenc_blocks_less_than_1: // blocks left <= 1 and $bit_length, $bit_length, #127 // bit_length %= 128 mvn $rkN_l, xzr // rkN_l = 0xffffffffffffffff sub $bit_length, $bit_length, #128 // bit_length -= 128 neg $bit_length, $bit_length // bit_length = 128 - #bits in input (in range [1,128]) ld1 { $rk0}, [$output_ptr] // load existing bytes where the possibly partial last block is to be stored mvn $rkN_h, xzr // rkN_h = 0xffffffffffffffff and $bit_length, $bit_length, #127 // bit_length %= 128 lsr $rkN_h, $rkN_h, $bit_length // rkN_h is mask for top 64b of last block cmp $bit_length, #64 csel $input_l0, $rkN_l, $rkN_h, lt csel $input_h0, $rkN_h, xzr, lt fmov $ctr0d, $input_l0 // ctr0b is mask for last block fmov $ctr0.d[1], $input_h0 and $res1b, $res1b, $ctr0b // possibly partial last block has zeroes in highest bits rev64 $res0b, $res1b // GHASH final block eor $res0b, $res0b, $t0.16b // feed in partial tag bif $res1b, $rk0, $ctr0b // insert existing bytes in top end of result before storing pmull2 $rk2q1, $res0.2d, $h1.2d // GHASH final block - high mov $t0d, $res0.d[1] // GHASH final block - mid rev $ctr32w, $rctr32w pmull $rk3q1, $res0.1d, $h1.1d // GHASH final block - low eor $acc_hb, $acc_hb, $rk2 // GHASH final block - high eor $t0.8b, $t0.8b, $res0.8b // GHASH final block - mid pmull $t0.1q, $t0.1d, $h12k.1d // GHASH final block - mid eor $acc_lb, $acc_lb, $rk3 // GHASH final block - low eor $acc_mb, $acc_mb, $t0.16b // GHASH final block - mid movi $mod_constant.8b, #0xc2 eor $t9.16b, $acc_lb, $acc_hb // MODULO - karatsuba tidy up shl $mod_constantd, $mod_constantd, #56 // mod_constant eor $acc_mb, $acc_mb, $t9.16b // MODULO - karatsuba tidy up pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid ext $acc_hb, $acc_hb, $acc_hb, #8 // MODULO - 
other top alignment eor $acc_mb, $acc_mb, $mod_t.16b // MODULO - fold into mid eor $acc_mb, $acc_mb, $acc_hb // MODULO - fold into mid pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low ext $acc_mb, $acc_mb, $acc_mb, #8 // MODULO - other mid alignment str $ctr32w, [$counter, #12] // store the updated counter st1 { $res1b}, [$output_ptr] // store all 16B eor $acc_lb, $acc_lb, $acc_hb // MODULO - fold into low eor $acc_lb, $acc_lb, $acc_mb // MODULO - fold into low ext $acc_lb, $acc_lb, $acc_lb, #8 rev64 $acc_lb, $acc_lb mov x0, $len st1 { $acc_l.16b }, [$current_tag] ldp x19, x20, [sp, #16] ldp x21, x22, [sp, #32] ldp x23, x24, [sp, #48] ldp d8, d9, [sp, #64] ldp d10, d11, [sp, #80] ldp d12, d13, [sp, #96] ldp d14, d15, [sp, #112] ldp x29, x30, [sp], #128 AARCH64_VALIDATE_LINK_REGISTER ret .size aes_gcm_enc_kernel,.-aes_gcm_enc_kernel ___ { my $t8="v4"; my $t8d="d4"; my $t9="v6"; my $t9d="d6"; ################################################################################ # size_t aes_gcm_dec_kernel(const uint8_t *in, # size_t len_bits, # uint8_t *out, # u64 *Xi, # uint8_t ivec[16], # const void *key); # $code.=<<___; .global aes_gcm_dec_kernel .type aes_gcm_dec_kernel,%function .align 4 aes_gcm_dec_kernel: AARCH64_SIGN_LINK_REGISTER stp x29, x30, [sp, #-128]! mov x29, sp stp x19, x20, [sp, #16] mov $counter, x4 mov $cc, x5 stp x21, x22, [sp, #32] stp x23, x24, [sp, #48] stp d8, d9, [sp, #64] stp d10, d11, [sp, #80] stp d12, d13, [sp, #96] stp d14, d15, [sp, #112] ldr $roundsw, [$cc, #240] add $input_l1, $cc, $rounds, lsl #4 // borrow input_l1 for last key ldp $rkN_l, $rkN_h, [$input_l1] // load round N keys ldr $rkNm1q, [$input_l1, #-16] // load round N-1 keys lsr $main_end_input_ptr, $bit_length, #3 // byte_len mov $len, $main_end_input_ptr ldp $ctr96_b64x, $ctr96_t32x, [$counter] // ctr96_b64, ctr96_t32 ldr $rk8q, [$cc, #128] // load rk8 sub $main_end_input_ptr, $main_end_input_ptr, #1 // byte_len - 1 ldr $rk7q, [$cc, #112] // load rk7 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) add $end_input_ptr, $input_ptr, $bit_length, lsr #3 // end_input_ptr ldr $rk6q, [$cc, #96] // load rk6 lsr $rctr32x, $ctr96_t32x, #32 ldr $rk5q, [$cc, #80] // load rk5 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w ldr $rk3q, [$cc, #48] // load rk3 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr rev $rctr32w, $rctr32w // rev_ctr32 add $rctr32w, $rctr32w, #1 // increment rev_ctr32 fmov $ctr3d, $ctr96_b64x // CTR block 3 rev $ctr32w, $rctr32w // CTR block 1 add $rctr32w, $rctr32w, #1 // CTR block 1 fmov $ctr1d, $ctr96_b64x // CTR block 1 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 1 ld1 { $ctr0b}, [$counter] // special case vector load initial counter so we can start first AES block as quickly as possible fmov $ctr1.d[1], $ctr32x // CTR block 1 rev $ctr32w, $rctr32w // CTR block 2 add $rctr32w, $rctr32w, #1 // CTR block 2 fmov $ctr2d, $ctr96_b64x // CTR block 2 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 2 fmov $ctr2.d[1], $ctr32x // CTR block 2 rev $ctr32w, $rctr32w // CTR block 3 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 3 ldr $rk0q, [$cc, #0] // load rk0 fmov $ctr3.d[1], $ctr32x // CTR block 3 add $rctr32w, $rctr32w, #1 // CTR block 3 ldr $rk4q, [$cc, #64] // load rk4 ldr $rk1q, [$cc, #16] // load rk1 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 0 ldr $h3q, [$Htable, #48] // load h3l | h3h ext $h3b, $h3b, $h3b, 
#8 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 0 ldr $h4q, [$Htable, #80] // load h4l | h4h ext $h4b, $h4b, $h4b, #8 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 0 ldr $h2q, [$Htable, #32] // load h2l | h2h ext $h2b, $h2b, $h2b, #8 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 0 ldr $rk2q, [$cc, #32] // load rk2 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 1 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 1 ld1 { $acc_lb}, [$current_tag] ext $acc_lb, $acc_lb, $acc_lb, #8 rev64 $acc_lb, $acc_lb aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 1 ldr $rk9q, [$cc, #144] // load rk9 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 1 ldr $rk12q, [$cc, #192] // load rk12 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 2 ldr $h1q, [$Htable] // load h1l | h1h ext $h1b, $h1b, $h1b, #8 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 2 ldr $rk10q, [$cc, #160] // load rk10 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 2 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 3 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 2 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 3 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 4 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 3 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 3 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 4 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 4 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 4 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 5 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 5 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 5 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 5 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 6 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 6 cmp $rounds, #12 // setup flags for AES-128/192/256 check aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 6 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 6 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 7 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 7 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 7 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 8 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 7 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 8 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 8 ldr $rk11q, [$cc, #176] // load rk11 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 8 b.lt .Ldec_finish_first_blocks // branch if AES-128 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 9 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 9 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 9 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 9 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 10 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 10 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 10 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 10 b.eq .Ldec_finish_first_blocks // branch if AES-192 aese $ctr0b, $rk11 \n aesmc 
$ctr0b, $ctr0b // AES block 0 - round 11 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 11 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 11 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 11 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 12 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 12 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 12 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 12 .Ldec_finish_first_blocks: cmp $input_ptr, $main_end_input_ptr // check if we have <= 4 blocks trn1 $acc_h.2d, $h3.2d, $h4.2d // h4h | h3h trn2 $h34k.2d, $h3.2d, $h4.2d // h4l | h3l trn1 $t0.2d, $h1.2d, $h2.2d // h2h | h1h trn2 $h12k.2d, $h1.2d, $h2.2d // h2l | h1l eor $h34k.16b, $h34k.16b, $acc_h.16b // h4k | h3k aese $ctr1b, $rkNm1 // AES block 1 - round N-1 aese $ctr2b, $rkNm1 // AES block 2 - round N-1 eor $h12k.16b, $h12k.16b, $t0.16b // h2k | h1k aese $ctr3b, $rkNm1 // AES block 3 - round N-1 aese $ctr0b, $rkNm1 // AES block 0 - round N-1 b.ge .Ldec_tail // handle tail ldr $res0q, [$input_ptr, #0] // AES block 0 - load ciphertext ldr $res1q, [$input_ptr, #16] // AES block 1 - load ciphertext rev $ctr32w, $rctr32w // CTR block 4 eor $ctr0b, $res0b, $ctr0b // AES block 0 - result eor $ctr1b, $res1b, $ctr1b // AES block 1 - result rev64 $res1b, $res1b // GHASH block 1 ldr $res3q, [$input_ptr, #48] // AES block 3 - load ciphertext mov $output_h0, $ctr0.d[1] // AES block 0 - mov high mov $output_l0, $ctr0.d[0] // AES block 0 - mov low rev64 $res0b, $res0b // GHASH block 0 add $rctr32w, $rctr32w, #1 // CTR block 4 fmov $ctr0d, $ctr96_b64x // CTR block 4 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4 fmov $ctr0.d[1], $ctr32x // CTR block 4 rev $ctr32w, $rctr32w // CTR block 5 add $rctr32w, $rctr32w, #1 // CTR block 5 mov $output_l1, $ctr1.d[0] // AES block 1 - mov low orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 5 mov $output_h1, $ctr1.d[1] // AES block 1 - mov high eor $output_h0, $output_h0, $rkN_h // AES block 0 - round N high eor $output_l0, $output_l0, $rkN_l // AES block 0 - round N low stp $output_l0, $output_h0, [$output_ptr], #16 // AES block 0 - store result fmov $ctr1d, $ctr96_b64x // CTR block 5 ldr $res2q, [$input_ptr, #32] // AES block 2 - load ciphertext add $input_ptr, $input_ptr, #64 // AES input_ptr update fmov $ctr1.d[1], $ctr32x // CTR block 5 rev $ctr32w, $rctr32w // CTR block 6 add $rctr32w, $rctr32w, #1 // CTR block 6 eor $output_l1, $output_l1, $rkN_l // AES block 1 - round N low orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 6 eor $output_h1, $output_h1, $rkN_h // AES block 1 - round N high stp $output_l1, $output_h1, [$output_ptr], #16 // AES block 1 - store result eor $ctr2b, $res2b, $ctr2b // AES block 2 - result cmp $input_ptr, $main_end_input_ptr // check if we have <= 8 blocks b.ge .Ldec_prepretail // do prepretail .Ldec_main_loop: // main loop start mov $output_l2, $ctr2.d[0] // AES block 4k+2 - mov low ext $acc_lb, $acc_lb, $acc_lb, #8 // PRE 0 eor $ctr3b, $res3b, $ctr3b // AES block 4k+3 - result aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 0 mov $output_h2, $ctr2.d[1] // AES block 4k+2 - mov high aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 0 fmov $ctr2d, $ctr96_b64x // CTR block 4k+6 fmov $ctr2.d[1], $ctr32x // CTR block 4k+6 eor $res0b, $res0b, $acc_lb // PRE 1 rev $ctr32w, $rctr32w // CTR block 4k+7 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - 
round 1 mov $output_h3, $ctr3.d[1] // AES block 4k+3 - mov high aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 1 mov $output_l3, $ctr3.d[0] // AES block 4k+3 - mov low pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH block 4k - high mov $t0d, $res0.d[1] // GHASH block 4k - mid fmov $ctr3d, $ctr96_b64x // CTR block 4k+7 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 2 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+7 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 0 fmov $ctr3.d[1], $ctr32x // CTR block 4k+7 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 2 eor $t0.8b, $t0.8b, $res0.8b // GHASH block 4k - mid aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 3 eor $output_h2, $output_h2, $rkN_h // AES block 4k+2 - round N high aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 1 mov $acc_md, $h34k.d[1] // GHASH block 4k - mid aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 3 rev64 $res2b, $res2b // GHASH block 4k+2 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 0 eor $output_l2, $output_l2, $rkN_l // AES block 4k+2 - round N low aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 2 stp $output_l2, $output_h2, [$output_ptr], #16 // AES block 4k+2 - store result pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH block 4k - low pmull2 $t1.1q, $res1.2d, $h3.2d // GHASH block 4k+1 - high aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 3 rev64 $res3b, $res3b // GHASH block 4k+3 pmull $acc_m.1q, $t0.1d, $acc_m.1d // GHASH block 4k - mid eor $output_l3, $output_l3, $rkN_l // AES block 4k+3 - round N low pmull $t2.1q, $res1.1d, $h3.1d // GHASH block 4k+1 - low eor $output_h3, $output_h3, $rkN_h // AES block 4k+3 - round N high eor $acc_hb, $acc_hb, $t1.16b // GHASH block 4k+1 - high aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 4 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 1 mov $t3d, $res1.d[1] // GHASH block 4k+1 - mid aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 4 eor $acc_lb, $acc_lb, $t2.16b // GHASH block 4k+1 - low aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 5 add $rctr32w, $rctr32w, #1 // CTR block 4k+7 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 2 mov $t6d, $res2.d[1] // GHASH block 4k+2 - mid aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 4 eor $t3.8b, $t3.8b, $res1.8b // GHASH block 4k+1 - mid pmull $t5.1q, $res2.1d, $h2.1d // GHASH block 4k+2 - low aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 3 eor $t6.8b, $t6.8b, $res2.8b // GHASH block 4k+2 - mid aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 5 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 5 eor $acc_lb, $acc_lb, $t5.16b // GHASH block 4k+2 - low pmull $t3.1q, $t3.1d, $h34k.1d // GHASH block 4k+1 - mid rev $ctr32w, $rctr32w // CTR block 4k+8 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 6 ins $t6.d[1], $t6.d[0] // GHASH block 4k+2 - mid aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 6 add $rctr32w, $rctr32w, #1 // CTR block 4k+8 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 4 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 7 eor $acc_mb, $acc_mb, $t3.16b // GHASH block 4k+1 - mid aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 7 pmull2 $t4.1q, 
$res2.2d, $h2.2d // GHASH block 4k+2 - high mov $t9d, $res3.d[1] // GHASH block 4k+3 - mid aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 5 pmull2 $t6.1q, $t6.2d, $h12k.2d // GHASH block 4k+2 - mid aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 8 eor $acc_hb, $acc_hb, $t4.16b // GHASH block 4k+2 - high aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 6 pmull $t8.1q, $res3.1d, $h1.1d // GHASH block 4k+3 - low orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+8 eor $acc_mb, $acc_mb, $t6.16b // GHASH block 4k+2 - mid pmull2 $t7.1q, $res3.2d, $h1.2d // GHASH block 4k+3 - high cmp $rounds, #12 // setup flags for AES-128/192/256 check eor $t9.8b, $t9.8b, $res3.8b // GHASH block 4k+3 - mid aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 8 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 6 eor $acc_hb, $acc_hb, $t7.16b // GHASH block 4k+3 - high pmull $t9.1q, $t9.1d, $h12k.1d // GHASH block 4k+3 - mid movi $mod_constant.8b, #0xc2 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 7 eor $acc_lb, $acc_lb, $t8.16b // GHASH block 4k+3 - low aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 7 shl $mod_constantd, $mod_constantd, #56 // mod_constant aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 8 eor $acc_mb, $acc_mb, $t9.16b // GHASH block 4k+3 - mid aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 8 b.lt .Ldec_main_loop_continue // branch if AES-128 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 9 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 9 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 9 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 9 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 10 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 10 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 10 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 10 b.eq .Ldec_main_loop_continue // branch if AES-192 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 11 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 11 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 11 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 11 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 12 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 12 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 12 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 12 .Ldec_main_loop_continue: pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid eor $t9.16b, $acc_lb, $acc_hb // MODULO - karatsuba tidy up ldr $res0q, [$input_ptr, #0] // AES block 4k+4 - load ciphertext aese $ctr0b, $rkNm1 // AES block 4k+4 - round N-1 ext $acc_hb, $acc_hb, $acc_hb, #8 // MODULO - other top alignment eor $acc_mb, $acc_mb, $t9.16b // MODULO - karatsuba tidy up ldr $res1q, [$input_ptr, #16] // AES block 4k+5 - load ciphertext eor $ctr0b, $res0b, $ctr0b // AES block 4k+4 - result stp $output_l3, $output_h3, [$output_ptr], #16 // AES block 4k+3 - store result eor $acc_mb, $acc_mb, $mod_t.16b // MODULO - fold into mid ldr $res3q, [$input_ptr, #48] // AES block 4k+7 - load ciphertext ldr $res2q, [$input_ptr, #32] // AES block 4k+6 - load ciphertext mov $output_h0, 
$ctr0.d[1] // AES block 4k+4 - mov high eor $acc_mb, $acc_mb, $acc_hb // MODULO - fold into mid aese $ctr1b, $rkNm1 // AES block 4k+5 - round N-1 add $input_ptr, $input_ptr, #64 // AES input_ptr update mov $output_l0, $ctr0.d[0] // AES block 4k+4 - mov low fmov $ctr0d, $ctr96_b64x // CTR block 4k+8 fmov $ctr0.d[1], $ctr32x // CTR block 4k+8 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low eor $ctr1b, $res1b, $ctr1b // AES block 4k+5 - result rev $ctr32w, $rctr32w // CTR block 4k+9 aese $ctr2b, $rkNm1 // AES block 4k+6 - round N-1 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+9 cmp $input_ptr, $main_end_input_ptr // LOOP CONTROL add $rctr32w, $rctr32w, #1 // CTR block 4k+9 eor $output_l0, $output_l0, $rkN_l // AES block 4k+4 - round N low eor $output_h0, $output_h0, $rkN_h // AES block 4k+4 - round N high mov $output_h1, $ctr1.d[1] // AES block 4k+5 - mov high eor $ctr2b, $res2b, $ctr2b // AES block 4k+6 - result eor $acc_lb, $acc_lb, $mod_constant.16b // MODULO - fold into low mov $output_l1, $ctr1.d[0] // AES block 4k+5 - mov low fmov $ctr1d, $ctr96_b64x // CTR block 4k+9 ext $acc_mb, $acc_mb, $acc_mb, #8 // MODULO - other mid alignment fmov $ctr1.d[1], $ctr32x // CTR block 4k+9 rev $ctr32w, $rctr32w // CTR block 4k+10 add $rctr32w, $rctr32w, #1 // CTR block 4k+10 aese $ctr3b, $rkNm1 // AES block 4k+7 - round N-1 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+10 rev64 $res1b, $res1b // GHASH block 4k+5 eor $output_h1, $output_h1, $rkN_h // AES block 4k+5 - round N high stp $output_l0, $output_h0, [$output_ptr], #16 // AES block 4k+4 - store result eor $output_l1, $output_l1, $rkN_l // AES block 4k+5 - round N low stp $output_l1, $output_h1, [$output_ptr], #16 // AES block 4k+5 - store result rev64 $res0b, $res0b // GHASH block 4k+4 eor $acc_lb, $acc_lb, $acc_mb // MODULO - fold into low b.lt .Ldec_main_loop .Ldec_prepretail: // PREPRETAIL ext $acc_lb, $acc_lb, $acc_lb, #8 // PRE 0 mov $output_l2, $ctr2.d[0] // AES block 4k+2 - mov low eor $ctr3b, $res3b, $ctr3b // AES block 4k+3 - result aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 0 mov $output_h2, $ctr2.d[1] // AES block 4k+2 - mov high aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 0 fmov $ctr2d, $ctr96_b64x // CTR block 4k+6 fmov $ctr2.d[1], $ctr32x // CTR block 4k+6 rev $ctr32w, $rctr32w // CTR block 4k+7 eor $res0b, $res0b, $acc_lb // PRE 1 rev64 $res2b, $res2b // GHASH block 4k+2 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+7 mov $output_l3, $ctr3.d[0] // AES block 4k+3 - mov low aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 1 mov $output_h3, $ctr3.d[1] // AES block 4k+3 - mov high pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH block 4k - low mov $t0d, $res0.d[1] // GHASH block 4k - mid fmov $ctr3d, $ctr96_b64x // CTR block 4k+7 pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH block 4k - high fmov $ctr3.d[1], $ctr32x // CTR block 4k+7 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 0 mov $acc_md, $h34k.d[1] // GHASH block 4k - mid aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 1 eor $t0.8b, $t0.8b, $res0.8b // GHASH block 4k - mid pmull2 $t1.1q, $res1.2d, $h3.2d // GHASH block 4k+1 - high aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 1 rev64 $res3b, $res3b // GHASH block 4k+3 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 0 pmull $acc_m.1q, $t0.1d, $acc_m.1d // GHASH block 4k - mid eor $acc_hb, $acc_hb, $t1.16b // 
GHASH block 4k+1 - high pmull $t2.1q, $res1.1d, $h3.1d // GHASH block 4k+1 - low aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 1 mov $t3d, $res1.d[1] // GHASH block 4k+1 - mid aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 2 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 2 eor $acc_lb, $acc_lb, $t2.16b // GHASH block 4k+1 - low aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 2 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 3 mov $t6d, $res2.d[1] // GHASH block 4k+2 - mid aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 2 eor $t3.8b, $t3.8b, $res1.8b // GHASH block 4k+1 - mid pmull $t5.1q, $res2.1d, $h2.1d // GHASH block 4k+2 - low aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 4 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 3 eor $t6.8b, $t6.8b, $res2.8b // GHASH block 4k+2 - mid pmull $t3.1q, $t3.1d, $h34k.1d // GHASH block 4k+1 - mid aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 5 eor $acc_lb, $acc_lb, $t5.16b // GHASH block 4k+2 - low aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 4 pmull2 $t7.1q, $res3.2d, $h1.2d // GHASH block 4k+3 - high eor $acc_mb, $acc_mb, $t3.16b // GHASH block 4k+1 - mid pmull2 $t4.1q, $res2.2d, $h2.2d // GHASH block 4k+2 - high aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 5 ins $t6.d[1], $t6.d[0] // GHASH block 4k+2 - mid aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 3 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 3 eor $acc_hb, $acc_hb, $t4.16b // GHASH block 4k+2 - high pmull $t8.1q, $res3.1d, $h1.1d // GHASH block 4k+3 - low aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 4 mov $t9d, $res3.d[1] // GHASH block 4k+3 - mid aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 4 pmull2 $t6.1q, $t6.2d, $h12k.2d // GHASH block 4k+2 - mid aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 5 eor $t9.8b, $t9.8b, $res3.8b // GHASH block 4k+3 - mid aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 5 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 6 eor $acc_mb, $acc_mb, $t6.16b // GHASH block 4k+2 - mid aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 6 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 6 movi $mod_constant.8b, #0xc2 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 6 eor $acc_lb, $acc_lb, $t8.16b // GHASH block 4k+3 - low pmull $t9.1q, $t9.1d, $h12k.1d // GHASH block 4k+3 - mid aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 7 cmp $rounds, #12 // setup flags for AES-128/192/256 check eor $acc_hb, $acc_hb, $t7.16b // GHASH block 4k+3 - high aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 7 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 7 eor $acc_mb, $acc_mb, $t9.16b // GHASH block 4k+3 - mid aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 8 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 7 eor $t9.16b, $acc_lb, $acc_hb // MODULO - karatsuba tidy up aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 8 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 8 shl $mod_constantd, $mod_constantd, #56 // mod_constant aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 8 b.lt .Ldec_finish_prepretail // 
branch if AES-128 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 9 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 9 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 9 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 9 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 10 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 10 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 10 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 10 b.eq .Ldec_finish_prepretail // branch if AES-192 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 11 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 11 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 11 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 12 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 11 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 12 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 12 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 12 .Ldec_finish_prepretail: eor $acc_mb, $acc_mb, $t9.16b // MODULO - karatsuba tidy up pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid ext $acc_hb, $acc_hb, $acc_hb, #8 // MODULO - other top alignment eor $acc_mb, $acc_mb, $mod_t.16b // MODULO - fold into mid eor $output_h2, $output_h2, $rkN_h // AES block 4k+2 - round N high eor $output_l3, $output_l3, $rkN_l // AES block 4k+3 - round N low eor $acc_mb, $acc_mb, $acc_hb // MODULO - fold into mid add $rctr32w, $rctr32w, #1 // CTR block 4k+7 eor $output_l2, $output_l2, $rkN_l // AES block 4k+2 - round N low pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low eor $output_h3, $output_h3, $rkN_h // AES block 4k+3 - round N high stp $output_l2, $output_h2, [$output_ptr], #16 // AES block 4k+2 - store result ext $acc_mb, $acc_mb, $acc_mb, #8 // MODULO - other mid alignment stp $output_l3, $output_h3, [$output_ptr], #16 // AES block 4k+3 - store result eor $acc_lb, $acc_lb, $mod_constant.16b // MODULO - fold into low aese $ctr1b, $rkNm1 // AES block 4k+5 - round N-1 aese $ctr0b, $rkNm1 // AES block 4k+4 - round N-1 aese $ctr3b, $rkNm1 // AES block 4k+7 - round N-1 aese $ctr2b, $rkNm1 // AES block 4k+6 - round N-1 eor $acc_lb, $acc_lb, $acc_mb // MODULO - fold into low .Ldec_tail: // TAIL sub $main_end_input_ptr, $end_input_ptr, $input_ptr // main_end_input_ptr is number of bytes left to process ld1 { $res1b}, [$input_ptr], #16 // AES block 4k+4 - load ciphertext eor $ctr0b, $res1b, $ctr0b // AES block 4k+4 - result mov $output_l0, $ctr0.d[0] // AES block 4k+4 - mov low mov $output_h0, $ctr0.d[1] // AES block 4k+4 - mov high ext $t0.16b, $acc_lb, $acc_lb, #8 // prepare final partial tag cmp $main_end_input_ptr, #48 eor $output_l0, $output_l0, $rkN_l // AES block 4k+4 - round N low eor $output_h0, $output_h0, $rkN_h // AES block 4k+4 - round N high b.gt .Ldec_blocks_more_than_3 sub $rctr32w, $rctr32w, #1 mov $ctr3b, $ctr2b movi $acc_m.8b, #0 movi $acc_l.8b, #0 cmp $main_end_input_ptr, #32 movi $acc_h.8b, #0 mov $ctr2b, $ctr1b b.gt .Ldec_blocks_more_than_2 sub $rctr32w, $rctr32w, #1 mov $ctr3b, $ctr1b cmp $main_end_input_ptr, #16 b.gt .Ldec_blocks_more_than_1 sub $rctr32w, $rctr32w, #1 b .Ldec_blocks_less_than_1 .Ldec_blocks_more_than_3: // blocks left > 3 rev64 $res0b, $res1b // GHASH final-3 
block ld1 { $res1b}, [$input_ptr], #16 // AES final-2 block - load ciphertext stp $output_l0, $output_h0, [$output_ptr], #16 // AES final-3 block - store result mov $acc_md, $h34k.d[1] // GHASH final-3 block - mid eor $res0b, $res0b, $t0.16b // feed in partial tag eor $ctr0b, $res1b, $ctr1b // AES final-2 block - result mov $rk4d, $res0.d[1] // GHASH final-3 block - mid mov $output_l0, $ctr0.d[0] // AES final-2 block - mov low mov $output_h0, $ctr0.d[1] // AES final-2 block - mov high eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-3 block - mid movi $t0.8b, #0 // suppress further partial tag feed in pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH final-3 block - high pmull $acc_m.1q, $rk4v.1d, $acc_m.1d // GHASH final-3 block - mid eor $output_l0, $output_l0, $rkN_l // AES final-2 block - round N low pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH final-3 block - low eor $output_h0, $output_h0, $rkN_h // AES final-2 block - round N high .Ldec_blocks_more_than_2: // blocks left > 2 rev64 $res0b, $res1b // GHASH final-2 block ld1 { $res1b}, [$input_ptr], #16 // AES final-1 block - load ciphertext eor $res0b, $res0b, $t0.16b // feed in partial tag stp $output_l0, $output_h0, [$output_ptr], #16 // AES final-2 block - store result eor $ctr0b, $res1b, $ctr2b // AES final-1 block - result mov $rk4d, $res0.d[1] // GHASH final-2 block - mid pmull $rk3q1, $res0.1d, $h3.1d // GHASH final-2 block - low pmull2 $rk2q1, $res0.2d, $h3.2d // GHASH final-2 block - high eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-2 block - mid mov $output_l0, $ctr0.d[0] // AES final-1 block - mov low mov $output_h0, $ctr0.d[1] // AES final-1 block - mov high eor $acc_lb, $acc_lb, $rk3 // GHASH final-2 block - low movi $t0.8b, #0 // suppress further partial tag feed in pmull $rk4v.1q, $rk4v.1d, $h34k.1d // GHASH final-2 block - mid eor $acc_hb, $acc_hb, $rk2 // GHASH final-2 block - high eor $output_l0, $output_l0, $rkN_l // AES final-1 block - round N low eor $acc_mb, $acc_mb, $rk4v.16b // GHASH final-2 block - mid eor $output_h0, $output_h0, $rkN_h // AES final-1 block - round N high .Ldec_blocks_more_than_1: // blocks left > 1 stp $output_l0, $output_h0, [$output_ptr], #16 // AES final-1 block - store result rev64 $res0b, $res1b // GHASH final-1 block ld1 { $res1b}, [$input_ptr], #16 // AES final block - load ciphertext eor $res0b, $res0b, $t0.16b // feed in partial tag movi $t0.8b, #0 // suppress further partial tag feed in mov $rk4d, $res0.d[1] // GHASH final-1 block - mid eor $ctr0b, $res1b, $ctr3b // AES final block - result pmull2 $rk2q1, $res0.2d, $h2.2d // GHASH final-1 block - high eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-1 block - mid pmull $rk3q1, $res0.1d, $h2.1d // GHASH final-1 block - low mov $output_l0, $ctr0.d[0] // AES final block - mov low ins $rk4v.d[1], $rk4v.d[0] // GHASH final-1 block - mid mov $output_h0, $ctr0.d[1] // AES final block - mov high pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d // GHASH final-1 block - mid eor $output_l0, $output_l0, $rkN_l // AES final block - round N low eor $acc_lb, $acc_lb, $rk3 // GHASH final-1 block - low eor $acc_hb, $acc_hb, $rk2 // GHASH final-1 block - high eor $acc_mb, $acc_mb, $rk4v.16b // GHASH final-1 block - mid eor $output_h0, $output_h0, $rkN_h // AES final block - round N high .Ldec_blocks_less_than_1: // blocks left <= 1 and $bit_length, $bit_length, #127 // bit_length %= 128 mvn $rkN_h, xzr // rkN_h = 0xffffffffffffffff sub $bit_length, $bit_length, #128 // bit_length -= 128 mvn $rkN_l, xzr // rkN_l = 0xffffffffffffffff ldp $end_input_ptr, 
$main_end_input_ptr, [$output_ptr] // load existing bytes we need to not overwrite neg $bit_length, $bit_length // bit_length = 128 - #bits in input (in range [1,128]) and $bit_length, $bit_length, #127 // bit_length %= 128 lsr $rkN_h, $rkN_h, $bit_length // rkN_h is mask for top 64b of last block cmp $bit_length, #64 csel $ctr32x, $rkN_l, $rkN_h, lt csel $ctr96_b64x, $rkN_h, xzr, lt fmov $ctr0d, $ctr32x // ctr0b is mask for last block and $output_l0, $output_l0, $ctr32x mov $ctr0.d[1], $ctr96_b64x bic $end_input_ptr, $end_input_ptr, $ctr32x // mask out low existing bytes rev $ctr32w, $rctr32w bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x // mask out high existing bytes orr $output_l0, $output_l0, $end_input_ptr and $output_h0, $output_h0, $ctr96_b64x orr $output_h0, $output_h0, $main_end_input_ptr and $res1b, $res1b, $ctr0b // possibly partial last block has zeroes in highest bits rev64 $res0b, $res1b // GHASH final block eor $res0b, $res0b, $t0.16b // feed in partial tag pmull $rk3q1, $res0.1d, $h1.1d // GHASH final block - low mov $t0d, $res0.d[1] // GHASH final block - mid eor $t0.8b, $t0.8b, $res0.8b // GHASH final block - mid pmull2 $rk2q1, $res0.2d, $h1.2d // GHASH final block - high pmull $t0.1q, $t0.1d, $h12k.1d // GHASH final block - mid eor $acc_hb, $acc_hb, $rk2 // GHASH final block - high eor $acc_lb, $acc_lb, $rk3 // GHASH final block - low eor $acc_mb, $acc_mb, $t0.16b // GHASH final block - mid movi $mod_constant.8b, #0xc2 eor $t9.16b, $acc_lb, $acc_hb // MODULO - karatsuba tidy up shl $mod_constantd, $mod_constantd, #56 // mod_constant eor $acc_mb, $acc_mb, $t9.16b // MODULO - karatsuba tidy up pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid ext $acc_hb, $acc_hb, $acc_hb, #8 // MODULO - other top alignment eor $acc_mb, $acc_mb, $mod_t.16b // MODULO - fold into mid eor $acc_mb, $acc_mb, $acc_hb // MODULO - fold into mid pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low ext $acc_mb, $acc_mb, $acc_mb, #8 // MODULO - other mid alignment eor $acc_lb, $acc_lb, $mod_constant.16b // MODULO - fold into low stp $output_l0, $output_h0, [$output_ptr] str $ctr32w, [$counter, #12] // store the updated counter eor $acc_lb, $acc_lb, $acc_mb // MODULO - fold into low ext $acc_lb, $acc_lb, $acc_lb, #8 rev64 $acc_lb, $acc_lb mov x0, $len st1 { $acc_l.16b }, [$current_tag] ldp x19, x20, [sp, #16] ldp x21, x22, [sp, #32] ldp x23, x24, [sp, #48] ldp d8, d9, [sp, #64] ldp d10, d11, [sp, #80] ldp d12, d13, [sp, #96] ldp d14, d15, [sp, #112] ldp x29, x30, [sp], #128 AARCH64_VALIDATE_LINK_REGISTER ret .size aes_gcm_dec_kernel,.-aes_gcm_dec_kernel ___ } } $code.=<<___; #endif ___ print $code; close STDOUT or die "error closing STDOUT: $!"; # enforce flush ring-0.17.14/crypto/fipsmodule/aes/asm/bsaes-armv7.pl000064400000000000000000000726531046102023000204550ustar 00000000000000#! /usr/bin/env perl # Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. # # Specific modes and adaptation for Linux kernel by Ard Biesheuvel # of Linaro. # ==================================================================== # Bit-sliced AES for ARM NEON # # February 2012. # # This implementation is direct adaptation of bsaes-x86_64 module for # ARM NEON. Except that this module is endian-neutral [in sense that # it can be compiled for either endianness] by courtesy of vld1.8's # neutrality. Initial version doesn't implement interface to OpenSSL, # only low-level primitives and unsupported entry points, just enough # to collect performance results, which for Cortex-A8 core are: # # encrypt 19.5 cycles per byte processed with 128-bit key # decrypt 22.1 cycles per byte processed with 128-bit key # key conv. 440 cycles per 128-bit key/0.18 of 8x block # # Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7, # which is [much] worse than anticipated (for further details see # http://www.openssl.org/~appro/Snapdragon-S4.html). # # Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code # manages in 20.0 cycles]. # # When comparing to x86_64 results keep in mind that NEON unit is # [mostly] single-issue and thus can't [fully] benefit from # instruction-level parallelism. And when comparing to aes-armv4 # results keep in mind key schedule conversion overhead (see # bsaes-x86_64.pl for further details)... # # # April-August 2013 # Add CBC, CTR and XTS subroutines and adapt for kernel use; courtesy of Ard. $flavour = shift; if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } if ($flavour && $flavour ne "void") { $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; } else { open OUT,">$output"; *STDOUT=*OUT; } my ($inp,$out,$len,$key)=("r0","r1","r2","r3"); my @XMM=map("q$_",(0..15)); { my ($key,$rounds,$const)=("r4","r5","r6"); sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } sub Sbox { # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb my @b=@_[0..7]; my @t=@_[8..11]; my @s=@_[12..15]; &InBasisChange (@b); &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s); &OutBasisChange (@b[7,1,4,2,6,5,0,3]); } sub InBasisChange { # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb my @b=@_[0..7]; $code.=<<___; veor @b[2], @b[2], @b[1] veor @b[5], @b[5], @b[6] veor @b[3], @b[3], @b[0] veor @b[6], @b[6], @b[2] veor @b[5], @b[5], @b[0] veor @b[6], @b[6], @b[3] veor @b[3], @b[3], @b[7] veor @b[7], @b[7], @b[5] veor @b[3], @b[3], @b[4] veor @b[4], @b[4], @b[5] veor @b[2], @b[2], @b[7] veor @b[3], @b[3], @b[1] veor @b[1], @b[1], @b[5] ___ } sub OutBasisChange { # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb my @b=@_[0..7]; $code.=<<___; veor @b[0], @b[0], @b[6] veor @b[1], @b[1], @b[4] veor @b[4], @b[4], @b[6] veor @b[2], @b[2], @b[0] veor @b[6], @b[6], @b[1] veor @b[1], @b[1], @b[5] veor @b[5], @b[5], @b[3] veor @b[3], @b[3], @b[7] veor @b[7], @b[7], @b[5] veor @b[2], @b[2], @b[5] veor @b[4], @b[4], @b[7] ___ } sub InvSbox { # input 
in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb my @b=@_[0..7]; my @t=@_[8..11]; my @s=@_[12..15]; &InvInBasisChange (@b); &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s); &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]); } sub InvInBasisChange { # OutBasisChange in reverse (with twist) my @b=@_[5,1,2,6,3,7,0,4]; $code.=<<___ veor @b[1], @b[1], @b[7] veor @b[4], @b[4], @b[7] veor @b[7], @b[7], @b[5] veor @b[1], @b[1], @b[3] veor @b[2], @b[2], @b[5] veor @b[3], @b[3], @b[7] veor @b[6], @b[6], @b[1] veor @b[2], @b[2], @b[0] veor @b[5], @b[5], @b[3] veor @b[4], @b[4], @b[6] veor @b[0], @b[0], @b[6] veor @b[1], @b[1], @b[4] ___ } sub InvOutBasisChange { # InBasisChange in reverse my @b=@_[2,5,7,3,6,1,0,4]; $code.=<<___; veor @b[1], @b[1], @b[5] veor @b[2], @b[2], @b[7] veor @b[3], @b[3], @b[1] veor @b[4], @b[4], @b[5] veor @b[7], @b[7], @b[5] veor @b[3], @b[3], @b[4] veor @b[5], @b[5], @b[0] veor @b[3], @b[3], @b[7] veor @b[6], @b[6], @b[2] veor @b[2], @b[2], @b[1] veor @b[6], @b[6], @b[3] veor @b[3], @b[3], @b[0] veor @b[5], @b[5], @b[6] ___ } sub Mul_GF4 { #;************************************************************* #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) * #;************************************************************* my ($x0,$x1,$y0,$y1,$t0,$t1)=@_; $code.=<<___; veor $t0, $y0, $y1 vand $t0, $t0, $x0 veor $x0, $x0, $x1 vand $t1, $x1, $y0 vand $x0, $x0, $y1 veor $x1, $t1, $t0 veor $x0, $x0, $t1 ___ } sub Mul_GF4_N { # not used, see next subroutine # multiply and scale by N my ($x0,$x1,$y0,$y1,$t0)=@_; $code.=<<___; veor $t0, $y0, $y1 vand $t0, $t0, $x0 veor $x0, $x0, $x1 vand $x1, $x1, $y0 vand $x0, $x0, $y1 veor $x1, $x1, $x0 veor $x0, $x0, $t0 ___ } sub Mul_GF4_N_GF4 { # interleaved Mul_GF4_N and Mul_GF4 my ($x0,$x1,$y0,$y1,$t0, $x2,$x3,$y2,$y3,$t1)=@_; $code.=<<___; veor $t0, $y0, $y1 veor $t1, $y2, $y3 vand $t0, $t0, $x0 vand $t1, $t1, $x2 veor $x0, $x0, $x1 veor $x2, $x2, $x3 vand $x1, $x1, $y0 vand $x3, $x3, $y2 vand $x0, $x0, $y1 vand $x2, $x2, $y3 veor $x1, $x1, $x0 veor $x2, $x2, $x3 veor $x0, $x0, $t0 veor $x3, $x3, $t1 ___ } sub Mul_GF16_2 { my @x=@_[0..7]; my @y=@_[8..11]; my @t=@_[12..15]; $code.=<<___; veor @t[0], @x[0], @x[2] veor @t[1], @x[1], @x[3] ___ &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2..3]); $code.=<<___; veor @y[0], @y[0], @y[2] veor @y[1], @y[1], @y[3] ___ Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], @x[2], @x[3], @y[2], @y[3], @t[2]); $code.=<<___; veor @x[0], @x[0], @t[0] veor @x[2], @x[2], @t[0] veor @x[1], @x[1], @t[1] veor @x[3], @x[3], @t[1] veor @t[0], @x[4], @x[6] veor @t[1], @x[5], @x[7] ___ &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], @x[6], @x[7], @y[2], @y[3], @t[2]); $code.=<<___; veor @y[0], @y[0], @y[2] veor @y[1], @y[1], @y[3] ___ &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[2..3]); $code.=<<___; veor @x[4], @x[4], @t[0] veor @x[6], @x[6], @t[0] veor @x[5], @x[5], @t[1] veor @x[7], @x[7], @t[1] ___ } sub Inv_GF256 { #;******************************************************************** #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) * #;******************************************************************** my @x=@_[0..7]; my @t=@_[8..11]; my @s=@_[12..15]; # direct optimizations from hardware $code.=<<___; veor @t[3], @x[4], @x[6] veor @t[2], @x[5], @x[7] veor @t[1], @x[1], @x[3] veor @s[1], @x[7], @x[6] vmov @t[0], @t[2] veor @s[0], @x[0], @x[2] vorr @t[2], @t[2], @t[1] veor @s[3], @t[3], @t[0] vand @s[2], @t[3], @s[0] vorr @t[3], @t[3], @s[0] veor @s[0], @s[0], @t[1] 
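@ (descriptive note: the AND/OR/XOR network in this block computes the shared
@ subexpressions for the bit-sliced field inversion; the reduced-size inversion
@ itself is the Inv_GF16 step noted below, followed by the Mul_GF16_2 call)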
vand @t[0], @t[0], @t[1] veor @t[1], @x[3], @x[2] vand @s[3], @s[3], @s[0] vand @s[1], @s[1], @t[1] veor @t[1], @x[4], @x[5] veor @s[0], @x[1], @x[0] veor @t[3], @t[3], @s[1] veor @t[2], @t[2], @s[1] vand @s[1], @t[1], @s[0] vorr @t[1], @t[1], @s[0] veor @t[3], @t[3], @s[3] veor @t[0], @t[0], @s[1] veor @t[2], @t[2], @s[2] veor @t[1], @t[1], @s[3] veor @t[0], @t[0], @s[2] vand @s[0], @x[7], @x[3] veor @t[1], @t[1], @s[2] vand @s[1], @x[6], @x[2] vand @s[2], @x[5], @x[1] vorr @s[3], @x[4], @x[0] veor @t[3], @t[3], @s[0] veor @t[1], @t[1], @s[2] veor @t[0], @t[0], @s[3] veor @t[2], @t[2], @s[1] @ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3 @ new smaller inversion vand @s[2], @t[3], @t[1] vmov @s[0], @t[0] veor @s[1], @t[2], @s[2] veor @s[3], @t[0], @s[2] veor @s[2], @t[0], @s[2] @ @s[2]=@s[3] vbsl @s[1], @t[1], @t[0] vbsl @s[3], @t[3], @t[2] veor @t[3], @t[3], @t[2] vbsl @s[0], @s[1], @s[2] vbsl @t[0], @s[2], @s[1] vand @s[2], @s[0], @s[3] veor @t[1], @t[1], @t[0] veor @s[2], @s[2], @t[3] ___ # output in s3, s2, s1, t1 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]); ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb } # AES linear components sub ShiftRows { my @x=@_[0..7]; my @t=@_[8..11]; my $mask=pop; $code.=<<___; vldmia $key!, {@t[0]-@t[3]} veor @t[0], @t[0], @x[0] veor @t[1], @t[1], @x[1] vtbl.8 `&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)` vtbl.8 `&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)` vldmia $key!, {@t[0]} veor @t[2], @t[2], @x[2] vtbl.8 `&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)` vtbl.8 `&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)` vldmia $key!, {@t[1]} veor @t[3], @t[3], @x[3] vtbl.8 `&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)` vtbl.8 `&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)` vldmia $key!, {@t[2]} vtbl.8 `&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)` vtbl.8 `&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)` vldmia $key!, {@t[3]} veor @t[0], @t[0], @x[4] veor @t[1], @t[1], @x[5] vtbl.8 `&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)` vtbl.8 `&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)` veor @t[2], @t[2], @x[6] vtbl.8 `&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)` vtbl.8 `&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)` veor @t[3], @t[3], @x[7] vtbl.8 `&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)` vtbl.8 `&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)` vtbl.8 `&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)` vtbl.8 `&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)` ___ } sub MixColumns { # modified to emit output in order suitable for feeding back to aesenc[last] my @x=@_[0..7]; my @t=@_[8..15]; my $inv=@_[16]; # optional $code.=<<___; vext.8 @t[0], @x[0], @x[0], #12 @ x0 <<< 32 vext.8 @t[1], @x[1], @x[1], #12 veor @x[0], @x[0], @t[0] @ x0 ^ (x0 <<< 32) vext.8 @t[2], @x[2], @x[2], #12 veor @x[1], @x[1], @t[1] vext.8 @t[3], @x[3], @x[3], #12 veor @x[2], @x[2], @t[2] vext.8 @t[4], @x[4], @x[4], #12 veor @x[3], @x[3], @t[3] vext.8 @t[5], @x[5], @x[5], #12 veor @x[4], @x[4], @t[4] vext.8 @t[6], @x[6], @x[6], #12 veor @x[5], @x[5], @t[5] vext.8 @t[7], @x[7], @x[7], #12 veor @x[6], @x[6], @t[6] veor @t[1], @t[1], @x[0] veor @x[7], @x[7], @t[7] vext.8 @x[0], @x[0], @x[0], #8 @ (x0 ^ (x0 <<< 32)) <<< 64) veor @t[2], @t[2], @x[1] veor @t[0], @t[0], @x[7] veor @t[1], @t[1], @x[7] vext.8 @x[1], @x[1], @x[1], #8 veor @t[5], @t[5], @x[4] veor @x[0], @x[0], @t[0] veor @t[6], @t[6], @x[5] veor @x[1], @x[1], @t[1] vext.8 @t[0], @x[4], @x[4], #8 veor @t[4], @t[4], @x[3] vext.8 @t[1], @x[5], @x[5], #8 veor @t[7], @t[7], @x[6] vext.8 @x[4], @x[3], @x[3], #8 
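@ (descriptive note: each column is mixed by xor-ing it with byte-rotated
@ copies of itself, as the inline x0 comments above show; the vext/veor tail
@ below also reorders the output registers, per the note at the top of MixColumns)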
veor @t[3], @t[3], @x[2] vext.8 @x[5], @x[7], @x[7], #8 veor @t[4], @t[4], @x[7] vext.8 @x[3], @x[6], @x[6], #8 veor @t[3], @t[3], @x[7] vext.8 @x[6], @x[2], @x[2], #8 veor @x[7], @t[1], @t[5] ___ $code.=<<___ if (!$inv); veor @x[2], @t[0], @t[4] veor @x[4], @x[4], @t[3] veor @x[5], @x[5], @t[7] veor @x[3], @x[3], @t[6] @ vmov @x[2], @t[0] veor @x[6], @x[6], @t[2] @ vmov @x[7], @t[1] ___ $code.=<<___ if ($inv); veor @t[3], @t[3], @x[4] veor @x[5], @x[5], @t[7] veor @x[2], @x[3], @t[6] veor @x[3], @t[0], @t[4] veor @x[4], @x[6], @t[2] vmov @x[6], @t[3] @ vmov @x[7], @t[1] ___ } sub InvMixColumns_orig { my @x=@_[0..7]; my @t=@_[8..15]; $code.=<<___; @ multiplication by 0x0e vext.8 @t[7], @x[7], @x[7], #12 vmov @t[2], @x[2] veor @x[2], @x[2], @x[5] @ 2 5 veor @x[7], @x[7], @x[5] @ 7 5 vext.8 @t[0], @x[0], @x[0], #12 vmov @t[5], @x[5] veor @x[5], @x[5], @x[0] @ 5 0 [1] veor @x[0], @x[0], @x[1] @ 0 1 vext.8 @t[1], @x[1], @x[1], #12 veor @x[1], @x[1], @x[2] @ 1 25 veor @x[0], @x[0], @x[6] @ 01 6 [2] vext.8 @t[3], @x[3], @x[3], #12 veor @x[1], @x[1], @x[3] @ 125 3 [4] veor @x[2], @x[2], @x[0] @ 25 016 [3] veor @x[3], @x[3], @x[7] @ 3 75 veor @x[7], @x[7], @x[6] @ 75 6 [0] vext.8 @t[6], @x[6], @x[6], #12 vmov @t[4], @x[4] veor @x[6], @x[6], @x[4] @ 6 4 veor @x[4], @x[4], @x[3] @ 4 375 [6] veor @x[3], @x[3], @x[7] @ 375 756=36 veor @x[6], @x[6], @t[5] @ 64 5 [7] veor @x[3], @x[3], @t[2] @ 36 2 vext.8 @t[5], @t[5], @t[5], #12 veor @x[3], @x[3], @t[4] @ 362 4 [5] ___ my @y = @x[7,5,0,2,1,3,4,6]; $code.=<<___; @ multiplication by 0x0b veor @y[1], @y[1], @y[0] veor @y[0], @y[0], @t[0] vext.8 @t[2], @t[2], @t[2], #12 veor @y[1], @y[1], @t[1] veor @y[0], @y[0], @t[5] vext.8 @t[4], @t[4], @t[4], #12 veor @y[1], @y[1], @t[6] veor @y[0], @y[0], @t[7] veor @t[7], @t[7], @t[6] @ clobber t[7] veor @y[3], @y[3], @t[0] veor @y[1], @y[1], @y[0] vext.8 @t[0], @t[0], @t[0], #12 veor @y[2], @y[2], @t[1] veor @y[4], @y[4], @t[1] vext.8 @t[1], @t[1], @t[1], #12 veor @y[2], @y[2], @t[2] veor @y[3], @y[3], @t[2] veor @y[5], @y[5], @t[2] veor @y[2], @y[2], @t[7] vext.8 @t[2], @t[2], @t[2], #12 veor @y[3], @y[3], @t[3] veor @y[6], @y[6], @t[3] veor @y[4], @y[4], @t[3] veor @y[7], @y[7], @t[4] vext.8 @t[3], @t[3], @t[3], #12 veor @y[5], @y[5], @t[4] veor @y[7], @y[7], @t[7] veor @t[7], @t[7], @t[5] @ clobber t[7] even more veor @y[3], @y[3], @t[5] veor @y[4], @y[4], @t[4] veor @y[5], @y[5], @t[7] vext.8 @t[4], @t[4], @t[4], #12 veor @y[6], @y[6], @t[7] veor @y[4], @y[4], @t[7] veor @t[7], @t[7], @t[5] vext.8 @t[5], @t[5], @t[5], #12 @ multiplication by 0x0d veor @y[4], @y[4], @y[7] veor @t[7], @t[7], @t[6] @ restore t[7] veor @y[7], @y[7], @t[4] vext.8 @t[6], @t[6], @t[6], #12 veor @y[2], @y[2], @t[0] veor @y[7], @y[7], @t[5] vext.8 @t[7], @t[7], @t[7], #12 veor @y[2], @y[2], @t[2] veor @y[3], @y[3], @y[1] veor @y[1], @y[1], @t[1] veor @y[0], @y[0], @t[0] veor @y[3], @y[3], @t[0] veor @y[1], @y[1], @t[5] veor @y[0], @y[0], @t[5] vext.8 @t[0], @t[0], @t[0], #12 veor @y[1], @y[1], @t[7] veor @y[0], @y[0], @t[6] veor @y[3], @y[3], @y[1] veor @y[4], @y[4], @t[1] vext.8 @t[1], @t[1], @t[1], #12 veor @y[7], @y[7], @t[7] veor @y[4], @y[4], @t[2] veor @y[5], @y[5], @t[2] veor @y[2], @y[2], @t[6] veor @t[6], @t[6], @t[3] @ clobber t[6] vext.8 @t[2], @t[2], @t[2], #12 veor @y[4], @y[4], @y[7] veor @y[3], @y[3], @t[6] veor @y[6], @y[6], @t[6] veor @y[5], @y[5], @t[5] vext.8 @t[5], @t[5], @t[5], #12 veor @y[6], @y[6], @t[4] vext.8 @t[4], @t[4], @t[4], #12 veor @y[5], @y[5], @t[6] veor @y[6], @y[6], @t[7] vext.8 @t[7], @t[7], @t[7], 
#12 veor @t[6], @t[6], @t[3] @ restore t[6] vext.8 @t[3], @t[3], @t[3], #12 @ multiplication by 0x09 veor @y[4], @y[4], @y[1] veor @t[1], @t[1], @y[1] @ t[1]=y[1] veor @t[0], @t[0], @t[5] @ clobber t[0] vext.8 @t[6], @t[6], @t[6], #12 veor @t[1], @t[1], @t[5] veor @y[3], @y[3], @t[0] veor @t[0], @t[0], @y[0] @ t[0]=y[0] veor @t[1], @t[1], @t[6] veor @t[6], @t[6], @t[7] @ clobber t[6] veor @y[4], @y[4], @t[1] veor @y[7], @y[7], @t[4] veor @y[6], @y[6], @t[3] veor @y[5], @y[5], @t[2] veor @t[4], @t[4], @y[4] @ t[4]=y[4] veor @t[3], @t[3], @y[3] @ t[3]=y[3] veor @t[5], @t[5], @y[5] @ t[5]=y[5] veor @t[2], @t[2], @y[2] @ t[2]=y[2] veor @t[3], @t[3], @t[7] veor @XMM[5], @t[5], @t[6] veor @XMM[6], @t[6], @y[6] @ t[6]=y[6] veor @XMM[2], @t[2], @t[6] veor @XMM[7], @t[7], @y[7] @ t[7]=y[7] vmov @XMM[0], @t[0] vmov @XMM[1], @t[1] @ vmov @XMM[2], @t[2] vmov @XMM[3], @t[3] vmov @XMM[4], @t[4] @ vmov @XMM[5], @t[5] @ vmov @XMM[6], @t[6] @ vmov @XMM[7], @t[7] ___ } sub InvMixColumns { my @x=@_[0..7]; my @t=@_[8..15]; # Thanks to Jussi Kivilinna for providing pointer to # # | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 | # | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 | # | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 | # | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 | $code.=<<___; @ multiplication by 0x05-0x00-0x04-0x00 vext.8 @t[0], @x[0], @x[0], #8 vext.8 @t[6], @x[6], @x[6], #8 vext.8 @t[7], @x[7], @x[7], #8 veor @t[0], @t[0], @x[0] vext.8 @t[1], @x[1], @x[1], #8 veor @t[6], @t[6], @x[6] vext.8 @t[2], @x[2], @x[2], #8 veor @t[7], @t[7], @x[7] vext.8 @t[3], @x[3], @x[3], #8 veor @t[1], @t[1], @x[1] vext.8 @t[4], @x[4], @x[4], #8 veor @t[2], @t[2], @x[2] vext.8 @t[5], @x[5], @x[5], #8 veor @t[3], @t[3], @x[3] veor @t[4], @t[4], @x[4] veor @t[5], @t[5], @x[5] veor @x[0], @x[0], @t[6] veor @x[1], @x[1], @t[6] veor @x[2], @x[2], @t[0] veor @x[4], @x[4], @t[2] veor @x[3], @x[3], @t[1] veor @x[1], @x[1], @t[7] veor @x[2], @x[2], @t[7] veor @x[4], @x[4], @t[6] veor @x[5], @x[5], @t[3] veor @x[3], @x[3], @t[6] veor @x[6], @x[6], @t[4] veor @x[4], @x[4], @t[7] veor @x[5], @x[5], @t[7] veor @x[7], @x[7], @t[5] ___ &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6 } sub swapmove { my ($a,$b,$n,$mask,$t)=@_; $code.=<<___; vshr.u64 $t, $b, #$n veor $t, $t, $a vand $t, $t, $mask veor $a, $a, $t vshl.u64 $t, $t, #$n veor $b, $b, $t ___ } sub swapmove2x { my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_; $code.=<<___; vshr.u64 $t0, $b0, #$n vshr.u64 $t1, $b1, #$n veor $t0, $t0, $a0 veor $t1, $t1, $a1 vand $t0, $t0, $mask vand $t1, $t1, $mask veor $a0, $a0, $t0 vshl.u64 $t0, $t0, #$n veor $a1, $a1, $t1 vshl.u64 $t1, $t1, #$n veor $b0, $b0, $t0 veor $b1, $b1, $t1 ___ } sub bitslice { my @x=reverse(@_[0..7]); my ($t0,$t1,$t2,$t3)=@_[8..11]; $code.=<<___; vmov.i8 $t0,#0x55 @ compose .LBS0 vmov.i8 $t1,#0x33 @ compose .LBS1 ___ &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3); &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); $code.=<<___; vmov.i8 $t0,#0x0f @ compose .LBS2 ___ &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3); &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3); &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3); } $code.=<<___; #ifndef __KERNEL__ # define VFP_ABI_PUSH vstmdb sp!,{d8-d15} # define VFP_ABI_POP vldmia sp!,{d8-d15} # define VFP_ABI_FRAME 0x40 #else # define VFP_ABI_PUSH # define VFP_ABI_POP # define VFP_ABI_FRAME 0 # define BSAES_ASM_EXTENDED_KEY # define __ARM_MAX_ARCH__ 7 #endif #ifdef __thumb__ # define adrl adr #endif #if __ARM_MAX_ARCH__>=7 .arch armv7-a .fpu neon .text .syntax unified @ ARMv7-capable 
assembler is expected to handle this #if defined(__thumb2__) && !defined(__APPLE__) .thumb #else .code 32 # undef __thumb2__ #endif .type _bsaes_const,%object .align 6 _bsaes_const: .LM0ISR: @ InvShiftRows constants .quad 0x0a0e0206070b0f03, 0x0004080c0d010509 .LISR: .quad 0x0504070602010003, 0x0f0e0d0c080b0a09 .LISRM0: .quad 0x01040b0e0205080f, 0x0306090c00070a0d .LM0SR: @ ShiftRows constants .quad 0x0a0e02060f03070b, 0x0004080c05090d01 .LSR: .quad 0x0504070600030201, 0x0f0e0d0c0a09080b .LSRM0: .quad 0x0304090e00050a0f, 0x01060b0c0207080d .LM0: .quad 0x02060a0e03070b0f, 0x0004080c0105090d .LREVM0SR: .quad 0x090d01050c000408, 0x03070b0f060a0e02 .asciz "Bit-sliced AES for NEON, CRYPTOGAMS by " .align 6 .size _bsaes_const,.-_bsaes_const .type _bsaes_encrypt8,%function .align 4 _bsaes_encrypt8: adr $const,. vldmia $key!, {@XMM[9]} @ round 0 key #if defined(__thumb2__) || defined(__APPLE__) adr $const,.LM0SR #else sub $const,$const,#_bsaes_encrypt8-.LM0SR #endif vldmia $const!, {@XMM[8]} @ .LM0SR _bsaes_encrypt8_alt: veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key veor @XMM[11], @XMM[1], @XMM[9] vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])` vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])` veor @XMM[12], @XMM[2], @XMM[9] vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])` vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])` veor @XMM[13], @XMM[3], @XMM[9] vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])` vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])` veor @XMM[14], @XMM[4], @XMM[9] vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])` vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])` veor @XMM[15], @XMM[5], @XMM[9] vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])` vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])` veor @XMM[10], @XMM[6], @XMM[9] vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])` vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])` veor @XMM[11], @XMM[7], @XMM[9] vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])` vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])` vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])` vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])` _bsaes_encrypt8_bitslice: ___ &bitslice (@XMM[0..7, 8..11]); $code.=<<___; sub $rounds,$rounds,#1 b .Lenc_sbox .align 4 .Lenc_loop: ___ &ShiftRows (@XMM[0..7, 8..12]); $code.=".Lenc_sbox:\n"; &Sbox (@XMM[0..7, 8..15]); $code.=<<___; subs $rounds,$rounds,#1 bcc .Lenc_done ___ &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]); $code.=<<___; vldmia $const, {@XMM[12]} @ .LSR ite eq @ Thumb2 thing, samity check in ARM addeq $const,$const,#0x10 bne .Lenc_loop vldmia $const, {@XMM[12]} @ .LSRM0 b .Lenc_loop .align 4 .Lenc_done: ___ # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]); $code.=<<___; vldmia $key, {@XMM[8]} @ last round key veor @XMM[4], @XMM[4], @XMM[8] veor @XMM[6], @XMM[6], @XMM[8] veor @XMM[3], @XMM[3], @XMM[8] veor @XMM[7], @XMM[7], @XMM[8] veor @XMM[2], @XMM[2], @XMM[8] veor @XMM[5], @XMM[5], @XMM[8] veor @XMM[0], @XMM[0], @XMM[8] veor @XMM[1], @XMM[1], @XMM[8] bx lr .size _bsaes_encrypt8,.-_bsaes_encrypt8 ___ } { my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6"); sub bitslice_key { my @x=reverse(@_[0..7]); my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12]; &swapmove (@x[0,1],1,$bs0,$t2,$t3); $code.=<<___; @ &swapmove(@x[2,3],1,$t0,$t2,$t3); vmov @x[2], @x[0] vmov @x[3], @x[1] ___ #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3); $code.=<<___; @ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); vmov @x[4], @x[0] vmov @x[6], 
@x[2] vmov @x[5], @x[1] vmov @x[7], @x[3] ___ &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3); &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3); } $code.=<<___; .type _bsaes_key_convert,%function .align 4 _bsaes_key_convert: adr $const,. vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key #if defined(__thumb2__) || defined(__APPLE__) adr $const,.LM0 #else sub $const,$const,#_bsaes_key_convert-.LM0 #endif vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key vmov.i8 @XMM[8], #0x01 @ bit masks vmov.i8 @XMM[9], #0x02 vmov.i8 @XMM[10], #0x04 vmov.i8 @XMM[11], #0x08 vmov.i8 @XMM[12], #0x10 vmov.i8 @XMM[13], #0x20 vldmia $const, {@XMM[14]} @ .LM0 #ifdef __ARMEL__ vrev32.8 @XMM[7], @XMM[7] vrev32.8 @XMM[15], @XMM[15] #endif sub $rounds,$rounds,#1 vstmia $out!, {@XMM[7]} @ save round 0 key b .Lkey_loop .align 4 .Lkey_loop: vtbl.8 `&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])` vtbl.8 `&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])` vmov.i8 @XMM[6], #0x40 vmov.i8 @XMM[15], #0x80 vtst.8 @XMM[0], @XMM[7], @XMM[8] vtst.8 @XMM[1], @XMM[7], @XMM[9] vtst.8 @XMM[2], @XMM[7], @XMM[10] vtst.8 @XMM[3], @XMM[7], @XMM[11] vtst.8 @XMM[4], @XMM[7], @XMM[12] vtst.8 @XMM[5], @XMM[7], @XMM[13] vtst.8 @XMM[6], @XMM[7], @XMM[6] vtst.8 @XMM[7], @XMM[7], @XMM[15] vld1.8 {@XMM[15]}, [$inp]! @ load next round key vmvn @XMM[0], @XMM[0] @ "pnot" vmvn @XMM[1], @XMM[1] vmvn @XMM[5], @XMM[5] vmvn @XMM[6], @XMM[6] #ifdef __ARMEL__ vrev32.8 @XMM[15], @XMM[15] #endif subs $rounds,$rounds,#1 vstmia $out!,{@XMM[0]-@XMM[7]} @ write bit-sliced round key bne .Lkey_loop vmov.i8 @XMM[7],#0x63 @ compose .L63 @ don't save last round key bx lr .size _bsaes_key_convert,.-_bsaes_key_convert ___ } { my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10))); my $const = "r6"; # shared with _bsaes_encrypt8_alt my $keysched = "sp"; $code.=<<___; .global bsaes_ctr32_encrypt_blocks .type bsaes_ctr32_encrypt_blocks,%function .align 5 bsaes_ctr32_encrypt_blocks: @ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this @ out to retain a constant-time implementation. 
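	@ Arguments (AAPCS): r0 = in, r1 = out, r2 = number of 16-byte
	@ blocks, r3 = AES key schedule (number of rounds at offset 240);
	@ the fifth argument, a pointer to the 16-byte counter block, is
	@ passed on the caller's stack and loaded into r8 below.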
mov ip, sp stmdb sp!, {r4-r10, lr} VFP_ABI_PUSH ldr $ctr, [ip] @ ctr is 1st arg on the stack sub sp, sp, #0x10 @ scratch space to carry over the ctr mov $fp, sp @ save sp ldr $rounds, [$key, #240] @ get # of rounds #ifndef BSAES_ASM_EXTENDED_KEY @ allocate the key schedule on the stack sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key add r12, #`128-32` @ size of bit-sliced key schedule @ populate the key schedule mov r4, $key @ pass key mov r5, $rounds @ pass # of rounds mov sp, r12 @ sp is $keysched bl _bsaes_key_convert veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key vstmia r12, {@XMM[7]} @ save last round key vld1.8 {@XMM[0]}, [$ctr] @ load counter #ifdef __APPLE__ mov $ctr, #:lower16:(.LREVM0SR-.LM0) add $ctr, $const, $ctr #else add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr #endif vldmia $keysched, {@XMM[4]} @ load round0 key #else ldr r12, [$key, #244] eors r12, #1 beq 0f @ populate the key schedule str r12, [$key, #244] mov r4, $key @ pass key mov r5, $rounds @ pass # of rounds add r12, $key, #248 @ pass key schedule bl _bsaes_key_convert veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key vstmia r12, {@XMM[7]} @ save last round key .align 2 0: add r12, $key, #248 vld1.8 {@XMM[0]}, [$ctr] @ load counter adrl $ctr, .LREVM0SR @ borrow $ctr vldmia r12, {@XMM[4]} @ load round0 key sub sp, #0x10 @ place for adjusted round0 key #endif vmov.i32 @XMM[8],#1 @ compose 1<<96 veor @XMM[9],@XMM[9],@XMM[9] vrev32.8 @XMM[0],@XMM[0] vext.8 @XMM[8],@XMM[9],@XMM[8],#4 vrev32.8 @XMM[4],@XMM[4] vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96 vstmia $keysched, {@XMM[4]} @ save adjusted round0 key b .Lctr_enc_loop .align 4 .Lctr_enc_loop: vadd.u32 @XMM[10], @XMM[8], @XMM[9] @ compose 3<<96 vadd.u32 @XMM[1], @XMM[0], @XMM[8] @ +1 vadd.u32 @XMM[2], @XMM[0], @XMM[9] @ +2 vadd.u32 @XMM[3], @XMM[0], @XMM[10] @ +3 vadd.u32 @XMM[4], @XMM[1], @XMM[10] vadd.u32 @XMM[5], @XMM[2], @XMM[10] vadd.u32 @XMM[6], @XMM[3], @XMM[10] vadd.u32 @XMM[7], @XMM[4], @XMM[10] vadd.u32 @XMM[10], @XMM[5], @XMM[10] @ next counter @ Borrow prologue from _bsaes_encrypt8 to use the opportunity @ to flip byte order in 32-bit counter vldmia $keysched, {@XMM[9]} @ load round0 key #ifndef BSAES_ASM_EXTENDED_KEY add r4, $keysched, #0x10 @ pass next round key #else add r4, $key, #`248+16` #endif vldmia $ctr, {@XMM[8]} @ .LREVM0SR mov r5, $rounds @ pass rounds vstmia $fp, {@XMM[10]} @ save next counter #ifdef __APPLE__ mov $const, #:lower16:(.LREVM0SR-.LSR) sub $const, $ctr, $const #else sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants #endif bl _bsaes_encrypt8_alt subs $len, $len, #8 blo .Lctr_enc_loop_done vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ load input vld1.8 {@XMM[10]-@XMM[11]}, [$inp]! veor @XMM[0], @XMM[8] veor @XMM[1], @XMM[9] vld1.8 {@XMM[12]-@XMM[13]}, [$inp]! veor @XMM[4], @XMM[10] veor @XMM[6], @XMM[11] vld1.8 {@XMM[14]-@XMM[15]}, [$inp]! veor @XMM[3], @XMM[12] vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output veor @XMM[7], @XMM[13] veor @XMM[2], @XMM[14] vst1.8 {@XMM[4]}, [$out]! veor @XMM[5], @XMM[15] vst1.8 {@XMM[6]}, [$out]! vmov.i32 @XMM[8], #1 @ compose 1<<96 vst1.8 {@XMM[3]}, [$out]! veor @XMM[9], @XMM[9], @XMM[9] vst1.8 {@XMM[7]}, [$out]! vext.8 @XMM[8], @XMM[9], @XMM[8], #4 vst1.8 {@XMM[2]}, [$out]! vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96 vst1.8 {@XMM[5]}, [$out]! vldmia $fp, {@XMM[0]} @ load counter bne .Lctr_enc_loop b .Lctr_enc_done .align 4 .Lctr_enc_loop_done: add $len, $len, #8 vld1.8 {@XMM[8]}, [$inp]! @ load input veor @XMM[0], @XMM[8] vst1.8 {@XMM[0]}, [$out]! 
@ write output cmp $len, #2 blo .Lctr_enc_done vld1.8 {@XMM[9]}, [$inp]! veor @XMM[1], @XMM[9] vst1.8 {@XMM[1]}, [$out]! beq .Lctr_enc_done vld1.8 {@XMM[10]}, [$inp]! veor @XMM[4], @XMM[10] vst1.8 {@XMM[4]}, [$out]! cmp $len, #4 blo .Lctr_enc_done vld1.8 {@XMM[11]}, [$inp]! veor @XMM[6], @XMM[11] vst1.8 {@XMM[6]}, [$out]! beq .Lctr_enc_done vld1.8 {@XMM[12]}, [$inp]! veor @XMM[3], @XMM[12] vst1.8 {@XMM[3]}, [$out]! cmp $len, #6 blo .Lctr_enc_done vld1.8 {@XMM[13]}, [$inp]! veor @XMM[7], @XMM[13] vst1.8 {@XMM[7]}, [$out]! beq .Lctr_enc_done vld1.8 {@XMM[14]}, [$inp] veor @XMM[2], @XMM[14] vst1.8 {@XMM[2]}, [$out]! .Lctr_enc_done: vmov.i32 q0, #0 vmov.i32 q1, #0 #ifndef BSAES_ASM_EXTENDED_KEY .Lctr_enc_bzero: @ wipe key schedule [if any] vstmia $keysched!, {q0-q1} cmp $keysched, $fp bne .Lctr_enc_bzero #else vstmia $keysched, {q0-q1} #endif mov sp, $fp add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb VFP_ABI_POP ldmia sp!, {r4-r10, pc} @ return @ OpenSSL contains aes_nohw_* fallback code here. We patch this @ out to retain a constant-time implementation. .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks ___ } $code.=<<___; #endif ___ $code =~ s/\`([^\`]*)\`/eval($1)/gem; open SELF,$0; while() { next if (/^#!/); last if (!s/^#/@/ and !/^$/); print; } close SELF; print $code; close STDOUT or die "error closing STDOUT: $!"; ring-0.17.14/crypto/fipsmodule/aes/asm/ghash-armv4.pl000064400000000000000000000217121046102023000204350ustar 00000000000000#! /usr/bin/env perl # Copyright 2010-2018 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. # ==================================================================== # # April 2010 # # The module implements "4-bit" GCM GHASH function and underlying # single multiplication operation in GF(2^128). "4-bit" means that it # uses 256 bytes per-key table [+32 bytes shared table]. There is no # experimental performance data available yet. The only approximation # that can be made at this point is based on code size. Inner loop is # 32 instructions long and on single-issue core should execute in <40 # cycles. Having verified that gcc 3.4 didn't unroll corresponding # loop, this assembler loop body was found to be ~3x smaller than # compiler-generated one... # # July 2010 # # Rescheduling for dual-issue pipeline resulted in 8.5% improvement on # Cortex A8 core and ~25 cycles per processed byte (which was observed # to be ~3 times faster than gcc-generated code:-) # # February 2011 # # Profiler-assisted and platform-specific optimization resulted in 7% # improvement on Cortex A8 core and ~23.5 cycles per byte. # # March 2011 # # Add NEON implementation featuring polynomial multiplication, i.e. no # lookup tables involved. On Cortex A8 it was measured to process one # byte in 15 cycles or 55% faster than integer-only code. 
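#
# For reference, gcm_ghash_neon folds each 16-byte block Ii into the
# running value as
#
#	Xi <- (Xi xor Ii) * H	in GF(2^128),
#
# reduced modulo x^128 + x^7 + x^2 + x + 1, while gcm_gmult_neon performs
# the bare Xi <- Xi * H step; H is the hash key derived from the block
# cipher.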
# # April 2014 # # Switch to multiplication algorithm suggested in paper referred # below and combine it with reduction algorithm from x86 module. # Performance improvement over previous version varies from 65% on # Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8 # processes one byte in 8.45 cycles, A9 - in 10.2, A15 - in 7.63, # Snapdragon S4 - in 9.33. # # Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software # Polynomial Multiplication on ARM Processors using the NEON Engine. # # http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf # ==================================================================== # Note about "528B" variant. In ARM case it makes lesser sense to # implement it for following reasons: # # - performance improvement won't be anywhere near 50%, because 128- # bit shift operation is neatly fused with 128-bit xor here, and # "538B" variant would eliminate only 4-5 instructions out of 32 # in the inner loop (meaning that estimated improvement is ~15%); # - ARM-based systems are often embedded ones and extra memory # consumption might be unappreciated (for so little improvement); # # Byte order [in]dependence. ========================================= # # Caller is expected to maintain specific *dword* order in Htable, # namely with *least* significant dword of 128-bit value at *lower* # address. This differs completely from C code and has everything to # do with ldm instruction and order in which dwords are "consumed" by # algorithm. *Byte* order within these dwords in turn is whatever # *native* byte order on current platform. See gcm128.c for working # example... # This file was patched in BoringSSL to remove the variable-time 4-bit # implementation. $flavour = shift; if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } if ($flavour && $flavour ne "void") { $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; } else { open OUT,">$output"; *STDOUT=*OUT; } $Xi="r0"; # argument block $Htbl="r1"; $inp="r2"; $len="r3"; $code=<<___; @ Silence ARMv8 deprecated IT instruction warnings. This file is used by both @ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL @ instructions are in aesv8-armx.pl.) 
.arch armv7-a .text #if defined(__thumb2__) || defined(__clang__) .syntax unified #define ldrplb ldrbpl #define ldrneb ldrbne #endif #if defined(__thumb2__) .thumb #else .code 32 #endif ___ { my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3)); my ($t0,$t1,$t2,$t3)=map("q$_",(8..12)); my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31)); sub clmul64x64 { my ($r,$a,$b)=@_; $code.=<<___; vext.8 $t0#lo, $a, $a, #1 @ A1 vmull.p8 $t0, $t0#lo, $b @ F = A1*B vext.8 $r#lo, $b, $b, #1 @ B1 vmull.p8 $r, $a, $r#lo @ E = A*B1 vext.8 $t1#lo, $a, $a, #2 @ A2 vmull.p8 $t1, $t1#lo, $b @ H = A2*B vext.8 $t3#lo, $b, $b, #2 @ B2 vmull.p8 $t3, $a, $t3#lo @ G = A*B2 vext.8 $t2#lo, $a, $a, #3 @ A3 veor $t0, $t0, $r @ L = E + F vmull.p8 $t2, $t2#lo, $b @ J = A3*B vext.8 $r#lo, $b, $b, #3 @ B3 veor $t1, $t1, $t3 @ M = G + H vmull.p8 $r, $a, $r#lo @ I = A*B3 veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8 vand $t0#hi, $t0#hi, $k48 vext.8 $t3#lo, $b, $b, #4 @ B4 veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16 vand $t1#hi, $t1#hi, $k32 vmull.p8 $t3, $a, $t3#lo @ K = A*B4 veor $t2, $t2, $r @ N = I + J veor $t0#lo, $t0#lo, $t0#hi veor $t1#lo, $t1#lo, $t1#hi veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24 vand $t2#hi, $t2#hi, $k16 vext.8 $t0, $t0, $t0, #15 veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32 vmov.i64 $t3#hi, #0 vext.8 $t1, $t1, $t1, #14 veor $t2#lo, $t2#lo, $t2#hi vmull.p8 $r, $a, $b @ D = A*B vext.8 $t3, $t3, $t3, #12 vext.8 $t2, $t2, $t2, #13 veor $t0, $t0, $t1 veor $t2, $t2, $t3 veor $r, $r, $t0 veor $r, $r, $t2 ___ } $code.=<<___; #if __ARM_MAX_ARCH__>=7 .arch armv7-a .fpu neon .global gcm_init_neon .type gcm_init_neon,%function .align 4 gcm_init_neon: vld1.64 $IN#hi,[r1]! @ load H vmov.i8 $t0,#0xe1 vld1.64 $IN#lo,[r1] vshl.i64 $t0#hi,#57 vshr.u64 $t0#lo,#63 @ t0=0xc2....01 vdup.8 $t1,$IN#hi[7] vshr.u64 $Hlo,$IN#lo,#63 vshr.s8 $t1,#7 @ broadcast carry bit vshl.i64 $IN,$IN,#1 vand $t0,$t0,$t1 vorr $IN#hi,$Hlo @ H<<<=1 veor $IN,$IN,$t0 @ twisted H vstmia r0,{$IN} ret @ bx lr .size gcm_init_neon,.-gcm_init_neon .global gcm_gmult_neon .type gcm_gmult_neon,%function .align 4 gcm_gmult_neon: vld1.64 $IN#hi,[$Xi]! @ load Xi vld1.64 $IN#lo,[$Xi]! vmov.i64 $k48,#0x0000ffffffffffff vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H vmov.i64 $k32,#0x00000000ffffffff #ifdef __ARMEL__ vrev64.8 $IN,$IN #endif vmov.i64 $k16,#0x000000000000ffff veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing mov $len,#16 b .Lgmult_neon .size gcm_gmult_neon,.-gcm_gmult_neon .global gcm_ghash_neon .type gcm_ghash_neon,%function .align 4 gcm_ghash_neon: vld1.64 $Xl#hi,[$Xi]! @ load Xi vld1.64 $Xl#lo,[$Xi]! vmov.i64 $k48,#0x0000ffffffffffff vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H vmov.i64 $k32,#0x00000000ffffffff #ifdef __ARMEL__ vrev64.8 $Xl,$Xl #endif vmov.i64 $k16,#0x000000000000ffff veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing .Loop_neon: vld1.64 $IN#hi,[$inp]! @ load inp vld1.64 $IN#lo,[$inp]! 
#ifdef __ARMEL__ vrev64.8 $IN,$IN #endif veor $IN,$Xl @ inp^=Xi .Lgmult_neon: ___ &clmul64x64 ($Xl,$Hlo,"$IN#lo"); # H.lo·Xi.lo $code.=<<___; veor $IN#lo,$IN#lo,$IN#hi @ Karatsuba pre-processing ___ &clmul64x64 ($Xm,$Hhl,"$IN#lo"); # (H.lo+H.hi)·(Xi.lo+Xi.hi) &clmul64x64 ($Xh,$Hhi,"$IN#hi"); # H.hi·Xi.hi $code.=<<___; veor $Xm,$Xm,$Xl @ Karatsuba post-processing veor $Xm,$Xm,$Xh veor $Xl#hi,$Xl#hi,$Xm#lo veor $Xh#lo,$Xh#lo,$Xm#hi @ Xh|Xl - 256-bit result @ equivalent of reduction_avx from ghash-x86_64.pl vshl.i64 $t1,$Xl,#57 @ 1st phase vshl.i64 $t2,$Xl,#62 veor $t2,$t2,$t1 @ vshl.i64 $t1,$Xl,#63 veor $t2, $t2, $t1 @ veor $Xl#hi,$Xl#hi,$t2#lo @ veor $Xh#lo,$Xh#lo,$t2#hi vshr.u64 $t2,$Xl,#1 @ 2nd phase veor $Xh,$Xh,$Xl veor $Xl,$Xl,$t2 @ vshr.u64 $t2,$t2,#6 vshr.u64 $Xl,$Xl,#1 @ veor $Xl,$Xl,$Xh @ veor $Xl,$Xl,$t2 @ subs $len,#16 bne .Loop_neon #ifdef __ARMEL__ vrev64.8 $Xl,$Xl #endif sub $Xi,#16 vst1.64 $Xl#hi,[$Xi]! @ write out Xi vst1.64 $Xl#lo,[$Xi] ret @ bx lr .size gcm_ghash_neon,.-gcm_ghash_neon #endif ___ } $code.=<<___; .asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by " .align 2 ___ foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/geo; s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or s/\bret\b/bx lr/go or s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 print $_,"\n"; } close STDOUT or die "error closing STDOUT: $!"; # enforce flush ring-0.17.14/crypto/fipsmodule/aes/asm/ghash-neon-armv8.pl000064400000000000000000000236531046102023000214040ustar 00000000000000#! /usr/bin/env perl # Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. # ==================================================================== # This file was adapted to AArch64 from the 32-bit version in ghash-armv4.pl. It # implements the multiplication algorithm described in: # # Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software # Polynomial Multiplication on ARM Processors using the NEON Engine. # # http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf # # The main distinction to keep in mind between 32-bit NEON and AArch64 SIMD is # AArch64 cannot compute over the upper halves of SIMD registers. In 32-bit # NEON, the low and high halves of the 128-bit register q0 are accessible as # 64-bit registers d0 and d1, respectively. In AArch64, dN is the lower half of # vN. Where the 32-bit version would use the upper half, this file must keep # halves in separate registers. # # The other distinction is in syntax. 32-bit NEON embeds lane information in the # instruction name, while AArch64 uses suffixes on the registers. For instance, # left-shifting 64-bit lanes of a SIMD register in 32-bit would be written: # # vshl.i64 q0, q0, #1 # # in 64-bit, it would be written: # # shl v0.2d, v0.2d, #1 # # See Programmer's Guide for ARMv8-A, section 7 for details. 
# http://infocenter.arm.com/help/topic/com.arm.doc.den0024a/DEN0024A_v8_architecture_PG.pdf # # Finally, note the 8-bit and 64-bit polynomial multipliers in AArch64 differ # only by suffix. pmull vR.8h, vA.8b, vB.8b multiplies eight 8-bit polynomials # and is always available. pmull vR.1q, vA.1d, vB.1d multiplies a 64-bit # polynomial and is conditioned on the PMULL extension. This file emulates the # latter with the former. use strict; my $flavour = shift; my $output; if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } if ($flavour && $flavour ne "void") { $0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir = $1; my $xlate; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; } else { open OUT,">$output"; *STDOUT=*OUT; } my ($Xi, $Htbl, $inp, $len) = map("x$_", (0..3)); # argument block my ($Xl, $Xm, $Xh, $INlo, $INhi) = map("v$_", (0..4)); my ($Hlo, $Hhi, $Hhl) = map("v$_", (5..7)); # d8-d15 are callee-saved, so avoid v8-v15. AArch64 SIMD has plenty of registers # to spare. my ($t0, $t1, $t2, $t3) = map("v$_", (16..19)); my ($t0l_t1l, $t0h_t1h, $t2l_t3l, $t2h_t3h) = map("v$_", (20..23)); my ($k48_k32, $k16_k0) = map("v$_", (24..25)); my $code = ""; # clmul64x64 emits code which emulates pmull $r.1q, $a.1d, $b.1d. $r, $a, and $b # must be distinct from $t* and $k*. $t* are clobbered by the emitted code. sub clmul64x64 { my ($r, $a, $b) = @_; $code .= <<___; ext $t0.8b, $a.8b, $a.8b, #1 // A1 pmull $t0.8h, $t0.8b, $b.8b // F = A1*B ext $r.8b, $b.8b, $b.8b, #1 // B1 pmull $r.8h, $a.8b, $r.8b // E = A*B1 ext $t1.8b, $a.8b, $a.8b, #2 // A2 pmull $t1.8h, $t1.8b, $b.8b // H = A2*B ext $t3.8b, $b.8b, $b.8b, #2 // B2 pmull $t3.8h, $a.8b, $t3.8b // G = A*B2 ext $t2.8b, $a.8b, $a.8b, #3 // A3 eor $t0.16b, $t0.16b, $r.16b // L = E + F pmull $t2.8h, $t2.8b, $b.8b // J = A3*B ext $r.8b, $b.8b, $b.8b, #3 // B3 eor $t1.16b, $t1.16b, $t3.16b // M = G + H pmull $r.8h, $a.8b, $r.8b // I = A*B3 // Here we diverge from the 32-bit version. It computes the following // (instructions reordered for clarity): // // veor \$t0#lo, \$t0#lo, \$t0#hi @ t0 = P0 + P1 (L) // vand \$t0#hi, \$t0#hi, \$k48 // veor \$t0#lo, \$t0#lo, \$t0#hi // // veor \$t1#lo, \$t1#lo, \$t1#hi @ t1 = P2 + P3 (M) // vand \$t1#hi, \$t1#hi, \$k32 // veor \$t1#lo, \$t1#lo, \$t1#hi // // veor \$t2#lo, \$t2#lo, \$t2#hi @ t2 = P4 + P5 (N) // vand \$t2#hi, \$t2#hi, \$k16 // veor \$t2#lo, \$t2#lo, \$t2#hi // // veor \$t3#lo, \$t3#lo, \$t3#hi @ t3 = P6 + P7 (K) // vmov.i64 \$t3#hi, #0 // // \$kN is a mask with the bottom N bits set. AArch64 cannot compute on // upper halves of SIMD registers, so we must split each half into // separate registers. To compensate, we pair computations up and // parallelize. ext $t3.8b, $b.8b, $b.8b, #4 // B4 eor $t2.16b, $t2.16b, $r.16b // N = I + J pmull $t3.8h, $a.8b, $t3.8b // K = A*B4 // This can probably be scheduled more efficiently. For now, we just // pair up independent instructions. 
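	// The zip1/zip2 pairs below gather the low and the high 64-bit
	// halves of t0..t3 into shared registers so that the mask-and-fold
	// of the four partial products runs two at a time; the second set
	// of zips then restores the original per-product layout.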
zip1 $t0l_t1l.2d, $t0.2d, $t1.2d zip1 $t2l_t3l.2d, $t2.2d, $t3.2d zip2 $t0h_t1h.2d, $t0.2d, $t1.2d zip2 $t2h_t3h.2d, $t2.2d, $t3.2d eor $t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b eor $t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b and $t0h_t1h.16b, $t0h_t1h.16b, $k48_k32.16b and $t2h_t3h.16b, $t2h_t3h.16b, $k16_k0.16b eor $t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b eor $t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b zip1 $t0.2d, $t0l_t1l.2d, $t0h_t1h.2d zip1 $t2.2d, $t2l_t3l.2d, $t2h_t3h.2d zip2 $t1.2d, $t0l_t1l.2d, $t0h_t1h.2d zip2 $t3.2d, $t2l_t3l.2d, $t2h_t3h.2d ext $t0.16b, $t0.16b, $t0.16b, #15 // t0 = t0 << 8 ext $t1.16b, $t1.16b, $t1.16b, #14 // t1 = t1 << 16 pmull $r.8h, $a.8b, $b.8b // D = A*B ext $t3.16b, $t3.16b, $t3.16b, #12 // t3 = t3 << 32 ext $t2.16b, $t2.16b, $t2.16b, #13 // t2 = t2 << 24 eor $t0.16b, $t0.16b, $t1.16b eor $t2.16b, $t2.16b, $t3.16b eor $r.16b, $r.16b, $t0.16b eor $r.16b, $r.16b, $t2.16b ___ } $code .= <<___; .text .global gcm_init_neon .type gcm_init_neon,%function .align 4 gcm_init_neon: AARCH64_VALID_CALL_TARGET // This function is adapted from gcm_init_v8. xC2 is t3. ld1 {$t1.2d}, [x1] // load H movi $t3.16b, #0xe1 shl $t3.2d, $t3.2d, #57 // 0xc2.0 ext $INlo.16b, $t1.16b, $t1.16b, #8 ushr $t2.2d, $t3.2d, #63 dup $t1.4s, $t1.s[1] ext $t0.16b, $t2.16b, $t3.16b, #8 // t0=0xc2....01 ushr $t2.2d, $INlo.2d, #63 sshr $t1.4s, $t1.4s, #31 // broadcast carry bit and $t2.16b, $t2.16b, $t0.16b shl $INlo.2d, $INlo.2d, #1 ext $t2.16b, $t2.16b, $t2.16b, #8 and $t0.16b, $t0.16b, $t1.16b orr $INlo.16b, $INlo.16b, $t2.16b // H<<<=1 eor $Hlo.16b, $INlo.16b, $t0.16b // twisted H st1 {$Hlo.2d}, [x0] // store Htable[0] ret .size gcm_init_neon,.-gcm_init_neon .global gcm_gmult_neon .type gcm_gmult_neon,%function .align 4 gcm_gmult_neon: AARCH64_VALID_CALL_TARGET ld1 {$INlo.16b}, [$Xi] // load Xi ld1 {$Hlo.1d}, [$Htbl], #8 // load twisted H ld1 {$Hhi.1d}, [$Htbl] adrp x9, :pg_hi21:.Lmasks // load constants add x9, x9, :lo12:.Lmasks ld1 {$k48_k32.2d, $k16_k0.2d}, [x9] rev64 $INlo.16b, $INlo.16b // byteswap Xi ext $INlo.16b, $INlo.16b, $INlo.16b, #8 eor $Hhl.8b, $Hlo.8b, $Hhi.8b // Karatsuba pre-processing mov $len, #16 b .Lgmult_neon .size gcm_gmult_neon,.-gcm_gmult_neon .global gcm_ghash_neon .type gcm_ghash_neon,%function .align 4 gcm_ghash_neon: AARCH64_VALID_CALL_TARGET ld1 {$Xl.16b}, [$Xi] // load Xi ld1 {$Hlo.1d}, [$Htbl], #8 // load twisted H ld1 {$Hhi.1d}, [$Htbl] adrp x9, :pg_hi21:.Lmasks // load constants add x9, x9, :lo12:.Lmasks ld1 {$k48_k32.2d, $k16_k0.2d}, [x9] rev64 $Xl.16b, $Xl.16b // byteswap Xi ext $Xl.16b, $Xl.16b, $Xl.16b, #8 eor $Hhl.8b, $Hlo.8b, $Hhi.8b // Karatsuba pre-processing .Loop_neon: ld1 {$INlo.16b}, [$inp], #16 // load inp rev64 $INlo.16b, $INlo.16b // byteswap inp ext $INlo.16b, $INlo.16b, $INlo.16b, #8 eor $INlo.16b, $INlo.16b, $Xl.16b // inp ^= Xi .Lgmult_neon: // Split the input into $INlo and $INhi. (The upper halves are unused, // so it is okay to leave them alone.) ins $INhi.d[0], $INlo.d[1] ___ &clmul64x64 ($Xl, $Hlo, $INlo); # H.lo·Xi.lo $code .= <<___; eor $INlo.8b, $INlo.8b, $INhi.8b // Karatsuba pre-processing ___ &clmul64x64 ($Xm, $Hhl, $INlo); # (H.lo+H.hi)·(Xi.lo+Xi.hi) &clmul64x64 ($Xh, $Hhi, $INhi); # H.hi·Xi.hi $code .= <<___; ext $t0.16b, $Xl.16b, $Xh.16b, #8 eor $Xm.16b, $Xm.16b, $Xl.16b // Karatsuba post-processing eor $Xm.16b, $Xm.16b, $Xh.16b eor $Xm.16b, $Xm.16b, $t0.16b // Xm overlaps Xh.lo and Xl.hi ins $Xl.d[1], $Xm.d[0] // Xh|Xl - 256-bit result // This is a no-op due to the ins instruction below. 
// ins $Xh.d[0], $Xm.d[1] // equivalent of reduction_avx from ghash-x86_64.pl shl $t1.2d, $Xl.2d, #57 // 1st phase shl $t2.2d, $Xl.2d, #62 eor $t2.16b, $t2.16b, $t1.16b // shl $t1.2d, $Xl.2d, #63 eor $t2.16b, $t2.16b, $t1.16b // // Note Xm contains {Xl.d[1], Xh.d[0]}. eor $t2.16b, $t2.16b, $Xm.16b ins $Xl.d[1], $t2.d[0] // Xl.d[1] ^= t2.d[0] ins $Xh.d[0], $t2.d[1] // Xh.d[0] ^= t2.d[1] ushr $t2.2d, $Xl.2d, #1 // 2nd phase eor $Xh.16b, $Xh.16b,$Xl.16b eor $Xl.16b, $Xl.16b,$t2.16b // ushr $t2.2d, $t2.2d, #6 ushr $Xl.2d, $Xl.2d, #1 // eor $Xl.16b, $Xl.16b, $Xh.16b // eor $Xl.16b, $Xl.16b, $t2.16b // subs $len, $len, #16 bne .Loop_neon rev64 $Xl.16b, $Xl.16b // byteswap Xi and write ext $Xl.16b, $Xl.16b, $Xl.16b, #8 st1 {$Xl.16b}, [$Xi] ret .size gcm_ghash_neon,.-gcm_ghash_neon .section .rodata .align 4 .Lmasks: .quad 0x0000ffffffffffff // k48 .quad 0x00000000ffffffff // k32 .quad 0x000000000000ffff // k16 .quad 0x0000000000000000 // k0 .asciz "GHASH for ARMv8, derived from ARMv4 version by " .align 2 ___ foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/geo; print $_,"\n"; } close STDOUT or die "error closing STDOUT: $!"; # enforce flush ring-0.17.14/crypto/fipsmodule/aes/asm/ghash-x86.pl000064400000000000000000000364651046102023000200440ustar 00000000000000#! /usr/bin/env perl # Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. # ==================================================================== # # March, May, June 2010 # # The module implements "4-bit" GCM GHASH function and underlying # single multiplication operation in GF(2^128). "4-bit" means that it # uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two # code paths: vanilla x86 and vanilla SSE. Former will be executed on # 486 and Pentium, latter on all others. SSE GHASH features so called # "528B" variant of "4-bit" method utilizing additional 256+16 bytes # of per-key storage [+512 bytes shared table]. 
Performance results # are for streamed GHASH subroutine and are expressed in cycles per # processed byte, less is better: # # gcc 2.95.3(*) SSE assembler x86 assembler # # Pentium 105/111(**) - 50 # PIII 68 /75 12.2 24 # P4 125/125 17.8 84(***) # Opteron 66 /70 10.1 30 # Core2 54 /67 8.4 18 # Atom 105/105 16.8 53 # VIA Nano 69 /71 13.0 27 # # (*) gcc 3.4.x was observed to generate few percent slower code, # which is one of reasons why 2.95.3 results were chosen, # another reason is lack of 3.4.x results for older CPUs; # comparison with SSE results is not completely fair, because C # results are for vanilla "256B" implementation, while # assembler results are for "528B";-) # (**) second number is result for code compiled with -fPIC flag, # which is actually more relevant, because assembler code is # position-independent; # (***) see comment in non-MMX routine for further details; # # To summarize, it's >2-5 times faster than gcc-generated code. To # anchor it to something else SHA1 assembler processes one byte in # ~7 cycles on contemporary x86 cores. As for choice of MMX/SSE # in particular, see comment at the end of the file... # May 2010 # # Add PCLMULQDQ version performing at 2.10 cycles per processed byte. # The question is how close is it to theoretical limit? The pclmulqdq # instruction latency appears to be 14 cycles and there can't be more # than 2 of them executing at any given time. This means that single # Karatsuba multiplication would take 28 cycles *plus* few cycles for # pre- and post-processing. Then multiplication has to be followed by # modulo-reduction. Given that aggregated reduction method [see # "Carry-less Multiplication and Its Usage for Computing the GCM Mode" # white paper by Intel] allows you to perform reduction only once in # a while we can assume that asymptotic performance can be estimated # as (28+Tmod/Naggr)/16, where Tmod is time to perform reduction # and Naggr is the aggregation factor. # # Before we proceed to this implementation let's have closer look at # the best-performing code suggested by Intel in their white paper. # By tracing inter-register dependencies Tmod is estimated as ~19 # cycles and Naggr chosen by Intel is 4, resulting in 2.05 cycles per # processed byte. As implied, this is quite optimistic estimate, # because it does not account for Karatsuba pre- and post-processing, # which for a single multiplication is ~5 cycles. Unfortunately Intel # does not provide performance data for GHASH alone. But benchmarking # AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt # alone resulted in 2.46 cycles per byte of out 16KB buffer. Note that # the result accounts even for pre-computing of degrees of the hash # key H, but its portion is negligible at 16KB buffer size. # # Moving on to the implementation in question. Tmod is estimated as # ~13 cycles and Naggr is 2, giving asymptotic performance of ... # 2.16. How is it possible that measured performance is better than # optimistic theoretical estimate? There is one thing Intel failed # to recognize. By serializing GHASH with CTR in same subroutine # former's performance is really limited to above (Tmul + Tmod/Naggr) # equation. But if GHASH procedure is detached, the modulo-reduction # can be interleaved with Naggr-1 multiplications at instruction level # and under ideal conditions even disappear from the equation. So that # optimistic theoretical estimate for this implementation is ... # 28/16=1.75, and not 2.16. 
Well, it's probably way too optimistic, # at least for such small Naggr. I'd argue that (28+Tproc/Naggr), # where Tproc is time required for Karatsuba pre- and post-processing, # is more realistic estimate. In this case it gives ... 1.91 cycles. # Or in other words, depending on how well we can interleave reduction # and one of the two multiplications the performance should be between # 1.91 and 2.16. As already mentioned, this implementation processes # one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart # - in 2.02. x86_64 performance is better, because larger register # bank allows to interleave reduction and multiplication better. # # Does it make sense to increase Naggr? To start with it's virtually # impossible in 32-bit mode, because of limited register bank # capacity. Otherwise improvement has to be weighed against slower # setup, as well as code size and complexity increase. As even # optimistic estimate doesn't promise 30% performance improvement, # there are currently no plans to increase Naggr. # # Special thanks to David Woodhouse for providing access to a # Westmere-based system on behalf of Intel Open Source Technology Centre. # January 2010 # # Tweaked to optimize transitions between integer and FP operations # on same XMM register, PCLMULQDQ subroutine was measured to process # one byte in 2.07 cycles on Sandy Bridge, and in 2.12 - on Westmere. # The minor regression on Westmere is outweighed by ~15% improvement # on Sandy Bridge. Strangely enough attempt to modify 64-bit code in # similar manner resulted in almost 20% degradation on Sandy Bridge, # where original 64-bit code processes one byte in 1.95 cycles. ##################################################################### # For reference, AMD Bulldozer processes one byte in 1.98 cycles in # 32-bit mode and 1.89 in 64-bit. # February 2013 # # Overhaul: aggregate Karatsuba post-processing, improve ILP in # reduction_alg9. Resulting performance is 1.96 cycles per byte on # Westmere, 1.95 - on Sandy/Ivy Bridge, 1.76 - on Bulldozer. # This file was patched in BoringSSL to remove the variable-time 4-bit # implementation. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../../perlasm"); require "x86asm.pl"; $output=pop; open STDOUT,">$output"; &asm_init($ARGV[0]); $x86only=0; $sse2=1; if (!$x86only) {{{ if ($sse2) {{ ###################################################################### # PCLMULQDQ version. $Xip="eax"; $Htbl="edx"; $const="ecx"; $inp="esi"; $len="ebx"; ($Xi,$Xhi)=("xmm0","xmm1"); $Hkey="xmm2"; ($T1,$T2,$T3)=("xmm3","xmm4","xmm5"); ($Xn,$Xhn)=("xmm6","xmm7"); &static_label("bswap"); sub clmul64x64_T2 { # minimal "register" pressure my ($Xhi,$Xi,$Hkey,$HK)=@_; &movdqa ($Xhi,$Xi); # &pshufd ($T1,$Xi,0b01001110); &pshufd ($T2,$Hkey,0b01001110) if (!defined($HK)); &pxor ($T1,$Xi); # &pxor ($T2,$Hkey) if (!defined($HK)); $HK=$T2 if (!defined($HK)); &pclmulqdq ($Xi,$Hkey,0x00); ####### &pclmulqdq ($Xhi,$Hkey,0x11); ####### &pclmulqdq ($T1,$HK,0x00); ####### &xorps ($T1,$Xi); # &xorps ($T1,$Xhi); # &movdqa ($T2,$T1); # &psrldq ($T1,8); &pslldq ($T2,8); # &pxor ($Xhi,$T1); &pxor ($Xi,$T2); # } if (1) { # Algorithm 9 with <<1 twist. # Reduction is shorter and uses only two # temporary registers, which makes it better # candidate for interleaving with 64x64 # multiplication. Pre-modulo-scheduled loop # was found to be ~20% faster than Algorithm 5 # below. Algorithm 9 was therefore chosen for # further optimization... 
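# The reduction folds the 256-bit Karatsuba product Xhi:Xi back to 128
# bits modulo the GCM polynomial x^128+x^7+x^2+x+1.  With the
# bit-reflected representation and the <<1 twist applied in
# gcm_init_clmul, this takes only shifts and xors: the 1st phase xors in
# Xi shifted left by 57, 62 and 63 bit positions, and the 2nd phase
# folds Xhi back in using right shifts by 1, 2 and 7.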
sub reduction_alg9 { # 17/11 times faster than Intel version my ($Xhi,$Xi) = @_; # 1st phase &movdqa ($T2,$Xi); # &movdqa ($T1,$Xi); &psllq ($Xi,5); &pxor ($T1,$Xi); # &psllq ($Xi,1); &pxor ($Xi,$T1); # &psllq ($Xi,57); # &movdqa ($T1,$Xi); # &pslldq ($Xi,8); &psrldq ($T1,8); # &pxor ($Xi,$T2); &pxor ($Xhi,$T1); # # 2nd phase &movdqa ($T2,$Xi); &psrlq ($Xi,1); &pxor ($Xhi,$T2); # &pxor ($T2,$Xi); &psrlq ($Xi,5); &pxor ($Xi,$T2); # &psrlq ($Xi,1); # &pxor ($Xi,$Xhi) # } &function_begin_B("gcm_init_clmul"); &mov ($Htbl,&wparam(0)); &mov ($Xip,&wparam(1)); &call (&label("pic")); &set_label("pic"); &blindpop ($const); &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); &movdqu ($Hkey,&QWP(0,$Xip)); &pshufd ($Hkey,$Hkey,0b01001110);# dword swap # <<1 twist &pshufd ($T2,$Hkey,0b11111111); # broadcast uppermost dword &movdqa ($T1,$Hkey); &psllq ($Hkey,1); &pxor ($T3,$T3); # &psrlq ($T1,63); &pcmpgtd ($T3,$T2); # broadcast carry bit &pslldq ($T1,8); &por ($Hkey,$T1); # H<<=1 # magic reduction &pand ($T3,&QWP(16,$const)); # 0x1c2_polynomial &pxor ($Hkey,$T3); # if(carry) H^=0x1c2_polynomial # calculate H^2 &movdqa ($Xi,$Hkey); &clmul64x64_T2 ($Xhi,$Xi,$Hkey); &reduction_alg9 ($Xhi,$Xi); &pshufd ($T1,$Hkey,0b01001110); &pshufd ($T2,$Xi,0b01001110); &pxor ($T1,$Hkey); # Karatsuba pre-processing &movdqu (&QWP(0,$Htbl),$Hkey); # save H &pxor ($T2,$Xi); # Karatsuba pre-processing &movdqu (&QWP(16,$Htbl),$Xi); # save H^2 &palignr ($T2,$T1,8); # low part is H.lo^H.hi &movdqu (&QWP(32,$Htbl),$T2); # save Karatsuba "salt" &ret (); &function_end_B("gcm_init_clmul"); &function_begin("gcm_ghash_clmul"); &mov ($Xip,&wparam(0)); &mov ($Htbl,&wparam(1)); &mov ($inp,&wparam(2)); &mov ($len,&wparam(3)); &call (&label("pic")); &set_label("pic"); &blindpop ($const); &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); &movdqu ($Xi,&QWP(0,$Xip)); &movdqa ($T3,&QWP(0,$const)); &movdqu ($Hkey,&QWP(0,$Htbl)); &pshufb ($Xi,$T3); &sub ($len,0x10); &jz (&label("odd_tail")); ####### # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = # [(H*Ii+1) + (H*Xi+1)] mod P = # [(H*Ii+1) + H^2*(Ii+Xi)] mod P # &movdqu ($T1,&QWP(0,$inp)); # Ii &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 &pshufb ($T1,$T3); &pshufb ($Xn,$T3); &movdqu ($T3,&QWP(32,$Htbl)); &pxor ($Xi,$T1); # Ii+Xi &pshufd ($T1,$Xn,0b01001110); # H*Ii+1 &movdqa ($Xhn,$Xn); &pxor ($T1,$Xn); # &lea ($inp,&DWP(32,$inp)); # i+=2 &pclmulqdq ($Xn,$Hkey,0x00); ####### &pclmulqdq ($Xhn,$Hkey,0x11); ####### &pclmulqdq ($T1,$T3,0x00); ####### &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 &nop (); &sub ($len,0x20); &jbe (&label("even_tail")); &jmp (&label("mod_loop")); &set_label("mod_loop",32); &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi) &movdqa ($Xhi,$Xi); &pxor ($T2,$Xi); # &nop (); &pclmulqdq ($Xi,$Hkey,0x00); ####### &pclmulqdq ($Xhi,$Hkey,0x11); ####### &pclmulqdq ($T2,$T3,0x10); ####### &movups ($Hkey,&QWP(0,$Htbl)); # load H &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) &movdqa ($T3,&QWP(0,$const)); &xorps ($Xhi,$Xhn); &movdqu ($Xhn,&QWP(0,$inp)); # Ii &pxor ($T1,$Xi); # aggregated Karatsuba post-processing &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 &pxor ($T1,$Xhi); # &pshufb ($Xhn,$T3); &pxor ($T2,$T1); # &movdqa ($T1,$T2); # &psrldq ($T2,8); &pslldq ($T1,8); # &pxor ($Xhi,$T2); &pxor ($Xi,$T1); # &pshufb ($Xn,$T3); &pxor ($Xhi,$Xhn); # "Ii+Xi", consume early &movdqa ($Xhn,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1 &movdqa ($T2,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase &movdqa ($T1,$Xi); &psllq ($Xi,5); &pxor ($T1,$Xi); # &psllq ($Xi,1); &pxor ($Xi,$T1); # &pclmulqdq ($Xn,$Hkey,0x00); 
####### &movups ($T3,&QWP(32,$Htbl)); &psllq ($Xi,57); # &movdqa ($T1,$Xi); # &pslldq ($Xi,8); &psrldq ($T1,8); # &pxor ($Xi,$T2); &pxor ($Xhi,$T1); # &pshufd ($T1,$Xhn,0b01001110); &movdqa ($T2,$Xi); # 2nd phase &psrlq ($Xi,1); &pxor ($T1,$Xhn); &pxor ($Xhi,$T2); # &pclmulqdq ($Xhn,$Hkey,0x11); ####### &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 &pxor ($T2,$Xi); &psrlq ($Xi,5); &pxor ($Xi,$T2); # &psrlq ($Xi,1); # &pxor ($Xi,$Xhi) # &pclmulqdq ($T1,$T3,0x00); ####### &lea ($inp,&DWP(32,$inp)); &sub ($len,0x20); &ja (&label("mod_loop")); &set_label("even_tail"); &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi) &movdqa ($Xhi,$Xi); &pxor ($T2,$Xi); # &pclmulqdq ($Xi,$Hkey,0x00); ####### &pclmulqdq ($Xhi,$Hkey,0x11); ####### &pclmulqdq ($T2,$T3,0x10); ####### &movdqa ($T3,&QWP(0,$const)); &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) &xorps ($Xhi,$Xhn); &pxor ($T1,$Xi); # aggregated Karatsuba post-processing &pxor ($T1,$Xhi); # &pxor ($T2,$T1); # &movdqa ($T1,$T2); # &psrldq ($T2,8); &pslldq ($T1,8); # &pxor ($Xhi,$T2); &pxor ($Xi,$T1); # &reduction_alg9 ($Xhi,$Xi); &test ($len,$len); &jnz (&label("done")); &movups ($Hkey,&QWP(0,$Htbl)); # load H &set_label("odd_tail"); &movdqu ($T1,&QWP(0,$inp)); # Ii &pshufb ($T1,$T3); &pxor ($Xi,$T1); # Ii+Xi &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi) &reduction_alg9 ($Xhi,$Xi); &set_label("done"); &pshufb ($Xi,$T3); &movdqu (&QWP(0,$Xip),$Xi); &function_end("gcm_ghash_clmul"); } &set_label("bswap",64); &data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); &data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial }} # $sse2 }}} # !$x86only &asciz("GHASH for x86, CRYPTOGAMS by "); &asm_finish(); close STDOUT or die "error closing STDOUT: $!"; # A question was risen about choice of vanilla MMX. Or rather why wasn't # SSE2 chosen instead? In addition to the fact that MMX runs on legacy # CPUs such as PIII, "4-bit" MMX version was observed to provide better # performance than *corresponding* SSE2 one even on contemporary CPUs. # SSE2 results were provided by Peter-Michael Hager. He maintains SSE2 # implementation featuring full range of lookup-table sizes, but with # per-invocation lookup table setup. Latter means that table size is # chosen depending on how much data is to be hashed in every given call, # more data - larger table. Best reported result for Core2 is ~4 cycles # per processed byte out of 64KB block. This number accounts even for # 64KB table setup overhead. As discussed in gcm128.c we choose to be # more conservative in respect to lookup table sizes, but how do the # results compare? Minimalistic "256B" MMX version delivers ~11 cycles # on same platform. As also discussed in gcm128.c, next in line "8-bit # Shoup's" or "4KB" method should deliver twice the performance of # "256B" one, in other words not worse than ~6 cycles per byte. It # should be also be noted that in SSE2 case improvement can be "super- # linear," i.e. more than twice, mostly because >>8 maps to single # instruction on SSE2 register. This is unlike "4-bit" case when >>4 # maps to same amount of instructions in both MMX and SSE2 cases. # Bottom line is that switch to SSE2 is considered to be justifiable # only in case we choose to implement "8-bit" method... ring-0.17.14/crypto/fipsmodule/aes/asm/ghash-x86_64.pl000064400000000000000000000720171046102023000203460ustar 00000000000000#! /usr/bin/env perl # Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. # ==================================================================== # # March, June 2010 # # The module implements "4-bit" GCM GHASH function and underlying # single multiplication operation in GF(2^128). "4-bit" means that # it uses 256 bytes per-key table [+128 bytes shared table]. GHASH # function features so called "528B" variant utilizing additional # 256+16 bytes of per-key storage [+512 bytes shared table]. # Performance results are for this streamed GHASH subroutine and are # expressed in cycles per processed byte, less is better: # # gcc 3.4.x(*) assembler # # P4 28.6 14.0 +100% # Opteron 19.3 7.7 +150% # Core2 17.8 8.1(**) +120% # Atom 31.6 16.8 +88% # VIA Nano 21.8 10.1 +115% # # (*) comparison is not completely fair, because C results are # for vanilla "256B" implementation, while assembler results # are for "528B";-) # (**) it's mystery [to me] why Core2 result is not same as for # Opteron; # May 2010 # # Add PCLMULQDQ version performing at 2.02 cycles per processed byte. # See ghash-x86.pl for background information and details about coding # techniques. # # Special thanks to David Woodhouse for providing access to a # Westmere-based system on behalf of Intel Open Source Technology Centre. # December 2012 # # Overhaul: aggregate Karatsuba post-processing, improve ILP in # reduction_alg9, increase reduction aggregate factor to 4x. As for # the latter. ghash-x86.pl discusses that it makes lesser sense to # increase aggregate factor. Then why increase here? Critical path # consists of 3 independent pclmulqdq instructions, Karatsuba post- # processing and reduction. "On top" of this we lay down aggregated # multiplication operations, triplets of independent pclmulqdq's. As # issue rate for pclmulqdq is limited, it makes lesser sense to # aggregate more multiplications than it takes to perform remaining # non-multiplication operations. 2x is near-optimal coefficient for # contemporary Intel CPUs (therefore modest improvement coefficient), # but not for Bulldozer. Latter is because logical SIMD operations # are twice as slow in comparison to Intel, so that critical path is # longer. A CPU with higher pclmulqdq issue rate would also benefit # from higher aggregate factor... # # Westmere 1.78(+13%) # Sandy Bridge 1.80(+8%) # Ivy Bridge 1.80(+7%) # Haswell 0.55(+93%) (if system doesn't support AVX) # Broadwell 0.45(+110%)(if system doesn't support AVX) # Skylake 0.44(+110%)(if system doesn't support AVX) # Bulldozer 1.49(+27%) # Silvermont 2.88(+13%) # Knights L 2.12(-) (if system doesn't support AVX) # Goldmont 1.08(+24%) # March 2013 # # ... 8x aggregate factor AVX code path is using reduction algorithm # suggested by Shay Gueron[1]. Even though contemporary AVX-capable # CPUs such as Sandy and Ivy Bridge can execute it, the code performs # sub-optimally in comparison to above mentioned version. 
But thanks # to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that # it performs in 0.41 cycles per byte on Haswell processor, in # 0.29 on Broadwell, and in 0.36 on Skylake. # # Knights Landing achieves 1.09 cpb. # # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest # This file was patched in BoringSSL to remove the variable-time 4-bit # implementation. $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; # See the notes about |$avx| in aesni-gcm-x86_64.pl; otherwise tags will be # computed incorrectly. # # In upstream, this is controlled by shelling out to the compiler to check # versions, but BoringSSL is intended to be used with pre-generated perlasm # output, so this isn't useful anyway. $avx = 1; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; $do4xaggr=1; $code=<<___; .text ___ ###################################################################### # PCLMULQDQ version. @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order ("%rdi","%rsi","%rdx","%rcx"); # Unix order ($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2"; ($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5"); sub clmul64x64_T2 { # minimal register pressure my ($Xhi,$Xi,$Hkey,$HK)=@_; if (!defined($HK)) { $HK = $T2; $code.=<<___; movdqa $Xi,$Xhi # pshufd \$0b01001110,$Xi,$T1 pshufd \$0b01001110,$Hkey,$T2 pxor $Xi,$T1 # pxor $Hkey,$T2 ___ } else { $code.=<<___; movdqa $Xi,$Xhi # pshufd \$0b01001110,$Xi,$T1 pxor $Xi,$T1 # ___ } $code.=<<___; pclmulqdq \$0x00,$Hkey,$Xi ####### pclmulqdq \$0x11,$Hkey,$Xhi ####### pclmulqdq \$0x00,$HK,$T1 ####### pxor $Xi,$T1 # pxor $Xhi,$T1 # movdqa $T1,$T2 # psrldq \$8,$T1 pslldq \$8,$T2 # pxor $T1,$Xhi pxor $T2,$Xi # ___ } sub reduction_alg9 { # 17/11 times faster than Intel version my ($Xhi,$Xi) = @_; $code.=<<___; # 1st phase movdqa $Xi,$T2 # movdqa $Xi,$T1 psllq \$5,$Xi pxor $Xi,$T1 # psllq \$1,$Xi pxor $T1,$Xi # psllq \$57,$Xi # movdqa $Xi,$T1 # pslldq \$8,$Xi psrldq \$8,$T1 # pxor $T2,$Xi pxor $T1,$Xhi # # 2nd phase movdqa $Xi,$T2 psrlq \$1,$Xi pxor $T2,$Xhi # pxor $Xi,$T2 psrlq \$5,$Xi pxor $T2,$Xi # psrlq \$1,$Xi # pxor $Xhi,$Xi # ___ } { my ($Htbl,$Xip)=@_4args; my $HK="%xmm6"; $code.=<<___; .globl gcm_init_clmul .type gcm_init_clmul,\@abi-omnipotent .align 16 gcm_init_clmul: .cfi_startproc .seh_startproc _CET_ENDBR .L_init_clmul: ___ $code.=<<___ if ($win64); sub \$0x18,%rsp .seh_stackalloc 0x18 movaps %xmm6,(%rsp) .seh_savexmm %xmm6, 0 .seh_endprologue ___ $code.=<<___; movdqu ($Xip),$Hkey pshufd \$0b01001110,$Hkey,$Hkey # dword swap # <<1 twist pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword movdqa $Hkey,$T1 psllq \$1,$Hkey pxor $T3,$T3 # psrlq \$63,$T1 pcmpgtd $T2,$T3 # broadcast carry bit pslldq \$8,$T1 por $T1,$Hkey # H<<=1 # magic reduction pand .L0x1c2_polynomial(%rip),$T3 pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial # calculate H^2 pshufd \$0b01001110,$Hkey,$HK movdqa $Hkey,$Xi pxor $Hkey,$HK ___ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); &reduction_alg9 ($Xhi,$Xi); $code.=<<___; pshufd \$0b01001110,$Hkey,$T1 pshufd \$0b01001110,$Xi,$T2 pxor $Hkey,$T1 # Karatsuba pre-processing movdqu $Hkey,0x00($Htbl) # save H pxor $Xi,$T2 # Karatsuba pre-processing movdqu $Xi,0x10($Htbl) # save H^2 palignr \$8,$T1,$T2 # low part is 
H.lo^H.hi... movdqu $T2,0x20($Htbl) # save Karatsuba "salt" ___ if ($do4xaggr) { &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^3 &reduction_alg9 ($Xhi,$Xi); $code.=<<___; movdqa $Xi,$T3 ___ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^4 &reduction_alg9 ($Xhi,$Xi); $code.=<<___; pshufd \$0b01001110,$T3,$T1 pshufd \$0b01001110,$Xi,$T2 pxor $T3,$T1 # Karatsuba pre-processing movdqu $T3,0x30($Htbl) # save H^3 pxor $Xi,$T2 # Karatsuba pre-processing movdqu $Xi,0x40($Htbl) # save H^4 palignr \$8,$T1,$T2 # low part is H^3.lo^H^3.hi... movdqu $T2,0x50($Htbl) # save Karatsuba "salt" ___ } $code.=<<___ if ($win64); movaps (%rsp),%xmm6 lea 0x18(%rsp),%rsp ___ $code.=<<___; ret .cfi_endproc .seh_endproc .size gcm_init_clmul,.-gcm_init_clmul ___ } { my ($Xip,$Htbl,$inp,$len)=@_4args; my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7)); my ($T1,$T2,$T3)=map("%xmm$_",(8..10)); $code.=<<___; .globl gcm_ghash_clmul .type gcm_ghash_clmul,\@abi-omnipotent .align 32 gcm_ghash_clmul: .cfi_startproc .seh_startproc _CET_ENDBR .L_ghash_clmul: ___ $code.=<<___ if ($win64); lea -0x88(%rsp),%rax lea -0x20(%rax),%rsp .seh_stackalloc 0x20+0x88 movaps %xmm6,-0x20(%rax) .seh_savexmm %xmm6, 0x20-0x20 movaps %xmm7,-0x10(%rax) .seh_savexmm %xmm7, 0x20-0x10 movaps %xmm8,0(%rax) .seh_savexmm %xmm8, 0x20+0 movaps %xmm9,0x10(%rax) .seh_savexmm %xmm9, 0x20+0x10 movaps %xmm10,0x20(%rax) .seh_savexmm %xmm10, 0x20+0x20 movaps %xmm11,0x30(%rax) .seh_savexmm %xmm11, 0x20+0x30 movaps %xmm12,0x40(%rax) .seh_savexmm %xmm12, 0x20+0x40 movaps %xmm13,0x50(%rax) .seh_savexmm %xmm13, 0x20+0x50 movaps %xmm14,0x60(%rax) .seh_savexmm %xmm14, 0x20+0x60 movaps %xmm15,0x70(%rax) .seh_savexmm %xmm15, 0x20+0x70 .seh_endprologue ___ $code.=<<___; movdqa .Lbswap_mask(%rip),$T3 movdqu ($Xip),$Xi movdqu ($Htbl),$Hkey movdqu 0x20($Htbl),$HK pshufb $T3,$Xi sub \$0x10,$len jz .Lodd_tail movdqu 0x10($Htbl),$Hkey2 ___ if ($do4xaggr) { my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15)); $code.=<<___; cmp \$0x30,$len jb .Lskip4x sub \$0x30,$len mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff movdqu 0x30($Htbl),$Hkey3 movdqu 0x40($Htbl),$Hkey4 ####### # Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P # movdqu 0x30($inp),$Xln movdqu 0x20($inp),$Xl pshufb $T3,$Xln pshufb $T3,$Xl movdqa $Xln,$Xhn pshufd \$0b01001110,$Xln,$Xmn pxor $Xln,$Xmn pclmulqdq \$0x00,$Hkey,$Xln pclmulqdq \$0x11,$Hkey,$Xhn pclmulqdq \$0x00,$HK,$Xmn movdqa $Xl,$Xh pshufd \$0b01001110,$Xl,$Xm pxor $Xl,$Xm pclmulqdq \$0x00,$Hkey2,$Xl pclmulqdq \$0x11,$Hkey2,$Xh pclmulqdq \$0x10,$HK,$Xm xorps $Xl,$Xln xorps $Xh,$Xhn movups 0x50($Htbl),$HK xorps $Xm,$Xmn movdqu 0x10($inp),$Xl movdqu 0($inp),$T1 pshufb $T3,$Xl pshufb $T3,$T1 movdqa $Xl,$Xh pshufd \$0b01001110,$Xl,$Xm pxor $T1,$Xi pxor $Xl,$Xm pclmulqdq \$0x00,$Hkey3,$Xl movdqa $Xi,$Xhi pshufd \$0b01001110,$Xi,$T1 pxor $Xi,$T1 pclmulqdq \$0x11,$Hkey3,$Xh pclmulqdq \$0x00,$HK,$Xm xorps $Xl,$Xln xorps $Xh,$Xhn lea 0x40($inp),$inp sub \$0x40,$len jc .Ltail4x jmp .Lmod4_loop .align 32 .Lmod4_loop: pclmulqdq \$0x00,$Hkey4,$Xi xorps $Xm,$Xmn movdqu 0x30($inp),$Xl pshufb $T3,$Xl pclmulqdq \$0x11,$Hkey4,$Xhi xorps $Xln,$Xi movdqu 0x20($inp),$Xln movdqa $Xl,$Xh pclmulqdq \$0x10,$HK,$T1 pshufd \$0b01001110,$Xl,$Xm xorps $Xhn,$Xhi pxor $Xl,$Xm pshufb $T3,$Xln movups 0x20($Htbl),$HK xorps $Xmn,$T1 pclmulqdq \$0x00,$Hkey,$Xl pshufd \$0b01001110,$Xln,$Xmn pxor $Xi,$T1 # aggregated Karatsuba post-processing movdqa $Xln,$Xhn pxor $Xhi,$T1 # pxor $Xln,$Xmn movdqa $T1,$T2 # pclmulqdq \$0x11,$Hkey,$Xh pslldq \$8,$T1 psrldq \$8,$T2 # pxor $T1,$Xi movdqa 
.L7_mask(%rip),$T1 pxor $T2,$Xhi # movq %rax,$T2 pand $Xi,$T1 # 1st phase pshufb $T1,$T2 # pxor $Xi,$T2 # pclmulqdq \$0x00,$HK,$Xm psllq \$57,$T2 # movdqa $T2,$T1 # pslldq \$8,$T2 pclmulqdq \$0x00,$Hkey2,$Xln psrldq \$8,$T1 # pxor $T2,$Xi pxor $T1,$Xhi # movdqu 0($inp),$T1 movdqa $Xi,$T2 # 2nd phase psrlq \$1,$Xi pclmulqdq \$0x11,$Hkey2,$Xhn xorps $Xl,$Xln movdqu 0x10($inp),$Xl pshufb $T3,$Xl pclmulqdq \$0x10,$HK,$Xmn xorps $Xh,$Xhn movups 0x50($Htbl),$HK pshufb $T3,$T1 pxor $T2,$Xhi # pxor $Xi,$T2 psrlq \$5,$Xi movdqa $Xl,$Xh pxor $Xm,$Xmn pshufd \$0b01001110,$Xl,$Xm pxor $T2,$Xi # pxor $T1,$Xhi pxor $Xl,$Xm pclmulqdq \$0x00,$Hkey3,$Xl psrlq \$1,$Xi # pxor $Xhi,$Xi # movdqa $Xi,$Xhi pclmulqdq \$0x11,$Hkey3,$Xh xorps $Xl,$Xln pshufd \$0b01001110,$Xi,$T1 pxor $Xi,$T1 pclmulqdq \$0x00,$HK,$Xm xorps $Xh,$Xhn lea 0x40($inp),$inp sub \$0x40,$len jnc .Lmod4_loop .Ltail4x: pclmulqdq \$0x00,$Hkey4,$Xi pclmulqdq \$0x11,$Hkey4,$Xhi pclmulqdq \$0x10,$HK,$T1 xorps $Xm,$Xmn xorps $Xln,$Xi xorps $Xhn,$Xhi pxor $Xi,$Xhi # aggregated Karatsuba post-processing pxor $Xmn,$T1 pxor $Xhi,$T1 # pxor $Xi,$Xhi movdqa $T1,$T2 # psrldq \$8,$T1 pslldq \$8,$T2 # pxor $T1,$Xhi pxor $T2,$Xi # ___ &reduction_alg9($Xhi,$Xi); $code.=<<___; add \$0x40,$len jz .Ldone movdqu 0x20($Htbl),$HK sub \$0x10,$len jz .Lodd_tail .Lskip4x: ___ } $code.=<<___; ####### # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = # [(H*Ii+1) + (H*Xi+1)] mod P = # [(H*Ii+1) + H^2*(Ii+Xi)] mod P # movdqu ($inp),$T1 # Ii movdqu 16($inp),$Xln # Ii+1 pshufb $T3,$T1 pshufb $T3,$Xln pxor $T1,$Xi # Ii+Xi movdqa $Xln,$Xhn pshufd \$0b01001110,$Xln,$Xmn pxor $Xln,$Xmn pclmulqdq \$0x00,$Hkey,$Xln pclmulqdq \$0x11,$Hkey,$Xhn pclmulqdq \$0x00,$HK,$Xmn lea 32($inp),$inp # i+=2 nop sub \$0x20,$len jbe .Leven_tail nop jmp .Lmod_loop .align 32 .Lmod_loop: movdqa $Xi,$Xhi movdqa $Xmn,$T1 pshufd \$0b01001110,$Xi,$Xmn # pxor $Xi,$Xmn # pclmulqdq \$0x00,$Hkey2,$Xi pclmulqdq \$0x11,$Hkey2,$Xhi pclmulqdq \$0x10,$HK,$Xmn pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) pxor $Xhn,$Xhi movdqu ($inp),$T2 # Ii pxor $Xi,$T1 # aggregated Karatsuba post-processing pshufb $T3,$T2 movdqu 16($inp),$Xln # Ii+1 pxor $Xhi,$T1 pxor $T2,$Xhi # "Ii+Xi", consume early pxor $T1,$Xmn pshufb $T3,$Xln movdqa $Xmn,$T1 # psrldq \$8,$T1 pslldq \$8,$Xmn # pxor $T1,$Xhi pxor $Xmn,$Xi # movdqa $Xln,$Xhn # movdqa $Xi,$T2 # 1st phase movdqa $Xi,$T1 psllq \$5,$Xi pxor $Xi,$T1 # pclmulqdq \$0x00,$Hkey,$Xln ####### psllq \$1,$Xi pxor $T1,$Xi # psllq \$57,$Xi # movdqa $Xi,$T1 # pslldq \$8,$Xi psrldq \$8,$T1 # pxor $T2,$Xi pshufd \$0b01001110,$Xhn,$Xmn pxor $T1,$Xhi # pxor $Xhn,$Xmn # movdqa $Xi,$T2 # 2nd phase psrlq \$1,$Xi pclmulqdq \$0x11,$Hkey,$Xhn ####### pxor $T2,$Xhi # pxor $Xi,$T2 psrlq \$5,$Xi pxor $T2,$Xi # lea 32($inp),$inp psrlq \$1,$Xi # pclmulqdq \$0x00,$HK,$Xmn ####### pxor $Xhi,$Xi # sub \$0x20,$len ja .Lmod_loop .Leven_tail: movdqa $Xi,$Xhi movdqa $Xmn,$T1 pshufd \$0b01001110,$Xi,$Xmn # pxor $Xi,$Xmn # pclmulqdq \$0x00,$Hkey2,$Xi pclmulqdq \$0x11,$Hkey2,$Xhi pclmulqdq \$0x10,$HK,$Xmn pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) pxor $Xhn,$Xhi pxor $Xi,$T1 pxor $Xhi,$T1 pxor $T1,$Xmn movdqa $Xmn,$T1 # psrldq \$8,$T1 pslldq \$8,$Xmn # pxor $T1,$Xhi pxor $Xmn,$Xi # ___ &reduction_alg9 ($Xhi,$Xi); $code.=<<___; test $len,$len jnz .Ldone .Lodd_tail: movdqu ($inp),$T1 # Ii pshufb $T3,$T1 pxor $T1,$Xi # Ii+Xi ___ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H*(Ii+Xi) &reduction_alg9 ($Xhi,$Xi); $code.=<<___; .Ldone: pshufb $T3,$Xi movdqu $Xi,($Xip) ___ $code.=<<___ if ($win64); movaps (%rsp),%xmm6 movaps 0x10(%rsp),%xmm7 movaps 
0x20(%rsp),%xmm8 movaps 0x30(%rsp),%xmm9 movaps 0x40(%rsp),%xmm10 movaps 0x50(%rsp),%xmm11 movaps 0x60(%rsp),%xmm12 movaps 0x70(%rsp),%xmm13 movaps 0x80(%rsp),%xmm14 movaps 0x90(%rsp),%xmm15 lea 0xa8(%rsp),%rsp ___ $code.=<<___; ret .cfi_endproc .seh_endproc .size gcm_ghash_clmul,.-gcm_ghash_clmul ___ } $code.=<<___; .globl gcm_init_avx .type gcm_init_avx,\@abi-omnipotent .align 32 gcm_init_avx: .cfi_startproc .seh_startproc _CET_ENDBR ___ if ($avx) { my ($Htbl,$Xip)=@_4args; my $HK="%xmm6"; $code.=<<___ if ($win64); sub \$0x18,%rsp .seh_stackalloc 0x18 movaps %xmm6,(%rsp) .seh_savexmm %xmm6, 0 .seh_endprologue ___ $code.=<<___; vzeroupper vmovdqu ($Xip),$Hkey vpshufd \$0b01001110,$Hkey,$Hkey # dword swap # <<1 twist vpshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword vpsrlq \$63,$Hkey,$T1 vpsllq \$1,$Hkey,$Hkey vpxor $T3,$T3,$T3 # vpcmpgtd $T2,$T3,$T3 # broadcast carry bit vpslldq \$8,$T1,$T1 vpor $T1,$Hkey,$Hkey # H<<=1 # magic reduction vpand .L0x1c2_polynomial(%rip),$T3,$T3 vpxor $T3,$Hkey,$Hkey # if(carry) H^=0x1c2_polynomial vpunpckhqdq $Hkey,$Hkey,$HK vmovdqa $Hkey,$Xi vpxor $Hkey,$HK,$HK mov \$4,%r10 # up to H^8 jmp .Linit_start_avx ___ sub clmul64x64_avx { my ($Xhi,$Xi,$Hkey,$HK)=@_; if (!defined($HK)) { $HK = $T2; $code.=<<___; vpunpckhqdq $Xi,$Xi,$T1 vpunpckhqdq $Hkey,$Hkey,$T2 vpxor $Xi,$T1,$T1 # vpxor $Hkey,$T2,$T2 ___ } else { $code.=<<___; vpunpckhqdq $Xi,$Xi,$T1 vpxor $Xi,$T1,$T1 # ___ } $code.=<<___; vpclmulqdq \$0x11,$Hkey,$Xi,$Xhi ####### vpclmulqdq \$0x00,$Hkey,$Xi,$Xi ####### vpclmulqdq \$0x00,$HK,$T1,$T1 ####### vpxor $Xi,$Xhi,$T2 # vpxor $T2,$T1,$T1 # vpslldq \$8,$T1,$T2 # vpsrldq \$8,$T1,$T1 vpxor $T2,$Xi,$Xi # vpxor $T1,$Xhi,$Xhi ___ } sub reduction_avx { my ($Xhi,$Xi) = @_; $code.=<<___; vpsllq \$57,$Xi,$T1 # 1st phase vpsllq \$62,$Xi,$T2 vpxor $T1,$T2,$T2 # vpsllq \$63,$Xi,$T1 vpxor $T1,$T2,$T2 # vpslldq \$8,$T2,$T1 # vpsrldq \$8,$T2,$T2 vpxor $T1,$Xi,$Xi # vpxor $T2,$Xhi,$Xhi vpsrlq \$1,$Xi,$T2 # 2nd phase vpxor $Xi,$Xhi,$Xhi vpxor $T2,$Xi,$Xi # vpsrlq \$5,$T2,$T2 vpxor $T2,$Xi,$Xi # vpsrlq \$1,$Xi,$Xi # vpxor $Xhi,$Xi,$Xi # ___ } $code.=<<___; .align 32 .Linit_loop_avx: vpalignr \$8,$T1,$T2,$T3 # low part is H.lo^H.hi... 
vmovdqu $T3,-0x10($Htbl) # save Karatsuba "salt" ___ &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^3,5,7 &reduction_avx ($Xhi,$Xi); $code.=<<___; .Linit_start_avx: vmovdqa $Xi,$T3 ___ &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^2,4,6,8 &reduction_avx ($Xhi,$Xi); $code.=<<___; vpshufd \$0b01001110,$T3,$T1 vpshufd \$0b01001110,$Xi,$T2 vpxor $T3,$T1,$T1 # Karatsuba pre-processing vmovdqu $T3,0x00($Htbl) # save H^1,3,5,7 vpxor $Xi,$T2,$T2 # Karatsuba pre-processing vmovdqu $Xi,0x10($Htbl) # save H^2,4,6,8 lea 0x30($Htbl),$Htbl sub \$1,%r10 jnz .Linit_loop_avx vpalignr \$8,$T2,$T1,$T3 # last "salt" is flipped vmovdqu $T3,-0x10($Htbl) vzeroupper ___ $code.=<<___ if ($win64); movaps (%rsp),%xmm6 lea 0x18(%rsp),%rsp ___ $code.=<<___; ret .seh_endproc .cfi_endproc .size gcm_init_avx,.-gcm_init_avx ___ } else { $code.=<<___; jmp .L_init_clmul .size gcm_init_avx,.-gcm_init_avx ___ } $code.=<<___; .globl gcm_ghash_avx .type gcm_ghash_avx,\@abi-omnipotent .align 32 gcm_ghash_avx: .cfi_startproc .seh_startproc _CET_ENDBR ___ if ($avx) { my ($Xip,$Htbl,$inp,$len)=@_4args; my ($Xlo,$Xhi,$Xmi, $Zlo,$Zhi,$Zmi, $Hkey,$HK,$T1,$T2, $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15)); $code.=<<___ if ($win64); lea -0x88(%rsp),%rax lea -0x20(%rax),%rsp .seh_stackalloc 0x20+0x88 movaps %xmm6,-0x20(%rax) .seh_savexmm %xmm6, 0x20-0x20 movaps %xmm7,-0x10(%rax) .seh_savexmm %xmm7, 0x20-0x10 movaps %xmm8,0(%rax) .seh_savexmm %xmm8, 0x20+0 movaps %xmm9,0x10(%rax) .seh_savexmm %xmm9, 0x20+0x10 movaps %xmm10,0x20(%rax) .seh_savexmm %xmm10, 0x20+0x20 movaps %xmm11,0x30(%rax) .seh_savexmm %xmm11, 0x20+0x30 movaps %xmm12,0x40(%rax) .seh_savexmm %xmm12, 0x20+0x40 movaps %xmm13,0x50(%rax) .seh_savexmm %xmm13, 0x20+0x50 movaps %xmm14,0x60(%rax) .seh_savexmm %xmm14, 0x20+0x60 movaps %xmm15,0x70(%rax) .seh_savexmm %xmm15, 0x20+0x70 .seh_endprologue ___ $code.=<<___; vzeroupper vmovdqu ($Xip),$Xi # load $Xi lea .L0x1c2_polynomial(%rip),%r10 lea 0x40($Htbl),$Htbl # size optimization vmovdqu .Lbswap_mask(%rip),$bswap vpshufb $bswap,$Xi,$Xi cmp \$0x80,$len jb .Lshort_avx sub \$0x80,$len vmovdqu 0x70($inp),$Ii # I[7] vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 vpshufb $bswap,$Ii,$Ii vmovdqu 0x20-0x40($Htbl),$HK vpunpckhqdq $Ii,$Ii,$T2 vmovdqu 0x60($inp),$Ij # I[6] vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo vpxor $Ii,$T2,$T2 vpshufb $bswap,$Ij,$Ij vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 vpunpckhqdq $Ij,$Ij,$T1 vmovdqu 0x50($inp),$Ii # I[5] vpclmulqdq \$0x00,$HK,$T2,$Xmi vpxor $Ij,$T1,$T1 vpshufb $bswap,$Ii,$Ii vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo vpunpckhqdq $Ii,$Ii,$T2 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 vpxor $Ii,$T2,$T2 vmovdqu 0x40($inp),$Ij # I[4] vpclmulqdq \$0x10,$HK,$T1,$Zmi vmovdqu 0x50-0x40($Htbl),$HK vpshufb $bswap,$Ij,$Ij vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo vpxor $Xhi,$Zhi,$Zhi vpunpckhqdq $Ij,$Ij,$T1 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 vpxor $Xmi,$Zmi,$Zmi vpclmulqdq \$0x00,$HK,$T2,$Xmi vpxor $Ij,$T1,$T1 vmovdqu 0x30($inp),$Ii # I[3] vpxor $Zlo,$Xlo,$Xlo vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo vpxor $Zhi,$Xhi,$Xhi vpshufb $bswap,$Ii,$Ii vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 vpxor $Zmi,$Xmi,$Xmi vpunpckhqdq $Ii,$Ii,$T2 vpclmulqdq \$0x10,$HK,$T1,$Zmi vmovdqu 0x80-0x40($Htbl),$HK vpxor $Ii,$T2,$T2 vmovdqu 0x20($inp),$Ij # I[2] vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo vpxor $Xhi,$Zhi,$Zhi vpshufb $bswap,$Ij,$Ij vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi vmovdqu 
0x70-0x40($Htbl),$Hkey # $Hkey^6 vpxor $Xmi,$Zmi,$Zmi vpunpckhqdq $Ij,$Ij,$T1 vpclmulqdq \$0x00,$HK,$T2,$Xmi vpxor $Ij,$T1,$T1 vmovdqu 0x10($inp),$Ii # I[1] vpxor $Zlo,$Xlo,$Xlo vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo vpxor $Zhi,$Xhi,$Xhi vpshufb $bswap,$Ii,$Ii vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 vpxor $Zmi,$Xmi,$Xmi vpunpckhqdq $Ii,$Ii,$T2 vpclmulqdq \$0x10,$HK,$T1,$Zmi vmovdqu 0xb0-0x40($Htbl),$HK vpxor $Ii,$T2,$T2 vmovdqu ($inp),$Ij # I[0] vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo vpxor $Xhi,$Zhi,$Zhi vpshufb $bswap,$Ij,$Ij vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8 vpxor $Xmi,$Zmi,$Zmi vpclmulqdq \$0x10,$HK,$T2,$Xmi lea 0x80($inp),$inp cmp \$0x80,$len jb .Ltail_avx vpxor $Xi,$Ij,$Ij # accumulate $Xi sub \$0x80,$len jmp .Loop8x_avx .align 32 .Loop8x_avx: vpunpckhqdq $Ij,$Ij,$T1 vmovdqu 0x70($inp),$Ii # I[7] vpxor $Xlo,$Zlo,$Zlo vpxor $Ij,$T1,$T1 vpclmulqdq \$0x00,$Hkey,$Ij,$Xi vpshufb $bswap,$Ii,$Ii vpxor $Xhi,$Zhi,$Zhi vpclmulqdq \$0x11,$Hkey,$Ij,$Xo vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 vpunpckhqdq $Ii,$Ii,$T2 vpxor $Xmi,$Zmi,$Zmi vpclmulqdq \$0x00,$HK,$T1,$Tred vmovdqu 0x20-0x40($Htbl),$HK vpxor $Ii,$T2,$T2 vmovdqu 0x60($inp),$Ij # I[6] vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo vpxor $Zlo,$Xi,$Xi # collect result vpshufb $bswap,$Ij,$Ij vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi vxorps $Zhi,$Xo,$Xo vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 vpunpckhqdq $Ij,$Ij,$T1 vpclmulqdq \$0x00,$HK, $T2,$Xmi vpxor $Zmi,$Tred,$Tred vxorps $Ij,$T1,$T1 vmovdqu 0x50($inp),$Ii # I[5] vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo vpxor $Xo,$Tred,$Tred vpslldq \$8,$Tred,$T2 vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi vpsrldq \$8,$Tred,$Tred vpxor $T2, $Xi, $Xi vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 vpshufb $bswap,$Ii,$Ii vxorps $Tred,$Xo, $Xo vpxor $Xhi,$Zhi,$Zhi vpunpckhqdq $Ii,$Ii,$T2 vpclmulqdq \$0x10,$HK, $T1,$Zmi vmovdqu 0x50-0x40($Htbl),$HK vpxor $Ii,$T2,$T2 vpxor $Xmi,$Zmi,$Zmi vmovdqu 0x40($inp),$Ij # I[4] vpalignr \$8,$Xi,$Xi,$Tred # 1st phase vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo vpshufb $bswap,$Ij,$Ij vpxor $Zlo,$Xlo,$Xlo vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 vpunpckhqdq $Ij,$Ij,$T1 vpxor $Zhi,$Xhi,$Xhi vpclmulqdq \$0x00,$HK, $T2,$Xmi vxorps $Ij,$T1,$T1 vpxor $Zmi,$Xmi,$Xmi vmovdqu 0x30($inp),$Ii # I[3] vpclmulqdq \$0x10,(%r10),$Xi,$Xi vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo vpshufb $bswap,$Ii,$Ii vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 vpunpckhqdq $Ii,$Ii,$T2 vpxor $Xhi,$Zhi,$Zhi vpclmulqdq \$0x10,$HK, $T1,$Zmi vmovdqu 0x80-0x40($Htbl),$HK vpxor $Ii,$T2,$T2 vpxor $Xmi,$Zmi,$Zmi vmovdqu 0x20($inp),$Ij # I[2] vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo vpshufb $bswap,$Ij,$Ij vpxor $Zlo,$Xlo,$Xlo vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6 vpunpckhqdq $Ij,$Ij,$T1 vpxor $Zhi,$Xhi,$Xhi vpclmulqdq \$0x00,$HK, $T2,$Xmi vpxor $Ij,$T1,$T1 vpxor $Zmi,$Xmi,$Xmi vxorps $Tred,$Xi,$Xi vmovdqu 0x10($inp),$Ii # I[1] vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo vpshufb $bswap,$Ii,$Ii vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 vpclmulqdq \$0x10,(%r10),$Xi,$Xi vxorps $Xo,$Tred,$Tred vpunpckhqdq $Ii,$Ii,$T2 vpxor $Xhi,$Zhi,$Zhi vpclmulqdq \$0x10,$HK, $T1,$Zmi vmovdqu 0xb0-0x40($Htbl),$HK vpxor $Ii,$T2,$T2 vpxor $Xmi,$Zmi,$Zmi vmovdqu ($inp),$Ij # I[0] vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo vpshufb $bswap,$Ij,$Ij vpclmulqdq 
\$0x11,$Hkey,$Ii,$Xhi vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8 vpxor $Tred,$Ij,$Ij vpclmulqdq \$0x10,$HK, $T2,$Xmi vpxor $Xi,$Ij,$Ij # accumulate $Xi lea 0x80($inp),$inp sub \$0x80,$len jnc .Loop8x_avx add \$0x80,$len jmp .Ltail_no_xor_avx .align 32 .Lshort_avx: vmovdqu -0x10($inp,$len),$Ii # very last word lea ($inp,$len),$inp vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 vmovdqu 0x20-0x40($Htbl),$HK vpshufb $bswap,$Ii,$Ij vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo, vmovdqa $Xhi,$Zhi # $Zhi and vmovdqa $Xmi,$Zmi # $Zmi sub \$0x10,$len jz .Ltail_avx vpunpckhqdq $Ij,$Ij,$T1 vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo vpxor $Ij,$T1,$T1 vmovdqu -0x20($inp),$Ii vpxor $Xhi,$Zhi,$Zhi vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 vpshufb $bswap,$Ii,$Ij vpxor $Xmi,$Zmi,$Zmi vpclmulqdq \$0x00,$HK,$T1,$Xmi vpsrldq \$8,$HK,$HK sub \$0x10,$len jz .Ltail_avx vpunpckhqdq $Ij,$Ij,$T1 vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo vpxor $Ij,$T1,$T1 vmovdqu -0x30($inp),$Ii vpxor $Xhi,$Zhi,$Zhi vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 vpshufb $bswap,$Ii,$Ij vpxor $Xmi,$Zmi,$Zmi vpclmulqdq \$0x00,$HK,$T1,$Xmi vmovdqu 0x50-0x40($Htbl),$HK sub \$0x10,$len jz .Ltail_avx vpunpckhqdq $Ij,$Ij,$T1 vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo vpxor $Ij,$T1,$T1 vmovdqu -0x40($inp),$Ii vpxor $Xhi,$Zhi,$Zhi vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 vpshufb $bswap,$Ii,$Ij vpxor $Xmi,$Zmi,$Zmi vpclmulqdq \$0x00,$HK,$T1,$Xmi vpsrldq \$8,$HK,$HK sub \$0x10,$len jz .Ltail_avx vpunpckhqdq $Ij,$Ij,$T1 vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo vpxor $Ij,$T1,$T1 vmovdqu -0x50($inp),$Ii vpxor $Xhi,$Zhi,$Zhi vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 vpshufb $bswap,$Ii,$Ij vpxor $Xmi,$Zmi,$Zmi vpclmulqdq \$0x00,$HK,$T1,$Xmi vmovdqu 0x80-0x40($Htbl),$HK sub \$0x10,$len jz .Ltail_avx vpunpckhqdq $Ij,$Ij,$T1 vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo vpxor $Ij,$T1,$T1 vmovdqu -0x60($inp),$Ii vpxor $Xhi,$Zhi,$Zhi vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6 vpshufb $bswap,$Ii,$Ij vpxor $Xmi,$Zmi,$Zmi vpclmulqdq \$0x00,$HK,$T1,$Xmi vpsrldq \$8,$HK,$HK sub \$0x10,$len jz .Ltail_avx vpunpckhqdq $Ij,$Ij,$T1 vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo vpxor $Ij,$T1,$T1 vmovdqu -0x70($inp),$Ii vpxor $Xhi,$Zhi,$Zhi vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 vpshufb $bswap,$Ii,$Ij vpxor $Xmi,$Zmi,$Zmi vpclmulqdq \$0x00,$HK,$T1,$Xmi vmovq 0xb8-0x40($Htbl),$HK sub \$0x10,$len jmp .Ltail_avx .align 32 .Ltail_avx: vpxor $Xi,$Ij,$Ij # accumulate $Xi .Ltail_no_xor_avx: vpunpckhqdq $Ij,$Ij,$T1 vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo vpxor $Ij,$T1,$T1 vpxor $Xhi,$Zhi,$Zhi vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi vpxor $Xmi,$Zmi,$Zmi vpclmulqdq \$0x00,$HK,$T1,$Xmi vmovdqu (%r10),$Tred vpxor $Xlo,$Zlo,$Xi vpxor $Xhi,$Zhi,$Xo vpxor $Xmi,$Zmi,$Zmi vpxor $Xi, $Zmi,$Zmi # aggregated Karatsuba post-processing vpxor $Xo, $Zmi,$Zmi vpslldq \$8, $Zmi,$T2 vpsrldq \$8, $Zmi,$Zmi vpxor $T2, $Xi, $Xi vpxor $Zmi,$Xo, $Xo vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 1st phase vpalignr \$8,$Xi,$Xi,$Xi vpxor $T2,$Xi,$Xi vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 2nd phase vpalignr \$8,$Xi,$Xi,$Xi vpxor $Xo,$Xi,$Xi vpxor $T2,$Xi,$Xi cmp \$0,$len jne .Lshort_avx vpshufb $bswap,$Xi,$Xi vmovdqu $Xi,($Xip) vzeroupper ___ $code.=<<___ if ($win64); movaps (%rsp),%xmm6 movaps 0x10(%rsp),%xmm7 movaps 0x20(%rsp),%xmm8 movaps 0x30(%rsp),%xmm9 movaps 
0x40(%rsp),%xmm10 movaps 0x50(%rsp),%xmm11 movaps 0x60(%rsp),%xmm12 movaps 0x70(%rsp),%xmm13 movaps 0x80(%rsp),%xmm14 movaps 0x90(%rsp),%xmm15 lea 0xa8(%rsp),%rsp ___ $code.=<<___; ret .cfi_endproc .seh_endproc .size gcm_ghash_avx,.-gcm_ghash_avx ___ } else { $code.=<<___; jmp .L_ghash_clmul .size gcm_ghash_avx,.-gcm_ghash_avx ___ } $code.=<<___; .section .rodata .align 64 .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 .L0x1c2_polynomial: .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 .L7_mask: .long 7,0,7,0 .align 64 .asciz "GHASH for x86_64, CRYPTOGAMS by " .align 64 .text ___ $code =~ s/\`([^\`]*)\`/eval($1)/gem; print $code; close STDOUT or die "error closing STDOUT: $!"; ring-0.17.14/crypto/fipsmodule/aes/asm/ghashv8-armx.pl000064400000000000000000000216531046102023000206350ustar 00000000000000#! /usr/bin/env perl # Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. # ==================================================================== # # GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication. # # June 2014 # # Initial version was developed in tight cooperation with Ard # Biesheuvel of Linaro from bits-n-pieces from other assembly modules. # Just like aesv8-armx.pl this module supports both AArch32 and # AArch64 execution modes. # # July 2014 # # Implement 2x aggregated reduction [see ghash-x86.pl for background # information]. # # November 2017 # # AArch64 register bank to "accommodate" 4x aggregated reduction and # improve performance by 20-70% depending on processor. 
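#
# (The aggregation identity is the same one spelled out in the x86_64
# module: rather than reducing after every multiplication, four blocks are
# folded as Xi+4 = [(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P,
# so the reduction modulo the GHASH polynomial runs only once per four
# blocks.)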
# # Current performance in cycles per processed byte: # # 64-bit PMULL 32-bit PMULL 32-bit NEON(*) # Apple A7 0.58 0.92 5.62 # Cortex-A53 0.85 1.01 8.39 # Cortex-A57 0.73 1.17 7.61 # Denver 0.51 0.65 6.02 # Mongoose 0.65 1.10 8.06 # Kryo 0.76 1.16 8.00 # # (*) presented for reference/comparison purposes; $flavour = shift; $output = shift; $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; $Xi="x0"; # argument block $Htbl="x1"; $inp="x2"; $len="x3"; $inc="x12"; { my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3)); my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14)); $code=<<___; #if __ARM_MAX_ARCH__>=7 .text ___ $code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/); $code.=<<___ if ($flavour !~ /64/); .fpu neon .code 32 #undef __thumb2__ ___ ################################################################################ # void gcm_init_clmul(u128 Htable[16],const u64 H[2]); # # input: 128-bit H - secret parameter E(K,0^128) # output: precomputed table filled with degrees of twisted H; # H is twisted to handle reverse bitness of GHASH; # only few of 16 slots of Htable[16] are used; # data is opaque to outside world (which allows to # optimize the code independently); # $code.=<<___; .global gcm_init_clmul .type gcm_init_clmul,%function .align 4 gcm_init_clmul: AARCH64_VALID_CALL_TARGET vld1.64 {$t1},[x1] @ load input H vmov.i8 $xC2,#0xe1 vshl.i64 $xC2,$xC2,#57 @ 0xc2.0 vext.8 $IN,$t1,$t1,#8 vshr.u64 $t2,$xC2,#63 vdup.32 $t1,${t1}[1] vext.8 $t0,$t2,$xC2,#8 @ t0=0xc2....01 vshr.u64 $t2,$IN,#63 vshr.s32 $t1,$t1,#31 @ broadcast carry bit vand $t2,$t2,$t0 vshl.i64 $IN,$IN,#1 vext.8 $t2,$t2,$t2,#8 vand $t0,$t0,$t1 vorr $IN,$IN,$t2 @ H<<<=1 veor $H,$IN,$t0 @ twisted H vst1.64 {$H},[x0],#16 @ store Htable[0] @ calculate H^2 vext.8 $t0,$H,$H,#8 @ Karatsuba pre-processing vpmull.p64 $Xl,$H,$H veor $t0,$t0,$H vpmull2.p64 $Xh,$H,$H vpmull.p64 $Xm,$t0,$t0 vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing veor $t2,$Xl,$Xh veor $Xm,$Xm,$t1 veor $Xm,$Xm,$t2 vpmull.p64 $t2,$Xl,$xC2 @ 1st phase vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl veor $Xl,$Xm,$t2 vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase vpmull.p64 $Xl,$Xl,$xC2 veor $t2,$t2,$Xh veor $H2,$Xl,$t2 vext.8 $t1,$H2,$H2,#8 @ Karatsuba pre-processing veor $t1,$t1,$H2 vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed vst1.64 {$Hhl-$H2},[x0],#32 @ store Htable[1..2] ___ if ($flavour =~ /64/) { my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7)); $code.=<<___; @ calculate H^3 and H^4 vpmull.p64 $Xl,$H, $H2 vpmull.p64 $Yl,$H2,$H2 vpmull2.p64 $Xh,$H, $H2 vpmull2.p64 $Yh,$H2,$H2 vpmull.p64 $Xm,$t0,$t1 vpmull.p64 $Ym,$t1,$t1 vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing vext.8 $t1,$Yl,$Yh,#8 veor $t2,$Xl,$Xh veor $Xm,$Xm,$t0 veor $t3,$Yl,$Yh veor $Ym,$Ym,$t1 veor $Xm,$Xm,$t2 vpmull.p64 $t2,$Xl,$xC2 @ 1st phase veor $Ym,$Ym,$t3 vpmull.p64 $t3,$Yl,$xC2 vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result vmov $Yh#lo,$Ym#hi vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl vmov $Ym#hi,$Yl#lo veor $Xl,$Xm,$t2 veor $Yl,$Ym,$t3 vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase vext.8 $t3,$Yl,$Yl,#8 vpmull.p64 $Xl,$Xl,$xC2 vpmull.p64 $Yl,$Yl,$xC2 veor $t2,$t2,$Xh veor $t3,$t3,$Yh veor $H, $Xl,$t2 @ H^3 veor $H2,$Yl,$t3 @ H^4 vext.8 $t0,$H, $H,#8 @ Karatsuba pre-processing vext.8 $t1,$H2,$H2,#8 veor $t0,$t0,$H veor $t1,$t1,$H2 vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed vst1.64 
{$H-$H2},[x0] @ store Htable[3..5] ___ } $code.=<<___; ret .size gcm_init_clmul,.-gcm_init_clmul ___ ################################################################################ # void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]); # # input: Xi - current hash value; # Htable - table precomputed in gcm_init_clmul; # output: Xi - next hash value Xi; # $code.=<<___; .global gcm_gmult_clmul .type gcm_gmult_clmul,%function .align 4 gcm_gmult_clmul: AARCH64_VALID_CALL_TARGET vld1.64 {$t1},[$Xi] @ load Xi vmov.i8 $xC2,#0xe1 vld1.64 {$H-$Hhl},[$Htbl] @ load twisted H, ... vshl.u64 $xC2,$xC2,#57 #ifndef __ARMEB__ vrev64.8 $t1,$t1 #endif vext.8 $IN,$t1,$t1,#8 vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo veor $t1,$t1,$IN @ Karatsuba pre-processing vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing veor $t2,$Xl,$Xh veor $Xm,$Xm,$t1 veor $Xm,$Xm,$t2 vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl veor $Xl,$Xm,$t2 vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction vpmull.p64 $Xl,$Xl,$xC2 veor $t2,$t2,$Xh veor $Xl,$Xl,$t2 #ifndef __ARMEB__ vrev64.8 $Xl,$Xl #endif vext.8 $Xl,$Xl,$Xl,#8 vst1.64 {$Xl},[$Xi] @ write out Xi ret .size gcm_gmult_clmul,.-gcm_gmult_clmul ___ } $code.=<<___; .asciz "GHASH for ARMv8, CRYPTOGAMS by " .align 2 #endif ___ if ($flavour =~ /64/) { ######## 64-bit code sub unvmov { my $arg=shift; $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o && sprintf "ins v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1, $3<8?$3:$3+8,($4 eq "lo")?0:1; } foreach(split("\n",$code)) { s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or s/vmov\.i8/movi/o or # fix up legacy mnemonics s/vmov\s+(.*)/unvmov($1)/geo or s/vext\.8/ext/o or s/vshr\.s/sshr\.s/o or s/vshr/ushr/o or s/^(\s+)v/$1/o or # strip off v prefix s/\bbx\s+lr\b/ret/o; s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers s/@\s/\/\//o; # old->new style commentary # fix up remaining legacy suffixes s/\.[ui]?8(\s)/$1/o; s/\.[uis]?32//o and s/\.16b/\.4s/go; m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument m/l\.p64/o and s/\.16b/\.1d/go; # 2nd and 3rd pmull arguments s/\.[uisp]?64//o and s/\.16b/\.2d/go; s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o; # Switch preprocessor checks to aarch64 versions. s/__ARME([BL])__/__AARCH64E$1__/go; print $_,"\n"; } } else { ######## 32-bit code sub unvdup32 { my $arg=shift; $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o && sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1; } sub unvpmullp64 { my ($mnemonic,$arg)=@_; if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) { my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19) |(($2&7)<<17)|(($2&8)<<4) |(($3&7)<<1) |(($3&8)<<2); $word |= 0x00010001 if ($mnemonic =~ "2"); # since ARMv7 instructions are always encoded little-endian. 
# correct solution is to use .inst directive, but older # assemblers don't implement it:-( sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", $word&0xff,($word>>8)&0xff, ($word>>16)&0xff,($word>>24)&0xff, $mnemonic,$arg; } } foreach(split("\n",$code)) { s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers s/\/\/\s?/@ /o; # new->old style commentary # fix up remaining new-style suffixes s/\],#[0-9]+/]!/o; s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or s/vdup\.32\s+(.*)/unvdup32($1)/geo or s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or s/^(\s+)b\./$1b/o or s/^(\s+)ret/$1bx\tlr/o; print $_,"\n"; } } close STDOUT or die "error closing STDOUT: $!"; # enforce flush ring-0.17.14/crypto/fipsmodule/aes/asm/vpaes-armv7.pl000064400000000000000000000720131046102023000204640ustar 00000000000000#! /usr/bin/env perl # Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ###################################################################### ## Constant-time SSSE3 AES core implementation. ## version 0.1 ## ## By Mike Hamburg (Stanford University), 2009 ## Public domain. ## ## For details see http://shiftleft.org/papers/vector_aes/ and ## http://crypto.stanford.edu/vpaes/. ## ###################################################################### # Adapted from the original x86_64 version and 's ARMv8 # version. # # armv7, aarch64, and x86_64 differ in several ways: # # * x86_64 SSSE3 instructions are two-address (destination operand is also a # source), while NEON is three-address (destination operand is separate from # two sources). # # * aarch64 has 32 SIMD registers available, while x86_64 and armv7 have 16. # # * x86_64 instructions can take memory references, while ARM is a load/store # architecture. This means we sometimes need a spare register. # # * aarch64 and x86_64 have 128-bit byte shuffle instructions (tbl and pshufb), # while armv7 only has a 64-bit byte shuffle (vtbl). # # This means this armv7 version must be a mix of both aarch64 and x86_64 # implementations. armv7 and aarch64 have analogous SIMD instructions, so we # base the instructions on aarch64. However, we cannot use aarch64's register # allocation. x86_64's register count matches, but x86_64 is two-address. # vpaes-armv8.pl already accounts for this in the comments, which use # three-address AVX instructions instead of the original SSSE3 ones. We base # register usage on these comments, which are preserved in this file. # # This means we do not use separate input and output registers as in aarch64 and # cannot pin as many constants in the preheat functions. However, the load/store # architecture means we must still deviate from x86_64 in places. # # Next, we account for the byte shuffle instructions. vtbl takes 64-bit source # and destination and 128-bit table. Fortunately, armv7 also allows addressing # upper and lower halves of each 128-bit register. 
The lower half of q{N} is # d{2*N}. The upper half is d{2*N+1}. Instead of the following non-existent # instruction, # # vtbl.8 q0, q1, q2 @ Index each of q2's 16 bytes into q1. Store in q0. # # we write: # # vtbl.8 d0, q1, d4 @ Index each of d4's 8 bytes into q1. Store in d0. # vtbl.8 d1, q1, d5 @ Index each of d5's 8 bytes into q1. Store in d1. # # For readability, we write d0 and d1 as q0#lo and q0#hi, respectively and # post-process before outputting. (This is adapted from ghash-armv4.pl.) Note, # however, that destination (q0) and table (q1) registers may no longer match. # We adjust the register usage from x86_64 to avoid this. (Unfortunately, the # two-address pshufb always matched these operands, so this is common.) # # This file also runs against the limit of ARMv7's ADR pseudo-instruction. ADR # expands to an ADD or SUB of the pc register to find an address. That immediate # must fit in ARM's encoding scheme: 8 bits of constant and 4 bits of rotation. # This means larger values must be more aligned. # # ARM additionally has two encodings, ARM and Thumb mode. Our assembly files may # use either encoding (do we actually need to support this?). In ARM mode, the # distances get large enough to require 16-byte alignment. Moving constants # closer to their use resolves most of this, but common constants in # _vpaes_consts are used by the whole file. Affected ADR instructions must be # placed at 8 mod 16 (the pc register is 8 ahead). Instructions with this # constraint have been commented. # # For details on ARM's immediate value encoding scheme, see # https://alisdair.mcdiarmid.org/arm-immediate-value-encoding/ # # Finally, a summary of armv7 and aarch64 SIMD syntax differences: # # * armv7 prefixes SIMD instructions with 'v', while aarch64 does not. # # * armv7 SIMD registers are named like q0 (and d0 for the half-width ones). # aarch64 names registers like v0, and denotes half-width operations in an # instruction suffix (see below). # # * aarch64 embeds size and lane information in register suffixes. v0.16b is # 16 bytes, v0.8h is eight u16s, v0.4s is four u32s, and v0.2d is two u64s. # armv7 embeds the total size in the register name (see above) and the size of # each element in an instruction suffix, which may look like vmov.i8, # vshr.u8, or vtbl.8, depending on instruction. 
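#
# As a concrete illustration of that convention (a minimal sketch, not part
# of the translator itself): the post-processing loop at the bottom of this
# file rewrites each q{N}#lo / q{N}#hi operand to the matching d register,
# d{2*N} or d{2*N+1}.
#
#   my $line = "vtbl.8 q0#lo, {q1}, q2#lo";
#   $line =~ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
#   # $line is now "vtbl.8 d0, {q1}, d4"
#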
use strict; my $flavour = shift; my $output; while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} $0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; my $xlate; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; my $code = ""; $code.=<<___; .syntax unified .arch armv7-a .fpu neon #if defined(__thumb2__) .thumb #else .code 32 #endif .text .type _vpaes_consts,%object .align 7 @ totally strategic alignment _vpaes_consts: .Lk_mc_forward: @ mc_forward .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 .quad 0x080B0A0904070605, 0x000302010C0F0E0D .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 .quad 0x000302010C0F0E0D, 0x080B0A0904070605 .Lk_mc_backward:@ mc_backward .quad 0x0605040702010003, 0x0E0D0C0F0A09080B .quad 0x020100030E0D0C0F, 0x0A09080B06050407 .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 .quad 0x0A09080B06050407, 0x020100030E0D0C0F .Lk_sr: @ sr .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 .quad 0x030E09040F0A0500, 0x0B06010C07020D08 .quad 0x0F060D040B020900, 0x070E050C030A0108 .quad 0x0B0E0104070A0D00, 0x0306090C0F020508 @ @ "Hot" constants @ .Lk_inv: @ inv, inva .quad 0x0E05060F0D080180, 0x040703090A0B0C02 .quad 0x01040A060F0B0780, 0x030D0E0C02050809 .Lk_ipt: @ input transform (lo, hi) .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 .Lk_sbo: @ sbou, sbot .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA .Lk_sb1: @ sb1u, sb1t .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 .Lk_sb2: @ sb2u, sb2t .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD .asciz "Vector Permutation AES for ARMv7 NEON, Mike Hamburg (Stanford University)" .size _vpaes_consts,.-_vpaes_consts .align 6 ___ { my ($inp,$out,$key) = map("r$_", (0..2)); my ($invlo,$invhi) = map("q$_", (10..11)); my ($sb1u,$sb1t,$sb2u,$sb2t) = map("q$_", (12..15)); $code.=<<___; @@ @@ _aes_preheat @@ @@ Fills q9-q15 as specified below. @@ .type _vpaes_preheat,%function .align 4 _vpaes_preheat: adr r10, .Lk_inv vmov.i8 q9, #0x0f @ .Lk_s0F vld1.64 {q10,q11}, [r10]! @ .Lk_inv add r10, r10, #64 @ Skip .Lk_ipt, .Lk_sbo vld1.64 {q12,q13}, [r10]! @ .Lk_sb1 vld1.64 {q14,q15}, [r10] @ .Lk_sb2 bx lr @@ @@ _aes_encrypt_core @@ @@ AES-encrypt q0. @@ @@ Inputs: @@ q0 = input @@ q9-q15 as in _vpaes_preheat @@ [$key] = scheduled keys @@ @@ Output in q0 @@ Clobbers q1-q5, r8-r11 @@ Preserves q6-q8 so you get some local vectors @@ @@ .type _vpaes_encrypt_core,%function .align 4 _vpaes_encrypt_core: mov r9, $key ldr r8, [$key,#240] @ pull rounds adr r11, .Lk_ipt @ vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo @ vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi vld1.64 {q2, q3}, [r11] adr r11, .Lk_mc_forward+16 vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 # round0 key vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 vshr.u8 q0, q0, #4 @ vpsrlb \$4, %xmm0, %xmm0 vtbl.8 q1#lo, {q2}, q1#lo @ vpshufb %xmm1, %xmm2, %xmm1 vtbl.8 q1#hi, {q2}, q1#hi vtbl.8 q2#lo, {q3}, q0#lo @ vpshufb %xmm0, %xmm3, %xmm2 vtbl.8 q2#hi, {q3}, q0#hi veor q0, q1, q5 @ vpxor %xmm5, %xmm1, %xmm0 veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 @ .Lenc_entry ends with a bnz instruction which is normally paired with @ subs in .Lenc_loop. 
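	@ Since r8 holds the round count it is never zero here, so this tst
	@ primes the flags exactly as a non-final subs would, and the first
	@ pass through .Lenc_entry branches into .Lenc_loop.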
tst r8, r8 b .Lenc_entry .align 4 .Lenc_loop: @ middle of middle round add r10, r11, #0x40 vtbl.8 q4#lo, {$sb1t}, q2#lo @ vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u vtbl.8 q4#hi, {$sb1t}, q2#hi vld1.64 {q1}, [r11]! @ vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] vtbl.8 q0#lo, {$sb1u}, q3#lo @ vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t vtbl.8 q0#hi, {$sb1u}, q3#hi veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k vtbl.8 q5#lo, {$sb2t}, q2#lo @ vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u vtbl.8 q5#hi, {$sb2t}, q2#hi veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A vtbl.8 q2#lo, {$sb2u}, q3#lo @ vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t vtbl.8 q2#hi, {$sb2u}, q3#hi vld1.64 {q4}, [r10] @ vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] vtbl.8 q3#lo, {q0}, q1#lo @ vpshufb %xmm1, %xmm0, %xmm3 # 0 = B vtbl.8 q3#hi, {q0}, q1#hi veor q2, q2, q5 @ vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A @ Write to q5 instead of q0, so the table and destination registers do @ not overlap. vtbl.8 q5#lo, {q0}, q4#lo @ vpshufb %xmm4, %xmm0, %xmm0 # 3 = D vtbl.8 q5#hi, {q0}, q4#hi veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B vtbl.8 q4#lo, {q3}, q1#lo @ vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C vtbl.8 q4#hi, {q3}, q1#hi @ Here we restore the original q0/q5 usage. veor q0, q5, q3 @ vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D and r11, r11, #~(1<<6) @ and \$0x30, %r11 # ... mod 4 veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D subs r8, r8, #1 @ nr-- .Lenc_entry: @ top of round vand q1, q0, q9 @ vpand %xmm0, %xmm9, %xmm1 # 0 = k vshr.u8 q0, q0, #4 @ vpsrlb \$4, %xmm0, %xmm0 # 1 = i vtbl.8 q5#lo, {$invhi}, q1#lo @ vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k vtbl.8 q5#hi, {$invhi}, q1#hi veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j vtbl.8 q3#lo, {$invlo}, q0#lo @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i vtbl.8 q3#hi, {$invlo}, q0#hi vtbl.8 q4#lo, {$invlo}, q1#lo @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j vtbl.8 q4#hi, {$invlo}, q1#hi veor q3, q3, q5 @ vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k vtbl.8 q2#lo, {$invlo}, q3#lo @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak vtbl.8 q2#hi, {$invlo}, q3#hi vtbl.8 q3#lo, {$invlo}, q4#lo @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak vtbl.8 q3#hi, {$invlo}, q4#hi veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 bne .Lenc_loop @ middle of last round add r10, r11, #0x80 adr r11, .Lk_sbo @ Read to q1 instead of q4, so the vtbl.8 instruction below does not @ overlap table and destination registers. vld1.64 {q1}, [r11]! @ vmovdqa -0x60(%r10), %xmm4 # 3 : sbou vld1.64 {q0}, [r11] @ vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 vtbl.8 q4#lo, {q1}, q2#lo @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou vtbl.8 q4#hi, {q1}, q2#hi vld1.64 {q1}, [r10] @ vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] @ Write to q2 instead of q0 below, to avoid overlapping table and @ destination registers. vtbl.8 q2#lo, {q0}, q3#lo @ vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t vtbl.8 q2#hi, {q0}, q3#hi veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k veor q2, q2, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A @ Here we restore the original q0/q2 usage. 
vtbl.8 q0#lo, {q2}, q1#lo @ vpshufb %xmm1, %xmm0, %xmm0 vtbl.8 q0#hi, {q2}, q1#hi bx lr .size _vpaes_encrypt_core,.-_vpaes_encrypt_core ___ } { my ($inp,$bits,$out,$dir)=("r0","r1","r2","r3"); my ($rcon,$s0F,$invlo,$invhi,$s63) = map("q$_",(8..12)); $code.=<<___; @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ @@ @@ @@ AES key schedule @@ @@ @@ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ @ This function diverges from both x86_64 and armv7 in which constants are @ pinned. x86_64 has a common preheat function for all operations. aarch64 @ separates them because it has enough registers to pin nearly all constants. @ armv7 does not have enough registers, but needing explicit loads and stores @ also complicates using x86_64's register allocation directly. @ @ We pin some constants for convenience and leave q14 and q15 free to load @ others on demand. @ @ Key schedule constants @ .type _vpaes_key_consts,%object .align 4 _vpaes_key_consts: .Lk_rcon: @ rcon .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 .Lk_opt: @ output transform .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 .Lk_deskew: @ deskew tables: inverts the sbox's "skew" .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 .size _vpaes_key_consts,.-_vpaes_key_consts .type _vpaes_key_preheat,%function .align 4 _vpaes_key_preheat: adr r11, .Lk_rcon vmov.i8 $s63, #0x5b @ .Lk_s63 adr r10, .Lk_inv @ Must be aligned to 8 mod 16. vmov.i8 $s0F, #0x0f @ .Lk_s0F vld1.64 {$invlo,$invhi}, [r10] @ .Lk_inv vld1.64 {$rcon}, [r11] @ .Lk_rcon bx lr .size _vpaes_key_preheat,.-_vpaes_key_preheat .type _vpaes_schedule_core,%function .align 4 _vpaes_schedule_core: @ We only need to save lr, but ARM requires an 8-byte stack alignment, @ so save an extra register. stmdb sp!, {r3,lr} bl _vpaes_key_preheat @ load the tables adr r11, .Lk_ipt @ Must be aligned to 8 mod 16. vld1.64 {q0}, [$inp]! @ vmovdqu (%rdi), %xmm0 # load key (unaligned) @ input transform @ Use q4 here rather than q3 so .Lschedule_am_decrypting does not @ overlap table and destination. vmov q4, q0 @ vmovdqa %xmm0, %xmm3 bl _vpaes_schedule_transform adr r10, .Lk_sr @ Must be aligned to 8 mod 16. vmov q7, q0 @ vmovdqa %xmm0, %xmm7 add r8, r8, r10 @ encrypting, output zeroth round key after transform vst1.64 {q0}, [$out] @ vmovdqu %xmm0, (%rdx) @ *ring*: Decryption removed. .Lschedule_go: cmp $bits, #192 @ cmp \$192, %esi bhi .Lschedule_256 @ 128: fall though @@ @@ .schedule_128 @@ @@ 128-bit specific part of key schedule. @@ @@ This schedule is really simple, because all its parts @@ are accomplished by the subroutines. @@ .Lschedule_128: mov $inp, #10 @ mov \$10, %esi .Loop_schedule_128: bl _vpaes_schedule_round subs $inp, $inp, #1 @ dec %esi beq .Lschedule_mangle_last bl _vpaes_schedule_mangle @ write output b .Loop_schedule_128 @@ @@ .aes_schedule_256 @@ @@ 256-bit specific part of key schedule. @@ @@ The structure here is very similar to the 128-bit @@ schedule, but with an additional "low side" in @@ q6. The low side's rounds are the same as the @@ high side's, except no rcon and no rotation. 
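@@
@@ Concretely, each pass of .Loop_schedule_256 below writes out the
@@ current key via _vpaes_schedule_mangle, saves it in q6, and runs a
@@ full (rcon + rotate) round; every pass but the last then writes that
@@ result out as well and swaps in q6 for the rcon-free low round.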
@@ .align 4 .Lschedule_256: vld1.64 {q0}, [$inp] @ vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) bl _vpaes_schedule_transform @ input transform mov $inp, #7 @ mov \$7, %esi .Loop_schedule_256: bl _vpaes_schedule_mangle @ output low result vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 @ high round bl _vpaes_schedule_round subs $inp, $inp, #1 @ dec %esi beq .Lschedule_mangle_last bl _vpaes_schedule_mangle @ low round. swap xmm7 and xmm6 vdup.32 q0, q0#hi[1] @ vpshufd \$0xFF, %xmm0, %xmm0 vmov.i8 q4, #0 vmov q5, q7 @ vmovdqa %xmm7, %xmm5 vmov q7, q6 @ vmovdqa %xmm6, %xmm7 bl _vpaes_schedule_low_round vmov q7, q5 @ vmovdqa %xmm5, %xmm7 b .Loop_schedule_256 @@ @@ .aes_schedule_mangle_last @@ @@ Mangler for last round of key schedule @@ Mangles q0 @@ when encrypting, outputs out(q0) ^ 63 @@ when decrypting, outputs unskew(q0) @@ @@ Always called right before return... jumps to cleanup and exits @@ .align 4 .Lschedule_mangle_last: @ schedule last round key from xmm0 adr r11, .Lk_deskew @ lea .Lk_deskew(%rip),%r11 # prepare to deskew @ encrypting vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10),%xmm1 adr r11, .Lk_opt @ lea .Lk_opt(%rip), %r11 # prepare to output transform add $out, $out, #32 @ add \$32, %rdx vmov q2, q0 vtbl.8 q0#lo, {q2}, q1#lo @ vpshufb %xmm1, %xmm0, %xmm0 # output permute vtbl.8 q0#hi, {q2}, q1#hi .Lschedule_mangle_last_dec: sub $out, $out, #16 @ add \$-16, %rdx veor q0, q0, $s63 @ vpxor .Lk_s63(%rip), %xmm0, %xmm0 bl _vpaes_schedule_transform @ output transform vst1.64 {q0}, [$out] @ vmovdqu %xmm0, (%rdx) # save last key @ cleanup veor q0, q0, q0 @ vpxor %xmm0, %xmm0, %xmm0 veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1 veor q2, q2, q2 @ vpxor %xmm2, %xmm2, %xmm2 veor q3, q3, q3 @ vpxor %xmm3, %xmm3, %xmm3 veor q4, q4, q4 @ vpxor %xmm4, %xmm4, %xmm4 veor q5, q5, q5 @ vpxor %xmm5, %xmm5, %xmm5 veor q6, q6, q6 @ vpxor %xmm6, %xmm6, %xmm6 veor q7, q7, q7 @ vpxor %xmm7, %xmm7, %xmm7 ldmia sp!, {r3,pc} @ return .size _vpaes_schedule_core,.-_vpaes_schedule_core @@ @@ .aes_schedule_round @@ @@ Runs one main round of the key schedule on q0, q7 @@ @@ Specifically, runs subbytes on the high dword of q0 @@ then rotates it by one byte and xors into the low dword of @@ q7. @@ @@ Adds rcon from low byte of q8, then rotates q8 for @@ next rcon. @@ @@ Smears the dwords of q7 by xoring the low into the @@ second low, result into third, result into highest. @@ @@ Returns results in q7 = q0. @@ Clobbers q1-q4, r11. @@ .type _vpaes_schedule_round,%function .align 4 _vpaes_schedule_round: @ extract rcon from xmm8 vmov.i8 q4, #0 @ vpxor %xmm4, %xmm4, %xmm4 vext.8 q1, $rcon, q4, #15 @ vpalignr \$15, %xmm8, %xmm4, %xmm1 vext.8 $rcon, $rcon, $rcon, #15 @ vpalignr \$15, %xmm8, %xmm8, %xmm8 veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7 @ rotate vdup.32 q0, q0#hi[1] @ vpshufd \$0xFF, %xmm0, %xmm0 vext.8 q0, q0, q0, #1 @ vpalignr \$1, %xmm0, %xmm0, %xmm0 @ fall through... @ low round: same as high round, but no rotation and no rcon. _vpaes_schedule_low_round: @ The x86_64 version pins .Lk_sb1 in %xmm13 and .Lk_sb1+16 in %xmm12. @ We pin other values in _vpaes_key_preheat, so load them now. 
adr r11, .Lk_sb1 vld1.64 {q14,q15}, [r11] @ smear xmm7 vext.8 q1, q4, q7, #12 @ vpslldq \$4, %xmm7, %xmm1 veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7 vext.8 q4, q4, q7, #8 @ vpslldq \$8, %xmm7, %xmm4 @ subbytes vand q1, q0, $s0F @ vpand %xmm9, %xmm0, %xmm1 # 0 = k vshr.u8 q0, q0, #4 @ vpsrlb \$4, %xmm0, %xmm0 # 1 = i veor q7, q7, q4 @ vpxor %xmm4, %xmm7, %xmm7 vtbl.8 q2#lo, {$invhi}, q1#lo @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k vtbl.8 q2#hi, {$invhi}, q1#hi veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j vtbl.8 q3#lo, {$invlo}, q0#lo @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i vtbl.8 q3#hi, {$invlo}, q0#hi veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k vtbl.8 q4#lo, {$invlo}, q1#lo @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j vtbl.8 q4#hi, {$invlo}, q1#hi veor q7, q7, $s63 @ vpxor .Lk_s63(%rip), %xmm7, %xmm7 vtbl.8 q3#lo, {$invlo}, q3#lo @ vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak vtbl.8 q3#hi, {$invlo}, q3#hi veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k vtbl.8 q2#lo, {$invlo}, q4#lo @ vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak vtbl.8 q2#hi, {$invlo}, q4#hi veor q3, q3, q1 @ vpxor %xmm1, %xmm3, %xmm3 # 2 = io veor q2, q2, q0 @ vpxor %xmm0, %xmm2, %xmm2 # 3 = jo vtbl.8 q4#lo, {q15}, q3#lo @ vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou vtbl.8 q4#hi, {q15}, q3#hi vtbl.8 q1#lo, {q14}, q2#lo @ vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t vtbl.8 q1#hi, {q14}, q2#hi veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output @ add in smeared stuff veor q0, q1, q7 @ vpxor %xmm7, %xmm1, %xmm0 veor q7, q1, q7 @ vmovdqa %xmm0, %xmm7 bx lr .size _vpaes_schedule_round,.-_vpaes_schedule_round @@ @@ .aes_schedule_transform @@ @@ Linear-transform q0 according to tables at [r11] @@ @@ Requires that q9 = 0x0F0F... as in preheat @@ Output in q0 @@ Clobbers q1, q2, q14, q15 @@ .type _vpaes_schedule_transform,%function .align 4 _vpaes_schedule_transform: vld1.64 {q14,q15}, [r11] @ vmovdqa (%r11), %xmm2 # lo @ vmovdqa 16(%r11), %xmm1 # hi vand q1, q0, $s0F @ vpand %xmm9, %xmm0, %xmm1 vshr.u8 q0, q0, #4 @ vpsrlb \$4, %xmm0, %xmm0 vtbl.8 q2#lo, {q14}, q1#lo @ vpshufb %xmm1, %xmm2, %xmm2 vtbl.8 q2#hi, {q14}, q1#hi vtbl.8 q0#lo, {q15}, q0#lo @ vpshufb %xmm0, %xmm1, %xmm0 vtbl.8 q0#hi, {q15}, q0#hi veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 bx lr .size _vpaes_schedule_transform,.-_vpaes_schedule_transform @@ @@ .aes_schedule_mangle @@ @@ Mangles q0 from (basis-transformed) standard version @@ to our version. @@ @@ On encrypt, @@ xor with 0x63 @@ multiply by circulant 0,1,1,1 @@ apply shiftrows transform @@ @@ On decrypt, @@ xor with 0x63 @@ multiply by "inverse mixcolumns" circulant E,B,D,9 @@ deskew @@ apply shiftrows transform @@ @@ @@ Writes out to [r2], and increments or decrements it @@ Keeps track of round number mod 4 in r8 @@ Preserves q0 @@ Clobbers q1-q5 @@ .type _vpaes_schedule_mangle,%function .align 4 _vpaes_schedule_mangle: tst $dir, $dir vmov q4, q0 @ vmovdqa %xmm0, %xmm4 # save xmm0 for later adr r11, .Lk_mc_forward @ Must be aligned to 8 mod 16. vld1.64 {q5}, [r11] @ vmovdqa .Lk_mc_forward(%rip),%xmm5 @ encrypting @ Write to q2 so we do not overlap table and destination below. 
veor q2, q0, $s63 @ vpxor .Lk_s63(%rip), %xmm0, %xmm4 add $out, $out, #16 @ add \$16, %rdx vtbl.8 q4#lo, {q2}, q5#lo @ vpshufb %xmm5, %xmm4, %xmm4 vtbl.8 q4#hi, {q2}, q5#hi vtbl.8 q1#lo, {q4}, q5#lo @ vpshufb %xmm5, %xmm4, %xmm1 vtbl.8 q1#hi, {q4}, q5#hi vtbl.8 q3#lo, {q1}, q5#lo @ vpshufb %xmm5, %xmm1, %xmm3 vtbl.8 q3#hi, {q1}, q5#hi veor q4, q4, q1 @ vpxor %xmm1, %xmm4, %xmm4 vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 veor q3, q3, q4 @ vpxor %xmm4, %xmm3, %xmm3 .Lschedule_mangle_both: @ Write to q2 so table and destination do not overlap. vtbl.8 q2#lo, {q3}, q1#lo @ vpshufb %xmm1, %xmm3, %xmm3 vtbl.8 q2#hi, {q3}, q1#hi add r8, r8, #64-16 @ add \$-16, %r8 and r8, r8, #~(1<<6) @ and \$0x30, %r8 vst1.64 {q2}, [$out] @ vmovdqu %xmm3, (%rdx) bx lr .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle .globl vpaes_set_encrypt_key .type vpaes_set_encrypt_key,%function .align 4 vpaes_set_encrypt_key: stmdb sp!, {r7-r11, lr} vstmdb sp!, {d8-d15} lsr r9, $bits, #5 @ shr \$5,%eax add r9, r9, #5 @ \$5,%eax str r9, [$out,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; mov $dir, #0 @ mov \$0,%ecx mov r8, #0x30 @ mov \$0x30,%r8d bl _vpaes_schedule_core eor r0, r0, r0 vldmia sp!, {d8-d15} ldmia sp!, {r7-r11, pc} @ return .size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key ___ } { my ($out, $inp) = map("r$_", (0..1)); my ($s0F, $s63, $s63_raw, $mc_forward) = map("q$_", (9..12)); $code .= <<___; @ Additional constants for converting to bsaes. .type _vpaes_convert_consts,%object .align 4 _vpaes_convert_consts: @ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear @ transform in the AES S-box. 0x63 is incorporated into the low half of the @ table. This was computed with the following script: @ @ def u64s_to_u128(x, y): @ return x | (y << 64) @ def u128_to_u64s(w): @ return w & ((1<<64)-1), w >> 64 @ def get_byte(w, i): @ return (w >> (i*8)) & 0xff @ def apply_table(table, b): @ lo = b & 0xf @ hi = b >> 4 @ return get_byte(table[0], lo) ^ get_byte(table[1], hi) @ def opt(b): @ table = [ @ u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808), @ u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0), @ ] @ return apply_table(table, b) @ def rot_byte(b, n): @ return 0xff & ((b << n) | (b >> (8-n))) @ def skew(x): @ return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^ @ rot_byte(x, 4)) @ table = [0, 0] @ for i in range(16): @ table[0] |= (skew(opt(i)) ^ 0x63) << (i*8) @ table[1] |= skew(opt(i<<4)) << (i*8) @ print("\t.quad\t0x%016x, 0x%016x" % u128_to_u64s(table[0])) @ print("\t.quad\t0x%016x, 0x%016x" % u128_to_u64s(table[1])) .Lk_opt_then_skew: .quad 0x9cb8436798bc4763, 0x6440bb9f6044bf9b .quad 0x1f30062936192f00, 0xb49bad829db284ab @ void vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes); .globl vpaes_encrypt_key_to_bsaes .type vpaes_encrypt_key_to_bsaes,%function .align 4 vpaes_encrypt_key_to_bsaes: stmdb sp!, {r11, lr} @ See _vpaes_schedule_core for the key schedule logic. In particular, @ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper), @ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last @ contain the transformations not in the bsaes representation. This @ function inverts those transforms. @ @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key @ representation, which does not match the other aes_nohw_* @ implementations. The ARM aes_nohw_* stores each 32-bit word @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the @ cost of extra REV and VREV32 operations in little-endian ARM. 
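	@
	@ Per round key, the code below therefore undoes the mangling in
	@ reverse order: invert ShiftRows via the .Lk_sr permutation, undo the
	@ 0,1,1,1 circulant (its own inverse, applied as three .Lk_mc_forward
	@ shuffles), strip the XORed constant, undo the .Lk_ipt basis change
	@ with .Lk_opt where one was applied, and VREV32 each word into
	@ aes-armv4's layout.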
vmov.i8 $s0F, #0x0f @ Required by _vpaes_schedule_transform adr r2, .Lk_mc_forward @ Must be aligned to 8 mod 16. add r3, r2, 0x90 @ .Lk_sr+0x10-.Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression) vld1.64 {$mc_forward}, [r2] vmov.i8 $s63, #0x5b @ .Lk_s63 from vpaes-x86_64 adr r11, .Lk_opt @ Must be aligned to 8 mod 16. vmov.i8 $s63_raw, #0x63 @ .LK_s63 without .Lk_ipt applied @ vpaes stores one fewer round count than bsaes, but the number of keys @ is the same. ldr r2, [$inp,#240] add r2, r2, #1 str r2, [$out,#240] @ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt). @ Invert this with .Lk_opt. vld1.64 {q0}, [$inp]! bl _vpaes_schedule_transform vrev32.8 q0, q0 vst1.64 {q0}, [$out]! @ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied, @ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63, @ multiplies by the circulant 0,1,1,1, then applies ShiftRows. .Loop_enc_key_to_bsaes: vld1.64 {q0}, [$inp]! @ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle @ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30. @ We use r3 rather than r8 to avoid a callee-saved register. vld1.64 {q1}, [r3] vtbl.8 q2#lo, {q0}, q1#lo vtbl.8 q2#hi, {q0}, q1#hi add r3, r3, #16 and r3, r3, #~(1<<6) vmov q0, q2 @ Handle the last key differently. subs r2, r2, #1 beq .Loop_enc_key_to_bsaes_last @ Multiply by the circulant. This is its own inverse. vtbl.8 q1#lo, {q0}, $mc_forward#lo vtbl.8 q1#hi, {q0}, $mc_forward#hi vmov q0, q1 vtbl.8 q2#lo, {q1}, $mc_forward#lo vtbl.8 q2#hi, {q1}, $mc_forward#hi veor q0, q0, q2 vtbl.8 q1#lo, {q2}, $mc_forward#lo vtbl.8 q1#hi, {q2}, $mc_forward#hi veor q0, q0, q1 @ XOR and finish. veor q0, q0, $s63 bl _vpaes_schedule_transform vrev32.8 q0, q0 vst1.64 {q0}, [$out]! b .Loop_enc_key_to_bsaes .Loop_enc_key_to_bsaes_last: @ The final key does not have a basis transform (note @ .Lschedule_mangle_last inverts the original transform). It only XORs @ 0x63 and applies ShiftRows. The latter was already inverted in the @ loop. Note that, because we act on the original representation, we use @ $s63_raw, not $s63. veor q0, q0, $s63_raw vrev32.8 q0, q0 vst1.64 {q0}, [$out] @ Wipe registers which contained key material. veor q0, q0, q0 veor q1, q1, q1 veor q2, q2, q2 ldmia sp!, {r11, pc} @ return .size vpaes_encrypt_key_to_bsaes,.-vpaes_encrypt_key_to_bsaes ___ } { # Register-passed parameters. my ($inp, $out, $len, $key) = map("r$_", 0..3); # Temporaries. _vpaes_encrypt_core already uses r8..r11, so overlap $ivec and # $tmp. $ctr is r7 because it must be preserved across calls. my ($ctr, $ivec, $tmp) = map("r$_", 7..9); # void vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len, # const AES_KEY *key, const uint8_t ivec[16]); $code .= <<___; .globl vpaes_ctr32_encrypt_blocks .type vpaes_ctr32_encrypt_blocks,%function .align 4 vpaes_ctr32_encrypt_blocks: mov ip, sp stmdb sp!, {r7-r11, lr} @ This function uses q4-q7 (d8-d15), which are callee-saved. vstmdb sp!, {d8-d15} cmp $len, #0 @ $ivec is passed on the stack. ldr $ivec, [ip] beq .Lctr32_done @ _vpaes_encrypt_core expects the key in r2, so swap $len and $key. mov $tmp, $key mov $key, $len mov $len, $tmp ___ my ($len, $key) = ($key, $len); $code .= <<___; @ Load the IV and counter portion. ldr $ctr, [$ivec, #12] vld1.8 {q7}, [$ivec] bl _vpaes_preheat rev $ctr, $ctr @ The counter is big-endian. .Lctr32_loop: vmov q0, q7 vld1.8 {q6}, [$inp]! 
@ Load input ahead of time bl _vpaes_encrypt_core veor q0, q0, q6 @ XOR input and result vst1.8 {q0}, [$out]! subs $len, $len, #1 @ Update the counter. add $ctr, $ctr, #1 rev $tmp, $ctr vmov.32 q7#hi[1], $tmp bne .Lctr32_loop .Lctr32_done: vldmia sp!, {d8-d15} ldmia sp!, {r7-r11, pc} @ return .size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks ___ } foreach (split("\n",$code)) { s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo; print $_,"\n"; } close STDOUT or die "error closing STDOUT: $!"; ring-0.17.14/crypto/fipsmodule/aes/asm/vpaes-armv8.pl000064400000000000000000000704411046102023000204700ustar 00000000000000#! /usr/bin/env perl # Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ###################################################################### ## Constant-time SSSE3 AES core implementation. ## version 0.1 ## ## By Mike Hamburg (Stanford University), 2009 ## Public domain. ## ## For details see http://shiftleft.org/papers/vector_aes/ and ## http://crypto.stanford.edu/vpaes/. ## ###################################################################### # ARMv8 NEON adaptation by # # Reason for undertaken effort is that there is at least one popular # SoC based on Cortex-A53 that doesn't have crypto extensions. 
# # CBC enc ECB enc/dec(*) [bit-sliced enc/dec] # Cortex-A53 21.5 18.1/20.6 [17.5/19.8 ] # Cortex-A57 36.0(**) 20.4/24.9(**) [14.4/16.6 ] # X-Gene 45.9(**) 45.8/57.7(**) [33.1/37.6(**) ] # Denver(***) 16.6(**) 15.1/17.8(**) [8.80/9.93 ] # Apple A7(***) 22.7(**) 10.9/14.3 [8.45/10.0 ] # Mongoose(***) 26.3(**) 21.0/25.0(**) [13.3/16.8 ] # # (*) ECB denotes approximate result for parallelizable modes # such as CBC decrypt, CTR, etc.; # (**) these results are worse than scalar compiler-generated # code, but it's constant-time and therefore preferred; # (***) presented for reference/comparison purposes; $flavour = shift; while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; $code.=<<___; .section .rodata .type _vpaes_consts,%object .align 7 // totally strategic alignment _vpaes_consts: .Lk_mc_forward: // mc_forward .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 .quad 0x080B0A0904070605, 0x000302010C0F0E0D .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 .quad 0x000302010C0F0E0D, 0x080B0A0904070605 .Lk_mc_backward:// mc_backward .quad 0x0605040702010003, 0x0E0D0C0F0A09080B .quad 0x020100030E0D0C0F, 0x0A09080B06050407 .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 .quad 0x0A09080B06050407, 0x020100030E0D0C0F .Lk_sr: // sr .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 .quad 0x030E09040F0A0500, 0x0B06010C07020D08 .quad 0x0F060D040B020900, 0x070E050C030A0108 .quad 0x0B0E0104070A0D00, 0x0306090C0F020508 // // "Hot" constants // .Lk_inv: // inv, inva .quad 0x0E05060F0D080180, 0x040703090A0B0C02 .quad 0x01040A060F0B0780, 0x030D0E0C02050809 .Lk_ipt: // input transform (lo, hi) .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 .Lk_sbo: // sbou, sbot .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA .Lk_sb1: // sb1u, sb1t .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 .Lk_sb2: // sb2u, sb2t .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD // // Key schedule constants // .Lk_dksd: // decryption key schedule: invskew x*D .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E .Lk_dksb: // decryption key schedule: invskew x*B .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 .Lk_dkse: // decryption key schedule: invskew x*E + 0x63 .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 .Lk_dks9: // decryption key schedule: invskew x*9 .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE .Lk_rcon: // rcon .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 .Lk_opt: // output transform .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 .Lk_deskew: // deskew tables: inverts the sbox's "skew" .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 .asciz "Vector Permutation AES for ARMv8, Mike Hamburg (Stanford University)" .size _vpaes_consts,.-_vpaes_consts .align 6 .text ___ { my ($inp,$out,$key) = map("x$_",(0..2)); my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_.16b",(18..23)); my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27)); my 
($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_.16b",(24..31)); $code.=<<___; ## ## _aes_preheat ## ## Fills register %r10 -> .aes_consts (so you can -fPIC) ## and %xmm9-%xmm15 as specified below. ## .type _vpaes_encrypt_preheat,%function .align 4 _vpaes_encrypt_preheat: adrp x10, :pg_hi21:.Lk_inv add x10, x10, :lo12:.Lk_inv movi v17.16b, #0x0f ld1 {v18.2d-v19.2d}, [x10],#32 // .Lk_inv ld1 {v20.2d-v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo ld1 {v24.2d-v27.2d}, [x10] // .Lk_sb1, .Lk_sb2 ret .size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat ## ## _aes_encrypt_core ## ## AES-encrypt %xmm0. ## ## Inputs: ## %xmm0 = input ## %xmm9-%xmm15 as in _vpaes_preheat ## (%rdx) = scheduled keys ## ## Output in %xmm0 ## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax ## Preserves %xmm6 - %xmm8 so you get some local vectors ## ## .type _vpaes_encrypt_core,%function .align 4 _vpaes_encrypt_core: mov x9, $key ldr w8, [$key,#240] // pull rounds adrp x11, :pg_hi21:.Lk_mc_forward+16 add x11, x11, :lo12:.Lk_mc_forward+16 // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 ushr v0.16b, v7.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi tbl v2.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 b .Lenc_entry .align 4 .Lenc_loop: // middle of middle round add x10, x11, #0x40 tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D and x11, x11, #~(1<<6) // and \$0x30, %r11 # ... 
mod 4 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D sub w8, w8, #1 // nr-- .Lenc_entry: // top of round and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i tbl v5.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k tbl v2.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak tbl v3.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 cbnz w8, .Lenc_loop // middle of last round add x10, x11, #0x80 // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 ret .size _vpaes_encrypt_core,.-_vpaes_encrypt_core .type _vpaes_encrypt_2x,%function .align 4 _vpaes_encrypt_2x: mov x9, $key ldr w8, [$key,#240] // pull rounds adrp x11, :pg_hi21:.Lk_mc_forward+16 add x11, x11, :lo12:.Lk_mc_forward+16 // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 ushr v0.16b, v14.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 and v9.16b, v15.16b, v17.16b ushr v8.16b, v15.16b, #4 tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 tbl v9.16b, {$iptlo}, v9.16b // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi tbl v2.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 tbl v10.16b, {$ipthi}, v8.16b eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 eor v8.16b, v9.16b, v16.16b eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 eor v8.16b, v8.16b, v10.16b b .Lenc_2x_entry .align 4 .Lenc_2x_loop: // middle of middle round add x10, x11, #0x40 tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u tbl v12.16b, {$sb1t}, v10.16b ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t tbl v8.16b, {$sb1u}, v11.16b eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k eor v12.16b, v12.16b, v16.16b tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u tbl v13.16b, {$sb2t}, v10.16b eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A eor v8.16b, v8.16b, v12.16b tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t tbl v10.16b, {$sb2u}, v11.16b ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B tbl v11.16b, {v8.16b}, v1.16b eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A eor v10.16b, v10.16b, v13.16b tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 
# 3 = D tbl v8.16b, {v8.16b}, v4.16b eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B eor v11.16b, v11.16b, v10.16b tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C tbl v12.16b, {v11.16b},v1.16b eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D eor v8.16b, v8.16b, v11.16b and x11, x11, #~(1<<6) // and \$0x30, %r11 # ... mod 4 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D eor v8.16b, v8.16b, v12.16b sub w8, w8, #1 // nr-- .Lenc_2x_entry: // top of round and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i and v9.16b, v8.16b, v17.16b ushr v8.16b, v8.16b, #4 tbl v5.16b, {$invhi},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k tbl v13.16b, {$invhi},v9.16b eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j eor v9.16b, v9.16b, v8.16b tbl v3.16b, {$invlo},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i tbl v11.16b, {$invlo},v8.16b tbl v4.16b, {$invlo},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j tbl v12.16b, {$invlo},v9.16b eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k eor v11.16b, v11.16b, v13.16b eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k eor v12.16b, v12.16b, v13.16b tbl v2.16b, {$invlo},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak tbl v10.16b, {$invlo},v11.16b tbl v3.16b, {$invlo},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak tbl v11.16b, {$invlo},v12.16b eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io eor v10.16b, v10.16b, v9.16b eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo eor v11.16b, v11.16b, v8.16b ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 cbnz w8, .Lenc_2x_loop // middle of last round add x10, x11, #0x80 // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou tbl v12.16b, {$sbou}, v10.16b ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t tbl v8.16b, {$sbot}, v11.16b eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k eor v12.16b, v12.16b, v16.16b eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A eor v8.16b, v8.16b, v12.16b tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 tbl v1.16b, {v8.16b},v1.16b ret .size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x ___ } { my ($inp,$bits,$out,$dir)=("x0","w1","x2","w3"); my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8)); $code.=<<___; ######################################################## ## ## ## AES key schedule ## ## ## ######################################################## .type _vpaes_key_preheat,%function .align 4 _vpaes_key_preheat: adrp x10, :pg_hi21:.Lk_inv add x10, x10, :lo12:.Lk_inv movi v16.16b, #0x5b // .Lk_s63 adrp x11, :pg_hi21:.Lk_sb1 add x11, x11, :lo12:.Lk_sb1 movi v17.16b, #0x0f // .Lk_s0F ld1 {v18.2d-v21.2d}, [x10] // .Lk_inv, .Lk_ipt adrp x10, :pg_hi21:.Lk_dksd add x10, x10, :lo12:.Lk_dksd ld1 {v22.2d-v23.2d}, [x11] // .Lk_sb1 adrp x11, :pg_hi21:.Lk_mc_forward add x11, x11, :lo12:.Lk_mc_forward ld1 {v24.2d-v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb ld1 {v28.2d-v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9 ld1 {v8.2d}, [x10] // .Lk_rcon ld1 {v9.2d}, [x11] // .Lk_mc_forward[0] ret .size _vpaes_key_preheat,.-_vpaes_key_preheat .type _vpaes_schedule_core,%function .align 4 _vpaes_schedule_core: 
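	// Register arguments for this AArch64 port (mirroring the x86_64
	// convention documented in vpaes-x86_64.pl: rdi=key, rsi=size in bits,
	// rdx=buffer, rcx=direction):
	//   x0 = key, w1 = key size in bits, x2 = round-key buffer,
	//   w3 = direction; vpaes_set_encrypt_key below passes 0 (encrypt).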
AARCH64_SIGN_LINK_REGISTER stp x29, x30, [sp,#-16]! add x29,sp,#0 bl _vpaes_key_preheat // load the tables ld1 {v0.16b}, [$inp],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned) // input transform mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3 bl _vpaes_schedule_transform mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 adrp x10, :pg_hi21:.Lk_sr // lea .Lk_sr(%rip),%r10 add x10, x10, :lo12:.Lk_sr add x8, x8, x10 // encrypting, output zeroth round key after transform st1 {v0.2d}, [$out] // vmovdqu %xmm0, (%rdx) cmp $bits, #192 // cmp \$192, %esi b.hi .Lschedule_256 b.eq .Lschedule_192 // 128: fall though ## ## .schedule_128 ## ## 128-bit specific part of key schedule. ## ## This schedule is really simple, because all its parts ## are accomplished by the subroutines. ## .Lschedule_128: mov $inp, #10 // mov \$10, %esi .Loop_schedule_128: sub $inp, $inp, #1 // dec %esi bl _vpaes_schedule_round cbz $inp, .Lschedule_mangle_last bl _vpaes_schedule_mangle // write output b .Loop_schedule_128 ## ## .aes_schedule_192 ## ## 192-bit specific part of key schedule. ## ## The main body of this schedule is the same as the 128-bit ## schedule, but with more smearing. The long, high side is ## stored in %xmm7 as before, and the short, low side is in ## the high bits of %xmm6. ## ## This schedule is somewhat nastier, however, because each ## round produces 192 bits of key material, or 1.5 round keys. ## Therefore, on each cycle we do 2 rounds and produce 3 round ## keys. ## .align 4 .Lschedule_192: sub $inp, $inp, #8 ld1 {v0.16b}, [$inp] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) bl _vpaes_schedule_transform // input transform mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4 ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros mov $inp, #4 // mov \$4, %esi .Loop_schedule_192: sub $inp, $inp, #1 // dec %esi bl _vpaes_schedule_round ext v0.16b, v6.16b, v0.16b, #8 // vpalignr \$8,%xmm6,%xmm0,%xmm0 bl _vpaes_schedule_mangle // save key n bl _vpaes_schedule_192_smear bl _vpaes_schedule_mangle // save key n+1 bl _vpaes_schedule_round cbz $inp, .Lschedule_mangle_last bl _vpaes_schedule_mangle // save key n+2 bl _vpaes_schedule_192_smear b .Loop_schedule_192 ## ## .aes_schedule_256 ## ## 256-bit specific part of key schedule. ## ## The structure here is very similar to the 128-bit ## schedule, but with an additional "low side" in ## %xmm6. The low side's rounds are the same as the ## high side's, except no rcon and no rotation. ## .align 4 .Lschedule_256: ld1 {v0.16b}, [$inp] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) bl _vpaes_schedule_transform // input transform mov $inp, #7 // mov \$7, %esi .Loop_schedule_256: sub $inp, $inp, #1 // dec %esi bl _vpaes_schedule_mangle // output low result mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 // high round bl _vpaes_schedule_round cbz $inp, .Lschedule_mangle_last bl _vpaes_schedule_mangle // low round. swap xmm7 and xmm6 dup v0.4s, v0.s[3] // vpshufd \$0xFF, %xmm0, %xmm0 movi v4.16b, #0 mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5 mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7 bl _vpaes_schedule_low_round mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7 b .Loop_schedule_256 ## ## .aes_schedule_mangle_last ## ## Mangler for last round of key schedule ## Mangles %xmm0 ## when encrypting, outputs out(%xmm0) ^ 63 ## when decrypting, outputs unskew(%xmm0) ## ## Always called right before return... 
jumps to cleanup and exits ## .align 4 .Lschedule_mangle_last: // schedule last round key from xmm0 adrp x11, :pg_hi21:.Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew add x11, x11, :lo12:.Lk_deskew cbnz $dir, .Lschedule_mangle_last_dec // encrypting ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 adrp x11, :pg_hi21:.Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform add x11, x11, :lo12:.Lk_opt add $out, $out, #32 // add \$32, %rdx tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute .Lschedule_mangle_last_dec: ld1 {v20.2d-v21.2d}, [x11] // reload constants sub $out, $out, #16 // add \$-16, %rdx eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0 bl _vpaes_schedule_transform // output transform st1 {v0.2d}, [$out] // vmovdqu %xmm0, (%rdx) # save last key // cleanup eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2 eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3 eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 ldp x29, x30, [sp],#16 AARCH64_VALIDATE_LINK_REGISTER ret .size _vpaes_schedule_core,.-_vpaes_schedule_core ## ## .aes_schedule_192_smear ## ## Smear the short, low side in the 192-bit key schedule. ## ## Inputs: ## %xmm7: high side, b a x y ## %xmm6: low side, d c 0 0 ## %xmm13: 0 ## ## Outputs: ## %xmm6: b+c+d b+c 0 0 ## %xmm0: b+c+d b+c b a ## .type _vpaes_schedule_192_smear,%function .align 4 _vpaes_schedule_192_smear: movi v1.16b, #0 dup v0.4s, v7.s[3] ins v1.s[3], v6.s[2] // vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 ins v0.s[0], v7.s[2] // vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0 ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros ret .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear ## ## .aes_schedule_round ## ## Runs one main round of the key schedule on %xmm0, %xmm7 ## ## Specifically, runs subbytes on the high dword of %xmm0 ## then rotates it by one byte and xors into the low dword of ## %xmm7. ## ## Adds rcon from low byte of %xmm8, then rotates %xmm8 for ## next rcon. ## ## Smears the dwords of %xmm7 by xoring the low into the ## second low, result into third, result into highest. ## ## Returns results in %xmm7 = %xmm0. ## Clobbers %xmm1-%xmm4, %r11. ## .type _vpaes_schedule_round,%function .align 4 _vpaes_schedule_round: // extract rcon from xmm8 movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4 ext v1.16b, $rcon, v4.16b, #15 // vpalignr \$15, %xmm8, %xmm4, %xmm1 ext $rcon, $rcon, $rcon, #15 // vpalignr \$15, %xmm8, %xmm8, %xmm8 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 // rotate dup v0.4s, v0.s[3] // vpshufd \$0xFF, %xmm0, %xmm0 ext v0.16b, v0.16b, v0.16b, #1 // vpalignr \$1, %xmm0, %xmm0, %xmm0 // fall through... // low round: same as high round, but no rotation and no rcon. 
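	// The 256-bit schedule branches here directly with
	// "bl _vpaes_schedule_low_round" (see .Lschedule_256), bypassing the
	// rcon extraction and byte rotation performed above.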
_vpaes_schedule_low_round: // smear xmm7 ext v1.16b, v4.16b, v7.16b, #12 // vpslldq \$4, %xmm7, %xmm1 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 ext v4.16b, v4.16b, v7.16b, #8 // vpslldq \$8, %xmm7, %xmm4 // subbytes and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7 tbl v2.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7 tbl v3.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k tbl v2.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output // add in smeared stuff eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0 eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7 ret .size _vpaes_schedule_round,.-_vpaes_schedule_round ## ## .aes_schedule_transform ## ## Linear-transform %xmm0 according to tables at (%r11) ## ## Requires that %xmm9 = 0x0F0F... as in preheat ## Output in %xmm0 ## Clobbers %xmm1, %xmm2 ## .type _vpaes_schedule_transform,%function .align 4 _vpaes_schedule_transform: and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 // vmovdqa (%r11), %xmm2 # lo tbl v2.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 // vmovdqa 16(%r11), %xmm1 # hi tbl v0.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 ret .size _vpaes_schedule_transform,.-_vpaes_schedule_transform ## ## .aes_schedule_mangle ## ## Mangle xmm0 from (basis-transformed) standard version ## to our version. 
## ## On encrypt, ## xor with 0x63 ## multiply by circulant 0,1,1,1 ## apply shiftrows transform ## ## On decrypt, ## xor with 0x63 ## multiply by "inverse mixcolumns" circulant E,B,D,9 ## deskew ## apply shiftrows transform ## ## ## Writes out to (%rdx), and increments or decrements it ## Keeps track of round number mod 4 in %r8 ## Preserves xmm0 ## Clobbers xmm1-xmm5 ## .type _vpaes_schedule_mangle,%function .align 4 _vpaes_schedule_mangle: mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later // vmovdqa .Lk_mc_forward(%rip),%xmm5 // encrypting eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4 add $out, $out, #16 // add \$16, %rdx tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 .Lschedule_mangle_both: tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 add x8, x8, #48 // add \$-16, %r8 and x8, x8, #~(1<<6) // and \$0x30, %r8 st1 {v3.2d}, [$out] // vmovdqu %xmm3, (%rdx) ret .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle .globl vpaes_set_encrypt_key .type vpaes_set_encrypt_key,%function .align 4 vpaes_set_encrypt_key: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 stp d8,d9,[sp,#-16]! // ABI spec says so lsr w9, $bits, #5 // shr \$5,%eax add w9, w9, #5 // \$5,%eax str w9, [$out,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; mov $dir, #0 // mov \$0,%ecx mov x8, #0x30 // mov \$0x30,%r8d bl _vpaes_schedule_core eor x0, x0, x0 ldp d8,d9,[sp],#16 ldp x29,x30,[sp],#16 AARCH64_VALIDATE_LINK_REGISTER ret .size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key ___ } { my ($inp,$out,$len,$key,$ivec) = map("x$_",(0..4)); my ($ctr, $ctr_tmp) = ("w6", "w7"); # void vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len, # const AES_KEY *key, const uint8_t ivec[16]); $code.=<<___; .globl vpaes_ctr32_encrypt_blocks .type vpaes_ctr32_encrypt_blocks,%function .align 4 vpaes_ctr32_encrypt_blocks: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 stp d8,d9,[sp,#-16]! // ABI spec says so stp d10,d11,[sp,#-16]! stp d12,d13,[sp,#-16]! stp d14,d15,[sp,#-16]! cbz $len, .Lctr32_done // Note, unlike the other functions, $len here is measured in blocks, // not bytes. mov x17, $len mov x2, $key // Load the IV and counter portion. ldr $ctr, [$ivec, #12] ld1 {v7.16b}, [$ivec] bl _vpaes_encrypt_preheat tst x17, #1 rev $ctr, $ctr // The counter is big-endian. b.eq .Lctr32_prep_loop // Handle one block so the remaining block count is even for // _vpaes_encrypt_2x. ld1 {v6.16b}, [$inp], #16 // Load input ahead of time bl _vpaes_encrypt_core eor v0.16b, v0.16b, v6.16b // XOR input and result st1 {v0.16b}, [$out], #16 subs x17, x17, #1 // Update the counter. add $ctr, $ctr, #1 rev $ctr_tmp, $ctr mov v7.s[3], $ctr_tmp b.ls .Lctr32_done .Lctr32_prep_loop: // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x // uses v14 and v15. mov v15.16b, v7.16b mov v14.16b, v7.16b add $ctr, $ctr, #1 rev $ctr_tmp, $ctr mov v15.s[3], $ctr_tmp .Lctr32_loop: ld1 {v6.16b,v7.16b}, [$inp], #32 // Load input ahead of time bl _vpaes_encrypt_2x eor v0.16b, v0.16b, v6.16b // XOR input and result eor v1.16b, v1.16b, v7.16b // XOR input and result (#2) st1 {v0.16b,v1.16b}, [$out], #32 subs x17, x17, #2 // Update the counter. 
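	// v14 and v15 carry the counter blocks for the next pair of blocks;
	// each pass advances both counters by two, and rev restores big-endian
	// byte order before the value is inserted into the last 32-bit lane.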
add $ctr_tmp, $ctr, #1 add $ctr, $ctr, #2 rev $ctr_tmp, $ctr_tmp mov v14.s[3], $ctr_tmp rev $ctr_tmp, $ctr mov v15.s[3], $ctr_tmp b.hi .Lctr32_loop .Lctr32_done: ldp d14,d15,[sp],#16 ldp d12,d13,[sp],#16 ldp d10,d11,[sp],#16 ldp d8,d9,[sp],#16 ldp x29,x30,[sp],#16 AARCH64_VALIDATE_LINK_REGISTER ret .size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks ___ } print $code; close STDOUT or die "error closing STDOUT: $!"; ring-0.17.14/crypto/fipsmodule/aes/asm/vpaes-x86.pl000064400000000000000000000437521046102023000200650ustar 00000000000000#! /usr/bin/env perl # Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ###################################################################### ## Constant-time SSSE3 AES core implementation. ## version 0.1 ## ## By Mike Hamburg (Stanford University), 2009 ## Public domain. ## ## For details see http://shiftleft.org/papers/vector_aes/ and ## http://crypto.stanford.edu/vpaes/. ###################################################################### # September 2011. # # Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for # aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt # doesn't handle partial vectors (doesn't have to if called from # EVP only). "Drop-in" implies that this module doesn't share key # schedule structure with the original nor does it make assumption # about its alignment... # # Performance summary. aes-586.pl column lists large-block CBC # encrypt/decrypt/with-hyper-threading-off(*) results in cycles per # byte processed with 128-bit key, and vpaes-x86.pl column - [also # large-block CBC] encrypt/decrypt. # # aes-586.pl vpaes-x86.pl # # Core 2(**) 28.1/41.4/18.3 21.9/25.2(***) # Nehalem 27.9/40.4/18.1 10.2/11.9 # Atom 70.7/92.1/60.1 61.1/75.4(***) # Silvermont 45.4/62.9/24.1 49.2/61.1(***) # # (*) "Hyper-threading" in the context refers rather to cache shared # among multiple cores, than to specifically Intel HTT. As vast # majority of contemporary cores share cache, slower code path # is common place. In other words "with-hyper-threading-off" # results are presented mostly for reference purposes. # # (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe. # # (***) Less impressive improvement on Core 2 and Atom is due to slow # pshufb, yet it's respectable +28%/64% improvement on Core 2 # and +15% on Atom (as implied, over "hyper-threading-safe" # code path). 
# # $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../../perlasm"); require "x86asm.pl"; $output = pop; open OUT,">$output"; *STDOUT=*OUT; &asm_init($ARGV[0]); $PREFIX="vpaes"; my ($round, $base, $magic, $key, $const, $inp, $out)= ("eax", "ebx", "ecx", "edx","ebp", "esi","edi"); &preprocessor_ifdef("BORINGSSL_DISPATCH_TEST") &external_label("BORINGSSL_function_hit"); &preprocessor_endif(); &static_label("_vpaes_consts"); &static_label("_vpaes_schedule_low_round"); &set_label("_vpaes_consts",64); $k_inv=-0x30; # inv, inva &data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309); &data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C); $k_s0F=-0x10; # s0F &data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F); $k_ipt=0x00; # input transform (lo, hi) &data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090); &data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC); $k_sb1=0x20; # sb1u, sb1t &data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E); &data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1); $k_sb2=0x40; # sb2u, sb2t &data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955); &data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8); $k_sbo=0x60; # sbou, sbot &data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A); &data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1); $k_mc_forward=0x80; # mc_forward &data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D); &data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201); &data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605); &data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09); $k_mc_backward=0xc0; # mc_backward &data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F); &data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B); &data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407); &data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003); $k_sr=0x100; # sr &data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C); &data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C); &data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C); &data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C); $k_rcon=0x140; # rcon &data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808); $k_s63=0x150; # s63: all equal to 0x63 transformed &data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B); $k_opt=0x160; # output transform &data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121); &data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1); $k_deskew=0x180; # deskew tables: inverts the sbox's "skew" &data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A); &data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB); &asciz ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)"); &align (64); &function_begin_B("_vpaes_preheat"); &add ($const,&DWP(0,"esp")); &movdqa ("xmm7",&QWP($k_inv,$const)); &movdqa ("xmm6",&QWP($k_s0F,$const)); &ret (); &function_end_B("_vpaes_preheat"); ## ## _aes_encrypt_core ## ## AES-encrypt %xmm0. 
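## (The &QWP($k_*,$const) table references below assume $const (%ebp)
## already holds _vpaes_consts+0x30: the public entry points load the
## link-time offset _vpaes_consts+0x30-pic_point, and _vpaes_preheat /
## _vpaes_schedule_core add the return address at (%esp), i.e. the
## address of pic_point.)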
## ## Inputs: ## %xmm0 = input ## %xmm6-%xmm7 as in _vpaes_preheat ## (%edx) = scheduled keys ## ## Output in %xmm0 ## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx ## ## &function_begin_B("_vpaes_encrypt_core"); &mov ($magic,16); &mov ($round,&DWP(240,$key)); &movdqa ("xmm1","xmm6") &movdqa ("xmm2",&QWP($k_ipt,$const)); &pandn ("xmm1","xmm0"); &pand ("xmm0","xmm6"); &movdqu ("xmm5",&QWP(0,$key)); &pshufb ("xmm2","xmm0"); &movdqa ("xmm0",&QWP($k_ipt+16,$const)); &pxor ("xmm2","xmm5"); &psrld ("xmm1",4); &add ($key,16); &pshufb ("xmm0","xmm1"); &lea ($base,&DWP($k_mc_backward,$const)); &pxor ("xmm0","xmm2"); &jmp (&label("enc_entry")); &set_label("enc_loop",16); # middle of middle round &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t &pshufb ("xmm4","xmm2"); # 4 = sb1u &pshufb ("xmm0","xmm3"); # 0 = sb1t &pxor ("xmm4","xmm5"); # 4 = sb1u + k &movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u &pxor ("xmm0","xmm4"); # 0 = A &movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[] &pshufb ("xmm5","xmm2"); # 4 = sb2u &movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t &movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[] &pshufb ("xmm2","xmm3"); # 2 = sb2t &movdqa ("xmm3","xmm0"); # 3 = A &pxor ("xmm2","xmm5"); # 2 = 2A &pshufb ("xmm0","xmm1"); # 0 = B &add ($key,16); # next key &pxor ("xmm0","xmm2"); # 0 = 2A+B &pshufb ("xmm3","xmm4"); # 3 = D &add ($magic,16); # next mc &pxor ("xmm3","xmm0"); # 3 = 2A+B+D &pshufb ("xmm0","xmm1"); # 0 = 2B+C &and ($magic,0x30); # ... mod 4 &sub ($round,1); # nr-- &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D &set_label("enc_entry"); # top of round &movdqa ("xmm1","xmm6"); # 1 : i &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k &pandn ("xmm1","xmm0"); # 1 = i<<4 &psrld ("xmm1",4); # 1 = i &pand ("xmm0","xmm6"); # 0 = k &pshufb ("xmm5","xmm0"); # 2 = a/k &movdqa ("xmm3","xmm7"); # 3 : 1/i &pxor ("xmm0","xmm1"); # 0 = j &pshufb ("xmm3","xmm1"); # 3 = 1/i &movdqa ("xmm4","xmm7"); # 4 : 1/j &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k &pshufb ("xmm4","xmm0"); # 4 = 1/j &movdqa ("xmm2","xmm7"); # 2 : 1/iak &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k &pshufb ("xmm2","xmm3"); # 2 = 1/iak &movdqa ("xmm3","xmm7"); # 3 : 1/jak &pxor ("xmm2","xmm0"); # 2 = io &pshufb ("xmm3","xmm4"); # 3 = 1/jak &movdqu ("xmm5",&QWP(0,$key)); &pxor ("xmm3","xmm1"); # 3 = jo &jnz (&label("enc_loop")); # middle of last round &movdqa ("xmm4",&QWP($k_sbo,$const)); # 3 : sbou .Lk_sbo &movdqa ("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot .Lk_sbo+16 &pshufb ("xmm4","xmm2"); # 4 = sbou &pxor ("xmm4","xmm5"); # 4 = sb1u + k &pshufb ("xmm0","xmm3"); # 0 = sb1t &movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[] &pxor ("xmm0","xmm4"); # 0 = A &pshufb ("xmm0","xmm1"); &ret (); &function_end_B("_vpaes_encrypt_core"); ######################################################## ## ## ## AES key schedule ## ## ## ######################################################## &function_begin_B("_vpaes_schedule_core"); &add ($const,&DWP(0,"esp")); &movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned) &movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon # input transform &movdqa ("xmm3","xmm0"); &lea ($base,&DWP($k_ipt,$const)); &movdqa (&QWP(4,"esp"),"xmm2"); # xmm8 &call ("_vpaes_schedule_transform"); &movdqa ("xmm7","xmm0"); &test ($out,$out); &jnz (&label("schedule_am_decrypting")); # encrypting, output zeroth round key after transform &movdqu (&QWP(0,$key),"xmm0"); &jmp (&label("schedule_go")); &set_label("schedule_am_decrypting"); # decrypting, output zeroth 
round key after shiftrows &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); &pshufb ("xmm3","xmm1"); &movdqu (&QWP(0,$key),"xmm3"); &xor ($magic,0x30); &set_label("schedule_go"); &cmp ($round,192); &ja (&label("schedule_256")); # 192-bit key support was removed. # 128: fall though ## ## .schedule_128 ## ## 128-bit specific part of key schedule. ## ## This schedule is really simple, because all its parts ## are accomplished by the subroutines. ## &set_label("schedule_128"); &mov ($round,10); &set_label("loop_schedule_128"); &call ("_vpaes_schedule_round"); &dec ($round); &jz (&label("schedule_mangle_last")); &call ("_vpaes_schedule_mangle"); # write output &jmp (&label("loop_schedule_128")); ## ## .aes_schedule_256 ## ## 256-bit specific part of key schedule. ## ## The structure here is very similar to the 128-bit ## schedule, but with an additional "low side" in ## %xmm6. The low side's rounds are the same as the ## high side's, except no rcon and no rotation. ## &set_label("schedule_256",16); &movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned) &call ("_vpaes_schedule_transform"); # input transform &mov ($round,7); &set_label("loop_schedule_256"); &call ("_vpaes_schedule_mangle"); # output low result &movdqa ("xmm6","xmm0"); # save cur_lo in xmm6 # high round &call ("_vpaes_schedule_round"); &dec ($round); &jz (&label("schedule_mangle_last")); &call ("_vpaes_schedule_mangle"); # low round. swap xmm7 and xmm6 &pshufd ("xmm0","xmm0",0xFF); &movdqa (&QWP(20,"esp"),"xmm7"); &movdqa ("xmm7","xmm6"); &call ("_vpaes_schedule_low_round"); &movdqa ("xmm7",&QWP(20,"esp")); &jmp (&label("loop_schedule_256")); ## ## .aes_schedule_mangle_last ## ## Mangler for last round of key schedule ## Mangles %xmm0 ## when encrypting, outputs out(%xmm0) ^ 63 ## when decrypting, outputs unskew(%xmm0) ## ## Always called right before return... jumps to cleanup and exits ## &set_label("schedule_mangle_last",16); # schedule last round key from xmm0 &lea ($base,&DWP($k_deskew,$const)); &test ($out,$out); &jnz (&label("schedule_mangle_last_dec")); # encrypting &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); &pshufb ("xmm0","xmm1"); # output permute &lea ($base,&DWP($k_opt,$const)); # prepare to output transform &add ($key,32); &set_label("schedule_mangle_last_dec"); &add ($key,-16); &pxor ("xmm0",&QWP($k_s63,$const)); &call ("_vpaes_schedule_transform"); # output transform &movdqu (&QWP(0,$key),"xmm0"); # save last key # cleanup &pxor ("xmm0","xmm0"); &pxor ("xmm1","xmm1"); &pxor ("xmm2","xmm2"); &pxor ("xmm3","xmm3"); &pxor ("xmm4","xmm4"); &pxor ("xmm5","xmm5"); &pxor ("xmm6","xmm6"); &pxor ("xmm7","xmm7"); &ret (); &function_end_B("_vpaes_schedule_core"); ## ## .aes_schedule_round ## ## Runs one main round of the key schedule on %xmm0, %xmm7 ## ## Specifically, runs subbytes on the high dword of %xmm0 ## then rotates it by one byte and xors into the low dword of ## %xmm7. ## ## Adds rcon from low byte of %xmm8, then rotates %xmm8 for ## next rcon. ## ## Smears the dwords of %xmm7 by xoring the low into the ## second low, result into third, result into highest. ## ## Returns results in %xmm7 = %xmm0. ## Clobbers %xmm1-%xmm5. ## &function_begin_B("_vpaes_schedule_round"); # extract rcon from xmm8 &movdqa ("xmm2",&QWP(8,"esp")); # xmm8 &pxor ("xmm1","xmm1"); &palignr("xmm1","xmm2",15); &palignr("xmm2","xmm2",15); &pxor ("xmm7","xmm1"); # rotate &pshufd ("xmm0","xmm0",0xFF); &palignr("xmm0","xmm0",1); # fall through... &movdqa (&QWP(8,"esp"),"xmm2"); # xmm8 # low round: same as high round, but no rotation and no rcon. 
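# Note: with only eight XMM registers on 32-bit x86, the rcon value that the
# 64-bit code keeps in %xmm8 is spilled to the stack. _vpaes_schedule_core
# stores it at 4(%esp); inside the called _vpaes_schedule_round the same slot
# is addressed as 8(%esp) because the return address has been pushed. The
# round reloads it above and writes the rotated value back before falling
# through to the low round.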
&set_label("_vpaes_schedule_low_round"); # smear xmm7 &movdqa ("xmm1","xmm7"); &pslldq ("xmm7",4); &pxor ("xmm7","xmm1"); &movdqa ("xmm1","xmm7"); &pslldq ("xmm7",8); &pxor ("xmm7","xmm1"); &pxor ("xmm7",&QWP($k_s63,$const)); # subbyte &movdqa ("xmm4",&QWP($k_s0F,$const)); &movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j &movdqa ("xmm1","xmm4"); &pandn ("xmm1","xmm0"); &psrld ("xmm1",4); # 1 = i &pand ("xmm0","xmm4"); # 0 = k &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k &pshufb ("xmm2","xmm0"); # 2 = a/k &pxor ("xmm0","xmm1"); # 0 = j &movdqa ("xmm3","xmm5"); # 3 : 1/i &pshufb ("xmm3","xmm1"); # 3 = 1/i &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k &movdqa ("xmm4","xmm5"); # 4 : 1/j &pshufb ("xmm4","xmm0"); # 4 = 1/j &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k &movdqa ("xmm2","xmm5"); # 2 : 1/iak &pshufb ("xmm2","xmm3"); # 2 = 1/iak &pxor ("xmm2","xmm0"); # 2 = io &movdqa ("xmm3","xmm5"); # 3 : 1/jak &pshufb ("xmm3","xmm4"); # 3 = 1/jak &pxor ("xmm3","xmm1"); # 3 = jo &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou &pshufb ("xmm4","xmm2"); # 4 = sbou &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot &pshufb ("xmm0","xmm3"); # 0 = sb1t &pxor ("xmm0","xmm4"); # 0 = sbox output # add in smeared stuff &pxor ("xmm0","xmm7"); &movdqa ("xmm7","xmm0"); &ret (); &function_end_B("_vpaes_schedule_round"); ## ## .aes_schedule_transform ## ## Linear-transform %xmm0 according to tables at (%ebx) ## ## Output in %xmm0 ## Clobbers %xmm1, %xmm2 ## &function_begin_B("_vpaes_schedule_transform"); &movdqa ("xmm2",&QWP($k_s0F,$const)); &movdqa ("xmm1","xmm2"); &pandn ("xmm1","xmm0"); &psrld ("xmm1",4); &pand ("xmm0","xmm2"); &movdqa ("xmm2",&QWP(0,$base)); &pshufb ("xmm2","xmm0"); &movdqa ("xmm0",&QWP(16,$base)); &pshufb ("xmm0","xmm1"); &pxor ("xmm0","xmm2"); &ret (); &function_end_B("_vpaes_schedule_transform"); ## ## .aes_schedule_mangle ## ## Mangle xmm0 from (basis-transformed) standard version ## to our version. 
## ## On encrypt, ## xor with 0x63 ## multiply by circulant 0,1,1,1 ## apply shiftrows transform ## ## On decrypt, ## xor with 0x63 ## multiply by "inverse mixcolumns" circulant E,B,D,9 ## deskew ## apply shiftrows transform ## ## ## Writes out to (%edx), and increments or decrements it ## Keeps track of round number mod 4 in %ecx ## Preserves xmm0 ## Clobbers xmm1-xmm5 ## &function_begin_B("_vpaes_schedule_mangle"); &movdqa ("xmm4","xmm0"); # save xmm0 for later &movdqa ("xmm5",&QWP($k_mc_forward,$const)); &test ($out,$out); &jnz (&label("schedule_mangle_dec")); # encrypting &add ($key,16); &pxor ("xmm4",&QWP($k_s63,$const)); &pshufb ("xmm4","xmm5"); &movdqa ("xmm3","xmm4"); &pshufb ("xmm4","xmm5"); &pxor ("xmm3","xmm4"); &pshufb ("xmm4","xmm5"); &pxor ("xmm3","xmm4"); &jmp (&label("schedule_mangle_both")); &set_label("schedule_mangle_dec",16); # inverse mix columns &movdqa ("xmm2",&QWP($k_s0F,$const)); &lea ($inp,&DWP($k_dksd,$const)); &movdqa ("xmm1","xmm2"); &pandn ("xmm1","xmm4"); &psrld ("xmm1",4); # 1 = hi &pand ("xmm4","xmm2"); # 4 = lo &movdqa ("xmm2",&QWP(0,$inp)); &pshufb ("xmm2","xmm4"); &movdqa ("xmm3",&QWP(0x10,$inp)); &pshufb ("xmm3","xmm1"); &pxor ("xmm3","xmm2"); &pshufb ("xmm3","xmm5"); &movdqa ("xmm2",&QWP(0x20,$inp)); &pshufb ("xmm2","xmm4"); &pxor ("xmm2","xmm3"); &movdqa ("xmm3",&QWP(0x30,$inp)); &pshufb ("xmm3","xmm1"); &pxor ("xmm3","xmm2"); &pshufb ("xmm3","xmm5"); &movdqa ("xmm2",&QWP(0x40,$inp)); &pshufb ("xmm2","xmm4"); &pxor ("xmm2","xmm3"); &movdqa ("xmm3",&QWP(0x50,$inp)); &pshufb ("xmm3","xmm1"); &pxor ("xmm3","xmm2"); &pshufb ("xmm3","xmm5"); &movdqa ("xmm2",&QWP(0x60,$inp)); &pshufb ("xmm2","xmm4"); &pxor ("xmm2","xmm3"); &movdqa ("xmm3",&QWP(0x70,$inp)); &pshufb ("xmm3","xmm1"); &pxor ("xmm3","xmm2"); &add ($key,-16); &set_label("schedule_mangle_both"); &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); &pshufb ("xmm3","xmm1"); &add ($magic,-16); &and ($magic,0x30); &movdqu (&QWP(0,$key),"xmm3"); &ret (); &function_end_B("_vpaes_schedule_mangle"); # # Interface to OpenSSL # &function_begin("${PREFIX}_set_encrypt_key"); record_function_hit(5); &mov ($inp,&wparam(0)); # inp &lea ($base,&DWP(-56,"esp")); &mov ($round,&wparam(1)); # bits &and ($base,-16); &mov ($key,&wparam(2)); # key &xchg ($base,"esp"); # alloca &mov (&DWP(48,"esp"),$base); &mov ($base,$round); &shr ($base,5); &add ($base,5); &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5; &mov ($magic,0x30); &mov ($out,0); &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); &call ("_vpaes_schedule_core"); &set_label("pic_point"); &mov ("esp",&DWP(48,"esp")); &xor ("eax","eax"); &function_end("${PREFIX}_set_encrypt_key"); &function_begin("${PREFIX}_encrypt"); record_function_hit(4); &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); &call ("_vpaes_preheat"); &set_label("pic_point"); &mov ($inp,&wparam(0)); # inp &lea ($base,&DWP(-56,"esp")); &mov ($out,&wparam(1)); # out &and ($base,-16); &mov ($key,&wparam(2)); # key &xchg ($base,"esp"); # alloca &mov (&DWP(48,"esp"),$base); &movdqu ("xmm0",&QWP(0,$inp)); &call ("_vpaes_encrypt_core"); &movdqu (&QWP(0,$out),"xmm0"); &mov ("esp",&DWP(48,"esp")); &function_end("${PREFIX}_encrypt"); &asm_finish(); close STDOUT or die "error closing STDOUT: $!"; ring-0.17.14/crypto/fipsmodule/aes/asm/vpaes-x86_64.pl000064400000000000000000000652661046102023000204020ustar 00000000000000#! /usr/bin/env perl # Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ###################################################################### ## Constant-time SSSE3 AES core implementation. ## version 0.1 ## ## By Mike Hamburg (Stanford University), 2009 ## Public domain. ## ## For details see http://shiftleft.org/papers/vector_aes/ and ## http://crypto.stanford.edu/vpaes/. ###################################################################### # September 2011. # # Interface to OpenSSL as "almost" drop-in replacement for # aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt # doesn't handle partial vectors (doesn't have to if called from # EVP only). "Drop-in" implies that this module doesn't share key # schedule structure with the original nor does it make assumption # about its alignment... # # Performance summary. aes-x86_64.pl column lists large-block CBC # encrypt/decrypt/with-hyper-threading-off(*) results in cycles per # byte processed with 128-bit key, and vpaes-x86_64.pl column - # [also large-block CBC] encrypt/decrypt. # # aes-x86_64.pl vpaes-x86_64.pl # # Core 2(**) 29.6/41.1/14.3 21.9/25.2(***) # Nehalem 29.6/40.3/14.6 10.0/11.8 # Atom 57.3/74.2/32.1 60.9/77.2(***) # Silvermont 52.7/64.0/19.5 48.8/60.8(***) # Goldmont 38.9/49.0/17.8 10.6/12.6 # # (*) "Hyper-threading" in the context refers rather to cache shared # among multiple cores, than to specifically Intel HTT. As vast # majority of contemporary cores share cache, slower code path # is common place. In other words "with-hyper-threading-off" # results are presented mostly for reference purposes. # # (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe. # # (***) Less impressive improvement on Core 2 and Atom is due to slow # pshufb, yet it's respectable +36%/62% improvement on Core 2 # (as implied, over "hyper-threading-safe" code path). # # $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; $PREFIX="vpaes"; $code.=<<___; .text ## ## _aes_encrypt_core ## ## AES-encrypt %xmm0. 
## ## Inputs: ## %xmm0 = input ## %xmm9-%xmm15 as in _vpaes_preheat ## (%rdx) = scheduled keys ## ## Output in %xmm0 ## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax ## Preserves %xmm6 - %xmm8 so you get some local vectors ## ## .type _vpaes_encrypt_core,\@abi-omnipotent .align 16 _vpaes_encrypt_core: .cfi_startproc mov %rdx, %r9 mov \$16, %r11 mov 240(%rdx),%eax movdqa %xmm9, %xmm1 movdqa .Lk_ipt(%rip), %xmm2 # iptlo pandn %xmm0, %xmm1 movdqu (%r9), %xmm5 # round0 key psrld \$4, %xmm1 pand %xmm9, %xmm0 pshufb %xmm0, %xmm2 movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi pshufb %xmm1, %xmm0 pxor %xmm5, %xmm2 add \$16, %r9 pxor %xmm2, %xmm0 lea .Lk_mc_backward(%rip),%r10 jmp .Lenc_entry .align 16 .Lenc_loop: # middle of middle round movdqa %xmm13, %xmm4 # 4 : sb1u movdqa %xmm12, %xmm0 # 0 : sb1t pshufb %xmm2, %xmm4 # 4 = sb1u pshufb %xmm3, %xmm0 # 0 = sb1t pxor %xmm5, %xmm4 # 4 = sb1u + k movdqa %xmm15, %xmm5 # 4 : sb2u pxor %xmm4, %xmm0 # 0 = A movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] pshufb %xmm2, %xmm5 # 4 = sb2u movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] movdqa %xmm14, %xmm2 # 2 : sb2t pshufb %xmm3, %xmm2 # 2 = sb2t movdqa %xmm0, %xmm3 # 3 = A pxor %xmm5, %xmm2 # 2 = 2A pshufb %xmm1, %xmm0 # 0 = B add \$16, %r9 # next key pxor %xmm2, %xmm0 # 0 = 2A+B pshufb %xmm4, %xmm3 # 3 = D add \$16, %r11 # next mc pxor %xmm0, %xmm3 # 3 = 2A+B+D pshufb %xmm1, %xmm0 # 0 = 2B+C and \$0x30, %r11 # ... mod 4 sub \$1,%rax # nr-- pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D .Lenc_entry: # top of round movdqa %xmm9, %xmm1 # 1 : i movdqa %xmm11, %xmm5 # 2 : a/k pandn %xmm0, %xmm1 # 1 = i<<4 psrld \$4, %xmm1 # 1 = i pand %xmm9, %xmm0 # 0 = k pshufb %xmm0, %xmm5 # 2 = a/k movdqa %xmm10, %xmm3 # 3 : 1/i pxor %xmm1, %xmm0 # 0 = j pshufb %xmm1, %xmm3 # 3 = 1/i movdqa %xmm10, %xmm4 # 4 : 1/j pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k pshufb %xmm0, %xmm4 # 4 = 1/j movdqa %xmm10, %xmm2 # 2 : 1/iak pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k pshufb %xmm3, %xmm2 # 2 = 1/iak movdqa %xmm10, %xmm3 # 3 : 1/jak pxor %xmm0, %xmm2 # 2 = io pshufb %xmm4, %xmm3 # 3 = 1/jak movdqu (%r9), %xmm5 pxor %xmm1, %xmm3 # 3 = jo jnz .Lenc_loop # middle of last round movdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 pshufb %xmm2, %xmm4 # 4 = sbou pxor %xmm5, %xmm4 # 4 = sb1u + k pshufb %xmm3, %xmm0 # 0 = sb1t movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] pxor %xmm4, %xmm0 # 0 = A pshufb %xmm1, %xmm0 ret .cfi_endproc .size _vpaes_encrypt_core,.-_vpaes_encrypt_core ## ## _aes_encrypt_core_2x ## ## AES-encrypt %xmm0 and %xmm6 in parallel. ## ## Inputs: ## %xmm0 and %xmm6 = input ## %xmm9 and %xmm10 as in _vpaes_preheat ## (%rdx) = scheduled keys ## ## Output in %xmm0 and %xmm6 ## Clobbers %xmm1-%xmm5, %xmm7, %xmm8, %xmm11-%xmm13, %r9, %r10, %r11, %rax ## Preserves %xmm14 and %xmm15 ## ## This function stitches two parallel instances of _vpaes_encrypt_core. x86_64 ## provides 16 XMM registers. _vpaes_encrypt_core computes over six registers ## (%xmm0-%xmm5) and additionally uses seven registers with preloaded constants ## from _vpaes_preheat (%xmm9-%xmm15). This does not quite fit two instances, ## so we spill some of %xmm9 through %xmm15 back to memory. We keep %xmm9 and ## %xmm10 in registers as these values are used several times in a row. The ## remainder are read once per round and are spilled to memory. This leaves two ## registers preserved for the caller. ## ## Thus, of the two _vpaes_encrypt_core instances, the first uses (%xmm0-%xmm5) ## as before. The second uses %xmm6-%xmm8,%xmm11-%xmm13. 
(Add 6 to %xmm2 and ## below. Add 8 to %xmm3 and up.) Instructions in the second instance are ## indented by one space. ## ## .type _vpaes_encrypt_core_2x,\@abi-omnipotent .align 16 _vpaes_encrypt_core_2x: .cfi_startproc mov %rdx, %r9 mov \$16, %r11 mov 240(%rdx),%eax movdqa %xmm9, %xmm1 movdqa %xmm9, %xmm7 movdqa .Lk_ipt(%rip), %xmm2 # iptlo movdqa %xmm2, %xmm8 pandn %xmm0, %xmm1 pandn %xmm6, %xmm7 movdqu (%r9), %xmm5 # round0 key # Also use %xmm5 in the second instance. psrld \$4, %xmm1 psrld \$4, %xmm7 pand %xmm9, %xmm0 pand %xmm9, %xmm6 pshufb %xmm0, %xmm2 pshufb %xmm6, %xmm8 movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi movdqa %xmm0, %xmm6 pshufb %xmm1, %xmm0 pshufb %xmm7, %xmm6 pxor %xmm5, %xmm2 pxor %xmm5, %xmm8 add \$16, %r9 pxor %xmm2, %xmm0 pxor %xmm8, %xmm6 lea .Lk_mc_backward(%rip),%r10 jmp .Lenc2x_entry .align 16 .Lenc2x_loop: # middle of middle round movdqa .Lk_sb1(%rip), %xmm4 # 4 : sb1u movdqa .Lk_sb1+16(%rip),%xmm0 # 0 : sb1t movdqa %xmm4, %xmm12 movdqa %xmm0, %xmm6 pshufb %xmm2, %xmm4 # 4 = sb1u pshufb %xmm8, %xmm12 pshufb %xmm3, %xmm0 # 0 = sb1t pshufb %xmm11, %xmm6 pxor %xmm5, %xmm4 # 4 = sb1u + k pxor %xmm5, %xmm12 movdqa .Lk_sb2(%rip), %xmm5 # 4 : sb2u movdqa %xmm5, %xmm13 pxor %xmm4, %xmm0 # 0 = A pxor %xmm12, %xmm6 movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] # Also use %xmm1 in the second instance. pshufb %xmm2, %xmm5 # 4 = sb2u pshufb %xmm8, %xmm13 movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] # Also use %xmm4 in the second instance. movdqa .Lk_sb2+16(%rip), %xmm2 # 2 : sb2t movdqa %xmm2, %xmm8 pshufb %xmm3, %xmm2 # 2 = sb2t pshufb %xmm11, %xmm8 movdqa %xmm0, %xmm3 # 3 = A movdqa %xmm6, %xmm11 pxor %xmm5, %xmm2 # 2 = 2A pxor %xmm13, %xmm8 pshufb %xmm1, %xmm0 # 0 = B pshufb %xmm1, %xmm6 add \$16, %r9 # next key pxor %xmm2, %xmm0 # 0 = 2A+B pxor %xmm8, %xmm6 pshufb %xmm4, %xmm3 # 3 = D pshufb %xmm4, %xmm11 add \$16, %r11 # next mc pxor %xmm0, %xmm3 # 3 = 2A+B+D pxor %xmm6, %xmm11 pshufb %xmm1, %xmm0 # 0 = 2B+C pshufb %xmm1, %xmm6 and \$0x30, %r11 # ... mod 4 sub \$1,%rax # nr-- pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D pxor %xmm11, %xmm6 .Lenc2x_entry: # top of round movdqa %xmm9, %xmm1 # 1 : i movdqa %xmm9, %xmm7 movdqa .Lk_inv+16(%rip), %xmm5 # 2 : a/k movdqa %xmm5, %xmm13 pandn %xmm0, %xmm1 # 1 = i<<4 pandn %xmm6, %xmm7 psrld \$4, %xmm1 # 1 = i psrld \$4, %xmm7 pand %xmm9, %xmm0 # 0 = k pand %xmm9, %xmm6 pshufb %xmm0, %xmm5 # 2 = a/k pshufb %xmm6, %xmm13 movdqa %xmm10, %xmm3 # 3 : 1/i movdqa %xmm10, %xmm11 pxor %xmm1, %xmm0 # 0 = j pxor %xmm7, %xmm6 pshufb %xmm1, %xmm3 # 3 = 1/i pshufb %xmm7, %xmm11 movdqa %xmm10, %xmm4 # 4 : 1/j movdqa %xmm10, %xmm12 pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k pxor %xmm13, %xmm11 pshufb %xmm0, %xmm4 # 4 = 1/j pshufb %xmm6, %xmm12 movdqa %xmm10, %xmm2 # 2 : 1/iak movdqa %xmm10, %xmm8 pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k pxor %xmm13, %xmm12 pshufb %xmm3, %xmm2 # 2 = 1/iak pshufb %xmm11, %xmm8 movdqa %xmm10, %xmm3 # 3 : 1/jak movdqa %xmm10, %xmm11 pxor %xmm0, %xmm2 # 2 = io pxor %xmm6, %xmm8 pshufb %xmm4, %xmm3 # 3 = 1/jak pshufb %xmm12, %xmm11 movdqu (%r9), %xmm5 # Also use %xmm5 in the second instance. 
pxor %xmm1, %xmm3 # 3 = jo pxor %xmm7, %xmm11 jnz .Lenc2x_loop # middle of last round movdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 movdqa %xmm4, %xmm12 movdqa %xmm0, %xmm6 pshufb %xmm2, %xmm4 # 4 = sbou pshufb %xmm8, %xmm12 pxor %xmm5, %xmm4 # 4 = sb1u + k pxor %xmm5, %xmm12 pshufb %xmm3, %xmm0 # 0 = sb1t pshufb %xmm11, %xmm6 movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] # Also use %xmm1 in the second instance. pxor %xmm4, %xmm0 # 0 = A pxor %xmm12, %xmm6 pshufb %xmm1, %xmm0 pshufb %xmm1, %xmm6 ret .cfi_endproc .size _vpaes_encrypt_core_2x,.-_vpaes_encrypt_core_2x ######################################################## ## ## ## AES key schedule ## ## ## ######################################################## .type _vpaes_schedule_core,\@abi-omnipotent .align 16 _vpaes_schedule_core: .cfi_startproc # rdi = key # rsi = size in bits # rdx = buffer # rcx = direction. 0=encrypt, 1=decrypt call _vpaes_preheat # load the tables movdqa .Lk_rcon(%rip), %xmm8 # load rcon movdqu (%rdi), %xmm0 # load key (unaligned) # input transform movdqa %xmm0, %xmm3 lea .Lk_ipt(%rip), %r11 call _vpaes_schedule_transform movdqa %xmm0, %xmm7 lea .Lk_sr(%rip),%r10 # encrypting, output zeroth round key after transform movdqu %xmm0, (%rdx) .Lschedule_go: cmp \$192, %esi ja .Lschedule_256 # 192-bit key support was removed. # 128: fall though ## ## .schedule_128 ## ## 128-bit specific part of key schedule. ## ## This schedule is really simple, because all its parts ## are accomplished by the subroutines. ## .Lschedule_128: mov \$10, %esi .Loop_schedule_128: call _vpaes_schedule_round dec %rsi jz .Lschedule_mangle_last call _vpaes_schedule_mangle # write output jmp .Loop_schedule_128 ## ## .aes_schedule_256 ## ## 256-bit specific part of key schedule. ## ## The structure here is very similar to the 128-bit ## schedule, but with an additional "low side" in ## %xmm6. The low side's rounds are the same as the ## high side's, except no rcon and no rotation. ## .align 16 .Lschedule_256: movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) call _vpaes_schedule_transform # input transform mov \$7, %esi .Loop_schedule_256: call _vpaes_schedule_mangle # output low result movdqa %xmm0, %xmm6 # save cur_lo in xmm6 # high round call _vpaes_schedule_round dec %rsi jz .Lschedule_mangle_last call _vpaes_schedule_mangle # low round. swap xmm7 and xmm6 pshufd \$0xFF, %xmm0, %xmm0 movdqa %xmm7, %xmm5 movdqa %xmm6, %xmm7 call _vpaes_schedule_low_round movdqa %xmm5, %xmm7 jmp .Loop_schedule_256 ## ## .aes_schedule_mangle_last ## ## Mangler for last round of key schedule ## Mangles %xmm0 ## when encrypting, outputs out(%xmm0) ^ 63 ## when decrypting, outputs unskew(%xmm0) ## ## Always called right before return... 
jumps to cleanup and exits ## .align 16 .Lschedule_mangle_last: # schedule last round key from xmm0 lea .Lk_deskew(%rip),%r11 # prepare to deskew # encrypting movdqa (%r8,%r10),%xmm1 pshufb %xmm1, %xmm0 # output permute lea .Lk_opt(%rip), %r11 # prepare to output transform add \$32, %rdx .Lschedule_mangle_last_dec: add \$-16, %rdx pxor .Lk_s63(%rip), %xmm0 call _vpaes_schedule_transform # output transform movdqu %xmm0, (%rdx) # save last key # cleanup pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 ret .cfi_endproc .size _vpaes_schedule_core,.-_vpaes_schedule_core ## ## .aes_schedule_round ## ## Runs one main round of the key schedule on %xmm0, %xmm7 ## ## Specifically, runs subbytes on the high dword of %xmm0 ## then rotates it by one byte and xors into the low dword of ## %xmm7. ## ## Adds rcon from low byte of %xmm8, then rotates %xmm8 for ## next rcon. ## ## Smears the dwords of %xmm7 by xoring the low into the ## second low, result into third, result into highest. ## ## Returns results in %xmm7 = %xmm0. ## Clobbers %xmm1-%xmm4, %r11. ## .type _vpaes_schedule_round,\@abi-omnipotent .align 16 _vpaes_schedule_round: .cfi_startproc # extract rcon from xmm8 pxor %xmm1, %xmm1 palignr \$15, %xmm8, %xmm1 palignr \$15, %xmm8, %xmm8 pxor %xmm1, %xmm7 # rotate pshufd \$0xFF, %xmm0, %xmm0 palignr \$1, %xmm0, %xmm0 # fall through... # low round: same as high round, but no rotation and no rcon. _vpaes_schedule_low_round: # smear xmm7 movdqa %xmm7, %xmm1 pslldq \$4, %xmm7 pxor %xmm1, %xmm7 movdqa %xmm7, %xmm1 pslldq \$8, %xmm7 pxor %xmm1, %xmm7 pxor .Lk_s63(%rip), %xmm7 # subbytes movdqa %xmm9, %xmm1 pandn %xmm0, %xmm1 psrld \$4, %xmm1 # 1 = i pand %xmm9, %xmm0 # 0 = k movdqa %xmm11, %xmm2 # 2 : a/k pshufb %xmm0, %xmm2 # 2 = a/k pxor %xmm1, %xmm0 # 0 = j movdqa %xmm10, %xmm3 # 3 : 1/i pshufb %xmm1, %xmm3 # 3 = 1/i pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k movdqa %xmm10, %xmm4 # 4 : 1/j pshufb %xmm0, %xmm4 # 4 = 1/j pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k movdqa %xmm10, %xmm2 # 2 : 1/iak pshufb %xmm3, %xmm2 # 2 = 1/iak pxor %xmm0, %xmm2 # 2 = io movdqa %xmm10, %xmm3 # 3 : 1/jak pshufb %xmm4, %xmm3 # 3 = 1/jak pxor %xmm1, %xmm3 # 3 = jo movdqa %xmm13, %xmm4 # 4 : sbou pshufb %xmm2, %xmm4 # 4 = sbou movdqa %xmm12, %xmm0 # 0 : sbot pshufb %xmm3, %xmm0 # 0 = sb1t pxor %xmm4, %xmm0 # 0 = sbox output # add in smeared stuff pxor %xmm7, %xmm0 movdqa %xmm0, %xmm7 ret .cfi_endproc .size _vpaes_schedule_round,.-_vpaes_schedule_round ## ## .aes_schedule_transform ## ## Linear-transform %xmm0 according to tables at (%r11) ## ## Requires that %xmm9 = 0x0F0F... as in preheat ## Output in %xmm0 ## Clobbers %xmm1, %xmm2 ## .type _vpaes_schedule_transform,\@abi-omnipotent .align 16 _vpaes_schedule_transform: .cfi_startproc movdqa %xmm9, %xmm1 pandn %xmm0, %xmm1 psrld \$4, %xmm1 pand %xmm9, %xmm0 movdqa (%r11), %xmm2 # lo pshufb %xmm0, %xmm2 movdqa 16(%r11), %xmm0 # hi pshufb %xmm1, %xmm0 pxor %xmm2, %xmm0 ret .cfi_endproc .size _vpaes_schedule_transform,.-_vpaes_schedule_transform ## ## .aes_schedule_mangle ## ## Mangle xmm0 from (basis-transformed) standard version ## to our version. 
## ## On encrypt, ## xor with 0x63 ## multiply by circulant 0,1,1,1 ## apply shiftrows transform ## ## On decrypt, ## xor with 0x63 ## multiply by "inverse mixcolumns" circulant E,B,D,9 ## deskew ## apply shiftrows transform ## ## ## Writes out to (%rdx), and increments or decrements it ## Keeps track of round number mod 4 in %r8 ## Preserves xmm0 ## Clobbers xmm1-xmm5 ## .type _vpaes_schedule_mangle,\@abi-omnipotent .align 16 _vpaes_schedule_mangle: .cfi_startproc movdqa %xmm0, %xmm4 # save xmm0 for later movdqa .Lk_mc_forward(%rip),%xmm5 # encrypting add \$16, %rdx pxor .Lk_s63(%rip),%xmm4 pshufb %xmm5, %xmm4 movdqa %xmm4, %xmm3 pshufb %xmm5, %xmm4 pxor %xmm4, %xmm3 pshufb %xmm5, %xmm4 pxor %xmm4, %xmm3 .Lschedule_mangle_both: movdqa (%r8,%r10),%xmm1 pshufb %xmm1,%xmm3 add \$-16, %r8 and \$0x30, %r8 movdqu %xmm3, (%rdx) ret .cfi_endproc .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle # # Interface to OpenSSL # .globl ${PREFIX}_set_encrypt_key .type ${PREFIX}_set_encrypt_key,\@function,3 .align 16 ${PREFIX}_set_encrypt_key: .cfi_startproc _CET_ENDBR #ifdef BORINGSSL_DISPATCH_TEST .extern BORINGSSL_function_hit movb \$1, BORINGSSL_function_hit+5(%rip) #endif ___ $code.=<<___ if ($win64); lea -0xb8(%rsp),%rsp movaps %xmm6,0x10(%rsp) movaps %xmm7,0x20(%rsp) movaps %xmm8,0x30(%rsp) movaps %xmm9,0x40(%rsp) movaps %xmm10,0x50(%rsp) movaps %xmm11,0x60(%rsp) movaps %xmm12,0x70(%rsp) movaps %xmm13,0x80(%rsp) movaps %xmm14,0x90(%rsp) movaps %xmm15,0xa0(%rsp) .Lenc_key_body: ___ $code.=<<___; mov %esi,%eax shr \$5,%eax add \$5,%eax mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; mov \$0,%ecx mov \$0x30,%r8d call _vpaes_schedule_core ___ $code.=<<___ if ($win64); movaps 0x10(%rsp),%xmm6 movaps 0x20(%rsp),%xmm7 movaps 0x30(%rsp),%xmm8 movaps 0x40(%rsp),%xmm9 movaps 0x50(%rsp),%xmm10 movaps 0x60(%rsp),%xmm11 movaps 0x70(%rsp),%xmm12 movaps 0x80(%rsp),%xmm13 movaps 0x90(%rsp),%xmm14 movaps 0xa0(%rsp),%xmm15 lea 0xb8(%rsp),%rsp .Lenc_key_epilogue: ___ $code.=<<___; xor %eax,%eax ret .cfi_endproc .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key ___ { my ($inp,$out,$blocks,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx","%r8"); # void vpaes_ctr32_encrypt_blocks(const uint8_t *inp, uint8_t *out, # size_t blocks, const AES_KEY *key, # const uint8_t ivp[16]); $code.=<<___; .globl ${PREFIX}_ctr32_encrypt_blocks .type ${PREFIX}_ctr32_encrypt_blocks,\@function,5 .align 16 ${PREFIX}_ctr32_encrypt_blocks: .cfi_startproc _CET_ENDBR # _vpaes_encrypt_core and _vpaes_encrypt_core_2x expect the key in %rdx. xchg $key, $blocks ___ ($blocks,$key)=($key,$blocks); $code.=<<___; test $blocks, $blocks jz .Lctr32_abort ___ $code.=<<___ if ($win64); lea -0xb8(%rsp),%rsp movaps %xmm6,0x10(%rsp) movaps %xmm7,0x20(%rsp) movaps %xmm8,0x30(%rsp) movaps %xmm9,0x40(%rsp) movaps %xmm10,0x50(%rsp) movaps %xmm11,0x60(%rsp) movaps %xmm12,0x70(%rsp) movaps %xmm13,0x80(%rsp) movaps %xmm14,0x90(%rsp) movaps %xmm15,0xa0(%rsp) .Lctr32_body: ___ $code.=<<___; movdqu ($ivp), %xmm0 # Load IV. movdqa .Lctr_add_one(%rip), %xmm8 sub $inp, $out # This allows only incrementing $inp. call _vpaes_preheat movdqa %xmm0, %xmm6 pshufb .Lrev_ctr(%rip), %xmm6 test \$1, $blocks jz .Lctr32_prep_loop # Handle one block so the remaining block count is even for # _vpaes_encrypt_core_2x. movdqu ($inp), %xmm7 # Load input. 
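	# _vpaes_encrypt_core clobbers %xmm1-%xmm5 but preserves %xmm6-%xmm8,
	# so the plaintext loaded into %xmm7 above survives the call and is
	# XORed with the keystream block just below.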
call _vpaes_encrypt_core pxor %xmm7, %xmm0 paddd %xmm8, %xmm6 movdqu %xmm0, ($out,$inp) sub \$1, $blocks lea 16($inp), $inp jz .Lctr32_done .Lctr32_prep_loop: # _vpaes_encrypt_core_2x leaves only %xmm14 and %xmm15 as spare # registers. We maintain two byte-swapped counters in them. movdqa %xmm6, %xmm14 movdqa %xmm6, %xmm15 paddd %xmm8, %xmm15 .Lctr32_loop: movdqa .Lrev_ctr(%rip), %xmm1 # Set up counters. movdqa %xmm14, %xmm0 movdqa %xmm15, %xmm6 pshufb %xmm1, %xmm0 pshufb %xmm1, %xmm6 call _vpaes_encrypt_core_2x movdqu ($inp), %xmm1 # Load input. movdqu 16($inp), %xmm2 movdqa .Lctr_add_two(%rip), %xmm3 pxor %xmm1, %xmm0 # XOR input. pxor %xmm2, %xmm6 paddd %xmm3, %xmm14 # Increment counters. paddd %xmm3, %xmm15 movdqu %xmm0, ($out,$inp) # Write output. movdqu %xmm6, 16($out,$inp) sub \$2, $blocks # Advance loop. lea 32($inp), $inp jnz .Lctr32_loop .Lctr32_done: ___ $code.=<<___ if ($win64); movaps 0x10(%rsp),%xmm6 movaps 0x20(%rsp),%xmm7 movaps 0x30(%rsp),%xmm8 movaps 0x40(%rsp),%xmm9 movaps 0x50(%rsp),%xmm10 movaps 0x60(%rsp),%xmm11 movaps 0x70(%rsp),%xmm12 movaps 0x80(%rsp),%xmm13 movaps 0x90(%rsp),%xmm14 movaps 0xa0(%rsp),%xmm15 lea 0xb8(%rsp),%rsp .Lctr32_epilogue: ___ $code.=<<___; .Lctr32_abort: ret .cfi_endproc .size ${PREFIX}_ctr32_encrypt_blocks,.-${PREFIX}_ctr32_encrypt_blocks ___ } $code.=<<___; ## ## _aes_preheat ## ## Fills register %r10 -> .aes_consts (so you can -fPIC) ## and %xmm9-%xmm15 as specified below. ## .type _vpaes_preheat,\@abi-omnipotent .align 16 _vpaes_preheat: .cfi_startproc lea .Lk_s0F(%rip), %r10 movdqa -0x20(%r10), %xmm10 # .Lk_inv movdqa -0x10(%r10), %xmm11 # .Lk_inv+16 movdqa 0x00(%r10), %xmm9 # .Lk_s0F movdqa 0x30(%r10), %xmm13 # .Lk_sb1 movdqa 0x40(%r10), %xmm12 # .Lk_sb1+16 movdqa 0x50(%r10), %xmm15 # .Lk_sb2 movdqa 0x60(%r10), %xmm14 # .Lk_sb2+16 ret .cfi_endproc .size _vpaes_preheat,.-_vpaes_preheat ######################################################## ## ## ## Constants ## ## ## ######################################################## .type _vpaes_consts,\@object .section .rodata .align 64 _vpaes_consts: .Lk_inv: # inv, inva .quad 0x0E05060F0D080180, 0x040703090A0B0C02 .quad 0x01040A060F0B0780, 0x030D0E0C02050809 .Lk_s0F: # s0F .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F .Lk_ipt: # input transform (lo, hi) .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 .Lk_sb1: # sb1u, sb1t .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF .Lk_sb2: # sb2u, sb2t .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A .Lk_sbo: # sbou, sbot .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA .Lk_mc_forward: # mc_forward .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 .quad 0x080B0A0904070605, 0x000302010C0F0E0D .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 .quad 0x000302010C0F0E0D, 0x080B0A0904070605 .Lk_mc_backward:# mc_backward .quad 0x0605040702010003, 0x0E0D0C0F0A09080B .quad 0x020100030E0D0C0F, 0x0A09080B06050407 .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 .quad 0x0A09080B06050407, 0x020100030E0D0C0F .Lk_sr: # sr .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 .quad 0x030E09040F0A0500, 0x0B06010C07020D08 .quad 0x0F060D040B020900, 0x070E050C030A0108 .quad 0x0B0E0104070A0D00, 0x0306090C0F020508 .Lk_rcon: # rcon .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 .Lk_s63: # s63: all equal to 0x63 transformed .quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B .Lk_opt: # output transform .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 
.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 .Lk_deskew: # deskew tables: inverts the sbox's "skew" .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 # .Lrev_ctr is a permutation which byte-swaps the counter portion of the IV. .Lrev_ctr: .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 # .Lctr_add_* may be added to a byte-swapped xmm register to increment the # counter. The register must be byte-swapped again to form the actual input. .Lctr_add_one: .quad 0x0000000000000000, 0x0000000100000000 .Lctr_add_two: .quad 0x0000000000000000, 0x0000000200000000 .asciz "Vector Permutation AES for x86_64/SSSE3, Mike Hamburg (Stanford University)" .align 64 .size _vpaes_consts,.-_vpaes_consts .text ___ if ($win64) { # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type se_handler,\@abi-omnipotent .align 16 se_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # prologue label cmp %r10,%rbx # context->RipRsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lin_prologue lea 16(%rax),%rsi # %xmm save area lea 512($context),%rdi # &context.Xmm6 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) .long 0xa548f3fc # cld; rep movsq lea 0xb8(%rax),%rax # adjust stack pointer .Lin_prologue: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$`1232/8`,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size se_handler,.-se_handler .section .pdata .align 4 .rva .LSEH_begin_${PREFIX}_set_encrypt_key .rva .LSEH_end_${PREFIX}_set_encrypt_key .rva .LSEH_info_${PREFIX}_set_encrypt_key .rva .LSEH_begin_${PREFIX}_ctr32_encrypt_blocks .rva .LSEH_end_${PREFIX}_ctr32_encrypt_blocks .rva .LSEH_info_${PREFIX}_ctr32_encrypt_blocks .section .xdata .align 8 .LSEH_info_${PREFIX}_set_encrypt_key: .byte 9,0,0,0 .rva se_handler .rva .Lenc_key_body,.Lenc_key_epilogue # HandlerData[] .LSEH_info_${PREFIX}_ctr32_encrypt_blocks: .byte 9,0,0,0 .rva se_handler .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[] ___ } $code =~ s/\`([^\`]*)\`/eval($1)/gem; print $code; close STDOUT or die "error closing STDOUT: $!"; ring-0.17.14/crypto/fipsmodule/bn/asm/armv4-mont.pl000064400000000000000000000462321046102023000201530ustar 00000000000000#! /usr/bin/env perl # Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. # ==================================================================== # January 2007. # Montgomery multiplication for ARMv4. # # Performance improvement naturally varies among CPU implementations # and compilers. The code was observed to provide +65-35% improvement # [depending on key length, less for longer keys] on ARM920T, and # +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code # base and compiler generated code with in-lined umull and even umlal # instructions. The latter means that this code didn't really have an # "advantage" of utilizing some "secret" instruction. # # The code is interoperable with Thumb ISA and is rather compact, less # than 1/2KB. Windows CE port would be trivial, as it's exclusively # about decorations, ABI and instruction syntax are identical. # November 2013 # # Add NEON code path, which handles lengths divisible by 8. RSA/DSA # performance improvement on Cortex-A8 is ~45-100% depending on key # length, more for longer keys. On Cortex-A15 the span is ~10-105%. # On Snapdragon S4 improvement was measured to vary from ~70% to # incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is # rather because original integer-only code seems to perform # suboptimally on S4. Situation on Cortex-A9 is unfortunately # different. It's being looked into, but the trouble is that # performance for vectors longer than 256 bits is actually couple # of percent worse than for integer-only code. The code is chosen # for execution on all NEON-capable processors, because gain on # others outweighs the marginal loss on Cortex-A9. # September 2015 # # Align Cortex-A9 performance with November 2013 improvements, i.e. # NEON code is now ~20-105% faster than integer-only one on this # processor. But this optimization further improved performance even # on other processors: NEON code path is ~45-180% faster than original # integer-only on Cortex-A8, ~10-210% on Cortex-A15, ~70-450% on # Snapdragon S4. $flavour = shift; if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } if ($flavour && $flavour ne "void") { $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; } else { open OUT,">$output"; *STDOUT=*OUT; } $num="r0"; # starts as num argument, but holds &tp[num-1] $ap="r1"; $bp="r2"; $bi="r2"; $rp="r2"; $np="r3"; $tp="r4"; $aj="r5"; $nj="r6"; $tj="r7"; $n0="r8"; ########### # r9 is reserved by ELF as platform specific, e.g. 
TLS pointer $alo="r10"; # sl, gcc uses it to keep @GOT $ahi="r11"; # fp $nlo="r12"; # ip ########### # r13 is stack pointer $nhi="r14"; # lr ########### # r15 is program counter #### argument block layout relative to &tp[num-1], a.k.a. $num $_rp="$num,#12*4"; # ap permanently resides in r1 $_bp="$num,#13*4"; # np permanently resides in r3 $_n0="$num,#14*4"; $_num="$num,#15*4"; $_bpend=$_num; $code=<<___; @ Silence ARMv8 deprecated IT instruction warnings. This file is used by both @ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. .arch armv7-a .text #if defined(__thumb2__) .syntax unified .thumb #else .code 32 #endif .global bn_mul_mont_nohw .type bn_mul_mont_nohw,%function .align 5 bn_mul_mont_nohw: ldr ip,[sp,#4] @ load num stmdb sp!,{r0,r2} @ sp points at argument block cmp ip,#2 mov $num,ip @ load num #ifdef __thumb2__ ittt lt #endif movlt r0,#0 addlt sp,sp,#2*4 blt .Labrt stmdb sp!,{r4-r12,lr} @ save 10 registers mov $num,$num,lsl#2 @ rescale $num for byte count sub sp,sp,$num @ alloca(4*num) sub sp,sp,#4 @ +extra dword sub $num,$num,#4 @ "num=num-1" add $tp,$bp,$num @ &bp[num-1] add $num,sp,$num @ $num to point at &tp[num-1] ldr $n0,[$_n0] @ &n0 ldr $bi,[$bp] @ bp[0] ldr $aj,[$ap],#4 @ ap[0],ap++ ldr $nj,[$np],#4 @ np[0],np++ ldr $n0,[$n0] @ *n0 str $tp,[$_bpend] @ save &bp[num] umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0] str $n0,[$_n0] @ save n0 value mul $n0,$alo,$n0 @ "tp[0]"*n0 mov $nlo,#0 umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]" mov $tp,sp .L1st: ldr $aj,[$ap],#4 @ ap[j],ap++ mov $alo,$ahi ldr $nj,[$np],#4 @ np[j],np++ mov $ahi,#0 umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0] mov $nhi,#0 umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 adds $nlo,$nlo,$alo str $nlo,[$tp],#4 @ tp[j-1]=,tp++ adc $nlo,$nhi,#0 cmp $tp,$num bne .L1st adds $nlo,$nlo,$ahi ldr $tp,[$_bp] @ restore bp mov $nhi,#0 ldr $n0,[$_n0] @ restore n0 adc $nhi,$nhi,#0 str $nlo,[$num] @ tp[num-1]= mov $tj,sp str $nhi,[$num,#4] @ tp[num]= .Louter: sub $tj,$num,$tj @ "original" $num-1 value sub $ap,$ap,$tj @ "rewind" ap to &ap[1] ldr $bi,[$tp,#4]! 
@ *(++bp) sub $np,$np,$tj @ "rewind" np to &np[1] ldr $aj,[$ap,#-4] @ ap[0] ldr $alo,[sp] @ tp[0] ldr $nj,[$np,#-4] @ np[0] ldr $tj,[sp,#4] @ tp[1] mov $ahi,#0 umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0] str $tp,[$_bp] @ save bp mul $n0,$alo,$n0 mov $nlo,#0 umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]" mov $tp,sp .Linner: ldr $aj,[$ap],#4 @ ap[j],ap++ adds $alo,$ahi,$tj @ +=tp[j] ldr $nj,[$np],#4 @ np[j],np++ mov $ahi,#0 umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i] mov $nhi,#0 umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 adc $ahi,$ahi,#0 ldr $tj,[$tp,#8] @ tp[j+1] adds $nlo,$nlo,$alo str $nlo,[$tp],#4 @ tp[j-1]=,tp++ adc $nlo,$nhi,#0 cmp $tp,$num bne .Linner adds $nlo,$nlo,$ahi mov $nhi,#0 ldr $tp,[$_bp] @ restore bp adc $nhi,$nhi,#0 ldr $n0,[$_n0] @ restore n0 adds $nlo,$nlo,$tj ldr $tj,[$_bpend] @ restore &bp[num] adc $nhi,$nhi,#0 str $nlo,[$num] @ tp[num-1]= str $nhi,[$num,#4] @ tp[num]= cmp $tp,$tj #ifdef __thumb2__ itt ne #endif movne $tj,sp bne .Louter ldr $rp,[$_rp] @ pull rp mov $aj,sp add $num,$num,#4 @ $num to point at &tp[num] sub $aj,$num,$aj @ "original" num value mov $tp,sp @ "rewind" $tp mov $ap,$tp @ "borrow" $ap sub $np,$np,$aj @ "rewind" $np to &np[0] subs $tj,$tj,$tj @ "clear" carry flag .Lsub: ldr $tj,[$tp],#4 ldr $nj,[$np],#4 sbcs $tj,$tj,$nj @ tp[j]-np[j] str $tj,[$rp],#4 @ rp[j]= teq $tp,$num @ preserve carry bne .Lsub sbcs $nhi,$nhi,#0 @ upmost carry mov $tp,sp @ "rewind" $tp sub $rp,$rp,$aj @ "rewind" $rp .Lcopy: ldr $tj,[$tp] @ conditional copy ldr $aj,[$rp] str sp,[$tp],#4 @ zap tp #ifdef __thumb2__ it cc #endif movcc $aj,$tj str $aj,[$rp],#4 teq $tp,$num @ preserve carry bne .Lcopy mov sp,$num add sp,sp,#4 @ skip over tp[num+1] ldmia sp!,{r4-r12,lr} @ restore registers add sp,sp,#2*4 @ skip over {r0,r2} mov r0,#1 .Labrt: #if __ARM_ARCH>=5 ret @ bx lr #else tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) #endif .size bn_mul_mont_nohw,.-bn_mul_mont_nohw ___ { my ($A0,$A1,$A2,$A3)=map("d$_",(0..3)); my ($N0,$N1,$N2,$N3)=map("d$_",(4..7)); my ($Z,$Temp)=("q4","q5"); my @ACC=map("q$_",(6..13)); my ($Bi,$Ni,$M0)=map("d$_",(28..31)); my $zero="$Z#lo"; my $temp="$Temp#lo"; my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5)); my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("r$_",(6..11)); $code.=<<___; #if __ARM_MAX_ARCH__>=7 .arch armv7-a .fpu neon .global bn_mul8x_mont_neon .type bn_mul8x_mont_neon,%function .align 5 bn_mul8x_mont_neon: mov ip,sp stmdb sp!,{r4-r11} vstmdb sp!,{d8-d15} @ ABI specification says so ldmia ip,{r4-r5} @ load rest of parameter block mov ip,sp cmp $num,#8 bhi .LNEON_8n @ special case for $num==8, everything is in register bank... vld1.32 {${Bi}[0]}, [$bptr,:32]! veor $zero,$zero,$zero sub $toutptr,sp,$num,lsl#4 vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-( and $toutptr,$toutptr,#-64 vld1.32 {${M0}[0]}, [$n0,:32] mov sp,$toutptr @ alloca vzip.16 $Bi,$zero vmull.u32 @ACC[0],$Bi,${A0}[0] vmull.u32 @ACC[1],$Bi,${A0}[1] vmull.u32 @ACC[2],$Bi,${A1}[0] vshl.i64 $Ni,@ACC[0]#hi,#16 vmull.u32 @ACC[3],$Bi,${A1}[1] vadd.u64 $Ni,$Ni,@ACC[0]#lo veor $zero,$zero,$zero vmul.u32 $Ni,$Ni,$M0 vmull.u32 @ACC[4],$Bi,${A2}[0] vld1.32 {$N0-$N3}, [$nptr]! 
vmull.u32 @ACC[5],$Bi,${A2}[1] vmull.u32 @ACC[6],$Bi,${A3}[0] vzip.16 $Ni,$zero vmull.u32 @ACC[7],$Bi,${A3}[1] vmlal.u32 @ACC[0],$Ni,${N0}[0] sub $outer,$num,#1 vmlal.u32 @ACC[1],$Ni,${N0}[1] vmlal.u32 @ACC[2],$Ni,${N1}[0] vmlal.u32 @ACC[3],$Ni,${N1}[1] vmlal.u32 @ACC[4],$Ni,${N2}[0] vmov $Temp,@ACC[0] vmlal.u32 @ACC[5],$Ni,${N2}[1] vmov @ACC[0],@ACC[1] vmlal.u32 @ACC[6],$Ni,${N3}[0] vmov @ACC[1],@ACC[2] vmlal.u32 @ACC[7],$Ni,${N3}[1] vmov @ACC[2],@ACC[3] vmov @ACC[3],@ACC[4] vshr.u64 $temp,$temp,#16 vmov @ACC[4],@ACC[5] vmov @ACC[5],@ACC[6] vadd.u64 $temp,$temp,$Temp#hi vmov @ACC[6],@ACC[7] veor @ACC[7],@ACC[7] vshr.u64 $temp,$temp,#16 b .LNEON_outer8 .align 4 .LNEON_outer8: vld1.32 {${Bi}[0]}, [$bptr,:32]! veor $zero,$zero,$zero vzip.16 $Bi,$zero vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp vmlal.u32 @ACC[0],$Bi,${A0}[0] vmlal.u32 @ACC[1],$Bi,${A0}[1] vmlal.u32 @ACC[2],$Bi,${A1}[0] vshl.i64 $Ni,@ACC[0]#hi,#16 vmlal.u32 @ACC[3],$Bi,${A1}[1] vadd.u64 $Ni,$Ni,@ACC[0]#lo veor $zero,$zero,$zero subs $outer,$outer,#1 vmul.u32 $Ni,$Ni,$M0 vmlal.u32 @ACC[4],$Bi,${A2}[0] vmlal.u32 @ACC[5],$Bi,${A2}[1] vmlal.u32 @ACC[6],$Bi,${A3}[0] vzip.16 $Ni,$zero vmlal.u32 @ACC[7],$Bi,${A3}[1] vmlal.u32 @ACC[0],$Ni,${N0}[0] vmlal.u32 @ACC[1],$Ni,${N0}[1] vmlal.u32 @ACC[2],$Ni,${N1}[0] vmlal.u32 @ACC[3],$Ni,${N1}[1] vmlal.u32 @ACC[4],$Ni,${N2}[0] vmov $Temp,@ACC[0] vmlal.u32 @ACC[5],$Ni,${N2}[1] vmov @ACC[0],@ACC[1] vmlal.u32 @ACC[6],$Ni,${N3}[0] vmov @ACC[1],@ACC[2] vmlal.u32 @ACC[7],$Ni,${N3}[1] vmov @ACC[2],@ACC[3] vmov @ACC[3],@ACC[4] vshr.u64 $temp,$temp,#16 vmov @ACC[4],@ACC[5] vmov @ACC[5],@ACC[6] vadd.u64 $temp,$temp,$Temp#hi vmov @ACC[6],@ACC[7] veor @ACC[7],@ACC[7] vshr.u64 $temp,$temp,#16 bne .LNEON_outer8 vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp mov $toutptr,sp vshr.u64 $temp,@ACC[0]#lo,#16 mov $inner,$num vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp add $tinptr,sp,#96 vshr.u64 $temp,@ACC[0]#hi,#16 vzip.16 @ACC[0]#lo,@ACC[0]#hi b .LNEON_tail_entry .align 4 .LNEON_8n: veor @ACC[0],@ACC[0],@ACC[0] sub $toutptr,sp,#128 veor @ACC[1],@ACC[1],@ACC[1] sub $toutptr,$toutptr,$num,lsl#4 veor @ACC[2],@ACC[2],@ACC[2] and $toutptr,$toutptr,#-64 veor @ACC[3],@ACC[3],@ACC[3] mov sp,$toutptr @ alloca veor @ACC[4],@ACC[4],@ACC[4] add $toutptr,$toutptr,#256 veor @ACC[5],@ACC[5],@ACC[5] sub $inner,$num,#8 veor @ACC[6],@ACC[6],@ACC[6] veor @ACC[7],@ACC[7],@ACC[7] .LNEON_8n_init: vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]! subs $inner,$inner,#8 vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]! vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]! vst1.64 {@ACC[6]-@ACC[7]},[$toutptr,:256]! bne .LNEON_8n_init add $tinptr,sp,#256 vld1.32 {$A0-$A3},[$aptr]! add $bnptr,sp,#8 vld1.32 {${M0}[0]},[$n0,:32] mov $outer,$num b .LNEON_8n_outer .align 4 .LNEON_8n_outer: vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++ veor $zero,$zero,$zero vzip.16 $Bi,$zero add $toutptr,sp,#128 vld1.32 {$N0-$N3},[$nptr]! vmlal.u32 @ACC[0],$Bi,${A0}[0] vmlal.u32 @ACC[1],$Bi,${A0}[1] veor $zero,$zero,$zero vmlal.u32 @ACC[2],$Bi,${A1}[0] vshl.i64 $Ni,@ACC[0]#hi,#16 vmlal.u32 @ACC[3],$Bi,${A1}[1] vadd.u64 $Ni,$Ni,@ACC[0]#lo vmlal.u32 @ACC[4],$Bi,${A2}[0] vmul.u32 $Ni,$Ni,$M0 vmlal.u32 @ACC[5],$Bi,${A2}[1] vst1.32 {$Bi},[sp,:64] @ put aside smashed b[8*i+0] vmlal.u32 @ACC[6],$Bi,${A3}[0] vzip.16 $Ni,$zero vmlal.u32 @ACC[7],$Bi,${A3}[1] ___ for ($i=0; $i<7;) { $code.=<<___; vld1.32 {${Bi}[0]},[$bptr,:32]! 
@ *b++ vmlal.u32 @ACC[0],$Ni,${N0}[0] veor $temp,$temp,$temp vmlal.u32 @ACC[1],$Ni,${N0}[1] vzip.16 $Bi,$temp vmlal.u32 @ACC[2],$Ni,${N1}[0] vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16 vmlal.u32 @ACC[3],$Ni,${N1}[1] vmlal.u32 @ACC[4],$Ni,${N2}[0] vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi vmlal.u32 @ACC[5],$Ni,${N2}[1] vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16 vmlal.u32 @ACC[6],$Ni,${N3}[0] vmlal.u32 @ACC[7],$Ni,${N3}[1] vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo vst1.32 {$Ni},[$bnptr,:64]! @ put aside smashed m[8*i+$i] ___ push(@ACC,shift(@ACC)); $i++; $code.=<<___; vmlal.u32 @ACC[0],$Bi,${A0}[0] vld1.64 {@ACC[7]},[$tinptr,:128]! vmlal.u32 @ACC[1],$Bi,${A0}[1] veor $zero,$zero,$zero vmlal.u32 @ACC[2],$Bi,${A1}[0] vshl.i64 $Ni,@ACC[0]#hi,#16 vmlal.u32 @ACC[3],$Bi,${A1}[1] vadd.u64 $Ni,$Ni,@ACC[0]#lo vmlal.u32 @ACC[4],$Bi,${A2}[0] vmul.u32 $Ni,$Ni,$M0 vmlal.u32 @ACC[5],$Bi,${A2}[1] vst1.32 {$Bi},[$bnptr,:64]! @ put aside smashed b[8*i+$i] vmlal.u32 @ACC[6],$Bi,${A3}[0] vzip.16 $Ni,$zero vmlal.u32 @ACC[7],$Bi,${A3}[1] ___ } $code.=<<___; vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0] vmlal.u32 @ACC[0],$Ni,${N0}[0] vld1.32 {$A0-$A3},[$aptr]! vmlal.u32 @ACC[1],$Ni,${N0}[1] vmlal.u32 @ACC[2],$Ni,${N1}[0] vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16 vmlal.u32 @ACC[3],$Ni,${N1}[1] vmlal.u32 @ACC[4],$Ni,${N2}[0] vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi vmlal.u32 @ACC[5],$Ni,${N2}[1] vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16 vmlal.u32 @ACC[6],$Ni,${N3}[0] vmlal.u32 @ACC[7],$Ni,${N3}[1] vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo vst1.32 {$Ni},[$bnptr,:64] @ put aside smashed m[8*i+$i] add $bnptr,sp,#8 @ rewind ___ push(@ACC,shift(@ACC)); $code.=<<___; sub $inner,$num,#8 b .LNEON_8n_inner .align 4 .LNEON_8n_inner: subs $inner,$inner,#8 vmlal.u32 @ACC[0],$Bi,${A0}[0] vld1.64 {@ACC[7]},[$tinptr,:128] vmlal.u32 @ACC[1],$Bi,${A0}[1] vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+0] vmlal.u32 @ACC[2],$Bi,${A1}[0] vld1.32 {$N0-$N3},[$nptr]! vmlal.u32 @ACC[3],$Bi,${A1}[1] it ne addne $tinptr,$tinptr,#16 @ don't advance in last iteration vmlal.u32 @ACC[4],$Bi,${A2}[0] vmlal.u32 @ACC[5],$Bi,${A2}[1] vmlal.u32 @ACC[6],$Bi,${A3}[0] vmlal.u32 @ACC[7],$Bi,${A3}[1] ___ for ($i=1; $i<8; $i++) { $code.=<<___; vld1.32 {$Bi},[$bnptr,:64]! @ pull smashed b[8*i+$i] vmlal.u32 @ACC[0],$Ni,${N0}[0] vmlal.u32 @ACC[1],$Ni,${N0}[1] vmlal.u32 @ACC[2],$Ni,${N1}[0] vmlal.u32 @ACC[3],$Ni,${N1}[1] vmlal.u32 @ACC[4],$Ni,${N2}[0] vmlal.u32 @ACC[5],$Ni,${N2}[1] vmlal.u32 @ACC[6],$Ni,${N3}[0] vmlal.u32 @ACC[7],$Ni,${N3}[1] vst1.64 {@ACC[0]},[$toutptr,:128]! ___ push(@ACC,shift(@ACC)); $code.=<<___; vmlal.u32 @ACC[0],$Bi,${A0}[0] vld1.64 {@ACC[7]},[$tinptr,:128] vmlal.u32 @ACC[1],$Bi,${A0}[1] vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+$i] vmlal.u32 @ACC[2],$Bi,${A1}[0] it ne addne $tinptr,$tinptr,#16 @ don't advance in last iteration vmlal.u32 @ACC[3],$Bi,${A1}[1] vmlal.u32 @ACC[4],$Bi,${A2}[0] vmlal.u32 @ACC[5],$Bi,${A2}[1] vmlal.u32 @ACC[6],$Bi,${A3}[0] vmlal.u32 @ACC[7],$Bi,${A3}[1] ___ } $code.=<<___; it eq subeq $aptr,$aptr,$num,lsl#2 @ rewind vmlal.u32 @ACC[0],$Ni,${N0}[0] vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0] vmlal.u32 @ACC[1],$Ni,${N0}[1] vld1.32 {$A0-$A3},[$aptr]! vmlal.u32 @ACC[2],$Ni,${N1}[0] add $bnptr,sp,#8 @ rewind vmlal.u32 @ACC[3],$Ni,${N1}[1] vmlal.u32 @ACC[4],$Ni,${N2}[0] vmlal.u32 @ACC[5],$Ni,${N2}[1] vmlal.u32 @ACC[6],$Ni,${N3}[0] vst1.64 {@ACC[0]},[$toutptr,:128]! vmlal.u32 @ACC[7],$Ni,${N3}[1] bne .LNEON_8n_inner ___ push(@ACC,shift(@ACC)); $code.=<<___; add $tinptr,sp,#128 vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]! 
veor q2,q2,q2 @ $N0-$N1 vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]! veor q3,q3,q3 @ $N2-$N3 vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]! vst1.64 {@ACC[6]},[$toutptr,:128] subs $outer,$outer,#8 vld1.64 {@ACC[0]-@ACC[1]},[$tinptr,:256]! vld1.64 {@ACC[2]-@ACC[3]},[$tinptr,:256]! vld1.64 {@ACC[4]-@ACC[5]},[$tinptr,:256]! vld1.64 {@ACC[6]-@ACC[7]},[$tinptr,:256]! itt ne subne $nptr,$nptr,$num,lsl#2 @ rewind bne .LNEON_8n_outer add $toutptr,sp,#128 vst1.64 {q2-q3}, [sp,:256]! @ start wiping stack frame vshr.u64 $temp,@ACC[0]#lo,#16 vst1.64 {q2-q3},[sp,:256]! vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp vst1.64 {q2-q3}, [sp,:256]! vshr.u64 $temp,@ACC[0]#hi,#16 vst1.64 {q2-q3}, [sp,:256]! vzip.16 @ACC[0]#lo,@ACC[0]#hi mov $inner,$num b .LNEON_tail_entry .align 4 .LNEON_tail: vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp vshr.u64 $temp,@ACC[0]#lo,#16 vld1.64 {@ACC[2]-@ACC[3]}, [$tinptr, :256]! vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp vld1.64 {@ACC[4]-@ACC[5]}, [$tinptr, :256]! vshr.u64 $temp,@ACC[0]#hi,#16 vld1.64 {@ACC[6]-@ACC[7]}, [$tinptr, :256]! vzip.16 @ACC[0]#lo,@ACC[0]#hi .LNEON_tail_entry: ___ for ($i=1; $i<8; $i++) { $code.=<<___; vadd.u64 @ACC[1]#lo,@ACC[1]#lo,$temp vst1.32 {@ACC[0]#lo[0]}, [$toutptr, :32]! vshr.u64 $temp,@ACC[1]#lo,#16 vadd.u64 @ACC[1]#hi,@ACC[1]#hi,$temp vshr.u64 $temp,@ACC[1]#hi,#16 vzip.16 @ACC[1]#lo,@ACC[1]#hi ___ push(@ACC,shift(@ACC)); } push(@ACC,shift(@ACC)); $code.=<<___; vld1.64 {@ACC[0]-@ACC[1]}, [$tinptr, :256]! subs $inner,$inner,#8 vst1.32 {@ACC[7]#lo[0]}, [$toutptr, :32]! bne .LNEON_tail vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr subs $aptr,sp,#0 @ clear carry flag add $bptr,sp,$num,lsl#2 .LNEON_sub: ldmia $aptr!, {r4-r7} ldmia $nptr!, {r8-r11} sbcs r8, r4,r8 sbcs r9, r5,r9 sbcs r10,r6,r10 sbcs r11,r7,r11 teq $aptr,$bptr @ preserves carry stmia $rptr!, {r8-r11} bne .LNEON_sub ldr r10, [$aptr] @ load top-most bit mov r11,sp veor q0,q0,q0 sub r11,$bptr,r11 @ this is num*4 veor q1,q1,q1 mov $aptr,sp sub $rptr,$rptr,r11 @ rewind $rptr mov $nptr,$bptr @ second 3/4th of frame sbcs r10,r10,#0 @ result is carry flag .LNEON_copy_n_zap: ldmia $aptr!, {r4-r7} ldmia $rptr, {r8-r11} it cc movcc r8, r4 vst1.64 {q0-q1}, [$nptr,:256]! @ wipe itt cc movcc r9, r5 movcc r10,r6 vst1.64 {q0-q1}, [$nptr,:256]! @ wipe it cc movcc r11,r7 ldmia $aptr, {r4-r7} stmia $rptr!, {r8-r11} sub $aptr,$aptr,#16 ldmia $rptr, {r8-r11} it cc movcc r8, r4 vst1.64 {q0-q1}, [$aptr,:256]! @ wipe itt cc movcc r9, r5 movcc r10,r6 vst1.64 {q0-q1}, [$nptr,:256]! @ wipe it cc movcc r11,r7 teq $aptr,$bptr @ preserves carry stmia $rptr!, {r8-r11} bne .LNEON_copy_n_zap mov sp,ip vldmia sp!,{d8-d15} ldmia sp!,{r4-r11} ret @ bx lr .size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon #endif ___ } $code.=<<___; .asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by " ___ foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/ge; s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge or s/\bret\b/bx lr/g or s/\bbx\s+lr\b/.word\t0xe12fff1e/g; # make it possible to compile with -march=armv4 print $_,"\n"; } close STDOUT or die "error closing STDOUT: $!"; ring-0.17.14/crypto/fipsmodule/bn/asm/armv8-mont.pl000064400000000000000000001103351046102023000201530ustar 00000000000000#! /usr/bin/env perl # Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. # ==================================================================== # March 2015 # # "Teaser" Montgomery multiplication module for ARMv8. Needs more # work. While it does improve RSA sign performance by 20-30% (less for # longer keys) on most processors, for some reason RSA2048 is not # faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication # instruction issue rate is limited on processor in question, meaning # that dedicated squaring procedure is a must. Well, actually all # contemporary AArch64 processors seem to have limited multiplication # issue rate, i.e. they can't issue multiplication every cycle, which # explains moderate improvement coefficients in comparison to # compiler-generated code. Recall that compiler is instructed to use # umulh and therefore uses same amount of multiplication instructions # to do the job. Assembly's edge is to minimize number of "collateral" # instructions and of course instruction scheduling. # # April 2015 # # Squaring procedure that handles lengths divisible by 8 improves # RSA/DSA performance by 25-40-60% depending on processor and key # length. Overall improvement coefficients are always positive in # comparison to compiler-generated code. On Cortex-A57 improvement # is still modest on longest key lengths, while others exhibit e.g. # 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster # on Cortex-A57 and ~60-100% faster on others. $flavour = shift; $output = shift; $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; ($lo0,$hi0,$aj,$m0,$alo,$ahi, $lo1,$hi1,$nj,$m1,$nlo,$nhi, $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24); # void bn_mul_mont_nohw( $rp="x0"; # BN_ULONG *rp, $ap="x1"; # const BN_ULONG *ap, $bp="x2"; # const BN_ULONG *bp, $np="x3"; # const BN_ULONG *np, $n0="x4"; # const BN_ULONG *n0, $num="x5"; # size_t num); $code.=<<___; .text .globl bn_mul_mont_nohw .type bn_mul_mont_nohw,%function .align 5 bn_mul_mont_nohw: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-64]! add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] ldr $m0,[$bp],#8 // bp[0] sub $tp,sp,$num,lsl#3 ldp $hi0,$aj,[$ap],#16 // ap[0..1] lsl $num,$num,#3 ldr $n0,[$n0] // *n0 and $tp,$tp,#-16 // ABI says so ldp $hi1,$nj,[$np],#16 // np[0..1] mul $lo0,$hi0,$m0 // ap[0]*bp[0] sub $j,$num,#16 // j=num-2 umulh $hi0,$hi0,$m0 mul $alo,$aj,$m0 // ap[1]*bp[0] umulh $ahi,$aj,$m0 mul $m1,$lo0,$n0 // "tp[0]"*n0 mov sp,$tp // alloca // (*) mul $lo1,$hi1,$m1 // np[0]*m1 umulh $hi1,$hi1,$m1 mul $nlo,$nj,$m1 // np[1]*m1 // (*) adds $lo1,$lo1,$lo0 // discarded // (*) As for removal of first multiplication and addition // instructions. The outcome of first addition is // guaranteed to be zero, which leaves two computationally // significant outcomes: it either carries or not. Then // question is when does it carry? 
Is there alternative // way to deduce it? If you follow operations, you can // observe that condition for carry is quite simple: // $lo0 being non-zero. So that carry can be calculated // by adding -1 to $lo0. That's what next instruction does. subs xzr,$lo0,#1 // (*) umulh $nhi,$nj,$m1 adc $hi1,$hi1,xzr cbz $j,.L1st_skip .L1st: ldr $aj,[$ap],#8 adds $lo0,$alo,$hi0 sub $j,$j,#8 // j-- adc $hi0,$ahi,xzr ldr $nj,[$np],#8 adds $lo1,$nlo,$hi1 mul $alo,$aj,$m0 // ap[j]*bp[0] adc $hi1,$nhi,xzr umulh $ahi,$aj,$m0 adds $lo1,$lo1,$lo0 mul $nlo,$nj,$m1 // np[j]*m1 adc $hi1,$hi1,xzr umulh $nhi,$nj,$m1 str $lo1,[$tp],#8 // tp[j-1] cbnz $j,.L1st .L1st_skip: adds $lo0,$alo,$hi0 sub $ap,$ap,$num // rewind $ap adc $hi0,$ahi,xzr adds $lo1,$nlo,$hi1 sub $np,$np,$num // rewind $np adc $hi1,$nhi,xzr adds $lo1,$lo1,$lo0 sub $i,$num,#8 // i=num-1 adcs $hi1,$hi1,$hi0 adc $ovf,xzr,xzr // upmost overflow bit stp $lo1,$hi1,[$tp] .Louter: ldr $m0,[$bp],#8 // bp[i] ldp $hi0,$aj,[$ap],#16 ldr $tj,[sp] // tp[0] add $tp,sp,#8 mul $lo0,$hi0,$m0 // ap[0]*bp[i] sub $j,$num,#16 // j=num-2 umulh $hi0,$hi0,$m0 ldp $hi1,$nj,[$np],#16 mul $alo,$aj,$m0 // ap[1]*bp[i] adds $lo0,$lo0,$tj umulh $ahi,$aj,$m0 adc $hi0,$hi0,xzr mul $m1,$lo0,$n0 sub $i,$i,#8 // i-- // (*) mul $lo1,$hi1,$m1 // np[0]*m1 umulh $hi1,$hi1,$m1 mul $nlo,$nj,$m1 // np[1]*m1 // (*) adds $lo1,$lo1,$lo0 subs xzr,$lo0,#1 // (*) umulh $nhi,$nj,$m1 cbz $j,.Linner_skip .Linner: ldr $aj,[$ap],#8 adc $hi1,$hi1,xzr ldr $tj,[$tp],#8 // tp[j] adds $lo0,$alo,$hi0 sub $j,$j,#8 // j-- adc $hi0,$ahi,xzr adds $lo1,$nlo,$hi1 ldr $nj,[$np],#8 adc $hi1,$nhi,xzr mul $alo,$aj,$m0 // ap[j]*bp[i] adds $lo0,$lo0,$tj umulh $ahi,$aj,$m0 adc $hi0,$hi0,xzr mul $nlo,$nj,$m1 // np[j]*m1 adds $lo1,$lo1,$lo0 umulh $nhi,$nj,$m1 str $lo1,[$tp,#-16] // tp[j-1] cbnz $j,.Linner .Linner_skip: ldr $tj,[$tp],#8 // tp[j] adc $hi1,$hi1,xzr adds $lo0,$alo,$hi0 sub $ap,$ap,$num // rewind $ap adc $hi0,$ahi,xzr adds $lo1,$nlo,$hi1 sub $np,$np,$num // rewind $np adcs $hi1,$nhi,$ovf adc $ovf,xzr,xzr adds $lo0,$lo0,$tj adc $hi0,$hi0,xzr adds $lo1,$lo1,$lo0 adcs $hi1,$hi1,$hi0 adc $ovf,$ovf,xzr // upmost overflow bit stp $lo1,$hi1,[$tp,#-16] cbnz $i,.Louter // Final step. We see if result is larger than modulus, and // if it is, subtract the modulus. But comparison implies // subtraction. So we subtract modulus, see if it borrowed, // and conditionally copy original value. ldr $tj,[sp] // tp[0] add $tp,sp,#8 ldr $nj,[$np],#8 // np[0] subs $j,$num,#8 // j=num-1 and clear borrow mov $ap,$rp .Lsub: sbcs $aj,$tj,$nj // tp[j]-np[j] ldr $tj,[$tp],#8 sub $j,$j,#8 // j-- ldr $nj,[$np],#8 str $aj,[$ap],#8 // rp[j]=tp[j]-np[j] cbnz $j,.Lsub sbcs $aj,$tj,$nj sbcs $ovf,$ovf,xzr // did it borrow? str $aj,[$ap],#8 // rp[num-1] ldr $tj,[sp] // tp[0] add $tp,sp,#8 ldr $aj,[$rp],#8 // rp[0] sub $num,$num,#8 // num-- nop .Lcond_copy: sub $num,$num,#8 // num-- csel $nj,$tj,$aj,lo // did it borrow? ldr $tj,[$tp],#8 ldr $aj,[$rp],#8 str xzr,[$tp,#-16] // wipe tp str $nj,[$rp,#-16] cbnz $num,.Lcond_copy csel $nj,$tj,$aj,lo str xzr,[$tp,#-8] // wipe tp str $nj,[$rp,#-8] ldp x19,x20,[x29,#16] mov sp,x29 ldp x21,x22,[x29,#32] mov x0,#1 ldp x23,x24,[x29,#48] ldr x29,[sp],#64 AARCH64_VALIDATE_LINK_REGISTER ret .size bn_mul_mont_nohw,.-bn_mul_mont_nohw ___ { ######################################################################## # Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module. 
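# An illustrative note on the "(*)" trick used in bn_mul_mont_nohw above and
# reused in the reduction loops below (a sketch of the reasoning, not code):
# with n0 == -n[0]^-1 mod 2^64, the usual Montgomery constant, and
# m1 == lo0*n0 mod 2^64, the dropped multiply-and-add satisfies
#
#     lo0 + lo(n[0]*m1) == lo0 - lo0 == 0   (mod 2^64)
#
# so its only observable effect is the carry out, and it carries exactly
# when lo0 != 0.  "subs xzr,lo0,#1" sets the carry flag under the same
# condition, which is why that single instruction can stand in for the
# first multiplication and addition.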
my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13)); my ($t0,$t1,$t2,$t3)=map("x$_",(14..17)); my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26)); my ($cnt,$carry,$topmost)=("x27","x28","x30"); my ($tp,$ap_end,$na0)=($bp,$np,$carry); $code.=<<___; .globl bn_sqr8x_mont .type bn_sqr8x_mont,%function .align 5 bn_sqr8x_mont: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-128]! add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] stp $rp,$np,[sp,#96] // offload rp and np ldp $a0,$a1,[$ap,#8*0] ldp $a2,$a3,[$ap,#8*2] ldp $a4,$a5,[$ap,#8*4] ldp $a6,$a7,[$ap,#8*6] sub $tp,sp,$num,lsl#4 lsl $num,$num,#3 ldr $n0,[$n0] // *n0 mov sp,$tp // alloca sub $cnt,$num,#8*8 b .Lsqr8x_zero_start .Lsqr8x_zero: sub $cnt,$cnt,#8*8 stp xzr,xzr,[$tp,#8*0] stp xzr,xzr,[$tp,#8*2] stp xzr,xzr,[$tp,#8*4] stp xzr,xzr,[$tp,#8*6] .Lsqr8x_zero_start: stp xzr,xzr,[$tp,#8*8] stp xzr,xzr,[$tp,#8*10] stp xzr,xzr,[$tp,#8*12] stp xzr,xzr,[$tp,#8*14] add $tp,$tp,#8*16 cbnz $cnt,.Lsqr8x_zero add $ap_end,$ap,$num add $ap,$ap,#8*8 mov $acc0,xzr mov $acc1,xzr mov $acc2,xzr mov $acc3,xzr mov $acc4,xzr mov $acc5,xzr mov $acc6,xzr mov $acc7,xzr mov $tp,sp str $n0,[x29,#112] // offload n0 // Multiply everything but a[i]*a[i] .align 4 .Lsqr8x_outer_loop: // a[1]a[0] (i) // a[2]a[0] // a[3]a[0] // a[4]a[0] // a[5]a[0] // a[6]a[0] // a[7]a[0] // a[2]a[1] (ii) // a[3]a[1] // a[4]a[1] // a[5]a[1] // a[6]a[1] // a[7]a[1] // a[3]a[2] (iii) // a[4]a[2] // a[5]a[2] // a[6]a[2] // a[7]a[2] // a[4]a[3] (iv) // a[5]a[3] // a[6]a[3] // a[7]a[3] // a[5]a[4] (v) // a[6]a[4] // a[7]a[4] // a[6]a[5] (vi) // a[7]a[5] // a[7]a[6] (vii) mul $t0,$a1,$a0 // lo(a[1..7]*a[0]) (i) mul $t1,$a2,$a0 mul $t2,$a3,$a0 mul $t3,$a4,$a0 adds $acc1,$acc1,$t0 // t[1]+lo(a[1]*a[0]) mul $t0,$a5,$a0 adcs $acc2,$acc2,$t1 mul $t1,$a6,$a0 adcs $acc3,$acc3,$t2 mul $t2,$a7,$a0 adcs $acc4,$acc4,$t3 umulh $t3,$a1,$a0 // hi(a[1..7]*a[0]) adcs $acc5,$acc5,$t0 umulh $t0,$a2,$a0 adcs $acc6,$acc6,$t1 umulh $t1,$a3,$a0 adcs $acc7,$acc7,$t2 umulh $t2,$a4,$a0 stp $acc0,$acc1,[$tp],#8*2 // t[0..1] adc $acc0,xzr,xzr // t[8] adds $acc2,$acc2,$t3 // t[2]+lo(a[1]*a[0]) umulh $t3,$a5,$a0 adcs $acc3,$acc3,$t0 umulh $t0,$a6,$a0 adcs $acc4,$acc4,$t1 umulh $t1,$a7,$a0 adcs $acc5,$acc5,$t2 mul $t2,$a2,$a1 // lo(a[2..7]*a[1]) (ii) adcs $acc6,$acc6,$t3 mul $t3,$a3,$a1 adcs $acc7,$acc7,$t0 mul $t0,$a4,$a1 adc $acc0,$acc0,$t1 mul $t1,$a5,$a1 adds $acc3,$acc3,$t2 mul $t2,$a6,$a1 adcs $acc4,$acc4,$t3 mul $t3,$a7,$a1 adcs $acc5,$acc5,$t0 umulh $t0,$a2,$a1 // hi(a[2..7]*a[1]) adcs $acc6,$acc6,$t1 umulh $t1,$a3,$a1 adcs $acc7,$acc7,$t2 umulh $t2,$a4,$a1 adcs $acc0,$acc0,$t3 umulh $t3,$a5,$a1 stp $acc2,$acc3,[$tp],#8*2 // t[2..3] adc $acc1,xzr,xzr // t[9] adds $acc4,$acc4,$t0 umulh $t0,$a6,$a1 adcs $acc5,$acc5,$t1 umulh $t1,$a7,$a1 adcs $acc6,$acc6,$t2 mul $t2,$a3,$a2 // lo(a[3..7]*a[2]) (iii) adcs $acc7,$acc7,$t3 mul $t3,$a4,$a2 adcs $acc0,$acc0,$t0 mul $t0,$a5,$a2 adc $acc1,$acc1,$t1 mul $t1,$a6,$a2 adds $acc5,$acc5,$t2 mul $t2,$a7,$a2 adcs $acc6,$acc6,$t3 umulh $t3,$a3,$a2 // hi(a[3..7]*a[2]) adcs $acc7,$acc7,$t0 umulh $t0,$a4,$a2 adcs $acc0,$acc0,$t1 umulh $t1,$a5,$a2 adcs $acc1,$acc1,$t2 umulh $t2,$a6,$a2 stp $acc4,$acc5,[$tp],#8*2 // t[4..5] adc $acc2,xzr,xzr // t[10] adds $acc6,$acc6,$t3 umulh $t3,$a7,$a2 adcs $acc7,$acc7,$t0 mul $t0,$a4,$a3 // lo(a[4..7]*a[3]) (iv) adcs $acc0,$acc0,$t1 mul $t1,$a5,$a3 adcs $acc1,$acc1,$t2 mul $t2,$a6,$a3 adc $acc2,$acc2,$t3 mul $t3,$a7,$a3 adds $acc7,$acc7,$t0 umulh $t0,$a4,$a3 // 
hi(a[4..7]*a[3]) adcs $acc0,$acc0,$t1 umulh $t1,$a5,$a3 adcs $acc1,$acc1,$t2 umulh $t2,$a6,$a3 adcs $acc2,$acc2,$t3 umulh $t3,$a7,$a3 stp $acc6,$acc7,[$tp],#8*2 // t[6..7] adc $acc3,xzr,xzr // t[11] adds $acc0,$acc0,$t0 mul $t0,$a5,$a4 // lo(a[5..7]*a[4]) (v) adcs $acc1,$acc1,$t1 mul $t1,$a6,$a4 adcs $acc2,$acc2,$t2 mul $t2,$a7,$a4 adc $acc3,$acc3,$t3 umulh $t3,$a5,$a4 // hi(a[5..7]*a[4]) adds $acc1,$acc1,$t0 umulh $t0,$a6,$a4 adcs $acc2,$acc2,$t1 umulh $t1,$a7,$a4 adcs $acc3,$acc3,$t2 mul $t2,$a6,$a5 // lo(a[6..7]*a[5]) (vi) adc $acc4,xzr,xzr // t[12] adds $acc2,$acc2,$t3 mul $t3,$a7,$a5 adcs $acc3,$acc3,$t0 umulh $t0,$a6,$a5 // hi(a[6..7]*a[5]) adc $acc4,$acc4,$t1 umulh $t1,$a7,$a5 adds $acc3,$acc3,$t2 mul $t2,$a7,$a6 // lo(a[7]*a[6]) (vii) adcs $acc4,$acc4,$t3 umulh $t3,$a7,$a6 // hi(a[7]*a[6]) adc $acc5,xzr,xzr // t[13] adds $acc4,$acc4,$t0 sub $cnt,$ap_end,$ap // done yet? adc $acc5,$acc5,$t1 adds $acc5,$acc5,$t2 sub $t0,$ap_end,$num // rewinded ap adc $acc6,xzr,xzr // t[14] add $acc6,$acc6,$t3 cbz $cnt,.Lsqr8x_outer_break mov $n0,$a0 ldp $a0,$a1,[$tp,#8*0] ldp $a2,$a3,[$tp,#8*2] ldp $a4,$a5,[$tp,#8*4] ldp $a6,$a7,[$tp,#8*6] adds $acc0,$acc0,$a0 adcs $acc1,$acc1,$a1 ldp $a0,$a1,[$ap,#8*0] adcs $acc2,$acc2,$a2 adcs $acc3,$acc3,$a3 ldp $a2,$a3,[$ap,#8*2] adcs $acc4,$acc4,$a4 adcs $acc5,$acc5,$a5 ldp $a4,$a5,[$ap,#8*4] adcs $acc6,$acc6,$a6 mov $rp,$ap adcs $acc7,xzr,$a7 ldp $a6,$a7,[$ap,#8*6] add $ap,$ap,#8*8 //adc $carry,xzr,xzr // moved below mov $cnt,#-8*8 // a[8]a[0] // a[9]a[0] // a[a]a[0] // a[b]a[0] // a[c]a[0] // a[d]a[0] // a[e]a[0] // a[f]a[0] // a[8]a[1] // a[f]a[1]........................ // a[8]a[2] // a[f]a[2]........................ // a[8]a[3] // a[f]a[3]........................ // a[8]a[4] // a[f]a[4]........................ // a[8]a[5] // a[f]a[5]........................ // a[8]a[6] // a[f]a[6]........................ // a[8]a[7] // a[f]a[7]........................ .Lsqr8x_mul: mul $t0,$a0,$n0 adc $carry,xzr,xzr // carry bit, modulo-scheduled mul $t1,$a1,$n0 add $cnt,$cnt,#8 mul $t2,$a2,$n0 mul $t3,$a3,$n0 adds $acc0,$acc0,$t0 mul $t0,$a4,$n0 adcs $acc1,$acc1,$t1 mul $t1,$a5,$n0 adcs $acc2,$acc2,$t2 mul $t2,$a6,$n0 adcs $acc3,$acc3,$t3 mul $t3,$a7,$n0 adcs $acc4,$acc4,$t0 umulh $t0,$a0,$n0 adcs $acc5,$acc5,$t1 umulh $t1,$a1,$n0 adcs $acc6,$acc6,$t2 umulh $t2,$a2,$n0 adcs $acc7,$acc7,$t3 umulh $t3,$a3,$n0 adc $carry,$carry,xzr str $acc0,[$tp],#8 adds $acc0,$acc1,$t0 umulh $t0,$a4,$n0 adcs $acc1,$acc2,$t1 umulh $t1,$a5,$n0 adcs $acc2,$acc3,$t2 umulh $t2,$a6,$n0 adcs $acc3,$acc4,$t3 umulh $t3,$a7,$n0 ldr $n0,[$rp,$cnt] adcs $acc4,$acc5,$t0 adcs $acc5,$acc6,$t1 adcs $acc6,$acc7,$t2 adcs $acc7,$carry,$t3 //adc $carry,xzr,xzr // moved above cbnz $cnt,.Lsqr8x_mul // note that carry flag is guaranteed // to be zero at this point cmp $ap,$ap_end // done yet? b.eq .Lsqr8x_break ldp $a0,$a1,[$tp,#8*0] ldp $a2,$a3,[$tp,#8*2] ldp $a4,$a5,[$tp,#8*4] ldp $a6,$a7,[$tp,#8*6] adds $acc0,$acc0,$a0 ldr $n0,[$rp,#-8*8] adcs $acc1,$acc1,$a1 ldp $a0,$a1,[$ap,#8*0] adcs $acc2,$acc2,$a2 adcs $acc3,$acc3,$a3 ldp $a2,$a3,[$ap,#8*2] adcs $acc4,$acc4,$a4 adcs $acc5,$acc5,$a5 ldp $a4,$a5,[$ap,#8*4] adcs $acc6,$acc6,$a6 mov $cnt,#-8*8 adcs $acc7,$acc7,$a7 ldp $a6,$a7,[$ap,#8*6] add $ap,$ap,#8*8 //adc $carry,xzr,xzr // moved above b .Lsqr8x_mul .align 4 .Lsqr8x_break: ldp $a0,$a1,[$rp,#8*0] add $ap,$rp,#8*8 ldp $a2,$a3,[$rp,#8*2] sub $t0,$ap_end,$ap // is it last iteration? 
ldp $a4,$a5,[$rp,#8*4] sub $t1,$tp,$t0 ldp $a6,$a7,[$rp,#8*6] cbz $t0,.Lsqr8x_outer_loop stp $acc0,$acc1,[$tp,#8*0] ldp $acc0,$acc1,[$t1,#8*0] stp $acc2,$acc3,[$tp,#8*2] ldp $acc2,$acc3,[$t1,#8*2] stp $acc4,$acc5,[$tp,#8*4] ldp $acc4,$acc5,[$t1,#8*4] stp $acc6,$acc7,[$tp,#8*6] mov $tp,$t1 ldp $acc6,$acc7,[$t1,#8*6] b .Lsqr8x_outer_loop .align 4 .Lsqr8x_outer_break: // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] ldp $a1,$a3,[$t0,#8*0] // recall that $t0 is &a[0] ldp $t1,$t2,[sp,#8*1] ldp $a5,$a7,[$t0,#8*2] add $ap,$t0,#8*4 ldp $t3,$t0,[sp,#8*3] stp $acc0,$acc1,[$tp,#8*0] mul $acc0,$a1,$a1 stp $acc2,$acc3,[$tp,#8*2] umulh $a1,$a1,$a1 stp $acc4,$acc5,[$tp,#8*4] mul $a2,$a3,$a3 stp $acc6,$acc7,[$tp,#8*6] mov $tp,sp umulh $a3,$a3,$a3 adds $acc1,$a1,$t1,lsl#1 extr $t1,$t2,$t1,#63 sub $cnt,$num,#8*4 .Lsqr4x_shift_n_add: adcs $acc2,$a2,$t1 extr $t2,$t3,$t2,#63 sub $cnt,$cnt,#8*4 adcs $acc3,$a3,$t2 ldp $t1,$t2,[$tp,#8*5] mul $a4,$a5,$a5 ldp $a1,$a3,[$ap],#8*2 umulh $a5,$a5,$a5 mul $a6,$a7,$a7 umulh $a7,$a7,$a7 extr $t3,$t0,$t3,#63 stp $acc0,$acc1,[$tp,#8*0] adcs $acc4,$a4,$t3 extr $t0,$t1,$t0,#63 stp $acc2,$acc3,[$tp,#8*2] adcs $acc5,$a5,$t0 ldp $t3,$t0,[$tp,#8*7] extr $t1,$t2,$t1,#63 adcs $acc6,$a6,$t1 extr $t2,$t3,$t2,#63 adcs $acc7,$a7,$t2 ldp $t1,$t2,[$tp,#8*9] mul $a0,$a1,$a1 ldp $a5,$a7,[$ap],#8*2 umulh $a1,$a1,$a1 mul $a2,$a3,$a3 umulh $a3,$a3,$a3 stp $acc4,$acc5,[$tp,#8*4] extr $t3,$t0,$t3,#63 stp $acc6,$acc7,[$tp,#8*6] add $tp,$tp,#8*8 adcs $acc0,$a0,$t3 extr $t0,$t1,$t0,#63 adcs $acc1,$a1,$t0 ldp $t3,$t0,[$tp,#8*3] extr $t1,$t2,$t1,#63 cbnz $cnt,.Lsqr4x_shift_n_add ___ my ($np,$np_end)=($ap,$ap_end); $code.=<<___; ldp $np,$n0,[x29,#104] // pull np and n0 adcs $acc2,$a2,$t1 extr $t2,$t3,$t2,#63 adcs $acc3,$a3,$t2 ldp $t1,$t2,[$tp,#8*5] mul $a4,$a5,$a5 umulh $a5,$a5,$a5 stp $acc0,$acc1,[$tp,#8*0] mul $a6,$a7,$a7 umulh $a7,$a7,$a7 stp $acc2,$acc3,[$tp,#8*2] extr $t3,$t0,$t3,#63 adcs $acc4,$a4,$t3 extr $t0,$t1,$t0,#63 ldp $acc0,$acc1,[sp,#8*0] adcs $acc5,$a5,$t0 extr $t1,$t2,$t1,#63 ldp $a0,$a1,[$np,#8*0] adcs $acc6,$a6,$t1 extr $t2,xzr,$t2,#63 ldp $a2,$a3,[$np,#8*2] adc $acc7,$a7,$t2 ldp $a4,$a5,[$np,#8*4] // Reduce by 512 bits per iteration mul $na0,$n0,$acc0 // t[0]*n0 ldp $a6,$a7,[$np,#8*6] add $np_end,$np,$num ldp $acc2,$acc3,[sp,#8*2] stp $acc4,$acc5,[$tp,#8*4] ldp $acc4,$acc5,[sp,#8*4] stp $acc6,$acc7,[$tp,#8*6] ldp $acc6,$acc7,[sp,#8*6] add $np,$np,#8*8 mov $topmost,xzr // initial top-most carry mov $tp,sp mov $cnt,#8 .Lsqr8x_reduction: // (*) mul $t0,$a0,$na0 // lo(n[0-7])*lo(t[0]*n0) mul $t1,$a1,$na0 sub $cnt,$cnt,#1 mul $t2,$a2,$na0 str $na0,[$tp],#8 // put aside t[0]*n0 for tail processing mul $t3,$a3,$na0 // (*) adds xzr,$acc0,$t0 subs xzr,$acc0,#1 // (*) mul $t0,$a4,$na0 adcs $acc0,$acc1,$t1 mul $t1,$a5,$na0 adcs $acc1,$acc2,$t2 mul $t2,$a6,$na0 adcs $acc2,$acc3,$t3 mul $t3,$a7,$na0 adcs $acc3,$acc4,$t0 umulh $t0,$a0,$na0 // hi(n[0-7])*lo(t[0]*n0) adcs $acc4,$acc5,$t1 umulh $t1,$a1,$na0 adcs $acc5,$acc6,$t2 umulh $t2,$a2,$na0 adcs $acc6,$acc7,$t3 umulh $t3,$a3,$na0 adc $acc7,xzr,xzr adds $acc0,$acc0,$t0 umulh $t0,$a4,$na0 adcs $acc1,$acc1,$t1 umulh $t1,$a5,$na0 adcs $acc2,$acc2,$t2 umulh $t2,$a6,$na0 adcs $acc3,$acc3,$t3 umulh $t3,$a7,$na0 mul $na0,$n0,$acc0 // next t[0]*n0 adcs $acc4,$acc4,$t0 adcs $acc5,$acc5,$t1 adcs $acc6,$acc6,$t2 adc $acc7,$acc7,$t3 cbnz $cnt,.Lsqr8x_reduction ldp $t0,$t1,[$tp,#8*0] ldp $t2,$t3,[$tp,#8*2] mov $rp,$tp sub $cnt,$np_end,$np // done yet? 
adds $acc0,$acc0,$t0 adcs $acc1,$acc1,$t1 ldp $t0,$t1,[$tp,#8*4] adcs $acc2,$acc2,$t2 adcs $acc3,$acc3,$t3 ldp $t2,$t3,[$tp,#8*6] adcs $acc4,$acc4,$t0 adcs $acc5,$acc5,$t1 adcs $acc6,$acc6,$t2 adcs $acc7,$acc7,$t3 //adc $carry,xzr,xzr // moved below cbz $cnt,.Lsqr8x8_post_condition ldr $n0,[$tp,#-8*8] ldp $a0,$a1,[$np,#8*0] ldp $a2,$a3,[$np,#8*2] ldp $a4,$a5,[$np,#8*4] mov $cnt,#-8*8 ldp $a6,$a7,[$np,#8*6] add $np,$np,#8*8 .Lsqr8x_tail: mul $t0,$a0,$n0 adc $carry,xzr,xzr // carry bit, modulo-scheduled mul $t1,$a1,$n0 add $cnt,$cnt,#8 mul $t2,$a2,$n0 mul $t3,$a3,$n0 adds $acc0,$acc0,$t0 mul $t0,$a4,$n0 adcs $acc1,$acc1,$t1 mul $t1,$a5,$n0 adcs $acc2,$acc2,$t2 mul $t2,$a6,$n0 adcs $acc3,$acc3,$t3 mul $t3,$a7,$n0 adcs $acc4,$acc4,$t0 umulh $t0,$a0,$n0 adcs $acc5,$acc5,$t1 umulh $t1,$a1,$n0 adcs $acc6,$acc6,$t2 umulh $t2,$a2,$n0 adcs $acc7,$acc7,$t3 umulh $t3,$a3,$n0 adc $carry,$carry,xzr str $acc0,[$tp],#8 adds $acc0,$acc1,$t0 umulh $t0,$a4,$n0 adcs $acc1,$acc2,$t1 umulh $t1,$a5,$n0 adcs $acc2,$acc3,$t2 umulh $t2,$a6,$n0 adcs $acc3,$acc4,$t3 umulh $t3,$a7,$n0 ldr $n0,[$rp,$cnt] adcs $acc4,$acc5,$t0 adcs $acc5,$acc6,$t1 adcs $acc6,$acc7,$t2 adcs $acc7,$carry,$t3 //adc $carry,xzr,xzr // moved above cbnz $cnt,.Lsqr8x_tail // note that carry flag is guaranteed // to be zero at this point ldp $a0,$a1,[$tp,#8*0] sub $cnt,$np_end,$np // done yet? sub $t2,$np_end,$num // rewinded np ldp $a2,$a3,[$tp,#8*2] ldp $a4,$a5,[$tp,#8*4] ldp $a6,$a7,[$tp,#8*6] cbz $cnt,.Lsqr8x_tail_break ldr $n0,[$rp,#-8*8] adds $acc0,$acc0,$a0 adcs $acc1,$acc1,$a1 ldp $a0,$a1,[$np,#8*0] adcs $acc2,$acc2,$a2 adcs $acc3,$acc3,$a3 ldp $a2,$a3,[$np,#8*2] adcs $acc4,$acc4,$a4 adcs $acc5,$acc5,$a5 ldp $a4,$a5,[$np,#8*4] adcs $acc6,$acc6,$a6 mov $cnt,#-8*8 adcs $acc7,$acc7,$a7 ldp $a6,$a7,[$np,#8*6] add $np,$np,#8*8 //adc $carry,xzr,xzr // moved above b .Lsqr8x_tail .align 4 .Lsqr8x_tail_break: ldr $n0,[x29,#112] // pull n0 add $cnt,$tp,#8*8 // end of current t[num] window subs xzr,$topmost,#1 // "move" top-most carry to carry bit adcs $t0,$acc0,$a0 adcs $t1,$acc1,$a1 ldp $acc0,$acc1,[$rp,#8*0] adcs $acc2,$acc2,$a2 ldp $a0,$a1,[$t2,#8*0] // recall that $t2 is &n[0] adcs $acc3,$acc3,$a3 ldp $a2,$a3,[$t2,#8*2] adcs $acc4,$acc4,$a4 adcs $acc5,$acc5,$a5 ldp $a4,$a5,[$t2,#8*4] adcs $acc6,$acc6,$a6 adcs $acc7,$acc7,$a7 ldp $a6,$a7,[$t2,#8*6] add $np,$t2,#8*8 adc $topmost,xzr,xzr // top-most carry mul $na0,$n0,$acc0 stp $t0,$t1,[$tp,#8*0] stp $acc2,$acc3,[$tp,#8*2] ldp $acc2,$acc3,[$rp,#8*2] stp $acc4,$acc5,[$tp,#8*4] ldp $acc4,$acc5,[$rp,#8*4] cmp $cnt,x29 // did we hit the bottom? stp $acc6,$acc7,[$tp,#8*6] mov $tp,$rp // slide the window ldp $acc6,$acc7,[$rp,#8*6] mov $cnt,#8 b.ne .Lsqr8x_reduction // Final step. We see if result is larger than modulus, and // if it is, subtract the modulus. But comparison implies // subtraction. So we subtract modulus, see if it borrowed, // and conditionally copy original value. 
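	// In pseudocode, the tail below does roughly (an illustrative sketch,
	// not part of the generated code):
	//
	//     diff   = t - n                       // .Lsqr8x_sub, word by word
	//     borrow = (t < n)                     // final "sbcs" of the top carry
	//     for each word i:
	//         rp[i] = borrow ? t[i] : diff[i]  // csel ..,..,..,lo
	//         tp[i] = 0                        // wipe the scratch area
	//
	// The subtraction always runs and the unreduced value is selected only
	// if it borrowed, so no secret-dependent branch is taken.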
ldr $rp,[x29,#96] // pull rp add $tp,$tp,#8*8 subs $t0,$acc0,$a0 sbcs $t1,$acc1,$a1 sub $cnt,$num,#8*8 mov $ap_end,$rp // $rp copy .Lsqr8x_sub: sbcs $t2,$acc2,$a2 ldp $a0,$a1,[$np,#8*0] sbcs $t3,$acc3,$a3 stp $t0,$t1,[$rp,#8*0] sbcs $t0,$acc4,$a4 ldp $a2,$a3,[$np,#8*2] sbcs $t1,$acc5,$a5 stp $t2,$t3,[$rp,#8*2] sbcs $t2,$acc6,$a6 ldp $a4,$a5,[$np,#8*4] sbcs $t3,$acc7,$a7 ldp $a6,$a7,[$np,#8*6] add $np,$np,#8*8 ldp $acc0,$acc1,[$tp,#8*0] sub $cnt,$cnt,#8*8 ldp $acc2,$acc3,[$tp,#8*2] ldp $acc4,$acc5,[$tp,#8*4] ldp $acc6,$acc7,[$tp,#8*6] add $tp,$tp,#8*8 stp $t0,$t1,[$rp,#8*4] sbcs $t0,$acc0,$a0 stp $t2,$t3,[$rp,#8*6] add $rp,$rp,#8*8 sbcs $t1,$acc1,$a1 cbnz $cnt,.Lsqr8x_sub sbcs $t2,$acc2,$a2 mov $tp,sp add $ap,sp,$num ldp $a0,$a1,[$ap_end,#8*0] sbcs $t3,$acc3,$a3 stp $t0,$t1,[$rp,#8*0] sbcs $t0,$acc4,$a4 ldp $a2,$a3,[$ap_end,#8*2] sbcs $t1,$acc5,$a5 stp $t2,$t3,[$rp,#8*2] sbcs $t2,$acc6,$a6 ldp $acc0,$acc1,[$ap,#8*0] sbcs $t3,$acc7,$a7 ldp $acc2,$acc3,[$ap,#8*2] sbcs xzr,$topmost,xzr // did it borrow? ldr x30,[x29,#8] // pull return address stp $t0,$t1,[$rp,#8*4] stp $t2,$t3,[$rp,#8*6] sub $cnt,$num,#8*4 .Lsqr4x_cond_copy: sub $cnt,$cnt,#8*4 csel $t0,$acc0,$a0,lo stp xzr,xzr,[$tp,#8*0] csel $t1,$acc1,$a1,lo ldp $a0,$a1,[$ap_end,#8*4] ldp $acc0,$acc1,[$ap,#8*4] csel $t2,$acc2,$a2,lo stp xzr,xzr,[$tp,#8*2] add $tp,$tp,#8*4 csel $t3,$acc3,$a3,lo ldp $a2,$a3,[$ap_end,#8*6] ldp $acc2,$acc3,[$ap,#8*6] add $ap,$ap,#8*4 stp $t0,$t1,[$ap_end,#8*0] stp $t2,$t3,[$ap_end,#8*2] add $ap_end,$ap_end,#8*4 stp xzr,xzr,[$ap,#8*0] stp xzr,xzr,[$ap,#8*2] cbnz $cnt,.Lsqr4x_cond_copy csel $t0,$acc0,$a0,lo stp xzr,xzr,[$tp,#8*0] csel $t1,$acc1,$a1,lo stp xzr,xzr,[$tp,#8*2] csel $t2,$acc2,$a2,lo csel $t3,$acc3,$a3,lo stp $t0,$t1,[$ap_end,#8*0] stp $t2,$t3,[$ap_end,#8*2] b .Lsqr8x_done .align 4 .Lsqr8x8_post_condition: adc $carry,xzr,xzr ldr x30,[x29,#8] // pull return address // $acc0-7,$carry hold result, $a0-7 hold modulus subs $a0,$acc0,$a0 ldr $ap,[x29,#96] // pull rp sbcs $a1,$acc1,$a1 stp xzr,xzr,[sp,#8*0] sbcs $a2,$acc2,$a2 stp xzr,xzr,[sp,#8*2] sbcs $a3,$acc3,$a3 stp xzr,xzr,[sp,#8*4] sbcs $a4,$acc4,$a4 stp xzr,xzr,[sp,#8*6] sbcs $a5,$acc5,$a5 stp xzr,xzr,[sp,#8*8] sbcs $a6,$acc6,$a6 stp xzr,xzr,[sp,#8*10] sbcs $a7,$acc7,$a7 stp xzr,xzr,[sp,#8*12] sbcs $carry,$carry,xzr // did it borrow? stp xzr,xzr,[sp,#8*14] // $a0-7 hold result-modulus csel $a0,$acc0,$a0,lo csel $a1,$acc1,$a1,lo csel $a2,$acc2,$a2,lo csel $a3,$acc3,$a3,lo stp $a0,$a1,[$ap,#8*0] csel $a4,$acc4,$a4,lo csel $a5,$acc5,$a5,lo stp $a2,$a3,[$ap,#8*2] csel $a6,$acc6,$a6,lo csel $a7,$acc7,$a7,lo stp $a4,$a5,[$ap,#8*4] stp $a6,$a7,[$ap,#8*6] .Lsqr8x_done: ldp x19,x20,[x29,#16] mov sp,x29 ldp x21,x22,[x29,#32] mov x0,#1 ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldr x29,[sp],#128 // x30 is popped earlier AARCH64_VALIDATE_LINK_REGISTER ret .size bn_sqr8x_mont,.-bn_sqr8x_mont ___ } { ######################################################################## # Even though this might look as ARMv8 adaptation of mulx4x_mont from # x86_64-mont5 module, it's different in sense that it performs # reduction 256 bits at a time. my ($a0,$a1,$a2,$a3, $t0,$t1,$t2,$t3, $m0,$m1,$m2,$m3, $acc0,$acc1,$acc2,$acc3,$acc4, $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28)); my $bp_end=$rp; my ($carry,$topmost) = ($rp,"x30"); $code.=<<___; .globl bn_mul4x_mont .type bn_mul4x_mont,%function .align 5 bn_mul4x_mont: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-128]! 
add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] sub $tp,sp,$num,lsl#3 lsl $num,$num,#3 ldr $n0,[$n0] // *n0 sub sp,$tp,#8*4 // alloca add $t0,$bp,$num add $ap_end,$ap,$num stp $rp,$t0,[x29,#96] // offload rp and &b[num] ldr $bi,[$bp,#8*0] // b[0] ldp $a0,$a1,[$ap,#8*0] // a[0..3] ldp $a2,$a3,[$ap,#8*2] add $ap,$ap,#8*4 mov $acc0,xzr mov $acc1,xzr mov $acc2,xzr mov $acc3,xzr ldp $m0,$m1,[$np,#8*0] // n[0..3] ldp $m2,$m3,[$np,#8*2] adds $np,$np,#8*4 // clear carry bit mov $carry,xzr mov $cnt,#0 mov $tp,sp .Loop_mul4x_1st_reduction: mul $t0,$a0,$bi // lo(a[0..3]*b[0]) adc $carry,$carry,xzr // modulo-scheduled mul $t1,$a1,$bi add $cnt,$cnt,#8 mul $t2,$a2,$bi and $cnt,$cnt,#31 mul $t3,$a3,$bi adds $acc0,$acc0,$t0 umulh $t0,$a0,$bi // hi(a[0..3]*b[0]) adcs $acc1,$acc1,$t1 mul $mi,$acc0,$n0 // t[0]*n0 adcs $acc2,$acc2,$t2 umulh $t1,$a1,$bi adcs $acc3,$acc3,$t3 umulh $t2,$a2,$bi adc $acc4,xzr,xzr umulh $t3,$a3,$bi ldr $bi,[$bp,$cnt] // next b[i] (or b[0]) adds $acc1,$acc1,$t0 // (*) mul $t0,$m0,$mi // lo(n[0..3]*t[0]*n0) str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing adcs $acc2,$acc2,$t1 mul $t1,$m1,$mi adcs $acc3,$acc3,$t2 mul $t2,$m2,$mi adc $acc4,$acc4,$t3 // can't overflow mul $t3,$m3,$mi // (*) adds xzr,$acc0,$t0 subs xzr,$acc0,#1 // (*) umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0) adcs $acc0,$acc1,$t1 umulh $t1,$m1,$mi adcs $acc1,$acc2,$t2 umulh $t2,$m2,$mi adcs $acc2,$acc3,$t3 umulh $t3,$m3,$mi adcs $acc3,$acc4,$carry adc $carry,xzr,xzr adds $acc0,$acc0,$t0 sub $t0,$ap_end,$ap adcs $acc1,$acc1,$t1 adcs $acc2,$acc2,$t2 adcs $acc3,$acc3,$t3 //adc $carry,$carry,xzr cbnz $cnt,.Loop_mul4x_1st_reduction cbz $t0,.Lmul4x4_post_condition ldp $a0,$a1,[$ap,#8*0] // a[4..7] ldp $a2,$a3,[$ap,#8*2] add $ap,$ap,#8*4 ldr $mi,[sp] // a[0]*n0 ldp $m0,$m1,[$np,#8*0] // n[4..7] ldp $m2,$m3,[$np,#8*2] add $np,$np,#8*4 .Loop_mul4x_1st_tail: mul $t0,$a0,$bi // lo(a[4..7]*b[i]) adc $carry,$carry,xzr // modulo-scheduled mul $t1,$a1,$bi add $cnt,$cnt,#8 mul $t2,$a2,$bi and $cnt,$cnt,#31 mul $t3,$a3,$bi adds $acc0,$acc0,$t0 umulh $t0,$a0,$bi // hi(a[4..7]*b[i]) adcs $acc1,$acc1,$t1 umulh $t1,$a1,$bi adcs $acc2,$acc2,$t2 umulh $t2,$a2,$bi adcs $acc3,$acc3,$t3 umulh $t3,$a3,$bi adc $acc4,xzr,xzr ldr $bi,[$bp,$cnt] // next b[i] (or b[0]) adds $acc1,$acc1,$t0 mul $t0,$m0,$mi // lo(n[4..7]*a[0]*n0) adcs $acc2,$acc2,$t1 mul $t1,$m1,$mi adcs $acc3,$acc3,$t2 mul $t2,$m2,$mi adc $acc4,$acc4,$t3 // can't overflow mul $t3,$m3,$mi adds $acc0,$acc0,$t0 umulh $t0,$m0,$mi // hi(n[4..7]*a[0]*n0) adcs $acc1,$acc1,$t1 umulh $t1,$m1,$mi adcs $acc2,$acc2,$t2 umulh $t2,$m2,$mi adcs $acc3,$acc3,$t3 adcs $acc4,$acc4,$carry umulh $t3,$m3,$mi adc $carry,xzr,xzr ldr $mi,[sp,$cnt] // next t[0]*n0 str $acc0,[$tp],#8 // result!!! adds $acc0,$acc1,$t0 sub $t0,$ap_end,$ap // done yet? adcs $acc1,$acc2,$t1 adcs $acc2,$acc3,$t2 adcs $acc3,$acc4,$t3 //adc $carry,$carry,xzr cbnz $cnt,.Loop_mul4x_1st_tail sub $t1,$ap_end,$num // rewinded $ap cbz $t0,.Lmul4x_proceed ldp $a0,$a1,[$ap,#8*0] ldp $a2,$a3,[$ap,#8*2] add $ap,$ap,#8*4 ldp $m0,$m1,[$np,#8*0] ldp $m2,$m3,[$np,#8*2] add $np,$np,#8*4 b .Loop_mul4x_1st_tail .align 5 .Lmul4x_proceed: ldr $bi,[$bp,#8*4]! // *++b adc $topmost,$carry,xzr ldp $a0,$a1,[$t1,#8*0] // a[0..3] sub $np,$np,$num // rewind np ldp $a2,$a3,[$t1,#8*2] add $ap,$t1,#8*4 stp $acc0,$acc1,[$tp,#8*0] // result!!! ldp $acc0,$acc1,[sp,#8*4] // t[0..3] stp $acc2,$acc3,[$tp,#8*2] // result!!! 
ldp $acc2,$acc3,[sp,#8*6] ldp $m0,$m1,[$np,#8*0] // n[0..3] mov $tp,sp ldp $m2,$m3,[$np,#8*2] adds $np,$np,#8*4 // clear carry bit mov $carry,xzr .align 4 .Loop_mul4x_reduction: mul $t0,$a0,$bi // lo(a[0..3]*b[4]) adc $carry,$carry,xzr // modulo-scheduled mul $t1,$a1,$bi add $cnt,$cnt,#8 mul $t2,$a2,$bi and $cnt,$cnt,#31 mul $t3,$a3,$bi adds $acc0,$acc0,$t0 umulh $t0,$a0,$bi // hi(a[0..3]*b[4]) adcs $acc1,$acc1,$t1 mul $mi,$acc0,$n0 // t[0]*n0 adcs $acc2,$acc2,$t2 umulh $t1,$a1,$bi adcs $acc3,$acc3,$t3 umulh $t2,$a2,$bi adc $acc4,xzr,xzr umulh $t3,$a3,$bi ldr $bi,[$bp,$cnt] // next b[i] adds $acc1,$acc1,$t0 // (*) mul $t0,$m0,$mi str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing adcs $acc2,$acc2,$t1 mul $t1,$m1,$mi // lo(n[0..3]*t[0]*n0 adcs $acc3,$acc3,$t2 mul $t2,$m2,$mi adc $acc4,$acc4,$t3 // can't overflow mul $t3,$m3,$mi // (*) adds xzr,$acc0,$t0 subs xzr,$acc0,#1 // (*) umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0 adcs $acc0,$acc1,$t1 umulh $t1,$m1,$mi adcs $acc1,$acc2,$t2 umulh $t2,$m2,$mi adcs $acc2,$acc3,$t3 umulh $t3,$m3,$mi adcs $acc3,$acc4,$carry adc $carry,xzr,xzr adds $acc0,$acc0,$t0 adcs $acc1,$acc1,$t1 adcs $acc2,$acc2,$t2 adcs $acc3,$acc3,$t3 //adc $carry,$carry,xzr cbnz $cnt,.Loop_mul4x_reduction adc $carry,$carry,xzr ldp $t0,$t1,[$tp,#8*4] // t[4..7] ldp $t2,$t3,[$tp,#8*6] ldp $a0,$a1,[$ap,#8*0] // a[4..7] ldp $a2,$a3,[$ap,#8*2] add $ap,$ap,#8*4 adds $acc0,$acc0,$t0 adcs $acc1,$acc1,$t1 adcs $acc2,$acc2,$t2 adcs $acc3,$acc3,$t3 //adc $carry,$carry,xzr ldr $mi,[sp] // t[0]*n0 ldp $m0,$m1,[$np,#8*0] // n[4..7] ldp $m2,$m3,[$np,#8*2] add $np,$np,#8*4 .align 4 .Loop_mul4x_tail: mul $t0,$a0,$bi // lo(a[4..7]*b[4]) adc $carry,$carry,xzr // modulo-scheduled mul $t1,$a1,$bi add $cnt,$cnt,#8 mul $t2,$a2,$bi and $cnt,$cnt,#31 mul $t3,$a3,$bi adds $acc0,$acc0,$t0 umulh $t0,$a0,$bi // hi(a[4..7]*b[4]) adcs $acc1,$acc1,$t1 umulh $t1,$a1,$bi adcs $acc2,$acc2,$t2 umulh $t2,$a2,$bi adcs $acc3,$acc3,$t3 umulh $t3,$a3,$bi adc $acc4,xzr,xzr ldr $bi,[$bp,$cnt] // next b[i] adds $acc1,$acc1,$t0 mul $t0,$m0,$mi // lo(n[4..7]*t[0]*n0) adcs $acc2,$acc2,$t1 mul $t1,$m1,$mi adcs $acc3,$acc3,$t2 mul $t2,$m2,$mi adc $acc4,$acc4,$t3 // can't overflow mul $t3,$m3,$mi adds $acc0,$acc0,$t0 umulh $t0,$m0,$mi // hi(n[4..7]*t[0]*n0) adcs $acc1,$acc1,$t1 umulh $t1,$m1,$mi adcs $acc2,$acc2,$t2 umulh $t2,$m2,$mi adcs $acc3,$acc3,$t3 umulh $t3,$m3,$mi adcs $acc4,$acc4,$carry ldr $mi,[sp,$cnt] // next a[0]*n0 adc $carry,xzr,xzr str $acc0,[$tp],#8 // result!!! adds $acc0,$acc1,$t0 sub $t0,$ap_end,$ap // done yet? adcs $acc1,$acc2,$t1 adcs $acc2,$acc3,$t2 adcs $acc3,$acc4,$t3 //adc $carry,$carry,xzr cbnz $cnt,.Loop_mul4x_tail sub $t1,$np,$num // rewinded np? adc $carry,$carry,xzr cbz $t0,.Loop_mul4x_break ldp $t0,$t1,[$tp,#8*4] ldp $t2,$t3,[$tp,#8*6] ldp $a0,$a1,[$ap,#8*0] ldp $a2,$a3,[$ap,#8*2] add $ap,$ap,#8*4 adds $acc0,$acc0,$t0 adcs $acc1,$acc1,$t1 adcs $acc2,$acc2,$t2 adcs $acc3,$acc3,$t3 //adc $carry,$carry,xzr ldp $m0,$m1,[$np,#8*0] ldp $m2,$m3,[$np,#8*2] add $np,$np,#8*4 b .Loop_mul4x_tail .align 4 .Loop_mul4x_break: ldp $t2,$t3,[x29,#96] // pull rp and &b[num] adds $acc0,$acc0,$topmost add $bp,$bp,#8*4 // bp++ adcs $acc1,$acc1,xzr sub $ap,$ap,$num // rewind ap adcs $acc2,$acc2,xzr stp $acc0,$acc1,[$tp,#8*0] // result!!! adcs $acc3,$acc3,xzr ldp $acc0,$acc1,[sp,#8*4] // t[0..3] adc $topmost,$carry,xzr stp $acc2,$acc3,[$tp,#8*2] // result!!! cmp $bp,$t3 // done yet? 
ldp $acc2,$acc3,[sp,#8*6] ldp $m0,$m1,[$t1,#8*0] // n[0..3] ldp $m2,$m3,[$t1,#8*2] add $np,$t1,#8*4 b.eq .Lmul4x_post ldr $bi,[$bp] ldp $a0,$a1,[$ap,#8*0] // a[0..3] ldp $a2,$a3,[$ap,#8*2] adds $ap,$ap,#8*4 // clear carry bit mov $carry,xzr mov $tp,sp b .Loop_mul4x_reduction .align 4 .Lmul4x_post: // Final step. We see if result is larger than modulus, and // if it is, subtract the modulus. But comparison implies // subtraction. So we subtract modulus, see if it borrowed, // and conditionally copy original value. mov $rp,$t2 mov $ap_end,$t2 // $rp copy subs $t0,$acc0,$m0 add $tp,sp,#8*8 sbcs $t1,$acc1,$m1 sub $cnt,$num,#8*4 .Lmul4x_sub: sbcs $t2,$acc2,$m2 ldp $m0,$m1,[$np,#8*0] sub $cnt,$cnt,#8*4 ldp $acc0,$acc1,[$tp,#8*0] sbcs $t3,$acc3,$m3 ldp $m2,$m3,[$np,#8*2] add $np,$np,#8*4 ldp $acc2,$acc3,[$tp,#8*2] add $tp,$tp,#8*4 stp $t0,$t1,[$rp,#8*0] sbcs $t0,$acc0,$m0 stp $t2,$t3,[$rp,#8*2] add $rp,$rp,#8*4 sbcs $t1,$acc1,$m1 cbnz $cnt,.Lmul4x_sub sbcs $t2,$acc2,$m2 mov $tp,sp add $ap,sp,#8*4 ldp $a0,$a1,[$ap_end,#8*0] sbcs $t3,$acc3,$m3 stp $t0,$t1,[$rp,#8*0] ldp $a2,$a3,[$ap_end,#8*2] stp $t2,$t3,[$rp,#8*2] ldp $acc0,$acc1,[$ap,#8*0] ldp $acc2,$acc3,[$ap,#8*2] sbcs xzr,$topmost,xzr // did it borrow? ldr x30,[x29,#8] // pull return address sub $cnt,$num,#8*4 .Lmul4x_cond_copy: sub $cnt,$cnt,#8*4 csel $t0,$acc0,$a0,lo stp xzr,xzr,[$tp,#8*0] csel $t1,$acc1,$a1,lo ldp $a0,$a1,[$ap_end,#8*4] ldp $acc0,$acc1,[$ap,#8*4] csel $t2,$acc2,$a2,lo stp xzr,xzr,[$tp,#8*2] add $tp,$tp,#8*4 csel $t3,$acc3,$a3,lo ldp $a2,$a3,[$ap_end,#8*6] ldp $acc2,$acc3,[$ap,#8*6] add $ap,$ap,#8*4 stp $t0,$t1,[$ap_end,#8*0] stp $t2,$t3,[$ap_end,#8*2] add $ap_end,$ap_end,#8*4 cbnz $cnt,.Lmul4x_cond_copy csel $t0,$acc0,$a0,lo stp xzr,xzr,[$tp,#8*0] csel $t1,$acc1,$a1,lo stp xzr,xzr,[$tp,#8*2] csel $t2,$acc2,$a2,lo stp xzr,xzr,[$tp,#8*3] csel $t3,$acc3,$a3,lo stp xzr,xzr,[$tp,#8*4] stp $t0,$t1,[$ap_end,#8*0] stp $t2,$t3,[$ap_end,#8*2] b .Lmul4x_done .align 4 .Lmul4x4_post_condition: adc $carry,$carry,xzr ldr $ap,[x29,#96] // pull rp // $acc0-3,$carry hold result, $m0-7 hold modulus subs $a0,$acc0,$m0 ldr x30,[x29,#8] // pull return address sbcs $a1,$acc1,$m1 stp xzr,xzr,[sp,#8*0] sbcs $a2,$acc2,$m2 stp xzr,xzr,[sp,#8*2] sbcs $a3,$acc3,$m3 stp xzr,xzr,[sp,#8*4] sbcs xzr,$carry,xzr // did it borrow? stp xzr,xzr,[sp,#8*6] // $a0-3 hold result-modulus csel $a0,$acc0,$a0,lo csel $a1,$acc1,$a1,lo csel $a2,$acc2,$a2,lo csel $a3,$acc3,$a3,lo stp $a0,$a1,[$ap,#8*0] stp $a2,$a3,[$ap,#8*2] .Lmul4x_done: ldp x19,x20,[x29,#16] mov sp,x29 ldp x21,x22,[x29,#32] mov x0,#1 ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldr x29,[sp],#128 // x30 is popped earlier AARCH64_VALIDATE_LINK_REGISTER ret .size bn_mul4x_mont,.-bn_mul4x_mont ___ } $code.=<<___; .asciz "Montgomery Multiplication for ARMv8, CRYPTOGAMS by " .align 4 ___ print $code; close STDOUT or die "error closing STDOUT: $!"; ring-0.17.14/crypto/fipsmodule/bn/asm/x86-mont.pl000064400000000000000000000226071046102023000175470ustar 00000000000000#! /usr/bin/env perl # Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. # ==================================================================== # October 2005 # # This is a "teaser" code, as it can be improved in several ways... # First of all non-SSE2 path should be implemented (yes, for now it # performs Montgomery multiplication/convolution only on SSE2-capable # CPUs such as P4, others fall down to original code). Then inner loop # can be unrolled and modulo-scheduled to improve ILP and possibly # moved to 128-bit XMM register bank (though it would require input # rearrangement and/or increase bus bandwidth utilization). Dedicated # squaring procedure should give further performance improvement... # Yet, for being draft, the code improves rsa512 *sign* benchmark by # 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-) # December 2006 # # Modulo-scheduling SSE2 loops results in further 15-20% improvement. # Integer-only code [being equipped with dedicated squaring procedure] # gives ~40% on rsa512 sign benchmark... $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../../perlasm"); require "x86asm.pl"; $output = pop; open STDOUT,">$output"; &asm_init($ARGV[0]); $sse2=1; &function_begin("bn_mul_mont"); $i="edx"; $j="ecx"; $ap="esi"; $tp="esi"; # overlapping variables!!! $rp="edi"; $bp="edi"; # overlapping variables!!! $np="ebp"; $num="ebx"; $_num=&DWP(4*0,"esp"); # stack top layout $_rp=&DWP(4*1,"esp"); $_ap=&DWP(4*2,"esp"); $_bp=&DWP(4*3,"esp"); $_np=&DWP(4*4,"esp"); $_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp"); $_sp=&DWP(4*6,"esp"); $_bpend=&DWP(4*7,"esp"); $frame=32; # size of above frame rounded up to 16n &xor ("eax","eax"); &mov ("edi",&wparam(5)); # int num &lea ("esi",&wparam(0)); # put aside pointer to argument block &lea ("edx",&wparam(1)); # load ap &add ("edi",2); # extra two words on top of tp &neg ("edi"); &lea ("ebp",&DWP(-$frame,"esp","edi",4)); # future alloca($frame+4*(num+2)) &neg ("edi"); # minimize cache contention by arranging 2K window between stack # pointer and ap argument [np is also position sensitive vector, # but it's assumed to be near ap, as it's allocated at ~same # time]. &mov ("eax","ebp"); &sub ("eax","edx"); &and ("eax",2047); &sub ("ebp","eax"); # this aligns sp and ap modulo 2048 &xor ("edx","ebp"); &and ("edx",2048); &xor ("edx",2048); &sub ("ebp","edx"); # this splits them apart modulo 4096 &and ("ebp",-64); # align to cache line # An OS-agnostic version of __chkstk. # # Some OSes (Windows) insist on stack being "wired" to # physical memory in strictly sequential manner, i.e. if stack # allocation spans two pages, then reference to farmost one can # be punishable by SEGV. But page walking can do good even on # other OSes, because it guarantees that villain thread hits # the guard page before it can make damage to innocent one... &mov ("eax","esp"); &sub ("eax","ebp"); &and ("eax",-4096); &mov ("edx","esp"); # saved stack pointer! &lea ("esp",&DWP(0,"ebp","eax")); &mov ("eax",&DWP(0,"esp")); &cmp ("esp","ebp"); &ja (&label("page_walk")); &jmp (&label("page_walk_done")); &set_label("page_walk",16); &lea ("esp",&DWP(-4096,"esp")); &mov ("eax",&DWP(0,"esp")); &cmp ("esp","ebp"); &ja (&label("page_walk")); &set_label("page_walk_done"); ################################# load argument block... 
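The page-walk loop above is the "OS-agnostic __chkstk" the comment describes: when a new stack frame may span several pages, each 4 KiB page is touched in order from the top down, so a guard page is hit before any access lands far below it. The real probing has to happen on the actual stack, in assembly; the sketch below only illustrates the touch-every-page pattern on an ordinary heap buffer, with a hypothetical helper name that is not ring code.

```rust
const PAGE: usize = 4096;

/// Touch a downward-growing region one page at a time, highest page first,
/// mirroring the `.Lpage_walk` loops: step the "pointer" down by 4096 and
/// load from it until the final base is reached.
fn page_walk(region: &[u8]) {
    let mut offset = region.len();
    while offset > 0 {
        offset = offset.saturating_sub(PAGE);
        // A volatile read models the `mov (%esp),%eax` / `mov (%rsp),%r11`
        // probe in the assembly.
        let _ = unsafe { core::ptr::read_volatile(region.as_ptr().add(offset)) };
    }
}

fn main() {
    let big = vec![0u8; 5 * PAGE + 123];
    page_walk(&big);
}
```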
&mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp &mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap &mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp &mov ("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np &mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0 #&mov ("edi",&DWP(5*4,"esi"));# int num &mov ("esi",&DWP(0,"esi")); # pull n0[0] &mov ($_rp,"eax"); # ... save a copy of argument block &mov ($_ap,"ebx"); &mov ($_bp,"ecx"); &mov ($_np,"ebp"); &mov ($_n0,"esi"); &lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling #&mov ($_num,$num); # redundant as $num is not reused &mov ($_sp,"edx"); # saved stack pointer! if($sse2) { $acc0="mm0"; # mmx register bank layout $acc1="mm1"; $car0="mm2"; $car1="mm3"; $mul0="mm4"; $mul1="mm5"; $temp="mm6"; $mask="mm7"; &mov ("eax",-1); &movd ($mask,"eax"); # mask 32 lower bits &mov ($ap,$_ap); # load input pointers &mov ($bp,$_bp); &mov ($np,$_np); &xor ($i,$i); # i=0 &xor ($j,$j); # j=0 &movd ($mul0,&DWP(0,$bp)); # bp[0] &movd ($mul1,&DWP(0,$ap)); # ap[0] &movd ($car1,&DWP(0,$np)); # np[0] &pmuludq($mul1,$mul0); # ap[0]*bp[0] &movq ($car0,$mul1); &movq ($acc0,$mul1); # I wish movd worked for &pand ($acc0,$mask); # inter-register transfers &pmuludq($mul1,$_n0q); # *=n0 &pmuludq($car1,$mul1); # "t[0]"*np[0]*n0 &paddq ($car1,$acc0); &movd ($acc1,&DWP(4,$np)); # np[1] &movd ($acc0,&DWP(4,$ap)); # ap[1] &psrlq ($car0,32); &psrlq ($car1,32); &inc ($j); # j++ &set_label("1st",16); &pmuludq($acc0,$mul0); # ap[j]*bp[0] &pmuludq($acc1,$mul1); # np[j]*m1 &paddq ($car0,$acc0); # +=c0 &paddq ($car1,$acc1); # +=c1 &movq ($acc0,$car0); &pand ($acc0,$mask); &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1] &paddq ($car1,$acc0); # +=ap[j]*bp[0]; &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1] &psrlq ($car0,32); &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]= &psrlq ($car1,32); &lea ($j,&DWP(1,$j)); &cmp ($j,$num); &jl (&label("1st")); &pmuludq($acc0,$mul0); # ap[num-1]*bp[0] &pmuludq($acc1,$mul1); # np[num-1]*m1 &paddq ($car0,$acc0); # +=c0 &paddq ($car1,$acc1); # +=c1 &movq ($acc0,$car0); &pand ($acc0,$mask); &paddq ($car1,$acc0); # +=ap[num-1]*bp[0]; &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]= &psrlq ($car0,32); &psrlq ($car1,32); &paddq ($car1,$car0); &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1] &inc ($i); # i++ &set_label("outer"); &xor ($j,$j); # j=0 &movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i] &movd ($mul1,&DWP(0,$ap)); # ap[0] &movd ($temp,&DWP($frame,"esp")); # tp[0] &movd ($car1,&DWP(0,$np)); # np[0] &pmuludq($mul1,$mul0); # ap[0]*bp[i] &paddq ($mul1,$temp); # +=tp[0] &movq ($acc0,$mul1); &movq ($car0,$mul1); &pand ($acc0,$mask); &pmuludq($mul1,$_n0q); # *=n0 &pmuludq($car1,$mul1); &paddq ($car1,$acc0); &movd ($temp,&DWP($frame+4,"esp")); # tp[1] &movd ($acc1,&DWP(4,$np)); # np[1] &movd ($acc0,&DWP(4,$ap)); # ap[1] &psrlq ($car0,32); &psrlq ($car1,32); &paddq ($car0,$temp); # +=tp[1] &inc ($j); # j++ &dec ($num); &set_label("inner"); &pmuludq($acc0,$mul0); # ap[j]*bp[i] &pmuludq($acc1,$mul1); # np[j]*m1 &paddq ($car0,$acc0); # +=c0 &paddq ($car1,$acc1); # +=c1 &movq ($acc0,$car0); &movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1] &pand ($acc0,$mask); &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1] &paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j] &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1] &psrlq ($car0,32); &movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]= &psrlq ($car1,32); &paddq ($car0,$temp); # +=tp[j+1] &dec ($num); &lea ($j,&DWP(1,$j)); # j++ &jnz (&label("inner")); &mov ($num,$j); &pmuludq($acc0,$mul0); # ap[num-1]*bp[i] &pmuludq($acc1,$mul1); # 
np[num-1]*m1 &paddq ($car0,$acc0); # +=c0 &paddq ($car1,$acc1); # +=c1 &movq ($acc0,$car0); &pand ($acc0,$mask); &paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1] &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]= &psrlq ($car0,32); &psrlq ($car1,32); &movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num] &paddq ($car1,$car0); &paddq ($car1,$temp); &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1] &lea ($i,&DWP(1,$i)); # i++ &cmp ($i,$num); &jle (&label("outer")); &emms (); # done with mmx bank &jmp (&label("common_tail")); } &set_label("common_tail",16); &mov ($np,$_np); # load modulus pointer &mov ($rp,$_rp); # load result pointer &lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped] &mov ("eax",&DWP(0,$tp)); # tp[0] &mov ($j,$num); # j=num-1 &xor ($i,$i); # i=0 and clear CF! &set_label("sub",16); &sbb ("eax",&DWP(0,$np,$i,4)); &mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i] &dec ($j); # doesn't affect CF! &mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1] &lea ($i,&DWP(1,$i)); # i++ &jge (&label("sub")); &sbb ("eax",0); # handle upmost overflow bit &mov ("edx",-1); &xor ("edx","eax"); &jmp (&label("copy")); &set_label("copy",16); # conditional copy &mov ($tp,&DWP($frame,"esp",$num,4)); &mov ($np,&DWP(0,$rp,$num,4)); &mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector &and ($tp,"eax"); &and ($np,"edx"); &or ($np,$tp); &mov (&DWP(0,$rp,$num,4),$np); &dec ($num); &jge (&label("copy")); &mov ("esp",$_sp); # pull saved stack pointer &mov ("eax",1); &function_end("bn_mul_mont"); &asciz("Montgomery Multiplication for x86, CRYPTOGAMS by "); &asm_finish(); close STDOUT or die "error closing STDOUT: $!"; ring-0.17.14/crypto/fipsmodule/bn/asm/x86_64-mont.pl000064400000000000000000000777351046102023000200740ustar 00000000000000#! /usr/bin/env perl # Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. # ==================================================================== # October 2005. # # Montgomery multiplication routine for x86_64. While it gives modest # 9% improvement of rsa4096 sign on Opteron, rsa512 sign runs more # than twice, >2x, as fast. Most common rsa1024 sign is improved by # respectful 50%. It remains to be seen if loop unrolling and # dedicated squaring routine can provide further improvement... # July 2011. # # Add dedicated squaring procedure. Performance improvement varies # from platform to platform, but in average it's ~5%/15%/25%/33% # for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively. # August 2011. # # Unroll and modulo-schedule inner loops in such manner that they # are "fallen through" for input lengths of 8, which is critical for # 1024-bit RSA *sign*. Average performance improvement in comparison # to *initial* version of this module from 2005 is ~0%/30%/40%/45% # for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively. # June 2013. 
# # Optimize reduction in squaring procedure and improve 1024+-bit RSA # sign performance by 10-16% on Intel Sandy Bridge and later # (virtually same on non-Intel processors). # August 2013. # # Add MULX/ADOX/ADCX code path. $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; # In upstream, this is controlled by shelling out to the compiler to check # versions, but BoringSSL is intended to be used with pre-generated perlasm # output, so this isn't useful anyway. $addx = 1; # void bn_mul_mont_nohw( $rp="%rdi"; # BN_ULONG *rp, $ap="%rsi"; # const BN_ULONG *ap, $bp="%rdx"; # const BN_ULONG *bp, $np="%rcx"; # const BN_ULONG *np, $n0="%r8"; # const BN_ULONG *n0, # TODO(davidben): The code below treats $num as an int, but C passes in a # size_t. $num="%r9"; # size_t num); $lo0="%r10"; $hi0="%r11"; $hi1="%r13"; $i="%r14"; $j="%r15"; $m0="%rbx"; $m1="%rbp"; $code=<<___; .text .globl bn_mul_mont_nohw .type bn_mul_mont_nohw,\@function,6 .align 16 bn_mul_mont_nohw: .cfi_startproc _CET_ENDBR mov ${num}d,${num}d mov %rsp,%rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 neg $num mov %rsp,%r11 lea -16(%rsp,$num,8),%r10 # future alloca(8*(num+2)) neg $num # restore $num and \$-1024,%r10 # minimize TLB usage # An OS-agnostic version of __chkstk. # # Some OSes (Windows) insist on stack being "wired" to # physical memory in strictly sequential manner, i.e. if stack # allocation spans two pages, then reference to farmost one can # be punishable by SEGV. But page walking can do good even on # other OSes, because it guarantees that villain thread hits # the guard page before it can make damage to innocent one... 
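For orientation before the heavily scheduled code below: bn_mul_mont_nohw computes the word-serial (CIOS-style) Montgomery product a*b*R^-1 mod n with R = 2^(64*num), using n0 = -n[0]^-1 mod 2^64 so that adding m1*n cancels the low word on every iteration. The Rust function below is a plain reference sketch of that recurrence, with a hypothetical name and u64 limbs; it assumes a, b < n, is not constant-time, and leaves out both the loop interleaving and the final conditional subtraction that the assembly performs.

```rust
/// Reference word-serial Montgomery multiplication: returns the num+1 low
/// words of a*b*R^-1 mod n before the final conditional subtraction, where
/// R = 2^(64*num) and n0 = -n[0]^-1 mod 2^64 (the value passed as *n0).
fn mont_mul(a: &[u64], b: &[u64], n: &[u64], n0: u64) -> Vec<u64> {
    let num = n.len();
    assert!(a.len() == num && b.len() == num);
    let mut t = vec![0u64; num + 2]; // tp[0..num+1]
    for i in 0..num {
        // t += a * b[i]
        let mut carry = 0u128;
        for j in 0..num {
            let v = t[j] as u128 + a[j] as u128 * b[i] as u128 + carry;
            t[j] = v as u64;
            carry = v >> 64;
        }
        let v = t[num] as u128 + carry;
        t[num] = v as u64;
        t[num + 1] = (v >> 64) as u64;

        // m1 = "tp[0]"*n0; adding m1*n forces the low word to zero.
        let m1 = t[0].wrapping_mul(n0);
        let mut carry = 0u128;
        for j in 0..num {
            let v = t[j] as u128 + m1 as u128 * n[j] as u128 + carry;
            t[j] = v as u64;
            carry = v >> 64;
        }
        let v = t[num] as u128 + carry;
        t[num] = v as u64;
        t[num + 1] += (v >> 64) as u64;

        // Divide by 2^64: drop the now-zero low word.
        t.remove(0);
        t.push(0);
    }
    // The caller still owes one conditional subtraction of n (the
    // .Lsub/.Lcopy code below); the value here is < 2n and may use t[num].
    t.truncate(num + 1);
    t
}
```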
sub %r10,%r11 and \$-4096,%r11 lea (%r10,%r11),%rsp mov (%rsp),%r11 cmp %r10,%rsp ja .Lmul_page_walk jmp .Lmul_page_walk_done .align 16 .Lmul_page_walk: lea -4096(%rsp),%rsp mov (%rsp),%r11 cmp %r10,%rsp ja .Lmul_page_walk .Lmul_page_walk_done: mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp .cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8 .Lmul_body: mov $bp,%r12 # reassign $bp ___ $bp="%r12"; $code.=<<___; mov ($n0),$n0 # pull n0[0] value mov ($bp),$m0 # m0=bp[0] mov ($ap),%rax xor $i,$i # i=0 xor $j,$j # j=0 mov $n0,$m1 mulq $m0 # ap[0]*bp[0] mov %rax,$lo0 mov ($np),%rax imulq $lo0,$m1 # "tp[0]"*n0 mov %rdx,$hi0 mulq $m1 # np[0]*m1 add %rax,$lo0 # discarded mov 8($ap),%rax adc \$0,%rdx mov %rdx,$hi1 lea 1($j),$j # j++ jmp .L1st_enter .align 16 .L1st: add %rax,$hi1 mov ($ap,$j,8),%rax adc \$0,%rdx add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] mov $lo0,$hi0 adc \$0,%rdx mov $hi1,-16(%rsp,$j,8) # tp[j-1] mov %rdx,$hi1 .L1st_enter: mulq $m0 # ap[j]*bp[0] add %rax,$hi0 mov ($np,$j,8),%rax adc \$0,%rdx lea 1($j),$j # j++ mov %rdx,$lo0 mulq $m1 # np[j]*m1 cmp $num,$j jne .L1st add %rax,$hi1 mov ($ap),%rax # ap[0] adc \$0,%rdx add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $hi1,-16(%rsp,$j,8) # tp[j-1] mov %rdx,$hi1 mov $lo0,$hi0 xor %rdx,%rdx add $hi0,$hi1 adc \$0,%rdx mov $hi1,-8(%rsp,$num,8) mov %rdx,(%rsp,$num,8) # store upmost overflow bit lea 1($i),$i # i++ jmp .Louter .align 16 .Louter: mov ($bp,$i,8),$m0 # m0=bp[i] xor $j,$j # j=0 mov $n0,$m1 mov (%rsp),$lo0 mulq $m0 # ap[0]*bp[i] add %rax,$lo0 # ap[0]*bp[i]+tp[0] mov ($np),%rax adc \$0,%rdx imulq $lo0,$m1 # tp[0]*n0 mov %rdx,$hi0 mulq $m1 # np[0]*m1 add %rax,$lo0 # discarded mov 8($ap),%rax adc \$0,%rdx mov 8(%rsp),$lo0 # tp[1] mov %rdx,$hi1 lea 1($j),$j # j++ jmp .Linner_enter .align 16 .Linner: add %rax,$hi1 mov ($ap,$j,8),%rax adc \$0,%rdx add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] mov (%rsp,$j,8),$lo0 adc \$0,%rdx mov $hi1,-16(%rsp,$j,8) # tp[j-1] mov %rdx,$hi1 .Linner_enter: mulq $m0 # ap[j]*bp[i] add %rax,$hi0 mov ($np,$j,8),%rax adc \$0,%rdx add $hi0,$lo0 # ap[j]*bp[i]+tp[j] mov %rdx,$hi0 adc \$0,$hi0 lea 1($j),$j # j++ mulq $m1 # np[j]*m1 cmp $num,$j jne .Linner add %rax,$hi1 mov ($ap),%rax # ap[0] adc \$0,%rdx add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] mov (%rsp,$j,8),$lo0 adc \$0,%rdx mov $hi1,-16(%rsp,$j,8) # tp[j-1] mov %rdx,$hi1 xor %rdx,%rdx add $hi0,$hi1 adc \$0,%rdx add $lo0,$hi1 # pull upmost overflow bit adc \$0,%rdx mov $hi1,-8(%rsp,$num,8) mov %rdx,(%rsp,$num,8) # store upmost overflow bit lea 1($i),$i # i++ cmp $num,$i jb .Louter xor $i,$i # i=0 and clear CF! mov (%rsp),%rax # tp[0] mov $num,$j # j=num .align 16 .Lsub: sbb ($np,$i,8),%rax mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] mov 8(%rsp,$i,8),%rax # tp[i+1] lea 1($i),$i # i++ dec $j # doesn't affect CF! 
jnz .Lsub sbb \$0,%rax # handle upmost overflow bit mov \$-1,%rbx xor %rax,%rbx # not %rax xor $i,$i mov $num,$j # j=num .Lcopy: # conditional copy mov ($rp,$i,8),%rcx mov (%rsp,$i,8),%rdx and %rbx,%rcx and %rax,%rdx mov $num,(%rsp,$i,8) # zap temporary vector or %rcx,%rdx mov %rdx,($rp,$i,8) # rp[i]=tp[i] lea 1($i),$i sub \$1,$j jnz .Lcopy mov 8(%rsp,$num,8),%rsi # restore %rsp .cfi_def_cfa %rsi,8 mov \$1,%rax mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lmul_epilogue: ret .cfi_endproc .size bn_mul_mont_nohw,.-bn_mul_mont_nohw ___ {{{ my @A=("%r10","%r11"); my @N=("%r13","%rdi"); $code.=<<___; .globl bn_mul4x_mont .type bn_mul4x_mont,\@function,6 .align 16 bn_mul4x_mont: .cfi_startproc _CET_ENDBR mov ${num}d,${num}d mov %rsp,%rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 neg $num mov %rsp,%r11 lea -32(%rsp,$num,8),%r10 # future alloca(8*(num+4)) neg $num # restore and \$-1024,%r10 # minimize TLB usage sub %r10,%r11 and \$-4096,%r11 lea (%r10,%r11),%rsp mov (%rsp),%r11 cmp %r10,%rsp ja .Lmul4x_page_walk jmp .Lmul4x_page_walk_done .Lmul4x_page_walk: lea -4096(%rsp),%rsp mov (%rsp),%r11 cmp %r10,%rsp ja .Lmul4x_page_walk .Lmul4x_page_walk_done: mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp .cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8 .Lmul4x_body: mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp mov %rdx,%r12 # reassign $bp ___ $bp="%r12"; $code.=<<___; mov ($n0),$n0 # pull n0[0] value mov ($bp),$m0 # m0=bp[0] mov ($ap),%rax xor $i,$i # i=0 xor $j,$j # j=0 mov $n0,$m1 mulq $m0 # ap[0]*bp[0] mov %rax,$A[0] mov ($np),%rax imulq $A[0],$m1 # "tp[0]"*n0 mov %rdx,$A[1] mulq $m1 # np[0]*m1 add %rax,$A[0] # discarded mov 8($ap),%rax adc \$0,%rdx mov %rdx,$N[1] mulq $m0 add %rax,$A[1] mov 8($np),%rax adc \$0,%rdx mov %rdx,$A[0] mulq $m1 add %rax,$N[1] mov 16($ap),%rax adc \$0,%rdx add $A[1],$N[1] lea 4($j),$j # j++ adc \$0,%rdx mov $N[1],(%rsp) mov %rdx,$N[0] jmp .L1st4x .align 16 .L1st4x: mulq $m0 # ap[j]*bp[0] add %rax,$A[0] mov -16($np,$j,8),%rax adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov -8($ap,$j,8),%rax adc \$0,%rdx add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $N[0],-24(%rsp,$j,8) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[0] add %rax,$A[1] mov -8($np,$j,8),%rax adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov ($ap,$j,8),%rax adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $N[1],-16(%rsp,$j,8) # tp[j-1] mov %rdx,$N[0] mulq $m0 # ap[j]*bp[0] add %rax,$A[0] mov ($np,$j,8),%rax adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov 8($ap,$j,8),%rax adc \$0,%rdx add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $N[0],-8(%rsp,$j,8) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[0] add %rax,$A[1] mov 8($np,$j,8),%rax adc \$0,%rdx lea 4($j),$j # j++ mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov -16($ap,$j,8),%rax adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $N[1],-32(%rsp,$j,8) # tp[j-1] mov %rdx,$N[0] cmp $num,$j jb .L1st4x mulq $m0 # ap[j]*bp[0] add %rax,$A[0] mov -16($np,$j,8),%rax adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov -8($ap,$j,8),%rax adc \$0,%rdx add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov 
$N[0],-24(%rsp,$j,8) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[0] add %rax,$A[1] mov -8($np,$j,8),%rax adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov ($ap),%rax # ap[0] adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $N[1],-16(%rsp,$j,8) # tp[j-1] mov %rdx,$N[0] xor $N[1],$N[1] add $A[0],$N[0] adc \$0,$N[1] mov $N[0],-8(%rsp,$j,8) mov $N[1],(%rsp,$j,8) # store upmost overflow bit lea 1($i),$i # i++ .align 4 .Louter4x: mov ($bp,$i,8),$m0 # m0=bp[i] xor $j,$j # j=0 mov (%rsp),$A[0] mov $n0,$m1 mulq $m0 # ap[0]*bp[i] add %rax,$A[0] # ap[0]*bp[i]+tp[0] mov ($np),%rax adc \$0,%rdx imulq $A[0],$m1 # tp[0]*n0 mov %rdx,$A[1] mulq $m1 # np[0]*m1 add %rax,$A[0] # "$N[0]", discarded mov 8($ap),%rax adc \$0,%rdx mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] add %rax,$A[1] mov 8($np),%rax adc \$0,%rdx add 8(%rsp),$A[1] # +tp[1] adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov 16($ap),%rax adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] lea 4($j),$j # j+=2 adc \$0,%rdx mov $N[1],(%rsp) # tp[j-1] mov %rdx,$N[0] jmp .Linner4x .align 16 .Linner4x: mulq $m0 # ap[j]*bp[i] add %rax,$A[0] mov -16($np,$j,8),%rax adc \$0,%rdx add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov -8($ap,$j,8),%rax adc \$0,%rdx add $A[0],$N[0] adc \$0,%rdx mov $N[0],-24(%rsp,$j,8) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] add %rax,$A[1] mov -8($np,$j,8),%rax adc \$0,%rdx add -8(%rsp,$j,8),$A[1] adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov ($ap,$j,8),%rax adc \$0,%rdx add $A[1],$N[1] adc \$0,%rdx mov $N[1],-16(%rsp,$j,8) # tp[j-1] mov %rdx,$N[0] mulq $m0 # ap[j]*bp[i] add %rax,$A[0] mov ($np,$j,8),%rax adc \$0,%rdx add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov 8($ap,$j,8),%rax adc \$0,%rdx add $A[0],$N[0] adc \$0,%rdx mov $N[0],-8(%rsp,$j,8) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] add %rax,$A[1] mov 8($np,$j,8),%rax adc \$0,%rdx add 8(%rsp,$j,8),$A[1] adc \$0,%rdx lea 4($j),$j # j++ mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov -16($ap,$j,8),%rax adc \$0,%rdx add $A[1],$N[1] adc \$0,%rdx mov $N[1],-32(%rsp,$j,8) # tp[j-1] mov %rdx,$N[0] cmp $num,$j jb .Linner4x mulq $m0 # ap[j]*bp[i] add %rax,$A[0] mov -16($np,$j,8),%rax adc \$0,%rdx add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov -8($ap,$j,8),%rax adc \$0,%rdx add $A[0],$N[0] adc \$0,%rdx mov $N[0],-24(%rsp,$j,8) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] add %rax,$A[1] mov -8($np,$j,8),%rax adc \$0,%rdx add -8(%rsp,$j,8),$A[1] adc \$0,%rdx lea 1($i),$i # i++ mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov ($ap),%rax # ap[0] adc \$0,%rdx add $A[1],$N[1] adc \$0,%rdx mov $N[1],-16(%rsp,$j,8) # tp[j-1] mov %rdx,$N[0] xor $N[1],$N[1] add $A[0],$N[0] adc \$0,$N[1] add (%rsp,$num,8),$N[0] # pull upmost overflow bit adc \$0,$N[1] mov $N[0],-8(%rsp,$j,8) mov $N[1],(%rsp,$j,8) # store upmost overflow bit cmp $num,$i jb .Louter4x ___ { my @ri=("%rax","%rdx",$m0,$m1); $code.=<<___; mov 16(%rsp,$num,8),$rp # restore $rp lea -4($num),$j mov 0(%rsp),@ri[0] # tp[0] mov 8(%rsp),@ri[1] # tp[1] shr \$2,$j # j=num/4-1 lea (%rsp),$ap # borrow ap for tp xor $i,$i # i=0 and clear CF! 
sub 0($np),@ri[0] mov 16($ap),@ri[2] # tp[2] mov 24($ap),@ri[3] # tp[3] sbb 8($np),@ri[1] .Lsub4x: mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] sbb 16($np,$i,8),@ri[2] mov 32($ap,$i,8),@ri[0] # tp[i+1] mov 40($ap,$i,8),@ri[1] sbb 24($np,$i,8),@ri[3] mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] sbb 32($np,$i,8),@ri[0] mov 48($ap,$i,8),@ri[2] mov 56($ap,$i,8),@ri[3] sbb 40($np,$i,8),@ri[1] lea 4($i),$i # i++ dec $j # doesn't affect CF! jnz .Lsub4x mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] mov 32($ap,$i,8),@ri[0] # load overflow bit sbb 16($np,$i,8),@ri[2] mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] sbb 24($np,$i,8),@ri[3] mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] sbb \$0,@ri[0] # handle upmost overflow bit mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] pxor %xmm0,%xmm0 movq @ri[0],%xmm4 pcmpeqd %xmm5,%xmm5 pshufd \$0,%xmm4,%xmm4 mov $num,$j pxor %xmm4,%xmm5 shr \$2,$j # j=num/4 xor %eax,%eax # i=0 jmp .Lcopy4x .align 16 .Lcopy4x: # conditional copy movdqa (%rsp,%rax),%xmm1 movdqu ($rp,%rax),%xmm2 pand %xmm4,%xmm1 pand %xmm5,%xmm2 movdqa 16(%rsp,%rax),%xmm3 movdqa %xmm0,(%rsp,%rax) por %xmm2,%xmm1 movdqu 16($rp,%rax),%xmm2 movdqu %xmm1,($rp,%rax) pand %xmm4,%xmm3 pand %xmm5,%xmm2 movdqa %xmm0,16(%rsp,%rax) por %xmm2,%xmm3 movdqu %xmm3,16($rp,%rax) lea 32(%rax),%rax dec $j jnz .Lcopy4x ___ } $code.=<<___; mov 8(%rsp,$num,8),%rsi # restore %rsp .cfi_def_cfa %rsi, 8 mov \$1,%rax mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lmul4x_epilogue: ret .cfi_endproc .size bn_mul4x_mont,.-bn_mul4x_mont ___ }}} {{{ ###################################################################### # void bn_sqr8x_mont( my $rptr="%rdi"; # const BN_ULONG *rptr, my $aptr="%rsi"; # const BN_ULONG *aptr, my $mulx_adx_capable="%rdx"; # Different than upstream! my $nptr="%rcx"; # const BN_ULONG *nptr, my $n0 ="%r8"; # const BN_ULONG *n0); my $num ="%r9"; # int num, has to be divisible by 8 my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); my @A0=("%r10","%r11"); my @A1=("%r12","%r13"); my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); $code.=<<___ if ($addx); .extern bn_sqrx8x_internal # see x86_64-mont5 module ___ $code.=<<___; .extern bn_sqr8x_internal # see x86_64-mont5 module .globl bn_sqr8x_mont .type bn_sqr8x_mont,\@function,6 .align 32 bn_sqr8x_mont: .cfi_startproc _CET_ENDBR mov ${num}d,${num}d mov %rsp,%rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lsqr8x_prologue: mov ${num}d,%r10d shl \$3,${num}d # convert $num to bytes shl \$3+2,%r10 # 4*$num neg $num ############################################################## # ensure that stack frame doesn't alias with $aptr modulo # 4096. this is done to allow memory disambiguation logic # do its job. 
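The comment above explains why bn_sqr8x_mont nudges its stack frame: if the frame and $aptr occupy addresses that are equal modulo 4096, the 4 KiB-aliased loads and stores can confuse the CPU's memory-disambiguation logic and stall the pipeline, so the frame is slid until the two regions differ modulo 4096 while staying 64-byte aligned. The Rust sketch below shows only the aliasing condition being avoided, with a hypothetical helper and equal-length regions for simplicity; the assembly performs a one-shot adjustment rather than a check.

```rust
/// True when the byte ranges [a, a+len) and [b, b+len) contain addresses
/// that are equal modulo 4096 -- the 4 KiB aliasing the prologue avoids.
fn aliases_mod_4096(a: usize, b: usize, len: usize) -> bool {
    assert!(len > 0 && len <= 4096);
    let d = a.wrapping_sub(b) & 4095;
    d < len || d > 4096 - len
}

fn main() {
    // A 2*num*8-byte frame for num = 64 (1 KiB) against an aptr at an
    // arbitrary address: offset by 64 bytes it aliases, offset by 2 KiB
    // it does not.
    let aptr = 0x1234_5100usize;
    let len = 2 * 64 * 8;
    assert!(aliases_mod_4096(aptr + 16 * 4096 + 64, aptr, len));
    assert!(!aliases_mod_4096(aptr + 16 * 4096 + 2048, aptr, len));
}
```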
# lea -64(%rsp,$num,2),%r11 mov %rsp,%rbp mov ($n0),$n0 # *n0 sub $aptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lsqr8x_sp_alt sub %r11,%rbp # align with $aptr lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num) jmp .Lsqr8x_sp_done .align 32 .Lsqr8x_sp_alt: lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 sub %r11,%rbp .Lsqr8x_sp_done: and \$-64,%rbp mov %rsp,%r11 sub %rbp,%r11 and \$-4096,%r11 lea (%rbp,%r11),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lsqr8x_page_walk jmp .Lsqr8x_page_walk_done .align 16 .Lsqr8x_page_walk: lea -4096(%rsp),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lsqr8x_page_walk .Lsqr8x_page_walk_done: mov $num,%r10 neg $num mov $n0, 32(%rsp) mov %rax, 40(%rsp) # save original %rsp .cfi_cfa_expression %rsp+40,deref,+8 .Lsqr8x_body: movq $nptr, %xmm2 # save pointer to modulus pxor %xmm0,%xmm0 movq $rptr,%xmm1 # save $rptr movq %r10, %xmm3 # -$num ___ $code.=<<___ if ($addx); test $mulx_adx_capable,$mulx_adx_capable jz .Lsqr8x_nox call bn_sqrx8x_internal # see x86_64-mont5 module # %rax top-most carry # %rbp nptr # %rcx -8*num # %r8 end of tp[2*num] lea (%r8,%rcx),%rbx mov %rcx,$num mov %rcx,%rdx movq %xmm1,$rptr sar \$3+2,%rcx # %cf=0 jmp .Lsqr8x_sub .align 32 .Lsqr8x_nox: ___ $code.=<<___; call bn_sqr8x_internal # see x86_64-mont5 module # %rax top-most carry # %rbp nptr # %r8 -8*num # %rdi end of tp[2*num] lea (%rdi,$num),%rbx mov $num,%rcx mov $num,%rdx movq %xmm1,$rptr sar \$3+2,%rcx # %cf=0 jmp .Lsqr8x_sub .align 32 .Lsqr8x_sub: mov 8*0(%rbx),%r12 mov 8*1(%rbx),%r13 mov 8*2(%rbx),%r14 mov 8*3(%rbx),%r15 lea 8*4(%rbx),%rbx sbb 8*0(%rbp),%r12 sbb 8*1(%rbp),%r13 sbb 8*2(%rbp),%r14 sbb 8*3(%rbp),%r15 lea 8*4(%rbp),%rbp mov %r12,8*0($rptr) mov %r13,8*1($rptr) mov %r14,8*2($rptr) mov %r15,8*3($rptr) lea 8*4($rptr),$rptr inc %rcx # preserves %cf jnz .Lsqr8x_sub sbb \$0,%rax # top-most carry lea (%rbx,$num),%rbx # rewind lea ($rptr,$num),$rptr # rewind movq %rax,%xmm1 pxor %xmm0,%xmm0 pshufd \$0,%xmm1,%xmm1 mov 40(%rsp),%rsi # restore %rsp .cfi_def_cfa %rsi,8 jmp .Lsqr8x_cond_copy .align 32 .Lsqr8x_cond_copy: movdqa 16*0(%rbx),%xmm2 movdqa 16*1(%rbx),%xmm3 lea 16*2(%rbx),%rbx movdqu 16*0($rptr),%xmm4 movdqu 16*1($rptr),%xmm5 lea 16*2($rptr),$rptr movdqa %xmm0,-16*2(%rbx) # zero tp movdqa %xmm0,-16*1(%rbx) movdqa %xmm0,-16*2(%rbx,%rdx) movdqa %xmm0,-16*1(%rbx,%rdx) pcmpeqd %xmm1,%xmm0 pand %xmm1,%xmm2 pand %xmm1,%xmm3 pand %xmm0,%xmm4 pand %xmm0,%xmm5 pxor %xmm0,%xmm0 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqu %xmm4,-16*2($rptr) movdqu %xmm5,-16*1($rptr) add \$32,$num jnz .Lsqr8x_cond_copy mov \$1,%rax mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lsqr8x_epilogue: ret .cfi_endproc .size bn_sqr8x_mont,.-bn_sqr8x_mont ___ }}} if ($addx) {{{ my $bp="%rdx"; # original value $code.=<<___; .globl bn_mulx4x_mont .type bn_mulx4x_mont,\@function,6 .align 32 bn_mulx4x_mont: .cfi_startproc _CET_ENDBR mov %rsp,%rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lmulx4x_prologue: shl \$3,${num}d # convert $num to bytes xor %r10,%r10 sub $num,%r10 # -$num mov ($n0),$n0 # *n0 lea -72(%rsp,%r10),%rbp # future alloca(frame+$num+8) and \$-128,%rbp mov %rsp,%r11 sub 
%rbp,%r11 and \$-4096,%r11 lea (%rbp,%r11),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lmulx4x_page_walk jmp .Lmulx4x_page_walk_done .align 16 .Lmulx4x_page_walk: lea -4096(%rsp),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lmulx4x_page_walk .Lmulx4x_page_walk_done: lea ($bp,$num),%r10 ############################################################## # Stack layout # +0 num # +8 off-loaded &b[i] # +16 end of b[num] # +24 saved n0 # +32 saved rp # +40 saved %rsp # +48 inner counter # +56 # +64 tmp[num+1] # mov $num,0(%rsp) # save $num shr \$5,$num mov %r10,16(%rsp) # end of b[num] sub \$1,$num mov $n0, 24(%rsp) # save *n0 mov $rp, 32(%rsp) # save $rp mov %rax,40(%rsp) # save original %rsp .cfi_cfa_expression %rsp+40,deref,+8 mov $num,48(%rsp) # inner counter jmp .Lmulx4x_body .align 32 .Lmulx4x_body: ___ my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)= ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax"); my $rptr=$bptr; $code.=<<___; lea 8($bp),$bptr mov ($bp),%rdx # b[0], $bp==%rdx actually lea 64+32(%rsp),$tptr mov %rdx,$bi mulx 0*8($aptr),$mi,%rax # a[0]*b[0] mulx 1*8($aptr),%r11,%r14 # a[1]*b[0] add %rax,%r11 mov $bptr,8(%rsp) # off-load &b[i] mulx 2*8($aptr),%r12,%r13 # ... adc %r14,%r12 adc \$0,%r13 mov $mi,$bptr # borrow $bptr imulq 24(%rsp),$mi # "t[0]"*n0 xor $zero,$zero # cf=0, of=0 mulx 3*8($aptr),%rax,%r14 mov $mi,%rdx lea 4*8($aptr),$aptr adcx %rax,%r13 adcx $zero,%r14 # cf=0 mulx 0*8($nptr),%rax,%r10 adcx %rax,$bptr # discarded adox %r11,%r10 mulx 1*8($nptr),%rax,%r11 adcx %rax,%r10 adox %r12,%r11 .byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 # mulx 2*8($nptr),%rax,%r12 mov 48(%rsp),$bptr # counter value mov %r10,-4*8($tptr) adcx %rax,%r11 adox %r13,%r12 mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx mov %r11,-3*8($tptr) adcx %rax,%r12 adox $zero,%r15 # of=0 lea 4*8($nptr),$nptr mov %r12,-2*8($tptr) jmp .Lmulx4x_1st .align 32 .Lmulx4x_1st: adcx $zero,%r15 # cf=0, modulo-scheduled mulx 0*8($aptr),%r10,%rax # a[4]*b[0] adcx %r14,%r10 mulx 1*8($aptr),%r11,%r14 # a[5]*b[0] adcx %rax,%r11 mulx 2*8($aptr),%r12,%rax # ... adcx %r14,%r12 mulx 3*8($aptr),%r13,%r14 .byte 0x67,0x67 mov $mi,%rdx adcx %rax,%r13 adcx $zero,%r14 # cf=0 lea 4*8($aptr),$aptr lea 4*8($tptr),$tptr adox %r15,%r10 mulx 0*8($nptr),%rax,%r15 adcx %rax,%r10 adox %r15,%r11 mulx 1*8($nptr),%rax,%r15 adcx %rax,%r11 adox %r15,%r12 mulx 2*8($nptr),%rax,%r15 mov %r10,-5*8($tptr) adcx %rax,%r12 mov %r11,-4*8($tptr) adox %r15,%r13 mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx mov %r12,-3*8($tptr) adcx %rax,%r13 adox $zero,%r15 lea 4*8($nptr),$nptr mov %r13,-2*8($tptr) dec $bptr # of=0, pass cf jnz .Lmulx4x_1st mov 0(%rsp),$num # load num mov 8(%rsp),$bptr # re-load &b[i] adc $zero,%r15 # modulo-scheduled add %r15,%r14 sbb %r15,%r15 # top-most carry mov %r14,-1*8($tptr) jmp .Lmulx4x_outer .align 32 .Lmulx4x_outer: mov ($bptr),%rdx # b[i] lea 8($bptr),$bptr # b++ sub $num,$aptr # rewind $aptr mov %r15,($tptr) # save top-most carry lea 64+4*8(%rsp),$tptr sub $num,$nptr # rewind $nptr mulx 0*8($aptr),$mi,%r11 # a[0]*b[i] xor %ebp,%ebp # xor $zero,$zero # cf=0, of=0 mov %rdx,$bi mulx 1*8($aptr),%r14,%r12 # a[1]*b[i] adox -4*8($tptr),$mi adcx %r14,%r11 mulx 2*8($aptr),%r15,%r13 # ... 
adox -3*8($tptr),%r11 adcx %r15,%r12 adox -2*8($tptr),%r12 adcx $zero,%r13 adox $zero,%r13 mov $bptr,8(%rsp) # off-load &b[i] mov $mi,%r15 imulq 24(%rsp),$mi # "t[0]"*n0 xor %ebp,%ebp # xor $zero,$zero # cf=0, of=0 mulx 3*8($aptr),%rax,%r14 mov $mi,%rdx adcx %rax,%r13 adox -1*8($tptr),%r13 adcx $zero,%r14 lea 4*8($aptr),$aptr adox $zero,%r14 mulx 0*8($nptr),%rax,%r10 adcx %rax,%r15 # discarded adox %r11,%r10 mulx 1*8($nptr),%rax,%r11 adcx %rax,%r10 adox %r12,%r11 mulx 2*8($nptr),%rax,%r12 mov %r10,-4*8($tptr) adcx %rax,%r11 adox %r13,%r12 mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx mov %r11,-3*8($tptr) lea 4*8($nptr),$nptr adcx %rax,%r12 adox $zero,%r15 # of=0 mov 48(%rsp),$bptr # counter value mov %r12,-2*8($tptr) jmp .Lmulx4x_inner .align 32 .Lmulx4x_inner: mulx 0*8($aptr),%r10,%rax # a[4]*b[i] adcx $zero,%r15 # cf=0, modulo-scheduled adox %r14,%r10 mulx 1*8($aptr),%r11,%r14 # a[5]*b[i] adcx 0*8($tptr),%r10 adox %rax,%r11 mulx 2*8($aptr),%r12,%rax # ... adcx 1*8($tptr),%r11 adox %r14,%r12 mulx 3*8($aptr),%r13,%r14 mov $mi,%rdx adcx 2*8($tptr),%r12 adox %rax,%r13 adcx 3*8($tptr),%r13 adox $zero,%r14 # of=0 lea 4*8($aptr),$aptr lea 4*8($tptr),$tptr adcx $zero,%r14 # cf=0 adox %r15,%r10 mulx 0*8($nptr),%rax,%r15 adcx %rax,%r10 adox %r15,%r11 mulx 1*8($nptr),%rax,%r15 adcx %rax,%r11 adox %r15,%r12 mulx 2*8($nptr),%rax,%r15 mov %r10,-5*8($tptr) adcx %rax,%r12 adox %r15,%r13 mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx mov %r11,-4*8($tptr) mov %r12,-3*8($tptr) adcx %rax,%r13 adox $zero,%r15 lea 4*8($nptr),$nptr mov %r13,-2*8($tptr) dec $bptr # of=0, pass cf jnz .Lmulx4x_inner mov 0(%rsp),$num # load num mov 8(%rsp),$bptr # re-load &b[i] adc $zero,%r15 # modulo-scheduled sub 0*8($tptr),$zero # pull top-most carry adc %r15,%r14 sbb %r15,%r15 # top-most carry mov %r14,-1*8($tptr) cmp 16(%rsp),$bptr jne .Lmulx4x_outer lea 64(%rsp),$tptr sub $num,$nptr # rewind $nptr neg %r15 mov $num,%rdx shr \$3+2,$num # %cf=0 mov 32(%rsp),$rptr # restore rp jmp .Lmulx4x_sub .align 32 .Lmulx4x_sub: mov 8*0($tptr),%r11 mov 8*1($tptr),%r12 mov 8*2($tptr),%r13 mov 8*3($tptr),%r14 lea 8*4($tptr),$tptr sbb 8*0($nptr),%r11 sbb 8*1($nptr),%r12 sbb 8*2($nptr),%r13 sbb 8*3($nptr),%r14 lea 8*4($nptr),$nptr mov %r11,8*0($rptr) mov %r12,8*1($rptr) mov %r13,8*2($rptr) mov %r14,8*3($rptr) lea 8*4($rptr),$rptr dec $num # preserves %cf jnz .Lmulx4x_sub sbb \$0,%r15 # top-most carry lea 64(%rsp),$tptr sub %rdx,$rptr # rewind movq %r15,%xmm1 pxor %xmm0,%xmm0 pshufd \$0,%xmm1,%xmm1 mov 40(%rsp),%rsi # restore %rsp .cfi_def_cfa %rsi,8 jmp .Lmulx4x_cond_copy .align 32 .Lmulx4x_cond_copy: movdqa 16*0($tptr),%xmm2 movdqa 16*1($tptr),%xmm3 lea 16*2($tptr),$tptr movdqu 16*0($rptr),%xmm4 movdqu 16*1($rptr),%xmm5 lea 16*2($rptr),$rptr movdqa %xmm0,-16*2($tptr) # zero tp movdqa %xmm0,-16*1($tptr) pcmpeqd %xmm1,%xmm0 pand %xmm1,%xmm2 pand %xmm1,%xmm3 pand %xmm0,%xmm4 pand %xmm0,%xmm5 pxor %xmm0,%xmm0 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqu %xmm4,-16*2($rptr) movdqu %xmm5,-16*1($rptr) sub \$32,%rdx jnz .Lmulx4x_cond_copy mov %rdx,($tptr) mov \$1,%rax mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lmulx4x_epilogue: ret .cfi_endproc .size bn_mulx4x_mont,.-bn_mulx4x_mont ___ }}} $code.=<<___; .asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by " .align 16 ___ # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # 
CONTEXT *context,DISPATCHER_CONTEXT *disp) if ($win64) { $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type mul_handler,\@abi-omnipotent .align 16 mul_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # end of prologue label cmp %r10,%rbx # context->RipRsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail mov 192($context),%r10 # pull $num mov 8(%rax,%r10,8),%rax # pull saved stack pointer jmp .Lcommon_pop_regs .size mul_handler,.-mul_handler .type sqr_handler,\@abi-omnipotent .align 16 sqr_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # end of prologue label cmp %r10,%rbx # context->Rip<.Lsqr_prologue jb .Lcommon_seh_tail mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # body label cmp %r10,%rbx # context->Rip<.Lsqr_body jb .Lcommon_pop_regs mov 152($context),%rax # pull context->Rsp mov 8(%r11),%r10d # HandlerData[2] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue jae .Lcommon_seh_tail mov 40(%rax),%rax # pull saved stack pointer .Lcommon_pop_regs: mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 mov -32(%rax),%r13 mov -40(%rax),%r14 mov -48(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 .Lcommon_seh_tail: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size sqr_handler,.-sqr_handler .section .pdata .align 4 .rva .LSEH_begin_bn_mul_mont_nohw .rva .LSEH_end_bn_mul_mont_nohw .rva .LSEH_info_bn_mul_mont_nohw .rva .LSEH_begin_bn_mul4x_mont .rva .LSEH_end_bn_mul4x_mont .rva .LSEH_info_bn_mul4x_mont .rva .LSEH_begin_bn_sqr8x_mont .rva .LSEH_end_bn_sqr8x_mont .rva .LSEH_info_bn_sqr8x_mont ___ $code.=<<___ if ($addx); .rva .LSEH_begin_bn_mulx4x_mont .rva .LSEH_end_bn_mulx4x_mont .rva .LSEH_info_bn_mulx4x_mont ___ $code.=<<___; .section .xdata .align 8 .LSEH_info_bn_mul_mont_nohw: .byte 
9,0,0,0 .rva mul_handler .rva .Lmul_body,.Lmul_epilogue # HandlerData[] .LSEH_info_bn_mul4x_mont: .byte 9,0,0,0 .rva mul_handler .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[] .LSEH_info_bn_sqr8x_mont: .byte 9,0,0,0 .rva sqr_handler .rva .Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[] .align 8 ___ $code.=<<___ if ($addx); .LSEH_info_bn_mulx4x_mont: .byte 9,0,0,0 .rva sqr_handler .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] .align 8 ___ } print $code; close STDOUT or die "error closing STDOUT: $!"; ring-0.17.14/crypto/fipsmodule/bn/asm/x86_64-mont5.pl000064400000000000000000002255511046102023000201500ustar 00000000000000#! /usr/bin/env perl # Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. # ==================================================================== # August 2011. # # Companion to x86_64-mont.pl that optimizes cache-timing attack # countermeasures. The subroutines are produced by replacing bp[i] # references in their x86_64-mont.pl counterparts with cache-neutral # references to powers table computed in BN_mod_exp_mont_consttime. # In addition subroutine that scatters elements of the powers table # is implemented, so that scatter-/gathering can be tuned without # bn_exp.c modifications. # August 2013. # # Add MULX/AD*X code paths and additional interfaces to optimize for # branch prediction unit. For input lengths that are multiples of 8 # the np argument is not just modulus value, but one interleaved # with 0. This is to optimize post-condition... $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; # In upstream, this is controlled by shelling out to the compiler to check # versions, but BoringSSL is intended to be used with pre-generated perlasm # output, so this isn't useful anyway. $addx = 1; # int bn_mul_mont_gather5_nohw( $rp="%rdi"; # BN_ULONG *rp, $ap="%rsi"; # const BN_ULONG *ap, $bp="%rdx"; # const BN_ULONG *bp, $np="%rcx"; # const BN_ULONG *np, $n0="%r8"; # const BN_ULONG *n0, $num="%r9"; # int num, # int idx); # 0 to 2^5-1, "index" in $bp holding # pre-computed powers of a', interlaced # in such manner that b[0] is $bp[idx], # b[1] is [2^5+idx], etc. 
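The gather5 interface described above is the cache-timing countermeasure this file exists for: the 32 pre-computed powers are stored word-interleaved (word 0 of power idx at bp[idx], word 1 at bp[2^5+idx], and so on), and every lookup reads and masks all 32 candidates, so the memory trace is independent of the secret index. Below is a scalar Rust sketch of that gather; the helper name and layout constant are hypothetical, and the assembly builds the masks 128 bits at a time with pcmpeqd and combines them with pand/por instead of this word-at-a-time loop.

```rust
/// Constant-time gather of power `idx` (0..31) from a table stored
/// word-interleaved: entry `w*32 + k` holds word `w` of power `k`, matching
/// the "b[0] is bp[idx], b[1] is bp[2^5+idx]" layout described above.
/// Every table word is read and masked, so the access pattern does not
/// depend on idx.
fn gather5(table: &[u64], num_words: usize, idx: usize) -> Vec<u64> {
    assert!(idx < 32);
    assert_eq!(table.len(), num_words * 32);
    let mut out = vec![0u64; num_words];
    for w in 0..num_words {
        for k in 0..32 {
            // 0xFFFF..FF when k == idx, 0 otherwise.
            let mask = ((k == idx) as u64).wrapping_neg();
            out[w] |= table[w * 32 + k] & mask;
        }
    }
    out
}
```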
$lo0="%r10"; $hi0="%r11"; $hi1="%r13"; $i="%r14"; $j="%r15"; $m0="%rbx"; $m1="%rbp"; $code=<<___; .text ___ {{{ my @A=("%r10","%r11"); my @N=("%r13","%rdi"); $code.=<<___; .globl bn_mul4x_mont_gather5 .type bn_mul4x_mont_gather5,\@function,6 .align 32 bn_mul4x_mont_gather5: .cfi_startproc _CET_ENDBR .byte 0x67 mov %rsp,%rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lmul4x_prologue: .byte 0x67 # num is declared as an int, a 32-bit parameter, so the upper half is # undefined. It is important that this write to ${num}, which zeros the # upper half, predates the first access. shl \$3,${num}d # convert $num to bytes lea ($num,$num,2),%r10 # 3*$num in bytes neg $num # -$num ############################################################## # Ensure that stack frame doesn't alias with $rptr+3*$num # modulo 4096, which covers ret[num], am[num] and n[num] # (see bn_exp.c). This is done to allow memory disambiguation # logic do its magic. [Extra [num] is allocated in order # to align with bn_power5's frame, which is cleansed after # completing exponentiation. Extra 256 bytes is for power mask # calculated from 7th argument, the index.] # lea -320(%rsp,$num,2),%r11 mov %rsp,%rbp sub $rp,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lmul4xsp_alt sub %r11,%rbp # align with $rp lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) jmp .Lmul4xsp_done .align 32 .Lmul4xsp_alt: lea 4096-320(,$num,2),%r10 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 sub %r11,%rbp .Lmul4xsp_done: and \$-64,%rbp mov %rsp,%r11 sub %rbp,%r11 and \$-4096,%r11 lea (%rbp,%r11),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lmul4x_page_walk jmp .Lmul4x_page_walk_done .Lmul4x_page_walk: lea -4096(%rsp),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lmul4x_page_walk .Lmul4x_page_walk_done: neg $num mov %rax,40(%rsp) .cfi_cfa_expression %rsp+40,deref,+8 .Lmul4x_body: call mul4x_internal mov 40(%rsp),%rsi # restore %rsp .cfi_def_cfa %rsi,8 mov \$1,%rax mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lmul4x_epilogue: ret .cfi_endproc .size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 .type mul4x_internal,\@abi-omnipotent .align 32 mul4x_internal: .cfi_startproc shl \$5,$num # $num was in bytes movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument, index lea .Linc(%rip),%rax lea 128(%rdx,$num),%r13 # end of powers table (+size optimization) shr \$5,$num # restore $num ___ $bp="%r12"; $STRIDE=2**5*8; # 5 is "window size" $tp=$i; $code.=<<___; movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 lea 88-112(%rsp,$num),%r10 # place the mask after tp[num+1] (+ICache optimization) lea 128(%rdx),$bp # size optimization pshufd \$0,%xmm5,%xmm5 # broadcast index movdqa %xmm1,%xmm4 .byte 0x67,0x67 movdqa %xmm1,%xmm2 ___ ######################################################################## # Calculate masks by comparing 0..31 to $idx and save result to stack. # # We compute sixteen 16-byte masks and store them on the stack. Mask i is stored # in `16*i - 128`(%rax) and contains the comparisons for idx == 2*i and # idx == 2*i + 1 in its lower and upper halves, respectively. 
Mask calculations # are scheduled in groups of four. $code.=<<___; paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 # compare to 1,0 .byte 0x67 movdqa %xmm4,%xmm3 ___ for($i=0;$i<$STRIDE/16-4;$i+=4) { $code.=<<___; paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 # compare to 3,2 movdqa %xmm0,`16*($i+0)+112`(%r10) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 # compare to 5,4 movdqa %xmm1,`16*($i+1)+112`(%r10) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 # compare to 7,6 movdqa %xmm2,`16*($i+2)+112`(%r10) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,`16*($i+3)+112`(%r10) movdqa %xmm4,%xmm3 ___ } $code.=<<___; # last iteration can be optimized paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,`16*($i+0)+112`(%r10) paddd %xmm2,%xmm3 .byte 0x67 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,`16*($i+1)+112`(%r10) pcmpeqd %xmm5,%xmm3 movdqa %xmm2,`16*($i+2)+112`(%r10) pand `16*($i+0)-128`($bp),%xmm0 # while it's still in register pand `16*($i+1)-128`($bp),%xmm1 pand `16*($i+2)-128`($bp),%xmm2 movdqa %xmm3,`16*($i+3)+112`(%r10) pand `16*($i+3)-128`($bp),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 ___ for($i=0;$i<$STRIDE/16-4;$i+=4) { $code.=<<___; movdqa `16*($i+0)-128`($bp),%xmm4 movdqa `16*($i+1)-128`($bp),%xmm5 movdqa `16*($i+2)-128`($bp),%xmm2 pand `16*($i+0)+112`(%r10),%xmm4 movdqa `16*($i+3)-128`($bp),%xmm3 pand `16*($i+1)+112`(%r10),%xmm5 por %xmm4,%xmm0 pand `16*($i+2)+112`(%r10),%xmm2 por %xmm5,%xmm1 pand `16*($i+3)+112`(%r10),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 ___ } $code.=<<___; por %xmm1,%xmm0 # Combine the upper and lower halves of %xmm0. pshufd \$0x4e,%xmm0,%xmm1 # Swap upper and lower halves. por %xmm1,%xmm0 lea $STRIDE($bp),$bp movq %xmm0,$m0 # m0=bp[0] mov %r13,16+8(%rsp) # save end of b[num] mov $rp, 56+8(%rsp) # save $rp mov ($n0),$n0 # pull n0[0] value mov ($ap),%rax lea ($ap,$num),$ap # end of a[num] neg $num mov $n0,$m1 mulq $m0 # ap[0]*bp[0] mov %rax,$A[0] mov ($np),%rax imulq $A[0],$m1 # "tp[0]"*n0 lea 64+8(%rsp),$tp mov %rdx,$A[1] mulq $m1 # np[0]*m1 add %rax,$A[0] # discarded mov 8($ap,$num),%rax adc \$0,%rdx mov %rdx,$N[1] mulq $m0 add %rax,$A[1] mov 8*1($np),%rax adc \$0,%rdx mov %rdx,$A[0] mulq $m1 add %rax,$N[1] mov 16($ap,$num),%rax adc \$0,%rdx add $A[1],$N[1] lea 4*8($num),$j # j=4 lea 8*4($np),$np adc \$0,%rdx mov $N[1],($tp) mov %rdx,$N[0] jmp .L1st4x .align 32 .L1st4x: mulq $m0 # ap[j]*bp[0] add %rax,$A[0] mov -8*2($np),%rax lea 32($tp),$tp adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov -8($ap,$j),%rax adc \$0,%rdx add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $N[0],-24($tp) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[0] add %rax,$A[1] mov -8*1($np),%rax adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov ($ap,$j),%rax adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $N[1],-16($tp) # tp[j-1] mov %rdx,$N[0] mulq $m0 # ap[j]*bp[0] add %rax,$A[0] mov 8*0($np),%rax adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov 8($ap,$j),%rax adc \$0,%rdx add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $N[0],-8($tp) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[0] add %rax,$A[1] mov 8*1($np),%rax adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov 16($ap,$j),%rax adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] lea 8*4($np),$np adc \$0,%rdx mov $N[1],($tp) # tp[j-1] mov %rdx,$N[0] add \$32,$j # j+=4 jnz .L1st4x mulq $m0 # ap[j]*bp[0] add %rax,$A[0] mov -8*2($np),%rax lea 32($tp),$tp adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov -8($ap),%rax 
adc \$0,%rdx add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $N[0],-24($tp) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[0] add %rax,$A[1] mov -8*1($np),%rax adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov ($ap,$num),%rax # ap[0] adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $N[1],-16($tp) # tp[j-1] mov %rdx,$N[0] lea ($np,$num),$np # rewind $np xor $N[1],$N[1] add $A[0],$N[0] adc \$0,$N[1] mov $N[0],-8($tp) jmp .Louter4x .align 32 .Louter4x: lea 16+128($tp),%rdx # where 256-byte mask is (+size optimization) pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 ___ for($i=0;$i<$STRIDE/16;$i+=4) { $code.=<<___; movdqa `16*($i+0)-128`($bp),%xmm0 movdqa `16*($i+1)-128`($bp),%xmm1 movdqa `16*($i+2)-128`($bp),%xmm2 movdqa `16*($i+3)-128`($bp),%xmm3 pand `16*($i+0)-128`(%rdx),%xmm0 pand `16*($i+1)-128`(%rdx),%xmm1 por %xmm0,%xmm4 pand `16*($i+2)-128`(%rdx),%xmm2 por %xmm1,%xmm5 pand `16*($i+3)-128`(%rdx),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 ___ } $code.=<<___; por %xmm5,%xmm4 # Combine the upper and lower halves of %xmm4 as %xmm0. pshufd \$0x4e,%xmm4,%xmm0 # Swap upper and lower halves. por %xmm4,%xmm0 lea $STRIDE($bp),$bp movq %xmm0,$m0 # m0=bp[i] mov ($tp,$num),$A[0] mov $n0,$m1 mulq $m0 # ap[0]*bp[i] add %rax,$A[0] # ap[0]*bp[i]+tp[0] mov ($np),%rax adc \$0,%rdx imulq $A[0],$m1 # tp[0]*n0 mov %rdx,$A[1] mov $N[1],($tp) # store upmost overflow bit lea ($tp,$num),$tp # rewind $tp mulq $m1 # np[0]*m1 add %rax,$A[0] # "$N[0]", discarded mov 8($ap,$num),%rax adc \$0,%rdx mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] add %rax,$A[1] mov 8*1($np),%rax adc \$0,%rdx add 8($tp),$A[1] # +tp[1] adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov 16($ap,$num),%rax adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] lea 4*8($num),$j # j=4 lea 8*4($np),$np adc \$0,%rdx mov %rdx,$N[0] jmp .Linner4x .align 32 .Linner4x: mulq $m0 # ap[j]*bp[i] add %rax,$A[0] mov -8*2($np),%rax adc \$0,%rdx add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] lea 32($tp),$tp adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov -8($ap,$j),%rax adc \$0,%rdx add $A[0],$N[0] adc \$0,%rdx mov $N[1],-32($tp) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] add %rax,$A[1] mov -8*1($np),%rax adc \$0,%rdx add -8($tp),$A[1] adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov ($ap,$j),%rax adc \$0,%rdx add $A[1],$N[1] adc \$0,%rdx mov $N[0],-24($tp) # tp[j-1] mov %rdx,$N[0] mulq $m0 # ap[j]*bp[i] add %rax,$A[0] mov 8*0($np),%rax adc \$0,%rdx add ($tp),$A[0] # ap[j]*bp[i]+tp[j] adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov 8($ap,$j),%rax adc \$0,%rdx add $A[0],$N[0] adc \$0,%rdx mov $N[1],-16($tp) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] add %rax,$A[1] mov 8*1($np),%rax adc \$0,%rdx add 8($tp),$A[1] adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov 16($ap,$j),%rax adc \$0,%rdx add $A[1],$N[1] lea 8*4($np),$np adc \$0,%rdx mov $N[0],-8($tp) # tp[j-1] mov %rdx,$N[0] add \$32,$j # j+=4 jnz .Linner4x mulq $m0 # ap[j]*bp[i] add %rax,$A[0] mov -8*2($np),%rax adc \$0,%rdx add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] lea 32($tp),$tp adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov -8($ap),%rax adc \$0,%rdx add $A[0],$N[0] adc \$0,%rdx mov $N[1],-32($tp) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] add %rax,$A[1] mov $m1,%rax mov -8*1($np),$m1 adc \$0,%rdx add -8($tp),$A[1] adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov ($ap,$num),%rax # ap[0] adc \$0,%rdx add $A[1],$N[1] adc \$0,%rdx mov $N[0],-24($tp) # tp[j-1] 
mov %rdx,$N[0] mov $N[1],-16($tp) # tp[j-1] lea ($np,$num),$np # rewind $np xor $N[1],$N[1] add $A[0],$N[0] adc \$0,$N[1] add ($tp),$N[0] # pull upmost overflow bit adc \$0,$N[1] # upmost overflow bit mov $N[0],-8($tp) cmp 16+8(%rsp),$bp jb .Louter4x ___ if (1) { $code.=<<___; xor %rax,%rax sub $N[0],$m1 # compare top-most words adc $j,$j # $j is zero or $j,$N[1] sub $N[1],%rax # %rax=-$N[1] lea ($tp,$num),%rbx # tptr in .sqr4x_sub mov ($np),%r12 lea ($np),%rbp # nptr in .sqr4x_sub mov %r9,%rcx sar \$3+2,%rcx mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub dec %r12 # so that after 'not' we get -n[0] xor %r10,%r10 mov 8*1(%rbp),%r13 mov 8*2(%rbp),%r14 mov 8*3(%rbp),%r15 jmp .Lsqr4x_sub_entry ___ } $code.=<<___; .cfi_endproc .size mul4x_internal,.-mul4x_internal ___ }}} {{{ ###################################################################### # void bn_power5_nohw( my $rptr="%rdi"; # BN_ULONG *rptr, my $aptr="%rsi"; # const BN_ULONG *aptr, my $bptr="%rdx"; # const void *table, my $nptr="%rcx"; # const BN_ULONG *nptr, my $n0 ="%r8"; # const BN_ULONG *n0); my $num ="%r9"; # int num, has to be divisible by 8 # int pwr my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); my @A0=("%r10","%r11"); my @A1=("%r12","%r13"); my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); $code.=<<___; .globl bn_power5_nohw .type bn_power5_nohw,\@function,6 .align 32 bn_power5_nohw: .cfi_startproc _CET_ENDBR mov %rsp,%rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lpower5_prologue: # num is declared as an int, a 32-bit parameter, so the upper half is # undefined. It is important that this write to ${num}, which zeros the # upper half, come before the first access. shl \$3,${num}d # convert $num to bytes lea ($num,$num,2),%r10d # 3*$num neg $num mov ($n0),$n0 # *n0 ############################################################## # Ensure that stack frame doesn't alias with $rptr+3*$num # modulo 4096, which covers ret[num], am[num] and n[num] # (see bn_exp.c). This is done to allow memory disambiguation # logic do its magic. [Extra 256 bytes is for power mask # calculated from 7th argument, the index.] 
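bn_power5_nohw, whose prologue appears above, performs one window step of the constant-time fixed-window exponentiation: five Montgomery squarings (the repeated __bn_sqr8x_internal/__bn_post4x_internal calls just below) followed by one Montgomery multiplication whose b operand is gathered from the powers table by the index. The sketch below shows only that square-five-then-multiply shape, using ordinary single-word modular arithmetic and toy numbers instead of the Montgomery-domain multi-limb routines; the helper name is hypothetical.

```rust
/// Shape of one bn_power5 call: square the accumulator five times, then
/// multiply by the power selected by the next 5 exponent bits (which the
/// real code fetches with the constant-time gather).
fn power5_step(acc: u64, table: &[u64; 32], window: u8, n: u64) -> u64 {
    let mut r = acc as u128;
    for _ in 0..5 {
        r = (r * r) % n as u128; // acc^(2^5) after the loop
    }
    (r * table[window as usize] as u128 % n as u128) as u64
}

fn main() {
    // 3^11 mod 1009, with the exponent fitting in a single 5-bit window.
    let n = 1009u64;
    let mut table = [0u64; 32];
    let mut p = 1u128;
    for k in 0usize..32 {
        table[k] = p as u64;
        p = p * 3 % n as u128;
    }
    let r = power5_step(1, &table, 0b01011, n);
    assert_eq!(r, 3u64.pow(11) % n);
}
```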
# lea -320(%rsp,$num,2),%r11 mov %rsp,%rbp sub $rptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lpwr_sp_alt sub %r11,%rbp # align with $aptr lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) jmp .Lpwr_sp_done .align 32 .Lpwr_sp_alt: lea 4096-320(,$num,2),%r10 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 sub %r11,%rbp .Lpwr_sp_done: and \$-64,%rbp mov %rsp,%r11 sub %rbp,%r11 and \$-4096,%r11 lea (%rbp,%r11),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lpwr_page_walk jmp .Lpwr_page_walk_done .Lpwr_page_walk: lea -4096(%rsp),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lpwr_page_walk .Lpwr_page_walk_done: mov $num,%r10 neg $num ############################################################## # Stack layout # # +0 saved $num, used in reduction section # +8 &t[2*$num], used in reduction section # +32 saved *n0 # +40 saved %rsp # +48 t[2*$num] # mov $n0, 32(%rsp) mov %rax, 40(%rsp) # save original %rsp .cfi_cfa_expression %rsp+40,deref,+8 .Lpower5_body: movq $rptr,%xmm1 # save $rptr, used in sqr8x movq $nptr,%xmm2 # save $nptr movq %r10, %xmm3 # -$num, used in sqr8x movq $bptr,%xmm4 call __bn_sqr8x_internal call __bn_post4x_internal call __bn_sqr8x_internal call __bn_post4x_internal call __bn_sqr8x_internal call __bn_post4x_internal call __bn_sqr8x_internal call __bn_post4x_internal call __bn_sqr8x_internal call __bn_post4x_internal movq %xmm2,$nptr movq %xmm4,$bptr mov $aptr,$rptr mov 40(%rsp),%rax lea 32(%rsp),$n0 call mul4x_internal mov 40(%rsp),%rsi # restore %rsp .cfi_def_cfa %rsi,8 mov \$1,%rax mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lpower5_epilogue: ret .cfi_endproc .size bn_power5_nohw,.-bn_power5_nohw .globl bn_sqr8x_internal .hidden bn_sqr8x_internal .type bn_sqr8x_internal,\@abi-omnipotent .align 32 bn_sqr8x_internal: __bn_sqr8x_internal: .cfi_startproc _CET_ENDBR ############################################################## # Squaring part: # # a) multiply-n-add everything but a[i]*a[i]; # b) shift result of a) by 1 to the left and accumulate # a[i]*a[i] products; # ############################################################## # a[1]a[0] # a[2]a[0] # a[3]a[0] # a[2]a[1] # a[4]a[0] # a[3]a[1] # a[5]a[0] # a[4]a[1] # a[3]a[2] # a[6]a[0] # a[5]a[1] # a[4]a[2] # a[7]a[0] # a[6]a[1] # a[5]a[2] # a[4]a[3] # a[7]a[1] # a[6]a[2] # a[5]a[3] # a[7]a[2] # a[6]a[3] # a[5]a[4] # a[7]a[3] # a[6]a[4] # a[7]a[4] # a[6]a[5] # a[7]a[5] # a[7]a[6] # a[1]a[0] # a[2]a[0] # a[3]a[0] # a[4]a[0] # a[5]a[0] # a[6]a[0] # a[7]a[0] # a[2]a[1] # a[3]a[1] # a[4]a[1] # a[5]a[1] # a[6]a[1] # a[7]a[1] # a[3]a[2] # a[4]a[2] # a[5]a[2] # a[6]a[2] # a[7]a[2] # a[4]a[3] # a[5]a[3] # a[6]a[3] # a[7]a[3] # a[5]a[4] # a[6]a[4] # a[7]a[4] # a[6]a[5] # a[7]a[5] # a[7]a[6] # a[0]a[0] # a[1]a[1] # a[2]a[2] # a[3]a[3] # a[4]a[4] # a[5]a[5] # a[6]a[6] # a[7]a[7] lea 32(%r10),$i # $i=-($num-32) lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2] mov $num,$j # $j=$num # comments apply to $num==8 case mov -32($aptr,$i),$a0 # a[0] lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] mov -24($aptr,$i),%rax # a[1] lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] mov -16($aptr,$i),$ai # a[2] mov %rax,$a1 mul $a0 # a[1]*a[0] mov %rax,$A0[0] # a[1]*a[0] mov $ai,%rax # a[2] mov %rdx,$A0[1] mov $A0[0],-24($tptr,$i) # t[1] mul $a0 
# a[2]*a[0] add %rax,$A0[1] mov $ai,%rax adc \$0,%rdx mov $A0[1],-16($tptr,$i) # t[2] mov %rdx,$A0[0] mov -8($aptr,$i),$ai # a[3] mul $a1 # a[2]*a[1] mov %rax,$A1[0] # a[2]*a[1]+t[3] mov $ai,%rax mov %rdx,$A1[1] lea ($i),$j mul $a0 # a[3]*a[0] add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] mov $ai,%rax mov %rdx,$A0[1] adc \$0,$A0[1] add $A1[0],$A0[0] adc \$0,$A0[1] mov $A0[0],-8($tptr,$j) # t[3] jmp .Lsqr4x_1st .align 32 .Lsqr4x_1st: mov ($aptr,$j),$ai # a[4] mul $a1 # a[3]*a[1] add %rax,$A1[1] # a[3]*a[1]+t[4] mov $ai,%rax mov %rdx,$A1[0] adc \$0,$A1[0] mul $a0 # a[4]*a[0] add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] mov $ai,%rax # a[3] mov 8($aptr,$j),$ai # a[5] mov %rdx,$A0[0] adc \$0,$A0[0] add $A1[1],$A0[1] adc \$0,$A0[0] mul $a1 # a[4]*a[3] add %rax,$A1[0] # a[4]*a[3]+t[5] mov $ai,%rax mov $A0[1],($tptr,$j) # t[4] mov %rdx,$A1[1] adc \$0,$A1[1] mul $a0 # a[5]*a[2] add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] mov $ai,%rax mov 16($aptr,$j),$ai # a[6] mov %rdx,$A0[1] adc \$0,$A0[1] add $A1[0],$A0[0] adc \$0,$A0[1] mul $a1 # a[5]*a[3] add %rax,$A1[1] # a[5]*a[3]+t[6] mov $ai,%rax mov $A0[0],8($tptr,$j) # t[5] mov %rdx,$A1[0] adc \$0,$A1[0] mul $a0 # a[6]*a[2] add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6] mov $ai,%rax # a[3] mov 24($aptr,$j),$ai # a[7] mov %rdx,$A0[0] adc \$0,$A0[0] add $A1[1],$A0[1] adc \$0,$A0[0] mul $a1 # a[6]*a[5] add %rax,$A1[0] # a[6]*a[5]+t[7] mov $ai,%rax mov $A0[1],16($tptr,$j) # t[6] mov %rdx,$A1[1] adc \$0,$A1[1] lea 32($j),$j mul $a0 # a[7]*a[4] add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6] mov $ai,%rax mov %rdx,$A0[1] adc \$0,$A0[1] add $A1[0],$A0[0] adc \$0,$A0[1] mov $A0[0],-8($tptr,$j) # t[7] cmp \$0,$j jne .Lsqr4x_1st mul $a1 # a[7]*a[5] add %rax,$A1[1] lea 16($i),$i adc \$0,%rdx add $A0[1],$A1[1] adc \$0,%rdx mov $A1[1],($tptr) # t[8] mov %rdx,$A1[0] mov %rdx,8($tptr) # t[9] jmp .Lsqr4x_outer .align 32 .Lsqr4x_outer: # comments apply to $num==6 case mov -32($aptr,$i),$a0 # a[0] lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] mov -24($aptr,$i),%rax # a[1] lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] mov -16($aptr,$i),$ai # a[2] mov %rax,$a1 mul $a0 # a[1]*a[0] mov -24($tptr,$i),$A0[0] # t[1] add %rax,$A0[0] # a[1]*a[0]+t[1] mov $ai,%rax # a[2] adc \$0,%rdx mov $A0[0],-24($tptr,$i) # t[1] mov %rdx,$A0[1] mul $a0 # a[2]*a[0] add %rax,$A0[1] mov $ai,%rax adc \$0,%rdx add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2] mov %rdx,$A0[0] adc \$0,$A0[0] mov $A0[1],-16($tptr,$i) # t[2] xor $A1[0],$A1[0] mov -8($aptr,$i),$ai # a[3] mul $a1 # a[2]*a[1] add %rax,$A1[0] # a[2]*a[1]+t[3] mov $ai,%rax adc \$0,%rdx add -8($tptr,$i),$A1[0] mov %rdx,$A1[1] adc \$0,$A1[1] mul $a0 # a[3]*a[0] add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] mov $ai,%rax adc \$0,%rdx add $A1[0],$A0[0] mov %rdx,$A0[1] adc \$0,$A0[1] mov $A0[0],-8($tptr,$i) # t[3] lea ($i),$j jmp .Lsqr4x_inner .align 32 .Lsqr4x_inner: mov ($aptr,$j),$ai # a[4] mul $a1 # a[3]*a[1] add %rax,$A1[1] # a[3]*a[1]+t[4] mov $ai,%rax mov %rdx,$A1[0] adc \$0,$A1[0] add ($tptr,$j),$A1[1] adc \$0,$A1[0] .byte 0x67 mul $a0 # a[4]*a[0] add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] mov $ai,%rax # a[3] mov 8($aptr,$j),$ai # a[5] mov %rdx,$A0[0] adc \$0,$A0[0] add $A1[1],$A0[1] adc \$0,$A0[0] mul $a1 # a[4]*a[3] add %rax,$A1[0] # a[4]*a[3]+t[5] mov $A0[1],($tptr,$j) # t[4] mov $ai,%rax mov %rdx,$A1[1] adc \$0,$A1[1] add 8($tptr,$j),$A1[0] lea 16($j),$j # j++ adc \$0,$A1[1] mul $a0 # a[5]*a[2] add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] mov $ai,%rax adc \$0,%rdx add $A1[0],$A0[0] mov %rdx,$A0[1] adc \$0,$A0[1] mov 
$A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below cmp \$0,$j jne .Lsqr4x_inner .byte 0x67 mul $a1 # a[5]*a[3] add %rax,$A1[1] adc \$0,%rdx add $A0[1],$A1[1] adc \$0,%rdx mov $A1[1],($tptr) # t[6], "preloaded t[2]" below mov %rdx,$A1[0] mov %rdx,8($tptr) # t[7], "preloaded t[3]" below add \$16,$i jnz .Lsqr4x_outer # comments apply to $num==4 case mov -32($aptr),$a0 # a[0] lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] mov -24($aptr),%rax # a[1] lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] mov -16($aptr),$ai # a[2] mov %rax,$a1 mul $a0 # a[1]*a[0] add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1] mov $ai,%rax # a[2] mov %rdx,$A0[1] adc \$0,$A0[1] mul $a0 # a[2]*a[0] add %rax,$A0[1] mov $ai,%rax mov $A0[0],-24($tptr) # t[1] mov %rdx,$A0[0] adc \$0,$A0[0] add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2] mov -8($aptr),$ai # a[3] adc \$0,$A0[0] mul $a1 # a[2]*a[1] add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3] mov $ai,%rax mov $A0[1],-16($tptr) # t[2] mov %rdx,$A1[1] adc \$0,$A1[1] mul $a0 # a[3]*a[0] add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] mov $ai,%rax mov %rdx,$A0[1] adc \$0,$A0[1] add $A1[0],$A0[0] adc \$0,$A0[1] mov $A0[0],-8($tptr) # t[3] mul $a1 # a[3]*a[1] add %rax,$A1[1] mov -16($aptr),%rax # a[2] adc \$0,%rdx add $A0[1],$A1[1] adc \$0,%rdx mov $A1[1],($tptr) # t[4] mov %rdx,$A1[0] mov %rdx,8($tptr) # t[5] mul $ai # a[2]*a[3] ___ { my ($shift,$carry)=($a0,$a1); my @S=(@A1,$ai,$n0); $code.=<<___; add \$16,$i xor $shift,$shift sub $num,$i # $i=16-$num xor $carry,$carry add $A1[0],%rax # t[5] adc \$0,%rdx mov %rax,8($tptr) # t[5] mov %rdx,16($tptr) # t[6] mov $carry,24($tptr) # t[7] mov -16($aptr,$i),%rax # a[0] lea 48+8(%rsp),$tptr xor $A0[0],$A0[0] # t[0] mov 8($tptr),$A0[1] # t[1] lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift shr \$63,$A0[0] lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | shr \$63,$A0[1] or $A0[0],$S[1] # | t[2*i]>>63 mov 16($tptr),$A0[0] # t[2*i+2] # prefetch mov $A0[1],$shift # shift=t[2*i+1]>>63 mul %rax # a[i]*a[i] neg $carry # mov $carry,cf mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch adc %rax,$S[0] mov -8($aptr,$i),%rax # a[i+1] # prefetch mov $S[0],($tptr) adc %rdx,$S[1] lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift mov $S[1],8($tptr) sbb $carry,$carry # mov cf,$carry shr \$63,$A0[0] lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | shr \$63,$A0[1] or $A0[0],$S[3] # | t[2*i]>>63 mov 32($tptr),$A0[0] # t[2*i+2] # prefetch mov $A0[1],$shift # shift=t[2*i+1]>>63 mul %rax # a[i]*a[i] neg $carry # mov $carry,cf mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch adc %rax,$S[2] mov 0($aptr,$i),%rax # a[i+1] # prefetch mov $S[2],16($tptr) adc %rdx,$S[3] lea 16($i),$i mov $S[3],24($tptr) sbb $carry,$carry # mov cf,$carry lea 64($tptr),$tptr jmp .Lsqr4x_shift_n_add .align 32 .Lsqr4x_shift_n_add: lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift shr \$63,$A0[0] lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | shr \$63,$A0[1] or $A0[0],$S[1] # | t[2*i]>>63 mov -16($tptr),$A0[0] # t[2*i+2] # prefetch mov $A0[1],$shift # shift=t[2*i+1]>>63 mul %rax # a[i]*a[i] neg $carry # mov $carry,cf mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch adc %rax,$S[0] mov -8($aptr,$i),%rax # a[i+1] # prefetch mov $S[0],-32($tptr) adc %rdx,$S[1] lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift mov $S[1],-24($tptr) sbb $carry,$carry # mov cf,$carry shr \$63,$A0[0] lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | shr \$63,$A0[1] or $A0[0],$S[3] # | t[2*i]>>63 mov 0($tptr),$A0[0] # t[2*i+2] # prefetch mov $A0[1],$shift # shift=t[2*i+1]>>63 mul %rax # a[i]*a[i] neg $carry # mov $carry,cf mov 
8($tptr),$A0[1] # t[2*i+2+1] # prefetch adc %rax,$S[2] mov 0($aptr,$i),%rax # a[i+1] # prefetch mov $S[2],-16($tptr) adc %rdx,$S[3] lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift mov $S[3],-8($tptr) sbb $carry,$carry # mov cf,$carry shr \$63,$A0[0] lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | shr \$63,$A0[1] or $A0[0],$S[1] # | t[2*i]>>63 mov 16($tptr),$A0[0] # t[2*i+2] # prefetch mov $A0[1],$shift # shift=t[2*i+1]>>63 mul %rax # a[i]*a[i] neg $carry # mov $carry,cf mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch adc %rax,$S[0] mov 8($aptr,$i),%rax # a[i+1] # prefetch mov $S[0],0($tptr) adc %rdx,$S[1] lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift mov $S[1],8($tptr) sbb $carry,$carry # mov cf,$carry shr \$63,$A0[0] lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | shr \$63,$A0[1] or $A0[0],$S[3] # | t[2*i]>>63 mov 32($tptr),$A0[0] # t[2*i+2] # prefetch mov $A0[1],$shift # shift=t[2*i+1]>>63 mul %rax # a[i]*a[i] neg $carry # mov $carry,cf mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch adc %rax,$S[2] mov 16($aptr,$i),%rax # a[i+1] # prefetch mov $S[2],16($tptr) adc %rdx,$S[3] mov $S[3],24($tptr) sbb $carry,$carry # mov cf,$carry lea 64($tptr),$tptr add \$32,$i jnz .Lsqr4x_shift_n_add lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift .byte 0x67 shr \$63,$A0[0] lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | shr \$63,$A0[1] or $A0[0],$S[1] # | t[2*i]>>63 mov -16($tptr),$A0[0] # t[2*i+2] # prefetch mov $A0[1],$shift # shift=t[2*i+1]>>63 mul %rax # a[i]*a[i] neg $carry # mov $carry,cf mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch adc %rax,$S[0] mov -8($aptr),%rax # a[i+1] # prefetch mov $S[0],-32($tptr) adc %rdx,$S[1] lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift mov $S[1],-24($tptr) sbb $carry,$carry # mov cf,$carry shr \$63,$A0[0] lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | shr \$63,$A0[1] or $A0[0],$S[3] # | t[2*i]>>63 mul %rax # a[i]*a[i] neg $carry # mov $carry,cf adc %rax,$S[2] adc %rdx,$S[3] mov $S[2],-16($tptr) mov $S[3],-8($tptr) ___ } ###################################################################### # Montgomery reduction part, "word-by-word" algorithm. # # This new path is inspired by multiple submissions from Intel, by # Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford, # Vinodh Gopal... 
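#
# A rough C-style sketch of the idea (illustrative only; the helper
# mul_add_words() is hypothetical and the real code below works on an
# 8x-unrolled window of t[] kept in registers):
#
#	for (i = 0; i < num; i++) {
#		m = t[i] * n0;				 /* mod 2^64           */
#		carry = mul_add_words(&t[i], n, m, num); /* t[] += m*n << 64*i */
#		t[i + num] += carry;			 /* t[i] is now zero   */
#	}
#	/* t[num..2*num-1] now holds the result, possibly exceeding n once; */
#	/* the 4x-unrolled post-condition below subtracts n if needed.      */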
{ my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx"); $code.=<<___; movq %xmm2,$nptr __bn_sqr8x_reduction: xor %rax,%rax lea ($nptr,$num),%rcx # end of n[] lea 48+8(%rsp,$num,2),%rdx # end of t[] buffer mov %rcx,0+8(%rsp) lea 48+8(%rsp,$num),$tptr # end of initial t[] window mov %rdx,8+8(%rsp) neg $num jmp .L8x_reduction_loop .align 32 .L8x_reduction_loop: lea ($tptr,$num),$tptr # start of current t[] window .byte 0x66 mov 8*0($tptr),$m0 mov 8*1($tptr),%r9 mov 8*2($tptr),%r10 mov 8*3($tptr),%r11 mov 8*4($tptr),%r12 mov 8*5($tptr),%r13 mov 8*6($tptr),%r14 mov 8*7($tptr),%r15 mov %rax,(%rdx) # store top-most carry bit lea 8*8($tptr),$tptr .byte 0x67 mov $m0,%r8 imulq 32+8(%rsp),$m0 # n0*a[0] mov 8*0($nptr),%rax # n[0] mov \$8,%ecx jmp .L8x_reduce .align 32 .L8x_reduce: mulq $m0 mov 8*1($nptr),%rax # n[1] neg %r8 mov %rdx,%r8 adc \$0,%r8 mulq $m0 add %rax,%r9 mov 8*2($nptr),%rax adc \$0,%rdx add %r9,%r8 mov $m0,48-8+8(%rsp,%rcx,8) # put aside n0*a[i] mov %rdx,%r9 adc \$0,%r9 mulq $m0 add %rax,%r10 mov 8*3($nptr),%rax adc \$0,%rdx add %r10,%r9 mov 32+8(%rsp),$carry # pull n0, borrow $carry mov %rdx,%r10 adc \$0,%r10 mulq $m0 add %rax,%r11 mov 8*4($nptr),%rax adc \$0,%rdx imulq %r8,$carry # modulo-scheduled add %r11,%r10 mov %rdx,%r11 adc \$0,%r11 mulq $m0 add %rax,%r12 mov 8*5($nptr),%rax adc \$0,%rdx add %r12,%r11 mov %rdx,%r12 adc \$0,%r12 mulq $m0 add %rax,%r13 mov 8*6($nptr),%rax adc \$0,%rdx add %r13,%r12 mov %rdx,%r13 adc \$0,%r13 mulq $m0 add %rax,%r14 mov 8*7($nptr),%rax adc \$0,%rdx add %r14,%r13 mov %rdx,%r14 adc \$0,%r14 mulq $m0 mov $carry,$m0 # n0*a[i] add %rax,%r15 mov 8*0($nptr),%rax # n[0] adc \$0,%rdx add %r15,%r14 mov %rdx,%r15 adc \$0,%r15 dec %ecx jnz .L8x_reduce lea 8*8($nptr),$nptr xor %rax,%rax mov 8+8(%rsp),%rdx # pull end of t[] cmp 0+8(%rsp),$nptr # end of n[]? jae .L8x_no_tail .byte 0x66 add 8*0($tptr),%r8 adc 8*1($tptr),%r9 adc 8*2($tptr),%r10 adc 8*3($tptr),%r11 adc 8*4($tptr),%r12 adc 8*5($tptr),%r13 adc 8*6($tptr),%r14 adc 8*7($tptr),%r15 sbb $carry,$carry # top carry mov 48+56+8(%rsp),$m0 # pull n0*a[0] mov \$8,%ecx mov 8*0($nptr),%rax jmp .L8x_tail .align 32 .L8x_tail: mulq $m0 add %rax,%r8 mov 8*1($nptr),%rax mov %r8,($tptr) # save result mov %rdx,%r8 adc \$0,%r8 mulq $m0 add %rax,%r9 mov 8*2($nptr),%rax adc \$0,%rdx add %r9,%r8 lea 8($tptr),$tptr # $tptr++ mov %rdx,%r9 adc \$0,%r9 mulq $m0 add %rax,%r10 mov 8*3($nptr),%rax adc \$0,%rdx add %r10,%r9 mov %rdx,%r10 adc \$0,%r10 mulq $m0 add %rax,%r11 mov 8*4($nptr),%rax adc \$0,%rdx add %r11,%r10 mov %rdx,%r11 adc \$0,%r11 mulq $m0 add %rax,%r12 mov 8*5($nptr),%rax adc \$0,%rdx add %r12,%r11 mov %rdx,%r12 adc \$0,%r12 mulq $m0 add %rax,%r13 mov 8*6($nptr),%rax adc \$0,%rdx add %r13,%r12 mov %rdx,%r13 adc \$0,%r13 mulq $m0 add %rax,%r14 mov 8*7($nptr),%rax adc \$0,%rdx add %r14,%r13 mov %rdx,%r14 adc \$0,%r14 mulq $m0 mov 48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i] add %rax,%r15 adc \$0,%rdx add %r15,%r14 mov 8*0($nptr),%rax # pull n[0] mov %rdx,%r15 adc \$0,%r15 dec %ecx jnz .L8x_tail lea 8*8($nptr),$nptr mov 8+8(%rsp),%rdx # pull end of t[] cmp 0+8(%rsp),$nptr # end of n[]? jae .L8x_tail_done # break out of loop mov 48+56+8(%rsp),$m0 # pull n0*a[0] neg $carry mov 8*0($nptr),%rax # pull n[0] adc 8*0($tptr),%r8 adc 8*1($tptr),%r9 adc 8*2($tptr),%r10 adc 8*3($tptr),%r11 adc 8*4($tptr),%r12 adc 8*5($tptr),%r13 adc 8*6($tptr),%r14 adc 8*7($tptr),%r15 sbb $carry,$carry # top carry mov \$8,%ecx jmp .L8x_tail .align 32 .L8x_tail_done: xor %rax,%rax add (%rdx),%r8 # can this overflow? 
adc \$0,%r9 adc \$0,%r10 adc \$0,%r11 adc \$0,%r12 adc \$0,%r13 adc \$0,%r14 adc \$0,%r15 adc \$0,%rax neg $carry .L8x_no_tail: adc 8*0($tptr),%r8 adc 8*1($tptr),%r9 adc 8*2($tptr),%r10 adc 8*3($tptr),%r11 adc 8*4($tptr),%r12 adc 8*5($tptr),%r13 adc 8*6($tptr),%r14 adc 8*7($tptr),%r15 adc \$0,%rax # top-most carry mov -8($nptr),%rcx # np[num-1] xor $carry,$carry movq %xmm2,$nptr # restore $nptr mov %r8,8*0($tptr) # store top 512 bits mov %r9,8*1($tptr) movq %xmm3,$num # $num is %r9, can't be moved upwards mov %r10,8*2($tptr) mov %r11,8*3($tptr) mov %r12,8*4($tptr) mov %r13,8*5($tptr) mov %r14,8*6($tptr) mov %r15,8*7($tptr) lea 8*8($tptr),$tptr cmp %rdx,$tptr # end of t[]? jb .L8x_reduction_loop ret .cfi_endproc .size bn_sqr8x_internal,.-bn_sqr8x_internal ___ } ############################################################## # Post-condition, 4x unrolled # { my ($tptr,$nptr)=("%rbx","%rbp"); $code.=<<___; .type __bn_post4x_internal,\@abi-omnipotent .align 32 __bn_post4x_internal: .cfi_startproc mov 8*0($nptr),%r12 lea (%rdi,$num),$tptr # %rdi was $tptr above mov $num,%rcx movq %xmm1,$rptr # restore $rptr neg %rax movq %xmm1,$aptr # prepare for back-to-back call sar \$3+2,%rcx dec %r12 # so that after 'not' we get -n[0] xor %r10,%r10 mov 8*1($nptr),%r13 mov 8*2($nptr),%r14 mov 8*3($nptr),%r15 jmp .Lsqr4x_sub_entry .align 16 .Lsqr4x_sub: mov 8*0($nptr),%r12 mov 8*1($nptr),%r13 mov 8*2($nptr),%r14 mov 8*3($nptr),%r15 .Lsqr4x_sub_entry: lea 8*4($nptr),$nptr not %r12 not %r13 not %r14 not %r15 and %rax,%r12 and %rax,%r13 and %rax,%r14 and %rax,%r15 neg %r10 # mov %r10,%cf adc 8*0($tptr),%r12 adc 8*1($tptr),%r13 adc 8*2($tptr),%r14 adc 8*3($tptr),%r15 mov %r12,8*0($rptr) lea 8*4($tptr),$tptr mov %r13,8*1($rptr) sbb %r10,%r10 # mov %cf,%r10 mov %r14,8*2($rptr) mov %r15,8*3($rptr) lea 8*4($rptr),$rptr inc %rcx # pass %cf jnz .Lsqr4x_sub mov $num,%r10 # prepare for back-to-back call neg $num # restore $num ret .cfi_endproc .size __bn_post4x_internal,.-__bn_post4x_internal ___ } }}} if ($addx) {{{ my $bp="%rdx"; # restore original value $code.=<<___; .globl bn_mulx4x_mont_gather5 .type bn_mulx4x_mont_gather5,\@function,6 .align 32 bn_mulx4x_mont_gather5: .cfi_startproc _CET_ENDBR mov %rsp,%rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lmulx4x_prologue: # num is declared as an int, a 32-bit parameter, so the upper half is # undefined. It is important that this write to ${num}, which zeros the # upper half, predates the first access. shl \$3,${num}d # convert $num to bytes lea ($num,$num,2),%r10 # 3*$num in bytes neg $num # -$num mov ($n0),$n0 # *n0 ############################################################## # Ensure that stack frame doesn't alias with $rptr+3*$num # modulo 4096, which covers ret[num], am[num] and n[num] # (see bn_exp.c). This is done to allow memory disambiguation # logic do its magic. [Extra [num] is allocated in order # to align with bn_power5's frame, which is cleansed after # completing exponentiation. Extra 256 bytes is for power mask # calculated from 7th argument, the index.] 
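# (Illustrative summary, not generated code: the block below takes the
#  prospective frame bottom at rsp-2*num*8-320, measures its distance to rp
#  modulo 4096, and when that distance is small enough for ret[num], am[num]
#  and n[num] to share page offsets with the frame it slides the frame
#  further down before 64-byte-aligning it; the page walk that follows then
#  touches each skipped page once, for the benefit of stack probing.)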
# lea -320(%rsp,$num,2),%r11 mov %rsp,%rbp sub $rp,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lmulx4xsp_alt sub %r11,%rbp # align with $aptr lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) jmp .Lmulx4xsp_done .Lmulx4xsp_alt: lea 4096-320(,$num,2),%r10 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 sub %r11,%rbp .Lmulx4xsp_done: and \$-64,%rbp # ensure alignment mov %rsp,%r11 sub %rbp,%r11 and \$-4096,%r11 lea (%rbp,%r11),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lmulx4x_page_walk jmp .Lmulx4x_page_walk_done .Lmulx4x_page_walk: lea -4096(%rsp),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lmulx4x_page_walk .Lmulx4x_page_walk_done: ############################################################## # Stack layout # +0 -num # +8 off-loaded &b[i] # +16 end of b[num] # +24 inner counter # +32 saved n0 # +40 saved %rsp # +48 # +56 saved rp # +64 tmp[num+1] # mov $n0, 32(%rsp) # save *n0 mov %rax,40(%rsp) # save original %rsp .cfi_cfa_expression %rsp+40,deref,+8 .Lmulx4x_body: call mulx4x_internal mov 40(%rsp),%rsi # restore %rsp .cfi_def_cfa %rsi,8 mov \$1,%rax mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lmulx4x_epilogue: ret .cfi_endproc .size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 .type mulx4x_internal,\@abi-omnipotent .align 32 mulx4x_internal: .cfi_startproc mov $num,8(%rsp) # save -$num (it was in bytes) mov $num,%r10 neg $num # restore $num shl \$5,$num neg %r10 # restore $num lea 128($bp,$num),%r13 # end of powers table (+size optimization) shr \$5+5,$num movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument sub \$1,$num lea .Linc(%rip),%rax mov %r13,16+8(%rsp) # end of b[num] mov $num,24+8(%rsp) # inner counter mov $rp, 56+8(%rsp) # save $rp ___ my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)= ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax"); my $rptr=$bptr; my $STRIDE=2**5*8; # 5 is "window size" $code.=<<___; movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 lea 88-112(%rsp,%r10),%r10 # place the mask after tp[num+1] (+ICache optimization) lea 128($bp),$bptr # size optimization pshufd \$0,%xmm5,%xmm5 # broadcast index movdqa %xmm1,%xmm4 .byte 0x67 movdqa %xmm1,%xmm2 ___ ######################################################################## # Calculate masks by comparing 0..31 to $idx and save result to stack. # # We compute sixteen 16-byte masks and store them on the stack. Mask i is stored # in `16*i - 128`(%rax) and contains the comparisons for idx == 2*i and # idx == 2*i + 1 in its lower and upper halves, respectively. Mask calculations # are scheduled in groups of four. 
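#
# In C-like pseudocode the gather performed with these masks amounts to
# (illustrative only, not part of the generated code):
#
#	for (i = 0; i < 32; i++)
#		mask[i] = (i == idx) ? ~0 : 0;	/* the pcmpeqd results   */
#	acc = 0;
#	for (i = 0; i < 32; i++)
#		acc |= tbl[i] & mask[i];	/* the pand/por sequence */
#
# Every table entry is read and combined, so the memory access pattern is
# independent of the secret index.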
$code.=<<___; .byte 0x67 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 # compare to 1,0 movdqa %xmm4,%xmm3 ___ for($i=0;$i<$STRIDE/16-4;$i+=4) { $code.=<<___; paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 # compare to 3,2 movdqa %xmm0,`16*($i+0)+112`(%r10) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 # compare to 5,4 movdqa %xmm1,`16*($i+1)+112`(%r10) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 # compare to 7,6 movdqa %xmm2,`16*($i+2)+112`(%r10) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,`16*($i+3)+112`(%r10) movdqa %xmm4,%xmm3 ___ } $code.=<<___; # last iteration can be optimized .byte 0x67 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,`16*($i+0)+112`(%r10) paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,`16*($i+1)+112`(%r10) pcmpeqd %xmm5,%xmm3 movdqa %xmm2,`16*($i+2)+112`(%r10) pand `16*($i+0)-128`($bptr),%xmm0 # while it's still in register pand `16*($i+1)-128`($bptr),%xmm1 pand `16*($i+2)-128`($bptr),%xmm2 movdqa %xmm3,`16*($i+3)+112`(%r10) pand `16*($i+3)-128`($bptr),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 ___ for($i=0;$i<$STRIDE/16-4;$i+=4) { $code.=<<___; movdqa `16*($i+0)-128`($bptr),%xmm4 movdqa `16*($i+1)-128`($bptr),%xmm5 movdqa `16*($i+2)-128`($bptr),%xmm2 pand `16*($i+0)+112`(%r10),%xmm4 movdqa `16*($i+3)-128`($bptr),%xmm3 pand `16*($i+1)+112`(%r10),%xmm5 por %xmm4,%xmm0 pand `16*($i+2)+112`(%r10),%xmm2 por %xmm5,%xmm1 pand `16*($i+3)+112`(%r10),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 ___ } $code.=<<___; pxor %xmm1,%xmm0 # Combine the upper and lower halves of %xmm0. pshufd \$0x4e,%xmm0,%xmm1 # Swap upper and lower halves. por %xmm1,%xmm0 lea $STRIDE($bptr),$bptr movq %xmm0,%rdx # bp[0] lea 64+8*4+8(%rsp),$tptr mov %rdx,$bi mulx 0*8($aptr),$mi,%rax # a[0]*b[0] mulx 1*8($aptr),%r11,%r12 # a[1]*b[0] add %rax,%r11 mulx 2*8($aptr),%rax,%r13 # ... adc %rax,%r12 adc \$0,%r13 mulx 3*8($aptr),%rax,%r14 mov $mi,%r15 imulq 32+8(%rsp),$mi # "t[0]"*n0 xor $zero,$zero # cf=0, of=0 mov $mi,%rdx mov $bptr,8+8(%rsp) # off-load &b[i] lea 4*8($aptr),$aptr adcx %rax,%r13 adcx $zero,%r14 # cf=0 mulx 0*8($nptr),%rax,%r10 adcx %rax,%r15 # discarded adox %r11,%r10 mulx 1*8($nptr),%rax,%r11 adcx %rax,%r10 adox %r12,%r11 mulx 2*8($nptr),%rax,%r12 mov 24+8(%rsp),$bptr # counter value mov %r10,-8*4($tptr) adcx %rax,%r11 adox %r13,%r12 mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx mov %r11,-8*3($tptr) adcx %rax,%r12 adox $zero,%r15 # of=0 lea 4*8($nptr),$nptr mov %r12,-8*2($tptr) jmp .Lmulx4x_1st .align 32 .Lmulx4x_1st: adcx $zero,%r15 # cf=0, modulo-scheduled mulx 0*8($aptr),%r10,%rax # a[4]*b[0] adcx %r14,%r10 mulx 1*8($aptr),%r11,%r14 # a[5]*b[0] adcx %rax,%r11 mulx 2*8($aptr),%r12,%rax # ... 
adcx %r14,%r12 mulx 3*8($aptr),%r13,%r14 .byte 0x67,0x67 mov $mi,%rdx adcx %rax,%r13 adcx $zero,%r14 # cf=0 lea 4*8($aptr),$aptr lea 4*8($tptr),$tptr adox %r15,%r10 mulx 0*8($nptr),%rax,%r15 adcx %rax,%r10 adox %r15,%r11 mulx 1*8($nptr),%rax,%r15 adcx %rax,%r11 adox %r15,%r12 mulx 2*8($nptr),%rax,%r15 mov %r10,-5*8($tptr) adcx %rax,%r12 mov %r11,-4*8($tptr) adox %r15,%r13 mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx mov %r12,-3*8($tptr) adcx %rax,%r13 adox $zero,%r15 lea 4*8($nptr),$nptr mov %r13,-2*8($tptr) dec $bptr # of=0, pass cf jnz .Lmulx4x_1st mov 8(%rsp),$num # load -num adc $zero,%r15 # modulo-scheduled lea ($aptr,$num),$aptr # rewind $aptr add %r15,%r14 mov 8+8(%rsp),$bptr # re-load &b[i] adc $zero,$zero # top-most carry mov %r14,-1*8($tptr) jmp .Lmulx4x_outer .align 32 .Lmulx4x_outer: lea 16-256($tptr),%r10 # where 256-byte mask is (+density control) pxor %xmm4,%xmm4 .byte 0x67,0x67 pxor %xmm5,%xmm5 ___ for($i=0;$i<$STRIDE/16;$i+=4) { $code.=<<___; movdqa `16*($i+0)-128`($bptr),%xmm0 movdqa `16*($i+1)-128`($bptr),%xmm1 movdqa `16*($i+2)-128`($bptr),%xmm2 pand `16*($i+0)+256`(%r10),%xmm0 movdqa `16*($i+3)-128`($bptr),%xmm3 pand `16*($i+1)+256`(%r10),%xmm1 por %xmm0,%xmm4 pand `16*($i+2)+256`(%r10),%xmm2 por %xmm1,%xmm5 pand `16*($i+3)+256`(%r10),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 ___ } $code.=<<___; por %xmm5,%xmm4 # Combine the upper and lower halves of %xmm4 as %xmm0. pshufd \$0x4e,%xmm4,%xmm0 # Swap upper and lower halves. por %xmm4,%xmm0 lea $STRIDE($bptr),$bptr movq %xmm0,%rdx # m0=bp[i] mov $zero,($tptr) # save top-most carry lea 4*8($tptr,$num),$tptr # rewind $tptr mulx 0*8($aptr),$mi,%r11 # a[0]*b[i] xor $zero,$zero # cf=0, of=0 mov %rdx,$bi mulx 1*8($aptr),%r14,%r12 # a[1]*b[i] adox -4*8($tptr),$mi # +t[0] adcx %r14,%r11 mulx 2*8($aptr),%r15,%r13 # ... adox -3*8($tptr),%r11 adcx %r15,%r12 mulx 3*8($aptr),%rdx,%r14 adox -2*8($tptr),%r12 adcx %rdx,%r13 lea ($nptr,$num),$nptr # rewind $nptr lea 4*8($aptr),$aptr adox -1*8($tptr),%r13 adcx $zero,%r14 adox $zero,%r14 mov $mi,%r15 imulq 32+8(%rsp),$mi # "t[0]"*n0 mov $mi,%rdx xor $zero,$zero # cf=0, of=0 mov $bptr,8+8(%rsp) # off-load &b[i] mulx 0*8($nptr),%rax,%r10 adcx %rax,%r15 # discarded adox %r11,%r10 mulx 1*8($nptr),%rax,%r11 adcx %rax,%r10 adox %r12,%r11 mulx 2*8($nptr),%rax,%r12 adcx %rax,%r11 adox %r13,%r12 mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx mov 24+8(%rsp),$bptr # counter value mov %r10,-8*4($tptr) adcx %rax,%r12 mov %r11,-8*3($tptr) adox $zero,%r15 # of=0 mov %r12,-8*2($tptr) lea 4*8($nptr),$nptr jmp .Lmulx4x_inner .align 32 .Lmulx4x_inner: mulx 0*8($aptr),%r10,%rax # a[4]*b[i] adcx $zero,%r15 # cf=0, modulo-scheduled adox %r14,%r10 mulx 1*8($aptr),%r11,%r14 # a[5]*b[i] adcx 0*8($tptr),%r10 adox %rax,%r11 mulx 2*8($aptr),%r12,%rax # ... 
adcx 1*8($tptr),%r11 adox %r14,%r12 mulx 3*8($aptr),%r13,%r14 mov $mi,%rdx adcx 2*8($tptr),%r12 adox %rax,%r13 adcx 3*8($tptr),%r13 adox $zero,%r14 # of=0 lea 4*8($aptr),$aptr lea 4*8($tptr),$tptr adcx $zero,%r14 # cf=0 adox %r15,%r10 mulx 0*8($nptr),%rax,%r15 adcx %rax,%r10 adox %r15,%r11 mulx 1*8($nptr),%rax,%r15 adcx %rax,%r11 adox %r15,%r12 mulx 2*8($nptr),%rax,%r15 mov %r10,-5*8($tptr) adcx %rax,%r12 adox %r15,%r13 mov %r11,-4*8($tptr) mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx lea 4*8($nptr),$nptr mov %r12,-3*8($tptr) adcx %rax,%r13 adox $zero,%r15 mov %r13,-2*8($tptr) dec $bptr # of=0, pass cf jnz .Lmulx4x_inner mov 0+8(%rsp),$num # load -num adc $zero,%r15 # modulo-scheduled sub 0*8($tptr),$bptr # pull top-most carry to %cf mov 8+8(%rsp),$bptr # re-load &b[i] mov 16+8(%rsp),%r10 adc %r15,%r14 lea ($aptr,$num),$aptr # rewind $aptr adc $zero,$zero # top-most carry mov %r14,-1*8($tptr) cmp %r10,$bptr jb .Lmulx4x_outer mov -8($nptr),%r10 mov $zero,%r8 mov ($nptr,$num),%r12 lea ($nptr,$num),%rbp # rewind $nptr mov $num,%rcx lea ($tptr,$num),%rdi # rewind $tptr xor %eax,%eax xor %r15,%r15 sub %r14,%r10 # compare top-most words adc %r15,%r15 or %r15,%r8 sar \$3+2,%rcx sub %r8,%rax # %rax=-%r8 mov 56+8(%rsp),%rdx # restore rp dec %r12 # so that after 'not' we get -n[0] mov 8*1(%rbp),%r13 xor %r8,%r8 mov 8*2(%rbp),%r14 mov 8*3(%rbp),%r15 jmp .Lsqrx4x_sub_entry # common post-condition .cfi_endproc .size mulx4x_internal,.-mulx4x_internal ___ } { ###################################################################### # void bn_powerx5( my $rptr="%rdi"; # BN_ULONG *rptr, my $aptr="%rsi"; # const BN_ULONG *aptr, my $bptr="%rdx"; # const void *table, my $nptr="%rcx"; # const BN_ULONG *nptr, my $n0 ="%r8"; # const BN_ULONG *n0); my $num ="%r9"; # int num, has to be divisible by 8 # int pwr); my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); my @A0=("%r10","%r11"); my @A1=("%r12","%r13"); my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); $code.=<<___; .globl bn_powerx5 .type bn_powerx5,\@function,6 .align 32 bn_powerx5: .cfi_startproc _CET_ENDBR mov %rsp,%rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lpowerx5_prologue: # num is declared as an int, a 32-bit parameter, so the upper half is # undefined. It is important that this write to ${num}, which zeros the # upper half, predates the first access. shl \$3,${num}d # convert $num to bytes lea ($num,$num,2),%r10 # 3*$num in bytes neg $num mov ($n0),$n0 # *n0 ############################################################## # Ensure that stack frame doesn't alias with $rptr+3*$num # modulo 4096, which covers ret[num], am[num] and n[num] # (see bn_exp.c). This is done to allow memory disambiguation # logic do its magic. [Extra 256 bytes is for power mask # calculated from 7th argument, the index.] 
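# (Illustrative note: mirroring bn_power5_nohw above, the body of this
#  function performs five back-to-back Montgomery squarings of the input and
#  then one Montgomery multiplication by the table entry selected by the 7th
#  argument, i.e. one 5-bit-window step of the exponentiation.)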
# lea -320(%rsp,$num,2),%r11 mov %rsp,%rbp sub $rptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lpwrx_sp_alt sub %r11,%rbp # align with $aptr lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) jmp .Lpwrx_sp_done .align 32 .Lpwrx_sp_alt: lea 4096-320(,$num,2),%r10 lea -320(%rbp,$num,2),%rbp # alloca(frame+2*$num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 sub %r11,%rbp .Lpwrx_sp_done: and \$-64,%rbp mov %rsp,%r11 sub %rbp,%r11 and \$-4096,%r11 lea (%rbp,%r11),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lpwrx_page_walk jmp .Lpwrx_page_walk_done .Lpwrx_page_walk: lea -4096(%rsp),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lpwrx_page_walk .Lpwrx_page_walk_done: mov $num,%r10 neg $num ############################################################## # Stack layout # # +0 saved $num, used in reduction section # +8 &t[2*$num], used in reduction section # +16 intermediate carry bit # +24 top-most carry bit, used in reduction section # +32 saved *n0 # +40 saved %rsp # +48 t[2*$num] # pxor %xmm0,%xmm0 movq $rptr,%xmm1 # save $rptr movq $nptr,%xmm2 # save $nptr movq %r10, %xmm3 # -$num movq $bptr,%xmm4 mov $n0, 32(%rsp) mov %rax, 40(%rsp) # save original %rsp .cfi_cfa_expression %rsp+40,deref,+8 .Lpowerx5_body: call __bn_sqrx8x_internal call __bn_postx4x_internal call __bn_sqrx8x_internal call __bn_postx4x_internal call __bn_sqrx8x_internal call __bn_postx4x_internal call __bn_sqrx8x_internal call __bn_postx4x_internal call __bn_sqrx8x_internal call __bn_postx4x_internal mov %r10,$num # -num mov $aptr,$rptr movq %xmm2,$nptr movq %xmm4,$bptr mov 40(%rsp),%rax call mulx4x_internal mov 40(%rsp),%rsi # restore %rsp .cfi_def_cfa %rsi,8 mov \$1,%rax mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lpowerx5_epilogue: ret .cfi_endproc .size bn_powerx5,.-bn_powerx5 .globl bn_sqrx8x_internal .hidden bn_sqrx8x_internal .type bn_sqrx8x_internal,\@abi-omnipotent .align 32 bn_sqrx8x_internal: __bn_sqrx8x_internal: .cfi_startproc _CET_ENDBR ################################################################## # Squaring part: # # a) multiply-n-add everything but a[i]*a[i]; # b) shift result of a) by 1 to the left and accumulate # a[i]*a[i] products; # ################################################################## # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0] # a[1]a[0] # a[2]a[0] # a[3]a[0] # a[2]a[1] # a[3]a[1] # a[3]a[2] # # a[4]a[0] # a[5]a[0] # a[6]a[0] # a[7]a[0] # a[4]a[1] # a[5]a[1] # a[6]a[1] # a[7]a[1] # a[4]a[2] # a[5]a[2] # a[6]a[2] # a[7]a[2] # a[4]a[3] # a[5]a[3] # a[6]a[3] # a[7]a[3] # # a[5]a[4] # a[6]a[4] # a[7]a[4] # a[6]a[5] # a[7]a[5] # a[7]a[6] # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0] ___ { my ($zero,$carry)=("%rbp","%rcx"); my $aaptr=$zero; $code.=<<___; lea 48+8(%rsp),$tptr lea ($aptr,$num),$aaptr mov $num,0+8(%rsp) # save $num mov $aaptr,8+8(%rsp) # save end of $aptr jmp .Lsqr8x_zero_start .align 32 .byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 .Lsqrx8x_zero: .byte 0x3e movdqa %xmm0,0*8($tptr) movdqa %xmm0,2*8($tptr) movdqa %xmm0,4*8($tptr) movdqa %xmm0,6*8($tptr) .Lsqr8x_zero_start: # aligned at 32 movdqa %xmm0,8*8($tptr) movdqa %xmm0,10*8($tptr) movdqa %xmm0,12*8($tptr) movdqa %xmm0,14*8($tptr) lea 16*8($tptr),$tptr sub \$64,$num jnz .Lsqrx8x_zero mov 0*8($aptr),%rdx # a[0], modulo-scheduled #xor %r9,%r9 # 
t[1], ex-$num, zero already xor %r10,%r10 xor %r11,%r11 xor %r12,%r12 xor %r13,%r13 xor %r14,%r14 xor %r15,%r15 lea 48+8(%rsp),$tptr xor $zero,$zero # cf=0, cf=0 jmp .Lsqrx8x_outer_loop .align 32 .Lsqrx8x_outer_loop: mulx 1*8($aptr),%r8,%rax # a[1]*a[0] adcx %r9,%r8 # a[1]*a[0]+=t[1] adox %rax,%r10 mulx 2*8($aptr),%r9,%rax # a[2]*a[0] adcx %r10,%r9 adox %rax,%r11 .byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 3*8($aptr),%r10,%rax # ... adcx %r11,%r10 adox %rax,%r12 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 # mulx 4*8($aptr),%r11,%rax adcx %r12,%r11 adox %rax,%r13 mulx 5*8($aptr),%r12,%rax adcx %r13,%r12 adox %rax,%r14 mulx 6*8($aptr),%r13,%rax adcx %r14,%r13 adox %r15,%rax mulx 7*8($aptr),%r14,%r15 mov 1*8($aptr),%rdx # a[1] adcx %rax,%r14 adox $zero,%r15 adc 8*8($tptr),%r15 mov %r8,1*8($tptr) # t[1] mov %r9,2*8($tptr) # t[2] sbb $carry,$carry # mov %cf,$carry xor $zero,$zero # cf=0, of=0 mulx 2*8($aptr),%r8,%rbx # a[2]*a[1] mulx 3*8($aptr),%r9,%rax # a[3]*a[1] adcx %r10,%r8 adox %rbx,%r9 mulx 4*8($aptr),%r10,%rbx # ... adcx %r11,%r9 adox %rax,%r10 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 # mulx 5*8($aptr),%r11,%rax adcx %r12,%r10 adox %rbx,%r11 .byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r12,%rbx adcx %r13,%r11 adox %r14,%r12 .byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r13,%r14 mov 2*8($aptr),%rdx # a[2] adcx %rax,%r12 adox %rbx,%r13 adcx %r15,%r13 adox $zero,%r14 # of=0 adcx $zero,%r14 # cf=0 mov %r8,3*8($tptr) # t[3] mov %r9,4*8($tptr) # t[4] mulx 3*8($aptr),%r8,%rbx # a[3]*a[2] mulx 4*8($aptr),%r9,%rax # a[4]*a[2] adcx %r10,%r8 adox %rbx,%r9 mulx 5*8($aptr),%r10,%rbx # ... adcx %r11,%r9 adox %rax,%r10 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r11,%rax adcx %r12,%r10 adox %r13,%r11 .byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r12,%r13 .byte 0x3e mov 3*8($aptr),%rdx # a[3] adcx %rbx,%r11 adox %rax,%r12 adcx %r14,%r12 mov %r8,5*8($tptr) # t[5] mov %r9,6*8($tptr) # t[6] mulx 4*8($aptr),%r8,%rax # a[4]*a[3] adox $zero,%r13 # of=0 adcx $zero,%r13 # cf=0 mulx 5*8($aptr),%r9,%rbx # a[5]*a[3] adcx %r10,%r8 adox %rax,%r9 mulx 6*8($aptr),%r10,%rax # ... adcx %r11,%r9 adox %r12,%r10 mulx 7*8($aptr),%r11,%r12 mov 4*8($aptr),%rdx # a[4] mov 5*8($aptr),%r14 # a[5] adcx %rbx,%r10 adox %rax,%r11 mov 6*8($aptr),%r15 # a[6] adcx %r13,%r11 adox $zero,%r12 # of=0 adcx $zero,%r12 # cf=0 mov %r8,7*8($tptr) # t[7] mov %r9,8*8($tptr) # t[8] mulx %r14,%r9,%rax # a[5]*a[4] mov 7*8($aptr),%r8 # a[7] adcx %r10,%r9 mulx %r15,%r10,%rbx # a[6]*a[4] adox %rax,%r10 adcx %r11,%r10 mulx %r8,%r11,%rax # a[7]*a[4] mov %r14,%rdx # a[5] adox %rbx,%r11 adcx %r12,%r11 #adox $zero,%rax # of=0 adcx $zero,%rax # cf=0 mulx %r15,%r14,%rbx # a[6]*a[5] mulx %r8,%r12,%r13 # a[7]*a[5] mov %r15,%rdx # a[6] lea 8*8($aptr),$aptr adcx %r14,%r11 adox %rbx,%r12 adcx %rax,%r12 adox $zero,%r13 .byte 0x67,0x67 mulx %r8,%r8,%r14 # a[7]*a[6] adcx %r8,%r13 adcx $zero,%r14 cmp 8+8(%rsp),$aptr je .Lsqrx8x_outer_break neg $carry # mov $carry,%cf mov \$-8,%rcx mov $zero,%r15 mov 8*8($tptr),%r8 adcx 9*8($tptr),%r9 # +=t[9] adcx 10*8($tptr),%r10 # ... 
adcx 11*8($tptr),%r11 adc 12*8($tptr),%r12 adc 13*8($tptr),%r13 adc 14*8($tptr),%r14 adc 15*8($tptr),%r15 lea ($aptr),$aaptr lea 2*64($tptr),$tptr sbb %rax,%rax # mov %cf,$carry mov -64($aptr),%rdx # a[0] mov %rax,16+8(%rsp) # offload $carry mov $tptr,24+8(%rsp) #lea 8*8($tptr),$tptr # see 2*8*8($tptr) above xor %eax,%eax # cf=0, of=0 jmp .Lsqrx8x_loop .align 32 .Lsqrx8x_loop: mov %r8,%rbx mulx 0*8($aaptr),%rax,%r8 # a[8]*a[i] adcx %rax,%rbx # +=t[8] adox %r9,%r8 mulx 1*8($aaptr),%rax,%r9 # ... adcx %rax,%r8 adox %r10,%r9 mulx 2*8($aaptr),%rax,%r10 adcx %rax,%r9 adox %r11,%r10 mulx 3*8($aaptr),%rax,%r11 adcx %rax,%r10 adox %r12,%r11 .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 4*8($aaptr),%rax,%r12 adcx %rax,%r11 adox %r13,%r12 mulx 5*8($aaptr),%rax,%r13 adcx %rax,%r12 adox %r14,%r13 mulx 6*8($aaptr),%rax,%r14 mov %rbx,($tptr,%rcx,8) # store t[8+i] mov \$0,%ebx adcx %rax,%r13 adox %r15,%r14 .byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 # mulx 7*8($aaptr),%rax,%r15 mov 8($aptr,%rcx,8),%rdx # a[i] adcx %rax,%r14 adox %rbx,%r15 # %rbx is 0, of=0 adcx %rbx,%r15 # cf=0 .byte 0x67 inc %rcx # of=0 jnz .Lsqrx8x_loop lea 8*8($aaptr),$aaptr mov \$-8,%rcx cmp 8+8(%rsp),$aaptr # done? je .Lsqrx8x_break sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf .byte 0x66 mov -64($aptr),%rdx adcx 0*8($tptr),%r8 adcx 1*8($tptr),%r9 adc 2*8($tptr),%r10 adc 3*8($tptr),%r11 adc 4*8($tptr),%r12 adc 5*8($tptr),%r13 adc 6*8($tptr),%r14 adc 7*8($tptr),%r15 lea 8*8($tptr),$tptr .byte 0x67 sbb %rax,%rax # mov %cf,%rax xor %ebx,%ebx # cf=0, of=0 mov %rax,16+8(%rsp) # offload carry jmp .Lsqrx8x_loop .align 32 .Lsqrx8x_break: xor $zero,$zero sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf adcx $zero,%r8 mov 24+8(%rsp),$carry # initial $tptr, borrow $carry adcx $zero,%r9 mov 0*8($aptr),%rdx # a[8], modulo-scheduled adc \$0,%r10 mov %r8,0*8($tptr) adc \$0,%r11 adc \$0,%r12 adc \$0,%r13 adc \$0,%r14 adc \$0,%r15 cmp $carry,$tptr # cf=0, of=0 je .Lsqrx8x_outer_loop mov %r9,1*8($tptr) mov 1*8($carry),%r9 mov %r10,2*8($tptr) mov 2*8($carry),%r10 mov %r11,3*8($tptr) mov 3*8($carry),%r11 mov %r12,4*8($tptr) mov 4*8($carry),%r12 mov %r13,5*8($tptr) mov 5*8($carry),%r13 mov %r14,6*8($tptr) mov 6*8($carry),%r14 mov %r15,7*8($tptr) mov 7*8($carry),%r15 mov $carry,$tptr jmp .Lsqrx8x_outer_loop .align 32 .Lsqrx8x_outer_break: mov %r9,9*8($tptr) # t[9] movq %xmm3,%rcx # -$num mov %r10,10*8($tptr) # ... 
mov %r11,11*8($tptr) mov %r12,12*8($tptr) mov %r13,13*8($tptr) mov %r14,14*8($tptr) ___ } { my $i="%rcx"; $code.=<<___; lea 48+8(%rsp),$tptr mov ($aptr,$i),%rdx # a[0] mov 8($tptr),$A0[1] # t[1] xor $A0[0],$A0[0] # t[0], of=0, cf=0 mov 0+8(%rsp),$num # restore $num adox $A0[1],$A0[1] mov 16($tptr),$A1[0] # t[2] # prefetch mov 24($tptr),$A1[1] # t[3] # prefetch #jmp .Lsqrx4x_shift_n_add # happens to be aligned .align 32 .Lsqrx4x_shift_n_add: mulx %rdx,%rax,%rbx adox $A1[0],$A1[0] adcx $A0[0],%rax .byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 # mov 8($aptr,$i),%rdx # a[i+1] # prefetch .byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 # mov 32($tptr),$A0[0] # t[2*i+4] # prefetch adox $A1[1],$A1[1] adcx $A0[1],%rbx mov 40($tptr),$A0[1] # t[2*i+4+1] # prefetch mov %rax,0($tptr) mov %rbx,8($tptr) mulx %rdx,%rax,%rbx adox $A0[0],$A0[0] adcx $A1[0],%rax mov 16($aptr,$i),%rdx # a[i+2] # prefetch mov 48($tptr),$A1[0] # t[2*i+6] # prefetch adox $A0[1],$A0[1] adcx $A1[1],%rbx mov 56($tptr),$A1[1] # t[2*i+6+1] # prefetch mov %rax,16($tptr) mov %rbx,24($tptr) mulx %rdx,%rax,%rbx adox $A1[0],$A1[0] adcx $A0[0],%rax mov 24($aptr,$i),%rdx # a[i+3] # prefetch lea 32($i),$i mov 64($tptr),$A0[0] # t[2*i+8] # prefetch adox $A1[1],$A1[1] adcx $A0[1],%rbx mov 72($tptr),$A0[1] # t[2*i+8+1] # prefetch mov %rax,32($tptr) mov %rbx,40($tptr) mulx %rdx,%rax,%rbx adox $A0[0],$A0[0] adcx $A1[0],%rax jrcxz .Lsqrx4x_shift_n_add_break .byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 # mov 0($aptr,$i),%rdx # a[i+4] # prefetch adox $A0[1],$A0[1] adcx $A1[1],%rbx mov 80($tptr),$A1[0] # t[2*i+10] # prefetch mov 88($tptr),$A1[1] # t[2*i+10+1] # prefetch mov %rax,48($tptr) mov %rbx,56($tptr) lea 64($tptr),$tptr nop jmp .Lsqrx4x_shift_n_add .align 32 .Lsqrx4x_shift_n_add_break: adcx $A1[1],%rbx mov %rax,48($tptr) mov %rbx,56($tptr) lea 64($tptr),$tptr # end of t[] buffer ___ } ###################################################################### # Montgomery reduction part, "word-by-word" algorithm. # # This new path is inspired by multiple submissions from Intel, by # Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford, # Vinodh Gopal... 
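#
# Compared with the path above, this variant leans on the BMI2/ADX ISA
# extensions: MULX leaves the flags untouched, ADCX only reads and writes CF,
# and ADOX only reads and writes OF, so two independent carry chains can be
# interleaved in a single loop without spilling flags. The reduction sketch
# given before the non-ADX path above applies here unchanged.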
{ my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx"); $code.=<<___; movq %xmm2,$nptr __bn_sqrx8x_reduction: xor %eax,%eax # initial top-most carry bit mov 32+8(%rsp),%rbx # n0 mov 48+8(%rsp),%rdx # "%r8", 8*0($tptr) lea -8*8($nptr,$num),%rcx # end of n[] #lea 48+8(%rsp,$num,2),$tptr # end of t[] buffer mov %rcx, 0+8(%rsp) # save end of n[] mov $tptr,8+8(%rsp) # save end of t[] lea 48+8(%rsp),$tptr # initial t[] window jmp .Lsqrx8x_reduction_loop .align 32 .Lsqrx8x_reduction_loop: mov 8*1($tptr),%r9 mov 8*2($tptr),%r10 mov 8*3($tptr),%r11 mov 8*4($tptr),%r12 mov %rdx,%r8 imulq %rbx,%rdx # n0*a[i] mov 8*5($tptr),%r13 mov 8*6($tptr),%r14 mov 8*7($tptr),%r15 mov %rax,24+8(%rsp) # store top-most carry bit lea 8*8($tptr),$tptr xor $carry,$carry # cf=0,of=0 mov \$-8,%rcx jmp .Lsqrx8x_reduce .align 32 .Lsqrx8x_reduce: mov %r8, %rbx mulx 8*0($nptr),%rax,%r8 # n[0] adcx %rbx,%rax # discarded adox %r9,%r8 mulx 8*1($nptr),%rbx,%r9 # n[1] adcx %rbx,%r8 adox %r10,%r9 mulx 8*2($nptr),%rbx,%r10 adcx %rbx,%r9 adox %r11,%r10 mulx 8*3($nptr),%rbx,%r11 adcx %rbx,%r10 adox %r12,%r11 .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rbx,%r12 mov %rdx,%rax mov %r8,%rdx adcx %rbx,%r11 adox %r13,%r12 mulx 32+8(%rsp),%rbx,%rdx # %rdx discarded mov %rax,%rdx mov %rax,64+48+8(%rsp,%rcx,8) # put aside n0*a[i] mulx 8*5($nptr),%rax,%r13 adcx %rax,%r12 adox %r14,%r13 mulx 8*6($nptr),%rax,%r14 adcx %rax,%r13 adox %r15,%r14 mulx 8*7($nptr),%rax,%r15 mov %rbx,%rdx adcx %rax,%r14 adox $carry,%r15 # $carry is 0 adcx $carry,%r15 # cf=0 .byte 0x67,0x67,0x67 inc %rcx # of=0 jnz .Lsqrx8x_reduce mov $carry,%rax # xor %rax,%rax cmp 0+8(%rsp),$nptr # end of n[]? jae .Lsqrx8x_no_tail mov 48+8(%rsp),%rdx # pull n0*a[0] add 8*0($tptr),%r8 lea 8*8($nptr),$nptr mov \$-8,%rcx adcx 8*1($tptr),%r9 adcx 8*2($tptr),%r10 adc 8*3($tptr),%r11 adc 8*4($tptr),%r12 adc 8*5($tptr),%r13 adc 8*6($tptr),%r14 adc 8*7($tptr),%r15 lea 8*8($tptr),$tptr sbb %rax,%rax # top carry xor $carry,$carry # of=0, cf=0 mov %rax,16+8(%rsp) jmp .Lsqrx8x_tail .align 32 .Lsqrx8x_tail: mov %r8,%rbx mulx 8*0($nptr),%rax,%r8 adcx %rax,%rbx adox %r9,%r8 mulx 8*1($nptr),%rax,%r9 adcx %rax,%r8 adox %r10,%r9 mulx 8*2($nptr),%rax,%r10 adcx %rax,%r9 adox %r11,%r10 mulx 8*3($nptr),%rax,%r11 adcx %rax,%r10 adox %r12,%r11 .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rax,%r12 adcx %rax,%r11 adox %r13,%r12 mulx 8*5($nptr),%rax,%r13 adcx %rax,%r12 adox %r14,%r13 mulx 8*6($nptr),%rax,%r14 adcx %rax,%r13 adox %r15,%r14 mulx 8*7($nptr),%rax,%r15 mov 72+48+8(%rsp,%rcx,8),%rdx # pull n0*a[i] adcx %rax,%r14 adox $carry,%r15 mov %rbx,($tptr,%rcx,8) # save result mov %r8,%rbx adcx $carry,%r15 # cf=0 inc %rcx # of=0 jnz .Lsqrx8x_tail cmp 0+8(%rsp),$nptr # end of n[]? jae .Lsqrx8x_tail_done # break out of loop sub 16+8(%rsp),$carry # mov 16(%rsp),%cf mov 48+8(%rsp),%rdx # pull n0*a[0] lea 8*8($nptr),$nptr adc 8*0($tptr),%r8 adc 8*1($tptr),%r9 adc 8*2($tptr),%r10 adc 8*3($tptr),%r11 adc 8*4($tptr),%r12 adc 8*5($tptr),%r13 adc 8*6($tptr),%r14 adc 8*7($tptr),%r15 lea 8*8($tptr),$tptr sbb %rax,%rax sub \$8,%rcx # mov \$-8,%rcx xor $carry,$carry # of=0, cf=0 mov %rax,16+8(%rsp) jmp .Lsqrx8x_tail .align 32 .Lsqrx8x_tail_done: xor %rax,%rax add 24+8(%rsp),%r8 # can this overflow? 
adc \$0,%r9 adc \$0,%r10 adc \$0,%r11 adc \$0,%r12 adc \$0,%r13 adc \$0,%r14 adc \$0,%r15 adc \$0,%rax sub 16+8(%rsp),$carry # mov 16(%rsp),%cf .Lsqrx8x_no_tail: # %cf is 0 if jumped here adc 8*0($tptr),%r8 movq %xmm3,%rcx adc 8*1($tptr),%r9 mov 8*7($nptr),$carry movq %xmm2,$nptr # restore $nptr adc 8*2($tptr),%r10 adc 8*3($tptr),%r11 adc 8*4($tptr),%r12 adc 8*5($tptr),%r13 adc 8*6($tptr),%r14 adc 8*7($tptr),%r15 adc \$0,%rax # top-most carry mov 32+8(%rsp),%rbx # n0 mov 8*8($tptr,%rcx),%rdx # modulo-scheduled "%r8" mov %r8,8*0($tptr) # store top 512 bits lea 8*8($tptr),%r8 # borrow %r8 mov %r9,8*1($tptr) mov %r10,8*2($tptr) mov %r11,8*3($tptr) mov %r12,8*4($tptr) mov %r13,8*5($tptr) mov %r14,8*6($tptr) mov %r15,8*7($tptr) lea 8*8($tptr,%rcx),$tptr # start of current t[] window cmp 8+8(%rsp),%r8 # end of t[]? jb .Lsqrx8x_reduction_loop ret .cfi_endproc .size bn_sqrx8x_internal,.-bn_sqrx8x_internal ___ } ############################################################## # Post-condition, 4x unrolled # { my ($rptr,$nptr)=("%rdx","%rbp"); $code.=<<___; .align 32 .type __bn_postx4x_internal,\@abi-omnipotent __bn_postx4x_internal: .cfi_startproc mov 8*0($nptr),%r12 mov %rcx,%r10 # -$num mov %rcx,%r9 # -$num neg %rax sar \$3+2,%rcx #lea 48+8(%rsp,%r9),$tptr movq %xmm1,$rptr # restore $rptr movq %xmm1,$aptr # prepare for back-to-back call dec %r12 # so that after 'not' we get -n[0] mov 8*1($nptr),%r13 xor %r8,%r8 mov 8*2($nptr),%r14 mov 8*3($nptr),%r15 jmp .Lsqrx4x_sub_entry .align 16 .Lsqrx4x_sub: mov 8*0($nptr),%r12 mov 8*1($nptr),%r13 mov 8*2($nptr),%r14 mov 8*3($nptr),%r15 .Lsqrx4x_sub_entry: andn %rax,%r12,%r12 lea 8*4($nptr),$nptr andn %rax,%r13,%r13 andn %rax,%r14,%r14 andn %rax,%r15,%r15 neg %r8 # mov %r8,%cf adc 8*0($tptr),%r12 adc 8*1($tptr),%r13 adc 8*2($tptr),%r14 adc 8*3($tptr),%r15 mov %r12,8*0($rptr) lea 8*4($tptr),$tptr mov %r13,8*1($rptr) sbb %r8,%r8 # mov %cf,%r8 mov %r14,8*2($rptr) mov %r15,8*3($rptr) lea 8*4($rptr),$rptr inc %rcx jnz .Lsqrx4x_sub neg %r9 # restore $num ret .cfi_endproc .size __bn_postx4x_internal,.-__bn_postx4x_internal ___ } }}} { my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order ("%rdi","%esi","%rdx","%ecx"); # Unix order my $out=$inp; my $STRIDE=2**5*8; my $N=$STRIDE/4; $code.=<<___; .globl bn_scatter5 .type bn_scatter5,\@abi-omnipotent .align 16 bn_scatter5: .cfi_startproc _CET_ENDBR cmp \$0, $num jz .Lscatter_epilogue # $tbl stores 32 entries, t0 through t31. Each entry has $num words. # They are interleaved in memory as follows: # # t0[0] t1[0] t2[0] ... t31[0] # t0[1] t1[1] t2[1] ... t31[1] # ... # t0[$num-1] t1[$num-1] t2[$num-1] ... 
t31[$num-1] lea ($tbl,$idx,8),$tbl .Lscatter: mov ($inp),%rax lea 8($inp),$inp mov %rax,($tbl) lea 32*8($tbl),$tbl sub \$1,$num jnz .Lscatter .Lscatter_epilogue: ret .cfi_endproc .size bn_scatter5,.-bn_scatter5 .globl bn_gather5 .type bn_gather5,\@abi-omnipotent .align 32 bn_gather5: .cfi_startproc .LSEH_begin_bn_gather5: # Win64 thing, but harmless in other cases _CET_ENDBR # I can't trust assembler to use specific encoding:-( .byte 0x4c,0x8d,0x14,0x24 #lea (%rsp),%r10 .cfi_def_cfa_register %r10 .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 #sub $0x108,%rsp lea .Linc(%rip),%rax and \$-16,%rsp # shouldn't be formally required movd $idx,%xmm5 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 lea 128($tbl),%r11 # size optimization lea 128(%rsp),%rax # size optimization pshufd \$0,%xmm5,%xmm5 # broadcast $idx movdqa %xmm1,%xmm4 movdqa %xmm1,%xmm2 ___ ######################################################################## # Calculate masks by comparing 0..31 to $idx and save result to stack. # # We compute sixteen 16-byte masks and store them on the stack. Mask i is stored # in `16*i - 128`(%rax) and contains the comparisons for idx == 2*i and # idx == 2*i + 1 in its lower and upper halves, respectively. Mask calculations # are scheduled in groups of four. for($i=0;$i<$STRIDE/16;$i+=4) { $code.=<<___; paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 # compare to 1,0 ___ $code.=<<___ if ($i); movdqa %xmm3,`16*($i-1)-128`(%rax) ___ $code.=<<___; movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 # compare to 3,2 movdqa %xmm0,`16*($i+0)-128`(%rax) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 # compare to 5,4 movdqa %xmm1,`16*($i+1)-128`(%rax) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 # compare to 7,6 movdqa %xmm2,`16*($i+2)-128`(%rax) movdqa %xmm4,%xmm2 ___ } $code.=<<___; movdqa %xmm3,`16*($i-1)-128`(%rax) jmp .Lgather .align 32 .Lgather: pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 ___ for($i=0;$i<$STRIDE/16;$i+=4) { # Combine the masks with the corresponding table entries to select the correct # entry. $code.=<<___; movdqa `16*($i+0)-128`(%r11),%xmm0 movdqa `16*($i+1)-128`(%r11),%xmm1 movdqa `16*($i+2)-128`(%r11),%xmm2 pand `16*($i+0)-128`(%rax),%xmm0 movdqa `16*($i+3)-128`(%r11),%xmm3 pand `16*($i+1)-128`(%rax),%xmm1 por %xmm0,%xmm4 pand `16*($i+2)-128`(%rax),%xmm2 por %xmm1,%xmm5 pand `16*($i+3)-128`(%rax),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 ___ } $code.=<<___; por %xmm5,%xmm4 lea $STRIDE(%r11),%r11 # Combine the upper and lower halves of %xmm0. pshufd \$0x4e,%xmm4,%xmm0 # Swap upper and lower halves. 
por %xmm4,%xmm0 movq %xmm0,($out) # m0=bp[0] lea 8($out),$out sub \$1,$num jnz .Lgather lea (%r10),%rsp .cfi_def_cfa_register %rsp ret .LSEH_end_bn_gather5: .cfi_endproc .size bn_gather5,.-bn_gather5 ___ } $code.=<<___; .section .rodata .align 64 .Linc: .long 0,0, 1,1 .long 2,2, 2,2 .asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by " .text ___ # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) if ($win64) { $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type mul_handler,\@abi-omnipotent .align 16 mul_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # end of prologue label cmp %r10,%rbx # context->RipRipRsp mov 8(%r11),%r10d # HandlerData[2] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail lea .Lmul4x_epilogue(%rip),%r10 # *ring*: hacked for deletion of _nohw cmp %r10,%rbx ja .Lbody_40 mov 192($context),%r10 # pull $num mov 8(%rax,%r10,8),%rax # pull saved stack pointer jmp .Lcommon_pop_regs .Lbody_40: mov 40(%rax),%rax # pull saved stack pointer .Lcommon_pop_regs: mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 mov -32(%rax),%r13 mov -40(%rax),%r14 mov -48(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 .Lcommon_seh_tail: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size mul_handler,.-mul_handler .section .pdata .align 4 .rva .LSEH_begin_bn_mul4x_mont_gather5 .rva .LSEH_end_bn_mul4x_mont_gather5 .rva .LSEH_info_bn_mul4x_mont_gather5 .rva .LSEH_begin_bn_power5_nohw .rva .LSEH_end_bn_power5_nohw .rva .LSEH_info_bn_power5_nohw ___ $code.=<<___ if ($addx); .rva .LSEH_begin_bn_mulx4x_mont_gather5 .rva .LSEH_end_bn_mulx4x_mont_gather5 .rva .LSEH_info_bn_mulx4x_mont_gather5 .rva .LSEH_begin_bn_powerx5 .rva .LSEH_end_bn_powerx5 .rva .LSEH_info_bn_powerx5 ___ $code.=<<___; .rva .LSEH_begin_bn_gather5 .rva .LSEH_end_bn_gather5 .rva .LSEH_info_bn_gather5 .section .xdata .align 8 .LSEH_info_bn_mul4x_mont_gather5: .byte 9,0,0,0 .rva mul_handler .rva .Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[] .align 8 .LSEH_info_bn_power5_nohw: .byte 
9,0,0,0 .rva mul_handler .rva .Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue # HandlerData[] ___ $code.=<<___ if ($addx); .align 8 .LSEH_info_bn_mulx4x_mont_gather5: .byte 9,0,0,0 .rva mul_handler .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] .align 8 .LSEH_info_bn_powerx5: .byte 9,0,0,0 .rva mul_handler .rva .Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[] ___ $code.=<<___; .align 8 .LSEH_info_bn_gather5: .byte 0x01,0x0b,0x03,0x0a .byte 0x0b,0x01,0x21,0x00 # sub rsp,0x108 .byte 0x04,0xa3,0x00,0x00 # lea r10,(rsp) .align 8 ___ } $code =~ s/\`([^\`]*)\`/eval($1)/gem; print $code; close STDOUT or die "error closing STDOUT: $!"; ring-0.17.14/crypto/fipsmodule/bn/internal.h000064400000000000000000000136751046102023000170240ustar 00000000000000// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. // Copyright (c) 2002, Oracle and/or its affiliates. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef OPENSSL_HEADER_BN_INTERNAL_H #define OPENSSL_HEADER_BN_INTERNAL_H #include #if defined(OPENSSL_X86_64) && defined(_MSC_VER) && !defined(__clang__) #pragma warning(push, 3) #include #pragma warning(pop) #pragma intrinsic(_umul128) #endif #include "../../internal.h" typedef crypto_word_t BN_ULONG; #if defined(OPENSSL_64_BIT) #if defined(BORINGSSL_HAS_UINT128) // MSVC doesn't support two-word integers on 64-bit. #define BN_ULLONG uint128_t #endif #define BN_BITS2 64 #define BN_MONT_CTX_N0_LIMBS 1 #define BN_MONT_CTX_N0(hi, lo) TOBN(hi, lo), 0 #define TOBN(hi, lo) ((BN_ULONG)(hi) << 32 | (lo)) #elif defined(OPENSSL_32_BIT) #define BN_ULLONG uint64_t #define BN_BITS2 32 // On some 32-bit platforms, Montgomery multiplication is done using 64-bit // arithmetic with SIMD instructions. On such platforms, |BN_MONT_CTX::n0| // needs to be two words long. Only certain 32-bit platforms actually make use // of n0[1] and shorter R value would suffice for the others. However, // currently only the assembly files know which is which. #define BN_MONT_CTX_N0_LIMBS 2 #define BN_MONT_CTX_N0(hi, lo) TOBN(hi, lo) #define TOBN(hi, lo) (lo), (hi) #else #error "Must define either OPENSSL_32_BIT or OPENSSL_64_BIT" #endif // BN_MONTGOMERY_MAX_WORDS is the maximum numer of words allowed in a |BIGNUM| // used with Montgomery reduction. Ideally this limit would be applied to all // |BIGNUM|s, in |bn_wexpand|, but the exactfloat library needs to create 8 MiB // values for other operations. // #define BN_MONTGOMERY_MAX_WORDS (8 * 1024 / sizeof(BN_ULONG)) // bn_mul_mont writes |ap| * |bp| mod |np| to |rp|, each |num| words // long. Inputs and outputs are in Montgomery form. |n0| is a pointer to // an |N0|. // // If at least one of |ap| or |bp| is fully reduced, |rp| will be fully reduced. // If neither is fully-reduced, the output may not be either. // // This function allocates |num| words on the stack, so |num| should be at most // |BN_MONTGOMERY_MAX_WORDS|. 
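//
// Concretely (illustrative restatement, not an additional requirement): with
// R = 2^(BN_BITS2 * num), this computes |rp| = |ap| * |bp| * R^-1 mod |np|,
// and the |N0| value pointed to by |n0| is the precomputed
// -|np|^-1 mod 2^(BN_MONT_CTX_N0_LIMBS * BN_BITS2).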
// // TODO(davidben): The x86_64 implementation expects a 32-bit input and masks // off upper bits. The aarch64 implementation expects a 64-bit input and does // not. |size_t| is the safer option but not strictly correct for x86_64. But // the |BN_MONTGOMERY_MAX_WORDS| bound makes this moot. // // See also discussion in |ToWord| in abi_test.h for notes on smaller-than-word // inputs. // // |num| must be at least 4, at least on x86. // // In other forks, |bn_mul_mont| returns an |int| indicating whether it // actually did the multiplication. All our implementations always do the // multiplication, and forcing callers to deal with the possibility of it // failing just leads to further problems. OPENSSL_STATIC_ASSERT(sizeof(int) == sizeof(size_t) || (sizeof(int) == 4 && sizeof(size_t) == 8), "int and size_t ABI mismatch"); #if defined(OPENSSL_X86_64) void bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, size_t num); static inline void bn_mul_mont_small( BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, size_t num) { bn_mul_mont_nohw(rp, ap, bp, np, n0, num); } #elif defined(OPENSSL_AARCH64) void bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, size_t num); static inline void bn_mul_mont_small( BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, size_t num) { // No point in optimizing for P-256 because P-256 doesn't call into // this on AArch64. bn_mul_mont_nohw(rp, ap, bp, np, n0, num); } #elif defined(OPENSSL_ARM) void bn_mul8x_mont_neon(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, size_t num); void bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, size_t num); static inline void bn_mul_mont_small( BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, size_t num) { // Approximate what `bn_mul_mont` did so that the NEON version for P-256 // when practical. if (num == 8) { // XXX: This should not be accessing `neon_available` directly. if (neon_available) { bn_mul8x_mont_neon(rp, ap, bp, np, n0, num); return; } } bn_mul_mont_nohw(rp, ap, bp, np, n0, num); } #else void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, size_t num); static inline void bn_mul_mont_small( BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, size_t num) { bn_mul_mont(rp, ap, bp, np, n0, num); } #endif static inline void bn_umult_lohi(BN_ULONG *low_out, BN_ULONG *high_out, BN_ULONG a, BN_ULONG b) { #if defined(OPENSSL_X86_64) && defined(_MSC_VER) && !defined(__clang__) *low_out = _umul128(a, b, high_out); #else BN_ULLONG result = (BN_ULLONG)a * b; *low_out = (BN_ULONG)result; *high_out = (BN_ULONG)(result >> BN_BITS2); #endif } #endif // OPENSSL_HEADER_BN_INTERNAL_H ring-0.17.14/crypto/fipsmodule/bn/montgomery.c000064400000000000000000000047111046102023000173720ustar 00000000000000// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "internal.h" #include "../../internal.h" #include "../../limbs/limbs.h" #include "../../limbs/limbs.inl" OPENSSL_STATIC_ASSERT(BN_MONT_CTX_N0_LIMBS == 1 || BN_MONT_CTX_N0_LIMBS == 2, "BN_MONT_CTX_N0_LIMBS value is invalid"); OPENSSL_STATIC_ASSERT( sizeof(BN_ULONG) * BN_MONT_CTX_N0_LIMBS == sizeof(uint64_t), "uint64_t is insufficient precision for n0"); int bn_from_montgomery_in_place(BN_ULONG r[], size_t num_r, BN_ULONG a[], size_t num_a, const BN_ULONG n[], size_t num_n, const BN_ULONG n0_[BN_MONT_CTX_N0_LIMBS]) { if (num_n == 0 || num_r != num_n || num_a != 2 * num_n) { return 0; } // Add multiples of |n| to |r| until R = 2^(nl * BN_BITS2) divides it. On // input, we had |r| < |n| * R, so now |r| < 2 * |n| * R. Note that |r| // includes |carry| which is stored separately. BN_ULONG n0 = n0_[0]; BN_ULONG carry = 0; for (size_t i = 0; i < num_n; i++) { BN_ULONG v = limbs_mul_add_limb(a + i, n, a[i] * n0, num_n); v += carry + a[i + num_n]; carry |= (v != a[i + num_n]); carry &= (v <= a[i + num_n]); a[i + num_n] = v; } // Shift |num_n| words to divide by R. We have |a| < 2 * |n|. Note that |a| // includes |carry| which is stored separately. a += num_n; // |a| thus requires at most one additional subtraction |n| to be reduced. // Subtract |n| and select the answer in constant time. BN_ULONG v = limbs_sub(r, a, n, num_n) - carry; // |v| is one if |a| - |n| underflowed or zero if it did not. Note |v| cannot // be -1. That would imply the subtraction did not fit in |num_n| words, and // we know at most one subtraction is needed. v = 0u - v; for (size_t i = 0; i < num_n; i++) { r[i] = constant_time_select_w(v, a[i], r[i]); a[i] = 0; } return 1; } ring-0.17.14/crypto/fipsmodule/bn/montgomery_inv.c000064400000000000000000000105331046102023000202450ustar 00000000000000/* Copyright 2016 Brian Smith. * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include "internal.h" #include "../../internal.h" OPENSSL_STATIC_ASSERT(BN_MONT_CTX_N0_LIMBS == 1 || BN_MONT_CTX_N0_LIMBS == 2, "BN_MONT_CTX_N0_LIMBS value is invalid"); OPENSSL_STATIC_ASSERT(sizeof(BN_ULONG) * BN_MONT_CTX_N0_LIMBS == sizeof(uint64_t), "uint64_t is insufficient precision for n0"); // LG_LITTLE_R is log_2(r). #define LG_LITTLE_R (BN_MONT_CTX_N0_LIMBS * BN_BITS2) // bn_neg_inv_r_mod_n_u64 calculates the -1/n mod r; i.e. it calculates |v| // such that u*r - v*n == 1. |r| is the constant defined in |bn_mont_n0|. |n| // must be odd. 
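//
// For intuition, a toy example with r = 2^4 = 16 instead of 2^64: for n = 7
// the result is v = 9, since u*r - v*n == 4*16 - 9*7 == 1, i.e.
// n*v == 63 == -1 (mod 16).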
// // This is derived from |xbinGCD| in Henry S. Warren, Jr.'s "Montgomery // Multiplication" (http://www.hackersdelight.org/MontgomeryMultiplication.pdf). // It is very similar to the MODULAR-INVERSE function in Stephen R. Dussé's and // Burton S. Kaliski Jr.'s "A Cryptographic Library for the Motorola DSP56000" // (http://link.springer.com/chapter/10.1007%2F3-540-46877-3_21). // // This is inspired by Joppe W. Bos's "Constant Time Modular Inversion" // (http://www.joppebos.com/files/CTInversion.pdf) so that the inversion is // constant-time with respect to |n|. We assume uint64_t additions, // subtractions, shifts, and bitwise operations are all constant time, which // may be a large leap of faith on 32-bit targets. We avoid division and // multiplication, which tend to be the most problematic in terms of timing // leaks. // // Most GCD implementations return values such that |u*r + v*n == 1|, so the // caller would have to negate the resultant |v| for the purpose of Montgomery // multiplication. This implementation does the negation implicitly by doing // the computations as a difference instead of a sum. uint64_t bn_neg_inv_mod_r_u64(uint64_t n) { dev_assert_secret(n % 2 == 1); // alpha == 2**(lg r - 1) == r / 2. static const uint64_t alpha = UINT64_C(1) << (LG_LITTLE_R - 1); const uint64_t beta = n; uint64_t u = 1; uint64_t v = 0; // The invariant maintained from here on is: // 2**(lg r - i) == u*2*alpha - v*beta. for (size_t i = 0; i < LG_LITTLE_R; ++i) { #if BN_BITS2 == 64 && defined(BN_ULLONG) dev_assert_secret((BN_ULLONG)(1) << (LG_LITTLE_R - i) == ((BN_ULLONG)u * 2 * alpha) - ((BN_ULLONG)v * beta)); #endif // Delete a common factor of 2 in u and v if |u| is even. Otherwise, set // |u = (u + beta) / 2| and |v = (v / 2) + alpha|. uint64_t u_is_odd = UINT64_C(0) - (u & 1); // Either 0xff..ff or 0. // The addition can overflow, so use Dietz's method for it. // // Dietz calculates (x+y)/2 by (x xor y)>>1 + x&y. This is valid for all // (unsigned) x and y, even when x+y overflows. Evidence for 32-bit values // (embedded in 64 bits to so that overflow can be ignored): // // (declare-fun x () (_ BitVec 64)) // (declare-fun y () (_ BitVec 64)) // (assert (let ( // (one (_ bv1 64)) // (thirtyTwo (_ bv32 64))) // (and // (bvult x (bvshl one thirtyTwo)) // (bvult y (bvshl one thirtyTwo)) // (not (= // (bvadd (bvlshr (bvxor x y) one) (bvand x y)) // (bvlshr (bvadd x y) one))) // ))) // (check-sat) uint64_t beta_if_u_is_odd = beta & u_is_odd; // Either |beta| or 0. u = ((u ^ beta_if_u_is_odd) >> 1) + (u & beta_if_u_is_odd); uint64_t alpha_if_u_is_odd = alpha & u_is_odd; /* Either |alpha| or 0. */ v = (v >> 1) + alpha_if_u_is_odd; } // The invariant now shows that u*r - v*n == 1 since r == 2 * alpha. #if BN_BITS2 == 64 && defined(BN_ULLONG) declassify_assert(1 == ((BN_ULLONG)u * 2 * alpha) - ((BN_ULLONG)v * beta)); #endif return v; } ring-0.17.14/crypto/fipsmodule/ec/asm/p256-armv8-asm.pl000064400000000000000000001204221046102023000204360ustar 00000000000000#! /usr/bin/env perl # Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. # ==================================================================== # # ECP_NISTZ256 module for ARMv8. # # February 2015. # # Original ECP_NISTZ256 submission targeting x86_64 is detailed in # http://eprint.iacr.org/2013/816. # # with/without -DECP_NISTZ256_ASM # Apple A7 +190-360% # Cortex-A53 +190-400% # Cortex-A57 +190-350% # Denver +230-400% # # Ranges denote minimum and maximum improvement coefficients depending # on benchmark. Lower coefficients are for ECDSA sign, server-side # operation. Keep in mind that +400% means 5x improvement. # The first two arguments should always be the flavour and output file path. if ($#ARGV < 1) { die "Not enough arguments provided. Two arguments are necessary: the flavour and the output file path."; } $flavour = shift; $output = shift; $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open OUT,"| \"$^X\" $xlate $flavour $output"; *STDOUT=*OUT; { my ($rp,$ap,$bp,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3, $acc0,$acc1,$acc2,$acc3,$acc4,$acc5) = map("x$_",(0..17,19,20)); my ($acc6,$acc7)=($ap,$bp); # used in __ecp_nistz256_sqr_mont $code.=<<___; .section .rodata .align 5 .Lpoly: .quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 .LRR: // 2^512 mod P precomputed for NIST P256 polynomial .quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd .Lone_mont: .quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe .Lone: .quad 1,0,0,0 .Lord: .quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 .LordK: .quad 0xccd1c8aaee00bc4f .asciz "ECP_NISTZ256 for ARMv8, CRYPTOGAMS by " .text // void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4], // const BN_ULONG x2[4]); .globl ecp_nistz256_mul_mont .type ecp_nistz256_mul_mont,%function .align 4 ecp_nistz256_mul_mont: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-32]! add x29,sp,#0 stp x19,x20,[sp,#16] ldr $bi,[$bp] // bp[0] ldp $a0,$a1,[$ap] ldp $a2,$a3,[$ap,#16] adrp $poly3,:pg_hi21:.Lpoly add $poly3,$poly3,:lo12:.Lpoly ldr $poly1,[$poly3,#8] ldr $poly3,[$poly3,#24] bl __ecp_nistz256_mul_mont ldp x19,x20,[sp,#16] ldp x29,x30,[sp],#32 AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont // void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); .globl ecp_nistz256_sqr_mont .type ecp_nistz256_sqr_mont,%function .align 4 ecp_nistz256_sqr_mont: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-32]! add x29,sp,#0 stp x19,x20,[sp,#16] ldp $a0,$a1,[$ap] ldp $a2,$a3,[$ap,#16] adrp $poly3,:pg_hi21:.Lpoly add $poly3,$poly3,:lo12:.Lpoly ldr $poly1,[$poly3,#8] ldr $poly3,[$poly3,#24] bl __ecp_nistz256_sqr_mont ldp x19,x20,[sp,#16] ldp x29,x30,[sp],#32 AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont // void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]); .globl ecp_nistz256_neg .type ecp_nistz256_neg,%function .align 4 ecp_nistz256_neg: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! 
add x29,sp,#0 mov $bp,$ap mov $acc0,xzr // a = 0 mov $acc1,xzr mov $acc2,xzr mov $acc3,xzr adrp $poly3,:pg_hi21:.Lpoly add $poly3,$poly3,:lo12:.Lpoly ldr $poly1,[$poly3,#8] ldr $poly3,[$poly3,#24] bl __ecp_nistz256_sub_from ldp x29,x30,[sp],#16 AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_neg,.-ecp_nistz256_neg // note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded // to $a0-$a3 and b[0] - to $bi .type __ecp_nistz256_mul_mont,%function .align 4 __ecp_nistz256_mul_mont: mul $acc0,$a0,$bi // a[0]*b[0] umulh $t0,$a0,$bi mul $acc1,$a1,$bi // a[1]*b[0] umulh $t1,$a1,$bi mul $acc2,$a2,$bi // a[2]*b[0] umulh $t2,$a2,$bi mul $acc3,$a3,$bi // a[3]*b[0] umulh $t3,$a3,$bi ldr $bi,[$bp,#8] // b[1] adds $acc1,$acc1,$t0 // accumulate high parts of multiplication lsl $t0,$acc0,#32 adcs $acc2,$acc2,$t1 lsr $t1,$acc0,#32 adcs $acc3,$acc3,$t2 adc $acc4,xzr,$t3 mov $acc5,xzr ___ for($i=1;$i<4;$i++) { # Reduction iteration is normally performed by accumulating # result of multiplication of modulus by "magic" digit [and # omitting least significant word, which is guaranteed to # be 0], but thanks to special form of modulus and "magic" # digit being equal to least significant word, it can be # performed with additions and subtractions alone. Indeed: # # ffff0001.00000000.0000ffff.ffffffff # * abcdefgh # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh # # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we # rewrite above as: # # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000 # - 0000abcd.efgh0000.00000000.00000000.abcdefgh # # or marking redundant operations: # # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.-------- # + abcdefgh.abcdefgh.0000abcd.efgh0000.-------- # - 0000abcd.efgh0000.--------.--------.-------- $code.=<<___; subs $t2,$acc0,$t0 // "*0xffff0001" sbc $t3,$acc0,$t1 adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] mul $t0,$a0,$bi // lo(a[0]*b[i]) adcs $acc1,$acc2,$t1 mul $t1,$a1,$bi // lo(a[1]*b[i]) adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 mul $t2,$a2,$bi // lo(a[2]*b[i]) adcs $acc3,$acc4,$t3 mul $t3,$a3,$bi // lo(a[3]*b[i]) adc $acc4,$acc5,xzr adds $acc0,$acc0,$t0 // accumulate low parts of multiplication umulh $t0,$a0,$bi // hi(a[0]*b[i]) adcs $acc1,$acc1,$t1 umulh $t1,$a1,$bi // hi(a[1]*b[i]) adcs $acc2,$acc2,$t2 umulh $t2,$a2,$bi // hi(a[2]*b[i]) adcs $acc3,$acc3,$t3 umulh $t3,$a3,$bi // hi(a[3]*b[i]) adc $acc4,$acc4,xzr ___ $code.=<<___ if ($i<3); ldr $bi,[$bp,#8*($i+1)] // b[$i+1] ___ $code.=<<___; adds $acc1,$acc1,$t0 // accumulate high parts of multiplication lsl $t0,$acc0,#32 adcs $acc2,$acc2,$t1 lsr $t1,$acc0,#32 adcs $acc3,$acc3,$t2 adcs $acc4,$acc4,$t3 adc $acc5,xzr,xzr ___ } $code.=<<___; // last reduction subs $t2,$acc0,$t0 // "*0xffff0001" sbc $t3,$acc0,$t1 adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] adcs $acc1,$acc2,$t1 adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 adcs $acc3,$acc4,$t3 adc $acc4,$acc5,xzr adds $t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus sbcs $t1,$acc1,$poly1 sbcs $t2,$acc2,xzr sbcs $t3,$acc3,$poly3 sbcs xzr,$acc4,xzr // did it borrow? csel $acc0,$acc0,$t0,lo // ret = borrow ? 
ret : ret-modulus csel $acc1,$acc1,$t1,lo csel $acc2,$acc2,$t2,lo stp $acc0,$acc1,[$rp] csel $acc3,$acc3,$t3,lo stp $acc2,$acc3,[$rp,#16] ret .size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont // note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded // to $a0-$a3 .type __ecp_nistz256_sqr_mont,%function .align 4 __ecp_nistz256_sqr_mont: // | | | | | |a1*a0| | // | | | | |a2*a0| | | // | |a3*a2|a3*a0| | | | // | | | |a2*a1| | | | // | | |a3*a1| | | | | // *| | | | | | | | 2| // +|a3*a3|a2*a2|a1*a1|a0*a0| // |--+--+--+--+--+--+--+--| // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx // // "can't overflow" below mark carrying into high part of // multiplication result, which can't overflow, because it // can never be all ones. mul $acc1,$a1,$a0 // a[1]*a[0] umulh $t1,$a1,$a0 mul $acc2,$a2,$a0 // a[2]*a[0] umulh $t2,$a2,$a0 mul $acc3,$a3,$a0 // a[3]*a[0] umulh $acc4,$a3,$a0 adds $acc2,$acc2,$t1 // accumulate high parts of multiplication mul $t0,$a2,$a1 // a[2]*a[1] umulh $t1,$a2,$a1 adcs $acc3,$acc3,$t2 mul $t2,$a3,$a1 // a[3]*a[1] umulh $t3,$a3,$a1 adc $acc4,$acc4,xzr // can't overflow mul $acc5,$a3,$a2 // a[3]*a[2] umulh $acc6,$a3,$a2 adds $t1,$t1,$t2 // accumulate high parts of multiplication mul $acc0,$a0,$a0 // a[0]*a[0] adc $t2,$t3,xzr // can't overflow adds $acc3,$acc3,$t0 // accumulate low parts of multiplication umulh $a0,$a0,$a0 adcs $acc4,$acc4,$t1 mul $t1,$a1,$a1 // a[1]*a[1] adcs $acc5,$acc5,$t2 umulh $a1,$a1,$a1 adc $acc6,$acc6,xzr // can't overflow adds $acc1,$acc1,$acc1 // acc[1-6]*=2 mul $t2,$a2,$a2 // a[2]*a[2] adcs $acc2,$acc2,$acc2 umulh $a2,$a2,$a2 adcs $acc3,$acc3,$acc3 mul $t3,$a3,$a3 // a[3]*a[3] adcs $acc4,$acc4,$acc4 umulh $a3,$a3,$a3 adcs $acc5,$acc5,$acc5 adcs $acc6,$acc6,$acc6 adc $acc7,xzr,xzr adds $acc1,$acc1,$a0 // +a[i]*a[i] adcs $acc2,$acc2,$t1 adcs $acc3,$acc3,$a1 adcs $acc4,$acc4,$t2 adcs $acc5,$acc5,$a2 lsl $t0,$acc0,#32 adcs $acc6,$acc6,$t3 lsr $t1,$acc0,#32 adc $acc7,$acc7,$a3 ___ for($i=0;$i<3;$i++) { # reductions, see commentary in # multiplication for details $code.=<<___; subs $t2,$acc0,$t0 // "*0xffff0001" sbc $t3,$acc0,$t1 adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] adcs $acc1,$acc2,$t1 lsl $t0,$acc0,#32 adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 lsr $t1,$acc0,#32 adc $acc3,$t3,xzr // can't overflow ___ } $code.=<<___; subs $t2,$acc0,$t0 // "*0xffff0001" sbc $t3,$acc0,$t1 adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] adcs $acc1,$acc2,$t1 adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 adc $acc3,$t3,xzr // can't overflow adds $acc0,$acc0,$acc4 // accumulate upper half adcs $acc1,$acc1,$acc5 adcs $acc2,$acc2,$acc6 adcs $acc3,$acc3,$acc7 adc $acc4,xzr,xzr adds $t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus sbcs $t1,$acc1,$poly1 sbcs $t2,$acc2,xzr sbcs $t3,$acc3,$poly3 sbcs xzr,$acc4,xzr // did it borrow? csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus csel $acc1,$acc1,$t1,lo csel $acc2,$acc2,$t2,lo stp $acc0,$acc1,[$rp] csel $acc3,$acc3,$t3,lo stp $acc2,$acc3,[$rp,#16] ret .size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont // Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to // $a0-$a3 and $t0-$t3. This is done because it's used in multiple // contexts, e.g. in multiplication by 2 and 3... 
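//
// Roughly, in C-like terms, the helper computes a modular addition:
//
//    sum = a + b               // 257-bit result; the carry is kept in a
//                              // scratch register
//    tmp = sum - p             // trial subtraction of the P-256 prime
//    ret = borrow ? sum : tmp  // csel picks the in-range value with no
//                              // secret-dependent branch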
.type __ecp_nistz256_add_to,%function .align 4 __ecp_nistz256_add_to: adds $acc0,$acc0,$t0 // ret = a+b adcs $acc1,$acc1,$t1 adcs $acc2,$acc2,$t2 adcs $acc3,$acc3,$t3 adc $ap,xzr,xzr // zap $ap adds $t0,$acc0,#1 // subs $t0,$a0,#-1 // tmp = ret-modulus sbcs $t1,$acc1,$poly1 sbcs $t2,$acc2,xzr sbcs $t3,$acc3,$poly3 sbcs xzr,$ap,xzr // did subtraction borrow? csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus csel $acc1,$acc1,$t1,lo csel $acc2,$acc2,$t2,lo stp $acc0,$acc1,[$rp] csel $acc3,$acc3,$t3,lo stp $acc2,$acc3,[$rp,#16] ret .size __ecp_nistz256_add_to,.-__ecp_nistz256_add_to .type __ecp_nistz256_sub_from,%function .align 4 __ecp_nistz256_sub_from: ldp $t0,$t1,[$bp] ldp $t2,$t3,[$bp,#16] subs $acc0,$acc0,$t0 // ret = a-b sbcs $acc1,$acc1,$t1 sbcs $acc2,$acc2,$t2 sbcs $acc3,$acc3,$t3 sbc $ap,xzr,xzr // zap $ap subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus adcs $t1,$acc1,$poly1 adcs $t2,$acc2,xzr adc $t3,$acc3,$poly3 cmp $ap,xzr // did subtraction borrow? csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret csel $acc1,$acc1,$t1,eq csel $acc2,$acc2,$t2,eq stp $acc0,$acc1,[$rp] csel $acc3,$acc3,$t3,eq stp $acc2,$acc3,[$rp,#16] ret .size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from .type __ecp_nistz256_sub_morf,%function .align 4 __ecp_nistz256_sub_morf: ldp $t0,$t1,[$bp] ldp $t2,$t3,[$bp,#16] subs $acc0,$t0,$acc0 // ret = b-a sbcs $acc1,$t1,$acc1 sbcs $acc2,$t2,$acc2 sbcs $acc3,$t3,$acc3 sbc $ap,xzr,xzr // zap $ap subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus adcs $t1,$acc1,$poly1 adcs $t2,$acc2,xzr adc $t3,$acc3,$poly3 cmp $ap,xzr // did subtraction borrow? csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret csel $acc1,$acc1,$t1,eq csel $acc2,$acc2,$t2,eq stp $acc0,$acc1,[$rp] csel $acc3,$acc3,$t3,eq stp $acc2,$acc3,[$rp,#16] ret .size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf .type __ecp_nistz256_div_by_2,%function .align 4 __ecp_nistz256_div_by_2: subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = a+modulus adcs $t1,$acc1,$poly1 adcs $t2,$acc2,xzr adcs $t3,$acc3,$poly3 adc $ap,xzr,xzr // zap $ap tst $acc0,#1 // is a even? csel $acc0,$acc0,$t0,eq // ret = even ? a : a+modulus csel $acc1,$acc1,$t1,eq csel $acc2,$acc2,$t2,eq csel $acc3,$acc3,$t3,eq csel $ap,xzr,$ap,eq lsr $acc0,$acc0,#1 // ret >>= 1 orr $acc0,$acc0,$acc1,lsl#63 lsr $acc1,$acc1,#1 orr $acc1,$acc1,$acc2,lsl#63 lsr $acc2,$acc2,#1 orr $acc2,$acc2,$acc3,lsl#63 lsr $acc3,$acc3,#1 stp $acc0,$acc1,[$rp] orr $acc3,$acc3,$ap,lsl#63 stp $acc2,$acc3,[$rp,#16] ret .size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2 ___ ######################################################################## # following subroutines are "literal" implementation of those found in # ecp_nistz256.c # ######################################################################## # void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); # { my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3)); # above map() describes stack layout with 4 temporary # 256-bit vectors on top. my ($rp_real,$ap_real) = map("x$_",(21,22)); $code.=<<___; .globl ecp_nistz256_point_double .type ecp_nistz256_point_double,%function .align 5 ecp_nistz256_point_double: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-96]! 
add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] sub sp,sp,#32*4 .Ldouble_shortcut: ldp $acc0,$acc1,[$ap,#32] mov $rp_real,$rp ldp $acc2,$acc3,[$ap,#48] mov $ap_real,$ap adrp $poly3,:pg_hi21:.Lpoly add $poly3,$poly3,:lo12:.Lpoly ldr $poly1,[$poly3,#8] mov $t0,$acc0 ldr $poly3,[$poly3,#24] mov $t1,$acc1 ldp $a0,$a1,[$ap_real,#64] // forward load for p256_sqr_mont mov $t2,$acc2 mov $t3,$acc3 ldp $a2,$a3,[$ap_real,#64+16] add $rp,sp,#$S bl __ecp_nistz256_add_to // p256_mul_by_2(S, in_y); add $rp,sp,#$Zsqr bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z); ldp $t0,$t1,[$ap_real] ldp $t2,$t3,[$ap_real,#16] mov $a0,$acc0 // put Zsqr aside for p256_sub mov $a1,$acc1 mov $a2,$acc2 mov $a3,$acc3 add $rp,sp,#$M bl __ecp_nistz256_add_to // p256_add(M, Zsqr, in_x); add $bp,$ap_real,#0 mov $acc0,$a0 // restore Zsqr mov $acc1,$a1 ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont mov $acc2,$a2 mov $acc3,$a3 ldp $a2,$a3,[sp,#$S+16] add $rp,sp,#$Zsqr bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr); add $rp,sp,#$S bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S); ldr $bi,[$ap_real,#32] ldp $a0,$a1,[$ap_real,#64] ldp $a2,$a3,[$ap_real,#64+16] add $bp,$ap_real,#32 add $rp,sp,#$tmp0 bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y); mov $t0,$acc0 mov $t1,$acc1 ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont mov $t2,$acc2 mov $t3,$acc3 ldp $a2,$a3,[sp,#$S+16] add $rp,$rp_real,#64 bl __ecp_nistz256_add_to // p256_mul_by_2(res_z, tmp0); add $rp,sp,#$tmp0 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S); ldr $bi,[sp,#$Zsqr] // forward load for p256_mul_mont ldp $a0,$a1,[sp,#$M] ldp $a2,$a3,[sp,#$M+16] add $rp,$rp_real,#32 bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0); add $bp,sp,#$Zsqr add $rp,sp,#$M bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr); mov $t0,$acc0 // duplicate M mov $t1,$acc1 mov $t2,$acc2 mov $t3,$acc3 mov $a0,$acc0 // put M aside mov $a1,$acc1 mov $a2,$acc2 mov $a3,$acc3 add $rp,sp,#$M bl __ecp_nistz256_add_to mov $t0,$a0 // restore M mov $t1,$a1 ldr $bi,[$ap_real] // forward load for p256_mul_mont mov $t2,$a2 ldp $a0,$a1,[sp,#$S] mov $t3,$a3 ldp $a2,$a3,[sp,#$S+16] bl __ecp_nistz256_add_to // p256_mul_by_3(M, M); add $bp,$ap_real,#0 add $rp,sp,#$S bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x); mov $t0,$acc0 mov $t1,$acc1 ldp $a0,$a1,[sp,#$M] // forward load for p256_sqr_mont mov $t2,$acc2 mov $t3,$acc3 ldp $a2,$a3,[sp,#$M+16] add $rp,sp,#$tmp0 bl __ecp_nistz256_add_to // p256_mul_by_2(tmp0, S); add $rp,$rp_real,#0 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M); add $bp,sp,#$tmp0 bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0); add $bp,sp,#$S add $rp,sp,#$S bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x); ldr $bi,[sp,#$M] mov $a0,$acc0 // copy S mov $a1,$acc1 mov $a2,$acc2 mov $a3,$acc3 add $bp,sp,#$M bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M); add $bp,$rp_real,#32 add $rp,$rp_real,#32 bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y); add sp,x29,#0 // destroy frame ldp x19,x20,[x29,#16] ldp x21,x22,[x29,#32] ldp x29,x30,[sp],#96 AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_point_double,.-ecp_nistz256_point_double ___ } ######################################################################## # void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, # const P256_POINT *in2); { my ($res_x,$res_y,$res_z, $H,$Hsqr,$R,$Rsqr,$Hcub, $U1,$U2,$S1,$S2)=map(32*$_,(0..11)); my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); # above map() describes stack layout with 12 temporary # 
256-bit vectors on top. my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp0,$temp1,$temp2)=map("x$_",(21..28)); $code.=<<___; .globl ecp_nistz256_point_add .type ecp_nistz256_point_add,%function .align 5 ecp_nistz256_point_add: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-96]! add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] sub sp,sp,#32*12 ldp $a0,$a1,[$bp,#64] // in2_z ldp $a2,$a3,[$bp,#64+16] mov $rp_real,$rp mov $ap_real,$ap mov $bp_real,$bp adrp $poly3,:pg_hi21:.Lpoly add $poly3,$poly3,:lo12:.Lpoly ldr $poly1,[$poly3,#8] ldr $poly3,[$poly3,#24] orr $t0,$a0,$a1 orr $t2,$a2,$a3 orr $in2infty,$t0,$t2 cmp $in2infty,#0 csetm $in2infty,ne // ~in2infty add $rp,sp,#$Z2sqr bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z); ldp $a0,$a1,[$ap_real,#64] // in1_z ldp $a2,$a3,[$ap_real,#64+16] orr $t0,$a0,$a1 orr $t2,$a2,$a3 orr $in1infty,$t0,$t2 cmp $in1infty,#0 csetm $in1infty,ne // ~in1infty add $rp,sp,#$Z1sqr bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); ldr $bi,[$bp_real,#64] ldp $a0,$a1,[sp,#$Z2sqr] ldp $a2,$a3,[sp,#$Z2sqr+16] add $bp,$bp_real,#64 add $rp,sp,#$S1 bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z); ldr $bi,[$ap_real,#64] ldp $a0,$a1,[sp,#$Z1sqr] ldp $a2,$a3,[sp,#$Z1sqr+16] add $bp,$ap_real,#64 add $rp,sp,#$S2 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); ldr $bi,[$ap_real,#32] ldp $a0,$a1,[sp,#$S1] ldp $a2,$a3,[sp,#$S1+16] add $bp,$ap_real,#32 add $rp,sp,#$S1 bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y); ldr $bi,[$bp_real,#32] ldp $a0,$a1,[sp,#$S2] ldp $a2,$a3,[sp,#$S2+16] add $bp,$bp_real,#32 add $rp,sp,#$S2 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); add $bp,sp,#$S1 ldr $bi,[sp,#$Z2sqr] // forward load for p256_mul_mont ldp $a0,$a1,[$ap_real] ldp $a2,$a3,[$ap_real,#16] add $rp,sp,#$R bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1); orr $acc0,$acc0,$acc1 // see if result is zero orr $acc2,$acc2,$acc3 orr $temp0,$acc0,$acc2 // ~is_equal(S1,S2) add $bp,sp,#$Z2sqr add $rp,sp,#$U1 bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr); ldr $bi,[sp,#$Z1sqr] ldp $a0,$a1,[$bp_real] ldp $a2,$a3,[$bp_real,#16] add $bp,sp,#$Z1sqr add $rp,sp,#$U2 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr); add $bp,sp,#$U1 ldp $a0,$a1,[sp,#$R] // forward load for p256_sqr_mont ldp $a2,$a3,[sp,#$R+16] add $rp,sp,#$H bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1); orr $acc0,$acc0,$acc1 // see if result is zero orr $acc2,$acc2,$acc3 orr $acc0,$acc0,$acc2 // ~is_equal(U1,U2) mvn $temp1,$in1infty // -1/0 -> 0/-1 mvn $temp2,$in2infty // -1/0 -> 0/-1 orr $acc0,$acc0,$temp1 orr $acc0,$acc0,$temp2 orr $acc0,$acc0,$temp0 cbnz $acc0,.Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2)) .Ladd_double: mov $ap,$ap_real mov $rp,$rp_real ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] add sp,sp,#256 // #256 is from #32*(12-4). 
difference in stack frames b .Ldouble_shortcut .align 4 .Ladd_proceed: add $rp,sp,#$Rsqr bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); ldr $bi,[$ap_real,#64] ldp $a0,$a1,[sp,#$H] ldp $a2,$a3,[sp,#$H+16] add $bp,$ap_real,#64 add $rp,sp,#$res_z bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); ldp $a0,$a1,[sp,#$H] ldp $a2,$a3,[sp,#$H+16] add $rp,sp,#$Hsqr bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); ldr $bi,[$bp_real,#64] ldp $a0,$a1,[sp,#$res_z] ldp $a2,$a3,[sp,#$res_z+16] add $bp,$bp_real,#64 add $rp,sp,#$res_z bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z); ldr $bi,[sp,#$H] ldp $a0,$a1,[sp,#$Hsqr] ldp $a2,$a3,[sp,#$Hsqr+16] add $bp,sp,#$H add $rp,sp,#$Hcub bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); ldr $bi,[sp,#$Hsqr] ldp $a0,$a1,[sp,#$U1] ldp $a2,$a3,[sp,#$U1+16] add $bp,sp,#$Hsqr add $rp,sp,#$U2 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr); mov $t0,$acc0 mov $t1,$acc1 mov $t2,$acc2 mov $t3,$acc3 add $rp,sp,#$Hsqr bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); add $bp,sp,#$Rsqr add $rp,sp,#$res_x bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); add $bp,sp,#$Hcub bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); add $bp,sp,#$U2 ldr $bi,[sp,#$Hcub] // forward load for p256_mul_mont ldp $a0,$a1,[sp,#$S1] ldp $a2,$a3,[sp,#$S1+16] add $rp,sp,#$res_y bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); add $bp,sp,#$Hcub add $rp,sp,#$S2 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub); ldr $bi,[sp,#$R] ldp $a0,$a1,[sp,#$res_y] ldp $a2,$a3,[sp,#$res_y+16] add $bp,sp,#$R add $rp,sp,#$res_y bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); add $bp,sp,#$S2 bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); ldp $a0,$a1,[sp,#$res_x] // res ldp $a2,$a3,[sp,#$res_x+16] ldp $t0,$t1,[$bp_real] // in2 ldp $t2,$t3,[$bp_real,#16] ___ for($i=0;$i<64;$i+=32) { # conditional moves $code.=<<___; ldp $acc0,$acc1,[$ap_real,#$i] // in1 cmp $in1infty,#0 // ~$in1intfy, remember? ldp $acc2,$acc3,[$ap_real,#$i+16] csel $t0,$a0,$t0,ne csel $t1,$a1,$t1,ne ldp $a0,$a1,[sp,#$res_x+$i+32] // res csel $t2,$a2,$t2,ne csel $t3,$a3,$t3,ne cmp $in2infty,#0 // ~$in2intfy, remember? ldp $a2,$a3,[sp,#$res_x+$i+48] csel $acc0,$t0,$acc0,ne csel $acc1,$t1,$acc1,ne ldp $t0,$t1,[$bp_real,#$i+32] // in2 csel $acc2,$t2,$acc2,ne csel $acc3,$t3,$acc3,ne ldp $t2,$t3,[$bp_real,#$i+48] stp $acc0,$acc1,[$rp_real,#$i] stp $acc2,$acc3,[$rp_real,#$i+16] ___ } $code.=<<___; ldp $acc0,$acc1,[$ap_real,#$i] // in1 cmp $in1infty,#0 // ~$in1intfy, remember? ldp $acc2,$acc3,[$ap_real,#$i+16] csel $t0,$a0,$t0,ne csel $t1,$a1,$t1,ne csel $t2,$a2,$t2,ne csel $t3,$a3,$t3,ne cmp $in2infty,#0 // ~$in2intfy, remember? csel $acc0,$t0,$acc0,ne csel $acc1,$t1,$acc1,ne csel $acc2,$t2,$acc2,ne csel $acc3,$t3,$acc3,ne stp $acc0,$acc1,[$rp_real,#$i] stp $acc2,$acc3,[$rp_real,#$i+16] .Ladd_done: add sp,x29,#0 // destroy frame ldp x19,x20,[x29,#16] ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_point_add,.-ecp_nistz256_point_add ___ } ######################################################################## # void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1, # const P256_POINT_AFFINE *in2); { my ($res_x,$res_y,$res_z, $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9)); my $Z1sqr = $S2; # above map() describes stack layout with 10 temporary # 256-bit vectors on top. 
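# Note that each temporary is one 32-byte (4x64-bit) field element, so the
# ten slots occupy offsets 0,32,...,288 below the frame created by
# "sub sp,sp,#32*10" in the prologue. $Z1sqr reuses the $S2 slot: Z1sqr is
# loaded into registers before S2 is stored there.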
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26)); $code.=<<___; .globl ecp_nistz256_point_add_affine .type ecp_nistz256_point_add_affine,%function .align 5 ecp_nistz256_point_add_affine: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-80]! add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] sub sp,sp,#32*10 mov $rp_real,$rp mov $ap_real,$ap mov $bp_real,$bp adrp $poly3,:pg_hi21:.Lpoly add $poly3,$poly3,:lo12:.Lpoly ldr $poly1,[$poly3,#8] ldr $poly3,[$poly3,#24] ldp $a0,$a1,[$ap,#64] // in1_z ldp $a2,$a3,[$ap,#64+16] orr $t0,$a0,$a1 orr $t2,$a2,$a3 orr $in1infty,$t0,$t2 cmp $in1infty,#0 csetm $in1infty,ne // ~in1infty ldp $acc0,$acc1,[$bp] // in2_x ldp $acc2,$acc3,[$bp,#16] ldp $t0,$t1,[$bp,#32] // in2_y ldp $t2,$t3,[$bp,#48] orr $acc0,$acc0,$acc1 orr $acc2,$acc2,$acc3 orr $t0,$t0,$t1 orr $t2,$t2,$t3 orr $acc0,$acc0,$acc2 orr $t0,$t0,$t2 orr $in2infty,$acc0,$t0 cmp $in2infty,#0 csetm $in2infty,ne // ~in2infty add $rp,sp,#$Z1sqr bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); mov $a0,$acc0 mov $a1,$acc1 mov $a2,$acc2 mov $a3,$acc3 ldr $bi,[$bp_real] add $bp,$bp_real,#0 add $rp,sp,#$U2 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x); add $bp,$ap_real,#0 ldr $bi,[$ap_real,#64] // forward load for p256_mul_mont ldp $a0,$a1,[sp,#$Z1sqr] ldp $a2,$a3,[sp,#$Z1sqr+16] add $rp,sp,#$H bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x); add $bp,$ap_real,#64 add $rp,sp,#$S2 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); ldr $bi,[$ap_real,#64] ldp $a0,$a1,[sp,#$H] ldp $a2,$a3,[sp,#$H+16] add $bp,$ap_real,#64 add $rp,sp,#$res_z bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); ldr $bi,[$bp_real,#32] ldp $a0,$a1,[sp,#$S2] ldp $a2,$a3,[sp,#$S2+16] add $bp,$bp_real,#32 add $rp,sp,#$S2 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); add $bp,$ap_real,#32 ldp $a0,$a1,[sp,#$H] // forward load for p256_sqr_mont ldp $a2,$a3,[sp,#$H+16] add $rp,sp,#$R bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y); add $rp,sp,#$Hsqr bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); ldp $a0,$a1,[sp,#$R] ldp $a2,$a3,[sp,#$R+16] add $rp,sp,#$Rsqr bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); ldr $bi,[sp,#$H] ldp $a0,$a1,[sp,#$Hsqr] ldp $a2,$a3,[sp,#$Hsqr+16] add $bp,sp,#$H add $rp,sp,#$Hcub bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); ldr $bi,[$ap_real] ldp $a0,$a1,[sp,#$Hsqr] ldp $a2,$a3,[sp,#$Hsqr+16] add $bp,$ap_real,#0 add $rp,sp,#$U2 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr); mov $t0,$acc0 mov $t1,$acc1 mov $t2,$acc2 mov $t3,$acc3 add $rp,sp,#$Hsqr bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); add $bp,sp,#$Rsqr add $rp,sp,#$res_x bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); add $bp,sp,#$Hcub bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); add $bp,sp,#$U2 ldr $bi,[$ap_real,#32] // forward load for p256_mul_mont ldp $a0,$a1,[sp,#$Hcub] ldp $a2,$a3,[sp,#$Hcub+16] add $rp,sp,#$res_y bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); add $bp,$ap_real,#32 add $rp,sp,#$S2 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub); ldr $bi,[sp,#$R] ldp $a0,$a1,[sp,#$res_y] ldp $a2,$a3,[sp,#$res_y+16] add $bp,sp,#$R add $rp,sp,#$res_y bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); add $bp,sp,#$S2 bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); ldp $a0,$a1,[sp,#$res_x] // res ldp $a2,$a3,[sp,#$res_x+16] ldp $t0,$t1,[$bp_real] // in2 ldp $t2,$t3,[$bp_real,#16] ___ for($i=0;$i<64;$i+=32) 
{ # conditional moves $code.=<<___; ldp $acc0,$acc1,[$ap_real,#$i] // in1 cmp $in1infty,#0 // ~$in1intfy, remember? ldp $acc2,$acc3,[$ap_real,#$i+16] csel $t0,$a0,$t0,ne csel $t1,$a1,$t1,ne ldp $a0,$a1,[sp,#$res_x+$i+32] // res csel $t2,$a2,$t2,ne csel $t3,$a3,$t3,ne cmp $in2infty,#0 // ~$in2intfy, remember? ldp $a2,$a3,[sp,#$res_x+$i+48] csel $acc0,$t0,$acc0,ne csel $acc1,$t1,$acc1,ne ldp $t0,$t1,[$bp_real,#$i+32] // in2 csel $acc2,$t2,$acc2,ne csel $acc3,$t3,$acc3,ne ldp $t2,$t3,[$bp_real,#$i+48] stp $acc0,$acc1,[$rp_real,#$i] stp $acc2,$acc3,[$rp_real,#$i+16] ___ $code.=<<___ if ($i == 0); adrp $bp_real,:pg_hi21:.Lone_mont-64 add $bp_real,$bp_real,:lo12:.Lone_mont-64 ___ } $code.=<<___; ldp $acc0,$acc1,[$ap_real,#$i] // in1 cmp $in1infty,#0 // ~$in1intfy, remember? ldp $acc2,$acc3,[$ap_real,#$i+16] csel $t0,$a0,$t0,ne csel $t1,$a1,$t1,ne csel $t2,$a2,$t2,ne csel $t3,$a3,$t3,ne cmp $in2infty,#0 // ~$in2intfy, remember? csel $acc0,$t0,$acc0,ne csel $acc1,$t1,$acc1,ne csel $acc2,$t2,$acc2,ne csel $acc3,$t3,$acc3,ne stp $acc0,$acc1,[$rp_real,#$i] stp $acc2,$acc3,[$rp_real,#$i+16] add sp,x29,#0 // destroy frame ldp x19,x20,[x29,#16] ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x29,x30,[sp],#80 AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine ___ } if (1) { my ($ord0,$ord1) = ($poly1,$poly3); my ($ord2,$ord3,$ordk,$t4) = map("x$_",(21..24)); my $acc7 = $bi; $code.=<<___; //////////////////////////////////////////////////////////////////////// // void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4], // uint64_t b[4]); .globl ecp_nistz256_ord_mul_mont .type ecp_nistz256_ord_mul_mont,%function .align 4 ecp_nistz256_ord_mul_mont: AARCH64_VALID_CALL_TARGET // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. stp x29,x30,[sp,#-64]! 
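// This is a Montgomery multiplication modulo the P-256 group order:
// res = a * b * 2^-256 mod ord, with the order taken from .Lord and the
// per-word Montgomery constant from .LordK loaded below.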
add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] adrp $ordk,:pg_hi21:.Lord add $ordk,$ordk,:lo12:.Lord ldr $bi,[$bp] // bp[0] ldp $a0,$a1,[$ap] ldp $a2,$a3,[$ap,#16] ldp $ord0,$ord1,[$ordk,#0] ldp $ord2,$ord3,[$ordk,#16] ldr $ordk,[$ordk,#32] mul $acc0,$a0,$bi // a[0]*b[0] umulh $t0,$a0,$bi mul $acc1,$a1,$bi // a[1]*b[0] umulh $t1,$a1,$bi mul $acc2,$a2,$bi // a[2]*b[0] umulh $t2,$a2,$bi mul $acc3,$a3,$bi // a[3]*b[0] umulh $acc4,$a3,$bi mul $t4,$acc0,$ordk adds $acc1,$acc1,$t0 // accumulate high parts of multiplication adcs $acc2,$acc2,$t1 adcs $acc3,$acc3,$t2 adc $acc4,$acc4,xzr mov $acc5,xzr ___ for ($i=1;$i<4;$i++) { ################################################################ # ffff0000.ffffffff.yyyyyyyy.zzzzzzzz # * abcdefgh # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx # # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we # rewrite above as: # # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx # - 0000abcd.efgh0000.abcdefgh.00000000.00000000 # + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh $code.=<<___; ldr $bi,[$bp,#8*$i] // b[i] lsl $t0,$t4,#32 subs $acc2,$acc2,$t4 lsr $t1,$t4,#32 sbcs $acc3,$acc3,$t0 sbcs $acc4,$acc4,$t1 sbc $acc5,$acc5,xzr subs xzr,$acc0,#1 umulh $t1,$ord0,$t4 mul $t2,$ord1,$t4 umulh $t3,$ord1,$t4 adcs $t2,$t2,$t1 mul $t0,$a0,$bi adc $t3,$t3,xzr mul $t1,$a1,$bi adds $acc0,$acc1,$t2 mul $t2,$a2,$bi adcs $acc1,$acc2,$t3 mul $t3,$a3,$bi adcs $acc2,$acc3,$t4 adcs $acc3,$acc4,$t4 adc $acc4,$acc5,xzr adds $acc0,$acc0,$t0 // accumulate low parts umulh $t0,$a0,$bi adcs $acc1,$acc1,$t1 umulh $t1,$a1,$bi adcs $acc2,$acc2,$t2 umulh $t2,$a2,$bi adcs $acc3,$acc3,$t3 umulh $t3,$a3,$bi adc $acc4,$acc4,xzr mul $t4,$acc0,$ordk adds $acc1,$acc1,$t0 // accumulate high parts adcs $acc2,$acc2,$t1 adcs $acc3,$acc3,$t2 adcs $acc4,$acc4,$t3 adc $acc5,xzr,xzr ___ } $code.=<<___; lsl $t0,$t4,#32 // last reduction subs $acc2,$acc2,$t4 lsr $t1,$t4,#32 sbcs $acc3,$acc3,$t0 sbcs $acc4,$acc4,$t1 sbc $acc5,$acc5,xzr subs xzr,$acc0,#1 umulh $t1,$ord0,$t4 mul $t2,$ord1,$t4 umulh $t3,$ord1,$t4 adcs $t2,$t2,$t1 adc $t3,$t3,xzr adds $acc0,$acc1,$t2 adcs $acc1,$acc2,$t3 adcs $acc2,$acc3,$t4 adcs $acc3,$acc4,$t4 adc $acc4,$acc5,xzr subs $t0,$acc0,$ord0 // ret -= modulus sbcs $t1,$acc1,$ord1 sbcs $t2,$acc2,$ord2 sbcs $t3,$acc3,$ord3 sbcs xzr,$acc4,xzr csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus csel $acc1,$acc1,$t1,lo csel $acc2,$acc2,$t2,lo stp $acc0,$acc1,[$rp] csel $acc3,$acc3,$t3,lo stp $acc2,$acc3,[$rp,#16] ldp x19,x20,[sp,#16] ldp x21,x22,[sp,#32] ldp x23,x24,[sp,#48] ldr x29,[sp],#64 ret .size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont //////////////////////////////////////////////////////////////////////// // void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4], // uint64_t rep); .globl ecp_nistz256_ord_sqr_mont .type ecp_nistz256_ord_sqr_mont,%function .align 4 ecp_nistz256_ord_sqr_mont: AARCH64_VALID_CALL_TARGET // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. stp x29,x30,[sp,#-64]! 
add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] adrp $ordk,:pg_hi21:.Lord add $ordk,$ordk,:lo12:.Lord ldp $a0,$a1,[$ap] ldp $a2,$a3,[$ap,#16] ldp $ord0,$ord1,[$ordk,#0] ldp $ord2,$ord3,[$ordk,#16] ldr $ordk,[$ordk,#32] b .Loop_ord_sqr .align 4 .Loop_ord_sqr: sub $bp,$bp,#1 //////////////////////////////////////////////////////////////// // | | | | | |a1*a0| | // | | | | |a2*a0| | | // | |a3*a2|a3*a0| | | | // | | | |a2*a1| | | | // | | |a3*a1| | | | | // *| | | | | | | | 2| // +|a3*a3|a2*a2|a1*a1|a0*a0| // |--+--+--+--+--+--+--+--| // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx // // "can't overflow" below mark carrying into high part of // multiplication result, which can't overflow, because it // can never be all ones. mul $acc1,$a1,$a0 // a[1]*a[0] umulh $t1,$a1,$a0 mul $acc2,$a2,$a0 // a[2]*a[0] umulh $t2,$a2,$a0 mul $acc3,$a3,$a0 // a[3]*a[0] umulh $acc4,$a3,$a0 adds $acc2,$acc2,$t1 // accumulate high parts of multiplication mul $t0,$a2,$a1 // a[2]*a[1] umulh $t1,$a2,$a1 adcs $acc3,$acc3,$t2 mul $t2,$a3,$a1 // a[3]*a[1] umulh $t3,$a3,$a1 adc $acc4,$acc4,xzr // can't overflow mul $acc5,$a3,$a2 // a[3]*a[2] umulh $acc6,$a3,$a2 adds $t1,$t1,$t2 // accumulate high parts of multiplication mul $acc0,$a0,$a0 // a[0]*a[0] adc $t2,$t3,xzr // can't overflow adds $acc3,$acc3,$t0 // accumulate low parts of multiplication umulh $a0,$a0,$a0 adcs $acc4,$acc4,$t1 mul $t1,$a1,$a1 // a[1]*a[1] adcs $acc5,$acc5,$t2 umulh $a1,$a1,$a1 adc $acc6,$acc6,xzr // can't overflow adds $acc1,$acc1,$acc1 // acc[1-6]*=2 mul $t2,$a2,$a2 // a[2]*a[2] adcs $acc2,$acc2,$acc2 umulh $a2,$a2,$a2 adcs $acc3,$acc3,$acc3 mul $t3,$a3,$a3 // a[3]*a[3] adcs $acc4,$acc4,$acc4 umulh $a3,$a3,$a3 adcs $acc5,$acc5,$acc5 adcs $acc6,$acc6,$acc6 adc $acc7,xzr,xzr adds $acc1,$acc1,$a0 // +a[i]*a[i] mul $t4,$acc0,$ordk adcs $acc2,$acc2,$t1 adcs $acc3,$acc3,$a1 adcs $acc4,$acc4,$t2 adcs $acc5,$acc5,$a2 adcs $acc6,$acc6,$t3 adc $acc7,$acc7,$a3 ___ for($i=0; $i<4; $i++) { # reductions $code.=<<___; subs xzr,$acc0,#1 umulh $t1,$ord0,$t4 mul $t2,$ord1,$t4 umulh $t3,$ord1,$t4 adcs $t2,$t2,$t1 adc $t3,$t3,xzr adds $acc0,$acc1,$t2 adcs $acc1,$acc2,$t3 adcs $acc2,$acc3,$t4 adc $acc3,xzr,$t4 // can't overflow ___ $code.=<<___ if ($i<3); mul $t3,$acc0,$ordk ___ $code.=<<___; lsl $t0,$t4,#32 subs $acc1,$acc1,$t4 lsr $t1,$t4,#32 sbcs $acc2,$acc2,$t0 sbc $acc3,$acc3,$t1 // can't borrow ___ ($t3,$t4) = ($t4,$t3); } $code.=<<___; adds $acc0,$acc0,$acc4 // accumulate upper half adcs $acc1,$acc1,$acc5 adcs $acc2,$acc2,$acc6 adcs $acc3,$acc3,$acc7 adc $acc4,xzr,xzr subs $t0,$acc0,$ord0 // ret -= modulus sbcs $t1,$acc1,$ord1 sbcs $t2,$acc2,$ord2 sbcs $t3,$acc3,$ord3 sbcs xzr,$acc4,xzr csel $a0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus csel $a1,$acc1,$t1,lo csel $a2,$acc2,$t2,lo csel $a3,$acc3,$t3,lo cbnz $bp,.Loop_ord_sqr stp $a0,$a1,[$rp] stp $a2,$a3,[$rp,#16] ldp x19,x20,[sp,#16] ldp x21,x22,[sp,#32] ldp x23,x24,[sp,#48] ldr x29,[sp],#64 ret .size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont ___ } } ######################################################################## # select subroutines # These select functions are similar to those in p256-x86_64-asm.pl # They load all points in the lookup table # keeping in the output only the one corresponding to the input index. 
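# Roughly, in C-like pseudocode, both selectors amount to:
#
#   acc = 0;
#   for (i = 1; i <= n; i++) {          // n = 16 for w5, 64 for w7
#     mask = (i == index) ? ~0 : 0;     // csetm, no data-dependent branch
#     acc  = mask ? table[i-1] : acc;   // vector "bit" insert under the mask
#   }
#   *val = acc;
#
# Every table entry is read on every call, so the memory access pattern does
# not depend on the (secret) index.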
{ my ($val,$in_t)=map("x$_",(0..1)); my ($index)=("w2"); my ($Idx_ctr,$Val_in, $Mask_64)=("w9", "x10", "x11"); my ($Mask)=("v3"); my ($Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("v$_",(16..21)); my ($T0a,$T0b,$T0c,$T0d,$T0e,$T0f)=map("v$_",(22..27)); $code.=<<___; //////////////////////////////////////////////////////////////////////// // void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index); .globl ecp_nistz256_select_w5 .type ecp_nistz256_select_w5,%function .align 4 ecp_nistz256_select_w5: AARCH64_VALID_CALL_TARGET // $Val_in := $val // $Idx_ctr := 0; loop counter and incremented internal index mov $Val_in, $val mov $Idx_ctr, #0 // [$Ra-$Rf] := 0 movi $Ra.16b, #0 movi $Rb.16b, #0 movi $Rc.16b, #0 movi $Rd.16b, #0 movi $Re.16b, #0 movi $Rf.16b, #0 .Lselect_w5_loop: // Loop 16 times. // Increment index (loop counter); tested at the end of the loop add $Idx_ctr, $Idx_ctr, #1 // [$T0a-$T0f] := Load a (3*256-bit = 6*128-bit) table entry starting at $in_t // and advance $in_t to point to the next entry ld1 {$T0a.2d, $T0b.2d, $T0c.2d, $T0d.2d}, [$in_t],#64 // $Mask_64 := ($Idx_ctr == $index)? All 1s : All 0s cmp $Idx_ctr, $index csetm $Mask_64, eq // continue loading ... ld1 {$T0e.2d, $T0f.2d}, [$in_t],#32 // duplicate mask_64 into Mask (all 0s or all 1s) dup $Mask.2d, $Mask_64 // [$Ra-$Rd] := (Mask == all 1s)? [$T0a-$T0d] : [$Ra-$Rd] // i.e., values in output registers will remain the same if $Idx_ctr != $index bit $Ra.16b, $T0a.16b, $Mask.16b bit $Rb.16b, $T0b.16b, $Mask.16b bit $Rc.16b, $T0c.16b, $Mask.16b bit $Rd.16b, $T0d.16b, $Mask.16b bit $Re.16b, $T0e.16b, $Mask.16b bit $Rf.16b, $T0f.16b, $Mask.16b // If bit #4 is not 0 (i.e. idx_ctr < 16) loop back tbz $Idx_ctr, #4, .Lselect_w5_loop // Write [$Ra-$Rf] to memory at the output pointer st1 {$Ra.2d, $Rb.2d, $Rc.2d, $Rd.2d}, [$Val_in],#64 st1 {$Re.2d, $Rf.2d}, [$Val_in] ret .size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5 //////////////////////////////////////////////////////////////////////// // void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index); .globl ecp_nistz256_select_w7 .type ecp_nistz256_select_w7,%function .align 4 ecp_nistz256_select_w7: AARCH64_VALID_CALL_TARGET // $Idx_ctr := 0; loop counter and incremented internal index mov $Idx_ctr, #0 // [$Ra-$Rf] := 0 movi $Ra.16b, #0 movi $Rb.16b, #0 movi $Rc.16b, #0 movi $Rd.16b, #0 .Lselect_w7_loop: // Loop 64 times. // Increment index (loop counter); tested at the end of the loop add $Idx_ctr, $Idx_ctr, #1 // [$T0a-$T0d] := Load a (2*256-bit = 4*128-bit) table entry starting at $in_t // and advance $in_t to point to the next entry ld1 {$T0a.2d, $T0b.2d, $T0c.2d, $T0d.2d}, [$in_t],#64 // $Mask_64 := ($Idx_ctr == $index)? All 1s : All 0s cmp $Idx_ctr, $index csetm $Mask_64, eq // duplicate mask_64 into Mask (all 0s or all 1s) dup $Mask.2d, $Mask_64 // [$Ra-$Rd] := (Mask == all 1s)? [$T0a-$T0d] : [$Ra-$Rd] // i.e., values in output registers will remain the same if $Idx_ctr != $index bit $Ra.16b, $T0a.16b, $Mask.16b bit $Rb.16b, $T0b.16b, $Mask.16b bit $Rc.16b, $T0c.16b, $Mask.16b bit $Rd.16b, $T0d.16b, $Mask.16b // If bit #6 is not 0 (i.e. 
idx_ctr < 64) loop back tbz $Idx_ctr, #6, .Lselect_w7_loop // Write [$Ra-$Rd] to memory at the output pointer st1 {$Ra.2d, $Rb.2d, $Rc.2d, $Rd.2d}, [$val] ret .size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 ___ } foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/ge; print $_,"\n"; } close STDOUT or die "error closing STDOUT: $!"; # enforce flush ring-0.17.14/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl000064400000000000000000002637151046102023000203540ustar 00000000000000#! /usr/bin/env perl # Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. # Copyright (c) 2014, Intel Corporation. All Rights Reserved. # Copyright (c) 2015 CloudFlare, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3) # (1) Intel Corporation, Israel Development Center, Haifa, Israel # (2) University of Haifa, Israel # (3) CloudFlare, Inc. # # Reference: # S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with # 256 Bit Primes" # Further optimization by : # # this/original with/without -DECP_NISTZ256_ASM(*) # Opteron +15-49% +150-195% # Bulldozer +18-45% +175-240% # P4 +24-46% +100-150% # Westmere +18-34% +87-160% # Sandy Bridge +14-35% +120-185% # Ivy Bridge +11-35% +125-180% # Haswell +10-37% +160-200% # Broadwell +24-58% +210-270% # Atom +20-50% +180-240% # VIA Nano +50-160% +480-480% # # (*) "without -DECP_NISTZ256_ASM" refers to build with # "enable-ec_nistp_64_gcc_128"; # # Ranges denote minimum and maximum improvement coefficients depending # on benchmark. In "this/original" column lower coefficient is for # ECDSA sign, while in "with/without" - for ECDH key agreement, and # higher - for ECDSA sign, relatively fastest server-side operation. # Keep in mind that +100% means 2x improvement. 
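# The first argument is the perlasm flavour and the second the output path;
# a single argument containing a dot is treated as the output file (with the
# flavour left undefined), and nasm/masm/mingw64 flavours (or a .asm output)
# enable the Win64-specific code paths ($win64=1).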
$flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; $avx = 2; $addx = 1; $code.=<<___; .text # The polynomial .section .rodata .align 64 .Lpoly: .quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001 .LOne: .long 1,1,1,1,1,1,1,1 .LTwo: .long 2,2,2,2,2,2,2,2 .LThree: .long 3,3,3,3,3,3,3,3 .LONE_mont: .quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe # Constants for computations modulo ord(p256) .Lord: .quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000 .LordK: .quad 0xccd1c8aaee00bc4f .text ___ { my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11)); my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13"); my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx"); $code.=<<___; ################################################################################ # void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]); .globl ecp_nistz256_neg .type ecp_nistz256_neg,\@function,2 .align 32 ecp_nistz256_neg: .cfi_startproc _CET_ENDBR push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 .Lneg_body: xor $a0, $a0 xor $a1, $a1 xor $a2, $a2 xor $a3, $a3 xor $t4, $t4 sub 8*0($a_ptr), $a0 sbb 8*1($a_ptr), $a1 sbb 8*2($a_ptr), $a2 mov $a0, $t0 sbb 8*3($a_ptr), $a3 lea .Lpoly(%rip), $a_ptr mov $a1, $t1 sbb \$0, $t4 add 8*0($a_ptr), $a0 mov $a2, $t2 adc 8*1($a_ptr), $a1 adc 8*2($a_ptr), $a2 mov $a3, $t3 adc 8*3($a_ptr), $a3 test $t4, $t4 cmovz $t0, $a0 cmovz $t1, $a1 mov $a0, 8*0($r_ptr) cmovz $t2, $a2 mov $a1, 8*1($r_ptr) cmovz $t3, $a3 mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) mov 0(%rsp),%r13 .cfi_restore %r13 mov 8(%rsp),%r12 .cfi_restore %r12 lea 16(%rsp),%rsp .cfi_adjust_cfa_offset -16 .Lneg_epilogue: ret .cfi_endproc .size ecp_nistz256_neg,.-ecp_nistz256_neg ___ } { my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx"); my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15)); my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax"); my ($poly1,$poly3)=($acc6,$acc7); $code.=<<___; ################################################################################ # void ecp_nistz256_ord_mul_mont( # uint64_t res[4], # uint64_t a[4], # uint64_t b[4]); .globl ecp_nistz256_ord_mul_mont_nohw .type ecp_nistz256_ord_mul_mont_nohw,\@function,3 .align 32 ecp_nistz256_ord_mul_mont_nohw: .cfi_startproc _CET_ENDBR push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lord_mul_body: mov 8*0($b_org), %rax mov $b_org, $b_ptr lea .Lord(%rip), %r14 mov .LordK(%rip), %r15 ################################# * b[0] mov %rax, $t0 mulq 8*0($a_ptr) mov %rax, $acc0 mov $t0, %rax mov %rdx, $acc1 mulq 8*1($a_ptr) add %rax, $acc1 mov $t0, %rax adc \$0, %rdx mov %rdx, $acc2 mulq 8*2($a_ptr) add %rax, $acc2 mov $t0, %rax adc \$0, %rdx mov $acc0, $acc5 imulq %r15,$acc0 mov %rdx, $acc3 mulq 8*3($a_ptr) add %rax, $acc3 mov $acc0, %rax adc \$0, %rdx mov %rdx, $acc4 ################################# First reduction step mulq 8*0(%r14) mov $acc0, $t1 add %rax, $acc5 # guaranteed to be zero mov $acc0, %rax adc \$0, %rdx mov %rdx, $t0 sub $acc0, $acc2 sbb \$0, 
$acc0 # can't borrow mulq 8*1(%r14) add $t0, $acc1 adc \$0, %rdx add %rax, $acc1 mov $t1, %rax adc %rdx, $acc2 mov $t1, %rdx adc \$0, $acc0 # can't overflow shl \$32, %rax shr \$32, %rdx sub %rax, $acc3 mov 8*1($b_ptr), %rax sbb %rdx, $t1 # can't borrow add $acc0, $acc3 adc $t1, $acc4 adc \$0, $acc5 ################################# * b[1] mov %rax, $t0 mulq 8*0($a_ptr) add %rax, $acc1 mov $t0, %rax adc \$0, %rdx mov %rdx, $t1 mulq 8*1($a_ptr) add $t1, $acc2 adc \$0, %rdx add %rax, $acc2 mov $t0, %rax adc \$0, %rdx mov %rdx, $t1 mulq 8*2($a_ptr) add $t1, $acc3 adc \$0, %rdx add %rax, $acc3 mov $t0, %rax adc \$0, %rdx mov $acc1, $t0 imulq %r15, $acc1 mov %rdx, $t1 mulq 8*3($a_ptr) add $t1, $acc4 adc \$0, %rdx xor $acc0, $acc0 add %rax, $acc4 mov $acc1, %rax adc %rdx, $acc5 adc \$0, $acc0 ################################# Second reduction step mulq 8*0(%r14) mov $acc1, $t1 add %rax, $t0 # guaranteed to be zero mov $acc1, %rax adc %rdx, $t0 sub $acc1, $acc3 sbb \$0, $acc1 # can't borrow mulq 8*1(%r14) add $t0, $acc2 adc \$0, %rdx add %rax, $acc2 mov $t1, %rax adc %rdx, $acc3 mov $t1, %rdx adc \$0, $acc1 # can't overflow shl \$32, %rax shr \$32, %rdx sub %rax, $acc4 mov 8*2($b_ptr), %rax sbb %rdx, $t1 # can't borrow add $acc1, $acc4 adc $t1, $acc5 adc \$0, $acc0 ################################## * b[2] mov %rax, $t0 mulq 8*0($a_ptr) add %rax, $acc2 mov $t0, %rax adc \$0, %rdx mov %rdx, $t1 mulq 8*1($a_ptr) add $t1, $acc3 adc \$0, %rdx add %rax, $acc3 mov $t0, %rax adc \$0, %rdx mov %rdx, $t1 mulq 8*2($a_ptr) add $t1, $acc4 adc \$0, %rdx add %rax, $acc4 mov $t0, %rax adc \$0, %rdx mov $acc2, $t0 imulq %r15, $acc2 mov %rdx, $t1 mulq 8*3($a_ptr) add $t1, $acc5 adc \$0, %rdx xor $acc1, $acc1 add %rax, $acc5 mov $acc2, %rax adc %rdx, $acc0 adc \$0, $acc1 ################################# Third reduction step mulq 8*0(%r14) mov $acc2, $t1 add %rax, $t0 # guaranteed to be zero mov $acc2, %rax adc %rdx, $t0 sub $acc2, $acc4 sbb \$0, $acc2 # can't borrow mulq 8*1(%r14) add $t0, $acc3 adc \$0, %rdx add %rax, $acc3 mov $t1, %rax adc %rdx, $acc4 mov $t1, %rdx adc \$0, $acc2 # can't overflow shl \$32, %rax shr \$32, %rdx sub %rax, $acc5 mov 8*3($b_ptr), %rax sbb %rdx, $t1 # can't borrow add $acc2, $acc5 adc $t1, $acc0 adc \$0, $acc1 ################################# * b[3] mov %rax, $t0 mulq 8*0($a_ptr) add %rax, $acc3 mov $t0, %rax adc \$0, %rdx mov %rdx, $t1 mulq 8*1($a_ptr) add $t1, $acc4 adc \$0, %rdx add %rax, $acc4 mov $t0, %rax adc \$0, %rdx mov %rdx, $t1 mulq 8*2($a_ptr) add $t1, $acc5 adc \$0, %rdx add %rax, $acc5 mov $t0, %rax adc \$0, %rdx mov $acc3, $t0 imulq %r15, $acc3 mov %rdx, $t1 mulq 8*3($a_ptr) add $t1, $acc0 adc \$0, %rdx xor $acc2, $acc2 add %rax, $acc0 mov $acc3, %rax adc %rdx, $acc1 adc \$0, $acc2 ################################# Last reduction step mulq 8*0(%r14) mov $acc3, $t1 add %rax, $t0 # guaranteed to be zero mov $acc3, %rax adc %rdx, $t0 sub $acc3, $acc5 sbb \$0, $acc3 # can't borrow mulq 8*1(%r14) add $t0, $acc4 adc \$0, %rdx add %rax, $acc4 mov $t1, %rax adc %rdx, $acc5 mov $t1, %rdx adc \$0, $acc3 # can't overflow shl \$32, %rax shr \$32, %rdx sub %rax, $acc0 sbb %rdx, $t1 # can't borrow add $acc3, $acc0 adc $t1, $acc1 adc \$0, $acc2 ################################# Subtract ord mov $acc4, $a_ptr sub 8*0(%r14), $acc4 mov $acc5, $acc3 sbb 8*1(%r14), $acc5 mov $acc0, $t0 sbb 8*2(%r14), $acc0 mov $acc1, $t1 sbb 8*3(%r14), $acc1 sbb \$0, $acc2 cmovc $a_ptr, $acc4 cmovc $acc3, $acc5 cmovc $t0, $acc0 cmovc $t1, $acc1 mov $acc4, 8*0($r_ptr) mov $acc5, 8*1($r_ptr) mov $acc0, 
8*2($r_ptr) mov $acc1, 8*3($r_ptr) mov 0(%rsp),%r15 .cfi_restore %r15 mov 8(%rsp),%r14 .cfi_restore %r14 mov 16(%rsp),%r13 .cfi_restore %r13 mov 24(%rsp),%r12 .cfi_restore %r12 mov 32(%rsp),%rbx .cfi_restore %rbx mov 40(%rsp),%rbp .cfi_restore %rbp lea 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lord_mul_epilogue: ret .cfi_endproc .size ecp_nistz256_ord_mul_mont_nohw,.-ecp_nistz256_ord_mul_mont_nohw ################################################################################ # void ecp_nistz256_ord_sqr_mont( # uint64_t res[4], # uint64_t a[4], # uint64_t rep); .globl ecp_nistz256_ord_sqr_mont_nohw .type ecp_nistz256_ord_sqr_mont_nohw,\@function,3 .align 32 ecp_nistz256_ord_sqr_mont_nohw: .cfi_startproc _CET_ENDBR push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lord_sqr_body: mov 8*0($a_ptr), $acc0 mov 8*1($a_ptr), %rax mov 8*2($a_ptr), $acc6 mov 8*3($a_ptr), $acc7 lea .Lord(%rip), $a_ptr # pointer to modulus mov $b_org, $b_ptr jmp .Loop_ord_sqr .align 32 .Loop_ord_sqr: ################################# a[1:] * a[0] mov %rax, $t1 # put aside a[1] mul $acc0 # a[1] * a[0] mov %rax, $acc1 movq $t1, %xmm1 # offload a[1] mov $acc6, %rax mov %rdx, $acc2 mul $acc0 # a[2] * a[0] add %rax, $acc2 mov $acc7, %rax movq $acc6, %xmm2 # offload a[2] adc \$0, %rdx mov %rdx, $acc3 mul $acc0 # a[3] * a[0] add %rax, $acc3 mov $acc7, %rax movq $acc7, %xmm3 # offload a[3] adc \$0, %rdx mov %rdx, $acc4 ################################# a[3] * a[2] mul $acc6 # a[3] * a[2] mov %rax, $acc5 mov $acc6, %rax mov %rdx, $acc6 ################################# a[2:] * a[1] mul $t1 # a[2] * a[1] add %rax, $acc3 mov $acc7, %rax adc \$0, %rdx mov %rdx, $acc7 mul $t1 # a[3] * a[1] add %rax, $acc4 adc \$0, %rdx add $acc7, $acc4 adc %rdx, $acc5 adc \$0, $acc6 # can't overflow ################################# *2 xor $acc7, $acc7 mov $acc0, %rax add $acc1, $acc1 adc $acc2, $acc2 adc $acc3, $acc3 adc $acc4, $acc4 adc $acc5, $acc5 adc $acc6, $acc6 adc \$0, $acc7 ################################# Missing products mul %rax # a[0] * a[0] mov %rax, $acc0 movq %xmm1, %rax mov %rdx, $t1 mul %rax # a[1] * a[1] add $t1, $acc1 adc %rax, $acc2 movq %xmm2, %rax adc \$0, %rdx mov %rdx, $t1 mul %rax # a[2] * a[2] add $t1, $acc3 adc %rax, $acc4 movq %xmm3, %rax adc \$0, %rdx mov %rdx, $t1 mov $acc0, $t0 imulq 8*4($a_ptr), $acc0 # *= .LordK mul %rax # a[3] * a[3] add $t1, $acc5 adc %rax, $acc6 mov 8*0($a_ptr), %rax # modulus[0] adc %rdx, $acc7 # can't overflow ################################# First reduction step mul $acc0 mov $acc0, $t1 add %rax, $t0 # guaranteed to be zero mov 8*1($a_ptr), %rax # modulus[1] adc %rdx, $t0 sub $acc0, $acc2 sbb \$0, $t1 # can't borrow mul $acc0 add $t0, $acc1 adc \$0, %rdx add %rax, $acc1 mov $acc0, %rax adc %rdx, $acc2 mov $acc0, %rdx adc \$0, $t1 # can't overflow mov $acc1, $t0 imulq 8*4($a_ptr), $acc1 # *= .LordK shl \$32, %rax shr \$32, %rdx sub %rax, $acc3 mov 8*0($a_ptr), %rax sbb %rdx, $acc0 # can't borrow add $t1, $acc3 adc \$0, $acc0 # can't overflow ################################# Second reduction step mul $acc1 mov $acc1, $t1 add %rax, $t0 # guaranteed to be zero mov 8*1($a_ptr), %rax adc %rdx, $t0 sub $acc1, $acc3 sbb \$0, $t1 # can't borrow mul $acc1 add $t0, $acc2 adc \$0, %rdx add %rax, $acc2 mov $acc1, %rax adc %rdx, $acc3 mov $acc1, %rdx adc \$0, $t1 # can't overflow mov $acc2, $t0 imulq 8*4($a_ptr), $acc2 # *= .LordK shl \$32, %rax shr \$32, %rdx sub %rax, $acc0 mov 8*0($a_ptr), %rax sbb 
%rdx, $acc1 # can't borrow add $t1, $acc0 adc \$0, $acc1 # can't overflow ################################# Third reduction step mul $acc2 mov $acc2, $t1 add %rax, $t0 # guaranteed to be zero mov 8*1($a_ptr), %rax adc %rdx, $t0 sub $acc2, $acc0 sbb \$0, $t1 # can't borrow mul $acc2 add $t0, $acc3 adc \$0, %rdx add %rax, $acc3 mov $acc2, %rax adc %rdx, $acc0 mov $acc2, %rdx adc \$0, $t1 # can't overflow mov $acc3, $t0 imulq 8*4($a_ptr), $acc3 # *= .LordK shl \$32, %rax shr \$32, %rdx sub %rax, $acc1 mov 8*0($a_ptr), %rax sbb %rdx, $acc2 # can't borrow add $t1, $acc1 adc \$0, $acc2 # can't overflow ################################# Last reduction step mul $acc3 mov $acc3, $t1 add %rax, $t0 # guaranteed to be zero mov 8*1($a_ptr), %rax adc %rdx, $t0 sub $acc3, $acc1 sbb \$0, $t1 # can't borrow mul $acc3 add $t0, $acc0 adc \$0, %rdx add %rax, $acc0 mov $acc3, %rax adc %rdx, $acc1 mov $acc3, %rdx adc \$0, $t1 # can't overflow shl \$32, %rax shr \$32, %rdx sub %rax, $acc2 sbb %rdx, $acc3 # can't borrow add $t1, $acc2 adc \$0, $acc3 # can't overflow ################################# Add bits [511:256] of the sqr result xor %rdx, %rdx add $acc4, $acc0 adc $acc5, $acc1 mov $acc0, $acc4 adc $acc6, $acc2 adc $acc7, $acc3 mov $acc1, %rax adc \$0, %rdx ################################# Compare to modulus sub 8*0($a_ptr), $acc0 mov $acc2, $acc6 sbb 8*1($a_ptr), $acc1 sbb 8*2($a_ptr), $acc2 mov $acc3, $acc7 sbb 8*3($a_ptr), $acc3 sbb \$0, %rdx cmovc $acc4, $acc0 cmovnc $acc1, %rax cmovnc $acc2, $acc6 cmovnc $acc3, $acc7 dec $b_ptr jnz .Loop_ord_sqr mov $acc0, 8*0($r_ptr) mov %rax, 8*1($r_ptr) pxor %xmm1, %xmm1 mov $acc6, 8*2($r_ptr) pxor %xmm2, %xmm2 mov $acc7, 8*3($r_ptr) pxor %xmm3, %xmm3 mov 0(%rsp),%r15 .cfi_restore %r15 mov 8(%rsp),%r14 .cfi_restore %r14 mov 16(%rsp),%r13 .cfi_restore %r13 mov 24(%rsp),%r12 .cfi_restore %r12 mov 32(%rsp),%rbx .cfi_restore %rbx mov 40(%rsp),%rbp .cfi_restore %rbp lea 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lord_sqr_epilogue: ret .cfi_endproc .size ecp_nistz256_ord_sqr_mont_nohw,.-ecp_nistz256_ord_sqr_mont_nohw ___ $code.=<<___ if ($addx); ################################################################################ .globl ecp_nistz256_ord_mul_mont_adx .type ecp_nistz256_ord_mul_mont_adx,\@function,3 .align 32 ecp_nistz256_ord_mul_mont_adx: .cfi_startproc .Lecp_nistz256_ord_mul_mont_adx: _CET_ENDBR push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lord_mulx_body: mov $b_org, $b_ptr mov 8*0($b_org), %rdx mov 8*0($a_ptr), $acc1 mov 8*1($a_ptr), $acc2 mov 8*2($a_ptr), $acc3 mov 8*3($a_ptr), $acc4 lea -128($a_ptr), $a_ptr # control u-op density lea .Lord-128(%rip), %r14 mov .LordK(%rip), %r15 ################################# Multiply by b[0] mulx $acc1, $acc0, $acc1 mulx $acc2, $t0, $acc2 mulx $acc3, $t1, $acc3 add $t0, $acc1 mulx $acc4, $t0, $acc4 mov $acc0, %rdx mulx %r15, %rdx, %rax adc $t1, $acc2 adc $t0, $acc3 adc \$0, $acc4 ################################# reduction xor $acc5, $acc5 # $acc5=0, cf=0, of=0 mulx 8*0+128(%r14), $t0, $t1 adcx $t0, $acc0 # guaranteed to be zero adox $t1, $acc1 mulx 8*1+128(%r14), $t0, $t1 adcx $t0, $acc1 adox $t1, $acc2 mulx 8*2+128(%r14), $t0, $t1 adcx $t0, $acc2 adox $t1, $acc3 mulx 8*3+128(%r14), $t0, $t1 mov 8*1($b_ptr), %rdx adcx $t0, $acc3 adox $t1, $acc4 adcx $acc0, $acc4 adox $acc0, $acc5 adc \$0, $acc5 # cf=0, of=0 ################################# Multiply by b[1] mulx 8*0+128($a_ptr), $t0, $t1 adcx $t0, $acc1 adox $t1, 
$acc2 mulx 8*1+128($a_ptr), $t0, $t1 adcx $t0, $acc2 adox $t1, $acc3 mulx 8*2+128($a_ptr), $t0, $t1 adcx $t0, $acc3 adox $t1, $acc4 mulx 8*3+128($a_ptr), $t0, $t1 mov $acc1, %rdx mulx %r15, %rdx, %rax adcx $t0, $acc4 adox $t1, $acc5 adcx $acc0, $acc5 adox $acc0, $acc0 adc \$0, $acc0 # cf=0, of=0 ################################# reduction mulx 8*0+128(%r14), $t0, $t1 adcx $t0, $acc1 # guaranteed to be zero adox $t1, $acc2 mulx 8*1+128(%r14), $t0, $t1 adcx $t0, $acc2 adox $t1, $acc3 mulx 8*2+128(%r14), $t0, $t1 adcx $t0, $acc3 adox $t1, $acc4 mulx 8*3+128(%r14), $t0, $t1 mov 8*2($b_ptr), %rdx adcx $t0, $acc4 adox $t1, $acc5 adcx $acc1, $acc5 adox $acc1, $acc0 adc \$0, $acc0 # cf=0, of=0 ################################# Multiply by b[2] mulx 8*0+128($a_ptr), $t0, $t1 adcx $t0, $acc2 adox $t1, $acc3 mulx 8*1+128($a_ptr), $t0, $t1 adcx $t0, $acc3 adox $t1, $acc4 mulx 8*2+128($a_ptr), $t0, $t1 adcx $t0, $acc4 adox $t1, $acc5 mulx 8*3+128($a_ptr), $t0, $t1 mov $acc2, %rdx mulx %r15, %rdx, %rax adcx $t0, $acc5 adox $t1, $acc0 adcx $acc1, $acc0 adox $acc1, $acc1 adc \$0, $acc1 # cf=0, of=0 ################################# reduction mulx 8*0+128(%r14), $t0, $t1 adcx $t0, $acc2 # guaranteed to be zero adox $t1, $acc3 mulx 8*1+128(%r14), $t0, $t1 adcx $t0, $acc3 adox $t1, $acc4 mulx 8*2+128(%r14), $t0, $t1 adcx $t0, $acc4 adox $t1, $acc5 mulx 8*3+128(%r14), $t0, $t1 mov 8*3($b_ptr), %rdx adcx $t0, $acc5 adox $t1, $acc0 adcx $acc2, $acc0 adox $acc2, $acc1 adc \$0, $acc1 # cf=0, of=0 ################################# Multiply by b[3] mulx 8*0+128($a_ptr), $t0, $t1 adcx $t0, $acc3 adox $t1, $acc4 mulx 8*1+128($a_ptr), $t0, $t1 adcx $t0, $acc4 adox $t1, $acc5 mulx 8*2+128($a_ptr), $t0, $t1 adcx $t0, $acc5 adox $t1, $acc0 mulx 8*3+128($a_ptr), $t0, $t1 mov $acc3, %rdx mulx %r15, %rdx, %rax adcx $t0, $acc0 adox $t1, $acc1 adcx $acc2, $acc1 adox $acc2, $acc2 adc \$0, $acc2 # cf=0, of=0 ################################# reduction mulx 8*0+128(%r14), $t0, $t1 adcx $t0, $acc3 # guaranteed to be zero adox $t1, $acc4 mulx 8*1+128(%r14), $t0, $t1 adcx $t0, $acc4 adox $t1, $acc5 mulx 8*2+128(%r14), $t0, $t1 adcx $t0, $acc5 adox $t1, $acc0 mulx 8*3+128(%r14), $t0, $t1 lea 128(%r14),%r14 mov $acc4, $t2 adcx $t0, $acc0 adox $t1, $acc1 mov $acc5, $t3 adcx $acc3, $acc1 adox $acc3, $acc2 adc \$0, $acc2 ################################# # Branch-less conditional subtraction of P mov $acc0, $t0 sub 8*0(%r14), $acc4 sbb 8*1(%r14), $acc5 sbb 8*2(%r14), $acc0 mov $acc1, $t1 sbb 8*3(%r14), $acc1 sbb \$0, $acc2 cmovc $t2, $acc4 cmovc $t3, $acc5 cmovc $t0, $acc0 cmovc $t1, $acc1 mov $acc4, 8*0($r_ptr) mov $acc5, 8*1($r_ptr) mov $acc0, 8*2($r_ptr) mov $acc1, 8*3($r_ptr) mov 0(%rsp),%r15 .cfi_restore %r15 mov 8(%rsp),%r14 .cfi_restore %r14 mov 16(%rsp),%r13 .cfi_restore %r13 mov 24(%rsp),%r12 .cfi_restore %r12 mov 32(%rsp),%rbx .cfi_restore %rbx mov 40(%rsp),%rbp .cfi_restore %rbp lea 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lord_mulx_epilogue: ret .cfi_endproc .size ecp_nistz256_ord_mul_mont_adx,.-ecp_nistz256_ord_mul_mont_adx .globl ecp_nistz256_ord_sqr_mont_adx .type ecp_nistz256_ord_sqr_mont_adx,\@function,3 .align 32 ecp_nistz256_ord_sqr_mont_adx: .cfi_startproc _CET_ENDBR .Lecp_nistz256_ord_sqr_mont_adx: push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lord_sqrx_body: mov $b_org, $b_ptr mov 8*0($a_ptr), %rdx mov 8*1($a_ptr), $acc6 mov 8*2($a_ptr), $acc7 mov 8*3($a_ptr), $acc0 lea .Lord(%rip), $a_ptr jmp .Loop_ord_sqrx
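################################################################################
# A rough sketch of the word-by-word Montgomery reduction that .Loop_ord_sqrx
# below (like the ord_mul routines above) performs, in illustrative pseudocode
# only -- none of this is emitted as assembly.  Here n is the group order at
# .Lord and k0 is the constant at .LordK, chosen so that n*k0 == -1 mod 2^64:
#
#     for i in 0..3:
#         m    = (acc[i] * k0) mod 2^64    # makes acc + m*n divisible by 2^64
#         acc += m * n << (64*i)           # low word cancels, hence the
#                                          # "guaranteed to be zero" notes
#
# After four such steps the low 256 bits are zero and are dropped, the upper
# half of the square is accumulated, and one conditional subtraction of n
# brings the result below the order.
################################################################################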
.align 32 .Loop_ord_sqrx: mulx $acc6, $acc1, $acc2 # a[0]*a[1] mulx $acc7, $t0, $acc3 # a[0]*a[2] mov %rdx, %rax # offload a[0] movq $acc6, %xmm1 # offload a[1] mulx $acc0, $t1, $acc4 # a[0]*a[3] mov $acc6, %rdx add $t0, $acc2 movq $acc7, %xmm2 # offload a[2] adc $t1, $acc3 adc \$0, $acc4 xor $acc5, $acc5 # $acc5=0,cf=0,of=0 ################################# mulx $acc7, $t0, $t1 # a[1]*a[2] adcx $t0, $acc3 adox $t1, $acc4 mulx $acc0, $t0, $t1 # a[1]*a[3] mov $acc7, %rdx adcx $t0, $acc4 adox $t1, $acc5 adc \$0, $acc5 ################################# mulx $acc0, $t0, $acc6 # a[2]*a[3] mov %rax, %rdx movq $acc0, %xmm3 # offload a[3] xor $acc7, $acc7 # $acc7=0,cf=0,of=0 adcx $acc1, $acc1 # acc1:6<<1 adox $t0, $acc5 adcx $acc2, $acc2 adox $acc7, $acc6 # of=0 ################################# a[i]*a[i] mulx %rdx, $acc0, $t1 movq %xmm1, %rdx adcx $acc3, $acc3 adox $t1, $acc1 adcx $acc4, $acc4 mulx %rdx, $t0, $t4 movq %xmm2, %rdx adcx $acc5, $acc5 adox $t0, $acc2 adcx $acc6, $acc6 mulx %rdx, $t0, $t1 .byte 0x67 movq %xmm3, %rdx adox $t4, $acc3 adcx $acc7, $acc7 adox $t0, $acc4 adox $t1, $acc5 mulx %rdx, $t0, $t4 adox $t0, $acc6 adox $t4, $acc7 ################################# reduction mov $acc0, %rdx mulx 8*4($a_ptr), %rdx, $t0 xor %rax, %rax # cf=0, of=0 mulx 8*0($a_ptr), $t0, $t1 adcx $t0, $acc0 # guaranteed to be zero adox $t1, $acc1 mulx 8*1($a_ptr), $t0, $t1 adcx $t0, $acc1 adox $t1, $acc2 mulx 8*2($a_ptr), $t0, $t1 adcx $t0, $acc2 adox $t1, $acc3 mulx 8*3($a_ptr), $t0, $t1 adcx $t0, $acc3 adox $t1, $acc0 # of=0 adcx %rax, $acc0 # cf=0 ################################# mov $acc1, %rdx mulx 8*4($a_ptr), %rdx, $t0 mulx 8*0($a_ptr), $t0, $t1 adox $t0, $acc1 # guaranteed to be zero adcx $t1, $acc2 mulx 8*1($a_ptr), $t0, $t1 adox $t0, $acc2 adcx $t1, $acc3 mulx 8*2($a_ptr), $t0, $t1 adox $t0, $acc3 adcx $t1, $acc0 mulx 8*3($a_ptr), $t0, $t1 adox $t0, $acc0 adcx $t1, $acc1 # cf=0 adox %rax, $acc1 # of=0 ################################# mov $acc2, %rdx mulx 8*4($a_ptr), %rdx, $t0 mulx 8*0($a_ptr), $t0, $t1 adcx $t0, $acc2 # guaranteed to be zero adox $t1, $acc3 mulx 8*1($a_ptr), $t0, $t1 adcx $t0, $acc3 adox $t1, $acc0 mulx 8*2($a_ptr), $t0, $t1 adcx $t0, $acc0 adox $t1, $acc1 mulx 8*3($a_ptr), $t0, $t1 adcx $t0, $acc1 adox $t1, $acc2 # of=0 adcx %rax, $acc2 # cf=0 ################################# mov $acc3, %rdx mulx 8*4($a_ptr), %rdx, $t0 mulx 8*0($a_ptr), $t0, $t1 adox $t0, $acc3 # guaranteed to be zero adcx $t1, $acc0 mulx 8*1($a_ptr), $t0, $t1 adox $t0, $acc0 adcx $t1, $acc1 mulx 8*2($a_ptr), $t0, $t1 adox $t0, $acc1 adcx $t1, $acc2 mulx 8*3($a_ptr), $t0, $t1 adox $t0, $acc2 adcx $t1, $acc3 adox %rax, $acc3 ################################# accumulate upper half add $acc0, $acc4 # add $acc4, $acc0 adc $acc5, $acc1 mov $acc4, %rdx adc $acc6, $acc2 adc $acc7, $acc3 mov $acc1, $acc6 adc \$0, %rax ################################# compare to modulus sub 8*0($a_ptr), $acc4 mov $acc2, $acc7 sbb 8*1($a_ptr), $acc1 sbb 8*2($a_ptr), $acc2 mov $acc3, $acc0 sbb 8*3($a_ptr), $acc3 sbb \$0, %rax cmovnc $acc4, %rdx cmovnc $acc1, $acc6 cmovnc $acc2, $acc7 cmovnc $acc3, $acc0 dec $b_ptr jnz .Loop_ord_sqrx mov %rdx, 8*0($r_ptr) mov $acc6, 8*1($r_ptr) pxor %xmm1, %xmm1 mov $acc7, 8*2($r_ptr) pxor %xmm2, %xmm2 mov $acc0, 8*3($r_ptr) pxor %xmm3, %xmm3 mov 0(%rsp),%r15 .cfi_restore %r15 mov 8(%rsp),%r14 .cfi_restore %r14 mov 16(%rsp),%r13 .cfi_restore %r13 mov 24(%rsp),%r12 .cfi_restore %r12 mov 32(%rsp),%rbx .cfi_restore %rbx mov 40(%rsp),%rbp .cfi_restore %rbp lea 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 
.Lord_sqrx_epilogue: ret .cfi_endproc .size ecp_nistz256_ord_sqr_mont_adx,.-ecp_nistz256_ord_sqr_mont_adx ___ $code.=<<___; ################################################################################ # void ecp_nistz256_mul_mont( # uint64_t res[4], # uint64_t a[4], # uint64_t b[4]); .globl ecp_nistz256_mul_mont_nohw .type ecp_nistz256_mul_mont_nohw,\@function,3 .align 32 ecp_nistz256_mul_mont_nohw: .cfi_startproc _CET_ENDBR push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lmul_body: mov $b_org, $b_ptr mov 8*0($b_org), %rax mov 8*0($a_ptr), $acc1 mov 8*1($a_ptr), $acc2 mov 8*2($a_ptr), $acc3 mov 8*3($a_ptr), $acc4 call __ecp_nistz256_mul_montq mov 0(%rsp),%r15 .cfi_restore %r15 mov 8(%rsp),%r14 .cfi_restore %r14 mov 16(%rsp),%r13 .cfi_restore %r13 mov 24(%rsp),%r12 .cfi_restore %r12 mov 32(%rsp),%rbx .cfi_restore %rbx mov 40(%rsp),%rbp .cfi_restore %rbp lea 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lmul_epilogue: ret .cfi_endproc .size ecp_nistz256_mul_mont_nohw,.-ecp_nistz256_mul_mont_nohw .type __ecp_nistz256_mul_montq,\@abi-omnipotent .align 32 __ecp_nistz256_mul_montq: .cfi_startproc ######################################################################## # Multiply a by b[0] mov %rax, $t1 mulq $acc1 mov .Lpoly+8*1(%rip),$poly1 mov %rax, $acc0 mov $t1, %rax mov %rdx, $acc1 mulq $acc2 mov .Lpoly+8*3(%rip),$poly3 add %rax, $acc1 mov $t1, %rax adc \$0, %rdx mov %rdx, $acc2 mulq $acc3 add %rax, $acc2 mov $t1, %rax adc \$0, %rdx mov %rdx, $acc3 mulq $acc4 add %rax, $acc3 mov $acc0, %rax adc \$0, %rdx xor $acc5, $acc5 mov %rdx, $acc4 ######################################################################## # First reduction step # Basically now we want to multiply acc[0] by p256, # and add the result to the acc. 
# Due to the special form of p256 we do some optimizations # # acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0] # then we add acc[0] and get acc[0] x 2^96 mov $acc0, $t1 shl \$32, $acc0 mulq $poly3 shr \$32, $t1 add $acc0, $acc1 # +=acc[0]<<96 adc $t1, $acc2 adc %rax, $acc3 mov 8*1($b_ptr), %rax adc %rdx, $acc4 adc \$0, $acc5 xor $acc0, $acc0 ######################################################################## # Multiply by b[1] mov %rax, $t1 mulq 8*0($a_ptr) add %rax, $acc1 mov $t1, %rax adc \$0, %rdx mov %rdx, $t0 mulq 8*1($a_ptr) add $t0, $acc2 adc \$0, %rdx add %rax, $acc2 mov $t1, %rax adc \$0, %rdx mov %rdx, $t0 mulq 8*2($a_ptr) add $t0, $acc3 adc \$0, %rdx add %rax, $acc3 mov $t1, %rax adc \$0, %rdx mov %rdx, $t0 mulq 8*3($a_ptr) add $t0, $acc4 adc \$0, %rdx add %rax, $acc4 mov $acc1, %rax adc %rdx, $acc5 adc \$0, $acc0 ######################################################################## # Second reduction step mov $acc1, $t1 shl \$32, $acc1 mulq $poly3 shr \$32, $t1 add $acc1, $acc2 adc $t1, $acc3 adc %rax, $acc4 mov 8*2($b_ptr), %rax adc %rdx, $acc5 adc \$0, $acc0 xor $acc1, $acc1 ######################################################################## # Multiply by b[2] mov %rax, $t1 mulq 8*0($a_ptr) add %rax, $acc2 mov $t1, %rax adc \$0, %rdx mov %rdx, $t0 mulq 8*1($a_ptr) add $t0, $acc3 adc \$0, %rdx add %rax, $acc3 mov $t1, %rax adc \$0, %rdx mov %rdx, $t0 mulq 8*2($a_ptr) add $t0, $acc4 adc \$0, %rdx add %rax, $acc4 mov $t1, %rax adc \$0, %rdx mov %rdx, $t0 mulq 8*3($a_ptr) add $t0, $acc5 adc \$0, %rdx add %rax, $acc5 mov $acc2, %rax adc %rdx, $acc0 adc \$0, $acc1 ######################################################################## # Third reduction step mov $acc2, $t1 shl \$32, $acc2 mulq $poly3 shr \$32, $t1 add $acc2, $acc3 adc $t1, $acc4 adc %rax, $acc5 mov 8*3($b_ptr), %rax adc %rdx, $acc0 adc \$0, $acc1 xor $acc2, $acc2 ######################################################################## # Multiply by b[3] mov %rax, $t1 mulq 8*0($a_ptr) add %rax, $acc3 mov $t1, %rax adc \$0, %rdx mov %rdx, $t0 mulq 8*1($a_ptr) add $t0, $acc4 adc \$0, %rdx add %rax, $acc4 mov $t1, %rax adc \$0, %rdx mov %rdx, $t0 mulq 8*2($a_ptr) add $t0, $acc5 adc \$0, %rdx add %rax, $acc5 mov $t1, %rax adc \$0, %rdx mov %rdx, $t0 mulq 8*3($a_ptr) add $t0, $acc0 adc \$0, %rdx add %rax, $acc0 mov $acc3, %rax adc %rdx, $acc1 adc \$0, $acc2 ######################################################################## # Final reduction step mov $acc3, $t1 shl \$32, $acc3 mulq $poly3 shr \$32, $t1 add $acc3, $acc4 adc $t1, $acc5 mov $acc4, $t0 adc %rax, $acc0 adc %rdx, $acc1 mov $acc5, $t1 adc \$0, $acc2 ######################################################################## # Branch-less conditional subtraction of P sub \$-1, $acc4 # .Lpoly[0] mov $acc0, $t2 sbb $poly1, $acc5 # .Lpoly[1] sbb \$0, $acc0 # .Lpoly[2] mov $acc1, $t3 sbb $poly3, $acc1 # .Lpoly[3] sbb \$0, $acc2 cmovc $t0, $acc4 cmovc $t1, $acc5 mov $acc4, 8*0($r_ptr) cmovc $t2, $acc0 mov $acc5, 8*1($r_ptr) cmovc $t3, $acc1 mov $acc0, 8*2($r_ptr) mov $acc1, 8*3($r_ptr) ret .cfi_endproc .size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq ################################################################################ # void ecp_nistz256_sqr_mont( # uint64_t res[4], # uint64_t a[4]); # we optimize the square according to S.Gueron and V.Krasnov, # "Speeding up Big-Number Squaring" .globl ecp_nistz256_sqr_mont_nohw .type ecp_nistz256_sqr_mont_nohw,\@function,2 .align 32 ecp_nistz256_sqr_mont_nohw: .cfi_startproc _CET_ENDBR push %rbp 
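################################################################################
# Background for the reduction steps in __ecp_nistz256_mul_montq above and in
# __ecp_nistz256_sqr_montq below (a sketch of the arithmetic, not emitted as
# assembly): the low limbs of .Lpoly are p[0] = 2^64 - 1, p[1] = 2^32 - 1 and
# p[2] = 0, so for any 64-bit word m
#
#     m * (p[0] + p[1]*2^64) = m * (2^96 - 1) = (m << 96) - m
#
# which is why each reduction step needs only one real multiplication (mulq by
# .Lpoly[3]) plus a 32-bit shl/shr pair that materialises m << 96, instead of
# four full 64x64-bit multiplications.
################################################################################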
.cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lsqr_body: mov 8*0($a_ptr), %rax mov 8*1($a_ptr), $acc6 mov 8*2($a_ptr), $acc7 mov 8*3($a_ptr), $acc0 call __ecp_nistz256_sqr_montq mov 0(%rsp),%r15 .cfi_restore %r15 mov 8(%rsp),%r14 .cfi_restore %r14 mov 16(%rsp),%r13 .cfi_restore %r13 mov 24(%rsp),%r12 .cfi_restore %r12 mov 32(%rsp),%rbx .cfi_restore %rbx mov 40(%rsp),%rbp .cfi_restore %rbp lea 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lsqr_epilogue: ret .cfi_endproc .size ecp_nistz256_sqr_mont_nohw,.-ecp_nistz256_sqr_mont_nohw .type __ecp_nistz256_sqr_montq,\@abi-omnipotent .align 32 __ecp_nistz256_sqr_montq: .cfi_startproc mov %rax, $acc5 mulq $acc6 # a[1]*a[0] mov %rax, $acc1 mov $acc7, %rax mov %rdx, $acc2 mulq $acc5 # a[0]*a[2] add %rax, $acc2 mov $acc0, %rax adc \$0, %rdx mov %rdx, $acc3 mulq $acc5 # a[0]*a[3] add %rax, $acc3 mov $acc7, %rax adc \$0, %rdx mov %rdx, $acc4 ################################# mulq $acc6 # a[1]*a[2] add %rax, $acc3 mov $acc0, %rax adc \$0, %rdx mov %rdx, $t1 mulq $acc6 # a[1]*a[3] add %rax, $acc4 mov $acc0, %rax adc \$0, %rdx add $t1, $acc4 mov %rdx, $acc5 adc \$0, $acc5 ################################# mulq $acc7 # a[2]*a[3] xor $acc7, $acc7 add %rax, $acc5 mov 8*0($a_ptr), %rax mov %rdx, $acc6 adc \$0, $acc6 add $acc1, $acc1 # acc1:6<<1 adc $acc2, $acc2 adc $acc3, $acc3 adc $acc4, $acc4 adc $acc5, $acc5 adc $acc6, $acc6 adc \$0, $acc7 mulq %rax mov %rax, $acc0 mov 8*1($a_ptr), %rax mov %rdx, $t0 mulq %rax add $t0, $acc1 adc %rax, $acc2 mov 8*2($a_ptr), %rax adc \$0, %rdx mov %rdx, $t0 mulq %rax add $t0, $acc3 adc %rax, $acc4 mov 8*3($a_ptr), %rax adc \$0, %rdx mov %rdx, $t0 mulq %rax add $t0, $acc5 adc %rax, $acc6 mov $acc0, %rax adc %rdx, $acc7 mov .Lpoly+8*1(%rip), $a_ptr mov .Lpoly+8*3(%rip), $t1 ########################################## # Now the reduction # First iteration mov $acc0, $t0 shl \$32, $acc0 mulq $t1 shr \$32, $t0 add $acc0, $acc1 # +=acc[0]<<96 adc $t0, $acc2 adc %rax, $acc3 mov $acc1, %rax adc \$0, %rdx ########################################## # Second iteration mov $acc1, $t0 shl \$32, $acc1 mov %rdx, $acc0 mulq $t1 shr \$32, $t0 add $acc1, $acc2 adc $t0, $acc3 adc %rax, $acc0 mov $acc2, %rax adc \$0, %rdx ########################################## # Third iteration mov $acc2, $t0 shl \$32, $acc2 mov %rdx, $acc1 mulq $t1 shr \$32, $t0 add $acc2, $acc3 adc $t0, $acc0 adc %rax, $acc1 mov $acc3, %rax adc \$0, %rdx ########################################### # Last iteration mov $acc3, $t0 shl \$32, $acc3 mov %rdx, $acc2 mulq $t1 shr \$32, $t0 add $acc3, $acc0 adc $t0, $acc1 adc %rax, $acc2 adc \$0, %rdx xor $acc3, $acc3 ############################################ # Add the rest of the acc add $acc0, $acc4 adc $acc1, $acc5 mov $acc4, $acc0 adc $acc2, $acc6 adc %rdx, $acc7 mov $acc5, $acc1 adc \$0, $acc3 sub \$-1, $acc4 # .Lpoly[0] mov $acc6, $acc2 sbb $a_ptr, $acc5 # .Lpoly[1] sbb \$0, $acc6 # .Lpoly[2] mov $acc7, $t0 sbb $t1, $acc7 # .Lpoly[3] sbb \$0, $acc3 cmovc $acc0, $acc4 cmovc $acc1, $acc5 mov $acc4, 8*0($r_ptr) cmovc $acc2, $acc6 mov $acc5, 8*1($r_ptr) cmovc $t0, $acc7 mov $acc6, 8*2($r_ptr) mov $acc7, 8*3($r_ptr) ret .cfi_endproc .size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq ___ if ($addx) { $code.=<<___; .globl ecp_nistz256_mul_mont_adx .type ecp_nistz256_mul_mont_adx,\@function,3 .align 32 ecp_nistz256_mul_mont_adx: .cfi_startproc _CET_ENDBR push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push 
%r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lmulx_body: mov $b_org, $b_ptr mov 8*0($b_org), %rdx mov 8*0($a_ptr), $acc1 mov 8*1($a_ptr), $acc2 mov 8*2($a_ptr), $acc3 mov 8*3($a_ptr), $acc4 lea -128($a_ptr), $a_ptr # control u-op density call __ecp_nistz256_mul_montx mov 0(%rsp),%r15 .cfi_restore %r15 mov 8(%rsp),%r14 .cfi_restore %r14 mov 16(%rsp),%r13 .cfi_restore %r13 mov 24(%rsp),%r12 .cfi_restore %r12 mov 32(%rsp),%rbx .cfi_restore %rbx mov 40(%rsp),%rbp .cfi_restore %rbp lea 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lmulx_epilogue: ret .cfi_endproc .size ecp_nistz256_mul_mont_adx,.-ecp_nistz256_mul_mont_adx .type __ecp_nistz256_mul_montx,\@abi-omnipotent .align 32 __ecp_nistz256_mul_montx: .cfi_startproc ######################################################################## # Multiply by b[0] mulx $acc1, $acc0, $acc1 mulx $acc2, $t0, $acc2 mov \$32, $poly1 xor $acc5, $acc5 # cf=0 mulx $acc3, $t1, $acc3 mov .Lpoly+8*3(%rip), $poly3 adc $t0, $acc1 mulx $acc4, $t0, $acc4 mov $acc0, %rdx adc $t1, $acc2 shlx $poly1,$acc0,$t1 adc $t0, $acc3 shrx $poly1,$acc0,$t0 adc \$0, $acc4 ######################################################################## # First reduction step add $t1, $acc1 adc $t0, $acc2 mulx $poly3, $t0, $t1 mov 8*1($b_ptr), %rdx adc $t0, $acc3 adc $t1, $acc4 adc \$0, $acc5 xor $acc0, $acc0 # $acc0=0,cf=0,of=0 ######################################################################## # Multiply by b[1] mulx 8*0+128($a_ptr), $t0, $t1 adcx $t0, $acc1 adox $t1, $acc2 mulx 8*1+128($a_ptr), $t0, $t1 adcx $t0, $acc2 adox $t1, $acc3 mulx 8*2+128($a_ptr), $t0, $t1 adcx $t0, $acc3 adox $t1, $acc4 mulx 8*3+128($a_ptr), $t0, $t1 mov $acc1, %rdx adcx $t0, $acc4 shlx $poly1, $acc1, $t0 adox $t1, $acc5 shrx $poly1, $acc1, $t1 adcx $acc0, $acc5 adox $acc0, $acc0 adc \$0, $acc0 ######################################################################## # Second reduction step add $t0, $acc2 adc $t1, $acc3 mulx $poly3, $t0, $t1 mov 8*2($b_ptr), %rdx adc $t0, $acc4 adc $t1, $acc5 adc \$0, $acc0 xor $acc1 ,$acc1 # $acc1=0,cf=0,of=0 ######################################################################## # Multiply by b[2] mulx 8*0+128($a_ptr), $t0, $t1 adcx $t0, $acc2 adox $t1, $acc3 mulx 8*1+128($a_ptr), $t0, $t1 adcx $t0, $acc3 adox $t1, $acc4 mulx 8*2+128($a_ptr), $t0, $t1 adcx $t0, $acc4 adox $t1, $acc5 mulx 8*3+128($a_ptr), $t0, $t1 mov $acc2, %rdx adcx $t0, $acc5 shlx $poly1, $acc2, $t0 adox $t1, $acc0 shrx $poly1, $acc2, $t1 adcx $acc1, $acc0 adox $acc1, $acc1 adc \$0, $acc1 ######################################################################## # Third reduction step add $t0, $acc3 adc $t1, $acc4 mulx $poly3, $t0, $t1 mov 8*3($b_ptr), %rdx adc $t0, $acc5 adc $t1, $acc0 adc \$0, $acc1 xor $acc2, $acc2 # $acc2=0,cf=0,of=0 ######################################################################## # Multiply by b[3] mulx 8*0+128($a_ptr), $t0, $t1 adcx $t0, $acc3 adox $t1, $acc4 mulx 8*1+128($a_ptr), $t0, $t1 adcx $t0, $acc4 adox $t1, $acc5 mulx 8*2+128($a_ptr), $t0, $t1 adcx $t0, $acc5 adox $t1, $acc0 mulx 8*3+128($a_ptr), $t0, $t1 mov $acc3, %rdx adcx $t0, $acc0 shlx $poly1, $acc3, $t0 adox $t1, $acc1 shrx $poly1, $acc3, $t1 adcx $acc2, $acc1 adox $acc2, $acc2 adc \$0, $acc2 ######################################################################## # Fourth reduction step add $t0, $acc4 adc $t1, $acc5 mulx $poly3, $t0, $t1 mov $acc4, $t2 mov .Lpoly+8*1(%rip), $poly1 adc $t0, $acc0 mov $acc5, $t3 adc $t1, $acc1 adc \$0, $acc2 
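################################################################################
# The "branch-less conditional subtraction" that follows (and its twins after
# every Montgomery multiplication/squaring in this file) is the standard
# constant-time tail: the reduced value can exceed the modulus by at most one
# multiple of P, so one conditional subtraction is enough.  Roughly, in
# illustrative pseudocode (not emitted as assembly):
#
#     diff, borrow = acc - P               # sub/sbb chain, incl. the carry word
#     result       = borrow ? acc : diff   # cmovc keeps the saved copy
#
# Both the subtraction and the selection are always executed, so the choice
# introduces no secret-dependent branch or memory access.
################################################################################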
######################################################################## # Branch-less conditional subtraction of P xor %eax, %eax mov $acc0, $t0 sbb \$-1, $acc4 # .Lpoly[0] sbb $poly1, $acc5 # .Lpoly[1] sbb \$0, $acc0 # .Lpoly[2] mov $acc1, $t1 sbb $poly3, $acc1 # .Lpoly[3] sbb \$0, $acc2 cmovc $t2, $acc4 cmovc $t3, $acc5 mov $acc4, 8*0($r_ptr) cmovc $t0, $acc0 mov $acc5, 8*1($r_ptr) cmovc $t1, $acc1 mov $acc0, 8*2($r_ptr) mov $acc1, 8*3($r_ptr) ret .cfi_endproc .size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx .globl ecp_nistz256_sqr_mont_adx .type ecp_nistz256_sqr_mont_adx,\@function,2 .align 32 ecp_nistz256_sqr_mont_adx: .cfi_startproc _CET_ENDBR push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lsqrx_body: mov 8*0($a_ptr), %rdx mov 8*1($a_ptr), $acc6 mov 8*2($a_ptr), $acc7 mov 8*3($a_ptr), $acc0 lea -128($a_ptr), $a_ptr # control u-op density call __ecp_nistz256_sqr_montx mov 0(%rsp),%r15 .cfi_restore %r15 mov 8(%rsp),%r14 .cfi_restore %r14 mov 16(%rsp),%r13 .cfi_restore %r13 mov 24(%rsp),%r12 .cfi_restore %r12 mov 32(%rsp),%rbx .cfi_restore %rbx mov 40(%rsp),%rbp .cfi_restore %rbp lea 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lsqrx_epilogue: ret .cfi_endproc .size ecp_nistz256_sqr_mont_adx,.-ecp_nistz256_sqr_mont_adx .type __ecp_nistz256_sqr_montx,\@abi-omnipotent .align 32 __ecp_nistz256_sqr_montx: .cfi_startproc mulx $acc6, $acc1, $acc2 # a[0]*a[1] mulx $acc7, $t0, $acc3 # a[0]*a[2] xor %eax, %eax adc $t0, $acc2 mulx $acc0, $t1, $acc4 # a[0]*a[3] mov $acc6, %rdx adc $t1, $acc3 adc \$0, $acc4 xor $acc5, $acc5 # $acc5=0,cf=0,of=0 ################################# mulx $acc7, $t0, $t1 # a[1]*a[2] adcx $t0, $acc3 adox $t1, $acc4 mulx $acc0, $t0, $t1 # a[1]*a[3] mov $acc7, %rdx adcx $t0, $acc4 adox $t1, $acc5 adc \$0, $acc5 ################################# mulx $acc0, $t0, $acc6 # a[2]*a[3] mov 8*0+128($a_ptr), %rdx xor $acc7, $acc7 # $acc7=0,cf=0,of=0 adcx $acc1, $acc1 # acc1:6<<1 adox $t0, $acc5 adcx $acc2, $acc2 adox $acc7, $acc6 # of=0 mulx %rdx, $acc0, $t1 mov 8*1+128($a_ptr), %rdx adcx $acc3, $acc3 adox $t1, $acc1 adcx $acc4, $acc4 mulx %rdx, $t0, $t4 mov 8*2+128($a_ptr), %rdx adcx $acc5, $acc5 adox $t0, $acc2 adcx $acc6, $acc6 .byte 0x67 mulx %rdx, $t0, $t1 mov 8*3+128($a_ptr), %rdx adox $t4, $acc3 adcx $acc7, $acc7 adox $t0, $acc4 mov \$32, $a_ptr adox $t1, $acc5 .byte 0x67,0x67 mulx %rdx, $t0, $t4 mov .Lpoly+8*3(%rip), %rdx adox $t0, $acc6 shlx $a_ptr, $acc0, $t0 adox $t4, $acc7 shrx $a_ptr, $acc0, $t4 mov %rdx,$t1 # reduction step 1 add $t0, $acc1 adc $t4, $acc2 mulx $acc0, $t0, $acc0 adc $t0, $acc3 shlx $a_ptr, $acc1, $t0 adc \$0, $acc0 shrx $a_ptr, $acc1, $t4 # reduction step 2 add $t0, $acc2 adc $t4, $acc3 mulx $acc1, $t0, $acc1 adc $t0, $acc0 shlx $a_ptr, $acc2, $t0 adc \$0, $acc1 shrx $a_ptr, $acc2, $t4 # reduction step 3 add $t0, $acc3 adc $t4, $acc0 mulx $acc2, $t0, $acc2 adc $t0, $acc1 shlx $a_ptr, $acc3, $t0 adc \$0, $acc2 shrx $a_ptr, $acc3, $t4 # reduction step 4 add $t0, $acc0 adc $t4, $acc1 mulx $acc3, $t0, $acc3 adc $t0, $acc2 adc \$0, $acc3 xor $t3, $t3 add $acc0, $acc4 # accumulate upper half mov .Lpoly+8*1(%rip), $a_ptr adc $acc1, $acc5 mov $acc4, $acc0 adc $acc2, $acc6 adc $acc3, $acc7 mov $acc5, $acc1 adc \$0, $t3 sub \$-1, $acc4 # .Lpoly[0] mov $acc6, $acc2 sbb $a_ptr, $acc5 # .Lpoly[1] sbb \$0, $acc6 # .Lpoly[2] mov $acc7, $acc3 sbb $t1, $acc7 # .Lpoly[3] sbb \$0, $t3 cmovc $acc0, $acc4 cmovc $acc1, $acc5 mov $acc4, 8*0($r_ptr) cmovc $acc2, $acc6 
mov $acc5, 8*1($r_ptr) cmovc $acc3, $acc7 mov $acc6, 8*2($r_ptr) mov $acc7, 8*3($r_ptr) ret .cfi_endproc .size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx ___ } } { my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7)); my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15)); my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15)); $code.=<<___; ################################################################################ # void ecp_nistz256_select_w5_nohw(uint64_t *val, uint64_t *in_t, int index); .globl ecp_nistz256_select_w5_nohw .type ecp_nistz256_select_w5_nohw,\@abi-omnipotent .align 32 ecp_nistz256_select_w5_nohw: .cfi_startproc _CET_ENDBR ___ $code.=<<___ if ($win64); lea -0x88(%rsp), %rax .LSEH_begin_ecp_nistz256_select_w5_nohw: .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax) .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax) .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax) .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax) .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax) .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax) .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax) .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax) .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax) ___ $code.=<<___; movdqa .LOne(%rip), $ONE movd $index, $INDEX pxor $Ra, $Ra pxor $Rb, $Rb pxor $Rc, $Rc pxor $Rd, $Rd pxor $Re, $Re pxor $Rf, $Rf movdqa $ONE, $M0 pshufd \$0, $INDEX, $INDEX mov \$16, %rax .Lselect_loop_sse_w5: movdqa $M0, $TMP0 paddd $ONE, $M0 pcmpeqd $INDEX, $TMP0 movdqa 16*0($in_t), $T0a movdqa 16*1($in_t), $T0b movdqa 16*2($in_t), $T0c movdqa 16*3($in_t), $T0d movdqa 16*4($in_t), $T0e movdqa 16*5($in_t), $T0f lea 16*6($in_t), $in_t pand $TMP0, $T0a pand $TMP0, $T0b por $T0a, $Ra pand $TMP0, $T0c por $T0b, $Rb pand $TMP0, $T0d por $T0c, $Rc pand $TMP0, $T0e por $T0d, $Rd pand $TMP0, $T0f por $T0e, $Re por $T0f, $Rf dec %rax jnz .Lselect_loop_sse_w5 movdqu $Ra, 16*0($val) movdqu $Rb, 16*1($val) movdqu $Rc, 16*2($val) movdqu $Rd, 16*3($val) movdqu $Re, 16*4($val) movdqu $Rf, 16*5($val) ___ $code.=<<___ if ($win64); movaps (%rsp), %xmm6 movaps 0x10(%rsp), %xmm7 movaps 0x20(%rsp), %xmm8 movaps 0x30(%rsp), %xmm9 movaps 0x40(%rsp), %xmm10 movaps 0x50(%rsp), %xmm11 movaps 0x60(%rsp), %xmm12 movaps 0x70(%rsp), %xmm13 movaps 0x80(%rsp), %xmm14 movaps 0x90(%rsp), %xmm15 lea 0xa8(%rsp), %rsp ___ $code.=<<___; ret .cfi_endproc .LSEH_end_ecp_nistz256_select_w5_nohw: .size ecp_nistz256_select_w5_nohw,.-ecp_nistz256_select_w5_nohw ################################################################################ # void ecp_nistz256_select_w7_nohw(uint64_t *val, uint64_t *in_t, int index); .globl ecp_nistz256_select_w7_nohw .type ecp_nistz256_select_w7_nohw,\@abi-omnipotent .align 32 ecp_nistz256_select_w7_nohw: .cfi_startproc _CET_ENDBR ___ $code.=<<___ if ($win64); lea -0x88(%rsp), %rax .LSEH_begin_ecp_nistz256_select_w7_nohw: .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax) .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax) .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax) .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax) .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax) .byte 0x44,0x0f,0x29,0x60,0x40 #movaps 
%xmm12, 0x40(%rax) .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax) .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax) .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax) ___ $code.=<<___; movdqa .LOne(%rip), $M0 movd $index, $INDEX pxor $Ra, $Ra pxor $Rb, $Rb pxor $Rc, $Rc pxor $Rd, $Rd movdqa $M0, $ONE pshufd \$0, $INDEX, $INDEX mov \$64, %rax .Lselect_loop_sse_w7: movdqa $M0, $TMP0 paddd $ONE, $M0 movdqa 16*0($in_t), $T0a movdqa 16*1($in_t), $T0b pcmpeqd $INDEX, $TMP0 movdqa 16*2($in_t), $T0c movdqa 16*3($in_t), $T0d lea 16*4($in_t), $in_t pand $TMP0, $T0a pand $TMP0, $T0b por $T0a, $Ra pand $TMP0, $T0c por $T0b, $Rb pand $TMP0, $T0d por $T0c, $Rc prefetcht0 255($in_t) por $T0d, $Rd dec %rax jnz .Lselect_loop_sse_w7 movdqu $Ra, 16*0($val) movdqu $Rb, 16*1($val) movdqu $Rc, 16*2($val) movdqu $Rd, 16*3($val) ___ $code.=<<___ if ($win64); movaps (%rsp), %xmm6 movaps 0x10(%rsp), %xmm7 movaps 0x20(%rsp), %xmm8 movaps 0x30(%rsp), %xmm9 movaps 0x40(%rsp), %xmm10 movaps 0x50(%rsp), %xmm11 movaps 0x60(%rsp), %xmm12 movaps 0x70(%rsp), %xmm13 movaps 0x80(%rsp), %xmm14 movaps 0x90(%rsp), %xmm15 lea 0xa8(%rsp), %rsp ___ $code.=<<___; ret .cfi_endproc .LSEH_end_ecp_nistz256_select_w7_nohw: .size ecp_nistz256_select_w7_nohw,.-ecp_nistz256_select_w7_nohw ___ } if ($avx>1) { my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4)); my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9)); my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14)); $code.=<<___; ################################################################################ # void ecp_nistz256_select_w5_avx2(uint64_t *val, uint64_t *in_t, int index); .globl ecp_nistz256_select_w5_avx2 .type ecp_nistz256_select_w5_avx2,\@abi-omnipotent .align 32 ecp_nistz256_select_w5_avx2: .cfi_startproc _CET_ENDBR vzeroupper ___ $code.=<<___ if ($win64); lea -0x88(%rsp), %rax mov %rsp,%r11 .LSEH_begin_ecp_nistz256_select_w5_avx2: .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax), %rsp .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6, -0x20(%rax) .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7, -0x10(%rax) .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8, 8(%rax) .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9, 0x10(%rax) .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10, 0x20(%rax) .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax) .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax) .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax) .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax) .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax) ___ $code.=<<___; vmovdqa .LTwo(%rip), $TWO vpxor $Ra, $Ra, $Ra vpxor $Rb, $Rb, $Rb vpxor $Rc, $Rc, $Rc vmovdqa .LOne(%rip), $M0 vmovdqa .LTwo(%rip), $M1 vmovd $index, %xmm1 vpermd $INDEX, $Ra, $INDEX mov \$8, %rax .Lselect_loop_avx2_w5: vmovdqa 32*0($in_t), $T0a vmovdqa 32*1($in_t), $T0b vmovdqa 32*2($in_t), $T0c vmovdqa 32*3($in_t), $T1a vmovdqa 32*4($in_t), $T1b vmovdqa 32*5($in_t), $T1c vpcmpeqd $INDEX, $M0, $TMP0 vpcmpeqd $INDEX, $M1, $TMP1 vpaddd $TWO, $M0, $M0 vpaddd $TWO, $M1, $M1 lea 32*6($in_t), $in_t vpand $TMP0, $T0a, $T0a vpand $TMP0, $T0b, $T0b vpand $TMP0, $T0c, $T0c vpand $TMP1, $T1a, $T1a vpand $TMP1, $T1b, $T1b vpand $TMP1, $T1c, $T1c vpxor $T0a, $Ra, $Ra vpxor $T0b, $Rb, $Rb vpxor $T0c, $Rc, $Rc vpxor $T1a, $Ra, $Ra vpxor $T1b, $Rb, $Rb vpxor $T1c, $Rc, $Rc dec %rax jnz .Lselect_loop_avx2_w5 vmovdqu $Ra, 32*0($val) vmovdqu $Rb, 32*1($val) vmovdqu $Rc, 32*2($val) vzeroupper ___ $code.=<<___ if 
($win64); movaps (%rsp), %xmm6 movaps 0x10(%rsp), %xmm7 movaps 0x20(%rsp), %xmm8 movaps 0x30(%rsp), %xmm9 movaps 0x40(%rsp), %xmm10 movaps 0x50(%rsp), %xmm11 movaps 0x60(%rsp), %xmm12 movaps 0x70(%rsp), %xmm13 movaps 0x80(%rsp), %xmm14 movaps 0x90(%rsp), %xmm15 lea (%r11), %rsp ___ $code.=<<___; ret .cfi_endproc .LSEH_end_ecp_nistz256_select_w5_avx2: .size ecp_nistz256_select_w5_avx2,.-ecp_nistz256_select_w5_avx2 ___ } if ($avx>1) { my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3)); my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7)); my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11)); my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15)); $code.=<<___; ################################################################################ # void ecp_nistz256_select_w7_avx2(uint64_t *val, uint64_t *in_t, int index); .globl ecp_nistz256_select_w7_avx2 .type ecp_nistz256_select_w7_avx2,\@abi-omnipotent .align 32 ecp_nistz256_select_w7_avx2: .cfi_startproc _CET_ENDBR vzeroupper ___ $code.=<<___ if ($win64); mov %rsp,%r11 lea -0x88(%rsp), %rax .LSEH_begin_ecp_nistz256_select_w7_avx2: .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax), %rsp .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6, -0x20(%rax) .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7, -0x10(%rax) .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8, 8(%rax) .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9, 0x10(%rax) .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10, 0x20(%rax) .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax) .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax) .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax) .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax) .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax) ___ $code.=<<___; vmovdqa .LThree(%rip), $THREE vpxor $Ra, $Ra, $Ra vpxor $Rb, $Rb, $Rb vmovdqa .LOne(%rip), $M0 vmovdqa .LTwo(%rip), $M1 vmovdqa .LThree(%rip), $M2 vmovd $index, %xmm1 vpermd $INDEX, $Ra, $INDEX # Skip index = 0, because it is implicitly the point at infinity mov \$21, %rax .Lselect_loop_avx2_w7: vmovdqa 32*0($in_t), $T0a vmovdqa 32*1($in_t), $T0b vmovdqa 32*2($in_t), $T1a vmovdqa 32*3($in_t), $T1b vmovdqa 32*4($in_t), $T2a vmovdqa 32*5($in_t), $T2b vpcmpeqd $INDEX, $M0, $TMP0 vpcmpeqd $INDEX, $M1, $TMP1 vpcmpeqd $INDEX, $M2, $TMP2 vpaddd $THREE, $M0, $M0 vpaddd $THREE, $M1, $M1 vpaddd $THREE, $M2, $M2 lea 32*6($in_t), $in_t vpand $TMP0, $T0a, $T0a vpand $TMP0, $T0b, $T0b vpand $TMP1, $T1a, $T1a vpand $TMP1, $T1b, $T1b vpand $TMP2, $T2a, $T2a vpand $TMP2, $T2b, $T2b vpxor $T0a, $Ra, $Ra vpxor $T0b, $Rb, $Rb vpxor $T1a, $Ra, $Ra vpxor $T1b, $Rb, $Rb vpxor $T2a, $Ra, $Ra vpxor $T2b, $Rb, $Rb dec %rax jnz .Lselect_loop_avx2_w7 vmovdqa 32*0($in_t), $T0a vmovdqa 32*1($in_t), $T0b vpcmpeqd $INDEX, $M0, $TMP0 vpand $TMP0, $T0a, $T0a vpand $TMP0, $T0b, $T0b vpxor $T0a, $Ra, $Ra vpxor $T0b, $Rb, $Rb vmovdqu $Ra, 32*0($val) vmovdqu $Rb, 32*1($val) vzeroupper ___ $code.=<<___ if ($win64); movaps (%rsp), %xmm6 movaps 0x10(%rsp), %xmm7 movaps 0x20(%rsp), %xmm8 movaps 0x30(%rsp), %xmm9 movaps 0x40(%rsp), %xmm10 movaps 0x50(%rsp), %xmm11 movaps 0x60(%rsp), %xmm12 movaps 0x70(%rsp), %xmm13 movaps 0x80(%rsp), %xmm14 movaps 0x90(%rsp), %xmm15 lea (%r11), %rsp ___ $code.=<<___; ret .cfi_endproc .LSEH_end_ecp_nistz256_select_w7_avx2: .size ecp_nistz256_select_w7_avx2,.-ecp_nistz256_select_w7_avx2 ___ } {{{ ######################################################################## # This block implements higher level point_double, 
point_add and # point_add_affine. The key to performance in this case is to allow # out-of-order execution logic to overlap computations from the next step # with tail processing from the current step. By using a tailored calling # sequence we minimize inter-step overhead and give the processor a better # shot at overlapping operations... # # You will notice that input data is copied to the stack. The trouble is that # there are no registers to spare for holding the original pointers, and # reloading those pointers would create undesired dependencies on the # effective-address calculation paths. In other words, the copying is done # deliberately to favour out-of-order execution logic. # my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx"); my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15)); my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4); my ($poly1,$poly3)=($acc6,$acc7); sub load_for_mul () { my ($a,$b,$src0) = @_; my $bias = $src0 eq "%rax" ? 0 : -128; " mov $b, $src0 lea $b, $b_ptr mov 8*0+$a, $acc1 mov 8*1+$a, $acc2 lea $bias+$a, $a_ptr mov 8*2+$a, $acc3 mov 8*3+$a, $acc4" } sub load_for_sqr () { my ($a,$src0) = @_; my $bias = $src0 eq "%rax" ? 0 : -128; " mov 8*0+$a, $src0 mov 8*1+$a, $acc6 lea $bias+$a, $a_ptr mov 8*2+$a, $acc7 mov 8*3+$a, $acc0" } { ######################################################################## # operate in 4-5-0-1 "name space" that matches multiplication output # my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); $code.=<<___; .type __ecp_nistz256_add_toq,\@abi-omnipotent .align 32 __ecp_nistz256_add_toq: .cfi_startproc xor $t4,$t4 add 8*0($b_ptr), $a0 adc 8*1($b_ptr), $a1 mov $a0, $t0 adc 8*2($b_ptr), $a2 adc 8*3($b_ptr), $a3 mov $a1, $t1 adc \$0, $t4 sub \$-1, $a0 mov $a2, $t2 sbb $poly1, $a1 sbb \$0, $a2 mov $a3, $t3 sbb $poly3, $a3 sbb \$0, $t4 cmovc $t0, $a0 cmovc $t1, $a1 mov $a0, 8*0($r_ptr) cmovc $t2, $a2 mov $a1, 8*1($r_ptr) cmovc $t3, $a3 mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) ret .cfi_endproc .size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq .type __ecp_nistz256_sub_fromq,\@abi-omnipotent .align 32 __ecp_nistz256_sub_fromq: .cfi_startproc sub 8*0($b_ptr), $a0 sbb 8*1($b_ptr), $a1 mov $a0, $t0 sbb 8*2($b_ptr), $a2 sbb 8*3($b_ptr), $a3 mov $a1, $t1 sbb $t4, $t4 add \$-1, $a0 mov $a2, $t2 adc $poly1, $a1 adc \$0, $a2 mov $a3, $t3 adc $poly3, $a3 test $t4, $t4 cmovz $t0, $a0 cmovz $t1, $a1 mov $a0, 8*0($r_ptr) cmovz $t2, $a2 mov $a1, 8*1($r_ptr) cmovz $t3, $a3 mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) ret .cfi_endproc .size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq .type __ecp_nistz256_subq,\@abi-omnipotent .align 32 __ecp_nistz256_subq: .cfi_startproc sub $a0, $t0 sbb $a1, $t1 mov $t0, $a0 sbb $a2, $t2 sbb $a3, $t3 mov $t1, $a1 sbb $t4, $t4 add \$-1, $t0 mov $t2, $a2 adc $poly1, $t1 adc \$0, $t2 mov $t3, $a3 adc $poly3, $t3 test $t4, $t4 cmovnz $t0, $a0 cmovnz $t1, $a1 cmovnz $t2, $a2 cmovnz $t3, $a3 ret .cfi_endproc .size __ecp_nistz256_subq,.-__ecp_nistz256_subq .type __ecp_nistz256_mul_by_2q,\@abi-omnipotent .align 32 __ecp_nistz256_mul_by_2q: .cfi_startproc xor $t4, $t4 add $a0, $a0 # a0:a3+a0:a3 adc $a1, $a1 mov $a0, $t0 adc $a2, $a2 adc $a3, $a3 mov $a1, $t1 adc \$0, $t4 sub \$-1, $a0 mov $a2, $t2 sbb $poly1, $a1 sbb \$0, $a2 mov $a3, $t3 sbb $poly3, $a3 sbb \$0, $t4 cmovc $t0, $a0 cmovc $t1, $a1 mov $a0, 8*0($r_ptr) cmovc $t2, $a2 mov $a1, 8*1($r_ptr) cmovc $t3, $a3 mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) ret .cfi_endproc .size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q ___ } sub gen_double () { my $x =
shift; my ($src0,$sfx,$bias); my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4)); if ($x ne "x") { $src0 = "%rax"; $sfx = "_nohw"; $bias = 0; } else { $src0 = "%rdx"; $sfx = "_adx"; $bias = 128; } $code.=<<___; .globl ecp_nistz256_point_double$sfx .type ecp_nistz256_point_double$sfx,\@function,2 .align 32 ecp_nistz256_point_double$sfx: .cfi_startproc _CET_ENDBR push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$32*5+8, %rsp .cfi_adjust_cfa_offset 32*5+8 .Lpoint_double${x}_body: .Lpoint_double_shortcut$x: movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x mov $a_ptr, $b_ptr # backup copy movdqu 0x10($a_ptr), %xmm1 mov 0x20+8*0($a_ptr), $acc4 # load in_y in "5-4-0-1" order mov 0x20+8*1($a_ptr), $acc5 mov 0x20+8*2($a_ptr), $acc0 mov 0x20+8*3($a_ptr), $acc1 mov .Lpoly+8*1(%rip), $poly1 mov .Lpoly+8*3(%rip), $poly3 movdqa %xmm0, $in_x(%rsp) movdqa %xmm1, $in_x+0x10(%rsp) lea 0x20($r_ptr), $acc2 lea 0x40($r_ptr), $acc3 movq $r_ptr, %xmm0 movq $acc2, %xmm1 movq $acc3, %xmm2 lea $S(%rsp), $r_ptr call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(S, in_y); mov 0x40+8*0($a_ptr), $src0 mov 0x40+8*1($a_ptr), $acc6 mov 0x40+8*2($a_ptr), $acc7 mov 0x40+8*3($a_ptr), $acc0 lea 0x40-$bias($a_ptr), $a_ptr lea $Zsqr(%rsp), $r_ptr call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Zsqr, in_z); `&load_for_sqr("$S(%rsp)", "$src0")` lea $S(%rsp), $r_ptr call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(S, S); mov 0x20($b_ptr), $src0 # $b_ptr is still valid mov 0x40+8*0($b_ptr), $acc1 mov 0x40+8*1($b_ptr), $acc2 mov 0x40+8*2($b_ptr), $acc3 mov 0x40+8*3($b_ptr), $acc4 lea 0x40-$bias($b_ptr), $a_ptr lea 0x20($b_ptr), $b_ptr movq %xmm2, $r_ptr call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, in_z, in_y); call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(res_z, res_z); mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order mov $in_x+8*1(%rsp), $acc5 lea $Zsqr(%rsp), $b_ptr mov $in_x+8*2(%rsp), $acc0 mov $in_x+8*3(%rsp), $acc1 lea $M(%rsp), $r_ptr call __ecp_nistz256_add_to$x # p256_add(M, in_x, Zsqr); mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order mov $in_x+8*1(%rsp), $acc5 lea $Zsqr(%rsp), $b_ptr mov $in_x+8*2(%rsp), $acc0 mov $in_x+8*3(%rsp), $acc1 lea $Zsqr(%rsp), $r_ptr call __ecp_nistz256_sub_from$x # p256_sub(Zsqr, in_x, Zsqr); `&load_for_sqr("$S(%rsp)", "$src0")` movq %xmm1, $r_ptr call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S); ___ { ######## ecp_nistz256_div_by_2(res_y, res_y); ########################## # operate in 4-5-6-7 "name space" that matches squaring output # my ($poly1,$poly3)=($a_ptr,$t1); my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2); $code.=<<___; xor $t4, $t4 mov $a0, $t0 add \$-1, $a0 mov $a1, $t1 adc $poly1, $a1 mov $a2, $t2 adc \$0, $a2 mov $a3, $t3 adc $poly3, $a3 adc \$0, $t4 xor $a_ptr, $a_ptr # borrow $a_ptr test \$1, $t0 cmovz $t0, $a0 cmovz $t1, $a1 cmovz $t2, $a2 cmovz $t3, $a3 cmovz $a_ptr, $t4 mov $a1, $t0 # a0:a3>>1 shr \$1, $a0 shl \$63, $t0 mov $a2, $t1 shr \$1, $a1 or $t0, $a0 shl \$63, $t1 mov $a3, $t2 shr \$1, $a2 or $t1, $a1 shl \$63, $t2 mov $a0, 8*0($r_ptr) shr \$1, $a3 mov $a1, 8*1($r_ptr) shl \$63, $t4 or $t2, $a2 or $t4, $a3 mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) ___ } $code.=<<___; `&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")` lea $M(%rsp), $r_ptr call __ecp_nistz256_mul_mont$x # p256_mul_mont(M, M, Zsqr); lea $tmp0(%rsp), $r_ptr call __ecp_nistz256_mul_by_2$x lea $M(%rsp), $b_ptr lea $M(%rsp), $r_ptr call __ecp_nistz256_add_to$x # 
p256_mul_by_3(M, M); `&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")` lea $S(%rsp), $r_ptr call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, in_x); lea $tmp0(%rsp), $r_ptr call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(tmp0, S); `&load_for_sqr("$M(%rsp)", "$src0")` movq %xmm0, $r_ptr call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_x, M); lea $tmp0(%rsp), $b_ptr mov $acc6, $acc0 # harmonize sqr output and sub input mov $acc7, $acc1 mov $a_ptr, $poly1 mov $t1, $poly3 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, tmp0); mov $S+8*0(%rsp), $t0 mov $S+8*1(%rsp), $t1 mov $S+8*2(%rsp), $t2 mov $S+8*3(%rsp), $acc2 # "4-5-0-1" order lea $S(%rsp), $r_ptr call __ecp_nistz256_sub$x # p256_sub(S, S, res_x); mov $M(%rsp), $src0 lea $M(%rsp), $b_ptr mov $acc4, $acc6 # harmonize sub output and mul input xor %ecx, %ecx mov $acc4, $S+8*0(%rsp) # have to save:-( mov $acc5, $acc2 mov $acc5, $S+8*1(%rsp) cmovz $acc0, $acc3 mov $acc0, $S+8*2(%rsp) lea $S-$bias(%rsp), $a_ptr cmovz $acc1, $acc4 mov $acc1, $S+8*3(%rsp) mov $acc6, $acc1 lea $S(%rsp), $r_ptr call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, M); movq %xmm1, $b_ptr movq %xmm1, $r_ptr call __ecp_nistz256_sub_from$x # p256_sub(res_y, S, res_y); lea 32*5+56(%rsp), %rsi .cfi_def_cfa %rsi,8 mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbx .cfi_restore %rbx mov -8(%rsi),%rbp .cfi_restore %rbp lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lpoint_double${x}_epilogue: ret .cfi_endproc .size ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx ___ } &gen_double("q"); sub gen_add () { my $x = shift; my ($src0,$sfx,$bias); my ($H,$Hsqr,$R,$Rsqr,$Hcub, $U1,$U2,$S1,$S2, $res_x,$res_y,$res_z, $in1_x,$in1_y,$in1_z, $in2_x,$in2_y,$in2_z)=map(32*$_,(0..17)); my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); if ($x ne "x") { $src0 = "%rax"; $sfx = "_nohw"; $bias = 0; } else { $src0 = "%rdx"; $sfx = "_adx"; $bias = 128; } $code.=<<___; .globl ecp_nistz256_point_add$sfx .type ecp_nistz256_point_add$sfx,\@function,3 .align 32 ecp_nistz256_point_add$sfx: .cfi_startproc _CET_ENDBR push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$32*18+8, %rsp .cfi_adjust_cfa_offset 32*18+8 .Lpoint_add${x}_body: movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr movdqu 0x10($a_ptr), %xmm1 movdqu 0x20($a_ptr), %xmm2 movdqu 0x30($a_ptr), %xmm3 movdqu 0x40($a_ptr), %xmm4 movdqu 0x50($a_ptr), %xmm5 mov $a_ptr, $b_ptr # reassign mov $b_org, $a_ptr # reassign movdqa %xmm0, $in1_x(%rsp) movdqa %xmm1, $in1_x+0x10(%rsp) movdqa %xmm2, $in1_y(%rsp) movdqa %xmm3, $in1_y+0x10(%rsp) movdqa %xmm4, $in1_z(%rsp) movdqa %xmm5, $in1_z+0x10(%rsp) por %xmm4, %xmm5 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$b_ptr pshufd \$0xb1, %xmm5, %xmm3 movdqu 0x10($a_ptr), %xmm1 movdqu 0x20($a_ptr), %xmm2 por %xmm3, %xmm5 movdqu 0x30($a_ptr), %xmm3 mov 0x40+8*0($a_ptr), $src0 # load original in2_z mov 0x40+8*1($a_ptr), $acc6 mov 0x40+8*2($a_ptr), $acc7 mov 0x40+8*3($a_ptr), $acc0 movdqa %xmm0, $in2_x(%rsp) pshufd \$0x1e, %xmm5, %xmm4 movdqa %xmm1, $in2_x+0x10(%rsp) movdqu 0x40($a_ptr),%xmm0 # in2_z again movdqu 0x50($a_ptr),%xmm1 movdqa %xmm2, $in2_y(%rsp) movdqa %xmm3, $in2_y+0x10(%rsp) por %xmm4, %xmm5 pxor %xmm4, %xmm4 por %xmm0, %xmm1 movq $r_ptr, %xmm0 # save $r_ptr lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid mov $src0, $in2_z+8*0(%rsp) # make in2_z copy mov 
$acc6, $in2_z+8*1(%rsp) mov $acc7, $in2_z+8*2(%rsp) mov $acc0, $in2_z+8*3(%rsp) lea $Z2sqr(%rsp), $r_ptr # Z2^2 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z2sqr, in2_z); pcmpeqd %xmm4, %xmm5 pshufd \$0xb1, %xmm1, %xmm4 por %xmm1, %xmm4 pshufd \$0, %xmm5, %xmm5 # in1infty pshufd \$0x1e, %xmm4, %xmm3 por %xmm3, %xmm4 pxor %xmm3, %xmm3 pcmpeqd %xmm3, %xmm4 pshufd \$0, %xmm4, %xmm4 # in2infty mov 0x40+8*0($b_ptr), $src0 # load original in1_z mov 0x40+8*1($b_ptr), $acc6 mov 0x40+8*2($b_ptr), $acc7 mov 0x40+8*3($b_ptr), $acc0 movq $b_ptr, %xmm1 lea 0x40-$bias($b_ptr), $a_ptr lea $Z1sqr(%rsp), $r_ptr # Z1^2 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z); `&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")` lea $S1(%rsp), $r_ptr # S1 = Z2^3 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, Z2sqr, in2_z); `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")` lea $S2(%rsp), $r_ptr # S2 = Z1^3 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z); `&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")` lea $S1(%rsp), $r_ptr # S1 = Y1*Z2^3 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, S1, in1_y); `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")` lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y); lea $S1(%rsp), $b_ptr lea $R(%rsp), $r_ptr # R = S2 - S1 call __ecp_nistz256_sub_from$x # p256_sub(R, S2, S1); or $acc5, $acc4 # see if result is zero movdqa %xmm4, %xmm2 or $acc0, $acc4 or $acc1, $acc4 por %xmm5, %xmm2 # in1infty || in2infty movq $acc4, %xmm3 `&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")` lea $U1(%rsp), $r_ptr # U1 = X1*Z2^2 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U1, in1_x, Z2sqr); `&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")` lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in2_x, Z1sqr); lea $U1(%rsp), $b_ptr lea $H(%rsp), $r_ptr # H = U2 - U1 call __ecp_nistz256_sub_from$x # p256_sub(H, U2, U1); or $acc5, $acc4 # see if result is zero or $acc0, $acc4 or $acc1, $acc4 # !is_equal(U1, U2) movq %xmm2, $acc0 movq %xmm3, $acc1 or $acc0, $acc4 .byte 0x3e # predict taken jnz .Ladd_proceed$x # !is_equal(U1, U2) || in1infty || in2infty # We now know A = B or A = -B and neither is infinity. Compare the # y-coordinates via S1 and S2. test $acc1, $acc1 jz .Ladd_double$x # is_equal(S1, S2) # A = -B, so the result is infinity. # # TODO(davidben): Does .Ladd_proceed handle this case? It seems to, in # which case we should eliminate this special-case and simplify the # timing analysis. 
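################################################################################
# Case analysis around the branches above (a clarifying summary of what the
# code below implements):
#
#   - H = U2 - U1 != 0, or either input is infinity:  jump to .Ladd_proceed
#     and run the generic addition formulas.
#   - H == 0 and R = S2 - S1 == 0:  both inputs represent the same affine
#     point, so jump to .Ladd_double and reuse the point-doubling code.
#   - H == 0 and R != 0:  the inputs are P and -P, so the sum is the point at
#     infinity, encoded here as all-zero words -- which is exactly what the
#     stores below write out.
################################################################################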
movq %xmm0, $r_ptr # restore $r_ptr pxor %xmm0, %xmm0 movdqu %xmm0, 0x00($r_ptr) movdqu %xmm0, 0x10($r_ptr) movdqu %xmm0, 0x20($r_ptr) movdqu %xmm0, 0x30($r_ptr) movdqu %xmm0, 0x40($r_ptr) movdqu %xmm0, 0x50($r_ptr) jmp .Ladd_done$x .align 32 .Ladd_double$x: movq %xmm1, $a_ptr # restore $a_ptr movq %xmm0, $r_ptr # restore $r_ptr add \$`32*(18-5)`, %rsp # difference in frame sizes .cfi_adjust_cfa_offset `-32*(18-5)` jmp .Lpoint_double_shortcut$x .cfi_adjust_cfa_offset `32*(18-5)` .align 32 .Ladd_proceed$x: `&load_for_sqr("$R(%rsp)", "$src0")` lea $Rsqr(%rsp), $r_ptr # R^2 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R); `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")` lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z); `&load_for_sqr("$H(%rsp)", "$src0")` lea $Hsqr(%rsp), $r_ptr # H^2 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H); `&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")` lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, res_z, in2_z); `&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")` lea $Hcub(%rsp), $r_ptr # H^3 call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H); `&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")` lea $U2(%rsp), $r_ptr # U1*H^2 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, U1, Hsqr); ___ { ####################################################################### # operate in 4-5-0-1 "name space" that matches multiplication output # my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); my ($poly1, $poly3)=($acc6,$acc7); $code.=<<___; #lea $U2(%rsp), $a_ptr #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2 #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2); xor $t4, $t4 add $acc0, $acc0 # a0:a3+a0:a3 lea $Rsqr(%rsp), $a_ptr adc $acc1, $acc1 mov $acc0, $t0 adc $acc2, $acc2 adc $acc3, $acc3 mov $acc1, $t1 adc \$0, $t4 sub \$-1, $acc0 mov $acc2, $t2 sbb $poly1, $acc1 sbb \$0, $acc2 mov $acc3, $t3 sbb $poly3, $acc3 sbb \$0, $t4 cmovc $t0, $acc0 mov 8*0($a_ptr), $t0 cmovc $t1, $acc1 mov 8*1($a_ptr), $t1 cmovc $t2, $acc2 mov 8*2($a_ptr), $t2 cmovc $t3, $acc3 mov 8*3($a_ptr), $t3 call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr); lea $Hcub(%rsp), $b_ptr lea $res_x(%rsp), $r_ptr call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub); mov $U2+8*0(%rsp), $t0 mov $U2+8*1(%rsp), $t1 mov $U2+8*2(%rsp), $t2 mov $U2+8*3(%rsp), $t3 lea $res_y(%rsp), $r_ptr call __ecp_nistz256_sub$x # p256_sub(res_y, U2, res_x); mov $acc0, 8*0($r_ptr) # save the result, as mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't mov $acc2, 8*2($r_ptr) mov $acc3, 8*3($r_ptr) ___ } $code.=<<___; `&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")` lea $S2(%rsp), $r_ptr call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S1, Hcub); `&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")` lea $res_y(%rsp), $r_ptr call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_y, R, res_y); lea $S2(%rsp), $b_ptr lea $res_y(%rsp), $r_ptr call __ecp_nistz256_sub_from$x # p256_sub(res_y, res_y, S2); movq %xmm0, $r_ptr # restore $r_ptr movdqa %xmm5, %xmm0 # copy_conditional(res_z, in2_z, in1infty); movdqa %xmm5, %xmm1 pandn $res_z(%rsp), %xmm0 movdqa %xmm5, %xmm2 pandn $res_z+0x10(%rsp), %xmm1 movdqa %xmm5, %xmm3 pand $in2_z(%rsp), %xmm2 pand $in2_z+0x10(%rsp), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty); movdqa %xmm4, %xmm1 pandn %xmm2, %xmm0 movdqa %xmm4, %xmm2 pandn %xmm3, %xmm1 movdqa %xmm4, 
%xmm3 pand $in1_z(%rsp), %xmm2 pand $in1_z+0x10(%rsp), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqu %xmm2, 0x40($r_ptr) movdqu %xmm3, 0x50($r_ptr) movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty); movdqa %xmm5, %xmm1 pandn $res_x(%rsp), %xmm0 movdqa %xmm5, %xmm2 pandn $res_x+0x10(%rsp), %xmm1 movdqa %xmm5, %xmm3 pand $in2_x(%rsp), %xmm2 pand $in2_x+0x10(%rsp), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty); movdqa %xmm4, %xmm1 pandn %xmm2, %xmm0 movdqa %xmm4, %xmm2 pandn %xmm3, %xmm1 movdqa %xmm4, %xmm3 pand $in1_x(%rsp), %xmm2 pand $in1_x+0x10(%rsp), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqu %xmm2, 0x00($r_ptr) movdqu %xmm3, 0x10($r_ptr) movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty); movdqa %xmm5, %xmm1 pandn $res_y(%rsp), %xmm0 movdqa %xmm5, %xmm2 pandn $res_y+0x10(%rsp), %xmm1 movdqa %xmm5, %xmm3 pand $in2_y(%rsp), %xmm2 pand $in2_y+0x10(%rsp), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty); movdqa %xmm4, %xmm1 pandn %xmm2, %xmm0 movdqa %xmm4, %xmm2 pandn %xmm3, %xmm1 movdqa %xmm4, %xmm3 pand $in1_y(%rsp), %xmm2 pand $in1_y+0x10(%rsp), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqu %xmm2, 0x20($r_ptr) movdqu %xmm3, 0x30($r_ptr) .Ladd_done$x: lea 32*18+56(%rsp), %rsi .cfi_def_cfa %rsi,8 mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbx .cfi_restore %rbx mov -8(%rsi),%rbp .cfi_restore %rbp lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lpoint_add${x}_epilogue: ret .cfi_endproc .size ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx ___ } &gen_add("q"); sub gen_add_affine () { my $x = shift; my ($src0,$sfx,$bias); my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr, $res_x,$res_y,$res_z, $in1_x,$in1_y,$in1_z, $in2_x,$in2_y)=map(32*$_,(0..14)); my $Z1sqr = $S2; if ($x ne "x") { $src0 = "%rax"; $sfx = "_nohw"; $bias = 0; } else { $src0 = "%rdx"; $sfx = "_adx"; $bias = 128; } $code.=<<___; .globl ecp_nistz256_point_add_affine$sfx .type ecp_nistz256_point_add_affine$sfx,\@function,3 .align 32 ecp_nistz256_point_add_affine$sfx: .cfi_startproc _CET_ENDBR push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$32*15+8, %rsp .cfi_adjust_cfa_offset 32*15+8 .Ladd_affine${x}_body: movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr mov $b_org, $b_ptr # reassign movdqu 0x10($a_ptr), %xmm1 movdqu 0x20($a_ptr), %xmm2 movdqu 0x30($a_ptr), %xmm3 movdqu 0x40($a_ptr), %xmm4 movdqu 0x50($a_ptr), %xmm5 mov 0x40+8*0($a_ptr), $src0 # load original in1_z mov 0x40+8*1($a_ptr), $acc6 mov 0x40+8*2($a_ptr), $acc7 mov 0x40+8*3($a_ptr), $acc0 movdqa %xmm0, $in1_x(%rsp) movdqa %xmm1, $in1_x+0x10(%rsp) movdqa %xmm2, $in1_y(%rsp) movdqa %xmm3, $in1_y+0x10(%rsp) movdqa %xmm4, $in1_z(%rsp) movdqa %xmm5, $in1_z+0x10(%rsp) por %xmm4, %xmm5 movdqu 0x00($b_ptr), %xmm0 # copy *(P256_POINT_AFFINE *)$b_ptr pshufd \$0xb1, %xmm5, %xmm3 movdqu 0x10($b_ptr), %xmm1 movdqu 0x20($b_ptr), %xmm2 por %xmm3, %xmm5 movdqu 0x30($b_ptr), %xmm3 movdqa %xmm0, $in2_x(%rsp) pshufd \$0x1e, %xmm5, %xmm4 movdqa %xmm1, $in2_x+0x10(%rsp) por %xmm0, %xmm1 movq $r_ptr, %xmm0 # save $r_ptr movdqa %xmm2, $in2_y(%rsp) movdqa %xmm3, $in2_y+0x10(%rsp) por %xmm2, %xmm3 por %xmm4, %xmm5 pxor %xmm4, %xmm4 por %xmm1, %xmm3 lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid lea $Z1sqr(%rsp), $r_ptr # Z1^2 
call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z); pcmpeqd %xmm4, %xmm5 pshufd \$0xb1, %xmm3, %xmm4 mov 0x00($b_ptr), $src0 # $b_ptr is still valid #lea 0x00($b_ptr), $b_ptr mov $acc4, $acc1 # harmonize sqr output and mul input por %xmm3, %xmm4 pshufd \$0, %xmm5, %xmm5 # in1infty pshufd \$0x1e, %xmm4, %xmm3 mov $acc5, $acc2 por %xmm3, %xmm4 pxor %xmm3, %xmm3 mov $acc6, $acc3 pcmpeqd %xmm3, %xmm4 pshufd \$0, %xmm4, %xmm4 # in2infty lea $Z1sqr-$bias(%rsp), $a_ptr mov $acc7, $acc4 lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, Z1sqr, in2_x); lea $in1_x(%rsp), $b_ptr lea $H(%rsp), $r_ptr # H = U2 - U1 call __ecp_nistz256_sub_from$x # p256_sub(H, U2, in1_x); `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")` lea $S2(%rsp), $r_ptr # S2 = Z1^3 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z); `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")` lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z); `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")` lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y); lea $in1_y(%rsp), $b_ptr lea $R(%rsp), $r_ptr # R = S2 - S1 call __ecp_nistz256_sub_from$x # p256_sub(R, S2, in1_y); `&load_for_sqr("$H(%rsp)", "$src0")` lea $Hsqr(%rsp), $r_ptr # H^2 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H); `&load_for_sqr("$R(%rsp)", "$src0")` lea $Rsqr(%rsp), $r_ptr # R^2 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R); `&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")` lea $Hcub(%rsp), $r_ptr # H^3 call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H); `&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")` lea $U2(%rsp), $r_ptr # U1*H^2 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in1_x, Hsqr); ___ { ####################################################################### # operate in 4-5-0-1 "name space" that matches multiplication output # my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); my ($poly1, $poly3)=($acc6,$acc7); $code.=<<___; #lea $U2(%rsp), $a_ptr #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2 #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2); xor $t4, $t4 add $acc0, $acc0 # a0:a3+a0:a3 lea $Rsqr(%rsp), $a_ptr adc $acc1, $acc1 mov $acc0, $t0 adc $acc2, $acc2 adc $acc3, $acc3 mov $acc1, $t1 adc \$0, $t4 sub \$-1, $acc0 mov $acc2, $t2 sbb $poly1, $acc1 sbb \$0, $acc2 mov $acc3, $t3 sbb $poly3, $acc3 sbb \$0, $t4 cmovc $t0, $acc0 mov 8*0($a_ptr), $t0 cmovc $t1, $acc1 mov 8*1($a_ptr), $t1 cmovc $t2, $acc2 mov 8*2($a_ptr), $t2 cmovc $t3, $acc3 mov 8*3($a_ptr), $t3 call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr); lea $Hcub(%rsp), $b_ptr lea $res_x(%rsp), $r_ptr call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub); mov $U2+8*0(%rsp), $t0 mov $U2+8*1(%rsp), $t1 mov $U2+8*2(%rsp), $t2 mov $U2+8*3(%rsp), $t3 lea $H(%rsp), $r_ptr call __ecp_nistz256_sub$x # p256_sub(H, U2, res_x); mov $acc0, 8*0($r_ptr) # save the result, as mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't mov $acc2, 8*2($r_ptr) mov $acc3, 8*3($r_ptr) ___ } $code.=<<___; `&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")` lea $S2(%rsp), $r_ptr call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Hcub, in1_y); `&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")` lea $H(%rsp), $r_ptr call __ecp_nistz256_mul_mont$x # p256_mul_mont(H, H, R); lea $S2(%rsp), $b_ptr lea $res_y(%rsp), $r_ptr call __ecp_nistz256_sub_from$x # p256_sub(res_y, H, S2); movq %xmm0, 
$r_ptr # restore $r_ptr movdqa %xmm5, %xmm0 # copy_conditional(res_z, ONE, in1infty); movdqa %xmm5, %xmm1 pandn $res_z(%rsp), %xmm0 movdqa %xmm5, %xmm2 pandn $res_z+0x10(%rsp), %xmm1 movdqa %xmm5, %xmm3 pand .LONE_mont(%rip), %xmm2 pand .LONE_mont+0x10(%rip), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty); movdqa %xmm4, %xmm1 pandn %xmm2, %xmm0 movdqa %xmm4, %xmm2 pandn %xmm3, %xmm1 movdqa %xmm4, %xmm3 pand $in1_z(%rsp), %xmm2 pand $in1_z+0x10(%rsp), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqu %xmm2, 0x40($r_ptr) movdqu %xmm3, 0x50($r_ptr) movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty); movdqa %xmm5, %xmm1 pandn $res_x(%rsp), %xmm0 movdqa %xmm5, %xmm2 pandn $res_x+0x10(%rsp), %xmm1 movdqa %xmm5, %xmm3 pand $in2_x(%rsp), %xmm2 pand $in2_x+0x10(%rsp), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty); movdqa %xmm4, %xmm1 pandn %xmm2, %xmm0 movdqa %xmm4, %xmm2 pandn %xmm3, %xmm1 movdqa %xmm4, %xmm3 pand $in1_x(%rsp), %xmm2 pand $in1_x+0x10(%rsp), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqu %xmm2, 0x00($r_ptr) movdqu %xmm3, 0x10($r_ptr) movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty); movdqa %xmm5, %xmm1 pandn $res_y(%rsp), %xmm0 movdqa %xmm5, %xmm2 pandn $res_y+0x10(%rsp), %xmm1 movdqa %xmm5, %xmm3 pand $in2_y(%rsp), %xmm2 pand $in2_y+0x10(%rsp), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty); movdqa %xmm4, %xmm1 pandn %xmm2, %xmm0 movdqa %xmm4, %xmm2 pandn %xmm3, %xmm1 movdqa %xmm4, %xmm3 pand $in1_y(%rsp), %xmm2 pand $in1_y+0x10(%rsp), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqu %xmm2, 0x20($r_ptr) movdqu %xmm3, 0x30($r_ptr) lea 32*15+56(%rsp), %rsi .cfi_def_cfa %rsi,8 mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbx .cfi_restore %rbx mov -8(%rsi),%rbp .cfi_restore %rbp lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Ladd_affine${x}_epilogue: ret .cfi_endproc .size ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx ___ } &gen_add_affine("q"); ######################################################################## # AD*X magic # if ($addx) { { ######################################################################## # operate in 4-5-0-1 "name space" that matches multiplication output # my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); $code.=<<___; .type __ecp_nistz256_add_tox,\@abi-omnipotent .align 32 __ecp_nistz256_add_tox: .cfi_startproc xor $t4, $t4 adc 8*0($b_ptr), $a0 adc 8*1($b_ptr), $a1 mov $a0, $t0 adc 8*2($b_ptr), $a2 adc 8*3($b_ptr), $a3 mov $a1, $t1 adc \$0, $t4 xor $t3, $t3 sbb \$-1, $a0 mov $a2, $t2 sbb $poly1, $a1 sbb \$0, $a2 mov $a3, $t3 sbb $poly3, $a3 sbb \$0, $t4 cmovc $t0, $a0 cmovc $t1, $a1 mov $a0, 8*0($r_ptr) cmovc $t2, $a2 mov $a1, 8*1($r_ptr) cmovc $t3, $a3 mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) ret .cfi_endproc .size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox .type __ecp_nistz256_sub_fromx,\@abi-omnipotent .align 32 __ecp_nistz256_sub_fromx: .cfi_startproc xor $t4, $t4 sbb 8*0($b_ptr), $a0 sbb 8*1($b_ptr), $a1 mov $a0, $t0 sbb 8*2($b_ptr), $a2 sbb 8*3($b_ptr), $a3 mov $a1, $t1 sbb \$0, $t4 xor $t3, $t3 adc \$-1, $a0 mov $a2, $t2 adc $poly1, $a1 adc \$0, $a2 mov $a3, $t3 adc $poly3, $a3 bt \$0, $t4 cmovnc $t0, $a0 cmovnc $t1, $a1 mov $a0, 8*0($r_ptr) cmovnc $t2, $a2 mov $a1, 8*1($r_ptr) cmovnc $t3, $a3 mov 
$a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) ret .cfi_endproc .size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx .type __ecp_nistz256_subx,\@abi-omnipotent .align 32 __ecp_nistz256_subx: .cfi_startproc xor $t4, $t4 sbb $a0, $t0 sbb $a1, $t1 mov $t0, $a0 sbb $a2, $t2 sbb $a3, $t3 mov $t1, $a1 sbb \$0, $t4 xor $a3 ,$a3 adc \$-1, $t0 mov $t2, $a2 adc $poly1, $t1 adc \$0, $t2 mov $t3, $a3 adc $poly3, $t3 bt \$0, $t4 cmovc $t0, $a0 cmovc $t1, $a1 cmovc $t2, $a2 cmovc $t3, $a3 ret .cfi_endproc .size __ecp_nistz256_subx,.-__ecp_nistz256_subx .type __ecp_nistz256_mul_by_2x,\@abi-omnipotent .align 32 __ecp_nistz256_mul_by_2x: .cfi_startproc xor $t4, $t4 adc $a0, $a0 # a0:a3+a0:a3 adc $a1, $a1 mov $a0, $t0 adc $a2, $a2 adc $a3, $a3 mov $a1, $t1 adc \$0, $t4 xor $t3, $t3 sbb \$-1, $a0 mov $a2, $t2 sbb $poly1, $a1 sbb \$0, $a2 mov $a3, $t3 sbb $poly3, $a3 sbb \$0, $t4 cmovc $t0, $a0 cmovc $t1, $a1 mov $a0, 8*0($r_ptr) cmovc $t2, $a2 mov $a1, 8*1($r_ptr) cmovc $t3, $a3 mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) ret .cfi_endproc .size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x ___ } &gen_double("x"); &gen_add("x"); &gen_add_affine("x"); } }}} # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) if ($win64) { $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type short_handler,\@abi-omnipotent .align 16 short_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # end of prologue label cmp %r10,%rbx # context->RipRsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail lea 16(%rax),%rax mov -8(%rax),%r12 mov -16(%rax),%r13 mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 jmp .Lcommon_seh_tail .size short_handler,.-short_handler .type full_handler,\@abi-omnipotent .align 16 full_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # end of prologue label cmp %r10,%rbx # context->RipRsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail mov 8(%r11),%r10d # HandlerData[2] lea (%rax,%r10),%rax mov -8(%rax),%rbp mov -16(%rax),%rbx mov -24(%rax),%r12 mov -32(%rax),%r13 mov -40(%rax),%r14 mov -48(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 .Lcommon_seh_tail: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, 
UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size full_handler,.-full_handler .section .pdata .align 4 .rva .LSEH_begin_ecp_nistz256_neg .rva .LSEH_end_ecp_nistz256_neg .rva .LSEH_info_ecp_nistz256_neg .rva .LSEH_begin_ecp_nistz256_ord_mul_mont_nohw .rva .LSEH_end_ecp_nistz256_ord_mul_mont_nohw .rva .LSEH_info_ecp_nistz256_ord_mul_mont_nohw .rva .LSEH_begin_ecp_nistz256_ord_sqr_mont_nohw .rva .LSEH_end_ecp_nistz256_ord_sqr_mont_nohw .rva .LSEH_info_ecp_nistz256_ord_sqr_mont_nohw ___ $code.=<<___ if ($addx); .rva .LSEH_begin_ecp_nistz256_ord_mul_mont_adx .rva .LSEH_end_ecp_nistz256_ord_mul_mont_adx .rva .LSEH_info_ecp_nistz256_ord_mul_mont_adx .rva .LSEH_begin_ecp_nistz256_ord_sqr_mont_adx .rva .LSEH_end_ecp_nistz256_ord_sqr_mont_adx .rva .LSEH_info_ecp_nistz256_ord_sqr_mont_adx ___ $code.=<<___; .rva .LSEH_begin_ecp_nistz256_mul_mont_nohw .rva .LSEH_end_ecp_nistz256_mul_mont_nohw .rva .LSEH_info_ecp_nistz256_mul_mont_nohw .rva .LSEH_begin_ecp_nistz256_sqr_mont_nohw .rva .LSEH_end_ecp_nistz256_sqr_mont_nohw .rva .LSEH_info_ecp_nistz256_sqr_mont_nohw ___ $code.=<<___ if ($addx); .rva .LSEH_begin_ecp_nistz256_mul_mont_adx .rva .LSEH_end_ecp_nistz256_mul_mont_adx .rva .LSEH_info_ecp_nistz256_mul_mont_adx .rva .LSEH_begin_ecp_nistz256_sqr_mont_adx .rva .LSEH_end_ecp_nistz256_sqr_mont_adx .rva .LSEH_info_ecp_nistz256_sqr_mont_adx ___ $code.=<<___; .rva .LSEH_begin_ecp_nistz256_select_w5_nohw .rva .LSEH_end_ecp_nistz256_select_w5_nohw .rva .LSEH_info_ecp_nistz256_select_wX_nohw .rva .LSEH_begin_ecp_nistz256_select_w7_nohw .rva .LSEH_end_ecp_nistz256_select_w7_nohw .rva .LSEH_info_ecp_nistz256_select_wX_nohw ___ $code.=<<___ if ($avx>1); .rva .LSEH_begin_ecp_nistz256_select_w5_avx2 .rva .LSEH_end_ecp_nistz256_select_w5_avx2 .rva .LSEH_info_ecp_nistz256_select_wX_avx2 .rva .LSEH_begin_ecp_nistz256_select_w7_avx2 .rva .LSEH_end_ecp_nistz256_select_w7_avx2 .rva .LSEH_info_ecp_nistz256_select_wX_avx2 ___ $code.=<<___; .rva .LSEH_begin_ecp_nistz256_point_double_nohw .rva .LSEH_end_ecp_nistz256_point_double_nohw .rva .LSEH_info_ecp_nistz256_point_double_nohw .rva .LSEH_begin_ecp_nistz256_point_add_nohw .rva .LSEH_end_ecp_nistz256_point_add_nohw .rva .LSEH_info_ecp_nistz256_point_add_nohw .rva .LSEH_begin_ecp_nistz256_point_add_affine_nohw .rva .LSEH_end_ecp_nistz256_point_add_affine_nohw .rva .LSEH_info_ecp_nistz256_point_add_affine_nohw ___ $code.=<<___ if ($addx); .rva .LSEH_begin_ecp_nistz256_point_double_adx .rva .LSEH_end_ecp_nistz256_point_double_adx .rva .LSEH_info_ecp_nistz256_point_double_adx .rva .LSEH_begin_ecp_nistz256_point_add_adx .rva .LSEH_end_ecp_nistz256_point_add_adx .rva .LSEH_info_ecp_nistz256_point_add_adx .rva .LSEH_begin_ecp_nistz256_point_add_affine_adx .rva .LSEH_end_ecp_nistz256_point_add_affine_adx .rva .LSEH_info_ecp_nistz256_point_add_affine_adx ___ $code.=<<___; .section .xdata .align 8 .LSEH_info_ecp_nistz256_neg: .byte 9,0,0,0 .rva short_handler .rva .Lneg_body,.Lneg_epilogue # HandlerData[] .LSEH_info_ecp_nistz256_ord_mul_mont_nohw: .byte 9,0,0,0 .rva full_handler .rva 
.Lord_mul_body,.Lord_mul_epilogue # HandlerData[] .long 48,0 .LSEH_info_ecp_nistz256_ord_sqr_mont_nohw: .byte 9,0,0,0 .rva full_handler .rva .Lord_sqr_body,.Lord_sqr_epilogue # HandlerData[] .long 48,0 ___ $code.=<<___ if ($addx); .LSEH_info_ecp_nistz256_ord_mul_mont_adx: .byte 9,0,0,0 .rva full_handler .rva .Lord_mulx_body,.Lord_mulx_epilogue # HandlerData[] .long 48,0 .LSEH_info_ecp_nistz256_ord_sqr_mont_adx: .byte 9,0,0,0 .rva full_handler .rva .Lord_sqrx_body,.Lord_sqrx_epilogue # HandlerData[] .long 48,0 ___ $code.=<<___; .LSEH_info_ecp_nistz256_mul_mont_nohw: .byte 9,0,0,0 .rva full_handler .rva .Lmul_body,.Lmul_epilogue # HandlerData[] .long 48,0 .LSEH_info_ecp_nistz256_sqr_mont_nohw: .byte 9,0,0,0 .rva full_handler .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[] .long 48,0 ___ $code.=<<___ if ($addx); .LSEH_info_ecp_nistz256_mul_mont_adx: .byte 9,0,0,0 .rva full_handler .rva .Lmulx_body,.Lmulx_epilogue # HandlerData[] .long 48,0 .LSEH_info_ecp_nistz256_sqr_mont_adx: .byte 9,0,0,0 .rva full_handler .rva .Lsqrx_body,.Lsqrx_epilogue # HandlerData[] .long 48,0 ___ $code.=<<___; .LSEH_info_ecp_nistz256_select_wX_nohw: .byte 0x01,0x33,0x16,0x00 .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15 .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14 .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13 .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12 .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11 .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10 .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9 .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8 .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6 .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8 .align 8 ___ $code.=<<___ if ($avx>1); .LSEH_info_ecp_nistz256_select_wX_avx2: .byte 0x01,0x36,0x17,0x0b .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15 .byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14 .byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13 .byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12 .byte 0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11 .byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10 .byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9 .byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8 .byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7 .byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6 .byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8 .byte 0x00,0xb3,0x00,0x00 # set_frame r11 .align 8 ___ $code.=<<___; .LSEH_info_ecp_nistz256_point_double_nohw: .byte 9,0,0,0 .rva full_handler .rva .Lpoint_doubleq_body,.Lpoint_doubleq_epilogue # HandlerData[] .long 32*5+56,0 .LSEH_info_ecp_nistz256_point_add_nohw: .byte 9,0,0,0 .rva full_handler .rva .Lpoint_addq_body,.Lpoint_addq_epilogue # HandlerData[] .long 32*18+56,0 .LSEH_info_ecp_nistz256_point_add_affine_nohw: .byte 9,0,0,0 .rva full_handler .rva .Ladd_affineq_body,.Ladd_affineq_epilogue # HandlerData[] .long 32*15+56,0 ___ $code.=<<___ if ($addx); .align 8 .LSEH_info_ecp_nistz256_point_double_adx: .byte 9,0,0,0 .rva full_handler .rva .Lpoint_doublex_body,.Lpoint_doublex_epilogue # HandlerData[] .long 32*5+56,0 .LSEH_info_ecp_nistz256_point_add_adx: .byte 9,0,0,0 .rva full_handler .rva .Lpoint_addx_body,.Lpoint_addx_epilogue # HandlerData[] .long 32*18+56,0 .LSEH_info_ecp_nistz256_point_add_affine_adx: .byte 9,0,0,0 .rva full_handler .rva .Ladd_affinex_body,.Ladd_affinex_epilogue # HandlerData[] .long 32*15+56,0 ___ } $code =~ s/\`([^\`]*)\`/eval $1/gem; print $code; close STDOUT or die "error closing STDOUT: $!"; 
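The pand/pandn/por blocks in the point-addition routines above are the SSE2 form of a constant-time conditional copy, and the portable C sources that follow express the same idea with constant_time_select_w. A minimal standalone sketch of the pattern, for illustration only (the function name and the fixed 4-limb width are not part of the crate):

#include <stdint.h>
#include <stddef.h>

/* Overwrite r with a when mask is all-ones; keep r unchanged when mask is
 * zero. The same instructions execute in both cases, so the selection is
 * not observable through the memory-access pattern or timing. */
static void copy_conditional_sketch(uint64_t r[4], const uint64_t a[4],
                                    uint64_t mask) {
  for (size_t i = 0; i < 4; ++i) {
    r[i] = (a[i] & mask) | (r[i] & ~mask);
  }
}

The assembly achieves the same effect with pand (a & mask), pandn (r & ~mask) and por, operating on two 128-bit halves per coordinate.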
ring-0.17.14/crypto/fipsmodule/ec/ecp_nistz.c000064400000000000000000000041651046102023000171630ustar 00000000000000/* Copyright (c) 2014, Intel Corporation. * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include "ecp_nistz.h" #if defined(__GNUC__) #pragma GCC diagnostic ignored "-Wconversion" #endif /* Fills |str| with the bytewise little-endian encoding of |scalar|, where * |scalar| has |num_limbs| limbs. |str| is padded with zeros at the end up * to |str_len| bytes. Actually, |str_len| must be exactly one byte more than * needed to encode |num_limbs| losslessly, so that there is an extra byte at * the end. The extra byte is useful because the caller will be breaking |str| * up into windows of a number of bits (5 or 7) that isn't divisible by 8, and * so it is useful for it to be able to read an extra zero byte. */ void little_endian_bytes_from_scalar(uint8_t str[], size_t str_len, const Limb scalar[], size_t num_limbs) { debug_assert_nonsecret(str_len == (num_limbs * sizeof(Limb)) + 1); size_t i; for (i = 0; i < num_limbs * sizeof(Limb); i += sizeof(Limb)) { Limb d = scalar[i / sizeof(Limb)]; str[i + 0] = d & 0xff; str[i + 1] = (d >> 8) & 0xff; str[i + 2] = (d >> 16) & 0xff; str[i + 3] = (d >>= 24) & 0xff; if (sizeof(Limb) == 8) { d >>= 8; str[i + 4] = d & 0xff; str[i + 5] = (d >> 8) & 0xff; str[i + 6] = (d >> 16) & 0xff; str[i + 7] = (d >> 24) & 0xff; } } for (; i < str_len; i++) { str[i] = 0; } } ring-0.17.14/crypto/fipsmodule/ec/ecp_nistz.h000064400000000000000000000276041046102023000171730ustar 00000000000000/* Copyright (c) 2015, Google Inc. * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ #ifndef OPENSSL_HEADER_EC_ECP_NISTZ_H #define OPENSSL_HEADER_EC_ECP_NISTZ_H #include #include "../../limbs/limbs.h" #if defined(__GNUC__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wconversion" #pragma GCC diagnostic ignored "-Wsign-conversion" #endif // This function looks at `w + 1` scalar bits (`w` current, 1 adjacent less // significant bit), and recodes them into a signed digit for use in fast point // multiplication: the use of signed rather than unsigned digits means that // fewer points need to be precomputed, given that point inversion is easy (a // precomputed point dP makes -dP available as well). // // BACKGROUND: // // Signed digits for multiplication were introduced by Booth ("A signed binary // multiplication technique", Quart. Journ. Mech. and Applied Math., vol. IV, // pt. 2 (1951), pp. 236-240), in that case for multiplication of integers. // Booth's original encoding did not generally improve the density of nonzero // digits over the binary representation, and was merely meant to simplify the // handling of signed factors given in two's complement; but it has since been // shown to be the basis of various signed-digit representations that do have // further advantages, including the wNAF, using the following general // approach: // // (1) Given a binary representation // // b_k ... b_2 b_1 b_0, // // of a nonnegative integer (b_k in {0, 1}), rewrite it in digits 0, 1, -1 // by using bit-wise subtraction as follows: // // b_k b_(k-1) ... b_2 b_1 b_0 // - b_k ... b_3 b_2 b_1 b_0 // ----------------------------------------- // s_(k+1) s_k ... s_3 s_2 s_1 s_0 // // A left-shift followed by subtraction of the original value yields a new // representation of the same value, using signed bits s_i = b_(i-1) - b_i. // This representation from Booth's paper has since appeared in the // literature under a variety of different names including "reversed binary // form", "alternating greedy expansion", "mutual opposite form", and // "sign-alternating {+-1}-representation". // // An interesting property is that among the nonzero bits, values 1 and -1 // strictly alternate. // // (2) Various window schemes can be applied to the Booth representation of // integers: for example, right-to-left sliding windows yield the wNAF // (a signed-digit encoding independently discovered by various researchers // in the 1990s), and left-to-right sliding windows yield a left-to-right // equivalent of the wNAF (independently discovered by various researchers // around 2004). // // To prevent leaking information through side channels in point multiplication, // we need to recode the given integer into a regular pattern: sliding windows // as in wNAFs won't do, we need their fixed-window equivalent -- which is a few // decades older: we'll be using the so-called "modified Booth encoding" due to // MacSorley ("High-speed arithmetic in binary computers", Proc. IRE, vol. 49 // (1961), pp. 67-91), in a radix-2**w setting. That is, we always combine `w` // signed bits into a signed digit, e.g. (for `w == 5`): // // s_(5j + 4) s_(5j + 3) s_(5j + 2) s_(5j + 1) s_(5j) // // The sign-alternating property implies that the resulting digit values are // integers from `-2**(w-1)` to `2**(w-1)`, e.g. -16 to 16 for `w == 5`. // // Of course, we don't actually need to compute the signed digits s_i as an // intermediate step (that's just a nice way to see how this scheme relates // to the wNAF): a direct computation obtains the recoded digit from the // six bits b_(5j + 4) ... b_(5j - 1). 
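//
// For example, with `w == 5`, the six input bits b_(5j+4) ... b_(5j-1) =
// 1 0 0 1 1 0 recode to the signed digit -16 + 4 - 1 = -13, i.e. sign 1 and
// absolute value 13, which is what booth_recode() below computes for the
// input 0b100110.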
// // This function takes those `w` bits as an integer (e.g. 0 .. 63), writing the // recoded digit to *sign (0 for positive, 1 for negative) and *digit (absolute // value, in the range 0 .. 2**(w-1). Note that this integer essentially provides // the input bits "shifted to the left" by one position: for example, the input // to compute the least significant recoded digit, given that there's no bit // b_-1, has to be b_4 b_3 b_2 b_1 b_0 0. // // DOUBLING CASE: // // Point addition formulas for short Weierstrass curves are often incomplete. // Edge cases such as P + P or P + ∞ must be handled separately. This // complicates constant-time requirements. P + ∞ cannot be avoided (any window // may be zero) and is handled with constant-time selects. P + P (where P is not // ∞) usually is not. Instead, windowing strategies are chosen to avoid this // case. Whether this happens depends on the group order. // // Let w be the window width (in this function, w = 5). The non-trivial doubling // case in single-point scalar multiplication may occur if and only if the // 2^(w-1) bit of the group order is zero. // // Note the above only holds if the scalar is fully reduced and the group order // is a prime that is much larger than 2^w. It also only holds when windows // are applied from most significant to least significant, doubling between each // window. It does not apply to more complex table strategies such as // |EC_nistz256_method|. // // PROOF: // // Let n be the group order. Let l be the number of bits needed to represent n. // Assume there exists some 0 <= k < n such that signed w-bit windowed // multiplication hits the doubling case. // // Windowed multiplication consists of iterating over groups of s_i (defined // above based on k's binary representation) from most to least significant. At // iteration i (for i = ..., 3w, 2w, w, 0, starting from the most significant // window), we: // // 1. Double the accumulator A, w times. Let A_i be the value of A at this // point. // // 2. Set A to T_i + A_i, where T_i is a precomputed multiple of P // corresponding to the window s_(i+w-1) ... s_i. // // Let j be the index such that A_j = T_j ≠ ∞. Looking at A_i and T_i as // multiples of P, define a_i and t_i to be scalar coefficients of A_i and T_i. // Thus a_j = t_j ≠ 0 (mod n). Note a_i and t_i may not be reduced mod n. t_i is // the value of the w signed bits s_(i+w-1) ... s_i. a_i is computed as a_i = // 2^w * (a_(i+w) + t_(i+w)). // // t_i is bounded by -2^(w-1) <= t_i <= 2^(w-1). Additionally, we may write it // in terms of unsigned bits b_i. t_i consists of signed bits s_(i+w-1) ... s_i. // This is computed as: // // b_(i+w-2) b_(i+w-3) ... b_i b_(i-1) // - b_(i+w-1) b_(i+w-2) ... b_(i+1) b_i // -------------------------------------------- // t_i = s_(i+w-1) s_(i+w-2) ... s_(i+1) s_i // // Observe that b_(i+w-2) through b_i occur in both terms. Let x be the integer // represented by that bit string, i.e. 2^(w-2)*b_(i+w-2) + ... + b_i. // // t_i = (2*x + b_(i-1)) - (2^(w-1)*b_(i+w-1) + x) // = x - 2^(w-1)*b_(i+w-1) + b_(i-1) // // Or, using C notation for bit operations: // // t_i = (k>>i) & ((1<<(w-1)) - 1) - (k>>i) & (1<<(w-1)) + (k>>(i-1)) & 1 // // Note b_(i-1) is added in left-shifted by one (or doubled) from its place. // This is compensated by t_(i-w)'s subtraction term. Thus, a_i may be computed // by adding b_l b_(l-1) ... b_(i+1) b_i and an extra copy of b_(i-1). 
In C // notation, this is: // // a_i = (k>>(i+w)) << w + ((k>>(i+w-1)) & 1) << w // // Observe that, while t_i may be positive or negative, a_i is bounded by // 0 <= a_i < n + 2^w. Additionally, a_i can only be zero if b_(i+w-1) and up // are all zero. (Note this implies a non-trivial P + (-P) is unreachable for // all groups. That would imply the subsequent a_i is zero, which means all // terms thus far were zero.) // // Returning to our doubling position, we have a_j = t_j (mod n). We now // determine the value of a_j - t_j, which must be divisible by n. Our bounds on // a_j and t_j imply a_j - t_j is 0 or n. If it is 0, a_j = t_j. However, 2^w // divides a_j and -2^(w-1) <= t_j <= 2^(w-1), so this can only happen if // a_j = t_j = 0, which is a trivial doubling. Therefore, a_j - t_j = n. // // Now we determine j. Suppose j > 0. w divides j, so j >= w. Then, // // n = a_j - t_j = (k>>(j+w)) << w + ((k>>(j+w-1)) & 1) << w - t_j // <= k/2^j + 2^w - t_j // < n/2^w + 2^w + 2^(w-1) // // n is much larger than 2^w, so this is impossible. Thus, j = 0: only the final // addition may hit the doubling case. // // Finally, we consider bit patterns for n and k. Divide k into k_H + k_M + k_L // such that k_H is the contribution from b_(l-1) .. b_w, k_M is the // contribution from b_(w-1), and k_L is the contribution from b_(w-2) ... b_0. // That is: // // - 2^w divides k_H // - k_M is 0 or 2^(w-1) // - 0 <= k_L < 2^(w-1) // // Divide n into n_H + n_M + n_L similarly. We thus have: // // t_0 = (k>>0) & ((1<<(w-1)) - 1) - (k>>0) & (1<<(w-1)) + (k>>(0-1)) & 1 // = k & ((1<<(w-1)) - 1) - k & (1<<(w-1)) // = k_L - k_M // // a_0 = (k>>(0+w)) << w + ((k>>(0+w-1)) & 1) << w // = (k>>w) << w + ((k>>(w-1)) & 1) << w // = k_H + 2*k_M // // n = a_0 - t_0 // n_H + n_M + n_L = (k_H + 2*k_M) - (k_L - k_M) // = k_H + 3*k_M - k_L // // k_H - k_L < k and k < n, so k_H - k_L ≠ n. Therefore k_M is not 0 and must be // 2^(w-1). Now we consider k_H and n_H. We know k_H <= n_H. Suppose k_H = n_H. // Then, // // n_M + n_L = 3*(2^(w-1)) - k_L // > 3*(2^(w-1)) - 2^(w-1) // = 2^w // // Contradiction (n_M + n_L is the bottom w bits of n). Thus k_H < n_H. Suppose // k_H < n_H - 2*2^w. Then, // // n_H + n_M + n_L = k_H + 3*(2^(w-1)) - k_L // < n_H - 2*2^w + 3*(2^(w-1)) - k_L // n_M + n_L < -2^(w-1) - k_L // // Contradiction. Thus, k_H = n_H - 2^w. (Note 2^w divides n_H and k_H.) Thus, // // n_H + n_M + n_L = k_H + 3*(2^(w-1)) - k_L // = n_H - 2^w + 3*(2^(w-1)) - k_L // n_M + n_L = 2^(w-1) - k_L // <= 2^(w-1) // // Equality would mean 2^(w-1) divides n, which is impossible if n is prime. // Thus n_M + n_L < 2^(w-1), so n_M is zero, proving our condition. // // This proof constructs k, so, to show the converse, let k_H = n_H - 2^w, // k_M = 2^(w-1), k_L = 2^(w-1) - n_L. This will result in a non-trivial point // doubling in the final addition and is the only such scalar. // // COMMON CURVES: // // The group orders for common curves end in the following bit patterns: // // P-521: ...00001001; w = 4 is okay // P-384: ...01110011; w = 2, 5, 6, 7 are okay // P-256: ...01010001; w = 5, 7 are okay // P-224: ...00111101; w = 3, 4, 5, 6 are okay static inline void booth_recode(crypto_word_t *is_negative, crypto_word_t *digit, crypto_word_t in, crypto_word_t w) { debug_assert_nonsecret(w >= 2); debug_assert_nonsecret(w <= 7); // Set all bits of `s` to MSB(in), similar to |constant_time_msb_s|, // but 'in' seen as (`w+1`)-bit value. 
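  // The remaining steps, in outline: `d` becomes the (w+1)-bit ones'
  // complement of `in` when the top bit is set (negative digit) and `in`
  // itself otherwise; `(d >> 1) + (d & 1)` then halves `d` rounding up,
  // which accounts for the adjacent lower bit b_(i-1) and yields the
  // absolute value of the signed digit. E.g. for `w == 5`, in == 0b100110
  // gives d == 0b011001 == 25 and digit == 13 (negative), while
  // in == 0b001101 gives digit == 7 (positive).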
crypto_word_t s = ~((in >> w) - 1); crypto_word_t d; d = ((crypto_word_t)1u << (w + 1)) - in - 1; d = (d & s) | (in & ~s); d = (d >> 1) + (d & 1); *is_negative = constant_time_is_nonzero_w(s & 1); *digit = d; } #if defined(__GNUC__) #pragma GCC diagnostic pop #endif void little_endian_bytes_from_scalar(uint8_t str[], size_t str_len, const Limb scalar[], size_t num_limbs); #endif // OPENSSL_HEADER_EC_ECP_NISTZ_H ring-0.17.14/crypto/fipsmodule/ec/ecp_nistz384.h000064400000000000000000000022051046102023000174200ustar 00000000000000/* Copyright (c) 2014, Intel Corporation. * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #ifndef OPENSSL_HEADER_EC_ECP_NISTZ384_H #define OPENSSL_HEADER_EC_ECP_NISTZ384_H #include "../../limbs/limbs.h" #define P384_LIMBS (384u / LIMB_BITS) typedef struct { Limb X[P384_LIMBS]; Limb Y[P384_LIMBS]; Limb Z[P384_LIMBS]; } P384_POINT; typedef struct { Limb X[P384_LIMBS]; Limb Y[P384_LIMBS]; } P384_POINT_AFFINE; #endif // OPENSSL_HEADER_EC_ECP_NISTZ384_H ring-0.17.14/crypto/fipsmodule/ec/ecp_nistz384.inl000064400000000000000000000223071046102023000177600ustar 00000000000000/* Copyright (c) 2014, Intel Corporation. * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ /* Developers and authors: * Shay Gueron (1, 2), and Vlad Krasnov (1) * (1) Intel Corporation, Israel Development Center * (2) University of Haifa * Reference: * Shay Gueron and Vlad Krasnov * "Fast Prime Field Elliptic Curve Cryptography with 256 Bit Primes" * http://eprint.iacr.org/2013/816 */ #include "ecp_nistz.h" #if defined(__GNUC__) || defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wsign-conversion" #endif /* Point double: r = 2*a */ static void nistz384_point_double(P384_POINT *r, const P384_POINT *a) { BN_ULONG S[P384_LIMBS]; BN_ULONG M[P384_LIMBS]; BN_ULONG Zsqr[P384_LIMBS]; BN_ULONG tmp0[P384_LIMBS]; const BN_ULONG *in_x = a->X; const BN_ULONG *in_y = a->Y; const BN_ULONG *in_z = a->Z; BN_ULONG *res_x = r->X; BN_ULONG *res_y = r->Y; BN_ULONG *res_z = r->Z; elem_mul_by_2(S, in_y); elem_sqr_mont(Zsqr, in_z); elem_sqr_mont(S, S); elem_mul_mont(res_z, in_z, in_y); elem_mul_by_2(res_z, res_z); elem_add(M, in_x, Zsqr); elem_sub(Zsqr, in_x, Zsqr); elem_sqr_mont(res_y, S); elem_div_by_2(res_y, res_y); elem_mul_mont(M, M, Zsqr); elem_mul_by_3(M, M); elem_mul_mont(S, S, in_x); elem_mul_by_2(tmp0, S); elem_sqr_mont(res_x, M); elem_sub(res_x, res_x, tmp0); elem_sub(S, S, res_x); elem_mul_mont(S, S, M); elem_sub(res_y, S, res_y); } /* Point addition: r = a+b */ static void nistz384_point_add(P384_POINT *r, const P384_POINT *a, const P384_POINT *b) { BN_ULONG U2[P384_LIMBS], S2[P384_LIMBS]; BN_ULONG U1[P384_LIMBS], S1[P384_LIMBS]; BN_ULONG Z1sqr[P384_LIMBS]; BN_ULONG Z2sqr[P384_LIMBS]; BN_ULONG H[P384_LIMBS], R[P384_LIMBS]; BN_ULONG Hsqr[P384_LIMBS]; BN_ULONG Rsqr[P384_LIMBS]; BN_ULONG Hcub[P384_LIMBS]; BN_ULONG res_x[P384_LIMBS]; BN_ULONG res_y[P384_LIMBS]; BN_ULONG res_z[P384_LIMBS]; const BN_ULONG *in1_x = a->X; const BN_ULONG *in1_y = a->Y; const BN_ULONG *in1_z = a->Z; const BN_ULONG *in2_x = b->X; const BN_ULONG *in2_y = b->Y; const BN_ULONG *in2_z = b->Z; BN_ULONG in1infty = is_zero(a->Z); BN_ULONG in2infty = is_zero(b->Z); elem_sqr_mont(Z2sqr, in2_z); /* Z2^2 */ elem_sqr_mont(Z1sqr, in1_z); /* Z1^2 */ elem_mul_mont(S1, Z2sqr, in2_z); /* S1 = Z2^3 */ elem_mul_mont(S2, Z1sqr, in1_z); /* S2 = Z1^3 */ elem_mul_mont(S1, S1, in1_y); /* S1 = Y1*Z2^3 */ elem_mul_mont(S2, S2, in2_y); /* S2 = Y2*Z1^3 */ elem_sub(R, S2, S1); /* R = S2 - S1 */ elem_mul_mont(U1, in1_x, Z2sqr); /* U1 = X1*Z2^2 */ elem_mul_mont(U2, in2_x, Z1sqr); /* U2 = X2*Z1^2 */ elem_sub(H, U2, U1); /* H = U2 - U1 */ BN_ULONG is_exceptional = is_equal(U1, U2) & ~in1infty & ~in2infty; if (is_exceptional) { if (is_equal(S1, S2)) { nistz384_point_double(r, a); } else { limbs_zero(r->X, P384_LIMBS); limbs_zero(r->Y, P384_LIMBS); limbs_zero(r->Z, P384_LIMBS); } return; } elem_sqr_mont(Rsqr, R); /* R^2 */ elem_mul_mont(res_z, H, in1_z); /* Z3 = H*Z1*Z2 */ elem_sqr_mont(Hsqr, H); /* H^2 */ elem_mul_mont(res_z, res_z, in2_z); /* Z3 = H*Z1*Z2 */ elem_mul_mont(Hcub, Hsqr, H); /* H^3 */ elem_mul_mont(U2, U1, Hsqr); /* U1*H^2 */ elem_mul_by_2(Hsqr, U2); /* 2*U1*H^2 */ elem_sub(res_x, Rsqr, Hsqr); elem_sub(res_x, res_x, Hcub); elem_sub(res_y, U2, res_x); elem_mul_mont(S2, S1, Hcub); elem_mul_mont(res_y, R, res_y); elem_sub(res_y, res_y, S2); copy_conditional(res_x, in2_x, in1infty); copy_conditional(res_y, in2_y, in1infty); copy_conditional(res_z, in2_z, in1infty); copy_conditional(res_x, in1_x, in2infty); copy_conditional(res_y, in1_y, in2infty); copy_conditional(res_z, in1_z, in2infty); limbs_copy(r->X, res_x, P384_LIMBS); limbs_copy(r->Y, res_y, P384_LIMBS); limbs_copy(r->Z, res_z, P384_LIMBS); } static 
void add_precomputed_w5(P384_POINT *r, crypto_word_t wvalue, const P384_POINT table[16]) { crypto_word_t recoded_is_negative; crypto_word_t recoded; booth_recode(&recoded_is_negative, &recoded, wvalue, 5); alignas(64) P384_POINT h; p384_point_select_w5(&h, table, recoded); alignas(64) BN_ULONG tmp[P384_LIMBS]; p384_elem_neg(tmp, h.Y); copy_conditional(h.Y, tmp, recoded_is_negative); nistz384_point_add(r, r, &h); } /* r = p * p_scalar */ static void nistz384_point_mul(P384_POINT *r, const BN_ULONG p_scalar[P384_LIMBS], const Limb p_x[P384_LIMBS], const Limb p_y[P384_LIMBS]) { static const size_t kWindowSize = 5; static const crypto_word_t kMask = (1 << (5 /* kWindowSize */ + 1)) - 1; uint8_t p_str[(P384_LIMBS * sizeof(Limb)) + 1]; little_endian_bytes_from_scalar(p_str, sizeof(p_str) / sizeof(p_str[0]), p_scalar, P384_LIMBS); /* A |P384_POINT| is (3 * 48) = 144 bytes, and the 64-byte alignment should * add no more than 63 bytes of overhead. Thus, |table| should require * ~2367 ((144 * 16) + 63) bytes of stack space. */ alignas(64) P384_POINT table[16]; /* table[0] is implicitly (0,0,0) (the point at infinity), therefore it is * not stored. All other values are actually stored with an offset of -1 in * table. */ P384_POINT *row = table; limbs_copy(row[1 - 1].X, p_x, P384_LIMBS); limbs_copy(row[1 - 1].Y, p_y, P384_LIMBS); limbs_copy(row[1 - 1].Z, ONE, P384_LIMBS); nistz384_point_double(&row[2 - 1], &row[1 - 1]); nistz384_point_add(&row[3 - 1], &row[2 - 1], &row[1 - 1]); nistz384_point_double(&row[4 - 1], &row[2 - 1]); nistz384_point_double(&row[6 - 1], &row[3 - 1]); nistz384_point_double(&row[8 - 1], &row[4 - 1]); nistz384_point_double(&row[12 - 1], &row[6 - 1]); nistz384_point_add(&row[5 - 1], &row[4 - 1], &row[1 - 1]); nistz384_point_add(&row[7 - 1], &row[6 - 1], &row[1 - 1]); nistz384_point_add(&row[9 - 1], &row[8 - 1], &row[1 - 1]); nistz384_point_add(&row[13 - 1], &row[12 - 1], &row[1 - 1]); nistz384_point_double(&row[14 - 1], &row[7 - 1]); nistz384_point_double(&row[10 - 1], &row[5 - 1]); nistz384_point_add(&row[15 - 1], &row[14 - 1], &row[1 - 1]); nistz384_point_add(&row[11 - 1], &row[10 - 1], &row[1 - 1]); nistz384_point_double(&row[16 - 1], &row[8 - 1]); static const size_t START_INDEX = 384 - 4; size_t index = START_INDEX; BN_ULONG recoded_is_negative; crypto_word_t recoded; crypto_word_t wvalue = p_str[(index - 1) / 8]; wvalue = (wvalue >> ((index - 1) % 8)) & kMask; booth_recode(&recoded_is_negative, &recoded, wvalue, 5); dev_assert_secret(!recoded_is_negative); p384_point_select_w5(r, table, recoded); while (index >= kWindowSize) { if (index != START_INDEX) { size_t off = (index - 1) / 8; wvalue = p_str[off] | p_str[off + 1] << 8; wvalue = (wvalue >> ((index - 1) % 8)) & kMask; add_precomputed_w5(r, wvalue, table); } index -= kWindowSize; nistz384_point_double(r, r); nistz384_point_double(r, r); nistz384_point_double(r, r); nistz384_point_double(r, r); nistz384_point_double(r, r); } /* Final window */ wvalue = p_str[0]; wvalue = (wvalue << 1) & kMask; add_precomputed_w5(r, wvalue, table); } void p384_point_double(Limb r[3][P384_LIMBS], const Limb a[3][P384_LIMBS]) { P384_POINT t; limbs_copy(t.X, a[0], P384_LIMBS); limbs_copy(t.Y, a[1], P384_LIMBS); limbs_copy(t.Z, a[2], P384_LIMBS); nistz384_point_double(&t, &t); limbs_copy(r[0], t.X, P384_LIMBS); limbs_copy(r[1], t.Y, P384_LIMBS); limbs_copy(r[2], t.Z, P384_LIMBS); } void p384_point_add(Limb r[3][P384_LIMBS], const Limb a[3][P384_LIMBS], const Limb b[3][P384_LIMBS]) { P384_POINT t1; limbs_copy(t1.X, a[0], P384_LIMBS); 
limbs_copy(t1.Y, a[1], P384_LIMBS); limbs_copy(t1.Z, a[2], P384_LIMBS); P384_POINT t2; limbs_copy(t2.X, b[0], P384_LIMBS); limbs_copy(t2.Y, b[1], P384_LIMBS); limbs_copy(t2.Z, b[2], P384_LIMBS); nistz384_point_add(&t1, &t1, &t2); limbs_copy(r[0], t1.X, P384_LIMBS); limbs_copy(r[1], t1.Y, P384_LIMBS); limbs_copy(r[2], t1.Z, P384_LIMBS); } void p384_point_mul(Limb r[3][P384_LIMBS], const BN_ULONG p_scalar[P384_LIMBS], const Limb p_x[P384_LIMBS], const Limb p_y[P384_LIMBS]) { alignas(64) P384_POINT acc; nistz384_point_mul(&acc, p_scalar, p_x, p_y); limbs_copy(r[0], acc.X, P384_LIMBS); limbs_copy(r[1], acc.Y, P384_LIMBS); limbs_copy(r[2], acc.Z, P384_LIMBS); } #if defined(__GNUC__) || defined(__clang__) #pragma GCC diagnostic pop #endif ring-0.17.14/crypto/fipsmodule/ec/gfp_p256.c000064400000000000000000000034571046102023000165200ustar 00000000000000/* Copyright 2016 Brian Smith. * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include "./p256_shared.h" #include "../../limbs/limbs.h" #if !defined(OPENSSL_USE_NISTZ256) typedef Limb ScalarMont[P256_LIMBS]; typedef Limb Scalar[P256_LIMBS]; #include "../bn/internal.h" static const BN_ULONG N[P256_LIMBS] = { #if defined(OPENSSL_64_BIT) 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000 #else 0xfc632551, 0xf3b9cac2, 0xa7179e84, 0xbce6faad, 0xffffffff, 0xffffffff, 0, 0xffffffff #endif }; static const BN_ULONG N_N0[] = { BN_MONT_CTX_N0(0xccd1c8aa, 0xee00bc4f) }; void p256_scalar_mul_mont(ScalarMont r, const ScalarMont a, const ScalarMont b) { /* XXX: Inefficient. TODO: optimize with dedicated multiplication routine. */ bn_mul_mont_small(r, a, b, N, N_N0, P256_LIMBS); } /* XXX: Inefficient. TODO: optimize with dedicated squaring routine. */ void p256_scalar_sqr_rep_mont(ScalarMont r, const ScalarMont a, Limb rep) { dev_assert_secret(rep >= 1); p256_scalar_mul_mont(r, a, a); for (Limb i = 1; i < rep; ++i) { p256_scalar_mul_mont(r, r, r); } } #endif ring-0.17.14/crypto/fipsmodule/ec/gfp_p384.c000064400000000000000000000203041046102023000165100ustar 00000000000000/* Copyright 2016 Brian Smith. * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ #include "../../limbs/limbs.h" #include "ecp_nistz384.h" #include "../bn/internal.h" #include "../../internal.h" #include "../../limbs/limbs.inl" /* XXX: Here we assume that the conversion from |Carry| to |Limb| is * constant-time, but we haven't verified that assumption. TODO: Fix it so * we don't need to make that assumption. */ typedef Limb Elem[P384_LIMBS]; typedef Limb ScalarMont[P384_LIMBS]; typedef Limb Scalar[P384_LIMBS]; static const BN_ULONG Q[P384_LIMBS] = { #if defined(OPENSSL_64_BIT) 0xffffffff, 0xffffffff00000000, 0xfffffffffffffffe, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff #else 0xffffffff, 0, 0, 0xffffffff, 0xfffffffe, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff #endif }; static const BN_ULONG N[P384_LIMBS] = { #if defined(OPENSSL_64_BIT) 0xecec196accc52973, 0x581a0db248b0a77a, 0xc7634d81f4372ddf, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff #else 0xccc52973, 0xecec196a, 0x48b0a77a, 0x581a0db2, 0xf4372ddf, 0xc7634d81, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff #endif }; static const BN_ULONG ONE[P384_LIMBS] = { #if defined(OPENSSL_64_BIT) 0xffffffff00000001, 0xffffffff, 1, 0, 0 #else 1, 0xffffffff, 0xffffffff, 0, 1, 0, 0, 0, 0, 0 #endif }; static const Elem Q_PLUS_1_SHR_1 = { #if defined(OPENSSL_64_BIT) 0x80000000, 0x7fffffff80000000, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x7fffffffffffffff #else 0x80000000, 0, 0x80000000, 0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x7fffffff #endif }; static const BN_ULONG Q_N0[] = { BN_MONT_CTX_N0(1, 1) }; static const BN_ULONG N_N0[] = { BN_MONT_CTX_N0(0x6ed46089, 0xe88fdc45) }; /* XXX: MSVC for x86 warns when it fails to inline these functions it should * probably inline. */ #if defined(_MSC_VER) && !defined(__clang__) && defined(OPENSSL_X86) #define INLINE_IF_POSSIBLE __forceinline #else #define INLINE_IF_POSSIBLE inline #endif static inline Limb is_equal(const Elem a, const Elem b) { return LIMBS_equal(a, b, P384_LIMBS); } static inline Limb is_zero(const BN_ULONG a[P384_LIMBS]) { return LIMBS_are_zero(a, P384_LIMBS); } static inline void copy_conditional(Elem r, const Elem a, const Limb condition) { for (size_t i = 0; i < P384_LIMBS; ++i) { r[i] = constant_time_select_w(condition, a[i], r[i]); } } static inline void elem_add(Elem r, const Elem a, const Elem b) { LIMBS_add_mod(r, a, b, Q, P384_LIMBS); } static inline void elem_sub(Elem r, const Elem a, const Elem b) { LIMBS_sub_mod(r, a, b, Q, P384_LIMBS); } static void elem_div_by_2(Elem r, const Elem a) { /* Consider the case where `a` is even. Then we can shift `a` right one bit * and the result will still be valid because we didn't lose any bits and so * `(a >> 1) * 2 == a (mod q)`, which is the invariant we must satisfy. * * The remainder of this comment is considering the case where `a` is odd. * * Since `a` is odd, it isn't the case that `(a >> 1) * 2 == a (mod q)` * because the lowest bit is lost during the shift. For example, consider: * * ```python * q = 2**384 - 2**128 - 2**96 + 2**32 - 1 * a = 2**383 * two_a = a * 2 % q * assert two_a == 0x100000000ffffffffffffffff00000001 * ``` * * Notice there how `(2 * a) % q` wrapped around to a smaller odd value. When * we divide `two_a` by two (mod q), we need to get the value `2**383`, which * we obviously can't get with just a right shift. * * `q` is odd, and `a` is odd, so `a + q` is even. We could calculate * `(a + q) >> 1` and then reduce it mod `q`. 
However, then we would have to * keep track of an extra most significant bit. We can avoid that by instead * calculating `(a >> 1) + ((q + 1) >> 1)`. The `1` in `q + 1` is the least * significant bit of `a`. `q + 1` is even, which means it can be shifted * without losing any bits. Since `q` is odd, `q - 1` is even, so the largest * odd field element is `q - 2`. Thus we know that `a <= q - 2`. We know * `(q + 1) >> 1` is `(q + 1) / 2` since (`q + 1`) is even. The value of * `a >> 1` is `(a - 1)/2` since the shift will drop the least significant * bit of `a`, which is 1. Thus: * * sum = ((q + 1) >> 1) + (a >> 1) * sum = (q + 1)/2 + (a >> 1) (substituting (q + 1)/2) * <= (q + 1)/2 + (q - 2 - 1)/2 (substituting a <= q - 2) * <= (q + 1)/2 + (q - 3)/2 (simplifying) * <= (q + 1 + q - 3)/2 (factoring out the common divisor) * <= (2q - 2)/2 (simplifying) * <= q - 1 (simplifying) * * Thus, no reduction of the sum mod `q` is necessary. */ Limb is_odd = constant_time_is_nonzero_w(a[0] & 1); /* r = a >> 1. */ Limb carry = a[P384_LIMBS - 1] & 1; r[P384_LIMBS - 1] = a[P384_LIMBS - 1] >> 1; for (size_t i = 1; i < P384_LIMBS; ++i) { Limb new_carry = a[P384_LIMBS - i - 1]; r[P384_LIMBS - i - 1] = (a[P384_LIMBS - i - 1] >> 1) | (carry << (LIMB_BITS - 1)); carry = new_carry; } Elem adjusted; BN_ULONG carry2 = limbs_add(adjusted, r, Q_PLUS_1_SHR_1, P384_LIMBS); dev_assert_secret(carry2 == 0); (void)carry2; copy_conditional(r, adjusted, is_odd); } static inline void elem_mul_mont(Elem r, const Elem a, const Elem b) { /* XXX: Not (clearly) constant-time; inefficient.*/ bn_mul_mont_small(r, a, b, Q, Q_N0, P384_LIMBS); } static inline void elem_mul_by_2(Elem r, const Elem a) { LIMBS_shl_mod(r, a, Q, P384_LIMBS); } static INLINE_IF_POSSIBLE void elem_mul_by_3(Elem r, const Elem a) { /* XXX: inefficient. TODO: Replace with an integrated shift + add. */ Elem doubled; elem_add(doubled, a, a); elem_add(r, doubled, a); } static inline void elem_sqr_mont(Elem r, const Elem a) { /* XXX: Inefficient. TODO: Add a dedicated squaring routine. */ elem_mul_mont(r, a, a); } void p384_elem_sub(Elem r, const Elem a, const Elem b) { elem_sub(r, a, b); } void p384_elem_div_by_2(Elem r, const Elem a) { elem_div_by_2(r, a); } void p384_elem_mul_mont(Elem r, const Elem a, const Elem b) { elem_mul_mont(r, a, b); } void p384_elem_neg(Elem r, const Elem a) { Limb is_zero = LIMBS_are_zero(a, P384_LIMBS); Carry borrow = limbs_sub(r, Q, a, P384_LIMBS); dev_assert_secret(borrow == 0); (void)borrow; for (size_t i = 0; i < P384_LIMBS; ++i) { r[i] = constant_time_select_w(is_zero, 0, r[i]); } } void p384_scalar_mul_mont(ScalarMont r, const ScalarMont a, const ScalarMont b) { /* XXX: Inefficient. TODO: Add dedicated multiplication routine. */ bn_mul_mont_small(r, a, b, N, N_N0, P384_LIMBS); } /* TODO(perf): Optimize this. */ static void p384_point_select_w5(P384_POINT *out, const P384_POINT table[16], size_t index) { Elem x; limbs_zero(x, P384_LIMBS); Elem y; limbs_zero(y, P384_LIMBS); Elem z; limbs_zero(z, P384_LIMBS); // TODO: Rewrite in terms of |limbs_select|. 
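  // Scan every entry of |table| and accumulate the requested one with masked
  // selects so that the memory-access pattern and instruction sequence do not
  // depend on the secret |index|. |index| == 0 matches nothing and leaves the
  // all-zero point (treated as the point at infinity) from the zeroing above.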
for (size_t i = 0; i < 16; ++i) { crypto_word_t equal = constant_time_eq_w(index, (crypto_word_t)i + 1); for (size_t j = 0; j < P384_LIMBS; ++j) { x[j] = constant_time_select_w(equal, table[i].X[j], x[j]); y[j] = constant_time_select_w(equal, table[i].Y[j], y[j]); z[j] = constant_time_select_w(equal, table[i].Z[j], z[j]); } } limbs_copy(out->X, x, P384_LIMBS); limbs_copy(out->Y, y, P384_LIMBS); limbs_copy(out->Z, z, P384_LIMBS); } #include "ecp_nistz384.inl" ring-0.17.14/crypto/fipsmodule/ec/p256-nistz-table.h000064400000000000000000023504311046102023000201220ustar 00000000000000// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. // Copyright (c) 2015, Intel Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // This is the precomputed constant time access table for the code in // p256-nistz.c, for the default generator. The table consists of 37 // subtables, each subtable contains 64 affine points. The affine points are // encoded as eight uint64's, four for the x coordinate and four for the y. // Both values are in little-endian order. There are 37 tables because a // signed, 6-bit wNAF form of the scalar is used and ceil(256/(6 + 1)) = 37. // Within each table there are 64 values because the 6-bit wNAF value can take // 64 values, ignoring the sign bit, which is implemented by performing a // negation of the affine point when required. We would like to align it to 2MB // in order to increase the chances of using a large page but that appears to // lead to invalid ELF files being produced. // This file is generated by make_tables.go. 
static const alignas(4096) PRECOMP256_ROW ecp_nistz256_precomputed[37] = { {{{TOBN(0x79e730d4, 0x18a9143c), TOBN(0x75ba95fc, 0x5fedb601), TOBN(0x79fb732b, 0x77622510), TOBN(0x18905f76, 0xa53755c6)}, {TOBN(0xddf25357, 0xce95560a), TOBN(0x8b4ab8e4, 0xba19e45c), TOBN(0xd2e88688, 0xdd21f325), TOBN(0x8571ff18, 0x25885d85)}}, {{TOBN(0x850046d4, 0x10ddd64d), TOBN(0xaa6ae3c1, 0xa433827d), TOBN(0x73220503, 0x8d1490d9), TOBN(0xf6bb32e4, 0x3dcf3a3b)}, {TOBN(0x2f3648d3, 0x61bee1a5), TOBN(0x152cd7cb, 0xeb236ff8), TOBN(0x19a8fb0e, 0x92042dbe), TOBN(0x78c57751, 0x0a5b8a3b)}}, {{TOBN(0xffac3f90, 0x4eebc127), TOBN(0xb027f84a, 0x087d81fb), TOBN(0x66ad77dd, 0x87cbbc98), TOBN(0x26936a3f, 0xb6ff747e)}, {TOBN(0xb04c5c1f, 0xc983a7eb), TOBN(0x583e47ad, 0x0861fe1a), TOBN(0x78820831, 0x1a2ee98e), TOBN(0xd5f06a29, 0xe587cc07)}}, {{TOBN(0x74b0b50d, 0x46918dcc), TOBN(0x4650a6ed, 0xc623c173), TOBN(0x0cdaacac, 0xe8100af2), TOBN(0x577362f5, 0x41b0176b)}, {TOBN(0x2d96f24c, 0xe4cbaba6), TOBN(0x17628471, 0xfad6f447), TOBN(0x6b6c36de, 0xe5ddd22e), TOBN(0x84b14c39, 0x4c5ab863)}}, {{TOBN(0xbe1b8aae, 0xc45c61f5), TOBN(0x90ec649a, 0x94b9537d), TOBN(0x941cb5aa, 0xd076c20c), TOBN(0xc9079605, 0x890523c8)}, {TOBN(0xeb309b4a, 0xe7ba4f10), TOBN(0x73c568ef, 0xe5eb882b), TOBN(0x3540a987, 0x7e7a1f68), TOBN(0x73a076bb, 0x2dd1e916)}}, {{TOBN(0x40394737, 0x3e77664a), TOBN(0x55ae744f, 0x346cee3e), TOBN(0xd50a961a, 0x5b17a3ad), TOBN(0x13074b59, 0x54213673)}, {TOBN(0x93d36220, 0xd377e44b), TOBN(0x299c2b53, 0xadff14b5), TOBN(0xf424d44c, 0xef639f11), TOBN(0xa4c9916d, 0x4a07f75f)}}, {{TOBN(0x0746354e, 0xa0173b4f), TOBN(0x2bd20213, 0xd23c00f7), TOBN(0xf43eaab5, 0x0c23bb08), TOBN(0x13ba5119, 0xc3123e03)}, {TOBN(0x2847d030, 0x3f5b9d4d), TOBN(0x6742f2f2, 0x5da67bdd), TOBN(0xef933bdc, 0x77c94195), TOBN(0xeaedd915, 0x6e240867)}}, {{TOBN(0x27f14cd1, 0x9499a78f), TOBN(0x462ab5c5, 0x6f9b3455), TOBN(0x8f90f02a, 0xf02cfc6b), TOBN(0xb763891e, 0xb265230d)}, {TOBN(0xf59da3a9, 0x532d4977), TOBN(0x21e3327d, 0xcf9eba15), TOBN(0x123c7b84, 0xbe60bbf0), TOBN(0x56ec12f2, 0x7706df76)}}, {{TOBN(0x75c96e8f, 0x264e20e8), TOBN(0xabe6bfed, 0x59a7a841), TOBN(0x2cc09c04, 0x44c8eb00), TOBN(0xe05b3080, 0xf0c4e16b)}, {TOBN(0x1eb7777a, 0xa45f3314), TOBN(0x56af7bed, 0xce5d45e3), TOBN(0x2b6e019a, 0x88b12f1a), TOBN(0x086659cd, 0xfd835f9b)}}, {{TOBN(0x2c18dbd1, 0x9dc21ec8), TOBN(0x98f9868a, 0x0fcf8139), TOBN(0x737d2cd6, 0x48250b49), TOBN(0xcc61c947, 0x24b3428f)}, {TOBN(0x0c2b4078, 0x80dd9e76), TOBN(0xc43a8991, 0x383fbe08), TOBN(0x5f7d2d65, 0x779be5d2), TOBN(0x78719a54, 0xeb3b4ab5)}}, {{TOBN(0xea7d260a, 0x6245e404), TOBN(0x9de40795, 0x6e7fdfe0), TOBN(0x1ff3a415, 0x8dac1ab5), TOBN(0x3e7090f1, 0x649c9073)}, {TOBN(0x1a768561, 0x2b944e88), TOBN(0x250f939e, 0xe57f61c8), TOBN(0x0c0daa89, 0x1ead643d), TOBN(0x68930023, 0xe125b88e)}}, {{TOBN(0x04b71aa7, 0xd2697768), TOBN(0xabdedef5, 0xca345a33), TOBN(0x2409d29d, 0xee37385e), TOBN(0x4ee1df77, 0xcb83e156)}, {TOBN(0x0cac12d9, 0x1cbb5b43), TOBN(0x170ed2f6, 0xca895637), TOBN(0x28228cfa, 0x8ade6d66), TOBN(0x7ff57c95, 0x53238aca)}}, {{TOBN(0xccc42563, 0x4b2ed709), TOBN(0x0e356769, 0x856fd30d), TOBN(0xbcbcd43f, 0x559e9811), TOBN(0x738477ac, 0x5395b759)}, {TOBN(0x35752b90, 0xc00ee17f), TOBN(0x68748390, 0x742ed2e3), TOBN(0x7cd06422, 0xbd1f5bc1), TOBN(0xfbc08769, 0xc9e7b797)}}, {{TOBN(0xa242a35b, 0xb0cf664a), TOBN(0x126e48f7, 0x7f9707e3), TOBN(0x1717bf54, 0xc6832660), TOBN(0xfaae7332, 0xfd12c72e)}, {TOBN(0x27b52db7, 0x995d586b), TOBN(0xbe29569e, 0x832237c2), TOBN(0xe8e4193e, 0x2a65e7db), TOBN(0x152706dc, 0x2eaa1bbb)}}, {{TOBN(0x72bcd8b7, 0xbc60055b), 
TOBN(0x03cc23ee, 0x56e27e4b), TOBN(0xee337424, 0xe4819370), TOBN(0xe2aa0e43, 0x0ad3da09)}, {TOBN(0x40b8524f, 0x6383c45d), TOBN(0xd7663554, 0x42a41b25), TOBN(0x64efa6de, 0x778a4797), TOBN(0x2042170a, 0x7079adf4)}}, {{TOBN(0x808b0b65, 0x0bc6fb80), TOBN(0x5882e075, 0x3ffe2e6b), TOBN(0xd5ef2f7c, 0x2c83f549), TOBN(0x54d63c80, 0x9103b723)}, {TOBN(0xf2f11bd6, 0x52a23f9b), TOBN(0x3670c319, 0x4b0b6587), TOBN(0x55c4623b, 0xb1580e9e), TOBN(0x64edf7b2, 0x01efe220)}}, {{TOBN(0x97091dcb, 0xd53c5c9d), TOBN(0xf17624b6, 0xac0a177b), TOBN(0xb0f13975, 0x2cfe2dff), TOBN(0xc1a35c0a, 0x6c7a574e)}, {TOBN(0x227d3146, 0x93e79987), TOBN(0x0575bf30, 0xe89cb80e), TOBN(0x2f4e247f, 0x0d1883bb), TOBN(0xebd51226, 0x3274c3d0)}}, {{TOBN(0x5f3e51c8, 0x56ada97a), TOBN(0x4afc964d, 0x8f8b403e), TOBN(0xa6f247ab, 0x412e2979), TOBN(0x675abd1b, 0x6f80ebda)}, {TOBN(0x66a2bd72, 0x5e485a1d), TOBN(0x4b2a5caf, 0x8f4f0b3c), TOBN(0x2626927f, 0x1b847bba), TOBN(0x6c6fc7d9, 0x0502394d)}}, {{TOBN(0xfea912ba, 0xa5659ae8), TOBN(0x68363aba, 0x25e1a16e), TOBN(0xb8842277, 0x752c41ac), TOBN(0xfe545c28, 0x2897c3fc)}, {TOBN(0x2d36e9e7, 0xdc4c696b), TOBN(0x5806244a, 0xfba977c5), TOBN(0x85665e9b, 0xe39508c1), TOBN(0xf720ee25, 0x6d12597b)}}, {{TOBN(0x8a979129, 0xd2337a31), TOBN(0x5916868f, 0x0f862bdc), TOBN(0x048099d9, 0x5dd283ba), TOBN(0xe2d1eeb6, 0xfe5bfb4e)}, {TOBN(0x82ef1c41, 0x7884005d), TOBN(0xa2d4ec17, 0xffffcbae), TOBN(0x9161c53f, 0x8aa95e66), TOBN(0x5ee104e1, 0xc5fee0d0)}}, {{TOBN(0x562e4cec, 0xc135b208), TOBN(0x74e1b265, 0x4783f47d), TOBN(0x6d2a506c, 0x5a3f3b30), TOBN(0xecead9f4, 0xc16762fc)}, {TOBN(0xf29dd4b2, 0xe286e5b9), TOBN(0x1b0fadc0, 0x83bb3c61), TOBN(0x7a75023e, 0x7fac29a4), TOBN(0xc086d5f1, 0xc9477fa3)}}, {{TOBN(0x0fc61135, 0x2f6f3076), TOBN(0xc99ffa23, 0xe3912a9a), TOBN(0x6a0b0685, 0xd2f8ba3d), TOBN(0xfdc777e8, 0xe93358a4)}, {TOBN(0x94a787bb, 0x35415f04), TOBN(0x640c2d6a, 0x4d23fea4), TOBN(0x9de917da, 0x153a35b5), TOBN(0x793e8d07, 0x5d5cd074)}}, {{TOBN(0xf4f87653, 0x2de45068), TOBN(0x37c7a7e8, 0x9e2e1f6e), TOBN(0xd0825fa2, 0xa3584069), TOBN(0xaf2cea7c, 0x1727bf42)}, {TOBN(0x0360a4fb, 0x9e4785a9), TOBN(0xe5fda49c, 0x27299f4a), TOBN(0x48068e13, 0x71ac2f71), TOBN(0x83d0687b, 0x9077666f)}}, {{TOBN(0x6d3883b2, 0x15d02819), TOBN(0x6d0d7550, 0x40dd9a35), TOBN(0x61d7cbf9, 0x1d2b469f), TOBN(0xf97b232f, 0x2efc3115)}, {TOBN(0xa551d750, 0xb24bcbc7), TOBN(0x11ea4949, 0x88a1e356), TOBN(0x7669f031, 0x93cb7501), TOBN(0x595dc55e, 0xca737b8a)}}, {{TOBN(0xa4a319ac, 0xd837879f), TOBN(0x6fc1b49e, 0xed6b67b0), TOBN(0xe3959933, 0x32f1f3af), TOBN(0x966742eb, 0x65432a2e)}, {TOBN(0x4b8dc9fe, 0xb4966228), TOBN(0x96cc6312, 0x43f43950), TOBN(0x12068859, 0xc9b731ee), TOBN(0x7b948dc3, 0x56f79968)}}, {{TOBN(0x61e4ad32, 0xed1f8008), TOBN(0xe6c9267a, 0xd8b17538), TOBN(0x1ac7c5eb, 0x857ff6fb), TOBN(0x994baaa8, 0x55f2fb10)}, {TOBN(0x84cf14e1, 0x1d248018), TOBN(0x5a39898b, 0x628ac508), TOBN(0x14fde97b, 0x5fa944f5), TOBN(0xed178030, 0xd12e5ac7)}}, {{TOBN(0x042c2af4, 0x97e2feb4), TOBN(0xd36a42d7, 0xaebf7313), TOBN(0x49d2c9eb, 0x084ffdd7), TOBN(0x9f8aa54b, 0x2ef7c76a)}, {TOBN(0x9200b7ba, 0x09895e70), TOBN(0x3bd0c66f, 0xddb7fb58), TOBN(0x2d97d108, 0x78eb4cbb), TOBN(0x2d431068, 0xd84bde31)}}, {{TOBN(0x4b523eb7, 0x172ccd1f), TOBN(0x7323cb28, 0x30a6a892), TOBN(0x97082ec0, 0xcfe153eb), TOBN(0xe97f6b6a, 0xf2aadb97)}, {TOBN(0x1d3d393e, 0xd1a83da1), TOBN(0xa6a7f9c7, 0x804b2a68), TOBN(0x4a688b48, 0x2d0cb71e), TOBN(0xa9b4cc5f, 0x40585278)}}, {{TOBN(0x5e5db46a, 0xcb66e132), TOBN(0xf1be963a, 0x0d925880), TOBN(0x944a7027, 0x0317b9e2), TOBN(0xe266f959, 0x48603d48)}, {TOBN(0x98db6673, 
0x5c208899), TOBN(0x90472447, 0xa2fb18a3), TOBN(0x8a966939, 0x777c619f), TOBN(0x3798142a, 0x2a3be21b)}}, {{TOBN(0xb4241cb1, 0x3298b343), TOBN(0xa3a14e49, 0xb44f65a1), TOBN(0xc5f4d6cd, 0x3ac77acd), TOBN(0xd0288cb5, 0x52b6fc3c)}, {TOBN(0xd5cc8c2f, 0x1c040abc), TOBN(0xb675511e, 0x06bf9b4a), TOBN(0xd667da37, 0x9b3aa441), TOBN(0x460d45ce, 0x51601f72)}}, {{TOBN(0xe2f73c69, 0x6755ff89), TOBN(0xdd3cf7e7, 0x473017e6), TOBN(0x8ef5689d, 0x3cf7600d), TOBN(0x948dc4f8, 0xb1fc87b4)}, {TOBN(0xd9e9fe81, 0x4ea53299), TOBN(0x2d921ca2, 0x98eb6028), TOBN(0xfaecedfd, 0x0c9803fc), TOBN(0xf38ae891, 0x4d7b4745)}}, {{TOBN(0xd8c5fccf, 0xc5e3a3d8), TOBN(0xbefd904c, 0x4079dfbf), TOBN(0xbc6d6a58, 0xfead0197), TOBN(0x39227077, 0x695532a4)}, {TOBN(0x09e23e6d, 0xdbef42f5), TOBN(0x7e449b64, 0x480a9908), TOBN(0x7b969c1a, 0xad9a2e40), TOBN(0x6231d792, 0x9591c2a4)}}, {{TOBN(0x87151456, 0x0f664534), TOBN(0x85ceae7c, 0x4b68f103), TOBN(0xac09c4ae, 0x65578ab9), TOBN(0x33ec6868, 0xf044b10c)}, {TOBN(0x6ac4832b, 0x3a8ec1f1), TOBN(0x5509d128, 0x5847d5ef), TOBN(0xf909604f, 0x763f1574), TOBN(0xb16c4303, 0xc32f63c4)}}, {{TOBN(0xb6ab2014, 0x7ca23cd3), TOBN(0xcaa7a5c6, 0xa391849d), TOBN(0x5b0673a3, 0x75678d94), TOBN(0xc982ddd4, 0xdd303e64)}, {TOBN(0xfd7b000b, 0x5db6f971), TOBN(0xbba2cb1f, 0x6f876f92), TOBN(0xc77332a3, 0x3c569426), TOBN(0xa159100c, 0x570d74f8)}}, {{TOBN(0xfd16847f, 0xdec67ef5), TOBN(0x742ee464, 0x233e76b7), TOBN(0x0b8e4134, 0xefc2b4c8), TOBN(0xca640b86, 0x42a3e521)}, {TOBN(0x653a0190, 0x8ceb6aa9), TOBN(0x313c300c, 0x547852d5), TOBN(0x24e4ab12, 0x6b237af7), TOBN(0x2ba90162, 0x8bb47af8)}}, {{TOBN(0x3d5e58d6, 0xa8219bb7), TOBN(0xc691d0bd, 0x1b06c57f), TOBN(0x0ae4cb10, 0xd257576e), TOBN(0x3569656c, 0xd54a3dc3)}, {TOBN(0xe5ebaebd, 0x94cda03a), TOBN(0x934e82d3, 0x162bfe13), TOBN(0x450ac0ba, 0xe251a0c6), TOBN(0x480b9e11, 0xdd6da526)}}, {{TOBN(0x00467bc5, 0x8cce08b5), TOBN(0xb636458c, 0x7f178d55), TOBN(0xc5748bae, 0xa677d806), TOBN(0x2763a387, 0xdfa394eb)}, {TOBN(0xa12b448a, 0x7d3cebb6), TOBN(0xe7adda3e, 0x6f20d850), TOBN(0xf63ebce5, 0x1558462c), TOBN(0x58b36143, 0x620088a8)}}, {{TOBN(0x8a2cc3ca, 0x4d63c0ee), TOBN(0x51233117, 0x0fe948ce), TOBN(0x7463fd85, 0x222ef33b), TOBN(0xadf0c7dc, 0x7c603d6c)}, {TOBN(0x0ec32d3b, 0xfe7765e5), TOBN(0xccaab359, 0xbf380409), TOBN(0xbdaa84d6, 0x8e59319c), TOBN(0xd9a4c280, 0x9c80c34d)}}, {{TOBN(0xa9d89488, 0xa059c142), TOBN(0x6f5ae714, 0xff0b9346), TOBN(0x068f237d, 0x16fb3664), TOBN(0x5853e4c4, 0x363186ac)}, {TOBN(0xe2d87d23, 0x63c52f98), TOBN(0x2ec4a766, 0x81828876), TOBN(0x47b864fa, 0xe14e7b1c), TOBN(0x0c0bc0e5, 0x69192408)}}, {{TOBN(0xe4d7681d, 0xb82e9f3e), TOBN(0x83200f0b, 0xdf25e13c), TOBN(0x8909984c, 0x66f27280), TOBN(0x462d7b00, 0x75f73227)}, {TOBN(0xd90ba188, 0xf2651798), TOBN(0x74c6e18c, 0x36ab1c34), TOBN(0xab256ea3, 0x5ef54359), TOBN(0x03466612, 0xd1aa702f)}}, {{TOBN(0x624d6049, 0x2ed22e91), TOBN(0x6fdfe0b5, 0x6f072822), TOBN(0xeeca1115, 0x39ce2271), TOBN(0x98100a4f, 0xdb01614f)}, {TOBN(0xb6b0daa2, 0xa35c628f), TOBN(0xb6f94d2e, 0xc87e9a47), TOBN(0xc6773259, 0x1d57d9ce), TOBN(0xf70bfeec, 0x03884a7b)}}, {{TOBN(0x5fb35ccf, 0xed2bad01), TOBN(0xa155cbe3, 0x1da6a5c7), TOBN(0xc2e2594c, 0x30a92f8f), TOBN(0x649c89ce, 0x5bfafe43)}, {TOBN(0xd158667d, 0xe9ff257a), TOBN(0x9b359611, 0xf32c50ae), TOBN(0x4b00b20b, 0x906014cf), TOBN(0xf3a8cfe3, 0x89bc7d3d)}}, {{TOBN(0x4ff23ffd, 0x248a7d06), TOBN(0x80c5bfb4, 0x878873fa), TOBN(0xb7d9ad90, 0x05745981), TOBN(0x179c85db, 0x3db01994)}, {TOBN(0xba41b062, 0x61a6966c), TOBN(0x4d82d052, 0xeadce5a8), TOBN(0x9e91cd3b, 0xa5e6a318), TOBN(0x47795f4f, 0x95b2dda0)}}, 
{{TOBN(0xecfd7c1f, 0xd55a897c), TOBN(0x009194ab, 0xb29110fb), TOBN(0x5f0e2046, 0xe381d3b0), TOBN(0x5f3425f6, 0xa98dd291)}, {TOBN(0xbfa06687, 0x730d50da), TOBN(0x0423446c, 0x4b083b7f), TOBN(0x397a247d, 0xd69d3417), TOBN(0xeb629f90, 0x387ba42a)}}, {{TOBN(0x1ee426cc, 0xd5cd79bf), TOBN(0x0032940b, 0x946c6e18), TOBN(0x1b1e8ae0, 0x57477f58), TOBN(0xe94f7d34, 0x6d823278)}, {TOBN(0xc747cb96, 0x782ba21a), TOBN(0xc5254469, 0xf72b33a5), TOBN(0x772ef6de, 0xc7f80c81), TOBN(0xd73acbfe, 0x2cd9e6b5)}}, {{TOBN(0x4075b5b1, 0x49ee90d9), TOBN(0x785c339a, 0xa06e9eba), TOBN(0xa1030d5b, 0xabf825e0), TOBN(0xcec684c3, 0xa42931dc)}, {TOBN(0x42ab62c9, 0xc1586e63), TOBN(0x45431d66, 0x5ab43f2b), TOBN(0x57c8b2c0, 0x55f7835d), TOBN(0x033da338, 0xc1b7f865)}}, {{TOBN(0x283c7513, 0xcaa76097), TOBN(0x0a624fa9, 0x36c83906), TOBN(0x6b20afec, 0x715af2c7), TOBN(0x4b969974, 0xeba78bfd)}, {TOBN(0x220755cc, 0xd921d60e), TOBN(0x9b944e10, 0x7baeca13), TOBN(0x04819d51, 0x5ded93d4), TOBN(0x9bbff86e, 0x6dddfd27)}}, {{TOBN(0x6b344130, 0x77adc612), TOBN(0xa7496529, 0xbbd803a0), TOBN(0x1a1baaa7, 0x6d8805bd), TOBN(0xc8403902, 0x470343ad)}, {TOBN(0x39f59f66, 0x175adff1), TOBN(0x0b26d7fb, 0xb7d8c5b7), TOBN(0xa875f5ce, 0x529d75e3), TOBN(0x85efc7e9, 0x41325cc2)}}, {{TOBN(0x21950b42, 0x1ff6acd3), TOBN(0xffe70484, 0x53dc6909), TOBN(0xff4cd0b2, 0x28766127), TOBN(0xabdbe608, 0x4fb7db2b)}, {TOBN(0x837c9228, 0x5e1109e8), TOBN(0x26147d27, 0xf4645b5a), TOBN(0x4d78f592, 0xf7818ed8), TOBN(0xd394077e, 0xf247fa36)}}, {{TOBN(0x0fb9c2d0, 0x488c171a), TOBN(0xa78bfbaa, 0x13685278), TOBN(0xedfbe268, 0xd5b1fa6a), TOBN(0x0dceb8db, 0x2b7eaba7)}, {TOBN(0xbf9e8089, 0x9ae2b710), TOBN(0xefde7ae6, 0xa4449c96), TOBN(0x43b7716b, 0xcc143a46), TOBN(0xd7d34194, 0xc3628c13)}}, {{TOBN(0x508cec1c, 0x3b3f64c9), TOBN(0xe20bc0ba, 0x1e5edf3f), TOBN(0xda1deb85, 0x2f4318d4), TOBN(0xd20ebe0d, 0x5c3fa443)}, {TOBN(0x370b4ea7, 0x73241ea3), TOBN(0x61f1511c, 0x5e1a5f65), TOBN(0x99a5e23d, 0x82681c62), TOBN(0xd731e383, 0xa2f54c2d)}}, {{TOBN(0x2692f36e, 0x83445904), TOBN(0x2e0ec469, 0xaf45f9c0), TOBN(0x905a3201, 0xc67528b7), TOBN(0x88f77f34, 0xd0e5e542)}, {TOBN(0xf67a8d29, 0x5864687c), TOBN(0x23b92eae, 0x22df3562), TOBN(0x5c27014b, 0x9bbec39e), TOBN(0x7ef2f226, 0x9c0f0f8d)}}, {{TOBN(0x97359638, 0x546c4d8d), TOBN(0x5f9c3fc4, 0x92f24679), TOBN(0x912e8bed, 0xa8c8acd9), TOBN(0xec3a318d, 0x306634b0)}, {TOBN(0x80167f41, 0xc31cb264), TOBN(0x3db82f6f, 0x522113f2), TOBN(0xb155bcd2, 0xdcafe197), TOBN(0xfba1da59, 0x43465283)}}, {{TOBN(0xa0425b8e, 0xb212cf53), TOBN(0x4f2e512e, 0xf8557c5f), TOBN(0xc1286ff9, 0x25c4d56c), TOBN(0xbb8a0fea, 0xee26c851)}, {TOBN(0xc28f70d2, 0xe7d6107e), TOBN(0x7ee0c444, 0xe76265aa), TOBN(0x3df277a4, 0x1d1936b1), TOBN(0x1a556e3f, 0xea9595eb)}}, {{TOBN(0x258bbbf9, 0xe7305683), TOBN(0x31eea5bf, 0x07ef5be6), TOBN(0x0deb0e4a, 0x46c814c1), TOBN(0x5cee8449, 0xa7b730dd)}, {TOBN(0xeab495c5, 0xa0182bde), TOBN(0xee759f87, 0x9e27a6b4), TOBN(0xc2cf6a68, 0x80e518ca), TOBN(0x25e8013f, 0xf14cf3f4)}}, {{TOBN(0x8fc44140, 0x7e8d7a14), TOBN(0xbb1ff3ca, 0x9556f36a), TOBN(0x6a844385, 0x14600044), TOBN(0xba3f0c4a, 0x7451ae63)}, {TOBN(0xdfcac25b, 0x1f9af32a), TOBN(0x01e0db86, 0xb1f2214b), TOBN(0x4e9a5bc2, 0xa4b596ac), TOBN(0x83927681, 0x026c2c08)}}, {{TOBN(0x3ec832e7, 0x7acaca28), TOBN(0x1bfeea57, 0xc7385b29), TOBN(0x068212e3, 0xfd1eaf38), TOBN(0xc1329830, 0x6acf8ccc)}, {TOBN(0xb909f2db, 0x2aac9e59), TOBN(0x5748060d, 0xb661782a), TOBN(0xc5ab2632, 0xc79b7a01), TOBN(0xda44c6c6, 0x00017626)}}, {{TOBN(0xf26c00e8, 0xa7ea82f0), TOBN(0x99cac80d, 0xe4299aaf), TOBN(0xd66fe3b6, 0x7ed78be1), TOBN(0x305f725f, 
0x648d02cd)}, {TOBN(0x33ed1bc4, 0x623fb21b), TOBN(0xfa70533e, 0x7a6319ad), TOBN(0x17ab562d, 0xbe5ffb3e), TOBN(0x06374994, 0x56674741)}}, {{TOBN(0x69d44ed6, 0x5c46aa8e), TOBN(0x2100d5d3, 0xa8d063d1), TOBN(0xcb9727ea, 0xa2d17c36), TOBN(0x4c2bab1b, 0x8add53b7)}, {TOBN(0xa084e90c, 0x15426704), TOBN(0x778afcd3, 0xa837ebea), TOBN(0x6651f701, 0x7ce477f8), TOBN(0xa0624998, 0x46fb7a8b)}}, {{TOBN(0xdc1e6828, 0xed8a6e19), TOBN(0x33fc2336, 0x4189d9c7), TOBN(0x026f8fe2, 0x671c39bc), TOBN(0xd40c4ccd, 0xbc6f9915)}, {TOBN(0xafa135bb, 0xf80e75ca), TOBN(0x12c651a0, 0x22adff2c), TOBN(0xc40a04bd, 0x4f51ad96), TOBN(0x04820109, 0xbbe4e832)}}, {{TOBN(0x3667eb1a, 0x7f4c04cc), TOBN(0x59556621, 0xa9404f84), TOBN(0x71cdf653, 0x7eceb50a), TOBN(0x994a44a6, 0x9b8335fa)}, {TOBN(0xd7faf819, 0xdbeb9b69), TOBN(0x473c5680, 0xeed4350d), TOBN(0xb6658466, 0xda44bba2), TOBN(0x0d1bc780, 0x872bdbf3)}}, {{TOBN(0xe535f175, 0xa1962f91), TOBN(0x6ed7e061, 0xed58f5a7), TOBN(0x177aa4c0, 0x2089a233), TOBN(0x0dbcb03a, 0xe539b413)}, {TOBN(0xe3dc424e, 0xbb32e38e), TOBN(0x6472e5ef, 0x6806701e), TOBN(0xdd47ff98, 0x814be9ee), TOBN(0x6b60cfff, 0x35ace009)}}, {{TOBN(0xb8d3d931, 0x9ff91fe5), TOBN(0x039c4800, 0xf0518eed), TOBN(0x95c37632, 0x9182cb26), TOBN(0x0763a434, 0x82fc568d)}, {TOBN(0x707c04d5, 0x383e76ba), TOBN(0xac98b930, 0x824e8197), TOBN(0x92bf7c8f, 0x91230de0), TOBN(0x90876a01, 0x40959b70)}}, {{TOBN(0xdb6d96f3, 0x05968b80), TOBN(0x380a0913, 0x089f73b9), TOBN(0x7da70b83, 0xc2c61e01), TOBN(0x95fb8394, 0x569b38c7)}, {TOBN(0x9a3c6512, 0x80edfe2f), TOBN(0x8f726bb9, 0x8faeaf82), TOBN(0x8010a4a0, 0x78424bf8), TOBN(0x29672044, 0x0e844970)}}}, {{{TOBN(0x63c5cb81, 0x7a2ad62a), TOBN(0x7ef2b6b9, 0xac62ff54), TOBN(0x3749bba4, 0xb3ad9db5), TOBN(0xad311f2c, 0x46d5a617)}, {TOBN(0xb77a8087, 0xc2ff3b6d), TOBN(0xb46feaf3, 0x367834ff), TOBN(0xf8aa266d, 0x75d6b138), TOBN(0xfa38d320, 0xec008188)}}, {{TOBN(0x486d8ffa, 0x696946fc), TOBN(0x50fbc6d8, 0xb9cba56d), TOBN(0x7e3d423e, 0x90f35a15), TOBN(0x7c3da195, 0xc0dd962c)}, {TOBN(0xe673fdb0, 0x3cfd5d8b), TOBN(0x0704b7c2, 0x889dfca5), TOBN(0xf6ce581f, 0xf52305aa), TOBN(0x399d49eb, 0x914d5e53)}}, {{TOBN(0x380a496d, 0x6ec293cd), TOBN(0x733dbda7, 0x8e7051f5), TOBN(0x037e388d, 0xb849140a), TOBN(0xee4b32b0, 0x5946dbf6)}, {TOBN(0xb1c4fda9, 0xcae368d1), TOBN(0x5001a7b0, 0xfdb0b2f3), TOBN(0x6df59374, 0x2e3ac46e), TOBN(0x4af675f2, 0x39b3e656)}}, {{TOBN(0x44e38110, 0x39949296), TOBN(0x5b63827b, 0x361db1b5), TOBN(0x3e5323ed, 0x206eaff5), TOBN(0x942370d2, 0xc21f4290)}, {TOBN(0xf2caaf2e, 0xe0d985a1), TOBN(0x192cc64b, 0x7239846d), TOBN(0x7c0b8f47, 0xae6312f8), TOBN(0x7dc61f91, 0x96620108)}}, {{TOBN(0xb830fb5b, 0xc2da7de9), TOBN(0xd0e643df, 0x0ff8d3be), TOBN(0x31ee77ba, 0x188a9641), TOBN(0x4e8aa3aa, 0xbcf6d502)}, {TOBN(0xf9fb6532, 0x9a49110f), TOBN(0xd18317f6, 0x2dd6b220), TOBN(0x7e3ced41, 0x52c3ea5a), TOBN(0x0d296a14, 0x7d579c4a)}}, {{TOBN(0x35d6a53e, 0xed4c3717), TOBN(0x9f8240cf, 0x3d0ed2a3), TOBN(0x8c0d4d05, 0xe5543aa5), TOBN(0x45d5bbfb, 0xdd33b4b4)}, {TOBN(0xfa04cc73, 0x137fd28e), TOBN(0x862ac6ef, 0xc73b3ffd), TOBN(0x403ff9f5, 0x31f51ef2), TOBN(0x34d5e0fc, 0xbc73f5a2)}}, {{TOBN(0xf2526820, 0x08913f4f), TOBN(0xea20ed61, 0xeac93d95), TOBN(0x51ed38b4, 0x6ca6b26c), TOBN(0x8662dcbc, 0xea4327b0)}, {TOBN(0x6daf295c, 0x725d2aaa), TOBN(0xbad2752f, 0x8e52dcda), TOBN(0x2210e721, 0x0b17dacc), TOBN(0xa37f7912, 0xd51e8232)}}, {{TOBN(0x4f7081e1, 0x44cc3add), TOBN(0xd5ffa1d6, 0x87be82cf), TOBN(0x89890b6c, 0x0edd6472), TOBN(0xada26e1a, 0x3ed17863)}, {TOBN(0x276f2715, 0x63483caa), TOBN(0xe6924cd9, 0x2f6077fd), TOBN(0x05a7fe98, 0x0a466e3c), 
TOBN(0xf1c794b0, 0xb1902d1f)}}, {{TOBN(0xe5213688, 0x82a8042c), TOBN(0xd931cfaf, 0xcd278298), TOBN(0x069a0ae0, 0xf597a740), TOBN(0x0adbb3f3, 0xeb59107c)}, {TOBN(0x983e951e, 0x5eaa8eb8), TOBN(0xe663a8b5, 0x11b48e78), TOBN(0x1631cc0d, 0x8a03f2c5), TOBN(0x7577c11e, 0x11e271e2)}}, {{TOBN(0x33b2385c, 0x08369a90), TOBN(0x2990c59b, 0x190eb4f8), TOBN(0x819a6145, 0xc68eac80), TOBN(0x7a786d62, 0x2ec4a014)}, {TOBN(0x33faadbe, 0x20ac3a8d), TOBN(0x31a21781, 0x5aba2d30), TOBN(0x209d2742, 0xdba4f565), TOBN(0xdb2ce9e3, 0x55aa0fbb)}}, {{TOBN(0x8cef334b, 0x168984df), TOBN(0xe81dce17, 0x33879638), TOBN(0xf6e6949c, 0x263720f0), TOBN(0x5c56feaf, 0xf593cbec)}, {TOBN(0x8bff5601, 0xfde58c84), TOBN(0x74e24117, 0x2eccb314), TOBN(0xbcf01b61, 0x4c9a8a78), TOBN(0xa233e35e, 0x544c9868)}}, {{TOBN(0xb3156bf3, 0x8bd7aff1), TOBN(0x1b5ee4cb, 0x1d81b146), TOBN(0x7ba1ac41, 0xd628a915), TOBN(0x8f3a8f9c, 0xfd89699e)}, {TOBN(0x7329b9c9, 0xa0748be7), TOBN(0x1d391c95, 0xa92e621f), TOBN(0xe51e6b21, 0x4d10a837), TOBN(0xd255f53a, 0x4947b435)}}, {{TOBN(0x07669e04, 0xf1788ee3), TOBN(0xc14f27af, 0xa86938a2), TOBN(0x8b47a334, 0xe93a01c0), TOBN(0xff627438, 0xd9366808)}, {TOBN(0x7a0985d8, 0xca2a5965), TOBN(0x3d9a5542, 0xd6e9b9b3), TOBN(0xc23eb80b, 0x4cf972e8), TOBN(0x5c1c33bb, 0x4fdf72fd)}}, {{TOBN(0x0c4a58d4, 0x74a86108), TOBN(0xf8048a8f, 0xee4c5d90), TOBN(0xe3c7c924, 0xe86d4c80), TOBN(0x28c889de, 0x056a1e60)}, {TOBN(0x57e2662e, 0xb214a040), TOBN(0xe8c48e98, 0x37e10347), TOBN(0x87742862, 0x80ac748a), TOBN(0xf1c24022, 0x186b06f2)}}, {{TOBN(0xac2dd4c3, 0x5f74040a), TOBN(0x409aeb71, 0xfceac957), TOBN(0x4fbad782, 0x55c4ec23), TOBN(0xb359ed61, 0x8a7b76ec)}, {TOBN(0x12744926, 0xed6f4a60), TOBN(0xe21e8d7f, 0x4b912de3), TOBN(0xe2575a59, 0xfc705a59), TOBN(0x72f1d4de, 0xed2dbc0e)}}, {{TOBN(0x3d2b24b9, 0xeb7926b8), TOBN(0xbff88cb3, 0xcdbe5509), TOBN(0xd0f399af, 0xe4dd640b), TOBN(0x3c5fe130, 0x2f76ed45)}, {TOBN(0x6f3562f4, 0x3764fb3d), TOBN(0x7b5af318, 0x3151b62d), TOBN(0xd5bd0bc7, 0xd79ce5f3), TOBN(0xfdaf6b20, 0xec66890f)}}, {{TOBN(0x735c67ec, 0x6063540c), TOBN(0x50b259c2, 0xe5f9cb8f), TOBN(0xb8734f9a, 0x3f99c6ab), TOBN(0xf8cc13d5, 0xa3a7bc85)}, {TOBN(0x80c1b305, 0xc5217659), TOBN(0xfe5364d4, 0x4ec12a54), TOBN(0xbd87045e, 0x681345fe), TOBN(0x7f8efeb1, 0x582f897f)}}, {{TOBN(0xe8cbf1e5, 0xd5923359), TOBN(0xdb0cea9d, 0x539b9fb0), TOBN(0x0c5b34cf, 0x49859b98), TOBN(0x5e583c56, 0xa4403cc6)}, {TOBN(0x11fc1a2d, 0xd48185b7), TOBN(0xc93fbc7e, 0x6e521787), TOBN(0x47e7a058, 0x05105b8b), TOBN(0x7b4d4d58, 0xdb8260c8)}}, {{TOBN(0xe33930b0, 0x46eb842a), TOBN(0x8e844a9a, 0x7bdae56d), TOBN(0x34ef3a9e, 0x13f7fdfc), TOBN(0xb3768f82, 0x636ca176)}, {TOBN(0x2821f4e0, 0x4e09e61c), TOBN(0x414dc3a1, 0xa0c7cddc), TOBN(0xd5379437, 0x54945fcd), TOBN(0x151b6eef, 0xb3555ff1)}}, {{TOBN(0xb31bd613, 0x6339c083), TOBN(0x39ff8155, 0xdfb64701), TOBN(0x7c3388d2, 0xe29604ab), TOBN(0x1e19084b, 0xa6b10442)}, {TOBN(0x17cf54c0, 0xeccd47ef), TOBN(0x89693385, 0x4a5dfb30), TOBN(0x69d023fb, 0x47daf9f6), TOBN(0x9222840b, 0x7d91d959)}}, {{TOBN(0x439108f5, 0x803bac62), TOBN(0x0b7dd91d, 0x379bd45f), TOBN(0xd651e827, 0xca63c581), TOBN(0x5c5d75f6, 0x509c104f)}, {TOBN(0x7d5fc738, 0x1f2dc308), TOBN(0x20faa7bf, 0xd98454be), TOBN(0x95374bee, 0xa517b031), TOBN(0xf036b9b1, 0x642692ac)}}, {{TOBN(0xc5106109, 0x39842194), TOBN(0xb7e2353e, 0x49d05295), TOBN(0xfc8c1d5c, 0xefb42ee0), TOBN(0xe04884eb, 0x08ce811c)}, {TOBN(0xf1f75d81, 0x7419f40e), TOBN(0x5b0ac162, 0xa995c241), TOBN(0x120921bb, 0xc4c55646), TOBN(0x713520c2, 0x8d33cf97)}}, {{TOBN(0xb4a65a5c, 0xe98c5100), TOBN(0x6cec871d, 0x2ddd0f5a), TOBN(0x251f0b7f, 
0x9ba2e78b), TOBN(0x224a8434, 0xce3a2a5f)}, {TOBN(0x26827f61, 0x25f5c46f), TOBN(0x6a22bedc, 0x48545ec0), TOBN(0x25ae5fa0, 0xb1bb5cdc), TOBN(0xd693682f, 0xfcb9b98f)}}, {{TOBN(0x32027fe8, 0x91e5d7d3), TOBN(0xf14b7d17, 0x73a07678), TOBN(0xf88497b3, 0xc0dfdd61), TOBN(0xf7c2eec0, 0x2a8c4f48)}, {TOBN(0xaa5573f4, 0x3756e621), TOBN(0xc013a240, 0x1825b948), TOBN(0x1c03b345, 0x63878572), TOBN(0xa0472bea, 0x653a4184)}}, {{TOBN(0xf4222e27, 0x0ac69a80), TOBN(0x34096d25, 0xf51e54f6), TOBN(0x00a648cb, 0x8fffa591), TOBN(0x4e87acdc, 0x69b6527f)}, {TOBN(0x0575e037, 0xe285ccb4), TOBN(0x188089e4, 0x50ddcf52), TOBN(0xaa96c9a8, 0x870ff719), TOBN(0x74a56cd8, 0x1fc7e369)}}, {{TOBN(0x41d04ee2, 0x1726931a), TOBN(0x0bbbb2c8, 0x3660ecfd), TOBN(0xa6ef6de5, 0x24818e18), TOBN(0xe421cc51, 0xe7d57887)}, {TOBN(0xf127d208, 0xbea87be6), TOBN(0x16a475d3, 0xb1cdd682), TOBN(0x9db1b684, 0x439b63f7), TOBN(0x5359b3db, 0xf0f113b6)}}, {{TOBN(0xdfccf1de, 0x8bf06e31), TOBN(0x1fdf8f44, 0xdd383901), TOBN(0x10775cad, 0x5017e7d2), TOBN(0xdfc3a597, 0x58d11eef)}, {TOBN(0x6ec9c8a0, 0xb1ecff10), TOBN(0xee6ed6cc, 0x28400549), TOBN(0xb5ad7bae, 0x1b4f8d73), TOBN(0x61b4f11d, 0xe00aaab9)}}, {{TOBN(0x7b32d69b, 0xd4eff2d7), TOBN(0x88ae6771, 0x4288b60f), TOBN(0x159461b4, 0x37a1e723), TOBN(0x1f3d4789, 0x570aae8c)}, {TOBN(0x869118c0, 0x7f9871da), TOBN(0x35fbda78, 0xf635e278), TOBN(0x738f3641, 0xe1541dac), TOBN(0x6794b13a, 0xc0dae45f)}}, {{TOBN(0x065064ac, 0x09cc0917), TOBN(0x27c53729, 0xc68540fd), TOBN(0x0d2d4c8e, 0xef227671), TOBN(0xd23a9f80, 0xa1785a04)}, {TOBN(0x98c59528, 0x52650359), TOBN(0xfa09ad01, 0x74a1acad), TOBN(0x082d5a29, 0x0b55bf5c), TOBN(0xa40f1c67, 0x419b8084)}}, {{TOBN(0x3a5c752e, 0xdcc18770), TOBN(0x4baf1f2f, 0x8825c3a5), TOBN(0xebd63f74, 0x21b153ed), TOBN(0xa2383e47, 0xb2f64723)}, {TOBN(0xe7bf620a, 0x2646d19a), TOBN(0x56cb44ec, 0x03c83ffd), TOBN(0xaf7267c9, 0x4f6be9f1), TOBN(0x8b2dfd7b, 0xc06bb5e9)}}, {{TOBN(0xb87072f2, 0xa672c5c7), TOBN(0xeacb11c8, 0x0d53c5e2), TOBN(0x22dac29d, 0xff435932), TOBN(0x37bdb99d, 0x4408693c)}, {TOBN(0xf6e62fb6, 0x2899c20f), TOBN(0x3535d512, 0x447ece24), TOBN(0xfbdc6b88, 0xff577ce3), TOBN(0x726693bd, 0x190575f2)}}, {{TOBN(0x6772b0e5, 0xab4b35a2), TOBN(0x1d8b6001, 0xf5eeaacf), TOBN(0x728f7ce4, 0x795b9580), TOBN(0x4a20ed2a, 0x41fb81da)}, {TOBN(0x9f685cd4, 0x4fec01e6), TOBN(0x3ed7ddcc, 0xa7ff50ad), TOBN(0x460fd264, 0x0c2d97fd), TOBN(0x3a241426, 0xeb82f4f9)}}, {{TOBN(0x17d1df2c, 0x6a8ea820), TOBN(0xb2b50d3b, 0xf22cc254), TOBN(0x03856cba, 0xb7291426), TOBN(0x87fd26ae, 0x04f5ee39)}, {TOBN(0x9cb696cc, 0x02bee4ba), TOBN(0x53121804, 0x06820fd6), TOBN(0xa5dfc269, 0x0212e985), TOBN(0x666f7ffa, 0x160f9a09)}}, {{TOBN(0xc503cd33, 0xbccd9617), TOBN(0x365dede4, 0xba7730a3), TOBN(0x798c6355, 0x5ddb0786), TOBN(0xa6c3200e, 0xfc9cd3bc)}, {TOBN(0x060ffb2c, 0xe5e35efd), TOBN(0x99a4e25b, 0x5555a1c1), TOBN(0x11d95375, 0xf70b3751), TOBN(0x0a57354a, 0x160e1bf6)}}, {{TOBN(0xecb3ae4b, 0xf8e4b065), TOBN(0x07a834c4, 0x2e53022b), TOBN(0x1cd300b3, 0x8692ed96), TOBN(0x16a6f792, 0x61ee14ec)}, {TOBN(0x8f1063c6, 0x6a8649ed), TOBN(0xfbcdfcfe, 0x869f3e14), TOBN(0x2cfb97c1, 0x00a7b3ec), TOBN(0xcea49b3c, 0x7130c2f1)}}, {{TOBN(0x462d044f, 0xe9d96488), TOBN(0x4b53d52e, 0x8182a0c1), TOBN(0x84b6ddd3, 0x0391e9e9), TOBN(0x80ab7b48, 0xb1741a09)}, {TOBN(0xec0e15d4, 0x27d3317f), TOBN(0x8dfc1ddb, 0x1a64671e), TOBN(0x93cc5d5f, 0xd49c5b92), TOBN(0xc995d53d, 0x3674a331)}}, {{TOBN(0x302e41ec, 0x090090ae), TOBN(0x2278a0cc, 0xedb06830), TOBN(0x1d025932, 0xfbc99690), TOBN(0x0c32fbd2, 0xb80d68da)}, {TOBN(0xd79146da, 0xf341a6c1), TOBN(0xae0ba139, 0x1bef68a0), 
TOBN(0xc6b8a563, 0x8d774b3a), TOBN(0x1cf307bd, 0x880ba4d7)}}, {{TOBN(0xc033bdc7, 0x19803511), TOBN(0xa9f97b3b, 0x8888c3be), TOBN(0x3d68aebc, 0x85c6d05e), TOBN(0xc3b88a9d, 0x193919eb)}, {TOBN(0x2d300748, 0xc48b0ee3), TOBN(0x7506bc7c, 0x07a746c1), TOBN(0xfc48437c, 0x6e6d57f3), TOBN(0x5bd71587, 0xcfeaa91a)}}, {{TOBN(0xa4ed0408, 0xc1bc5225), TOBN(0xd0b946db, 0x2719226d), TOBN(0x109ecd62, 0x758d2d43), TOBN(0x75c8485a, 0x2751759b)}, {TOBN(0xb0b75f49, 0x9ce4177a), TOBN(0x4fa61a1e, 0x79c10c3d), TOBN(0xc062d300, 0xa167fcd7), TOBN(0x4df3874c, 0x750f0fa8)}}, {{TOBN(0x29ae2cf9, 0x83dfedc9), TOBN(0xf8437134, 0x8d87631a), TOBN(0xaf571711, 0x7429c8d2), TOBN(0x18d15867, 0x146d9272)}, {TOBN(0x83053ecf, 0x69769bb7), TOBN(0xc55eb856, 0xc479ab82), TOBN(0x5ef7791c, 0x21b0f4b2), TOBN(0xaa5956ba, 0x3d491525)}}, {{TOBN(0x407a96c2, 0x9fe20eba), TOBN(0xf27168bb, 0xe52a5ad3), TOBN(0x43b60ab3, 0xbf1d9d89), TOBN(0xe45c51ef, 0x710e727a)}, {TOBN(0xdfca5276, 0x099b4221), TOBN(0x8dc6407c, 0x2557a159), TOBN(0x0ead8335, 0x91035895), TOBN(0x0a9db957, 0x9c55dc32)}}, {{TOBN(0xe40736d3, 0xdf61bc76), TOBN(0x13a619c0, 0x3f778cdb), TOBN(0x6dd921a4, 0xc56ea28f), TOBN(0x76a52433, 0x2fa647b4)}, {TOBN(0x23591891, 0xac5bdc5d), TOBN(0xff4a1a72, 0xbac7dc01), TOBN(0x9905e261, 0x62df8453), TOBN(0x3ac045df, 0xe63b265f)}}, {{TOBN(0x8a3f341b, 0xad53dba7), TOBN(0x8ec269cc, 0x837b625a), TOBN(0xd71a2782, 0x3ae31189), TOBN(0x8fb4f9a3, 0x55e96120)}, {TOBN(0x804af823, 0xff9875cf), TOBN(0x23224f57, 0x5d442a9b), TOBN(0x1c4d3b9e, 0xecc62679), TOBN(0x91da22fb, 0xa0e7ddb1)}}, {{TOBN(0xa370324d, 0x6c04a661), TOBN(0x9710d3b6, 0x5e376d17), TOBN(0xed8c98f0, 0x3044e357), TOBN(0xc364ebbe, 0x6422701c)}, {TOBN(0x347f5d51, 0x7733d61c), TOBN(0xd55644b9, 0xcea826c3), TOBN(0x80c6e0ad, 0x55a25548), TOBN(0x0aa7641d, 0x844220a7)}}, {{TOBN(0x1438ec81, 0x31810660), TOBN(0x9dfa6507, 0xde4b4043), TOBN(0x10b515d8, 0xcc3e0273), TOBN(0x1b6066dd, 0x28d8cfb2)}, {TOBN(0xd3b04591, 0x9c9efebd), TOBN(0x425d4bdf, 0xa21c1ff4), TOBN(0x5fe5af19, 0xd57607d3), TOBN(0xbbf773f7, 0x54481084)}}, {{TOBN(0x8435bd69, 0x94b03ed1), TOBN(0xd9ad1de3, 0x634cc546), TOBN(0x2cf423fc, 0x00e420ca), TOBN(0xeed26d80, 0xa03096dd)}, {TOBN(0xd7f60be7, 0xa4db09d2), TOBN(0xf47f569d, 0x960622f7), TOBN(0xe5925fd7, 0x7296c729), TOBN(0xeff2db26, 0x26ca2715)}}, {{TOBN(0xa6fcd014, 0xb913e759), TOBN(0x53da4786, 0x8ff4de93), TOBN(0x14616d79, 0xc32068e1), TOBN(0xb187d664, 0xccdf352e)}, {TOBN(0xf7afb650, 0x1dc90b59), TOBN(0x8170e943, 0x7daa1b26), TOBN(0xc8e3bdd8, 0x700c0a84), TOBN(0x6e8d345f, 0x6482bdfa)}}, {{TOBN(0x84cfbfa1, 0xc5c5ea50), TOBN(0xd3baf14c, 0x67960681), TOBN(0x26398403, 0x0dd50942), TOBN(0xe4b7839c, 0x4716a663)}, {TOBN(0xd5f1f794, 0xe7de6dc0), TOBN(0x5cd0f4d4, 0x622aa7ce), TOBN(0x5295f3f1, 0x59acfeec), TOBN(0x8d933552, 0x953e0607)}}, {{TOBN(0xc7db8ec5, 0x776c5722), TOBN(0xdc467e62, 0x2b5f290c), TOBN(0xd4297e70, 0x4ff425a9), TOBN(0x4be924c1, 0x0cf7bb72)}, {TOBN(0x0d5dc5ae, 0xa1892131), TOBN(0x8bf8a8e3, 0xa705c992), TOBN(0x73a0b064, 0x7a305ac5), TOBN(0x00c9ca4e, 0x9a8c77a8)}}, {{TOBN(0x5dfee80f, 0x83774bdd), TOBN(0x63131602, 0x85734485), TOBN(0xa1b524ae, 0x914a69a9), TOBN(0xebc2ffaf, 0xd4e300d7)}, {TOBN(0x52c93db7, 0x7cfa46a5), TOBN(0x71e6161f, 0x21653b50), TOBN(0x3574fc57, 0xa4bc580a), TOBN(0xc09015dd, 0xe1bc1253)}}, {{TOBN(0x4b7b47b2, 0xd174d7aa), TOBN(0x4072d8e8, 0xf3a15d04), TOBN(0xeeb7d47f, 0xd6fa07ed), TOBN(0x6f2b9ff9, 0xedbdafb1)}, {TOBN(0x18c51615, 0x3760fe8a), TOBN(0x7a96e6bf, 0xf06c6c13), TOBN(0x4d7a0410, 0x0ea2d071), TOBN(0xa1914e9b, 0x0be2a5ce)}}, {{TOBN(0x5726e357, 0xd8a3c5cf), TOBN(0x1197ecc3, 
0x2abb2b13), TOBN(0x6c0d7f7f, 0x31ae88dd), TOBN(0x15b20d1a, 0xfdbb3efe)}, {TOBN(0xcd06aa26, 0x70584039), TOBN(0x2277c969, 0xa7dc9747), TOBN(0xbca69587, 0x7855d815), TOBN(0x899ea238, 0x5188b32a)}}, {{TOBN(0x37d9228b, 0x760c1c9d), TOBN(0xc7efbb11, 0x9b5c18da), TOBN(0x7f0d1bc8, 0x19f6dbc5), TOBN(0x4875384b, 0x07e6905b)}, {TOBN(0xc7c50baa, 0x3ba8cd86), TOBN(0xb0ce40fb, 0xc2905de0), TOBN(0x70840673, 0x7a231952), TOBN(0xa912a262, 0xcf43de26)}}, {{TOBN(0x9c38ddcc, 0xeb5b76c1), TOBN(0x746f5285, 0x26fc0ab4), TOBN(0x52a63a50, 0xd62c269f), TOBN(0x60049c55, 0x99458621)}, {TOBN(0xe7f48f82, 0x3c2f7c9e), TOBN(0x6bd99043, 0x917d5cf3), TOBN(0xeb1317a8, 0x8701f469), TOBN(0xbd3fe2ed, 0x9a449fe0)}}, {{TOBN(0x421e79ca, 0x12ef3d36), TOBN(0x9ee3c36c, 0x3e7ea5de), TOBN(0xe48198b5, 0xcdff36f7), TOBN(0xaff4f967, 0xc6b82228)}, {TOBN(0x15e19dd0, 0xc47adb7e), TOBN(0x45699b23, 0x032e7dfa), TOBN(0x40680c8b, 0x1fae026a), TOBN(0x5a347a48, 0x550dbf4d)}}, {{TOBN(0xe652533b, 0x3cef0d7d), TOBN(0xd94f7b18, 0x2bbb4381), TOBN(0x838752be, 0x0e80f500), TOBN(0x8e6e2488, 0x9e9c9bfb)}, {TOBN(0xc9751697, 0x16caca6a), TOBN(0x866c49d8, 0x38531ad9), TOBN(0xc917e239, 0x7151ade1), TOBN(0x2d016ec1, 0x6037c407)}}, {{TOBN(0xa407ccc9, 0x00eac3f9), TOBN(0x835f6280, 0xe2ed4748), TOBN(0xcc54c347, 0x1cc98e0d), TOBN(0x0e969937, 0xdcb572eb)}, {TOBN(0x1b16c8e8, 0x8f30c9cb), TOBN(0xa606ae75, 0x373c4661), TOBN(0x47aa689b, 0x35502cab), TOBN(0xf89014ae, 0x4d9bb64f)}}, {{TOBN(0x202f6a9c, 0x31c71f7b), TOBN(0x01f95aa3, 0x296ffe5c), TOBN(0x5fc06014, 0x53cec3a3), TOBN(0xeb991237, 0x5f498a45)}, {TOBN(0xae9a935e, 0x5d91ba87), TOBN(0xc6ac6281, 0x0b564a19), TOBN(0x8a8fe81c, 0x3bd44e69), TOBN(0x7c8b467f, 0x9dd11d45)}}, {{TOBN(0xf772251f, 0xea5b8e69), TOBN(0xaeecb3bd, 0xc5b75fbc), TOBN(0x1aca3331, 0x887ff0e5), TOBN(0xbe5d49ff, 0x19f0a131)}, {TOBN(0x582c13aa, 0xe5c8646f), TOBN(0xdbaa12e8, 0x20e19980), TOBN(0x8f40f31a, 0xf7abbd94), TOBN(0x1f13f5a8, 0x1dfc7663)}}, {{TOBN(0x5d81f1ee, 0xaceb4fc0), TOBN(0x36256002, 0x5e6f0f42), TOBN(0x4b67d6d7, 0x751370c8), TOBN(0x2608b698, 0x03e80589)}, {TOBN(0xcfc0d2fc, 0x05268301), TOBN(0xa6943d39, 0x40309212), TOBN(0x192a90c2, 0x1fd0e1c2), TOBN(0xb209f113, 0x37f1dc76)}}, {{TOBN(0xefcc5e06, 0x97bf1298), TOBN(0xcbdb6730, 0x219d639e), TOBN(0xd009c116, 0xb81e8c6f), TOBN(0xa3ffdde3, 0x1a7ce2e5)}, {TOBN(0xc53fbaaa, 0xa914d3ba), TOBN(0x836d500f, 0x88df85ee), TOBN(0xd98dc71b, 0x66ee0751), TOBN(0x5a3d7005, 0x714516fd)}}, {{TOBN(0x21d3634d, 0x39eedbba), TOBN(0x35cd2e68, 0x0455a46d), TOBN(0xc8cafe65, 0xf9d7eb0c), TOBN(0xbda3ce9e, 0x00cefb3e)}, {TOBN(0xddc17a60, 0x2c9cf7a4), TOBN(0x01572ee4, 0x7bcb8773), TOBN(0xa92b2b01, 0x8c7548df), TOBN(0x732fd309, 0xa84600e3)}}, {{TOBN(0xe22109c7, 0x16543a40), TOBN(0x9acafd36, 0xfede3c6c), TOBN(0xfb206852, 0x6824e614), TOBN(0x2a4544a9, 0xda25dca0)}, {TOBN(0x25985262, 0x91d60b06), TOBN(0x281b7be9, 0x28753545), TOBN(0xec667b1a, 0x90f13b27), TOBN(0x33a83aff, 0x940e2eb4)}}, {{TOBN(0x80009862, 0xd5d721d5), TOBN(0x0c3357a3, 0x5bd3a182), TOBN(0x27f3a83b, 0x7aa2cda4), TOBN(0xb58ae74e, 0xf6f83085)}, {TOBN(0x2a911a81, 0x2e6dad6b), TOBN(0xde286051, 0xf43d6c5b), TOBN(0x4bdccc41, 0xf996c4d8), TOBN(0xe7312ec0, 0x0ae1e24e)}}}, {{{TOBN(0xf8d112e7, 0x6e6485b3), TOBN(0x4d3e24db, 0x771c52f8), TOBN(0x48e3ee41, 0x684a2f6d), TOBN(0x7161957d, 0x21d95551)}, {TOBN(0x19631283, 0xcdb12a6c), TOBN(0xbf3fa882, 0x2e50e164), TOBN(0xf6254b63, 0x3166cc73), TOBN(0x3aefa7ae, 0xaee8cc38)}}, {{TOBN(0x79b0fe62, 0x3b36f9fd), TOBN(0x26543b23, 0xfde19fc0), TOBN(0x136e64a0, 0x958482ef), TOBN(0x23f63771, 0x9b095825)}, {TOBN(0x14cfd596, 0xb6a1142e), 
TOBN(0x5ea6aac6, 0x335aac0b), TOBN(0x86a0e8bd, 0xf3081dd5), TOBN(0x5fb89d79, 0x003dc12a)}}, {{TOBN(0xf615c33a, 0xf72e34d4), TOBN(0x0bd9ea40, 0x110eec35), TOBN(0x1c12bc5b, 0xc1dea34e), TOBN(0x686584c9, 0x49ae4699)}, {TOBN(0x13ad95d3, 0x8c97b942), TOBN(0x4609561a, 0x4e5c7562), TOBN(0x9e94a4ae, 0xf2737f89), TOBN(0xf57594c6, 0x371c78b6)}}, {{TOBN(0x0f0165fc, 0xe3779ee3), TOBN(0xe00e7f9d, 0xbd495d9e), TOBN(0x1fa4efa2, 0x20284e7a), TOBN(0x4564bade, 0x47ac6219)}, {TOBN(0x90e6312a, 0xc4708e8e), TOBN(0x4f5725fb, 0xa71e9adf), TOBN(0xe95f55ae, 0x3d684b9f), TOBN(0x47f7ccb1, 0x1e94b415)}}, {{TOBN(0x7322851b, 0x8d946581), TOBN(0xf0d13133, 0xbdf4a012), TOBN(0xa3510f69, 0x6584dae0), TOBN(0x03a7c171, 0x3c9f6c6d)}, {TOBN(0x5be97f38, 0xe475381a), TOBN(0xca1ba422, 0x85823334), TOBN(0xf83cc5c7, 0x0be17dda), TOBN(0x158b1494, 0x0b918c0f)}}, {{TOBN(0xda3a77e5, 0x522e6b69), TOBN(0x69c908c3, 0xbbcd6c18), TOBN(0x1f1b9e48, 0xd924fd56), TOBN(0x37c64e36, 0xaa4bb3f7)}, {TOBN(0x5a4fdbdf, 0xee478d7d), TOBN(0xba75c8bc, 0x0193f7a0), TOBN(0x84bc1e84, 0x56cd16df), TOBN(0x1fb08f08, 0x46fad151)}}, {{TOBN(0x8a7cabf9, 0x842e9f30), TOBN(0xa331d4bf, 0x5eab83af), TOBN(0xd272cfba, 0x017f2a6a), TOBN(0x27560abc, 0x83aba0e3)}, {TOBN(0x94b83387, 0x0e3a6b75), TOBN(0x25c6aea2, 0x6b9f50f5), TOBN(0x803d691d, 0xb5fdf6d0), TOBN(0x03b77509, 0xe6333514)}}, {{TOBN(0x36178903, 0x61a341c1), TOBN(0x3604dc60, 0x0cfd6142), TOBN(0x022295eb, 0x8533316c), TOBN(0x3dbde4ac, 0x44af2922)}, {TOBN(0x898afc5d, 0x1c7eef69), TOBN(0x58896805, 0xd14f4fa1), TOBN(0x05002160, 0x203c21ca), TOBN(0x6f0d1f30, 0x40ef730b)}}, {{TOBN(0x8e8c44d4, 0x196224f8), TOBN(0x75a4ab95, 0x374d079d), TOBN(0x79085ecc, 0x7d48f123), TOBN(0x56f04d31, 0x1bf65ad8)}, {TOBN(0xe220bf1c, 0xbda602b2), TOBN(0x73ee1742, 0xf9612c69), TOBN(0x76008fc8, 0x084fd06b), TOBN(0x4000ef9f, 0xf11380d1)}}, {{TOBN(0x48201b4b, 0x12cfe297), TOBN(0x3eee129c, 0x292f74e5), TOBN(0xe1fe114e, 0xc9e874e8), TOBN(0x899b055c, 0x92c5fc41)}, {TOBN(0x4e477a64, 0x3a39c8cf), TOBN(0x82f09efe, 0x78963cc9), TOBN(0x6fd3fd8f, 0xd333f863), TOBN(0x85132b2a, 0xdc949c63)}}, {{TOBN(0x7e06a3ab, 0x516eb17b), TOBN(0x73bec06f, 0xd2c7372b), TOBN(0xe4f74f55, 0xba896da6), TOBN(0xbb4afef8, 0x8e9eb40f)}, {TOBN(0x2d75bec8, 0xe61d66b0), TOBN(0x02bda4b4, 0xef29300b), TOBN(0x8bbaa8de, 0x026baa5a), TOBN(0xff54befd, 0xa07f4440)}}, {{TOBN(0xbd9b8b1d, 0xbe7a2af3), TOBN(0xec51caa9, 0x4fb74a72), TOBN(0xb9937a4b, 0x63879697), TOBN(0x7c9a9d20, 0xec2687d5)}, {TOBN(0x1773e44f, 0x6ef5f014), TOBN(0x8abcf412, 0xe90c6900), TOBN(0x387bd022, 0x8142161e), TOBN(0x50393755, 0xfcb6ff2a)}}, {{TOBN(0x9813fd56, 0xed6def63), TOBN(0x53cf6482, 0x7d53106c), TOBN(0x991a35bd, 0x431f7ac1), TOBN(0xf1e274dd, 0x63e65faf)}, {TOBN(0xf63ffa3c, 0x44cc7880), TOBN(0x411a426b, 0x7c256981), TOBN(0xb698b9fd, 0x93a420e0), TOBN(0x89fdddc0, 0xae53f8fe)}}, {{TOBN(0x766e0722, 0x32398baa), TOBN(0x205fee42, 0x5cfca031), TOBN(0xa49f5341, 0x7a029cf2), TOBN(0xa88c68b8, 0x4023890d)}, {TOBN(0xbc275041, 0x7337aaa8), TOBN(0x9ed364ad, 0x0eb384f4), TOBN(0xe0816f85, 0x29aba92f), TOBN(0x2e9e1941, 0x04e38a88)}}, {{TOBN(0x57eef44a, 0x3dafd2d5), TOBN(0x35d1fae5, 0x97ed98d8), TOBN(0x50628c09, 0x2307f9b1), TOBN(0x09d84aae, 0xd6cba5c6)}, {TOBN(0x67071bc7, 0x88aaa691), TOBN(0x2dea57a9, 0xafe6cb03), TOBN(0xdfe11bb4, 0x3d78ac01), TOBN(0x7286418c, 0x7fd7aa51)}}, {{TOBN(0xfabf7709, 0x77f7195a), TOBN(0x8ec86167, 0xadeb838f), TOBN(0xea1285a8, 0xbb4f012d), TOBN(0xd6883503, 0x9a3eab3f)}, {TOBN(0xee5d24f8, 0x309004c2), TOBN(0xa96e4b76, 0x13ffe95e), TOBN(0x0cdffe12, 0xbd223ea4), TOBN(0x8f5c2ee5, 0xb6739a53)}}, {{TOBN(0x5cb4aaa5, 
0xdd968198), TOBN(0xfa131c52, 0x72413a6c), TOBN(0x53d46a90, 0x9536d903), TOBN(0xb270f0d3, 0x48606d8e)}, {TOBN(0x518c7564, 0xa053a3bc), TOBN(0x088254b7, 0x1a86caef), TOBN(0xb3ba8cb4, 0x0ab5efd0), TOBN(0x5c59900e, 0x4605945d)}}, {{TOBN(0xecace1dd, 0xa1887395), TOBN(0x40960f36, 0x932a65de), TOBN(0x9611ff5c, 0x3aa95529), TOBN(0xc58215b0, 0x7c1e5a36)}, {TOBN(0xd48c9b58, 0xf0e1a524), TOBN(0xb406856b, 0xf590dfb8), TOBN(0xc7605e04, 0x9cd95662), TOBN(0x0dd036ee, 0xa33ecf82)}}, {{TOBN(0xa50171ac, 0xc33156b3), TOBN(0xf09d24ea, 0x4a80172e), TOBN(0x4e1f72c6, 0x76dc8eef), TOBN(0xe60caadc, 0x5e3d44ee)}, {TOBN(0x006ef8a6, 0x979b1d8f), TOBN(0x60908a1c, 0x97788d26), TOBN(0x6e08f95b, 0x266feec0), TOBN(0x618427c2, 0x22e8c94e)}}, {{TOBN(0x3d613339, 0x59145a65), TOBN(0xcd9bc368, 0xfa406337), TOBN(0x82d11be3, 0x2d8a52a0), TOBN(0xf6877b27, 0x97a1c590)}, {TOBN(0x837a819b, 0xf5cbdb25), TOBN(0x2a4fd1d8, 0xde090249), TOBN(0x622a7de7, 0x74990e5f), TOBN(0x840fa5a0, 0x7945511b)}}, {{TOBN(0x30b974be, 0x6558842d), TOBN(0x70df8c64, 0x17f3d0a6), TOBN(0x7c803520, 0x7542e46d), TOBN(0x7251fe7f, 0xe4ecc823)}, {TOBN(0xe59134cb, 0x5e9aac9a), TOBN(0x11bb0934, 0xf0045d71), TOBN(0x53e5d9b5, 0xdbcb1d4e), TOBN(0x8d97a905, 0x92defc91)}}, {{TOBN(0xfe289327, 0x7946d3f9), TOBN(0xe132bd24, 0x07472273), TOBN(0xeeeb510c, 0x1eb6ae86), TOBN(0x777708c5, 0xf0595067)}, {TOBN(0x18e2c8cd, 0x1297029e), TOBN(0x2c61095c, 0xbbf9305e), TOBN(0xe466c258, 0x6b85d6d9), TOBN(0x8ac06c36, 0xda1ea530)}}, {{TOBN(0xa365dc39, 0xa1304668), TOBN(0xe4a9c885, 0x07f89606), TOBN(0x65a4898f, 0xacc7228d), TOBN(0x3e2347ff, 0x84ca8303)}, {TOBN(0xa5f6fb77, 0xea7d23a3), TOBN(0x2fac257d, 0x672a71cd), TOBN(0x6908bef8, 0x7e6a44d3), TOBN(0x8ff87566, 0x891d3d7a)}}, {{TOBN(0xe58e90b3, 0x6b0cf82e), TOBN(0x6438d246, 0x2615b5e7), TOBN(0x07b1f8fc, 0x669c145a), TOBN(0xb0d8b2da, 0x36f1e1cb)}, {TOBN(0x54d5dadb, 0xd9184c4d), TOBN(0x3dbb18d5, 0xf93d9976), TOBN(0x0a3e0f56, 0xd1147d47), TOBN(0x2afa8c8d, 0xa0a48609)}}, {{TOBN(0x275353e8, 0xbc36742c), TOBN(0x898f427e, 0xeea0ed90), TOBN(0x26f4947e, 0x3e477b00), TOBN(0x8ad8848a, 0x308741e3)}, {TOBN(0x6c703c38, 0xd74a2a46), TOBN(0x5e3e05a9, 0x9ba17ba2), TOBN(0xc1fa6f66, 0x4ab9a9e4), TOBN(0x474a2d9a, 0x3841d6ec)}}, {{TOBN(0x871239ad, 0x653ae326), TOBN(0x14bcf72a, 0xa74cbb43), TOBN(0x8737650e, 0x20d4c083), TOBN(0x3df86536, 0x110ed4af)}, {TOBN(0xd2d86fe7, 0xb53ca555), TOBN(0x688cb00d, 0xabd5d538), TOBN(0xcf81bda3, 0x1ad38468), TOBN(0x7ccfe3cc, 0xf01167b6)}}, {{TOBN(0xcf4f47e0, 0x6c4c1fe6), TOBN(0x557e1f1a, 0x298bbb79), TOBN(0xf93b974f, 0x30d45a14), TOBN(0x174a1d2d, 0x0baf97c4)}, {TOBN(0x7a003b30, 0xc51fbf53), TOBN(0xd8940991, 0xee68b225), TOBN(0x5b0aa7b7, 0x1c0f4173), TOBN(0x975797c9, 0xa20a7153)}}, {{TOBN(0x26e08c07, 0xe3533d77), TOBN(0xd7222e6a, 0x2e341c99), TOBN(0x9d60ec3d, 0x8d2dc4ed), TOBN(0xbdfe0d8f, 0x7c476cf8)}, {TOBN(0x1fe59ab6, 0x1d056605), TOBN(0xa9ea9df6, 0x86a8551f), TOBN(0x8489941e, 0x47fb8d8c), TOBN(0xfeb874eb, 0x4a7f1b10)}}, {{TOBN(0xfe5fea86, 0x7ee0d98f), TOBN(0x201ad34b, 0xdbf61864), TOBN(0x45d8fe47, 0x37c031d4), TOBN(0xd5f49fae, 0x795f0822)}, {TOBN(0xdb0fb291, 0xc7f4a40c), TOBN(0x2e69d9c1, 0x730ddd92), TOBN(0x754e1054, 0x49d76987), TOBN(0x8a24911d, 0x7662db87)}}, {{TOBN(0x61fc1810, 0x60a71676), TOBN(0xe852d1a8, 0xf66a8ad1), TOBN(0x172bbd65, 0x6417231e), TOBN(0x0d6de7bd, 0x3babb11f)}, {TOBN(0x6fde6f88, 0xc8e347f8), TOBN(0x1c587547, 0x9bd99cc3), TOBN(0x78e54ed0, 0x34076950), TOBN(0x97f0f334, 0x796e83ba)}}, {{TOBN(0xe4dbe1ce, 0x4924867a), TOBN(0xbd5f51b0, 0x60b84917), TOBN(0x37530040, 0x3cb09a79), TOBN(0xdb3fe0f8, 0xff1743d8)}, 
{TOBN(0xed7894d8, 0x556fa9db), TOBN(0xfa262169, 0x23412fbf), TOBN(0x563be0db, 0xba7b9291), TOBN(0x6ca8b8c0, 0x0c9fb234)}}, {{TOBN(0xed406aa9, 0xbd763802), TOBN(0xc21486a0, 0x65303da1), TOBN(0x61ae291e, 0xc7e62ec4), TOBN(0x622a0492, 0xdf99333e)}, {TOBN(0x7fd80c9d, 0xbb7a8ee0), TOBN(0xdc2ed3bc, 0x6c01aedb), TOBN(0x35c35a12, 0x08be74ec), TOBN(0xd540cb1a, 0x469f671f)}}, {{TOBN(0xd16ced4e, 0xcf84f6c7), TOBN(0x8561fb9c, 0x2d090f43), TOBN(0x7e693d79, 0x6f239db4), TOBN(0xa736f928, 0x77bd0d94)}, {TOBN(0x07b4d929, 0x2c1950ee), TOBN(0xda177543, 0x56dc11b3), TOBN(0xa5dfbbaa, 0x7a6a878e), TOBN(0x1c70cb29, 0x4decb08a)}}, {{TOBN(0xfba28c8b, 0x6f0f7c50), TOBN(0xa8eba2b8, 0x854dcc6d), TOBN(0x5ff8e89a, 0x36b78642), TOBN(0x070c1c8e, 0xf6873adf)}, {TOBN(0xbbd3c371, 0x6484d2e4), TOBN(0xfb78318f, 0x0d414129), TOBN(0x2621a39c, 0x6ad93b0b), TOBN(0x979d74c2, 0xa9e917f7)}}, {{TOBN(0xfc195647, 0x61fb0428), TOBN(0x4d78954a, 0xbee624d4), TOBN(0xb94896e0, 0xb8ae86fd), TOBN(0x6667ac0c, 0xc91c8b13)}, {TOBN(0x9f180512, 0x43bcf832), TOBN(0xfbadf8b7, 0xa0010137), TOBN(0xc69b4089, 0xb3ba8aa7), TOBN(0xfac4bacd, 0xe687ce85)}}, {{TOBN(0x9164088d, 0x977eab40), TOBN(0x51f4c5b6, 0x2760b390), TOBN(0xd238238f, 0x340dd553), TOBN(0x358566c3, 0xdb1d31c9)}, {TOBN(0x3a5ad69e, 0x5068f5ff), TOBN(0xf31435fc, 0xdaff6b06), TOBN(0xae549a5b, 0xd6debff0), TOBN(0x59e5f0b7, 0x75e01331)}}, {{TOBN(0x5d492fb8, 0x98559acf), TOBN(0x96018c2e, 0x4db79b50), TOBN(0x55f4a48f, 0x609f66aa), TOBN(0x1943b3af, 0x4900a14f)}, {TOBN(0xc22496df, 0x15a40d39), TOBN(0xb2a44684, 0x4c20f7c5), TOBN(0x76a35afa, 0x3b98404c), TOBN(0xbec75725, 0xff5d1b77)}}, {{TOBN(0xb67aa163, 0xbea06444), TOBN(0x27e95bb2, 0xf724b6f2), TOBN(0x3c20e3e9, 0xd238c8ab), TOBN(0x1213754e, 0xddd6ae17)}, {TOBN(0x8c431020, 0x716e0f74), TOBN(0x6679c82e, 0xffc095c2), TOBN(0x2eb3adf4, 0xd0ac2932), TOBN(0x2cc970d3, 0x01bb7a76)}}, {{TOBN(0x70c71f2f, 0x740f0e66), TOBN(0x545c616b, 0x2b6b23cc), TOBN(0x4528cfcb, 0xb40a8bd7), TOBN(0xff839633, 0x2ab27722)}, {TOBN(0x049127d9, 0x025ac99a), TOBN(0xd314d4a0, 0x2b63e33b), TOBN(0xc8c310e7, 0x28d84519), TOBN(0x0fcb8983, 0xb3bc84ba)}}, {{TOBN(0x2cc52261, 0x38634818), TOBN(0x501814f4, 0xb44c2e0b), TOBN(0xf7e181aa, 0x54dfdba3), TOBN(0xcfd58ff0, 0xe759718c)}, {TOBN(0xf90cdb14, 0xd3b507a8), TOBN(0x57bd478e, 0xc50bdad8), TOBN(0x29c197e2, 0x50e5f9aa), TOBN(0x4db6eef8, 0xe40bc855)}}, {{TOBN(0x2cc8f21a, 0xd1fc0654), TOBN(0xc71cc963, 0x81269d73), TOBN(0xecfbb204, 0x077f49f9), TOBN(0xdde92571, 0xca56b793)}, {TOBN(0x9abed6a3, 0xf97ad8f7), TOBN(0xe6c19d3f, 0x924de3bd), TOBN(0x8dce92f4, 0xa140a800), TOBN(0x85f44d1e, 0x1337af07)}}, {{TOBN(0x5953c08b, 0x09d64c52), TOBN(0xa1b5e49f, 0xf5df9749), TOBN(0x336a8fb8, 0x52735f7d), TOBN(0xb332b6db, 0x9add676b)}, {TOBN(0x558b88a0, 0xb4511aa4), TOBN(0x09788752, 0xdbd5cc55), TOBN(0x16b43b9c, 0xd8cd52bd), TOBN(0x7f0bc5a0, 0xc2a2696b)}}, {{TOBN(0x146e12d4, 0xc11f61ef), TOBN(0x9ce10754, 0x3a83e79e), TOBN(0x08ec73d9, 0x6cbfca15), TOBN(0x09ff29ad, 0x5b49653f)}, {TOBN(0xe31b72bd, 0xe7da946e), TOBN(0xebf9eb3b, 0xee80a4f2), TOBN(0xd1aabd08, 0x17598ce4), TOBN(0x18b5fef4, 0x53f37e80)}}, {{TOBN(0xd5d5cdd3, 0x5958cd79), TOBN(0x3580a1b5, 0x1d373114), TOBN(0xa36e4c91, 0xfa935726), TOBN(0xa38c534d, 0xef20d760)}, {TOBN(0x7088e40a, 0x2ff5845b), TOBN(0xe5bb40bd, 0xbd78177f), TOBN(0x4f06a7a8, 0x857f9920), TOBN(0xe3cc3e50, 0xe968f05d)}}, {{TOBN(0x1d68b7fe, 0xe5682d26), TOBN(0x5206f76f, 0xaec7f87c), TOBN(0x41110530, 0x041951ab), TOBN(0x58ec52c1, 0xd4b5a71a)}, {TOBN(0xf3488f99, 0x0f75cf9a), TOBN(0xf411951f, 0xba82d0d5), TOBN(0x27ee75be, 0x618895ab), TOBN(0xeae060d4, 
0x6d8aab14)}}, {{TOBN(0x9ae1df73, 0x7fb54dc2), TOBN(0x1f3e391b, 0x25963649), TOBN(0x242ec32a, 0xfe055081), TOBN(0x5bd450ef, 0x8491c9bd)}, {TOBN(0x367efc67, 0x981eb389), TOBN(0xed7e1928, 0x3a0550d5), TOBN(0x362e776b, 0xab3ce75c), TOBN(0xe890e308, 0x1f24c523)}}, {{TOBN(0xb961b682, 0xfeccef76), TOBN(0x8b8e11f5, 0x8bba6d92), TOBN(0x8f2ccc4c, 0x2b2375c4), TOBN(0x0d7f7a52, 0xe2f86cfa)}, {TOBN(0xfd94d30a, 0x9efe5633), TOBN(0x2d8d246b, 0x5451f934), TOBN(0x2234c6e3, 0x244e6a00), TOBN(0xde2b5b0d, 0xddec8c50)}}, {{TOBN(0x2ce53c5a, 0xbf776f5b), TOBN(0x6f724071, 0x60357b05), TOBN(0xb2593717, 0x71bf3f7a), TOBN(0x87d2501c, 0x440c4a9f)}, {TOBN(0x440552e1, 0x87b05340), TOBN(0xb7bf7cc8, 0x21624c32), TOBN(0x4155a6ce, 0x22facddb), TOBN(0x5a4228cb, 0x889837ef)}}, {{TOBN(0xef87d6d6, 0xfd4fd671), TOBN(0xa233687e, 0xc2daa10e), TOBN(0x75622244, 0x03c0eb96), TOBN(0x7632d184, 0x8bf19be6)}, {TOBN(0x05d0f8e9, 0x40735ff4), TOBN(0x3a3e6e13, 0xc00931f1), TOBN(0x31ccde6a, 0xdafe3f18), TOBN(0xf381366a, 0xcfe51207)}}, {{TOBN(0x24c222a9, 0x60167d92), TOBN(0x62f9d6f8, 0x7529f18c), TOBN(0x412397c0, 0x0353b114), TOBN(0x334d89dc, 0xef808043)}, {TOBN(0xd9ec63ba, 0x2a4383ce), TOBN(0xcec8e937, 0x5cf92ba0), TOBN(0xfb8b4288, 0xc8be74c0), TOBN(0x67d6912f, 0x105d4391)}}, {{TOBN(0x7b996c46, 0x1b913149), TOBN(0x36aae2ef, 0x3a4e02da), TOBN(0xb68aa003, 0x972de594), TOBN(0x284ec70d, 0x4ec6d545)}, {TOBN(0xf3d2b2d0, 0x61391d54), TOBN(0x69c5d5d6, 0xfe114e92), TOBN(0xbe0f00b5, 0xb4482dff), TOBN(0xe1596fa5, 0xf5bf33c5)}}, {{TOBN(0x10595b56, 0x96a71cba), TOBN(0x944938b2, 0xfdcadeb7), TOBN(0xa282da4c, 0xfccd8471), TOBN(0x98ec05f3, 0x0d37bfe1)}, {TOBN(0xe171ce1b, 0x0698304a), TOBN(0x2d691444, 0x21bdf79b), TOBN(0xd0cd3b74, 0x1b21dec1), TOBN(0x712ecd8b, 0x16a15f71)}}, {{TOBN(0x8d4c00a7, 0x00fd56e1), TOBN(0x02ec9692, 0xf9527c18), TOBN(0x21c44937, 0x4a3e42e1), TOBN(0x9176fbab, 0x1392ae0a)}, {TOBN(0x8726f1ba, 0x44b7b618), TOBN(0xb4d7aae9, 0xf1de491c), TOBN(0xf91df7b9, 0x07b582c0), TOBN(0x7e116c30, 0xef60aa3a)}}, {{TOBN(0x99270f81, 0x466265d7), TOBN(0xb15b6fe2, 0x4df7adf0), TOBN(0xfe33b2d3, 0xf9738f7f), TOBN(0x48553ab9, 0xd6d70f95)}, {TOBN(0x2cc72ac8, 0xc21e94db), TOBN(0x795ac38d, 0xbdc0bbee), TOBN(0x0a1be449, 0x2e40478f), TOBN(0x81bd3394, 0x052bde55)}}, {{TOBN(0x63c8dbe9, 0x56b3c4f2), TOBN(0x017a99cf, 0x904177cc), TOBN(0x947bbddb, 0x4d010fc1), TOBN(0xacf9b00b, 0xbb2c9b21)}, {TOBN(0x2970bc8d, 0x47173611), TOBN(0x1a4cbe08, 0xac7d756f), TOBN(0x06d9f4aa, 0x67d541a2), TOBN(0xa3e8b689, 0x59c2cf44)}}, {{TOBN(0xaad066da, 0x4d88f1dd), TOBN(0xc604f165, 0x7ad35dea), TOBN(0x7edc0720, 0x4478ca67), TOBN(0xa10dfae0, 0xba02ce06)}, {TOBN(0xeceb1c76, 0xaf36f4e4), TOBN(0x994b2292, 0xaf3f8f48), TOBN(0xbf9ed77b, 0x77c8a68c), TOBN(0x74f544ea, 0x51744c9d)}}, {{TOBN(0x82d05bb9, 0x8113a757), TOBN(0x4ef2d2b4, 0x8a9885e4), TOBN(0x1e332be5, 0x1aa7865f), TOBN(0x22b76b18, 0x290d1a52)}, {TOBN(0x308a2310, 0x44351683), TOBN(0x9d861896, 0xa3f22840), TOBN(0x5959ddcd, 0x841ed947), TOBN(0x0def0c94, 0x154b73bf)}}, {{TOBN(0xf0105417, 0x4c7c15e0), TOBN(0x539bfb02, 0x3a277c32), TOBN(0xe699268e, 0xf9dccf5f), TOBN(0x9f5796a5, 0x0247a3bd)}, {TOBN(0x8b839de8, 0x4f157269), TOBN(0xc825c1e5, 0x7a30196b), TOBN(0x6ef0aabc, 0xdc8a5a91), TOBN(0xf4a8ce6c, 0x498b7fe6)}}, {{TOBN(0x1cce35a7, 0x70cbac78), TOBN(0x83488e9b, 0xf6b23958), TOBN(0x0341a070, 0xd76cb011), TOBN(0xda6c9d06, 0xae1b2658)}, {TOBN(0xb701fb30, 0xdd648c52), TOBN(0x994ca02c, 0x52fb9fd1), TOBN(0x06933117, 0x6f563086), TOBN(0x3d2b8100, 0x17856bab)}}, {{TOBN(0xe89f48c8, 0x5963a46e), TOBN(0x658ab875, 0xa99e61c7), TOBN(0x6e296f87, 0x4b8517b4), 
TOBN(0x36c4fcdc, 0xfc1bc656)}, {TOBN(0xde5227a1, 0xa3906def), TOBN(0x9fe95f57, 0x62418945), TOBN(0x20c91e81, 0xfdd96cde), TOBN(0x5adbe47e, 0xda4480de)}}, {{TOBN(0xa009370f, 0x396de2b6), TOBN(0x98583d4b, 0xf0ecc7bd), TOBN(0xf44f6b57, 0xe51d0672), TOBN(0x03d6b078, 0x556b1984)}, {TOBN(0x27dbdd93, 0xb0b64912), TOBN(0x9b3a3434, 0x15687b09), TOBN(0x0dba6461, 0x51ec20a9), TOBN(0xec93db7f, 0xff28187c)}}, {{TOBN(0x00ff8c24, 0x66e48bdd), TOBN(0x2514f2f9, 0x11ccd78e), TOBN(0xeba11f4f, 0xe1250603), TOBN(0x8a22cd41, 0x243fa156)}, {TOBN(0xa4e58df4, 0xb283e4c6), TOBN(0x78c29859, 0x8b39783f), TOBN(0x5235aee2, 0xa5259809), TOBN(0xc16284b5, 0x0e0227dd)}}, {{TOBN(0xa5f57916, 0x1338830d), TOBN(0x6d4b8a6b, 0xd2123fca), TOBN(0x236ea68a, 0xf9c546f8), TOBN(0xc1d36873, 0xfa608d36)}, {TOBN(0xcd76e495, 0x8d436d13), TOBN(0xd4d9c221, 0x8fb080af), TOBN(0x665c1728, 0xe8ad3fb5), TOBN(0xcf1ebe4d, 0xb3d572e0)}}, {{TOBN(0xa7a8746a, 0x584c5e20), TOBN(0x267e4ea1, 0xb9dc7035), TOBN(0x593a15cf, 0xb9548c9b), TOBN(0x5e6e2135, 0x4bd012f3)}, {TOBN(0xdf31cc6a, 0x8c8f936e), TOBN(0x8af84d04, 0xb5c241dc), TOBN(0x63990a6f, 0x345efb86), TOBN(0x6fef4e61, 0xb9b962cb)}}}, {{{TOBN(0xf6368f09, 0x25722608), TOBN(0x131260db, 0x131cf5c6), TOBN(0x40eb353b, 0xfab4f7ac), TOBN(0x85c78880, 0x37eee829)}, {TOBN(0x4c1581ff, 0xc3bdf24e), TOBN(0x5bff75cb, 0xf5c3c5a8), TOBN(0x35e8c83f, 0xa14e6f40), TOBN(0xb81d1c0f, 0x0295e0ca)}}, {{TOBN(0xfcde7cc8, 0xf43a730f), TOBN(0xe89b6f3c, 0x33ab590e), TOBN(0xc823f529, 0xad03240b), TOBN(0x82b79afe, 0x98bea5db)}, {TOBN(0x568f2856, 0x962fe5de), TOBN(0x0c590adb, 0x60c591f3), TOBN(0x1fc74a14, 0x4a28a858), TOBN(0x3b662498, 0xb3203f4c)}}, {{TOBN(0x91e3cf0d, 0x6c39765a), TOBN(0xa2db3acd, 0xac3cca0b), TOBN(0x288f2f08, 0xcb953b50), TOBN(0x2414582c, 0xcf43cf1a)}, {TOBN(0x8dec8bbc, 0x60eee9a8), TOBN(0x54c79f02, 0x729aa042), TOBN(0xd81cd5ec, 0x6532f5d5), TOBN(0xa672303a, 0xcf82e15f)}}, {{TOBN(0x376aafa8, 0x719c0563), TOBN(0xcd8ad2dc, 0xbc5fc79f), TOBN(0x303fdb9f, 0xcb750cd3), TOBN(0x14ff052f, 0x4418b08e)}, {TOBN(0xf75084cf, 0x3e2d6520), TOBN(0x7ebdf0f8, 0x144ed509), TOBN(0xf43bf0f2, 0xd3f25b98), TOBN(0x86ad71cf, 0xa354d837)}}, {{TOBN(0xb827fe92, 0x26f43572), TOBN(0xdfd3ab5b, 0x5d824758), TOBN(0x315dd23a, 0x539094c1), TOBN(0x85c0e37a, 0x66623d68)}, {TOBN(0x575c7972, 0x7be19ae0), TOBN(0x616a3396, 0xdf0d36b5), TOBN(0xa1ebb3c8, 0x26b1ff7e), TOBN(0x635b9485, 0x140ad453)}}, {{TOBN(0x92bf3cda, 0xda430c0b), TOBN(0x4702850e, 0x3a96dac6), TOBN(0xc91cf0a5, 0x15ac326a), TOBN(0x95de4f49, 0xab8c25e4)}, {TOBN(0xb01bad09, 0xe265c17c), TOBN(0x24e45464, 0x087b3881), TOBN(0xd43e583c, 0xe1fac5ca), TOBN(0xe17cb318, 0x6ead97a6)}}, {{TOBN(0x6cc39243, 0x74dcec46), TOBN(0x33cfc02d, 0x54c2b73f), TOBN(0x82917844, 0xf26cd99c), TOBN(0x8819dd95, 0xd1773f89)}, {TOBN(0x09572aa6, 0x0871f427), TOBN(0x8e0cf365, 0xf6f01c34), TOBN(0x7fa52988, 0xbff1f5af), TOBN(0x4eb357ea, 0xe75e8e50)}}, {{TOBN(0xd9d0c8c4, 0x868af75d), TOBN(0xd7325cff, 0x45c8c7ea), TOBN(0xab471996, 0xcc81ecb0), TOBN(0xff5d55f3, 0x611824ed)}, {TOBN(0xbe314541, 0x1977a0ee), TOBN(0x5085c4c5, 0x722038c6), TOBN(0x2d5335bf, 0xf94bb495), TOBN(0x894ad8a6, 0xc8e2a082)}}, {{TOBN(0x5c3e2341, 0xada35438), TOBN(0xf4a9fc89, 0x049b8c4e), TOBN(0xbeeb355a, 0x9f17cf34), TOBN(0x3f311e0e, 0x6c91fe10)}, {TOBN(0xc2d20038, 0x92ab9891), TOBN(0x257bdcc1, 0x3e8ce9a9), TOBN(0x1b2d9789, 0x88c53bee), TOBN(0x927ce89a, 0xcdba143a)}}, {{TOBN(0xb0a32cca, 0x523db280), TOBN(0x5c889f8a, 0x50d43783), TOBN(0x503e04b3, 0x4897d16f), TOBN(0x8cdb6e78, 0x08f5f2e8)}, {TOBN(0x6ab91cf0, 0x179c8e74), TOBN(0xd8874e52, 0x48211d60), TOBN(0xf948d4d5, 
0xea851200), TOBN(0x4076d41e, 0xe6f9840a)}}, {{TOBN(0xc20e263c, 0x47b517ea), TOBN(0x79a448fd, 0x30685e5e), TOBN(0xe55f6f78, 0xf90631a0), TOBN(0x88a790b1, 0xa79e6346)}, {TOBN(0x62160c7d, 0x80969fe8), TOBN(0x54f92fd4, 0x41491bb9), TOBN(0xa6645c23, 0x5c957526), TOBN(0xf44cc5ae, 0xbea3ce7b)}}, {{TOBN(0xf7628327, 0x8b1e68b7), TOBN(0xc731ad7a, 0x303f29d3), TOBN(0xfe5a9ca9, 0x57d03ecb), TOBN(0x96c0d50c, 0x41bc97a7)}, {TOBN(0xc4669fe7, 0x9b4f7f24), TOBN(0xfdd781d8, 0x3d9967ef), TOBN(0x7892c7c3, 0x5d2c208d), TOBN(0x8bf64f7c, 0xae545cb3)}}, {{TOBN(0xc01f862c, 0x467be912), TOBN(0xf4c85ee9, 0xc73d30cc), TOBN(0x1fa6f4be, 0x6ab83ec7), TOBN(0xa07a3c1c, 0x4e3e3cf9)}, {TOBN(0x87f8ef45, 0x0c00beb3), TOBN(0x30e2c2b3, 0x000d4c3e), TOBN(0x1aa00b94, 0xfe08bf5b), TOBN(0x32c133aa, 0x9224ef52)}}, {{TOBN(0x38df16bb, 0x32e5685d), TOBN(0x68a9e069, 0x58e6f544), TOBN(0x495aaff7, 0xcdc5ebc6), TOBN(0xf894a645, 0x378b135f)}, {TOBN(0xf316350a, 0x09e27ecf), TOBN(0xeced201e, 0x58f7179d), TOBN(0x2eec273c, 0xe97861ba), TOBN(0x47ec2cae, 0xd693be2e)}}, {{TOBN(0xfa4c97c4, 0xf68367ce), TOBN(0xe4f47d0b, 0xbe5a5755), TOBN(0x17de815d, 0xb298a979), TOBN(0xd7eca659, 0xc177dc7d)}, {TOBN(0x20fdbb71, 0x49ded0a3), TOBN(0x4cb2aad4, 0xfb34d3c5), TOBN(0x2cf31d28, 0x60858a33), TOBN(0x3b6873ef, 0xa24aa40f)}}, {{TOBN(0x540234b2, 0x2c11bb37), TOBN(0x2d0366dd, 0xed4c74a3), TOBN(0xf9a968da, 0xeec5f25d), TOBN(0x36601068, 0x67b63142)}, {TOBN(0x07cd6d2c, 0x68d7b6d4), TOBN(0xa8f74f09, 0x0c842942), TOBN(0xe2751404, 0x7768b1ee), TOBN(0x4b5f7e89, 0xfe62aee4)}}, {{TOBN(0xc6a77177, 0x89070d26), TOBN(0xa1f28e4e, 0xdd1c8bc7), TOBN(0xea5f4f06, 0x469e1f17), TOBN(0x78fc242a, 0xfbdb78e0)}, {TOBN(0xc9c7c592, 0x8b0588f1), TOBN(0xb6b7a0fd, 0x1535921e), TOBN(0xcc5bdb91, 0xbde5ae35), TOBN(0xb42c485e, 0x12ff1864)}}, {{TOBN(0xa1113e13, 0xdbab98aa), TOBN(0xde9d469b, 0xa17b1024), TOBN(0x23f48b37, 0xc0462d3a), TOBN(0x3752e537, 0x7c5c078d)}, {TOBN(0xe3a86add, 0x15544eb9), TOBN(0xf013aea7, 0x80fba279), TOBN(0x8b5bb76c, 0xf22001b5), TOBN(0xe617ba14, 0xf02891ab)}}, {{TOBN(0xd39182a6, 0x936219d3), TOBN(0x5ce1f194, 0xae51cb19), TOBN(0xc78f8598, 0xbf07a74c), TOBN(0x6d7158f2, 0x22cbf1bc)}, {TOBN(0x3b846b21, 0xe300ce18), TOBN(0x35fba630, 0x2d11275d), TOBN(0x5fe25c36, 0xa0239b9b), TOBN(0xd8beb35d, 0xdf05d940)}}, {{TOBN(0x4db02bb0, 0x1f7e320d), TOBN(0x0641c364, 0x6da320ea), TOBN(0x6d95fa5d, 0x821389a3), TOBN(0x92699748, 0x8fcd8e3d)}, {TOBN(0x316fef17, 0xceb6c143), TOBN(0x67fcb841, 0xd933762b), TOBN(0xbb837e35, 0x118b17f8), TOBN(0x4b92552f, 0x9fd24821)}}, {{TOBN(0xae6bc70e, 0x46aca793), TOBN(0x1cf0b0e4, 0xe579311b), TOBN(0x8dc631be, 0x5802f716), TOBN(0x099bdc6f, 0xbddbee4d)}, {TOBN(0xcc352bb2, 0x0caf8b05), TOBN(0xf74d505a, 0x72d63df2), TOBN(0xb9876d4b, 0x91c4f408), TOBN(0x1ce18473, 0x9e229b2d)}}, {{TOBN(0x49507597, 0x83abdb4a), TOBN(0x850fbcb6, 0xdee84b18), TOBN(0x6325236e, 0x609e67dc), TOBN(0x04d831d9, 0x9336c6d8)}, {TOBN(0x8deaae3b, 0xfa12d45d), TOBN(0xe425f8ce, 0x4746e246), TOBN(0x8004c175, 0x24f5f31e), TOBN(0xaca16d8f, 0xad62c3b7)}}, {{TOBN(0x0dc15a6a, 0x9152f934), TOBN(0xf1235e5d, 0xed0e12c1), TOBN(0xc33c06ec, 0xda477dac), TOBN(0x76be8732, 0xb2ea0006)}, {TOBN(0xcf3f7831, 0x0c0cd313), TOBN(0x3c524553, 0xa614260d), TOBN(0x31a756f8, 0xcab22d15), TOBN(0x03ee10d1, 0x77827a20)}}, {{TOBN(0xd1e059b2, 0x1994ef20), TOBN(0x2a653b69, 0x638ae318), TOBN(0x70d5eb58, 0x2f699010), TOBN(0x279739f7, 0x09f5f84a)}, {TOBN(0x5da4663c, 0x8b799336), TOBN(0xfdfdf14d, 0x203c37eb), TOBN(0x32d8a9dc, 0xa1dbfb2d), TOBN(0xab40cff0, 0x77d48f9b)}}, {{TOBN(0xc018b383, 0xd20b42d5), TOBN(0xf9a810ef, 0x9f78845f), 
TOBN(0x40af3753, 0xbdba9df0), TOBN(0xb90bdcfc, 0x131dfdf9)}, {TOBN(0x18720591, 0xf01ab782), TOBN(0xc823f211, 0x6af12a88), TOBN(0xa51b80f3, 0x0dc14401), TOBN(0xde248f77, 0xfb2dfbe3)}}, {{TOBN(0xef5a44e5, 0x0cafe751), TOBN(0x73997c9c, 0xd4dcd221), TOBN(0x32fd86d1, 0xde854024), TOBN(0xd5b53adc, 0xa09b84bb)}, {TOBN(0x008d7a11, 0xdcedd8d1), TOBN(0x406bd1c8, 0x74b32c84), TOBN(0x5d4472ff, 0x05dde8b1), TOBN(0x2e25f2cd, 0xfce2b32f)}}, {{TOBN(0xbec0dd5e, 0x29dfc254), TOBN(0x4455fcf6, 0x2b98b267), TOBN(0x0b4d43a5, 0xc72df2ad), TOBN(0xea70e6be, 0x48a75397)}, {TOBN(0x2aad6169, 0x5820f3bf), TOBN(0xf410d2dd, 0x9e37f68f), TOBN(0x70fb7dba, 0x7be5ac83), TOBN(0x636bb645, 0x36ec3eec)}}, {{TOBN(0x27104ea3, 0x9754e21c), TOBN(0xbc87a3e6, 0x8d63c373), TOBN(0x483351d7, 0x4109db9a), TOBN(0x0fa724e3, 0x60134da7)}, {TOBN(0x9ff44c29, 0xb0720b16), TOBN(0x2dd0cf13, 0x06aceead), TOBN(0x5942758c, 0xe26929a6), TOBN(0x96c5db92, 0xb766a92b)}}, {{TOBN(0xcec7d4c0, 0x5f18395e), TOBN(0xd3f22744, 0x1f80d032), TOBN(0x7a68b37a, 0xcb86075b), TOBN(0x074764dd, 0xafef92db)}, {TOBN(0xded1e950, 0x7bc7f389), TOBN(0xc580c850, 0xb9756460), TOBN(0xaeeec2a4, 0x7da48157), TOBN(0x3f0b4e7f, 0x82c587b3)}}, {{TOBN(0x231c6de8, 0xa9f19c53), TOBN(0x5717bd73, 0x6974e34e), TOBN(0xd9e1d216, 0xf1508fa9), TOBN(0x9f112361, 0xdadaa124)}, {TOBN(0x80145e31, 0x823b7348), TOBN(0x4dd8f0d5, 0xac634069), TOBN(0xe3d82fc7, 0x2297c258), TOBN(0x276fcfee, 0x9cee7431)}}, {{TOBN(0x8eb61b5e, 0x2bc0aea9), TOBN(0x4f668fd5, 0xde329431), TOBN(0x03a32ab1, 0x38e4b87e), TOBN(0xe1374517, 0x73d0ef0b)}, {TOBN(0x1a46f7e6, 0x853ac983), TOBN(0xc3bdf42e, 0x68e78a57), TOBN(0xacf20785, 0x2ea96dd1), TOBN(0xa10649b9, 0xf1638460)}}, {{TOBN(0xf2369f0b, 0x879fbbed), TOBN(0x0ff0ae86, 0xda9d1869), TOBN(0x5251d759, 0x56766f45), TOBN(0x4984d8c0, 0x2be8d0fc)}, {TOBN(0x7ecc95a6, 0xd21008f0), TOBN(0x29bd54a0, 0x3a1a1c49), TOBN(0xab9828c5, 0xd26c50f3), TOBN(0x32c0087c, 0x51d0d251)}}, {{TOBN(0x9bac3ce6, 0x0c1cdb26), TOBN(0xcd94d947, 0x557ca205), TOBN(0x1b1bd598, 0x9db1fdcd), TOBN(0x0eda0108, 0xa3d8b149)}, {TOBN(0x95066610, 0x56152fcc), TOBN(0xc2f037e6, 0xe7192b33), TOBN(0xdeffb41a, 0xc92e05a4), TOBN(0x1105f6c2, 0xc2f6c62e)}}, {{TOBN(0x68e73500, 0x8733913c), TOBN(0xcce86163, 0x3f3adc40), TOBN(0xf407a942, 0x38a278e9), TOBN(0xd13c1b9d, 0x2ab21292)}, {TOBN(0x93ed7ec7, 0x1c74cf5c), TOBN(0x8887dc48, 0xf1a4c1b4), TOBN(0x3830ff30, 0x4b3a11f1), TOBN(0x358c5a3c, 0x58937cb6)}}, {{TOBN(0x027dc404, 0x89022829), TOBN(0x40e93977, 0x3b798f79), TOBN(0x90ad3337, 0x38be6ead), TOBN(0x9c23f6bc, 0xf34c0a5d)}, {TOBN(0xd1711a35, 0xfbffd8bb), TOBN(0x60fcfb49, 0x1949d3dd), TOBN(0x09c8ef4b, 0x7825d93a), TOBN(0x24233cff, 0xa0a8c968)}}, {{TOBN(0x67ade46c, 0xe6d982af), TOBN(0xebb6bf3e, 0xe7544d7c), TOBN(0xd6b9ba76, 0x3d8bd087), TOBN(0x46fe382d, 0x4dc61280)}, {TOBN(0xbd39a7e8, 0xb5bdbd75), TOBN(0xab381331, 0xb8f228fe), TOBN(0x0709a77c, 0xce1c4300), TOBN(0x6a247e56, 0xf337ceac)}}, {{TOBN(0x8f34f21b, 0x636288be), TOBN(0x9dfdca74, 0xc8a7c305), TOBN(0x6decfd1b, 0xea919e04), TOBN(0xcdf2688d, 0x8e1991f8)}, {TOBN(0xe607df44, 0xd0f8a67e), TOBN(0xd985df4b, 0x0b58d010), TOBN(0x57f834c5, 0x0c24f8f4), TOBN(0xe976ef56, 0xa0bf01ae)}}, {{TOBN(0x536395ac, 0xa1c32373), TOBN(0x351027aa, 0x734c0a13), TOBN(0xd2f1b5d6, 0x5e6bd5bc), TOBN(0x2b539e24, 0x223debed)}, {TOBN(0xd4994cec, 0x0eaa1d71), TOBN(0x2a83381d, 0x661dcf65), TOBN(0x5f1aed2f, 0x7b54c740), TOBN(0x0bea3fa5, 0xd6dda5ee)}}, {{TOBN(0x9d4fb684, 0x36cc6134), TOBN(0x8eb9bbf3, 0xc0a443dd), TOBN(0xfc500e2e, 0x383b7d2a), TOBN(0x7aad621c, 0x5b775257)}, {TOBN(0x69284d74, 0x0a8f7cc0), TOBN(0xe820c2ce, 
0x07562d65), TOBN(0xbf9531b9, 0x499758ee), TOBN(0x73e95ca5, 0x6ee0cc2d)}}, {{TOBN(0xf61790ab, 0xfbaf50a5), TOBN(0xdf55e76b, 0x684e0750), TOBN(0xec516da7, 0xf176b005), TOBN(0x575553bb, 0x7a2dddc7)}, {TOBN(0x37c87ca3, 0x553afa73), TOBN(0x315f3ffc, 0x4d55c251), TOBN(0xe846442a, 0xaf3e5d35), TOBN(0x61b91149, 0x6495ff28)}}, {{TOBN(0x23cc95d3, 0xfa326dc3), TOBN(0x1df4da1f, 0x18fc2cea), TOBN(0x24bf9adc, 0xd0a37d59), TOBN(0xb6710053, 0x320d6e1e)}, {TOBN(0x96f9667e, 0x618344d1), TOBN(0xcc7ce042, 0xa06445af), TOBN(0xa02d8514, 0xd68dbc3a), TOBN(0x4ea109e4, 0x280b5a5b)}}, {{TOBN(0x5741a7ac, 0xb40961bf), TOBN(0x4ada5937, 0x6aa56bfa), TOBN(0x7feb9145, 0x02b765d1), TOBN(0x561e97be, 0xe6ad1582)}, {TOBN(0xbbc4a5b6, 0xda3982f5), TOBN(0x0c2659ed, 0xb546f468), TOBN(0xb8e7e6aa, 0x59612d20), TOBN(0xd83dfe20, 0xac19e8e0)}}, {{TOBN(0x8530c45f, 0xb835398c), TOBN(0x6106a8bf, 0xb38a41c2), TOBN(0x21e8f9a6, 0x35f5dcdb), TOBN(0x39707137, 0xcae498ed)}, {TOBN(0x70c23834, 0xd8249f00), TOBN(0x9f14b58f, 0xab2537a0), TOBN(0xd043c365, 0x5f61c0c2), TOBN(0xdc5926d6, 0x09a194a7)}}, {{TOBN(0xddec0339, 0x8e77738a), TOBN(0xd07a63ef, 0xfba46426), TOBN(0x2e58e79c, 0xee7f6e86), TOBN(0xe59b0459, 0xff32d241)}, {TOBN(0xc5ec84e5, 0x20fa0338), TOBN(0x97939ac8, 0xeaff5ace), TOBN(0x0310a4e3, 0xb4a38313), TOBN(0x9115fba2, 0x8f9d9885)}}, {{TOBN(0x8dd710c2, 0x5fadf8c3), TOBN(0x66be38a2, 0xce19c0e2), TOBN(0xd42a279c, 0x4cfe5022), TOBN(0x597bb530, 0x0e24e1b8)}, {TOBN(0x3cde86b7, 0xc153ca7f), TOBN(0xa8d30fb3, 0x707d63bd), TOBN(0xac905f92, 0xbd60d21e), TOBN(0x98e7ffb6, 0x7b9a54ab)}}, {{TOBN(0xd7147df8, 0xe9726a30), TOBN(0xb5e216ff, 0xafce3533), TOBN(0xb550b799, 0x2ff1ec40), TOBN(0x6b613b87, 0xa1e953fd)}, {TOBN(0x87b88dba, 0x792d5610), TOBN(0x2ee1270a, 0xa190fbe1), TOBN(0x02f4e2dc, 0x2ef581da), TOBN(0x016530e4, 0xeff82a95)}}, {{TOBN(0xcbb93dfd, 0x8fd6ee89), TOBN(0x16d3d986, 0x46848fff), TOBN(0x600eff24, 0x1da47adf), TOBN(0x1b9754a0, 0x0ad47a71)}, {TOBN(0x8f9266df, 0x70c33b98), TOBN(0xaadc87ae, 0xdf34186e), TOBN(0x0d2ce8e1, 0x4ad24132), TOBN(0x8a47cbfc, 0x19946eba)}}, {{TOBN(0x47feeb66, 0x62b5f3af), TOBN(0xcefab561, 0x0abb3734), TOBN(0x449de60e, 0x19f35cb1), TOBN(0x39f8db14, 0x157f0eb9)}, {TOBN(0xffaecc5b, 0x3c61bfd6), TOBN(0xa5a4d41d, 0x41216703), TOBN(0x7f8fabed, 0x224e1cc2), TOBN(0x0d5a8186, 0x871ad953)}}, {{TOBN(0xf10774f7, 0xd22da9a9), TOBN(0x45b8a678, 0xcc8a9b0d), TOBN(0xd9c2e722, 0xbdc32cff), TOBN(0xbf71b5f5, 0x337202a5)}, {TOBN(0x95c57f2f, 0x69fc4db9), TOBN(0xb6dad34c, 0x765d01e1), TOBN(0x7e0bd13f, 0xcb904635), TOBN(0x61751253, 0x763a588c)}}, {{TOBN(0xd85c2997, 0x81af2c2d), TOBN(0xc0f7d9c4, 0x81b9d7da), TOBN(0x838a34ae, 0x08533e8d), TOBN(0x15c4cb08, 0x311d8311)}, {TOBN(0x97f83285, 0x8e121e14), TOBN(0xeea7dc1e, 0x85000a5f), TOBN(0x0c6059b6, 0x5d256274), TOBN(0xec9beace, 0xb95075c0)}}, {{TOBN(0x173daad7, 0x1df97828), TOBN(0xbf851cb5, 0xa8937877), TOBN(0xb083c594, 0x01646f3c), TOBN(0x3bad30cf, 0x50c6d352)}, {TOBN(0xfeb2b202, 0x496bbcea), TOBN(0x3cf9fd4f, 0x18a1e8ba), TOBN(0xd26de7ff, 0x1c066029), TOBN(0x39c81e9e, 0x4e9ed4f8)}}, {{TOBN(0xd8be0cb9, 0x7b390d35), TOBN(0x01df2bbd, 0x964aab27), TOBN(0x3e8c1a65, 0xc3ef64f8), TOBN(0x567291d1, 0x716ed1dd)}, {TOBN(0x95499c6c, 0x5f5406d3), TOBN(0x71fdda39, 0x5ba8e23f), TOBN(0xcfeb320e, 0xd5096ece), TOBN(0xbe7ba92b, 0xca66dd16)}}, {{TOBN(0x4608d36b, 0xc6fb5a7d), TOBN(0xe3eea15a, 0x6d2dd0e0), TOBN(0x75b0a3eb, 0x8f97a36a), TOBN(0xf59814cc, 0x1c83de1e)}, {TOBN(0x56c9c5b0, 0x1c33c23f), TOBN(0xa96c1da4, 0x6faa4136), TOBN(0x46bf2074, 0xde316551), TOBN(0x3b866e7b, 0x1f756c8f)}}, {{TOBN(0x727727d8, 0x1495ed6b), 
TOBN(0xb2394243, 0xb682dce7), TOBN(0x8ab8454e, 0x758610f3), TOBN(0xc243ce84, 0x857d72a4)}, {TOBN(0x7b320d71, 0xdbbf370f), TOBN(0xff9afa37, 0x78e0f7ca), TOBN(0x0119d1e0, 0xea7b523f), TOBN(0xb997f8cb, 0x058c7d42)}}, {{TOBN(0x285bcd2a, 0x37bbb184), TOBN(0x51dcec49, 0xa45d1fa6), TOBN(0x6ade3b64, 0xe29634cb), TOBN(0x080c94a7, 0x26b86ef1)}, {TOBN(0xba583db1, 0x2283fbe3), TOBN(0x902bddc8, 0x5a9315ed), TOBN(0x07c1ccb3, 0x86964bec), TOBN(0x78f4eacf, 0xb6258301)}}, {{TOBN(0x4bdf3a49, 0x56f90823), TOBN(0xba0f5080, 0x741d777b), TOBN(0x091d71c3, 0xf38bf760), TOBN(0x9633d50f, 0x9b625b02)}, {TOBN(0x03ecb743, 0xb8c9de61), TOBN(0xb4751254, 0x5de74720), TOBN(0x9f9defc9, 0x74ce1cb2), TOBN(0x774a4f6a, 0x00bd32ef)}}, {{TOBN(0xaca385f7, 0x73848f22), TOBN(0x53dad716, 0xf3f8558e), TOBN(0xab7b34b0, 0x93c471f9), TOBN(0xf530e069, 0x19644bc7)}, {TOBN(0x3d9fb1ff, 0xdd59d31a), TOBN(0x4382e0df, 0x08daa795), TOBN(0x165c6f4b, 0xd5cc88d7), TOBN(0xeaa392d5, 0x4a18c900)}}, {{TOBN(0x94203c67, 0x648024ee), TOBN(0x188763f2, 0x8c2fabcd), TOBN(0xa80f87ac, 0xbbaec835), TOBN(0x632c96e0, 0xf29d8d54)}, {TOBN(0x29b0a60e, 0x4c00a95e), TOBN(0x2ef17f40, 0xe011e9fa), TOBN(0xf6c0e1d1, 0x15b77223), TOBN(0xaaec2c62, 0x14b04e32)}}, {{TOBN(0xd35688d8, 0x3d84e58c), TOBN(0x2af5094c, 0x958571db), TOBN(0x4fff7e19, 0x760682a6), TOBN(0x4cb27077, 0xe39a407c)}, {TOBN(0x0f59c547, 0x4ff0e321), TOBN(0x169f34a6, 0x1b34c8ff), TOBN(0x2bff1096, 0x52bc1ba7), TOBN(0xa25423b7, 0x83583544)}}, {{TOBN(0x5d55d5d5, 0x0ac8b782), TOBN(0xff6622ec, 0x2db3c892), TOBN(0x48fce741, 0x6b8bb642), TOBN(0x31d6998c, 0x69d7e3dc)}, {TOBN(0xdbaf8004, 0xcadcaed0), TOBN(0x801b0142, 0xd81d053c), TOBN(0x94b189fc, 0x59630ec6), TOBN(0x120e9934, 0xaf762c8e)}}, {{TOBN(0x53a29aa4, 0xfdc6a404), TOBN(0x19d8e01e, 0xa1909948), TOBN(0x3cfcabf1, 0xd7e89681), TOBN(0x3321a50d, 0x4e132d37)}, {TOBN(0xd0496863, 0xe9a86111), TOBN(0x8c0cde61, 0x06a3bc65), TOBN(0xaf866c49, 0xfc9f8eef), TOBN(0x2066350e, 0xff7f5141)}}, {{TOBN(0x4f8a4689, 0xe56ddfbd), TOBN(0xea1b0c07, 0xfe32983a), TOBN(0x2b317462, 0x873cb8cb), TOBN(0x658deddc, 0x2d93229f)}, {TOBN(0x65efaf4d, 0x0f64ef58), TOBN(0xfe43287d, 0x730cc7a8), TOBN(0xaebc0c72, 0x3d047d70), TOBN(0x92efa539, 0xd92d26c9)}}, {{TOBN(0x06e78457, 0x94b56526), TOBN(0x415cb80f, 0x0961002d), TOBN(0x89e5c565, 0x76dcb10f), TOBN(0x8bbb6982, 0xff9259fe)}, {TOBN(0x4fe8795b, 0x9abc2668), TOBN(0xb5d4f534, 0x1e678fb1), TOBN(0x6601f3be, 0x7b7da2b9), TOBN(0x98da59e2, 0xa13d6805)}}, {{TOBN(0x190d8ea6, 0x01799a52), TOBN(0xa20cec41, 0xb86d2952), TOBN(0x3062ffb2, 0x7fff2a7c), TOBN(0x741b32e5, 0x79f19d37)}, {TOBN(0xf80d8181, 0x4eb57d47), TOBN(0x7a2d0ed4, 0x16aef06b), TOBN(0x09735fb0, 0x1cecb588), TOBN(0x1641caaa, 0xc6061f5b)}}}, {{{TOBN(0x7f99824f, 0x20151427), TOBN(0x206828b6, 0x92430206), TOBN(0xaa9097d7, 0xe1112357), TOBN(0xacf9a2f2, 0x09e414ec)}, {TOBN(0xdbdac9da, 0x27915356), TOBN(0x7e0734b7, 0x001efee3), TOBN(0x54fab5bb, 0xd2b288e2), TOBN(0x4c630fc4, 0xf62dd09c)}}, {{TOBN(0x8537107a, 0x1ac2703b), TOBN(0xb49258d8, 0x6bc857b5), TOBN(0x57df14de, 0xbcdaccd1), TOBN(0x24ab68d7, 0xc4ae8529)}, {TOBN(0x7ed8b5d4, 0x734e59d0), TOBN(0x5f8740c8, 0xc495cc80), TOBN(0x84aedd5a, 0x291db9b3), TOBN(0x80b360f8, 0x4fb995be)}}, {{TOBN(0xae915f5d, 0x5fa067d1), TOBN(0x4134b57f, 0x9668960c), TOBN(0xbd3656d6, 0xa48edaac), TOBN(0xdac1e3e4, 0xfc1d7436)}, {TOBN(0x674ff869, 0xd81fbb26), TOBN(0x449ed3ec, 0xb26c33d4), TOBN(0x85138705, 0xd94203e8), TOBN(0xccde538b, 0xbeeb6f4a)}}, {{TOBN(0x55d5c68d, 0xa61a76fa), TOBN(0x598b441d, 0xca1554dc), TOBN(0xd39923b9, 0x773b279c), TOBN(0x33331d3c, 0x36bf9efc)}, {TOBN(0x2d4c848e, 
0x298de399), TOBN(0xcfdb8e77, 0xa1a27f56), TOBN(0x94c855ea, 0x57b8ab70), TOBN(0xdcdb9dae, 0x6f7879ba)}}, {{TOBN(0x7bdff8c2, 0x019f2a59), TOBN(0xb3ce5bb3, 0xcb4fbc74), TOBN(0xea907f68, 0x8a9173dd), TOBN(0x6cd3d0d3, 0x95a75439)}, {TOBN(0x92ecc4d6, 0xefed021c), TOBN(0x09a9f9b0, 0x6a77339a), TOBN(0x87ca6b15, 0x7188c64a), TOBN(0x10c29968, 0x44899158)}}, {{TOBN(0x5859a229, 0xed6e82ef), TOBN(0x16f338e3, 0x65ebaf4e), TOBN(0x0cd31387, 0x5ead67ae), TOBN(0x1c73d228, 0x54ef0bb4)}, {TOBN(0x4cb55131, 0x74a5c8c7), TOBN(0x01cd2970, 0x7f69ad6a), TOBN(0xa04d00dd, 0xe966f87e), TOBN(0xd96fe447, 0x0b7b0321)}}, {{TOBN(0x342ac06e, 0x88fbd381), TOBN(0x02cd4a84, 0x5c35a493), TOBN(0xe8fa89de, 0x54f1bbcd), TOBN(0x341d6367, 0x2575ed4c)}, {TOBN(0xebe357fb, 0xd238202b), TOBN(0x600b4d1a, 0xa984ead9), TOBN(0xc35c9f44, 0x52436ea0), TOBN(0x96fe0a39, 0xa370751b)}}, {{TOBN(0x4c4f0736, 0x7f636a38), TOBN(0x9f943fb7, 0x0e76d5cb), TOBN(0xb03510ba, 0xa8b68b8b), TOBN(0xc246780a, 0x9ed07a1f)}, {TOBN(0x3c051415, 0x6d549fc2), TOBN(0xc2953f31, 0x607781ca), TOBN(0x955e2c69, 0xd8d95413), TOBN(0xb300fadc, 0x7bd282e3)}}, {{TOBN(0x81fe7b50, 0x87e9189f), TOBN(0xdb17375c, 0xf42dda27), TOBN(0x22f7d896, 0xcf0a5904), TOBN(0xa0e57c5a, 0xebe348e6)}, {TOBN(0xa61011d3, 0xf40e3c80), TOBN(0xb1189321, 0x8db705c5), TOBN(0x4ed9309e, 0x50fedec3), TOBN(0xdcf14a10, 0x4d6d5c1d)}}, {{TOBN(0x056c265b, 0x55691342), TOBN(0xe8e08504, 0x91049dc7), TOBN(0x131329f5, 0xc9bae20a), TOBN(0x96c8b3e8, 0xd9dccdb4)}, {TOBN(0x8c5ff838, 0xfb4ee6b4), TOBN(0xfc5a9aeb, 0x41e8ccf0), TOBN(0x7417b764, 0xfae050c6), TOBN(0x0953c3d7, 0x00452080)}}, {{TOBN(0x21372682, 0x38dfe7e8), TOBN(0xea417e15, 0x2bb79d4b), TOBN(0x59641f1c, 0x76e7cf2d), TOBN(0x271e3059, 0xea0bcfcc)}, {TOBN(0x624c7dfd, 0x7253ecbd), TOBN(0x2f552e25, 0x4fca6186), TOBN(0xcbf84ecd, 0x4d866e9c), TOBN(0x73967709, 0xf68d4610)}}, {{TOBN(0xa14b1163, 0xc27901b4), TOBN(0xfd9236e0, 0x899b8bf3), TOBN(0x42b091ec, 0xcbc6da0a), TOBN(0xbb1dac6f, 0x5ad1d297)}, {TOBN(0x80e61d53, 0xa91cf76e), TOBN(0x4110a412, 0xd31f1ee7), TOBN(0x2d87c3ba, 0x13efcf77), TOBN(0x1f374bb4, 0xdf450d76)}}, {{TOBN(0x5e78e2f2, 0x0d188dab), TOBN(0xe3968ed0, 0xf4b885ef), TOBN(0x46c0568e, 0x7314570f), TOBN(0x31616338, 0x01170521)}, {TOBN(0x18e1e7e2, 0x4f0c8afe), TOBN(0x4caa75ff, 0xdeea78da), TOBN(0x82db67f2, 0x7c5d8a51), TOBN(0x36a44d86, 0x6f505370)}}, {{TOBN(0xd72c5bda, 0x0333974f), TOBN(0x5db516ae, 0x27a70146), TOBN(0x34705281, 0x210ef921), TOBN(0xbff17a8f, 0x0c9c38e5)}, {TOBN(0x78f4814e, 0x12476da1), TOBN(0xc1e16613, 0x33c16980), TOBN(0x9e5b386f, 0x424d4bca), TOBN(0x4c274e87, 0xc85740de)}}, {{TOBN(0xb6a9b88d, 0x6c2f5226), TOBN(0x14d1b944, 0x550d7ca8), TOBN(0x580c85fc, 0x1fc41709), TOBN(0xc1da368b, 0x54c6d519)}, {TOBN(0x2b0785ce, 0xd5113cf7), TOBN(0x0670f633, 0x5a34708f), TOBN(0x46e23767, 0x15cc3f88), TOBN(0x1b480cfa, 0x50c72c8f)}}, {{TOBN(0x20288602, 0x4147519a), TOBN(0xd0981eac, 0x26b372f0), TOBN(0xa9d4a7ca, 0xa785ebc8), TOBN(0xd953c50d, 0xdbdf58e9)}, {TOBN(0x9d6361cc, 0xfd590f8f), TOBN(0x72e9626b, 0x44e6c917), TOBN(0x7fd96110, 0x22eb64cf), TOBN(0x863ebb7e, 0x9eb288f3)}}, {{TOBN(0x6e6ab761, 0x6aca8ee7), TOBN(0x97d10b39, 0xd7b40358), TOBN(0x1687d377, 0x1e5feb0d), TOBN(0xc83e50e4, 0x8265a27a)}, {TOBN(0x8f75a9fe, 0xc954b313), TOBN(0xcc2e8f47, 0x310d1f61), TOBN(0xf5ba81c5, 0x6557d0e0), TOBN(0x25f9680c, 0x3eaf6207)}}, {{TOBN(0xf95c6609, 0x4354080b), TOBN(0x5225bfa5, 0x7bf2fe1c), TOBN(0xc5c004e2, 0x5c7d98fa), TOBN(0x3561bf1c, 0x019aaf60)}, {TOBN(0x5e6f9f17, 0xba151474), TOBN(0xdec2f934, 0xb04f6eca), TOBN(0x64e368a1, 0x269acb1e), TOBN(0x1332d9e4, 0x0cdda493)}}, 
{{TOBN(0x60d6cf69, 0xdf23de05), TOBN(0x66d17da2, 0x009339a0), TOBN(0x9fcac985, 0x0a693923), TOBN(0xbcf057fc, 0xed7c6a6d)}, {TOBN(0xc3c5c8c5, 0xf0b5662c), TOBN(0x25318dd8, 0xdcba4f24), TOBN(0x60e8cb75, 0x082b69ff), TOBN(0x7c23b3ee, 0x1e728c01)}}, {{TOBN(0x15e10a0a, 0x097e4403), TOBN(0xcb3d0a86, 0x19854665), TOBN(0x88d8e211, 0xd67d4826), TOBN(0xb39af66e, 0x0b9d2839)}, {TOBN(0xa5f94588, 0xbd475ca8), TOBN(0xe06b7966, 0xc077b80b), TOBN(0xfedb1485, 0xda27c26c), TOBN(0xd290d33a, 0xfe0fd5e0)}}, {{TOBN(0xa40bcc47, 0xf34fb0fa), TOBN(0xb4760cc8, 0x1fb1ab09), TOBN(0x8fca0993, 0xa273bfe3), TOBN(0x13e4fe07, 0xf70b213c)}, {TOBN(0x3bcdb992, 0xfdb05163), TOBN(0x8c484b11, 0x0c2b19b6), TOBN(0x1acb815f, 0xaaf2e3e2), TOBN(0xc6905935, 0xb89ff1b4)}}, {{TOBN(0xb2ad6f9d, 0x586e74e1), TOBN(0x488883ad, 0x67b80484), TOBN(0x758aa2c7, 0x369c3ddb), TOBN(0x8ab74e69, 0x9f9afd31)}, {TOBN(0x10fc2d28, 0x5e21beb1), TOBN(0x3484518a, 0x318c42f9), TOBN(0x377427dc, 0x53cf40c3), TOBN(0x9de0781a, 0x391bc1d9)}}, {{TOBN(0x8faee858, 0x693807e1), TOBN(0xa3865327, 0x4e81ccc7), TOBN(0x02c30ff2, 0x6f835b84), TOBN(0xb604437b, 0x0d3d38d4)}, {TOBN(0xb3fc8a98, 0x5ca1823d), TOBN(0xb82f7ec9, 0x03be0324), TOBN(0xee36d761, 0xcf684a33), TOBN(0x5a01df0e, 0x9f29bf7d)}}, {{TOBN(0x686202f3, 0x1306583d), TOBN(0x05b10da0, 0x437c622e), TOBN(0xbf9aaa0f, 0x076a7bc8), TOBN(0x25e94efb, 0x8f8f4e43)}, {TOBN(0x8a35c9b7, 0xfa3dc26d), TOBN(0xe0e5fb93, 0x96ff03c5), TOBN(0xa77e3843, 0xebc394ce), TOBN(0xcede6595, 0x8361de60)}}, {{TOBN(0xd27c22f6, 0xa1993545), TOBN(0xab01cc36, 0x24d671ba), TOBN(0x63fa2877, 0xa169c28e), TOBN(0x925ef904, 0x2eb08376)}, {TOBN(0x3b2fa3cf, 0x53aa0b32), TOBN(0xb27beb5b, 0x71c49d7a), TOBN(0xb60e1834, 0xd105e27f), TOBN(0xd6089788, 0x4f68570d)}}, {{TOBN(0x23094ce0, 0xd6fbc2ac), TOBN(0x738037a1, 0x815ff551), TOBN(0xda73b1bb, 0x6bef119c), TOBN(0xdcf6c430, 0xeef506ba)}, {TOBN(0x00e4fe7b, 0xe3ef104a), TOBN(0xebdd9a2c, 0x0a065628), TOBN(0x853a81c3, 0x8792043e), TOBN(0x22ad6ece, 0xb3b59108)}}, {{TOBN(0x9fb813c0, 0x39cd297d), TOBN(0x8ec7e16e, 0x05bda5d9), TOBN(0x2834797c, 0x0d104b96), TOBN(0xcc11a2e7, 0x7c511510)}, {TOBN(0x96ca5a53, 0x96ee6380), TOBN(0x054c8655, 0xcea38742), TOBN(0xb5946852, 0xd54dfa7d), TOBN(0x97c422e7, 0x1f4ab207)}}, {{TOBN(0xbf907509, 0x0c22b540), TOBN(0x2cde42aa, 0xb7c267d4), TOBN(0xba18f9ed, 0x5ab0d693), TOBN(0x3ba62aa6, 0x6e4660d9)}, {TOBN(0xb24bf97b, 0xab9ea96a), TOBN(0x5d039642, 0xe3b60e32), TOBN(0x4e6a4506, 0x7c4d9bd5), TOBN(0x666c5b9e, 0x7ed4a6a4)}}, {{TOBN(0xfa3fdcd9, 0x8edbd7cc), TOBN(0x4660bb87, 0xc6ccd753), TOBN(0x9ae90820, 0x21e6b64f), TOBN(0x8a56a713, 0xb36bfb3f)}, {TOBN(0xabfce096, 0x5726d47f), TOBN(0x9eed01b2, 0x0b1a9a7f), TOBN(0x30e9cad4, 0x4eb74a37), TOBN(0x7b2524cc, 0x53e9666d)}}, {{TOBN(0x6a29683b, 0x8f4b002f), TOBN(0xc2200d7a, 0x41f4fc20), TOBN(0xcf3af47a, 0x3a338acc), TOBN(0x6539a4fb, 0xe7128975)}, {TOBN(0xcec31c14, 0xc33c7fcf), TOBN(0x7eb6799b, 0xc7be322b), TOBN(0x119ef4e9, 0x6646f623), TOBN(0x7b7a26a5, 0x54d7299b)}}, {{TOBN(0xcb37f08d, 0x403f46f2), TOBN(0x94b8fc43, 0x1a0ec0c7), TOBN(0xbb8514e3, 0xc332142f), TOBN(0xf3ed2c33, 0xe80d2a7a)}, {TOBN(0x8d2080af, 0xb639126c), TOBN(0xf7b6be60, 0xe3553ade), TOBN(0x3950aa9f, 0x1c7e2b09), TOBN(0x847ff958, 0x6410f02b)}}, {{TOBN(0x877b7cf5, 0x678a31b0), TOBN(0xd50301ae, 0x3998b620), TOBN(0x734257c5, 0xc00fb396), TOBN(0xf9fb18a0, 0x04e672a6)}, {TOBN(0xff8bd8eb, 0xe8758851), TOBN(0x1e64e4c6, 0x5d99ba44), TOBN(0x4b8eaedf, 0x7dfd93b7), TOBN(0xba2f2a98, 0x04e76b8c)}}, {{TOBN(0x7d790cba, 0xe8053433), TOBN(0xc8e725a0, 0x3d2c9585), TOBN(0x58c5c476, 0xcdd8f5ed), TOBN(0xd106b952, 
0xefa9fe1d)}, {TOBN(0x3c5c775b, 0x0eff13a9), TOBN(0x242442ba, 0xe057b930), TOBN(0xe9f458d4, 0xc9b70cbd), TOBN(0x69b71448, 0xa3cdb89a)}}, {{TOBN(0x41ee46f6, 0x0e2ed742), TOBN(0x573f1045, 0x40067493), TOBN(0xb1e154ff, 0x9d54c304), TOBN(0x2ad0436a, 0x8d3a7502)}, {TOBN(0xee4aaa2d, 0x431a8121), TOBN(0xcd38b3ab, 0x886f11ed), TOBN(0x57d49ea6, 0x034a0eb7), TOBN(0xd2b773bd, 0xf7e85e58)}}, {{TOBN(0x4a559ac4, 0x9b5c1f14), TOBN(0xc444be1a, 0x3e54df2b), TOBN(0x13aad704, 0xeda41891), TOBN(0xcd927bec, 0x5eb5c788)}, {TOBN(0xeb3c8516, 0xe48c8a34), TOBN(0x1b7ac812, 0x4b546669), TOBN(0x1815f896, 0x594df8ec), TOBN(0x87c6a79c, 0x79227865)}}, {{TOBN(0xae02a2f0, 0x9b56ddbd), TOBN(0x1339b5ac, 0x8a2f1cf3), TOBN(0xf2b569c7, 0x839dff0d), TOBN(0xb0b9e864, 0xfee9a43d)}, {TOBN(0x4ff8ca41, 0x77bb064e), TOBN(0x145a2812, 0xfd249f63), TOBN(0x3ab7beac, 0xf86f689a), TOBN(0x9bafec27, 0x01d35f5e)}}, {{TOBN(0x28054c65, 0x4265aa91), TOBN(0xa4b18304, 0x035efe42), TOBN(0x6887b0e6, 0x9639dec7), TOBN(0xf4b8f6ad, 0x3d52aea5)}, {TOBN(0xfb9293cc, 0x971a8a13), TOBN(0x3f159e5d, 0x4c934d07), TOBN(0x2c50e9b1, 0x09acbc29), TOBN(0x08eb65e6, 0x7154d129)}}, {{TOBN(0x4feff589, 0x30b75c3e), TOBN(0x0bb82fe2, 0x94491c93), TOBN(0xd8ac377a, 0x89af62bb), TOBN(0xd7b51490, 0x9685e49f)}, {TOBN(0xabca9a7b, 0x04497f19), TOBN(0x1b35ed0a, 0x1a7ad13f), TOBN(0x6b601e21, 0x3ec86ed6), TOBN(0xda91fcb9, 0xce0c76f1)}}, {{TOBN(0x9e28507b, 0xd7ab27e1), TOBN(0x7c19a555, 0x63945b7b), TOBN(0x6b43f0a1, 0xaafc9827), TOBN(0x443b4fbd, 0x3aa55b91)}, {TOBN(0x962b2e65, 0x6962c88f), TOBN(0x139da8d4, 0xce0db0ca), TOBN(0xb93f05dd, 0x1b8d6c4f), TOBN(0x779cdff7, 0x180b9824)}}, {{TOBN(0xbba23fdd, 0xae57c7b7), TOBN(0x345342f2, 0x1b932522), TOBN(0xfd9c80fe, 0x556d4aa3), TOBN(0xa03907ba, 0x6525bb61)}, {TOBN(0x38b010e1, 0xff218933), TOBN(0xc066b654, 0xaa52117b), TOBN(0x8e141920, 0x94f2e6ea), TOBN(0x66a27dca, 0x0d32f2b2)}}, {{TOBN(0x69c7f993, 0x048b3717), TOBN(0xbf5a989a, 0xb178ae1c), TOBN(0x49fa9058, 0x564f1d6b), TOBN(0x27ec6e15, 0xd31fde4e)}, {TOBN(0x4cce0373, 0x7276e7fc), TOBN(0x64086d79, 0x89d6bf02), TOBN(0x5a72f046, 0x4ccdd979), TOBN(0x909c3566, 0x47775631)}}, {{TOBN(0x1c07bc6b, 0x75dd7125), TOBN(0xb4c6bc97, 0x87a0428d), TOBN(0x507ece52, 0xfdeb6b9d), TOBN(0xfca56512, 0xb2c95432)}, {TOBN(0x15d97181, 0xd0e8bd06), TOBN(0x384dd317, 0xc6bb46ea), TOBN(0x5441ea20, 0x3952b624), TOBN(0xbcf70dee, 0x4e7dc2fb)}}, {{TOBN(0x372b016e, 0x6628e8c3), TOBN(0x07a0d667, 0xb60a7522), TOBN(0xcf05751b, 0x0a344ee2), TOBN(0x0ec09a48, 0x118bdeec)}, {TOBN(0x6e4b3d4e, 0xd83dce46), TOBN(0x43a6316d, 0x99d2fc6e), TOBN(0xa99d8989, 0x56cf044c), TOBN(0x7c7f4454, 0xae3e5fb7)}}, {{TOBN(0xb2e6b121, 0xfbabbe92), TOBN(0x281850fb, 0xe1330076), TOBN(0x093581ec, 0x97890015), TOBN(0x69b1dded, 0x75ff77f5)}, {TOBN(0x7cf0b18f, 0xab105105), TOBN(0x953ced31, 0xa89ccfef), TOBN(0x3151f85f, 0xeb914009), TOBN(0x3c9f1b87, 0x88ed48ad)}}, {{TOBN(0xc9aba1a1, 0x4a7eadcb), TOBN(0x928e7501, 0x522e71cf), TOBN(0xeaede727, 0x3a2e4f83), TOBN(0x467e10d1, 0x1ce3bbd3)}, {TOBN(0xf3442ac3, 0xb955dcf0), TOBN(0xba96307d, 0xd3d5e527), TOBN(0xf763a10e, 0xfd77f474), TOBN(0x5d744bd0, 0x6a6e1ff0)}}, {{TOBN(0xd287282a, 0xa777899e), TOBN(0xe20eda8f, 0xd03f3cde), TOBN(0x6a7e75bb, 0x50b07d31), TOBN(0x0b7e2a94, 0x6f379de4)}, {TOBN(0x31cb64ad, 0x19f593cf), TOBN(0x7b1a9e4f, 0x1e76ef1d), TOBN(0xe18c9c9d, 0xb62d609c), TOBN(0x439bad6d, 0xe779a650)}}, {{TOBN(0x219d9066, 0xe032f144), TOBN(0x1db632b8, 0xe8b2ec6a), TOBN(0xff0d0fd4, 0xfda12f78), TOBN(0x56fb4c2d, 0x2a25d265)}, {TOBN(0x5f4e2ee1, 0x255a03f1), TOBN(0x61cd6af2, 0xe96af176), TOBN(0xe0317ba8, 0xd068bc97), 
TOBN(0x927d6bab, 0x264b988e)}}, {{TOBN(0xa18f07e0, 0xe90fb21e), TOBN(0x00fd2b80, 0xbba7fca1), TOBN(0x20387f27, 0x95cd67b5), TOBN(0x5b89a4e7, 0xd39707f7)}, {TOBN(0x8f83ad3f, 0x894407ce), TOBN(0xa0025b94, 0x6c226132), TOBN(0xc79563c7, 0xf906c13b), TOBN(0x5f548f31, 0x4e7bb025)}}, {{TOBN(0x2b4c6b8f, 0xeac6d113), TOBN(0xa67e3f9c, 0x0e813c76), TOBN(0x3982717c, 0x3fe1f4b9), TOBN(0x58865819, 0x26d8050e)}, {TOBN(0x99f3640c, 0xf7f06f20), TOBN(0xdc610216, 0x2a66ebc2), TOBN(0x52f2c175, 0x767a1e08), TOBN(0x05660e1a, 0x5999871b)}}, {{TOBN(0x6b0f1762, 0x6d3c4693), TOBN(0xf0e7d627, 0x37ed7bea), TOBN(0xc51758c7, 0xb75b226d), TOBN(0x40a88628, 0x1f91613b)}, {TOBN(0x889dbaa7, 0xbbb38ce0), TOBN(0xe0404b65, 0xbddcad81), TOBN(0xfebccd3a, 0x8bc9671f), TOBN(0xfbf9a357, 0xee1f5375)}}, {{TOBN(0x5dc169b0, 0x28f33398), TOBN(0xb07ec11d, 0x72e90f65), TOBN(0xae7f3b4a, 0xfaab1eb1), TOBN(0xd970195e, 0x5f17538a)}, {TOBN(0x52b05cbe, 0x0181e640), TOBN(0xf5debd62, 0x2643313d), TOBN(0x76148154, 0x5df31f82), TOBN(0x23e03b33, 0x3a9e13c5)}}, {{TOBN(0xff758949, 0x4fde0c1f), TOBN(0xbf8a1abe, 0xe5b6ec20), TOBN(0x702278fb, 0x87e1db6c), TOBN(0xc447ad7a, 0x35ed658f)}, {TOBN(0x48d4aa38, 0x03d0ccf2), TOBN(0x80acb338, 0x819a7c03), TOBN(0x9bc7c89e, 0x6e17cecc), TOBN(0x46736b8b, 0x03be1d82)}}, {{TOBN(0xd65d7b60, 0xc0432f96), TOBN(0xddebe7a3, 0xdeb5442f), TOBN(0x79a25307, 0x7dff69a2), TOBN(0x37a56d94, 0x02cf3122)}, {TOBN(0x8bab8aed, 0xf2350d0a), TOBN(0x13c3f276, 0x037b0d9a), TOBN(0xc664957c, 0x44c65cae), TOBN(0x88b44089, 0xc2e71a88)}}, {{TOBN(0xdb88e5a3, 0x5cb02664), TOBN(0x5d4c0bf1, 0x8686c72e), TOBN(0xea3d9b62, 0xa682d53e), TOBN(0x9b605ef4, 0x0b2ad431)}, {TOBN(0x71bac202, 0xc69645d0), TOBN(0xa115f03a, 0x6a1b66e7), TOBN(0xfe2c563a, 0x158f4dc4), TOBN(0xf715b3a0, 0x4d12a78c)}}, {{TOBN(0x8f7f0a48, 0xd413213a), TOBN(0x2035806d, 0xc04becdb), TOBN(0xecd34a99, 0x5d8587f5), TOBN(0x4d8c3079, 0x9f6d3a71)}, {TOBN(0x1b2a2a67, 0x8d95a8f6), TOBN(0xc58c9d7d, 0xf2110d0d), TOBN(0xdeee81d5, 0xcf8fba3f), TOBN(0xa42be3c0, 0x0c7cdf68)}}, {{TOBN(0x2126f742, 0xd43b5eaa), TOBN(0x054a0766, 0xdfa59b85), TOBN(0x9d0d5e36, 0x126bfd45), TOBN(0xa1f8fbd7, 0x384f8a8f)}, {TOBN(0x317680f5, 0xd563fccc), TOBN(0x48ca5055, 0xf280a928), TOBN(0xe00b81b2, 0x27b578cf), TOBN(0x10aad918, 0x2994a514)}}, {{TOBN(0xd9e07b62, 0xb7bdc953), TOBN(0x9f0f6ff2, 0x5bc086dd), TOBN(0x09d1ccff, 0x655eee77), TOBN(0x45475f79, 0x5bef7df1)}, {TOBN(0x3faa28fa, 0x86f702cc), TOBN(0x92e60905, 0x0f021f07), TOBN(0xe9e62968, 0x7f8fa8c6), TOBN(0xbd71419a, 0xf036ea2c)}}, {{TOBN(0x171ee1cc, 0x6028da9a), TOBN(0x5352fe1a, 0xc251f573), TOBN(0xf8ff236e, 0x3fa997f4), TOBN(0xd831b6c9, 0xa5749d5f)}, {TOBN(0x7c872e1d, 0xe350e2c2), TOBN(0xc56240d9, 0x1e0ce403), TOBN(0xf9deb077, 0x6974f5cb), TOBN(0x7d50ba87, 0x961c3728)}}, {{TOBN(0xd6f89426, 0x5a3a2518), TOBN(0xcf817799, 0xc6303d43), TOBN(0x510a0471, 0x619e5696), TOBN(0xab049ff6, 0x3a5e307b)}, {TOBN(0xe4cdf9b0, 0xfeb13ec7), TOBN(0xd5e97117, 0x9d8ff90c), TOBN(0xf6f64d06, 0x9afa96af), TOBN(0x00d0bf5e, 0x9d2012a2)}}, {{TOBN(0xe63f301f, 0x358bcdc0), TOBN(0x07689e99, 0x0a9d47f8), TOBN(0x1f689e2f, 0x4f43d43a), TOBN(0x4d542a16, 0x90920904)}, {TOBN(0xaea293d5, 0x9ca0a707), TOBN(0xd061fe45, 0x8ac68065), TOBN(0x1033bf1b, 0x0090008c), TOBN(0x29749558, 0xc08a6db6)}}, {{TOBN(0x74b5fc59, 0xc1d5d034), TOBN(0xf712e9f6, 0x67e215e0), TOBN(0xfd520cbd, 0x860200e6), TOBN(0x0229acb4, 0x3ea22588)}, {TOBN(0x9cd1e14c, 0xfff0c82e), TOBN(0x87684b62, 0x59c69e73), TOBN(0xda85e61c, 0x96ccb989), TOBN(0x2d5dbb02, 0xa3d06493)}}, {{TOBN(0xf22ad33a, 0xe86b173c), TOBN(0xe8e41ea5, 0xa79ff0e3), TOBN(0x01d2d725, 
0xdd0d0c10), TOBN(0x31f39088, 0x032d28f9)}, {TOBN(0x7b3f71e1, 0x7829839e), TOBN(0x0cf691b4, 0x4502ae58), TOBN(0xef658dbd, 0xbefc6115), TOBN(0xa5cd6ee5, 0xb3ab5314)}}, {{TOBN(0x206c8d7b, 0x5f1d2347), TOBN(0x794645ba, 0x4cc2253a), TOBN(0xd517d8ff, 0x58389e08), TOBN(0x4fa20dee, 0x9f847288)}, {TOBN(0xeba072d8, 0xd797770a), TOBN(0x7360c91d, 0xbf429e26), TOBN(0x7200a3b3, 0x80af8279), TOBN(0x6a1c9150, 0x82dadce3)}}, {{TOBN(0x0ee6d3a7, 0xc35d8794), TOBN(0x042e6558, 0x0356bae5), TOBN(0x9f59698d, 0x643322fd), TOBN(0x9379ae15, 0x50a61967)}, {TOBN(0x64b9ae62, 0xfcc9981e), TOBN(0xaed3d631, 0x6d2934c6), TOBN(0x2454b302, 0x5e4e65eb), TOBN(0xab09f647, 0xf9950428)}}}, {{{TOBN(0xb2083a12, 0x22248acc), TOBN(0x1f6ec0ef, 0x3264e366), TOBN(0x5659b704, 0x5afdee28), TOBN(0x7a823a40, 0xe6430bb5)}, {TOBN(0x24592a04, 0xe1900a79), TOBN(0xcde09d4a, 0xc9ee6576), TOBN(0x52b6463f, 0x4b5ea54a), TOBN(0x1efe9ed3, 0xd3ca65a7)}}, {{TOBN(0xe27a6dbe, 0x305406dd), TOBN(0x8eb7dc7f, 0xdd5d1957), TOBN(0xf54a6876, 0x387d4d8f), TOBN(0x9c479409, 0xc7762de4)}, {TOBN(0xbe4d5b5d, 0x99b30778), TOBN(0x25380c56, 0x6e793682), TOBN(0x602d37f3, 0xdac740e3), TOBN(0x140deabe, 0x1566e4ae)}}, {{TOBN(0x4481d067, 0xafd32acf), TOBN(0xd8f0fcca, 0xe1f71ccf), TOBN(0xd208dd0c, 0xb596f2da), TOBN(0xd049d730, 0x9aad93f9)}, {TOBN(0xc79f263d, 0x42ab580e), TOBN(0x09411bb1, 0x23f707b4), TOBN(0x8cfde1ff, 0x835e0eda), TOBN(0x72707490, 0x90f03402)}}, {{TOBN(0xeaee6126, 0xc49a861e), TOBN(0x024f3b65, 0xe14f0d06), TOBN(0x51a3f1e8, 0xc69bfc17), TOBN(0xc3c3a8e9, 0xa7686381)}, {TOBN(0x3400752c, 0xb103d4c8), TOBN(0x02bc4613, 0x9218b36b), TOBN(0xc67f75eb, 0x7651504a), TOBN(0xd6848b56, 0xd02aebfa)}}, {{TOBN(0xbd9802e6, 0xc30fa92b), TOBN(0x5a70d96d, 0x9a552784), TOBN(0x9085c4ea, 0x3f83169b), TOBN(0xfa9423bb, 0x06908228)}, {TOBN(0x2ffebe12, 0xfe97a5b9), TOBN(0x85da6049, 0x71b99118), TOBN(0x9cbc2f7f, 0x63178846), TOBN(0xfd96bc70, 0x9153218e)}}, {{TOBN(0x958381db, 0x1782269b), TOBN(0xae34bf79, 0x2597e550), TOBN(0xbb5c6064, 0x5f385153), TOBN(0x6f0e96af, 0xe3088048)}, {TOBN(0xbf6a0215, 0x77884456), TOBN(0xb3b5688c, 0x69310ea7), TOBN(0x17c94295, 0x04fad2de), TOBN(0xe020f0e5, 0x17896d4d)}}, {{TOBN(0x730ba0ab, 0x0976505f), TOBN(0x567f6813, 0x095e2ec5), TOBN(0x47062010, 0x6331ab71), TOBN(0x72cfa977, 0x41d22b9f)}, {TOBN(0x33e55ead, 0x8a2373da), TOBN(0xa8d0d5f4, 0x7ba45a68), TOBN(0xba1d8f9c, 0x03029d15), TOBN(0x8f34f1cc, 0xfc55b9f3)}}, {{TOBN(0xcca4428d, 0xbbe5a1a9), TOBN(0x8187fd5f, 0x3126bd67), TOBN(0x0036973a, 0x48105826), TOBN(0xa39b6663, 0xb8bd61a0)}, {TOBN(0x6d42deef, 0x2d65a808), TOBN(0x4969044f, 0x94636b19), TOBN(0xf611ee47, 0xdd5d564c), TOBN(0x7b2f3a49, 0xd2873077)}}, {{TOBN(0x94157d45, 0x300eb294), TOBN(0x2b2a656e, 0x169c1494), TOBN(0xc000dd76, 0xd3a47aa9), TOBN(0xa2864e4f, 0xa6243ea4)}, {TOBN(0x82716c47, 0xdb89842e), TOBN(0x12dfd7d7, 0x61479fb7), TOBN(0x3b9a2c56, 0xe0b2f6dc), TOBN(0x46be862a, 0xd7f85d67)}}, {{TOBN(0x03b0d8dd, 0x0f82b214), TOBN(0x460c34f9, 0xf103cbc6), TOBN(0xf32e5c03, 0x18d79e19), TOBN(0x8b8888ba, 0xa84117f8)}, {TOBN(0x8f3c37dc, 0xc0722677), TOBN(0x10d21be9, 0x1c1c0f27), TOBN(0xd47c8468, 0xe0f7a0c6), TOBN(0x9bf02213, 0xadecc0e0)}}, {{TOBN(0x0baa7d12, 0x42b48b99), TOBN(0x1bcb665d, 0x48424096), TOBN(0x8b847cd6, 0xebfb5cfb), TOBN(0x87c2ae56, 0x9ad4d10d)}, {TOBN(0xf1cbb122, 0x0de36726), TOBN(0xe7043c68, 0x3fdfbd21), TOBN(0x4bd0826a, 0x4e79d460), TOBN(0x11f5e598, 0x4bd1a2cb)}}, {{TOBN(0x97554160, 0xb7fe7b6e), TOBN(0x7d16189a, 0x400a3fb2), TOBN(0xd73e9bea, 0xe328ca1e), TOBN(0x0dd04b97, 0xe793d8cc)}, {TOBN(0xa9c83c9b, 0x506db8cc), TOBN(0x5cd47aae, 0xcf38814c), 
TOBN(0x26fc430d, 0xb64b45e6), TOBN(0x079b5499, 0xd818ea84)}}, {{TOBN(0xebb01102, 0xc1c24a3b), TOBN(0xca24e568, 0x1c161c1a), TOBN(0x103eea69, 0x36f00a4a), TOBN(0x9ad76ee8, 0x76176c7b)}, {TOBN(0x97451fc2, 0x538e0ff7), TOBN(0x94f89809, 0x6604b3b0), TOBN(0x6311436e, 0x3249cfd7), TOBN(0x27b4a7bd, 0x41224f69)}}, {{TOBN(0x03b5d21a, 0xe0ac2941), TOBN(0x279b0254, 0xc2d31937), TOBN(0x3307c052, 0xcac992d0), TOBN(0x6aa7cb92, 0xefa8b1f3)}, {TOBN(0x5a182580, 0x0d37c7a5), TOBN(0x13380c37, 0x342d5422), TOBN(0x92ac2d66, 0xd5d2ef92), TOBN(0x035a70c9, 0x030c63c6)}}, {{TOBN(0xc16025dd, 0x4ce4f152), TOBN(0x1f419a71, 0xf9df7c06), TOBN(0x6d5b2214, 0x91e4bb14), TOBN(0xfc43c6cc, 0x839fb4ce)}, {TOBN(0x49f06591, 0x925d6b2d), TOBN(0x4b37d9d3, 0x62186598), TOBN(0x8c54a971, 0xd01b1629), TOBN(0xe1a9c29f, 0x51d50e05)}}, {{TOBN(0x5109b785, 0x71ba1861), TOBN(0x48b22d5c, 0xd0c8f93d), TOBN(0xe8fa84a7, 0x8633bb93), TOBN(0x53fba6ba, 0x5aebbd08)}, {TOBN(0x7ff27df3, 0xe5eea7d8), TOBN(0x521c8796, 0x68ca7158), TOBN(0xb9d5133b, 0xce6f1a05), TOBN(0x2d50cd53, 0xfd0ebee4)}}, {{TOBN(0xc82115d6, 0xc5a3ef16), TOBN(0x993eff9d, 0xba079221), TOBN(0xe4da2c5e, 0x4b5da81c), TOBN(0x9a89dbdb, 0x8033fd85)}, {TOBN(0x60819ebf, 0x2b892891), TOBN(0x53902b21, 0x5d14a4d5), TOBN(0x6ac35051, 0xd7fda421), TOBN(0xcc6ab885, 0x61c83284)}}, {{TOBN(0x14eba133, 0xf74cff17), TOBN(0x240aaa03, 0xecb813f2), TOBN(0xcfbb6540, 0x6f665bee), TOBN(0x084b1fe4, 0xa425ad73)}, {TOBN(0x009d5d16, 0xd081f6a6), TOBN(0x35304fe8, 0xeef82c90), TOBN(0xf20346d5, 0xaa9eaa22), TOBN(0x0ada9f07, 0xac1c91e3)}}, {{TOBN(0xa6e21678, 0x968a6144), TOBN(0x54c1f77c, 0x07b31a1e), TOBN(0xd6bb787e, 0x5781fbe1), TOBN(0x61bd2ee0, 0xe31f1c4a)}, {TOBN(0xf25aa1e9, 0x781105fc), TOBN(0x9cf2971f, 0x7b2f8e80), TOBN(0x26d15412, 0xcdff919b), TOBN(0x01db4ebe, 0x34bc896e)}}, {{TOBN(0x7d9b3e23, 0xb40df1cf), TOBN(0x59337373, 0x94e971b4), TOBN(0xbf57bd14, 0x669cf921), TOBN(0x865daedf, 0x0c1a1064)}, {TOBN(0x3eb70bd3, 0x83279125), TOBN(0xbc3d5b9f, 0x34ecdaab), TOBN(0x91e3ed7e, 0x5f755caf), TOBN(0x49699f54, 0xd41e6f02)}}, {{TOBN(0x185770e1, 0xd4a7a15b), TOBN(0x08f3587a, 0xeaac87e7), TOBN(0x352018db, 0x473133ea), TOBN(0x674ce719, 0x04fd30fc)}, {TOBN(0x7b8d9835, 0x088b3e0e), TOBN(0x7a0356a9, 0x5d0d47a1), TOBN(0x9d9e7659, 0x6474a3c4), TOBN(0x61ea48a7, 0xff66966c)}}, {{TOBN(0x30417758, 0x0f3e4834), TOBN(0xfdbb21c2, 0x17a9afcb), TOBN(0x756fa17f, 0x2f9a67b3), TOBN(0x2a6b2421, 0xa245c1a8)}, {TOBN(0x64be2794, 0x4af02291), TOBN(0xade465c6, 0x2a5804fe), TOBN(0x8dffbd39, 0xa6f08fd7), TOBN(0xc4efa84c, 0xaa14403b)}}, {{TOBN(0xa1b91b2a, 0x442b0f5c), TOBN(0xb748e317, 0xcf997736), TOBN(0x8d1b62bf, 0xcee90e16), TOBN(0x907ae271, 0x0b2078c0)}, {TOBN(0xdf31534b, 0x0c9bcddd), TOBN(0x043fb054, 0x39adce83), TOBN(0x99031043, 0xd826846a), TOBN(0x61a9c0d6, 0xb144f393)}}, {{TOBN(0xdab48046, 0x47718427), TOBN(0xdf17ff9b, 0x6e830f8b), TOBN(0x408d7ee8, 0xe49a1347), TOBN(0x6ac71e23, 0x91c1d4ae)}, {TOBN(0xc8cbb9fd, 0x1defd73c), TOBN(0x19840657, 0xbbbbfec5), TOBN(0x39db1cb5, 0x9e7ef8ea), TOBN(0x78aa8296, 0x64105f30)}}, {{TOBN(0xa3d9b7f0, 0xa3738c29), TOBN(0x0a2f235a, 0xbc3250a3), TOBN(0x55e506f6, 0x445e4caf), TOBN(0x0974f73d, 0x33475f7a)}, {TOBN(0xd37dbba3, 0x5ba2f5a8), TOBN(0x542c6e63, 0x6af40066), TOBN(0x26d99b53, 0xc5d73e2c), TOBN(0x06060d7d, 0x6c3ca33e)}}, {{TOBN(0xcdbef1c2, 0x065fef4a), TOBN(0x77e60f7d, 0xfd5b92e3), TOBN(0xd7c549f0, 0x26708350), TOBN(0x201b3ad0, 0x34f121bf)}, {TOBN(0x5fcac2a1, 0x0334fc14), TOBN(0x8a9a9e09, 0x344552f6), TOBN(0x7dd8a1d3, 0x97653082), TOBN(0x5fc0738f, 0x79d4f289)}}, {{TOBN(0x787d244d, 0x17d2d8c3), TOBN(0xeffc6345, 
0x70830684), TOBN(0x5ddb96dd, 0xe4f73ae5), TOBN(0x8efb14b1, 0x172549a5)}, {TOBN(0x6eb73eee, 0x2245ae7a), TOBN(0xbca4061e, 0xea11f13e), TOBN(0xb577421d, 0x30b01f5d), TOBN(0xaa688b24, 0x782e152c)}}, {{TOBN(0x67608e71, 0xbd3502ba), TOBN(0x4ef41f24, 0xb4de75a0), TOBN(0xb08dde5e, 0xfd6125e5), TOBN(0xde484825, 0xa409543f)}, {TOBN(0x1f198d98, 0x65cc2295), TOBN(0x428a3771, 0x6e0edfa2), TOBN(0x4f9697a2, 0xadf35fc7), TOBN(0x01a43c79, 0xf7cac3c7)}}, {{TOBN(0xb05d7059, 0x0fd3659a), TOBN(0x8927f30c, 0xbb7f2d9a), TOBN(0x4023d1ac, 0x8cf984d3), TOBN(0x32125ed3, 0x02897a45)}, {TOBN(0xfb572dad, 0x3d414205), TOBN(0x73000ef2, 0xe3fa82a9), TOBN(0x4c0868e9, 0xf10a5581), TOBN(0x5b61fc67, 0x6b0b3ca5)}}, {{TOBN(0xc1258d5b, 0x7cae440c), TOBN(0x21c08b41, 0x402b7531), TOBN(0xf61a8955, 0xde932321), TOBN(0x3568faf8, 0x2d1408af)}, {TOBN(0x71b15e99, 0x9ecf965b), TOBN(0xf14ed248, 0xe917276f), TOBN(0xc6f4caa1, 0x820cf9e2), TOBN(0x681b20b2, 0x18d83c7e)}}, {{TOBN(0x6cde738d, 0xc6c01120), TOBN(0x71db0813, 0xae70e0db), TOBN(0x95fc0644, 0x74afe18c), TOBN(0x34619053, 0x129e2be7)}, {TOBN(0x80615cea, 0xdb2a3b15), TOBN(0x0a49a19e, 0xdb4c7073), TOBN(0x0e1b84c8, 0x8fd2d367), TOBN(0xd74bf462, 0x033fb8aa)}}, {{TOBN(0x889f6d65, 0x533ef217), TOBN(0x7158c7e4, 0xc3ca2e87), TOBN(0xfb670dfb, 0xdc2b4167), TOBN(0x75910a01, 0x844c257f)}, {TOBN(0xf336bf07, 0xcf88577d), TOBN(0x22245250, 0xe45e2ace), TOBN(0x2ed92e8d, 0x7ca23d85), TOBN(0x29f8be4c, 0x2b812f58)}}, {{TOBN(0xdd9ebaa7, 0x076fe12b), TOBN(0x3f2400cb, 0xae1537f9), TOBN(0x1aa93528, 0x17bdfb46), TOBN(0xc0f98430, 0x67883b41)}, {TOBN(0x5590ede1, 0x0170911d), TOBN(0x7562f5bb, 0x34d4b17f), TOBN(0xe1fa1df2, 0x1826b8d2), TOBN(0xb40b796a, 0x6bd80d59)}}, {{TOBN(0xd65bf197, 0x3467ba92), TOBN(0x8c9b46db, 0xf70954b0), TOBN(0x97c8a0f3, 0x0e78f15d), TOBN(0xa8f3a69a, 0x85a4c961)}, {TOBN(0x4242660f, 0x61e4ce9b), TOBN(0xbf06aab3, 0x6ea6790c), TOBN(0xc6706f8e, 0xec986416), TOBN(0x9e56dec1, 0x9a9fc225)}}, {{TOBN(0x527c46f4, 0x9a9898d9), TOBN(0xd799e77b, 0x5633cdef), TOBN(0x24eacc16, 0x7d9e4297), TOBN(0xabb61cea, 0x6b1cb734)}, {TOBN(0xbee2e8a7, 0xf778443c), TOBN(0x3bb42bf1, 0x29de2fe6), TOBN(0xcbed86a1, 0x3003bb6f), TOBN(0xd3918e6c, 0xd781cdf6)}}, {{TOBN(0x4bee3271, 0x9a5103f1), TOBN(0x5243efc6, 0xf50eac06), TOBN(0xb8e122cb, 0x6adcc119), TOBN(0x1b7faa84, 0xc0b80a08)}, {TOBN(0x32c3d1bd, 0x6dfcd08c), TOBN(0x129dec4e, 0x0be427de), TOBN(0x98ab679c, 0x1d263c83), TOBN(0xafc83cb7, 0xcef64eff)}}, {{TOBN(0x85eb6088, 0x2fa6be76), TOBN(0x892585fb, 0x1328cbfe), TOBN(0xc154d3ed, 0xcf618dda), TOBN(0xc44f601b, 0x3abaf26e)}, {TOBN(0x7bf57d0b, 0x2be1fdfd), TOBN(0xa833bd2d, 0x21137fee), TOBN(0x9353af36, 0x2db591a8), TOBN(0xc76f26dc, 0x5562a056)}}, {{TOBN(0x1d87e47d, 0x3fdf5a51), TOBN(0x7afb5f93, 0x55c9cab0), TOBN(0x91bbf58f, 0x89e0586e), TOBN(0x7c72c018, 0x0d843709)}, {TOBN(0xa9a5aafb, 0x99b5c3dc), TOBN(0xa48a0f1d, 0x3844aeb0), TOBN(0x7178b7dd, 0xb667e482), TOBN(0x453985e9, 0x6e23a59a)}}, {{TOBN(0x4a54c860, 0x01b25dd8), TOBN(0x0dd37f48, 0xfb897c8a), TOBN(0x5f8aa610, 0x0ea90cd9), TOBN(0xc8892c68, 0x16d5830d)}, {TOBN(0xeb4befc0, 0xef514ca5), TOBN(0x478eb679, 0xe72c9ee6), TOBN(0x9bca20da, 0xdbc40d5f), TOBN(0xf015de21, 0xdde4f64a)}}, {{TOBN(0xaa6a4de0, 0xeaf4b8a5), TOBN(0x68cfd9ca, 0x4bc60e32), TOBN(0x668a4b01, 0x7fd15e70), TOBN(0xd9f0694a, 0xf27dc09d)}, {TOBN(0xf6c3cad5, 0xba708bcd), TOBN(0x5cd2ba69, 0x5bb95c2a), TOBN(0xaa28c1d3, 0x33c0a58f), TOBN(0x23e274e3, 0xabc77870)}}, {{TOBN(0x44c3692d, 0xdfd20a4a), TOBN(0x091c5fd3, 0x81a66653), TOBN(0x6c0bb691, 0x09a0757d), TOBN(0x9072e8b9, 0x667343ea)}, {TOBN(0x31d40eb0, 0x80848bec), 
TOBN(0x95bd480a, 0x79fd36cc), TOBN(0x01a77c61, 0x65ed43f5), TOBN(0xafccd127, 0x2e0d40bf)}}, {{TOBN(0xeccfc82d, 0x1cc1884b), TOBN(0xc85ac201, 0x5d4753b4), TOBN(0xc7a6caac, 0x658e099f), TOBN(0xcf46369e, 0x04b27390)}, {TOBN(0xe2e7d049, 0x506467ea), TOBN(0x481b63a2, 0x37cdeccc), TOBN(0x4029abd8, 0xed80143a), TOBN(0x28bfe3c7, 0xbcb00b88)}}, {{TOBN(0x3bec1009, 0x0643d84a), TOBN(0x885f3668, 0xabd11041), TOBN(0xdb02432c, 0xf83a34d6), TOBN(0x32f7b360, 0x719ceebe)}, {TOBN(0xf06c7837, 0xdad1fe7a), TOBN(0x60a157a9, 0x5441a0b0), TOBN(0x704970e9, 0xe2d47550), TOBN(0xcd2bd553, 0x271b9020)}}, {{TOBN(0xff57f82f, 0x33e24a0b), TOBN(0x9cbee23f, 0xf2565079), TOBN(0x16353427, 0xeb5f5825), TOBN(0x276feec4, 0xe948d662)}, {TOBN(0xd1b62bc6, 0xda10032b), TOBN(0x718351dd, 0xf0e72a53), TOBN(0x93452076, 0x2420e7ba), TOBN(0x96368fff, 0x3a00118d)}}, {{TOBN(0x00ce2d26, 0x150a49e4), TOBN(0x0c28b636, 0x3f04706b), TOBN(0xbad65a46, 0x58b196d0), TOBN(0x6c8455fc, 0xec9f8b7c)}, {TOBN(0xe90c895f, 0x2d71867e), TOBN(0x5c0be31b, 0xedf9f38c), TOBN(0x2a37a15e, 0xd8f6ec04), TOBN(0x239639e7, 0x8cd85251)}}, {{TOBN(0xd8975315, 0x9c7c4c6b), TOBN(0x603aa3c0, 0xd7409af7), TOBN(0xb8d53d0c, 0x007132fb), TOBN(0x68d12af7, 0xa6849238)}, {TOBN(0xbe0607e7, 0xbf5d9279), TOBN(0x9aa50055, 0xaada74ce), TOBN(0xe81079cb, 0xba7e8ccb), TOBN(0x610c71d1, 0xa5f4ff5e)}}, {{TOBN(0x9e2ee1a7, 0x5aa07093), TOBN(0xca84004b, 0xa75da47c), TOBN(0x074d3951, 0x3de75401), TOBN(0xf938f756, 0xbb311592)}, {TOBN(0x96197618, 0x00a43421), TOBN(0x39a25362, 0x07bc78c8), TOBN(0x278f710a, 0x0a171276), TOBN(0xb28446ea, 0x8d1a8f08)}}, {{TOBN(0x184781bf, 0xe3b6a661), TOBN(0x7751cb1d, 0xe6d279f7), TOBN(0xf8ff95d6, 0xc59eb662), TOBN(0x186d90b7, 0x58d3dea7)}, {TOBN(0x0e4bb6c1, 0xdfb4f754), TOBN(0x5c5cf56b, 0x2b2801dc), TOBN(0xc561e452, 0x1f54564d), TOBN(0xb4fb8c60, 0xf0dd7f13)}}, {{TOBN(0xf8849630, 0x33ff98c7), TOBN(0x9619fffa, 0xcf17769c), TOBN(0xf8090bf6, 0x1bfdd80a), TOBN(0x14d9a149, 0x422cfe63)}, {TOBN(0xb354c360, 0x6f6df9ea), TOBN(0xdbcf770d, 0x218f17ea), TOBN(0x207db7c8, 0x79eb3480), TOBN(0x213dbda8, 0x559b6a26)}}, {{TOBN(0xac4c200b, 0x29fc81b3), TOBN(0xebc3e09f, 0x171d87c1), TOBN(0x91799530, 0x1481aa9e), TOBN(0x051b92e1, 0x92e114fa)}, {TOBN(0xdf8f92e9, 0xecb5537f), TOBN(0x44b1b2cc, 0x290c7483), TOBN(0xa711455a, 0x2adeb016), TOBN(0x964b6856, 0x81a10c2c)}}, {{TOBN(0x4f159d99, 0xcec03623), TOBN(0x05532225, 0xef3271ea), TOBN(0xb231bea3, 0xc5ee4849), TOBN(0x57a54f50, 0x7094f103)}, {TOBN(0x3e2d421d, 0x9598b352), TOBN(0xe865a49c, 0x67412ab4), TOBN(0xd2998a25, 0x1cc3a912), TOBN(0x5d092808, 0x0c74d65d)}}, {{TOBN(0x73f45908, 0x4088567a), TOBN(0xeb6b280e, 0x1f214a61), TOBN(0x8c9adc34, 0xcaf0c13d), TOBN(0x39d12938, 0xf561fb80)}, {TOBN(0xb2dc3a5e, 0xbc6edfb4), TOBN(0x7485b1b1, 0xfe4d210e), TOBN(0x062e0400, 0xe186ae72), TOBN(0x91e32d5c, 0x6eeb3b88)}}, {{TOBN(0x6df574d7, 0x4be59224), TOBN(0xebc88ccc, 0x716d55f3), TOBN(0x26c2e6d0, 0xcad6ed33), TOBN(0xc6e21e7d, 0x0d3e8b10)}, {TOBN(0x2cc5840e, 0x5bcc36bb), TOBN(0x9292445e, 0x7da74f69), TOBN(0x8be8d321, 0x4e5193a8), TOBN(0x3ec23629, 0x8df06413)}}, {{TOBN(0xc7e9ae85, 0xb134defa), TOBN(0x6073b1d0, 0x1bb2d475), TOBN(0xb9ad615e, 0x2863c00d), TOBN(0x9e29493d, 0x525f4ac4)}, {TOBN(0xc32b1dea, 0x4e9acf4f), TOBN(0x3e1f01c8, 0xa50db88d), TOBN(0xb05d70ea, 0x04da916c), TOBN(0x714b0d0a, 0xd865803e)}}, {{TOBN(0x4bd493fc, 0x9920cb5e), TOBN(0x5b44b1f7, 0x92c7a3ac), TOBN(0xa2a77293, 0xbcec9235), TOBN(0x5ee06e87, 0xcd378553)}, {TOBN(0xceff8173, 0xda621607), TOBN(0x2bb03e4c, 0x99f5d290), TOBN(0x2945106a, 0xa6f734ac), TOBN(0xb5056604, 0xd25c4732)}}, {{TOBN(0x5945920c, 
0xe079afee), TOBN(0x686e17a0, 0x6789831f), TOBN(0x5966bee8, 0xb74a5ae5), TOBN(0x38a673a2, 0x1e258d46)}, {TOBN(0xbd1cc1f2, 0x83141c95), TOBN(0x3b2ecf4f, 0x0e96e486), TOBN(0xcd3aa896, 0x74e5fc78), TOBN(0x415ec10c, 0x2482fa7a)}}, {{TOBN(0x15234419, 0x80503380), TOBN(0x513d917a, 0xd314b392), TOBN(0xb0b52f4e, 0x63caecae), TOBN(0x07bf22ad, 0x2dc7780b)}, {TOBN(0xe761e8a1, 0xe4306839), TOBN(0x1b3be962, 0x5dd7feaa), TOBN(0x4fe728de, 0x74c778f1), TOBN(0xf1fa0bda, 0x5e0070f6)}}, {{TOBN(0x85205a31, 0x6ec3f510), TOBN(0x2c7e4a14, 0xd2980475), TOBN(0xde3c19c0, 0x6f30ebfd), TOBN(0xdb1c1f38, 0xd4b7e644)}, {TOBN(0xfe291a75, 0x5dce364a), TOBN(0xb7b22a3c, 0x058f5be3), TOBN(0x2cd2c302, 0x37fea38c), TOBN(0x2930967a, 0x2e17be17)}}, {{TOBN(0x87f009de, 0x0c061c65), TOBN(0xcb014aac, 0xedc6ed44), TOBN(0x49bd1cb4, 0x3bafb1eb), TOBN(0x81bd8b5c, 0x282d3688)}, {TOBN(0x1cdab87e, 0xf01a17af), TOBN(0x21f37ac4, 0xe710063b), TOBN(0x5a6c5676, 0x42fc8193), TOBN(0xf4753e70, 0x56a6015c)}}, {{TOBN(0x020f795e, 0xa15b0a44), TOBN(0x8f37c8d7, 0x8958a958), TOBN(0x63b7e89b, 0xa4b675b5), TOBN(0xb4fb0c0c, 0x0fc31aea)}, {TOBN(0xed95e639, 0xa7ff1f2e), TOBN(0x9880f5a3, 0x619614fb), TOBN(0xdeb6ff02, 0x947151ab), TOBN(0x5bc5118c, 0xa868dcdb)}}, {{TOBN(0xd8da2055, 0x4c20cea5), TOBN(0xcac2776e, 0x14c4d69a), TOBN(0xcccb22c1, 0x622d599b), TOBN(0xa4ddb653, 0x68a9bb50)}, {TOBN(0x2c4ff151, 0x1b4941b4), TOBN(0xe1ff19b4, 0x6efba588), TOBN(0x35034363, 0xc48345e0), TOBN(0x45542e3d, 0x1e29dfc4)}}, {{TOBN(0xf197cb91, 0x349f7aed), TOBN(0x3b2b5a00, 0x8fca8420), TOBN(0x7c175ee8, 0x23aaf6d8), TOBN(0x54dcf421, 0x35af32b6)}, {TOBN(0x0ba14307, 0x27d6561e), TOBN(0x879d5ee4, 0xd175b1e2), TOBN(0xc7c43673, 0x99807db5), TOBN(0x77a54455, 0x9cd55bcd)}}, {{TOBN(0xe6c2ff13, 0x0105c072), TOBN(0x18f7a99f, 0x8dda7da4), TOBN(0x4c301820, 0x0e2d35c1), TOBN(0x06a53ca0, 0xd9cc6c82)}, {TOBN(0xaa21cc1e, 0xf1aa1d9e), TOBN(0x32414334, 0x4a75b1e8), TOBN(0x2a6d1328, 0x0ebe9fdc), TOBN(0x16bd173f, 0x98a4755a)}}, {{TOBN(0xfbb9b245, 0x2133ffd9), TOBN(0x39a8b2f1, 0x830f1a20), TOBN(0x484bc97d, 0xd5a1f52a), TOBN(0xd6aebf56, 0xa40eddf8)}, {TOBN(0x32257acb, 0x76ccdac6), TOBN(0xaf4d36ec, 0x1586ff27), TOBN(0x8eaa8863, 0xf8de7dd1), TOBN(0x0045d5cf, 0x88647c16)}}}, {{{TOBN(0xa6f3d574, 0xc005979d), TOBN(0xc2072b42, 0x6a40e350), TOBN(0xfca5c156, 0x8de2ecf9), TOBN(0xa8c8bf5b, 0xa515344e)}, {TOBN(0x97aee555, 0x114df14a), TOBN(0xd4374a4d, 0xfdc5ec6b), TOBN(0x754cc28f, 0x2ca85418), TOBN(0x71cb9e27, 0xd3c41f78)}}, {{TOBN(0x89105079, 0x03605c39), TOBN(0xf0843d9e, 0xa142c96c), TOBN(0xf3744934, 0x16923684), TOBN(0x732caa2f, 0xfa0a2893)}, {TOBN(0xb2e8c270, 0x61160170), TOBN(0xc32788cc, 0x437fbaa3), TOBN(0x39cd818e, 0xa6eda3ac), TOBN(0xe2e94239, 0x9e2b2e07)}}, {{TOBN(0x6967d39b, 0x0260e52a), TOBN(0xd42585cc, 0x90653325), TOBN(0x0d9bd605, 0x21ca7954), TOBN(0x4fa20877, 0x81ed57b3)}, {TOBN(0x60c1eff8, 0xe34a0bbe), TOBN(0x56b0040c, 0x84f6ef64), TOBN(0x28be2b24, 0xb1af8483), TOBN(0xb2278163, 0xf5531614)}}, {{TOBN(0x8df27545, 0x5922ac1c), TOBN(0xa7b3ef5c, 0xa52b3f63), TOBN(0x8e77b214, 0x71de57c4), TOBN(0x31682c10, 0x834c008b)}, {TOBN(0xc76824f0, 0x4bd55d31), TOBN(0xb6d1c086, 0x17b61c71), TOBN(0x31db0903, 0xc2a5089d), TOBN(0x9c092172, 0x184e5d3f)}}, {{TOBN(0xdd7ced5b, 0xc00cc638), TOBN(0x1a2015eb, 0x61278fc2), TOBN(0x2e8e5288, 0x6a37f8d6), TOBN(0xc457786f, 0xe79933ad)}, {TOBN(0xb3fe4cce, 0x2c51211a), TOBN(0xad9b10b2, 0x24c20498), TOBN(0x90d87a4f, 0xd28db5e5), TOBN(0x698cd105, 0x3aca2fc3)}}, {{TOBN(0x4f112d07, 0xe91b536d), TOBN(0xceb982f2, 0x9eba09d6), TOBN(0x3c157b2c, 0x197c396f), TOBN(0xe23c2d41, 0x7b66eb24)}, 
{TOBN(0x480c57d9, 0x3f330d37), TOBN(0xb3a4c8a1, 0x79108deb), TOBN(0x702388de, 0xcb199ce5), TOBN(0x0b019211, 0xb944a8d4)}}, {{TOBN(0x24f2a692, 0x840bb336), TOBN(0x7c353bdc, 0xa669fa7b), TOBN(0xda20d6fc, 0xdec9c300), TOBN(0x625fbe2f, 0xa13a4f17)}, {TOBN(0xa2b1b61a, 0xdbc17328), TOBN(0x008965bf, 0xa9515621), TOBN(0x49690939, 0xc620ff46), TOBN(0x182dd27d, 0x8717e91c)}}, {{TOBN(0x5ace5035, 0xea6c3997), TOBN(0x54259aaa, 0xc2610bef), TOBN(0xef18bb3f, 0x3c80dd39), TOBN(0x6910b95b, 0x5fc3fa39)}, {TOBN(0xfce2f510, 0x43e09aee), TOBN(0xced56c9f, 0xa7675665), TOBN(0x10e265ac, 0xd872db61), TOBN(0x6982812e, 0xae9fce69)}}, {{TOBN(0x29be11c6, 0xce800998), TOBN(0x72bb1752, 0xb90360d9), TOBN(0x2c193197, 0x5a4ad590), TOBN(0x2ba2f548, 0x9fc1dbc0)}, {TOBN(0x7fe4eebb, 0xe490ebe0), TOBN(0x12a0a4cd, 0x7fae11c0), TOBN(0x7197cf81, 0xe903ba37), TOBN(0xcf7d4aa8, 0xde1c6dd8)}}, {{TOBN(0x92af6bf4, 0x3fd5684c), TOBN(0x2b26eecf, 0x80360aa1), TOBN(0xbd960f30, 0x00546a82), TOBN(0x407b3c43, 0xf59ad8fe)}, {TOBN(0x86cae5fe, 0x249c82ba), TOBN(0x9e0faec7, 0x2463744c), TOBN(0x87f551e8, 0x94916272), TOBN(0x033f9344, 0x6ceb0615)}}, {{TOBN(0x1e5eb0d1, 0x8be82e84), TOBN(0x89967f0e, 0x7a582fef), TOBN(0xbcf687d5, 0xa6e921fa), TOBN(0xdfee4cf3, 0xd37a09ba)}, {TOBN(0x94f06965, 0xb493c465), TOBN(0x638b9a1c, 0x7635c030), TOBN(0x76667864, 0x66f05e9f), TOBN(0xccaf6808, 0xc04da725)}}, {{TOBN(0xca2eb690, 0x768fccfc), TOBN(0xf402d37d, 0xb835b362), TOBN(0x0efac0d0, 0xe2fdfcce), TOBN(0xefc9cdef, 0xb638d990)}, {TOBN(0x2af12b72, 0xd1669a8b), TOBN(0x33c536bc, 0x5774ccbd), TOBN(0x30b21909, 0xfb34870e), TOBN(0xc38fa2f7, 0x7df25aca)}}, {{TOBN(0x74c5f02b, 0xbf81f3f5), TOBN(0x0525a5ae, 0xaf7e4581), TOBN(0x88d2aaba, 0x433c54ae), TOBN(0xed9775db, 0x806a56c5)}, {TOBN(0xd320738a, 0xc0edb37d), TOBN(0x25fdb6ee, 0x66cc1f51), TOBN(0xac661d17, 0x10600d76), TOBN(0x931ec1f3, 0xbdd1ed76)}}, {{TOBN(0x65c11d62, 0x19ee43f1), TOBN(0x5cd57c3e, 0x60829d97), TOBN(0xd26c91a3, 0x984be6e8), TOBN(0xf08d9309, 0x8b0c53bd)}, {TOBN(0x94bc9e5b, 0xc016e4ea), TOBN(0xd3916839, 0x11d43d2b), TOBN(0x886c5ad7, 0x73701155), TOBN(0xe0377626, 0x20b00715)}}, {{TOBN(0x7f01c9ec, 0xaa80ba59), TOBN(0x3083411a, 0x68538e51), TOBN(0x970370f1, 0xe88128af), TOBN(0x625cc3db, 0x91dec14b)}, {TOBN(0xfef9666c, 0x01ac3107), TOBN(0xb2a8d577, 0xd5057ac3), TOBN(0xb0f26299, 0x92be5df7), TOBN(0xf579c8e5, 0x00353924)}}, {{TOBN(0xb8fa3d93, 0x1341ed7a), TOBN(0x4223272c, 0xa7b59d49), TOBN(0x3dcb1947, 0x83b8c4a4), TOBN(0x4e413c01, 0xed1302e4)}, {TOBN(0x6d999127, 0xe17e44ce), TOBN(0xee86bf75, 0x33b3adfb), TOBN(0xf6902fe6, 0x25aa96ca), TOBN(0xb73540e4, 0xe5aae47d)}}, {{TOBN(0x32801d7b, 0x1b4a158c), TOBN(0xe571c99e, 0x27e2a369), TOBN(0x40cb76c0, 0x10d9f197), TOBN(0xc308c289, 0x3167c0ae)}, {TOBN(0xa6ef9dd3, 0xeb7958f2), TOBN(0xa7226dfc, 0x300879b1), TOBN(0x6cd0b362, 0x7edf0636), TOBN(0x4efbce6c, 0x7bc37eed)}}, {{TOBN(0x75f92a05, 0x8d699021), TOBN(0x586d4c79, 0x772566e3), TOBN(0x378ca5f1, 0x761ad23a), TOBN(0x650d86fc, 0x1465a8ac)}, {TOBN(0x7a4ed457, 0x842ba251), TOBN(0x6b65e3e6, 0x42234933), TOBN(0xaf1543b7, 0x31aad657), TOBN(0xa4cefe98, 0xcbfec369)}}, {{TOBN(0xb587da90, 0x9f47befb), TOBN(0x6562e9fb, 0x41312d13), TOBN(0xa691ea59, 0xeff1cefe), TOBN(0xcc30477a, 0x05fc4cf6)}, {TOBN(0xa1632461, 0x0b0ffd3d), TOBN(0xa1f16f3b, 0x5b355956), TOBN(0x5b148d53, 0x4224ec24), TOBN(0xdc834e7b, 0xf977012a)}}, {{TOBN(0x7bfc5e75, 0xb2c69dbc), TOBN(0x3aa77a29, 0x03c3da6c), TOBN(0xde0df03c, 0xca910271), TOBN(0xcbd5ca4a, 0x7806dc55)}, {TOBN(0xe1ca5807, 0x6db476cb), TOBN(0xfde15d62, 0x5f37a31e), TOBN(0xf49af520, 0xf41af416), TOBN(0x96c5c5b1, 
0x7d342db5)}}, {{TOBN(0x155c43b7, 0xeb4ceb9b), TOBN(0x2e993010, 0x4e77371a), TOBN(0x1d2987da, 0x675d43af), TOBN(0xef2bc1c0, 0x8599fd72)}, {TOBN(0x96894b7b, 0x9342f6b2), TOBN(0x201eadf2, 0x7c8e71f0), TOBN(0xf3479d9f, 0x4a1f3efc), TOBN(0xe0f8a742, 0x702a9704)}}, {{TOBN(0xeafd44b6, 0xb3eba40c), TOBN(0xf9739f29, 0xc1c1e0d0), TOBN(0x0091471a, 0x619d505e), TOBN(0xc15f9c96, 0x9d7c263e)}, {TOBN(0x5be47285, 0x83afbe33), TOBN(0xa3b6d6af, 0x04f1e092), TOBN(0xe76526b9, 0x751a9d11), TOBN(0x2ec5b26d, 0x9a4ae4d2)}}, {{TOBN(0xeb66f4d9, 0x02f6fb8d), TOBN(0x4063c561, 0x96912164), TOBN(0xeb7050c1, 0x80ef3000), TOBN(0x288d1c33, 0xeaa5b3f0)}, {TOBN(0xe87c68d6, 0x07806fd8), TOBN(0xb2f7f9d5, 0x4bbbf50f), TOBN(0x25972f3a, 0xac8d6627), TOBN(0xf8547774, 0x10e8c13b)}}, {{TOBN(0xcc50ef6c, 0x872b4a60), TOBN(0xab2a34a4, 0x4613521b), TOBN(0x39c5c190, 0x983e15d1), TOBN(0x61dde5df, 0x59905512)}, {TOBN(0xe417f621, 0x9f2275f3), TOBN(0x0750c8b6, 0x451d894b), TOBN(0x75b04ab9, 0x78b0bdaa), TOBN(0x3bfd9fd4, 0x458589bd)}}, {{TOBN(0xf1013e30, 0xee9120b6), TOBN(0x2b51af93, 0x23a4743e), TOBN(0xea96ffae, 0x48d14d9e), TOBN(0x71dc0dbe, 0x698a1d32)}, {TOBN(0x914962d2, 0x0180cca4), TOBN(0x1ae60677, 0xc3568963), TOBN(0x8cf227b1, 0x437bc444), TOBN(0xc650c83b, 0xc9962c7a)}}, {{TOBN(0x23c2c7dd, 0xfe7ccfc4), TOBN(0xf925c89d, 0x1b929d48), TOBN(0x4460f74b, 0x06783c33), TOBN(0xac2c8d49, 0xa590475a)}, {TOBN(0xfb40b407, 0xb807bba0), TOBN(0x9d1e362d, 0x69ff8f3a), TOBN(0xa33e9681, 0xcbef64a4), TOBN(0x67ece5fa, 0x332fb4b2)}}, {{TOBN(0x6900a99b, 0x739f10e3), TOBN(0xc3341ca9, 0xff525925), TOBN(0xee18a626, 0xa9e2d041), TOBN(0xa5a83685, 0x29580ddd)}, {TOBN(0xf3470c81, 0x9d7de3cd), TOBN(0xedf02586, 0x2062cf9c), TOBN(0xf43522fa, 0xc010edb0), TOBN(0x30314135, 0x13a4b1ae)}}, {{TOBN(0xc792e02a, 0xdb22b94b), TOBN(0x993d8ae9, 0xa1eaa45b), TOBN(0x8aad6cd3, 0xcd1e1c63), TOBN(0x89529ca7, 0xc5ce688a)}, {TOBN(0x2ccee3aa, 0xe572a253), TOBN(0xe02b6438, 0x02a21efb), TOBN(0xa7091b6e, 0xc9430358), TOBN(0x06d1b1fa, 0x9d7db504)}}, {{TOBN(0x58846d32, 0xc4744733), TOBN(0x40517c71, 0x379f9e34), TOBN(0x2f65655f, 0x130ef6ca), TOBN(0x526e4488, 0xf1f3503f)}, {TOBN(0x8467bd17, 0x7ee4a976), TOBN(0x1d9dc913, 0x921363d1), TOBN(0xd8d24c33, 0xb069e041), TOBN(0x5eb5da0a, 0x2cdf7f51)}}, {{TOBN(0x1c0f3cb1, 0x197b994f), TOBN(0x3c95a6c5, 0x2843eae9), TOBN(0x7766ffc9, 0xa6097ea5), TOBN(0x7bea4093, 0xd723b867)}, {TOBN(0xb48e1f73, 0x4db378f9), TOBN(0x70025b00, 0xe37b77ac), TOBN(0x943dc8e7, 0xaf24ad46), TOBN(0xb98a15ac, 0x16d00a85)}}, {{TOBN(0x3adc38ba, 0x2743b004), TOBN(0xb1c7f4f7, 0x334415ee), TOBN(0xea43df8f, 0x1e62d05a), TOBN(0x32618905, 0x9d76a3b6)}, {TOBN(0x2fbd0bb5, 0xa23a0f46), TOBN(0x5bc971db, 0x6a01918c), TOBN(0x7801d94a, 0xb4743f94), TOBN(0xb94df65e, 0x676ae22b)}}, {{TOBN(0xaafcbfab, 0xaf95894c), TOBN(0x7b9bdc07, 0x276b2241), TOBN(0xeaf98362, 0x5bdda48b), TOBN(0x5977faf2, 0xa3fcb4df)}, {TOBN(0xbed042ef, 0x052c4b5b), TOBN(0x9fe87f71, 0x067591f0), TOBN(0xc89c73ca, 0x22f24ec7), TOBN(0x7d37fa9e, 0xe64a9f1b)}}, {{TOBN(0x2710841a, 0x15562627), TOBN(0x2c01a613, 0xc243b034), TOBN(0x1d135c56, 0x2bc68609), TOBN(0xc2ca1715, 0x8b03f1f6)}, {TOBN(0xc9966c2d, 0x3eb81d82), TOBN(0xc02abf4a, 0x8f6df13e), TOBN(0x77b34bd7, 0x8f72b43b), TOBN(0xaff6218f, 0x360c82b0)}}, {{TOBN(0x0aa5726c, 0x8d55b9d2), TOBN(0xdc0adbe9, 0x99e9bffb), TOBN(0x9097549c, 0xefb9e72a), TOBN(0x16755712, 0x9dfb3111)}, {TOBN(0xdd8bf984, 0xf26847f9), TOBN(0xbcb8e387, 0xdfb30cb7), TOBN(0xc1fd32a7, 0x5171ef9c), TOBN(0x977f3fc7, 0x389b363f)}}, {{TOBN(0x116eaf2b, 0xf4babda0), TOBN(0xfeab68bd, 0xf7113c8e), TOBN(0xd1e3f064, 0xb7def526), 
TOBN(0x1ac30885, 0xe0b3fa02)}, {TOBN(0x1c5a6e7b, 0x40142d9d), TOBN(0x839b5603, 0x30921c0b), TOBN(0x48f301fa, 0x36a116a3), TOBN(0x380e1107, 0xcfd9ee6d)}}, {{TOBN(0x7945ead8, 0x58854be1), TOBN(0x4111c12e, 0xcbd4d49d), TOBN(0xece3b1ec, 0x3a29c2ef), TOBN(0x6356d404, 0x8d3616f5)}, {TOBN(0x9f0d6a8f, 0x594d320e), TOBN(0x0989316d, 0xf651ccd2), TOBN(0x6c32117a, 0x0f8fdde4), TOBN(0x9abe5cc5, 0xa26a9bbc)}}, {{TOBN(0xcff560fb, 0x9723f671), TOBN(0x21b2a12d, 0x7f3d593c), TOBN(0xe4cb18da, 0x24ba0696), TOBN(0x186e2220, 0xc3543384)}, {TOBN(0x722f64e0, 0x88312c29), TOBN(0x94282a99, 0x17dc7752), TOBN(0x62467bbf, 0x5a85ee89), TOBN(0xf435c650, 0xf10076a0)}}, {{TOBN(0xc9ff1539, 0x43b3a50b), TOBN(0x7132130c, 0x1a53efbc), TOBN(0x31bfe063, 0xf7b0c5b7), TOBN(0xb0179a7d, 0x4ea994cc)}, {TOBN(0x12d064b3, 0xc85f455b), TOBN(0x47259328, 0x8f6e0062), TOBN(0xf64e590b, 0xb875d6d9), TOBN(0x22dd6225, 0xad92bcc7)}}, {{TOBN(0xb658038e, 0xb9c3bd6d), TOBN(0x00cdb0d6, 0xfbba27c8), TOBN(0x0c681337, 0x1062c45d), TOBN(0xd8515b8c, 0x2d33407d)}, {TOBN(0xcb8f699e, 0x8cbb5ecf), TOBN(0x8c4347f8, 0xc608d7d8), TOBN(0x2c11850a, 0xbb3e00db), TOBN(0x20a8dafd, 0xecb49d19)}}, {{TOBN(0xbd781480, 0x45ee2f40), TOBN(0x75e354af, 0x416b60cf), TOBN(0xde0b58a1, 0x8d49a8c4), TOBN(0xe40e94e2, 0xfa359536)}, {TOBN(0xbd4fa59f, 0x62accd76), TOBN(0x05cf466a, 0x8c762837), TOBN(0xb5abda99, 0x448c277b), TOBN(0x5a9e01bf, 0x48b13740)}}, {{TOBN(0x9d457798, 0x326aad8d), TOBN(0xbdef4954, 0xc396f7e7), TOBN(0x6fb274a2, 0xc253e292), TOBN(0x2800bf0a, 0x1cfe53e7)}, {TOBN(0x22426d31, 0x44438fd4), TOBN(0xef233923, 0x5e259f9a), TOBN(0x4188503c, 0x03f66264), TOBN(0x9e5e7f13, 0x7f9fdfab)}}, {{TOBN(0x565eb76c, 0x5fcc1aba), TOBN(0xea632548, 0x59b5bff8), TOBN(0x5587c087, 0xaab6d3fa), TOBN(0x92b639ea, 0x6ce39c1b)}, {TOBN(0x0706e782, 0x953b135c), TOBN(0x7308912e, 0x425268ef), TOBN(0x599e92c7, 0x090e7469), TOBN(0x83b90f52, 0x9bc35e75)}}, {{TOBN(0x4750b3d0, 0x244975b3), TOBN(0xf3a44358, 0x11965d72), TOBN(0x179c6774, 0x9c8dc751), TOBN(0xff18cdfe, 0xd23d9ff0)}, {TOBN(0xc4013833, 0x2028e247), TOBN(0x96e280e2, 0xf3bfbc79), TOBN(0xf60417bd, 0xd0880a84), TOBN(0x263c9f3d, 0x2a568151)}}, {{TOBN(0x36be15b3, 0x2d2ce811), TOBN(0x846dc0c2, 0xf8291d21), TOBN(0x5cfa0ecb, 0x789fcfdb), TOBN(0x45a0beed, 0xd7535b9a)}, {TOBN(0xec8e9f07, 0x96d69af1), TOBN(0x31a7c5b8, 0x599ab6dc), TOBN(0xd36d45ef, 0xf9e2e09f), TOBN(0x3cf49ef1, 0xdcee954b)}}, {{TOBN(0x6be34cf3, 0x086cff9b), TOBN(0x88dbd491, 0x39a3360f), TOBN(0x1e96b8cc, 0x0dbfbd1d), TOBN(0xc1e5f7bf, 0xcb7e2552)}, {TOBN(0x0547b214, 0x28819d98), TOBN(0xc770dd9c, 0x7aea9dcb), TOBN(0xaef0d4c7, 0x041d68c8), TOBN(0xcc2b9818, 0x13cb9ba8)}}, {{TOBN(0x7fc7bc76, 0xfe86c607), TOBN(0x6b7b9337, 0x502a9a95), TOBN(0x1948dc27, 0xd14dab63), TOBN(0x249dd198, 0xdae047be)}, {TOBN(0xe8356584, 0xa981a202), TOBN(0x3531dd18, 0x3a893387), TOBN(0x1be11f90, 0xc85c7209), TOBN(0x93d2fe1e, 0xe2a52b5a)}}, {{TOBN(0x8225bfe2, 0xec6d6b97), TOBN(0x9cf6d6f4, 0xbd0aa5de), TOBN(0x911459cb, 0x54779f5f), TOBN(0x5649cddb, 0x86aeb1f3)}, {TOBN(0x32133579, 0x3f26ce5a), TOBN(0xc289a102, 0x550f431e), TOBN(0x559dcfda, 0x73b84c6f), TOBN(0x84973819, 0xee3ac4d7)}}, {{TOBN(0xb51e55e6, 0xf2606a82), TOBN(0xe25f7061, 0x90f2fb57), TOBN(0xacef6c2a, 0xb1a4e37c), TOBN(0x864e359d, 0x5dcf2706)}, {TOBN(0x479e6b18, 0x7ce57316), TOBN(0x2cab2500, 0x3a96b23d), TOBN(0xed489862, 0x8ef16df7), TOBN(0x2056538c, 0xef3758b5)}}, {{TOBN(0xa7df865e, 0xf15d3101), TOBN(0x80c5533a, 0x61b553d7), TOBN(0x366e1997, 0x4ed14294), TOBN(0x6620741f, 0xb3c0bcd6)}, {TOBN(0x21d1d9c4, 0xedc45418), TOBN(0x005b859e, 0xc1cc4a9d), TOBN(0xdf01f630, 
0xa1c462f0), TOBN(0x15d06cf3, 0xf26820c7)}}, {{TOBN(0x9f7f24ee, 0x3484be47), TOBN(0x2ff33e96, 0x4a0c902f), TOBN(0x00bdf457, 0x5a0bc453), TOBN(0x2378dfaf, 0x1aa238db)}, {TOBN(0x272420ec, 0x856720f2), TOBN(0x2ad9d95b, 0x96797291), TOBN(0xd1242cc6, 0x768a1558), TOBN(0x2e287f8b, 0x5cc86aa8)}}, {{TOBN(0x796873d0, 0x990cecaa), TOBN(0xade55f81, 0x675d4080), TOBN(0x2645eea3, 0x21f0cd84), TOBN(0x7a1efa0f, 0xb4e17d02)}, {TOBN(0xf6858420, 0x037cc061), TOBN(0x682e05f0, 0xd5d43e12), TOBN(0x59c36994, 0x27218710), TOBN(0x85cbba4d, 0x3f7cd2fc)}}, {{TOBN(0x726f9729, 0x7a3cd22a), TOBN(0x9f8cd5dc, 0x4a628397), TOBN(0x17b93ab9, 0xc23165ed), TOBN(0xff5f5dbf, 0x122823d4)}, {TOBN(0xc1e4e4b5, 0x654a446d), TOBN(0xd1a9496f, 0x677257ba), TOBN(0x6387ba94, 0xde766a56), TOBN(0x23608bc8, 0x521ec74a)}}, {{TOBN(0x16a522d7, 0x6688c4d4), TOBN(0x9d6b4282, 0x07373abd), TOBN(0xa62f07ac, 0xb42efaa3), TOBN(0xf73e00f7, 0xe3b90180)}, {TOBN(0x36175fec, 0x49421c3e), TOBN(0xc4e44f9b, 0x3dcf2678), TOBN(0x76df436b, 0x7220f09f), TOBN(0x172755fb, 0x3aa8b6cf)}}, {{TOBN(0xbab89d57, 0x446139cc), TOBN(0x0a0a6e02, 0x5fe0208f), TOBN(0xcdbb63e2, 0x11e5d399), TOBN(0x33ecaa12, 0xa8977f0b)}, {TOBN(0x59598b21, 0xf7c42664), TOBN(0xb3e91b32, 0xab65d08a), TOBN(0x035822ee, 0xf4502526), TOBN(0x1dcf0176, 0x720a82a9)}}, {{TOBN(0x50f8598f, 0x3d589e02), TOBN(0xdf0478ff, 0xb1d63d2c), TOBN(0x8b8068bd, 0x1571cd07), TOBN(0x30c3aa4f, 0xd79670cd)}, {TOBN(0x25e8fd4b, 0x941ade7f), TOBN(0x3d1debdc, 0x32790011), TOBN(0x65b6dcbd, 0x3a3f9ff0), TOBN(0x282736a4, 0x793de69c)}}, {{TOBN(0xef69a0c3, 0xd41d3bd3), TOBN(0xb533b8c9, 0x07a26bde), TOBN(0xe2801d97, 0xdb2edf9f), TOBN(0xdc4a8269, 0xe1877af0)}, {TOBN(0x6c1c5851, 0x3d590dbe), TOBN(0x84632f6b, 0xee4e9357), TOBN(0xd36d36b7, 0x79b33374), TOBN(0xb46833e3, 0x9bbca2e6)}}, {{TOBN(0x37893913, 0xf7fc0586), TOBN(0x385315f7, 0x66bf4719), TOBN(0x72c56293, 0xb31855dc), TOBN(0xd1416d4e, 0x849061fe)}, {TOBN(0xbeb3ab78, 0x51047213), TOBN(0x447f6e61, 0xf040c996), TOBN(0xd06d310d, 0x638b1d0c), TOBN(0xe28a413f, 0xbad1522e)}}, {{TOBN(0x685a76cb, 0x82003f86), TOBN(0x610d07f7, 0x0bcdbca3), TOBN(0x6ff66021, 0x9ca4c455), TOBN(0x7df39b87, 0xcea10eec)}, {TOBN(0xb9255f96, 0xe22db218), TOBN(0x8cc6d9eb, 0x08a34c44), TOBN(0xcd4ffb86, 0x859f9276), TOBN(0x8fa15eb2, 0x50d07335)}}, {{TOBN(0xdf553845, 0xcf2c24b5), TOBN(0x89f66a9f, 0x52f9c3ba), TOBN(0x8f22b5b9, 0xe4a7ceb3), TOBN(0xaffef809, 0x0e134686)}, {TOBN(0x3e53e1c6, 0x8eb8fac2), TOBN(0x93c1e4eb, 0x28aec98e), TOBN(0xb6b91ec5, 0x32a43bcb), TOBN(0x2dbfa947, 0xb2d74a51)}}, {{TOBN(0xe065d190, 0xca84bad7), TOBN(0xfb13919f, 0xad58e65c), TOBN(0x3c41718b, 0xf1cb6e31), TOBN(0x688969f0, 0x06d05c3f)}, {TOBN(0xd4f94ce7, 0x21264d45), TOBN(0xfdfb65e9, 0x7367532b), TOBN(0x5b1be8b1, 0x0945a39d), TOBN(0x229f789c, 0x2b8baf3b)}}, {{TOBN(0xd8f41f3e, 0x6f49f15d), TOBN(0x678ce828, 0x907f0792), TOBN(0xc69ace82, 0xfca6e867), TOBN(0x106451ae, 0xd01dcc89)}, {TOBN(0x1bb4f7f0, 0x19fc32d2), TOBN(0x64633dfc, 0xb00c52d2), TOBN(0x8f13549a, 0xad9ea445), TOBN(0x99a3bf50, 0xfb323705)}}, {{TOBN(0x0c9625a2, 0x534d4dbc), TOBN(0x45b8f1d1, 0xc2a2fea3), TOBN(0x76ec21a1, 0xa530fc1a), TOBN(0x4bac9c2a, 0x9e5bd734)}, {TOBN(0x5996d76a, 0x7b4e3587), TOBN(0x0045cdee, 0x1182d9e3), TOBN(0x1aee24b9, 0x1207f13d), TOBN(0x66452e97, 0x97345a41)}}, {{TOBN(0x16e5b054, 0x9f950cd0), TOBN(0x9cc72fb1, 0xd7fdd075), TOBN(0x6edd61e7, 0x66249663), TOBN(0xde4caa4d, 0xf043cccb)}, {TOBN(0x11b1f57a, 0x55c7ac17), TOBN(0x779cbd44, 0x1a85e24d), TOBN(0x78030f86, 0xe46081e7), TOBN(0xfd4a6032, 0x8e20f643)}}, {{TOBN(0xcc7a6488, 0x0a750c0f), TOBN(0x39bacfe3, 0x4e548e83), 
TOBN(0x3d418c76, 0x0c110f05), TOBN(0x3e4daa4c, 0xb1f11588)}, {TOBN(0x2733e7b5, 0x5ffc69ff), TOBN(0x46f147bc, 0x92053127), TOBN(0x885b2434, 0xd722df94), TOBN(0x6a444f65, 0xe6fc6b7c)}}}, {{{TOBN(0x7a1a465a, 0xc3f16ea8), TOBN(0x115a461d, 0xb2f1d11c), TOBN(0x4767dd95, 0x6c68a172), TOBN(0x3392f2eb, 0xd13a4698)}, {TOBN(0xc7a99ccd, 0xe526cdc7), TOBN(0x8e537fdc, 0x22292b81), TOBN(0x76d8cf69, 0xa6d39198), TOBN(0xffc5ff43, 0x2446852d)}}, {{TOBN(0x97b14f7e, 0xa90567e6), TOBN(0x513257b7, 0xb6ae5cb7), TOBN(0x85454a3c, 0x9f10903d), TOBN(0xd8d2c9ad, 0x69bc3724)}, {TOBN(0x38da9324, 0x6b29cb44), TOBN(0xb540a21d, 0x77c8cbac), TOBN(0x9bbfe435, 0x01918e42), TOBN(0xfffa707a, 0x56c3614e)}}, {{TOBN(0x0ce4e3f1, 0xd4e353b7), TOBN(0x062d8a14, 0xef46b0a0), TOBN(0x6408d5ab, 0x574b73fd), TOBN(0xbc41d1c9, 0xd3273ffd)}, {TOBN(0x3538e1e7, 0x6be77800), TOBN(0x71fe8b37, 0xc5655031), TOBN(0x1cd91621, 0x6b9b331a), TOBN(0xad825d0b, 0xbb388f73)}}, {{TOBN(0x56c2e05b, 0x1cb76219), TOBN(0x0ec0bf91, 0x71567e7e), TOBN(0xe7076f86, 0x61c4c910), TOBN(0xd67b085b, 0xbabc04d9)}, {TOBN(0x9fb90459, 0x5e93a96a), TOBN(0x7526c1ea, 0xfbdc249a), TOBN(0x0d44d367, 0xecdd0bb7), TOBN(0x95399917, 0x9dc0d695)}}, {{TOBN(0x61360ee9, 0x9e240d18), TOBN(0x057cdcac, 0xb4b94466), TOBN(0xe7667cd1, 0x2fe5325c), TOBN(0x1fa297b5, 0x21974e3b)}, {TOBN(0xfa4081e7, 0xdb083d76), TOBN(0x31993be6, 0xf206bd15), TOBN(0x8949269b, 0x14c19f8c), TOBN(0x21468d72, 0xa9d92357)}}, {{TOBN(0x2ccbc583, 0xa4c506ec), TOBN(0x957ed188, 0xd1acfe97), TOBN(0x8baed833, 0x12f1aea2), TOBN(0xef2a6cb4, 0x8325362d)}, {TOBN(0x130dde42, 0x8e195c43), TOBN(0xc842025a, 0x0e6050c6), TOBN(0x2da972a7, 0x08686a5d), TOBN(0xb52999a1, 0xe508b4a8)}}, {{TOBN(0xd9f090b9, 0x10a5a8bd), TOBN(0xca91d249, 0x096864da), TOBN(0x8e6a93be, 0x3f67dbc1), TOBN(0xacae6fba, 0xf5f4764c)}, {TOBN(0x1563c6e0, 0xd21411a0), TOBN(0x28fa787f, 0xda0a4ad8), TOBN(0xd524491c, 0x908c8030), TOBN(0x1257ba0e, 0x4c795f07)}}, {{TOBN(0x83f49167, 0xceca9754), TOBN(0x426d2cf6, 0x4b7939a0), TOBN(0x2555e355, 0x723fd0bf), TOBN(0xa96e6d06, 0xc4f144e2)}, {TOBN(0x4768a8dd, 0x87880e61), TOBN(0x15543815, 0xe508e4d5), TOBN(0x09d7e772, 0xb1b65e15), TOBN(0x63439dd6, 0xac302fa0)}}, {{TOBN(0xb93f802f, 0xc14e35c2), TOBN(0x71735b7c, 0x4341333c), TOBN(0x03a25104, 0x16d4f362), TOBN(0x3f4d069b, 0xbf433c8e)}, {TOBN(0x0d83ae01, 0xf78f5a7c), TOBN(0x50a8ffbe, 0x7c4eed07), TOBN(0xc74f8906, 0x76e10f83), TOBN(0x7d080966, 0x9ddaf8e1)}}, {{TOBN(0xb11df8e1, 0x698e04cc), TOBN(0x877be203, 0x169005c8), TOBN(0x32749e8c, 0x4f3c6179), TOBN(0x2dbc9d0a, 0x7853fc05)}, {TOBN(0x187d4f93, 0x9454d937), TOBN(0xe682ce9d, 0xb4800e1b), TOBN(0xa9129ad8, 0x165e68e8), TOBN(0x0fe29735, 0xbe7f785b)}}, {{TOBN(0x5303f40c, 0x5b9e02b7), TOBN(0xa37c9692, 0x35ee04e8), TOBN(0x5f46cc20, 0x34d6632b), TOBN(0x55ef72b2, 0x96ac545b)}, {TOBN(0xabec5c1f, 0x7b91b062), TOBN(0x0a79e1c7, 0xbb33e821), TOBN(0xbb04b428, 0x3a9f4117), TOBN(0x0de1f28f, 0xfd2a475a)}}, {{TOBN(0x31019ccf, 0x3a4434b4), TOBN(0xa3458111, 0x1a7954dc), TOBN(0xa9dac80d, 0xe34972a7), TOBN(0xb043d054, 0x74f6b8dd)}, {TOBN(0x021c319e, 0x11137b1a), TOBN(0x00a754ce, 0xed5cc03f), TOBN(0x0aa2c794, 0xcbea5ad4), TOBN(0x093e67f4, 0x70c015b6)}}, {{TOBN(0x72cdfee9, 0xc97e3f6b), TOBN(0xc10bcab4, 0xb6da7461), TOBN(0x3b02d2fc, 0xb59806b9), TOBN(0x85185e89, 0xa1de6f47)}, {TOBN(0x39e6931f, 0x0eb6c4d4), TOBN(0x4d4440bd, 0xd4fa5b04), TOBN(0x5418786e, 0x34be7eb8), TOBN(0x6380e521, 0x9d7259bc)}}, {{TOBN(0x20ac0351, 0xd598d710), TOBN(0x272c4166, 0xcb3a4da4), TOBN(0xdb82fe1a, 0xca71de1f), TOBN(0x746e79f2, 0xd8f54b0f)}, {TOBN(0x6e7fc736, 0x4b573e9b), TOBN(0x75d03f46, 
0xfd4b5040), TOBN(0x5c1cc36d, 0x0b98d87b), TOBN(0x513ba3f1, 0x1f472da1)}}, {{TOBN(0x79d0af26, 0xabb177dd), TOBN(0xf82ab568, 0x7891d564), TOBN(0x2b6768a9, 0x72232173), TOBN(0xefbb3bb0, 0x8c1f6619)}, {TOBN(0xb29c11db, 0xa6d18358), TOBN(0x519e2797, 0xb0916d3a), TOBN(0xd4dc18f0, 0x9188e290), TOBN(0x648e86e3, 0x98b0ca7f)}}, {{TOBN(0x859d3145, 0x983c38b5), TOBN(0xb14f176c, 0x637abc8b), TOBN(0x2793fb9d, 0xcaff7be6), TOBN(0xebe5a55f, 0x35a66a5a)}, {TOBN(0x7cec1dcd, 0x9f87dc59), TOBN(0x7c595cd3, 0xfbdbf560), TOBN(0x5b543b22, 0x26eb3257), TOBN(0x69080646, 0xc4c935fd)}}, {{TOBN(0x7f2e4403, 0x81e9ede3), TOBN(0x243c3894, 0xcaf6df0a), TOBN(0x7c605bb1, 0x1c073b11), TOBN(0xcd06a541, 0xba6a4a62)}, {TOBN(0x29168949, 0x49d4e2e5), TOBN(0x33649d07, 0x4af66880), TOBN(0xbfc0c885, 0xe9a85035), TOBN(0xb4e52113, 0xfc410f4b)}}, {{TOBN(0xdca3b706, 0x78a6513b), TOBN(0x92ea4a2a, 0x9edb1943), TOBN(0x02642216, 0xdb6e2dd8), TOBN(0x9b45d0b4, 0x9fd57894)}, {TOBN(0x114e70db, 0xc69d11ae), TOBN(0x1477dd19, 0x4c57595f), TOBN(0xbc2208b4, 0xec77c272), TOBN(0x95c5b4d7, 0xdb68f59c)}}, {{TOBN(0xb8c4fc63, 0x42e532b7), TOBN(0x386ba422, 0x9ae35290), TOBN(0xfb5dda42, 0xd201ecbc), TOBN(0x2353dc8b, 0xa0e38fd6)}, {TOBN(0x9a0b85ea, 0x68f7e978), TOBN(0x96ec5682, 0x2ad6d11f), TOBN(0x5e279d6c, 0xe5f6886d), TOBN(0xd3fe03cd, 0x3cb1914d)}}, {{TOBN(0xfe541fa4, 0x7ea67c77), TOBN(0x952bd2af, 0xe3ea810c), TOBN(0x791fef56, 0x8d01d374), TOBN(0xa3a1c621, 0x0f11336e)}, {TOBN(0x5ad0d5a9, 0xc7ec6d79), TOBN(0xff7038af, 0x3225c342), TOBN(0x003c6689, 0xbc69601b), TOBN(0x25059bc7, 0x45e8747d)}}, {{TOBN(0xfa4965b2, 0xf2086fbf), TOBN(0xf6840ea6, 0x86916078), TOBN(0xd7ac7620, 0x70081d6c), TOBN(0xe600da31, 0xb5328645)}, {TOBN(0x01916f63, 0x529b8a80), TOBN(0xe80e4858, 0x2d7d6f3e), TOBN(0x29eb0fe8, 0xd664ca7c), TOBN(0xf017637b, 0xe7b43b0c)}}, {{TOBN(0x9a75c806, 0x76cb2566), TOBN(0x8f76acb1, 0xb24892d9), TOBN(0x7ae7b9cc, 0x1f08fe45), TOBN(0x19ef7329, 0x6a4907d8)}, {TOBN(0x2db4ab71, 0x5f228bf0), TOBN(0xf3cdea39, 0x817032d7), TOBN(0x0b1f482e, 0xdcabe3c0), TOBN(0x3baf76b4, 0xbb86325c)}}, {{TOBN(0xd49065e0, 0x10089465), TOBN(0x3bab5d29, 0x8e77c596), TOBN(0x7636c3a6, 0x193dbd95), TOBN(0xdef5d294, 0xb246e499)}, {TOBN(0xb22c58b9, 0x286b2475), TOBN(0xa0b93939, 0xcd80862b), TOBN(0x3002c83a, 0xf0992388), TOBN(0x6de01f9b, 0xeacbe14c)}}, {{TOBN(0x6aac688e, 0xadd70482), TOBN(0x708de92a, 0x7b4a4e8a), TOBN(0x75b6dd73, 0x758a6eef), TOBN(0xea4bf352, 0x725b3c43)}, {TOBN(0x10041f2c, 0x87912868), TOBN(0xb1b1be95, 0xef09297a), TOBN(0x19ae23c5, 0xa9f3860a), TOBN(0xc4f0f839, 0x515dcf4b)}}, {{TOBN(0x3c7ecca3, 0x97f6306a), TOBN(0x744c44ae, 0x68a3a4b0), TOBN(0x69cd13a0, 0xb3a1d8a2), TOBN(0x7cad0a1e, 0x5256b578)}, {TOBN(0xea653fcd, 0x33791d9e), TOBN(0x9cc2a05d, 0x74b2e05f), TOBN(0x73b391dc, 0xfd7affa2), TOBN(0xddb7091e, 0xb6b05442)}}, {{TOBN(0xc71e27bf, 0x8538a5c6), TOBN(0x195c63dd, 0x89abff17), TOBN(0xfd315285, 0x1b71e3da), TOBN(0x9cbdfda7, 0xfa680fa0)}, {TOBN(0x9db876ca, 0x849d7eab), TOBN(0xebe2764b, 0x3c273271), TOBN(0x663357e3, 0xf208dcea), TOBN(0x8c5bd833, 0x565b1b70)}}, {{TOBN(0xccc3b4f5, 0x9837fc0d), TOBN(0x9b641ba8, 0xa79cf00f), TOBN(0x7428243d, 0xdfdf3990), TOBN(0x83a594c4, 0x020786b1)}, {TOBN(0xb712451a, 0x526c4502), TOBN(0x9d39438e, 0x6adb3f93), TOBN(0xfdb261e3, 0xe9ff0ccd), TOBN(0x80344e3c, 0xe07af4c3)}}, {{TOBN(0x75900d7c, 0x2fa4f126), TOBN(0x08a3b865, 0x5c99a232), TOBN(0x2478b6bf, 0xdb25e0c3), TOBN(0x482cc2c2, 0x71db2edf)}, {TOBN(0x37df7e64, 0x5f321bb8), TOBN(0x8a93821b, 0x9a8005b4), TOBN(0x3fa2f10c, 0xcc8c1958), TOBN(0x0d332218, 0x2c269d0a)}}, {{TOBN(0x20ab8119, 0xe246b0e6), 
TOBN(0xb39781e4, 0xd349fd17), TOBN(0xd293231e, 0xb31aa100), TOBN(0x4b779c97, 0xbb032168)}, {TOBN(0x4b3f19e1, 0xc8470500), TOBN(0x45b7efe9, 0x0c4c869d), TOBN(0xdb84f38a, 0xa1a6bbcc), TOBN(0x3b59cb15, 0xb2fddbc1)}}, {{TOBN(0xba5514df, 0x3fd165e8), TOBN(0x499fd6a9, 0x061f8811), TOBN(0x72cd1fe0, 0xbfef9f00), TOBN(0x120a4bb9, 0x79ad7e8a)}, {TOBN(0xf2ffd095, 0x5f4a5ac5), TOBN(0xcfd174f1, 0x95a7a2f0), TOBN(0xd42301ba, 0x9d17baf1), TOBN(0xd2fa487a, 0x77f22089)}}, {{TOBN(0x9cb09efe, 0xb1dc77e1), TOBN(0xe9566939, 0x21c99682), TOBN(0x8c546901, 0x6c6067bb), TOBN(0xfd378574, 0x61c24456)}, {TOBN(0x2b6a6cbe, 0x81796b33), TOBN(0x62d550f6, 0x58e87f8b), TOBN(0x1b763e1c, 0x7f1b01b4), TOBN(0x4b93cfea, 0x1b1b5e12)}}, {{TOBN(0xb9345238, 0x1d531696), TOBN(0x57201c00, 0x88cdde69), TOBN(0xdde92251, 0x9a86afc7), TOBN(0xe3043895, 0xbd35cea8)}, {TOBN(0x7608c1e1, 0x8555970d), TOBN(0x8267dfa9, 0x2535935e), TOBN(0xd4c60a57, 0x322ea38b), TOBN(0xe0bf7977, 0x804ef8b5)}}, {{TOBN(0x1a0dab28, 0xc06fece4), TOBN(0xd405991e, 0x94e7b49d), TOBN(0xc542b6d2, 0x706dab28), TOBN(0xcb228da3, 0xa91618fb)}, {TOBN(0x224e4164, 0x107d1cea), TOBN(0xeb9fdab3, 0xd0f5d8f1), TOBN(0xc02ba386, 0x0d6e41cd), TOBN(0x676a72c5, 0x9b1f7146)}}, {{TOBN(0xffd6dd98, 0x4d6cb00b), TOBN(0xcef9c5ca, 0xde2e8d7c), TOBN(0xa1bbf5d7, 0x641c7936), TOBN(0x1b95b230, 0xee8f772e)}, {TOBN(0xf765a92e, 0xe8ac25b1), TOBN(0xceb04cfc, 0x3a18b7c6), TOBN(0x27944cef, 0x0acc8966), TOBN(0xcbb3c957, 0x434c1004)}}, {{TOBN(0x9c9971a1, 0xa43ff93c), TOBN(0x5bc2db17, 0xa1e358a9), TOBN(0x45b4862e, 0xa8d9bc82), TOBN(0x70ebfbfb, 0x2201e052)}, {TOBN(0xafdf64c7, 0x92871591), TOBN(0xea5bcae6, 0xb42d0219), TOBN(0xde536c55, 0x2ad8f03c), TOBN(0xcd6c3f4d, 0xa76aa33c)}}, {{TOBN(0xbeb5f623, 0x0bca6de3), TOBN(0xdd20dd99, 0xb1e706fd), TOBN(0x90b3ff9d, 0xac9059d4), TOBN(0x2d7b2902, 0x7ccccc4e)}, {TOBN(0x8a090a59, 0xce98840f), TOBN(0xa5d947e0, 0x8410680a), TOBN(0x49ae346a, 0x923379a5), TOBN(0x7dbc84f9, 0xb28a3156)}}, {{TOBN(0xfd40d916, 0x54a1aff2), TOBN(0xabf318ba, 0x3a78fb9b), TOBN(0x50152ed8, 0x3029f95e), TOBN(0x9fc1dd77, 0xc58ad7fa)}, {TOBN(0x5fa57915, 0x13595c17), TOBN(0xb9504668, 0x8f62b3a9), TOBN(0x907b5b24, 0xff3055b0), TOBN(0x2e995e35, 0x9a84f125)}}, {{TOBN(0x87dacf69, 0x7e9bbcfb), TOBN(0x95d0c1d6, 0xe86d96e3), TOBN(0x65726e3c, 0x2d95a75c), TOBN(0x2c3c9001, 0xacd27f21)}, {TOBN(0x1deab561, 0x6c973f57), TOBN(0x108b7e2c, 0xa5221643), TOBN(0x5fee9859, 0xc4ef79d4), TOBN(0xbd62b88a, 0x40d4b8c6)}}, {{TOBN(0xb4dd29c4, 0x197c75d6), TOBN(0x266a6df2, 0xb7076feb), TOBN(0x9512d0ea, 0x4bf2df11), TOBN(0x1320c24f, 0x6b0cc9ec)}, {TOBN(0x6bb1e0e1, 0x01a59596), TOBN(0x8317c5bb, 0xeff9aaac), TOBN(0x65bb405e, 0x385aa6c9), TOBN(0x613439c1, 0x8f07988f)}}, {{TOBN(0xd730049f, 0x16a66e91), TOBN(0xe97f2820, 0xfa1b0e0d), TOBN(0x4131e003, 0x304c28ea), TOBN(0x820ab732, 0x526bac62)}, {TOBN(0xb2ac9ef9, 0x28714423), TOBN(0x54ecfffa, 0xadb10cb2), TOBN(0x8781476e, 0xf886a4cc), TOBN(0x4b2c87b5, 0xdb2f8d49)}}, {{TOBN(0xe857cd20, 0x0a44295d), TOBN(0x707d7d21, 0x58c6b044), TOBN(0xae8521f9, 0xf596757c), TOBN(0x87448f03, 0x67b2b714)}, {TOBN(0x13a9bc45, 0x5ebcd58d), TOBN(0x79bcced9, 0x9122d3c1), TOBN(0x3c644247, 0x9e076642), TOBN(0x0cf22778, 0x2df4767d)}}, {{TOBN(0x5e61aee4, 0x71d444b6), TOBN(0x211236bf, 0xc5084a1d), TOBN(0x7e15bc9a, 0x4fd3eaf6), TOBN(0x68df2c34, 0xab622bf5)}, {TOBN(0x9e674f0f, 0x59bf4f36), TOBN(0xf883669b, 0xd7f34d73), TOBN(0xc48ac1b8, 0x31497b1d), TOBN(0x323b925d, 0x5106703b)}}, {{TOBN(0x22156f42, 0x74082008), TOBN(0xeffc521a, 0xc8482bcb), TOBN(0x5c6831bf, 0x12173479), TOBN(0xcaa2528f, 0xc4739490)}, {TOBN(0x84d2102a, 
0x8f1b3c4d), TOBN(0xcf64dfc1, 0x2d9bec0d), TOBN(0x433febad, 0x78a546ef), TOBN(0x1f621ec3, 0x7b73cef1)}}, {{TOBN(0x6aecd627, 0x37338615), TOBN(0x162082ab, 0x01d8edf6), TOBN(0x833a8119, 0x19e86b66), TOBN(0x6023a251, 0xd299b5db)}, {TOBN(0xf5bb0c3a, 0xbbf04b89), TOBN(0x6735eb69, 0xae749a44), TOBN(0xd0e058c5, 0x4713de3b), TOBN(0xfdf2593e, 0x2c3d4ccd)}}, {{TOBN(0x1b8f414e, 0xfdd23667), TOBN(0xdd52aaca, 0xfa2015ee), TOBN(0x3e31b517, 0xbd9625ff), TOBN(0x5ec9322d, 0x8db5918c)}, {TOBN(0xbc73ac85, 0xa96f5294), TOBN(0x82aa5bf3, 0x61a0666a), TOBN(0x49755810, 0xbf08ac42), TOBN(0xd21cdfd5, 0x891cedfc)}}, {{TOBN(0x918cb57b, 0x67f8be10), TOBN(0x365d1a7c, 0x56ffa726), TOBN(0x2435c504, 0x6532de93), TOBN(0xc0fc5e10, 0x2674cd02)}, {TOBN(0x6e51fcf8, 0x9cbbb142), TOBN(0x1d436e5a, 0xafc50692), TOBN(0x766bffff, 0x3fbcae22), TOBN(0x3148c2fd, 0xfd55d3b8)}}, {{TOBN(0x52c7fdc9, 0x233222fa), TOBN(0x89ff1092, 0xe419fb6b), TOBN(0x3cd6db99, 0x25254977), TOBN(0x2e85a161, 0x1cf12ca7)}, {TOBN(0xadd2547c, 0xdc810bc9), TOBN(0xea3f458f, 0x9d257c22), TOBN(0x642c1fbe, 0x27d6b19b), TOBN(0xed07e6b5, 0x140481a6)}}, {{TOBN(0x6ada1d42, 0x86d2e0f8), TOBN(0xe5920122, 0x0e8a9fd5), TOBN(0x02c936af, 0x708c1b49), TOBN(0x60f30fee, 0x2b4bfaff)}, {TOBN(0x6637ad06, 0x858e6a61), TOBN(0xce4c7767, 0x3fd374d0), TOBN(0x39d54b2d, 0x7188defb), TOBN(0xa8c9d250, 0xf56a6b66)}}, {{TOBN(0x58fc0f5e, 0xb24fe1dc), TOBN(0x9eaf9dee, 0x6b73f24c), TOBN(0xa90d588b, 0x33650705), TOBN(0xde5b62c5, 0xaf2ec729)}, {TOBN(0x5c72cfae, 0xd3c2b36e), TOBN(0x868c19d5, 0x034435da), TOBN(0x88605f93, 0xe17ee145), TOBN(0xaa60c4ee, 0x77a5d5b1)}}, {{TOBN(0xbcf5bfd2, 0x3b60c472), TOBN(0xaf4ef13c, 0xeb1d3049), TOBN(0x373f44fc, 0xe13895c9), TOBN(0xf29b382f, 0x0cbc9822)}, {TOBN(0x1bfcb853, 0x73efaef6), TOBN(0xcf56ac9c, 0xa8c96f40), TOBN(0xd7adf109, 0x7a191e24), TOBN(0x98035f44, 0xbf8a8dc2)}}, {{TOBN(0xf40a71b9, 0x1e750c84), TOBN(0xc57f7b0c, 0x5dc6c469), TOBN(0x49a0e79c, 0x6fbc19c1), TOBN(0x6b0f5889, 0xa48ebdb8)}, {TOBN(0x5d3fd084, 0xa07c4e9f), TOBN(0xc3830111, 0xab27de14), TOBN(0x0e4929fe, 0x33e08dcc), TOBN(0xf4a5ad24, 0x40bb73a3)}}, {{TOBN(0xde86c2bf, 0x490f97ca), TOBN(0x288f09c6, 0x67a1ce18), TOBN(0x364bb886, 0x1844478d), TOBN(0x7840fa42, 0xceedb040)}, {TOBN(0x1269fdd2, 0x5a631b37), TOBN(0x94761f1e, 0xa47c8b7d), TOBN(0xfc0c2e17, 0x481c6266), TOBN(0x85e16ea2, 0x3daa5fa7)}}, {{TOBN(0xccd86033, 0x92491048), TOBN(0x0c2f6963, 0xf4d402d7), TOBN(0x6336f7df, 0xdf6a865c), TOBN(0x0a2a463c, 0xb5c02a87)}, {TOBN(0xb0e29be7, 0xbf2f12ee), TOBN(0xf0a22002, 0x66bad988), TOBN(0x27f87e03, 0x9123c1d7), TOBN(0x21669c55, 0x328a8c98)}}, {{TOBN(0x186b9803, 0x92f14529), TOBN(0xd3d056cc, 0x63954df3), TOBN(0x2f03fd58, 0x175a46f6), TOBN(0x63e34ebe, 0x11558558)}, {TOBN(0xe13fedee, 0x5b80cfa5), TOBN(0xe872a120, 0xd401dbd1), TOBN(0x52657616, 0xe8a9d667), TOBN(0xbc8da4b6, 0xe08d6693)}}, {{TOBN(0x370fb9bb, 0x1b703e75), TOBN(0x6773b186, 0xd4338363), TOBN(0x18dad378, 0xecef7bff), TOBN(0xaac787ed, 0x995677da)}, {TOBN(0x4801ea8b, 0x0437164b), TOBN(0xf430ad20, 0x73fe795e), TOBN(0xb164154d, 0x8ee5eb73), TOBN(0x0884ecd8, 0x108f7c0e)}}, {{TOBN(0x0e6ec096, 0x5f520698), TOBN(0x640631fe, 0x44f7b8d9), TOBN(0x92fd34fc, 0xa35a68b9), TOBN(0x9c5a4b66, 0x4d40cf4e)}, {TOBN(0x949454bf, 0x80b6783d), TOBN(0x80e701fe, 0x3a320a10), TOBN(0x8d1a564a, 0x1a0a39b2), TOBN(0x1436d53d, 0x320587db)}}, {{TOBN(0xf5096e6d, 0x6556c362), TOBN(0xbc23a3c0, 0xe2455d7e), TOBN(0x3a7aee54, 0x807230f9), TOBN(0x9ba1cfa6, 0x22ae82fd)}, {TOBN(0x833a057a, 0x99c5d706), TOBN(0x8be85f4b, 0x842315c9), TOBN(0xd083179a, 0x66a72f12), TOBN(0x2fc77d5d, 0xcdcc73cd)}}, 
{{TOBN(0x22b88a80, 0x5616ee30), TOBN(0xfb09548f, 0xe7ab1083), TOBN(0x8ad6ab0d, 0x511270cd), TOBN(0x61f6c57a, 0x6924d9ab)}, {TOBN(0xa0f7bf72, 0x90aecb08), TOBN(0x849f87c9, 0x0df784a4), TOBN(0x27c79c15, 0xcfaf1d03), TOBN(0xbbf9f675, 0xc463face)}}, {{TOBN(0x91502c65, 0x765ba543), TOBN(0x18ce3cac, 0x42ea60dd), TOBN(0xe5cee6ac, 0x6e43ecb3), TOBN(0x63e4e910, 0x68f2aeeb)}, {TOBN(0x26234fa3, 0xc85932ee), TOBN(0x96883e8b, 0x4c90c44d), TOBN(0x29b9e738, 0xa18a50f6), TOBN(0xbfc62b2a, 0x3f0420df)}}, {{TOBN(0xd22a7d90, 0x6d3e1fa9), TOBN(0x17115618, 0xfe05b8a3), TOBN(0x2a0c9926, 0xbb2b9c01), TOBN(0xc739fcc6, 0xe07e76a2)}, {TOBN(0x540e9157, 0x165e439a), TOBN(0x06353a62, 0x6a9063d8), TOBN(0x84d95594, 0x61e927a3), TOBN(0x013b9b26, 0xe2e0be7f)}}, {{TOBN(0x4feaec3b, 0x973497f1), TOBN(0x15c0f94e, 0x093ebc2d), TOBN(0x6af5f227, 0x33af0583), TOBN(0x0c2af206, 0xc61f3340)}, {TOBN(0xd25dbdf1, 0x4457397c), TOBN(0x2e8ed017, 0xcabcbae0), TOBN(0xe3010938, 0xc2815306), TOBN(0xbaa99337, 0xe8c6cd68)}}, {{TOBN(0x08513182, 0x3b0ec7de), TOBN(0x1e1b822b, 0x58df05df), TOBN(0x5c14842f, 0xa5c3b683), TOBN(0x98fe977e, 0x3eba34ce)}, {TOBN(0xfd2316c2, 0x0d5e8873), TOBN(0xe48d839a, 0xbd0d427d), TOBN(0x495b2218, 0x623fc961), TOBN(0x24ee56e7, 0xb46fba5e)}}, {{TOBN(0x9184a55b, 0x91e4de58), TOBN(0xa7488ca5, 0xdfdea288), TOBN(0xa723862e, 0xa8dcc943), TOBN(0x92d762b2, 0x849dc0fc)}, {TOBN(0x3c444a12, 0x091ff4a9), TOBN(0x581113fa, 0x0cada274), TOBN(0xb9de0a45, 0x30d8eae2), TOBN(0x5e0fcd85, 0xdf6b41ea)}}, {{TOBN(0x6233ea68, 0xc094dbb5), TOBN(0xb77d062e, 0xd968d410), TOBN(0x3e719bbc, 0x58b3002d), TOBN(0x68e7dd3d, 0x3dc49d58)}, {TOBN(0x8d825740, 0x013a5e58), TOBN(0x21311747, 0x3c9e3c1b), TOBN(0x0cb0a2a7, 0x7c99b6ab), TOBN(0x5c48a3b3, 0xc2f888f2)}}}, {{{TOBN(0xc7913e91, 0x991724f3), TOBN(0x5eda799c, 0x39cbd686), TOBN(0xddb595c7, 0x63d4fc1e), TOBN(0x6b63b80b, 0xac4fed54)}, {TOBN(0x6ea0fc69, 0x7e5fb516), TOBN(0x737708ba, 0xd0f1c964), TOBN(0x9628745f, 0x11a92ca5), TOBN(0x61f37958, 0x9a86967a)}}, {{TOBN(0x9af39b2c, 0xaa665072), TOBN(0x78322fa4, 0xefd324ef), TOBN(0x3d153394, 0xc327bd31), TOBN(0x81d5f271, 0x3129dab0)}, {TOBN(0xc72e0c42, 0xf48027f5), TOBN(0xaa40cdbc, 0x8536e717), TOBN(0xf45a657a, 0x2d369d0f), TOBN(0xb03bbfc4, 0xea7f74e6)}}, {{TOBN(0x46a8c418, 0x0d738ded), TOBN(0x6f1a5bb0, 0xe0de5729), TOBN(0xf10230b9, 0x8ba81675), TOBN(0x32c6f30c, 0x112b33d4)}, {TOBN(0x7559129d, 0xd8fffb62), TOBN(0x6a281b47, 0xb459bf05), TOBN(0x77c1bd3a, 0xfa3b6776), TOBN(0x0709b380, 0x7829973a)}}, {{TOBN(0x8c26b232, 0xa3326505), TOBN(0x38d69272, 0xee1d41bf), TOBN(0x0459453e, 0xffe32afa), TOBN(0xce8143ad, 0x7cb3ea87)}, {TOBN(0x932ec1fa, 0x7e6ab666), TOBN(0x6cd2d230, 0x22286264), TOBN(0x459a46fe, 0x6736f8ed), TOBN(0x50bf0d00, 0x9eca85bb)}}, {{TOBN(0x0b825852, 0x877a21ec), TOBN(0x300414a7, 0x0f537a94), TOBN(0x3f1cba40, 0x21a9a6a2), TOBN(0x50824eee, 0x76943c00)}, {TOBN(0xa0dbfcec, 0xf83cba5d), TOBN(0xf9538148, 0x93b4f3c0), TOBN(0x61744162, 0x48f24dd7), TOBN(0x5322d64d, 0xe4fb09dd)}}, {{TOBN(0x57447384, 0x3d9325f3), TOBN(0xa9bef2d0, 0xf371cb84), TOBN(0x77d2188b, 0xa61e36c5), TOBN(0xbbd6a7d7, 0xc602df72)}, {TOBN(0xba3aa902, 0x8f61bc0b), TOBN(0xf49085ed, 0x6ed0b6a1), TOBN(0x8bc625d6, 0xae6e8298), TOBN(0x832b0b1d, 0xa2e9c01d)}}, {{TOBN(0xa337c447, 0xf1f0ced1), TOBN(0x800cc793, 0x9492dd2b), TOBN(0x4b93151d, 0xbea08efa), TOBN(0x820cf3f8, 0xde0a741e)}, {TOBN(0xff1982dc, 0x1c0f7d13), TOBN(0xef921960, 0x84dde6ca), TOBN(0x1ad7d972, 0x45f96ee3), TOBN(0x319c8dbe, 0x29dea0c7)}}, {{TOBN(0xd3ea3871, 0x7b82b99b), TOBN(0x75922d4d, 0x470eb624), TOBN(0x8f66ec54, 0x3b95d466), TOBN(0x66e673cc, 
0xbee1e346)}, {TOBN(0x6afe67c4, 0xb5f2b89a), TOBN(0x3de9c1e6, 0x290e5cd3), TOBN(0x8c278bb6, 0x310a2ada), TOBN(0x420fa384, 0x0bdb323b)}}, {{TOBN(0x0ae1d63b, 0x0eb919b0), TOBN(0xd74ee51d, 0xa74b9620), TOBN(0x395458d0, 0xa674290c), TOBN(0x324c930f, 0x4620a510)}, {TOBN(0x2d1f4d19, 0xfbac27d4), TOBN(0x4086e8ca, 0x9bedeeac), TOBN(0x0cdd211b, 0x9b679ab8), TOBN(0x5970167d, 0x7090fec4)}}, {{TOBN(0x3420f2c9, 0xfaf1fc63), TOBN(0x616d333a, 0x328c8bb4), TOBN(0x7d65364c, 0x57f1fe4a), TOBN(0x9343e877, 0x55e5c73a)}, {TOBN(0x5795176b, 0xe970e78c), TOBN(0xa36ccebf, 0x60533627), TOBN(0xfc7c7380, 0x09cdfc1b), TOBN(0xb39a2afe, 0xb3fec326)}}, {{TOBN(0xb7ff1ba1, 0x6224408a), TOBN(0xcc856e92, 0x247cfc5e), TOBN(0x01f102e7, 0xc18bc493), TOBN(0x4613ab74, 0x2091c727)}, {TOBN(0xaa25e89c, 0xc420bf2b), TOBN(0x00a53176, 0x90337ec2), TOBN(0xd2be9f43, 0x7d025fc7), TOBN(0x3316fb85, 0x6e6fe3dc)}}, {{TOBN(0x27520af5, 0x9ac50814), TOBN(0xfdf95e78, 0x9a8e4223), TOBN(0xb7e7df2a, 0x56bec5a0), TOBN(0xf7022f7d, 0xdf159e5d)}, {TOBN(0x93eeeab1, 0xcac1fe8f), TOBN(0x8040188c, 0x37451168), TOBN(0x7ee8aa8a, 0xd967dce6), TOBN(0xfa0e79e7, 0x3abc9299)}}, {{TOBN(0x67332cfc, 0x2064cfd1), TOBN(0x339c31de, 0xb0651934), TOBN(0x719b28d5, 0x2a3bcbea), TOBN(0xee74c82b, 0x9d6ae5c6)}, {TOBN(0x0927d05e, 0xbaf28ee6), TOBN(0x82cecf2c, 0x9d719028), TOBN(0x0b0d353e, 0xddb30289), TOBN(0xfe4bb977, 0xfddb2e29)}}, {{TOBN(0xbb5bb990, 0x640bfd9e), TOBN(0xd226e277, 0x82f62108), TOBN(0x4bf00985, 0x02ffdd56), TOBN(0x7756758a, 0x2ca1b1b5)}, {TOBN(0xc32b62a3, 0x5285fe91), TOBN(0xedbc546a, 0x8c9cd140), TOBN(0x1e47a013, 0xaf5cb008), TOBN(0xbca7e720, 0x073ce8f2)}}, {{TOBN(0xe10b2ab8, 0x17a91cae), TOBN(0xb89aab65, 0x08e27f63), TOBN(0x7b3074a7, 0xdba3ddf9), TOBN(0x1c20ce09, 0x330c2972)}, {TOBN(0x6b9917b4, 0x5fcf7e33), TOBN(0xe6793743, 0x945ceb42), TOBN(0x18fc2215, 0x5c633d19), TOBN(0xad1adb3c, 0xc7485474)}}, {{TOBN(0x646f9679, 0x6424c49b), TOBN(0xf888dfe8, 0x67c241c9), TOBN(0xe12d4b93, 0x24f68b49), TOBN(0x9a6b62d8, 0xa571df20)}, {TOBN(0x81b4b26d, 0x179483cb), TOBN(0x666f9632, 0x9511fae2), TOBN(0xd281b3e4, 0xd53aa51f), TOBN(0x7f96a765, 0x7f3dbd16)}}, {{TOBN(0xa7f8b5bf, 0x074a30ce), TOBN(0xd7f52107, 0x005a32e6), TOBN(0x6f9e0907, 0x50237ed4), TOBN(0x2f21da47, 0x8096fa2b)}, {TOBN(0xf3e19cb4, 0xeec863a0), TOBN(0xd18f77fd, 0x9527620a), TOBN(0x9505c81c, 0x407c1cf8), TOBN(0x9998db4e, 0x1b6ec284)}}, {{TOBN(0x7e3389e5, 0xc247d44d), TOBN(0x12507141, 0x3f4f3d80), TOBN(0xd4ba0110, 0x4a78a6c7), TOBN(0x312874a0, 0x767720be)}, {TOBN(0xded059a6, 0x75944370), TOBN(0xd6123d90, 0x3b2c0bdd), TOBN(0xa56b717b, 0x51c108e3), TOBN(0x9bb7940e, 0x070623e9)}}, {{TOBN(0x794e2d59, 0x84ac066c), TOBN(0xf5954a92, 0xe68c69a0), TOBN(0x28c52458, 0x4fd99dcc), TOBN(0x60e639fc, 0xb1012517)}, {TOBN(0xc2e60125, 0x7de79248), TOBN(0xe9ef6404, 0xf12fc6d7), TOBN(0x4c4f2808, 0x2a3b5d32), TOBN(0x865ad32e, 0xc768eb8a)}}, {{TOBN(0xac02331b, 0x13fb70b6), TOBN(0x037b44c1, 0x95599b27), TOBN(0x1a860fc4, 0x60bd082c), TOBN(0xa2e25745, 0xc980cd01)}, {TOBN(0xee3387a8, 0x1da0263e), TOBN(0x931bfb95, 0x2d10f3d6), TOBN(0x5b687270, 0xa1f24a32), TOBN(0xf140e65d, 0xca494b86)}}, {{TOBN(0x4f4ddf91, 0xb2f1ac7a), TOBN(0xf99eaabb, 0x760fee27), TOBN(0x57f4008a, 0x49c228e5), TOBN(0x090be440, 0x1cf713bb)}, {TOBN(0xac91fbe4, 0x5004f022), TOBN(0xd838c2c2, 0x569e1af6), TOBN(0xd6c7d20b, 0x0f1daaa5), TOBN(0xaa063ac1, 0x1bbb02c0)}}, {{TOBN(0x0938a422, 0x59558a78), TOBN(0x5343c669, 0x8435da2f), TOBN(0x96f67b18, 0x034410dc), TOBN(0x7cc1e424, 0x84510804)}, {TOBN(0x86a1543f, 0x16dfbb7d), TOBN(0x921fa942, 0x5b5bd592), TOBN(0x9dcccb6e, 0xb33dd03c), 
TOBN(0x8581ddd9, 0xb843f51e)}}, {{TOBN(0x54935fcb, 0x81d73c9e), TOBN(0x6d07e979, 0x0a5e97ab), TOBN(0x4dc7b30a, 0xcf3a6bab), TOBN(0x147ab1f3, 0x170bee11)}, {TOBN(0x0aaf8e3d, 0x9fafdee4), TOBN(0xfab3dbcb, 0x538a8b95), TOBN(0x405df4b3, 0x6ef13871), TOBN(0xf1f4e9cb, 0x088d5a49)}}, {{TOBN(0x9bcd24d3, 0x66b33f1d), TOBN(0x3b97b820, 0x5ce445c0), TOBN(0xe2926549, 0xba93ff61), TOBN(0xd9c341ce, 0x4dafe616)}, {TOBN(0xfb30a76e, 0x16efb6f3), TOBN(0xdf24b8ca, 0x605b953c), TOBN(0x8bd52afe, 0xc2fffb9f), TOBN(0xbbac5ff7, 0xe19d0b96)}}, {{TOBN(0x43c01b87, 0x459afccd), TOBN(0x6bd45143, 0xb7432652), TOBN(0x84734530, 0x55b5d78e), TOBN(0x81088fdb, 0x1554ba7d)}, {TOBN(0xada0a52c, 0x1e269375), TOBN(0xf9f037c4, 0x2dc5ec10), TOBN(0xc0660607, 0x94bfbc11), TOBN(0xc0a630bb, 0xc9c40d2f)}}, {{TOBN(0x5efc797e, 0xab64c31e), TOBN(0xffdb1dab, 0x74507144), TOBN(0xf6124287, 0x1ca6790c), TOBN(0xe9609d81, 0xe69bf1bf)}, {TOBN(0xdb898595, 0x00d24fc9), TOBN(0x9c750333, 0xe51fb417), TOBN(0x51830a91, 0xfef7bbde), TOBN(0x0ce67dc8, 0x945f585c)}}, {{TOBN(0x9a730ed4, 0x4763eb50), TOBN(0x24a0e221, 0xc1ab0d66), TOBN(0x643b6393, 0x648748f3), TOBN(0x1982daa1, 0x6d3c6291)}, {TOBN(0x6f00a9f7, 0x8bbc5549), TOBN(0x7a1783e1, 0x7f36384e), TOBN(0xe8346323, 0xde977f50), TOBN(0x91ab688d, 0xb245502a)}}, {{TOBN(0x331ab6b5, 0x6d0bdd66), TOBN(0x0a6ef32e, 0x64b71229), TOBN(0x1028150e, 0xfe7c352f), TOBN(0x27e04350, 0xce7b39d3)}, {TOBN(0x2a3c8acd, 0xc1070c82), TOBN(0xfb2034d3, 0x80c9feef), TOBN(0x2d729621, 0x709f3729), TOBN(0x8df290bf, 0x62cb4549)}}, {{TOBN(0x02f99f33, 0xfc2e4326), TOBN(0x3b30076d, 0x5eddf032), TOBN(0xbb21f8cf, 0x0c652fb5), TOBN(0x314fb49e, 0xed91cf7b)}, {TOBN(0xa013eca5, 0x2f700750), TOBN(0x2b9e3c23, 0x712a4575), TOBN(0xe5355557, 0xaf30fbb0), TOBN(0x1ada3516, 0x7c77e771)}}, {{TOBN(0x45f6ecb2, 0x7b135670), TOBN(0xe85d19df, 0x7cfc202e), TOBN(0x0f1b50c7, 0x58d1be9f), TOBN(0x5ebf2c0a, 0xead2e344)}, {TOBN(0x1531fe4e, 0xabc199c9), TOBN(0xc7032592, 0x56bab0ae), TOBN(0x16ab2e48, 0x6c1fec54), TOBN(0x0f87fda8, 0x04280188)}}, {{TOBN(0xdc9f46fc, 0x609e4a74), TOBN(0x2a44a143, 0xba667f91), TOBN(0xbc3d8b95, 0xb4d83436), TOBN(0xa01e4bd0, 0xc7bd2958)}, {TOBN(0x7b182932, 0x73483c90), TOBN(0xa79c6aa1, 0xa7c7b598), TOBN(0xbf3983c6, 0xeaaac07e), TOBN(0x8f18181e, 0x96e0d4e6)}}, {{TOBN(0x8553d37c, 0x051af62b), TOBN(0xe9a998eb, 0x0bf94496), TOBN(0xe0844f9f, 0xb0d59aa1), TOBN(0x983fd558, 0xe6afb813)}, {TOBN(0x9670c0ca, 0x65d69804), TOBN(0x732b22de, 0x6ea5ff2d), TOBN(0xd7640ba9, 0x5fd8623b), TOBN(0x9f619163, 0xa6351782)}}, {{TOBN(0x0bfc27ee, 0xacee5043), TOBN(0xae419e73, 0x2eb10f02), TOBN(0x19c028d1, 0x8943fb05), TOBN(0x71f01cf7, 0xff13aa2a)}, {TOBN(0x7790737e, 0x8887a132), TOBN(0x67513309, 0x66318410), TOBN(0x9819e8a3, 0x7ddb795e), TOBN(0xfecb8ef5, 0xdad100b2)}}, {{TOBN(0x59f74a22, 0x3021926a), TOBN(0xb7c28a49, 0x6f9b4c1c), TOBN(0xed1a733f, 0x912ad0ab), TOBN(0x42a910af, 0x01a5659c)}, {TOBN(0x3842c6e0, 0x7bd68cab), TOBN(0x2b57fa38, 0x76d70ac8), TOBN(0x8a6707a8, 0x3c53aaeb), TOBN(0x62c1c510, 0x65b4db18)}}, {{TOBN(0x8de2c1fb, 0xb2d09dc7), TOBN(0xc3dfed12, 0x266bd23b), TOBN(0x927d039b, 0xd5b27db6), TOBN(0x2fb2f0f1, 0x103243da)}, {TOBN(0xf855a07b, 0x80be7399), TOBN(0xed9327ce, 0x1f9f27a8), TOBN(0xa0bd99c7, 0x729bdef7), TOBN(0x2b67125e, 0x28250d88)}}, {{TOBN(0x784b26e8, 0x8670ced7), TOBN(0xe3dfe41f, 0xc31bd3b4), TOBN(0x9e353a06, 0xbcc85cbc), TOBN(0x302e2909, 0x60178a9d)}, {TOBN(0x860abf11, 0xa6eac16e), TOBN(0x76447000, 0xaa2b3aac), TOBN(0x46ff9d19, 0x850afdab), TOBN(0x35bdd6a5, 0xfdb2d4c1)}}, {{TOBN(0xe82594b0, 0x7e5c9ce9), TOBN(0x0f379e53, 0x20af346e), TOBN(0x608b31e3, 
0xbc65ad4a), TOBN(0x710c6b12, 0x267c4826)}, {TOBN(0x51c966f9, 0x71954cf1), TOBN(0xb1cec793, 0x0d0aa215), TOBN(0x1f155989, 0x86bd23a8), TOBN(0xae2ff99c, 0xf9452e86)}}, {{TOBN(0xd8dd953c, 0x340ceaa2), TOBN(0x26355275, 0x2e2e9333), TOBN(0x15d4e5f9, 0x8586f06d), TOBN(0xd6bf94a8, 0xf7cab546)}, {TOBN(0x33c59a0a, 0xb76a9af0), TOBN(0x52740ab3, 0xba095af7), TOBN(0xc444de8a, 0x24389ca0), TOBN(0xcc6f9863, 0x706da0cb)}}, {{TOBN(0xb5a741a7, 0x6b2515cf), TOBN(0x71c41601, 0x9585c749), TOBN(0x78350d4f, 0xe683de97), TOBN(0x31d61524, 0x63d0b5f5)}, {TOBN(0x7a0cc5e1, 0xfbce090b), TOBN(0xaac927ed, 0xfbcb2a5b), TOBN(0xe920de49, 0x20d84c35), TOBN(0x8c06a0b6, 0x22b4de26)}}, {{TOBN(0xd34dd58b, 0xafe7ddf3), TOBN(0x55851fed, 0xc1e6e55b), TOBN(0xd1395616, 0x960696e7), TOBN(0x940304b2, 0x5f22705f)}, {TOBN(0x6f43f861, 0xb0a2a860), TOBN(0xcf121282, 0x0e7cc981), TOBN(0x12186212, 0x0ab64a96), TOBN(0x09215b9a, 0xb789383c)}}, {{TOBN(0x311eb305, 0x37387c09), TOBN(0xc5832fce, 0xf03ee760), TOBN(0x30358f58, 0x32f7ea19), TOBN(0xe01d3c34, 0x91d53551)}, {TOBN(0x1ca5ee41, 0xda48ea80), TOBN(0x34e71e8e, 0xcf4fa4c1), TOBN(0x312abd25, 0x7af1e1c7), TOBN(0xe3afcdeb, 0x2153f4a5)}}, {{TOBN(0x9d5c84d7, 0x00235e9a), TOBN(0x0308d3f4, 0x8c4c836f), TOBN(0xc0a66b04, 0x89332de5), TOBN(0x610dd399, 0x89e566ef)}, {TOBN(0xf8eea460, 0xd1ac1635), TOBN(0x84cbb3fb, 0x20a2c0df), TOBN(0x40afb488, 0xe74a48c5), TOBN(0x29738198, 0xd326b150)}}, {{TOBN(0x2a17747f, 0xa6d74081), TOBN(0x60ea4c05, 0x55a26214), TOBN(0x53514bb4, 0x1f88c5fe), TOBN(0xedd64567, 0x7e83426c)}, {TOBN(0xd5d6cbec, 0x96460b25), TOBN(0xa12fd0ce, 0x68dc115e), TOBN(0xc5bc3ed2, 0x697840ea), TOBN(0x969876a8, 0xa6331e31)}}, {{TOBN(0x60c36217, 0x472ff580), TOBN(0xf4229705, 0x4ad41393), TOBN(0x4bd99ef0, 0xa03b8b92), TOBN(0x501c7317, 0xc144f4f6)}, {TOBN(0x159009b3, 0x18464945), TOBN(0x6d5e594c, 0x74c5c6be), TOBN(0x2d587011, 0x321a3660), TOBN(0xd1e184b1, 0x3898d022)}}, {{TOBN(0x5ba04752, 0x4c6a7e04), TOBN(0x47fa1e2b, 0x45550b65), TOBN(0x9419daf0, 0x48c0a9a5), TOBN(0x66362953, 0x7c243236)}, {TOBN(0xcd0744b1, 0x5cb12a88), TOBN(0x561b6f9a, 0x2b646188), TOBN(0x599415a5, 0x66c2c0c0), TOBN(0xbe3f0859, 0x0f83f09a)}}, {{TOBN(0x9141c5be, 0xb92041b8), TOBN(0x01ae38c7, 0x26477d0d), TOBN(0xca8b71f3, 0xd12c7a94), TOBN(0xfab5b31f, 0x765c70db)}, {TOBN(0x76ae7492, 0x487443e9), TOBN(0x8595a310, 0x990d1349), TOBN(0xf8dbeda8, 0x7d460a37), TOBN(0x7f7ad082, 0x1e45a38f)}}, {{TOBN(0xed1d4db6, 0x1059705a), TOBN(0xa3dd492a, 0xe6b9c697), TOBN(0x4b92ee3a, 0x6eb38bd5), TOBN(0xbab2609d, 0x67cc0bb7)}, {TOBN(0x7fc4fe89, 0x6e70ee82), TOBN(0xeff2c56e, 0x13e6b7e3), TOBN(0x9b18959e, 0x34d26fca), TOBN(0x2517ab66, 0x889d6b45)}}, {{TOBN(0xf167b4e0, 0xbdefdd4f), TOBN(0x69958465, 0xf366e401), TOBN(0x5aa368ab, 0xa73bbec0), TOBN(0x12148709, 0x7b240c21)}, {TOBN(0x378c3233, 0x18969006), TOBN(0xcb4d73ce, 0xe1fe53d1), TOBN(0x5f50a80e, 0x130c4361), TOBN(0xd67f5951, 0x7ef5212b)}}, {{TOBN(0xf145e21e, 0x9e70c72e), TOBN(0xb2e52e29, 0x5566d2fb), TOBN(0x44eaba4a, 0x032397f5), TOBN(0x5e56937b, 0x7e31a7de)}, {TOBN(0x68dcf517, 0x456c61e1), TOBN(0xbc2e954a, 0xa8b0a388), TOBN(0xe3552fa7, 0x60a8b755), TOBN(0x03442dae, 0x73ad0cde)}}, {{TOBN(0x37ffe747, 0xceb26210), TOBN(0x983545e8, 0x787baef9), TOBN(0x8b8c8535, 0x86a3de31), TOBN(0xc621dbcb, 0xfacd46db)}, {TOBN(0x82e442e9, 0x59266fbb), TOBN(0xa3514c37, 0x339d471c), TOBN(0x3a11b771, 0x62cdad96), TOBN(0xf0cb3b3c, 0xecf9bdf0)}}, {{TOBN(0x3fcbdbce, 0x478e2135), TOBN(0x7547b5cf, 0xbda35342), TOBN(0xa97e81f1, 0x8a677af6), TOBN(0xc8c2bf83, 0x28817987)}, {TOBN(0xdf07eaaf, 0x45580985), TOBN(0xc68d1f05, 0xc93b45cb), 
TOBN(0x106aa2fe, 0xc77b4cac), TOBN(0x4c1d8afc, 0x04a7ae86)}}, {{TOBN(0xdb41c3fd, 0x9eb45ab2), TOBN(0x5b234b5b, 0xd4b22e74), TOBN(0xda253dec, 0xf215958a), TOBN(0x67e0606e, 0xa04edfa0)}, {TOBN(0xabbbf070, 0xef751b11), TOBN(0xf352f175, 0xf6f06dce), TOBN(0xdfc4b6af, 0x6839f6b4), TOBN(0x53ddf9a8, 0x9959848e)}}, {{TOBN(0xda49c379, 0xc21520b0), TOBN(0x90864ff0, 0xdbd5d1b6), TOBN(0x2f055d23, 0x5f49c7f7), TOBN(0xe51e4e6a, 0xa796b2d8)}, {TOBN(0xc361a67f, 0x5c9dc340), TOBN(0x5ad53c37, 0xbca7c620), TOBN(0xda1d6588, 0x32c756d0), TOBN(0xad60d911, 0x8bb67e13)}}, {{TOBN(0xd6c47bdf, 0x0eeec8c6), TOBN(0x4a27fec1, 0x078a1821), TOBN(0x081f7415, 0xc3099524), TOBN(0x8effdf0b, 0x82cd8060)}, {TOBN(0xdb70ec1c, 0x65842df8), TOBN(0x8821b358, 0xd319a901), TOBN(0x72ee56ee, 0xde42b529), TOBN(0x5bb39592, 0x236e4286)}}, {{TOBN(0xd1183316, 0xfd6f7140), TOBN(0xf9fadb5b, 0xbd8e81f7), TOBN(0x701d5e0c, 0x5a02d962), TOBN(0xfdee4dbf, 0x1b601324)}, {TOBN(0xbed17407, 0x35d7620e), TOBN(0x04e3c2c3, 0xf48c0012), TOBN(0x9ee29da7, 0x3455449a), TOBN(0x562cdef4, 0x91a836c4)}}, {{TOBN(0x8f682a5f, 0x47701097), TOBN(0x617125d8, 0xff88d0c2), TOBN(0x948fda24, 0x57bb86dd), TOBN(0x348abb8f, 0x289f7286)}, {TOBN(0xeb10eab5, 0x99d94bbd), TOBN(0xd51ba28e, 0x4684d160), TOBN(0xabe0e51c, 0x30c8f41a), TOBN(0x66588b45, 0x13254f4a)}}, {{TOBN(0x147ebf01, 0xfad097a5), TOBN(0x49883ea8, 0x610e815d), TOBN(0xe44d60ba, 0x8a11de56), TOBN(0xa970de6e, 0x827a7a6d)}, {TOBN(0x2be41424, 0x5e17fc19), TOBN(0xd833c657, 0x01214057), TOBN(0x1375813b, 0x363e723f), TOBN(0x6820bb88, 0xe6a52e9b)}}, {{TOBN(0x7e7f6970, 0xd875d56a), TOBN(0xd6a0a9ac, 0x51fbf6bf), TOBN(0x54ba8790, 0xa3083c12), TOBN(0xebaeb23d, 0x6ae7eb64)}, {TOBN(0xa8685c3a, 0xb99a907a), TOBN(0xf1e74550, 0x026bf40b), TOBN(0x7b73a027, 0xc802cd9e), TOBN(0x9a8a927c, 0x4fef4635)}}, {{TOBN(0xe1b6f60c, 0x08191224), TOBN(0xc4126ebb, 0xde4ec091), TOBN(0xe1dff4dc, 0x4ae38d84), TOBN(0xde3f57db, 0x4f2ef985)}, {TOBN(0x34964337, 0xd446a1dd), TOBN(0x7bf217a0, 0x859e77f6), TOBN(0x8ff10527, 0x8e1d13f5), TOBN(0xa304ef03, 0x74eeae27)}}, {{TOBN(0xfc6f5e47, 0xd19dfa5a), TOBN(0xdb007de3, 0x7fad982b), TOBN(0x28205ad1, 0x613715f5), TOBN(0x251e6729, 0x7889529e)}, {TOBN(0x72705184, 0x1ae98e78), TOBN(0xf818537d, 0x271cac32), TOBN(0xc8a15b7e, 0xb7f410f5), TOBN(0xc474356f, 0x81f62393)}}, {{TOBN(0x92dbdc5a, 0xc242316b), TOBN(0xabe060ac, 0xdbf4aff5), TOBN(0x6e8c38fe, 0x909a8ec6), TOBN(0x43e514e5, 0x6116cb94)}, {TOBN(0x2078fa38, 0x07d784f9), TOBN(0x1161a880, 0xf4b5b357), TOBN(0x5283ce79, 0x13adea3d), TOBN(0x0756c3e6, 0xcc6a910b)}}, {{TOBN(0x60bcfe01, 0xaaa79697), TOBN(0x04a73b29, 0x56391db1), TOBN(0xdd8dad47, 0x189b45a0), TOBN(0xbfac0dd0, 0x48d5b8d9)}, {TOBN(0x34ab3af5, 0x7d3d2ec2), TOBN(0x6fa2fc2d, 0x207bd3af), TOBN(0x9ff40092, 0x66550ded), TOBN(0x719b3e87, 0x1fd5b913)}}, {{TOBN(0xa573a496, 0x6d17fbc7), TOBN(0x0cd1a70a, 0x73d2b24e), TOBN(0x34e2c5ca, 0xb2676937), TOBN(0xe7050b06, 0xbf669f21)}, {TOBN(0xfbe948b6, 0x1ede9046), TOBN(0xa0530051, 0x97662659), TOBN(0x58cbd4ed, 0xf10124c5), TOBN(0xde2646e4, 0xdd6c06c8)}}, {{TOBN(0x332f8108, 0x8cad38c0), TOBN(0x471b7e90, 0x6bd68ae2), TOBN(0x56ac3fb2, 0x0d8e27a3), TOBN(0xb54660db, 0x136b4b0d)}, {TOBN(0x123a1e11, 0xa6fd8de4), TOBN(0x44dbffea, 0xa37799ef), TOBN(0x4540b977, 0xce6ac17c), TOBN(0x495173a8, 0xaf60acef)}}}, {{{TOBN(0x9ebb284d, 0x391c2a82), TOBN(0xbcdd4863, 0x158308e8), TOBN(0x006f16ec, 0x83f1edca), TOBN(0xa13e2c37, 0x695dc6c8)}, {TOBN(0x2ab756f0, 0x4a057a87), TOBN(0xa8765500, 0xa6b48f98), TOBN(0x4252face, 0x68651c44), TOBN(0xa52b540b, 0xe1765e02)}}, {{TOBN(0x4f922fc5, 0x16a0d2bb), 
TOBN(0x0d5cc16c, 0x1a623499), TOBN(0x9241cf3a, 0x57c62c8b), TOBN(0x2f5e6961, 0xfd1b667f)}, {TOBN(0x5c15c70b, 0xf5a01797), TOBN(0x3d20b44d, 0x60956192), TOBN(0x04911b37, 0x071fdb52), TOBN(0xf648f916, 0x8d6f0f7b)}}, {{TOBN(0x6dc1acaf, 0xe60b7cf7), TOBN(0x25860a50, 0x84a9d869), TOBN(0x56fc6f09, 0xe7ba8ac4), TOBN(0x828c5bd0, 0x6148d29e)}, {TOBN(0xac6b435e, 0xdc55ae5f), TOBN(0xa527f56c, 0xc0117411), TOBN(0x94d5045e, 0xfd24342c), TOBN(0x2c4c0a35, 0x70b67c0d)}}, {{TOBN(0x027cc8b8, 0xfac61d9a), TOBN(0x7d25e062, 0xe3c6fe8a), TOBN(0xe08805bf, 0xe5bff503), TOBN(0x13271e6c, 0x6ff632f7)}, {TOBN(0x55dca6c0, 0x232f76a5), TOBN(0x8957c32d, 0x701ef426), TOBN(0xee728bcb, 0xa10a5178), TOBN(0x5ea60411, 0xb62c5173)}}, {{TOBN(0xfc4e964e, 0xd0b8892b), TOBN(0x9ea17683, 0x9301bb74), TOBN(0x6265c5ae, 0xfcc48626), TOBN(0xe60cf82e, 0xbb3e9102)}, {TOBN(0x57adf797, 0xd4df5531), TOBN(0x235b59a1, 0x8deeefe2), TOBN(0x60adcf58, 0x3f306eb1), TOBN(0x105c2753, 0x3d09492d)}}, {{TOBN(0x4090914b, 0xb5def996), TOBN(0x1cb69c83, 0x233dd1e7), TOBN(0xc1e9c1d3, 0x9b3d5e76), TOBN(0x1f3338ed, 0xfccf6012)}, {TOBN(0xb1e95d0d, 0x2f5378a8), TOBN(0xacf4c2c7, 0x2f00cd21), TOBN(0x6e984240, 0xeb5fe290), TOBN(0xd66c038d, 0x248088ae)}}, {{TOBN(0x804d264a, 0xf94d70cf), TOBN(0xbdb802ef, 0x7314bf7e), TOBN(0x8fb54de2, 0x4333ed02), TOBN(0x740461e0, 0x285635d9)}, {TOBN(0x4113b2c8, 0x365e9383), TOBN(0xea762c83, 0x3fdef652), TOBN(0x4eec6e2e, 0x47b956c1), TOBN(0xa3d814be, 0x65620fa4)}}, {{TOBN(0x9ad5462b, 0xb4d8bc50), TOBN(0x181c0b16, 0xa9195770), TOBN(0xebd4fe1c, 0x78412a68), TOBN(0xae0341bc, 0xc0dff48c)}, {TOBN(0xb6bc45cf, 0x7003e866), TOBN(0xf11a6dea, 0x8a24a41b), TOBN(0x5407151a, 0xd04c24c2), TOBN(0x62c9d27d, 0xda5b7b68)}}, {{TOBN(0x2e964235, 0x88cceff6), TOBN(0x8594c54f, 0x8b07ed69), TOBN(0x1578e73c, 0xc84d0d0d), TOBN(0x7b4e1055, 0xff532868)}, {TOBN(0xa348c0d5, 0xb5ec995a), TOBN(0xbf4b9d55, 0x14289a54), TOBN(0x9ba155a6, 0x58fbd777), TOBN(0x186ed7a8, 0x1a84491d)}}, {{TOBN(0xd4992b30, 0x614c0900), TOBN(0xda98d121, 0xbd00c24b), TOBN(0x7f534dc8, 0x7ec4bfa1), TOBN(0x4a5ff674, 0x37dc34bc)}, {TOBN(0x68c196b8, 0x1d7ea1d7), TOBN(0x38cf2893, 0x80a6d208), TOBN(0xfd56cd09, 0xe3cbbd6e), TOBN(0xec72e27e, 0x4205a5b6)}}, {{TOBN(0x15ea68f5, 0xa44f77f7), TOBN(0x7aa5f9fd, 0xb43c52bc), TOBN(0x86ff676f, 0x94f0e609), TOBN(0xa4cde963, 0x2e2d432b)}, {TOBN(0x8cafa0c0, 0xeee470af), TOBN(0x84137d0e, 0x8a3f5ec8), TOBN(0xebb40411, 0xfaa31231), TOBN(0xa239c13f, 0x6f7f7ccf)}}, {{TOBN(0x32865719, 0xa8afd30b), TOBN(0x86798328, 0x8a826dce), TOBN(0xdf04e891, 0xc4a8fbe0), TOBN(0xbb6b6e1b, 0xebf56ad3)}, {TOBN(0x0a695b11, 0x471f1ff0), TOBN(0xd76c3389, 0xbe15baf0), TOBN(0x018edb95, 0xbe96c43e), TOBN(0xf2beaaf4, 0x90794158)}}, {{TOBN(0x152db09e, 0xc3076a27), TOBN(0x5e82908e, 0xe416545d), TOBN(0xa2c41272, 0x356d6f2e), TOBN(0xdc9c9642, 0x31fd74e1)}, {TOBN(0x66ceb88d, 0x519bf615), TOBN(0xe29ecd76, 0x05a2274e), TOBN(0x3a0473c4, 0xbf5e2fa0), TOBN(0x6b6eb671, 0x64284e67)}}, {{TOBN(0xe8b97932, 0xb88756dd), TOBN(0xed4e8652, 0xf17e3e61), TOBN(0xc2dd1499, 0x3ee1c4a4), TOBN(0xc0aaee17, 0x597f8c0e)}, {TOBN(0x15c4edb9, 0x6c168af3), TOBN(0x6563c7bf, 0xb39ae875), TOBN(0xadfadb6f, 0x20adb436), TOBN(0xad55e8c9, 0x9a042ac0)}}, {{TOBN(0x975a1ed8, 0xb76da1f5), TOBN(0x10dfa466, 0xa58acb94), TOBN(0x8dd7f7e3, 0xac060282), TOBN(0x6813e66a, 0x572a051e)}, {TOBN(0xb4ccae1e, 0x350cb901), TOBN(0xb653d656, 0x50cb7822), TOBN(0x42484710, 0xdfab3b87), TOBN(0xcd7ee537, 0x9b670fd0)}}, {{TOBN(0x0a50b12e, 0x523b8bf6), TOBN(0x8009eb5b, 0x8f910c1b), TOBN(0xf535af82, 0x4a167588), TOBN(0x0f835f9c, 0xfb2a2abd)}, {TOBN(0xf59b2931, 
0x2afceb62), TOBN(0xc797df2a, 0x169d383f), TOBN(0xeb3f5fb0, 0x66ac02b0), TOBN(0x029d4c6f, 0xdaa2d0ca)}}, {{TOBN(0xd4059bc1, 0xafab4bc5), TOBN(0x833f5c6f, 0x56783247), TOBN(0xb5346630, 0x8d2d3605), TOBN(0x83387891, 0xd34d8433)}, {TOBN(0xd973b30f, 0xadd9419a), TOBN(0xbcca1099, 0xafe3fce8), TOBN(0x08178315, 0x0809aac6), TOBN(0x01b7f21a, 0x540f0f11)}}, {{TOBN(0x65c29219, 0x909523c8), TOBN(0xa62f648f, 0xa3a1c741), TOBN(0x88598d4f, 0x60c9e55a), TOBN(0xbce9141b, 0x0e4f347a)}, {TOBN(0x9af97d84, 0x35f9b988), TOBN(0x0210da62, 0x320475b6), TOBN(0x3c076e22, 0x9191476c), TOBN(0x7520dbd9, 0x44fc7834)}}, {{TOBN(0x6a6b2cfe, 0xc1ab1bbd), TOBN(0xef8a65be, 0xdc650938), TOBN(0x72855540, 0x805d7bc4), TOBN(0xda389396, 0xed11fdfd)}, {TOBN(0xa9d5bd36, 0x74660876), TOBN(0x11d67c54, 0xb45dff35), TOBN(0x6af7d148, 0xa4f5da94), TOBN(0xbb8d4c3f, 0xc0bbeb31)}}, {{TOBN(0x87a7ebd1, 0xe0a1b12a), TOBN(0x1e4ef88d, 0x770ba95f), TOBN(0x8c33345c, 0xdc2ae9cb), TOBN(0xcecf1276, 0x01cc8403)}, {TOBN(0x687c012e, 0x1b39b80f), TOBN(0xfd90d0ad, 0x35c33ba4), TOBN(0xa3ef5a67, 0x5c9661c2), TOBN(0x368fc88e, 0xe017429e)}}, {{TOBN(0xd30c6761, 0x196a2fa2), TOBN(0x931b9817, 0xbd5b312e), TOBN(0xba01000c, 0x72f54a31), TOBN(0xa203d2c8, 0x66eaa541)}, {TOBN(0xf2abdee0, 0x98939db3), TOBN(0xe37d6c2c, 0x3e606c02), TOBN(0xf2921574, 0x521ff643), TOBN(0x2781b3c4, 0xd7e2fca3)}}, {{TOBN(0x664300b0, 0x7850ec06), TOBN(0xac5a38b9, 0x7d3a10cf), TOBN(0x9233188d, 0xe34ab39d), TOBN(0xe77057e4, 0x5072cbb9)}, {TOBN(0xbcf0c042, 0xb59e78df), TOBN(0x4cfc91e8, 0x1d97de52), TOBN(0x4661a26c, 0x3ee0ca4a), TOBN(0x5620a4c1, 0xfb8507bc)}}, {{TOBN(0x4b44d4aa, 0x049f842c), TOBN(0xceabc5d5, 0x1540e82b), TOBN(0x306710fd, 0x15c6f156), TOBN(0xbe5ae52b, 0x63db1d72)}, {TOBN(0x06f1e7e6, 0x334957f1), TOBN(0x57e388f0, 0x31144a70), TOBN(0xfb69bb2f, 0xdf96447b), TOBN(0x0f78ebd3, 0x73e38a12)}}, {{TOBN(0xb8222605, 0x2b7ce542), TOBN(0xe6d4ce99, 0x7472bde1), TOBN(0x53e16ebe, 0x09d2f4da), TOBN(0x180ff42e, 0x53b92b2e)}, {TOBN(0xc59bcc02, 0x2c34a1c6), TOBN(0x3803d6f9, 0x422c46c2), TOBN(0x18aff74f, 0x5c14a8a2), TOBN(0x55aebf80, 0x10a08b28)}}, {{TOBN(0x66097d58, 0x7135593f), TOBN(0x32e6eff7, 0x2be570cd), TOBN(0x584e6a10, 0x2a8c860d), TOBN(0xcd185890, 0xa2eb4163)}, {TOBN(0x7ceae99d, 0x6d97e134), TOBN(0xd42c6b70, 0xdd8447ce), TOBN(0x59ddbb4a, 0xb8c50273), TOBN(0x03c612df, 0x3cf34e1e)}}, {{TOBN(0x84b9ca15, 0x04b6c5a0), TOBN(0x35216f39, 0x18f0e3a3), TOBN(0x3ec2d2bc, 0xbd986c00), TOBN(0x8bf546d9, 0xd19228fe)}, {TOBN(0xd1c655a4, 0x4cd623c3), TOBN(0x366ce718, 0x502b8e5a), TOBN(0x2cfc84b4, 0xeea0bfe7), TOBN(0xe01d5cee, 0xcf443e8e)}}, {{TOBN(0x8ec045d9, 0x036520f8), TOBN(0xdfb3c3d1, 0x92d40e98), TOBN(0x0bac4cce, 0xcc559a04), TOBN(0x35eccae5, 0x240ea6b1)}, {TOBN(0x180b32db, 0xf8a5a0ac), TOBN(0x547972a5, 0xeb699700), TOBN(0xa3765801, 0xca26bca0), TOBN(0x57e09d0e, 0xa647f25a)}}, {{TOBN(0xb956970e, 0x2fdd23cc), TOBN(0xb80288bc, 0x5682e971), TOBN(0xe6e6d91e, 0x9ae86ebc), TOBN(0x0564c83f, 0x8c9f1939)}, {TOBN(0x551932a2, 0x39560368), TOBN(0xe893752b, 0x049c28e2), TOBN(0x0b03cee5, 0xa6a158c3), TOBN(0xe12d656b, 0x04964263)}}, {{TOBN(0x4b47554e, 0x63e3bc1d), TOBN(0xc719b6a2, 0x45044ff7), TOBN(0x4f24d30a, 0xe48daa07), TOBN(0xa3f37556, 0xc8c1edc3)}, {TOBN(0x9a47bf76, 0x0700d360), TOBN(0xbb1a1824, 0x822ae4e2), TOBN(0x22e275a3, 0x89f1fb4c), TOBN(0x72b1aa23, 0x9968c5f5)}}, {{TOBN(0xa75feaca, 0xbe063f64), TOBN(0x9b392f43, 0xbce47a09), TOBN(0xd4241509, 0x1ad07aca), TOBN(0x4b0c591b, 0x8d26cd0f)}, {TOBN(0x2d42ddfd, 0x92f1169a), TOBN(0x63aeb1ac, 0x4cbf2392), TOBN(0x1de9e877, 0x0691a2af), TOBN(0xebe79af7, 0xd98021da)}}, 
{{TOBN(0xcfdf2a4e, 0x40e50acf), TOBN(0xf0a98ad7, 0xaf01d665), TOBN(0xefb640bf, 0x1831be1f), TOBN(0x6fe8bd2f, 0x80e9ada0)}, {TOBN(0x94c103a1, 0x6cafbc91), TOBN(0x170f8759, 0x8308e08c), TOBN(0x5de2d2ab, 0x9780ff4f), TOBN(0x666466bc, 0x45b201f2)}}, {{TOBN(0x58af2010, 0xf5b343bc), TOBN(0x0f2e400a, 0xf2f142fe), TOBN(0x3483bfde, 0xa85f4bdf), TOBN(0xf0b1d093, 0x03bfeaa9)}, {TOBN(0x2ea01b95, 0xc7081603), TOBN(0xe943e4c9, 0x3dba1097), TOBN(0x47be92ad, 0xb438f3a6), TOBN(0x00bb7742, 0xe5bf6636)}}, {{TOBN(0x136b7083, 0x824297b4), TOBN(0x9d0e5580, 0x5584455f), TOBN(0xab48cedc, 0xf1c7d69e), TOBN(0x53a9e481, 0x2a256e76)}, {TOBN(0x0402b0e0, 0x65eb2413), TOBN(0xdadbbb84, 0x8fc407a7), TOBN(0xa65cd5a4, 0x8d7f5492), TOBN(0x21d44293, 0x74bae294)}}, {{TOBN(0x66917ce6, 0x3b5f1cc4), TOBN(0x37ae52ea, 0xce872e62), TOBN(0xbb087b72, 0x2905f244), TOBN(0x12077086, 0x1e6af74f)}, {TOBN(0x4b644e49, 0x1058edea), TOBN(0x827510e3, 0xb638ca1d), TOBN(0x8cf2b704, 0x6038591c), TOBN(0xffc8b47a, 0xfe635063)}}, {{TOBN(0x3ae220e6, 0x1b4d5e63), TOBN(0xbd864742, 0x9d961b4b), TOBN(0x610c107e, 0x9bd16bed), TOBN(0x4270352a, 0x1127147b)}, {TOBN(0x7d17ffe6, 0x64cfc50e), TOBN(0x50dee01a, 0x1e36cb42), TOBN(0x068a7622, 0x35dc5f9a), TOBN(0x9a08d536, 0xdf53f62c)}}, {{TOBN(0x4ed71457, 0x6be5f7de), TOBN(0xd93006f8, 0xc2263c9e), TOBN(0xe073694c, 0xcacacb36), TOBN(0x2ff7a5b4, 0x3ae118ab)}, {TOBN(0x3cce53f1, 0xcd871236), TOBN(0xf156a39d, 0xc2aa6d52), TOBN(0x9cc5f271, 0xb198d76d), TOBN(0xbc615b6f, 0x81383d39)}}, {{TOBN(0xa54538e8, 0xde3eee6b), TOBN(0x58c77538, 0xab910d91), TOBN(0x31e5bdbc, 0x58d278bd), TOBN(0x3cde4adf, 0xb963acae)}, {TOBN(0xb1881fd2, 0x5302169c), TOBN(0x8ca60fa0, 0xa989ed8b), TOBN(0xa1999458, 0xff96a0ee), TOBN(0xc1141f03, 0xac6c283d)}}, {{TOBN(0x7677408d, 0x6dfafed3), TOBN(0x33a01653, 0x39661588), TOBN(0x3c9c15ec, 0x0b726fa0), TOBN(0x090cfd93, 0x6c9b56da)}, {TOBN(0xe34f4bae, 0xa3c40af5), TOBN(0x3469eadb, 0xd21129f1), TOBN(0xcc51674a, 0x1e207ce8), TOBN(0x1e293b24, 0xc83b1ef9)}}, {{TOBN(0x17173d13, 0x1e6c0bb4), TOBN(0x19004695, 0x90776d35), TOBN(0xe7980e34, 0x6de6f922), TOBN(0x873554cb, 0xf4dd9a22)}, {TOBN(0x0316c627, 0xcbf18a51), TOBN(0x4d93651b, 0x3032c081), TOBN(0x207f2771, 0x3946834d), TOBN(0x2c08d7b4, 0x30cdbf80)}}, {{TOBN(0x137a4fb4, 0x86df2a61), TOBN(0xa1ed9c07, 0xecf7b4a2), TOBN(0xb2e460e2, 0x7bd042ff), TOBN(0xb7f5e2fa, 0x5f62f5ec)}, {TOBN(0x7aa6ec6b, 0xcc2423b7), TOBN(0x75ce0a7f, 0xba63eea7), TOBN(0x67a45fb1, 0xf250a6e1), TOBN(0x93bc919c, 0xe53cdc9f)}}, {{TOBN(0x9271f56f, 0x871942df), TOBN(0x2372ff6f, 0x7859ad66), TOBN(0x5f4c2b96, 0x33cb1a78), TOBN(0xe3e29101, 0x5838aa83)}, {TOBN(0xa7ed1611, 0xe4e8110c), TOBN(0x2a2d70d5, 0x330198ce), TOBN(0xbdf132e8, 0x6720efe0), TOBN(0xe61a8962, 0x66a471bf)}}, {{TOBN(0x796d3a85, 0x825808bd), TOBN(0x51dc3cb7, 0x3fd6e902), TOBN(0x643c768a, 0x916219d1), TOBN(0x36cd7685, 0xa2ad7d32)}, {TOBN(0xe3db9d05, 0xb22922a4), TOBN(0x6494c87e, 0xdba29660), TOBN(0xf0ac91df, 0xbcd2ebc7), TOBN(0x4deb57a0, 0x45107f8d)}}, {{TOBN(0x42271f59, 0xc3d12a73), TOBN(0x5f71687c, 0xa5c2c51d), TOBN(0xcb1f50c6, 0x05797bcb), TOBN(0x29ed0ed9, 0xd6d34eb0)}, {TOBN(0xe5fe5b47, 0x4683c2eb), TOBN(0x4956eeb5, 0x97447c46), TOBN(0x5b163a43, 0x71207167), TOBN(0x93fa2fed, 0x0248c5ef)}}, {{TOBN(0x67930af2, 0x31f63950), TOBN(0xa77797c1, 0x14caa2c9), TOBN(0x526e80ee, 0x27ac7e62), TOBN(0xe1e6e626, 0x58b28aec)}, {TOBN(0x636178b0, 0xb3c9fef0), TOBN(0xaf7752e0, 0x6d5f90be), TOBN(0x94ecaf18, 0xeece51cf), TOBN(0x2864d0ed, 0xca806e1f)}}, {{TOBN(0x6de2e383, 0x97c69134), TOBN(0x5a42c316, 0xeb291293), TOBN(0xc7779219, 0x6a60bae0), TOBN(0xa24de346, 
0x6b7599d1)}, {TOBN(0x49d374aa, 0xb75d4941), TOBN(0x98900586, 0x2d501ff0), TOBN(0x9f16d40e, 0xeb7974cf), TOBN(0x1033860b, 0xcdd8c115)}}, {{TOBN(0xb6c69ac8, 0x2094cec3), TOBN(0x9976fb88, 0x403b770c), TOBN(0x1dea026c, 0x4859590d), TOBN(0xb6acbb46, 0x8562d1fd)}, {TOBN(0x7cd6c461, 0x44569d85), TOBN(0xc3190a36, 0x97f0891d), TOBN(0xc6f53195, 0x48d5a17d), TOBN(0x7d919966, 0xd749abc8)}}, {{TOBN(0x65104837, 0xdd1c8a20), TOBN(0x7e5410c8, 0x2f683419), TOBN(0x958c3ca8, 0xbe94022e), TOBN(0x605c3197, 0x6145dac2)}, {TOBN(0x3fc07501, 0x01683d54), TOBN(0x1d7127c5, 0x595b1234), TOBN(0x10b8f87c, 0x9481277f), TOBN(0x677db2a8, 0xe65a1adb)}}, {{TOBN(0xec2fccaa, 0xddce3345), TOBN(0x2a6811b7, 0x012a4350), TOBN(0x96760ff1, 0xac598bdc), TOBN(0x054d652a, 0xd1bf4128)}, {TOBN(0x0a1151d4, 0x92a21005), TOBN(0xad7f3971, 0x33110fdf), TOBN(0x8c95928c, 0x1960100f), TOBN(0x6c91c825, 0x7bf03362)}}, {{TOBN(0xc8c8b2a2, 0xce309f06), TOBN(0xfdb27b59, 0xca27204b), TOBN(0xd223eaa5, 0x0848e32e), TOBN(0xb93e4b2e, 0xe7bfaf1e)}, {TOBN(0xc5308ae6, 0x44aa3ded), TOBN(0x317a666a, 0xc015d573), TOBN(0xc888ce23, 0x1a979707), TOBN(0xf141c1e6, 0x0d5c4958)}}, {{TOBN(0xb53b7de5, 0x61906373), TOBN(0x858dbade, 0xeb999595), TOBN(0x8cbb47b2, 0xa59e5c36), TOBN(0x660318b3, 0xdcf4e842)}, {TOBN(0xbd161ccd, 0x12ba4b7a), TOBN(0xf399daab, 0xf8c8282a), TOBN(0x1587633a, 0xeeb2130d), TOBN(0xa465311a, 0xda38dd7d)}}, {{TOBN(0x5f75eec8, 0x64d3779b), TOBN(0x3c5d0476, 0xad64c171), TOBN(0x87410371, 0x2a914428), TOBN(0x8096a891, 0x90e2fc29)}, {TOBN(0xd3d2ae9d, 0x23b3ebc2), TOBN(0x90bdd6db, 0xa580cfd6), TOBN(0x52dbb7f3, 0xc5b01f6c), TOBN(0xe68eded4, 0xe102a2dc)}}, {{TOBN(0x17785b77, 0x99eb6df0), TOBN(0x26c3cc51, 0x7386b779), TOBN(0x345ed988, 0x6417a48e), TOBN(0xe990b4e4, 0x07d6ef31)}, {TOBN(0x0f456b7e, 0x2586abba), TOBN(0x239ca6a5, 0x59c96e9a), TOBN(0xe327459c, 0xe2eb4206), TOBN(0x3a4c3313, 0xa002b90a)}}, {{TOBN(0x2a114806, 0xf6a3f6fb), TOBN(0xad5cad2f, 0x85c251dd), TOBN(0x92c1f613, 0xf5a784d3), TOBN(0xec7bfacf, 0x349766d5)}, {TOBN(0x04b3cd33, 0x3e23cb3b), TOBN(0x3979fe84, 0xc5a64b2d), TOBN(0x192e2720, 0x7e589106), TOBN(0xa60c43d1, 0xa15b527f)}}, {{TOBN(0x2dae9082, 0xbe7cf3a6), TOBN(0xcc86ba92, 0xbc967274), TOBN(0xf28a2ce8, 0xaea0a8a9), TOBN(0x404ca6d9, 0x6ee988b3)}, {TOBN(0xfd7e9c5d, 0x005921b8), TOBN(0xf56297f1, 0x44e79bf9), TOBN(0xa163b460, 0x0d75ddc2), TOBN(0x30b23616, 0xa1f2be87)}}, {{TOBN(0x4b070d21, 0xbfe50e2b), TOBN(0x7ef8cfd0, 0xe1bfede1), TOBN(0xadba0011, 0x2aac4ae0), TOBN(0x2a3e7d01, 0xb9ebd033)}, {TOBN(0x995277ec, 0xe38d9d1c), TOBN(0xb500249e, 0x9c5d2de3), TOBN(0x8912b820, 0xf13ca8c9), TOBN(0xc8798114, 0x877793af)}}, {{TOBN(0x19e6125d, 0xec3f1dec), TOBN(0x07b1f040, 0x911178da), TOBN(0xd93ededa, 0x904a6738), TOBN(0x55187a5a, 0x0bebedcd)}, {TOBN(0xf7d04722, 0xeb329d41), TOBN(0xf449099e, 0xf170b391), TOBN(0xfd317a69, 0xca99f828), TOBN(0x50c3db2b, 0x34a4976d)}}, {{TOBN(0xe9ba7784, 0x3757b392), TOBN(0x326caefd, 0xaa3ca05a), TOBN(0x78e5293b, 0xf1e593d4), TOBN(0x7842a937, 0x0d98fd13)}, {TOBN(0xe694bf96, 0x5f96b10d), TOBN(0x373a9df6, 0x06a8cd05), TOBN(0x997d1e51, 0xe8f0c7fc), TOBN(0x1d019790, 0x63fd972e)}}, {{TOBN(0x0064d858, 0x5499fb32), TOBN(0x7b67bad9, 0x77a8aeb7), TOBN(0x1d3eb977, 0x2d08eec5), TOBN(0x5fc047a6, 0xcbabae1d)}, {TOBN(0x0577d159, 0xe54a64bb), TOBN(0x8862201b, 0xc43497e4), TOBN(0xad6b4e28, 0x2ce0608d), TOBN(0x8b687b7d, 0x0b167aac)}}, {{TOBN(0x6ed4d367, 0x8b2ecfa9), TOBN(0x24dfe62d, 0xa90c3c38), TOBN(0xa1862e10, 0x3fe5c42b), TOBN(0x1ca73dca, 0xd5732a9f)}, {TOBN(0x35f038b7, 0x76bb87ad), TOBN(0x674976ab, 0xf242b81f), TOBN(0x4f2bde7e, 0xb0fd90cd), 
TOBN(0x6efc172e, 0xa7fdf092)}}, {{TOBN(0x3806b69b, 0x92222f1f), TOBN(0x5a2459ca, 0x6cf7ae70), TOBN(0x6789f69c, 0xa85217ee), TOBN(0x5f232b5e, 0xe3dc85ac)}, {TOBN(0x660e3ec5, 0x48e9e516), TOBN(0x124b4e47, 0x3197eb31), TOBN(0x10a0cb13, 0xaafcca23), TOBN(0x7bd63ba4, 0x8213224f)}}, {{TOBN(0xaffad7cc, 0x290a7f4f), TOBN(0x6b409c9e, 0x0286b461), TOBN(0x58ab809f, 0xffa407af), TOBN(0xc3122eed, 0xc68ac073)}, {TOBN(0x17bf9e50, 0x4ef24d7e), TOBN(0x5d929794, 0x3e2a5811), TOBN(0x519bc867, 0x02902e01), TOBN(0x76bba5da, 0x39c8a851)}}, {{TOBN(0xe9f9669c, 0xda94951e), TOBN(0x4b6af58d, 0x66b8d418), TOBN(0xfa321074, 0x17d426a4), TOBN(0xc78e66a9, 0x9dde6027)}, {TOBN(0x0516c083, 0x4a53b964), TOBN(0xfc659d38, 0xff602330), TOBN(0x0ab55e5c, 0x58c5c897), TOBN(0x985099b2, 0x838bc5df)}}, {{TOBN(0x061d9efc, 0xc52fc238), TOBN(0x712b2728, 0x6ac1da3f), TOBN(0xfb658149, 0x9283fe08), TOBN(0x4954ac94, 0xb8aaa2f7)}, {TOBN(0x85c0ada4, 0x7fb2e74f), TOBN(0xee8ba98e, 0xb89926b0), TOBN(0xe4f9d37d, 0x23d1af5b), TOBN(0x14ccdbf9, 0xba9b015e)}}, {{TOBN(0xb674481b, 0x7bfe7178), TOBN(0x4e1debae, 0x65405868), TOBN(0x061b2821, 0xc48c867d), TOBN(0x69c15b35, 0x513b30ea)}, {TOBN(0x3b4a1666, 0x36871088), TOBN(0xe5e29f5d, 0x1220b1ff), TOBN(0x4b82bb35, 0x233d9f4d), TOBN(0x4e076333, 0x18cdc675)}}}, {{{TOBN(0x0d53f5c7, 0xa3e6fced), TOBN(0xe8cbbdd5, 0xf45fbdeb), TOBN(0xf85c01df, 0x13339a70), TOBN(0x0ff71880, 0x142ceb81)}, {TOBN(0x4c4e8774, 0xbd70437a), TOBN(0x5fb32891, 0xba0bda6a), TOBN(0x1cdbebd2, 0xf18bd26e), TOBN(0x2f9526f1, 0x03a9d522)}}, {{TOBN(0x40ce3051, 0x92c4d684), TOBN(0x8b04d725, 0x7612efcd), TOBN(0xb9dcda36, 0x6f9cae20), TOBN(0x0edc4d24, 0xf058856c)}, {TOBN(0x64f2e6bf, 0x85427900), TOBN(0x3de81295, 0xdc09dfea), TOBN(0xd41b4487, 0x379bf26c), TOBN(0x50b62c6d, 0x6df135a9)}}, {{TOBN(0xd4f8e3b4, 0xc72dfe67), TOBN(0xc416b0f6, 0x90e19fdf), TOBN(0x18b9098d, 0x4c13bd35), TOBN(0xac11118a, 0x15b8cb9e)}, {TOBN(0xf598a318, 0xf0062841), TOBN(0xbfe0602f, 0x89f356f4), TOBN(0x7ae3637e, 0x30177a0c), TOBN(0x34097747, 0x61136537)}}, {{TOBN(0x0db2fb5e, 0xd005832a), TOBN(0x5f5efd3b, 0x91042e4f), TOBN(0x8c4ffdc6, 0xed70f8ca), TOBN(0xe4645d0b, 0xb52da9cc)}, {TOBN(0x9596f58b, 0xc9001d1f), TOBN(0x52c8f0bc, 0x4e117205), TOBN(0xfd4aa0d2, 0xe398a084), TOBN(0x815bfe3a, 0x104f49de)}}, {{TOBN(0x97e5443f, 0x23885e5f), TOBN(0xf72f8f99, 0xe8433aab), TOBN(0xbd00b154, 0xe4d4e604), TOBN(0xd0b35e6a, 0xe5e173ff)}, {TOBN(0x57b2a048, 0x9164722d), TOBN(0x3e3c665b, 0x88761ec8), TOBN(0x6bdd1397, 0x3da83832), TOBN(0x3c8b1a1e, 0x73dafe3b)}}, {{TOBN(0x4497ace6, 0x54317cac), TOBN(0xbe600ab9, 0x521771b3), TOBN(0xb42e409e, 0xb0dfe8b8), TOBN(0x386a67d7, 0x3942310f)}, {TOBN(0x25548d8d, 0x4431cc28), TOBN(0xa7cff142, 0x985dc524), TOBN(0x4d60f5a1, 0x93c4be32), TOBN(0x83ebd5c8, 0xd071c6e1)}}, {{TOBN(0xba3a80a7, 0xb1fd2b0b), TOBN(0x9b3ad396, 0x5bec33e8), TOBN(0xb3868d61, 0x79743fb3), TOBN(0xcfd169fc, 0xfdb462fa)}, {TOBN(0xd3b499d7, 0x9ce0a6af), TOBN(0x55dc1cf1, 0xe42d3ff8), TOBN(0x04fb9e6c, 0xc6c3e1b2), TOBN(0x47e6961d, 0x6f69a474)}}, {{TOBN(0x54eb3acc, 0xe548b37b), TOBN(0xb38e7542, 0x84d40549), TOBN(0x8c3daa51, 0x7b341b4f), TOBN(0x2f6928ec, 0x690bf7fa)}, {TOBN(0x0496b323, 0x86ce6c41), TOBN(0x01be1c55, 0x10adadcd), TOBN(0xc04e67e7, 0x4bb5faf9), TOBN(0x3cbaf678, 0xe15c9985)}}, {{TOBN(0x8cd12145, 0x50ca4247), TOBN(0xba1aa47a, 0xe7dd30aa), TOBN(0x2f81ddf1, 0xe58fee24), TOBN(0x03452936, 0xeec9b0e8)}, {TOBN(0x8bdc3b81, 0x243aea96), TOBN(0x9a2919af, 0x15c3d0e5), TOBN(0x9ea640ec, 0x10948361), TOBN(0x5ac86d5b, 0x6e0bcccf)}}, {{TOBN(0xf892d918, 0xc36cf440), TOBN(0xaed3e837, 0xc939719c), 
TOBN(0xb07b08d2, 0xc0218b64), TOBN(0x6f1bcbba, 0xce9790dd)}, {TOBN(0x4a84d6ed, 0x60919b8e), TOBN(0xd8900791, 0x8ac1f9eb), TOBN(0xf84941aa, 0x0dd5daef), TOBN(0xb22fe40a, 0x67fd62c5)}}, {{TOBN(0x97e15ba2, 0x157f2db3), TOBN(0xbda2fc8f, 0x8e28ca9c), TOBN(0x5d050da4, 0x37b9f454), TOBN(0x3d57eb57, 0x2379d72e)}, {TOBN(0xe9b5eba2, 0xfb5ee997), TOBN(0x01648ca2, 0xe11538ca), TOBN(0x32bb76f6, 0xf6327974), TOBN(0x338f14b8, 0xff3f4bb7)}}, {{TOBN(0x524d226a, 0xd7ab9a2d), TOBN(0x9c00090d, 0x7dfae958), TOBN(0x0ba5f539, 0x8751d8c2), TOBN(0x8afcbcdd, 0x3ab8262d)}, {TOBN(0x57392729, 0xe99d043b), TOBN(0xef51263b, 0xaebc943a), TOBN(0x9feace93, 0x20862935), TOBN(0x639efc03, 0xb06c817b)}}, {{TOBN(0x1fe054b3, 0x66b4be7a), TOBN(0x3f25a9de, 0x84a37a1e), TOBN(0xf39ef1ad, 0x78d75cd9), TOBN(0xd7b58f49, 0x5062c1b5)}, {TOBN(0x6f74f9a9, 0xff563436), TOBN(0xf718ff29, 0xe8af51e7), TOBN(0x5234d313, 0x15e97fec), TOBN(0xb6a8e2b1, 0x292f1c0a)}}, {{TOBN(0xa7f53aa8, 0x327720c1), TOBN(0x956ca322, 0xba092cc8), TOBN(0x8f03d64a, 0x28746c4d), TOBN(0x51fe1782, 0x66d0d392)}, {TOBN(0xd19b34db, 0x3c832c80), TOBN(0x60dccc5c, 0x6da2e3b4), TOBN(0x245dd62e, 0x0a104ccc), TOBN(0xa7ab1de1, 0x620b21fd)}}, {{TOBN(0xb293ae0b, 0x3893d123), TOBN(0xf7b75783, 0xb15ee71c), TOBN(0x5aa3c614, 0x42a9468b), TOBN(0xd686123c, 0xdb15d744)}, {TOBN(0x8c616891, 0xa7ab4116), TOBN(0x6fcd72c8, 0xa4e6a459), TOBN(0xac219110, 0x77e5fad7), TOBN(0xfb6a20e7, 0x704fa46b)}}, {{TOBN(0xe839be7d, 0x341d81dc), TOBN(0xcddb6889, 0x32148379), TOBN(0xda6211a1, 0xf7026ead), TOBN(0xf3b2575f, 0xf4d1cc5e)}, {TOBN(0x40cfc8f6, 0xa7a73ae6), TOBN(0x83879a5e, 0x61d5b483), TOBN(0xc5acb1ed, 0x41a50ebc), TOBN(0x59a60cc8, 0x3c07d8fa)}}, {{TOBN(0x1b73bdce, 0xb1876262), TOBN(0x2b0d79f0, 0x12af4ee9), TOBN(0x8bcf3b0b, 0xd46e1d07), TOBN(0x17d6af9d, 0xe45d152f)}, {TOBN(0x73520461, 0x6d736451), TOBN(0x43cbbd97, 0x56b0bf5a), TOBN(0xb0833a5b, 0xd5999b9d), TOBN(0x702614f0, 0xeb72e398)}}, {{TOBN(0x0aadf01a, 0x59c3e9f8), TOBN(0x40200e77, 0xce6b3d16), TOBN(0xda22bdd3, 0xdeddafad), TOBN(0x76dedaf4, 0x310d72e1)}, {TOBN(0x49ef807c, 0x4bc2e88f), TOBN(0x6ba81291, 0x146dd5a5), TOBN(0xa1a4077a, 0x7d8d59e9), TOBN(0x87b6a2e7, 0x802db349)}}, {{TOBN(0xd5679997, 0x1b4e598e), TOBN(0xf499ef1f, 0x06fe4b1d), TOBN(0x3978d3ae, 0xfcb267c5), TOBN(0xb582b557, 0x235786d0)}, {TOBN(0x32b3b2ca, 0x1715cb07), TOBN(0x4c3de6a2, 0x8480241d), TOBN(0x63b5ffed, 0xcb571ecd), TOBN(0xeaf53900, 0xed2fe9a9)}}, {{TOBN(0xdec98d4a, 0xc3b81990), TOBN(0x1cb83722, 0x9e0cc8fe), TOBN(0xfe0b0491, 0xd2b427b9), TOBN(0x0f2386ac, 0xe983a66c)}, {TOBN(0x930c4d1e, 0xb3291213), TOBN(0xa2f82b2e, 0x59a62ae4), TOBN(0x77233853, 0xf93e89e3), TOBN(0x7f8063ac, 0x11777c7f)}}, {{TOBN(0xff0eb567, 0x59ad2877), TOBN(0x6f454642, 0x9865c754), TOBN(0xe6fe701a, 0x236e9a84), TOBN(0xc586ef16, 0x06e40fc3)}, {TOBN(0x3f62b6e0, 0x24bafad9), TOBN(0xc8b42bd2, 0x64da906a), TOBN(0xc98e1eb4, 0xda3276a0), TOBN(0x30d0e5fc, 0x06cbf852)}}, {{TOBN(0x1b6b2ae1, 0xe8b4dfd4), TOBN(0xd754d5c7, 0x8301cbac), TOBN(0x66097629, 0x112a39ac), TOBN(0xf86b5999, 0x93ba4ab9)}, {TOBN(0x26c9dea7, 0x99f9d581), TOBN(0x0473b1a8, 0xc2fafeaa), TOBN(0x1469af55, 0x3b2505a5), TOBN(0x227d16d7, 0xd6a43323)}}, {{TOBN(0x3316f73c, 0xad3d97f9), TOBN(0x52bf3bb5, 0x1f137455), TOBN(0x953eafeb, 0x09954e7c), TOBN(0xa721dfed, 0xdd732411)}, {TOBN(0xb4929821, 0x141d4579), TOBN(0x3411321c, 0xaa3bd435), TOBN(0xafb355aa, 0x17fa6015), TOBN(0xb4e7ef4a, 0x18e42f0e)}}, {{TOBN(0x604ac97c, 0x59371000), TOBN(0xe1c48c70, 0x7f759c18), TOBN(0x3f62ecc5, 0xa5db6b65), TOBN(0x0a78b173, 0x38a21495)}, {TOBN(0x6be1819d, 0xbcc8ad94), TOBN(0x70dc04f6, 
0xd89c3400), TOBN(0x462557b4, 0xa6b4840a), TOBN(0x544c6ade, 0x60bd21c0)}}, {{TOBN(0x6a00f24e, 0x907a544b), TOBN(0xa7520dcb, 0x313da210), TOBN(0xfe939b75, 0x11e4994b), TOBN(0x918b6ba6, 0xbc275d70)}, {TOBN(0xd3e5e0fc, 0x644be892), TOBN(0x707a9816, 0xfdaf6c42), TOBN(0x60145567, 0xf15c13fe), TOBN(0x4818ebaa, 0xe130a54a)}}, {{TOBN(0x28aad3ad, 0x58d2f767), TOBN(0xdc5267fd, 0xd7e7c773), TOBN(0x4919cc88, 0xc3afcc98), TOBN(0xaa2e6ab0, 0x2db8cd4b)}, {TOBN(0xd46fec04, 0xd0c63eaa), TOBN(0xa1cb92c5, 0x19ffa832), TOBN(0x678dd178, 0xe43a631f), TOBN(0xfb5ae1cd, 0x3dc788b3)}}, {{TOBN(0x68b4fb90, 0x6e77de04), TOBN(0x7992bcf0, 0xf06dbb97), TOBN(0x896e6a13, 0xc417c01d), TOBN(0x8d96332c, 0xb956be01)}, {TOBN(0x902fc93a, 0x413aa2b9), TOBN(0x99a4d915, 0xfc98c8a5), TOBN(0x52c29407, 0x565f1137), TOBN(0x4072690f, 0x21e4f281)}}, {{TOBN(0x36e607cf, 0x02ff6072), TOBN(0xa47d2ca9, 0x8ad98cdc), TOBN(0xbf471d1e, 0xf5f56609), TOBN(0xbcf86623, 0xf264ada0)}, {TOBN(0xb70c0687, 0xaa9e5cb6), TOBN(0xc98124f2, 0x17401c6c), TOBN(0x8189635f, 0xd4a61435), TOBN(0xd28fb8af, 0xa9d98ea6)}}, {{TOBN(0xb9a67c2a, 0x40c251f8), TOBN(0x88cd5d87, 0xa2da44be), TOBN(0x437deb96, 0xe09b5423), TOBN(0x150467db, 0x64287dc1)}, {TOBN(0xe161debb, 0xcdabb839), TOBN(0xa79e9742, 0xf1839a3e), TOBN(0xbb8dd3c2, 0x652d202b), TOBN(0x7b3e67f7, 0xe9f97d96)}}, {{TOBN(0x5aa5d78f, 0xb1cb6ac9), TOBN(0xffa13e8e, 0xca1d0d45), TOBN(0x369295dd, 0x2ba5bf95), TOBN(0xd68bd1f8, 0x39aff05e)}, {TOBN(0xaf0d86f9, 0x26d783f2), TOBN(0x543a59b3, 0xfc3aafc1), TOBN(0x3fcf81d2, 0x7b7da97c), TOBN(0xc990a056, 0xd25dee46)}}, {{TOBN(0x3e6775b8, 0x519cce2c), TOBN(0xfc9af71f, 0xae13d863), TOBN(0x774a4a6f, 0x47c1605c), TOBN(0x46ba4245, 0x2fd205e8)}, {TOBN(0xa06feea4, 0xd3fd524d), TOBN(0x1e724641, 0x6de1acc2), TOBN(0xf53816f1, 0x334e2b42), TOBN(0x49e5918e, 0x922f0024)}}, {{TOBN(0x439530b6, 0x65c7322d), TOBN(0xcf12cc01, 0xb3c1b3fb), TOBN(0xc70b0186, 0x0172f685), TOBN(0xb915ee22, 0x1b58391d)}, {TOBN(0x9afdf03b, 0xa317db24), TOBN(0x87dec659, 0x17b8ffc4), TOBN(0x7f46597b, 0xe4d3d050), TOBN(0x80a1c1ed, 0x006500e7)}}, {{TOBN(0x84902a96, 0x78bf030e), TOBN(0xfb5e9c9a, 0x50560148), TOBN(0x6dae0a92, 0x63362426), TOBN(0xdcaeecf4, 0xa9e30c40)}, {TOBN(0xc0d887bb, 0x518d0c6b), TOBN(0x99181152, 0xcb985b9d), TOBN(0xad186898, 0xef7bc381), TOBN(0x18168ffb, 0x9ee46201)}}, {{TOBN(0x9a04cdaa, 0x2502753c), TOBN(0xbb279e26, 0x51407c41), TOBN(0xeacb03aa, 0xf23564e5), TOBN(0x18336582, 0x71e61016)}, {TOBN(0x8684b8c4, 0xeb809877), TOBN(0xb336e18d, 0xea0e672e), TOBN(0xefb601f0, 0x34ee5867), TOBN(0x2733edbe, 0x1341cfd1)}}, {{TOBN(0xb15e809a, 0x26025c3c), TOBN(0xe6e981a6, 0x9350df88), TOBN(0x92376237, 0x8502fd8e), TOBN(0x4791f216, 0x0c12be9b)}, {TOBN(0xb7256789, 0x25f02425), TOBN(0xec863194, 0x7a974443), TOBN(0x7c0ce882, 0xfb41cc52), TOBN(0xc266ff7e, 0xf25c07f2)}}, {{TOBN(0x3d4da8c3, 0x017025f3), TOBN(0xefcf628c, 0xfb9579b4), TOBN(0x5c4d0016, 0x1f3716ec), TOBN(0x9c27ebc4, 0x6801116e)}, {TOBN(0x5eba0ea1, 0x1da1767e), TOBN(0xfe151452, 0x47004c57), TOBN(0x3ace6df6, 0x8c2373b7), TOBN(0x75c3dffe, 0x5dbc37ac)}}, {{TOBN(0x3dc32a73, 0xddc925fc), TOBN(0xb679c841, 0x2f65ee0b), TOBN(0x715a3295, 0x451cbfeb), TOBN(0xd9889768, 0xf76e9a29)}, {TOBN(0xec20ce7f, 0xb28ad247), TOBN(0xe99146c4, 0x00894d79), TOBN(0x71457d7c, 0x9f5e3ea7), TOBN(0x097b2662, 0x38030031)}}, {{TOBN(0xdb7f6ae6, 0xcf9f82a8), TOBN(0x319decb9, 0x438f473a), TOBN(0xa63ab386, 0x283856c3), TOBN(0x13e3172f, 0xb06a361b)}, {TOBN(0x2959f8dc, 0x7d5a006c), TOBN(0x2dbc27c6, 0x75fba752), TOBN(0xc1227ab2, 0x87c22c9e), TOBN(0x06f61f75, 0x71a268b2)}}, {{TOBN(0x1b6bb971, 0x04779ce2), 
TOBN(0xaca83812, 0x0aadcb1d), TOBN(0x297ae0bc, 0xaeaab2d5), TOBN(0xa5c14ee7, 0x5bfb9f13)}, {TOBN(0xaa00c583, 0xf17a62c7), TOBN(0x39eb962c, 0x173759f6), TOBN(0x1eeba1d4, 0x86c9a88f), TOBN(0x0ab6c37a, 0xdf016c5e)}}, {{TOBN(0xa2a147db, 0xa28a0749), TOBN(0x246c20d6, 0xee519165), TOBN(0x5068d1b1, 0xd3810715), TOBN(0xb1e7018c, 0x748160b9)}, {TOBN(0x03f5b1fa, 0xf380ff62), TOBN(0xef7fb1dd, 0xf3cb2c1e), TOBN(0xeab539a8, 0xfc91a7da), TOBN(0x83ddb707, 0xf3f9b561)}}, {{TOBN(0xc550e211, 0xfe7df7a4), TOBN(0xa7cd07f2, 0x063f6f40), TOBN(0xb0de3635, 0x2976879c), TOBN(0xb5f83f85, 0xe55741da)}, {TOBN(0x4ea9d25e, 0xf3d8ac3d), TOBN(0x6fe2066f, 0x62819f02), TOBN(0x4ab2b9c2, 0xcef4a564), TOBN(0x1e155d96, 0x5ffa2de3)}}, {{TOBN(0x0eb0a19b, 0xc3a72d00), TOBN(0x4037665b, 0x8513c31b), TOBN(0x2fb2b6bf, 0x04c64637), TOBN(0x45c34d6e, 0x08cdc639)}, {TOBN(0x56f1e10f, 0xf01fd796), TOBN(0x4dfb8101, 0xfe3667b8), TOBN(0xe0eda253, 0x9021d0c0), TOBN(0x7a94e9ff, 0x8a06c6ab)}}, {{TOBN(0x2d3bb0d9, 0xbb9aa882), TOBN(0xea20e4e5, 0xec05fd10), TOBN(0xed7eeb5f, 0x1a1ca64e), TOBN(0x2fa6b43c, 0xc6327cbd)}, {TOBN(0xb577e3cf, 0x3aa91121), TOBN(0x8c6bd5ea, 0x3a34079b), TOBN(0xd7e5ba39, 0x60e02fc0), TOBN(0xf16dd2c3, 0x90141bf8)}}, {{TOBN(0xb57276d9, 0x80101b98), TOBN(0x760883fd, 0xb82f0f66), TOBN(0x89d7de75, 0x4bc3eff3), TOBN(0x03b60643, 0x5dc2ab40)}, {TOBN(0xcd6e53df, 0xe05beeac), TOBN(0xf2f1e862, 0xbc3325cd), TOBN(0xdd0f7921, 0x774f03c3), TOBN(0x97ca7221, 0x4552cc1b)}}, {{TOBN(0x5a0d6afe, 0x1cd19f72), TOBN(0xa20915dc, 0xf183fbeb), TOBN(0x9fda4b40, 0x832c403c), TOBN(0x32738edd, 0xbe425442)}, {TOBN(0x469a1df6, 0xb5eccf1a), TOBN(0x4b5aff42, 0x28bbe1f0), TOBN(0x31359d7f, 0x570dfc93), TOBN(0xa18be235, 0xf0088628)}}, {{TOBN(0xa5b30fba, 0xb00ed3a9), TOBN(0x34c61374, 0x73cdf8be), TOBN(0x2c5c5f46, 0xabc56797), TOBN(0x5cecf93d, 0xb82a8ae2)}, {TOBN(0x7d3dbe41, 0xa968fbf0), TOBN(0xd23d4583, 0x1a5c7f3d), TOBN(0xf28f69a0, 0xc087a9c7), TOBN(0xc2d75471, 0x474471ca)}}, {{TOBN(0x36ec9f4a, 0x4eb732ec), TOBN(0x6c943bbd, 0xb1ca6bed), TOBN(0xd64535e1, 0xf2457892), TOBN(0x8b84a8ea, 0xf7e2ac06)}, {TOBN(0xe0936cd3, 0x2499dd5f), TOBN(0x12053d7e, 0x0ed04e57), TOBN(0x4bdd0076, 0xe4305d9d), TOBN(0x34a527b9, 0x1f67f0a2)}}, {{TOBN(0xe79a4af0, 0x9cec46ea), TOBN(0xb15347a1, 0x658b9bc7), TOBN(0x6bd2796f, 0x35af2f75), TOBN(0xac957990, 0x4051c435)}, {TOBN(0x2669dda3, 0xc33a655d), TOBN(0x5d503c2e, 0x88514aa3), TOBN(0xdfa11337, 0x3753dd41), TOBN(0x3f054673, 0x0b754f78)}}, {{TOBN(0xbf185677, 0x496125bd), TOBN(0xfb0023c8, 0x3775006c), TOBN(0xfa0f072f, 0x3a037899), TOBN(0x4222b6eb, 0x0e4aea57)}, {TOBN(0x3dde5e76, 0x7866d25a), TOBN(0xb6eb04f8, 0x4837aa6f), TOBN(0x5315591a, 0x2cf1cdb8), TOBN(0x6dfb4f41, 0x2d4e683c)}}, {{TOBN(0x7e923ea4, 0x48ee1f3a), TOBN(0x9604d9f7, 0x05a2afd5), TOBN(0xbe1d4a33, 0x40ea4948), TOBN(0x5b45f1f4, 0xb44cbd2f)}, {TOBN(0x5faf8376, 0x4acc757e), TOBN(0xa7cf9ab8, 0x63d68ff7), TOBN(0x8ad62f69, 0xdf0e404b), TOBN(0xd65f33c2, 0x12bdafdf)}}, {{TOBN(0xc365de15, 0xa377b14e), TOBN(0x6bf5463b, 0x8e39f60c), TOBN(0x62030d2d, 0x2ce68148), TOBN(0xd95867ef, 0xe6f843a8)}, {TOBN(0xd39a0244, 0xef5ab017), TOBN(0x0bd2d8c1, 0x4ab55d12), TOBN(0xc9503db3, 0x41639169), TOBN(0x2d4e25b0, 0xf7660c8a)}}, {{TOBN(0x760cb3b5, 0xe224c5d7), TOBN(0xfa3baf8c, 0x68616919), TOBN(0x9fbca113, 0x8d142552), TOBN(0x1ab18bf1, 0x7669ebf5)}, {TOBN(0x55e6f53e, 0x9bdf25dd), TOBN(0x04cc0bf3, 0xcb6cd154), TOBN(0x595bef49, 0x95e89080), TOBN(0xfe9459a8, 0x104a9ac1)}}, {{TOBN(0xad2d89ca, 0xcce9bb32), TOBN(0xddea65e1, 0xf7de8285), TOBN(0x62ed8c35, 0xb351bd4b), TOBN(0x4150ff36, 0x0c0e19a7)}, {TOBN(0x86e3c801, 
0x345f4e47), TOBN(0x3bf21f71, 0x203a266c), TOBN(0x7ae110d4, 0x855b1f13), TOBN(0x5d6aaf6a, 0x07262517)}}, {{TOBN(0x1e0f12e1, 0x813d28f1), TOBN(0x6000e11d, 0x7ad7a523), TOBN(0xc7d8deef, 0xc744a17b), TOBN(0x1e990b48, 0x14c05a00)}, {TOBN(0x68fddaee, 0x93e976d5), TOBN(0x696241d1, 0x46610d63), TOBN(0xb204e7c3, 0x893dda88), TOBN(0x8bccfa65, 0x6a3a6946)}}, {{TOBN(0xb59425b4, 0xc5cd1411), TOBN(0x701b4042, 0xff3658b1), TOBN(0xe3e56bca, 0x4784cf93), TOBN(0x27de5f15, 0x8fe68d60)}, {TOBN(0x4ab9cfce, 0xf8d53f19), TOBN(0xddb10311, 0xa40a730d), TOBN(0x6fa73cd1, 0x4eee0a8a), TOBN(0xfd548748, 0x5249719d)}}, {{TOBN(0x49d66316, 0xa8123ef0), TOBN(0x73c32db4, 0xe7f95438), TOBN(0x2e2ed209, 0x0d9e7854), TOBN(0xf98a9329, 0x9d9f0507)}, {TOBN(0xc5d33cf6, 0x0c6aa20a), TOBN(0x9a32ba14, 0x75279bb2), TOBN(0x7e3202cb, 0x774a7307), TOBN(0x64ed4bc4, 0xe8c42dbd)}}, {{TOBN(0xc20f1a06, 0xd4caed0d), TOBN(0xb8021407, 0x171d22b3), TOBN(0xd426ca04, 0xd13268d7), TOBN(0x92377007, 0x25f4d126)}, {TOBN(0x4204cbc3, 0x71f21a85), TOBN(0x18461b7a, 0xf82369ba), TOBN(0xc0c07d31, 0x3fc858f9), TOBN(0x5deb5a50, 0xe2bab569)}}, {{TOBN(0xd5959d46, 0xd5eea89e), TOBN(0xfdff8424, 0x08437f4b), TOBN(0xf21071e4, 0x3cfe254f), TOBN(0x72417696, 0x95468321)}, {TOBN(0x5d8288b9, 0x102cae3e), TOBN(0x2d143e3d, 0xf1965dff), TOBN(0x00c9a376, 0xa078d847), TOBN(0x6fc0da31, 0x26028731)}}, {{TOBN(0xa2baeadf, 0xe45083a2), TOBN(0x66bc7218, 0x5e5b4bcd), TOBN(0x2c826442, 0xd04b8e7f), TOBN(0xc19f5451, 0x6c4b586b)}, {TOBN(0x60182c49, 0x5b7eeed5), TOBN(0xd9954ecd, 0x7aa9dfa1), TOBN(0xa403a8ec, 0xc73884ad), TOBN(0x7fb17de2, 0x9bb39041)}}, {{TOBN(0x694b64c5, 0xabb020e8), TOBN(0x3d18c184, 0x19c4eec7), TOBN(0x9c4673ef, 0x1c4793e5), TOBN(0xc7b8aeb5, 0x056092e6)}, {TOBN(0x3aa1ca43, 0xf0f8c16b), TOBN(0x224ed5ec, 0xd679b2f6), TOBN(0x0d56eeaf, 0x55a205c9), TOBN(0xbfe115ba, 0x4b8e028b)}}, {{TOBN(0x97e60849, 0x3927f4fe), TOBN(0xf91fbf94, 0x759aa7c5), TOBN(0x985af769, 0x6be90a51), TOBN(0xc1277b78, 0x78ccb823)}, {TOBN(0x395b656e, 0xe7a75952), TOBN(0x00df7de0, 0x928da5f5), TOBN(0x09c23175, 0x4ca4454f), TOBN(0x4ec971f4, 0x7aa2d3c1)}}, {{TOBN(0x45c3c507, 0xe75d9ccc), TOBN(0x63b7be8a, 0x3dc90306), TOBN(0x37e09c66, 0x5db44bdc), TOBN(0x50d60da1, 0x6841c6a2)}, {TOBN(0x6f9b65ee, 0x08df1b12), TOBN(0x38734879, 0x7ff089df), TOBN(0x9c331a66, 0x3fe8013d), TOBN(0x017f5de9, 0x5f42fcc8)}}, {{TOBN(0x43077866, 0xe8e57567), TOBN(0xc9f781ce, 0xf9fcdb18), TOBN(0x38131dda, 0x9b12e174), TOBN(0x25d84aa3, 0x8a03752a)}, {TOBN(0x45e09e09, 0x4d0c0ce2), TOBN(0x1564008b, 0x92bebba5), TOBN(0xf7e8ad31, 0xa87284c7), TOBN(0xb7c4b46c, 0x97e7bbaa)}}, {{TOBN(0x3e22a7b3, 0x97acf4ec), TOBN(0x0426c400, 0x5ea8b640), TOBN(0x5e3295a6, 0x4e969285), TOBN(0x22aabc59, 0xa6a45670)}, {TOBN(0xb929714c, 0x5f5942bc), TOBN(0x9a6168bd, 0xfa3182ed), TOBN(0x2216a665, 0x104152ba), TOBN(0x46908d03, 0xb6926368)}}}, {{{TOBN(0xa9f5d874, 0x5a1251fb), TOBN(0x967747a8, 0xc72725c7), TOBN(0x195c33e5, 0x31ffe89e), TOBN(0x609d210f, 0xe964935e)}, {TOBN(0xcafd6ca8, 0x2fe12227), TOBN(0xaf9b5b96, 0x0426469d), TOBN(0x2e9ee04c, 0x5693183c), TOBN(0x1084a333, 0xc8146fef)}}, {{TOBN(0x96649933, 0xaed1d1f7), TOBN(0x566eaff3, 0x50563090), TOBN(0x345057f0, 0xad2e39cf), TOBN(0x148ff65b, 0x1f832124)}, {TOBN(0x042e89d4, 0xcf94cf0d), TOBN(0x319bec84, 0x520c58b3), TOBN(0x2a267626, 0x5361aa0d), TOBN(0xc86fa302, 0x8fbc87ad)}}, {{TOBN(0xfc83d2ab, 0x5c8b06d5), TOBN(0xb1a785a2, 0xfe4eac46), TOBN(0xb99315bc, 0x846f7779), TOBN(0xcf31d816, 0xef9ea505)}, {TOBN(0x2391fe6a, 0x15d7dc85), TOBN(0x2f132b04, 0xb4016b33), TOBN(0x29547fe3, 0x181cb4c7), TOBN(0xdb66d8a6, 0x650155a1)}}, 
{{TOBN(0x6b66d7e1, 0xadc1696f), TOBN(0x98ebe593, 0x0acd72d0), TOBN(0x65f24550, 0xcc1b7435), TOBN(0xce231393, 0xb4b9a5ec)}, {TOBN(0x234a22d4, 0xdb067df9), TOBN(0x98dda095, 0xcaff9b00), TOBN(0x1bbc75a0, 0x6100c9c1), TOBN(0x1560a9c8, 0x939cf695)}}, {{TOBN(0xcf006d3e, 0x99e0925f), TOBN(0x2dd74a96, 0x6322375a), TOBN(0xc58b446a, 0xb56af5ba), TOBN(0x50292683, 0xe0b9b4f1)}, {TOBN(0xe2c34cb4, 0x1aeaffa3), TOBN(0x8b17203f, 0x9b9587c1), TOBN(0x6d559207, 0xead1350c), TOBN(0x2b66a215, 0xfb7f9604)}}, {{TOBN(0x0850325e, 0xfe51bf74), TOBN(0x9c4f579e, 0x5e460094), TOBN(0x5c87b92a, 0x76da2f25), TOBN(0x889de4e0, 0x6febef33)}, {TOBN(0x6900ec06, 0x646083ce), TOBN(0xbe2a0335, 0xbfe12773), TOBN(0xadd1da35, 0xc5344110), TOBN(0x757568b7, 0xb802cd20)}}, {{TOBN(0x75559779, 0x00f7e6c8), TOBN(0x38e8b94f, 0x0facd2f0), TOBN(0xfea1f3af, 0x03fde375), TOBN(0x5e11a1d8, 0x75881dfc)}, {TOBN(0xb3a6b02e, 0xc1e2f2ef), TOBN(0x193d2bbb, 0xc605a6c5), TOBN(0x325ffeee, 0x339a0b2d), TOBN(0x27b6a724, 0x9e0c8846)}}, {{TOBN(0xe4050f1c, 0xf1c367ca), TOBN(0x9bc85a9b, 0xc90fbc7d), TOBN(0xa373c4a2, 0xe1a11032), TOBN(0xb64232b7, 0xad0393a9)}, {TOBN(0xf5577eb0, 0x167dad29), TOBN(0x1604f301, 0x94b78ab2), TOBN(0x0baa94af, 0xe829348b), TOBN(0x77fbd8dd, 0x41654342)}}, {{TOBN(0xdab50ea5, 0xb964e39a), TOBN(0xd4c29e3c, 0xd0d3c76e), TOBN(0x80dae67c, 0x56d11964), TOBN(0x7307a8bf, 0xe5ffcc2f)}, {TOBN(0x65bbc1aa, 0x91708c3b), TOBN(0xa151e62c, 0x28bf0eeb), TOBN(0x6cb53381, 0x6fa34db7), TOBN(0x5139e05c, 0xa29403a8)}}, {{TOBN(0x6ff651b4, 0x94a7cd2e), TOBN(0x5671ffd1, 0x0699336c), TOBN(0x6f5fd2cc, 0x979a896a), TOBN(0x11e893a8, 0xd8148cef)}, {TOBN(0x988906a1, 0x65cf7b10), TOBN(0x81b67178, 0xc50d8485), TOBN(0x7c0deb35, 0x8a35b3de), TOBN(0x423ac855, 0xc1d29799)}}, {{TOBN(0xaf580d87, 0xdac50b74), TOBN(0x28b2b89f, 0x5869734c), TOBN(0x99a3b936, 0x874e28fb), TOBN(0xbb2c9190, 0x25f3f73a)}, {TOBN(0x199f6918, 0x84a9d5b7), TOBN(0x7ebe2325, 0x7e770374), TOBN(0xf442e107, 0x0738efe2), TOBN(0xcf9f3f56, 0xcf9082d2)}}, {{TOBN(0x719f69e1, 0x09618708), TOBN(0xcc9e8364, 0xc183f9b1), TOBN(0xec203a95, 0x366a21af), TOBN(0x6aec5d6d, 0x068b141f)}, {TOBN(0xee2df78a, 0x994f04e9), TOBN(0xb39ccae8, 0x271245b0), TOBN(0xb875a4a9, 0x97e43f4f), TOBN(0x507dfe11, 0xdb2cea98)}}, {{TOBN(0x4fbf81cb, 0x489b03e9), TOBN(0xdb86ec5b, 0x6ec414fa), TOBN(0xfad444f9, 0xf51b3ae5), TOBN(0xca7d33d6, 0x1914e3fe)}, {TOBN(0xa9c32f5c, 0x0ae6c4d0), TOBN(0xa9ca1d1e, 0x73969568), TOBN(0x98043c31, 0x1aa7467e), TOBN(0xe832e75c, 0xe21b5ac6)}}, {{TOBN(0x314b7aea, 0x5232123d), TOBN(0x08307c8c, 0x65ae86db), TOBN(0x06e7165c, 0xaa4668ed), TOBN(0xb170458b, 0xb4d3ec39)}, {TOBN(0x4d2e3ec6, 0xc19bb986), TOBN(0xc5f34846, 0xae0304ed), TOBN(0x917695a0, 0x6c9f9722), TOBN(0x6c7f7317, 0x4cab1c0a)}}, {{TOBN(0x6295940e, 0x9d6d2e8b), TOBN(0xd318b8c1, 0x549f7c97), TOBN(0x22453204, 0x97713885), TOBN(0x468d834b, 0xa8a440fe)}, {TOBN(0xd81fe5b2, 0xbfba796e), TOBN(0x152364db, 0x6d71f116), TOBN(0xbb8c7c59, 0xb5b66e53), TOBN(0x0b12c61b, 0x2641a192)}}, {{TOBN(0x31f14802, 0xfcf0a7fd), TOBN(0x42fd0789, 0x5488b01e), TOBN(0x71d78d6d, 0x9952b498), TOBN(0x8eb572d9, 0x07ac5201)}, {TOBN(0xe0a2a44c, 0x4d194a88), TOBN(0xd2b63fd9, 0xba017e66), TOBN(0x78efc6c8, 0xf888aefc), TOBN(0xb76f6bda, 0x4a881a11)}}, {{TOBN(0x187f314b, 0xb46c2397), TOBN(0x004cf566, 0x5ded2819), TOBN(0xa9ea5704, 0x38764d34), TOBN(0xbba45217, 0x78084709)}, {TOBN(0x06474571, 0x1171121e), TOBN(0xad7b7eb1, 0xe7c9b671), TOBN(0xdacfbc40, 0x730f7507), TOBN(0x178cd8c6, 0xc7ad7bd1)}}, {{TOBN(0xbf0be101, 0xb2a67238), TOBN(0x3556d367, 0xaf9c14f2), TOBN(0x104b7831, 0xa5662075), TOBN(0x58ca59bb, 
0x79d9e60a)}, {TOBN(0x4bc45392, 0xa569a73b), TOBN(0x517a52e8, 0x5698f6c9), TOBN(0x85643da5, 0xaeadd755), TOBN(0x1aed0cd5, 0x2a581b84)}}, {{TOBN(0xb9b4ff84, 0x80af1372), TOBN(0x244c3113, 0xf1ba5d1f), TOBN(0x2a5dacbe, 0xf5f98d31), TOBN(0x2c3323e8, 0x4375bc2a)}, {TOBN(0x17a3ab4a, 0x5594b1dd), TOBN(0xa1928bfb, 0xceb4797e), TOBN(0xe83af245, 0xe4886a19), TOBN(0x8979d546, 0x72b5a74a)}}, {{TOBN(0xa0f726bc, 0x19f9e967), TOBN(0xd9d03152, 0xe8fbbf4e), TOBN(0xcfd6f51d, 0xb7707d40), TOBN(0x633084d9, 0x63f6e6e0)}, {TOBN(0xedcd9cdc, 0x55667eaf), TOBN(0x73b7f92b, 0x2e44d56f), TOBN(0xfb2e39b6, 0x4e962b14), TOBN(0x7d408f6e, 0xf671fcbf)}}, {{TOBN(0xcc634ddc, 0x164a89bb), TOBN(0x74a42bb2, 0x3ef3bd05), TOBN(0x1280dbb2, 0x428decbb), TOBN(0x6103f6bb, 0x402c8596)}, {TOBN(0xfa2bf581, 0x355a5752), TOBN(0x562f96a8, 0x00946674), TOBN(0x4e4ca16d, 0x6da0223b), TOBN(0xfe47819f, 0x28d3aa25)}}, {{TOBN(0x9eea3075, 0xf8dfcf8a), TOBN(0xa284f0aa, 0x95669825), TOBN(0xb3fca250, 0x867d3fd8), TOBN(0x20757b5f, 0x269d691e)}, {TOBN(0xf2c24020, 0x93b8a5de), TOBN(0xd3f93359, 0xebc06da6), TOBN(0x1178293e, 0xb2739c33), TOBN(0xd2a3e770, 0xbcd686e5)}}, {{TOBN(0xa76f49f4, 0xcd941534), TOBN(0x0d37406b, 0xe3c71c0e), TOBN(0x172d9397, 0x3b97f7e3), TOBN(0xec17e239, 0xbd7fd0de)}, {TOBN(0xe3290551, 0x6f496ba2), TOBN(0x6a693172, 0x36ad50e7), TOBN(0xc4e539a2, 0x83e7eff5), TOBN(0x752737e7, 0x18e1b4cf)}}, {{TOBN(0xa2f7932c, 0x68af43ee), TOBN(0x5502468e, 0x703d00bd), TOBN(0xe5dc978f, 0x2fb061f5), TOBN(0xc9a1904a, 0x28c815ad)}, {TOBN(0xd3af538d, 0x470c56a4), TOBN(0x159abc5f, 0x193d8ced), TOBN(0x2a37245f, 0x20108ef3), TOBN(0xfa17081e, 0x223f7178)}}, {{TOBN(0x27b0fb2b, 0x10c8c0f5), TOBN(0x2102c3ea, 0x40650547), TOBN(0x594564df, 0x8ac3bfa7), TOBN(0x98102033, 0x509dad96)}, {TOBN(0x6989643f, 0xf1d18a13), TOBN(0x35eebd91, 0xd7fc5af0), TOBN(0x078d096a, 0xfaeaafd8), TOBN(0xb7a89341, 0xdef3de98)}}, {{TOBN(0x2a206e8d, 0xecf2a73a), TOBN(0x066a6397, 0x8e551994), TOBN(0x3a6a088a, 0xb98d53a2), TOBN(0x0ce7c67c, 0x2d1124aa)}, {TOBN(0x48cec671, 0x759a113c), TOBN(0xe3b373d3, 0x4f6f67fa), TOBN(0x5455d479, 0xfd36727b), TOBN(0xe5a428ee, 0xa13c0d81)}}, {{TOBN(0xb853dbc8, 0x1c86682b), TOBN(0xb78d2727, 0xb8d02b2a), TOBN(0xaaf69bed, 0x8ebc329a), TOBN(0xdb6b40b3, 0x293b2148)}, {TOBN(0xe42ea77d, 0xb8c4961f), TOBN(0xb1a12f7c, 0x20e5e0ab), TOBN(0xa0ec5274, 0x79e8b05e), TOBN(0x68027391, 0xfab60a80)}}, {{TOBN(0x6bfeea5f, 0x16b1bd5e), TOBN(0xf957e420, 0x4de30ad3), TOBN(0xcbaf664e, 0x6a353b9e), TOBN(0x5c873312, 0x26d14feb)}, {TOBN(0x4e87f98c, 0xb65f57cb), TOBN(0xdb60a621, 0x5e0cdd41), TOBN(0x67c16865, 0xa6881440), TOBN(0x1093ef1a, 0x46ab52aa)}}, {{TOBN(0xc095afb5, 0x3f4ece64), TOBN(0x6a6bb02e, 0x7604551a), TOBN(0x55d44b4e, 0x0b26b8cd), TOBN(0xe5f9a999, 0xf971268a)}, {TOBN(0xc08ec425, 0x11a7de84), TOBN(0x83568095, 0xfda469dd), TOBN(0x737bfba1, 0x6c6c90a2), TOBN(0x1cb9c4a0, 0xbe229831)}}, {{TOBN(0x93bccbba, 0xbb2eec64), TOBN(0xa0c23b64, 0xda03adbe), TOBN(0x5f7aa00a, 0xe0e86ac4), TOBN(0x470b941e, 0xfc1401e6)}, {TOBN(0x5ad8d679, 0x9df43574), TOBN(0x4ccfb8a9, 0x0f65d810), TOBN(0x1bce80e3, 0xaa7fbd81), TOBN(0x273291ad, 0x9508d20a)}}, {{TOBN(0xf5c4b46b, 0x42a92806), TOBN(0x810684ec, 0xa86ab44a), TOBN(0x4591640b, 0xca0bc9f8), TOBN(0xb5efcdfc, 0x5c4b6054)}, {TOBN(0x16fc8907, 0x6e9edd12), TOBN(0xe29d0b50, 0xd4d792f9), TOBN(0xa45fd01c, 0x9b03116d), TOBN(0x85035235, 0xc81765a4)}}, {{TOBN(0x1fe2a9b2, 0xb4b4b67c), TOBN(0xc1d10df0, 0xe8020604), TOBN(0x9d64abfc, 0xbc8058d8), TOBN(0x8943b9b2, 0x712a0fbb)}, {TOBN(0x90eed914, 0x3b3def04), TOBN(0x85ab3aa2, 0x4ce775ff), TOBN(0x605fd4ca, 0x7bbc9040), 
TOBN(0x8b34a564, 0xe2c75dfb)}}, {{TOBN(0x41ffc94a, 0x10358560), TOBN(0x2d8a5072, 0x9e5c28aa), TOBN(0xe915a0fc, 0x4cc7eb15), TOBN(0xe9efab05, 0x8f6d0f5d)}, {TOBN(0xdbab47a9, 0xd19e9b91), TOBN(0x8cfed745, 0x0276154c), TOBN(0x154357ae, 0x2cfede0d), TOBN(0x520630df, 0x19f5a4ef)}}, {{TOBN(0x25759f7c, 0xe382360f), TOBN(0xb6db05c9, 0x88bf5857), TOBN(0x2917d61d, 0x6c58d46c), TOBN(0x14f8e491, 0xfd20cb7a)}, {TOBN(0xb68a727a, 0x11c20340), TOBN(0x0386f86f, 0xaf7ccbb6), TOBN(0x5c8bc6cc, 0xfee09a20), TOBN(0x7d76ff4a, 0xbb7eea35)}}, {{TOBN(0xa7bdebe7, 0xdb15be7a), TOBN(0x67a08054, 0xd89f0302), TOBN(0x56bf0ea9, 0xc1193364), TOBN(0xc8244467, 0x62837ebe)}, {TOBN(0x32bd8e8b, 0x20d841b8), TOBN(0x127a0548, 0xdbb8a54f), TOBN(0x83dd4ca6, 0x63b20236), TOBN(0x87714718, 0x203491fa)}}, {{TOBN(0x4dabcaaa, 0xaa8a5288), TOBN(0x91cc0c8a, 0xaf23a1c9), TOBN(0x34c72c6a, 0x3f220e0c), TOBN(0xbcc20bdf, 0x1232144a)}, {TOBN(0x6e2f42da, 0xa20ede1b), TOBN(0xc441f00c, 0x74a00515), TOBN(0xbf46a5b6, 0x734b8c4b), TOBN(0x57409503, 0x7b56c9a4)}}, {{TOBN(0x9f735261, 0xe4585d45), TOBN(0x9231faed, 0x6734e642), TOBN(0x1158a176, 0xbe70ee6c), TOBN(0x35f1068d, 0x7c3501bf)}, {TOBN(0x6beef900, 0xa2d26115), TOBN(0x649406f2, 0xef0afee3), TOBN(0x3f43a60a, 0xbc2420a1), TOBN(0x509002a7, 0xd5aee4ac)}}, {{TOBN(0xb46836a5, 0x3ff3571b), TOBN(0x24f98b78, 0x837927c1), TOBN(0x6254256a, 0x4533c716), TOBN(0xf27abb0b, 0xd07ee196)}, {TOBN(0xd7cf64fc, 0x5c6d5bfd), TOBN(0x6915c751, 0xf0cd7a77), TOBN(0xd9f59012, 0x8798f534), TOBN(0x772b0da8, 0xf81d8b5f)}}, {{TOBN(0x1244260c, 0x2e03fa69), TOBN(0x36cf0e3a, 0x3be1a374), TOBN(0x6e7c1633, 0xef06b960), TOBN(0xa71a4c55, 0x671f90f6)}, {TOBN(0x7a941251, 0x33c673db), TOBN(0xc0bea510, 0x73e8c131), TOBN(0x61a8a699, 0xd4f6c734), TOBN(0x25e78c88, 0x341ed001)}}, {{TOBN(0x5c18acf8, 0x8e2f7d90), TOBN(0xfdbf33d7, 0x77be32cd), TOBN(0x0a085cd7, 0xd2eb5ee9), TOBN(0x2d702cfb, 0xb3201115)}, {TOBN(0xb6e0ebdb, 0x85c88ce8), TOBN(0x23a3ce3c, 0x1e01d617), TOBN(0x3041618e, 0x567333ac), TOBN(0x9dd0fd8f, 0x157edb6b)}}, {{TOBN(0x27f74702, 0xb57872b8), TOBN(0x2ef26b4f, 0x657d5fe1), TOBN(0x95426f0a, 0x57cf3d40), TOBN(0x847e2ad1, 0x65a6067a)}, {TOBN(0xd474d9a0, 0x09996a74), TOBN(0x16a56acd, 0x2a26115c), TOBN(0x02a615c3, 0xd16f4d43), TOBN(0xcc3fc965, 0xaadb85b7)}}, {{TOBN(0x386bda73, 0xce07d1b0), TOBN(0xd82910c2, 0x58ad4178), TOBN(0x124f82cf, 0xcd2617f4), TOBN(0xcc2f5e8d, 0xef691770)}, {TOBN(0x82702550, 0xb8c30ccc), TOBN(0x7b856aea, 0x1a8e575a), TOBN(0xbb822fef, 0xb1ab9459), TOBN(0x085928bc, 0xec24e38e)}}, {{TOBN(0x5d0402ec, 0xba8f4b4d), TOBN(0xc07cd4ba, 0x00b4d58b), TOBN(0x5d8dffd5, 0x29227e7a), TOBN(0x61d44d0c, 0x31bf386f)}, {TOBN(0xe486dc2b, 0x135e6f4d), TOBN(0x680962eb, 0xe79410ef), TOBN(0xa61bd343, 0xf10088b5), TOBN(0x6aa76076, 0xe2e28686)}}, {{TOBN(0x80463d11, 0x8fb98871), TOBN(0xcb26f5c3, 0xbbc76aff), TOBN(0xd4ab8edd, 0xfbe03614), TOBN(0xc8eb579b, 0xc0cf2dee)}, {TOBN(0xcc004c15, 0xc93bae41), TOBN(0x46fbae5d, 0x3aeca3b2), TOBN(0x671235cf, 0x0f1e9ab1), TOBN(0xadfba934, 0x9ec285c1)}}, {{TOBN(0x88ded013, 0xf216c980), TOBN(0xc8ac4fb8, 0xf79e0bc1), TOBN(0xa29b89c6, 0xfb97a237), TOBN(0xb697b780, 0x9922d8e7)}, {TOBN(0x3142c639, 0xddb945b5), TOBN(0x447b06c7, 0xe094c3a9), TOBN(0xcdcb3642, 0x72266c90), TOBN(0x633aad08, 0xa9385046)}}, {{TOBN(0xa36c936b, 0xb57c6477), TOBN(0x871f8b64, 0xe94dbcc6), TOBN(0x28d0fb62, 0xa591a67b), TOBN(0x9d40e081, 0xc1d926f5)}, {TOBN(0x3111eaf6, 0xf2d84b5a), TOBN(0x228993f9, 0xa565b644), TOBN(0x0ccbf592, 0x2c83188b), TOBN(0xf87b30ab, 0x3df3e197)}}, {{TOBN(0xb8658b31, 0x7642bca8), TOBN(0x1a032d7f, 0x52800f17), TOBN(0x051dcae5, 
0x79bf9445), TOBN(0xeba6b8ee, 0x54a2e253)}, {TOBN(0x5c8b9cad, 0xd4485692), TOBN(0x84bda40e, 0x8986e9be), TOBN(0xd16d16a4, 0x2f0db448), TOBN(0x8ec80050, 0xa14d4188)}}, {{TOBN(0xb2b26107, 0x98fa7aaa), TOBN(0x41209ee4, 0xf073aa4e), TOBN(0xf1570359, 0xf2d6b19b), TOBN(0xcbe6868c, 0xfc577caf)}, {TOBN(0x186c4bdc, 0x32c04dd3), TOBN(0xa6c35fae, 0xcfeee397), TOBN(0xb4a1b312, 0xf086c0cf), TOBN(0xe0a5ccc6, 0xd9461fe2)}}, {{TOBN(0xc32278aa, 0x1536189f), TOBN(0x1126c55f, 0xba6df571), TOBN(0x0f71a602, 0xb194560e), TOBN(0x8b2d7405, 0x324bd6e1)}, {TOBN(0x8481939e, 0x3738be71), TOBN(0xb5090b1a, 0x1a4d97a9), TOBN(0x116c65a3, 0xf05ba915), TOBN(0x21863ad3, 0xaae448aa)}}, {{TOBN(0xd24e2679, 0xa7aae5d3), TOBN(0x7076013d, 0x0de5c1c4), TOBN(0x2d50f8ba, 0xbb05b629), TOBN(0x73c1abe2, 0x6e66efbb)}, {TOBN(0xefd4b422, 0xf2488af7), TOBN(0xe4105d02, 0x663ba575), TOBN(0x7eb60a8b, 0x53a69457), TOBN(0x62210008, 0xc945973b)}}, {{TOBN(0xfb255478, 0x77a50ec6), TOBN(0xbf0392f7, 0x0a37a72c), TOBN(0xa0a7a19c, 0x4be18e7a), TOBN(0x90d8ea16, 0x25b1e0af)}, {TOBN(0x7582a293, 0xef953f57), TOBN(0x90a64d05, 0xbdc5465a), TOBN(0xca79c497, 0xe2510717), TOBN(0x560dbb7c, 0x18cb641f)}}, {{TOBN(0x1d8e3286, 0x4b66abfb), TOBN(0xd26f52e5, 0x59030900), TOBN(0x1ee3f643, 0x5584941a), TOBN(0x6d3b3730, 0x569f5958)}, {TOBN(0x9ff2a62f, 0x4789dba5), TOBN(0x91fcb815, 0x72b5c9b7), TOBN(0xf446cb7d, 0x6c8f9a0e), TOBN(0x48f625c1, 0x39b7ecb5)}}, {{TOBN(0xbabae801, 0x1c6219b8), TOBN(0xe7a562d9, 0x28ac2f23), TOBN(0xe1b48732, 0x26e20588), TOBN(0x06ee1cad, 0x775af051)}, {TOBN(0xda29ae43, 0xfaff79f7), TOBN(0xc141a412, 0x652ee9e0), TOBN(0x1e127f6f, 0x195f4bd0), TOBN(0x29c6ab4f, 0x072f34f8)}}, {{TOBN(0x7b7c1477, 0x30448112), TOBN(0x82b51af1, 0xe4a38656), TOBN(0x2bf2028a, 0x2f315010), TOBN(0xc9a4a01f, 0x6ea88cd4)}, {TOBN(0xf63e95d8, 0x257e5818), TOBN(0xdd8efa10, 0xb4519b16), TOBN(0xed8973e0, 0x0da910bf), TOBN(0xed49d077, 0x5c0fe4a9)}}, {{TOBN(0xac3aac5e, 0xb7caee1e), TOBN(0x1033898d, 0xa7f4da57), TOBN(0x42145c0e, 0x5c6669b9), TOBN(0x42daa688, 0xc1aa2aa0)}, {TOBN(0x629cc15c, 0x1a1d885a), TOBN(0x25572ec0, 0xf4b76817), TOBN(0x8312e435, 0x9c8f8f28), TOBN(0x8107f8cd, 0x81965490)}}, {{TOBN(0x516ff3a3, 0x6fa6110c), TOBN(0x74fb1eb1, 0xfb93561f), TOBN(0x6c0c9047, 0x8457522b), TOBN(0xcfd32104, 0x6bb8bdc6)}, {TOBN(0x2d6884a2, 0xcc80ad57), TOBN(0x7c27fc35, 0x86a9b637), TOBN(0x3461baed, 0xadf4e8cd), TOBN(0x1d56251a, 0x617242f0)}}, {{TOBN(0x0b80d209, 0xc955bef4), TOBN(0xdf02cad2, 0x06adb047), TOBN(0xf0d7cb91, 0x5ec74fee), TOBN(0xd2503375, 0x1111ba44)}, {TOBN(0x9671755e, 0xdf53cb36), TOBN(0x54dcb612, 0x3368551b), TOBN(0x66d69aac, 0xc8a025a4), TOBN(0x6be946c6, 0xe77ef445)}}, {{TOBN(0x719946d1, 0xa995e094), TOBN(0x65e848f6, 0xe51e04d8), TOBN(0xe62f3300, 0x6a1e3113), TOBN(0x1541c7c1, 0x501de503)}, {TOBN(0x4daac9fa, 0xf4acfade), TOBN(0x0e585897, 0x44cd0b71), TOBN(0x544fd869, 0x0a51cd77), TOBN(0x60fc20ed, 0x0031016d)}}, {{TOBN(0x58b404ec, 0xa4276867), TOBN(0x46f6c3cc, 0x34f34993), TOBN(0x477ca007, 0xc636e5bd), TOBN(0x8018f5e5, 0x7c458b47)}, {TOBN(0xa1202270, 0xe47b668f), TOBN(0xcef48ccd, 0xee14f203), TOBN(0x23f98bae, 0x62ff9b4d), TOBN(0x55acc035, 0xc589eddd)}}, {{TOBN(0x3fe712af, 0x64db4444), TOBN(0x19e9d634, 0xbecdd480), TOBN(0xe08bc047, 0xa930978a), TOBN(0x2dbf24ec, 0xa1280733)}, {TOBN(0x3c0ae38c, 0x2cd706b2), TOBN(0x5b012a5b, 0x359017b9), TOBN(0x3943c38c, 0x72e0f5ae), TOBN(0x786167ea, 0x57176fa3)}}, {{TOBN(0xe5f9897d, 0x594881dc), TOBN(0x6b5efad8, 0xcfb820c1), TOBN(0xb2179093, 0xd55018de), TOBN(0x39ad7d32, 0x0bac56ce)}, {TOBN(0xb55122e0, 0x2cfc0e81), TOBN(0x117c4661, 0xf6d89daa), 
TOBN(0x362d01e1, 0xcb64fa09), TOBN(0x6a309b4e, 0x3e9c4ddd)}}, {{TOBN(0xfa979fb7, 0xabea49b1), TOBN(0xb4b1d27d, 0x10e2c6c5), TOBN(0xbd61c2c4, 0x23afde7a), TOBN(0xeb6614f8, 0x9786d358)}, {TOBN(0x4a5d816b, 0x7f6f7459), TOBN(0xe431a44f, 0x09360e7b), TOBN(0x8c27a032, 0xc309914c), TOBN(0xcea5d68a, 0xcaede3d8)}}, {{TOBN(0x3668f665, 0x3a0a3f95), TOBN(0x89369416, 0x7ceba27b), TOBN(0x89981fad, 0xe4728fe9), TOBN(0x7102c8a0, 0x8a093562)}, {TOBN(0xbb80310e, 0x235d21c8), TOBN(0x505e55d1, 0xbefb7f7b), TOBN(0xa0a90811, 0x12958a67), TOBN(0xd67e106a, 0x4d851fef)}}, {{TOBN(0xb84011a9, 0x431dd80e), TOBN(0xeb7c7cca, 0x73306cd9), TOBN(0x20fadd29, 0xd1b3b730), TOBN(0x83858b5b, 0xfe37b3d3)}, {TOBN(0xbf4cd193, 0xb6251d5c), TOBN(0x1cca1fd3, 0x1352d952), TOBN(0xc66157a4, 0x90fbc051), TOBN(0x7990a638, 0x89b98636)}}}, {{{TOBN(0xe5aa692a, 0x87dec0e1), TOBN(0x010ded8d, 0xf7b39d00), TOBN(0x7b1b80c8, 0x54cfa0b5), TOBN(0x66beb876, 0xa0f8ea28)}, {TOBN(0x50d7f531, 0x3476cd0e), TOBN(0xa63d0e65, 0xb08d3949), TOBN(0x1a09eea9, 0x53479fc6), TOBN(0x82ae9891, 0xf499e742)}}, {{TOBN(0xab58b910, 0x5ca7d866), TOBN(0x582967e2, 0x3adb3b34), TOBN(0x89ae4447, 0xcceac0bc), TOBN(0x919c667c, 0x7bf56af5)}, {TOBN(0x9aec17b1, 0x60f5dcd7), TOBN(0xec697b9f, 0xddcaadbc), TOBN(0x0b98f341, 0x463467f5), TOBN(0xb187f1f7, 0xa967132f)}}, {{TOBN(0x90fe7a1d, 0x214aeb18), TOBN(0x1506af3c, 0x741432f7), TOBN(0xbb5565f9, 0xe591a0c4), TOBN(0x10d41a77, 0xb44f1bc3)}, {TOBN(0xa09d65e4, 0xa84bde96), TOBN(0x42f060d8, 0xf20a6a1c), TOBN(0x652a3bfd, 0xf27f9ce7), TOBN(0xb6bdb65c, 0x3b3d739f)}}, {{TOBN(0xeb5ddcb6, 0xec7fae9f), TOBN(0x995f2714, 0xefb66e5a), TOBN(0xdee95d8e, 0x69445d52), TOBN(0x1b6c2d46, 0x09e27620)}, {TOBN(0x32621c31, 0x8129d716), TOBN(0xb03909f1, 0x0958c1aa), TOBN(0x8c468ef9, 0x1af4af63), TOBN(0x162c429f, 0xfba5cdf6)}}, {{TOBN(0x2f682343, 0x753b9371), TOBN(0x29cab45a, 0x5f1f9cd7), TOBN(0x571623ab, 0xb245db96), TOBN(0xc507db09, 0x3fd79999)}, {TOBN(0x4e2ef652, 0xaf036c32), TOBN(0x86f0cc78, 0x05018e5c), TOBN(0xc10a73d4, 0xab8be350), TOBN(0x6519b397, 0x7e826327)}}, {{TOBN(0xe8cb5eef, 0x9c053df7), TOBN(0x8de25b37, 0xb300ea6f), TOBN(0xdb03fa92, 0xc849cffb), TOBN(0x242e43a7, 0xe84169bb)}, {TOBN(0xe4fa51f4, 0xdd6f958e), TOBN(0x6925a77f, 0xf4445a8d), TOBN(0xe6e72a50, 0xe90d8949), TOBN(0xc66648e3, 0x2b1f6390)}}, {{TOBN(0xb2ab1957, 0x173e460c), TOBN(0x1bbbce75, 0x30704590), TOBN(0xc0a90dbd, 0xdb1c7162), TOBN(0x505e399e, 0x15cdd65d)}, {TOBN(0x68434dcb, 0x57797ab7), TOBN(0x60ad35ba, 0x6a2ca8e8), TOBN(0x4bfdb1e0, 0xde3336c1), TOBN(0xbbef99eb, 0xd8b39015)}}, {{TOBN(0x6c3b96f3, 0x1711ebec), TOBN(0x2da40f1f, 0xce98fdc4), TOBN(0xb99774d3, 0x57b4411f), TOBN(0x87c8bdf4, 0x15b65bb6)}, {TOBN(0xda3a89e3, 0xc2eef12d), TOBN(0xde95bb9b, 0x3c7471f3), TOBN(0x600f225b, 0xd812c594), TOBN(0x54907c5d, 0x2b75a56b)}}, {{TOBN(0xa93cc5f0, 0x8db60e35), TOBN(0x743e3cd6, 0xfa833319), TOBN(0x7dad5c41, 0xf81683c9), TOBN(0x70c1e7d9, 0x9c34107e)}, {TOBN(0x0edc4a39, 0xa6be0907), TOBN(0x36d47035, 0x86d0b7d3), TOBN(0x8c76da03, 0x272bfa60), TOBN(0x0b4a07ea, 0x0f08a414)}}, {{TOBN(0x699e4d29, 0x45c1dd53), TOBN(0xcadc5898, 0x231debb5), TOBN(0xdf49fcc7, 0xa77f00e0), TOBN(0x93057bbf, 0xa73e5a0e)}, {TOBN(0x2f8b7ecd, 0x027a4cd1), TOBN(0x114734b3, 0xc614011a), TOBN(0xe7a01db7, 0x67677c68), TOBN(0x89d9be5e, 0x7e273f4f)}}, {{TOBN(0xd225cb2e, 0x089808ef), TOBN(0xf1f7a27d, 0xd59e4107), TOBN(0x53afc761, 0x8211b9c9), TOBN(0x0361bc67, 0xe6819159)}, {TOBN(0x2a865d0b, 0x7f071426), TOBN(0x6a3c1810, 0xe7072567), TOBN(0x3e3bca1e, 0x0d6bcabd), TOBN(0xa1b02bc1, 0x408591bc)}}, {{TOBN(0xe0deee59, 0x31fba239), 
TOBN(0xf47424d3, 0x98bd91d1), TOBN(0x0f8886f4, 0x071a3c1d), TOBN(0x3f7d41e8, 0xa819233b)}, {TOBN(0x708623c2, 0xcf6eb998), TOBN(0x86bb49af, 0x609a287f), TOBN(0x942bb249, 0x63c90762), TOBN(0x0ef6eea5, 0x55a9654b)}}, {{TOBN(0x5f6d2d72, 0x36f5defe), TOBN(0xfa9922dc, 0x56f99176), TOBN(0x6c8c5ece, 0xf78ce0c7), TOBN(0x7b44589d, 0xbe09b55e)}, {TOBN(0xe11b3bca, 0x9ea83770), TOBN(0xd7fa2c7f, 0x2ab71547), TOBN(0x2a3dd6fa, 0x2a1ddcc0), TOBN(0x09acb430, 0x5a7b7707)}}, {{TOBN(0x4add4a2e, 0x649d4e57), TOBN(0xcd53a2b0, 0x1917526e), TOBN(0xc5262330, 0x20b44ac4), TOBN(0x4028746a, 0xbaa2c31d)}, {TOBN(0x51318390, 0x64291d4c), TOBN(0xbf48f151, 0xee5ad909), TOBN(0xcce57f59, 0x7b185681), TOBN(0x7c3ac1b0, 0x4854d442)}}, {{TOBN(0x65587dc3, 0xc093c171), TOBN(0xae7acb24, 0x24f42b65), TOBN(0x5a338adb, 0x955996cb), TOBN(0xc8e65675, 0x6051f91b)}, {TOBN(0x66711fba, 0x28b8d0b1), TOBN(0x15d74137, 0xb6c10a90), TOBN(0x70cdd7eb, 0x3a232a80), TOBN(0xc9e2f07f, 0x6191ed24)}}, {{TOBN(0xa80d1db6, 0xf79588c0), TOBN(0xfa52fc69, 0xb55768cc), TOBN(0x0b4df1ae, 0x7f54438a), TOBN(0x0cadd1a7, 0xf9b46a4f)}, {TOBN(0xb40ea6b3, 0x1803dd6f), TOBN(0x488e4fa5, 0x55eaae35), TOBN(0x9f047d55, 0x382e4e16), TOBN(0xc9b5b7e0, 0x2f6e0c98)}}, {{TOBN(0x6b1bd2d3, 0x95762649), TOBN(0xa9604ee7, 0xc7aea3f6), TOBN(0x3646ff27, 0x6dc6f896), TOBN(0x9bf0e7f5, 0x2860bad1)}, {TOBN(0x2d92c821, 0x7cb44b92), TOBN(0xa2f5ce63, 0xaea9c182), TOBN(0xd0a2afb1, 0x9154a5fd), TOBN(0x482e474c, 0x95801da6)}}, {{TOBN(0xc19972d0, 0xb611c24b), TOBN(0x1d468e65, 0x60a8f351), TOBN(0xeb758069, 0x7bcf6421), TOBN(0xec9dd0ee, 0x88fbc491)}, {TOBN(0x5b59d2bf, 0x956c2e32), TOBN(0x73dc6864, 0xdcddf94e), TOBN(0xfd5e2321, 0xbcee7665), TOBN(0xa7b4f8ef, 0x5e9a06c4)}}, {{TOBN(0xfba918dd, 0x7280f855), TOBN(0xbbaac260, 0x8baec688), TOBN(0xa3b3f00f, 0x33400f42), TOBN(0x3d2dba29, 0x66f2e6e4)}, {TOBN(0xb6f71a94, 0x98509375), TOBN(0x8f33031f, 0xcea423cc), TOBN(0x009b8dd0, 0x4807e6fb), TOBN(0x5163cfe5, 0x5cdb954c)}}, {{TOBN(0x03cc8f17, 0xcf41c6e8), TOBN(0xf1f03c2a, 0x037b925c), TOBN(0xc39c19cc, 0x66d2427c), TOBN(0x823d24ba, 0x7b6c18e4)}, {TOBN(0x32ef9013, 0x901f0b4f), TOBN(0x684360f1, 0xf8941c2e), TOBN(0x0ebaff52, 0x2c28092e), TOBN(0x7891e4e3, 0x256c932f)}}, {{TOBN(0x51264319, 0xac445e3d), TOBN(0x553432e7, 0x8ea74381), TOBN(0xe6eeaa69, 0x67e9c50a), TOBN(0x27ced284, 0x62e628c7)}, {TOBN(0x3f96d375, 0x7a4afa57), TOBN(0xde0a14c3, 0xe484c150), TOBN(0x364a24eb, 0x38bd9923), TOBN(0x1df18da0, 0xe5177422)}}, {{TOBN(0x174e8f82, 0xd8d38a9b), TOBN(0x2e97c600, 0xe7de1391), TOBN(0xc5709850, 0xa1c175dd), TOBN(0x969041a0, 0x32ae5035)}, {TOBN(0xcbfd533b, 0x76a2086b), TOBN(0xd6bba71b, 0xd7c2e8fe), TOBN(0xb2d58ee6, 0x099dfb67), TOBN(0x3a8b342d, 0x064a85d9)}}, {{TOBN(0x3bc07649, 0x522f9be3), TOBN(0x690c075b, 0xdf1f49a8), TOBN(0x80e1aee8, 0x3854ec42), TOBN(0x2a7dbf44, 0x17689dc7)}, {TOBN(0xc004fc0e, 0x3faf4078), TOBN(0xb2f02e9e, 0xdf11862c), TOBN(0xf10a5e0f, 0xa0a1b7b3), TOBN(0x30aca623, 0x8936ec80)}}, {{TOBN(0xf83cbf05, 0x02f40d9a), TOBN(0x4681c468, 0x2c318a4d), TOBN(0x98575618, 0x0e9c2674), TOBN(0xbe79d046, 0x1847092e)}, {TOBN(0xaf1e480a, 0x78bd01e0), TOBN(0x6dd359e4, 0x72a51db9), TOBN(0x62ce3821, 0xe3afbab6), TOBN(0xc5cee5b6, 0x17733199)}}, {{TOBN(0xe08b30d4, 0x6ffd9fbb), TOBN(0x6e5bc699, 0x36c610b7), TOBN(0xf343cff2, 0x9ce262cf), TOBN(0xca2e4e35, 0x68b914c1)}, {TOBN(0x011d64c0, 0x16de36c5), TOBN(0xe0b10fdd, 0x42e2b829), TOBN(0x78942981, 0x6685aaf8), TOBN(0xe7511708, 0x230ede97)}}, {{TOBN(0x671ed8fc, 0x3b922bf8), TOBN(0xe4d8c0a0, 0x4c29b133), TOBN(0x87eb1239, 0x3b6e99c4), TOBN(0xaff3974c, 0x8793beba)}, {TOBN(0x03749405, 
0x2c18df9b), TOBN(0xc5c3a293, 0x91007139), TOBN(0x6a77234f, 0xe37a0b95), TOBN(0x02c29a21, 0xb661c96b)}}, {{TOBN(0xc3aaf1d6, 0x141ecf61), TOBN(0x9195509e, 0x3bb22f53), TOBN(0x29597404, 0x22d51357), TOBN(0x1b083822, 0x537bed60)}, {TOBN(0xcd7d6e35, 0xe07289f0), TOBN(0x1f94c48c, 0x6dd86eff), TOBN(0xc8bb1f82, 0xeb0f9cfa), TOBN(0x9ee0b7e6, 0x1b2eb97d)}}, {{TOBN(0x5a52fe2e, 0x34d74e31), TOBN(0xa352c310, 0x3bf79ab6), TOBN(0x97ff6c5a, 0xabfeeb8f), TOBN(0xbfbe8fef, 0xf5c97305)}, {TOBN(0xd6081ce6, 0xa7904608), TOBN(0x1f812f3a, 0xc4fca249), TOBN(0x9b24bc9a, 0xb9e5e200), TOBN(0x91022c67, 0x38012ee8)}}, {{TOBN(0xe83d9c5d, 0x30a713a1), TOBN(0x4876e3f0, 0x84ef0f93), TOBN(0xc9777029, 0xc1fbf928), TOBN(0xef7a6bb3, 0xbce7d2a4)}, {TOBN(0xb8067228, 0xdfa2a659), TOBN(0xd5cd3398, 0xd877a48f), TOBN(0xbea4fd8f, 0x025d0f3f), TOBN(0xd67d2e35, 0x2eae7c2b)}}, {{TOBN(0x184de7d7, 0xcc5f4394), TOBN(0xb5551b5c, 0x4536e142), TOBN(0x2e89b212, 0xd34aa60a), TOBN(0x14a96fea, 0xf50051d5)}, {TOBN(0x4e21ef74, 0x0d12bb0b), TOBN(0xc522f020, 0x60b9677e), TOBN(0x8b12e467, 0x2df7731d), TOBN(0x39f80382, 0x7b326d31)}}, {{TOBN(0xdfb8630c, 0x39024a94), TOBN(0xaacb96a8, 0x97319452), TOBN(0xd68a3961, 0xeda3867c), TOBN(0x0c58e2b0, 0x77c4ffca)}, {TOBN(0x3d545d63, 0x4da919fa), TOBN(0xef79b69a, 0xf15e2289), TOBN(0x54bc3d3d, 0x808bab10), TOBN(0xc8ab3007, 0x45f82c37)}}, {{TOBN(0xc12738b6, 0x7c4a658a), TOBN(0xb3c47639, 0x40e72182), TOBN(0x3b77be46, 0x8798e44f), TOBN(0xdc047df2, 0x17a7f85f)}, {TOBN(0x2439d4c5, 0x5e59d92d), TOBN(0xcedca475, 0xe8e64d8d), TOBN(0xa724cd0d, 0x87ca9b16), TOBN(0x35e4fd59, 0xa5540dfe)}}, {{TOBN(0xf8c1ff18, 0xe4bcf6b1), TOBN(0x856d6285, 0x295018fa), TOBN(0x433f665c, 0x3263c949), TOBN(0xa6a76dd6, 0xa1f21409)}, {TOBN(0x17d32334, 0xcc7b4f79), TOBN(0xa1d03122, 0x06720e4a), TOBN(0xadb6661d, 0x81d9bed5), TOBN(0xf0d6fb02, 0x11db15d1)}}, {{TOBN(0x7fd11ad5, 0x1fb747d2), TOBN(0xab50f959, 0x3033762b), TOBN(0x2a7e711b, 0xfbefaf5a), TOBN(0xc7393278, 0x3fef2bbf)}, {TOBN(0xe29fa244, 0x0df6f9be), TOBN(0x9092757b, 0x71efd215), TOBN(0xee60e311, 0x4f3d6fd9), TOBN(0x338542d4, 0x0acfb78b)}}, {{TOBN(0x44a23f08, 0x38961a0f), TOBN(0x1426eade, 0x986987ca), TOBN(0x36e6ee2e, 0x4a863cc6), TOBN(0x48059420, 0x628b8b79)}, {TOBN(0x30303ad8, 0x7396e1de), TOBN(0x5c8bdc48, 0x38c5aad1), TOBN(0x3e40e11f, 0x5c8f5066), TOBN(0xabd6e768, 0x8d246bbd)}}, {{TOBN(0x68aa40bb, 0x23330a01), TOBN(0xd23f5ee4, 0xc34eafa0), TOBN(0x3bbee315, 0x5de02c21), TOBN(0x18dd4397, 0xd1d8dd06)}, {TOBN(0x3ba1939a, 0x122d7b44), TOBN(0xe6d3b40a, 0xa33870d6), TOBN(0x8e620f70, 0x1c4fe3f8), TOBN(0xf6bba1a5, 0xd3a50cbf)}}, {{TOBN(0x4a78bde5, 0xcfc0aee0), TOBN(0x847edc46, 0xc08c50bd), TOBN(0xbaa2439c, 0xad63c9b2), TOBN(0xceb4a728, 0x10fc2acb)}, {TOBN(0xa419e40e, 0x26da033d), TOBN(0x6cc3889d, 0x03e02683), TOBN(0x1cd28559, 0xfdccf725), TOBN(0x0fd7e0f1, 0x8d13d208)}}, {{TOBN(0x01b9733b, 0x1f0df9d4), TOBN(0x8cc2c5f3, 0xa2b5e4f3), TOBN(0x43053bfa, 0x3a304fd4), TOBN(0x8e87665c, 0x0a9f1aa7)}, {TOBN(0x087f29ec, 0xd73dc965), TOBN(0x15ace455, 0x3e9023db), TOBN(0x2370e309, 0x2bce28b4), TOBN(0xf9723442, 0xb6b1e84a)}}, {{TOBN(0xbeee662e, 0xb72d9f26), TOBN(0xb19396de, 0xf0e47109), TOBN(0x85b1fa73, 0xe13289d0), TOBN(0x436cf77e, 0x54e58e32)}, {TOBN(0x0ec833b3, 0xe990ef77), TOBN(0x7373e3ed, 0x1b11fc25), TOBN(0xbe0eda87, 0x0fc332ce), TOBN(0xced04970, 0x8d7ea856)}}, {{TOBN(0xf85ff785, 0x7e977ca0), TOBN(0xb66ee8da, 0xdfdd5d2b), TOBN(0xf5e37950, 0x905af461), TOBN(0x587b9090, 0x966d487c)}, {TOBN(0x6a198a1b, 0x32ba0127), TOBN(0xa7720e07, 0x141615ac), TOBN(0xa23f3499, 0x996ef2f2), TOBN(0xef5f64b4, 0x470bcb3d)}}, 
{{TOBN(0xa526a962, 0x92b8c559), TOBN(0x0c14aac0, 0x69740a0f), TOBN(0x0d41a9e3, 0xa6bdc0a5), TOBN(0x97d52106, 0x9c48aef4)}, {TOBN(0xcf16bd30, 0x3e7c253b), TOBN(0xcc834b1a, 0x47fdedc1), TOBN(0x7362c6e5, 0x373aab2e), TOBN(0x264ed85e, 0xc5f590ff)}}, {{TOBN(0x7a46d9c0, 0x66d41870), TOBN(0xa50c20b1, 0x4787ba09), TOBN(0x185e7e51, 0xe3d44635), TOBN(0xb3b3e080, 0x31e2d8dc)}, {TOBN(0xbed1e558, 0xa179e9d9), TOBN(0x2daa3f79, 0x74a76781), TOBN(0x4372baf2, 0x3a40864f), TOBN(0x46900c54, 0x4fe75cb5)}}, {{TOBN(0xb95f171e, 0xf76765d0), TOBN(0x4ad726d2, 0x95c87502), TOBN(0x2ec769da, 0x4d7c99bd), TOBN(0x5e2ddd19, 0xc36cdfa8)}, {TOBN(0xc22117fc, 0xa93e6dea), TOBN(0xe8a2583b, 0x93771123), TOBN(0xbe2f6089, 0xfa08a3a2), TOBN(0x4809d5ed, 0x8f0e1112)}}, {{TOBN(0x3b414aa3, 0xda7a095e), TOBN(0x9049acf1, 0x26f5aadd), TOBN(0x78d46a4d, 0x6be8b84a), TOBN(0xd66b1963, 0xb732b9b3)}, {TOBN(0x5c2ac2a0, 0xde6e9555), TOBN(0xcf52d098, 0xb5bd8770), TOBN(0x15a15fa6, 0x0fd28921), TOBN(0x56ccb81e, 0x8b27536d)}}, {{TOBN(0x0f0d8ab8, 0x9f4ccbb8), TOBN(0xed5f44d2, 0xdb221729), TOBN(0x43141988, 0x00bed10c), TOBN(0xc94348a4, 0x1d735b8b)}, {TOBN(0x79f3e9c4, 0x29ef8479), TOBN(0x4c13a4e3, 0x614c693f), TOBN(0x32c9af56, 0x8e143a14), TOBN(0xbc517799, 0xe29ac5c4)}}, {{TOBN(0x05e17992, 0x2774856f), TOBN(0x6e52fb05, 0x6c1bf55f), TOBN(0xaeda4225, 0xe4f19e16), TOBN(0x70f4728a, 0xaf5ccb26)}, {TOBN(0x5d2118d1, 0xb2947f22), TOBN(0xc827ea16, 0x281d6fb9), TOBN(0x8412328d, 0x8cf0eabd), TOBN(0x45ee9fb2, 0x03ef9dcf)}}, {{TOBN(0x8e700421, 0xbb937d63), TOBN(0xdf8ff2d5, 0xcc4b37a6), TOBN(0xa4c0d5b2, 0x5ced7b68), TOBN(0x6537c1ef, 0xc7308f59)}, {TOBN(0x25ce6a26, 0x3b37f8e8), TOBN(0x170e9a9b, 0xdeebc6ce), TOBN(0xdd037952, 0x8728d72c), TOBN(0x445b0e55, 0x850154bc)}}, {{TOBN(0x4b7d0e06, 0x83a7337b), TOBN(0x1e3416d4, 0xffecf249), TOBN(0x24840eff, 0x66a2b71f), TOBN(0xd0d9a50a, 0xb37cc26d)}, {TOBN(0xe2198150, 0x6fe28ef7), TOBN(0x3cc5ef16, 0x23324c7f), TOBN(0x220f3455, 0x769b5263), TOBN(0xe2ade2f1, 0xa10bf475)}}, {{TOBN(0x28cd20fa, 0x458d3671), TOBN(0x1549722c, 0x2dc4847b), TOBN(0x6dd01e55, 0x591941e3), TOBN(0x0e6fbcea, 0x27128ccb)}, {TOBN(0xae1a1e6b, 0x3bef0262), TOBN(0xfa8c472c, 0x8f54e103), TOBN(0x7539c0a8, 0x72c052ec), TOBN(0xd7b27369, 0x5a3490e9)}}, {{TOBN(0x143fe1f1, 0x71684349), TOBN(0x36b4722e, 0x32e19b97), TOBN(0xdc059227, 0x90980aff), TOBN(0x175c9c88, 0x9e13d674)}, {TOBN(0xa7de5b22, 0x6e6bfdb1), TOBN(0x5ea5b7b2, 0xbedb4b46), TOBN(0xd5570191, 0xd34a6e44), TOBN(0xfcf60d2e, 0xa24ff7e6)}}, {{TOBN(0x614a392d, 0x677819e1), TOBN(0x7be74c7e, 0xaa5a29e8), TOBN(0xab50fece, 0x63c85f3f), TOBN(0xaca2e2a9, 0x46cab337)}, {TOBN(0x7f700388, 0x122a6fe3), TOBN(0xdb69f703, 0x882a04a8), TOBN(0x9a77935d, 0xcf7aed57), TOBN(0xdf16207c, 0x8d91c86f)}}, {{TOBN(0x2fca49ab, 0x63ed9998), TOBN(0xa3125c44, 0xa77ddf96), TOBN(0x05dd8a86, 0x24344072), TOBN(0xa023dda2, 0xfec3fb56)}, {TOBN(0x421b41fc, 0x0c743032), TOBN(0x4f2120c1, 0x5e438639), TOBN(0xfb7cae51, 0xc83c1b07), TOBN(0xb2370caa, 0xcac2171a)}}, {{TOBN(0x2eb2d962, 0x6cc820fb), TOBN(0x59feee5c, 0xb85a44bf), TOBN(0x94620fca, 0x5b6598f0), TOBN(0x6b922cae, 0x7e314051)}, {TOBN(0xff8745ad, 0x106bed4e), TOBN(0x546e71f5, 0xdfa1e9ab), TOBN(0x935c1e48, 0x1ec29487), TOBN(0x9509216c, 0x4d936530)}}, {{TOBN(0xc7ca3067, 0x85c9a2db), TOBN(0xd6ae5152, 0x6be8606f), TOBN(0x09dbcae6, 0xe14c651d), TOBN(0xc9536e23, 0x9bc32f96)}, {TOBN(0xa90535a9, 0x34521b03), TOBN(0xf39c526c, 0x878756ff), TOBN(0x383172ec, 0x8aedf03c), TOBN(0x20a8075e, 0xefe0c034)}}, {{TOBN(0xf22f9c62, 0x64026422), TOBN(0x8dd10780, 0x24b9d076), TOBN(0x944c742a, 0x3bef2950), TOBN(0x55b9502e, 
0x88a2b00b)}, {TOBN(0xa59e14b4, 0x86a09817), TOBN(0xa39dd3ac, 0x47bb4071), TOBN(0x55137f66, 0x3be0592f), TOBN(0x07fcafd4, 0xc9e63f5b)}}, {{TOBN(0x963652ee, 0x346eb226), TOBN(0x7dfab085, 0xec2facb7), TOBN(0x273bf2b8, 0x691add26), TOBN(0x30d74540, 0xf2b46c44)}, {TOBN(0x05e8e73e, 0xf2c2d065), TOBN(0xff9b8a00, 0xd42eeac9), TOBN(0x2fcbd205, 0x97209d22), TOBN(0xeb740ffa, 0xde14ea2c)}}, {{TOBN(0xc71ff913, 0xa8aef518), TOBN(0x7bfc74bb, 0xfff4cfa2), TOBN(0x1716680c, 0xb6b36048), TOBN(0x121b2cce, 0x9ef79af1)}, {TOBN(0xbff3c836, 0xa01eb3d3), TOBN(0x50eb1c6a, 0x5f79077b), TOBN(0xa48c32d6, 0xa004bbcf), TOBN(0x47a59316, 0x7d64f61d)}}, {{TOBN(0x6068147f, 0x93102016), TOBN(0x12c5f654, 0x94d12576), TOBN(0xefb071a7, 0xc9bc6b91), TOBN(0x7c2da0c5, 0x6e23ea95)}, {TOBN(0xf4fd45b6, 0xd4a1dd5d), TOBN(0x3e7ad9b6, 0x9122b13c), TOBN(0x342ca118, 0xe6f57a48), TOBN(0x1c2e94a7, 0x06f8288f)}}, {{TOBN(0x99e68f07, 0x5a97d231), TOBN(0x7c80de97, 0x4d838758), TOBN(0xbce0f5d0, 0x05872727), TOBN(0xbe5d95c2, 0x19c4d016)}, {TOBN(0x921d5cb1, 0x9c2492ee), TOBN(0x42192dc1, 0x404d6fb3), TOBN(0x4c84dcd1, 0x32f988d3), TOBN(0xde26d61f, 0xa17b8e85)}}, {{TOBN(0xc466dcb6, 0x137c7408), TOBN(0x9a38d7b6, 0x36a266da), TOBN(0x7ef5cb06, 0x83bebf1b), TOBN(0xe5cdcbbf, 0x0fd014e3)}, {TOBN(0x30aa376d, 0xf65965a0), TOBN(0x60fe88c2, 0xebb3e95e), TOBN(0x33fd0b61, 0x66ee6f20), TOBN(0x8827dcdb, 0x3f41f0a0)}}, {{TOBN(0xbf8a9d24, 0x0c56c690), TOBN(0x40265dad, 0xddb7641d), TOBN(0x522b05bf, 0x3a6b662b), TOBN(0x466d1dfe, 0xb1478c9b)}, {TOBN(0xaa616962, 0x1484469b), TOBN(0x0db60549, 0x02df8f9f), TOBN(0xc37bca02, 0x3cb8bf51), TOBN(0x5effe346, 0x21371ce8)}}, {{TOBN(0xe8f65264, 0xff112c32), TOBN(0x8a9c736d, 0x7b971fb2), TOBN(0xa4f19470, 0x7b75080d), TOBN(0xfc3f2c5a, 0x8839c59b)}, {TOBN(0x1d6c777e, 0x5aeb49c2), TOBN(0xf3db034d, 0xda1addfe), TOBN(0xd76fee5a, 0x5535affc), TOBN(0x0853ac70, 0xb92251fd)}}, {{TOBN(0x37e3d594, 0x8b2a29d5), TOBN(0x28f1f457, 0x4de00ddb), TOBN(0x8083c1b5, 0xf42c328b), TOBN(0xd8ef1d8f, 0xe493c73b)}, {TOBN(0x96fb6260, 0x41dc61bd), TOBN(0xf74e8a9d, 0x27ee2f8a), TOBN(0x7c605a80, 0x2c946a5d), TOBN(0xeed48d65, 0x3839ccfd)}}, {{TOBN(0x9894344f, 0x3a29467a), TOBN(0xde81e949, 0xc51eba6d), TOBN(0xdaea066b, 0xa5e5c2f2), TOBN(0x3fc8a614, 0x08c8c7b3)}, {TOBN(0x7adff88f, 0x06d0de9f), TOBN(0xbbc11cf5, 0x3b75ce0a), TOBN(0x9fbb7acc, 0xfbbc87d5), TOBN(0xa1458e26, 0x7badfde2)}}}, {{{TOBN(0x1cb43668, 0xe039c256), TOBN(0x5f26fb8b, 0x7c17fd5d), TOBN(0xeee426af, 0x79aa062b), TOBN(0x072002d0, 0xd78fbf04)}, {TOBN(0x4c9ca237, 0xe84fb7e3), TOBN(0xb401d8a1, 0x0c82133d), TOBN(0xaaa52592, 0x6d7e4181), TOBN(0xe9430833, 0x73dbb152)}}, {{TOBN(0xf92dda31, 0xbe24319a), TOBN(0x03f7d28b, 0xe095a8e7), TOBN(0xa52fe840, 0x98782185), TOBN(0x276ddafe, 0x29c24dbc)}, {TOBN(0x80cd5496, 0x1d7a64eb), TOBN(0xe4360889, 0x7f1dbe42), TOBN(0x2f81a877, 0x8438d2d5), TOBN(0x7e4d52a8, 0x85169036)}}, {{TOBN(0x19e3d5b1, 0x1d59715d), TOBN(0xc7eaa762, 0xd788983e), TOBN(0xe5a730b0, 0xabf1f248), TOBN(0xfbab8084, 0xfae3fd83)}, {TOBN(0x65e50d21, 0x53765b2f), TOBN(0xbdd4e083, 0xfa127f3d), TOBN(0x9cf3c074, 0x397b1b10), TOBN(0x59f8090c, 0xb1b59fd3)}}, {{TOBN(0x7b15fd9d, 0x615faa8f), TOBN(0x8fa1eb40, 0x968554ed), TOBN(0x7bb4447e, 0x7aa44882), TOBN(0x2bb2d0d1, 0x029fff32)}, {TOBN(0x075e2a64, 0x6caa6d2f), TOBN(0x8eb879de, 0x22e7351b), TOBN(0xbcd5624e, 0x9a506c62), TOBN(0x218eaef0, 0xa87e24dc)}}, {{TOBN(0x37e56847, 0x44ddfa35), TOBN(0x9ccfc5c5, 0xdab3f747), TOBN(0x9ac1df3f, 0x1ee96cf4), TOBN(0x0c0571a1, 0x3b480b8f)}, {TOBN(0x2fbeb3d5, 0x4b3a7b3c), TOBN(0x35c03669, 0x5dcdbb99), TOBN(0x52a0f5dc, 0xb2415b3a), 
TOBN(0xd57759b4, 0x4413ed9a)}}, {{TOBN(0x1fe647d8, 0x3d30a2c5), TOBN(0x0857f77e, 0xf78a81dc), TOBN(0x11d5a334, 0x131a4a9b), TOBN(0xc0a94af9, 0x29d393f5)}, {TOBN(0xbc3a5c0b, 0xdaa6ec1a), TOBN(0xba9fe493, 0x88d2d7ed), TOBN(0xbb4335b4, 0xbb614797), TOBN(0x991c4d68, 0x72f83533)}}, {{TOBN(0x53258c28, 0xd2f01cb3), TOBN(0x93d6eaa3, 0xd75db0b1), TOBN(0x419a2b0d, 0xe87d0db4), TOBN(0xa1e48f03, 0xd8fe8493)}, {TOBN(0xf747faf6, 0xc508b23a), TOBN(0xf137571a, 0x35d53549), TOBN(0x9f5e58e2, 0xfcf9b838), TOBN(0xc7186cee, 0xa7fd3cf5)}}, {{TOBN(0x77b868ce, 0xe978a1d3), TOBN(0xe3a68b33, 0x7ab92d04), TOBN(0x51029794, 0x87a5b862), TOBN(0x5f0606c3, 0x3a61d41d)}, {TOBN(0x2814be27, 0x6f9326f1), TOBN(0x2f521c14, 0xc6fe3c2e), TOBN(0x17464d7d, 0xacdf7351), TOBN(0x10f5f9d3, 0x777f7e44)}}, {{TOBN(0xce8e616b, 0x269fb37d), TOBN(0xaaf73804, 0x7de62de5), TOBN(0xaba11175, 0x4fdd4153), TOBN(0x515759ba, 0x3770b49b)}, {TOBN(0x8b09ebf8, 0xaa423a61), TOBN(0x592245a1, 0xcd41fb92), TOBN(0x1cba8ec1, 0x9b4c8936), TOBN(0xa87e91e3, 0xaf36710e)}}, {{TOBN(0x1fd84ce4, 0x3d34a2e3), TOBN(0xee3759ce, 0xb43b5d61), TOBN(0x895bc78c, 0x619186c7), TOBN(0xf19c3809, 0xcbb9725a)}, {TOBN(0xc0be21aa, 0xde744b1f), TOBN(0xa7d222b0, 0x60f8056b), TOBN(0x74be6157, 0xb23efe11), TOBN(0x6fab2b4f, 0x0cd68253)}}, {{TOBN(0xad33ea5f, 0x4bf1d725), TOBN(0x9c1d8ee2, 0x4f6c950f), TOBN(0x544ee78a, 0xa377af06), TOBN(0x54f489bb, 0x94a113e1)}, {TOBN(0x8f11d634, 0x992fb7e8), TOBN(0x0169a7aa, 0xa2a44347), TOBN(0x1d49d4af, 0x95020e00), TOBN(0x95945722, 0xe08e120b)}}, {{TOBN(0xb6e33878, 0xa4d32282), TOBN(0xe36e029d, 0x48020ae7), TOBN(0xe05847fb, 0x37a9b750), TOBN(0xf876812c, 0xb29e3819)}, {TOBN(0x84ad138e, 0xd23a17f0), TOBN(0x6d7b4480, 0xf0b3950e), TOBN(0xdfa8aef4, 0x2fd67ae0), TOBN(0x8d3eea24, 0x52333af6)}}, {{TOBN(0x0d052075, 0xb15d5acc), TOBN(0xc6d9c79f, 0xbd815bc4), TOBN(0x8dcafd88, 0xdfa36cf2), TOBN(0x908ccbe2, 0x38aa9070)}, {TOBN(0x638722c4, 0xba35afce), TOBN(0x5a3da8b0, 0xfd6abf0b), TOBN(0x2dce252c, 0xc9c335c1), TOBN(0x84e7f0de, 0x65aa799b)}}, {{TOBN(0x2101a522, 0xb99a72cb), TOBN(0x06de6e67, 0x87618016), TOBN(0x5ff8c7cd, 0xe6f3653e), TOBN(0x0a821ab5, 0xc7a6754a)}, {TOBN(0x7e3fa52b, 0x7cb0b5a2), TOBN(0xa7fb121c, 0xc9048790), TOBN(0x1a725020, 0x06ce053a), TOBN(0xb490a31f, 0x04e929b0)}}, {{TOBN(0xe17be47d, 0x62dd61ad), TOBN(0x781a961c, 0x6be01371), TOBN(0x1063bfd3, 0xdae3cbba), TOBN(0x35647406, 0x7f73c9ba)}, {TOBN(0xf50e957b, 0x2736a129), TOBN(0xa6313702, 0xed13f256), TOBN(0x9436ee65, 0x3a19fcc5), TOBN(0xcf2bdb29, 0xe7a4c8b6)}}, {{TOBN(0xb06b1244, 0xc5f95cd8), TOBN(0xda8c8af0, 0xf4ab95f4), TOBN(0x1bae59c2, 0xb9e5836d), TOBN(0x07d51e7e, 0x3acffffc)}, {TOBN(0x01e15e6a, 0xc2ccbcda), TOBN(0x3bc1923f, 0x8528c3e0), TOBN(0x43324577, 0xa49fead4), TOBN(0x61a1b884, 0x2aa7a711)}}, {{TOBN(0xf9a86e08, 0x700230ef), TOBN(0x0af585a1, 0xbd19adf8), TOBN(0x7645f361, 0xf55ad8f2), TOBN(0x6e676223, 0x46c3614c)}, {TOBN(0x23cb257c, 0x4e774d3f), TOBN(0x82a38513, 0xac102d1b), TOBN(0x9bcddd88, 0x7b126aa5), TOBN(0xe716998b, 0xeefd3ee4)}}, {{TOBN(0x4239d571, 0xfb167583), TOBN(0xdd011c78, 0xd16c8f8a), TOBN(0x271c2895, 0x69a27519), TOBN(0x9ce0a3b7, 0xd2d64b6a)}, {TOBN(0x8c977289, 0xd5ec6738), TOBN(0xa3b49f9a, 0x8840ef6b), TOBN(0x808c14c9, 0x9a453419), TOBN(0x5c00295b, 0x0cf0a2d5)}}, {{TOBN(0x524414fb, 0x1d4bcc76), TOBN(0xb07691d2, 0x459a88f1), TOBN(0x77f43263, 0xf70d110f), TOBN(0x64ada5e0, 0xb7abf9f3)}, {TOBN(0xafd0f94e, 0x5b544cf5), TOBN(0xb4a13a15, 0xfd2713fe), TOBN(0xb99b7d6e, 0x250c74f4), TOBN(0x097f2f73, 0x20324e45)}}, {{TOBN(0x994b37d8, 0xaffa8208), TOBN(0xc3c31b0b, 0xdc29aafc), TOBN(0x3da74651, 
0x7a3a607f), TOBN(0xd8e1b8c1, 0xfe6955d6)}, {TOBN(0x716e1815, 0xc8418682), TOBN(0x541d487f, 0x7dc91d97), TOBN(0x48a04669, 0xc6996982), TOBN(0xf39cab15, 0x83a6502e)}}, {{TOBN(0x025801a0, 0xe68db055), TOBN(0xf3569758, 0xba3338d5), TOBN(0xb0c8c0aa, 0xee2afa84), TOBN(0x4f6985d3, 0xfb6562d1)}, {TOBN(0x351f1f15, 0x132ed17a), TOBN(0x510ed0b4, 0xc04365fe), TOBN(0xa3f98138, 0xe5b1f066), TOBN(0xbc9d95d6, 0x32df03dc)}}, {{TOBN(0xa83ccf6e, 0x19abd09e), TOBN(0x0b4097c1, 0x4ff17edb), TOBN(0x58a5c478, 0xd64a06ce), TOBN(0x2ddcc3fd, 0x544a58fd)}, {TOBN(0xd449503d, 0x9e8153b8), TOBN(0x3324fd02, 0x7774179b), TOBN(0xaf5d47c8, 0xdbd9120c), TOBN(0xeb860162, 0x34fa94db)}}, {{TOBN(0x5817bdd1, 0x972f07f4), TOBN(0xe5579e2e, 0xd27bbceb), TOBN(0x86847a1f, 0x5f11e5a6), TOBN(0xb39ed255, 0x7c3cf048)}, {TOBN(0xe1076417, 0xa2f62e55), TOBN(0x6b9ab38f, 0x1bcf82a2), TOBN(0x4bb7c319, 0x7aeb29f9), TOBN(0xf6d17da3, 0x17227a46)}}, {{TOBN(0xab53ddbd, 0x0f968c00), TOBN(0xa03da7ec, 0x000c880b), TOBN(0x7b239624, 0x6a9ad24d), TOBN(0x612c0401, 0x01ec60d0)}, {TOBN(0x70d10493, 0x109f5df1), TOBN(0xfbda4030, 0x80af7550), TOBN(0x30b93f95, 0xc6b9a9b3), TOBN(0x0c74ec71, 0x007d9418)}}, {{TOBN(0x94175564, 0x6edb951f), TOBN(0x5f4a9d78, 0x7f22c282), TOBN(0xb7870895, 0xb38d1196), TOBN(0xbc593df3, 0xa228ce7c)}, {TOBN(0xc78c5bd4, 0x6af3641a), TOBN(0x7802200b, 0x3d9b3dcc), TOBN(0x0dc73f32, 0x8be33304), TOBN(0x847ed87d, 0x61ffb79a)}}, {{TOBN(0xf85c974e, 0x6d671192), TOBN(0x1e14100a, 0xde16f60f), TOBN(0x45cb0d5a, 0x95c38797), TOBN(0x18923bba, 0x9b022da4)}, {TOBN(0xef2be899, 0xbbe7e86e), TOBN(0x4a1510ee, 0x216067bf), TOBN(0xd98c8154, 0x84d5ce3e), TOBN(0x1af777f0, 0xf92a2b90)}}, {{TOBN(0x9fbcb400, 0x4ef65724), TOBN(0x3e04a4c9, 0x3c0ca6fe), TOBN(0xfb3e2cb5, 0x55002994), TOBN(0x1f3a93c5, 0x5363ecab)}, {TOBN(0x1fe00efe, 0x3923555b), TOBN(0x744bedd9, 0x1e1751ea), TOBN(0x3fb2db59, 0x6ab69357), TOBN(0x8dbd7365, 0xf5e6618b)}}, {{TOBN(0x99d53099, 0xdf1ea40e), TOBN(0xb3f24a0b, 0x57d61e64), TOBN(0xd088a198, 0x596eb812), TOBN(0x22c8361b, 0x5762940b)}, {TOBN(0x66f01f97, 0xf9c0d95c), TOBN(0x88461172, 0x8e43cdae), TOBN(0x11599a7f, 0xb72b15c3), TOBN(0x135a7536, 0x420d95cc)}}, {{TOBN(0x2dcdf0f7, 0x5f7ae2f6), TOBN(0x15fc6e1d, 0xd7fa6da2), TOBN(0x81ca829a, 0xd1d441b6), TOBN(0x84c10cf8, 0x04a106b6)}, {TOBN(0xa9b26c95, 0xa73fbbd0), TOBN(0x7f24e0cb, 0x4d8f6ee8), TOBN(0x48b45937, 0x1e25a043), TOBN(0xf8a74fca, 0x036f3dfe)}}, {{TOBN(0x1ed46585, 0xc9f84296), TOBN(0x7fbaa8fb, 0x3bc278b0), TOBN(0xa8e96cd4, 0x6c4fcbd0), TOBN(0x940a1202, 0x73b60a5f)}, {TOBN(0x34aae120, 0x55a4aec8), TOBN(0x550e9a74, 0xdbd742f0), TOBN(0x794456d7, 0x228c68ab), TOBN(0x492f8868, 0xa4e25ec6)}}, {{TOBN(0x682915ad, 0xb2d8f398), TOBN(0xf13b51cc, 0x5b84c953), TOBN(0xcda90ab8, 0x5bb917d6), TOBN(0x4b615560, 0x4ea3dee1)}, {TOBN(0x578b4e85, 0x0a52c1c8), TOBN(0xeab1a695, 0x20b75fc4), TOBN(0x60c14f3c, 0xaa0bb3c6), TOBN(0x220f448a, 0xb8216094)}}, {{TOBN(0x4fe7ee31, 0xb0e63d34), TOBN(0xf4600572, 0xa9e54fab), TOBN(0xc0493334, 0xd5e7b5a4), TOBN(0x8589fb92, 0x06d54831)}, {TOBN(0xaa70f5cc, 0x6583553a), TOBN(0x0879094a, 0xe25649e5), TOBN(0xcc904507, 0x10044652), TOBN(0xebb0696d, 0x02541c4f)}}, {{TOBN(0x5a171fde, 0xb9718710), TOBN(0x38f1bed8, 0xf374a9f5), TOBN(0xc8c582e1, 0xba39bdc1), TOBN(0xfc457b0a, 0x908cc0ce)}, {TOBN(0x9a187fd4, 0x883841e2), TOBN(0x8ec25b39, 0x38725381), TOBN(0x2553ed05, 0x96f84395), TOBN(0x095c7661, 0x6f6c6897)}}, {{TOBN(0x917ac85c, 0x4bdc5610), TOBN(0xb2885fe4, 0x179eb301), TOBN(0x5fc65547, 0x8b78bdcc), TOBN(0x4a9fc893, 0xe59e4699)}, {TOBN(0xbb7ff0cd, 0x3ce299af), TOBN(0x195be9b3, 0xadf38b20), 
TOBN(0x6a929c87, 0xd38ddb8f), TOBN(0x55fcc99c, 0xb21a51b9)}}, {{TOBN(0x2b695b4c, 0x721a4593), TOBN(0xed1e9a15, 0x768eaac2), TOBN(0xfb63d71c, 0x7489f914), TOBN(0xf98ba31c, 0x78118910)}, {TOBN(0x80291373, 0x9b128eb4), TOBN(0x7801214e, 0xd448af4a), TOBN(0xdbd2e22b, 0x55418dd3), TOBN(0xeffb3c0d, 0xd3998242)}}, {{TOBN(0xdfa6077c, 0xc7bf3827), TOBN(0xf2165bcb, 0x47f8238f), TOBN(0xfe37cf68, 0x8564d554), TOBN(0xe5f825c4, 0x0a81fb98)}, {TOBN(0x43cc4f67, 0xffed4d6f), TOBN(0xbc609578, 0xb50a34b0), TOBN(0x8aa8fcf9, 0x5041faf1), TOBN(0x5659f053, 0x651773b6)}}, {{TOBN(0xe87582c3, 0x6044d63b), TOBN(0xa6089409, 0x0cdb0ca0), TOBN(0x8c993e0f, 0xbfb2bcf6), TOBN(0xfc64a719, 0x45985cfc)}, {TOBN(0x15c4da80, 0x83dbedba), TOBN(0x804ae112, 0x2be67df7), TOBN(0xda4c9658, 0xa23defde), TOBN(0x12002ddd, 0x5156e0d3)}}, {{TOBN(0xe68eae89, 0x5dd21b96), TOBN(0x8b99f28b, 0xcf44624d), TOBN(0x0ae00808, 0x1ec8897a), TOBN(0xdd0a9303, 0x6712f76e)}, {TOBN(0x96237522, 0x4e233de4), TOBN(0x192445b1, 0x2b36a8a5), TOBN(0xabf9ff74, 0x023993d9), TOBN(0x21f37bf4, 0x2aad4a8f)}}, {{TOBN(0x340a4349, 0xf8bd2bbd), TOBN(0x1d902cd9, 0x4868195d), TOBN(0x3d27bbf1, 0xe5fdb6f1), TOBN(0x7a5ab088, 0x124f9f1c)}, {TOBN(0xc466ab06, 0xf7a09e03), TOBN(0x2f8a1977, 0x31f2c123), TOBN(0xda355dc7, 0x041b6657), TOBN(0xcb840d12, 0x8ece2a7c)}}, {{TOBN(0xb600ad9f, 0x7db32675), TOBN(0x78fea133, 0x07a06f1b), TOBN(0x5d032269, 0xb31f6094), TOBN(0x07753ef5, 0x83ec37aa)}, {TOBN(0x03485aed, 0x9c0bea78), TOBN(0x41bb3989, 0xbc3f4524), TOBN(0x09403761, 0x697f726d), TOBN(0x6109beb3, 0xdf394820)}}, {{TOBN(0x804111ea, 0x3b6d1145), TOBN(0xb6271ea9, 0xa8582654), TOBN(0x619615e6, 0x24e66562), TOBN(0xa2554945, 0xd7b6ad9c)}, {TOBN(0xd9c4985e, 0x99bfe35f), TOBN(0x9770ccc0, 0x7b51cdf6), TOBN(0x7c327013, 0x92881832), TOBN(0x8777d45f, 0x286b26d1)}}, {{TOBN(0x9bbeda22, 0xd847999d), TOBN(0x03aa33b6, 0xc3525d32), TOBN(0x4b7b96d4, 0x28a959a1), TOBN(0xbb3786e5, 0x31e5d234)}, {TOBN(0xaeb5d3ce, 0x6961f247), TOBN(0x20aa85af, 0x02f93d3f), TOBN(0x9cd1ad3d, 0xd7a7ae4f), TOBN(0xbf6688f0, 0x781adaa8)}}, {{TOBN(0xb1b40e86, 0x7469cead), TOBN(0x1904c524, 0x309fca48), TOBN(0x9b7312af, 0x4b54bbc7), TOBN(0xbe24bf8f, 0x593affa2)}, {TOBN(0xbe5e0790, 0xbd98764b), TOBN(0xa0f45f17, 0xa26e299e), TOBN(0x4af0d2c2, 0x6b8fe4c7), TOBN(0xef170db1, 0x8ae8a3e6)}}, {{TOBN(0x0e8d61a0, 0x29e0ccc1), TOBN(0xcd53e87e, 0x60ad36ca), TOBN(0x328c6623, 0xc8173822), TOBN(0x7ee1767d, 0xa496be55)}, {TOBN(0x89f13259, 0x648945af), TOBN(0x9e45a5fd, 0x25c8009c), TOBN(0xaf2febd9, 0x1f61ab8c), TOBN(0x43f6bc86, 0x8a275385)}}, {{TOBN(0x87792348, 0xf2142e79), TOBN(0x17d89259, 0xc6e6238a), TOBN(0x7536d2f6, 0x4a839d9b), TOBN(0x1f428fce, 0x76a1fbdc)}, {TOBN(0x1c109601, 0x0db06dfe), TOBN(0xbfc16bc1, 0x50a3a3cc), TOBN(0xf9cbd9ec, 0x9b30f41b), TOBN(0x5b5da0d6, 0x00138cce)}}, {{TOBN(0xec1d0a48, 0x56ef96a7), TOBN(0xb47eb848, 0x982bf842), TOBN(0x66deae32, 0xec3f700d), TOBN(0x4e43c42c, 0xaa1181e0)}, {TOBN(0xa1d72a31, 0xd1a4aa2a), TOBN(0x440d4668, 0xc004f3ce), TOBN(0x0d6a2d3b, 0x45fe8a7a), TOBN(0x820e52e2, 0xfb128365)}}, {{TOBN(0x29ac5fcf, 0x25e51b09), TOBN(0x180cd2bf, 0x2023d159), TOBN(0xa9892171, 0xa1ebf90e), TOBN(0xf97c4c87, 0x7c132181)}, {TOBN(0x9f1dc724, 0xc03dbb7e), TOBN(0xae043765, 0x018cbbe4), TOBN(0xfb0b2a36, 0x0767d153), TOBN(0xa8e2f4d6, 0x249cbaeb)}}, {{TOBN(0x172a5247, 0xd95ea168), TOBN(0x1758fada, 0x2970764a), TOBN(0xac803a51, 0x1d978169), TOBN(0x299cfe2e, 0xde77e01b)}, {TOBN(0x652a1e17, 0xb0a98927), TOBN(0x2e26e1d1, 0x20014495), TOBN(0x7ae0af9f, 0x7175b56a), TOBN(0xc2e22a80, 0xd64b9f95)}}, {{TOBN(0x4d0ff9fb, 0xd90a060a), TOBN(0x496a27db, 
0xbaf38085), TOBN(0x32305401, 0xda776bcf), TOBN(0xb8cdcef6, 0x725f209e)}, {TOBN(0x61ba0f37, 0x436a0bba), TOBN(0x263fa108, 0x76860049), TOBN(0x92beb98e, 0xda3542cf), TOBN(0xa2d4d14a, 0xd5849538)}}, {{TOBN(0x989b9d68, 0x12e9a1bc), TOBN(0x61d9075c, 0x5f6e3268), TOBN(0x352c6aa9, 0x99ace638), TOBN(0xde4e4a55, 0x920f43ff)}, {TOBN(0xe5e4144a, 0xd673c017), TOBN(0x667417ae, 0x6f6e05ea), TOBN(0x613416ae, 0xdcd1bd56), TOBN(0x5eb36201, 0x86693711)}}, {{TOBN(0x2d7bc504, 0x3a1aa914), TOBN(0x175a1299, 0x76dc5975), TOBN(0xe900e0f2, 0x3fc8125c), TOBN(0x569ef68c, 0x11198875)}, {TOBN(0x9012db63, 0x63a113b4), TOBN(0xe3bd3f56, 0x98835766), TOBN(0xa5c94a52, 0x76412dea), TOBN(0xad9e2a09, 0xaa735e5c)}}, {{TOBN(0x405a984c, 0x508b65e9), TOBN(0xbde4a1d1, 0x6df1a0d1), TOBN(0x1a9433a1, 0xdfba80da), TOBN(0xe9192ff9, 0x9440ad2e)}, {TOBN(0x9f649696, 0x5099fe92), TOBN(0x25ddb65c, 0x0b27a54a), TOBN(0x178279dd, 0xc590da61), TOBN(0x5479a999, 0xfbde681a)}}, {{TOBN(0xd0e84e05, 0x013fe162), TOBN(0xbe11dc92, 0x632d471b), TOBN(0xdf0b0c45, 0xfc0e089f), TOBN(0x04fb15b0, 0x4c144025)}, {TOBN(0xa61d5fc2, 0x13c99927), TOBN(0xa033e9e0, 0x3de2eb35), TOBN(0xf8185d5c, 0xb8dacbb4), TOBN(0x9a88e265, 0x8644549d)}}, {{TOBN(0xf717af62, 0x54671ff6), TOBN(0x4bd4241b, 0x5fa58603), TOBN(0x06fba40b, 0xe67773c0), TOBN(0xc1d933d2, 0x6a2847e9)}, {TOBN(0xf4f5acf3, 0x689e2c70), TOBN(0x92aab0e7, 0x46bafd31), TOBN(0x798d76aa, 0x3473f6e5), TOBN(0xcc6641db, 0x93141934)}}, {{TOBN(0xcae27757, 0xd31e535e), TOBN(0x04cc43b6, 0x87c2ee11), TOBN(0x8d1f9675, 0x2e029ffa), TOBN(0xc2150672, 0xe4cc7a2c)}, {TOBN(0x3b03c1e0, 0x8d68b013), TOBN(0xa9d6816f, 0xedf298f3), TOBN(0x1bfbb529, 0xa2804464), TOBN(0x95a52fae, 0x5db22125)}}, {{TOBN(0x55b32160, 0x0e1cb64e), TOBN(0x004828f6, 0x7e7fc9fe), TOBN(0x13394b82, 0x1bb0fb93), TOBN(0xb6293a2d, 0x35f1a920)}, {TOBN(0xde35ef21, 0xd145d2d9), TOBN(0xbe6225b3, 0xbb8fa603), TOBN(0x00fc8f6b, 0x32cf252d), TOBN(0xa28e52e6, 0x117cf8c2)}}, {{TOBN(0x9d1dc89b, 0x4c371e6d), TOBN(0xcebe0675, 0x36ef0f28), TOBN(0x5de05d09, 0xa4292f81), TOBN(0xa8303593, 0x353e3083)}, {TOBN(0xa1715b0a, 0x7e37a9bb), TOBN(0x8c56f61e, 0x2b8faec3), TOBN(0x52507431, 0x33c9b102), TOBN(0x0130cefc, 0xa44431f0)}}, {{TOBN(0x56039fa0, 0xbd865cfb), TOBN(0x4b03e578, 0xbc5f1dd7), TOBN(0x40edf2e4, 0xbabe7224), TOBN(0xc752496d, 0x3a1988f6)}, {TOBN(0xd1572d3b, 0x564beb6b), TOBN(0x0db1d110, 0x39a1c608), TOBN(0x568d1934, 0x16f60126), TOBN(0x05ae9668, 0xf354af33)}}, {{TOBN(0x19de6d37, 0xc92544f2), TOBN(0xcc084353, 0xa35837d5), TOBN(0xcbb6869c, 0x1a514ece), TOBN(0xb633e728, 0x2e1d1066)}, {TOBN(0xf15dd69f, 0x936c581c), TOBN(0x96e7b8ce, 0x7439c4f9), TOBN(0x5e676f48, 0x2e448a5b), TOBN(0xb2ca7d5b, 0xfd916bbb)}}, {{TOBN(0xd55a2541, 0xf5024025), TOBN(0x47bc5769, 0xe4c2d937), TOBN(0x7d31b92a, 0x0362189f), TOBN(0x83f3086e, 0xef7816f9)}, {TOBN(0xf9f46d94, 0xb587579a), TOBN(0xec2d22d8, 0x30e76c5f), TOBN(0x27d57461, 0xb000ffcf), TOBN(0xbb7e65f9, 0x364ffc2c)}}, {{TOBN(0x7c7c9477, 0x6652a220), TOBN(0x61618f89, 0xd696c981), TOBN(0x5021701d, 0x89effff3), TOBN(0xf2c8ff8e, 0x7c314163)}, {TOBN(0x2da413ad, 0x8efb4d3e), TOBN(0x937b5adf, 0xce176d95), TOBN(0x22867d34, 0x2a67d51c), TOBN(0x262b9b10, 0x18eb3ac9)}}, {{TOBN(0x4e314fe4, 0xc43ff28b), TOBN(0x76476627, 0x6a664e7a), TOBN(0x3e90e40b, 0xb7a565c2), TOBN(0x8588993a, 0xc1acf831)}, {TOBN(0xd7b501d6, 0x8f938829), TOBN(0x996627ee, 0x3edd7d4c), TOBN(0x37d44a62, 0x90cd34c7), TOBN(0xa8327499, 0xf3833e8d)}}, {{TOBN(0x2e18917d, 0x4bf50353), TOBN(0x85dd726b, 0x556765fb), TOBN(0x54fe65d6, 0x93d5ab66), TOBN(0x3ddbaced, 0x915c25fe)}, {TOBN(0xa799d9a4, 0x12f22e85), 
TOBN(0xe2a24867, 0x6d06f6bc), TOBN(0xf4f1ee56, 0x43ca1637), TOBN(0xfda2828b, 0x61ece30a)}}, {{TOBN(0x758c1a3e, 0xa2dee7a6), TOBN(0xdcde2f3c, 0x734b2284), TOBN(0xaba445d2, 0x4eaba6ad), TOBN(0x35aaf668, 0x76cee0a7)}, {TOBN(0x7e0b04a9, 0xe5aa049a), TOBN(0xe74083ad, 0x91103e84), TOBN(0xbeb183ce, 0x40afecc3), TOBN(0x6b89de9f, 0xea043f7a)}}}, {{{TOBN(0x0e299d23, 0xfe67ba66), TOBN(0x91450760, 0x93cf2f34), TOBN(0xf45b5ea9, 0x97fcf913), TOBN(0x5be00843, 0x8bd7ddda)}, {TOBN(0x358c3e05, 0xd53ff04d), TOBN(0xbf7ccdc3, 0x5de91ef7), TOBN(0xad684dbf, 0xb69ec1a0), TOBN(0x367e7cf2, 0x801fd997)}}, {{TOBN(0x0ca1f3b7, 0xb0dc8595), TOBN(0x27de4608, 0x9f1d9f2e), TOBN(0x1af3bf39, 0xbadd82a7), TOBN(0x79356a79, 0x65862448)}, {TOBN(0xc0602345, 0xf5f9a052), TOBN(0x1a8b0f89, 0x139a42f9), TOBN(0xb53eee42, 0x844d40fc), TOBN(0x93b0bfe5, 0x4e5b6368)}}, {{TOBN(0x5434dd02, 0xc024789c), TOBN(0x90dca9ea, 0x41b57bfc), TOBN(0x8aa898e2, 0x243398df), TOBN(0xf607c834, 0x894a94bb)}, {TOBN(0xbb07be97, 0xc2c99b76), TOBN(0x6576ba67, 0x18c29302), TOBN(0x3d79efcc, 0xe703a88c), TOBN(0xf259ced7, 0xb6a0d106)}}, {{TOBN(0x0f893a5d, 0xc8de610b), TOBN(0xe8c515fb, 0x67e223ce), TOBN(0x7774bfa6, 0x4ead6dc5), TOBN(0x89d20f95, 0x925c728f)}, {TOBN(0x7a1e0966, 0x098583ce), TOBN(0xa2eedb94, 0x93f2a7d7), TOBN(0x1b282097, 0x4c304d4a), TOBN(0x0842e3da, 0xc077282d)}}, {{TOBN(0xe4d972a3, 0x3b9e2d7b), TOBN(0x7cc60b27, 0xc48218ff), TOBN(0x8fc70838, 0x84149d91), TOBN(0x5c04346f, 0x2f461ecc)}, {TOBN(0xebe9fdf2, 0x614650a9), TOBN(0x5e35b537, 0xc1f666ac), TOBN(0x645613d1, 0x88babc83), TOBN(0x88cace3a, 0xc5e1c93e)}}, {{TOBN(0x209ca375, 0x3de92e23), TOBN(0xccb03cc8, 0x5fbbb6e3), TOBN(0xccb90f03, 0xd7b1487e), TOBN(0xfa9c2a38, 0xc710941f)}, {TOBN(0x756c3823, 0x6724ceed), TOBN(0x3a902258, 0x192d0323), TOBN(0xb150e519, 0xea5e038e), TOBN(0xdcba2865, 0xc7427591)}}, {{TOBN(0xe549237f, 0x78890732), TOBN(0xc443bef9, 0x53fcb4d9), TOBN(0x9884d8a6, 0xeb3480d6), TOBN(0x8a35b6a1, 0x3048b186)}, {TOBN(0xb4e44716, 0x65e9a90a), TOBN(0x45bf380d, 0x653006c0), TOBN(0x8f3f820d, 0x4fe9ae3b), TOBN(0x244a35a0, 0x979a3b71)}}, {{TOBN(0xa1010e9d, 0x74cd06ff), TOBN(0x9c17c7df, 0xaca3eeac), TOBN(0x74c86cd3, 0x8063aa2b), TOBN(0x8595c4b3, 0x734614ff)}, {TOBN(0xa3de00ca, 0x990f62cc), TOBN(0xd9bed213, 0xca0c3be5), TOBN(0x7886078a, 0xdf8ce9f5), TOBN(0xddb27ce3, 0x5cd44444)}}, {{TOBN(0xed374a66, 0x58926ddd), TOBN(0x138b2d49, 0x908015b8), TOBN(0x886c6579, 0xde1f7ab8), TOBN(0x888b9aa0, 0xc3020b7a)}, {TOBN(0xd3ec034e, 0x3a96e355), TOBN(0xba65b0b8, 0xf30fbe9a), TOBN(0x064c8e50, 0xff21367a), TOBN(0x1f508ea4, 0x0b04b46e)}}, {{TOBN(0x98561a49, 0x747c866c), TOBN(0xbbb1e5fe, 0x0518a062), TOBN(0x20ff4e8b, 0xecdc3608), TOBN(0x7f55cded, 0x20184027)}, {TOBN(0x8d73ec95, 0xf38c85f0), TOBN(0x5b589fdf, 0x8bc3b8c3), TOBN(0xbe95dd98, 0x0f12b66f), TOBN(0xf5bd1a09, 0x0e338e01)}}, {{TOBN(0x65163ae5, 0x5e915918), TOBN(0x6158d6d9, 0x86f8a46b), TOBN(0x8466b538, 0xeeebf99c), TOBN(0xca8761f6, 0xbca477ef)}, {TOBN(0xaf3449c2, 0x9ebbc601), TOBN(0xef3b0f41, 0xe0c3ae2f), TOBN(0xaa6c577d, 0x5de63752), TOBN(0xe9166601, 0x64682a51)}}, {{TOBN(0x5a3097be, 0xfc15aa1e), TOBN(0x40d12548, 0xb54b0745), TOBN(0x5bad4706, 0x519a5f12), TOBN(0xed03f717, 0xa439dee6)}, {TOBN(0x0794bb6c, 0x4a02c499), TOBN(0xf725083d, 0xcffe71d2), TOBN(0x2cad7519, 0x0f3adcaf), TOBN(0x7f68ea1c, 0x43729310)}}, {{TOBN(0xe747c8c7, 0xb7ffd977), TOBN(0xec104c35, 0x80761a22), TOBN(0x8395ebaf, 0x5a3ffb83), TOBN(0xfb3261f4, 0xe4b63db7)}, {TOBN(0x53544960, 0xd883e544), TOBN(0x13520d70, 0x8cc2eeb8), TOBN(0x08f6337b, 0xd3d65f99), TOBN(0x83997db2, 0x781cf95b)}}, 
{{TOBN(0xce6ff106, 0x0dbd2c01), TOBN(0x4f8eea6b, 0x1f9ce934), TOBN(0x546f7c4b, 0x0e993921), TOBN(0x6236a324, 0x5e753fc7)}, {TOBN(0x65a41f84, 0xa16022e9), TOBN(0x0c18d878, 0x43d1dbb2), TOBN(0x73c55640, 0x2d4cef9c), TOBN(0xa0428108, 0x70444c74)}}, {{TOBN(0x68e4f15e, 0x9afdfb3c), TOBN(0x49a56143, 0x5bdfb6df), TOBN(0xa9bc1bd4, 0x5f823d97), TOBN(0xbceb5970, 0xea111c2a)}, {TOBN(0x366b455f, 0xb269bbc4), TOBN(0x7cd85e1e, 0xe9bc5d62), TOBN(0xc743c41c, 0x4f18b086), TOBN(0xa4b40990, 0x95294fb9)}}, {{TOBN(0x9c7c581d, 0x26ee8382), TOBN(0xcf17dcc5, 0x359d638e), TOBN(0xee8273ab, 0xb728ae3d), TOBN(0x1d112926, 0xf821f047)}, {TOBN(0x11498477, 0x50491a74), TOBN(0x687fa761, 0xfde0dfb9), TOBN(0x2c258022, 0x7ea435ab), TOBN(0x6b8bdb94, 0x91ce7e3f)}}, {{TOBN(0x4c5b5dc9, 0x3bf834aa), TOBN(0x04371819, 0x4f6c7e4b), TOBN(0xc284e00a, 0x3736bcad), TOBN(0x0d881118, 0x21ae8f8d)}, {TOBN(0xf9cf0f82, 0xf48c8e33), TOBN(0xa11fd075, 0xa1bf40db), TOBN(0xdceab0de, 0xdc2733e5), TOBN(0xc560a8b5, 0x8e986bd7)}}, {{TOBN(0x48dd1fe2, 0x3929d097), TOBN(0x3885b290, 0x92f188f1), TOBN(0x0f2ae613, 0xda6fcdac), TOBN(0x9054303e, 0xb662a46c)}, {TOBN(0xb6871e44, 0x0738042a), TOBN(0x98e6a977, 0xbdaf6449), TOBN(0xd8bc0650, 0xd1c9df1b), TOBN(0xef3d6451, 0x36e098f9)}}, {{TOBN(0x03fbae82, 0xb6d72d28), TOBN(0x77ca9db1, 0xf5d84080), TOBN(0x8a112cff, 0xa58efc1c), TOBN(0x518d761c, 0xc564cb4a)}, {TOBN(0x69b5740e, 0xf0d1b5ce), TOBN(0x717039cc, 0xe9eb1785), TOBN(0x3fe29f90, 0x22f53382), TOBN(0x8e54ba56, 0x6bc7c95c)}}, {{TOBN(0x9c806d8a, 0xf7f91d0f), TOBN(0x3b61b0f1, 0xa82a5728), TOBN(0x4640032d, 0x94d76754), TOBN(0x273eb5de, 0x47d834c6)}, {TOBN(0x2988abf7, 0x7b4e4d53), TOBN(0xb7ce66bf, 0xde401777), TOBN(0x9fba6b32, 0x715071b3), TOBN(0x82413c24, 0xad3a1a98)}}, {{TOBN(0x5b7fc8c4, 0xe0e8ad93), TOBN(0xb5679aee, 0x5fab868d), TOBN(0xb1f9d2fa, 0x2b3946f3), TOBN(0x458897dc, 0x5685b50a)}, {TOBN(0x1e98c930, 0x89d0caf3), TOBN(0x39564c5f, 0x78642e92), TOBN(0x1b77729a, 0x0dbdaf18), TOBN(0xf9170722, 0x579e82e6)}}, {{TOBN(0x680c0317, 0xe4515fa5), TOBN(0xf85cff84, 0xfb0c790f), TOBN(0xc7a82aab, 0x6d2e0765), TOBN(0x7446bca9, 0x35c82b32)}, {TOBN(0x5de607aa, 0x6d63184f), TOBN(0x7c1a46a8, 0x262803a6), TOBN(0xd218313d, 0xaebe8035), TOBN(0x92113ffd, 0xc73c51f8)}}, {{TOBN(0x4b38e083, 0x12e7e46c), TOBN(0x69d0a37a, 0x56126bd5), TOBN(0xfb3f324b, 0x73c07e04), TOBN(0xa0c22f67, 0x8fda7267)}, {TOBN(0x8f2c0051, 0x4d2c7d8f), TOBN(0xbc45ced3, 0xcbe2cae5), TOBN(0xe1c6cf07, 0xa8f0f277), TOBN(0xbc392312, 0x1eb99a98)}}, {{TOBN(0x75537b7e, 0x3cc8ac85), TOBN(0x8d725f57, 0xdd02753b), TOBN(0xfd05ff64, 0xb737df2f), TOBN(0x55fe8712, 0xf6d2531d)}, {TOBN(0x57ce04a9, 0x6ab6b01c), TOBN(0x69a02a89, 0x7cd93724), TOBN(0x4f82ac35, 0xcf86699b), TOBN(0x8242d3ad, 0x9cb4b232)}}, {{TOBN(0x713d0f65, 0xd62105e5), TOBN(0xbb222bfa, 0x2d29be61), TOBN(0xf2f9a79e, 0x6cfbef09), TOBN(0xfc24d8d3, 0xd5d6782f)}, {TOBN(0x5db77085, 0xd4129967), TOBN(0xdb81c3cc, 0xdc3c2a43), TOBN(0x9d655fc0, 0x05d8d9a3), TOBN(0x3f5d057a, 0x54298026)}}, {{TOBN(0x1157f56d, 0x88c54694), TOBN(0xb26baba5, 0x9b09573e), TOBN(0x2cab03b0, 0x22adffd1), TOBN(0x60a412c8, 0xdd69f383)}, {TOBN(0xed76e98b, 0x54b25039), TOBN(0xd4ee67d3, 0x687e714d), TOBN(0x87739648, 0x7b00b594), TOBN(0xce419775, 0xc9ef709b)}}, {{TOBN(0x40f76f85, 0x1c203a40), TOBN(0x30d352d6, 0xeafd8f91), TOBN(0xaf196d3d, 0x95578dd2), TOBN(0xea4bb3d7, 0x77cc3f3d)}, {TOBN(0x42a5bd03, 0xb98e782b), TOBN(0xac958c40, 0x0624920d), TOBN(0xb838134c, 0xfc56fcc8), TOBN(0x86ec4ccf, 0x89572e5e)}}, {{TOBN(0x69c43526, 0x9be47be0), TOBN(0x323b7dd8, 0xcb28fea1), TOBN(0xfa5538ba, 0x3a6c67e5), TOBN(0xef921d70, 
0x1d378e46)}, {TOBN(0xf92961fc, 0x3c4b880e), TOBN(0x3f6f914e, 0x98940a67), TOBN(0xa990eb0a, 0xfef0ff39), TOBN(0xa6c2920f, 0xf0eeff9c)}}, {{TOBN(0xca804166, 0x51b8d9a3), TOBN(0x42531bc9, 0x0ffb0db1), TOBN(0x72ce4718, 0xaa82e7ce), TOBN(0x6e199913, 0xdf574741)}, {TOBN(0xd5f1b13d, 0xd5d36946), TOBN(0x8255dc65, 0xf68f0194), TOBN(0xdc9df4cd, 0x8710d230), TOBN(0x3453c20f, 0x138c1988)}}, {{TOBN(0x9af98dc0, 0x89a6ef01), TOBN(0x4dbcc3f0, 0x9857df85), TOBN(0x34805601, 0x5c1ad924), TOBN(0x40448da5, 0xd0493046)}, {TOBN(0xf629926d, 0x4ee343e2), TOBN(0x6343f1bd, 0x90e8a301), TOBN(0xefc93491, 0x40815b3f), TOBN(0xf882a423, 0xde8f66fb)}}, {{TOBN(0x3a12d5f4, 0xe7db9f57), TOBN(0x7dfba38a, 0x3c384c27), TOBN(0x7a904bfd, 0x6fc660b1), TOBN(0xeb6c5db3, 0x2773b21c)}, {TOBN(0xc350ee66, 0x1cdfe049), TOBN(0x9baac0ce, 0x44540f29), TOBN(0xbc57b6ab, 0xa5ec6aad), TOBN(0x167ce8c3, 0x0a7c1baa)}}, {{TOBN(0xb23a03a5, 0x53fb2b56), TOBN(0x6ce141e7, 0x4e057f78), TOBN(0x796525c3, 0x89e490d9), TOBN(0x0bc95725, 0xa31a7e75)}, {TOBN(0x1ec56791, 0x1220fd06), TOBN(0x716e3a3c, 0x408b0bd6), TOBN(0x31cd6bf7, 0xe8ebeba9), TOBN(0xa7326ca6, 0xbee6b670)}}, {{TOBN(0x3d9f851c, 0xcd090c43), TOBN(0x561e8f13, 0xf12c3988), TOBN(0x50490b6a, 0x904b7be4), TOBN(0x61690ce1, 0x0410737b)}, {TOBN(0x299e9a37, 0x0f009052), TOBN(0x258758f0, 0xf026092e), TOBN(0x9fa255f3, 0xfdfcdc0f), TOBN(0xdbc9fb1f, 0xc0e1bcd2)}}, {{TOBN(0x35f9dd6e, 0x24651840), TOBN(0xdca45a84, 0xa5c59abc), TOBN(0x103d396f, 0xecca4938), TOBN(0x4532da0a, 0xb97b3f29)}, {TOBN(0xc4135ea5, 0x1999a6bf), TOBN(0x3aa9505a, 0x5e6bf2ee), TOBN(0xf77cef06, 0x3f5be093), TOBN(0x97d1a0f8, 0xa943152e)}}, {{TOBN(0x2cb0ebba, 0x2e1c21dd), TOBN(0xf41b29fc, 0x2c6797c4), TOBN(0xc6e17321, 0xb300101f), TOBN(0x4422b0e9, 0xd0d79a89)}, {TOBN(0x49e4901c, 0x92f1bfc4), TOBN(0x06ab1f8f, 0xe1e10ed9), TOBN(0x84d35577, 0xdb2926b8), TOBN(0xca349d39, 0x356e8ec2)}}, {{TOBN(0x70b63d32, 0x343bf1a9), TOBN(0x8fd3bd28, 0x37d1a6b1), TOBN(0x0454879c, 0x316865b4), TOBN(0xee959ff6, 0xc458efa2)}, {TOBN(0x0461dcf8, 0x9706dc3f), TOBN(0x737db0e2, 0x164e4b2e), TOBN(0x09262680, 0x2f8843c8), TOBN(0x54498bbc, 0x7745e6f6)}}, {{TOBN(0x359473fa, 0xa29e24af), TOBN(0xfcc3c454, 0x70aa87a1), TOBN(0xfd2c4bf5, 0x00573ace), TOBN(0xb65b514e, 0x28dd1965)}, {TOBN(0xe46ae7cf, 0x2193e393), TOBN(0x60e9a4e1, 0xf5444d97), TOBN(0xe7594e96, 0x00ff38ed), TOBN(0x43d84d2f, 0x0a0e0f02)}}, {{TOBN(0x8b6db141, 0xee398a21), TOBN(0xb88a56ae, 0xe3bcc5be), TOBN(0x0a1aa52f, 0x373460ea), TOBN(0x20da1a56, 0x160bb19b)}, {TOBN(0xfb54999d, 0x65bf0384), TOBN(0x71a14d24, 0x5d5a180e), TOBN(0xbc44db7b, 0x21737b04), TOBN(0xd84fcb18, 0x01dd8e92)}}, {{TOBN(0x80de937b, 0xfa44b479), TOBN(0x53505499, 0x5c98fd4f), TOBN(0x1edb12ab, 0x28f08727), TOBN(0x4c58b582, 0xa5f3ef53)}, {TOBN(0xbfb236d8, 0x8327f246), TOBN(0xc3a3bfaa, 0x4d7df320), TOBN(0xecd96c59, 0xb96024f2), TOBN(0xfc293a53, 0x7f4e0433)}}, {{TOBN(0x5341352b, 0x5acf6e10), TOBN(0xc50343fd, 0xafe652c3), TOBN(0x4af3792d, 0x18577a7f), TOBN(0xe1a4c617, 0xaf16823d)}, {TOBN(0x9b26d0cd, 0x33425d0a), TOBN(0x306399ed, 0x9b7bc47f), TOBN(0x2a792f33, 0x706bb20b), TOBN(0x31219614, 0x98111055)}}, {{TOBN(0x864ec064, 0x87f5d28b), TOBN(0x11392d91, 0x962277fd), TOBN(0xb5aa7942, 0xbb6aed5f), TOBN(0x080094dc, 0x47e799d9)}, {TOBN(0x4afa588c, 0x208ba19b), TOBN(0xd3e7570f, 0x8512f284), TOBN(0xcbae64e6, 0x02f5799a), TOBN(0xdeebe7ef, 0x514b9492)}}, {{TOBN(0x30300f98, 0xe5c298ff), TOBN(0x17f561be, 0x3678361f), TOBN(0xf52ff312, 0x98cb9a16), TOBN(0x6233c3bc, 0x5562d490)}, {TOBN(0x7bfa15a1, 0x92e3a2cb), TOBN(0x961bcfd1, 0xe6365119), TOBN(0x3bdd29bf, 0x2c8c53b1), 
TOBN(0x739704df, 0x822844ba)}}, {{TOBN(0x7dacfb58, 0x7e7b754b), TOBN(0x23360791, 0xa806c9b9), TOBN(0xe7eb88c9, 0x23504452), TOBN(0x2983e996, 0x852c1783)}, {TOBN(0xdd4ae529, 0x958d881d), TOBN(0x026bae03, 0x262c7b3c), TOBN(0x3a6f9193, 0x960b52d1), TOBN(0xd0980f90, 0x92696cfb)}}, {{TOBN(0x4c1f428c, 0xd5f30851), TOBN(0x94dfed27, 0x2a4f6630), TOBN(0x4df53772, 0xfc5d48a4), TOBN(0xdd2d5a2f, 0x933260ce)}, {TOBN(0x574115bd, 0xd44cc7a5), TOBN(0x4ba6b20d, 0xbd12533a), TOBN(0x30e93cb8, 0x243057c9), TOBN(0x794c486a, 0x14de320e)}}, {{TOBN(0xe925d4ce, 0xf21496e4), TOBN(0xf951d198, 0xec696331), TOBN(0x9810e2de, 0x3e8d812f), TOBN(0xd0a47259, 0x389294ab)}, {TOBN(0x513ba2b5, 0x0e3bab66), TOBN(0x462caff5, 0xabad306f), TOBN(0xe2dc6d59, 0xaf04c49e), TOBN(0x1aeb8750, 0xe0b84b0b)}}, {{TOBN(0xc034f12f, 0x2f7d0ca2), TOBN(0x6d2e8128, 0xe06acf2f), TOBN(0x801f4f83, 0x21facc2f), TOBN(0xa1170c03, 0xf40ef607)}, {TOBN(0xfe0a1d4f, 0x7805a99c), TOBN(0xbde56a36, 0xcc26aba5), TOBN(0x5b1629d0, 0x35531f40), TOBN(0xac212c2b, 0x9afa6108)}}, {{TOBN(0x30a06bf3, 0x15697be5), TOBN(0x6f0545dc, 0x2c63c7c1), TOBN(0x5d8cb842, 0x7ccdadaf), TOBN(0xd52e379b, 0xac7015bb)}, {TOBN(0xc4f56147, 0xf462c23e), TOBN(0xd44a4298, 0x46bc24b0), TOBN(0xbc73d23a, 0xe2856d4f), TOBN(0x61cedd8c, 0x0832bcdf)}}, {{TOBN(0x60953556, 0x99f241d7), TOBN(0xee4adbd7, 0x001a349d), TOBN(0x0b35bf6a, 0xaa89e491), TOBN(0x7f0076f4, 0x136f7546)}, {TOBN(0xd19a18ba, 0x9264da3d), TOBN(0x6eb2d2cd, 0x62a7a28b), TOBN(0xcdba941f, 0x8761c971), TOBN(0x1550518b, 0xa3be4a5d)}}, {{TOBN(0xd0e8e2f0, 0x57d0b70c), TOBN(0xeea8612e, 0xcd133ba3), TOBN(0x814670f0, 0x44416aec), TOBN(0x424db6c3, 0x30775061)}, {TOBN(0xd96039d1, 0x16213fd1), TOBN(0xc61e7fa5, 0x18a3478f), TOBN(0xa805bdcc, 0xcb0c5021), TOBN(0xbdd6f3a8, 0x0cc616dd)}}, {{TOBN(0x06009667, 0x5d97f7e2), TOBN(0x31db0fc1, 0xaf0bf4b6), TOBN(0x23680ed4, 0x5491627a), TOBN(0xb99a3c66, 0x7d741fb1)}, {TOBN(0xe9bb5f55, 0x36b1ff92), TOBN(0x29738577, 0x512b388d), TOBN(0xdb8a2ce7, 0x50fcf263), TOBN(0x385346d4, 0x6c4f7b47)}}, {{TOBN(0xbe86c5ef, 0x31631f9e), TOBN(0xbf91da21, 0x03a57a29), TOBN(0xc3b1f796, 0x7b23f821), TOBN(0x0f7d00d2, 0x770db354)}, {TOBN(0x8ffc6c3b, 0xd8fe79da), TOBN(0xcc5e8c40, 0xd525c996), TOBN(0x4640991d, 0xcfff632a), TOBN(0x64d97e8c, 0x67112528)}}, {{TOBN(0xc232d973, 0x02f1cd1e), TOBN(0xce87eacb, 0x1dd212a4), TOBN(0x6e4c8c73, 0xe69802f7), TOBN(0x12ef0290, 0x1fffddbd)}, {TOBN(0x941ec74e, 0x1bcea6e2), TOBN(0xd0b54024, 0x3cb92cbb), TOBN(0x809fb9d4, 0x7e8f9d05), TOBN(0x3bf16159, 0xf2992aae)}}, {{TOBN(0xad40f279, 0xf8a7a838), TOBN(0x11aea631, 0x05615660), TOBN(0xbf52e6f1, 0xa01f6fa1), TOBN(0xef046995, 0x3dc2aec9)}, {TOBN(0x785dbec9, 0xd8080711), TOBN(0xe1aec60a, 0x9fdedf76), TOBN(0xece797b5, 0xfa21c126), TOBN(0xc66e898f, 0x05e52732)}}, {{TOBN(0x39bb69c4, 0x08811fdb), TOBN(0x8bfe1ef8, 0x2fc7f082), TOBN(0xc8e7a393, 0x174f4138), TOBN(0xfba8ad1d, 0xd58d1f98)}, {TOBN(0xbc21d0ce, 0xbfd2fd5b), TOBN(0x0b839a82, 0x6ee60d61), TOBN(0xaacf7658, 0xafd22253), TOBN(0xb526bed8, 0xaae396b3)}}, {{TOBN(0xccc1bbc2, 0x38564464), TOBN(0x9e3ff947, 0x8c45bc73), TOBN(0xcde9bca3, 0x58188a78), TOBN(0x138b8ee0, 0xd73bf8f7)}, {TOBN(0x5c7e234c, 0x4123c489), TOBN(0x66e69368, 0xfa643297), TOBN(0x0629eeee, 0x39a15fa3), TOBN(0x95fab881, 0xa9e2a927)}}, {{TOBN(0xb2497007, 0xeafbb1e1), TOBN(0xd75c9ce6, 0xe75b7a93), TOBN(0x3558352d, 0xefb68d78), TOBN(0xa2f26699, 0x223f6396)}, {TOBN(0xeb911ecf, 0xe469b17a), TOBN(0x62545779, 0xe72d3ec2), TOBN(0x8ea47de7, 0x82cb113f), TOBN(0xebe4b086, 0x4e1fa98d)}}, {{TOBN(0xec2d5ed7, 0x8cdfedb1), TOBN(0xa535c077, 0xfe211a74), TOBN(0x9678109b, 
0x11d244c5), TOBN(0xf17c8bfb, 0xbe299a76)}, {TOBN(0xb651412e, 0xfb11fbc4), TOBN(0xea0b5482, 0x94ab3f65), TOBN(0xd8dffd95, 0x0cf78243), TOBN(0x2e719e57, 0xce0361d4)}}, {{TOBN(0x9007f085, 0x304ddc5b), TOBN(0x095e8c6d, 0x4daba2ea), TOBN(0x5a33cdb4, 0x3f9d28a9), TOBN(0x85b95cd8, 0xe2283003)}, {TOBN(0xbcd6c819, 0xb9744733), TOBN(0x29c5f538, 0xfc7f5783), TOBN(0x6c49b2fa, 0xd59038e4), TOBN(0x68349cc1, 0x3bbe1018)}}, {{TOBN(0xcc490c1d, 0x21830ee5), TOBN(0x36f9c4ee, 0xe9bfa297), TOBN(0x58fd7294, 0x48de1a94), TOBN(0xaadb13a8, 0x4e8f2cdc)}, {TOBN(0x515eaaa0, 0x81313dba), TOBN(0xc76bb468, 0xc2152dd8), TOBN(0x357f8d75, 0xa653dbf8), TOBN(0xe4d8c4d1, 0xb14ac143)}}, {{TOBN(0xbdb8e675, 0xb055cb40), TOBN(0x898f8e7b, 0x977b5167), TOBN(0xecc65651, 0xb82fb863), TOBN(0x56544814, 0x6d88f01f)}, {TOBN(0xb0928e95, 0x263a75a9), TOBN(0xcfb6836f, 0x1a22fcda), TOBN(0x651d14db, 0x3f3bd37c), TOBN(0x1d3837fb, 0xb6ad4664)}}, {{TOBN(0x7c5fb538, 0xff4f94ab), TOBN(0x7243c712, 0x6d7fb8f2), TOBN(0xef13d60c, 0xa85c5287), TOBN(0x18cfb7c7, 0x4bb8dd1b)}, {TOBN(0x82f9bfe6, 0x72908219), TOBN(0x35c4592b, 0x9d5144ab), TOBN(0x52734f37, 0x9cf4b42f), TOBN(0x6bac55e7, 0x8c60ddc4)}}, {{TOBN(0xb5cd811e, 0x94dea0f6), TOBN(0x259ecae4, 0xe18cc1a3), TOBN(0x6a0e836e, 0x15e660f8), TOBN(0x6c639ea6, 0x0e02bff2)}, {TOBN(0x8721b8cb, 0x7e1026fd), TOBN(0x9e73b50b, 0x63261942), TOBN(0xb8c70974, 0x77f01da3), TOBN(0x1839e6a6, 0x8268f57f)}}, {{TOBN(0x571b9415, 0x5150b805), TOBN(0x1892389e, 0xf92c7097), TOBN(0x8d69c18e, 0x4a084b95), TOBN(0x7014c512, 0xbe5b495c)}, {TOBN(0x4780db36, 0x1b07523c), TOBN(0x2f6219ce, 0x2c1c64fa), TOBN(0xc38b81b0, 0x602c105a), TOBN(0xab4f4f20, 0x5dc8e360)}}, {{TOBN(0x20d3c982, 0xcf7d62d2), TOBN(0x1f36e29d, 0x23ba8150), TOBN(0x48ae0bf0, 0x92763f9e), TOBN(0x7a527e6b, 0x1d3a7007)}, {TOBN(0xb4a89097, 0x581a85e3), TOBN(0x1f1a520f, 0xdc158be5), TOBN(0xf98db37d, 0x167d726e), TOBN(0x8802786e, 0x1113e862)}}}, {{{TOBN(0xefb2149e, 0x36f09ab0), TOBN(0x03f163ca, 0x4a10bb5b), TOBN(0xd0297045, 0x06e20998), TOBN(0x56f0af00, 0x1b5a3bab)}, {TOBN(0x7af4cfec, 0x70880e0d), TOBN(0x7332a66f, 0xbe3d913f), TOBN(0x32e6c84a, 0x7eceb4bd), TOBN(0xedc4a79a, 0x9c228f55)}}, {{TOBN(0xc37c7dd0, 0xc55c4496), TOBN(0xa6a96357, 0x25bbabd2), TOBN(0x5b7e63f2, 0xadd7f363), TOBN(0x9dce3782, 0x2e73f1df)}, {TOBN(0xe1e5a16a, 0xb2b91f71), TOBN(0xe4489823, 0x5ba0163c), TOBN(0xf2759c32, 0xf6e515ad), TOBN(0xa5e2f1f8, 0x8615eecf)}}, {{TOBN(0x74519be7, 0xabded551), TOBN(0x03d358b8, 0xc8b74410), TOBN(0x4d00b10b, 0x0e10d9a9), TOBN(0x6392b0b1, 0x28da52b7)}, {TOBN(0x6744a298, 0x0b75c904), TOBN(0xc305b0ae, 0xa8f7f96c), TOBN(0x042e421d, 0x182cf932), TOBN(0xf6fc5d50, 0x9e4636ca)}}, {{TOBN(0x795847c9, 0xd64cc78c), TOBN(0x6c50621b, 0x9b6cb27b), TOBN(0x07099bf8, 0xdf8022ab), TOBN(0x48f862eb, 0xc04eda1d)}, {TOBN(0xd12732ed, 0xe1603c16), TOBN(0x19a80e0f, 0x5c9a9450), TOBN(0xe2257f54, 0xb429b4fc), TOBN(0x66d3b2c6, 0x45460515)}}, {{TOBN(0x6ca4f87e, 0x822e37be), TOBN(0x73f237b4, 0x253bda4e), TOBN(0xf747f3a2, 0x41190aeb), TOBN(0xf06fa36f, 0x804cf284)}, {TOBN(0x0a6bbb6e, 0xfc621c12), TOBN(0x5d624b64, 0x40b80ec6), TOBN(0x4b072425, 0x7ba556f3), TOBN(0x7fa0c354, 0x3e2d20a8)}}, {{TOBN(0xe921fa31, 0xe3229d41), TOBN(0xa929c652, 0x94531bd4), TOBN(0x84156027, 0xa6d38209), TOBN(0xf3d69f73, 0x6bdb97bd)}, {TOBN(0x8906d19a, 0x16833631), TOBN(0x68a34c2e, 0x03d51be3), TOBN(0xcb59583b, 0x0e511cd8), TOBN(0x99ce6bfd, 0xfdc132a8)}}, {{TOBN(0x3facdaaa, 0xffcdb463), TOBN(0x658bbc1a, 0x34a38b08), TOBN(0x12a801f8, 0xf1a9078d), TOBN(0x1567bcf9, 0x6ab855de)}, {TOBN(0xe08498e0, 0x3572359b), TOBN(0xcf0353e5, 0x8659e68b), 
TOBN(0xbb86e9c8, 0x7d23807c), TOBN(0xbc08728d, 0x2198e8a2)}}, {{TOBN(0x8de2b7bc, 0x453cadd6), TOBN(0x203900a7, 0xbc0bc1f8), TOBN(0xbcd86e47, 0xa6abd3af), TOBN(0x911cac12, 0x8502effb)}, {TOBN(0x2d550242, 0xec965469), TOBN(0x0e9f7692, 0x29e0017e), TOBN(0x633f078f, 0x65979885), TOBN(0xfb87d449, 0x4cf751ef)}}, {{TOBN(0xe1790e4b, 0xfc25419a), TOBN(0x36467203, 0x4bff3cfd), TOBN(0xc8db6386, 0x25b6e83f), TOBN(0x6cc69f23, 0x6cad6fd2)}, {TOBN(0x0219e45a, 0x6bc68bb9), TOBN(0xe43d79b6, 0x297f7334), TOBN(0x7d445368, 0x465dc97c), TOBN(0x4b9eea32, 0x2a0b949a)}}, {{TOBN(0x1b96c6ba, 0x6102d021), TOBN(0xeaafac78, 0x2f4461ea), TOBN(0xd4b85c41, 0xc49f19a8), TOBN(0x275c28e4, 0xcf538875)}, {TOBN(0x35451a9d, 0xdd2e54e0), TOBN(0x6991adb5, 0x0605618b), TOBN(0x5b8b4bcd, 0x7b36cd24), TOBN(0x372a4f8c, 0x56f37216)}}, {{TOBN(0xc890bd73, 0xa6a5da60), TOBN(0x6f083da0, 0xdc4c9ff0), TOBN(0xf4e14d94, 0xf0536e57), TOBN(0xf9ee1eda, 0xaaec8243)}, {TOBN(0x571241ec, 0x8bdcf8e7), TOBN(0xa5db8271, 0x0b041e26), TOBN(0x9a0b9a99, 0xe3fff040), TOBN(0xcaaf21dd, 0x7c271202)}}, {{TOBN(0xb4e2b2e1, 0x4f0dd2e8), TOBN(0xe77e7c4f, 0x0a377ac7), TOBN(0x69202c3f, 0x0d7a2198), TOBN(0xf759b7ff, 0x28200eb8)}, {TOBN(0xc87526ed, 0xdcfe314e), TOBN(0xeb84c524, 0x53d5cf99), TOBN(0xb1b52ace, 0x515138b6), TOBN(0x5aa7ff8c, 0x23fca3f4)}}, {{TOBN(0xff0b13c3, 0xb9791a26), TOBN(0x960022da, 0xcdd58b16), TOBN(0xdbd55c92, 0x57aad2de), TOBN(0x3baaaaa3, 0xf30fe619)}, {TOBN(0x9a4b2346, 0x0d881efd), TOBN(0x506416c0, 0x46325e2a), TOBN(0x91381e76, 0x035c18d4), TOBN(0xb3bb68be, 0xf27817b0)}}, {{TOBN(0x15bfb8bf, 0x5116f937), TOBN(0x7c64a586, 0xc1268943), TOBN(0x71e25cc3, 0x8419a2c8), TOBN(0x9fd6b0c4, 0x8335f463)}, {TOBN(0x4bf0ba3c, 0xe8ee0e0e), TOBN(0x6f6fba60, 0x298c21fa), TOBN(0x57d57b39, 0xae66bee0), TOBN(0x292d5130, 0x22672544)}}, {{TOBN(0xf451105d, 0xbab093b3), TOBN(0x012f59b9, 0x02839986), TOBN(0x8a915802, 0x3474a89c), TOBN(0x048c919c, 0x2de03e97)}, {TOBN(0xc476a2b5, 0x91071cd5), TOBN(0x791ed89a, 0x034970a5), TOBN(0x89bd9042, 0xe1b7994b), TOBN(0x8eaf5179, 0xa1057ffd)}}, {{TOBN(0x6066e2a2, 0xd551ee10), TOBN(0x87a8f1d8, 0x727e09a6), TOBN(0x00d08bab, 0x2c01148d), TOBN(0x6da8e4f1, 0x424f33fe)}, {TOBN(0x466d17f0, 0xcf9a4e71), TOBN(0xff502010, 0x3bf5cb19), TOBN(0xdccf97d8, 0xd062ecc0), TOBN(0x80c0d9af, 0x81d80ac4)}}, {{TOBN(0xe87771d8, 0x033f2876), TOBN(0xb0186ec6, 0x7d5cc3db), TOBN(0x58e8bb80, 0x3bc9bc1d), TOBN(0x4d1395cc, 0x6f6ef60e)}, {TOBN(0xa73c62d6, 0x186244a0), TOBN(0x918e5f23, 0x110a5b53), TOBN(0xed4878ca, 0x741b7eab), TOBN(0x3038d71a, 0xdbe03e51)}}, {{TOBN(0x840204b7, 0xa93c3246), TOBN(0x21ab6069, 0xa0b9b4cd), TOBN(0xf5fa6e2b, 0xb1d64218), TOBN(0x1de6ad0e, 0xf3d56191)}, {TOBN(0x570aaa88, 0xff1929c7), TOBN(0xc6df4c6b, 0x640e87b5), TOBN(0xde8a74f2, 0xc65f0ccc), TOBN(0x8b972fd5, 0xe6f6cc01)}}, {{TOBN(0x3fff36b6, 0x0b846531), TOBN(0xba7e45e6, 0x10a5e475), TOBN(0x84a1d10e, 0x4145b6c5), TOBN(0xf1f7f91a, 0x5e046d9d)}, {TOBN(0x0317a692, 0x44de90d7), TOBN(0x951a1d4a, 0xf199c15e), TOBN(0x91f78046, 0xc9d73deb), TOBN(0x74c82828, 0xfab8224f)}}, {{TOBN(0xaa6778fc, 0xe7560b90), TOBN(0xb4073e61, 0xa7e824ce), TOBN(0xff0d693c, 0xd642eba8), TOBN(0x7ce2e57a, 0x5dccef38)}, {TOBN(0x89c2c789, 0x1df1ad46), TOBN(0x83a06922, 0x098346fd), TOBN(0x2d715d72, 0xda2fc177), TOBN(0x7b6dd71d, 0x85b6cf1d)}}, {{TOBN(0xc60a6d0a, 0x73fa9cb0), TOBN(0xedd3992e, 0x328bf5a9), TOBN(0xc380ddd0, 0x832c8c82), TOBN(0xd182d410, 0xa2a0bf50)}, {TOBN(0x7d9d7438, 0xd9a528db), TOBN(0xe8b1a0e9, 0xcaf53994), TOBN(0xddd6e5fe, 0x0e19987c), TOBN(0xacb8df03, 0x190b059d)}}, {{TOBN(0x53703a32, 0x8300129f), TOBN(0x1f637662, 
0x68c43bfd), TOBN(0xbcbd1913, 0x00e54051), TOBN(0x812fcc62, 0x7bf5a8c5)}, {TOBN(0x3f969d5f, 0x29fb85da), TOBN(0x72f4e00a, 0x694759e8), TOBN(0x426b6e52, 0x790726b7), TOBN(0x617bbc87, 0x3bdbb209)}}, {{TOBN(0x511f8bb9, 0x97aee317), TOBN(0x812a4096, 0xe81536a8), TOBN(0x137dfe59, 0x3ac09b9b), TOBN(0x0682238f, 0xba8c9a7a)}, {TOBN(0x7072ead6, 0xaeccb4bd), TOBN(0x6a34e9aa, 0x692ba633), TOBN(0xc82eaec2, 0x6fff9d33), TOBN(0xfb753512, 0x1d4d2b62)}}, {{TOBN(0x1a0445ff, 0x1d7aadab), TOBN(0x65d38260, 0xd5f6a67c), TOBN(0x6e62fb08, 0x91cfb26f), TOBN(0xef1e0fa5, 0x5c7d91d6)}, {TOBN(0x47e7c7ba, 0x33db72cd), TOBN(0x017cbc09, 0xfa7c74b2), TOBN(0x3c931590, 0xf50a503c), TOBN(0xcac54f60, 0x616baa42)}}, {{TOBN(0x9b6cd380, 0xb2369f0f), TOBN(0x97d3a70d, 0x23c76151), TOBN(0x5f9dd6fc, 0x9862a9c6), TOBN(0x044c4ab2, 0x12312f51)}, {TOBN(0x035ea0fd, 0x834a2ddc), TOBN(0x49e6b862, 0xcc7b826d), TOBN(0xb03d6883, 0x62fce490), TOBN(0x62f2497a, 0xb37e36e9)}}, {{TOBN(0x04b005b6, 0xc6458293), TOBN(0x36bb5276, 0xe8d10af7), TOBN(0xacf2dc13, 0x8ee617b8), TOBN(0x470d2d35, 0xb004b3d4)}, {TOBN(0x06790832, 0xfeeb1b77), TOBN(0x2bb75c39, 0x85657f9c), TOBN(0xd70bd4ed, 0xc0f60004), TOBN(0xfe797ecc, 0x219b018b)}}, {{TOBN(0x9b5bec2a, 0x753aebcc), TOBN(0xdaf9f3dc, 0xc939eca5), TOBN(0xd6bc6833, 0xd095ad09), TOBN(0x98abdd51, 0xdaa4d2fc)}, {TOBN(0xd9840a31, 0x8d168be5), TOBN(0xcf7c10e0, 0x2325a23c), TOBN(0xa5c02aa0, 0x7e6ecfaf), TOBN(0x2462e7e6, 0xb5bfdf18)}}, {{TOBN(0xab2d8a8b, 0xa0cc3f12), TOBN(0x68dd485d, 0xbc672a29), TOBN(0x72039752, 0x596f2cd3), TOBN(0x5d3eea67, 0xa0cf3d8d)}, {TOBN(0x810a1a81, 0xe6602671), TOBN(0x8f144a40, 0x14026c0c), TOBN(0xbc753a6d, 0x76b50f85), TOBN(0xc4dc21e8, 0x645cd4a4)}}, {{TOBN(0xc5262dea, 0x521d0378), TOBN(0x802b8e0e, 0x05011c6f), TOBN(0x1ba19cbb, 0x0b4c19ea), TOBN(0x21db64b5, 0xebf0aaec)}, {TOBN(0x1f394ee9, 0x70342f9d), TOBN(0x93a10aee, 0x1bc44a14), TOBN(0xa7eed31b, 0x3efd0baa), TOBN(0x6e7c824e, 0x1d154e65)}}, {{TOBN(0xee23fa81, 0x9966e7ee), TOBN(0x64ec4aa8, 0x05b7920d), TOBN(0x2d44462d, 0x2d90aad4), TOBN(0xf44dd195, 0xdf277ad5)}, {TOBN(0x8d6471f1, 0xbb46b6a1), TOBN(0x1e65d313, 0xfd885090), TOBN(0x33a800f5, 0x13a977b4), TOBN(0xaca9d721, 0x0797e1ef)}}, {{TOBN(0x9a5a85a0, 0xfcff6a17), TOBN(0x9970a3f3, 0x1eca7cee), TOBN(0xbb9f0d6b, 0xc9504be3), TOBN(0xe0c504be, 0xadd24ee2)}, {TOBN(0x7e09d956, 0x77fcc2f4), TOBN(0xef1a5227, 0x65bb5fc4), TOBN(0x145d4fb1, 0x8b9286aa), TOBN(0x66fd0c5d, 0x6649028b)}}, {{TOBN(0x98857ceb, 0x1bf4581c), TOBN(0xe635e186, 0xaca7b166), TOBN(0x278ddd22, 0x659722ac), TOBN(0xa0903c4c, 0x1db68007)}, {TOBN(0x366e4589, 0x48f21402), TOBN(0x31b49c14, 0xb96abda2), TOBN(0x329c4b09, 0xe0403190), TOBN(0x97197ca3, 0xd29f43fe)}}, {{TOBN(0x8073dd1e, 0x274983d8), TOBN(0xda1a3bde, 0x55717c8f), TOBN(0xfd3d4da2, 0x0361f9d1), TOBN(0x1332d081, 0x4c7de1ce)}, {TOBN(0x9b7ef7a3, 0xaa6d0e10), TOBN(0x17db2e73, 0xf54f1c4a), TOBN(0xaf3dffae, 0x4cd35567), TOBN(0xaaa2f406, 0xe56f4e71)}}, {{TOBN(0x8966759e, 0x7ace3fc7), TOBN(0x9594eacf, 0x45a8d8c6), TOBN(0x8de3bd8b, 0x91834e0e), TOBN(0xafe4ca53, 0x548c0421)}, {TOBN(0xfdd7e856, 0xe6ee81c6), TOBN(0x8f671beb, 0x6b891a3a), TOBN(0xf7a58f2b, 0xfae63829), TOBN(0x9ab186fb, 0x9c11ac9f)}}, {{TOBN(0x8d6eb369, 0x10b5be76), TOBN(0x046b7739, 0xfb040bcd), TOBN(0xccb4529f, 0xcb73de88), TOBN(0x1df0fefc, 0xcf26be03)}, {TOBN(0xad7757a6, 0xbcfcd027), TOBN(0xa8786c75, 0xbb3165ca), TOBN(0xe9db1e34, 0x7e99a4d9), TOBN(0x99ee86df, 0xb06c504b)}}, {{TOBN(0x5b7c2ddd, 0xc15c9f0a), TOBN(0xdf87a734, 0x4295989e), TOBN(0x59ece47c, 0x03d08fda), TOBN(0xb074d3dd, 0xad5fc702)}, {TOBN(0x20407903, 0x51a03776), 
TOBN(0x2bb1f77b, 0x2a608007), TOBN(0x25c58f4f, 0xe1153185), TOBN(0xe6df62f6, 0x766e6447)}}, {{TOBN(0xefb3d1be, 0xed51275a), TOBN(0x5de47dc7, 0x2f0f483f), TOBN(0x7932d98e, 0x97c2bedf), TOBN(0xd5c11927, 0x0219f8a1)}, {TOBN(0x9d751200, 0xa73a294e), TOBN(0x5f88434a, 0x9dc20172), TOBN(0xd28d9fd3, 0xa26f506a), TOBN(0xa890cd31, 0x9d1dcd48)}}, {{TOBN(0x0aebaec1, 0x70f4d3b4), TOBN(0xfd1a1369, 0x0ffc8d00), TOBN(0xb9d9c240, 0x57d57838), TOBN(0x45929d26, 0x68bac361)}, {TOBN(0x5a2cd060, 0x25b15ca6), TOBN(0x4b3c83e1, 0x6e474446), TOBN(0x1aac7578, 0xee1e5134), TOBN(0xa418f5d6, 0xc91e2f41)}}, {{TOBN(0x6936fc8a, 0x213ed68b), TOBN(0x860ae7ed, 0x510a5224), TOBN(0x63660335, 0xdef09b53), TOBN(0x641b2897, 0xcd79c98d)}, {TOBN(0x29bd38e1, 0x01110f35), TOBN(0x79c26f42, 0x648b1937), TOBN(0x64dae519, 0x9d9164f4), TOBN(0xd85a2310, 0x0265c273)}}, {{TOBN(0x7173dd5d, 0x4b07e2b1), TOBN(0xd144c4cb, 0x8d9ea221), TOBN(0xe8b04ea4, 0x1105ab14), TOBN(0x92dda542, 0xfe80d8f1)}, {TOBN(0xe9982fa8, 0xcf03dce6), TOBN(0x8b5ea965, 0x1a22cffc), TOBN(0xf7f4ea7f, 0x3fad88c4), TOBN(0x62db773e, 0x6a5ba95c)}}, {{TOBN(0xd20f02fb, 0x93f24567), TOBN(0xfd46c69a, 0x315257ca), TOBN(0x0ac74cc7, 0x8bcab987), TOBN(0x46f31c01, 0x5ceca2f5)}, {TOBN(0x40aedb59, 0x888b219e), TOBN(0xe50ecc37, 0xe1fccd02), TOBN(0x1bcd9dad, 0x911f816c), TOBN(0x583cc1ec, 0x8db9b00c)}}, {{TOBN(0xf3cd2e66, 0xa483bf11), TOBN(0xfa08a6f5, 0xb1b2c169), TOBN(0xf375e245, 0x4be9fa28), TOBN(0x99a7ffec, 0x5b6d011f)}, {TOBN(0x6a3ebddb, 0xc4ae62da), TOBN(0x6cea00ae, 0x374aef5d), TOBN(0xab5fb98d, 0x9d4d05bc), TOBN(0x7cba1423, 0xd560f252)}}, {{TOBN(0x49b2cc21, 0x208490de), TOBN(0x1ca66ec3, 0xbcfb2879), TOBN(0x7f1166b7, 0x1b6fb16f), TOBN(0xfff63e08, 0x65fe5db3)}, {TOBN(0xb8345abe, 0x8b2610be), TOBN(0xb732ed80, 0x39de3df4), TOBN(0x0e24ed50, 0x211c32b4), TOBN(0xd10d8a69, 0x848ff27d)}}, {{TOBN(0xc1074398, 0xed4de248), TOBN(0xd7cedace, 0x10488927), TOBN(0xa4aa6bf8, 0x85673e13), TOBN(0xb46bae91, 0x6daf30af)}, {TOBN(0x07088472, 0xfcef7ad8), TOBN(0x61151608, 0xd4b35e97), TOBN(0xbcfe8f26, 0xdde29986), TOBN(0xeb84c4c7, 0xd5a34c79)}}, {{TOBN(0xc1eec55c, 0x164e1214), TOBN(0x891be86d, 0xa147bb03), TOBN(0x9fab4d10, 0x0ba96835), TOBN(0xbf01e9b8, 0xa5c1ae9f)}, {TOBN(0x6b4de139, 0xb186ebc0), TOBN(0xd5c74c26, 0x85b91bca), TOBN(0x5086a99c, 0xc2d93854), TOBN(0xeed62a7b, 0xa7a9dfbc)}}, {{TOBN(0x8778ed6f, 0x76b7618a), TOBN(0xbff750a5, 0x03b66062), TOBN(0x4cb7be22, 0xb65186db), TOBN(0x369dfbf0, 0xcc3a6d13)}, {TOBN(0xc7dab26c, 0x7191a321), TOBN(0x9edac3f9, 0x40ed718e), TOBN(0xbc142b36, 0xd0cfd183), TOBN(0xc8af82f6, 0x7c991693)}}, {{TOBN(0xb3d1e4d8, 0x97ce0b2a), TOBN(0xe6d7c87f, 0xc3a55cdf), TOBN(0x35846b95, 0x68b81afe), TOBN(0x018d12af, 0xd3c239d8)}, {TOBN(0x2b2c6208, 0x01206e15), TOBN(0xe0e42453, 0xa3b882c6), TOBN(0x854470a3, 0xa50162d5), TOBN(0x08157478, 0x7017a62a)}}, {{TOBN(0x18bd3fb4, 0x820357c7), TOBN(0x992039ae, 0x6f1458ad), TOBN(0x9a1df3c5, 0x25b44aa1), TOBN(0x2d780357, 0xed3d5281)}, {TOBN(0x58cf7e4d, 0xc77ad4d4), TOBN(0xd49a7998, 0xf9df4fc4), TOBN(0x4465a8b5, 0x1d71205e), TOBN(0xa0ee0ea6, 0x649254aa)}}, {{TOBN(0x4b5eeecf, 0xab7bd771), TOBN(0x6c873073, 0x35c262b9), TOBN(0xdc5bd648, 0x3c9d61e7), TOBN(0x233d6d54, 0x321460d2)}, {TOBN(0xd20c5626, 0xfc195bcc), TOBN(0x25445958, 0x04d78b63), TOBN(0xe03fcb3d, 0x17ec8ef3), TOBN(0x54b690d1, 0x46b8f781)}}, {{TOBN(0x82fa2c8a, 0x21230646), TOBN(0xf51aabb9, 0x084f418c), TOBN(0xff4fbec1, 0x1a30ba43), TOBN(0x6a5acf73, 0x743c9df7)}, {TOBN(0x1da2b357, 0xd635b4d5), TOBN(0xc3de68dd, 0xecd5c1da), TOBN(0xa689080b, 0xd61af0dd), TOBN(0xdea5938a, 0xd665bf99)}}, {{TOBN(0x0231d71a, 
0xfe637294), TOBN(0x01968aa6, 0xa5a81cd8), TOBN(0x11252d50, 0x048e63b5), TOBN(0xc446bc52, 0x6ca007e9)}, {TOBN(0xef8c50a6, 0x96d6134b), TOBN(0x9361fbf5, 0x9e09a05c), TOBN(0xf17f85a6, 0xdca3291a), TOBN(0xb178d548, 0xff251a21)}}, {{TOBN(0x87f6374b, 0xa4df3915), TOBN(0x566ce1bf, 0x2fd5d608), TOBN(0x425cba4d, 0x7de35102), TOBN(0x6b745f8f, 0x58c5d5e2)}, {TOBN(0x88402af6, 0x63122edf), TOBN(0x3190f9ed, 0x3b989a89), TOBN(0x4ad3d387, 0xebba3156), TOBN(0xef385ad9, 0xc7c469a5)}}, {{TOBN(0xb08281de, 0x3f642c29), TOBN(0x20be0888, 0x910ffb88), TOBN(0xf353dd4a, 0xd5292546), TOBN(0x3f1627de, 0x8377a262)}, {TOBN(0xa5faa013, 0xeefcd638), TOBN(0x8f3bf626, 0x74cc77c3), TOBN(0x32618f65, 0xa348f55e), TOBN(0x5787c0dc, 0x9fefeb9e)}}, {{TOBN(0xf1673aa2, 0xd9a23e44), TOBN(0x88dfa993, 0x4e10690d), TOBN(0x1ced1b36, 0x2bf91108), TOBN(0x9193ceca, 0x3af48649)}, {TOBN(0xfb34327d, 0x2d738fc5), TOBN(0x6697b037, 0x975fee6c), TOBN(0x2f485da0, 0xc04079a5), TOBN(0x2cdf5735, 0x2feaa1ac)}}, {{TOBN(0x76944420, 0xbd55659e), TOBN(0x7973e32b, 0x4376090c), TOBN(0x86bb4fe1, 0x163b591a), TOBN(0x10441aed, 0xc196f0ca)}, {TOBN(0x3b431f4a, 0x045ad915), TOBN(0x6c11b437, 0xa4afacb1), TOBN(0x30b0c7db, 0x71fdbbd8), TOBN(0xb642931f, 0xeda65acd)}}, {{TOBN(0x4baae6e8, 0x9c92b235), TOBN(0xa73bbd0e, 0x6b3993a1), TOBN(0xd06d60ec, 0x693dd031), TOBN(0x03cab91b, 0x7156881c)}, {TOBN(0xd615862f, 0x1db3574b), TOBN(0x485b0185, 0x64bb061a), TOBN(0x27434988, 0xa0181e06), TOBN(0x2cd61ad4, 0xc1c0c757)}}, {{TOBN(0x3effed5a, 0x2ff9f403), TOBN(0x8dc98d8b, 0x62239029), TOBN(0x2206021e, 0x1f17b70d), TOBN(0xafbec0ca, 0xbf510015)}, {TOBN(0x9fed7164, 0x80130dfa), TOBN(0x306dc2b5, 0x8a02dcf5), TOBN(0x48f06620, 0xfeb10fc0), TOBN(0x78d1e1d5, 0x5a57cf51)}}, {{TOBN(0xadef8c5a, 0x192ef710), TOBN(0x88afbd4b, 0x3b7431f9), TOBN(0x7e1f7407, 0x64250c9e), TOBN(0x6e31318d, 0xb58bec07)}, {TOBN(0xfd4fc4b8, 0x24f89b4e), TOBN(0x65a5dd88, 0x48c36a2a), TOBN(0x4f1eccff, 0xf024baa7), TOBN(0x22a21cf2, 0xcba94650)}}, {{TOBN(0x95d29dee, 0x42a554f7), TOBN(0x828983a5, 0x002ec4ba), TOBN(0x8112a1f7, 0x8badb73d), TOBN(0x79ea8897, 0xa27c1839)}, {TOBN(0x8969a5a7, 0xd065fd83), TOBN(0xf49af791, 0xb262a0bc), TOBN(0xfcdea8b6, 0xaf2b5127), TOBN(0x10e913e1, 0x564c2dbc)}}, {{TOBN(0x51239d14, 0xbc21ef51), TOBN(0xe51c3ceb, 0x4ce57292), TOBN(0x795ff068, 0x47bbcc3b), TOBN(0x86b46e1e, 0xbd7e11e6)}, {TOBN(0x0ea6ba23, 0x80041ef4), TOBN(0xd72fe505, 0x6262342e), TOBN(0x8abc6dfd, 0x31d294d4), TOBN(0xbbe017a2, 0x1278c2c9)}}, {{TOBN(0xb1fcfa09, 0xb389328a), TOBN(0x322fbc62, 0xd01771b5), TOBN(0x04c0d063, 0x60b045bf), TOBN(0xdb652edc, 0x10e52d01)}, {TOBN(0x50ef932c, 0x03ec6627), TOBN(0xde1b3b2d, 0xc1ee50e3), TOBN(0x5ab7bdc5, 0xdc37a90d), TOBN(0xfea67213, 0x31e33a96)}}, {{TOBN(0x6482b5cb, 0x4f2999aa), TOBN(0x38476cc6, 0xb8cbf0dd), TOBN(0x93ebfacb, 0x173405bb), TOBN(0x15cdafe7, 0xe52369ec)}, {TOBN(0xd42d5ba4, 0xd935b7db), TOBN(0x648b6004, 0x1c99a4cd), TOBN(0x785101bd, 0xa3b5545b), TOBN(0x4bf2c38a, 0x9dd67faf)}}, {{TOBN(0xb1aadc63, 0x4442449c), TOBN(0xe0e9921a, 0x33ad4fb8), TOBN(0x5c552313, 0xaa686d82), TOBN(0xdee635fa, 0x465d866c)}, {TOBN(0xbc3c224a, 0x18ee6e8a), TOBN(0xeed748a6, 0xed42e02f), TOBN(0xe70f930a, 0xd474cd08), TOBN(0x774ea6ec, 0xfff24adf)}}, {{TOBN(0x03e2de1c, 0xf3480d4a), TOBN(0xf0d8edc7, 0xbc8acf1a), TOBN(0xf23e3303, 0x68295a9c), TOBN(0xfadd5f68, 0xc546a97d)}, {TOBN(0x895597ad, 0x96f8acb1), TOBN(0xbddd49d5, 0x671bdae2), TOBN(0x16fcd528, 0x21dd43f4), TOBN(0xa5a45412, 0x6619141a)}}}, {{{TOBN(0x8ce9b6bf, 0xc360e25a), TOBN(0xe6425195, 0x075a1a78), TOBN(0x9dc756a8, 0x481732f4), TOBN(0x83c0440f, 0x5432b57a)}, 
{TOBN(0xc670b3f1, 0xd720281f), TOBN(0x2205910e, 0xd135e051), TOBN(0xded14b0e, 0xdb052be7), TOBN(0x697b3d27, 0xc568ea39)}}, {{TOBN(0x2e599b9a, 0xfb3ff9ed), TOBN(0x28c2e0ab, 0x17f6515c), TOBN(0x1cbee4fd, 0x474da449), TOBN(0x071279a4, 0x4f364452)}, {TOBN(0x97abff66, 0x01fbe855), TOBN(0x3ee394e8, 0x5fda51c4), TOBN(0x190385f6, 0x67597c0b), TOBN(0x6e9fccc6, 0xa27ee34b)}}, {{TOBN(0x0b89de93, 0x14092ebb), TOBN(0xf17256bd, 0x428e240c), TOBN(0xcf89a7f3, 0x93d2f064), TOBN(0x4f57841e, 0xe1ed3b14)}, {TOBN(0x4ee14405, 0xe708d855), TOBN(0x856aae72, 0x03f1c3d0), TOBN(0xc8e5424f, 0xbdd7eed5), TOBN(0x3333e4ef, 0x73ab4270)}}, {{TOBN(0x3bc77ade, 0xdda492f8), TOBN(0xc11a3aea, 0x78297205), TOBN(0x5e89a3e7, 0x34931b4c), TOBN(0x17512e2e, 0x9f5694bb)}, {TOBN(0x5dc349f3, 0x177bf8b6), TOBN(0x232ea4ba, 0x08c7ff3e), TOBN(0x9c4f9d16, 0xf511145d), TOBN(0xccf109a3, 0x33b379c3)}}, {{TOBN(0xe75e7a88, 0xa1f25897), TOBN(0x7ac6961f, 0xa1b5d4d8), TOBN(0xe3e10773, 0x08f3ed5c), TOBN(0x208a54ec, 0x0a892dfb)}, {TOBN(0xbe826e19, 0x78660710), TOBN(0x0cf70a97, 0x237df2c8), TOBN(0x418a7340, 0xed704da5), TOBN(0xa3eeb9a9, 0x08ca33fd)}}, {{TOBN(0x49d96233, 0x169bca96), TOBN(0x04d286d4, 0x2da6aafb), TOBN(0xc09606ec, 0xa0c2fa94), TOBN(0x8869d0d5, 0x23ff0fb3)}, {TOBN(0xa99937e5, 0xd0150d65), TOBN(0xa92e2503, 0x240c14c9), TOBN(0x656bf945, 0x108e2d49), TOBN(0x152a733a, 0xa2f59e2b)}}, {{TOBN(0xb4323d58, 0x8434a920), TOBN(0xc0af8e93, 0x622103c5), TOBN(0x667518ef, 0x938dbf9a), TOBN(0xa1843073, 0x83a9cdf2)}, {TOBN(0x350a94aa, 0x5447ab80), TOBN(0xe5e5a325, 0xc75a3d61), TOBN(0x74ba507f, 0x68411a9e), TOBN(0x10581fc1, 0x594f70c5)}}, {{TOBN(0x60e28570, 0x80eb24a9), TOBN(0x7bedfb4d, 0x488e0cfd), TOBN(0x721ebbd7, 0xc259cdb8), TOBN(0x0b0da855, 0xbc6390a9)}, {TOBN(0x2b4d04db, 0xde314c70), TOBN(0xcdbf1fbc, 0x6c32e846), TOBN(0x33833eab, 0xb162fc9e), TOBN(0x9939b48b, 0xb0dd3ab7)}}, {{TOBN(0x5aaa98a7, 0xcb0c9c8c), TOBN(0x75105f30, 0x81c4375c), TOBN(0xceee5057, 0x5ef1c90f), TOBN(0xb31e065f, 0xc23a17bf)}, {TOBN(0x5364d275, 0xd4b6d45a), TOBN(0xd363f3ad, 0x62ec8996), TOBN(0xb5d21239, 0x4391c65b), TOBN(0x84564765, 0xebb41b47)}}, {{TOBN(0x20d18ecc, 0x37107c78), TOBN(0xacff3b6b, 0x570c2a66), TOBN(0x22f975d9, 0x9bd0d845), TOBN(0xef0a0c46, 0xba178fa0)}, {TOBN(0x1a419651, 0x76b6028e), TOBN(0xc49ec674, 0x248612d4), TOBN(0x5b6ac4f2, 0x7338af55), TOBN(0x06145e62, 0x7bee5a36)}}, {{TOBN(0x33e95d07, 0xe75746b5), TOBN(0x1c1e1f6d, 0xc40c78be), TOBN(0x967833ef, 0x222ff8e2), TOBN(0x4bedcf6a, 0xb49180ad)}, {TOBN(0x6b37e9c1, 0x3d7a4c8a), TOBN(0x2748887c, 0x6ddfe760), TOBN(0xf7055123, 0xaa3a5bbc), TOBN(0x954ff225, 0x7bbb8e74)}}, {{TOBN(0xc42b8ab1, 0x97c3dfb9), TOBN(0x55a549b0, 0xcf168154), TOBN(0xad6748e7, 0xc1b50692), TOBN(0x2775780f, 0x6fc5cbcb)}, {TOBN(0x4eab80b8, 0xe1c9d7c8), TOBN(0x8c69dae1, 0x3fdbcd56), TOBN(0x47e6b4fb, 0x9969eace), TOBN(0x002f1085, 0xa705cb5a)}}, {{TOBN(0x4e23ca44, 0x6d3fea55), TOBN(0xb4ae9c86, 0xf4810568), TOBN(0x47bfb91b, 0x2a62f27d), TOBN(0x60deb4c9, 0xd9bac28c)}, {TOBN(0xa892d894, 0x7de6c34c), TOBN(0x4ee68259, 0x4494587d), TOBN(0x914ee14e, 0x1a3f8a5b), TOBN(0xbb113eaa, 0x28700385)}}, {{TOBN(0x81ca03b9, 0x2115b4c9), TOBN(0x7c163d38, 0x8908cad1), TOBN(0xc912a118, 0xaa18179a), TOBN(0xe09ed750, 0x886e3081)}, {TOBN(0xa676e3fa, 0x26f516ca), TOBN(0x753cacf7, 0x8e732f91), TOBN(0x51592aea, 0x833da8b4), TOBN(0xc626f42f, 0x4cbea8aa)}}, {{TOBN(0xef9dc899, 0xa7b56eaf), TOBN(0x00c0e52c, 0x34ef7316), TOBN(0x5b1e4e24, 0xfe818a86), TOBN(0x9d31e20d, 0xc538be47)}, {TOBN(0x22eb932d, 0x3ed68974), TOBN(0xe44bbc08, 0x7c4e87c4), TOBN(0x4121086e, 0x0dde9aef), TOBN(0x8e6b9cff, 
0x134f4345)}}, {{TOBN(0x96892c1f, 0x711b0eb9), TOBN(0xb905f2c8, 0x780ab954), TOBN(0xace26309, 0xa20792db), TOBN(0xec8ac9b3, 0x0684e126)}, {TOBN(0x486ad8b6, 0xb40a2447), TOBN(0x60121fc1, 0x9fe3fb24), TOBN(0x5626fccf, 0x1a8e3b3f), TOBN(0x4e568622, 0x6ad1f394)}}, {{TOBN(0xda7aae0d, 0x196aa5a1), TOBN(0xe0df8c77, 0x1041b5fb), TOBN(0x451465d9, 0x26b318b7), TOBN(0xc29b6e55, 0x7ab136e9)}, {TOBN(0x2c2ab48b, 0x71148463), TOBN(0xb5738de3, 0x64454a76), TOBN(0x54ccf9a0, 0x5a03abe4), TOBN(0x377c0296, 0x0427d58e)}}, {{TOBN(0x73f5f0b9, 0x2bb39c1f), TOBN(0x14373f2c, 0xe608d8c5), TOBN(0xdcbfd314, 0x00fbb805), TOBN(0xdf18fb20, 0x83afdcfb)}, {TOBN(0x81a57f42, 0x42b3523f), TOBN(0xe958532d, 0x87f650fb), TOBN(0xaa8dc8b6, 0x8b0a7d7c), TOBN(0x1b75dfb7, 0x150166be)}}, {{TOBN(0x90e4f7c9, 0x2d7d1413), TOBN(0x67e2d6b5, 0x9834f597), TOBN(0x4fd4f4f9, 0xa808c3e8), TOBN(0xaf8237e0, 0xd5281ec1)}, {TOBN(0x25ab5fdc, 0x84687cee), TOBN(0xc5ded6b1, 0xa5b26c09), TOBN(0x8e4a5aec, 0xc8ea7650), TOBN(0x23b73e5c, 0x14cc417f)}}, {{TOBN(0x2bfb4318, 0x3037bf52), TOBN(0xb61e6db5, 0x78c725d7), TOBN(0x8efd4060, 0xbbb3e5d7), TOBN(0x2e014701, 0xdbac488e)}, {TOBN(0xac75cf9a, 0x360aa449), TOBN(0xb70cfd05, 0x79634d08), TOBN(0xa591536d, 0xfffb15ef), TOBN(0xb2c37582, 0xd07c106c)}}, {{TOBN(0xb4293fdc, 0xf50225f9), TOBN(0xc52e175c, 0xb0e12b03), TOBN(0xf649c3ba, 0xd0a8bf64), TOBN(0x745a8fef, 0xeb8ae3c6)}, {TOBN(0x30d7e5a3, 0x58321bc3), TOBN(0xb1732be7, 0x0bc4df48), TOBN(0x1f217993, 0xe9ea5058), TOBN(0xf7a71cde, 0x3e4fd745)}}, {{TOBN(0x86cc533e, 0x894c5bbb), TOBN(0x6915c7d9, 0x69d83082), TOBN(0xa6aa2d05, 0x5815c244), TOBN(0xaeeee592, 0x49b22ce5)}, {TOBN(0x89e39d13, 0x78135486), TOBN(0x3a275c1f, 0x16b76f2f), TOBN(0xdb6bcc1b, 0xe036e8f5), TOBN(0x4df69b21, 0x5e4709f5)}}, {{TOBN(0xa188b250, 0x2d0f39aa), TOBN(0x622118bb, 0x15a85947), TOBN(0x2ebf520f, 0xfde0f4fa), TOBN(0xa40e9f29, 0x4860e539)}, {TOBN(0x7b6a51eb, 0x22b57f0f), TOBN(0x849a33b9, 0x7e80644a), TOBN(0x50e5d16f, 0x1cf095fe), TOBN(0xd754b54e, 0xec55f002)}}, {{TOBN(0x5cfbbb22, 0x236f4a98), TOBN(0x0b0c59e9, 0x066800bb), TOBN(0x4ac69a8f, 0x5a9a7774), TOBN(0x2b33f804, 0xd6bec948)}, {TOBN(0xb3729295, 0x32e6c466), TOBN(0x68956d0f, 0x4e599c73), TOBN(0xa47a249f, 0x155c31cc), TOBN(0x24d80f0d, 0xe1ce284e)}}, {{TOBN(0xcd821dfb, 0x988baf01), TOBN(0xe6331a7d, 0xdbb16647), TOBN(0x1eb8ad33, 0x094cb960), TOBN(0x593cca38, 0xc91bbca5)}, {TOBN(0x384aac8d, 0x26567456), TOBN(0x40fa0309, 0xc04b6490), TOBN(0x97834cd6, 0xdab6c8f6), TOBN(0x68a7318d, 0x3f91e55f)}}, {{TOBN(0xa00fd04e, 0xfc4d3157), TOBN(0xb56f8ab2, 0x2bf3bdea), TOBN(0x014f5648, 0x4fa57172), TOBN(0x948c5860, 0x450abdb3)}, {TOBN(0x342b5df0, 0x0ebd4f08), TOBN(0x3e5168cd, 0x0e82938e), TOBN(0x7aedc1ce, 0xb0df5dd0), TOBN(0x6bbbc6d9, 0xe5732516)}}, {{TOBN(0xc7bfd486, 0x605daaa6), TOBN(0x46fd72b7, 0xbb9a6c9e), TOBN(0xe4847fb1, 0xa124fb89), TOBN(0x75959cbd, 0xa2d8ffbc)}, {TOBN(0x42579f65, 0xc8a588ee), TOBN(0x368c92e6, 0xb80b499d), TOBN(0xea4ef6cd, 0x999a5df1), TOBN(0xaa73bb7f, 0x936fe604)}}, {{TOBN(0xf347a70d, 0x6457d188), TOBN(0x86eda86b, 0x8b7a388b), TOBN(0xb7cdff06, 0x0ccd6013), TOBN(0xbeb1b6c7, 0xd0053fb2)}, {TOBN(0x0b022387, 0x99240a9f), TOBN(0x1bbb384f, 0x776189b2), TOBN(0x8695e71e, 0x9066193a), TOBN(0x2eb50097, 0x06ffac7e)}}, {{TOBN(0x0654a9c0, 0x4a7d2caa), TOBN(0x6f3fb3d1, 0xa5aaa290), TOBN(0x835db041, 0xff476e8f), TOBN(0x540b8b0b, 0xc42295e4)}, {TOBN(0xa5c73ac9, 0x05e214f5), TOBN(0x9a74075a, 0x56a0b638), TOBN(0x2e4b1090, 0xce9e680b), TOBN(0x57a5b479, 0x6b8d9afa)}}, {{TOBN(0x0dca48e7, 0x26bfe65c), TOBN(0x097e391c, 0x7290c307), TOBN(0x683c462e, 0x6669e72e), 
TOBN(0xf505be1e, 0x062559ac)}, {TOBN(0x5fbe3ea1, 0xe3a3035a), TOBN(0x6431ebf6, 0x9cd50da8), TOBN(0xfd169d5c, 0x1f6407f2), TOBN(0x8d838a95, 0x60fce6b8)}}, {{TOBN(0x2a2bfa7f, 0x650006f0), TOBN(0xdfd7dad3, 0x50c0fbb2), TOBN(0x92452495, 0xccf9ad96), TOBN(0x183bf494, 0xd95635f9)}, {TOBN(0x02d5df43, 0x4a7bd989), TOBN(0x505385cc, 0xa5431095), TOBN(0xdd98e67d, 0xfd43f53e), TOBN(0xd61e1a6c, 0x500c34a9)}}, {{TOBN(0x5a4b46c6, 0x4a8a3d62), TOBN(0x8469c4d0, 0x247743d2), TOBN(0x2bb3a13d, 0x88f7e433), TOBN(0x62b23a10, 0x01be5849)}, {TOBN(0xe83596b4, 0xa63d1a4c), TOBN(0x454e7fea, 0x7d183f3e), TOBN(0x643fce61, 0x17afb01c), TOBN(0x4e65e5e6, 0x1c4c3638)}}, {{TOBN(0x41d85ea1, 0xef74c45b), TOBN(0x2cfbfa66, 0xae328506), TOBN(0x98b078f5, 0x3ada7da9), TOBN(0xd985fe37, 0xec752fbb)}, {TOBN(0xeece68fe, 0x5a0148b4), TOBN(0x6f9a55c7, 0x2d78136d), TOBN(0x232dccc4, 0xd2b729ce), TOBN(0xa27e0dfd, 0x90aafbc4)}}, {{TOBN(0x96474452, 0x12b4603e), TOBN(0xa876c551, 0x6b706d14), TOBN(0xdf145fcf, 0x69a9d412), TOBN(0xe2ab75b7, 0x2d479c34)}, {TOBN(0x12df9a76, 0x1a23ff97), TOBN(0xc6138992, 0x5d359d10), TOBN(0x6e51c7ae, 0xfa835f22), TOBN(0x69a79cb1, 0xc0fcc4d9)}}, {{TOBN(0xf57f350d, 0x594cc7e1), TOBN(0x3079ca63, 0x3350ab79), TOBN(0x226fb614, 0x9aff594a), TOBN(0x35afec02, 0x6d59a62b)}, {TOBN(0x9bee46f4, 0x06ed2c6e), TOBN(0x58da1735, 0x7d939a57), TOBN(0x44c50402, 0x8fd1797e), TOBN(0xd8853e7c, 0x5ccea6ca)}}, {{TOBN(0x4065508d, 0xa35fcd5f), TOBN(0x8965df8c, 0x495ccaeb), TOBN(0x0f2da850, 0x12e1a962), TOBN(0xee471b94, 0xc1cf1cc4)}, {TOBN(0xcef19bc8, 0x0a08fb75), TOBN(0x704958f5, 0x81de3591), TOBN(0x2867f8b2, 0x3aef4f88), TOBN(0x8d749384, 0xea9f9a5f)}}, {{TOBN(0x1b385537, 0x8c9049f4), TOBN(0x5be948f3, 0x7b92d8b6), TOBN(0xd96f725d, 0xb6e2bd6b), TOBN(0x37a222bc, 0x958c454d)}, {TOBN(0xe7c61abb, 0x8809bf61), TOBN(0x46f07fbc, 0x1346f18d), TOBN(0xfb567a7a, 0xe87c0d1c), TOBN(0x84a461c8, 0x7ef3d07a)}}, {{TOBN(0x0a5adce6, 0xd9278d98), TOBN(0x24d94813, 0x9dfc73e1), TOBN(0x4f3528b6, 0x054321c3), TOBN(0x2e03fdde, 0x692ea706)}, {TOBN(0x10e60619, 0x47b533c0), TOBN(0x1a8bc73f, 0x2ca3c055), TOBN(0xae58d4b2, 0x1bb62b8f), TOBN(0xb2045a73, 0x584a24e3)}}, {{TOBN(0x3ab3d5af, 0xbd76e195), TOBN(0x478dd1ad, 0x6938a810), TOBN(0x6ffab393, 0x6ee3d5cb), TOBN(0xdfb693db, 0x22b361e4)}, {TOBN(0xf9694496, 0x51dbf1a7), TOBN(0xcab4b4ef, 0x08a2e762), TOBN(0xe8c92f25, 0xd39bba9a), TOBN(0x850e61bc, 0xf1464d96)}}, {{TOBN(0xb7e830e3, 0xdc09508b), TOBN(0xfaf6d2cf, 0x74317655), TOBN(0x72606ceb, 0xdf690355), TOBN(0x48bb92b3, 0xd0c3ded6)}, {TOBN(0x65b75484, 0x5c7cf892), TOBN(0xf6cd7ac9, 0xd5d5f01f), TOBN(0xc2c30a59, 0x96401d69), TOBN(0x91268650, 0xed921878)}}, {{TOBN(0x380bf913, 0xb78c558f), TOBN(0x43c0baeb, 0xc8afdaa9), TOBN(0x377f61d5, 0x54f169d3), TOBN(0xf8da07e3, 0xae5ff20b)}, {TOBN(0xb676c49d, 0xa8a90ea8), TOBN(0x81c1ff2b, 0x83a29b21), TOBN(0x383297ac, 0x2ad8d276), TOBN(0x3001122f, 0xba89f982)}}, {{TOBN(0xe1d794be, 0x6718e448), TOBN(0x246c1482, 0x7c3e6e13), TOBN(0x56646ef8, 0x5d26b5ef), TOBN(0x80f5091e, 0x88069cdd)}, {TOBN(0xc5992e2f, 0x724bdd38), TOBN(0x02e915b4, 0x8471e8c7), TOBN(0x96ff320a, 0x0d0ff2a9), TOBN(0xbf886487, 0x4384d1a0)}}, {{TOBN(0xbbe1e6a6, 0xc93f72d6), TOBN(0xd5f75d12, 0xcad800ea), TOBN(0xfa40a09f, 0xe7acf117), TOBN(0x32c8cdd5, 0x7581a355)}, {TOBN(0x74221992, 0x7023c499), TOBN(0xa8afe5d7, 0x38ec3901), TOBN(0x5691afcb, 0xa90e83f0), TOBN(0x41bcaa03, 0x0b8f8eac)}}, {{TOBN(0xe38b5ff9, 0x8d2668d5), TOBN(0x0715281a, 0x7ad81965), TOBN(0x1bc8fc7c, 0x03c6ce11), TOBN(0xcbbee6e2, 0x8b650436)}, {TOBN(0x06b00fe8, 0x0cdb9808), TOBN(0x17d6e066, 0xfe3ed315), TOBN(0x2e9d38c6, 
0x4d0b5018), TOBN(0xab8bfd56, 0x844dcaef)}}, {{TOBN(0x42894a59, 0x513aed8b), TOBN(0xf77f3b6d, 0x314bd07a), TOBN(0xbbdecb8f, 0x8e42b582), TOBN(0xf10e2fa8, 0xd2390fe6)}, {TOBN(0xefb95022, 0x62a2f201), TOBN(0x4d59ea50, 0x50ee32b0), TOBN(0xd87f7728, 0x6da789a8), TOBN(0xcf98a2cf, 0xf79492c4)}}, {{TOBN(0xf9577239, 0x720943c2), TOBN(0xba044cf5, 0x3990b9d0), TOBN(0x5aa8e823, 0x95f2884a), TOBN(0x834de6ed, 0x0278a0af)}, {TOBN(0xc8e1ee9a, 0x5f25bd12), TOBN(0x9259ceaa, 0x6f7ab271), TOBN(0x7e6d97a2, 0x77d00b76), TOBN(0x5c0c6eea, 0xa437832a)}}, {{TOBN(0x5232c20f, 0x5606b81d), TOBN(0xabd7b375, 0x0d991ee5), TOBN(0x4d2bfe35, 0x8632d951), TOBN(0x78f85146, 0x98ed9364)}, {TOBN(0x951873f0, 0xf30c3282), TOBN(0x0da8ac80, 0xa789230b), TOBN(0x3ac7789c, 0x5398967f), TOBN(0xa69b8f7f, 0xbdda0fb5)}}, {{TOBN(0xe5db7717, 0x6add8545), TOBN(0x1b71cb66, 0x72c49b66), TOBN(0xd8560739, 0x68421d77), TOBN(0x03840fe8, 0x83e3afea)}, {TOBN(0xb391dad5, 0x1ec69977), TOBN(0xae243fb9, 0x307f6726), TOBN(0xc88ac87b, 0xe8ca160c), TOBN(0x5174cced, 0x4ce355f4)}}, {{TOBN(0x98a35966, 0xe58ba37d), TOBN(0xfdcc8da2, 0x7817335d), TOBN(0x5b752830, 0x83fbc7bf), TOBN(0x68e419d4, 0xd9c96984)}, {TOBN(0x409a39f4, 0x02a40380), TOBN(0x88940faf, 0x1fe977bc), TOBN(0xc640a94b, 0x8f8edea6), TOBN(0x1e22cd17, 0xed11547d)}}, {{TOBN(0xe28568ce, 0x59ffc3e2), TOBN(0x60aa1b55, 0xc1dee4e7), TOBN(0xc67497c8, 0x837cb363), TOBN(0x06fb438a, 0x105a2bf2)}, {TOBN(0x30357ec4, 0x500d8e20), TOBN(0x1ad9095d, 0x0670db10), TOBN(0x7f589a05, 0xc73b7cfd), TOBN(0xf544607d, 0x880d6d28)}}, {{TOBN(0x17ba93b1, 0xa20ef103), TOBN(0xad859130, 0x6ba6577b), TOBN(0x65c91cf6, 0x6fa214a0), TOBN(0xd7d49c6c, 0x27990da5)}, {TOBN(0xecd9ec8d, 0x20bb569d), TOBN(0xbd4b2502, 0xeeffbc33), TOBN(0x2056ca5a, 0x6bed0467), TOBN(0x7916a1f7, 0x5b63728c)}}, {{TOBN(0xd4f9497d, 0x53a4f566), TOBN(0x89734664, 0x97b56810), TOBN(0xf8e1da74, 0x0494a621), TOBN(0x82546a93, 0x8d011c68)}, {TOBN(0x1f3acb19, 0xc61ac162), TOBN(0x52f8fa9c, 0xabad0d3e), TOBN(0x15356523, 0xb4b7ea43), TOBN(0x5a16ad61, 0xae608125)}}, {{TOBN(0xb0bcb87f, 0x4faed184), TOBN(0x5f236b1d, 0x5029f45f), TOBN(0xd42c7607, 0x0bc6b1fc), TOBN(0xc644324e, 0x68aefce3)}, {TOBN(0x8e191d59, 0x5c5d8446), TOBN(0xc0208077, 0x13ae1979), TOBN(0xadcaee55, 0x3ba59cc7), TOBN(0x20ed6d6b, 0xa2cb81ba)}}, {{TOBN(0x0952ba19, 0xb6efcffc), TOBN(0x60f12d68, 0x97c0b87c), TOBN(0x4ee2c7c4, 0x9caa30bc), TOBN(0x767238b7, 0x97fbff4e)}, {TOBN(0xebc73921, 0x501b5d92), TOBN(0x3279e3df, 0xc2a37737), TOBN(0x9fc12bc8, 0x6d197543), TOBN(0xfa94dc6f, 0x0a40db4e)}}, {{TOBN(0x7392b41a, 0x530ccbbd), TOBN(0x87c82146, 0xea823525), TOBN(0xa52f984c, 0x05d98d0c), TOBN(0x2ae57d73, 0x5ef6974c)}, {TOBN(0x9377f7bf, 0x3042a6dd), TOBN(0xb1a007c0, 0x19647a64), TOBN(0xfaa9079a, 0x0cca9767), TOBN(0x3d81a25b, 0xf68f72d5)}}, {{TOBN(0x752067f8, 0xff81578e), TOBN(0x78622150, 0x9045447d), TOBN(0xc0c22fcf, 0x0505aa6f), TOBN(0x1030f0a6, 0x6bed1c77)}, {TOBN(0x31f29f15, 0x1f0bd739), TOBN(0x2d7989c7, 0xe6debe85), TOBN(0x5c070e72, 0x8e677e98), TOBN(0x0a817bd3, 0x06e81fd5)}}, {{TOBN(0xc110d830, 0xb0f2ac95), TOBN(0x48d0995a, 0xab20e64e), TOBN(0x0f3e00e1, 0x7729cd9a), TOBN(0x2a570c20, 0xdd556946)}, {TOBN(0x912dbcfd, 0x4e86214d), TOBN(0x2d014ee2, 0xcf615498), TOBN(0x55e2b1e6, 0x3530d76e), TOBN(0xc5135ae4, 0xfd0fd6d1)}}, {{TOBN(0x0066273a, 0xd4f3049f), TOBN(0xbb8e9893, 0xe7087477), TOBN(0x2dba1ddb, 0x14c6e5fd), TOBN(0xdba37886, 0x51f57e6c)}, {TOBN(0x5aaee0a6, 0x5a72f2cf), TOBN(0x1208bfbf, 0x7bea5642), TOBN(0xf5c6aa3b, 0x67872c37), TOBN(0xd726e083, 0x43f93224)}}, {{TOBN(0x1854daa5, 0x061f1658), TOBN(0xc0016df1, 0xdf0cd2b3), 
TOBN(0xc2a3f23e, 0x833d50de), TOBN(0x73b681d2, 0xbbbd3017)}, {TOBN(0x2f046dc4, 0x3ac343c0), TOBN(0x9c847e7d, 0x85716421), TOBN(0xe1e13c91, 0x0917eed4), TOBN(0x3fc9eebd, 0x63a1b9c6)}}, {{TOBN(0x0f816a72, 0x7fe02299), TOBN(0x6335ccc2, 0x294f3319), TOBN(0x3820179f, 0x4745c5be), TOBN(0xe647b782, 0x922f066e)}, {TOBN(0xc22e49de, 0x02cafb8a), TOBN(0x299bc2ff, 0xfcc2eccc), TOBN(0x9a8feea2, 0x6e0e8282), TOBN(0xa627278b, 0xfe893205)}}, {{TOBN(0xa7e19733, 0x7933e47b), TOBN(0xf4ff6b13, 0x2e766402), TOBN(0xa4d8be0a, 0x98440d9f), TOBN(0x658f5c2f, 0x38938808)}, {TOBN(0x90b75677, 0xc95b3b3e), TOBN(0xfa044269, 0x3137b6ff), TOBN(0x077b039b, 0x43c47c29), TOBN(0xcca95dd3, 0x8a6445b2)}}, {{TOBN(0x0b498ba4, 0x2333fc4c), TOBN(0x274f8e68, 0xf736a1b1), TOBN(0x6ca348fd, 0x5f1d4b2e), TOBN(0x24d3be78, 0xa8f10199)}, {TOBN(0x8535f858, 0xca14f530), TOBN(0xa6e7f163, 0x5b982e51), TOBN(0x847c8512, 0x36e1bf62), TOBN(0xf6a7c58e, 0x03448418)}}, {{TOBN(0x583f3703, 0xf9374ab6), TOBN(0x864f9195, 0x6e564145), TOBN(0x33bc3f48, 0x22526d50), TOBN(0x9f323c80, 0x1262a496)}, {TOBN(0xaa97a7ae, 0x3f046a9a), TOBN(0x70da183e, 0xdf8a039a), TOBN(0x5b68f71c, 0x52aa0ba6), TOBN(0x9be0fe51, 0x21459c2d)}}, {{TOBN(0xc1e17eb6, 0xcbc613e5), TOBN(0x33131d55, 0x497ea61c), TOBN(0x2f69d39e, 0xaf7eded5), TOBN(0x73c2f434, 0xde6af11b)}, {TOBN(0x4ca52493, 0xa4a375fa), TOBN(0x5f06787c, 0xb833c5c2), TOBN(0x814e091f, 0x3e6e71cf), TOBN(0x76451f57, 0x8b746666)}}}, {{{TOBN(0x80f9bdef, 0x694db7e0), TOBN(0xedca8787, 0xb9fcddc6), TOBN(0x51981c34, 0x03b8dce1), TOBN(0x4274dcf1, 0x70e10ba1)}, {TOBN(0xf72743b8, 0x6def6d1a), TOBN(0xd25b1670, 0xebdb1866), TOBN(0xc4491e8c, 0x050c6f58), TOBN(0x2be2b2ab, 0x87fbd7f5)}}, {{TOBN(0x3e0e5c9d, 0xd111f8ec), TOBN(0xbcc33f8d, 0xb7c4e760), TOBN(0x702f9a91, 0xbd392a51), TOBN(0x7da4a795, 0xc132e92d)}, {TOBN(0x1a0b0ae3, 0x0bb1151b), TOBN(0x54febac8, 0x02e32251), TOBN(0xea3a5082, 0x694e9e78), TOBN(0xe58ffec1, 0xe4fe40b8)}}, {{TOBN(0xf85592fc, 0xd1e0cf9e), TOBN(0xdea75f0d, 0xc0e7b2e8), TOBN(0xc04215cf, 0xc135584e), TOBN(0x174fc727, 0x2f57092a)}, {TOBN(0xe7277877, 0xeb930bea), TOBN(0x504caccb, 0x5eb02a5a), TOBN(0xf9fe08f7, 0xf5241b9b), TOBN(0xe7fb62f4, 0x8d5ca954)}}, {{TOBN(0xfbb8349d, 0x29c4120b), TOBN(0x9f94391f, 0xc0d0d915), TOBN(0xc4074fa7, 0x5410ba51), TOBN(0xa66adbf6, 0x150a5911)}, {TOBN(0xc164543c, 0x34bfca38), TOBN(0xe0f27560, 0xb9e1ccfc), TOBN(0x99da0f53, 0xe820219c), TOBN(0xe8234498, 0xc6b4997a)}}, {{TOBN(0xcfb88b76, 0x9d4c5423), TOBN(0x9e56eb10, 0xb0521c49), TOBN(0x418e0b5e, 0xbe8700a1), TOBN(0x00cbaad6, 0xf93cb58a)}, {TOBN(0xe923fbde, 0xd92a5e67), TOBN(0xca4979ac, 0x1f347f11), TOBN(0x89162d85, 0x6bc0585b), TOBN(0xdd6254af, 0xac3c70e3)}}, {{TOBN(0x7b23c513, 0x516e19e4), TOBN(0x56e2e847, 0xc5c4d593), TOBN(0x9f727d73, 0x5ce71ef6), TOBN(0x5b6304a6, 0xf79a44c5)}, {TOBN(0x6638a736, 0x3ab7e433), TOBN(0x1adea470, 0xfe742f83), TOBN(0xe054b854, 0x5b7fc19f), TOBN(0xf935381a, 0xba1d0698)}}, {{TOBN(0x546eab2d, 0x799e9a74), TOBN(0x96239e0e, 0xa949f729), TOBN(0xca274c6b, 0x7090055a), TOBN(0x835142c3, 0x9020c9b0)}, {TOBN(0xa405667a, 0xa2e8807f), TOBN(0x29f2c085, 0x1aa3d39e), TOBN(0xcc555d64, 0x42fc72f5), TOBN(0xe856e0e7, 0xfbeacb3c)}}, {{TOBN(0xb5504f9d, 0x918e4936), TOBN(0x65035ef6, 0xb2513982), TOBN(0x0553a0c2, 0x6f4d9cb9), TOBN(0x6cb10d56, 0xbea85509)}, {TOBN(0x48d957b7, 0xa242da11), TOBN(0x16a4d3dd, 0x672b7268), TOBN(0x3d7e637c, 0x8502a96b), TOBN(0x27c7032b, 0x730d463b)}}, {{TOBN(0xbdc02b18, 0xe4136a14), TOBN(0xbacf969d, 0x678e32bf), TOBN(0xc98d89a3, 0xdd9c3c03), TOBN(0x7b92420a, 0x23becc4f)}, {TOBN(0xd4b41f78, 0xc64d565c), TOBN(0x9f969d00, 
0x10f28295), TOBN(0xec7f7f76, 0xb13d051a), TOBN(0x08945e1e, 0xa92da585)}}, {{TOBN(0x55366b7d, 0x5846426f), TOBN(0xe7d09e89, 0x247d441d), TOBN(0x510b404d, 0x736fbf48), TOBN(0x7fa003d0, 0xe784bd7d)}, {TOBN(0x25f7614f, 0x17fd9596), TOBN(0x49e0e0a1, 0x35cb98db), TOBN(0x2c65957b, 0x2e83a76a), TOBN(0x5d40da8d, 0xcddbe0f8)}}, {{TOBN(0xf2b8c405, 0x050bad24), TOBN(0x8918426d, 0xc2aa4823), TOBN(0x2aeab3dd, 0xa38365a7), TOBN(0x72031717, 0x7c91b690)}, {TOBN(0x8b00d699, 0x60a94120), TOBN(0x478a255d, 0xe99eaeec), TOBN(0xbf656a5f, 0x6f60aafd), TOBN(0xdfd7cb75, 0x5dee77b3)}}, {{TOBN(0x37f68bb4, 0xa595939d), TOBN(0x03556479, 0x28740217), TOBN(0x8e740e7c, 0x84ad7612), TOBN(0xd89bc843, 0x9044695f)}, {TOBN(0xf7f3da5d, 0x85a9184d), TOBN(0x562563bb, 0x9fc0b074), TOBN(0x06d2e6aa, 0xf88a888e), TOBN(0x612d8643, 0x161fbe7c)}}, {{TOBN(0x465edba7, 0xf64085e7), TOBN(0xb230f304, 0x29aa8511), TOBN(0x53388426, 0xcda2d188), TOBN(0x90885735, 0x4b666649)}, {TOBN(0x6f02ff9a, 0x652f54f6), TOBN(0x65c82294, 0x5fae2bf0), TOBN(0x7816ade0, 0x62f5eee3), TOBN(0xdcdbdf43, 0xfcc56d70)}}, {{TOBN(0x9fb3bba3, 0x54530bb2), TOBN(0xbde3ef77, 0xcb0869ea), TOBN(0x89bc9046, 0x0b431163), TOBN(0x4d03d7d2, 0xe4819a35)}, {TOBN(0x33ae4f9e, 0x43b6a782), TOBN(0x216db307, 0x9c88a686), TOBN(0x91dd88e0, 0x00ffedd9), TOBN(0xb280da9f, 0x12bd4840)}}, {{TOBN(0x32a7cb8a, 0x1635e741), TOBN(0xfe14008a, 0x78be02a7), TOBN(0x3fafb334, 0x1b7ae030), TOBN(0x7fd508e7, 0x5add0ce9)}, {TOBN(0x72c83219, 0xd607ad51), TOBN(0x0f229c0a, 0x8d40964a), TOBN(0x1be2c336, 0x1c878da2), TOBN(0xe0c96742, 0xeab2ab86)}}, {{TOBN(0x458f8691, 0x3e538cd7), TOBN(0xa7001f6c, 0x8e08ad53), TOBN(0x52b8c6e6, 0xbf5d15ff), TOBN(0x548234a4, 0x011215dd)}, {TOBN(0xff5a9d2d, 0x3d5b4045), TOBN(0xb0ffeeb6, 0x4a904190), TOBN(0x55a3aca4, 0x48607f8b), TOBN(0x8cbd665c, 0x30a0672a)}}, {{TOBN(0x87f834e0, 0x42583068), TOBN(0x02da2aeb, 0xf3f6e683), TOBN(0x6b763e5d, 0x05c12248), TOBN(0x7230378f, 0x65a8aefc)}, {TOBN(0x93bd80b5, 0x71e8e5ca), TOBN(0x53ab041c, 0xb3b62524), TOBN(0x1b860513, 0x6c9c552e), TOBN(0xe84d402c, 0xd5524e66)}}, {{TOBN(0xa37f3573, 0xf37f5937), TOBN(0xeb0f6c7d, 0xd1e4fca5), TOBN(0x2965a554, 0xac8ab0fc), TOBN(0x17fbf56c, 0x274676ac)}, {TOBN(0x2e2f6bd9, 0xacf7d720), TOBN(0x41fc8f88, 0x10224766), TOBN(0x517a14b3, 0x85d53bef), TOBN(0xdae327a5, 0x7d76a7d1)}}, {{TOBN(0x6ad0a065, 0xc4818267), TOBN(0x33aa189b, 0x37c1bbc1), TOBN(0x64970b52, 0x27392a92), TOBN(0x21699a1c, 0x2d1535ea)}, {TOBN(0xcd20779c, 0xc2d7a7fd), TOBN(0xe3186059, 0x99c83cf2), TOBN(0x9b69440b, 0x72c0b8c7), TOBN(0xa81497d7, 0x7b9e0e4d)}}, {{TOBN(0x515d5c89, 0x1f5f82dc), TOBN(0x9a7f67d7, 0x6361079e), TOBN(0xa8da81e3, 0x11a35330), TOBN(0xe44990c4, 0x4b18be1b)}, {TOBN(0xc7d5ed95, 0xaf103e59), TOBN(0xece8aba7, 0x8dac9261), TOBN(0xbe82b099, 0x9394b8d3), TOBN(0x6830f09a, 0x16adfe83)}}, {{TOBN(0x250a29b4, 0x88172d01), TOBN(0x8b20bd65, 0xcaff9e02), TOBN(0xb8a7661e, 0xe8a6329a), TOBN(0x4520304d, 0xd3fce920)}, {TOBN(0xae45da1f, 0x2b47f7ef), TOBN(0xe07f5288, 0x5bffc540), TOBN(0xf7997009, 0x3464f874), TOBN(0x2244c2cd, 0xa6fa1f38)}}, {{TOBN(0x43c41ac1, 0x94d7d9b1), TOBN(0x5bafdd82, 0xc82e7f17), TOBN(0xdf0614c1, 0x5fda0fca), TOBN(0x74b043a7, 0xa8ae37ad)}, {TOBN(0x3ba6afa1, 0x9e71734c), TOBN(0x15d5437e, 0x9c450f2e), TOBN(0x4a5883fe, 0x67e242b1), TOBN(0x5143bdc2, 0x2c1953c2)}}, {{TOBN(0x542b8b53, 0xfc5e8920), TOBN(0x363bf9a8, 0x9a9cee08), TOBN(0x02375f10, 0xc3486e08), TOBN(0x2037543b, 0x8c5e70d2)}, {TOBN(0x7109bccc, 0x625640b4), TOBN(0xcbc1051e, 0x8bc62c3b), TOBN(0xf8455fed, 0x803f26ea), TOBN(0x6badceab, 0xeb372424)}}, {{TOBN(0xa2a9ce7c, 0x6b53f5f9), 
TOBN(0x64246595, 0x1b176d99), TOBN(0xb1298d36, 0xb95c081b), TOBN(0x53505bb8, 0x1d9a9ee6)}, {TOBN(0x3f6f9e61, 0xf2ba70b0), TOBN(0xd07e16c9, 0x8afad453), TOBN(0x9f1694bb, 0xe7eb4a6a), TOBN(0xdfebced9, 0x3cb0bc8e)}}, {{TOBN(0x92d3dcdc, 0x53868c8b), TOBN(0x174311a2, 0x386107a6), TOBN(0x4109e07c, 0x689b4e64), TOBN(0x30e4587f, 0x2df3dcb6)}, {TOBN(0x841aea31, 0x0811b3b2), TOBN(0x6144d41d, 0x0cce43ea), TOBN(0x464c4581, 0x2a9a7803), TOBN(0xd03d371f, 0x3e158930)}}, {{TOBN(0xc676d7f2, 0xb1f3390b), TOBN(0x9f7a1b8c, 0xa5b61272), TOBN(0x4ebebfc9, 0xc2e127a9), TOBN(0x4602500c, 0x5dd997bf)}, {TOBN(0x7f09771c, 0x4711230f), TOBN(0x058eb37c, 0x020f09c1), TOBN(0xab693d4b, 0xfee5e38b), TOBN(0x9289eb1f, 0x4653cbc0)}}, {{TOBN(0xbecf46ab, 0xd51b9cf5), TOBN(0xd2aa9c02, 0x9f0121af), TOBN(0x36aaf7d2, 0xe90dc274), TOBN(0x909e4ea0, 0x48b95a3c)}, {TOBN(0xe6b70496, 0x6f32dbdb), TOBN(0x672188a0, 0x8b030b3e), TOBN(0xeeffe5b3, 0xcfb617e2), TOBN(0x87e947de, 0x7c82709e)}}, {{TOBN(0xa44d2b39, 0x1770f5a7), TOBN(0xe4d4d791, 0x0e44eb82), TOBN(0x42e69d1e, 0x3f69712a), TOBN(0xbf11c4d6, 0xac6a820e)}, {TOBN(0xb5e7f3e5, 0x42c4224c), TOBN(0xd6b4e81c, 0x449d941c), TOBN(0x5d72bd16, 0x5450e878), TOBN(0x6a61e28a, 0xee25ac54)}}, {{TOBN(0x33272094, 0xe6f1cd95), TOBN(0x7512f30d, 0x0d18673f), TOBN(0x32f7a4ca, 0x5afc1464), TOBN(0x2f095656, 0x6bbb977b)}, {TOBN(0x586f47ca, 0xa8226200), TOBN(0x02c868ad, 0x1ac07369), TOBN(0x4ef2b845, 0xc613acbe), TOBN(0x43d7563e, 0x0386054c)}}, {{TOBN(0x54da9dc7, 0xab952578), TOBN(0xb5423df2, 0x26e84d0b), TOBN(0xa8b64eeb, 0x9b872042), TOBN(0xac205782, 0x5990f6df)}, {TOBN(0x4ff696eb, 0x21f4c77a), TOBN(0x1a79c3e4, 0xaab273af), TOBN(0x29bc922e, 0x9436b3f1), TOBN(0xff807ef8, 0xd6d9a27a)}}, {{TOBN(0x82acea3d, 0x778f22a0), TOBN(0xfb10b2e8, 0x5b5e7469), TOBN(0xc0b16980, 0x2818ee7d), TOBN(0x011afff4, 0xc91c1a2f)}, {TOBN(0x95a6d126, 0xad124418), TOBN(0x31c081a5, 0xe72e295f), TOBN(0x36bb283a, 0xf2f4db75), TOBN(0xd115540f, 0x7acef462)}}, {{TOBN(0xc7f3a8f8, 0x33f6746c), TOBN(0x21e46f65, 0xfea990ca), TOBN(0x915fd5c5, 0xcaddb0a9), TOBN(0xbd41f016, 0x78614555)}, {TOBN(0x346f4434, 0x426ffb58), TOBN(0x80559436, 0x14dbc204), TOBN(0xf3dd20fe, 0x5a969b7f), TOBN(0x9d59e956, 0xe899a39a)}}, {{TOBN(0xf1b0971c, 0x8ad4cf4b), TOBN(0x03448860, 0x2ffb8fb8), TOBN(0xf071ac3c, 0x65340ba4), TOBN(0x408d0596, 0xb27fd758)}, {TOBN(0xe7c78ea4, 0x98c364b0), TOBN(0xa4aac4a5, 0x051e8ab5), TOBN(0xb9e1d560, 0x485d9002), TOBN(0x9acd518a, 0x88844455)}}, {{TOBN(0xe4ca688f, 0xd06f56c0), TOBN(0xa48af70d, 0xdf027972), TOBN(0x691f0f04, 0x5e9a609d), TOBN(0xa9dd82cd, 0xee61270e)}, {TOBN(0x8903ca63, 0xa0ef18d3), TOBN(0x9fb7ee35, 0x3d6ca3bd), TOBN(0xa7b4a09c, 0xabf47d03), TOBN(0x4cdada01, 0x1c67de8e)}}, {{TOBN(0x52003749, 0x9355a244), TOBN(0xe77fd2b6, 0x4f2151a9), TOBN(0x695d6cf6, 0x66b4efcb), TOBN(0xc5a0cacf, 0xda2cfe25)}, {TOBN(0x104efe5c, 0xef811865), TOBN(0xf52813e8, 0x9ea5cc3d), TOBN(0x855683dc, 0x40b58dbc), TOBN(0x0338ecde, 0x175fcb11)}}, {{TOBN(0xf9a05637, 0x74921592), TOBN(0xb4f1261d, 0xb9bb9d31), TOBN(0x551429b7, 0x4e9c5459), TOBN(0xbe182e6f, 0x6ea71f53)}, {TOBN(0xd3a3b07c, 0xdfc50573), TOBN(0x9ba1afda, 0x62be8d44), TOBN(0x9bcfd2cb, 0x52ab65d3), TOBN(0xdf11d547, 0xa9571802)}}, {{TOBN(0x099403ee, 0x02a2404a), TOBN(0x497406f4, 0x21088a71), TOBN(0x99479409, 0x5004ae71), TOBN(0xbdb42078, 0xa812c362)}, {TOBN(0x2b72a30f, 0xd8828442), TOBN(0x283add27, 0xfcb5ed1c), TOBN(0xf7c0e200, 0x66a40015), TOBN(0x3e3be641, 0x08b295ef)}}, {{TOBN(0xac127dc1, 0xe038a675), TOBN(0x729deff3, 0x8c5c6320), TOBN(0xb7df8fd4, 0xa90d2c53), TOBN(0x9b74b0ec, 0x681e7cd3)}, {TOBN(0x5cb5a623, 
0xdab407e5), TOBN(0xcdbd3615, 0x76b340c6), TOBN(0xa184415a, 0x7d28392c), TOBN(0xc184c1d8, 0xe96f7830)}}, {{TOBN(0xc3204f19, 0x81d3a80f), TOBN(0xfde0c841, 0xc8e02432), TOBN(0x78203b3e, 0x8149e0c1), TOBN(0x5904bdbb, 0x08053a73)}, {TOBN(0x30fc1dd1, 0x101b6805), TOBN(0x43c223bc, 0x49aa6d49), TOBN(0x9ed67141, 0x7a174087), TOBN(0x311469a0, 0xd5997008)}}, {{TOBN(0xb189b684, 0x5e43fc61), TOBN(0xf3282375, 0xe0d3ab57), TOBN(0x4fa34b67, 0xb1181da8), TOBN(0x621ed0b2, 0x99ee52b8)}, {TOBN(0x9b178de1, 0xad990676), TOBN(0xd51de67b, 0x56d54065), TOBN(0x2a2c27c4, 0x7538c201), TOBN(0x33856ec8, 0x38a40f5c)}}, {{TOBN(0x2522fc15, 0xbe6cdcde), TOBN(0x1e603f33, 0x9f0c6f89), TOBN(0x7994edc3, 0x103e30a6), TOBN(0x033a00db, 0x220c853e)}, {TOBN(0xd3cfa409, 0xf7bb7fd7), TOBN(0x70f8781e, 0x462d18f6), TOBN(0xbbd82980, 0x687fe295), TOBN(0x6eef4c32, 0x595669f3)}}, {{TOBN(0x86a9303b, 0x2f7e85c3), TOBN(0x5fce4621, 0x71988f9b), TOBN(0x5b935bf6, 0xc138acb5), TOBN(0x30ea7d67, 0x25661212)}, {TOBN(0xef1eb5f4, 0xe51ab9a2), TOBN(0x0587c98a, 0xae067c78), TOBN(0xb3ce1b3c, 0x77ca9ca6), TOBN(0x2a553d4d, 0x54b5f057)}}, {{TOBN(0xc7898236, 0x4da29ec2), TOBN(0xdbdd5d13, 0xb9c57316), TOBN(0xc57d6e6b, 0x2cd80d47), TOBN(0x80b460cf, 0xfe9e7391)}, {TOBN(0x98648cab, 0xf963c31e), TOBN(0x67f9f633, 0xcc4d32fd), TOBN(0x0af42a9d, 0xfdf7c687), TOBN(0x55f292a3, 0x0b015ea7)}}, {{TOBN(0x89e468b2, 0xcd21ab3d), TOBN(0xe504f022, 0xc393d392), TOBN(0xab21e1d4, 0xa5013af9), TOBN(0xe3283f78, 0xc2c28acb)}, {TOBN(0xf38b35f6, 0x226bf99f), TOBN(0xe8354274, 0x0e291e69), TOBN(0x61673a15, 0xb20c162d), TOBN(0xc101dc75, 0xb04fbdbe)}}, {{TOBN(0x8323b4c2, 0x255bd617), TOBN(0x6c969693, 0x6c2a9154), TOBN(0xc6e65860, 0x62679387), TOBN(0x8e01db0c, 0xb8c88e23)}, {TOBN(0x33c42873, 0x893a5559), TOBN(0x7630f04b, 0x47a3e149), TOBN(0xb5d80805, 0xddcf35f8), TOBN(0x582ca080, 0x77dfe732)}}, {{TOBN(0x2c7156e1, 0x0b1894a0), TOBN(0x92034001, 0xd81c68c0), TOBN(0xed225d00, 0xc8b115b5), TOBN(0x237f9c22, 0x83b907f2)}, {TOBN(0x0ea2f32f, 0x4470e2c0), TOBN(0xb725f7c1, 0x58be4e95), TOBN(0x0f1dcafa, 0xb1ae5463), TOBN(0x59ed5187, 0x1ba2fc04)}}, {{TOBN(0xf6e0f316, 0xd0115d4d), TOBN(0x5180b12f, 0xd3691599), TOBN(0x157e32c9, 0x527f0a41), TOBN(0x7b0b081d, 0xa8e0ecc0)}, {TOBN(0x6dbaaa8a, 0xbf4f0dd0), TOBN(0x99b289c7, 0x4d252696), TOBN(0x79b7755e, 0xdbf864fe), TOBN(0x6974e2b1, 0x76cad3ab)}}, {{TOBN(0x35dbbee2, 0x06ddd657), TOBN(0xe7cbdd11, 0x2ff3a96d), TOBN(0x88381968, 0x076be758), TOBN(0x2d737e72, 0x08c91f5d)}, {TOBN(0x5f83ab62, 0x86ec3776), TOBN(0x98aa649d, 0x945fa7a1), TOBN(0xf477ec37, 0x72ef0933), TOBN(0x66f52b1e, 0x098c17b1)}}, {{TOBN(0x9eec58fb, 0xd803738b), TOBN(0x91aaade7, 0xe4e86aa4), TOBN(0x6b1ae617, 0xa5b51492), TOBN(0x63272121, 0xbbc45974)}, {TOBN(0x7e0e28f0, 0x862c5129), TOBN(0x0a8f79a9, 0x3321a4a0), TOBN(0xe26d1664, 0x5041c88f), TOBN(0x0571b805, 0x53233e3a)}}, {{TOBN(0xd1b0ccde, 0xc9520711), TOBN(0x55a9e4ed, 0x3c8b84bf), TOBN(0x9426bd39, 0xa1fef314), TOBN(0x4f5f638e, 0x6eb93f2b)}, {TOBN(0xba2a1ed3, 0x2bf9341b), TOBN(0xd63c1321, 0x4d42d5a9), TOBN(0xd2964a89, 0x316dc7c5), TOBN(0xd1759606, 0xca511851)}}, {{TOBN(0xd8a9201f, 0xf9e6ed35), TOBN(0xb7b5ee45, 0x6736925a), TOBN(0x0a83fbbc, 0x99581af7), TOBN(0x3076bc40, 0x64eeb051)}, {TOBN(0x5511c98c, 0x02dec312), TOBN(0x270de898, 0x238dcb78), TOBN(0x2cf4cf9c, 0x539c08c9), TOBN(0xa70cb65e, 0x38d3b06e)}}, {{TOBN(0xb12ec10e, 0xcfe57bbd), TOBN(0x82c7b656, 0x35a0c2b5), TOBN(0xddc7d5cd, 0x161c67bd), TOBN(0xe32e8985, 0xae3a32cc)}, {TOBN(0x7aba9444, 0xd11a5529), TOBN(0xe964ed02, 0x2427fa1a), TOBN(0x1528392d, 0x24a1770a), TOBN(0xa152ce2c, 0x12c72fcd)}}, 
{{TOBN(0x714553a4, 0x8ec07649), TOBN(0x18b4c290, 0x459dd453), TOBN(0xea32b714, 0x7b64b110), TOBN(0xb871bfa5, 0x2e6f07a2)}, {TOBN(0xb67112e5, 0x9e2e3c9b), TOBN(0xfbf250e5, 0x44aa90f6), TOBN(0xf77aedb8, 0xbd539006), TOBN(0x3b0cdf9a, 0xd172a66f)}}, {{TOBN(0xedf69fea, 0xf8c51187), TOBN(0x05bb67ec, 0x741e4da7), TOBN(0x47df0f32, 0x08114345), TOBN(0x56facb07, 0xbb9792b1)}, {TOBN(0xf3e007e9, 0x8f6229e4), TOBN(0x62d103f4, 0x526fba0f), TOBN(0x4f33bef7, 0xb0339d79), TOBN(0x9841357b, 0xb59bfec1)}}, {{TOBN(0xfa8dbb59, 0xc34e6705), TOBN(0xc3c7180b, 0x7fdaa84c), TOBN(0xf95872fc, 0xa4108537), TOBN(0x8750cc3b, 0x932a3e5a)}, {TOBN(0xb61cc69d, 0xb7275d7d), TOBN(0xffa0168b, 0x2e59b2e9), TOBN(0xca032abc, 0x6ecbb493), TOBN(0x1d86dbd3, 0x2c9082d8)}}, {{TOBN(0xae1e0b67, 0xe28ef5ba), TOBN(0x2c9a4699, 0xcb18e169), TOBN(0x0ecd0e33, 0x1e6bbd20), TOBN(0x571b360e, 0xaf5e81d2)}, {TOBN(0xcd9fea58, 0x101c1d45), TOBN(0x6651788e, 0x18880452), TOBN(0xa9972635, 0x1f8dd446), TOBN(0x44bed022, 0xe37281d0)}}, {{TOBN(0x094b2b2d, 0x33da525d), TOBN(0xf193678e, 0x13144fd8), TOBN(0xb8ab5ba4, 0xf4c1061d), TOBN(0x4343b5fa, 0xdccbe0f4)}, {TOBN(0xa8702371, 0x63812713), TOBN(0x47bf6d2d, 0xf7611d93), TOBN(0x46729b8c, 0xbd21e1d7), TOBN(0x7484d4e0, 0xd629e77d)}}, {{TOBN(0x830e6eea, 0x60dbac1f), TOBN(0x23d8c484, 0xda06a2f7), TOBN(0x896714b0, 0x50ca535b), TOBN(0xdc8d3644, 0xebd97a9b)}, {TOBN(0x106ef9fa, 0xb12177b4), TOBN(0xf79bf464, 0x534d5d9c), TOBN(0x2537a349, 0xa6ab360b), TOBN(0xc7c54253, 0xa00c744f)}}, {{TOBN(0xb3c7a047, 0xe5911a76), TOBN(0x61ffa5c8, 0x647f1ee7), TOBN(0x15aed36f, 0x8f56ab42), TOBN(0x6a0d41b0, 0xa3ff9ac9)}, {TOBN(0x68f469f5, 0xcc30d357), TOBN(0xbe9adf81, 0x6b72be96), TOBN(0x1cd926fe, 0x903ad461), TOBN(0x7e89e38f, 0xcaca441b)}}, {{TOBN(0xf0f82de5, 0xfacf69d4), TOBN(0x363b7e76, 0x4775344c), TOBN(0x6894f312, 0xb2e36d04), TOBN(0x3c6cb4fe, 0x11d1c9a5)}, {TOBN(0x85d9c339, 0x4008e1f2), TOBN(0x5e9a85ea, 0x249f326c), TOBN(0xdc35c60a, 0x678c5e06), TOBN(0xc08b944f, 0x9f86fba9)}}, {{TOBN(0xde40c02c, 0x89f71f0f), TOBN(0xad8f3e31, 0xff3da3c0), TOBN(0x3ea5096b, 0x42125ded), TOBN(0x13879cbf, 0xa7379183)}, {TOBN(0x6f4714a5, 0x6b306a0b), TOBN(0x359c2ea6, 0x67646c5e), TOBN(0xfacf8943, 0x07726368), TOBN(0x07a58935, 0x65ff431e)}}, {{TOBN(0x24d661d1, 0x68754ab0), TOBN(0x801fce1d, 0x6f429a76), TOBN(0xc068a85f, 0xa58ce769), TOBN(0xedc35c54, 0x5d5eca2b)}, {TOBN(0xea31276f, 0xa3f660d1), TOBN(0xa0184ebe, 0xb8fc7167), TOBN(0x0f20f21a, 0x1d8db0ae), TOBN(0xd96d095f, 0x56c35e12)}}, {{TOBN(0xedf402b5, 0xf8c2a25b), TOBN(0x1bb772b9, 0x059204b6), TOBN(0x50cbeae2, 0x19b4e34c), TOBN(0x93109d80, 0x3fa0845a)}, {TOBN(0x54f7ccf7, 0x8ef59fb5), TOBN(0x3b438fe2, 0x88070963), TOBN(0x9e28c659, 0x31f3ba9b), TOBN(0x9cc31b46, 0xead9da92)}}, {{TOBN(0x3c2f0ba9, 0xb733aa5f), TOBN(0xdece47cb, 0xf05af235), TOBN(0xf8e3f715, 0xa2ac82a5), TOBN(0xc97ba641, 0x2203f18a)}, {TOBN(0xc3af5504, 0x09c11060), TOBN(0x56ea2c05, 0x46af512d), TOBN(0xfac28daf, 0xf3f28146), TOBN(0x87fab43a, 0x959ef494)}}}, {{{TOBN(0x09891641, 0xd4c5105f), TOBN(0x1ae80f8e, 0x6d7fbd65), TOBN(0x9d67225f, 0xbee6bdb0), TOBN(0x3b433b59, 0x7fc4d860)}, {TOBN(0x44e66db6, 0x93e85638), TOBN(0xf7b59252, 0xe3e9862f), TOBN(0xdb785157, 0x665c32ec), TOBN(0x702fefd7, 0xae362f50)}}, {{TOBN(0x3754475d, 0x0fefb0c3), TOBN(0xd48fb56b, 0x46d7c35d), TOBN(0xa070b633, 0x363798a4), TOBN(0xae89f3d2, 0x8fdb98e6)}, {TOBN(0x970b89c8, 0x6363d14c), TOBN(0x89817521, 0x67abd27d), TOBN(0x9bf7d474, 0x44d5a021), TOBN(0xb3083baf, 0xcac72aee)}}, {{TOBN(0x389741de, 0xbe949a44), TOBN(0x638e9388, 0x546a4fa5), TOBN(0x3fe6419c, 0xa0047bdc), TOBN(0x7047f648, 
0xaaea57ca)}, {TOBN(0x54e48a90, 0x41fbab17), TOBN(0xda8e0b28, 0x576bdba2), TOBN(0xe807eebc, 0xc72afddc), TOBN(0x07d3336d, 0xf42577bf)}}, {{TOBN(0x62a8c244, 0xbfe20925), TOBN(0x91c19ac3, 0x8fdce867), TOBN(0x5a96a5d5, 0xdd387063), TOBN(0x61d587d4, 0x21d324f6)}, {TOBN(0xe87673a2, 0xa37173ea), TOBN(0x23848008, 0x53778b65), TOBN(0x10f8441e, 0x05bab43e), TOBN(0xfa11fe12, 0x4621efbe)}}, {{TOBN(0x047b772e, 0x81685d7b), TOBN(0x23f27d81, 0xbf34a976), TOBN(0xc27608e2, 0x915f48ef), TOBN(0x3b0b43fa, 0xa521d5c3)}, {TOBN(0x7613fb26, 0x63ca7284), TOBN(0x7f5729b4, 0x1d4db837), TOBN(0x87b14898, 0x583b526b), TOBN(0x00b732a6, 0xbbadd3d1)}}, {{TOBN(0x8e02f426, 0x2048e396), TOBN(0x436b50b6, 0x383d9de4), TOBN(0xf78d3481, 0x471e85ad), TOBN(0x8b01ea6a, 0xd005c8d6)}, {TOBN(0xd3c7afee, 0x97015c07), TOBN(0x46cdf1a9, 0x4e3ba2ae), TOBN(0x7a42e501, 0x83d3a1d2), TOBN(0xd54b5268, 0xb541dff4)}}, {{TOBN(0x3f24cf30, 0x4e23e9bc), TOBN(0x4387f816, 0x126e3624), TOBN(0x26a46a03, 0x3b0b6d61), TOBN(0xaf1bc845, 0x8b2d777c)}, {TOBN(0x25c401ba, 0x527de79c), TOBN(0x0e1346d4, 0x4261bbb6), TOBN(0x4b96c44b, 0x287b4bc7), TOBN(0x658493c7, 0x5254562f)}}, {{TOBN(0x23f949fe, 0xb8a24a20), TOBN(0x17ebfed1, 0xf52ca53f), TOBN(0x9b691bbe, 0xbcfb4853), TOBN(0x5617ff6b, 0x6278a05d)}, {TOBN(0x241b34c5, 0xe3c99ebd), TOBN(0xfc64242e, 0x1784156a), TOBN(0x4206482f, 0x695d67df), TOBN(0xb967ce0e, 0xee27c011)}}, {{TOBN(0x65db3751, 0x21c80b5d), TOBN(0x2e7a563c, 0xa31ecca0), TOBN(0xe56ffc4e, 0x5238a07e), TOBN(0x3d6c2966, 0x32ced854)}, {TOBN(0xe99d7d1a, 0xaf70b885), TOBN(0xafc3bad9, 0x2d686459), TOBN(0x9c78bf46, 0x0cc8ba5b), TOBN(0x5a439519, 0x18955aa3)}}, {{TOBN(0xf8b517a8, 0x5fe4e314), TOBN(0xe60234d0, 0xfcb8906f), TOBN(0xffe542ac, 0xf2061b23), TOBN(0x287e191f, 0x6b4cb59c)}, {TOBN(0x21857ddc, 0x09d877d8), TOBN(0x1c23478c, 0x14678941), TOBN(0xbbf0c056, 0xb6e05ea4), TOBN(0x82da4b53, 0xb01594fe)}}, {{TOBN(0xf7526791, 0xfadb8608), TOBN(0x049e832d, 0x7b74cdf6), TOBN(0xa43581cc, 0xc2b90a34), TOBN(0x73639eb8, 0x9360b10c)}, {TOBN(0x4fba331f, 0xe1e4a71b), TOBN(0x6ffd6b93, 0x8072f919), TOBN(0x6e53271c, 0x65679032), TOBN(0x67206444, 0xf14272ce)}}, {{TOBN(0xc0f734a3, 0xb2335834), TOBN(0x9526205a, 0x90ef6860), TOBN(0xcb8be717, 0x04e2bb0d), TOBN(0x2418871e, 0x02f383fa)}, {TOBN(0xd7177681, 0x4082c157), TOBN(0xcc914ad0, 0x29c20073), TOBN(0xf186c1eb, 0xe587e728), TOBN(0x6fdb3c22, 0x61bcd5fd)}}, {{TOBN(0x30d014a6, 0xf2f9f8e9), TOBN(0x963ece23, 0x4fec49d2), TOBN(0x862025c5, 0x9605a8d9), TOBN(0x39874445, 0x19f8929a)}, {TOBN(0x01b6ff65, 0x12bf476a), TOBN(0x598a64d8, 0x09cf7d91), TOBN(0xd7ec7749, 0x93be56ca), TOBN(0x10899785, 0xcbb33615)}}, {{TOBN(0xb8a092fd, 0x02eee3ad), TOBN(0xa86b3d35, 0x30145270), TOBN(0x323d98c6, 0x8512b675), TOBN(0x4b8bc785, 0x62ebb40f)}, {TOBN(0x7d301f54, 0x413f9cde), TOBN(0xa5e4fb4f, 0x2bab5664), TOBN(0x1d2b252d, 0x1cbfec23), TOBN(0xfcd576bb, 0xe177120d)}}, {{TOBN(0x04427d3e, 0x83731a34), TOBN(0x2bb9028e, 0xed836e8e), TOBN(0xb36acff8, 0xb612ca7c), TOBN(0xb88fe5ef, 0xd3d9c73a)}, {TOBN(0xbe2a6bc6, 0xedea4eb3), TOBN(0x43b93133, 0x488eec77), TOBN(0xf41ff566, 0xb17106e1), TOBN(0x469e9172, 0x654efa32)}}, {{TOBN(0xb4480f04, 0x41c23fa3), TOBN(0xb4712eb0, 0xc1989a2e), TOBN(0x3ccbba0f, 0x93a29ca7), TOBN(0x6e205c14, 0xd619428c)}, {TOBN(0x90db7957, 0xb3641686), TOBN(0x0432691d, 0x45ac8b4e), TOBN(0x07a759ac, 0xf64e0350), TOBN(0x0514d89c, 0x9c972517)}}, {{TOBN(0x1701147f, 0xa8e67fc3), TOBN(0x9e2e0b8b, 0xab2085be), TOBN(0xd5651824, 0xac284e57), TOBN(0x890d4325, 0x74893664)}, {TOBN(0x8a7c5e6e, 0xc55e68a3), TOBN(0xbf12e90b, 0x4339c85a), TOBN(0x31846b85, 0xf922b655), 
TOBN(0x9a54ce4d, 0x0bf4d700)}}, {{TOBN(0xd7f4e83a, 0xf1a14295), TOBN(0x916f955c, 0xb285d4f9), TOBN(0xe57bb0e0, 0x99ffdaba), TOBN(0x28a43034, 0xeab0d152)}, {TOBN(0x0a36ffa2, 0xb8a9cef8), TOBN(0x5517407e, 0xb9ec051a), TOBN(0x9c796096, 0xea68e672), TOBN(0x853db5fb, 0xfb3c77fb)}}, {{TOBN(0x21474ba9, 0xe864a51a), TOBN(0x6c267699, 0x6e8a1b8b), TOBN(0x7c823626, 0x94120a28), TOBN(0xe61e9a48, 0x8383a5db)}, {TOBN(0x7dd75003, 0x9f84216d), TOBN(0xab020d07, 0xad43cd85), TOBN(0x9437ae48, 0xda12c659), TOBN(0x6449c2eb, 0xe65452ad)}}, {{TOBN(0xcc7c4c1c, 0x2cf9d7c1), TOBN(0x1320886a, 0xee95e5ab), TOBN(0xbb7b9056, 0xbeae170c), TOBN(0xc8a5b250, 0xdbc0d662)}, {TOBN(0x4ed81432, 0xc11d2303), TOBN(0x7da66912, 0x1f03769f), TOBN(0x3ac7a5fd, 0x84539828), TOBN(0x14dada94, 0x3bccdd02)}}, {{TOBN(0x8b84c321, 0x7ef6b0d1), TOBN(0x52a9477a, 0x7c933f22), TOBN(0x5ef6728a, 0xfd440b82), TOBN(0x5c3bd859, 0x6ce4bd5e)}, {TOBN(0x918b80f5, 0xf22c2d3e), TOBN(0x368d5040, 0xb7bb6cc5), TOBN(0xb66142a1, 0x2695a11c), TOBN(0x60ac583a, 0xeb19ea70)}}, {{TOBN(0x317cbb98, 0x0eab2437), TOBN(0x8cc08c55, 0x5e2654c8), TOBN(0xfe2d6520, 0xe6d8307f), TOBN(0xe9f147f3, 0x57428993)}, {TOBN(0x5f9c7d14, 0xd2fd6cf1), TOBN(0xa3ecd064, 0x2d4fcbb0), TOBN(0xad83fef0, 0x8e7341f7), TOBN(0x643f23a0, 0x3a63115c)}}, {{TOBN(0xd38a78ab, 0xe65ab743), TOBN(0xbf7c75b1, 0x35edc89c), TOBN(0x3dd8752e, 0x530df568), TOBN(0xf85c4a76, 0xe308c682)}, {TOBN(0x4c9955b2, 0xe68acf37), TOBN(0xa544df3d, 0xab32af85), TOBN(0x4b8ec3f5, 0xa25cf493), TOBN(0x4d8f2764, 0x1a622feb)}}, {{TOBN(0x7bb4f7aa, 0xf0dcbc49), TOBN(0x7de551f9, 0x70bbb45b), TOBN(0xcfd0f3e4, 0x9f2ca2e5), TOBN(0xece58709, 0x1f5c76ef)}, {TOBN(0x32920edd, 0x167d79ae), TOBN(0x039df8a2, 0xfa7d7ec1), TOBN(0xf46206c0, 0xbb30af91), TOBN(0x1ff5e2f5, 0x22676b59)}}, {{TOBN(0x11f4a039, 0x6ea51d66), TOBN(0x506c1445, 0x807d7a26), TOBN(0x60da5705, 0x755a9b24), TOBN(0x8fc8cc32, 0x1f1a319e)}, {TOBN(0x83642d4d, 0x9433d67d), TOBN(0x7fa5cb8f, 0x6a7dd296), TOBN(0x576591db, 0x9b7bde07), TOBN(0x13173d25, 0x419716fb)}}, {{TOBN(0xea30599d, 0xd5b340ff), TOBN(0xfc6b5297, 0xb0fe76c5), TOBN(0x1c6968c8, 0xab8f5adc), TOBN(0xf723c7f5, 0x901c928d)}, {TOBN(0x4203c321, 0x9773d402), TOBN(0xdf7c6aa3, 0x1b51dd47), TOBN(0x3d49e37a, 0x552be23c), TOBN(0x57febee8, 0x0b5a6e87)}}, {{TOBN(0xc5ecbee4, 0x7bd8e739), TOBN(0x79d44994, 0xae63bf75), TOBN(0x168bd00f, 0x38fb8923), TOBN(0x75d48ee4, 0xd0533130)}, {TOBN(0x554f77aa, 0xdb5cdf33), TOBN(0x3396e896, 0x3c696769), TOBN(0x2fdddbf2, 0xd3fd674e), TOBN(0xbbb8f6ee, 0x99d0e3e5)}}, {{TOBN(0x51b90651, 0xcbae2f70), TOBN(0xefc4bc05, 0x93aaa8eb), TOBN(0x8ecd8689, 0xdd1df499), TOBN(0x1aee99a8, 0x22f367a5)}, {TOBN(0x95d485b9, 0xae8274c5), TOBN(0x6c14d445, 0x7d30b39c), TOBN(0xbafea90b, 0xbcc1ef81), TOBN(0x7c5f317a, 0xa459a2ed)}}, {{TOBN(0x01211075, 0x4ef44227), TOBN(0xa17bed6e, 0xdc20f496), TOBN(0x0cdfe424, 0x819853cd), TOBN(0x13793298, 0xf71e2ce7)}, {TOBN(0x3c1f3078, 0xdbbe307b), TOBN(0x6dd1c20e, 0x76ee9936), TOBN(0x23ee4b57, 0x423caa20), TOBN(0x4ac3793b, 0x8efb840e)}}, {{TOBN(0x934438eb, 0xed1f8ca0), TOBN(0x3e546658, 0x4ebb25a2), TOBN(0xc415af0e, 0xc069896f), TOBN(0xc13eddb0, 0x9a5aa43d)}, {TOBN(0x7a04204f, 0xd49eb8f6), TOBN(0xd0d5bdfc, 0xd74f1670), TOBN(0x3697e286, 0x56fc0558), TOBN(0x10207371, 0x01cebade)}}, {{TOBN(0x5f87e690, 0x0647a82b), TOBN(0x908e0ed4, 0x8f40054f), TOBN(0xa9f633d4, 0x79853803), TOBN(0x8ed13c9a, 0x4a28b252)}, {TOBN(0x3e2ef676, 0x1f460f64), TOBN(0x53930b9b, 0x36d06336), TOBN(0x347073ac, 0x8fc4979b), TOBN(0x84380e0e, 0x5ecd5597)}}, {{TOBN(0xe3b22c6b, 0xc4fe3c39), TOBN(0xba4a8153, 0x6c7bebdf), TOBN(0xf23ab6b7, 
0x25693459), TOBN(0x53bc3770, 0x14922b11)}, {TOBN(0x4645c8ab, 0x5afc60db), TOBN(0xaa022355, 0x20b9f2a3), TOBN(0x52a2954c, 0xce0fc507), TOBN(0x8c2731bb, 0x7ce1c2e7)}}, {{TOBN(0xf39608ab, 0x18a0339d), TOBN(0xac7a658d, 0x3735436c), TOBN(0xb22c2b07, 0xcd992b4f), TOBN(0x4e83daec, 0xf40dcfd4)}, {TOBN(0x8a34c7be, 0x2f39ea3e), TOBN(0xef0c005f, 0xb0a56d2e), TOBN(0x62731f6a, 0x6edd8038), TOBN(0x5721d740, 0x4e3cb075)}}, {{TOBN(0x1ea41511, 0xfbeeee1b), TOBN(0xd1ef5e73, 0xef1d0c05), TOBN(0x42feefd1, 0x73c07d35), TOBN(0xe530a00a, 0x8a329493)}, {TOBN(0x5d55b7fe, 0xf15ebfb0), TOBN(0x549de03c, 0xd322491a), TOBN(0xf7b5f602, 0x745b3237), TOBN(0x3632a3a2, 0x1ab6e2b6)}}, {{TOBN(0x0d3bba89, 0x0ef59f78), TOBN(0x0dfc6443, 0xc9e52b9a), TOBN(0x1dc79699, 0x72631447), TOBN(0xef033917, 0xb3be20b1)}, {TOBN(0x0c92735d, 0xb1383948), TOBN(0xc1fc29a2, 0xc0dd7d7d), TOBN(0x6485b697, 0x403ed068), TOBN(0x13bfaab3, 0xaac93bdc)}}, {{TOBN(0x410dc6a9, 0x0deeaf52), TOBN(0xb003fb02, 0x4c641c15), TOBN(0x1384978c, 0x5bc504c4), TOBN(0x37640487, 0x864a6a77)}, {TOBN(0x05991bc6, 0x222a77da), TOBN(0x62260a57, 0x5e47eb11), TOBN(0xc7af6613, 0xf21b432c), TOBN(0x22f3acc9, 0xab4953e9)}}, {{TOBN(0x52934922, 0x8e41d155), TOBN(0x4d024568, 0x3ac059ef), TOBN(0xb0201755, 0x4d884411), TOBN(0xce8055cf, 0xa59a178f)}, {TOBN(0xcd77d1af, 0xf6204549), TOBN(0xa0a00a3e, 0xc7066759), TOBN(0x471071ef, 0x0272c229), TOBN(0x009bcf6b, 0xd3c4b6b0)}}, {{TOBN(0x2a2638a8, 0x22305177), TOBN(0xd51d59df, 0x41645bbf), TOBN(0xa81142fd, 0xc0a7a3c0), TOBN(0xa17eca6d, 0x4c7063ee)}, {TOBN(0x0bb887ed, 0x60d9dcec), TOBN(0xd6d28e51, 0x20ad2455), TOBN(0xebed6308, 0xa67102ba), TOBN(0x042c3114, 0x8bffa408)}}, {{TOBN(0xfd099ac5, 0x8aa68e30), TOBN(0x7a6a3d7c, 0x1483513e), TOBN(0xffcc6b75, 0xba2d8f0c), TOBN(0x54dacf96, 0x1e78b954)}, {TOBN(0xf645696f, 0xa4a9af89), TOBN(0x3a411940, 0x06ac98ec), TOBN(0x41b8b3f6, 0x22a67a20), TOBN(0x2d0b1e0f, 0x99dec626)}}, {{TOBN(0x27c89192, 0x40be34e8), TOBN(0xc7162b37, 0x91907f35), TOBN(0x90188ec1, 0xa956702b), TOBN(0xca132f7d, 0xdf93769c)}, {TOBN(0x3ece44f9, 0x0e2025b4), TOBN(0x67aaec69, 0x0c62f14c), TOBN(0xad741418, 0x22e3cc11), TOBN(0xcf9b75c3, 0x7ff9a50e)}}, {{TOBN(0x02fa2b16, 0x4d348272), TOBN(0xbd99d61a, 0x9959d56d), TOBN(0xbc4f19db, 0x18762916), TOBN(0xcc7cce50, 0x49c1ac80)}, {TOBN(0x4d59ebaa, 0xd846bd83), TOBN(0x8775a9dc, 0xa9202849), TOBN(0x07ec4ae1, 0x6e1f4ca9), TOBN(0x27eb5875, 0xba893f11)}}, {{TOBN(0x00284d51, 0x662cc565), TOBN(0x82353a6b, 0x0db4138d), TOBN(0xd9c7aaaa, 0xaa32a594), TOBN(0xf5528b5e, 0xa5669c47)}, {TOBN(0xf3220231, 0x2f23c5ff), TOBN(0xe3e8147a, 0x6affa3a1), TOBN(0xfb423d5c, 0x202ddda0), TOBN(0x3d6414ac, 0x6b871bd4)}}, {{TOBN(0x586f82e1, 0xa51a168a), TOBN(0xb712c671, 0x48ae5448), TOBN(0x9a2e4bd1, 0x76233eb8), TOBN(0x0188223a, 0x78811ca9)}, {TOBN(0x553c5e21, 0xf7c18de1), TOBN(0x7682e451, 0xb27bb286), TOBN(0x3ed036b3, 0x0e51e929), TOBN(0xf487211b, 0xec9cb34f)}}, {{TOBN(0x0d094277, 0x0c24efc8), TOBN(0x0349fd04, 0xbef737a4), TOBN(0x6d1c9dd2, 0x514cdd28), TOBN(0x29c135ff, 0x30da9521)}, {TOBN(0xea6e4508, 0xf78b0b6f), TOBN(0x176f5dd2, 0x678c143c), TOBN(0x08148418, 0x4be21e65), TOBN(0x27f7525c, 0xe7df38c4)}}, {{TOBN(0x1fb70e09, 0x748ab1a4), TOBN(0x9cba50a0, 0x5efe4433), TOBN(0x7846c7a6, 0x15f75af2), TOBN(0x2a7c2c57, 0x5ee73ea8)}, {TOBN(0x42e566a4, 0x3f0a449a), TOBN(0x45474c3b, 0xad90fc3d), TOBN(0x7447be3d, 0x8b61d057), TOBN(0x3e9d1cf1, 0x3a4ec092)}}, {{TOBN(0x1603e453, 0xf380a6e6), TOBN(0x0b86e431, 0x9b1437c2), TOBN(0x7a4173f2, 0xef29610a), TOBN(0x8fa729a7, 0xf03d57f7)}, {TOBN(0x3e186f6e, 0x6c9c217e), TOBN(0xbe1d3079, 0x91919524), 
TOBN(0x92a62a70, 0x153d4fb1), TOBN(0x32ed3e34, 0xd68c2f71)}}, {{TOBN(0xd785027f, 0x9eb1a8b7), TOBN(0xbc37eb77, 0xc5b22fe8), TOBN(0x466b34f0, 0xb9d6a191), TOBN(0x008a89af, 0x9a05f816)}, {TOBN(0x19b028fb, 0x7d42c10a), TOBN(0x7fe8c92f, 0x49b3f6b8), TOBN(0x58907cc0, 0xa5a0ade3), TOBN(0xb3154f51, 0x559d1a7c)}}, {{TOBN(0x5066efb6, 0xd9790ed6), TOBN(0xa77a0cbc, 0xa6aa793b), TOBN(0x1a915f3c, 0x223e042e), TOBN(0x1c5def04, 0x69c5874b)}, {TOBN(0x0e830078, 0x73b6c1da), TOBN(0x55cf85d2, 0xfcd8557a), TOBN(0x0f7c7c76, 0x0460f3b1), TOBN(0x87052acb, 0x46e58063)}}, {{TOBN(0x09212b80, 0x907eae66), TOBN(0x3cb068e0, 0x4d721c89), TOBN(0xa87941ae, 0xdd45ac1c), TOBN(0xde8d5c0d, 0x0daa0dbb)}, {TOBN(0xda421fdc, 0xe3502e6e), TOBN(0xc8944201, 0x4d89a084), TOBN(0x7307ba5e, 0xf0c24bfb), TOBN(0xda212beb, 0x20bde0ef)}}, {{TOBN(0xea2da24b, 0xf82ce682), TOBN(0x058d3816, 0x07f71fe4), TOBN(0x35a02462, 0x5ffad8de), TOBN(0xcd7b05dc, 0xaadcefab)}, {TOBN(0xd442f8ed, 0x1d9f54ec), TOBN(0x8be3d618, 0xb2d3b5ca), TOBN(0xe2220ed0, 0xe06b2ce2), TOBN(0x82699a5f, 0x1b0da4c0)}}, {{TOBN(0x3ff106f5, 0x71c0c3a7), TOBN(0x8f580f5a, 0x0d34180c), TOBN(0x4ebb120e, 0x22d7d375), TOBN(0x5e5782cc, 0xe9513675)}, {TOBN(0x2275580c, 0x99c82a70), TOBN(0xe8359fbf, 0x15ea8c4c), TOBN(0x53b48db8, 0x7b415e70), TOBN(0xaacf2240, 0x100c6014)}}, {{TOBN(0x9faaccf5, 0xe4652f1d), TOBN(0xbd6fdd2a, 0xd56157b2), TOBN(0xa4f4fb1f, 0x6261ec50), TOBN(0x244e55ad, 0x476bcd52)}, {TOBN(0x881c9305, 0x047d320b), TOBN(0x1ca983d5, 0x6181263f), TOBN(0x354e9a44, 0x278fb8ee), TOBN(0xad2dbc0f, 0x396e4964)}}, {{TOBN(0x723f3aa2, 0x9268b3de), TOBN(0x0d1ca29a, 0xe6e0609a), TOBN(0x794866aa, 0x6cf44252), TOBN(0x0b59f3e3, 0x01af87ed)}, {TOBN(0xe234e5ff, 0x7f4a6c51), TOBN(0xa8768fd2, 0x61dc2f7e), TOBN(0xdafc7332, 0x0a94d81f), TOBN(0xd7f84282, 0x06938ce1)}}, {{TOBN(0xae0b3c0e, 0x0546063e), TOBN(0x7fbadcb2, 0x5d61abc6), TOBN(0xd5d7a2c9, 0x369ac400), TOBN(0xa5978d09, 0xae67d10c)}, {TOBN(0x290f211e, 0x4f85eaac), TOBN(0xe61e2ad1, 0xfacac681), TOBN(0xae125225, 0x388384cd), TOBN(0xa7fb68e9, 0xccfde30f)}}, {{TOBN(0x7a59b936, 0x3daed4c2), TOBN(0x80a9aa40, 0x2606f789), TOBN(0xb40c1ea5, 0xf6a6d90a), TOBN(0x948364d3, 0x514d5885)}, {TOBN(0x062ebc60, 0x70985182), TOBN(0xa6db5b0e, 0x33310895), TOBN(0x64a12175, 0xe329c2f5), TOBN(0xc5f25bd2, 0x90ea237e)}}, {{TOBN(0x7915c524, 0x2d0a4c23), TOBN(0xeb5d26e4, 0x6bb3cc52), TOBN(0x369a9116, 0xc09e2c92), TOBN(0x0c527f92, 0xcf182cf8)}, {TOBN(0x9e591938, 0x2aede0ac), TOBN(0xb2922208, 0x6cc34939), TOBN(0x3c9d8962, 0x99a34361), TOBN(0x3c81836d, 0xc1905fe6)}}, {{TOBN(0x4bfeb57f, 0xa001ec5a), TOBN(0xe993f5bb, 0xa0dc5dba), TOBN(0x47884109, 0x724a1380), TOBN(0x8a0369ab, 0x32fe9a04)}, {TOBN(0xea068d60, 0x8c927db8), TOBN(0xbf5f37cf, 0x94655741), TOBN(0x47d402a2, 0x04b6c7ea), TOBN(0x4551c295, 0x6af259cb)}}, {{TOBN(0x698b71e7, 0xed77ee8b), TOBN(0xbddf7bd0, 0xf309d5c7), TOBN(0x6201c22c, 0x34e780ca), TOBN(0xab04f7d8, 0x4c295ef4)}, {TOBN(0x1c947294, 0x4313a8ce), TOBN(0xe532e4ac, 0x92ca4cfe), TOBN(0x89738f80, 0xd0a7a97a), TOBN(0xec088c88, 0xa580fd5b)}}, {{TOBN(0x612b1ecc, 0x42ce9e51), TOBN(0x8f9840fd, 0xb25fdd2a), TOBN(0x3cda78c0, 0x01e7f839), TOBN(0x546b3d3a, 0xece05480)}, {TOBN(0x271719a9, 0x80d30916), TOBN(0x45497107, 0x584c20c4), TOBN(0xaf8f9478, 0x5bc78608), TOBN(0x28c7d484, 0x277e2a4c)}}, {{TOBN(0xfce01767, 0x88a2ffe4), TOBN(0xdc506a35, 0x28e169a5), TOBN(0x0ea10861, 0x7af9c93a), TOBN(0x1ed24361, 0x03fa0e08)}, {TOBN(0x96eaaa92, 0xa3d694e7), TOBN(0xc0f43b4d, 0xef50bc74), TOBN(0xce6aa58c, 0x64114db4), TOBN(0x8218e8ea, 0x7c000fd4)}}, {{TOBN(0xac815dfb, 0x185f8844), TOBN(0xcd7e90cb, 
0x1557abfb), TOBN(0x23d16655, 0xafbfecdf), TOBN(0x80f3271f, 0x085cac4a)}, {TOBN(0x7fc39aa7, 0xd0e62f47), TOBN(0x88d519d1, 0x460a48e5), TOBN(0x59559ac4, 0xd28f101e), TOBN(0x7981d9e9, 0xca9ae816)}}, {{TOBN(0x5c38652c, 0x9ac38203), TOBN(0x86eaf87f, 0x57657fe5), TOBN(0x568fc472, 0xe21f5416), TOBN(0x2afff39c, 0xe7e597b5)}, {TOBN(0x3adbbb07, 0x256d4eab), TOBN(0x22598692, 0x8285ab89), TOBN(0x35f8112a, 0x041caefe), TOBN(0x95df02e3, 0xa5064c8b)}}, {{TOBN(0x4d63356e, 0xc7004bf3), TOBN(0x230a08f4, 0xdb83c7de), TOBN(0xca27b270, 0x8709a7b7), TOBN(0x0d1c4cc4, 0xcb9abd2d)}, {TOBN(0x8a0bc66e, 0x7550fee8), TOBN(0x369cd4c7, 0x9cf7247e), TOBN(0x75562e84, 0x92b5b7e7), TOBN(0x8fed0da0, 0x5802af7b)}}, {{TOBN(0x6a7091c2, 0xe48fb889), TOBN(0x26882c13, 0x7b8a9d06), TOBN(0xa2498663, 0x1b82a0e2), TOBN(0x844ed736, 0x3518152d)}, {TOBN(0x282f476f, 0xd86e27c7), TOBN(0xa04edaca, 0x04afefdc), TOBN(0x8b256ebc, 0x6119e34d), TOBN(0x56a413e9, 0x0787d78b)}}}, {{{TOBN(0x82ee061d, 0x5a74be50), TOBN(0xe41781c4, 0xdea16ff5), TOBN(0xe0b0c81e, 0x99bfc8a2), TOBN(0x624f4d69, 0x0b547e2d)}, {TOBN(0x3a83545d, 0xbdcc9ae4), TOBN(0x2573dbb6, 0x409b1e8e), TOBN(0x482960c4, 0xa6c93539), TOBN(0xf01059ad, 0x5ae18798)}}, {{TOBN(0x715c9f97, 0x3112795f), TOBN(0xe8244437, 0x984e6ee1), TOBN(0x55cb4858, 0xecb66bcd), TOBN(0x7c136735, 0xabaffbee)}, {TOBN(0x54661595, 0x5dbec38e), TOBN(0x51c0782c, 0x388ad153), TOBN(0x9ba4c53a, 0xc6e0952f), TOBN(0x27e6782a, 0x1b21dfa8)}}, {{TOBN(0x682f903d, 0x4ed2dbc2), TOBN(0x0eba59c8, 0x7c3b2d83), TOBN(0x8e9dc84d, 0x9c7e9335), TOBN(0x5f9b21b0, 0x0eb226d7)}, {TOBN(0xe33bd394, 0xaf267bae), TOBN(0xaa86cc25, 0xbe2e15ae), TOBN(0x4f0bf67d, 0x6a8ec500), TOBN(0x5846aa44, 0xf9630658)}}, {{TOBN(0xfeb09740, 0xe2c2bf15), TOBN(0x627a2205, 0xa9e99704), TOBN(0xec8d73d0, 0xc2fbc565), TOBN(0x223eed8f, 0xc20c8de8)}, {TOBN(0x1ee32583, 0xa8363b49), TOBN(0x1a0b6cb9, 0xc9c2b0a6), TOBN(0x49f7c3d2, 0x90dbc85c), TOBN(0xa8dfbb97, 0x1ef4c1ac)}}, {{TOBN(0xafb34d4c, 0x65c7c2ab), TOBN(0x1d4610e7, 0xe2c5ea84), TOBN(0x893f6d1b, 0x973c4ab5), TOBN(0xa3cdd7e9, 0x945ba5c4)}, {TOBN(0x60514983, 0x064417ee), TOBN(0x1459b23c, 0xad6bdf2b), TOBN(0x23b2c341, 0x5cf726c3), TOBN(0x3a829635, 0x32d6354a)}}, {{TOBN(0x294f901f, 0xab192c18), TOBN(0xec5fcbfe, 0x7030164f), TOBN(0xe2e2fcb7, 0xe2246ba6), TOBN(0x1e7c88b3, 0x221a1a0c)}, {TOBN(0x72c7dd93, 0xc92d88c5), TOBN(0x41c2148e, 0x1106fb59), TOBN(0x547dd4f5, 0xa0f60f14), TOBN(0xed9b52b2, 0x63960f31)}}, {{TOBN(0x6c8349eb, 0xb0a5b358), TOBN(0xb154c5c2, 0x9e7e2ed6), TOBN(0xcad5eccf, 0xeda462db), TOBN(0xf2d6dbe4, 0x2de66b69)}, {TOBN(0x426aedf3, 0x8665e5b2), TOBN(0x488a8513, 0x7b7f5723), TOBN(0x15cc43b3, 0x8bcbb386), TOBN(0x27ad0af3, 0xd791d879)}}, {{TOBN(0xc16c236e, 0x846e364f), TOBN(0x7f33527c, 0xdea50ca0), TOBN(0xc4810775, 0x0926b86d), TOBN(0x6c2a3609, 0x0598e70c)}, {TOBN(0xa6755e52, 0xf024e924), TOBN(0xe0fa07a4, 0x9db4afca), TOBN(0x15c3ce7d, 0x66831790), TOBN(0x5b4ef350, 0xa6cbb0d6)}}, {{TOBN(0x2c4aafc4, 0xb6205969), TOBN(0x42563f02, 0xf6c7854f), TOBN(0x016aced5, 0x1d983b48), TOBN(0xfeb356d8, 0x99949755)}, {TOBN(0x8c2a2c81, 0xd1a39bd7), TOBN(0x8f44340f, 0xe6934ae9), TOBN(0x148cf91c, 0x447904da), TOBN(0x7340185f, 0x0f51a926)}}, {{TOBN(0x2f8f00fb, 0x7409ab46), TOBN(0x057e78e6, 0x80e289b2), TOBN(0x03e5022c, 0xa888e5d1), TOBN(0x3c87111a, 0x9dede4e2)}, {TOBN(0x5b9b0e1c, 0x7809460b), TOBN(0xe751c852, 0x71c9abc7), TOBN(0x8b944e28, 0xc7cc1dc9), TOBN(0x4f201ffa, 0x1d3cfa08)}}, {{TOBN(0x02fc905c, 0x3e6721ce), TOBN(0xd52d70da, 0xd0b3674c), TOBN(0x5dc2e5ca, 0x18810da4), TOBN(0xa984b273, 0x5c69dd99)}, {TOBN(0x63b92527, 0x84de5ca4), 
TOBN(0x2f1c9872, 0xc852dec4), TOBN(0x18b03593, 0xc2e3de09), TOBN(0x19d70b01, 0x9813dc2f)}}, {{TOBN(0x42806b2d, 0xa6dc1d29), TOBN(0xd3030009, 0xf871e144), TOBN(0xa1feb333, 0xaaf49276), TOBN(0xb5583b9e, 0xc70bc04b)}, {TOBN(0x1db0be78, 0x95695f20), TOBN(0xfc841811, 0x89d012b5), TOBN(0x6409f272, 0x05f61643), TOBN(0x40d34174, 0xd5883128)}}, {{TOBN(0xd79196f5, 0x67419833), TOBN(0x6059e252, 0x863b7b08), TOBN(0x84da1817, 0x1c56700c), TOBN(0x5758ee56, 0xb28d3ec4)}, {TOBN(0x7da2771d, 0x013b0ea6), TOBN(0xfddf524b, 0x54c5e9b9), TOBN(0x7df4faf8, 0x24305d80), TOBN(0x58f5c1bf, 0x3a97763f)}}, {{TOBN(0xa5af37f1, 0x7c696042), TOBN(0xd4cba22c, 0x4a2538de), TOBN(0x211cb995, 0x9ea42600), TOBN(0xcd105f41, 0x7b069889)}, {TOBN(0xb1e1cf19, 0xddb81e74), TOBN(0x472f2d89, 0x5157b8ca), TOBN(0x086fb008, 0xee9db885), TOBN(0x365cd570, 0x0f26d131)}}, {{TOBN(0x284b02bb, 0xa2be7053), TOBN(0xdcbbf7c6, 0x7ab9a6d6), TOBN(0x4425559c, 0x20f7a530), TOBN(0x961f2dfa, 0x188767c8)}, {TOBN(0xe2fd9435, 0x70dc80c4), TOBN(0x104d6b63, 0xf0784120), TOBN(0x7f592bc1, 0x53567122), TOBN(0xf6bc1246, 0xf688ad77)}}, {{TOBN(0x05214c05, 0x0f15dde9), TOBN(0xa47a76a8, 0x0d5f2b82), TOBN(0xbb254d30, 0x62e82b62), TOBN(0x11a05fe0, 0x3ec955ee)}, {TOBN(0x7eaff46e, 0x9d529b36), TOBN(0x55ab1301, 0x8f9e3df6), TOBN(0xc463e371, 0x99317698), TOBN(0xfd251438, 0xccda47ad)}}, {{TOBN(0xca9c3547, 0x23d695ea), TOBN(0x48ce626e, 0x16e589b5), TOBN(0x6b5b64c7, 0xb187d086), TOBN(0xd02e1794, 0xb2207948)}, {TOBN(0x8b58e98f, 0x7198111d), TOBN(0x90ca6305, 0xdcf9c3cc), TOBN(0x5691fe72, 0xf34089b0), TOBN(0x60941af1, 0xfc7c80ff)}}, {{TOBN(0xa09bc0a2, 0x22eb51e5), TOBN(0xc0bb7244, 0xaa9cf09a), TOBN(0x36a8077f, 0x80159f06), TOBN(0x8b5c989e, 0xdddc560e)}, {TOBN(0x19d2f316, 0x512e1f43), TOBN(0x02eac554, 0xad08ff62), TOBN(0x012ab84c, 0x07d20b4e), TOBN(0x37d1e115, 0xd6d4e4e1)}}, {{TOBN(0xb6443e1a, 0xab7b19a8), TOBN(0xf08d067e, 0xdef8cd45), TOBN(0x63adf3e9, 0x685e03da), TOBN(0xcf15a10e, 0x4792b916)}, {TOBN(0xf44bcce5, 0xb738a425), TOBN(0xebe131d5, 0x9636b2fd), TOBN(0x94068841, 0x7850d605), TOBN(0x09684eaa, 0xb40d749d)}}, {{TOBN(0x8c3c669c, 0x72ba075b), TOBN(0x89f78b55, 0xba469015), TOBN(0x5706aade, 0x3e9f8ba8), TOBN(0x6d8bd565, 0xb32d7ed7)}, {TOBN(0x25f4e63b, 0x805f08d6), TOBN(0x7f48200d, 0xc3bcc1b5), TOBN(0x4e801968, 0xb025d847), TOBN(0x74afac04, 0x87cbe0a8)}}, {{TOBN(0x43ed2c2b, 0x7e63d690), TOBN(0xefb6bbf0, 0x0223cdb8), TOBN(0x4fec3cae, 0x2884d3fe), TOBN(0x065ecce6, 0xd75e25a4)}, {TOBN(0x6c2294ce, 0x69f79071), TOBN(0x0d9a8e5f, 0x044b8666), TOBN(0x5009f238, 0x17b69d8f), TOBN(0x3c29f8fe, 0xc5dfdaf7)}}, {{TOBN(0x9067528f, 0xebae68c4), TOBN(0x5b385632, 0x30c5ba21), TOBN(0x540df119, 0x1fdd1aec), TOBN(0xcf37825b, 0xcfba4c78)}, {TOBN(0x77eff980, 0xbeb11454), TOBN(0x40a1a991, 0x60c1b066), TOBN(0xe8018980, 0xf889a1c7), TOBN(0xb9c52ae9, 0x76c24be0)}}, {{TOBN(0x05fbbcce, 0x45650ef4), TOBN(0xae000f10, 0x8aa29ac7), TOBN(0x884b7172, 0x4f04c470), TOBN(0x7cd4fde2, 0x19bb5c25)}, {TOBN(0x6477b22a, 0xe8840869), TOBN(0xa8868859, 0x5fbd0686), TOBN(0xf23cc02e, 0x1116dfba), TOBN(0x76cd563f, 0xd87d7776)}}, {{TOBN(0xe2a37598, 0xa9d82abf), TOBN(0x5f188ccb, 0xe6c170f5), TOBN(0x81682200, 0x5066b087), TOBN(0xda22c212, 0xc7155ada)}, {TOBN(0x151e5d3a, 0xfbddb479), TOBN(0x4b606b84, 0x6d715b99), TOBN(0x4a73b54b, 0xf997cb2e), TOBN(0x9a1bfe43, 0x3ecd8b66)}}, {{TOBN(0x1c312809, 0x2a67d48a), TOBN(0xcd6a671e, 0x031fa9e2), TOBN(0xbec3312a, 0x0e43a34a), TOBN(0x1d935639, 0x55ef47d3)}, {TOBN(0x5ea02489, 0x8fea73ea), TOBN(0x8247b364, 0xa035afb2), TOBN(0xb58300a6, 0x5265b54c), TOBN(0x3286662f, 0x722c7148)}}, {{TOBN(0xb77fd76b, 
0xb4ec4c20), TOBN(0xf0a12fa7, 0x0f3fe3fd), TOBN(0xf845bbf5, 0x41d8c7e8), TOBN(0xe4d969ca, 0x5ec10aa8)}, {TOBN(0x4c0053b7, 0x43e232a3), TOBN(0xdc7a3fac, 0x37f8a45a), TOBN(0x3c4261c5, 0x20d81c8f), TOBN(0xfd4b3453, 0xb00eab00)}}, {{TOBN(0x76d48f86, 0xd36e3062), TOBN(0x626c5277, 0xa143ff02), TOBN(0x538174de, 0xaf76f42e), TOBN(0x2267aa86, 0x6407ceac)}, {TOBN(0xfad76351, 0x72e572d5), TOBN(0xab861af7, 0xba7330eb), TOBN(0xa0a1c8c7, 0x418d8657), TOBN(0x988821cb, 0x20289a52)}}, {{TOBN(0x79732522, 0xcccc18ad), TOBN(0xaadf3f8d, 0xf1a6e027), TOBN(0xf7382c93, 0x17c2354d), TOBN(0x5ce1680c, 0xd818b689)}, {TOBN(0x359ebbfc, 0xd9ecbee9), TOBN(0x4330689c, 0x1cae62ac), TOBN(0xb55ce5b4, 0xc51ac38a), TOBN(0x7921dfea, 0xfe238ee8)}}, {{TOBN(0x3972bef8, 0x271d1ca5), TOBN(0x3e423bc7, 0xe8aabd18), TOBN(0x57b09f3f, 0x44a3e5e3), TOBN(0x5da886ae, 0x7b444d66)}, {TOBN(0x68206634, 0xa9964375), TOBN(0x356a2fa3, 0x699cd0ff), TOBN(0xaf0faa24, 0xdba515e9), TOBN(0x536e1f5c, 0xb321d79a)}}, {{TOBN(0xd3b9913a, 0x5c04e4ea), TOBN(0xd549dcfe, 0xd6f11513), TOBN(0xee227bf5, 0x79fd1d94), TOBN(0x9f35afee, 0xb43f2c67)}, {TOBN(0xd2638d24, 0xf1314f53), TOBN(0x62baf948, 0xcabcd822), TOBN(0x5542de29, 0x4ef48db0), TOBN(0xb3eb6a04, 0xfc5f6bb2)}}, {{TOBN(0x23c110ae, 0x1208e16a), TOBN(0x1a4d15b5, 0xf8363e24), TOBN(0x30716844, 0x164be00b), TOBN(0xa8e24824, 0xf6f4690d)}, {TOBN(0x548773a2, 0x90b170cf), TOBN(0xa1bef331, 0x42f191f4), TOBN(0x70f418d0, 0x9247aa97), TOBN(0xea06028e, 0x48be9147)}}, {{TOBN(0xe13122f3, 0xdbfb894e), TOBN(0xbe9b79f6, 0xce274b18), TOBN(0x85a49de5, 0xca58aadf), TOBN(0x24957758, 0x11487351)}, {TOBN(0x111def61, 0xbb939099), TOBN(0x1d6a974a, 0x26d13694), TOBN(0x4474b4ce, 0xd3fc253b), TOBN(0x3a1485e6, 0x4c5db15e)}}, {{TOBN(0xe79667b4, 0x147c15b4), TOBN(0xe34f553b, 0x7bc61301), TOBN(0x032b80f8, 0x17094381), TOBN(0x55d8bafd, 0x723eaa21)}, {TOBN(0x5a987995, 0xf1c0e74e), TOBN(0x5a9b292e, 0xebba289c), TOBN(0x413cd4b2, 0xeb4c8251), TOBN(0x98b5d243, 0xd162db0a)}}, {{TOBN(0xbb47bf66, 0x68342520), TOBN(0x08d68949, 0xbaa862d1), TOBN(0x11f349c7, 0xe906abcd), TOBN(0x454ce985, 0xed7bf00e)}, {TOBN(0xacab5c9e, 0xb55b803b), TOBN(0xb03468ea, 0x31e3c16d), TOBN(0x5c24213d, 0xd273bf12), TOBN(0x211538eb, 0x71587887)}}, {{TOBN(0x198e4a2f, 0x731dea2d), TOBN(0xd5856cf2, 0x74ed7b2a), TOBN(0x86a632eb, 0x13a664fe), TOBN(0x932cd909, 0xbda41291)}, {TOBN(0x850e95d4, 0xc0c4ddc0), TOBN(0xc0f422f8, 0x347fc2c9), TOBN(0xe68cbec4, 0x86076bcb), TOBN(0xf9e7c0c0, 0xcd6cd286)}}, {{TOBN(0x65994ddb, 0x0f5f27ca), TOBN(0xe85461fb, 0xa80d59ff), TOBN(0xff05481a, 0x66601023), TOBN(0xc665427a, 0xfc9ebbfb)}, {TOBN(0xb0571a69, 0x7587fd52), TOBN(0x935289f8, 0x8d49efce), TOBN(0x61becc60, 0xea420688), TOBN(0xb22639d9, 0x13a786af)}}, {{TOBN(0x1a8e6220, 0x361ecf90), TOBN(0x001f23e0, 0x25506463), TOBN(0xe4ae9b5d, 0x0a5c2b79), TOBN(0xebc9cdad, 0xd8149db5)}, {TOBN(0xb33164a1, 0x934aa728), TOBN(0x750eb00e, 0xae9b60f3), TOBN(0x5a91615b, 0x9b9cfbfd), TOBN(0x97015cbf, 0xef45f7f6)}}, {{TOBN(0xb462c4a5, 0xbf5151df), TOBN(0x21adcc41, 0xb07118f2), TOBN(0xd60c545b, 0x043fa42c), TOBN(0xfc21aa54, 0xe96be1ab)}, {TOBN(0xe84bc32f, 0x4e51ea80), TOBN(0x3dae45f0, 0x259b5d8d), TOBN(0xbb73c7eb, 0xc38f1b5e), TOBN(0xe405a74a, 0xe8ae617d)}}, {{TOBN(0xbb1ae9c6, 0x9f1c56bd), TOBN(0x8c176b98, 0x49f196a4), TOBN(0xc448f311, 0x6875092b), TOBN(0xb5afe3de, 0x9f976033)}, {TOBN(0xa8dafd49, 0x145813e5), TOBN(0x687fc4d9, 0xe2b34226), TOBN(0xf2dfc92d, 0x4c7ff57f), TOBN(0x004e3fc1, 0x401f1b46)}}, {{TOBN(0x5afddab6, 0x1430c9ab), TOBN(0x0bdd41d3, 0x2238e997), TOBN(0xf0947430, 0x418042ae), TOBN(0x71f9adda, 0xcdddc4cb)}, 
{TOBN(0x7090c016, 0xc52dd907), TOBN(0xd9bdf44d, 0x29e2047f), TOBN(0xe6f1fe80, 0x1b1011a6), TOBN(0xb63accbc, 0xd9acdc78)}}, {{TOBN(0xcfc7e235, 0x1272a95b), TOBN(0x0c667717, 0xa6276ac8), TOBN(0x3c0d3709, 0xe2d7eef7), TOBN(0x5add2b06, 0x9a685b3e)}, {TOBN(0x363ad32d, 0x14ea5d65), TOBN(0xf8e01f06, 0x8d7dd506), TOBN(0xc9ea2213, 0x75b4aac6), TOBN(0xed2a2bf9, 0x0d353466)}}, {{TOBN(0x439d79b5, 0xe9d3a7c3), TOBN(0x8e0ee5a6, 0x81b7f34b), TOBN(0xcf3dacf5, 0x1dc4ba75), TOBN(0x1d3d1773, 0xeb3310c7)}, {TOBN(0xa8e67112, 0x7747ae83), TOBN(0x31f43160, 0x197d6b40), TOBN(0x0521ccee, 0xcd961400), TOBN(0x67246f11, 0xf6535768)}}, {{TOBN(0x702fcc5a, 0xef0c3133), TOBN(0x247cc45d, 0x7e16693b), TOBN(0xfd484e49, 0xc729b749), TOBN(0x522cef7d, 0xb218320f)}, {TOBN(0xe56ef405, 0x59ab93b3), TOBN(0x225fba11, 0x9f181071), TOBN(0x33bd6595, 0x15330ed0), TOBN(0xc4be69d5, 0x1ddb32f7)}}, {{TOBN(0x264c7668, 0x0448087c), TOBN(0xac30903f, 0x71432dae), TOBN(0x3851b266, 0x00f9bf47), TOBN(0x400ed311, 0x6cdd6d03)}, {TOBN(0x045e79fe, 0xf8fd2424), TOBN(0xfdfd974a, 0xfa6da98b), TOBN(0x45c9f641, 0x0c1e673a), TOBN(0x76f2e733, 0x5b2c5168)}}, {{TOBN(0x1adaebb5, 0x2a601753), TOBN(0xb286514c, 0xc57c2d49), TOBN(0xd8769670, 0x1e0bfd24), TOBN(0x950c547e, 0x04478922)}, {TOBN(0xd1d41969, 0xe5d32bfe), TOBN(0x30bc1472, 0x750d6c3e), TOBN(0x8f3679fe, 0xe0e27f3a), TOBN(0x8f64a7dc, 0xa4a6ee0c)}}, {{TOBN(0x2fe59937, 0x633dfb1f), TOBN(0xea82c395, 0x977f2547), TOBN(0xcbdfdf1a, 0x661ea646), TOBN(0xc7ccc591, 0xb9085451)}, {TOBN(0x82177962, 0x81761e13), TOBN(0xda57596f, 0x9196885c), TOBN(0xbc17e849, 0x28ffbd70), TOBN(0x1e6e0a41, 0x2671d36f)}}, {{TOBN(0x61ae872c, 0x4152fcf5), TOBN(0x441c87b0, 0x9e77e754), TOBN(0xd0799dd5, 0xa34dff09), TOBN(0x766b4e44, 0x88a6b171)}, {TOBN(0xdc06a512, 0x11f1c792), TOBN(0xea02ae93, 0x4be35c3e), TOBN(0xe5ca4d6d, 0xe90c469e), TOBN(0x4df4368e, 0x56e4ff5c)}}, {{TOBN(0x7817acab, 0x4baef62e), TOBN(0x9f5a2202, 0xa85b91e8), TOBN(0x9666ebe6, 0x6ce57610), TOBN(0x32ad31f3, 0xf73bfe03)}, {TOBN(0x628330a4, 0x25bcf4d6), TOBN(0xea950593, 0x515056e6), TOBN(0x59811c89, 0xe1332156), TOBN(0xc89cf1fe, 0x8c11b2d7)}}, {{TOBN(0x75b63913, 0x04e60cc0), TOBN(0xce811e8d, 0x4625d375), TOBN(0x030e43fc, 0x2d26e562), TOBN(0xfbb30b4b, 0x608d36a0)}, {TOBN(0x634ff82c, 0x48528118), TOBN(0x7c6fe085, 0xcd285911), TOBN(0x7f2830c0, 0x99358f28), TOBN(0x2e60a95e, 0x665e6c09)}}, {{TOBN(0x08407d3d, 0x9b785dbf), TOBN(0x530889ab, 0xa759bce7), TOBN(0xf228e0e6, 0x52f61239), TOBN(0x2b6d1461, 0x6879be3c)}, {TOBN(0xe6902c04, 0x51a7bbf7), TOBN(0x30ad99f0, 0x76f24a64), TOBN(0x66d9317a, 0x98bc6da0), TOBN(0xf4f877f3, 0xcb596ac0)}}, {{TOBN(0xb05ff62d, 0x4c44f119), TOBN(0x4555f536, 0xe9b77416), TOBN(0xc7c0d059, 0x8caed63b), TOBN(0x0cd2b7ce, 0xc358b2a9)}, {TOBN(0x3f33287b, 0x46945fa3), TOBN(0xf8785b20, 0xd67c8791), TOBN(0xc54a7a61, 0x9637bd08), TOBN(0x54d4598c, 0x18be79d7)}}, {{TOBN(0x889e5acb, 0xc46d7ce1), TOBN(0x9a515bb7, 0x8b085877), TOBN(0xfac1a03d, 0x0b7a5050), TOBN(0x7d3e738a, 0xf2926035)}, {TOBN(0x861cc2ce, 0x2a6cb0eb), TOBN(0x6f2e2955, 0x8f7adc79), TOBN(0x61c4d451, 0x33016376), TOBN(0xd9fd2c80, 0x5ad59090)}}, {{TOBN(0xe5a83738, 0xb2b836a1), TOBN(0x855b41a0, 0x7c0d6622), TOBN(0x186fe317, 0x7cc19af1), TOBN(0x6465c1ff, 0xfdd99acb)}, {TOBN(0x46e5c23f, 0x6974b99e), TOBN(0x75a7cf8b, 0xa2717cbe), TOBN(0x4d2ebc3f, 0x062be658), TOBN(0x094b4447, 0x5f209c98)}}, {{TOBN(0x4af285ed, 0xb940cb5a), TOBN(0x6706d792, 0x7cc82f10), TOBN(0xc8c8776c, 0x030526fa), TOBN(0xfa8e6f76, 0xa0da9140)}, {TOBN(0x77ea9d34, 0x591ee4f0), TOBN(0x5f46e337, 0x40274166), TOBN(0x1bdf98bb, 0xea671457), TOBN(0xd7c08b46, 
0x862a1fe2)}}, {{TOBN(0x46cc303c, 0x1c08ad63), TOBN(0x99543440, 0x4c845e7b), TOBN(0x1b8fbdb5, 0x48f36bf7), TOBN(0x5b82c392, 0x8c8273a7)}, {TOBN(0x08f712c4, 0x928435d5), TOBN(0x071cf0f1, 0x79330380), TOBN(0xc74c2d24, 0xa8da054a), TOBN(0xcb0e7201, 0x43c46b5c)}}, {{TOBN(0x0ad7337a, 0xc0b7eff3), TOBN(0x8552225e, 0xc5e48b3c), TOBN(0xe6f78b0c, 0x73f13a5f), TOBN(0x5e70062e, 0x82349cbe)}, {TOBN(0x6b8d5048, 0xe7073969), TOBN(0x392d2a29, 0xc33cb3d2), TOBN(0xee4f727c, 0x4ecaa20f), TOBN(0xa068c99e, 0x2ccde707)}}, {{TOBN(0xfcd5651f, 0xb87a2913), TOBN(0xea3e3c15, 0x3cc252f0), TOBN(0x777d92df, 0x3b6cd3e4), TOBN(0x7a414143, 0xc5a732e7)}, {TOBN(0xa895951a, 0xa71ff493), TOBN(0xfe980c92, 0xbbd37cf6), TOBN(0x45bd5e64, 0xdecfeeff), TOBN(0x910dc2a9, 0xa44c43e9)}}, {{TOBN(0xcb403f26, 0xcca9f54d), TOBN(0x928bbdfb, 0x9303f6db), TOBN(0x3c37951e, 0xa9eee67c), TOBN(0x3bd61a52, 0xf79961c3)}, {TOBN(0x09a238e6, 0x395c9a79), TOBN(0x6940ca2d, 0x61eb352d), TOBN(0x7d1e5c5e, 0xc1875631), TOBN(0x1e19742c, 0x1e1b20d1)}}, {{TOBN(0x4633d908, 0x23fc2e6e), TOBN(0xa76e29a9, 0x08959149), TOBN(0x61069d9c, 0x84ed7da5), TOBN(0x0baa11cf, 0x5dbcad51)}, {TOBN(0xd01eec64, 0x961849da), TOBN(0x93b75f1f, 0xaf3d8c28), TOBN(0x57bc4f9f, 0x1ca2ee44), TOBN(0x5a26322d, 0x00e00558)}}, {{TOBN(0x1888d658, 0x61a023ef), TOBN(0x1d72aab4, 0xb9e5246e), TOBN(0xa9a26348, 0xe5563ec0), TOBN(0xa0971963, 0xc3439a43)}, {TOBN(0x567dd54b, 0xadb9b5b7), TOBN(0x73fac1a1, 0xc45a524b), TOBN(0x8fe97ef7, 0xfe38e608), TOBN(0x608748d2, 0x3f384f48)}}, {{TOBN(0xb0571794, 0xc486094f), TOBN(0x869254a3, 0x8bf3a8d6), TOBN(0x148a8dd1, 0x310b0e25), TOBN(0x99ab9f3f, 0x9aa3f7d8)}, {TOBN(0x0927c68a, 0x6706c02e), TOBN(0x22b5e76c, 0x69790e6c), TOBN(0x6c325260, 0x6c71376c), TOBN(0x53a57690, 0x09ef6657)}}, {{TOBN(0x8d63f852, 0xedffcf3a), TOBN(0xb4d2ed04, 0x3c0a6f55), TOBN(0xdb3aa8de, 0x12519b9e), TOBN(0x5d38e9c4, 0x1e0a569a)}, {TOBN(0x871528bf, 0x303747e2), TOBN(0xa208e77c, 0xf5b5c18d), TOBN(0x9d129c88, 0xca6bf923), TOBN(0xbcbf197f, 0xbf02839f)}}, {{TOBN(0x9b9bf030, 0x27323194), TOBN(0x3b055a8b, 0x339ca59d), TOBN(0xb46b2312, 0x0f669520), TOBN(0x19789f1f, 0x497e5f24)}, {TOBN(0x9c499468, 0xaaf01801), TOBN(0x72ee1190, 0x8b69d59c), TOBN(0x8bd39595, 0xacf4c079), TOBN(0x3ee11ece, 0x8e0cd048)}}, {{TOBN(0xebde86ec, 0x1ed66f18), TOBN(0x225d906b, 0xd61fce43), TOBN(0x5cab07d6, 0xe8bed74d), TOBN(0x16e4617f, 0x27855ab7)}, {TOBN(0x6568aadd, 0xb2fbc3dd), TOBN(0xedb5484f, 0x8aeddf5b), TOBN(0x878f20e8, 0x6dcf2fad), TOBN(0x3516497c, 0x615f5699)}}}, {{{TOBN(0xef0a3fec, 0xfa181e69), TOBN(0x9ea02f81, 0x30d69a98), TOBN(0xb2e9cf8e, 0x66eab95d), TOBN(0x520f2beb, 0x24720021)}, {TOBN(0x621c540a, 0x1df84361), TOBN(0x12037721, 0x71fa6d5d), TOBN(0x6e3c7b51, 0x0ff5f6ff), TOBN(0x817a069b, 0xabb2bef3)}}, {{TOBN(0x83572fb6, 0xb294cda6), TOBN(0x6ce9bf75, 0xb9039f34), TOBN(0x20e012f0, 0x095cbb21), TOBN(0xa0aecc1b, 0xd063f0da)}, {TOBN(0x57c21c3a, 0xf02909e5), TOBN(0xc7d59ecf, 0x48ce9cdc), TOBN(0x2732b844, 0x8ae336f8), TOBN(0x056e3723, 0x3f4f85f4)}}, {{TOBN(0x8a10b531, 0x89e800ca), TOBN(0x50fe0c17, 0x145208fd), TOBN(0x9e43c0d3, 0xb714ba37), TOBN(0x427d200e, 0x34189acc)}, {TOBN(0x05dee24f, 0xe616e2c0), TOBN(0x9c25f4c8, 0xee1854c1), TOBN(0x4d3222a5, 0x8f342a73), TOBN(0x0807804f, 0xa027c952)}}, {{TOBN(0xc222653a, 0x4f0d56f3), TOBN(0x961e4047, 0xca28b805), TOBN(0x2c03f8b0, 0x4a73434b), TOBN(0x4c966787, 0xab712a19)}, {TOBN(0xcc196c42, 0x864fee42), TOBN(0xc1be93da, 0x5b0ece5c), TOBN(0xa87d9f22, 0xc131c159), TOBN(0x2bb6d593, 0xdce45655)}}, {{TOBN(0x22c49ec9, 0xb809b7ce), TOBN(0x8a41486b, 0xe2c72c2c), TOBN(0x813b9420, 0xfea0bf36), 
TOBN(0xb3d36ee9, 0xa66dac69)}, {TOBN(0x6fddc08a, 0x328cc987), TOBN(0x0a3bcd2c, 0x3a326461), TOBN(0x7103c49d, 0xd810dbba), TOBN(0xf9d81a28, 0x4b78a4c4)}}, {{TOBN(0x3de865ad, 0xe4d55941), TOBN(0xdedafa5e, 0x30384087), TOBN(0x6f414abb, 0x4ef18b9b), TOBN(0x9ee9ea42, 0xfaee5268)}, {TOBN(0x260faa16, 0x37a55a4a), TOBN(0xeb19a514, 0x015f93b9), TOBN(0x51d7ebd2, 0x9e9c3598), TOBN(0x523fc56d, 0x1932178e)}}, {{TOBN(0x501d070c, 0xb98fe684), TOBN(0xd60fbe9a, 0x124a1458), TOBN(0xa45761c8, 0x92bc6b3f), TOBN(0xf5384858, 0xfe6f27cb)}, {TOBN(0x4b0271f7, 0xb59e763b), TOBN(0x3d4606a9, 0x5b5a8e5e), TOBN(0x1eda5d9b, 0x05a48292), TOBN(0xda7731d0, 0xe6fec446)}}, {{TOBN(0xa3e33693, 0x90d45871), TOBN(0xe9764040, 0x06166d8d), TOBN(0xb5c33682, 0x89a90403), TOBN(0x4bd17983, 0x72f1d637)}, {TOBN(0xa616679e, 0xd5d2c53a), TOBN(0x5ec4bcd8, 0xfdcf3b87), TOBN(0xae6d7613, 0xb66a694e), TOBN(0x7460fc76, 0xe3fc27e5)}}, {{TOBN(0x70469b82, 0x95caabee), TOBN(0xde024ca5, 0x889501e3), TOBN(0x6bdadc06, 0x076ed265), TOBN(0x0cb1236b, 0x5a0ef8b2)}, {TOBN(0x4065ddbf, 0x0972ebf9), TOBN(0xf1dd3875, 0x22aca432), TOBN(0xa88b97cf, 0x744aff76), TOBN(0xd1359afd, 0xfe8e3d24)}}, {{TOBN(0x52a3ba2b, 0x91502cf3), TOBN(0x2c3832a8, 0x084db75d), TOBN(0x04a12ddd, 0xde30b1c9), TOBN(0x7802eabc, 0xe31fd60c)}, {TOBN(0x33707327, 0xa37fddab), TOBN(0x65d6f2ab, 0xfaafa973), TOBN(0x3525c5b8, 0x11e6f91a), TOBN(0x76aeb0c9, 0x5f46530b)}}, {{TOBN(0xe8815ff6, 0x2f93a675), TOBN(0xa6ec9684, 0x05f48679), TOBN(0x6dcbb556, 0x358ae884), TOBN(0x0af61472, 0xe19e3873)}, {TOBN(0x72334372, 0xa5f696be), TOBN(0xc65e57ea, 0x6f22fb70), TOBN(0x268da30c, 0x946cea90), TOBN(0x136a8a87, 0x65681b2a)}}, {{TOBN(0xad5e81dc, 0x0f9f44d4), TOBN(0xf09a6960, 0x2c46585a), TOBN(0xd1649164, 0xc447d1b1), TOBN(0x3b4b36c8, 0x879dc8b1)}, {TOBN(0x20d4177b, 0x3b6b234c), TOBN(0x096a2505, 0x1730d9d0), TOBN(0x0611b9b8, 0xef80531d), TOBN(0xba904b3b, 0x64bb495d)}}, {{TOBN(0x1192d9d4, 0x93a3147a), TOBN(0x9f30a5dc, 0x9a565545), TOBN(0x90b1f9cb, 0x6ef07212), TOBN(0x29958546, 0x0d87fc13)}, {TOBN(0xd3323eff, 0xc17db9ba), TOBN(0xcb18548c, 0xcb1644a8), TOBN(0x18a306d4, 0x4f49ffbc), TOBN(0x28d658f1, 0x4c2e8684)}}, {{TOBN(0x44ba60cd, 0xa99f8c71), TOBN(0x67b7abdb, 0x4bf742ff), TOBN(0x66310f9c, 0x914b3f99), TOBN(0xae430a32, 0xf412c161)}, {TOBN(0x1e6776d3, 0x88ace52f), TOBN(0x4bc0fa24, 0x52d7067d), TOBN(0x03c286aa, 0x8f07cd1b), TOBN(0x4cb8f38c, 0xa985b2c1)}}, {{TOBN(0x83ccbe80, 0x8c3bff36), TOBN(0x005a0bd2, 0x5263e575), TOBN(0x460d7dda, 0x259bdcd1), TOBN(0x4a1c5642, 0xfa5cab6b)}, {TOBN(0x2b7bdbb9, 0x9fe4fc88), TOBN(0x09418e28, 0xcc97bbb5), TOBN(0xd8274fb4, 0xa12321ae), TOBN(0xb137007d, 0x5c87b64e)}}, {{TOBN(0x80531fe1, 0xc63c4962), TOBN(0x50541e89, 0x981fdb25), TOBN(0xdc1291a1, 0xfd4c2b6b), TOBN(0xc0693a17, 0xa6df4fca)}, {TOBN(0xb2c4604e, 0x0117f203), TOBN(0x245f1963, 0x0a99b8d0), TOBN(0xaedc20aa, 0xc6212c44), TOBN(0xb1ed4e56, 0x520f52a8)}}, {{TOBN(0xfe48f575, 0xf8547be3), TOBN(0x0a7033cd, 0xa9e45f98), TOBN(0x4b45d3a9, 0x18c50100), TOBN(0xb2a6cd6a, 0xa61d41da)}, {TOBN(0x60bbb4f5, 0x57933c6b), TOBN(0xa7538ebd, 0x2b0d7ffc), TOBN(0x9ea3ab8d, 0x8cd626b6), TOBN(0x8273a484, 0x3601625a)}}, {{TOBN(0x88859845, 0x0168e508), TOBN(0x8cbc9bb2, 0x99a94abd), TOBN(0x713ac792, 0xfab0a671), TOBN(0xa3995b19, 0x6c9ebffc)}, {TOBN(0xe711668e, 0x1239e152), TOBN(0x56892558, 0xbbb8dff4), TOBN(0x8bfc7dab, 0xdbf17963), TOBN(0x5b59fe5a, 0xb3de1253)}}, {{TOBN(0x7e3320eb, 0x34a9f7ae), TOBN(0xe5e8cf72, 0xd751efe4), TOBN(0x7ea003bc, 0xd9be2f37), TOBN(0xc0f551a0, 0xb6c08ef7)}, {TOBN(0x56606268, 0x038f6725), TOBN(0x1dd38e35, 0x6d92d3b6), TOBN(0x07dfce7c, 
0xc3cbd686), TOBN(0x4e549e04, 0x651c5da8)}}, {{TOBN(0x4058f93b, 0x08b19340), TOBN(0xc2fae6f4, 0xcac6d89d), TOBN(0x4bad8a8c, 0x8f159cc7), TOBN(0x0ddba4b3, 0xcb0b601c)}, {TOBN(0xda4fc7b5, 0x1dd95f8c), TOBN(0x1d163cd7, 0xcea5c255), TOBN(0x30707d06, 0x274a8c4c), TOBN(0x79d9e008, 0x2802e9ce)}}, {{TOBN(0x02a29ebf, 0xe6ddd505), TOBN(0x37064e74, 0xb50bed1a), TOBN(0x3f6bae65, 0xa7327d57), TOBN(0x3846f5f1, 0xf83920bc)}, {TOBN(0x87c37491, 0x60df1b9b), TOBN(0x4cfb2895, 0x2d1da29f), TOBN(0x10a478ca, 0x4ed1743c), TOBN(0x390c6030, 0x3edd47c6)}}, {{TOBN(0x8f3e5312, 0x8c0a78de), TOBN(0xccd02bda, 0x1e85df70), TOBN(0xd6c75c03, 0xa61b6582), TOBN(0x0762921c, 0xfc0eebd1)}, {TOBN(0xd34d0823, 0xd85010c0), TOBN(0xd73aaacb, 0x0044cf1f), TOBN(0xfb4159bb, 0xa3b5e78a), TOBN(0x2287c7f7, 0xe5826f3f)}}, {{TOBN(0x4aeaf742, 0x580b1a01), TOBN(0xf080415d, 0x60423b79), TOBN(0xe12622cd, 0xa7dea144), TOBN(0x49ea4996, 0x59d62472)}, {TOBN(0xb42991ef, 0x571f3913), TOBN(0x0610f214, 0xf5b25a8a), TOBN(0x47adc585, 0x30b79e8f), TOBN(0xf90e3df6, 0x07a065a2)}}, {{TOBN(0x5d0a5deb, 0x43e2e034), TOBN(0x53fb5a34, 0x444024aa), TOBN(0xa8628c68, 0x6b0c9f7f), TOBN(0x9c69c29c, 0xac563656)}, {TOBN(0x5a231feb, 0xbace47b6), TOBN(0xbdce0289, 0x9ea5a2ec), TOBN(0x05da1fac, 0x9463853e), TOBN(0x96812c52, 0x509e78aa)}}, {{TOBN(0xd3fb5771, 0x57151692), TOBN(0xeb2721f8, 0xd98e1c44), TOBN(0xc0506087, 0x32399be1), TOBN(0xda5a5511, 0xd979d8b8)}, {TOBN(0x737ed55d, 0xc6f56780), TOBN(0xe20d3004, 0x0dc7a7f4), TOBN(0x02ce7301, 0xf5941a03), TOBN(0x91ef5215, 0xed30f83a)}}, {{TOBN(0x28727fc1, 0x4092d85f), TOBN(0x72d223c6, 0x5c49e41a), TOBN(0xa7cf30a2, 0xba6a4d81), TOBN(0x7c086209, 0xb030d87d)}, {TOBN(0x04844c7d, 0xfc588b09), TOBN(0x728cd499, 0x5874bbb0), TOBN(0xcc1281ee, 0xe84c0495), TOBN(0x0769b5ba, 0xec31958f)}}, {{TOBN(0x665c228b, 0xf99c2471), TOBN(0xf2d8a11b, 0x191eb110), TOBN(0x4594f494, 0xd36d7024), TOBN(0x482ded8b, 0xcdcb25a1)}, {TOBN(0xc958a9d8, 0xdadd4885), TOBN(0x7004477e, 0xf1d2b547), TOBN(0x0a45f6ef, 0x2a0af550), TOBN(0x4fc739d6, 0x2f8d6351)}}, {{TOBN(0x75cdaf27, 0x786f08a9), TOBN(0x8700bb26, 0x42c2737f), TOBN(0x855a7141, 0x1c4e2670), TOBN(0x810188c1, 0x15076fef)}, {TOBN(0xc251d0c9, 0xabcd3297), TOBN(0xae4c8967, 0xf48108eb), TOBN(0xbd146de7, 0x18ceed30), TOBN(0xf9d4f07a, 0xc986bced)}}, {{TOBN(0x5ad98ed5, 0x83fa1e08), TOBN(0x7780d33e, 0xbeabd1fb), TOBN(0xe330513c, 0x903b1196), TOBN(0xba11de9e, 0xa47bc8c4)}, {TOBN(0x684334da, 0x02c2d064), TOBN(0x7ecf360d, 0xa48de23b), TOBN(0x57a1b474, 0x0a9089d8), TOBN(0xf28fa439, 0xff36734c)}}, {{TOBN(0xf2a482cb, 0xea4570b3), TOBN(0xee65d68b, 0xa5ebcee9), TOBN(0x988d0036, 0xb9694cd5), TOBN(0x53edd0e9, 0x37885d32)}, {TOBN(0xe37e3307, 0xbeb9bc6d), TOBN(0xe9abb907, 0x9f5c6768), TOBN(0x4396ccd5, 0x51f2160f), TOBN(0x2500888c, 0x47336da6)}}, {{TOBN(0x383f9ed9, 0x926fce43), TOBN(0x809dd1c7, 0x04da2930), TOBN(0x30f6f596, 0x8a4cb227), TOBN(0x0d700c7f, 0x73a56b38)}, {TOBN(0x1825ea33, 0xab64a065), TOBN(0xaab9b735, 0x1338df80), TOBN(0x1516100d, 0x9b63f57f), TOBN(0x2574395a, 0x27a6a634)}}, {{TOBN(0xb5560fb6, 0x700a1acd), TOBN(0xe823fd73, 0xfd999681), TOBN(0xda915d1f, 0x6cb4e1ba), TOBN(0x0d030118, 0x6ebe00a3)}, {TOBN(0x744fb0c9, 0x89fca8cd), TOBN(0x970d01db, 0xf9da0e0b), TOBN(0x0ad8c564, 0x7931d76f), TOBN(0xb15737bf, 0xf659b96a)}}, {{TOBN(0xdc9933e8, 0xa8b484e7), TOBN(0xb2fdbdf9, 0x7a26dec7), TOBN(0x2349e9a4, 0x9f1f0136), TOBN(0x7860368e, 0x70fddddb)}, {TOBN(0xd93d2c1c, 0xf9ad3e18), TOBN(0x6d6c5f17, 0x689f4e79), TOBN(0x7a544d91, 0xb24ff1b6), TOBN(0x3e12a5eb, 0xfe16cd8c)}}, {{TOBN(0x543574e9, 0xa56b872f), TOBN(0xa1ad550c, 0xfcf68ea2), 
TOBN(0x689e37d2, 0x3f560ef7), TOBN(0x8c54b9ca, 0xc9d47a8b)}, {TOBN(0x46d40a4a, 0x088ac342), TOBN(0xec450c7c, 0x1576c6d0), TOBN(0xb589e31c, 0x1f9689e9), TOBN(0xdacf2602, 0xb8781718)}}, {{TOBN(0xa89237c6, 0xc8cb6b42), TOBN(0x1326fc93, 0xb96ef381), TOBN(0x55d56c6d, 0xb5f07825), TOBN(0xacba2eea, 0x7449e22d)}, {TOBN(0x74e0887a, 0x633c3000), TOBN(0xcb6cd172, 0xd7cbcf71), TOBN(0x309e81de, 0xc36cf1be), TOBN(0x07a18a6d, 0x60ae399b)}}, {{TOBN(0xb36c2679, 0x9edce57e), TOBN(0x52b892f4, 0xdf001d41), TOBN(0xd884ae5d, 0x16a1f2c6), TOBN(0x9b329424, 0xefcc370a)}, {TOBN(0x3120daf2, 0xbd2e21df), TOBN(0x55298d2d, 0x02470a99), TOBN(0x0b78af6c, 0xa05db32e), TOBN(0x5c76a331, 0x601f5636)}}, {{TOBN(0xaae861ff, 0xf8a4f29c), TOBN(0x70dc9240, 0xd68f8d49), TOBN(0x960e649f, 0x81b1321c), TOBN(0x3d2c801b, 0x8792e4ce)}, {TOBN(0xf479f772, 0x42521876), TOBN(0x0bed93bc, 0x416c79b1), TOBN(0xa67fbc05, 0x263e5bc9), TOBN(0x01e8e630, 0x521db049)}}, {{TOBN(0x76f26738, 0xc6f3431e), TOBN(0xe609cb02, 0xe3267541), TOBN(0xb10cff2d, 0x818c877c), TOBN(0x1f0e75ce, 0x786a13cb)}, {TOBN(0xf4fdca64, 0x1158544d), TOBN(0x5d777e89, 0x6cb71ed0), TOBN(0x3c233737, 0xa9aa4755), TOBN(0x7b453192, 0xe527ab40)}}, {{TOBN(0xdb59f688, 0x39f05ffe), TOBN(0x8f4f4be0, 0x6d82574e), TOBN(0xcce3450c, 0xee292d1b), TOBN(0xaa448a12, 0x61ccd086)}, {TOBN(0xabce91b3, 0xf7914967), TOBN(0x4537f09b, 0x1908a5ed), TOBN(0xa812421e, 0xf51042e7), TOBN(0xfaf5cebc, 0xec0b3a34)}}, {{TOBN(0x730ffd87, 0x4ca6b39a), TOBN(0x70fb72ed, 0x02efd342), TOBN(0xeb4735f9, 0xd75c8edb), TOBN(0xc11f2157, 0xc278aa51)}, {TOBN(0xc459f635, 0xbf3bfebf), TOBN(0x3a1ff0b4, 0x6bd9601f), TOBN(0xc9d12823, 0xc420cb73), TOBN(0x3e9af3e2, 0x3c2915a3)}}, {{TOBN(0xe0c82c72, 0xb41c3440), TOBN(0x175239e5, 0xe3039a5f), TOBN(0xe1084b8a, 0x558795a3), TOBN(0x328d0a1d, 0xd01e5c60)}, {TOBN(0x0a495f2e, 0xd3788a04), TOBN(0x25d8ff16, 0x66c11a9f), TOBN(0xf5155f05, 0x9ed692d6), TOBN(0x954fa107, 0x4f425fe4)}}, {{TOBN(0xd16aabf2, 0xe98aaa99), TOBN(0x90cd8ba0, 0x96b0f88a), TOBN(0x957f4782, 0xc154026a), TOBN(0x54ee0734, 0x52af56d2)}, {TOBN(0xbcf89e54, 0x45b4147a), TOBN(0x3d102f21, 0x9a52816c), TOBN(0x6808517e, 0x39b62e77), TOBN(0x92e25421, 0x69169ad8)}}, {{TOBN(0xd721d871, 0xbb608558), TOBN(0x60e4ebae, 0xf6d4ff9b), TOBN(0x0ba10819, 0x41f2763e), TOBN(0xca2e45be, 0x51ee3247)}, {TOBN(0x66d172ec, 0x2bfd7a5f), TOBN(0x528a8f2f, 0x74d0b12d), TOBN(0xe17f1e38, 0xdabe70dc), TOBN(0x1d5d7316, 0x9f93983c)}}, {{TOBN(0x51b2184a, 0xdf423e31), TOBN(0xcb417291, 0xaedb1a10), TOBN(0x2054ca93, 0x625bcab9), TOBN(0x54396860, 0xa98998f0)}, {TOBN(0x4e53f6c4, 0xa54ae57e), TOBN(0x0ffeb590, 0xee648e9d), TOBN(0xfbbdaadc, 0x6afaf6bc), TOBN(0xf88ae796, 0xaa3bfb8a)}}, {{TOBN(0x209f1d44, 0xd2359ed9), TOBN(0xac68dd03, 0xf3544ce2), TOBN(0xf378da47, 0xfd51e569), TOBN(0xe1abd860, 0x2cc80097)}, {TOBN(0x23ca18d9, 0x343b6e3a), TOBN(0x480797e8, 0xb40a1bae), TOBN(0xd1f0c717, 0x533f3e67), TOBN(0x44896970, 0x06e6cdfc)}}, {{TOBN(0x8ca21055, 0x52a82e8d), TOBN(0xb2caf785, 0x78460cdc), TOBN(0x4c1b7b62, 0xe9037178), TOBN(0xefc09d2c, 0xdb514b58)}, {TOBN(0x5f2df9ee, 0x9113be5c), TOBN(0x2fbda78f, 0xb3f9271c), TOBN(0xe09a81af, 0x8f83fc54), TOBN(0x06b13866, 0x8afb5141)}}, {{TOBN(0x38f6480f, 0x43e3865d), TOBN(0x72dd77a8, 0x1ddf47d9), TOBN(0xf2a8e971, 0x4c205ff7), TOBN(0x46d449d8, 0x9d088ad8)}, {TOBN(0x926619ea, 0x185d706f), TOBN(0xe47e02eb, 0xc7dd7f62), TOBN(0xe7f120a7, 0x8cbc2031), TOBN(0xc18bef00, 0x998d4ac9)}}, {{TOBN(0x18f37a9c, 0x6bdf22da), TOBN(0xefbc432f, 0x90dc82df), TOBN(0xc52cef8e, 0x5d703651), TOBN(0x82887ba0, 0xd99881a5)}, {TOBN(0x7cec9dda, 0xb920ec1d), TOBN(0xd0d7e8c3, 
0xec3e8d3b), TOBN(0x445bc395, 0x4ca88747), TOBN(0xedeaa2e0, 0x9fd53535)}}, {{TOBN(0x461b1d93, 0x6cc87475), TOBN(0xd92a52e2, 0x6d2383bd), TOBN(0xfabccb59, 0xd7903546), TOBN(0x6111a761, 0x3d14b112)}, {TOBN(0x0ae584fe, 0xb3d5f612), TOBN(0x5ea69b8d, 0x60e828ec), TOBN(0x6c078985, 0x54087030), TOBN(0x649cab04, 0xac4821fe)}}, {{TOBN(0x25ecedcf, 0x8bdce214), TOBN(0xb5622f72, 0x86af7361), TOBN(0x0e1227aa, 0x7038b9e2), TOBN(0xd0efb273, 0xac20fa77)}, {TOBN(0x817ff88b, 0x79df975b), TOBN(0x856bf286, 0x1999503e), TOBN(0xb4d5351f, 0x5038ec46), TOBN(0x740a52c5, 0xfc42af6e)}}, {{TOBN(0x2e38bb15, 0x2cbb1a3f), TOBN(0xc3eb99fe, 0x17a83429), TOBN(0xca4fcbf1, 0xdd66bb74), TOBN(0x880784d6, 0xcde5e8fc)}, {TOBN(0xddc84c1c, 0xb4e7a0be), TOBN(0x8780510d, 0xbd15a72f), TOBN(0x44bcf1af, 0x81ec30e1), TOBN(0x141e50a8, 0x0a61073e)}}, {{TOBN(0x0d955718, 0x47be87ae), TOBN(0x68a61417, 0xf76a4372), TOBN(0xf57e7e87, 0xc607c3d3), TOBN(0x043afaf8, 0x5252f332)}, {TOBN(0xcc14e121, 0x1552a4d2), TOBN(0xb6dee692, 0xbb4d4ab4), TOBN(0xb6ab74c8, 0xa03816a4), TOBN(0x84001ae4, 0x6f394a29)}}, {{TOBN(0x5bed8344, 0xd795fb45), TOBN(0x57326e7d, 0xb79f55a5), TOBN(0xc9533ce0, 0x4accdffc), TOBN(0x53473caf, 0x3993fa04)}, {TOBN(0x7906eb93, 0xa13df4c8), TOBN(0xa73e51f6, 0x97cbe46f), TOBN(0xd1ab3ae1, 0x0ae4ccf8), TOBN(0x25614508, 0x8a5b3dbc)}}, {{TOBN(0x61eff962, 0x11a71b27), TOBN(0xdf71412b, 0x6bb7fa39), TOBN(0xb31ba6b8, 0x2bd7f3ef), TOBN(0xb0b9c415, 0x69180d29)}, {TOBN(0xeec14552, 0x014cdde5), TOBN(0x702c624b, 0x227b4bbb), TOBN(0x2b15e8c2, 0xd3e988f3), TOBN(0xee3bcc6d, 0xa4f7fd04)}}, {{TOBN(0x9d00822a, 0x42ac6c85), TOBN(0x2db0cea6, 0x1df9f2b7), TOBN(0xd7cad2ab, 0x42de1e58), TOBN(0x346ed526, 0x2d6fbb61)}, {TOBN(0xb3962995, 0x1a2faf09), TOBN(0x2fa8a580, 0x7c25612e), TOBN(0x30ae04da, 0x7cf56490), TOBN(0x75662908, 0x0eea3961)}}, {{TOBN(0x3609f5c5, 0x3d080847), TOBN(0xcb081d39, 0x5241d4f6), TOBN(0xb4fb3810, 0x77961a63), TOBN(0xc20c5984, 0x2abb66fc)}, {TOBN(0x3d40aa7c, 0xf902f245), TOBN(0x9cb12736, 0x4e536b1e), TOBN(0x5eda24da, 0x99b3134f), TOBN(0xafbd9c69, 0x5cd011af)}}, {{TOBN(0x9a16e30a, 0xc7088c7d), TOBN(0x5ab65710, 0x3207389f), TOBN(0x1b09547f, 0xe7407a53), TOBN(0x2322f9d7, 0x4fdc6eab)}, {TOBN(0xc0f2f22d, 0x7430de4d), TOBN(0x19382696, 0xe68ca9a9), TOBN(0x17f1eff1, 0x918e5868), TOBN(0xe3b5b635, 0x586f4204)}}, {{TOBN(0x146ef980, 0x3fbc4341), TOBN(0x359f2c80, 0x5b5eed4e), TOBN(0x9f35744e, 0x7482e41d), TOBN(0x9a9ac3ec, 0xf3b224c2)}, {TOBN(0x9161a6fe, 0x91fc50ae), TOBN(0x89ccc66b, 0xc613fa7c), TOBN(0x89268b14, 0xc732f15a), TOBN(0x7cd6f4e2, 0xb467ed03)}}, {{TOBN(0xfbf79869, 0xce56b40e), TOBN(0xf93e094c, 0xc02dde98), TOBN(0xefe0c3a8, 0xedee2cd7), TOBN(0x90f3ffc0, 0xb268fd42)}, {TOBN(0x81a7fd56, 0x08241aed), TOBN(0x95ab7ad8, 0x00b1afe8), TOBN(0x40127056, 0x3e310d52), TOBN(0xd3ffdeb1, 0x09d9fc43)}}, {{TOBN(0xc8f85c91, 0xd11a8594), TOBN(0x2e74d258, 0x31cf6db8), TOBN(0x829c7ca3, 0x02b5dfd0), TOBN(0xe389cfbe, 0x69143c86)}, {TOBN(0xd01b6405, 0x941768d8), TOBN(0x45103995, 0x03bf825d), TOBN(0xcc4ee166, 0x56cd17e2), TOBN(0xbea3c283, 0xba037e79)}}, {{TOBN(0x4e1ac06e, 0xd9a47520), TOBN(0xfbfe18aa, 0xaf852404), TOBN(0x5615f8e2, 0x8087648a), TOBN(0x7301e47e, 0xb9d150d9)}, {TOBN(0x79f9f9dd, 0xb299b977), TOBN(0x76697a7b, 0xa5b78314), TOBN(0x10d67468, 0x7d7c90e7), TOBN(0x7afffe03, 0x937210b5)}}, {{TOBN(0x5aef3e4b, 0x28c22cee), TOBN(0xefb0ecd8, 0x09fd55ae), TOBN(0x4cea7132, 0x0d2a5d6a), TOBN(0x9cfb5fa1, 0x01db6357)}, {TOBN(0x395e0b57, 0xf36e1ac5), TOBN(0x008fa9ad, 0x36cafb7d), TOBN(0x8f6cdf70, 0x5308c4db), TOBN(0x51527a37, 0x95ed2477)}}, {{TOBN(0xba0dee30, 0x5bd21311), 
TOBN(0x6ed41b22, 0x909c90d7), TOBN(0xc5f6b758, 0x7c8696d3), TOBN(0x0db8eaa8, 0x3ce83a80)}, {TOBN(0xd297fe37, 0xb24b4b6f), TOBN(0xfe58afe8, 0x522d1f0d), TOBN(0x97358736, 0x8c98dbd9), TOBN(0x6bc226ca, 0x9454a527)}}, {{TOBN(0xa12b384e, 0xce53c2d0), TOBN(0x779d897d, 0x5e4606da), TOBN(0xa53e47b0, 0x73ec12b0), TOBN(0x462dbbba, 0x5756f1ad)}, {TOBN(0x69fe09f2, 0xcafe37b6), TOBN(0x273d1ebf, 0xecce2e17), TOBN(0x8ac1d538, 0x3cf607fd), TOBN(0x8035f7ff, 0x12e10c25)}}}, {{{TOBN(0x854d34c7, 0x7e6c5520), TOBN(0xc27df9ef, 0xdcb9ea58), TOBN(0x405f2369, 0xd686666d), TOBN(0x29d1febf, 0x0417aa85)}, {TOBN(0x9846819e, 0x93470afe), TOBN(0x3e6a9669, 0xe2a27f9e), TOBN(0x24d008a2, 0xe31e6504), TOBN(0xdba7cecf, 0x9cb7680a)}}, {{TOBN(0xecaff541, 0x338d6e43), TOBN(0x56f7dd73, 0x4541d5cc), TOBN(0xb5d426de, 0x96bc88ca), TOBN(0x48d94f6b, 0x9ed3a2c3)}, {TOBN(0x6354a3bb, 0x2ef8279c), TOBN(0xd575465b, 0x0b1867f2), TOBN(0xef99b0ff, 0x95225151), TOBN(0xf3e19d88, 0xf94500d8)}}, {{TOBN(0x92a83268, 0xe32dd620), TOBN(0x913ec99f, 0x627849a2), TOBN(0xedd8fdfa, 0x2c378882), TOBN(0xaf96f33e, 0xee6f8cfe)}, {TOBN(0xc06737e5, 0xdc3fa8a5), TOBN(0x236bb531, 0xb0b03a1d), TOBN(0x33e59f29, 0x89f037b0), TOBN(0x13f9b5a7, 0xd9a12a53)}}, {{TOBN(0x0d0df6ce, 0x51efb310), TOBN(0xcb5b2eb4, 0x958df5be), TOBN(0xd6459e29, 0x36158e59), TOBN(0x82aae2b9, 0x1466e336)}, {TOBN(0xfb658a39, 0x411aa636), TOBN(0x7152ecc5, 0xd4c0a933), TOBN(0xf10c758a, 0x49f026b7), TOBN(0xf4837f97, 0xcb09311f)}}, {{TOBN(0xddfb02c4, 0xc753c45f), TOBN(0x18ca81b6, 0xf9c840fe), TOBN(0x846fd09a, 0xb0f8a3e6), TOBN(0xb1162add, 0xe7733dbc)}, {TOBN(0x7070ad20, 0x236e3ab6), TOBN(0xf88cdaf5, 0xb2a56326), TOBN(0x05fc8719, 0x997cbc7a), TOBN(0x442cd452, 0x4b665272)}}, {{TOBN(0x7807f364, 0xb71698f5), TOBN(0x6ba418d2, 0x9f7b605e), TOBN(0xfd20b00f, 0xa03b2cbb), TOBN(0x883eca37, 0xda54386f)}, {TOBN(0xff0be43f, 0xf3437f24), TOBN(0xe910b432, 0xa48bb33c), TOBN(0x4963a128, 0x329df765), TOBN(0xac1dd556, 0xbe2fe6f7)}}, {{TOBN(0x557610f9, 0x24a0a3fc), TOBN(0x38e17bf4, 0xe881c3f9), TOBN(0x6ba84faf, 0xed0dac99), TOBN(0xd4a222c3, 0x59eeb918)}, {TOBN(0xc79c1dbe, 0x13f542b6), TOBN(0x1fc65e0d, 0xe425d457), TOBN(0xeffb754f, 0x1debb779), TOBN(0x638d8fd0, 0x9e08af60)}}, {{TOBN(0x994f523a, 0x626332d5), TOBN(0x7bc38833, 0x5561bb44), TOBN(0x005ed4b0, 0x3d845ea2), TOBN(0xd39d3ee1, 0xc2a1f08a)}, {TOBN(0x6561fdd3, 0xe7676b0d), TOBN(0x620e35ff, 0xfb706017), TOBN(0x36ce424f, 0xf264f9a8), TOBN(0xc4c3419f, 0xda2681f7)}}, {{TOBN(0xfb6afd2f, 0x69beb6e8), TOBN(0x3a50b993, 0x6d700d03), TOBN(0xc840b2ad, 0x0c83a14f), TOBN(0x573207be, 0x54085bef)}, {TOBN(0x5af882e3, 0x09fe7e5b), TOBN(0x957678a4, 0x3b40a7e1), TOBN(0x172d4bdd, 0x543056e2), TOBN(0x9c1b26b4, 0x0df13c0a)}}, {{TOBN(0x1c30861c, 0xf405ff06), TOBN(0xebac86bd, 0x486e828b), TOBN(0xe791a971, 0x636933fc), TOBN(0x50e7c2be, 0x7aeee947)}, {TOBN(0xc3d4a095, 0xfa90d767), TOBN(0xae60eb7b, 0xe670ab7b), TOBN(0x17633a64, 0x397b056d), TOBN(0x93a21f33, 0x105012aa)}}, {{TOBN(0x663c370b, 0xabb88643), TOBN(0x91df36d7, 0x22e21599), TOBN(0x183ba835, 0x8b761671), TOBN(0x381eea1d, 0x728f3bf1)}, {TOBN(0xb9b2f1ba, 0x39966e6c), TOBN(0x7c464a28, 0xe7295492), TOBN(0x0fd5f70a, 0x09b26b7f), TOBN(0xa9aba1f9, 0xfbe009df)}}, {{TOBN(0x857c1f22, 0x369b87ad), TOBN(0x3c00e5d9, 0x32fca556), TOBN(0x1ad74cab, 0x90b06466), TOBN(0xa7112386, 0x550faaf2)}, {TOBN(0x7435e198, 0x6d9bd5f5), TOBN(0x2dcc7e38, 0x59c3463f), TOBN(0xdc7df748, 0xca7bd4b2), TOBN(0x13cd4c08, 0x9dec2f31)}}, {{TOBN(0x0d3b5df8, 0xe3237710), TOBN(0x0dadb26e, 0xcbd2f7b0), TOBN(0x9f5966ab, 0xe4aa082b), TOBN(0x666ec8de, 0x350e966e)}, {TOBN(0x1bfd1ed5, 
0xee524216), TOBN(0xcd93c59b, 0x41dab0b6), TOBN(0x658a8435, 0xd186d6ba), TOBN(0x1b7d34d2, 0x159d1195)}}, {{TOBN(0x5936e460, 0x22caf46b), TOBN(0x6a45dd8f, 0x9a96fe4f), TOBN(0xf7925434, 0xb98f474e), TOBN(0x41410412, 0x0053ef15)}, {TOBN(0x71cf8d12, 0x41de97bf), TOBN(0xb8547b61, 0xbd80bef4), TOBN(0xb47d3970, 0xc4db0037), TOBN(0xf1bcd328, 0xfef20dff)}}, {{TOBN(0x31a92e09, 0x10caad67), TOBN(0x1f591960, 0x5531a1e1), TOBN(0x3bb852e0, 0x5f4fc840), TOBN(0x63e297ca, 0x93a72c6c)}, {TOBN(0x3c2b0b2e, 0x49abad67), TOBN(0x6ec405fc, 0xed3db0d9), TOBN(0xdc14a530, 0x7fef1d40), TOBN(0xccd19846, 0x280896fc)}}, {{TOBN(0x00f83176, 0x9bb81648), TOBN(0xd69eb485, 0x653120d0), TOBN(0xd17d75f4, 0x4ccabc62), TOBN(0x34a07f82, 0xb749fcb1)}, {TOBN(0x2c3af787, 0xbbfb5554), TOBN(0xb06ed4d0, 0x62e283f8), TOBN(0x5722889f, 0xa19213a0), TOBN(0x162b085e, 0xdcf3c7b4)}}, {{TOBN(0xbcaecb31, 0xe0dd3eca), TOBN(0xc6237fbc, 0xe52f13a5), TOBN(0xcc2b6b03, 0x27bac297), TOBN(0x2ae1cac5, 0xb917f54a)}, {TOBN(0x474807d4, 0x7845ae4f), TOBN(0xfec7dd92, 0xce5972e0), TOBN(0xc3bd2541, 0x1d7915bb), TOBN(0x66f85dc4, 0xd94907ca)}}, {{TOBN(0xd981b888, 0xbdbcf0ca), TOBN(0xd75f5da6, 0xdf279e9f), TOBN(0x128bbf24, 0x7054e934), TOBN(0x3c6ff6e5, 0x81db134b)}, {TOBN(0x795b7cf4, 0x047d26e4), TOBN(0xf370f7b8, 0x5049ec37), TOBN(0xc6712d4d, 0xced945af), TOBN(0xdf30b5ec, 0x095642bc)}}, {{TOBN(0x9b034c62, 0x4896246e), TOBN(0x5652c016, 0xee90bbd1), TOBN(0xeb38636f, 0x87fedb73), TOBN(0x5e32f847, 0x0135a613)}, {TOBN(0x0703b312, 0xcf933c83), TOBN(0xd05bb76e, 0x1a7f47e6), TOBN(0x825e4f0c, 0x949c2415), TOBN(0x569e5622, 0x7250d6f8)}}, {{TOBN(0xbbe9eb3a, 0x6568013e), TOBN(0x8dbd203f, 0x22f243fc), TOBN(0x9dbd7694, 0xb342734a), TOBN(0x8f6d12f8, 0x46afa984)}, {TOBN(0xb98610a2, 0xc9eade29), TOBN(0xbab4f323, 0x47dd0f18), TOBN(0x5779737b, 0x671c0d46), TOBN(0x10b6a7c6, 0xd3e0a42a)}}, {{TOBN(0xfb19ddf3, 0x3035b41c), TOBN(0xd336343f, 0x99c45895), TOBN(0x61fe4938, 0x54c857e5), TOBN(0xc4d506be, 0xae4e57d5)}, {TOBN(0x3cd8c8cb, 0xbbc33f75), TOBN(0x7281f08a, 0x9262c77d), TOBN(0x083f4ea6, 0xf11a2823), TOBN(0x8895041e, 0x9fba2e33)}}, {{TOBN(0xfcdfea49, 0x9c438edf), TOBN(0x7678dcc3, 0x91edba44), TOBN(0xf07b3b87, 0xe2ba50f0), TOBN(0xc13888ef, 0x43948c1b)}, {TOBN(0xc2135ad4, 0x1140af42), TOBN(0x8e5104f3, 0x926ed1a7), TOBN(0xf24430cb, 0x88f6695f), TOBN(0x0ce0637b, 0x6d73c120)}}, {{TOBN(0xb2db01e6, 0xfe631e8f), TOBN(0x1c5563d7, 0xd7bdd24b), TOBN(0x8daea3ba, 0x369ad44f), TOBN(0x000c81b6, 0x8187a9f9)}, {TOBN(0x5f48a951, 0xaae1fd9a), TOBN(0xe35626c7, 0x8d5aed8a), TOBN(0x20952763, 0x0498c622), TOBN(0x76d17634, 0x773aa504)}}, {{TOBN(0x36d90dda, 0xeb300f7a), TOBN(0x9dcf7dfc, 0xedb5e801), TOBN(0x645cb268, 0x74d5244c), TOBN(0xa127ee79, 0x348e3aa2)}, {TOBN(0x488acc53, 0x575f1dbb), TOBN(0x95037e85, 0x80e6161e), TOBN(0x57e59283, 0x292650d0), TOBN(0xabe67d99, 0x14938216)}}, {{TOBN(0x3c7f944b, 0x3f8e1065), TOBN(0xed908cb6, 0x330e8924), TOBN(0x08ee8fd5, 0x6f530136), TOBN(0x2227b7d5, 0xd7ffc169)}, {TOBN(0x4f55c893, 0xb5cd6dd5), TOBN(0x82225e11, 0xa62796e8), TOBN(0x5c6cead1, 0xcb18e12c), TOBN(0x4381ae0c, 0x84f5a51a)}}, {{TOBN(0x345913d3, 0x7fafa4c8), TOBN(0x3d918082, 0x0491aac0), TOBN(0x9347871f, 0x3e69264c), TOBN(0xbea9dd3c, 0xb4f4f0cd)}, {TOBN(0xbda5d067, 0x3eadd3e7), TOBN(0x0033c1b8, 0x0573bcd8), TOBN(0x25589379, 0x5da2486c), TOBN(0xcb89ee5b, 0x86abbee7)}}, {{TOBN(0x8fe0a8f3, 0x22532e5d), TOBN(0xb6410ff0, 0x727dfc4c), TOBN(0x619b9d58, 0x226726db), TOBN(0x5ec25669, 0x7a2b2dc7)}, {TOBN(0xaf4d2e06, 0x4c3beb01), TOBN(0x852123d0, 0x7acea556), TOBN(0x0e9470fa, 0xf783487a), TOBN(0x75a7ea04, 0x5664b3eb)}}, 
{{TOBN(0x4ad78f35, 0x6798e4ba), TOBN(0x9214e6e5, 0xc7d0e091), TOBN(0xc420b488, 0xb1290403), TOBN(0x64049e0a, 0xfc295749)}, {TOBN(0x03ef5af1, 0x3ae9841f), TOBN(0xdbe4ca19, 0xb0b662a6), TOBN(0x46845c5f, 0xfa453458), TOBN(0xf8dabf19, 0x10b66722)}}, {{TOBN(0xb650f0aa, 0xcce2793b), TOBN(0x71db851e, 0xc5ec47c1), TOBN(0x3eb78f3e, 0x3b234fa9), TOBN(0xb0c60f35, 0xfc0106ce)}, {TOBN(0x05427121, 0x774eadbd), TOBN(0x25367faf, 0xce323863), TOBN(0x7541b5c9, 0xcd086976), TOBN(0x4ff069e2, 0xdc507ad1)}}, {{TOBN(0x74145256, 0x8776e667), TOBN(0x6e76142c, 0xb23c6bb5), TOBN(0xdbf30712, 0x1b3a8a87), TOBN(0x60e7363e, 0x98450836)}, {TOBN(0x5741450e, 0xb7366d80), TOBN(0xe4ee14ca, 0x4837dbdf), TOBN(0xa765eb9b, 0x69d4316f), TOBN(0x04548dca, 0x8ef43825)}}, {{TOBN(0x9c9f4e4c, 0x5ae888eb), TOBN(0x733abb51, 0x56e9ac99), TOBN(0xdaad3c20, 0xba6ac029), TOBN(0x9b8dd3d3, 0x2ba3e38e)}, {TOBN(0xa9bb4c92, 0x0bc5d11a), TOBN(0xf20127a7, 0x9c5f88a3), TOBN(0x4f52b06e, 0x161d3cb8), TOBN(0x26c1ff09, 0x6afaf0a6)}}, {{TOBN(0x32670d2f, 0x7189e71f), TOBN(0xc6438748, 0x5ecf91e7), TOBN(0x15758e57, 0xdb757a21), TOBN(0x427d09f8, 0x290a9ce5)}, {TOBN(0x846a308f, 0x38384a7a), TOBN(0xaac3acb4, 0xb0732b99), TOBN(0x9e941009, 0x17845819), TOBN(0x95cba111, 0xa7ce5e03)}}, {{TOBN(0x6f3d4f7f, 0xb00009c4), TOBN(0xb8396c27, 0x8ff28b5f), TOBN(0xb1a9ae43, 0x1c97975d), TOBN(0x9d7ba8af, 0xe5d9fed5)}, {TOBN(0x338cf09f, 0x34f485b6), TOBN(0xbc0ddacc, 0x64122516), TOBN(0xa450da12, 0x05d471fe), TOBN(0x4c3a6250, 0x628dd8c9)}}, {{TOBN(0x69c7d103, 0xd1295837), TOBN(0xa2893e50, 0x3807eb2f), TOBN(0xd6e1e1de, 0xbdb41491), TOBN(0xc630745b, 0x5e138235)}, {TOBN(0xc892109e, 0x48661ae1), TOBN(0x8d17e7eb, 0xea2b2674), TOBN(0x00ec0f87, 0xc328d6b5), TOBN(0x6d858645, 0xf079ff9e)}}, {{TOBN(0x6cdf243e, 0x19115ead), TOBN(0x1ce1393e, 0x4bac4fcf), TOBN(0x2c960ed0, 0x9c29f25b), TOBN(0x59be4d8e, 0x9d388a05)}, {TOBN(0x0d46e06c, 0xd0def72b), TOBN(0xb923db5d, 0xe0342748), TOBN(0xf7d3aacd, 0x936d4a3d), TOBN(0x558519cc, 0x0b0b099e)}}, {{TOBN(0x3ea8ebf8, 0x827097ef), TOBN(0x259353db, 0xd054f55d), TOBN(0x84c89abc, 0x6d2ed089), TOBN(0x5c548b69, 0x8e096a7c)}, {TOBN(0xd587f616, 0x994b995d), TOBN(0x4d1531f6, 0xa5845601), TOBN(0x792ab31e, 0x451fd9f0), TOBN(0xc8b57bb2, 0x65adf6ca)}}, {{TOBN(0x68440fcb, 0x1cd5ad73), TOBN(0xb9c860e6, 0x6144da4f), TOBN(0x2ab286aa, 0x8462beb8), TOBN(0xcc6b8fff, 0xef46797f)}, {TOBN(0xac820da4, 0x20c8a471), TOBN(0x69ae05a1, 0x77ff7faf), TOBN(0xb9163f39, 0xbfb5da77), TOBN(0xbd03e590, 0x2c73ab7a)}}, {{TOBN(0x7e862b5e, 0xb2940d9e), TOBN(0x3c663d86, 0x4b9af564), TOBN(0xd8309031, 0xbde3033d), TOBN(0x298231b2, 0xd42c5bc6)}, {TOBN(0x42090d2c, 0x552ad093), TOBN(0xa4799d1c, 0xff854695), TOBN(0x0a88b5d6, 0xd31f0d00), TOBN(0xf8b40825, 0xa2f26b46)}}, {{TOBN(0xec29b1ed, 0xf1bd7218), TOBN(0xd491c53b, 0x4b24c86e), TOBN(0xd2fe588f, 0x3395ea65), TOBN(0x6f3764f7, 0x4456ef15)}, {TOBN(0xdb43116d, 0xcdc34800), TOBN(0xcdbcd456, 0xc1e33955), TOBN(0xefdb5540, 0x74ab286b), TOBN(0x948c7a51, 0xd18c5d7c)}}, {{TOBN(0xeb81aa37, 0x7378058e), TOBN(0x41c746a1, 0x04411154), TOBN(0xa10c73bc, 0xfb828ac7), TOBN(0x6439be91, 0x9d972b29)}, {TOBN(0x4bf3b4b0, 0x43a2fbad), TOBN(0x39e6dadf, 0x82b5e840), TOBN(0x4f716408, 0x6397bd4c), TOBN(0x0f7de568, 0x7f1eeccb)}}, {{TOBN(0x5865c5a1, 0xd2ffbfc1), TOBN(0xf74211fa, 0x4ccb6451), TOBN(0x66368a88, 0xc0b32558), TOBN(0x5b539dc2, 0x9ad7812e)}, {TOBN(0x579483d0, 0x2f3af6f6), TOBN(0x52132078, 0x99934ece), TOBN(0x50b9650f, 0xdcc9e983), TOBN(0xca989ec9, 0xaee42b8a)}}, {{TOBN(0x6a44c829, 0xd6f62f99), TOBN(0x8f06a309, 0x4c2a7c0c), TOBN(0x4ea2b3a0, 0x98a0cb0a), TOBN(0x5c547b70, 
0xbeee8364)}, {TOBN(0x461d40e1, 0x682afe11), TOBN(0x9e0fc77a, 0x7b41c0a8), TOBN(0x79e4aefd, 0xe20d5d36), TOBN(0x2916e520, 0x32dd9f63)}}, {{TOBN(0xf59e52e8, 0x3f883faf), TOBN(0x396f9639, 0x2b868d35), TOBN(0xc902a9df, 0x4ca19881), TOBN(0x0fc96822, 0xdb2401a6)}, {TOBN(0x41237587, 0x66f1c68d), TOBN(0x10fc6de3, 0xfb476c0d), TOBN(0xf8b6b579, 0x841f5d90), TOBN(0x2ba8446c, 0xfa24f44a)}}, {{TOBN(0xa237b920, 0xef4a9975), TOBN(0x60bb6004, 0x2330435f), TOBN(0xd6f4ab5a, 0xcfb7e7b5), TOBN(0xb2ac5097, 0x83435391)}, {TOBN(0xf036ee2f, 0xb0d1ea67), TOBN(0xae779a6a, 0x74c56230), TOBN(0x59bff8c8, 0xab838ae6), TOBN(0xcd83ca99, 0x9b38e6f0)}}, {{TOBN(0xbb27bef5, 0xe33deed3), TOBN(0xe6356f6f, 0x001892a8), TOBN(0xbf3be6cc, 0x7adfbd3e), TOBN(0xaecbc81c, 0x33d1ac9d)}, {TOBN(0xe4feb909, 0xe6e861dc), TOBN(0x90a247a4, 0x53f5f801), TOBN(0x01c50acb, 0x27346e57), TOBN(0xce29242e, 0x461acc1b)}}, {{TOBN(0x04dd214a, 0x2f998a91), TOBN(0x271ee9b1, 0xd4baf27b), TOBN(0x7e3027d1, 0xe8c26722), TOBN(0x21d1645c, 0x1820dce5)}, {TOBN(0x086f242c, 0x7501779c), TOBN(0xf0061407, 0xfa0e8009), TOBN(0xf23ce477, 0x60187129), TOBN(0x05bbdedb, 0x0fde9bd0)}}, {{TOBN(0x682f4832, 0x25d98473), TOBN(0xf207fe85, 0x5c658427), TOBN(0xb6fdd7ba, 0x4166ffa1), TOBN(0x0c314056, 0x9eed799d)}, {TOBN(0x0db8048f, 0x4107e28f), TOBN(0x74ed3871, 0x41216840), TOBN(0x74489f8f, 0x56a3c06e), TOBN(0x1e1c005b, 0x12777134)}}, {{TOBN(0xdb332a73, 0xf37ec3c3), TOBN(0xc65259bd, 0xdd59eba0), TOBN(0x2291709c, 0xdb4d3257), TOBN(0x9a793b25, 0xbd389390)}, {TOBN(0xf39fe34b, 0xe43756f0), TOBN(0x2f76bdce, 0x9afb56c9), TOBN(0x9f37867a, 0x61208b27), TOBN(0xea1d4307, 0x089972c3)}}, {{TOBN(0x8c595330, 0x8bdf623a), TOBN(0x5f5accda, 0x8441fb7d), TOBN(0xfafa9418, 0x32ddfd95), TOBN(0x6ad40c5a, 0x0fde9be7)}, {TOBN(0x43faba89, 0xaeca8709), TOBN(0xc64a7cf1, 0x2c248a9d), TOBN(0x16620252, 0x72637a76), TOBN(0xaee1c791, 0x22b8d1bb)}}, {{TOBN(0xf0f798fd, 0x21a843b2), TOBN(0x56e4ed4d, 0x8d005cb1), TOBN(0x355f7780, 0x1f0d8abe), TOBN(0x197b04cf, 0x34522326)}, {TOBN(0x41f9b31f, 0xfd42c13f), TOBN(0x5ef7feb2, 0xb40f933d), TOBN(0x27326f42, 0x5d60bad4), TOBN(0x027ecdb2, 0x8c92cf89)}}, {{TOBN(0x04aae4d1, 0x4e3352fe), TOBN(0x08414d2f, 0x73591b90), TOBN(0x5ed6124e, 0xb7da7d60), TOBN(0xb985b931, 0x4d13d4ec)}, {TOBN(0xa592d3ab, 0x96bf36f9), TOBN(0x012dbed5, 0xbbdf51df), TOBN(0xa57963c0, 0xdf6c177d), TOBN(0x010ec869, 0x87ca29cf)}}, {{TOBN(0xba1700f6, 0xbf926dff), TOBN(0x7c9fdbd1, 0xf4bf6bc2), TOBN(0xdc18dc8f, 0x64da11f5), TOBN(0xa6074b7a, 0xd938ae75)}, {TOBN(0x14270066, 0xe84f44a4), TOBN(0x99998d38, 0xd27b954e), TOBN(0xc1be8ab2, 0xb4f38e9a), TOBN(0x8bb55bbf, 0x15c01016)}}, {{TOBN(0xf73472b4, 0x0ea2ab30), TOBN(0xd365a340, 0xf73d68dd), TOBN(0xc01a7168, 0x19c2e1eb), TOBN(0x32f49e37, 0x34061719)}, {TOBN(0xb73c57f1, 0x01d8b4d6), TOBN(0x03c8423c, 0x26b47700), TOBN(0x321d0bc8, 0xa4d8826a), TOBN(0x6004213c, 0x4bc0e638)}}, {{TOBN(0xf78c64a1, 0xc1c06681), TOBN(0x16e0a16f, 0xef018e50), TOBN(0x31cbdf91, 0xdb42b2b3), TOBN(0xf8f4ffce, 0xe0d36f58)}, {TOBN(0xcdcc71cd, 0x4cc5e3e0), TOBN(0xd55c7cfa, 0xa129e3e0), TOBN(0xccdb6ba0, 0x0fb2cbf1), TOBN(0x6aba0005, 0xc4bce3cb)}}, {{TOBN(0x501cdb30, 0xd232cfc4), TOBN(0x9ddcf12e, 0xd58a3cef), TOBN(0x02d2cf9c, 0x87e09149), TOBN(0xdc5d7ec7, 0x2c976257)}, {TOBN(0x6447986e, 0x0b50d7dd), TOBN(0x88fdbaf7, 0x807f112a), TOBN(0x58c9822a, 0xb00ae9f6), TOBN(0x6abfb950, 0x6d3d27e0)}}, {{TOBN(0xd0a74487, 0x8a429f4f), TOBN(0x0649712b, 0xdb516609), TOBN(0xb826ba57, 0xe769b5df), TOBN(0x82335df2, 0x1fc7aaf2)}, {TOBN(0x2389f067, 0x5c93d995), TOBN(0x59ac367a, 0x68677be6), TOBN(0xa77985ff, 0x21d9951b), 
TOBN(0x038956fb, 0x85011cce)}}, {{TOBN(0x608e48cb, 0xbb734e37), TOBN(0xc08c0bf2, 0x2be5b26f), TOBN(0x17bbdd3b, 0xf9b1a0d9), TOBN(0xeac7d898, 0x10483319)}, {TOBN(0xc95c4baf, 0xbc1a6dea), TOBN(0xfdd0e2bf, 0x172aafdb), TOBN(0x40373cbc, 0x8235c41a), TOBN(0x14303f21, 0xfb6f41d5)}}, {{TOBN(0xba063621, 0x0408f237), TOBN(0xcad3b09a, 0xecd2d1ed), TOBN(0x4667855a, 0x52abb6a2), TOBN(0xba9157dc, 0xaa8b417b)}, {TOBN(0xfe7f3507, 0x4f013efb), TOBN(0x1b112c4b, 0xaa38c4a2), TOBN(0xa1406a60, 0x9ba64345), TOBN(0xe53cba33, 0x6993c80b)}}, {{TOBN(0x45466063, 0xded40d23), TOBN(0x3d5f1f4d, 0x54908e25), TOBN(0x9ebefe62, 0x403c3c31), TOBN(0x274ea0b5, 0x0672a624)}, {TOBN(0xff818d99, 0x451d1b71), TOBN(0x80e82643, 0x8f79cf79), TOBN(0xa165df13, 0x73ce37f5), TOBN(0xa744ef4f, 0xfe3a21fd)}}, {{TOBN(0x73f1e7f5, 0xcf551396), TOBN(0xc616898e, 0x868c676b), TOBN(0x671c28c7, 0x8c442c36), TOBN(0xcfe5e558, 0x5e0a317d)}, {TOBN(0x1242d818, 0x7051f476), TOBN(0x56fad2a6, 0x14f03442), TOBN(0x262068bc, 0x0a44d0f6), TOBN(0xdfa2cd6e, 0xce6edf4e)}}, {{TOBN(0x0f43813a, 0xd15d1517), TOBN(0x61214cb2, 0x377d44f5), TOBN(0xd399aa29, 0xc639b35f), TOBN(0x42136d71, 0x54c51c19)}, {TOBN(0x9774711b, 0x08417221), TOBN(0x0a5546b3, 0x52545a57), TOBN(0x80624c41, 0x1150582d), TOBN(0x9ec5c418, 0xfbc555bc)}}, {{TOBN(0x2c87dcad, 0x771849f1), TOBN(0xb0c932c5, 0x01d7bf6f), TOBN(0x6aa5cd3e, 0x89116eb2), TOBN(0xd378c25a, 0x51ca7bd3)}, {TOBN(0xc612a0da, 0x9e6e3e31), TOBN(0x0417a54d, 0xb68ad5d0), TOBN(0x00451e4a, 0x22c6edb8), TOBN(0x9fbfe019, 0xb42827ce)}}, {{TOBN(0x2fa92505, 0xba9384a2), TOBN(0x21b8596e, 0x64ad69c1), TOBN(0x8f4fcc49, 0x983b35a6), TOBN(0xde093760, 0x72754672)}, {TOBN(0x2f14ccc8, 0xf7bffe6d), TOBN(0x27566bff, 0x5d94263d), TOBN(0xb5b4e9c6, 0x2df3ec30), TOBN(0x94f1d7d5, 0x3e6ea6ba)}}, {{TOBN(0x97b7851a, 0xaaca5e9b), TOBN(0x518aa521, 0x56713b97), TOBN(0x3357e8c7, 0x150a61f6), TOBN(0x7842e7e2, 0xec2c2b69)}, {TOBN(0x8dffaf65, 0x6868a548), TOBN(0xd963bd82, 0xe068fc81), TOBN(0x64da5c8b, 0x65917733), TOBN(0x927090ff, 0x7b247328)}}}, {{{TOBN(0x214bc9a7, 0xd298c241), TOBN(0xe3b697ba, 0x56807cfd), TOBN(0xef1c7802, 0x4564eadb), TOBN(0xdde8cdcf, 0xb48149c5)}, {TOBN(0x946bf0a7, 0x5a4d2604), TOBN(0x27154d7f, 0x6c1538af), TOBN(0x95cc9230, 0xde5b1fcc), TOBN(0xd88519e9, 0x66864f82)}}, {{TOBN(0xb828dd1a, 0x7cb1282c), TOBN(0xa08d7626, 0xbe46973a), TOBN(0x6baf8d40, 0xe708d6b2), TOBN(0x72571fa1, 0x4daeb3f3)}, {TOBN(0x85b1732f, 0xf22dfd98), TOBN(0x87ab01a7, 0x0087108d), TOBN(0xaaaafea8, 0x5988207a), TOBN(0xccc832f8, 0x69f00755)}}, {{TOBN(0x964d950e, 0x36ff3bf0), TOBN(0x8ad20f6f, 0xf0b34638), TOBN(0x4d9177b3, 0xb5d7585f), TOBN(0xcf839760, 0xef3f019f)}, {TOBN(0x582fc5b3, 0x8288c545), TOBN(0x2f8e4e9b, 0x13116bd1), TOBN(0xf91e1b2f, 0x332120ef), TOBN(0xcf568724, 0x2a17dd23)}}, {{TOBN(0x488f1185, 0xca8d9d1a), TOBN(0xadf2c77d, 0xd987ded2), TOBN(0x5f3039f0, 0x60c46124), TOBN(0xe5d70b75, 0x71e095f4)}, {TOBN(0x82d58650, 0x6260e70f), TOBN(0x39d75ea7, 0xf750d105), TOBN(0x8cf3d0b1, 0x75bac364), TOBN(0xf3a7564d, 0x21d01329)}}, {{TOBN(0x182f04cd, 0x2f52d2a7), TOBN(0x4fde149a, 0xe2df565a), TOBN(0xb80c5eec, 0xa79fb2f7), TOBN(0xab491d7b, 0x22ddc897)}, {TOBN(0x99d76c18, 0xc6312c7f), TOBN(0xca0d5f3d, 0x6aa41a57), TOBN(0x71207325, 0xd15363a0), TOBN(0xe82aa265, 0xbeb252c2)}}, {{TOBN(0x94ab4700, 0xec3128c2), TOBN(0x6c76d862, 0x8e383f49), TOBN(0xdc36b150, 0xc03024eb), TOBN(0xfb439477, 0x53daac69)}, {TOBN(0xfc68764a, 0x8dc79623), TOBN(0x5b86995d, 0xb440fbb2), TOBN(0xd66879bf, 0xccc5ee0d), TOBN(0x05228942, 0x95aa8bd3)}}, {{TOBN(0xb51a40a5, 0x1e6a75c1), TOBN(0x24327c76, 0x0ea7d817), 
TOBN(0x06630182, 0x07774597), TOBN(0xd6fdbec3, 0x97fa7164)}, {TOBN(0x20c99dfb, 0x13c90f48), TOBN(0xd6ac5273, 0x686ef263), TOBN(0xc6a50bdc, 0xfef64eeb), TOBN(0xcd87b281, 0x86fdfc32)}}, {{TOBN(0xb24aa43e, 0x3fcd3efc), TOBN(0xdd26c034, 0xb8088e9a), TOBN(0xa5ef4dc9, 0xbd3d46ea), TOBN(0xa2f99d58, 0x8a4c6a6f)}, {TOBN(0xddabd355, 0x2f1da46c), TOBN(0x72c3f8ce, 0x1afacdd1), TOBN(0xd90c4eee, 0x92d40578), TOBN(0xd28bb41f, 0xca623b94)}}, {{TOBN(0x50fc0711, 0x745edc11), TOBN(0x9dd9ad7d, 0x3dc87558), TOBN(0xce6931fb, 0xb49d1e64), TOBN(0x6c77a0a2, 0xc98bd0f9)}, {TOBN(0x62b9a629, 0x6baf7cb1), TOBN(0xcf065f91, 0xccf72d22), TOBN(0x7203cce9, 0x79639071), TOBN(0x09ae4885, 0xf9cb732f)}}, {{TOBN(0x5e7c3bec, 0xee8314f3), TOBN(0x1c068aed, 0xdbea298f), TOBN(0x08d381f1, 0x7c80acec), TOBN(0x03b56be8, 0xe330495b)}, {TOBN(0xaeffb8f2, 0x9222882d), TOBN(0x95ff38f6, 0xc4af8bf7), TOBN(0x50e32d35, 0x1fc57d8c), TOBN(0x6635be52, 0x17b444f0)}}, {{TOBN(0x04d15276, 0xa5177900), TOBN(0x4e1dbb47, 0xf6858752), TOBN(0x5b475622, 0xc615796c), TOBN(0xa6fa0387, 0x691867bf)}, {TOBN(0xed7f5d56, 0x2844c6d0), TOBN(0xc633cf9b, 0x03a2477d), TOBN(0xf6be5c40, 0x2d3721d6), TOBN(0xaf312eb7, 0xe9fd68e6)}}, {{TOBN(0x242792d2, 0xe7417ce1), TOBN(0xff42bc71, 0x970ee7f5), TOBN(0x1ff4dc6d, 0x5c67a41e), TOBN(0x77709b7b, 0x20882a58)}, {TOBN(0x3554731d, 0xbe217f2c), TOBN(0x2af2a8cd, 0x5bb72177), TOBN(0x58eee769, 0x591dd059), TOBN(0xbb2930c9, 0x4bba6477)}}, {{TOBN(0x863ee047, 0x7d930cfc), TOBN(0x4c262ad1, 0x396fd1f4), TOBN(0xf4765bc8, 0x039af7e1), TOBN(0x2519834b, 0x5ba104f6)}, {TOBN(0x7cd61b4c, 0xd105f961), TOBN(0xa5415da5, 0xd63bca54), TOBN(0x778280a0, 0x88a1f17c), TOBN(0xc4968949, 0x2329512c)}}, {{TOBN(0x174a9126, 0xcecdaa7a), TOBN(0xfc8c7e0e, 0x0b13247b), TOBN(0x29c110d2, 0x3484c1c4), TOBN(0xf8eb8757, 0x831dfc3b)}, {TOBN(0x022f0212, 0xc0067452), TOBN(0x3f6f69ee, 0x7b9b926c), TOBN(0x09032da0, 0xef42daf4), TOBN(0x79f00ade, 0x83f80de4)}}, {{TOBN(0x6210db71, 0x81236c97), TOBN(0x74f7685b, 0x3ee0781f), TOBN(0x4df7da7b, 0xa3e41372), TOBN(0x2aae38b1, 0xb1a1553e)}, {TOBN(0x1688e222, 0xf6dd9d1b), TOBN(0x57695448, 0x5b8b6487), TOBN(0x478d2127, 0x4b2edeaa), TOBN(0xb2818fa5, 0x1e85956a)}}, {{TOBN(0x1e6addda, 0xf176f2c0), TOBN(0x01ca4604, 0xe2572658), TOBN(0x0a404ded, 0x85342ffb), TOBN(0x8cf60f96, 0x441838d6)}, {TOBN(0x9bbc691c, 0xc9071c4a), TOBN(0xfd588744, 0x34442803), TOBN(0x97101c85, 0x809c0d81), TOBN(0xa7fb754c, 0x8c456f7f)}}, {{TOBN(0xc95f3c5c, 0xd51805e1), TOBN(0xab4ccd39, 0xb299dca8), TOBN(0x3e03d20b, 0x47eaf500), TOBN(0xfa3165c1, 0xd7b80893)}, {TOBN(0x005e8b54, 0xe160e552), TOBN(0xdc4972ba, 0x9019d11f), TOBN(0x21a6972e, 0x0c9a4a7a), TOBN(0xa52c258f, 0x37840fd7)}}, {{TOBN(0xf8559ff4, 0xc1e99d81), TOBN(0x08e1a7d6, 0xa3c617c0), TOBN(0xb398fd43, 0x248c6ba7), TOBN(0x6ffedd91, 0xd1283794)}, {TOBN(0x8a6a59d2, 0xd629d208), TOBN(0xa9d141d5, 0x3490530e), TOBN(0x42f6fc18, 0x38505989), TOBN(0x09bf250d, 0x479d94ee)}}, {{TOBN(0x223ad3b1, 0xb3822790), TOBN(0x6c5926c0, 0x93b8971c), TOBN(0x609efc7e, 0x75f7fa62), TOBN(0x45d66a6d, 0x1ec2d989)}, {TOBN(0x4422d663, 0x987d2792), TOBN(0x4a73caad, 0x3eb31d2b), TOBN(0xf06c2ac1, 0xa32cb9e6), TOBN(0xd9445c5f, 0x91aeba84)}}, {{TOBN(0x6af7a1d5, 0xaf71013f), TOBN(0xe68216e5, 0x0bedc946), TOBN(0xf4cba30b, 0xd27370a0), TOBN(0x7981afbf, 0x870421cc)}, {TOBN(0x02496a67, 0x9449f0e1), TOBN(0x86cfc4be, 0x0a47edae), TOBN(0x3073c936, 0xb1feca22), TOBN(0xf5694612, 0x03f8f8fb)}}, {{TOBN(0xd063b723, 0x901515ea), TOBN(0x4c6c77a5, 0x749cf038), TOBN(0x6361e360, 0xab9e5059), TOBN(0x596cf171, 0xa76a37c0)}, {TOBN(0x800f53fa, 0x6530ae7a), TOBN(0x0f5e631e, 
0x0792a7a6), TOBN(0x5cc29c24, 0xefdb81c9), TOBN(0xa269e868, 0x3f9c40ba)}}, {{TOBN(0xec14f9e1, 0x2cb7191e), TOBN(0x78ea1bd8, 0xe5b08ea6), TOBN(0x3c65aa9b, 0x46332bb9), TOBN(0x84cc22b3, 0xbf80ce25)}, {TOBN(0x0098e9e9, 0xd49d5bf1), TOBN(0xcd4ec1c6, 0x19087da4), TOBN(0x3c9d07c5, 0xaef6e357), TOBN(0x839a0268, 0x9f8f64b8)}}, {{TOBN(0xc5e9eb62, 0xc6d8607f), TOBN(0x759689f5, 0x6aa995e4), TOBN(0x70464669, 0xbbb48317), TOBN(0x921474bf, 0xe402417d)}, {TOBN(0xcabe135b, 0x2a354c8c), TOBN(0xd51e52d2, 0x812fa4b5), TOBN(0xec741096, 0x53311fe8), TOBN(0x4f774535, 0xb864514b)}}, {{TOBN(0xbcadd671, 0x5bde48f8), TOBN(0xc9703873, 0x2189bc7d), TOBN(0x5d45299e, 0xc709ee8a), TOBN(0xd1287ee2, 0x845aaff8)}, {TOBN(0x7d1f8874, 0xdb1dbf1f), TOBN(0xea46588b, 0x990c88d6), TOBN(0x60ba649a, 0x84368313), TOBN(0xd5fdcbce, 0x60d543ae)}}, {{TOBN(0x90b46d43, 0x810d5ab0), TOBN(0x6739d8f9, 0x04d7e5cc), TOBN(0x021c1a58, 0x0d337c33), TOBN(0x00a61162, 0x68e67c40)}, {TOBN(0x95ef413b, 0x379f0a1f), TOBN(0xfe126605, 0xe9e2ab95), TOBN(0x67578b85, 0x2f5f199c), TOBN(0xf5c00329, 0x2cb84913)}}, {{TOBN(0xf7956430, 0x37577dd8), TOBN(0x83b82af4, 0x29c5fe88), TOBN(0x9c1bea26, 0xcdbdc132), TOBN(0x589fa086, 0x9c04339e)}, {TOBN(0x033e9538, 0xb13799df), TOBN(0x85fa8b21, 0xd295d034), TOBN(0xdf17f73f, 0xbd9ddcca), TOBN(0xf32bd122, 0xddb66334)}}, {{TOBN(0x55ef88a7, 0x858b044c), TOBN(0x1f0d69c2, 0x5aa9e397), TOBN(0x55fd9cc3, 0x40d85559), TOBN(0xc774df72, 0x7785ddb2)}, {TOBN(0x5dcce9f6, 0xd3bd2e1c), TOBN(0xeb30da20, 0xa85dfed0), TOBN(0x5ed7f5bb, 0xd3ed09c4), TOBN(0x7d42a35c, 0x82a9c1bd)}}, {{TOBN(0xcf3de995, 0x9890272d), TOBN(0x75f3432a, 0x3e713a10), TOBN(0x5e13479f, 0xe28227b8), TOBN(0xb8561ea9, 0xfefacdc8)}, {TOBN(0xa6a297a0, 0x8332aafd), TOBN(0x9b0d8bb5, 0x73809b62), TOBN(0xd2fa1cfd, 0x0c63036f), TOBN(0x7a16eb55, 0xbd64bda8)}}, {{TOBN(0x3f5cf5f6, 0x78e62ddc), TOBN(0x2267c454, 0x07fd752b), TOBN(0x5e361b6b, 0x5e437bbe), TOBN(0x95c59501, 0x8354e075)}, {TOBN(0xec725f85, 0xf2b254d9), TOBN(0x844b617d, 0x2cb52b4e), TOBN(0xed8554f5, 0xcf425fb5), TOBN(0xab67703e, 0x2af9f312)}}, {{TOBN(0x4cc34ec1, 0x3cf48283), TOBN(0xb09daa25, 0x9c8a705e), TOBN(0xd1e9d0d0, 0x5b7d4f84), TOBN(0x4df6ef64, 0xdb38929d)}, {TOBN(0xe16b0763, 0xaa21ba46), TOBN(0xc6b1d178, 0xa293f8fb), TOBN(0x0ff5b602, 0xd520aabf), TOBN(0x94d671bd, 0xc339397a)}}, {{TOBN(0x7c7d98cf, 0x4f5792fa), TOBN(0x7c5e0d67, 0x11215261), TOBN(0x9b19a631, 0xa7c5a6d4), TOBN(0xc8511a62, 0x7a45274d)}, {TOBN(0x0c16621c, 0xa5a60d99), TOBN(0xf7fbab88, 0xcf5e48cb), TOBN(0xab1e6ca2, 0xf7ddee08), TOBN(0x83bd08ce, 0xe7867f3c)}}, {{TOBN(0xf7e48e8a, 0x2ac13e27), TOBN(0x4494f6df, 0x4eb1a9f5), TOBN(0xedbf84eb, 0x981f0a62), TOBN(0x49badc32, 0x536438f0)}, {TOBN(0x50bea541, 0x004f7571), TOBN(0xbac67d10, 0xdf1c94ee), TOBN(0x253d73a1, 0xb727bc31), TOBN(0xb3d01cf2, 0x30686e28)}}, {{TOBN(0x51b77b1b, 0x55fd0b8b), TOBN(0xa099d183, 0xfeec3173), TOBN(0x202b1fb7, 0x670e72b7), TOBN(0xadc88b33, 0xa8e1635f)}, {TOBN(0x34e8216a, 0xf989d905), TOBN(0xc2e68d20, 0x29b58d01), TOBN(0x11f81c92, 0x6fe55a93), TOBN(0x15f1462a, 0x8f296f40)}}, {{TOBN(0x1915d375, 0xea3d62f2), TOBN(0xa17765a3, 0x01c8977d), TOBN(0x7559710a, 0xe47b26f6), TOBN(0xe0bd29c8, 0x535077a5)}, {TOBN(0x615f976d, 0x08d84858), TOBN(0x370dfe85, 0x69ced5c1), TOBN(0xbbc7503c, 0xa734fa56), TOBN(0xfbb9f1ec, 0x91ac4574)}}, {{TOBN(0x95d7ec53, 0x060dd7ef), TOBN(0xeef2dacd, 0x6e657979), TOBN(0x54511af3, 0xe2a08235), TOBN(0x1e324aa4, 0x1f4aea3d)}, {TOBN(0x550e7e71, 0xe6e67671), TOBN(0xbccd5190, 0xbf52faf7), TOBN(0xf880d316, 0x223cc62a), TOBN(0x0d402c7e, 0x2b32eb5d)}}, {{TOBN(0xa40bc039, 0x306a5a3b), 
TOBN(0x4e0a41fd, 0x96783a1b), TOBN(0xa1e8d39a, 0x0253cdd4), TOBN(0x6480be26, 0xc7388638)}, {TOBN(0xee365e1d, 0x2285f382), TOBN(0x188d8d8f, 0xec0b5c36), TOBN(0x34ef1a48, 0x1f0f4d82), TOBN(0x1a8f43e1, 0xa487d29a)}}, {{TOBN(0x8168226d, 0x77aefb3a), TOBN(0xf69a751e, 0x1e72c253), TOBN(0x8e04359a, 0xe9594df1), TOBN(0x475ffd7d, 0xd14c0467)}, {TOBN(0xb5a2c2b1, 0x3844e95c), TOBN(0x85caf647, 0xdd12ef94), TOBN(0x1ecd2a9f, 0xf1063d00), TOBN(0x1dd2e229, 0x23843311)}}, {{TOBN(0x38f0e09d, 0x73d17244), TOBN(0x3ede7746, 0x8fc653f1), TOBN(0xae4459f5, 0xdc20e21c), TOBN(0x00db2ffa, 0x6a8599ea)}, {TOBN(0x11682c39, 0x30cfd905), TOBN(0x4934d074, 0xa5c112a6), TOBN(0xbdf063c5, 0x568bfe95), TOBN(0x779a440a, 0x016c441a)}}, {{TOBN(0x0c23f218, 0x97d6fbdc), TOBN(0xd3a5cd87, 0xe0776aac), TOBN(0xcee37f72, 0xd712e8db), TOBN(0xfb28c70d, 0x26f74e8d)}, {TOBN(0xffe0c728, 0xb61301a0), TOBN(0xa6282168, 0xd3724354), TOBN(0x7ff4cb00, 0x768ffedc), TOBN(0xc51b3088, 0x03b02de9)}}, {{TOBN(0xa5a8147c, 0x3902dda5), TOBN(0x35d2f706, 0xfe6973b4), TOBN(0x5ac2efcf, 0xc257457e), TOBN(0x933f48d4, 0x8700611b)}, {TOBN(0xc365af88, 0x4912beb2), TOBN(0x7f5a4de6, 0x162edf94), TOBN(0xc646ba7c, 0x0c32f34b), TOBN(0x632c6af3, 0xb2091074)}}, {{TOBN(0x58d4f2e3, 0x753e43a9), TOBN(0x70e1d217, 0x24d4e23f), TOBN(0xb24bf729, 0xafede6a6), TOBN(0x7f4a94d8, 0x710c8b60)}, {TOBN(0xaad90a96, 0x8d4faa6a), TOBN(0xd9ed0b32, 0xb066b690), TOBN(0x52fcd37b, 0x78b6dbfd), TOBN(0x0b64615e, 0x8bd2b431)}}, {{TOBN(0x228e2048, 0xcfb9fad5), TOBN(0xbeaa386d, 0x240b76bd), TOBN(0x2d6681c8, 0x90dad7bc), TOBN(0x3e553fc3, 0x06d38f5e)}, {TOBN(0xf27cdb9b, 0x9d5f9750), TOBN(0x3e85c52a, 0xd28c5b0e), TOBN(0x190795af, 0x5247c39b), TOBN(0x547831eb, 0xbddd6828)}}, {{TOBN(0xf327a227, 0x4a82f424), TOBN(0x36919c78, 0x7e47f89d), TOBN(0xe4783919, 0x43c7392c), TOBN(0xf101b9aa, 0x2316fefe)}, {TOBN(0xbcdc9e9c, 0x1c5009d2), TOBN(0xfb55ea13, 0x9cd18345), TOBN(0xf5b5e231, 0xa3ce77c7), TOBN(0xde6b4527, 0xd2f2cb3d)}}, {{TOBN(0x10f6a333, 0x9bb26f5f), TOBN(0x1e85db8e, 0x044d85b6), TOBN(0xc3697a08, 0x94197e54), TOBN(0x65e18cc0, 0xa7cb4ea8)}, {TOBN(0xa38c4f50, 0xa471fe6e), TOBN(0xf031747a, 0x2f13439c), TOBN(0x53c4a6ba, 0xc007318b), TOBN(0xa8da3ee5, 0x1deccb3d)}}, {{TOBN(0x0555b31c, 0x558216b1), TOBN(0x90c7810c, 0x2f79e6c2), TOBN(0x9b669f4d, 0xfe8eed3c), TOBN(0x70398ec8, 0xe0fac126)}, {TOBN(0xa96a449e, 0xf701b235), TOBN(0x0ceecdb3, 0xeb94f395), TOBN(0x285fc368, 0xd0cb7431), TOBN(0x0d37bb52, 0x16a18c64)}}, {{TOBN(0x05110d38, 0xb880d2dd), TOBN(0xa60f177b, 0x65930d57), TOBN(0x7da34a67, 0xf36235f5), TOBN(0x47f5e17c, 0x183816b9)}, {TOBN(0xc7664b57, 0xdb394af4), TOBN(0x39ba215d, 0x7036f789), TOBN(0x46d2ca0e, 0x2f27b472), TOBN(0xc42647ee, 0xf73a84b7)}}, {{TOBN(0x44bc7545, 0x64488f1d), TOBN(0xaa922708, 0xf4cf85d5), TOBN(0x721a01d5, 0x53e4df63), TOBN(0x649c0c51, 0x5db46ced)}, {TOBN(0x6bf0d64e, 0x3cffcb6c), TOBN(0xe3bf93fe, 0x50f71d96), TOBN(0x75044558, 0xbcc194a0), TOBN(0x16ae3372, 0x6afdc554)}}, {{TOBN(0xbfc01adf, 0x5ca48f3f), TOBN(0x64352f06, 0xe22a9b84), TOBN(0xcee54da1, 0xc1099e4a), TOBN(0xbbda54e8, 0xfa1b89c0)}, {TOBN(0x166a3df5, 0x6f6e55fb), TOBN(0x1ca44a24, 0x20176f88), TOBN(0x936afd88, 0xdfb7b5ff), TOBN(0xe34c2437, 0x8611d4a0)}}, {{TOBN(0x7effbb75, 0x86142103), TOBN(0x6704ba1b, 0x1f34fc4d), TOBN(0x7c2a468f, 0x10c1b122), TOBN(0x36b3a610, 0x8c6aace9)}, {TOBN(0xabfcc0a7, 0x75a0d050), TOBN(0x066f9197, 0x3ce33e32), TOBN(0xce905ef4, 0x29fe09be), TOBN(0x89ee25ba, 0xa8376351)}}, {{TOBN(0x2a3ede22, 0xfd29dc76), TOBN(0x7fd32ed9, 0x36f17260), TOBN(0x0cadcf68, 0x284b4126), TOBN(0x63422f08, 0xa7951fc8)}, {TOBN(0x562b24f4, 
0x0807e199), TOBN(0xfe9ce5d1, 0x22ad4490), TOBN(0xc2f51b10, 0x0db2b1b4), TOBN(0xeb3613ff, 0xe4541d0d)}}, {{TOBN(0xbd2c4a05, 0x2680813b), TOBN(0x527aa55d, 0x561b08d6), TOBN(0xa9f8a40e, 0xa7205558), TOBN(0xe3eea56f, 0x243d0bec)}, {TOBN(0x7b853817, 0xa0ff58b3), TOBN(0xb67d3f65, 0x1a69e627), TOBN(0x0b76bbb9, 0xa869b5d6), TOBN(0xa3afeb82, 0x546723ed)}}, {{TOBN(0x5f24416d, 0x3e554892), TOBN(0x8413b53d, 0x430e2a45), TOBN(0x99c56aee, 0x9032a2a0), TOBN(0x09432bf6, 0xeec367b1)}, {TOBN(0x552850c6, 0xdaf0ecc1), TOBN(0x49ebce55, 0x5bc92048), TOBN(0xdfb66ba6, 0x54811307), TOBN(0x1b84f797, 0x6f298597)}}, {{TOBN(0x79590481, 0x8d1d7a0d), TOBN(0xd9fabe03, 0x3a6fa556), TOBN(0xa40f9c59, 0xba9e5d35), TOBN(0xcb1771c1, 0xf6247577)}, {TOBN(0x542a47ca, 0xe9a6312b), TOBN(0xa34b3560, 0x552dd8c5), TOBN(0xfdf94de0, 0x0d794716), TOBN(0xd46124a9, 0x9c623094)}}, {{TOBN(0x56b7435d, 0x68afe8b4), TOBN(0x27f20540, 0x6c0d8ea1), TOBN(0x12b77e14, 0x73186898), TOBN(0xdbc3dd46, 0x7479490f)}, {TOBN(0x951a9842, 0xc03b0c05), TOBN(0x8b1b3bb3, 0x7921bc96), TOBN(0xa573b346, 0x2b202e0a), TOBN(0x77e4665d, 0x47254d56)}}, {{TOBN(0x08b70dfc, 0xd23e3984), TOBN(0xab86e8bc, 0xebd14236), TOBN(0xaa3e07f8, 0x57114ba7), TOBN(0x5ac71689, 0xab0ef4f2)}, {TOBN(0x88fca384, 0x0139d9af), TOBN(0x72733f88, 0x76644af0), TOBN(0xf122f72a, 0x65d74f4a), TOBN(0x13931577, 0xa5626c7a)}}, {{TOBN(0xd5b5d9eb, 0x70f8d5a4), TOBN(0x375adde7, 0xd7bbb228), TOBN(0x31e88b86, 0x0c1c0b32), TOBN(0xd1f568c4, 0x173edbaa)}, {TOBN(0x1592fc83, 0x5459df02), TOBN(0x2beac0fb, 0x0fcd9a7e), TOBN(0xb0a6fdb8, 0x1b473b0a), TOBN(0xe3224c6f, 0x0fe8fc48)}}, {{TOBN(0x680bd00e, 0xe87edf5b), TOBN(0x30385f02, 0x20e77cf5), TOBN(0xe9ab98c0, 0x4d42d1b2), TOBN(0x72d191d2, 0xd3816d77)}, {TOBN(0x1564daca, 0x0917d9e5), TOBN(0x394eab59, 0x1f8fed7f), TOBN(0xa209aa8d, 0x7fbb3896), TOBN(0x5564f3b9, 0xbe6ac98e)}}, {{TOBN(0xead21d05, 0xd73654ef), TOBN(0x68d1a9c4, 0x13d78d74), TOBN(0x61e01708, 0x6d4973a0), TOBN(0x83da3500, 0x46e6d32a)}, {TOBN(0x6a3dfca4, 0x68ae0118), TOBN(0xa1b9a4c9, 0xd02da069), TOBN(0x0b2ff9c7, 0xebab8302), TOBN(0x98af07c3, 0x944ba436)}}, {{TOBN(0x85997326, 0x995f0f9f), TOBN(0x467fade0, 0x71b58bc6), TOBN(0x47e4495a, 0xbd625a2b), TOBN(0xfdd2d01d, 0x33c3b8cd)}, {TOBN(0x2c38ae28, 0xc693f9fa), TOBN(0x48622329, 0x348f7999), TOBN(0x97bf738e, 0x2161f583), TOBN(0x15ee2fa7, 0x565e8cc9)}}, {{TOBN(0xa1a5c845, 0x5777e189), TOBN(0xcc10bee0, 0x456f2829), TOBN(0x8ad95c56, 0xda762bd5), TOBN(0x152e2214, 0xe9d91da8)}, {TOBN(0x975b0e72, 0x7cb23c74), TOBN(0xfd5d7670, 0xa90c66df), TOBN(0xb5b5b8ad, 0x225ffc53), TOBN(0xab6dff73, 0xfaded2ae)}}, {{TOBN(0xebd56781, 0x6f4cbe9d), TOBN(0x0ed8b249, 0x6a574bd7), TOBN(0x41c246fe, 0x81a881fa), TOBN(0x91564805, 0xc3db9c70)}, {TOBN(0xd7c12b08, 0x5b862809), TOBN(0x1facd1f1, 0x55858d7b), TOBN(0x7693747c, 0xaf09e92a), TOBN(0x3b69dcba, 0x189a425f)}}, {{TOBN(0x0be28e9f, 0x967365ef), TOBN(0x57300eb2, 0xe801f5c9), TOBN(0x93b8ac6a, 0xd583352f), TOBN(0xa2cf1f89, 0xcd05b2b7)}, {TOBN(0x7c0c9b74, 0x4dcc40cc), TOBN(0xfee38c45, 0xada523fb), TOBN(0xb49a4dec, 0x1099cc4d), TOBN(0x325c377f, 0x69f069c6)}}, {{TOBN(0xe12458ce, 0x476cc9ff), TOBN(0x580e0b6c, 0xc6d4cb63), TOBN(0xd561c8b7, 0x9072289b), TOBN(0x0377f264, 0xa619e6da)}, {TOBN(0x26685362, 0x88e591a5), TOBN(0xa453a7bd, 0x7523ca2b), TOBN(0x8a9536d2, 0xc1df4533), TOBN(0xc8e50f2f, 0xbe972f79)}}, {{TOBN(0xd433e50f, 0x6d3549cf), TOBN(0x6f33696f, 0xfacd665e), TOBN(0x695bfdac, 0xce11fcb4), TOBN(0x810ee252, 0xaf7c9860)}, {TOBN(0x65450fe1, 0x7159bb2c), TOBN(0xf7dfbebe, 0x758b357b), TOBN(0x2b057e74, 0xd69fea72), TOBN(0xd485717a, 0x92731745)}}}, 
{{{TOBN(0x896c42e8, 0xee36860c), TOBN(0xdaf04dfd, 0x4113c22d), TOBN(0x1adbb7b7, 0x44104213), TOBN(0xe5fd5fa1, 0x1fd394ea)}, {TOBN(0x68235d94, 0x1a4e0551), TOBN(0x6772cfbe, 0x18d10151), TOBN(0x276071e3, 0x09984523), TOBN(0xe4e879de, 0x5a56ba98)}}, {{TOBN(0xaaafafb0, 0x285b9491), TOBN(0x01a0be88, 0x1e4c705e), TOBN(0xff1d4f5d, 0x2ad9caab), TOBN(0x6e349a4a, 0xc37a233f)}, {TOBN(0xcf1c1246, 0x4a1c6a16), TOBN(0xd99e6b66, 0x29383260), TOBN(0xea3d4366, 0x5f6d5471), TOBN(0x36974d04, 0xff8cc89b)}}, {{TOBN(0xc26c49a1, 0xcfe89d80), TOBN(0xb42c026d, 0xda9c8371), TOBN(0xca6c013a, 0xdad066d2), TOBN(0xfb8f7228, 0x56a4f3ee)}, {TOBN(0x08b579ec, 0xd850935b), TOBN(0x34c1a74c, 0xd631e1b3), TOBN(0xcb5fe596, 0xac198534), TOBN(0x39ff21f6, 0xe1f24f25)}}, {{TOBN(0x27f29e14, 0x8f929057), TOBN(0x7a64ae06, 0xc0c853df), TOBN(0x256cd183, 0x58e9c5ce), TOBN(0x9d9cce82, 0xded092a5)}, {TOBN(0xcc6e5979, 0x6e93b7c7), TOBN(0xe1e47092, 0x31bb9e27), TOBN(0xb70b3083, 0xaa9e29a0), TOBN(0xbf181a75, 0x3785e644)}}, {{TOBN(0xf53f2c65, 0x8ead09f7), TOBN(0x1335e1d5, 0x9780d14d), TOBN(0x69cc20e0, 0xcd1b66bc), TOBN(0x9b670a37, 0xbbe0bfc8)}, {TOBN(0xce53dc81, 0x28efbeed), TOBN(0x0c74e77c, 0x8326a6e5), TOBN(0x3604e0d2, 0xb88e9a63), TOBN(0xbab38fca, 0x13dc2248)}}, {{TOBN(0x8ed6e8c8, 0x5c0a3f1e), TOBN(0xbcad2492, 0x7c87c37f), TOBN(0xfdfb62bb, 0x9ee3b78d), TOBN(0xeba8e477, 0xcbceba46)}, {TOBN(0x37d38cb0, 0xeeaede4b), TOBN(0x0bc498e8, 0x7976deb6), TOBN(0xb2944c04, 0x6b6147fb), TOBN(0x8b123f35, 0xf71f9609)}}, {{TOBN(0xa155dcc7, 0xde79dc24), TOBN(0xf1168a32, 0x558f69cd), TOBN(0xbac21595, 0x0d1850df), TOBN(0x15c8295b, 0xb204c848)}, {TOBN(0xf661aa36, 0x7d8184ff), TOBN(0xc396228e, 0x30447bdb), TOBN(0x11cd5143, 0xbde4a59e), TOBN(0xe3a26e3b, 0x6beab5e6)}}, {{TOBN(0xd3b3a13f, 0x1402b9d0), TOBN(0x573441c3, 0x2c7bc863), TOBN(0x4b301ec4, 0x578c3e6e), TOBN(0xc26fc9c4, 0x0adaf57e)}, {TOBN(0x96e71bfd, 0x7493cea3), TOBN(0xd05d4b3f, 0x1af81456), TOBN(0xdaca2a8a, 0x6a8c608f), TOBN(0x53ef07f6, 0x0725b276)}}, {{TOBN(0x07a5fbd2, 0x7824fc56), TOBN(0x34675218, 0x13289077), TOBN(0x5bf69fd5, 0xe0c48349), TOBN(0xa613ddd3, 0xb6aa7875)}, {TOBN(0x7f78c19c, 0x5450d866), TOBN(0x46f4409c, 0x8f84a481), TOBN(0x9f1d1928, 0x90fce239), TOBN(0x016c4168, 0xb2ce44b9)}}, {{TOBN(0xbae023f0, 0xc7435978), TOBN(0xb152c888, 0x20e30e19), TOBN(0x9c241645, 0xe3fa6faf), TOBN(0x735d95c1, 0x84823e60)}, {TOBN(0x03197573, 0x03955317), TOBN(0x0b4b02a9, 0xf03b4995), TOBN(0x076bf559, 0x70274600), TOBN(0x32c5cc53, 0xaaf57508)}}, {{TOBN(0xe8af6d1f, 0x60624129), TOBN(0xb7bc5d64, 0x9a5e2b5e), TOBN(0x3814b048, 0x5f082d72), TOBN(0x76f267f2, 0xce19677a)}, {TOBN(0x626c630f, 0xb36eed93), TOBN(0x55230cd7, 0x3bf56803), TOBN(0x78837949, 0xce2736a0), TOBN(0x0d792d60, 0xaa6c55f1)}}, {{TOBN(0x0318dbfd, 0xd5c7c5d2), TOBN(0xb38f8da7, 0x072b342d), TOBN(0x3569bddc, 0x7b8de38a), TOBN(0xf25b5887, 0xa1c94842)}, {TOBN(0xb2d5b284, 0x2946ad60), TOBN(0x854f29ad, 0xe9d1707e), TOBN(0xaa5159dc, 0x2c6a4509), TOBN(0x899f94c0, 0x57189837)}}, {{TOBN(0xcf6adc51, 0xf4a55b03), TOBN(0x261762de, 0x35e3b2d5), TOBN(0x4cc43012, 0x04827b51), TOBN(0xcd22a113, 0xc6021442)}, {TOBN(0xce2fd61a, 0x247c9569), TOBN(0x59a50973, 0xd152beca), TOBN(0x6c835a11, 0x63a716d4), TOBN(0xc26455ed, 0x187dedcf)}}, {{TOBN(0x27f536e0, 0x49ce89e7), TOBN(0x18908539, 0xcc890cb5), TOBN(0x308909ab, 0xd83c2aa1), TOBN(0xecd3142b, 0x1ab73bd3)}, {TOBN(0x6a85bf59, 0xb3f5ab84), TOBN(0x3c320a68, 0xf2bea4c6), TOBN(0xad8dc538, 0x6da4541f), TOBN(0xeaf34eb0, 0xb7c41186)}}, {{TOBN(0x1c780129, 0x977c97c4), TOBN(0x5ff9beeb, 0xc57eb9fa), TOBN(0xa24d0524, 0xc822c478), TOBN(0xfd8eec2a, 
0x461cd415)}, {TOBN(0xfbde194e, 0xf027458c), TOBN(0xb4ff5319, 0x1d1be115), TOBN(0x63f874d9, 0x4866d6f4), TOBN(0x35c75015, 0xb21ad0c9)}}, {{TOBN(0xa6b5c9d6, 0x46ac49d2), TOBN(0x42c77c0b, 0x83137aa9), TOBN(0x24d000fc, 0x68225a38), TOBN(0x0f63cfc8, 0x2fe1e907)}, {TOBN(0x22d1b01b, 0xc6441f95), TOBN(0x7d38f719, 0xec8e448f), TOBN(0x9b33fa5f, 0x787fb1ba), TOBN(0x94dcfda1, 0x190158df)}}, {{TOBN(0xc47cb339, 0x5f6d4a09), TOBN(0x6b4f355c, 0xee52b826), TOBN(0x3d100f5d, 0xf51b930a), TOBN(0xf4512fac, 0x9f668f69)}, {TOBN(0x546781d5, 0x206c4c74), TOBN(0xd021d4d4, 0xcb4d2e48), TOBN(0x494a54c2, 0xca085c2d), TOBN(0xf1dbaca4, 0x520850a8)}}, {{TOBN(0x63c79326, 0x490a1aca), TOBN(0xcb64dd9c, 0x41526b02), TOBN(0xbb772591, 0xa2979258), TOBN(0x3f582970, 0x48d97846)}, {TOBN(0xd66b70d1, 0x7c213ba7), TOBN(0xc28febb5, 0xe8a0ced4), TOBN(0x6b911831, 0xc10338c1), TOBN(0x0d54e389, 0xbf0126f3)}}, {{TOBN(0x7048d460, 0x4af206ee), TOBN(0x786c88f6, 0x77e97cb9), TOBN(0xd4375ae1, 0xac64802e), TOBN(0x469bcfe1, 0xd53ec11c)}, {TOBN(0xfc9b340d, 0x47062230), TOBN(0xe743bb57, 0xc5b4a3ac), TOBN(0xfe00b4aa, 0x59ef45ac), TOBN(0x29a4ef23, 0x59edf188)}}, {{TOBN(0x40242efe, 0xb483689b), TOBN(0x2575d3f6, 0x513ac262), TOBN(0xf30037c8, 0x0ca6db72), TOBN(0xc9fcce82, 0x98864be2)}, {TOBN(0x84a112ff, 0x0149362d), TOBN(0x95e57582, 0x1c4ae971), TOBN(0x1fa4b1a8, 0x945cf86c), TOBN(0x4525a734, 0x0b024a2f)}}, {{TOBN(0xe76c8b62, 0x8f338360), TOBN(0x483ff593, 0x28edf32b), TOBN(0x67e8e90a, 0x298b1aec), TOBN(0x9caab338, 0x736d9a21)}, {TOBN(0x5c09d2fd, 0x66892709), TOBN(0x2496b4dc, 0xb55a1d41), TOBN(0x93f5fb1a, 0xe24a4394), TOBN(0x08c75049, 0x6fa8f6c1)}}, {{TOBN(0xcaead1c2, 0xc905d85f), TOBN(0xe9d7f790, 0x0733ae57), TOBN(0x24c9a65c, 0xf07cdd94), TOBN(0x7389359c, 0xa4b55931)}, {TOBN(0xf58709b7, 0x367e45f7), TOBN(0x1f203067, 0xcb7e7adc), TOBN(0x82444bff, 0xc7b72818), TOBN(0x07303b35, 0xbaac8033)}}, {{TOBN(0x1e1ee4e4, 0xd13b7ea1), TOBN(0xe6489b24, 0xe0e74180), TOBN(0xa5f2c610, 0x7e70ef70), TOBN(0xa1655412, 0xbdd10894)}, {TOBN(0x555ebefb, 0x7af4194e), TOBN(0x533c1c3c, 0x8e89bd9c), TOBN(0x735b9b57, 0x89895856), TOBN(0x15fb3cd2, 0x567f5c15)}}, {{TOBN(0x057fed45, 0x526f09fd), TOBN(0xe8a4f10c, 0x8128240a), TOBN(0x9332efc4, 0xff2bfd8d), TOBN(0x214e77a0, 0xbd35aa31)}, {TOBN(0x32896d73, 0x14faa40e), TOBN(0x767867ec, 0x01e5f186), TOBN(0xc9adf8f1, 0x17a1813e), TOBN(0xcb6cda78, 0x54741795)}}, {{TOBN(0xb7521b6d, 0x349d51aa), TOBN(0xf56b5a9e, 0xe3c7b8e9), TOBN(0xc6f1e5c9, 0x32a096df), TOBN(0x083667c4, 0xa3635024)}, {TOBN(0x365ea135, 0x18087f2f), TOBN(0xf1b8eaac, 0xd136e45d), TOBN(0xc8a0e484, 0x73aec989), TOBN(0xd75a324b, 0x142c9259)}}, {{TOBN(0xb7b4d001, 0x01dae185), TOBN(0x45434e0b, 0x9b7a94bc), TOBN(0xf54339af, 0xfbd8cb0b), TOBN(0xdcc4569e, 0xe98ef49e)}, {TOBN(0x7789318a, 0x09a51299), TOBN(0x81b4d206, 0xb2b025d8), TOBN(0xf64aa418, 0xfae85792), TOBN(0x3e50258f, 0xacd7baf7)}}, {{TOBN(0xdce84cdb, 0x2996864b), TOBN(0xa2e67089, 0x1f485fa4), TOBN(0xb28b2bb6, 0x534c6a5a), TOBN(0x31a7ec6b, 0xc94b9d39)}, {TOBN(0x1d217766, 0xd6bc20da), TOBN(0x4acdb5ec, 0x86761190), TOBN(0x68726328, 0x73701063), TOBN(0x4d24ee7c, 0x2128c29b)}}, {{TOBN(0xc072ebd3, 0xa19fd868), TOBN(0x612e481c, 0xdb8ddd3b), TOBN(0xb4e1d754, 0x1a64d852), TOBN(0x00ef95ac, 0xc4c6c4ab)}, {TOBN(0x1536d2ed, 0xaa0a6c46), TOBN(0x61294086, 0x43774790), TOBN(0x54af25e8, 0x343fda10), TOBN(0x9ff9d98d, 0xfd25d6f2)}}, {{TOBN(0x0746af7c, 0x468b8835), TOBN(0x977a31cb, 0x730ecea7), TOBN(0xa5096b80, 0xc2cf4a81), TOBN(0xaa986833, 0x6458c37a)}, {TOBN(0x6af29bf3, 0xa6bd9d34), TOBN(0x6a62fe9b, 0x33c5d854), TOBN(0x50e6c304, 0xb7133b5e), 
TOBN(0x04b60159, 0x7d6e6848)}}, {{TOBN(0x4cd296df, 0x5579bea4), TOBN(0x10e35ac8, 0x5ceedaf1), TOBN(0x04c4c5fd, 0xe3bcc5b1), TOBN(0x95f9ee8a, 0x89412cf9)}, {TOBN(0x2c9459ee, 0x82b6eb0f), TOBN(0x2e845765, 0x95c2aadd), TOBN(0x774a84ae, 0xd327fcfe), TOBN(0xd8c93722, 0x0368d476)}}, {{TOBN(0x0dbd5748, 0xf83e8a3b), TOBN(0xa579aa96, 0x8d2495f3), TOBN(0x535996a0, 0xae496e9b), TOBN(0x07afbfe9, 0xb7f9bcc2)}, {TOBN(0x3ac1dc6d, 0x5b7bd293), TOBN(0x3b592cff, 0x7022323d), TOBN(0xba0deb98, 0x9c0a3e76), TOBN(0x18e78e9f, 0x4b197acb)}}, {{TOBN(0x211cde10, 0x296c36ef), TOBN(0x7ee89672, 0x82c4da77), TOBN(0xb617d270, 0xa57836da), TOBN(0xf0cd9c31, 0x9cb7560b)}, {TOBN(0x01fdcbf7, 0xe455fe90), TOBN(0x3fb53cbb, 0x7e7334f3), TOBN(0x781e2ea4, 0x4e7de4ec), TOBN(0x8adab3ad, 0x0b384fd0)}}, {{TOBN(0x129eee2f, 0x53d64829), TOBN(0x7a471e17, 0xa261492b), TOBN(0xe4f9adb9, 0xe4cb4a2c), TOBN(0x3d359f6f, 0x97ba2c2d)}, {TOBN(0x346c6786, 0x0aacd697), TOBN(0x92b444c3, 0x75c2f8a8), TOBN(0xc79fa117, 0xd85df44e), TOBN(0x56782372, 0x398ddf31)}}, {{TOBN(0x60e690f2, 0xbbbab3b8), TOBN(0x4851f8ae, 0x8b04816b), TOBN(0xc72046ab, 0x9c92e4d2), TOBN(0x518c74a1, 0x7cf3136b)}, {TOBN(0xff4eb50a, 0xf9877d4c), TOBN(0x14578d90, 0xa919cabb), TOBN(0x8218f8c4, 0xac5eb2b6), TOBN(0xa3ccc547, 0x542016e4)}}, {{TOBN(0x025bf48e, 0x327f8349), TOBN(0xf3e97346, 0xf43cb641), TOBN(0xdc2bafdf, 0x500f1085), TOBN(0x57167876, 0x2f063055)}, {TOBN(0x5bd914b9, 0x411925a6), TOBN(0x7c078d48, 0xa1123de5), TOBN(0xee6bf835, 0x182b165d), TOBN(0xb11b5e5b, 0xba519727)}}, {{TOBN(0xe33ea76c, 0x1eea7b85), TOBN(0x2352b461, 0x92d4f85e), TOBN(0xf101d334, 0xafe115bb), TOBN(0xfabc1294, 0x889175a3)}, {TOBN(0x7f6bcdc0, 0x5233f925), TOBN(0xe0a802db, 0xe77fec55), TOBN(0xbdb47b75, 0x8069b659), TOBN(0x1c5e12de, 0xf98fbd74)}}, {{TOBN(0x869c58c6, 0x4b8457ee), TOBN(0xa5360f69, 0x4f7ea9f7), TOBN(0xe576c09f, 0xf460b38f), TOBN(0x6b70d548, 0x22b7fb36)}, {TOBN(0x3fd237f1, 0x3bfae315), TOBN(0x33797852, 0xcbdff369), TOBN(0x97df25f5, 0x25b516f9), TOBN(0x46f388f2, 0xba38ad2d)}}, {{TOBN(0x656c4658, 0x89d8ddbb), TOBN(0x8830b26e, 0x70f38ee8), TOBN(0x4320fd5c, 0xde1212b0), TOBN(0xc34f30cf, 0xe4a2edb2)}, {TOBN(0xabb131a3, 0x56ab64b8), TOBN(0x7f77f0cc, 0xd99c5d26), TOBN(0x66856a37, 0xbf981d94), TOBN(0x19e76d09, 0x738bd76e)}}, {{TOBN(0xe76c8ac3, 0x96238f39), TOBN(0xc0a482be, 0xa830b366), TOBN(0xb7b8eaff, 0x0b4eb499), TOBN(0x8ecd83bc, 0x4bfb4865)}, {TOBN(0x971b2cb7, 0xa2f3776f), TOBN(0xb42176a4, 0xf4b88adf), TOBN(0xb9617df5, 0xbe1fa446), TOBN(0x8b32d508, 0xcd031bd2)}}, {{TOBN(0x1c6bd47d, 0x53b618c0), TOBN(0xc424f46c, 0x6a227923), TOBN(0x7303ffde, 0xdd92d964), TOBN(0xe9712878, 0x71b5abf2)}, {TOBN(0x8f48a632, 0xf815561d), TOBN(0x85f48ff5, 0xd3c055d1), TOBN(0x222a1427, 0x7525684f), TOBN(0xd0d841a0, 0x67360cc3)}}, {{TOBN(0x4245a926, 0x0b9267c6), TOBN(0xc78913f1, 0xcf07f863), TOBN(0xaa844c8e, 0x4d0d9e24), TOBN(0xa42ad522, 0x3d5f9017)}, {TOBN(0xbd371749, 0xa2c989d5), TOBN(0x928292df, 0xe1f5e78e), TOBN(0x493b383e, 0x0a1ea6da), TOBN(0x5136fd8d, 0x13aee529)}}, {{TOBN(0x860c44b1, 0xf2c34a99), TOBN(0x3b00aca4, 0xbf5855ac), TOBN(0xabf6aaa0, 0xfaaf37be), TOBN(0x65f43682, 0x2a53ec08)}, {TOBN(0x1d9a5801, 0xa11b12e1), TOBN(0x78a7ab2c, 0xe20ed475), TOBN(0x0de1067e, 0x9a41e0d5), TOBN(0x30473f5f, 0x305023ea)}}, {{TOBN(0xdd3ae09d, 0x169c7d97), TOBN(0x5cd5baa4, 0xcfaef9cd), TOBN(0x5cd7440b, 0x65a44803), TOBN(0xdc13966a, 0x47f364de)}, {TOBN(0x077b2be8, 0x2b8357c1), TOBN(0x0cb1b4c5, 0xe9d57c2a), TOBN(0x7a4ceb32, 0x05ff363e), TOBN(0xf310fa4d, 0xca35a9ef)}}, {{TOBN(0xdbb7b352, 0xf97f68c6), TOBN(0x0c773b50, 0x0b02cf58), TOBN(0xea2e4821, 
0x3c1f96d9), TOBN(0xffb357b0, 0xeee01815)}, {TOBN(0xb9c924cd, 0xe0f28039), TOBN(0x0b36c95a, 0x46a3fbe4), TOBN(0x1faaaea4, 0x5e46db6c), TOBN(0xcae575c3, 0x1928aaff)}}, {{TOBN(0x7f671302, 0xa70dab86), TOBN(0xfcbd12a9, 0x71c58cfc), TOBN(0xcbef9acf, 0xbee0cb92), TOBN(0x573da0b9, 0xf8c1b583)}, {TOBN(0x4752fcfe, 0x0d41d550), TOBN(0xe7eec0e3, 0x2155cffe), TOBN(0x0fc39fcb, 0x545ae248), TOBN(0x522cb8d1, 0x8065f44e)}}, {{TOBN(0x263c962a, 0x70cbb96c), TOBN(0xe034362a, 0xbcd124a9), TOBN(0xf120db28, 0x3c2ae58d), TOBN(0xb9a38d49, 0xfef6d507)}, {TOBN(0xb1fd2a82, 0x1ff140fd), TOBN(0xbd162f30, 0x20aee7e0), TOBN(0x4e17a5d4, 0xcb251949), TOBN(0x2aebcb83, 0x4f7e1c3d)}}, {{TOBN(0x608eb25f, 0x937b0527), TOBN(0xf42e1e47, 0xeb7d9997), TOBN(0xeba699c4, 0xb8a53a29), TOBN(0x1f921c71, 0xe091b536)}, {TOBN(0xcce29e7b, 0x5b26bbd5), TOBN(0x7a8ef5ed, 0x3b61a680), TOBN(0xe5ef8043, 0xba1f1c7e), TOBN(0x16ea8217, 0x18158dda)}}, {{TOBN(0x01778a2b, 0x599ff0f9), TOBN(0x68a923d7, 0x8104fc6b), TOBN(0x5bfa44df, 0xda694ff3), TOBN(0x4f7199db, 0xf7667f12)}, {TOBN(0xc06d8ff6, 0xe46f2a79), TOBN(0x08b5dead, 0xe9f8131d), TOBN(0x02519a59, 0xabb4ce7c), TOBN(0xc4f710bc, 0xb42aec3e)}}, {{TOBN(0x3d77b057, 0x78bde41a), TOBN(0x6474bf80, 0xb4186b5a), TOBN(0x048b3f67, 0x88c65741), TOBN(0xc64519de, 0x03c7c154)}, {TOBN(0xdf073846, 0x0edfcc4f), TOBN(0x319aa737, 0x48f1aa6b), TOBN(0x8b9f8a02, 0xca909f77), TOBN(0x90258139, 0x7580bfef)}}, {{TOBN(0xd8bfd3ca, 0xc0c22719), TOBN(0xc60209e4, 0xc9ca151e), TOBN(0x7a744ab5, 0xd9a1a69c), TOBN(0x6de5048b, 0x14937f8f)}, {TOBN(0x171938d8, 0xe115ac04), TOBN(0x7df70940, 0x1c6b16d2), TOBN(0xa6aeb663, 0x7f8e94e7), TOBN(0xc130388e, 0x2a2cf094)}}, {{TOBN(0x1850be84, 0x77f54e6e), TOBN(0x9f258a72, 0x65d60fe5), TOBN(0xff7ff0c0, 0x6c9146d6), TOBN(0x039aaf90, 0xe63a830b)}, {TOBN(0x38f27a73, 0x9460342f), TOBN(0x4703148c, 0x3f795f8a), TOBN(0x1bb5467b, 0x9681a97e), TOBN(0x00931ba5, 0xecaeb594)}}, {{TOBN(0xcdb6719d, 0x786f337c), TOBN(0xd9c01cd2, 0xe704397d), TOBN(0x0f4a3f20, 0x555c2fef), TOBN(0x00452509, 0x7c0af223)}, {TOBN(0x54a58047, 0x84db8e76), TOBN(0x3bacf1aa, 0x93c8aa06), TOBN(0x11ca957c, 0xf7919422), TOBN(0x50641053, 0x78cdaa40)}}, {{TOBN(0x7a303874, 0x9f7144ae), TOBN(0x170c963f, 0x43d4acfd), TOBN(0x5e148149, 0x58ddd3ef), TOBN(0xa7bde582, 0x9e72dba8)}, {TOBN(0x0769da8b, 0x6fa68750), TOBN(0xfa64e532, 0x572e0249), TOBN(0xfcaadf9d, 0x2619ad31), TOBN(0x87882daa, 0xa7b349cd)}}, {{TOBN(0x9f6eb731, 0x6c67a775), TOBN(0xcb10471a, 0xefc5d0b1), TOBN(0xb433750c, 0xe1b806b2), TOBN(0x19c5714d, 0x57b1ae7e)}, {TOBN(0xc0dc8b7b, 0xed03fd3f), TOBN(0xdd03344f, 0x31bc194e), TOBN(0xa66c52a7, 0x8c6320b5), TOBN(0x8bc82ce3, 0xd0b6fd93)}}, {{TOBN(0xf8e13501, 0xb35f1341), TOBN(0xe53156dd, 0x25a43e42), TOBN(0xd3adf27e, 0x4daeb85c), TOBN(0xb81d8379, 0xbbeddeb5)}, {TOBN(0x1b0b546e, 0x2e435867), TOBN(0x9020eb94, 0xeba5dd60), TOBN(0x37d91161, 0x8210cb9d), TOBN(0x4c596b31, 0x5c91f1cf)}}, {{TOBN(0xb228a90f, 0x0e0b040d), TOBN(0xbaf02d82, 0x45ff897f), TOBN(0x2aac79e6, 0x00fa6122), TOBN(0x24828817, 0x8e36f557)}, {TOBN(0xb9521d31, 0x113ec356), TOBN(0x9e48861e, 0x15eff1f8), TOBN(0x2aa1d412, 0xe0d41715), TOBN(0x71f86203, 0x53f131b8)}}, {{TOBN(0xf60da8da, 0x3fd19408), TOBN(0x4aa716dc, 0x278d9d99), TOBN(0x394531f7, 0xa8c51c90), TOBN(0xb560b0e8, 0xf59db51c)}, {TOBN(0xa28fc992, 0xfa34bdad), TOBN(0xf024fa14, 0x9cd4f8bd), TOBN(0x5cf530f7, 0x23a9d0d3), TOBN(0x615ca193, 0xe28c9b56)}}, {{TOBN(0x6d2a483d, 0x6f73c51e), TOBN(0xa4cb2412, 0xea0dc2dd), TOBN(0x50663c41, 0x1eb917ff), TOBN(0x3d3a74cf, 0xeade299e)}, {TOBN(0x29b3990f, 0x4a7a9202), TOBN(0xa9bccf59, 0xa7b15c3d), 
TOBN(0x66a3ccdc, 0xa5df9208), TOBN(0x48027c14, 0x43f2f929)}}, {{TOBN(0xd385377c, 0x40b557f0), TOBN(0xe001c366, 0xcd684660), TOBN(0x1b18ed6b, 0xe2183a27), TOBN(0x879738d8, 0x63210329)}, {TOBN(0xa687c74b, 0xbda94882), TOBN(0xd1bbcc48, 0xa684b299), TOBN(0xaf6f1112, 0x863b3724), TOBN(0x6943d1b4, 0x2c8ce9f8)}}, {{TOBN(0xe044a3bb, 0x098cafb4), TOBN(0x27ed2310, 0x60d48caf), TOBN(0x542b5675, 0x3a31b84d), TOBN(0xcbf3dd50, 0xfcddbed7)}, {TOBN(0x25031f16, 0x41b1d830), TOBN(0xa7ec851d, 0xcb0c1e27), TOBN(0xac1c8fe0, 0xb5ae75db), TOBN(0xb24c7557, 0x08c52120)}}, {{TOBN(0x57f811dc, 0x1d4636c3), TOBN(0xf8436526, 0x681a9939), TOBN(0x1f6bc6d9, 0x9c81adb3), TOBN(0x840f8ac3, 0x5b7d80d4)}, {TOBN(0x731a9811, 0xf4387f1a), TOBN(0x7c501cd3, 0xb5156880), TOBN(0xa5ca4a07, 0xdfe68867), TOBN(0xf123d8f0, 0x5fcea120)}}, {{TOBN(0x1fbb0e71, 0xd607039e), TOBN(0x2b70e215, 0xcd3a4546), TOBN(0x32d2f01d, 0x53324091), TOBN(0xb796ff08, 0x180ab19b)}, {TOBN(0x32d87a86, 0x3c57c4aa), TOBN(0x2aed9caf, 0xb7c49a27), TOBN(0x9fb35eac, 0x31630d98), TOBN(0x338e8cdf, 0x5c3e20a3)}}, {{TOBN(0x80f16182, 0x66cde8db), TOBN(0x4e159980, 0x2d72fd36), TOBN(0xd7b8f13b, 0x9b6e5072), TOBN(0xf5213907, 0x3b7b5dc1)}, {TOBN(0x4d431f1d, 0x8ce4396e), TOBN(0x37a1a680, 0xa7ed2142), TOBN(0xbf375696, 0xd01aaf6b), TOBN(0xaa1c0c54, 0xe63aab66)}}, {{TOBN(0x3014368b, 0x4ed80940), TOBN(0x67e6d056, 0x7a6fcedd), TOBN(0x7c208c49, 0xca97579f), TOBN(0xfe3d7a81, 0xa23597f6)}, {TOBN(0x5e203202, 0x7e096ae2), TOBN(0xb1f3e1e7, 0x24b39366), TOBN(0x26da26f3, 0x2fdcdffc), TOBN(0x79422f1d, 0x6097be83)}}}, {{{TOBN(0x263a2cfb, 0x9db3b381), TOBN(0x9c3a2dee, 0xd4df0a4b), TOBN(0x728d06e9, 0x7d04e61f), TOBN(0x8b1adfbc, 0x42449325)}, {TOBN(0x6ec1d939, 0x7e053a1b), TOBN(0xee2be5c7, 0x66daf707), TOBN(0x80ba1e14, 0x810ac7ab), TOBN(0xdd2ae778, 0xf530f174)}}, {{TOBN(0x0435d97a, 0x205b9d8b), TOBN(0x6eb8f064, 0x056756d4), TOBN(0xd5e88a8b, 0xb6f8210e), TOBN(0x070ef12d, 0xec9fd9ea)}, {TOBN(0x4d849505, 0x3bcc876a), TOBN(0x12a75338, 0xa7404ce3), TOBN(0xd22b49e1, 0xb8a1db5e), TOBN(0xec1f2051, 0x14bfa5ad)}}, {{TOBN(0xadbaeb79, 0xb6828f36), TOBN(0x9d7a0258, 0x01bd5b9e), TOBN(0xeda01e0d, 0x1e844b0c), TOBN(0x4b625175, 0x887edfc9)}, {TOBN(0x14109fdd, 0x9669b621), TOBN(0x88a2ca56, 0xf6f87b98), TOBN(0xfe2eb788, 0x170df6bc), TOBN(0x0cea06f4, 0xffa473f9)}}, {{TOBN(0x43ed81b5, 0xc4e83d33), TOBN(0xd9f35879, 0x5efd488b), TOBN(0x164a620f, 0x9deb4d0f), TOBN(0xc6927bdb, 0xac6a7394)}, {TOBN(0x45c28df7, 0x9f9e0f03), TOBN(0x2868661e, 0xfcd7e1a9), TOBN(0x7cf4e8d0, 0xffa348f1), TOBN(0x6bd4c284, 0x398538e0)}}, {{TOBN(0x2618a091, 0x289a8619), TOBN(0xef796e60, 0x6671b173), TOBN(0x664e46e5, 0x9090c632), TOBN(0xa38062d4, 0x1e66f8fb)}, {TOBN(0x6c744a20, 0x0573274e), TOBN(0xd07b67e4, 0xa9271394), TOBN(0x391223b2, 0x6bdc0e20), TOBN(0xbe2d93f1, 0xeb0a05a7)}}, {{TOBN(0xf23e2e53, 0x3f36d141), TOBN(0xe84bb3d4, 0x4dfca442), TOBN(0xb804a48d, 0x6b7c023a), TOBN(0x1e16a8fa, 0x76431c3b)}, {TOBN(0x1b5452ad, 0xddd472e0), TOBN(0x7d405ee7, 0x0d1ee127), TOBN(0x50fc6f1d, 0xffa27599), TOBN(0x351ac53c, 0xbf391b35)}}, {{TOBN(0x7efa14b8, 0x4444896b), TOBN(0x64974d2f, 0xf94027fb), TOBN(0xefdcd0e8, 0xde84487d), TOBN(0x8c45b260, 0x2b48989b)}, {TOBN(0xa8fcbbc2, 0xd8463487), TOBN(0xd1b2b3f7, 0x3fbc476c), TOBN(0x21d005b7, 0xc8f443c0), TOBN(0x518f2e67, 0x40c0139c)}}, {{TOBN(0x56036e8c, 0x06d75fc1), TOBN(0x2dcf7bb7, 0x3249a89f), TOBN(0x81dd1d3d, 0xe245e7dd), TOBN(0xf578dc4b, 0xebd6e2a7)}, {TOBN(0x4c028903, 0xdf2ce7a0), TOBN(0xaee36288, 0x9c39afac), TOBN(0xdc847c31, 0x146404ab), TOBN(0x6304c0d8, 0xa4e97818)}}, {{TOBN(0xae51dca2, 0xa91f6791), 
TOBN(0x2abe4190, 0x9baa9efc), TOBN(0xd9d2e2f4, 0x559c7ac1), TOBN(0xe82f4b51, 0xfc9f773a)}, {TOBN(0xa7713027, 0x4073e81c), TOBN(0xc0276fac, 0xfbb596fc), TOBN(0x1d819fc9, 0xa684f70c), TOBN(0x29b47fdd, 0xc9f7b1e0)}}, {{TOBN(0x358de103, 0x459b1940), TOBN(0xec881c59, 0x5b013e93), TOBN(0x51574c93, 0x49532ad3), TOBN(0x2db1d445, 0xb37b46de)}, {TOBN(0xc6445b87, 0xdf239fd8), TOBN(0xc718af75, 0x151d24ee), TOBN(0xaea1c4a4, 0xf43c6259), TOBN(0x40c0e5d7, 0x70be02f7)}}, {{TOBN(0x6a4590f4, 0x721b33f2), TOBN(0x2124f1fb, 0xfedf04ea), TOBN(0xf8e53cde, 0x9745efe7), TOBN(0xe7e10432, 0x65f046d9)}, {TOBN(0xc3fca28e, 0xe4d0c7e6), TOBN(0x847e339a, 0x87253b1b), TOBN(0x9b595348, 0x3743e643), TOBN(0xcb6a0a0b, 0x4fd12fc5)}}, {{TOBN(0xfb6836c3, 0x27d02dcc), TOBN(0x5ad00982, 0x7a68bcc2), TOBN(0x1b24b44c, 0x005e912d), TOBN(0xcc83d20f, 0x811fdcfe)}, {TOBN(0x36527ec1, 0x666fba0c), TOBN(0x69948197, 0x14754635), TOBN(0xfcdcb1a8, 0x556da9c2), TOBN(0xa5934267, 0x81a732b2)}}, {{TOBN(0xec1214ed, 0xa714181d), TOBN(0x609ac13b, 0x6067b341), TOBN(0xff4b4c97, 0xa545df1f), TOBN(0xa1240501, 0x34d2076b)}, {TOBN(0x6efa0c23, 0x1409ca97), TOBN(0x254cc1a8, 0x20638c43), TOBN(0xd4e363af, 0xdcfb46cd), TOBN(0x62c2adc3, 0x03942a27)}}, {{TOBN(0xc67b9df0, 0x56e46483), TOBN(0xa55abb20, 0x63736356), TOBN(0xab93c098, 0xc551bc52), TOBN(0x382b49f9, 0xb15fe64b)}, {TOBN(0x9ec221ad, 0x4dff8d47), TOBN(0x79caf615, 0x437df4d6), TOBN(0x5f13dc64, 0xbb456509), TOBN(0xe4c589d9, 0x191f0714)}}, {{TOBN(0x27b6a8ab, 0x3fd40e09), TOBN(0xe455842e, 0x77313ea9), TOBN(0x8b51d1e2, 0x1f55988b), TOBN(0x5716dd73, 0x062bbbfc)}, {TOBN(0x633c11e5, 0x4e8bf3de), TOBN(0x9a0e77b6, 0x1b85be3b), TOBN(0x56510729, 0x0911cca6), TOBN(0x27e76495, 0xefa6590f)}}, {{TOBN(0xe4ac8b33, 0x070d3aab), TOBN(0x2643672b, 0x9a2cd5e5), TOBN(0x52eff79b, 0x1cfc9173), TOBN(0x665ca49b, 0x90a7c13f)}, {TOBN(0x5a8dda59, 0xb3efb998), TOBN(0x8a5b922d, 0x052f1341), TOBN(0xae9ebbab, 0x3cf9a530), TOBN(0x35986e7b, 0xf56da4d7)}}, {{TOBN(0x3a636b5c, 0xff3513cc), TOBN(0xbb0cf8ba, 0x3198f7dd), TOBN(0xb8d40522, 0x41f16f86), TOBN(0x760575d8, 0xde13a7bf)}, {TOBN(0x36f74e16, 0x9f7aa181), TOBN(0x163a3ecf, 0xf509ed1c), TOBN(0x6aead61f, 0x3c40a491), TOBN(0x158c95fc, 0xdfe8fcaa)}}, {{TOBN(0xa3991b6e, 0x13cda46f), TOBN(0x79482415, 0x342faed0), TOBN(0xf3ba5bde, 0x666b5970), TOBN(0x1d52e6bc, 0xb26ab6dd)}, {TOBN(0x768ba1e7, 0x8608dd3d), TOBN(0x4930db2a, 0xea076586), TOBN(0xd9575714, 0xe7dc1afa), TOBN(0x1fc7bf7d, 0xf7c58817)}}, {{TOBN(0x6b47accd, 0xd9eee96c), TOBN(0x0ca277fb, 0xe58cec37), TOBN(0x113fe413, 0xe702c42a), TOBN(0xdd1764ee, 0xc47cbe51)}, {TOBN(0x041e7cde, 0x7b3ed739), TOBN(0x50cb7459, 0x5ce9e1c0), TOBN(0x35568513, 0x2925b212), TOBN(0x7cff95c4, 0x001b081c)}}, {{TOBN(0x63ee4cbd, 0x8088b454), TOBN(0xdb7f32f7, 0x9a9e0c8a), TOBN(0xb377d418, 0x6b2447cb), TOBN(0xe3e982aa, 0xd370219b)}, {TOBN(0x06ccc1e4, 0xc2a2a593), TOBN(0x72c36865, 0x0773f24f), TOBN(0xa13b4da7, 0x95859423), TOBN(0x8bbf1d33, 0x75040c8f)}}, {{TOBN(0x726f0973, 0xda50c991), TOBN(0x48afcd5b, 0x822d6ee2), TOBN(0xe5fc718b, 0x20fd7771), TOBN(0xb9e8e77d, 0xfd0807a1)}, {TOBN(0x7f5e0f44, 0x99a7703d), TOBN(0x6972930e, 0x618e36f3), TOBN(0x2b7c77b8, 0x23807bbe), TOBN(0xe5b82405, 0xcb27ff50)}}, {{TOBN(0xba8b8be3, 0xbd379062), TOBN(0xd64b7a1d, 0x2dce4a92), TOBN(0x040a73c5, 0xb2952e37), TOBN(0x0a9e252e, 0xd438aeca)}, {TOBN(0xdd43956b, 0xc39d3bcb), TOBN(0x1a31ca00, 0xb32b2d63), TOBN(0xd67133b8, 0x5c417a18), TOBN(0xd08e4790, 0x2ef442c8)}}, {{TOBN(0x98cb1ae9, 0x255c0980), TOBN(0x4bd86381, 0x2b4a739f), TOBN(0x5a5c31e1, 0x1e4a45a1), TOBN(0x1e5d55fe, 0x9cb0db2f)}, {TOBN(0x74661b06, 
0x8ff5cc29), TOBN(0x026b389f, 0x0eb8a4f4), TOBN(0x536b21a4, 0x58848c24), TOBN(0x2e5bf8ec, 0x81dc72b0)}}, {{TOBN(0x03c187d0, 0xad886aac), TOBN(0x5c16878a, 0xb771b645), TOBN(0xb07dfc6f, 0xc74045ab), TOBN(0x2c6360bf, 0x7800caed)}, {TOBN(0x24295bb5, 0xb9c972a3), TOBN(0xc9e6f88e, 0x7c9a6dba), TOBN(0x90ffbf24, 0x92a79aa6), TOBN(0xde29d50a, 0x41c26ac2)}}, {{TOBN(0x9f0af483, 0xd309cbe6), TOBN(0x5b020d8a, 0xe0bced4f), TOBN(0x606e986d, 0xb38023e3), TOBN(0xad8f2c9d, 0x1abc6933)}, {TOBN(0x19292e1d, 0xe7400e93), TOBN(0xfe3e18a9, 0x52be5e4d), TOBN(0xe8e9771d, 0x2e0680bf), TOBN(0x8c5bec98, 0xc54db063)}}, {{TOBN(0x2af9662a, 0x74a55d1f), TOBN(0xe3fbf28f, 0x046f66d8), TOBN(0xa3a72ab4, 0xd4dc4794), TOBN(0x09779f45, 0x5c7c2dd8)}, {TOBN(0xd893bdaf, 0xc3d19d8d), TOBN(0xd5a75094, 0x57d6a6df), TOBN(0x8cf8fef9, 0x952e6255), TOBN(0x3da67cfb, 0xda9a8aff)}}, {{TOBN(0x4c23f62a, 0x2c160dcd), TOBN(0x34e6c5e3, 0x8f90eaef), TOBN(0x35865519, 0xa9a65d5a), TOBN(0x07c48aae, 0x8fd38a3d)}, {TOBN(0xb7e7aeda, 0x50068527), TOBN(0x2c09ef23, 0x1c90936a), TOBN(0x31ecfeb6, 0xe879324c), TOBN(0xa0871f6b, 0xfb0ec938)}}, {{TOBN(0xb1f0fb68, 0xd84d835d), TOBN(0xc90caf39, 0x861dc1e6), TOBN(0x12e5b046, 0x7594f8d7), TOBN(0x26897ae2, 0x65012b92)}, {TOBN(0xbcf68a08, 0xa4d6755d), TOBN(0x403ee41c, 0x0991fbda), TOBN(0x733e343e, 0x3bbf17e8), TOBN(0xd2c7980d, 0x679b3d65)}}, {{TOBN(0x33056232, 0xd2e11305), TOBN(0x966be492, 0xf3c07a6f), TOBN(0x6a8878ff, 0xbb15509d), TOBN(0xff221101, 0x0a9b59a4)}, {TOBN(0x6c9f564a, 0xabe30129), TOBN(0xc6f2c940, 0x336e64cf), TOBN(0x0fe75262, 0x8b0c8022), TOBN(0xbe0267e9, 0x6ae8db87)}}, {{TOBN(0x22e192f1, 0x93bc042b), TOBN(0xf085b534, 0xb237c458), TOBN(0xa0d192bd, 0x832c4168), TOBN(0x7a76e9e3, 0xbdf6271d)}, {TOBN(0x52a882fa, 0xb88911b5), TOBN(0xc85345e4, 0xb4db0eb5), TOBN(0xa3be02a6, 0x81a7c3ff), TOBN(0x51889c8c, 0xf0ec0469)}}, {{TOBN(0x9d031369, 0xa5e829e5), TOBN(0xcbb4c6fc, 0x1607aa41), TOBN(0x75ac59a6, 0x241d84c1), TOBN(0xc043f2bf, 0x8829e0ee)}, {TOBN(0x82a38f75, 0x8ea5e185), TOBN(0x8bda40b9, 0xd87cbd9f), TOBN(0x9e65e75e, 0x2d8fc601), TOBN(0x3d515f74, 0xa35690b3)}}, {{TOBN(0x534acf4f, 0xda79e5ac), TOBN(0x68b83b3a, 0x8630215f), TOBN(0x5c748b2e, 0xd085756e), TOBN(0xb0317258, 0xe5d37cb2)}, {TOBN(0x6735841a, 0xc5ccc2c4), TOBN(0x7d7dc96b, 0x3d9d5069), TOBN(0xa147e410, 0xfd1754bd), TOBN(0x65296e94, 0xd399ddd5)}}, {{TOBN(0xf6b5b2d0, 0xbc8fa5bc), TOBN(0x8a5ead67, 0x500c277b), TOBN(0x214625e6, 0xdfa08a5d), TOBN(0x51fdfedc, 0x959cf047)}, {TOBN(0x6bc9430b, 0x289fca32), TOBN(0xe36ff0cf, 0x9d9bdc3f), TOBN(0x2fe187cb, 0x58ea0ede), TOBN(0xed66af20, 0x5a900b3f)}}, {{TOBN(0x00e0968b, 0x5fa9f4d6), TOBN(0x2d4066ce, 0x37a362e7), TOBN(0xa99a9748, 0xbd07e772), TOBN(0x710989c0, 0x06a4f1d0)}, {TOBN(0xd5dedf35, 0xce40cbd8), TOBN(0xab55c5f0, 0x1743293d), TOBN(0x766f1144, 0x8aa24e2c), TOBN(0x94d874f8, 0x605fbcb4)}}, {{TOBN(0xa365f0e8, 0xa518001b), TOBN(0xee605eb6, 0x9d04ef0f), TOBN(0x5a3915cd, 0xba8d4d25), TOBN(0x44c0e1b8, 0xb5113472)}, {TOBN(0xcbb024e8, 0x8b6740dc), TOBN(0x89087a53, 0xee1d4f0c), TOBN(0xa88fa05c, 0x1fc4e372), TOBN(0x8bf395cb, 0xaf8b3af2)}}, {{TOBN(0x1e71c9a1, 0xdeb8568b), TOBN(0xa35daea0, 0x80fb3d32), TOBN(0xe8b6f266, 0x2cf8fb81), TOBN(0x6d51afe8, 0x9490696a)}, {TOBN(0x81beac6e, 0x51803a19), TOBN(0xe3d24b7f, 0x86219080), TOBN(0x727cfd9d, 0xdf6f463c), TOBN(0x8c6865ca, 0x72284ee8)}}, {{TOBN(0x32c88b7d, 0xb743f4ef), TOBN(0x3793909b, 0xe7d11dce), TOBN(0xd398f922, 0x2ff2ebe8), TOBN(0x2c70ca44, 0xe5e49796)}, {TOBN(0xdf4d9929, 0xcb1131b1), TOBN(0x7826f298, 0x25888e79), TOBN(0x4d3a112c, 0xf1d8740a), TOBN(0x00384cb6, 0x270afa8b)}}, 
{{TOBN(0xcb64125b, 0x3ab48095), TOBN(0x3451c256, 0x62d05106), TOBN(0xd73d577d, 0xa4955845), TOBN(0x39570c16, 0xbf9f4433)}, {TOBN(0xd7dfaad3, 0xadecf263), TOBN(0xf1c3d8d1, 0xdc76e102), TOBN(0x5e774a58, 0x54c6a836), TOBN(0xdad4b672, 0x3e92d47b)}}, {{TOBN(0xbe7e990f, 0xf0d796a0), TOBN(0x5fc62478, 0xdf0e8b02), TOBN(0x8aae8bf4, 0x030c00ad), TOBN(0x3d2db93b, 0x9004ba0f)}, {TOBN(0xe48c8a79, 0xd85d5ddc), TOBN(0xe907caa7, 0x6bb07f34), TOBN(0x58db343a, 0xa39eaed5), TOBN(0x0ea6e007, 0xadaf5724)}}, {{TOBN(0xe00df169, 0xd23233f3), TOBN(0x3e322796, 0x77cb637f), TOBN(0x1f897c0e, 0x1da0cf6c), TOBN(0xa651f5d8, 0x31d6bbdd)}, {TOBN(0xdd61af19, 0x1a230c76), TOBN(0xbd527272, 0xcdaa5e4a), TOBN(0xca753636, 0xd0abcd7e), TOBN(0x78bdd37c, 0x370bd8dc)}}, {{TOBN(0xc23916c2, 0x17cd93fe), TOBN(0x65b97a4d, 0xdadce6e2), TOBN(0xe04ed4eb, 0x174e42f8), TOBN(0x1491ccaa, 0xbb21480a)}, {TOBN(0x145a8280, 0x23196332), TOBN(0x3c3862d7, 0x587b479a), TOBN(0x9f4a88a3, 0x01dcd0ed), TOBN(0x4da2b7ef, 0x3ea12f1f)}}, {{TOBN(0xf8e7ae33, 0xb126e48e), TOBN(0x404a0b32, 0xf494e237), TOBN(0x9beac474, 0xc55acadb), TOBN(0x4ee5cf3b, 0xcbec9fd9)}, {TOBN(0x336b33b9, 0x7df3c8c3), TOBN(0xbd905fe3, 0xb76808fd), TOBN(0x8f436981, 0xaa45c16a), TOBN(0x255c5bfa, 0x3dd27b62)}}, {{TOBN(0x71965cbf, 0xc3dd9b4d), TOBN(0xce23edbf, 0xfc068a87), TOBN(0xb78d4725, 0x745b029b), TOBN(0x74610713, 0xcefdd9bd)}, {TOBN(0x7116f75f, 0x1266bf52), TOBN(0x02046722, 0x18e49bb6), TOBN(0xdf43df9f, 0x3d6f19e3), TOBN(0xef1bc7d0, 0xe685cb2f)}}, {{TOBN(0xcddb27c1, 0x7078c432), TOBN(0xe1961b9c, 0xb77fedb7), TOBN(0x1edc2f5c, 0xc2290570), TOBN(0x2c3fefca, 0x19cbd886)}, {TOBN(0xcf880a36, 0xc2af389a), TOBN(0x96c610fd, 0xbda71cea), TOBN(0xf03977a9, 0x32aa8463), TOBN(0x8eb7763f, 0x8586d90a)}}, {{TOBN(0x3f342454, 0x2a296e77), TOBN(0xc8718683, 0x42837a35), TOBN(0x7dc71090, 0x6a09c731), TOBN(0x54778ffb, 0x51b816db)}, {TOBN(0x6b33bfec, 0xaf06defd), TOBN(0xfe3c105f, 0x8592b70b), TOBN(0xf937fda4, 0x61da6114), TOBN(0x3c13e651, 0x4c266ad7)}}, {{TOBN(0xe363a829, 0x855938e8), TOBN(0x2eeb5d9e, 0x9de54b72), TOBN(0xbeb93b0e, 0x20ccfab9), TOBN(0x3dffbb5f, 0x25e61a25)}, {TOBN(0x7f655e43, 0x1acc093d), TOBN(0x0cb6cc3d, 0x3964ce61), TOBN(0x6ab283a1, 0xe5e9b460), TOBN(0x55d787c5, 0xa1c7e72d)}}, {{TOBN(0x4d2efd47, 0xdeadbf02), TOBN(0x11e80219, 0xac459068), TOBN(0x810c7626, 0x71f311f0), TOBN(0xfa17ef8d, 0x4ab6ef53)}, {TOBN(0xaf47fd25, 0x93e43bff), TOBN(0x5cb5ff3f, 0x0be40632), TOBN(0x54687106, 0x8ee61da3), TOBN(0x7764196e, 0xb08afd0f)}}, {{TOBN(0x831ab3ed, 0xf0290a8f), TOBN(0xcae81966, 0xcb47c387), TOBN(0xaad7dece, 0x184efb4f), TOBN(0xdcfc53b3, 0x4749110e)}, {TOBN(0x6698f23c, 0x4cb632f9), TOBN(0xc42a1ad6, 0xb91f8067), TOBN(0xb116a81d, 0x6284180a), TOBN(0xebedf5f8, 0xe901326f)}}, {{TOBN(0xf2274c9f, 0x97e3e044), TOBN(0x42018520, 0x11d09fc9), TOBN(0x56a65f17, 0xd18e6e23), TOBN(0x2ea61e2a, 0x352b683c)}, {TOBN(0x27d291bc, 0x575eaa94), TOBN(0x9e7bc721, 0xb8ff522d), TOBN(0x5f7268bf, 0xa7f04d6f), TOBN(0x5868c73f, 0xaba41748)}}, {{TOBN(0x9f85c2db, 0x7be0eead), TOBN(0x511e7842, 0xff719135), TOBN(0x5a06b1e9, 0xc5ea90d7), TOBN(0x0c19e283, 0x26fab631)}, {TOBN(0x8af8f0cf, 0xe9206c55), TOBN(0x89389cb4, 0x3553c06a), TOBN(0x39dbed97, 0xf65f8004), TOBN(0x0621b037, 0xc508991d)}}, {{TOBN(0x1c52e635, 0x96e78cc4), TOBN(0x5385c8b2, 0x0c06b4a8), TOBN(0xd84ddfdb, 0xb0e87d03), TOBN(0xc49dfb66, 0x934bafad)}, {TOBN(0x7071e170, 0x59f70772), TOBN(0x3a073a84, 0x3a1db56b), TOBN(0x03494903, 0x3b8af190), TOBN(0x7d882de3, 0xd32920f0)}}, {{TOBN(0x91633f0a, 0xb2cf8940), TOBN(0x72b0b178, 0x6f948f51), TOBN(0x2d28dc30, 0x782653c8), TOBN(0x88829849, 
0xdb903a05)}, {TOBN(0xb8095d0c, 0x6a19d2bb), TOBN(0x4b9e7f0c, 0x86f782cb), TOBN(0x7af73988, 0x2d907064), TOBN(0xd12be0fe, 0x8b32643c)}}, {{TOBN(0x358ed23d, 0x0e165dc3), TOBN(0x3d47ce62, 0x4e2378ce), TOBN(0x7e2bb0b9, 0xfeb8a087), TOBN(0x3246e8ae, 0xe29e10b9)}, {TOBN(0x459f4ec7, 0x03ce2b4d), TOBN(0xe9b4ca1b, 0xbbc077cf), TOBN(0x2613b4f2, 0x0e9940c1), TOBN(0xfc598bb9, 0x047d1eb1)}}, {{TOBN(0x9744c62b, 0x45036099), TOBN(0xa9dee742, 0x167c65d8), TOBN(0x0c511525, 0xdabe1943), TOBN(0xda110554, 0x93c6c624)}, {TOBN(0xae00a52c, 0x651a3be2), TOBN(0xcda5111d, 0x884449a6), TOBN(0x063c06f4, 0xff33bed1), TOBN(0x73baaf9a, 0x0d3d76b4)}}, {{TOBN(0x52fb0c9d, 0x7fc63668), TOBN(0x6886c9dd, 0x0c039cde), TOBN(0x602bd599, 0x55b22351), TOBN(0xb00cab02, 0x360c7c13)}, {TOBN(0x8cb616bc, 0x81b69442), TOBN(0x41486700, 0xb55c3cee), TOBN(0x71093281, 0xf49ba278), TOBN(0xad956d9c, 0x64a50710)}}, {{TOBN(0x9561f28b, 0x638a7e81), TOBN(0x54155cdf, 0x5980ddc3), TOBN(0xb2db4a96, 0xd26f247a), TOBN(0x9d774e4e, 0x4787d100)}, {TOBN(0x1a9e6e2e, 0x078637d2), TOBN(0x1c363e2d, 0x5e0ae06a), TOBN(0x7493483e, 0xe9cfa354), TOBN(0x76843cb3, 0x7f74b98d)}}, {{TOBN(0xbaca6591, 0xd4b66947), TOBN(0xb452ce98, 0x04460a8c), TOBN(0x6830d246, 0x43768f55), TOBN(0xf4197ed8, 0x7dff12df)}, {TOBN(0x6521b472, 0x400dd0f7), TOBN(0x59f5ca8f, 0x4b1e7093), TOBN(0x6feff11b, 0x080338ae), TOBN(0x0ada31f6, 0xa29ca3c6)}}, {{TOBN(0x24794eb6, 0x94a2c215), TOBN(0xd83a43ab, 0x05a57ab4), TOBN(0x264a543a, 0x2a6f89fe), TOBN(0x2c2a3868, 0xdd5ec7c2)}, {TOBN(0xd3373940, 0x8439d9b2), TOBN(0x715ea672, 0x0acd1f11), TOBN(0x42c1d235, 0xe7e6cc19), TOBN(0x81ce6e96, 0xb990585c)}}, {{TOBN(0x04e5dfe0, 0xd809c7bd), TOBN(0xd7b2580c, 0x8f1050ab), TOBN(0x6d91ad78, 0xd8a4176f), TOBN(0x0af556ee, 0x4e2e897c)}, {TOBN(0x162a8b73, 0x921de0ac), TOBN(0x52ac9c22, 0x7ea78400), TOBN(0xee2a4eea, 0xefce2174), TOBN(0xbe61844e, 0x6d637f79)}}, {{TOBN(0x0491f1bc, 0x789a283b), TOBN(0x72d3ac3d, 0x880836f4), TOBN(0xaa1c5ea3, 0x88e5402d), TOBN(0x1b192421, 0xd5cc473d)}, {TOBN(0x5c0b9998, 0x9dc84cac), TOBN(0xb0a8482d, 0x9c6e75b8), TOBN(0x639961d0, 0x3a191ce2), TOBN(0xda3bc865, 0x6d837930)}}, {{TOBN(0xca990653, 0x056e6f8f), TOBN(0x84861c41, 0x64d133a7), TOBN(0x8b403276, 0x746abe40), TOBN(0xb7b4d51a, 0xebf8e303)}, {TOBN(0x05b43211, 0x220a255d), TOBN(0xc997152c, 0x02419e6e), TOBN(0x76ff47b6, 0x630c2fea), TOBN(0x50518677, 0x281fdade)}}, {{TOBN(0x3283b8ba, 0xcf902b0b), TOBN(0x8d4b4eb5, 0x37db303b), TOBN(0xcc89f42d, 0x755011bc), TOBN(0xb43d74bb, 0xdd09d19b)}, {TOBN(0x65746bc9, 0x8adba350), TOBN(0x364eaf8c, 0xb51c1927), TOBN(0x13c76596, 0x10ad72ec), TOBN(0x30045121, 0xf8d40c20)}}, {{TOBN(0x6d2d99b7, 0xea7b979b), TOBN(0xcd78cd74, 0xe6fb3bcd), TOBN(0x11e45a9e, 0x86cffbfe), TOBN(0x78a61cf4, 0x637024f6)}, {TOBN(0xd06bc872, 0x3d502295), TOBN(0xf1376854, 0x458cb288), TOBN(0xb9db26a1, 0x342f8586), TOBN(0xf33effcf, 0x4beee09e)}}, {{TOBN(0xd7e0c4cd, 0xb30cfb3a), TOBN(0x6d09b8c1, 0x6c9db4c8), TOBN(0x40ba1a42, 0x07c8d9df), TOBN(0x6fd495f7, 0x1c52c66d)}, {TOBN(0xfb0e169f, 0x275264da), TOBN(0x80c2b746, 0xe57d8362), TOBN(0xedd987f7, 0x49ad7222), TOBN(0xfdc229af, 0x4398ec7b)}}}, {{{TOBN(0xb0d1ed84, 0x52666a58), TOBN(0x4bcb6e00, 0xe6a9c3c2), TOBN(0x3c57411c, 0x26906408), TOBN(0xcfc20755, 0x13556400)}, {TOBN(0xa08b1c50, 0x5294dba3), TOBN(0xa30ba286, 0x8b7dd31e), TOBN(0xd70ba90e, 0x991eca74), TOBN(0x094e142c, 0xe762c2b9)}}, {{TOBN(0xb81d783e, 0x979f3925), TOBN(0x1efd130a, 0xaf4c89a7), TOBN(0x525c2144, 0xfd1bf7fa), TOBN(0x4b296904, 0x1b265a9e)}, {TOBN(0xed8e9634, 0xb9db65b6), TOBN(0x35c82e32, 0x03599d8a), TOBN(0xdaa7a54f, 0x403563f3), 
TOBN(0x9df088ad, 0x022c38ab)}}, {{TOBN(0xe5cfb066, 0xbb3fd30a), TOBN(0x429169da, 0xeff0354e), TOBN(0x809cf852, 0x3524e36c), TOBN(0x136f4fb3, 0x0155be1d)}, {TOBN(0x4826af01, 0x1fbba712), TOBN(0x6ef0f0b4, 0x506ba1a1), TOBN(0xd9928b31, 0x77aea73e), TOBN(0xe2bf6af2, 0x5eaa244e)}}, {{TOBN(0x8d084f12, 0x4237b64b), TOBN(0x688ebe99, 0xe3ecfd07), TOBN(0x57b8a70c, 0xf6845dd8), TOBN(0x808fc59c, 0x5da4a325)}, {TOBN(0xa9032b2b, 0xa3585862), TOBN(0xb66825d5, 0xedf29386), TOBN(0xb5a5a8db, 0x431ec29b), TOBN(0xbb143a98, 0x3a1e8dc8)}}, {{TOBN(0x35ee94ce, 0x12ae381b), TOBN(0x3a7f176c, 0x86ccda90), TOBN(0xc63a657e, 0x4606eaca), TOBN(0x9ae5a380, 0x43cd04df)}, {TOBN(0x9bec8d15, 0xed251b46), TOBN(0x1f5d6d30, 0xcaca5e64), TOBN(0x347b3b35, 0x9ff20f07), TOBN(0x4d65f034, 0xf7e4b286)}}, {{TOBN(0x9e93ba24, 0xf111661e), TOBN(0xedced484, 0xb105eb04), TOBN(0x96dc9ba1, 0xf424b578), TOBN(0xbf8f66b7, 0xe83e9069)}, {TOBN(0x872d4df4, 0xd7ed8216), TOBN(0xbf07f377, 0x8e2cbecf), TOBN(0x4281d899, 0x98e73754), TOBN(0xfec85fbb, 0x8aab8708)}}, {{TOBN(0x9a3c0dee, 0xa5ba5b0b), TOBN(0xe6a116ce, 0x42d05299), TOBN(0xae9775fe, 0xe9b02d42), TOBN(0x72b05200, 0xa1545cb6)}, {TOBN(0xbc506f7d, 0x31a3b4ea), TOBN(0xe5893078, 0x8bbd9b32), TOBN(0xc8bc5f37, 0xe4b12a97), TOBN(0x6b000c06, 0x4a73b671)}}, {{TOBN(0x13b5bf22, 0x765fa7d0), TOBN(0x59805bf0, 0x1d6a5370), TOBN(0x67a5e29d, 0x4280db98), TOBN(0x4f53916f, 0x776b1ce3)}, {TOBN(0x714ff61f, 0x33ddf626), TOBN(0x4206238e, 0xa085d103), TOBN(0x1c50d4b7, 0xe5809ee3), TOBN(0x999f450d, 0x85f8eb1d)}}, {{TOBN(0x658a6051, 0xe4c79e9b), TOBN(0x1394cb73, 0xc66a9fea), TOBN(0x27f31ed5, 0xc6be7b23), TOBN(0xf4c88f36, 0x5aa6f8fe)}, {TOBN(0x0fb0721f, 0x4aaa499e), TOBN(0x68b3a7d5, 0xe3fb2a6b), TOBN(0xa788097d, 0x3a92851d), TOBN(0x060e7f8a, 0xe96f4913)}}, {{TOBN(0x82eebe73, 0x1a3a93bc), TOBN(0x42bbf465, 0xa21adc1a), TOBN(0xc10b6fa4, 0xef030efd), TOBN(0x247aa4c7, 0x87b097bb)}, {TOBN(0x8b8dc632, 0xf60c77da), TOBN(0x6ffbc26a, 0xc223523e), TOBN(0xa4f6ff11, 0x344579cf), TOBN(0x5825653c, 0x980250f6)}}, {{TOBN(0xb2dd097e, 0xbc1aa2b9), TOBN(0x07889393, 0x37a0333a), TOBN(0x1cf55e71, 0x37a0db38), TOBN(0x2648487f, 0x792c1613)}, {TOBN(0xdad01336, 0x3fcef261), TOBN(0x6239c81d, 0x0eabf129), TOBN(0x8ee761de, 0x9d276be2), TOBN(0x406a7a34, 0x1eda6ad3)}}, {{TOBN(0x4bf367ba, 0x4a493b31), TOBN(0x54f20a52, 0x9bf7f026), TOBN(0xb696e062, 0x9795914b), TOBN(0xcddab96d, 0x8bf236ac)}, {TOBN(0x4ff2c70a, 0xed25ea13), TOBN(0xfa1d09eb, 0x81cbbbe7), TOBN(0x88fc8c87, 0x468544c5), TOBN(0x847a670d, 0x696b3317)}}, {{TOBN(0xf133421e, 0x64bcb626), TOBN(0xaea638c8, 0x26dee0b5), TOBN(0xd6e7680b, 0xb310346c), TOBN(0xe06f4097, 0xd5d4ced3)}, {TOBN(0x09961452, 0x7512a30b), TOBN(0xf3d867fd, 0xe589a59a), TOBN(0x2e73254f, 0x52d0c180), TOBN(0x9063d8a3, 0x333c74ac)}}, {{TOBN(0xeda6c595, 0xd314e7bc), TOBN(0x2ee7464b, 0x467899ed), TOBN(0x1cef423c, 0x0a1ed5d3), TOBN(0x217e76ea, 0x69cc7613)}, {TOBN(0x27ccce1f, 0xe7cda917), TOBN(0x12d8016b, 0x8a893f16), TOBN(0xbcd6de84, 0x9fc74f6b), TOBN(0xfa5817e2, 0xf3144e61)}}, {{TOBN(0x1f354164, 0x0821ee4c), TOBN(0x1583eab4, 0x0bc61992), TOBN(0x7490caf6, 0x1d72879f), TOBN(0x998ad9f3, 0xf76ae7b2)}, {TOBN(0x1e181950, 0xa41157f7), TOBN(0xa9d7e1e6, 0xe8da3a7e), TOBN(0x963784eb, 0x8426b95f), TOBN(0x0ee4ed6e, 0x542e2a10)}}, {{TOBN(0xb79d4cc5, 0xac751e7b), TOBN(0x93f96472, 0xfd4211bd), TOBN(0x8c72d3d2, 0xc8de4fc6), TOBN(0x7b69cbf5, 0xdf44f064)}, {TOBN(0x3da90ca2, 0xf4bf94e1), TOBN(0x1a5325f8, 0xf12894e2), TOBN(0x0a437f6c, 0x7917d60b), TOBN(0x9be70486, 0x96c9cb5d)}}, {{TOBN(0xb4d880bf, 0xe1dc5c05), TOBN(0xd738adda, 0xeebeeb57), TOBN(0x6f0119d3, 
0xdf0fe6a3), TOBN(0x5c686e55, 0x66eaaf5a)}, {TOBN(0x9cb10b50, 0xdfd0b7ec), TOBN(0xbdd0264b, 0x6a497c21), TOBN(0xfc093514, 0x8c546c96), TOBN(0x58a947fa, 0x79dbf42a)}}, {{TOBN(0xc0b48d4e, 0x49ccd6d7), TOBN(0xff8fb02c, 0x88bd5580), TOBN(0xc75235e9, 0x07d473b2), TOBN(0x4fab1ac5, 0xa2188af3)}, {TOBN(0x030fa3bc, 0x97576ec0), TOBN(0xe8c946e8, 0x0b7e7d2f), TOBN(0x40a5c9cc, 0x70305600), TOBN(0x6d8260a9, 0xc8b013b4)}}, {{TOBN(0x0368304f, 0x70bba85c), TOBN(0xad090da1, 0xa4a0d311), TOBN(0x7170e870, 0x2415eec1), TOBN(0xbfba35fe, 0x8461ea47)}, {TOBN(0x6279019a, 0xc1e91938), TOBN(0xa47638f3, 0x1afc415f), TOBN(0x36c65cbb, 0xbcba0e0f), TOBN(0x02160efb, 0x034e2c48)}}, {{TOBN(0xe6c51073, 0x615cd9e4), TOBN(0x498ec047, 0xf1243c06), TOBN(0x3e5a8809, 0xb17b3d8c), TOBN(0x5cd99e61, 0x0cc565f1)}, {TOBN(0x81e312df, 0x7851dafe), TOBN(0xf156f5ba, 0xa79061e2), TOBN(0x80d62b71, 0x880c590e), TOBN(0xbec9746f, 0x0a39faa1)}}, {{TOBN(0x1d98a9c1, 0xc8ed1f7a), TOBN(0x09e43bb5, 0xa81d5ff2), TOBN(0xd5f00f68, 0x0da0794a), TOBN(0x412050d9, 0x661aa836)}, {TOBN(0xa89f7c4e, 0x90747e40), TOBN(0x6dc05ebb, 0xb62a3686), TOBN(0xdf4de847, 0x308e3353), TOBN(0x53868fbb, 0x9fb53bb9)}}, {{TOBN(0x2b09d2c3, 0xcfdcf7dd), TOBN(0x41a9fce3, 0x723fcab4), TOBN(0x73d905f7, 0x07f57ca3), TOBN(0x080f9fb1, 0xac8e1555)}, {TOBN(0x7c088e84, 0x9ba7a531), TOBN(0x07d35586, 0xed9a147f), TOBN(0x602846ab, 0xaf48c336), TOBN(0x7320fd32, 0x0ccf0e79)}}, {{TOBN(0xaa780798, 0xb18bd1ff), TOBN(0x52c2e300, 0xafdd2905), TOBN(0xf27ea3d6, 0x434267cd), TOBN(0x8b96d16d, 0x15605b5f)}, {TOBN(0x7bb31049, 0x4b45706b), TOBN(0xe7f58b8e, 0x743d25f8), TOBN(0xe9b5e45b, 0x87f30076), TOBN(0xd19448d6, 0x5d053d5a)}}, {{TOBN(0x1ecc8cb9, 0xd3210a04), TOBN(0x6bc7d463, 0xdafb5269), TOBN(0x3e59b10a, 0x67c3489f), TOBN(0x1769788c, 0x65641e1b)}, {TOBN(0x8a53b82d, 0xbd6cb838), TOBN(0x7066d6e6, 0x236d5f22), TOBN(0x03aa1c61, 0x6908536e), TOBN(0xc971da0d, 0x66ae9809)}}, {{TOBN(0x01b3a86b, 0xc49a2fac), TOBN(0x3b8420c0, 0x3092e77a), TOBN(0x02057300, 0x7d6fb556), TOBN(0x6941b2a1, 0xbff40a87)}, {TOBN(0x140b6308, 0x0658ff2a), TOBN(0x87804363, 0x3424ab36), TOBN(0x0253bd51, 0x5751e299), TOBN(0xc75bcd76, 0x449c3e3a)}}, {{TOBN(0x92eb4090, 0x7f8f875d), TOBN(0x9c9d754e, 0x56c26bbf), TOBN(0x158cea61, 0x8110bbe7), TOBN(0x62a6b802, 0x745f91ea)}, {TOBN(0xa79c41aa, 0xc6e7394b), TOBN(0x445b6a83, 0xad57ef10), TOBN(0x0c5277eb, 0x6ea6f40c), TOBN(0x319fe96b, 0x88633365)}}, {{TOBN(0x0b0fc61f, 0x385f63cb), TOBN(0x41250c84, 0x22bdd127), TOBN(0x67d153f1, 0x09e942c2), TOBN(0x60920d08, 0xc021ad5d)}, {TOBN(0x229f5746, 0x724d81a5), TOBN(0xb7ffb892, 0x5bba3299), TOBN(0x518c51a1, 0xde413032), TOBN(0x2a9bfe77, 0x3c2fd94c)}}, {{TOBN(0xcbcde239, 0x3191f4fd), TOBN(0x43093e16, 0xd3d6ada1), TOBN(0x184579f3, 0x58769606), TOBN(0x2c94a8b3, 0xd236625c)}, {TOBN(0x6922b9c0, 0x5c437d8e), TOBN(0x3d4ae423, 0xd8d9f3c8), TOBN(0xf72c31c1, 0x2e7090a2), TOBN(0x4ac3f5f3, 0xd76a55bd)}}, {{TOBN(0x342508fc, 0x6b6af991), TOBN(0x0d527100, 0x1b5cebbd), TOBN(0xb84740d0, 0xdd440dd7), TOBN(0x748ef841, 0x780162fd)}, {TOBN(0xa8dbfe0e, 0xdfc6fafb), TOBN(0xeadfdf05, 0xf7300f27), TOBN(0x7d06555f, 0xfeba4ec9), TOBN(0x12c56f83, 0x9e25fa97)}}, {{TOBN(0x77f84203, 0xd39b8c34), TOBN(0xed8b1be6, 0x3125eddb), TOBN(0x5bbf2441, 0xf6e39dc5), TOBN(0xb00f6ee6, 0x6a5d678a)}, {TOBN(0xba456ecf, 0x57d0ea99), TOBN(0xdcae0f58, 0x17e06c43), TOBN(0x01643de4, 0x0f5b4baa), TOBN(0x2c324341, 0xd161b9be)}}, {{TOBN(0x80177f55, 0xe126d468), TOBN(0xed325f1f, 0x76748e09), TOBN(0x6116004a, 0xcfa9bdc2), TOBN(0x2d8607e6, 0x3a9fb468)}, {TOBN(0x0e573e27, 0x6009d660), TOBN(0x3a525d2e, 0x8d10c5a1), 
TOBN(0xd26cb45c, 0x3b9009a0), TOBN(0xb6b0cdc0, 0xde9d7448)}}, {{TOBN(0x949c9976, 0xe1337c26), TOBN(0x6faadebd, 0xd73d68e5), TOBN(0x9e158614, 0xf1b768d9), TOBN(0x22dfa557, 0x9cc4f069)}, {TOBN(0xccd6da17, 0xbe93c6d6), TOBN(0x24866c61, 0xa504f5b9), TOBN(0x2121353c, 0x8d694da1), TOBN(0x1c6ca580, 0x0140b8c6)}}, {{TOBN(0xc245ad8c, 0xe964021e), TOBN(0xb83bffba, 0x032b82b3), TOBN(0xfaa220c6, 0x47ef9898), TOBN(0x7e8d3ac6, 0x982c948a)}, {TOBN(0x1faa2091, 0xbc2d124a), TOBN(0xbd54c3dd, 0x05b15ff4), TOBN(0x386bf3ab, 0xc87c6fb7), TOBN(0xfb2b0563, 0xfdeb6f66)}}, {{TOBN(0x4e77c557, 0x5b45afb4), TOBN(0xe9ded649, 0xefb8912d), TOBN(0x7ec9bbf5, 0x42f6e557), TOBN(0x2570dfff, 0x62671f00)}, {TOBN(0x2b3bfb78, 0x88e084bd), TOBN(0xa024b238, 0xf37fe5b4), TOBN(0x44e7dc04, 0x95649aee), TOBN(0x498ca255, 0x5e7ec1d8)}}, {{TOBN(0x3bc766ea, 0xaaa07e86), TOBN(0x0db6facb, 0xf3608586), TOBN(0xbadd2549, 0xbdc259c8), TOBN(0x95af3c6e, 0x041c649f)}, {TOBN(0xb36a928c, 0x02e30afb), TOBN(0x9b5356ad, 0x008a88b8), TOBN(0x4b67a5f1, 0xcf1d9e9d), TOBN(0xc6542e47, 0xa5d8d8ce)}}, {{TOBN(0x73061fe8, 0x7adfb6cc), TOBN(0xcc826fd3, 0x98678141), TOBN(0x00e758b1, 0x3c80515a), TOBN(0x6afe3247, 0x41485083)}, {TOBN(0x0fcb08b9, 0xb6ae8a75), TOBN(0xb8cf388d, 0x4acf51e1), TOBN(0x344a5560, 0x6961b9d6), TOBN(0x1a6778b8, 0x6a97fd0c)}}, {{TOBN(0xd840fdc1, 0xecc4c7e3), TOBN(0xde9fe47d, 0x16db68cc), TOBN(0xe95f89de, 0xa3e216aa), TOBN(0x84f1a6a4, 0x9594a8be)}, {TOBN(0x7ddc7d72, 0x5a7b162b), TOBN(0xc5cfda19, 0xadc817a3), TOBN(0x80a5d350, 0x78b58d46), TOBN(0x93365b13, 0x82978f19)}}, {{TOBN(0x2e44d225, 0x26a1fc90), TOBN(0x0d6d10d2, 0x4d70705d), TOBN(0xd94b6b10, 0xd70c45f4), TOBN(0x0f201022, 0xb216c079)}, {TOBN(0xcec966c5, 0x658fde41), TOBN(0xa8d2bc7d, 0x7e27601d), TOBN(0xbfcce3e1, 0xff230be7), TOBN(0x3394ff6b, 0x0033ffb5)}}, {{TOBN(0xd890c509, 0x8132c9af), TOBN(0xaac4b0eb, 0x361e7868), TOBN(0x5194ded3, 0xe82d15aa), TOBN(0x4550bd2e, 0x23ae6b7d)}, {TOBN(0x3fda318e, 0xea5399d4), TOBN(0xd989bffa, 0x91638b80), TOBN(0x5ea124d0, 0xa14aa12d), TOBN(0x1fb1b899, 0x3667b944)}}, {{TOBN(0x95ec7969, 0x44c44d6a), TOBN(0x91df144a, 0x57e86137), TOBN(0x915fd620, 0x73adac44), TOBN(0x8f01732d, 0x59a83801)}, {TOBN(0xec579d25, 0x3aa0a633), TOBN(0x06de5e7c, 0xc9d6d59c), TOBN(0xc132f958, 0xb1ef8010), TOBN(0x29476f96, 0xe65c1a02)}}, {{TOBN(0x336a77c0, 0xd34c3565), TOBN(0xef1105b2, 0x1b9f1e9e), TOBN(0x63e6d08b, 0xf9e08002), TOBN(0x9aff2f21, 0xc613809e)}, {TOBN(0xb5754f85, 0x3a80e75d), TOBN(0xde71853e, 0x6bbda681), TOBN(0x86f041df, 0x8197fd7a), TOBN(0x8b332e08, 0x127817fa)}}, {{TOBN(0x05d99be8, 0xb9c20cda), TOBN(0x89f7aad5, 0xd5cd0c98), TOBN(0x7ef936fe, 0x5bb94183), TOBN(0x92ca0753, 0xb05cd7f2)}, {TOBN(0x9d65db11, 0x74a1e035), TOBN(0x02628cc8, 0x13eaea92), TOBN(0xf2d9e242, 0x49e4fbf2), TOBN(0x94fdfd9b, 0xe384f8b7)}}, {{TOBN(0x65f56054, 0x63428c6b), TOBN(0x2f7205b2, 0x90b409a5), TOBN(0xf778bb78, 0xff45ae11), TOBN(0xa13045be, 0xc5ee53b2)}, {TOBN(0xe00a14ff, 0x03ef77fe), TOBN(0x689cd59f, 0xffef8bef), TOBN(0x3578f0ed, 0x1e9ade22), TOBN(0xe99f3ec0, 0x6268b6a8)}}, {{TOBN(0xa2057d91, 0xea1b3c3e), TOBN(0x2d1a7053, 0xb8823a4a), TOBN(0xabbb336a, 0x2cca451e), TOBN(0xcd2466e3, 0x2218bb5d)}, {TOBN(0x3ac1f42f, 0xc8cb762d), TOBN(0x7e312aae, 0x7690211f), TOBN(0xebb9bd73, 0x45d07450), TOBN(0x207c4b82, 0x46c2213f)}}, {{TOBN(0x99d425c1, 0x375913ec), TOBN(0x94e45e96, 0x67908220), TOBN(0xc08f3087, 0xcd67dbf6), TOBN(0xa5670fbe, 0xc0887056)}, {TOBN(0x6717b64a, 0x66f5b8fc), TOBN(0xd5a56aea, 0x786fec28), TOBN(0xa8c3f55f, 0xc0ff4952), TOBN(0xa77fefae, 0x457ac49b)}}, {{TOBN(0x29882d7c, 0x98379d44), TOBN(0xd000bdfb, 
0x509edc8a), TOBN(0xc6f95979, 0xe66fe464), TOBN(0x504a6115, 0xfa61bde0)}, {TOBN(0x56b3b871, 0xeffea31a), TOBN(0x2d3de26d, 0xf0c21a54), TOBN(0x21dbff31, 0x834753bf), TOBN(0xe67ecf49, 0x69269d86)}}, {{TOBN(0x7a176952, 0x151fe690), TOBN(0x03515804, 0x7f2adb5f), TOBN(0xee794b15, 0xd1b62a8d), TOBN(0xf004ceec, 0xaae454e6)}, {TOBN(0x0897ea7c, 0xf0386fac), TOBN(0x3b62ff12, 0xd1fca751), TOBN(0x154181df, 0x1b7a04ec), TOBN(0x2008e04a, 0xfb5847ec)}}, {{TOBN(0xd147148e, 0x41dbd772), TOBN(0x2b419f73, 0x22942654), TOBN(0x669f30d3, 0xe9c544f7), TOBN(0x52a2c223, 0xc8540149)}, {TOBN(0x5da9ee14, 0x634dfb02), TOBN(0x5f074ff0, 0xf47869f3), TOBN(0x74ee878d, 0xa3933acc), TOBN(0xe6510651, 0x4fe35ed1)}}, {{TOBN(0xb3eb9482, 0xf1012e7a), TOBN(0x51013cc0, 0xa8a566ae), TOBN(0xdd5e9243, 0x47c00d3b), TOBN(0x7fde089d, 0x946bb0e5)}, {TOBN(0x030754fe, 0xc731b4b3), TOBN(0x12a136a4, 0x99fda062), TOBN(0x7c1064b8, 0x5a1a35bc), TOBN(0xbf1f5763, 0x446c84ef)}}, {{TOBN(0xed29a56d, 0xa16d4b34), TOBN(0x7fba9d09, 0xdca21c4f), TOBN(0x66d7ac00, 0x6d8de486), TOBN(0x60061987, 0x73a2a5e1)}, {TOBN(0x8b400f86, 0x9da28ff0), TOBN(0x3133f708, 0x43c4599c), TOBN(0x9911c9b8, 0xee28cb0d), TOBN(0xcd7e2874, 0x8e0af61d)}}, {{TOBN(0x5a85f0f2, 0x72ed91fc), TOBN(0x85214f31, 0x9cd4a373), TOBN(0x881fe5be, 0x1925253c), TOBN(0xd8dc98e0, 0x91e8bc76)}, {TOBN(0x7120affe, 0x585cc3a2), TOBN(0x724952ed, 0x735bf97a), TOBN(0x5581e7dc, 0x3eb34581), TOBN(0x5cbff4f2, 0xe52ee57d)}}, {{TOBN(0x8d320a0e, 0x87d8cc7b), TOBN(0x9beaa7f3, 0xf1d280d0), TOBN(0x7a0b9571, 0x9beec704), TOBN(0x9126332e, 0x5b7f0057)}, {TOBN(0x01fbc1b4, 0x8ed3bd6d), TOBN(0x35bb2c12, 0xd945eb24), TOBN(0x6404694e, 0x9a8ae255), TOBN(0xb6092eec, 0x8d6abfb3)}}, {{TOBN(0x4d76143f, 0xcc058865), TOBN(0x7b0a5af2, 0x6e249922), TOBN(0x8aef9440, 0x6a50d353), TOBN(0xe11e4bcc, 0x64f0e07a)}, {TOBN(0x4472993a, 0xa14a90fa), TOBN(0x7706e20c, 0xba0c51d4), TOBN(0xf403292f, 0x1532672d), TOBN(0x52573bfa, 0x21829382)}}, {{TOBN(0x6a7bb6a9, 0x3b5bdb83), TOBN(0x08da65c0, 0xa4a72318), TOBN(0xc58d22aa, 0x63eb065f), TOBN(0x1717596c, 0x1b15d685)}, {TOBN(0x112df0d0, 0xb266d88b), TOBN(0xf688ae97, 0x5941945a), TOBN(0x487386e3, 0x7c292cac), TOBN(0x42f3b50d, 0x57d6985c)}}, {{TOBN(0x6da4f998, 0x6a90fc34), TOBN(0xc8f257d3, 0x65ca8a8d), TOBN(0xc2feabca, 0x6951f762), TOBN(0xe1bc81d0, 0x74c323ac)}, {TOBN(0x1bc68f67, 0x251a2a12), TOBN(0x10d86587, 0xbe8a70dc), TOBN(0xd648af7f, 0xf0f84d2e), TOBN(0xf0aa9ebc, 0x6a43ac92)}}, {{TOBN(0x69e3be04, 0x27596893), TOBN(0xb6bb02a6, 0x45bf452b), TOBN(0x0875c11a, 0xf4c698c8), TOBN(0x6652b5c7, 0xbece3794)}, {TOBN(0x7b3755fd, 0x4f5c0499), TOBN(0x6ea16558, 0xb5532b38), TOBN(0xd1c69889, 0xa2e96ef7), TOBN(0x9c773c3a, 0x61ed8f48)}}, {{TOBN(0x2b653a40, 0x9b323abc), TOBN(0xe26605e1, 0xf0e1d791), TOBN(0x45d41064, 0x4a87157a), TOBN(0x8f9a78b7, 0xcbbce616)}, {TOBN(0xcf1e44aa, 0xc407eddd), TOBN(0x81ddd1d8, 0xa35b964f), TOBN(0x473e339e, 0xfd083999), TOBN(0x6c94bdde, 0x8e796802)}}, {{TOBN(0x5a304ada, 0x8545d185), TOBN(0x82ae44ea, 0x738bb8cb), TOBN(0x628a35e3, 0xdf87e10e), TOBN(0xd3624f3d, 0xa15b9fe3)}, {TOBN(0xcc44209b, 0x14be4254), TOBN(0x7d0efcbc, 0xbdbc2ea5), TOBN(0x1f603362, 0x04c37bbe), TOBN(0x21f363f5, 0x56a5852c)}}, {{TOBN(0xa1503d1c, 0xa8501550), TOBN(0x2251e0e1, 0xd8ab10bb), TOBN(0xde129c96, 0x6961c51c), TOBN(0x1f7246a4, 0x81910f68)}, {TOBN(0x2eb744ee, 0x5f2591f2), TOBN(0x3c47d33f, 0x5e627157), TOBN(0x4d6d62c9, 0x22f3bd68), TOBN(0x6120a64b, 0xcb8df856)}}, {{TOBN(0x3a9ac6c0, 0x7b5d07df), TOBN(0xa92b9558, 0x7ef39783), TOBN(0xe128a134, 0xab3a9b4f), TOBN(0x41c18807, 0xb1252f05)}, {TOBN(0xfc7ed089, 0x80ba9b1c), 
TOBN(0xac8dc6de, 0xc532a9dd), TOBN(0xbf829cef, 0x55246809), TOBN(0x101b784f, 0x5b4ee80f)}}, {{TOBN(0xc09945bb, 0xb6f11603), TOBN(0x57b09dbe, 0x41d2801e), TOBN(0xfba5202f, 0xa97534a8), TOBN(0x7fd8ae5f, 0xc17b9614)}, {TOBN(0xa50ba666, 0x78308435), TOBN(0x9572f77c, 0xd3868c4d), TOBN(0x0cef7bfd, 0x2dd7aab0), TOBN(0xe7958e08, 0x2c7c79ff)}}, {{TOBN(0x81262e42, 0x25346689), TOBN(0x716da290, 0xb07c7004), TOBN(0x35f911ea, 0xb7950ee3), TOBN(0x6fd72969, 0x261d21b5)}, {TOBN(0x52389803, 0x08b640d3), TOBN(0x5b0026ee, 0x887f12a1), TOBN(0x20e21660, 0x742e9311), TOBN(0x0ef6d541, 0x5ff77ff7)}}, {{TOBN(0x969127f0, 0xf9c41135), TOBN(0xf21d60c9, 0x68a64993), TOBN(0x656e5d0c, 0xe541875c), TOBN(0xf1e0f84e, 0xa1d3c233)}, {TOBN(0x9bcca359, 0x06002d60), TOBN(0xbe2da60c, 0x06191552), TOBN(0x5da8bbae, 0x61181ec3), TOBN(0x9f04b823, 0x65806f19)}}, {{TOBN(0xf1604a7d, 0xd4b79bb8), TOBN(0xaee806fb, 0x52c878c8), TOBN(0x34144f11, 0x8d47b8e8), TOBN(0x72edf52b, 0x949f9054)}, {TOBN(0xebfca84e, 0x2127015a), TOBN(0x9051d0c0, 0x9cb7cef3), TOBN(0x86e8fe58, 0x296deec8), TOBN(0x33b28188, 0x41010d74)}}}, {{{TOBN(0x01079383, 0x171b445f), TOBN(0x9bcf21e3, 0x8131ad4c), TOBN(0x8cdfe205, 0xc93987e8), TOBN(0xe63f4152, 0xc92e8c8f)}, {TOBN(0x729462a9, 0x30add43d), TOBN(0x62ebb143, 0xc980f05a), TOBN(0x4f3954e5, 0x3b06e968), TOBN(0xfe1d75ad, 0x242cf6b1)}}, {{TOBN(0x5f95c6c7, 0xaf8685c8), TOBN(0xd4c1c8ce, 0x2f8f01aa), TOBN(0xc44bbe32, 0x2574692a), TOBN(0xb8003478, 0xd4a4a068)}, {TOBN(0x7c8fc6e5, 0x2eca3cdb), TOBN(0xea1db16b, 0xec04d399), TOBN(0xb05bc82e, 0x8f2bc5cf), TOBN(0x763d517f, 0xf44793d2)}}, {{TOBN(0x4451c1b8, 0x08bd98d0), TOBN(0x644b1cd4, 0x6575f240), TOBN(0x6907eb33, 0x7375d270), TOBN(0x56c8bebd, 0xfa2286bd)}, {TOBN(0xc713d2ac, 0xc4632b46), TOBN(0x17da427a, 0xafd60242), TOBN(0x313065b7, 0xc95c7546), TOBN(0xf8239898, 0xbf17a3de)}}, {{TOBN(0xf3b7963f, 0x4c830320), TOBN(0x842c7aa0, 0x903203e3), TOBN(0xaf22ca0a, 0xe7327afb), TOBN(0x38e13092, 0x967609b6)}, {TOBN(0x73b8fb62, 0x757558f1), TOBN(0x3cc3e831, 0xf7eca8c1), TOBN(0xe4174474, 0xf6331627), TOBN(0xa77989ca, 0xc3c40234)}}, {{TOBN(0xe5fd17a1, 0x44a081e0), TOBN(0xd797fb7d, 0xb70e296a), TOBN(0x2b472b30, 0x481f719c), TOBN(0x0e632a98, 0xfe6f8c52)}, {TOBN(0x89ccd116, 0xc5f0c284), TOBN(0xf51088af, 0x2d987c62), TOBN(0x2a2bccda, 0x4c2de6cf), TOBN(0x810f9efe, 0xf679f0f9)}}, {{TOBN(0xb0f394b9, 0x7ffe4b3e), TOBN(0x0b691d21, 0xe5fa5d21), TOBN(0xb0bd7747, 0x9dfbbc75), TOBN(0xd2830fda, 0xfaf78b00)}, {TOBN(0xf78c249c, 0x52434f57), TOBN(0x4b1f7545, 0x98096dab), TOBN(0x73bf6f94, 0x8ff8c0b3), TOBN(0x34aef03d, 0x454e134c)}}, {{TOBN(0xf8d151f4, 0xb7ac7ec5), TOBN(0xd6ceb95a, 0xe50da7d5), TOBN(0xa1b492b0, 0xdc3a0eb8), TOBN(0x75157b69, 0xb3dd2863)}, {TOBN(0xe2c4c74e, 0xc5413d62), TOBN(0xbe329ff7, 0xbc5fc4c7), TOBN(0x835a2aea, 0x60fa9dda), TOBN(0xf117f5ad, 0x7445cb87)}}, {{TOBN(0xae8317f4, 0xb0166f7a), TOBN(0xfbd3e3f7, 0xceec74e6), TOBN(0xfdb516ac, 0xe0874bfd), TOBN(0x3d846019, 0xc681f3a3)}, {TOBN(0x0b12ee5c, 0x7c1620b0), TOBN(0xba68b4dd, 0x2b63c501), TOBN(0xac03cd32, 0x6668c51e), TOBN(0x2a6279f7, 0x4e0bcb5b)}}, {{TOBN(0x17bd69b0, 0x6ae85c10), TOBN(0x72946979, 0x1dfdd3a6), TOBN(0xd9a03268, 0x2c078bec), TOBN(0x41c6a658, 0xbfd68a52)}, {TOBN(0xcdea1024, 0x0e023900), TOBN(0xbaeec121, 0xb10d144d), TOBN(0x5a600e74, 0x058ab8dc), TOBN(0x1333af21, 0xbb89ccdd)}}, {{TOBN(0xdf25eae0, 0x3aaba1f1), TOBN(0x2cada16e, 0x3b7144cf), TOBN(0x657ee27d, 0x71ab98bc), TOBN(0x99088b4c, 0x7a6fc96e)}, {TOBN(0x05d5c0a0, 0x3549dbd4), TOBN(0x42cbdf8f, 0xf158c3ac), TOBN(0x3fb6b3b0, 0x87edd685), TOBN(0x22071cf6, 0x86f064d0)}}, 
{{TOBN(0xd2d6721f, 0xff2811e5), TOBN(0xdb81b703, 0xfe7fae8c), TOBN(0x3cfb74ef, 0xd3f1f7bb), TOBN(0x0cdbcd76, 0x16cdeb5d)}, {TOBN(0x4f39642a, 0x566a808c), TOBN(0x02b74454, 0x340064d6), TOBN(0xfabbadca, 0x0528fa6f), TOBN(0xe4c3074c, 0xd3fc0bb6)}}, {{TOBN(0xb32cb8b0, 0xb796d219), TOBN(0xc3e95f4f, 0x34741dd9), TOBN(0x87212125, 0x68edf6f5), TOBN(0x7a03aee4, 0xa2b9cb8e)}, {TOBN(0x0cd3c376, 0xf53a89aa), TOBN(0x0d8af9b1, 0x948a28dc), TOBN(0xcf86a3f4, 0x902ab04f), TOBN(0x8aacb62a, 0x7f42002d)}}, {{TOBN(0x106985eb, 0xf62ffd52), TOBN(0xe670b54e, 0x5797bf10), TOBN(0x4b405209, 0xc5e30aef), TOBN(0x12c97a20, 0x4365b5e9)}, {TOBN(0x104646ce, 0x1fe32093), TOBN(0x13cb4ff6, 0x3907a8c9), TOBN(0x8b9f30d1, 0xd46e726b), TOBN(0xe1985e21, 0xaba0f499)}}, {{TOBN(0xc573dea9, 0x10a230cd), TOBN(0x24f46a93, 0xcd30f947), TOBN(0xf2623fcf, 0xabe2010a), TOBN(0x3f278cb2, 0x73f00e4f)}, {TOBN(0xed55c67d, 0x50b920eb), TOBN(0xf1cb9a2d, 0x8e760571), TOBN(0x7c50d109, 0x0895b709), TOBN(0x4207cf07, 0x190d4369)}}, {{TOBN(0x3b027e81, 0xc4127fe1), TOBN(0xa9f8b9ad, 0x3ae9c566), TOBN(0x5ab10851, 0xacbfbba5), TOBN(0xa747d648, 0x569556f5)}, {TOBN(0xcc172b5c, 0x2ba97bf7), TOBN(0x15e0f77d, 0xbcfa3324), TOBN(0xa345b797, 0x7686279d), TOBN(0x5a723480, 0xe38003d3)}}, {{TOBN(0xfd8e139f, 0x8f5fcda8), TOBN(0xf3e558c4, 0xbdee5bfd), TOBN(0xd76cbaf4, 0xe33f9f77), TOBN(0x3a4c97a4, 0x71771969)}, {TOBN(0xda27e84b, 0xf6dce6a7), TOBN(0xff373d96, 0x13e6c2d1), TOBN(0xf115193c, 0xd759a6e9), TOBN(0x3f9b7025, 0x63d2262c)}}, {{TOBN(0xd9764a31, 0x317cd062), TOBN(0x30779d8e, 0x199f8332), TOBN(0xd8074106, 0x16b11b0b), TOBN(0x7917ab9f, 0x78aeaed8)}, {TOBN(0xb67a9cbe, 0x28fb1d8e), TOBN(0x2e313563, 0x136eda33), TOBN(0x010b7069, 0xa371a86c), TOBN(0x44d90fa2, 0x6744e6b7)}}, {{TOBN(0x68190867, 0xd6b3e243), TOBN(0x9fe6cd9d, 0x59048c48), TOBN(0xb900b028, 0x95731538), TOBN(0xa012062f, 0x32cae04f)}, {TOBN(0x8107c8bc, 0x9399d082), TOBN(0x47e8c54a, 0x41df12e2), TOBN(0x14ba5117, 0xb6ef3f73), TOBN(0x22260bea, 0x81362f0b)}}, {{TOBN(0x90ea261e, 0x1a18cc20), TOBN(0x2192999f, 0x2321d636), TOBN(0xef64d314, 0xe311b6a0), TOBN(0xd7401e4c, 0x3b54a1f5)}, {TOBN(0x19019983, 0x6fbca2ba), TOBN(0x46ad3293, 0x8fbffc4b), TOBN(0xa142d3f6, 0x3786bf40), TOBN(0xeb5cbc26, 0xb67039fc)}}, {{TOBN(0x9cb0ae6c, 0x252bd479), TOBN(0x05e0f88a, 0x12b5848f), TOBN(0x78f6d2b2, 0xa5c97663), TOBN(0x6f6e149b, 0xc162225c)}, {TOBN(0xe602235c, 0xde601a89), TOBN(0xd17bbe98, 0xf373be1f), TOBN(0xcaf49a5b, 0xa8471827), TOBN(0x7e1a0a85, 0x18aaa116)}}, {{TOBN(0x6c833196, 0x270580c3), TOBN(0x1e233839, 0xf1c98a14), TOBN(0x67b2f7b4, 0xae34e0a5), TOBN(0x47ac8745, 0xd8ce7289)}, {TOBN(0x2b74779a, 0x100dd467), TOBN(0x274a4337, 0x4ee50d09), TOBN(0x603dcf13, 0x83608bc9), TOBN(0xcd9da6c3, 0xc89e8388)}}, {{TOBN(0x2660199f, 0x355116ac), TOBN(0xcc38bb59, 0xb6d18eed), TOBN(0x3075f31f, 0x2f4bc071), TOBN(0x9774457f, 0x265dc57e)}, {TOBN(0x06a6a9c8, 0xc6db88bb), TOBN(0x6429d07f, 0x4ec98e04), TOBN(0x8d05e57b, 0x05ecaa8b), TOBN(0x20f140b1, 0x7872ea7b)}}, {{TOBN(0xdf8c0f09, 0xca494693), TOBN(0x48d3a020, 0xf252e909), TOBN(0x4c5c29af, 0x57b14b12), TOBN(0x7e6fa37d, 0xbf47ad1c)}, {TOBN(0x66e7b506, 0x49a0c938), TOBN(0xb72c0d48, 0x6be5f41f), TOBN(0x6a6242b8, 0xb2359412), TOBN(0xcd35c774, 0x8e859480)}}, {{TOBN(0x12536fea, 0x87baa627), TOBN(0x58c1fec1, 0xf72aa680), TOBN(0x6c29b637, 0x601e5dc9), TOBN(0x9e3c3c1c, 0xde9e01b9)}, {TOBN(0xefc8127b, 0x2bcfe0b0), TOBN(0x35107102, 0x2a12f50d), TOBN(0x6ccd6cb1, 0x4879b397), TOBN(0xf792f804, 0xf8a82f21)}}, {{TOBN(0x509d4804, 0xa9b46402), TOBN(0xedddf85d, 0xc10f0850), TOBN(0x928410dc, 0x4b6208aa), TOBN(0xf6229c46, 
0x391012dc)}, {TOBN(0xc5a7c41e, 0x7727b9b6), TOBN(0x289e4e4b, 0xaa444842), TOBN(0x049ba1d9, 0xe9a947ea), TOBN(0x44f9e47f, 0x83c8debc)}}, {{TOBN(0xfa77a1fe, 0x611f8b8e), TOBN(0xfd2e416a, 0xf518f427), TOBN(0xc5fffa70, 0x114ebac3), TOBN(0xfe57c4e9, 0x5d89697b)}, {TOBN(0xfdd053ac, 0xb1aaf613), TOBN(0x31df210f, 0xea585a45), TOBN(0x318cc10e, 0x24985034), TOBN(0x1a38efd1, 0x5f1d6130)}}, {{TOBN(0xbf86f237, 0x0b1e9e21), TOBN(0xb258514d, 0x1dbe88aa), TOBN(0x1e38a588, 0x90c1baf9), TOBN(0x2936a01e, 0xbdb9b692)}, {TOBN(0xd576de98, 0x6dd5b20c), TOBN(0xb586bf71, 0x70f98ecf), TOBN(0xcccf0f12, 0xc42d2fd7), TOBN(0x8717e61c, 0xfb35bd7b)}}, {{TOBN(0x8b1e5722, 0x35e6fc06), TOBN(0x3477728f, 0x0b3e13d5), TOBN(0x150c294d, 0xaa8a7372), TOBN(0xc0291d43, 0x3bfa528a)}, {TOBN(0xc6c8bc67, 0xcec5a196), TOBN(0xdeeb31e4, 0x5c2e8a7c), TOBN(0xba93e244, 0xfb6e1c51), TOBN(0xb9f8b71b, 0x2e28e156)}}, {{TOBN(0xce65a287, 0x968a2ab9), TOBN(0xe3c5ce69, 0x46bbcb1f), TOBN(0xf8c835b9, 0xe7ae3f30), TOBN(0x16bbee26, 0xff72b82b)}, {TOBN(0x665e2017, 0xfd42cd22), TOBN(0x1e139970, 0xf8b1d2a0), TOBN(0x125cda29, 0x79204932), TOBN(0x7aee94a5, 0x49c3bee5)}}, {{TOBN(0x68c70160, 0x89821a66), TOBN(0xf7c37678, 0x8f981669), TOBN(0xd90829fc, 0x48cc3645), TOBN(0x346af049, 0xd70addfc)}, {TOBN(0x2057b232, 0x370bf29c), TOBN(0xf90c73ce, 0x42e650ee), TOBN(0xe03386ea, 0xa126ab90), TOBN(0x0e266e7e, 0x975a087b)}}, {{TOBN(0x80578eb9, 0x0fca65d9), TOBN(0x7e2989ea, 0x16af45b8), TOBN(0x7438212d, 0xcac75a4e), TOBN(0x38c7ca39, 0x4fef36b8)}, {TOBN(0x8650c494, 0xd402676a), TOBN(0x26ab5a66, 0xf72c7c48), TOBN(0x4e6cb426, 0xce3a464e), TOBN(0xf8f99896, 0x2b72f841)}}, {{TOBN(0x8c318491, 0x1a335cc8), TOBN(0x563459ba, 0x6a5913e4), TOBN(0x1b920d61, 0xc7b32919), TOBN(0x805ab8b6, 0xa02425ad)}, {TOBN(0x2ac512da, 0x8d006086), TOBN(0x6ca4846a, 0xbcf5c0fd), TOBN(0xafea51d8, 0xac2138d7), TOBN(0xcb647545, 0x344cd443)}}, {{TOBN(0x0429ee8f, 0xbd7d9040), TOBN(0xee66a2de, 0x819b9c96), TOBN(0x54f9ec25, 0xdea7d744), TOBN(0x2ffea642, 0x671721bb)}, {TOBN(0x4f19dbd1, 0x114344ea), TOBN(0x04304536, 0xfd0dbc8b), TOBN(0x014b50aa, 0x29ec7f91), TOBN(0xb5fc22fe, 0xbb06014d)}}, {{TOBN(0x60d963a9, 0x1ee682e0), TOBN(0xdf48abc0, 0xfe85c727), TOBN(0x0cadba13, 0x2e707c2d), TOBN(0xde608d3a, 0xa645aeff)}, {TOBN(0x05f1c28b, 0xedafd883), TOBN(0x3c362ede, 0xbd94de1f), TOBN(0x8dd0629d, 0x13593e41), TOBN(0x0a5e736f, 0x766d6eaf)}}, {{TOBN(0xbfa92311, 0xf68cf9d1), TOBN(0xa4f9ef87, 0xc1797556), TOBN(0x10d75a1f, 0x5601c209), TOBN(0x651c374c, 0x09b07361)}, {TOBN(0x49950b58, 0x88b5cead), TOBN(0x0ef00058, 0x6fa9dbaa), TOBN(0xf51ddc26, 0x4e15f33a), TOBN(0x1f8b5ca6, 0x2ef46140)}}, {{TOBN(0x343ac0a3, 0xee9523f0), TOBN(0xbb75eab2, 0x975ea978), TOBN(0x1bccf332, 0x107387f4), TOBN(0x790f9259, 0x9ab0062e)}, {TOBN(0xf1a363ad, 0x1e4f6a5f), TOBN(0x06e08b84, 0x62519a50), TOBN(0x60915187, 0x7265f1ee), TOBN(0x6a80ca34, 0x93ae985e)}}, {{TOBN(0x81b29768, 0xaaba4864), TOBN(0xb13cabf2, 0x8d52a7d6), TOBN(0xb5c36348, 0x8ead03f1), TOBN(0xc932ad95, 0x81c7c1c0)}, {TOBN(0x5452708e, 0xcae1e27b), TOBN(0x9dac4269, 0x1b0df648), TOBN(0x233e3f0c, 0xdfcdb8bc), TOBN(0xe6ceccdf, 0xec540174)}}, {{TOBN(0xbd0d845e, 0x95081181), TOBN(0xcc8a7920, 0x699355d5), TOBN(0x111c0f6d, 0xc3b375a8), TOBN(0xfd95bc6b, 0xfd51e0dc)}, {TOBN(0x4a106a26, 0x6888523a), TOBN(0x4d142bd6, 0xcb01a06d), TOBN(0x79bfd289, 0xadb9b397), TOBN(0x0bdbfb94, 0xe9863914)}}, {{TOBN(0x29d8a229, 0x1660f6a6), TOBN(0x7f6abcd6, 0x551c042d), TOBN(0x13039deb, 0x0ac3ffe8), TOBN(0xa01be628, 0xec8523fb)}, {TOBN(0x6ea34103, 0x0ca1c328), TOBN(0xc74114bd, 0xb903928e), TOBN(0x8aa4ff4e, 0x9e9144b0), 
TOBN(0x7064091f, 0x7f9a4b17)}}, {{TOBN(0xa3f4f521, 0xe447f2c4), TOBN(0x81b8da7a, 0x604291f0), TOBN(0xd680bc46, 0x7d5926de), TOBN(0x84f21fd5, 0x34a1202f)}, {TOBN(0x1d1e3181, 0x4e9df3d8), TOBN(0x1ca4861a, 0x39ab8d34), TOBN(0x809ddeec, 0x5b19aa4a), TOBN(0x59f72f7e, 0x4d329366)}}, {{TOBN(0xa2f93f41, 0x386d5087), TOBN(0x40bf739c, 0xdd67d64f), TOBN(0xb4494205, 0x66702158), TOBN(0xc33c65be, 0x73b1e178)}, {TOBN(0xcdcd657c, 0x38ca6153), TOBN(0x97f4519a, 0xdc791976), TOBN(0xcc7c7f29, 0xcd6e1f39), TOBN(0x38de9cfb, 0x7e3c3932)}}, {{TOBN(0xe448eba3, 0x7b793f85), TOBN(0xe9f8dbf9, 0xf067e914), TOBN(0xc0390266, 0xf114ae87), TOBN(0x39ed75a7, 0xcd6a8e2a)}, {TOBN(0xadb14848, 0x7ffba390), TOBN(0x67f8cb8b, 0x6af9bc09), TOBN(0x322c3848, 0x9c7476db), TOBN(0xa320fecf, 0x52a538d6)}}, {{TOBN(0xe0493002, 0xb2aced2b), TOBN(0xdfba1809, 0x616bd430), TOBN(0x531c4644, 0xc331be70), TOBN(0xbc04d32e, 0x90d2e450)}, {TOBN(0x1805a0d1, 0x0f9f142d), TOBN(0x2c44a0c5, 0x47ee5a23), TOBN(0x31875a43, 0x3989b4e3), TOBN(0x6b1949fd, 0x0c063481)}}, {{TOBN(0x2dfb9e08, 0xbe0f4492), TOBN(0x3ff0da03, 0xe9d5e517), TOBN(0x03dbe9a1, 0xf79466a8), TOBN(0x0b87bcd0, 0x15ea9932)}, {TOBN(0xeb64fc83, 0xab1f58ab), TOBN(0x6d9598da, 0x817edc8a), TOBN(0x699cff66, 0x1d3b67e5), TOBN(0x645c0f29, 0x92635853)}}, {{TOBN(0x253cdd82, 0xeabaf21c), TOBN(0x82b9602a, 0x2241659e), TOBN(0x2cae07ec, 0x2d9f7091), TOBN(0xbe4c720c, 0x8b48cd9b)}, {TOBN(0x6ce5bc03, 0x6f08d6c9), TOBN(0x36e8a997, 0xaf10bf40), TOBN(0x83422d21, 0x3e10ff12), TOBN(0x7b26d3eb, 0xbcc12494)}}, {{TOBN(0xb240d2d0, 0xc9469ad6), TOBN(0xc4a11b4d, 0x30afa05b), TOBN(0x4b604ace, 0xdd6ba286), TOBN(0x18486600, 0x3ee2864c)}, {TOBN(0x5869d6ba, 0x8d9ce5be), TOBN(0x0d8f68c5, 0xff4bfb0d), TOBN(0xb69f210b, 0x5700cf73), TOBN(0x61f6653a, 0x6d37c135)}}, {{TOBN(0xff3d432b, 0x5aff5a48), TOBN(0x0d81c4b9, 0x72ba3a69), TOBN(0xee879ae9, 0xfa1899ef), TOBN(0xbac7e2a0, 0x2d6acafd)}, {TOBN(0xd6d93f6c, 0x1c664399), TOBN(0x4c288de1, 0x5bcb135d), TOBN(0x83031dab, 0x9dab7cbf), TOBN(0xfe23feb0, 0x3abbf5f0)}}, {{TOBN(0x9f1b2466, 0xcdedca85), TOBN(0x140bb710, 0x1a09538c), TOBN(0xac8ae851, 0x5e11115d), TOBN(0x0d63ff67, 0x6f03f59e)}, {TOBN(0x755e5551, 0x7d234afb), TOBN(0x61c2db4e, 0x7e208fc1), TOBN(0xaa9859ce, 0xf28a4b5d), TOBN(0xbdd6d4fc, 0x34af030f)}}, {{TOBN(0xd1c4a26d, 0x3be01cb1), TOBN(0x9ba14ffc, 0x243aa07c), TOBN(0xf95cd3a9, 0xb2503502), TOBN(0xe379bc06, 0x7d2a93ab)}, {TOBN(0x3efc18e9, 0xd4ca8d68), TOBN(0x083558ec, 0x80bb412a), TOBN(0xd903b940, 0x9645a968), TOBN(0xa499f0b6, 0x9ba6054f)}}, {{TOBN(0x208b573c, 0xb8349abe), TOBN(0x3baab3e5, 0x30b4fc1c), TOBN(0x87e978ba, 0xcb524990), TOBN(0x3524194e, 0xccdf0e80)}, {TOBN(0x62711725, 0x7d4bcc42), TOBN(0xe90a3d9b, 0xb90109ba), TOBN(0x3b1bdd57, 0x1323e1e0), TOBN(0xb78e9bd5, 0x5eae1599)}}, {{TOBN(0x0794b746, 0x9e03d278), TOBN(0x80178605, 0xd70e6297), TOBN(0x171792f8, 0x99c97855), TOBN(0x11b393ee, 0xf5a86b5c)}, {TOBN(0x48ef6582, 0xd8884f27), TOBN(0xbd44737a, 0xbf19ba5f), TOBN(0x8698de4c, 0xa42062c6), TOBN(0x8975eb80, 0x61ce9c54)}}, {{TOBN(0xd50e57c7, 0xd7fe71f3), TOBN(0x15342190, 0xbc97ce38), TOBN(0x51bda2de, 0x4df07b63), TOBN(0xba12aeae, 0x200eb87d)}, {TOBN(0xabe135d2, 0xa9b4f8f6), TOBN(0x04619d65, 0xfad6d99c), TOBN(0x4a6683a7, 0x7994937c), TOBN(0x7a778c8b, 0x6f94f09a)}}, {{TOBN(0x8c508623, 0x20a71b89), TOBN(0x241a2aed, 0x1c229165), TOBN(0x352be595, 0xaaf83a99), TOBN(0x9fbfee7f, 0x1562bac8)}, {TOBN(0xeaf658b9, 0x5c4017e3), TOBN(0x1dc7f9e0, 0x15120b86), TOBN(0xd84f13dd, 0x4c034d6f), TOBN(0x283dd737, 0xeaea3038)}}, {{TOBN(0x197f2609, 0xcd85d6a2), TOBN(0x6ebbc345, 0xfae60177), TOBN(0xb80f031b, 
0x4e12fede), TOBN(0xde55d0c2, 0x07a2186b)}, {TOBN(0x1fb3e37f, 0x24dcdd5a), TOBN(0x8d602da5, 0x7ed191fb), TOBN(0x108fb056, 0x76023e0d), TOBN(0x70178c71, 0x459c20c0)}}, {{TOBN(0xfad5a386, 0x3fe54cf0), TOBN(0xa4a3ec4f, 0x02bbb475), TOBN(0x1aa5ec20, 0x919d94d7), TOBN(0x5d3b63b5, 0xa81e4ab3)}, {TOBN(0x7fa733d8, 0x5ad3d2af), TOBN(0xfbc586dd, 0xd1ac7a37), TOBN(0x282925de, 0x40779614), TOBN(0xfe0ffffb, 0xe74a242a)}}, {{TOBN(0x3f39e67f, 0x906151e5), TOBN(0xcea27f5f, 0x55e10649), TOBN(0xdca1d4e1, 0xc17cf7b7), TOBN(0x0c326d12, 0x2fe2362d)}, {TOBN(0x05f7ac33, 0x7dd35df3), TOBN(0x0c3b7639, 0xc396dbdf), TOBN(0x0912f5ac, 0x03b7db1c), TOBN(0x9dea4b70, 0x5c9ed4a9)}}, {{TOBN(0x475e6e53, 0xaae3f639), TOBN(0xfaba0e7c, 0xfc278bac), TOBN(0x16f9e221, 0x9490375f), TOBN(0xaebf9746, 0xa5a7ed0a)}, {TOBN(0x45f9af3f, 0xf41ad5d6), TOBN(0x03c4623c, 0xb2e99224), TOBN(0x82c5bb5c, 0xb3cf56aa), TOBN(0x64311819, 0x34567ed3)}}, {{TOBN(0xec57f211, 0x8be489ac), TOBN(0x2821895d, 0xb9a1104b), TOBN(0x610dc875, 0x6064e007), TOBN(0x8e526f3f, 0x5b20d0fe)}, {TOBN(0x6e71ca77, 0x5b645aee), TOBN(0x3d1dcb9f, 0x800e10ff), TOBN(0x36b51162, 0x189cf6de), TOBN(0x2c5a3e30, 0x6bb17353)}}, {{TOBN(0xc186cd3e, 0x2a6c6fbf), TOBN(0xa74516fa, 0x4bf97906), TOBN(0x5b4b8f4b, 0x279d6901), TOBN(0x0c4e57b4, 0x2b573743)}, {TOBN(0x75fdb229, 0xb6e386b6), TOBN(0xb46793fd, 0x99deac27), TOBN(0xeeec47ea, 0xcf712629), TOBN(0xe965f3c4, 0xcbc3b2dd)}}, {{TOBN(0x8dd1fb83, 0x425c6559), TOBN(0x7fc00ee6, 0x0af06fda), TOBN(0xe98c9225, 0x33d956df), TOBN(0x0f1ef335, 0x4fbdc8a2)}, {TOBN(0x2abb5145, 0xb79b8ea2), TOBN(0x40fd2945, 0xbdbff288), TOBN(0x6a814ac4, 0xd7185db7), TOBN(0xc4329d6f, 0xc084609a)}}, {{TOBN(0xc9ba7b52, 0xed1be45d), TOBN(0x891dd20d, 0xe4cd2c74), TOBN(0x5a4d4a7f, 0x824139b1), TOBN(0x66c17716, 0xb873c710)}, {TOBN(0x5e5bc141, 0x2843c4e0), TOBN(0xd5ac4817, 0xb97eb5bf), TOBN(0xc0f8af54, 0x450c95c7), TOBN(0xc91b3fa0, 0x318406c5)}}, {{TOBN(0x360c340a, 0xab9d97f8), TOBN(0xfb57bd07, 0x90a2d611), TOBN(0x4339ae3c, 0xa6a6f7e5), TOBN(0x9c1fcd2a, 0x2feb8a10)}, {TOBN(0x972bcca9, 0xc7ea7432), TOBN(0x1b0b924c, 0x308076f6), TOBN(0x80b2814a, 0x2a5b4ca5), TOBN(0x2f78f55b, 0x61ef3b29)}}, {{TOBN(0xf838744a, 0xc18a414f), TOBN(0xc611eaae, 0x903d0a86), TOBN(0x94dabc16, 0x2a453f55), TOBN(0xe6f2e3da, 0x14efb279)}, {TOBN(0x5b7a6017, 0x9320dc3c), TOBN(0x692e382f, 0x8df6b5a4), TOBN(0x3f5e15e0, 0x2d40fa90), TOBN(0xc87883ae, 0x643dd318)}}, {{TOBN(0x511053e4, 0x53544774), TOBN(0x834d0ecc, 0x3adba2bc), TOBN(0x4215d7f7, 0xbae371f5), TOBN(0xfcfd57bf, 0x6c8663bc)}, {TOBN(0xded2383d, 0xd6901b1d), TOBN(0x3b49fbb4, 0xb5587dc3), TOBN(0xfd44a08d, 0x07625f62), TOBN(0x3ee4d65b, 0x9de9b762)}}}, {{{TOBN(0x64e5137d, 0x0d63d1fa), TOBN(0x658fc052, 0x02a9d89f), TOBN(0x48894874, 0x50436309), TOBN(0xe9ae30f8, 0xd598da61)}, {TOBN(0x2ed710d1, 0x818baf91), TOBN(0xe27e9e06, 0x8b6a0c20), TOBN(0x1e28dcfb, 0x1c1a6b44), TOBN(0x883acb64, 0xd6ac57dc)}}, {{TOBN(0x8735728d, 0xc2c6ff70), TOBN(0x79d6122f, 0xc5dc2235), TOBN(0x23f5d003, 0x19e277f9), TOBN(0x7ee84e25, 0xdded8cc7)}, {TOBN(0x91a8afb0, 0x63cd880a), TOBN(0x3f3ea7c6, 0x3574af60), TOBN(0x0cfcdc84, 0x02de7f42), TOBN(0x62d0792f, 0xb31aa152)}}, {{TOBN(0x8e1b4e43, 0x8a5807ce), TOBN(0xad283893, 0xe4109a7e), TOBN(0xc30cc9cb, 0xafd59dda), TOBN(0xf65f36c6, 0x3d8d8093)}, {TOBN(0xdf31469e, 0xa60d32b2), TOBN(0xee93df4b, 0x3e8191c8), TOBN(0x9c1017c5, 0x355bdeb5), TOBN(0xd2623185, 0x8616aa28)}}, {{TOBN(0xb02c83f9, 0xdec31a21), TOBN(0x988c8b23, 0x6ad9d573), TOBN(0x53e983ae, 0xa57be365), TOBN(0xe968734d, 0x646f834e)}, {TOBN(0x9137ea8f, 0x5da6309b), TOBN(0x10f3a624, 0xc1f1ce16), 
TOBN(0x782a9ea2, 0xca440921), TOBN(0xdf94739e, 0x5b46f1b5)}}, {{TOBN(0x9f9be006, 0xcce85c9b), TOBN(0x360e70d6, 0xa4c7c2d3), TOBN(0x2cd5beea, 0xaefa1e60), TOBN(0x64cf63c0, 0x8c3d2b6d)}, {TOBN(0xfb107fa3, 0xe1cf6f90), TOBN(0xb7e937c6, 0xd5e044e6), TOBN(0x74e8ca78, 0xce34db9f), TOBN(0x4f8b36c1, 0x3e210bd0)}}, {{TOBN(0x1df165a4, 0x34a35ea8), TOBN(0x3418e0f7, 0x4d4412f6), TOBN(0x5af1f8af, 0x518836c3), TOBN(0x42ceef4d, 0x130e1965)}, {TOBN(0x5560ca0b, 0x543a1957), TOBN(0xc33761e5, 0x886cb123), TOBN(0x66624b1f, 0xfe98ed30), TOBN(0xf772f4bf, 0x1090997d)}}, {{TOBN(0xf4e540bb, 0x4885d410), TOBN(0x7287f810, 0x9ba5f8d7), TOBN(0x22d0d865, 0xde98dfb1), TOBN(0x49ff51a1, 0xbcfbb8a3)}, {TOBN(0xb6b6fa53, 0x6bc3012e), TOBN(0x3d31fd72, 0x170d541d), TOBN(0x8018724f, 0x4b0f4966), TOBN(0x79e7399f, 0x87dbde07)}}, {{TOBN(0x56f8410e, 0xf4f8b16a), TOBN(0x97241afe, 0xc47b266a), TOBN(0x0a406b8e, 0x6d9c87c1), TOBN(0x803f3e02, 0xcd42ab1b)}, {TOBN(0x7f0309a8, 0x04dbec69), TOBN(0xa83b85f7, 0x3bbad05f), TOBN(0xc6097273, 0xad8e197f), TOBN(0xc097440e, 0x5067adc1)}}, {{TOBN(0x730eafb6, 0x3524ff16), TOBN(0xd7f9b51e, 0x823fc6ce), TOBN(0x27bd0d32, 0x443e4ac0), TOBN(0x40c59ad9, 0x4d66f217)}, {TOBN(0x6c33136f, 0x17c387a4), TOBN(0x5043b8d5, 0xeb86804d), TOBN(0x74970312, 0x675a73c9), TOBN(0x838fdb31, 0xf16669b6)}}, {{TOBN(0xc507b6dd, 0x418e7ddd), TOBN(0x39888d93, 0x472f19d6), TOBN(0x7eae26be, 0x0c27eb4d), TOBN(0x17b53ed3, 0xfbabb884)}, {TOBN(0xfc27021b, 0x2b01ae4f), TOBN(0x88462e87, 0xcf488682), TOBN(0xbee096ec, 0x215e2d87), TOBN(0xeb2fea9a, 0xd242e29b)}}, {{TOBN(0x5d985b5f, 0xb821fc28), TOBN(0x89d2e197, 0xdc1e2ad2), TOBN(0x55b566b8, 0x9030ba62), TOBN(0xe3fd41b5, 0x4f41b1c6)}, {TOBN(0xb738ac2e, 0xb9a96d61), TOBN(0x7f8567ca, 0x369443f4), TOBN(0x8698622d, 0xf803a440), TOBN(0x2b586236, 0x8fe2f4dc)}}, {{TOBN(0xbbcc00c7, 0x56b95bce), TOBN(0x5ec03906, 0x616da680), TOBN(0x79162ee6, 0x72214252), TOBN(0x43132b63, 0x86a892d2)}, {TOBN(0x4bdd3ff2, 0x2f3263bf), TOBN(0xd5b3733c, 0x9cd0a142), TOBN(0x592eaa82, 0x44415ccb), TOBN(0x663e8924, 0x8d5474ea)}}, {{TOBN(0x8058a25e, 0x5236344e), TOBN(0x82e8df9d, 0xbda76ee6), TOBN(0xdcf6efd8, 0x11cc3d22), TOBN(0x00089cda, 0x3b4ab529)}, {TOBN(0x91d3a071, 0xbd38a3db), TOBN(0x4ea97fc0, 0xef72b925), TOBN(0x0c9fc15b, 0xea3edf75), TOBN(0x5a6297cd, 0xa4348ed3)}}, {{TOBN(0x0d38ab35, 0xce7c42d4), TOBN(0x9fd493ef, 0x82feab10), TOBN(0x46056b6d, 0x82111b45), TOBN(0xda11dae1, 0x73efc5c3)}, {TOBN(0xdc740278, 0x5545a7fb), TOBN(0xbdb2601c, 0x40d507e6), TOBN(0x121dfeeb, 0x7066fa58), TOBN(0x214369a8, 0x39ae8c2a)}}, {{TOBN(0x195709cb, 0x06e0956c), TOBN(0x4c9d254f, 0x010cd34b), TOBN(0xf51e13f7, 0x0471a532), TOBN(0xe19d6791, 0x1e73054d)}, {TOBN(0xf702a628, 0xdb5c7be3), TOBN(0xc7141218, 0xb24dde05), TOBN(0xdc18233c, 0xf29b2e2e), TOBN(0x3a6bd1e8, 0x85342dba)}}, {{TOBN(0x3f747fa0, 0xb311898c), TOBN(0xe2a272e4, 0xcd0eac65), TOBN(0x4bba5851, 0xf914d0bc), TOBN(0x7a1a9660, 0xc4a43ee3)}, {TOBN(0xe5a367ce, 0xa1c8cde9), TOBN(0x9d958ba9, 0x7271abe3), TOBN(0xf3ff7eb6, 0x3d1615cd), TOBN(0xa2280dce, 0xf5ae20b0)}}, {{TOBN(0x56dba5c1, 0xcf640147), TOBN(0xea5a2e3d, 0x5e83d118), TOBN(0x04cd6b6d, 0xda24c511), TOBN(0x1c0f4671, 0xe854d214)}, {TOBN(0x91a6b7a9, 0x69565381), TOBN(0xdc966240, 0xdecf1f5b), TOBN(0x1b22d21c, 0xfcf5d009), TOBN(0x2a05f641, 0x9021dbd5)}}, {{TOBN(0x8c0ed566, 0xd4312483), TOBN(0x5179a95d, 0x643e216f), TOBN(0xcc185fec, 0x17044493), TOBN(0xb3063339, 0x54991a21)}, {TOBN(0xd801ecdb, 0x0081a726), TOBN(0x0149b0c6, 0x4fa89bbb), TOBN(0xafe9065a, 0x4391b6b9), TOBN(0xedc92786, 0xd633f3a3)}}, {{TOBN(0xe408c24a, 0xae6a8e13), TOBN(0x85833fde, 
0x9f3897ab), TOBN(0x43800e7e, 0xd81a0715), TOBN(0xde08e346, 0xb44ffc5f)}, {TOBN(0x7094184c, 0xcdeff2e0), TOBN(0x49f9387b, 0x165eaed1), TOBN(0x635d6129, 0x777c468a), TOBN(0x8c0dcfd1, 0x538c2dd8)}}, {{TOBN(0xd6d9d9e3, 0x7a6a308b), TOBN(0x62375830, 0x4c2767d3), TOBN(0x874a8bc6, 0xf38cbeb6), TOBN(0xd94d3f1a, 0xccb6fd9e)}, {TOBN(0x92a9735b, 0xba21f248), TOBN(0x272ad0e5, 0x6cd1efb0), TOBN(0x7437b69c, 0x05b03284), TOBN(0xe7f04702, 0x6948c225)}}, {{TOBN(0x8a56c04a, 0xcba2ecec), TOBN(0x0c181270, 0xe3a73e41), TOBN(0x6cb34e9d, 0x03e93725), TOBN(0xf77c8713, 0x496521a9)}, {TOBN(0x94569183, 0xfa7f9f90), TOBN(0xf2e7aa4c, 0x8c9707ad), TOBN(0xced2c9ba, 0x26c1c9a3), TOBN(0x9109fe96, 0x40197507)}}, {{TOBN(0x9ae868a9, 0xe9adfe1c), TOBN(0x3984403d, 0x314e39bb), TOBN(0xb5875720, 0xf2fe378f), TOBN(0x33f901e0, 0xba44a628)}, {TOBN(0xea1125fe, 0x3652438c), TOBN(0xae9ec4e6, 0x9dd1f20b), TOBN(0x1e740d9e, 0xbebf7fbd), TOBN(0x6dbd3ddc, 0x42dbe79c)}}, {{TOBN(0x62082aec, 0xedd36776), TOBN(0xf612c478, 0xe9859039), TOBN(0xa493b201, 0x032f7065), TOBN(0xebd4d8f2, 0x4ff9b211)}, {TOBN(0x3f23a0aa, 0xaac4cb32), TOBN(0xea3aadb7, 0x15ed4005), TOBN(0xacf17ea4, 0xafa27e63), TOBN(0x56125c1a, 0xc11fd66c)}}, {{TOBN(0x266344a4, 0x3794f8dc), TOBN(0xdcca923a, 0x483c5c36), TOBN(0x2d6b6bbf, 0x3f9d10a0), TOBN(0xb320c5ca, 0x81d9bdf3)}, {TOBN(0x620e28ff, 0x47b50a95), TOBN(0x933e3b01, 0xcef03371), TOBN(0xf081bf85, 0x99100153), TOBN(0x183be9a0, 0xc3a8c8d6)}}, {{TOBN(0x4e3ddc5a, 0xd6bbe24d), TOBN(0xc6c74630, 0x53843795), TOBN(0x78193dd7, 0x65ec2d4c), TOBN(0xb8df26cc, 0xcd3c89b2)}, {TOBN(0x98dbe399, 0x5a483f8d), TOBN(0x72d8a957, 0x7dd3313a), TOBN(0x65087294, 0xab0bd375), TOBN(0xfcd89248, 0x7c259d16)}}, {{TOBN(0x8a9443d7, 0x7613aa81), TOBN(0x80100800, 0x85fe6584), TOBN(0x70fc4dbc, 0x7fb10288), TOBN(0xf58280d3, 0xe86beee8)}, {TOBN(0x14fdd82f, 0x7c978c38), TOBN(0xdf1204c1, 0x0de44d7b), TOBN(0xa08a1c84, 0x4160252f), TOBN(0x591554ca, 0xc17646a5)}}, {{TOBN(0x214a37d6, 0xa05bd525), TOBN(0x48d5f09b, 0x07957b3c), TOBN(0x0247cdcb, 0xd7109bc9), TOBN(0x40f9e4bb, 0x30599ce7)}, {TOBN(0xc325fa03, 0xf46ad2ec), TOBN(0x00f766cf, 0xc3e3f9ee), TOBN(0xab556668, 0xd43a4577), TOBN(0x68d30a61, 0x3ee03b93)}}, {{TOBN(0x7ddc81ea, 0x77b46a08), TOBN(0xcf5a6477, 0xc7480699), TOBN(0x43a8cb34, 0x6633f683), TOBN(0x1b867e6b, 0x92363c60)}, {TOBN(0x43921114, 0x1f60558e), TOBN(0xcdbcdd63, 0x2f41450e), TOBN(0x7fc04601, 0xcc630e8b), TOBN(0xea7c66d5, 0x97038b43)}}, {{TOBN(0x7259b8a5, 0x04e99fd8), TOBN(0x98a8dd12, 0x4785549a), TOBN(0x0e459a7c, 0x840552e1), TOBN(0xcdfcf4d0, 0x4bb0909e)}, {TOBN(0x34a86db2, 0x53758da7), TOBN(0xe643bb83, 0xeac997e1), TOBN(0x96400bd7, 0x530c5b7e), TOBN(0x9f97af87, 0xb41c8b52)}}, {{TOBN(0x34fc8820, 0xfbeee3f9), TOBN(0x93e53490, 0x49091afd), TOBN(0x764b9be5, 0x9a31f35c), TOBN(0x71f37864, 0x57e3d924)}, {TOBN(0x02fb34e0, 0x943aa75e), TOBN(0xa18c9c58, 0xab8ff6e4), TOBN(0x080f31b1, 0x33cf0d19), TOBN(0x5c9682db, 0x083518a7)}}, {{TOBN(0x873d4ca6, 0xb709c3de), TOBN(0x64a84262, 0x3575b8f0), TOBN(0x6275da1f, 0x020154bb), TOBN(0x97678caa, 0xd17cf1ab)}, {TOBN(0x8779795f, 0x951a95c3), TOBN(0xdd35b163, 0x50fccc08), TOBN(0x32709627, 0x33d8f031), TOBN(0x3c5ab10a, 0x498dd85c)}}, {{TOBN(0xb6c185c3, 0x41dca566), TOBN(0x7de7feda, 0xd8622aa3), TOBN(0x99e84d92, 0x901b6dfb), TOBN(0x30a02b0e, 0x7c4ad288)}, {TOBN(0xc7c81daa, 0x2fd3cf36), TOBN(0xd1319547, 0xdf89e59f), TOBN(0xb2be8184, 0xcd496733), TOBN(0xd5f449eb, 0x93d3412b)}}, {{TOBN(0x7ea41b1b, 0x25fe531d), TOBN(0xf9797432, 0x6a1d5646), TOBN(0x86067f72, 0x2bde501a), TOBN(0xf91481c0, 0x0c85e89c)}, {TOBN(0xca8ee465, 0xf8b05bc6), 
TOBN(0x1844e1cf, 0x02e83cda), TOBN(0xca82114a, 0xb4dbe33b), TOBN(0x0f9f8769, 0x4eabfde2)}}, {{TOBN(0x4936b1c0, 0x38b27fe2), TOBN(0x63b6359b, 0xaba402df), TOBN(0x40c0ea2f, 0x656bdbab), TOBN(0x9c992a89, 0x6580c39c)}, {TOBN(0x600e8f15, 0x2a60aed1), TOBN(0xeb089ca4, 0xe0bf49df), TOBN(0x9c233d7d, 0x2d42d99a), TOBN(0x648d3f95, 0x4c6bc2fa)}}, {{TOBN(0xdcc383a8, 0xe1add3f3), TOBN(0xf42c0c6a, 0x4f64a348), TOBN(0x2abd176f, 0x0030dbdb), TOBN(0x4de501a3, 0x7d6c215e)}, {TOBN(0x4a107c1f, 0x4b9a64bc), TOBN(0xa77f0ad3, 0x2496cd59), TOBN(0xfb78ac62, 0x7688dffb), TOBN(0x7025a2ca, 0x67937d8e)}}, {{TOBN(0xfde8b2d1, 0xd1a8f4e7), TOBN(0xf5b3da47, 0x7354927c), TOBN(0xe48606a3, 0xd9205735), TOBN(0xac477cc6, 0xe177b917)}, {TOBN(0xfb1f73d2, 0xa883239a), TOBN(0xe12572f6, 0xcc8b8357), TOBN(0x9d355e9c, 0xfb1f4f86), TOBN(0x89b795f8, 0xd9f3ec6e)}}, {{TOBN(0x27be56f1, 0xb54398dc), TOBN(0x1890efd7, 0x3fedeed5), TOBN(0x62f77f1f, 0x9c6d0140), TOBN(0x7ef0e314, 0x596f0ee4)}, {TOBN(0x50ca6631, 0xcc61dab3), TOBN(0x4a39801d, 0xf4866e4f), TOBN(0x66c8d032, 0xae363b39), TOBN(0x22c591e5, 0x2ead66aa)}}, {{TOBN(0x954ba308, 0xde02a53e), TOBN(0x2a6c060f, 0xd389f357), TOBN(0xe6cfcde8, 0xfbf40b66), TOBN(0x8e02fc56, 0xc6340ce1)}, {TOBN(0xe4957795, 0x73adb4ba), TOBN(0x7b86122c, 0xa7b03805), TOBN(0x63f83512, 0x0c8e6fa6), TOBN(0x83660ea0, 0x057d7804)}}, {{TOBN(0xbad79105, 0x21ba473c), TOBN(0xb6c50bee, 0xded5389d), TOBN(0xee2caf4d, 0xaa7c9bc0), TOBN(0xd97b8de4, 0x8c4e98a7)}, {TOBN(0xa9f63e70, 0xab3bbddb), TOBN(0x3898aabf, 0x2597815a), TOBN(0x7659af89, 0xac15b3d9), TOBN(0xedf7725b, 0x703ce784)}}, {{TOBN(0x25470fab, 0xe085116b), TOBN(0x04a43375, 0x87285310), TOBN(0x4e39187e, 0xe2bfd52f), TOBN(0x36166b44, 0x7d9ebc74)}, {TOBN(0x92ad433c, 0xfd4b322c), TOBN(0x726aa817, 0xba79ab51), TOBN(0xf96eacd8, 0xc1db15eb), TOBN(0xfaf71e91, 0x0476be63)}}, {{TOBN(0xdd69a640, 0x641fad98), TOBN(0xb7995918, 0x29622559), TOBN(0x03c6daa5, 0xde4199dc), TOBN(0x92cadc97, 0xad545eb4)}, {TOBN(0x1028238b, 0x256534e4), TOBN(0x73e80ce6, 0x8595409a), TOBN(0x690d4c66, 0xd05dc59b), TOBN(0xc95f7b8f, 0x981dee80)}}, {{TOBN(0xf4337014, 0xd856ac25), TOBN(0x441bd9dd, 0xac524dca), TOBN(0x640b3d85, 0x5f0499f5), TOBN(0x39cf84a9, 0xd5fda182)}, {TOBN(0x04e7b055, 0xb2aa95a0), TOBN(0x29e33f0a, 0x0ddf1860), TOBN(0x082e74b5, 0x423f6b43), TOBN(0x217edeb9, 0x0aaa2b0f)}}, {{TOBN(0x58b83f35, 0x83cbea55), TOBN(0xc485ee4d, 0xbc185d70), TOBN(0x833ff03b, 0x1e5f6992), TOBN(0xb5b9b9cc, 0xcf0c0dd5)}, {TOBN(0x7caaee8e, 0x4e9e8a50), TOBN(0x462e907b, 0x6269dafd), TOBN(0x6ed5cee9, 0xfbe791c6), TOBN(0x68ca3259, 0xed430790)}}, {{TOBN(0x2b72bdf2, 0x13b5ba88), TOBN(0x60294c8a, 0x35ef0ac4), TOBN(0x9c3230ed, 0x19b99b08), TOBN(0x560fff17, 0x6c2589aa)}, {TOBN(0x552b8487, 0xd6770374), TOBN(0xa373202d, 0x9a56f685), TOBN(0xd3e7f907, 0x45f175d9), TOBN(0x3c2f315f, 0xd080d810)}}, {{TOBN(0x1130e9dd, 0x7b9520e8), TOBN(0xc078f9e2, 0x0af037b5), TOBN(0x38cd2ec7, 0x1e9c104c), TOBN(0x0f684368, 0xc472fe92)}, {TOBN(0xd3f1b5ed, 0x6247e7ef), TOBN(0xb32d33a9, 0x396dfe21), TOBN(0x46f59cf4, 0x4a9aa2c2), TOBN(0x69cd5168, 0xff0f7e41)}}, {{TOBN(0x3f59da0f, 0x4b3234da), TOBN(0xcf0b0235, 0xb4579ebe), TOBN(0x6d1cbb25, 0x6d2476c7), TOBN(0x4f0837e6, 0x9dc30f08)}, {TOBN(0x9a4075bb, 0x906f6e98), TOBN(0x253bb434, 0xc761e7d1), TOBN(0xde2e645f, 0x6e73af10), TOBN(0xb89a4060, 0x0c5f131c)}}, {{TOBN(0xd12840c5, 0xb8cc037f), TOBN(0x3d093a5b, 0x7405bb47), TOBN(0x6202c253, 0x206348b8), TOBN(0xbf5d57fc, 0xc55a3ca7)}, {TOBN(0x89f6c90c, 0x8c3bef48), TOBN(0x23ac7623, 0x5a0a960a), TOBN(0xdfbd3d6b, 0x552b42ab), TOBN(0x3ef22458, 0x132061f6)}}, {{TOBN(0xd74e9bda, 
0xc97e6516), TOBN(0x88779360, 0xc230f49e), TOBN(0xa6ec1de3, 0x1e74ea49), TOBN(0x581dcee5, 0x3fb645a2)}, {TOBN(0xbaef2391, 0x8f483f14), TOBN(0x6d2dddfc, 0xd137d13b), TOBN(0x54cde50e, 0xd2743a42), TOBN(0x89a34fc5, 0xe4d97e67)}}, {{TOBN(0x13f1f5b3, 0x12e08ce5), TOBN(0xa80540b8, 0xa7f0b2ca), TOBN(0x854bcf77, 0x01982805), TOBN(0xb8653ffd, 0x233bea04)}, {TOBN(0x8e7b8787, 0x02b0b4c9), TOBN(0x2675261f, 0x9acb170a), TOBN(0x061a9d90, 0x930c14e5), TOBN(0xb59b30e0, 0xdef0abea)}}, {{TOBN(0x1dc19ea6, 0x0200ec7d), TOBN(0xb6f4a3f9, 0x0bce132b), TOBN(0xb8d5de90, 0xf13e27e0), TOBN(0xbaee5ef0, 0x1fade16f)}, {TOBN(0x6f406aaa, 0xe4c6cf38), TOBN(0xab4cfe06, 0xd1369815), TOBN(0x0dcffe87, 0xefd550c6), TOBN(0x9d4f59c7, 0x75ff7d39)}}, {{TOBN(0xb02553b1, 0x51deb6ad), TOBN(0x812399a4, 0xb1877749), TOBN(0xce90f71f, 0xca6006e1), TOBN(0xc32363a6, 0xb02b6e77)}, {TOBN(0x02284fbe, 0xdc36c64d), TOBN(0x86c81e31, 0xa7e1ae61), TOBN(0x2576c7e5, 0xb909d94a), TOBN(0x8b6f7d02, 0x818b2bb0)}}, {{TOBN(0xeca3ed07, 0x56faa38a), TOBN(0xa3790e6c, 0x9305bb54), TOBN(0xd784eeda, 0x7bc73061), TOBN(0xbd56d369, 0x6dd50614)}, {TOBN(0xd6575949, 0x229a8aa9), TOBN(0xdcca8f47, 0x4595ec28), TOBN(0x814305c1, 0x06ab4fe6), TOBN(0xc8c39768, 0x24f43f16)}}, {{TOBN(0xe2a45f36, 0x523f2b36), TOBN(0x995c6493, 0x920d93bb), TOBN(0xf8afdab7, 0x90f1632b), TOBN(0x79ebbecd, 0x1c295954)}, {TOBN(0xc7bb3ddb, 0x79592f48), TOBN(0x67216a7b, 0x5f88e998), TOBN(0xd91f098b, 0xbc01193e), TOBN(0xf7d928a5, 0xb1db83fc)}}, {{TOBN(0x55e38417, 0xe991f600), TOBN(0x2a91113e, 0x2981a934), TOBN(0xcbc9d648, 0x06b13bde), TOBN(0xb011b6ac, 0x0755ff44)}, {TOBN(0x6f4cb518, 0x045ec613), TOBN(0x522d2d31, 0xc2f5930a), TOBN(0x5acae1af, 0x382e65de), TOBN(0x57643067, 0x27bc966f)}}, {{TOBN(0x5e12705d, 0x1c7193f0), TOBN(0xf0f32f47, 0x3be8858e), TOBN(0x785c3d7d, 0x96c6dfc7), TOBN(0xd75b4a20, 0xbf31795d)}, {TOBN(0x91acf17b, 0x342659d4), TOBN(0xe596ea34, 0x44f0378f), TOBN(0x4515708f, 0xce52129d), TOBN(0x17387e1e, 0x79f2f585)}}, {{TOBN(0x72cfd2e9, 0x49dee168), TOBN(0x1ae05223, 0x3e2af239), TOBN(0x009e75be, 0x1d94066a), TOBN(0x6cca31c7, 0x38abf413)}, {TOBN(0xb50bd61d, 0x9bc49908), TOBN(0x4a9b4a8c, 0xf5e2bc1e), TOBN(0xeb6cc5f7, 0x946f83ac), TOBN(0x27da93fc, 0xebffab28)}}, {{TOBN(0xea314c96, 0x4821c8c5), TOBN(0x8de49ded, 0xa83c15f4), TOBN(0x7a64cf20, 0x7af33004), TOBN(0x45f1bfeb, 0xc9627e10)}, {TOBN(0x878b0626, 0x54b9df60), TOBN(0x5e4fdc3c, 0xa95c0b33), TOBN(0xe54a37ca, 0xc2035d8e), TOBN(0x9087cda9, 0x80f20b8c)}}, {{TOBN(0x36f61c23, 0x8319ade4), TOBN(0x766f287a, 0xde8cfdf8), TOBN(0x48821948, 0x346f3705), TOBN(0x49a7b853, 0x16e4f4a2)}, {TOBN(0xb9b3f8a7, 0x5cedadfd), TOBN(0x8f562815, 0x8db2a815), TOBN(0xc0b7d554, 0x01f68f95), TOBN(0x12971e27, 0x688a208e)}}, {{TOBN(0xc9f8b696, 0xd0ff34fc), TOBN(0x20824de2, 0x1222718c), TOBN(0x7213cf9f, 0x0c95284d), TOBN(0xe2ad741b, 0xdc158240)}, {TOBN(0x0ee3a6df, 0x54043ccf), TOBN(0x16ff479b, 0xd84412b3), TOBN(0xf6c74ee0, 0xdfc98af0), TOBN(0xa78a169f, 0x52fcd2fb)}}, {{TOBN(0xd8ae8746, 0x99c930e9), TOBN(0x1d33e858, 0x49e117a5), TOBN(0x7581fcb4, 0x6624759f), TOBN(0xde50644f, 0x5bedc01d)}, {TOBN(0xbeec5d00, 0xcaf3155e), TOBN(0x672d66ac, 0xbc73e75f), TOBN(0x86b9d8c6, 0x270b01db), TOBN(0xd249ef83, 0x50f55b79)}}, {{TOBN(0x6131d6d4, 0x73978fe3), TOBN(0xcc4e4542, 0x754b00a1), TOBN(0x4e05df05, 0x57dfcfe9), TOBN(0x94b29cdd, 0x51ef6bf0)}, {TOBN(0xe4530cff, 0x9bc7edf2), TOBN(0x8ac236fd, 0xd3da65f3), TOBN(0x0faf7d5f, 0xc8eb0b48), TOBN(0x4d2de14c, 0x660eb039)}}, {{TOBN(0xc006bba7, 0x60430e54), TOBN(0x10a2d0d6, 0xda3289ab), TOBN(0x9c037a5d, 0xd7979c59), TOBN(0x04d1f3d3, 0xa116d944)}, 
{TOBN(0x9ff22473, 0x8a0983cd), TOBN(0x28e25b38, 0xc883cabb), TOBN(0xe968dba5, 0x47a58995), TOBN(0x2c80b505, 0x774eebdf)}}, {{TOBN(0xee763b71, 0x4a953beb), TOBN(0x502e223f, 0x1642e7f6), TOBN(0x6fe4b641, 0x61d5e722), TOBN(0x9d37c5b0, 0xdbef5316)}, {TOBN(0x0115ed70, 0xf8330bc7), TOBN(0x139850e6, 0x75a72789), TOBN(0x27d7faec, 0xffceccc2), TOBN(0x3016a860, 0x4fd9f7f6)}}, {{TOBN(0xc492ec64, 0x4cd8f64c), TOBN(0x58a2d790, 0x279d7b51), TOBN(0x0ced1fc5, 0x1fc75256), TOBN(0x3e658aed, 0x8f433017)}, {TOBN(0x0b61942e, 0x05da59eb), TOBN(0xba3d60a3, 0x0ddc3722), TOBN(0x7c311cd1, 0x742e7f87), TOBN(0x6473ffee, 0xf6b01b6e)}}}, {{{TOBN(0x8303604f, 0x692ac542), TOBN(0xf079ffe1, 0x227b91d3), TOBN(0x19f63e63, 0x15aaf9bd), TOBN(0xf99ee565, 0xf1f344fb)}, {TOBN(0x8a1d661f, 0xd6219199), TOBN(0x8c883bc6, 0xd48ce41c), TOBN(0x1065118f, 0x3c74d904), TOBN(0x713889ee, 0x0faf8b1b)}}, {{TOBN(0x972b3f8f, 0x81a1b3be), TOBN(0x4f3ce145, 0xce2764a0), TOBN(0xe2d0f1cc, 0x28c4f5f7), TOBN(0xdeee0c0d, 0xc7f3985b)}, {TOBN(0x7df4adc0, 0xd39e25c3), TOBN(0x40619820, 0xc467a080), TOBN(0x440ebc93, 0x61cf5a58), TOBN(0x527729a6, 0x422ad600)}}, {{TOBN(0xca6c0937, 0xb1b76ba6), TOBN(0x1a2eab85, 0x4d2026dc), TOBN(0xb1715e15, 0x19d9ae0a), TOBN(0xf1ad9199, 0xbac4a026)}, {TOBN(0x35b3dfb8, 0x07ea7b0e), TOBN(0xedf5496f, 0x3ed9eb89), TOBN(0x8932e5ff, 0x2d6d08ab), TOBN(0xf314874e, 0x25bd2731)}}, {{TOBN(0xefb26a75, 0x3f73f449), TOBN(0x1d1c94f8, 0x8d44fc79), TOBN(0x49f0fbc5, 0x3bc0dc4d), TOBN(0xb747ea0b, 0x3698a0d0)}, {TOBN(0x5218c3fe, 0x228d291e), TOBN(0x35b804b5, 0x43c129d6), TOBN(0xfac859b8, 0xd1acc516), TOBN(0x6c10697d, 0x95d6e668)}}, {{TOBN(0xc38e438f, 0x0876fd4e), TOBN(0x45f0c307, 0x83d2f383), TOBN(0x203cc2ec, 0xb10934cb), TOBN(0x6a8f2439, 0x2c9d46ee)}, {TOBN(0xf16b431b, 0x65ccde7b), TOBN(0x41e2cd18, 0x27e76a6f), TOBN(0xb9c8cf8f, 0x4e3484d7), TOBN(0x64426efd, 0x8315244a)}}, {{TOBN(0x1c0a8e44, 0xfc94dea3), TOBN(0x34c8cdbf, 0xdad6a0b0), TOBN(0x919c3840, 0x04113cef), TOBN(0xfd32fba4, 0x15490ffa)}, {TOBN(0x58d190f6, 0x795dcfb7), TOBN(0xfef01b03, 0x83588baf), TOBN(0x9e6d1d63, 0xca1fc1c0), TOBN(0x53173f96, 0xf0a41ac9)}}, {{TOBN(0x2b1d402a, 0xba16f73b), TOBN(0x2fb31014, 0x8cf9b9fc), TOBN(0x2d51e60e, 0x446ef7bf), TOBN(0xc731021b, 0xb91e1745)}, {TOBN(0x9d3b4724, 0x4fee99d4), TOBN(0x4bca48b6, 0xfac5c1ea), TOBN(0x70f5f514, 0xbbea9af7), TOBN(0x751f55a5, 0x974c283a)}}, {{TOBN(0x6e30251a, 0xcb452fdb), TOBN(0x31ee6965, 0x50f30650), TOBN(0xb0b3e508, 0x933548d9), TOBN(0xb8949a4f, 0xf4b0ef5b)}, {TOBN(0x208b8326, 0x3c88f3bd), TOBN(0xab147c30, 0xdb1d9989), TOBN(0xed6515fd, 0x44d4df03), TOBN(0x17a12f75, 0xe72eb0c5)}}, {{TOBN(0x3b59796d, 0x36cf69db), TOBN(0x1219eee9, 0x56670c18), TOBN(0xfe3341f7, 0x7a070d8e), TOBN(0x9b70130b, 0xa327f90c)}, {TOBN(0x36a32462, 0x0ae18e0e), TOBN(0x2021a623, 0x46c0a638), TOBN(0x251b5817, 0xc62eb0d4), TOBN(0x87bfbcdf, 0x4c762293)}}, {{TOBN(0xf78ab505, 0xcdd61d64), TOBN(0x8c7a53fc, 0xc8c18857), TOBN(0xa653ce6f, 0x16147515), TOBN(0x9c923aa5, 0xea7d52d5)}, {TOBN(0xc24709cb, 0x5c18871f), TOBN(0x7d53bec8, 0x73b3cc74), TOBN(0x59264aff, 0xfdd1d4c4), TOBN(0x5555917e, 0x240da582)}}, {{TOBN(0xcae8bbda, 0x548f5a0e), TOBN(0x1910eaba, 0x3bbfbbe1), TOBN(0xae579685, 0x7677afc3), TOBN(0x49ea61f1, 0x73ff0b5c)}, {TOBN(0x78655478, 0x4f7c3922), TOBN(0x95d337cd, 0x20c68eef), TOBN(0x68f1e1e5, 0xdf779ab9), TOBN(0x14b491b0, 0xb5cf69a8)}}, {{TOBN(0x7a6cbbe0, 0x28e3fe89), TOBN(0xe7e1fee4, 0xc5aac0eb), TOBN(0x7f47eda5, 0x697e5140), TOBN(0x4f450137, 0xb454921f)}, {TOBN(0xdb625f84, 0x95cd8185), TOBN(0x74be0ba1, 0xcdb2e583), TOBN(0xaee4fd7c, 0xdd5e6de4), TOBN(0x4251437d, 
0xe8101739)}}, {{TOBN(0x686d72a0, 0xac620366), TOBN(0x4be3fb9c, 0xb6d59344), TOBN(0x6e8b44e7, 0xa1eb75b9), TOBN(0x84e39da3, 0x91a5c10c)}, {TOBN(0x37cc1490, 0xb38f0409), TOBN(0x02951943, 0x2c2ade82), TOBN(0x9b688783, 0x1190a2d8), TOBN(0x25627d14, 0x231182ba)}}, {{TOBN(0x6eb550aa, 0x658a6d87), TOBN(0x1405aaa7, 0xcf9c7325), TOBN(0xd147142e, 0x5c8748c9), TOBN(0x7f637e4f, 0x53ede0e0)}, {TOBN(0xf8ca2776, 0x14ffad2c), TOBN(0xe58fb1bd, 0xbafb6791), TOBN(0x17158c23, 0xbf8f93fc), TOBN(0x7f15b373, 0x0a4a4655)}}, {{TOBN(0x39d4add2, 0xd842ca72), TOBN(0xa71e4391, 0x3ed96305), TOBN(0x5bb09cbe, 0x6700be14), TOBN(0x68d69d54, 0xd8befcf6)}, {TOBN(0xa45f5367, 0x37183bcf), TOBN(0x7152b7bb, 0x3370dff7), TOBN(0xcf887baa, 0xbf12525b), TOBN(0xe7ac7bdd, 0xd6d1e3cd)}}, {{TOBN(0x25914f78, 0x81fdad90), TOBN(0xcf638f56, 0x0d2cf6ab), TOBN(0xb90bc03f, 0xcc054de5), TOBN(0x932811a7, 0x18b06350)}, {TOBN(0x2f00b330, 0x9bbd11ff), TOBN(0x76108a6f, 0xb4044974), TOBN(0x801bb9e0, 0xa851d266), TOBN(0x0dd099be, 0xbf8990c1)}}, {{TOBN(0x58c5aaaa, 0xabe32986), TOBN(0x0fe9dd2a, 0x50d59c27), TOBN(0x84951ff4, 0x8d307305), TOBN(0x6c23f829, 0x86529b78)}, {TOBN(0x50bb2218, 0x0b136a79), TOBN(0x7e2174de, 0x77a20996), TOBN(0x6f00a4b9, 0xc0bb4da6), TOBN(0x89a25a17, 0xefdde8da)}}, {{TOBN(0xf728a27e, 0xc11ee01d), TOBN(0xf900553a, 0xe5f10dfb), TOBN(0x189a83c8, 0x02ec893c), TOBN(0x3ca5bdc1, 0x23f66d77)}, {TOBN(0x98781537, 0x97eada9f), TOBN(0x59c50ab3, 0x10256230), TOBN(0x346042d9, 0x323c69b3), TOBN(0x1b715a6d, 0x2c460449)}}, {{TOBN(0xa41dd476, 0x6ae06e0b), TOBN(0xcdd7888e, 0x9d42e25f), TOBN(0x0f395f74, 0x56b25a20), TOBN(0xeadfe0ae, 0x8700e27e)}, {TOBN(0xb09d52a9, 0x69950093), TOBN(0x3525d9cb, 0x327f8d40), TOBN(0xb8235a94, 0x67df886a), TOBN(0x77e4b0dd, 0x035faec2)}}, {{TOBN(0x115eb20a, 0x517d7061), TOBN(0x77fe3433, 0x6c2df683), TOBN(0x6870ddc7, 0xcdc6fc67), TOBN(0xb1610588, 0x0b87de83)}, {TOBN(0x343584ca, 0xd9c4ddbe), TOBN(0xb3164f1c, 0x3d754be2), TOBN(0x0731ed3a, 0xc1e6c894), TOBN(0x26327dec, 0x4f6b904c)}}, {{TOBN(0x9d49c6de, 0x97b5cd32), TOBN(0x40835dae, 0xb5eceecd), TOBN(0xc66350ed, 0xd9ded7fe), TOBN(0x8aeebb5c, 0x7a678804)}, {TOBN(0x51d42fb7, 0x5b8ee9ec), TOBN(0xd7a17bdd, 0x8e3ca118), TOBN(0x40d7511a, 0x2ef4400e), TOBN(0xc48990ac, 0x875a66f4)}}, {{TOBN(0x8de07d2a, 0x2199e347), TOBN(0xbee75556, 0x2a39e051), TOBN(0x56918786, 0x916e51dc), TOBN(0xeb191313, 0x4a2d89ec)}, {TOBN(0x6679610d, 0x37d341ed), TOBN(0x434fbb41, 0x56d51c2b), TOBN(0xe54b7ee7, 0xd7492dba), TOBN(0xaa33a79a, 0x59021493)}}, {{TOBN(0x49fc5054, 0xe4bd6d3d), TOBN(0x09540f04, 0x5ab551d0), TOBN(0x8acc9085, 0x4942d3a6), TOBN(0x231af02f, 0x2d28323b)}, {TOBN(0x93458cac, 0x0992c163), TOBN(0x1fef8e71, 0x888e3bb4), TOBN(0x27578da5, 0xbe8c268c), TOBN(0xcc8be792, 0xe805ec00)}}, {{TOBN(0x29267bae, 0xc61c3855), TOBN(0xebff429d, 0x58c1fd3b), TOBN(0x22d886c0, 0x8c0b93b8), TOBN(0xca5e00b2, 0x2ddb8953)}, {TOBN(0xcf330117, 0xc3fed8b7), TOBN(0xd49ac6fa, 0x819c01f6), TOBN(0x6ddaa6bd, 0x3c0fbd54), TOBN(0x91743068, 0x8049a2cf)}}, {{TOBN(0xd67f981e, 0xaff2ef81), TOBN(0xc3654d35, 0x2818ae80), TOBN(0x81d05044, 0x1b2aa892), TOBN(0x2db067bf, 0x3d099328)}, {TOBN(0xe7c79e86, 0x703dcc97), TOBN(0xe66f9b37, 0xe133e215), TOBN(0xcdf119a6, 0xe39a7a5c), TOBN(0x47c60de3, 0x876f1b61)}}, {{TOBN(0x6e405939, 0xd860f1b2), TOBN(0x3e9a1dbc, 0xf5ed4d4a), TOBN(0x3f23619e, 0xc9b6bcbd), TOBN(0x5ee790cf, 0x734e4497)}, {TOBN(0xf0a834b1, 0x5bdaf9bb), TOBN(0x02cedda7, 0x4ca295f0), TOBN(0x4619aa2b, 0xcb8e378c), TOBN(0xe5613244, 0xcc987ea4)}}, {{TOBN(0x0bc022cc, 0x76b23a50), TOBN(0x4a2793ad, 0x0a6c21ce), TOBN(0x38328780, 0x89cac3f5), 
TOBN(0x29176f1b, 0xcba26d56)}, {TOBN(0x06296187, 0x4f6f59eb), TOBN(0x86e9bca9, 0x8bdc658e), TOBN(0x2ca9c4d3, 0x57e30402), TOBN(0x5438b216, 0x516a09bb)}}, {{TOBN(0x0a6a063c, 0x7672765a), TOBN(0x37a3ce64, 0x0547b9bf), TOBN(0x42c099c8, 0x98b1a633), TOBN(0xb5ab800d, 0x05ee6961)}, {TOBN(0xf1963f59, 0x11a5acd6), TOBN(0xbaee6157, 0x46201063), TOBN(0x36d9a649, 0xa596210a), TOBN(0xaed04363, 0x1ba7138c)}}, {{TOBN(0xcf817d1c, 0xa4a82b76), TOBN(0x5586960e, 0xf3806be9), TOBN(0x7ab67c89, 0x09dc6bb5), TOBN(0x52ace7a0, 0x114fe7eb)}, {TOBN(0xcd987618, 0xcbbc9b70), TOBN(0x4f06fd5a, 0x604ca5e1), TOBN(0x90af14ca, 0x6dbde133), TOBN(0x1afe4322, 0x948a3264)}}, {{TOBN(0xa70d2ca6, 0xc44b2c6c), TOBN(0xab726799, 0x0ef87dfe), TOBN(0x310f64dc, 0x2e696377), TOBN(0x49b42e68, 0x4c8126a0)}, {TOBN(0x0ea444c3, 0xcea0b176), TOBN(0x53a8ddf7, 0xcb269182), TOBN(0xf3e674eb, 0xbbba9dcb), TOBN(0x0d2878a8, 0xd8669d33)}}, {{TOBN(0x04b935d5, 0xd019b6a3), TOBN(0xbb5cf88e, 0x406f1e46), TOBN(0xa1912d16, 0x5b57c111), TOBN(0x9803fc21, 0x19ebfd78)}, {TOBN(0x4f231c9e, 0xc07764a9), TOBN(0xd93286ee, 0xb75bd055), TOBN(0x83a9457d, 0x8ee6c9de), TOBN(0x04695915, 0x6087ec90)}}, {{TOBN(0x14c6dd8a, 0x58d6cd46), TOBN(0x9cb633b5, 0x8e6634d2), TOBN(0xc1305047, 0xf81bc328), TOBN(0x12ede0e2, 0x26a177e5)}, {TOBN(0x332cca62, 0x065a6f4f), TOBN(0xc3a47ecd, 0x67be487b), TOBN(0x741eb187, 0x0f47ed1c), TOBN(0x99e66e58, 0xe7598b14)}}, {{TOBN(0x6f0544ca, 0x63d0ff12), TOBN(0xe5efc784, 0xb610a05f), TOBN(0xf72917b1, 0x7cad7b47), TOBN(0x3ff6ea20, 0xf2cac0c0)}, {TOBN(0xcc23791b, 0xf21db8b7), TOBN(0x7dac70b1, 0xd7d93565), TOBN(0x682cda1d, 0x694bdaad), TOBN(0xeb88bb8c, 0x1023516d)}}, {{TOBN(0xc4c634b4, 0xdfdbeb1b), TOBN(0x22f5ca72, 0xb4ee4dea), TOBN(0x1045a368, 0xe6524821), TOBN(0xed9e8a3f, 0x052b18b2)}, {TOBN(0x9b7f2cb1, 0xb961f49a), TOBN(0x7fee2ec1, 0x7b009670), TOBN(0x350d8754, 0x22507a6d), TOBN(0x561bd711, 0x4db55f1d)}}, {{TOBN(0x4c189ccc, 0x320bbcaf), TOBN(0x568434cf, 0xdf1de48c), TOBN(0x6af1b00e, 0x0fa8f128), TOBN(0xf0ba9d02, 0x8907583c)}, {TOBN(0x735a4004, 0x32ff9f60), TOBN(0x3dd8e4b6, 0xc25dcf33), TOBN(0xf2230f16, 0x42c74cef), TOBN(0xd8117623, 0x013fa8ad)}}, {{TOBN(0x36822876, 0xf51fe76e), TOBN(0x8a6811cc, 0x11d62589), TOBN(0xc3fc7e65, 0x46225718), TOBN(0xb7df2c9f, 0xc82fdbcd)}, {TOBN(0x3b1d4e52, 0xdd7b205b), TOBN(0xb6959478, 0x47a2e414), TOBN(0x05e4d793, 0xefa91148), TOBN(0xb47ed446, 0xfd2e9675)}}, {{TOBN(0x1a7098b9, 0x04c9d9bf), TOBN(0x661e2881, 0x1b793048), TOBN(0xb1a16966, 0xb01ee461), TOBN(0xbc521308, 0x2954746f)}, {TOBN(0xc909a0fc, 0x2477de50), TOBN(0xd80bb41c, 0x7dbd51ef), TOBN(0xa85be7ec, 0x53294905), TOBN(0x6d465b18, 0x83958f97)}}, {{TOBN(0x16f6f330, 0xfb6840fd), TOBN(0xfaaeb214, 0x3401e6c8), TOBN(0xaf83d30f, 0xccb5b4f8), TOBN(0x22885739, 0x266dec4b)}, {TOBN(0x51b4367c, 0x7bc467df), TOBN(0x926562e3, 0xd842d27a), TOBN(0xdfcb6614, 0x0fea14a6), TOBN(0xeb394dae, 0xf2734cd9)}}, {{TOBN(0x3eeae5d2, 0x11c0be98), TOBN(0xb1e6ed11, 0x814e8165), TOBN(0x191086bc, 0xe52bce1c), TOBN(0x14b74cc6, 0xa75a04da)}, {TOBN(0x63cf1186, 0x8c060985), TOBN(0x071047de, 0x2dbd7f7c), TOBN(0x4e433b8b, 0xce0942ca), TOBN(0xecbac447, 0xd8fec61d)}}, {{TOBN(0x8f0ed0e2, 0xebf3232f), TOBN(0xfff80f9e, 0xc52a2edd), TOBN(0xad9ab433, 0x75b55fdb), TOBN(0x73ca7820, 0xe42e0c11)}, {TOBN(0x6dace0a0, 0xe6251b46), TOBN(0x89bc6b5c, 0x4c0d932d), TOBN(0x3438cd77, 0x095da19a), TOBN(0x2f24a939, 0x8d48bdfb)}}, {{TOBN(0x99b47e46, 0x766561b7), TOBN(0x736600e6, 0x0ed0322a), TOBN(0x06a47cb1, 0x638e1865), TOBN(0x927c1c2d, 0xcb136000)}, {TOBN(0x29542337, 0x0cc5df69), TOBN(0x99b37c02, 0x09d649a9), TOBN(0xc5f0043c, 
0x6aefdb27), TOBN(0x6cdd9987, 0x1be95c27)}}, {{TOBN(0x69850931, 0x390420d2), TOBN(0x299c40ac, 0x0983efa4), TOBN(0x3a05e778, 0xaf39aead), TOBN(0x84274408, 0x43a45193)}, {TOBN(0x6bcd0fb9, 0x91a711a0), TOBN(0x461592c8, 0x9f52ab17), TOBN(0xb49302b4, 0xda3c6ed6), TOBN(0xc51fddc7, 0x330d7067)}}, {{TOBN(0x94babeb6, 0xda50d531), TOBN(0x521b840d, 0xa6a7b9da), TOBN(0x5305151e, 0x404bdc89), TOBN(0x1bcde201, 0xd0d07449)}, {TOBN(0xf427a78b, 0x3b76a59a), TOBN(0xf84841ce, 0x07791a1b), TOBN(0xebd314be, 0xbf91ed1c), TOBN(0x8e61d34c, 0xbf172943)}}, {{TOBN(0x1d5dc451, 0x5541b892), TOBN(0xb186ee41, 0xfc9d9e54), TOBN(0x9d9f345e, 0xd5bf610d), TOBN(0x3e7ba65d, 0xf6acca9f)}, {TOBN(0x9dda787a, 0xa8369486), TOBN(0x09f9dab7, 0x8eb5ba53), TOBN(0x5afb2033, 0xd6481bc3), TOBN(0x76f4ce30, 0xafa62104)}}, {{TOBN(0xa8fa00cf, 0xf4f066b5), TOBN(0x89ab5143, 0x461dafc2), TOBN(0x44339ed7, 0xa3389998), TOBN(0x2ff862f1, 0xbc214903)}, {TOBN(0x2c88f985, 0xb05556e3), TOBN(0xcd96058e, 0x3467081e), TOBN(0x7d6a4176, 0xedc637ea), TOBN(0xe1743d09, 0x36a5acdc)}}, {{TOBN(0x66fd72e2, 0x7eb37726), TOBN(0xf7fa264e, 0x1481a037), TOBN(0x9fbd3bde, 0x45f4aa79), TOBN(0xed1e0147, 0x767c3e22)}, {TOBN(0x7621f979, 0x82e7abe2), TOBN(0x19eedc72, 0x45f633f8), TOBN(0xe69b155e, 0x6137bf3a), TOBN(0xa0ad13ce, 0x414ee94e)}}, {{TOBN(0x93e3d524, 0x1c0e651a), TOBN(0xab1a6e2a, 0x02ce227e), TOBN(0xe7af1797, 0x4ab27eca), TOBN(0x245446de, 0xbd444f39)}, {TOBN(0x59e22a21, 0x56c07613), TOBN(0x43deafce, 0xf4275498), TOBN(0x10834ccb, 0x67fd0946), TOBN(0xa75841e5, 0x47406edf)}}, {{TOBN(0xebd6a677, 0x7b0ac93d), TOBN(0xa6e37b0d, 0x78f5e0d7), TOBN(0x2516c096, 0x76f5492b), TOBN(0x1e4bf888, 0x9ac05f3a)}, {TOBN(0xcdb42ce0, 0x4df0ba2b), TOBN(0x935d5cfd, 0x5062341b), TOBN(0x8a303333, 0x82acac20), TOBN(0x429438c4, 0x5198b00e)}}, {{TOBN(0x1d083bc9, 0x049d33fa), TOBN(0x58b82dda, 0x946f67ff), TOBN(0xac3e2db8, 0x67a1d6a3), TOBN(0x62e6bead, 0x1798aac8)}, {TOBN(0xfc85980f, 0xde46c58c), TOBN(0xa7f69379, 0x69c8d7be), TOBN(0x23557927, 0x837b35ec), TOBN(0x06a933d8, 0xe0790c0c)}}, {{TOBN(0x827c0e9b, 0x077ff55d), TOBN(0x53977798, 0xbb26e680), TOBN(0x59530874, 0x1d9cb54f), TOBN(0xcca3f449, 0x4aac53ef)}, {TOBN(0x11dc5c87, 0xa07eda0f), TOBN(0xc138bccf, 0xfd6400c8), TOBN(0x549680d3, 0x13e5da72), TOBN(0xc93eed82, 0x4540617e)}}, {{TOBN(0xfd3db157, 0x4d0b75c0), TOBN(0x9716eb42, 0x6386075b), TOBN(0x0639605c, 0x817b2c16), TOBN(0x09915109, 0xf1e4f201)}, {TOBN(0x35c9a928, 0x5cca6c3b), TOBN(0xb25f7d1a, 0x3505c900), TOBN(0xeb9f7d20, 0x630480c4), TOBN(0xc3c7b8c6, 0x2a1a501c)}}, {{TOBN(0x3f99183c, 0x5a1f8e24), TOBN(0xfdb118fa, 0x9dd255f0), TOBN(0xb9b18b90, 0xc27f62a6), TOBN(0xe8f732f7, 0x396ec191)}, {TOBN(0x524a2d91, 0x0be786ab), TOBN(0x5d32adef, 0x0ac5a0f5), TOBN(0x9b53d4d6, 0x9725f694), TOBN(0x032a76c6, 0x0510ba89)}}, {{TOBN(0x840391a3, 0xebeb1544), TOBN(0x44b7b88c, 0x3ed73ac3), TOBN(0xd24bae7a, 0x256cb8b3), TOBN(0x7ceb151a, 0xe394cb12)}, {TOBN(0xbd6b66d0, 0x5bc1e6a8), TOBN(0xec70cecb, 0x090f07bf), TOBN(0x270644ed, 0x7d937589), TOBN(0xee9e1a3d, 0x5f1dccfe)}}, {{TOBN(0xb0d40a84, 0x745b98d2), TOBN(0xda429a21, 0x2556ed40), TOBN(0xf676eced, 0x85148cb9), TOBN(0x5a22d40c, 0xded18936)}, {TOBN(0x3bc4b9e5, 0x70e8a4ce), TOBN(0xbfd1445b, 0x9eae0379), TOBN(0xf23f2c0c, 0x1a0bd47e), TOBN(0xa9c0bb31, 0xe1845531)}}, {{TOBN(0x9ddc4d60, 0x0a4c3f6b), TOBN(0xbdfaad79, 0x2c15ef44), TOBN(0xce55a236, 0x7f484acc), TOBN(0x08653ca7, 0x055b1f15)}, {TOBN(0x2efa8724, 0x538873a3), TOBN(0x09299e5d, 0xace1c7e7), TOBN(0x07afab66, 0xade332ba), TOBN(0x9be1fdf6, 0x92dd71b7)}}, {{TOBN(0xa49b5d59, 0x5758b11c), TOBN(0x0b852893, 0xc8654f40), 
TOBN(0xb63ef6f4, 0x52379447), TOBN(0xd4957d29, 0x105e690c)}, {TOBN(0x7d484363, 0x646559b0), TOBN(0xf4a8273c, 0x49788a8e), TOBN(0xee406cb8, 0x34ce54a9), TOBN(0x1e1c260f, 0xf86fda9b)}}, {{TOBN(0xe150e228, 0xcf6a4a81), TOBN(0x1fa3b6a3, 0x1b488772), TOBN(0x1e6ff110, 0xc5a9c15b), TOBN(0xc6133b91, 0x8ad6aa47)}, {TOBN(0x8ac5d55c, 0x9dffa978), TOBN(0xba1d1c1d, 0x5f3965f2), TOBN(0xf969f4e0, 0x7732b52f), TOBN(0xfceecdb5, 0xa5172a07)}}, {{TOBN(0xb0120a5f, 0x10f2b8f5), TOBN(0xc83a6cdf, 0x5c4c2f63), TOBN(0x4d47a491, 0xf8f9c213), TOBN(0xd9e1cce5, 0xd3f1bbd5)}, {TOBN(0x0d91bc7c, 0xaba7e372), TOBN(0xfcdc74c8, 0xdfd1a2db), TOBN(0x05efa800, 0x374618e5), TOBN(0x11216969, 0x15a7925e)}}, {{TOBN(0xd4c89823, 0xf6021c5d), TOBN(0x880d5e84, 0xeff14423), TOBN(0x6523bc5a, 0x6dcd1396), TOBN(0xd1acfdfc, 0x113c978b)}, {TOBN(0xb0c164e8, 0xbbb66840), TOBN(0xf7f4301e, 0x72b58459), TOBN(0xc29ad4a6, 0xa638e8ec), TOBN(0xf5ab8961, 0x46b78699)}}, {{TOBN(0x9dbd7974, 0x0e954750), TOBN(0x0121de88, 0x64f9d2c6), TOBN(0x2e597b42, 0xd985232e), TOBN(0x55b6c3c5, 0x53451777)}, {TOBN(0xbb53e547, 0x519cb9fb), TOBN(0xf134019f, 0x8428600d), TOBN(0x5a473176, 0xe081791a), TOBN(0x2f3e2263, 0x35fb0c08)}}, {{TOBN(0xb28c3017, 0x73d273b0), TOBN(0xccd21076, 0x7721ef9a), TOBN(0x054cc292, 0xb650dc39), TOBN(0x662246de, 0x6188045e)}, {TOBN(0x904b52fa, 0x6b83c0d1), TOBN(0xa72df267, 0x97e9cd46), TOBN(0x886b43cd, 0x899725e4), TOBN(0x2b651688, 0xd849ff22)}}, {{TOBN(0x60479b79, 0x02f34533), TOBN(0x5e354c14, 0x0c77c148), TOBN(0xb4bb7581, 0xa8537c78), TOBN(0x188043d7, 0xefe1495f)}, {TOBN(0x9ba12f42, 0x8c1d5026), TOBN(0x2e0c8a26, 0x93d4aaab), TOBN(0xbdba7b8b, 0xaa57c450), TOBN(0x140c9ad6, 0x9bbdafef)}}, {{TOBN(0x2067aa42, 0x25ac0f18), TOBN(0xf7b1295b, 0x04d1fbf3), TOBN(0x14829111, 0xa4b04824), TOBN(0x2ce3f192, 0x33bd5e91)}, {TOBN(0x9c7a1d55, 0x8f2e1b72), TOBN(0xfe932286, 0x302aa243), TOBN(0x497ca7b4, 0xd4be9554), TOBN(0xb8e821b8, 0xe0547a6e)}}, {{TOBN(0xfb2838be, 0x67e573e0), TOBN(0x05891db9, 0x4084c44b), TOBN(0x91311373, 0x96c1c2c5), TOBN(0x6aebfa3f, 0xd958444b)}, {TOBN(0xac9cdce9, 0xe56e55c1), TOBN(0x7148ced3, 0x2caa46d0), TOBN(0x2e10c7ef, 0xb61fe8eb), TOBN(0x9fd835da, 0xff97cf4d)}}}, {{{TOBN(0xa36da109, 0x081e9387), TOBN(0xfb9780d7, 0x8c935828), TOBN(0xd5940332, 0xe540b015), TOBN(0xc9d7b51b, 0xe0f466fa)}, {TOBN(0xfaadcd41, 0xd6d9f671), TOBN(0xba6c1e28, 0xb1a2ac17), TOBN(0x066a7833, 0xed201e5f), TOBN(0x19d99719, 0xf90f462b)}}, {{TOBN(0xf431f462, 0x060b5f61), TOBN(0xa56f46b4, 0x7bd057c2), TOBN(0x348dca6c, 0x47e1bf65), TOBN(0x9a38783e, 0x41bcf1ff)}, {TOBN(0x7a5d33a9, 0xda710718), TOBN(0x5a779987, 0x2e0aeaf6), TOBN(0xca87314d, 0x2d29d187), TOBN(0xfa0edc3e, 0xc687d733)}}, {{TOBN(0x9df33621, 0x6a31e09b), TOBN(0xde89e44d, 0xc1350e35), TOBN(0x29214871, 0x4ca0cf52), TOBN(0xdf379672, 0x0b88a538)}, {TOBN(0xc92a510a, 0x2591d61b), TOBN(0x79aa87d7, 0x585b447b), TOBN(0xf67db604, 0xe5287f77), TOBN(0x1697c8bf, 0x5efe7a80)}}, {{TOBN(0x1c894849, 0xcb198ac7), TOBN(0xa884a93d, 0x0f264665), TOBN(0x2da964ef, 0x9b200678), TOBN(0x3c351b87, 0x009834e6)}, {TOBN(0xafb2ef9f, 0xe2c4b44b), TOBN(0x580f6c47, 0x3326790c), TOBN(0xb8480521, 0x0b02264a), TOBN(0x8ba6f9e2, 0x42a194e2)}}, {{TOBN(0xfc87975f, 0x8fb54738), TOBN(0x35160788, 0x27c3ead3), TOBN(0x834116d2, 0xb74a085a), TOBN(0x53c99a73, 0xa62fe996)}, {TOBN(0x87585be0, 0x5b81c51b), TOBN(0x925bafa8, 0xbe0852b7), TOBN(0x76a4fafd, 0xa84d19a7), TOBN(0x39a45982, 0x585206d4)}}, {{TOBN(0x499b6ab6, 0x5eb03c0e), TOBN(0xf19b7954, 0x72bc3fde), TOBN(0xa86b5b9c, 0x6e3a80d2), TOBN(0xe4377508, 0x6d42819f)}, {TOBN(0xc1663650, 0xbb3ee8a3), TOBN(0x75eb14fc, 
0xb132075f), TOBN(0xa8ccc906, 0x7ad834f6), TOBN(0xea6a2474, 0xe6e92ffd)}}, {{TOBN(0x9d72fd95, 0x0f8d6758), TOBN(0xcb84e101, 0x408c07dd), TOBN(0xb9114bfd, 0xa5e23221), TOBN(0x358b5fe2, 0xe94e742c)}, {TOBN(0x1c0577ec, 0x95f40e75), TOBN(0xf0155451, 0x3d73f3d6), TOBN(0x9d55cd67, 0xbd1b9b66), TOBN(0x63e86e78, 0xaf8d63c7)}}, {{TOBN(0x39d934ab, 0xd3c095f1), TOBN(0x04b261be, 0xe4b76d71), TOBN(0x1d2e6970, 0xe73e6984), TOBN(0x879fb23b, 0x5e5fcb11)}, {TOBN(0x11506c72, 0xdfd75490), TOBN(0x3a97d085, 0x61bcf1c1), TOBN(0x43201d82, 0xbf5e7007), TOBN(0x7f0ac52f, 0x798232a7)}}, {{TOBN(0x2715cbc4, 0x6eb564d4), TOBN(0x8d6c752c, 0x9e570e29), TOBN(0xf80247c8, 0x9ef5fd5d), TOBN(0xc3c66b46, 0xd53eb514)}, {TOBN(0x9666b401, 0x0f87de56), TOBN(0xce62c06f, 0xc6c603b5), TOBN(0xae7b4c60, 0x7e4fc942), TOBN(0x38ac0b77, 0x663a9c19)}}, {{TOBN(0xcb4d20ee, 0x4b049136), TOBN(0x8b63bf12, 0x356a4613), TOBN(0x1221aef6, 0x70e08128), TOBN(0xe62d8c51, 0x4acb6b16)}, {TOBN(0x71f64a67, 0x379e7896), TOBN(0xb25237a2, 0xcafd7fa5), TOBN(0xf077bd98, 0x3841ba6a), TOBN(0xc4ac0244, 0x3cd16e7e)}}, {{TOBN(0x548ba869, 0x21fea4ca), TOBN(0xd36d0817, 0xf3dfdac1), TOBN(0x09d8d71f, 0xf4685faf), TOBN(0x8eff66be, 0xc52c459a)}, {TOBN(0x182faee7, 0x0b57235e), TOBN(0xee3c39b1, 0x0106712b), TOBN(0x5107331f, 0xc0fcdcb0), TOBN(0x669fb9dc, 0xa51054ba)}}, {{TOBN(0xb25101fb, 0x319d7682), TOBN(0xb0293129, 0x0a982fee), TOBN(0x51c1c9b9, 0x0261b344), TOBN(0x0e008c5b, 0xbfd371fa)}, {TOBN(0xd866dd1c, 0x0278ca33), TOBN(0x666f76a6, 0xe5aa53b1), TOBN(0xe5cfb779, 0x6013a2cf), TOBN(0x1d3a1aad, 0xa3521836)}}, {{TOBN(0xcedd2531, 0x73faa485), TOBN(0xc8ee6c4f, 0xc0a76878), TOBN(0xddbccfc9, 0x2a11667d), TOBN(0x1a418ea9, 0x1c2f695a)}, {TOBN(0xdb11bd92, 0x51f73971), TOBN(0x3e4b3c82, 0xda2ed89f), TOBN(0x9a44f3f4, 0xe73e0319), TOBN(0xd1e3de0f, 0x303431af)}}, {{TOBN(0x3c5604ff, 0x50f75f9c), TOBN(0x1d8eddf3, 0x7e752b22), TOBN(0x0ef074dd, 0x3c9a1118), TOBN(0xd0ffc172, 0xccb86d7b)}, {TOBN(0xabd1ece3, 0x037d90f2), TOBN(0xe3f307d6, 0x6055856c), TOBN(0x422f9328, 0x7e4c6daf), TOBN(0x902aac66, 0x334879a0)}}, {{TOBN(0xb6a1e7bf, 0x94cdfade), TOBN(0x6c97e1ed, 0x7fc6d634), TOBN(0x662ad24d, 0xa2fb63f8), TOBN(0xf81be1b9, 0xa5928405)}, {TOBN(0x86d765e4, 0xd14b4206), TOBN(0xbecc2e0e, 0x8fa0db65), TOBN(0xa28838e0, 0xb17fc76c), TOBN(0xe49a602a, 0xe37cf24e)}}, {{TOBN(0x76b4131a, 0x567193ec), TOBN(0xaf3c305a, 0xe5f6e70b), TOBN(0x9587bd39, 0x031eebdd), TOBN(0x5709def8, 0x71bbe831)}, {TOBN(0x57059983, 0x0eb2b669), TOBN(0x4d80ce1b, 0x875b7029), TOBN(0x838a7da8, 0x0364ac16), TOBN(0x2f431d23, 0xbe1c83ab)}}, {{TOBN(0xe56812a6, 0xf9294dd3), TOBN(0xb448d01f, 0x9b4b0d77), TOBN(0xf3ae6061, 0x04e8305c), TOBN(0x2bead645, 0x94d8c63e)}, {TOBN(0x0a85434d, 0x84fd8b07), TOBN(0x537b983f, 0xf7a9dee5), TOBN(0xedcc5f18, 0xef55bd85), TOBN(0x2041af62, 0x21c6cf8b)}}, {{TOBN(0x8e52874c, 0xb940c71e), TOBN(0x211935a9, 0xdb5f4b3a), TOBN(0x94350492, 0x301b1dc3), TOBN(0x33d2646d, 0x29958620)}, {TOBN(0x16b0d64b, 0xef911404), TOBN(0x9d1f25ea, 0x9a3c5ef4), TOBN(0x20f200eb, 0x4a352c78), TOBN(0x43929f2c, 0x4bd0b428)}}, {{TOBN(0xa5656667, 0xc7196e29), TOBN(0x7992c2f0, 0x9391be48), TOBN(0xaaa97cbd, 0x9ee0cd6e), TOBN(0x51b0310c, 0x3dc8c9bf)}, {TOBN(0x237f8acf, 0xdd9f22cb), TOBN(0xbb1d81a1, 0xb585d584), TOBN(0x8d5d85f5, 0x8c416388), TOBN(0x0d6e5a5a, 0x42fe474f)}}, {{TOBN(0xe7812766, 0x38235d4e), TOBN(0x1c62bd67, 0x496e3298), TOBN(0x8378660c, 0x3f175bc8), TOBN(0x4d04e189, 0x17afdd4d)}, {TOBN(0x32a81601, 0x85a8068c), TOBN(0xdb58e4e1, 0x92b29a85), TOBN(0xe8a65b86, 0xc70d8a3b), TOBN(0x5f0e6f4e, 0x98a0403b)}}, {{TOBN(0x08129684, 0x69ed2370), 
TOBN(0x34dc30bd, 0x0871ee26), TOBN(0x3a5ce948, 0x7c9c5b05), TOBN(0x7d487b80, 0x43a90c87)}, {TOBN(0x4089ba37, 0xdd0e7179), TOBN(0x45f80191, 0xb4041811), TOBN(0x1c3e1058, 0x98747ba5), TOBN(0x98c4e13a, 0x6e1ae592)}}, {{TOBN(0xd44636e6, 0xe82c9f9e), TOBN(0x711db87c, 0xc33a1043), TOBN(0x6f431263, 0xaa8aec05), TOBN(0x43ff120d, 0x2744a4aa)}, {TOBN(0xd3bd892f, 0xae77779b), TOBN(0xf0fe0cc9, 0x8cdc9f82), TOBN(0xca5f7fe6, 0xf1c5b1bc), TOBN(0xcc63a682, 0x44929a72)}}, {{TOBN(0xc7eaba0c, 0x09dbe19a), TOBN(0x2f3585ad, 0x6b5c73c2), TOBN(0x8ab8924b, 0x0ae50c30), TOBN(0x17fcd27a, 0x638b30ba)}, {TOBN(0xaf414d34, 0x10b3d5a5), TOBN(0x09c107d2, 0x2a9accf1), TOBN(0x15dac49f, 0x946a6242), TOBN(0xaec3df2a, 0xd707d642)}}, {{TOBN(0x2c2492b7, 0x3f894ae0), TOBN(0xf59df3e5, 0xb75f18ce), TOBN(0x7cb740d2, 0x8f53cad0), TOBN(0x3eb585fb, 0xc4f01294)}, {TOBN(0x17da0c86, 0x32c7f717), TOBN(0xeb8c795b, 0xaf943f4c), TOBN(0x4ee23fb5, 0xf67c51d2), TOBN(0xef187575, 0x68889949)}}, {{TOBN(0xa6b4bdb2, 0x0389168b), TOBN(0xc4ecd258, 0xea577d03), TOBN(0x3a63782b, 0x55743082), TOBN(0x6f678f4c, 0xc72f08cd)}, {TOBN(0x553511cf, 0x65e58dd8), TOBN(0xd53b4e3e, 0xd402c0cd), TOBN(0x37de3e29, 0xa037c14c), TOBN(0x86b6c516, 0xc05712aa)}}, {{TOBN(0x2834da3e, 0xb38dff6f), TOBN(0xbe012c52, 0xea636be8), TOBN(0x292d238c, 0x61dd37f8), TOBN(0x0e54523f, 0x8f8142db)}, {TOBN(0xe31eb436, 0x036a05d8), TOBN(0x83e3cdff, 0x1e93c0ff), TOBN(0x3fd2fe0f, 0x50821ddf), TOBN(0xc8e19b0d, 0xff9eb33b)}}, {{TOBN(0xc8cc943f, 0xb569a5fe), TOBN(0xad0090d4, 0xd4342d75), TOBN(0x82090b4b, 0xcaeca000), TOBN(0xca39687f, 0x1bd410eb)}, {TOBN(0xe7bb0df7, 0x65959d77), TOBN(0x39d78218, 0x9c964999), TOBN(0xd87f62e8, 0xb2415451), TOBN(0xe5efb774, 0xbed76108)}}, {{TOBN(0x3ea011a4, 0xe822f0d0), TOBN(0xbc647ad1, 0x5a8704f8), TOBN(0xbb315b35, 0x50c6820f), TOBN(0x863dec3d, 0xb7e76bec)}, {TOBN(0x01ff5d3a, 0xf017bfc7), TOBN(0x20054439, 0x976b8229), TOBN(0x067fca37, 0x0bbd0d3b), TOBN(0xf63dde64, 0x7f5e3d0f)}}, {{TOBN(0x22dbefb3, 0x2a4c94e9), TOBN(0xafbff0fe, 0x96f8278a), TOBN(0x80aea0b1, 0x3503793d), TOBN(0xb2238029, 0x5f06cd29)}, {TOBN(0x65703e57, 0x8ec3feca), TOBN(0x06c38314, 0x393e7053), TOBN(0xa0b751eb, 0x7c6734c4), TOBN(0xd2e8a435, 0xc59f0f1e)}}, {{TOBN(0x147d9052, 0x5e9ca895), TOBN(0x2f4dd31e, 0x972072df), TOBN(0xa16fda8e, 0xe6c6755c), TOBN(0xc66826ff, 0xcf196558)}, {TOBN(0x1f1a76a3, 0x0cf43895), TOBN(0xa9d604e0, 0x83c3097b), TOBN(0xe1908309, 0x66390e0e), TOBN(0xa50bf753, 0xb3c85eff)}}, {{TOBN(0x0696bdde, 0xf6a70251), TOBN(0x548b801b, 0x3c6ab16a), TOBN(0x37fcf704, 0xa4d08762), TOBN(0x090b3def, 0xdff76c4e)}, {TOBN(0x87e8cb89, 0x69cb9158), TOBN(0x44a90744, 0x995ece43), TOBN(0xf85395f4, 0x0ad9fbf5), TOBN(0x49b0f6c5, 0x4fb0c82d)}}, {{TOBN(0x75d9bc15, 0xadf7cccf), TOBN(0x81a3e5d6, 0xdfa1e1b0), TOBN(0x8c39e444, 0x249bc17e), TOBN(0xf37dccb2, 0x8ea7fd43)}, {TOBN(0xda654873, 0x907fba12), TOBN(0x35daa6da, 0x4a372904), TOBN(0x0564cfc6, 0x6283a6c5), TOBN(0xd09fa4f6, 0x4a9395bf)}}, {{TOBN(0x688e9ec9, 0xaeb19a36), TOBN(0xd913f1ce, 0xc7bfbfb4), TOBN(0x797b9a3c, 0x61c2faa6), TOBN(0x2f979bec, 0x6a0a9c12)}, {TOBN(0xb5969d0f, 0x359679ec), TOBN(0xebcf523d, 0x079b0460), TOBN(0xfd6b0008, 0x10fab870), TOBN(0x3f2edcda, 0x9373a39c)}}, {{TOBN(0x0d64f9a7, 0x6f568431), TOBN(0xf848c27c, 0x02f8898c), TOBN(0xf418ade1, 0x260b5bd5), TOBN(0xc1f3e323, 0x6973dee8)}, {TOBN(0x46e9319c, 0x26c185dd), TOBN(0x6d85b7d8, 0x546f0ac4), TOBN(0x427965f2, 0x247f9d57), TOBN(0xb519b636, 0xb0035f48)}}, {{TOBN(0x6b6163a9, 0xab87d59c), TOBN(0xff9f58c3, 0x39caaa11), TOBN(0x4ac39cde, 0x3177387b), TOBN(0x5f6557c2, 0x873e77f9)}, {TOBN(0x67504006, 
0x36a83041), TOBN(0x9b1c96ca, 0x75ef196c), TOBN(0xf34283de, 0xb08c7940), TOBN(0x7ea09644, 0x1128c316)}}, {{TOBN(0xb510b3b5, 0x6aa39dff), TOBN(0x59b43da2, 0x9f8e4d8c), TOBN(0xa8ce31fd, 0x9e4c4b9f), TOBN(0x0e20be26, 0xc1303c01)}, {TOBN(0x18187182, 0xe8ee47c9), TOBN(0xd9687cdb, 0x7db98101), TOBN(0x7a520e4d, 0xa1e14ff6), TOBN(0x429808ba, 0x8836d572)}}, {{TOBN(0xa37ca60d, 0x4944b663), TOBN(0xf901f7a9, 0xa3f91ae5), TOBN(0xe4e3e76e, 0x9e36e3b1), TOBN(0x9aa219cf, 0x29d93250)}, {TOBN(0x347fe275, 0x056a2512), TOBN(0xa4d643d9, 0xde65d95c), TOBN(0x9669d396, 0x699fc3ed), TOBN(0xb598dee2, 0xcf8c6bbe)}}, {{TOBN(0x682ac1e5, 0xdda9e5c6), TOBN(0x4e0d3c72, 0xcaa9fc95), TOBN(0x17faaade, 0x772bea44), TOBN(0x5ef8428c, 0xab0009c8)}, {TOBN(0xcc4ce47a, 0x460ff016), TOBN(0xda6d12bf, 0x725281cb), TOBN(0x44c67848, 0x0223aad2), TOBN(0x6e342afa, 0x36256e28)}}, {{TOBN(0x1400bb0b, 0x93a37c04), TOBN(0x62b1bc9b, 0xdd10bd96), TOBN(0x7251adeb, 0x0dac46b7), TOBN(0x7d33b92e, 0x7be4ef51)}, {TOBN(0x28b2a94b, 0xe61fa29a), TOBN(0x4b2be13f, 0x06422233), TOBN(0x36d6d062, 0x330d8d37), TOBN(0x5ef80e1e, 0xb28ca005)}}, {{TOBN(0x174d4699, 0x6d16768e), TOBN(0x9fc4ff6a, 0x628bf217), TOBN(0x77705a94, 0x154e490d), TOBN(0x9d96dd28, 0x8d2d997a)}, {TOBN(0x77e2d9d8, 0xce5d72c4), TOBN(0x9d06c5a4, 0xc11c714f), TOBN(0x02aa5136, 0x79e4a03e), TOBN(0x1386b3c2, 0x030ff28b)}}, {{TOBN(0xfe82e8a6, 0xfb283f61), TOBN(0x7df203e5, 0xf3abc3fb), TOBN(0xeec7c351, 0x3a4d3622), TOBN(0xf7d17dbf, 0xdf762761)}, {TOBN(0xc3956e44, 0x522055f0), TOBN(0xde3012db, 0x8fa748db), TOBN(0xca9fcb63, 0xbf1dcc14), TOBN(0xa56d9dcf, 0xbe4e2f3a)}}, {{TOBN(0xb86186b6, 0x8bcec9c2), TOBN(0x7cf24df9, 0x680b9f06), TOBN(0xc46b45ea, 0xc0d29281), TOBN(0xfff42bc5, 0x07b10e12)}, {TOBN(0x12263c40, 0x4d289427), TOBN(0x3d5f1899, 0xb4848ec4), TOBN(0x11f97010, 0xd040800c), TOBN(0xb4c5f529, 0x300feb20)}}, {{TOBN(0xcc543f8f, 0xde94fdcb), TOBN(0xe96af739, 0xc7c2f05e), TOBN(0xaa5e0036, 0x882692e1), TOBN(0x09c75b68, 0x950d4ae9)}, {TOBN(0x62f63df2, 0xb5932a7a), TOBN(0x2658252e, 0xde0979ad), TOBN(0x2a19343f, 0xb5e69631), TOBN(0x718c7501, 0x525b666b)}}, {{TOBN(0x26a42d69, 0xea40dc3a), TOBN(0xdc84ad22, 0xaecc018f), TOBN(0x25c36c7b, 0x3270f04a), TOBN(0x46ba6d47, 0x50fa72ed)}, {TOBN(0x6c37d1c5, 0x93e58a8e), TOBN(0xa2394731, 0x120c088c), TOBN(0xc3be4263, 0xcb6e86da), TOBN(0x2c417d36, 0x7126d038)}}, {{TOBN(0x5b70f9c5, 0x8b6f8efa), TOBN(0x671a2faa, 0x37718536), TOBN(0xd3ced3c6, 0xb539c92b), TOBN(0xe56f1bd9, 0xa31203c2)}, {TOBN(0x8b096ec4, 0x9ff3c8eb), TOBN(0x2deae432, 0x43491cea), TOBN(0x2465c6eb, 0x17943794), TOBN(0x5d267e66, 0x20586843)}}, {{TOBN(0x9d3d116d, 0xb07159d0), TOBN(0xae07a67f, 0xc1896210), TOBN(0x8fc84d87, 0xbb961579), TOBN(0x30009e49, 0x1c1f8dd6)}, {TOBN(0x8a8caf22, 0xe3132819), TOBN(0xcffa197c, 0xf23ab4ff), TOBN(0x58103a44, 0x205dd687), TOBN(0x57b796c3, 0x0ded67a2)}}, {{TOBN(0x0b9c3a6c, 0xa1779ad7), TOBN(0xa33cfe2e, 0x357c09c5), TOBN(0x2ea29315, 0x3db4a57e), TOBN(0x91959695, 0x8ebeb52e)}, {TOBN(0x118db9a6, 0xe546c879), TOBN(0x8e996df4, 0x6295c8d6), TOBN(0xdd990484, 0x55ec806b), TOBN(0x24f291ca, 0x165c1035)}}, {{TOBN(0xcca523bb, 0x440e2229), TOBN(0x324673a2, 0x73ef4d04), TOBN(0xaf3adf34, 0x3e11ec39), TOBN(0x6136d7f1, 0xdc5968d3)}, {TOBN(0x7a7b2899, 0xb053a927), TOBN(0x3eaa2661, 0xae067ecd), TOBN(0x8549b9c8, 0x02779cd9), TOBN(0x061d7940, 0xc53385ea)}}, {{TOBN(0x3e0ba883, 0xf06d18bd), TOBN(0x4ba6de53, 0xb2700843), TOBN(0xb966b668, 0x591a9e4d), TOBN(0x93f67567, 0x7f4fa0ed)}, {TOBN(0x5a02711b, 0x4347237b), TOBN(0xbc041e2f, 0xe794608e), TOBN(0x55af10f5, 0x70f73d8c), TOBN(0xd2d4d4f7, 0xbb7564f7)}}, 
{{TOBN(0xd7d27a89, 0xb3e93ce7), TOBN(0xf7b5a875, 0x5d3a2c1b), TOBN(0xb29e68a0, 0x255b218a), TOBN(0xb533837e, 0x8af76754)}, {TOBN(0xd1b05a73, 0x579fab2e), TOBN(0xb41055a1, 0xecd74385), TOBN(0xb2369274, 0x445e9115), TOBN(0x2972a7c4, 0xf520274e)}}, {{TOBN(0x6c08334e, 0xf678e68a), TOBN(0x4e4160f0, 0x99b057ed), TOBN(0x3cfe11b8, 0x52ccb69a), TOBN(0x2fd1823a, 0x21c8f772)}, {TOBN(0xdf7f072f, 0x3298f055), TOBN(0x8c0566f9, 0xfec74a6e), TOBN(0xe549e019, 0x5bb4d041), TOBN(0x7c3930ba, 0x9208d850)}}, {{TOBN(0xe07141fc, 0xaaa2902b), TOBN(0x539ad799, 0xe4f69ad3), TOBN(0xa6453f94, 0x813f9ffd), TOBN(0xc58d3c48, 0x375bc2f7)}, {TOBN(0xb3326fad, 0x5dc64e96), TOBN(0x3aafcaa9, 0xb240e354), TOBN(0x1d1b0903, 0xaca1e7a9), TOBN(0x4ceb9767, 0x1211b8a0)}}, {{TOBN(0xeca83e49, 0xe32a858e), TOBN(0x4c32892e, 0xae907bad), TOBN(0xd5b42ab6, 0x2eb9b494), TOBN(0x7fde3ee2, 0x1eabae1b)}, {TOBN(0x13b5ab09, 0xcaf54957), TOBN(0xbfb028be, 0xe5f5d5d5), TOBN(0x928a0650, 0x2003e2c0), TOBN(0x90793aac, 0x67476843)}}, {{TOBN(0x5e942e79, 0xc81710a0), TOBN(0x557e4a36, 0x27ccadd4), TOBN(0x72a2bc56, 0x4bcf6d0c), TOBN(0x09ee5f43, 0x26d7b80c)}, {TOBN(0x6b70dbe9, 0xd4292f19), TOBN(0x56f74c26, 0x63f16b18), TOBN(0xc23db0f7, 0x35fbb42a), TOBN(0xb606bdf6, 0x6ae10040)}}, {{TOBN(0x1eb15d4d, 0x044573ac), TOBN(0x7dc3cf86, 0x556b0ba4), TOBN(0x97af9a33, 0xc60df6f7), TOBN(0x0b1ef85c, 0xa716ce8c)}, {TOBN(0x2922f884, 0xc96958be), TOBN(0x7c32fa94, 0x35690963), TOBN(0x2d7f667c, 0xeaa00061), TOBN(0xeaaf7c17, 0x3547365c)}}, {{TOBN(0x1eb4de46, 0x87032d58), TOBN(0xc54f3d83, 0x5e2c79e0), TOBN(0x07818df4, 0x5d04ef23), TOBN(0x55faa9c8, 0x673d41b4)}, {TOBN(0xced64f6f, 0x89b95355), TOBN(0x4860d2ea, 0xb7415c84), TOBN(0x5fdb9bd2, 0x050ebad3), TOBN(0xdb53e0cc, 0x6685a5bf)}}, {{TOBN(0xb830c031, 0x9feb6593), TOBN(0xdd87f310, 0x6accff17), TOBN(0x2303ebab, 0x9f555c10), TOBN(0x94603695, 0x287e7065)}, {TOBN(0xf88311c3, 0x2e83358c), TOBN(0x508dd9b4, 0xeefb0178), TOBN(0x7ca23706, 0x2dba8652), TOBN(0x62aac5a3, 0x0047abe5)}}, {{TOBN(0x9a61d2a0, 0x8b1ea7b3), TOBN(0xd495ab63, 0xae8b1485), TOBN(0x38740f84, 0x87052f99), TOBN(0x178ebe5b, 0xb2974eea)}, {TOBN(0x030bbcca, 0x5b36d17f), TOBN(0xb5e4cce3, 0xaaf86eea), TOBN(0xb51a0220, 0x68f8e9e0), TOBN(0xa4348796, 0x09eb3e75)}}, {{TOBN(0xbe592309, 0xeef1a752), TOBN(0x5d7162d7, 0x6f2aa1ed), TOBN(0xaebfb5ed, 0x0f007dd2), TOBN(0x255e14b2, 0xc89edd22)}, {TOBN(0xba85e072, 0x0303b697), TOBN(0xc5d17e25, 0xf05720ff), TOBN(0x02b58d6e, 0x5128ebb6), TOBN(0x2c80242d, 0xd754e113)}}, {{TOBN(0x919fca5f, 0xabfae1ca), TOBN(0x937afaac, 0x1a21459b), TOBN(0x9e0ca91c, 0x1f66a4d2), TOBN(0x194cc7f3, 0x23ec1331)}, {TOBN(0xad25143a, 0x8aa11690), TOBN(0xbe40ad8d, 0x09b59e08), TOBN(0x37d60d9b, 0xe750860a), TOBN(0x6c53b008, 0xc6bf434c)}}, {{TOBN(0xb572415d, 0x1356eb80), TOBN(0xb8bf9da3, 0x9578ded8), TOBN(0x22658e36, 0x5e8fb38b), TOBN(0x9b70ce22, 0x5af8cb22)}, {TOBN(0x7c00018a, 0x829a8180), TOBN(0x84329f93, 0xb81ed295), TOBN(0x7c343ea2, 0x5f3cea83), TOBN(0x38f8655f, 0x67586536)}}, {{TOBN(0xa661a0d0, 0x1d3ec517), TOBN(0x98744652, 0x512321ae), TOBN(0x084ca591, 0xeca92598), TOBN(0xa9bb9dc9, 0x1dcb3feb)}, {TOBN(0x14c54355, 0x78b4c240), TOBN(0x5ed62a3b, 0x610cafdc), TOBN(0x07512f37, 0x1b38846b), TOBN(0x571bb70a, 0xb0e38161)}}, {{TOBN(0xb556b95b, 0x2da705d2), TOBN(0x3ef8ada6, 0xb1a08f98), TOBN(0x85302ca7, 0xddecfbe5), TOBN(0x0e530573, 0x943105cd)}, {TOBN(0x60554d55, 0x21a9255d), TOBN(0x63a32fa1, 0xf2f3802a), TOBN(0x35c8c5b0, 0xcd477875), TOBN(0x97f458ea, 0x6ad42da1)}}, {{TOBN(0x832d7080, 0xeb6b242d), TOBN(0xd30bd023, 0x3b71e246), TOBN(0x7027991b, 0xbe31139d), TOBN(0x68797e91, 
0x462e4e53)}, {TOBN(0x423fe20a, 0x6b4e185a), TOBN(0x82f2c67e, 0x42d9b707), TOBN(0x25c81768, 0x4cf7811b), TOBN(0xbd53005e, 0x045bb95d)}}}, {{{TOBN(0xe5f649be, 0x9d8e68fd), TOBN(0xdb0f0533, 0x1b044320), TOBN(0xf6fde9b3, 0xe0c33398), TOBN(0x92f4209b, 0x66c8cfae)}, {TOBN(0xe9d1afcc, 0x1a739d4b), TOBN(0x09aea75f, 0xa28ab8de), TOBN(0x14375fb5, 0xeac6f1d0), TOBN(0x6420b560, 0x708f7aa5)}}, {{TOBN(0x9eae499c, 0x6254dc41), TOBN(0x7e293924, 0x7a837e7e), TOBN(0x74aec08c, 0x090524a7), TOBN(0xf82b9219, 0x8d6f55f2)}, {TOBN(0x493c962e, 0x1402cec5), TOBN(0x9f17ca17, 0xfa2f30e7), TOBN(0xbcd783e8, 0xe9b879cb), TOBN(0xea3d8c14, 0x5a6f145f)}}, {{TOBN(0xdede15e7, 0x5e0dee6e), TOBN(0x74f24872, 0xdc628aa2), TOBN(0xd3e9c4fe, 0x7861bb93), TOBN(0x56d4822a, 0x6187b2e0)}, {TOBN(0xb66417cf, 0xc59826f9), TOBN(0xca260969, 0x2408169e), TOBN(0xedf69d06, 0xc79ef885), TOBN(0x00031f8a, 0xdc7d138f)}}, {{TOBN(0x103c46e6, 0x0ebcf726), TOBN(0x4482b831, 0x6231470e), TOBN(0x6f6dfaca, 0x487c2109), TOBN(0x2e0ace97, 0x62e666ef)}, {TOBN(0x3246a9d3, 0x1f8d1f42), TOBN(0x1b1e83f1, 0x574944d2), TOBN(0x13dfa63a, 0xa57f334b), TOBN(0x0cf8daed, 0x9f025d81)}}, {{TOBN(0x30d78ea8, 0x00ee11c1), TOBN(0xeb053cd4, 0xb5e3dd75), TOBN(0x9b65b13e, 0xd58c43c5), TOBN(0xc3ad49bd, 0xbd151663)}, {TOBN(0x99fd8e41, 0xb6427990), TOBN(0x12cf15bd, 0x707eae1e), TOBN(0x29ad4f1b, 0x1aabb71e), TOBN(0x5143e74d, 0x07545d0e)}}, {{TOBN(0x30266336, 0xc88bdee1), TOBN(0x25f29306, 0x5876767c), TOBN(0x9c078571, 0xc6731996), TOBN(0xc88690b2, 0xed552951)}, {TOBN(0x274f2c2d, 0x852705b4), TOBN(0xb0bf8d44, 0x4e09552d), TOBN(0x7628beeb, 0x986575d1), TOBN(0x407be238, 0x7f864651)}}, {{TOBN(0x0e5e3049, 0xa639fc6b), TOBN(0xe75c35d9, 0x86003625), TOBN(0x0cf35bd8, 0x5dcc1646), TOBN(0x8bcaced2, 0x6c26273a)}, {TOBN(0xe22ecf1d, 0xb5536742), TOBN(0x013dd897, 0x1a9e068b), TOBN(0x17f411cb, 0x8a7909c5), TOBN(0x5757ac98, 0x861dd506)}}, {{TOBN(0x85de1f0d, 0x1e935abb), TOBN(0xdefd10b4, 0x154de37a), TOBN(0xb8d9e392, 0x369cebb5), TOBN(0x54d5ef9b, 0x761324be)}, {TOBN(0x4d6341ba, 0x74f17e26), TOBN(0xc0a0e3c8, 0x78c1dde4), TOBN(0xa6d77581, 0x87d918fd), TOBN(0x66876015, 0x02ca3a13)}}, {{TOBN(0xc7313e9c, 0xf36658f0), TOBN(0xc433ef1c, 0x71f8057e), TOBN(0x85326246, 0x1b6a835a), TOBN(0xc8f05398, 0x7c86394c)}, {TOBN(0xff398cdf, 0xe983c4a1), TOBN(0xbf5e8162, 0x03b7b931), TOBN(0x93193c46, 0xb7b9045b), TOBN(0x1e4ebf5d, 0xa4a6e46b)}}, {{TOBN(0xf9942a60, 0x43a24fe7), TOBN(0x29c1191e, 0xffb3492b), TOBN(0x9f662449, 0x902fde05), TOBN(0xc792a7ac, 0x6713c32d)}, {TOBN(0x2fd88ad8, 0xb737982c), TOBN(0x7e3a0319, 0xa21e60e3), TOBN(0x09b0de44, 0x7383591a), TOBN(0x6df141ee, 0x8310a456)}}, {{TOBN(0xaec1a039, 0xe6d6f471), TOBN(0x14b2ba0f, 0x1198d12e), TOBN(0xebc1a160, 0x3aeee5ac), TOBN(0x401f4836, 0xe0b964ce)}, {TOBN(0x2ee43796, 0x4fd03f66), TOBN(0x3fdb4e49, 0xdd8f3f12), TOBN(0x6ef267f6, 0x29380f18), TOBN(0x3e8e9670, 0x8da64d16)}}, {{TOBN(0xbc19180c, 0x207674f1), TOBN(0x112e09a7, 0x33ae8fdb), TOBN(0x99667554, 0x6aaeb71e), TOBN(0x79432af1, 0xe101b1c7)}, {TOBN(0xd5eb558f, 0xde2ddec6), TOBN(0x81392d1f, 0x5357753f), TOBN(0xa7a76b97, 0x3ae1158a), TOBN(0x416fbbff, 0x4a899991)}}, {{TOBN(0x9e65fdfd, 0x0d4a9dcf), TOBN(0x7bc29e48, 0x944ddf12), TOBN(0xbc1a92d9, 0x3c856866), TOBN(0x273c6905, 0x6e98dfe2)}, {TOBN(0x69fce418, 0xcdfaa6b8), TOBN(0x606bd823, 0x5061c69f), TOBN(0x42d495a0, 0x6af75e27), TOBN(0x8ed3d505, 0x6d873a1f)}}, {{TOBN(0xaf552841, 0x6ab25b6a), TOBN(0xc6c0ffc7, 0x2b1a4523), TOBN(0xab18827b, 0x21c99e03), TOBN(0x060e8648, 0x9034691b)}, {TOBN(0x5207f90f, 0x93c7f398), TOBN(0x9f4a96cb, 0x82f8d10b), TOBN(0xdd71cd79, 0x3ad0f9e3), 
TOBN(0x84f435d2, 0xfc3a54f5)}}, {{TOBN(0x4b03c55b, 0x8e33787f), TOBN(0xef42f975, 0xa6384673), TOBN(0xff7304f7, 0x5051b9f0), TOBN(0x18aca1dc, 0x741c87c2)}, {TOBN(0x56f120a7, 0x2d4bfe80), TOBN(0xfd823b3d, 0x053e732c), TOBN(0x11bccfe4, 0x7537ca16), TOBN(0xdf6c9c74, 0x1b5a996b)}}, {{TOBN(0xee7332c7, 0x904fc3fa), TOBN(0x14a23f45, 0xc7e3636a), TOBN(0xc38659c3, 0xf091d9aa), TOBN(0x4a995e5d, 0xb12d8540)}, {TOBN(0x20a53bec, 0xf3a5598a), TOBN(0x56534b17, 0xb1eaa995), TOBN(0x9ed3dca4, 0xbf04e03c), TOBN(0x716c563a, 0xd8d56268)}}, {{TOBN(0x27ba77a4, 0x1d6178e7), TOBN(0xe4c80c40, 0x68a1ff8e), TOBN(0x75011099, 0x0a13f63d), TOBN(0x7bf33521, 0xa61d46f3)}, {TOBN(0x0aff218e, 0x10b365bb), TOBN(0x81021804, 0x0fd7ea75), TOBN(0x05a3fd8a, 0xa4b3a925), TOBN(0xb829e75f, 0x9b3db4e6)}}, {{TOBN(0x6bdc75a5, 0x4d53e5fb), TOBN(0x04a5dc02, 0xd52717e3), TOBN(0x86af502f, 0xe9a42ec2), TOBN(0x8867e8fb, 0x2630e382)}, {TOBN(0xbf845c6e, 0xbec9889b), TOBN(0x54f491f2, 0xcb47c98d), TOBN(0xa3091fba, 0x790c2a12), TOBN(0xd7f6fd78, 0xc20f708b)}}, {{TOBN(0xa569ac30, 0xacde5e17), TOBN(0xd0f996d0, 0x6852b4d7), TOBN(0xe51d4bb5, 0x4609ae54), TOBN(0x3fa37d17, 0x0daed061)}, {TOBN(0x62a88684, 0x34b8fb41), TOBN(0x99a2acbd, 0x9efb64f1), TOBN(0xb75c1a5e, 0x6448e1f2), TOBN(0xfa99951a, 0x42b5a069)}}, {{TOBN(0x6d956e89, 0x2f3b26e7), TOBN(0xf4709860, 0xda875247), TOBN(0x3ad15179, 0x2482dda3), TOBN(0xd64110e3, 0x017d82f0)}, {TOBN(0x14928d2c, 0xfad414e4), TOBN(0x2b155f58, 0x2ed02b24), TOBN(0x481a141b, 0xcb821bf1), TOBN(0x12e3c770, 0x4f81f5da)}}, {{TOBN(0xe49c5de5, 0x9fff8381), TOBN(0x11053232, 0x5bbec894), TOBN(0xa0d051cc, 0x454d88c4), TOBN(0x4f6db89c, 0x1f8e531b)}, {TOBN(0x34fe3fd6, 0xca563a44), TOBN(0x7f5c2215, 0x58da8ab9), TOBN(0x8445016d, 0x9474f0a1), TOBN(0x17d34d61, 0xcb7d8a0a)}}, {{TOBN(0x8e9d3910, 0x1c474019), TOBN(0xcaff2629, 0xd52ceefb), TOBN(0xf9cf3e32, 0xc1622c2b), TOBN(0xd4b95e3c, 0xe9071a05)}, {TOBN(0xfbbca61f, 0x1594438c), TOBN(0x1eb6e6a6, 0x04aadedf), TOBN(0x853027f4, 0x68e14940), TOBN(0x221d322a, 0xdfabda9c)}}, {{TOBN(0xed8ea9f6, 0xb7cb179a), TOBN(0xdc7b764d, 0xb7934dcc), TOBN(0xfcb13940, 0x5e09180d), TOBN(0x6629a6bf, 0xb47dc2dd)}, {TOBN(0xbfc55e4e, 0x9f5a915e), TOBN(0xb1db9d37, 0x6204441e), TOBN(0xf82d68cf, 0x930c5f53), TOBN(0x17d3a142, 0xcbb605b1)}}, {{TOBN(0xdd5944ea, 0x308780f2), TOBN(0xdc8de761, 0x3845f5e4), TOBN(0x6beaba7d, 0x7624d7a3), TOBN(0x1e709afd, 0x304df11e)}, {TOBN(0x95364376, 0x02170456), TOBN(0xbf204b3a, 0xc8f94b64), TOBN(0x4e53af7c, 0x5680ca68), TOBN(0x0526074a, 0xe0c67574)}}, {{TOBN(0x95d8cef8, 0xecd92af6), TOBN(0xe6b9fa7a, 0x6cd1745a), TOBN(0x3d546d3d, 0xa325c3e4), TOBN(0x1f57691d, 0x9ae93aae)}, {TOBN(0xe891f3fe, 0x9d2e1a33), TOBN(0xd430093f, 0xac063d35), TOBN(0xeda59b12, 0x5513a327), TOBN(0xdc2134f3, 0x5536f18f)}}, {{TOBN(0xaa51fe2c, 0x5c210286), TOBN(0x3f68aaee, 0x1cab658c), TOBN(0x5a23a00b, 0xf9357292), TOBN(0x9a626f39, 0x7efdabed)}, {TOBN(0xfe2b3bf3, 0x199d78e3), TOBN(0xb7a2af77, 0x71bbc345), TOBN(0x3d19827a, 0x1e59802c), TOBN(0x823bbc15, 0xb487a51c)}}, {{TOBN(0x856139f2, 0x99d0a422), TOBN(0x9ac3df65, 0xf456c6fb), TOBN(0xaddf65c6, 0x701f8bd6), TOBN(0x149f321e, 0x3758df87)}, {TOBN(0xb1ecf714, 0x721b7eba), TOBN(0xe17df098, 0x31a3312a), TOBN(0xdb2fd6ec, 0xd5c4d581), TOBN(0xfd02996f, 0x8fcea1b3)}}, {{TOBN(0xe29fa63e, 0x7882f14f), TOBN(0xc9f6dc35, 0x07c6cadc), TOBN(0x46f22d6f, 0xb882bed0), TOBN(0x1a45755b, 0xd118e52c)}, {TOBN(0x9f2c7c27, 0x7c4608cf), TOBN(0x7ccbdf32, 0x568012c2), TOBN(0xfcb0aedd, 0x61729b0e), TOBN(0x7ca2ca9e, 0xf7d75dbf)}}, {{TOBN(0xf58fecb1, 0x6f640f62), TOBN(0xe274b92b, 0x39f51946), TOBN(0x7f4dfc04, 
0x6288af44), TOBN(0x0a91f32a, 0xeac329e5)}, {TOBN(0x43ad274b, 0xd6aaba31), TOBN(0x719a1640, 0x0f6884f9), TOBN(0x685d29f6, 0xdaf91e20), TOBN(0x5ec1cc33, 0x27e49d52)}}, {{TOBN(0x38f4de96, 0x3b54a059), TOBN(0x0e0015e5, 0xefbcfdb3), TOBN(0x177d23d9, 0x4dbb8da6), TOBN(0x98724aa2, 0x97a617ad)}, {TOBN(0x30f0885b, 0xfdb6558e), TOBN(0xf9f7a28a, 0xc7899a96), TOBN(0xd2ae8ac8, 0x872dc112), TOBN(0xfa0642ca, 0x73c3c459)}}, {{TOBN(0x15296981, 0xe7dfc8d6), TOBN(0x67cd4450, 0x1fb5b94a), TOBN(0x0ec71cf1, 0x0eddfd37), TOBN(0xc7e5eeb3, 0x9a8eddc7)}, {TOBN(0x02ac8e3d, 0x81d95028), TOBN(0x0088f172, 0x70b0e35d), TOBN(0xec041fab, 0xe1881fe3), TOBN(0x62cf71b8, 0xd99e7faa)}}, {{TOBN(0x5043dea7, 0xe0f222c2), TOBN(0x309d42ac, 0x72e65142), TOBN(0x94fe9ddd, 0x9216cd30), TOBN(0xd6539c7d, 0x0f87feec)}, {TOBN(0x03c5a57c, 0x432ac7d7), TOBN(0x72692cf0, 0x327fda10), TOBN(0xec28c85f, 0x280698de), TOBN(0x2331fb46, 0x7ec283b1)}}, {{TOBN(0xd34bfa32, 0x2867e633), TOBN(0x78709a82, 0x0a9cc815), TOBN(0xb7fe6964, 0x875e2fa5), TOBN(0x25cc064f, 0x9e98bfb5)}, {TOBN(0x9eb0151c, 0x493a65c5), TOBN(0x5fb5d941, 0x53182464), TOBN(0x69e6f130, 0xf04618e2), TOBN(0xa8ecec22, 0xf89c8ab6)}}, {{TOBN(0xcd6ac88b, 0xb96209bd), TOBN(0x65fa8cdb, 0xb3e1c9e0), TOBN(0xa47d22f5, 0x4a8d8eac), TOBN(0x83895cdf, 0x8d33f963)}, {TOBN(0xa8adca59, 0xb56cd3d1), TOBN(0x10c8350b, 0xdaf38232), TOBN(0x2b161fb3, 0xa5080a9f), TOBN(0xbe7f5c64, 0x3af65b3a)}}, {{TOBN(0x2c754039, 0x97403a11), TOBN(0x94626cf7, 0x121b96af), TOBN(0x431de7c4, 0x6a983ec2), TOBN(0x3780dd3a, 0x52cc3df7)}, {TOBN(0xe28a0e46, 0x2baf8e3b), TOBN(0xabe68aad, 0x51d299ae), TOBN(0x603eb8f9, 0x647a2408), TOBN(0x14c61ed6, 0x5c750981)}}, {{TOBN(0x88b34414, 0xc53352e7), TOBN(0x5a34889c, 0x1337d46e), TOBN(0x612c1560, 0xf95f2bc8), TOBN(0x8a3f8441, 0xd4807a3a)}, {TOBN(0x680d9e97, 0x5224da68), TOBN(0x60cd6e88, 0xc3eb00e9), TOBN(0x3875a98e, 0x9a6bc375), TOBN(0xdc80f924, 0x4fd554c2)}}, {{TOBN(0x6c4b3415, 0x6ac77407), TOBN(0xa1e5ea8f, 0x25420681), TOBN(0x541bfa14, 0x4607a458), TOBN(0x5dbc7e7a, 0x96d7fbf9)}, {TOBN(0x646a851b, 0x31590a47), TOBN(0x039e85ba, 0x15ee6df8), TOBN(0xd19fa231, 0xd7b43fc0), TOBN(0x84bc8be8, 0x299a0e04)}}, {{TOBN(0x2b9d2936, 0xf20df03a), TOBN(0x24054382, 0x8608d472), TOBN(0x76b6ba04, 0x9149202a), TOBN(0xb21c3831, 0x3670e7b7)}, {TOBN(0xddd93059, 0xd6fdee10), TOBN(0x9da47ad3, 0x78488e71), TOBN(0x99cc1dfd, 0xa0fcfb25), TOBN(0x42abde10, 0x64696954)}}, {{TOBN(0x14cc15fc, 0x17eab9fe), TOBN(0xd6e863e4, 0xd3e70972), TOBN(0x29a7765c, 0x6432112c), TOBN(0x88660001, 0x5b0774d8)}, {TOBN(0x3729175a, 0x2c088eae), TOBN(0x13afbcae, 0x8230b8d4), TOBN(0x44768151, 0x915f4379), TOBN(0xf086431a, 0xd8d22812)}}, {{TOBN(0x37461955, 0xc298b974), TOBN(0x905fb5f0, 0xf8711e04), TOBN(0x787abf3a, 0xfe969d18), TOBN(0x392167c2, 0x6f6a494e)}, {TOBN(0xfc7a0d2d, 0x28c511da), TOBN(0xf127c7dc, 0xb66a262d), TOBN(0xf9c4bb95, 0xfd63fdf0), TOBN(0x90016589, 0x3913ef46)}}, {{TOBN(0x74d2a73c, 0x11aa600d), TOBN(0x2f5379bd, 0x9fb5ab52), TOBN(0xe49e53a4, 0x7fb70068), TOBN(0x68dd39e5, 0x404aa9a7)}, {TOBN(0xb9b0cf57, 0x2ecaa9c3), TOBN(0xba0e103b, 0xe824826b), TOBN(0x60c2198b, 0x4631a3c4), TOBN(0xc5ff84ab, 0xfa8966a2)}}, {{TOBN(0x2d6ebe22, 0xac95aff8), TOBN(0x1c9bb6db, 0xb5a46d09), TOBN(0x419062da, 0x53ee4f8d), TOBN(0x7b9042d0, 0xbb97efef)}, {TOBN(0x0f87f080, 0x830cf6bd), TOBN(0x4861d19a, 0x6ec8a6c6), TOBN(0xd3a0daa1, 0x202f01aa), TOBN(0xb0111674, 0xf25afbd5)}}, {{TOBN(0x6d00d6cf, 0x1afb20d9), TOBN(0x13695000, 0x40671bc5), TOBN(0x913ab0dc, 0x2485ea9b), TOBN(0x1f2bed06, 0x9eef61ac)}, {TOBN(0x850c8217, 0x6d799e20), TOBN(0x93415f37, 0x3271c2de), 
TOBN(0x5afb06e9, 0x6c4f5910), TOBN(0x688a52df, 0xc4e9e421)}}, {{TOBN(0x30495ba3, 0xe2a9a6db), TOBN(0x4601303d, 0x58f9268b), TOBN(0xbe3b0dad, 0x7eb0f04f), TOBN(0x4ea47250, 0x4456936d)}, {TOBN(0x8caf8798, 0xd33fd3e7), TOBN(0x1ccd8a89, 0xeb433708), TOBN(0x9effe3e8, 0x87fd50ad), TOBN(0xbe240a56, 0x6b29c4df)}}, {{TOBN(0xec4ffd98, 0xca0e7ebd), TOBN(0xf586783a, 0xe748616e), TOBN(0xa5b00d8f, 0xc77baa99), TOBN(0x0acada29, 0xb4f34c9c)}, {TOBN(0x36dad67d, 0x0fe723ac), TOBN(0x1d8e53a5, 0x39c36c1e), TOBN(0xe4dd342d, 0x1f4bea41), TOBN(0x64fd5e35, 0xebc9e4e0)}}, {{TOBN(0x96f01f90, 0x57908805), TOBN(0xb5b9ea3d, 0x5ed480dd), TOBN(0x366c5dc2, 0x3efd2dd0), TOBN(0xed2fe305, 0x6e9dfa27)}, {TOBN(0x4575e892, 0x6e9197e2), TOBN(0x11719c09, 0xab502a5d), TOBN(0x264c7bec, 0xe81f213f), TOBN(0x741b9241, 0x55f5c457)}}, {{TOBN(0x78ac7b68, 0x49a5f4f4), TOBN(0xf91d70a2, 0x9fc45b7d), TOBN(0x39b05544, 0xb0f5f355), TOBN(0x11f06bce, 0xeef930d9)}, {TOBN(0xdb84d25d, 0x038d05e1), TOBN(0x04838ee5, 0xbacc1d51), TOBN(0x9da3ce86, 0x9e8ee00b), TOBN(0xc3412057, 0xc36eda1f)}}, {{TOBN(0xae80b913, 0x64d9c2f4), TOBN(0x7468bac3, 0xa010a8ff), TOBN(0xdfd20037, 0x37359d41), TOBN(0x1a0f5ab8, 0x15efeacc)}, {TOBN(0x7c25ad2f, 0x659d0ce0), TOBN(0x4011bcbb, 0x6785cff1), TOBN(0x128b9912, 0x7e2192c7), TOBN(0xa549d8e1, 0x13ccb0e8)}}, {{TOBN(0x805588d8, 0xc85438b1), TOBN(0x5680332d, 0xbc25cb27), TOBN(0xdcd1bc96, 0x1a4bfdf4), TOBN(0x779ff428, 0x706f6566)}, {TOBN(0x8bbee998, 0xf059987a), TOBN(0xf6ce8cf2, 0xcc686de7), TOBN(0xf8ad3c4a, 0x953cfdb2), TOBN(0xd1d426d9, 0x2205da36)}}, {{TOBN(0xb3c0f13f, 0xc781a241), TOBN(0x3e89360e, 0xd75362a8), TOBN(0xccd05863, 0xc8a91184), TOBN(0x9bd0c9b7, 0xefa8a7f4)}, {TOBN(0x97ee4d53, 0x8a912a4b), TOBN(0xde5e15f8, 0xbcf518fd), TOBN(0x6a055bf8, 0xc467e1e0), TOBN(0x10be4b4b, 0x1587e256)}}, {{TOBN(0xd90c14f2, 0x668621c9), TOBN(0xd5518f51, 0xab9c92c1), TOBN(0x8e6a0100, 0xd6d47b3c), TOBN(0xcbe980dd, 0x66716175)}, {TOBN(0x500d3f10, 0xddd83683), TOBN(0x3b6cb35d, 0x99cac73c), TOBN(0x53730c8b, 0x6083d550), TOBN(0xcf159767, 0xdf0a1987)}}, {{TOBN(0x84bfcf53, 0x43ad73b3), TOBN(0x1b528c20, 0x4f035a94), TOBN(0x4294edf7, 0x33eeac69), TOBN(0xb6283e83, 0x817f3240)}, {TOBN(0xc3fdc959, 0x0a5f25b1), TOBN(0xefaf8aa5, 0x5844ee22), TOBN(0xde269ba5, 0xdbdde4de), TOBN(0xe3347160, 0xc56133bf)}}, {{TOBN(0xc1184219, 0x8d9ea9f8), TOBN(0x090de5db, 0xf3fc1ab5), TOBN(0x404c37b1, 0x0bf22cda), TOBN(0x7de20ec8, 0xf5618894)}, {TOBN(0x754c588e, 0xecdaecab), TOBN(0x6ca4b0ed, 0x88342743), TOBN(0x76f08bdd, 0xf4a938ec), TOBN(0xd182de89, 0x91493ccb)}}, {{TOBN(0xd652c53e, 0xc8a4186a), TOBN(0xb3e878db, 0x946d8e33), TOBN(0x088453c0, 0x5f37663c), TOBN(0x5cd9daaa, 0xb407748b)}, {TOBN(0xa1f5197f, 0x586d5e72), TOBN(0x47500be8, 0xc443ca59), TOBN(0x78ef35b2, 0xe2652424), TOBN(0x09c5d26f, 0x6dd7767d)}}, {{TOBN(0x7175a79a, 0xa74d3f7b), TOBN(0x0428fd8d, 0xcf5ea459), TOBN(0x511cb97c, 0xa5d1746d), TOBN(0x36363939, 0xe71d1278)}, {TOBN(0xcf2df955, 0x10350bf4), TOBN(0xb3817439, 0x60aae782), TOBN(0xa748c0e4, 0x3e688809), TOBN(0x98021fbf, 0xd7a5a006)}}, {{TOBN(0x9076a70c, 0x0e367a98), TOBN(0xbea1bc15, 0x0f62b7c2), TOBN(0x2645a68c, 0x30fe0343), TOBN(0xacaffa78, 0x699dc14f)}, {TOBN(0xf4469964, 0x457bf9c4), TOBN(0x0db6407b, 0x0d2ead83), TOBN(0x68d56cad, 0xb2c6f3eb), TOBN(0x3b512e73, 0xf376356c)}}, {{TOBN(0xe43b0e1f, 0xfce10408), TOBN(0x89ddc003, 0x5a5e257d), TOBN(0xb0ae0d12, 0x0362e5b3), TOBN(0x07f983c7, 0xb0519161)}, {TOBN(0xc2e94d15, 0x5d5231e7), TOBN(0xcff22aed, 0x0b4f9513), TOBN(0xb02588dd, 0x6ad0b0b5), TOBN(0xb967d1ac, 0x11d0dcd5)}}, {{TOBN(0x8dac6bc6, 0xcf777b6c), TOBN(0x0062bdbd, 
0x4c6d1959), TOBN(0x53da71b5, 0x0ef5cc85), TOBN(0x07012c7d, 0x4006f14f)}, {TOBN(0x4617f962, 0xac47800d), TOBN(0x53365f2b, 0xc102ed75), TOBN(0xb422efcb, 0x4ab8c9d3), TOBN(0x195cb26b, 0x34af31c9)}}, {{TOBN(0x3a926e29, 0x05f2c4ce), TOBN(0xbd2bdecb, 0x9856966c), TOBN(0x5d16ab3a, 0x85527015), TOBN(0x9f81609e, 0x4486c231)}, {TOBN(0xd8b96b2c, 0xda350002), TOBN(0xbd054690, 0xfa1b7d36), TOBN(0xdc90ebf5, 0xe71d79bc), TOBN(0xf241b6f9, 0x08964e4e)}}, {{TOBN(0x7c838643, 0x2fe3cd4c), TOBN(0xe0f33acb, 0xb4bc633c), TOBN(0xb4a9ecec, 0x3d139f1f), TOBN(0x05ce69cd, 0xdc4a1f49)}, {TOBN(0xa19d1b16, 0xf5f98aaf), TOBN(0x45bb71d6, 0x6f23e0ef), TOBN(0x33789fcd, 0x46cdfdd3), TOBN(0x9b8e2978, 0xcee040ca)}}, {{TOBN(0x9c69b246, 0xae0a6828), TOBN(0xba533d24, 0x7078d5aa), TOBN(0x7a2e42c0, 0x7bb4fbdb), TOBN(0xcfb4879a, 0x7035385c)}, {TOBN(0x8c3dd30b, 0x3281705b), TOBN(0x7e361c6c, 0x404fe081), TOBN(0x7b21649c, 0x3f604edf), TOBN(0x5dbf6a3f, 0xe52ffe47)}}, {{TOBN(0xc41b7c23, 0x4b54d9bf), TOBN(0x1374e681, 0x3511c3d9), TOBN(0x1863bf16, 0xc1b2b758), TOBN(0x90e78507, 0x1e9e6a96)}, {TOBN(0xab4bf98d, 0x5d86f174), TOBN(0xd74e0bd3, 0x85e96fe4), TOBN(0x8afde39f, 0xcac5d344), TOBN(0x90946dbc, 0xbd91b847)}}, {{TOBN(0xf5b42358, 0xfe1a838c), TOBN(0x05aae6c5, 0x620ac9d8), TOBN(0x8e193bd8, 0xa1ce5a0b), TOBN(0x8f710571, 0x4dabfd72)}, {TOBN(0x8d8fdd48, 0x182caaac), TOBN(0x8c4aeefa, 0x040745cf), TOBN(0x73c6c30a, 0xf3b93e6d), TOBN(0x991241f3, 0x16f42011)}}, {{TOBN(0xa0158eea, 0xe457a477), TOBN(0xd19857db, 0xee6ddc05), TOBN(0xb3265224, 0x18c41671), TOBN(0x3ffdfc7e, 0x3c2c0d58)}, {TOBN(0x3a3a5254, 0x26ee7cda), TOBN(0x341b0869, 0xdf02c3a8), TOBN(0xa023bf42, 0x723bbfc8), TOBN(0x3d15002a, 0x14452691)}}}, {{{TOBN(0x5ef7324c, 0x85edfa30), TOBN(0x25976554, 0x87d4f3da), TOBN(0x352f5bc0, 0xdcb50c86), TOBN(0x8f6927b0, 0x4832a96c)}, {TOBN(0xd08ee1ba, 0x55f2f94c), TOBN(0x6a996f99, 0x344b45fa), TOBN(0xe133cb8d, 0xa8aa455d), TOBN(0x5d0721ec, 0x758dc1f7)}}, {{TOBN(0x6ba7a920, 0x79e5fb67), TOBN(0xe1331feb, 0x70aa725e), TOBN(0x5080ccf5, 0x7df5d837), TOBN(0xe4cae01d, 0x7ff72e21)}, {TOBN(0xd9243ee6, 0x0412a77d), TOBN(0x06ff7cac, 0xdf449025), TOBN(0xbe75f7cd, 0x23ef5a31), TOBN(0xbc957822, 0x0ddef7a8)}}, {{TOBN(0x8cf7230c, 0xb0ce1c55), TOBN(0x5b534d05, 0x0bbfb607), TOBN(0xee1ef113, 0x0e16363b), TOBN(0x27e0aa7a, 0xb4999e82)}, {TOBN(0xce1dac2d, 0x79362c41), TOBN(0x67920c90, 0x91bb6cb0), TOBN(0x1e648d63, 0x2223df24), TOBN(0x0f7d9eef, 0xe32e8f28)}}, {{TOBN(0x6943f39a, 0xfa833834), TOBN(0x22951722, 0xa6328562), TOBN(0x81d63dd5, 0x4170fc10), TOBN(0x9f5fa58f, 0xaecc2e6d)}, {TOBN(0xb66c8725, 0xe77d9a3b), TOBN(0x11235cea, 0x6384ebe0), TOBN(0x06a8c118, 0x5845e24a), TOBN(0x0137b286, 0xebd093b1)}}, {{TOBN(0xc589e1ce, 0x44ace150), TOBN(0xe0f8d3d9, 0x4381e97c), TOBN(0x59e99b11, 0x62c5a4b8), TOBN(0x90d262f7, 0xfd0ec9f9)}, {TOBN(0xfbc854c9, 0x283e13c9), TOBN(0x2d04fde7, 0xaedc7085), TOBN(0x057d7765, 0x47dcbecb), TOBN(0x8dbdf591, 0x9a76fa5f)}}, {{TOBN(0xd0150695, 0x0de1e578), TOBN(0x2e1463e7, 0xe9f72bc6), TOBN(0xffa68441, 0x1b39eca5), TOBN(0x673c8530, 0x7c037f2f)}, {TOBN(0xd0d6a600, 0x747f91da), TOBN(0xb08d43e1, 0xc9cb78e9), TOBN(0x0fc0c644, 0x27b5cef5), TOBN(0x5c1d160a, 0xa60a2fd6)}}, {{TOBN(0xf98cae53, 0x28c8e13b), TOBN(0x375f10c4, 0xb2eddcd1), TOBN(0xd4eb8b7f, 0x5cce06ad), TOBN(0xb4669f45, 0x80a2e1ef)}, {TOBN(0xd593f9d0, 0x5bbd8699), TOBN(0x5528a4c9, 0xe7976d13), TOBN(0x3923e095, 0x1c7e28d3), TOBN(0xb9293790, 0x3f6bb577)}}, {{TOBN(0xdb567d6a, 0xc42bd6d2), TOBN(0x6df86468, 0xbb1f96ae), TOBN(0x0efe5b1a, 0x4843b28e), TOBN(0x961bbb05, 0x6379b240)}, {TOBN(0xb6caf5f0, 0x70a6a26b), 
TOBN(0x70686c0d, 0x328e6e39), TOBN(0x80da06cf, 0x895fc8d3), TOBN(0x804d8810, 0xb363fdc9)}}, {{TOBN(0xbe22877b, 0x207f1670), TOBN(0x9b0dd188, 0x4e615291), TOBN(0x625ae8dc, 0x97a3c2bf), TOBN(0x08584ef7, 0x439b86e8)}, {TOBN(0xde7190a5, 0xdcd898ff), TOBN(0x26286c40, 0x2058ee3d), TOBN(0x3db0b217, 0x5f87b1c1), TOBN(0xcc334771, 0x102a6db5)}}, {{TOBN(0xd99de954, 0x2f770fb1), TOBN(0x97c1c620, 0x4cd7535e), TOBN(0xd3b6c448, 0x3f09cefc), TOBN(0xd725af15, 0x5a63b4f8)}, {TOBN(0x0c95d24f, 0xc01e20ec), TOBN(0xdfd37494, 0x9ae7121f), TOBN(0x7d6ddb72, 0xec77b7ec), TOBN(0xfe079d3b, 0x0353a4ae)}}, {{TOBN(0x3066e70a, 0x2e6ac8d2), TOBN(0x9c6b5a43, 0x106e5c05), TOBN(0x52d3c6f5, 0xede59b8c), TOBN(0x30d6a5c3, 0xfccec9ae)}, {TOBN(0xedec7c22, 0x4fc0a9ef), TOBN(0x190ff083, 0x95c16ced), TOBN(0xbe12ec8f, 0x94de0fde), TOBN(0x0d131ab8, 0x852d3433)}}, {{TOBN(0x42ace07e, 0x85701291), TOBN(0x94793ed9, 0x194061a8), TOBN(0x30e83ed6, 0xd7f4a485), TOBN(0x9eec7269, 0xf9eeff4d)}, {TOBN(0x90acba59, 0x0c9d8005), TOBN(0x5feca458, 0x1e79b9d1), TOBN(0x8fbe5427, 0x1d506a1e), TOBN(0xa32b2c8e, 0x2439cfa7)}}, {{TOBN(0x1671c173, 0x73dd0b4e), TOBN(0x37a28214, 0x44a054c6), TOBN(0x81760a1b, 0x4e8b53f1), TOBN(0xa6c04224, 0xf9f93b9e)}, {TOBN(0x18784b34, 0xcf671e3c), TOBN(0x81bbecd2, 0xcda9b994), TOBN(0x38831979, 0xb2ab3848), TOBN(0xef54feb7, 0xf2e03c2d)}}, {{TOBN(0xcf197ca7, 0xfb8088fa), TOBN(0x01427247, 0x4ddc96c5), TOBN(0xa2d2550a, 0x30777176), TOBN(0x53469898, 0x4d0cf71d)}, {TOBN(0x6ce937b8, 0x3a2aaac6), TOBN(0xe9f91dc3, 0x5af38d9b), TOBN(0x2598ad83, 0xc8bf2899), TOBN(0x8e706ac9, 0xb5536c16)}}, {{TOBN(0x40dc7495, 0xf688dc98), TOBN(0x26490cd7, 0x124c4afc), TOBN(0xe651ec84, 0x1f18775c), TOBN(0x393ea6c3, 0xb4fdaf4a)}, {TOBN(0x1e1f3343, 0x7f338e0d), TOBN(0x39fb832b, 0x6053e7b5), TOBN(0x46e702da, 0x619e14d5), TOBN(0x859cacd1, 0xcdeef6e0)}}, {{TOBN(0x63b99ce7, 0x4462007d), TOBN(0xb8ab48a5, 0x4cb5f5b7), TOBN(0x9ec673d2, 0xf55edde7), TOBN(0xd1567f74, 0x8cfaefda)}, {TOBN(0x46381b6b, 0x0887bcec), TOBN(0x694497ce, 0xe178f3c2), TOBN(0x5e6525e3, 0x1e6266cb), TOBN(0x5931de26, 0x697d6413)}}, {{TOBN(0x87f8df7c, 0x0e58d493), TOBN(0xb1ae5ed0, 0x58b73f12), TOBN(0xc368f784, 0xdea0c34d), TOBN(0x9bd0a120, 0x859a91a0)}, {TOBN(0xb00d88b7, 0xcc863c68), TOBN(0x3a1cc11e, 0x3d1f4d65), TOBN(0xea38e0e7, 0x0aa85593), TOBN(0x37f13e98, 0x7dc4aee8)}}, {{TOBN(0x10d38667, 0xbc947bad), TOBN(0x738e07ce, 0x2a36ee2e), TOBN(0xc93470cd, 0xc577fcac), TOBN(0xdee1b616, 0x2782470d)}, {TOBN(0x36a25e67, 0x2e793d12), TOBN(0xd6aa6cae, 0xe0f186da), TOBN(0x474d0fd9, 0x80e07af7), TOBN(0xf7cdc47d, 0xba8a5cd4)}}, {{TOBN(0x28af6d9d, 0xab15247f), TOBN(0x7c789c10, 0x493a537f), TOBN(0x7ac9b110, 0x23a334e7), TOBN(0x0236ac09, 0x12c9c277)}, {TOBN(0xa7e5bd25, 0x1d7a5144), TOBN(0x098b9c2a, 0xf13ec4ec), TOBN(0x3639daca, 0xd3f0abca), TOBN(0x642da81a, 0xa23960f9)}}, {{TOBN(0x7d2e5c05, 0x4f7269b1), TOBN(0xfcf30777, 0xe287c385), TOBN(0x10edc84f, 0xf2a46f21), TOBN(0x35441757, 0x4f43fa36)}, {TOBN(0xf1327899, 0xfd703431), TOBN(0xa438d7a6, 0x16dd587a), TOBN(0x65c34c57, 0xe9c8352d), TOBN(0xa728edab, 0x5cc5a24e)}}, {{TOBN(0xaed78abc, 0x42531689), TOBN(0x0a51a0e8, 0x010963ef), TOBN(0x5776fa0a, 0xd717d9b3), TOBN(0xf356c239, 0x7dd3428b)}, {TOBN(0x29903fff, 0x8d3a3dac), TOBN(0x409597fa, 0x3d94491f), TOBN(0x4cd7a5ff, 0xbf4a56a4), TOBN(0xe5096474, 0x8adab462)}}, {{TOBN(0xa97b5126, 0x5c3427b0), TOBN(0x6401405c, 0xd282c9bd), TOBN(0x3629f8d7, 0x222c5c45), TOBN(0xb1c02c16, 0xe8d50aed)}, {TOBN(0xbea2ed75, 0xd9635bc9), TOBN(0x226790c7, 0x6e24552f), TOBN(0x3c33f2a3, 0x65f1d066), TOBN(0x2a43463e, 0x6dfccc2e)}}, {{TOBN(0x8cc3453a, 
0xdb483761), TOBN(0xe7cc6085, 0x65d5672b), TOBN(0x277ed6cb, 0xde3efc87), TOBN(0x19f2f368, 0x69234eaf)}, {TOBN(0x9aaf4317, 0x5c0b800b), TOBN(0x1f1e7c89, 0x8b6da6e2), TOBN(0x6cfb4715, 0xb94ec75e), TOBN(0xd590dd5f, 0x453118c2)}}, {{TOBN(0x14e49da1, 0x1f17a34c), TOBN(0x5420ab39, 0x235a1456), TOBN(0xb7637241, 0x2f50363b), TOBN(0x7b15d623, 0xc3fabb6e)}, {TOBN(0xa0ef40b1, 0xe274e49c), TOBN(0x5cf50744, 0x96b1860a), TOBN(0xd6583fbf, 0x66afe5a4), TOBN(0x44240510, 0xf47e3e9a)}}, {{TOBN(0x99254343, 0x11b2d595), TOBN(0xf1367499, 0xeec8df57), TOBN(0x3cb12c61, 0x3e73dd05), TOBN(0xd248c033, 0x7dac102a)}, {TOBN(0xcf154f13, 0xa77739f5), TOBN(0xbf4288cb, 0x23d2af42), TOBN(0xaa64c9b6, 0x32e4a1cf), TOBN(0xee8c07a8, 0xc8a208f3)}}, {{TOBN(0xe10d4999, 0x6fe8393f), TOBN(0x0f809a3f, 0xe91f3a32), TOBN(0x61096d1c, 0x802f63c8), TOBN(0x289e1462, 0x57750d3d)}, {TOBN(0xed06167e, 0x9889feea), TOBN(0xd5c9c0e2, 0xe0993909), TOBN(0x46fca0d8, 0x56508ac6), TOBN(0x91826047, 0x4f1b8e83)}}, {{TOBN(0x4f2c877a, 0x9a4a2751), TOBN(0x71bd0072, 0xcae6fead), TOBN(0x38df8dcc, 0x06aa1941), TOBN(0x5a074b4c, 0x63beeaa8)}, {TOBN(0xd6d65934, 0xc1cec8ed), TOBN(0xa6ecb49e, 0xaabc03bd), TOBN(0xaade91c2, 0xde8a8415), TOBN(0xcfb0efdf, 0x691136e0)}}, {{TOBN(0x11af45ee, 0x23ab3495), TOBN(0xa132df88, 0x0b77463d), TOBN(0x8923c15c, 0x815d06f4), TOBN(0xc3ceb3f5, 0x0d61a436)}, {TOBN(0xaf52291d, 0xe88fb1da), TOBN(0xea057974, 0x1da12179), TOBN(0xb0d7218c, 0xd2fef720), TOBN(0x6c0899c9, 0x8e1d8845)}}, {{TOBN(0x98157504, 0x752ddad7), TOBN(0xd60bd74f, 0xa1a68a97), TOBN(0x7047a3a9, 0xf658fb99), TOBN(0x1f5d86d6, 0x5f8511e4)}, {TOBN(0xb8a4bc42, 0x4b5a6d88), TOBN(0x69eb2c33, 0x1abefa7d), TOBN(0x95bf39e8, 0x13c9c510), TOBN(0xf571960a, 0xd48aab43)}}, {{TOBN(0x7e8cfbcf, 0x704e23c6), TOBN(0xc71b7d22, 0x28aaa65b), TOBN(0xa041b2bd, 0x245e3c83), TOBN(0x69b98834, 0xd21854ff)}, {TOBN(0x89d227a3, 0x963bfeec), TOBN(0x99947aaa, 0xde7da7cb), TOBN(0x1d9ee9db, 0xee68a9b1), TOBN(0x0a08f003, 0x698ec368)}}, {{TOBN(0xe9ea4094, 0x78ef2487), TOBN(0xc8d2d415, 0x02cfec26), TOBN(0xc52f9a6e, 0xb7dcf328), TOBN(0x0ed489e3, 0x85b6a937)}, {TOBN(0x9b94986b, 0xbef3366e), TOBN(0x0de59c70, 0xedddddb8), TOBN(0xffdb748c, 0xeadddbe2), TOBN(0x9b9784bb, 0x8266ea40)}}, {{TOBN(0x142b5502, 0x1a93507a), TOBN(0xb4cd1187, 0x8d3c06cf), TOBN(0xdf70e76a, 0x91ec3f40), TOBN(0x484e81ad, 0x4e7553c2)}, {TOBN(0x830f87b5, 0x272e9d6e), TOBN(0xea1c93e5, 0xc6ff514a), TOBN(0x67cc2adc, 0xc4192a8e), TOBN(0xc77e27e2, 0x42f4535a)}}, {{TOBN(0x9cdbab36, 0xd2b713c5), TOBN(0x86274ea0, 0xcf7b0cd3), TOBN(0x784680f3, 0x09af826b), TOBN(0xbfcc837a, 0x0c72dea3)}, {TOBN(0xa8bdfe9d, 0xd6529b73), TOBN(0x708aa228, 0x63a88002), TOBN(0x6c7a9a54, 0xc91d45b9), TOBN(0xdf1a38bb, 0xfd004f56)}}, {{TOBN(0x2e8c9a26, 0xb8bad853), TOBN(0x2d52cea3, 0x3723eae7), TOBN(0x054d6d81, 0x56ca2830), TOBN(0xa3317d14, 0x9a8dc411)}, {TOBN(0xa08662fe, 0xfd4ddeda), TOBN(0xed2a153a, 0xb55d792b), TOBN(0x7035c16a, 0xbfc6e944), TOBN(0xb6bc5834, 0x00171cf3)}}, {{TOBN(0xe27152b3, 0x83d102b6), TOBN(0xfe695a47, 0x0646b848), TOBN(0xa5bb09d8, 0x916e6d37), TOBN(0xb4269d64, 0x0d17015e)}, {TOBN(0x8d8156a1, 0x0a1d2285), TOBN(0xfeef6c51, 0x46d26d72), TOBN(0x9dac57c8, 0x4c5434a7), TOBN(0x0282e5be, 0x59d39e31)}}, {{TOBN(0xedfff181, 0x721c486d), TOBN(0x301baf10, 0xbc58824e), TOBN(0x8136a6aa, 0x00570031), TOBN(0x55aaf78c, 0x1cddde68)}, {TOBN(0x26829371, 0x59c63952), TOBN(0x3a3bd274, 0x8bc25baf), TOBN(0xecdf8657, 0xb7e52dc3), TOBN(0x2dd8c087, 0xfd78e6c8)}}, {{TOBN(0x20553274, 0xf5531461), TOBN(0x8b4a1281, 0x5d95499b), TOBN(0xe2c8763a, 0x1a80f9d2), TOBN(0xd1dbe32b, 0x4ddec758)}, 
{TOBN(0xaf12210d, 0x30c34169), TOBN(0xba74a953, 0x78baa533), TOBN(0x3d133c6e, 0xa438f254), TOBN(0xa431531a, 0x201bef5b)}}, {{TOBN(0x15295e22, 0xf669d7ec), TOBN(0xca374f64, 0x357fb515), TOBN(0x8a8406ff, 0xeaa3fdb3), TOBN(0x106ae448, 0xdf3f2da8)}, {TOBN(0x8f9b0a90, 0x33c8e9a1), TOBN(0x234645e2, 0x71ad5885), TOBN(0x3d083224, 0x1c0aed14), TOBN(0xf10a7d3e, 0x7a942d46)}}, {{TOBN(0x7c11deee, 0x40d5c9be), TOBN(0xb2bae7ff, 0xba84ed98), TOBN(0x93e97139, 0xaad58ddd), TOBN(0x3d872796, 0x3f6d1fa3)}, {TOBN(0x483aca81, 0x8569ff13), TOBN(0x8b89a5fb, 0x9a600f72), TOBN(0x4cbc27c3, 0xc06f2b86), TOBN(0x22130713, 0x63ad9c0b)}}, {{TOBN(0xb5358b1e, 0x48ac2840), TOBN(0x18311294, 0xecba9477), TOBN(0xda58f990, 0xa6946b43), TOBN(0x3098baf9, 0x9ab41819)}, {TOBN(0x66c4c158, 0x4198da52), TOBN(0xab4fc17c, 0x146bfd1b), TOBN(0x2f0a4c3c, 0xbf36a908), TOBN(0x2ae9e34b, 0x58cf7838)}}, {{TOBN(0xf411529e, 0x3fa11b1f), TOBN(0x21e43677, 0x974af2b4), TOBN(0x7c20958e, 0xc230793b), TOBN(0x710ea885, 0x16e840f3)}, {TOBN(0xfc0b21fc, 0xc5dc67cf), TOBN(0x08d51647, 0x88405718), TOBN(0xd955c21f, 0xcfe49eb7), TOBN(0x9722a5d5, 0x56dd4a1f)}}, {{TOBN(0xc9ef50e2, 0xc861baa5), TOBN(0xc0c21a5d, 0x9505ac3e), TOBN(0xaf6b9a33, 0x8b7c063f), TOBN(0xc6370339, 0x2f4779c1)}, {TOBN(0x22df99c7, 0x638167c3), TOBN(0xfe6ffe76, 0x795db30c), TOBN(0x2b822d33, 0xa4854989), TOBN(0xfef031dd, 0x30563aa5)}}, {{TOBN(0x16b09f82, 0xd57c667f), TOBN(0xc70312ce, 0xcc0b76f1), TOBN(0xbf04a9e6, 0xc9118aec), TOBN(0x82fcb419, 0x3409d133)}, {TOBN(0x1a8ab385, 0xab45d44d), TOBN(0xfba07222, 0x617b83a3), TOBN(0xb05f50dd, 0x58e81b52), TOBN(0x1d8db553, 0x21ce5aff)}}, {{TOBN(0x3097b8d4, 0xe344a873), TOBN(0x7d8d116d, 0xfe36d53e), TOBN(0x6db22f58, 0x7875e750), TOBN(0x2dc5e373, 0x43e144ea)}, {TOBN(0xc05f32e6, 0xe799eb95), TOBN(0xe9e5f4df, 0x6899e6ec), TOBN(0xbdc3bd68, 0x1fab23d5), TOBN(0xb72b8ab7, 0x73af60e6)}}, {{TOBN(0x8db27ae0, 0x2cecc84a), TOBN(0x600016d8, 0x7bdb871c), TOBN(0x42a44b13, 0xd7c46f58), TOBN(0xb8919727, 0xc3a77d39)}, {TOBN(0xcfc6bbbd, 0xdafd6088), TOBN(0x1a740146, 0x6bd20d39), TOBN(0x8c747abd, 0x98c41072), TOBN(0x4c91e765, 0xbdf68ea1)}}, {{TOBN(0x7c95e5ca, 0x08819a78), TOBN(0xcf48b729, 0xc9587921), TOBN(0x091c7c5f, 0xdebbcc7d), TOBN(0x6f287404, 0xf0e05149)}, {TOBN(0xf83b5ac2, 0x26cd44ec), TOBN(0x88ae32a6, 0xcfea250e), TOBN(0x6ac5047a, 0x1d06ebc5), TOBN(0xc7e550b4, 0xd434f781)}}, {{TOBN(0x61ab1cf2, 0x5c727bd2), TOBN(0x2e4badb1, 0x1cf915b0), TOBN(0x1b4dadec, 0xf69d3920), TOBN(0xe61b1ca6, 0xf14c1dfe)}, {TOBN(0x90b479cc, 0xbd6bd51f), TOBN(0x8024e401, 0x8045ec30), TOBN(0xcab29ca3, 0x25ef0e62), TOBN(0x4f2e9416, 0x49e4ebc0)}}, {{TOBN(0x45eb40ec, 0x0ccced58), TOBN(0x25cd4b9c, 0x0da44f98), TOBN(0x43e06458, 0x871812c6), TOBN(0x99f80d55, 0x16cef651)}, {TOBN(0x571340c9, 0xce6dc153), TOBN(0x138d5117, 0xd8665521), TOBN(0xacdb45bc, 0x4e07014d), TOBN(0x2f34bb38, 0x84b60b91)}}, {{TOBN(0xf44a4fd2, 0x2ae8921e), TOBN(0xb039288e, 0x892ba1e2), TOBN(0x9da50174, 0xb1c180b2), TOBN(0x6b70ab66, 0x1693dc87)}, {TOBN(0x7e9babc9, 0xe7057481), TOBN(0x4581ddef, 0x9c80dc41), TOBN(0x0c890da9, 0x51294682), TOBN(0x0b5629d3, 0x3f4736e5)}}, {{TOBN(0x2340c79e, 0xb06f5b41), TOBN(0xa42e84ce, 0x4e243469), TOBN(0xf9a20135, 0x045a71a9), TOBN(0xefbfb415, 0xd27b6fb6)}, {TOBN(0x25ebea23, 0x9d33cd6f), TOBN(0x9caedb88, 0xaa6c0af8), TOBN(0x53dc7e9a, 0xd9ce6f96), TOBN(0x3897f9fd, 0x51e0b15a)}}, {{TOBN(0xf51cb1f8, 0x8e5d788e), TOBN(0x1aec7ba8, 0xe1d490ee), TOBN(0x265991e0, 0xcc58cb3c), TOBN(0x9f306e8c, 0x9fc3ad31)}, {TOBN(0x5fed006e, 0x5040a0ac), TOBN(0xca9d5043, 0xfb476f2e), TOBN(0xa19c06e8, 0xbeea7a23), TOBN(0xd2865801, 
0x0edabb63)}}, {{TOBN(0xdb92293f, 0x6967469a), TOBN(0x2894d839, 0x8d8a8ed8), TOBN(0x87c9e406, 0xbbc77122), TOBN(0x8671c6f1, 0x2ea3a26a)}, {TOBN(0xe42df8d6, 0xd7de9853), TOBN(0x2e3ce346, 0xb1f2bcc7), TOBN(0xda601dfc, 0x899d50cf), TOBN(0xbfc913de, 0xfb1b598f)}}, {{TOBN(0x81c4909f, 0xe61f7908), TOBN(0x192e304f, 0x9bbc7b29), TOBN(0xc3ed8738, 0xc104b338), TOBN(0xedbe9e47, 0x783f5d61)}, {TOBN(0x0c06e9be, 0x2db30660), TOBN(0xda3e613f, 0xc0eb7d8e), TOBN(0xd8fa3e97, 0x322e096e), TOBN(0xfebd91e8, 0xd336e247)}}, {{TOBN(0x8f13ccc4, 0xdf655a49), TOBN(0xa9e00dfc, 0x5eb20210), TOBN(0x84631d0f, 0xc656b6ea), TOBN(0x93a058cd, 0xd8c0d947)}, {TOBN(0x6846904a, 0x67bd3448), TOBN(0x4a3d4e1a, 0xf394fd5c), TOBN(0xc102c1a5, 0xdb225f52), TOBN(0xe3455bba, 0xfc4f5e9a)}}, {{TOBN(0x6b36985b, 0x4b9ad1ce), TOBN(0xa9818536, 0x5bb7f793), TOBN(0x6c25e1d0, 0x48b1a416), TOBN(0x1381dd53, 0x3c81bee7)}, {TOBN(0xd2a30d61, 0x7a4a7620), TOBN(0xc8412926, 0x39b8944c), TOBN(0x3c1c6fbe, 0x7a97c33a), TOBN(0x941e541d, 0x938664e7)}}, {{TOBN(0x417499e8, 0x4a34f239), TOBN(0x15fdb83c, 0xb90402d5), TOBN(0xb75f46bf, 0x433aa832), TOBN(0xb61e15af, 0x63215db1)}, {TOBN(0xaabe59d4, 0xa127f89a), TOBN(0x5d541e0c, 0x07e816da), TOBN(0xaaba0659, 0xa618b692), TOBN(0x55327733, 0x17266026)}}, {{TOBN(0xaf53a0fc, 0x95f57552), TOBN(0x32947650, 0x6cacb0c9), TOBN(0x253ff58d, 0xc821be01), TOBN(0xb0309531, 0xa06f1146)}, {TOBN(0x59bbbdf5, 0x05c2e54d), TOBN(0x158f27ad, 0x26e8dd22), TOBN(0xcc5b7ffb, 0x397e1e53), TOBN(0xae03f65b, 0x7fc1e50d)}}, {{TOBN(0xa9784ebd, 0x9c95f0f9), TOBN(0x5ed9deb2, 0x24640771), TOBN(0x31244af7, 0x035561c4), TOBN(0x87332f3a, 0x7ee857de)}, {TOBN(0x09e16e9e, 0x2b9e0d88), TOBN(0x52d910f4, 0x56a06049), TOBN(0x507ed477, 0xa9592f48), TOBN(0x85cb917b, 0x2365d678)}}, {{TOBN(0xf8511c93, 0x4c8998d1), TOBN(0x2186a3f1, 0x730ea58f), TOBN(0x50189626, 0xb2029db0), TOBN(0x9137a6d9, 0x02ceb75a)}, {TOBN(0x2fe17f37, 0x748bc82c), TOBN(0x87c2e931, 0x80469f8c), TOBN(0x850f71cd, 0xbf891aa2), TOBN(0x0ca1b89b, 0x75ec3d8d)}}, {{TOBN(0x516c43aa, 0x5e1cd3cd), TOBN(0x89397808, 0x9a887c28), TOBN(0x0059c699, 0xddea1f9f), TOBN(0x7737d6fa, 0x8e6868f7)}, {TOBN(0x6d93746a, 0x60f1524b), TOBN(0x36985e55, 0xba052aa7), TOBN(0x41b1d322, 0xed923ea5), TOBN(0x3429759f, 0x25852a11)}}, {{TOBN(0xbeca6ec3, 0x092e9f41), TOBN(0x3a238c66, 0x62256bbd), TOBN(0xd82958ea, 0x70ad487d), TOBN(0x4ac8aaf9, 0x65610d93)}, {TOBN(0x3fa101b1, 0x5e4ccab0), TOBN(0x9bf430f2, 0x9de14bfb), TOBN(0xa10f5cc6, 0x6531899d), TOBN(0x590005fb, 0xea8ce17d)}}, {{TOBN(0xc437912f, 0x24544cb6), TOBN(0x9987b71a, 0xd79ac2e3), TOBN(0x13e3d9dd, 0xc058a212), TOBN(0x00075aac, 0xd2de9606)}, {TOBN(0x80ab508b, 0x6cac8369), TOBN(0x87842be7, 0xf54f6c89), TOBN(0xa7ad663d, 0x6bc532a4), TOBN(0x67813de7, 0x78a91bc8)}}, {{TOBN(0x5dcb61ce, 0xc3427239), TOBN(0x5f3c7cf0, 0xc56934d9), TOBN(0xc079e0fb, 0xe3191591), TOBN(0xe40896bd, 0xb01aada7)}, {TOBN(0x8d466791, 0x0492d25f), TOBN(0x8aeb30c9, 0xe7408276), TOBN(0xe9437495, 0x9287aacc), TOBN(0x23d4708d, 0x79fe03d4)}}, {{TOBN(0x8cda9cf2, 0xd0c05199), TOBN(0x502fbc22, 0xfae78454), TOBN(0xc0bda9df, 0xf572a182), TOBN(0x5f9b71b8, 0x6158b372)}, {TOBN(0xe0f33a59, 0x2b82dd07), TOBN(0x76302735, 0x9523032e), TOBN(0x7fe1a721, 0xc4505a32), TOBN(0x7b6e3e82, 0xf796409f)}}}, {{{TOBN(0xe3417bc0, 0x35d0b34a), TOBN(0x440b386b, 0x8327c0a7), TOBN(0x8fb7262d, 0xac0362d1), TOBN(0x2c41114c, 0xe0cdf943)}, {TOBN(0x2ba5cef1, 0xad95a0b1), TOBN(0xc09b37a8, 0x67d54362), TOBN(0x26d6cdd2, 0x01e486c9), TOBN(0x20477abf, 0x42ff9297)}}, {{TOBN(0xa004dcb3, 0x292a9287), TOBN(0xddc15cf6, 0x77b092c7), TOBN(0x083a8464, 0x806c0605), 
TOBN(0x4a68df70, 0x3db997b0)}, {TOBN(0x9c134e45, 0x05bf7dd0), TOBN(0xa4e63d39, 0x8ccf7f8c), TOBN(0xa6e6517f, 0x41b5f8af), TOBN(0xaa8b9342, 0xad7bc1cc)}}, {{TOBN(0x126f35b5, 0x1e706ad9), TOBN(0xb99cebb4, 0xc3a9ebdf), TOBN(0xa75389af, 0xbf608d90), TOBN(0x76113c4f, 0xc6c89858)}, {TOBN(0x80de8eb0, 0x97e2b5aa), TOBN(0x7e1022cc, 0x63b91304), TOBN(0x3bdab605, 0x6ccc066c), TOBN(0x33cbb144, 0xb2edf900)}}, {{TOBN(0xc4176471, 0x7af715d2), TOBN(0xe2f7f594, 0xd0134a96), TOBN(0x2c1873ef, 0xa41ec956), TOBN(0xe4e7b4f6, 0x77821304)}, {TOBN(0xe5c8ff97, 0x88d5374a), TOBN(0x2b915e63, 0x80823d5b), TOBN(0xea6bc755, 0xb2ee8fe2), TOBN(0x6657624c, 0xe7112651)}}, {{TOBN(0x157af101, 0xdace5aca), TOBN(0xc4fdbcf2, 0x11a6a267), TOBN(0xdaddf340, 0xc49c8609), TOBN(0x97e49f52, 0xe9604a65)}, {TOBN(0x9be8e790, 0x937e2ad5), TOBN(0x846e2508, 0x326e17f1), TOBN(0x3f38007a, 0x0bbbc0dc), TOBN(0xcf03603f, 0xb11e16d6)}}, {{TOBN(0xd6f800e0, 0x7442f1d5), TOBN(0x475607d1, 0x66e0e3ab), TOBN(0x82807f16, 0xb7c64047), TOBN(0x8858e1e3, 0xa749883d)}, {TOBN(0x5859120b, 0x8231ee10), TOBN(0x1b80e7eb, 0x638a1ece), TOBN(0xcb72525a, 0xc6aa73a4), TOBN(0xa7cdea3d, 0x844423ac)}}, {{TOBN(0x5ed0c007, 0xf8ae7c38), TOBN(0x6db07a5c, 0x3d740192), TOBN(0xbe5e9c2a, 0x5fe36db3), TOBN(0xd5b9d57a, 0x76e95046)}, {TOBN(0x54ac32e7, 0x8eba20f2), TOBN(0xef11ca8f, 0x71b9a352), TOBN(0x305e373e, 0xff98a658), TOBN(0xffe5a100, 0x823eb667)}}, {{TOBN(0x57477b11, 0xe51732d2), TOBN(0xdfd6eb28, 0x2538fc0e), TOBN(0x5c43b0cc, 0x3b39eec5), TOBN(0x6af12778, 0xcb36cc57)}, {TOBN(0x70b0852d, 0x06c425ae), TOBN(0x6df92f8c, 0x5c221b9b), TOBN(0x6c8d4f9e, 0xce826d9c), TOBN(0xf59aba7b, 0xb49359c3)}}, {{TOBN(0x5c8ed8d5, 0xda64309d), TOBN(0x61a6de56, 0x91b30704), TOBN(0xd6b52f6a, 0x2f9b5808), TOBN(0x0eee4194, 0x98c958a7)}, {TOBN(0xcddd9aab, 0x771e4caa), TOBN(0x83965dfd, 0x78bc21be), TOBN(0x02affce3, 0xb3b504f5), TOBN(0x30847a21, 0x561c8291)}}, {{TOBN(0xd2eb2cf1, 0x52bfda05), TOBN(0xe0e4c4e9, 0x6197b98c), TOBN(0x1d35076c, 0xf8a1726f), TOBN(0x6c06085b, 0x2db11e3d)}, {TOBN(0x15c0c4d7, 0x4463ba14), TOBN(0x9d292f83, 0x0030238c), TOBN(0x1311ee8b, 0x3727536d), TOBN(0xfeea86ef, 0xbeaedc1e)}}, {{TOBN(0xb9d18cd3, 0x66131e2e), TOBN(0xf31d974f, 0x80fe2682), TOBN(0xb6e49e0f, 0xe4160289), TOBN(0x7c48ec0b, 0x08e92799)}, {TOBN(0x818111d8, 0xd1989aa7), TOBN(0xb34fa0aa, 0xebf926f9), TOBN(0xdb5fe2f5, 0xa245474a), TOBN(0xf80a6ebb, 0x3c7ca756)}}, {{TOBN(0xa7f96054, 0xafa05dd8), TOBN(0x26dfcf21, 0xfcaf119e), TOBN(0xe20ef2e3, 0x0564bb59), TOBN(0xef4dca50, 0x61cb02b8)}, {TOBN(0xcda7838a, 0x65d30672), TOBN(0x8b08d534, 0xfd657e86), TOBN(0x4c5b4395, 0x46d595c8), TOBN(0x39b58725, 0x425cb836)}}, {{TOBN(0x8ea61059, 0x3de9abe3), TOBN(0x40434881, 0x9cdc03be), TOBN(0x9b261245, 0xcfedce8c), TOBN(0x78c318b4, 0xcf5234a1)}, {TOBN(0x510bcf16, 0xfde24c99), TOBN(0x2a77cb75, 0xa2c2ff5d), TOBN(0x9c895c2b, 0x27960fb4), TOBN(0xd30ce975, 0xb0eda42b)}}, {{TOBN(0xfda85393, 0x1a62cc26), TOBN(0x23c69b96, 0x50c0e052), TOBN(0xa227df15, 0xbfc633f3), TOBN(0x2ac78848, 0x1bae7d48)}, {TOBN(0x487878f9, 0x187d073d), TOBN(0x6c2be919, 0x967f807d), TOBN(0x765861d8, 0x336e6d8f), TOBN(0x88b8974c, 0xce528a43)}}, {{TOBN(0x09521177, 0xff57d051), TOBN(0x2ff38037, 0xfb6a1961), TOBN(0xfc0aba74, 0xa3d76ad4), TOBN(0x7c764803, 0x25a7ec17)}, {TOBN(0x7532d75f, 0x48879bc8), TOBN(0xea7eacc0, 0x58ce6bc1), TOBN(0xc82176b4, 0x8e896c16), TOBN(0x9a30e0b2, 0x2c750fed)}}, {{TOBN(0xc37e2c2e, 0x421d3aa4), TOBN(0xf926407c, 0xe84fa840), TOBN(0x18abc03d, 0x1454e41c), TOBN(0x26605ecd, 0x3f7af644)}, {TOBN(0x242341a6, 0xd6a5eabf), TOBN(0x1edb84f4, 0x216b668e), TOBN(0xd836edb8, 
0x04010102), TOBN(0x5b337ce7, 0x945e1d8c)}}, {{TOBN(0xd2075c77, 0xc055dc14), TOBN(0x2a0ffa25, 0x81d89cdf), TOBN(0x8ce815ea, 0x6ffdcbaf), TOBN(0xa3428878, 0xfb648867)}, {TOBN(0x277699cf, 0x884655fb), TOBN(0xfa5b5bd6, 0x364d3e41), TOBN(0x01f680c6, 0x441e1cb7), TOBN(0x3fd61e66, 0xb70a7d67)}}, {{TOBN(0x666ba2dc, 0xcc78cf66), TOBN(0xb3018174, 0x6fdbff77), TOBN(0x8d4dd0db, 0x168d4668), TOBN(0x259455d0, 0x1dab3a2a)}, {TOBN(0xf58564c5, 0xcde3acec), TOBN(0x77141925, 0x13adb276), TOBN(0x527d725d, 0x8a303f65), TOBN(0x55deb6c9, 0xe6f38f7b)}}, {{TOBN(0xfd5bb657, 0xb1fa70fb), TOBN(0xfa07f50f, 0xd8073a00), TOBN(0xf72e3aa7, 0xbca02500), TOBN(0xf68f895d, 0x9975740d)}, {TOBN(0x30112060, 0x5cae2a6a), TOBN(0x01bd7218, 0x02874842), TOBN(0x3d423891, 0x7ce47bd3), TOBN(0xa66663c1, 0x789544f6)}}, {{TOBN(0x864d05d7, 0x3272d838), TOBN(0xe22924f9, 0xfa6295c5), TOBN(0x8189593f, 0x6c2fda32), TOBN(0x330d7189, 0xb184b544)}, {TOBN(0x79efa62c, 0xbde1f714), TOBN(0x35771c94, 0xe5cb1a63), TOBN(0x2f4826b8, 0x641c8332), TOBN(0x00a894fb, 0xc8cee854)}}, {{TOBN(0xb4b9a39b, 0x36194d40), TOBN(0xe857a7c5, 0x77612601), TOBN(0xf4209dd2, 0x4ecf2f58), TOBN(0x82b9e66d, 0x5a033487)}, {TOBN(0xc1e36934, 0xe4e8b9dd), TOBN(0xd2372c9d, 0xa42377d7), TOBN(0x51dc94c7, 0x0e3ae43b), TOBN(0x4c57761e, 0x04474f6f)}}, {{TOBN(0xdcdacd0a, 0x1058a318), TOBN(0x369cf3f5, 0x78053a9a), TOBN(0xc6c3de50, 0x31c68de2), TOBN(0x4653a576, 0x3c4b6d9f)}, {TOBN(0x1688dd5a, 0xaa4e5c97), TOBN(0x5be80aa1, 0xb7ab3c74), TOBN(0x70cefe7c, 0xbc65c283), TOBN(0x57f95f13, 0x06867091)}}, {{TOBN(0xa39114e2, 0x4415503b), TOBN(0xc08ff7c6, 0x4cbb17e9), TOBN(0x1eff674d, 0xd7dec966), TOBN(0x6d4690af, 0x53376f63)}, {TOBN(0xff6fe32e, 0xea74237b), TOBN(0xc436d17e, 0xcd57508e), TOBN(0x15aa28e1, 0xedcc40fe), TOBN(0x0d769c04, 0x581bbb44)}}, {{TOBN(0xc240b6de, 0x34eaacda), TOBN(0xd9e116e8, 0x2ba0f1de), TOBN(0xcbe45ec7, 0x79438e55), TOBN(0x91787c9d, 0x96f752d7)}, {TOBN(0x897f532b, 0xf129ac2f), TOBN(0xd307b7c8, 0x5a36e22c), TOBN(0x91940675, 0x749fb8f3), TOBN(0xd14f95d0, 0x157fdb28)}}, {{TOBN(0xfe51d029, 0x6ae55043), TOBN(0x8931e98f, 0x44a87de1), TOBN(0xe57f1cc6, 0x09e4fee2), TOBN(0x0d063b67, 0x4e072d92)}, {TOBN(0x70a998b9, 0xed0e4316), TOBN(0xe74a736b, 0x306aca46), TOBN(0xecf0fbf2, 0x4fda97c7), TOBN(0xa40f65cb, 0x3e178d93)}}, {{TOBN(0x16253604, 0x16df4285), TOBN(0xb0c9babb, 0xd0c56ae2), TOBN(0x73032b19, 0xcfc5cfc3), TOBN(0xe497e5c3, 0x09752056)}, {TOBN(0x12096bb4, 0x164bda96), TOBN(0x1ee42419, 0xa0b74da1), TOBN(0x8fc36243, 0x403826ba), TOBN(0x0c8f0069, 0xdc09e660)}}, {{TOBN(0x8667e981, 0xc27253c9), TOBN(0x05a6aefb, 0x92b36a45), TOBN(0xa62c4b36, 0x9cb7bb46), TOBN(0x8394f375, 0x11f7027b)}, {TOBN(0x747bc79c, 0x5f109d0f), TOBN(0xcad88a76, 0x5b8cc60a), TOBN(0x80c5a66b, 0x58f09e68), TOBN(0xe753d451, 0xf6127eac)}}, {{TOBN(0xc44b74a1, 0x5b0ec6f5), TOBN(0x47989fe4, 0x5289b2b8), TOBN(0x745f8484, 0x58d6fc73), TOBN(0xec362a6f, 0xf61c70ab)}, {TOBN(0x070c98a7, 0xb3a8ad41), TOBN(0x73a20fc0, 0x7b63db51), TOBN(0xed2c2173, 0xf44c35f4), TOBN(0x8a56149d, 0x9acc9dca)}}, {{TOBN(0x98f17881, 0x9ac6e0f4), TOBN(0x360fdeaf, 0xa413b5ed), TOBN(0x0625b8f4, 0xa300b0fd), TOBN(0xf1f4d76a, 0x5b3222d3)}, {TOBN(0x9d6f5109, 0x587f76b8), TOBN(0x8b4ee08d, 0x2317fdb5), TOBN(0x88089bb7, 0x8c68b095), TOBN(0x95570e9a, 0x5808d9b9)}}, {{TOBN(0xa395c36f, 0x35d33ae7), TOBN(0x200ea123, 0x50bb5a94), TOBN(0x20c789bd, 0x0bafe84b), TOBN(0x243ef52d, 0x0919276a)}, {TOBN(0x3934c577, 0xe23ae233), TOBN(0xb93807af, 0xa460d1ec), TOBN(0xb72a53b1, 0xf8fa76a4), TOBN(0xd8914cb0, 0xc3ca4491)}}, {{TOBN(0x2e128494, 0x3fb42622), TOBN(0x3b2700ac, 0x500907d5), 
TOBN(0xf370fb09, 0x1a95ec63), TOBN(0xf8f30be2, 0x31b6dfbd)}, {TOBN(0xf2b2f8d2, 0x69e55f15), TOBN(0x1fead851, 0xcc1323e9), TOBN(0xfa366010, 0xd9e5eef6), TOBN(0x64d487b0, 0xe316107e)}}, {{TOBN(0x4c076b86, 0xd23ddc82), TOBN(0x03fd344c, 0x7e0143f0), TOBN(0xa95362ff, 0x317af2c5), TOBN(0x0add3db7, 0xe18b7a4f)}, {TOBN(0x9c673e3f, 0x8260e01b), TOBN(0xfbeb49e5, 0x54a1cc91), TOBN(0x91351bf2, 0x92f2e433), TOBN(0xc755e7ec, 0x851141eb)}}, {{TOBN(0xc9a95139, 0x29607745), TOBN(0x0ca07420, 0xa26f2b28), TOBN(0xcb2790e7, 0x4bc6f9dd), TOBN(0x345bbb58, 0xadcaffc0)}, {TOBN(0xc65ea38c, 0xbe0f27a2), TOBN(0x67c24d7c, 0x641fcb56), TOBN(0x2c25f0a7, 0xa9e2c757), TOBN(0x93f5cdb0, 0x16f16c49)}}, {{TOBN(0x2ca5a9d7, 0xc5ee30a1), TOBN(0xd1593635, 0xb909b729), TOBN(0x804ce9f3, 0xdadeff48), TOBN(0xec464751, 0xb07c30c3)}, {TOBN(0x89d65ff3, 0x9e49af6a), TOBN(0xf2d6238a, 0x6f3d01bc), TOBN(0x1095561e, 0x0bced843), TOBN(0x51789e12, 0xc8a13fd8)}}, {{TOBN(0xd633f929, 0x763231df), TOBN(0x46df9f7d, 0xe7cbddef), TOBN(0x01c889c0, 0xcb265da8), TOBN(0xfce1ad10, 0xaf4336d2)}, {TOBN(0x8d110df6, 0xfc6a0a7e), TOBN(0xdd431b98, 0x6da425dc), TOBN(0xcdc4aeab, 0x1834aabe), TOBN(0x84deb124, 0x8439b7fc)}}, {{TOBN(0x8796f169, 0x3c2a5998), TOBN(0x9b9247b4, 0x7947190d), TOBN(0x55b9d9a5, 0x11597014), TOBN(0x7e9dd70d, 0x7b1566ee)}, {TOBN(0x94ad78f7, 0xcbcd5e64), TOBN(0x0359ac17, 0x9bd4c032), TOBN(0x3b11baaf, 0x7cc222ae), TOBN(0xa6a6e284, 0xba78e812)}}, {{TOBN(0x8392053f, 0x24cea1a0), TOBN(0xc97bce4a, 0x33621491), TOBN(0x7eb1db34, 0x35399ee9), TOBN(0x473f78ef, 0xece81ad1)}, {TOBN(0x41d72fe0, 0xf63d3d0d), TOBN(0xe620b880, 0xafab62fc), TOBN(0x92096bc9, 0x93158383), TOBN(0x41a21357, 0x8f896f6c)}}, {{TOBN(0x1b5ee2fa, 0xc7dcfcab), TOBN(0x650acfde, 0x9546e007), TOBN(0xc081b749, 0xb1b02e07), TOBN(0xda9e41a0, 0xf9eca03d)}, {TOBN(0x013ba727, 0x175a54ab), TOBN(0xca0cd190, 0xea5d8d10), TOBN(0x85ea52c0, 0x95fd96a9), TOBN(0x2c591b9f, 0xbc5c3940)}}, {{TOBN(0x6fb4d4e4, 0x2bad4d5f), TOBN(0xfa4c3590, 0xfef0059b), TOBN(0x6a10218a, 0xf5122294), TOBN(0x9a78a81a, 0xa85751d1)}, {TOBN(0x04f20579, 0xa98e84e7), TOBN(0xfe1242c0, 0x4997e5b5), TOBN(0xe77a273b, 0xca21e1e4), TOBN(0xfcc8b1ef, 0x9411939d)}}, {{TOBN(0xe20ea302, 0x92d0487a), TOBN(0x1442dbec, 0x294b91fe), TOBN(0x1f7a4afe, 0xbb6b0e8f), TOBN(0x1700ef74, 0x6889c318)}, {TOBN(0xf5bbffc3, 0x70f1fc62), TOBN(0x3b31d4b6, 0x69c79cca), TOBN(0xe8bc2aab, 0xa7f6340d), TOBN(0xb0b08ab4, 0xa725e10a)}}, {{TOBN(0x44f05701, 0xae340050), TOBN(0xba4b3016, 0x1cf0c569), TOBN(0x5aa29f83, 0xfbe19a51), TOBN(0x1b9ed428, 0xb71d752e)}, {TOBN(0x1666e54e, 0xeb4819f5), TOBN(0x616cdfed, 0x9e18b75b), TOBN(0x112ed5be, 0x3ee27b0b), TOBN(0xfbf28319, 0x44c7de4d)}}, {{TOBN(0xd685ec85, 0xe0e60d84), TOBN(0x68037e30, 0x1db7ee78), TOBN(0x5b65bdcd, 0x003c4d6e), TOBN(0x33e7363a, 0x93e29a6a)}, {TOBN(0x995b3a61, 0x08d0756c), TOBN(0xd727f85c, 0x2faf134b), TOBN(0xfac6edf7, 0x1d337823), TOBN(0x99b9aa50, 0x0439b8b4)}}, {{TOBN(0x722eb104, 0xe2b4e075), TOBN(0x49987295, 0x437c4926), TOBN(0xb1e4c0e4, 0x46a9b82d), TOBN(0xd0cb3197, 0x57a006f5)}, {TOBN(0xf3de0f7d, 0xd7808c56), TOBN(0xb5c54d8f, 0x51f89772), TOBN(0x500a114a, 0xadbd31aa), TOBN(0x9afaaaa6, 0x295f6cab)}}, {{TOBN(0x94705e21, 0x04cf667a), TOBN(0xfc2a811b, 0x9d3935d7), TOBN(0x560b0280, 0x6d09267c), TOBN(0xf19ed119, 0xf780e53b)}, {TOBN(0xf0227c09, 0x067b6269), TOBN(0x967b8533, 0x5caef599), TOBN(0x155b9243, 0x68efeebc), TOBN(0xcd6d34f5, 0xc497bae6)}}, {{TOBN(0x1dd8d5d3, 0x6cceb370), TOBN(0x2aeac579, 0xa78d7bf9), TOBN(0x5d65017d, 0x70b67a62), TOBN(0x70c8e44f, 0x17c53f67)}, {TOBN(0xd1fc0950, 0x86a34d09), TOBN(0xe0fca256, 
0xe7134907), TOBN(0xe24fa29c, 0x80fdd315), TOBN(0x2c4acd03, 0xd87499ad)}}, {{TOBN(0xbaaf7517, 0x3b5a9ba6), TOBN(0xb9cbe1f6, 0x12e51a51), TOBN(0xd88edae3, 0x5e154897), TOBN(0xe4309c3c, 0x77b66ca0)}, {TOBN(0xf5555805, 0xf67f3746), TOBN(0x85fc37ba, 0xa36401ff), TOBN(0xdf86e2ca, 0xd9499a53), TOBN(0x6270b2a3, 0xecbc955b)}}, {{TOBN(0xafae64f5, 0x974ad33b), TOBN(0x04d85977, 0xfe7b2df1), TOBN(0x2a3db3ff, 0x4ab03f73), TOBN(0x0b87878a, 0x8702740a)}, {TOBN(0x6d263f01, 0x5a061732), TOBN(0xc25430ce, 0xa32a1901), TOBN(0xf7ebab3d, 0xdb155018), TOBN(0x3a86f693, 0x63a9b78e)}}, {{TOBN(0x349ae368, 0xda9f3804), TOBN(0x470f07fe, 0xa164349c), TOBN(0xd52f4cc9, 0x8562baa5), TOBN(0xc74a9e86, 0x2b290df3)}, {TOBN(0xd3a1aa35, 0x43471a24), TOBN(0x239446be, 0xb8194511), TOBN(0xbec2dd00, 0x81dcd44d), TOBN(0xca3d7f0f, 0xc42ac82d)}}, {{TOBN(0x1f3db085, 0xfdaf4520), TOBN(0xbb6d3e80, 0x4549daf2), TOBN(0xf5969d8a, 0x19ad5c42), TOBN(0x7052b13d, 0xdbfd1511)}, {TOBN(0x11890d1b, 0x682b9060), TOBN(0xa71d3883, 0xac34452c), TOBN(0xa438055b, 0x783805b4), TOBN(0x43241277, 0x4725b23e)}}, {{TOBN(0xf20cf96e, 0x4901bbed), TOBN(0x6419c710, 0xf432a2bb), TOBN(0x57a0fbb9, 0xdfa9cd7d), TOBN(0x589111e4, 0x00daa249)}, {TOBN(0x19809a33, 0x7b60554e), TOBN(0xea5f8887, 0xede283a4), TOBN(0x2d713802, 0x503bfd35), TOBN(0x151bb0af, 0x585d2a53)}}, {{TOBN(0x40b08f74, 0x43b30ca8), TOBN(0xe10b5bba, 0xd9934583), TOBN(0xe8a546d6, 0xb51110ad), TOBN(0x1dd50e66, 0x28e0b6c5)}, {TOBN(0x292e9d54, 0xcff2b821), TOBN(0x3882555d, 0x47281760), TOBN(0x134838f8, 0x3724d6e3), TOBN(0xf2c679e0, 0x22ddcda1)}}, {{TOBN(0x40ee8815, 0x6d2a5768), TOBN(0x7f227bd2, 0x1c1e7e2d), TOBN(0x487ba134, 0xd04ff443), TOBN(0x76e2ff3d, 0xc614e54b)}, {TOBN(0x36b88d6f, 0xa3177ec7), TOBN(0xbf731d51, 0x2328fff5), TOBN(0x758caea2, 0x49ba158e), TOBN(0x5ab8ff4c, 0x02938188)}}, {{TOBN(0x33e16056, 0x35edc56d), TOBN(0x5a69d349, 0x7e940d79), TOBN(0x6c4fd001, 0x03866dcb), TOBN(0x20a38f57, 0x4893cdef)}, {TOBN(0xfbf3e790, 0xfac3a15b), TOBN(0x6ed7ea2e, 0x7a4f8e6b), TOBN(0xa663eb4f, 0xbc3aca86), TOBN(0x22061ea5, 0x080d53f7)}}, {{TOBN(0x2480dfe6, 0xf546783f), TOBN(0xd38bc6da, 0x5a0a641e), TOBN(0xfb093cd1, 0x2ede8965), TOBN(0x89654db4, 0xacb455cf)}, {TOBN(0x413cbf9a, 0x26e1adee), TOBN(0x291f3764, 0x373294d4), TOBN(0x00797257, 0x648083fe), TOBN(0x25f504d3, 0x208cc341)}}, {{TOBN(0x635a8e5e, 0xc3a0ee43), TOBN(0x70aaebca, 0x679898ff), TOBN(0x9ee9f547, 0x5dc63d56), TOBN(0xce987966, 0xffb34d00)}, {TOBN(0xf9f86b19, 0x5e26310a), TOBN(0x9e435484, 0x382a8ca8), TOBN(0x253bcb81, 0xc2352fe4), TOBN(0xa4eac8b0, 0x4474b571)}}, {{TOBN(0xc1b97512, 0xc1ad8cf8), TOBN(0x193b4e9e, 0x99e0b697), TOBN(0x939d2716, 0x01e85df0), TOBN(0x4fb265b3, 0xcd44eafd)}, {TOBN(0x321e7dcd, 0xe51e1ae2), TOBN(0x8e3a8ca6, 0xe3d8b096), TOBN(0x8de46cb0, 0x52604998), TOBN(0x91099ad8, 0x39072aa7)}}, {{TOBN(0x2617f91c, 0x93aa96b8), TOBN(0x0fc8716b, 0x7fca2e13), TOBN(0xa7106f5e, 0x95328723), TOBN(0xd1c9c40b, 0x262e6522)}, {TOBN(0xb9bafe86, 0x42b7c094), TOBN(0x1873439d, 0x1543c021), TOBN(0xe1baa5de, 0x5cbefd5d), TOBN(0xa363fc5e, 0x521e8aff)}}, {{TOBN(0xefe6320d, 0xf862eaac), TOBN(0x14419c63, 0x22c647dc), TOBN(0x0e06707c, 0x4e46d428), TOBN(0xcb6c834f, 0x4a178f8f)}, {TOBN(0x0f993a45, 0xd30f917c), TOBN(0xd4c4b049, 0x9879afee), TOBN(0xb6142a1e, 0x70500063), TOBN(0x7c9b41c3, 0xa5d9d605)}}, {{TOBN(0xbc00fc2f, 0x2f8ba2c7), TOBN(0x0966eb2f, 0x7c67aa28), TOBN(0x13f7b516, 0x5a786972), TOBN(0x3bfb7557, 0x8a2fbba0)}, {TOBN(0x131c4f23, 0x5a2b9620), TOBN(0xbff3ed27, 0x6faf46be), TOBN(0x9b4473d1, 0x7e172323), TOBN(0x421e8878, 0x339f6246)}}, {{TOBN(0x0fa8587a, 0x25a41632), 
TOBN(0xc0814124, 0xa35b6c93), TOBN(0x2b18a9f5, 0x59ebb8db), TOBN(0x264e3357, 0x76edb29c)}, {TOBN(0xaf245ccd, 0xc87c51e2), TOBN(0x16b3015b, 0x501e6214), TOBN(0xbb31c560, 0x0a3882ce), TOBN(0x6961bb94, 0xfec11e04)}}, {{TOBN(0x3b825b8d, 0xeff7a3a0), TOBN(0xbec33738, 0xb1df7326), TOBN(0x68ad747c, 0x99604a1f), TOBN(0xd154c934, 0x9a3bd499)}, {TOBN(0xac33506f, 0x1cc7a906), TOBN(0x73bb5392, 0x6c560e8f), TOBN(0x6428fcbe, 0x263e3944), TOBN(0xc11828d5, 0x1c387434)}}, {{TOBN(0x3cd04be1, 0x3e4b12ff), TOBN(0xc3aad9f9, 0x2d88667c), TOBN(0xc52ddcf8, 0x248120cf), TOBN(0x985a892e, 0x2a389532)}, {TOBN(0xfbb4b21b, 0x3bb85fa0), TOBN(0xf95375e0, 0x8dfc6269), TOBN(0xfb4fb06c, 0x7ee2acea), TOBN(0x6785426e, 0x309c4d1f)}}, {{TOBN(0x659b17c8, 0xd8ceb147), TOBN(0x9b649eee, 0xb70a5554), TOBN(0x6b7fa0b5, 0xac6bc634), TOBN(0xd99fe2c7, 0x1d6e732f)}, {TOBN(0x30e6e762, 0x8d3abba2), TOBN(0x18fee6e7, 0xa797b799), TOBN(0x5c9d360d, 0xc696464d), TOBN(0xe3baeb48, 0x27bfde12)}}, {{TOBN(0x2bf5db47, 0xf23206d5), TOBN(0x2f6d3420, 0x1d260152), TOBN(0x17b87653, 0x3f8ff89a), TOBN(0x5157c30c, 0x378fa458)}, {TOBN(0x7517c5c5, 0x2d4fb936), TOBN(0xef22f7ac, 0xe6518cdc), TOBN(0xdeb483e6, 0xbf847a64), TOBN(0xf5084558, 0x92e0fa89)}}}, {{{TOBN(0xab9659d8, 0xdf7304d4), TOBN(0xb71bcf1b, 0xff210e8e), TOBN(0xa9a2438b, 0xd73fbd60), TOBN(0x4595cd1f, 0x5d11b4de)}, {TOBN(0x9c0d329a, 0x4835859d), TOBN(0x4a0f0d2d, 0x7dbb6e56), TOBN(0xc6038e5e, 0xdf928a4e), TOBN(0xc9429621, 0x8f5ad154)}}, {{TOBN(0x91213462, 0xf23f2d92), TOBN(0x6cab71bd, 0x60b94078), TOBN(0x6bdd0a63, 0x176cde20), TOBN(0x54c9b20c, 0xee4d54bc)}, {TOBN(0x3cd2d8aa, 0x9f2ac02f), TOBN(0x03f8e617, 0x206eedb0), TOBN(0xc7f68e16, 0x93086434), TOBN(0x831469c5, 0x92dd3db9)}}, {{TOBN(0x8521df24, 0x8f981354), TOBN(0x587e23ec, 0x3588a259), TOBN(0xcbedf281, 0xd7a0992c), TOBN(0x06930a55, 0x38961407)}, {TOBN(0x09320deb, 0xbe5bbe21), TOBN(0xa7ffa5b5, 0x2491817f), TOBN(0xe6c8b4d9, 0x09065160), TOBN(0xac4f3992, 0xfff6d2a9)}}, {{TOBN(0x7aa7a158, 0x3ae9c1bd), TOBN(0xe0af6d98, 0xe37ce240), TOBN(0xe54342d9, 0x28ab38b4), TOBN(0xe8b75007, 0x0a1c98ca)}, {TOBN(0xefce86af, 0xe02358f2), TOBN(0x31b8b856, 0xea921228), TOBN(0x052a1912, 0x0a1c67fc), TOBN(0xb4069ea4, 0xe3aead59)}}, {{TOBN(0x3232d6e2, 0x7fa03cb3), TOBN(0xdb938e5b, 0x0fdd7d88), TOBN(0x04c1d2cd, 0x2ccbfc5d), TOBN(0xd2f45c12, 0xaf3a580f)}, {TOBN(0x592620b5, 0x7883e614), TOBN(0x5fd27e68, 0xbe7c5f26), TOBN(0x139e45a9, 0x1567e1e3), TOBN(0x2cc71d2d, 0x44d8aaaf)}}, {{TOBN(0x4a9090cd, 0xe36d0757), TOBN(0xf722d7b1, 0xd9a29382), TOBN(0xfb7fb04c, 0x04b48ddf), TOBN(0x628ad2a7, 0xebe16f43)}, {TOBN(0xcd3fbfb5, 0x20226040), TOBN(0x6c34ecb1, 0x5104b6c4), TOBN(0x30c0754e, 0xc903c188), TOBN(0xec336b08, 0x2d23cab0)}}, {{TOBN(0x473d62a2, 0x1e206ee5), TOBN(0xf1e27480, 0x8c49a633), TOBN(0x87ab956c, 0xe9f6b2c3), TOBN(0x61830b48, 0x62b606ea)}, {TOBN(0x67cd6846, 0xe78e815f), TOBN(0xfe40139f, 0x4c02082a), TOBN(0x52bbbfcb, 0x952ec365), TOBN(0x74c11642, 0x6b9836ab)}}, {{TOBN(0x9f51439e, 0x558df019), TOBN(0x230da4ba, 0xac712b27), TOBN(0x518919e3, 0x55185a24), TOBN(0x4dcefcdd, 0x84b78f50)}, {TOBN(0xa7d90fb2, 0xa47d4c5a), TOBN(0x55ac9abf, 0xb30e009e), TOBN(0xfd2fc359, 0x74eed273), TOBN(0xb72d824c, 0xdbea8faf)}}, {{TOBN(0xce721a74, 0x4513e2ca), TOBN(0x0b418612, 0x38240b2c), TOBN(0x05199968, 0xd5baa450), TOBN(0xeb1757ed, 0x2b0e8c25)}, {TOBN(0x6ebc3e28, 0x3dfac6d5), TOBN(0xb2431e2e, 0x48a237f5), TOBN(0x2acb5e23, 0x52f61499), TOBN(0x5558a2a7, 0xe06c936b)}}, {{TOBN(0xd213f923, 0xcbb13d1b), TOBN(0x98799f42, 0x5bfb9bfe), TOBN(0x1ae8ddc9, 0x701144a9), TOBN(0x0b8b3bb6, 0x4c5595ee)}, {TOBN(0x0ea9ef2e, 
0x3ecebb21), TOBN(0x17cb6c4b, 0x3671f9a7), TOBN(0x47ef464f, 0x726f1d1f), TOBN(0x171b9484, 0x6943a276)}}, {{TOBN(0x51a4ae2d, 0x7ef0329c), TOBN(0x08509222, 0x91c4402a), TOBN(0x64a61d35, 0xafd45bbc), TOBN(0x38f096fe, 0x3035a851)}, {TOBN(0xc7468b74, 0xa1dec027), TOBN(0xe8cf10e7, 0x4fc7dcba), TOBN(0xea35ff40, 0xf4a06353), TOBN(0x0b4c0dfa, 0x8b77dd66)}}, {{TOBN(0x779b8552, 0xde7e5c19), TOBN(0xfab28609, 0xc1c0256c), TOBN(0x64f58eee, 0xabd4743d), TOBN(0x4e8ef838, 0x7b6cc93b)}, {TOBN(0xee650d26, 0x4cb1bf3d), TOBN(0x4c1f9d09, 0x73dedf61), TOBN(0xaef7c9d7, 0xbfb70ced), TOBN(0x1ec0507e, 0x1641de1e)}}, {{TOBN(0xcd7e5cc7, 0xcde45079), TOBN(0xde173c9a, 0x516ac9e4), TOBN(0x517a8494, 0xc170315c), TOBN(0x438fd905, 0x91d8e8fb)}, {TOBN(0x5145c506, 0xc7d9630b), TOBN(0x6457a87b, 0xf47d4d75), TOBN(0xd31646bf, 0x0d9a80e8), TOBN(0x453add2b, 0xcef3aabe)}}, {{TOBN(0xc9941109, 0xa607419d), TOBN(0xfaa71e62, 0xbb6bca80), TOBN(0x34158c13, 0x07c431f3), TOBN(0x594abebc, 0x992bc47a)}, {TOBN(0x6dfea691, 0xeb78399f), TOBN(0x48aafb35, 0x3f42cba4), TOBN(0xedcd65af, 0x077c04f0), TOBN(0x1a29a366, 0xe884491a)}}, {{TOBN(0x023a40e5, 0x1c21f2bf), TOBN(0xf99a513c, 0xa5057aee), TOBN(0xa3fe7e25, 0xbcab072e), TOBN(0x8568d2e1, 0x40e32bcf)}, {TOBN(0x904594eb, 0xd3f69d9f), TOBN(0x181a9733, 0x07affab1), TOBN(0xe4d68d76, 0xb6e330f4), TOBN(0x87a6dafb, 0xc75a7fc1)}}, {{TOBN(0x549db2b5, 0xef7d9289), TOBN(0x2480d4a8, 0x197f015a), TOBN(0x61d5590b, 0xc40493b6), TOBN(0x3a55b52e, 0x6f780331)}, {TOBN(0x40eb8115, 0x309eadb0), TOBN(0xdea7de5a, 0x92e5c625), TOBN(0x64d631f0, 0xcc6a3d5a), TOBN(0x9d5e9d7c, 0x93e8dd61)}}, {{TOBN(0xf297bef5, 0x206d3ffc), TOBN(0x23d5e033, 0x7d808bd4), TOBN(0x4a4f6912, 0xd24cf5ba), TOBN(0xe4d8163b, 0x09cdaa8a)}, {TOBN(0x0e0de9ef, 0xd3082e8e), TOBN(0x4fe1246c, 0x0192f360), TOBN(0x1f900150, 0x4b8eee0a), TOBN(0x5219da81, 0xf1da391b)}}, {{TOBN(0x7bf6a5c1, 0xf7ea25aa), TOBN(0xd165e6bf, 0xfbb07d5f), TOBN(0xe3539361, 0x89e78671), TOBN(0xa3fcac89, 0x2bac4219)}, {TOBN(0xdfab6fd4, 0xf0baa8ab), TOBN(0x5a4adac1, 0xe2c1c2e5), TOBN(0x6cd75e31, 0x40d85849), TOBN(0xce263fea, 0x19b39181)}}, {{TOBN(0xcb6803d3, 0x07032c72), TOBN(0x7f40d5ce, 0x790968c8), TOBN(0xa6de86bd, 0xdce978f0), TOBN(0x25547c4f, 0x368f751c)}, {TOBN(0xb1e685fd, 0x65fb2a9e), TOBN(0xce69336f, 0x1eb9179c), TOBN(0xb15d1c27, 0x12504442), TOBN(0xb7df465c, 0xb911a06b)}}, {{TOBN(0xb8d804a3, 0x315980cd), TOBN(0x693bc492, 0xfa3bebf7), TOBN(0x3578aeee, 0x2253c504), TOBN(0x158de498, 0xcd2474a2)}, {TOBN(0x1331f5c7, 0xcfda8368), TOBN(0xd2d7bbb3, 0x78d7177e), TOBN(0xdf61133a, 0xf3c1e46e), TOBN(0x5836ce7d, 0xd30e7be8)}}, {{TOBN(0x83084f19, 0x94f834cb), TOBN(0xd35653d4, 0x429ed782), TOBN(0xa542f16f, 0x59e58243), TOBN(0xc2b52f65, 0x0470a22d)}, {TOBN(0xe3b6221b, 0x18f23d96), TOBN(0xcb05abac, 0x3f5252b4), TOBN(0xca00938b, 0x87d61402), TOBN(0x2f186cdd, 0x411933e4)}}, {{TOBN(0xe042ece5, 0x9a29a5c5), TOBN(0xb19b3c07, 0x3b6c8402), TOBN(0xc97667c7, 0x19d92684), TOBN(0xb5624622, 0xebc66372)}, {TOBN(0x0cb96e65, 0x3c04fa02), TOBN(0x83a7176c, 0x8eaa39aa), TOBN(0x2033561d, 0xeaa1633f), TOBN(0x45a9d086, 0x4533df73)}}, {{TOBN(0xe0542c1d, 0x3dc090bc), TOBN(0x82c996ef, 0xaa59c167), TOBN(0xe3f735e8, 0x0ee7fc4d), TOBN(0x7b179393, 0x7c35db79)}, {TOBN(0xb6419e25, 0xf8c5dbfd), TOBN(0x4d9d7a1e, 0x1f327b04), TOBN(0x979f6f9b, 0x298dfca8), TOBN(0xc7c5dff1, 0x8de9366a)}}, {{TOBN(0x1b7a588d, 0x04c82bdd), TOBN(0x68005534, 0xf8319dfd), TOBN(0xde8a55b5, 0xd8eb9580), TOBN(0x5ea886da, 0x8d5bca81)}, {TOBN(0xe8530a01, 0x252a0b4d), TOBN(0x1bffb4fe, 0x35eaa0a1), TOBN(0x2ad828b1, 0xd8e99563), TOBN(0x7de96ef5, 0x95f9cd87)}}, 
{{TOBN(0x4abb2d0c, 0xd77d970c), TOBN(0x03cfb933, 0xd33ef9cb), TOBN(0xb0547c01, 0x8b211fe9), TOBN(0x2fe64809, 0xa56ed1c6)}, {TOBN(0xcb7d5624, 0xc2ac98cc), TOBN(0x2a1372c0, 0x1a393e33), TOBN(0xc8d1ec1c, 0x29660521), TOBN(0xf3d31b04, 0xb37ac3e9)}}, {{TOBN(0xa29ae9df, 0x5ece6e7c), TOBN(0x0603ac8f, 0x0facfb55), TOBN(0xcfe85b7a, 0xdda233a5), TOBN(0xe618919f, 0xbd75f0b8)}, {TOBN(0xf555a3d2, 0x99bf1603), TOBN(0x1f43afc9, 0xf184255a), TOBN(0xdcdaf341, 0x319a3e02), TOBN(0xd3b117ef, 0x03903a39)}}, {{TOBN(0xe095da13, 0x65d1d131), TOBN(0x86f16367, 0xc37ad03e), TOBN(0x5f37389e, 0x462cd8dd), TOBN(0xc103fa04, 0xd67a60e6)}, {TOBN(0x57c34344, 0xf4b478f0), TOBN(0xce91edd8, 0xe117c98d), TOBN(0x001777b0, 0x231fc12e), TOBN(0x11ae47f2, 0xb207bccb)}}, {{TOBN(0xd983cf8d, 0x20f8a242), TOBN(0x7aff5b1d, 0xf22e1ad8), TOBN(0x68fd11d0, 0x7fc4feb3), TOBN(0x5d53ae90, 0xb0f1c3e1)}, {TOBN(0x50fb7905, 0xec041803), TOBN(0x85e3c977, 0x14404888), TOBN(0x0e67faed, 0xac628d8f), TOBN(0x2e865150, 0x6668532c)}}, {{TOBN(0x15acaaa4, 0x6a67a6b0), TOBN(0xf4cdee25, 0xb25cec41), TOBN(0x49ee565a, 0xe4c6701e), TOBN(0x2a04ca66, 0xfc7d63d8)}, {TOBN(0xeb105018, 0xef0543fb), TOBN(0xf709a4f5, 0xd1b0d81d), TOBN(0x5b906ee6, 0x2915d333), TOBN(0xf4a87412, 0x96f1f0ab)}}, {{TOBN(0xb6b82fa7, 0x4d82f4c2), TOBN(0x90725a60, 0x6804efb3), TOBN(0xbc82ec46, 0xadc3425e), TOBN(0xb7b80581, 0x2787843e)}, {TOBN(0xdf46d91c, 0xdd1fc74c), TOBN(0xdc1c62cb, 0xe783a6c4), TOBN(0x59d1b9f3, 0x1a04cbba), TOBN(0xd87f6f72, 0x95e40764)}}, {{TOBN(0x02b4cfc1, 0x317f4a76), TOBN(0x8d2703eb, 0x91036bce), TOBN(0x98206cc6, 0xa5e72a56), TOBN(0x57be9ed1, 0xcf53fb0f)}, {TOBN(0x09374571, 0xef0b17ac), TOBN(0x74b2655e, 0xd9181b38), TOBN(0xc8f80ea8, 0x89935d0e), TOBN(0xc0d9e942, 0x91529936)}}, {{TOBN(0x19686041, 0x1e84e0e5), TOBN(0xa5db84d3, 0xaea34c93), TOBN(0xf9d5bb19, 0x7073a732), TOBN(0xb8d2fe56, 0x6bcfd7c0)}, {TOBN(0x45775f36, 0xf3eb82fa), TOBN(0x8cb20ccc, 0xfdff8b58), TOBN(0x1659b65f, 0x8374c110), TOBN(0xb8b4a422, 0x330c789a)}}, {{TOBN(0x75e3c3ea, 0x6fe8208b), TOBN(0xbd74b9e4, 0x286e78fe), TOBN(0x0be2e81b, 0xd7d93a1a), TOBN(0x7ed06e27, 0xdd0a5aae)}, {TOBN(0x721f5a58, 0x6be8b800), TOBN(0x428299d1, 0xd846db28), TOBN(0x95cb8e6b, 0x5be88ed3), TOBN(0xc3186b23, 0x1c034e11)}}, {{TOBN(0xa6312c9e, 0x8977d99b), TOBN(0xbe944331, 0x83f531e7), TOBN(0x8232c0c2, 0x18d3b1d4), TOBN(0x617aae8b, 0xe1247b73)}, {TOBN(0x40153fc4, 0x282aec3b), TOBN(0xc6063d2f, 0xf7b8f823), TOBN(0x68f10e58, 0x3304f94c), TOBN(0x31efae74, 0xee676346)}}, {{TOBN(0xbadb6c6d, 0x40a9b97c), TOBN(0x14702c63, 0x4f666256), TOBN(0xdeb954f1, 0x5184b2e3), TOBN(0x5184a526, 0x94b6ca40)}, {TOBN(0xfff05337, 0x003c32ea), TOBN(0x5aa374dd, 0x205974c7), TOBN(0x9a763854, 0x4b0dd71a), TOBN(0x459cd27f, 0xdeb947ec)}}, {{TOBN(0xa6e28161, 0x459c2b92), TOBN(0x2f020fa8, 0x75ee8ef5), TOBN(0xb132ec2d, 0x30b06310), TOBN(0xc3e15899, 0xbc6a4530)}, {TOBN(0xdc5f53fe, 0xaa3f451a), TOBN(0x3a3c7f23, 0xc2d9acac), TOBN(0x2ec2f892, 0x6b27e58b), TOBN(0x68466ee7, 0xd742799f)}}, {{TOBN(0x98324dd4, 0x1fa26613), TOBN(0xa2dc6dab, 0xbdc29d63), TOBN(0xf9675faa, 0xd712d657), TOBN(0x813994be, 0x21fd8d15)}, {TOBN(0x5ccbb722, 0xfd4f7553), TOBN(0x5135ff8b, 0xf3a36b20), TOBN(0x44be28af, 0x69559df5), TOBN(0x40b65bed, 0x9d41bf30)}}, {{TOBN(0xd98bf2a4, 0x3734e520), TOBN(0x5e3abbe3, 0x209bdcba), TOBN(0x77c76553, 0xbc945b35), TOBN(0x5331c093, 0xc6ef14aa)}, {TOBN(0x518ffe29, 0x76b60c80), TOBN(0x2285593b, 0x7ace16f8), TOBN(0xab1f64cc, 0xbe2b9784), TOBN(0xe8f2c0d9, 0xab2421b6)}}, {{TOBN(0x617d7174, 0xc1df065c), TOBN(0xafeeb5ab, 0x5f6578fa), TOBN(0x16ff1329, 0x263b54a8), TOBN(0x45c55808, 
0xc990dce3)}, {TOBN(0x42eab6c0, 0xecc8c177), TOBN(0x799ea9b5, 0x5982ecaa), TOBN(0xf65da244, 0xb607ef8e), TOBN(0x8ab226ce, 0x32a3fc2c)}}, {{TOBN(0x745741e5, 0x7ea973dc), TOBN(0x5c00ca70, 0x20888f2e), TOBN(0x7cdce3cf, 0x45fd9cf1), TOBN(0x8a741ef1, 0x5507f872)}, {TOBN(0x47c51c2f, 0x196b4cec), TOBN(0x70d08e43, 0xc97ea618), TOBN(0x930da15c, 0x15b18a2b), TOBN(0x33b6c678, 0x2f610514)}}, {{TOBN(0xc662e4f8, 0x07ac9794), TOBN(0x1eccf050, 0xba06cb79), TOBN(0x1ff08623, 0xe7d954e5), TOBN(0x6ef2c5fb, 0x24cf71c3)}, {TOBN(0xb2c063d2, 0x67978453), TOBN(0xa0cf3796, 0x1d654af8), TOBN(0x7cb242ea, 0x7ebdaa37), TOBN(0x206e0b10, 0xb86747e0)}}, {{TOBN(0x481dae5f, 0xd5ecfefc), TOBN(0x07084fd8, 0xc2bff8fc), TOBN(0x8040a01a, 0xea324596), TOBN(0x4c646980, 0xd4de4036)}, {TOBN(0x9eb8ab4e, 0xd65abfc3), TOBN(0xe01cb91f, 0x13541ec7), TOBN(0x8f029adb, 0xfd695012), TOBN(0x9ae28483, 0x3c7569ec)}}, {{TOBN(0xa5614c9e, 0xa66d80a1), TOBN(0x680a3e44, 0x75f5f911), TOBN(0x0c07b14d, 0xceba4fc1), TOBN(0x891c285b, 0xa13071c1)}, {TOBN(0xcac67ceb, 0x799ece3c), TOBN(0x29b910a9, 0x41e07e27), TOBN(0x66bdb409, 0xf2e43123), TOBN(0x06f8b137, 0x7ac9ecbe)}}, {{TOBN(0x5981fafd, 0x38547090), TOBN(0x19ab8b9f, 0x85e3415d), TOBN(0xfc28c194, 0xc7e31b27), TOBN(0x843be0aa, 0x6fbcbb42)}, {TOBN(0xf3b1ed43, 0xa6db836c), TOBN(0x2a1330e4, 0x01a45c05), TOBN(0x4f19f3c5, 0x95c1a377), TOBN(0xa85f39d0, 0x44b5ee33)}}, {{TOBN(0x3da18e6d, 0x4ae52834), TOBN(0x5a403b39, 0x7423dcb0), TOBN(0xbb555e0a, 0xf2374aef), TOBN(0x2ad599c4, 0x1e8ca111)}, {TOBN(0x1b3a2fb9, 0x014b3bf8), TOBN(0x73092684, 0xf66d5007), TOBN(0x079f1426, 0xc4340102), TOBN(0x1827cf81, 0x8fddf4de)}}, {{TOBN(0xc83605f6, 0xf10ff927), TOBN(0xd3871451, 0x23739fc6), TOBN(0x6d163450, 0xcac1c2cc), TOBN(0x6b521296, 0xa2ec1ac5)}, {TOBN(0x0606c4f9, 0x6e3cb4a5), TOBN(0xe47d3f41, 0x778abff7), TOBN(0x425a8d5e, 0xbe8e3a45), TOBN(0x53ea9e97, 0xa6102160)}}, {{TOBN(0x477a106e, 0x39cbb688), TOBN(0x532401d2, 0xf3386d32), TOBN(0x8e564f64, 0xb1b9b421), TOBN(0xca9b8388, 0x81dad33f)}, {TOBN(0xb1422b4e, 0x2093913e), TOBN(0x533d2f92, 0x69bc8112), TOBN(0x3fa017be, 0xebe7b2c7), TOBN(0xb2767c4a, 0xcaf197c6)}}, {{TOBN(0xc925ff87, 0xaedbae9f), TOBN(0x7daf0eb9, 0x36880a54), TOBN(0x9284ddf5, 0x9c4d0e71), TOBN(0x1581cf93, 0x316f8cf5)}, {TOBN(0x3eeca887, 0x3ac1f452), TOBN(0xb417fce9, 0xfb6aeffe), TOBN(0xa5918046, 0xeefb8dc3), TOBN(0x73d318ac, 0x02209400)}}, {{TOBN(0xe800400f, 0x728693e5), TOBN(0xe87d814b, 0x339927ed), TOBN(0x93e94d3b, 0x57ea9910), TOBN(0xff8a35b6, 0x2245fb69)}, {TOBN(0x043853d7, 0x7f200d34), TOBN(0x470f1e68, 0x0f653ce1), TOBN(0x81ac05bd, 0x59a06379), TOBN(0xa14052c2, 0x03930c29)}}, {{TOBN(0x6b72fab5, 0x26bc2797), TOBN(0x13670d16, 0x99f16771), TOBN(0x00170052, 0x1e3e48d1), TOBN(0x978fe401, 0xb7adf678)}, {TOBN(0x55ecfb92, 0xd41c5dd4), TOBN(0x5ff8e247, 0xc7b27da5), TOBN(0xe7518272, 0x013fb606), TOBN(0x5768d7e5, 0x2f547a3c)}}, {{TOBN(0xbb24eaa3, 0x60017a5f), TOBN(0x6b18e6e4, 0x9c64ce9b), TOBN(0xc225c655, 0x103dde07), TOBN(0xfc3672ae, 0x7592f7ea)}, {TOBN(0x9606ad77, 0xd06283a1), TOBN(0x542fc650, 0xe4d59d99), TOBN(0xabb57c49, 0x2a40e7c2), TOBN(0xac948f13, 0xa8db9f55)}}, {{TOBN(0x6d4c9682, 0xb04465c3), TOBN(0xe3d062fa, 0x6468bd15), TOBN(0xa51729ac, 0x5f318d7e), TOBN(0x1fc87df6, 0x9eb6fc95)}, {TOBN(0x63d146a8, 0x0591f652), TOBN(0xa861b8f7, 0x589621aa), TOBN(0x59f5f15a, 0xce31348c), TOBN(0x8f663391, 0x440da6da)}}, {{TOBN(0xcfa778ac, 0xb591ffa3), TOBN(0x027ca9c5, 0x4cdfebce), TOBN(0xbe8e05a5, 0x444ea6b3), TOBN(0x8aab4e69, 0xa78d8254)}, {TOBN(0x2437f04f, 0xb474d6b8), TOBN(0x6597ffd4, 0x045b3855), TOBN(0xbb0aea4e, 0xca47ecaa), 
TOBN(0x568aae83, 0x85c7ebfc)}}, {{TOBN(0x0e966e64, 0xc73b2383), TOBN(0x49eb3447, 0xd17d8762), TOBN(0xde107821, 0x8da05dab), TOBN(0x443d8baa, 0x016b7236)}, {TOBN(0x163b63a5, 0xea7610d6), TOBN(0xe47e4185, 0xce1ca979), TOBN(0xae648b65, 0x80baa132), TOBN(0xebf53de2, 0x0e0d5b64)}}, {{TOBN(0x8d3bfcb4, 0xd3c8c1ca), TOBN(0x0d914ef3, 0x5d04b309), TOBN(0x55ef6415, 0x3de7d395), TOBN(0xbde1666f, 0x26b850e8)}, {TOBN(0xdbe1ca6e, 0xd449ab19), TOBN(0x8902b322, 0xe89a2672), TOBN(0xb1674b7e, 0xdacb7a53), TOBN(0x8e9faf6e, 0xf52523ff)}}, {{TOBN(0x6ba535da, 0x9a85788b), TOBN(0xd21f03ae, 0xbd0626d4), TOBN(0x099f8c47, 0xe873dc64), TOBN(0xcda8564d, 0x018ec97e)}, {TOBN(0x3e8d7a5c, 0xde92c68c), TOBN(0x78e035a1, 0x73323cc4), TOBN(0x3ef26275, 0xf880ff7c), TOBN(0xa4ee3dff, 0x273eedaa)}}, {{TOBN(0x58823507, 0xaf4e18f8), TOBN(0x967ec9b5, 0x0672f328), TOBN(0x9ded19d9, 0x559d3186), TOBN(0x5e2ab3de, 0x6cdce39c)}, {TOBN(0xabad6e4d, 0x11c226df), TOBN(0xf9783f43, 0x87723014), TOBN(0x9a49a0cf, 0x1a885719), TOBN(0xfc0c1a5a, 0x90da9dbf)}}, {{TOBN(0x8bbaec49, 0x571d92ac), TOBN(0x569e85fe, 0x4692517f), TOBN(0x8333b014, 0xa14ea4af), TOBN(0x32f2a62f, 0x12e5c5ad)}, {TOBN(0x98c2ce3a, 0x06d89b85), TOBN(0xb90741aa, 0x2ff77a08), TOBN(0x2530defc, 0x01f795a2), TOBN(0xd6e5ba0b, 0x84b3c199)}}, {{TOBN(0x7d8e8451, 0x12e4c936), TOBN(0xae419f7d, 0xbd0be17b), TOBN(0xa583fc8c, 0x22262bc9), TOBN(0x6b842ac7, 0x91bfe2bd)}, {TOBN(0x33cef4e9, 0x440d6827), TOBN(0x5f69f4de, 0xef81fb14), TOBN(0xf16cf6f6, 0x234fbb92), TOBN(0x76ae3fc3, 0xd9e7e158)}}, {{TOBN(0x4e89f6c2, 0xe9740b33), TOBN(0x677bc85d, 0x4962d6a1), TOBN(0x6c6d8a7f, 0x68d10d15), TOBN(0x5f9a7224, 0x0257b1cd)}, {TOBN(0x7096b916, 0x4ad85961), TOBN(0x5f8c47f7, 0xe657ab4a), TOBN(0xde57d7d0, 0xf7461d7e), TOBN(0x7eb6094d, 0x80ce5ee2)}}, {{TOBN(0x0b1e1dfd, 0x34190547), TOBN(0x8a394f43, 0xf05dd150), TOBN(0x0a9eb24d, 0x97df44e6), TOBN(0x78ca06bf, 0x87675719)}, {TOBN(0x6f0b3462, 0x6ffeec22), TOBN(0x9d91bcea, 0x36cdd8fb), TOBN(0xac83363c, 0xa105be47), TOBN(0x81ba76c1, 0x069710e3)}}, {{TOBN(0x3d1b24cb, 0x28c682c6), TOBN(0x27f25228, 0x8612575b), TOBN(0xb587c779, 0xe8e66e98), TOBN(0x7b0c03e9, 0x405eb1fe)}, {TOBN(0xfdf0d030, 0x15b548e7), TOBN(0xa8be76e0, 0x38b36af7), TOBN(0x4cdab04a, 0x4f310c40), TOBN(0x6287223e, 0xf47ecaec)}}, {{TOBN(0x678e6055, 0x8b399320), TOBN(0x61fe3fa6, 0xc01e4646), TOBN(0xc482866b, 0x03261a5e), TOBN(0xdfcf45b8, 0x5c2f244a)}, {TOBN(0x8fab9a51, 0x2f684b43), TOBN(0xf796c654, 0xc7220a66), TOBN(0x1d90707e, 0xf5afa58f), TOBN(0x2c421d97, 0x4fdbe0de)}}, {{TOBN(0xc4f4cda3, 0xaf2ebc2f), TOBN(0xa0af843d, 0xcb4efe24), TOBN(0x53b857c1, 0x9ccd10b1), TOBN(0xddc9d1eb, 0x914d3e04)}, {TOBN(0x7bdec8bb, 0x62771deb), TOBN(0x829277aa, 0x91c5aa81), TOBN(0x7af18dd6, 0x832391ae), TOBN(0x1740f316, 0xc71a84ca)}}}, {{{TOBN(0x8928e99a, 0xeeaf8c49), TOBN(0xee7aa73d, 0x6e24d728), TOBN(0x4c5007c2, 0xe72b156c), TOBN(0x5fcf57c5, 0xed408a1d)}, {TOBN(0x9f719e39, 0xb6057604), TOBN(0x7d343c01, 0xc2868bbf), TOBN(0x2cca254b, 0x7e103e2d), TOBN(0xe6eb38a9, 0xf131bea2)}}, {{TOBN(0xb33e624f, 0x8be762b4), TOBN(0x2a9ee4d1, 0x058e3413), TOBN(0x968e6369, 0x67d805fa), TOBN(0x9848949b, 0x7db8bfd7)}, {TOBN(0x5308d7e5, 0xd23a8417), TOBN(0x892f3b1d, 0xf3e29da5), TOBN(0xc95c139e, 0x3dee471f), TOBN(0x8631594d, 0xd757e089)}}, {{TOBN(0xe0c82a3c, 0xde918dcc), TOBN(0x2e7b5994, 0x26fdcf4b), TOBN(0x82c50249, 0x32cb1b2d), TOBN(0xea613a9d, 0x7657ae07)}, {TOBN(0xc2eb5f6c, 0xf1fdc9f7), TOBN(0xb6eae8b8, 0x879fe682), TOBN(0x253dfee0, 0x591cbc7f), TOBN(0x000da713, 0x3e1290e6)}}, {{TOBN(0x1083e2ea, 0x1f095615), TOBN(0x0a28ad77, 0x14e68c33), 
TOBN(0x6bfc0252, 0x3d8818be), TOBN(0xb585113a, 0xf35850cd)}, {TOBN(0x7d935f0b, 0x30df8aa1), TOBN(0xaddda07c, 0x4ab7e3ac), TOBN(0x92c34299, 0x552f00cb), TOBN(0xc33ed1de, 0x2909df6c)}}, {{TOBN(0x22c2195d, 0x80e87766), TOBN(0x9e99e6d8, 0x9ddf4ac0), TOBN(0x09642e4e, 0x65e74934), TOBN(0x2610ffa2, 0xff1ff241)}, {TOBN(0x4d1d47d4, 0x751c8159), TOBN(0x697b4985, 0xaf3a9363), TOBN(0x0318ca46, 0x87477c33), TOBN(0xa90cb565, 0x9441eff3)}}, {{TOBN(0x58bb3848, 0x36f024cb), TOBN(0x85be1f77, 0x36016168), TOBN(0x6c59587c, 0xdc7e07f1), TOBN(0x191be071, 0xaf1d8f02)}, {TOBN(0xbf169fa5, 0xcca5e55c), TOBN(0x3864ba3c, 0xf7d04eac), TOBN(0x915e367f, 0x8d7d05db), TOBN(0xb48a876d, 0xa6549e5d)}}, {{TOBN(0xef89c656, 0x580e40a2), TOBN(0xf194ed8c, 0x728068bc), TOBN(0x74528045, 0xa47990c9), TOBN(0xf53fc7d7, 0x5e1a4649)}, {TOBN(0xbec5ae9b, 0x78593e7d), TOBN(0x2cac4ee3, 0x41db65d7), TOBN(0xa8c1eb24, 0x04a3d39b), TOBN(0x53b7d634, 0x03f8f3ef)}}, {{TOBN(0x2dc40d48, 0x3e07113c), TOBN(0x6e4a5d39, 0x7d8b63ae), TOBN(0x5582a94b, 0x79684c2b), TOBN(0x932b33d4, 0x622da26c)}, {TOBN(0xf534f651, 0x0dbbf08d), TOBN(0x211d07c9, 0x64c23a52), TOBN(0x0eeece0f, 0xee5bdc9b), TOBN(0xdf178168, 0xf7015558)}}, {{TOBN(0xd4294635, 0x0a712229), TOBN(0x93cbe448, 0x09273f8c), TOBN(0x00b095ef, 0x8f13bc83), TOBN(0xbb741972, 0x8798978c)}, {TOBN(0x9d7309a2, 0x56dbe6e7), TOBN(0xe578ec56, 0x5a5d39ec), TOBN(0x3961151b, 0x851f9a31), TOBN(0x2da7715d, 0xe5709eb4)}}, {{TOBN(0x867f3017, 0x53dfabf0), TOBN(0x728d2078, 0xb8e39259), TOBN(0x5c75a0cd, 0x815d9958), TOBN(0xf84867a6, 0x16603be1)}, {TOBN(0xc865b13d, 0x70e35b1c), TOBN(0x02414468, 0x19b03e2c), TOBN(0xe46041da, 0xac1f3121), TOBN(0x7c9017ad, 0x6f028a7c)}}, {{TOBN(0xabc96de9, 0x0a482873), TOBN(0x4265d6b1, 0xb77e54d4), TOBN(0x68c38e79, 0xa57d88e7), TOBN(0xd461d766, 0x9ce82de3)}, {TOBN(0x817a9ec5, 0x64a7e489), TOBN(0xcc5675cd, 0xa0def5f2), TOBN(0x9a00e785, 0x985d494e), TOBN(0xc626833f, 0x1b03514a)}}, {{TOBN(0xabe7905a, 0x83cdd60e), TOBN(0x50602fb5, 0xa1170184), TOBN(0x689886cd, 0xb023642a), TOBN(0xd568d090, 0xa6e1fb00)}, {TOBN(0x5b1922c7, 0x0259217f), TOBN(0x93831cd9, 0xc43141e4), TOBN(0xdfca3587, 0x0c95f86e), TOBN(0xdec2057a, 0x568ae828)}}, {{TOBN(0xc44ea599, 0xf98a759a), TOBN(0x55a0a7a2, 0xf7c23c1d), TOBN(0xd5ffb6e6, 0x94c4f687), TOBN(0x3563cce2, 0x12848478)}, {TOBN(0x812b3517, 0xe7b1fbe1), TOBN(0x8a7dc979, 0x4f7338e0), TOBN(0x211ecee9, 0x52d048db), TOBN(0x2eea4056, 0xc86ea3b8)}}, {{TOBN(0xd8cb68a7, 0xba772b34), TOBN(0xe16ed341, 0x5f4e2541), TOBN(0x9b32f6a6, 0x0fec14db), TOBN(0xeee376f7, 0x391698be)}, {TOBN(0xe9a7aa17, 0x83674c02), TOBN(0x65832f97, 0x5843022a), TOBN(0x29f3a8da, 0x5ba4990f), TOBN(0x79a59c3a, 0xfb8e3216)}}, {{TOBN(0x9cdc4d2e, 0xbd19bb16), TOBN(0xc6c7cfd0, 0xb3262d86), TOBN(0xd4ce14d0, 0x969c0b47), TOBN(0x1fa352b7, 0x13e56128)}, {TOBN(0x383d55b8, 0x973db6d3), TOBN(0x71836850, 0xe8e5b7bf), TOBN(0xc7714596, 0xe6bb571f), TOBN(0x259df31f, 0x2d5b2dd2)}}, {{TOBN(0x568f8925, 0x913cc16d), TOBN(0x18bc5b6d, 0xe1a26f5a), TOBN(0xdfa413be, 0xf5f499ae), TOBN(0xf8835dec, 0xc3f0ae84)}, {TOBN(0xb6e60bd8, 0x65a40ab0), TOBN(0x65596439, 0x194b377e), TOBN(0xbcd85625, 0x92084a69), TOBN(0x5ce433b9, 0x4f23ede0)}}, {{TOBN(0xe8e8f04f, 0x6ad65143), TOBN(0x11511827, 0xd6e14af6), TOBN(0x3d390a10, 0x8295c0c7), TOBN(0x71e29ee4, 0x621eba16)}, {TOBN(0xa588fc09, 0x63717b46), TOBN(0x02be02fe, 0xe06ad4a2), TOBN(0x931558c6, 0x04c22b22), TOBN(0xbb4d4bd6, 0x12f3c849)}}, {{TOBN(0x54a4f496, 0x20efd662), TOBN(0x92ba6d20, 0xc5952d14), TOBN(0x2db8ea1e, 0xcc9784c2), TOBN(0x81cc10ca, 0x4b353644)}, {TOBN(0x40b570ad, 0x4b4d7f6c), TOBN(0x5c9f1d96, 
0x84a1dcd2), TOBN(0x01379f81, 0x3147e797), TOBN(0xe5c6097b, 0x2bd499f5)}}, {{TOBN(0x40dcafa6, 0x328e5e20), TOBN(0xf7b5244a, 0x54815550), TOBN(0xb9a4f118, 0x47bfc978), TOBN(0x0ea0e79f, 0xd25825b1)}, {TOBN(0xa50f96eb, 0x646c7ecf), TOBN(0xeb811493, 0x446dea9d), TOBN(0x2af04677, 0xdfabcf69), TOBN(0xbe3a068f, 0xc713f6e8)}}, {{TOBN(0x860d523d, 0x42e06189), TOBN(0xbf077941, 0x4e3aff13), TOBN(0x0b616dca, 0xc1b20650), TOBN(0xe66dd6d1, 0x2131300d)}, {TOBN(0xd4a0fd67, 0xff99abde), TOBN(0xc9903550, 0xc7aac50d), TOBN(0x022ecf8b, 0x7c46b2d7), TOBN(0x3333b1e8, 0x3abf92af)}}, {{TOBN(0x11cc113c, 0x6c491c14), TOBN(0x05976688, 0x80dd3f88), TOBN(0xf5b4d9e7, 0x29d932ed), TOBN(0xe982aad8, 0xa2c38b6d)}, {TOBN(0x6f925347, 0x8be0dcf0), TOBN(0x700080ae, 0x65ca53f2), TOBN(0xd8131156, 0x443ca77f), TOBN(0xe92d6942, 0xec51f984)}}, {{TOBN(0xd2a08af8, 0x85dfe9ae), TOBN(0xd825d9a5, 0x4d2a86ca), TOBN(0x2c53988d, 0x39dff020), TOBN(0xf38b135a, 0x430cdc40)}, {TOBN(0x0c918ae0, 0x62a7150b), TOBN(0xf31fd8de, 0x0c340e9b), TOBN(0xafa0e7ae, 0x4dbbf02e), TOBN(0x5847fb2a, 0x5eba6239)}}, {{TOBN(0x6b1647dc, 0xdccbac8b), TOBN(0xb642aa78, 0x06f485c8), TOBN(0x873f3765, 0x7038ecdf), TOBN(0x2ce5e865, 0xfa49d3fe)}, {TOBN(0xea223788, 0xc98c4400), TOBN(0x8104a8cd, 0xf1fa5279), TOBN(0xbcf7cc7a, 0x06becfd7), TOBN(0x49424316, 0xc8f974ae)}}, {{TOBN(0xc0da65e7, 0x84d6365d), TOBN(0xbcb7443f, 0x8f759fb8), TOBN(0x35c712b1, 0x7ae81930), TOBN(0x80428dff, 0x4c6e08ab)}, {TOBN(0xf19dafef, 0xa4faf843), TOBN(0xced8538d, 0xffa9855f), TOBN(0x20ac409c, 0xbe3ac7ce), TOBN(0x358c1fb6, 0x882da71e)}}, {{TOBN(0xafa9c0e5, 0xfd349961), TOBN(0x2b2cfa51, 0x8421c2fc), TOBN(0x2a80db17, 0xf3a28d38), TOBN(0xa8aba539, 0x5d138e7e)}, {TOBN(0x52012d1d, 0x6e96eb8d), TOBN(0x65d8dea0, 0xcbaf9622), TOBN(0x57735447, 0xb264f56c), TOBN(0xbeebef3f, 0x1b6c8da2)}}, {{TOBN(0xfc346d98, 0xce785254), TOBN(0xd50e8d72, 0xbb64a161), TOBN(0xc03567c7, 0x49794add), TOBN(0x15a76065, 0x752c7ef6)}, {TOBN(0x59f3a222, 0x961f23d6), TOBN(0x378e4438, 0x73ecc0b0), TOBN(0xc74be434, 0x5a82fde4), TOBN(0xae509af2, 0xd8b9cf34)}}, {{TOBN(0x4a61ee46, 0x577f44a1), TOBN(0xe09b748c, 0xb611deeb), TOBN(0xc0481b2c, 0xf5f7b884), TOBN(0x35626678, 0x61acfa6b)}, {TOBN(0x37f4c518, 0xbf8d21e6), TOBN(0x22d96531, 0xb205a76d), TOBN(0x37fb85e1, 0x954073c0), TOBN(0xbceafe4f, 0x65b3a567)}}, {{TOBN(0xefecdef7, 0xbe42a582), TOBN(0xd3fc6080, 0x65046be6), TOBN(0xc9af13c8, 0x09e8dba9), TOBN(0x1e6c9847, 0x641491ff)}, {TOBN(0x3b574925, 0xd30c31f7), TOBN(0xb7eb72ba, 0xac2a2122), TOBN(0x776a0dac, 0xef0859e7), TOBN(0x06fec314, 0x21900942)}}, {{TOBN(0x2464bc10, 0xf8c22049), TOBN(0x9bfbcce7, 0x875ebf69), TOBN(0xd7a88e2a, 0x4336326b), TOBN(0xda05261c, 0x5bc2acfa)}, {TOBN(0xc29f5bdc, 0xeba7efc8), TOBN(0x471237ca, 0x25dbbf2e), TOBN(0xa72773f2, 0x2975f127), TOBN(0xdc744e8e, 0x04d0b326)}}, {{TOBN(0x38a7ed16, 0xa56edb73), TOBN(0x64357e37, 0x2c007e70), TOBN(0xa167d15b, 0x5080b400), TOBN(0x07b41164, 0x23de4be1)}, {TOBN(0xb2d91e32, 0x74c89883), TOBN(0x3c162821, 0x2882e7ed), TOBN(0xad6b36ba, 0x7503e482), TOBN(0x48434e8e, 0x0ea34331)}}, {{TOBN(0x79f4f24f, 0x2c7ae0b9), TOBN(0xc46fbf81, 0x1939b44a), TOBN(0x76fefae8, 0x56595eb1), TOBN(0x417b66ab, 0xcd5f29c7)}, {TOBN(0x5f2332b2, 0xc5ceec20), TOBN(0xd69661ff, 0xe1a1cae2), TOBN(0x5ede7e52, 0x9b0286e6), TOBN(0x9d062529, 0xe276b993)}}, {{TOBN(0x324794b0, 0x7e50122b), TOBN(0xdd744f8b, 0x4af07ca5), TOBN(0x30a12f08, 0xd63fc97b), TOBN(0x39650f1a, 0x76626d9d)}, {TOBN(0x101b47f7, 0x1fa38477), TOBN(0x3d815f19, 0xd4dc124f), TOBN(0x1569ae95, 0xb26eb58a), TOBN(0xc3cde188, 0x95fb1887)}}, {{TOBN(0x54e9f37b, 0xf9539a48), 
TOBN(0xb0100e06, 0x7408c1a5), TOBN(0x821d9811, 0xea580cbb), TOBN(0x8af52d35, 0x86e50c56)}, {TOBN(0xdfbd9d47, 0xdbbf698b), TOBN(0x2961a1ea, 0x03dc1c73), TOBN(0x203d38f8, 0xe76a5df8), TOBN(0x08a53a68, 0x6def707a)}}, {{TOBN(0x26eefb48, 0x1bee45d4), TOBN(0xb3cee346, 0x3c688036), TOBN(0x463c5315, 0xc42f2469), TOBN(0x19d84d2e, 0x81378162)}, {TOBN(0x22d7c3c5, 0x1c4d349f), TOBN(0x65965844, 0x163d59c5), TOBN(0xcf198c56, 0xb8abceae), TOBN(0x6fb1fb1b, 0x628559d5)}}, {{TOBN(0x8bbffd06, 0x07bf8fe3), TOBN(0x46259c58, 0x3467734b), TOBN(0xd8953cea, 0x35f7f0d3), TOBN(0x1f0bece2, 0xd65b0ff1)}, {TOBN(0xf7d5b4b3, 0xf3c72914), TOBN(0x29e8ea95, 0x3cb53389), TOBN(0x4a365626, 0x836b6d46), TOBN(0xe849f910, 0xea174fde)}}, {{TOBN(0x7ec62fbb, 0xf4737f21), TOBN(0xd8dba5ab, 0x6209f5ac), TOBN(0x24b5d7a9, 0xa5f9adbe), TOBN(0x707d28f7, 0xa61dc768)}, {TOBN(0x7711460b, 0xcaa999ea), TOBN(0xba7b174d, 0x1c92e4cc), TOBN(0x3c4bab66, 0x18d4bf2d), TOBN(0xb8f0c980, 0xeb8bd279)}}, {{TOBN(0x024bea9a, 0x324b4737), TOBN(0xfba9e423, 0x32a83bca), TOBN(0x6e635643, 0xa232dced), TOBN(0x99619367, 0x2571c8ba)}, {TOBN(0xe8c9f357, 0x54b7032b), TOBN(0xf936b3ba, 0x2442d54a), TOBN(0x2263f0f0, 0x8290c65a), TOBN(0x48989780, 0xee2c7fdb)}}, {{TOBN(0xadc5d55a, 0x13d4f95e), TOBN(0x737cff85, 0xad9b8500), TOBN(0x271c557b, 0x8a73f43d), TOBN(0xbed617a4, 0xe18bc476)}, {TOBN(0x66245401, 0x7dfd8ab2), TOBN(0xae7b89ae, 0x3a2870aa), TOBN(0x1b555f53, 0x23a7e545), TOBN(0x6791e247, 0xbe057e4c)}}, {{TOBN(0x860136ad, 0x324fa34d), TOBN(0xea111447, 0x4cbeae28), TOBN(0x023a4270, 0xbedd3299), TOBN(0x3d5c3a7f, 0xc1c35c34)}, {TOBN(0xb0f6db67, 0x8d0412d2), TOBN(0xd92625e2, 0xfcdc6b9a), TOBN(0x92ae5ccc, 0x4e28a982), TOBN(0xea251c36, 0x47a3ce7e)}}, {{TOBN(0x9d658932, 0x790691bf), TOBN(0xed610589, 0x06b736ae), TOBN(0x712c2f04, 0xc0d63b6e), TOBN(0x5cf06fd5, 0xc63d488f)}, {TOBN(0x97363fac, 0xd9588e41), TOBN(0x1f9bf762, 0x2b93257e), TOBN(0xa9d1ffc4, 0x667acace), TOBN(0x1cf4a1aa, 0x0a061ecf)}}, {{TOBN(0x40e48a49, 0xdc1818d0), TOBN(0x0643ff39, 0xa3621ab0), TOBN(0x5768640c, 0xe39ef639), TOBN(0x1fc099ea, 0x04d86854)}, {TOBN(0x9130b9c3, 0xeccd28fd), TOBN(0xd743cbd2, 0x7eec54ab), TOBN(0x052b146f, 0xe5b475b6), TOBN(0x058d9a82, 0x900a7d1f)}}, {{TOBN(0x65e02292, 0x91262b72), TOBN(0x96f924f9, 0xbb0edf03), TOBN(0x5cfa59c8, 0xfe206842), TOBN(0xf6037004, 0x5eafa720)}, {TOBN(0x5f30699e, 0x18d7dd96), TOBN(0x381e8782, 0xcbab2495), TOBN(0x91669b46, 0xdd8be949), TOBN(0xb40606f5, 0x26aae8ef)}}, {{TOBN(0x2812b839, 0xfc6751a4), TOBN(0x16196214, 0xfba800ef), TOBN(0x4398d5ca, 0x4c1a2875), TOBN(0x720c00ee, 0x653d8349)}, {TOBN(0xc2699eb0, 0xd820007c), TOBN(0x880ee660, 0xa39b5825), TOBN(0x70694694, 0x471f6984), TOBN(0xf7d16ea8, 0xe3dda99a)}}, {{TOBN(0x28d675b2, 0xc0519a23), TOBN(0x9ebf94fe, 0x4f6952e3), TOBN(0xf28bb767, 0xa2294a8a), TOBN(0x85512b4d, 0xfe0af3f5)}, {TOBN(0x18958ba8, 0x99b16a0d), TOBN(0x95c2430c, 0xba7548a7), TOBN(0xb30d1b10, 0xa16be615), TOBN(0xe3ebbb97, 0x85bfb74c)}}, {{TOBN(0xa3273cfe, 0x18549fdb), TOBN(0xf6e200bf, 0x4fcdb792), TOBN(0x54a76e18, 0x83aba56c), TOBN(0x73ec66f6, 0x89ef6aa2)}, {TOBN(0x8d17add7, 0xd1b9a305), TOBN(0xa959c5b9, 0xb7ae1b9d), TOBN(0x88643522, 0x6bcc094a), TOBN(0xcc5616c4, 0xd7d429b9)}}, {{TOBN(0xa6dada01, 0xe6a33f7c), TOBN(0xc6217a07, 0x9d4e70ad), TOBN(0xd619a818, 0x09c15b7c), TOBN(0xea06b329, 0x0e80c854)}, {TOBN(0x174811ce, 0xa5f5e7b9), TOBN(0x66dfc310, 0x787c65f4), TOBN(0x4ea7bd69, 0x3316ab54), TOBN(0xc12c4acb, 0x1dcc0f70)}}, {{TOBN(0xe4308d1a, 0x1e407dd9), TOBN(0xe8a3587c, 0x91afa997), TOBN(0xea296c12, 0xab77b7a5), TOBN(0xb5ad49e4, 0x673c0d52)}, {TOBN(0x40f9b2b2, 
0x7006085a), TOBN(0xa88ff340, 0x87bf6ec2), TOBN(0x978603b1, 0x4e3066a6), TOBN(0xb3f99fc2, 0xb5e486e2)}}, {{TOBN(0x07b53f5e, 0xb2e63645), TOBN(0xbe57e547, 0x84c84232), TOBN(0xd779c216, 0x7214d5cf), TOBN(0x617969cd, 0x029a3aca)}, {TOBN(0xd17668cd, 0x8a7017a0), TOBN(0x77b4d19a, 0xbe9b7ee8), TOBN(0x58fd0e93, 0x9c161776), TOBN(0xa8c4f4ef, 0xd5968a72)}}, {{TOBN(0x296071cc, 0x67b3de77), TOBN(0xae3c0b8e, 0x634f7905), TOBN(0x67e440c2, 0x8a7100c9), TOBN(0xbb8c3c1b, 0xeb4b9b42)}, {TOBN(0x6d71e8ea, 0xc51b3583), TOBN(0x7591f5af, 0x9525e642), TOBN(0xf73a2f7b, 0x13f509f3), TOBN(0x618487aa, 0x5619ac9b)}}, {{TOBN(0x3a72e5f7, 0x9d61718a), TOBN(0x00413bcc, 0x7592d28c), TOBN(0x7d9b11d3, 0x963c35cf), TOBN(0x77623bcf, 0xb90a46ed)}, {TOBN(0xdeef273b, 0xdcdd2a50), TOBN(0x4a741f9b, 0x0601846e), TOBN(0x33b89e51, 0x0ec6e929), TOBN(0xcb02319f, 0x8b7f22cd)}}, {{TOBN(0xbbe1500d, 0x084bae24), TOBN(0x2f0ae8d7, 0x343d2693), TOBN(0xacffb5f2, 0x7cdef811), TOBN(0xaa0c030a, 0x263fb94f)}, {TOBN(0x6eef0d61, 0xa0f442de), TOBN(0xf92e1817, 0x27b139d3), TOBN(0x1ae6deb7, 0x0ad8bc28), TOBN(0xa89e38dc, 0xc0514130)}}, {{TOBN(0x81eeb865, 0xd2fdca23), TOBN(0x5a15ee08, 0xcc8ef895), TOBN(0x768fa10a, 0x01905614), TOBN(0xeff5b8ef, 0x880ee19b)}, {TOBN(0xf0c0cabb, 0xcb1c8a0e), TOBN(0x2e1ee9cd, 0xb8c838f9), TOBN(0x0587d8b8, 0x8a4a14c0), TOBN(0xf6f27896, 0x2ff698e5)}}, {{TOBN(0xed38ef1c, 0x89ee6256), TOBN(0xf44ee1fe, 0x6b353b45), TOBN(0x9115c0c7, 0x70e903b3), TOBN(0xc78ec0a1, 0x818f31df)}, {TOBN(0x6c003324, 0xb7dccbc6), TOBN(0xd96dd1f3, 0x163bbc25), TOBN(0x33aa82dd, 0x5cedd805), TOBN(0x123aae4f, 0x7f7eb2f1)}}, {{TOBN(0x1723fcf5, 0xa26262cd), TOBN(0x1f7f4d5d, 0x0060ebd5), TOBN(0xf19c5c01, 0xb2eaa3af), TOBN(0x2ccb9b14, 0x9790accf)}, {TOBN(0x1f9c1cad, 0x52324aa6), TOBN(0x63200526, 0x7247df54), TOBN(0x5732fe42, 0xbac96f82), TOBN(0x52fe771f, 0x01a1c384)}}, {{TOBN(0x546ca13d, 0xb1001684), TOBN(0xb56b4eee, 0xa1709f75), TOBN(0x266545a9, 0xd5db8672), TOBN(0xed971c90, 0x1e8f3cfb)}, {TOBN(0x4e7d8691, 0xe3a07b29), TOBN(0x7570d9ec, 0xe4b696b9), TOBN(0xdc5fa067, 0x7bc7e9ae), TOBN(0x68b44caf, 0xc82c4844)}}, {{TOBN(0x519d34b3, 0xbf44da80), TOBN(0x283834f9, 0x5ab32e66), TOBN(0x6e608797, 0x6278a000), TOBN(0x1e62960e, 0x627312f6)}, {TOBN(0x9b87b27b, 0xe6901c55), TOBN(0x80e78538, 0x24fdbc1f), TOBN(0xbbbc0951, 0x2facc27d), TOBN(0x06394239, 0xac143b5a)}}, {{TOBN(0x35bb4a40, 0x376c1944), TOBN(0x7cb62694, 0x63da1511), TOBN(0xafd29161, 0xb7148a3b), TOBN(0xa6f9d9ed, 0x4e2ea2ee)}, {TOBN(0x15dc2ca2, 0x880dd212), TOBN(0x903c3813, 0xa61139a9), TOBN(0x2aa7b46d, 0x6c0f8785), TOBN(0x36ce2871, 0x901c60ff)}}, {{TOBN(0xc683b028, 0xe10d9c12), TOBN(0x7573baa2, 0x032f33d3), TOBN(0x87a9b1f6, 0x67a31b58), TOBN(0xfd3ed11a, 0xf4ffae12)}, {TOBN(0x83dcaa9a, 0x0cb2748e), TOBN(0x8239f018, 0x5d6fdf16), TOBN(0xba67b49c, 0x72753941), TOBN(0x2beec455, 0xc321cb36)}}, {{TOBN(0x88015606, 0x3f8b84ce), TOBN(0x76417083, 0x8d38c86f), TOBN(0x054f1ca7, 0x598953dd), TOBN(0xc939e110, 0x4e8e7429)}, {TOBN(0x9b1ac2b3, 0x5a914f2f), TOBN(0x39e35ed3, 0xe74b8f9c), TOBN(0xd0debdb2, 0x781b2fb0), TOBN(0x1585638f, 0x2d997ba2)}}, {{TOBN(0x9c4b646e, 0x9e2fce99), TOBN(0x68a21081, 0x1e80857f), TOBN(0x06d54e44, 0x3643b52a), TOBN(0xde8d6d63, 0x0d8eb843)}, {TOBN(0x70321563, 0x42146a0a), TOBN(0x8ba826f2, 0x5eaa3622), TOBN(0x227a58bd, 0x86138787), TOBN(0x43b6c03c, 0x10281d37)}}, {{TOBN(0x6326afbb, 0xb54dde39), TOBN(0x744e5e8a, 0xdb6f2d5f), TOBN(0x48b2a99a, 0xcff158e1), TOBN(0xa93c8fa0, 0xef87918f)}, {TOBN(0x2182f956, 0xde058c5c), TOBN(0x216235d2, 0x936f9e7a), TOBN(0xace0c0db, 0xd2e31e67), TOBN(0xc96449bf, 0xf23ac3e7)}}, 
{{TOBN(0x7e9a2874, 0x170693bd), TOBN(0xa28e14fd, 0xa45e6335), TOBN(0x5757f6b3, 0x56427344), TOBN(0x822e4556, 0xacf8edf9)}, {TOBN(0x2b7a6ee2, 0xe6a285cd), TOBN(0x5866f211, 0xa9df3af0), TOBN(0x40dde2dd, 0xf845b844), TOBN(0x986c3726, 0x110e5e49)}}, {{TOBN(0x73680c2a, 0xf7172277), TOBN(0x57b94f0f, 0x0cccb244), TOBN(0xbdff7267, 0x2d438ca7), TOBN(0xbad1ce11, 0xcf4663fd)}, {TOBN(0x9813ed9d, 0xd8f71cae), TOBN(0xf43272a6, 0x961fdaa6), TOBN(0xbeff0119, 0xbd6d1637), TOBN(0xfebc4f91, 0x30361978)}}, {{TOBN(0x02b37a95, 0x2f41deff), TOBN(0x0e44a59a, 0xe63b89b7), TOBN(0x673257dc, 0x143ff951), TOBN(0x19c02205, 0xd752baf4)}, {TOBN(0x46c23069, 0xc4b7d692), TOBN(0x2e6392c3, 0xfd1502ac), TOBN(0x6057b1a2, 0x1b220846), TOBN(0xe51ff946, 0x0c1b5b63)}}}, {{{TOBN(0x6e85cb51, 0x566c5c43), TOBN(0xcff9c919, 0x3597f046), TOBN(0x9354e90c, 0x4994d94a), TOBN(0xe0a39332, 0x2147927d)}, {TOBN(0x8427fac1, 0x0dc1eb2b), TOBN(0x88cfd8c2, 0x2ff319fa), TOBN(0xe2d4e684, 0x01965274), TOBN(0xfa2e067d, 0x67aaa746)}}, {{TOBN(0xb6d92a7f, 0x3e5f9f11), TOBN(0x9afe153a, 0xd6cb3b8e), TOBN(0x4d1a6dd7, 0xddf800bd), TOBN(0xf6c13cc0, 0xcaf17e19)}, {TOBN(0x15f6c58e, 0x325fc3ee), TOBN(0x71095400, 0xa31dc3b2), TOBN(0x168e7c07, 0xafa3d3e7), TOBN(0x3f8417a1, 0x94c7ae2d)}}, {{TOBN(0xec234772, 0x813b230d), TOBN(0x634d0f5f, 0x17344427), TOBN(0x11548ab1, 0xd77fc56a), TOBN(0x7fab1750, 0xce06af77)}, {TOBN(0xb62c10a7, 0x4f7c4f83), TOBN(0xa7d2edc4, 0x220a67d9), TOBN(0x1c404170, 0x921209a0), TOBN(0x0b9815a0, 0xface59f0)}}, {{TOBN(0x2842589b, 0x319540c3), TOBN(0x18490f59, 0xa283d6f8), TOBN(0xa2731f84, 0xdaae9fcb), TOBN(0x3db6d960, 0xc3683ba0)}, {TOBN(0xc85c63bb, 0x14611069), TOBN(0xb19436af, 0x0788bf05), TOBN(0x905459df, 0x347460d2), TOBN(0x73f6e094, 0xe11a7db1)}}, {{TOBN(0xdc7f938e, 0xb6357f37), TOBN(0xc5d00f79, 0x2bd8aa62), TOBN(0xc878dcb9, 0x2ca979fc), TOBN(0x37e83ed9, 0xeb023a99)}, {TOBN(0x6b23e273, 0x1560bf3d), TOBN(0x1086e459, 0x1d0fae61), TOBN(0x78248316, 0x9a9414bd), TOBN(0x1b956bc0, 0xf0ea9ea1)}}, {{TOBN(0x7b85bb91, 0xc31b9c38), TOBN(0x0c5aa90b, 0x48ef57b5), TOBN(0xdedeb169, 0xaf3bab6f), TOBN(0xe610ad73, 0x2d373685)}, {TOBN(0xf13870df, 0x02ba8e15), TOBN(0x0337edb6, 0x8ca7f771), TOBN(0xe4acf747, 0xb62c036c), TOBN(0xd921d576, 0xb6b94e81)}}, {{TOBN(0xdbc86439, 0x2c422f7a), TOBN(0xfb635362, 0xed348898), TOBN(0x83084668, 0xc45bfcd1), TOBN(0xc357c9e3, 0x2b315e11)}, {TOBN(0xb173b540, 0x5b2e5b8c), TOBN(0x7e946931, 0xe102b9a4), TOBN(0x17c890eb, 0x7b0fb199), TOBN(0xec225a83, 0xd61b662b)}}, {{TOBN(0xf306a3c8, 0xee3c76cb), TOBN(0x3cf11623, 0xd32a1f6e), TOBN(0xe6d5ab64, 0x6863e956), TOBN(0x3b8a4cbe, 0x5c005c26)}, {TOBN(0xdcd529a5, 0x9ce6bb27), TOBN(0xc4afaa52, 0x04d4b16f), TOBN(0xb0624a26, 0x7923798d), TOBN(0x85e56df6, 0x6b307fab)}}, {{TOBN(0x0281893c, 0x2bf29698), TOBN(0x91fc19a4, 0xd7ce7603), TOBN(0x75a5dca3, 0xad9a558f), TOBN(0x40ceb3fa, 0x4d50bf77)}, {TOBN(0x1baf6060, 0xbc9ba369), TOBN(0x927e1037, 0x597888c2), TOBN(0xd936bf19, 0x86a34c07), TOBN(0xd4cf10c1, 0xc34ae980)}}, {{TOBN(0x3a3e5334, 0x859dd614), TOBN(0x9c475b5b, 0x18d0c8ee), TOBN(0x63080d1f, 0x07cd51d5), TOBN(0xc9c0d0a6, 0xb88b4326)}, {TOBN(0x1ac98691, 0xc234296f), TOBN(0x2a0a83a4, 0x94887fb6), TOBN(0x56511427, 0x0cea9cf2), TOBN(0x5230a6e8, 0xa24802f5)}}, {{TOBN(0xf7a2bf0f, 0x72e3d5c1), TOBN(0x37717446, 0x4f21439e), TOBN(0xfedcbf25, 0x9ce30334), TOBN(0xe0030a78, 0x7ce202f9)}, {TOBN(0x6f2d9ebf, 0x1202e9ca), TOBN(0xe79dde6c, 0x75e6e591), TOBN(0xf52072af, 0xf1dac4f8), TOBN(0x6c8d087e, 0xbb9b404d)}}, {{TOBN(0xad0fc73d, 0xbce913af), TOBN(0x909e587b, 0x458a07cb), TOBN(0x1300da84, 0xd4f00c8a), TOBN(0x425cd048, 
0xb54466ac)}, {TOBN(0xb59cb9be, 0x90e9d8bf), TOBN(0x991616db, 0x3e431b0e), TOBN(0xd3aa117a, 0x531aecff), TOBN(0x91af92d3, 0x59f4dc3b)}}, {{TOBN(0x9b1ec292, 0xe93fda29), TOBN(0x76bb6c17, 0xe97d91bc), TOBN(0x7509d95f, 0xaface1e6), TOBN(0x3653fe47, 0xbe855ae3)}, {TOBN(0x73180b28, 0x0f680e75), TOBN(0x75eefd1b, 0xeeb6c26c), TOBN(0xa4cdf29f, 0xb66d4236), TOBN(0x2d70a997, 0x6b5821d8)}}, {{TOBN(0x7a3ee207, 0x20445c36), TOBN(0x71d1ac82, 0x59877174), TOBN(0x0fc539f7, 0x949f73e9), TOBN(0xd05cf3d7, 0x982e3081)}, {TOBN(0x8758e20b, 0x7b1c7129), TOBN(0xffadcc20, 0x569e61f2), TOBN(0xb05d3a2f, 0x59544c2d), TOBN(0xbe16f5c1, 0x9fff5e53)}}, {{TOBN(0x73cf65b8, 0xaad58135), TOBN(0x622c2119, 0x037aa5be), TOBN(0x79373b3f, 0x646fd6a0), TOBN(0x0e029db5, 0x0d3978cf)}, {TOBN(0x8bdfc437, 0x94fba037), TOBN(0xaefbd687, 0x620797a6), TOBN(0x3fa5382b, 0xbd30d38e), TOBN(0x7627cfbf, 0x585d7464)}}, {{TOBN(0xb2330fef, 0x4e4ca463), TOBN(0xbcef7287, 0x3566cc63), TOBN(0xd161d2ca, 0xcf780900), TOBN(0x135dc539, 0x5b54827d)}, {TOBN(0x638f052e, 0x27bf1bc6), TOBN(0x10a224f0, 0x07dfa06c), TOBN(0xe973586d, 0x6d3321da), TOBN(0x8b0c5738, 0x26152c8f)}}, {{TOBN(0x07ef4f2a, 0x34606074), TOBN(0x80fe7fe8, 0xa0f7047a), TOBN(0x3d1a8152, 0xe1a0e306), TOBN(0x32cf43d8, 0x88da5222)}, {TOBN(0xbf89a95f, 0x5f02ffe6), TOBN(0x3d9eb9a4, 0x806ad3ea), TOBN(0x012c17bb, 0x79c8e55e), TOBN(0xfdcd1a74, 0x99c81dac)}}, {{TOBN(0x7043178b, 0xb9556098), TOBN(0x4090a1df, 0x801c3886), TOBN(0x759800ff, 0x9b67b912), TOBN(0x3e5c0304, 0x232620c8)}, {TOBN(0x4b9d3c4b, 0x70dceeca), TOBN(0xbb2d3c15, 0x181f648e), TOBN(0xf981d837, 0x6e33345c), TOBN(0xb626289b, 0x0cf2297a)}}, {{TOBN(0x766ac659, 0x8baebdcf), TOBN(0x1a28ae09, 0x75df01e5), TOBN(0xb71283da, 0x375876d8), TOBN(0x4865a96d, 0x607b9800)}, {TOBN(0x25dd1bcd, 0x237936b2), TOBN(0x332f4f4b, 0x60417494), TOBN(0xd0923d68, 0x370a2147), TOBN(0x497f5dfb, 0xdc842203)}}, {{TOBN(0x9dc74cbd, 0x32be5e0f), TOBN(0x7475bcb7, 0x17a01375), TOBN(0x438477c9, 0x50d872b1), TOBN(0xcec67879, 0xffe1d63d)}, {TOBN(0x9b006014, 0xd8578c70), TOBN(0xc9ad99a8, 0x78bb6b8b), TOBN(0x6799008e, 0x11fb3806), TOBN(0xcfe81435, 0xcd44cab3)}}, {{TOBN(0xa2ee1582, 0x2f4fb344), TOBN(0xb8823450, 0x483fa6eb), TOBN(0x622d323d, 0x652c7749), TOBN(0xd8474a98, 0xbeb0a15b)}, {TOBN(0xe43c154d, 0x5d1c00d0), TOBN(0x7fd581d9, 0x0e3e7aac), TOBN(0x2b44c619, 0x2525ddf8), TOBN(0x67a033eb, 0xb8ae9739)}}, {{TOBN(0x113ffec1, 0x9ef2d2e4), TOBN(0x1bf6767e, 0xd5a0ea7f), TOBN(0x57fff75e, 0x03714c0a), TOBN(0xa23c422e, 0x0a23e9ee)}, {TOBN(0xdd5f6b2d, 0x540f83af), TOBN(0xc2c2c27e, 0x55ea46a7), TOBN(0xeb6b4246, 0x672a1208), TOBN(0xd13599f7, 0xae634f7a)}}, {{TOBN(0xcf914b5c, 0xd7b32c6e), TOBN(0x61a5a640, 0xeaf61814), TOBN(0x8dc3df8b, 0x208a1bbb), TOBN(0xef627fd6, 0xb6d79aa5)}, {TOBN(0x44232ffc, 0xc4c86bc8), TOBN(0xe6f9231b, 0x061539fe), TOBN(0x1d04f25a, 0x958b9533), TOBN(0x180cf934, 0x49e8c885)}}, {{TOBN(0x89689595, 0x9884aaf7), TOBN(0xb1959be3, 0x07b348a6), TOBN(0x96250e57, 0x3c147c87), TOBN(0xae0efb3a, 0xdd0c61f8)}, {TOBN(0xed00745e, 0xca8c325e), TOBN(0x3c911696, 0xecff3f70), TOBN(0x73acbc65, 0x319ad41d), TOBN(0x7b01a020, 0xf0b1c7ef)}}, {{TOBN(0xea32b293, 0x63a1483f), TOBN(0x89eabe71, 0x7a248f96), TOBN(0x9c6231d3, 0x343157e5), TOBN(0x93a375e5, 0xdf3c546d)}, {TOBN(0xe76e9343, 0x6a2afe69), TOBN(0xc4f89100, 0xe166c88e), TOBN(0x248efd0d, 0x4f872093), TOBN(0xae0eb3ea, 0x8fe0ea61)}}, {{TOBN(0xaf89790d, 0x9d79046e), TOBN(0x4d650f2d, 0x6cee0976), TOBN(0xa3935d9a, 0x43071eca), TOBN(0x66fcd2c9, 0x283b0bfe)}, {TOBN(0x0e665eb5, 0x696605f1), TOBN(0xe77e5d07, 0xa54cd38d), TOBN(0x90ee050a, 0x43d950cf), 
TOBN(0x86ddebda, 0xd32e69b5)}}, {{TOBN(0x6ad94a3d, 0xfddf7415), TOBN(0xf7fa1309, 0x3f6e8d5a), TOBN(0xc4831d1d, 0xe9957f75), TOBN(0x7de28501, 0xd5817447)}, {TOBN(0x6f1d7078, 0x9e2aeb6b), TOBN(0xba2b9ff4, 0xf67a53c2), TOBN(0x36963767, 0xdf9defc3), TOBN(0x479deed3, 0x0d38022c)}}, {{TOBN(0xd2edb89b, 0x3a8631e8), TOBN(0x8de855de, 0x7a213746), TOBN(0xb2056cb7, 0xb00c5f11), TOBN(0xdeaefbd0, 0x2c9b85e4)}, {TOBN(0x03f39a8d, 0xd150892d), TOBN(0x37b84686, 0x218b7985), TOBN(0x36296dd8, 0xb7375f1a), TOBN(0x472cd4b1, 0xb78e898e)}}, {{TOBN(0x15dff651, 0xe9f05de9), TOBN(0xd4045069, 0x2ce98ba9), TOBN(0x8466a7ae, 0x9b38024c), TOBN(0xb910e700, 0xe5a6b5ef)}, {TOBN(0xae1c56ea, 0xb3aa8f0d), TOBN(0xbab2a507, 0x7eee74a6), TOBN(0x0dca11e2, 0x4b4c4620), TOBN(0xfd896e2e, 0x4c47d1f4)}}, {{TOBN(0xeb45ae53, 0x308fbd93), TOBN(0x46cd5a2e, 0x02c36fda), TOBN(0x6a3d4e90, 0xbaa48385), TOBN(0xdd55e62e, 0x9dbe9960)}, {TOBN(0xa1406aa0, 0x2a81ede7), TOBN(0x6860dd14, 0xf9274ea7), TOBN(0xcfdcb0c2, 0x80414f86), TOBN(0xff410b10, 0x22f94327)}}, {{TOBN(0x5a33cc38, 0x49ad467b), TOBN(0xefb48b6c, 0x0a7335f1), TOBN(0x14fb54a4, 0xb153a360), TOBN(0x604aa9d2, 0xb52469cc)}, {TOBN(0x5e9dc486, 0x754e48e9), TOBN(0x693cb455, 0x37471e8e), TOBN(0xfb2fd7cd, 0x8d3b37b6), TOBN(0x63345e16, 0xcf09ff07)}}, {{TOBN(0x9910ba6b, 0x23a5d896), TOBN(0x1fe19e35, 0x7fe4364e), TOBN(0x6e1da8c3, 0x9a33c677), TOBN(0x15b4488b, 0x29fd9fd0)}, {TOBN(0x1f439254, 0x1a1f22bf), TOBN(0x920a8a70, 0xab8163e8), TOBN(0x3fd1b249, 0x07e5658e), TOBN(0xf2c4f79c, 0xb6ec839b)}}, {{TOBN(0x1abbc3d0, 0x4aa38d1b), TOBN(0x3b0db35c, 0xb5d9510e), TOBN(0x1754ac78, 0x3e60dec0), TOBN(0x53272fd7, 0xea099b33)}, {TOBN(0x5fb0494f, 0x07a8e107), TOBN(0x4a89e137, 0x6a8191fa), TOBN(0xa113b7f6, 0x3c4ad544), TOBN(0x88a2e909, 0x6cb9897b)}}, {{TOBN(0x17d55de3, 0xb44a3f84), TOBN(0xacb2f344, 0x17c6c690), TOBN(0x32088168, 0x10232390), TOBN(0xf2e8a61f, 0x6c733bf7)}, {TOBN(0xa774aab6, 0x9c2d7652), TOBN(0xfb5307e3, 0xed95c5bc), TOBN(0xa05c73c2, 0x4981f110), TOBN(0x1baae31c, 0xa39458c9)}}, {{TOBN(0x1def185b, 0xcbea62e7), TOBN(0xe8ac9eae, 0xeaf63059), TOBN(0x098a8cfd, 0x9921851c), TOBN(0xd959c3f1, 0x3abe2f5b)}, {TOBN(0xa4f19525, 0x20e40ae5), TOBN(0x320789e3, 0x07a24aa1), TOBN(0x259e6927, 0x7392b2bc), TOBN(0x58f6c667, 0x1918668b)}}, {{TOBN(0xce1db2bb, 0xc55d2d8b), TOBN(0x41d58bb7, 0xf4f6ca56), TOBN(0x7650b680, 0x8f877614), TOBN(0x905e16ba, 0xf4c349ed)}, {TOBN(0xed415140, 0xf661acac), TOBN(0x3b8784f0, 0xcb2270af), TOBN(0x3bc280ac, 0x8a402cba), TOBN(0xd53f7146, 0x0937921a)}}, {{TOBN(0xc03c8ee5, 0xe5681e83), TOBN(0x62126105, 0xf6ac9e4a), TOBN(0x9503a53f, 0x936b1a38), TOBN(0x3d45e2d4, 0x782fecbd)}, {TOBN(0x69a5c439, 0x76e8ae98), TOBN(0xb53b2eeb, 0xbfb4b00e), TOBN(0xf1674712, 0x72386c89), TOBN(0x30ca34a2, 0x4268bce4)}}, {{TOBN(0x7f1ed86c, 0x78341730), TOBN(0x8ef5beb8, 0xb525e248), TOBN(0xbbc489fd, 0xb74fbf38), TOBN(0x38a92a0e, 0x91a0b382)}, {TOBN(0x7a77ba3f, 0x22433ccf), TOBN(0xde8362d6, 0xa29f05a9), TOBN(0x7f6a30ea, 0x61189afc), TOBN(0x693b5505, 0x59ef114f)}}, {{TOBN(0x50266bc0, 0xcd1797a1), TOBN(0xea17b47e, 0xf4b7af2d), TOBN(0xd6c4025c, 0x3df9483e), TOBN(0x8cbb9d9f, 0xa37b18c9)}, {TOBN(0x91cbfd9c, 0x4d8424cf), TOBN(0xdb7048f1, 0xab1c3506), TOBN(0x9eaf641f, 0x028206a3), TOBN(0xf986f3f9, 0x25bdf6ce)}}, {{TOBN(0x262143b5, 0x224c08dc), TOBN(0x2bbb09b4, 0x81b50c91), TOBN(0xc16ed709, 0xaca8c84f), TOBN(0xa6210d9d, 0xb2850ca8)}, {TOBN(0x6d8df67a, 0x09cb54d6), TOBN(0x91eef6e0, 0x500919a4), TOBN(0x90f61381, 0x0f132857), TOBN(0x9acede47, 0xf8d5028b)}}, {{TOBN(0x844d1b71, 0x90b771c3), TOBN(0x563b71e4, 0xba6426be), TOBN(0x2efa2e83, 
0xbdb802ff), TOBN(0x3410cbab, 0xab5b4a41)}, {TOBN(0x555b2d26, 0x30da84dd), TOBN(0xd0711ae9, 0xee1cc29a), TOBN(0xcf3e8c60, 0x2f547792), TOBN(0x03d7d5de, 0xdc678b35)}}, {{TOBN(0x071a2fa8, 0xced806b8), TOBN(0x222e6134, 0x697f1478), TOBN(0xdc16fd5d, 0xabfcdbbf), TOBN(0x44912ebf, 0x121b53b8)}, {TOBN(0xac943674, 0x2496c27c), TOBN(0x8ea3176c, 0x1ffc26b0), TOBN(0xb6e224ac, 0x13debf2c), TOBN(0x524cc235, 0xf372a832)}}, {{TOBN(0xd706e1d8, 0x9f6f1b18), TOBN(0x2552f005, 0x44cce35b), TOBN(0x8c8326c2, 0xa88e31fc), TOBN(0xb5468b2c, 0xf9552047)}, {TOBN(0xce683e88, 0x3ff90f2b), TOBN(0x77947bdf, 0x2f0a5423), TOBN(0xd0a1b28b, 0xed56e328), TOBN(0xaee35253, 0xc20134ac)}}, {{TOBN(0x7e98367d, 0x3567962f), TOBN(0x379ed61f, 0x8188bffb), TOBN(0x73bba348, 0xfaf130a1), TOBN(0x6c1f75e1, 0x904ed734)}, {TOBN(0x18956642, 0x3b4a79fc), TOBN(0xf20bc83d, 0x54ef4493), TOBN(0x836d425d, 0x9111eca1), TOBN(0xe5b5c318, 0x009a8dcf)}}, {{TOBN(0x3360b25d, 0x13221bc5), TOBN(0x707baad2, 0x6b3eeaf7), TOBN(0xd7279ed8, 0x743a95a1), TOBN(0x7450a875, 0x969e809f)}, {TOBN(0x32b6bd53, 0xe5d0338f), TOBN(0x1e77f7af, 0x2b883bbc), TOBN(0x90da12cc, 0x1063ecd0), TOBN(0xe2697b58, 0xc315be47)}}, {{TOBN(0x2771a5bd, 0xda85d534), TOBN(0x53e78c1f, 0xff980eea), TOBN(0xadf1cf84, 0x900385e7), TOBN(0x7d3b14f6, 0xc9387b62)}, {TOBN(0x170e74b0, 0xcb8f2bd2), TOBN(0x2d50b486, 0x827fa993), TOBN(0xcdbe8c9a, 0xf6f32bab), TOBN(0x55e906b0, 0xc3b93ab8)}}, {{TOBN(0x747f22fc, 0x8fe280d1), TOBN(0xcd8e0de5, 0xb2e114ab), TOBN(0x5ab7dbeb, 0xe10b68b0), TOBN(0x9dc63a9c, 0xa480d4b2)}, {TOBN(0x78d4bc3b, 0x4be1495f), TOBN(0x25eb3db8, 0x9359122d), TOBN(0x3f8ac05b, 0x0809cbdc), TOBN(0xbf4187bb, 0xd37c702f)}}, {{TOBN(0x84cea069, 0x1416a6a5), TOBN(0x8f860c79, 0x43ef881c), TOBN(0x41311f8a, 0x38038a5d), TOBN(0xe78c2ec0, 0xfc612067)}, {TOBN(0x494d2e81, 0x5ad73581), TOBN(0xb4cc9e00, 0x59604097), TOBN(0xff558aec, 0xf3612cba), TOBN(0x35beef7a, 0x9e36c39e)}}, {{TOBN(0x1845c7cf, 0xdbcf41b9), TOBN(0x5703662a, 0xaea997c0), TOBN(0x8b925afe, 0xe402f6d8), TOBN(0xd0a1b1ae, 0x4dd72162)}, {TOBN(0x9f47b375, 0x03c41c4b), TOBN(0xa023829b, 0x0391d042), TOBN(0x5f5045c3, 0x503b8b0a), TOBN(0x123c2688, 0x98c010e5)}}, {{TOBN(0x324ec0cc, 0x36ba06ee), TOBN(0xface3115, 0x3dd2cc0c), TOBN(0xb364f3be, 0xf333e91f), TOBN(0xef8aff73, 0x28e832b0)}, {TOBN(0x1e9bad04, 0x2d05841b), TOBN(0x42f0e3df, 0x356a21e2), TOBN(0xa3270bcb, 0x4add627e), TOBN(0xb09a8158, 0xd322e711)}}, {{TOBN(0x86e326a1, 0x0fee104a), TOBN(0xad7788f8, 0x3703f65d), TOBN(0x7e765430, 0x47bc4833), TOBN(0x6cee582b, 0x2b9b893a)}, {TOBN(0x9cd2a167, 0xe8f55a7b), TOBN(0xefbee3c6, 0xd9e4190d), TOBN(0x33ee7185, 0xd40c2e9d), TOBN(0x844cc9c5, 0xa380b548)}}, {{TOBN(0x323f8ecd, 0x66926e04), TOBN(0x0001e38f, 0x8110c1ba), TOBN(0x8dbcac12, 0xfc6a7f07), TOBN(0xd65e1d58, 0x0cec0827)}, {TOBN(0xd2cd4141, 0xbe76ca2d), TOBN(0x7895cf5c, 0xe892f33a), TOBN(0x956d230d, 0x367139d2), TOBN(0xa91abd3e, 0xd012c4c1)}}, {{TOBN(0x34fa4883, 0x87eb36bf), TOBN(0xc5f07102, 0x914b8fb4), TOBN(0x90f0e579, 0xadb9c95f), TOBN(0xfe6ea8cb, 0x28888195)}, {TOBN(0x7b9b5065, 0xedfa9284), TOBN(0x6c510bd2, 0x2b8c8d65), TOBN(0xd7b8ebef, 0xcbe8aafd), TOBN(0xedb3af98, 0x96b1da07)}}, {{TOBN(0x28ff779d, 0x6295d426), TOBN(0x0c4f6ac7, 0x3fa3ad7b), TOBN(0xec44d054, 0x8b8e2604), TOBN(0x9b32a66d, 0x8b0050e1)}, {TOBN(0x1f943366, 0xf0476ce2), TOBN(0x7554d953, 0xa602c7b4), TOBN(0xbe35aca6, 0x524f2809), TOBN(0xb6881229, 0xfd4edbea)}}, {{TOBN(0xe8cd0c8f, 0x508efb63), TOBN(0x9eb5b5c8, 0x6abcefc7), TOBN(0xf5621f5f, 0xb441ab4f), TOBN(0x79e6c046, 0xb76a2b22)}, {TOBN(0x74a4792c, 0xe37a1f69), TOBN(0xcbd252cb, 0x03542b60), 
TOBN(0x785f65d5, 0xb3c20bd3), TOBN(0x8dea6143, 0x4fabc60c)}}, {{TOBN(0x45e21446, 0xde673629), TOBN(0x57f7aa1e, 0x703c2d21), TOBN(0xa0e99b7f, 0x98c868c7), TOBN(0x4e42f66d, 0x8b641676)}, {TOBN(0x602884dc, 0x91077896), TOBN(0xa0d690cf, 0xc2c9885b), TOBN(0xfeb4da33, 0x3b9a5187), TOBN(0x5f789598, 0x153c87ee)}}, {{TOBN(0x2192dd47, 0x52b16dba), TOBN(0xdeefc0e6, 0x3524c1b1), TOBN(0x465ea76e, 0xe4383693), TOBN(0x79401711, 0x361b8d98)}, {TOBN(0xa5f9ace9, 0xf21a15cb), TOBN(0x73d26163, 0xefee9aeb), TOBN(0xcca844b3, 0xe677016c), TOBN(0x6c122b07, 0x57eaee06)}}, {{TOBN(0xb782dce7, 0x15f09690), TOBN(0x508b9b12, 0x2dfc0fc9), TOBN(0x9015ab4b, 0x65d89fc6), TOBN(0x5e79dab7, 0xd6d5bb0f)}, {TOBN(0x64f021f0, 0x6c775aa2), TOBN(0xdf09d8cc, 0x37c7eca1), TOBN(0x9a761367, 0xef2fa506), TOBN(0xed4ca476, 0x5b81eec6)}}, {{TOBN(0x262ede36, 0x10bbb8b5), TOBN(0x0737ce83, 0x0641ada3), TOBN(0x4c94288a, 0xe9831ccc), TOBN(0x487fc1ce, 0x8065e635)}, {TOBN(0xb13d7ab3, 0xb8bb3659), TOBN(0xdea5df3e, 0x855e4120), TOBN(0xb9a18573, 0x85eb0244), TOBN(0x1a1b8ea3, 0xa7cfe0a3)}}, {{TOBN(0x3b837119, 0x67b0867c), TOBN(0x8d5e0d08, 0x9d364520), TOBN(0x52dccc1e, 0xd930f0e3), TOBN(0xefbbcec7, 0xbf20bbaf)}, {TOBN(0x99cffcab, 0x0263ad10), TOBN(0xd8199e6d, 0xfcd18f8a), TOBN(0x64e2773f, 0xe9f10617), TOBN(0x0079e8e1, 0x08704848)}}, {{TOBN(0x1169989f, 0x8a342283), TOBN(0x8097799c, 0xa83012e6), TOBN(0xece966cb, 0x8a6a9001), TOBN(0x93b3afef, 0x072ac7fc)}, {TOBN(0xe6893a2a, 0x2db3d5ba), TOBN(0x263dc462, 0x89bf4fdc), TOBN(0x8852dfc9, 0xe0396673), TOBN(0x7ac70895, 0x3af362b6)}}, {{TOBN(0xbb9cce4d, 0x5c2f342b), TOBN(0xbf80907a, 0xb52d7aae), TOBN(0x97f3d3cd, 0x2161bcd0), TOBN(0xb25b0834, 0x0962744d)}, {TOBN(0xc5b18ea5, 0x6c3a1dda), TOBN(0xfe4ec7eb, 0x06c92317), TOBN(0xb787b890, 0xad1c4afe), TOBN(0xdccd9a92, 0x0ede801a)}}, {{TOBN(0x9ac6ddda, 0xdb58da1f), TOBN(0x22bbc12f, 0xb8cae6ee), TOBN(0xc6f8bced, 0x815c4a43), TOBN(0x8105a92c, 0xf96480c7)}, {TOBN(0x0dc3dbf3, 0x7a859d51), TOBN(0xe3ec7ce6, 0x3041196b), TOBN(0xd9f64b25, 0x0d1067c9), TOBN(0xf2321321, 0x3d1f8dd8)}}, {{TOBN(0x8b5c619c, 0x76497ee8), TOBN(0x5d2b0ac6, 0xc717370e), TOBN(0x98204cb6, 0x4fcf68e1), TOBN(0x0bdec211, 0x62bc6792)}, {TOBN(0x6973ccef, 0xa63b1011), TOBN(0xf9e3fa97, 0xe0de1ac5), TOBN(0x5efb693e, 0x3d0e0c8b), TOBN(0x037248e9, 0xd2d4fcb4)}}}, {{{TOBN(0x80802dc9, 0x1ec34f9e), TOBN(0xd8772d35, 0x33810603), TOBN(0x3f06d66c, 0x530cb4f3), TOBN(0x7be5ed0d, 0xc475c129)}, {TOBN(0xcb9e3c19, 0x31e82b10), TOBN(0xc63d2857, 0xc9ff6b4c), TOBN(0xb92118c6, 0x92a1b45e), TOBN(0x0aec4414, 0x7285bbca)}}, {{TOBN(0xfc189ae7, 0x1e29a3ef), TOBN(0xcbe906f0, 0x4c93302e), TOBN(0xd0107914, 0xceaae10e), TOBN(0xb7a23f34, 0xb68e19f8)}, {TOBN(0xe9d875c2, 0xefd2119d), TOBN(0x03198c6e, 0xfcadc9c8), TOBN(0x65591bf6, 0x4da17113), TOBN(0x3cf0bbf8, 0x3d443038)}}, {{TOBN(0xae485bb7, 0x2b724759), TOBN(0x945353e1, 0xb2d4c63a), TOBN(0x82159d07, 0xde7d6f2c), TOBN(0x389caef3, 0x4ec5b109)}, {TOBN(0x4a8ebb53, 0xdb65ef14), TOBN(0x2dc2cb7e, 0xdd99de43), TOBN(0x816fa3ed, 0x83f2405f), TOBN(0x73429bb9, 0xc14208a3)}}, {{TOBN(0xb618d590, 0xb01e6e27), TOBN(0x047e2ccd, 0xe180b2dc), TOBN(0xd1b299b5, 0x04aea4a9), TOBN(0x412c9e1e, 0x9fa403a4)}, {TOBN(0x88d28a36, 0x79407552), TOBN(0x49c50136, 0xf332b8e3), TOBN(0x3a1b6fcc, 0xe668de19), TOBN(0x178851bc, 0x75122b97)}}, {{TOBN(0xb1e13752, 0xfb85fa4c), TOBN(0xd61257ce, 0x383c8ce9), TOBN(0xd43da670, 0xd2f74dae), TOBN(0xa35aa23f, 0xbf846bbb)}, {TOBN(0x5e74235d, 0x4421fc83), TOBN(0xf6df8ee0, 0xc363473b), TOBN(0x34d7f52a, 0x3c4aa158), TOBN(0x50d05aab, 0x9bc6d22e)}}, {{TOBN(0x8c56e735, 0xa64785f4), 
TOBN(0xbc56637b, 0x5f29cd07), TOBN(0x53b2bb80, 0x3ee35067), TOBN(0x50235a0f, 0xdc919270)}, {TOBN(0x191ab6d8, 0xf2c4aa65), TOBN(0xc3475831, 0x8396023b), TOBN(0x80400ba5, 0xf0f805ba), TOBN(0x8881065b, 0x5ec0f80f)}}, {{TOBN(0xc370e522, 0xcc1b5e83), TOBN(0xde2d4ad1, 0x860b8bfb), TOBN(0xad364df0, 0x67b256df), TOBN(0x8f12502e, 0xe0138997)}, {TOBN(0x503fa0dc, 0x7783920a), TOBN(0xe80014ad, 0xc0bc866a), TOBN(0x3f89b744, 0xd3064ba6), TOBN(0x03511dcd, 0xcba5dba5)}}, {{TOBN(0x197dd46d, 0x95a7b1a2), TOBN(0x9c4e7ad6, 0x3c6341fb), TOBN(0x426eca29, 0x484c2ece), TOBN(0x9211e489, 0xde7f4f8a)}, {TOBN(0x14997f6e, 0xc78ef1f4), TOBN(0x2b2c0910, 0x06574586), TOBN(0x17286a6e, 0x1c3eede8), TOBN(0x25f92e47, 0x0f60e018)}}, {{TOBN(0x805c5646, 0x31890a36), TOBN(0x703ef600, 0x57feea5b), TOBN(0x389f747c, 0xaf3c3030), TOBN(0xe0e5daeb, 0x54dd3739)}, {TOBN(0xfe24a4c3, 0xc9c9f155), TOBN(0x7e4bf176, 0xb5393962), TOBN(0x37183de2, 0xaf20bf29), TOBN(0x4a1bd7b5, 0xf95a8c3b)}}, {{TOBN(0xa83b9699, 0x46191d3d), TOBN(0x281fc8dd, 0x7b87f257), TOBN(0xb18e2c13, 0x54107588), TOBN(0x6372def7, 0x9b2bafe8)}, {TOBN(0xdaf4bb48, 0x0d8972ca), TOBN(0x3f2dd4b7, 0x56167a3f), TOBN(0x1eace32d, 0x84310cf4), TOBN(0xe3bcefaf, 0xe42700aa)}}, {{TOBN(0x5fe5691e, 0xd785e73d), TOBN(0xa5db5ab6, 0x2ea60467), TOBN(0x02e23d41, 0xdfc6514a), TOBN(0x35e8048e, 0xe03c3665)}, {TOBN(0x3f8b118f, 0x1adaa0f8), TOBN(0x28ec3b45, 0x84ce1a5a), TOBN(0xe8cacc6e, 0x2c6646b8), TOBN(0x1343d185, 0xdbd0e40f)}}, {{TOBN(0xe5d7f844, 0xcaaa358c), TOBN(0x1a1db7e4, 0x9924182a), TOBN(0xd64cd42d, 0x9c875d9a), TOBN(0xb37b515f, 0x042eeec8)}, {TOBN(0x4d4dd409, 0x7b165fbe), TOBN(0xfc322ed9, 0xe206eff3), TOBN(0x7dee4102, 0x59b7e17e), TOBN(0x55a481c0, 0x8236ca00)}}, {{TOBN(0x8c885312, 0xc23fc975), TOBN(0x15715806, 0x05d6297b), TOBN(0xa078868e, 0xf78edd39), TOBN(0x956b31e0, 0x03c45e52)}, {TOBN(0x470275d5, 0xff7b33a6), TOBN(0xc8d5dc3a, 0x0c7e673f), TOBN(0x419227b4, 0x7e2f2598), TOBN(0x8b37b634, 0x4c14a975)}}, {{TOBN(0xd0667ed6, 0x8b11888c), TOBN(0x5e0e8c3e, 0x803e25dc), TOBN(0x34e5d0dc, 0xb987a24a), TOBN(0x9f40ac3b, 0xae920323)}, {TOBN(0x5463de95, 0x34e0f63a), TOBN(0xa128bf92, 0x6b6328f9), TOBN(0x491ccd7c, 0xda64f1b7), TOBN(0x7ef1ec27, 0xc47bde35)}}, {{TOBN(0xa857240f, 0xa36a2737), TOBN(0x35dc1366, 0x63621bc1), TOBN(0x7a3a6453, 0xd4fb6897), TOBN(0x80f1a439, 0xc929319d)}, {TOBN(0xfc18274b, 0xf8cb0ba0), TOBN(0xb0b53766, 0x8078c5eb), TOBN(0xfb0d4924, 0x1e01d0ef), TOBN(0x50d7c67d, 0x372ab09c)}}, {{TOBN(0xb4e370af, 0x3aeac968), TOBN(0xe4f7fee9, 0xc4b63266), TOBN(0xb4acd4c2, 0xe3ac5664), TOBN(0xf8910bd2, 0xceb38cbf)}, {TOBN(0x1c3ae50c, 0xc9c0726e), TOBN(0x15309569, 0xd97b40bf), TOBN(0x70884b7f, 0xfd5a5a1b), TOBN(0x3890896a, 0xef8314cd)}}, {{TOBN(0x58e1515c, 0xa5618c93), TOBN(0xe665432b, 0x77d942d1), TOBN(0xb32181bf, 0xb6f767a8), TOBN(0x753794e8, 0x3a604110)}, {TOBN(0x09afeb7c, 0xe8c0dbcc), TOBN(0x31e02613, 0x598673a3), TOBN(0x5d98e557, 0x7d46db00), TOBN(0xfc21fb8c, 0x9d985b28)}}, {{TOBN(0xc9040116, 0xb0843e0b), TOBN(0x53b1b3a8, 0x69b04531), TOBN(0xdd1649f0, 0x85d7d830), TOBN(0xbb3bcc87, 0xcb7427e8)}, {TOBN(0x77261100, 0xc93dce83), TOBN(0x7e79da61, 0xa1922a2a), TOBN(0x587a2b02, 0xf3149ce8), TOBN(0x147e1384, 0xde92ec83)}}, {{TOBN(0x484c83d3, 0xaf077f30), TOBN(0xea78f844, 0x0658b53a), TOBN(0x912076c2, 0x027aec53), TOBN(0xf34714e3, 0x93c8177d)}, {TOBN(0x37ef5d15, 0xc2376c84), TOBN(0x8315b659, 0x3d1aa783), TOBN(0x3a75c484, 0xef852a90), TOBN(0x0ba0c58a, 0x16086bd4)}}, {{TOBN(0x29688d7a, 0x529a6d48), TOBN(0x9c7f250d, 0xc2f19203), TOBN(0x123042fb, 0x682e2df9), TOBN(0x2b7587e7, 0xad8121bc)}, {TOBN(0x30fc0233, 
0xe0182a65), TOBN(0xb82ecf87, 0xe3e1128a), TOBN(0x71682861, 0x93fb098f), TOBN(0x043e21ae, 0x85e9e6a7)}}, {{TOBN(0xab5b49d6, 0x66c834ea), TOBN(0x3be43e18, 0x47414287), TOBN(0xf40fb859, 0x219a2a47), TOBN(0x0e6559e9, 0xcc58df3c)}, {TOBN(0xfe1dfe8e, 0x0c6615b4), TOBN(0x14abc8fd, 0x56459d70), TOBN(0x7be0fa8e, 0x05de0386), TOBN(0x8e63ef68, 0xe9035c7c)}}, {{TOBN(0x116401b4, 0x53b31e91), TOBN(0x0cba7ad4, 0x4436b4d8), TOBN(0x9151f9a0, 0x107afd66), TOBN(0xafaca8d0, 0x1f0ee4c4)}, {TOBN(0x75fe5c1d, 0x9ee9761c), TOBN(0x3497a16b, 0xf0c0588f), TOBN(0x3ee2bebd, 0x0304804c), TOBN(0xa8fb9a60, 0xc2c990b9)}}, {{TOBN(0xd14d32fe, 0x39251114), TOBN(0x36bf25bc, 0xcac73366), TOBN(0xc9562c66, 0xdba7495c), TOBN(0x324d301b, 0x46ad348b)}, {TOBN(0x9f46620c, 0xd670407e), TOBN(0x0ea8d4f1, 0xe3733a01), TOBN(0xd396d532, 0xb0c324e0), TOBN(0x5b211a0e, 0x03c317cd)}}, {{TOBN(0x090d7d20, 0x5ffe7b37), TOBN(0x3b7f3efb, 0x1747d2da), TOBN(0xa2cb525f, 0xb54fc519), TOBN(0x6e220932, 0xf66a971e)}, {TOBN(0xddc160df, 0xb486d440), TOBN(0x7fcfec46, 0x3fe13465), TOBN(0x83da7e4e, 0x76e4c151), TOBN(0xd6fa48a1, 0xd8d302b5)}}, {{TOBN(0xc6304f26, 0x5872cd88), TOBN(0x806c1d3c, 0x278b90a1), TOBN(0x3553e725, 0xcaf0bc1c), TOBN(0xff59e603, 0xbb9d8d5c)}, {TOBN(0xa4550f32, 0x7a0b85dd), TOBN(0xdec5720a, 0x93ecc217), TOBN(0x0b88b741, 0x69d62213), TOBN(0x7212f245, 0x5b365955)}}, {{TOBN(0x20764111, 0xb5cae787), TOBN(0x13cb7f58, 0x1dfd3124), TOBN(0x2dca77da, 0x1175aefb), TOBN(0xeb75466b, 0xffaae775)}, {TOBN(0x74d76f3b, 0xdb6cff32), TOBN(0x7440f37a, 0x61fcda9a), TOBN(0x1bb3ac92, 0xb525028b), TOBN(0x20fbf8f7, 0xa1975f29)}}, {{TOBN(0x982692e1, 0xdf83097f), TOBN(0x28738f6c, 0x554b0800), TOBN(0xdc703717, 0xa2ce2f2f), TOBN(0x7913b93c, 0x40814194)}, {TOBN(0x04924593, 0x1fe89636), TOBN(0x7b98443f, 0xf78834a6), TOBN(0x11c6ab01, 0x5114a5a1), TOBN(0x60deb383, 0xffba5f4c)}}, {{TOBN(0x4caa54c6, 0x01a982e6), TOBN(0x1dd35e11, 0x3491cd26), TOBN(0x973c315f, 0x7cbd6b05), TOBN(0xcab00775, 0x52494724)}, {TOBN(0x04659b1f, 0x6565e15a), TOBN(0xbf30f529, 0x8c8fb026), TOBN(0xfc21641b, 0xa8a0de37), TOBN(0xe9c7a366, 0xfa5e5114)}}, {{TOBN(0xdb849ca5, 0x52f03ad8), TOBN(0xc7e8dbe9, 0x024e35c0), TOBN(0xa1a2bbac, 0xcfc3c789), TOBN(0xbf733e7d, 0x9c26f262)}, {TOBN(0x882ffbf5, 0xb8444823), TOBN(0xb7224e88, 0x6bf8483b), TOBN(0x53023b8b, 0x65bef640), TOBN(0xaabfec91, 0xd4d5f8cd)}}, {{TOBN(0xa40e1510, 0x079ea1bd), TOBN(0x1ad9addc, 0xd05d5d26), TOBN(0xdb3f2eab, 0x13e68d4f), TOBN(0x1cff1ae2, 0x640f803f)}, {TOBN(0xe0e7b749, 0xd4cee117), TOBN(0x8e9f275b, 0x4036d909), TOBN(0xce34e31d, 0x8f4d4c38), TOBN(0x22b37f69, 0xd75130fc)}}, {{TOBN(0x83e0f1fd, 0xb4014604), TOBN(0xa8ce9919, 0x89415078), TOBN(0x82375b75, 0x41792efe), TOBN(0x4f59bf5c, 0x97d4515b)}, {TOBN(0xac4f324f, 0x923a277d), TOBN(0xd9bc9b7d, 0x650f3406), TOBN(0xc6fa87d1, 0x8a39bc51), TOBN(0x82588530, 0x5ccc108f)}}, {{TOBN(0x5ced3c9f, 0x82e4c634), TOBN(0x8efb8314, 0x3a4464f8), TOBN(0xe706381b, 0x7a1dca25), TOBN(0x6cd15a3c, 0x5a2a412b)}, {TOBN(0x9347a8fd, 0xbfcd8fb5), TOBN(0x31db2eef, 0x6e54cd22), TOBN(0xc4aeb11e, 0xf8d8932f), TOBN(0x11e7c1ed, 0x344411af)}}, {{TOBN(0x2653050c, 0xdc9a151e), TOBN(0x9edbfc08, 0x3bb0a859), TOBN(0x926c81c7, 0xfd5691e7), TOBN(0x9c1b2342, 0x6f39019a)}, {TOBN(0x64a81c8b, 0x7f8474b9), TOBN(0x90657c07, 0x01761819), TOBN(0x390b3331, 0x55e0375a), TOBN(0xc676c626, 0xb6ebc47d)}}, {{TOBN(0x51623247, 0xb7d6dee8), TOBN(0x0948d927, 0x79659313), TOBN(0x99700161, 0xe9ab35ed), TOBN(0x06cc32b4, 0x8ddde408)}, {TOBN(0x6f2fd664, 0x061ef338), TOBN(0x1606fa02, 0xc202e9ed), TOBN(0x55388bc1, 0x929ba99b), TOBN(0xc4428c5e, 0x1e81df69)}}, 
{{TOBN(0xce2028ae, 0xf91b0b2a), TOBN(0xce870a23, 0xf03dfd3f), TOBN(0x66ec2c87, 0x0affe8ed), TOBN(0xb205fb46, 0x284d0c00)}, {TOBN(0xbf5dffe7, 0x44cefa48), TOBN(0xb6fc37a8, 0xa19876d7), TOBN(0xbecfa84c, 0x08b72863), TOBN(0xd7205ff5, 0x2576374f)}}, {{TOBN(0x80330d32, 0x8887de41), TOBN(0x5de0df0c, 0x869ea534), TOBN(0x13f42753, 0x3c56ea17), TOBN(0xeb1f6069, 0x452b1a78)}, {TOBN(0x50474396, 0xe30ea15c), TOBN(0x575816a1, 0xc1494125), TOBN(0xbe1ce55b, 0xfe6bb38f), TOBN(0xb901a948, 0x96ae30f7)}}, {{TOBN(0xe5af0f08, 0xd8fc3548), TOBN(0x5010b5d0, 0xd73bfd08), TOBN(0x993d2880, 0x53fe655a), TOBN(0x99f2630b, 0x1c1309fd)}, {TOBN(0xd8677baf, 0xb4e3b76f), TOBN(0x14e51ddc, 0xb840784b), TOBN(0x326c750c, 0xbf0092ce), TOBN(0xc83d306b, 0xf528320f)}}, {{TOBN(0xc4456715, 0x77d4715c), TOBN(0xd30019f9, 0x6b703235), TOBN(0x207ccb2e, 0xd669e986), TOBN(0x57c824af, 0xf6dbfc28)}, {TOBN(0xf0eb532f, 0xd8f92a23), TOBN(0x4a557fd4, 0x9bb98fd2), TOBN(0xa57acea7, 0xc1e6199a), TOBN(0x0c663820, 0x8b94b1ed)}}, {{TOBN(0x9b42be8f, 0xf83a9266), TOBN(0xc7741c97, 0x0101bd45), TOBN(0x95770c11, 0x07bd9ceb), TOBN(0x1f50250a, 0x8b2e0744)}, {TOBN(0xf762eec8, 0x1477b654), TOBN(0xc65b900e, 0x15efe59a), TOBN(0x88c96148, 0x9546a897), TOBN(0x7e8025b3, 0xc30b4d7c)}}, {{TOBN(0xae4065ef, 0x12045cf9), TOBN(0x6fcb2caf, 0x9ccce8bd), TOBN(0x1fa0ba4e, 0xf2cf6525), TOBN(0xf683125d, 0xcb72c312)}, {TOBN(0xa01da4ea, 0xe312410e), TOBN(0x67e28677, 0x6cd8e830), TOBN(0xabd95752, 0x98fb3f07), TOBN(0x05f11e11, 0xeef649a5)}}, {{TOBN(0xba47faef, 0x9d3472c2), TOBN(0x3adff697, 0xc77d1345), TOBN(0x4761fa04, 0xdd15afee), TOBN(0x64f1f61a, 0xb9e69462)}, {TOBN(0xfa691fab, 0x9bfb9093), TOBN(0x3df8ae8f, 0xa1133dfe), TOBN(0xcd5f8967, 0x58cc710d), TOBN(0xfbb88d50, 0x16c7fe79)}}, {{TOBN(0x8e011b4c, 0xe88c50d1), TOBN(0x7532e807, 0xa8771c4f), TOBN(0x64c78a48, 0xe2278ee4), TOBN(0x0b283e83, 0x3845072a)}, {TOBN(0x98a6f291, 0x49e69274), TOBN(0xb96e9668, 0x1868b21c), TOBN(0x38f0adc2, 0xb1a8908e), TOBN(0x90afcff7, 0x1feb829d)}}, {{TOBN(0x9915a383, 0x210b0856), TOBN(0xa5a80602, 0xdef04889), TOBN(0x800e9af9, 0x7c64d509), TOBN(0x81382d0b, 0xb8996f6f)}, {TOBN(0x490eba53, 0x81927e27), TOBN(0x46c63b32, 0x4af50182), TOBN(0x784c5fd9, 0xd3ad62ce), TOBN(0xe4fa1870, 0xf8ae8736)}}, {{TOBN(0x4ec9d0bc, 0xd7466b25), TOBN(0x84ddbe1a, 0xdb235c65), TOBN(0x5e2645ee, 0x163c1688), TOBN(0x570bd00e, 0x00eba747)}, {TOBN(0xfa51b629, 0x128bfa0f), TOBN(0x92fce1bd, 0x6c1d3b68), TOBN(0x3e7361dc, 0xb66778b1), TOBN(0x9c7d249d, 0x5561d2bb)}}, {{TOBN(0xa40b28bf, 0x0bbc6229), TOBN(0x1c83c05e, 0xdfd91497), TOBN(0x5f9f5154, 0xf083df05), TOBN(0xbac38b3c, 0xeee66c9d)}, {TOBN(0xf71db7e3, 0xec0dfcfd), TOBN(0xf2ecda8e, 0x8b0a8416), TOBN(0x52fddd86, 0x7812aa66), TOBN(0x2896ef10, 0x4e6f4272)}}, {{TOBN(0xff27186a, 0x0fe9a745), TOBN(0x08249fcd, 0x49ca70db), TOBN(0x7425a2e6, 0x441cac49), TOBN(0xf4a0885a, 0xece5ff57)}, {TOBN(0x6e2cb731, 0x7d7ead58), TOBN(0xf96cf7d6, 0x1898d104), TOBN(0xafe67c9d, 0x4f2c9a89), TOBN(0x89895a50, 0x1c7bf5bc)}}, {{TOBN(0xdc7cb8e5, 0x573cecfa), TOBN(0x66497eae, 0xd15f03e6), TOBN(0x6bc0de69, 0x3f084420), TOBN(0x323b9b36, 0xacd532b0)}, {TOBN(0xcfed390a, 0x0115a3c1), TOBN(0x9414c40b, 0x2d65ca0e), TOBN(0x641406bd, 0x2f530c78), TOBN(0x29369a44, 0x833438f2)}}, {{TOBN(0x996884f5, 0x903fa271), TOBN(0xe6da0fd2, 0xb9da921e), TOBN(0xa6f2f269, 0x5db01e54), TOBN(0x1ee3e9bd, 0x6876214e)}, {TOBN(0xa26e181c, 0xe27a9497), TOBN(0x36d254e4, 0x8e215e04), TOBN(0x42f32a6c, 0x252cabca), TOBN(0x99481487, 0x80b57614)}}, {{TOBN(0x4c4dfe69, 0x40d9cae1), TOBN(0x05869580, 0x11a10f09), TOBN(0xca287b57, 0x3491b64b), TOBN(0x77862d5d, 
0x3fd4a53b)}, {TOBN(0xbf94856e, 0x50349126), TOBN(0x2be30bd1, 0x71c5268f), TOBN(0x10393f19, 0xcbb650a6), TOBN(0x639531fe, 0x778cf9fd)}}, {{TOBN(0x02556a11, 0xb2935359), TOBN(0xda38aa96, 0xaf8c126e), TOBN(0x47dbe6c2, 0x0960167f), TOBN(0x37bbabb6, 0x501901cd)}, {TOBN(0xb6e979e0, 0x2c947778), TOBN(0xd69a5175, 0x7a1a1dc6), TOBN(0xc3ed5095, 0x9d9faf0c), TOBN(0x4dd9c096, 0x1d5fa5f0)}}, {{TOBN(0xa0c4304d, 0x64f16ea8), TOBN(0x8b1cac16, 0x7e718623), TOBN(0x0b576546, 0x7c67f03e), TOBN(0x559cf5ad, 0xcbd88c01)}, {TOBN(0x074877bb, 0x0e2af19a), TOBN(0x1f717ec1, 0xa1228c92), TOBN(0x70bcb800, 0x326e8920), TOBN(0xec6e2c5c, 0x4f312804)}}, {{TOBN(0x426aea7d, 0x3fca4752), TOBN(0xf12c0949, 0x2211f62a), TOBN(0x24beecd8, 0x7be7b6b5), TOBN(0xb77eaf4c, 0x36d7a27d)}, {TOBN(0x154c2781, 0xfda78fd3), TOBN(0x848a83b0, 0x264eeabe), TOBN(0x81287ef0, 0x4ffe2bc4), TOBN(0x7b6d88c6, 0xb6b6fc2a)}}, {{TOBN(0x805fb947, 0xce417d99), TOBN(0x4b93dcc3, 0x8b916cc4), TOBN(0x72e65bb3, 0x21273323), TOBN(0xbcc1badd, 0x6ea9886e)}, {TOBN(0x0e223011, 0x4bc5ee85), TOBN(0xa561be74, 0xc18ee1e4), TOBN(0x762fd2d4, 0xa6bcf1f1), TOBN(0x50e6a5a4, 0x95231489)}}, {{TOBN(0xca96001f, 0xa00b500b), TOBN(0x5c098cfc, 0x5d7dcdf5), TOBN(0xa64e2d2e, 0x8c446a85), TOBN(0xbae9bcf1, 0x971f3c62)}, {TOBN(0x4ec22683, 0x8435a2c5), TOBN(0x8ceaed6c, 0x4bad4643), TOBN(0xe9f8fb47, 0xccccf4e3), TOBN(0xbd4f3fa4, 0x1ce3b21e)}}, {{TOBN(0xd79fb110, 0xa3db3292), TOBN(0xe28a37da, 0xb536c66a), TOBN(0x279ce87b, 0x8e49e6a9), TOBN(0x70ccfe8d, 0xfdcec8e3)}, {TOBN(0x2193e4e0, 0x3ba464b2), TOBN(0x0f39d60e, 0xaca9a398), TOBN(0x7d7932af, 0xf82c12ab), TOBN(0xd8ff50ed, 0x91e7e0f7)}}, {{TOBN(0xea961058, 0xfa28a7e0), TOBN(0xc726cf25, 0x0bf5ec74), TOBN(0xe74d55c8, 0xdb229666), TOBN(0x0bd9abbf, 0xa57f5799)}, {TOBN(0x7479ef07, 0x4dfc47b3), TOBN(0xd9c65fc3, 0x0c52f91d), TOBN(0x8e0283fe, 0x36a8bde2), TOBN(0xa32a8b5e, 0x7d4b7280)}}, {{TOBN(0x6a677c61, 0x12e83233), TOBN(0x0fbb3512, 0xdcc9bf28), TOBN(0x562e8ea5, 0x0d780f61), TOBN(0x0db8b22b, 0x1dc4e89c)}, {TOBN(0x0a6fd1fb, 0x89be0144), TOBN(0x8c77d246, 0xca57113b), TOBN(0x4639075d, 0xff09c91c), TOBN(0x5b47b17f, 0x5060824c)}}, {{TOBN(0x58aea2b0, 0x16287b52), TOBN(0xa1343520, 0xd0cd8eb0), TOBN(0x6148b4d0, 0xc5d58573), TOBN(0xdd2b6170, 0x291c68ae)}, {TOBN(0xa61b3929, 0x1da3b3b7), TOBN(0x5f946d79, 0x08c4ac10), TOBN(0x4105d4a5, 0x7217d583), TOBN(0x5061da3d, 0x25e6de5e)}}, {{TOBN(0x3113940d, 0xec1b4991), TOBN(0xf12195e1, 0x36f485ae), TOBN(0xa7507fb2, 0x731a2ee0), TOBN(0x95057a8e, 0x6e9e196e)}, {TOBN(0xa3c2c911, 0x2e130136), TOBN(0x97dfbb36, 0x33c60d15), TOBN(0xcaf3c581, 0xb300ee2b), TOBN(0x77f25d90, 0xf4bac8b8)}}, {{TOBN(0xdb1c4f98, 0x6d840cd6), TOBN(0x471d62c0, 0xe634288c), TOBN(0x8ec2f85e, 0xcec8a161), TOBN(0x41f37cbc, 0xfa6f4ae2)}, {TOBN(0x6793a20f, 0x4b709985), TOBN(0x7a7bd33b, 0xefa8985b), TOBN(0x2c6a3fbd, 0x938e6446), TOBN(0x19042619, 0x2a8d47c1)}}, {{TOBN(0x16848667, 0xcc36975f), TOBN(0x02acf168, 0x9d5f1dfb), TOBN(0x62d41ad4, 0x613baa94), TOBN(0xb56fbb92, 0x9f684670)}, {TOBN(0xce610d0d, 0xe9e40569), TOBN(0x7b99c65f, 0x35489fef), TOBN(0x0c88ad1b, 0x3df18b97), TOBN(0x81b7d9be, 0x5d0e9edb)}}, {{TOBN(0xd85218c0, 0xc716cc0a), TOBN(0xf4b5ff90, 0x85691c49), TOBN(0xa4fd666b, 0xce356ac6), TOBN(0x17c72895, 0x4b327a7a)}, {TOBN(0xf93d5085, 0xda6be7de), TOBN(0xff71530e, 0x3301d34e), TOBN(0x4cd96442, 0xd8f448e8), TOBN(0x9283d331, 0x2ed18ffa)}}, {{TOBN(0x4d33dd99, 0x2a849870), TOBN(0xa716964b, 0x41576335), TOBN(0xff5e3a9b, 0x179be0e5), TOBN(0x5b9d6b1b, 0x83b13632)}, {TOBN(0x3b8bd7d4, 0xa52f313b), TOBN(0xc9dd95a0, 0x637a4660), TOBN(0x30035962, 0x0b3e218f), 
TOBN(0xce1481a3, 0xc7b28a3c)}}, {{TOBN(0xab41b43a, 0x43228d83), TOBN(0x24ae1c30, 0x4ad63f99), TOBN(0x8e525f1a, 0x46a51229), TOBN(0x14af860f, 0xcd26d2b4)}, {TOBN(0xd6baef61, 0x3f714aa1), TOBN(0xf51865ad, 0xeb78795e), TOBN(0xd3e21fce, 0xe6a9d694), TOBN(0x82ceb1dd, 0x8a37b527)}}}}; ring-0.17.14/crypto/fipsmodule/ec/p256-nistz.c000064400000000000000000000336101046102023000170230ustar 00000000000000// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. // Copyright (c) 2014, Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // // Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1) // (1) Intel Corporation, Israel Development Center, Haifa, Israel // (2) University of Haifa, Israel // // Reference: // S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with // 256 Bit Primes" #include #include "../../limbs/limbs.inl" #include #include "p256-nistz.h" #if defined(OPENSSL_USE_NISTZ256) typedef P256_POINT_AFFINE PRECOMP256_ROW[64]; // One converted into the Montgomery domain static const BN_ULONG ONE_MONT[P256_LIMBS] = { TOBN(0x00000000, 0x00000001), TOBN(0xffffffff, 0x00000000), TOBN(0xffffffff, 0xffffffff), TOBN(0x00000000, 0xfffffffe), }; // Precomputed tables for the default generator #include "p256-nistz-table.h" // Recode window to a signed digit, see |ec_GFp_nistp_recode_scalar_bits| in // util.c for details static crypto_word_t booth_recode_w5(crypto_word_t in) { crypto_word_t s, d; s = ~((in >> 5) - 1); d = (1 << 6) - in - 1; d = (d & s) | (in & ~s); d = (d >> 1) + (d & 1); return (d << 1) + (s & 1); } static crypto_word_t booth_recode_w7(crypto_word_t in) { crypto_word_t s, d; s = ~((in >> 7) - 1); d = (1 << 8) - in - 1; d = (d & s) | (in & ~s); d = (d >> 1) + (d & 1); return (d << 1) + (s & 1); } // The `(P256_LIMBS == 8)` case is unreachable for 64-bit targets. #if defined(OPENSSL_64_BIT) && defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunreachable-code" #endif // copy_conditional copies |src| to |dst| if |move| is one and leaves it as-is // if |move| is zero. // // WARNING: this breaks the usual convention of constant-time functions // returning masks. static void copy_conditional(BN_ULONG dst[P256_LIMBS], const BN_ULONG src[P256_LIMBS], BN_ULONG move) { BN_ULONG mask1 = ((BN_ULONG)0) - move; BN_ULONG mask2 = ~mask1; dst[0] = (src[0] & mask1) ^ (dst[0] & mask2); dst[1] = (src[1] & mask1) ^ (dst[1] & mask2); dst[2] = (src[2] & mask1) ^ (dst[2] & mask2); dst[3] = (src[3] & mask1) ^ (dst[3] & mask2); if (P256_LIMBS == 8) { dst[4] = (src[4] & mask1) ^ (dst[4] & mask2); dst[5] = (src[5] & mask1) ^ (dst[5] & mask2); dst[6] = (src[6] & mask1) ^ (dst[6] & mask2); dst[7] = (src[7] & mask1) ^ (dst[7] & mask2); } } #if defined(__clang__) #pragma GCC diagnostic pop #endif // is_not_zero returns one iff in != 0 and zero otherwise. // // WARNING: this breaks the usual convention of constant-time functions // returning masks. 
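// The expression |in | (0 - in)| has its most significant bit set exactly
// when |in| is nonzero, so shifting it right by |BN_BITS2 - 1| yields 0 or 1.
// The SMT-LIB queries below encode this claim for the 64-bit case.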
// // (define-fun is_not_zero ((in (_ BitVec 64))) (_ BitVec 64) // (bvlshr (bvor in (bvsub #x0000000000000000 in)) #x000000000000003f) // ) // // (declare-fun x () (_ BitVec 64)) // // (assert (and (= x #x0000000000000000) (= (is_not_zero x) // #x0000000000000001))) (check-sat) // // (assert (and (not (= x #x0000000000000000)) (= (is_not_zero x) // #x0000000000000000))) (check-sat) // static BN_ULONG is_not_zero(BN_ULONG in) { in |= (0 - in); in >>= BN_BITS2 - 1; return in; } #if defined(OPENSSL_X86_64) // Dispatch between CPU variations. The "_adx" suffixed functions use MULX in // addition to ADCX/ADOX. MULX is part of BMI2, not ADX, so we must check both // capabilities. void ecp_nistz256_mul_mont(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS], const BN_ULONG b[P256_LIMBS]) { if (adx_bmi2_available) { ecp_nistz256_mul_mont_adx(res, a, b); } else { ecp_nistz256_mul_mont_nohw(res, a, b); } } void ecp_nistz256_sqr_mont(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS]) { if (adx_bmi2_available) { ecp_nistz256_sqr_mont_adx(res, a); } else { ecp_nistz256_sqr_mont_nohw(res, a); } } void ecp_nistz256_ord_mul_mont(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS], const BN_ULONG b[P256_LIMBS]) { if (adx_bmi2_available) { ecp_nistz256_ord_mul_mont_adx(res, a, b); } else { ecp_nistz256_ord_mul_mont_nohw(res, a, b); } } void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS], BN_ULONG rep) { if (adx_bmi2_available) { ecp_nistz256_ord_sqr_mont_adx(res, a, rep); } else { ecp_nistz256_ord_sqr_mont_nohw(res, a, rep); } } static void ecp_nistz256_select_w5(P256_POINT *val, const P256_POINT in_t[16], int index) { if (avx2_available) { ecp_nistz256_select_w5_avx2(val, in_t, index); } else { ecp_nistz256_select_w5_nohw(val, in_t, index); } } static void ecp_nistz256_select_w7(P256_POINT_AFFINE *val, const P256_POINT_AFFINE in_t[64], int index) { if (avx2_available) { ecp_nistz256_select_w7_avx2(val, in_t, index); } else { ecp_nistz256_select_w7_nohw(val, in_t, index); } } void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a) { if (adx_bmi2_available) { ecp_nistz256_point_double_adx(r, a); } else { ecp_nistz256_point_double_nohw(r, a); } } void ecp_nistz256_point_add(P256_POINT *r, const P256_POINT *a, const P256_POINT *b) { if (adx_bmi2_available) { ecp_nistz256_point_add_adx(r, a, b); } else { ecp_nistz256_point_add_nohw(r, a, b); } } void ecp_nistz256_point_add_affine(P256_POINT *r, const P256_POINT *a, const P256_POINT_AFFINE *b) { if (adx_bmi2_available) { ecp_nistz256_point_add_affine_adx(r, a, b); } else { ecp_nistz256_point_add_affine_nohw(r, a, b); } } #endif // OPENSSL_X86_64 // r = p * p_scalar static void ecp_nistz256_windowed_mul(P256_POINT *r, const BN_ULONG p_scalar[P256_LIMBS], const BN_ULONG p_x[P256_LIMBS], const BN_ULONG p_y[P256_LIMBS]) { debug_assert_nonsecret(r != NULL); debug_assert_nonsecret(p_scalar != NULL); debug_assert_nonsecret(p_x != NULL); debug_assert_nonsecret(p_y != NULL); static const size_t kWindowSize = 5; static const crypto_word_t kMask = (1 << (5 /* kWindowSize */ + 1)) - 1; // A |P256_POINT| is (3 * 32) = 96 bytes, and the 64-byte alignment should // add no more than 63 bytes of overhead. Thus, |table| should require // ~1599 ((96 * 16) + 63) bytes of stack space. alignas(64) P256_POINT table[16]; P256_SCALAR_BYTES p_str; p256_scalar_bytes_from_limbs(p_str, p_scalar); // table[0] is implicitly (0,0,0) (the point at infinity), therefore it is // not stored. 
All other values are actually stored with an offset of -1 in // table. P256_POINT *row = table; limbs_copy(row[1 - 1].X, p_x, P256_LIMBS); limbs_copy(row[1 - 1].Y, p_y, P256_LIMBS); limbs_copy(row[1 - 1].Z, ONE_MONT, P256_LIMBS); ecp_nistz256_point_double(&row[2 - 1], &row[1 - 1]); ecp_nistz256_point_add(&row[3 - 1], &row[2 - 1], &row[1 - 1]); ecp_nistz256_point_double(&row[4 - 1], &row[2 - 1]); ecp_nistz256_point_double(&row[6 - 1], &row[3 - 1]); ecp_nistz256_point_double(&row[8 - 1], &row[4 - 1]); ecp_nistz256_point_double(&row[12 - 1], &row[6 - 1]); ecp_nistz256_point_add(&row[5 - 1], &row[4 - 1], &row[1 - 1]); ecp_nistz256_point_add(&row[7 - 1], &row[6 - 1], &row[1 - 1]); ecp_nistz256_point_add(&row[9 - 1], &row[8 - 1], &row[1 - 1]); ecp_nistz256_point_add(&row[13 - 1], &row[12 - 1], &row[1 - 1]); ecp_nistz256_point_double(&row[14 - 1], &row[7 - 1]); ecp_nistz256_point_double(&row[10 - 1], &row[5 - 1]); ecp_nistz256_point_add(&row[15 - 1], &row[14 - 1], &row[1 - 1]); ecp_nistz256_point_add(&row[11 - 1], &row[10 - 1], &row[1 - 1]); ecp_nistz256_point_double(&row[16 - 1], &row[8 - 1]); BN_ULONG tmp[P256_LIMBS]; alignas(32) P256_POINT h; size_t index = 255; crypto_word_t wvalue = p_str[(index - 1) / 8]; wvalue = (wvalue >> ((index - 1) % 8)) & kMask; ecp_nistz256_select_w5(r, table, (int)(booth_recode_w5(wvalue) >> 1)); while (index >= 5) { if (index != 255) { size_t off = (index - 1) / 8; wvalue = (crypto_word_t)p_str[off] | (crypto_word_t)p_str[off + 1] << 8; wvalue = (wvalue >> ((index - 1) % 8)) & kMask; wvalue = booth_recode_w5(wvalue); ecp_nistz256_select_w5(&h, table, (int)(wvalue >> 1)); ecp_nistz256_neg(tmp, h.Y); copy_conditional(h.Y, tmp, (wvalue & 1)); ecp_nistz256_point_add(r, r, &h); } index -= kWindowSize; ecp_nistz256_point_double(r, r); ecp_nistz256_point_double(r, r); ecp_nistz256_point_double(r, r); ecp_nistz256_point_double(r, r); ecp_nistz256_point_double(r, r); } // Final window wvalue = p_str[0]; wvalue = (wvalue << 1) & kMask; wvalue = booth_recode_w5(wvalue); ecp_nistz256_select_w5(&h, table, (int)(wvalue >> 1)); ecp_nistz256_neg(tmp, h.Y); copy_conditional(h.Y, tmp, wvalue & 1); ecp_nistz256_point_add(r, r, &h); } static crypto_word_t calc_first_wvalue(size_t *index, const uint8_t p_str[33]) { static const size_t kWindowSize = 7; static const crypto_word_t kMask = (1 << (7 /* kWindowSize */ + 1)) - 1; *index = kWindowSize; crypto_word_t wvalue = ((crypto_word_t)p_str[0] << 1) & kMask; return booth_recode_w7(wvalue); } static crypto_word_t calc_wvalue(size_t *index, const uint8_t p_str[33]) { static const size_t kWindowSize = 7; static const crypto_word_t kMask = (1 << (7 /* kWindowSize */ + 1)) - 1; const size_t off = (*index - 1) / 8; crypto_word_t wvalue = (crypto_word_t)p_str[off] | (crypto_word_t)p_str[off + 1] << 8; wvalue = (wvalue >> ((*index - 1) % 8)) & kMask; *index += kWindowSize; return booth_recode_w7(wvalue); } void p256_point_mul(Limb r[3][P256_LIMBS], const Limb p_scalar[P256_LIMBS], const Limb p_x[P256_LIMBS], const Limb p_y[P256_LIMBS]) { alignas(32) P256_POINT out; ecp_nistz256_windowed_mul(&out, p_scalar, p_x, p_y); limbs_copy(r[0], out.X, P256_LIMBS); limbs_copy(r[1], out.Y, P256_LIMBS); limbs_copy(r[2], out.Z, P256_LIMBS); } void p256_point_mul_base(Limb r[3][P256_LIMBS], const Limb scalar[P256_LIMBS]) { P256_SCALAR_BYTES p_str; p256_scalar_bytes_from_limbs(p_str, scalar); // First window size_t index = 0; crypto_word_t wvalue = calc_first_wvalue(&index, p_str); alignas(32) P256_POINT_AFFINE t; alignas(32) P256_POINT p; 
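  // The 33-byte |p_str| is consumed as 37 overlapping signed 7-bit windows
  // (see |calc_first_wvalue| and |calc_wvalue| above). Each recoded window
  // indexes one row of |ecp_nistz256_precomputed|: |wvalue >> 1| is the digit
  // used for the constant-time table lookup and bit 0 is the sign, which is
  // applied by conditionally negating the selected point's Y coordinate.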
ecp_nistz256_select_w7(&t, ecp_nistz256_precomputed[0], (int)(wvalue >> 1)); ecp_nistz256_neg(p.Z, t.Y); copy_conditional(t.Y, p.Z, wvalue & 1); // Convert |t| from affine to Jacobian coordinates. We set Z to zero if |t| // is infinity and |ONE| otherwise. |t| was computed from the table, so it // is infinity iff |wvalue >> 1| is zero. limbs_copy(p.X, t.X, P256_LIMBS); limbs_copy(p.Y, t.Y, P256_LIMBS); limbs_zero(p.Z, P256_LIMBS); copy_conditional(p.Z, ONE_MONT, is_not_zero(wvalue >> 1)); for (int i = 1; i < 37; i++) { wvalue = calc_wvalue(&index, p_str); ecp_nistz256_select_w7(&t, ecp_nistz256_precomputed[i], (int)(wvalue >> 1)); alignas(32) BN_ULONG neg_Y[P256_LIMBS]; ecp_nistz256_neg(neg_Y, t.Y); copy_conditional(t.Y, neg_Y, wvalue & 1); // Note |ecp_nistz256_point_add_affine| does not work if |p| and |t| are the // same non-infinity point. ecp_nistz256_point_add_affine(&p, &p, &t); } limbs_copy(r[0], p.X, P256_LIMBS); limbs_copy(r[1], p.Y, P256_LIMBS); limbs_copy(r[2], p.Z, P256_LIMBS); } void p256_point_mul_base_vartime(Limb r[3][P256_LIMBS], const Limb g_scalar[P256_LIMBS]) { alignas(32) P256_POINT p; uint8_t p_str[33]; OPENSSL_memcpy(p_str, g_scalar, 32); p_str[32] = 0; // First window size_t index = 0; size_t wvalue = calc_first_wvalue(&index, p_str); // Convert |p| from affine to Jacobian coordinates. We set Z to zero if |p| // is infinity and |ONE_MONT| otherwise. |p| was computed from the table, so // it is infinity iff |wvalue >> 1| is zero. if ((wvalue >> 1) != 0) { OPENSSL_memcpy(p.X, &ecp_nistz256_precomputed[0][(wvalue >> 1) - 1].X, sizeof(p.X)); OPENSSL_memcpy(p.Y, &ecp_nistz256_precomputed[0][(wvalue >> 1) - 1].Y, sizeof(p.Y)); OPENSSL_memcpy(p.Z, ONE_MONT, sizeof(p.Z)); } else { OPENSSL_memset(p.X, 0, sizeof(p.X)); OPENSSL_memset(p.Y, 0, sizeof(p.Y)); OPENSSL_memset(p.Z, 0, sizeof(p.Z)); } if ((wvalue & 1) == 1) { ecp_nistz256_neg(p.Y, p.Y); } for (int i = 1; i < 37; i++) { wvalue = calc_wvalue(&index, p_str); if ((wvalue >> 1) == 0) { continue; } alignas(32) P256_POINT_AFFINE t; OPENSSL_memcpy(&t, &ecp_nistz256_precomputed[i][(wvalue >> 1) - 1], sizeof(t)); if ((wvalue & 1) == 1) { ecp_nistz256_neg(t.Y, t.Y); } // Note |ecp_nistz256_point_add_affine| does not work if |p| and |t| are // the same non-infinity point, so it is important that we compute the // |g_scalar| term before the |p_scalar| term. ecp_nistz256_point_add_affine(&p, &p, &t); } limbs_copy(r[0], p.X, P256_LIMBS); limbs_copy(r[1], p.Y, P256_LIMBS); limbs_copy(r[2], p.Z, P256_LIMBS); } #endif /* defined(OPENSSL_USE_NISTZ256) */ ring-0.17.14/crypto/fipsmodule/ec/p256-nistz.h000064400000000000000000000155221046102023000170320ustar 00000000000000// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. // Copyright (c) 2014, Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
// // Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1) // (1) Intel Corporation, Israel Development Center, Haifa, Israel // (2) University of Haifa, Israel // // Reference: // S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with // 256 Bit Primes" #ifndef OPENSSL_HEADER_EC_P256_X86_64_H #define OPENSSL_HEADER_EC_P256_X86_64_H #include #include "p256_shared.h" #include "../bn/internal.h" #if defined(OPENSSL_USE_NISTZ256) // ecp_nistz256_neg sets |res| to -|a| mod P. void ecp_nistz256_neg(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS]); // ecp_nistz256_mul_mont sets |res| to |a| * |b| * 2^-256 mod P. #if defined(OPENSSL_X86_64) void ecp_nistz256_mul_mont_nohw(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS], const BN_ULONG b[P256_LIMBS]); void ecp_nistz256_mul_mont_adx(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS], const BN_ULONG b[P256_LIMBS]); #else void ecp_nistz256_mul_mont(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS], const BN_ULONG b[P256_LIMBS]); #endif // ecp_nistz256_sqr_mont sets |res| to |a| * |a| * 2^-256 mod P. #if defined(OPENSSL_X86_64) void ecp_nistz256_sqr_mont_nohw(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS]); void ecp_nistz256_sqr_mont_adx(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS]); #else void ecp_nistz256_sqr_mont(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS]); #endif // P-256 scalar operations. // // The following functions compute modulo N, where N is the order of P-256. They // take fully-reduced inputs and give fully-reduced outputs. // ecp_nistz256_ord_mul_mont sets |res| to |a| * |b| where inputs and outputs // are in Montgomery form. That is, |res| is |a| * |b| * 2^-256 mod N. #if defined(OPENSSL_X86_64) void ecp_nistz256_ord_mul_mont_nohw(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS], const BN_ULONG b[P256_LIMBS]); void ecp_nistz256_ord_mul_mont_adx(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS], const BN_ULONG b[P256_LIMBS]); #else void ecp_nistz256_ord_mul_mont(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS], const BN_ULONG b[P256_LIMBS]); #endif // ecp_nistz256_ord_sqr_mont sets |res| to |a|^(2*|rep|) where inputs and // outputs are in Montgomery form. That is, |res| is // (|a| * 2^-256)^(2*|rep|) * 2^256 mod N. #if defined(OPENSSL_X86_64) void ecp_nistz256_ord_sqr_mont_nohw(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS], BN_ULONG rep); void ecp_nistz256_ord_sqr_mont_adx(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS], BN_ULONG rep); #else void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS], BN_ULONG rep); #endif // P-256 point operations. // // The following functions may be used in-place. All coordinates are in the // Montgomery domain. // A P256_POINT_AFFINE represents a P-256 point in affine coordinates. Infinity // is encoded as (0, 0). typedef struct { BN_ULONG X[P256_LIMBS]; BN_ULONG Y[P256_LIMBS]; } P256_POINT_AFFINE; // ecp_nistz256_select_w5 sets |*val| to |in_t[index-1]| if 1 <= |index| <= 16 // and all zeros (the point at infinity) if |index| is 0. This is done in // constant time. 
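// On x86-64, the |_nohw| and |_avx2| variants declared below are chosen at
// runtime by a dispatching wrapper in p256-nistz.c, depending on whether
// AVX2 is available.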
#if defined(OPENSSL_X86_64) void ecp_nistz256_select_w5_nohw(P256_POINT *val, const P256_POINT in_t[16], int index); void ecp_nistz256_select_w5_avx2(P256_POINT *val, const P256_POINT in_t[16], int index); #else void ecp_nistz256_select_w5(P256_POINT *val, const P256_POINT in_t[16], int index); #endif // ecp_nistz256_select_w7 sets |*val| to |in_t[index-1]| if 1 <= |index| <= 64 // and all zeros (the point at infinity) if |index| is 0. This is done in // constant time. #if defined(OPENSSL_X86_64) void ecp_nistz256_select_w7_nohw(P256_POINT_AFFINE *val, const P256_POINT_AFFINE in_t[64], int index); void ecp_nistz256_select_w7_avx2(P256_POINT_AFFINE *val, const P256_POINT_AFFINE in_t[64], int index); #else void ecp_nistz256_select_w7(P256_POINT_AFFINE *val, const P256_POINT_AFFINE in_t[64], int index); #endif // ecp_nistz256_point_double sets |r| to |a| doubled. #if defined(OPENSSL_X86_64) void ecp_nistz256_point_double_nohw(P256_POINT *r, const P256_POINT *a); void ecp_nistz256_point_double_adx(P256_POINT *r, const P256_POINT *a); #else void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a); #endif // ecp_nistz256_point_add adds |a| to |b| and places the result in |r|. #if defined(OPENSSL_X86_64) void ecp_nistz256_point_add_nohw(P256_POINT *r, const P256_POINT *a, const P256_POINT *b); void ecp_nistz256_point_add_adx(P256_POINT *r, const P256_POINT *a, const P256_POINT *b); #else void ecp_nistz256_point_add(P256_POINT *r, const P256_POINT *a, const P256_POINT *b); #endif // ecp_nistz256_point_add_affine adds |a| to |b| and places the result in // |r|. |a| and |b| must not represent the same point unless they are both // infinity. #if defined(OPENSSL_X86_64) void ecp_nistz256_point_add_affine_adx(P256_POINT *r, const P256_POINT *a, const P256_POINT_AFFINE *b); void ecp_nistz256_point_add_affine_nohw(P256_POINT *r, const P256_POINT *a, const P256_POINT_AFFINE *b); #else void ecp_nistz256_point_add_affine(P256_POINT *r, const P256_POINT *a, const P256_POINT_AFFINE *b); #endif #endif /* defined(OPENSSL_USE_NISTZ256) */ #endif // OPENSSL_HEADER_EC_P256_X86_64_H ring-0.17.14/crypto/fipsmodule/ec/p256.c000064400000000000000000000452321046102023000156610ustar 00000000000000// Copyright 2020 The BoringSSL Authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // An implementation of the NIST P-256 elliptic curve point multiplication. // 256-bit Montgomery form for 64 and 32-bit. Field operations are generated by // Fiat, which lives in //third_party/fiat. 
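// In Montgomery form a field element a is represented as a*2^256 mod p.
// Accordingly, |fiat_p256_one| below is 2^256 mod p (the Montgomery
// representation of 1) rather than the literal value 1, and the
// Fiat-generated |fiat_p256_mul|/|fiat_p256_square| fold the 2^-256
// reduction factor into each multiplication.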
#include #include "../../limbs/limbs.h" #include "../../limbs/limbs.inl" #include "p256_shared.h" #include "../../internal.h" #include "./util.h" #if !defined(OPENSSL_USE_NISTZ256) #if defined(_MSC_VER) && !defined(__clang__) // '=': conversion from 'int64_t' to 'int32_t', possible loss of data #pragma warning(disable: 4242) // '=': conversion from 'int32_t' to 'uint8_t', possible loss of data #pragma warning(disable: 4244) // 'initializing': conversion from 'size_t' to 'fiat_p256_limb_t' #pragma warning(disable: 4267) #endif #if defined(__GNUC__) || defined(__clang__) #pragma GCC diagnostic ignored "-Wconversion" #pragma GCC diagnostic ignored "-Wsign-conversion" #endif #if defined(__GNUC__) && !defined(__clang__) #pragma GCC diagnostic ignored "-Winline" #endif #if defined(BORINGSSL_HAS_UINT128) #if defined(__GNUC__) #pragma GCC diagnostic ignored "-Wpedantic" #endif #include "../../../third_party/fiat/p256_64.h" #elif defined(OPENSSL_64_BIT) #include "../../../third_party/fiat/p256_64_msvc.h" #else #include "../../../third_party/fiat/p256_32.h" #endif // utility functions, handwritten #if defined(OPENSSL_64_BIT) #define FIAT_P256_NLIMBS 4 typedef uint64_t fiat_p256_limb_t; typedef uint64_t fiat_p256_felem[FIAT_P256_NLIMBS]; static const fiat_p256_felem fiat_p256_one = {0x1, 0xffffffff00000000, 0xffffffffffffffff, 0xfffffffe}; #else // 64BIT; else 32BIT #define FIAT_P256_NLIMBS 8 typedef uint32_t fiat_p256_limb_t; typedef uint32_t fiat_p256_felem[FIAT_P256_NLIMBS]; static const fiat_p256_felem fiat_p256_one = { 0x1, 0x0, 0x0, 0xffffffff, 0xffffffff, 0xffffffff, 0xfffffffe, 0x0}; #endif // 64BIT static fiat_p256_limb_t fiat_p256_nz( const fiat_p256_limb_t in1[FIAT_P256_NLIMBS]) { fiat_p256_limb_t ret; fiat_p256_nonzero(&ret, in1); return ret; } static void fiat_p256_copy(fiat_p256_limb_t out[FIAT_P256_NLIMBS], const fiat_p256_limb_t in1[FIAT_P256_NLIMBS]) { for (size_t i = 0; i < FIAT_P256_NLIMBS; i++) { out[i] = in1[i]; } } static void fiat_p256_cmovznz(fiat_p256_limb_t out[FIAT_P256_NLIMBS], fiat_p256_limb_t t, const fiat_p256_limb_t z[FIAT_P256_NLIMBS], const fiat_p256_limb_t nz[FIAT_P256_NLIMBS]) { fiat_p256_selectznz(out, !!t, z, nz); } static void fiat_p256_from_words(fiat_p256_felem out, const Limb in[32 / sizeof(BN_ULONG)]) { // Typically, |BN_ULONG| and |fiat_p256_limb_t| will be the same type, but on // 64-bit platforms without |uint128_t|, they are different. However, on // little-endian systems, |uint64_t[4]| and |uint32_t[8]| have the same // layout. OPENSSL_memcpy(out, in, 32); } static void fiat_p256_to_words(Limb out[32 / sizeof(BN_ULONG)], const fiat_p256_felem in) { // See |fiat_p256_from_words|. OPENSSL_memcpy(out, in, 32); } // Group operations // ---------------- // // Building on top of the field operations we have the operations on the // elliptic curve group itself. Points on the curve are represented in Jacobian // coordinates. // // Both operations were transcribed to Coq and proven to correspond to naive // implementations using Affine coordinates, for all suitable fields. In the // Coq proofs, issues of constant-time execution and memory layout (aliasing) // conventions were not considered. 
Specification of affine coordinates: // // As a sanity check, a proof that these points form a commutative group: // // fiat_p256_point_double calculates 2*(x_in, y_in, z_in) // // The method is taken from: // http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b // // Coq transcription and correctness proof: // // // // Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed. // while x_out == y_in is not (maybe this works, but it's not tested). static void fiat_p256_point_double(fiat_p256_felem x_out, fiat_p256_felem y_out, fiat_p256_felem z_out, const fiat_p256_felem x_in, const fiat_p256_felem y_in, const fiat_p256_felem z_in) { fiat_p256_felem delta, gamma, beta, ftmp, ftmp2, tmptmp, alpha, fourbeta; // delta = z^2 fiat_p256_square(delta, z_in); // gamma = y^2 fiat_p256_square(gamma, y_in); // beta = x*gamma fiat_p256_mul(beta, x_in, gamma); // alpha = 3*(x-delta)*(x+delta) fiat_p256_sub(ftmp, x_in, delta); fiat_p256_add(ftmp2, x_in, delta); fiat_p256_add(tmptmp, ftmp2, ftmp2); fiat_p256_add(ftmp2, ftmp2, tmptmp); fiat_p256_mul(alpha, ftmp, ftmp2); // x' = alpha^2 - 8*beta fiat_p256_square(x_out, alpha); fiat_p256_add(fourbeta, beta, beta); fiat_p256_add(fourbeta, fourbeta, fourbeta); fiat_p256_add(tmptmp, fourbeta, fourbeta); fiat_p256_sub(x_out, x_out, tmptmp); // z' = (y + z)^2 - gamma - delta fiat_p256_add(delta, gamma, delta); fiat_p256_add(ftmp, y_in, z_in); fiat_p256_square(z_out, ftmp); fiat_p256_sub(z_out, z_out, delta); // y' = alpha*(4*beta - x') - 8*gamma^2 fiat_p256_sub(y_out, fourbeta, x_out); fiat_p256_add(gamma, gamma, gamma); fiat_p256_square(gamma, gamma); fiat_p256_mul(y_out, alpha, y_out); fiat_p256_add(gamma, gamma, gamma); fiat_p256_sub(y_out, y_out, gamma); } // fiat_p256_point_add calculates (x1, y1, z1) + (x2, y2, z2) // // The method is taken from: // http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl, // adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity). // // Coq transcription and correctness proof: // // // // This function includes a branch for checking whether the two input points // are equal, (while not equal to the point at infinity). This case never // happens during single point multiplication, so there is no timing leak for // ECDH or ECDSA signing. static void fiat_p256_point_add(fiat_p256_felem x3, fiat_p256_felem y3, fiat_p256_felem z3, const fiat_p256_felem x1, const fiat_p256_felem y1, const fiat_p256_felem z1, const int mixed, const fiat_p256_felem x2, const fiat_p256_felem y2, const fiat_p256_felem z2) { fiat_p256_felem x_out, y_out, z_out; fiat_p256_limb_t z1nz = fiat_p256_nz(z1); fiat_p256_limb_t z2nz = fiat_p256_nz(z2); // z1z1 = z1z1 = z1**2 fiat_p256_felem z1z1; fiat_p256_square(z1z1, z1); fiat_p256_felem u1, s1, two_z1z2; if (!mixed) { // z2z2 = z2**2 fiat_p256_felem z2z2; fiat_p256_square(z2z2, z2); // u1 = x1*z2z2 fiat_p256_mul(u1, x1, z2z2); // two_z1z2 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 fiat_p256_add(two_z1z2, z1, z2); fiat_p256_square(two_z1z2, two_z1z2); fiat_p256_sub(two_z1z2, two_z1z2, z1z1); fiat_p256_sub(two_z1z2, two_z1z2, z2z2); // s1 = y1 * z2**3 fiat_p256_mul(s1, z2, z2z2); fiat_p256_mul(s1, s1, y1); } else { // We'll assume z2 = 1 (special case z2 = 0 is handled later). 
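    // With z2 == 1 we have z2z2 == 1 and z2**3 == 1, so u1 and s1 are just
    // copies of x1 and y1, and two_z1z2 is simply 2*z1; this saves the field
    // squarings and multiplications performed by the general branch above.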
// u1 = x1*z2z2 fiat_p256_copy(u1, x1); // two_z1z2 = 2z1z2 fiat_p256_add(two_z1z2, z1, z1); // s1 = y1 * z2**3 fiat_p256_copy(s1, y1); } // u2 = x2*z1z1 fiat_p256_felem u2; fiat_p256_mul(u2, x2, z1z1); // h = u2 - u1 fiat_p256_felem h; fiat_p256_sub(h, u2, u1); fiat_p256_limb_t xneq = fiat_p256_nz(h); // z_out = two_z1z2 * h fiat_p256_mul(z_out, h, two_z1z2); // z1z1z1 = z1 * z1z1 fiat_p256_felem z1z1z1; fiat_p256_mul(z1z1z1, z1, z1z1); // s2 = y2 * z1**3 fiat_p256_felem s2; fiat_p256_mul(s2, y2, z1z1z1); // r = (s2 - s1)*2 fiat_p256_felem r; fiat_p256_sub(r, s2, s1); fiat_p256_add(r, r, r); fiat_p256_limb_t yneq = fiat_p256_nz(r); fiat_p256_limb_t is_nontrivial_double = constant_time_is_zero_w(xneq | yneq) & ~constant_time_is_zero_w(z1nz) & ~constant_time_is_zero_w(z2nz); if (constant_time_declassify_w(is_nontrivial_double)) { fiat_p256_point_double(x3, y3, z3, x1, y1, z1); return; } // I = (2h)**2 fiat_p256_felem i; fiat_p256_add(i, h, h); fiat_p256_square(i, i); // J = h * I fiat_p256_felem j; fiat_p256_mul(j, h, i); // V = U1 * I fiat_p256_felem v; fiat_p256_mul(v, u1, i); // x_out = r**2 - J - 2V fiat_p256_square(x_out, r); fiat_p256_sub(x_out, x_out, j); fiat_p256_sub(x_out, x_out, v); fiat_p256_sub(x_out, x_out, v); // y_out = r(V-x_out) - 2 * s1 * J fiat_p256_sub(y_out, v, x_out); fiat_p256_mul(y_out, y_out, r); fiat_p256_felem s1j; fiat_p256_mul(s1j, s1, j); fiat_p256_sub(y_out, y_out, s1j); fiat_p256_sub(y_out, y_out, s1j); fiat_p256_cmovznz(x_out, z1nz, x2, x_out); fiat_p256_cmovznz(x3, z2nz, x1, x_out); fiat_p256_cmovznz(y_out, z1nz, y2, y_out); fiat_p256_cmovznz(y3, z2nz, y1, y_out); fiat_p256_cmovznz(z_out, z1nz, z2, z_out); fiat_p256_cmovznz(z3, z2nz, z1, z_out); } #include "./p256_table.h" // fiat_p256_select_point_affine selects the |idx-1|th point from a // precomputation table and copies it to out. If |idx| is zero, the output is // the point at infinity. static void fiat_p256_select_point_affine( const fiat_p256_limb_t idx, size_t size, const fiat_p256_felem pre_comp[/*size*/][2], fiat_p256_felem out[3]) { OPENSSL_memset(out, 0, sizeof(fiat_p256_felem) * 3); for (size_t i = 0; i < size; i++) { fiat_p256_limb_t mismatch = i ^ (idx - 1); fiat_p256_cmovznz(out[0], mismatch, pre_comp[i][0], out[0]); fiat_p256_cmovznz(out[1], mismatch, pre_comp[i][1], out[1]); } fiat_p256_cmovznz(out[2], idx, out[2], fiat_p256_one); } // fiat_p256_select_point selects the |idx|th point from a precomputation table // and copies it to out. 
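// Unlike |fiat_p256_select_point_affine|, the table stores full Jacobian
// triples and entry 0 is the point at infinity, so |idx| needs no special
// handling. Every entry is read and folded in with a constant-time select,
// keeping the memory access pattern independent of |idx|.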
static void fiat_p256_select_point(const fiat_p256_limb_t idx, size_t size, const fiat_p256_felem pre_comp[/*size*/][3], fiat_p256_felem out[3]) { OPENSSL_memset(out, 0, sizeof(fiat_p256_felem) * 3); for (size_t i = 0; i < size; i++) { fiat_p256_limb_t mismatch = i ^ idx; fiat_p256_cmovznz(out[0], mismatch, pre_comp[i][0], out[0]); fiat_p256_cmovznz(out[1], mismatch, pre_comp[i][1], out[1]); fiat_p256_cmovznz(out[2], mismatch, pre_comp[i][2], out[2]); } } // fiat_p256_get_bit returns the |i|th bit in |in| static crypto_word_t fiat_p256_get_bit(const Limb in[P256_LIMBS], int i) { if (i < 0 || i >= 256) { return 0; } #if defined(OPENSSL_64_BIT) OPENSSL_STATIC_ASSERT(sizeof(Limb) == 8, "BN_ULONG was not 64-bit"); return (in[i >> 6] >> (i & 63)) & 1; #else OPENSSL_STATIC_ASSERT(sizeof(Limb) == 4, "BN_ULONG was not 32-bit"); return (in[i >> 5] >> (i & 31)) & 1; #endif } void p256_point_mul(Limb r[3][P256_LIMBS], const Limb scalar[P256_LIMBS], const Limb p_x[P256_LIMBS], const Limb p_y[P256_LIMBS]) { debug_assert_nonsecret(r != NULL); debug_assert_nonsecret(scalar != NULL); debug_assert_nonsecret(p_x != NULL); debug_assert_nonsecret(p_y != NULL); fiat_p256_felem p_pre_comp[17][3]; OPENSSL_memset(&p_pre_comp, 0, sizeof(p_pre_comp)); // Precompute multiples. fiat_p256_from_words(p_pre_comp[1][0], p_x); fiat_p256_from_words(p_pre_comp[1][1], p_y); fiat_p256_copy(p_pre_comp[1][2], fiat_p256_one); for (size_t j = 2; j <= 16; ++j) { if (j & 1) { fiat_p256_point_add(p_pre_comp[j][0], p_pre_comp[j][1], p_pre_comp[j][2], p_pre_comp[1][0], p_pre_comp[1][1], p_pre_comp[1][2], 0, p_pre_comp[j - 1][0], p_pre_comp[j - 1][1], p_pre_comp[j - 1][2]); } else { fiat_p256_point_double(p_pre_comp[j][0], p_pre_comp[j][1], p_pre_comp[j][2], p_pre_comp[j / 2][0], p_pre_comp[j / 2][1], p_pre_comp[j / 2][2]); } } // Set nq to the point at infinity. fiat_p256_felem nq[3] = {{0}, {0}, {0}}, ftmp, tmp[3]; // Loop over |scalar| msb-to-lsb, incorporating |p_pre_comp| every 5th round. int skip = 1; // Save two point operations in the first round. for (size_t i = 255; i < 256; i--) { // double if (!skip) { fiat_p256_point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]); } // do other additions every 5 doublings if (i % 5 == 0) { crypto_word_t bits = fiat_p256_get_bit(scalar, i + 4) << 5; bits |= fiat_p256_get_bit(scalar, i + 3) << 4; bits |= fiat_p256_get_bit(scalar, i + 2) << 3; bits |= fiat_p256_get_bit(scalar, i + 1) << 2; bits |= fiat_p256_get_bit(scalar, i) << 1; bits |= fiat_p256_get_bit(scalar, i - 1); crypto_word_t sign, digit; recode_scalar_bits(&sign, &digit, bits); // select the point to add or subtract, in constant time. fiat_p256_select_point((fiat_p256_limb_t)digit, 17, RING_CORE_POINTLESS_ARRAY_CONST_CAST((const fiat_p256_felem(*)[3]))p_pre_comp, tmp); fiat_p256_opp(ftmp, tmp[1]); // (X, -Y, Z) is the negative point. fiat_p256_cmovznz(tmp[1], (fiat_p256_limb_t)sign, tmp[1], ftmp); if (!skip) { fiat_p256_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 0 /* mixed */, tmp[0], tmp[1], tmp[2]); } else { fiat_p256_copy(nq[0], tmp[0]); fiat_p256_copy(nq[1], tmp[1]); fiat_p256_copy(nq[2], tmp[2]); skip = 0; } } } fiat_p256_to_words(r[0], nq[0]); fiat_p256_to_words(r[1], nq[1]); fiat_p256_to_words(r[2], nq[2]); } void p256_point_mul_base(Limb r[3][P256_LIMBS], const Limb scalar[P256_LIMBS]) { // Set nq to the point at infinity. fiat_p256_felem nq[3] = {{0}, {0}, {0}}, tmp[3]; int skip = 1; // Save two point operations in the first round. 
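  // This is a comb: iteration |i| consumes bit i of each of the eight 32-bit
  // words of the scalar. Bits i+32, i+96, i+160 and i+224 index the table of
  // points pre-multiplied by 2^32, and bits i, i+64, i+128 and i+192 index
  // the base table, so the full 256-bit scalar is handled with 31 doublings
  // and two small affine tables. The loop condition |i < 32| terminates via
  // the unsigned wrap-around of |size_t| when |i| is decremented past zero;
  // the |i < 256| loop in |p256_point_mul| above uses the same idiom.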
for (size_t i = 31; i < 32; i--) { if (!skip) { fiat_p256_point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]); } // First, look 32 bits upwards. crypto_word_t bits = fiat_p256_get_bit(scalar, i + 224) << 3; bits |= fiat_p256_get_bit(scalar, i + 160) << 2; bits |= fiat_p256_get_bit(scalar, i + 96) << 1; bits |= fiat_p256_get_bit(scalar, i + 32); // Select the point to add, in constant time. fiat_p256_select_point_affine((fiat_p256_limb_t)bits, 15, fiat_p256_g_pre_comp[1], tmp); if (!skip) { fiat_p256_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1 /* mixed */, tmp[0], tmp[1], tmp[2]); } else { fiat_p256_copy(nq[0], tmp[0]); fiat_p256_copy(nq[1], tmp[1]); fiat_p256_copy(nq[2], tmp[2]); skip = 0; } // Second, look at the current position. bits = fiat_p256_get_bit(scalar, i + 192) << 3; bits |= fiat_p256_get_bit(scalar, i + 128) << 2; bits |= fiat_p256_get_bit(scalar, i + 64) << 1; bits |= fiat_p256_get_bit(scalar, i); // Select the point to add, in constant time. fiat_p256_select_point_affine((fiat_p256_limb_t)bits, 15, fiat_p256_g_pre_comp[0], tmp); fiat_p256_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1 /* mixed */, tmp[0], tmp[1], tmp[2]); } fiat_p256_to_words(r[0], nq[0]); fiat_p256_to_words(r[1], nq[1]); fiat_p256_to_words(r[2], nq[2]); } void p256_mul_mont(Limb r[P256_LIMBS], const Limb a[P256_LIMBS], const Limb b[P256_LIMBS]) { fiat_p256_felem a_, b_; fiat_p256_from_words(a_, a); fiat_p256_from_words(b_, b); fiat_p256_mul(a_, a_, b_); fiat_p256_to_words(r, a_); } void p256_sqr_mont(Limb r[P256_LIMBS], const Limb a[P256_LIMBS]) { fiat_p256_felem x; fiat_p256_from_words(x, a); fiat_p256_square(x, x); fiat_p256_to_words(r, x); } void p256_point_add(Limb r[3][P256_LIMBS], const Limb a[3][P256_LIMBS], const Limb b[3][P256_LIMBS]) { fiat_p256_felem x1, y1, z1, x2, y2, z2; fiat_p256_from_words(x1, a[0]); fiat_p256_from_words(y1, a[1]); fiat_p256_from_words(z1, a[2]); fiat_p256_from_words(x2, b[0]); fiat_p256_from_words(y2, b[1]); fiat_p256_from_words(z2, b[2]); fiat_p256_point_add(x1, y1, z1, x1, y1, z1, 0 /* both Jacobian */, x2, y2, z2); fiat_p256_to_words(r[0], x1); fiat_p256_to_words(r[1], y1); fiat_p256_to_words(r[2], z1); } void p256_point_double(Limb r[3][P256_LIMBS], const Limb a[3][P256_LIMBS]) { fiat_p256_felem x, y, z; fiat_p256_from_words(x, a[0]); fiat_p256_from_words(y, a[1]); fiat_p256_from_words(z, a[2]); fiat_p256_point_double(x, y, z, x, y, z); fiat_p256_to_words(r[0], x); fiat_p256_to_words(r[1], y); fiat_p256_to_words(r[2], z); } // For testing only. void p256_point_add_affine(Limb r[3][P256_LIMBS], const Limb a[3][P256_LIMBS], const Limb b[2][P256_LIMBS]) { fiat_p256_felem x1, y1, z1, x2, y2; fiat_p256_from_words(x1, a[0]); fiat_p256_from_words(y1, a[1]); fiat_p256_from_words(z1, a[2]); fiat_p256_from_words(x2, b[0]); fiat_p256_from_words(y2, b[1]); fiat_p256_felem z2 = {0}; fiat_p256_cmovznz(z2, fiat_p256_nz(x2) & fiat_p256_nz(y2), z2, fiat_p256_one); fiat_p256_point_add(x1, y1, z1, x1, y1, z1, 1 /* mixed */, x2, y2, z2); fiat_p256_to_words(r[0], x1); fiat_p256_to_words(r[1], y1); fiat_p256_to_words(r[2], z1); } #endif ring-0.17.14/crypto/fipsmodule/ec/p256_shared.h000064400000000000000000000040441046102023000172100ustar 00000000000000// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. // Copyright (c) 2014, Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // // Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1) // (1) Intel Corporation, Israel Development Center, Haifa, Israel // (2) University of Haifa, Israel // // Reference: // S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with // 256 Bit Primes" #ifndef OPENSSL_HEADER_EC_P256_SHARED_H #define OPENSSL_HEADER_EC_P256_SHARED_H #include "ring-core/base.h" #include "../bn/internal.h" #if !defined(OPENSSL_NO_ASM) && \ (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \ !defined(OPENSSL_SMALL) # define OPENSSL_USE_NISTZ256 #endif // P-256 field operations. // // An element mod P in P-256 is represented as a little-endian array of // |P256_LIMBS| |BN_ULONG|s, spanning the full range of values. // // The following functions take fully-reduced inputs mod P and give // fully-reduced outputs. They may be used in-place. #define P256_LIMBS (256 / BN_BITS2) // A P256_POINT represents a P-256 point in Jacobian coordinates. typedef struct { BN_ULONG X[P256_LIMBS]; BN_ULONG Y[P256_LIMBS]; BN_ULONG Z[P256_LIMBS]; } P256_POINT; typedef unsigned char P256_SCALAR_BYTES[33]; static inline void p256_scalar_bytes_from_limbs( P256_SCALAR_BYTES bytes_out, const BN_ULONG limbs[P256_LIMBS]) { OPENSSL_memcpy(bytes_out, limbs, 32); bytes_out[32] = 0; } #endif /* !defined(OPENSSL_USE_NISTZ256) */ ring-0.17.14/crypto/fipsmodule/ec/p256_table.h000064400000000000000000000345241046102023000170370ustar 00000000000000// Copyright 2020 The BoringSSL Authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // This file is generated by make_tables.go. // Base point pre computation // -------------------------- // // Two different sorts of precomputed tables are used in the following code. // Each contain various points on the curve, where each point is three field // elements (x, y, z). // // For the base point table, z is usually 1 (0 for the point at infinity). // This table has 2 * 16 elements, starting with the following: // index | bits | point // ------+---------+------------------------------ // 0 | 0 0 0 0 | 0G // 1 | 0 0 0 1 | 1G // 2 | 0 0 1 0 | 2^64G // 3 | 0 0 1 1 | (2^64 + 1)G // 4 | 0 1 0 0 | 2^128G // 5 | 0 1 0 1 | (2^128 + 1)G // 6 | 0 1 1 0 | (2^128 + 2^64)G // 7 | 0 1 1 1 | (2^128 + 2^64 + 1)G // 8 | 1 0 0 0 | 2^192G // 9 | 1 0 0 1 | (2^192 + 1)G // 10 | 1 0 1 0 | (2^192 + 2^64)G // 11 | 1 0 1 1 | (2^192 + 2^64 + 1)G // 12 | 1 1 0 0 | (2^192 + 2^128)G // 13 | 1 1 0 1 | (2^192 + 2^128 + 1)G // 14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G // 15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G // followed by a copy of this with each element multiplied by 2^32. 
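// For example, index 5 (bits 0 1 0 1) in the first half of the table holds
// (2^128 + 1)G, and index 5 in the second half holds 2^32 * (2^128 + 1)G.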
// // The reason for this is so that we can clock bits into four different // locations when doing simple scalar multiplies against the base point, // and then another four locations using the second 16 elements. // // Tables for other points have table[i] = iG for i in 0 .. 16. // fiat_p256_g_pre_comp is the table of precomputed base points #if defined(OPENSSL_64_BIT) static const fiat_p256_felem fiat_p256_g_pre_comp[2][15][2] = { {{{0x79e730d418a9143c, 0x75ba95fc5fedb601, 0x79fb732b77622510, 0x18905f76a53755c6}, {0xddf25357ce95560a, 0x8b4ab8e4ba19e45c, 0xd2e88688dd21f325, 0x8571ff1825885d85}}, {{0x4f922fc516a0d2bb, 0x0d5cc16c1a623499, 0x9241cf3a57c62c8b, 0x2f5e6961fd1b667f}, {0x5c15c70bf5a01797, 0x3d20b44d60956192, 0x04911b37071fdb52, 0xf648f9168d6f0f7b}}, {{0x9e566847e137bbbc, 0xe434469e8a6a0bec, 0xb1c4276179d73463, 0x5abe0285133d0015}, {0x92aa837cc04c7dab, 0x573d9f4c43260c07, 0x0c93156278e6cc37, 0x94bb725b6b6f7383}}, {{0x62a8c244bfe20925, 0x91c19ac38fdce867, 0x5a96a5d5dd387063, 0x61d587d421d324f6}, {0xe87673a2a37173ea, 0x2384800853778b65, 0x10f8441e05bab43e, 0xfa11fe124621efbe}}, {{0x1c891f2b2cb19ffd, 0x01ba8d5bb1923c23, 0xb6d03d678ac5ca8e, 0x586eb04c1f13bedc}, {0x0c35c6e527e8ed09, 0x1e81a33c1819ede2, 0x278fd6c056c652fa, 0x19d5ac0870864f11}}, {{0x62577734d2b533d5, 0x673b8af6a1bdddc0, 0x577e7c9aa79ec293, 0xbb6de651c3b266b1}, {0xe7e9303ab65259b3, 0xd6a0afd3d03a7480, 0xc5ac83d19b3cfc27, 0x60b4619a5d18b99b}}, {{0xbd6a38e11ae5aa1c, 0xb8b7652b49e73658, 0x0b130014ee5f87ed, 0x9d0f27b2aeebffcd}, {0xca9246317a730a55, 0x9c955b2fddbbc83a, 0x07c1dfe0ac019a71, 0x244a566d356ec48d}}, {{0x56f8410ef4f8b16a, 0x97241afec47b266a, 0x0a406b8e6d9c87c1, 0x803f3e02cd42ab1b}, {0x7f0309a804dbec69, 0xa83b85f73bbad05f, 0xc6097273ad8e197f, 0xc097440e5067adc1}}, {{0x846a56f2c379ab34, 0xa8ee068b841df8d1, 0x20314459176c68ef, 0xf1af32d5915f1f30}, {0x99c375315d75bd50, 0x837cffbaf72f67bc, 0x0613a41848d7723f, 0x23d0f130e2d41c8b}}, {{0xed93e225d5be5a2b, 0x6fe799835934f3c6, 0x4314092622626ffc, 0x50bbb4d97990216a}, {0x378191c6e57ec63e, 0x65422c40181dcdb2, 0x41a8099b0236e0f6, 0x2b10011801fe49c3}}, {{0xfc68b5c59b391593, 0xc385f5a2598270fc, 0x7144f3aad19adcbb, 0xdd55899983fbae0c}, {0x93b88b8e74b82ff4, 0xd2e03c4071e734c9, 0x9a7a9eaf43c0322a, 0xe6e4c551149d6041}}, {{0x5fe14bfe80ec21fe, 0xf6ce116ac255be82, 0x98bc5a072f4a5d67, 0xfad27148db7e63af}, {0x90c0b6ac29ab05b3, 0x37a9a83c4e251ae6, 0x0a7dc875c2aade7d, 0x77387de39f0e1a84}}, {{0x1e9ecc49a56c0dd7, 0xa5cffcd846086c74, 0x8f7a1408f505aece, 0xb37b85c0bef0c47e}, {0x3596b6e4cc0e6a8f, 0xfd6d4bbf6b388f23, 0xaba453fac39cef4e, 0x9c135ac8f9f628d5}}, {{0x0a1c729495c8f8be, 0x2961c4803bf362bf, 0x9e418403df63d4ac, 0xc109f9cb91ece900}, {0xc2d095d058945705, 0xb9083d96ddeb85c0, 0x84692b8d7a40449b, 0x9bc3344f2eee1ee1}}, {{0x0d5ae35642913074, 0x55491b2748a542b1, 0x469ca665b310732a, 0x29591d525f1a4cc1}, {0xe76f5b6bb84f983f, 0xbe7eef419f5f84e1, 0x1200d49680baa189, 0x6376551f18ef332c}}}, {{{0x202886024147519a, 0xd0981eac26b372f0, 0xa9d4a7caa785ebc8, 0xd953c50ddbdf58e9}, {0x9d6361ccfd590f8f, 0x72e9626b44e6c917, 0x7fd9611022eb64cf, 0x863ebb7e9eb288f3}}, {{0x4fe7ee31b0e63d34, 0xf4600572a9e54fab, 0xc0493334d5e7b5a4, 0x8589fb9206d54831}, {0xaa70f5cc6583553a, 0x0879094ae25649e5, 0xcc90450710044652, 0xebb0696d02541c4f}}, {{0xabbaa0c03b89da99, 0xa6f2d79eb8284022, 0x27847862b81c05e8, 0x337a4b5905e54d63}, {0x3c67500d21f7794a, 0x207005b77d6d7f61, 0x0a5a378104cfd6e8, 0x0d65e0d5f4c2fbd6}}, {{0xd433e50f6d3549cf, 0x6f33696ffacd665e, 0x695bfdacce11fcb4, 0x810ee252af7c9860}, {0x65450fe17159bb2c, 0xf7dfbebe758b357b, 
0x2b057e74d69fea72, 0xd485717a92731745}}, {{0xce1f69bbe83f7669, 0x09f8ae8272877d6b, 0x9548ae543244278d, 0x207755dee3c2c19c}, {0x87bd61d96fef1945, 0x18813cefb12d28c3, 0x9fbcd1d672df64aa, 0x48dc5ee57154b00d}}, {{0xef0f469ef49a3154, 0x3e85a5956e2b2e9a, 0x45aaec1eaa924a9c, 0xaa12dfc8a09e4719}, {0x26f272274df69f1d, 0xe0e4c82ca2ff5e73, 0xb9d8ce73b7a9dd44, 0x6c036e73e48ca901}}, {{0xe1e421e1a47153f0, 0xb86c3b79920418c9, 0x93bdce87705d7672, 0xf25ae793cab79a77}, {0x1f3194a36d869d0c, 0x9d55c8824986c264, 0x49fb5ea3096e945e, 0x39b8e65313db0a3e}}, {{0xe3417bc035d0b34a, 0x440b386b8327c0a7, 0x8fb7262dac0362d1, 0x2c41114ce0cdf943}, {0x2ba5cef1ad95a0b1, 0xc09b37a867d54362, 0x26d6cdd201e486c9, 0x20477abf42ff9297}}, {{0x0f121b41bc0a67d2, 0x62d4760a444d248a, 0x0e044f1d659b4737, 0x08fde365250bb4a8}, {0xaceec3da848bf287, 0xc2a62182d3369d6e, 0x3582dfdc92449482, 0x2f7e2fd2565d6cd7}}, {{0x0a0122b5178a876b, 0x51ff96ff085104b4, 0x050b31ab14f29f76, 0x84abb28b5f87d4e6}, {0xd5ed439f8270790a, 0x2d6cb59d85e3f46b, 0x75f55c1b6c1e2212, 0xe5436f6717655640}}, {{0xc2965ecc9aeb596d, 0x01ea03e7023c92b4, 0x4704b4b62e013961, 0x0ca8fd3f905ea367}, {0x92523a42551b2b61, 0x1eb7a89c390fcd06, 0xe7f1d2be0392a63e, 0x96dca2644ddb0c33}}, {{0x231c210e15339848, 0xe87a28e870778c8d, 0x9d1de6616956e170, 0x4ac3c9382bb09c0b}, {0x19be05516998987d, 0x8b2376c4ae09f4d6, 0x1de0b7651a3f933d, 0x380d94c7e39705f4}}, {{0x3685954b8c31c31d, 0x68533d005bf21a0c, 0x0bd7626e75c79ec9, 0xca17754742c69d54}, {0xcc6edafff6d2dbb2, 0xfd0d8cbd174a9d18, 0x875e8793aa4578e8, 0xa976a7139cab2ce6}}, {{0xce37ab11b43ea1db, 0x0a7ff1a95259d292, 0x851b02218f84f186, 0xa7222beadefaad13}, {0xa2ac78ec2b0a9144, 0x5a024051f2fa59c5, 0x91d1eca56147ce38, 0xbe94d523bc2ac690}}, {{0x2d8daefd79ec1a0f, 0x3bbcd6fdceb39c97, 0xf5575ffc58f61a95, 0xdbd986c4adf7b420}, {0x81aa881415f39eb7, 0x6ee2fcf5b98d976c, 0x5465475dcf2f717d, 0x8e24d3c46860bbd0}}}}; #else static const fiat_p256_felem fiat_p256_g_pre_comp[2][15][2] = { {{{0x18a9143c, 0x79e730d4, 0x5fedb601, 0x75ba95fc, 0x77622510, 0x79fb732b, 0xa53755c6, 0x18905f76}, {0xce95560a, 0xddf25357, 0xba19e45c, 0x8b4ab8e4, 0xdd21f325, 0xd2e88688, 0x25885d85, 0x8571ff18}}, {{0x16a0d2bb, 0x4f922fc5, 0x1a623499, 0x0d5cc16c, 0x57c62c8b, 0x9241cf3a, 0xfd1b667f, 0x2f5e6961}, {0xf5a01797, 0x5c15c70b, 0x60956192, 0x3d20b44d, 0x071fdb52, 0x04911b37, 0x8d6f0f7b, 0xf648f916}}, {{0xe137bbbc, 0x9e566847, 0x8a6a0bec, 0xe434469e, 0x79d73463, 0xb1c42761, 0x133d0015, 0x5abe0285}, {0xc04c7dab, 0x92aa837c, 0x43260c07, 0x573d9f4c, 0x78e6cc37, 0x0c931562, 0x6b6f7383, 0x94bb725b}}, {{0xbfe20925, 0x62a8c244, 0x8fdce867, 0x91c19ac3, 0xdd387063, 0x5a96a5d5, 0x21d324f6, 0x61d587d4}, {0xa37173ea, 0xe87673a2, 0x53778b65, 0x23848008, 0x05bab43e, 0x10f8441e, 0x4621efbe, 0xfa11fe12}}, {{0x2cb19ffd, 0x1c891f2b, 0xb1923c23, 0x01ba8d5b, 0x8ac5ca8e, 0xb6d03d67, 0x1f13bedc, 0x586eb04c}, {0x27e8ed09, 0x0c35c6e5, 0x1819ede2, 0x1e81a33c, 0x56c652fa, 0x278fd6c0, 0x70864f11, 0x19d5ac08}}, {{0xd2b533d5, 0x62577734, 0xa1bdddc0, 0x673b8af6, 0xa79ec293, 0x577e7c9a, 0xc3b266b1, 0xbb6de651}, {0xb65259b3, 0xe7e9303a, 0xd03a7480, 0xd6a0afd3, 0x9b3cfc27, 0xc5ac83d1, 0x5d18b99b, 0x60b4619a}}, {{0x1ae5aa1c, 0xbd6a38e1, 0x49e73658, 0xb8b7652b, 0xee5f87ed, 0x0b130014, 0xaeebffcd, 0x9d0f27b2}, {0x7a730a55, 0xca924631, 0xddbbc83a, 0x9c955b2f, 0xac019a71, 0x07c1dfe0, 0x356ec48d, 0x244a566d}}, {{0xf4f8b16a, 0x56f8410e, 0xc47b266a, 0x97241afe, 0x6d9c87c1, 0x0a406b8e, 0xcd42ab1b, 0x803f3e02}, {0x04dbec69, 0x7f0309a8, 0x3bbad05f, 0xa83b85f7, 0xad8e197f, 0xc6097273, 0x5067adc1, 0xc097440e}}, {{0xc379ab34, 0x846a56f2, 
0x841df8d1, 0xa8ee068b, 0x176c68ef, 0x20314459, 0x915f1f30, 0xf1af32d5}, {0x5d75bd50, 0x99c37531, 0xf72f67bc, 0x837cffba, 0x48d7723f, 0x0613a418, 0xe2d41c8b, 0x23d0f130}}, {{0xd5be5a2b, 0xed93e225, 0x5934f3c6, 0x6fe79983, 0x22626ffc, 0x43140926, 0x7990216a, 0x50bbb4d9}, {0xe57ec63e, 0x378191c6, 0x181dcdb2, 0x65422c40, 0x0236e0f6, 0x41a8099b, 0x01fe49c3, 0x2b100118}}, {{0x9b391593, 0xfc68b5c5, 0x598270fc, 0xc385f5a2, 0xd19adcbb, 0x7144f3aa, 0x83fbae0c, 0xdd558999}, {0x74b82ff4, 0x93b88b8e, 0x71e734c9, 0xd2e03c40, 0x43c0322a, 0x9a7a9eaf, 0x149d6041, 0xe6e4c551}}, {{0x80ec21fe, 0x5fe14bfe, 0xc255be82, 0xf6ce116a, 0x2f4a5d67, 0x98bc5a07, 0xdb7e63af, 0xfad27148}, {0x29ab05b3, 0x90c0b6ac, 0x4e251ae6, 0x37a9a83c, 0xc2aade7d, 0x0a7dc875, 0x9f0e1a84, 0x77387de3}}, {{0xa56c0dd7, 0x1e9ecc49, 0x46086c74, 0xa5cffcd8, 0xf505aece, 0x8f7a1408, 0xbef0c47e, 0xb37b85c0}, {0xcc0e6a8f, 0x3596b6e4, 0x6b388f23, 0xfd6d4bbf, 0xc39cef4e, 0xaba453fa, 0xf9f628d5, 0x9c135ac8}}, {{0x95c8f8be, 0x0a1c7294, 0x3bf362bf, 0x2961c480, 0xdf63d4ac, 0x9e418403, 0x91ece900, 0xc109f9cb}, {0x58945705, 0xc2d095d0, 0xddeb85c0, 0xb9083d96, 0x7a40449b, 0x84692b8d, 0x2eee1ee1, 0x9bc3344f}}, {{0x42913074, 0x0d5ae356, 0x48a542b1, 0x55491b27, 0xb310732a, 0x469ca665, 0x5f1a4cc1, 0x29591d52}, {0xb84f983f, 0xe76f5b6b, 0x9f5f84e1, 0xbe7eef41, 0x80baa189, 0x1200d496, 0x18ef332c, 0x6376551f}}}, {{{0x4147519a, 0x20288602, 0x26b372f0, 0xd0981eac, 0xa785ebc8, 0xa9d4a7ca, 0xdbdf58e9, 0xd953c50d}, {0xfd590f8f, 0x9d6361cc, 0x44e6c917, 0x72e9626b, 0x22eb64cf, 0x7fd96110, 0x9eb288f3, 0x863ebb7e}}, {{0xb0e63d34, 0x4fe7ee31, 0xa9e54fab, 0xf4600572, 0xd5e7b5a4, 0xc0493334, 0x06d54831, 0x8589fb92}, {0x6583553a, 0xaa70f5cc, 0xe25649e5, 0x0879094a, 0x10044652, 0xcc904507, 0x02541c4f, 0xebb0696d}}, {{0x3b89da99, 0xabbaa0c0, 0xb8284022, 0xa6f2d79e, 0xb81c05e8, 0x27847862, 0x05e54d63, 0x337a4b59}, {0x21f7794a, 0x3c67500d, 0x7d6d7f61, 0x207005b7, 0x04cfd6e8, 0x0a5a3781, 0xf4c2fbd6, 0x0d65e0d5}}, {{0x6d3549cf, 0xd433e50f, 0xfacd665e, 0x6f33696f, 0xce11fcb4, 0x695bfdac, 0xaf7c9860, 0x810ee252}, {0x7159bb2c, 0x65450fe1, 0x758b357b, 0xf7dfbebe, 0xd69fea72, 0x2b057e74, 0x92731745, 0xd485717a}}, {{0xe83f7669, 0xce1f69bb, 0x72877d6b, 0x09f8ae82, 0x3244278d, 0x9548ae54, 0xe3c2c19c, 0x207755de}, {0x6fef1945, 0x87bd61d9, 0xb12d28c3, 0x18813cef, 0x72df64aa, 0x9fbcd1d6, 0x7154b00d, 0x48dc5ee5}}, {{0xf49a3154, 0xef0f469e, 0x6e2b2e9a, 0x3e85a595, 0xaa924a9c, 0x45aaec1e, 0xa09e4719, 0xaa12dfc8}, {0x4df69f1d, 0x26f27227, 0xa2ff5e73, 0xe0e4c82c, 0xb7a9dd44, 0xb9d8ce73, 0xe48ca901, 0x6c036e73}}, {{0xa47153f0, 0xe1e421e1, 0x920418c9, 0xb86c3b79, 0x705d7672, 0x93bdce87, 0xcab79a77, 0xf25ae793}, {0x6d869d0c, 0x1f3194a3, 0x4986c264, 0x9d55c882, 0x096e945e, 0x49fb5ea3, 0x13db0a3e, 0x39b8e653}}, {{0x35d0b34a, 0xe3417bc0, 0x8327c0a7, 0x440b386b, 0xac0362d1, 0x8fb7262d, 0xe0cdf943, 0x2c41114c}, {0xad95a0b1, 0x2ba5cef1, 0x67d54362, 0xc09b37a8, 0x01e486c9, 0x26d6cdd2, 0x42ff9297, 0x20477abf}}, {{0xbc0a67d2, 0x0f121b41, 0x444d248a, 0x62d4760a, 0x659b4737, 0x0e044f1d, 0x250bb4a8, 0x08fde365}, {0x848bf287, 0xaceec3da, 0xd3369d6e, 0xc2a62182, 0x92449482, 0x3582dfdc, 0x565d6cd7, 0x2f7e2fd2}}, {{0x178a876b, 0x0a0122b5, 0x085104b4, 0x51ff96ff, 0x14f29f76, 0x050b31ab, 0x5f87d4e6, 0x84abb28b}, {0x8270790a, 0xd5ed439f, 0x85e3f46b, 0x2d6cb59d, 0x6c1e2212, 0x75f55c1b, 0x17655640, 0xe5436f67}}, {{0x9aeb596d, 0xc2965ecc, 0x023c92b4, 0x01ea03e7, 0x2e013961, 0x4704b4b6, 0x905ea367, 0x0ca8fd3f}, {0x551b2b61, 0x92523a42, 0x390fcd06, 0x1eb7a89c, 0x0392a63e, 0xe7f1d2be, 0x4ddb0c33, 0x96dca264}}, {{0x15339848, 
0x231c210e, 0x70778c8d, 0xe87a28e8, 0x6956e170, 0x9d1de661, 0x2bb09c0b, 0x4ac3c938}, {0x6998987d, 0x19be0551, 0xae09f4d6, 0x8b2376c4, 0x1a3f933d, 0x1de0b765, 0xe39705f4, 0x380d94c7}}, {{0x8c31c31d, 0x3685954b, 0x5bf21a0c, 0x68533d00, 0x75c79ec9, 0x0bd7626e, 0x42c69d54, 0xca177547}, {0xf6d2dbb2, 0xcc6edaff, 0x174a9d18, 0xfd0d8cbd, 0xaa4578e8, 0x875e8793, 0x9cab2ce6, 0xa976a713}}, {{0xb43ea1db, 0xce37ab11, 0x5259d292, 0x0a7ff1a9, 0x8f84f186, 0x851b0221, 0xdefaad13, 0xa7222bea}, {0x2b0a9144, 0xa2ac78ec, 0xf2fa59c5, 0x5a024051, 0x6147ce38, 0x91d1eca5, 0xbc2ac690, 0xbe94d523}}, {{0x79ec1a0f, 0x2d8daefd, 0xceb39c97, 0x3bbcd6fd, 0x58f61a95, 0xf5575ffc, 0xadf7b420, 0xdbd986c4}, {0x15f39eb7, 0x81aa8814, 0xb98d976c, 0x6ee2fcf5, 0xcf2f717d, 0x5465475d, 0x6860bbd0, 0x8e24d3c4}}}}; #endif ring-0.17.14/crypto/fipsmodule/ec/util.h000064400000000000000000000261241046102023000161460ustar 00000000000000// Copyright 2015 The BoringSSL Authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include #include "../../internal.h" #if defined(__GNUC__) #pragma GCC diagnostic ignored "-Wconversion" #pragma GCC diagnostic ignored "-Wsign-conversion" #endif // This function looks at 5+1 scalar bits (5 current, 1 adjacent less // significant bit), and recodes them into a signed digit for use in fast point // multiplication: the use of signed rather than unsigned digits means that // fewer points need to be precomputed, given that point inversion is easy (a // precomputed point dP makes -dP available as well). // // BACKGROUND: // // Signed digits for multiplication were introduced by Booth ("A signed binary // multiplication technique", Quart. Journ. Mech. and Applied Math., vol. IV, // pt. 2 (1951), pp. 236-240), in that case for multiplication of integers. // Booth's original encoding did not generally improve the density of nonzero // digits over the binary representation, and was merely meant to simplify the // handling of signed factors given in two's complement; but it has since been // shown to be the basis of various signed-digit representations that do have // further advantages, including the wNAF, using the following general // approach: // // (1) Given a binary representation // // b_k ... b_2 b_1 b_0, // // of a nonnegative integer (b_k in {0, 1}), rewrite it in digits 0, 1, -1 // by using bit-wise subtraction as follows: // // b_k b_(k-1) ... b_2 b_1 b_0 // - b_k ... b_3 b_2 b_1 b_0 // ----------------------------------------- // s_(k+1) s_k ... s_3 s_2 s_1 s_0 // // A left-shift followed by subtraction of the original value yields a new // representation of the same value, using signed bits s_i = b_(i-1) - b_i. // This representation from Booth's paper has since appeared in the // literature under a variety of different names including "reversed binary // form", "alternating greedy expansion", "mutual opposite form", and // "sign-alternating {+-1}-representation". // // An interesting property is that among the nonzero bits, values 1 and -1 // strictly alternate. 
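// (A small worked example: 7 = (0111)_2 recodes to the signed digits
// (1 0 0 -1), i.e. 7 = 8 - 1: s_0 = 0 - b_0 = -1, s_1 = s_2 = 0 and
// s_3 = b_2 - b_3 = 1, with the two nonzero digits alternating in sign
// as claimed.)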
// // (2) Various window schemes can be applied to the Booth representation of // integers: for example, right-to-left sliding windows yield the wNAF // (a signed-digit encoding independently discovered by various researchers // in the 1990s), and left-to-right sliding windows yield a left-to-right // equivalent of the wNAF (independently discovered by various researchers // around 2004). // // To prevent leaking information through side channels in point multiplication, // we need to recode the given integer into a regular pattern: sliding windows // as in wNAFs won't do, we need their fixed-window equivalent -- which is a few // decades older: we'll be using the so-called "modified Booth encoding" due to // MacSorley ("High-speed arithmetic in binary computers", Proc. IRE, vol. 49 // (1961), pp. 67-91), in a radix-2^5 setting. That is, we always combine five // signed bits into a signed digit: // // s_(5j + 4) s_(5j + 3) s_(5j + 2) s_(5j + 1) s_(5j) // // The sign-alternating property implies that the resulting digit values are // integers from -16 to 16. // // Of course, we don't actually need to compute the signed digits s_i as an // intermediate step (that's just a nice way to see how this scheme relates // to the wNAF): a direct computation obtains the recoded digit from the // six bits b_(5j + 4) ... b_(5j - 1). // // This function takes those six bits as an integer (0 .. 63), writing the // recoded digit to *sign (0 for positive, 1 for negative) and *digit (absolute // value, in the range 0 .. 16). Note that this integer essentially provides // the input bits "shifted to the left" by one position: for example, the input // to compute the least significant recoded digit, given that there's no bit // b_-1, has to be b_4 b_3 b_2 b_1 b_0 0. // // DOUBLING CASE: // // Point addition formulas for short Weierstrass curves are often incomplete. // Edge cases such as P + P or P + ∞ must be handled separately. This // complicates constant-time requirements. P + ∞ cannot be avoided (any window // may be zero) and is handled with constant-time selects. P + P (where P is not // ∞) usually is not. Instead, windowing strategies are chosen to avoid this // case. Whether this happens depends on the group order. // // Let w be the window width (in this function, w = 5). The non-trivial doubling // case in single-point scalar multiplication may occur if and only if the // 2^(w-1) bit of the group order is zero. // // Note the above only holds if the scalar is fully reduced and the group order // is a prime that is much larger than 2^w. It also only holds when windows // are applied from most significant to least significant, doubling between each // window. It does not apply to more complex table strategies such as // |EC_nistz256_method|. // // PROOF: // // Let n be the group order. Let l be the number of bits needed to represent n. // Assume there exists some 0 <= k < n such that signed w-bit windowed // multiplication hits the doubling case. // // Windowed multiplication consists of iterating over groups of s_i (defined // above based on k's binary representation) from most to least significant. At // iteration i (for i = ..., 3w, 2w, w, 0, starting from the most significant // window), we: // // 1. Double the accumulator A, w times. Let A_i be the value of A at this // point. // // 2. Set A to T_i + A_i, where T_i is a precomputed multiple of P // corresponding to the window s_(i+w-1) ... s_i. // // Let j be the index such that A_j = T_j ≠ ∞. 
Looking at A_i and T_i as // multiples of P, define a_i and t_i to be scalar coefficients of A_i and T_i. // Thus a_j = t_j ≠ 0 (mod n). Note a_i and t_i may not be reduced mod n. t_i is // the value of the w signed bits s_(i+w-1) ... s_i. a_i is computed as a_i = // 2^w * (a_(i+w) + t_(i+w)). // // t_i is bounded by -2^(w-1) <= t_i <= 2^(w-1). Additionally, we may write it // in terms of unsigned bits b_i. t_i consists of signed bits s_(i+w-1) ... s_i. // This is computed as: // // b_(i+w-2) b_(i+w-3) ... b_i b_(i-1) // - b_(i+w-1) b_(i+w-2) ... b_(i+1) b_i // -------------------------------------------- // t_i = s_(i+w-1) s_(i+w-2) ... s_(i+1) s_i // // Observe that b_(i+w-2) through b_i occur in both terms. Let x be the integer // represented by that bit string, i.e. 2^(w-2)*b_(i+w-2) + ... + b_i. // // t_i = (2*x + b_(i-1)) - (2^(w-1)*b_(i+w-1) + x) // = x - 2^(w-1)*b_(i+w-1) + b_(i-1) // // Or, using C notation for bit operations: // // t_i = (k>>i) & ((1<<(w-1)) - 1) - (k>>i) & (1<<(w-1)) + (k>>(i-1)) & 1 // // Note b_(i-1) is added in left-shifted by one (or doubled) from its place. // This is compensated by t_(i-w)'s subtraction term. Thus, a_i may be computed // by adding b_l b_(l-1) ... b_(i+1) b_i and an extra copy of b_(i-1). In C // notation, this is: // // a_i = (k>>(i+w)) << w + ((k>>(i+w-1)) & 1) << w // // Observe that, while t_i may be positive or negative, a_i is bounded by // 0 <= a_i < n + 2^w. Additionally, a_i can only be zero if b_(i+w-1) and up // are all zero. (Note this implies a non-trivial P + (-P) is unreachable for // all groups. That would imply the subsequent a_i is zero, which means all // terms thus far were zero.) // // Returning to our doubling position, we have a_j = t_j (mod n). We now // determine the value of a_j - t_j, which must be divisible by n. Our bounds on // a_j and t_j imply a_j - t_j is 0 or n. If it is 0, a_j = t_j. However, 2^w // divides a_j and -2^(w-1) <= t_j <= 2^(w-1), so this can only happen if // a_j = t_j = 0, which is a trivial doubling. Therefore, a_j - t_j = n. // // Now we determine j. Suppose j > 0. w divides j, so j >= w. Then, // // n = a_j - t_j = (k>>(j+w)) << w + ((k>>(j+w-1)) & 1) << w - t_j // <= k/2^j + 2^w - t_j // < n/2^w + 2^w + 2^(w-1) // // n is much larger than 2^w, so this is impossible. Thus, j = 0: only the final // addition may hit the doubling case. // // Finally, we consider bit patterns for n and k. Divide k into k_H + k_M + k_L // such that k_H is the contribution from b_(l-1) .. b_w, k_M is the // contribution from b_(w-1), and k_L is the contribution from b_(w-2) ... b_0. // That is: // // - 2^w divides k_H // - k_M is 0 or 2^(w-1) // - 0 <= k_L < 2^(w-1) // // Divide n into n_H + n_M + n_L similarly. We thus have: // // t_0 = (k>>0) & ((1<<(w-1)) - 1) - (k>>0) & (1<<(w-1)) + (k>>(0-1)) & 1 // = k & ((1<<(w-1)) - 1) - k & (1<<(w-1)) // = k_L - k_M // // a_0 = (k>>(0+w)) << w + ((k>>(0+w-1)) & 1) << w // = (k>>w) << w + ((k>>(w-1)) & 1) << w // = k_H + 2*k_M // // n = a_0 - t_0 // n_H + n_M + n_L = (k_H + 2*k_M) - (k_L - k_M) // = k_H + 3*k_M - k_L // // k_H - k_L < k and k < n, so k_H - k_L ≠ n. Therefore k_M is not 0 and must be // 2^(w-1). Now we consider k_H and n_H. We know k_H <= n_H. Suppose k_H = n_H. // Then, // // n_M + n_L = 3*(2^(w-1)) - k_L // > 3*(2^(w-1)) - 2^(w-1) // = 2^w // // Contradiction (n_M + n_L is the bottom w bits of n). Thus k_H < n_H. Suppose // k_H < n_H - 2*2^w. 
Then, // // n_H + n_M + n_L = k_H + 3*(2^(w-1)) - k_L // < n_H - 2*2^w + 3*(2^(w-1)) - k_L // n_M + n_L < -2^(w-1) - k_L // // Contradiction. Thus, k_H = n_H - 2^w. (Note 2^w divides n_H and k_H.) Thus, // // n_H + n_M + n_L = k_H + 3*(2^(w-1)) - k_L // = n_H - 2^w + 3*(2^(w-1)) - k_L // n_M + n_L = 2^(w-1) - k_L // <= 2^(w-1) // // Equality would mean 2^(w-1) divides n, which is impossible if n is prime. // Thus n_M + n_L < 2^(w-1), so n_M is zero, proving our condition. // // This proof constructs k, so, to show the converse, let k_H = n_H - 2^w, // k_M = 2^(w-1), k_L = 2^(w-1) - n_L. This will result in a non-trivial point // doubling in the final addition and is the only such scalar. // // COMMON CURVES: // // The group orders for common curves end in the following bit patterns: // // P-521: ...00001001; w = 4 is okay // P-384: ...01110011; w = 2, 5, 6, 7 are okay // P-256: ...01010001; w = 5, 7 are okay // P-224: ...00111101; w = 3, 4, 5, 6 are okay static inline void recode_scalar_bits(crypto_word_t *sign, crypto_word_t *digit, crypto_word_t in) { crypto_word_t s, d; s = ~((in >> 5) - 1); /* sets all bits to MSB(in), 'in' seen as * 6-bit value */ d = (1 << 6) - in - 1; d = (d & s) | (in & ~s); d = (d >> 1) + (d & 1); *sign = s & 1; *digit = d; } ring-0.17.14/crypto/fipsmodule/ecdsa/ecdsa_verify_tests.txt000064400000000000000000002321741046102023000221420ustar 00000000000000# Tests from NIST CAVP 186-4 ECDSA2VS Test Vectors, Signature Verification Test # http://csrc.nist.gov/groups/STM/cavp/documents/dss/186-3ecdsatestvectors.zip # # NIST's files provide message and digest pairs. Since this is a low-level test, # the digests have been extracted. P-521 test vectors were fixed to have the # right number of leading zeros. Curve = P-256 X = 1198b3c409a8b47edb1347e0982d533cb1813e5cb2a92c824b2881b3cd2f3f4a Y = 0bdbac5fa02e41e775f8d602446d58ecb2209b5a3d79ae69eef399016e992e87 Digest = 01ed0c41d650479c47057f61433d7e8b24492649 R = 9206d435f148f88c15b2effbf3c506e41b2c620102022b801e371d0767b54bea S = cbc4e1674ae1af69873946ccf6275946e59e0107278749b2d0010795833d80fa Invalid = Y Curve = P-256 X = f7c6280aecd6b936513b0ca84e63346333dc41437a15442e605d46bba93ae101 Y = 3c834cecc16167b07866a9478f9f2d882de7ef937da447cd837e60cb5ed65d81 Digest = f91b4dfddd5eb33a875d2e50d1e949211ac819da R = f615af212ab030c4bbf9362d9815a1462312df4beb4358a7ce80d820355420bf S = d12ed715ef65cfe6fe6bf348364088a0e7f70927bbafe4c12fc4cb65c0cc51bc Invalid = Y Curve = P-256 X = 0e7632dbc4db879e10d1d80f2789d9fa414c1fe77a6c1e56d6667af43e36e610 Y = 6f0dd2a5840e5a6f6ff7e23f656f5c945b7a493fbb0cfd5b9b531bf04435b1ef Digest = 3905696f8bad8205fa1445df0e91ade3dbc413e6 R = 2b0b9ab4a575732a168f28494b66a855fc1a757fb1177864bf3e4f0a000c4a86 S = 54901ce2f92f55ac112afa0f8b62bc00b44c8c10fe0c863675bfd305d6dc0cd8 Invalid = Y Curve = P-256 X = 1613f12bae8e98d09b4bba53f5229596a0d417d2c625f41bb15f923b3c1e4b57 Y = 411319fa85227997a4cf3b1756161485124d2cedc38c9c30d82f42dc2647d545 Digest = 580d31ce22700a20c2db81bcdac37330b491c86f R = ed058d476a77be99c1b0fc8502abe545541b4c0ff3eed3f558133ae2f02042b0 S = c571b4895712a4f64f7220b0694cab767379b09f1824fe7874acd127deb2371e Invalid = Y Curve = P-256 X = 88bb041dcb1733a676a7f4ae8d3e407d72d5396547f07db77078485c1d5db077 Y = 72cf2b55e596cd140c58228f1b0a19c34fca26ffac043528a417c5abb6fca9c9 Digest = 7900a02f768b0718a13525c33adace583de15c50 R = 87208734deb125dca68f0d33f9d369cf1b79cf5a021391b9c6c1727d2efe663a S = b984f722de18f1ce407104342948f03f2b55413a096c4b5fca1e032a2c814a4a Invalid = Y Curve = P-256 X = 
811eb5180def7fb60d632f8cb2cba831b88cee778aa2a82ec3a5fc3d80ff7fb6 Y = db88d65b0fc35d9ba1f1ced0400434979ae895d371d1441d7c7a441a9fb1709b Digest = 17b7451ea903125ccb293ffaa9d1a4ca1141a2c5 R = c329fa28dac0018276c5af0cd770e60be50bc14e2562d5556991971edc7d4916 S = 2d111d13837a02fa279fe835a7dc59a521864d92b26649ca4e24b36ae93878e8 Invalid = Y Curve = P-256 X = 4a6f1e7f7268174d23993b8b58aa60c2a87b18de79b36a750ec86dd6f9e12227 Y = 572df22bd6487a863a51ca544b8c5de2b47f801372a881cb996a97d9a98aa825 Digest = 54e9a048559f370425e9c8e54a460ec91bcc930a R = 4a800e24de65e5c57d4cab4dd1ef7b6c38a2f0aa5cfd3a571a4b552fb1993e69 S = d9c89fb983640a7e65edf632cacd1de0823b7efbc798fc1f7bbfacdda7398955 Invalid = Y Curve = P-256 X = f3033d1e548d245b5e45ff1147db8cd44db8a1f2823c3c164125be88f9a982c2 Y = 3c078f6cee2f50e95e8916aa9c4e93de3fdf9b045abac6f707cfcb22d065638e Digest = e8d38e4c6a905a814b04c2841d898ed6da023c34 R = d4255db86a416a5a688de4e238071ef16e5f2a20e31b9490c03dee9ae6164c34 S = 4e0ac1e1a6725bf7c6bd207439b2d370c5f2dea1ff4decf1650ab84c7769efc0 Curve = P-256 X = 0ea0a6bb6c70966fad1a2307479c12de2322795bdecb70e4b286bd6200ba9c1a Y = c40eda3947021348db691ac4086fb6c06b587ce37c155bb0a7d912b93226de81 Digest = 3b08bf1b67abc03c1cd69b0e24743b5c2d49e506 R = f5509deff7bfda3f3759800fa4033af6a84466b114ecb48eac37eff48d2ae1b3 S = 8c4b62dce2082f80caf220cdbb1d02567bbdfab40564b90ef31d86e3e10ce80a Invalid = Y Curve = P-256 X = e7a57e0f6ec0fa9c7c34978034cf82f039f8fd62804070ad943573fc8efa5775 Y = 87b2cc85dfff2dae5620fbe3e6256bd728de28fc9dc1b5eb6b5d7bd5d29186ad Digest = a8c5dc0344b1442dfdb5f8836251893d6c4ecbe9 R = 97642038932fdddbe2021ec1af53ae6b9af00ef9c8b9f26aea582892e80e6285 S = 9cb14918359338041cf795cf6781e4905837fa5ce3b3e50ffafb5f13c73b5bc8 Invalid = Y Curve = P-256 X = be7a651be0c87278569987cf62d7fa1dd1b3d6e1b868d8f4dfb56135a9960eec Y = b7a62c588a987760b915edbd7f95506870c60f042471de1d8b2d4cd9d6563391 Digest = 2f93ee45db133a14c26d418c2ffd3470ae63bf50 R = aa889fb608b6939f6eeacf2f64c3b2e3a6061f2834058c7e724321720b737a63 S = 6cd6d0ef2b93a760daa914e11b9b414bd4d72457405f00a62ab63f36d76efb73 Invalid = Y Curve = P-256 X = 76ddc46d8db8d7ce2ce837f60cdabcee92b7c7817ee41c8f066f1ae65f85c318 Y = bea47191f1c584c87250370ce337a1de1583bcfc20ccc23b7a82e83f19adaa88 Digest = 2136a5470ff9d45214a0b2c300042efea8ff7266 R = 84a42efbf7ec04166ad144d19cd98c120aa2e79d483b5eea6fbdfa7f1222e07b S = e41531205e691e65668f69f518abc7b60f32c373434872a043b7358462babf83 Invalid = Y Curve = P-256 X = 2f71b932f770ba9daf7c1dd47444ab6cb8881f71a1c597e719845b15cb84ca35 Y = ab928625b40ec0738d0fc8dbc4df4a1f65d20bc0447b69cfa13bb20b95bb41d4 Digest = ae6093bb37c1264ca3ead439e4f678721912c8c4 R = 63fca172bbca6197cd2802a9cb61d74c2b47cf35f6d35203e67ffbaa838be775 S = e70ec283cd212df6ba3723e26b697501f112d7cf64e4f45185dae76055e09f1e Curve = P-256 X = ce775648b928db82ac5edb3b009d32959a73b86c45e96d4b8d5b6e640b7c2790 Y = 52455caf08ee94d86f0984e9ec9268d74823f2102dd97fced59638055f6af18e Digest = 60054807acb29e3091a023c42b9885c4945249e1 R = 2a64b29146588f3153fee1029a0131ac0a8a25ba2ecc494f697c166c7c91fc08 S = 7b429bc12a72ca3d76c119eea9f4098633cc31c87831e54d5d93afd6e8d20f4f Invalid = Y Curve = P-256 X = cd2f29a53f0ce57e0e4a542c3256e65ebbdc30415f4de771d5d706d3aeacc852 Y = dbbf2c129f30d11fe77d7816a24187764eae3fb2ff70c1ec745e876e26f5232f Digest = 5f50e35b134942295c16d003742fd6bce5bdab45 R = 2454c5ee84e4f77b554acd368dd412389db8c78429590a092f24db2da43cb761 S = 63e870ce2fa4085d4ff1e360f7a5c101a1f8b288abe71cca56887e613ad034b7 Curve = P-256 X = 843f6d83d777aac75b758d58c670f417c8deea8d339a440bb626114318c34f29 Y = 
83e0c70008521c8509044b724420463e3478e3c91874d424be44413d1ce555f3 Digest = cda2c7ad9abb2a858c4981550f78974c69e41cc31fa33509e3e83dc2 R = d08e9a5db411019d826b20ac889227ed245503a6d839494db1e8d7995a6b245b S = 8d46a204054125d0dc776ab1055302ec4eb0f20b90bca6d205f21d3cefd29097 Curve = P-256 X = f08b56f73f7a0e098444f6f0a02ad81ce0b914a11cafa15893d1c84704e1c564 Y = bbee9aeb91cdc2d1d1437b4168df73acfd64e8b02962b14c85e67187e1ef80a4 Digest = 5453c2656550e9b3dc6c40a3f1362a73522396bc35d383dd6451128f R = 71b3ec982725a007ac18a5cf60587e1fd1beb57685a1f9df3cddd9df25dcbc18 S = 407e41217325f92f8a031cfcc4eb64c1a4b17b0a7459c254af754a7ea9eac997 Invalid = Y Curve = P-256 X = 0b688e761e1ddda2305e002809da65bf5916dfe1356a5b99b61f5576a9b90efa Y = 90ec958e2e3a676e7bbf8e9394f72742875836125a317b0ae38374953f746a91 Digest = 7289573d6bb7486e428e086bec9da9d7ff3c5f8bd0db2ec209fed6ae R = ef89df3bbf079fb250f7e882c4f85c0023fc3804e862d9ef4d9530a15f1013f0 S = 4ba985e900e6737b8e07eac638f7b38277ead4faee6d2076a2eee90fd2a6bf0f Invalid = Y Curve = P-256 X = 0b64480783e260e1e9caef37b4cc9c650d2d57e2c594b1106314843d8d7ab74e Y = 29d373d8522deffe40055aef539f53f38937eb799b44f05a8d8c0b381f12907f Digest = 497656e780360ec3b4bd1be97570615e4a32467982cd9330bc6aa224 R = c5c26b0b21eef0f7a0f1cff38d0079d890376759369b01d8d8e959c1c785e203 S = fecc400bf0deab99d87da168b9d0dd31d2dfa3435b0fe9d38b5fb8efd45195a4 Invalid = Y Curve = P-256 X = 7f78a8fd880c509940e2b83de67c9ab553ab91489bae75cdc1d5b523b06ab7f5 Y = 7786aee7032c373cdfad7d9ddb6fa09a026f6da30fd477ab014d30a289d542a1 Digest = 6d88da9e83ae9457e233d7977172c062dfbdd17d365694515251e031 R = c93ada69db326f76b1362d610cb8bcc6e7ef1dc03d3d11367e153c0e39d5dc86 S = d0c02c71b14ef7a4af4e23bd207ce98449f5d6e7e5b3ec8cbbca9549e97d379d Curve = P-256 X = e58cdc207c56f62e0bb7c0b55b7f7236a6b308f8fc4de3e61cdb3bf20ad2f62c Y = 6056c0ee827e85ba284838954d0c6cc096df03b4611b1e0f7f9002bac86856d4 Digest = 3f9a97b8ea807edc88788df8956c296b1daaed8dd12d50c712344091 R = 2df3906527ad322000285bccdd11dd09130d633cf43534f5802604639eb847e0 S = adaaad19b7c66836ef0f4afeff8ac5e898cd2523246a74a1a291a3a1ff583322 Curve = P-256 X = 70b4bba10b7bbc6d4175ada8d485f3685b13916d0c992301f47e45b629c63d0e Y = 257a93be31b09ff4cd22e3375e30b5a79f3bf3c74c80dde93e5d65e88c07c1c4 Digest = cc3a0d3a5d4f28dc9144a3cdb276eb92265f1157a8d8192cf628673c R = 6e714a737b07a4784d26bde0399d8eee81998a13363785e2e4fb527e6a5c9e4e S = 94c0220f0f3fa66ff24f96717f464b66ae3a7b0f228ab6a0b5775038da13768a Invalid = Y Curve = P-256 X = 8b11b48d2397355000a5289d816b9892ae64dffc842abec02a2fb2db2bb34310 Y = fc1a42528a0473cfc2c2e184b8bc5055096350fe1549d24b526d6536681026e8 Digest = f340e491fa935be8945b8caa485d0699c66331e0e17c7407da1b018e R = 61a91dd1c80049e70dc4aea84bda0efc6ec9c7b9dd16ecbccf687244c51184ce S = e381e7b32bab49578c7e7ce7784ce19263e4a7dab4b614df411d20eaebfc391c Invalid = Y Curve = P-256 X = 7bad1b3d8bad4355a44511d2eb50daeae793af99418ada118327359936aa0e1d Y = e7eff40334b7a5455f6b0d0ecdcdc513702857bb5bbb73c910c86746092bcd7d Digest = 9cf84546c046b370c372c167ebba39af6aadd60463626453787bb058 R = fd961b60b21be32b47abafa77e22197dc99af6825dcca46e0e3b1991a90aa202 S = a0477f97b94a1c26a3b2d186791d7fc9dfa8130bbae79c28fa11ec93a3aeac0b Invalid = Y Curve = P-256 X = 407d92c9b28723602bf09f20f0de002afdf90e22cb709a8d38e3c51e82cba96c Y = 4530659432e1dd74237768133e1f9808e62d0fbe5d1d979d1571baf645dcb84c Digest = 0cf5cd48c93f45472d254196bebea4bddb272a2adff23bab8c3adf99 R = a7dc65293ee3deb0008ae3e2d7ef9e9a4ebb8bf7b10d165f80ab8bed58d6fdef S = 3e8300a3ee603a8d8234fe265c628e705015bf1903eb74c943323050626f701f Invalid = 
Y Curve = P-256 X = 26aea3dd5c53f984dbdaf415c7f26e1e73048658a548eb3b59dd5f721899919a Y = dff15f57bd9b08644d49cbb214403647195725cd4d4511bc8a48b0770466ae9f Digest = 75d6b6b575d0a2c89528b83c94ef864c825b66253ab662b36bb0e716 R = 726af92afe53e8125b0b9f3659745be401a37ae658b7b1aa88c3cb97e9de22c3 S = 794484c5837a419efe11a4e4293341a6fa36d21230925a0e5e135887302acca9 Invalid = Y Curve = P-256 X = e73418677ce044b331a6d60773cbae199221699d31e1bec4b68b9bc0b87e4cd0 Y = 37215db4e3d9161f3351b385a61ddb2fcf1cec469d1659e7574610ed27fe879f Digest = dcbb92e3be3951d37e37852d508f78da29c8183c5dbe59d6549f78ed R = ac469290a8f61a2a8c6adc7533dd5cfe804e2e7bf101cc74e5f624f301bccd23 S = 4c328c3bc259316641fff44753743afebe89b8627f904df7245e42adcff2dc76 Invalid = Y Curve = P-256 X = b0892b19c508b3543a5ae864ba9194084c8f7ae544760759550cc160972e87ff Y = 9208e9b0c86ad6bc833e53026f233db9a42298cdb35d906326008377520b7d98 Digest = 90333facb4f5068c1d05d1a478fb46d02f367e271a000474c06a5fec R = a62dd0d1518c6b9c60de766b952312a8d8c6eaa36a68196d2a30a46fb17dc067 S = b9ded660e978129277f74c1d436003d1e6d556dc8eed9d505bbaf4c67cb13d21 Invalid = Y Curve = P-256 X = 8c5c41cb07d828a6a86be4533aef791d3a70a95cb285aa2956b21feeac2f8c49 Y = 84101581cad7a48b7d0596df7ffed47085d22e8a4af685cddbeeb32ea69ae190 Digest = 8bb52bd045c985167f673c07b613a3402f435a54c122877bc0c5fe34 R = 9812449df0a51f7a2a8f78aa9a589ca9644dce285f1e69658daaea759fa5bd7e S = beb4c27c748a7944e37afe861576f76b5a749a8ccbbd7dec00838ba250ddfe1a Invalid = Y Curve = P-256 X = 788d7e54ab03020e4954f41259052ee5af68361492b180da31fbbe68d868aa95 Y = 982a3ababa6d351649e56da3faeb7160b9de74e22fe93a06ead1bd9a8dffdf7e Digest = 9870ae25b0f0403eff1079b94669cf95fb250fb098eeb885ff08f117 R = 3ddea06bf8aa4a1b0c68674a2c4796def0bfb52236f4efb3332204a41fd8ea89 S = 871237039431a41aeefcdd08f67848b2b09067e3a1344c8ed9b372d1b1c754a6 Invalid = Y Curve = P-256 X = 87f8f2b218f49845f6f10eec3877136269f5c1a54736dbdf69f89940cad41555 Y = e15f369036f49842fac7a86c8a2b0557609776814448b8f5e84aa9f4395205e9 Digest = a82c31412f537135d1c418bd7136fb5fde9426e70c70e7c2fb11f02f30fdeae2 R = d19ff48b324915576416097d2544f7cbdf8768b1454ad20e0baac50e211f23b0 S = a3e81e59311cdfff2d4784949f7a2cb50ba6c3a91fa54710568e61aca3e847c6 Invalid = Y Curve = P-256 X = 5cf02a00d205bdfee2016f7421807fc38ae69e6b7ccd064ee689fc1a94a9f7d2 Y = ec530ce3cc5c9d1af463f264d685afe2b4db4b5828d7e61b748930f3ce622a85 Digest = 5984eab8854d0a9aa5f0c70f96deeb510e5f9ff8c51befcdc3c41bac53577f22 R = dc23d130c6117fb5751201455e99f36f59aba1a6a21cf2d0e7481a97451d6693 S = d6ce7708c18dbf35d4f8aa7240922dc6823f2e7058cbc1484fcad1599db5018c Invalid = Y Curve = P-256 X = 2ddfd145767883ffbb0ac003ab4a44346d08fa2570b3120dcce94562422244cb Y = 5f70c7d11ac2b7a435ccfbbae02c3df1ea6b532cc0e9db74f93fffca7c6f9a64 Digest = 44b02ad3088076f997220a68ff0b27a58ecfa528b604427097cce5ca956274c5 R = 9913111cff6f20c5bf453a99cd2c2019a4e749a49724a08774d14e4c113edda8 S = 9467cd4cd21ecb56b0cab0a9a453b43386845459127a952421f5c6382866c5cc Invalid = Y Curve = P-256 X = e424dc61d4bb3cb7ef4344a7f8957a0c5134e16f7a67c074f82e6e12f49abf3c Y = 970eed7aa2bc48651545949de1dddaf0127e5965ac85d1243d6f60e7dfaee927 Digest = d1b8ef21eb4182ee270638061063a3f3c16c114e33937f69fb232cc833965a94 R = bf96b99aa49c705c910be33142017c642ff540c76349b9dab72f981fd9347f4f S = 17c55095819089c2e03b9cd415abdf12444e323075d98f31920b9e0f57ec871c Curve = P-256 X = e0fc6a6f50e1c57475673ee54e3a57f9a49f3328e743bf52f335e3eeaa3d2864 Y = 7f59d689c91e463607d9194d99faf316e25432870816dde63f5d4b373f12f22a Digest = 
b9336a8d1f3e8ede001d19f41320bc7672d772a3d2cb0e435fff3c27d6804a2c R = 1d75830cd36f4c9aa181b2c4221e87f176b7f05b7c87824e82e396c88315c407 S = cb2acb01dac96efc53a32d4a0d85d0c2e48955214783ecf50a4f0414a319c05a Curve = P-256 X = a849bef575cac3c6920fbce675c3b787136209f855de19ffe2e8d29b31a5ad86 Y = bf5fe4f7858f9b805bd8dcc05ad5e7fb889de2f822f3d8b41694e6c55c16b471 Digest = 640c13e290147a48c83e0ea75a0f92723cda125ee21a747e34c8d1b36f16cf2d R = 25acc3aa9d9e84c7abf08f73fa4195acc506491d6fc37cb9074528a7db87b9d6 S = 9b21d5b5259ed3f2ef07dfec6cc90d3a37855d1ce122a85ba6a333f307d31537 Invalid = Y Curve = P-256 X = 3dfb6f40f2471b29b77fdccba72d37c21bba019efa40c1c8f91ec405d7dcc5df Y = f22f953f1e395a52ead7f3ae3fc47451b438117b1e04d613bc8555b7d6e6d1bb Digest = 8a3e7ad7b9b1b0cdc48e58d1e651fe6d710fef1420addeb61582bdd982d2b44c R = 548886278e5ec26bed811dbb72db1e154b6f17be70deb1b210107decb1ec2a5a S = e93bfebd2f14f3d827ca32b464be6e69187f5edbd52def4f96599c37d58eee75 Invalid = Y Curve = P-256 X = 69b7667056e1e11d6caf6e45643f8b21e7a4bebda463c7fdbc13bc98efbd0214 Y = d3f9b12eb46c7c6fda0da3fc85bc1fd831557f9abc902a3be3cb3e8be7d1aa2f Digest = d80e9933e86769731ec16ff31e6821531bcf07fcbad9e2ac16ec9e6cb343a870 R = 288f7a1cd391842cce21f00e6f15471c04dc182fe4b14d92dc18910879799790 S = 247b3c4e89a3bcadfea73c7bfd361def43715fa382b8c3edf4ae15d6e55e9979 Invalid = Y Curve = P-256 X = bf02cbcf6d8cc26e91766d8af0b164fc5968535e84c158eb3bc4e2d79c3cc682 Y = 069ba6cb06b49d60812066afa16ecf7b51352f2c03bd93ec220822b1f3dfba03 Digest = 7c1048884558961c7e178b3a9b22583fca0d17f355a9887e2f96d363d2a776a3 R = f5acb06c59c2b4927fb852faa07faf4b1852bbb5d06840935e849c4d293d1bad S = 049dab79c89cc02f1484c437f523e080a75f134917fda752f2d5ca397addfe5d Invalid = Y Curve = P-256 X = 224a4d65b958f6d6afb2904863efd2a734b31798884801fcab5a590f4d6da9de Y = 178d51fddada62806f097aa615d33b8f2404e6b1479f5fd4859d595734d6d2b9 Digest = 4c8d1afb724ad0c2ec458d866ac1dbb4497e273bbf05f88153102987e376fa75 R = 87b93ee2fecfda54deb8dff8e426f3c72c8864991f8ec2b3205bb3b416de93d2 S = 4044a24df85be0cc76f21a4430b75b8e77b932a87f51e4eccbc45c263ebf8f66 Invalid = Y Curve = P-256 X = 43691c7795a57ead8c5c68536fe934538d46f12889680a9cb6d055a066228369 Y = f8790110b3c3b281aa1eae037d4f1234aff587d903d93ba3af225c27ddc9ccac Digest = 8581034ec7d7a6b163d71820923f616b362748f2846042c9896d8e4bf7577960 R = 8acd62e8c262fa50dd9840480969f4ef70f218ebf8ef9584f199031132c6b1ce S = cfca7ed3d4347fb2a29e526b43c348ae1ce6c60d44f3191b6d8ea3a2d9c92154 Invalid = Y Curve = P-256 X = 9157dbfcf8cf385f5bb1568ad5c6e2a8652ba6dfc63bc1753edf5268cb7eb596 Y = 972570f4313d47fc96f7c02d5594d77d46f91e949808825b3d31f029e8296405 Digest = e5b30e0041a33281210644938d9aaa15ef2c1247b4178f7ca1ee935ce23daabc R = dfaea6f297fa320b707866125c2a7d5d515b51a503bee817de9faa343cc48eeb S = 8f780ad713f9c3e5a4f7fa4c519833dfefc6a7432389b1e4af463961f09764f2 Invalid = Y Curve = P-256 X = 072b10c081a4c1713a294f248aef850e297991aca47fa96a7470abe3b8acfdda Y = 9581145cca04a0fb94cedce752c8f0370861916d2a94e7c647c5373ce6a4c8f5 Digest = edd72dc0aa91649e09e2489c37ec27efab3b61953762c6b4532a9b1cd08a500d R = 09f5483eccec80f9d104815a1be9cc1a8e5b12b6eb482a65c6907b7480cf4f19 S = a4f90e560c5e4eb8696cb276e5165b6a9d486345dedfb094a76e8442d026378d Invalid = Y Curve = P-256 X = 09308ea5bfad6e5adf408634b3d5ce9240d35442f7fe116452aaec0d25be8c24 Y = f40c93e023ef494b1c3079b2d10ef67f3170740495ce2cc57f8ee4b0618b8ee5 Digest = 0d06ba42d256062e16b319a0f3099109518a765f26bac3b9f56930d965617726 R = 5cc8aa7c35743ec0c23dde88dabd5e4fcd0192d2116f6926fef788cddb754e73 S = 
9c9c045ebaa1b828c32f82ace0d18daebf5e156eb7cbfdc1eff4399a8a900ae7 Invalid = Y Curve = P-256 X = 2d98ea01f754d34bbc3003df5050200abf445ec728556d7ed7d5c54c55552b6d Y = 9b52672742d637a32add056dfd6d8792f2a33c2e69dafabea09b960bc61e230a Digest = 41007876926a20f821d72d9c6f2c9dae6c03954123ea6e6939d7e6e669438891 R = 06108e525f845d0155bf60193222b3219c98e3d49424c2fb2a0987f825c17959 S = 62b5cdd591e5b507e560167ba8f6f7cda74673eb315680cb89ccbc4eec477dce Curve = P-256 X = 40ded13dbbe72c629c38f07f7f95cf75a50e2a524897604c84fafde5e4cafb9f Y = a17202e92d7d6a37c438779349fd79567d75a40ef22b7d09ca21ccf4aec9a66c Digest = 5aa8e8a6f0622b841416e1a70d79a54641d2c699a075b6960fe5dcf96301da8ca6f15b0948d4ededac30a42e00d3b310 R = be34730c31730b4e412e6c52c23edbd36583ace2102b39afa11d24b6848cb77f S = 03655202d5fd8c9e3ae971b6f080640c406112fd95e7015874e9b6ee77752b10 Invalid = Y Curve = P-256 X = 1f80e19ffeb51dd74f1c397ac3dfd3415ab16ebd0847ed119e6c3b15a1a884b8 Y = 9b395787371dbfb55d1347d7bed1c261d2908121fb78de1d1bf2d00666a62aed Digest = 244656186c11c2e67be88099d55e60f4b68e61fba0b214aac3399dc559cfccc02f9884e85623426dbdc3243f2b5374f7 R = 249ca2c3eb6e04ac57334c2f75dc5e658bbb485bf187100774f5099dd13ef707 S = 97363a05202b602d13166346694e38135bbce025be94950e9233f4c8013bf5bf Invalid = Y Curve = P-256 X = ce4dcfa7384c83443ace0fb82c4ac1adfa100a9b2c7bf09f093f8b6d084e50c2 Y = d98ae7b91abee648d0bfde192703741ac21daad7262af418b50e406d825eb0d6 Digest = adaeadda3f0e941fba1d3e206a84e6d7530d800e0f215b3ddd82022f27c5be44fed27bc73084c6f7ca55555532be2e3b R = 597e1e04d93a6b444ccc447a48651f17657ff43fb65fe94461d2bf816b01af40 S = 359fe3817963548e676d6da34c2d0866aa42499237b682002889eaf8893814d2 Curve = P-256 X = 1b677f535ac69d1acd4592c0d12fac13c9131e5a6f8ab4f9d0afdcb3a3f327e0 Y = 5dca2c73ec89e58ef8267cba2bb5eb0f551f412f9dc087c1a6944f0ce475277a Digest = e34a541f87ff0eaa0c640f555caec6bf11a1320c74c47a8ff172c4e2ec902e48d499732b12a86189e750bbf4c0424c72 R = df0b0cd76d2555d4c38b3d70bfdf964884d0beeb9f74385f0893e87d20c9642d S = 128299aabf1f5496112be1fe04365f5f8215b08a040abdfeca4626f4d15c005b Invalid = Y Curve = P-256 X = 7ffc2853f3e17887dda13b0eb43f183ce50a5ac0f8bba75fb1921172484f9b94 Y = 4cc523d14192f80bd5b27d30b3b41e064da87bfbae15572dd382b9a176c123a2 Digest = 0689927a38486cccf28fe9454e08e0d74843424b89be4cdee8e48f39a69addec730184da72f914cea67231c765ee2574 R = 3156176d52eb26f9391229de4251993a41b8172f78970bb70e32a245be4bb653 S = 62827a29e12d2f29b00fb2d02dd5f2d5412e17a4455f4431a5c996881fdfc0ee Invalid = Y Curve = P-256 X = 5569f76dc94243cde819fb6fc85144ec67e2b5d49539f62e24d406d1b68f0058 Y = 1208c38dbe25870deab53c486f793a1e250c9d1b8e7c147ea68b71196c440730 Digest = 97f8f8cea435282ac746730ac744bf97d85d4e249c0b1d9c7b83c7e59aed172ffc3724d7e6fab7d6ab55ffb3a39c0775 R = 706f2ba4025e7c06b66d6369a3f93b2fec46c51eceff42a158f7431919506cfb S = b4e75ac34a96393237fc4337789e37168d79382705b248051c9c72bcbac5f516 Invalid = Y Curve = P-256 X = e4b470c65b2c04db060d7105ec6911589863d3c7f7ce48726ba3f369ea3467e8 Y = 44c38d3ae098de05f5915a5868c17fee296a6e150beb1f000df5f3bec8fc4532 Digest = 5b937a2af46dbf18b4a6fb042ea353a6878e0d4beac016002b3d91a42bcba52856c07a3f35c08dfecb4f03e1c0b9948e R = c9c347ee5717e4c759ddaf09e86f4e1db2c8658593177cfda4e6514b5e3ecb87 S = baae01e9e44a7b04d69c8eaaed77c9e3a36ce8962f95cc50a0db146b4e49eb40 Invalid = Y Curve = P-256 X = 96050c5fa2ddd1b2e5451d89ee74a0b7b54347364ddc0231715a6ef1146fe8dc Y = e0888a9e78aeea87f6e1e9002b2651169f36c4ee53013cfc8c9912b7fd504858 Digest = b123e07744f05ad523790ea5bfa3f848869a3bfdbf936a496c8606b577ed8427eb7ee888e0fe18d4e3cfac73baad883f R = 
2353d6cd3c21b8ea7dbc1cd940519812dbe365a3b15cd6aebba9d11cf269867a S = 85f560273cd9e82e6801e4cb1c8cd29cdac34a020da211d77453756b604b8fa7 Curve = P-256 X = 0c07bb79f44012299fbfd5a0f31397aaf7d757f8a38437407c1b09271c6551a0 Y = 84fe7846d5d403dc92c0091fbd39f3c5cbca3f94c10b5cae44e2e96562131b13 Digest = fb8d12652de59e63ef5297641dfbce084808de146720e9069c2ef814bcd80b6187f7422a6cd9c706f8d64ccf80e8bc54 R = 49e9425f82d0a8c503009cead24e12adc9d48a08594094ca4f6d13ad1e3c571d S = 1f1b70aaa30a8ff639aa0935944e9b88326a213ab8fce5194c1a9dec070eb433 Invalid = Y Curve = P-256 X = 71db1de1a1f38f356c91feaff5cfe395d1a5b9d23cf6aa19f38ae0bcc90a486d Y = ecdd6ffb174a50f1cc792985c2f9608c399c98b8a64a69d2b5b7cdd9241f67e2 Digest = 2d8c6585a3b6319a556e27b53d434f455f73e771c8fc6a115f5c92a8e9a81ce2b4336a5c3edf98910689d11f4c93632a R = b0443b33a6f249470d2f943675009d21b9ccbead1525ae57815df86bb20470bf S = 316dbee27d998e09128539c269e297ac8f34b9ef8249a0619168c3495c5c1198 Invalid = Y Curve = P-256 X = 8219b225aa15472262c648cac8de9aad4173d17a231ba24352a5a1c4eea70fad Y = 0fee2b08ad39fbf0db0016ef2896ca99adc07efc8c415f640f3720498be26037 Digest = a4cc3b23f54d9d48ba6b0ad3da3b2e3a0806f41348bd7844e9c9b8648753bdeef8a039e1fa4f5172c89148d65b14056f R = 134fb689101aaad3954de2819d9fbd12072fe2bc36f496bbf0d13fa72114ab96 S = e65c232bd915b59e087e7fd5ec90bf636cfa80526345c79a0adfd75003045d6f Invalid = Y Curve = P-256 X = c934195de33b60cf00461fc3c45dad068e9f5f7af5c7fa78591e95aeb04e2617 Y = b588dd5f9965fdaa523b475c2812c251bc6973e2df21d9beaace976abf5728cb Digest = b962b63a7743ad77f9072f2f08d277f6dda8cc3420ddd37d873746008895902bcce218fbfed1a8cb28406978dd8e5134 R = 71f302440eb4ed2a939b69e33e905e6fdc545c743458d38f7e1a1d456e35f389 S = 54eaa0eb9cd7503b19a9658f0a04955d9f0ab20ebc8a0877e33c89ee88ad068f Invalid = Y Curve = P-256 X = 9e1adcd48e2e3f0e4c213501808228e587c40558f52bb54ddbb6102d4048ea92 Y = 34eff98704790938e7e0bdf87ae39807a6b77dfdc9ecdfe6dd0f241abae1aeb2 Digest = 21b883fae159867731b123a2606e9b3320fb53a00e4a5dfe3bc3429dd53b8068197be3c7288c1e0bf28a4fc7b13bd70f R = ce4f0d7480522c8dd1b02dd0eb382f22406642f038c1ede9411883d72b3e7ed0 S = 8546e1ee3b77f9927cdaccbc2f1cf19d6b5576b0f738bb1b86a0c66b39ca56fb Invalid = Y Curve = P-256 X = 93edbecb0b019c2cc03060f54cb4904b920fdb34eb83badd752be9443036ae13 Y = b494e9295e080a9080fe7e73249b3a5904aa84e1c028121eecd3e2cf1a55f598 Digest = fcc17b88077570c053650e1de42ae6bb1522900b38996decc87704aab6a87ab01d52f83f6442875f378a262c22d23ab2 R = eec2986d47b71995892b0915d3d5becc4dcb2ab55206d772e0189541b2184ddf S = 8a6c1edeb6452627ad27c8319599c54ac44cdd831ea66f13f49d90affe6ad45b Curve = P-256 X = 3205bae876f9bd50b0713959e72457165e826cbbe3895d67320909daa48b0ebc Y = d1592562273e5e0f57bbfb92cedd9af7f133255684ee050af9b6f02019bbcafa Digest = 299a6070d32a5557010753d7559dbd8d2bde8a8feae5417616ceb5b167997fd2fac0c2bd44264106d3a9720d5e805a04 R = 0124f3f1c61ec458561a4eaa6c155bd29e59703d14556324924683db3a4cf43b S = 688a5c5fc0c7ba92210c50cce5b512a468a880e05acc21ca56571d89f45f603a Invalid = Y Curve = P-256 X = 484e31e69ef70bb8527853c22c6b6b4cd2a51311dde66c7b63f097dbb6ab27bf Y = e1ff8177f4061d4fbbacbbc70519f0fc8c8b6053d72af0fe4f048d615004f74e Digest = f1e9cda2e096ece9a1fc57e55eeeb56b1c635380c0f9a1800a4a1a5f105d1fc0c60e776234daaa8a6f7c0f5286bb420b3f607e7cc0a7d840ad5dcbab26c797b0 R = 91a303d8fe3ab4176070f6406267f6b79bfe5eb5f62ae6aeb374d90667858518 S = e152119cefa26826ea07ec40a428869132d70812c5578c5a260e48d6800e046a Invalid = Y Curve = P-256 X = 8b75fc0129c9a78f8395c63ae9694b05cd6950665cf5da7d66118de451422624 Y = 
b394171981d4896d6e1b4ef2336d9befe7d27e1eb87f1c14b8ddda622af379dc Digest = 0527199fadea30f9e5e66166a3ebcdf6aedf906984535f48165e591eff36f1c0de6b0fa69aefb6399e8a213cc2ce53268fbe18c3471b7708bc27c426aaa769a4 R = 17e298e67ad2af76f6892fdcead00a88256573868f79dc74431b55103058f0b0 S = 881328cd91e43d30133f6e471e0b9b04353b17893fb7614fd7333d812a3df6b4 Invalid = Y Curve = P-256 X = 76e51086e078b2b116fd1e9c6fa3d53f675ae40252fb9f0cc62817bd9ce8831d Y = ca7e609a0b1d14b7c9249b53da0b2050450e2a25cb6c8f81c5311974a7efb576 Digest = c926a5026d8f83ffa2092caf863f2d8a886af391462969b13a11d3c6c5fa66bb4281bc6e60a1e99a2e1ae95d689a66282096a0f27aacc048f32d39297649a014 R = 23b653faaa7d4552388771931803ce939dd5ee62d3fa72b019be1b2272c85592 S = a03c6f5c54a10861d6b8922821708e9306fd6d5d10d566845a106539cbf4fadd Invalid = Y Curve = P-256 X = bc7c8e09bd093468f706740a4130c544374fdc924a535ef02e9d3be6c6d3bbfa Y = af3f813ae6646f5b6dbfb0f261fd42537705c800bb1647386343428a9f2e10fc Digest = 4d74631eb67fd1a6fa93ecb6e6112b6699e78c1d4c24ae81d0d5842efe5d93c2fd7a7863f8d45d1b2fafecbe41b7dc19c4b2bc208e014ffdc216e7eda0392a70 R = 6bd7ce95af25abfbf14aef4b17392f1da877ab562eca38d785fe39682e9c9324 S = 6688bea20c87bab34d420642da9bdd4c69456bdec50835887367bb4fb7cd8650 Invalid = Y Curve = P-256 X = 9cb0cf69303dafc761d4e4687b4ecf039e6d34ab964af80810d8d558a4a8d6f7 Y = 2d51233a1788920a86ee08a1962c79efa317fb7879e297dad2146db995fa1c78 Digest = 0250f93e6932887df519921f9a8dcff110be0768dc351ef73a940a579fae2d20061759e892e289c3e4ba5f7fe17d6ebb15c5931d48db55ebc81549f6637292fe R = 4b9f91e4285287261a1d1c923cf619cd52c175cfe7f1be60a5258c610348ba3d S = 28c45f901d71c41b298638ec0d6a85d7fcb0c33bbfec5a9c810846b639289a84 Curve = P-256 X = e31096c2d512fbf84f81e9bdb16f33121702897605b43a3db546f8fb695b5f6f Y = 6fbec6a04a8c59d61c900a851d8bf8522187d3ec2637b10fa8f377689e086bba Digest = f91b09107d10904d3968ec29f85e456ac4e828f32e8da3db6a13f5566bfa625e2ad03f8dad5425a073c0d61d25de63dcafa9f4fcd206f29e9cb6b0fecd74aa57 R = 1b244c21c08c0c0a10477fb7a21382d405b95c755088292859ca0e71bab68361 S = 852f4cbfd346e90f404e1dd5c4b2c1debca3ea1abefe8400685d703aea6c5c7f Invalid = Y Curve = P-256 X = 633c2ee5630b62c9ce839efd4d485a6d35e8b9430d264ffe501d28dbace79123 Y = 4b668a1a6d1a25b089f75c2bd8d8c6a9a14fe7b729f45a82565da2e866e2c490 Digest = 575c64df58c8dc517ce65b388fa3ed69470163afecbabc3fa94b497ff7f3fe36ff12fabe2b84cebbf667744195091e4e2335a71d36414e0af0d0260fc8e8ea44 R = bf2111c93ec055a7eda90c106fce494fd866045634fd2aa28d6e018f9106994e S = 86b0341208a0aa55edecfd272f49cb34408ce54b7febc1d0a1c2ce77ab6988f8 Invalid = Y Curve = P-256 X = f78dce40d1cb8c4af2749bf22c6f8a9a470b1e41112796215dd017e57df1b38a Y = 61b29b0bc03dff7fa00613b4de1e2317cfbf2badd50dee3376c032a887c5b865 Digest = 4c097f2f5b2489c94258b34d529675bb5d77d4be083b51b01188dd42b4b5473982728763ee6fbad479375c5eacb5edaaec0b6583a10b19aad81ec88dde2d0e7f R = 4a96169a5dea36a2594011537ee0dc19e8f9f74e82c07434079447155a830152 S = a204eaa4e97d7553a1521d9f6baadc0b6d6183ba0f385d8593d6ca83607c4d82 Invalid = Y Curve = P-256 X = 3fcc3b3e1b103fe435ac214c756bdaad309389e1c803e6d84bbbc27039fcf900 Y = 7f09edd1ec87a6d36dc81c1528d52a62776e666c274415a9f441d6a8df6b9237 Digest = 1a3dd21cb6ac1fa7fc196319cf534b7608afb93805420fcb5250dff453564a5b22e22971a3ce6dd222405fea018cd0508d86c561eca15e1ac7d79c14e916b86a R = 1cac13f277354456ae67ab09b09e07eb1af2a2bf45108da70f5c8c6a4cbcd538 S = 5d83752e540525602ba7e6fee4d4263f3eda59e67df20aac79ca67e8899fed0d Invalid = Y Curve = P-256 X = 5ec702d43a67ada86efbfc136cf16d96078906954a3f1f9e440674cd907e4676 Y = 
05a62044fed8470dd4fca38d89d583ce36d50d28b66ab0b51922b21da92c56d9 Digest = c5c016f6c9b525987dd835131def77cc72d8360d364eeccdd7af8b95712b6cd487c0b846201f3b64466fd140833514ae8d765da395fbd9d3c03ca410effa9a69 R = 75f3037298f1457dba55743999976a1c2636b2b8ab2ed3df4736a6d2934acc83 S = 19d43ad168dda1bb8ac423f8f08876515234b3d841e57faef1b5ab27359b27ef Invalid = Y Curve = P-256 X = f63afe99e1b5fc652782f86b59926af22e6072be93390fe41f541204f9c935d1 Y = f6e19ce5935e336183c21becf66596b8f559d2d02ee282aa87a7d6f936f7260c Digest = 9eb2f9fa96a1f3ffcef9600522730e86d26d328ec0c1bf2fbfe55a38754610341fda1b894fdcf10c9bc4f48819010fdcf0d24f27ff539e40c6855cafbd306386 R = cef4831e4515c77ca062282614b54a11b7dc4057e6997685c2fbfa95b392bf72 S = f20dc01bf38e1344ba675a22239d9893b3a3e33d9a403329a3d21650e9125b75 Curve = P-256 X = 6d11b09d2767cf8d275faee746c203486259f66dd2bfa3a65c39371a66b23385 Y = 4eb05c73e05261e979182833f20311e5366f72f4b949665ff294f959375534c6 Digest = 0e71b28b0a1eac7aa881c09daec616c93d9a9286b5f5fdf2642d211021b125fa884b2595b73c7c3e649e61cd7157ef6660076a3b87ddf830db46533f3aa30afa R = 15a697cdb614e11c0810e1e764cd501fcabc70874c957587bc4883d9438e177f S = 7bf6244f92bc768063cecb5336c8eaacd23db930b28703560f241c7d93950dfd Invalid = Y Curve = P-256 X = f3899caba038efb534c4cea0bd276814ffd80194473c903b81af11c8c05cb6e6 Y = 6ea6b17402fcf2e8e737d11ffc7c2ed3b2d0bc3b8f271a381f4294cff62682c3 Digest = 104ace16689d785df09a81c5cf47a496db30fbd696aa4df080219487575a23641436e70329dd1c13290582c0d03aae200e51189d43666c86f38a5203c16cd7e4 R = 57b99380452e1d37b133c49b9ba493dee8630940477ca3351a43d90b99871e6a S = df599c3a37105af3ecc159b3b685ccb3e151b7d5cf2d97147974ae71f466b615 Invalid = Y Curve = P-256 X = 1fd6f4b98d0755291e7a230e9f81ecf909e6350aadb08e42a3262ff19200fbd2 Y = 5578fef79bc477acfb8ed0dc10c4f5809c14dc5492405b3792a7940650b305d7 Digest = 761a54f3718985b6d7bcfdd57d6c4823f854831bd29305fcb07e34e3f825d451fca28a62ce9582e3957d89ea7c1bc1afe3aa58fd2fa18566974600fc394cf2a8 R = 97a99e96e407b3ada2c2dcf9ceeeb984d9a4d0aa66ddf0a74ca23cabfb1566cc S = 0ecac315dc199cfea3c15348c130924a1f787019fe4cd3ae47ca8b111268754a Invalid = Y Curve = P-256 X = 2dcbd8790cee552e9f18f2b3149a2252dcd58b99ca7dc9680b92c8c43aa33874 Y = 5dbc8bb8813c8e019d80e19acdb0792f537980fecde93db621aaf1f6d0e6ee34 Digest = 45b082e804443b53a82229cdf13e4c5f8f31fe93170cc8a23f63eef506cb7748388e1a971a2f81e3daa324cf2bb69118f7418f40df66a24f50c34a55e1416c3a R = 2bdbd8b0d759595662cc10b10236136ef6ce429641f68cf6480f472fcc77bc9f S = 7e7df0c8b86f7db06caf1610166f7b9c4c75447f991d5aaf4dea720c25985c8c Curve = P-384 X = 6881154cfe3f09affbee04cd387b27b7854326faf8906c4b9c9e6ac2c632e0d59717b3f33f6d747d7b7cbb4e4dc01fb8 Y = ba295ae0966f06ad9d84b3bb4da7f99b56044c99f88d71082cfea6964ea3c63bb79806a6a41fcc314b55b3f64f82b68a Digest = 8a6429d55885146f7aab582a1aa9360fa9591b0a R = 2112385a75d4edda89ae2bc3c74524dc792544a3a52fdb588da3f0feaee6a11623db275e2ab8abdd998cc42a29c60856 S = 8d308a3987b81c595f8cec19898b1a42da8eda97496af280033b0f915283f171fed7e2a221fa9c78927962189333f437 Invalid = Y Curve = P-384 X = 2f2f43f244ae027c3d2ec5c900393f80a8ad0e9b9a12a047195d29a39f2b7026b071688dd9a6764379d02a5ed8035ec1 Y = e43d45851bc76c37d34dbed996a65ffcfbbaf0e2cbfbc9f62d2116bdf3b330bbef5acdbcd0aa6d949f771daa17cda1e3 Digest = 5f41322db1a276042ae807f0f0d6f1e04cb5cd26 R = c011c52e9cb02048957a233704ff9a2c1d4c56e08ebb083aa8ba351f041a23a7d0da19088ac6c60ea2ca117531c7cf35 S = a66ca9bf06c35d129a8253a0f793acf681e482d9994868b275a230b215286e03a66a0de77c7a53174375137fd4688556 Invalid = Y Curve = P-384 X = 
9a5e1932d318bfa7986f0dac4489c6f55775427bb60fb24bac7646b9994bbc3a9b5cd15e818cc4e832afc1c3fca9abae Y = 64c89e7c3399c136b2718ab675944207157f0bf23d9e2a807ae7ac3bef81da7ec3c56c2d2c08afc53301af2a3cc71861 Digest = d36ef9ee70a3b61ba31cdfcd0cac6e49331a407f R = 4cf6c63fea6c80efc105cd99afe2b53da05ae16566ddb20b9d40a076575ffac419b6807fa336fc6e7c7416c59775ef09 S = aec2d96054b4b23c49faaf9903ccf63bc96281fb7c1b9d14daa54bba51bb2b2f4d3a901f3b0b9cb2b62976459219350c Invalid = Y Curve = P-384 X = b3aeff27b65540c6da10a88008404b1d49239c87fbf47932518fb87a9bb132403d1f310f531d086340bb4a68c3e64b9b Y = 567e75f442fcd81017b8adc4cce634f5ffa3cd497d38221d34dc1f43aef99133131ff1b197f7b9f37beecae5c438849a Digest = dd0f9c326fb50593fd0a0df31abeeb00a22eb956 R = 3b94a2514eb915b71e18c867ad7f508a35375c5bcd4b797b86054798569870b2477e2ac14406628017d829400efc63b2 S = 179a10441a0beea3b375248e697e0d19e24bb68184c373fe4302839b97dd7353a5a25929c2733796b0c0d8211bd67c51 Invalid = Y Curve = P-384 X = 0874a2e0b8ff448f0e54321e27f4f1e64d064cdeb7d26f458c32e930120f4e57dc85c2693f977eed4a8ecc8db981b4d9 Y = 1f69446df4f4c6f5de19003f45f891d0ebcd2fffdb5c81c040e8d6994c43c7feedb98a4a31edfb35e89a30013c3b9267 Digest = a871caf9fff9856031a79a55b96753c1a34ccb73 R = 8d9d3e3d0b2b2871ea2f03f27ba8699f214be8d875c0d770b0fff1c4ce341f0c834ac11f9ec12bfdb8320b1724c8c220 S = 62150dfba8e65c0c7be7ef81c87241d2c37a83c27eb31ccc2b3c3957670a744c81be6d741340b5189cc0c547df81b0d2 Curve = P-384 X = b4b92211edbd41c5468d2ba70810bc37b5e7c954c7bd0db80c4fa89ccba10bf07cdab953828a068bc0104d28e4040c14 Y = 93ed318efce3dff98fc782b788d78658ea5ecde4f716e2d5d0ec2d87a2e761daa1f1658cfb857762caa567baaccf9924 Digest = 765343d50541bc2c0e20193648048016a95e7588 R = aa3978eabd196ddf9cab2815cc9cbab0b61cd639deaf70e093a10a58ddf9f410ee1ab965ff8fbb98efbe812421a613d3 S = 02761a2947e1855806b8a25b9ebb0762be9f5517461a371e5783f34b184f32c4ea684b362119b1a2d8a3ff439f10291f Curve = P-384 X = 63b4cc14f9efd3b8f29e65806591d1e9c54f34a3f5231339bcdbfa4109c42d946a59cdd7bbd2591fd1b2383a0819772f Y = 55ab3d208109da6ef039c23cddd52a5af619266d8fe066dcabb1af885ad5501401a78c44ed3b5fff2892fdcb2a3ac8b2 Digest = 4535ef8d7396b4f2af65660ebbb56f356cacefd9 R = a3f9b840fd7201356f35b5dde39027410aad26ac61919c14fe7b0535bb74e7218cb3312bfa60aac63f14166f32ceff26 S = 1b1bcbcb0237fad4e406c8d4e3e39b55642d8535afa9ccbc9c601cb4e01891df79f1bc792687cb3a5ee7703565c4a13b Invalid = Y Curve = P-384 X = f82f82f8f7454ce7a94a040ec0bbb52d49e3b9f8ddd095704973c760ee6067a5c28369656f22d70d8bb1cd70ef9bfea0 Y = 0e36e256d02870ee5646a17aac4b280c9d1d2e1d4803eb3cb32e7f754cc889522120efd7c4d8a82e509a4d8f266d3ce4 Digest = 26302c41e6da59e2df2e26c12382738880be94cc R = 27a2332f3c59464f5dfe7bb1201a3936248d375bde603724c048eb8f7c0c2be3ed4b56c14b51d7d68bd2554526b36d9e S = e1f90367b0cc530c545f95163d9ffb1208c943685d5ae221052b83ee40953397be581e5979c9855b20246e9d26d57acc Invalid = Y Curve = P-384 X = 7d40b51127cb1642dd8538d4124138a2f49c41b4d12f702c1b0cec8deba50c3712e01c2e1e693e00438af0e86025da33 Y = e734b5939b673c45dd32baf20d234f01b7124b391d14beea231e9c604e813fc83b3a77b0cb1f2ce4873a69b0165e369d Digest = 0b30b209147432207a72177997d28d6f1d03330f R = abf16821b6657e0005071f78c679cbbb130bee6e7ca63526eef0f747fb721feefe6258dae1aa02064a700e963bd9dedf S = 3f7e61c34a30cc5ff7a8be375fcc9c38a76dbc0c30a4356843421ca37a7bcf24edcd41d8235903bb522fb6e5a8033885 Invalid = Y Curve = P-384 X = a5b59d59599c105e39f61354da99c7c9135c749cf996cc2252eb83b008299cdafbcb44227d2d2c4a5ffa44823922893b Y = 0399fb0edcbfd0b76b524f22b7b87ddbb4fa02f510661615312a4492eb3f2001e0fc0e479f77c33a88f9a7e20757373c Digest = 
44aa3083d111bbce7feb412af74a782cd320becd R = a4c9cac2409a9bfea1ebe28fec4e19545f08cd18fdd31048f52a3f2d32b2ed859dcae4dc12fb2fecabe542c4f03191ba S = b4d83f927ad1980d96cbb0ccc36aa640f786293b8b19e4dd97a797d192b420f630a5e42ac42d8736e7d42008f445dbc1 Invalid = Y Curve = P-384 X = 29178ce9127e1048ea70c7d435439e9ff9915387e51b7e5ca10bfdafe53565978eb3784d9a4226f443d4834f4d451685 Y = 5cc2970589a453488649711bdf3cdac9a200519aae65b1c6bd54fed0d965755b36b74d978d674275bd71a03e8f054b0e Digest = c679b4a0e61406c4869d721192bd314d77e1cb39 R = 5d6f5e9a94d9c92a0890c558bc0408b3405cd04e33f663df16701e80520e4394f1c54d3c8225d36f4753a799aaf6ff90 S = d895b1cc522ceec6a7867867b8f603245c6e4d48945dfc43af721ebae4683d40a3c21b905ca3bd4b974d36806825b2cd Invalid = Y Curve = P-384 X = 9f03569f8c6ca2c16d707f0ca36a8a8cf214a9d5c14034829d709e283cd675eb4e3090c6b973429efdf476c0782e0a7c Y = e1b842536731e91596782787d57af17db85dc92fd2fb95ac65339174aee66775ce0a4721d1faeb29da968ea5eb705e59 Digest = ae1a63f88a59c7da5d9f512d11bbd5d75dd1f583 R = 31ccbe22a360b1786dac89394c6ef4ed6604943e50837395f96052821f6182914840096e90f2ad650917bd91d7bd4cfd S = d97199a6b952dcaefb1defe23def92bf2ee236ad18046a2ccf8924d42ee10a62e70ffe7f3c909b11112278f160d98b7a Curve = P-384 X = b85e78a935d169dd5ba8f558f964b21c07804464816f9231233184675f557463a8b00470ac0ca8278cd008f4642e7962 Y = 8edf7be8584c5f207939d479e65173e2e69673090a8538fa93efb4432127895d92b4e4cf13b7632a830e9a33b37f75e1 Digest = 811685f7ff2701e692f6830a33d8712d0432cd5a R = fd2876b250a94ced71734aa7a0d32423b2c6f039c926c557e748f38e23bbdb46e17d1204832c6f76c3ea854e1da23979 S = 76409e381799502c81194ba87540aec0b89fc4680dd683780d49f82a46a7191b40f5f06ccb02e45e704c31fcd59382b9 Invalid = Y Curve = P-384 X = 0c74aaa0527524cb6171ab741896b405a6ac4615e474cdc09c9457b18bed33c6383e1b92f2fa1306e8e5dcd1667e45fe Y = 7b00d934dfd876f6e07dc0582b20ed650be104fa603a5a1255c62b6059d2685aa9773f1ba31254d213c815d0efc8ed93 Digest = 328029316d73d1b8d2b8927d12332036e5671384 R = 832c62b0f34986eda9d1ace5068a0c5318051b0d0166d3dacf137ac072cc359f109ad6e17059e700bb1958bcf4101246 S = 6bb56f4eb550688ea66e5dd09aebe7e0b39e2716b4697ebb68f113e080f0ff26fd0fc947a34f3c5a8a2f10e07dc1405e Invalid = Y Curve = P-384 X = 4104de08b4108ee26ee239e0a5d340c1b1aa48b1b3b40717debd6ed3ff0d777923c106f857a3830ce7f3d08d0d6d7908 Y = 00498c38393e6393edcf254804558f86e461df1f5a6557bc5144f8d2f3806413d372b6ce417d531c08a52d1e38e8b949 Digest = a13ebaf4431c43b684d1e18e610a75fd7527200e R = 9924a3273248db20db007309560a0e616572ac799d773529a5215786cf4a6e03cc73bea81d4810c1eee4b5e975652eee S = 6cc8ea4c4c56da87c25946a198e86917227bcb90da7be1dcde7b6547bc45a98e8175dd54af15bb6ef955b4cb48b7bb0a Invalid = Y Curve = P-384 X = b6bc9418f3da0cce38a65f1b52bb3a9d22a0368e02f5f12fa1f1303ac67df1cffa55d049a782bf5bddb5e841b125aed6 Y = 3b578a0560280a2958a14286e10faa7f5dec77fd8d90123aff5780efa8a636cee833fc9f10d7a164f1254a483b613746 Digest = 7b44de2e448107197558cb071bb5bec9a5849467827d29b2c6625708 R = 6602090aec001c16e5f6e7e3e488bed5d1702d36b258b6a8a2d8392a5ff30a6af12fbf4308d67eed6aaa8b7be8b831c5 S = 65d0c3bb1910ba0b7cc108ae1ccaae63405ff01a8df91021e17cd46aa6f8ca8f4eaeac6d6fc26fc816a3ea537fd9576b Invalid = Y Curve = P-384 X = b4ab83a4ded7d76aa15eaecb1bafe59427d3cfc38564af9123cb707da2405184acd40a6c093ba29e321ba0f67c1e0c6a Y = 26e2902499495f8550e798617a44ac9990c4c1cc3527dc0dd003a15aee3cbd3955151f7863de1692a94aafd3730e7665 Digest = 8f902a34f36d7cd36748d5ddcc8fba6040be223a462842d506f185d1 R = 61e48d5a100049578e820768ea57f30f27ffd1a1f839fabc55e8f4816c9b95d042619cd3bcc7180fd99834e344f53e7f S = 
977b81d43216f31d8bedc3ffe873047817de3441df8b80a321aa0a80931f25a15c6628f43cf8e48d5c6aeca7626b0a18 Curve = P-384 X = f886f36fcf34e8df2a7e09220051b9981a3a6f693ec5999f28864e012c13896d633c9564f0118a95631cea8355b25b20 Y = 746f9a77835325f18338dee5dc88a9b086b858ce15b4e4462a98844bb01811195f4fae0bee8f457c32823e142210dbb8 Digest = 6a80377d3c7f0e6a50f6dc1656cef5a0d33cf7934441244f69f0062a R = 665390653ed280b8f6bd3718d8423f26cb38d2d7faa10fc0f094295677d9dafad45fc64cfc22ded56afdd86a77cf3c33 S = 864f0eb3a8d93c388d987cfcb60bba76098039d46bf4ff4be083961f70a29e724c25cf56685802b7b5be048107ad52e3 Invalid = Y Curve = P-384 X = 5fc835a2f5429adb719ed22f11dfcb02731da6759a8ea75c21d1af9631187626c31e191f4dcdc183df01c48e13dbbce6 Y = 9ed2d03df1cbeaefd4478b8106e90f92e0b6e958145cb81b9648aef0b96b71d1d55918564694b1987d68cc8e7cbd7dd1 Digest = 807f609592e2ededa12792a7006a6db641904e86a1df3cec477dfd3c R = 94d9dedd27f2d014ba84ea58d2e88d68f3e86ba88b93750e50255211effe88b0a0e2f62017f22965726cdc77c55bca4f S = 14814bd09d9b7ba81b2485777cc588b5c0a4064df95c63f18a8bfd57494cd0f40c5bda9dc6c01ea72540f57a354360ef Invalid = Y Curve = P-384 X = 0b86851d7c19f0f04a16e5e2903a36d09bf1863e152d87936fb2d74cf916bcf6dedf3c066d242f7dd327df0fcb42270a Y = b0c93480740bb635e6c25fb61630fdfcc462a1418366a51b1265656f721e18ba89ebf754c7dfdad865a252c884a6c4fc Digest = c34e896a31fc4de7596679e12bb2416a51e58e8942eabd5cb01f0737 R = 33fa5fe3e495076e90f4b62753d3cdc7603aa7f5b407dbf89a854b9521d15e6c381d3cf28f103035dc4291ae318c5f82 S = 30919a2a3fae71e1afe8378aedcaa08fadfab6c6bf954031452d4fe514969ede2acf0347a2f1e81abf1bfb9d8bd55a36 Invalid = Y Curve = P-384 X = 6f8f2fc40d1db28309c8850bf94d77c01c5449b4fc556e6bf50e5ee805209c4489d8ff9bd781699eb0e42f6a962d56fe Y = a4c7c77271dbbe7e00d1c6e4287dddc5463c6803a577a18f89a5eea01c6addc12404353abbc128cb9cf2496732312d65 Digest = c19cabc6141b2adf67fe4bd0a3fead50473dea8cb0276de1fdc467c5 R = 327c4642019a635d80dab82f7dc22e3102a3c1ba684c2b6de67d3d3009a17d39ae3d58ca2caec9f6f03f5ba3b406178c S = 6b1af807cc7265cc6d3049959cd7779ae0de819036647f9510b0e9f7e4c0e3fece5fc3741b68881145a2c944dc5c54d1 Curve = P-384 X = e98ba8016a976dcc3c50127d2af792969835b1096b1644b37c004d1786f4fb1026233f33ad56cd9444ba0a332c92efb8 Y = 54bbcb78ffa3c855dd24bf182376ff5d28dd7b7551e4b05a19549c9f59c83dcc12a43092d63c5967fc0256612475b7d4 Digest = d8d9319d3f705d03dfc992e8e7596586200fb1574f2a918350deb268 R = 3b76a0c0ece2348085f3554fc92b9e5b0fe84801ab2adf1d239d7c81c9697b62285e8e5667774559d1bbc6e86f2ade64 S = 91d929e42f8223ccc74d4cb09ee7eb619d3a348886c21091ec55d36164ad3cc04e1da6edd88ad89710a908ca4bc00333 Invalid = Y Curve = P-384 X = b8d7a836715635a8b095d3712817aa9e6ffdd98d24be2db751bb0c1fad42b082542500ea255cde17525ec159afca7002 Y = 1a526c876d4771157b4f66e3056485c95066d4bd1e73e991ce6d5d3642807efe80015c52ef3cf8c86e57ab9a510ec86a Digest = fe23e8ab9dc934144247930a48babb0d8ba57703c2bef60e0e9a1e2b R = 9e36f47ec1b7ffdc6e3472f3cbec913494c0bbaa0c073f597e01845b5a3107c0e23a4575de4f2b582e1c2fe3067ec048 S = b013cf51008a89b379a2a6b519b8d229ff0374401eae21a8da350fe35756b94168e7fafbd81f0f681f21c056941a82eb Invalid = Y Curve = P-384 X = 4ffdecf5d5f7c1164297a93742c8a685bb425b97fdfe85f630dab2064ab29e52a0df34629c2531048c288216723fc9bf Y = 84fcff3e7e478a6932ace6f6b0ab70e61d8a5137b76886c59e721d938e0e252e2f7e57c2ab7dab90493446ad85c3fe4c Digest = 28d44c363bfb2e36bc59bb68c56e8b5d2587f149839fd3b8c05d9eb3 R = 7d909d9aacf064c32d070c3149ace8b8f5d83b2006e8460b84c4bce664fc20e91c61ac8b415965b6155eddbe9238fe3d S = 19d909e358e71985179dab9113941ecad21e4f3608cb3a32dd065868af1657df8e06aa86855ac7ad757a7f8fb568a953 Invalid = 
Y Curve = P-384 X = e805e0733fc156bd582faaf794e58d4630ce73fc383cdc964dd337728f774e4989a697d79665a3282ee6e0ee343d6c7b Y = 43821b7b9a6ce1ddf0c59ada552668a0cfc85a87a610b5c36b7a691947116b49a4099340306e53494fc6b496cb8d12b0 Digest = fd1bb27d666e3d40f5bd19d8c026a3614404b9edc11e582eb80b044c R = 3d4fa4ec95b55feac607fddc618d6f4eed71da65dc49d732e64460e5c80c57dc4421c64bacf3ef1e22995fd19c2a3cf5 S = b11898ba475f2b28402d038afc15f171b99aab93437b35a2f8a3b89f42fdb7f93a0469d9da7652882000dd5bb1e8b9a8 Invalid = Y Curve = P-384 X = e15c7ef9791b9392c3e97389f2597ee161545c267e584b94262870ef25fda348f72349f396c27ac884fa8d776387fdd8 Y = 107b4a7da8be564a14f9c45e4df5cc9b62f0671b3f2c0573c33fa37f985fefd1ae3ff2640947ebb12dffda72757db6af Digest = 3d9611421379fc93226fff23f5fe472a33f6bdc759d5705f7e9a2be3 R = 9d715fd1a3668283fa83c407242e8d2a4f3fa1bf41919ca4101114bd0e0ac1b16c4379edb11de5210eee8618d42e9ed1 S = 2dc37f453c8cfe01ea80c56d1865daf0f28847b12970132a1853c3ed80da6693e0da47a2476207947f29da34d68d604a Invalid = Y Curve = P-384 X = efcb97dd73106b0a2be4f665c496352f6938da9d0fa97690dc0e8d018b06dce2ba8d19b93ddfe889d549a33e64497c31 Y = 66a0cb7e64f40470b6d09b9e12f217b59e9e6615af52fbdc4ddcb379e77809361eca2093a3e24c7103e971567018400f Digest = 5598b06acf834ffbb2e50784fe2bc493fa51967f7ffadf1ece63f9b2 R = 4ea5d4faf8ee52540db2f4c6283cea5302a3540a56e14c8a7533441c248465be99e10f23bba85be9634efaba7a8b172e S = 4c98a2142ecaba7db44c78658efffc1175f810a147306ba2e6498553526adb1507d7a99a372e0f84c8dbd160ef7fd5bf Curve = P-384 X = 4e916a3cf2561580b49ecc52321db7103292fd2fcce8dd4d6f86be6035808e0df51c3c4ac1894f0b08ef6ebf953e0d18 Y = 4e6f28895d024b4c71220b27052ddd4bf6115a260825acade48c043b3e06d2b6b8e4ebdf465980f3b013cb575d475bbb Digest = 1668ee6ae19c2d6f23b9184b6895ede8f55549b23095d53ef89487f6 R = efce00544ebe0d98ba6015c07e3e9d09af808d49a0820c22ef572a3ef9c8a684b377bef1f8b3bbddb734b9b0bd0b1cd4 S = e80d0e183b3f00098308e20e5b4ae393a07f1d1a8defda9a9d10f19b3e5236e42f593b1dc57f6718dd8d4583f0175ff7 Invalid = Y Curve = P-384 X = 3c6528c82d9d5e8dddf41a211c70f78604d81f49853bdc746270f1340a2a645dca3bc7844c3680268fa5973cd1758313 Y = 4b9e697f1caf83d3224486bb0a8cd6a7c56e47c91043d8cba3aba51b6e504441d37abcc9b7b2d49b9126463703e514a0 Digest = 1b39217bcc5dc841b32ddf00245623c581f19cac8a4ecd03eb2c07f0 R = 848814c01c3d18534f39bcd53a8736db16f0f77a015a0e578cbb2f831739723e83b29cb6d4eee7822c76ff056d0f467d S = 05beb19f766bd1d4ec5e65786042258298a2dc617e3f13d8e2f0f4b50d934565f3162c737fa791a81897397f29305943 Invalid = Y Curve = P-384 X = 80c3f6488dcd76f33cdb75e30f8452ab9a3bd6110f14e25179b0aefe4c19c60a07b4af10844b130b0b75a7024e341298 Y = 6c85a17ad4bbefb33910250e05ac02a17c892c3380712d06dd070843dff0d040e219dae78679b774cd5eff0adb67189a Digest = 23cd0066d1d88702c5d4461deff89aa5662b517806a04c4da30e0d82 R = bc444deb0c7dd9f96f20a7ffd3ddb35a1189316655531860c39b5f87f09992106985e5562e083ee9f538c8e2d5363c52 S = 91adde5d47eae80a98661f4347fd6e4778478c3d4aff3cff8aa92e2345a8e03cd4ab64adfd38e461bb98b496516439e7 Invalid = Y Curve = P-384 X = 97c3f446803a61a7014f61cb7f8b3f36486c7ea96d90ee1767f5c7e1d896dd5114255abb36c74be218c1f0a4e7ebba3d Y = 553ed1fed72c62851e042f0171454f120029adba4ee26855ab881d9470355f1947aa1d2e806a7ff2583660fedbd037a0 Digest = 647eb206a8477440b4bd048d00f37dca8635b15c2a8e79e2a9d74fb9a5553211 R = 7b06d6c2b63f1cc3bfdaa897d07dc15a83bdf35d979f70c34578332b3f4920422bb24867c51bde10831324df424e04ec S = 4bef715161f400dc98d4b63bd13ff4ad4a6c981ead44bfc662fe9bca4b56cd790698e4deddf9a4bd69327f26bfe801e6 Invalid = Y Curve = P-384 X = 
08bd5c6cdc1f8c611df96485090e20e9188df6abb766bff3c1ba341ed209ad5dfd78b628ec60998ddfdd0dd029352fbd Y = d9831d75dec760e9f405d1aa5e23aac506dc019fb64d44bd57f6c570d017e6609f8fdbb2dc7b28ca9e00e37cd32a3b73 Digest = 9a4985f744dd6f2774cb6f20ad6b6969e212abf4ac035b72ad3f8b1955ae1862 R = 8b372c86ed1eec2163d6f7152e53696b4a10958948d863eb622873b471702ac5b2e75ff852149a499e61510905f98e4c S = b2ed728e8b30787a28f2a6d3740872e47348686c7cb426411379411310241d25f08a026b853789b1157f1fc1a7f6ff49 Invalid = Y Curve = P-384 X = 10a784abb3c549444a62c28df1c926b8aabb20c8d9aa4b1f7ca830258857cbe9718dbc9845fa9cbb78587a373baee80d Y = a1ad0c10b5ab6780cad49c8cd3eebd27de8f1b382ddd7a604458cef8e76ca632a7e44e1c63141a742426cec598029e2e Digest = f5b47101b4ff9baf64aca830b6afbc4f9620035d88a1d84a12cefa6f7f99faf2 R = d9e52be2a3f7f566899cf6daaa38116d092473066f3a1bf91f3df44d81bca1deb438d9d25ce1632599c1d3576a30f128 S = 0cad30bce4b3d7f40b3eef762a21bb1a3bad77439838b13024b7b2c70316875a99e80723a74a9e7a404715ca06a5d673 Invalid = Y Curve = P-384 X = 8760182393132d69011edfa127e36f92eeac8272641c27f52f3337ef8af7451e6d14f4e4590c7eb9fafb76e8c92865cf Y = ebc2b123ed871ca570ead40ae8f6f32335393c569b21b38f626d09c064a3c8668e9fb10a4667e0f0c68bf25ca98fd6dc Digest = 979131ca1d07e0b4ac6f27b20a978e0a230159eec4906db5dbd22b10ec71af87 R = 1db957e5c2d294035d7f476a0cbc28a4aac2614d8212de5017076cd836bf04ffe237dce8fec91f2fb5ef82449ff1c65d S = 3e3b9058d0a9c5b417f9c6f86557b9d50e7a902694a7012a1be6bb70708497e4d39fc1f6d6bc60dfa52d23cab173385f Invalid = Y Curve = P-384 X = 2b1f98d2acdda8347b9a68c75174408eae7de3d6b9c08c26e73ce9ed2ac147b8d90cd82e30ab43909d63f6b457de2071 Y = 33f5e6f5f5793201991e014cce0045d04adc352298e32f45f4e374450111c8456b5c2efaec43d157949b5c191b2bc934 Digest = a1daaf888d93a2a7e52bcd2a66cca3ff2e02916616d1919adefdd7257490e5b8 R = 23d046402cbce807d232bcf0dc96d53c72992e0ba1ffce0d79050c0f4c5ad9bfbbdc1c96c730d67ff3aa3edaa3845da9 S = 2cd46a4fe5d120b3af3a6d9ea63cc78f4079e8b5520a8fa96828334a4f182ff4d5e3d79470019e4eb8afc4f598b6becb Invalid = Y Curve = P-384 X = 86ac12dd0a7fe5b81fdae86b12435d316ef9392a3f50b307ab65d9c6079dd0d2d819dc09e22861459c2ed99fbab66fae Y = ac8444077aaed6d6ccacbe67a4caacee0b5a094a3575ca12ea4b4774c030fe1c870c9249023f5dc4d9ad6e333668cc38 Digest = e3bcded61cbb0bf6ec20d59f91e8e73e532f15b082b89c984c1b51fb0d1db8a9 R = 798065f1d1cbd3a1897794f4a025ed47565df773843f4fa74c85fe4d30e3a394783ec5723b530fc5f57906f946ce15e8 S = b57166044c57c7d9582066805b5885abc06e0bfc02433850c2b74973205ca357a2da94a65172086f5a1580baa697400b Curve = P-384 X = 9e7553eab8cc7e2e7396128f42ab260c6dbb5457cbff2070ea7c0db21def1537939e3f02699e5dd460eca3798d08bd6d Y = 892c0c8e47dddf858e89099a8fc1026e8b8333532b22f561f7647f63f9c79dbf5e8dd18fbfe6ff34902233119c5d5aa3 Digest = 0f2a9b447ea5cfcfb9e67d661d7f0752befd3b4e3454fe40b9ae1eca47806025 R = 2452da6a48c3749b66e576e0f1f768d51728be17aea149164c4e1654c5ce27f625a4610c4a2eeddb3a0626d3abc6c37c S = 499504fb58c9db24a7ff5f7921e1312f8aa583c08a308e080f5ef1acf5cdae7927c4101573db069ab0b6de7f4f1cab38 Invalid = Y Curve = P-384 X = 0cf4dc51e71185a29c0c6fa3c075d9da5bd7ede085053344dce5dbbe8329e8ac9045f7246c9d0efed393b8e113c71429 Y = fdb7917b73974b355cf9f3bef6a0a460c2d39fdf1fe32a7744be0a54ddd1cfa8d03914cff4b5ca536b40707ff2629aa4 Digest = 331aefe2369b9c5ee6dd9f850259b3b8512f5934434e61573f97fe2c1cd2b147 R = 3812c2dc2881d7ef7f621993b161672329b261ff100bbd19fb5826c9face09aec2017b6843d69336b813b673c5402527 S = 5dc102fab9d6325131c556ec00309c2959d1031a63fbc1e2d5d04996d3234ed33875c0ab98e5878e9bc72742519ed398 Invalid = Y Curve = P-384 X = 
6c590434988155236b43147389c6dbfdd27dcd3387e9b4c2587ece670753a542a13a736579887791cf53d31e5ce99994 Y = 35a20194ff3f1b55f7ffb2758ddd4b98dd0d9e0cc213e10ed25e8e0430fe861066c1d4423c67f0c93f7ebd87fd3c561e Digest = 153475076a003545d3ca3d4a772866f12cc85f6e69f8c486a91a80fd709206b1 R = 89ff866889245e797926509e563b1746920b78c9370a6cdae52663730d131e558e327d1f5fef8faf9e6c802fa29504ed S = 8dd68e2de2f788e598b3e5a60c18d81849a0cc14b3b0e3c931910639f3125e5d6045f00330b1fa989252a80f95419b04 Invalid = Y Curve = P-384 X = 499cbdf18ec4e69b88051543c7da80845fa2de8be2b9d9045fee7f104a8b5b7d04e69142de9955c5ab18c5a34ebff075 Y = a29cb8d28836b201a389922b6f8f93870f09c80a00242d00d32656a43ac1440fc55bcb123551a73290f603c3469be9ed Digest = 5f00b3b48c1ee8287abe6f3fbc3438b91f4268f318ae2aa1e7810369d6716020 R = 25d4d243da6fd9b439a9242c3656fade7acb7a306e8cf23ea89e3ff4f9330be19c61aaa42d7b426d12c8e0f96b80dae5 S = e7a99cf4b269bb4a6210d185e9654602523b5cfa1cddc94b1db92018aa557ecb6adda44c816975f5ec1756b6df3c44fd Invalid = Y Curve = P-384 X = 9a74ea00203c571bd91ae873ce0ed517f8f0a929c1854d68abd3b83a5051c0b686bb37d12958a54940cfa2de23902da7 Y = 6f20ccf8fa360a9ec03d7bb79ff17ad885f714757ef62995f824908561dc0c3dffc49d873627936a2fff018b82879ced Digest = 45c3a1b29a18780234f12f5e4b64e7af9de2acf0029ce55b706cc79a7e4df994 R = acc1fcac98c593fb0a0765fce35a601c2e9570d63ea1e612fff8bc99ac2d4d877750bb44cfb1014e52e00b9235e350af S = 7f53de3afa4146b1447e829ebac8f5645e948cc99e871c07280cc631613cfdaf52ccaeccbe93588a3fd12170a7ec79fa Curve = P-384 X = e22f221809fb7a054ac799a70b3d24744eb7c5096c8671770399527c88ccf9ddaea0257a0ae9430d927ff5d9f109c533 Y = af4101d60df9b306ae92da7592f4faf3df422a3e33f1c2ed2973b2b900eefc346b4cf024de650abf537cecd12ac77618 Digest = ef1057d83a6e6481be7caf2c12c15f085ff971f02f0db8544352558e2b9fd61c R = c39a8e79f0560b9f26504469a470c7b2230c0d25de07c206e87dfbde9aff0a5d85322f56dfb50d4c1fc67c67d615dad7 S = 2ad94dd13a39cf4f4cb24c2c81d4c1181652363addd856dc9ba7455458e40ed047cd113129bc87f43949d5a98a0d5205 Invalid = Y Curve = P-384 X = fa8ebc3682d90ac7356f0b75b9e3376e76518676e0bedd176cfa7fa57fea4b3a399dbb2bf735ec90b9c1705cf9fa6f57 Y = 18c3fbca0150ec10696b3851f31fb3ba62c0b6be509d249e0d4b374c7a08e49338e0922e2a8a9319999e6569ab8d292e Digest = 0c7152ec620fe9b783625196b41192dd5d49df184ad26965c970ac5e28bb1c4b R = fb58ab09b8a7ef7a6ec05b854eae11af9b713f7c7540e25115f609846e636ad4f88dcf4dd61e311273df23ccda474f03 S = 485be4c21b7c3a9c6b39ffc9f0c39f4050f76d2a6b3fae203d016318c541c1b4ad6cfc0d0950636ff6883895dd49e4e9 Curve = P-384 X = e5f331536a2940cd67234bedf813c12e15aefa9a1a68429f8754bf2769a47c9c2efb5c42135e7b01a110d7302e097eac Y = 63b2398612c863febd482184e834d3acb51408c49aacbbd35d8719746f37cb13e013c9505ce034cd815aacd10d2f7a0d Digest = d925955406f6b6dd4df05270a2539a5924830dfbcbf6a5a34f21354db246244b R = 96c35f22d036785a392dc6abf9b3cfb0ad37b5c59caefcc0b5212e94e86739a2674020ff79258094d90d7d59f09d47a1 S = 373cbc865384734c56952f7a35a1fdecd88e8b343ee3aa073d30f5f25b73506f1e5f5857f668b0080dec6edeb5e1be96 Invalid = Y Curve = P-384 X = c53ad865beb1e2b92764065f1a6bb465ee94aacabe43426a93c277d02e00fe36be1c859ba08a031fc518a0d007668979 Y = 6728d42bae9bc097151748ffa0982964bdd16076fa0e7cc15837c1f773b08d02c3dbc57339091ccc34105b84781150b4 Digest = 6d5fa5b492406a1e93df6bb6364d7b17a24ef43807a1159acc77486dd7b49b60 R = d4f0dd94fc3b657dbd234767949207624082ff946de9ce0aeb0d9993b8c7d7935760e1bf9d8b233bc7d6cd34928f5218 S = 0941df05062aa8849610f4b37d184db77ed1bc19ad2bb42f9a12c123017592bf4086bf424b3caad9a404b260a0f69efb Invalid = Y Curve = P-384 X = 
1f94eb6f439a3806f8054dd79124847d138d14d4f52bac93b042f2ee3cdb7dc9e09925c2a5fee70d4ce08c61e3b19160 Y = 1c4fd111f6e33303069421deb31e873126be35eeb436fe2034856a3ed1e897f26c846ee3233cd16240989a7990c19d8c Digest = 8cf5e81c6858b8395421d8c913f1ac887e282b5818eab525fb79feb9bc64bca7eb98f94b9e48b705e6c28311bb0ca672 R = 3c15c3cedf2a6fbff2f906e661f5932f2542f0ce68e2a8182e5ed3858f33bd3c5666f17ac39e52cb004b80a0d4ba73cd S = 9de879083cbb0a97973c94f1963d84f581e4c6541b7d000f9850deb25154b23a37dd72267bdd72665cc7027f88164fab Invalid = Y Curve = P-384 X = cb908b1fd516a57b8ee1e14383579b33cb154fece20c5035e2b3765195d1951d75bd78fb23e00fef37d7d064fd9af144 Y = cd99c46b5857401ddcff2cf7cf822121faf1cbad9a011bed8c551f6f59b2c360f79bfbe32adbcaa09583bdfdf7c374bb Digest = 965b83f5d34f7443eb88e78fcc23479156c9cb0080dd68334dac0ad33ba8c774100e440063db28b40b51ac37705d4d70 R = 33f64fb65cd6a8918523f23aea0bbcf56bba1daca7aff817c8791dc92428d605ac629de2e847d43cee55ba9e4a0e83ba S = 4428bb478a43ac73ecd6de51ddf7c28ff3c2441625a081714337dd44fea8011bae71959a10947b6ea33f77e128d3c6ae Curve = P-384 X = 9b3c48d924194146eca4172b6d7d618423682686f43e1dbc54ed909053d075ca53b68ae12f0f16a1633d5d9cb17011ec Y = 695039f837b68e59330ee95d11d5315a8fb5602a7b60c15142dbba6e93b5e4aba8ae4469eac39fa6436323eccc60dcb6 Digest = c68382d0641ffad850c41365a8ec68e3d55acba376d1bb941e7dcdf7b71f37b8288b023b942373a40be1dfaaf4aea633 R = 202da4e4e9632bcb6bf0f6dafb7e348528d0b469d77e46b9f939e2fa946a608dd1f166bcbcde96cfad551701da69f6c2 S = db595b49983882c48df8a396884cd98893a469c4d590e56c6a59b6150d9a0acdf142cf92151052644702ed857a5b7981 Invalid = Y Curve = P-384 X = 5140108b93b52d9ad572d6129ed6564766f8df3755e49fa53eba41a5a0d6c1d24a483c90070583a66e3cfa52b6fb1f31 Y = ff52498446a40c61e60c97554256472625633eda0c1a8b4061481fecfbe9c4503e99dfc69e86c9e85c8cc53dca6b8dc4 Digest = 4b945020c329a61221060e924ec682eceb842c09537fe26265ad084753b89f7650cee4e8df30b38126984d80fd25d246 R = b2726b2ba9da02de35e9953fc283d1e78700860d4c33dce8db04dd41499d904866c1b8debb377f6c0dfcb0704252174f S = 0775b027068d7ad55121a278a819f52099ace750d5e996eaec9dee7be72758736cf769650148fbd5c411beb9b88f979e Invalid = Y Curve = P-384 X = 31f4fc2fac3a163a5796f5e414af6f8107ab5e4a98c755d81efa9d5a83c10128c16c863190112fc29d3d5f3057a2edf1 Y = fe208743f3e96c3a34b5fff78c9716c074a1ce3dc01c3f0e471ddfae91cd88e7dda38dd0e5e1f91b00b8539da3cc10bc Digest = 2d6affdf541609f649dbe9fd5829059bf42021fcfefee42d8c9cd5c127015c06b4c3c13ef56d08767788955887752e44 R = 706911812ec9e7370234efd57b2855975eab81e9c2fe783aa8e442dc6e7d681dab2dc0dfc6765f87ab67001108e3facf S = 42c89efa22d853d32f619c9fe13e9852889ac98a9fed5d4fa47fed238e1cbe70d7970af9f7bdf84e51176af4885f2490 Invalid = Y Curve = P-384 X = 1f7911dcfe63a6f270cf75b8584d9b1b4a00afc1fa43543c945945b8a821ebeb37fbc705a000f9cc7c35f7d27027b7bb Y = f11835ec80c4ac06d99247e73bf72522109ac255e6109262de4dfbf9619244f74fb6c9ee57694537d7e79c248db34dc4 Digest = f4b0a912331e7fc59a7071e5f47c9dafa6dc09b32c5c3d05301b3833bbe0b9168e2b63f12248849572a322b2f5423b8d R = 3587c9c6885adf3be1086825f9a41ccd2edfa0bd95e7fc4dba5a9710f41d539132de7772f14c18e318f8992b66d2a86c S = 73a844d729599d4e3e3c1b63e9c4bf5a73d1f69e0160857fe63a56c381c051f5c37ea6b4cc4caacb6ff26ef9699efe30 Invalid = Y Curve = P-384 X = 2039661db813d494a9ecb2c4e0cdd7b54068aae8a5d0597009f67f4f36f32c8ee939abe03716e94970bba69f595fead6 Y = e2d5236e7e357744514e66a3fb111073336de929598eb79fb4368c5bf80814e7584a3b94118faac9321df37452a846fc Digest = cae50a424395e38bde9ba31fa5ea0c107ccceaff06663719162aac2c3e15f2b2cfd376f90d371326e1d29e0392a756ee R = 
164b8ac2b34c4c499b9d6727e130b5ef37c296bd22c306d1396c6aa54ca661f729aa6353b55d7cf1793b80b5a485115f S = 4e7187f8f735b7272f2c0985315b5602bb9b1a09f32233aa10570c82d1ccedef6e725800336511e47f88ddbbbdc08f54 Invalid = Y Curve = P-384 X = 46dcf8ee848c6459fa66d1cae91ccd471401a5782cb2d3b9b9264189f0e9ddf7197b05c694931bde3306240cf9d24b7e Y = 79d9508f82c5ead05c3f9392f3b1458f6d6c02f44420b9021d656e59402e2645bf3ba1a6b244ddb12edbb69516d5873b Digest = 039fe89dfc54e7f2162545af700a8c49a1216b08854643656b07d74e7032516fd0c9368c5e5ce54655e4d08baa29b6f0 R = 5ffba3b5bd7c3a89ec40b47884b0b3464e8abb78608c6d61e1e62c2ca98d44fcdf61825d69dffee8408d0849d0623bac S = 0d2597b5fc3842ffce1957172253a8c9c0e4dbe770ce54f70f139e0545dc34ec639d609e14175bdb2b812ccfda00c9d4 Invalid = Y Curve = P-384 X = 097cea75f685cf4d54324ad2124ce3f77b1e490bbaa1ffacde40dd988f7591e1c5d158e6f232500d958762831914af7f Y = 716d8bc056daf69ca2edd21b89a6ae9923cfcae87bfda5f9a6e514dd4b9d28d164fcc613ca2afb9660adfece59f09b66 Digest = 02afb35f1df33b3d83df3391ca4184121ca52f520dd12ffc891aee77eab6503f232a5b1231bd997239751f46c4133edb R = 1c5d4561d2a3af8835839b543098c101c715c545eb7d00300c5cb05bb08dac29e732ffdc31c50915e691999ad505104c S = c3442f2fb1498fd47c2f959edff37a19783e3ccee80dc6955ca64db087fd188e67358e7b9223535bbb858d21ba6a978c Invalid = Y Curve = P-384 X = d2e2b3d262bb1105d914c32c007ea23d15a98197f0ed90b46a17f3d403e406a76c8f752be1a8cd01a94fd45157f6511a Y = e585fba180017b9983b4c853ad3a5dd52e079c5f0ef792d1a0213b6085e390b073de1a4b01749ceab27806e5604980fe Digest = e66b11b84f87c38526438e5e3c5b4521248c358eaab80e40526906a05fb29d14d4e5686681f03bc3f0025d45dfb83b5f R = 49c001c47bbcee10c81c0cdfdb84c86e5b388510801e9c9dc7f81bf667e43f74b6a6769c4ac0a38863dc4f21c558f286 S = 1fb4ff67340cc44f212404ba60f39a2cb8dcd3f354c81b7219289d32e849d4915e9d2f91969ba71e3dd4414f1e8f18f7 Invalid = Y Curve = P-384 X = cd887c65c01a1f0880bf58611bf360a8435573bc6704bfb249f1192793f6d3283637cd50f3911e5134b0d6130a1db60e Y = f2b3cbf4fe475fd15a7897561e5c898f10caa6d9d73fef10d4345917b527ce30caeaef138e21ac6d0a49ef2fef14bee6 Digest = f6325d6bcaaaf1aba1197a290b33974f2fe8af200d5d726e78705904e9894ec31988e35dc76b9976834b7cd1c4c67146 R = addfa475b998f391144156c418561d323bdfd0c4f416a2f71a946712c349bb79ba1334c3de5b86c2567b8657fe4ca1f1 S = 1c314b1339f73545ff457323470695e0474c4b6860b35d703784fbf66e9c665de6ca3acb60283df61413e0740906f19e Invalid = Y Curve = P-384 X = a370cdbef95d1df5bf68ec487122514a107db87df3f8852068fd4694abcadb9b14302c72491a76a64442fc07bd99f02c Y = d397c25dc1a5781573d039f2520cf329bf65120fdbe964b6b80101160e533d5570e62125b9f3276c49244b8d0f3e44ec Digest = 709d1bf45b5817f5a67b859651eb47133ebed2622fda09ab66d3467b5e95da50ecc2c74d8f4d289feebec29729a4bfa3 R = c6c7bb516cc3f37a304328d136b2f44bb89d3dac78f1f5bcd36b412a8b4d879f6cdb75175292c696b58bfa9c91fe6391 S = 6b711425e1b14f7224cd4b96717a84d65a60ec9951a30152ea1dd3b6ea66a0088d1fd3e9a1ef069804b7d969148c37a0 Curve = P-384 X = d1cf635ca04f09b58879d29012f2025479a002bda590020e6a238bccc764478131cac7e6980c67027d92ece947fea5a6 Y = 21f7675c2be60c0a5b7d6df2bcc89b56212a2849ec0210c59316200c59864fd86b9a19e1641d206fd8b29af7768b61d3 Digest = 5d54d236db6ab4691b3d50dc81471c5d388e5735ebdd435e9742a5a8a0ad0e841bab57326c8535a680ada57d2b3a70fa R = 6101d26e76690634b7294b6b162dcc1a5e6233813ba09edf8567fb57a8f707e024abe0eb3ce948675cd518bb3bfd4383 S = 4e2a30f71c8f18b74184837f981a90485cd5943c7a184aba9ac787d179f170114a96ddbb8720860a213cc289ae340f1f Invalid = Y Curve = P-384 X = d15ca4b2d944d5539658a19be8ef85874f0c363b870f1cd1f2dc9cb68b2a43a10d37064697c84543e60982ab62bb32c8 Y = 
062fb7dfc379fc6465302ac5d8d11d3b957b594c9ef445cfe856765dd59e6f10f11809e115ac64969baa23543f2e5661 Digest = 67cf9e6f9e9558a379ef7361771323a4f3925f2c7a5d94d9156bf2d9d45f9f8fc4d47322da622fbce92fc764a2ccc327 R = e2cf123ce15ca4edad5f087778d483d9536e4a37d2d55599541c06f878e60354aa31df250b2fc4ed252b80219552c958 S = 696707a7e3f9a4b918e7c994e7332103d8e816bbe6d0d1cf72877318e087ed0e230b0d1269902f369acb432b9e97a389 Curve = P-384 X = c83d30de9c4e18167cb41c990781b34b9fceb52793b4627e696796c5803515dbc4d142977d914bc04c153261cc5b537f Y = 42318e5c15d65c3f545189781619267d899250d80acc611fe7ed0943a0f5bfc9d4328ff7ccf675ae0aac069ccb4b4d6e Digest = e8d6b550271b486e79f6975cff753d49519ed9393b207af7039b4c070cbc2fe7d49dd1bb87f7021e442fadd80ce8a5b0 R = b567c37f7c84107ef72639e52065486c2e5bf4125b861d37ea3b44fc0b75bcd96dcea3e4dbb9e8f4f45923240b2b9e44 S = d06266e0f27cfe4be1c6210734a8fa689a6cd1d63240cb19127961365e35890a5f1b464dcb4305f3e8295c6f842ef344 Invalid = Y Curve = P-384 X = d4e93c4bafb54c06814011309e9f3d8e68b76a5452e364ef05ccc3b44b271e576c9028106b1584f09271c886d467f41d Y = db730ccfdeb6644362f4fb510d5254bfe6f23e891e936132f90f1913e93baa8b1f8c0613a0f0c61a760ce659f22babc6 Digest = d5c82ff11f555ce21c3f20a9ecfa6047cb6895e32fa0fb379f49085a59f61b7c8fa05058ef144cf47db5738fa40f4890cb59695998a2358162bbbf6d7f53517b R = 8d0fd14a59c24b0c2a34b438e162f1f536fe09a698cacfe0760d026d1593265d02f2668d2a5e49ac0b21e93807aa9c18 S = 3162ffd2adc9dd5ec1bb1d97d2b0c27b8ae234235ffb374878d0b76382002ea505e885c178d56a2d7809bd1d83117ef1 Invalid = Y Curve = P-384 X = c665feccf51e6bca31593087df60f65b9fe14a12022814615deb892eedb99d86069a82aa91319310b66588185282dad6 Y = 1e6e25bb8ae7714415b94f89def0f75dcb81d4af6b78d61f277b74b990c11aff51bd12fc88d691c99f2afde7fbd13e51 Digest = ea056beb112fa9aad69c8dfe51ea947b772bf1c11287edcede43a98089d21492ed581edcb6d1823e2873aabba213b84291db3bffa6eac3ae43a92fc2da276a24 R = 0e18c4063137468fe864fdc405ad4e120176eb91b4538b28ce43a22ae1a310cc22a2f7a2b3a0f3d15e0f82038b4a4301 S = 5a1620e42041ce4357daf824befbb2ed65596bcd8214e88726149b26b1f416b9472a8877413f1c3705fc2edf4731943b Curve = P-384 X = a6bbf85e8068151482ce855ccf0ed22988fcf4b162c4b811cb7243b849299e3390a083147fbd68683203ba33588b13ae Y = 5c837ec9f2eda225c83ab2d5f10b1aa5bfb56387deebf27ecda779f6254a17968260247c75dd813ea0e1926887d46f86 Digest = 81b1303e10f25d37877b09f9d82dbd894e40264992d86cc74656ebeef505b46fdf9dec312a7f0a26e3f56a7195d5b01d198c378fff9d049e00cbad9586da20c9 R = 9c11879e59659848274fc1ef5a6a181af813d23708b09a24dc06c089b93b918828dd938a75a34d5a681b0af362dc19a0 S = 9c362231962ba7579c4a874e87bdc60dc15cb2e0677149c8ea31162963e05a6614616f67a5269616071cf095be7ff44b Invalid = Y Curve = P-384 X = 9c1eb5cdb1a873e4c275b7ded8712b9058ee0d9ded06c96a2a8d7c652b82e894e2f918dd8e18138e5c34821744b97952 Y = dd474c93619f02b5d4fe30ea7805c1a13fb80008a81bb5f3eeb95cd11f38841b8e34d64f2c6cc2d6cc2587365eed6b6e Digest = c0f9ae90fe8aaf54962e7d47a832e4ca6e60355e4066cd2b08bff78650d4e4a5d1eb1de296f9f0ef92887e09f82e0db4411aa9c3c6b109159bd39feed40419a3 R = f17b2f2fa3b5c8e9c62a633e5d417139ddf3dafba75b464fa156c99b3948a0aca532c7fd3e14a266eb17e7fa80881da2 S = 01c246866983fa74d6dff38b1ea091f8afd218b5a42467761b147c19a3bb20cd24be8ed1f95f1e61863a709d2d0148e2 Invalid = Y Curve = P-384 X = 20622a293edc96d83fee77cf1ee8077c61d6f8ed0073d53cfb5ee9c68e764c553fa4fc35fe42dade3a7307179d6fc9c2 Y = 710fa24383f78cc4568fe0f4ecbbe6b11f0dce5434f4483712a6d2befae975a2efb554907aa46356f29bf7c6c2707c65 Digest = 5cb8ed471a4001e280a0927faf25183c857b9b2de21c8566e8a1bf04ee085c36db7fab9d8f627898b3bb23c10225305938b56a732659f2cab3fa857d80dfde19 
R = 45a6cf5cef06256139caa709292d1e0f963d176add188572e9c7be29af21a95853a98e23aef0a0850e58d44d60b6d780 S = df8d71cd5ab22fc718070078103483e5258734872ab935435f21ea199018e49a69c064a63801beb0759fde6e2c4a85b8 Invalid = Y Curve = P-384 X = 83a4fecc0bf0a353b0acf6f54094b822f2b12564e172b296f3461cafa7315d7d31d0089b1b4c18ad3c86bd18f539774a Y = e4fd57c5b2937e6fba1e7d72fc3f02352bd79c13611931935f4dfd073b9379f862f2277585137e996e212b5b6533dcba Digest = cd7c623c3c3b52f46be0ebb2b353ff97db3cd7dfc1a059a57668fc50101aeeb37b8aee9ddda8ab611546999a120cc9acb0e2c3df48dee66d5c31a46a7be94bc7 R = fb02804010a570d702ebfbcf3d6cc9d55ddac2bd4b4de56d325e9790571b1737f91d3fa1d4caeec6eea806195aed3187 S = 1fd20fe383e907e77639c05594642798619b2742090919bedeefb672c5700881baf0df19b9529d64bc7bb02683226103 Curve = P-384 X = 208a8c5a6b59458160c5b680116c8b23799c54a7ee8954a4869425a717739facfe4fe24540505cdc133fde8c74bfca78 Y = 22aa7aba797bde1e8389c3c3f8d8d9aa2a914f4d2d7aaf7187ebed9b2761975718ef97660ba0b8a71dee17f2b982e2cf Digest = 007b907b90fa60835d45d2f0201a4486d9782fea4f0a235d97d4968336c5369c6c2e82bded56288a10fd6741f4c15d1633bc92e0196308d9f0490fc2077d3b6c R = 0b4e835ed83151d2bde96e201c54544ba5f301aca853957d3c538c9858fcce796b60fc50f5600a48dcdf13e5bc029827 S = 0270adf02d31d5428d523e13d7d315c1929a1d89bbd0f61eec0b1186abe1c307cbba6b1067a68bc3947e6196d49719a0 Invalid = Y Curve = P-384 X = 80ae47e99107d6148b1088c6694df5c1273ff336b66e45b68a7c65fed735129dadcaf2b900e9f8ec50eff70a5ba89ea3 Y = 47450efb5669bfacd7cbff1f801aafa0812ff88a6ae7b5a1f85e88e19129ed995f509fbf8dec15ce42bbbbd33814c09e Digest = 1cacc8f609080e7b8339529f944850a700977ef9107f40956fb35645e15fdd54ef01755f07a2582d0bf2ca0cb84ee8ab154fe0914dfc9ad7ad5fe54b857d0f4e R = bae6fba7b1485ecdca48219ead3c39295fa9c196b1f0941445b1ac768e33962f68d37f1f1749eaad7200064aa202fb41 S = b411a38d02deb42d1015a7837b033c89d2f37d92c70fa8bb1f592223f7750520b950f30277abfb4155a3ab194b3beca0 Invalid = Y Curve = P-384 X = 45cb6dcca8d2e80ac04536a22f9d68ea2313245550108ddcd32799d154c0a55492e49463e826275bd9bf0d5e380205c1 Y = 6fd124f5a6c745751ccfb3ba4dd9144ea8fd41a4d9a4b34820434da66aa7385e73ffe71e6c11ed1beb6c7af22ce00edf Digest = dd7947a5b9a1c988dd7dff537e15335aacafd3e602adc8373765013f338334dd58aed4fb7144de0007c3410d79f5e78bcd4cf0dd63cc33ed3dd564882e299c7b R = 2c782c4263eeee63657fbf20fa287a1a81fcd14b1d3bae333928ba4fc31abb20edebc130714380608e38ea74309eca9d S = 716113d95bc9dba532bfb470112b0d43d9cd6560ad15e0de2e514994801ff339bcf19ad4ee2b8af573f57c038fbd70f0 Curve = P-384 X = 36c1459d9e9f7b6c1598778c784cbf94661a2b11370c02ee092f6ea0ca20acf81f1ed5048a28a1466a91689df26bc291 Y = d1367418c7b216bd32c6dafc8b2be99d02cab68df990758b2ddd543b7eb6ff6e285b649ffe588b1811b549cfb5f0289b Digest = 242ff2713c03e3d5277652f8e7fb1e5a1f0422b6652e1bdd696e46c03cdd3aaac329b1d88e7aa345ff7224ce6dc6df05c7e9d7dc2665282c817d15a15b8288fd R = 40c338adeb504193444bdb95336177362031aaadc5b7e151e42030df9dd8687f3cb8fe2292fd4f9206989c089d966dae S = be4b2ba251094c24de006c89af2b5c77e6937f36d7bb703b4f8edcfe65d45f4b2fd2486222163ae0ed9e215c0a96f488 Invalid = Y Curve = P-384 X = b5eb6670bb0b0d3aef10e533d3660756b7372a2a081d9d920130034f48202cd43b9e2d1e5893d0cfb322db65ab839716 Y = e28444770396041b489b302786a57fca9a98f19685cb4b455d219151e64645ad30dd3149ec96f3bc90879834b65e58aa Digest = 8d2e653807e87962883956ee3705b2167c50370c3af12eb8f6c26f0f15ede56dddc7d0c9642a1c1c2444b06571fa1a4d47e7884acc7ea3884daaa50940f782e2 R = 0887a13df940907864b425ec0d8f91ac719abcc62b276fa08c5122b38831c8930abd3c8454e98182bb588fc72843717a S = 
a380284eacaa36a34e35f04fbf6e28ffb59176f41ea52d9c9bc1362eccd8e0d699c2e08111d93e9dc2785637b1f4f09e Invalid = Y Curve = P-384 X = 700e8f65e052e918a63a96fa57f4eda849f9f9faca3302d6ead66ebf85838f8145a6d6718a681b7bef73170d7254958f Y = 9e9e10357658913007803859165926cd1e5e92c3a644d834098cb1cbfab466349bf4238a5154cf50ed77c77a78263e81 Digest = cf885fa7a96db595f825a0ccc56b70b60e0e1c30d0a15af636d1f4957328aecb7eeb734d5874bd72ddaf15c357ca36bd42abf387f7b771ea6160e2e23a08652e R = 59be870e0fd684b000cce95c616d9f34674354e9d20db15d204b8a6285ff55258e4eeb49da1573ef1030cd6b2626dcfb S = c0bbbf71d87479d82575458be9f4d686921db7ea458d620271f51ec3f4d1afe3bf25ef9c0c400eb7b92cd7058fb17346 Invalid = Y Curve = P-384 X = a9de6f029445fffcf16349b44095cc83b11e3d0d9f08654b158014803b1cc31b8dfe00b1a8167c6f704d69cdd62c6512 Y = 27336a503a669ba1d1f3619f51dc8aa2a44b2075c682a36f071be486e7dafba9adfac2ce74be0442b7251e99304ffc05 Digest = b7e73f38767f253790e7fff019b4e0e61562aeb97b2b749afec2a61c87ab0e15916d4286c0a13989912f6bafdf3efc6f64ddc3b944f9041266e5abd4480c1606 R = f93a4d2eb94d087f28572847e0099ae2ee944efacdad392ec268c9c1e632e6ccd670c36584e58aba52a4c2b07127d55a S = 941ee89cea6e7ed20213a95482fae134707ddf4d292ab1952ed5464f1f1138669dedbfc9998b696eaf469be5fb240c80 Invalid = Y Curve = P-384 X = e63500d6d13069c01fafc4518f1d429661c5bb6ad1ff0383037ca6a469a5c20c453dce03bf6e4164f7e26f849016b3d0 Y = 83b7b731c2531c3ac61b194cf3db6dc02ccdfa16d9eb49f97bc4ec3fe6c8bd865ea27f1538531ad07dc44fc5107af8e6 Digest = afc0ed355377d0ab0c4f79d420dcf67ad4920c013d5c8afde2287525da4596672927540418a61568b21ae7799d7659f16b85f611bd6e8d2066a55903da0c48b9 R = eb78733e73fd64a6a1f23eba5311af23d26816fb8847671e01fdbd8dc7d5fce1a0823b080ee99e8d75edb3f100e16077 S = bcaedfe599f98b51542c0f94ae1010611c6767ac3abb2bd887399d62fd0f1b3a0e97deb24c95a76de44521bf24c8645e Invalid = Y Curve = P-384 X = 3ebd869be687f82d844416e6816d698d82e1e22a1f451d50b6c146134deb07f05204c0b04e7dc07ebdcfd916531dc7c3 Y = 6e4d7bde063edb7254a82b9d9249d2a2b9ad8988c37a84ac9f7c09daed42b1fd28f7cca1ea8b4f91a66e878224800bdc Digest = 56a61339a35750e95770f28846930e3f594e8d759e07423718734a82b2a80430b0fb3378e40bdcf5c12be135be9a9bec32916b4988a763091a6da7b44631414e R = 575f87a8a7980555a198cfdec279cbb2f89551b5271d242397c29f6bc4bf413dc30312a7e626ef7fc77a9124a79bf9be S = f0b7d759246ad36ba8240c537b1eeb5d148c38d324f48028c598eaef6e49d79ff3f6cfe3a32fbbf6f3ed3aaaec31d572 Invalid = Y # The following tests use digests equal to the order and 2^n - 1, where n is # the number of bits in the order. This is to test the truncated digest not # being fully reduced. 
Curve = P-256 X = e57231383637c82c1ac801724cf7e03e67198f467a9beb60ac13cb582d13afa8 Y = 8f190e090155fcf63810b858bc88e259dc49afef8bdef6fd06d93dddb1991aed Digest = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551 R = 05cc6037bb021f4910ea2e489fab2bae6bb6a2769a97f42ba5736994102b7f10 S = 5db54832ceabf8bccdb8be99b1a49cecff8feee045cb697dec43118e2695b1da Curve = P-256 X = 6e0e2897b9a554ee287cdaf43bfbe25ca8404373971575a0e4b61c61aff5a2fe Y = 23ea7823a411eb1b39f81bbde24c2cd6ac68be2c7eec3a0671c8676131b8905c Digest = ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff R = 16831feeceab2fab1c575e073e944d73ce7e6f3e9b06312088f06159c530ff50 S = 870cb824692638538b1569c6093fcb693c054e8e3b9a919e3bb26798910f66e9 Curve = P-384 X = f4a961c19f9cc4ebe4f43081110955f3cede085a08c1415d726e80b2eb774028c5fc96f092ba3ea7d1288dd57fe1db08 Y = 981398eed0895e09b3b582a0616f3024e51cca7b1ecc347dbf0d24a5f6a222b0c31912f8f5e427d4dde5c6c45212bb10 Digest = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52973 R = 0b77eaff05bbd922dd80525d2ab301cc119318f5a920a12c71c4b5ff5bb77d25a538983df9bdd5984b0d159daf21f1a2 S = 73af85ad03a34b6b3993082bf719018d25d1555717b2d2f2535d0601af06a71ad020eff8232d065ab9d7fc4cd0c0ee42 Curve = P-384 X = 54dd8d7cbf2ccdf1a42f5bbc615a372803b094f6040e3c7b651a61bc6912432c836cf2410ab7d67f543236751d81066f Y = 2219d6257b1c80bf327c96786f2b5d0b5a9b9bf7eee9c853bf66a3bf09520494cb1f7823e4c566d79a617b7e201ead96 Digest = ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff R = 9d923e199d98272e44b8fba382bf3c19660ecb4a9aae3513ff6802a73fef510c15c202807c3f9334b0bce7d6c6a80839 S = 520784e6290d04d9b61993ee5ebc6fa8ff527fb0777c43cdefc7586701e60edb399005a5648ff852de80208232849fbd # The following tests are intended to stress the final comparison in ECDSA. # ECDSA verification computes some curve point (x, y), picking the fully-reduced # representative of x mod p, and checking that x mod n is r. (n is the order of # the group and p defines the underlying prime field.) # # This makes the computation sensitive to values near n and p, and which of n or # p is larger. Additionally, there is an optimization that performs the # comparison mod p rather than n and compensates for the difference. # # These tests were generated by picking a target value of r and x, adjusting # both until x corresponded to a point on the curve, and then computing the # public key by solving for P in ECDSA's (x, y) = u1*G + u2*P. The digest is the # hash of "hello, world" with the suitably-sized SHA-2 hash, so the test vectors # are suitable for both message- and digest-based APIs. # # "x" in the comments refers to the x-coordinate of the computed point, not that # of the public key. # r = 5, x = 5 is valid. Curve = P-256 X = 264d796a0dab9b376d34eea6fe297dde1c7b73e53944bc96c8f1e8a6850bb6c9 Y = cf5308020eed460c649ddae61d4ef8bb79958113f106befaf4f18876d12a5e64 Digest = 09ca7e4eaa6e8ae9c7d261167129184883644d07dfba7cbfbc4c8a2e08360d5b R = 0000000000000000000000000000000000000000000000000000000000000005 S = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254e # r = 5 + n, x = 5 is invalid. r must already be reduced. 
Curve = P-256 X = 264d796a0dab9b376d34eea6fe297dde1c7b73e53944bc96c8f1e8a6850bb6c9 Y = cf5308020eed460c649ddae61d4ef8bb79958113f106befaf4f18876d12a5e64 Digest = 09ca7e4eaa6e8ae9c7d261167129184883644d07dfba7cbfbc4c8a2e08360d5b R = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632556 S = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254e Invalid = Y # r = n-2, x = n-2 is the largest x without a reduction. Curve = P-256 X = 50a50c01132bf79e42b31fb278f7317b29515e9e1c973a41266b69048826fb8e Y = aac53e7df37b5eb25ce4ddb705fc7135c6b1e00a7f56e30744f62f258afa5537 Digest = 09ca7e4eaa6e8ae9c7d261167129184883644d07dfba7cbfbc4c8a2e08360d5b R = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254f S = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254e # r = n-3, x = n-2 is incorrect. Curve = P-256 X = 50a50c01132bf79e42b31fb278f7317b29515e9e1c973a41266b69048826fb8e Y = aac53e7df37b5eb25ce4ddb705fc7135c6b1e00a7f56e30744f62f258afa5537 Digest = 09ca7e4eaa6e8ae9c7d261167129184883644d07dfba7cbfbc4c8a2e08360d5b R = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254e S = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254e Invalid = Y # r = 3, x = n+3 is the smallest x with a reduction. Curve = P-256 X = ce24c99032d52ac6ead23c0ae3ec68ef41e51a281fd457808c83136d7dcce90e Y = 8f7a154b551e9f39c59279357aa491b2a62bdebc2bb78613883fc72936c057e0 Digest = 09ca7e4eaa6e8ae9c7d261167129184883644d07dfba7cbfbc4c8a2e08360d5b R = 0000000000000000000000000000000000000000000000000000000000000003 S = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254e # r = 4, x = n+3 is incorrect. Curve = P-256 X = ce24c99032d52ac6ead23c0ae3ec68ef41e51a281fd457808c83136d7dcce90e Y = 8f7a154b551e9f39c59279357aa491b2a62bdebc2bb78613883fc72936c057e0 Digest = 09ca7e4eaa6e8ae9c7d261167129184883644d07dfba7cbfbc4c8a2e08360d5b R = 0000000000000000000000000000000000000000000000000000000000000004 S = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254e Invalid = Y # r = p-3-n, x = p-3 is the largest valid x. Curve = P-256 X = 768a0d300a595005a520130e50927d403395c8e1e40be997b48fc048410f7cdb Y = 16f217d8e1c02bd887e5de388a17783b182e61b5d534152dc2c4be8d75fdd706 Digest = 09ca7e4eaa6e8ae9c7d261167129184883644d07dfba7cbfbc4c8a2e08360d5b R = 000000000000000000000000000000004319055358e8617b0c46353d039cdaab S = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254e # r = p-n+5, x = 5 is incorrect. r is too large to compare r+n with x. Curve = P-256 X = 0ec505bc19b14a43e05678cccf07a443d3e871a2e19b68a4da91859a0650f324 Y = 77300e4f64e9982d94dff5d294428bb37cc9be66117cae9c389d2d495f68b987 Digest = 09ca7e4eaa6e8ae9c7d261167129184883644d07dfba7cbfbc4c8a2e08360d5b R = 000000000000000000000000000000004319055358e8617b0c46353d039cdab3 S = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254e Invalid = Y # r = 2, x = 2 is valid. Curve = P-384 X = 016d2db67561bc126ad6c344d6eeb2713a9e2892c649af0f015c6b7617f160c8a3b3a88add669d7155025073c5ac5b4f Y = 43bf2ed0088af08645c80aa0a24a567a94ba2d794e9689d3ad4b185bc5d2dd008333e2dd2ebb5069a9b32251a3cac71e Digest = 1fcdb6059ce05172a26bbe2a3ccc88ed5a8cd5fc53edfd9053304d429296a6da23b1cd9e5c9ed3bb34f00418a70cdb7e R = 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000002 S = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52970 # r = 2 + n, x = 2 is invalid. r must already be reduced. 
Curve = P-384 X = 016d2db67561bc126ad6c344d6eeb2713a9e2892c649af0f015c6b7617f160c8a3b3a88add669d7155025073c5ac5b4f Y = 43bf2ed0088af08645c80aa0a24a567a94ba2d794e9689d3ad4b185bc5d2dd008333e2dd2ebb5069a9b32251a3cac71e Digest = 1fcdb6059ce05172a26bbe2a3ccc88ed5a8cd5fc53edfd9053304d429296a6da23b1cd9e5c9ed3bb34f00418a70cdb7e R = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52975 S = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52970 Invalid = Y # r = n-1, x = n-1 is the largest x without a reduction. Curve = P-384 X = b5b375264c09acf145ca91d12ab10a096092a41ec43f4d718e129ea1c12b2dea62c7785efc52f46f009fb1dba133e811 Y = bc0b2af172b4b3068d032a798080e76f4d56f72069519e3c19a43682a41794e52cb3ca139348d6bbc923e6a4f7945cb1 Digest = 1fcdb6059ce05172a26bbe2a3ccc88ed5a8cd5fc53edfd9053304d429296a6da23b1cd9e5c9ed3bb34f00418a70cdb7e R = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52972 S = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52970 # r = n-2, x = n-1 is incorrect. Curve = P-384 X = b5b375264c09acf145ca91d12ab10a096092a41ec43f4d718e129ea1c12b2dea62c7785efc52f46f009fb1dba133e811 Y = bc0b2af172b4b3068d032a798080e76f4d56f72069519e3c19a43682a41794e52cb3ca139348d6bbc923e6a4f7945cb1 Digest = 1fcdb6059ce05172a26bbe2a3ccc88ed5a8cd5fc53edfd9053304d429296a6da23b1cd9e5c9ed3bb34f00418a70cdb7e R = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52971 S = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52970 Invalid = Y # r = 2, x = n+2 is the smallest x with a reduction. Curve = P-384 X = 01b54a697305092bac2939fb906d7471b411c4eba8654169166a5da3810e1fc96795df921f7abbf519be4a027435176c Y = a19012a3518773d508106d4153adee43c3c384fa62ce36a4addea08f593ec9c76b09a6b9c69d29bd7d47eb48e167dd2f Digest = 1fcdb6059ce05172a26bbe2a3ccc88ed5a8cd5fc53edfd9053304d429296a6da23b1cd9e5c9ed3bb34f00418a70cdb7e R = 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000002 S = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52970 # r = 3, x = n+2 is incorrect. Curve = P-384 X = 01b54a697305092bac2939fb906d7471b411c4eba8654169166a5da3810e1fc96795df921f7abbf519be4a027435176c Y = a19012a3518773d508106d4153adee43c3c384fa62ce36a4addea08f593ec9c76b09a6b9c69d29bd7d47eb48e167dd2f Digest = 1fcdb6059ce05172a26bbe2a3ccc88ed5a8cd5fc53edfd9053304d429296a6da23b1cd9e5c9ed3bb34f00418a70cdb7e R = 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000003 S = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52970 Invalid = Y # r = p-1-n, x = p-1 is the largest valid x. Curve = P-384 X = c4fd8e68006b83f7b7b20b731ae405813aa05f6e57374589b36ae1cecd1d49cae1418c22f398188bcf4ef02e89fe7394 Y = dd1164b3707f59e05129fa228b8448031db159985f035d93470dc42b3ab4129f0760c46cf201d42e73a7e33ba7402ea6 Digest = 1fcdb6059ce05172a26bbe2a3ccc88ed5a8cd5fc53edfd9053304d429296a6da23b1cd9e5c9ed3bb34f00418a70cdb7e R = 000000000000000000000000000000000000000000000000389cb27e0bc8d21fa7e5f24cb74f58851313e696333ad68b S = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52970 # r = p-n+2, x = 2 is incorrect. r is too large to compare r+n with x. 
Curve = P-384 X = 4e5e4f1a6e97059a6cf2f4e8129e5c7c64cb84f9994a41ff5bf30b29c1bf5ba6898627c91a23c73e05cd1a43c8f908c0 Y = 06a0aed7f1e63a728f87dbd5360a67571a076ab0b4cde81b10d499959814ddb3a8c7854b0bbfa87cc272f90bca2a2254 Digest = 1fcdb6059ce05172a26bbe2a3ccc88ed5a8cd5fc53edfd9053304d429296a6da23b1cd9e5c9ed3bb34f00418a70cdb7e R = 000000000000000000000000000000000000000000000000389cb27e0bc8d21fa7e5f24cb74f58851313e696333ad68e S = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52970 Invalid = Y ring-0.17.14/crypto/fipsmodule/sha/asm/sha256-armv4.pl000064400000000000000000000402121046102023000203520ustar 00000000000000#! /usr/bin/env perl # Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. # ==================================================================== # SHA256 block procedure for ARMv4. May 2007. # Performance is ~2x better than gcc 3.4 generated code and in "abso- # lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per # byte [on single-issue Xscale PXA250 core]. # July 2010. # # Rescheduling for dual-issue pipeline resulted in 22% improvement on # Cortex A8 core and ~20 cycles per processed byte. # February 2011. # # Profiler-assisted and platform-specific optimization resulted in 16% # improvement on Cortex A8 core and ~15.4 cycles per processed byte. # September 2013. # # Add NEON implementation. On Cortex A8 it was measured to process one # byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon # S4 does it in 12.5 cycles too, but it's 50% faster than integer-only # code (meaning that latter performs sub-optimally, nothing was done # about it). # May 2014. # # Add ARMv8 code path performing at 2.0 cpb on Apple A7. 
$flavour = shift; if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } if ($flavour && $flavour ne "void") { $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; } else { open OUT,">$output"; *STDOUT=*OUT; } $ctx="r0"; $t0="r0"; $inp="r1"; $t4="r1"; $len="r2"; $t1="r2"; $T1="r3"; $t3="r3"; $A="r4"; $B="r5"; $C="r6"; $D="r7"; $E="r8"; $F="r9"; $G="r10"; $H="r11"; @V=($A,$B,$C,$D,$E,$F,$G,$H); $t2="r12"; $Ktbl="r14"; @Sigma0=( 2,13,22); @Sigma1=( 6,11,25); @sigma0=( 7,18, 3); @sigma1=(17,19,10); sub BODY_00_15 { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; $code.=<<___ if ($i<16); #if __ARM_ARCH>=7 @ ldr $t1,[$inp],#4 @ $i # if $i==15 str $inp,[sp,#17*4] @ make room for $t4 # endif eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` add $a,$a,$t2 @ h+=Maj(a,b,c) from the past eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) # ifndef __ARMEB__ rev $t1,$t1 # endif #else @ ldrb $t1,[$inp,#3] @ $i add $a,$a,$t2 @ h+=Maj(a,b,c) from the past ldrb $t2,[$inp,#2] ldrb $t0,[$inp,#1] orr $t1,$t1,$t2,lsl#8 ldrb $t2,[$inp],#4 orr $t1,$t1,$t0,lsl#16 # if $i==15 str $inp,[sp,#17*4] @ make room for $t4 # endif eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` orr $t1,$t1,$t2,lsl#24 eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) #endif ___ $code.=<<___; ldr $t2,[$Ktbl],#4 @ *K256++ add $h,$h,$t1 @ h+=X[i] str $t1,[sp,#`$i%16`*4] eor $t1,$f,$g add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e) and $t1,$t1,$e add $h,$h,$t2 @ h+=K256[i] eor $t1,$t1,$g @ Ch(e,f,g) eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]` add $h,$h,$t1 @ h+=Ch(e,f,g) #if $i==31 and $t2,$t2,#0xff cmp $t2,#0xf2 @ done? #endif #if $i<15 # if __ARM_ARCH>=7 ldr $t1,[$inp],#4 @ prefetch # else ldrb $t1,[$inp,#3] # endif eor $t2,$a,$b @ a^b, b^c in next round #else ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx eor $t2,$a,$b @ a^b, b^c in next round ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx #endif eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a) and $t3,$t3,$t2 @ (b^c)&=(a^b) add $d,$d,$h @ d+=h eor $t3,$t3,$b @ Maj(a,b,c) add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a) @ add $h,$h,$t3 @ h+=Maj(a,b,c) ___ ($t2,$t3)=($t3,$t2); } sub BODY_16_XX { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; $code.=<<___; @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i @ ldr $t4,[sp,#`($i+14)%16`*4] mov $t0,$t1,ror#$sigma0[0] add $a,$a,$t2 @ h+=Maj(a,b,c) from the past mov $t2,$t4,ror#$sigma1[0] eor $t0,$t0,$t1,ror#$sigma0[1] eor $t2,$t2,$t4,ror#$sigma1[1] eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1]) ldr $t1,[sp,#`($i+0)%16`*4] eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14]) ldr $t4,[sp,#`($i+9)%16`*4] add $t2,$t2,$t0 eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15 add $t1,$t1,$t2 eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) add $t1,$t1,$t4 @ X[i] ___ &BODY_00_15(@_); } $code=<<___; #ifdef __KERNEL__ # define __ARM_ARCH __LINUX_ARM_ARCH__ # define __ARM_MAX_ARCH__ 7 #endif @ Silence ARMv8 deprecated IT instruction warnings. This file is used by both @ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those @ instructions are manually-encoded. (See unsha256.) 
.arch armv7-a .text #if defined(__thumb2__) .syntax unified .thumb #else .code 32 #endif .type K256,%object .align 5 K256: .word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .size K256,.-K256 .word 0 @ terminator .align 5 .global sha256_block_data_order_nohw .type sha256_block_data_order_nohw,%function sha256_block_data_order_nohw: add $len,$inp,$len,lsl#6 @ len to point at the end of inp stmdb sp!,{$ctx,$inp,$len,r4-r11,lr} ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H} adr $Ktbl,K256 sub sp,sp,#16*4 @ alloca(X[16]) .Loop: # if __ARM_ARCH>=7 ldr $t1,[$inp],#4 # else ldrb $t1,[$inp,#3] # endif eor $t3,$B,$C @ magic eor $t2,$t2,$t2 ___ for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); } $code.=".Lrounds_16_xx:\n"; for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); } $code.=<<___; #if __ARM_ARCH>=7 ite eq @ Thumb2 thing, sanity check in ARM #endif ldreq $t3,[sp,#16*4] @ pull ctx bne .Lrounds_16_xx add $A,$A,$t2 @ h+=Maj(a,b,c) from the past ldr $t0,[$t3,#0] ldr $t1,[$t3,#4] ldr $t2,[$t3,#8] add $A,$A,$t0 ldr $t0,[$t3,#12] add $B,$B,$t1 ldr $t1,[$t3,#16] add $C,$C,$t2 ldr $t2,[$t3,#20] add $D,$D,$t0 ldr $t0,[$t3,#24] add $E,$E,$t1 ldr $t1,[$t3,#28] add $F,$F,$t2 ldr $inp,[sp,#17*4] @ pull inp ldr $t2,[sp,#18*4] @ pull inp+len add $G,$G,$t0 add $H,$H,$t1 stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H} cmp $inp,$t2 sub $Ktbl,$Ktbl,#256 @ rewind Ktbl bne .Loop add sp,sp,#`16+3`*4 @ destroy frame #if __ARM_ARCH>=5 ldmia sp!,{r4-r11,pc} #else ldmia sp!,{r4-r11,lr} tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) #endif .size sha256_block_data_order_nohw,.-sha256_block_data_order_nohw ___ ###################################################################### # NEON stuff # {{{ my @X=map("q$_",(0..3)); my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25"); my $Xfer=$t4; my $j=0; sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } sub AUTOLOAD() # thunk [simplified] x86-style perlasm { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; my $arg = pop; $arg = "#$arg" if ($arg*1 eq $arg); $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; } sub Xupdate() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); my ($a,$b,$c,$d,$e,$f,$g,$h); &vext_8 ($T0,@X[0],@X[1],4); # X[1..4] eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vext_8 ($T1,@X[2],@X[3],4); # X[9..12] eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vshr_u32 ($T2,$T0,$sigma0[0]); eval(shift(@insns)); eval(shift(@insns)); &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12] eval(shift(@insns)); eval(shift(@insns)); &vshr_u32 ($T1,$T0,$sigma0[2]); eval(shift(@insns)); eval(shift(@insns)); &vsli_32 ($T2,$T0,32-$sigma0[0]); 
eval(shift(@insns)); eval(shift(@insns)); &vshr_u32 ($T3,$T0,$sigma0[1]); eval(shift(@insns)); eval(shift(@insns)); &veor ($T1,$T1,$T2); eval(shift(@insns)); eval(shift(@insns)); &vsli_32 ($T3,$T0,32-$sigma0[1]); eval(shift(@insns)); eval(shift(@insns)); &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &veor ($T1,$T1,$T3); # sigma0(X[1..4]) eval(shift(@insns)); eval(shift(@insns)); &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) eval(shift(@insns)); eval(shift(@insns)); &veor ($T5,$T5,$T4); eval(shift(@insns)); eval(shift(@insns)); &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &veor ($T5,$T5,$T4); # sigma1(X[14..15]) eval(shift(@insns)); eval(shift(@insns)); &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15]) eval(shift(@insns)); eval(shift(@insns)); &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); &veor ($T5,$T5,$T4); eval(shift(@insns)); eval(shift(@insns)); &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &vld1_32 ("{$T0}","[$Ktbl,:128]!"); eval(shift(@insns)); eval(shift(@insns)); &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &veor ($T5,$T5,$T4); # sigma1(X[16..17]) eval(shift(@insns)); eval(shift(@insns)); &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17]) eval(shift(@insns)); eval(shift(@insns)); &vadd_i32 ($T0,$T0,@X[0]); while($#insns>=2) { eval(shift(@insns)); } &vst1_32 ("{$T0}","[$Xfer,:128]!"); eval(shift(@insns)); eval(shift(@insns)); push(@X,shift(@X)); # "rotate" X[] } sub Xpreload() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); my ($a,$b,$c,$d,$e,$f,$g,$h); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vld1_32 ("{$T0}","[$Ktbl,:128]!"); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vrev32_8 (@X[0],@X[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vadd_i32 ($T0,$T0,@X[0]); foreach (@insns) { eval; } # remaining instructions &vst1_32 ("{$T0}","[$Xfer,:128]!"); push(@X,shift(@X)); # "rotate" X[] } sub body_00_15 () { ( '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. '&add ($h,$h,$t1)', # h+=X[i]+K[i] '&eor ($t1,$f,$g)', '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past '&and ($t1,$t1,$e)', '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', '&eor ($t1,$t1,$g)', # Ch(e,f,g) '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e) '&eor ($t2,$a,$b)', # a^b, b^c in next round '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) '&add ($h,$h,$t1)', # h+=Ch(e,f,g) '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. '&ldr ($t1,"[$Ktbl]") if ($j==15);'. '&ldr ($t1,"[sp,#64]") if ($j==31)', '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) '&add ($d,$d,$h)', # d+=h '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. 
# h+=Sigma0(a) '&eor ($t3,$t3,$b)', # Maj(a,b,c) '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' ) } $code.=<<___; #if __ARM_MAX_ARCH__>=7 .arch armv7-a .fpu neon .LK256_shortcut_neon: @ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode. #if defined(__thumb2__) .word K256-(.LK256_add_neon+4) #else .word K256-(.LK256_add_neon+8) #endif .global sha256_block_data_order_neon .type sha256_block_data_order_neon,%function .align 5 .skip 16 sha256_block_data_order_neon: stmdb sp!,{r4-r12,lr} sub $H,sp,#16*4+16 @ K256 is just at the boundary of being easily referenced by an ADR from @ this function. In Arm mode, when building with __ARM_ARCH=6, it does @ not fit. By moving code around, we could make it fit, but this is too @ fragile. For simplicity, just load the offset from @ .LK256_shortcut_neon. @ @ TODO(davidben): adrl would avoid a load, but clang-assembler does not @ support it. We might be able to emulate it with a macro, but Android's @ did not work when I tried it. @ https://android.googlesource.com/platform/ndk/+/refs/heads/main/docs/ClangMigration.md#arm ldr $Ktbl,.LK256_shortcut_neon .LK256_add_neon: add $Ktbl,pc,$Ktbl bic $H,$H,#15 @ align for 128-bit stores mov $t2,sp mov sp,$H @ alloca add $len,$inp,$len,lsl#6 @ len to point at the end of inp vld1.8 {@X[0]},[$inp]! vld1.8 {@X[1]},[$inp]! vld1.8 {@X[2]},[$inp]! vld1.8 {@X[3]},[$inp]! vld1.32 {$T0},[$Ktbl,:128]! vld1.32 {$T1},[$Ktbl,:128]! vld1.32 {$T2},[$Ktbl,:128]! vld1.32 {$T3},[$Ktbl,:128]! vrev32.8 @X[0],@X[0] @ yes, even on str $ctx,[sp,#64] vrev32.8 @X[1],@X[1] @ big-endian str $inp,[sp,#68] mov $Xfer,sp vrev32.8 @X[2],@X[2] str $len,[sp,#72] vrev32.8 @X[3],@X[3] str $t2,[sp,#76] @ save original sp vadd.i32 $T0,$T0,@X[0] vadd.i32 $T1,$T1,@X[1] vst1.32 {$T0},[$Xfer,:128]! vadd.i32 $T2,$T2,@X[2] vst1.32 {$T1},[$Xfer,:128]! vadd.i32 $T3,$T3,@X[3] vst1.32 {$T2},[$Xfer,:128]! vst1.32 {$T3},[$Xfer,:128]! ldmia $ctx,{$A-$H} sub $Xfer,$Xfer,#64 ldr $t1,[sp,#0] eor $t2,$t2,$t2 eor $t3,$B,$C b .L_00_48 .align 4 .L_00_48: ___ &Xupdate(\&body_00_15); &Xupdate(\&body_00_15); &Xupdate(\&body_00_15); &Xupdate(\&body_00_15); $code.=<<___; teq $t1,#0 @ check for K256 terminator ldr $t1,[sp,#0] sub $Xfer,$Xfer,#64 bne .L_00_48 ldr $inp,[sp,#68] ldr $t0,[sp,#72] sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl teq $inp,$t0 it eq subeq $inp,$inp,#64 @ avoid SEGV vld1.8 {@X[0]},[$inp]! @ load next input block vld1.8 {@X[1]},[$inp]! vld1.8 {@X[2]},[$inp]! vld1.8 {@X[3]},[$inp]! 
it ne strne $inp,[sp,#68] mov $Xfer,sp ___ &Xpreload(\&body_00_15); &Xpreload(\&body_00_15); &Xpreload(\&body_00_15); &Xpreload(\&body_00_15); $code.=<<___; ldr $t0,[$t1,#0] add $A,$A,$t2 @ h+=Maj(a,b,c) from the past ldr $t2,[$t1,#4] ldr $t3,[$t1,#8] ldr $t4,[$t1,#12] add $A,$A,$t0 @ accumulate ldr $t0,[$t1,#16] add $B,$B,$t2 ldr $t2,[$t1,#20] add $C,$C,$t3 ldr $t3,[$t1,#24] add $D,$D,$t4 ldr $t4,[$t1,#28] add $E,$E,$t0 str $A,[$t1],#4 add $F,$F,$t2 str $B,[$t1],#4 add $G,$G,$t3 str $C,[$t1],#4 add $H,$H,$t4 str $D,[$t1],#4 stmia $t1,{$E-$H} ittte ne movne $Xfer,sp ldrne $t1,[sp,#0] eorne $t2,$t2,$t2 ldreq sp,[sp,#76] @ restore original sp itt ne eorne $t3,$B,$C bne .L_00_48 ldmia sp!,{r4-r12,pc} .size sha256_block_data_order_neon,.-sha256_block_data_order_neon #endif ___ }}} $code.=<<___; .asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by " .align 2 ___ open SELF,$0; while() { next if (/^#!/); last if (!s/^#/@/ and !/^$/); print; } close SELF; foreach (split($/,$code)) { s/\`([^\`]*)\`/eval $1/geo; s/\bret\b/bx lr/go or s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 print $_,"\n"; } close STDOUT or die "error closing STDOUT: $!"; # enforce flush ring-0.17.14/crypto/fipsmodule/sha/asm/sha512-armv4.pl000064400000000000000000000416211046102023000203520ustar 00000000000000#! /usr/bin/env perl # Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. # ==================================================================== # SHA512 block procedure for ARMv4. September 2007. # This code is ~4.5 (four and a half) times faster than code generated # by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue # Xscale PXA250 core]. # # July 2010. # # Rescheduling for dual-issue pipeline resulted in 6% improvement on # Cortex A8 core and ~40 cycles per processed byte. # February 2011. # # Profiler-assisted and platform-specific optimization resulted in 7% # improvement on Coxtex A8 core and ~38 cycles per byte. # March 2011. # # Add NEON implementation. On Cortex A8 it was measured to process # one byte in 23.3 cycles or ~60% faster than integer-only code. # August 2012. # # Improve NEON performance by 12% on Snapdragon S4. In absolute # terms it's 22.6 cycles per byte, which is disappointing result. # Technical writers asserted that 3-way S4 pipeline can sustain # multiple NEON instructions per cycle, but dual NEON issue could # not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html # for further details. On side note Cortex-A15 processes one byte in # 16 cycles. # Byte order [in]dependence. ========================================= # # Originally caller was expected to maintain specific *dword* order in # h[0-7], namely with most significant dword at *lower* address, which # was reflected in below two parameters as 0 and 4. 
Now caller is # expected to maintain native byte order for whole 64-bit values. $hi="HI"; $lo="LO"; # ==================================================================== $flavour = shift; if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } if ($flavour && $flavour ne "void") { $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; } else { open OUT,">$output"; *STDOUT=*OUT; } $ctx="r0"; # parameter block $inp="r1"; $len="r2"; $Tlo="r3"; $Thi="r4"; $Alo="r5"; $Ahi="r6"; $Elo="r7"; $Ehi="r8"; $t0="r9"; $t1="r10"; $t2="r11"; $t3="r12"; ############ r13 is stack pointer $Ktbl="r14"; ############ r15 is program counter $Aoff=8*0; $Boff=8*1; $Coff=8*2; $Doff=8*3; $Eoff=8*4; $Foff=8*5; $Goff=8*6; $Hoff=8*7; $Xoff=8*8; sub BODY_00_15() { my $magic = shift; $code.=<<___; @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 mov $t0,$Elo,lsr#14 str $Tlo,[sp,#$Xoff+0] mov $t1,$Ehi,lsr#14 str $Thi,[sp,#$Xoff+4] eor $t0,$t0,$Ehi,lsl#18 ldr $t2,[sp,#$Hoff+0] @ h.lo eor $t1,$t1,$Elo,lsl#18 ldr $t3,[sp,#$Hoff+4] @ h.hi eor $t0,$t0,$Elo,lsr#18 eor $t1,$t1,$Ehi,lsr#18 eor $t0,$t0,$Ehi,lsl#14 eor $t1,$t1,$Elo,lsl#14 eor $t0,$t0,$Ehi,lsr#9 eor $t1,$t1,$Elo,lsr#9 eor $t0,$t0,$Elo,lsl#23 eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e) adds $Tlo,$Tlo,$t0 ldr $t0,[sp,#$Foff+0] @ f.lo adc $Thi,$Thi,$t1 @ T += Sigma1(e) ldr $t1,[sp,#$Foff+4] @ f.hi adds $Tlo,$Tlo,$t2 ldr $t2,[sp,#$Goff+0] @ g.lo adc $Thi,$Thi,$t3 @ T += h ldr $t3,[sp,#$Goff+4] @ g.hi eor $t0,$t0,$t2 str $Elo,[sp,#$Eoff+0] eor $t1,$t1,$t3 str $Ehi,[sp,#$Eoff+4] and $t0,$t0,$Elo str $Alo,[sp,#$Aoff+0] and $t1,$t1,$Ehi str $Ahi,[sp,#$Aoff+4] eor $t0,$t0,$t2 ldr $t2,[$Ktbl,#$lo] @ K[i].lo eor $t1,$t1,$t3 @ Ch(e,f,g) ldr $t3,[$Ktbl,#$hi] @ K[i].hi adds $Tlo,$Tlo,$t0 ldr $Elo,[sp,#$Doff+0] @ d.lo adc $Thi,$Thi,$t1 @ T += Ch(e,f,g) ldr $Ehi,[sp,#$Doff+4] @ d.hi adds $Tlo,$Tlo,$t2 and $t0,$t2,#0xff adc $Thi,$Thi,$t3 @ T += K[i] adds $Elo,$Elo,$Tlo ldr $t2,[sp,#$Boff+0] @ b.lo adc $Ehi,$Ehi,$Thi @ d += T teq $t0,#$magic ldr $t3,[sp,#$Coff+0] @ c.lo #if __ARM_ARCH>=7 it eq @ Thumb2 thing, sanity check in ARM #endif orreq $Ktbl,$Ktbl,#1 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 mov $t0,$Alo,lsr#28 mov $t1,$Ahi,lsr#28 eor $t0,$t0,$Ahi,lsl#4 eor $t1,$t1,$Alo,lsl#4 eor $t0,$t0,$Ahi,lsr#2 eor $t1,$t1,$Alo,lsr#2 eor $t0,$t0,$Alo,lsl#30 eor $t1,$t1,$Ahi,lsl#30 eor $t0,$t0,$Ahi,lsr#7 eor $t1,$t1,$Alo,lsr#7 eor $t0,$t0,$Alo,lsl#25 eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a) adds $Tlo,$Tlo,$t0 and $t0,$Alo,$t2 adc $Thi,$Thi,$t1 @ T += Sigma0(a) ldr $t1,[sp,#$Boff+4] @ b.hi orr $Alo,$Alo,$t2 ldr $t2,[sp,#$Coff+4] @ c.hi and $Alo,$Alo,$t3 and $t3,$Ahi,$t1 orr $Ahi,$Ahi,$t1 orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo and $Ahi,$Ahi,$t2 adds $Alo,$Alo,$Tlo orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi sub sp,sp,#8 adc $Ahi,$Ahi,$Thi @ h += T tst $Ktbl,#1 add $Ktbl,$Ktbl,#8 ___ } $code=<<___; #ifndef __KERNEL__ # define VFP_ABI_PUSH vstmdb sp!,{d8-d15} # define VFP_ABI_POP vldmia sp!,{d8-d15} #else # define __ARM_MAX_ARCH__ 7 # define VFP_ABI_PUSH # define VFP_ABI_POP #endif @ Silence ARMv8 deprecated IT instruction 
warnings. This file is used by both @ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. .arch armv7-a #ifdef __ARMEL__ # define LO 0 # define HI 4 # define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1 #else # define HI 0 # define LO 4 # define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 #endif .text #if defined(__thumb2__) .syntax unified .thumb # define adrl adr #else .code 32 #endif .type K512,%object .align 5 K512: WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd) WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc) WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019) WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118) WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe) WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2) WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1) WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694) WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3) WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65) WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483) WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5) WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210) WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4) WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725) WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70) WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926) WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df) WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8) WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b) WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001) WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30) WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910) WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8) WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53) WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8) WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb) WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3) WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60) WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec) WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9) WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b) WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207) WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178) WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6) WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b) WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493) WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) .size K512,.-K512 .global sha512_block_data_order_nohw .type sha512_block_data_order_nohw,%function sha512_block_data_order_nohw: add $len,$inp,$len,lsl#7 @ len to point at the end of inp stmdb sp!,{r4-r12,lr} adr $Ktbl,K512 sub sp,sp,#9*8 ldr $Elo,[$ctx,#$Eoff+$lo] ldr $Ehi,[$ctx,#$Eoff+$hi] ldr $t0, [$ctx,#$Goff+$lo] ldr $t1, [$ctx,#$Goff+$hi] ldr $t2, [$ctx,#$Hoff+$lo] ldr $t3, [$ctx,#$Hoff+$hi] .Loop: str $t0, [sp,#$Goff+0] str $t1, [sp,#$Goff+4] str $t2, [sp,#$Hoff+0] str $t3, [sp,#$Hoff+4] ldr $Alo,[$ctx,#$Aoff+$lo] ldr $Ahi,[$ctx,#$Aoff+$hi] ldr $Tlo,[$ctx,#$Boff+$lo] ldr $Thi,[$ctx,#$Boff+$hi] ldr $t0, [$ctx,#$Coff+$lo] ldr $t1, [$ctx,#$Coff+$hi] ldr $t2, [$ctx,#$Doff+$lo] ldr $t3, [$ctx,#$Doff+$hi] str $Tlo,[sp,#$Boff+0] str $Thi,[sp,#$Boff+4] str $t0, [sp,#$Coff+0] str $t1, [sp,#$Coff+4] str $t2, [sp,#$Doff+0] str $t3, [sp,#$Doff+4] ldr $Tlo,[$ctx,#$Foff+$lo] ldr $Thi,[$ctx,#$Foff+$hi] str $Tlo,[sp,#$Foff+0] str $Thi,[sp,#$Foff+4] .L00_15: #if __ARM_ARCH<7 ldrb 
$Tlo,[$inp,#7] ldrb $t0, [$inp,#6] ldrb $t1, [$inp,#5] ldrb $t2, [$inp,#4] ldrb $Thi,[$inp,#3] ldrb $t3, [$inp,#2] orr $Tlo,$Tlo,$t0,lsl#8 ldrb $t0, [$inp,#1] orr $Tlo,$Tlo,$t1,lsl#16 ldrb $t1, [$inp],#8 orr $Tlo,$Tlo,$t2,lsl#24 orr $Thi,$Thi,$t3,lsl#8 orr $Thi,$Thi,$t0,lsl#16 orr $Thi,$Thi,$t1,lsl#24 #else ldr $Tlo,[$inp,#4] ldr $Thi,[$inp],#8 #ifdef __ARMEL__ rev $Tlo,$Tlo rev $Thi,$Thi #endif #endif ___ &BODY_00_15(0x94); $code.=<<___; tst $Ktbl,#1 beq .L00_15 ldr $t0,[sp,#`$Xoff+8*(16-1)`+0] ldr $t1,[sp,#`$Xoff+8*(16-1)`+4] bic $Ktbl,$Ktbl,#1 .L16_79: @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 mov $Tlo,$t0,lsr#1 ldr $t2,[sp,#`$Xoff+8*(16-14)`+0] mov $Thi,$t1,lsr#1 ldr $t3,[sp,#`$Xoff+8*(16-14)`+4] eor $Tlo,$Tlo,$t1,lsl#31 eor $Thi,$Thi,$t0,lsl#31 eor $Tlo,$Tlo,$t0,lsr#8 eor $Thi,$Thi,$t1,lsr#8 eor $Tlo,$Tlo,$t1,lsl#24 eor $Thi,$Thi,$t0,lsl#24 eor $Tlo,$Tlo,$t0,lsr#7 eor $Thi,$Thi,$t1,lsr#7 eor $Tlo,$Tlo,$t1,lsl#25 @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26 @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6 mov $t0,$t2,lsr#19 mov $t1,$t3,lsr#19 eor $t0,$t0,$t3,lsl#13 eor $t1,$t1,$t2,lsl#13 eor $t0,$t0,$t3,lsr#29 eor $t1,$t1,$t2,lsr#29 eor $t0,$t0,$t2,lsl#3 eor $t1,$t1,$t3,lsl#3 eor $t0,$t0,$t2,lsr#6 eor $t1,$t1,$t3,lsr#6 ldr $t2,[sp,#`$Xoff+8*(16-9)`+0] eor $t0,$t0,$t3,lsl#26 ldr $t3,[sp,#`$Xoff+8*(16-9)`+4] adds $Tlo,$Tlo,$t0 ldr $t0,[sp,#`$Xoff+8*16`+0] adc $Thi,$Thi,$t1 ldr $t1,[sp,#`$Xoff+8*16`+4] adds $Tlo,$Tlo,$t2 adc $Thi,$Thi,$t3 adds $Tlo,$Tlo,$t0 adc $Thi,$Thi,$t1 ___ &BODY_00_15(0x17); $code.=<<___; #if __ARM_ARCH>=7 ittt eq @ Thumb2 thing, sanity check in ARM #endif ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0] ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4] beq .L16_79 bic $Ktbl,$Ktbl,#1 ldr $Tlo,[sp,#$Boff+0] ldr $Thi,[sp,#$Boff+4] ldr $t0, [$ctx,#$Aoff+$lo] ldr $t1, [$ctx,#$Aoff+$hi] ldr $t2, [$ctx,#$Boff+$lo] ldr $t3, [$ctx,#$Boff+$hi] adds $t0,$Alo,$t0 str $t0, [$ctx,#$Aoff+$lo] adc $t1,$Ahi,$t1 str $t1, [$ctx,#$Aoff+$hi] adds $t2,$Tlo,$t2 str $t2, [$ctx,#$Boff+$lo] adc $t3,$Thi,$t3 str $t3, [$ctx,#$Boff+$hi] ldr $Alo,[sp,#$Coff+0] ldr $Ahi,[sp,#$Coff+4] ldr $Tlo,[sp,#$Doff+0] ldr $Thi,[sp,#$Doff+4] ldr $t0, [$ctx,#$Coff+$lo] ldr $t1, [$ctx,#$Coff+$hi] ldr $t2, [$ctx,#$Doff+$lo] ldr $t3, [$ctx,#$Doff+$hi] adds $t0,$Alo,$t0 str $t0, [$ctx,#$Coff+$lo] adc $t1,$Ahi,$t1 str $t1, [$ctx,#$Coff+$hi] adds $t2,$Tlo,$t2 str $t2, [$ctx,#$Doff+$lo] adc $t3,$Thi,$t3 str $t3, [$ctx,#$Doff+$hi] ldr $Tlo,[sp,#$Foff+0] ldr $Thi,[sp,#$Foff+4] ldr $t0, [$ctx,#$Eoff+$lo] ldr $t1, [$ctx,#$Eoff+$hi] ldr $t2, [$ctx,#$Foff+$lo] ldr $t3, [$ctx,#$Foff+$hi] adds $Elo,$Elo,$t0 str $Elo,[$ctx,#$Eoff+$lo] adc $Ehi,$Ehi,$t1 str $Ehi,[$ctx,#$Eoff+$hi] adds $t2,$Tlo,$t2 str $t2, [$ctx,#$Foff+$lo] adc $t3,$Thi,$t3 str $t3, [$ctx,#$Foff+$hi] ldr $Alo,[sp,#$Goff+0] ldr $Ahi,[sp,#$Goff+4] ldr $Tlo,[sp,#$Hoff+0] ldr $Thi,[sp,#$Hoff+4] ldr $t0, [$ctx,#$Goff+$lo] ldr $t1, [$ctx,#$Goff+$hi] ldr $t2, [$ctx,#$Hoff+$lo] ldr $t3, [$ctx,#$Hoff+$hi] adds $t0,$Alo,$t0 str $t0, [$ctx,#$Goff+$lo] adc $t1,$Ahi,$t1 str $t1, [$ctx,#$Goff+$hi] adds $t2,$Tlo,$t2 str $t2, [$ctx,#$Hoff+$lo] adc $t3,$Thi,$t3 str $t3, [$ctx,#$Hoff+$hi] add sp,sp,#640 sub $Ktbl,$Ktbl,#640 teq $inp,$len bne .Loop add sp,sp,#8*9 @ destroy frame #if __ARM_ARCH>=5 ldmia sp!,{r4-r12,pc} #else ldmia sp!,{r4-r12,lr} tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) #endif .size 
sha512_block_data_order_nohw,.-sha512_block_data_order_nohw ___ { my @Sigma0=(28,34,39); my @Sigma1=(14,18,41); my @sigma0=(1, 8, 7); my @sigma1=(19,61,6); my $Ktbl="r3"; my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch my @X=map("d$_",(0..15)); my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23)); sub NEON_00_15() { my $i=shift; my ($a,$b,$c,$d,$e,$f,$g,$h)=@_; my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps $code.=<<___ if ($i<16 || $i&1); vshr.u64 $t0,$e,#@Sigma1[0] @ $i #if $i<16 vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned #endif vshr.u64 $t1,$e,#@Sigma1[1] #if $i>0 vadd.i64 $a,$Maj @ h+=Maj from the past #endif vshr.u64 $t2,$e,#@Sigma1[2] ___ $code.=<<___; vld1.64 {$K},[$Ktbl,:64]! @ K[i++] vsli.64 $t0,$e,#`64-@Sigma1[0]` vsli.64 $t1,$e,#`64-@Sigma1[1]` vmov $Ch,$e vsli.64 $t2,$e,#`64-@Sigma1[2]` #if $i<16 && defined(__ARMEL__) vrev64.8 @X[$i],@X[$i] #endif veor $t1,$t0 vbsl $Ch,$f,$g @ Ch(e,f,g) vshr.u64 $t0,$a,#@Sigma0[0] veor $t2,$t1 @ Sigma1(e) vadd.i64 $T1,$Ch,$h vshr.u64 $t1,$a,#@Sigma0[1] vsli.64 $t0,$a,#`64-@Sigma0[0]` vadd.i64 $T1,$t2 vshr.u64 $t2,$a,#@Sigma0[2] vadd.i64 $K,@X[$i%16] vsli.64 $t1,$a,#`64-@Sigma0[1]` veor $Maj,$a,$b vsli.64 $t2,$a,#`64-@Sigma0[2]` veor $h,$t0,$t1 vadd.i64 $T1,$K vbsl $Maj,$c,$b @ Maj(a,b,c) veor $h,$t2 @ Sigma0(a) vadd.i64 $d,$T1 vadd.i64 $Maj,$T1 @ vadd.i64 $h,$Maj ___ } sub NEON_16_79() { my $i=shift; if ($i&1) { &NEON_00_15($i,@_); return; } # 2x-vectorized, therefore runs every 2nd round my @X=map("q$_",(0..7)); # view @X as 128-bit vector my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15 my $e=@_[4]; # $e from NEON_00_15 $i /= 2; $code.=<<___; vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0] vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1] vadd.i64 @_[0],d30 @ h+=Maj from the past vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2] vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]` vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1] vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]` veor $s1,$t0 vshr.u64 $t0,$s0,#@sigma0[0] veor $s1,$t1 @ sigma1(X[i+14]) vshr.u64 $t1,$s0,#@sigma0[1] vadd.i64 @X[$i%8],$s1 vshr.u64 $s1,$s0,#@sigma0[2] vsli.64 $t0,$s0,#`64-@sigma0[0]` vsli.64 $t1,$s0,#`64-@sigma0[1]` vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9] veor $s1,$t0 vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15 vadd.i64 @X[$i%8],$s0 vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15 veor $s1,$t1 @ sigma0(X[i+1]) vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15 vadd.i64 @X[$i%8],$s1 ___ &NEON_00_15(2*$i,@_); } $code.=<<___; #if __ARM_MAX_ARCH__>=7 .arch armv7-a .fpu neon .global sha512_block_data_order_neon .type sha512_block_data_order_neon,%function .align 4 sha512_block_data_order_neon: dmb @ errata #451034 on early Cortex A8 add $len,$inp,$len,lsl#7 @ len to point at the end of inp adr $Ktbl,K512 VFP_ABI_PUSH vldmia $ctx,{$A-$H} @ load context .Loop_neon: ___ for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); } $code.=<<___; mov $cnt,#4 .L16_79_neon: subs $cnt,#1 ___ for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); } $code.=<<___; bne .L16_79_neon vadd.i64 $A,d30 @ h+=Maj from the past vldmia $ctx,{d24-d31} @ load context to temp vadd.i64 q8,q12 @ vectorized accumulate vadd.i64 q9,q13 vadd.i64 q10,q14 vadd.i64 q11,q15 vstmia $ctx,{$A-$H} @ save context teq $inp,$len sub $Ktbl,#640 @ rewind K512 bne .Loop_neon VFP_ABI_POP ret @ bx lr .size sha512_block_data_order_neon,.-sha512_block_data_order_neon #endif ___ } $code.=<<___; .asciz "SHA512 block transform for ARMv4/NEON, 
CRYPTOGAMS by " .align 2 ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 $code =~ s/\bret\b/bx lr/gm; open SELF,$0; while() { next if (/^#!/); last if (!s/^#/@/ and !/^$/); print; } close SELF; print $code; close STDOUT or die "error closing STDOUT: $!"; # enforce flush ring-0.17.14/crypto/fipsmodule/sha/asm/sha512-armv8.pl000064400000000000000000000366541046102023000203700ustar 00000000000000#! /usr/bin/env perl # Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. # ==================================================================== # # SHA256/512 for ARMv8. # # Performance in cycles per processed byte and improvement coefficient # over code generated with "default" compiler: # # SHA256-hw SHA256(*) SHA512 # Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) # Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) # Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) # Denver 2.01 10.5 (+26%) 6.70 (+8%) # X-Gene 20.0 (+100%) 12.8 (+300%(***)) # Mongoose 2.36 13.0 (+50%) 8.36 (+33%) # Kryo 1.92 17.4 (+30%) 11.2 (+8%) # # (*) Software SHA256 results are of lesser relevance, presented # mostly for informational purposes. # (**) The result is a trade-off: it's possible to improve it by # 10% (or by 1 cycle per round), but at the cost of 20% loss # on Cortex-A53 (or by 4 cycles per round). # (***) Super-impressive coefficients over gcc-generated code are # indication of some compiler "pathology", most notably code # generated with -mgeneral-regs-only is significantly faster # and the gap is only 40-90%. 
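#
# [Editorial sketch, not part of the upstream script and never invoked by the
# generator below.] For reference, the @sigma0=(1,8,7) and @sigma1=(19,61,6)
# constants selected below for the $BITS==512 case encode the SHA-512
# message-schedule functions
#     sigma0(x) = ror(x,1)  ^ ror(x,8)  ^ (x >> 7)
#     sigma1(x) = ror(x,19) ^ ror(x,61) ^ (x >> 6)
# used in W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16] (mod 2^64).
# Assumes a perl built with 64-bit integers; the _ref_* names are illustrative only.
sub _ref_rotr64 { my ($x,$n)=@_; return (($x>>$n) | ($x<<(64-$n))) & 0xffff_ffff_ffff_ffff; }
sub _ref_sigma0_512 { my ($x)=@_; return _ref_rotr64($x,1)  ^ _ref_rotr64($x,8)  ^ ($x>>7); }
sub _ref_sigma1_512 { my ($x)=@_; return _ref_rotr64($x,19) ^ _ref_rotr64($x,61) ^ ($x>>6); }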
my ($flavour, $output) = @ARGV; if ($output =~ /sha512-armv8/) { $BITS=512; $SZ=8; @Sigma0=(28,34,39); @Sigma1=(14,18,41); @sigma0=(1, 8, 7); @sigma1=(19,61, 6); $rounds=80; $reg_t="x"; } else { $BITS=256; $SZ=4; @Sigma0=( 2,13,22); @Sigma1=( 6,11,25); @sigma0=( 7,18, 3); @sigma1=(17,19,10); $rounds=64; $reg_t="w"; } if ($flavour && $flavour ne "void") { $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; } else { open OUT,">$output"; *STDOUT=*OUT; } $func="sha${BITS}_block_data_order_nohw"; ($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30)); @X=map("$reg_t$_",(3..15,0..2)); @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27)); ($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28)); sub BODY_00_xx { my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; my $j=($i+1)&15; my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]); $T0=@X[$i+3] if ($i<11); $code.=<<___ if ($i<16); #ifndef __AARCH64EB__ rev @X[$i],@X[$i] // $i #endif ___ $code.=<<___ if ($i<13 && ($i&1)); ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ ___ $code.=<<___ if ($i==13); ldp @X[14],@X[15],[$inp] ___ $code.=<<___ if ($i>=14); ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`] ___ $code.=<<___ if ($i>0 && $i<16); add $a,$a,$t1 // h+=Sigma0(a) ___ $code.=<<___ if ($i>=11); str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`] ___ # While ARMv8 specifies merged rotate-n-logical operation such as # 'eor x,y,z,ror#n', it was found to negatively affect performance # on Apple A7. The reason seems to be that it requires even 'y' to # be available earlier. This means that such merged instruction is # not necessarily best choice on critical path... On the other hand # Cortex-A5x handles merged instructions much better than disjoint # rotate and logical... See (**) footnote above. 
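# [Editorial aside, not part of the upstream script and never invoked.] The
# $i<15 sequence emitted just below factors Sigma1(e) as
#     ror(e,14) ^ ror(e ^ ror(e,41-18), 18)
# (shown with the SHA-512 amounts; the SHA-256 case is analogous), which equals
# ror(e,14) ^ ror(e,18) ^ ror(e,41) because rotation distributes over xor. A
# self-contained check of that equivalence, assuming a 64-bit perl; the _ref_
# name is illustrative only.
sub _ref_sigma1_forms_agree {
    my ($e) = @_;
    my $ror = sub { my ($v,$n)=@_; (($v>>$n) | ($v<<(64-$n))) & 0xffff_ffff_ffff_ffff };
    my $direct   = $ror->($e,14) ^ $ror->($e,18) ^ $ror->($e,41);
    my $factored = $ror->($e,14) ^ $ror->($e ^ $ror->($e, 41-18), 18);
    return $direct == $factored;    # holds for every 64-bit $e
}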
$code.=<<___ if ($i<15); ror $t0,$e,#$Sigma1[0] add $h,$h,$t2 // h+=K[i] eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]` and $t1,$f,$e bic $t2,$g,$e add $h,$h,@X[$i&15] // h+=X[i] orr $t1,$t1,$t2 // Ch(e,f,g) eor $t2,$a,$b // a^b, b^c in next round eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e) ror $T0,$a,#$Sigma0[0] add $h,$h,$t1 // h+=Ch(e,f,g) eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]` add $h,$h,$t0 // h+=Sigma1(e) and $t3,$t3,$t2 // (b^c)&=(a^b) add $d,$d,$h // d+=h eor $t3,$t3,$b // Maj(a,b,c) eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a) add $h,$h,$t3 // h+=Maj(a,b,c) ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round //add $h,$h,$t1 // h+=Sigma0(a) ___ $code.=<<___ if ($i>=15); ror $t0,$e,#$Sigma1[0] add $h,$h,$t2 // h+=K[i] ror $T1,@X[($j+1)&15],#$sigma0[0] and $t1,$f,$e ror $T2,@X[($j+14)&15],#$sigma1[0] bic $t2,$g,$e ror $T0,$a,#$Sigma0[0] add $h,$h,@X[$i&15] // h+=X[i] eor $t0,$t0,$e,ror#$Sigma1[1] eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1] orr $t1,$t1,$t2 // Ch(e,f,g) eor $t2,$a,$b // a^b, b^c in next round eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e) eor $T0,$T0,$a,ror#$Sigma0[1] add $h,$h,$t1 // h+=Ch(e,f,g) and $t3,$t3,$t2 // (b^c)&=(a^b) eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1] eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1]) add $h,$h,$t0 // h+=Sigma1(e) eor $t3,$t3,$b // Maj(a,b,c) eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a) eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14]) add @X[$j],@X[$j],@X[($j+9)&15] add $d,$d,$h // d+=h add $h,$h,$t3 // h+=Maj(a,b,c) ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round add @X[$j],@X[$j],$T1 add $h,$h,$t1 // h+=Sigma0(a) add @X[$j],@X[$j],$T2 ___ ($t2,$t3)=($t3,$t2); } $code.=<<___; #ifndef __KERNEL__ #endif .text .globl $func .type $func,%function .align 6 $func: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-128]! 
add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] sub sp,sp,#4*$SZ ldp $A,$B,[$ctx] // load context ldp $C,$D,[$ctx,#2*$SZ] ldp $E,$F,[$ctx,#4*$SZ] add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input ldp $G,$H,[$ctx,#6*$SZ] adrp $Ktbl,:pg_hi21:.LK$BITS add $Ktbl,$Ktbl,:lo12:.LK$BITS stp $ctx,$num,[x29,#96] .Loop: ldp @X[0],@X[1],[$inp],#2*$SZ ldr $t2,[$Ktbl],#$SZ // *K++ eor $t3,$B,$C // magic seed str $inp,[x29,#112] ___ for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } $code.=".Loop_16_xx:\n"; for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } $code.=<<___; cbnz $t2,.Loop_16_xx ldp $ctx,$num,[x29,#96] ldr $inp,[x29,#112] sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind ldp @X[0],@X[1],[$ctx] ldp @X[2],@X[3],[$ctx,#2*$SZ] add $inp,$inp,#14*$SZ // advance input pointer ldp @X[4],@X[5],[$ctx,#4*$SZ] add $A,$A,@X[0] ldp @X[6],@X[7],[$ctx,#6*$SZ] add $B,$B,@X[1] add $C,$C,@X[2] add $D,$D,@X[3] stp $A,$B,[$ctx] add $E,$E,@X[4] add $F,$F,@X[5] stp $C,$D,[$ctx,#2*$SZ] add $G,$G,@X[6] add $H,$H,@X[7] cmp $inp,$num stp $E,$F,[$ctx,#4*$SZ] stp $G,$H,[$ctx,#6*$SZ] b.ne .Loop ldp x19,x20,[x29,#16] add sp,sp,#4*$SZ ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#128 AARCH64_VALIDATE_LINK_REGISTER ret .size $func,.-$func .section .rodata .align 6 .type .LK$BITS,%object .LK$BITS: ___ $code.=<<___ if ($SZ==8); .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x81c2c92e47edaee6,0x92722c851482353b .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .quad 0 // terminator ___ $code.=<<___ if ($SZ==4); .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 
0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0 //terminator ___ $code.=<<___; .size .LK$BITS,.-.LK$BITS .asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by " .align 2 ___ if ($SZ==4) { my $Ktbl="x3"; my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2)); my @MSG=map("v$_.16b",(4..7)); my ($W0,$W1)=("v16.4s","v17.4s"); my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b"); $code.=<<___; .text #ifndef __KERNEL__ .globl sha256_block_data_order_hw .type sha256_block_data_order_hw,%function .align 6 sha256_block_data_order_hw: // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. AARCH64_VALID_CALL_TARGET stp x29,x30,[sp,#-16]! add x29,sp,#0 ld1.32 {$ABCD,$EFGH},[$ctx] adrp $Ktbl,:pg_hi21:.LK256 add $Ktbl,$Ktbl,:lo12:.LK256 .Loop_hw: ld1 {@MSG[0]-@MSG[3]},[$inp],#64 sub $num,$num,#1 ld1.32 {$W0},[$Ktbl],#16 rev32 @MSG[0],@MSG[0] rev32 @MSG[1],@MSG[1] rev32 @MSG[2],@MSG[2] rev32 @MSG[3],@MSG[3] orr $ABCD_SAVE,$ABCD,$ABCD // offload orr $EFGH_SAVE,$EFGH,$EFGH ___ for($i=0;$i<12;$i++) { $code.=<<___; ld1.32 {$W1},[$Ktbl],#16 add.i32 $W0,$W0,@MSG[0] sha256su0 @MSG[0],@MSG[1] orr $abcd,$ABCD,$ABCD sha256h $ABCD,$EFGH,$W0 sha256h2 $EFGH,$abcd,$W0 sha256su1 @MSG[0],@MSG[2],@MSG[3] ___ ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); } $code.=<<___; ld1.32 {$W1},[$Ktbl],#16 add.i32 $W0,$W0,@MSG[0] orr $abcd,$ABCD,$ABCD sha256h $ABCD,$EFGH,$W0 sha256h2 $EFGH,$abcd,$W0 ld1.32 {$W0},[$Ktbl],#16 add.i32 $W1,$W1,@MSG[1] orr $abcd,$ABCD,$ABCD sha256h $ABCD,$EFGH,$W1 sha256h2 $EFGH,$abcd,$W1 ld1.32 {$W1},[$Ktbl] add.i32 $W0,$W0,@MSG[2] sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind orr $abcd,$ABCD,$ABCD sha256h $ABCD,$EFGH,$W0 sha256h2 $EFGH,$abcd,$W0 add.i32 $W1,$W1,@MSG[3] orr $abcd,$ABCD,$ABCD sha256h $ABCD,$EFGH,$W1 sha256h2 $EFGH,$abcd,$W1 add.i32 $ABCD,$ABCD,$ABCD_SAVE add.i32 $EFGH,$EFGH,$EFGH_SAVE cbnz $num,.Loop_hw st1.32 {$ABCD,$EFGH},[$ctx] ldr x29,[sp],#16 ret .size sha256_block_data_order_hw,.-sha256_block_data_order_hw #endif ___ } if ($SZ==8) { my $Ktbl="x3"; my @H = map("v$_.16b",(0..4)); my ($fg,$de,$m9_10)=map("v$_.16b",(5..7)); my @MSG=map("v$_.16b",(16..23)); my ($W0,$W1)=("v24.2d","v25.2d"); my ($AB,$CD,$EF,$GH)=map("v$_.16b",(26..29)); $code.=<<___; .text #ifndef __KERNEL__ .globl sha512_block_data_order_hw .type sha512_block_data_order_hw,%function .align 6 sha512_block_data_order_hw: // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. AARCH64_VALID_CALL_TARGET stp x29,x30,[sp,#-16]! 
add x29,sp,#0 ld1 {@MSG[0]-@MSG[3]},[$inp],#64 // load input ld1 {@MSG[4]-@MSG[7]},[$inp],#64 ld1.64 {@H[0]-@H[3]},[$ctx] // load context adrp $Ktbl,:pg_hi21:.LK512 add $Ktbl,$Ktbl,:lo12:.LK512 rev64 @MSG[0],@MSG[0] rev64 @MSG[1],@MSG[1] rev64 @MSG[2],@MSG[2] rev64 @MSG[3],@MSG[3] rev64 @MSG[4],@MSG[4] rev64 @MSG[5],@MSG[5] rev64 @MSG[6],@MSG[6] rev64 @MSG[7],@MSG[7] b .Loop_hw .align 4 .Loop_hw: ld1.64 {$W0},[$Ktbl],#16 subs $num,$num,#1 sub x4,$inp,#128 orr $AB,@H[0],@H[0] // offload orr $CD,@H[1],@H[1] orr $EF,@H[2],@H[2] orr $GH,@H[3],@H[3] csel $inp,$inp,x4,ne // conditional rewind ___ for($i=0;$i<32;$i++) { $code.=<<___; add.i64 $W0,$W0,@MSG[0] ld1.64 {$W1},[$Ktbl],#16 ext $W0,$W0,$W0,#8 ext $fg,@H[2],@H[3],#8 ext $de,@H[1],@H[2],#8 add.i64 @H[3],@H[3],$W0 // "T1 + H + K512[i]" sha512su0 @MSG[0],@MSG[1] ext $m9_10,@MSG[4],@MSG[5],#8 sha512h @H[3],$fg,$de sha512su1 @MSG[0],@MSG[7],$m9_10 add.i64 @H[4],@H[1],@H[3] // "D + T1" sha512h2 @H[3],$H[1],@H[0] ___ ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); @H = (@H[3],@H[0],@H[4],@H[2],@H[1]); } for(;$i<40;$i++) { $code.=<<___ if ($i<39); ld1.64 {$W1},[$Ktbl],#16 ___ $code.=<<___ if ($i==39); sub $Ktbl,$Ktbl,#$rounds*$SZ // rewind ___ $code.=<<___; add.i64 $W0,$W0,@MSG[0] ld1 {@MSG[0]},[$inp],#16 // load next input ext $W0,$W0,$W0,#8 ext $fg,@H[2],@H[3],#8 ext $de,@H[1],@H[2],#8 add.i64 @H[3],@H[3],$W0 // "T1 + H + K512[i]" sha512h @H[3],$fg,$de rev64 @MSG[0],@MSG[0] add.i64 @H[4],@H[1],@H[3] // "D + T1" sha512h2 @H[3],$H[1],@H[0] ___ ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); @H = (@H[3],@H[0],@H[4],@H[2],@H[1]); } $code.=<<___; add.i64 @H[0],@H[0],$AB // accumulate add.i64 @H[1],@H[1],$CD add.i64 @H[2],@H[2],$EF add.i64 @H[3],@H[3],$GH cbnz $num,.Loop_hw st1.64 {@H[0]-@H[3]},[$ctx] // store context ldr x29,[sp],#16 ret .size sha512_block_data_order_hw,.-sha512_block_data_order_hw #endif ___ } { my %opcode = ( "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000, "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 ); sub unsha256 { my ($mnemonic,$arg)=@_; $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o && sprintf ".inst\t0x%08x\t//%s %s", $opcode{$mnemonic}|$1|($2<<5)|($3<<16), $mnemonic,$arg; } } { my %opcode = ( "sha512h" => 0xce608000, "sha512h2" => 0xce608400, "sha512su0" => 0xcec08000, "sha512su1" => 0xce608800 ); sub unsha512 { my ($mnemonic,$arg)=@_; $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o && sprintf ".inst\t0x%08x\t//%s %s", $opcode{$mnemonic}|$1|($2<<5)|($3<<16), $mnemonic,$arg; } } open SELF,$0; while() { next if (/^#!/); last if (!s/^#/\/\// and !/^$/); print; } close SELF; foreach(split("\n",$code)) { s/\`([^\`]*)\`/eval($1)/ge; s/\b(sha512\w+)\s+([qv].*)/unsha512($1,$2)/ge or s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge; s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers s/\.[ui]?8(\s)/$1/; s/\.\w?64\b// and s/\.16b/\.2d/g or s/\.\w?32\b// and s/\.16b/\.4s/g; m/\bext\b/ and s/\.2d/\.16b/g or m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g; print $_,"\n"; } close STDOUT or die "error closing STDOUT: $!"; ring-0.17.14/crypto/fipsmodule/sha/asm/sha512-x86_64.pl000064400000000000000000001251611046102023000202610ustar 00000000000000#! /usr/bin/env perl # Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. Rights for redistribution and usage in source and binary # forms are granted according to the License. # ==================================================================== # # sha256/512_block procedure for x86_64. # # 40% improvement over compiler-generated code on Opteron. On EM64T # sha256 was observed to run >80% faster and sha512 - >40%. No magical # tricks, just straight implementation... I really wonder why gcc # [being armed with inline assembler] fails to generate as fast code. # The only thing which is cool about this module is that it's very # same instruction sequence used for both SHA-256 and SHA-512. In # former case the instructions operate on 32-bit operands, while in # latter - on 64-bit ones. All I had to do is to get one flavor right, # the other one passed the test right away:-) # # sha256_block runs in ~1005 cycles on Opteron, which gives you # asymptotic performance of 64*1000/1005=63.7MBps times CPU clock # frequency in GHz. sha512_block runs in ~1275 cycles, which results # in 128*1000/1275=100MBps per GHz. Is there room for improvement? # Well, if you compare it to IA-64 implementation, which maintains # X[16] in register bank[!], tends to 4 instructions per CPU clock # cycle and runs in 1003 cycles, 1275 is very good result for 3-way # issue Opteron pipeline and X[16] maintained in memory. So that *if* # there is a way to improve it, *then* the only way would be to try to # offload X[16] updates to SSE unit, but that would require "deeper" # loop unroll, which in turn would naturally cause size blow-up, not # to mention increased complexity! And once again, only *if* it's # actually possible to noticeably improve overall ILP, instruction # level parallelism, on a given CPU implementation in this case. # # Special note on Intel EM64T. While Opteron CPU exhibits perfect # performance ratio of 1.5 between 64- and 32-bit flavors [see above], # [currently available] EM64T CPUs apparently are far from it. On the # contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit # sha256_block:-( This is presumably because 64-bit shifts/rotates # apparently are not atomic instructions, but implemented in microcode. # # May 2012. # # Optimization including one of Pavel Semjanov's ideas, alternative # Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and # unfortunately -2% SHA512 on P4 [which nobody should care about # that much]. # # June 2012. # # Add SIMD code paths, see below for improvement coefficients. SSSE3 # code path was not attempted for SHA512, because improvement is not # estimated to be high enough, noticeably less than 9%, to justify # the effort, not on pre-AVX processors. [Obviously with exclusion # for VIA Nano, but it has SHA512 instruction that is faster and # should be used instead.] For reference, corresponding estimated # upper limit for improvement for SSSE3 SHA256 is 28%. 
The fact that # higher coefficients are observed on VIA Nano and Bulldozer has more # to do with specifics of their architecture [which is topic for # separate discussion]. # # November 2012. # # Add AVX2 code path. Two consecutive input blocks are loaded to # 256-bit %ymm registers, with data from first block to least # significant 128-bit halves and data from second to most significant. # The data is then processed with same SIMD instruction sequence as # for AVX, but with %ymm as operands. Side effect is increased stack # frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB # code size increase. # # March 2014. # # Add support for Intel SHA Extensions. ###################################################################### # Current performance in cycles per processed byte (less is better): # # SHA256 SSSE3 AVX/XOP(*) SHA512 AVX/XOP(*) # # AMD K8 14.9 - - 9.57 - # P4 17.3 - - 30.8 - # Core 2 15.6 13.8(+13%) - 9.97 - # Westmere 14.8 12.3(+19%) - 9.58 - # Sandy Bridge 17.4 14.2(+23%) 11.6(+50%(**)) 11.2 8.10(+38%(**)) # Ivy Bridge 12.6 10.5(+20%) 10.3(+22%) 8.17 7.22(+13%) # Haswell 12.2 9.28(+31%) 7.80(+56%) 7.66 5.40(+42%) # Skylake 11.4 9.03(+26%) 7.70(+48%) 7.25 5.20(+40%) # Bulldozer 21.1 13.6(+54%) 13.6(+54%(***)) 13.5 8.58(+57%) # Ryzen 11.0 9.02(+22%) 2.05(+440%) 7.05 5.67(+20%) # VIA Nano 23.0 16.5(+39%) - 14.7 - # Atom 23.0 18.9(+22%) - 14.7 - # Silvermont 27.4 20.6(+33%) - 17.5 - # Knights L 27.4 21.0(+30%) 19.6(+40%) 17.5 12.8(+37%) # Goldmont 18.9 14.3(+32%) 4.16(+350%) 12.0 - # # (*) whichever best applicable, including SHAEXT; # (**) switch from ror to shrd stands for fair share of improvement; # (***) execution time is fully determined by remaining integer-only # part, body_00_15; reducing the amount of SIMD instructions # below certain limit makes no difference/sense; to conserve # space SHA256 XOP code path is therefore omitted; # # Modified from upstream OpenSSL to remove the XOP code. my ($flavour, $output) = @ARGV; if ($output =~ /sha512-x86_64/) { $func="sha512_block_data_order"; $TABLE="K512"; $SZ=8; @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx", "%r8", "%r9", "%r10","%r11"); ($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi"); @Sigma0=(28,34,39); @Sigma1=(14,18,41); @sigma0=(1, 8, 7); @sigma1=(19,61, 6); $rounds=80; } else { $func="sha256_block_data_order"; $TABLE="K256"; $SZ=4; @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", "%r8d","%r9d","%r10d","%r11d"); ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi"); @Sigma0=( 2,13,22); @Sigma1=( 6,11,25); @sigma0=( 7,18, 3); @sigma1=(17,19,10); $rounds=64; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; # In upstream, this is controlled by shelling out to the compiler to check # versions, but BoringSSL is intended to be used with pre-generated perlasm # output, so this isn't useful anyway. # # This file also has an AVX2 implementation, controlled by setting $avx to 2. # For now, we intentionally disable it. While it gives a 13-16% perf boost, the # CFI annotations are wrong. It allocates stack in a loop and should be # rewritten to avoid this. 
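#
# [Editorial sketch, not part of the upstream script and never invoked by the
# generator below.] The "alternative Maj" mentioned in the performance notes
# above, and used in ROUND_00_15 as "h=Maj(a,b,c)=Ch(a^b,c,b)" together with
# the "((f^g)&e)^g" form of Ch, relies on the bitwise identities sketched here.
# The _ref_* names are illustrative only; the subs work for both the 32-bit and
# 64-bit flavours since only bitwise operations are involved.
sub _ref_ch  { my ($x,$y,$z)=@_; return (($y ^ $z) & $x) ^ $z; }      # == (x&y)^(~x&z)
sub _ref_maj { my ($x,$y,$z)=@_; return _ref_ch($x ^ $y, $z, $y); }   # == (x&y)^(x&z)^(y&z)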
$avx = 1; $shaext = 1; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; $ctx="%rdi"; # 1st arg, zapped by $a3 $inp="%rsi"; # 2nd arg $Tbl="%rbp"; $_ctx="16*$SZ+0*8(%rsp)"; $_inp="16*$SZ+1*8(%rsp)"; $_end="16*$SZ+2*8(%rsp)"; $_rsp="`16*$SZ+3*8`(%rsp)"; $framesz="16*$SZ+4*8"; sub ROUND_00_15() { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; my $STRIDE=$SZ; $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1)); $code.=<<___; ror \$`$Sigma1[2]-$Sigma1[1]`,$a0 mov $f,$a2 xor $e,$a0 ror \$`$Sigma0[2]-$Sigma0[1]`,$a1 xor $g,$a2 # f^g mov $T1,`$SZ*($i&0xf)`(%rsp) xor $a,$a1 and $e,$a2 # (f^g)&e ror \$`$Sigma1[1]-$Sigma1[0]`,$a0 add $h,$T1 # T1+=h xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g ror \$`$Sigma0[1]-$Sigma0[0]`,$a1 xor $e,$a0 add $a2,$T1 # T1+=Ch(e,f,g) mov $a,$a2 add ($Tbl),$T1 # T1+=K[round] xor $a,$a1 xor $b,$a2 # a^b, b^c in next round ror \$$Sigma1[0],$a0 # Sigma1(e) mov $b,$h and $a2,$a3 ror \$$Sigma0[0],$a1 # Sigma0(a) add $a0,$T1 # T1+=Sigma1(e) xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b) add $T1,$d # d+=T1 add $T1,$h # h+=T1 lea $STRIDE($Tbl),$Tbl # round++ ___ $code.=<<___ if ($i<15); add $a1,$h # h+=Sigma0(a) ___ ($a2,$a3) = ($a3,$a2); } sub ROUND_16_XX() { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; $code.=<<___; mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 mov `$SZ*(($i+14)&0xf)`(%rsp),$a2 mov $a0,$T1 ror \$`$sigma0[1]-$sigma0[0]`,$a0 add $a1,$a # modulo-scheduled h+=Sigma0(a) mov $a2,$a1 ror \$`$sigma1[1]-$sigma1[0]`,$a2 xor $T1,$a0 shr \$$sigma0[2],$T1 ror \$$sigma0[0],$a0 xor $a1,$a2 shr \$$sigma1[2],$a1 ror \$$sigma1[0],$a2 xor $a0,$T1 # sigma0(X[(i+1)&0xf]) xor $a1,$a2 # sigma1(X[(i+14)&0xf]) add `$SZ*(($i+9)&0xf)`(%rsp),$T1 add `$SZ*($i&0xf)`(%rsp),$T1 mov $e,$a0 add $a2,$T1 mov $a,$a1 ___ &ROUND_00_15(@_); } $code=<<___; .text .globl ${func}_nohw .type ${func}_nohw,\@function,3 .align 16 ${func}_nohw: .cfi_startproc _CET_ENDBR mov %rsp,%rax # copy %rsp .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 shl \$4,%rdx # num*16 sub \$$framesz,%rsp lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ and \$-64,%rsp # align stack frame mov $ctx,$_ctx # save ctx, 1st arg mov $inp,$_inp # save inp, 2nd arh mov %rdx,$_end # save end pointer, "3rd" arg mov %rax,$_rsp # save copy of %rsp .cfi_cfa_expression $_rsp,deref,+8 .Lprologue: mov $SZ*0($ctx),$A mov $SZ*1($ctx),$B mov $SZ*2($ctx),$C mov $SZ*3($ctx),$D mov $SZ*4($ctx),$E mov $SZ*5($ctx),$F mov $SZ*6($ctx),$G mov $SZ*7($ctx),$H jmp .Lloop .align 16 .Lloop: mov $B,$a3 lea $TABLE(%rip),$Tbl xor $C,$a3 # magic ___ for($i=0;$i<16;$i++) { $code.=" mov $SZ*$i($inp),$T1\n"; $code.=" mov @ROT[4],$a0\n"; $code.=" mov @ROT[0],$a1\n"; $code.=" bswap $T1\n"; &ROUND_00_15($i,@ROT); unshift(@ROT,pop(@ROT)); } $code.=<<___; jmp .Lrounds_16_xx .align 16 .Lrounds_16_xx: ___ for(;$i<32;$i++) { &ROUND_16_XX($i,@ROT); unshift(@ROT,pop(@ROT)); } $code.=<<___; cmpb \$0,`$SZ-1`($Tbl) jnz .Lrounds_16_xx mov $_ctx,$ctx add $a1,$A # modulo-scheduled h+=Sigma0(a) lea 16*$SZ($inp),$inp add $SZ*0($ctx),$A add $SZ*1($ctx),$B add $SZ*2($ctx),$C add $SZ*3($ctx),$D add $SZ*4($ctx),$E add $SZ*5($ctx),$F add $SZ*6($ctx),$G add $SZ*7($ctx),$H cmp $_end,$inp mov $A,$SZ*0($ctx) mov $B,$SZ*1($ctx) mov $C,$SZ*2($ctx) mov $D,$SZ*3($ctx) mov $E,$SZ*4($ctx) mov $F,$SZ*5($ctx) mov $G,$SZ*6($ctx) mov $H,$SZ*7($ctx) jb .Lloop mov $_rsp,%rsi .cfi_def_cfa %rsi,8 mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 
.cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lepilogue: ret .cfi_endproc .size ${func}_nohw,.-${func}_nohw ___ if ($SZ==4) { $code.=<<___; .section .rodata .align 64 .type $TABLE,\@object $TABLE: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by " .text ___ } else { $code.=<<___; .section .rodata .align 64 .type $TABLE,\@object $TABLE: .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 
0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x81c2c92e47edaee6,0x92722c851482353b .quad 0x81c2c92e47edaee6,0x92722c851482353b .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .quad 0x0001020304050607,0x08090a0b0c0d0e0f .quad 0x0001020304050607,0x08090a0b0c0d0e0f .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by " .text ___ } ###################################################################### # SIMD code paths # if ($SZ==4 && $shaext) {{{ ###################################################################### # Intel SHA Extensions implementation of SHA256 update function. 
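# A short orientation note for the code below (comment only; nothing here is
# emitted into the generated assembly): each `sha256rnds2` performs two rounds
# of SHA-256, taking the two pre-added W[i]+K[i] words from the low half of
# $Wi (%xmm0), so each pair of round instructions is bridged by
# `pshufd \$0x0e,$Wi,$Wi` to move the next two words into place. `sha256msg1`
# and `sha256msg2` compute the two halves of the message-schedule recurrence,
# and the `palignr`/`paddd` pair supplies the remaining W[t-7] term.
# Schematically, one four-round group looks like:
#
#	movdqa	N*32-0x80($Tbl),$Wi	# K[4N..4N+3]
#	paddd	@MSG[0],$Wi		# W[4N..4N+3]+K[4N..4N+3]
#	sha256rnds2	$ABEF,$CDGH	# rounds 4N, 4N+1
#	pshufd	\$0x0e,$Wi,$Wi
#	sha256rnds2	$CDGH,$ABEF	# rounds 4N+2, 4N+3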
# my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx"); my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10)); my @MSG=map("%xmm$_",(3..6)); $code.=<<___; .globl sha256_block_data_order_hw .type sha256_block_data_order_hw,\@function,3 .align 64 sha256_block_data_order_hw: .cfi_startproc _CET_ENDBR ___ $code.=<<___ if ($win64); lea `-8-5*16`(%rsp),%rsp movaps %xmm6,-8-5*16(%rax) movaps %xmm7,-8-4*16(%rax) movaps %xmm8,-8-3*16(%rax) movaps %xmm9,-8-2*16(%rax) movaps %xmm10,-8-1*16(%rax) .Lprologue_shaext: ___ $code.=<<___; lea K256+0x80(%rip),$Tbl movdqu ($ctx),$ABEF # DCBA movdqu 16($ctx),$CDGH # HGFE movdqa 0x200-0x80($Tbl),$TMP # byte swap mask pshufd \$0x1b,$ABEF,$Wi # ABCD pshufd \$0xb1,$ABEF,$ABEF # CDAB pshufd \$0x1b,$CDGH,$CDGH # EFGH movdqa $TMP,$BSWAP # offload palignr \$8,$CDGH,$ABEF # ABEF punpcklqdq $Wi,$CDGH # CDGH jmp .Loop_shaext .align 16 .Loop_shaext: movdqu ($inp),@MSG[0] movdqu 0x10($inp),@MSG[1] movdqu 0x20($inp),@MSG[2] pshufb $TMP,@MSG[0] movdqu 0x30($inp),@MSG[3] movdqa 0*32-0x80($Tbl),$Wi paddd @MSG[0],$Wi pshufb $TMP,@MSG[1] movdqa $CDGH,$CDGH_SAVE # offload sha256rnds2 $ABEF,$CDGH # 0-3 pshufd \$0x0e,$Wi,$Wi nop movdqa $ABEF,$ABEF_SAVE # offload sha256rnds2 $CDGH,$ABEF movdqa 1*32-0x80($Tbl),$Wi paddd @MSG[1],$Wi pshufb $TMP,@MSG[2] sha256rnds2 $ABEF,$CDGH # 4-7 pshufd \$0x0e,$Wi,$Wi lea 0x40($inp),$inp sha256msg1 @MSG[1],@MSG[0] sha256rnds2 $CDGH,$ABEF movdqa 2*32-0x80($Tbl),$Wi paddd @MSG[2],$Wi pshufb $TMP,@MSG[3] sha256rnds2 $ABEF,$CDGH # 8-11 pshufd \$0x0e,$Wi,$Wi movdqa @MSG[3],$TMP palignr \$4,@MSG[2],$TMP nop paddd $TMP,@MSG[0] sha256msg1 @MSG[2],@MSG[1] sha256rnds2 $CDGH,$ABEF movdqa 3*32-0x80($Tbl),$Wi paddd @MSG[3],$Wi sha256msg2 @MSG[3],@MSG[0] sha256rnds2 $ABEF,$CDGH # 12-15 pshufd \$0x0e,$Wi,$Wi movdqa @MSG[0],$TMP palignr \$4,@MSG[3],$TMP nop paddd $TMP,@MSG[1] sha256msg1 @MSG[3],@MSG[2] sha256rnds2 $CDGH,$ABEF ___ for($i=4;$i<16-3;$i++) { $code.=<<___; movdqa $i*32-0x80($Tbl),$Wi paddd @MSG[0],$Wi sha256msg2 @MSG[0],@MSG[1] sha256rnds2 $ABEF,$CDGH # 16-19... 
pshufd \$0x0e,$Wi,$Wi movdqa @MSG[1],$TMP palignr \$4,@MSG[0],$TMP nop paddd $TMP,@MSG[2] sha256msg1 @MSG[0],@MSG[3] sha256rnds2 $CDGH,$ABEF ___ push(@MSG,shift(@MSG)); } $code.=<<___; movdqa 13*32-0x80($Tbl),$Wi paddd @MSG[0],$Wi sha256msg2 @MSG[0],@MSG[1] sha256rnds2 $ABEF,$CDGH # 52-55 pshufd \$0x0e,$Wi,$Wi movdqa @MSG[1],$TMP palignr \$4,@MSG[0],$TMP sha256rnds2 $CDGH,$ABEF paddd $TMP,@MSG[2] movdqa 14*32-0x80($Tbl),$Wi paddd @MSG[1],$Wi sha256rnds2 $ABEF,$CDGH # 56-59 pshufd \$0x0e,$Wi,$Wi sha256msg2 @MSG[1],@MSG[2] movdqa $BSWAP,$TMP sha256rnds2 $CDGH,$ABEF movdqa 15*32-0x80($Tbl),$Wi paddd @MSG[2],$Wi nop sha256rnds2 $ABEF,$CDGH # 60-63 pshufd \$0x0e,$Wi,$Wi dec $num nop sha256rnds2 $CDGH,$ABEF paddd $CDGH_SAVE,$CDGH paddd $ABEF_SAVE,$ABEF jnz .Loop_shaext pshufd \$0xb1,$CDGH,$CDGH # DCHG pshufd \$0x1b,$ABEF,$TMP # FEBA pshufd \$0xb1,$ABEF,$ABEF # BAFE punpckhqdq $CDGH,$ABEF # DCBA palignr \$8,$TMP,$CDGH # HGFE movdqu $ABEF,($ctx) movdqu $CDGH,16($ctx) ___ $code.=<<___ if ($win64); movaps -8-5*16(%rax),%xmm6 movaps -8-4*16(%rax),%xmm7 movaps -8-3*16(%rax),%xmm8 movaps -8-2*16(%rax),%xmm9 movaps -8-1*16(%rax),%xmm10 mov %rax,%rsp .Lepilogue_shaext: ___ $code.=<<___; ret .cfi_endproc .size sha256_block_data_order_hw,.-sha256_block_data_order_hw ___ }}} {{{ my $a4=$T1; my ($a,$b,$c,$d,$e,$f,$g,$h); sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; my $arg = pop; $arg = "\$$arg" if ($arg*1 eq $arg); $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; } sub body_00_15 () { ( '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. '&ror ($a0,$Sigma1[2]-$Sigma1[1])', '&mov ($a,$a1)', '&mov ($a4,$f)', '&ror ($a1,$Sigma0[2]-$Sigma0[1])', '&xor ($a0,$e)', '&xor ($a4,$g)', # f^g '&ror ($a0,$Sigma1[1]-$Sigma1[0])', '&xor ($a1,$a)', '&and ($a4,$e)', # (f^g)&e '&xor ($a0,$e)', '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] '&mov ($a2,$a)', '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g '&ror ($a1,$Sigma0[1]-$Sigma0[0])', '&xor ($a2,$b)', # a^b, b^c in next round '&add ($h,$a4)', # h+=Ch(e,f,g) '&ror ($a0,$Sigma1[0])', # Sigma1(e) '&and ($a3,$a2)', # (b^c)&(a^b) '&xor ($a1,$a)', '&add ($h,$a0)', # h+=Sigma1(e) '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) '&ror ($a1,$Sigma0[0])', # Sigma0(a) '&add ($d,$h)', # d+=h '&add ($h,$a3)', # h+=Maj(a,b,c) '&mov ($a0,$d)', '&add ($a1,$h);'. 
# h+=Sigma0(a) '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' ); } ###################################################################### # SSSE3 code path # if ($SZ==4) { # SHA256 only my @X = map("%xmm$_",(0..3)); my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); $code.=<<___; .globl ${func}_ssse3 .type ${func}_ssse3,\@function,3 .align 64 ${func}_ssse3: .cfi_startproc _CET_ENDBR mov %rsp,%rax # copy %rsp .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 shl \$4,%rdx # num*16 sub \$`$framesz+$win64*16*4`,%rsp lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ and \$-64,%rsp # align stack frame mov $ctx,$_ctx # save ctx, 1st arg mov $inp,$_inp # save inp, 2nd arh mov %rdx,$_end # save end pointer, "3rd" arg mov %rax,$_rsp # save copy of %rsp .cfi_cfa_expression $_rsp,deref,+8 ___ $code.=<<___ if ($win64); movaps %xmm6,16*$SZ+32(%rsp) movaps %xmm7,16*$SZ+48(%rsp) movaps %xmm8,16*$SZ+64(%rsp) movaps %xmm9,16*$SZ+80(%rsp) ___ $code.=<<___; .Lprologue_ssse3: mov $SZ*0($ctx),$A mov $SZ*1($ctx),$B mov $SZ*2($ctx),$C mov $SZ*3($ctx),$D mov $SZ*4($ctx),$E mov $SZ*5($ctx),$F mov $SZ*6($ctx),$G mov $SZ*7($ctx),$H ___ $code.=<<___; #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 jmp .Lloop_ssse3 .align 16 .Lloop_ssse3: movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 movdqu 0x00($inp),@X[0] movdqu 0x10($inp),@X[1] movdqu 0x20($inp),@X[2] pshufb $t3,@X[0] movdqu 0x30($inp),@X[3] lea $TABLE(%rip),$Tbl pshufb $t3,@X[1] movdqa 0x00($Tbl),$t0 movdqa 0x20($Tbl),$t1 pshufb $t3,@X[2] paddd @X[0],$t0 movdqa 0x40($Tbl),$t2 pshufb $t3,@X[3] movdqa 0x60($Tbl),$t3 paddd @X[1],$t1 paddd @X[2],$t2 paddd @X[3],$t3 movdqa $t0,0x00(%rsp) mov $A,$a1 movdqa $t1,0x10(%rsp) mov $B,$a3 movdqa $t2,0x20(%rsp) xor $C,$a3 # magic movdqa $t3,0x30(%rsp) mov $E,$a0 jmp .Lssse3_00_47 .align 16 .Lssse3_00_47: sub \$`-16*2*$SZ`,$Tbl # size optimization ___ sub Xupdate_256_SSSE3 () { ( '&movdqa ($t0,@X[1]);', '&movdqa ($t3,@X[3])', '&palignr ($t0,@X[0],$SZ)', # X[1..4] '&palignr ($t3,@X[2],$SZ);', # X[9..12] '&movdqa ($t1,$t0)', '&movdqa ($t2,$t0);', '&psrld ($t0,$sigma0[2])', '&paddd (@X[0],$t3);', # X[0..3] += X[9..12] '&psrld ($t2,$sigma0[0])', '&pshufd ($t3,@X[3],0b11111010)',# X[14..15] '&pslld ($t1,8*$SZ-$sigma0[1]);'. '&pxor ($t0,$t2)', '&psrld ($t2,$sigma0[1]-$sigma0[0]);'. '&pxor ($t0,$t1)', '&pslld ($t1,$sigma0[1]-$sigma0[0]);'. 
'&pxor ($t0,$t2);', '&movdqa ($t2,$t3)', '&pxor ($t0,$t1);', # sigma0(X[1..4]) '&psrld ($t3,$sigma1[2])', '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) '&psrlq ($t2,$sigma1[0])', '&pxor ($t3,$t2);', '&psrlq ($t2,$sigma1[1]-$sigma1[0])', '&pxor ($t3,$t2)', '&pshufb ($t3,$t4)', # sigma1(X[14..15]) '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) '&pshufd ($t3,@X[0],0b01010000)',# X[16..17] '&movdqa ($t2,$t3);', '&psrld ($t3,$sigma1[2])', '&psrlq ($t2,$sigma1[0])', '&pxor ($t3,$t2);', '&psrlq ($t2,$sigma1[1]-$sigma1[0])', '&pxor ($t3,$t2);', '&movdqa ($t2,16*2*$j."($Tbl)")', '&pshufb ($t3,$t5)', '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17]) ); } sub SSSE3_256_00_47 () { my $j = shift; my $body = shift; my @X = @_; my @insns = (&$body,&$body,&$body,&$body); # 104 instructions if (0) { foreach (Xupdate_256_SSSE3()) { # 36 instructions eval; eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); } } else { # squeeze extra 4% on Westmere and 19% on Atom eval(shift(@insns)); #@ &movdqa ($t0,@X[1]); eval(shift(@insns)); eval(shift(@insns)); &movdqa ($t3,@X[3]); eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #@ eval(shift(@insns)); &palignr ($t0,@X[0],$SZ); # X[1..4] eval(shift(@insns)); eval(shift(@insns)); &palignr ($t3,@X[2],$SZ); # X[9..12] eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #@ &movdqa ($t1,$t0); eval(shift(@insns)); eval(shift(@insns)); &movdqa ($t2,$t0); eval(shift(@insns)); #@ eval(shift(@insns)); &psrld ($t0,$sigma0[2]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[0],$t3); # X[0..3] += X[9..12] eval(shift(@insns)); #@ eval(shift(@insns)); &psrld ($t2,$sigma0[0]); eval(shift(@insns)); eval(shift(@insns)); &pshufd ($t3,@X[3],0b11111010); # X[4..15] eval(shift(@insns)); eval(shift(@insns)); #@ &pslld ($t1,8*$SZ-$sigma0[1]); eval(shift(@insns)); eval(shift(@insns)); &pxor ($t0,$t2); eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #@ &psrld ($t2,$sigma0[1]-$sigma0[0]); eval(shift(@insns)); &pxor ($t0,$t1); eval(shift(@insns)); eval(shift(@insns)); &pslld ($t1,$sigma0[1]-$sigma0[0]); eval(shift(@insns)); eval(shift(@insns)); &pxor ($t0,$t2); eval(shift(@insns)); eval(shift(@insns)); #@ &movdqa ($t2,$t3); eval(shift(@insns)); eval(shift(@insns)); &pxor ($t0,$t1); # sigma0(X[1..4]) eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); &psrld ($t3,$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) eval(shift(@insns)); #@ eval(shift(@insns)); &psrlq ($t2,$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pxor ($t3,$t2); eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #@ &psrlq ($t2,$sigma1[1]-$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &pxor ($t3,$t2); eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); #&pshufb ($t3,$t4); # sigma1(X[14..15]) &pshufd ($t3,$t3,0b10000000); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &psrldq ($t3,8); eval(shift(@insns)); eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #@ &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pshufd ($t3,@X[0],0b01010000); # X[16..17] eval(shift(@insns)); eval(shift(@insns)); #@ eval(shift(@insns)); &movdqa ($t2,$t3); eval(shift(@insns)); eval(shift(@insns)); &psrld ($t3,$sigma1[2]); 
eval(shift(@insns)); eval(shift(@insns)); #@ &psrlq ($t2,$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &pxor ($t3,$t2); eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #@ eval(shift(@insns)); &psrlq ($t2,$sigma1[1]-$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pxor ($t3,$t2); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #@ #&pshufb ($t3,$t5); &pshufd ($t3,$t3,0b00001000); eval(shift(@insns)); eval(shift(@insns)); &movdqa ($t2,16*2*$j."($Tbl)"); eval(shift(@insns)); #@ eval(shift(@insns)); &pslldq ($t3,8); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); } &paddd ($t2,@X[0]); foreach (@insns) { eval; } # remaining instructions &movdqa (16*$j."(%rsp)",$t2); } for ($i=0,$j=0; $j<4; $j++) { &SSSE3_256_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); &jne (".Lssse3_00_47"); for ($i=0; $i<16; ) { foreach(body_00_15()) { eval; } } $code.=<<___; mov $_ctx,$ctx mov $a1,$A add $SZ*0($ctx),$A lea 16*$SZ($inp),$inp add $SZ*1($ctx),$B add $SZ*2($ctx),$C add $SZ*3($ctx),$D add $SZ*4($ctx),$E add $SZ*5($ctx),$F add $SZ*6($ctx),$G add $SZ*7($ctx),$H cmp $_end,$inp mov $A,$SZ*0($ctx) mov $B,$SZ*1($ctx) mov $C,$SZ*2($ctx) mov $D,$SZ*3($ctx) mov $E,$SZ*4($ctx) mov $F,$SZ*5($ctx) mov $G,$SZ*6($ctx) mov $H,$SZ*7($ctx) jb .Lloop_ssse3 mov $_rsp,%rsi .cfi_def_cfa %rsi,8 ___ $code.=<<___ if ($win64); movaps 16*$SZ+32(%rsp),%xmm6 movaps 16*$SZ+48(%rsp),%xmm7 movaps 16*$SZ+64(%rsp),%xmm8 movaps 16*$SZ+80(%rsp),%xmm9 ___ $code.=<<___; mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lepilogue_ssse3: ret .cfi_endproc .size ${func}_ssse3,.-${func}_ssse3 ___ } if ($avx) {{ ###################################################################### # AVX+shrd code path # local *ror = sub { &shrd(@_[0],@_) }; $code.=<<___; .globl ${func}_avx .type ${func}_avx,\@function,3 .align 64 ${func}_avx: .cfi_startproc _CET_ENDBR mov %rsp,%rax # copy %rsp .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 shl \$4,%rdx # num*16 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ and \$-64,%rsp # align stack frame mov $ctx,$_ctx # save ctx, 1st arg mov $inp,$_inp # save inp, 2nd arh mov %rdx,$_end # save end pointer, "3rd" arg mov %rax,$_rsp # save copy of %rsp .cfi_cfa_expression $_rsp,deref,+8 ___ $code.=<<___ if ($win64); movaps %xmm6,16*$SZ+32(%rsp) movaps %xmm7,16*$SZ+48(%rsp) movaps %xmm8,16*$SZ+64(%rsp) movaps %xmm9,16*$SZ+80(%rsp) ___ $code.=<<___ if ($win64 && $SZ>4); movaps %xmm10,16*$SZ+96(%rsp) movaps %xmm11,16*$SZ+112(%rsp) ___ $code.=<<___; .Lprologue_avx: vzeroupper mov $SZ*0($ctx),$A mov $SZ*1($ctx),$B mov $SZ*2($ctx),$C mov $SZ*3($ctx),$D mov $SZ*4($ctx),$E mov $SZ*5($ctx),$F mov $SZ*6($ctx),$G mov $SZ*7($ctx),$H ___ if ($SZ==4) { # SHA256 my @X = map("%xmm$_",(0..3)); my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); $code.=<<___; vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 jmp .Lloop_avx .align 16 .Lloop_avx: vmovdqa 
$TABLE+`$SZ*2*$rounds`(%rip),$t3 vmovdqu 0x00($inp),@X[0] vmovdqu 0x10($inp),@X[1] vmovdqu 0x20($inp),@X[2] vmovdqu 0x30($inp),@X[3] vpshufb $t3,@X[0],@X[0] lea $TABLE(%rip),$Tbl vpshufb $t3,@X[1],@X[1] vpshufb $t3,@X[2],@X[2] vpaddd 0x00($Tbl),@X[0],$t0 vpshufb $t3,@X[3],@X[3] vpaddd 0x20($Tbl),@X[1],$t1 vpaddd 0x40($Tbl),@X[2],$t2 vpaddd 0x60($Tbl),@X[3],$t3 vmovdqa $t0,0x00(%rsp) mov $A,$a1 vmovdqa $t1,0x10(%rsp) mov $B,$a3 vmovdqa $t2,0x20(%rsp) xor $C,$a3 # magic vmovdqa $t3,0x30(%rsp) mov $E,$a0 jmp .Lavx_00_47 .align 16 .Lavx_00_47: sub \$`-16*2*$SZ`,$Tbl # size optimization ___ sub Xupdate_256_AVX () { ( '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4] '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12] '&vpsrld ($t2,$t0,$sigma0[0]);', '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12] '&vpsrld ($t3,$t0,$sigma0[2])', '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);', '&vpxor ($t0,$t3,$t2)', '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);', '&vpxor ($t0,$t0,$t1)', '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);', '&vpxor ($t0,$t0,$t2)', '&vpsrld ($t2,$t3,$sigma1[2]);', '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4]) '&vpsrlq ($t3,$t3,$sigma1[0]);', '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4]) '&vpxor ($t2,$t2,$t3);', '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', '&vpxor ($t2,$t2,$t3)', '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15]) '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15]) '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] '&vpsrld ($t2,$t3,$sigma1[2])', '&vpsrlq ($t3,$t3,$sigma1[0])', '&vpxor ($t2,$t2,$t3);', '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', '&vpxor ($t2,$t2,$t3)', '&vpshufb ($t2,$t2,$t5)', '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17]) ); } sub AVX_256_00_47 () { my $j = shift; my $body = shift; my @X = @_; my @insns = (&$body,&$body,&$body,&$body); # 104 instructions foreach (Xupdate_256_AVX()) { # 29 instructions eval; eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); } &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); foreach (@insns) { eval; } # remaining instructions &vmovdqa (16*$j."(%rsp)",$t2); } for ($i=0,$j=0; $j<4; $j++) { &AVX_256_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); &jne (".Lavx_00_47"); for ($i=0; $i<16; ) { foreach(body_00_15()) { eval; } } } else { # SHA512 my @X = map("%xmm$_",(0..7)); my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); $code.=<<___; jmp .Lloop_avx .align 16 .Lloop_avx: vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 vmovdqu 0x00($inp),@X[0] lea $TABLE+0x80(%rip),$Tbl # size optimization vmovdqu 0x10($inp),@X[1] vmovdqu 0x20($inp),@X[2] vpshufb $t3,@X[0],@X[0] vmovdqu 0x30($inp),@X[3] vpshufb $t3,@X[1],@X[1] vmovdqu 0x40($inp),@X[4] vpshufb $t3,@X[2],@X[2] vmovdqu 0x50($inp),@X[5] vpshufb $t3,@X[3],@X[3] vmovdqu 0x60($inp),@X[6] vpshufb $t3,@X[4],@X[4] vmovdqu 0x70($inp),@X[7] vpshufb $t3,@X[5],@X[5] vpaddq -0x80($Tbl),@X[0],$t0 vpshufb $t3,@X[6],@X[6] vpaddq -0x60($Tbl),@X[1],$t1 vpshufb $t3,@X[7],@X[7] vpaddq -0x40($Tbl),@X[2],$t2 vpaddq -0x20($Tbl),@X[3],$t3 vmovdqa $t0,0x00(%rsp) vpaddq 0x00($Tbl),@X[4],$t0 vmovdqa $t1,0x10(%rsp) vpaddq 0x20($Tbl),@X[5],$t1 vmovdqa $t2,0x20(%rsp) vpaddq 0x40($Tbl),@X[6],$t2 vmovdqa $t3,0x30(%rsp) vpaddq 0x60($Tbl),@X[7],$t3 vmovdqa $t0,0x40(%rsp) mov $A,$a1 vmovdqa $t1,0x50(%rsp) mov $B,$a3 vmovdqa $t2,0x60(%rsp) xor $C,$a3 # magic vmovdqa $t3,0x70(%rsp) mov $E,$a0 jmp .Lavx_00_47 .align 16 .Lavx_00_47: add \$`16*2*$SZ`,$Tbl ___ sub Xupdate_512_AVX () { ( '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2] '&vpalignr 
($t3,@X[5],@X[4],$SZ)', # X[9..10] '&vpsrlq ($t2,$t0,$sigma0[0])', '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10] '&vpsrlq ($t3,$t0,$sigma0[2])', '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);', '&vpxor ($t0,$t3,$t2)', '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);', '&vpxor ($t0,$t0,$t1)', '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);', '&vpxor ($t0,$t0,$t2)', '&vpsrlq ($t3,@X[7],$sigma1[2]);', '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2]) '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);', '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2]) '&vpsrlq ($t1,@X[7],$sigma1[0]);', '&vpxor ($t3,$t3,$t2)', '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);', '&vpxor ($t3,$t3,$t1)', '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);', '&vpxor ($t3,$t3,$t2)', '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15]) '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) ); } sub AVX_512_00_47 () { my $j = shift; my $body = shift; my @X = @_; my @insns = (&$body,&$body); # 52 instructions foreach (Xupdate_512_AVX()) { # 23 instructions eval; eval(shift(@insns)); eval(shift(@insns)); } &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); foreach (@insns) { eval; } # remaining instructions &vmovdqa (16*$j."(%rsp)",$t2); } for ($i=0,$j=0; $j<8; $j++) { &AVX_512_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); &jne (".Lavx_00_47"); for ($i=0; $i<16; ) { foreach(body_00_15()) { eval; } } } $code.=<<___; mov $_ctx,$ctx mov $a1,$A add $SZ*0($ctx),$A lea 16*$SZ($inp),$inp add $SZ*1($ctx),$B add $SZ*2($ctx),$C add $SZ*3($ctx),$D add $SZ*4($ctx),$E add $SZ*5($ctx),$F add $SZ*6($ctx),$G add $SZ*7($ctx),$H cmp $_end,$inp mov $A,$SZ*0($ctx) mov $B,$SZ*1($ctx) mov $C,$SZ*2($ctx) mov $D,$SZ*3($ctx) mov $E,$SZ*4($ctx) mov $F,$SZ*5($ctx) mov $G,$SZ*6($ctx) mov $H,$SZ*7($ctx) jb .Lloop_avx mov $_rsp,%rsi .cfi_def_cfa %rsi,8 vzeroupper ___ $code.=<<___ if ($win64); movaps 16*$SZ+32(%rsp),%xmm6 movaps 16*$SZ+48(%rsp),%xmm7 movaps 16*$SZ+64(%rsp),%xmm8 movaps 16*$SZ+80(%rsp),%xmm9 ___ $code.=<<___ if ($win64 && $SZ>4); movaps 16*$SZ+96(%rsp),%xmm10 movaps 16*$SZ+112(%rsp),%xmm11 ___ $code.=<<___; mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lepilogue_avx: ret .cfi_endproc .size ${func}_avx,.-${func}_avx ___ }}}}} # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) if ($win64) { $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type se_handler,\@abi-omnipotent .align 16 se_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HanderlData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # prologue label cmp %r10,%rbx # context->RipRsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lin_prologue ___ $code.=<<___; mov %rax,%rsi # put aside Rsp mov 16*$SZ+3*8(%rax),%rax # pull $_rsp mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 mov -32(%rax),%r13 mov -40(%rax),%r14 mov -48(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore 
context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 lea .Lepilogue(%rip),%r10 cmp %r10,%rbx jb .Lin_prologue # non-AVX code lea 16*$SZ+4*8(%rsi),%rsi # Xmm6- save area lea 512($context),%rdi # &context.Xmm6 mov \$`$SZ==4?8:12`,%ecx .long 0xa548f3fc # cld; rep movsq .Lin_prologue: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size se_handler,.-se_handler ___ $code.=<<___ if ($SZ==4 && $shaext); .type shaext_handler,\@abi-omnipotent .align 16 shaext_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip lea .Lprologue_shaext(%rip),%r10 cmp %r10,%rbx # context->Rip<.Lprologue jb .Lin_prologue lea .Lepilogue_shaext(%rip),%r10 cmp %r10,%rbx # context->Rip>=.Lepilogue jae .Lin_prologue lea -8-5*16(%rax),%rsi lea 512($context),%rdi # &context.Xmm6 mov \$10,%ecx .long 0xa548f3fc # cld; rep movsq jmp .Lin_prologue .size shaext_handler,.-shaext_handler ___ $code.=<<___; .section .pdata .align 4 .rva .LSEH_begin_${func}_nohw .rva .LSEH_end_${func}_nohw .rva .LSEH_info_${func}_nohw ___ $code.=<<___ if ($SZ==4 && $shaext); .rva .LSEH_begin_${func}_hw .rva .LSEH_end_${func}_hw .rva .LSEH_info_${func}_hw ___ $code.=<<___ if ($SZ==4); .rva .LSEH_begin_${func}_ssse3 .rva .LSEH_end_${func}_ssse3 .rva .LSEH_info_${func}_ssse3 ___ $code.=<<___ if ($avx); .rva .LSEH_begin_${func}_avx .rva .LSEH_end_${func}_avx .rva .LSEH_info_${func}_avx ___ $code.=<<___; .section .xdata .align 8 .LSEH_info_${func}_nohw: .byte 9,0,0,0 .rva se_handler .rva .Lprologue,.Lepilogue # HandlerData[] ___ $code.=<<___ if ($SZ==4 && $shaext); .LSEH_info_${func}_hw: .byte 9,0,0,0 .rva shaext_handler ___ $code.=<<___ if ($SZ==4); .LSEH_info_${func}_ssse3: .byte 9,0,0,0 .rva se_handler .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[] ___ $code.=<<___ if ($avx); .LSEH_info_${func}_avx: .byte 9,0,0,0 .rva se_handler .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] ___ } sub sha256op38 { my $instr = shift; my %opcodelet = ( "sha256rnds2" => 0xcb, "sha256msg1" => 0xcc, "sha256msg2" => 0xcd ); if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) { my @opcode=(0x0f,0x38); push @opcode,$opcodelet{$instr}; push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M return ".byte\t".join(',',@opcode); } else { return $instr."\t".@_[0]; } } foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/geo; s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo; print $_,"\n"; } close STDOUT or die "error closing STDOUT: $!"; 
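# Closing note (comment only): the loop above post-processes $code before
# printing it. Backticked expressions are replaced by their evaluated values,
# and the SHA-NI mnemonics are rewritten by sha256op38() into raw .byte
# sequences so the output assembles even with tools that predate the SHA
# extensions. For example, with illustrative operands,
#
#	sha256rnds2	%xmm3,%xmm4
#
# becomes
#
#	.byte	15,56,203,227
#
# that is, 0x0f,0x38,0xcb followed by the ModR/M byte 0xc0|3|(4<<3).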
ring-0.17.14/crypto/internal.h000064400000000000000000000403331046102023000142450ustar 00000000000000// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef OPENSSL_HEADER_CRYPTO_INTERNAL_H #define OPENSSL_HEADER_CRYPTO_INTERNAL_H #include // Must be first. #include "ring-core/check.h" #if defined(__clang__) // Don't require prototypes for functions defined in C that are only // used from Rust. #pragma GCC diagnostic ignored "-Wmissing-prototypes" #endif #if defined(__GNUC__) && \ (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) < 40800 // |alignas| and |alignof| were added in C11. GCC added support in version 4.8. // Testing for __STDC_VERSION__/__cplusplus doesn't work because 4.7 already // reports support for C11. #define alignas(x) __attribute__ ((aligned (x))) #elif defined(_MSC_VER) && !defined(__clang__) #define alignas(x) __declspec(align(x)) #else #include #endif #if defined(__clang__) || defined(__GNUC__) #define RING_NOINLINE __attribute__((noinline)) #elif defined(_MSC_VER) #define RING_NOINLINE __declspec(noinline) #else #define RING_NOINLINE #endif // Some C compilers require a useless cast when dealing with arrays for the // reason explained in // https://gustedt.wordpress.com/2011/02/12/const-and-arrays/ #if defined(__clang__) || defined(_MSC_VER) #define RING_CORE_POINTLESS_ARRAY_CONST_CAST(cast) #else #define RING_CORE_POINTLESS_ARRAY_CONST_CAST(cast) cast #endif // `uint8_t` isn't guaranteed to be 'unsigned char' and only 'char' and // 'unsigned char' are allowed to alias according to ISO C. typedef unsigned char aliasing_uint8_t; #if (!defined(_MSC_VER) || defined(__clang__)) && defined(OPENSSL_64_BIT) #define BORINGSSL_HAS_UINT128 typedef __int128_t int128_t; typedef __uint128_t uint128_t; #endif // GCC-like compilers indicate SSE2 with |__SSE2__|. MSVC leaves the caller to // know that x86_64 has SSE2, and uses _M_IX86_FP to indicate SSE2 on x86. // https://learn.microsoft.com/en-us/cpp/preprocessor/predefined-macros?view=msvc-170 #if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) # if defined(_MSC_VER) && !defined(__clang__) # if defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) # define OPENSSL_SSE2 # else # error "SSE2 is required." # endif # elif !defined(__SSE2__) # error "SSE2 is required." # endif #endif // For convenience in testing the fallback code, we allow disabling SSE2 // intrinsics via |OPENSSL_NO_SSE2_FOR_TESTING|. We require SSE2 on x86 and // x86_64, so we would otherwise need to test such code on a non-x86 platform. // // This does not remove the above requirement for SSE2 support with assembly // optimizations. It only disables some intrinsics-based optimizations so that // we can test the fallback code on CI. #if defined(OPENSSL_SSE2) && defined(OPENSSL_NO_SSE2_FOR_TESTING) #undef OPENSSL_SSE2 #endif // Pointer utility functions. // buffers_alias returns one if |a| and |b| alias and zero otherwise. 
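// For example, with hypothetical addresses a = 0x1000, a_bytes = 16 and
// b = 0x1008, b_bytes = 16 the ranges overlap and the function returns 1;
// with b = 0x1010 the ranges are merely adjacent and it returns 0.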
static inline int buffers_alias(const void *a, size_t a_bytes, const void *b, size_t b_bytes) { // Cast |a| and |b| to integers. In C, pointer comparisons between unrelated // objects are undefined whereas pointer to integer conversions are merely // implementation-defined. We assume the implementation defined it in a sane // way. uintptr_t a_u = (uintptr_t)a; uintptr_t b_u = (uintptr_t)b; return a_u + a_bytes > b_u && b_u + b_bytes > a_u; } // Constant-time utility functions. // // The following methods return a bitmask of all ones (0xff...f) for true and 0 // for false. This is useful for choosing a value based on the result of a // conditional in constant time. For example, // // if (a < b) { // c = a; // } else { // c = b; // } // // can be written as // // crypto_word_t lt = constant_time_lt_w(a, b); // c = constant_time_select_w(lt, a, b); #if defined(__GNUC__) || defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wconversion" #pragma GCC diagnostic ignored "-Wsign-conversion" #endif #if defined(_MSC_VER) && !defined(__clang__) #pragma warning(push) // '=': conversion from 'crypto_word_t' to 'uint8_t', possible loss of data #pragma warning(disable: 4242) // 'initializing': conversion from 'crypto_word_t' to 'uint8_t', ... #pragma warning(disable: 4244) #endif // crypto_word_t is the type that most constant-time functions use. Ideally we // would like it to be |size_t|, but NaCl builds in 64-bit mode with 32-bit // pointers, which means that |size_t| can be 32 bits when |BN_ULONG| is 64 // bits. Since we want to be able to do constant-time operations on a // |BN_ULONG|, |crypto_word_t| is defined as an unsigned value with the native // word length. #if defined(OPENSSL_64_BIT) typedef uint64_t crypto_word_t; #define CRYPTO_WORD_BITS (64u) #elif defined(OPENSSL_32_BIT) typedef uint32_t crypto_word_t; #define CRYPTO_WORD_BITS (32u) #else #error "Must define either OPENSSL_32_BIT or OPENSSL_64_BIT" #endif #define CONSTTIME_TRUE_W ~((crypto_word_t)0) #define CONSTTIME_FALSE_W ((crypto_word_t)0) // value_barrier_w returns |a|, but prevents GCC and Clang from reasoning about // the returned value. This is used to mitigate compilers undoing constant-time // code, until we can express our requirements directly in the language. // // Note the compiler is aware that |value_barrier_w| has no side effects and // always has the same output for a given input. This allows it to eliminate // dead code, move computations across loops, and vectorize. static inline crypto_word_t value_barrier_w(crypto_word_t a) { #if defined(__GNUC__) || defined(__clang__) __asm__("" : "+r"(a) : /* no inputs */); #endif return a; } // value_barrier_u32 behaves like |value_barrier_w| but takes a |uint32_t|. static inline uint32_t value_barrier_u32(uint32_t a) { #if defined(__GNUC__) || defined(__clang__) __asm__("" : "+r"(a) : /* no inputs */); #endif return a; } // |value_barrier_u8| could be defined as above, but compilers other than // clang seem to still materialize 0x00..00MM instead of reusing 0x??..??MM. // constant_time_msb_w returns the given value with the MSB copied to all the // other bits. static inline crypto_word_t constant_time_msb_w(crypto_word_t a) { return 0u - (a >> (sizeof(a) * 8 - 1)); } // constant_time_is_zero returns 0xff..f if a == 0 and 0 otherwise. 
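// Worked example of the formula below, using 8-bit values for brevity:
// for a = 0, ~a & (a - 1) is 0xff & 0xff = 0xff, whose MSB is set, so the
// result is all ones; for a = 1 it is 0xfe & 0x00 = 0x00, and for a = 0x80 it
// is 0x7f & 0x7f = 0x7f, so for nonzero |a| the MSB is clear and the result
// is 0.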
static inline crypto_word_t constant_time_is_zero_w(crypto_word_t a) { // Here is an SMT-LIB verification of this formula: // // (define-fun is_zero ((a (_ BitVec 32))) (_ BitVec 32) // (bvand (bvnot a) (bvsub a #x00000001)) // ) // // (declare-fun a () (_ BitVec 32)) // // (assert (not (= (= #x00000001 (bvlshr (is_zero a) #x0000001f)) (= a #x00000000)))) // (check-sat) // (get-model) return constant_time_msb_w(~a & (a - 1)); } static inline crypto_word_t constant_time_is_nonzero_w(crypto_word_t a) { return ~constant_time_is_zero_w(a); } // constant_time_eq_w returns 0xff..f if a == b and 0 otherwise. static inline crypto_word_t constant_time_eq_w(crypto_word_t a, crypto_word_t b) { return constant_time_is_zero_w(a ^ b); } // constant_time_select_w returns (mask & a) | (~mask & b). When |mask| is all // 1s or all 0s (as returned by the methods above), the select methods return // either |a| (if |mask| is nonzero) or |b| (if |mask| is zero). static inline crypto_word_t constant_time_select_w(crypto_word_t mask, crypto_word_t a, crypto_word_t b) { // Clang recognizes this pattern as a select. While it usually transforms it // to a cmov, it sometimes further transforms it into a branch, which we do // not want. // // Hiding the value of the mask from the compiler evades this transformation. mask = value_barrier_w(mask); return (mask & a) | (~mask & b); } // constant_time_select_8 acts like |constant_time_select| but operates on // 8-bit values. static inline uint8_t constant_time_select_8(crypto_word_t mask, uint8_t a, uint8_t b) { // |mask| is a word instead of |uint8_t| to avoid materializing 0x000..0MM // Making both |mask| and its value barrier |uint8_t| would allow the compiler // to materialize 0x????..?MM instead, but only clang is that clever. // However, vectorization of bitwise operations seems to work better on // |uint8_t| than a mix of |uint64_t| and |uint8_t|, so |m| is cast to // |uint8_t| after the value barrier but before the bitwise operations. uint8_t m = value_barrier_w(mask); return (m & a) | (~m & b); } // constant_time_conditional_memcpy copies |n| bytes from |src| to |dst| if // |mask| is 0xff..ff and does nothing if |mask| is 0. The |n|-byte memory // ranges at |dst| and |src| must not overlap, as when calling |memcpy|. static inline void constant_time_conditional_memcpy(void *dst, const void *src, const size_t n, const crypto_word_t mask) { debug_assert_nonsecret(!buffers_alias(dst, n, src, n)); uint8_t *out = (uint8_t *)dst; const uint8_t *in = (const uint8_t *)src; for (size_t i = 0; i < n; i++) { out[i] = constant_time_select_8(mask, in[i], out[i]); } } // constant_time_conditional_memxor xors |n| bytes from |src| to |dst| if // |mask| is 0xff..ff and does nothing if |mask| is 0. The |n|-byte memory // ranges at |dst| and |src| must not overlap, as when calling |memcpy|. 
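// A minimal usage sketch (the names |acc|, |table|, |i|, |secret_index| and
// |n| are illustrative, not part of this header): starting from an all-zero
// |acc|, XOR-accumulate the one table entry whose index equals a secret
// value, without branching on the secret:
//
//   crypto_word_t eq = constant_time_eq_w(i, secret_index);
//   constant_time_conditional_memxor(acc, table[i], n, eq);
//
// Run over every |i|; because |eq| is all-ones for exactly one entry, |acc|
// ends up holding that entry while the memory access pattern stays uniform.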
static inline void constant_time_conditional_memxor(void *dst, const void *src, size_t n, const crypto_word_t mask) { debug_assert_nonsecret(!buffers_alias(dst, n, src, n)); aliasing_uint8_t *out = dst; const aliasing_uint8_t *in = src; #if defined(__GNUC__) && !defined(__clang__) // gcc 13.2.0 doesn't automatically vectorize this loop regardless of barrier typedef aliasing_uint8_t v32u8 __attribute__((vector_size(32), aligned(1), may_alias)); size_t n_vec = n&~(size_t)31; v32u8 masks = ((aliasing_uint8_t)mask-(v32u8){}); // broadcast for (size_t i = 0; i < n_vec; i += 32) { *(v32u8*)&out[i] ^= masks & *(v32u8 const*)&in[i]; } out += n_vec; n -= n_vec; #endif for (size_t i = 0; i < n; i++) { out[i] ^= value_barrier_w(mask) & in[i]; } } #if defined(BORINGSSL_CONSTANT_TIME_VALIDATION) // CONSTTIME_SECRET takes a pointer and a number of bytes and marks that region // of memory as secret. Secret data is tracked as it flows to registers and // other parts of a memory. If secret data is used as a condition for a branch, // or as a memory index, it will trigger warnings in valgrind. #define CONSTTIME_SECRET(ptr, len) VALGRIND_MAKE_MEM_UNDEFINED(ptr, len) // CONSTTIME_DECLASSIFY takes a pointer and a number of bytes and marks that // region of memory as public. Public data is not subject to constant-time // rules. #define CONSTTIME_DECLASSIFY(ptr, len) VALGRIND_MAKE_MEM_DEFINED(ptr, len) #else #define CONSTTIME_SECRET(ptr, len) #define CONSTTIME_DECLASSIFY(ptr, len) #endif // BORINGSSL_CONSTANT_TIME_VALIDATION static inline crypto_word_t constant_time_declassify_w(crypto_word_t v) { // Return |v| through a value barrier to be safe. Valgrind-based constant-time // validation is partly to check the compiler has not undone any constant-time // work. Any place |BORINGSSL_CONSTANT_TIME_VALIDATION| influences // optimizations, this validation is inaccurate. // // However, by sending pointers through valgrind, we likely inhibit escape // analysis. On local variables, particularly booleans, we likely // significantly impact optimizations. // // Thus, to be safe, stick a value barrier, in hopes of comparably inhibiting // compiler analysis. CONSTTIME_DECLASSIFY(&v, sizeof(v)); return value_barrier_w(v); } static inline int constant_time_declassify_int(int v) { OPENSSL_STATIC_ASSERT(sizeof(uint32_t) == sizeof(int), "int is not the same size as uint32_t"); // See comment above. CONSTTIME_DECLASSIFY(&v, sizeof(v)); return value_barrier_u32((uint32_t)v); } #if defined(_MSC_VER) && !defined(__clang__) // '=': conversion from 'int64_t' to 'int32_t', possible loss of data #pragma warning(pop) #endif #if defined(__GNUC__) || defined(__clang__) #pragma GCC diagnostic pop #endif // declassify_assert behaves like |assert| but declassifies the result of // evaluating |expr|. This allows the assertion to branch on the (presumably // public) result, but still ensures that values leading up to the computation // were secret. #define declassify_assert(expr) dev_assert_secret(constant_time_declassify_int(expr)) // Endianness conversions. 
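// For example, CRYPTO_bswap4(0x01020304) == 0x04030201. The
// CRYPTO_load_u32_le/CRYPTO_load_u32_be helpers further below use these swaps
// so that the byte sequence 01 02 03 04 loads as 0x04030201 with the _le
// variant and as 0x01020304 with the _be variant, regardless of the host's
// byte order.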
#if defined(__GNUC__) && __GNUC__ >= 2 static inline uint32_t CRYPTO_bswap4(uint32_t x) { return __builtin_bswap32(x); } static inline uint64_t CRYPTO_bswap8(uint64_t x) { return __builtin_bswap64(x); } #elif defined(_MSC_VER) #pragma warning(push, 3) #include #pragma warning(pop) #pragma intrinsic(_byteswap_ulong) static inline uint32_t CRYPTO_bswap4(uint32_t x) { return _byteswap_ulong(x); } #endif #if !defined(RING_CORE_NOSTDLIBINC) #include #endif static inline void *OPENSSL_memcpy(void *dst, const void *src, size_t n) { #if !defined(RING_CORE_NOSTDLIBINC) if (n == 0) { return dst; } return memcpy(dst, src, n); #else aliasing_uint8_t *d = dst; const aliasing_uint8_t *s = src; for (size_t i = 0; i < n; ++i) { d[i] = s[i]; } return dst; #endif } static inline void *OPENSSL_memset(void *dst, int c, size_t n) { #if !defined(RING_CORE_NOSTDLIBINC) if (n == 0) { return dst; } return memset(dst, c, n); #else aliasing_uint8_t *d = dst; for (size_t i = 0; i < n; ++i) { d[i] = (aliasing_uint8_t)c; } return dst; #endif } // Loads and stores. // // The following functions load and store sized integers with the specified // endianness. They use |memcpy|, and so avoid alignment or strict aliasing // requirements on the input and output pointers. #if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ #define RING_BIG_ENDIAN #endif #endif static inline uint32_t CRYPTO_load_u32_le(const void *in) { uint32_t v; OPENSSL_memcpy(&v, in, sizeof(v)); #if defined(RING_BIG_ENDIAN) return CRYPTO_bswap4(v); #else return v; #endif } static inline void CRYPTO_store_u32_le(void *out, uint32_t v) { #if defined(RING_BIG_ENDIAN) v = CRYPTO_bswap4(v); #endif OPENSSL_memcpy(out, &v, sizeof(v)); } static inline uint32_t CRYPTO_load_u32_be(const void *in) { uint32_t v; OPENSSL_memcpy(&v, in, sizeof(v)); #if !defined(RING_BIG_ENDIAN) return CRYPTO_bswap4(v); #else return v; #endif } static inline void CRYPTO_store_u32_be(void *out, uint32_t v) { #if !defined(RING_BIG_ENDIAN) v = CRYPTO_bswap4(v); #endif OPENSSL_memcpy(out, &v, sizeof(v)); } // Runtime CPU feature support #if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) // OPENSSL_ia32cap_P contains the Intel CPUID bits when running on an x86 or // x86-64 system. // // Index 0: // EDX for CPUID where EAX = 1 // Bit 30 is used to indicate an Intel CPU // Index 1: // ECX for CPUID where EAX = 1 // Index 2: // EBX for CPUID where EAX = 7, ECX = 0 // Bit 14 (for removed feature MPX) is used to indicate a preference for ymm // registers over zmm even when zmm registers are supported // Index 3: // ECX for CPUID where EAX = 7, ECX = 0 // // Note: the CPUID bits are pre-adjusted for the OSXSAVE bit and the XMM, YMM, // and AVX512 bits in XCR0, so it is not necessary to check those. (WARNING: See // caveats in cpu_intel.c.) #if defined(OPENSSL_X86_64) extern uint32_t avx2_available; extern uint32_t adx_bmi2_available; #endif #endif #if defined(OPENSSL_ARM) extern alignas(4) uint32_t neon_available; #endif // OPENSSL_ARM #endif // OPENSSL_HEADER_CRYPTO_INTERNAL_H ring-0.17.14/crypto/limbs/limbs.c000064400000000000000000000141651046102023000146440ustar 00000000000000/* Copyright 2016-2017 Brian Smith. * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. 
* * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include "limbs.h" #include "../internal.h" #include "../fipsmodule/bn/internal.h" #include "limbs.inl" /* XXX: We assume that the conversion from |Carry| to |Limb| is constant-time, * but we haven't verified that assumption. TODO: Fix it so we don't need to * make that assumption. */ /* Returns 0xfff..f if |a| is zero, and zero otherwise. */ Limb LIMB_is_zero(const Limb a) { return constant_time_is_zero_w(a); } /* Returns 0xfff..f if |a| is all zero limbs, and zero otherwise. |num_limbs| * may be zero. */ Limb LIMBS_are_zero(const Limb a[], size_t num_limbs) { Limb all = 0; for (size_t i = 0; i < num_limbs; ++i) { all |= a[i]; } return LIMB_is_zero(all); } /* Returns 0xffff..f if |a == b|, and zero otherwise. |num_limbs| may be zero. */ Limb LIMBS_equal(const Limb a[], const Limb b[], size_t num_limbs) { Limb eq = CONSTTIME_TRUE_W; for (size_t i = 0; i < num_limbs; ++i) { eq = constant_time_select_w(eq, constant_time_eq_w(a[i], b[i]), eq); } return eq; } /* Returns 0xffff...f if |a| is less than |b|, and zero otherwise. */ Limb LIMBS_less_than(const Limb a[], const Limb b[], size_t num_limbs) { debug_assert_nonsecret(num_limbs >= 1); /* There are lots of ways to implement this. It is implemented this way to * be consistent with |LIMBS_limbs_reduce_once| and other code that makes such * comparisons as part of doing conditional reductions. */ Limb dummy; Carry borrow = limb_sub(&dummy, a[0], b[0]); for (size_t i = 1; i < num_limbs; ++i) { borrow = limb_sbb(&dummy, a[i], b[i], borrow); } return constant_time_is_nonzero_w(borrow); } /* if (r >= m) { r -= m; } */ void LIMBS_reduce_once(Limb r[], const Limb m[], size_t num_limbs) { debug_assert_nonsecret(num_limbs >= 1); /* This could be done more efficiently if we had |num_limbs| of extra space * available, by storing |r - m| and then doing a conditional copy of either * |r| or |r - m|. But, in order to operate in constant space, with an eye * towards this function being used in RSA in the future, we do things a * slightly less efficient way. */ Limb lt = LIMBS_less_than(r, m, num_limbs); Carry borrow = limb_sub(&r[0], r[0], constant_time_select_w(lt, 0, m[0])); for (size_t i = 1; i < num_limbs; ++i) { /* XXX: This is probably particularly inefficient because the operations in * constant_time_select affect the carry flag, so there will likely be * loads and stores of |borrow|. 
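 *
 * Net effect of this conditional subtraction: r -= (r >= m) ? m : 0, computed
 * without a branch. |lt| is either all-ones or all-zeros, so
 * constant_time_select_w yields either 0 (leave |r| unchanged) or m[i]
 * (subtract the modulus), and the borrow is threaded through
 * limb_sub/limb_sbb exactly as in an ordinary multi-limb subtraction.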
*/ borrow = limb_sbb(&r[i], r[i], constant_time_select_w(lt, 0, m[i]), borrow); } dev_assert_secret(borrow == 0); } void LIMBS_add_mod(Limb r[], const Limb a[], const Limb b[], const Limb m[], size_t num_limbs) { Limb overflow1 = constant_time_is_nonzero_w(limbs_add(r, a, b, num_limbs)); Limb overflow2 = ~LIMBS_less_than(r, m, num_limbs); Limb overflow = overflow1 | overflow2; Carry borrow = limb_sub(&r[0], r[0], m[0] & overflow); for (size_t i = 1; i < num_limbs; ++i) { borrow = limb_sbb(&r[i], r[i], m[i] & overflow, borrow); } } void LIMBS_sub_mod(Limb r[], const Limb a[], const Limb b[], const Limb m[], size_t num_limbs) { Limb underflow = constant_time_is_nonzero_w(limbs_sub(r, a, b, num_limbs)); Carry carry = limb_add(&r[0], r[0], m[0] & underflow); for (size_t i = 1; i < num_limbs; ++i) { carry = limb_adc(&r[i], r[i], m[i] & underflow, carry); } } void LIMBS_shl_mod(Limb r[], const Limb a[], const Limb m[], size_t num_limbs) { Limb overflow1 = constant_time_is_nonzero_w(a[num_limbs - 1] & LIMB_HIGH_BIT); Limb carry = 0; for (size_t i = 0; i < num_limbs; ++i) { Limb limb = a[i]; Limb new_carry = limb >> (LIMB_BITS - 1); r[i] = (limb << 1) | carry; carry = new_carry; } Limb overflow2 = ~LIMBS_less_than(r, m, num_limbs); Limb overflow = overflow1 | overflow2; Carry borrow = limb_sub(&r[0], r[0], m[0] & overflow); for (size_t i = 1; i < num_limbs; ++i) { borrow = limb_sbb(&r[i], r[i], m[i] & overflow, borrow); } } int LIMBS_select_512_32(Limb r[], const Limb table[], size_t num_limbs, crypto_word_t index) { if (num_limbs % (512 / LIMB_BITS) != 0) { return 0; } limbs_select(r, table, num_limbs, 32, index); return 1; } static const Limb FIVE_BITS_MASK = 0x1f; crypto_word_t LIMBS_window5_split_window(Limb lower_limb, Limb higher_limb, size_t index_within_word) { Limb high_bits = (higher_limb << (LIMB_BITS - index_within_word)) & FIVE_BITS_MASK; // There are no bits outside the window above |index_within_word| (if there // were then this wouldn't be a split window), so we don't need to mask // |low_bits|. Limb low_bits = lower_limb >> index_within_word; return low_bits | high_bits; } crypto_word_t LIMBS_window5_unsplit_window(Limb limb, size_t index_within_word) { return (limb >> index_within_word) & FIVE_BITS_MASK; } Limb LIMB_shr(Limb a, size_t shift) { return a >> shift; } Limb limbs_mul_add_limb(Limb r[], const Limb a[], Limb b, size_t num_limbs) { Limb carried = 0; for (size_t i = 0; i < num_limbs; ++i) { Limb lo; Limb hi; bn_umult_lohi(&lo, &hi, a[i], b); Limb tmp; Carry c = limb_add(&tmp, lo, carried); c = limb_adc(&carried, hi, 0, c); dev_assert_secret(c == 0); c = limb_add(&r[i], r[i], tmp); c = limb_adc(&carried, carried, 0, c); // (A * B) + C + D never carries. dev_assert_secret(c == 0); } return carried; } ring-0.17.14/crypto/limbs/limbs.h000064400000000000000000000030651046102023000146460ustar 00000000000000/* Copyright 2016 Brian Smith. * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #ifndef RING_LIMBS_H #define RING_LIMBS_H #include #include "../internal.h" typedef crypto_word_t Limb; #define LIMB_BITS CRYPTO_WORD_BITS #define LIMB_HIGH_BIT ((Limb)(1) << (LIMB_BITS - 1)) Limb LIMBS_are_zero(const Limb a[], size_t num_limbs); Limb LIMBS_equal(const Limb a[], const Limb b[], size_t num_limbs); void LIMBS_reduce_once(Limb r[], const Limb m[], size_t num_limbs); void LIMBS_add_mod(Limb r[], const Limb a[], const Limb b[], const Limb m[], size_t num_limbs); void LIMBS_sub_mod(Limb r[], const Limb a[], const Limb b[], const Limb m[], size_t num_limbs); void LIMBS_shl_mod(Limb r[], const Limb a[], const Limb m[], size_t num_limbs); Limb limbs_mul_add_limb(Limb r[], const Limb a[], Limb b, size_t num_limbs); #endif /* RING_LIMBS_H */ ring-0.17.14/crypto/limbs/limbs.inl000064400000000000000000000120341046102023000151750ustar 00000000000000/* Copyright 2016 Brian Smith. * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include "limbs.h" #include "ring-core/check.h" #if defined(_MSC_VER) && !defined(__clang__) #pragma warning(push, 3) #include #pragma warning(pop) /* MSVC 2015 RC, when compiling for x86 with /Ox (at least), miscompiles * _addcarry_u32(c, 0, prod_hi, &x) like so: * * add eax,esi ; The previous add that might have set the carry flag. * xor esi,esi ; OOPS! Carry flag is now reset! * mov dword ptr [edi-4],eax * adc esi,dword ptr [prod_hi] * * We test with MSVC 2015 update 2, so make sure we're using a version at least * as new as that. */ #if _MSC_FULL_VER < 190023918 #error "MSVC 2015 Update 2 or later is required." #endif typedef uint8_t Carry; #if LIMB_BITS == 64 #pragma intrinsic(_addcarry_u64, _subborrow_u64) #define RING_CORE_ADDCARRY_INTRINSIC _addcarry_u64 #define RING_CORE_SUBBORROW_INTRINSIC _subborrow_u64 #elif LIMB_BITS == 32 #pragma intrinsic(_addcarry_u32, _subborrow_u32) #define RING_CORE_ADDCARRY_INTRINSIC _addcarry_u32 #define RING_CORE_SUBBORROW_INTRINSIC _subborrow_u32 typedef uint64_t DoubleLimb; #endif #else typedef Limb Carry; #if LIMB_BITS == 64 typedef __uint128_t DoubleLimb; #elif LIMB_BITS == 32 typedef uint64_t DoubleLimb; #endif #endif /* |*r = a + b + carry_in|, returning carry out bit. |carry_in| must be 0 or 1. 
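 *
 * Together with |limb_add| this forms a ripple-carry chain; informally, a
 * two-limb addition (a1:a0) + (b1:b0) is
 *
 *   Carry c = limb_add(&r0, a0, b0);
 *   c = limb_adc(&r1, a1, b1, c);   // c is the carry out of the full sum
 *
 * which is the same pattern |limbs_add| below applies to any number of limbs.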
*/ static inline Carry limb_adc(Limb *r, Limb a, Limb b, Carry carry_in) { dev_assert_secret(carry_in == 0 || carry_in == 1); Carry ret; #if defined(RING_CORE_ADDCARRY_INTRINSIC) ret = RING_CORE_ADDCARRY_INTRINSIC(carry_in, a, b, r); #else DoubleLimb x = (DoubleLimb)a + b + carry_in; *r = (Limb)x; ret = (Carry)(x >> LIMB_BITS); #endif dev_assert_secret(ret == 0 || ret == 1); return ret; } /* |*r = a + b|, returning carry bit. */ static inline Carry limb_add(Limb *r, Limb a, Limb b) { Carry ret; #if defined(RING_CORE_ADDCARRY_INTRINSIC) ret = RING_CORE_ADDCARRY_INTRINSIC(0, a, b, r); #else DoubleLimb x = (DoubleLimb)a + b; *r = (Limb)x; ret = (Carry)(x >> LIMB_BITS); #endif dev_assert_secret(ret == 0 || ret == 1); return ret; } /* |*r = a - b - borrow_in|, returning the borrow out bit. |borrow_in| must be * 0 or 1. */ static inline Carry limb_sbb(Limb *r, Limb a, Limb b, Carry borrow_in) { dev_assert_secret(borrow_in == 0 || borrow_in == 1); Carry ret; #if defined(RING_CORE_SUBBORROW_INTRINSIC) ret = RING_CORE_SUBBORROW_INTRINSIC(borrow_in, a, b, r); #else DoubleLimb x = (DoubleLimb)a - b - borrow_in; *r = (Limb)x; ret = (Carry)((x >> LIMB_BITS) & 1); #endif dev_assert_secret(ret == 0 || ret == 1); return ret; } /* |*r = a - b|, returning borrow bit. */ static inline Carry limb_sub(Limb *r, Limb a, Limb b) { Carry ret; #if defined(RING_CORE_SUBBORROW_INTRINSIC) ret = RING_CORE_SUBBORROW_INTRINSIC(0, a, b, r); #else DoubleLimb x = (DoubleLimb)a - b; *r = (Limb)x; ret = (Carry)((x >> LIMB_BITS) & 1); #endif dev_assert_secret(ret == 0 || ret == 1); return ret; } static inline Carry limbs_add(Limb r[], const Limb a[], const Limb b[], size_t num_limbs) { debug_assert_nonsecret(num_limbs >= 1); Carry carry = limb_add(&r[0], a[0], b[0]); for (size_t i = 1; i < num_limbs; ++i) { carry = limb_adc(&r[i], a[i], b[i], carry); } return carry; } /* |r -= s|, returning the borrow. */ static inline Carry limbs_sub(Limb r[], const Limb a[], const Limb b[], size_t num_limbs) { debug_assert_nonsecret(num_limbs >= 1); Carry borrow = limb_sub(&r[0], a[0], b[0]); for (size_t i = 1; i < num_limbs; ++i) { borrow = limb_sbb(&r[i], a[i], b[i], borrow); } return borrow; } static inline void limbs_copy(Limb r[], const Limb a[], size_t num_limbs) { for (size_t i = 0; i < num_limbs; ++i) { r[i] = a[i]; } } static inline void limbs_select(Limb r[], const Limb table[], size_t num_limbs, size_t num_entries, crypto_word_t index) { for (size_t i = 0; i < num_limbs; ++i) { r[i] = 0; } for (size_t e = 0; e < num_entries; ++e) { Limb equal = constant_time_eq_w(index, e); for (size_t i = 0; i < num_limbs; ++i) { r[i] = constant_time_select_w(equal, table[(e * num_limbs) + i], r[i]); } } } static inline void limbs_zero(Limb r[], size_t num_limbs) { for (size_t i = 0; i < num_limbs; ++i) { r[i] = 0; } } ring-0.17.14/crypto/mem.c000064400000000000000000000016271046102023000132050ustar 00000000000000// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
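// CRYPTO_memcmp below differs from memcmp in two ways: it reports only
// equality versus inequality (never ordering), and its running time does not
// depend on the contents because it ORs together the XOR of every byte pair.
// A minimal usage sketch, with illustrative names that are not part of this
// file:
//
//   if (CRYPTO_memcmp(computed_tag, received_tag, tag_len) != 0) {
//     return 0;  // reject; no early exit reveals which byte differed
//   }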
#include #include "internal.h" int CRYPTO_memcmp(const void *in_a, const void *in_b, size_t len) { const aliasing_uint8_t *a = in_a; const aliasing_uint8_t *b = in_b; uint8_t x = 0; for (size_t i = 0; i < len; i++) { x |= a[i] ^ b[i]; } return x; } ring-0.17.14/crypto/perlasm/arm-xlate.pl000064400000000000000000000154701046102023000161560ustar 00000000000000#! /usr/bin/env perl # Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. use strict; my $flavour = shift; my $output = shift; open STDOUT,">$output" || die "can't open $output: $!"; $flavour = "linux32" if (!$flavour or $flavour eq "void"); my %GLOBALS; my $dotinlocallabels=($flavour=~/linux/)?1:0; ################################################################ # directives which need special treatment on different platforms ################################################################ my $arch = sub { if ($flavour =~ /linux/) { ".arch\t".join(',',@_); } elsif ($flavour =~ /win64/) { ".arch\t".join(',',@_); } else { ""; } }; my $fpu = sub { if ($flavour =~ /linux/) { ".fpu\t".join(',',@_); } else { ""; } }; my $hidden = sub { if ($flavour =~ /ios/) { ".private_extern\t".join(',',@_); } elsif ($flavour =~ /win64/) { ""; } else { ".hidden\t".join(',',@_); } }; my $comm = sub { my @args = split(/,\s*/,shift); my $name = @args[0]; my $global = \$GLOBALS{$name}; my $ret; if ($flavour =~ /ios32/) { $ret = ".comm\t_$name,@args[1]\n"; $ret .= ".non_lazy_symbol_pointer\n"; $ret .= "$name:\n"; $ret .= ".indirect_symbol\t_$name\n"; $ret .= ".long\t0"; $name = "_$name"; } else { $ret = ".comm\t".join(',',@args); } $$global = $name; $ret; }; my $globl = sub { my $name = shift; my $global = \$GLOBALS{$name}; my $ret; SWITCH: for ($flavour) { /ios/ && do { $name = "_$name"; last; }; } $ret = ".globl $name\n"; # All symbols in assembly files are hidden. $ret .= &$hidden($name); $$global = $name; $ret; }; my $global = $globl; my $extern = sub { &$globl(@_); return; # return nothing }; my $type = sub { if ($flavour =~ /linux/) { ".type\t".join(',',@_); } elsif ($flavour =~ /ios32/) { if (join(',',@_) =~ /(\w+),%function/) { "#ifdef __thumb2__\n". ".thumb_func $1\n". "#endif"; } } elsif ($flavour =~ /win64/) { if (join(',',@_) =~ /(\w+),%function/) { # See https://sourceware.org/binutils/docs/as/Pseudo-Ops.html # Per https://docs.microsoft.com/en-us/windows/win32/debug/pe-format#coff-symbol-table, # the type for functions is 0x20, or 32. ".def $1\n". " .type 32\n". ".endef"; } } else { ""; } }; my $size = sub { if ($flavour =~ /linux/) { ".size\t".join(',',@_); } else { ""; } }; my $inst = sub { if ($flavour =~ /linux/) { ".inst\t".join(',',@_); } else { ".long\t".join(',',@_); } }; my $asciz = sub { my $line = join(",",@_); if ($line =~ /^"(.*)"$/) { ".byte " . join(",",unpack("C*",$1),0) . "\n.align 2"; } else { ""; } }; my $section = sub { if ($flavour =~ /ios/) { if ($_[0] eq ".rodata") { return ".section\t__TEXT,__const"; } die "Unknown section name $_[0]"; } else { return ".section\t" . 
join(",", @_); } }; sub range { my ($r,$sfx,$start,$end) = @_; join(",",map("$r$_$sfx",($start..$end))); } sub expand_line { my $line = shift; my @ret = (); pos($line)=0; while ($line =~ m/\G[^@\/\{\"]*/g) { if ($line =~ m/\G(@|\/\/|$)/gc) { last; } elsif ($line =~ m/\G\{/gc) { my $saved_pos = pos($line); $line =~ s/\G([rdqv])([0-9]+)([^\-]*)\-\1([0-9]+)\3/range($1,$3,$2,$4)/e; pos($line) = $saved_pos; $line =~ m/\G[^\}]*\}/g; } elsif ($line =~ m/\G\"/gc) { $line =~ m/\G[^\"]*\"/g; } } $line =~ s/\b(\w+)/$GLOBALS{$1} or $1/ge; return $line; } my ($arch_defines, $target_defines); if ($flavour =~ /32/) { $arch_defines = "defined(OPENSSL_ARM)"; } elsif ($flavour =~ /64/) { $arch_defines = "defined(OPENSSL_AARCH64)"; } else { die "unknown architecture: $flavour"; } if ($flavour =~ /linux/) { # Although the flavour is specified as "linux", it is really used by all # ELF platforms. $target_defines = "defined(__ELF__)"; } elsif ($flavour =~ /ios/) { # Although the flavour is specified as "ios", it is really used by all Apple # platforms. $target_defines = "defined(__APPLE__)"; } elsif ($flavour =~ /win/) { $target_defines = "defined(_WIN32)"; } else { die "unknown target: $flavour"; } print <<___; // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && $arch_defines && $target_defines ___ while(my $line=<>) { if ($line =~ m/^\s*(#|@|\/\/)/) { print $line; next; } $line =~ s|/\*.*\*/||; # get rid of C-style comments... $line =~ s|^\s+||; # ... and skip white spaces in beginning... $line =~ s|\s+$||; # ... and at the end if ($flavour =~ /64/) { my $copy = $line; # Also remove line comments. $copy =~ s|//.*||; if ($copy =~ /\b[wx]18\b/) { die "r18 is reserved by the platform and may not be used."; } } { $line =~ s|[\b\.]L(\w{2,})|L$1|g; # common denominator for Locallabel $line =~ s|\bL(\w{2,})|\.L$1|g if ($dotinlocallabels); } { $line =~ s|(^[\.\w]+)\:\s*||; my $label = $1; if ($label) { printf "%s:",($GLOBALS{$label} or $label); } } if ($line !~ m/^[#@]/) { $line =~ s|^\s*(\.?)(\S+)\s*||; my $c = $1; $c = "\t" if ($c eq ""); my $mnemonic = $2; my $opcode; if ($mnemonic =~ m/([^\.]+)\.([^\.]+)/) { $opcode = eval("\$$1_$2"); } else { $opcode = eval("\$$mnemonic"); } if ($flavour =~ /ios/) { # Mach-O and ELF use different syntax for these relocations. Note # that we require :pg_hi21: to be explicitly listed. It is normally # optional with adrp instructions. $line =~ s|:pg_hi21:(\w+)|\1\@PAGE|; $line =~ s|:lo12:(\w+)|\1\@PAGEOFF|; } else { # Clang's integrated assembly does not support the optional # :pg_hi21: markers, so erase them. $line =~ s|:pg_hi21:||; } my $arg=expand_line($line); if (ref($opcode) eq 'CODE') { $line = &$opcode($arg); } elsif ($mnemonic) { $line = $c.$mnemonic; $line.= "\t$arg" if ($arg ne ""); } } print $line if ($line); print "\n"; } print <<___; #endif // !OPENSSL_NO_ASM && $arch_defines && $target_defines ___ close STDOUT or die "error closing STDOUT: $!"; ring-0.17.14/crypto/perlasm/x86_64-xlate.pl000064400000000000000000001634271046102023000163430ustar 00000000000000#! /usr/bin/env perl # Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Ascetic x86_64 AT&T to MASM/NASM assembler translator by . # # Why AT&T to MASM and not vice versa? Several reasons. Because AT&T # format is way easier to parse. Because it's simpler to "gear" from # Unix ABI to Windows one [see cross-reference "card" at the end of # file]. Because Linux targets were available first... # # In addition the script also "distills" code suitable for GNU # assembler, so that it can be compiled with more rigid assemblers, # such as Solaris /usr/ccs/bin/as. # # This translator is not designed to convert *arbitrary* assembler # code from AT&T format to MASM one. It's designed to convert just # enough to provide for dual-ABI OpenSSL modules development... # There *are* limitations and you might have to modify your assembler # code or this script to achieve the desired result... # # Currently recognized limitations: # # - can't use multiple ops per line; # # Dual-ABI styling rules. # # 1. Adhere to Unix register and stack layout [see cross-reference # ABI "card" at the end for explanation]. # 2. Forget about "red zone," stick to more traditional blended # stack frame allocation. If volatile storage is actually required # that is. If not, just leave the stack as is. # 3. Functions tagged with ".type name,@function" get crafted with # unified Win64 prologue and epilogue automatically. If you want # to take care of ABI differences yourself, tag functions as # ".type name,@abi-omnipotent" instead. # 4. To optimize the Win64 prologue you can specify number of input # arguments as ".type name,@function,N." Keep in mind that if N is # larger than 6, then you *have to* write "abi-omnipotent" code, # because >6 cases can't be addressed with unified prologue. # 5. Name local labels as .L*, do *not* use dynamic labels such as 1: # (sorry about latter). # 6. Don't use [or hand-code with .byte] "rep ret." "ret" mnemonic is # required to identify the spots, where to inject Win64 epilogue! # 7. Stick to explicit ip-relative addressing. If you have to use # GOTPCREL addressing, stick to mov symbol@GOTPCREL(%rip),%r??. # Both are recognized and translated to proper Win64 addressing # modes. # # 8. In order to provide for structured exception handling unified # Win64 prologue copies %rsp value to %rax. For further details # see SEH paragraph at the end. # 9. .init segment is allowed to contain calls to functions only. # a. If function accepts more than 4 arguments *and* >4th argument # is declared as non 64-bit value, do clear its upper part. # # TODO(https://crbug.com/boringssl/259): The dual-ABI mechanism described here # does not quite unwind correctly on Windows. The seh_directive logic below has # the start of a new mechanism. use strict; my $flavour = shift; my $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } open STDOUT,">$output" || die "can't open $output: $!" 
if (defined($output)); my $gas=1; $gas=0 if ($output =~ /\.asm$/); my $elf=1; $elf=0 if (!$gas); my $apple=0; my $win64=0; my $prefix=""; my $decor=".L"; my $masmref=8 + 50727*2**-32; # 8.00.50727 shipped with VS2005 my $masm=0; my $PTR=" PTR"; my $nasmref=2.03; my $nasm=0; if ($flavour eq "mingw64") { $gas=1; $elf=0; $win64=1; # TODO(davidben): Before supporting the # mingw64 perlasm flavour, do away with this # environment variable check. die "mingw64 not supported"; $prefix=`echo __USER_LABEL_PREFIX__ | $ENV{CC} -E -P -`; $prefix =~ s|\R$||; # Better chomp } elsif ($flavour eq "macosx") { $gas=1; $elf=0; $apple=1; $prefix="_"; $decor="L\$"; } elsif ($flavour eq "masm") { $gas=0; $elf=0; $masm=$masmref; $win64=1; $decor="\$L\$"; } elsif ($flavour eq "nasm") { $gas=0; $elf=0; $nasm=$nasmref; $win64=1; $decor="\$L\$"; $PTR=""; } elsif (!$gas) { die "unknown flavour $flavour"; } my $current_segment; my $current_function; my %globals; { package opcode; # pick up opcodes sub re { my ($class, $line) = @_; my $self = {}; my $ret; if ($$line =~ /^([a-z][a-z0-9]*)/i) { bless $self,$class; $self->{op} = $1; $ret = $self; $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; undef $self->{sz}; if ($self->{op} =~ /^(movz)x?([bw]).*/) { # movz is pain... $self->{op} = $1; $self->{sz} = $2; } elsif ($self->{op} =~ /call|jmp/) { $self->{sz} = ""; } elsif ($self->{op} =~ /^p/ && $' !~ /^(ush|op|insrw)/) { # SSEn $self->{sz} = ""; } elsif ($self->{op} =~ /^[vk]/) { # VEX or k* such as kmov $self->{sz} = ""; } elsif ($self->{op} =~ /mov[dq]/ && $$line =~ /%xmm/) { $self->{sz} = ""; } elsif ($self->{op} =~ /^or([qlwb])$/) { $self->{op} = "or"; $self->{sz} = $1; } elsif ($self->{op} =~ /([a-z]{3,})([qlwb])$/) { $self->{op} = $1; $self->{sz} = $2; } } $ret; } sub size { my ($self, $sz) = @_; $self->{sz} = $sz if (defined($sz) && !defined($self->{sz})); $self->{sz}; } sub out { my $self = shift; if ($gas) { if ($self->{op} eq "movz") { # movz is pain... sprintf "%s%s%s",$self->{op},$self->{sz},shift; } elsif ($self->{op} =~ /^set/) { "$self->{op}"; } elsif ($self->{op} eq "ret") { my $epilogue = ""; if ($win64 && $current_function->{abi} eq "svr4") { $epilogue = "movq 8(%rsp),%rdi\n\t" . "movq 16(%rsp),%rsi\n\t"; } $epilogue . "ret"; } elsif ($self->{op} eq "call" && !$elf && $current_segment eq ".init") { ".p2align\t3\n\t.quad"; } else { "$self->{op}$self->{sz}"; } } else { $self->{op} =~ s/^movz/movzx/; if ($self->{op} eq "ret") { $self->{op} = ""; if ($win64 && $current_function->{abi} eq "svr4") { $self->{op} = "mov rdi,QWORD$PTR\[8+rsp\]\t;WIN64 epilogue\n\t". "mov rsi,QWORD$PTR\[16+rsp\]\n\t"; } $self->{op} .= "ret"; } elsif ($self->{op} =~ /^(pop|push)f/) { $self->{op} .= $self->{sz}; } elsif ($self->{op} eq "call" && $current_segment eq ".CRT\$XCU") { $self->{op} = "\tDQ"; } $self->{op}; } } sub mnemonic { my ($self, $op) = @_; $self->{op}=$op if (defined($op)); $self->{op}; } } { package const; # pick up constants, which start with $ sub re { my ($class, $line) = @_; my $self = {}; my $ret; if ($$line =~ /^\$([^,]+)/) { bless $self, $class; $self->{value} = $1; $ret = $self; $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; } $ret; } sub out { my $self = shift; $self->{value} =~ s/\b(0b[0-1]+)/oct($1)/eig; if ($gas) { # Solaris /usr/ccs/bin/as can't handle multiplications # in $self->{value} my $value = $self->{value}; no warnings; # oct might complain about overflow, ignore here... 
$value =~ s/(?{value} = $value; } sprintf "\$%s",$self->{value}; } else { my $value = $self->{value}; $value =~ s/0x([0-9a-f]+)/0$1h/ig if ($masm); sprintf "%s",$value; } } } { package ea; # pick up effective addresses: expr(%reg,%reg,scale) my %szmap = ( b=>"BYTE$PTR", w=>"WORD$PTR", l=>"DWORD$PTR", d=>"DWORD$PTR", q=>"QWORD$PTR", o=>"OWORD$PTR", x=>"XMMWORD$PTR", y=>"YMMWORD$PTR", z=>"ZMMWORD$PTR" ) if (!$gas); sub re { my ($class, $line, $opcode) = @_; my $self = {}; my $ret; # optional * ----vvv--- appears in indirect jmp/call if ($$line =~ /^(\*?)([^\(,]*)\(([%\w,]+)\)((?:{[^}]+})*)/) { bless $self, $class; $self->{asterisk} = $1; $self->{label} = $2; ($self->{base},$self->{index},$self->{scale})=split(/,/,$3); $self->{scale} = 1 if (!defined($self->{scale})); $self->{opmask} = $4; $ret = $self; $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; if ($win64 && $self->{label} =~ s/\@GOTPCREL//) { die if ($opcode->mnemonic() ne "mov"); $opcode->mnemonic("lea"); } $self->{base} =~ s/^%//; $self->{index} =~ s/^%// if (defined($self->{index})); $self->{opcode} = $opcode; } $ret; } sub size {} sub out { my ($self, $sz) = @_; $self->{label} =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; $self->{label} =~ s/\.L/$decor/g; # Silently convert all EAs to 64-bit. This is required for # elder GNU assembler and results in more compact code, # *but* most importantly AES module depends on this feature! $self->{index} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/; $self->{base} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/; # Solaris /usr/ccs/bin/as can't handle multiplications # in $self->{label}... use integer; $self->{label} =~ s/(?{label} =~ s/\b([0-9]+\s*[\*\/\%]\s*[0-9]+)\b/eval($1)/eg; # Some assemblers insist on signed presentation of 32-bit # offsets, but sign extension is a tricky business in perl... if ((1<<31)<<1) { $self->{label} =~ s/\b([0-9]+)\b/$1<<32>>32/eg; } else { $self->{label} =~ s/\b([0-9]+)\b/$1>>0/eg; } # if base register is %rbp or %r13, see if it's possible to # flip base and index registers [for better performance] if (!$self->{label} && $self->{index} && $self->{scale}==1 && $self->{base} =~ /(rbp|r13)/) { $self->{base} = $self->{index}; $self->{index} = $1; } if ($gas) { $self->{label} =~ s/^___imp_/__imp__/ if ($flavour eq "mingw64"); if (defined($self->{index})) { sprintf "%s%s(%s,%%%s,%d)%s", $self->{asterisk},$self->{label}, $self->{base}?"%$self->{base}":"", $self->{index},$self->{scale}, $self->{opmask}; } else { sprintf "%s%s(%%%s)%s", $self->{asterisk},$self->{label}, $self->{base},$self->{opmask}; } } else { $self->{label} =~ s/\./\$/g; $self->{label} =~ s/(?{label} = "($self->{label})" if ($self->{label} =~ /[\*\+\-\/]/); my $mnemonic = $self->{opcode}->mnemonic(); ($self->{asterisk}) && ($sz="q") || ($mnemonic =~ /^v?mov([qd])$/) && ($sz=$1) || ($mnemonic =~ /^v?pinsr([qdwb])$/) && ($sz=$1) || ($mnemonic =~ /^vpbroadcast([qdwb])$/) && ($sz=$1) || ($mnemonic =~ /^v(?!perm)[a-z]+[fi]128$/) && ($sz="x"); $self->{opmask} =~ s/%(k[0-7])/$1/; if (defined($self->{index})) { sprintf "%s[%s%s*%d%s]%s",$szmap{$sz}, $self->{label}?"$self->{label}+":"", $self->{index},$self->{scale}, $self->{base}?"+$self->{base}":"", $self->{opmask}; } elsif ($self->{base} eq "rip") { sprintf "%s[%s]",$szmap{$sz},$self->{label}; } else { sprintf "%s[%s%s]%s", $szmap{$sz}, $self->{label}?"$self->{label}+":"", $self->{base},$self->{opmask}; } } } } { package register; # pick up registers, which start with %. 
sub re { my ($class, $line, $opcode) = @_; my $self = {}; my $ret; # optional * ----vvv--- appears in indirect jmp/call if ($$line =~ /^(\*?)%(\w+)((?:{[^}]+})*)/) { bless $self,$class; $self->{asterisk} = $1; $self->{value} = $2; $self->{opmask} = $3; $opcode->size($self->size()); $ret = $self; $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; } $ret; } sub size { my $self = shift; my $ret; if ($self->{value} =~ /^r[\d]+b$/i) { $ret="b"; } elsif ($self->{value} =~ /^r[\d]+w$/i) { $ret="w"; } elsif ($self->{value} =~ /^r[\d]+d$/i) { $ret="l"; } elsif ($self->{value} =~ /^r[\w]+$/i) { $ret="q"; } elsif ($self->{value} =~ /^[a-d][hl]$/i){ $ret="b"; } elsif ($self->{value} =~ /^[\w]{2}l$/i) { $ret="b"; } elsif ($self->{value} =~ /^[\w]{2}$/i) { $ret="w"; } elsif ($self->{value} =~ /^e[a-z]{2}$/i){ $ret="l"; } $ret; } sub out { my $self = shift; if ($gas) { sprintf "%s%%%s%s", $self->{asterisk}, $self->{value}, $self->{opmask}; } else { $self->{opmask} =~ s/%(k[0-7])/$1/; $self->{value}.$self->{opmask}; } } } { package label; # pick up labels, which end with : sub re { my ($class, $line) = @_; my $self = {}; my $ret; if ($$line =~ /(^[\.\w]+)\:/) { bless $self,$class; $self->{value} = $1; $ret = $self; $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; $self->{value} =~ s/^\.L/$decor/; } $ret; } sub out { my $self = shift; if ($gas) { my $func = ($globals{$self->{value}} or $self->{value}) . ":"; if ($win64 && $current_function->{name} eq $self->{value} && $current_function->{abi} eq "svr4") { $func .= "\n"; $func .= " movq %rdi,8(%rsp)\n"; $func .= " movq %rsi,16(%rsp)\n"; $func .= " movq %rsp,%rax\n"; $func .= "${decor}SEH_begin_$current_function->{name}:\n"; my $narg = $current_function->{narg}; $narg=6 if (!defined($narg)); $func .= " movq %rcx,%rdi\n" if ($narg>0); $func .= " movq %rdx,%rsi\n" if ($narg>1); $func .= " movq %r8,%rdx\n" if ($narg>2); $func .= " movq %r9,%rcx\n" if ($narg>3); $func .= " movq 40(%rsp),%r8\n" if ($narg>4); $func .= " movq 48(%rsp),%r9\n" if ($narg>5); } $func; } elsif ($self->{value} ne "$current_function->{name}") { # Make all labels in masm global. $self->{value} .= ":" if ($masm); $self->{value} . ":"; } elsif ($win64 && $current_function->{abi} eq "svr4") { my $func = "$current_function->{name}" . ($nasm ? ":" : "\tPROC $current_function->{scope}") . "\n"; $func .= " mov QWORD$PTR\[8+rsp\],rdi\t;WIN64 prologue\n"; $func .= " mov QWORD$PTR\[16+rsp\],rsi\n"; $func .= " mov rax,rsp\n"; $func .= "${decor}SEH_begin_$current_function->{name}:"; $func .= ":" if ($masm); $func .= "\n"; my $narg = $current_function->{narg}; $narg=6 if (!defined($narg)); $func .= " mov rdi,rcx\n" if ($narg>0); $func .= " mov rsi,rdx\n" if ($narg>1); $func .= " mov rdx,r8\n" if ($narg>2); $func .= " mov rcx,r9\n" if ($narg>3); $func .= " mov r8,QWORD$PTR\[40+rsp\]\n" if ($narg>4); $func .= " mov r9,QWORD$PTR\[48+rsp\]\n" if ($narg>5); $func .= "\n"; } else { "$current_function->{name}". ($nasm ? 
":" : "\tPROC $current_function->{scope}"); } } } { package expr; # pick up expressions sub re { my ($class, $line, $opcode) = @_; my $self = {}; my $ret; if ($$line =~ /(^[^,]+)/) { bless $self,$class; $self->{value} = $1; $ret = $self; $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; $self->{value} =~ s/\@PLT// if (!$elf); $self->{value} =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; $self->{value} =~ s/\.L/$decor/g; $self->{opcode} = $opcode; } $ret; } sub out { my $self = shift; if ($nasm && $self->{opcode}->mnemonic()=~m/^j(?![re]cxz)/) { "NEAR ".$self->{value}; } else { $self->{value}; } } } { package cfi_directive; # CFI directives annotate instructions that are significant for # stack unwinding procedure compliant with DWARF specification, # see http://dwarfstd.org/. Besides naturally expected for this # script platform-specific filtering function, this module adds # three auxiliary synthetic directives not recognized by [GNU] # assembler: # # - .cfi_push to annotate push instructions in prologue, which # translates to .cfi_adjust_cfa_offset (if needed) and # .cfi_offset; # - .cfi_pop to annotate pop instructions in epilogue, which # translates to .cfi_adjust_cfa_offset (if needed) and # .cfi_restore; # - [and most notably] .cfi_cfa_expression which encodes # DW_CFA_def_cfa_expression and passes it to .cfi_escape as # byte vector; # # CFA expressions were introduced in DWARF specification version # 3 and describe how to deduce CFA, Canonical Frame Address. This # becomes handy if your stack frame is variable and you can't # spare register for [previous] frame pointer. Suggested directive # syntax is made-up mix of DWARF operator suffixes [subset of] # and references to registers with optional bias. Following example # describes offloaded *original* stack pointer at specific offset # from *current* stack pointer: # # .cfi_cfa_expression %rsp+40,deref,+8 # # Final +8 has everything to do with the fact that CFA is defined # as reference to top of caller's stack, and on x86_64 call to # subroutine pushes 8-byte return address. In other words original # stack pointer upon entry to a subroutine is 8 bytes off from CFA. # Below constants are taken from "DWARF Expressions" section of the # DWARF specification, section is numbered 7.7 in versions 3 and 4. my %DW_OP_simple = ( # no-arg operators, mapped directly deref => 0x06, dup => 0x12, drop => 0x13, over => 0x14, pick => 0x15, swap => 0x16, rot => 0x17, xderef => 0x18, abs => 0x19, and => 0x1a, div => 0x1b, minus => 0x1c, mod => 0x1d, mul => 0x1e, neg => 0x1f, not => 0x20, or => 0x21, plus => 0x22, shl => 0x24, shr => 0x25, shra => 0x26, xor => 0x27, ); my %DW_OP_complex = ( # used in specific subroutines constu => 0x10, # uleb128 consts => 0x11, # sleb128 plus_uconst => 0x23, # uleb128 lit0 => 0x30, # add 0-31 to opcode reg0 => 0x50, # add 0-31 to opcode breg0 => 0x70, # add 0-31 to opcole, sleb128 regx => 0x90, # uleb28 fbreg => 0x91, # sleb128 bregx => 0x92, # uleb128, sleb128 piece => 0x93, # uleb128 ); # Following constants are defined in x86_64 ABI supplement, for # example available at https://www.uclibc.org/docs/psABI-x86_64.pdf, # see section 3.7 "Stack Unwind Algorithm". 
my %DW_reg_idx = ( "%rax"=>0, "%rdx"=>1, "%rcx"=>2, "%rbx"=>3, "%rsi"=>4, "%rdi"=>5, "%rbp"=>6, "%rsp"=>7, "%r8" =>8, "%r9" =>9, "%r10"=>10, "%r11"=>11, "%r12"=>12, "%r13"=>13, "%r14"=>14, "%r15"=>15 ); my ($cfa_reg, $cfa_rsp); my @cfa_stack; # [us]leb128 format is variable-length integer representation base # 2^128, with most significant bit of each byte being 0 denoting # *last* most significant digit. See "Variable Length Data" in the # DWARF specification, numbered 7.6 at least in versions 3 and 4. sub sleb128 { use integer; # get right shift extend sign my $val = shift; my $sign = ($val < 0) ? -1 : 0; my @ret = (); while(1) { push @ret, $val&0x7f; # see if remaining bits are same and equal to most # significant bit of the current digit, if so, it's # last digit... last if (($val>>6) == $sign); @ret[-1] |= 0x80; $val >>= 7; } return @ret; } sub uleb128 { my $val = shift; my @ret = (); while(1) { push @ret, $val&0x7f; # see if it's last significant digit... last if (($val >>= 7) == 0); @ret[-1] |= 0x80; } return @ret; } sub const { my $val = shift; if ($val >= 0 && $val < 32) { return ($DW_OP_complex{lit0}+$val); } return ($DW_OP_complex{consts}, sleb128($val)); } sub reg { my $val = shift; return if ($val !~ m/^(%r\w+)(?:([\+\-])((?:0x)?[0-9a-f]+))?/); my $reg = $DW_reg_idx{$1}; my $off = eval ("0 $2 $3"); return (($DW_OP_complex{breg0} + $reg), sleb128($off)); # Yes, we use DW_OP_bregX+0 to push register value and not # DW_OP_regX, because latter would require even DW_OP_piece, # which would be a waste under the circumstances. If you have # to use DWP_OP_reg, use "regx:N"... } sub cfa_expression { my $line = shift; my @ret; foreach my $token (split(/,\s*/,$line)) { if ($token =~ /^%r/) { push @ret,reg($token); } elsif ($token =~ /((?:0x)?[0-9a-f]+)\((%r\w+)\)/) { push @ret,reg("$2+$1"); } elsif ($token =~ /(\w+):(\-?(?:0x)?[0-9a-f]+)(U?)/i) { my $i = 1*eval($2); push @ret,$DW_OP_complex{$1}, ($3 ? uleb128($i) : sleb128($i)); } elsif (my $i = 1*eval($token) or $token eq "0") { if ($token =~ /^\+/) { push @ret,$DW_OP_complex{plus_uconst},uleb128($i); } else { push @ret,const($i); } } else { push @ret,$DW_OP_simple{$token}; } } # Finally we return DW_CFA_def_cfa_expression, 15, followed by # length of the expression and of course the expression itself. return (15,scalar(@ret),@ret); } sub re { my ($class, $line) = @_; my $self = {}; my $ret; if ($$line =~ s/^\s*\.cfi_(\w+)\s*//) { bless $self,$class; $ret = $self; undef $self->{value}; my $dir = $1; SWITCH: for ($dir) { # What is $cfa_rsp? Effectively it's difference between %rsp # value and current CFA, Canonical Frame Address, which is # why it starts with -8. Recall that CFA is top of caller's # stack... 
/startproc/ && do { ($cfa_reg, $cfa_rsp) = ("%rsp", -8); last; }; /endproc/ && do { ($cfa_reg, $cfa_rsp) = ("%rsp", 0); last; }; /def_cfa_register/ && do { $cfa_reg = $$line; last; }; /def_cfa_offset/ && do { $cfa_rsp = -1*eval($$line) if ($cfa_reg eq "%rsp"); last; }; /adjust_cfa_offset/ && do { $cfa_rsp -= 1*eval($$line) if ($cfa_reg eq "%rsp"); last; }; /def_cfa/ && do { if ($$line =~ /(%r\w+)\s*,\s*(.+)/) { $cfa_reg = $1; $cfa_rsp = -1*eval($2) if ($cfa_reg eq "%rsp"); } last; }; /push/ && do { $dir = undef; $cfa_rsp -= 8; if ($cfa_reg eq "%rsp") { $self->{value} = ".cfi_adjust_cfa_offset\t8\n"; } $self->{value} .= ".cfi_offset\t$$line,$cfa_rsp"; last; }; /pop/ && do { $dir = undef; $cfa_rsp += 8; if ($cfa_reg eq "%rsp") { $self->{value} = ".cfi_adjust_cfa_offset\t-8\n"; } $self->{value} .= ".cfi_restore\t$$line"; last; }; /cfa_expression/ && do { $dir = undef; $self->{value} = ".cfi_escape\t" . join(",", map(sprintf("0x%02x", $_), cfa_expression($$line))); last; }; /remember_state/ && do { push @cfa_stack, [$cfa_reg, $cfa_rsp]; last; }; /restore_state/ && do { ($cfa_reg, $cfa_rsp) = @{pop @cfa_stack}; last; }; } $self->{value} = ".cfi_$dir\t$$line" if ($dir); $$line = ""; } return $ret; } sub out { my $self = shift; return ($elf ? $self->{value} : undef); } } { package seh_directive; # This implements directives, like MASM, gas, and clang-assembler for # specifying Windows unwind codes. See # https://learn.microsoft.com/en-us/cpp/build/exception-handling-x64?view=msvc-170 # for details on the Windows unwind mechanism. As perlasm generally uses gas # syntax, the syntax is patterned after the gas spelling, described in # https://sourceware.org/legacy-ml/binutils/2009-08/msg00193.html # # TODO(https://crbug.com/boringssl/571): Translate to the MASM directives # when using the MASM output. Emit as-is when using "mingw64" output, which # is Windows with gas syntax. # # TODO(https://crbug.com/boringssl/259): For now, SEH directives are ignored # on non-Windows platforms. This means functions need to specify both CFI # and SEH directives, often redundantly. Ideally we'd abstract between the # two. E.g., we can synthesize CFI from SEH prologues, but SEH does not # annotate epilogs, so we'd need to combine parts from both. Or we can # restrict ourselves to a subset of CFI and synthesize SEH from CFI. # # Additionally, this only supports @abi-omnipotent functions. It is # incompatible with the automatic calling convention conversion. The main # complication is the current scheme modifies RDI and RSI (non-volatile on # Windows) at the start of the function, and saves them in the parameter # stack area. This can be expressed with .seh_savereg, but .seh_savereg is # only usable late in the prologue. However, unwind information gives enough # information to locate the parameter stack area at any point in the # function, so we can defer conversion or implement other schemes. my $UWOP_PUSH_NONVOL = 0; my $UWOP_ALLOC_LARGE = 1; my $UWOP_ALLOC_SMALL = 2; my $UWOP_SET_FPREG = 3; my $UWOP_SAVE_NONVOL = 4; my $UWOP_SAVE_NONVOL_FAR = 5; my $UWOP_SAVE_XMM128 = 8; my $UWOP_SAVE_XMM128_FAR = 9; my %UWOP_REG_TO_NUMBER = ("%rax" => 0, "%rcx" => 1, "%rdx" => 2, "%rbx" => 3, "%rsp" => 4, "%rbp" => 5, "%rsi" => 6, "%rdi" => 7, map(("%r$_" => $_), (8..15))); my %UWOP_NUMBER_TO_REG = reverse %UWOP_REG_TO_NUMBER; # The contents of the pdata and xdata sections so far. 
my ($xdata, $pdata) = ("", ""); my %info; my $next_label = 0; my $current_label_func = ""; # _new_unwind_label allocates a new label, unique to the file. sub _new_unwind_label { my ($name) = (@_); # Labels only need to be unique, but to make diffs easier to read, scope # them all under the current function. my $func = $current_function->{name}; if ($func ne $current_label_func) { $current_label_func = $func; $next_label = 0; } my $num = $next_label++; return ".LSEH_${name}_${func}_${num}"; } sub _check_in_proc { die "Missing .seh_startproc directive" unless %info; } sub _check_in_prologue { _check_in_proc(); die "Invalid SEH directive after .seh_endprologue" if defined($info{endprologue}); } sub _check_not_in_proc { die "Missing .seh_endproc directive" if %info; } sub _startproc { _check_not_in_proc(); if ($current_function->{abi} eq "svr4") { die "SEH directives can only be used with \@abi-omnipotent"; } my $info_label = _new_unwind_label("info"); my $start_label = _new_unwind_label("begin"); %info = ( # info_label is the label of the function's entry in .xdata. info_label => $info_label, # start_label is the start of the function. start_label => $start_label, # endprologue is the label of the end of the prologue. endprologue => undef, # unwind_codes contains the textual representation of the # unwind codes in the function so far. unwind_codes => "", # num_codes is the number of 16-bit words in unwind_codes. num_codes => 0, # frame_reg is the number of the frame register, or zero if # there is none. frame_reg => 0, # frame_offset is the offset into the fixed part of the stack that # the frame register points into. frame_offset => 0, # has_offset is whether directives taking an offset have # been used. This is used to check that such directives # come after the fixed portion of the stack frame is established. has_offset => 0, # has_nonpushreg is whether directives other than # .seh_pushreg have been used. This is used to check that # .seh_pushreg directives are first. has_nonpushreg => 0, ); return $start_label; } sub _add_unwind_code { my ($op, $value, @extra) = @_; _check_in_prologue(); if ($op != $UWOP_PUSH_NONVOL) { $info{has_nonpushreg} = 1; } elsif ($info{has_nonpushreg}) { die ".seh_pushreg directives must appear first in the prologue"; } my $label = _new_unwind_label("prologue"); # Encode an UNWIND_CODE structure. See # https://learn.microsoft.com/en-us/cpp/build/exception-handling-x64?view=msvc-170#struct-unwind_code my $encoded = $op | ($value << 4); my $codes = <<____; .byte $label-$info{start_label} .byte $encoded ____ # Some opcodes need additional values to encode themselves. foreach (@extra) { $codes .= "\t.value\t$_\n"; } $info{num_codes} += 1 + scalar(@extra); # Unwind codes are listed in reverse order. $info{unwind_codes} = $codes . $info{unwind_codes}; return $label; } sub _updating_fixed_allocation { _check_in_prologue(); if ($info{frame_reg} != 0) { # Windows documentation does not explicitly forbid .seh_stackalloc # after .seh_setframe, but it appears to have no effect. Offsets are # still relative to the fixed allocation when the frame register was # established. die "fixed allocation may not be increased after .seh_setframe"; } if ($info{has_offset}) { # Windows documentation does not explicitly forbid .seh_savereg # before .seh_stackalloc, but it does not work very well. Offsets # are relative to the top of the final fixed allocation, not where # RSP currently is. 
die "directives with an offset must come after the fixed allocation is established."; } } sub _endproc { _check_in_proc(); if (!defined($info{endprologue})) { die "Missing .seh_endprologue"; } my $end_label = _new_unwind_label("end"); # Encode a RUNTIME_FUNCTION. See # https://learn.microsoft.com/en-us/cpp/build/exception-handling-x64?view=msvc-170#struct-runtime_function $pdata .= <<____; .rva $info{start_label} .rva $end_label .rva $info{info_label} ____ # Encode an UNWIND_INFO. See # https://learn.microsoft.com/en-us/cpp/build/exception-handling-x64?view=msvc-170#struct-unwind_info my $frame_encoded = $info{frame_reg} | (($info{frame_offset} / 16) << 4); $xdata .= <<____; $info{info_label}: .byte 1 # version 1, no flags .byte $info{endprologue}-$info{start_label} .byte $info{num_codes} .byte $frame_encoded $info{unwind_codes} ____ # UNWIND_INFOs must be 4-byte aligned. If needed, we must add an extra # unwind code. This does not change the unwind code count. Windows # documentation says "For alignment purposes, this array always has an # even number of entries, and the final entry is potentially unused. In # that case, the array is one longer than indicated by the count of # unwind codes field." if ($info{num_codes} & 1) { $xdata .= "\t.value\t0\n"; } %info = (); return $end_label; } sub re { my ($class, $line) = @_; if ($$line =~ s/^\s*\.seh_(\w+)\s*//) { my $dir = $1; if (!$win64) { $$line = ""; return; } my $label; SWITCH: for ($dir) { /^startproc$/ && do { $label = _startproc($1); last; }; /^pushreg$/ && do { $$line =~ /^(%\w+)\s*$/ or die "could not parse .seh_$dir"; my $reg_num = $UWOP_REG_TO_NUMBER{$1} or die "unknown register $1"; _updating_fixed_allocation(); $label = _add_unwind_code($UWOP_PUSH_NONVOL, $reg_num); last; }; /^stackalloc$/ && do { my $num = eval($$line); if ($num <= 0 || $num % 8 != 0) { die "invalid stack allocation: $num"; } _updating_fixed_allocation(); if ($num <= 128) { $label = _add_unwind_code($UWOP_ALLOC_SMALL, ($num - 8) / 8); } elsif ($num < 512 * 1024) { $label = _add_unwind_code($UWOP_ALLOC_LARGE, 0, $num / 8); } elsif ($num < 4 * 1024 * 1024 * 1024) { $label = _add_unwind_code($UWOP_ALLOC_LARGE, 1, $num >> 16, $num & 0xffff); } else { die "stack allocation too large: $num" } last; }; /^setframe$/ && do { if ($info{frame_reg} != 0) { die "duplicate .seh_setframe directive"; } if ($info{has_offset}) { die "directives with with an offset must come after .seh_setframe."; } $$line =~ /(%\w+)\s*,\s*(.+)/ or die "could not parse .seh_$dir"; my $reg_num = $UWOP_REG_TO_NUMBER{$1} or die "unknown register $1"; my $offset = eval($2); if ($offset < 0 || $offset % 16 != 0 || $offset > 240) { die "invalid offset: $offset"; } $info{frame_reg} = $reg_num; $info{frame_offset} = $offset; $label = _add_unwind_code($UWOP_SET_FPREG, 0); last; }; /^savereg$/ && do { $$line =~ /(%\w+)\s*,\s*(.+)/ or die "could not parse .seh_$dir"; my $reg_num = $UWOP_REG_TO_NUMBER{$1} or die "unknown register $1"; my $offset = eval($2); if ($offset < 0 || $offset % 8 != 0) { die "invalid offset: $offset"; } if ($offset < 8 * 65536) { $label = _add_unwind_code($UWOP_SAVE_NONVOL, $reg_num, $offset / 8); } else { $label = _add_unwind_code($UWOP_SAVE_NONVOL_FAR, $reg_num, $offset >> 16, $offset & 0xffff); } $info{has_offset} = 1; last; }; /^savexmm$/ && do { $$line =~ /%xmm(\d+)\s*,\s*(.+)/ or die "could not parse .seh_$dir"; my $reg_num = $1; my $offset = eval($2); if ($offset < 0 || $offset % 16 != 0) { die "invalid offset: $offset"; } if ($offset < 16 * 65536) { $label = 
_add_unwind_code($UWOP_SAVE_XMM128, $reg_num, $offset / 16); } else { $label = _add_unwind_code($UWOP_SAVE_XMM128_FAR, $reg_num, $offset >> 16, $offset & 0xffff); } $info{has_offset} = 1; last; }; /^endprologue$/ && do { _check_in_prologue(); if ($info{num_codes} == 0) { # If a Windows function has no directives (i.e. it # doesn't touch the stack), it is a leaf function and is # not expected to appear in .pdata or .xdata. die ".seh_endprologue found with no unwind codes"; } $label = _new_unwind_label("endprologue"); $info{endprologue} = $label; last; }; /^endproc$/ && do { $label = _endproc(); last; }; die "unknown SEH directive .seh_$dir"; } # All SEH directives compile to labels inline. The other data is # emitted later. $$line = ""; $label .= ":"; return label->re(\$label); } } sub pdata_and_xdata { return "" unless $win64; my $ret = ""; if ($pdata ne "") { $ret .= <<____; .section .pdata .align 4 $pdata ____ } if ($xdata ne "") { $ret .= <<____; .section .xdata .align 4 $xdata ____ } return $ret; } } { package directive; # pick up directives, which start with . my %sections; sub nasm_section { my ($name, $qualifiers) = @_; my $ret = "section\t$name"; if (exists $sections{$name}) { # Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392701. Only # emit section qualifiers the first time a section is referenced. # For all subsequent references, require the qualifiers match and # omit them. # # See also https://crbug.com/1422018 and b/270643835. my $old = $sections{$name}; die "Inconsistent qualifiers: $qualifiers vs $old" if ($qualifiers ne "" && $qualifiers ne $old); } else { $sections{$name} = $qualifiers; if ($qualifiers ne "") { $ret .= " $qualifiers"; } } return $ret; } sub re { my ($class, $line) = @_; my $self = {}; my $ret; my $dir; # chain-call to cfi_directive and seh_directive. $ret = cfi_directive->re($line) and return $ret; $ret = seh_directive->re($line) and return $ret; if ($$line =~ /^\s*(\.\w+)/) { bless $self,$class; $dir = $1; $ret = $self; undef $self->{value}; $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; SWITCH: for ($dir) { /\.global|\.globl|\.extern/ && do { $globals{$$line} = $prefix . $$line; $$line = $globals{$$line} if ($prefix); last; }; /\.type/ && do { my ($sym,$type,$narg) = split(/\s*,\s*/,$$line); if ($type eq "\@function") { undef $current_function; $current_function->{name} = $sym; $current_function->{abi} = "svr4"; $current_function->{narg} = $narg; $current_function->{scope} = defined($globals{$sym})?"PUBLIC":"PRIVATE"; } elsif ($type eq "\@abi-omnipotent") { undef $current_function; $current_function->{name} = $sym; $current_function->{scope} = defined($globals{$sym})?"PUBLIC":"PRIVATE"; } $$line =~ s/\@abi\-omnipotent/\@function/; $$line =~ s/\@function.*/\@function/; last; }; /\.asciz/ && do { if ($$line =~ /^"(.*)"$/) { $dir = ".byte"; $$line = join(",",unpack("C*",$1),0); } last; }; /\.rva|\.long|\.quad|\.byte/ && do { $$line =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; $$line =~ s/\.L/$decor/g; last; }; } if ($gas) { $self->{value} = $dir . "\t" . $$line; if ($dir =~ /\.extern/) { if ($flavour eq "elf") { $self->{value} .= "\n.hidden $$line"; } else { $self->{value} = ""; } } elsif (!$elf && $dir =~ /\.type/) { $self->{value} = ""; $self->{value} = ".def\t" . ($globals{$1} or $1) . ";\t" . (defined($globals{$1})?".scl 2;":".scl 3;") . 
"\t.type 32;\t.endef" if ($win64 && $$line =~ /([^,]+),\@function/); } elsif (!$elf && $dir =~ /\.size/) { $self->{value} = ""; if (defined($current_function)) { $self->{value} .= "${decor}SEH_end_$current_function->{name}:" if ($win64 && $current_function->{abi} eq "svr4"); undef $current_function; } } elsif (!$elf && $dir =~ /\.align/) { $self->{value} = ".p2align\t" . (log($$line)/log(2)); } elsif ($dir eq ".section") { $current_segment=$$line; if (!$elf && $current_segment eq ".rodata") { if ($flavour eq "macosx") { $self->{value} = ".section\t__DATA,__const"; } } if (!$elf && $current_segment eq ".init") { if ($flavour eq "macosx") { $self->{value} = ".mod_init_func"; } elsif ($flavour eq "mingw64") { $self->{value} = ".section\t.ctors"; } } } elsif ($dir =~ /\.(text|data)/) { $current_segment=".$1"; } elsif ($dir =~ /\.global|\.globl|\.extern/) { if ($flavour eq "macosx") { $self->{value} .= "\n.private_extern $$line"; } else { $self->{value} .= "\n.hidden $$line"; } } elsif ($dir =~ /\.hidden/) { if ($flavour eq "macosx") { $self->{value} = ".private_extern\t$prefix$$line"; } elsif ($flavour eq "mingw64") { $self->{value} = ""; } } elsif ($dir =~ /\.comm/) { $self->{value} = "$dir\t$prefix$$line"; $self->{value} =~ s|,([0-9]+),([0-9]+)$|",$1,".log($2)/log(2)|e if ($flavour eq "macosx"); } $$line = ""; return $self; } # non-gas case or nasm/masm SWITCH: for ($dir) { /\.text/ && do { my $v=undef; if ($nasm) { $v=nasm_section(".text", "code align=64")."\n"; } else { $v="$current_segment\tENDS\n" if ($current_segment); $current_segment = ".text\$"; $v.="$current_segment\tSEGMENT "; $v.=$masm>=$masmref ? "ALIGN(256)" : "PAGE"; $v.=" 'CODE'"; } $self->{value} = $v; last; }; /\.data/ && do { my $v=undef; if ($nasm) { $v=nasm_section(".data", "data align=8")."\n"; } else { $v="$current_segment\tENDS\n" if ($current_segment); $current_segment = "_DATA"; $v.="$current_segment\tSEGMENT"; } $self->{value} = $v; last; }; /\.section/ && do { my $v=undef; $$line =~ s/([^,]*).*/$1/; $$line = ".CRT\$XCU" if ($$line eq ".init"); $$line = ".rdata" if ($$line eq ".rodata"); if ($nasm) { my $qualifiers = ""; if ($$line=~/\.([prx])data/) { $qualifiers = "rdata align="; $qualifiers .= $1 eq "p"? 4 : 8; } elsif ($$line=~/\.CRT\$/i) { $qualifiers = "rdata align=8"; } $v = nasm_section($$line, $qualifiers); } else { $v="$current_segment\tENDS\n" if ($current_segment); $v.="$$line\tSEGMENT"; if ($$line=~/\.([prx])data/) { $v.=" READONLY"; $v.=" ALIGN(".($1 eq "p" ? 4 : 8).")" if ($masm>=$masmref); } elsif ($$line=~/\.CRT\$/i) { $v.=" READONLY "; $v.=$masm>=$masmref ? "ALIGN(8)" : "DWORD"; } } $current_segment = $$line; $self->{value} = $v; last; }; /\.extern/ && do { $self->{value} = "EXTERN\t".$$line; $self->{value} .= ":NEAR" if ($masm); last; }; /\.globl|.global/ && do { $self->{value} = $masm?"PUBLIC":"global"; $self->{value} .= "\t".$$line; last; }; /\.size/ && do { if (defined($current_function)) { undef $self->{value}; if ($current_function->{abi} eq "svr4") { $self->{value}="${decor}SEH_end_$current_function->{name}:"; $self->{value}.=":\n" if($masm); } $self->{value}.="$current_function->{name}\tENDP" if($masm && $current_function->{name}); undef $current_function; } last; }; /\.align/ && do { my $max = ($masm && $masm>=$masmref) ? 
256 : 4096; $self->{value} = "ALIGN\t".($$line>$max?$max:$$line); last; }; /\.(value|long|rva|quad)/ && do { my $sz = substr($1,0,1); my @arr = split(/,\s*/,$$line); my $last = pop(@arr); my $conv = sub { my $var=shift; $var=~s/^(0b[0-1]+)/oct($1)/eig; $var=~s/^0x([0-9a-f]+)/0$1h/ig if ($masm); if ($sz eq "D" && ($current_segment=~/.[px]data/ || $dir eq ".rva")) { $var=~s/^([_a-z\$\@][_a-z0-9\$\@]*)/$nasm?"$1 wrt ..imagebase":"imagerel $1"/egi; } $var; }; $sz =~ tr/bvlrq/BWDDQ/; $self->{value} = "\tD$sz\t"; for (@arr) { $self->{value} .= &$conv($_).","; } $self->{value} .= &$conv($last); last; }; /\.byte/ && do { my @str=split(/,\s*/,$$line); map(s/(0b[0-1]+)/oct($1)/eig,@str); map(s/0x([0-9a-f]+)/0$1h/ig,@str) if ($masm); while ($#str>15) { $self->{value}.="\tDB\t" .join(",",@str[0..15])."\n"; foreach (0..15) { shift @str; } } $self->{value}.="\tDB\t" .join(",",@str) if (@str); last; }; /\.comm/ && do { my @str=split(/,\s*/,$$line); my $v=undef; if ($nasm) { $v.="common $prefix@str[0] @str[1]"; } else { $v="$current_segment\tENDS\n" if ($current_segment); $current_segment = "_DATA"; $v.="$current_segment\tSEGMENT\n"; $v.="COMM @str[0]:DWORD:".@str[1]/4; } $self->{value} = $v; last; }; } $$line = ""; } $ret; } sub out { my $self = shift; $self->{value}; } } # Upon initial x86_64 introduction SSE>2 extensions were not introduced # yet. In order not to be bothered by tracing exact assembler versions, # but at the same time to provide a bare security minimum of AES-NI, we # hard-code some instructions. Extensions past AES-NI on the other hand # are traced by examining assembler version in individual perlasm # modules... my %regrm = ( "%eax"=>0, "%ecx"=>1, "%edx"=>2, "%ebx"=>3, "%esp"=>4, "%ebp"=>5, "%esi"=>6, "%edi"=>7 ); sub rex { my $opcode=shift; my ($dst,$src,$rex)=@_; $rex|=0x04 if($dst>=8); $rex|=0x01 if($src>=8); push @$opcode,($rex|0x40) if ($rex); } my $movq = sub { # elderly gas can't handle inter-register movq my $arg = shift; my @opcode=(0x66); if ($arg =~ /%xmm([0-9]+),\s*%r(\w+)/) { my ($src,$dst)=($1,$2); if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } rex(\@opcode,$src,$dst,0x8); push @opcode,0x0f,0x7e; push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M @opcode; } elsif ($arg =~ /%r(\w+),\s*%xmm([0-9]+)/) { my ($src,$dst)=($2,$1); if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } rex(\@opcode,$src,$dst,0x8); push @opcode,0x0f,0x6e; push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M @opcode; } else { (); } }; my $pextrd = sub { if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*(%\w+)/) { my @opcode=(0x66); my $imm=$1; my $src=$2; my $dst=$3; if ($dst =~ /%r([0-9]+)d/) { $dst = $1; } elsif ($dst =~ /%e/) { $dst = $regrm{$dst}; } rex(\@opcode,$src,$dst); push @opcode,0x0f,0x3a,0x16; push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M push @opcode,$imm; @opcode; } else { (); } }; my $pinsrd = sub { if (shift =~ /\$([0-9]+),\s*(%\w+),\s*%xmm([0-9]+)/) { my @opcode=(0x66); my $imm=$1; my $src=$2; my $dst=$3; if ($src =~ /%r([0-9]+)/) { $src = $1; } elsif ($src =~ /%e/) { $src = $regrm{$src}; } rex(\@opcode,$dst,$src); push @opcode,0x0f,0x3a,0x22; push @opcode,0xc0|(($dst&7)<<3)|($src&7); # ModR/M push @opcode,$imm; @opcode; } else { (); } }; my $pshufb = sub { if (shift =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { my @opcode=(0x66); rex(\@opcode,$2,$1); push @opcode,0x0f,0x38,0x00; push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M @opcode; } else { (); } }; my $palignr = sub { if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { my @opcode=(0x66); rex(\@opcode,$3,$2); push 
@opcode,0x0f,0x3a,0x0f; push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M push @opcode,$1; @opcode; } else { (); } }; my $pclmulqdq = sub { if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { my @opcode=(0x66); rex(\@opcode,$3,$2); push @opcode,0x0f,0x3a,0x44; push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M my $c=$1; push @opcode,$c=~/^0/?oct($c):$c; @opcode; } else { (); } }; my $rdrand = sub { if (shift =~ /%[er](\w+)/) { my @opcode=(); my $dst=$1; if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } rex(\@opcode,0,$dst,8); push @opcode,0x0f,0xc7,0xf0|($dst&7); @opcode; } else { (); } }; my $rdseed = sub { if (shift =~ /%[er](\w+)/) { my @opcode=(); my $dst=$1; if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } rex(\@opcode,0,$dst,8); push @opcode,0x0f,0xc7,0xf8|($dst&7); @opcode; } else { (); } }; # Not all AVX-capable assemblers recognize AMD XOP extension. Since we # are using only two instructions hand-code them in order to be excused # from chasing assembler versions... sub rxb { my $opcode=shift; my ($dst,$src1,$src2,$rxb)=@_; $rxb|=0x7<<5; $rxb&=~(0x04<<5) if($dst>=8); $rxb&=~(0x01<<5) if($src1>=8); $rxb&=~(0x02<<5) if($src2>=8); push @$opcode,$rxb; } my $vprotd = sub { if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { my @opcode=(0x8f); rxb(\@opcode,$3,$2,-1,0x08); push @opcode,0x78,0xc2; push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M my $c=$1; push @opcode,$c=~/^0/?oct($c):$c; @opcode; } else { (); } }; my $vprotq = sub { if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { my @opcode=(0x8f); rxb(\@opcode,$3,$2,-1,0x08); push @opcode,0x78,0xc3; push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M my $c=$1; push @opcode,$c=~/^0/?oct($c):$c; @opcode; } else { (); } }; # Intel Control-flow Enforcement Technology extension. All functions and # indirect branch targets will have to start with this instruction... my $endbranch = sub { (0xf3,0x0f,0x1e,0xfa); }; ######################################################################## { my $comment = "//"; $comment = ";" if ($masm || $nasm); print <<___; $comment This file is generated from a similarly-named Perl script in the BoringSSL $comment source tree. Do not edit by hand. ___ } if ($nasm) { die "unknown target" unless ($win64); print <<___; \%ifidn __OUTPUT_FORMAT__, win64 default rel \%define XMMWORD \%define YMMWORD \%define ZMMWORD \%define _CET_ENDBR \%include "ring_core_generated/prefix_symbols_nasm.inc" ___ } elsif ($masm) { print <<___; OPTION DOTNAME ___ } if ($gas) { my $target; if ($elf) { # The "elf" target is really ELF with SysV ABI, but every ELF platform # uses the SysV ABI. $target = "defined(__ELF__)"; } elsif ($apple) { $target = "defined(__APPLE__)"; } else { die "unknown target: $flavour"; } print <<___; #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && $target ___ } sub process_line { my $line = shift; $line =~ s|\R$||; # Better chomp if ($nasm) { $line =~ s|^#ifdef |%ifdef |; $line =~ s|^#ifndef |%ifndef |; $line =~ s|^#endif|%endif|; $line =~ s|[#!].*$||; # get rid of asm-style comments... } else { # Get rid of asm-style comments but not preprocessor directives. The # former are identified by having a letter after the '#' and starting in # the first column. $line =~ s|!.*$||; $line =~ s|(?<=.)#.*$||; $line =~ s|^#([^a-z].*)?$||; } $line =~ s|/\*.*\*/||; # ... and C-style comments... $line =~ s|^\s+||; # ... and skip white spaces in beginning $line =~ s|\s+$||; # ... 
and at the end if (my $label=label->re(\$line)) { print $label->out(); } if (my $directive=directive->re(\$line)) { printf "%s",$directive->out(); } elsif (my $opcode=opcode->re(\$line)) { my $asm = eval("\$".$opcode->mnemonic()); if ((ref($asm) eq 'CODE') && scalar(my @bytes=&$asm($line))) { print $gas?".byte\t":"DB\t",join(',',@bytes),"\n"; next; } my @args; ARGUMENT: while (1) { my $arg; ($arg=register->re(\$line, $opcode))|| ($arg=const->re(\$line)) || ($arg=ea->re(\$line, $opcode)) || ($arg=expr->re(\$line, $opcode)) || last ARGUMENT; push @args,$arg; last ARGUMENT if ($line !~ /^,/); $line =~ s/^,\s*//; } # ARGUMENT: if ($#args>=0) { my $insn; my $sz=$opcode->size(); if ($gas) { $insn = $opcode->out($#args>=1?$args[$#args]->size():$sz); @args = map($_->out($sz),@args); printf "\t%s\t%s",$insn,join(",",@args); } else { $insn = $opcode->out(); foreach (@args) { my $arg = $_->out(); # $insn.=$sz compensates for movq, pinsrw, ... if ($arg =~ /^xmm[0-9]+$/) { $insn.=$sz; $sz="x" if(!$sz); last; } if ($arg =~ /^ymm[0-9]+$/) { $insn.=$sz; $sz="y" if(!$sz); last; } if ($arg =~ /^zmm[0-9]+$/) { $insn.=$sz; $sz="z" if(!$sz); last; } if ($arg =~ /^mm[0-9]+$/) { $insn.=$sz; $sz="q" if(!$sz); last; } } @args = reverse(@args); undef $sz if ($nasm && $opcode->mnemonic() eq "lea"); printf "\t%s\t%s",$insn,join(",",map($_->out($sz),@args)); } } else { printf "\t%s",$opcode->out(); } } print $line,"\n"; } while(defined(my $line=<>)) { process_line($line); } foreach my $line (split(/\n/, seh_directive->pdata_and_xdata())) { process_line($line); } print "\n$current_segment\tENDS\n" if ($current_segment && $masm); if ($masm) { print "END\n"; } elsif ($gas) { print "#endif\n"; } elsif ($nasm) { print <<___; \%else ; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 ret \%endif ___ } else { die "unknown assembler"; } close STDOUT or die "error closing STDOUT: $!"; ################################################# # Cross-reference x86_64 ABI "card" # # Unix Win64 # %rax * * # %rbx - - # %rcx #4 #1 # %rdx #3 #2 # %rsi #2 - # %rdi #1 - # %rbp - - # %rsp - - # %r8 #5 #3 # %r9 #6 #4 # %r10 * * # %r11 * * # %r12 - - # %r13 - - # %r14 - - # %r15 - - # # (*) volatile register # (-) preserved by callee # (#) Nth argument, volatile # # In Unix terms top of stack is argument transfer area for arguments # which could not be accommodated in registers. Or in other words 7th # [integer] argument resides at 8(%rsp) upon function entry point. # 128 bytes above %rsp constitute a "red zone" which is not touched # by signal handlers and can be used as temporal storage without # allocating a frame. # # In Win64 terms N*8 bytes on top of stack is argument transfer area, # which belongs to/can be overwritten by callee. N is the number of # arguments passed to callee, *but* not less than 4! This means that # upon function entry point 5th argument resides at 40(%rsp), as well # as that 32 bytes from 8(%rsp) can always be used as temporal # storage [without allocating a frame]. One can actually argue that # one can assume a "red zone" above stack pointer under Win64 as well. # Point is that at apparently no occasion Windows kernel would alter # the area above user stack pointer in true asynchronous manner... # # All the above means that if assembler programmer adheres to Unix # register and stack layout, but disregards the "red zone" existence, # it's possible to use following prologue and epilogue to "gear" from # Unix to Win64 ABI in leaf functions with not more than 6 arguments. 
# # omnipotent_function: # ifdef WIN64 # movq %rdi,8(%rsp) # movq %rsi,16(%rsp) # movq %rcx,%rdi ; if 1st argument is actually present # movq %rdx,%rsi ; if 2nd argument is actually ... # movq %r8,%rdx ; if 3rd argument is ... # movq %r9,%rcx ; if 4th argument ... # movq 40(%rsp),%r8 ; if 5th ... # movq 48(%rsp),%r9 ; if 6th ... # endif # ... # ifdef WIN64 # movq 8(%rsp),%rdi # movq 16(%rsp),%rsi # endif # ret # ################################################# # Win64 SEH, Structured Exception Handling. # # Unlike on Unix systems(*) lack of Win64 stack unwinding information # has undesired side-effect at run-time: if an exception is raised in # assembler subroutine such as those in question (basically we're # referring to segmentation violations caused by malformed input # parameters), the application is briskly terminated without invoking # any exception handlers, most notably without generating memory dump # or any user notification whatsoever. This poses a problem. It's # possible to address it by registering custom language-specific # handler that would restore processor context to the state at # subroutine entry point and return "exception is not handled, keep # unwinding" code. Writing such handler can be a challenge... But it's # doable, though requires certain coding convention. Consider following # snippet: # # .type function,@function # function: # movq %rsp,%rax # copy rsp to volatile register # pushq %r15 # save non-volatile registers # pushq %rbx # pushq %rbp # movq %rsp,%r11 # subq %rdi,%r11 # prepare [variable] stack frame # andq $-64,%r11 # movq %rax,0(%r11) # check for exceptions # movq %r11,%rsp # allocate [variable] stack frame # movq %rax,0(%rsp) # save original rsp value # magic_point: # ... # movq 0(%rsp),%rcx # pull original rsp value # movq -24(%rcx),%rbp # restore non-volatile registers # movq -16(%rcx),%rbx # movq -8(%rcx),%r15 # movq %rcx,%rsp # restore original rsp # magic_epilogue: # ret # .size function,.-function # # The key is that up to magic_point copy of original rsp value remains # in chosen volatile register and no non-volatile register, except for # rsp, is modified. While past magic_point rsp remains constant till # the very end of the function. In this case custom language-specific # exception handler would look like this: # # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) # { ULONG64 *rsp = (ULONG64 *)context->Rax; # ULONG64 rip = context->Rip; # # if (rip >= magic_point) # { rsp = (ULONG64 *)context->Rsp; # if (rip < magic_epilogue) # { rsp = (ULONG64 *)rsp[0]; # context->Rbp = rsp[-3]; # context->Rbx = rsp[-2]; # context->R15 = rsp[-1]; # } # } # context->Rsp = (ULONG64)rsp; # context->Rdi = rsp[1]; # context->Rsi = rsp[2]; # # memcpy (disp->ContextRecord,context,sizeof(CONTEXT)); # RtlVirtualUnwind(UNW_FLAG_NHANDLER,disp->ImageBase, # dips->ControlPc,disp->FunctionEntry,disp->ContextRecord, # &disp->HandlerData,&disp->EstablisherFrame,NULL); # return ExceptionContinueSearch; # } # # It's appropriate to implement this handler in assembler, directly in # function's module. In order to do that one has to know members' # offsets in CONTEXT and DISPATCHER_CONTEXT structures and some constant # values. 
Here they are: # # CONTEXT.Rax 120 # CONTEXT.Rcx 128 # CONTEXT.Rdx 136 # CONTEXT.Rbx 144 # CONTEXT.Rsp 152 # CONTEXT.Rbp 160 # CONTEXT.Rsi 168 # CONTEXT.Rdi 176 # CONTEXT.R8 184 # CONTEXT.R9 192 # CONTEXT.R10 200 # CONTEXT.R11 208 # CONTEXT.R12 216 # CONTEXT.R13 224 # CONTEXT.R14 232 # CONTEXT.R15 240 # CONTEXT.Rip 248 # CONTEXT.Xmm6 512 # sizeof(CONTEXT) 1232 # DISPATCHER_CONTEXT.ControlPc 0 # DISPATCHER_CONTEXT.ImageBase 8 # DISPATCHER_CONTEXT.FunctionEntry 16 # DISPATCHER_CONTEXT.EstablisherFrame 24 # DISPATCHER_CONTEXT.TargetIp 32 # DISPATCHER_CONTEXT.ContextRecord 40 # DISPATCHER_CONTEXT.LanguageHandler 48 # DISPATCHER_CONTEXT.HandlerData 56 # UNW_FLAG_NHANDLER 0 # ExceptionContinueSearch 1 # # In order to tie the handler to the function one has to compose # couple of structures: one for .xdata segment and one for .pdata. # # UNWIND_INFO structure for .xdata segment would be # # function_unwind_info: # .byte 9,0,0,0 # .rva handler # # This structure designates exception handler for a function with # zero-length prologue, no stack frame or frame register. # # To facilitate composing of .pdata structures, auto-generated "gear" # prologue copies rsp value to rax and denotes next instruction with # .LSEH_begin_{function_name} label. This essentially defines the SEH # styling rule mentioned in the beginning. Position of this label is # chosen in such manner that possible exceptions raised in the "gear" # prologue would be accounted to caller and unwound from latter's frame. # End of function is marked with respective .LSEH_end_{function_name} # label. To summarize, .pdata segment would contain # # .rva .LSEH_begin_function # .rva .LSEH_end_function # .rva function_unwind_info # # Reference to function_unwind_info from .xdata segment is the anchor. # In case you wonder why references are 32-bit .rvas and not 64-bit # .quads. References put into these two segments are required to be # *relative* to the base address of the current binary module, a.k.a. # image base. No Win64 module, be it .exe or .dll, can be larger than # 2GB and thus such relative references can be and are accommodated in # 32 bits. # # Having reviewed the example function code, one can argue that "movq # %rsp,%rax" above is redundant. It is not! Keep in mind that on Unix # rax would contain an undefined value. If this "offends" you, use # another register and refrain from modifying rax till magic_point is # reached, i.e. as if it was a non-volatile register. If more registers # are required prior [variable] frame setup is completed, note that # nobody says that you can have only one "magic point." You can # "liberate" non-volatile registers by denoting last stack off-load # instruction and reflecting it in finer grade unwind logic in handler. # After all, isn't it why it's called *language-specific* handler... # # SE handlers are also involved in unwinding stack when executable is # profiled or debugged. Profiling implies additional limitations that # are too subtle to discuss here. For now it's sufficient to say that # in order to simplify handlers one should either a) offload original # %rsp to stack (like discussed above); or b) if you have a register to # spare for frame pointer, choose volatile one. # # (*) Note that we're talking about run-time, not debug-time. Lack of # unwind information makes debugging hard on both Windows and # Unix. "Unlike" refers to the fact that on Unix signal handler # will always be invoked, core dumped and appropriate exit code # returned to parent (for user notification). 
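The CFI and SEH plumbing in x86_64-xlate.pl ultimately reduces to byte-level encodings: DWARF LEB128 for the .cfi_* escapes (the sleb128/uleb128 subs) and 16-bit UNWIND_CODE slots for the .seh_* directives. The following standalone C sketch (illustrative only, not part of the crate) reproduces the two LEB128 encoders; it assumes an arithmetic right shift for negative operands, which is also what the Perl code relies on via "use integer".

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Encode |val| as unsigned LEB128; returns the number of bytes written. */
static size_t uleb128(uint64_t val, uint8_t out[10]) {
  size_t n = 0;
  do {
    uint8_t byte = val & 0x7f;
    val >>= 7;
    if (val != 0) {
      byte |= 0x80; /* more digits follow */
    }
    out[n++] = byte;
  } while (val != 0);
  return n;
}

/* Encode |val| as signed LEB128; returns the number of bytes written. */
static size_t sleb128(int64_t val, uint8_t out[10]) {
  size_t n = 0;
  for (;;) {
    uint8_t byte = val & 0x7f;
    val >>= 7; /* assumes arithmetic shift, as on mainstream compilers */
    int last = (val == 0 && (byte & 0x40) == 0) ||
               (val == -1 && (byte & 0x40) != 0);
    if (!last) {
      byte |= 0x80;
    }
    out[n++] = byte;
    if (last) {
      return n;
    }
  }
}

int main(void) {
  uint8_t buf[10];
  /* uleb128(300) == 0xac 0x02, the classic DWARF example. */
  assert(uleb128(300, buf) == 2 && buf[0] == 0xac && buf[1] == 0x02);
  /* sleb128(-8) == 0x78; -8 is the $cfa_rsp value the script starts with at
   * .cfi_startproc, one return-address slot below the CFA. */
  assert(sleb128(-8, buf) == 1 && buf[0] == 0x78);
  return 0;
}

The UNWIND_CODE half-words built by _add_unwind_code follow the same spirit: the first byte is the instruction's offset within the prologue, and the second is op | (info << 4), as documented in the Microsoft x64 exception-handling reference linked from the seh_directive package above.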
ring-0.17.14/crypto/perlasm/x86asm.pl000064400000000000000000000217031046102023000154060ustar 00000000000000#! /usr/bin/env perl # Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # require 'x86asm.pl'; # &asm_init([,$i386only]); # &function_begin("foo"); # ... # &function_end("foo"); # &asm_finish $out=(); $i386=0; # AUTOLOAD is this context has quite unpleasant side effect, namely # that typos in function calls effectively go to assembler output, # but on the pros side we don't have to implement one subroutine per # each opcode... sub ::AUTOLOAD { my $opcode = $AUTOLOAD; die "more than 4 arguments passed to $opcode" if ($#_>3); $opcode =~ s/.*:://; if ($opcode =~ /^push/) { $stack+=4; } elsif ($opcode =~ /^pop/) { $stack-=4; } &generic($opcode,@_) or die "undefined subroutine \&$AUTOLOAD"; } # record_function_hit(int) writes a byte with value one to the given offset of # |BORINGSSL_function_hit|, but only if BORINGSSL_DISPATCH_TEST is defined. # This is used in impl_dispatch_test.cc to test whether the expected assembly # functions are triggered by high-level API calls. sub ::record_function_hit { my($index)=@_; &preprocessor_ifdef("BORINGSSL_DISPATCH_TEST"); &push("ebx"); &push("edx"); &call(&label("pic_for_function_hit")); &set_label("pic_for_function_hit"); &blindpop("ebx"); &lea("ebx",&DWP("BORINGSSL_function_hit+$index"."-".&label("pic_for_function_hit"),"ebx")); &mov("edx", 1); &movb(&BP(0, "ebx"), "dl"); &pop("edx"); &pop("ebx"); &preprocessor_endif(); } sub ::emit { my $opcode=shift; if ($#_==-1) { push(@out,"\t$opcode\n"); } else { push(@out,"\t$opcode\t".join(',',@_)."\n"); } } sub ::LB { $_[0] =~ m/^e?([a-d])x$/o or die "$_[0] does not have a 'low byte'"; $1."l"; } sub ::HB { $_[0] =~ m/^e?([a-d])x$/o or die "$_[0] does not have a 'high byte'"; $1."h"; } sub ::stack_push{ my $num=$_[0]*4; $stack+=$num; &sub("esp",$num); } sub ::stack_pop { my $num=$_[0]*4; $stack-=$num; &add("esp",$num); } sub ::blindpop { &pop($_[0]); $stack+=4; } sub ::wparam { &DWP($stack+4*$_[0],"esp"); } sub ::swtmp { &DWP(4*$_[0],"esp"); } sub ::bswap { if ($i386) # emulate bswap for i386 { &comment("bswap @_"); &xchg(&HB(@_),&LB(@_)); &ror (@_,16); &xchg(&HB(@_),&LB(@_)); } else { &generic("bswap",@_); } } # These are made-up opcodes introduced over the years essentially # by ignorance, just alias them to real ones... 
sub ::movb { &mov(@_); } sub ::xorb { &xor(@_); } sub ::rotl { &rol(@_); } sub ::rotr { &ror(@_); } sub ::exch { &xchg(@_); } sub ::halt { &hlt; } sub ::movz { &movzx(@_); } sub ::pushf { &pushfd; } sub ::popf { &popfd; } # 3 argument instructions sub ::movq { my($p1,$p2,$optimize)=@_; if ($optimize && $p1=~/^mm[0-7]$/ && $p2=~/^mm[0-7]$/) # movq between mmx registers can sink Intel CPUs { &::pshufw($p1,$p2,0xe4); } else { &::generic("movq",@_); } } # SSE>2 instructions my %regrm = ( "eax"=>0, "ecx"=>1, "edx"=>2, "ebx"=>3, "esp"=>4, "ebp"=>5, "esi"=>6, "edi"=>7 ); sub ::pextrd { my($dst,$src,$imm)=@_; if ("$dst:$src" =~ /(e[a-dsd][ixp]):xmm([0-7])/) { &::data_byte(0x66,0x0f,0x3a,0x16,0xc0|($2<<3)|$regrm{$1},$imm); } else { &::generic("pextrd",@_); } } sub ::pinsrd { my($dst,$src,$imm)=@_; if ("$dst:$src" =~ /xmm([0-7]):(e[a-dsd][ixp])/) { &::data_byte(0x66,0x0f,0x3a,0x22,0xc0|($1<<3)|$regrm{$2},$imm); } else { &::generic("pinsrd",@_); } } sub ::pshufb { my($dst,$src)=@_; if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) { &data_byte(0x66,0x0f,0x38,0x00,0xc0|($1<<3)|$2); } else { &::generic("pshufb",@_); } } sub ::palignr { my($dst,$src,$imm)=@_; if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) { &::data_byte(0x66,0x0f,0x3a,0x0f,0xc0|($1<<3)|$2,$imm); } else { &::generic("palignr",@_); } } sub ::pclmulqdq { my($dst,$src,$imm)=@_; if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) { &::data_byte(0x66,0x0f,0x3a,0x44,0xc0|($1<<3)|$2,$imm); } else { &::generic("pclmulqdq",@_); } } sub ::rdrand { my ($dst)=@_; if ($dst =~ /(e[a-dsd][ixp])/) { &::data_byte(0x0f,0xc7,0xf0|$regrm{$dst}); } else { &::generic("rdrand",@_); } } sub ::rdseed { my ($dst)=@_; if ($dst =~ /(e[a-dsd][ixp])/) { &::data_byte(0x0f,0xc7,0xf8|$regrm{$dst}); } else { &::generic("rdrand",@_); } } sub rxb { local *opcode=shift; my ($dst,$src1,$src2,$rxb)=@_; $rxb|=0x7<<5; $rxb&=~(0x04<<5) if($dst>=8); $rxb&=~(0x01<<5) if($src1>=8); $rxb&=~(0x02<<5) if($src2>=8); push @opcode,$rxb; } sub ::vprotd { my $args=join(',',@_); if ($args =~ /xmm([0-7]),xmm([0-7]),([x0-9a-f]+)/) { my @opcode=(0x8f); rxb(\@opcode,$1,$2,-1,0x08); push @opcode,0x78,0xc2; push @opcode,0xc0|($2&7)|(($1&7)<<3); # ModR/M my $c=$3; push @opcode,$c=~/^0/?oct($c):$c; &::data_byte(@opcode); } else { &::generic("vprotd",@_); } } sub ::endbranch { &::data_byte(0xf3,0x0f,0x1e,0xfb); } # label management $lbdecor="L"; # local label decoration, set by package $label="000"; sub ::islabel # see is argument is a known label { my $i; foreach $i (values %label) { return $i if ($i eq $_[0]); } $label{$_[0]}; # can be undef } sub ::label # instantiate a function-scope label { if (!defined($label{$_[0]})) { $label{$_[0]}="${lbdecor}${label}${_[0]}"; $label++; } $label{$_[0]}; } sub ::LABEL # instantiate a file-scope label { $label{$_[0]}=$_[1] if (!defined($label{$_[0]})); $label{$_[0]}; } sub ::static_label { &::LABEL($_[0],$lbdecor.$_[0]); } sub ::set_label_B { push(@out,"@_:\n"); } sub ::set_label { my $label=&::label($_[0]); &::align($_[1]) if ($_[1]>1); &::set_label_B($label); $label; } sub ::wipe_labels # wipes function-scope labels { foreach $i (keys %label) { delete $label{$i} if ($label{$i} =~ /^\Q${lbdecor}\E[0-9]{3}/); } } # subroutine management sub ::function_begin { &function_begin_B(@_); $stack=4; &push("ebp"); &push("ebx"); &push("esi"); &push("edi"); } sub ::function_end { &pop("edi"); &pop("esi"); &pop("ebx"); &pop("ebp"); &ret(); &function_end_B(@_); $stack=0; &wipe_labels(); } sub ::function_end_A { &pop("edi"); &pop("esi"); &pop("ebx"); &pop("ebp"); &ret(); $stack+=16; # 
readjust esp as if we didn't pop anything } sub ::asciz { my @str=unpack("C*",shift); push @str,0; while ($#str>15) { &data_byte(@str[0..15]); foreach (0..15) { shift @str; } } &data_byte(@str) if (@str); } sub ::asm_finish { &file_end(); my $comment = "//"; $comment = ";" if ($win32); print <<___; $comment This file is generated from a similarly-named Perl script in the BoringSSL $comment source tree. Do not edit by hand. ___ if ($win32) { print <<___ unless $masm; \%include "ring_core_generated/prefix_symbols_nasm.inc" \%ifidn __OUTPUT_FORMAT__, win32 ___ print @out; print <<___ unless $masm; \%else ; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 ret \%endif ___ } else { my $target; if ($elf) { $target = "defined(__ELF__)"; } elsif ($macosx) { $target = "defined(__APPLE__)"; } else { die "unknown target"; } print <<___; #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && $target ___ print @out; print <<___; #endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && $target ___ } } sub ::asm_init { my ($type,$cpu)=@_; $i386=$cpu; $elf=$cpp=$coff=$aout=$macosx=$win32=$mwerks=$android=0; if (($type eq "elf")) { $elf=1; require "x86gas.pl"; } elsif (($type eq "elf-1")) { $elf=-1; require "x86gas.pl"; } elsif (($type eq "a\.out")) { $aout=1; require "x86gas.pl"; } elsif (($type eq "coff" or $type eq "gaswin")) { $coff=1; require "x86gas.pl"; } elsif (($type eq "win32n")) { $win32=1; require "x86nasm.pl"; } elsif (($type eq "win32")) { $win32=1; $masm=1; require "x86masm.pl"; } elsif (($type eq "macosx")) { $aout=1; $macosx=1; require "x86gas.pl"; } elsif (($type eq "android")) { $elf=1; $android=1; require "x86gas.pl"; } else { print STDERR <<"EOF"; Pick one target type from elf - Linux, FreeBSD, Solaris x86, etc. a.out - DJGPP, elder OpenBSD, etc. coff - GAS/COFF such as Win32 targets win32n - Windows 95/Windows NT NASM format macosx - Mac OS X EOF exit(1); } $pic=0; for (@ARGV) { $pic=1 if (/\-[fK]PIC/i); } &file(); } sub ::hidden {} 1; ring-0.17.14/crypto/perlasm/x86gas.pl000064400000000000000000000155331046102023000154040ustar 00000000000000#! /usr/bin/env perl # Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
package x86gas; *out=\@::out; $::lbdecor=$::aout?"L":".L"; # local label decoration $nmdecor=($::aout or $::coff)?"_":""; # external name decoration $initseg=""; $align=16; $align=log($align)/log(2) if ($::aout); $com_start="#" if ($::aout or $::coff); sub opsize() { my $reg=shift; if ($reg =~ m/^%e/o) { "l"; } elsif ($reg =~ m/^%[a-d][hl]$/o) { "b"; } elsif ($reg =~ m/^%[yxm]/o) { undef; } else { "w"; } } # swap arguments; # expand opcode with size suffix; # prefix numeric constants with $; sub ::generic { my($opcode,@arg)=@_; my($suffix,$dst,$src); @arg=reverse(@arg); for (@arg) { s/^(\*?)(e?[a-dsixphl]{2})$/$1%$2/o; # gp registers s/^([xy]?mm[0-7])$/%$1/o; # xmm/mmx registers s/^(\-?[0-9]+)$/\$$1/o; # constants s/^(\-?0x[0-9a-f]+)$/\$$1/o; # constants } $dst = $arg[$#arg] if ($#arg>=0); $src = $arg[$#arg-1] if ($#arg>=1); if ($dst =~ m/^%/o) { $suffix=&opsize($dst); } elsif ($src =~ m/^%/o) { $suffix=&opsize($src); } else { $suffix="l"; } undef $suffix if ($dst =~ m/^%[xm]/o || $src =~ m/^%[xm]/o); if ($#_==0) { &::emit($opcode); } elsif ($#_==1 && $opcode =~ m/^(call|clflush|j|loop|set)/o) { &::emit($opcode,@arg); } else { &::emit($opcode.$suffix,@arg);} 1; } # # opcodes not covered by ::generic above, mostly inconsistent namings... # sub ::movzx { &::movzb(@_); } sub ::pushfd { &::pushfl; } sub ::popfd { &::popfl; } sub ::cpuid { &::emit(".byte\t0x0f,0xa2"); } sub ::rdtsc { &::emit(".byte\t0x0f,0x31"); } sub ::call { &::emit("call",(&::islabel($_[0]) or "$nmdecor$_[0]")); } sub ::call_ptr { &::generic("call","*$_[0]"); } sub ::jmp_ptr { &::generic("jmp","*$_[0]"); } *::bswap = sub { &::emit("bswap","%$_[0]"); } if (!$::i386); sub ::DWP { my($addr,$reg1,$reg2,$idx)=@_; my $ret=""; if (!defined($idx) && 1*$reg2) { $idx=$reg2; $reg2=$reg1; undef $reg1; } $addr =~ s/^\s+//; # prepend global references with optional underscore $addr =~ s/^([^\+\-0-9][^\+\-]*)/&::islabel($1) or "$nmdecor$1"/ige; $reg1 = "%$reg1" if ($reg1); $reg2 = "%$reg2" if ($reg2); $ret .= $addr if (($addr ne "") && ($addr ne 0)); if ($reg2) { $idx!= 0 or $idx=1; $ret .= "($reg1,$reg2,$idx)"; } elsif ($reg1) { $ret .= "($reg1)"; } $ret; } sub ::QWP { &::DWP(@_); } sub ::BP { &::DWP(@_); } sub ::WP { &::DWP(@_); } sub ::BC { @_; } sub ::DWC { @_; } sub ::file { push(@out,".text\n"); } sub ::function_begin_B { my $func=shift; my $global=($func !~ /^_/); my $begin="${::lbdecor}_${func}_begin"; &::LABEL($func,$global?"$begin":"$nmdecor$func"); $func=$nmdecor.$func; push(@out,".globl\t$func\n") if ($global); if ($::macosx) { push(@out,".private_extern\t$func\n"); } else { push(@out,".hidden\t$func\n"); } if ($::coff) { push(@out,".def\t$func;\t.scl\t".(3-$global).";\t.type\t32;\t.endef\n"); } elsif (($::aout and !$::pic) or $::macosx) { } else { push(@out,".type $func,\@function\n"); } push(@out,".align\t$align\n"); push(@out,"$func:\n"); push(@out,"$begin:\n") if ($global); $::stack=4; } sub ::function_end_B { my $func=shift; push(@out,".size\t$nmdecor$func,.-".&::LABEL($func)."\n") if ($::elf); $::stack=0; &::wipe_labels(); } sub ::comment { if (!defined($com_start) or $::elf) { # Regarding $::elf above... # GNU and SVR4 as'es use different comment delimiters, push(@out,"\n"); # so we just skip ELF comments... 
return; } foreach (@_) { if (/^\s*$/) { push(@out,"\n"); } else { push(@out,"\t$com_start $_ $com_end\n"); } } } sub ::external_label { foreach(@_) { &::LABEL($_,$nmdecor.$_); } } sub ::public_label { push(@out,".globl\t".&::LABEL($_[0],$nmdecor.$_[0])."\n"); } sub ::file_end { if ($::macosx) { if (%non_lazy_ptr) { push(@out,".section __IMPORT,__pointers,non_lazy_symbol_pointers\n"); foreach $i (keys %non_lazy_ptr) { push(@out,"$non_lazy_ptr{$i}:\n.indirect_symbol\t$i\n.long\t0\n"); } } } if (0 && grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out) { my $tmp=".comm\t${nmdecor}OPENSSL_ia32cap_P,16"; if ($::macosx) { push (@out,"$tmp,2\n"); } elsif ($::elf) { push (@out,"$tmp,4\n"); } else { push (@out,"$tmp\n"); } } push(@out,$initseg) if ($initseg); } sub ::data_byte { push(@out,".byte\t".join(',',@_)."\n"); } sub ::data_short{ push(@out,".value\t".join(',',@_)."\n"); } sub ::data_word { push(@out,".long\t".join(',',@_)."\n"); } sub ::align { my $val=$_[0]; if ($::aout) { $val=int(log($val)/log(2)); $val.=",0x90"; } push(@out,".align\t$val\n"); } sub ::picmeup { my($dst,$sym,$base,$reflabel)=@_; if (($::pic && ($::elf || $::aout)) || $::macosx) { if (!defined($base)) { &::call(&::label("PIC_me_up")); &::set_label("PIC_me_up"); &::blindpop($dst); $base=$dst; $reflabel=&::label("PIC_me_up"); } if ($::macosx) { my $indirect=&::static_label("$nmdecor$sym\$non_lazy_ptr"); &::mov($dst,&::DWP("$indirect-$reflabel",$base)); $non_lazy_ptr{"$nmdecor$sym"}=$indirect; } elsif ($sym eq "OPENSSL_ia32cap_P" && $::elf>0) { &::lea($dst,&::DWP("$sym-$reflabel",$base)); } else { &::lea($dst,&::DWP("_GLOBAL_OFFSET_TABLE_+[.-$reflabel]", $base)); &::mov($dst,&::DWP("$sym\@GOT",$dst)); } } else { &::lea($dst,&::DWP($sym)); } } sub ::initseg { my $f=$nmdecor.shift; if ($::android) { $initseg.=<<___; .section .init_array .align 4 .long $f ___ } elsif ($::elf) { $initseg.=<<___; .section .init call $f ___ } elsif ($::coff) { $initseg.=<<___; # applies to both Cygwin and Mingw .section .ctors .long $f ___ } elsif ($::macosx) { $initseg.=<<___; .mod_init_func .align 2 .long $f ___ } elsif ($::aout) { my $ctor="${nmdecor}_GLOBAL_\$I\$$f"; $initseg.=".text\n"; $initseg.=".type $ctor,\@function\n" if ($::pic); $initseg.=<<___; # OpenBSD way... .globl $ctor .align 2 $ctor: jmp $f ___ } } sub ::dataseg { push(@out,".data\n"); } sub ::preprocessor_ifdef { my($define)=@_; push(@out,"#ifdef ${define}\n"); } sub ::preprocessor_endif { push(@out,"#endif\n"); } *::hidden = sub { push(@out,".hidden\t$nmdecor$_[0]\n"); } if ($::elf); 1; ring-0.17.14/crypto/perlasm/x86nasm.pl000064400000000000000000000115131046102023000155620ustar 00000000000000#! /usr/bin/env perl # Copyright 1999-2018 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
package x86nasm; *out=\@::out; $::lbdecor="L\$"; # local label decoration $nmdecor="_"; # external name decoration $drdecor=$::mwerks?".":""; # directive decoration $initseg=""; sub ::generic { my $opcode=shift; my $tmp; if (!$::mwerks) { if ($opcode =~ m/^j/o && $#_==0) # optimize jumps { $_[0] = "NEAR $_[0]"; } elsif ($opcode eq "lea" && $#_==1) # wipe storage qualifier from lea { $_[1] =~ s/^[^\[]*\[/\[/o; } elsif ($opcode eq "clflush" && $#_==0) { $_[0] =~ s/^[^\[]*\[/\[/o; } } &::emit($opcode,@_); 1; } # # opcodes not covered by ::generic above, mostly inconsistent namings... # sub ::call { &::emit("call",(&::islabel($_[0]) or "$nmdecor$_[0]")); } sub ::call_ptr { &::emit("call",@_); } sub ::jmp_ptr { &::emit("jmp",@_); } sub get_mem { my($size,$addr,$reg1,$reg2,$idx)=@_; my($post,$ret); if (!defined($idx) && 1*$reg2) { $idx=$reg2; $reg2=$reg1; undef $reg1; } if ($size ne "") { $ret .= "$size"; $ret .= " PTR" if ($::mwerks); $ret .= " "; } $ret .= "["; $addr =~ s/^\s+//; # prepend global references with optional underscore $addr =~ s/^([^\+\-0-9][^\+\-]*)/::islabel($1) or "$nmdecor$1"/ige; # put address arithmetic expression in parenthesis $addr="($addr)" if ($addr =~ /^.+[\-\+].+$/); if (($addr ne "") && ($addr ne 0)) { if ($addr !~ /^-/) { $ret .= "$addr+"; } else { $post=$addr; } } if ($reg2 ne "") { $idx!=0 or $idx=1; $ret .= "$reg2*$idx"; $ret .= "+$reg1" if ($reg1 ne ""); } else { $ret .= "$reg1"; } $ret .= "$post]"; $ret =~ s/\+\]/]/; # in case $addr was the only argument $ret; } sub ::BP { &get_mem("BYTE",@_); } sub ::DWP { &get_mem("DWORD",@_); } sub ::WP { &get_mem("WORD",@_); } sub ::QWP { &get_mem("",@_); } sub ::BC { (($::mwerks)?"":"BYTE ")."@_"; } sub ::DWC { (($::mwerks)?"":"DWORD ")."@_"; } sub ::file { if ($::mwerks) { push(@out,".section\t.text,64\n"); } else { my $tmp=<<___; %ifidn __OUTPUT_FORMAT__,obj section code use32 class=code align=64 %elifidn __OUTPUT_FORMAT__,win32 \$\@feat.00 equ 1 section .text code align=64 %else section .text code %endif ___ push(@out,$tmp); } } sub ::function_begin_B { my $func=shift; my $global=($func !~ /^_/); my $begin="${::lbdecor}_${func}_begin"; $begin =~ s/^\@/./ if ($::mwerks); # the torture never stops &::LABEL($func,$global?"$begin":"$nmdecor$func"); $func=$nmdecor.$func; push(@out,"${drdecor}global $func\n") if ($global); push(@out,"${drdecor}align 16\n"); push(@out,"$func:\n"); push(@out,"$begin:\n") if ($global); $::stack=4; } sub ::function_end_B { $::stack=0; &::wipe_labels(); } sub ::file_end { if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out) { my $comm=<<___; ${drdecor}segment .bss ${drdecor}common ${nmdecor}OPENSSL_ia32cap_P 16 ___ # comment out OPENSSL_ia32cap_P declarations grep {s/(^extern\s+${nmdecor}OPENSSL_ia32cap_P)/\;$1/} @out; push (@out,$comm) } push (@out,$initseg) if ($initseg); } sub ::comment { foreach (@_) { push(@out,"\t; $_\n"); } } sub ::external_label { foreach(@_) { push(@out,"${drdecor}extern\t".&::LABEL($_,$nmdecor.$_)."\n"); } } sub ::public_label { push(@out,"${drdecor}global\t".&::LABEL($_[0],$nmdecor.$_[0])."\n"); } sub ::data_byte { push(@out,(($::mwerks)?".byte\t":"db\t").join(',',@_)."\n"); } sub ::data_short { push(@out,(($::mwerks)?".word\t":"dw\t").join(',',@_)."\n"); } sub ::data_word { push(@out,(($::mwerks)?".long\t":"dd\t").join(',',@_)."\n"); } sub ::align { push(@out,"${drdecor}align\t$_[0]\n"); } sub ::picmeup { my($dst,$sym)=@_; &::lea($dst,&::DWP($sym)); } sub ::initseg { my $f=$nmdecor.shift; if ($::win32) { $initseg=<<___; segment .CRT\$XCU data align=4 extern $f dd $f 
___ } } sub ::dataseg { if ($mwerks) { push(@out,".section\t.data,4\n"); } else { push(@out,"section\t.data align=4\n"); } } sub ::safeseh { my $nm=shift; push(@out,"%if __NASM_VERSION_ID__ >= 0x02030000\n"); push(@out,"safeseh ".&::LABEL($nm,$nmdecor.$nm)."\n"); push(@out,"%endif\n"); } sub ::preprocessor_ifdef { my($define)=@_; push(@out,"%ifdef ${define}\n"); } sub ::preprocessor_endif { push(@out,"%endif\n"); } 1; ring-0.17.14/crypto/poly1305/poly1305.c000064400000000000000000000165521046102023000152220ustar 00000000000000/* Copyright (c) 2014, Google Inc. * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ // This implementation of poly1305 is by Andrew Moon // (https://github.com/floodyberry/poly1305-donna) and released as public // domain. #include #include "../internal.h" #include "ring-core/check.h" #if defined(__GNUC__) || defined(__clang__) #pragma GCC diagnostic ignored "-Wsign-conversion" #pragma GCC diagnostic ignored "-Wconversion" #endif static uint64_t mul32x32_64(uint32_t a, uint32_t b) { return (uint64_t)a * b; } // Keep in sync with `poly1305_state_st` in ffi_fallback.rs. struct poly1305_state_st { alignas(64) uint32_t r0; uint32_t r1, r2, r3, r4; uint32_t s1, s2, s3, s4; uint32_t h0, h1, h2, h3, h4; uint8_t key[16]; }; // poly1305_blocks updates |state| given some amount of input data. This // function may only be called with a |len| that is not a multiple of 16 at the // end of the data. Otherwise the input must be buffered into 16 byte blocks. 
static void poly1305_update(struct poly1305_state_st *state, const uint8_t *in, size_t len) { debug_assert_nonsecret((uintptr_t)state % 64 == 0); uint32_t t0, t1, t2, t3; uint64_t t[5]; uint32_t b; uint64_t c; size_t j; uint8_t mp[16]; if (len < 16) { goto poly1305_donna_atmost15bytes; } poly1305_donna_16bytes: t0 = CRYPTO_load_u32_le(in); t1 = CRYPTO_load_u32_le(in + 4); t2 = CRYPTO_load_u32_le(in + 8); t3 = CRYPTO_load_u32_le(in + 12); in += 16; len -= 16; state->h0 += t0 & 0x3ffffff; state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff; state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff; state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff; state->h4 += (t3 >> 8) | (1 << 24); poly1305_donna_mul: t[0] = mul32x32_64(state->h0, state->r0) + mul32x32_64(state->h1, state->s4) + mul32x32_64(state->h2, state->s3) + mul32x32_64(state->h3, state->s2) + mul32x32_64(state->h4, state->s1); t[1] = mul32x32_64(state->h0, state->r1) + mul32x32_64(state->h1, state->r0) + mul32x32_64(state->h2, state->s4) + mul32x32_64(state->h3, state->s3) + mul32x32_64(state->h4, state->s2); t[2] = mul32x32_64(state->h0, state->r2) + mul32x32_64(state->h1, state->r1) + mul32x32_64(state->h2, state->r0) + mul32x32_64(state->h3, state->s4) + mul32x32_64(state->h4, state->s3); t[3] = mul32x32_64(state->h0, state->r3) + mul32x32_64(state->h1, state->r2) + mul32x32_64(state->h2, state->r1) + mul32x32_64(state->h3, state->r0) + mul32x32_64(state->h4, state->s4); t[4] = mul32x32_64(state->h0, state->r4) + mul32x32_64(state->h1, state->r3) + mul32x32_64(state->h2, state->r2) + mul32x32_64(state->h3, state->r1) + mul32x32_64(state->h4, state->r0); state->h0 = (uint32_t)t[0] & 0x3ffffff; c = (t[0] >> 26); t[1] += c; state->h1 = (uint32_t)t[1] & 0x3ffffff; b = (uint32_t)(t[1] >> 26); t[2] += b; state->h2 = (uint32_t)t[2] & 0x3ffffff; b = (uint32_t)(t[2] >> 26); t[3] += b; state->h3 = (uint32_t)t[3] & 0x3ffffff; b = (uint32_t)(t[3] >> 26); t[4] += b; state->h4 = (uint32_t)t[4] & 0x3ffffff; b = (uint32_t)(t[4] >> 26); state->h0 += b * 5; if (len >= 16) { goto poly1305_donna_16bytes; } // final bytes poly1305_donna_atmost15bytes: if (!len) { return; } for (j = 0; j < len; j++) { mp[j] = in[j]; } mp[j++] = 1; for (; j < 16; j++) { mp[j] = 0; } len = 0; t0 = CRYPTO_load_u32_le(mp + 0); t1 = CRYPTO_load_u32_le(mp + 4); t2 = CRYPTO_load_u32_le(mp + 8); t3 = CRYPTO_load_u32_le(mp + 12); state->h0 += t0 & 0x3ffffff; state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff; state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff; state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff; state->h4 += (t3 >> 8); goto poly1305_donna_mul; } void CRYPTO_poly1305_init(struct poly1305_state_st *state, const uint8_t key[32]) { debug_assert_nonsecret((uintptr_t)state % 64 == 0); uint32_t t0, t1, t2, t3; t0 = CRYPTO_load_u32_le(key + 0); t1 = CRYPTO_load_u32_le(key + 4); t2 = CRYPTO_load_u32_le(key + 8); t3 = CRYPTO_load_u32_le(key + 12); // precompute multipliers state->r0 = t0 & 0x3ffffff; t0 >>= 26; t0 |= t1 << 6; state->r1 = t0 & 0x3ffff03; t1 >>= 20; t1 |= t2 << 12; state->r2 = t1 & 0x3ffc0ff; t2 >>= 14; t2 |= t3 << 18; state->r3 = t2 & 0x3f03fff; t3 >>= 8; state->r4 = t3 & 0x00fffff; state->s1 = state->r1 * 5; state->s2 = state->r2 * 5; state->s3 = state->r3 * 5; state->s4 = state->r4 * 5; // init state state->h0 = 0; state->h1 = 0; state->h2 = 0; state->h3 = 0; state->h4 = 0; OPENSSL_memcpy(state->key, key + 16, sizeof(state->key)); } void CRYPTO_poly1305_update(struct poly1305_state_st *state, const 
uint8_t *in, size_t in_len) { // Work around a C language bug. See https://crbug.com/1019588. if (in_len == 0) { return; } poly1305_update(state, in, in_len); } void CRYPTO_poly1305_finish(struct poly1305_state_st *state, uint8_t mac[16]) { uint32_t g0, g1, g2, g3, g4; uint32_t b, nb; b = state->h0 >> 26; state->h0 = state->h0 & 0x3ffffff; state->h1 += b; b = state->h1 >> 26; state->h1 = state->h1 & 0x3ffffff; state->h2 += b; b = state->h2 >> 26; state->h2 = state->h2 & 0x3ffffff; state->h3 += b; b = state->h3 >> 26; state->h3 = state->h3 & 0x3ffffff; state->h4 += b; b = state->h4 >> 26; state->h4 = state->h4 & 0x3ffffff; state->h0 += b * 5; g0 = state->h0 + 5; b = g0 >> 26; g0 &= 0x3ffffff; g1 = state->h1 + b; b = g1 >> 26; g1 &= 0x3ffffff; g2 = state->h2 + b; b = g2 >> 26; g2 &= 0x3ffffff; g3 = state->h3 + b; b = g3 >> 26; g3 &= 0x3ffffff; g4 = state->h4 + b - (1 << 26); b = (g4 >> 31) - 1; nb = ~b; state->h0 = (state->h0 & nb) | (g0 & b); state->h1 = (state->h1 & nb) | (g1 & b); state->h2 = (state->h2 & nb) | (g2 & b); state->h3 = (state->h3 & nb) | (g3 & b); state->h4 = (state->h4 & nb) | (g4 & b); uint64_t f0 = ((state->h0) | (state->h1 << 26)) + (uint64_t)CRYPTO_load_u32_le(&state->key[0]); uint64_t f1 = ((state->h1 >> 6) | (state->h2 << 20)) + (uint64_t)CRYPTO_load_u32_le(&state->key[4]); uint64_t f2 = ((state->h2 >> 12) | (state->h3 << 14)) + (uint64_t)CRYPTO_load_u32_le(&state->key[8]); uint64_t f3 = ((state->h3 >> 18) | (state->h4 << 8)) + (uint64_t)CRYPTO_load_u32_le(&state->key[12]); CRYPTO_store_u32_le(&mac[0], (uint32_t)f0); f1 += (f0 >> 32); CRYPTO_store_u32_le(&mac[4], (uint32_t)f1); f2 += (f1 >> 32); CRYPTO_store_u32_le(&mac[8], (uint32_t)f2); f3 += (f2 >> 32); CRYPTO_store_u32_le(&mac[12], (uint32_t)f3); } ring-0.17.14/crypto/poly1305/poly1305_arm.c000064400000000000000000000163341046102023000160570ustar 00000000000000/* Copyright (c) 2014, Google Inc. * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ // This implementation was taken from the public domain, neon2 version in // SUPERCOP by D. J. Bernstein and Peter Schwabe. 
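// A minimal usage sketch of the one-shot init/update/finish flow implemented
// by poly1305.c above and mirrored by the *_neon functions below. The three
// prototypes are copied from those definitions; poly1305_mac_sketch itself is
// a hypothetical helper, not part of this crate (the real callers live on the
// Rust side of the FFI, see ffi_fallback.rs / ffi_arm_neon.rs).

#include <stddef.h>
#include <stdint.h>

struct poly1305_state_st;  // 64-byte-aligned state defined in poly1305.c

void CRYPTO_poly1305_init(struct poly1305_state_st *state,
                          const uint8_t key[32]);
void CRYPTO_poly1305_update(struct poly1305_state_st *state, const uint8_t *in,
                            size_t in_len);
void CRYPTO_poly1305_finish(struct poly1305_state_st *state, uint8_t mac[16]);

// Computes mac = Poly1305(key, msg) using a caller-provided, properly aligned
// state. The state holds the clamped r limbs, the precomputed 5*r limbs, the
// accumulator h, and the second key half s that is added during finish.
static void poly1305_mac_sketch(struct poly1305_state_st *state,
                                const uint8_t key[32], const uint8_t *msg,
                                size_t msg_len, uint8_t mac[16]) {
  CRYPTO_poly1305_init(state, key);
  CRYPTO_poly1305_update(state, msg, msg_len);
  CRYPTO_poly1305_finish(state, mac);
}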
#include #include "../internal.h" #pragma GCC diagnostic ignored "-Wsign-conversion" // Keep in sync with ffi_arm_neon.rs typedef struct { uint32_t v[12]; // for alignment; only using 10 } fe1305x2; #define addmulmod openssl_poly1305_neon2_addmulmod #define blocks openssl_poly1305_neon2_blocks extern void addmulmod(fe1305x2 *r, const fe1305x2 *x, const fe1305x2 *y, const fe1305x2 *c); extern int blocks(fe1305x2 *h, const fe1305x2 *precomp, const uint8_t *in, size_t inlen); static void freeze(fe1305x2 *r) { int i; uint32_t x0 = r->v[0]; uint32_t x1 = r->v[2]; uint32_t x2 = r->v[4]; uint32_t x3 = r->v[6]; uint32_t x4 = r->v[8]; uint32_t y0; uint32_t y1; uint32_t y2; uint32_t y3; uint32_t y4; uint32_t swap; for (i = 0; i < 3; ++i) { x1 += x0 >> 26; x0 &= 0x3ffffff; x2 += x1 >> 26; x1 &= 0x3ffffff; x3 += x2 >> 26; x2 &= 0x3ffffff; x4 += x3 >> 26; x3 &= 0x3ffffff; x0 += 5 * (x4 >> 26); x4 &= 0x3ffffff; } y0 = x0 + 5; y1 = x1 + (y0 >> 26); y0 &= 0x3ffffff; y2 = x2 + (y1 >> 26); y1 &= 0x3ffffff; y3 = x3 + (y2 >> 26); y2 &= 0x3ffffff; y4 = x4 + (y3 >> 26); y3 &= 0x3ffffff; swap = -(y4 >> 26); y4 &= 0x3ffffff; y0 ^= x0; y1 ^= x1; y2 ^= x2; y3 ^= x3; y4 ^= x4; y0 &= swap; y1 &= swap; y2 &= swap; y3 &= swap; y4 &= swap; y0 ^= x0; y1 ^= x1; y2 ^= x2; y3 ^= x3; y4 ^= x4; r->v[0] = y0; r->v[2] = y1; r->v[4] = y2; r->v[6] = y3; r->v[8] = y4; } static void store32(uint8_t out[4], uint32_t v) { OPENSSL_memcpy(out, &v, 4); } // load32 exists to avoid breaking strict aliasing rules in // fe1305x2_frombytearray. static uint32_t load32(const uint8_t t[4]) { uint32_t tmp; OPENSSL_memcpy(&tmp, t, sizeof(tmp)); return tmp; } static void fe1305x2_tobytearray(uint8_t r[16], fe1305x2 *x) { uint32_t x0 = x->v[0]; uint32_t x1 = x->v[2]; uint32_t x2 = x->v[4]; uint32_t x3 = x->v[6]; uint32_t x4 = x->v[8]; x1 += x0 >> 26; x0 &= 0x3ffffff; x2 += x1 >> 26; x1 &= 0x3ffffff; x3 += x2 >> 26; x2 &= 0x3ffffff; x4 += x3 >> 26; x3 &= 0x3ffffff; store32(r, x0 + (x1 << 26)); store32(r + 4, (x1 >> 6) + (x2 << 20)); store32(r + 8, (x2 >> 12) + (x3 << 14)); store32(r + 12, (x3 >> 18) + (x4 << 8)); } static void fe1305x2_frombytearray(fe1305x2 *r, const uint8_t *x, size_t xlen) { size_t i; uint8_t t[17]; for (i = 0; (i < 16) && (i < xlen); i++) { t[i] = x[i]; } xlen -= i; x += i; t[i++] = 1; for (; i < 17; i++) { t[i] = 0; } r->v[0] = 0x3ffffff & load32(t); r->v[2] = 0x3ffffff & (load32(t + 3) >> 2); r->v[4] = 0x3ffffff & (load32(t + 6) >> 4); r->v[6] = 0x3ffffff & (load32(t + 9) >> 6); r->v[8] = load32(t + 13); if (xlen) { for (i = 0; (i < 16) && (i < xlen); i++) { t[i] = x[i]; } t[i++] = 1; for (; i < 17; i++) { t[i] = 0; } r->v[1] = 0x3ffffff & load32(t); r->v[3] = 0x3ffffff & (load32(t + 3) >> 2); r->v[5] = 0x3ffffff & (load32(t + 6) >> 4); r->v[7] = 0x3ffffff & (load32(t + 9) >> 6); r->v[9] = load32(t + 13); } else { r->v[1] = r->v[3] = r->v[5] = r->v[7] = r->v[9] = 0; } } static const alignas(16) fe1305x2 zero; // Keep in sync with ffi_arm_neon.rs struct poly1305_state_st { alignas(16) fe1305x2 r; fe1305x2 h; fe1305x2 c; fe1305x2 precomp[2]; uint8_t data[128]; uint8_t buf[32]; size_t buf_used; uint8_t key[16]; }; OPENSSL_STATIC_ASSERT(sizeof(fe1305x2) == 48, "fe1305x2 size is different than expected"); void CRYPTO_poly1305_init_neon(struct poly1305_state_st *st, const uint8_t key[32]) { fe1305x2 *const r = &st->r; fe1305x2 *const h = &st->h; fe1305x2 *const precomp = &st->precomp[0]; r->v[1] = r->v[0] = 0x3ffffff & load32(key); r->v[3] = r->v[2] = 0x3ffff03 & (load32(key + 3) >> 2); r->v[5] = r->v[4] = 0x3ffc0ff & (load32(key + 
6) >> 4); r->v[7] = r->v[6] = 0x3f03fff & (load32(key + 9) >> 6); r->v[9] = r->v[8] = 0x00fffff & (load32(key + 12) >> 8); for (size_t j = 0; j < 10; j++) { h->v[j] = 0; // XXX: should fast-forward a bit } addmulmod(precomp, r, r, &zero); // precompute r^2 addmulmod(precomp + 1, precomp, precomp, &zero); // precompute r^4 OPENSSL_memcpy(st->key, key + 16, 16); st->buf_used = 0; } void CRYPTO_poly1305_update_neon(struct poly1305_state_st *st, const uint8_t *in, size_t in_len) { fe1305x2 *const h = &st->h; fe1305x2 *const c = &st->c; fe1305x2 *const precomp = &st->precomp[0]; if (st->buf_used) { size_t todo = 32 - st->buf_used; if (todo > in_len) { todo = in_len; } for (size_t i = 0; i < todo; i++) { st->buf[st->buf_used + i] = in[i]; } st->buf_used += todo; in_len -= todo; in += todo; if (st->buf_used == sizeof(st->buf) && in_len) { addmulmod(h, h, precomp, &zero); fe1305x2_frombytearray(c, st->buf, sizeof(st->buf)); for (size_t i = 0; i < 10; i++) { h->v[i] += c->v[i]; } st->buf_used = 0; } } while (in_len > 32) { size_t tlen = 1048576; if (in_len < tlen) { tlen = in_len; } tlen -= blocks(h, precomp, in, tlen); in_len -= tlen; in += tlen; } if (in_len) { for (size_t i = 0; i < in_len; i++) { st->buf[i] = in[i]; } st->buf_used = in_len; } } void CRYPTO_poly1305_finish_neon(struct poly1305_state_st *st, uint8_t mac[16]) { fe1305x2 *const r = &st->r; fe1305x2 *const h = &st->h; fe1305x2 *const c = &st->c; fe1305x2 *const precomp = &st->precomp[0]; addmulmod(h, h, precomp, &zero); if (st->buf_used > 16) { fe1305x2_frombytearray(c, st->buf, st->buf_used); precomp->v[1] = r->v[1]; precomp->v[3] = r->v[3]; precomp->v[5] = r->v[5]; precomp->v[7] = r->v[7]; precomp->v[9] = r->v[9]; addmulmod(h, h, precomp, c); } else if (st->buf_used > 0) { fe1305x2_frombytearray(c, st->buf, st->buf_used); r->v[1] = 1; r->v[3] = 0; r->v[5] = 0; r->v[7] = 0; r->v[9] = 0; addmulmod(h, h, r, c); } h->v[0] += h->v[1]; h->v[2] += h->v[3]; h->v[4] += h->v[5]; h->v[6] += h->v[7]; h->v[8] += h->v[9]; freeze(h); fe1305x2_frombytearray(c, st->key, 16); c->v[8] ^= (1 << 24); h->v[0] += c->v[0]; h->v[2] += c->v[2]; h->v[4] += c->v[4]; h->v[6] += c->v[6]; h->v[8] += c->v[8]; fe1305x2_tobytearray(mac, h); } ring-0.17.14/crypto/poly1305/poly1305_arm_asm.S000064400000000000000000001721061046102023000166770ustar 00000000000000#include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) #pragma GCC diagnostic ignored "-Wlanguage-extension-token" # This implementation was taken from the public domain, neon2 version in # SUPERCOP by D. J. Bernstein and Peter Schwabe. 
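# Summary of the entry points below, as used by the C wrapper in
# poly1305_arm.c: openssl_poly1305_neon2_blocks(h, precomp, in, inlen) absorbs
# the input two 16-byte blocks at a time into the two-lane fe1305x2
# accumulator h, using the precomputed powers of r (r^2 and r^4) stored in
# precomp, and returns the number of bytes it did not process;
# openssl_poly1305_neon2_addmulmod is the vectorized multiply-and-reduce
# mod 2^130 - 5 used both to build those powers and to fold in the final
# blocks.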
# qhasm: int32 input_0 # qhasm: int32 input_1 # qhasm: int32 input_2 # qhasm: int32 input_3 # qhasm: stack32 input_4 # qhasm: stack32 input_5 # qhasm: stack32 input_6 # qhasm: stack32 input_7 # qhasm: int32 caller_r4 # qhasm: int32 caller_r5 # qhasm: int32 caller_r6 # qhasm: int32 caller_r7 # qhasm: int32 caller_r8 # qhasm: int32 caller_r9 # qhasm: int32 caller_r10 # qhasm: int32 caller_r11 # qhasm: int32 caller_r12 # qhasm: int32 caller_r14 # qhasm: reg128 caller_q4 # qhasm: reg128 caller_q5 # qhasm: reg128 caller_q6 # qhasm: reg128 caller_q7 # qhasm: startcode .fpu neon .text # qhasm: reg128 r0 # qhasm: reg128 r1 # qhasm: reg128 r2 # qhasm: reg128 r3 # qhasm: reg128 r4 # qhasm: reg128 x01 # qhasm: reg128 x23 # qhasm: reg128 x4 # qhasm: reg128 y0 # qhasm: reg128 y12 # qhasm: reg128 y34 # qhasm: reg128 5y12 # qhasm: reg128 5y34 # qhasm: stack128 y0_stack # qhasm: stack128 y12_stack # qhasm: stack128 y34_stack # qhasm: stack128 5y12_stack # qhasm: stack128 5y34_stack # qhasm: reg128 z0 # qhasm: reg128 z12 # qhasm: reg128 z34 # qhasm: reg128 5z12 # qhasm: reg128 5z34 # qhasm: stack128 z0_stack # qhasm: stack128 z12_stack # qhasm: stack128 z34_stack # qhasm: stack128 5z12_stack # qhasm: stack128 5z34_stack # qhasm: stack128 two24 # qhasm: int32 ptr # qhasm: reg128 c01 # qhasm: reg128 c23 # qhasm: reg128 d01 # qhasm: reg128 d23 # qhasm: reg128 t0 # qhasm: reg128 t1 # qhasm: reg128 t2 # qhasm: reg128 t3 # qhasm: reg128 t4 # qhasm: reg128 mask # qhasm: reg128 u0 # qhasm: reg128 u1 # qhasm: reg128 u2 # qhasm: reg128 u3 # qhasm: reg128 u4 # qhasm: reg128 v01 # qhasm: reg128 mid # qhasm: reg128 v23 # qhasm: reg128 v4 # qhasm: int32 len # qhasm: qpushenter crypto_onetimeauth_poly1305_neon2_blocks .align 4 .global openssl_poly1305_neon2_blocks .hidden openssl_poly1305_neon2_blocks .type openssl_poly1305_neon2_blocks STT_FUNC openssl_poly1305_neon2_blocks: vpush {q4,q5,q6,q7} mov r12,sp sub sp,sp,#192 bic sp,sp,#31 # qhasm: len = input_3 # asm 1: mov >len=int32#4,len=r3,y12=reg128#2%bot->y12=reg128#2%top},[y12=d2->y12=d3},[y34=reg128#3%bot->y34=reg128#3%top},[y34=d4->y34=d5},[input_1=int32#2,input_1=r1,z12=reg128#5%bot->z12=reg128#5%top},[z12=d8->z12=d9},[z34=reg128#6%bot->z34=reg128#6%top},[z34=d10->z34=d11},[mask=reg128#7,#0xffffffff # asm 2: vmov.i64 >mask=q6,#0xffffffff vmov.i64 q6,#0xffffffff # qhasm: 2x u4 = 0xff # asm 1: vmov.i64 >u4=reg128#8,#0xff # asm 2: vmov.i64 >u4=q7,#0xff vmov.i64 q7,#0xff # qhasm: x01 aligned= mem128[input_0];input_0+=16 # asm 1: vld1.8 {>x01=reg128#9%bot->x01=reg128#9%top},[x01=d16->x01=d17},[x23=reg128#10%bot->x23=reg128#10%top},[x23=d18->x23=d19},[input_0=int32#1,input_0=r0,>=6 # asm 1: vshr.u64 >mask=reg128#7,mask=q6,>= 7 # asm 1: vshr.u64 >u4=reg128#8,u4=q7,5y12=reg128#12,5y12=q11,5y34=reg128#13,5y34=q12,5y12=reg128#12,<5y12=reg128#12,5y12=q11,<5y12=q11,5y34=reg128#13,<5y34=reg128#13,5y34=q12,<5y34=q12,u4=reg128#8,u4=q7,5z12=reg128#14,5z12=q13,5z34=reg128#15,5z34=q14,5z12=reg128#14,<5z12=reg128#14,5z12=q13,<5z12=q13,5z34=reg128#15,<5z34=reg128#15,5z34=q14,<5z34=q14,ptr=int32#2,ptr=r1,r4=reg128#16,r4=q15,r0=reg128#8,r0=q7,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,<5y12_stack=stack128#5 # asm 2: lea >ptr=r1,<5y12_stack=[sp,#64] add r1,sp,#64 # qhasm: mem128[ptr] aligned= 5y12 # asm 1: vst1.8 {<5y12=reg128#12%bot-<5y12=reg128#12%top},[ptr=int32#2,<5y34_stack=stack128#6 # asm 2: lea >ptr=r1,<5y34_stack=[sp,#80] add r1,sp,#80 # qhasm: mem128[ptr] aligned= 5y34 # asm 1: vst1.8 
{<5y34=reg128#13%bot-<5y34=reg128#13%top},[ptr=int32#2,<5z12_stack=stack128#10 # asm 2: lea >ptr=r1,<5z12_stack=[sp,#144] add r1,sp,#144 # qhasm: mem128[ptr] aligned= 5z12 # asm 1: vst1.8 {<5z12=reg128#14%bot-<5z12=reg128#14%top},[ptr=int32#2,<5z34_stack=stack128#11 # asm 2: lea >ptr=r1,<5z34_stack=[sp,#160] add r1,sp,#160 # qhasm: mem128[ptr] aligned= 5z34 # asm 1: vst1.8 {<5z34=reg128#15%bot-<5z34=reg128#15%top},[? len - 64 # asm 1: cmp bls ._below64bytes # qhasm: input_2 += 32 # asm 1: add >input_2=int32#2,input_2=r1,c01=reg128#1%bot->c01=reg128#1%top},[c01=d0->c01=d1},[c23=reg128#2%bot->c23=reg128#2%top},[c23=d2->c23=d3},[ptr=int32#3,ptr=r2,z12=reg128#3%bot->z12=reg128#3%top},[z12=d4->z12=d5},[ptr=int32#3,ptr=r2,z0=reg128#4%bot->z0=reg128#4%top},[z0=d6->z0=d7},[r3=reg128#5,r3=q4,input_2=int32#2,input_2=r1,ptr=int32#3,<5z34_stack=stack128#11 # asm 2: lea >ptr=r2,<5z34_stack=[sp,#160] add r2,sp,#160 # qhasm: 5z34 aligned= mem128[ptr] # asm 1: vld1.8 {>5z34=reg128#6%bot->5z34=reg128#6%top},[5z34=d10->5z34=d11},[r0=reg128#8,r0=q7,r2=reg128#14,r2=q13,d01=reg128#12%bot->d01=reg128#12%top},[d01=d22->d01=d23},[r1=reg128#15,r1=q14,ptr=int32#3,<5z12_stack=stack128#10 # asm 2: lea >ptr=r2,<5z12_stack=[sp,#144] add r2,sp,#144 # qhasm: 5z12 aligned= mem128[ptr] # asm 1: vld1.8 {>5z12=reg128#1%bot->5z12=reg128#1%top},[5z12=d0->5z12=d1},[d23=reg128#2%bot->d23=reg128#2%top},[d23=d2->d23=d3},[input_2=int32#2,input_2=r1,> 40 # asm 1: vshr.u64 >v4=reg128#4,v4=q3,> 14; v23[3] = d23[2,3] unsigned>> 14 # asm 1: vshrn.u64 > 26; v01[3] = d01[2,3] unsigned>> 26 # asm 1: vshrn.u64 > 20; v23[1] = mid[2,3] unsigned>> 20 # asm 1: vshrn.u64 ptr=int32#3,ptr=r2,y34=reg128#3%bot->y34=reg128#3%top},[y34=d4->y34=d5},[ptr=int32#3,ptr=r2,y12=reg128#2%bot->y12=reg128#2%top},[y12=d2->y12=d3},[ptr=int32#3,ptr=r2,y0=reg128#1%bot->y0=reg128#1%top},[y0=d0->y0=d1},[ptr=int32#3,<5y34_stack=stack128#6 # asm 2: lea >ptr=r2,<5y34_stack=[sp,#80] add r2,sp,#80 # qhasm: 5y34 aligned= mem128[ptr] # asm 1: vld1.8 {>5y34=reg128#13%bot->5y34=reg128#13%top},[5y34=d24->5y34=d25},[ptr=int32#3,<5y12_stack=stack128#5 # asm 2: lea >ptr=r2,<5y12_stack=[sp,#64] add r2,sp,#64 # qhasm: 5y12 aligned= mem128[ptr] # asm 1: vld1.8 {>5y12=reg128#12%bot->5y12=reg128#12%top},[5y12=d22->5y12=d23},[ptr=int32#3,ptr=r2,> 26 # asm 1: vshr.u64 >t1=reg128#4,t1=q3,len=int32#4,len=r3,r0=reg128#6,r0=q5,r1=reg128#4,r1=q3,> 26 # asm 1: vshr.u64 >t4=reg128#8,t4=q7,r3=reg128#5,r3=q4,x4=reg128#8,x4=q7,r4=reg128#16%bot->r4=reg128#16%top},[r4=d30->r4=d31},[> 26 # asm 1: vshr.u64 >t2=reg128#9,t2=q8,r1=reg128#4,r1=q3,> 26 # asm 1: vshr.u64 >t0=reg128#10,t0=q9,r2=reg128#9,r2=q8,x4=reg128#11,x4=q10,x01=reg128#6,x01=q5,r0=reg128#8%bot->r0=reg128#8%top},[r0=d14->r0=d15},[ptr=int32#3,ptr=r2,t0=reg128#10,t0=q9,> 26 # asm 1: vshr.u64 >t3=reg128#14,t3=q13,x01=reg128#15,x01=q14,z34=reg128#6%bot->z34=reg128#6%top},[z34=d10->z34=d11},[x23=reg128#10,x23=q9,r3=reg128#5,r3=q4,input_2=int32#2,input_2=r1,> 26 # asm 1: vshr.u64 >t1=reg128#14,t1=q13,x01=reg128#9,x01=q8,r1=reg128#4,r1=q3,> 26 # asm 1: vshr.u64 >t4=reg128#14,t4=q13,r3=reg128#5,r3=q4,x4=reg128#11,x4=q10,? len - 64 # asm 1: cmp bhi ._mainloop2 # qhasm: input_2 -= 32 # asm 1: sub >input_2=int32#3,input_2=r2,? 
len - 32 # asm 1: cmp bls ._end # qhasm: mainloop: ._mainloop: # qhasm: new r0 # qhasm: ptr = &two24 # asm 1: lea >ptr=int32#2,ptr=r1,r4=reg128#5%bot->r4=reg128#5%top},[r4=d8->r4=d9},[u4=reg128#6%bot->u4=reg128#6%top},[u4=d10->u4=d11},[c01=reg128#8%bot->c01=reg128#8%top},[c01=d14->c01=d15},[c23=reg128#14%bot->c23=reg128#14%top},[c23=d26->c23=d27},[r0=reg128#4,r0=q3,r3=reg128#6,r3=q5,r1=reg128#14,r1=q13,r2=reg128#8,r2=q7,> 26 # asm 1: vshr.u64 >t1=reg128#9,t1=q8,r0=reg128#4,r0=q3,r1=reg128#9,r1=q8,> 26 # asm 1: vshr.u64 >t4=reg128#10,t4=q9,r3=reg128#6,r3=q5,r4=reg128#5,r4=q4,> 26 # asm 1: vshr.u64 >t2=reg128#10,t2=q9,r1=reg128#11,r1=q10,> 26 # asm 1: vshr.u64 >t0=reg128#9,t0=q8,r2=reg128#8,r2=q7,r4=reg128#5,r4=q4,r0=reg128#4,r0=q3,t0=reg128#9,t0=q8,> 26 # asm 1: vshr.u64 >t3=reg128#14,t3=q13,r0=reg128#4,r0=q3,x23=reg128#10,x23=q9,r3=reg128#6,r3=q5,> 26 # asm 1: vshr.u64 >t1=reg128#8,t1=q7,x01=reg128#9,x01=q8,r1=reg128#4,r1=q3,> 26 # asm 1: vshr.u64 >t4=reg128#8,t4=q7,r3=reg128#6,r3=q5,x4=reg128#11,x4=q10,len=int32#4,len=r3,? len - 32 # asm 1: cmp bhi ._mainloop # qhasm: end: ._end: # qhasm: mem128[input_0] = x01;input_0+=16 # asm 1: vst1.8 {len=int32#1,len=r0,mask=reg128#1,#0xffffffff # asm 2: vmov.i64 >mask=q0,#0xffffffff vmov.i64 q0,#0xffffffff # qhasm: y01 aligned= mem128[input_2];input_2+=16 # asm 1: vld1.8 {>y01=reg128#2%bot->y01=reg128#2%top},[y01=d2->y01=d3},[_5y01=reg128#3,_5y01=q2,y23=reg128#4%bot->y23=reg128#4%top},[y23=d6->y23=d7},[_5y23=reg128#9,_5y23=q8,_5y4=reg128#11,_5y4=q10,x01=reg128#12%bot->x01=reg128#12%top},[x01=d22->x01=d23},[_5y01=reg128#3,<_5y01=reg128#3,_5y01=q2,<_5y01=q2,x23=reg128#13%bot->x23=reg128#13%top},[x23=d24->x23=d25},[_5y23=reg128#9,<_5y23=reg128#9,_5y23=q8,<_5y23=q8,_5y4=reg128#11,<_5y4=reg128#11,_5y4=q10,<_5y4=q10,c01=reg128#14%bot->c01=reg128#14%top},[c01=d26->c01=d27},[x01=reg128#12,x01=q11,c23=reg128#14%bot->c23=reg128#14%top},[c23=d26->c23=d27},[x23=reg128#13,x23=q12,>=6 # asm 1: vshr.u64 >mask=reg128#1,mask=q0,x4=reg128#14,x4=q13,r0=reg128#15,r0=q14,r1=reg128#3,r1=q2,r2=reg128#16,r2=q15,r3=reg128#9,r3=q8,r4=reg128#10,r4=q9,> 26 # asm 1: vshr.u64 >t1=reg128#2,t1=q1,r0=reg128#4,r0=q3,r1=reg128#2,r1=q1,> 26 # asm 1: vshr.u64 >t4=reg128#3,t4=q2,r3=reg128#9,r3=q8,r4=reg128#3,r4=q2,> 26 # asm 1: vshr.u64 >t2=reg128#10,t2=q9,r1=reg128#2,r1=q1,> 26 # asm 1: vshr.u64 >t0=reg128#11,t0=q10,r2=reg128#10,r2=q9,r4=reg128#3,r4=q2,r0=reg128#4,r0=q3,t0=reg128#11,t0=q10,> 26 # asm 1: vshr.u64 >t3=reg128#12,t3=q11,r0=reg128#4,r0=q3,x23=reg128#10,x23=q9,r3=reg128#9,r3=q8,> 26 # asm 1: vshr.u64 >t1=reg128#11,t1=q10,x01=reg128#4,x01=q3,r1=reg128#2,r1=q1,> 26 # asm 1: vshr.u64 >t4=reg128#11,t4=q10,r3=reg128#1,r3=q0,x4=reg128#3,x4=q2, // Raw AES functions. // AES_MAXNR is the maximum number of AES rounds. #define AES_MAXNR 14 // aes_key_st should be an opaque type, but EVP requires that the size be // known. struct aes_key_st { uint32_t rd_key[4 * (AES_MAXNR + 1)]; unsigned rounds; }; typedef struct aes_key_st AES_KEY; #endif // OPENSSL_HEADER_AES_H ring-0.17.14/include/ring-core/asm_base.h000064400000000000000000000172241046102023000161760ustar 00000000000000// Copyright 2023 The BoringSSL Authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef OPENSSL_HEADER_ASM_BASE_H #define OPENSSL_HEADER_ASM_BASE_H #include // This header contains symbols and common sections used by assembly files. It // is included as a public header to simplify the build, but is not intended for // external use. // // Every assembly file must include this header. Some linker features require // all object files to be tagged with some section metadata. This header file, // when included in assembly, adds that metadata. It also makes defines like // |OPENSSL_X86_64| available and includes the prefixing macros. // // Including this header in an assembly file imples: // // - The file does not require an executable stack. // // - The file, on aarch64, uses the macros defined below to be compatible with // BTI and PAC. // // - The file, on x86_64, requires the program to be compatible with Intel IBT // and SHSTK #if defined(__ASSEMBLER__) #include #if defined(__ELF__) // Every ELF object file, even empty ones, should disable executable stacks. See // https://www.airs.com/blog/archives/518. .pushsection .note.GNU-stack, "", %progbits .popsection #endif #if defined(__CET__) && defined(OPENSSL_X86_64) // Clang and GCC define __CET__ and provide when they support Intel's // Indirect Branch Tracking. // https://lpc.events/event/7/contributions/729/attachments/496/903/CET-LPC-2020.pdf // // cet.h defines _CET_ENDBR which is used to mark function entry points for IBT. // and adds the assembly marker. The value of _CET_ENDBR is made dependant on if // '-fcf-protection' is passed to the compiler. _CET_ENDBR is only required when // the function is the target of an indirect jump, but BoringSSL chooses to mark // all assembly entry points because it is easier, and allows BoringSSL's ABI // tester to call the assembly entry points via an indirect jump. #include #else #define _CET_ENDBR #endif #if defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64) // We require the ARM assembler provide |__ARM_ARCH| from Arm C Language // Extensions (ACLE). This is supported in GCC 4.8+ and Clang 3.2+. MSVC does // not implement ACLE, but we require Clang's assembler on Windows. #if !defined(__ARM_ARCH) #error "ARM assembler must define __ARM_ARCH" #endif // Even when building for 32-bit ARM, support for aarch64 crypto instructions // will be included. // // TODO(davidben): Remove this and the corresponding ifdefs? This is only // defined because some OpenSSL assembly files would allow disabling the NEON // code entirely. I think we'd prefer to do that by lifting the dispatch to C // anyway. #define __ARM_MAX_ARCH__ 8 // Support macros for // - Armv8.3-A Pointer Authentication and // - Armv8.5-A Branch Target Identification // features which require emitting a .note.gnu.property section with the // appropriate architecture-dependent feature bits set. // // |AARCH64_SIGN_LINK_REGISTER| and |AARCH64_VALIDATE_LINK_REGISTER| expand to // PACIxSP and AUTIxSP, respectively. |AARCH64_SIGN_LINK_REGISTER| should be // used immediately before saving the LR register (x30) to the stack. // |AARCH64_VALIDATE_LINK_REGISTER| should be used immediately after restoring // it. 
Note |AARCH64_SIGN_LINK_REGISTER|'s modifications to LR must be undone // with |AARCH64_VALIDATE_LINK_REGISTER| before RET. The SP register must also // have the same value at the two points. For example: // // .global f // f: // AARCH64_SIGN_LINK_REGISTER // stp x29, x30, [sp, #-96]! // mov x29, sp // ... // ldp x29, x30, [sp], #96 // AARCH64_VALIDATE_LINK_REGISTER // ret // // |AARCH64_VALID_CALL_TARGET| expands to BTI 'c'. Either it, or // |AARCH64_SIGN_LINK_REGISTER|, must be used at every point that may be an // indirect call target. In particular, all symbols exported from a file must // begin with one of these macros. For example, a leaf function that does not // save LR can instead use |AARCH64_VALID_CALL_TARGET|: // // .globl return_zero // return_zero: // AARCH64_VALID_CALL_TARGET // mov x0, #0 // ret // // A non-leaf function which does not immediately save LR may need both macros // because |AARCH64_SIGN_LINK_REGISTER| appears late. For example, the function // may jump to an alternate implementation before setting up the stack: // // .globl with_early_jump // with_early_jump: // AARCH64_VALID_CALL_TARGET // cmp x0, #128 // b.lt .Lwith_early_jump_128 // AARCH64_SIGN_LINK_REGISTER // stp x29, x30, [sp, #-96]! // mov x29, sp // ... // ldp x29, x30, [sp], #96 // AARCH64_VALIDATE_LINK_REGISTER // ret // // .Lwith_early_jump_128: // ... // ret // // These annotations are only required with indirect calls. Private symbols that // are only the target of direct calls do not require annotations. Also note // that |AARCH64_VALID_CALL_TARGET| is only valid for indirect calls (BLR), not // indirect jumps (BR). Indirect jumps in assembly are currently not supported // and would require a macro for BTI 'j'. // // Although not necessary, it is safe to use these macros in 32-bit ARM // assembly. This may be used to simplify dual 32-bit and 64-bit files. 
// // References: // - "ELF for the Arm® 64-bit Architecture" // https://github.com/ARM-software/abi-aa/blob/master/aaelf64/aaelf64.rst // - "Providing protection for complex software" // https://developer.arm.com/architectures/learn-the-architecture/providing-protection-for-complex-software #if defined(__ARM_FEATURE_BTI_DEFAULT) && __ARM_FEATURE_BTI_DEFAULT == 1 #define GNU_PROPERTY_AARCH64_BTI (1 << 0) // Has Branch Target Identification #define AARCH64_VALID_CALL_TARGET hint #34 // BTI 'c' #else #define GNU_PROPERTY_AARCH64_BTI 0 // No Branch Target Identification #define AARCH64_VALID_CALL_TARGET #endif #if defined(__ARM_FEATURE_PAC_DEFAULT) && \ (__ARM_FEATURE_PAC_DEFAULT & 1) == 1 // Signed with A-key #define GNU_PROPERTY_AARCH64_POINTER_AUTH \ (1 << 1) // Has Pointer Authentication #define AARCH64_SIGN_LINK_REGISTER hint #25 // PACIASP #define AARCH64_VALIDATE_LINK_REGISTER hint #29 // AUTIASP #elif defined(__ARM_FEATURE_PAC_DEFAULT) && \ (__ARM_FEATURE_PAC_DEFAULT & 2) == 2 // Signed with B-key #define GNU_PROPERTY_AARCH64_POINTER_AUTH \ (1 << 1) // Has Pointer Authentication #define AARCH64_SIGN_LINK_REGISTER hint #27 // PACIBSP #define AARCH64_VALIDATE_LINK_REGISTER hint #31 // AUTIBSP #else #define GNU_PROPERTY_AARCH64_POINTER_AUTH 0 // No Pointer Authentication #if GNU_PROPERTY_AARCH64_BTI != 0 #define AARCH64_SIGN_LINK_REGISTER AARCH64_VALID_CALL_TARGET #else #define AARCH64_SIGN_LINK_REGISTER #endif #define AARCH64_VALIDATE_LINK_REGISTER #endif #if GNU_PROPERTY_AARCH64_POINTER_AUTH != 0 || GNU_PROPERTY_AARCH64_BTI != 0 .pushsection .note.gnu.property, "a"; .balign 8; .long 4; .long 0x10; .long 0x5; .asciz "GNU"; .long 0xc0000000; /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */ .long 4; .long (GNU_PROPERTY_AARCH64_POINTER_AUTH | GNU_PROPERTY_AARCH64_BTI); .long 0; .popsection; #endif #endif // ARM || AARCH64 #endif // __ASSEMBLER__ #endif // OPENSSL_HEADER_ASM_BASE_H ring-0.17.14/include/ring-core/base.h000064400000000000000000000042771046102023000153420ustar 00000000000000// Copyright 2001-2016 The OpenSSL Project Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef OPENSSL_HEADER_BASE_H #define OPENSSL_HEADER_BASE_H // This file should be the first included by all BoringSSL headers. #if defined(_MSC_VER) && !defined(__clang__) #pragma warning(push, 3) #endif #include #include #if defined(_MSC_VER) && !defined(__clang__) #pragma warning(pop) #endif #if defined(__APPLE__) #include #endif #include // IWYU pragma: export #include #include #if defined(__APPLE__) // Note |TARGET_OS_MAC| is set for all Apple OS variants. |TARGET_OS_OSX| // targets macOS specifically. #if defined(TARGET_OS_OSX) && TARGET_OS_OSX #define OPENSSL_MACOS #endif #if defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE #define OPENSSL_IOS #endif #endif // *ring* doesn't support the `BORINGSSL_SHARED_LIBRARY` configuration, so // the default (usually "hidden") visibility is always used, even for exported // items. 
#define OPENSSL_EXPORT // `ring::c` would need to be customized on any platform where these assertions // fail. Keep in sync with `ring::c`. OPENSSL_STATIC_ASSERT(sizeof(int32_t) == sizeof(int), "int isn't 32 bits."); OPENSSL_STATIC_ASSERT(sizeof(uint32_t) == sizeof(unsigned int), "unsigned int isn't 32 bits."); OPENSSL_STATIC_ASSERT(sizeof(size_t) == sizeof(uintptr_t), "uintptr_t and size_t differ."); OPENSSL_STATIC_ASSERT(sizeof(size_t) <= sizeof(uint64_t), "size_t is larger than uint64_t."); OPENSSL_STATIC_ASSERT(sizeof(size_t) >= sizeof(uint32_t), "size_t is smaller than uint32_t."); #endif // OPENSSL_HEADER_BASE_H ring-0.17.14/include/ring-core/check.h000064400000000000000000000042331046102023000154750ustar 00000000000000// Copyright 2020 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #ifndef RING_CHECK_H #define RING_CHECK_H // |debug_assert_nonsecret| is like |assert| and should be used (only) when the // assertion does not have any potential to leak a secret. |NDEBUG| controls this // exactly like |assert|. It is emulated when there is no assert.h to make // cross-building easier. // // When reviewing uses of |debug_assert_nonsecret|, verify that the check // really does not have potential to leak a secret. #if !defined(RING_CORE_NOSTDLIBINC) # include # define debug_assert_nonsecret(x) assert(x) #else # if !defined(NDEBUG) # define debug_assert_nonsecret(x) ((x) ? ((void)0) : __builtin_trap()) # else # define debug_assert_nonsecret(x) ((void)0) # endif #endif // |dev_assert_secret| is like |assert| and should be used (only) when the // assertion operates on secret data in a way that has the potential to leak // the secret. |dev_assert_secret| can only be enabled by changing the |#if 0| // here to |#if 1| (or equivalent) when |NDEBUG| is not defined. This is not // controlled only through |NDEBUG| so that such checks do not leak into debug // builds that may make it into production use. // // When reviewing uses of |dev_assert_secret|, verify that the check really // does have the potential to leak a secret. #if 0 // DO NOT COMMIT CHANGES TO THIS LINE. # define dev_assert_secret debug_assert_nonsecret #else # define dev_assert_secret(x) ((void)0) #endif #endif // RING_CHECK_H ring-0.17.14/include/ring-core/mem.h000064400000000000000000000022041046102023000151720ustar 00000000000000// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. #ifndef OPENSSL_HEADER_MEM_H #define OPENSSL_HEADER_MEM_H #include // CRYPTO_memcmp returns zero iff the |len| bytes at |a| and |b| are equal. It // takes an amount of time dependent on |len|, but independent of the contents // of |a| and |b|. Unlike memcmp, it cannot be used to put elements into a // defined order as the return value when a != b is undefined, other than to be // non-zero. OPENSSL_EXPORT int CRYPTO_memcmp(const void *a, const void *b, size_t len); #endif // OPENSSL_HEADER_MEM_H ring-0.17.14/include/ring-core/target.h000064400000000000000000000062411046102023000157070ustar 00000000000000// Copyright 2023 The BoringSSL Authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef OPENSSL_HEADER_TARGET_H #define OPENSSL_HEADER_TARGET_H // Preprocessor symbols that define the target platform. // // This file may be included in C, C++, and assembler and must be compatible // with each environment. It is separated out only to share code between // and . Prefer to include those headers // instead. #if defined(__x86_64) || defined(_M_AMD64) || defined(_M_X64) #define OPENSSL_64_BIT #define OPENSSL_X86_64 #elif defined(__x86) || defined(__i386) || defined(__i386__) || defined(_M_IX86) #define OPENSSL_32_BIT #define OPENSSL_X86 #elif defined(__AARCH64EL__) || defined(_M_ARM64) #define OPENSSL_64_BIT #define OPENSSL_AARCH64 #elif defined(__ARMEL__) || defined(_M_ARM) #define OPENSSL_32_BIT #define OPENSSL_ARM // All of following architectures are only supported when `__BYTE_ORDER__` can be used to detect // endianness (in crypto/internal.h). #elif !defined(__BYTE_ORDER__) #error "Cannot determine endianness because __BYTE_ORDER__ is not defined" // Targets are assumed to be little-endian unless __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__. #elif !(defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) && \ !(defined(__ORDER_BIG_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)) #error "Unsupported endianness" #elif defined(__LP64__) #define OPENSSL_64_BIT #elif defined(__ILP32__) #define OPENSSL_32_BIT // Versions of GCC before 10.0 didn't define `__ILP32__` for all 32-bit targets. #elif defined(__MIPSEL__) || defined(__MIPSEB__) || defined(__PPC__) || defined(__powerpc__) || defined(__csky__) || defined(__XTENSA__) #define OPENSSL_32_BIT #else #error "Unknown target CPU" #endif #if defined(__APPLE__) #define OPENSSL_APPLE #endif #if defined(_WIN32) #define OPENSSL_WINDOWS #endif #if defined(__has_feature) #if __has_feature(address_sanitizer) #define OPENSSL_ASAN #endif #if __has_feature(thread_sanitizer) #define OPENSSL_TSAN #endif #if __has_feature(memory_sanitizer) #define OPENSSL_MSAN #define OPENSSL_ASM_INCOMPATIBLE #endif #if __has_feature(hwaddress_sanitizer) #define OPENSSL_HWASAN #endif #endif // Disable 32-bit Arm assembly on Apple platforms. The last iOS version that // supported 32-bit Arm was iOS 10. 
#if defined(OPENSSL_APPLE) && defined(OPENSSL_ARM) #define OPENSSL_ASM_INCOMPATIBLE #endif #if defined(OPENSSL_ASM_INCOMPATIBLE) #undef OPENSSL_ASM_INCOMPATIBLE #if !defined(OPENSSL_NO_ASM) #define OPENSSL_NO_ASM #endif #endif // OPENSSL_ASM_INCOMPATIBLE #if !defined(OPENSSL_X86_64) && !defined(OPENSSL_AARCH64) #define OPENSSL_SMALL #endif #endif // OPENSSL_HEADER_TARGET_H ring-0.17.14/include/ring-core/type_check.h000064400000000000000000000025151046102023000165370ustar 00000000000000// Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef OPENSSL_HEADER_TYPE_CHECK_H #define OPENSSL_HEADER_TYPE_CHECK_H #include #if defined(__cplusplus) || (defined(_MSC_VER) && !defined(__clang__)) // In C++ and non-clang MSVC, |static_assert| is a keyword. #define OPENSSL_STATIC_ASSERT(cond, msg) static_assert(cond, msg) #else // C11 defines the |_Static_assert| keyword and the |static_assert| macro in // assert.h. While the former is available at all versions in Clang and GCC, the // latter depends on libc and, in glibc, depends on being built in C11 mode. We // do not require this, for now, so use |_Static_assert| directly. #define OPENSSL_STATIC_ASSERT(cond, msg) _Static_assert(cond, msg) #endif #endif // OPENSSL_HEADER_TYPE_CHECK_H ring-0.17.14/pregenerated/aes-gcm-avx2-x86_64-elf.S000064400000000000000000000611471046102023000174310ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand.
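// ELF (System V) build of the AVX2/VPCLMULQDQ AES-GCM routines. It is assembled
// only when OPENSSL_X86_64 and __ELF__ are defined and OPENSSL_NO_ASM is not,
// and it defines gcm_init_vpclmulqdq_avx2, gcm_ghash_vpclmulqdq_avx2_1,
// aes_gcm_enc_update_vaes_avx2, and aes_gcm_dec_update_vaes_avx2, along with the
// byte-swap mask, GF(2^128) polynomial (.Lgfpoly), and counter-increment
// constants they read from .rodata.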
#include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) .section .rodata .align 16 .Lbswap_mask: .quad 0x08090a0b0c0d0e0f, 0x0001020304050607 .Lgfpoly: .quad 1, 0xc200000000000000 .Lgfpoly_and_internal_carrybit: .quad 1, 0xc200000000000001 .align 32 .Lctr_pattern: .quad 0, 0 .quad 1, 0 .Linc_2blocks: .quad 2, 0 .quad 2, 0 .text .globl gcm_init_vpclmulqdq_avx2 .hidden gcm_init_vpclmulqdq_avx2 .type gcm_init_vpclmulqdq_avx2,@function .align 32 gcm_init_vpclmulqdq_avx2: .cfi_startproc _CET_ENDBR vpshufd $0x4e,(%rsi),%xmm3 vpshufd $0xd3,%xmm3,%xmm0 vpsrad $31,%xmm0,%xmm0 vpaddq %xmm3,%xmm3,%xmm3 vpand .Lgfpoly_and_internal_carrybit(%rip),%xmm0,%xmm0 vpxor %xmm0,%xmm3,%xmm3 vbroadcasti128 .Lgfpoly(%rip),%ymm6 vpclmulqdq $0x00,%xmm3,%xmm3,%xmm0 vpclmulqdq $0x01,%xmm3,%xmm3,%xmm1 vpclmulqdq $0x10,%xmm3,%xmm3,%xmm2 vpxor %xmm2,%xmm1,%xmm1 vpclmulqdq $0x01,%xmm0,%xmm6,%xmm2 vpshufd $0x4e,%xmm0,%xmm0 vpxor %xmm0,%xmm1,%xmm1 vpxor %xmm2,%xmm1,%xmm1 vpclmulqdq $0x11,%xmm3,%xmm3,%xmm5 vpclmulqdq $0x01,%xmm1,%xmm6,%xmm0 vpshufd $0x4e,%xmm1,%xmm1 vpxor %xmm1,%xmm5,%xmm5 vpxor %xmm0,%xmm5,%xmm5 vinserti128 $1,%xmm3,%ymm5,%ymm3 vinserti128 $1,%xmm5,%ymm5,%ymm5 .byte 0xc4,0xe3,0x65,0x44,0xc5,0x00 .byte 0xc4,0xe3,0x65,0x44,0xcd,0x01 .byte 0xc4,0xe3,0x65,0x44,0xd5,0x10 vpxor %ymm2,%ymm1,%ymm1 .byte 0xc4,0xe3,0x4d,0x44,0xd0,0x01 vpshufd $0x4e,%ymm0,%ymm0 vpxor %ymm0,%ymm1,%ymm1 vpxor %ymm2,%ymm1,%ymm1 .byte 0xc4,0xe3,0x65,0x44,0xe5,0x11 .byte 0xc4,0xe3,0x4d,0x44,0xc1,0x01 vpshufd $0x4e,%ymm1,%ymm1 vpxor %ymm1,%ymm4,%ymm4 vpxor %ymm0,%ymm4,%ymm4 vmovdqu %ymm3,96(%rdi) vmovdqu %ymm4,64(%rdi) vpunpcklqdq %ymm3,%ymm4,%ymm0 vpunpckhqdq %ymm3,%ymm4,%ymm1 vpxor %ymm1,%ymm0,%ymm0 vmovdqu %ymm0,128+32(%rdi) .byte 0xc4,0xe3,0x5d,0x44,0xc5,0x00 .byte 0xc4,0xe3,0x5d,0x44,0xcd,0x01 .byte 0xc4,0xe3,0x5d,0x44,0xd5,0x10 vpxor %ymm2,%ymm1,%ymm1 .byte 0xc4,0xe3,0x4d,0x44,0xd0,0x01 vpshufd $0x4e,%ymm0,%ymm0 vpxor %ymm0,%ymm1,%ymm1 vpxor %ymm2,%ymm1,%ymm1 .byte 0xc4,0xe3,0x5d,0x44,0xdd,0x11 .byte 0xc4,0xe3,0x4d,0x44,0xc1,0x01 vpshufd $0x4e,%ymm1,%ymm1 vpxor %ymm1,%ymm3,%ymm3 vpxor %ymm0,%ymm3,%ymm3 .byte 0xc4,0xe3,0x65,0x44,0xc5,0x00 .byte 0xc4,0xe3,0x65,0x44,0xcd,0x01 .byte 0xc4,0xe3,0x65,0x44,0xd5,0x10 vpxor %ymm2,%ymm1,%ymm1 .byte 0xc4,0xe3,0x4d,0x44,0xd0,0x01 vpshufd $0x4e,%ymm0,%ymm0 vpxor %ymm0,%ymm1,%ymm1 vpxor %ymm2,%ymm1,%ymm1 .byte 0xc4,0xe3,0x65,0x44,0xe5,0x11 .byte 0xc4,0xe3,0x4d,0x44,0xc1,0x01 vpshufd $0x4e,%ymm1,%ymm1 vpxor %ymm1,%ymm4,%ymm4 vpxor %ymm0,%ymm4,%ymm4 vmovdqu %ymm3,32(%rdi) vmovdqu %ymm4,0(%rdi) vpunpcklqdq %ymm3,%ymm4,%ymm0 vpunpckhqdq %ymm3,%ymm4,%ymm1 vpxor %ymm1,%ymm0,%ymm0 vmovdqu %ymm0,128(%rdi) vzeroupper ret .cfi_endproc .size gcm_init_vpclmulqdq_avx2, . 
- gcm_init_vpclmulqdq_avx2 .globl gcm_ghash_vpclmulqdq_avx2_1 .hidden gcm_ghash_vpclmulqdq_avx2_1 .type gcm_ghash_vpclmulqdq_avx2_1,@function .align 32 gcm_ghash_vpclmulqdq_avx2_1: .cfi_startproc _CET_ENDBR vmovdqu .Lbswap_mask(%rip),%xmm6 vmovdqu .Lgfpoly(%rip),%xmm7 vmovdqu (%rdi),%xmm5 vpshufb %xmm6,%xmm5,%xmm5 .Lghash_lastblock: vmovdqu (%rdx),%xmm0 vpshufb %xmm6,%xmm0,%xmm0 vpxor %xmm0,%xmm5,%xmm5 vmovdqu 128-16(%rsi),%xmm0 vpclmulqdq $0x00,%xmm0,%xmm5,%xmm1 vpclmulqdq $0x01,%xmm0,%xmm5,%xmm2 vpclmulqdq $0x10,%xmm0,%xmm5,%xmm3 vpxor %xmm3,%xmm2,%xmm2 vpclmulqdq $0x01,%xmm1,%xmm7,%xmm3 vpshufd $0x4e,%xmm1,%xmm1 vpxor %xmm1,%xmm2,%xmm2 vpxor %xmm3,%xmm2,%xmm2 vpclmulqdq $0x11,%xmm0,%xmm5,%xmm5 vpclmulqdq $0x01,%xmm2,%xmm7,%xmm1 vpshufd $0x4e,%xmm2,%xmm2 vpxor %xmm2,%xmm5,%xmm5 vpxor %xmm1,%xmm5,%xmm5 .Lghash_done: vpshufb %xmm6,%xmm5,%xmm5 vmovdqu %xmm5,(%rdi) vzeroupper ret .cfi_endproc .size gcm_ghash_vpclmulqdq_avx2_1, . - gcm_ghash_vpclmulqdq_avx2_1 .globl aes_gcm_enc_update_vaes_avx2 .hidden aes_gcm_enc_update_vaes_avx2 .type aes_gcm_enc_update_vaes_avx2,@function .align 32 aes_gcm_enc_update_vaes_avx2: .cfi_startproc _CET_ENDBR pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-16 movq 16(%rsp),%r12 #ifdef BORINGSSL_DISPATCH_TEST .extern BORINGSSL_function_hit .hidden BORINGSSL_function_hit movb $1,BORINGSSL_function_hit+8(%rip) #endif vbroadcasti128 .Lbswap_mask(%rip),%ymm0 vmovdqu (%r12),%xmm1 vpshufb %xmm0,%xmm1,%xmm1 vbroadcasti128 (%r8),%ymm11 vpshufb %ymm0,%ymm11,%ymm11 movl 240(%rcx),%r10d leal -20(,%r10,4),%r10d leaq 96(%rcx,%r10,4),%r11 vbroadcasti128 (%rcx),%ymm9 vbroadcasti128 (%r11),%ymm10 vpaddd .Lctr_pattern(%rip),%ymm11,%ymm11 cmpq $127,%rdx jbe .Lcrypt_loop_4x_done__func1 vmovdqu 128(%r9),%ymm7 vmovdqu 128+32(%r9),%ymm8 vmovdqu .Linc_2blocks(%rip),%ymm2 vpshufb %ymm0,%ymm11,%ymm12 vpaddd %ymm2,%ymm11,%ymm11 vpshufb %ymm0,%ymm11,%ymm13 vpaddd %ymm2,%ymm11,%ymm11 vpshufb %ymm0,%ymm11,%ymm14 vpaddd %ymm2,%ymm11,%ymm11 vpshufb %ymm0,%ymm11,%ymm15 vpaddd %ymm2,%ymm11,%ymm11 vpxor %ymm9,%ymm12,%ymm12 vpxor %ymm9,%ymm13,%ymm13 vpxor %ymm9,%ymm14,%ymm14 vpxor %ymm9,%ymm15,%ymm15 leaq 16(%rcx),%rax .Lvaesenc_loop_first_4_vecs__func1: vbroadcasti128 (%rax),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa addq $16,%rax cmpq %rax,%r11 jne .Lvaesenc_loop_first_4_vecs__func1 vpxor 0(%rdi),%ymm10,%ymm2 vpxor 32(%rdi),%ymm10,%ymm3 vpxor 64(%rdi),%ymm10,%ymm5 vpxor 96(%rdi),%ymm10,%ymm6 .byte 0xc4,0x62,0x1d,0xdd,0xe2 .byte 0xc4,0x62,0x15,0xdd,0xeb .byte 0xc4,0x62,0x0d,0xdd,0xf5 .byte 0xc4,0x62,0x05,0xdd,0xfe vmovdqu %ymm12,0(%rsi) vmovdqu %ymm13,32(%rsi) vmovdqu %ymm14,64(%rsi) vmovdqu %ymm15,96(%rsi) subq $-128,%rdi addq $-128,%rdx cmpq $127,%rdx jbe .Lghash_last_ciphertext_4x__func1 .align 16 .Lcrypt_loop_4x__func1: vmovdqu .Linc_2blocks(%rip),%ymm2 vpshufb %ymm0,%ymm11,%ymm12 vpaddd %ymm2,%ymm11,%ymm11 vpshufb %ymm0,%ymm11,%ymm13 vpaddd %ymm2,%ymm11,%ymm11 vpshufb %ymm0,%ymm11,%ymm14 vpaddd %ymm2,%ymm11,%ymm11 vpshufb %ymm0,%ymm11,%ymm15 vpaddd %ymm2,%ymm11,%ymm11 vpxor %ymm9,%ymm12,%ymm12 vpxor %ymm9,%ymm13,%ymm13 vpxor %ymm9,%ymm14,%ymm14 vpxor %ymm9,%ymm15,%ymm15 cmpl $24,%r10d jl .Laes128__func1 je .Laes192__func1 vbroadcasti128 -208(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vbroadcasti128 -192(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 
.byte 0xc4,0x62,0x05,0xdc,0xfa .Laes192__func1: vbroadcasti128 -176(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vbroadcasti128 -160(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa .Laes128__func1: prefetcht0 512(%rdi) prefetcht0 512+64(%rdi) vmovdqu 0(%rsi),%ymm3 vpshufb %ymm0,%ymm3,%ymm3 vmovdqu 0(%r9),%ymm4 vpxor %ymm1,%ymm3,%ymm3 .byte 0xc4,0xe3,0x65,0x44,0xec,0x00 .byte 0xc4,0xe3,0x65,0x44,0xcc,0x11 vpunpckhqdq %ymm3,%ymm3,%ymm2 vpxor %ymm3,%ymm2,%ymm2 .byte 0xc4,0xe3,0x6d,0x44,0xf7,0x00 vbroadcasti128 -144(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vbroadcasti128 -128(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vmovdqu 32(%rsi),%ymm3 vpshufb %ymm0,%ymm3,%ymm3 vmovdqu 32(%r9),%ymm4 .byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 vpxor %ymm2,%ymm5,%ymm5 .byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 vpxor %ymm2,%ymm1,%ymm1 vpunpckhqdq %ymm3,%ymm3,%ymm2 vpxor %ymm3,%ymm2,%ymm2 .byte 0xc4,0xe3,0x6d,0x44,0xd7,0x10 vpxor %ymm2,%ymm6,%ymm6 vbroadcasti128 -112(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vmovdqu 64(%rsi),%ymm3 vpshufb %ymm0,%ymm3,%ymm3 vmovdqu 64(%r9),%ymm4 vbroadcasti128 -96(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa .byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 vpxor %ymm2,%ymm5,%ymm5 .byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 vpxor %ymm2,%ymm1,%ymm1 vbroadcasti128 -80(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vpunpckhqdq %ymm3,%ymm3,%ymm2 vpxor %ymm3,%ymm2,%ymm2 .byte 0xc4,0xc3,0x6d,0x44,0xd0,0x00 vpxor %ymm2,%ymm6,%ymm6 vmovdqu 96(%rsi),%ymm3 vpshufb %ymm0,%ymm3,%ymm3 vbroadcasti128 -64(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vmovdqu 96(%r9),%ymm4 .byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 vpxor %ymm2,%ymm5,%ymm5 .byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 vpxor %ymm2,%ymm1,%ymm1 vpunpckhqdq %ymm3,%ymm3,%ymm2 vpxor %ymm3,%ymm2,%ymm2 .byte 0xc4,0xc3,0x6d,0x44,0xd0,0x10 vpxor %ymm2,%ymm6,%ymm6 vbroadcasti128 -48(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vpxor %ymm5,%ymm6,%ymm6 vpxor %ymm1,%ymm6,%ymm6 vbroadcasti128 .Lgfpoly(%rip),%ymm4 .byte 0xc4,0xe3,0x5d,0x44,0xd5,0x01 vpshufd $0x4e,%ymm5,%ymm5 vpxor %ymm5,%ymm6,%ymm6 vpxor %ymm2,%ymm6,%ymm6 vbroadcasti128 -32(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa .byte 0xc4,0xe3,0x5d,0x44,0xd6,0x01 vpshufd $0x4e,%ymm6,%ymm6 vpxor %ymm6,%ymm1,%ymm1 vpxor %ymm2,%ymm1,%ymm1 vbroadcasti128 -16(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vextracti128 $1,%ymm1,%xmm2 vpxor %xmm2,%xmm1,%xmm1 subq $-128,%rsi vpxor 0(%rdi),%ymm10,%ymm2 vpxor 32(%rdi),%ymm10,%ymm3 vpxor 64(%rdi),%ymm10,%ymm5 vpxor 96(%rdi),%ymm10,%ymm6 .byte 0xc4,0x62,0x1d,0xdd,0xe2 .byte 0xc4,0x62,0x15,0xdd,0xeb .byte 
0xc4,0x62,0x0d,0xdd,0xf5 .byte 0xc4,0x62,0x05,0xdd,0xfe vmovdqu %ymm12,0(%rsi) vmovdqu %ymm13,32(%rsi) vmovdqu %ymm14,64(%rsi) vmovdqu %ymm15,96(%rsi) subq $-128,%rdi addq $-128,%rdx cmpq $127,%rdx ja .Lcrypt_loop_4x__func1 .Lghash_last_ciphertext_4x__func1: vmovdqu 0(%rsi),%ymm3 vpshufb %ymm0,%ymm3,%ymm3 vmovdqu 0(%r9),%ymm4 vpxor %ymm1,%ymm3,%ymm3 .byte 0xc4,0xe3,0x65,0x44,0xec,0x00 .byte 0xc4,0xe3,0x65,0x44,0xcc,0x11 vpunpckhqdq %ymm3,%ymm3,%ymm2 vpxor %ymm3,%ymm2,%ymm2 .byte 0xc4,0xe3,0x6d,0x44,0xf7,0x00 vmovdqu 32(%rsi),%ymm3 vpshufb %ymm0,%ymm3,%ymm3 vmovdqu 32(%r9),%ymm4 .byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 vpxor %ymm2,%ymm5,%ymm5 .byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 vpxor %ymm2,%ymm1,%ymm1 vpunpckhqdq %ymm3,%ymm3,%ymm2 vpxor %ymm3,%ymm2,%ymm2 .byte 0xc4,0xe3,0x6d,0x44,0xd7,0x10 vpxor %ymm2,%ymm6,%ymm6 vmovdqu 64(%rsi),%ymm3 vpshufb %ymm0,%ymm3,%ymm3 vmovdqu 64(%r9),%ymm4 .byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 vpxor %ymm2,%ymm5,%ymm5 .byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 vpxor %ymm2,%ymm1,%ymm1 vpunpckhqdq %ymm3,%ymm3,%ymm2 vpxor %ymm3,%ymm2,%ymm2 .byte 0xc4,0xc3,0x6d,0x44,0xd0,0x00 vpxor %ymm2,%ymm6,%ymm6 vmovdqu 96(%rsi),%ymm3 vpshufb %ymm0,%ymm3,%ymm3 vmovdqu 96(%r9),%ymm4 .byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 vpxor %ymm2,%ymm5,%ymm5 .byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 vpxor %ymm2,%ymm1,%ymm1 vpunpckhqdq %ymm3,%ymm3,%ymm2 vpxor %ymm3,%ymm2,%ymm2 .byte 0xc4,0xc3,0x6d,0x44,0xd0,0x10 vpxor %ymm2,%ymm6,%ymm6 vpxor %ymm5,%ymm6,%ymm6 vpxor %ymm1,%ymm6,%ymm6 vbroadcasti128 .Lgfpoly(%rip),%ymm4 .byte 0xc4,0xe3,0x5d,0x44,0xd5,0x01 vpshufd $0x4e,%ymm5,%ymm5 vpxor %ymm5,%ymm6,%ymm6 vpxor %ymm2,%ymm6,%ymm6 .byte 0xc4,0xe3,0x5d,0x44,0xd6,0x01 vpshufd $0x4e,%ymm6,%ymm6 vpxor %ymm6,%ymm1,%ymm1 vpxor %ymm2,%ymm1,%ymm1 vextracti128 $1,%ymm1,%xmm2 vpxor %xmm2,%xmm1,%xmm1 subq $-128,%rsi .Lcrypt_loop_4x_done__func1: testq %rdx,%rdx jz .Ldone__func1 leaq 128(%r9),%r8 subq %rdx,%r8 vpxor %xmm5,%xmm5,%xmm5 vpxor %xmm6,%xmm6,%xmm6 vpxor %xmm7,%xmm7,%xmm7 cmpq $64,%rdx jb .Llessthan64bytes__func1 vpshufb %ymm0,%ymm11,%ymm12 vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 vpshufb %ymm0,%ymm11,%ymm13 vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 vpxor %ymm9,%ymm12,%ymm12 vpxor %ymm9,%ymm13,%ymm13 leaq 16(%rcx),%rax .Lvaesenc_loop_tail_1__func1: vbroadcasti128 (%rax),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea addq $16,%rax cmpq %rax,%r11 jne .Lvaesenc_loop_tail_1__func1 .byte 0xc4,0x42,0x1d,0xdd,0xe2 .byte 0xc4,0x42,0x15,0xdd,0xea vmovdqu 0(%rdi),%ymm2 vmovdqu 32(%rdi),%ymm3 vpxor %ymm2,%ymm12,%ymm12 vpxor %ymm3,%ymm13,%ymm13 vmovdqu %ymm12,0(%rsi) vmovdqu %ymm13,32(%rsi) vpshufb %ymm0,%ymm12,%ymm12 vpshufb %ymm0,%ymm13,%ymm13 vpxor %ymm1,%ymm12,%ymm12 vmovdqu (%r8),%ymm2 vmovdqu 32(%r8),%ymm3 .byte 0xc4,0xe3,0x1d,0x44,0xea,0x00 .byte 0xc4,0xe3,0x1d,0x44,0xf2,0x01 .byte 0xc4,0xe3,0x1d,0x44,0xe2,0x10 vpxor %ymm4,%ymm6,%ymm6 .byte 0xc4,0xe3,0x1d,0x44,0xfa,0x11 .byte 0xc4,0xe3,0x15,0x44,0xe3,0x00 vpxor %ymm4,%ymm5,%ymm5 .byte 0xc4,0xe3,0x15,0x44,0xe3,0x01 vpxor %ymm4,%ymm6,%ymm6 .byte 0xc4,0xe3,0x15,0x44,0xe3,0x10 vpxor %ymm4,%ymm6,%ymm6 .byte 0xc4,0xe3,0x15,0x44,0xe3,0x11 vpxor %ymm4,%ymm7,%ymm7 addq $64,%r8 addq $64,%rdi addq $64,%rsi subq $64,%rdx jz .Lreduce__func1 vpxor %xmm1,%xmm1,%xmm1 .Llessthan64bytes__func1: vpshufb %ymm0,%ymm11,%ymm12 vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 vpshufb %ymm0,%ymm11,%ymm13 vpxor %ymm9,%ymm12,%ymm12 vpxor %ymm9,%ymm13,%ymm13 leaq 16(%rcx),%rax .Lvaesenc_loop_tail_2__func1: vbroadcasti128 (%rax),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea addq 
$16,%rax cmpq %rax,%r11 jne .Lvaesenc_loop_tail_2__func1 .byte 0xc4,0x42,0x1d,0xdd,0xe2 .byte 0xc4,0x42,0x15,0xdd,0xea cmpq $32,%rdx jb .Lxor_one_block__func1 je .Lxor_two_blocks__func1 .Lxor_three_blocks__func1: vmovdqu 0(%rdi),%ymm2 vmovdqu 32(%rdi),%xmm3 vpxor %ymm2,%ymm12,%ymm12 vpxor %xmm3,%xmm13,%xmm13 vmovdqu %ymm12,0(%rsi) vmovdqu %xmm13,32(%rsi) vpshufb %ymm0,%ymm12,%ymm12 vpshufb %xmm0,%xmm13,%xmm13 vpxor %ymm1,%ymm12,%ymm12 vmovdqu (%r8),%ymm2 vmovdqu 32(%r8),%xmm3 vpclmulqdq $0x00,%xmm3,%xmm13,%xmm4 vpxor %ymm4,%ymm5,%ymm5 vpclmulqdq $0x01,%xmm3,%xmm13,%xmm4 vpxor %ymm4,%ymm6,%ymm6 vpclmulqdq $0x10,%xmm3,%xmm13,%xmm4 vpxor %ymm4,%ymm6,%ymm6 vpclmulqdq $0x11,%xmm3,%xmm13,%xmm4 vpxor %ymm4,%ymm7,%ymm7 jmp .Lghash_mul_one_vec_unreduced__func1 .Lxor_two_blocks__func1: vmovdqu (%rdi),%ymm2 vpxor %ymm2,%ymm12,%ymm12 vmovdqu %ymm12,(%rsi) vpshufb %ymm0,%ymm12,%ymm12 vpxor %ymm1,%ymm12,%ymm12 vmovdqu (%r8),%ymm2 jmp .Lghash_mul_one_vec_unreduced__func1 .Lxor_one_block__func1: vmovdqu (%rdi),%xmm2 vpxor %xmm2,%xmm12,%xmm12 vmovdqu %xmm12,(%rsi) vpshufb %xmm0,%xmm12,%xmm12 vpxor %xmm1,%xmm12,%xmm12 vmovdqu (%r8),%xmm2 .Lghash_mul_one_vec_unreduced__func1: .byte 0xc4,0xe3,0x1d,0x44,0xe2,0x00 vpxor %ymm4,%ymm5,%ymm5 .byte 0xc4,0xe3,0x1d,0x44,0xe2,0x01 vpxor %ymm4,%ymm6,%ymm6 .byte 0xc4,0xe3,0x1d,0x44,0xe2,0x10 vpxor %ymm4,%ymm6,%ymm6 .byte 0xc4,0xe3,0x1d,0x44,0xe2,0x11 vpxor %ymm4,%ymm7,%ymm7 .Lreduce__func1: vbroadcasti128 .Lgfpoly(%rip),%ymm2 .byte 0xc4,0xe3,0x6d,0x44,0xdd,0x01 vpshufd $0x4e,%ymm5,%ymm5 vpxor %ymm5,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 .byte 0xc4,0xe3,0x6d,0x44,0xde,0x01 vpshufd $0x4e,%ymm6,%ymm6 vpxor %ymm6,%ymm7,%ymm7 vpxor %ymm3,%ymm7,%ymm7 vextracti128 $1,%ymm7,%xmm1 vpxor %xmm7,%xmm1,%xmm1 .Ldone__func1: vpshufb %xmm0,%xmm1,%xmm1 vmovdqu %xmm1,(%r12) vzeroupper popq %r12 .cfi_adjust_cfa_offset -8 .cfi_restore %r12 ret .cfi_endproc .size aes_gcm_enc_update_vaes_avx2, . 
- aes_gcm_enc_update_vaes_avx2 .globl aes_gcm_dec_update_vaes_avx2 .hidden aes_gcm_dec_update_vaes_avx2 .type aes_gcm_dec_update_vaes_avx2,@function .align 32 aes_gcm_dec_update_vaes_avx2: .cfi_startproc _CET_ENDBR pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-16 movq 16(%rsp),%r12 vbroadcasti128 .Lbswap_mask(%rip),%ymm0 vmovdqu (%r12),%xmm1 vpshufb %xmm0,%xmm1,%xmm1 vbroadcasti128 (%r8),%ymm11 vpshufb %ymm0,%ymm11,%ymm11 movl 240(%rcx),%r10d leal -20(,%r10,4),%r10d leaq 96(%rcx,%r10,4),%r11 vbroadcasti128 (%rcx),%ymm9 vbroadcasti128 (%r11),%ymm10 vpaddd .Lctr_pattern(%rip),%ymm11,%ymm11 cmpq $127,%rdx jbe .Lcrypt_loop_4x_done__func2 vmovdqu 128(%r9),%ymm7 vmovdqu 128+32(%r9),%ymm8 .align 16 .Lcrypt_loop_4x__func2: vmovdqu .Linc_2blocks(%rip),%ymm2 vpshufb %ymm0,%ymm11,%ymm12 vpaddd %ymm2,%ymm11,%ymm11 vpshufb %ymm0,%ymm11,%ymm13 vpaddd %ymm2,%ymm11,%ymm11 vpshufb %ymm0,%ymm11,%ymm14 vpaddd %ymm2,%ymm11,%ymm11 vpshufb %ymm0,%ymm11,%ymm15 vpaddd %ymm2,%ymm11,%ymm11 vpxor %ymm9,%ymm12,%ymm12 vpxor %ymm9,%ymm13,%ymm13 vpxor %ymm9,%ymm14,%ymm14 vpxor %ymm9,%ymm15,%ymm15 cmpl $24,%r10d jl .Laes128__func2 je .Laes192__func2 vbroadcasti128 -208(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vbroadcasti128 -192(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa .Laes192__func2: vbroadcasti128 -176(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vbroadcasti128 -160(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa .Laes128__func2: prefetcht0 512(%rdi) prefetcht0 512+64(%rdi) vmovdqu 0(%rdi),%ymm3 vpshufb %ymm0,%ymm3,%ymm3 vmovdqu 0(%r9),%ymm4 vpxor %ymm1,%ymm3,%ymm3 .byte 0xc4,0xe3,0x65,0x44,0xec,0x00 .byte 0xc4,0xe3,0x65,0x44,0xcc,0x11 vpunpckhqdq %ymm3,%ymm3,%ymm2 vpxor %ymm3,%ymm2,%ymm2 .byte 0xc4,0xe3,0x6d,0x44,0xf7,0x00 vbroadcasti128 -144(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vbroadcasti128 -128(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vmovdqu 32(%rdi),%ymm3 vpshufb %ymm0,%ymm3,%ymm3 vmovdqu 32(%r9),%ymm4 .byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 vpxor %ymm2,%ymm5,%ymm5 .byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 vpxor %ymm2,%ymm1,%ymm1 vpunpckhqdq %ymm3,%ymm3,%ymm2 vpxor %ymm3,%ymm2,%ymm2 .byte 0xc4,0xe3,0x6d,0x44,0xd7,0x10 vpxor %ymm2,%ymm6,%ymm6 vbroadcasti128 -112(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vmovdqu 64(%rdi),%ymm3 vpshufb %ymm0,%ymm3,%ymm3 vmovdqu 64(%r9),%ymm4 vbroadcasti128 -96(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa .byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 vpxor %ymm2,%ymm5,%ymm5 .byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 vpxor %ymm2,%ymm1,%ymm1 vbroadcasti128 -80(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vpunpckhqdq %ymm3,%ymm3,%ymm2 vpxor %ymm3,%ymm2,%ymm2 .byte 0xc4,0xc3,0x6d,0x44,0xd0,0x00 vpxor %ymm2,%ymm6,%ymm6 vmovdqu 96(%rdi),%ymm3 vpshufb %ymm0,%ymm3,%ymm3 vbroadcasti128 
-64(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vmovdqu 96(%r9),%ymm4 .byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 vpxor %ymm2,%ymm5,%ymm5 .byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 vpxor %ymm2,%ymm1,%ymm1 vpunpckhqdq %ymm3,%ymm3,%ymm2 vpxor %ymm3,%ymm2,%ymm2 .byte 0xc4,0xc3,0x6d,0x44,0xd0,0x10 vpxor %ymm2,%ymm6,%ymm6 vbroadcasti128 -48(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vpxor %ymm5,%ymm6,%ymm6 vpxor %ymm1,%ymm6,%ymm6 vbroadcasti128 .Lgfpoly(%rip),%ymm4 .byte 0xc4,0xe3,0x5d,0x44,0xd5,0x01 vpshufd $0x4e,%ymm5,%ymm5 vpxor %ymm5,%ymm6,%ymm6 vpxor %ymm2,%ymm6,%ymm6 vbroadcasti128 -32(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa .byte 0xc4,0xe3,0x5d,0x44,0xd6,0x01 vpshufd $0x4e,%ymm6,%ymm6 vpxor %ymm6,%ymm1,%ymm1 vpxor %ymm2,%ymm1,%ymm1 vbroadcasti128 -16(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vextracti128 $1,%ymm1,%xmm2 vpxor %xmm2,%xmm1,%xmm1 vpxor 0(%rdi),%ymm10,%ymm2 vpxor 32(%rdi),%ymm10,%ymm3 vpxor 64(%rdi),%ymm10,%ymm5 vpxor 96(%rdi),%ymm10,%ymm6 .byte 0xc4,0x62,0x1d,0xdd,0xe2 .byte 0xc4,0x62,0x15,0xdd,0xeb .byte 0xc4,0x62,0x0d,0xdd,0xf5 .byte 0xc4,0x62,0x05,0xdd,0xfe vmovdqu %ymm12,0(%rsi) vmovdqu %ymm13,32(%rsi) vmovdqu %ymm14,64(%rsi) vmovdqu %ymm15,96(%rsi) subq $-128,%rdi subq $-128,%rsi addq $-128,%rdx cmpq $127,%rdx ja .Lcrypt_loop_4x__func2 .Lcrypt_loop_4x_done__func2: testq %rdx,%rdx jz .Ldone__func2 leaq 128(%r9),%r8 subq %rdx,%r8 vpxor %xmm5,%xmm5,%xmm5 vpxor %xmm6,%xmm6,%xmm6 vpxor %xmm7,%xmm7,%xmm7 cmpq $64,%rdx jb .Llessthan64bytes__func2 vpshufb %ymm0,%ymm11,%ymm12 vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 vpshufb %ymm0,%ymm11,%ymm13 vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 vpxor %ymm9,%ymm12,%ymm12 vpxor %ymm9,%ymm13,%ymm13 leaq 16(%rcx),%rax .Lvaesenc_loop_tail_1__func2: vbroadcasti128 (%rax),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea addq $16,%rax cmpq %rax,%r11 jne .Lvaesenc_loop_tail_1__func2 .byte 0xc4,0x42,0x1d,0xdd,0xe2 .byte 0xc4,0x42,0x15,0xdd,0xea vmovdqu 0(%rdi),%ymm2 vmovdqu 32(%rdi),%ymm3 vpxor %ymm2,%ymm12,%ymm12 vpxor %ymm3,%ymm13,%ymm13 vmovdqu %ymm12,0(%rsi) vmovdqu %ymm13,32(%rsi) vpshufb %ymm0,%ymm2,%ymm12 vpshufb %ymm0,%ymm3,%ymm13 vpxor %ymm1,%ymm12,%ymm12 vmovdqu (%r8),%ymm2 vmovdqu 32(%r8),%ymm3 .byte 0xc4,0xe3,0x1d,0x44,0xea,0x00 .byte 0xc4,0xe3,0x1d,0x44,0xf2,0x01 .byte 0xc4,0xe3,0x1d,0x44,0xe2,0x10 vpxor %ymm4,%ymm6,%ymm6 .byte 0xc4,0xe3,0x1d,0x44,0xfa,0x11 .byte 0xc4,0xe3,0x15,0x44,0xe3,0x00 vpxor %ymm4,%ymm5,%ymm5 .byte 0xc4,0xe3,0x15,0x44,0xe3,0x01 vpxor %ymm4,%ymm6,%ymm6 .byte 0xc4,0xe3,0x15,0x44,0xe3,0x10 vpxor %ymm4,%ymm6,%ymm6 .byte 0xc4,0xe3,0x15,0x44,0xe3,0x11 vpxor %ymm4,%ymm7,%ymm7 addq $64,%r8 addq $64,%rdi addq $64,%rsi subq $64,%rdx jz .Lreduce__func2 vpxor %xmm1,%xmm1,%xmm1 .Llessthan64bytes__func2: vpshufb %ymm0,%ymm11,%ymm12 vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 vpshufb %ymm0,%ymm11,%ymm13 vpxor %ymm9,%ymm12,%ymm12 vpxor %ymm9,%ymm13,%ymm13 leaq 16(%rcx),%rax .Lvaesenc_loop_tail_2__func2: vbroadcasti128 (%rax),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea addq $16,%rax cmpq %rax,%r11 jne .Lvaesenc_loop_tail_2__func2 .byte 0xc4,0x42,0x1d,0xdd,0xe2 .byte 0xc4,0x42,0x15,0xdd,0xea cmpq $32,%rdx jb .Lxor_one_block__func2 je 
.Lxor_two_blocks__func2 .Lxor_three_blocks__func2: vmovdqu 0(%rdi),%ymm2 vmovdqu 32(%rdi),%xmm3 vpxor %ymm2,%ymm12,%ymm12 vpxor %xmm3,%xmm13,%xmm13 vmovdqu %ymm12,0(%rsi) vmovdqu %xmm13,32(%rsi) vpshufb %ymm0,%ymm2,%ymm12 vpshufb %xmm0,%xmm3,%xmm13 vpxor %ymm1,%ymm12,%ymm12 vmovdqu (%r8),%ymm2 vmovdqu 32(%r8),%xmm3 vpclmulqdq $0x00,%xmm3,%xmm13,%xmm4 vpxor %ymm4,%ymm5,%ymm5 vpclmulqdq $0x01,%xmm3,%xmm13,%xmm4 vpxor %ymm4,%ymm6,%ymm6 vpclmulqdq $0x10,%xmm3,%xmm13,%xmm4 vpxor %ymm4,%ymm6,%ymm6 vpclmulqdq $0x11,%xmm3,%xmm13,%xmm4 vpxor %ymm4,%ymm7,%ymm7 jmp .Lghash_mul_one_vec_unreduced__func2 .Lxor_two_blocks__func2: vmovdqu (%rdi),%ymm2 vpxor %ymm2,%ymm12,%ymm12 vmovdqu %ymm12,(%rsi) vpshufb %ymm0,%ymm2,%ymm12 vpxor %ymm1,%ymm12,%ymm12 vmovdqu (%r8),%ymm2 jmp .Lghash_mul_one_vec_unreduced__func2 .Lxor_one_block__func2: vmovdqu (%rdi),%xmm2 vpxor %xmm2,%xmm12,%xmm12 vmovdqu %xmm12,(%rsi) vpshufb %xmm0,%xmm2,%xmm12 vpxor %xmm1,%xmm12,%xmm12 vmovdqu (%r8),%xmm2 .Lghash_mul_one_vec_unreduced__func2: .byte 0xc4,0xe3,0x1d,0x44,0xe2,0x00 vpxor %ymm4,%ymm5,%ymm5 .byte 0xc4,0xe3,0x1d,0x44,0xe2,0x01 vpxor %ymm4,%ymm6,%ymm6 .byte 0xc4,0xe3,0x1d,0x44,0xe2,0x10 vpxor %ymm4,%ymm6,%ymm6 .byte 0xc4,0xe3,0x1d,0x44,0xe2,0x11 vpxor %ymm4,%ymm7,%ymm7 .Lreduce__func2: vbroadcasti128 .Lgfpoly(%rip),%ymm2 .byte 0xc4,0xe3,0x6d,0x44,0xdd,0x01 vpshufd $0x4e,%ymm5,%ymm5 vpxor %ymm5,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 .byte 0xc4,0xe3,0x6d,0x44,0xde,0x01 vpshufd $0x4e,%ymm6,%ymm6 vpxor %ymm6,%ymm7,%ymm7 vpxor %ymm3,%ymm7,%ymm7 vextracti128 $1,%ymm7,%xmm1 vpxor %xmm7,%xmm1,%xmm1 .Ldone__func2: vpshufb %xmm0,%xmm1,%xmm1 vmovdqu %xmm1,(%r12) vzeroupper popq %r12 .cfi_adjust_cfa_offset -8 .cfi_restore %r12 ret .cfi_endproc .size aes_gcm_dec_update_vaes_avx2, . - aes_gcm_dec_update_vaes_avx2 #endif ring-0.17.14/pregenerated/aes-gcm-avx2-x86_64-macosx.S000064400000000000000000000576271046102023000201650ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
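// Mach-O (Apple) build of the same AVX2 AES-GCM routines as the ELF file above.
// The symbols carry the Mach-O underscore prefix (_gcm_init_vpclmulqdq_avx2 and
// so on) and are marked .private_extern, and the constants live in the
// __DATA,__const section; it is assembled only when OPENSSL_X86_64 and
// __APPLE__ are defined and OPENSSL_NO_ASM is not.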
#include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) .section __DATA,__const .p2align 4 L$bswap_mask: .quad 0x08090a0b0c0d0e0f, 0x0001020304050607 L$gfpoly: .quad 1, 0xc200000000000000 L$gfpoly_and_internal_carrybit: .quad 1, 0xc200000000000001 .p2align 5 L$ctr_pattern: .quad 0, 0 .quad 1, 0 L$inc_2blocks: .quad 2, 0 .quad 2, 0 .text .globl _gcm_init_vpclmulqdq_avx2 .private_extern _gcm_init_vpclmulqdq_avx2 .p2align 5 _gcm_init_vpclmulqdq_avx2: _CET_ENDBR vpshufd $0x4e,(%rsi),%xmm3 vpshufd $0xd3,%xmm3,%xmm0 vpsrad $31,%xmm0,%xmm0 vpaddq %xmm3,%xmm3,%xmm3 vpand L$gfpoly_and_internal_carrybit(%rip),%xmm0,%xmm0 vpxor %xmm0,%xmm3,%xmm3 vbroadcasti128 L$gfpoly(%rip),%ymm6 vpclmulqdq $0x00,%xmm3,%xmm3,%xmm0 vpclmulqdq $0x01,%xmm3,%xmm3,%xmm1 vpclmulqdq $0x10,%xmm3,%xmm3,%xmm2 vpxor %xmm2,%xmm1,%xmm1 vpclmulqdq $0x01,%xmm0,%xmm6,%xmm2 vpshufd $0x4e,%xmm0,%xmm0 vpxor %xmm0,%xmm1,%xmm1 vpxor %xmm2,%xmm1,%xmm1 vpclmulqdq $0x11,%xmm3,%xmm3,%xmm5 vpclmulqdq $0x01,%xmm1,%xmm6,%xmm0 vpshufd $0x4e,%xmm1,%xmm1 vpxor %xmm1,%xmm5,%xmm5 vpxor %xmm0,%xmm5,%xmm5 vinserti128 $1,%xmm3,%ymm5,%ymm3 vinserti128 $1,%xmm5,%ymm5,%ymm5 .byte 0xc4,0xe3,0x65,0x44,0xc5,0x00 .byte 0xc4,0xe3,0x65,0x44,0xcd,0x01 .byte 0xc4,0xe3,0x65,0x44,0xd5,0x10 vpxor %ymm2,%ymm1,%ymm1 .byte 0xc4,0xe3,0x4d,0x44,0xd0,0x01 vpshufd $0x4e,%ymm0,%ymm0 vpxor %ymm0,%ymm1,%ymm1 vpxor %ymm2,%ymm1,%ymm1 .byte 0xc4,0xe3,0x65,0x44,0xe5,0x11 .byte 0xc4,0xe3,0x4d,0x44,0xc1,0x01 vpshufd $0x4e,%ymm1,%ymm1 vpxor %ymm1,%ymm4,%ymm4 vpxor %ymm0,%ymm4,%ymm4 vmovdqu %ymm3,96(%rdi) vmovdqu %ymm4,64(%rdi) vpunpcklqdq %ymm3,%ymm4,%ymm0 vpunpckhqdq %ymm3,%ymm4,%ymm1 vpxor %ymm1,%ymm0,%ymm0 vmovdqu %ymm0,128+32(%rdi) .byte 0xc4,0xe3,0x5d,0x44,0xc5,0x00 .byte 0xc4,0xe3,0x5d,0x44,0xcd,0x01 .byte 0xc4,0xe3,0x5d,0x44,0xd5,0x10 vpxor %ymm2,%ymm1,%ymm1 .byte 0xc4,0xe3,0x4d,0x44,0xd0,0x01 vpshufd $0x4e,%ymm0,%ymm0 vpxor %ymm0,%ymm1,%ymm1 vpxor %ymm2,%ymm1,%ymm1 .byte 0xc4,0xe3,0x5d,0x44,0xdd,0x11 .byte 0xc4,0xe3,0x4d,0x44,0xc1,0x01 vpshufd $0x4e,%ymm1,%ymm1 vpxor %ymm1,%ymm3,%ymm3 vpxor %ymm0,%ymm3,%ymm3 .byte 0xc4,0xe3,0x65,0x44,0xc5,0x00 .byte 0xc4,0xe3,0x65,0x44,0xcd,0x01 .byte 0xc4,0xe3,0x65,0x44,0xd5,0x10 vpxor %ymm2,%ymm1,%ymm1 .byte 0xc4,0xe3,0x4d,0x44,0xd0,0x01 vpshufd $0x4e,%ymm0,%ymm0 vpxor %ymm0,%ymm1,%ymm1 vpxor %ymm2,%ymm1,%ymm1 .byte 0xc4,0xe3,0x65,0x44,0xe5,0x11 .byte 0xc4,0xe3,0x4d,0x44,0xc1,0x01 vpshufd $0x4e,%ymm1,%ymm1 vpxor %ymm1,%ymm4,%ymm4 vpxor %ymm0,%ymm4,%ymm4 vmovdqu %ymm3,32(%rdi) vmovdqu %ymm4,0(%rdi) vpunpcklqdq %ymm3,%ymm4,%ymm0 vpunpckhqdq %ymm3,%ymm4,%ymm1 vpxor %ymm1,%ymm0,%ymm0 vmovdqu %ymm0,128(%rdi) vzeroupper ret .globl _gcm_ghash_vpclmulqdq_avx2_1 .private_extern _gcm_ghash_vpclmulqdq_avx2_1 .p2align 5 _gcm_ghash_vpclmulqdq_avx2_1: _CET_ENDBR vmovdqu L$bswap_mask(%rip),%xmm6 vmovdqu L$gfpoly(%rip),%xmm7 vmovdqu (%rdi),%xmm5 vpshufb %xmm6,%xmm5,%xmm5 L$ghash_lastblock: vmovdqu (%rdx),%xmm0 vpshufb %xmm6,%xmm0,%xmm0 vpxor %xmm0,%xmm5,%xmm5 vmovdqu 128-16(%rsi),%xmm0 vpclmulqdq $0x00,%xmm0,%xmm5,%xmm1 vpclmulqdq $0x01,%xmm0,%xmm5,%xmm2 vpclmulqdq $0x10,%xmm0,%xmm5,%xmm3 vpxor %xmm3,%xmm2,%xmm2 vpclmulqdq $0x01,%xmm1,%xmm7,%xmm3 vpshufd $0x4e,%xmm1,%xmm1 vpxor %xmm1,%xmm2,%xmm2 vpxor %xmm3,%xmm2,%xmm2 vpclmulqdq $0x11,%xmm0,%xmm5,%xmm5 vpclmulqdq $0x01,%xmm2,%xmm7,%xmm1 vpshufd $0x4e,%xmm2,%xmm2 vpxor %xmm2,%xmm5,%xmm5 vpxor %xmm1,%xmm5,%xmm5 L$ghash_done: vpshufb %xmm6,%xmm5,%xmm5 vmovdqu %xmm5,(%rdi) vzeroupper ret .globl _aes_gcm_enc_update_vaes_avx2 .private_extern _aes_gcm_enc_update_vaes_avx2 
.p2align 5 _aes_gcm_enc_update_vaes_avx2: _CET_ENDBR pushq %r12 movq 16(%rsp),%r12 #ifdef BORINGSSL_DISPATCH_TEST movb $1,_BORINGSSL_function_hit+8(%rip) #endif vbroadcasti128 L$bswap_mask(%rip),%ymm0 vmovdqu (%r12),%xmm1 vpshufb %xmm0,%xmm1,%xmm1 vbroadcasti128 (%r8),%ymm11 vpshufb %ymm0,%ymm11,%ymm11 movl 240(%rcx),%r10d leal -20(,%r10,4),%r10d leaq 96(%rcx,%r10,4),%r11 vbroadcasti128 (%rcx),%ymm9 vbroadcasti128 (%r11),%ymm10 vpaddd L$ctr_pattern(%rip),%ymm11,%ymm11 cmpq $127,%rdx jbe L$crypt_loop_4x_done__func1 vmovdqu 128(%r9),%ymm7 vmovdqu 128+32(%r9),%ymm8 vmovdqu L$inc_2blocks(%rip),%ymm2 vpshufb %ymm0,%ymm11,%ymm12 vpaddd %ymm2,%ymm11,%ymm11 vpshufb %ymm0,%ymm11,%ymm13 vpaddd %ymm2,%ymm11,%ymm11 vpshufb %ymm0,%ymm11,%ymm14 vpaddd %ymm2,%ymm11,%ymm11 vpshufb %ymm0,%ymm11,%ymm15 vpaddd %ymm2,%ymm11,%ymm11 vpxor %ymm9,%ymm12,%ymm12 vpxor %ymm9,%ymm13,%ymm13 vpxor %ymm9,%ymm14,%ymm14 vpxor %ymm9,%ymm15,%ymm15 leaq 16(%rcx),%rax L$vaesenc_loop_first_4_vecs__func1: vbroadcasti128 (%rax),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa addq $16,%rax cmpq %rax,%r11 jne L$vaesenc_loop_first_4_vecs__func1 vpxor 0(%rdi),%ymm10,%ymm2 vpxor 32(%rdi),%ymm10,%ymm3 vpxor 64(%rdi),%ymm10,%ymm5 vpxor 96(%rdi),%ymm10,%ymm6 .byte 0xc4,0x62,0x1d,0xdd,0xe2 .byte 0xc4,0x62,0x15,0xdd,0xeb .byte 0xc4,0x62,0x0d,0xdd,0xf5 .byte 0xc4,0x62,0x05,0xdd,0xfe vmovdqu %ymm12,0(%rsi) vmovdqu %ymm13,32(%rsi) vmovdqu %ymm14,64(%rsi) vmovdqu %ymm15,96(%rsi) subq $-128,%rdi addq $-128,%rdx cmpq $127,%rdx jbe L$ghash_last_ciphertext_4x__func1 .p2align 4 L$crypt_loop_4x__func1: vmovdqu L$inc_2blocks(%rip),%ymm2 vpshufb %ymm0,%ymm11,%ymm12 vpaddd %ymm2,%ymm11,%ymm11 vpshufb %ymm0,%ymm11,%ymm13 vpaddd %ymm2,%ymm11,%ymm11 vpshufb %ymm0,%ymm11,%ymm14 vpaddd %ymm2,%ymm11,%ymm11 vpshufb %ymm0,%ymm11,%ymm15 vpaddd %ymm2,%ymm11,%ymm11 vpxor %ymm9,%ymm12,%ymm12 vpxor %ymm9,%ymm13,%ymm13 vpxor %ymm9,%ymm14,%ymm14 vpxor %ymm9,%ymm15,%ymm15 cmpl $24,%r10d jl L$aes128__func1 je L$aes192__func1 vbroadcasti128 -208(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vbroadcasti128 -192(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa L$aes192__func1: vbroadcasti128 -176(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vbroadcasti128 -160(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa L$aes128__func1: prefetcht0 512(%rdi) prefetcht0 512+64(%rdi) vmovdqu 0(%rsi),%ymm3 vpshufb %ymm0,%ymm3,%ymm3 vmovdqu 0(%r9),%ymm4 vpxor %ymm1,%ymm3,%ymm3 .byte 0xc4,0xe3,0x65,0x44,0xec,0x00 .byte 0xc4,0xe3,0x65,0x44,0xcc,0x11 vpunpckhqdq %ymm3,%ymm3,%ymm2 vpxor %ymm3,%ymm2,%ymm2 .byte 0xc4,0xe3,0x6d,0x44,0xf7,0x00 vbroadcasti128 -144(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vbroadcasti128 -128(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vmovdqu 32(%rsi),%ymm3 vpshufb %ymm0,%ymm3,%ymm3 vmovdqu 32(%r9),%ymm4 .byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 vpxor %ymm2,%ymm5,%ymm5 .byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 vpxor %ymm2,%ymm1,%ymm1 vpunpckhqdq %ymm3,%ymm3,%ymm2 
vpxor %ymm3,%ymm2,%ymm2 .byte 0xc4,0xe3,0x6d,0x44,0xd7,0x10 vpxor %ymm2,%ymm6,%ymm6 vbroadcasti128 -112(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vmovdqu 64(%rsi),%ymm3 vpshufb %ymm0,%ymm3,%ymm3 vmovdqu 64(%r9),%ymm4 vbroadcasti128 -96(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa .byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 vpxor %ymm2,%ymm5,%ymm5 .byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 vpxor %ymm2,%ymm1,%ymm1 vbroadcasti128 -80(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vpunpckhqdq %ymm3,%ymm3,%ymm2 vpxor %ymm3,%ymm2,%ymm2 .byte 0xc4,0xc3,0x6d,0x44,0xd0,0x00 vpxor %ymm2,%ymm6,%ymm6 vmovdqu 96(%rsi),%ymm3 vpshufb %ymm0,%ymm3,%ymm3 vbroadcasti128 -64(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vmovdqu 96(%r9),%ymm4 .byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 vpxor %ymm2,%ymm5,%ymm5 .byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 vpxor %ymm2,%ymm1,%ymm1 vpunpckhqdq %ymm3,%ymm3,%ymm2 vpxor %ymm3,%ymm2,%ymm2 .byte 0xc4,0xc3,0x6d,0x44,0xd0,0x10 vpxor %ymm2,%ymm6,%ymm6 vbroadcasti128 -48(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vpxor %ymm5,%ymm6,%ymm6 vpxor %ymm1,%ymm6,%ymm6 vbroadcasti128 L$gfpoly(%rip),%ymm4 .byte 0xc4,0xe3,0x5d,0x44,0xd5,0x01 vpshufd $0x4e,%ymm5,%ymm5 vpxor %ymm5,%ymm6,%ymm6 vpxor %ymm2,%ymm6,%ymm6 vbroadcasti128 -32(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa .byte 0xc4,0xe3,0x5d,0x44,0xd6,0x01 vpshufd $0x4e,%ymm6,%ymm6 vpxor %ymm6,%ymm1,%ymm1 vpxor %ymm2,%ymm1,%ymm1 vbroadcasti128 -16(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vextracti128 $1,%ymm1,%xmm2 vpxor %xmm2,%xmm1,%xmm1 subq $-128,%rsi vpxor 0(%rdi),%ymm10,%ymm2 vpxor 32(%rdi),%ymm10,%ymm3 vpxor 64(%rdi),%ymm10,%ymm5 vpxor 96(%rdi),%ymm10,%ymm6 .byte 0xc4,0x62,0x1d,0xdd,0xe2 .byte 0xc4,0x62,0x15,0xdd,0xeb .byte 0xc4,0x62,0x0d,0xdd,0xf5 .byte 0xc4,0x62,0x05,0xdd,0xfe vmovdqu %ymm12,0(%rsi) vmovdqu %ymm13,32(%rsi) vmovdqu %ymm14,64(%rsi) vmovdqu %ymm15,96(%rsi) subq $-128,%rdi addq $-128,%rdx cmpq $127,%rdx ja L$crypt_loop_4x__func1 L$ghash_last_ciphertext_4x__func1: vmovdqu 0(%rsi),%ymm3 vpshufb %ymm0,%ymm3,%ymm3 vmovdqu 0(%r9),%ymm4 vpxor %ymm1,%ymm3,%ymm3 .byte 0xc4,0xe3,0x65,0x44,0xec,0x00 .byte 0xc4,0xe3,0x65,0x44,0xcc,0x11 vpunpckhqdq %ymm3,%ymm3,%ymm2 vpxor %ymm3,%ymm2,%ymm2 .byte 0xc4,0xe3,0x6d,0x44,0xf7,0x00 vmovdqu 32(%rsi),%ymm3 vpshufb %ymm0,%ymm3,%ymm3 vmovdqu 32(%r9),%ymm4 .byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 vpxor %ymm2,%ymm5,%ymm5 .byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 vpxor %ymm2,%ymm1,%ymm1 vpunpckhqdq %ymm3,%ymm3,%ymm2 vpxor %ymm3,%ymm2,%ymm2 .byte 0xc4,0xe3,0x6d,0x44,0xd7,0x10 vpxor %ymm2,%ymm6,%ymm6 vmovdqu 64(%rsi),%ymm3 vpshufb %ymm0,%ymm3,%ymm3 vmovdqu 64(%r9),%ymm4 .byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 vpxor %ymm2,%ymm5,%ymm5 .byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 vpxor %ymm2,%ymm1,%ymm1 vpunpckhqdq %ymm3,%ymm3,%ymm2 vpxor %ymm3,%ymm2,%ymm2 .byte 0xc4,0xc3,0x6d,0x44,0xd0,0x00 vpxor %ymm2,%ymm6,%ymm6 vmovdqu 96(%rsi),%ymm3 vpshufb %ymm0,%ymm3,%ymm3 vmovdqu 96(%r9),%ymm4 .byte 
0xc4,0xe3,0x65,0x44,0xd4,0x00 vpxor %ymm2,%ymm5,%ymm5 .byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 vpxor %ymm2,%ymm1,%ymm1 vpunpckhqdq %ymm3,%ymm3,%ymm2 vpxor %ymm3,%ymm2,%ymm2 .byte 0xc4,0xc3,0x6d,0x44,0xd0,0x10 vpxor %ymm2,%ymm6,%ymm6 vpxor %ymm5,%ymm6,%ymm6 vpxor %ymm1,%ymm6,%ymm6 vbroadcasti128 L$gfpoly(%rip),%ymm4 .byte 0xc4,0xe3,0x5d,0x44,0xd5,0x01 vpshufd $0x4e,%ymm5,%ymm5 vpxor %ymm5,%ymm6,%ymm6 vpxor %ymm2,%ymm6,%ymm6 .byte 0xc4,0xe3,0x5d,0x44,0xd6,0x01 vpshufd $0x4e,%ymm6,%ymm6 vpxor %ymm6,%ymm1,%ymm1 vpxor %ymm2,%ymm1,%ymm1 vextracti128 $1,%ymm1,%xmm2 vpxor %xmm2,%xmm1,%xmm1 subq $-128,%rsi L$crypt_loop_4x_done__func1: testq %rdx,%rdx jz L$done__func1 leaq 128(%r9),%r8 subq %rdx,%r8 vpxor %xmm5,%xmm5,%xmm5 vpxor %xmm6,%xmm6,%xmm6 vpxor %xmm7,%xmm7,%xmm7 cmpq $64,%rdx jb L$lessthan64bytes__func1 vpshufb %ymm0,%ymm11,%ymm12 vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11 vpshufb %ymm0,%ymm11,%ymm13 vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11 vpxor %ymm9,%ymm12,%ymm12 vpxor %ymm9,%ymm13,%ymm13 leaq 16(%rcx),%rax L$vaesenc_loop_tail_1__func1: vbroadcasti128 (%rax),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea addq $16,%rax cmpq %rax,%r11 jne L$vaesenc_loop_tail_1__func1 .byte 0xc4,0x42,0x1d,0xdd,0xe2 .byte 0xc4,0x42,0x15,0xdd,0xea vmovdqu 0(%rdi),%ymm2 vmovdqu 32(%rdi),%ymm3 vpxor %ymm2,%ymm12,%ymm12 vpxor %ymm3,%ymm13,%ymm13 vmovdqu %ymm12,0(%rsi) vmovdqu %ymm13,32(%rsi) vpshufb %ymm0,%ymm12,%ymm12 vpshufb %ymm0,%ymm13,%ymm13 vpxor %ymm1,%ymm12,%ymm12 vmovdqu (%r8),%ymm2 vmovdqu 32(%r8),%ymm3 .byte 0xc4,0xe3,0x1d,0x44,0xea,0x00 .byte 0xc4,0xe3,0x1d,0x44,0xf2,0x01 .byte 0xc4,0xe3,0x1d,0x44,0xe2,0x10 vpxor %ymm4,%ymm6,%ymm6 .byte 0xc4,0xe3,0x1d,0x44,0xfa,0x11 .byte 0xc4,0xe3,0x15,0x44,0xe3,0x00 vpxor %ymm4,%ymm5,%ymm5 .byte 0xc4,0xe3,0x15,0x44,0xe3,0x01 vpxor %ymm4,%ymm6,%ymm6 .byte 0xc4,0xe3,0x15,0x44,0xe3,0x10 vpxor %ymm4,%ymm6,%ymm6 .byte 0xc4,0xe3,0x15,0x44,0xe3,0x11 vpxor %ymm4,%ymm7,%ymm7 addq $64,%r8 addq $64,%rdi addq $64,%rsi subq $64,%rdx jz L$reduce__func1 vpxor %xmm1,%xmm1,%xmm1 L$lessthan64bytes__func1: vpshufb %ymm0,%ymm11,%ymm12 vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11 vpshufb %ymm0,%ymm11,%ymm13 vpxor %ymm9,%ymm12,%ymm12 vpxor %ymm9,%ymm13,%ymm13 leaq 16(%rcx),%rax L$vaesenc_loop_tail_2__func1: vbroadcasti128 (%rax),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea addq $16,%rax cmpq %rax,%r11 jne L$vaesenc_loop_tail_2__func1 .byte 0xc4,0x42,0x1d,0xdd,0xe2 .byte 0xc4,0x42,0x15,0xdd,0xea cmpq $32,%rdx jb L$xor_one_block__func1 je L$xor_two_blocks__func1 L$xor_three_blocks__func1: vmovdqu 0(%rdi),%ymm2 vmovdqu 32(%rdi),%xmm3 vpxor %ymm2,%ymm12,%ymm12 vpxor %xmm3,%xmm13,%xmm13 vmovdqu %ymm12,0(%rsi) vmovdqu %xmm13,32(%rsi) vpshufb %ymm0,%ymm12,%ymm12 vpshufb %xmm0,%xmm13,%xmm13 vpxor %ymm1,%ymm12,%ymm12 vmovdqu (%r8),%ymm2 vmovdqu 32(%r8),%xmm3 vpclmulqdq $0x00,%xmm3,%xmm13,%xmm4 vpxor %ymm4,%ymm5,%ymm5 vpclmulqdq $0x01,%xmm3,%xmm13,%xmm4 vpxor %ymm4,%ymm6,%ymm6 vpclmulqdq $0x10,%xmm3,%xmm13,%xmm4 vpxor %ymm4,%ymm6,%ymm6 vpclmulqdq $0x11,%xmm3,%xmm13,%xmm4 vpxor %ymm4,%ymm7,%ymm7 jmp L$ghash_mul_one_vec_unreduced__func1 L$xor_two_blocks__func1: vmovdqu (%rdi),%ymm2 vpxor %ymm2,%ymm12,%ymm12 vmovdqu %ymm12,(%rsi) vpshufb %ymm0,%ymm12,%ymm12 vpxor %ymm1,%ymm12,%ymm12 vmovdqu (%r8),%ymm2 jmp L$ghash_mul_one_vec_unreduced__func1 L$xor_one_block__func1: vmovdqu (%rdi),%xmm2 vpxor %xmm2,%xmm12,%xmm12 vmovdqu %xmm12,(%rsi) vpshufb %xmm0,%xmm12,%xmm12 vpxor %xmm1,%xmm12,%xmm12 vmovdqu (%r8),%xmm2 L$ghash_mul_one_vec_unreduced__func1: .byte 
0xc4,0xe3,0x1d,0x44,0xe2,0x00 vpxor %ymm4,%ymm5,%ymm5 .byte 0xc4,0xe3,0x1d,0x44,0xe2,0x01 vpxor %ymm4,%ymm6,%ymm6 .byte 0xc4,0xe3,0x1d,0x44,0xe2,0x10 vpxor %ymm4,%ymm6,%ymm6 .byte 0xc4,0xe3,0x1d,0x44,0xe2,0x11 vpxor %ymm4,%ymm7,%ymm7 L$reduce__func1: vbroadcasti128 L$gfpoly(%rip),%ymm2 .byte 0xc4,0xe3,0x6d,0x44,0xdd,0x01 vpshufd $0x4e,%ymm5,%ymm5 vpxor %ymm5,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 .byte 0xc4,0xe3,0x6d,0x44,0xde,0x01 vpshufd $0x4e,%ymm6,%ymm6 vpxor %ymm6,%ymm7,%ymm7 vpxor %ymm3,%ymm7,%ymm7 vextracti128 $1,%ymm7,%xmm1 vpxor %xmm7,%xmm1,%xmm1 L$done__func1: vpshufb %xmm0,%xmm1,%xmm1 vmovdqu %xmm1,(%r12) vzeroupper popq %r12 ret .globl _aes_gcm_dec_update_vaes_avx2 .private_extern _aes_gcm_dec_update_vaes_avx2 .p2align 5 _aes_gcm_dec_update_vaes_avx2: _CET_ENDBR pushq %r12 movq 16(%rsp),%r12 vbroadcasti128 L$bswap_mask(%rip),%ymm0 vmovdqu (%r12),%xmm1 vpshufb %xmm0,%xmm1,%xmm1 vbroadcasti128 (%r8),%ymm11 vpshufb %ymm0,%ymm11,%ymm11 movl 240(%rcx),%r10d leal -20(,%r10,4),%r10d leaq 96(%rcx,%r10,4),%r11 vbroadcasti128 (%rcx),%ymm9 vbroadcasti128 (%r11),%ymm10 vpaddd L$ctr_pattern(%rip),%ymm11,%ymm11 cmpq $127,%rdx jbe L$crypt_loop_4x_done__func2 vmovdqu 128(%r9),%ymm7 vmovdqu 128+32(%r9),%ymm8 .p2align 4 L$crypt_loop_4x__func2: vmovdqu L$inc_2blocks(%rip),%ymm2 vpshufb %ymm0,%ymm11,%ymm12 vpaddd %ymm2,%ymm11,%ymm11 vpshufb %ymm0,%ymm11,%ymm13 vpaddd %ymm2,%ymm11,%ymm11 vpshufb %ymm0,%ymm11,%ymm14 vpaddd %ymm2,%ymm11,%ymm11 vpshufb %ymm0,%ymm11,%ymm15 vpaddd %ymm2,%ymm11,%ymm11 vpxor %ymm9,%ymm12,%ymm12 vpxor %ymm9,%ymm13,%ymm13 vpxor %ymm9,%ymm14,%ymm14 vpxor %ymm9,%ymm15,%ymm15 cmpl $24,%r10d jl L$aes128__func2 je L$aes192__func2 vbroadcasti128 -208(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vbroadcasti128 -192(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa L$aes192__func2: vbroadcasti128 -176(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vbroadcasti128 -160(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa L$aes128__func2: prefetcht0 512(%rdi) prefetcht0 512+64(%rdi) vmovdqu 0(%rdi),%ymm3 vpshufb %ymm0,%ymm3,%ymm3 vmovdqu 0(%r9),%ymm4 vpxor %ymm1,%ymm3,%ymm3 .byte 0xc4,0xe3,0x65,0x44,0xec,0x00 .byte 0xc4,0xe3,0x65,0x44,0xcc,0x11 vpunpckhqdq %ymm3,%ymm3,%ymm2 vpxor %ymm3,%ymm2,%ymm2 .byte 0xc4,0xe3,0x6d,0x44,0xf7,0x00 vbroadcasti128 -144(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vbroadcasti128 -128(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vmovdqu 32(%rdi),%ymm3 vpshufb %ymm0,%ymm3,%ymm3 vmovdqu 32(%r9),%ymm4 .byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 vpxor %ymm2,%ymm5,%ymm5 .byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 vpxor %ymm2,%ymm1,%ymm1 vpunpckhqdq %ymm3,%ymm3,%ymm2 vpxor %ymm3,%ymm2,%ymm2 .byte 0xc4,0xe3,0x6d,0x44,0xd7,0x10 vpxor %ymm2,%ymm6,%ymm6 vbroadcasti128 -112(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vmovdqu 64(%rdi),%ymm3 vpshufb %ymm0,%ymm3,%ymm3 vmovdqu 64(%r9),%ymm4 vbroadcasti128 -96(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 
0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa .byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 vpxor %ymm2,%ymm5,%ymm5 .byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 vpxor %ymm2,%ymm1,%ymm1 vbroadcasti128 -80(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vpunpckhqdq %ymm3,%ymm3,%ymm2 vpxor %ymm3,%ymm2,%ymm2 .byte 0xc4,0xc3,0x6d,0x44,0xd0,0x00 vpxor %ymm2,%ymm6,%ymm6 vmovdqu 96(%rdi),%ymm3 vpshufb %ymm0,%ymm3,%ymm3 vbroadcasti128 -64(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vmovdqu 96(%r9),%ymm4 .byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 vpxor %ymm2,%ymm5,%ymm5 .byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 vpxor %ymm2,%ymm1,%ymm1 vpunpckhqdq %ymm3,%ymm3,%ymm2 vpxor %ymm3,%ymm2,%ymm2 .byte 0xc4,0xc3,0x6d,0x44,0xd0,0x10 vpxor %ymm2,%ymm6,%ymm6 vbroadcasti128 -48(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vpxor %ymm5,%ymm6,%ymm6 vpxor %ymm1,%ymm6,%ymm6 vbroadcasti128 L$gfpoly(%rip),%ymm4 .byte 0xc4,0xe3,0x5d,0x44,0xd5,0x01 vpshufd $0x4e,%ymm5,%ymm5 vpxor %ymm5,%ymm6,%ymm6 vpxor %ymm2,%ymm6,%ymm6 vbroadcasti128 -32(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa .byte 0xc4,0xe3,0x5d,0x44,0xd6,0x01 vpshufd $0x4e,%ymm6,%ymm6 vpxor %ymm6,%ymm1,%ymm1 vpxor %ymm2,%ymm1,%ymm1 vbroadcasti128 -16(%r11),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea .byte 0xc4,0x62,0x0d,0xdc,0xf2 .byte 0xc4,0x62,0x05,0xdc,0xfa vextracti128 $1,%ymm1,%xmm2 vpxor %xmm2,%xmm1,%xmm1 vpxor 0(%rdi),%ymm10,%ymm2 vpxor 32(%rdi),%ymm10,%ymm3 vpxor 64(%rdi),%ymm10,%ymm5 vpxor 96(%rdi),%ymm10,%ymm6 .byte 0xc4,0x62,0x1d,0xdd,0xe2 .byte 0xc4,0x62,0x15,0xdd,0xeb .byte 0xc4,0x62,0x0d,0xdd,0xf5 .byte 0xc4,0x62,0x05,0xdd,0xfe vmovdqu %ymm12,0(%rsi) vmovdqu %ymm13,32(%rsi) vmovdqu %ymm14,64(%rsi) vmovdqu %ymm15,96(%rsi) subq $-128,%rdi subq $-128,%rsi addq $-128,%rdx cmpq $127,%rdx ja L$crypt_loop_4x__func2 L$crypt_loop_4x_done__func2: testq %rdx,%rdx jz L$done__func2 leaq 128(%r9),%r8 subq %rdx,%r8 vpxor %xmm5,%xmm5,%xmm5 vpxor %xmm6,%xmm6,%xmm6 vpxor %xmm7,%xmm7,%xmm7 cmpq $64,%rdx jb L$lessthan64bytes__func2 vpshufb %ymm0,%ymm11,%ymm12 vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11 vpshufb %ymm0,%ymm11,%ymm13 vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11 vpxor %ymm9,%ymm12,%ymm12 vpxor %ymm9,%ymm13,%ymm13 leaq 16(%rcx),%rax L$vaesenc_loop_tail_1__func2: vbroadcasti128 (%rax),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea addq $16,%rax cmpq %rax,%r11 jne L$vaesenc_loop_tail_1__func2 .byte 0xc4,0x42,0x1d,0xdd,0xe2 .byte 0xc4,0x42,0x15,0xdd,0xea vmovdqu 0(%rdi),%ymm2 vmovdqu 32(%rdi),%ymm3 vpxor %ymm2,%ymm12,%ymm12 vpxor %ymm3,%ymm13,%ymm13 vmovdqu %ymm12,0(%rsi) vmovdqu %ymm13,32(%rsi) vpshufb %ymm0,%ymm2,%ymm12 vpshufb %ymm0,%ymm3,%ymm13 vpxor %ymm1,%ymm12,%ymm12 vmovdqu (%r8),%ymm2 vmovdqu 32(%r8),%ymm3 .byte 0xc4,0xe3,0x1d,0x44,0xea,0x00 .byte 0xc4,0xe3,0x1d,0x44,0xf2,0x01 .byte 0xc4,0xe3,0x1d,0x44,0xe2,0x10 vpxor %ymm4,%ymm6,%ymm6 .byte 0xc4,0xe3,0x1d,0x44,0xfa,0x11 .byte 0xc4,0xe3,0x15,0x44,0xe3,0x00 vpxor %ymm4,%ymm5,%ymm5 .byte 0xc4,0xe3,0x15,0x44,0xe3,0x01 vpxor %ymm4,%ymm6,%ymm6 .byte 0xc4,0xe3,0x15,0x44,0xe3,0x10 vpxor %ymm4,%ymm6,%ymm6 .byte 0xc4,0xe3,0x15,0x44,0xe3,0x11 vpxor %ymm4,%ymm7,%ymm7 addq $64,%r8 addq $64,%rdi addq $64,%rsi 
subq $64,%rdx jz L$reduce__func2 vpxor %xmm1,%xmm1,%xmm1 L$lessthan64bytes__func2: vpshufb %ymm0,%ymm11,%ymm12 vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11 vpshufb %ymm0,%ymm11,%ymm13 vpxor %ymm9,%ymm12,%ymm12 vpxor %ymm9,%ymm13,%ymm13 leaq 16(%rcx),%rax L$vaesenc_loop_tail_2__func2: vbroadcasti128 (%rax),%ymm2 .byte 0xc4,0x62,0x1d,0xdc,0xe2 .byte 0xc4,0x62,0x15,0xdc,0xea addq $16,%rax cmpq %rax,%r11 jne L$vaesenc_loop_tail_2__func2 .byte 0xc4,0x42,0x1d,0xdd,0xe2 .byte 0xc4,0x42,0x15,0xdd,0xea cmpq $32,%rdx jb L$xor_one_block__func2 je L$xor_two_blocks__func2 L$xor_three_blocks__func2: vmovdqu 0(%rdi),%ymm2 vmovdqu 32(%rdi),%xmm3 vpxor %ymm2,%ymm12,%ymm12 vpxor %xmm3,%xmm13,%xmm13 vmovdqu %ymm12,0(%rsi) vmovdqu %xmm13,32(%rsi) vpshufb %ymm0,%ymm2,%ymm12 vpshufb %xmm0,%xmm3,%xmm13 vpxor %ymm1,%ymm12,%ymm12 vmovdqu (%r8),%ymm2 vmovdqu 32(%r8),%xmm3 vpclmulqdq $0x00,%xmm3,%xmm13,%xmm4 vpxor %ymm4,%ymm5,%ymm5 vpclmulqdq $0x01,%xmm3,%xmm13,%xmm4 vpxor %ymm4,%ymm6,%ymm6 vpclmulqdq $0x10,%xmm3,%xmm13,%xmm4 vpxor %ymm4,%ymm6,%ymm6 vpclmulqdq $0x11,%xmm3,%xmm13,%xmm4 vpxor %ymm4,%ymm7,%ymm7 jmp L$ghash_mul_one_vec_unreduced__func2 L$xor_two_blocks__func2: vmovdqu (%rdi),%ymm2 vpxor %ymm2,%ymm12,%ymm12 vmovdqu %ymm12,(%rsi) vpshufb %ymm0,%ymm2,%ymm12 vpxor %ymm1,%ymm12,%ymm12 vmovdqu (%r8),%ymm2 jmp L$ghash_mul_one_vec_unreduced__func2 L$xor_one_block__func2: vmovdqu (%rdi),%xmm2 vpxor %xmm2,%xmm12,%xmm12 vmovdqu %xmm12,(%rsi) vpshufb %xmm0,%xmm2,%xmm12 vpxor %xmm1,%xmm12,%xmm12 vmovdqu (%r8),%xmm2 L$ghash_mul_one_vec_unreduced__func2: .byte 0xc4,0xe3,0x1d,0x44,0xe2,0x00 vpxor %ymm4,%ymm5,%ymm5 .byte 0xc4,0xe3,0x1d,0x44,0xe2,0x01 vpxor %ymm4,%ymm6,%ymm6 .byte 0xc4,0xe3,0x1d,0x44,0xe2,0x10 vpxor %ymm4,%ymm6,%ymm6 .byte 0xc4,0xe3,0x1d,0x44,0xe2,0x11 vpxor %ymm4,%ymm7,%ymm7 L$reduce__func2: vbroadcasti128 L$gfpoly(%rip),%ymm2 .byte 0xc4,0xe3,0x6d,0x44,0xdd,0x01 vpshufd $0x4e,%ymm5,%ymm5 vpxor %ymm5,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 .byte 0xc4,0xe3,0x6d,0x44,0xde,0x01 vpshufd $0x4e,%ymm6,%ymm6 vpxor %ymm6,%ymm7,%ymm7 vpxor %ymm3,%ymm7,%ymm7 vextracti128 $1,%ymm7,%xmm1 vpxor %xmm7,%xmm1,%xmm1 L$done__func2: vpshufb %xmm0,%xmm1,%xmm1 vmovdqu %xmm1,(%r12) vzeroupper popq %r12 ret #endif ring-0.17.14/pregenerated/aes-gcm-avx2-x86_64-nasm.asm000064400000000000000000001006731046102023000201750ustar 00000000000000; This file is generated from a similarly-named Perl script in the BoringSSL ; source tree. Do not edit by hand. 
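; NASM (win64) build of the same AVX2 AES-GCM routines. Symbols are renamed
; through ring_core_generated/prefix_symbols_nasm.inc, and because the Windows
; x64 calling convention treats xmm6-xmm15 (and rsi/rdi) as callee-saved and
; passes arguments beyond the fourth on the stack, each function carries an
; SEH-annotated prologue that saves the registers it clobbers and loads any
; stack-passed arguments (for example, mov rsi,QWORD[64+rsp]) before running
; the same instruction sequence as the ELF and Mach-O files.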
%ifidn __OUTPUT_FORMAT__, win64 default rel %define XMMWORD %define YMMWORD %define ZMMWORD %define _CET_ENDBR %include "ring_core_generated/prefix_symbols_nasm.inc" section .rdata rdata align=8 ALIGN 16 $L$bswap_mask: DQ 0x08090a0b0c0d0e0f,0x0001020304050607 $L$gfpoly: DQ 1,0xc200000000000000 $L$gfpoly_and_internal_carrybit: DQ 1,0xc200000000000001 ALIGN 32 $L$ctr_pattern: DQ 0,0 DQ 1,0 $L$inc_2blocks: DQ 2,0 DQ 2,0 section .text code align=64 global gcm_init_vpclmulqdq_avx2 ALIGN 32 gcm_init_vpclmulqdq_avx2: $L$SEH_begin_gcm_init_vpclmulqdq_avx2_1: _CET_ENDBR sub rsp,24 $L$SEH_prologue_gcm_init_vpclmulqdq_avx2_2: movdqa XMMWORD[rsp],xmm6 $L$SEH_prologue_gcm_init_vpclmulqdq_avx2_3: $L$SEH_endprologue_gcm_init_vpclmulqdq_avx2_4: vpshufd xmm3,XMMWORD[rdx],0x4e vpshufd xmm0,xmm3,0xd3 vpsrad xmm0,xmm0,31 vpaddq xmm3,xmm3,xmm3 vpand xmm0,xmm0,XMMWORD[$L$gfpoly_and_internal_carrybit] vpxor xmm3,xmm3,xmm0 vbroadcasti128 ymm6,XMMWORD[$L$gfpoly] vpclmulqdq xmm0,xmm3,xmm3,0x00 vpclmulqdq xmm1,xmm3,xmm3,0x01 vpclmulqdq xmm2,xmm3,xmm3,0x10 vpxor xmm1,xmm1,xmm2 vpclmulqdq xmm2,xmm6,xmm0,0x01 vpshufd xmm0,xmm0,0x4e vpxor xmm1,xmm1,xmm0 vpxor xmm1,xmm1,xmm2 vpclmulqdq xmm5,xmm3,xmm3,0x11 vpclmulqdq xmm0,xmm6,xmm1,0x01 vpshufd xmm1,xmm1,0x4e vpxor xmm5,xmm5,xmm1 vpxor xmm5,xmm5,xmm0 vinserti128 ymm3,ymm5,xmm3,1 vinserti128 ymm5,ymm5,xmm5,1 DB 0xc4,0xe3,0x65,0x44,0xc5,0x00 DB 0xc4,0xe3,0x65,0x44,0xcd,0x01 DB 0xc4,0xe3,0x65,0x44,0xd5,0x10 vpxor ymm1,ymm1,ymm2 DB 0xc4,0xe3,0x4d,0x44,0xd0,0x01 vpshufd ymm0,ymm0,0x4e vpxor ymm1,ymm1,ymm0 vpxor ymm1,ymm1,ymm2 DB 0xc4,0xe3,0x65,0x44,0xe5,0x11 DB 0xc4,0xe3,0x4d,0x44,0xc1,0x01 vpshufd ymm1,ymm1,0x4e vpxor ymm4,ymm4,ymm1 vpxor ymm4,ymm4,ymm0 vmovdqu YMMWORD[96+rcx],ymm3 vmovdqu YMMWORD[64+rcx],ymm4 vpunpcklqdq ymm0,ymm4,ymm3 vpunpckhqdq ymm1,ymm4,ymm3 vpxor ymm0,ymm0,ymm1 vmovdqu YMMWORD[(128+32)+rcx],ymm0 DB 0xc4,0xe3,0x5d,0x44,0xc5,0x00 DB 0xc4,0xe3,0x5d,0x44,0xcd,0x01 DB 0xc4,0xe3,0x5d,0x44,0xd5,0x10 vpxor ymm1,ymm1,ymm2 DB 0xc4,0xe3,0x4d,0x44,0xd0,0x01 vpshufd ymm0,ymm0,0x4e vpxor ymm1,ymm1,ymm0 vpxor ymm1,ymm1,ymm2 DB 0xc4,0xe3,0x5d,0x44,0xdd,0x11 DB 0xc4,0xe3,0x4d,0x44,0xc1,0x01 vpshufd ymm1,ymm1,0x4e vpxor ymm3,ymm3,ymm1 vpxor ymm3,ymm3,ymm0 DB 0xc4,0xe3,0x65,0x44,0xc5,0x00 DB 0xc4,0xe3,0x65,0x44,0xcd,0x01 DB 0xc4,0xe3,0x65,0x44,0xd5,0x10 vpxor ymm1,ymm1,ymm2 DB 0xc4,0xe3,0x4d,0x44,0xd0,0x01 vpshufd ymm0,ymm0,0x4e vpxor ymm1,ymm1,ymm0 vpxor ymm1,ymm1,ymm2 DB 0xc4,0xe3,0x65,0x44,0xe5,0x11 DB 0xc4,0xe3,0x4d,0x44,0xc1,0x01 vpshufd ymm1,ymm1,0x4e vpxor ymm4,ymm4,ymm1 vpxor ymm4,ymm4,ymm0 vmovdqu YMMWORD[32+rcx],ymm3 vmovdqu YMMWORD[rcx],ymm4 vpunpcklqdq ymm0,ymm4,ymm3 vpunpckhqdq ymm1,ymm4,ymm3 vpxor ymm0,ymm0,ymm1 vmovdqu YMMWORD[128+rcx],ymm0 vzeroupper movdqa xmm6,XMMWORD[rsp] add rsp,24 ret $L$SEH_end_gcm_init_vpclmulqdq_avx2_5: global gcm_ghash_vpclmulqdq_avx2_1 ALIGN 32 gcm_ghash_vpclmulqdq_avx2_1: $L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1_1: _CET_ENDBR sub rsp,72 $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_1_2: movdqa XMMWORD[rsp],xmm6 $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_1_3: movdqa XMMWORD[16+rsp],xmm7 $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_1_4: movdqa XMMWORD[32+rsp],xmm8 $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_1_5: movdqa XMMWORD[48+rsp],xmm9 $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_1_6: $L$SEH_endprologue_gcm_ghash_vpclmulqdq_avx2_1_7: vmovdqu xmm6,XMMWORD[$L$bswap_mask] vmovdqu xmm7,XMMWORD[$L$gfpoly] vmovdqu xmm5,XMMWORD[rcx] vpshufb xmm5,xmm5,xmm6 $L$ghash_lastblock: vmovdqu xmm0,XMMWORD[r8] vpshufb xmm0,xmm0,xmm6 vpxor xmm5,xmm5,xmm0 
vmovdqu xmm0,XMMWORD[((128-16))+rdx] vpclmulqdq xmm1,xmm5,xmm0,0x00 vpclmulqdq xmm2,xmm5,xmm0,0x01 vpclmulqdq xmm3,xmm5,xmm0,0x10 vpxor xmm2,xmm2,xmm3 vpclmulqdq xmm3,xmm7,xmm1,0x01 vpshufd xmm1,xmm1,0x4e vpxor xmm2,xmm2,xmm1 vpxor xmm2,xmm2,xmm3 vpclmulqdq xmm5,xmm5,xmm0,0x11 vpclmulqdq xmm1,xmm7,xmm2,0x01 vpshufd xmm2,xmm2,0x4e vpxor xmm5,xmm5,xmm2 vpxor xmm5,xmm5,xmm1 $L$ghash_done: vpshufb xmm5,xmm5,xmm6 vmovdqu XMMWORD[rcx],xmm5 vzeroupper movdqa xmm6,XMMWORD[rsp] movdqa xmm7,XMMWORD[16+rsp] movdqa xmm8,XMMWORD[32+rsp] movdqa xmm9,XMMWORD[48+rsp] add rsp,72 ret $L$SEH_end_gcm_ghash_vpclmulqdq_avx2_1_8: global aes_gcm_enc_update_vaes_avx2 ALIGN 32 aes_gcm_enc_update_vaes_avx2: $L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1: _CET_ENDBR push rsi $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_2: push rdi $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_3: push r12 $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_4: mov rsi,QWORD[64+rsp] mov rdi,QWORD[72+rsp] mov r12,QWORD[80+rsp] sub rsp,160 $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_5: movdqa XMMWORD[rsp],xmm6 $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_6: movdqa XMMWORD[16+rsp],xmm7 $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_7: movdqa XMMWORD[32+rsp],xmm8 $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_8: movdqa XMMWORD[48+rsp],xmm9 $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_9: movdqa XMMWORD[64+rsp],xmm10 $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_10: movdqa XMMWORD[80+rsp],xmm11 $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_11: movdqa XMMWORD[96+rsp],xmm12 $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_12: movdqa XMMWORD[112+rsp],xmm13 $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_13: movdqa XMMWORD[128+rsp],xmm14 $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_14: movdqa XMMWORD[144+rsp],xmm15 $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_15: $L$SEH_endprologue_aes_gcm_enc_update_vaes_avx2_16: %ifdef BORINGSSL_DISPATCH_TEST EXTERN BORINGSSL_function_hit mov BYTE[((BORINGSSL_function_hit+8))],1 %endif vbroadcasti128 ymm0,XMMWORD[$L$bswap_mask] vmovdqu xmm1,XMMWORD[r12] vpshufb xmm1,xmm1,xmm0 vbroadcasti128 ymm11,XMMWORD[rsi] vpshufb ymm11,ymm11,ymm0 mov r10d,DWORD[240+r9] lea r10d,[((-20))+r10*4] lea r11,[96+r10*4+r9] vbroadcasti128 ymm9,XMMWORD[r9] vbroadcasti128 ymm10,XMMWORD[r11] vpaddd ymm11,ymm11,YMMWORD[$L$ctr_pattern] cmp r8,127 jbe NEAR $L$crypt_loop_4x_done__func1 vmovdqu ymm7,YMMWORD[128+rdi] vmovdqu ymm8,YMMWORD[((128+32))+rdi] vmovdqu ymm2,YMMWORD[$L$inc_2blocks] vpshufb ymm12,ymm11,ymm0 vpaddd ymm11,ymm11,ymm2 vpshufb ymm13,ymm11,ymm0 vpaddd ymm11,ymm11,ymm2 vpshufb ymm14,ymm11,ymm0 vpaddd ymm11,ymm11,ymm2 vpshufb ymm15,ymm11,ymm0 vpaddd ymm11,ymm11,ymm2 vpxor ymm12,ymm12,ymm9 vpxor ymm13,ymm13,ymm9 vpxor ymm14,ymm14,ymm9 vpxor ymm15,ymm15,ymm9 lea rax,[16+r9] $L$vaesenc_loop_first_4_vecs__func1: vbroadcasti128 ymm2,XMMWORD[rax] DB 0xc4,0x62,0x1d,0xdc,0xe2 DB 0xc4,0x62,0x15,0xdc,0xea DB 0xc4,0x62,0x0d,0xdc,0xf2 DB 0xc4,0x62,0x05,0xdc,0xfa add rax,16 cmp r11,rax jne NEAR $L$vaesenc_loop_first_4_vecs__func1 vpxor ymm2,ymm10,YMMWORD[rcx] vpxor ymm3,ymm10,YMMWORD[32+rcx] vpxor ymm5,ymm10,YMMWORD[64+rcx] vpxor ymm6,ymm10,YMMWORD[96+rcx] DB 0xc4,0x62,0x1d,0xdd,0xe2 DB 0xc4,0x62,0x15,0xdd,0xeb DB 0xc4,0x62,0x0d,0xdd,0xf5 DB 0xc4,0x62,0x05,0xdd,0xfe vmovdqu YMMWORD[rdx],ymm12 vmovdqu YMMWORD[32+rdx],ymm13 vmovdqu YMMWORD[64+rdx],ymm14 vmovdqu YMMWORD[96+rdx],ymm15 sub rcx,-128 add r8,-128 cmp r8,127 jbe NEAR $L$ghash_last_ciphertext_4x__func1 ALIGN 16 $L$crypt_loop_4x__func1: vmovdqu ymm2,YMMWORD[$L$inc_2blocks] vpshufb ymm12,ymm11,ymm0 vpaddd 
ymm11,ymm11,ymm2 vpshufb ymm13,ymm11,ymm0 vpaddd ymm11,ymm11,ymm2 vpshufb ymm14,ymm11,ymm0 vpaddd ymm11,ymm11,ymm2 vpshufb ymm15,ymm11,ymm0 vpaddd ymm11,ymm11,ymm2 vpxor ymm12,ymm12,ymm9 vpxor ymm13,ymm13,ymm9 vpxor ymm14,ymm14,ymm9 vpxor ymm15,ymm15,ymm9 cmp r10d,24 jl NEAR $L$aes128__func1 je NEAR $L$aes192__func1 vbroadcasti128 ymm2,XMMWORD[((-208))+r11] DB 0xc4,0x62,0x1d,0xdc,0xe2 DB 0xc4,0x62,0x15,0xdc,0xea DB 0xc4,0x62,0x0d,0xdc,0xf2 DB 0xc4,0x62,0x05,0xdc,0xfa vbroadcasti128 ymm2,XMMWORD[((-192))+r11] DB 0xc4,0x62,0x1d,0xdc,0xe2 DB 0xc4,0x62,0x15,0xdc,0xea DB 0xc4,0x62,0x0d,0xdc,0xf2 DB 0xc4,0x62,0x05,0xdc,0xfa $L$aes192__func1: vbroadcasti128 ymm2,XMMWORD[((-176))+r11] DB 0xc4,0x62,0x1d,0xdc,0xe2 DB 0xc4,0x62,0x15,0xdc,0xea DB 0xc4,0x62,0x0d,0xdc,0xf2 DB 0xc4,0x62,0x05,0xdc,0xfa vbroadcasti128 ymm2,XMMWORD[((-160))+r11] DB 0xc4,0x62,0x1d,0xdc,0xe2 DB 0xc4,0x62,0x15,0xdc,0xea DB 0xc4,0x62,0x0d,0xdc,0xf2 DB 0xc4,0x62,0x05,0xdc,0xfa $L$aes128__func1: prefetcht0 [512+rcx] prefetcht0 [((512+64))+rcx] vmovdqu ymm3,YMMWORD[rdx] vpshufb ymm3,ymm3,ymm0 vmovdqu ymm4,YMMWORD[rdi] vpxor ymm3,ymm3,ymm1 DB 0xc4,0xe3,0x65,0x44,0xec,0x00 DB 0xc4,0xe3,0x65,0x44,0xcc,0x11 vpunpckhqdq ymm2,ymm3,ymm3 vpxor ymm2,ymm2,ymm3 DB 0xc4,0xe3,0x6d,0x44,0xf7,0x00 vbroadcasti128 ymm2,XMMWORD[((-144))+r11] DB 0xc4,0x62,0x1d,0xdc,0xe2 DB 0xc4,0x62,0x15,0xdc,0xea DB 0xc4,0x62,0x0d,0xdc,0xf2 DB 0xc4,0x62,0x05,0xdc,0xfa vbroadcasti128 ymm2,XMMWORD[((-128))+r11] DB 0xc4,0x62,0x1d,0xdc,0xe2 DB 0xc4,0x62,0x15,0xdc,0xea DB 0xc4,0x62,0x0d,0xdc,0xf2 DB 0xc4,0x62,0x05,0xdc,0xfa vmovdqu ymm3,YMMWORD[32+rdx] vpshufb ymm3,ymm3,ymm0 vmovdqu ymm4,YMMWORD[32+rdi] DB 0xc4,0xe3,0x65,0x44,0xd4,0x00 vpxor ymm5,ymm5,ymm2 DB 0xc4,0xe3,0x65,0x44,0xd4,0x11 vpxor ymm1,ymm1,ymm2 vpunpckhqdq ymm2,ymm3,ymm3 vpxor ymm2,ymm2,ymm3 DB 0xc4,0xe3,0x6d,0x44,0xd7,0x10 vpxor ymm6,ymm6,ymm2 vbroadcasti128 ymm2,XMMWORD[((-112))+r11] DB 0xc4,0x62,0x1d,0xdc,0xe2 DB 0xc4,0x62,0x15,0xdc,0xea DB 0xc4,0x62,0x0d,0xdc,0xf2 DB 0xc4,0x62,0x05,0xdc,0xfa vmovdqu ymm3,YMMWORD[64+rdx] vpshufb ymm3,ymm3,ymm0 vmovdqu ymm4,YMMWORD[64+rdi] vbroadcasti128 ymm2,XMMWORD[((-96))+r11] DB 0xc4,0x62,0x1d,0xdc,0xe2 DB 0xc4,0x62,0x15,0xdc,0xea DB 0xc4,0x62,0x0d,0xdc,0xf2 DB 0xc4,0x62,0x05,0xdc,0xfa DB 0xc4,0xe3,0x65,0x44,0xd4,0x00 vpxor ymm5,ymm5,ymm2 DB 0xc4,0xe3,0x65,0x44,0xd4,0x11 vpxor ymm1,ymm1,ymm2 vbroadcasti128 ymm2,XMMWORD[((-80))+r11] DB 0xc4,0x62,0x1d,0xdc,0xe2 DB 0xc4,0x62,0x15,0xdc,0xea DB 0xc4,0x62,0x0d,0xdc,0xf2 DB 0xc4,0x62,0x05,0xdc,0xfa vpunpckhqdq ymm2,ymm3,ymm3 vpxor ymm2,ymm2,ymm3 DB 0xc4,0xc3,0x6d,0x44,0xd0,0x00 vpxor ymm6,ymm6,ymm2 vmovdqu ymm3,YMMWORD[96+rdx] vpshufb ymm3,ymm3,ymm0 vbroadcasti128 ymm2,XMMWORD[((-64))+r11] DB 0xc4,0x62,0x1d,0xdc,0xe2 DB 0xc4,0x62,0x15,0xdc,0xea DB 0xc4,0x62,0x0d,0xdc,0xf2 DB 0xc4,0x62,0x05,0xdc,0xfa vmovdqu ymm4,YMMWORD[96+rdi] DB 0xc4,0xe3,0x65,0x44,0xd4,0x00 vpxor ymm5,ymm5,ymm2 DB 0xc4,0xe3,0x65,0x44,0xd4,0x11 vpxor ymm1,ymm1,ymm2 vpunpckhqdq ymm2,ymm3,ymm3 vpxor ymm2,ymm2,ymm3 DB 0xc4,0xc3,0x6d,0x44,0xd0,0x10 vpxor ymm6,ymm6,ymm2 vbroadcasti128 ymm2,XMMWORD[((-48))+r11] DB 0xc4,0x62,0x1d,0xdc,0xe2 DB 0xc4,0x62,0x15,0xdc,0xea DB 0xc4,0x62,0x0d,0xdc,0xf2 DB 0xc4,0x62,0x05,0xdc,0xfa vpxor ymm6,ymm6,ymm5 vpxor ymm6,ymm6,ymm1 vbroadcasti128 ymm4,XMMWORD[$L$gfpoly] DB 0xc4,0xe3,0x5d,0x44,0xd5,0x01 vpshufd ymm5,ymm5,0x4e vpxor ymm6,ymm6,ymm5 vpxor ymm6,ymm6,ymm2 vbroadcasti128 ymm2,XMMWORD[((-32))+r11] DB 0xc4,0x62,0x1d,0xdc,0xe2 DB 0xc4,0x62,0x15,0xdc,0xea DB 0xc4,0x62,0x0d,0xdc,0xf2 DB 0xc4,0x62,0x05,0xdc,0xfa DB 
0xc4,0xe3,0x5d,0x44,0xd6,0x01 vpshufd ymm6,ymm6,0x4e vpxor ymm1,ymm1,ymm6 vpxor ymm1,ymm1,ymm2 vbroadcasti128 ymm2,XMMWORD[((-16))+r11] DB 0xc4,0x62,0x1d,0xdc,0xe2 DB 0xc4,0x62,0x15,0xdc,0xea DB 0xc4,0x62,0x0d,0xdc,0xf2 DB 0xc4,0x62,0x05,0xdc,0xfa vextracti128 xmm2,ymm1,1 vpxor xmm1,xmm1,xmm2 sub rdx,-128 vpxor ymm2,ymm10,YMMWORD[rcx] vpxor ymm3,ymm10,YMMWORD[32+rcx] vpxor ymm5,ymm10,YMMWORD[64+rcx] vpxor ymm6,ymm10,YMMWORD[96+rcx] DB 0xc4,0x62,0x1d,0xdd,0xe2 DB 0xc4,0x62,0x15,0xdd,0xeb DB 0xc4,0x62,0x0d,0xdd,0xf5 DB 0xc4,0x62,0x05,0xdd,0xfe vmovdqu YMMWORD[rdx],ymm12 vmovdqu YMMWORD[32+rdx],ymm13 vmovdqu YMMWORD[64+rdx],ymm14 vmovdqu YMMWORD[96+rdx],ymm15 sub rcx,-128 add r8,-128 cmp r8,127 ja NEAR $L$crypt_loop_4x__func1 $L$ghash_last_ciphertext_4x__func1: vmovdqu ymm3,YMMWORD[rdx] vpshufb ymm3,ymm3,ymm0 vmovdqu ymm4,YMMWORD[rdi] vpxor ymm3,ymm3,ymm1 DB 0xc4,0xe3,0x65,0x44,0xec,0x00 DB 0xc4,0xe3,0x65,0x44,0xcc,0x11 vpunpckhqdq ymm2,ymm3,ymm3 vpxor ymm2,ymm2,ymm3 DB 0xc4,0xe3,0x6d,0x44,0xf7,0x00 vmovdqu ymm3,YMMWORD[32+rdx] vpshufb ymm3,ymm3,ymm0 vmovdqu ymm4,YMMWORD[32+rdi] DB 0xc4,0xe3,0x65,0x44,0xd4,0x00 vpxor ymm5,ymm5,ymm2 DB 0xc4,0xe3,0x65,0x44,0xd4,0x11 vpxor ymm1,ymm1,ymm2 vpunpckhqdq ymm2,ymm3,ymm3 vpxor ymm2,ymm2,ymm3 DB 0xc4,0xe3,0x6d,0x44,0xd7,0x10 vpxor ymm6,ymm6,ymm2 vmovdqu ymm3,YMMWORD[64+rdx] vpshufb ymm3,ymm3,ymm0 vmovdqu ymm4,YMMWORD[64+rdi] DB 0xc4,0xe3,0x65,0x44,0xd4,0x00 vpxor ymm5,ymm5,ymm2 DB 0xc4,0xe3,0x65,0x44,0xd4,0x11 vpxor ymm1,ymm1,ymm2 vpunpckhqdq ymm2,ymm3,ymm3 vpxor ymm2,ymm2,ymm3 DB 0xc4,0xc3,0x6d,0x44,0xd0,0x00 vpxor ymm6,ymm6,ymm2 vmovdqu ymm3,YMMWORD[96+rdx] vpshufb ymm3,ymm3,ymm0 vmovdqu ymm4,YMMWORD[96+rdi] DB 0xc4,0xe3,0x65,0x44,0xd4,0x00 vpxor ymm5,ymm5,ymm2 DB 0xc4,0xe3,0x65,0x44,0xd4,0x11 vpxor ymm1,ymm1,ymm2 vpunpckhqdq ymm2,ymm3,ymm3 vpxor ymm2,ymm2,ymm3 DB 0xc4,0xc3,0x6d,0x44,0xd0,0x10 vpxor ymm6,ymm6,ymm2 vpxor ymm6,ymm6,ymm5 vpxor ymm6,ymm6,ymm1 vbroadcasti128 ymm4,XMMWORD[$L$gfpoly] DB 0xc4,0xe3,0x5d,0x44,0xd5,0x01 vpshufd ymm5,ymm5,0x4e vpxor ymm6,ymm6,ymm5 vpxor ymm6,ymm6,ymm2 DB 0xc4,0xe3,0x5d,0x44,0xd6,0x01 vpshufd ymm6,ymm6,0x4e vpxor ymm1,ymm1,ymm6 vpxor ymm1,ymm1,ymm2 vextracti128 xmm2,ymm1,1 vpxor xmm1,xmm1,xmm2 sub rdx,-128 $L$crypt_loop_4x_done__func1: test r8,r8 jz NEAR $L$done__func1 lea rsi,[128+rdi] sub rsi,r8 vpxor xmm5,xmm5,xmm5 vpxor xmm6,xmm6,xmm6 vpxor xmm7,xmm7,xmm7 cmp r8,64 jb NEAR $L$lessthan64bytes__func1 vpshufb ymm12,ymm11,ymm0 vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks] vpshufb ymm13,ymm11,ymm0 vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks] vpxor ymm12,ymm12,ymm9 vpxor ymm13,ymm13,ymm9 lea rax,[16+r9] $L$vaesenc_loop_tail_1__func1: vbroadcasti128 ymm2,XMMWORD[rax] DB 0xc4,0x62,0x1d,0xdc,0xe2 DB 0xc4,0x62,0x15,0xdc,0xea add rax,16 cmp r11,rax jne NEAR $L$vaesenc_loop_tail_1__func1 DB 0xc4,0x42,0x1d,0xdd,0xe2 DB 0xc4,0x42,0x15,0xdd,0xea vmovdqu ymm2,YMMWORD[rcx] vmovdqu ymm3,YMMWORD[32+rcx] vpxor ymm12,ymm12,ymm2 vpxor ymm13,ymm13,ymm3 vmovdqu YMMWORD[rdx],ymm12 vmovdqu YMMWORD[32+rdx],ymm13 vpshufb ymm12,ymm12,ymm0 vpshufb ymm13,ymm13,ymm0 vpxor ymm12,ymm12,ymm1 vmovdqu ymm2,YMMWORD[rsi] vmovdqu ymm3,YMMWORD[32+rsi] DB 0xc4,0xe3,0x1d,0x44,0xea,0x00 DB 0xc4,0xe3,0x1d,0x44,0xf2,0x01 DB 0xc4,0xe3,0x1d,0x44,0xe2,0x10 vpxor ymm6,ymm6,ymm4 DB 0xc4,0xe3,0x1d,0x44,0xfa,0x11 DB 0xc4,0xe3,0x15,0x44,0xe3,0x00 vpxor ymm5,ymm5,ymm4 DB 0xc4,0xe3,0x15,0x44,0xe3,0x01 vpxor ymm6,ymm6,ymm4 DB 0xc4,0xe3,0x15,0x44,0xe3,0x10 vpxor ymm6,ymm6,ymm4 DB 0xc4,0xe3,0x15,0x44,0xe3,0x11 vpxor ymm7,ymm7,ymm4 add rsi,64 add rcx,64 add rdx,64 sub r8,64 
jz NEAR $L$reduce__func1 vpxor xmm1,xmm1,xmm1 $L$lessthan64bytes__func1: vpshufb ymm12,ymm11,ymm0 vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks] vpshufb ymm13,ymm11,ymm0 vpxor ymm12,ymm12,ymm9 vpxor ymm13,ymm13,ymm9 lea rax,[16+r9] $L$vaesenc_loop_tail_2__func1: vbroadcasti128 ymm2,XMMWORD[rax] DB 0xc4,0x62,0x1d,0xdc,0xe2 DB 0xc4,0x62,0x15,0xdc,0xea add rax,16 cmp r11,rax jne NEAR $L$vaesenc_loop_tail_2__func1 DB 0xc4,0x42,0x1d,0xdd,0xe2 DB 0xc4,0x42,0x15,0xdd,0xea cmp r8,32 jb NEAR $L$xor_one_block__func1 je NEAR $L$xor_two_blocks__func1 $L$xor_three_blocks__func1: vmovdqu ymm2,YMMWORD[rcx] vmovdqu xmm3,XMMWORD[32+rcx] vpxor ymm12,ymm12,ymm2 vpxor xmm13,xmm13,xmm3 vmovdqu YMMWORD[rdx],ymm12 vmovdqu XMMWORD[32+rdx],xmm13 vpshufb ymm12,ymm12,ymm0 vpshufb xmm13,xmm13,xmm0 vpxor ymm12,ymm12,ymm1 vmovdqu ymm2,YMMWORD[rsi] vmovdqu xmm3,XMMWORD[32+rsi] vpclmulqdq xmm4,xmm13,xmm3,0x00 vpxor ymm5,ymm5,ymm4 vpclmulqdq xmm4,xmm13,xmm3,0x01 vpxor ymm6,ymm6,ymm4 vpclmulqdq xmm4,xmm13,xmm3,0x10 vpxor ymm6,ymm6,ymm4 vpclmulqdq xmm4,xmm13,xmm3,0x11 vpxor ymm7,ymm7,ymm4 jmp NEAR $L$ghash_mul_one_vec_unreduced__func1 $L$xor_two_blocks__func1: vmovdqu ymm2,YMMWORD[rcx] vpxor ymm12,ymm12,ymm2 vmovdqu YMMWORD[rdx],ymm12 vpshufb ymm12,ymm12,ymm0 vpxor ymm12,ymm12,ymm1 vmovdqu ymm2,YMMWORD[rsi] jmp NEAR $L$ghash_mul_one_vec_unreduced__func1 $L$xor_one_block__func1: vmovdqu xmm2,XMMWORD[rcx] vpxor xmm12,xmm12,xmm2 vmovdqu XMMWORD[rdx],xmm12 vpshufb xmm12,xmm12,xmm0 vpxor xmm12,xmm12,xmm1 vmovdqu xmm2,XMMWORD[rsi] $L$ghash_mul_one_vec_unreduced__func1: DB 0xc4,0xe3,0x1d,0x44,0xe2,0x00 vpxor ymm5,ymm5,ymm4 DB 0xc4,0xe3,0x1d,0x44,0xe2,0x01 vpxor ymm6,ymm6,ymm4 DB 0xc4,0xe3,0x1d,0x44,0xe2,0x10 vpxor ymm6,ymm6,ymm4 DB 0xc4,0xe3,0x1d,0x44,0xe2,0x11 vpxor ymm7,ymm7,ymm4 $L$reduce__func1: vbroadcasti128 ymm2,XMMWORD[$L$gfpoly] DB 0xc4,0xe3,0x6d,0x44,0xdd,0x01 vpshufd ymm5,ymm5,0x4e vpxor ymm6,ymm6,ymm5 vpxor ymm6,ymm6,ymm3 DB 0xc4,0xe3,0x6d,0x44,0xde,0x01 vpshufd ymm6,ymm6,0x4e vpxor ymm7,ymm7,ymm6 vpxor ymm7,ymm7,ymm3 vextracti128 xmm1,ymm7,1 vpxor xmm1,xmm1,xmm7 $L$done__func1: vpshufb xmm1,xmm1,xmm0 vmovdqu XMMWORD[r12],xmm1 vzeroupper movdqa xmm6,XMMWORD[rsp] movdqa xmm7,XMMWORD[16+rsp] movdqa xmm8,XMMWORD[32+rsp] movdqa xmm9,XMMWORD[48+rsp] movdqa xmm10,XMMWORD[64+rsp] movdqa xmm11,XMMWORD[80+rsp] movdqa xmm12,XMMWORD[96+rsp] movdqa xmm13,XMMWORD[112+rsp] movdqa xmm14,XMMWORD[128+rsp] movdqa xmm15,XMMWORD[144+rsp] add rsp,160 pop r12 pop rdi pop rsi ret $L$SEH_end_aes_gcm_enc_update_vaes_avx2_17: global aes_gcm_dec_update_vaes_avx2 ALIGN 32 aes_gcm_dec_update_vaes_avx2: $L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1: _CET_ENDBR push rsi $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_2: push rdi $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_3: push r12 $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_4: mov rsi,QWORD[64+rsp] mov rdi,QWORD[72+rsp] mov r12,QWORD[80+rsp] sub rsp,160 $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_5: movdqa XMMWORD[rsp],xmm6 $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_6: movdqa XMMWORD[16+rsp],xmm7 $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_7: movdqa XMMWORD[32+rsp],xmm8 $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_8: movdqa XMMWORD[48+rsp],xmm9 $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_9: movdqa XMMWORD[64+rsp],xmm10 $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_10: movdqa XMMWORD[80+rsp],xmm11 $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_11: movdqa XMMWORD[96+rsp],xmm12 $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_12: movdqa XMMWORD[112+rsp],xmm13 $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_13: 
movdqa XMMWORD[128+rsp],xmm14 $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_14: movdqa XMMWORD[144+rsp],xmm15 $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_15: $L$SEH_endprologue_aes_gcm_dec_update_vaes_avx2_16: vbroadcasti128 ymm0,XMMWORD[$L$bswap_mask] vmovdqu xmm1,XMMWORD[r12] vpshufb xmm1,xmm1,xmm0 vbroadcasti128 ymm11,XMMWORD[rsi] vpshufb ymm11,ymm11,ymm0 mov r10d,DWORD[240+r9] lea r10d,[((-20))+r10*4] lea r11,[96+r10*4+r9] vbroadcasti128 ymm9,XMMWORD[r9] vbroadcasti128 ymm10,XMMWORD[r11] vpaddd ymm11,ymm11,YMMWORD[$L$ctr_pattern] cmp r8,127 jbe NEAR $L$crypt_loop_4x_done__func2 vmovdqu ymm7,YMMWORD[128+rdi] vmovdqu ymm8,YMMWORD[((128+32))+rdi] ALIGN 16 $L$crypt_loop_4x__func2: vmovdqu ymm2,YMMWORD[$L$inc_2blocks] vpshufb ymm12,ymm11,ymm0 vpaddd ymm11,ymm11,ymm2 vpshufb ymm13,ymm11,ymm0 vpaddd ymm11,ymm11,ymm2 vpshufb ymm14,ymm11,ymm0 vpaddd ymm11,ymm11,ymm2 vpshufb ymm15,ymm11,ymm0 vpaddd ymm11,ymm11,ymm2 vpxor ymm12,ymm12,ymm9 vpxor ymm13,ymm13,ymm9 vpxor ymm14,ymm14,ymm9 vpxor ymm15,ymm15,ymm9 cmp r10d,24 jl NEAR $L$aes128__func2 je NEAR $L$aes192__func2 vbroadcasti128 ymm2,XMMWORD[((-208))+r11] DB 0xc4,0x62,0x1d,0xdc,0xe2 DB 0xc4,0x62,0x15,0xdc,0xea DB 0xc4,0x62,0x0d,0xdc,0xf2 DB 0xc4,0x62,0x05,0xdc,0xfa vbroadcasti128 ymm2,XMMWORD[((-192))+r11] DB 0xc4,0x62,0x1d,0xdc,0xe2 DB 0xc4,0x62,0x15,0xdc,0xea DB 0xc4,0x62,0x0d,0xdc,0xf2 DB 0xc4,0x62,0x05,0xdc,0xfa $L$aes192__func2: vbroadcasti128 ymm2,XMMWORD[((-176))+r11] DB 0xc4,0x62,0x1d,0xdc,0xe2 DB 0xc4,0x62,0x15,0xdc,0xea DB 0xc4,0x62,0x0d,0xdc,0xf2 DB 0xc4,0x62,0x05,0xdc,0xfa vbroadcasti128 ymm2,XMMWORD[((-160))+r11] DB 0xc4,0x62,0x1d,0xdc,0xe2 DB 0xc4,0x62,0x15,0xdc,0xea DB 0xc4,0x62,0x0d,0xdc,0xf2 DB 0xc4,0x62,0x05,0xdc,0xfa $L$aes128__func2: prefetcht0 [512+rcx] prefetcht0 [((512+64))+rcx] vmovdqu ymm3,YMMWORD[rcx] vpshufb ymm3,ymm3,ymm0 vmovdqu ymm4,YMMWORD[rdi] vpxor ymm3,ymm3,ymm1 DB 0xc4,0xe3,0x65,0x44,0xec,0x00 DB 0xc4,0xe3,0x65,0x44,0xcc,0x11 vpunpckhqdq ymm2,ymm3,ymm3 vpxor ymm2,ymm2,ymm3 DB 0xc4,0xe3,0x6d,0x44,0xf7,0x00 vbroadcasti128 ymm2,XMMWORD[((-144))+r11] DB 0xc4,0x62,0x1d,0xdc,0xe2 DB 0xc4,0x62,0x15,0xdc,0xea DB 0xc4,0x62,0x0d,0xdc,0xf2 DB 0xc4,0x62,0x05,0xdc,0xfa vbroadcasti128 ymm2,XMMWORD[((-128))+r11] DB 0xc4,0x62,0x1d,0xdc,0xe2 DB 0xc4,0x62,0x15,0xdc,0xea DB 0xc4,0x62,0x0d,0xdc,0xf2 DB 0xc4,0x62,0x05,0xdc,0xfa vmovdqu ymm3,YMMWORD[32+rcx] vpshufb ymm3,ymm3,ymm0 vmovdqu ymm4,YMMWORD[32+rdi] DB 0xc4,0xe3,0x65,0x44,0xd4,0x00 vpxor ymm5,ymm5,ymm2 DB 0xc4,0xe3,0x65,0x44,0xd4,0x11 vpxor ymm1,ymm1,ymm2 vpunpckhqdq ymm2,ymm3,ymm3 vpxor ymm2,ymm2,ymm3 DB 0xc4,0xe3,0x6d,0x44,0xd7,0x10 vpxor ymm6,ymm6,ymm2 vbroadcasti128 ymm2,XMMWORD[((-112))+r11] DB 0xc4,0x62,0x1d,0xdc,0xe2 DB 0xc4,0x62,0x15,0xdc,0xea DB 0xc4,0x62,0x0d,0xdc,0xf2 DB 0xc4,0x62,0x05,0xdc,0xfa vmovdqu ymm3,YMMWORD[64+rcx] vpshufb ymm3,ymm3,ymm0 vmovdqu ymm4,YMMWORD[64+rdi] vbroadcasti128 ymm2,XMMWORD[((-96))+r11] DB 0xc4,0x62,0x1d,0xdc,0xe2 DB 0xc4,0x62,0x15,0xdc,0xea DB 0xc4,0x62,0x0d,0xdc,0xf2 DB 0xc4,0x62,0x05,0xdc,0xfa DB 0xc4,0xe3,0x65,0x44,0xd4,0x00 vpxor ymm5,ymm5,ymm2 DB 0xc4,0xe3,0x65,0x44,0xd4,0x11 vpxor ymm1,ymm1,ymm2 vbroadcasti128 ymm2,XMMWORD[((-80))+r11] DB 0xc4,0x62,0x1d,0xdc,0xe2 DB 0xc4,0x62,0x15,0xdc,0xea DB 0xc4,0x62,0x0d,0xdc,0xf2 DB 0xc4,0x62,0x05,0xdc,0xfa vpunpckhqdq ymm2,ymm3,ymm3 vpxor ymm2,ymm2,ymm3 DB 0xc4,0xc3,0x6d,0x44,0xd0,0x00 vpxor ymm6,ymm6,ymm2 vmovdqu ymm3,YMMWORD[96+rcx] vpshufb ymm3,ymm3,ymm0 vbroadcasti128 ymm2,XMMWORD[((-64))+r11] DB 0xc4,0x62,0x1d,0xdc,0xe2 DB 0xc4,0x62,0x15,0xdc,0xea DB 0xc4,0x62,0x0d,0xdc,0xf2 DB 
0xc4,0x62,0x05,0xdc,0xfa vmovdqu ymm4,YMMWORD[96+rdi] DB 0xc4,0xe3,0x65,0x44,0xd4,0x00 vpxor ymm5,ymm5,ymm2 DB 0xc4,0xe3,0x65,0x44,0xd4,0x11 vpxor ymm1,ymm1,ymm2 vpunpckhqdq ymm2,ymm3,ymm3 vpxor ymm2,ymm2,ymm3 DB 0xc4,0xc3,0x6d,0x44,0xd0,0x10 vpxor ymm6,ymm6,ymm2 vbroadcasti128 ymm2,XMMWORD[((-48))+r11] DB 0xc4,0x62,0x1d,0xdc,0xe2 DB 0xc4,0x62,0x15,0xdc,0xea DB 0xc4,0x62,0x0d,0xdc,0xf2 DB 0xc4,0x62,0x05,0xdc,0xfa vpxor ymm6,ymm6,ymm5 vpxor ymm6,ymm6,ymm1 vbroadcasti128 ymm4,XMMWORD[$L$gfpoly] DB 0xc4,0xe3,0x5d,0x44,0xd5,0x01 vpshufd ymm5,ymm5,0x4e vpxor ymm6,ymm6,ymm5 vpxor ymm6,ymm6,ymm2 vbroadcasti128 ymm2,XMMWORD[((-32))+r11] DB 0xc4,0x62,0x1d,0xdc,0xe2 DB 0xc4,0x62,0x15,0xdc,0xea DB 0xc4,0x62,0x0d,0xdc,0xf2 DB 0xc4,0x62,0x05,0xdc,0xfa DB 0xc4,0xe3,0x5d,0x44,0xd6,0x01 vpshufd ymm6,ymm6,0x4e vpxor ymm1,ymm1,ymm6 vpxor ymm1,ymm1,ymm2 vbroadcasti128 ymm2,XMMWORD[((-16))+r11] DB 0xc4,0x62,0x1d,0xdc,0xe2 DB 0xc4,0x62,0x15,0xdc,0xea DB 0xc4,0x62,0x0d,0xdc,0xf2 DB 0xc4,0x62,0x05,0xdc,0xfa vextracti128 xmm2,ymm1,1 vpxor xmm1,xmm1,xmm2 vpxor ymm2,ymm10,YMMWORD[rcx] vpxor ymm3,ymm10,YMMWORD[32+rcx] vpxor ymm5,ymm10,YMMWORD[64+rcx] vpxor ymm6,ymm10,YMMWORD[96+rcx] DB 0xc4,0x62,0x1d,0xdd,0xe2 DB 0xc4,0x62,0x15,0xdd,0xeb DB 0xc4,0x62,0x0d,0xdd,0xf5 DB 0xc4,0x62,0x05,0xdd,0xfe vmovdqu YMMWORD[rdx],ymm12 vmovdqu YMMWORD[32+rdx],ymm13 vmovdqu YMMWORD[64+rdx],ymm14 vmovdqu YMMWORD[96+rdx],ymm15 sub rcx,-128 sub rdx,-128 add r8,-128 cmp r8,127 ja NEAR $L$crypt_loop_4x__func2 $L$crypt_loop_4x_done__func2: test r8,r8 jz NEAR $L$done__func2 lea rsi,[128+rdi] sub rsi,r8 vpxor xmm5,xmm5,xmm5 vpxor xmm6,xmm6,xmm6 vpxor xmm7,xmm7,xmm7 cmp r8,64 jb NEAR $L$lessthan64bytes__func2 vpshufb ymm12,ymm11,ymm0 vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks] vpshufb ymm13,ymm11,ymm0 vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks] vpxor ymm12,ymm12,ymm9 vpxor ymm13,ymm13,ymm9 lea rax,[16+r9] $L$vaesenc_loop_tail_1__func2: vbroadcasti128 ymm2,XMMWORD[rax] DB 0xc4,0x62,0x1d,0xdc,0xe2 DB 0xc4,0x62,0x15,0xdc,0xea add rax,16 cmp r11,rax jne NEAR $L$vaesenc_loop_tail_1__func2 DB 0xc4,0x42,0x1d,0xdd,0xe2 DB 0xc4,0x42,0x15,0xdd,0xea vmovdqu ymm2,YMMWORD[rcx] vmovdqu ymm3,YMMWORD[32+rcx] vpxor ymm12,ymm12,ymm2 vpxor ymm13,ymm13,ymm3 vmovdqu YMMWORD[rdx],ymm12 vmovdqu YMMWORD[32+rdx],ymm13 vpshufb ymm12,ymm2,ymm0 vpshufb ymm13,ymm3,ymm0 vpxor ymm12,ymm12,ymm1 vmovdqu ymm2,YMMWORD[rsi] vmovdqu ymm3,YMMWORD[32+rsi] DB 0xc4,0xe3,0x1d,0x44,0xea,0x00 DB 0xc4,0xe3,0x1d,0x44,0xf2,0x01 DB 0xc4,0xe3,0x1d,0x44,0xe2,0x10 vpxor ymm6,ymm6,ymm4 DB 0xc4,0xe3,0x1d,0x44,0xfa,0x11 DB 0xc4,0xe3,0x15,0x44,0xe3,0x00 vpxor ymm5,ymm5,ymm4 DB 0xc4,0xe3,0x15,0x44,0xe3,0x01 vpxor ymm6,ymm6,ymm4 DB 0xc4,0xe3,0x15,0x44,0xe3,0x10 vpxor ymm6,ymm6,ymm4 DB 0xc4,0xe3,0x15,0x44,0xe3,0x11 vpxor ymm7,ymm7,ymm4 add rsi,64 add rcx,64 add rdx,64 sub r8,64 jz NEAR $L$reduce__func2 vpxor xmm1,xmm1,xmm1 $L$lessthan64bytes__func2: vpshufb ymm12,ymm11,ymm0 vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks] vpshufb ymm13,ymm11,ymm0 vpxor ymm12,ymm12,ymm9 vpxor ymm13,ymm13,ymm9 lea rax,[16+r9] $L$vaesenc_loop_tail_2__func2: vbroadcasti128 ymm2,XMMWORD[rax] DB 0xc4,0x62,0x1d,0xdc,0xe2 DB 0xc4,0x62,0x15,0xdc,0xea add rax,16 cmp r11,rax jne NEAR $L$vaesenc_loop_tail_2__func2 DB 0xc4,0x42,0x1d,0xdd,0xe2 DB 0xc4,0x42,0x15,0xdd,0xea cmp r8,32 jb NEAR $L$xor_one_block__func2 je NEAR $L$xor_two_blocks__func2 $L$xor_three_blocks__func2: vmovdqu ymm2,YMMWORD[rcx] vmovdqu xmm3,XMMWORD[32+rcx] vpxor ymm12,ymm12,ymm2 vpxor xmm13,xmm13,xmm3 vmovdqu YMMWORD[rdx],ymm12 vmovdqu XMMWORD[32+rdx],xmm13 vpshufb 
ymm12,ymm2,ymm0 vpshufb xmm13,xmm3,xmm0 vpxor ymm12,ymm12,ymm1 vmovdqu ymm2,YMMWORD[rsi] vmovdqu xmm3,XMMWORD[32+rsi] vpclmulqdq xmm4,xmm13,xmm3,0x00 vpxor ymm5,ymm5,ymm4 vpclmulqdq xmm4,xmm13,xmm3,0x01 vpxor ymm6,ymm6,ymm4 vpclmulqdq xmm4,xmm13,xmm3,0x10 vpxor ymm6,ymm6,ymm4 vpclmulqdq xmm4,xmm13,xmm3,0x11 vpxor ymm7,ymm7,ymm4 jmp NEAR $L$ghash_mul_one_vec_unreduced__func2 $L$xor_two_blocks__func2: vmovdqu ymm2,YMMWORD[rcx] vpxor ymm12,ymm12,ymm2 vmovdqu YMMWORD[rdx],ymm12 vpshufb ymm12,ymm2,ymm0 vpxor ymm12,ymm12,ymm1 vmovdqu ymm2,YMMWORD[rsi] jmp NEAR $L$ghash_mul_one_vec_unreduced__func2 $L$xor_one_block__func2: vmovdqu xmm2,XMMWORD[rcx] vpxor xmm12,xmm12,xmm2 vmovdqu XMMWORD[rdx],xmm12 vpshufb xmm12,xmm2,xmm0 vpxor xmm12,xmm12,xmm1 vmovdqu xmm2,XMMWORD[rsi] $L$ghash_mul_one_vec_unreduced__func2: DB 0xc4,0xe3,0x1d,0x44,0xe2,0x00 vpxor ymm5,ymm5,ymm4 DB 0xc4,0xe3,0x1d,0x44,0xe2,0x01 vpxor ymm6,ymm6,ymm4 DB 0xc4,0xe3,0x1d,0x44,0xe2,0x10 vpxor ymm6,ymm6,ymm4 DB 0xc4,0xe3,0x1d,0x44,0xe2,0x11 vpxor ymm7,ymm7,ymm4 $L$reduce__func2: vbroadcasti128 ymm2,XMMWORD[$L$gfpoly] DB 0xc4,0xe3,0x6d,0x44,0xdd,0x01 vpshufd ymm5,ymm5,0x4e vpxor ymm6,ymm6,ymm5 vpxor ymm6,ymm6,ymm3 DB 0xc4,0xe3,0x6d,0x44,0xde,0x01 vpshufd ymm6,ymm6,0x4e vpxor ymm7,ymm7,ymm6 vpxor ymm7,ymm7,ymm3 vextracti128 xmm1,ymm7,1 vpxor xmm1,xmm1,xmm7 $L$done__func2: vpshufb xmm1,xmm1,xmm0 vmovdqu XMMWORD[r12],xmm1 vzeroupper movdqa xmm6,XMMWORD[rsp] movdqa xmm7,XMMWORD[16+rsp] movdqa xmm8,XMMWORD[32+rsp] movdqa xmm9,XMMWORD[48+rsp] movdqa xmm10,XMMWORD[64+rsp] movdqa xmm11,XMMWORD[80+rsp] movdqa xmm12,XMMWORD[96+rsp] movdqa xmm13,XMMWORD[112+rsp] movdqa xmm14,XMMWORD[128+rsp] movdqa xmm15,XMMWORD[144+rsp] add rsp,160 pop r12 pop rdi pop rsi ret $L$SEH_end_aes_gcm_dec_update_vaes_avx2_17: section .pdata rdata align=4 ALIGN 4 DD $L$SEH_begin_gcm_init_vpclmulqdq_avx2_1 wrt ..imagebase DD $L$SEH_end_gcm_init_vpclmulqdq_avx2_5 wrt ..imagebase DD $L$SEH_info_gcm_init_vpclmulqdq_avx2_0 wrt ..imagebase DD $L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1_1 wrt ..imagebase DD $L$SEH_end_gcm_ghash_vpclmulqdq_avx2_1_8 wrt ..imagebase DD $L$SEH_info_gcm_ghash_vpclmulqdq_avx2_1_0 wrt ..imagebase DD $L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 wrt ..imagebase DD $L$SEH_end_aes_gcm_enc_update_vaes_avx2_17 wrt ..imagebase DD $L$SEH_info_aes_gcm_enc_update_vaes_avx2_0 wrt ..imagebase DD $L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 wrt ..imagebase DD $L$SEH_end_aes_gcm_dec_update_vaes_avx2_17 wrt ..imagebase DD $L$SEH_info_aes_gcm_dec_update_vaes_avx2_0 wrt ..imagebase section .xdata rdata align=8 ALIGN 4 $L$SEH_info_gcm_init_vpclmulqdq_avx2_0: DB 1 DB $L$SEH_endprologue_gcm_init_vpclmulqdq_avx2_4-$L$SEH_begin_gcm_init_vpclmulqdq_avx2_1 DB 3 DB 0 DB $L$SEH_prologue_gcm_init_vpclmulqdq_avx2_3-$L$SEH_begin_gcm_init_vpclmulqdq_avx2_1 DB 104 DW 0 DB $L$SEH_prologue_gcm_init_vpclmulqdq_avx2_2-$L$SEH_begin_gcm_init_vpclmulqdq_avx2_1 DB 34 DW 0 $L$SEH_info_gcm_ghash_vpclmulqdq_avx2_1_0: DB 1 DB $L$SEH_endprologue_gcm_ghash_vpclmulqdq_avx2_1_7-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1_1 DB 9 DB 0 DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_1_6-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1_1 DB 152 DW 3 DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_1_5-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1_1 DB 136 DW 2 DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_1_4-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1_1 DB 120 DW 1 DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_1_3-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1_1 DB 104 DW 0 DB 
$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_1_2-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1_1 DB 130 DW 0 $L$SEH_info_aes_gcm_enc_update_vaes_avx2_0: DB 1 DB $L$SEH_endprologue_aes_gcm_enc_update_vaes_avx2_16-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 DB 25 DB 0 DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_15-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 DB 248 DW 9 DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_14-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 DB 232 DW 8 DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_13-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 DB 216 DW 7 DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_12-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 DB 200 DW 6 DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_11-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 DB 184 DW 5 DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_10-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 DB 168 DW 4 DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_9-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 DB 152 DW 3 DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_8-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 DB 136 DW 2 DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_7-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 DB 120 DW 1 DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_6-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 DB 104 DW 0 DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_5-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 DB 1 DW 20 DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_4-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 DB 192 DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_3-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 DB 112 DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_2-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 DB 96 DW 0 $L$SEH_info_aes_gcm_dec_update_vaes_avx2_0: DB 1 DB $L$SEH_endprologue_aes_gcm_dec_update_vaes_avx2_16-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 DB 25 DB 0 DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_15-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 DB 248 DW 9 DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_14-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 DB 232 DW 8 DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_13-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 DB 216 DW 7 DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_12-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 DB 200 DW 6 DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_11-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 DB 184 DW 5 DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_10-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 DB 168 DW 4 DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_9-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 DB 152 DW 3 DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_8-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 DB 136 DW 2 DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_7-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 DB 120 DW 1 DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_6-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 DB 104 DW 0 DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_5-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 DB 1 DW 20 DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_4-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 DB 192 DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_3-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 DB 112 DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_2-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 DB 96 DW 0 %else ; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 ret %endif 
ring-0.17.14/pregenerated/aes-gcm-avx2-x86_64-nasm.o000064400000000000000000000566411046102023000176600ustar 00000000000000
[binary COFF object produced by The Netwide Assembler 2.13.03 from C:\Users\b\p\ring\pregenerated\aes-gcm-avx2-x86_64-nasm.asm; it carries .debug$S, .debug$T, .rdata, .text, .pdata, and .xdata sections and a symbol table for ring_core_0_17_14__gcm_init_vpclmulqdq_avx2, ring_core_0_17_14__gcm_ghash_vpclmulqdq_avx2_1, ring_core_0_17_14__aes_gcm_enc_update_vaes_avx2, ring_core_0_17_14__aes_gcm_dec_update_vaes_avx2, and the local L$... labels; the raw object bytes are not reproducible as text and are omitted here]
ring-0.17.14/pregenerated/aesni-gcm-x86_64-elf.S000064400000000000000000000463141046102023000171000ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand.
#include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) .text .type _aesni_ctr32_ghash_6x,@function .align 32 _aesni_ctr32_ghash_6x: .cfi_startproc vmovdqu 32(%r11),%xmm2 subq $6,%rdx vpxor %xmm4,%xmm4,%xmm4 vmovdqu 0-128(%rcx),%xmm15 vpaddb %xmm2,%xmm1,%xmm10 vpaddb %xmm2,%xmm10,%xmm11 vpaddb %xmm2,%xmm11,%xmm12 vpaddb %xmm2,%xmm12,%xmm13 vpaddb %xmm2,%xmm13,%xmm14 vpxor %xmm15,%xmm1,%xmm9 vmovdqu %xmm4,16+8(%rsp) jmp .Loop6x .align 32 .Loop6x: addl $100663296,%ebx jc .Lhandle_ctr32 vmovdqu 0-32(%r9),%xmm3 vpaddb %xmm2,%xmm14,%xmm1 vpxor %xmm15,%xmm10,%xmm10 vpxor %xmm15,%xmm11,%xmm11 .Lresume_ctr32: vmovdqu %xmm1,(%r8) vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5 vpxor %xmm15,%xmm12,%xmm12 vmovups 16-128(%rcx),%xmm2 vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 xorq %r12,%r12 cmpq %r14,%r15 vaesenc %xmm2,%xmm9,%xmm9 vmovdqu 48+8(%rsp),%xmm0 vpxor %xmm15,%xmm13,%xmm13 vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1 vaesenc %xmm2,%xmm10,%xmm10 vpxor %xmm15,%xmm14,%xmm14 setnc %r12b vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 vaesenc %xmm2,%xmm11,%xmm11 vmovdqu 16-32(%r9),%xmm3 negq %r12 vaesenc %xmm2,%xmm12,%xmm12 vpxor %xmm5,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5 vpxor %xmm4,%xmm8,%xmm8 vaesenc %xmm2,%xmm13,%xmm13 vpxor %xmm5,%xmm1,%xmm4 andq $0x60,%r12 vmovups 32-128(%rcx),%xmm15 vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1 vaesenc %xmm2,%xmm14,%xmm14 vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2 leaq (%r14,%r12,1),%r14 vaesenc %xmm15,%xmm9,%xmm9 vpxor 16+8(%rsp),%xmm8,%xmm8 vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3 vmovdqu 64+8(%rsp),%xmm0 vaesenc %xmm15,%xmm10,%xmm10 movbeq 88(%r14),%r13 vaesenc %xmm15,%xmm11,%xmm11 movbeq 80(%r14),%r12 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,32+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,40+8(%rsp) vmovdqu 48-32(%r9),%xmm5 vaesenc %xmm15,%xmm14,%xmm14 vmovups 48-128(%rcx),%xmm15 vpxor %xmm1,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm2,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2 vaesenc %xmm15,%xmm10,%xmm10 vpxor %xmm3,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3 vaesenc %xmm15,%xmm11,%xmm11 vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5 vmovdqu 80+8(%rsp),%xmm0 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vpxor %xmm1,%xmm4,%xmm4 vmovdqu 64-32(%r9),%xmm1 vaesenc %xmm15,%xmm14,%xmm14 vmovups 64-128(%rcx),%xmm15 vpxor %xmm2,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm3,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 vaesenc %xmm15,%xmm10,%xmm10 movbeq 72(%r14),%r13 vpxor %xmm5,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5 vaesenc %xmm15,%xmm11,%xmm11 movbeq 64(%r14),%r12 vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1 vmovdqu 96+8(%rsp),%xmm0 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,48+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,56+8(%rsp) vpxor %xmm2,%xmm4,%xmm4 vmovdqu 96-32(%r9),%xmm2 vaesenc %xmm15,%xmm14,%xmm14 vmovups 80-128(%rcx),%xmm15 vpxor %xmm3,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm5,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5 vaesenc %xmm15,%xmm10,%xmm10 movbeq 56(%r14),%r13 vpxor %xmm1,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1 vpxor 112+8(%rsp),%xmm8,%xmm8 vaesenc %xmm15,%xmm11,%xmm11 movbeq 48(%r14),%r12 vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,64+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,72+8(%rsp) vpxor %xmm3,%xmm4,%xmm4 vmovdqu 112-32(%r9),%xmm3 vaesenc %xmm15,%xmm14,%xmm14 vmovups 96-128(%rcx),%xmm15 vpxor %xmm5,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5 vaesenc %xmm15,%xmm9,%xmm9 vpxor 
%xmm1,%xmm6,%xmm6 vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1 vaesenc %xmm15,%xmm10,%xmm10 movbeq 40(%r14),%r13 vpxor %xmm2,%xmm7,%xmm7 vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2 vaesenc %xmm15,%xmm11,%xmm11 movbeq 32(%r14),%r12 vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,80+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,88+8(%rsp) vpxor %xmm5,%xmm6,%xmm6 vaesenc %xmm15,%xmm14,%xmm14 vpxor %xmm1,%xmm6,%xmm6 vmovups 112-128(%rcx),%xmm15 vpslldq $8,%xmm6,%xmm5 vpxor %xmm2,%xmm4,%xmm4 vmovdqu 16(%r11),%xmm3 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm8,%xmm7,%xmm7 vaesenc %xmm15,%xmm10,%xmm10 vpxor %xmm5,%xmm4,%xmm4 movbeq 24(%r14),%r13 vaesenc %xmm15,%xmm11,%xmm11 movbeq 16(%r14),%r12 vpalignr $8,%xmm4,%xmm4,%xmm0 vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 movq %r13,96+8(%rsp) vaesenc %xmm15,%xmm12,%xmm12 movq %r12,104+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 vmovups 128-128(%rcx),%xmm1 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vmovups 144-128(%rcx),%xmm15 vaesenc %xmm1,%xmm10,%xmm10 vpsrldq $8,%xmm6,%xmm6 vaesenc %xmm1,%xmm11,%xmm11 vpxor %xmm6,%xmm7,%xmm7 vaesenc %xmm1,%xmm12,%xmm12 vpxor %xmm0,%xmm4,%xmm4 movbeq 8(%r14),%r13 vaesenc %xmm1,%xmm13,%xmm13 movbeq 0(%r14),%r12 vaesenc %xmm1,%xmm14,%xmm14 vmovups 160-128(%rcx),%xmm1 cmpl $11,%r10d jb .Lenc_tail vaesenc %xmm15,%xmm9,%xmm9 vaesenc %xmm15,%xmm10,%xmm10 vaesenc %xmm15,%xmm11,%xmm11 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vaesenc %xmm1,%xmm10,%xmm10 vaesenc %xmm1,%xmm11,%xmm11 vaesenc %xmm1,%xmm12,%xmm12 vaesenc %xmm1,%xmm13,%xmm13 vmovups 176-128(%rcx),%xmm15 vaesenc %xmm1,%xmm14,%xmm14 vmovups 192-128(%rcx),%xmm1 vaesenc %xmm15,%xmm9,%xmm9 vaesenc %xmm15,%xmm10,%xmm10 vaesenc %xmm15,%xmm11,%xmm11 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vaesenc %xmm1,%xmm10,%xmm10 vaesenc %xmm1,%xmm11,%xmm11 vaesenc %xmm1,%xmm12,%xmm12 vaesenc %xmm1,%xmm13,%xmm13 vmovups 208-128(%rcx),%xmm15 vaesenc %xmm1,%xmm14,%xmm14 vmovups 224-128(%rcx),%xmm1 jmp .Lenc_tail .align 32 .Lhandle_ctr32: vmovdqu (%r11),%xmm0 vpshufb %xmm0,%xmm1,%xmm6 vmovdqu 48(%r11),%xmm5 vpaddd 64(%r11),%xmm6,%xmm10 vpaddd %xmm5,%xmm6,%xmm11 vmovdqu 0-32(%r9),%xmm3 vpaddd %xmm5,%xmm10,%xmm12 vpshufb %xmm0,%xmm10,%xmm10 vpaddd %xmm5,%xmm11,%xmm13 vpshufb %xmm0,%xmm11,%xmm11 vpxor %xmm15,%xmm10,%xmm10 vpaddd %xmm5,%xmm12,%xmm14 vpshufb %xmm0,%xmm12,%xmm12 vpxor %xmm15,%xmm11,%xmm11 vpaddd %xmm5,%xmm13,%xmm1 vpshufb %xmm0,%xmm13,%xmm13 vpshufb %xmm0,%xmm14,%xmm14 vpshufb %xmm0,%xmm1,%xmm1 jmp .Lresume_ctr32 .align 32 .Lenc_tail: vaesenc %xmm15,%xmm9,%xmm9 vmovdqu %xmm7,16+8(%rsp) vpalignr $8,%xmm4,%xmm4,%xmm8 vaesenc %xmm15,%xmm10,%xmm10 vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 vpxor 0(%rdi),%xmm1,%xmm2 vaesenc %xmm15,%xmm11,%xmm11 vpxor 16(%rdi),%xmm1,%xmm0 vaesenc %xmm15,%xmm12,%xmm12 vpxor 32(%rdi),%xmm1,%xmm5 vaesenc %xmm15,%xmm13,%xmm13 vpxor 48(%rdi),%xmm1,%xmm6 vaesenc %xmm15,%xmm14,%xmm14 vpxor 64(%rdi),%xmm1,%xmm7 vpxor 80(%rdi),%xmm1,%xmm3 vmovdqu (%r8),%xmm1 vaesenclast %xmm2,%xmm9,%xmm9 vmovdqu 32(%r11),%xmm2 vaesenclast %xmm0,%xmm10,%xmm10 vpaddb %xmm2,%xmm1,%xmm0 movq %r13,112+8(%rsp) leaq 96(%rdi),%rdi prefetcht0 512(%rdi) prefetcht0 576(%rdi) vaesenclast %xmm5,%xmm11,%xmm11 vpaddb %xmm2,%xmm0,%xmm5 movq %r12,120+8(%rsp) leaq 96(%rsi),%rsi vmovdqu 0-128(%rcx),%xmm15 vaesenclast %xmm6,%xmm12,%xmm12 vpaddb %xmm2,%xmm5,%xmm6 vaesenclast %xmm7,%xmm13,%xmm13 vpaddb %xmm2,%xmm6,%xmm7 vaesenclast %xmm3,%xmm14,%xmm14 vpaddb 
%xmm2,%xmm7,%xmm3 addq $0x60,%rax subq $0x6,%rdx jc .L6x_done vmovups %xmm9,-96(%rsi) vpxor %xmm15,%xmm1,%xmm9 vmovups %xmm10,-80(%rsi) vmovdqa %xmm0,%xmm10 vmovups %xmm11,-64(%rsi) vmovdqa %xmm5,%xmm11 vmovups %xmm12,-48(%rsi) vmovdqa %xmm6,%xmm12 vmovups %xmm13,-32(%rsi) vmovdqa %xmm7,%xmm13 vmovups %xmm14,-16(%rsi) vmovdqa %xmm3,%xmm14 vmovdqu 32+8(%rsp),%xmm7 jmp .Loop6x .L6x_done: vpxor 16+8(%rsp),%xmm8,%xmm8 vpxor %xmm4,%xmm8,%xmm8 ret .cfi_endproc .size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x .globl aesni_gcm_decrypt .hidden aesni_gcm_decrypt .type aesni_gcm_decrypt,@function .align 32 aesni_gcm_decrypt: .cfi_startproc _CET_ENDBR xorq %rax,%rax cmpq $0x60,%rdx jb .Lgcm_dec_abort pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 movq %rsp,%rbp .cfi_def_cfa_register %rbp pushq %rbx .cfi_offset %rbx,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 vzeroupper movq 16(%rbp),%r12 vmovdqu (%r8),%xmm1 addq $-128,%rsp movl 12(%r8),%ebx leaq .Lbswap_mask(%rip),%r11 leaq -128(%rcx),%r14 movq $0xf80,%r15 vmovdqu (%r12),%xmm8 andq $-128,%rsp vmovdqu (%r11),%xmm0 leaq 128(%rcx),%rcx leaq 32(%r9),%r9 movl 240-128(%rcx),%r10d vpshufb %xmm0,%xmm8,%xmm8 andq %r15,%r14 andq %rsp,%r15 subq %r14,%r15 jc .Ldec_no_key_aliasing cmpq $768,%r15 jnc .Ldec_no_key_aliasing subq %r15,%rsp .Ldec_no_key_aliasing: vmovdqu 80(%rdi),%xmm7 movq %rdi,%r14 vmovdqu 64(%rdi),%xmm4 leaq -192(%rdi,%rdx,1),%r15 vmovdqu 48(%rdi),%xmm5 shrq $4,%rdx xorq %rax,%rax vmovdqu 32(%rdi),%xmm6 vpshufb %xmm0,%xmm7,%xmm7 vmovdqu 16(%rdi),%xmm2 vpshufb %xmm0,%xmm4,%xmm4 vmovdqu (%rdi),%xmm3 vpshufb %xmm0,%xmm5,%xmm5 vmovdqu %xmm4,48(%rsp) vpshufb %xmm0,%xmm6,%xmm6 vmovdqu %xmm5,64(%rsp) vpshufb %xmm0,%xmm2,%xmm2 vmovdqu %xmm6,80(%rsp) vpshufb %xmm0,%xmm3,%xmm3 vmovdqu %xmm2,96(%rsp) vmovdqu %xmm3,112(%rsp) call _aesni_ctr32_ghash_6x movq 16(%rbp),%r12 vmovups %xmm9,-96(%rsi) vmovups %xmm10,-80(%rsi) vmovups %xmm11,-64(%rsi) vmovups %xmm12,-48(%rsi) vmovups %xmm13,-32(%rsi) vmovups %xmm14,-16(%rsi) vpshufb (%r11),%xmm8,%xmm8 vmovdqu %xmm8,(%r12) vzeroupper leaq -40(%rbp),%rsp .cfi_def_cfa %rsp, 0x38 popq %r15 .cfi_adjust_cfa_offset -8 .cfi_restore %r15 popq %r14 .cfi_adjust_cfa_offset -8 .cfi_restore %r14 popq %r13 .cfi_adjust_cfa_offset -8 .cfi_restore %r13 popq %r12 .cfi_adjust_cfa_offset -8 .cfi_restore %r12 popq %rbx .cfi_adjust_cfa_offset -8 .cfi_restore %rbx popq %rbp .cfi_adjust_cfa_offset -8 .cfi_restore %rbp .Lgcm_dec_abort: ret .cfi_endproc .size aesni_gcm_decrypt,.-aesni_gcm_decrypt .type _aesni_ctr32_6x,@function .align 32 _aesni_ctr32_6x: .cfi_startproc vmovdqu 0-128(%rcx),%xmm4 vmovdqu 32(%r11),%xmm2 leaq -1(%r10),%r13 vmovups 16-128(%rcx),%xmm15 leaq 32-128(%rcx),%r12 vpxor %xmm4,%xmm1,%xmm9 addl $100663296,%ebx jc .Lhandle_ctr32_2 vpaddb %xmm2,%xmm1,%xmm10 vpaddb %xmm2,%xmm10,%xmm11 vpxor %xmm4,%xmm10,%xmm10 vpaddb %xmm2,%xmm11,%xmm12 vpxor %xmm4,%xmm11,%xmm11 vpaddb %xmm2,%xmm12,%xmm13 vpxor %xmm4,%xmm12,%xmm12 vpaddb %xmm2,%xmm13,%xmm14 vpxor %xmm4,%xmm13,%xmm13 vpaddb %xmm2,%xmm14,%xmm1 vpxor %xmm4,%xmm14,%xmm14 jmp .Loop_ctr32 .align 16 .Loop_ctr32: vaesenc %xmm15,%xmm9,%xmm9 vaesenc %xmm15,%xmm10,%xmm10 vaesenc %xmm15,%xmm11,%xmm11 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vaesenc %xmm15,%xmm14,%xmm14 vmovups (%r12),%xmm15 leaq 16(%r12),%r12 decl %r13d jnz .Loop_ctr32 vmovdqu (%r12),%xmm3 vaesenc %xmm15,%xmm9,%xmm9 vpxor 0(%rdi),%xmm3,%xmm4 vaesenc %xmm15,%xmm10,%xmm10 vpxor 16(%rdi),%xmm3,%xmm5 vaesenc 
%xmm15,%xmm11,%xmm11 vpxor 32(%rdi),%xmm3,%xmm6 vaesenc %xmm15,%xmm12,%xmm12 vpxor 48(%rdi),%xmm3,%xmm8 vaesenc %xmm15,%xmm13,%xmm13 vpxor 64(%rdi),%xmm3,%xmm2 vaesenc %xmm15,%xmm14,%xmm14 vpxor 80(%rdi),%xmm3,%xmm3 leaq 96(%rdi),%rdi vaesenclast %xmm4,%xmm9,%xmm9 vaesenclast %xmm5,%xmm10,%xmm10 vaesenclast %xmm6,%xmm11,%xmm11 vaesenclast %xmm8,%xmm12,%xmm12 vaesenclast %xmm2,%xmm13,%xmm13 vaesenclast %xmm3,%xmm14,%xmm14 vmovups %xmm9,0(%rsi) vmovups %xmm10,16(%rsi) vmovups %xmm11,32(%rsi) vmovups %xmm12,48(%rsi) vmovups %xmm13,64(%rsi) vmovups %xmm14,80(%rsi) leaq 96(%rsi),%rsi ret .align 32 .Lhandle_ctr32_2: vpshufb %xmm0,%xmm1,%xmm6 vmovdqu 48(%r11),%xmm5 vpaddd 64(%r11),%xmm6,%xmm10 vpaddd %xmm5,%xmm6,%xmm11 vpaddd %xmm5,%xmm10,%xmm12 vpshufb %xmm0,%xmm10,%xmm10 vpaddd %xmm5,%xmm11,%xmm13 vpshufb %xmm0,%xmm11,%xmm11 vpxor %xmm4,%xmm10,%xmm10 vpaddd %xmm5,%xmm12,%xmm14 vpshufb %xmm0,%xmm12,%xmm12 vpxor %xmm4,%xmm11,%xmm11 vpaddd %xmm5,%xmm13,%xmm1 vpshufb %xmm0,%xmm13,%xmm13 vpxor %xmm4,%xmm12,%xmm12 vpshufb %xmm0,%xmm14,%xmm14 vpxor %xmm4,%xmm13,%xmm13 vpshufb %xmm0,%xmm1,%xmm1 vpxor %xmm4,%xmm14,%xmm14 jmp .Loop_ctr32 .cfi_endproc .size _aesni_ctr32_6x,.-_aesni_ctr32_6x .globl aesni_gcm_encrypt .hidden aesni_gcm_encrypt .type aesni_gcm_encrypt,@function .align 32 aesni_gcm_encrypt: .cfi_startproc _CET_ENDBR #ifdef BORINGSSL_DISPATCH_TEST .extern BORINGSSL_function_hit .hidden BORINGSSL_function_hit movb $1,BORINGSSL_function_hit+2(%rip) #endif xorq %rax,%rax cmpq $288,%rdx jb .Lgcm_enc_abort pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 movq %rsp,%rbp .cfi_def_cfa_register %rbp pushq %rbx .cfi_offset %rbx,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 vzeroupper vmovdqu (%r8),%xmm1 addq $-128,%rsp movl 12(%r8),%ebx leaq .Lbswap_mask(%rip),%r11 leaq -128(%rcx),%r14 movq $0xf80,%r15 leaq 128(%rcx),%rcx vmovdqu (%r11),%xmm0 andq $-128,%rsp movl 240-128(%rcx),%r10d andq %r15,%r14 andq %rsp,%r15 subq %r14,%r15 jc .Lenc_no_key_aliasing cmpq $768,%r15 jnc .Lenc_no_key_aliasing subq %r15,%rsp .Lenc_no_key_aliasing: movq %rsi,%r14 leaq -192(%rsi,%rdx,1),%r15 shrq $4,%rdx call _aesni_ctr32_6x vpshufb %xmm0,%xmm9,%xmm8 vpshufb %xmm0,%xmm10,%xmm2 vmovdqu %xmm8,112(%rsp) vpshufb %xmm0,%xmm11,%xmm4 vmovdqu %xmm2,96(%rsp) vpshufb %xmm0,%xmm12,%xmm5 vmovdqu %xmm4,80(%rsp) vpshufb %xmm0,%xmm13,%xmm6 vmovdqu %xmm5,64(%rsp) vpshufb %xmm0,%xmm14,%xmm7 vmovdqu %xmm6,48(%rsp) call _aesni_ctr32_6x movq 16(%rbp),%r12 leaq 32(%r9),%r9 vmovdqu (%r12),%xmm8 subq $12,%rdx movq $192,%rax vpshufb %xmm0,%xmm8,%xmm8 call _aesni_ctr32_ghash_6x vmovdqu 32(%rsp),%xmm7 vmovdqu (%r11),%xmm0 vmovdqu 0-32(%r9),%xmm3 vpunpckhqdq %xmm7,%xmm7,%xmm1 vmovdqu 32-32(%r9),%xmm15 vmovups %xmm9,-96(%rsi) vpshufb %xmm0,%xmm9,%xmm9 vpxor %xmm7,%xmm1,%xmm1 vmovups %xmm10,-80(%rsi) vpshufb %xmm0,%xmm10,%xmm10 vmovups %xmm11,-64(%rsi) vpshufb %xmm0,%xmm11,%xmm11 vmovups %xmm12,-48(%rsi) vpshufb %xmm0,%xmm12,%xmm12 vmovups %xmm13,-32(%rsi) vpshufb %xmm0,%xmm13,%xmm13 vmovups %xmm14,-16(%rsi) vpshufb %xmm0,%xmm14,%xmm14 vmovdqu %xmm9,16(%rsp) vmovdqu 48(%rsp),%xmm6 vmovdqu 16-32(%r9),%xmm0 vpunpckhqdq %xmm6,%xmm6,%xmm2 vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5 vpxor %xmm6,%xmm2,%xmm2 vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 vmovdqu 64(%rsp),%xmm9 vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4 vmovdqu 48-32(%r9),%xmm3 vpxor %xmm5,%xmm4,%xmm4 vpunpckhqdq %xmm9,%xmm9,%xmm5 vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6 vpxor %xmm9,%xmm5,%xmm5 vpxor 
%xmm7,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 vmovdqu 80-32(%r9),%xmm15 vpxor %xmm1,%xmm2,%xmm2 vmovdqu 80(%rsp),%xmm1 vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7 vmovdqu 64-32(%r9),%xmm0 vpxor %xmm4,%xmm7,%xmm7 vpunpckhqdq %xmm1,%xmm1,%xmm4 vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9 vpxor %xmm1,%xmm4,%xmm4 vpxor %xmm6,%xmm9,%xmm9 vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5 vpxor %xmm2,%xmm5,%xmm5 vmovdqu 96(%rsp),%xmm2 vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6 vmovdqu 96-32(%r9),%xmm3 vpxor %xmm7,%xmm6,%xmm6 vpunpckhqdq %xmm2,%xmm2,%xmm7 vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1 vpxor %xmm2,%xmm7,%xmm7 vpxor %xmm9,%xmm1,%xmm1 vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4 vmovdqu 128-32(%r9),%xmm15 vpxor %xmm5,%xmm4,%xmm4 vpxor 112(%rsp),%xmm8,%xmm8 vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5 vmovdqu 112-32(%r9),%xmm0 vpunpckhqdq %xmm8,%xmm8,%xmm9 vpxor %xmm6,%xmm5,%xmm5 vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2 vpxor %xmm8,%xmm9,%xmm9 vpxor %xmm1,%xmm2,%xmm2 vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7 vpxor %xmm4,%xmm7,%xmm4 vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6 vmovdqu 0-32(%r9),%xmm3 vpunpckhqdq %xmm14,%xmm14,%xmm1 vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8 vpxor %xmm14,%xmm1,%xmm1 vpxor %xmm5,%xmm6,%xmm5 vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9 vmovdqu 32-32(%r9),%xmm15 vpxor %xmm2,%xmm8,%xmm7 vpxor %xmm4,%xmm9,%xmm6 vmovdqu 16-32(%r9),%xmm0 vpxor %xmm5,%xmm7,%xmm9 vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4 vpxor %xmm9,%xmm6,%xmm6 vpunpckhqdq %xmm13,%xmm13,%xmm2 vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14 vpxor %xmm13,%xmm2,%xmm2 vpslldq $8,%xmm6,%xmm9 vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 vpxor %xmm9,%xmm5,%xmm8 vpsrldq $8,%xmm6,%xmm6 vpxor %xmm6,%xmm7,%xmm7 vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5 vmovdqu 48-32(%r9),%xmm3 vpxor %xmm4,%xmm5,%xmm5 vpunpckhqdq %xmm12,%xmm12,%xmm9 vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13 vpxor %xmm12,%xmm9,%xmm9 vpxor %xmm14,%xmm13,%xmm13 vpalignr $8,%xmm8,%xmm8,%xmm14 vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 vmovdqu 80-32(%r9),%xmm15 vpxor %xmm1,%xmm2,%xmm2 vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4 vmovdqu 64-32(%r9),%xmm0 vpxor %xmm5,%xmm4,%xmm4 vpunpckhqdq %xmm11,%xmm11,%xmm1 vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12 vpxor %xmm11,%xmm1,%xmm1 vpxor %xmm13,%xmm12,%xmm12 vxorps 16(%rsp),%xmm7,%xmm7 vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9 vpxor %xmm2,%xmm9,%xmm9 vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 vxorps %xmm14,%xmm8,%xmm8 vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5 vmovdqu 96-32(%r9),%xmm3 vpxor %xmm4,%xmm5,%xmm5 vpunpckhqdq %xmm10,%xmm10,%xmm2 vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11 vpxor %xmm10,%xmm2,%xmm2 vpalignr $8,%xmm8,%xmm8,%xmm14 vpxor %xmm12,%xmm11,%xmm11 vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1 vmovdqu 128-32(%r9),%xmm15 vpxor %xmm9,%xmm1,%xmm1 vxorps %xmm7,%xmm14,%xmm14 vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 vxorps %xmm14,%xmm8,%xmm8 vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4 vmovdqu 112-32(%r9),%xmm0 vpxor %xmm5,%xmm4,%xmm4 vpunpckhqdq %xmm8,%xmm8,%xmm9 vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10 vpxor %xmm8,%xmm9,%xmm9 vpxor %xmm11,%xmm10,%xmm10 vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2 vpxor %xmm1,%xmm2,%xmm2 vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5 vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7 vpxor %xmm4,%xmm5,%xmm5 vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6 vpxor %xmm10,%xmm7,%xmm7 vpxor %xmm2,%xmm6,%xmm6 vpxor %xmm5,%xmm7,%xmm4 vpxor %xmm4,%xmm6,%xmm6 vpslldq $8,%xmm6,%xmm1 vmovdqu 16(%r11),%xmm3 vpsrldq $8,%xmm6,%xmm6 vpxor %xmm1,%xmm5,%xmm8 vpxor %xmm6,%xmm7,%xmm7 vpalignr $8,%xmm8,%xmm8,%xmm2 vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 vpxor %xmm2,%xmm8,%xmm8 vpalignr $8,%xmm8,%xmm8,%xmm2 vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 vpxor %xmm7,%xmm2,%xmm2 vpxor %xmm2,%xmm8,%xmm8 movq 16(%rbp),%r12 vpshufb (%r11),%xmm8,%xmm8 vmovdqu 
%xmm8,(%r12) vzeroupper leaq -40(%rbp),%rsp .cfi_def_cfa %rsp, 0x38 popq %r15 .cfi_adjust_cfa_offset -8 .cfi_restore %r15 popq %r14 .cfi_adjust_cfa_offset -8 .cfi_restore %r14 popq %r13 .cfi_adjust_cfa_offset -8 .cfi_restore %r13 popq %r12 .cfi_adjust_cfa_offset -8 .cfi_restore %r12 popq %rbx .cfi_adjust_cfa_offset -8 .cfi_restore %rbx popq %rbp .cfi_adjust_cfa_offset -8 .cfi_restore %rbp .Lgcm_enc_abort: ret .cfi_endproc .size aesni_gcm_encrypt,.-aesni_gcm_encrypt .section .rodata .align 64 .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 .Lpoly: .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 .Lone_msb: .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 .Ltwo_lsb: .byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 .Lone_lsb: .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 .byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 .text #endif ring-0.17.14/pregenerated/aesni-gcm-x86_64-macosx.S000064400000000000000000000436141046102023000176250ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) .text .p2align 5 _aesni_ctr32_ghash_6x: vmovdqu 32(%r11),%xmm2 subq $6,%rdx vpxor %xmm4,%xmm4,%xmm4 vmovdqu 0-128(%rcx),%xmm15 vpaddb %xmm2,%xmm1,%xmm10 vpaddb %xmm2,%xmm10,%xmm11 vpaddb %xmm2,%xmm11,%xmm12 vpaddb %xmm2,%xmm12,%xmm13 vpaddb %xmm2,%xmm13,%xmm14 vpxor %xmm15,%xmm1,%xmm9 vmovdqu %xmm4,16+8(%rsp) jmp L$oop6x .p2align 5 L$oop6x: addl $100663296,%ebx jc L$handle_ctr32 vmovdqu 0-32(%r9),%xmm3 vpaddb %xmm2,%xmm14,%xmm1 vpxor %xmm15,%xmm10,%xmm10 vpxor %xmm15,%xmm11,%xmm11 L$resume_ctr32: vmovdqu %xmm1,(%r8) vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5 vpxor %xmm15,%xmm12,%xmm12 vmovups 16-128(%rcx),%xmm2 vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 xorq %r12,%r12 cmpq %r14,%r15 vaesenc %xmm2,%xmm9,%xmm9 vmovdqu 48+8(%rsp),%xmm0 vpxor %xmm15,%xmm13,%xmm13 vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1 vaesenc %xmm2,%xmm10,%xmm10 vpxor %xmm15,%xmm14,%xmm14 setnc %r12b vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 vaesenc %xmm2,%xmm11,%xmm11 vmovdqu 16-32(%r9),%xmm3 negq %r12 vaesenc %xmm2,%xmm12,%xmm12 vpxor %xmm5,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5 vpxor %xmm4,%xmm8,%xmm8 vaesenc %xmm2,%xmm13,%xmm13 vpxor %xmm5,%xmm1,%xmm4 andq $0x60,%r12 vmovups 32-128(%rcx),%xmm15 vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1 vaesenc %xmm2,%xmm14,%xmm14 vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2 leaq (%r14,%r12,1),%r14 vaesenc %xmm15,%xmm9,%xmm9 vpxor 16+8(%rsp),%xmm8,%xmm8 vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3 vmovdqu 64+8(%rsp),%xmm0 vaesenc %xmm15,%xmm10,%xmm10 movbeq 88(%r14),%r13 vaesenc %xmm15,%xmm11,%xmm11 movbeq 80(%r14),%r12 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,32+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,40+8(%rsp) vmovdqu 48-32(%r9),%xmm5 vaesenc %xmm15,%xmm14,%xmm14 vmovups 48-128(%rcx),%xmm15 vpxor %xmm1,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm2,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2 vaesenc %xmm15,%xmm10,%xmm10 vpxor %xmm3,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3 vaesenc %xmm15,%xmm11,%xmm11 vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5 vmovdqu 80+8(%rsp),%xmm0 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vpxor %xmm1,%xmm4,%xmm4 vmovdqu 64-32(%r9),%xmm1 vaesenc %xmm15,%xmm14,%xmm14 vmovups 64-128(%rcx),%xmm15 vpxor %xmm2,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 vaesenc 
%xmm15,%xmm9,%xmm9 vpxor %xmm3,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 vaesenc %xmm15,%xmm10,%xmm10 movbeq 72(%r14),%r13 vpxor %xmm5,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5 vaesenc %xmm15,%xmm11,%xmm11 movbeq 64(%r14),%r12 vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1 vmovdqu 96+8(%rsp),%xmm0 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,48+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,56+8(%rsp) vpxor %xmm2,%xmm4,%xmm4 vmovdqu 96-32(%r9),%xmm2 vaesenc %xmm15,%xmm14,%xmm14 vmovups 80-128(%rcx),%xmm15 vpxor %xmm3,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm5,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5 vaesenc %xmm15,%xmm10,%xmm10 movbeq 56(%r14),%r13 vpxor %xmm1,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1 vpxor 112+8(%rsp),%xmm8,%xmm8 vaesenc %xmm15,%xmm11,%xmm11 movbeq 48(%r14),%r12 vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,64+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,72+8(%rsp) vpxor %xmm3,%xmm4,%xmm4 vmovdqu 112-32(%r9),%xmm3 vaesenc %xmm15,%xmm14,%xmm14 vmovups 96-128(%rcx),%xmm15 vpxor %xmm5,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm1,%xmm6,%xmm6 vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1 vaesenc %xmm15,%xmm10,%xmm10 movbeq 40(%r14),%r13 vpxor %xmm2,%xmm7,%xmm7 vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2 vaesenc %xmm15,%xmm11,%xmm11 movbeq 32(%r14),%r12 vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,80+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,88+8(%rsp) vpxor %xmm5,%xmm6,%xmm6 vaesenc %xmm15,%xmm14,%xmm14 vpxor %xmm1,%xmm6,%xmm6 vmovups 112-128(%rcx),%xmm15 vpslldq $8,%xmm6,%xmm5 vpxor %xmm2,%xmm4,%xmm4 vmovdqu 16(%r11),%xmm3 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm8,%xmm7,%xmm7 vaesenc %xmm15,%xmm10,%xmm10 vpxor %xmm5,%xmm4,%xmm4 movbeq 24(%r14),%r13 vaesenc %xmm15,%xmm11,%xmm11 movbeq 16(%r14),%r12 vpalignr $8,%xmm4,%xmm4,%xmm0 vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 movq %r13,96+8(%rsp) vaesenc %xmm15,%xmm12,%xmm12 movq %r12,104+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 vmovups 128-128(%rcx),%xmm1 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vmovups 144-128(%rcx),%xmm15 vaesenc %xmm1,%xmm10,%xmm10 vpsrldq $8,%xmm6,%xmm6 vaesenc %xmm1,%xmm11,%xmm11 vpxor %xmm6,%xmm7,%xmm7 vaesenc %xmm1,%xmm12,%xmm12 vpxor %xmm0,%xmm4,%xmm4 movbeq 8(%r14),%r13 vaesenc %xmm1,%xmm13,%xmm13 movbeq 0(%r14),%r12 vaesenc %xmm1,%xmm14,%xmm14 vmovups 160-128(%rcx),%xmm1 cmpl $11,%r10d jb L$enc_tail vaesenc %xmm15,%xmm9,%xmm9 vaesenc %xmm15,%xmm10,%xmm10 vaesenc %xmm15,%xmm11,%xmm11 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vaesenc %xmm1,%xmm10,%xmm10 vaesenc %xmm1,%xmm11,%xmm11 vaesenc %xmm1,%xmm12,%xmm12 vaesenc %xmm1,%xmm13,%xmm13 vmovups 176-128(%rcx),%xmm15 vaesenc %xmm1,%xmm14,%xmm14 vmovups 192-128(%rcx),%xmm1 vaesenc %xmm15,%xmm9,%xmm9 vaesenc %xmm15,%xmm10,%xmm10 vaesenc %xmm15,%xmm11,%xmm11 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vaesenc %xmm1,%xmm10,%xmm10 vaesenc %xmm1,%xmm11,%xmm11 vaesenc %xmm1,%xmm12,%xmm12 vaesenc %xmm1,%xmm13,%xmm13 vmovups 208-128(%rcx),%xmm15 vaesenc %xmm1,%xmm14,%xmm14 vmovups 224-128(%rcx),%xmm1 jmp L$enc_tail .p2align 5 L$handle_ctr32: vmovdqu (%r11),%xmm0 vpshufb %xmm0,%xmm1,%xmm6 vmovdqu 48(%r11),%xmm5 vpaddd 64(%r11),%xmm6,%xmm10 vpaddd %xmm5,%xmm6,%xmm11 vmovdqu 0-32(%r9),%xmm3 vpaddd %xmm5,%xmm10,%xmm12 vpshufb %xmm0,%xmm10,%xmm10 vpaddd %xmm5,%xmm11,%xmm13 vpshufb %xmm0,%xmm11,%xmm11 vpxor 
%xmm15,%xmm10,%xmm10 vpaddd %xmm5,%xmm12,%xmm14 vpshufb %xmm0,%xmm12,%xmm12 vpxor %xmm15,%xmm11,%xmm11 vpaddd %xmm5,%xmm13,%xmm1 vpshufb %xmm0,%xmm13,%xmm13 vpshufb %xmm0,%xmm14,%xmm14 vpshufb %xmm0,%xmm1,%xmm1 jmp L$resume_ctr32 .p2align 5 L$enc_tail: vaesenc %xmm15,%xmm9,%xmm9 vmovdqu %xmm7,16+8(%rsp) vpalignr $8,%xmm4,%xmm4,%xmm8 vaesenc %xmm15,%xmm10,%xmm10 vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 vpxor 0(%rdi),%xmm1,%xmm2 vaesenc %xmm15,%xmm11,%xmm11 vpxor 16(%rdi),%xmm1,%xmm0 vaesenc %xmm15,%xmm12,%xmm12 vpxor 32(%rdi),%xmm1,%xmm5 vaesenc %xmm15,%xmm13,%xmm13 vpxor 48(%rdi),%xmm1,%xmm6 vaesenc %xmm15,%xmm14,%xmm14 vpxor 64(%rdi),%xmm1,%xmm7 vpxor 80(%rdi),%xmm1,%xmm3 vmovdqu (%r8),%xmm1 vaesenclast %xmm2,%xmm9,%xmm9 vmovdqu 32(%r11),%xmm2 vaesenclast %xmm0,%xmm10,%xmm10 vpaddb %xmm2,%xmm1,%xmm0 movq %r13,112+8(%rsp) leaq 96(%rdi),%rdi prefetcht0 512(%rdi) prefetcht0 576(%rdi) vaesenclast %xmm5,%xmm11,%xmm11 vpaddb %xmm2,%xmm0,%xmm5 movq %r12,120+8(%rsp) leaq 96(%rsi),%rsi vmovdqu 0-128(%rcx),%xmm15 vaesenclast %xmm6,%xmm12,%xmm12 vpaddb %xmm2,%xmm5,%xmm6 vaesenclast %xmm7,%xmm13,%xmm13 vpaddb %xmm2,%xmm6,%xmm7 vaesenclast %xmm3,%xmm14,%xmm14 vpaddb %xmm2,%xmm7,%xmm3 addq $0x60,%rax subq $0x6,%rdx jc L$6x_done vmovups %xmm9,-96(%rsi) vpxor %xmm15,%xmm1,%xmm9 vmovups %xmm10,-80(%rsi) vmovdqa %xmm0,%xmm10 vmovups %xmm11,-64(%rsi) vmovdqa %xmm5,%xmm11 vmovups %xmm12,-48(%rsi) vmovdqa %xmm6,%xmm12 vmovups %xmm13,-32(%rsi) vmovdqa %xmm7,%xmm13 vmovups %xmm14,-16(%rsi) vmovdqa %xmm3,%xmm14 vmovdqu 32+8(%rsp),%xmm7 jmp L$oop6x L$6x_done: vpxor 16+8(%rsp),%xmm8,%xmm8 vpxor %xmm4,%xmm8,%xmm8 ret .globl _aesni_gcm_decrypt .private_extern _aesni_gcm_decrypt .p2align 5 _aesni_gcm_decrypt: _CET_ENDBR xorq %rax,%rax cmpq $0x60,%rdx jb L$gcm_dec_abort pushq %rbp movq %rsp,%rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 vzeroupper movq 16(%rbp),%r12 vmovdqu (%r8),%xmm1 addq $-128,%rsp movl 12(%r8),%ebx leaq L$bswap_mask(%rip),%r11 leaq -128(%rcx),%r14 movq $0xf80,%r15 vmovdqu (%r12),%xmm8 andq $-128,%rsp vmovdqu (%r11),%xmm0 leaq 128(%rcx),%rcx leaq 32(%r9),%r9 movl 240-128(%rcx),%r10d vpshufb %xmm0,%xmm8,%xmm8 andq %r15,%r14 andq %rsp,%r15 subq %r14,%r15 jc L$dec_no_key_aliasing cmpq $768,%r15 jnc L$dec_no_key_aliasing subq %r15,%rsp L$dec_no_key_aliasing: vmovdqu 80(%rdi),%xmm7 movq %rdi,%r14 vmovdqu 64(%rdi),%xmm4 leaq -192(%rdi,%rdx,1),%r15 vmovdqu 48(%rdi),%xmm5 shrq $4,%rdx xorq %rax,%rax vmovdqu 32(%rdi),%xmm6 vpshufb %xmm0,%xmm7,%xmm7 vmovdqu 16(%rdi),%xmm2 vpshufb %xmm0,%xmm4,%xmm4 vmovdqu (%rdi),%xmm3 vpshufb %xmm0,%xmm5,%xmm5 vmovdqu %xmm4,48(%rsp) vpshufb %xmm0,%xmm6,%xmm6 vmovdqu %xmm5,64(%rsp) vpshufb %xmm0,%xmm2,%xmm2 vmovdqu %xmm6,80(%rsp) vpshufb %xmm0,%xmm3,%xmm3 vmovdqu %xmm2,96(%rsp) vmovdqu %xmm3,112(%rsp) call _aesni_ctr32_ghash_6x movq 16(%rbp),%r12 vmovups %xmm9,-96(%rsi) vmovups %xmm10,-80(%rsi) vmovups %xmm11,-64(%rsi) vmovups %xmm12,-48(%rsi) vmovups %xmm13,-32(%rsi) vmovups %xmm14,-16(%rsi) vpshufb (%r11),%xmm8,%xmm8 vmovdqu %xmm8,(%r12) vzeroupper leaq -40(%rbp),%rsp popq %r15 popq %r14 popq %r13 popq %r12 popq %rbx popq %rbp L$gcm_dec_abort: ret .p2align 5 _aesni_ctr32_6x: vmovdqu 0-128(%rcx),%xmm4 vmovdqu 32(%r11),%xmm2 leaq -1(%r10),%r13 vmovups 16-128(%rcx),%xmm15 leaq 32-128(%rcx),%r12 vpxor %xmm4,%xmm1,%xmm9 addl $100663296,%ebx jc L$handle_ctr32_2 vpaddb %xmm2,%xmm1,%xmm10 vpaddb %xmm2,%xmm10,%xmm11 vpxor %xmm4,%xmm10,%xmm10 vpaddb %xmm2,%xmm11,%xmm12 vpxor %xmm4,%xmm11,%xmm11 vpaddb %xmm2,%xmm12,%xmm13 vpxor %xmm4,%xmm12,%xmm12 vpaddb %xmm2,%xmm13,%xmm14 
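// The fast path above advances the six counter blocks with vpaddb: the
// constant at 32(%r11) (L$one_msb) is zero except for a 1 in its last
// byte, which is the least-significant byte of the big-endian 32-bit
// counter, so a byte-wise add suffices while that byte does not wrap.
// The earlier addl $100663296,%ebx (0x06000000) / jc test detects when
// one of the next six increments would wrap and diverts to
// L$handle_ctr32_2, which byte-swaps the counters and increments them
// as 32-bit integers (vpshufb/vpaddd) instead.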
vpxor %xmm4,%xmm13,%xmm13 vpaddb %xmm2,%xmm14,%xmm1 vpxor %xmm4,%xmm14,%xmm14 jmp L$oop_ctr32 .p2align 4 L$oop_ctr32: vaesenc %xmm15,%xmm9,%xmm9 vaesenc %xmm15,%xmm10,%xmm10 vaesenc %xmm15,%xmm11,%xmm11 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vaesenc %xmm15,%xmm14,%xmm14 vmovups (%r12),%xmm15 leaq 16(%r12),%r12 decl %r13d jnz L$oop_ctr32 vmovdqu (%r12),%xmm3 vaesenc %xmm15,%xmm9,%xmm9 vpxor 0(%rdi),%xmm3,%xmm4 vaesenc %xmm15,%xmm10,%xmm10 vpxor 16(%rdi),%xmm3,%xmm5 vaesenc %xmm15,%xmm11,%xmm11 vpxor 32(%rdi),%xmm3,%xmm6 vaesenc %xmm15,%xmm12,%xmm12 vpxor 48(%rdi),%xmm3,%xmm8 vaesenc %xmm15,%xmm13,%xmm13 vpxor 64(%rdi),%xmm3,%xmm2 vaesenc %xmm15,%xmm14,%xmm14 vpxor 80(%rdi),%xmm3,%xmm3 leaq 96(%rdi),%rdi vaesenclast %xmm4,%xmm9,%xmm9 vaesenclast %xmm5,%xmm10,%xmm10 vaesenclast %xmm6,%xmm11,%xmm11 vaesenclast %xmm8,%xmm12,%xmm12 vaesenclast %xmm2,%xmm13,%xmm13 vaesenclast %xmm3,%xmm14,%xmm14 vmovups %xmm9,0(%rsi) vmovups %xmm10,16(%rsi) vmovups %xmm11,32(%rsi) vmovups %xmm12,48(%rsi) vmovups %xmm13,64(%rsi) vmovups %xmm14,80(%rsi) leaq 96(%rsi),%rsi ret .p2align 5 L$handle_ctr32_2: vpshufb %xmm0,%xmm1,%xmm6 vmovdqu 48(%r11),%xmm5 vpaddd 64(%r11),%xmm6,%xmm10 vpaddd %xmm5,%xmm6,%xmm11 vpaddd %xmm5,%xmm10,%xmm12 vpshufb %xmm0,%xmm10,%xmm10 vpaddd %xmm5,%xmm11,%xmm13 vpshufb %xmm0,%xmm11,%xmm11 vpxor %xmm4,%xmm10,%xmm10 vpaddd %xmm5,%xmm12,%xmm14 vpshufb %xmm0,%xmm12,%xmm12 vpxor %xmm4,%xmm11,%xmm11 vpaddd %xmm5,%xmm13,%xmm1 vpshufb %xmm0,%xmm13,%xmm13 vpxor %xmm4,%xmm12,%xmm12 vpshufb %xmm0,%xmm14,%xmm14 vpxor %xmm4,%xmm13,%xmm13 vpshufb %xmm0,%xmm1,%xmm1 vpxor %xmm4,%xmm14,%xmm14 jmp L$oop_ctr32 .globl _aesni_gcm_encrypt .private_extern _aesni_gcm_encrypt .p2align 5 _aesni_gcm_encrypt: _CET_ENDBR #ifdef BORINGSSL_DISPATCH_TEST movb $1,_BORINGSSL_function_hit+2(%rip) #endif xorq %rax,%rax cmpq $288,%rdx jb L$gcm_enc_abort pushq %rbp movq %rsp,%rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 vzeroupper vmovdqu (%r8),%xmm1 addq $-128,%rsp movl 12(%r8),%ebx leaq L$bswap_mask(%rip),%r11 leaq -128(%rcx),%r14 movq $0xf80,%r15 leaq 128(%rcx),%rcx vmovdqu (%r11),%xmm0 andq $-128,%rsp movl 240-128(%rcx),%r10d andq %r15,%r14 andq %rsp,%r15 subq %r14,%r15 jc L$enc_no_key_aliasing cmpq $768,%r15 jnc L$enc_no_key_aliasing subq %r15,%rsp L$enc_no_key_aliasing: movq %rsi,%r14 leaq -192(%rsi,%rdx,1),%r15 shrq $4,%rdx call _aesni_ctr32_6x vpshufb %xmm0,%xmm9,%xmm8 vpshufb %xmm0,%xmm10,%xmm2 vmovdqu %xmm8,112(%rsp) vpshufb %xmm0,%xmm11,%xmm4 vmovdqu %xmm2,96(%rsp) vpshufb %xmm0,%xmm12,%xmm5 vmovdqu %xmm4,80(%rsp) vpshufb %xmm0,%xmm13,%xmm6 vmovdqu %xmm5,64(%rsp) vpshufb %xmm0,%xmm14,%xmm7 vmovdqu %xmm6,48(%rsp) call _aesni_ctr32_6x movq 16(%rbp),%r12 leaq 32(%r9),%r9 vmovdqu (%r12),%xmm8 subq $12,%rdx movq $192,%rax vpshufb %xmm0,%xmm8,%xmm8 call _aesni_ctr32_ghash_6x vmovdqu 32(%rsp),%xmm7 vmovdqu (%r11),%xmm0 vmovdqu 0-32(%r9),%xmm3 vpunpckhqdq %xmm7,%xmm7,%xmm1 vmovdqu 32-32(%r9),%xmm15 vmovups %xmm9,-96(%rsi) vpshufb %xmm0,%xmm9,%xmm9 vpxor %xmm7,%xmm1,%xmm1 vmovups %xmm10,-80(%rsi) vpshufb %xmm0,%xmm10,%xmm10 vmovups %xmm11,-64(%rsi) vpshufb %xmm0,%xmm11,%xmm11 vmovups %xmm12,-48(%rsi) vpshufb %xmm0,%xmm12,%xmm12 vmovups %xmm13,-32(%rsi) vpshufb %xmm0,%xmm13,%xmm13 vmovups %xmm14,-16(%rsi) vpshufb %xmm0,%xmm14,%xmm14 vmovdqu %xmm9,16(%rsp) vmovdqu 48(%rsp),%xmm6 vmovdqu 16-32(%r9),%xmm0 vpunpckhqdq %xmm6,%xmm6,%xmm2 vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5 vpxor %xmm6,%xmm2,%xmm2 vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 vmovdqu 64(%rsp),%xmm9 vpclmulqdq 
$0x00,%xmm0,%xmm6,%xmm4 vmovdqu 48-32(%r9),%xmm3 vpxor %xmm5,%xmm4,%xmm4 vpunpckhqdq %xmm9,%xmm9,%xmm5 vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6 vpxor %xmm9,%xmm5,%xmm5 vpxor %xmm7,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 vmovdqu 80-32(%r9),%xmm15 vpxor %xmm1,%xmm2,%xmm2 vmovdqu 80(%rsp),%xmm1 vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7 vmovdqu 64-32(%r9),%xmm0 vpxor %xmm4,%xmm7,%xmm7 vpunpckhqdq %xmm1,%xmm1,%xmm4 vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9 vpxor %xmm1,%xmm4,%xmm4 vpxor %xmm6,%xmm9,%xmm9 vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5 vpxor %xmm2,%xmm5,%xmm5 vmovdqu 96(%rsp),%xmm2 vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6 vmovdqu 96-32(%r9),%xmm3 vpxor %xmm7,%xmm6,%xmm6 vpunpckhqdq %xmm2,%xmm2,%xmm7 vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1 vpxor %xmm2,%xmm7,%xmm7 vpxor %xmm9,%xmm1,%xmm1 vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4 vmovdqu 128-32(%r9),%xmm15 vpxor %xmm5,%xmm4,%xmm4 vpxor 112(%rsp),%xmm8,%xmm8 vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5 vmovdqu 112-32(%r9),%xmm0 vpunpckhqdq %xmm8,%xmm8,%xmm9 vpxor %xmm6,%xmm5,%xmm5 vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2 vpxor %xmm8,%xmm9,%xmm9 vpxor %xmm1,%xmm2,%xmm2 vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7 vpxor %xmm4,%xmm7,%xmm4 vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6 vmovdqu 0-32(%r9),%xmm3 vpunpckhqdq %xmm14,%xmm14,%xmm1 vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8 vpxor %xmm14,%xmm1,%xmm1 vpxor %xmm5,%xmm6,%xmm5 vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9 vmovdqu 32-32(%r9),%xmm15 vpxor %xmm2,%xmm8,%xmm7 vpxor %xmm4,%xmm9,%xmm6 vmovdqu 16-32(%r9),%xmm0 vpxor %xmm5,%xmm7,%xmm9 vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4 vpxor %xmm9,%xmm6,%xmm6 vpunpckhqdq %xmm13,%xmm13,%xmm2 vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14 vpxor %xmm13,%xmm2,%xmm2 vpslldq $8,%xmm6,%xmm9 vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 vpxor %xmm9,%xmm5,%xmm8 vpsrldq $8,%xmm6,%xmm6 vpxor %xmm6,%xmm7,%xmm7 vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5 vmovdqu 48-32(%r9),%xmm3 vpxor %xmm4,%xmm5,%xmm5 vpunpckhqdq %xmm12,%xmm12,%xmm9 vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13 vpxor %xmm12,%xmm9,%xmm9 vpxor %xmm14,%xmm13,%xmm13 vpalignr $8,%xmm8,%xmm8,%xmm14 vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 vmovdqu 80-32(%r9),%xmm15 vpxor %xmm1,%xmm2,%xmm2 vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4 vmovdqu 64-32(%r9),%xmm0 vpxor %xmm5,%xmm4,%xmm4 vpunpckhqdq %xmm11,%xmm11,%xmm1 vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12 vpxor %xmm11,%xmm1,%xmm1 vpxor %xmm13,%xmm12,%xmm12 vxorps 16(%rsp),%xmm7,%xmm7 vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9 vpxor %xmm2,%xmm9,%xmm9 vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 vxorps %xmm14,%xmm8,%xmm8 vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5 vmovdqu 96-32(%r9),%xmm3 vpxor %xmm4,%xmm5,%xmm5 vpunpckhqdq %xmm10,%xmm10,%xmm2 vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11 vpxor %xmm10,%xmm2,%xmm2 vpalignr $8,%xmm8,%xmm8,%xmm14 vpxor %xmm12,%xmm11,%xmm11 vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1 vmovdqu 128-32(%r9),%xmm15 vpxor %xmm9,%xmm1,%xmm1 vxorps %xmm7,%xmm14,%xmm14 vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 vxorps %xmm14,%xmm8,%xmm8 vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4 vmovdqu 112-32(%r9),%xmm0 vpxor %xmm5,%xmm4,%xmm4 vpunpckhqdq %xmm8,%xmm8,%xmm9 vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10 vpxor %xmm8,%xmm9,%xmm9 vpxor %xmm11,%xmm10,%xmm10 vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2 vpxor %xmm1,%xmm2,%xmm2 vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5 vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7 vpxor %xmm4,%xmm5,%xmm5 vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6 vpxor %xmm10,%xmm7,%xmm7 vpxor %xmm2,%xmm6,%xmm6 vpxor %xmm5,%xmm7,%xmm4 vpxor %xmm4,%xmm6,%xmm6 vpslldq $8,%xmm6,%xmm1 vmovdqu 16(%r11),%xmm3 vpsrldq $8,%xmm6,%xmm6 vpxor %xmm1,%xmm5,%xmm8 vpxor %xmm6,%xmm7,%xmm7 vpalignr $8,%xmm8,%xmm8,%xmm2 vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 vpxor 
%xmm2,%xmm8,%xmm8 vpalignr $8,%xmm8,%xmm8,%xmm2 vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 vpxor %xmm7,%xmm2,%xmm2 vpxor %xmm2,%xmm8,%xmm8 movq 16(%rbp),%r12 vpshufb (%r11),%xmm8,%xmm8 vmovdqu %xmm8,(%r12) vzeroupper leaq -40(%rbp),%rsp popq %r15 popq %r14 popq %r13 popq %r12 popq %rbx popq %rbp L$gcm_enc_abort: ret .section __DATA,__const .p2align 6 L$bswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 L$poly: .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 L$one_msb: .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 L$two_lsb: .byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 L$one_lsb: .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 .byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .p2align 6 .text #endif ring-0.17.14/pregenerated/aesni-gcm-x86_64-nasm.asm000064400000000000000000000624331046102023000176470ustar 00000000000000; This file is generated from a similarly-named Perl script in the BoringSSL ; source tree. Do not edit by hand. %ifidn __OUTPUT_FORMAT__, win64 default rel %define XMMWORD %define YMMWORD %define ZMMWORD %define _CET_ENDBR %include "ring_core_generated/prefix_symbols_nasm.inc" section .text code align=64 ALIGN 32 _aesni_ctr32_ghash_6x: vmovdqu xmm2,XMMWORD[32+r11] sub r8,6 vpxor xmm4,xmm4,xmm4 vmovdqu xmm15,XMMWORD[((0-128))+r9] vpaddb xmm10,xmm1,xmm2 vpaddb xmm11,xmm10,xmm2 vpaddb xmm12,xmm11,xmm2 vpaddb xmm13,xmm12,xmm2 vpaddb xmm14,xmm13,xmm2 vpxor xmm9,xmm1,xmm15 vmovdqu XMMWORD[(16+8)+rsp],xmm4 jmp NEAR $L$oop6x ALIGN 32 $L$oop6x: add ebx,100663296 jc NEAR $L$handle_ctr32 vmovdqu xmm3,XMMWORD[((0-32))+rsi] vpaddb xmm1,xmm14,xmm2 vpxor xmm10,xmm10,xmm15 vpxor xmm11,xmm11,xmm15 $L$resume_ctr32: vmovdqu XMMWORD[rdi],xmm1 vpclmulqdq xmm5,xmm7,xmm3,0x10 vpxor xmm12,xmm12,xmm15 vmovups xmm2,XMMWORD[((16-128))+r9] vpclmulqdq xmm6,xmm7,xmm3,0x01 xor r12,r12 cmp r15,r14 vaesenc xmm9,xmm9,xmm2 vmovdqu xmm0,XMMWORD[((48+8))+rsp] vpxor xmm13,xmm13,xmm15 vpclmulqdq xmm1,xmm7,xmm3,0x00 vaesenc xmm10,xmm10,xmm2 vpxor xmm14,xmm14,xmm15 setnc r12b vpclmulqdq xmm7,xmm7,xmm3,0x11 vaesenc xmm11,xmm11,xmm2 vmovdqu xmm3,XMMWORD[((16-32))+rsi] neg r12 vaesenc xmm12,xmm12,xmm2 vpxor xmm6,xmm6,xmm5 vpclmulqdq xmm5,xmm0,xmm3,0x00 vpxor xmm8,xmm8,xmm4 vaesenc xmm13,xmm13,xmm2 vpxor xmm4,xmm1,xmm5 and r12,0x60 vmovups xmm15,XMMWORD[((32-128))+r9] vpclmulqdq xmm1,xmm0,xmm3,0x10 vaesenc xmm14,xmm14,xmm2 vpclmulqdq xmm2,xmm0,xmm3,0x01 lea r14,[r12*1+r14] vaesenc xmm9,xmm9,xmm15 vpxor xmm8,xmm8,XMMWORD[((16+8))+rsp] vpclmulqdq xmm3,xmm0,xmm3,0x11 vmovdqu xmm0,XMMWORD[((64+8))+rsp] vaesenc xmm10,xmm10,xmm15 movbe r13,QWORD[88+r14] vaesenc xmm11,xmm11,xmm15 movbe r12,QWORD[80+r14] vaesenc xmm12,xmm12,xmm15 mov QWORD[((32+8))+rsp],r13 vaesenc xmm13,xmm13,xmm15 mov QWORD[((40+8))+rsp],r12 vmovdqu xmm5,XMMWORD[((48-32))+rsi] vaesenc xmm14,xmm14,xmm15 vmovups xmm15,XMMWORD[((48-128))+r9] vpxor xmm6,xmm6,xmm1 vpclmulqdq xmm1,xmm0,xmm5,0x00 vaesenc xmm9,xmm9,xmm15 vpxor xmm6,xmm6,xmm2 vpclmulqdq xmm2,xmm0,xmm5,0x10 vaesenc xmm10,xmm10,xmm15 vpxor xmm7,xmm7,xmm3 vpclmulqdq xmm3,xmm0,xmm5,0x01 vaesenc xmm11,xmm11,xmm15 vpclmulqdq xmm5,xmm0,xmm5,0x11 vmovdqu xmm0,XMMWORD[((80+8))+rsp] vaesenc xmm12,xmm12,xmm15 vaesenc xmm13,xmm13,xmm15 vpxor xmm4,xmm4,xmm1 vmovdqu xmm1,XMMWORD[((64-32))+rsi] vaesenc xmm14,xmm14,xmm15 vmovups xmm15,XMMWORD[((64-128))+r9] vpxor xmm6,xmm6,xmm2 vpclmulqdq xmm2,xmm0,xmm1,0x00 vaesenc xmm9,xmm9,xmm15 vpxor xmm6,xmm6,xmm3 vpclmulqdq 
xmm3,xmm0,xmm1,0x10 vaesenc xmm10,xmm10,xmm15 movbe r13,QWORD[72+r14] vpxor xmm7,xmm7,xmm5 vpclmulqdq xmm5,xmm0,xmm1,0x01 vaesenc xmm11,xmm11,xmm15 movbe r12,QWORD[64+r14] vpclmulqdq xmm1,xmm0,xmm1,0x11 vmovdqu xmm0,XMMWORD[((96+8))+rsp] vaesenc xmm12,xmm12,xmm15 mov QWORD[((48+8))+rsp],r13 vaesenc xmm13,xmm13,xmm15 mov QWORD[((56+8))+rsp],r12 vpxor xmm4,xmm4,xmm2 vmovdqu xmm2,XMMWORD[((96-32))+rsi] vaesenc xmm14,xmm14,xmm15 vmovups xmm15,XMMWORD[((80-128))+r9] vpxor xmm6,xmm6,xmm3 vpclmulqdq xmm3,xmm0,xmm2,0x00 vaesenc xmm9,xmm9,xmm15 vpxor xmm6,xmm6,xmm5 vpclmulqdq xmm5,xmm0,xmm2,0x10 vaesenc xmm10,xmm10,xmm15 movbe r13,QWORD[56+r14] vpxor xmm7,xmm7,xmm1 vpclmulqdq xmm1,xmm0,xmm2,0x01 vpxor xmm8,xmm8,XMMWORD[((112+8))+rsp] vaesenc xmm11,xmm11,xmm15 movbe r12,QWORD[48+r14] vpclmulqdq xmm2,xmm0,xmm2,0x11 vaesenc xmm12,xmm12,xmm15 mov QWORD[((64+8))+rsp],r13 vaesenc xmm13,xmm13,xmm15 mov QWORD[((72+8))+rsp],r12 vpxor xmm4,xmm4,xmm3 vmovdqu xmm3,XMMWORD[((112-32))+rsi] vaesenc xmm14,xmm14,xmm15 vmovups xmm15,XMMWORD[((96-128))+r9] vpxor xmm6,xmm6,xmm5 vpclmulqdq xmm5,xmm8,xmm3,0x10 vaesenc xmm9,xmm9,xmm15 vpxor xmm6,xmm6,xmm1 vpclmulqdq xmm1,xmm8,xmm3,0x01 vaesenc xmm10,xmm10,xmm15 movbe r13,QWORD[40+r14] vpxor xmm7,xmm7,xmm2 vpclmulqdq xmm2,xmm8,xmm3,0x00 vaesenc xmm11,xmm11,xmm15 movbe r12,QWORD[32+r14] vpclmulqdq xmm8,xmm8,xmm3,0x11 vaesenc xmm12,xmm12,xmm15 mov QWORD[((80+8))+rsp],r13 vaesenc xmm13,xmm13,xmm15 mov QWORD[((88+8))+rsp],r12 vpxor xmm6,xmm6,xmm5 vaesenc xmm14,xmm14,xmm15 vpxor xmm6,xmm6,xmm1 vmovups xmm15,XMMWORD[((112-128))+r9] vpslldq xmm5,xmm6,8 vpxor xmm4,xmm4,xmm2 vmovdqu xmm3,XMMWORD[16+r11] vaesenc xmm9,xmm9,xmm15 vpxor xmm7,xmm7,xmm8 vaesenc xmm10,xmm10,xmm15 vpxor xmm4,xmm4,xmm5 movbe r13,QWORD[24+r14] vaesenc xmm11,xmm11,xmm15 movbe r12,QWORD[16+r14] vpalignr xmm0,xmm4,xmm4,8 vpclmulqdq xmm4,xmm4,xmm3,0x10 mov QWORD[((96+8))+rsp],r13 vaesenc xmm12,xmm12,xmm15 mov QWORD[((104+8))+rsp],r12 vaesenc xmm13,xmm13,xmm15 vmovups xmm1,XMMWORD[((128-128))+r9] vaesenc xmm14,xmm14,xmm15 vaesenc xmm9,xmm9,xmm1 vmovups xmm15,XMMWORD[((144-128))+r9] vaesenc xmm10,xmm10,xmm1 vpsrldq xmm6,xmm6,8 vaesenc xmm11,xmm11,xmm1 vpxor xmm7,xmm7,xmm6 vaesenc xmm12,xmm12,xmm1 vpxor xmm4,xmm4,xmm0 movbe r13,QWORD[8+r14] vaesenc xmm13,xmm13,xmm1 movbe r12,QWORD[r14] vaesenc xmm14,xmm14,xmm1 vmovups xmm1,XMMWORD[((160-128))+r9] cmp r10d,11 jb NEAR $L$enc_tail vaesenc xmm9,xmm9,xmm15 vaesenc xmm10,xmm10,xmm15 vaesenc xmm11,xmm11,xmm15 vaesenc xmm12,xmm12,xmm15 vaesenc xmm13,xmm13,xmm15 vaesenc xmm14,xmm14,xmm15 vaesenc xmm9,xmm9,xmm1 vaesenc xmm10,xmm10,xmm1 vaesenc xmm11,xmm11,xmm1 vaesenc xmm12,xmm12,xmm1 vaesenc xmm13,xmm13,xmm1 vmovups xmm15,XMMWORD[((176-128))+r9] vaesenc xmm14,xmm14,xmm1 vmovups xmm1,XMMWORD[((192-128))+r9] vaesenc xmm9,xmm9,xmm15 vaesenc xmm10,xmm10,xmm15 vaesenc xmm11,xmm11,xmm15 vaesenc xmm12,xmm12,xmm15 vaesenc xmm13,xmm13,xmm15 vaesenc xmm14,xmm14,xmm15 vaesenc xmm9,xmm9,xmm1 vaesenc xmm10,xmm10,xmm1 vaesenc xmm11,xmm11,xmm1 vaesenc xmm12,xmm12,xmm1 vaesenc xmm13,xmm13,xmm1 vmovups xmm15,XMMWORD[((208-128))+r9] vaesenc xmm14,xmm14,xmm1 vmovups xmm1,XMMWORD[((224-128))+r9] jmp NEAR $L$enc_tail ALIGN 32 $L$handle_ctr32: vmovdqu xmm0,XMMWORD[r11] vpshufb xmm6,xmm1,xmm0 vmovdqu xmm5,XMMWORD[48+r11] vpaddd xmm10,xmm6,XMMWORD[64+r11] vpaddd xmm11,xmm6,xmm5 vmovdqu xmm3,XMMWORD[((0-32))+rsi] vpaddd xmm12,xmm10,xmm5 vpshufb xmm10,xmm10,xmm0 vpaddd xmm13,xmm11,xmm5 vpshufb xmm11,xmm11,xmm0 vpxor xmm10,xmm10,xmm15 vpaddd xmm14,xmm12,xmm5 vpshufb xmm12,xmm12,xmm0 vpxor 
xmm11,xmm11,xmm15 vpaddd xmm1,xmm13,xmm5 vpshufb xmm13,xmm13,xmm0 vpshufb xmm14,xmm14,xmm0 vpshufb xmm1,xmm1,xmm0 jmp NEAR $L$resume_ctr32 ALIGN 32 $L$enc_tail: vaesenc xmm9,xmm9,xmm15 vmovdqu XMMWORD[(16+8)+rsp],xmm7 vpalignr xmm8,xmm4,xmm4,8 vaesenc xmm10,xmm10,xmm15 vpclmulqdq xmm4,xmm4,xmm3,0x10 vpxor xmm2,xmm1,XMMWORD[rcx] vaesenc xmm11,xmm11,xmm15 vpxor xmm0,xmm1,XMMWORD[16+rcx] vaesenc xmm12,xmm12,xmm15 vpxor xmm5,xmm1,XMMWORD[32+rcx] vaesenc xmm13,xmm13,xmm15 vpxor xmm6,xmm1,XMMWORD[48+rcx] vaesenc xmm14,xmm14,xmm15 vpxor xmm7,xmm1,XMMWORD[64+rcx] vpxor xmm3,xmm1,XMMWORD[80+rcx] vmovdqu xmm1,XMMWORD[rdi] vaesenclast xmm9,xmm9,xmm2 vmovdqu xmm2,XMMWORD[32+r11] vaesenclast xmm10,xmm10,xmm0 vpaddb xmm0,xmm1,xmm2 mov QWORD[((112+8))+rsp],r13 lea rcx,[96+rcx] prefetcht0 [512+rcx] prefetcht0 [576+rcx] vaesenclast xmm11,xmm11,xmm5 vpaddb xmm5,xmm0,xmm2 mov QWORD[((120+8))+rsp],r12 lea rdx,[96+rdx] vmovdqu xmm15,XMMWORD[((0-128))+r9] vaesenclast xmm12,xmm12,xmm6 vpaddb xmm6,xmm5,xmm2 vaesenclast xmm13,xmm13,xmm7 vpaddb xmm7,xmm6,xmm2 vaesenclast xmm14,xmm14,xmm3 vpaddb xmm3,xmm7,xmm2 add rax,0x60 sub r8,0x6 jc NEAR $L$6x_done vmovups XMMWORD[(-96)+rdx],xmm9 vpxor xmm9,xmm1,xmm15 vmovups XMMWORD[(-80)+rdx],xmm10 vmovdqa xmm10,xmm0 vmovups XMMWORD[(-64)+rdx],xmm11 vmovdqa xmm11,xmm5 vmovups XMMWORD[(-48)+rdx],xmm12 vmovdqa xmm12,xmm6 vmovups XMMWORD[(-32)+rdx],xmm13 vmovdqa xmm13,xmm7 vmovups XMMWORD[(-16)+rdx],xmm14 vmovdqa xmm14,xmm3 vmovdqu xmm7,XMMWORD[((32+8))+rsp] jmp NEAR $L$oop6x $L$6x_done: vpxor xmm8,xmm8,XMMWORD[((16+8))+rsp] vpxor xmm8,xmm8,xmm4 ret global aesni_gcm_decrypt ALIGN 32 aesni_gcm_decrypt: $L$SEH_begin_aesni_gcm_decrypt_1: _CET_ENDBR xor rax,rax cmp r8,0x60 jb NEAR $L$gcm_dec_abort push rbp $L$SEH_prologue_aesni_gcm_decrypt_2: mov rbp,rsp push rbx $L$SEH_prologue_aesni_gcm_decrypt_3: push r12 $L$SEH_prologue_aesni_gcm_decrypt_4: push r13 $L$SEH_prologue_aesni_gcm_decrypt_5: push r14 $L$SEH_prologue_aesni_gcm_decrypt_6: push r15 $L$SEH_prologue_aesni_gcm_decrypt_7: lea rsp,[((-168))+rsp] $L$SEH_prologue_aesni_gcm_decrypt_8: $L$SEH_prologue_aesni_gcm_decrypt_9: mov QWORD[16+rbp],rdi $L$SEH_prologue_aesni_gcm_decrypt_10: mov QWORD[24+rbp],rsi $L$SEH_prologue_aesni_gcm_decrypt_11: mov rdi,QWORD[48+rbp] mov rsi,QWORD[56+rbp] movaps XMMWORD[(-208)+rbp],xmm6 $L$SEH_prologue_aesni_gcm_decrypt_12: movaps XMMWORD[(-192)+rbp],xmm7 $L$SEH_prologue_aesni_gcm_decrypt_13: movaps XMMWORD[(-176)+rbp],xmm8 $L$SEH_prologue_aesni_gcm_decrypt_14: movaps XMMWORD[(-160)+rbp],xmm9 $L$SEH_prologue_aesni_gcm_decrypt_15: movaps XMMWORD[(-144)+rbp],xmm10 $L$SEH_prologue_aesni_gcm_decrypt_16: movaps XMMWORD[(-128)+rbp],xmm11 $L$SEH_prologue_aesni_gcm_decrypt_17: movaps XMMWORD[(-112)+rbp],xmm12 $L$SEH_prologue_aesni_gcm_decrypt_18: movaps XMMWORD[(-96)+rbp],xmm13 $L$SEH_prologue_aesni_gcm_decrypt_19: movaps XMMWORD[(-80)+rbp],xmm14 $L$SEH_prologue_aesni_gcm_decrypt_20: movaps XMMWORD[(-64)+rbp],xmm15 $L$SEH_prologue_aesni_gcm_decrypt_21: $L$SEH_endprologue_aesni_gcm_decrypt_22: vzeroupper mov r12,QWORD[64+rbp] vmovdqu xmm1,XMMWORD[rdi] add rsp,-128 mov ebx,DWORD[12+rdi] lea r11,[$L$bswap_mask] lea r14,[((-128))+r9] mov r15,0xf80 vmovdqu xmm8,XMMWORD[r12] and rsp,-128 vmovdqu xmm0,XMMWORD[r11] lea r9,[128+r9] lea rsi,[32+rsi] mov r10d,DWORD[((240-128))+r9] vpshufb xmm8,xmm8,xmm0 and r14,r15 and r15,rsp sub r15,r14 jc NEAR $L$dec_no_key_aliasing cmp r15,768 jnc NEAR $L$dec_no_key_aliasing sub rsp,r15 $L$dec_no_key_aliasing: vmovdqu xmm7,XMMWORD[80+rcx] mov r14,rcx vmovdqu xmm4,XMMWORD[64+rcx] lea 
r15,[((-192))+r8*1+rcx] vmovdqu xmm5,XMMWORD[48+rcx] shr r8,4 xor rax,rax vmovdqu xmm6,XMMWORD[32+rcx] vpshufb xmm7,xmm7,xmm0 vmovdqu xmm2,XMMWORD[16+rcx] vpshufb xmm4,xmm4,xmm0 vmovdqu xmm3,XMMWORD[rcx] vpshufb xmm5,xmm5,xmm0 vmovdqu XMMWORD[48+rsp],xmm4 vpshufb xmm6,xmm6,xmm0 vmovdqu XMMWORD[64+rsp],xmm5 vpshufb xmm2,xmm2,xmm0 vmovdqu XMMWORD[80+rsp],xmm6 vpshufb xmm3,xmm3,xmm0 vmovdqu XMMWORD[96+rsp],xmm2 vmovdqu XMMWORD[112+rsp],xmm3 call _aesni_ctr32_ghash_6x mov r12,QWORD[64+rbp] vmovups XMMWORD[(-96)+rdx],xmm9 vmovups XMMWORD[(-80)+rdx],xmm10 vmovups XMMWORD[(-64)+rdx],xmm11 vmovups XMMWORD[(-48)+rdx],xmm12 vmovups XMMWORD[(-32)+rdx],xmm13 vmovups XMMWORD[(-16)+rdx],xmm14 vpshufb xmm8,xmm8,XMMWORD[r11] vmovdqu XMMWORD[r12],xmm8 vzeroupper movaps xmm6,XMMWORD[((-208))+rbp] movaps xmm7,XMMWORD[((-192))+rbp] movaps xmm8,XMMWORD[((-176))+rbp] movaps xmm9,XMMWORD[((-160))+rbp] movaps xmm10,XMMWORD[((-144))+rbp] movaps xmm11,XMMWORD[((-128))+rbp] movaps xmm12,XMMWORD[((-112))+rbp] movaps xmm13,XMMWORD[((-96))+rbp] movaps xmm14,XMMWORD[((-80))+rbp] movaps xmm15,XMMWORD[((-64))+rbp] mov rdi,QWORD[16+rbp] mov rsi,QWORD[24+rbp] lea rsp,[((-40))+rbp] pop r15 pop r14 pop r13 pop r12 pop rbx pop rbp $L$gcm_dec_abort: ret $L$SEH_end_aesni_gcm_decrypt_23: ALIGN 32 _aesni_ctr32_6x: vmovdqu xmm4,XMMWORD[((0-128))+r9] vmovdqu xmm2,XMMWORD[32+r11] lea r13,[((-1))+r10] vmovups xmm15,XMMWORD[((16-128))+r9] lea r12,[((32-128))+r9] vpxor xmm9,xmm1,xmm4 add ebx,100663296 jc NEAR $L$handle_ctr32_2 vpaddb xmm10,xmm1,xmm2 vpaddb xmm11,xmm10,xmm2 vpxor xmm10,xmm10,xmm4 vpaddb xmm12,xmm11,xmm2 vpxor xmm11,xmm11,xmm4 vpaddb xmm13,xmm12,xmm2 vpxor xmm12,xmm12,xmm4 vpaddb xmm14,xmm13,xmm2 vpxor xmm13,xmm13,xmm4 vpaddb xmm1,xmm14,xmm2 vpxor xmm14,xmm14,xmm4 jmp NEAR $L$oop_ctr32 ALIGN 16 $L$oop_ctr32: vaesenc xmm9,xmm9,xmm15 vaesenc xmm10,xmm10,xmm15 vaesenc xmm11,xmm11,xmm15 vaesenc xmm12,xmm12,xmm15 vaesenc xmm13,xmm13,xmm15 vaesenc xmm14,xmm14,xmm15 vmovups xmm15,XMMWORD[r12] lea r12,[16+r12] dec r13d jnz NEAR $L$oop_ctr32 vmovdqu xmm3,XMMWORD[r12] vaesenc xmm9,xmm9,xmm15 vpxor xmm4,xmm3,XMMWORD[rcx] vaesenc xmm10,xmm10,xmm15 vpxor xmm5,xmm3,XMMWORD[16+rcx] vaesenc xmm11,xmm11,xmm15 vpxor xmm6,xmm3,XMMWORD[32+rcx] vaesenc xmm12,xmm12,xmm15 vpxor xmm8,xmm3,XMMWORD[48+rcx] vaesenc xmm13,xmm13,xmm15 vpxor xmm2,xmm3,XMMWORD[64+rcx] vaesenc xmm14,xmm14,xmm15 vpxor xmm3,xmm3,XMMWORD[80+rcx] lea rcx,[96+rcx] vaesenclast xmm9,xmm9,xmm4 vaesenclast xmm10,xmm10,xmm5 vaesenclast xmm11,xmm11,xmm6 vaesenclast xmm12,xmm12,xmm8 vaesenclast xmm13,xmm13,xmm2 vaesenclast xmm14,xmm14,xmm3 vmovups XMMWORD[rdx],xmm9 vmovups XMMWORD[16+rdx],xmm10 vmovups XMMWORD[32+rdx],xmm11 vmovups XMMWORD[48+rdx],xmm12 vmovups XMMWORD[64+rdx],xmm13 vmovups XMMWORD[80+rdx],xmm14 lea rdx,[96+rdx] ret ALIGN 32 $L$handle_ctr32_2: vpshufb xmm6,xmm1,xmm0 vmovdqu xmm5,XMMWORD[48+r11] vpaddd xmm10,xmm6,XMMWORD[64+r11] vpaddd xmm11,xmm6,xmm5 vpaddd xmm12,xmm10,xmm5 vpshufb xmm10,xmm10,xmm0 vpaddd xmm13,xmm11,xmm5 vpshufb xmm11,xmm11,xmm0 vpxor xmm10,xmm10,xmm4 vpaddd xmm14,xmm12,xmm5 vpshufb xmm12,xmm12,xmm0 vpxor xmm11,xmm11,xmm4 vpaddd xmm1,xmm13,xmm5 vpshufb xmm13,xmm13,xmm0 vpxor xmm12,xmm12,xmm4 vpshufb xmm14,xmm14,xmm0 vpxor xmm13,xmm13,xmm4 vpshufb xmm1,xmm1,xmm0 vpxor xmm14,xmm14,xmm4 jmp NEAR $L$oop_ctr32 global aesni_gcm_encrypt ALIGN 32 aesni_gcm_encrypt: $L$SEH_begin_aesni_gcm_encrypt_1: _CET_ENDBR %ifdef BORINGSSL_DISPATCH_TEST EXTERN BORINGSSL_function_hit mov BYTE[((BORINGSSL_function_hit+2))],1 %endif xor rax,rax cmp r8,0x60*3 jb NEAR 
$L$gcm_enc_abort push rbp $L$SEH_prologue_aesni_gcm_encrypt_2: mov rbp,rsp push rbx $L$SEH_prologue_aesni_gcm_encrypt_3: push r12 $L$SEH_prologue_aesni_gcm_encrypt_4: push r13 $L$SEH_prologue_aesni_gcm_encrypt_5: push r14 $L$SEH_prologue_aesni_gcm_encrypt_6: push r15 $L$SEH_prologue_aesni_gcm_encrypt_7: lea rsp,[((-168))+rsp] $L$SEH_prologue_aesni_gcm_encrypt_8: $L$SEH_prologue_aesni_gcm_encrypt_9: mov QWORD[16+rbp],rdi $L$SEH_prologue_aesni_gcm_encrypt_10: mov QWORD[24+rbp],rsi $L$SEH_prologue_aesni_gcm_encrypt_11: mov rdi,QWORD[48+rbp] mov rsi,QWORD[56+rbp] movaps XMMWORD[(-208)+rbp],xmm6 $L$SEH_prologue_aesni_gcm_encrypt_12: movaps XMMWORD[(-192)+rbp],xmm7 $L$SEH_prologue_aesni_gcm_encrypt_13: movaps XMMWORD[(-176)+rbp],xmm8 $L$SEH_prologue_aesni_gcm_encrypt_14: movaps XMMWORD[(-160)+rbp],xmm9 $L$SEH_prologue_aesni_gcm_encrypt_15: movaps XMMWORD[(-144)+rbp],xmm10 $L$SEH_prologue_aesni_gcm_encrypt_16: movaps XMMWORD[(-128)+rbp],xmm11 $L$SEH_prologue_aesni_gcm_encrypt_17: movaps XMMWORD[(-112)+rbp],xmm12 $L$SEH_prologue_aesni_gcm_encrypt_18: movaps XMMWORD[(-96)+rbp],xmm13 $L$SEH_prologue_aesni_gcm_encrypt_19: movaps XMMWORD[(-80)+rbp],xmm14 $L$SEH_prologue_aesni_gcm_encrypt_20: movaps XMMWORD[(-64)+rbp],xmm15 $L$SEH_prologue_aesni_gcm_encrypt_21: $L$SEH_endprologue_aesni_gcm_encrypt_22: vzeroupper vmovdqu xmm1,XMMWORD[rdi] add rsp,-128 mov ebx,DWORD[12+rdi] lea r11,[$L$bswap_mask] lea r14,[((-128))+r9] mov r15,0xf80 lea r9,[128+r9] vmovdqu xmm0,XMMWORD[r11] and rsp,-128 mov r10d,DWORD[((240-128))+r9] and r14,r15 and r15,rsp sub r15,r14 jc NEAR $L$enc_no_key_aliasing cmp r15,768 jnc NEAR $L$enc_no_key_aliasing sub rsp,r15 $L$enc_no_key_aliasing: mov r14,rdx lea r15,[((-192))+r8*1+rdx] shr r8,4 call _aesni_ctr32_6x vpshufb xmm8,xmm9,xmm0 vpshufb xmm2,xmm10,xmm0 vmovdqu XMMWORD[112+rsp],xmm8 vpshufb xmm4,xmm11,xmm0 vmovdqu XMMWORD[96+rsp],xmm2 vpshufb xmm5,xmm12,xmm0 vmovdqu XMMWORD[80+rsp],xmm4 vpshufb xmm6,xmm13,xmm0 vmovdqu XMMWORD[64+rsp],xmm5 vpshufb xmm7,xmm14,xmm0 vmovdqu XMMWORD[48+rsp],xmm6 call _aesni_ctr32_6x mov r12,QWORD[64+rbp] lea rsi,[32+rsi] vmovdqu xmm8,XMMWORD[r12] sub r8,12 mov rax,0x60*2 vpshufb xmm8,xmm8,xmm0 call _aesni_ctr32_ghash_6x vmovdqu xmm7,XMMWORD[32+rsp] vmovdqu xmm0,XMMWORD[r11] vmovdqu xmm3,XMMWORD[((0-32))+rsi] vpunpckhqdq xmm1,xmm7,xmm7 vmovdqu xmm15,XMMWORD[((32-32))+rsi] vmovups XMMWORD[(-96)+rdx],xmm9 vpshufb xmm9,xmm9,xmm0 vpxor xmm1,xmm1,xmm7 vmovups XMMWORD[(-80)+rdx],xmm10 vpshufb xmm10,xmm10,xmm0 vmovups XMMWORD[(-64)+rdx],xmm11 vpshufb xmm11,xmm11,xmm0 vmovups XMMWORD[(-48)+rdx],xmm12 vpshufb xmm12,xmm12,xmm0 vmovups XMMWORD[(-32)+rdx],xmm13 vpshufb xmm13,xmm13,xmm0 vmovups XMMWORD[(-16)+rdx],xmm14 vpshufb xmm14,xmm14,xmm0 vmovdqu XMMWORD[16+rsp],xmm9 vmovdqu xmm6,XMMWORD[48+rsp] vmovdqu xmm0,XMMWORD[((16-32))+rsi] vpunpckhqdq xmm2,xmm6,xmm6 vpclmulqdq xmm5,xmm7,xmm3,0x00 vpxor xmm2,xmm2,xmm6 vpclmulqdq xmm7,xmm7,xmm3,0x11 vpclmulqdq xmm1,xmm1,xmm15,0x00 vmovdqu xmm9,XMMWORD[64+rsp] vpclmulqdq xmm4,xmm6,xmm0,0x00 vmovdqu xmm3,XMMWORD[((48-32))+rsi] vpxor xmm4,xmm4,xmm5 vpunpckhqdq xmm5,xmm9,xmm9 vpclmulqdq xmm6,xmm6,xmm0,0x11 vpxor xmm5,xmm5,xmm9 vpxor xmm6,xmm6,xmm7 vpclmulqdq xmm2,xmm2,xmm15,0x10 vmovdqu xmm15,XMMWORD[((80-32))+rsi] vpxor xmm2,xmm2,xmm1 vmovdqu xmm1,XMMWORD[80+rsp] vpclmulqdq xmm7,xmm9,xmm3,0x00 vmovdqu xmm0,XMMWORD[((64-32))+rsi] vpxor xmm7,xmm7,xmm4 vpunpckhqdq xmm4,xmm1,xmm1 vpclmulqdq xmm9,xmm9,xmm3,0x11 vpxor xmm4,xmm4,xmm1 vpxor xmm9,xmm9,xmm6 vpclmulqdq xmm5,xmm5,xmm15,0x00 vpxor xmm5,xmm5,xmm2 vmovdqu 
xmm2,XMMWORD[96+rsp] vpclmulqdq xmm6,xmm1,xmm0,0x00 vmovdqu xmm3,XMMWORD[((96-32))+rsi] vpxor xmm6,xmm6,xmm7 vpunpckhqdq xmm7,xmm2,xmm2 vpclmulqdq xmm1,xmm1,xmm0,0x11 vpxor xmm7,xmm7,xmm2 vpxor xmm1,xmm1,xmm9 vpclmulqdq xmm4,xmm4,xmm15,0x10 vmovdqu xmm15,XMMWORD[((128-32))+rsi] vpxor xmm4,xmm4,xmm5 vpxor xmm8,xmm8,XMMWORD[112+rsp] vpclmulqdq xmm5,xmm2,xmm3,0x00 vmovdqu xmm0,XMMWORD[((112-32))+rsi] vpunpckhqdq xmm9,xmm8,xmm8 vpxor xmm5,xmm5,xmm6 vpclmulqdq xmm2,xmm2,xmm3,0x11 vpxor xmm9,xmm9,xmm8 vpxor xmm2,xmm2,xmm1 vpclmulqdq xmm7,xmm7,xmm15,0x00 vpxor xmm4,xmm7,xmm4 vpclmulqdq xmm6,xmm8,xmm0,0x00 vmovdqu xmm3,XMMWORD[((0-32))+rsi] vpunpckhqdq xmm1,xmm14,xmm14 vpclmulqdq xmm8,xmm8,xmm0,0x11 vpxor xmm1,xmm1,xmm14 vpxor xmm5,xmm6,xmm5 vpclmulqdq xmm9,xmm9,xmm15,0x10 vmovdqu xmm15,XMMWORD[((32-32))+rsi] vpxor xmm7,xmm8,xmm2 vpxor xmm6,xmm9,xmm4 vmovdqu xmm0,XMMWORD[((16-32))+rsi] vpxor xmm9,xmm7,xmm5 vpclmulqdq xmm4,xmm14,xmm3,0x00 vpxor xmm6,xmm6,xmm9 vpunpckhqdq xmm2,xmm13,xmm13 vpclmulqdq xmm14,xmm14,xmm3,0x11 vpxor xmm2,xmm2,xmm13 vpslldq xmm9,xmm6,8 vpclmulqdq xmm1,xmm1,xmm15,0x00 vpxor xmm8,xmm5,xmm9 vpsrldq xmm6,xmm6,8 vpxor xmm7,xmm7,xmm6 vpclmulqdq xmm5,xmm13,xmm0,0x00 vmovdqu xmm3,XMMWORD[((48-32))+rsi] vpxor xmm5,xmm5,xmm4 vpunpckhqdq xmm9,xmm12,xmm12 vpclmulqdq xmm13,xmm13,xmm0,0x11 vpxor xmm9,xmm9,xmm12 vpxor xmm13,xmm13,xmm14 vpalignr xmm14,xmm8,xmm8,8 vpclmulqdq xmm2,xmm2,xmm15,0x10 vmovdqu xmm15,XMMWORD[((80-32))+rsi] vpxor xmm2,xmm2,xmm1 vpclmulqdq xmm4,xmm12,xmm3,0x00 vmovdqu xmm0,XMMWORD[((64-32))+rsi] vpxor xmm4,xmm4,xmm5 vpunpckhqdq xmm1,xmm11,xmm11 vpclmulqdq xmm12,xmm12,xmm3,0x11 vpxor xmm1,xmm1,xmm11 vpxor xmm12,xmm12,xmm13 vxorps xmm7,xmm7,XMMWORD[16+rsp] vpclmulqdq xmm9,xmm9,xmm15,0x00 vpxor xmm9,xmm9,xmm2 vpclmulqdq xmm8,xmm8,XMMWORD[16+r11],0x10 vxorps xmm8,xmm8,xmm14 vpclmulqdq xmm5,xmm11,xmm0,0x00 vmovdqu xmm3,XMMWORD[((96-32))+rsi] vpxor xmm5,xmm5,xmm4 vpunpckhqdq xmm2,xmm10,xmm10 vpclmulqdq xmm11,xmm11,xmm0,0x11 vpxor xmm2,xmm2,xmm10 vpalignr xmm14,xmm8,xmm8,8 vpxor xmm11,xmm11,xmm12 vpclmulqdq xmm1,xmm1,xmm15,0x10 vmovdqu xmm15,XMMWORD[((128-32))+rsi] vpxor xmm1,xmm1,xmm9 vxorps xmm14,xmm14,xmm7 vpclmulqdq xmm8,xmm8,XMMWORD[16+r11],0x10 vxorps xmm8,xmm8,xmm14 vpclmulqdq xmm4,xmm10,xmm3,0x00 vmovdqu xmm0,XMMWORD[((112-32))+rsi] vpxor xmm4,xmm4,xmm5 vpunpckhqdq xmm9,xmm8,xmm8 vpclmulqdq xmm10,xmm10,xmm3,0x11 vpxor xmm9,xmm9,xmm8 vpxor xmm10,xmm10,xmm11 vpclmulqdq xmm2,xmm2,xmm15,0x00 vpxor xmm2,xmm2,xmm1 vpclmulqdq xmm5,xmm8,xmm0,0x00 vpclmulqdq xmm7,xmm8,xmm0,0x11 vpxor xmm5,xmm5,xmm4 vpclmulqdq xmm6,xmm9,xmm15,0x10 vpxor xmm7,xmm7,xmm10 vpxor xmm6,xmm6,xmm2 vpxor xmm4,xmm7,xmm5 vpxor xmm6,xmm6,xmm4 vpslldq xmm1,xmm6,8 vmovdqu xmm3,XMMWORD[16+r11] vpsrldq xmm6,xmm6,8 vpxor xmm8,xmm5,xmm1 vpxor xmm7,xmm7,xmm6 vpalignr xmm2,xmm8,xmm8,8 vpclmulqdq xmm8,xmm8,xmm3,0x10 vpxor xmm8,xmm8,xmm2 vpalignr xmm2,xmm8,xmm8,8 vpclmulqdq xmm8,xmm8,xmm3,0x10 vpxor xmm2,xmm2,xmm7 vpxor xmm8,xmm8,xmm2 mov r12,QWORD[64+rbp] vpshufb xmm8,xmm8,XMMWORD[r11] vmovdqu XMMWORD[r12],xmm8 vzeroupper movaps xmm6,XMMWORD[((-208))+rbp] movaps xmm7,XMMWORD[((-192))+rbp] movaps xmm8,XMMWORD[((-176))+rbp] movaps xmm9,XMMWORD[((-160))+rbp] movaps xmm10,XMMWORD[((-144))+rbp] movaps xmm11,XMMWORD[((-128))+rbp] movaps xmm12,XMMWORD[((-112))+rbp] movaps xmm13,XMMWORD[((-96))+rbp] movaps xmm14,XMMWORD[((-80))+rbp] movaps xmm15,XMMWORD[((-64))+rbp] mov rdi,QWORD[16+rbp] mov rsi,QWORD[24+rbp] lea rsp,[((-40))+rbp] pop r15 pop r14 pop r13 pop r12 pop rbx pop rbp $L$gcm_enc_abort: ret 
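; aesni_gcm_decrypt and aesni_gcm_encrypt above share the 6-block kernel
; _aesni_ctr32_ghash_6x: decrypt bails out early (rax = 0) for inputs
; shorter than 0x60 (96) bytes, encrypt for inputs shorter than 0x60*3
; (288) bytes. Otherwise data is handled in 96-byte groups that
; interleave six AES-NI streams with PCLMULQDQ GHASH updates, and rax
; ends up holding the number of bytes consumed (a multiple of 96).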
$L$SEH_end_aesni_gcm_encrypt_23: section .rdata rdata align=8 ALIGN 64 $L$bswap_mask: DB 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 $L$poly: DB 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 $L$one_msb: DB 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 $L$two_lsb: DB 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 $L$one_lsb: DB 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 DB 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108 DB 101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82 DB 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112 DB 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 ALIGN 64 section .text section .pdata rdata align=4 ALIGN 4 DD $L$SEH_begin_aesni_gcm_decrypt_1 wrt ..imagebase DD $L$SEH_end_aesni_gcm_decrypt_23 wrt ..imagebase DD $L$SEH_info_aesni_gcm_decrypt_0 wrt ..imagebase DD $L$SEH_begin_aesni_gcm_encrypt_1 wrt ..imagebase DD $L$SEH_end_aesni_gcm_encrypt_23 wrt ..imagebase DD $L$SEH_info_aesni_gcm_encrypt_0 wrt ..imagebase section .xdata rdata align=8 ALIGN 4 $L$SEH_info_aesni_gcm_decrypt_0: DB 1 DB $L$SEH_endprologue_aesni_gcm_decrypt_22-$L$SEH_begin_aesni_gcm_decrypt_1 DB 33 DB 213 DB $L$SEH_prologue_aesni_gcm_decrypt_21-$L$SEH_begin_aesni_gcm_decrypt_1 DB 248 DW 9 DB $L$SEH_prologue_aesni_gcm_decrypt_20-$L$SEH_begin_aesni_gcm_decrypt_1 DB 232 DW 8 DB $L$SEH_prologue_aesni_gcm_decrypt_19-$L$SEH_begin_aesni_gcm_decrypt_1 DB 216 DW 7 DB $L$SEH_prologue_aesni_gcm_decrypt_18-$L$SEH_begin_aesni_gcm_decrypt_1 DB 200 DW 6 DB $L$SEH_prologue_aesni_gcm_decrypt_17-$L$SEH_begin_aesni_gcm_decrypt_1 DB 184 DW 5 DB $L$SEH_prologue_aesni_gcm_decrypt_16-$L$SEH_begin_aesni_gcm_decrypt_1 DB 168 DW 4 DB $L$SEH_prologue_aesni_gcm_decrypt_15-$L$SEH_begin_aesni_gcm_decrypt_1 DB 152 DW 3 DB $L$SEH_prologue_aesni_gcm_decrypt_14-$L$SEH_begin_aesni_gcm_decrypt_1 DB 136 DW 2 DB $L$SEH_prologue_aesni_gcm_decrypt_13-$L$SEH_begin_aesni_gcm_decrypt_1 DB 120 DW 1 DB $L$SEH_prologue_aesni_gcm_decrypt_12-$L$SEH_begin_aesni_gcm_decrypt_1 DB 104 DW 0 DB $L$SEH_prologue_aesni_gcm_decrypt_11-$L$SEH_begin_aesni_gcm_decrypt_1 DB 100 DW 29 DB $L$SEH_prologue_aesni_gcm_decrypt_10-$L$SEH_begin_aesni_gcm_decrypt_1 DB 116 DW 28 DB $L$SEH_prologue_aesni_gcm_decrypt_9-$L$SEH_begin_aesni_gcm_decrypt_1 DB 3 DB $L$SEH_prologue_aesni_gcm_decrypt_8-$L$SEH_begin_aesni_gcm_decrypt_1 DB 1 DW 21 DB $L$SEH_prologue_aesni_gcm_decrypt_7-$L$SEH_begin_aesni_gcm_decrypt_1 DB 240 DB $L$SEH_prologue_aesni_gcm_decrypt_6-$L$SEH_begin_aesni_gcm_decrypt_1 DB 224 DB $L$SEH_prologue_aesni_gcm_decrypt_5-$L$SEH_begin_aesni_gcm_decrypt_1 DB 208 DB $L$SEH_prologue_aesni_gcm_decrypt_4-$L$SEH_begin_aesni_gcm_decrypt_1 DB 192 DB $L$SEH_prologue_aesni_gcm_decrypt_3-$L$SEH_begin_aesni_gcm_decrypt_1 DB 48 DB $L$SEH_prologue_aesni_gcm_decrypt_2-$L$SEH_begin_aesni_gcm_decrypt_1 DB 80 DW 0 $L$SEH_info_aesni_gcm_encrypt_0: DB 1 DB $L$SEH_endprologue_aesni_gcm_encrypt_22-$L$SEH_begin_aesni_gcm_encrypt_1 DB 33 DB 213 DB $L$SEH_prologue_aesni_gcm_encrypt_21-$L$SEH_begin_aesni_gcm_encrypt_1 DB 248 DW 9 DB $L$SEH_prologue_aesni_gcm_encrypt_20-$L$SEH_begin_aesni_gcm_encrypt_1 DB 232 DW 8 DB $L$SEH_prologue_aesni_gcm_encrypt_19-$L$SEH_begin_aesni_gcm_encrypt_1 DB 216 DW 7 DB $L$SEH_prologue_aesni_gcm_encrypt_18-$L$SEH_begin_aesni_gcm_encrypt_1 DB 200 DW 6 DB $L$SEH_prologue_aesni_gcm_encrypt_17-$L$SEH_begin_aesni_gcm_encrypt_1 DB 184 DW 5 DB $L$SEH_prologue_aesni_gcm_encrypt_16-$L$SEH_begin_aesni_gcm_encrypt_1 DB 168 DW 4 DB $L$SEH_prologue_aesni_gcm_encrypt_15-$L$SEH_begin_aesni_gcm_encrypt_1 DB 152 DW 3 DB $L$SEH_prologue_aesni_gcm_encrypt_14-$L$SEH_begin_aesni_gcm_encrypt_1 DB 136 DW 2 DB 
$L$SEH_prologue_aesni_gcm_encrypt_13-$L$SEH_begin_aesni_gcm_encrypt_1 DB 120 DW 1 DB $L$SEH_prologue_aesni_gcm_encrypt_12-$L$SEH_begin_aesni_gcm_encrypt_1 DB 104 DW 0 DB $L$SEH_prologue_aesni_gcm_encrypt_11-$L$SEH_begin_aesni_gcm_encrypt_1 DB 100 DW 29 DB $L$SEH_prologue_aesni_gcm_encrypt_10-$L$SEH_begin_aesni_gcm_encrypt_1 DB 116 DW 28 DB $L$SEH_prologue_aesni_gcm_encrypt_9-$L$SEH_begin_aesni_gcm_encrypt_1 DB 3 DB $L$SEH_prologue_aesni_gcm_encrypt_8-$L$SEH_begin_aesni_gcm_encrypt_1 DB 1 DW 21 DB $L$SEH_prologue_aesni_gcm_encrypt_7-$L$SEH_begin_aesni_gcm_encrypt_1 DB 240 DB $L$SEH_prologue_aesni_gcm_encrypt_6-$L$SEH_begin_aesni_gcm_encrypt_1 DB 224 DB $L$SEH_prologue_aesni_gcm_encrypt_5-$L$SEH_begin_aesni_gcm_encrypt_1 DB 208 DB $L$SEH_prologue_aesni_gcm_encrypt_4-$L$SEH_begin_aesni_gcm_encrypt_1 DB 192 DB $L$SEH_prologue_aesni_gcm_encrypt_3-$L$SEH_begin_aesni_gcm_encrypt_1 DB 48 DB $L$SEH_prologue_aesni_gcm_encrypt_2-$L$SEH_begin_aesni_gcm_encrypt_1 DB 80 DW 0 %else ; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 ret %endif ring-0.17.14/pregenerated/aesni-gcm-x86_64-nasm.o000064400000000000000000000427171046102023000173300ustar 00000000000000dg$8S.debug$S"#@B.debug$T8x((@B.text (l6 p`.rdata6@7@p@.pdata@7X7@0@.xdata7$8@@@:C:\Users\b\p\ring\pregenerated\aesni-gcm-x86_64-nasm.asmb(wuCURfRqBK@ 4  $(-38 @"F#L$Q%U&Z'_*c+i,n-t.z@}ACDEFGHIJKLMNOPQRSTUVWYZ[\]^ _`abc$d)e.f3g8h=jCkGlMmRnVo\paqerksptvu|vwxyz|}~  %+/5;@FLQV[`dintx~ #(-27<BGLQUZ^dinsy}  %*0 6 : ? C HLQVZ_dhmrw| !"#$%&'()*+-./01245678 9:;< =%>)?.@2B6C:D@FEGJHOISJXK\LaMeNjOnPsQwR}SVWY^cghjmorux{~  #*.4:>CJNRWZ]`fmsv{~ $,4<AFKPUY]acegijkl      !"#$%&' ( )+,-!.&/+00152:3?4D5I6N7S8W:\;a<f=k>p?u@yA~BCDEFHIKLMNOPQRSTUVWXYZ[\]^d m r s u x z }    % ) - 1 5 < C K S [ ` e j o t w {                                  # ) - 2 7 < B G L P T Y ^ b g l q v {                                 $ ( . 4 9 = A G K P V [ !_ #e $k %p &u 'y ( ) * + , . / 0 1 2 3 4 5 6 7 9 : ; < = > ? @ A B C D F G H I J K L$ M* N0 O5 P9 R? 
0@ 4@ _A cA B B C C D D  E  E J F N F y G } G H H I I J J !K !K D!L H!L _!M c!M t!N x!N !O !O !P !P !Q !Q !R !R zoS IAzoyq)!Aqd$o^ʼnA)A!ADAxQADM1M9b1oD$8AADb)A AADb!o^IbyD9bI`AxyyDb yDO4&B19D$yDoD$HB)M8nXB!M8fPBLl$(BLd$0onB AxyyDB1yDB)yDB!yDoD$XBBoN B AxyyDB1yDB)M8nHyDB!M8f@yDoD$hBLl$8BLd$@oV@B AxyyDB1yDB)M8n8yD9D$xB!M8f0yDBLl$HBLd$Po^PB Axy9DB19DB)M8n(9DB!M8f c9DBLl$XBLd$`B Axyszo[B1AB)M8nB!M8fYYDLl$hBLd$pBx B b1Axyb)sb!bM8nbM8&b xI A B1B)B!BBB b1b)b!bbAxy0b xI@B1B)B!BBB b1b)b!bbAxyPb xI`hzoqzok0AIS@Io^)b)!b!A)bA!őbb qB1|$cYB)YDB!ABi Bq0B y@YPob1zoS b)Ll$xHI`@b!L$HR`Azoybbb H`IBxJAqxRyoxZyoxbyoxjyoxryoo|$(9D$9ÐH1I`UHSATAUAVAWH$XH}HuH}0Hu8)0)@D)PD)`D)pD)]D)eD)mD)uD)}wLe@oHĀ_ LMqAAzo$HzoMHv EQpb9M!I!M)IL)oyPIoa@N@oi0IH1oq AoQYoQd$0Il$@it$PaT$`\$pLe@xJxRxZxbxjxrB9Az$w(0(@D(PD(`D(pD(]D(eD(mD(uD(}H}HuHeA_A^A]A\[]ÐzoazoS MjAxyMaq́q))!!ʼn B1B)B!BBB Ax<$Md$Azo$B1!B)iB!q BaA0BQ@B YPHI`b1b)b!Bbb x xRxZ xb0xj@xrPHR`Ðqzok0AIS@I)b)!b!)b!őbb q H1I UHSATAUAVAWH$XH}HuH}0Hu8)0)@D)PD)`D)pD)]D)eD)mD)uD)}woHĀ_ LMqAMzoHEQpM!I!M)IL)IN@Ib1)zD$p!T$`d$Pl$@ t$0eLe@Hv Azo$I b9o|$ zoo^mzo>xJb1xRb)xZb!xbbxjbxrb zL$ot$0oFmADADqDzoL$@IDo^1mIDQiDzo~0oL$P1DoF mc1D1QDoT$`qDo^@mqDqYDzo~`9D$piDoFPA9miDA1AD9Do^ mc9DqC1Dzo>ŹűoFA DImc DiűsqDAQsDo^AmcDA1AC9iDzo~0DoF !mcDqAW|$C1D1C9DCA8W!Do^@)mc!DiC9A!qDzo~`qWC9DCA8W)DoFPA9mc)DA1A)iD9D9D1DAszo[sQ9c9D99c9D9Le@B9Az$w(0(@D(PD(`D(pD(]D(eD(mD(uD(}H}HuHeA_A^A]A\[]&  AES-NI GCM module for x86_64, CRYPTOGAMS by l H   q!q lgb]XPH@x9h*d&t""0Pt!t oje`[SKCx #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) .text #ifdef BORINGSSL_DISPATCH_TEST #endif .hidden _aesni_encrypt2 .type _aesni_encrypt2,@function .align 16 _aesni_encrypt2: movups (%edx),%xmm0 shll $4,%ecx movups 16(%edx),%xmm1 xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 movups 32(%edx),%xmm0 leal 32(%edx,%ecx,1),%edx negl %ecx addl $16,%ecx .L000enc2_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 movups (%edx,%ecx,1),%xmm1 addl $32,%ecx .byte 102,15,56,220,208 .byte 102,15,56,220,216 movups -16(%edx,%ecx,1),%xmm0 jnz .L000enc2_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,221,208 .byte 102,15,56,221,216 ret .size _aesni_encrypt2,.-_aesni_encrypt2 .hidden _aesni_encrypt3 .type _aesni_encrypt3,@function .align 16 _aesni_encrypt3: movups (%edx),%xmm0 shll $4,%ecx movups 16(%edx),%xmm1 xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 pxor %xmm0,%xmm4 movups 32(%edx),%xmm0 leal 32(%edx,%ecx,1),%edx negl %ecx addl $16,%ecx .L001enc3_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 movups (%edx,%ecx,1),%xmm1 addl $32,%ecx .byte 102,15,56,220,208 .byte 102,15,56,220,216 .byte 102,15,56,220,224 movups -16(%edx,%ecx,1),%xmm0 jnz .L001enc3_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 .byte 102,15,56,221,208 .byte 102,15,56,221,216 .byte 102,15,56,221,224 ret .size _aesni_encrypt3,.-_aesni_encrypt3 .hidden _aesni_encrypt4 .type _aesni_encrypt4,@function .align 16 _aesni_encrypt4: movups (%edx),%xmm0 movups 16(%edx),%xmm1 shll $4,%ecx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 pxor %xmm0,%xmm4 pxor %xmm0,%xmm5 movups 32(%edx),%xmm0 leal 32(%edx,%ecx,1),%edx negl %ecx .byte 15,31,64,0 addl $16,%ecx .L002enc4_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 .byte 102,15,56,220,233 movups (%edx,%ecx,1),%xmm1 addl $32,%ecx .byte 102,15,56,220,208 .byte 102,15,56,220,216 .byte 102,15,56,220,224 .byte 102,15,56,220,232 movups -16(%edx,%ecx,1),%xmm0 jnz .L002enc4_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 
.byte 102,15,56,220,233 .byte 102,15,56,221,208 .byte 102,15,56,221,216 .byte 102,15,56,221,224 .byte 102,15,56,221,232 ret .size _aesni_encrypt4,.-_aesni_encrypt4 .hidden _aesni_encrypt6 .type _aesni_encrypt6,@function .align 16 _aesni_encrypt6: movups (%edx),%xmm0 shll $4,%ecx movups 16(%edx),%xmm1 xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 pxor %xmm0,%xmm4 .byte 102,15,56,220,209 pxor %xmm0,%xmm5 pxor %xmm0,%xmm6 .byte 102,15,56,220,217 leal 32(%edx,%ecx,1),%edx negl %ecx .byte 102,15,56,220,225 pxor %xmm0,%xmm7 movups (%edx,%ecx,1),%xmm0 addl $16,%ecx jmp .L003_aesni_encrypt6_inner .align 16 .L004enc6_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 .L003_aesni_encrypt6_inner: .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 .L_aesni_encrypt6_enter: movups (%edx,%ecx,1),%xmm1 addl $32,%ecx .byte 102,15,56,220,208 .byte 102,15,56,220,216 .byte 102,15,56,220,224 .byte 102,15,56,220,232 .byte 102,15,56,220,240 .byte 102,15,56,220,248 movups -16(%edx,%ecx,1),%xmm0 jnz .L004enc6_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 .byte 102,15,56,221,208 .byte 102,15,56,221,216 .byte 102,15,56,221,224 .byte 102,15,56,221,232 .byte 102,15,56,221,240 .byte 102,15,56,221,248 ret .size _aesni_encrypt6,.-_aesni_encrypt6 .globl aes_hw_ctr32_encrypt_blocks .hidden aes_hw_ctr32_encrypt_blocks .type aes_hw_ctr32_encrypt_blocks,@function .align 16 aes_hw_ctr32_encrypt_blocks: .L_aes_hw_ctr32_encrypt_blocks_begin: pushl %ebp pushl %ebx pushl %esi pushl %edi #ifdef BORINGSSL_DISPATCH_TEST pushl %ebx pushl %edx call .L005pic_for_function_hit .L005pic_for_function_hit: popl %ebx leal BORINGSSL_function_hit+0-.L005pic_for_function_hit(%ebx),%ebx movl $1,%edx movb %dl,(%ebx) popl %edx popl %ebx #endif movl 20(%esp),%esi movl 24(%esp),%edi movl 28(%esp),%eax movl 32(%esp),%edx movl 36(%esp),%ebx movl %esp,%ebp subl $88,%esp andl $-16,%esp movl %ebp,80(%esp) cmpl $1,%eax je .L006ctr32_one_shortcut movdqu (%ebx),%xmm7 movl $202182159,(%esp) movl $134810123,4(%esp) movl $67438087,8(%esp) movl $66051,12(%esp) movl $6,%ecx xorl %ebp,%ebp movl %ecx,16(%esp) movl %ecx,20(%esp) movl %ecx,24(%esp) movl %ebp,28(%esp) .byte 102,15,58,22,251,3 .byte 102,15,58,34,253,3 movl 240(%edx),%ecx bswap %ebx pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 movdqa (%esp),%xmm2 .byte 102,15,58,34,195,0 leal 3(%ebx),%ebp .byte 102,15,58,34,205,0 incl %ebx .byte 102,15,58,34,195,1 incl %ebp .byte 102,15,58,34,205,1 incl %ebx .byte 102,15,58,34,195,2 incl %ebp .byte 102,15,58,34,205,2 movdqa %xmm0,48(%esp) .byte 102,15,56,0,194 movdqu (%edx),%xmm6 movdqa %xmm1,64(%esp) .byte 102,15,56,0,202 pshufd $192,%xmm0,%xmm2 pshufd $128,%xmm0,%xmm3 cmpl $6,%eax jb .L007ctr32_tail pxor %xmm6,%xmm7 shll $4,%ecx movl $16,%ebx movdqa %xmm7,32(%esp) movl %edx,%ebp subl %ecx,%ebx leal 32(%edx,%ecx,1),%edx subl $6,%eax jmp .L008ctr32_loop6 .align 16 .L008ctr32_loop6: pshufd $64,%xmm0,%xmm4 movdqa 32(%esp),%xmm0 pshufd $192,%xmm1,%xmm5 pxor %xmm0,%xmm2 pshufd $128,%xmm1,%xmm6 pxor %xmm0,%xmm3 pshufd $64,%xmm1,%xmm7 movups 16(%ebp),%xmm1 pxor %xmm0,%xmm4 pxor %xmm0,%xmm5 .byte 102,15,56,220,209 pxor %xmm0,%xmm6 pxor %xmm0,%xmm7 .byte 102,15,56,220,217 movups 32(%ebp),%xmm0 movl %ebx,%ecx .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 call .L_aesni_encrypt6_enter movups (%esi),%xmm1 movups 16(%esi),%xmm0 xorps %xmm1,%xmm2 movups 32(%esi),%xmm1 xorps %xmm0,%xmm3 movups %xmm2,(%edi) 
movdqa 16(%esp),%xmm0 xorps %xmm1,%xmm4 movdqa 64(%esp),%xmm1 movups %xmm3,16(%edi) movups %xmm4,32(%edi) paddd %xmm0,%xmm1 paddd 48(%esp),%xmm0 movdqa (%esp),%xmm2 movups 48(%esi),%xmm3 movups 64(%esi),%xmm4 xorps %xmm3,%xmm5 movups 80(%esi),%xmm3 leal 96(%esi),%esi movdqa %xmm0,48(%esp) .byte 102,15,56,0,194 xorps %xmm4,%xmm6 movups %xmm5,48(%edi) xorps %xmm3,%xmm7 movdqa %xmm1,64(%esp) .byte 102,15,56,0,202 movups %xmm6,64(%edi) pshufd $192,%xmm0,%xmm2 movups %xmm7,80(%edi) leal 96(%edi),%edi pshufd $128,%xmm0,%xmm3 subl $6,%eax jnc .L008ctr32_loop6 addl $6,%eax jz .L009ctr32_ret movdqu (%ebp),%xmm7 movl %ebp,%edx pxor 32(%esp),%xmm7 movl 240(%ebp),%ecx .L007ctr32_tail: por %xmm7,%xmm2 cmpl $2,%eax jb .L010ctr32_one pshufd $64,%xmm0,%xmm4 por %xmm7,%xmm3 je .L011ctr32_two pshufd $192,%xmm1,%xmm5 por %xmm7,%xmm4 cmpl $4,%eax jb .L012ctr32_three pshufd $128,%xmm1,%xmm6 por %xmm7,%xmm5 je .L013ctr32_four por %xmm7,%xmm6 call _aesni_encrypt6 movups (%esi),%xmm1 movups 16(%esi),%xmm0 xorps %xmm1,%xmm2 movups 32(%esi),%xmm1 xorps %xmm0,%xmm3 movups 48(%esi),%xmm0 xorps %xmm1,%xmm4 movups 64(%esi),%xmm1 xorps %xmm0,%xmm5 movups %xmm2,(%edi) xorps %xmm1,%xmm6 movups %xmm3,16(%edi) movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) jmp .L009ctr32_ret .align 16 .L006ctr32_one_shortcut: movups (%ebx),%xmm2 movl 240(%edx),%ecx .L010ctr32_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 .L014enc1_loop_1: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx jnz .L014enc1_loop_1 .byte 102,15,56,221,209 movups (%esi),%xmm6 xorps %xmm2,%xmm6 movups %xmm6,(%edi) jmp .L009ctr32_ret .align 16 .L011ctr32_two: call _aesni_encrypt2 movups (%esi),%xmm5 movups 16(%esi),%xmm6 xorps %xmm5,%xmm2 xorps %xmm6,%xmm3 movups %xmm2,(%edi) movups %xmm3,16(%edi) jmp .L009ctr32_ret .align 16 .L012ctr32_three: call _aesni_encrypt3 movups (%esi),%xmm5 movups 16(%esi),%xmm6 xorps %xmm5,%xmm2 movups 32(%esi),%xmm7 xorps %xmm6,%xmm3 movups %xmm2,(%edi) xorps %xmm7,%xmm4 movups %xmm3,16(%edi) movups %xmm4,32(%edi) jmp .L009ctr32_ret .align 16 .L013ctr32_four: call _aesni_encrypt4 movups (%esi),%xmm6 movups 16(%esi),%xmm7 movups 32(%esi),%xmm1 xorps %xmm6,%xmm2 movups 48(%esi),%xmm0 xorps %xmm7,%xmm3 movups %xmm2,(%edi) xorps %xmm1,%xmm4 movups %xmm3,16(%edi) xorps %xmm0,%xmm5 movups %xmm4,32(%edi) movups %xmm5,48(%edi) .L009ctr32_ret: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 pxor %xmm3,%xmm3 pxor %xmm4,%xmm4 movdqa %xmm0,32(%esp) pxor %xmm5,%xmm5 movdqa %xmm0,48(%esp) pxor %xmm6,%xmm6 movdqa %xmm0,64(%esp) pxor %xmm7,%xmm7 movl 80(%esp),%esp popl %edi popl %esi popl %ebx popl %ebp ret .size aes_hw_ctr32_encrypt_blocks,.-.L_aes_hw_ctr32_encrypt_blocks_begin .globl aes_hw_set_encrypt_key_base .hidden aes_hw_set_encrypt_key_base .type aes_hw_set_encrypt_key_base,@function .align 16 aes_hw_set_encrypt_key_base: .L_aes_hw_set_encrypt_key_base_begin: #ifdef BORINGSSL_DISPATCH_TEST pushl %ebx pushl %edx call .L015pic_for_function_hit .L015pic_for_function_hit: popl %ebx leal BORINGSSL_function_hit+3-.L015pic_for_function_hit(%ebx),%ebx movl $1,%edx movb %dl,(%ebx) popl %edx popl %ebx #endif movl 4(%esp),%eax movl 8(%esp),%ecx movl 12(%esp),%edx pushl %ebx call .L016pic .L016pic: popl %ebx leal .Lkey_const-.L016pic(%ebx),%ebx movups (%eax),%xmm0 xorps %xmm4,%xmm4 leal 16(%edx),%edx cmpl $256,%ecx je .L01714rounds cmpl $128,%ecx jne .L018bad_keybits .align 16 .L01910rounds: movl $9,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,200,1 call .L020key_128_cold 
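# The .byte 102,15,58,223,200,N sequences in this key expansion hand-encode
# aeskeygenassist $N,%xmm0,%xmm1 (presumably so that assemblers without
# AES-NI mnemonics can still build the file). The immediates
# 1,2,4,8,16,32,64,128,27,54 are the AES-128 round constants, and each
# result is folded into the previous round key by .L020key_128_cold /
# .L021key_128 below.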
.byte 102,15,58,223,200,2 call .L021key_128 .byte 102,15,58,223,200,4 call .L021key_128 .byte 102,15,58,223,200,8 call .L021key_128 .byte 102,15,58,223,200,16 call .L021key_128 .byte 102,15,58,223,200,32 call .L021key_128 .byte 102,15,58,223,200,64 call .L021key_128 .byte 102,15,58,223,200,128 call .L021key_128 .byte 102,15,58,223,200,27 call .L021key_128 .byte 102,15,58,223,200,54 call .L021key_128 movups %xmm0,(%edx) movl %ecx,80(%edx) jmp .L022good_key .align 16 .L021key_128: movups %xmm0,(%edx) leal 16(%edx),%edx .L020key_128_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $255,%xmm1,%xmm1 xorps %xmm1,%xmm0 ret .align 16 .L01714rounds: movups 16(%eax),%xmm2 leal 16(%edx),%edx movl $13,%ecx movups %xmm0,-32(%edx) movups %xmm2,-16(%edx) .byte 102,15,58,223,202,1 call .L023key_256a_cold .byte 102,15,58,223,200,1 call .L024key_256b .byte 102,15,58,223,202,2 call .L025key_256a .byte 102,15,58,223,200,2 call .L024key_256b .byte 102,15,58,223,202,4 call .L025key_256a .byte 102,15,58,223,200,4 call .L024key_256b .byte 102,15,58,223,202,8 call .L025key_256a .byte 102,15,58,223,200,8 call .L024key_256b .byte 102,15,58,223,202,16 call .L025key_256a .byte 102,15,58,223,200,16 call .L024key_256b .byte 102,15,58,223,202,32 call .L025key_256a .byte 102,15,58,223,200,32 call .L024key_256b .byte 102,15,58,223,202,64 call .L025key_256a movups %xmm0,(%edx) movl %ecx,16(%edx) xorl %eax,%eax jmp .L022good_key .align 16 .L025key_256a: movups %xmm2,(%edx) leal 16(%edx),%edx .L023key_256a_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $255,%xmm1,%xmm1 xorps %xmm1,%xmm0 ret .align 16 .L024key_256b: movups %xmm0,(%edx) leal 16(%edx),%edx shufps $16,%xmm2,%xmm4 xorps %xmm4,%xmm2 shufps $140,%xmm2,%xmm4 xorps %xmm4,%xmm2 shufps $170,%xmm1,%xmm1 xorps %xmm1,%xmm2 ret .L022good_key: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 pxor %xmm3,%xmm3 pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 xorl %eax,%eax popl %ebx ret .align 4 .L018bad_keybits: pxor %xmm0,%xmm0 movl $-2,%eax popl %ebx ret .size aes_hw_set_encrypt_key_base,.-.L_aes_hw_set_encrypt_key_base_begin .globl aes_hw_set_encrypt_key_alt .hidden aes_hw_set_encrypt_key_alt .type aes_hw_set_encrypt_key_alt,@function .align 16 aes_hw_set_encrypt_key_alt: .L_aes_hw_set_encrypt_key_alt_begin: #ifdef BORINGSSL_DISPATCH_TEST pushl %ebx pushl %edx call .L026pic_for_function_hit .L026pic_for_function_hit: popl %ebx leal BORINGSSL_function_hit+3-.L026pic_for_function_hit(%ebx),%ebx movl $1,%edx movb %dl,(%ebx) popl %edx popl %ebx #endif movl 4(%esp),%eax movl 8(%esp),%ecx movl 12(%esp),%edx pushl %ebx call .L027pic .L027pic: popl %ebx leal .Lkey_const-.L027pic(%ebx),%ebx movups (%eax),%xmm0 xorps %xmm4,%xmm4 leal 16(%edx),%edx cmpl $256,%ecx je .L02814rounds_alt cmpl $128,%ecx jne .L029bad_keybits .align 16 .L03010rounds_alt: movdqa (%ebx),%xmm5 movl $8,%ecx movdqa 32(%ebx),%xmm4 movdqa %xmm0,%xmm2 movdqu %xmm0,-16(%edx) .L031loop_key128: .byte 102,15,56,0,197 .byte 102,15,56,221,196 pslld $1,%xmm4 leal 16(%edx),%edx movdqa %xmm2,%xmm3 pslldq $4,%xmm2 pxor %xmm2,%xmm3 pslldq $4,%xmm2 pxor %xmm2,%xmm3 pslldq $4,%xmm2 pxor %xmm3,%xmm2 pxor %xmm2,%xmm0 movdqu %xmm0,-16(%edx) movdqa %xmm0,%xmm2 decl %ecx jnz .L031loop_key128 movdqa 48(%ebx),%xmm4 .byte 102,15,56,0,197 .byte 102,15,56,221,196 pslld $1,%xmm4 movdqa %xmm2,%xmm3 pslldq $4,%xmm2 pxor %xmm2,%xmm3 pslldq $4,%xmm2 pxor %xmm2,%xmm3 pslldq $4,%xmm2 pxor %xmm3,%xmm2 pxor %xmm2,%xmm0 movdqu %xmm0,(%edx) movdqa %xmm0,%xmm2 .byte 
102,15,56,0,197 .byte 102,15,56,221,196 movdqa %xmm2,%xmm3 pslldq $4,%xmm2 pxor %xmm2,%xmm3 pslldq $4,%xmm2 pxor %xmm2,%xmm3 pslldq $4,%xmm2 pxor %xmm3,%xmm2 pxor %xmm2,%xmm0 movdqu %xmm0,16(%edx) movl $9,%ecx movl %ecx,96(%edx) jmp .L032good_key .align 16 .L02814rounds_alt: movups 16(%eax),%xmm2 leal 16(%edx),%edx movdqa (%ebx),%xmm5 movdqa 32(%ebx),%xmm4 movl $7,%ecx movdqu %xmm0,-32(%edx) movdqa %xmm2,%xmm1 movdqu %xmm2,-16(%edx) .L033loop_key256: .byte 102,15,56,0,213 .byte 102,15,56,221,212 movdqa %xmm0,%xmm3 pslldq $4,%xmm0 pxor %xmm0,%xmm3 pslldq $4,%xmm0 pxor %xmm0,%xmm3 pslldq $4,%xmm0 pxor %xmm3,%xmm0 pslld $1,%xmm4 pxor %xmm2,%xmm0 movdqu %xmm0,(%edx) decl %ecx jz .L034done_key256 pshufd $255,%xmm0,%xmm2 pxor %xmm3,%xmm3 .byte 102,15,56,221,211 movdqa %xmm1,%xmm3 pslldq $4,%xmm1 pxor %xmm1,%xmm3 pslldq $4,%xmm1 pxor %xmm1,%xmm3 pslldq $4,%xmm1 pxor %xmm3,%xmm1 pxor %xmm1,%xmm2 movdqu %xmm2,16(%edx) leal 32(%edx),%edx movdqa %xmm2,%xmm1 jmp .L033loop_key256 .L034done_key256: movl $13,%ecx movl %ecx,16(%edx) .L032good_key: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 pxor %xmm3,%xmm3 pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 xorl %eax,%eax popl %ebx ret .align 4 .L029bad_keybits: pxor %xmm0,%xmm0 movl $-2,%eax popl %ebx ret .size aes_hw_set_encrypt_key_alt,.-.L_aes_hw_set_encrypt_key_alt_begin .align 64 .Lkey_const: .long 202313229,202313229,202313229,202313229 .long 67569157,67569157,67569157,67569157 .long 1,1,1,1 .long 27,27,27,27 .byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 .byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 .byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 .byte 115,108,46,111,114,103,62,0 #endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) ring-0.17.14/pregenerated/aesni-x86-win32n.asm000064400000000000000000000324051046102023000167500ustar 00000000000000; This file is generated from a similarly-named Perl script in the BoringSSL ; source tree. Do not edit by hand. 
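; The code below is the Win32/NASM rendering of the same aesni-x86 module
; emitted above for ELF: it provides _aes_hw_ctr32_encrypt_blocks plus the
; _aes_hw_set_encrypt_key_base and _aes_hw_set_encrypt_key_alt key-schedule
; routines, with symbols renamed via prefix_symbols_nasm.inc as in the
; x86_64 NASM file earlier.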
%include "ring_core_generated/prefix_symbols_nasm.inc" %ifidn __OUTPUT_FORMAT__, win32 %ifidn __OUTPUT_FORMAT__,obj section code use32 class=code align=64 %elifidn __OUTPUT_FORMAT__,win32 $@feat.00 equ 1 section .text code align=64 %else section .text code %endif %ifdef BORINGSSL_DISPATCH_TEST extern _BORINGSSL_function_hit %endif align 16 __aesni_encrypt2: movups xmm0,[edx] shl ecx,4 movups xmm1,[16+edx] xorps xmm2,xmm0 pxor xmm3,xmm0 movups xmm0,[32+edx] lea edx,[32+ecx*1+edx] neg ecx add ecx,16 L$000enc2_loop: db 102,15,56,220,209 db 102,15,56,220,217 movups xmm1,[ecx*1+edx] add ecx,32 db 102,15,56,220,208 db 102,15,56,220,216 movups xmm0,[ecx*1+edx-16] jnz NEAR L$000enc2_loop db 102,15,56,220,209 db 102,15,56,220,217 db 102,15,56,221,208 db 102,15,56,221,216 ret align 16 __aesni_encrypt3: movups xmm0,[edx] shl ecx,4 movups xmm1,[16+edx] xorps xmm2,xmm0 pxor xmm3,xmm0 pxor xmm4,xmm0 movups xmm0,[32+edx] lea edx,[32+ecx*1+edx] neg ecx add ecx,16 L$001enc3_loop: db 102,15,56,220,209 db 102,15,56,220,217 db 102,15,56,220,225 movups xmm1,[ecx*1+edx] add ecx,32 db 102,15,56,220,208 db 102,15,56,220,216 db 102,15,56,220,224 movups xmm0,[ecx*1+edx-16] jnz NEAR L$001enc3_loop db 102,15,56,220,209 db 102,15,56,220,217 db 102,15,56,220,225 db 102,15,56,221,208 db 102,15,56,221,216 db 102,15,56,221,224 ret align 16 __aesni_encrypt4: movups xmm0,[edx] movups xmm1,[16+edx] shl ecx,4 xorps xmm2,xmm0 pxor xmm3,xmm0 pxor xmm4,xmm0 pxor xmm5,xmm0 movups xmm0,[32+edx] lea edx,[32+ecx*1+edx] neg ecx db 15,31,64,0 add ecx,16 L$002enc4_loop: db 102,15,56,220,209 db 102,15,56,220,217 db 102,15,56,220,225 db 102,15,56,220,233 movups xmm1,[ecx*1+edx] add ecx,32 db 102,15,56,220,208 db 102,15,56,220,216 db 102,15,56,220,224 db 102,15,56,220,232 movups xmm0,[ecx*1+edx-16] jnz NEAR L$002enc4_loop db 102,15,56,220,209 db 102,15,56,220,217 db 102,15,56,220,225 db 102,15,56,220,233 db 102,15,56,221,208 db 102,15,56,221,216 db 102,15,56,221,224 db 102,15,56,221,232 ret align 16 __aesni_encrypt6: movups xmm0,[edx] shl ecx,4 movups xmm1,[16+edx] xorps xmm2,xmm0 pxor xmm3,xmm0 pxor xmm4,xmm0 db 102,15,56,220,209 pxor xmm5,xmm0 pxor xmm6,xmm0 db 102,15,56,220,217 lea edx,[32+ecx*1+edx] neg ecx db 102,15,56,220,225 pxor xmm7,xmm0 movups xmm0,[ecx*1+edx] add ecx,16 jmp NEAR L$003_aesni_encrypt6_inner align 16 L$004enc6_loop: db 102,15,56,220,209 db 102,15,56,220,217 db 102,15,56,220,225 L$003_aesni_encrypt6_inner: db 102,15,56,220,233 db 102,15,56,220,241 db 102,15,56,220,249 L$_aesni_encrypt6_enter: movups xmm1,[ecx*1+edx] add ecx,32 db 102,15,56,220,208 db 102,15,56,220,216 db 102,15,56,220,224 db 102,15,56,220,232 db 102,15,56,220,240 db 102,15,56,220,248 movups xmm0,[ecx*1+edx-16] jnz NEAR L$004enc6_loop db 102,15,56,220,209 db 102,15,56,220,217 db 102,15,56,220,225 db 102,15,56,220,233 db 102,15,56,220,241 db 102,15,56,220,249 db 102,15,56,221,208 db 102,15,56,221,216 db 102,15,56,221,224 db 102,15,56,221,232 db 102,15,56,221,240 db 102,15,56,221,248 ret global _aes_hw_ctr32_encrypt_blocks align 16 _aes_hw_ctr32_encrypt_blocks: L$_aes_hw_ctr32_encrypt_blocks_begin: push ebp push ebx push esi push edi %ifdef BORINGSSL_DISPATCH_TEST push ebx push edx call L$005pic_for_function_hit L$005pic_for_function_hit: pop ebx lea ebx,[(_BORINGSSL_function_hit+0-L$005pic_for_function_hit)+ebx] mov edx,1 mov BYTE [ebx],dl pop edx pop ebx %endif mov esi,DWORD [20+esp] mov edi,DWORD [24+esp] mov eax,DWORD [28+esp] mov edx,DWORD [32+esp] mov ebx,DWORD [36+esp] mov ebp,esp sub esp,88 and esp,-16 mov DWORD [80+esp],ebp cmp eax,1 je NEAR 
L$006ctr32_one_shortcut movdqu xmm7,[ebx] mov DWORD [esp],202182159 mov DWORD [4+esp],134810123 mov DWORD [8+esp],67438087 mov DWORD [12+esp],66051 mov ecx,6 xor ebp,ebp mov DWORD [16+esp],ecx mov DWORD [20+esp],ecx mov DWORD [24+esp],ecx mov DWORD [28+esp],ebp db 102,15,58,22,251,3 db 102,15,58,34,253,3 mov ecx,DWORD [240+edx] bswap ebx pxor xmm0,xmm0 pxor xmm1,xmm1 movdqa xmm2,[esp] db 102,15,58,34,195,0 lea ebp,[3+ebx] db 102,15,58,34,205,0 inc ebx db 102,15,58,34,195,1 inc ebp db 102,15,58,34,205,1 inc ebx db 102,15,58,34,195,2 inc ebp db 102,15,58,34,205,2 movdqa [48+esp],xmm0 db 102,15,56,0,194 movdqu xmm6,[edx] movdqa [64+esp],xmm1 db 102,15,56,0,202 pshufd xmm2,xmm0,192 pshufd xmm3,xmm0,128 cmp eax,6 jb NEAR L$007ctr32_tail pxor xmm7,xmm6 shl ecx,4 mov ebx,16 movdqa [32+esp],xmm7 mov ebp,edx sub ebx,ecx lea edx,[32+ecx*1+edx] sub eax,6 jmp NEAR L$008ctr32_loop6 align 16 L$008ctr32_loop6: pshufd xmm4,xmm0,64 movdqa xmm0,[32+esp] pshufd xmm5,xmm1,192 pxor xmm2,xmm0 pshufd xmm6,xmm1,128 pxor xmm3,xmm0 pshufd xmm7,xmm1,64 movups xmm1,[16+ebp] pxor xmm4,xmm0 pxor xmm5,xmm0 db 102,15,56,220,209 pxor xmm6,xmm0 pxor xmm7,xmm0 db 102,15,56,220,217 movups xmm0,[32+ebp] mov ecx,ebx db 102,15,56,220,225 db 102,15,56,220,233 db 102,15,56,220,241 db 102,15,56,220,249 call L$_aesni_encrypt6_enter movups xmm1,[esi] movups xmm0,[16+esi] xorps xmm2,xmm1 movups xmm1,[32+esi] xorps xmm3,xmm0 movups [edi],xmm2 movdqa xmm0,[16+esp] xorps xmm4,xmm1 movdqa xmm1,[64+esp] movups [16+edi],xmm3 movups [32+edi],xmm4 paddd xmm1,xmm0 paddd xmm0,[48+esp] movdqa xmm2,[esp] movups xmm3,[48+esi] movups xmm4,[64+esi] xorps xmm5,xmm3 movups xmm3,[80+esi] lea esi,[96+esi] movdqa [48+esp],xmm0 db 102,15,56,0,194 xorps xmm6,xmm4 movups [48+edi],xmm5 xorps xmm7,xmm3 movdqa [64+esp],xmm1 db 102,15,56,0,202 movups [64+edi],xmm6 pshufd xmm2,xmm0,192 movups [80+edi],xmm7 lea edi,[96+edi] pshufd xmm3,xmm0,128 sub eax,6 jnc NEAR L$008ctr32_loop6 add eax,6 jz NEAR L$009ctr32_ret movdqu xmm7,[ebp] mov edx,ebp pxor xmm7,[32+esp] mov ecx,DWORD [240+ebp] L$007ctr32_tail: por xmm2,xmm7 cmp eax,2 jb NEAR L$010ctr32_one pshufd xmm4,xmm0,64 por xmm3,xmm7 je NEAR L$011ctr32_two pshufd xmm5,xmm1,192 por xmm4,xmm7 cmp eax,4 jb NEAR L$012ctr32_three pshufd xmm6,xmm1,128 por xmm5,xmm7 je NEAR L$013ctr32_four por xmm6,xmm7 call __aesni_encrypt6 movups xmm1,[esi] movups xmm0,[16+esi] xorps xmm2,xmm1 movups xmm1,[32+esi] xorps xmm3,xmm0 movups xmm0,[48+esi] xorps xmm4,xmm1 movups xmm1,[64+esi] xorps xmm5,xmm0 movups [edi],xmm2 xorps xmm6,xmm1 movups [16+edi],xmm3 movups [32+edi],xmm4 movups [48+edi],xmm5 movups [64+edi],xmm6 jmp NEAR L$009ctr32_ret align 16 L$006ctr32_one_shortcut: movups xmm2,[ebx] mov ecx,DWORD [240+edx] L$010ctr32_one: movups xmm0,[edx] movups xmm1,[16+edx] lea edx,[32+edx] xorps xmm2,xmm0 L$014enc1_loop_1: db 102,15,56,220,209 dec ecx movups xmm1,[edx] lea edx,[16+edx] jnz NEAR L$014enc1_loop_1 db 102,15,56,221,209 movups xmm6,[esi] xorps xmm6,xmm2 movups [edi],xmm6 jmp NEAR L$009ctr32_ret align 16 L$011ctr32_two: call __aesni_encrypt2 movups xmm5,[esi] movups xmm6,[16+esi] xorps xmm2,xmm5 xorps xmm3,xmm6 movups [edi],xmm2 movups [16+edi],xmm3 jmp NEAR L$009ctr32_ret align 16 L$012ctr32_three: call __aesni_encrypt3 movups xmm5,[esi] movups xmm6,[16+esi] xorps xmm2,xmm5 movups xmm7,[32+esi] xorps xmm3,xmm6 movups [edi],xmm2 xorps xmm4,xmm7 movups [16+edi],xmm3 movups [32+edi],xmm4 jmp NEAR L$009ctr32_ret align 16 L$013ctr32_four: call __aesni_encrypt4 movups xmm6,[esi] movups xmm7,[16+esi] movups xmm1,[32+esi] xorps 
xmm2,xmm6 movups xmm0,[48+esi] xorps xmm3,xmm7 movups [edi],xmm2 xorps xmm4,xmm1 movups [16+edi],xmm3 xorps xmm5,xmm0 movups [32+edi],xmm4 movups [48+edi],xmm5 L$009ctr32_ret: pxor xmm0,xmm0 pxor xmm1,xmm1 pxor xmm2,xmm2 pxor xmm3,xmm3 pxor xmm4,xmm4 movdqa [32+esp],xmm0 pxor xmm5,xmm5 movdqa [48+esp],xmm0 pxor xmm6,xmm6 movdqa [64+esp],xmm0 pxor xmm7,xmm7 mov esp,DWORD [80+esp] pop edi pop esi pop ebx pop ebp ret global _aes_hw_set_encrypt_key_base align 16 _aes_hw_set_encrypt_key_base: L$_aes_hw_set_encrypt_key_base_begin: %ifdef BORINGSSL_DISPATCH_TEST push ebx push edx call L$015pic_for_function_hit L$015pic_for_function_hit: pop ebx lea ebx,[(_BORINGSSL_function_hit+3-L$015pic_for_function_hit)+ebx] mov edx,1 mov BYTE [ebx],dl pop edx pop ebx %endif mov eax,DWORD [4+esp] mov ecx,DWORD [8+esp] mov edx,DWORD [12+esp] push ebx call L$016pic L$016pic: pop ebx lea ebx,[(L$key_const-L$016pic)+ebx] movups xmm0,[eax] xorps xmm4,xmm4 lea edx,[16+edx] cmp ecx,256 je NEAR L$01714rounds cmp ecx,128 jne NEAR L$018bad_keybits align 16 L$01910rounds: mov ecx,9 movups [edx-16],xmm0 db 102,15,58,223,200,1 call L$020key_128_cold db 102,15,58,223,200,2 call L$021key_128 db 102,15,58,223,200,4 call L$021key_128 db 102,15,58,223,200,8 call L$021key_128 db 102,15,58,223,200,16 call L$021key_128 db 102,15,58,223,200,32 call L$021key_128 db 102,15,58,223,200,64 call L$021key_128 db 102,15,58,223,200,128 call L$021key_128 db 102,15,58,223,200,27 call L$021key_128 db 102,15,58,223,200,54 call L$021key_128 movups [edx],xmm0 mov DWORD [80+edx],ecx jmp NEAR L$022good_key align 16 L$021key_128: movups [edx],xmm0 lea edx,[16+edx] L$020key_128_cold: shufps xmm4,xmm0,16 xorps xmm0,xmm4 shufps xmm4,xmm0,140 xorps xmm0,xmm4 shufps xmm1,xmm1,255 xorps xmm0,xmm1 ret align 16 L$01714rounds: movups xmm2,[16+eax] lea edx,[16+edx] mov ecx,13 movups [edx-32],xmm0 movups [edx-16],xmm2 db 102,15,58,223,202,1 call L$023key_256a_cold db 102,15,58,223,200,1 call L$024key_256b db 102,15,58,223,202,2 call L$025key_256a db 102,15,58,223,200,2 call L$024key_256b db 102,15,58,223,202,4 call L$025key_256a db 102,15,58,223,200,4 call L$024key_256b db 102,15,58,223,202,8 call L$025key_256a db 102,15,58,223,200,8 call L$024key_256b db 102,15,58,223,202,16 call L$025key_256a db 102,15,58,223,200,16 call L$024key_256b db 102,15,58,223,202,32 call L$025key_256a db 102,15,58,223,200,32 call L$024key_256b db 102,15,58,223,202,64 call L$025key_256a movups [edx],xmm0 mov DWORD [16+edx],ecx xor eax,eax jmp NEAR L$022good_key align 16 L$025key_256a: movups [edx],xmm2 lea edx,[16+edx] L$023key_256a_cold: shufps xmm4,xmm0,16 xorps xmm0,xmm4 shufps xmm4,xmm0,140 xorps xmm0,xmm4 shufps xmm1,xmm1,255 xorps xmm0,xmm1 ret align 16 L$024key_256b: movups [edx],xmm0 lea edx,[16+edx] shufps xmm4,xmm2,16 xorps xmm2,xmm4 shufps xmm4,xmm2,140 xorps xmm2,xmm4 shufps xmm1,xmm1,170 xorps xmm2,xmm1 ret L$022good_key: pxor xmm0,xmm0 pxor xmm1,xmm1 pxor xmm2,xmm2 pxor xmm3,xmm3 pxor xmm4,xmm4 pxor xmm5,xmm5 xor eax,eax pop ebx ret align 4 L$018bad_keybits: pxor xmm0,xmm0 mov eax,-2 pop ebx ret global _aes_hw_set_encrypt_key_alt align 16 _aes_hw_set_encrypt_key_alt: L$_aes_hw_set_encrypt_key_alt_begin: %ifdef BORINGSSL_DISPATCH_TEST push ebx push edx call L$026pic_for_function_hit L$026pic_for_function_hit: pop ebx lea ebx,[(_BORINGSSL_function_hit+3-L$026pic_for_function_hit)+ebx] mov edx,1 mov BYTE [ebx],dl pop edx pop ebx %endif mov eax,DWORD [4+esp] mov ecx,DWORD [8+esp] mov edx,DWORD [12+esp] push ebx call L$027pic L$027pic: pop ebx lea 
ebx,[(L$key_const-L$027pic)+ebx] movups xmm0,[eax] xorps xmm4,xmm4 lea edx,[16+edx] cmp ecx,256 je NEAR L$02814rounds_alt cmp ecx,128 jne NEAR L$029bad_keybits align 16 L$03010rounds_alt: movdqa xmm5,[ebx] mov ecx,8 movdqa xmm4,[32+ebx] movdqa xmm2,xmm0 movdqu [edx-16],xmm0 L$031loop_key128: db 102,15,56,0,197 db 102,15,56,221,196 pslld xmm4,1 lea edx,[16+edx] movdqa xmm3,xmm2 pslldq xmm2,4 pxor xmm3,xmm2 pslldq xmm2,4 pxor xmm3,xmm2 pslldq xmm2,4 pxor xmm2,xmm3 pxor xmm0,xmm2 movdqu [edx-16],xmm0 movdqa xmm2,xmm0 dec ecx jnz NEAR L$031loop_key128 movdqa xmm4,[48+ebx] db 102,15,56,0,197 db 102,15,56,221,196 pslld xmm4,1 movdqa xmm3,xmm2 pslldq xmm2,4 pxor xmm3,xmm2 pslldq xmm2,4 pxor xmm3,xmm2 pslldq xmm2,4 pxor xmm2,xmm3 pxor xmm0,xmm2 movdqu [edx],xmm0 movdqa xmm2,xmm0 db 102,15,56,0,197 db 102,15,56,221,196 movdqa xmm3,xmm2 pslldq xmm2,4 pxor xmm3,xmm2 pslldq xmm2,4 pxor xmm3,xmm2 pslldq xmm2,4 pxor xmm2,xmm3 pxor xmm0,xmm2 movdqu [16+edx],xmm0 mov ecx,9 mov DWORD [96+edx],ecx jmp NEAR L$032good_key align 16 L$02814rounds_alt: movups xmm2,[16+eax] lea edx,[16+edx] movdqa xmm5,[ebx] movdqa xmm4,[32+ebx] mov ecx,7 movdqu [edx-32],xmm0 movdqa xmm1,xmm2 movdqu [edx-16],xmm2 L$033loop_key256: db 102,15,56,0,213 db 102,15,56,221,212 movdqa xmm3,xmm0 pslldq xmm0,4 pxor xmm3,xmm0 pslldq xmm0,4 pxor xmm3,xmm0 pslldq xmm0,4 pxor xmm0,xmm3 pslld xmm4,1 pxor xmm0,xmm2 movdqu [edx],xmm0 dec ecx jz NEAR L$034done_key256 pshufd xmm2,xmm0,255 pxor xmm3,xmm3 db 102,15,56,221,211 movdqa xmm3,xmm1 pslldq xmm1,4 pxor xmm3,xmm1 pslldq xmm1,4 pxor xmm3,xmm1 pslldq xmm1,4 pxor xmm1,xmm3 pxor xmm2,xmm1 movdqu [16+edx],xmm2 lea edx,[32+edx] movdqa xmm1,xmm2 jmp NEAR L$033loop_key256 L$034done_key256: mov ecx,13 mov DWORD [16+edx],ecx L$032good_key: pxor xmm0,xmm0 pxor xmm1,xmm1 pxor xmm2,xmm2 pxor xmm3,xmm3 pxor xmm4,xmm4 pxor xmm5,xmm5 xor eax,eax pop ebx ret align 4 L$029bad_keybits: pxor xmm0,xmm0 mov eax,-2 pop ebx ret align 64 L$key_const: dd 202313229,202313229,202313229,202313229 dd 67569157,67569157,67569157,67569157 dd 1,1,1,1 dd 27,27,27,27 db 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 db 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 db 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 db 115,108,46,111,114,103,62,0 %else ; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 ret %endif
ring-0.17.14/pregenerated/aesni-x86-win32n.o [binary Win32 COFF object file produced by The Netwide Assembler 2.13.03 from aesni-x86-win32n.asm; it defines the prefixed symbols _ring_core_0_17_14__aes_hw_ctr32_encrypt_blocks, _ring_core_0_17_14__aes_hw_set_encrypt_key_base, and _ring_core_0_17_14__aes_hw_set_encrypt_key_alt; binary contents not representable as text]
ring-0.17.14/pregenerated/aesni-x86_64-elf.S// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand.
#include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) .text .type _aesni_encrypt2,@function .align 16 _aesni_encrypt2: .cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 xorps %xmm0,%xmm2 xorps %xmm0,%xmm3 movups 32(%rcx),%xmm0 leaq 32(%rcx,%rax,1),%rcx negq %rax addq $16,%rax .Lenc_loop2: .byte 102,15,56,220,209 .byte 102,15,56,220,217 movups (%rcx,%rax,1),%xmm1 addq $32,%rax .byte 102,15,56,220,208 .byte 102,15,56,220,216 movups -16(%rcx,%rax,1),%xmm0 jnz .Lenc_loop2 .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,221,208 .byte 102,15,56,221,216 ret .cfi_endproc .size _aesni_encrypt2,.-_aesni_encrypt2 .type _aesni_encrypt3,@function .align 16 _aesni_encrypt3: .cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 xorps %xmm0,%xmm2 xorps %xmm0,%xmm3 xorps %xmm0,%xmm4 movups 32(%rcx),%xmm0 leaq 32(%rcx,%rax,1),%rcx negq %rax addq $16,%rax .Lenc_loop3: .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 movups (%rcx,%rax,1),%xmm1 addq $32,%rax .byte 102,15,56,220,208 .byte 102,15,56,220,216 .byte 102,15,56,220,224 movups -16(%rcx,%rax,1),%xmm0 jnz .Lenc_loop3 .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 .byte 102,15,56,221,208 .byte 102,15,56,221,216 .byte 102,15,56,221,224 ret .cfi_endproc .size _aesni_encrypt3,.-_aesni_encrypt3 .type _aesni_encrypt4,@function .align 16 _aesni_encrypt4: .cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 xorps %xmm0,%xmm2 xorps %xmm0,%xmm3 xorps %xmm0,%xmm4 xorps %xmm0,%xmm5 movups 32(%rcx),%xmm0 leaq 32(%rcx,%rax,1),%rcx negq %rax .byte 0x0f,0x1f,0x00 addq $16,%rax .Lenc_loop4: .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 .byte 102,15,56,220,233 movups (%rcx,%rax,1),%xmm1 addq $32,%rax .byte 102,15,56,220,208 .byte 102,15,56,220,216 .byte 102,15,56,220,224 .byte 102,15,56,220,232 movups -16(%rcx,%rax,1),%xmm0 jnz .Lenc_loop4 .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,221,208 .byte 102,15,56,221,216 .byte 102,15,56,221,224 .byte 102,15,56,221,232 ret .cfi_endproc .size _aesni_encrypt4,.-_aesni_encrypt4 .type _aesni_encrypt6,@function .align 16 _aesni_encrypt6: .cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 pxor %xmm0,%xmm4 .byte 102,15,56,220,209 leaq 32(%rcx,%rax,1),%rcx negq %rax .byte 102,15,56,220,217 pxor %xmm0,%xmm5 pxor %xmm0,%xmm6 .byte 102,15,56,220,225 pxor %xmm0,%xmm7 movups (%rcx,%rax,1),%xmm0 addq $16,%rax jmp .Lenc_loop6_enter .align 16 .Lenc_loop6: .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 .Lenc_loop6_enter: .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 movups (%rcx,%rax,1),%xmm1 addq $32,%rax .byte 102,15,56,220,208 .byte 102,15,56,220,216 .byte 102,15,56,220,224 .byte 102,15,56,220,232 .byte 102,15,56,220,240 .byte 102,15,56,220,248 movups -16(%rcx,%rax,1),%xmm0 jnz .Lenc_loop6 .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 .byte 102,15,56,221,208 .byte 102,15,56,221,216 .byte 102,15,56,221,224 .byte 102,15,56,221,232 .byte 102,15,56,221,240 .byte 102,15,56,221,248 ret .cfi_endproc .size _aesni_encrypt6,.-_aesni_encrypt6 .type _aesni_encrypt8,@function .align 16 _aesni_encrypt8: .cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 xorps %xmm0,%xmm2 xorps %xmm0,%xmm3 pxor 
%xmm0,%xmm4 pxor %xmm0,%xmm5 pxor %xmm0,%xmm6 leaq 32(%rcx,%rax,1),%rcx negq %rax .byte 102,15,56,220,209 pxor %xmm0,%xmm7 pxor %xmm0,%xmm8 .byte 102,15,56,220,217 pxor %xmm0,%xmm9 movups (%rcx,%rax,1),%xmm0 addq $16,%rax jmp .Lenc_loop8_inner .align 16 .Lenc_loop8: .byte 102,15,56,220,209 .byte 102,15,56,220,217 .Lenc_loop8_inner: .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 .byte 102,68,15,56,220,193 .byte 102,68,15,56,220,201 .Lenc_loop8_enter: movups (%rcx,%rax,1),%xmm1 addq $32,%rax .byte 102,15,56,220,208 .byte 102,15,56,220,216 .byte 102,15,56,220,224 .byte 102,15,56,220,232 .byte 102,15,56,220,240 .byte 102,15,56,220,248 .byte 102,68,15,56,220,192 .byte 102,68,15,56,220,200 movups -16(%rcx,%rax,1),%xmm0 jnz .Lenc_loop8 .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 .byte 102,68,15,56,220,193 .byte 102,68,15,56,220,201 .byte 102,15,56,221,208 .byte 102,15,56,221,216 .byte 102,15,56,221,224 .byte 102,15,56,221,232 .byte 102,15,56,221,240 .byte 102,15,56,221,248 .byte 102,68,15,56,221,192 .byte 102,68,15,56,221,200 ret .cfi_endproc .size _aesni_encrypt8,.-_aesni_encrypt8 .globl aes_hw_ctr32_encrypt_blocks .hidden aes_hw_ctr32_encrypt_blocks .type aes_hw_ctr32_encrypt_blocks,@function .align 16 aes_hw_ctr32_encrypt_blocks: .cfi_startproc _CET_ENDBR #ifdef BORINGSSL_DISPATCH_TEST movb $1,BORINGSSL_function_hit(%rip) #endif cmpq $1,%rdx jne .Lctr32_bulk movups (%r8),%xmm2 movups (%rdi),%xmm3 movl 240(%rcx),%edx movups (%rcx),%xmm0 movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 .Loop_enc1_1: .byte 102,15,56,220,209 decl %edx movups (%rcx),%xmm1 leaq 16(%rcx),%rcx jnz .Loop_enc1_1 .byte 102,15,56,221,209 pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 xorps %xmm3,%xmm2 pxor %xmm3,%xmm3 movups %xmm2,(%rsi) xorps %xmm2,%xmm2 jmp .Lctr32_epilogue .align 16 .Lctr32_bulk: leaq (%rsp),%r11 .cfi_def_cfa_register %r11 pushq %rbp .cfi_offset %rbp,-16 subq $128,%rsp andq $-16,%rsp movdqu (%r8),%xmm2 movdqu (%rcx),%xmm0 movl 12(%r8),%r8d pxor %xmm0,%xmm2 movl 12(%rcx),%ebp movdqa %xmm2,0(%rsp) bswapl %r8d movdqa %xmm2,%xmm3 movdqa %xmm2,%xmm4 movdqa %xmm2,%xmm5 movdqa %xmm2,64(%rsp) movdqa %xmm2,80(%rsp) movdqa %xmm2,96(%rsp) movq %rdx,%r10 movdqa %xmm2,112(%rsp) leaq 1(%r8),%rax leaq 2(%r8),%rdx bswapl %eax bswapl %edx xorl %ebp,%eax xorl %ebp,%edx .byte 102,15,58,34,216,3 leaq 3(%r8),%rax movdqa %xmm3,16(%rsp) .byte 102,15,58,34,226,3 bswapl %eax movq %r10,%rdx leaq 4(%r8),%r10 movdqa %xmm4,32(%rsp) xorl %ebp,%eax bswapl %r10d .byte 102,15,58,34,232,3 xorl %ebp,%r10d movdqa %xmm5,48(%rsp) leaq 5(%r8),%r9 movl %r10d,64+12(%rsp) bswapl %r9d leaq 6(%r8),%r10 movl 240(%rcx),%eax xorl %ebp,%r9d bswapl %r10d movl %r9d,80+12(%rsp) xorl %ebp,%r10d leaq 7(%r8),%r9 movl %r10d,96+12(%rsp) bswapl %r9d xorl %ebp,%r9d movl %r9d,112+12(%rsp) movups 16(%rcx),%xmm1 movdqa 64(%rsp),%xmm6 movdqa 80(%rsp),%xmm7 cmpq $8,%rdx jb .Lctr32_tail leaq 128(%rcx),%rcx subq $8,%rdx jmp .Lctr32_loop8 .align 32 .Lctr32_loop8: addl $8,%r8d movdqa 96(%rsp),%xmm8 .byte 102,15,56,220,209 movl %r8d,%r9d movdqa 112(%rsp),%xmm9 .byte 102,15,56,220,217 bswapl %r9d movups 32-128(%rcx),%xmm0 .byte 102,15,56,220,225 xorl %ebp,%r9d nop .byte 102,15,56,220,233 movl %r9d,0+12(%rsp) leaq 1(%r8),%r9 .byte 102,15,56,220,241 .byte 102,15,56,220,249 .byte 102,68,15,56,220,193 .byte 102,68,15,56,220,201 movups 48-128(%rcx),%xmm1 bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 xorl %ebp,%r9d 
.byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 movl %r9d,16+12(%rsp) leaq 2(%r8),%r9 .byte 102,15,56,220,240 .byte 102,15,56,220,248 .byte 102,68,15,56,220,192 .byte 102,68,15,56,220,200 movups 64-128(%rcx),%xmm0 bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 movl %r9d,32+12(%rsp) leaq 3(%r8),%r9 .byte 102,15,56,220,241 .byte 102,15,56,220,249 .byte 102,68,15,56,220,193 .byte 102,68,15,56,220,201 movups 80-128(%rcx),%xmm1 bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 movl %r9d,48+12(%rsp) leaq 4(%r8),%r9 .byte 102,15,56,220,240 .byte 102,15,56,220,248 .byte 102,68,15,56,220,192 .byte 102,68,15,56,220,200 movups 96-128(%rcx),%xmm0 bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 movl %r9d,64+12(%rsp) leaq 5(%r8),%r9 .byte 102,15,56,220,241 .byte 102,15,56,220,249 .byte 102,68,15,56,220,193 .byte 102,68,15,56,220,201 movups 112-128(%rcx),%xmm1 bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 movl %r9d,80+12(%rsp) leaq 6(%r8),%r9 .byte 102,15,56,220,240 .byte 102,15,56,220,248 .byte 102,68,15,56,220,192 .byte 102,68,15,56,220,200 movups 128-128(%rcx),%xmm0 bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 movl %r9d,96+12(%rsp) leaq 7(%r8),%r9 .byte 102,15,56,220,241 .byte 102,15,56,220,249 .byte 102,68,15,56,220,193 .byte 102,68,15,56,220,201 movups 144-128(%rcx),%xmm1 bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 .byte 102,15,56,220,224 xorl %ebp,%r9d movdqu 0(%rdi),%xmm10 .byte 102,15,56,220,232 movl %r9d,112+12(%rsp) cmpl $11,%eax .byte 102,15,56,220,240 .byte 102,15,56,220,248 .byte 102,68,15,56,220,192 .byte 102,68,15,56,220,200 movups 160-128(%rcx),%xmm0 jb .Lctr32_enc_done .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 .byte 102,68,15,56,220,193 .byte 102,68,15,56,220,201 movups 176-128(%rcx),%xmm1 .byte 102,15,56,220,208 .byte 102,15,56,220,216 .byte 102,15,56,220,224 .byte 102,15,56,220,232 .byte 102,15,56,220,240 .byte 102,15,56,220,248 .byte 102,68,15,56,220,192 .byte 102,68,15,56,220,200 movups 192-128(%rcx),%xmm0 .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 .byte 102,68,15,56,220,193 .byte 102,68,15,56,220,201 movups 208-128(%rcx),%xmm1 .byte 102,15,56,220,208 .byte 102,15,56,220,216 .byte 102,15,56,220,224 .byte 102,15,56,220,232 .byte 102,15,56,220,240 .byte 102,15,56,220,248 .byte 102,68,15,56,220,192 .byte 102,68,15,56,220,200 movups 224-128(%rcx),%xmm0 jmp .Lctr32_enc_done .align 16 .Lctr32_enc_done: movdqu 16(%rdi),%xmm11 pxor %xmm0,%xmm10 movdqu 32(%rdi),%xmm12 pxor %xmm0,%xmm11 movdqu 48(%rdi),%xmm13 pxor %xmm0,%xmm12 movdqu 64(%rdi),%xmm14 pxor %xmm0,%xmm13 movdqu 80(%rdi),%xmm15 pxor %xmm0,%xmm14 prefetcht0 448(%rdi) prefetcht0 512(%rdi) pxor %xmm0,%xmm15 .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 .byte 102,68,15,56,220,193 .byte 102,68,15,56,220,201 movdqu 96(%rdi),%xmm1 leaq 128(%rdi),%rdi .byte 
102,65,15,56,221,210 pxor %xmm0,%xmm1 movdqu 112-128(%rdi),%xmm10 .byte 102,65,15,56,221,219 pxor %xmm0,%xmm10 movdqa 0(%rsp),%xmm11 .byte 102,65,15,56,221,228 .byte 102,65,15,56,221,237 movdqa 16(%rsp),%xmm12 movdqa 32(%rsp),%xmm13 .byte 102,65,15,56,221,246 .byte 102,65,15,56,221,255 movdqa 48(%rsp),%xmm14 movdqa 64(%rsp),%xmm15 .byte 102,68,15,56,221,193 movdqa 80(%rsp),%xmm0 movups 16-128(%rcx),%xmm1 .byte 102,69,15,56,221,202 movups %xmm2,(%rsi) movdqa %xmm11,%xmm2 movups %xmm3,16(%rsi) movdqa %xmm12,%xmm3 movups %xmm4,32(%rsi) movdqa %xmm13,%xmm4 movups %xmm5,48(%rsi) movdqa %xmm14,%xmm5 movups %xmm6,64(%rsi) movdqa %xmm15,%xmm6 movups %xmm7,80(%rsi) movdqa %xmm0,%xmm7 movups %xmm8,96(%rsi) movups %xmm9,112(%rsi) leaq 128(%rsi),%rsi subq $8,%rdx jnc .Lctr32_loop8 addq $8,%rdx jz .Lctr32_done leaq -128(%rcx),%rcx .Lctr32_tail: leaq 16(%rcx),%rcx cmpq $4,%rdx jb .Lctr32_loop3 je .Lctr32_loop4 shll $4,%eax movdqa 96(%rsp),%xmm8 pxor %xmm9,%xmm9 movups 16(%rcx),%xmm0 .byte 102,15,56,220,209 .byte 102,15,56,220,217 leaq 32-16(%rcx,%rax,1),%rcx negq %rax .byte 102,15,56,220,225 addq $16,%rax movups (%rdi),%xmm10 .byte 102,15,56,220,233 .byte 102,15,56,220,241 movups 16(%rdi),%xmm11 movups 32(%rdi),%xmm12 .byte 102,15,56,220,249 .byte 102,68,15,56,220,193 call .Lenc_loop8_enter movdqu 48(%rdi),%xmm13 pxor %xmm10,%xmm2 movdqu 64(%rdi),%xmm10 pxor %xmm11,%xmm3 movdqu %xmm2,(%rsi) pxor %xmm12,%xmm4 movdqu %xmm3,16(%rsi) pxor %xmm13,%xmm5 movdqu %xmm4,32(%rsi) pxor %xmm10,%xmm6 movdqu %xmm5,48(%rsi) movdqu %xmm6,64(%rsi) cmpq $6,%rdx jb .Lctr32_done movups 80(%rdi),%xmm11 xorps %xmm11,%xmm7 movups %xmm7,80(%rsi) je .Lctr32_done movups 96(%rdi),%xmm12 xorps %xmm12,%xmm8 movups %xmm8,96(%rsi) jmp .Lctr32_done .align 32 .Lctr32_loop4: .byte 102,15,56,220,209 leaq 16(%rcx),%rcx decl %eax .byte 102,15,56,220,217 .byte 102,15,56,220,225 .byte 102,15,56,220,233 movups (%rcx),%xmm1 jnz .Lctr32_loop4 .byte 102,15,56,221,209 .byte 102,15,56,221,217 movups (%rdi),%xmm10 movups 16(%rdi),%xmm11 .byte 102,15,56,221,225 .byte 102,15,56,221,233 movups 32(%rdi),%xmm12 movups 48(%rdi),%xmm13 xorps %xmm10,%xmm2 movups %xmm2,(%rsi) xorps %xmm11,%xmm3 movups %xmm3,16(%rsi) pxor %xmm12,%xmm4 movdqu %xmm4,32(%rsi) pxor %xmm13,%xmm5 movdqu %xmm5,48(%rsi) jmp .Lctr32_done .align 32 .Lctr32_loop3: .byte 102,15,56,220,209 leaq 16(%rcx),%rcx decl %eax .byte 102,15,56,220,217 .byte 102,15,56,220,225 movups (%rcx),%xmm1 jnz .Lctr32_loop3 .byte 102,15,56,221,209 .byte 102,15,56,221,217 .byte 102,15,56,221,225 movups (%rdi),%xmm10 xorps %xmm10,%xmm2 movups %xmm2,(%rsi) cmpq $2,%rdx jb .Lctr32_done movups 16(%rdi),%xmm11 xorps %xmm11,%xmm3 movups %xmm3,16(%rsi) je .Lctr32_done movups 32(%rdi),%xmm12 xorps %xmm12,%xmm4 movups %xmm4,32(%rsi) .Lctr32_done: xorps %xmm0,%xmm0 xorl %ebp,%ebp pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 pxor %xmm3,%xmm3 pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 pxor %xmm6,%xmm6 pxor %xmm7,%xmm7 movaps %xmm0,0(%rsp) pxor %xmm8,%xmm8 movaps %xmm0,16(%rsp) pxor %xmm9,%xmm9 movaps %xmm0,32(%rsp) pxor %xmm10,%xmm10 movaps %xmm0,48(%rsp) pxor %xmm11,%xmm11 movaps %xmm0,64(%rsp) pxor %xmm12,%xmm12 movaps %xmm0,80(%rsp) pxor %xmm13,%xmm13 movaps %xmm0,96(%rsp) pxor %xmm14,%xmm14 movaps %xmm0,112(%rsp) pxor %xmm15,%xmm15 movq -8(%r11),%rbp .cfi_restore %rbp leaq (%r11),%rsp .cfi_def_cfa_register %rsp .Lctr32_epilogue: ret .cfi_endproc .size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks .globl aes_hw_set_encrypt_key_base .hidden aes_hw_set_encrypt_key_base .type aes_hw_set_encrypt_key_base,@function .align 16 
aes_hw_set_encrypt_key_base: .cfi_startproc _CET_ENDBR #ifdef BORINGSSL_DISPATCH_TEST movb $1,BORINGSSL_function_hit+3(%rip) #endif subq $8,%rsp .cfi_adjust_cfa_offset 8 movups (%rdi),%xmm0 xorps %xmm4,%xmm4 leaq 16(%rdx),%rax cmpl $256,%esi je .L14rounds cmpl $128,%esi jne .Lbad_keybits .L10rounds: movl $9,%esi movups %xmm0,(%rdx) .byte 102,15,58,223,200,1 call .Lkey_expansion_128_cold .byte 102,15,58,223,200,2 call .Lkey_expansion_128 .byte 102,15,58,223,200,4 call .Lkey_expansion_128 .byte 102,15,58,223,200,8 call .Lkey_expansion_128 .byte 102,15,58,223,200,16 call .Lkey_expansion_128 .byte 102,15,58,223,200,32 call .Lkey_expansion_128 .byte 102,15,58,223,200,64 call .Lkey_expansion_128 .byte 102,15,58,223,200,128 call .Lkey_expansion_128 .byte 102,15,58,223,200,27 call .Lkey_expansion_128 .byte 102,15,58,223,200,54 call .Lkey_expansion_128 movups %xmm0,(%rax) movl %esi,80(%rax) xorl %eax,%eax jmp .Lenc_key_ret .align 16 .L14rounds: movups 16(%rdi),%xmm2 movl $13,%esi leaq 16(%rax),%rax movups %xmm0,(%rdx) movups %xmm2,16(%rdx) .byte 102,15,58,223,202,1 call .Lkey_expansion_256a_cold .byte 102,15,58,223,200,1 call .Lkey_expansion_256b .byte 102,15,58,223,202,2 call .Lkey_expansion_256a .byte 102,15,58,223,200,2 call .Lkey_expansion_256b .byte 102,15,58,223,202,4 call .Lkey_expansion_256a .byte 102,15,58,223,200,4 call .Lkey_expansion_256b .byte 102,15,58,223,202,8 call .Lkey_expansion_256a .byte 102,15,58,223,200,8 call .Lkey_expansion_256b .byte 102,15,58,223,202,16 call .Lkey_expansion_256a .byte 102,15,58,223,200,16 call .Lkey_expansion_256b .byte 102,15,58,223,202,32 call .Lkey_expansion_256a .byte 102,15,58,223,200,32 call .Lkey_expansion_256b .byte 102,15,58,223,202,64 call .Lkey_expansion_256a movups %xmm0,(%rax) movl %esi,16(%rax) xorq %rax,%rax jmp .Lenc_key_ret .align 16 .Lbad_keybits: movq $-2,%rax .Lenc_key_ret: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 pxor %xmm3,%xmm3 pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 addq $8,%rsp .cfi_adjust_cfa_offset -8 ret .cfi_endproc .align 16 .Lkey_expansion_128: .cfi_startproc movups %xmm0,(%rax) leaq 16(%rax),%rax .Lkey_expansion_128_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $255,%xmm1,%xmm1 xorps %xmm1,%xmm0 ret .cfi_endproc .align 16 .Lkey_expansion_256a: .cfi_startproc movups %xmm2,(%rax) leaq 16(%rax),%rax .Lkey_expansion_256a_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $255,%xmm1,%xmm1 xorps %xmm1,%xmm0 ret .cfi_endproc .align 16 .Lkey_expansion_256b: .cfi_startproc movups %xmm0,(%rax) leaq 16(%rax),%rax shufps $16,%xmm2,%xmm4 xorps %xmm4,%xmm2 shufps $140,%xmm2,%xmm4 xorps %xmm4,%xmm2 shufps $170,%xmm1,%xmm1 xorps %xmm1,%xmm2 ret .cfi_endproc .size aes_hw_set_encrypt_key_base,.-aes_hw_set_encrypt_key_base .globl aes_hw_set_encrypt_key_alt .hidden aes_hw_set_encrypt_key_alt .type aes_hw_set_encrypt_key_alt,@function .align 16 aes_hw_set_encrypt_key_alt: .cfi_startproc _CET_ENDBR #ifdef BORINGSSL_DISPATCH_TEST movb $1,BORINGSSL_function_hit+3(%rip) #endif subq $8,%rsp .cfi_adjust_cfa_offset 8 movups (%rdi),%xmm0 xorps %xmm4,%xmm4 leaq 16(%rdx),%rax cmpl $256,%esi je .L14rounds_alt cmpl $128,%esi jne .Lbad_keybits_alt movl $9,%esi movdqa .Lkey_rotate(%rip),%xmm5 movl $8,%r10d movdqa .Lkey_rcon1(%rip),%xmm4 movdqa %xmm0,%xmm2 movdqu %xmm0,(%rdx) jmp .Loop_key128 .align 16 .Loop_key128: .byte 102,15,56,0,197 .byte 102,15,56,221,196 pslld $1,%xmm4 leaq 16(%rax),%rax movdqa %xmm2,%xmm3 pslldq $4,%xmm2 pxor %xmm2,%xmm3 pslldq $4,%xmm2 
pxor %xmm2,%xmm3 pslldq $4,%xmm2 pxor %xmm3,%xmm2 pxor %xmm2,%xmm0 movdqu %xmm0,-16(%rax) movdqa %xmm0,%xmm2 decl %r10d jnz .Loop_key128 movdqa .Lkey_rcon1b(%rip),%xmm4 .byte 102,15,56,0,197 .byte 102,15,56,221,196 pslld $1,%xmm4 movdqa %xmm2,%xmm3 pslldq $4,%xmm2 pxor %xmm2,%xmm3 pslldq $4,%xmm2 pxor %xmm2,%xmm3 pslldq $4,%xmm2 pxor %xmm3,%xmm2 pxor %xmm2,%xmm0 movdqu %xmm0,(%rax) movdqa %xmm0,%xmm2 .byte 102,15,56,0,197 .byte 102,15,56,221,196 movdqa %xmm2,%xmm3 pslldq $4,%xmm2 pxor %xmm2,%xmm3 pslldq $4,%xmm2 pxor %xmm2,%xmm3 pslldq $4,%xmm2 pxor %xmm3,%xmm2 pxor %xmm2,%xmm0 movdqu %xmm0,16(%rax) movl %esi,96(%rax) xorl %eax,%eax jmp .Lenc_key_ret_alt .align 16 .L14rounds_alt: movups 16(%rdi),%xmm2 movl $13,%esi leaq 16(%rax),%rax movdqa .Lkey_rotate(%rip),%xmm5 movdqa .Lkey_rcon1(%rip),%xmm4 movl $7,%r10d movdqu %xmm0,0(%rdx) movdqa %xmm2,%xmm1 movdqu %xmm2,16(%rdx) jmp .Loop_key256 .align 16 .Loop_key256: .byte 102,15,56,0,213 .byte 102,15,56,221,212 movdqa %xmm0,%xmm3 pslldq $4,%xmm0 pxor %xmm0,%xmm3 pslldq $4,%xmm0 pxor %xmm0,%xmm3 pslldq $4,%xmm0 pxor %xmm3,%xmm0 pslld $1,%xmm4 pxor %xmm2,%xmm0 movdqu %xmm0,(%rax) decl %r10d jz .Ldone_key256 pshufd $0xff,%xmm0,%xmm2 pxor %xmm3,%xmm3 .byte 102,15,56,221,211 movdqa %xmm1,%xmm3 pslldq $4,%xmm1 pxor %xmm1,%xmm3 pslldq $4,%xmm1 pxor %xmm1,%xmm3 pslldq $4,%xmm1 pxor %xmm3,%xmm1 pxor %xmm1,%xmm2 movdqu %xmm2,16(%rax) leaq 32(%rax),%rax movdqa %xmm2,%xmm1 jmp .Loop_key256 .Ldone_key256: movl %esi,16(%rax) xorl %eax,%eax jmp .Lenc_key_ret_alt .align 16 .Lbad_keybits_alt: movq $-2,%rax .Lenc_key_ret_alt: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 pxor %xmm3,%xmm3 pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 addq $8,%rsp .cfi_adjust_cfa_offset -8 ret .cfi_endproc .size aes_hw_set_encrypt_key_alt,.-aes_hw_set_encrypt_key_alt .section .rodata .align 64 .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 .Lincrement32: .long 6,6,6,0 .Lincrement64: .long 1,0,0,0 .Lincrement1: .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 .Lkey_rotate: .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d .Lkey_rotate192: .long 0x04070605,0x04070605,0x04070605,0x04070605 .Lkey_rcon1: .long 1,1,1,1 .Lkey_rcon1b: .long 0x1b,0x1b,0x1b,0x1b .byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 .text #endif ring-0.17.14/pregenerated/aesni-x86_64-macosx.S000064400000000000000000000466411046102023000170640ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
#include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) .text .p2align 4 _aesni_encrypt2: movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 xorps %xmm0,%xmm2 xorps %xmm0,%xmm3 movups 32(%rcx),%xmm0 leaq 32(%rcx,%rax,1),%rcx negq %rax addq $16,%rax L$enc_loop2: .byte 102,15,56,220,209 .byte 102,15,56,220,217 movups (%rcx,%rax,1),%xmm1 addq $32,%rax .byte 102,15,56,220,208 .byte 102,15,56,220,216 movups -16(%rcx,%rax,1),%xmm0 jnz L$enc_loop2 .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,221,208 .byte 102,15,56,221,216 ret .p2align 4 _aesni_encrypt3: movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 xorps %xmm0,%xmm2 xorps %xmm0,%xmm3 xorps %xmm0,%xmm4 movups 32(%rcx),%xmm0 leaq 32(%rcx,%rax,1),%rcx negq %rax addq $16,%rax L$enc_loop3: .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 movups (%rcx,%rax,1),%xmm1 addq $32,%rax .byte 102,15,56,220,208 .byte 102,15,56,220,216 .byte 102,15,56,220,224 movups -16(%rcx,%rax,1),%xmm0 jnz L$enc_loop3 .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 .byte 102,15,56,221,208 .byte 102,15,56,221,216 .byte 102,15,56,221,224 ret .p2align 4 _aesni_encrypt4: movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 xorps %xmm0,%xmm2 xorps %xmm0,%xmm3 xorps %xmm0,%xmm4 xorps %xmm0,%xmm5 movups 32(%rcx),%xmm0 leaq 32(%rcx,%rax,1),%rcx negq %rax .byte 0x0f,0x1f,0x00 addq $16,%rax L$enc_loop4: .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 .byte 102,15,56,220,233 movups (%rcx,%rax,1),%xmm1 addq $32,%rax .byte 102,15,56,220,208 .byte 102,15,56,220,216 .byte 102,15,56,220,224 .byte 102,15,56,220,232 movups -16(%rcx,%rax,1),%xmm0 jnz L$enc_loop4 .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,221,208 .byte 102,15,56,221,216 .byte 102,15,56,221,224 .byte 102,15,56,221,232 ret .p2align 4 _aesni_encrypt6: movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 pxor %xmm0,%xmm4 .byte 102,15,56,220,209 leaq 32(%rcx,%rax,1),%rcx negq %rax .byte 102,15,56,220,217 pxor %xmm0,%xmm5 pxor %xmm0,%xmm6 .byte 102,15,56,220,225 pxor %xmm0,%xmm7 movups (%rcx,%rax,1),%xmm0 addq $16,%rax jmp L$enc_loop6_enter .p2align 4 L$enc_loop6: .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 L$enc_loop6_enter: .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 movups (%rcx,%rax,1),%xmm1 addq $32,%rax .byte 102,15,56,220,208 .byte 102,15,56,220,216 .byte 102,15,56,220,224 .byte 102,15,56,220,232 .byte 102,15,56,220,240 .byte 102,15,56,220,248 movups -16(%rcx,%rax,1),%xmm0 jnz L$enc_loop6 .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 .byte 102,15,56,221,208 .byte 102,15,56,221,216 .byte 102,15,56,221,224 .byte 102,15,56,221,232 .byte 102,15,56,221,240 .byte 102,15,56,221,248 ret .p2align 4 _aesni_encrypt8: movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 xorps %xmm0,%xmm2 xorps %xmm0,%xmm3 pxor %xmm0,%xmm4 pxor %xmm0,%xmm5 pxor %xmm0,%xmm6 leaq 32(%rcx,%rax,1),%rcx negq %rax .byte 102,15,56,220,209 pxor %xmm0,%xmm7 pxor %xmm0,%xmm8 .byte 102,15,56,220,217 pxor %xmm0,%xmm9 movups (%rcx,%rax,1),%xmm0 addq $16,%rax jmp L$enc_loop8_inner .p2align 4 L$enc_loop8: .byte 102,15,56,220,209 .byte 102,15,56,220,217 L$enc_loop8_inner: .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 .byte 
102,68,15,56,220,193 .byte 102,68,15,56,220,201 L$enc_loop8_enter: movups (%rcx,%rax,1),%xmm1 addq $32,%rax .byte 102,15,56,220,208 .byte 102,15,56,220,216 .byte 102,15,56,220,224 .byte 102,15,56,220,232 .byte 102,15,56,220,240 .byte 102,15,56,220,248 .byte 102,68,15,56,220,192 .byte 102,68,15,56,220,200 movups -16(%rcx,%rax,1),%xmm0 jnz L$enc_loop8 .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 .byte 102,68,15,56,220,193 .byte 102,68,15,56,220,201 .byte 102,15,56,221,208 .byte 102,15,56,221,216 .byte 102,15,56,221,224 .byte 102,15,56,221,232 .byte 102,15,56,221,240 .byte 102,15,56,221,248 .byte 102,68,15,56,221,192 .byte 102,68,15,56,221,200 ret .globl _aes_hw_ctr32_encrypt_blocks .private_extern _aes_hw_ctr32_encrypt_blocks .p2align 4 _aes_hw_ctr32_encrypt_blocks: _CET_ENDBR #ifdef BORINGSSL_DISPATCH_TEST movb $1,BORINGSSL_function_hit(%rip) #endif cmpq $1,%rdx jne L$ctr32_bulk movups (%r8),%xmm2 movups (%rdi),%xmm3 movl 240(%rcx),%edx movups (%rcx),%xmm0 movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 L$oop_enc1_1: .byte 102,15,56,220,209 decl %edx movups (%rcx),%xmm1 leaq 16(%rcx),%rcx jnz L$oop_enc1_1 .byte 102,15,56,221,209 pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 xorps %xmm3,%xmm2 pxor %xmm3,%xmm3 movups %xmm2,(%rsi) xorps %xmm2,%xmm2 jmp L$ctr32_epilogue .p2align 4 L$ctr32_bulk: leaq (%rsp),%r11 pushq %rbp subq $128,%rsp andq $-16,%rsp movdqu (%r8),%xmm2 movdqu (%rcx),%xmm0 movl 12(%r8),%r8d pxor %xmm0,%xmm2 movl 12(%rcx),%ebp movdqa %xmm2,0(%rsp) bswapl %r8d movdqa %xmm2,%xmm3 movdqa %xmm2,%xmm4 movdqa %xmm2,%xmm5 movdqa %xmm2,64(%rsp) movdqa %xmm2,80(%rsp) movdqa %xmm2,96(%rsp) movq %rdx,%r10 movdqa %xmm2,112(%rsp) leaq 1(%r8),%rax leaq 2(%r8),%rdx bswapl %eax bswapl %edx xorl %ebp,%eax xorl %ebp,%edx .byte 102,15,58,34,216,3 leaq 3(%r8),%rax movdqa %xmm3,16(%rsp) .byte 102,15,58,34,226,3 bswapl %eax movq %r10,%rdx leaq 4(%r8),%r10 movdqa %xmm4,32(%rsp) xorl %ebp,%eax bswapl %r10d .byte 102,15,58,34,232,3 xorl %ebp,%r10d movdqa %xmm5,48(%rsp) leaq 5(%r8),%r9 movl %r10d,64+12(%rsp) bswapl %r9d leaq 6(%r8),%r10 movl 240(%rcx),%eax xorl %ebp,%r9d bswapl %r10d movl %r9d,80+12(%rsp) xorl %ebp,%r10d leaq 7(%r8),%r9 movl %r10d,96+12(%rsp) bswapl %r9d xorl %ebp,%r9d movl %r9d,112+12(%rsp) movups 16(%rcx),%xmm1 movdqa 64(%rsp),%xmm6 movdqa 80(%rsp),%xmm7 cmpq $8,%rdx jb L$ctr32_tail leaq 128(%rcx),%rcx subq $8,%rdx jmp L$ctr32_loop8 .p2align 5 L$ctr32_loop8: addl $8,%r8d movdqa 96(%rsp),%xmm8 .byte 102,15,56,220,209 movl %r8d,%r9d movdqa 112(%rsp),%xmm9 .byte 102,15,56,220,217 bswapl %r9d movups 32-128(%rcx),%xmm0 .byte 102,15,56,220,225 xorl %ebp,%r9d nop .byte 102,15,56,220,233 movl %r9d,0+12(%rsp) leaq 1(%r8),%r9 .byte 102,15,56,220,241 .byte 102,15,56,220,249 .byte 102,68,15,56,220,193 .byte 102,68,15,56,220,201 movups 48-128(%rcx),%xmm1 bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 movl %r9d,16+12(%rsp) leaq 2(%r8),%r9 .byte 102,15,56,220,240 .byte 102,15,56,220,248 .byte 102,68,15,56,220,192 .byte 102,68,15,56,220,200 movups 64-128(%rcx),%xmm0 bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 movl %r9d,32+12(%rsp) leaq 3(%r8),%r9 .byte 102,15,56,220,241 .byte 102,15,56,220,249 .byte 102,68,15,56,220,193 .byte 102,68,15,56,220,201 movups 80-128(%rcx),%xmm1 bswapl %r9d .byte 102,15,56,220,208 .byte 
102,15,56,220,216 xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 movl %r9d,48+12(%rsp) leaq 4(%r8),%r9 .byte 102,15,56,220,240 .byte 102,15,56,220,248 .byte 102,68,15,56,220,192 .byte 102,68,15,56,220,200 movups 96-128(%rcx),%xmm0 bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 movl %r9d,64+12(%rsp) leaq 5(%r8),%r9 .byte 102,15,56,220,241 .byte 102,15,56,220,249 .byte 102,68,15,56,220,193 .byte 102,68,15,56,220,201 movups 112-128(%rcx),%xmm1 bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 movl %r9d,80+12(%rsp) leaq 6(%r8),%r9 .byte 102,15,56,220,240 .byte 102,15,56,220,248 .byte 102,68,15,56,220,192 .byte 102,68,15,56,220,200 movups 128-128(%rcx),%xmm0 bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 movl %r9d,96+12(%rsp) leaq 7(%r8),%r9 .byte 102,15,56,220,241 .byte 102,15,56,220,249 .byte 102,68,15,56,220,193 .byte 102,68,15,56,220,201 movups 144-128(%rcx),%xmm1 bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 .byte 102,15,56,220,224 xorl %ebp,%r9d movdqu 0(%rdi),%xmm10 .byte 102,15,56,220,232 movl %r9d,112+12(%rsp) cmpl $11,%eax .byte 102,15,56,220,240 .byte 102,15,56,220,248 .byte 102,68,15,56,220,192 .byte 102,68,15,56,220,200 movups 160-128(%rcx),%xmm0 jb L$ctr32_enc_done .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 .byte 102,68,15,56,220,193 .byte 102,68,15,56,220,201 movups 176-128(%rcx),%xmm1 .byte 102,15,56,220,208 .byte 102,15,56,220,216 .byte 102,15,56,220,224 .byte 102,15,56,220,232 .byte 102,15,56,220,240 .byte 102,15,56,220,248 .byte 102,68,15,56,220,192 .byte 102,68,15,56,220,200 movups 192-128(%rcx),%xmm0 .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 .byte 102,68,15,56,220,193 .byte 102,68,15,56,220,201 movups 208-128(%rcx),%xmm1 .byte 102,15,56,220,208 .byte 102,15,56,220,216 .byte 102,15,56,220,224 .byte 102,15,56,220,232 .byte 102,15,56,220,240 .byte 102,15,56,220,248 .byte 102,68,15,56,220,192 .byte 102,68,15,56,220,200 movups 224-128(%rcx),%xmm0 jmp L$ctr32_enc_done .p2align 4 L$ctr32_enc_done: movdqu 16(%rdi),%xmm11 pxor %xmm0,%xmm10 movdqu 32(%rdi),%xmm12 pxor %xmm0,%xmm11 movdqu 48(%rdi),%xmm13 pxor %xmm0,%xmm12 movdqu 64(%rdi),%xmm14 pxor %xmm0,%xmm13 movdqu 80(%rdi),%xmm15 pxor %xmm0,%xmm14 prefetcht0 448(%rdi) prefetcht0 512(%rdi) pxor %xmm0,%xmm15 .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 .byte 102,68,15,56,220,193 .byte 102,68,15,56,220,201 movdqu 96(%rdi),%xmm1 leaq 128(%rdi),%rdi .byte 102,65,15,56,221,210 pxor %xmm0,%xmm1 movdqu 112-128(%rdi),%xmm10 .byte 102,65,15,56,221,219 pxor %xmm0,%xmm10 movdqa 0(%rsp),%xmm11 .byte 102,65,15,56,221,228 .byte 102,65,15,56,221,237 movdqa 16(%rsp),%xmm12 movdqa 32(%rsp),%xmm13 .byte 102,65,15,56,221,246 .byte 102,65,15,56,221,255 movdqa 48(%rsp),%xmm14 movdqa 64(%rsp),%xmm15 .byte 102,68,15,56,221,193 movdqa 80(%rsp),%xmm0 movups 16-128(%rcx),%xmm1 .byte 102,69,15,56,221,202 movups %xmm2,(%rsi) movdqa %xmm11,%xmm2 movups %xmm3,16(%rsi) movdqa %xmm12,%xmm3 movups %xmm4,32(%rsi) movdqa %xmm13,%xmm4 movups %xmm5,48(%rsi) 
movdqa %xmm14,%xmm5 movups %xmm6,64(%rsi) movdqa %xmm15,%xmm6 movups %xmm7,80(%rsi) movdqa %xmm0,%xmm7 movups %xmm8,96(%rsi) movups %xmm9,112(%rsi) leaq 128(%rsi),%rsi subq $8,%rdx jnc L$ctr32_loop8 addq $8,%rdx jz L$ctr32_done leaq -128(%rcx),%rcx L$ctr32_tail: leaq 16(%rcx),%rcx cmpq $4,%rdx jb L$ctr32_loop3 je L$ctr32_loop4 shll $4,%eax movdqa 96(%rsp),%xmm8 pxor %xmm9,%xmm9 movups 16(%rcx),%xmm0 .byte 102,15,56,220,209 .byte 102,15,56,220,217 leaq 32-16(%rcx,%rax,1),%rcx negq %rax .byte 102,15,56,220,225 addq $16,%rax movups (%rdi),%xmm10 .byte 102,15,56,220,233 .byte 102,15,56,220,241 movups 16(%rdi),%xmm11 movups 32(%rdi),%xmm12 .byte 102,15,56,220,249 .byte 102,68,15,56,220,193 call L$enc_loop8_enter movdqu 48(%rdi),%xmm13 pxor %xmm10,%xmm2 movdqu 64(%rdi),%xmm10 pxor %xmm11,%xmm3 movdqu %xmm2,(%rsi) pxor %xmm12,%xmm4 movdqu %xmm3,16(%rsi) pxor %xmm13,%xmm5 movdqu %xmm4,32(%rsi) pxor %xmm10,%xmm6 movdqu %xmm5,48(%rsi) movdqu %xmm6,64(%rsi) cmpq $6,%rdx jb L$ctr32_done movups 80(%rdi),%xmm11 xorps %xmm11,%xmm7 movups %xmm7,80(%rsi) je L$ctr32_done movups 96(%rdi),%xmm12 xorps %xmm12,%xmm8 movups %xmm8,96(%rsi) jmp L$ctr32_done .p2align 5 L$ctr32_loop4: .byte 102,15,56,220,209 leaq 16(%rcx),%rcx decl %eax .byte 102,15,56,220,217 .byte 102,15,56,220,225 .byte 102,15,56,220,233 movups (%rcx),%xmm1 jnz L$ctr32_loop4 .byte 102,15,56,221,209 .byte 102,15,56,221,217 movups (%rdi),%xmm10 movups 16(%rdi),%xmm11 .byte 102,15,56,221,225 .byte 102,15,56,221,233 movups 32(%rdi),%xmm12 movups 48(%rdi),%xmm13 xorps %xmm10,%xmm2 movups %xmm2,(%rsi) xorps %xmm11,%xmm3 movups %xmm3,16(%rsi) pxor %xmm12,%xmm4 movdqu %xmm4,32(%rsi) pxor %xmm13,%xmm5 movdqu %xmm5,48(%rsi) jmp L$ctr32_done .p2align 5 L$ctr32_loop3: .byte 102,15,56,220,209 leaq 16(%rcx),%rcx decl %eax .byte 102,15,56,220,217 .byte 102,15,56,220,225 movups (%rcx),%xmm1 jnz L$ctr32_loop3 .byte 102,15,56,221,209 .byte 102,15,56,221,217 .byte 102,15,56,221,225 movups (%rdi),%xmm10 xorps %xmm10,%xmm2 movups %xmm2,(%rsi) cmpq $2,%rdx jb L$ctr32_done movups 16(%rdi),%xmm11 xorps %xmm11,%xmm3 movups %xmm3,16(%rsi) je L$ctr32_done movups 32(%rdi),%xmm12 xorps %xmm12,%xmm4 movups %xmm4,32(%rsi) L$ctr32_done: xorps %xmm0,%xmm0 xorl %ebp,%ebp pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 pxor %xmm3,%xmm3 pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 pxor %xmm6,%xmm6 pxor %xmm7,%xmm7 movaps %xmm0,0(%rsp) pxor %xmm8,%xmm8 movaps %xmm0,16(%rsp) pxor %xmm9,%xmm9 movaps %xmm0,32(%rsp) pxor %xmm10,%xmm10 movaps %xmm0,48(%rsp) pxor %xmm11,%xmm11 movaps %xmm0,64(%rsp) pxor %xmm12,%xmm12 movaps %xmm0,80(%rsp) pxor %xmm13,%xmm13 movaps %xmm0,96(%rsp) pxor %xmm14,%xmm14 movaps %xmm0,112(%rsp) pxor %xmm15,%xmm15 movq -8(%r11),%rbp leaq (%r11),%rsp L$ctr32_epilogue: ret .globl _aes_hw_set_encrypt_key_base .private_extern _aes_hw_set_encrypt_key_base .p2align 4 _aes_hw_set_encrypt_key_base: _CET_ENDBR #ifdef BORINGSSL_DISPATCH_TEST movb $1,BORINGSSL_function_hit+3(%rip) #endif subq $8,%rsp movups (%rdi),%xmm0 xorps %xmm4,%xmm4 leaq 16(%rdx),%rax cmpl $256,%esi je L$14rounds cmpl $128,%esi jne L$bad_keybits L$10rounds: movl $9,%esi movups %xmm0,(%rdx) .byte 102,15,58,223,200,1 call L$key_expansion_128_cold .byte 102,15,58,223,200,2 call L$key_expansion_128 .byte 102,15,58,223,200,4 call L$key_expansion_128 .byte 102,15,58,223,200,8 call L$key_expansion_128 .byte 102,15,58,223,200,16 call L$key_expansion_128 .byte 102,15,58,223,200,32 call L$key_expansion_128 .byte 102,15,58,223,200,64 call L$key_expansion_128 .byte 102,15,58,223,200,128 call L$key_expansion_128 .byte 102,15,58,223,200,27 
call L$key_expansion_128 .byte 102,15,58,223,200,54 call L$key_expansion_128 movups %xmm0,(%rax) movl %esi,80(%rax) xorl %eax,%eax jmp L$enc_key_ret .p2align 4 L$14rounds: movups 16(%rdi),%xmm2 movl $13,%esi leaq 16(%rax),%rax movups %xmm0,(%rdx) movups %xmm2,16(%rdx) .byte 102,15,58,223,202,1 call L$key_expansion_256a_cold .byte 102,15,58,223,200,1 call L$key_expansion_256b .byte 102,15,58,223,202,2 call L$key_expansion_256a .byte 102,15,58,223,200,2 call L$key_expansion_256b .byte 102,15,58,223,202,4 call L$key_expansion_256a .byte 102,15,58,223,200,4 call L$key_expansion_256b .byte 102,15,58,223,202,8 call L$key_expansion_256a .byte 102,15,58,223,200,8 call L$key_expansion_256b .byte 102,15,58,223,202,16 call L$key_expansion_256a .byte 102,15,58,223,200,16 call L$key_expansion_256b .byte 102,15,58,223,202,32 call L$key_expansion_256a .byte 102,15,58,223,200,32 call L$key_expansion_256b .byte 102,15,58,223,202,64 call L$key_expansion_256a movups %xmm0,(%rax) movl %esi,16(%rax) xorq %rax,%rax jmp L$enc_key_ret .p2align 4 L$bad_keybits: movq $-2,%rax L$enc_key_ret: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 pxor %xmm3,%xmm3 pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 addq $8,%rsp ret .p2align 4 L$key_expansion_128: movups %xmm0,(%rax) leaq 16(%rax),%rax L$key_expansion_128_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $255,%xmm1,%xmm1 xorps %xmm1,%xmm0 ret .p2align 4 L$key_expansion_256a: movups %xmm2,(%rax) leaq 16(%rax),%rax L$key_expansion_256a_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $255,%xmm1,%xmm1 xorps %xmm1,%xmm0 ret .p2align 4 L$key_expansion_256b: movups %xmm0,(%rax) leaq 16(%rax),%rax shufps $16,%xmm2,%xmm4 xorps %xmm4,%xmm2 shufps $140,%xmm2,%xmm4 xorps %xmm4,%xmm2 shufps $170,%xmm1,%xmm1 xorps %xmm1,%xmm2 ret .globl _aes_hw_set_encrypt_key_alt .private_extern _aes_hw_set_encrypt_key_alt .p2align 4 _aes_hw_set_encrypt_key_alt: _CET_ENDBR #ifdef BORINGSSL_DISPATCH_TEST movb $1,BORINGSSL_function_hit+3(%rip) #endif subq $8,%rsp movups (%rdi),%xmm0 xorps %xmm4,%xmm4 leaq 16(%rdx),%rax cmpl $256,%esi je L$14rounds_alt cmpl $128,%esi jne L$bad_keybits_alt movl $9,%esi movdqa L$key_rotate(%rip),%xmm5 movl $8,%r10d movdqa L$key_rcon1(%rip),%xmm4 movdqa %xmm0,%xmm2 movdqu %xmm0,(%rdx) jmp L$oop_key128 .p2align 4 L$oop_key128: .byte 102,15,56,0,197 .byte 102,15,56,221,196 pslld $1,%xmm4 leaq 16(%rax),%rax movdqa %xmm2,%xmm3 pslldq $4,%xmm2 pxor %xmm2,%xmm3 pslldq $4,%xmm2 pxor %xmm2,%xmm3 pslldq $4,%xmm2 pxor %xmm3,%xmm2 pxor %xmm2,%xmm0 movdqu %xmm0,-16(%rax) movdqa %xmm0,%xmm2 decl %r10d jnz L$oop_key128 movdqa L$key_rcon1b(%rip),%xmm4 .byte 102,15,56,0,197 .byte 102,15,56,221,196 pslld $1,%xmm4 movdqa %xmm2,%xmm3 pslldq $4,%xmm2 pxor %xmm2,%xmm3 pslldq $4,%xmm2 pxor %xmm2,%xmm3 pslldq $4,%xmm2 pxor %xmm3,%xmm2 pxor %xmm2,%xmm0 movdqu %xmm0,(%rax) movdqa %xmm0,%xmm2 .byte 102,15,56,0,197 .byte 102,15,56,221,196 movdqa %xmm2,%xmm3 pslldq $4,%xmm2 pxor %xmm2,%xmm3 pslldq $4,%xmm2 pxor %xmm2,%xmm3 pslldq $4,%xmm2 pxor %xmm3,%xmm2 pxor %xmm2,%xmm0 movdqu %xmm0,16(%rax) movl %esi,96(%rax) xorl %eax,%eax jmp L$enc_key_ret_alt .p2align 4 L$14rounds_alt: movups 16(%rdi),%xmm2 movl $13,%esi leaq 16(%rax),%rax movdqa L$key_rotate(%rip),%xmm5 movdqa L$key_rcon1(%rip),%xmm4 movl $7,%r10d movdqu %xmm0,0(%rdx) movdqa %xmm2,%xmm1 movdqu %xmm2,16(%rdx) jmp L$oop_key256 .p2align 4 L$oop_key256: .byte 102,15,56,0,213 .byte 102,15,56,221,212 movdqa %xmm0,%xmm3 pslldq $4,%xmm0 pxor %xmm0,%xmm3 pslldq 
$4,%xmm0 pxor %xmm0,%xmm3 pslldq $4,%xmm0 pxor %xmm3,%xmm0 pslld $1,%xmm4 pxor %xmm2,%xmm0 movdqu %xmm0,(%rax) decl %r10d jz L$done_key256 pshufd $0xff,%xmm0,%xmm2 pxor %xmm3,%xmm3 .byte 102,15,56,221,211 movdqa %xmm1,%xmm3 pslldq $4,%xmm1 pxor %xmm1,%xmm3 pslldq $4,%xmm1 pxor %xmm1,%xmm3 pslldq $4,%xmm1 pxor %xmm3,%xmm1 pxor %xmm1,%xmm2 movdqu %xmm2,16(%rax) leaq 32(%rax),%rax movdqa %xmm2,%xmm1 jmp L$oop_key256 L$done_key256: movl %esi,16(%rax) xorl %eax,%eax jmp L$enc_key_ret_alt .p2align 4 L$bad_keybits_alt: movq $-2,%rax L$enc_key_ret_alt: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 pxor %xmm3,%xmm3 pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 addq $8,%rsp ret .section __DATA,__const .p2align 6 L$bswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 L$increment32: .long 6,6,6,0 L$increment64: .long 1,0,0,0 L$increment1: .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 L$key_rotate: .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d L$key_rotate192: .long 0x04070605,0x04070605,0x04070605,0x04070605 L$key_rcon1: .long 1,1,1,1 L$key_rcon1b: .long 0x1b,0x1b,0x1b,0x1b .byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .p2align 6 .text #endif ring-0.17.14/pregenerated/aesni-x86_64-nasm.asm000064400000000000000000000565151046102023000171070ustar 00000000000000; This file is generated from a similarly-named Perl script in the BoringSSL ; source tree. Do not edit by hand. %ifidn __OUTPUT_FORMAT__, win64 default rel %define XMMWORD %define YMMWORD %define ZMMWORD %define _CET_ENDBR %include "ring_core_generated/prefix_symbols_nasm.inc" section .text code align=64 ALIGN 16 _aesni_encrypt2: movups xmm0,XMMWORD[rcx] shl eax,4 movups xmm1,XMMWORD[16+rcx] xorps xmm2,xmm0 xorps xmm3,xmm0 movups xmm0,XMMWORD[32+rcx] lea rcx,[32+rax*1+rcx] neg rax add rax,16 $L$enc_loop2: DB 102,15,56,220,209 DB 102,15,56,220,217 movups xmm1,XMMWORD[rax*1+rcx] add rax,32 DB 102,15,56,220,208 DB 102,15,56,220,216 movups xmm0,XMMWORD[((-16))+rax*1+rcx] jnz NEAR $L$enc_loop2 DB 102,15,56,220,209 DB 102,15,56,220,217 DB 102,15,56,221,208 DB 102,15,56,221,216 ret ALIGN 16 _aesni_encrypt3: movups xmm0,XMMWORD[rcx] shl eax,4 movups xmm1,XMMWORD[16+rcx] xorps xmm2,xmm0 xorps xmm3,xmm0 xorps xmm4,xmm0 movups xmm0,XMMWORD[32+rcx] lea rcx,[32+rax*1+rcx] neg rax add rax,16 $L$enc_loop3: DB 102,15,56,220,209 DB 102,15,56,220,217 DB 102,15,56,220,225 movups xmm1,XMMWORD[rax*1+rcx] add rax,32 DB 102,15,56,220,208 DB 102,15,56,220,216 DB 102,15,56,220,224 movups xmm0,XMMWORD[((-16))+rax*1+rcx] jnz NEAR $L$enc_loop3 DB 102,15,56,220,209 DB 102,15,56,220,217 DB 102,15,56,220,225 DB 102,15,56,221,208 DB 102,15,56,221,216 DB 102,15,56,221,224 ret ALIGN 16 _aesni_encrypt4: movups xmm0,XMMWORD[rcx] shl eax,4 movups xmm1,XMMWORD[16+rcx] xorps xmm2,xmm0 xorps xmm3,xmm0 xorps xmm4,xmm0 xorps xmm5,xmm0 movups xmm0,XMMWORD[32+rcx] lea rcx,[32+rax*1+rcx] neg rax DB 0x0f,0x1f,0x00 add rax,16 $L$enc_loop4: DB 102,15,56,220,209 DB 102,15,56,220,217 DB 102,15,56,220,225 DB 102,15,56,220,233 movups xmm1,XMMWORD[rax*1+rcx] add rax,32 DB 102,15,56,220,208 DB 102,15,56,220,216 DB 102,15,56,220,224 DB 102,15,56,220,232 movups xmm0,XMMWORD[((-16))+rax*1+rcx] jnz NEAR $L$enc_loop4 DB 102,15,56,220,209 DB 102,15,56,220,217 DB 102,15,56,220,225 DB 102,15,56,220,233 DB 102,15,56,221,208 DB 102,15,56,221,216 DB 102,15,56,221,224 DB 102,15,56,221,232 ret ALIGN 16 _aesni_encrypt6: movups xmm0,XMMWORD[rcx] shl eax,4 movups 
xmm1,XMMWORD[16+rcx] xorps xmm2,xmm0 pxor xmm3,xmm0 pxor xmm4,xmm0 DB 102,15,56,220,209 lea rcx,[32+rax*1+rcx] neg rax DB 102,15,56,220,217 pxor xmm5,xmm0 pxor xmm6,xmm0 DB 102,15,56,220,225 pxor xmm7,xmm0 movups xmm0,XMMWORD[rax*1+rcx] add rax,16 jmp NEAR $L$enc_loop6_enter ALIGN 16 $L$enc_loop6: DB 102,15,56,220,209 DB 102,15,56,220,217 DB 102,15,56,220,225 $L$enc_loop6_enter: DB 102,15,56,220,233 DB 102,15,56,220,241 DB 102,15,56,220,249 movups xmm1,XMMWORD[rax*1+rcx] add rax,32 DB 102,15,56,220,208 DB 102,15,56,220,216 DB 102,15,56,220,224 DB 102,15,56,220,232 DB 102,15,56,220,240 DB 102,15,56,220,248 movups xmm0,XMMWORD[((-16))+rax*1+rcx] jnz NEAR $L$enc_loop6 DB 102,15,56,220,209 DB 102,15,56,220,217 DB 102,15,56,220,225 DB 102,15,56,220,233 DB 102,15,56,220,241 DB 102,15,56,220,249 DB 102,15,56,221,208 DB 102,15,56,221,216 DB 102,15,56,221,224 DB 102,15,56,221,232 DB 102,15,56,221,240 DB 102,15,56,221,248 ret ALIGN 16 _aesni_encrypt8: movups xmm0,XMMWORD[rcx] shl eax,4 movups xmm1,XMMWORD[16+rcx] xorps xmm2,xmm0 xorps xmm3,xmm0 pxor xmm4,xmm0 pxor xmm5,xmm0 pxor xmm6,xmm0 lea rcx,[32+rax*1+rcx] neg rax DB 102,15,56,220,209 pxor xmm7,xmm0 pxor xmm8,xmm0 DB 102,15,56,220,217 pxor xmm9,xmm0 movups xmm0,XMMWORD[rax*1+rcx] add rax,16 jmp NEAR $L$enc_loop8_inner ALIGN 16 $L$enc_loop8: DB 102,15,56,220,209 DB 102,15,56,220,217 $L$enc_loop8_inner: DB 102,15,56,220,225 DB 102,15,56,220,233 DB 102,15,56,220,241 DB 102,15,56,220,249 DB 102,68,15,56,220,193 DB 102,68,15,56,220,201 $L$enc_loop8_enter: movups xmm1,XMMWORD[rax*1+rcx] add rax,32 DB 102,15,56,220,208 DB 102,15,56,220,216 DB 102,15,56,220,224 DB 102,15,56,220,232 DB 102,15,56,220,240 DB 102,15,56,220,248 DB 102,68,15,56,220,192 DB 102,68,15,56,220,200 movups xmm0,XMMWORD[((-16))+rax*1+rcx] jnz NEAR $L$enc_loop8 DB 102,15,56,220,209 DB 102,15,56,220,217 DB 102,15,56,220,225 DB 102,15,56,220,233 DB 102,15,56,220,241 DB 102,15,56,220,249 DB 102,68,15,56,220,193 DB 102,68,15,56,220,201 DB 102,15,56,221,208 DB 102,15,56,221,216 DB 102,15,56,221,224 DB 102,15,56,221,232 DB 102,15,56,221,240 DB 102,15,56,221,248 DB 102,68,15,56,221,192 DB 102,68,15,56,221,200 ret global aes_hw_ctr32_encrypt_blocks ALIGN 16 aes_hw_ctr32_encrypt_blocks: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_aes_hw_ctr32_encrypt_blocks: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD[40+rsp] _CET_ENDBR %ifdef BORINGSSL_DISPATCH_TEST mov BYTE[BORINGSSL_function_hit],1 %endif cmp rdx,1 jne NEAR $L$ctr32_bulk movups xmm2,XMMWORD[r8] movups xmm3,XMMWORD[rdi] mov edx,DWORD[240+rcx] movups xmm0,XMMWORD[rcx] movups xmm1,XMMWORD[16+rcx] lea rcx,[32+rcx] xorps xmm2,xmm0 $L$oop_enc1_1: DB 102,15,56,220,209 dec edx movups xmm1,XMMWORD[rcx] lea rcx,[16+rcx] jnz NEAR $L$oop_enc1_1 DB 102,15,56,221,209 pxor xmm0,xmm0 pxor xmm1,xmm1 xorps xmm2,xmm3 pxor xmm3,xmm3 movups XMMWORD[rsi],xmm2 xorps xmm2,xmm2 jmp NEAR $L$ctr32_epilogue ALIGN 16 $L$ctr32_bulk: lea r11,[rsp] push rbp sub rsp,288 and rsp,-16 movaps XMMWORD[(-168)+r11],xmm6 movaps XMMWORD[(-152)+r11],xmm7 movaps XMMWORD[(-136)+r11],xmm8 movaps XMMWORD[(-120)+r11],xmm9 movaps XMMWORD[(-104)+r11],xmm10 movaps XMMWORD[(-88)+r11],xmm11 movaps XMMWORD[(-72)+r11],xmm12 movaps XMMWORD[(-56)+r11],xmm13 movaps XMMWORD[(-40)+r11],xmm14 movaps XMMWORD[(-24)+r11],xmm15 $L$ctr32_body: movdqu xmm2,XMMWORD[r8] movdqu xmm0,XMMWORD[rcx] mov r8d,DWORD[12+r8] pxor xmm2,xmm0 mov ebp,DWORD[12+rcx] movdqa XMMWORD[rsp],xmm2 bswap r8d movdqa xmm3,xmm2 movdqa xmm4,xmm2 movdqa xmm5,xmm2 movdqa 
XMMWORD[64+rsp],xmm2 movdqa XMMWORD[80+rsp],xmm2 movdqa XMMWORD[96+rsp],xmm2 mov r10,rdx movdqa XMMWORD[112+rsp],xmm2 lea rax,[1+r8] lea rdx,[2+r8] bswap eax bswap edx xor eax,ebp xor edx,ebp DB 102,15,58,34,216,3 lea rax,[3+r8] movdqa XMMWORD[16+rsp],xmm3 DB 102,15,58,34,226,3 bswap eax mov rdx,r10 lea r10,[4+r8] movdqa XMMWORD[32+rsp],xmm4 xor eax,ebp bswap r10d DB 102,15,58,34,232,3 xor r10d,ebp movdqa XMMWORD[48+rsp],xmm5 lea r9,[5+r8] mov DWORD[((64+12))+rsp],r10d bswap r9d lea r10,[6+r8] mov eax,DWORD[240+rcx] xor r9d,ebp bswap r10d mov DWORD[((80+12))+rsp],r9d xor r10d,ebp lea r9,[7+r8] mov DWORD[((96+12))+rsp],r10d bswap r9d xor r9d,ebp mov DWORD[((112+12))+rsp],r9d movups xmm1,XMMWORD[16+rcx] movdqa xmm6,XMMWORD[64+rsp] movdqa xmm7,XMMWORD[80+rsp] cmp rdx,8 jb NEAR $L$ctr32_tail lea rcx,[128+rcx] sub rdx,8 jmp NEAR $L$ctr32_loop8 ALIGN 32 $L$ctr32_loop8: add r8d,8 movdqa xmm8,XMMWORD[96+rsp] DB 102,15,56,220,209 mov r9d,r8d movdqa xmm9,XMMWORD[112+rsp] DB 102,15,56,220,217 bswap r9d movups xmm0,XMMWORD[((32-128))+rcx] DB 102,15,56,220,225 xor r9d,ebp nop DB 102,15,56,220,233 mov DWORD[((0+12))+rsp],r9d lea r9,[1+r8] DB 102,15,56,220,241 DB 102,15,56,220,249 DB 102,68,15,56,220,193 DB 102,68,15,56,220,201 movups xmm1,XMMWORD[((48-128))+rcx] bswap r9d DB 102,15,56,220,208 DB 102,15,56,220,216 xor r9d,ebp DB 0x66,0x90 DB 102,15,56,220,224 DB 102,15,56,220,232 mov DWORD[((16+12))+rsp],r9d lea r9,[2+r8] DB 102,15,56,220,240 DB 102,15,56,220,248 DB 102,68,15,56,220,192 DB 102,68,15,56,220,200 movups xmm0,XMMWORD[((64-128))+rcx] bswap r9d DB 102,15,56,220,209 DB 102,15,56,220,217 xor r9d,ebp DB 0x66,0x90 DB 102,15,56,220,225 DB 102,15,56,220,233 mov DWORD[((32+12))+rsp],r9d lea r9,[3+r8] DB 102,15,56,220,241 DB 102,15,56,220,249 DB 102,68,15,56,220,193 DB 102,68,15,56,220,201 movups xmm1,XMMWORD[((80-128))+rcx] bswap r9d DB 102,15,56,220,208 DB 102,15,56,220,216 xor r9d,ebp DB 0x66,0x90 DB 102,15,56,220,224 DB 102,15,56,220,232 mov DWORD[((48+12))+rsp],r9d lea r9,[4+r8] DB 102,15,56,220,240 DB 102,15,56,220,248 DB 102,68,15,56,220,192 DB 102,68,15,56,220,200 movups xmm0,XMMWORD[((96-128))+rcx] bswap r9d DB 102,15,56,220,209 DB 102,15,56,220,217 xor r9d,ebp DB 0x66,0x90 DB 102,15,56,220,225 DB 102,15,56,220,233 mov DWORD[((64+12))+rsp],r9d lea r9,[5+r8] DB 102,15,56,220,241 DB 102,15,56,220,249 DB 102,68,15,56,220,193 DB 102,68,15,56,220,201 movups xmm1,XMMWORD[((112-128))+rcx] bswap r9d DB 102,15,56,220,208 DB 102,15,56,220,216 xor r9d,ebp DB 0x66,0x90 DB 102,15,56,220,224 DB 102,15,56,220,232 mov DWORD[((80+12))+rsp],r9d lea r9,[6+r8] DB 102,15,56,220,240 DB 102,15,56,220,248 DB 102,68,15,56,220,192 DB 102,68,15,56,220,200 movups xmm0,XMMWORD[((128-128))+rcx] bswap r9d DB 102,15,56,220,209 DB 102,15,56,220,217 xor r9d,ebp DB 0x66,0x90 DB 102,15,56,220,225 DB 102,15,56,220,233 mov DWORD[((96+12))+rsp],r9d lea r9,[7+r8] DB 102,15,56,220,241 DB 102,15,56,220,249 DB 102,68,15,56,220,193 DB 102,68,15,56,220,201 movups xmm1,XMMWORD[((144-128))+rcx] bswap r9d DB 102,15,56,220,208 DB 102,15,56,220,216 DB 102,15,56,220,224 xor r9d,ebp movdqu xmm10,XMMWORD[rdi] DB 102,15,56,220,232 mov DWORD[((112+12))+rsp],r9d cmp eax,11 DB 102,15,56,220,240 DB 102,15,56,220,248 DB 102,68,15,56,220,192 DB 102,68,15,56,220,200 movups xmm0,XMMWORD[((160-128))+rcx] jb NEAR $L$ctr32_enc_done DB 102,15,56,220,209 DB 102,15,56,220,217 DB 102,15,56,220,225 DB 102,15,56,220,233 DB 102,15,56,220,241 DB 102,15,56,220,249 DB 102,68,15,56,220,193 DB 102,68,15,56,220,201 movups xmm1,XMMWORD[((176-128))+rcx] DB 
102,15,56,220,208 DB 102,15,56,220,216 DB 102,15,56,220,224 DB 102,15,56,220,232 DB 102,15,56,220,240 DB 102,15,56,220,248 DB 102,68,15,56,220,192 DB 102,68,15,56,220,200 movups xmm0,XMMWORD[((192-128))+rcx] DB 102,15,56,220,209 DB 102,15,56,220,217 DB 102,15,56,220,225 DB 102,15,56,220,233 DB 102,15,56,220,241 DB 102,15,56,220,249 DB 102,68,15,56,220,193 DB 102,68,15,56,220,201 movups xmm1,XMMWORD[((208-128))+rcx] DB 102,15,56,220,208 DB 102,15,56,220,216 DB 102,15,56,220,224 DB 102,15,56,220,232 DB 102,15,56,220,240 DB 102,15,56,220,248 DB 102,68,15,56,220,192 DB 102,68,15,56,220,200 movups xmm0,XMMWORD[((224-128))+rcx] jmp NEAR $L$ctr32_enc_done ALIGN 16 $L$ctr32_enc_done: movdqu xmm11,XMMWORD[16+rdi] pxor xmm10,xmm0 movdqu xmm12,XMMWORD[32+rdi] pxor xmm11,xmm0 movdqu xmm13,XMMWORD[48+rdi] pxor xmm12,xmm0 movdqu xmm14,XMMWORD[64+rdi] pxor xmm13,xmm0 movdqu xmm15,XMMWORD[80+rdi] pxor xmm14,xmm0 prefetcht0 [448+rdi] prefetcht0 [512+rdi] pxor xmm15,xmm0 DB 102,15,56,220,209 DB 102,15,56,220,217 DB 102,15,56,220,225 DB 102,15,56,220,233 DB 102,15,56,220,241 DB 102,15,56,220,249 DB 102,68,15,56,220,193 DB 102,68,15,56,220,201 movdqu xmm1,XMMWORD[96+rdi] lea rdi,[128+rdi] DB 102,65,15,56,221,210 pxor xmm1,xmm0 movdqu xmm10,XMMWORD[((112-128))+rdi] DB 102,65,15,56,221,219 pxor xmm10,xmm0 movdqa xmm11,XMMWORD[rsp] DB 102,65,15,56,221,228 DB 102,65,15,56,221,237 movdqa xmm12,XMMWORD[16+rsp] movdqa xmm13,XMMWORD[32+rsp] DB 102,65,15,56,221,246 DB 102,65,15,56,221,255 movdqa xmm14,XMMWORD[48+rsp] movdqa xmm15,XMMWORD[64+rsp] DB 102,68,15,56,221,193 movdqa xmm0,XMMWORD[80+rsp] movups xmm1,XMMWORD[((16-128))+rcx] DB 102,69,15,56,221,202 movups XMMWORD[rsi],xmm2 movdqa xmm2,xmm11 movups XMMWORD[16+rsi],xmm3 movdqa xmm3,xmm12 movups XMMWORD[32+rsi],xmm4 movdqa xmm4,xmm13 movups XMMWORD[48+rsi],xmm5 movdqa xmm5,xmm14 movups XMMWORD[64+rsi],xmm6 movdqa xmm6,xmm15 movups XMMWORD[80+rsi],xmm7 movdqa xmm7,xmm0 movups XMMWORD[96+rsi],xmm8 movups XMMWORD[112+rsi],xmm9 lea rsi,[128+rsi] sub rdx,8 jnc NEAR $L$ctr32_loop8 add rdx,8 jz NEAR $L$ctr32_done lea rcx,[((-128))+rcx] $L$ctr32_tail: lea rcx,[16+rcx] cmp rdx,4 jb NEAR $L$ctr32_loop3 je NEAR $L$ctr32_loop4 shl eax,4 movdqa xmm8,XMMWORD[96+rsp] pxor xmm9,xmm9 movups xmm0,XMMWORD[16+rcx] DB 102,15,56,220,209 DB 102,15,56,220,217 lea rcx,[((32-16))+rax*1+rcx] neg rax DB 102,15,56,220,225 add rax,16 movups xmm10,XMMWORD[rdi] DB 102,15,56,220,233 DB 102,15,56,220,241 movups xmm11,XMMWORD[16+rdi] movups xmm12,XMMWORD[32+rdi] DB 102,15,56,220,249 DB 102,68,15,56,220,193 call $L$enc_loop8_enter movdqu xmm13,XMMWORD[48+rdi] pxor xmm2,xmm10 movdqu xmm10,XMMWORD[64+rdi] pxor xmm3,xmm11 movdqu XMMWORD[rsi],xmm2 pxor xmm4,xmm12 movdqu XMMWORD[16+rsi],xmm3 pxor xmm5,xmm13 movdqu XMMWORD[32+rsi],xmm4 pxor xmm6,xmm10 movdqu XMMWORD[48+rsi],xmm5 movdqu XMMWORD[64+rsi],xmm6 cmp rdx,6 jb NEAR $L$ctr32_done movups xmm11,XMMWORD[80+rdi] xorps xmm7,xmm11 movups XMMWORD[80+rsi],xmm7 je NEAR $L$ctr32_done movups xmm12,XMMWORD[96+rdi] xorps xmm8,xmm12 movups XMMWORD[96+rsi],xmm8 jmp NEAR $L$ctr32_done ALIGN 32 $L$ctr32_loop4: DB 102,15,56,220,209 lea rcx,[16+rcx] dec eax DB 102,15,56,220,217 DB 102,15,56,220,225 DB 102,15,56,220,233 movups xmm1,XMMWORD[rcx] jnz NEAR $L$ctr32_loop4 DB 102,15,56,221,209 DB 102,15,56,221,217 movups xmm10,XMMWORD[rdi] movups xmm11,XMMWORD[16+rdi] DB 102,15,56,221,225 DB 102,15,56,221,233 movups xmm12,XMMWORD[32+rdi] movups xmm13,XMMWORD[48+rdi] xorps xmm2,xmm10 movups XMMWORD[rsi],xmm2 xorps xmm3,xmm11 movups XMMWORD[16+rsi],xmm3 pxor xmm4,xmm12 
movdqu XMMWORD[32+rsi],xmm4 pxor xmm5,xmm13 movdqu XMMWORD[48+rsi],xmm5 jmp NEAR $L$ctr32_done ALIGN 32 $L$ctr32_loop3: DB 102,15,56,220,209 lea rcx,[16+rcx] dec eax DB 102,15,56,220,217 DB 102,15,56,220,225 movups xmm1,XMMWORD[rcx] jnz NEAR $L$ctr32_loop3 DB 102,15,56,221,209 DB 102,15,56,221,217 DB 102,15,56,221,225 movups xmm10,XMMWORD[rdi] xorps xmm2,xmm10 movups XMMWORD[rsi],xmm2 cmp rdx,2 jb NEAR $L$ctr32_done movups xmm11,XMMWORD[16+rdi] xorps xmm3,xmm11 movups XMMWORD[16+rsi],xmm3 je NEAR $L$ctr32_done movups xmm12,XMMWORD[32+rdi] xorps xmm4,xmm12 movups XMMWORD[32+rsi],xmm4 $L$ctr32_done: xorps xmm0,xmm0 xor ebp,ebp pxor xmm1,xmm1 pxor xmm2,xmm2 pxor xmm3,xmm3 pxor xmm4,xmm4 pxor xmm5,xmm5 movaps xmm6,XMMWORD[((-168))+r11] movaps XMMWORD[(-168)+r11],xmm0 movaps xmm7,XMMWORD[((-152))+r11] movaps XMMWORD[(-152)+r11],xmm0 movaps xmm8,XMMWORD[((-136))+r11] movaps XMMWORD[(-136)+r11],xmm0 movaps xmm9,XMMWORD[((-120))+r11] movaps XMMWORD[(-120)+r11],xmm0 movaps xmm10,XMMWORD[((-104))+r11] movaps XMMWORD[(-104)+r11],xmm0 movaps xmm11,XMMWORD[((-88))+r11] movaps XMMWORD[(-88)+r11],xmm0 movaps xmm12,XMMWORD[((-72))+r11] movaps XMMWORD[(-72)+r11],xmm0 movaps xmm13,XMMWORD[((-56))+r11] movaps XMMWORD[(-56)+r11],xmm0 movaps xmm14,XMMWORD[((-40))+r11] movaps XMMWORD[(-40)+r11],xmm0 movaps xmm15,XMMWORD[((-24))+r11] movaps XMMWORD[(-24)+r11],xmm0 movaps XMMWORD[rsp],xmm0 movaps XMMWORD[16+rsp],xmm0 movaps XMMWORD[32+rsp],xmm0 movaps XMMWORD[48+rsp],xmm0 movaps XMMWORD[64+rsp],xmm0 movaps XMMWORD[80+rsp],xmm0 movaps XMMWORD[96+rsp],xmm0 movaps XMMWORD[112+rsp],xmm0 mov rbp,QWORD[((-8))+r11] lea rsp,[r11] $L$ctr32_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$SEH_end_aes_hw_ctr32_encrypt_blocks: global aes_hw_set_encrypt_key_base ALIGN 16 aes_hw_set_encrypt_key_base: $L$SEH_begin_aes_hw_set_encrypt_key_base_1: _CET_ENDBR %ifdef BORINGSSL_DISPATCH_TEST mov BYTE[((BORINGSSL_function_hit+3))],1 %endif sub rsp,8 $L$SEH_prologue_aes_hw_set_encrypt_key_base_2: $L$SEH_endprologue_aes_hw_set_encrypt_key_base_3: movups xmm0,XMMWORD[rcx] xorps xmm4,xmm4 lea rax,[16+r8] cmp edx,256 je NEAR $L$14rounds cmp edx,128 jne NEAR $L$bad_keybits $L$10rounds: mov edx,9 movups XMMWORD[r8],xmm0 DB 102,15,58,223,200,1 call $L$key_expansion_128_cold DB 102,15,58,223,200,2 call $L$key_expansion_128 DB 102,15,58,223,200,4 call $L$key_expansion_128 DB 102,15,58,223,200,8 call $L$key_expansion_128 DB 102,15,58,223,200,16 call $L$key_expansion_128 DB 102,15,58,223,200,32 call $L$key_expansion_128 DB 102,15,58,223,200,64 call $L$key_expansion_128 DB 102,15,58,223,200,128 call $L$key_expansion_128 DB 102,15,58,223,200,27 call $L$key_expansion_128 DB 102,15,58,223,200,54 call $L$key_expansion_128 movups XMMWORD[rax],xmm0 mov DWORD[80+rax],edx xor eax,eax jmp NEAR $L$enc_key_ret ALIGN 16 $L$14rounds: movups xmm2,XMMWORD[16+rcx] mov edx,13 lea rax,[16+rax] movups XMMWORD[r8],xmm0 movups XMMWORD[16+r8],xmm2 DB 102,15,58,223,202,1 call $L$key_expansion_256a_cold DB 102,15,58,223,200,1 call $L$key_expansion_256b DB 102,15,58,223,202,2 call $L$key_expansion_256a DB 102,15,58,223,200,2 call $L$key_expansion_256b DB 102,15,58,223,202,4 call $L$key_expansion_256a DB 102,15,58,223,200,4 call $L$key_expansion_256b DB 102,15,58,223,202,8 call $L$key_expansion_256a DB 102,15,58,223,200,8 call $L$key_expansion_256b DB 102,15,58,223,202,16 call $L$key_expansion_256a DB 102,15,58,223,200,16 call $L$key_expansion_256b DB 102,15,58,223,202,32 call $L$key_expansion_256a DB 102,15,58,223,200,32 call 
$L$key_expansion_256b DB 102,15,58,223,202,64 call $L$key_expansion_256a movups XMMWORD[rax],xmm0 mov DWORD[16+rax],edx xor rax,rax jmp NEAR $L$enc_key_ret ALIGN 16 $L$bad_keybits: mov rax,-2 $L$enc_key_ret: pxor xmm0,xmm0 pxor xmm1,xmm1 pxor xmm2,xmm2 pxor xmm3,xmm3 pxor xmm4,xmm4 pxor xmm5,xmm5 add rsp,8 ret $L$SEH_end_aes_hw_set_encrypt_key_base_4: ALIGN 16 $L$key_expansion_128: movups XMMWORD[rax],xmm0 lea rax,[16+rax] $L$key_expansion_128_cold: shufps xmm4,xmm0,16 xorps xmm0,xmm4 shufps xmm4,xmm0,140 xorps xmm0,xmm4 shufps xmm1,xmm1,255 xorps xmm0,xmm1 ret ALIGN 16 $L$key_expansion_256a: movups XMMWORD[rax],xmm2 lea rax,[16+rax] $L$key_expansion_256a_cold: shufps xmm4,xmm0,16 xorps xmm0,xmm4 shufps xmm4,xmm0,140 xorps xmm0,xmm4 shufps xmm1,xmm1,255 xorps xmm0,xmm1 ret ALIGN 16 $L$key_expansion_256b: movups XMMWORD[rax],xmm0 lea rax,[16+rax] shufps xmm4,xmm2,16 xorps xmm2,xmm4 shufps xmm4,xmm2,140 xorps xmm2,xmm4 shufps xmm1,xmm1,170 xorps xmm2,xmm1 ret global aes_hw_set_encrypt_key_alt ALIGN 16 aes_hw_set_encrypt_key_alt: $L$SEH_begin_aes_hw_set_encrypt_key_alt_1: _CET_ENDBR %ifdef BORINGSSL_DISPATCH_TEST mov BYTE[((BORINGSSL_function_hit+3))],1 %endif sub rsp,8 $L$SEH_prologue_aes_hw_set_encrypt_key_alt_2: $L$SEH_endprologue_aes_hw_set_encrypt_key_alt_3: movups xmm0,XMMWORD[rcx] xorps xmm4,xmm4 lea rax,[16+r8] cmp edx,256 je NEAR $L$14rounds_alt cmp edx,128 jne NEAR $L$bad_keybits_alt mov edx,9 movdqa xmm5,XMMWORD[$L$key_rotate] mov r10d,8 movdqa xmm4,XMMWORD[$L$key_rcon1] movdqa xmm2,xmm0 movdqu XMMWORD[r8],xmm0 jmp NEAR $L$oop_key128 ALIGN 16 $L$oop_key128: DB 102,15,56,0,197 DB 102,15,56,221,196 pslld xmm4,1 lea rax,[16+rax] movdqa xmm3,xmm2 pslldq xmm2,4 pxor xmm3,xmm2 pslldq xmm2,4 pxor xmm3,xmm2 pslldq xmm2,4 pxor xmm2,xmm3 pxor xmm0,xmm2 movdqu XMMWORD[(-16)+rax],xmm0 movdqa xmm2,xmm0 dec r10d jnz NEAR $L$oop_key128 movdqa xmm4,XMMWORD[$L$key_rcon1b] DB 102,15,56,0,197 DB 102,15,56,221,196 pslld xmm4,1 movdqa xmm3,xmm2 pslldq xmm2,4 pxor xmm3,xmm2 pslldq xmm2,4 pxor xmm3,xmm2 pslldq xmm2,4 pxor xmm2,xmm3 pxor xmm0,xmm2 movdqu XMMWORD[rax],xmm0 movdqa xmm2,xmm0 DB 102,15,56,0,197 DB 102,15,56,221,196 movdqa xmm3,xmm2 pslldq xmm2,4 pxor xmm3,xmm2 pslldq xmm2,4 pxor xmm3,xmm2 pslldq xmm2,4 pxor xmm2,xmm3 pxor xmm0,xmm2 movdqu XMMWORD[16+rax],xmm0 mov DWORD[96+rax],edx xor eax,eax jmp NEAR $L$enc_key_ret_alt ALIGN 16 $L$14rounds_alt: movups xmm2,XMMWORD[16+rcx] mov edx,13 lea rax,[16+rax] movdqa xmm5,XMMWORD[$L$key_rotate] movdqa xmm4,XMMWORD[$L$key_rcon1] mov r10d,7 movdqu XMMWORD[r8],xmm0 movdqa xmm1,xmm2 movdqu XMMWORD[16+r8],xmm2 jmp NEAR $L$oop_key256 ALIGN 16 $L$oop_key256: DB 102,15,56,0,213 DB 102,15,56,221,212 movdqa xmm3,xmm0 pslldq xmm0,4 pxor xmm3,xmm0 pslldq xmm0,4 pxor xmm3,xmm0 pslldq xmm0,4 pxor xmm0,xmm3 pslld xmm4,1 pxor xmm0,xmm2 movdqu XMMWORD[rax],xmm0 dec r10d jz NEAR $L$done_key256 pshufd xmm2,xmm0,0xff pxor xmm3,xmm3 DB 102,15,56,221,211 movdqa xmm3,xmm1 pslldq xmm1,4 pxor xmm3,xmm1 pslldq xmm1,4 pxor xmm3,xmm1 pslldq xmm1,4 pxor xmm1,xmm3 pxor xmm2,xmm1 movdqu XMMWORD[16+rax],xmm2 lea rax,[32+rax] movdqa xmm1,xmm2 jmp NEAR $L$oop_key256 $L$done_key256: mov DWORD[16+rax],edx xor eax,eax jmp NEAR $L$enc_key_ret_alt ALIGN 16 $L$bad_keybits_alt: mov rax,-2 $L$enc_key_ret_alt: pxor xmm0,xmm0 pxor xmm1,xmm1 pxor xmm2,xmm2 pxor xmm3,xmm3 pxor xmm4,xmm4 pxor xmm5,xmm5 add rsp,8 ret $L$SEH_end_aes_hw_set_encrypt_key_alt_4: section .rdata rdata align=8 ALIGN 64 $L$bswap_mask: DB 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 $L$increment32: DD 6,6,6,0 $L$increment64: DD 
1,0,0,0 $L$increment1: DB 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 $L$key_rotate: DD 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d $L$key_rotate192: DD 0x04070605,0x04070605,0x04070605,0x04070605 $L$key_rcon1: DD 1,1,1,1 $L$key_rcon1b: DD 0x1b,0x1b,0x1b,0x1b DB 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 DB 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 DB 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 DB 115,108,46,111,114,103,62,0 ALIGN 64 section .text EXTERN __imp_RtlVirtualUnwind ALIGN 16 ctr_xts_se_handler: push rsi push rdi push rbx push rbp push r12 push r13 push r14 push r15 pushfq sub rsp,64 mov rax,QWORD[120+r8] mov rbx,QWORD[248+r8] mov rsi,QWORD[8+r9] mov r11,QWORD[56+r9] mov r10d,DWORD[r11] lea r10,[r10*1+rsi] cmp rbx,r10 jb NEAR $L$common_seh_tail mov rax,QWORD[152+r8] mov r10d,DWORD[4+r11] lea r10,[r10*1+rsi] cmp rbx,r10 jae NEAR $L$common_seh_tail mov rax,QWORD[208+r8] lea rsi,[((-168))+rax] lea rdi,[512+r8] mov ecx,20 DD 0xa548f3fc mov rbp,QWORD[((-8))+rax] mov QWORD[160+r8],rbp $L$common_seh_tail: mov rdi,QWORD[8+rax] mov rsi,QWORD[16+rax] mov QWORD[152+r8],rax mov QWORD[168+r8],rsi mov QWORD[176+r8],rdi mov rdi,QWORD[40+r9] mov rsi,r8 mov ecx,154 DD 0xa548f3fc mov rsi,r9 xor rcx,rcx mov rdx,QWORD[8+rsi] mov r8,QWORD[rsi] mov r9,QWORD[16+rsi] mov r10,QWORD[40+rsi] lea r11,[56+rsi] lea r12,[24+rsi] mov QWORD[32+rsp],r10 mov QWORD[40+rsp],r11 mov QWORD[48+rsp],r12 mov QWORD[56+rsp],rcx call QWORD[__imp_RtlVirtualUnwind] mov eax,1 add rsp,64 popfq pop r15 pop r14 pop r13 pop r12 pop rbp pop rbx pop rdi pop rsi ret section .pdata rdata align=4 ALIGN 4 DD $L$SEH_begin_aes_hw_ctr32_encrypt_blocks wrt ..imagebase DD $L$SEH_end_aes_hw_ctr32_encrypt_blocks wrt ..imagebase DD $L$SEH_info_ctr32 wrt ..imagebase section .xdata rdata align=8 ALIGN 8 $L$SEH_info_ctr32: DB 9,0,0,0 DD ctr_xts_se_handler wrt ..imagebase DD $L$ctr32_body wrt ..imagebase,$L$ctr32_epilogue wrt ..imagebase section .pdata ALIGN 4 DD $L$SEH_begin_aes_hw_set_encrypt_key_base_1 wrt ..imagebase DD $L$SEH_end_aes_hw_set_encrypt_key_base_4 wrt ..imagebase DD $L$SEH_info_aes_hw_set_encrypt_key_base_0 wrt ..imagebase DD $L$SEH_begin_aes_hw_set_encrypt_key_alt_1 wrt ..imagebase DD $L$SEH_end_aes_hw_set_encrypt_key_alt_4 wrt ..imagebase DD $L$SEH_info_aes_hw_set_encrypt_key_alt_0 wrt ..imagebase section .xdata ALIGN 4 $L$SEH_info_aes_hw_set_encrypt_key_base_0: DB 1 DB $L$SEH_endprologue_aes_hw_set_encrypt_key_base_3-$L$SEH_begin_aes_hw_set_encrypt_key_base_1 DB 1 DB 0 DB $L$SEH_prologue_aes_hw_set_encrypt_key_base_2-$L$SEH_begin_aes_hw_set_encrypt_key_base_1 DB 2 DW 0 $L$SEH_info_aes_hw_set_encrypt_key_alt_0: DB 1 DB $L$SEH_endprologue_aes_hw_set_encrypt_key_alt_3-$L$SEH_begin_aes_hw_set_encrypt_key_alt_1 DB 1 DB 0 DB $L$SEH_prologue_aes_hw_set_encrypt_key_alt_2-$L$SEH_begin_aes_hw_set_encrypt_key_alt_1 DB 2 DW 0 %else ; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 ret %endif ring-0.17.14/pregenerated/aesni-x86_64-nasm.o000064400000000000000000000452551046102023000165640ustar 00000000000000dg~?P.debug$S|&'@B.debug$TR,,@B.text,= p`.rdata>>@p@.pdata$>> @0@.xdata @?`?@@@6C:\Users\b\p\ring\pregenerated\aesni-x86_64-nasm.asm7JYOŐ:@4   %*. 
[aesni-x86_64-nasm.o: binary COFF object assembled from aesni-x86_64-nasm.asm with NASM 2.13.03; debug line tables and symbol records (aes_hw_ctr32_encrypt_blocks, aes_hw_set_encrypt_key_base, aes_hw_set_encrypt_key_alt and their local labels) — binary contents not representable as text]
o$@ s$@ $A $A $B $B $C $C $D $D %E %E 2%F 6%F M%G Q%G h%H l%H %I %I %J %J %K %K %L %L %M %M &N &N L&O P&O IWWA HL HHf8f8 H f8f8Df8f8f8f8ÐIWWWA HL HHf8f8f8 H f8f8f8Df8f8f8f8f8f8ÐIWWWWA HL HHf8f8f8f8 H f8f8f8f8Df8f8f8f8f8f8f8f8ÐIWfff8HL Hf8fff8fHf8f8f8f8f8f8 H f8f8f8f8f8f8Df8f8f8f8f8f8f8f8f8f8f8f8ÐIWWfffHL Hf8ffDf8fDHf8f8f8f8f8f8fD8fD8 H f8f8f8f8f8f8fD8fD8Df8f8f8f8f8f8fD8fD8f8f8f8f8f8f8fD8fD8ÐH|$Ht$HHHLLLD$(HXAIHI Wf8 HIf8ffWfWL$UH HA)XA)hE)xE)KE)SE)[E)cE)kE)sE){AooE@ fЋi f$AfofofofT$@fT$PfT$`IfT$pI@IP11f:"I@f\$f:"LMPfd$ 1Af:"A1fl$0MHDT$LAMPA1ADL$\A1MHDT$lAA1DL$|Ifot$@fo|$PH?HHAfDoD$`f8EfDoL$pf8AAf8A1f8DL$ MHf8f8fD8fD8IAf8f8A1ff8f8DL$MHf8f8fD8fD8AAf8f8A1ff8f8DL$,MHf8f8fD8fD8IAf8f8A1ff8f8DL$m d       +  .filegC:\Users\b\p\ring\.debug$S|&.debug$TR.text.rdata.pdata$ .xdata .absolut+ 7`GSc opP` m4AN+[ izH        @ m   @ G d p  w $ 9 S h     '60CQcu 0@P`pe->g__imp_RtlVirtualUnwind_aesni_encrypt2L$enc_loop2_aesni_encrypt3L$enc_loop3_aesni_encrypt4L$enc_loop4_aesni_encrypt6L$enc_loop6L$enc_loop6_enter_aesni_encrypt8L$enc_loop8L$enc_loop8_innerL$enc_loop8_enterring_core_0_17_14__aes_hw_ctr32_encrypt_blocksL$SEH_begin_aes_hw_ctr32_encrypt_blocksL$oop_enc1_1L$ctr32_bulkL$ctr32_bodyL$ctr32_loop8L$ctr32_enc_doneL$ctr32_tailL$ctr32_loop4L$ctr32_loop3L$ctr32_doneL$ctr32_epilogueL$SEH_end_aes_hw_ctr32_encrypt_blocksring_core_0_17_14__aes_hw_set_encrypt_key_baseL$SEH_begin_aes_hw_set_encrypt_key_base_1L$SEH_prologue_aes_hw_set_encrypt_key_base_2L$SEH_endprologue_aes_hw_set_encrypt_key_base_3L$10roundsL$14roundsL$bad_keybitsL$enc_key_retL$SEH_end_aes_hw_set_encrypt_key_base_4L$key_expansion_128L$key_expansion_128_coldL$key_expansion_256aL$key_expansion_256a_coldL$key_expansion_256bring_core_0_17_14__aes_hw_set_encrypt_key_altL$SEH_begin_aes_hw_set_encrypt_key_alt_1L$SEH_prologue_aes_hw_set_encrypt_key_alt_2L$SEH_endprologue_aes_hw_set_encrypt_key_alt_3L$oop_key128L$14rounds_altL$oop_key256L$done_key256L$bad_keybits_altL$enc_key_ret_altL$SEH_end_aes_hw_set_encrypt_key_alt_4L$bswap_maskL$increment32L$increment64L$increment1L$key_rotateL$key_rotate192L$key_rcon1L$key_rcon1bctr_xts_se_handlerL$common_seh_tailL$SEH_info_ctr32L$SEH_info_aes_hw_set_encrypt_key_base_0L$SEH_info_aes_hw_set_encrypt_key_alt_0ring-0.17.14/pregenerated/aesv8-armx-ios64.S000064400000000000000000000167421046102023000164750ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) #if __ARM_MAX_ARCH__>=7 .text .section __TEXT,__const .align 5 Lrcon: .long 0x01,0x01,0x01,0x01 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat .long 0x1b,0x1b,0x1b,0x1b .text .globl _aes_hw_set_encrypt_key .private_extern _aes_hw_set_encrypt_key .align 5 _aes_hw_set_encrypt_key: Lenc_key: // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. AARCH64_VALID_CALL_TARGET stp x29,x30,[sp,#-16]! add x29,sp,#0 mov x3,#-2 cmp w1,#128 b.lt Lenc_key_abort cmp w1,#256 b.gt Lenc_key_abort tst w1,#0x3f b.ne Lenc_key_abort adrp x3,Lrcon@PAGE add x3,x3,Lrcon@PAGEOFF cmp w1,#192 eor v0.16b,v0.16b,v0.16b ld1 {v3.16b},[x0],#16 mov w1,#8 // reuse w1 ld1 {v1.4s,v2.4s},[x3],#32 b.lt Loop128 // 192-bit key support was removed. 
b L256 .align 4 Loop128: tbl v6.16b,{v3.16b},v2.16b ext v5.16b,v0.16b,v3.16b,#12 st1 {v3.4s},[x2],#16 aese v6.16b,v0.16b subs w1,w1,#1 eor v3.16b,v3.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 eor v3.16b,v3.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 eor v6.16b,v6.16b,v1.16b eor v3.16b,v3.16b,v5.16b shl v1.16b,v1.16b,#1 eor v3.16b,v3.16b,v6.16b b.ne Loop128 ld1 {v1.4s},[x3] tbl v6.16b,{v3.16b},v2.16b ext v5.16b,v0.16b,v3.16b,#12 st1 {v3.4s},[x2],#16 aese v6.16b,v0.16b eor v3.16b,v3.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 eor v3.16b,v3.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 eor v6.16b,v6.16b,v1.16b eor v3.16b,v3.16b,v5.16b shl v1.16b,v1.16b,#1 eor v3.16b,v3.16b,v6.16b tbl v6.16b,{v3.16b},v2.16b ext v5.16b,v0.16b,v3.16b,#12 st1 {v3.4s},[x2],#16 aese v6.16b,v0.16b eor v3.16b,v3.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 eor v3.16b,v3.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 eor v6.16b,v6.16b,v1.16b eor v3.16b,v3.16b,v5.16b eor v3.16b,v3.16b,v6.16b st1 {v3.4s},[x2] add x2,x2,#0x50 mov w12,#10 b Ldone // 192-bit key support was removed. .align 4 L256: ld1 {v4.16b},[x0] mov w1,#7 mov w12,#14 st1 {v3.4s},[x2],#16 Loop256: tbl v6.16b,{v4.16b},v2.16b ext v5.16b,v0.16b,v3.16b,#12 st1 {v4.4s},[x2],#16 aese v6.16b,v0.16b subs w1,w1,#1 eor v3.16b,v3.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 eor v3.16b,v3.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 eor v6.16b,v6.16b,v1.16b eor v3.16b,v3.16b,v5.16b shl v1.16b,v1.16b,#1 eor v3.16b,v3.16b,v6.16b st1 {v3.4s},[x2],#16 b.eq Ldone dup v6.4s,v3.s[3] // just splat ext v5.16b,v0.16b,v4.16b,#12 aese v6.16b,v0.16b eor v4.16b,v4.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 eor v4.16b,v4.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 eor v4.16b,v4.16b,v5.16b eor v4.16b,v4.16b,v6.16b b Loop256 Ldone: str w12,[x2] mov x3,#0 Lenc_key_abort: mov x0,x3 // return value ldr x29,[sp],#16 ret .globl _aes_hw_ctr32_encrypt_blocks .private_extern _aes_hw_ctr32_encrypt_blocks .align 5 _aes_hw_ctr32_encrypt_blocks: // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. AARCH64_VALID_CALL_TARGET stp x29,x30,[sp,#-16]! add x29,sp,#0 ldr w5,[x3,#240] ldr w8, [x4, #12] ld1 {v0.4s},[x4] ld1 {v16.4s,v17.4s},[x3] // load key schedule... sub w5,w5,#4 mov x12,#16 cmp x2,#2 add x7,x3,x5,lsl#4 // pointer to last 5 round keys sub w5,w5,#2 ld1 {v20.4s,v21.4s},[x7],#32 ld1 {v22.4s,v23.4s},[x7],#32 ld1 {v7.4s},[x7] add x7,x3,#32 mov w6,w5 csel x12,xzr,x12,lo // ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are // affected by silicon errata #1742098 [0] and #1655431 [1], // respectively, where the second instruction of an aese/aesmc // instruction pair may execute twice if an interrupt is taken right // after the first instruction consumes an input register of which a // single 32-bit lane has been updated the last time it was modified. // // This function uses a counter in one 32-bit lane. The vmov lines // could write to v1.16b and v18.16b directly, but that trips this bugs. // We write to v6.16b and copy to the final register as a workaround. 
// // [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice // [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice #ifndef __AARCH64EB__ rev w8, w8 #endif add w10, w8, #1 orr v6.16b,v0.16b,v0.16b rev w10, w10 mov v6.s[3],w10 add w8, w8, #2 orr v1.16b,v6.16b,v6.16b b.ls Lctr32_tail rev w12, w8 mov v6.s[3],w12 sub x2,x2,#3 // bias orr v18.16b,v6.16b,v6.16b b Loop3x_ctr32 .align 4 Loop3x_ctr32: aese v0.16b,v16.16b aesmc v0.16b,v0.16b aese v1.16b,v16.16b aesmc v1.16b,v1.16b aese v18.16b,v16.16b aesmc v18.16b,v18.16b ld1 {v16.4s},[x7],#16 subs w6,w6,#2 aese v0.16b,v17.16b aesmc v0.16b,v0.16b aese v1.16b,v17.16b aesmc v1.16b,v1.16b aese v18.16b,v17.16b aesmc v18.16b,v18.16b ld1 {v17.4s},[x7],#16 b.gt Loop3x_ctr32 aese v0.16b,v16.16b aesmc v4.16b,v0.16b aese v1.16b,v16.16b aesmc v5.16b,v1.16b ld1 {v2.16b},[x0],#16 add w9,w8,#1 aese v18.16b,v16.16b aesmc v18.16b,v18.16b ld1 {v3.16b},[x0],#16 rev w9,w9 aese v4.16b,v17.16b aesmc v4.16b,v4.16b aese v5.16b,v17.16b aesmc v5.16b,v5.16b ld1 {v19.16b},[x0],#16 mov x7,x3 aese v18.16b,v17.16b aesmc v17.16b,v18.16b aese v4.16b,v20.16b aesmc v4.16b,v4.16b aese v5.16b,v20.16b aesmc v5.16b,v5.16b eor v2.16b,v2.16b,v7.16b add w10,w8,#2 aese v17.16b,v20.16b aesmc v17.16b,v17.16b eor v3.16b,v3.16b,v7.16b add w8,w8,#3 aese v4.16b,v21.16b aesmc v4.16b,v4.16b aese v5.16b,v21.16b aesmc v5.16b,v5.16b // Note the logic to update v0.16b, v1.16b, and v1.16b is written to work // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in // 32-bit mode. See the comment above. eor v19.16b,v19.16b,v7.16b mov v6.s[3], w9 aese v17.16b,v21.16b aesmc v17.16b,v17.16b orr v0.16b,v6.16b,v6.16b rev w10,w10 aese v4.16b,v22.16b aesmc v4.16b,v4.16b mov v6.s[3], w10 rev w12,w8 aese v5.16b,v22.16b aesmc v5.16b,v5.16b orr v1.16b,v6.16b,v6.16b mov v6.s[3], w12 aese v17.16b,v22.16b aesmc v17.16b,v17.16b orr v18.16b,v6.16b,v6.16b subs x2,x2,#3 aese v4.16b,v23.16b aese v5.16b,v23.16b aese v17.16b,v23.16b eor v2.16b,v2.16b,v4.16b ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] st1 {v2.16b},[x1],#16 eor v3.16b,v3.16b,v5.16b mov w6,w5 st1 {v3.16b},[x1],#16 eor v19.16b,v19.16b,v17.16b ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] st1 {v19.16b},[x1],#16 b.hs Loop3x_ctr32 adds x2,x2,#3 b.eq Lctr32_done cmp x2,#1 mov x12,#16 csel x12,xzr,x12,eq Lctr32_tail: aese v0.16b,v16.16b aesmc v0.16b,v0.16b aese v1.16b,v16.16b aesmc v1.16b,v1.16b ld1 {v16.4s},[x7],#16 subs w6,w6,#2 aese v0.16b,v17.16b aesmc v0.16b,v0.16b aese v1.16b,v17.16b aesmc v1.16b,v1.16b ld1 {v17.4s},[x7],#16 b.gt Lctr32_tail aese v0.16b,v16.16b aesmc v0.16b,v0.16b aese v1.16b,v16.16b aesmc v1.16b,v1.16b aese v0.16b,v17.16b aesmc v0.16b,v0.16b aese v1.16b,v17.16b aesmc v1.16b,v1.16b ld1 {v2.16b},[x0],x12 aese v0.16b,v20.16b aesmc v0.16b,v0.16b aese v1.16b,v20.16b aesmc v1.16b,v1.16b ld1 {v3.16b},[x0] aese v0.16b,v21.16b aesmc v0.16b,v0.16b aese v1.16b,v21.16b aesmc v1.16b,v1.16b eor v2.16b,v2.16b,v7.16b aese v0.16b,v22.16b aesmc v0.16b,v0.16b aese v1.16b,v22.16b aesmc v1.16b,v1.16b eor v3.16b,v3.16b,v7.16b aese v0.16b,v23.16b aese v1.16b,v23.16b cmp x2,#1 eor v2.16b,v2.16b,v0.16b eor v3.16b,v3.16b,v1.16b st1 {v2.16b},[x1],#16 b.eq Lctr32_done st1 {v3.16b},[x1] Lctr32_done: ldr x29,[sp],#16 ret #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) ring-0.17.14/pregenerated/aesv8-armx-linux64.S000064400000000000000000000172571046102023000170440ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. 
Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) #if __ARM_MAX_ARCH__>=7 .text .arch armv8-a+crypto .section .rodata .align 5 .Lrcon: .long 0x01,0x01,0x01,0x01 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat .long 0x1b,0x1b,0x1b,0x1b .text .globl aes_hw_set_encrypt_key .hidden aes_hw_set_encrypt_key .type aes_hw_set_encrypt_key,%function .align 5 aes_hw_set_encrypt_key: .Lenc_key: // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. AARCH64_VALID_CALL_TARGET stp x29,x30,[sp,#-16]! add x29,sp,#0 mov x3,#-2 cmp w1,#128 b.lt .Lenc_key_abort cmp w1,#256 b.gt .Lenc_key_abort tst w1,#0x3f b.ne .Lenc_key_abort adrp x3,.Lrcon add x3,x3,:lo12:.Lrcon cmp w1,#192 eor v0.16b,v0.16b,v0.16b ld1 {v3.16b},[x0],#16 mov w1,#8 // reuse w1 ld1 {v1.4s,v2.4s},[x3],#32 b.lt .Loop128 // 192-bit key support was removed. b .L256 .align 4 .Loop128: tbl v6.16b,{v3.16b},v2.16b ext v5.16b,v0.16b,v3.16b,#12 st1 {v3.4s},[x2],#16 aese v6.16b,v0.16b subs w1,w1,#1 eor v3.16b,v3.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 eor v3.16b,v3.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 eor v6.16b,v6.16b,v1.16b eor v3.16b,v3.16b,v5.16b shl v1.16b,v1.16b,#1 eor v3.16b,v3.16b,v6.16b b.ne .Loop128 ld1 {v1.4s},[x3] tbl v6.16b,{v3.16b},v2.16b ext v5.16b,v0.16b,v3.16b,#12 st1 {v3.4s},[x2],#16 aese v6.16b,v0.16b eor v3.16b,v3.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 eor v3.16b,v3.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 eor v6.16b,v6.16b,v1.16b eor v3.16b,v3.16b,v5.16b shl v1.16b,v1.16b,#1 eor v3.16b,v3.16b,v6.16b tbl v6.16b,{v3.16b},v2.16b ext v5.16b,v0.16b,v3.16b,#12 st1 {v3.4s},[x2],#16 aese v6.16b,v0.16b eor v3.16b,v3.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 eor v3.16b,v3.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 eor v6.16b,v6.16b,v1.16b eor v3.16b,v3.16b,v5.16b eor v3.16b,v3.16b,v6.16b st1 {v3.4s},[x2] add x2,x2,#0x50 mov w12,#10 b .Ldone // 192-bit key support was removed. .align 4 .L256: ld1 {v4.16b},[x0] mov w1,#7 mov w12,#14 st1 {v3.4s},[x2],#16 .Loop256: tbl v6.16b,{v4.16b},v2.16b ext v5.16b,v0.16b,v3.16b,#12 st1 {v4.4s},[x2],#16 aese v6.16b,v0.16b subs w1,w1,#1 eor v3.16b,v3.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 eor v3.16b,v3.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 eor v6.16b,v6.16b,v1.16b eor v3.16b,v3.16b,v5.16b shl v1.16b,v1.16b,#1 eor v3.16b,v3.16b,v6.16b st1 {v3.4s},[x2],#16 b.eq .Ldone dup v6.4s,v3.s[3] // just splat ext v5.16b,v0.16b,v4.16b,#12 aese v6.16b,v0.16b eor v4.16b,v4.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 eor v4.16b,v4.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 eor v4.16b,v4.16b,v5.16b eor v4.16b,v4.16b,v6.16b b .Loop256 .Ldone: str w12,[x2] mov x3,#0 .Lenc_key_abort: mov x0,x3 // return value ldr x29,[sp],#16 ret .size aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key .globl aes_hw_ctr32_encrypt_blocks .hidden aes_hw_ctr32_encrypt_blocks .type aes_hw_ctr32_encrypt_blocks,%function .align 5 aes_hw_ctr32_encrypt_blocks: // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. AARCH64_VALID_CALL_TARGET stp x29,x30,[sp,#-16]! add x29,sp,#0 ldr w5,[x3,#240] ldr w8, [x4, #12] ld1 {v0.4s},[x4] ld1 {v16.4s,v17.4s},[x3] // load key schedule... 
sub w5,w5,#4 mov x12,#16 cmp x2,#2 add x7,x3,x5,lsl#4 // pointer to last 5 round keys sub w5,w5,#2 ld1 {v20.4s,v21.4s},[x7],#32 ld1 {v22.4s,v23.4s},[x7],#32 ld1 {v7.4s},[x7] add x7,x3,#32 mov w6,w5 csel x12,xzr,x12,lo // ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are // affected by silicon errata #1742098 [0] and #1655431 [1], // respectively, where the second instruction of an aese/aesmc // instruction pair may execute twice if an interrupt is taken right // after the first instruction consumes an input register of which a // single 32-bit lane has been updated the last time it was modified. // // This function uses a counter in one 32-bit lane. The vmov lines // could write to v1.16b and v18.16b directly, but that trips this bugs. // We write to v6.16b and copy to the final register as a workaround. // // [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice // [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice #ifndef __AARCH64EB__ rev w8, w8 #endif add w10, w8, #1 orr v6.16b,v0.16b,v0.16b rev w10, w10 mov v6.s[3],w10 add w8, w8, #2 orr v1.16b,v6.16b,v6.16b b.ls .Lctr32_tail rev w12, w8 mov v6.s[3],w12 sub x2,x2,#3 // bias orr v18.16b,v6.16b,v6.16b b .Loop3x_ctr32 .align 4 .Loop3x_ctr32: aese v0.16b,v16.16b aesmc v0.16b,v0.16b aese v1.16b,v16.16b aesmc v1.16b,v1.16b aese v18.16b,v16.16b aesmc v18.16b,v18.16b ld1 {v16.4s},[x7],#16 subs w6,w6,#2 aese v0.16b,v17.16b aesmc v0.16b,v0.16b aese v1.16b,v17.16b aesmc v1.16b,v1.16b aese v18.16b,v17.16b aesmc v18.16b,v18.16b ld1 {v17.4s},[x7],#16 b.gt .Loop3x_ctr32 aese v0.16b,v16.16b aesmc v4.16b,v0.16b aese v1.16b,v16.16b aesmc v5.16b,v1.16b ld1 {v2.16b},[x0],#16 add w9,w8,#1 aese v18.16b,v16.16b aesmc v18.16b,v18.16b ld1 {v3.16b},[x0],#16 rev w9,w9 aese v4.16b,v17.16b aesmc v4.16b,v4.16b aese v5.16b,v17.16b aesmc v5.16b,v5.16b ld1 {v19.16b},[x0],#16 mov x7,x3 aese v18.16b,v17.16b aesmc v17.16b,v18.16b aese v4.16b,v20.16b aesmc v4.16b,v4.16b aese v5.16b,v20.16b aesmc v5.16b,v5.16b eor v2.16b,v2.16b,v7.16b add w10,w8,#2 aese v17.16b,v20.16b aesmc v17.16b,v17.16b eor v3.16b,v3.16b,v7.16b add w8,w8,#3 aese v4.16b,v21.16b aesmc v4.16b,v4.16b aese v5.16b,v21.16b aesmc v5.16b,v5.16b // Note the logic to update v0.16b, v1.16b, and v1.16b is written to work // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in // 32-bit mode. See the comment above. 
eor v19.16b,v19.16b,v7.16b mov v6.s[3], w9 aese v17.16b,v21.16b aesmc v17.16b,v17.16b orr v0.16b,v6.16b,v6.16b rev w10,w10 aese v4.16b,v22.16b aesmc v4.16b,v4.16b mov v6.s[3], w10 rev w12,w8 aese v5.16b,v22.16b aesmc v5.16b,v5.16b orr v1.16b,v6.16b,v6.16b mov v6.s[3], w12 aese v17.16b,v22.16b aesmc v17.16b,v17.16b orr v18.16b,v6.16b,v6.16b subs x2,x2,#3 aese v4.16b,v23.16b aese v5.16b,v23.16b aese v17.16b,v23.16b eor v2.16b,v2.16b,v4.16b ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] st1 {v2.16b},[x1],#16 eor v3.16b,v3.16b,v5.16b mov w6,w5 st1 {v3.16b},[x1],#16 eor v19.16b,v19.16b,v17.16b ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] st1 {v19.16b},[x1],#16 b.hs .Loop3x_ctr32 adds x2,x2,#3 b.eq .Lctr32_done cmp x2,#1 mov x12,#16 csel x12,xzr,x12,eq .Lctr32_tail: aese v0.16b,v16.16b aesmc v0.16b,v0.16b aese v1.16b,v16.16b aesmc v1.16b,v1.16b ld1 {v16.4s},[x7],#16 subs w6,w6,#2 aese v0.16b,v17.16b aesmc v0.16b,v0.16b aese v1.16b,v17.16b aesmc v1.16b,v1.16b ld1 {v17.4s},[x7],#16 b.gt .Lctr32_tail aese v0.16b,v16.16b aesmc v0.16b,v0.16b aese v1.16b,v16.16b aesmc v1.16b,v1.16b aese v0.16b,v17.16b aesmc v0.16b,v0.16b aese v1.16b,v17.16b aesmc v1.16b,v1.16b ld1 {v2.16b},[x0],x12 aese v0.16b,v20.16b aesmc v0.16b,v0.16b aese v1.16b,v20.16b aesmc v1.16b,v1.16b ld1 {v3.16b},[x0] aese v0.16b,v21.16b aesmc v0.16b,v0.16b aese v1.16b,v21.16b aesmc v1.16b,v1.16b eor v2.16b,v2.16b,v7.16b aese v0.16b,v22.16b aesmc v0.16b,v0.16b aese v1.16b,v22.16b aesmc v1.16b,v1.16b eor v3.16b,v3.16b,v7.16b aese v0.16b,v23.16b aese v1.16b,v23.16b cmp x2,#1 eor v2.16b,v2.16b,v0.16b eor v3.16b,v3.16b,v1.16b st1 {v2.16b},[x1],#16 b.eq .Lctr32_done st1 {v3.16b},[x1] .Lctr32_done: ldr x29,[sp],#16 ret .size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) ring-0.17.14/pregenerated/aesv8-armx-win64.S000064400000000000000000000167541046102023000165030ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) #if __ARM_MAX_ARCH__>=7 .text .arch armv8-a+crypto .section .rodata .align 5 Lrcon: .long 0x01,0x01,0x01,0x01 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat .long 0x1b,0x1b,0x1b,0x1b .text .globl aes_hw_set_encrypt_key .def aes_hw_set_encrypt_key .type 32 .endef .align 5 aes_hw_set_encrypt_key: Lenc_key: // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. AARCH64_VALID_CALL_TARGET stp x29,x30,[sp,#-16]! add x29,sp,#0 mov x3,#-2 cmp w1,#128 b.lt Lenc_key_abort cmp w1,#256 b.gt Lenc_key_abort tst w1,#0x3f b.ne Lenc_key_abort adrp x3,Lrcon add x3,x3,:lo12:Lrcon cmp w1,#192 eor v0.16b,v0.16b,v0.16b ld1 {v3.16b},[x0],#16 mov w1,#8 // reuse w1 ld1 {v1.4s,v2.4s},[x3],#32 b.lt Loop128 // 192-bit key support was removed. 
b L256 .align 4 Loop128: tbl v6.16b,{v3.16b},v2.16b ext v5.16b,v0.16b,v3.16b,#12 st1 {v3.4s},[x2],#16 aese v6.16b,v0.16b subs w1,w1,#1 eor v3.16b,v3.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 eor v3.16b,v3.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 eor v6.16b,v6.16b,v1.16b eor v3.16b,v3.16b,v5.16b shl v1.16b,v1.16b,#1 eor v3.16b,v3.16b,v6.16b b.ne Loop128 ld1 {v1.4s},[x3] tbl v6.16b,{v3.16b},v2.16b ext v5.16b,v0.16b,v3.16b,#12 st1 {v3.4s},[x2],#16 aese v6.16b,v0.16b eor v3.16b,v3.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 eor v3.16b,v3.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 eor v6.16b,v6.16b,v1.16b eor v3.16b,v3.16b,v5.16b shl v1.16b,v1.16b,#1 eor v3.16b,v3.16b,v6.16b tbl v6.16b,{v3.16b},v2.16b ext v5.16b,v0.16b,v3.16b,#12 st1 {v3.4s},[x2],#16 aese v6.16b,v0.16b eor v3.16b,v3.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 eor v3.16b,v3.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 eor v6.16b,v6.16b,v1.16b eor v3.16b,v3.16b,v5.16b eor v3.16b,v3.16b,v6.16b st1 {v3.4s},[x2] add x2,x2,#0x50 mov w12,#10 b Ldone // 192-bit key support was removed. .align 4 L256: ld1 {v4.16b},[x0] mov w1,#7 mov w12,#14 st1 {v3.4s},[x2],#16 Loop256: tbl v6.16b,{v4.16b},v2.16b ext v5.16b,v0.16b,v3.16b,#12 st1 {v4.4s},[x2],#16 aese v6.16b,v0.16b subs w1,w1,#1 eor v3.16b,v3.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 eor v3.16b,v3.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 eor v6.16b,v6.16b,v1.16b eor v3.16b,v3.16b,v5.16b shl v1.16b,v1.16b,#1 eor v3.16b,v3.16b,v6.16b st1 {v3.4s},[x2],#16 b.eq Ldone dup v6.4s,v3.s[3] // just splat ext v5.16b,v0.16b,v4.16b,#12 aese v6.16b,v0.16b eor v4.16b,v4.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 eor v4.16b,v4.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 eor v4.16b,v4.16b,v5.16b eor v4.16b,v4.16b,v6.16b b Loop256 Ldone: str w12,[x2] mov x3,#0 Lenc_key_abort: mov x0,x3 // return value ldr x29,[sp],#16 ret .globl aes_hw_ctr32_encrypt_blocks .def aes_hw_ctr32_encrypt_blocks .type 32 .endef .align 5 aes_hw_ctr32_encrypt_blocks: // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. AARCH64_VALID_CALL_TARGET stp x29,x30,[sp,#-16]! add x29,sp,#0 ldr w5,[x3,#240] ldr w8, [x4, #12] ld1 {v0.4s},[x4] ld1 {v16.4s,v17.4s},[x3] // load key schedule... sub w5,w5,#4 mov x12,#16 cmp x2,#2 add x7,x3,x5,lsl#4 // pointer to last 5 round keys sub w5,w5,#2 ld1 {v20.4s,v21.4s},[x7],#32 ld1 {v22.4s,v23.4s},[x7],#32 ld1 {v7.4s},[x7] add x7,x3,#32 mov w6,w5 csel x12,xzr,x12,lo // ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are // affected by silicon errata #1742098 [0] and #1655431 [1], // respectively, where the second instruction of an aese/aesmc // instruction pair may execute twice if an interrupt is taken right // after the first instruction consumes an input register of which a // single 32-bit lane has been updated the last time it was modified. // // This function uses a counter in one 32-bit lane. The vmov lines // could write to v1.16b and v18.16b directly, but that trips this bugs. // We write to v6.16b and copy to the final register as a workaround. 
// // [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice // [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice #ifndef __AARCH64EB__ rev w8, w8 #endif add w10, w8, #1 orr v6.16b,v0.16b,v0.16b rev w10, w10 mov v6.s[3],w10 add w8, w8, #2 orr v1.16b,v6.16b,v6.16b b.ls Lctr32_tail rev w12, w8 mov v6.s[3],w12 sub x2,x2,#3 // bias orr v18.16b,v6.16b,v6.16b b Loop3x_ctr32 .align 4 Loop3x_ctr32: aese v0.16b,v16.16b aesmc v0.16b,v0.16b aese v1.16b,v16.16b aesmc v1.16b,v1.16b aese v18.16b,v16.16b aesmc v18.16b,v18.16b ld1 {v16.4s},[x7],#16 subs w6,w6,#2 aese v0.16b,v17.16b aesmc v0.16b,v0.16b aese v1.16b,v17.16b aesmc v1.16b,v1.16b aese v18.16b,v17.16b aesmc v18.16b,v18.16b ld1 {v17.4s},[x7],#16 b.gt Loop3x_ctr32 aese v0.16b,v16.16b aesmc v4.16b,v0.16b aese v1.16b,v16.16b aesmc v5.16b,v1.16b ld1 {v2.16b},[x0],#16 add w9,w8,#1 aese v18.16b,v16.16b aesmc v18.16b,v18.16b ld1 {v3.16b},[x0],#16 rev w9,w9 aese v4.16b,v17.16b aesmc v4.16b,v4.16b aese v5.16b,v17.16b aesmc v5.16b,v5.16b ld1 {v19.16b},[x0],#16 mov x7,x3 aese v18.16b,v17.16b aesmc v17.16b,v18.16b aese v4.16b,v20.16b aesmc v4.16b,v4.16b aese v5.16b,v20.16b aesmc v5.16b,v5.16b eor v2.16b,v2.16b,v7.16b add w10,w8,#2 aese v17.16b,v20.16b aesmc v17.16b,v17.16b eor v3.16b,v3.16b,v7.16b add w8,w8,#3 aese v4.16b,v21.16b aesmc v4.16b,v4.16b aese v5.16b,v21.16b aesmc v5.16b,v5.16b // Note the logic to update v0.16b, v1.16b, and v1.16b is written to work // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in // 32-bit mode. See the comment above. eor v19.16b,v19.16b,v7.16b mov v6.s[3], w9 aese v17.16b,v21.16b aesmc v17.16b,v17.16b orr v0.16b,v6.16b,v6.16b rev w10,w10 aese v4.16b,v22.16b aesmc v4.16b,v4.16b mov v6.s[3], w10 rev w12,w8 aese v5.16b,v22.16b aesmc v5.16b,v5.16b orr v1.16b,v6.16b,v6.16b mov v6.s[3], w12 aese v17.16b,v22.16b aesmc v17.16b,v17.16b orr v18.16b,v6.16b,v6.16b subs x2,x2,#3 aese v4.16b,v23.16b aese v5.16b,v23.16b aese v17.16b,v23.16b eor v2.16b,v2.16b,v4.16b ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] st1 {v2.16b},[x1],#16 eor v3.16b,v3.16b,v5.16b mov w6,w5 st1 {v3.16b},[x1],#16 eor v19.16b,v19.16b,v17.16b ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] st1 {v19.16b},[x1],#16 b.hs Loop3x_ctr32 adds x2,x2,#3 b.eq Lctr32_done cmp x2,#1 mov x12,#16 csel x12,xzr,x12,eq Lctr32_tail: aese v0.16b,v16.16b aesmc v0.16b,v0.16b aese v1.16b,v16.16b aesmc v1.16b,v1.16b ld1 {v16.4s},[x7],#16 subs w6,w6,#2 aese v0.16b,v17.16b aesmc v0.16b,v0.16b aese v1.16b,v17.16b aesmc v1.16b,v1.16b ld1 {v17.4s},[x7],#16 b.gt Lctr32_tail aese v0.16b,v16.16b aesmc v0.16b,v0.16b aese v1.16b,v16.16b aesmc v1.16b,v1.16b aese v0.16b,v17.16b aesmc v0.16b,v0.16b aese v1.16b,v17.16b aesmc v1.16b,v1.16b ld1 {v2.16b},[x0],x12 aese v0.16b,v20.16b aesmc v0.16b,v0.16b aese v1.16b,v20.16b aesmc v1.16b,v1.16b ld1 {v3.16b},[x0] aese v0.16b,v21.16b aesmc v0.16b,v0.16b aese v1.16b,v21.16b aesmc v1.16b,v1.16b eor v2.16b,v2.16b,v7.16b aese v0.16b,v22.16b aesmc v0.16b,v0.16b aese v1.16b,v22.16b aesmc v1.16b,v1.16b eor v3.16b,v3.16b,v7.16b aese v0.16b,v23.16b aese v1.16b,v23.16b cmp x2,#1 eor v2.16b,v2.16b,v0.16b eor v3.16b,v3.16b,v1.16b st1 {v2.16b},[x1],#16 b.eq Lctr32_done st1 {v3.16b},[x1] Lctr32_done: ldr x29,[sp],#16 ret #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) ring-0.17.14/pregenerated/aesv8-gcm-armv8-ios64.S000064400000000000000000002403501046102023000173210ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. 
Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) #if __ARM_MAX_ARCH__ >= 8 .text .globl _aes_gcm_enc_kernel .private_extern _aes_gcm_enc_kernel .align 4 _aes_gcm_enc_kernel: AARCH64_SIGN_LINK_REGISTER stp x29, x30, [sp, #-128]! mov x29, sp stp x19, x20, [sp, #16] mov x16, x4 mov x8, x5 stp x21, x22, [sp, #32] stp x23, x24, [sp, #48] stp d8, d9, [sp, #64] stp d10, d11, [sp, #80] stp d12, d13, [sp, #96] stp d14, d15, [sp, #112] ldr w17, [x8, #240] add x19, x8, x17, lsl #4 // borrow input_l1 for last key ldp x13, x14, [x19] // load round N keys ldr q31, [x19, #-16] // load round N-1 keys add x4, x0, x1, lsr #3 // end_input_ptr lsr x5, x1, #3 // byte_len mov x15, x5 ldp x10, x11, [x16] // ctr96_b64, ctr96_t32 ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible sub x5, x5, #1 // byte_len - 1 ldr q18, [x8, #0] // load rk0 and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) ldr q25, [x8, #112] // load rk7 add x5, x5, x0 lsr x12, x11, #32 fmov d2, x10 // CTR block 2 orr w11, w11, w11 rev w12, w12 // rev_ctr32 fmov d1, x10 // CTR block 1 aese v0.16b, v18.16b aesmc v0.16b, v0.16b // AES block 0 - round 0 add w12, w12, #1 // increment rev_ctr32 rev w9, w12 // CTR block 1 fmov d3, x10 // CTR block 3 orr x9, x11, x9, lsl #32 // CTR block 1 add w12, w12, #1 // CTR block 1 ldr q19, [x8, #16] // load rk1 fmov v1.d[1], x9 // CTR block 1 rev w9, w12 // CTR block 2 add w12, w12, #1 // CTR block 2 orr x9, x11, x9, lsl #32 // CTR block 2 ldr q20, [x8, #32] // load rk2 fmov v2.d[1], x9 // CTR block 2 rev w9, w12 // CTR block 3 aese v0.16b, v19.16b aesmc v0.16b, v0.16b // AES block 0 - round 1 orr x9, x11, x9, lsl #32 // CTR block 3 fmov v3.d[1], x9 // CTR block 3 aese v1.16b, v18.16b aesmc v1.16b, v1.16b // AES block 1 - round 0 ldr q21, [x8, #48] // load rk3 aese v0.16b, v20.16b aesmc v0.16b, v0.16b // AES block 0 - round 2 ldr q24, [x8, #96] // load rk6 aese v2.16b, v18.16b aesmc v2.16b, v2.16b // AES block 2 - round 0 ldr q23, [x8, #80] // load rk5 aese v1.16b, v19.16b aesmc v1.16b, v1.16b // AES block 1 - round 1 ldr q14, [x6, #48] // load h3l | h3h ext v14.16b, v14.16b, v14.16b, #8 aese v3.16b, v18.16b aesmc v3.16b, v3.16b // AES block 3 - round 0 aese v2.16b, v19.16b aesmc v2.16b, v2.16b // AES block 2 - round 1 ldr q22, [x8, #64] // load rk4 aese v1.16b, v20.16b aesmc v1.16b, v1.16b // AES block 1 - round 2 ldr q13, [x6, #32] // load h2l | h2h ext v13.16b, v13.16b, v13.16b, #8 aese v3.16b, v19.16b aesmc v3.16b, v3.16b // AES block 3 - round 1 ldr q30, [x8, #192] // load rk12 aese v2.16b, v20.16b aesmc v2.16b, v2.16b // AES block 2 - round 2 ldr q15, [x6, #80] // load h4l | h4h ext v15.16b, v15.16b, v15.16b, #8 aese v1.16b, v21.16b aesmc v1.16b, v1.16b // AES block 1 - round 3 ldr q29, [x8, #176] // load rk11 aese v3.16b, v20.16b aesmc v3.16b, v3.16b // AES block 3 - round 2 ldr q26, [x8, #128] // load rk8 aese v2.16b, v21.16b aesmc v2.16b, v2.16b // AES block 2 - round 3 add w12, w12, #1 // CTR block 3 aese v0.16b, v21.16b aesmc v0.16b, v0.16b // AES block 0 - round 3 aese v3.16b, v21.16b aesmc v3.16b, v3.16b // AES block 3 - round 3 ld1 { v11.16b}, [x3] ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b aese v2.16b, v22.16b aesmc v2.16b, v2.16b // AES block 2 - round 4 aese v0.16b, v22.16b aesmc v0.16b, v0.16b // AES block 0 - round 4 aese v1.16b, v22.16b aesmc v1.16b, v1.16b // AES block 1 - round 4 
aese v3.16b, v22.16b aesmc v3.16b, v3.16b // AES block 3 - round 4 cmp x17, #12 // setup flags for AES-128/192/256 check aese v0.16b, v23.16b aesmc v0.16b, v0.16b // AES block 0 - round 5 aese v1.16b, v23.16b aesmc v1.16b, v1.16b // AES block 1 - round 5 aese v3.16b, v23.16b aesmc v3.16b, v3.16b // AES block 3 - round 5 aese v2.16b, v23.16b aesmc v2.16b, v2.16b // AES block 2 - round 5 aese v1.16b, v24.16b aesmc v1.16b, v1.16b // AES block 1 - round 6 trn2 v17.2d, v14.2d, v15.2d // h4l | h3l aese v3.16b, v24.16b aesmc v3.16b, v3.16b // AES block 3 - round 6 ldr q27, [x8, #144] // load rk9 aese v0.16b, v24.16b aesmc v0.16b, v0.16b // AES block 0 - round 6 ldr q12, [x6] // load h1l | h1h ext v12.16b, v12.16b, v12.16b, #8 aese v2.16b, v24.16b aesmc v2.16b, v2.16b // AES block 2 - round 6 ldr q28, [x8, #160] // load rk10 aese v1.16b, v25.16b aesmc v1.16b, v1.16b // AES block 1 - round 7 trn1 v9.2d, v14.2d, v15.2d // h4h | h3h aese v0.16b, v25.16b aesmc v0.16b, v0.16b // AES block 0 - round 7 aese v2.16b, v25.16b aesmc v2.16b, v2.16b // AES block 2 - round 7 aese v3.16b, v25.16b aesmc v3.16b, v3.16b // AES block 3 - round 7 trn2 v16.2d, v12.2d, v13.2d // h2l | h1l aese v1.16b, v26.16b aesmc v1.16b, v1.16b // AES block 1 - round 8 aese v2.16b, v26.16b aesmc v2.16b, v2.16b // AES block 2 - round 8 aese v3.16b, v26.16b aesmc v3.16b, v3.16b // AES block 3 - round 8 aese v0.16b, v26.16b aesmc v0.16b, v0.16b // AES block 0 - round 8 b.lt Lenc_finish_first_blocks // branch if AES-128 aese v1.16b, v27.16b aesmc v1.16b, v1.16b // AES block 1 - round 9 aese v2.16b, v27.16b aesmc v2.16b, v2.16b // AES block 2 - round 9 aese v3.16b, v27.16b aesmc v3.16b, v3.16b // AES block 3 - round 9 aese v0.16b, v27.16b aesmc v0.16b, v0.16b // AES block 0 - round 9 aese v1.16b, v28.16b aesmc v1.16b, v1.16b // AES block 1 - round 10 aese v2.16b, v28.16b aesmc v2.16b, v2.16b // AES block 2 - round 10 aese v3.16b, v28.16b aesmc v3.16b, v3.16b // AES block 3 - round 10 aese v0.16b, v28.16b aesmc v0.16b, v0.16b // AES block 0 - round 10 b.eq Lenc_finish_first_blocks // branch if AES-192 aese v1.16b, v29.16b aesmc v1.16b, v1.16b // AES block 1 - round 11 aese v2.16b, v29.16b aesmc v2.16b, v2.16b // AES block 2 - round 11 aese v0.16b, v29.16b aesmc v0.16b, v0.16b // AES block 0 - round 11 aese v3.16b, v29.16b aesmc v3.16b, v3.16b // AES block 3 - round 11 aese v1.16b, v30.16b aesmc v1.16b, v1.16b // AES block 1 - round 12 aese v2.16b, v30.16b aesmc v2.16b, v2.16b // AES block 2 - round 12 aese v0.16b, v30.16b aesmc v0.16b, v0.16b // AES block 0 - round 12 aese v3.16b, v30.16b aesmc v3.16b, v3.16b // AES block 3 - round 12 Lenc_finish_first_blocks: cmp x0, x5 // check if we have <= 4 blocks eor v17.16b, v17.16b, v9.16b // h4k | h3k aese v2.16b, v31.16b // AES block 2 - round N-1 trn1 v8.2d, v12.2d, v13.2d // h2h | h1h aese v1.16b, v31.16b // AES block 1 - round N-1 aese v0.16b, v31.16b // AES block 0 - round N-1 aese v3.16b, v31.16b // AES block 3 - round N-1 eor v16.16b, v16.16b, v8.16b // h2k | h1k b.ge Lenc_tail // handle tail ldp x19, x20, [x0, #16] // AES block 1 - load plaintext rev w9, w12 // CTR block 4 ldp x6, x7, [x0, #0] // AES block 0 - load plaintext ldp x23, x24, [x0, #48] // AES block 3 - load plaintext ldp x21, x22, [x0, #32] // AES block 2 - load plaintext add x0, x0, #64 // AES input_ptr update eor x19, x19, x13 // AES block 1 - round N low eor x20, x20, x14 // AES block 1 - round N high fmov d5, x19 // AES block 1 - mov low eor x6, x6, x13 // AES block 0 - round N low eor x7, x7, x14 // AES block 0 - round N 
high eor x24, x24, x14 // AES block 3 - round N high fmov d4, x6 // AES block 0 - mov low cmp x0, x5 // check if we have <= 8 blocks fmov v4.d[1], x7 // AES block 0 - mov high eor x23, x23, x13 // AES block 3 - round N low eor x21, x21, x13 // AES block 2 - round N low fmov v5.d[1], x20 // AES block 1 - mov high fmov d6, x21 // AES block 2 - mov low add w12, w12, #1 // CTR block 4 orr x9, x11, x9, lsl #32 // CTR block 4 fmov d7, x23 // AES block 3 - mov low eor x22, x22, x14 // AES block 2 - round N high fmov v6.d[1], x22 // AES block 2 - mov high eor v4.16b, v4.16b, v0.16b // AES block 0 - result fmov d0, x10 // CTR block 4 fmov v0.d[1], x9 // CTR block 4 rev w9, w12 // CTR block 5 add w12, w12, #1 // CTR block 5 eor v5.16b, v5.16b, v1.16b // AES block 1 - result fmov d1, x10 // CTR block 5 orr x9, x11, x9, lsl #32 // CTR block 5 fmov v1.d[1], x9 // CTR block 5 rev w9, w12 // CTR block 6 st1 { v4.16b}, [x2], #16 // AES block 0 - store result fmov v7.d[1], x24 // AES block 3 - mov high orr x9, x11, x9, lsl #32 // CTR block 6 eor v6.16b, v6.16b, v2.16b // AES block 2 - result st1 { v5.16b}, [x2], #16 // AES block 1 - store result add w12, w12, #1 // CTR block 6 fmov d2, x10 // CTR block 6 fmov v2.d[1], x9 // CTR block 6 st1 { v6.16b}, [x2], #16 // AES block 2 - store result rev w9, w12 // CTR block 7 orr x9, x11, x9, lsl #32 // CTR block 7 eor v7.16b, v7.16b, v3.16b // AES block 3 - result st1 { v7.16b}, [x2], #16 // AES block 3 - store result b.ge Lenc_prepretail // do prepretail Lenc_main_loop: // main loop start aese v0.16b, v18.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 rev64 v4.16b, v4.16b // GHASH block 4k (only t0 is free) aese v1.16b, v18.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 fmov d3, x10 // CTR block 4k+3 aese v2.16b, v18.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 aese v0.16b, v19.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 fmov v3.d[1], x9 // CTR block 4k+3 aese v1.16b, v19.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 ldp x23, x24, [x0, #48] // AES block 4k+7 - load plaintext aese v2.16b, v19.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 ldp x21, x22, [x0, #32] // AES block 4k+6 - load plaintext aese v0.16b, v20.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 eor v4.16b, v4.16b, v11.16b // PRE 1 aese v1.16b, v20.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 aese v3.16b, v18.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 eor x23, x23, x13 // AES block 4k+7 - round N low aese v0.16b, v21.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 mov d10, v17.d[1] // GHASH block 4k - mid pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high eor x22, x22, x14 // AES block 4k+6 - round N high mov d8, v4.d[1] // GHASH block 4k - mid aese v3.16b, v19.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 rev64 v5.16b, v5.16b // GHASH block 4k+1 (t0 and t1 free) aese v0.16b, v22.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid aese v2.16b, v20.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 aese v0.16b, v23.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 rev64 v7.16b, v7.16b // GHASH block 4k+3 (t0, t1, t2 and t3 free) pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid rev64 v6.16b, v6.16b // GHASH block 4k+2 (t0, t1, and t2 free) pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low eor v9.16b, v9.16b, v4.16b // 
GHASH block 4k+1 - high mov d4, v5.d[1] // GHASH block 4k+1 - mid aese v1.16b, v21.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 aese v3.16b, v20.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low aese v2.16b, v21.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 aese v1.16b, v22.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 mov d8, v6.d[1] // GHASH block 4k+2 - mid aese v3.16b, v21.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid aese v2.16b, v22.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 aese v0.16b, v24.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid aese v3.16b, v22.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid aese v0.16b, v25.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 aese v3.16b, v23.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid aese v1.16b, v23.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 aese v0.16b, v26.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 aese v2.16b, v23.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 aese v1.16b, v24.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low aese v1.16b, v25.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 pmull v6.1q, v7.1d, v12.1d // GHASH block 4k+3 - low eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high aese v3.16b, v24.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 ldp x19, x20, [x0, #16] // AES block 4k+5 - load plaintext aese v1.16b, v26.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 mov d4, v7.d[1] // GHASH block 4k+3 - mid aese v2.16b, v24.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high eor v4.8b, v4.8b, v7.8b // GHASH block 4k+3 - mid aese v2.16b, v25.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 eor x19, x19, x13 // AES block 4k+5 - round N low aese v2.16b, v26.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid aese v3.16b, v25.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 eor x21, x21, x13 // AES block 4k+6 - round N low aese v3.16b, v26.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 movi v8.8b, #0xc2 pmull v4.1q, v4.1d, v16.1d // GHASH block 4k+3 - mid eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high cmp x17, #12 // setup flags for AES-128/192/256 check fmov d5, x19 // AES block 4k+5 - mov low ldp x6, x7, [x0, #0] // AES block 4k+4 - load plaintext b.lt Lenc_main_loop_continue // branch if AES-128 aese v1.16b, v27.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 aese v0.16b, v27.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 aese v2.16b, v27.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 aese v3.16b, v27.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 aese v0.16b, v28.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 aese v1.16b, v28.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 aese v2.16b, v28.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 aese v3.16b, v28.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 b.eq 
Lenc_main_loop_continue // branch if AES-192 aese v0.16b, v29.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 aese v1.16b, v29.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 aese v2.16b, v29.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 aese v3.16b, v29.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 aese v1.16b, v30.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 aese v0.16b, v30.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 aese v2.16b, v30.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 aese v3.16b, v30.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 Lenc_main_loop_continue: shl d8, d8, #56 // mod_constant eor v11.16b, v11.16b, v6.16b // GHASH block 4k+3 - low eor v10.16b, v10.16b, v4.16b // GHASH block 4k+3 - mid add w12, w12, #1 // CTR block 4k+3 eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up add x0, x0, #64 // AES input_ptr update pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid rev w9, w12 // CTR block 4k+8 ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment eor x6, x6, x13 // AES block 4k+4 - round N low eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up eor x7, x7, x14 // AES block 4k+4 - round N high fmov d4, x6 // AES block 4k+4 - mov low orr x9, x11, x9, lsl #32 // CTR block 4k+8 eor v7.16b, v9.16b, v7.16b // MODULO - fold into mid eor x20, x20, x14 // AES block 4k+5 - round N high eor x24, x24, x14 // AES block 4k+7 - round N high add w12, w12, #1 // CTR block 4k+8 aese v0.16b, v31.16b // AES block 4k+4 - round N-1 fmov v4.d[1], x7 // AES block 4k+4 - mov high eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid fmov d7, x23 // AES block 4k+7 - mov low aese v1.16b, v31.16b // AES block 4k+5 - round N-1 fmov v5.d[1], x20 // AES block 4k+5 - mov high fmov d6, x21 // AES block 4k+6 - mov low cmp x0, x5 // LOOP CONTROL fmov v6.d[1], x22 // AES block 4k+6 - mov high pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low eor v4.16b, v4.16b, v0.16b // AES block 4k+4 - result fmov d0, x10 // CTR block 4k+8 fmov v0.d[1], x9 // CTR block 4k+8 rev w9, w12 // CTR block 4k+9 add w12, w12, #1 // CTR block 4k+9 eor v5.16b, v5.16b, v1.16b // AES block 4k+5 - result fmov d1, x10 // CTR block 4k+9 orr x9, x11, x9, lsl #32 // CTR block 4k+9 fmov v1.d[1], x9 // CTR block 4k+9 aese v2.16b, v31.16b // AES block 4k+6 - round N-1 rev w9, w12 // CTR block 4k+10 st1 { v4.16b}, [x2], #16 // AES block 4k+4 - store result orr x9, x11, x9, lsl #32 // CTR block 4k+10 eor v11.16b, v11.16b, v9.16b // MODULO - fold into low fmov v7.d[1], x24 // AES block 4k+7 - mov high ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment st1 { v5.16b}, [x2], #16 // AES block 4k+5 - store result add w12, w12, #1 // CTR block 4k+10 aese v3.16b, v31.16b // AES block 4k+7 - round N-1 eor v6.16b, v6.16b, v2.16b // AES block 4k+6 - result fmov d2, x10 // CTR block 4k+10 st1 { v6.16b}, [x2], #16 // AES block 4k+6 - store result fmov v2.d[1], x9 // CTR block 4k+10 rev w9, w12 // CTR block 4k+11 eor v11.16b, v11.16b, v10.16b // MODULO - fold into low orr x9, x11, x9, lsl #32 // CTR block 4k+11 eor v7.16b, v7.16b, v3.16b // AES block 4k+7 - result st1 { v7.16b}, [x2], #16 // AES block 4k+7 - store result b.lt Lenc_main_loop Lenc_prepretail: // PREPRETAIL aese v1.16b, v18.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 rev64 v6.16b, v6.16b // GHASH block 4k+2 (t0, t1, and t2 free) aese v2.16b, v18.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 fmov d3, x10 // CTR block 4k+3 aese v0.16b, v18.16b aesmc v0.16b, 
v0.16b // AES block 4k+4 - round 0 rev64 v4.16b, v4.16b // GHASH block 4k (only t0 is free) fmov v3.d[1], x9 // CTR block 4k+3 ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 aese v2.16b, v19.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 aese v0.16b, v19.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 eor v4.16b, v4.16b, v11.16b // PRE 1 rev64 v5.16b, v5.16b // GHASH block 4k+1 (t0 and t1 free) aese v2.16b, v20.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 aese v3.16b, v18.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 mov d10, v17.d[1] // GHASH block 4k - mid aese v1.16b, v19.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low mov d8, v4.d[1] // GHASH block 4k - mid pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high aese v2.16b, v21.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 aese v1.16b, v20.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid aese v0.16b, v20.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 aese v3.16b, v19.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 aese v1.16b, v21.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low aese v3.16b, v20.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high mov d4, v5.d[1] // GHASH block 4k+1 - mid aese v0.16b, v21.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low aese v3.16b, v21.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid mov d8, v6.d[1] // GHASH block 4k+2 - mid aese v0.16b, v22.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 rev64 v7.16b, v7.16b // GHASH block 4k+3 (t0, t1, t2 and t3 free) aese v3.16b, v22.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid add w12, w12, #1 // CTR block 4k+3 pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low aese v3.16b, v23.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 aese v2.16b, v22.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid aese v2.16b, v23.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high mov d4, v7.d[1] // GHASH block 4k+3 - mid aese v1.16b, v22.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid eor v4.8b, v4.8b, v7.8b // GHASH block 4k+3 - mid pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high aese v1.16b, v23.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 pmull v4.1q, v4.1d, v16.1d // GHASH block 4k+3 - mid eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid aese v0.16b, v23.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 aese v1.16b, v24.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 aese v2.16b, v24.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 aese v0.16b, v24.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 movi v8.8b, #0xc2 aese v3.16b, v24.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 aese v1.16b, v25.16b aesmc v1.16b, v1.16b // AES 
block 4k+5 - round 7 eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high aese v0.16b, v25.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 aese v3.16b, v25.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 shl d8, d8, #56 // mod_constant aese v1.16b, v26.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 eor v10.16b, v10.16b, v4.16b // GHASH block 4k+3 - mid pmull v6.1q, v7.1d, v12.1d // GHASH block 4k+3 - low aese v3.16b, v26.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 cmp x17, #12 // setup flags for AES-128/192/256 check aese v0.16b, v26.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 eor v11.16b, v11.16b, v6.16b // GHASH block 4k+3 - low aese v2.16b, v25.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 eor v10.16b, v10.16b, v9.16b // karatsuba tidy up aese v2.16b, v26.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 pmull v4.1q, v9.1d, v8.1d ext v9.16b, v9.16b, v9.16b, #8 eor v10.16b, v10.16b, v11.16b b.lt Lenc_finish_prepretail // branch if AES-128 aese v1.16b, v27.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 aese v3.16b, v27.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 aese v0.16b, v27.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 aese v2.16b, v27.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 aese v3.16b, v28.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 aese v1.16b, v28.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 aese v0.16b, v28.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 aese v2.16b, v28.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 b.eq Lenc_finish_prepretail // branch if AES-192 aese v1.16b, v29.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 aese v0.16b, v29.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 aese v3.16b, v29.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 aese v2.16b, v29.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 aese v1.16b, v30.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 aese v0.16b, v30.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 aese v3.16b, v30.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 aese v2.16b, v30.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 Lenc_finish_prepretail: eor v10.16b, v10.16b, v4.16b eor v10.16b, v10.16b, v9.16b pmull v4.1q, v10.1d, v8.1d ext v10.16b, v10.16b, v10.16b, #8 aese v1.16b, v31.16b // AES block 4k+5 - round N-1 eor v11.16b, v11.16b, v4.16b aese v3.16b, v31.16b // AES block 4k+7 - round N-1 aese v0.16b, v31.16b // AES block 4k+4 - round N-1 aese v2.16b, v31.16b // AES block 4k+6 - round N-1 eor v11.16b, v11.16b, v10.16b Lenc_tail: // TAIL ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process ldp x6, x7, [x0], #16 // AES block 4k+4 - load plaintext eor x6, x6, x13 // AES block 4k+4 - round N low eor x7, x7, x14 // AES block 4k+4 - round N high cmp x5, #48 fmov d4, x6 // AES block 4k+4 - mov low fmov v4.d[1], x7 // AES block 4k+4 - mov high eor v5.16b, v4.16b, v0.16b // AES block 4k+4 - result b.gt Lenc_blocks_more_than_3 cmp x5, #32 mov v3.16b, v2.16b movi v11.8b, #0 movi v9.8b, #0 sub w12, w12, #1 mov v2.16b, v1.16b movi v10.8b, #0 b.gt Lenc_blocks_more_than_2 mov v3.16b, v1.16b sub w12, w12, #1 cmp x5, #16 b.gt Lenc_blocks_more_than_1 sub w12, w12, #1 b Lenc_blocks_less_than_1 Lenc_blocks_more_than_3: // blocks left > 3 st1 { v5.16b}, [x2], #16 // AES final-3 block - store result ldp x6, x7, [x0], #16 // AES final-2 block - load input low & high rev64 v4.16b, v5.16b // GHASH final-3 block 
eor x6, x6, x13 // AES final-2 block - round N low eor v4.16b, v4.16b, v8.16b // feed in partial tag eor x7, x7, x14 // AES final-2 block - round N high mov d22, v4.d[1] // GHASH final-3 block - mid fmov d5, x6 // AES final-2 block - mov low fmov v5.d[1], x7 // AES final-2 block - mov high eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid movi v8.8b, #0 // suppress further partial tag feed in mov d10, v17.d[1] // GHASH final-3 block - mid pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid eor v5.16b, v5.16b, v1.16b // AES final-2 block - result Lenc_blocks_more_than_2: // blocks left > 2 st1 { v5.16b}, [x2], #16 // AES final-2 block - store result ldp x6, x7, [x0], #16 // AES final-1 block - load input low & high rev64 v4.16b, v5.16b // GHASH final-2 block eor x6, x6, x13 // AES final-1 block - round N low eor v4.16b, v4.16b, v8.16b // feed in partial tag fmov d5, x6 // AES final-1 block - mov low eor x7, x7, x14 // AES final-1 block - round N high fmov v5.d[1], x7 // AES final-1 block - mov high movi v8.8b, #0 // suppress further partial tag feed in pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high mov d22, v4.d[1] // GHASH final-2 block - mid pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid eor v5.16b, v5.16b, v2.16b // AES final-1 block - result eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid Lenc_blocks_more_than_1: // blocks left > 1 st1 { v5.16b}, [x2], #16 // AES final-1 block - store result rev64 v4.16b, v5.16b // GHASH final-1 block ldp x6, x7, [x0], #16 // AES final block - load input low & high eor v4.16b, v4.16b, v8.16b // feed in partial tag movi v8.8b, #0 // suppress further partial tag feed in eor x6, x6, x13 // AES final block - round N low mov d22, v4.d[1] // GHASH final-1 block - mid pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high eor x7, x7, x14 // AES final block - round N high eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high ins v22.d[1], v22.d[0] // GHASH final-1 block - mid fmov d5, x6 // AES final block - mov low fmov v5.d[1], x7 // AES final block - mov high pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low eor v5.16b, v5.16b, v3.16b // AES final block - result eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low Lenc_blocks_less_than_1: // blocks left <= 1 and x1, x1, #127 // bit_length %= 128 mvn x13, xzr // rkN_l = 0xffffffffffffffff sub x1, x1, #128 // bit_length -= 128 neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) ld1 { v18.16b}, [x2] // load existing bytes where the possibly partial last block is to be stored mvn x14, xzr // rkN_h = 0xffffffffffffffff and x1, x1, #127 // bit_length %= 128 lsr x14, x14, x1 // rkN_h is mask for top 64b of last block cmp x1, #64 csel x6, x13, x14, lt csel x7, x14, xzr, lt fmov d0, x6 // ctr0b is mask for last block fmov v0.d[1], x7 and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits rev64 v4.16b, v5.16b // GHASH final block eor v4.16b, v4.16b, v8.16b // feed in partial tag bif v5.16b, v18.16b, 
v0.16b // insert existing bytes in top end of result before storing pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high mov d8, v4.d[1] // GHASH final block - mid rev w9, w12 pmull v21.1q, v4.1d, v12.1d // GHASH final block - low eor v9.16b, v9.16b, v20.16b // GHASH final block - high eor v8.8b, v8.8b, v4.8b // GHASH final block - mid pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid eor v11.16b, v11.16b, v21.16b // GHASH final block - low eor v10.16b, v10.16b, v8.16b // GHASH final block - mid movi v8.8b, #0xc2 eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up shl d8, d8, #56 // mod_constant eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment str w9, [x16, #12] // store the updated counter st1 { v5.16b}, [x2] // store all 16B eor v11.16b, v11.16b, v9.16b // MODULO - fold into low eor v11.16b, v11.16b, v10.16b // MODULO - fold into low ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b mov x0, x15 st1 { v11.16b }, [x3] ldp x19, x20, [sp, #16] ldp x21, x22, [sp, #32] ldp x23, x24, [sp, #48] ldp d8, d9, [sp, #64] ldp d10, d11, [sp, #80] ldp d12, d13, [sp, #96] ldp d14, d15, [sp, #112] ldp x29, x30, [sp], #128 AARCH64_VALIDATE_LINK_REGISTER ret .globl _aes_gcm_dec_kernel .private_extern _aes_gcm_dec_kernel .align 4 _aes_gcm_dec_kernel: AARCH64_SIGN_LINK_REGISTER stp x29, x30, [sp, #-128]! mov x29, sp stp x19, x20, [sp, #16] mov x16, x4 mov x8, x5 stp x21, x22, [sp, #32] stp x23, x24, [sp, #48] stp d8, d9, [sp, #64] stp d10, d11, [sp, #80] stp d12, d13, [sp, #96] stp d14, d15, [sp, #112] ldr w17, [x8, #240] add x19, x8, x17, lsl #4 // borrow input_l1 for last key ldp x13, x14, [x19] // load round N keys ldr q31, [x19, #-16] // load round N-1 keys lsr x5, x1, #3 // byte_len mov x15, x5 ldp x10, x11, [x16] // ctr96_b64, ctr96_t32 ldr q26, [x8, #128] // load rk8 sub x5, x5, #1 // byte_len - 1 ldr q25, [x8, #112] // load rk7 and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) add x4, x0, x1, lsr #3 // end_input_ptr ldr q24, [x8, #96] // load rk6 lsr x12, x11, #32 ldr q23, [x8, #80] // load rk5 orr w11, w11, w11 ldr q21, [x8, #48] // load rk3 add x5, x5, x0 rev w12, w12 // rev_ctr32 add w12, w12, #1 // increment rev_ctr32 fmov d3, x10 // CTR block 3 rev w9, w12 // CTR block 1 add w12, w12, #1 // CTR block 1 fmov d1, x10 // CTR block 1 orr x9, x11, x9, lsl #32 // CTR block 1 ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible fmov v1.d[1], x9 // CTR block 1 rev w9, w12 // CTR block 2 add w12, w12, #1 // CTR block 2 fmov d2, x10 // CTR block 2 orr x9, x11, x9, lsl #32 // CTR block 2 fmov v2.d[1], x9 // CTR block 2 rev w9, w12 // CTR block 3 orr x9, x11, x9, lsl #32 // CTR block 3 ldr q18, [x8, #0] // load rk0 fmov v3.d[1], x9 // CTR block 3 add w12, w12, #1 // CTR block 3 ldr q22, [x8, #64] // load rk4 ldr q19, [x8, #16] // load rk1 aese v0.16b, v18.16b aesmc v0.16b, v0.16b // AES block 0 - round 0 ldr q14, [x6, #48] // load h3l | h3h ext v14.16b, v14.16b, v14.16b, #8 aese v3.16b, v18.16b aesmc v3.16b, v3.16b // AES block 3 - round 0 ldr q15, [x6, #80] // load h4l | h4h ext 
v15.16b, v15.16b, v15.16b, #8 aese v1.16b, v18.16b aesmc v1.16b, v1.16b // AES block 1 - round 0 ldr q13, [x6, #32] // load h2l | h2h ext v13.16b, v13.16b, v13.16b, #8 aese v2.16b, v18.16b aesmc v2.16b, v2.16b // AES block 2 - round 0 ldr q20, [x8, #32] // load rk2 aese v0.16b, v19.16b aesmc v0.16b, v0.16b // AES block 0 - round 1 aese v1.16b, v19.16b aesmc v1.16b, v1.16b // AES block 1 - round 1 ld1 { v11.16b}, [x3] ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b aese v2.16b, v19.16b aesmc v2.16b, v2.16b // AES block 2 - round 1 ldr q27, [x8, #144] // load rk9 aese v3.16b, v19.16b aesmc v3.16b, v3.16b // AES block 3 - round 1 ldr q30, [x8, #192] // load rk12 aese v0.16b, v20.16b aesmc v0.16b, v0.16b // AES block 0 - round 2 ldr q12, [x6] // load h1l | h1h ext v12.16b, v12.16b, v12.16b, #8 aese v2.16b, v20.16b aesmc v2.16b, v2.16b // AES block 2 - round 2 ldr q28, [x8, #160] // load rk10 aese v3.16b, v20.16b aesmc v3.16b, v3.16b // AES block 3 - round 2 aese v0.16b, v21.16b aesmc v0.16b, v0.16b // AES block 0 - round 3 aese v1.16b, v20.16b aesmc v1.16b, v1.16b // AES block 1 - round 2 aese v3.16b, v21.16b aesmc v3.16b, v3.16b // AES block 3 - round 3 aese v0.16b, v22.16b aesmc v0.16b, v0.16b // AES block 0 - round 4 aese v2.16b, v21.16b aesmc v2.16b, v2.16b // AES block 2 - round 3 aese v1.16b, v21.16b aesmc v1.16b, v1.16b // AES block 1 - round 3 aese v3.16b, v22.16b aesmc v3.16b, v3.16b // AES block 3 - round 4 aese v2.16b, v22.16b aesmc v2.16b, v2.16b // AES block 2 - round 4 aese v1.16b, v22.16b aesmc v1.16b, v1.16b // AES block 1 - round 4 aese v3.16b, v23.16b aesmc v3.16b, v3.16b // AES block 3 - round 5 aese v0.16b, v23.16b aesmc v0.16b, v0.16b // AES block 0 - round 5 aese v1.16b, v23.16b aesmc v1.16b, v1.16b // AES block 1 - round 5 aese v2.16b, v23.16b aesmc v2.16b, v2.16b // AES block 2 - round 5 aese v0.16b, v24.16b aesmc v0.16b, v0.16b // AES block 0 - round 6 aese v3.16b, v24.16b aesmc v3.16b, v3.16b // AES block 3 - round 6 cmp x17, #12 // setup flags for AES-128/192/256 check aese v1.16b, v24.16b aesmc v1.16b, v1.16b // AES block 1 - round 6 aese v2.16b, v24.16b aesmc v2.16b, v2.16b // AES block 2 - round 6 aese v0.16b, v25.16b aesmc v0.16b, v0.16b // AES block 0 - round 7 aese v1.16b, v25.16b aesmc v1.16b, v1.16b // AES block 1 - round 7 aese v3.16b, v25.16b aesmc v3.16b, v3.16b // AES block 3 - round 7 aese v0.16b, v26.16b aesmc v0.16b, v0.16b // AES block 0 - round 8 aese v2.16b, v25.16b aesmc v2.16b, v2.16b // AES block 2 - round 7 aese v3.16b, v26.16b aesmc v3.16b, v3.16b // AES block 3 - round 8 aese v1.16b, v26.16b aesmc v1.16b, v1.16b // AES block 1 - round 8 ldr q29, [x8, #176] // load rk11 aese v2.16b, v26.16b aesmc v2.16b, v2.16b // AES block 2 - round 8 b.lt Ldec_finish_first_blocks // branch if AES-128 aese v0.16b, v27.16b aesmc v0.16b, v0.16b // AES block 0 - round 9 aese v1.16b, v27.16b aesmc v1.16b, v1.16b // AES block 1 - round 9 aese v3.16b, v27.16b aesmc v3.16b, v3.16b // AES block 3 - round 9 aese v2.16b, v27.16b aesmc v2.16b, v2.16b // AES block 2 - round 9 aese v0.16b, v28.16b aesmc v0.16b, v0.16b // AES block 0 - round 10 aese v1.16b, v28.16b aesmc v1.16b, v1.16b // AES block 1 - round 10 aese v3.16b, v28.16b aesmc v3.16b, v3.16b // AES block 3 - round 10 aese v2.16b, v28.16b aesmc v2.16b, v2.16b // AES block 2 - round 10 b.eq Ldec_finish_first_blocks // branch if AES-192 aese v0.16b, v29.16b aesmc v0.16b, v0.16b // AES block 0 - round 11 aese v3.16b, v29.16b aesmc v3.16b, v3.16b // AES block 3 - round 11 aese v1.16b, v29.16b aesmc v1.16b, 
v1.16b // AES block 1 - round 11 aese v2.16b, v29.16b aesmc v2.16b, v2.16b // AES block 2 - round 11 aese v1.16b, v30.16b aesmc v1.16b, v1.16b // AES block 1 - round 12 aese v0.16b, v30.16b aesmc v0.16b, v0.16b // AES block 0 - round 12 aese v2.16b, v30.16b aesmc v2.16b, v2.16b // AES block 2 - round 12 aese v3.16b, v30.16b aesmc v3.16b, v3.16b // AES block 3 - round 12 Ldec_finish_first_blocks: cmp x0, x5 // check if we have <= 4 blocks trn1 v9.2d, v14.2d, v15.2d // h4h | h3h trn2 v17.2d, v14.2d, v15.2d // h4l | h3l trn1 v8.2d, v12.2d, v13.2d // h2h | h1h trn2 v16.2d, v12.2d, v13.2d // h2l | h1l eor v17.16b, v17.16b, v9.16b // h4k | h3k aese v1.16b, v31.16b // AES block 1 - round N-1 aese v2.16b, v31.16b // AES block 2 - round N-1 eor v16.16b, v16.16b, v8.16b // h2k | h1k aese v3.16b, v31.16b // AES block 3 - round N-1 aese v0.16b, v31.16b // AES block 0 - round N-1 b.ge Ldec_tail // handle tail ldr q4, [x0, #0] // AES block 0 - load ciphertext ldr q5, [x0, #16] // AES block 1 - load ciphertext rev w9, w12 // CTR block 4 eor v0.16b, v4.16b, v0.16b // AES block 0 - result eor v1.16b, v5.16b, v1.16b // AES block 1 - result rev64 v5.16b, v5.16b // GHASH block 1 ldr q7, [x0, #48] // AES block 3 - load ciphertext mov x7, v0.d[1] // AES block 0 - mov high mov x6, v0.d[0] // AES block 0 - mov low rev64 v4.16b, v4.16b // GHASH block 0 add w12, w12, #1 // CTR block 4 fmov d0, x10 // CTR block 4 orr x9, x11, x9, lsl #32 // CTR block 4 fmov v0.d[1], x9 // CTR block 4 rev w9, w12 // CTR block 5 add w12, w12, #1 // CTR block 5 mov x19, v1.d[0] // AES block 1 - mov low orr x9, x11, x9, lsl #32 // CTR block 5 mov x20, v1.d[1] // AES block 1 - mov high eor x7, x7, x14 // AES block 0 - round N high eor x6, x6, x13 // AES block 0 - round N low stp x6, x7, [x2], #16 // AES block 0 - store result fmov d1, x10 // CTR block 5 ldr q6, [x0, #32] // AES block 2 - load ciphertext add x0, x0, #64 // AES input_ptr update fmov v1.d[1], x9 // CTR block 5 rev w9, w12 // CTR block 6 add w12, w12, #1 // CTR block 6 eor x19, x19, x13 // AES block 1 - round N low orr x9, x11, x9, lsl #32 // CTR block 6 eor x20, x20, x14 // AES block 1 - round N high stp x19, x20, [x2], #16 // AES block 1 - store result eor v2.16b, v6.16b, v2.16b // AES block 2 - result cmp x0, x5 // check if we have <= 8 blocks b.ge Ldec_prepretail // do prepretail Ldec_main_loop: // main loop start mov x21, v2.d[0] // AES block 4k+2 - mov low ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result aese v0.16b, v18.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 mov x22, v2.d[1] // AES block 4k+2 - mov high aese v1.16b, v18.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 fmov d2, x10 // CTR block 4k+6 fmov v2.d[1], x9 // CTR block 4k+6 eor v4.16b, v4.16b, v11.16b // PRE 1 rev w9, w12 // CTR block 4k+7 aese v0.16b, v19.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 mov x24, v3.d[1] // AES block 4k+3 - mov high aese v1.16b, v19.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 mov x23, v3.d[0] // AES block 4k+3 - mov low pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high mov d8, v4.d[1] // GHASH block 4k - mid fmov d3, x10 // CTR block 4k+7 aese v0.16b, v20.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 orr x9, x11, x9, lsl #32 // CTR block 4k+7 aese v2.16b, v18.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 fmov v3.d[1], x9 // CTR block 4k+7 aese v1.16b, v20.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid aese v0.16b, v21.16b aesmc 
v0.16b, v0.16b // AES block 4k+4 - round 3 eor x22, x22, x14 // AES block 4k+2 - round N high aese v2.16b, v19.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 mov d10, v17.d[1] // GHASH block 4k - mid aese v1.16b, v21.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 rev64 v6.16b, v6.16b // GHASH block 4k+2 aese v3.16b, v18.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 eor x21, x21, x13 // AES block 4k+2 - round N low aese v2.16b, v20.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 stp x21, x22, [x2], #16 // AES block 4k+2 - store result pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high aese v2.16b, v21.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 rev64 v7.16b, v7.16b // GHASH block 4k+3 pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid eor x23, x23, x13 // AES block 4k+3 - round N low pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low eor x24, x24, x14 // AES block 4k+3 - round N high eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high aese v2.16b, v22.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 aese v3.16b, v19.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 mov d4, v5.d[1] // GHASH block 4k+1 - mid aese v0.16b, v22.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low aese v2.16b, v23.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 add w12, w12, #1 // CTR block 4k+7 aese v3.16b, v20.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 mov d8, v6.d[1] // GHASH block 4k+2 - mid aese v1.16b, v22.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low aese v3.16b, v21.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid aese v1.16b, v23.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 aese v0.16b, v23.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid rev w9, w12 // CTR block 4k+8 aese v1.16b, v24.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid aese v0.16b, v24.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 add w12, w12, #1 // CTR block 4k+8 aese v3.16b, v22.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 aese v1.16b, v25.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid aese v0.16b, v25.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high mov d6, v7.d[1] // GHASH block 4k+3 - mid aese v3.16b, v23.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid aese v0.16b, v26.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high aese v3.16b, v24.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low orr x9, x11, x9, lsl #32 // CTR block 4k+8 eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high cmp x17, #12 // setup flags for AES-128/192/256 check eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid aese v1.16b, v26.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 aese v2.16b, v24.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high pmull 
v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid movi v8.8b, #0xc2 aese v2.16b, v25.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low aese v3.16b, v25.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 shl d8, d8, #56 // mod_constant aese v2.16b, v26.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid aese v3.16b, v26.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 b.lt Ldec_main_loop_continue // branch if AES-128 aese v0.16b, v27.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 aese v2.16b, v27.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 aese v1.16b, v27.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 aese v3.16b, v27.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 aese v0.16b, v28.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 aese v1.16b, v28.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 aese v2.16b, v28.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 aese v3.16b, v28.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 b.eq Ldec_main_loop_continue // branch if AES-192 aese v0.16b, v29.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 aese v1.16b, v29.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 aese v2.16b, v29.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 aese v3.16b, v29.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 aese v0.16b, v30.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 aese v1.16b, v30.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 aese v2.16b, v30.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 aese v3.16b, v30.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 Ldec_main_loop_continue: pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up ldr q4, [x0, #0] // AES block 4k+4 - load ciphertext aese v0.16b, v31.16b // AES block 4k+4 - round N-1 ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up ldr q5, [x0, #16] // AES block 4k+5 - load ciphertext eor v0.16b, v4.16b, v0.16b // AES block 4k+4 - result stp x23, x24, [x2], #16 // AES block 4k+3 - store result eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid ldr q7, [x0, #48] // AES block 4k+7 - load ciphertext ldr q6, [x0, #32] // AES block 4k+6 - load ciphertext mov x7, v0.d[1] // AES block 4k+4 - mov high eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid aese v1.16b, v31.16b // AES block 4k+5 - round N-1 add x0, x0, #64 // AES input_ptr update mov x6, v0.d[0] // AES block 4k+4 - mov low fmov d0, x10 // CTR block 4k+8 fmov v0.d[1], x9 // CTR block 4k+8 pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low eor v1.16b, v5.16b, v1.16b // AES block 4k+5 - result rev w9, w12 // CTR block 4k+9 aese v2.16b, v31.16b // AES block 4k+6 - round N-1 orr x9, x11, x9, lsl #32 // CTR block 4k+9 cmp x0, x5 // LOOP CONTROL add w12, w12, #1 // CTR block 4k+9 eor x6, x6, x13 // AES block 4k+4 - round N low eor x7, x7, x14 // AES block 4k+4 - round N high mov x20, v1.d[1] // AES block 4k+5 - mov high eor v2.16b, v6.16b, v2.16b // AES block 4k+6 - result eor v11.16b, v11.16b, v8.16b // MODULO - fold into low mov x19, v1.d[0] // AES block 4k+5 - mov low fmov d1, x10 // CTR block 4k+9 ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment fmov v1.d[1], x9 // CTR block 4k+9 rev w9, w12 // CTR block 4k+10 add w12, w12, #1 // CTR block 4k+10 aese v3.16b, v31.16b // AES block 4k+7 
- round N-1 orr x9, x11, x9, lsl #32 // CTR block 4k+10 rev64 v5.16b, v5.16b // GHASH block 4k+5 eor x20, x20, x14 // AES block 4k+5 - round N high stp x6, x7, [x2], #16 // AES block 4k+4 - store result eor x19, x19, x13 // AES block 4k+5 - round N low stp x19, x20, [x2], #16 // AES block 4k+5 - store result rev64 v4.16b, v4.16b // GHASH block 4k+4 eor v11.16b, v11.16b, v10.16b // MODULO - fold into low b.lt Ldec_main_loop Ldec_prepretail: // PREPRETAIL ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 mov x21, v2.d[0] // AES block 4k+2 - mov low eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result aese v0.16b, v18.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 mov x22, v2.d[1] // AES block 4k+2 - mov high aese v1.16b, v18.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 fmov d2, x10 // CTR block 4k+6 fmov v2.d[1], x9 // CTR block 4k+6 rev w9, w12 // CTR block 4k+7 eor v4.16b, v4.16b, v11.16b // PRE 1 rev64 v6.16b, v6.16b // GHASH block 4k+2 orr x9, x11, x9, lsl #32 // CTR block 4k+7 mov x23, v3.d[0] // AES block 4k+3 - mov low aese v1.16b, v19.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 mov x24, v3.d[1] // AES block 4k+3 - mov high pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low mov d8, v4.d[1] // GHASH block 4k - mid fmov d3, x10 // CTR block 4k+7 pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high fmov v3.d[1], x9 // CTR block 4k+7 aese v2.16b, v18.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 mov d10, v17.d[1] // GHASH block 4k - mid aese v0.16b, v19.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high aese v2.16b, v19.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 rev64 v7.16b, v7.16b // GHASH block 4k+3 aese v3.16b, v18.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low aese v3.16b, v19.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 mov d4, v5.d[1] // GHASH block 4k+1 - mid aese v0.16b, v20.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 aese v1.16b, v20.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low aese v2.16b, v20.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 aese v0.16b, v21.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 mov d8, v6.d[1] // GHASH block 4k+2 - mid aese v3.16b, v20.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low aese v0.16b, v22.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 aese v3.16b, v21.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid aese v0.16b, v23.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low aese v3.16b, v22.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high aese v3.16b, v23.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid aese v2.16b, v21.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 aese v1.16b, v21.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 eor v9.16b, v9.16b, v4.16b // 
GHASH block 4k+2 - high pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low aese v2.16b, v22.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 mov d6, v7.d[1] // GHASH block 4k+3 - mid aese v1.16b, v22.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid aese v2.16b, v23.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid aese v1.16b, v23.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 aese v3.16b, v24.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid aese v2.16b, v24.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 aese v0.16b, v24.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 movi v8.8b, #0xc2 aese v1.16b, v24.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid aese v3.16b, v25.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 cmp x17, #12 // setup flags for AES-128/192/256 check eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high aese v1.16b, v25.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 aese v0.16b, v25.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid aese v3.16b, v26.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 aese v2.16b, v25.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up aese v1.16b, v26.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 aese v0.16b, v26.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 shl d8, d8, #56 // mod_constant aese v2.16b, v26.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 b.lt Ldec_finish_prepretail // branch if AES-128 aese v1.16b, v27.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 aese v2.16b, v27.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 aese v3.16b, v27.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 aese v0.16b, v27.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 aese v2.16b, v28.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 aese v3.16b, v28.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 aese v0.16b, v28.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 aese v1.16b, v28.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 b.eq Ldec_finish_prepretail // branch if AES-192 aese v2.16b, v29.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 aese v0.16b, v29.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 aese v1.16b, v29.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 aese v2.16b, v30.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 aese v3.16b, v29.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 aese v1.16b, v30.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 aese v0.16b, v30.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 aese v3.16b, v30.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 Ldec_finish_prepretail: eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid eor x22, x22, x14 // AES block 4k+2 - round N high eor x23, x23, x13 // AES block 4k+3 - round N low eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid add w12, w12, #1 // CTR block 4k+7 eor x21, x21, x13 // AES block 4k+2 - round N low pmull v8.1q, v10.1d, v8.1d // MODULO - mid 
64b align with low eor x24, x24, x14 // AES block 4k+3 - round N high stp x21, x22, [x2], #16 // AES block 4k+2 - store result ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment stp x23, x24, [x2], #16 // AES block 4k+3 - store result eor v11.16b, v11.16b, v8.16b // MODULO - fold into low aese v1.16b, v31.16b // AES block 4k+5 - round N-1 aese v0.16b, v31.16b // AES block 4k+4 - round N-1 aese v3.16b, v31.16b // AES block 4k+7 - round N-1 aese v2.16b, v31.16b // AES block 4k+6 - round N-1 eor v11.16b, v11.16b, v10.16b // MODULO - fold into low Ldec_tail: // TAIL sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process ld1 { v5.16b}, [x0], #16 // AES block 4k+4 - load ciphertext eor v0.16b, v5.16b, v0.16b // AES block 4k+4 - result mov x6, v0.d[0] // AES block 4k+4 - mov low mov x7, v0.d[1] // AES block 4k+4 - mov high ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag cmp x5, #48 eor x6, x6, x13 // AES block 4k+4 - round N low eor x7, x7, x14 // AES block 4k+4 - round N high b.gt Ldec_blocks_more_than_3 sub w12, w12, #1 mov v3.16b, v2.16b movi v10.8b, #0 movi v11.8b, #0 cmp x5, #32 movi v9.8b, #0 mov v2.16b, v1.16b b.gt Ldec_blocks_more_than_2 sub w12, w12, #1 mov v3.16b, v1.16b cmp x5, #16 b.gt Ldec_blocks_more_than_1 sub w12, w12, #1 b Ldec_blocks_less_than_1 Ldec_blocks_more_than_3: // blocks left > 3 rev64 v4.16b, v5.16b // GHASH final-3 block ld1 { v5.16b}, [x0], #16 // AES final-2 block - load ciphertext stp x6, x7, [x2], #16 // AES final-3 block - store result mov d10, v17.d[1] // GHASH final-3 block - mid eor v4.16b, v4.16b, v8.16b // feed in partial tag eor v0.16b, v5.16b, v1.16b // AES final-2 block - result mov d22, v4.d[1] // GHASH final-3 block - mid mov x6, v0.d[0] // AES final-2 block - mov low mov x7, v0.d[1] // AES final-2 block - mov high eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid movi v8.8b, #0 // suppress further partial tag feed in pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid eor x6, x6, x13 // AES final-2 block - round N low pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low eor x7, x7, x14 // AES final-2 block - round N high Ldec_blocks_more_than_2: // blocks left > 2 rev64 v4.16b, v5.16b // GHASH final-2 block ld1 { v5.16b}, [x0], #16 // AES final-1 block - load ciphertext eor v4.16b, v4.16b, v8.16b // feed in partial tag stp x6, x7, [x2], #16 // AES final-2 block - store result eor v0.16b, v5.16b, v2.16b // AES final-1 block - result mov d22, v4.d[1] // GHASH final-2 block - mid pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid mov x6, v0.d[0] // AES final-1 block - mov low mov x7, v0.d[1] // AES final-1 block - mov high eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low movi v8.8b, #0 // suppress further partial tag feed in pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high eor x6, x6, x13 // AES final-1 block - round N low eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid eor x7, x7, x14 // AES final-1 block - round N high Ldec_blocks_more_than_1: // blocks left > 1 stp x6, x7, [x2], #16 // AES final-1 block - store result rev64 v4.16b, v5.16b // GHASH final-1 block ld1 { v5.16b}, [x0], #16 // AES final block - load ciphertext eor v4.16b, v4.16b, v8.16b // feed in partial tag movi v8.8b, #0 // suppress further partial tag feed in mov 
d22, v4.d[1] // GHASH final-1 block - mid eor v0.16b, v5.16b, v3.16b // AES final block - result pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low mov x6, v0.d[0] // AES final block - mov low ins v22.d[1], v22.d[0] // GHASH final-1 block - mid mov x7, v0.d[1] // AES final block - mov high pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid eor x6, x6, x13 // AES final block - round N low eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid eor x7, x7, x14 // AES final block - round N high Ldec_blocks_less_than_1: // blocks left <= 1 and x1, x1, #127 // bit_length %= 128 mvn x14, xzr // rkN_h = 0xffffffffffffffff sub x1, x1, #128 // bit_length -= 128 mvn x13, xzr // rkN_l = 0xffffffffffffffff ldp x4, x5, [x2] // load existing bytes we need to not overwrite neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) and x1, x1, #127 // bit_length %= 128 lsr x14, x14, x1 // rkN_h is mask for top 64b of last block cmp x1, #64 csel x9, x13, x14, lt csel x10, x14, xzr, lt fmov d0, x9 // ctr0b is mask for last block and x6, x6, x9 mov v0.d[1], x10 bic x4, x4, x9 // mask out low existing bytes rev w9, w12 bic x5, x5, x10 // mask out high existing bytes orr x6, x6, x4 and x7, x7, x10 orr x7, x7, x5 and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits rev64 v4.16b, v5.16b // GHASH final block eor v4.16b, v4.16b, v8.16b // feed in partial tag pmull v21.1q, v4.1d, v12.1d // GHASH final block - low mov d8, v4.d[1] // GHASH final block - mid eor v8.8b, v8.8b, v4.8b // GHASH final block - mid pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid eor v9.16b, v9.16b, v20.16b // GHASH final block - high eor v11.16b, v11.16b, v21.16b // GHASH final block - low eor v10.16b, v10.16b, v8.16b // GHASH final block - mid movi v8.8b, #0xc2 eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up shl d8, d8, #56 // mod_constant eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment eor v11.16b, v11.16b, v8.16b // MODULO - fold into low stp x6, x7, [x2] str w9, [x16, #12] // store the updated counter eor v11.16b, v11.16b, v10.16b // MODULO - fold into low ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b mov x0, x15 st1 { v11.16b }, [x3] ldp x19, x20, [sp, #16] ldp x21, x22, [sp, #32] ldp x23, x24, [sp, #48] ldp d8, d9, [sp, #64] ldp d10, d11, [sp, #80] ldp d12, d13, [sp, #96] ldp d14, d15, [sp, #112] ldp x29, x30, [sp], #128 AARCH64_VALIDATE_LINK_REGISTER ret #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) ring-0.17.14/pregenerated/aesv8-gcm-armv8-linux64.S000064400000000000000000002406601046102023000176720ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
#include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) #if __ARM_MAX_ARCH__ >= 8 .arch armv8-a+crypto .text .globl aes_gcm_enc_kernel .hidden aes_gcm_enc_kernel .type aes_gcm_enc_kernel,%function .align 4 aes_gcm_enc_kernel: AARCH64_SIGN_LINK_REGISTER stp x29, x30, [sp, #-128]! mov x29, sp stp x19, x20, [sp, #16] mov x16, x4 mov x8, x5 stp x21, x22, [sp, #32] stp x23, x24, [sp, #48] stp d8, d9, [sp, #64] stp d10, d11, [sp, #80] stp d12, d13, [sp, #96] stp d14, d15, [sp, #112] ldr w17, [x8, #240] add x19, x8, x17, lsl #4 // borrow input_l1 for last key ldp x13, x14, [x19] // load round N keys ldr q31, [x19, #-16] // load round N-1 keys add x4, x0, x1, lsr #3 // end_input_ptr lsr x5, x1, #3 // byte_len mov x15, x5 ldp x10, x11, [x16] // ctr96_b64, ctr96_t32 ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible sub x5, x5, #1 // byte_len - 1 ldr q18, [x8, #0] // load rk0 and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) ldr q25, [x8, #112] // load rk7 add x5, x5, x0 lsr x12, x11, #32 fmov d2, x10 // CTR block 2 orr w11, w11, w11 rev w12, w12 // rev_ctr32 fmov d1, x10 // CTR block 1 aese v0.16b, v18.16b aesmc v0.16b, v0.16b // AES block 0 - round 0 add w12, w12, #1 // increment rev_ctr32 rev w9, w12 // CTR block 1 fmov d3, x10 // CTR block 3 orr x9, x11, x9, lsl #32 // CTR block 1 add w12, w12, #1 // CTR block 1 ldr q19, [x8, #16] // load rk1 fmov v1.d[1], x9 // CTR block 1 rev w9, w12 // CTR block 2 add w12, w12, #1 // CTR block 2 orr x9, x11, x9, lsl #32 // CTR block 2 ldr q20, [x8, #32] // load rk2 fmov v2.d[1], x9 // CTR block 2 rev w9, w12 // CTR block 3 aese v0.16b, v19.16b aesmc v0.16b, v0.16b // AES block 0 - round 1 orr x9, x11, x9, lsl #32 // CTR block 3 fmov v3.d[1], x9 // CTR block 3 aese v1.16b, v18.16b aesmc v1.16b, v1.16b // AES block 1 - round 0 ldr q21, [x8, #48] // load rk3 aese v0.16b, v20.16b aesmc v0.16b, v0.16b // AES block 0 - round 2 ldr q24, [x8, #96] // load rk6 aese v2.16b, v18.16b aesmc v2.16b, v2.16b // AES block 2 - round 0 ldr q23, [x8, #80] // load rk5 aese v1.16b, v19.16b aesmc v1.16b, v1.16b // AES block 1 - round 1 ldr q14, [x6, #48] // load h3l | h3h ext v14.16b, v14.16b, v14.16b, #8 aese v3.16b, v18.16b aesmc v3.16b, v3.16b // AES block 3 - round 0 aese v2.16b, v19.16b aesmc v2.16b, v2.16b // AES block 2 - round 1 ldr q22, [x8, #64] // load rk4 aese v1.16b, v20.16b aesmc v1.16b, v1.16b // AES block 1 - round 2 ldr q13, [x6, #32] // load h2l | h2h ext v13.16b, v13.16b, v13.16b, #8 aese v3.16b, v19.16b aesmc v3.16b, v3.16b // AES block 3 - round 1 ldr q30, [x8, #192] // load rk12 aese v2.16b, v20.16b aesmc v2.16b, v2.16b // AES block 2 - round 2 ldr q15, [x6, #80] // load h4l | h4h ext v15.16b, v15.16b, v15.16b, #8 aese v1.16b, v21.16b aesmc v1.16b, v1.16b // AES block 1 - round 3 ldr q29, [x8, #176] // load rk11 aese v3.16b, v20.16b aesmc v3.16b, v3.16b // AES block 3 - round 2 ldr q26, [x8, #128] // load rk8 aese v2.16b, v21.16b aesmc v2.16b, v2.16b // AES block 2 - round 3 add w12, w12, #1 // CTR block 3 aese v0.16b, v21.16b aesmc v0.16b, v0.16b // AES block 0 - round 3 aese v3.16b, v21.16b aesmc v3.16b, v3.16b // AES block 3 - round 3 ld1 { v11.16b}, [x3] ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b aese v2.16b, v22.16b aesmc v2.16b, v2.16b // AES block 2 - round 4 aese v0.16b, v22.16b aesmc v0.16b, v0.16b // AES block 0 - round 4 aese v1.16b, v22.16b aesmc v1.16b, v1.16b // 
AES block 1 - round 4 aese v3.16b, v22.16b aesmc v3.16b, v3.16b // AES block 3 - round 4 cmp x17, #12 // setup flags for AES-128/192/256 check aese v0.16b, v23.16b aesmc v0.16b, v0.16b // AES block 0 - round 5 aese v1.16b, v23.16b aesmc v1.16b, v1.16b // AES block 1 - round 5 aese v3.16b, v23.16b aesmc v3.16b, v3.16b // AES block 3 - round 5 aese v2.16b, v23.16b aesmc v2.16b, v2.16b // AES block 2 - round 5 aese v1.16b, v24.16b aesmc v1.16b, v1.16b // AES block 1 - round 6 trn2 v17.2d, v14.2d, v15.2d // h4l | h3l aese v3.16b, v24.16b aesmc v3.16b, v3.16b // AES block 3 - round 6 ldr q27, [x8, #144] // load rk9 aese v0.16b, v24.16b aesmc v0.16b, v0.16b // AES block 0 - round 6 ldr q12, [x6] // load h1l | h1h ext v12.16b, v12.16b, v12.16b, #8 aese v2.16b, v24.16b aesmc v2.16b, v2.16b // AES block 2 - round 6 ldr q28, [x8, #160] // load rk10 aese v1.16b, v25.16b aesmc v1.16b, v1.16b // AES block 1 - round 7 trn1 v9.2d, v14.2d, v15.2d // h4h | h3h aese v0.16b, v25.16b aesmc v0.16b, v0.16b // AES block 0 - round 7 aese v2.16b, v25.16b aesmc v2.16b, v2.16b // AES block 2 - round 7 aese v3.16b, v25.16b aesmc v3.16b, v3.16b // AES block 3 - round 7 trn2 v16.2d, v12.2d, v13.2d // h2l | h1l aese v1.16b, v26.16b aesmc v1.16b, v1.16b // AES block 1 - round 8 aese v2.16b, v26.16b aesmc v2.16b, v2.16b // AES block 2 - round 8 aese v3.16b, v26.16b aesmc v3.16b, v3.16b // AES block 3 - round 8 aese v0.16b, v26.16b aesmc v0.16b, v0.16b // AES block 0 - round 8 b.lt .Lenc_finish_first_blocks // branch if AES-128 aese v1.16b, v27.16b aesmc v1.16b, v1.16b // AES block 1 - round 9 aese v2.16b, v27.16b aesmc v2.16b, v2.16b // AES block 2 - round 9 aese v3.16b, v27.16b aesmc v3.16b, v3.16b // AES block 3 - round 9 aese v0.16b, v27.16b aesmc v0.16b, v0.16b // AES block 0 - round 9 aese v1.16b, v28.16b aesmc v1.16b, v1.16b // AES block 1 - round 10 aese v2.16b, v28.16b aesmc v2.16b, v2.16b // AES block 2 - round 10 aese v3.16b, v28.16b aesmc v3.16b, v3.16b // AES block 3 - round 10 aese v0.16b, v28.16b aesmc v0.16b, v0.16b // AES block 0 - round 10 b.eq .Lenc_finish_first_blocks // branch if AES-192 aese v1.16b, v29.16b aesmc v1.16b, v1.16b // AES block 1 - round 11 aese v2.16b, v29.16b aesmc v2.16b, v2.16b // AES block 2 - round 11 aese v0.16b, v29.16b aesmc v0.16b, v0.16b // AES block 0 - round 11 aese v3.16b, v29.16b aesmc v3.16b, v3.16b // AES block 3 - round 11 aese v1.16b, v30.16b aesmc v1.16b, v1.16b // AES block 1 - round 12 aese v2.16b, v30.16b aesmc v2.16b, v2.16b // AES block 2 - round 12 aese v0.16b, v30.16b aesmc v0.16b, v0.16b // AES block 0 - round 12 aese v3.16b, v30.16b aesmc v3.16b, v3.16b // AES block 3 - round 12 .Lenc_finish_first_blocks: cmp x0, x5 // check if we have <= 4 blocks eor v17.16b, v17.16b, v9.16b // h4k | h3k aese v2.16b, v31.16b // AES block 2 - round N-1 trn1 v8.2d, v12.2d, v13.2d // h2h | h1h aese v1.16b, v31.16b // AES block 1 - round N-1 aese v0.16b, v31.16b // AES block 0 - round N-1 aese v3.16b, v31.16b // AES block 3 - round N-1 eor v16.16b, v16.16b, v8.16b // h2k | h1k b.ge .Lenc_tail // handle tail ldp x19, x20, [x0, #16] // AES block 1 - load plaintext rev w9, w12 // CTR block 4 ldp x6, x7, [x0, #0] // AES block 0 - load plaintext ldp x23, x24, [x0, #48] // AES block 3 - load plaintext ldp x21, x22, [x0, #32] // AES block 2 - load plaintext add x0, x0, #64 // AES input_ptr update eor x19, x19, x13 // AES block 1 - round N low eor x20, x20, x14 // AES block 1 - round N high fmov d5, x19 // AES block 1 - mov low eor x6, x6, x13 // AES block 0 - round N low eor x7, x7, x14 
// AES block 0 - round N high eor x24, x24, x14 // AES block 3 - round N high fmov d4, x6 // AES block 0 - mov low cmp x0, x5 // check if we have <= 8 blocks fmov v4.d[1], x7 // AES block 0 - mov high eor x23, x23, x13 // AES block 3 - round N low eor x21, x21, x13 // AES block 2 - round N low fmov v5.d[1], x20 // AES block 1 - mov high fmov d6, x21 // AES block 2 - mov low add w12, w12, #1 // CTR block 4 orr x9, x11, x9, lsl #32 // CTR block 4 fmov d7, x23 // AES block 3 - mov low eor x22, x22, x14 // AES block 2 - round N high fmov v6.d[1], x22 // AES block 2 - mov high eor v4.16b, v4.16b, v0.16b // AES block 0 - result fmov d0, x10 // CTR block 4 fmov v0.d[1], x9 // CTR block 4 rev w9, w12 // CTR block 5 add w12, w12, #1 // CTR block 5 eor v5.16b, v5.16b, v1.16b // AES block 1 - result fmov d1, x10 // CTR block 5 orr x9, x11, x9, lsl #32 // CTR block 5 fmov v1.d[1], x9 // CTR block 5 rev w9, w12 // CTR block 6 st1 { v4.16b}, [x2], #16 // AES block 0 - store result fmov v7.d[1], x24 // AES block 3 - mov high orr x9, x11, x9, lsl #32 // CTR block 6 eor v6.16b, v6.16b, v2.16b // AES block 2 - result st1 { v5.16b}, [x2], #16 // AES block 1 - store result add w12, w12, #1 // CTR block 6 fmov d2, x10 // CTR block 6 fmov v2.d[1], x9 // CTR block 6 st1 { v6.16b}, [x2], #16 // AES block 2 - store result rev w9, w12 // CTR block 7 orr x9, x11, x9, lsl #32 // CTR block 7 eor v7.16b, v7.16b, v3.16b // AES block 3 - result st1 { v7.16b}, [x2], #16 // AES block 3 - store result b.ge .Lenc_prepretail // do prepretail .Lenc_main_loop: // main loop start aese v0.16b, v18.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 rev64 v4.16b, v4.16b // GHASH block 4k (only t0 is free) aese v1.16b, v18.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 fmov d3, x10 // CTR block 4k+3 aese v2.16b, v18.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 aese v0.16b, v19.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 fmov v3.d[1], x9 // CTR block 4k+3 aese v1.16b, v19.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 ldp x23, x24, [x0, #48] // AES block 4k+7 - load plaintext aese v2.16b, v19.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 ldp x21, x22, [x0, #32] // AES block 4k+6 - load plaintext aese v0.16b, v20.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 eor v4.16b, v4.16b, v11.16b // PRE 1 aese v1.16b, v20.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 aese v3.16b, v18.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 eor x23, x23, x13 // AES block 4k+7 - round N low aese v0.16b, v21.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 mov d10, v17.d[1] // GHASH block 4k - mid pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high eor x22, x22, x14 // AES block 4k+6 - round N high mov d8, v4.d[1] // GHASH block 4k - mid aese v3.16b, v19.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 rev64 v5.16b, v5.16b // GHASH block 4k+1 (t0 and t1 free) aese v0.16b, v22.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid aese v2.16b, v20.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 aese v0.16b, v23.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 rev64 v7.16b, v7.16b // GHASH block 4k+3 (t0, t1, t2 and t3 free) pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid rev64 v6.16b, v6.16b // GHASH block 4k+2 (t0, t1, and t2 free) pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low eor 
v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high mov d4, v5.d[1] // GHASH block 4k+1 - mid aese v1.16b, v21.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 aese v3.16b, v20.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low aese v2.16b, v21.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 aese v1.16b, v22.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 mov d8, v6.d[1] // GHASH block 4k+2 - mid aese v3.16b, v21.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid aese v2.16b, v22.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 aese v0.16b, v24.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid aese v3.16b, v22.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid aese v0.16b, v25.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 aese v3.16b, v23.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid aese v1.16b, v23.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 aese v0.16b, v26.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 aese v2.16b, v23.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 aese v1.16b, v24.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low aese v1.16b, v25.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 pmull v6.1q, v7.1d, v12.1d // GHASH block 4k+3 - low eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high aese v3.16b, v24.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 ldp x19, x20, [x0, #16] // AES block 4k+5 - load plaintext aese v1.16b, v26.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 mov d4, v7.d[1] // GHASH block 4k+3 - mid aese v2.16b, v24.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high eor v4.8b, v4.8b, v7.8b // GHASH block 4k+3 - mid aese v2.16b, v25.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 eor x19, x19, x13 // AES block 4k+5 - round N low aese v2.16b, v26.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid aese v3.16b, v25.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 eor x21, x21, x13 // AES block 4k+6 - round N low aese v3.16b, v26.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 movi v8.8b, #0xc2 pmull v4.1q, v4.1d, v16.1d // GHASH block 4k+3 - mid eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high cmp x17, #12 // setup flags for AES-128/192/256 check fmov d5, x19 // AES block 4k+5 - mov low ldp x6, x7, [x0, #0] // AES block 4k+4 - load plaintext b.lt .Lenc_main_loop_continue // branch if AES-128 aese v1.16b, v27.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 aese v0.16b, v27.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 aese v2.16b, v27.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 aese v3.16b, v27.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 aese v0.16b, v28.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 aese v1.16b, v28.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 aese v2.16b, v28.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 aese v3.16b, v28.16b aesmc v3.16b, v3.16b // AES block 4k+7 - 
round 10 b.eq .Lenc_main_loop_continue // branch if AES-192 aese v0.16b, v29.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 aese v1.16b, v29.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 aese v2.16b, v29.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 aese v3.16b, v29.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 aese v1.16b, v30.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 aese v0.16b, v30.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 aese v2.16b, v30.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 aese v3.16b, v30.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 .Lenc_main_loop_continue: shl d8, d8, #56 // mod_constant eor v11.16b, v11.16b, v6.16b // GHASH block 4k+3 - low eor v10.16b, v10.16b, v4.16b // GHASH block 4k+3 - mid add w12, w12, #1 // CTR block 4k+3 eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up add x0, x0, #64 // AES input_ptr update pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid rev w9, w12 // CTR block 4k+8 ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment eor x6, x6, x13 // AES block 4k+4 - round N low eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up eor x7, x7, x14 // AES block 4k+4 - round N high fmov d4, x6 // AES block 4k+4 - mov low orr x9, x11, x9, lsl #32 // CTR block 4k+8 eor v7.16b, v9.16b, v7.16b // MODULO - fold into mid eor x20, x20, x14 // AES block 4k+5 - round N high eor x24, x24, x14 // AES block 4k+7 - round N high add w12, w12, #1 // CTR block 4k+8 aese v0.16b, v31.16b // AES block 4k+4 - round N-1 fmov v4.d[1], x7 // AES block 4k+4 - mov high eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid fmov d7, x23 // AES block 4k+7 - mov low aese v1.16b, v31.16b // AES block 4k+5 - round N-1 fmov v5.d[1], x20 // AES block 4k+5 - mov high fmov d6, x21 // AES block 4k+6 - mov low cmp x0, x5 // .LOOP CONTROL fmov v6.d[1], x22 // AES block 4k+6 - mov high pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low eor v4.16b, v4.16b, v0.16b // AES block 4k+4 - result fmov d0, x10 // CTR block 4k+8 fmov v0.d[1], x9 // CTR block 4k+8 rev w9, w12 // CTR block 4k+9 add w12, w12, #1 // CTR block 4k+9 eor v5.16b, v5.16b, v1.16b // AES block 4k+5 - result fmov d1, x10 // CTR block 4k+9 orr x9, x11, x9, lsl #32 // CTR block 4k+9 fmov v1.d[1], x9 // CTR block 4k+9 aese v2.16b, v31.16b // AES block 4k+6 - round N-1 rev w9, w12 // CTR block 4k+10 st1 { v4.16b}, [x2], #16 // AES block 4k+4 - store result orr x9, x11, x9, lsl #32 // CTR block 4k+10 eor v11.16b, v11.16b, v9.16b // MODULO - fold into low fmov v7.d[1], x24 // AES block 4k+7 - mov high ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment st1 { v5.16b}, [x2], #16 // AES block 4k+5 - store result add w12, w12, #1 // CTR block 4k+10 aese v3.16b, v31.16b // AES block 4k+7 - round N-1 eor v6.16b, v6.16b, v2.16b // AES block 4k+6 - result fmov d2, x10 // CTR block 4k+10 st1 { v6.16b}, [x2], #16 // AES block 4k+6 - store result fmov v2.d[1], x9 // CTR block 4k+10 rev w9, w12 // CTR block 4k+11 eor v11.16b, v11.16b, v10.16b // MODULO - fold into low orr x9, x11, x9, lsl #32 // CTR block 4k+11 eor v7.16b, v7.16b, v3.16b // AES block 4k+7 - result st1 { v7.16b}, [x2], #16 // AES block 4k+7 - store result b.lt .Lenc_main_loop .Lenc_prepretail: // PREPRETAIL aese v1.16b, v18.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 rev64 v6.16b, v6.16b // GHASH block 4k+2 (t0, t1, and t2 free) aese v2.16b, v18.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 fmov d3, x10 // CTR block 4k+3 aese v0.16b, 
v18.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 rev64 v4.16b, v4.16b // GHASH block 4k (only t0 is free) fmov v3.d[1], x9 // CTR block 4k+3 ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 aese v2.16b, v19.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 aese v0.16b, v19.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 eor v4.16b, v4.16b, v11.16b // PRE 1 rev64 v5.16b, v5.16b // GHASH block 4k+1 (t0 and t1 free) aese v2.16b, v20.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 aese v3.16b, v18.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 mov d10, v17.d[1] // GHASH block 4k - mid aese v1.16b, v19.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low mov d8, v4.d[1] // GHASH block 4k - mid pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high aese v2.16b, v21.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 aese v1.16b, v20.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid aese v0.16b, v20.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 aese v3.16b, v19.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 aese v1.16b, v21.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low aese v3.16b, v20.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high mov d4, v5.d[1] // GHASH block 4k+1 - mid aese v0.16b, v21.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low aese v3.16b, v21.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid mov d8, v6.d[1] // GHASH block 4k+2 - mid aese v0.16b, v22.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 rev64 v7.16b, v7.16b // GHASH block 4k+3 (t0, t1, t2 and t3 free) aese v3.16b, v22.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid add w12, w12, #1 // CTR block 4k+3 pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low aese v3.16b, v23.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 aese v2.16b, v22.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid aese v2.16b, v23.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high mov d4, v7.d[1] // GHASH block 4k+3 - mid aese v1.16b, v22.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid eor v4.8b, v4.8b, v7.8b // GHASH block 4k+3 - mid pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high aese v1.16b, v23.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 pmull v4.1q, v4.1d, v16.1d // GHASH block 4k+3 - mid eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid aese v0.16b, v23.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 aese v1.16b, v24.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 aese v2.16b, v24.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 aese v0.16b, v24.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 movi v8.8b, #0xc2 aese v3.16b, v24.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 aese v1.16b, v25.16b aesmc 
v1.16b, v1.16b // AES block 4k+5 - round 7 eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high aese v0.16b, v25.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 aese v3.16b, v25.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 shl d8, d8, #56 // mod_constant aese v1.16b, v26.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 eor v10.16b, v10.16b, v4.16b // GHASH block 4k+3 - mid pmull v6.1q, v7.1d, v12.1d // GHASH block 4k+3 - low aese v3.16b, v26.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 cmp x17, #12 // setup flags for AES-128/192/256 check aese v0.16b, v26.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 eor v11.16b, v11.16b, v6.16b // GHASH block 4k+3 - low aese v2.16b, v25.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 eor v10.16b, v10.16b, v9.16b // karatsuba tidy up aese v2.16b, v26.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 pmull v4.1q, v9.1d, v8.1d ext v9.16b, v9.16b, v9.16b, #8 eor v10.16b, v10.16b, v11.16b b.lt .Lenc_finish_prepretail // branch if AES-128 aese v1.16b, v27.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 aese v3.16b, v27.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 aese v0.16b, v27.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 aese v2.16b, v27.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 aese v3.16b, v28.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 aese v1.16b, v28.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 aese v0.16b, v28.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 aese v2.16b, v28.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 b.eq .Lenc_finish_prepretail // branch if AES-192 aese v1.16b, v29.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 aese v0.16b, v29.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 aese v3.16b, v29.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 aese v2.16b, v29.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 aese v1.16b, v30.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 aese v0.16b, v30.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 aese v3.16b, v30.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 aese v2.16b, v30.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 .Lenc_finish_prepretail: eor v10.16b, v10.16b, v4.16b eor v10.16b, v10.16b, v9.16b pmull v4.1q, v10.1d, v8.1d ext v10.16b, v10.16b, v10.16b, #8 aese v1.16b, v31.16b // AES block 4k+5 - round N-1 eor v11.16b, v11.16b, v4.16b aese v3.16b, v31.16b // AES block 4k+7 - round N-1 aese v0.16b, v31.16b // AES block 4k+4 - round N-1 aese v2.16b, v31.16b // AES block 4k+6 - round N-1 eor v11.16b, v11.16b, v10.16b .Lenc_tail: // TAIL ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process ldp x6, x7, [x0], #16 // AES block 4k+4 - load plaintext eor x6, x6, x13 // AES block 4k+4 - round N low eor x7, x7, x14 // AES block 4k+4 - round N high cmp x5, #48 fmov d4, x6 // AES block 4k+4 - mov low fmov v4.d[1], x7 // AES block 4k+4 - mov high eor v5.16b, v4.16b, v0.16b // AES block 4k+4 - result b.gt .Lenc_blocks_more_than_3 cmp x5, #32 mov v3.16b, v2.16b movi v11.8b, #0 movi v9.8b, #0 sub w12, w12, #1 mov v2.16b, v1.16b movi v10.8b, #0 b.gt .Lenc_blocks_more_than_2 mov v3.16b, v1.16b sub w12, w12, #1 cmp x5, #16 b.gt .Lenc_blocks_more_than_1 sub w12, w12, #1 b .Lenc_blocks_less_than_1 .Lenc_blocks_more_than_3: // blocks left > 3 st1 { v5.16b}, [x2], #16 // AES final-3 block - store result ldp x6, x7, [x0], #16 // AES final-2 block - load input low & high rev64 v4.16b, 
v5.16b // GHASH final-3 block eor x6, x6, x13 // AES final-2 block - round N low eor v4.16b, v4.16b, v8.16b // feed in partial tag eor x7, x7, x14 // AES final-2 block - round N high mov d22, v4.d[1] // GHASH final-3 block - mid fmov d5, x6 // AES final-2 block - mov low fmov v5.d[1], x7 // AES final-2 block - mov high eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid movi v8.8b, #0 // suppress further partial tag feed in mov d10, v17.d[1] // GHASH final-3 block - mid pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid eor v5.16b, v5.16b, v1.16b // AES final-2 block - result .Lenc_blocks_more_than_2: // blocks left > 2 st1 { v5.16b}, [x2], #16 // AES final-2 block - store result ldp x6, x7, [x0], #16 // AES final-1 block - load input low & high rev64 v4.16b, v5.16b // GHASH final-2 block eor x6, x6, x13 // AES final-1 block - round N low eor v4.16b, v4.16b, v8.16b // feed in partial tag fmov d5, x6 // AES final-1 block - mov low eor x7, x7, x14 // AES final-1 block - round N high fmov v5.d[1], x7 // AES final-1 block - mov high movi v8.8b, #0 // suppress further partial tag feed in pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high mov d22, v4.d[1] // GHASH final-2 block - mid pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid eor v5.16b, v5.16b, v2.16b // AES final-1 block - result eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid .Lenc_blocks_more_than_1: // blocks left > 1 st1 { v5.16b}, [x2], #16 // AES final-1 block - store result rev64 v4.16b, v5.16b // GHASH final-1 block ldp x6, x7, [x0], #16 // AES final block - load input low & high eor v4.16b, v4.16b, v8.16b // feed in partial tag movi v8.8b, #0 // suppress further partial tag feed in eor x6, x6, x13 // AES final block - round N low mov d22, v4.d[1] // GHASH final-1 block - mid pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high eor x7, x7, x14 // AES final block - round N high eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high ins v22.d[1], v22.d[0] // GHASH final-1 block - mid fmov d5, x6 // AES final block - mov low fmov v5.d[1], x7 // AES final block - mov high pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low eor v5.16b, v5.16b, v3.16b // AES final block - result eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low .Lenc_blocks_less_than_1: // blocks left <= 1 and x1, x1, #127 // bit_length %= 128 mvn x13, xzr // rkN_l = 0xffffffffffffffff sub x1, x1, #128 // bit_length -= 128 neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) ld1 { v18.16b}, [x2] // load existing bytes where the possibly partial last block is to be stored mvn x14, xzr // rkN_h = 0xffffffffffffffff and x1, x1, #127 // bit_length %= 128 lsr x14, x14, x1 // rkN_h is mask for top 64b of last block cmp x1, #64 csel x6, x13, x14, lt csel x7, x14, xzr, lt fmov d0, x6 // ctr0b is mask for last block fmov v0.d[1], x7 and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits rev64 v4.16b, v5.16b // GHASH final block eor v4.16b, v4.16b, v8.16b // feed in 
partial tag bif v5.16b, v18.16b, v0.16b // insert existing bytes in top end of result before storing pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high mov d8, v4.d[1] // GHASH final block - mid rev w9, w12 pmull v21.1q, v4.1d, v12.1d // GHASH final block - low eor v9.16b, v9.16b, v20.16b // GHASH final block - high eor v8.8b, v8.8b, v4.8b // GHASH final block - mid pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid eor v11.16b, v11.16b, v21.16b // GHASH final block - low eor v10.16b, v10.16b, v8.16b // GHASH final block - mid movi v8.8b, #0xc2 eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up shl d8, d8, #56 // mod_constant eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment str w9, [x16, #12] // store the updated counter st1 { v5.16b}, [x2] // store all 16B eor v11.16b, v11.16b, v9.16b // MODULO - fold into low eor v11.16b, v11.16b, v10.16b // MODULO - fold into low ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b mov x0, x15 st1 { v11.16b }, [x3] ldp x19, x20, [sp, #16] ldp x21, x22, [sp, #32] ldp x23, x24, [sp, #48] ldp d8, d9, [sp, #64] ldp d10, d11, [sp, #80] ldp d12, d13, [sp, #96] ldp d14, d15, [sp, #112] ldp x29, x30, [sp], #128 AARCH64_VALIDATE_LINK_REGISTER ret .size aes_gcm_enc_kernel,.-aes_gcm_enc_kernel .globl aes_gcm_dec_kernel .hidden aes_gcm_dec_kernel .type aes_gcm_dec_kernel,%function .align 4 aes_gcm_dec_kernel: AARCH64_SIGN_LINK_REGISTER stp x29, x30, [sp, #-128]! 
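//
// The encrypt kernel above keeps the CTR counter as a host-endian 32-bit value
// in w12, rebuilds each big-endian counter block with rev/orr into x9, and
// writes the updated counter back into the counter block at offset 12
// (str w9, [x16, #12]). A minimal Rust sketch of that convention, assuming the
// standard GCM layout of a big-endian 32-bit counter in the last four bytes of
// the 16-byte counter block; the helper name next_ctr_block is illustrative
// only and not part of ring's API:
//
//     fn next_ctr_block(ctr: &mut [u8; 16]) -> [u8; 16] {
//         let block = *ctr; // counter value consumed by this AES block
//         let c = u32::from_be_bytes(ctr[12..16].try_into().unwrap());
//         ctr[12..16].copy_from_slice(&c.wrapping_add(1).to_be_bytes());
//         block
//     }
//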
mov x29, sp stp x19, x20, [sp, #16] mov x16, x4 mov x8, x5 stp x21, x22, [sp, #32] stp x23, x24, [sp, #48] stp d8, d9, [sp, #64] stp d10, d11, [sp, #80] stp d12, d13, [sp, #96] stp d14, d15, [sp, #112] ldr w17, [x8, #240] add x19, x8, x17, lsl #4 // borrow input_l1 for last key ldp x13, x14, [x19] // load round N keys ldr q31, [x19, #-16] // load round N-1 keys lsr x5, x1, #3 // byte_len mov x15, x5 ldp x10, x11, [x16] // ctr96_b64, ctr96_t32 ldr q26, [x8, #128] // load rk8 sub x5, x5, #1 // byte_len - 1 ldr q25, [x8, #112] // load rk7 and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) add x4, x0, x1, lsr #3 // end_input_ptr ldr q24, [x8, #96] // load rk6 lsr x12, x11, #32 ldr q23, [x8, #80] // load rk5 orr w11, w11, w11 ldr q21, [x8, #48] // load rk3 add x5, x5, x0 rev w12, w12 // rev_ctr32 add w12, w12, #1 // increment rev_ctr32 fmov d3, x10 // CTR block 3 rev w9, w12 // CTR block 1 add w12, w12, #1 // CTR block 1 fmov d1, x10 // CTR block 1 orr x9, x11, x9, lsl #32 // CTR block 1 ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible fmov v1.d[1], x9 // CTR block 1 rev w9, w12 // CTR block 2 add w12, w12, #1 // CTR block 2 fmov d2, x10 // CTR block 2 orr x9, x11, x9, lsl #32 // CTR block 2 fmov v2.d[1], x9 // CTR block 2 rev w9, w12 // CTR block 3 orr x9, x11, x9, lsl #32 // CTR block 3 ldr q18, [x8, #0] // load rk0 fmov v3.d[1], x9 // CTR block 3 add w12, w12, #1 // CTR block 3 ldr q22, [x8, #64] // load rk4 ldr q19, [x8, #16] // load rk1 aese v0.16b, v18.16b aesmc v0.16b, v0.16b // AES block 0 - round 0 ldr q14, [x6, #48] // load h3l | h3h ext v14.16b, v14.16b, v14.16b, #8 aese v3.16b, v18.16b aesmc v3.16b, v3.16b // AES block 3 - round 0 ldr q15, [x6, #80] // load h4l | h4h ext v15.16b, v15.16b, v15.16b, #8 aese v1.16b, v18.16b aesmc v1.16b, v1.16b // AES block 1 - round 0 ldr q13, [x6, #32] // load h2l | h2h ext v13.16b, v13.16b, v13.16b, #8 aese v2.16b, v18.16b aesmc v2.16b, v2.16b // AES block 2 - round 0 ldr q20, [x8, #32] // load rk2 aese v0.16b, v19.16b aesmc v0.16b, v0.16b // AES block 0 - round 1 aese v1.16b, v19.16b aesmc v1.16b, v1.16b // AES block 1 - round 1 ld1 { v11.16b}, [x3] ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b aese v2.16b, v19.16b aesmc v2.16b, v2.16b // AES block 2 - round 1 ldr q27, [x8, #144] // load rk9 aese v3.16b, v19.16b aesmc v3.16b, v3.16b // AES block 3 - round 1 ldr q30, [x8, #192] // load rk12 aese v0.16b, v20.16b aesmc v0.16b, v0.16b // AES block 0 - round 2 ldr q12, [x6] // load h1l | h1h ext v12.16b, v12.16b, v12.16b, #8 aese v2.16b, v20.16b aesmc v2.16b, v2.16b // AES block 2 - round 2 ldr q28, [x8, #160] // load rk10 aese v3.16b, v20.16b aesmc v3.16b, v3.16b // AES block 3 - round 2 aese v0.16b, v21.16b aesmc v0.16b, v0.16b // AES block 0 - round 3 aese v1.16b, v20.16b aesmc v1.16b, v1.16b // AES block 1 - round 2 aese v3.16b, v21.16b aesmc v3.16b, v3.16b // AES block 3 - round 3 aese v0.16b, v22.16b aesmc v0.16b, v0.16b // AES block 0 - round 4 aese v2.16b, v21.16b aesmc v2.16b, v2.16b // AES block 2 - round 3 aese v1.16b, v21.16b aesmc v1.16b, v1.16b // AES block 1 - round 3 aese v3.16b, v22.16b aesmc v3.16b, v3.16b // AES block 3 - round 4 aese v2.16b, v22.16b aesmc v2.16b, v2.16b // AES block 2 - round 4 aese v1.16b, v22.16b aesmc v1.16b, v1.16b // AES block 1 - round 4 aese v3.16b, v23.16b aesmc v3.16b, v3.16b // AES block 3 - round 5 aese v0.16b, v23.16b aesmc v0.16b, v0.16b // AES block 0 - 
round 5 aese v1.16b, v23.16b aesmc v1.16b, v1.16b // AES block 1 - round 5 aese v2.16b, v23.16b aesmc v2.16b, v2.16b // AES block 2 - round 5 aese v0.16b, v24.16b aesmc v0.16b, v0.16b // AES block 0 - round 6 aese v3.16b, v24.16b aesmc v3.16b, v3.16b // AES block 3 - round 6 cmp x17, #12 // setup flags for AES-128/192/256 check aese v1.16b, v24.16b aesmc v1.16b, v1.16b // AES block 1 - round 6 aese v2.16b, v24.16b aesmc v2.16b, v2.16b // AES block 2 - round 6 aese v0.16b, v25.16b aesmc v0.16b, v0.16b // AES block 0 - round 7 aese v1.16b, v25.16b aesmc v1.16b, v1.16b // AES block 1 - round 7 aese v3.16b, v25.16b aesmc v3.16b, v3.16b // AES block 3 - round 7 aese v0.16b, v26.16b aesmc v0.16b, v0.16b // AES block 0 - round 8 aese v2.16b, v25.16b aesmc v2.16b, v2.16b // AES block 2 - round 7 aese v3.16b, v26.16b aesmc v3.16b, v3.16b // AES block 3 - round 8 aese v1.16b, v26.16b aesmc v1.16b, v1.16b // AES block 1 - round 8 ldr q29, [x8, #176] // load rk11 aese v2.16b, v26.16b aesmc v2.16b, v2.16b // AES block 2 - round 8 b.lt .Ldec_finish_first_blocks // branch if AES-128 aese v0.16b, v27.16b aesmc v0.16b, v0.16b // AES block 0 - round 9 aese v1.16b, v27.16b aesmc v1.16b, v1.16b // AES block 1 - round 9 aese v3.16b, v27.16b aesmc v3.16b, v3.16b // AES block 3 - round 9 aese v2.16b, v27.16b aesmc v2.16b, v2.16b // AES block 2 - round 9 aese v0.16b, v28.16b aesmc v0.16b, v0.16b // AES block 0 - round 10 aese v1.16b, v28.16b aesmc v1.16b, v1.16b // AES block 1 - round 10 aese v3.16b, v28.16b aesmc v3.16b, v3.16b // AES block 3 - round 10 aese v2.16b, v28.16b aesmc v2.16b, v2.16b // AES block 2 - round 10 b.eq .Ldec_finish_first_blocks // branch if AES-192 aese v0.16b, v29.16b aesmc v0.16b, v0.16b // AES block 0 - round 11 aese v3.16b, v29.16b aesmc v3.16b, v3.16b // AES block 3 - round 11 aese v1.16b, v29.16b aesmc v1.16b, v1.16b // AES block 1 - round 11 aese v2.16b, v29.16b aesmc v2.16b, v2.16b // AES block 2 - round 11 aese v1.16b, v30.16b aesmc v1.16b, v1.16b // AES block 1 - round 12 aese v0.16b, v30.16b aesmc v0.16b, v0.16b // AES block 0 - round 12 aese v2.16b, v30.16b aesmc v2.16b, v2.16b // AES block 2 - round 12 aese v3.16b, v30.16b aesmc v3.16b, v3.16b // AES block 3 - round 12 .Ldec_finish_first_blocks: cmp x0, x5 // check if we have <= 4 blocks trn1 v9.2d, v14.2d, v15.2d // h4h | h3h trn2 v17.2d, v14.2d, v15.2d // h4l | h3l trn1 v8.2d, v12.2d, v13.2d // h2h | h1h trn2 v16.2d, v12.2d, v13.2d // h2l | h1l eor v17.16b, v17.16b, v9.16b // h4k | h3k aese v1.16b, v31.16b // AES block 1 - round N-1 aese v2.16b, v31.16b // AES block 2 - round N-1 eor v16.16b, v16.16b, v8.16b // h2k | h1k aese v3.16b, v31.16b // AES block 3 - round N-1 aese v0.16b, v31.16b // AES block 0 - round N-1 b.ge .Ldec_tail // handle tail ldr q4, [x0, #0] // AES block 0 - load ciphertext ldr q5, [x0, #16] // AES block 1 - load ciphertext rev w9, w12 // CTR block 4 eor v0.16b, v4.16b, v0.16b // AES block 0 - result eor v1.16b, v5.16b, v1.16b // AES block 1 - result rev64 v5.16b, v5.16b // GHASH block 1 ldr q7, [x0, #48] // AES block 3 - load ciphertext mov x7, v0.d[1] // AES block 0 - mov high mov x6, v0.d[0] // AES block 0 - mov low rev64 v4.16b, v4.16b // GHASH block 0 add w12, w12, #1 // CTR block 4 fmov d0, x10 // CTR block 4 orr x9, x11, x9, lsl #32 // CTR block 4 fmov v0.d[1], x9 // CTR block 4 rev w9, w12 // CTR block 5 add w12, w12, #1 // CTR block 5 mov x19, v1.d[0] // AES block 1 - mov low orr x9, x11, x9, lsl #32 // CTR block 5 mov x20, v1.d[1] // AES block 1 - mov high eor x7, x7, x14 // AES block 0 - round 
N high eor x6, x6, x13 // AES block 0 - round N low stp x6, x7, [x2], #16 // AES block 0 - store result fmov d1, x10 // CTR block 5 ldr q6, [x0, #32] // AES block 2 - load ciphertext add x0, x0, #64 // AES input_ptr update fmov v1.d[1], x9 // CTR block 5 rev w9, w12 // CTR block 6 add w12, w12, #1 // CTR block 6 eor x19, x19, x13 // AES block 1 - round N low orr x9, x11, x9, lsl #32 // CTR block 6 eor x20, x20, x14 // AES block 1 - round N high stp x19, x20, [x2], #16 // AES block 1 - store result eor v2.16b, v6.16b, v2.16b // AES block 2 - result cmp x0, x5 // check if we have <= 8 blocks b.ge .Ldec_prepretail // do prepretail .Ldec_main_loop: // main loop start mov x21, v2.d[0] // AES block 4k+2 - mov low ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result aese v0.16b, v18.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 mov x22, v2.d[1] // AES block 4k+2 - mov high aese v1.16b, v18.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 fmov d2, x10 // CTR block 4k+6 fmov v2.d[1], x9 // CTR block 4k+6 eor v4.16b, v4.16b, v11.16b // PRE 1 rev w9, w12 // CTR block 4k+7 aese v0.16b, v19.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 mov x24, v3.d[1] // AES block 4k+3 - mov high aese v1.16b, v19.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 mov x23, v3.d[0] // AES block 4k+3 - mov low pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high mov d8, v4.d[1] // GHASH block 4k - mid fmov d3, x10 // CTR block 4k+7 aese v0.16b, v20.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 orr x9, x11, x9, lsl #32 // CTR block 4k+7 aese v2.16b, v18.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 fmov v3.d[1], x9 // CTR block 4k+7 aese v1.16b, v20.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid aese v0.16b, v21.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 eor x22, x22, x14 // AES block 4k+2 - round N high aese v2.16b, v19.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 mov d10, v17.d[1] // GHASH block 4k - mid aese v1.16b, v21.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 rev64 v6.16b, v6.16b // GHASH block 4k+2 aese v3.16b, v18.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 eor x21, x21, x13 // AES block 4k+2 - round N low aese v2.16b, v20.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 stp x21, x22, [x2], #16 // AES block 4k+2 - store result pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high aese v2.16b, v21.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 rev64 v7.16b, v7.16b // GHASH block 4k+3 pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid eor x23, x23, x13 // AES block 4k+3 - round N low pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low eor x24, x24, x14 // AES block 4k+3 - round N high eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high aese v2.16b, v22.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 aese v3.16b, v19.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 mov d4, v5.d[1] // GHASH block 4k+1 - mid aese v0.16b, v22.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low aese v2.16b, v23.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 add w12, w12, #1 // CTR block 4k+7 aese v3.16b, v20.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 mov d8, v6.d[1] // GHASH block 4k+2 - mid aese v1.16b, v22.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid pmull v5.1q, 
v6.1d, v13.1d // GHASH block 4k+2 - low aese v3.16b, v21.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid aese v1.16b, v23.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 aese v0.16b, v23.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid rev w9, w12 // CTR block 4k+8 aese v1.16b, v24.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid aese v0.16b, v24.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 add w12, w12, #1 // CTR block 4k+8 aese v3.16b, v22.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 aese v1.16b, v25.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid aese v0.16b, v25.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high mov d6, v7.d[1] // GHASH block 4k+3 - mid aese v3.16b, v23.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid aese v0.16b, v26.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high aese v3.16b, v24.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low orr x9, x11, x9, lsl #32 // CTR block 4k+8 eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high cmp x17, #12 // setup flags for AES-128/192/256 check eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid aese v1.16b, v26.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 aese v2.16b, v24.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid movi v8.8b, #0xc2 aese v2.16b, v25.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low aese v3.16b, v25.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 shl d8, d8, #56 // mod_constant aese v2.16b, v26.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid aese v3.16b, v26.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 b.lt .Ldec_main_loop_continue // branch if AES-128 aese v0.16b, v27.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 aese v2.16b, v27.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 aese v1.16b, v27.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 aese v3.16b, v27.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 aese v0.16b, v28.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 aese v1.16b, v28.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 aese v2.16b, v28.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 aese v3.16b, v28.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 b.eq .Ldec_main_loop_continue // branch if AES-192 aese v0.16b, v29.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 aese v1.16b, v29.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 aese v2.16b, v29.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 aese v3.16b, v29.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 aese v0.16b, v30.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 aese v1.16b, v30.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 aese v2.16b, v30.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 aese v3.16b, v30.16b aesmc v3.16b, v3.16b // AES block 
4k+7 - round 12 .Ldec_main_loop_continue: pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up ldr q4, [x0, #0] // AES block 4k+4 - load ciphertext aese v0.16b, v31.16b // AES block 4k+4 - round N-1 ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up ldr q5, [x0, #16] // AES block 4k+5 - load ciphertext eor v0.16b, v4.16b, v0.16b // AES block 4k+4 - result stp x23, x24, [x2], #16 // AES block 4k+3 - store result eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid ldr q7, [x0, #48] // AES block 4k+7 - load ciphertext ldr q6, [x0, #32] // AES block 4k+6 - load ciphertext mov x7, v0.d[1] // AES block 4k+4 - mov high eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid aese v1.16b, v31.16b // AES block 4k+5 - round N-1 add x0, x0, #64 // AES input_ptr update mov x6, v0.d[0] // AES block 4k+4 - mov low fmov d0, x10 // CTR block 4k+8 fmov v0.d[1], x9 // CTR block 4k+8 pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low eor v1.16b, v5.16b, v1.16b // AES block 4k+5 - result rev w9, w12 // CTR block 4k+9 aese v2.16b, v31.16b // AES block 4k+6 - round N-1 orr x9, x11, x9, lsl #32 // CTR block 4k+9 cmp x0, x5 // .LOOP CONTROL add w12, w12, #1 // CTR block 4k+9 eor x6, x6, x13 // AES block 4k+4 - round N low eor x7, x7, x14 // AES block 4k+4 - round N high mov x20, v1.d[1] // AES block 4k+5 - mov high eor v2.16b, v6.16b, v2.16b // AES block 4k+6 - result eor v11.16b, v11.16b, v8.16b // MODULO - fold into low mov x19, v1.d[0] // AES block 4k+5 - mov low fmov d1, x10 // CTR block 4k+9 ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment fmov v1.d[1], x9 // CTR block 4k+9 rev w9, w12 // CTR block 4k+10 add w12, w12, #1 // CTR block 4k+10 aese v3.16b, v31.16b // AES block 4k+7 - round N-1 orr x9, x11, x9, lsl #32 // CTR block 4k+10 rev64 v5.16b, v5.16b // GHASH block 4k+5 eor x20, x20, x14 // AES block 4k+5 - round N high stp x6, x7, [x2], #16 // AES block 4k+4 - store result eor x19, x19, x13 // AES block 4k+5 - round N low stp x19, x20, [x2], #16 // AES block 4k+5 - store result rev64 v4.16b, v4.16b // GHASH block 4k+4 eor v11.16b, v11.16b, v10.16b // MODULO - fold into low b.lt .Ldec_main_loop .Ldec_prepretail: // PREPRETAIL ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 mov x21, v2.d[0] // AES block 4k+2 - mov low eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result aese v0.16b, v18.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 mov x22, v2.d[1] // AES block 4k+2 - mov high aese v1.16b, v18.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 fmov d2, x10 // CTR block 4k+6 fmov v2.d[1], x9 // CTR block 4k+6 rev w9, w12 // CTR block 4k+7 eor v4.16b, v4.16b, v11.16b // PRE 1 rev64 v6.16b, v6.16b // GHASH block 4k+2 orr x9, x11, x9, lsl #32 // CTR block 4k+7 mov x23, v3.d[0] // AES block 4k+3 - mov low aese v1.16b, v19.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 mov x24, v3.d[1] // AES block 4k+3 - mov high pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low mov d8, v4.d[1] // GHASH block 4k - mid fmov d3, x10 // CTR block 4k+7 pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high fmov v3.d[1], x9 // CTR block 4k+7 aese v2.16b, v18.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 mov d10, v17.d[1] // GHASH block 4k - mid aese v0.16b, v19.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high aese v2.16b, v19.16b aesmc v2.16b, 
v2.16b // AES block 4k+6 - round 1 rev64 v7.16b, v7.16b // GHASH block 4k+3 aese v3.16b, v18.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low aese v3.16b, v19.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 mov d4, v5.d[1] // GHASH block 4k+1 - mid aese v0.16b, v20.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 aese v1.16b, v20.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low aese v2.16b, v20.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 aese v0.16b, v21.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 mov d8, v6.d[1] // GHASH block 4k+2 - mid aese v3.16b, v20.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low aese v0.16b, v22.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 aese v3.16b, v21.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid aese v0.16b, v23.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low aese v3.16b, v22.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high aese v3.16b, v23.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid aese v2.16b, v21.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 aese v1.16b, v21.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low aese v2.16b, v22.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 mov d6, v7.d[1] // GHASH block 4k+3 - mid aese v1.16b, v22.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid aese v2.16b, v23.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid aese v1.16b, v23.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 aese v3.16b, v24.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid aese v2.16b, v24.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 aese v0.16b, v24.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 movi v8.8b, #0xc2 aese v1.16b, v24.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid aese v3.16b, v25.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 cmp x17, #12 // setup flags for AES-128/192/256 check eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high aese v1.16b, v25.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 aese v0.16b, v25.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid aese v3.16b, v26.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 aese v2.16b, v25.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up aese v1.16b, v26.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 aese v0.16b, v26.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 shl d8, d8, #56 // 
mod_constant aese v2.16b, v26.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 b.lt .Ldec_finish_prepretail // branch if AES-128 aese v1.16b, v27.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 aese v2.16b, v27.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 aese v3.16b, v27.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 aese v0.16b, v27.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 aese v2.16b, v28.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 aese v3.16b, v28.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 aese v0.16b, v28.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 aese v1.16b, v28.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 b.eq .Ldec_finish_prepretail // branch if AES-192 aese v2.16b, v29.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 aese v0.16b, v29.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 aese v1.16b, v29.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 aese v2.16b, v30.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 aese v3.16b, v29.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 aese v1.16b, v30.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 aese v0.16b, v30.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 aese v3.16b, v30.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 .Ldec_finish_prepretail: eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid eor x22, x22, x14 // AES block 4k+2 - round N high eor x23, x23, x13 // AES block 4k+3 - round N low eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid add w12, w12, #1 // CTR block 4k+7 eor x21, x21, x13 // AES block 4k+2 - round N low pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low eor x24, x24, x14 // AES block 4k+3 - round N high stp x21, x22, [x2], #16 // AES block 4k+2 - store result ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment stp x23, x24, [x2], #16 // AES block 4k+3 - store result eor v11.16b, v11.16b, v8.16b // MODULO - fold into low aese v1.16b, v31.16b // AES block 4k+5 - round N-1 aese v0.16b, v31.16b // AES block 4k+4 - round N-1 aese v3.16b, v31.16b // AES block 4k+7 - round N-1 aese v2.16b, v31.16b // AES block 4k+6 - round N-1 eor v11.16b, v11.16b, v10.16b // MODULO - fold into low .Ldec_tail: // TAIL sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process ld1 { v5.16b}, [x0], #16 // AES block 4k+4 - load ciphertext eor v0.16b, v5.16b, v0.16b // AES block 4k+4 - result mov x6, v0.d[0] // AES block 4k+4 - mov low mov x7, v0.d[1] // AES block 4k+4 - mov high ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag cmp x5, #48 eor x6, x6, x13 // AES block 4k+4 - round N low eor x7, x7, x14 // AES block 4k+4 - round N high b.gt .Ldec_blocks_more_than_3 sub w12, w12, #1 mov v3.16b, v2.16b movi v10.8b, #0 movi v11.8b, #0 cmp x5, #32 movi v9.8b, #0 mov v2.16b, v1.16b b.gt .Ldec_blocks_more_than_2 sub w12, w12, #1 mov v3.16b, v1.16b cmp x5, #16 b.gt .Ldec_blocks_more_than_1 sub w12, w12, #1 b .Ldec_blocks_less_than_1 .Ldec_blocks_more_than_3: // blocks left > 3 rev64 v4.16b, v5.16b // GHASH final-3 block ld1 { v5.16b}, [x0], #16 // AES final-2 block - load ciphertext stp x6, x7, [x2], #16 // AES final-3 block - store result mov d10, v17.d[1] // GHASH final-3 block - mid eor v4.16b, v4.16b, v8.16b // feed in partial tag eor v0.16b, v5.16b, v1.16b // AES final-2 block - result 
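//
// Each GHASH block above is multiplied by a power of the hash key with three
// 64x64 carry-less multiplies: pmull for the low halves, pmull2 for the high
// halves, and a "mid" product of the XORed halves against the precomputed
// h4k|h3k / h2k|h1k values, with the "karatsuba tidy up" folding low and high
// back into mid. A minimal Rust sketch of that Karatsuba decomposition over
// GF(2)[x], using a plain software carry-less multiply; the helper names are
// illustrative only:
//
//     fn clmul64(a: u64, b: u64) -> u128 {
//         let mut acc = 0u128;
//         for i in 0..64 {
//             if (b >> i) & 1 == 1 {
//                 acc ^= (a as u128) << i;
//             }
//         }
//         acc
//     }
//
//     // Returns the (high, mid, low) 128-bit partial products of a 128x128
//     // carry-less multiply, using exactly three clmuls as in the kernel.
//     fn ghash_block_mul(a_hi: u64, a_lo: u64, h_hi: u64, h_lo: u64) -> (u128, u128, u128) {
//         let low = clmul64(a_lo, h_lo);
//         let high = clmul64(a_hi, h_hi);
//         let mid = clmul64(a_hi ^ a_lo, h_hi ^ h_lo) ^ high ^ low; // karatsuba tidy up
//         (high, mid, low)
//     }
//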
mov d22, v4.d[1] // GHASH final-3 block - mid mov x6, v0.d[0] // AES final-2 block - mov low mov x7, v0.d[1] // AES final-2 block - mov high eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid movi v8.8b, #0 // suppress further partial tag feed in pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid eor x6, x6, x13 // AES final-2 block - round N low pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low eor x7, x7, x14 // AES final-2 block - round N high .Ldec_blocks_more_than_2: // blocks left > 2 rev64 v4.16b, v5.16b // GHASH final-2 block ld1 { v5.16b}, [x0], #16 // AES final-1 block - load ciphertext eor v4.16b, v4.16b, v8.16b // feed in partial tag stp x6, x7, [x2], #16 // AES final-2 block - store result eor v0.16b, v5.16b, v2.16b // AES final-1 block - result mov d22, v4.d[1] // GHASH final-2 block - mid pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid mov x6, v0.d[0] // AES final-1 block - mov low mov x7, v0.d[1] // AES final-1 block - mov high eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low movi v8.8b, #0 // suppress further partial tag feed in pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high eor x6, x6, x13 // AES final-1 block - round N low eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid eor x7, x7, x14 // AES final-1 block - round N high .Ldec_blocks_more_than_1: // blocks left > 1 stp x6, x7, [x2], #16 // AES final-1 block - store result rev64 v4.16b, v5.16b // GHASH final-1 block ld1 { v5.16b}, [x0], #16 // AES final block - load ciphertext eor v4.16b, v4.16b, v8.16b // feed in partial tag movi v8.8b, #0 // suppress further partial tag feed in mov d22, v4.d[1] // GHASH final-1 block - mid eor v0.16b, v5.16b, v3.16b // AES final block - result pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low mov x6, v0.d[0] // AES final block - mov low ins v22.d[1], v22.d[0] // GHASH final-1 block - mid mov x7, v0.d[1] // AES final block - mov high pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid eor x6, x6, x13 // AES final block - round N low eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid eor x7, x7, x14 // AES final block - round N high .Ldec_blocks_less_than_1: // blocks left <= 1 and x1, x1, #127 // bit_length %= 128 mvn x14, xzr // rkN_h = 0xffffffffffffffff sub x1, x1, #128 // bit_length -= 128 mvn x13, xzr // rkN_l = 0xffffffffffffffff ldp x4, x5, [x2] // load existing bytes we need to not overwrite neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) and x1, x1, #127 // bit_length %= 128 lsr x14, x14, x1 // rkN_h is mask for top 64b of last block cmp x1, #64 csel x9, x13, x14, lt csel x10, x14, xzr, lt fmov d0, x9 // ctr0b is mask for last block and x6, x6, x9 mov v0.d[1], x10 bic x4, x4, x9 // mask out low existing bytes rev w9, w12 bic x5, x5, x10 // mask out high existing bytes orr x6, x6, x4 and x7, x7, x10 orr x7, x7, x5 and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits rev64 v4.16b, v5.16b // GHASH final block eor v4.16b, v4.16b, v8.16b // feed in partial tag pmull v21.1q, v4.1d, v12.1d // GHASH final 
block - low mov d8, v4.d[1] // GHASH final block - mid eor v8.8b, v8.8b, v4.8b // GHASH final block - mid pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid eor v9.16b, v9.16b, v20.16b // GHASH final block - high eor v11.16b, v11.16b, v21.16b // GHASH final block - low eor v10.16b, v10.16b, v8.16b // GHASH final block - mid movi v8.8b, #0xc2 eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up shl d8, d8, #56 // mod_constant eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment eor v11.16b, v11.16b, v8.16b // MODULO - fold into low stp x6, x7, [x2] str w9, [x16, #12] // store the updated counter eor v11.16b, v11.16b, v10.16b // MODULO - fold into low ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b mov x0, x15 st1 { v11.16b }, [x3] ldp x19, x20, [sp, #16] ldp x21, x22, [sp, #32] ldp x23, x24, [sp, #48] ldp d8, d9, [sp, #64] ldp d10, d11, [sp, #80] ldp d12, d13, [sp, #96] ldp d14, d15, [sp, #112] ldp x29, x30, [sp], #128 AARCH64_VALIDATE_LINK_REGISTER ret .size aes_gcm_dec_kernel,.-aes_gcm_dec_kernel #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) ring-0.17.14/pregenerated/aesv8-gcm-armv8-win64.S000064400000000000000000002404001046102023000173200ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) #if __ARM_MAX_ARCH__ >= 8 .arch armv8-a+crypto .text .globl aes_gcm_enc_kernel .def aes_gcm_enc_kernel .type 32 .endef .align 4 aes_gcm_enc_kernel: AARCH64_SIGN_LINK_REGISTER stp x29, x30, [sp, #-128]! 
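//
// Both tail paths above handle a final block of 1..16 bytes by deriving a byte
// mask from the bit length: the unused tail of the block is zeroed before it
// is absorbed into GHASH, and the bytes already present past the end of the
// output are preserved (ld1/bif in the encrypt tail, ldp/bic/orr in the
// decrypt tail). A minimal Rust sketch of that behaviour, assuming `remaining`
// is the number of valid bytes in the last block; the helper names are
// illustrative only and not part of ring's API:
//
//     // Zero-pad the partial block before feeding it to GHASH.
//     fn ghash_pad(last: &[u8; 16], remaining: usize) -> [u8; 16] {
//         let mut padded = [0u8; 16];
//         padded[..remaining].copy_from_slice(&last[..remaining]);
//         padded
//     }
//
//     // Write only the valid bytes, leaving the rest of `out` untouched.
//     fn store_partial_block(out: &mut [u8; 16], last: &[u8; 16], remaining: usize) {
//         assert!((1..=16).contains(&remaining));
//         out[..remaining].copy_from_slice(&last[..remaining]);
//     }
//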
mov x29, sp stp x19, x20, [sp, #16] mov x16, x4 mov x8, x5 stp x21, x22, [sp, #32] stp x23, x24, [sp, #48] stp d8, d9, [sp, #64] stp d10, d11, [sp, #80] stp d12, d13, [sp, #96] stp d14, d15, [sp, #112] ldr w17, [x8, #240] add x19, x8, x17, lsl #4 // borrow input_l1 for last key ldp x13, x14, [x19] // load round N keys ldr q31, [x19, #-16] // load round N-1 keys add x4, x0, x1, lsr #3 // end_input_ptr lsr x5, x1, #3 // byte_len mov x15, x5 ldp x10, x11, [x16] // ctr96_b64, ctr96_t32 ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible sub x5, x5, #1 // byte_len - 1 ldr q18, [x8, #0] // load rk0 and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) ldr q25, [x8, #112] // load rk7 add x5, x5, x0 lsr x12, x11, #32 fmov d2, x10 // CTR block 2 orr w11, w11, w11 rev w12, w12 // rev_ctr32 fmov d1, x10 // CTR block 1 aese v0.16b, v18.16b aesmc v0.16b, v0.16b // AES block 0 - round 0 add w12, w12, #1 // increment rev_ctr32 rev w9, w12 // CTR block 1 fmov d3, x10 // CTR block 3 orr x9, x11, x9, lsl #32 // CTR block 1 add w12, w12, #1 // CTR block 1 ldr q19, [x8, #16] // load rk1 fmov v1.d[1], x9 // CTR block 1 rev w9, w12 // CTR block 2 add w12, w12, #1 // CTR block 2 orr x9, x11, x9, lsl #32 // CTR block 2 ldr q20, [x8, #32] // load rk2 fmov v2.d[1], x9 // CTR block 2 rev w9, w12 // CTR block 3 aese v0.16b, v19.16b aesmc v0.16b, v0.16b // AES block 0 - round 1 orr x9, x11, x9, lsl #32 // CTR block 3 fmov v3.d[1], x9 // CTR block 3 aese v1.16b, v18.16b aesmc v1.16b, v1.16b // AES block 1 - round 0 ldr q21, [x8, #48] // load rk3 aese v0.16b, v20.16b aesmc v0.16b, v0.16b // AES block 0 - round 2 ldr q24, [x8, #96] // load rk6 aese v2.16b, v18.16b aesmc v2.16b, v2.16b // AES block 2 - round 0 ldr q23, [x8, #80] // load rk5 aese v1.16b, v19.16b aesmc v1.16b, v1.16b // AES block 1 - round 1 ldr q14, [x6, #48] // load h3l | h3h ext v14.16b, v14.16b, v14.16b, #8 aese v3.16b, v18.16b aesmc v3.16b, v3.16b // AES block 3 - round 0 aese v2.16b, v19.16b aesmc v2.16b, v2.16b // AES block 2 - round 1 ldr q22, [x8, #64] // load rk4 aese v1.16b, v20.16b aesmc v1.16b, v1.16b // AES block 1 - round 2 ldr q13, [x6, #32] // load h2l | h2h ext v13.16b, v13.16b, v13.16b, #8 aese v3.16b, v19.16b aesmc v3.16b, v3.16b // AES block 3 - round 1 ldr q30, [x8, #192] // load rk12 aese v2.16b, v20.16b aesmc v2.16b, v2.16b // AES block 2 - round 2 ldr q15, [x6, #80] // load h4l | h4h ext v15.16b, v15.16b, v15.16b, #8 aese v1.16b, v21.16b aesmc v1.16b, v1.16b // AES block 1 - round 3 ldr q29, [x8, #176] // load rk11 aese v3.16b, v20.16b aesmc v3.16b, v3.16b // AES block 3 - round 2 ldr q26, [x8, #128] // load rk8 aese v2.16b, v21.16b aesmc v2.16b, v2.16b // AES block 2 - round 3 add w12, w12, #1 // CTR block 3 aese v0.16b, v21.16b aesmc v0.16b, v0.16b // AES block 0 - round 3 aese v3.16b, v21.16b aesmc v3.16b, v3.16b // AES block 3 - round 3 ld1 { v11.16b}, [x3] ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b aese v2.16b, v22.16b aesmc v2.16b, v2.16b // AES block 2 - round 4 aese v0.16b, v22.16b aesmc v0.16b, v0.16b // AES block 0 - round 4 aese v1.16b, v22.16b aesmc v1.16b, v1.16b // AES block 1 - round 4 aese v3.16b, v22.16b aesmc v3.16b, v3.16b // AES block 3 - round 4 cmp x17, #12 // setup flags for AES-128/192/256 check aese v0.16b, v23.16b aesmc v0.16b, v0.16b // AES block 0 - round 5 aese v1.16b, v23.16b aesmc v1.16b, v1.16b // AES block 1 - round 5 aese v3.16b, v23.16b aesmc v3.16b, 
v3.16b // AES block 3 - round 5 aese v2.16b, v23.16b aesmc v2.16b, v2.16b // AES block 2 - round 5 aese v1.16b, v24.16b aesmc v1.16b, v1.16b // AES block 1 - round 6 trn2 v17.2d, v14.2d, v15.2d // h4l | h3l aese v3.16b, v24.16b aesmc v3.16b, v3.16b // AES block 3 - round 6 ldr q27, [x8, #144] // load rk9 aese v0.16b, v24.16b aesmc v0.16b, v0.16b // AES block 0 - round 6 ldr q12, [x6] // load h1l | h1h ext v12.16b, v12.16b, v12.16b, #8 aese v2.16b, v24.16b aesmc v2.16b, v2.16b // AES block 2 - round 6 ldr q28, [x8, #160] // load rk10 aese v1.16b, v25.16b aesmc v1.16b, v1.16b // AES block 1 - round 7 trn1 v9.2d, v14.2d, v15.2d // h4h | h3h aese v0.16b, v25.16b aesmc v0.16b, v0.16b // AES block 0 - round 7 aese v2.16b, v25.16b aesmc v2.16b, v2.16b // AES block 2 - round 7 aese v3.16b, v25.16b aesmc v3.16b, v3.16b // AES block 3 - round 7 trn2 v16.2d, v12.2d, v13.2d // h2l | h1l aese v1.16b, v26.16b aesmc v1.16b, v1.16b // AES block 1 - round 8 aese v2.16b, v26.16b aesmc v2.16b, v2.16b // AES block 2 - round 8 aese v3.16b, v26.16b aesmc v3.16b, v3.16b // AES block 3 - round 8 aese v0.16b, v26.16b aesmc v0.16b, v0.16b // AES block 0 - round 8 b.lt Lenc_finish_first_blocks // branch if AES-128 aese v1.16b, v27.16b aesmc v1.16b, v1.16b // AES block 1 - round 9 aese v2.16b, v27.16b aesmc v2.16b, v2.16b // AES block 2 - round 9 aese v3.16b, v27.16b aesmc v3.16b, v3.16b // AES block 3 - round 9 aese v0.16b, v27.16b aesmc v0.16b, v0.16b // AES block 0 - round 9 aese v1.16b, v28.16b aesmc v1.16b, v1.16b // AES block 1 - round 10 aese v2.16b, v28.16b aesmc v2.16b, v2.16b // AES block 2 - round 10 aese v3.16b, v28.16b aesmc v3.16b, v3.16b // AES block 3 - round 10 aese v0.16b, v28.16b aesmc v0.16b, v0.16b // AES block 0 - round 10 b.eq Lenc_finish_first_blocks // branch if AES-192 aese v1.16b, v29.16b aesmc v1.16b, v1.16b // AES block 1 - round 11 aese v2.16b, v29.16b aesmc v2.16b, v2.16b // AES block 2 - round 11 aese v0.16b, v29.16b aesmc v0.16b, v0.16b // AES block 0 - round 11 aese v3.16b, v29.16b aesmc v3.16b, v3.16b // AES block 3 - round 11 aese v1.16b, v30.16b aesmc v1.16b, v1.16b // AES block 1 - round 12 aese v2.16b, v30.16b aesmc v2.16b, v2.16b // AES block 2 - round 12 aese v0.16b, v30.16b aesmc v0.16b, v0.16b // AES block 0 - round 12 aese v3.16b, v30.16b aesmc v3.16b, v3.16b // AES block 3 - round 12 Lenc_finish_first_blocks: cmp x0, x5 // check if we have <= 4 blocks eor v17.16b, v17.16b, v9.16b // h4k | h3k aese v2.16b, v31.16b // AES block 2 - round N-1 trn1 v8.2d, v12.2d, v13.2d // h2h | h1h aese v1.16b, v31.16b // AES block 1 - round N-1 aese v0.16b, v31.16b // AES block 0 - round N-1 aese v3.16b, v31.16b // AES block 3 - round N-1 eor v16.16b, v16.16b, v8.16b // h2k | h1k b.ge Lenc_tail // handle tail ldp x19, x20, [x0, #16] // AES block 1 - load plaintext rev w9, w12 // CTR block 4 ldp x6, x7, [x0, #0] // AES block 0 - load plaintext ldp x23, x24, [x0, #48] // AES block 3 - load plaintext ldp x21, x22, [x0, #32] // AES block 2 - load plaintext add x0, x0, #64 // AES input_ptr update eor x19, x19, x13 // AES block 1 - round N low eor x20, x20, x14 // AES block 1 - round N high fmov d5, x19 // AES block 1 - mov low eor x6, x6, x13 // AES block 0 - round N low eor x7, x7, x14 // AES block 0 - round N high eor x24, x24, x14 // AES block 3 - round N high fmov d4, x6 // AES block 0 - mov low cmp x0, x5 // check if we have <= 8 blocks fmov v4.d[1], x7 // AES block 0 - mov high eor x23, x23, x13 // AES block 3 - round N low eor x21, x21, x13 // AES block 2 - round N low fmov v5.d[1], x20 // 
AES block 1 - mov high fmov d6, x21 // AES block 2 - mov low add w12, w12, #1 // CTR block 4 orr x9, x11, x9, lsl #32 // CTR block 4 fmov d7, x23 // AES block 3 - mov low eor x22, x22, x14 // AES block 2 - round N high fmov v6.d[1], x22 // AES block 2 - mov high eor v4.16b, v4.16b, v0.16b // AES block 0 - result fmov d0, x10 // CTR block 4 fmov v0.d[1], x9 // CTR block 4 rev w9, w12 // CTR block 5 add w12, w12, #1 // CTR block 5 eor v5.16b, v5.16b, v1.16b // AES block 1 - result fmov d1, x10 // CTR block 5 orr x9, x11, x9, lsl #32 // CTR block 5 fmov v1.d[1], x9 // CTR block 5 rev w9, w12 // CTR block 6 st1 { v4.16b}, [x2], #16 // AES block 0 - store result fmov v7.d[1], x24 // AES block 3 - mov high orr x9, x11, x9, lsl #32 // CTR block 6 eor v6.16b, v6.16b, v2.16b // AES block 2 - result st1 { v5.16b}, [x2], #16 // AES block 1 - store result add w12, w12, #1 // CTR block 6 fmov d2, x10 // CTR block 6 fmov v2.d[1], x9 // CTR block 6 st1 { v6.16b}, [x2], #16 // AES block 2 - store result rev w9, w12 // CTR block 7 orr x9, x11, x9, lsl #32 // CTR block 7 eor v7.16b, v7.16b, v3.16b // AES block 3 - result st1 { v7.16b}, [x2], #16 // AES block 3 - store result b.ge Lenc_prepretail // do prepretail Lenc_main_loop: // main loop start aese v0.16b, v18.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 rev64 v4.16b, v4.16b // GHASH block 4k (only t0 is free) aese v1.16b, v18.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 fmov d3, x10 // CTR block 4k+3 aese v2.16b, v18.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 aese v0.16b, v19.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 fmov v3.d[1], x9 // CTR block 4k+3 aese v1.16b, v19.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 ldp x23, x24, [x0, #48] // AES block 4k+7 - load plaintext aese v2.16b, v19.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 ldp x21, x22, [x0, #32] // AES block 4k+6 - load plaintext aese v0.16b, v20.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 eor v4.16b, v4.16b, v11.16b // PRE 1 aese v1.16b, v20.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 aese v3.16b, v18.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 eor x23, x23, x13 // AES block 4k+7 - round N low aese v0.16b, v21.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 mov d10, v17.d[1] // GHASH block 4k - mid pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high eor x22, x22, x14 // AES block 4k+6 - round N high mov d8, v4.d[1] // GHASH block 4k - mid aese v3.16b, v19.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 rev64 v5.16b, v5.16b // GHASH block 4k+1 (t0 and t1 free) aese v0.16b, v22.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid aese v2.16b, v20.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 aese v0.16b, v23.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 rev64 v7.16b, v7.16b // GHASH block 4k+3 (t0, t1, t2 and t3 free) pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid rev64 v6.16b, v6.16b // GHASH block 4k+2 (t0, t1, and t2 free) pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high mov d4, v5.d[1] // GHASH block 4k+1 - mid aese v1.16b, v21.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 aese v3.16b, v20.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low aese v2.16b, v21.16b aesmc 
v2.16b, v2.16b // AES block 4k+6 - round 3 aese v1.16b, v22.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 mov d8, v6.d[1] // GHASH block 4k+2 - mid aese v3.16b, v21.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid aese v2.16b, v22.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 aese v0.16b, v24.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid aese v3.16b, v22.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid aese v0.16b, v25.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 aese v3.16b, v23.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid aese v1.16b, v23.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 aese v0.16b, v26.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 aese v2.16b, v23.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 aese v1.16b, v24.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low aese v1.16b, v25.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 pmull v6.1q, v7.1d, v12.1d // GHASH block 4k+3 - low eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high aese v3.16b, v24.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 ldp x19, x20, [x0, #16] // AES block 4k+5 - load plaintext aese v1.16b, v26.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 mov d4, v7.d[1] // GHASH block 4k+3 - mid aese v2.16b, v24.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high eor v4.8b, v4.8b, v7.8b // GHASH block 4k+3 - mid aese v2.16b, v25.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 eor x19, x19, x13 // AES block 4k+5 - round N low aese v2.16b, v26.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid aese v3.16b, v25.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 eor x21, x21, x13 // AES block 4k+6 - round N low aese v3.16b, v26.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 movi v8.8b, #0xc2 pmull v4.1q, v4.1d, v16.1d // GHASH block 4k+3 - mid eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high cmp x17, #12 // setup flags for AES-128/192/256 check fmov d5, x19 // AES block 4k+5 - mov low ldp x6, x7, [x0, #0] // AES block 4k+4 - load plaintext b.lt Lenc_main_loop_continue // branch if AES-128 aese v1.16b, v27.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 aese v0.16b, v27.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 aese v2.16b, v27.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 aese v3.16b, v27.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 aese v0.16b, v28.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 aese v1.16b, v28.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 aese v2.16b, v28.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 aese v3.16b, v28.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 b.eq Lenc_main_loop_continue // branch if AES-192 aese v0.16b, v29.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 aese v1.16b, v29.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 aese v2.16b, v29.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 aese v3.16b, v29.16b aesmc v3.16b, v3.16b // 
AES block 4k+7 - round 11 aese v1.16b, v30.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 aese v0.16b, v30.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 aese v2.16b, v30.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 aese v3.16b, v30.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 Lenc_main_loop_continue: shl d8, d8, #56 // mod_constant eor v11.16b, v11.16b, v6.16b // GHASH block 4k+3 - low eor v10.16b, v10.16b, v4.16b // GHASH block 4k+3 - mid add w12, w12, #1 // CTR block 4k+3 eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up add x0, x0, #64 // AES input_ptr update pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid rev w9, w12 // CTR block 4k+8 ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment eor x6, x6, x13 // AES block 4k+4 - round N low eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up eor x7, x7, x14 // AES block 4k+4 - round N high fmov d4, x6 // AES block 4k+4 - mov low orr x9, x11, x9, lsl #32 // CTR block 4k+8 eor v7.16b, v9.16b, v7.16b // MODULO - fold into mid eor x20, x20, x14 // AES block 4k+5 - round N high eor x24, x24, x14 // AES block 4k+7 - round N high add w12, w12, #1 // CTR block 4k+8 aese v0.16b, v31.16b // AES block 4k+4 - round N-1 fmov v4.d[1], x7 // AES block 4k+4 - mov high eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid fmov d7, x23 // AES block 4k+7 - mov low aese v1.16b, v31.16b // AES block 4k+5 - round N-1 fmov v5.d[1], x20 // AES block 4k+5 - mov high fmov d6, x21 // AES block 4k+6 - mov low cmp x0, x5 // LOOP CONTROL fmov v6.d[1], x22 // AES block 4k+6 - mov high pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low eor v4.16b, v4.16b, v0.16b // AES block 4k+4 - result fmov d0, x10 // CTR block 4k+8 fmov v0.d[1], x9 // CTR block 4k+8 rev w9, w12 // CTR block 4k+9 add w12, w12, #1 // CTR block 4k+9 eor v5.16b, v5.16b, v1.16b // AES block 4k+5 - result fmov d1, x10 // CTR block 4k+9 orr x9, x11, x9, lsl #32 // CTR block 4k+9 fmov v1.d[1], x9 // CTR block 4k+9 aese v2.16b, v31.16b // AES block 4k+6 - round N-1 rev w9, w12 // CTR block 4k+10 st1 { v4.16b}, [x2], #16 // AES block 4k+4 - store result orr x9, x11, x9, lsl #32 // CTR block 4k+10 eor v11.16b, v11.16b, v9.16b // MODULO - fold into low fmov v7.d[1], x24 // AES block 4k+7 - mov high ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment st1 { v5.16b}, [x2], #16 // AES block 4k+5 - store result add w12, w12, #1 // CTR block 4k+10 aese v3.16b, v31.16b // AES block 4k+7 - round N-1 eor v6.16b, v6.16b, v2.16b // AES block 4k+6 - result fmov d2, x10 // CTR block 4k+10 st1 { v6.16b}, [x2], #16 // AES block 4k+6 - store result fmov v2.d[1], x9 // CTR block 4k+10 rev w9, w12 // CTR block 4k+11 eor v11.16b, v11.16b, v10.16b // MODULO - fold into low orr x9, x11, x9, lsl #32 // CTR block 4k+11 eor v7.16b, v7.16b, v3.16b // AES block 4k+7 - result st1 { v7.16b}, [x2], #16 // AES block 4k+7 - store result b.lt Lenc_main_loop Lenc_prepretail: // PREPRETAIL aese v1.16b, v18.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 rev64 v6.16b, v6.16b // GHASH block 4k+2 (t0, t1, and t2 free) aese v2.16b, v18.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 fmov d3, x10 // CTR block 4k+3 aese v0.16b, v18.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 rev64 v4.16b, v4.16b // GHASH block 4k (only t0 is free) fmov v3.d[1], x9 // CTR block 4k+3 ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 aese v2.16b, v19.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 aese v0.16b, v19.16b aesmc v0.16b, v0.16b // AES block 4k+4 - 
round 1 eor v4.16b, v4.16b, v11.16b // PRE 1 rev64 v5.16b, v5.16b // GHASH block 4k+1 (t0 and t1 free) aese v2.16b, v20.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 aese v3.16b, v18.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 mov d10, v17.d[1] // GHASH block 4k - mid aese v1.16b, v19.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low mov d8, v4.d[1] // GHASH block 4k - mid pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high aese v2.16b, v21.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 aese v1.16b, v20.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid aese v0.16b, v20.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 aese v3.16b, v19.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 aese v1.16b, v21.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low aese v3.16b, v20.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high mov d4, v5.d[1] // GHASH block 4k+1 - mid aese v0.16b, v21.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low aese v3.16b, v21.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid mov d8, v6.d[1] // GHASH block 4k+2 - mid aese v0.16b, v22.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 rev64 v7.16b, v7.16b // GHASH block 4k+3 (t0, t1, t2 and t3 free) aese v3.16b, v22.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid add w12, w12, #1 // CTR block 4k+3 pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low aese v3.16b, v23.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 aese v2.16b, v22.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid aese v2.16b, v23.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high mov d4, v7.d[1] // GHASH block 4k+3 - mid aese v1.16b, v22.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid eor v4.8b, v4.8b, v7.8b // GHASH block 4k+3 - mid pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high aese v1.16b, v23.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 pmull v4.1q, v4.1d, v16.1d // GHASH block 4k+3 - mid eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid aese v0.16b, v23.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 aese v1.16b, v24.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 aese v2.16b, v24.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 aese v0.16b, v24.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 movi v8.8b, #0xc2 aese v3.16b, v24.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 aese v1.16b, v25.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high aese v0.16b, v25.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 aese v3.16b, v25.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 shl d8, d8, #56 // mod_constant aese v1.16b, v26.16b aesmc v1.16b, v1.16b // AES block 
4k+5 - round 8 eor v10.16b, v10.16b, v4.16b // GHASH block 4k+3 - mid pmull v6.1q, v7.1d, v12.1d // GHASH block 4k+3 - low aese v3.16b, v26.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 cmp x17, #12 // setup flags for AES-128/192/256 check aese v0.16b, v26.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 eor v11.16b, v11.16b, v6.16b // GHASH block 4k+3 - low aese v2.16b, v25.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 eor v10.16b, v10.16b, v9.16b // karatsuba tidy up aese v2.16b, v26.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 pmull v4.1q, v9.1d, v8.1d ext v9.16b, v9.16b, v9.16b, #8 eor v10.16b, v10.16b, v11.16b b.lt Lenc_finish_prepretail // branch if AES-128 aese v1.16b, v27.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 aese v3.16b, v27.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 aese v0.16b, v27.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 aese v2.16b, v27.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 aese v3.16b, v28.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 aese v1.16b, v28.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 aese v0.16b, v28.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 aese v2.16b, v28.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 b.eq Lenc_finish_prepretail // branch if AES-192 aese v1.16b, v29.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 aese v0.16b, v29.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 aese v3.16b, v29.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 aese v2.16b, v29.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 aese v1.16b, v30.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 aese v0.16b, v30.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 aese v3.16b, v30.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 aese v2.16b, v30.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 Lenc_finish_prepretail: eor v10.16b, v10.16b, v4.16b eor v10.16b, v10.16b, v9.16b pmull v4.1q, v10.1d, v8.1d ext v10.16b, v10.16b, v10.16b, #8 aese v1.16b, v31.16b // AES block 4k+5 - round N-1 eor v11.16b, v11.16b, v4.16b aese v3.16b, v31.16b // AES block 4k+7 - round N-1 aese v0.16b, v31.16b // AES block 4k+4 - round N-1 aese v2.16b, v31.16b // AES block 4k+6 - round N-1 eor v11.16b, v11.16b, v10.16b Lenc_tail: // TAIL ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process ldp x6, x7, [x0], #16 // AES block 4k+4 - load plaintext eor x6, x6, x13 // AES block 4k+4 - round N low eor x7, x7, x14 // AES block 4k+4 - round N high cmp x5, #48 fmov d4, x6 // AES block 4k+4 - mov low fmov v4.d[1], x7 // AES block 4k+4 - mov high eor v5.16b, v4.16b, v0.16b // AES block 4k+4 - result b.gt Lenc_blocks_more_than_3 cmp x5, #32 mov v3.16b, v2.16b movi v11.8b, #0 movi v9.8b, #0 sub w12, w12, #1 mov v2.16b, v1.16b movi v10.8b, #0 b.gt Lenc_blocks_more_than_2 mov v3.16b, v1.16b sub w12, w12, #1 cmp x5, #16 b.gt Lenc_blocks_more_than_1 sub w12, w12, #1 b Lenc_blocks_less_than_1 Lenc_blocks_more_than_3: // blocks left > 3 st1 { v5.16b}, [x2], #16 // AES final-3 block - store result ldp x6, x7, [x0], #16 // AES final-2 block - load input low & high rev64 v4.16b, v5.16b // GHASH final-3 block eor x6, x6, x13 // AES final-2 block - round N low eor v4.16b, v4.16b, v8.16b // feed in partial tag eor x7, x7, x14 // AES final-2 block - round N high mov d22, v4.d[1] // GHASH final-3 block - mid fmov d5, x6 // AES final-2 block - mov low fmov v5.d[1], x7 // AES final-2 block - mov high eor v22.8b, 
v22.8b, v4.8b // GHASH final-3 block - mid movi v8.8b, #0 // suppress further partial tag feed in mov d10, v17.d[1] // GHASH final-3 block - mid pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid eor v5.16b, v5.16b, v1.16b // AES final-2 block - result Lenc_blocks_more_than_2: // blocks left > 2 st1 { v5.16b}, [x2], #16 // AES final-2 block - store result ldp x6, x7, [x0], #16 // AES final-1 block - load input low & high rev64 v4.16b, v5.16b // GHASH final-2 block eor x6, x6, x13 // AES final-1 block - round N low eor v4.16b, v4.16b, v8.16b // feed in partial tag fmov d5, x6 // AES final-1 block - mov low eor x7, x7, x14 // AES final-1 block - round N high fmov v5.d[1], x7 // AES final-1 block - mov high movi v8.8b, #0 // suppress further partial tag feed in pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high mov d22, v4.d[1] // GHASH final-2 block - mid pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid eor v5.16b, v5.16b, v2.16b // AES final-1 block - result eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid Lenc_blocks_more_than_1: // blocks left > 1 st1 { v5.16b}, [x2], #16 // AES final-1 block - store result rev64 v4.16b, v5.16b // GHASH final-1 block ldp x6, x7, [x0], #16 // AES final block - load input low & high eor v4.16b, v4.16b, v8.16b // feed in partial tag movi v8.8b, #0 // suppress further partial tag feed in eor x6, x6, x13 // AES final block - round N low mov d22, v4.d[1] // GHASH final-1 block - mid pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high eor x7, x7, x14 // AES final block - round N high eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high ins v22.d[1], v22.d[0] // GHASH final-1 block - mid fmov d5, x6 // AES final block - mov low fmov v5.d[1], x7 // AES final block - mov high pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low eor v5.16b, v5.16b, v3.16b // AES final block - result eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low Lenc_blocks_less_than_1: // blocks left <= 1 and x1, x1, #127 // bit_length %= 128 mvn x13, xzr // rkN_l = 0xffffffffffffffff sub x1, x1, #128 // bit_length -= 128 neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) ld1 { v18.16b}, [x2] // load existing bytes where the possibly partial last block is to be stored mvn x14, xzr // rkN_h = 0xffffffffffffffff and x1, x1, #127 // bit_length %= 128 lsr x14, x14, x1 // rkN_h is mask for top 64b of last block cmp x1, #64 csel x6, x13, x14, lt csel x7, x14, xzr, lt fmov d0, x6 // ctr0b is mask for last block fmov v0.d[1], x7 and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits rev64 v4.16b, v5.16b // GHASH final block eor v4.16b, v4.16b, v8.16b // feed in partial tag bif v5.16b, v18.16b, v0.16b // insert existing bytes in top end of result before storing pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high mov d8, v4.d[1] // GHASH final block - mid rev w9, w12 pmull v21.1q, v4.1d, v12.1d // GHASH final block - low eor v9.16b, v9.16b, v20.16b // GHASH final block - high eor v8.8b, 
v8.8b, v4.8b // GHASH final block - mid pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid eor v11.16b, v11.16b, v21.16b // GHASH final block - low eor v10.16b, v10.16b, v8.16b // GHASH final block - mid movi v8.8b, #0xc2 eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up shl d8, d8, #56 // mod_constant eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment str w9, [x16, #12] // store the updated counter st1 { v5.16b}, [x2] // store all 16B eor v11.16b, v11.16b, v9.16b // MODULO - fold into low eor v11.16b, v11.16b, v10.16b // MODULO - fold into low ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b mov x0, x15 st1 { v11.16b }, [x3] ldp x19, x20, [sp, #16] ldp x21, x22, [sp, #32] ldp x23, x24, [sp, #48] ldp d8, d9, [sp, #64] ldp d10, d11, [sp, #80] ldp d12, d13, [sp, #96] ldp d14, d15, [sp, #112] ldp x29, x30, [sp], #128 AARCH64_VALIDATE_LINK_REGISTER ret .globl aes_gcm_dec_kernel .def aes_gcm_dec_kernel .type 32 .endef .align 4 aes_gcm_dec_kernel: AARCH64_SIGN_LINK_REGISTER stp x29, x30, [sp, #-128]! mov x29, sp stp x19, x20, [sp, #16] mov x16, x4 mov x8, x5 stp x21, x22, [sp, #32] stp x23, x24, [sp, #48] stp d8, d9, [sp, #64] stp d10, d11, [sp, #80] stp d12, d13, [sp, #96] stp d14, d15, [sp, #112] ldr w17, [x8, #240] add x19, x8, x17, lsl #4 // borrow input_l1 for last key ldp x13, x14, [x19] // load round N keys ldr q31, [x19, #-16] // load round N-1 keys lsr x5, x1, #3 // byte_len mov x15, x5 ldp x10, x11, [x16] // ctr96_b64, ctr96_t32 ldr q26, [x8, #128] // load rk8 sub x5, x5, #1 // byte_len - 1 ldr q25, [x8, #112] // load rk7 and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) add x4, x0, x1, lsr #3 // end_input_ptr ldr q24, [x8, #96] // load rk6 lsr x12, x11, #32 ldr q23, [x8, #80] // load rk5 orr w11, w11, w11 ldr q21, [x8, #48] // load rk3 add x5, x5, x0 rev w12, w12 // rev_ctr32 add w12, w12, #1 // increment rev_ctr32 fmov d3, x10 // CTR block 3 rev w9, w12 // CTR block 1 add w12, w12, #1 // CTR block 1 fmov d1, x10 // CTR block 1 orr x9, x11, x9, lsl #32 // CTR block 1 ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible fmov v1.d[1], x9 // CTR block 1 rev w9, w12 // CTR block 2 add w12, w12, #1 // CTR block 2 fmov d2, x10 // CTR block 2 orr x9, x11, x9, lsl #32 // CTR block 2 fmov v2.d[1], x9 // CTR block 2 rev w9, w12 // CTR block 3 orr x9, x11, x9, lsl #32 // CTR block 3 ldr q18, [x8, #0] // load rk0 fmov v3.d[1], x9 // CTR block 3 add w12, w12, #1 // CTR block 3 ldr q22, [x8, #64] // load rk4 ldr q19, [x8, #16] // load rk1 aese v0.16b, v18.16b aesmc v0.16b, v0.16b // AES block 0 - round 0 ldr q14, [x6, #48] // load h3l | h3h ext v14.16b, v14.16b, v14.16b, #8 aese v3.16b, v18.16b aesmc v3.16b, v3.16b // AES block 3 - round 0 ldr q15, [x6, #80] // load h4l | h4h ext v15.16b, v15.16b, v15.16b, #8 aese v1.16b, v18.16b aesmc v1.16b, v1.16b // AES block 1 - round 0 ldr q13, [x6, #32] // load h2l | h2h ext v13.16b, v13.16b, v13.16b, #8 aese v2.16b, v18.16b aesmc v2.16b, v2.16b // AES block 2 - round 0 ldr q20, [x8, #32] // load rk2 aese v0.16b, v19.16b aesmc v0.16b, 
v0.16b // AES block 0 - round 1 aese v1.16b, v19.16b aesmc v1.16b, v1.16b // AES block 1 - round 1 ld1 { v11.16b}, [x3] ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b aese v2.16b, v19.16b aesmc v2.16b, v2.16b // AES block 2 - round 1 ldr q27, [x8, #144] // load rk9 aese v3.16b, v19.16b aesmc v3.16b, v3.16b // AES block 3 - round 1 ldr q30, [x8, #192] // load rk12 aese v0.16b, v20.16b aesmc v0.16b, v0.16b // AES block 0 - round 2 ldr q12, [x6] // load h1l | h1h ext v12.16b, v12.16b, v12.16b, #8 aese v2.16b, v20.16b aesmc v2.16b, v2.16b // AES block 2 - round 2 ldr q28, [x8, #160] // load rk10 aese v3.16b, v20.16b aesmc v3.16b, v3.16b // AES block 3 - round 2 aese v0.16b, v21.16b aesmc v0.16b, v0.16b // AES block 0 - round 3 aese v1.16b, v20.16b aesmc v1.16b, v1.16b // AES block 1 - round 2 aese v3.16b, v21.16b aesmc v3.16b, v3.16b // AES block 3 - round 3 aese v0.16b, v22.16b aesmc v0.16b, v0.16b // AES block 0 - round 4 aese v2.16b, v21.16b aesmc v2.16b, v2.16b // AES block 2 - round 3 aese v1.16b, v21.16b aesmc v1.16b, v1.16b // AES block 1 - round 3 aese v3.16b, v22.16b aesmc v3.16b, v3.16b // AES block 3 - round 4 aese v2.16b, v22.16b aesmc v2.16b, v2.16b // AES block 2 - round 4 aese v1.16b, v22.16b aesmc v1.16b, v1.16b // AES block 1 - round 4 aese v3.16b, v23.16b aesmc v3.16b, v3.16b // AES block 3 - round 5 aese v0.16b, v23.16b aesmc v0.16b, v0.16b // AES block 0 - round 5 aese v1.16b, v23.16b aesmc v1.16b, v1.16b // AES block 1 - round 5 aese v2.16b, v23.16b aesmc v2.16b, v2.16b // AES block 2 - round 5 aese v0.16b, v24.16b aesmc v0.16b, v0.16b // AES block 0 - round 6 aese v3.16b, v24.16b aesmc v3.16b, v3.16b // AES block 3 - round 6 cmp x17, #12 // setup flags for AES-128/192/256 check aese v1.16b, v24.16b aesmc v1.16b, v1.16b // AES block 1 - round 6 aese v2.16b, v24.16b aesmc v2.16b, v2.16b // AES block 2 - round 6 aese v0.16b, v25.16b aesmc v0.16b, v0.16b // AES block 0 - round 7 aese v1.16b, v25.16b aesmc v1.16b, v1.16b // AES block 1 - round 7 aese v3.16b, v25.16b aesmc v3.16b, v3.16b // AES block 3 - round 7 aese v0.16b, v26.16b aesmc v0.16b, v0.16b // AES block 0 - round 8 aese v2.16b, v25.16b aesmc v2.16b, v2.16b // AES block 2 - round 7 aese v3.16b, v26.16b aesmc v3.16b, v3.16b // AES block 3 - round 8 aese v1.16b, v26.16b aesmc v1.16b, v1.16b // AES block 1 - round 8 ldr q29, [x8, #176] // load rk11 aese v2.16b, v26.16b aesmc v2.16b, v2.16b // AES block 2 - round 8 b.lt Ldec_finish_first_blocks // branch if AES-128 aese v0.16b, v27.16b aesmc v0.16b, v0.16b // AES block 0 - round 9 aese v1.16b, v27.16b aesmc v1.16b, v1.16b // AES block 1 - round 9 aese v3.16b, v27.16b aesmc v3.16b, v3.16b // AES block 3 - round 9 aese v2.16b, v27.16b aesmc v2.16b, v2.16b // AES block 2 - round 9 aese v0.16b, v28.16b aesmc v0.16b, v0.16b // AES block 0 - round 10 aese v1.16b, v28.16b aesmc v1.16b, v1.16b // AES block 1 - round 10 aese v3.16b, v28.16b aesmc v3.16b, v3.16b // AES block 3 - round 10 aese v2.16b, v28.16b aesmc v2.16b, v2.16b // AES block 2 - round 10 b.eq Ldec_finish_first_blocks // branch if AES-192 aese v0.16b, v29.16b aesmc v0.16b, v0.16b // AES block 0 - round 11 aese v3.16b, v29.16b aesmc v3.16b, v3.16b // AES block 3 - round 11 aese v1.16b, v29.16b aesmc v1.16b, v1.16b // AES block 1 - round 11 aese v2.16b, v29.16b aesmc v2.16b, v2.16b // AES block 2 - round 11 aese v1.16b, v30.16b aesmc v1.16b, v1.16b // AES block 1 - round 12 aese v0.16b, v30.16b aesmc v0.16b, v0.16b // AES block 0 - round 12 aese v2.16b, v30.16b aesmc v2.16b, v2.16b // AES block 2 - round 
12 aese v3.16b, v30.16b aesmc v3.16b, v3.16b // AES block 3 - round 12 Ldec_finish_first_blocks: cmp x0, x5 // check if we have <= 4 blocks trn1 v9.2d, v14.2d, v15.2d // h4h | h3h trn2 v17.2d, v14.2d, v15.2d // h4l | h3l trn1 v8.2d, v12.2d, v13.2d // h2h | h1h trn2 v16.2d, v12.2d, v13.2d // h2l | h1l eor v17.16b, v17.16b, v9.16b // h4k | h3k aese v1.16b, v31.16b // AES block 1 - round N-1 aese v2.16b, v31.16b // AES block 2 - round N-1 eor v16.16b, v16.16b, v8.16b // h2k | h1k aese v3.16b, v31.16b // AES block 3 - round N-1 aese v0.16b, v31.16b // AES block 0 - round N-1 b.ge Ldec_tail // handle tail ldr q4, [x0, #0] // AES block 0 - load ciphertext ldr q5, [x0, #16] // AES block 1 - load ciphertext rev w9, w12 // CTR block 4 eor v0.16b, v4.16b, v0.16b // AES block 0 - result eor v1.16b, v5.16b, v1.16b // AES block 1 - result rev64 v5.16b, v5.16b // GHASH block 1 ldr q7, [x0, #48] // AES block 3 - load ciphertext mov x7, v0.d[1] // AES block 0 - mov high mov x6, v0.d[0] // AES block 0 - mov low rev64 v4.16b, v4.16b // GHASH block 0 add w12, w12, #1 // CTR block 4 fmov d0, x10 // CTR block 4 orr x9, x11, x9, lsl #32 // CTR block 4 fmov v0.d[1], x9 // CTR block 4 rev w9, w12 // CTR block 5 add w12, w12, #1 // CTR block 5 mov x19, v1.d[0] // AES block 1 - mov low orr x9, x11, x9, lsl #32 // CTR block 5 mov x20, v1.d[1] // AES block 1 - mov high eor x7, x7, x14 // AES block 0 - round N high eor x6, x6, x13 // AES block 0 - round N low stp x6, x7, [x2], #16 // AES block 0 - store result fmov d1, x10 // CTR block 5 ldr q6, [x0, #32] // AES block 2 - load ciphertext add x0, x0, #64 // AES input_ptr update fmov v1.d[1], x9 // CTR block 5 rev w9, w12 // CTR block 6 add w12, w12, #1 // CTR block 6 eor x19, x19, x13 // AES block 1 - round N low orr x9, x11, x9, lsl #32 // CTR block 6 eor x20, x20, x14 // AES block 1 - round N high stp x19, x20, [x2], #16 // AES block 1 - store result eor v2.16b, v6.16b, v2.16b // AES block 2 - result cmp x0, x5 // check if we have <= 8 blocks b.ge Ldec_prepretail // do prepretail Ldec_main_loop: // main loop start mov x21, v2.d[0] // AES block 4k+2 - mov low ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result aese v0.16b, v18.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 mov x22, v2.d[1] // AES block 4k+2 - mov high aese v1.16b, v18.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 fmov d2, x10 // CTR block 4k+6 fmov v2.d[1], x9 // CTR block 4k+6 eor v4.16b, v4.16b, v11.16b // PRE 1 rev w9, w12 // CTR block 4k+7 aese v0.16b, v19.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 mov x24, v3.d[1] // AES block 4k+3 - mov high aese v1.16b, v19.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 mov x23, v3.d[0] // AES block 4k+3 - mov low pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high mov d8, v4.d[1] // GHASH block 4k - mid fmov d3, x10 // CTR block 4k+7 aese v0.16b, v20.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 orr x9, x11, x9, lsl #32 // CTR block 4k+7 aese v2.16b, v18.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 fmov v3.d[1], x9 // CTR block 4k+7 aese v1.16b, v20.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid aese v0.16b, v21.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 eor x22, x22, x14 // AES block 4k+2 - round N high aese v2.16b, v19.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 mov d10, v17.d[1] // GHASH block 4k - mid aese v1.16b, v21.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 rev64 v6.16b, v6.16b // 
GHASH block 4k+2 aese v3.16b, v18.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 eor x21, x21, x13 // AES block 4k+2 - round N low aese v2.16b, v20.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 stp x21, x22, [x2], #16 // AES block 4k+2 - store result pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high aese v2.16b, v21.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 rev64 v7.16b, v7.16b // GHASH block 4k+3 pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid eor x23, x23, x13 // AES block 4k+3 - round N low pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low eor x24, x24, x14 // AES block 4k+3 - round N high eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high aese v2.16b, v22.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 aese v3.16b, v19.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 mov d4, v5.d[1] // GHASH block 4k+1 - mid aese v0.16b, v22.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low aese v2.16b, v23.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 add w12, w12, #1 // CTR block 4k+7 aese v3.16b, v20.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 mov d8, v6.d[1] // GHASH block 4k+2 - mid aese v1.16b, v22.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low aese v3.16b, v21.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid aese v1.16b, v23.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 aese v0.16b, v23.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid rev w9, w12 // CTR block 4k+8 aese v1.16b, v24.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid aese v0.16b, v24.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 add w12, w12, #1 // CTR block 4k+8 aese v3.16b, v22.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 aese v1.16b, v25.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid aese v0.16b, v25.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high mov d6, v7.d[1] // GHASH block 4k+3 - mid aese v3.16b, v23.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid aese v0.16b, v26.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high aese v3.16b, v24.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low orr x9, x11, x9, lsl #32 // CTR block 4k+8 eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high cmp x17, #12 // setup flags for AES-128/192/256 check eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid aese v1.16b, v26.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 aese v2.16b, v24.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid movi v8.8b, #0xc2 aese v2.16b, v25.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low aese v3.16b, v25.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 shl d8, d8, #56 // mod_constant aese v2.16b, 
v26.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid aese v3.16b, v26.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 b.lt Ldec_main_loop_continue // branch if AES-128 aese v0.16b, v27.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 aese v2.16b, v27.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 aese v1.16b, v27.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 aese v3.16b, v27.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 aese v0.16b, v28.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 aese v1.16b, v28.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 aese v2.16b, v28.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 aese v3.16b, v28.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 b.eq Ldec_main_loop_continue // branch if AES-192 aese v0.16b, v29.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 aese v1.16b, v29.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 aese v2.16b, v29.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 aese v3.16b, v29.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 aese v0.16b, v30.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 aese v1.16b, v30.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 aese v2.16b, v30.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 aese v3.16b, v30.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 Ldec_main_loop_continue: pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up ldr q4, [x0, #0] // AES block 4k+4 - load ciphertext aese v0.16b, v31.16b // AES block 4k+4 - round N-1 ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up ldr q5, [x0, #16] // AES block 4k+5 - load ciphertext eor v0.16b, v4.16b, v0.16b // AES block 4k+4 - result stp x23, x24, [x2], #16 // AES block 4k+3 - store result eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid ldr q7, [x0, #48] // AES block 4k+7 - load ciphertext ldr q6, [x0, #32] // AES block 4k+6 - load ciphertext mov x7, v0.d[1] // AES block 4k+4 - mov high eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid aese v1.16b, v31.16b // AES block 4k+5 - round N-1 add x0, x0, #64 // AES input_ptr update mov x6, v0.d[0] // AES block 4k+4 - mov low fmov d0, x10 // CTR block 4k+8 fmov v0.d[1], x9 // CTR block 4k+8 pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low eor v1.16b, v5.16b, v1.16b // AES block 4k+5 - result rev w9, w12 // CTR block 4k+9 aese v2.16b, v31.16b // AES block 4k+6 - round N-1 orr x9, x11, x9, lsl #32 // CTR block 4k+9 cmp x0, x5 // LOOP CONTROL add w12, w12, #1 // CTR block 4k+9 eor x6, x6, x13 // AES block 4k+4 - round N low eor x7, x7, x14 // AES block 4k+4 - round N high mov x20, v1.d[1] // AES block 4k+5 - mov high eor v2.16b, v6.16b, v2.16b // AES block 4k+6 - result eor v11.16b, v11.16b, v8.16b // MODULO - fold into low mov x19, v1.d[0] // AES block 4k+5 - mov low fmov d1, x10 // CTR block 4k+9 ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment fmov v1.d[1], x9 // CTR block 4k+9 rev w9, w12 // CTR block 4k+10 add w12, w12, #1 // CTR block 4k+10 aese v3.16b, v31.16b // AES block 4k+7 - round N-1 orr x9, x11, x9, lsl #32 // CTR block 4k+10 rev64 v5.16b, v5.16b // GHASH block 4k+5 eor x20, x20, x14 // AES block 4k+5 - round N high stp x6, x7, [x2], #16 // AES block 4k+4 - store result eor x19, x19, x13 // AES block 4k+5 - round N low stp x19, x20, [x2], #16 // AES block 4k+5 - store 
result rev64 v4.16b, v4.16b // GHASH block 4k+4 eor v11.16b, v11.16b, v10.16b // MODULO - fold into low b.lt Ldec_main_loop Ldec_prepretail: // PREPRETAIL ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 mov x21, v2.d[0] // AES block 4k+2 - mov low eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result aese v0.16b, v18.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 mov x22, v2.d[1] // AES block 4k+2 - mov high aese v1.16b, v18.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 fmov d2, x10 // CTR block 4k+6 fmov v2.d[1], x9 // CTR block 4k+6 rev w9, w12 // CTR block 4k+7 eor v4.16b, v4.16b, v11.16b // PRE 1 rev64 v6.16b, v6.16b // GHASH block 4k+2 orr x9, x11, x9, lsl #32 // CTR block 4k+7 mov x23, v3.d[0] // AES block 4k+3 - mov low aese v1.16b, v19.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 mov x24, v3.d[1] // AES block 4k+3 - mov high pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low mov d8, v4.d[1] // GHASH block 4k - mid fmov d3, x10 // CTR block 4k+7 pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high fmov v3.d[1], x9 // CTR block 4k+7 aese v2.16b, v18.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 mov d10, v17.d[1] // GHASH block 4k - mid aese v0.16b, v19.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high aese v2.16b, v19.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 rev64 v7.16b, v7.16b // GHASH block 4k+3 aese v3.16b, v18.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low aese v3.16b, v19.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 mov d4, v5.d[1] // GHASH block 4k+1 - mid aese v0.16b, v20.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 aese v1.16b, v20.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low aese v2.16b, v20.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 aese v0.16b, v21.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 mov d8, v6.d[1] // GHASH block 4k+2 - mid aese v3.16b, v20.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low aese v0.16b, v22.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 aese v3.16b, v21.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid aese v0.16b, v23.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low aese v3.16b, v22.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high aese v3.16b, v23.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid aese v2.16b, v21.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 aese v1.16b, v21.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low aese v2.16b, v22.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 mov d6, v7.d[1] // GHASH block 4k+3 - mid aese v1.16b, v22.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 pmull2 v8.1q, v8.2d, v16.2d // GHASH block 
4k+2 - mid aese v2.16b, v23.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid aese v1.16b, v23.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 aese v3.16b, v24.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid aese v2.16b, v24.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 aese v0.16b, v24.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 movi v8.8b, #0xc2 aese v1.16b, v24.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid aese v3.16b, v25.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 cmp x17, #12 // setup flags for AES-128/192/256 check eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high aese v1.16b, v25.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 aese v0.16b, v25.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid aese v3.16b, v26.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 aese v2.16b, v25.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up aese v1.16b, v26.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 aese v0.16b, v26.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 shl d8, d8, #56 // mod_constant aese v2.16b, v26.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 b.lt Ldec_finish_prepretail // branch if AES-128 aese v1.16b, v27.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 aese v2.16b, v27.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 aese v3.16b, v27.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 aese v0.16b, v27.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 aese v2.16b, v28.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 aese v3.16b, v28.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 aese v0.16b, v28.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 aese v1.16b, v28.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 b.eq Ldec_finish_prepretail // branch if AES-192 aese v2.16b, v29.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 aese v0.16b, v29.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 aese v1.16b, v29.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 aese v2.16b, v30.16b aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 aese v3.16b, v29.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 aese v1.16b, v30.16b aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 aese v0.16b, v30.16b aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 aese v3.16b, v30.16b aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 Ldec_finish_prepretail: eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid eor x22, x22, x14 // AES block 4k+2 - round N high eor x23, x23, x13 // AES block 4k+3 - round N low eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid add w12, w12, #1 // CTR block 4k+7 eor x21, x21, x13 // AES block 4k+2 - round N low pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low eor x24, x24, x14 // AES block 4k+3 - round N high stp x21, x22, [x2], #16 // AES block 4k+2 - store result ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment stp x23, x24, [x2], #16 // AES block 4k+3 - store result eor v11.16b, v11.16b, v8.16b // MODULO - fold into 
low aese v1.16b, v31.16b // AES block 4k+5 - round N-1 aese v0.16b, v31.16b // AES block 4k+4 - round N-1 aese v3.16b, v31.16b // AES block 4k+7 - round N-1 aese v2.16b, v31.16b // AES block 4k+6 - round N-1 eor v11.16b, v11.16b, v10.16b // MODULO - fold into low Ldec_tail: // TAIL sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process ld1 { v5.16b}, [x0], #16 // AES block 4k+4 - load ciphertext eor v0.16b, v5.16b, v0.16b // AES block 4k+4 - result mov x6, v0.d[0] // AES block 4k+4 - mov low mov x7, v0.d[1] // AES block 4k+4 - mov high ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag cmp x5, #48 eor x6, x6, x13 // AES block 4k+4 - round N low eor x7, x7, x14 // AES block 4k+4 - round N high b.gt Ldec_blocks_more_than_3 sub w12, w12, #1 mov v3.16b, v2.16b movi v10.8b, #0 movi v11.8b, #0 cmp x5, #32 movi v9.8b, #0 mov v2.16b, v1.16b b.gt Ldec_blocks_more_than_2 sub w12, w12, #1 mov v3.16b, v1.16b cmp x5, #16 b.gt Ldec_blocks_more_than_1 sub w12, w12, #1 b Ldec_blocks_less_than_1 Ldec_blocks_more_than_3: // blocks left > 3 rev64 v4.16b, v5.16b // GHASH final-3 block ld1 { v5.16b}, [x0], #16 // AES final-2 block - load ciphertext stp x6, x7, [x2], #16 // AES final-3 block - store result mov d10, v17.d[1] // GHASH final-3 block - mid eor v4.16b, v4.16b, v8.16b // feed in partial tag eor v0.16b, v5.16b, v1.16b // AES final-2 block - result mov d22, v4.d[1] // GHASH final-3 block - mid mov x6, v0.d[0] // AES final-2 block - mov low mov x7, v0.d[1] // AES final-2 block - mov high eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid movi v8.8b, #0 // suppress further partial tag feed in pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid eor x6, x6, x13 // AES final-2 block - round N low pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low eor x7, x7, x14 // AES final-2 block - round N high Ldec_blocks_more_than_2: // blocks left > 2 rev64 v4.16b, v5.16b // GHASH final-2 block ld1 { v5.16b}, [x0], #16 // AES final-1 block - load ciphertext eor v4.16b, v4.16b, v8.16b // feed in partial tag stp x6, x7, [x2], #16 // AES final-2 block - store result eor v0.16b, v5.16b, v2.16b // AES final-1 block - result mov d22, v4.d[1] // GHASH final-2 block - mid pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid mov x6, v0.d[0] // AES final-1 block - mov low mov x7, v0.d[1] // AES final-1 block - mov high eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low movi v8.8b, #0 // suppress further partial tag feed in pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high eor x6, x6, x13 // AES final-1 block - round N low eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid eor x7, x7, x14 // AES final-1 block - round N high Ldec_blocks_more_than_1: // blocks left > 1 stp x6, x7, [x2], #16 // AES final-1 block - store result rev64 v4.16b, v5.16b // GHASH final-1 block ld1 { v5.16b}, [x0], #16 // AES final block - load ciphertext eor v4.16b, v4.16b, v8.16b // feed in partial tag movi v8.8b, #0 // suppress further partial tag feed in mov d22, v4.d[1] // GHASH final-1 block - mid eor v0.16b, v5.16b, v3.16b // AES final block - result pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low mov x6, v0.d[0] // AES final block 
// - mov low
	ins v22.d[1], v22.d[0]            // GHASH final-1 block - mid
	mov x7, v0.d[1]                   // AES final block - mov high
	pmull2 v22.1q, v22.2d, v16.2d     // GHASH final-1 block - mid
	eor x6, x6, x13                   // AES final block - round N low
	eor v11.16b, v11.16b, v21.16b     // GHASH final-1 block - low
	eor v9.16b, v9.16b, v20.16b       // GHASH final-1 block - high
	eor v10.16b, v10.16b, v22.16b     // GHASH final-1 block - mid
	eor x7, x7, x14                   // AES final block - round N high
Ldec_blocks_less_than_1:                  // blocks left <= 1
	and x1, x1, #127                  // bit_length %= 128
	mvn x14, xzr                      // rkN_h = 0xffffffffffffffff
	sub x1, x1, #128                  // bit_length -= 128
	mvn x13, xzr                      // rkN_l = 0xffffffffffffffff
	ldp x4, x5, [x2]                  // load existing bytes we need to not overwrite
	neg x1, x1                        // bit_length = 128 - #bits in input (in range [1,128])
	and x1, x1, #127                  // bit_length %= 128
	lsr x14, x14, x1                  // rkN_h is mask for top 64b of last block
	cmp x1, #64
	csel x9, x13, x14, lt
	csel x10, x14, xzr, lt
	fmov d0, x9                       // ctr0b is mask for last block
	and x6, x6, x9
	mov v0.d[1], x10
	bic x4, x4, x9                    // mask out low existing bytes
	rev w9, w12
	bic x5, x5, x10                   // mask out high existing bytes
	orr x6, x6, x4
	and x7, x7, x10
	orr x7, x7, x5
	and v5.16b, v5.16b, v0.16b        // possibly partial last block has zeroes in highest bits
	rev64 v4.16b, v5.16b              // GHASH final block
	eor v4.16b, v4.16b, v8.16b        // feed in partial tag
	pmull v21.1q, v4.1d, v12.1d       // GHASH final block - low
	mov d8, v4.d[1]                   // GHASH final block - mid
	eor v8.8b, v8.8b, v4.8b           // GHASH final block - mid
	pmull2 v20.1q, v4.2d, v12.2d      // GHASH final block - high
	pmull v8.1q, v8.1d, v16.1d        // GHASH final block - mid
	eor v9.16b, v9.16b, v20.16b       // GHASH final block - high
	eor v11.16b, v11.16b, v21.16b     // GHASH final block - low
	eor v10.16b, v10.16b, v8.16b      // GHASH final block - mid
	movi v8.8b, #0xc2
	eor v6.16b, v11.16b, v9.16b       // MODULO - karatsuba tidy up
	shl d8, d8, #56                   // mod_constant
	eor v10.16b, v10.16b, v6.16b      // MODULO - karatsuba tidy up
	pmull v7.1q, v9.1d, v8.1d         // MODULO - top 64b align with mid
	ext v9.16b, v9.16b, v9.16b, #8    // MODULO - other top alignment
	eor v10.16b, v10.16b, v7.16b      // MODULO - fold into mid
	eor v10.16b, v10.16b, v9.16b      // MODULO - fold into mid
	pmull v8.1q, v10.1d, v8.1d        // MODULO - mid 64b align with low
	ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment
	eor v11.16b, v11.16b, v8.16b      // MODULO - fold into low
	stp x6, x7, [x2]
	str w9, [x16, #12]                // store the updated counter
	eor v11.16b, v11.16b, v10.16b     // MODULO - fold into low
	ext v11.16b, v11.16b, v11.16b, #8
	rev64 v11.16b, v11.16b
	mov x0, x15
	st1 { v11.16b }, [x3]
	ldp x19, x20, [sp, #16]
	ldp x21, x22, [sp, #32]
	ldp x23, x24, [sp, #48]
	ldp d8, d9, [sp, #64]
	ldp d10, d11, [sp, #80]
	ldp d12, d13, [sp, #96]
	ldp d14, d15, [sp, #112]
	ldp x29, x30, [sp], #128
	AARCH64_VALIDATE_LINK_REGISTER
	ret
#endif
#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
ring-0.17.14/pregenerated/armv4-mont-linux32.S
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#include
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__)
@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
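@
@ Overview (informal summary; the generated code below is authoritative):
@ bn_mul_mont_nohw (arguments rp, ap, bp, np, &n0, num, per the register
@ comments in the code) performs a word-by-word Montgomery multiplication:
@ for each word bp[i] it accumulates ap[j]*bp[i] into the temporary tp[],
@ forms m = tp[0]*n0 (mod 2^32), adds m*np[] so that the low word cancels,
@ storing each sum one word down (tp[j-1]); the .Lsub/.Lcopy tail then does
@ the final conditional subtraction of np[]. The net effect is
@ a*b*R^(-1) mod n, with R = 2^(32*num).
@ bn_mul8x_mont_neon carries out the same reduction with NEON vmlal.u32
@ multiply-accumulates, consuming eight words of bp per outer iteration,
@ with a fully in-register special case for num == 8.
@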
.arch armv7-a .text #if defined(__thumb2__) .syntax unified .thumb #else .code 32 #endif .globl bn_mul_mont_nohw .hidden bn_mul_mont_nohw .type bn_mul_mont_nohw,%function .align 5 bn_mul_mont_nohw: ldr ip,[sp,#4] @ load num stmdb sp!,{r0,r2} @ sp points at argument block cmp ip,#2 mov r0,ip @ load num #ifdef __thumb2__ ittt lt #endif movlt r0,#0 addlt sp,sp,#2*4 blt .Labrt stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ save 10 registers mov r0,r0,lsl#2 @ rescale r0 for byte count sub sp,sp,r0 @ alloca(4*num) sub sp,sp,#4 @ +extra dword sub r0,r0,#4 @ "num=num-1" add r4,r2,r0 @ &bp[num-1] add r0,sp,r0 @ r0 to point at &tp[num-1] ldr r8,[r0,#14*4] @ &n0 ldr r2,[r2] @ bp[0] ldr r5,[r1],#4 @ ap[0],ap++ ldr r6,[r3],#4 @ np[0],np++ ldr r8,[r8] @ *n0 str r4,[r0,#15*4] @ save &bp[num] umull r10,r11,r5,r2 @ ap[0]*bp[0] str r8,[r0,#14*4] @ save n0 value mul r8,r10,r8 @ "tp[0]"*n0 mov r12,#0 umlal r10,r12,r6,r8 @ np[0]*n0+"t[0]" mov r4,sp .L1st: ldr r5,[r1],#4 @ ap[j],ap++ mov r10,r11 ldr r6,[r3],#4 @ np[j],np++ mov r11,#0 umlal r10,r11,r5,r2 @ ap[j]*bp[0] mov r14,#0 umlal r12,r14,r6,r8 @ np[j]*n0 adds r12,r12,r10 str r12,[r4],#4 @ tp[j-1]=,tp++ adc r12,r14,#0 cmp r4,r0 bne .L1st adds r12,r12,r11 ldr r4,[r0,#13*4] @ restore bp mov r14,#0 ldr r8,[r0,#14*4] @ restore n0 adc r14,r14,#0 str r12,[r0] @ tp[num-1]= mov r7,sp str r14,[r0,#4] @ tp[num]= .Louter: sub r7,r0,r7 @ "original" r0-1 value sub r1,r1,r7 @ "rewind" ap to &ap[1] ldr r2,[r4,#4]! @ *(++bp) sub r3,r3,r7 @ "rewind" np to &np[1] ldr r5,[r1,#-4] @ ap[0] ldr r10,[sp] @ tp[0] ldr r6,[r3,#-4] @ np[0] ldr r7,[sp,#4] @ tp[1] mov r11,#0 umlal r10,r11,r5,r2 @ ap[0]*bp[i]+tp[0] str r4,[r0,#13*4] @ save bp mul r8,r10,r8 mov r12,#0 umlal r10,r12,r6,r8 @ np[0]*n0+"tp[0]" mov r4,sp .Linner: ldr r5,[r1],#4 @ ap[j],ap++ adds r10,r11,r7 @ +=tp[j] ldr r6,[r3],#4 @ np[j],np++ mov r11,#0 umlal r10,r11,r5,r2 @ ap[j]*bp[i] mov r14,#0 umlal r12,r14,r6,r8 @ np[j]*n0 adc r11,r11,#0 ldr r7,[r4,#8] @ tp[j+1] adds r12,r12,r10 str r12,[r4],#4 @ tp[j-1]=,tp++ adc r12,r14,#0 cmp r4,r0 bne .Linner adds r12,r12,r11 mov r14,#0 ldr r4,[r0,#13*4] @ restore bp adc r14,r14,#0 ldr r8,[r0,#14*4] @ restore n0 adds r12,r12,r7 ldr r7,[r0,#15*4] @ restore &bp[num] adc r14,r14,#0 str r12,[r0] @ tp[num-1]= str r14,[r0,#4] @ tp[num]= cmp r4,r7 #ifdef __thumb2__ itt ne #endif movne r7,sp bne .Louter ldr r2,[r0,#12*4] @ pull rp mov r5,sp add r0,r0,#4 @ r0 to point at &tp[num] sub r5,r0,r5 @ "original" num value mov r4,sp @ "rewind" r4 mov r1,r4 @ "borrow" r1 sub r3,r3,r5 @ "rewind" r3 to &np[0] subs r7,r7,r7 @ "clear" carry flag .Lsub: ldr r7,[r4],#4 ldr r6,[r3],#4 sbcs r7,r7,r6 @ tp[j]-np[j] str r7,[r2],#4 @ rp[j]= teq r4,r0 @ preserve carry bne .Lsub sbcs r14,r14,#0 @ upmost carry mov r4,sp @ "rewind" r4 sub r2,r2,r5 @ "rewind" r2 .Lcopy: ldr r7,[r4] @ conditional copy ldr r5,[r2] str sp,[r4],#4 @ zap tp #ifdef __thumb2__ it cc #endif movcc r5,r7 str r5,[r2],#4 teq r4,r0 @ preserve carry bne .Lcopy mov sp,r0 add sp,sp,#4 @ skip over tp[num+1] ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ restore registers add sp,sp,#2*4 @ skip over {r0,r2} mov r0,#1 .Labrt: #if __ARM_ARCH>=5 bx lr @ bx lr #else tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet .word 0xe12fff1e @ interoperable with Thumb ISA:-) #endif .size bn_mul_mont_nohw,.-bn_mul_mont_nohw #if __ARM_MAX_ARCH__>=7 .arch armv7-a .fpu neon .globl bn_mul8x_mont_neon .hidden bn_mul8x_mont_neon .type bn_mul8x_mont_neon,%function .align 5 bn_mul8x_mont_neon: mov ip,sp stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} vstmdb 
sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so ldmia ip,{r4,r5} @ load rest of parameter block mov ip,sp cmp r5,#8 bhi .LNEON_8n @ special case for r5==8, everything is in register bank... vld1.32 {d28[0]}, [r2,:32]! veor d8,d8,d8 sub r7,sp,r5,lsl#4 vld1.32 {d0,d1,d2,d3}, [r1]! @ can't specify :32 :-( and r7,r7,#-64 vld1.32 {d30[0]}, [r4,:32] mov sp,r7 @ alloca vzip.16 d28,d8 vmull.u32 q6,d28,d0[0] vmull.u32 q7,d28,d0[1] vmull.u32 q8,d28,d1[0] vshl.i64 d29,d13,#16 vmull.u32 q9,d28,d1[1] vadd.u64 d29,d29,d12 veor d8,d8,d8 vmul.u32 d29,d29,d30 vmull.u32 q10,d28,d2[0] vld1.32 {d4,d5,d6,d7}, [r3]! vmull.u32 q11,d28,d2[1] vmull.u32 q12,d28,d3[0] vzip.16 d29,d8 vmull.u32 q13,d28,d3[1] vmlal.u32 q6,d29,d4[0] sub r9,r5,#1 vmlal.u32 q7,d29,d4[1] vmlal.u32 q8,d29,d5[0] vmlal.u32 q9,d29,d5[1] vmlal.u32 q10,d29,d6[0] vmov q5,q6 vmlal.u32 q11,d29,d6[1] vmov q6,q7 vmlal.u32 q12,d29,d7[0] vmov q7,q8 vmlal.u32 q13,d29,d7[1] vmov q8,q9 vmov q9,q10 vshr.u64 d10,d10,#16 vmov q10,q11 vmov q11,q12 vadd.u64 d10,d10,d11 vmov q12,q13 veor q13,q13 vshr.u64 d10,d10,#16 b .LNEON_outer8 .align 4 .LNEON_outer8: vld1.32 {d28[0]}, [r2,:32]! veor d8,d8,d8 vzip.16 d28,d8 vadd.u64 d12,d12,d10 vmlal.u32 q6,d28,d0[0] vmlal.u32 q7,d28,d0[1] vmlal.u32 q8,d28,d1[0] vshl.i64 d29,d13,#16 vmlal.u32 q9,d28,d1[1] vadd.u64 d29,d29,d12 veor d8,d8,d8 subs r9,r9,#1 vmul.u32 d29,d29,d30 vmlal.u32 q10,d28,d2[0] vmlal.u32 q11,d28,d2[1] vmlal.u32 q12,d28,d3[0] vzip.16 d29,d8 vmlal.u32 q13,d28,d3[1] vmlal.u32 q6,d29,d4[0] vmlal.u32 q7,d29,d4[1] vmlal.u32 q8,d29,d5[0] vmlal.u32 q9,d29,d5[1] vmlal.u32 q10,d29,d6[0] vmov q5,q6 vmlal.u32 q11,d29,d6[1] vmov q6,q7 vmlal.u32 q12,d29,d7[0] vmov q7,q8 vmlal.u32 q13,d29,d7[1] vmov q8,q9 vmov q9,q10 vshr.u64 d10,d10,#16 vmov q10,q11 vmov q11,q12 vadd.u64 d10,d10,d11 vmov q12,q13 veor q13,q13 vshr.u64 d10,d10,#16 bne .LNEON_outer8 vadd.u64 d12,d12,d10 mov r7,sp vshr.u64 d10,d12,#16 mov r8,r5 vadd.u64 d13,d13,d10 add r6,sp,#96 vshr.u64 d10,d13,#16 vzip.16 d12,d13 b .LNEON_tail_entry .align 4 .LNEON_8n: veor q6,q6,q6 sub r7,sp,#128 veor q7,q7,q7 sub r7,r7,r5,lsl#4 veor q8,q8,q8 and r7,r7,#-64 veor q9,q9,q9 mov sp,r7 @ alloca veor q10,q10,q10 add r7,r7,#256 veor q11,q11,q11 sub r8,r5,#8 veor q12,q12,q12 veor q13,q13,q13 .LNEON_8n_init: vst1.64 {q6,q7},[r7,:256]! subs r8,r8,#8 vst1.64 {q8,q9},[r7,:256]! vst1.64 {q10,q11},[r7,:256]! vst1.64 {q12,q13},[r7,:256]! bne .LNEON_8n_init add r6,sp,#256 vld1.32 {d0,d1,d2,d3},[r1]! add r10,sp,#8 vld1.32 {d30[0]},[r4,:32] mov r9,r5 b .LNEON_8n_outer .align 4 .LNEON_8n_outer: vld1.32 {d28[0]},[r2,:32]! @ *b++ veor d8,d8,d8 vzip.16 d28,d8 add r7,sp,#128 vld1.32 {d4,d5,d6,d7},[r3]! vmlal.u32 q6,d28,d0[0] vmlal.u32 q7,d28,d0[1] veor d8,d8,d8 vmlal.u32 q8,d28,d1[0] vshl.i64 d29,d13,#16 vmlal.u32 q9,d28,d1[1] vadd.u64 d29,d29,d12 vmlal.u32 q10,d28,d2[0] vmul.u32 d29,d29,d30 vmlal.u32 q11,d28,d2[1] vst1.32 {d28},[sp,:64] @ put aside smashed b[8*i+0] vmlal.u32 q12,d28,d3[0] vzip.16 d29,d8 vmlal.u32 q13,d28,d3[1] vld1.32 {d28[0]},[r2,:32]! @ *b++ vmlal.u32 q6,d29,d4[0] veor d10,d10,d10 vmlal.u32 q7,d29,d4[1] vzip.16 d28,d10 vmlal.u32 q8,d29,d5[0] vshr.u64 d12,d12,#16 vmlal.u32 q9,d29,d5[1] vmlal.u32 q10,d29,d6[0] vadd.u64 d12,d12,d13 vmlal.u32 q11,d29,d6[1] vshr.u64 d12,d12,#16 vmlal.u32 q12,d29,d7[0] vmlal.u32 q13,d29,d7[1] vadd.u64 d14,d14,d12 vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+0] vmlal.u32 q7,d28,d0[0] vld1.64 {q6},[r6,:128]! 
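@ From here the b[8*i+0] pattern above is repeated for b[8*i+1]..b[8*i+7]:
@ each step folds the next b word into the q6..q13 accumulators, forms the
@ Montgomery factor in d29 from the low accumulator lanes and n0 (kept in
@ d30), accumulates d29 times the modulus words d4..d7, carries the low
@ 16-bit lanes forward, and puts the "smashed" b and m words aside (at
@ [sp]/[r10]) for the inner loop.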
vmlal.u32 q8,d28,d0[1] veor d8,d8,d8 vmlal.u32 q9,d28,d1[0] vshl.i64 d29,d15,#16 vmlal.u32 q10,d28,d1[1] vadd.u64 d29,d29,d14 vmlal.u32 q11,d28,d2[0] vmul.u32 d29,d29,d30 vmlal.u32 q12,d28,d2[1] vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+1] vmlal.u32 q13,d28,d3[0] vzip.16 d29,d8 vmlal.u32 q6,d28,d3[1] vld1.32 {d28[0]},[r2,:32]! @ *b++ vmlal.u32 q7,d29,d4[0] veor d10,d10,d10 vmlal.u32 q8,d29,d4[1] vzip.16 d28,d10 vmlal.u32 q9,d29,d5[0] vshr.u64 d14,d14,#16 vmlal.u32 q10,d29,d5[1] vmlal.u32 q11,d29,d6[0] vadd.u64 d14,d14,d15 vmlal.u32 q12,d29,d6[1] vshr.u64 d14,d14,#16 vmlal.u32 q13,d29,d7[0] vmlal.u32 q6,d29,d7[1] vadd.u64 d16,d16,d14 vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+1] vmlal.u32 q8,d28,d0[0] vld1.64 {q7},[r6,:128]! vmlal.u32 q9,d28,d0[1] veor d8,d8,d8 vmlal.u32 q10,d28,d1[0] vshl.i64 d29,d17,#16 vmlal.u32 q11,d28,d1[1] vadd.u64 d29,d29,d16 vmlal.u32 q12,d28,d2[0] vmul.u32 d29,d29,d30 vmlal.u32 q13,d28,d2[1] vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+2] vmlal.u32 q6,d28,d3[0] vzip.16 d29,d8 vmlal.u32 q7,d28,d3[1] vld1.32 {d28[0]},[r2,:32]! @ *b++ vmlal.u32 q8,d29,d4[0] veor d10,d10,d10 vmlal.u32 q9,d29,d4[1] vzip.16 d28,d10 vmlal.u32 q10,d29,d5[0] vshr.u64 d16,d16,#16 vmlal.u32 q11,d29,d5[1] vmlal.u32 q12,d29,d6[0] vadd.u64 d16,d16,d17 vmlal.u32 q13,d29,d6[1] vshr.u64 d16,d16,#16 vmlal.u32 q6,d29,d7[0] vmlal.u32 q7,d29,d7[1] vadd.u64 d18,d18,d16 vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+2] vmlal.u32 q9,d28,d0[0] vld1.64 {q8},[r6,:128]! vmlal.u32 q10,d28,d0[1] veor d8,d8,d8 vmlal.u32 q11,d28,d1[0] vshl.i64 d29,d19,#16 vmlal.u32 q12,d28,d1[1] vadd.u64 d29,d29,d18 vmlal.u32 q13,d28,d2[0] vmul.u32 d29,d29,d30 vmlal.u32 q6,d28,d2[1] vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+3] vmlal.u32 q7,d28,d3[0] vzip.16 d29,d8 vmlal.u32 q8,d28,d3[1] vld1.32 {d28[0]},[r2,:32]! @ *b++ vmlal.u32 q9,d29,d4[0] veor d10,d10,d10 vmlal.u32 q10,d29,d4[1] vzip.16 d28,d10 vmlal.u32 q11,d29,d5[0] vshr.u64 d18,d18,#16 vmlal.u32 q12,d29,d5[1] vmlal.u32 q13,d29,d6[0] vadd.u64 d18,d18,d19 vmlal.u32 q6,d29,d6[1] vshr.u64 d18,d18,#16 vmlal.u32 q7,d29,d7[0] vmlal.u32 q8,d29,d7[1] vadd.u64 d20,d20,d18 vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+3] vmlal.u32 q10,d28,d0[0] vld1.64 {q9},[r6,:128]! vmlal.u32 q11,d28,d0[1] veor d8,d8,d8 vmlal.u32 q12,d28,d1[0] vshl.i64 d29,d21,#16 vmlal.u32 q13,d28,d1[1] vadd.u64 d29,d29,d20 vmlal.u32 q6,d28,d2[0] vmul.u32 d29,d29,d30 vmlal.u32 q7,d28,d2[1] vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+4] vmlal.u32 q8,d28,d3[0] vzip.16 d29,d8 vmlal.u32 q9,d28,d3[1] vld1.32 {d28[0]},[r2,:32]! @ *b++ vmlal.u32 q10,d29,d4[0] veor d10,d10,d10 vmlal.u32 q11,d29,d4[1] vzip.16 d28,d10 vmlal.u32 q12,d29,d5[0] vshr.u64 d20,d20,#16 vmlal.u32 q13,d29,d5[1] vmlal.u32 q6,d29,d6[0] vadd.u64 d20,d20,d21 vmlal.u32 q7,d29,d6[1] vshr.u64 d20,d20,#16 vmlal.u32 q8,d29,d7[0] vmlal.u32 q9,d29,d7[1] vadd.u64 d22,d22,d20 vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+4] vmlal.u32 q11,d28,d0[0] vld1.64 {q10},[r6,:128]! vmlal.u32 q12,d28,d0[1] veor d8,d8,d8 vmlal.u32 q13,d28,d1[0] vshl.i64 d29,d23,#16 vmlal.u32 q6,d28,d1[1] vadd.u64 d29,d29,d22 vmlal.u32 q7,d28,d2[0] vmul.u32 d29,d29,d30 vmlal.u32 q8,d28,d2[1] vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+5] vmlal.u32 q9,d28,d3[0] vzip.16 d29,d8 vmlal.u32 q10,d28,d3[1] vld1.32 {d28[0]},[r2,:32]! 
@ *b++ vmlal.u32 q11,d29,d4[0] veor d10,d10,d10 vmlal.u32 q12,d29,d4[1] vzip.16 d28,d10 vmlal.u32 q13,d29,d5[0] vshr.u64 d22,d22,#16 vmlal.u32 q6,d29,d5[1] vmlal.u32 q7,d29,d6[0] vadd.u64 d22,d22,d23 vmlal.u32 q8,d29,d6[1] vshr.u64 d22,d22,#16 vmlal.u32 q9,d29,d7[0] vmlal.u32 q10,d29,d7[1] vadd.u64 d24,d24,d22 vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+5] vmlal.u32 q12,d28,d0[0] vld1.64 {q11},[r6,:128]! vmlal.u32 q13,d28,d0[1] veor d8,d8,d8 vmlal.u32 q6,d28,d1[0] vshl.i64 d29,d25,#16 vmlal.u32 q7,d28,d1[1] vadd.u64 d29,d29,d24 vmlal.u32 q8,d28,d2[0] vmul.u32 d29,d29,d30 vmlal.u32 q9,d28,d2[1] vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+6] vmlal.u32 q10,d28,d3[0] vzip.16 d29,d8 vmlal.u32 q11,d28,d3[1] vld1.32 {d28[0]},[r2,:32]! @ *b++ vmlal.u32 q12,d29,d4[0] veor d10,d10,d10 vmlal.u32 q13,d29,d4[1] vzip.16 d28,d10 vmlal.u32 q6,d29,d5[0] vshr.u64 d24,d24,#16 vmlal.u32 q7,d29,d5[1] vmlal.u32 q8,d29,d6[0] vadd.u64 d24,d24,d25 vmlal.u32 q9,d29,d6[1] vshr.u64 d24,d24,#16 vmlal.u32 q10,d29,d7[0] vmlal.u32 q11,d29,d7[1] vadd.u64 d26,d26,d24 vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+6] vmlal.u32 q13,d28,d0[0] vld1.64 {q12},[r6,:128]! vmlal.u32 q6,d28,d0[1] veor d8,d8,d8 vmlal.u32 q7,d28,d1[0] vshl.i64 d29,d27,#16 vmlal.u32 q8,d28,d1[1] vadd.u64 d29,d29,d26 vmlal.u32 q9,d28,d2[0] vmul.u32 d29,d29,d30 vmlal.u32 q10,d28,d2[1] vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+7] vmlal.u32 q11,d28,d3[0] vzip.16 d29,d8 vmlal.u32 q12,d28,d3[1] vld1.32 {d28},[sp,:64] @ pull smashed b[8*i+0] vmlal.u32 q13,d29,d4[0] vld1.32 {d0,d1,d2,d3},[r1]! vmlal.u32 q6,d29,d4[1] vmlal.u32 q7,d29,d5[0] vshr.u64 d26,d26,#16 vmlal.u32 q8,d29,d5[1] vmlal.u32 q9,d29,d6[0] vadd.u64 d26,d26,d27 vmlal.u32 q10,d29,d6[1] vshr.u64 d26,d26,#16 vmlal.u32 q11,d29,d7[0] vmlal.u32 q12,d29,d7[1] vadd.u64 d12,d12,d26 vst1.32 {d29},[r10,:64] @ put aside smashed m[8*i+7] add r10,sp,#8 @ rewind sub r8,r5,#8 b .LNEON_8n_inner .align 4 .LNEON_8n_inner: subs r8,r8,#8 vmlal.u32 q6,d28,d0[0] vld1.64 {q13},[r6,:128] vmlal.u32 q7,d28,d0[1] vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+0] vmlal.u32 q8,d28,d1[0] vld1.32 {d4,d5,d6,d7},[r3]! vmlal.u32 q9,d28,d1[1] it ne addne r6,r6,#16 @ don't advance in last iteration vmlal.u32 q10,d28,d2[0] vmlal.u32 q11,d28,d2[1] vmlal.u32 q12,d28,d3[0] vmlal.u32 q13,d28,d3[1] vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+1] vmlal.u32 q6,d29,d4[0] vmlal.u32 q7,d29,d4[1] vmlal.u32 q8,d29,d5[0] vmlal.u32 q9,d29,d5[1] vmlal.u32 q10,d29,d6[0] vmlal.u32 q11,d29,d6[1] vmlal.u32 q12,d29,d7[0] vmlal.u32 q13,d29,d7[1] vst1.64 {q6},[r7,:128]! vmlal.u32 q7,d28,d0[0] vld1.64 {q6},[r6,:128] vmlal.u32 q8,d28,d0[1] vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+1] vmlal.u32 q9,d28,d1[0] it ne addne r6,r6,#16 @ don't advance in last iteration vmlal.u32 q10,d28,d1[1] vmlal.u32 q11,d28,d2[0] vmlal.u32 q12,d28,d2[1] vmlal.u32 q13,d28,d3[0] vmlal.u32 q6,d28,d3[1] vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+2] vmlal.u32 q7,d29,d4[0] vmlal.u32 q8,d29,d4[1] vmlal.u32 q9,d29,d5[0] vmlal.u32 q10,d29,d5[1] vmlal.u32 q11,d29,d6[0] vmlal.u32 q12,d29,d6[1] vmlal.u32 q13,d29,d7[0] vmlal.u32 q6,d29,d7[1] vst1.64 {q7},[r7,:128]! vmlal.u32 q8,d28,d0[0] vld1.64 {q7},[r6,:128] vmlal.u32 q9,d28,d0[1] vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+2] vmlal.u32 q10,d28,d1[0] it ne addne r6,r6,#16 @ don't advance in last iteration vmlal.u32 q11,d28,d1[1] vmlal.u32 q12,d28,d2[0] vmlal.u32 q13,d28,d2[1] vmlal.u32 q6,d28,d3[0] vmlal.u32 q7,d28,d3[1] vld1.32 {d28},[r10,:64]! 
@ pull smashed b[8*i+3] vmlal.u32 q8,d29,d4[0] vmlal.u32 q9,d29,d4[1] vmlal.u32 q10,d29,d5[0] vmlal.u32 q11,d29,d5[1] vmlal.u32 q12,d29,d6[0] vmlal.u32 q13,d29,d6[1] vmlal.u32 q6,d29,d7[0] vmlal.u32 q7,d29,d7[1] vst1.64 {q8},[r7,:128]! vmlal.u32 q9,d28,d0[0] vld1.64 {q8},[r6,:128] vmlal.u32 q10,d28,d0[1] vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+3] vmlal.u32 q11,d28,d1[0] it ne addne r6,r6,#16 @ don't advance in last iteration vmlal.u32 q12,d28,d1[1] vmlal.u32 q13,d28,d2[0] vmlal.u32 q6,d28,d2[1] vmlal.u32 q7,d28,d3[0] vmlal.u32 q8,d28,d3[1] vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+4] vmlal.u32 q9,d29,d4[0] vmlal.u32 q10,d29,d4[1] vmlal.u32 q11,d29,d5[0] vmlal.u32 q12,d29,d5[1] vmlal.u32 q13,d29,d6[0] vmlal.u32 q6,d29,d6[1] vmlal.u32 q7,d29,d7[0] vmlal.u32 q8,d29,d7[1] vst1.64 {q9},[r7,:128]! vmlal.u32 q10,d28,d0[0] vld1.64 {q9},[r6,:128] vmlal.u32 q11,d28,d0[1] vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+4] vmlal.u32 q12,d28,d1[0] it ne addne r6,r6,#16 @ don't advance in last iteration vmlal.u32 q13,d28,d1[1] vmlal.u32 q6,d28,d2[0] vmlal.u32 q7,d28,d2[1] vmlal.u32 q8,d28,d3[0] vmlal.u32 q9,d28,d3[1] vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+5] vmlal.u32 q10,d29,d4[0] vmlal.u32 q11,d29,d4[1] vmlal.u32 q12,d29,d5[0] vmlal.u32 q13,d29,d5[1] vmlal.u32 q6,d29,d6[0] vmlal.u32 q7,d29,d6[1] vmlal.u32 q8,d29,d7[0] vmlal.u32 q9,d29,d7[1] vst1.64 {q10},[r7,:128]! vmlal.u32 q11,d28,d0[0] vld1.64 {q10},[r6,:128] vmlal.u32 q12,d28,d0[1] vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+5] vmlal.u32 q13,d28,d1[0] it ne addne r6,r6,#16 @ don't advance in last iteration vmlal.u32 q6,d28,d1[1] vmlal.u32 q7,d28,d2[0] vmlal.u32 q8,d28,d2[1] vmlal.u32 q9,d28,d3[0] vmlal.u32 q10,d28,d3[1] vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+6] vmlal.u32 q11,d29,d4[0] vmlal.u32 q12,d29,d4[1] vmlal.u32 q13,d29,d5[0] vmlal.u32 q6,d29,d5[1] vmlal.u32 q7,d29,d6[0] vmlal.u32 q8,d29,d6[1] vmlal.u32 q9,d29,d7[0] vmlal.u32 q10,d29,d7[1] vst1.64 {q11},[r7,:128]! vmlal.u32 q12,d28,d0[0] vld1.64 {q11},[r6,:128] vmlal.u32 q13,d28,d0[1] vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+6] vmlal.u32 q6,d28,d1[0] it ne addne r6,r6,#16 @ don't advance in last iteration vmlal.u32 q7,d28,d1[1] vmlal.u32 q8,d28,d2[0] vmlal.u32 q9,d28,d2[1] vmlal.u32 q10,d28,d3[0] vmlal.u32 q11,d28,d3[1] vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+7] vmlal.u32 q12,d29,d4[0] vmlal.u32 q13,d29,d4[1] vmlal.u32 q6,d29,d5[0] vmlal.u32 q7,d29,d5[1] vmlal.u32 q8,d29,d6[0] vmlal.u32 q9,d29,d6[1] vmlal.u32 q10,d29,d7[0] vmlal.u32 q11,d29,d7[1] vst1.64 {q12},[r7,:128]! vmlal.u32 q13,d28,d0[0] vld1.64 {q12},[r6,:128] vmlal.u32 q6,d28,d0[1] vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+7] vmlal.u32 q7,d28,d1[0] it ne addne r6,r6,#16 @ don't advance in last iteration vmlal.u32 q8,d28,d1[1] vmlal.u32 q9,d28,d2[0] vmlal.u32 q10,d28,d2[1] vmlal.u32 q11,d28,d3[0] vmlal.u32 q12,d28,d3[1] it eq subeq r1,r1,r5,lsl#2 @ rewind vmlal.u32 q13,d29,d4[0] vld1.32 {d28},[sp,:64] @ pull smashed b[8*i+0] vmlal.u32 q6,d29,d4[1] vld1.32 {d0,d1,d2,d3},[r1]! vmlal.u32 q7,d29,d5[0] add r10,sp,#8 @ rewind vmlal.u32 q8,d29,d5[1] vmlal.u32 q9,d29,d6[0] vmlal.u32 q10,d29,d6[1] vmlal.u32 q11,d29,d7[0] vst1.64 {q13},[r7,:128]! vmlal.u32 q12,d29,d7[1] bne .LNEON_8n_inner add r6,sp,#128 vst1.64 {q6,q7},[r7,:256]! veor q2,q2,q2 @ d4-d5 vst1.64 {q8,q9},[r7,:256]! veor q3,q3,q3 @ d6-d7 vst1.64 {q10,q11},[r7,:256]! vst1.64 {q12},[r7,:128] subs r9,r9,#8 vld1.64 {q6,q7},[r6,:256]! vld1.64 {q8,q9},[r6,:256]! vld1.64 {q10,q11},[r6,:256]! vld1.64 {q12,q13},[r6,:256]! 
itt ne subne r3,r3,r5,lsl#2 @ rewind bne .LNEON_8n_outer add r7,sp,#128 vst1.64 {q2,q3}, [sp,:256]! @ start wiping stack frame vshr.u64 d10,d12,#16 vst1.64 {q2,q3},[sp,:256]! vadd.u64 d13,d13,d10 vst1.64 {q2,q3}, [sp,:256]! vshr.u64 d10,d13,#16 vst1.64 {q2,q3}, [sp,:256]! vzip.16 d12,d13 mov r8,r5 b .LNEON_tail_entry .align 4 .LNEON_tail: vadd.u64 d12,d12,d10 vshr.u64 d10,d12,#16 vld1.64 {q8,q9}, [r6, :256]! vadd.u64 d13,d13,d10 vld1.64 {q10,q11}, [r6, :256]! vshr.u64 d10,d13,#16 vld1.64 {q12,q13}, [r6, :256]! vzip.16 d12,d13 .LNEON_tail_entry: vadd.u64 d14,d14,d10 vst1.32 {d12[0]}, [r7, :32]! vshr.u64 d10,d14,#16 vadd.u64 d15,d15,d10 vshr.u64 d10,d15,#16 vzip.16 d14,d15 vadd.u64 d16,d16,d10 vst1.32 {d14[0]}, [r7, :32]! vshr.u64 d10,d16,#16 vadd.u64 d17,d17,d10 vshr.u64 d10,d17,#16 vzip.16 d16,d17 vadd.u64 d18,d18,d10 vst1.32 {d16[0]}, [r7, :32]! vshr.u64 d10,d18,#16 vadd.u64 d19,d19,d10 vshr.u64 d10,d19,#16 vzip.16 d18,d19 vadd.u64 d20,d20,d10 vst1.32 {d18[0]}, [r7, :32]! vshr.u64 d10,d20,#16 vadd.u64 d21,d21,d10 vshr.u64 d10,d21,#16 vzip.16 d20,d21 vadd.u64 d22,d22,d10 vst1.32 {d20[0]}, [r7, :32]! vshr.u64 d10,d22,#16 vadd.u64 d23,d23,d10 vshr.u64 d10,d23,#16 vzip.16 d22,d23 vadd.u64 d24,d24,d10 vst1.32 {d22[0]}, [r7, :32]! vshr.u64 d10,d24,#16 vadd.u64 d25,d25,d10 vshr.u64 d10,d25,#16 vzip.16 d24,d25 vadd.u64 d26,d26,d10 vst1.32 {d24[0]}, [r7, :32]! vshr.u64 d10,d26,#16 vadd.u64 d27,d27,d10 vshr.u64 d10,d27,#16 vzip.16 d26,d27 vld1.64 {q6,q7}, [r6, :256]! subs r8,r8,#8 vst1.32 {d26[0]}, [r7, :32]! bne .LNEON_tail vst1.32 {d10[0]}, [r7, :32] @ top-most bit sub r3,r3,r5,lsl#2 @ rewind r3 subs r1,sp,#0 @ clear carry flag add r2,sp,r5,lsl#2 .LNEON_sub: ldmia r1!, {r4,r5,r6,r7} ldmia r3!, {r8,r9,r10,r11} sbcs r8, r4,r8 sbcs r9, r5,r9 sbcs r10,r6,r10 sbcs r11,r7,r11 teq r1,r2 @ preserves carry stmia r0!, {r8,r9,r10,r11} bne .LNEON_sub ldr r10, [r1] @ load top-most bit mov r11,sp veor q0,q0,q0 sub r11,r2,r11 @ this is num*4 veor q1,q1,q1 mov r1,sp sub r0,r0,r11 @ rewind r0 mov r3,r2 @ second 3/4th of frame sbcs r10,r10,#0 @ result is carry flag .LNEON_copy_n_zap: ldmia r1!, {r4,r5,r6,r7} ldmia r0, {r8,r9,r10,r11} it cc movcc r8, r4 vst1.64 {q0,q1}, [r3,:256]! @ wipe itt cc movcc r9, r5 movcc r10,r6 vst1.64 {q0,q1}, [r3,:256]! @ wipe it cc movcc r11,r7 ldmia r1, {r4,r5,r6,r7} stmia r0!, {r8,r9,r10,r11} sub r1,r1,#16 ldmia r0, {r8,r9,r10,r11} it cc movcc r8, r4 vst1.64 {q0,q1}, [r1,:256]! @ wipe itt cc movcc r9, r5 movcc r10,r6 vst1.64 {q0,q1}, [r3,:256]! @ wipe it cc movcc r11,r7 teq r1,r2 @ preserves carry stmia r0!, {r8,r9,r10,r11} bne .LNEON_copy_n_zap mov sp,ip vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11} bx lr @ bx lr .size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon #endif .byte 77,111,110,116,103,111,109,101,114,121,32,109,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) ring-0.17.14/pregenerated/armv8-mont-ios64.S000064400000000000000000000736611046102023000165150ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
#include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) .text .globl _bn_mul_mont_nohw .private_extern _bn_mul_mont_nohw .align 5 _bn_mul_mont_nohw: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-64]! add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] ldr x9,[x2],#8 // bp[0] sub x22,sp,x5,lsl#3 ldp x7,x8,[x1],#16 // ap[0..1] lsl x5,x5,#3 ldr x4,[x4] // *n0 and x22,x22,#-16 // ABI says so ldp x13,x14,[x3],#16 // np[0..1] mul x6,x7,x9 // ap[0]*bp[0] sub x21,x5,#16 // j=num-2 umulh x7,x7,x9 mul x10,x8,x9 // ap[1]*bp[0] umulh x11,x8,x9 mul x15,x6,x4 // "tp[0]"*n0 mov sp,x22 // alloca // (*) mul x12,x13,x15 // np[0]*m1 umulh x13,x13,x15 mul x16,x14,x15 // np[1]*m1 // (*) adds x12,x12,x6 // discarded // (*) As for removal of first multiplication and addition // instructions. The outcome of first addition is // guaranteed to be zero, which leaves two computationally // significant outcomes: it either carries or not. Then // question is when does it carry? Is there alternative // way to deduce it? If you follow operations, you can // observe that condition for carry is quite simple: // x6 being non-zero. So that carry can be calculated // by adding -1 to x6. That's what next instruction does. subs xzr,x6,#1 // (*) umulh x17,x14,x15 adc x13,x13,xzr cbz x21,L1st_skip L1st: ldr x8,[x1],#8 adds x6,x10,x7 sub x21,x21,#8 // j-- adc x7,x11,xzr ldr x14,[x3],#8 adds x12,x16,x13 mul x10,x8,x9 // ap[j]*bp[0] adc x13,x17,xzr umulh x11,x8,x9 adds x12,x12,x6 mul x16,x14,x15 // np[j]*m1 adc x13,x13,xzr umulh x17,x14,x15 str x12,[x22],#8 // tp[j-1] cbnz x21,L1st L1st_skip: adds x6,x10,x7 sub x1,x1,x5 // rewind x1 adc x7,x11,xzr adds x12,x16,x13 sub x3,x3,x5 // rewind x3 adc x13,x17,xzr adds x12,x12,x6 sub x20,x5,#8 // i=num-1 adcs x13,x13,x7 adc x19,xzr,xzr // upmost overflow bit stp x12,x13,[x22] Louter: ldr x9,[x2],#8 // bp[i] ldp x7,x8,[x1],#16 ldr x23,[sp] // tp[0] add x22,sp,#8 mul x6,x7,x9 // ap[0]*bp[i] sub x21,x5,#16 // j=num-2 umulh x7,x7,x9 ldp x13,x14,[x3],#16 mul x10,x8,x9 // ap[1]*bp[i] adds x6,x6,x23 umulh x11,x8,x9 adc x7,x7,xzr mul x15,x6,x4 sub x20,x20,#8 // i-- // (*) mul x12,x13,x15 // np[0]*m1 umulh x13,x13,x15 mul x16,x14,x15 // np[1]*m1 // (*) adds x12,x12,x6 subs xzr,x6,#1 // (*) umulh x17,x14,x15 cbz x21,Linner_skip Linner: ldr x8,[x1],#8 adc x13,x13,xzr ldr x23,[x22],#8 // tp[j] adds x6,x10,x7 sub x21,x21,#8 // j-- adc x7,x11,xzr adds x12,x16,x13 ldr x14,[x3],#8 adc x13,x17,xzr mul x10,x8,x9 // ap[j]*bp[i] adds x6,x6,x23 umulh x11,x8,x9 adc x7,x7,xzr mul x16,x14,x15 // np[j]*m1 adds x12,x12,x6 umulh x17,x14,x15 str x12,[x22,#-16] // tp[j-1] cbnz x21,Linner Linner_skip: ldr x23,[x22],#8 // tp[j] adc x13,x13,xzr adds x6,x10,x7 sub x1,x1,x5 // rewind x1 adc x7,x11,xzr adds x12,x16,x13 sub x3,x3,x5 // rewind x3 adcs x13,x17,x19 adc x19,xzr,xzr adds x6,x6,x23 adc x7,x7,xzr adds x12,x12,x6 adcs x13,x13,x7 adc x19,x19,xzr // upmost overflow bit stp x12,x13,[x22,#-16] cbnz x20,Louter // Final step. We see if result is larger than modulus, and // if it is, subtract the modulus. But comparison implies // subtraction. So we subtract modulus, see if it borrowed, // and conditionally copy original value. ldr x23,[sp] // tp[0] add x22,sp,#8 ldr x14,[x3],#8 // np[0] subs x21,x5,#8 // j=num-1 and clear borrow mov x1,x0 Lsub: sbcs x8,x23,x14 // tp[j]-np[j] ldr x23,[x22],#8 sub x21,x21,#8 // j-- ldr x14,[x3],#8 str x8,[x1],#8 // rp[j]=tp[j]-np[j] cbnz x21,Lsub sbcs x8,x23,x14 sbcs x19,x19,xzr // did it borrow? 
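//
// Editorial note (an assumption based on reading the code, not generator
// output): the sbcs chain above subtracts np from tp word by word, and the
// final "sbcs x19,x19,xzr" folds the accumulated top overflow bit into the
// borrow. The conditional-copy loop below then uses the resulting carry
// flag ("lo" = borrow occurred = tp < np) to keep the original tp words,
// and otherwise keeps the reduced tp - np, without a data-dependent branch.
//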
str x8,[x1],#8 // rp[num-1] ldr x23,[sp] // tp[0] add x22,sp,#8 ldr x8,[x0],#8 // rp[0] sub x5,x5,#8 // num-- nop Lcond_copy: sub x5,x5,#8 // num-- csel x14,x23,x8,lo // did it borrow? ldr x23,[x22],#8 ldr x8,[x0],#8 str xzr,[x22,#-16] // wipe tp str x14,[x0,#-16] cbnz x5,Lcond_copy csel x14,x23,x8,lo str xzr,[x22,#-8] // wipe tp str x14,[x0,#-8] ldp x19,x20,[x29,#16] mov sp,x29 ldp x21,x22,[x29,#32] mov x0,#1 ldp x23,x24,[x29,#48] ldr x29,[sp],#64 AARCH64_VALIDATE_LINK_REGISTER ret .globl _bn_sqr8x_mont .private_extern _bn_sqr8x_mont .align 5 _bn_sqr8x_mont: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-128]! add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] stp x0,x3,[sp,#96] // offload rp and np ldp x6,x7,[x1,#8*0] ldp x8,x9,[x1,#8*2] ldp x10,x11,[x1,#8*4] ldp x12,x13,[x1,#8*6] sub x2,sp,x5,lsl#4 lsl x5,x5,#3 ldr x4,[x4] // *n0 mov sp,x2 // alloca sub x27,x5,#8*8 b Lsqr8x_zero_start Lsqr8x_zero: sub x27,x27,#8*8 stp xzr,xzr,[x2,#8*0] stp xzr,xzr,[x2,#8*2] stp xzr,xzr,[x2,#8*4] stp xzr,xzr,[x2,#8*6] Lsqr8x_zero_start: stp xzr,xzr,[x2,#8*8] stp xzr,xzr,[x2,#8*10] stp xzr,xzr,[x2,#8*12] stp xzr,xzr,[x2,#8*14] add x2,x2,#8*16 cbnz x27,Lsqr8x_zero add x3,x1,x5 add x1,x1,#8*8 mov x19,xzr mov x20,xzr mov x21,xzr mov x22,xzr mov x23,xzr mov x24,xzr mov x25,xzr mov x26,xzr mov x2,sp str x4,[x29,#112] // offload n0 // Multiply everything but a[i]*a[i] .align 4 Lsqr8x_outer_loop: // a[1]a[0] (i) // a[2]a[0] // a[3]a[0] // a[4]a[0] // a[5]a[0] // a[6]a[0] // a[7]a[0] // a[2]a[1] (ii) // a[3]a[1] // a[4]a[1] // a[5]a[1] // a[6]a[1] // a[7]a[1] // a[3]a[2] (iii) // a[4]a[2] // a[5]a[2] // a[6]a[2] // a[7]a[2] // a[4]a[3] (iv) // a[5]a[3] // a[6]a[3] // a[7]a[3] // a[5]a[4] (v) // a[6]a[4] // a[7]a[4] // a[6]a[5] (vi) // a[7]a[5] // a[7]a[6] (vii) mul x14,x7,x6 // lo(a[1..7]*a[0]) (i) mul x15,x8,x6 mul x16,x9,x6 mul x17,x10,x6 adds x20,x20,x14 // t[1]+lo(a[1]*a[0]) mul x14,x11,x6 adcs x21,x21,x15 mul x15,x12,x6 adcs x22,x22,x16 mul x16,x13,x6 adcs x23,x23,x17 umulh x17,x7,x6 // hi(a[1..7]*a[0]) adcs x24,x24,x14 umulh x14,x8,x6 adcs x25,x25,x15 umulh x15,x9,x6 adcs x26,x26,x16 umulh x16,x10,x6 stp x19,x20,[x2],#8*2 // t[0..1] adc x19,xzr,xzr // t[8] adds x21,x21,x17 // t[2]+lo(a[1]*a[0]) umulh x17,x11,x6 adcs x22,x22,x14 umulh x14,x12,x6 adcs x23,x23,x15 umulh x15,x13,x6 adcs x24,x24,x16 mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii) adcs x25,x25,x17 mul x17,x9,x7 adcs x26,x26,x14 mul x14,x10,x7 adc x19,x19,x15 mul x15,x11,x7 adds x22,x22,x16 mul x16,x12,x7 adcs x23,x23,x17 mul x17,x13,x7 adcs x24,x24,x14 umulh x14,x8,x7 // hi(a[2..7]*a[1]) adcs x25,x25,x15 umulh x15,x9,x7 adcs x26,x26,x16 umulh x16,x10,x7 adcs x19,x19,x17 umulh x17,x11,x7 stp x21,x22,[x2],#8*2 // t[2..3] adc x20,xzr,xzr // t[9] adds x23,x23,x14 umulh x14,x12,x7 adcs x24,x24,x15 umulh x15,x13,x7 adcs x25,x25,x16 mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii) adcs x26,x26,x17 mul x17,x10,x8 adcs x19,x19,x14 mul x14,x11,x8 adc x20,x20,x15 mul x15,x12,x8 adds x24,x24,x16 mul x16,x13,x8 adcs x25,x25,x17 umulh x17,x9,x8 // hi(a[3..7]*a[2]) adcs x26,x26,x14 umulh x14,x10,x8 adcs x19,x19,x15 umulh x15,x11,x8 adcs x20,x20,x16 umulh x16,x12,x8 stp x23,x24,[x2],#8*2 // t[4..5] adc x21,xzr,xzr // t[10] adds x25,x25,x17 umulh x17,x13,x8 adcs x26,x26,x14 mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv) adcs x19,x19,x15 mul x15,x11,x9 adcs x20,x20,x16 mul x16,x12,x9 adc x21,x21,x17 mul x17,x13,x9 adds x26,x26,x14 umulh x14,x10,x9 // hi(a[4..7]*a[3]) adcs x19,x19,x15 umulh x15,x11,x9 adcs x20,x20,x16 umulh 
x16,x12,x9 adcs x21,x21,x17 umulh x17,x13,x9 stp x25,x26,[x2],#8*2 // t[6..7] adc x22,xzr,xzr // t[11] adds x19,x19,x14 mul x14,x11,x10 // lo(a[5..7]*a[4]) (v) adcs x20,x20,x15 mul x15,x12,x10 adcs x21,x21,x16 mul x16,x13,x10 adc x22,x22,x17 umulh x17,x11,x10 // hi(a[5..7]*a[4]) adds x20,x20,x14 umulh x14,x12,x10 adcs x21,x21,x15 umulh x15,x13,x10 adcs x22,x22,x16 mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi) adc x23,xzr,xzr // t[12] adds x21,x21,x17 mul x17,x13,x11 adcs x22,x22,x14 umulh x14,x12,x11 // hi(a[6..7]*a[5]) adc x23,x23,x15 umulh x15,x13,x11 adds x22,x22,x16 mul x16,x13,x12 // lo(a[7]*a[6]) (vii) adcs x23,x23,x17 umulh x17,x13,x12 // hi(a[7]*a[6]) adc x24,xzr,xzr // t[13] adds x23,x23,x14 sub x27,x3,x1 // done yet? adc x24,x24,x15 adds x24,x24,x16 sub x14,x3,x5 // rewinded ap adc x25,xzr,xzr // t[14] add x25,x25,x17 cbz x27,Lsqr8x_outer_break mov x4,x6 ldp x6,x7,[x2,#8*0] ldp x8,x9,[x2,#8*2] ldp x10,x11,[x2,#8*4] ldp x12,x13,[x2,#8*6] adds x19,x19,x6 adcs x20,x20,x7 ldp x6,x7,[x1,#8*0] adcs x21,x21,x8 adcs x22,x22,x9 ldp x8,x9,[x1,#8*2] adcs x23,x23,x10 adcs x24,x24,x11 ldp x10,x11,[x1,#8*4] adcs x25,x25,x12 mov x0,x1 adcs x26,xzr,x13 ldp x12,x13,[x1,#8*6] add x1,x1,#8*8 //adc x28,xzr,xzr // moved below mov x27,#-8*8 // a[8]a[0] // a[9]a[0] // a[a]a[0] // a[b]a[0] // a[c]a[0] // a[d]a[0] // a[e]a[0] // a[f]a[0] // a[8]a[1] // a[f]a[1]........................ // a[8]a[2] // a[f]a[2]........................ // a[8]a[3] // a[f]a[3]........................ // a[8]a[4] // a[f]a[4]........................ // a[8]a[5] // a[f]a[5]........................ // a[8]a[6] // a[f]a[6]........................ // a[8]a[7] // a[f]a[7]........................ Lsqr8x_mul: mul x14,x6,x4 adc x28,xzr,xzr // carry bit, modulo-scheduled mul x15,x7,x4 add x27,x27,#8 mul x16,x8,x4 mul x17,x9,x4 adds x19,x19,x14 mul x14,x10,x4 adcs x20,x20,x15 mul x15,x11,x4 adcs x21,x21,x16 mul x16,x12,x4 adcs x22,x22,x17 mul x17,x13,x4 adcs x23,x23,x14 umulh x14,x6,x4 adcs x24,x24,x15 umulh x15,x7,x4 adcs x25,x25,x16 umulh x16,x8,x4 adcs x26,x26,x17 umulh x17,x9,x4 adc x28,x28,xzr str x19,[x2],#8 adds x19,x20,x14 umulh x14,x10,x4 adcs x20,x21,x15 umulh x15,x11,x4 adcs x21,x22,x16 umulh x16,x12,x4 adcs x22,x23,x17 umulh x17,x13,x4 ldr x4,[x0,x27] adcs x23,x24,x14 adcs x24,x25,x15 adcs x25,x26,x16 adcs x26,x28,x17 //adc x28,xzr,xzr // moved above cbnz x27,Lsqr8x_mul // note that carry flag is guaranteed // to be zero at this point cmp x1,x3 // done yet? b.eq Lsqr8x_break ldp x6,x7,[x2,#8*0] ldp x8,x9,[x2,#8*2] ldp x10,x11,[x2,#8*4] ldp x12,x13,[x2,#8*6] adds x19,x19,x6 ldr x4,[x0,#-8*8] adcs x20,x20,x7 ldp x6,x7,[x1,#8*0] adcs x21,x21,x8 adcs x22,x22,x9 ldp x8,x9,[x1,#8*2] adcs x23,x23,x10 adcs x24,x24,x11 ldp x10,x11,[x1,#8*4] adcs x25,x25,x12 mov x27,#-8*8 adcs x26,x26,x13 ldp x12,x13,[x1,#8*6] add x1,x1,#8*8 //adc x28,xzr,xzr // moved above b Lsqr8x_mul .align 4 Lsqr8x_break: ldp x6,x7,[x0,#8*0] add x1,x0,#8*8 ldp x8,x9,[x0,#8*2] sub x14,x3,x1 // is it last iteration? 
ldp x10,x11,[x0,#8*4] sub x15,x2,x14 ldp x12,x13,[x0,#8*6] cbz x14,Lsqr8x_outer_loop stp x19,x20,[x2,#8*0] ldp x19,x20,[x15,#8*0] stp x21,x22,[x2,#8*2] ldp x21,x22,[x15,#8*2] stp x23,x24,[x2,#8*4] ldp x23,x24,[x15,#8*4] stp x25,x26,[x2,#8*6] mov x2,x15 ldp x25,x26,[x15,#8*6] b Lsqr8x_outer_loop .align 4 Lsqr8x_outer_break: // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0] ldp x15,x16,[sp,#8*1] ldp x11,x13,[x14,#8*2] add x1,x14,#8*4 ldp x17,x14,[sp,#8*3] stp x19,x20,[x2,#8*0] mul x19,x7,x7 stp x21,x22,[x2,#8*2] umulh x7,x7,x7 stp x23,x24,[x2,#8*4] mul x8,x9,x9 stp x25,x26,[x2,#8*6] mov x2,sp umulh x9,x9,x9 adds x20,x7,x15,lsl#1 extr x15,x16,x15,#63 sub x27,x5,#8*4 Lsqr4x_shift_n_add: adcs x21,x8,x15 extr x16,x17,x16,#63 sub x27,x27,#8*4 adcs x22,x9,x16 ldp x15,x16,[x2,#8*5] mul x10,x11,x11 ldp x7,x9,[x1],#8*2 umulh x11,x11,x11 mul x12,x13,x13 umulh x13,x13,x13 extr x17,x14,x17,#63 stp x19,x20,[x2,#8*0] adcs x23,x10,x17 extr x14,x15,x14,#63 stp x21,x22,[x2,#8*2] adcs x24,x11,x14 ldp x17,x14,[x2,#8*7] extr x15,x16,x15,#63 adcs x25,x12,x15 extr x16,x17,x16,#63 adcs x26,x13,x16 ldp x15,x16,[x2,#8*9] mul x6,x7,x7 ldp x11,x13,[x1],#8*2 umulh x7,x7,x7 mul x8,x9,x9 umulh x9,x9,x9 stp x23,x24,[x2,#8*4] extr x17,x14,x17,#63 stp x25,x26,[x2,#8*6] add x2,x2,#8*8 adcs x19,x6,x17 extr x14,x15,x14,#63 adcs x20,x7,x14 ldp x17,x14,[x2,#8*3] extr x15,x16,x15,#63 cbnz x27,Lsqr4x_shift_n_add ldp x1,x4,[x29,#104] // pull np and n0 adcs x21,x8,x15 extr x16,x17,x16,#63 adcs x22,x9,x16 ldp x15,x16,[x2,#8*5] mul x10,x11,x11 umulh x11,x11,x11 stp x19,x20,[x2,#8*0] mul x12,x13,x13 umulh x13,x13,x13 stp x21,x22,[x2,#8*2] extr x17,x14,x17,#63 adcs x23,x10,x17 extr x14,x15,x14,#63 ldp x19,x20,[sp,#8*0] adcs x24,x11,x14 extr x15,x16,x15,#63 ldp x6,x7,[x1,#8*0] adcs x25,x12,x15 extr x16,xzr,x16,#63 ldp x8,x9,[x1,#8*2] adc x26,x13,x16 ldp x10,x11,[x1,#8*4] // Reduce by 512 bits per iteration mul x28,x4,x19 // t[0]*n0 ldp x12,x13,[x1,#8*6] add x3,x1,x5 ldp x21,x22,[sp,#8*2] stp x23,x24,[x2,#8*4] ldp x23,x24,[sp,#8*4] stp x25,x26,[x2,#8*6] ldp x25,x26,[sp,#8*6] add x1,x1,#8*8 mov x30,xzr // initial top-most carry mov x2,sp mov x27,#8 Lsqr8x_reduction: // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0) mul x15,x7,x28 sub x27,x27,#1 mul x16,x8,x28 str x28,[x2],#8 // put aside t[0]*n0 for tail processing mul x17,x9,x28 // (*) adds xzr,x19,x14 subs xzr,x19,#1 // (*) mul x14,x10,x28 adcs x19,x20,x15 mul x15,x11,x28 adcs x20,x21,x16 mul x16,x12,x28 adcs x21,x22,x17 mul x17,x13,x28 adcs x22,x23,x14 umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0) adcs x23,x24,x15 umulh x15,x7,x28 adcs x24,x25,x16 umulh x16,x8,x28 adcs x25,x26,x17 umulh x17,x9,x28 adc x26,xzr,xzr adds x19,x19,x14 umulh x14,x10,x28 adcs x20,x20,x15 umulh x15,x11,x28 adcs x21,x21,x16 umulh x16,x12,x28 adcs x22,x22,x17 umulh x17,x13,x28 mul x28,x4,x19 // next t[0]*n0 adcs x23,x23,x14 adcs x24,x24,x15 adcs x25,x25,x16 adc x26,x26,x17 cbnz x27,Lsqr8x_reduction ldp x14,x15,[x2,#8*0] ldp x16,x17,[x2,#8*2] mov x0,x2 sub x27,x3,x1 // done yet? 
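//
// Editorial sketch (not generator output) of what one pass of
// Lsqr8x_reduction above computes, in C-like pseudocode:
//
//     m = (t[0] * n0) mod 2^64;   // x28
//     t = t + m * N;              // low word cancels to 0 by choice of n0
//     t = t >> 64;                // slide the window down by one word
//
// Eight such passes clear 512 bits of t per window, which is what the
// "Reduce by 512 bits per iteration" comment above refers to.
//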
adds x19,x19,x14 adcs x20,x20,x15 ldp x14,x15,[x2,#8*4] adcs x21,x21,x16 adcs x22,x22,x17 ldp x16,x17,[x2,#8*6] adcs x23,x23,x14 adcs x24,x24,x15 adcs x25,x25,x16 adcs x26,x26,x17 //adc x28,xzr,xzr // moved below cbz x27,Lsqr8x8_post_condition ldr x4,[x2,#-8*8] ldp x6,x7,[x1,#8*0] ldp x8,x9,[x1,#8*2] ldp x10,x11,[x1,#8*4] mov x27,#-8*8 ldp x12,x13,[x1,#8*6] add x1,x1,#8*8 Lsqr8x_tail: mul x14,x6,x4 adc x28,xzr,xzr // carry bit, modulo-scheduled mul x15,x7,x4 add x27,x27,#8 mul x16,x8,x4 mul x17,x9,x4 adds x19,x19,x14 mul x14,x10,x4 adcs x20,x20,x15 mul x15,x11,x4 adcs x21,x21,x16 mul x16,x12,x4 adcs x22,x22,x17 mul x17,x13,x4 adcs x23,x23,x14 umulh x14,x6,x4 adcs x24,x24,x15 umulh x15,x7,x4 adcs x25,x25,x16 umulh x16,x8,x4 adcs x26,x26,x17 umulh x17,x9,x4 adc x28,x28,xzr str x19,[x2],#8 adds x19,x20,x14 umulh x14,x10,x4 adcs x20,x21,x15 umulh x15,x11,x4 adcs x21,x22,x16 umulh x16,x12,x4 adcs x22,x23,x17 umulh x17,x13,x4 ldr x4,[x0,x27] adcs x23,x24,x14 adcs x24,x25,x15 adcs x25,x26,x16 adcs x26,x28,x17 //adc x28,xzr,xzr // moved above cbnz x27,Lsqr8x_tail // note that carry flag is guaranteed // to be zero at this point ldp x6,x7,[x2,#8*0] sub x27,x3,x1 // done yet? sub x16,x3,x5 // rewinded np ldp x8,x9,[x2,#8*2] ldp x10,x11,[x2,#8*4] ldp x12,x13,[x2,#8*6] cbz x27,Lsqr8x_tail_break ldr x4,[x0,#-8*8] adds x19,x19,x6 adcs x20,x20,x7 ldp x6,x7,[x1,#8*0] adcs x21,x21,x8 adcs x22,x22,x9 ldp x8,x9,[x1,#8*2] adcs x23,x23,x10 adcs x24,x24,x11 ldp x10,x11,[x1,#8*4] adcs x25,x25,x12 mov x27,#-8*8 adcs x26,x26,x13 ldp x12,x13,[x1,#8*6] add x1,x1,#8*8 //adc x28,xzr,xzr // moved above b Lsqr8x_tail .align 4 Lsqr8x_tail_break: ldr x4,[x29,#112] // pull n0 add x27,x2,#8*8 // end of current t[num] window subs xzr,x30,#1 // "move" top-most carry to carry bit adcs x14,x19,x6 adcs x15,x20,x7 ldp x19,x20,[x0,#8*0] adcs x21,x21,x8 ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0] adcs x22,x22,x9 ldp x8,x9,[x16,#8*2] adcs x23,x23,x10 adcs x24,x24,x11 ldp x10,x11,[x16,#8*4] adcs x25,x25,x12 adcs x26,x26,x13 ldp x12,x13,[x16,#8*6] add x1,x16,#8*8 adc x30,xzr,xzr // top-most carry mul x28,x4,x19 stp x14,x15,[x2,#8*0] stp x21,x22,[x2,#8*2] ldp x21,x22,[x0,#8*2] stp x23,x24,[x2,#8*4] ldp x23,x24,[x0,#8*4] cmp x27,x29 // did we hit the bottom? stp x25,x26,[x2,#8*6] mov x2,x0 // slide the window ldp x25,x26,[x0,#8*6] mov x27,#8 b.ne Lsqr8x_reduction // Final step. We see if result is larger than modulus, and // if it is, subtract the modulus. But comparison implies // subtraction. So we subtract modulus, see if it borrowed, // and conditionally copy original value. ldr x0,[x29,#96] // pull rp add x2,x2,#8*8 subs x14,x19,x6 sbcs x15,x20,x7 sub x27,x5,#8*8 mov x3,x0 // x0 copy Lsqr8x_sub: sbcs x16,x21,x8 ldp x6,x7,[x1,#8*0] sbcs x17,x22,x9 stp x14,x15,[x0,#8*0] sbcs x14,x23,x10 ldp x8,x9,[x1,#8*2] sbcs x15,x24,x11 stp x16,x17,[x0,#8*2] sbcs x16,x25,x12 ldp x10,x11,[x1,#8*4] sbcs x17,x26,x13 ldp x12,x13,[x1,#8*6] add x1,x1,#8*8 ldp x19,x20,[x2,#8*0] sub x27,x27,#8*8 ldp x21,x22,[x2,#8*2] ldp x23,x24,[x2,#8*4] ldp x25,x26,[x2,#8*6] add x2,x2,#8*8 stp x14,x15,[x0,#8*4] sbcs x14,x19,x6 stp x16,x17,[x0,#8*6] add x0,x0,#8*8 sbcs x15,x20,x7 cbnz x27,Lsqr8x_sub sbcs x16,x21,x8 mov x2,sp add x1,sp,x5 ldp x6,x7,[x3,#8*0] sbcs x17,x22,x9 stp x14,x15,[x0,#8*0] sbcs x14,x23,x10 ldp x8,x9,[x3,#8*2] sbcs x15,x24,x11 stp x16,x17,[x0,#8*2] sbcs x16,x25,x12 ldp x19,x20,[x1,#8*0] sbcs x17,x26,x13 ldp x21,x22,[x1,#8*2] sbcs xzr,x30,xzr // did it borrow? 
ldr x30,[x29,#8] // pull return address stp x14,x15,[x0,#8*4] stp x16,x17,[x0,#8*6] sub x27,x5,#8*4 Lsqr4x_cond_copy: sub x27,x27,#8*4 csel x14,x19,x6,lo stp xzr,xzr,[x2,#8*0] csel x15,x20,x7,lo ldp x6,x7,[x3,#8*4] ldp x19,x20,[x1,#8*4] csel x16,x21,x8,lo stp xzr,xzr,[x2,#8*2] add x2,x2,#8*4 csel x17,x22,x9,lo ldp x8,x9,[x3,#8*6] ldp x21,x22,[x1,#8*6] add x1,x1,#8*4 stp x14,x15,[x3,#8*0] stp x16,x17,[x3,#8*2] add x3,x3,#8*4 stp xzr,xzr,[x1,#8*0] stp xzr,xzr,[x1,#8*2] cbnz x27,Lsqr4x_cond_copy csel x14,x19,x6,lo stp xzr,xzr,[x2,#8*0] csel x15,x20,x7,lo stp xzr,xzr,[x2,#8*2] csel x16,x21,x8,lo csel x17,x22,x9,lo stp x14,x15,[x3,#8*0] stp x16,x17,[x3,#8*2] b Lsqr8x_done .align 4 Lsqr8x8_post_condition: adc x28,xzr,xzr ldr x30,[x29,#8] // pull return address // x19-7,x28 hold result, x6-7 hold modulus subs x6,x19,x6 ldr x1,[x29,#96] // pull rp sbcs x7,x20,x7 stp xzr,xzr,[sp,#8*0] sbcs x8,x21,x8 stp xzr,xzr,[sp,#8*2] sbcs x9,x22,x9 stp xzr,xzr,[sp,#8*4] sbcs x10,x23,x10 stp xzr,xzr,[sp,#8*6] sbcs x11,x24,x11 stp xzr,xzr,[sp,#8*8] sbcs x12,x25,x12 stp xzr,xzr,[sp,#8*10] sbcs x13,x26,x13 stp xzr,xzr,[sp,#8*12] sbcs x28,x28,xzr // did it borrow? stp xzr,xzr,[sp,#8*14] // x6-7 hold result-modulus csel x6,x19,x6,lo csel x7,x20,x7,lo csel x8,x21,x8,lo csel x9,x22,x9,lo stp x6,x7,[x1,#8*0] csel x10,x23,x10,lo csel x11,x24,x11,lo stp x8,x9,[x1,#8*2] csel x12,x25,x12,lo csel x13,x26,x13,lo stp x10,x11,[x1,#8*4] stp x12,x13,[x1,#8*6] Lsqr8x_done: ldp x19,x20,[x29,#16] mov sp,x29 ldp x21,x22,[x29,#32] mov x0,#1 ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldr x29,[sp],#128 // x30 is popped earlier AARCH64_VALIDATE_LINK_REGISTER ret .globl _bn_mul4x_mont .private_extern _bn_mul4x_mont .align 5 _bn_mul4x_mont: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-128]! 
add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] sub x26,sp,x5,lsl#3 lsl x5,x5,#3 ldr x4,[x4] // *n0 sub sp,x26,#8*4 // alloca add x10,x2,x5 add x27,x1,x5 stp x0,x10,[x29,#96] // offload rp and &b[num] ldr x24,[x2,#8*0] // b[0] ldp x6,x7,[x1,#8*0] // a[0..3] ldp x8,x9,[x1,#8*2] add x1,x1,#8*4 mov x19,xzr mov x20,xzr mov x21,xzr mov x22,xzr ldp x14,x15,[x3,#8*0] // n[0..3] ldp x16,x17,[x3,#8*2] adds x3,x3,#8*4 // clear carry bit mov x0,xzr mov x28,#0 mov x26,sp Loop_mul4x_1st_reduction: mul x10,x6,x24 // lo(a[0..3]*b[0]) adc x0,x0,xzr // modulo-scheduled mul x11,x7,x24 add x28,x28,#8 mul x12,x8,x24 and x28,x28,#31 mul x13,x9,x24 adds x19,x19,x10 umulh x10,x6,x24 // hi(a[0..3]*b[0]) adcs x20,x20,x11 mul x25,x19,x4 // t[0]*n0 adcs x21,x21,x12 umulh x11,x7,x24 adcs x22,x22,x13 umulh x12,x8,x24 adc x23,xzr,xzr umulh x13,x9,x24 ldr x24,[x2,x28] // next b[i] (or b[0]) adds x20,x20,x10 // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0) str x25,[x26],#8 // put aside t[0]*n0 for tail processing adcs x21,x21,x11 mul x11,x15,x25 adcs x22,x22,x12 mul x12,x16,x25 adc x23,x23,x13 // can't overflow mul x13,x17,x25 // (*) adds xzr,x19,x10 subs xzr,x19,#1 // (*) umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0) adcs x19,x20,x11 umulh x11,x15,x25 adcs x20,x21,x12 umulh x12,x16,x25 adcs x21,x22,x13 umulh x13,x17,x25 adcs x22,x23,x0 adc x0,xzr,xzr adds x19,x19,x10 sub x10,x27,x1 adcs x20,x20,x11 adcs x21,x21,x12 adcs x22,x22,x13 //adc x0,x0,xzr cbnz x28,Loop_mul4x_1st_reduction cbz x10,Lmul4x4_post_condition ldp x6,x7,[x1,#8*0] // a[4..7] ldp x8,x9,[x1,#8*2] add x1,x1,#8*4 ldr x25,[sp] // a[0]*n0 ldp x14,x15,[x3,#8*0] // n[4..7] ldp x16,x17,[x3,#8*2] add x3,x3,#8*4 Loop_mul4x_1st_tail: mul x10,x6,x24 // lo(a[4..7]*b[i]) adc x0,x0,xzr // modulo-scheduled mul x11,x7,x24 add x28,x28,#8 mul x12,x8,x24 and x28,x28,#31 mul x13,x9,x24 adds x19,x19,x10 umulh x10,x6,x24 // hi(a[4..7]*b[i]) adcs x20,x20,x11 umulh x11,x7,x24 adcs x21,x21,x12 umulh x12,x8,x24 adcs x22,x22,x13 umulh x13,x9,x24 adc x23,xzr,xzr ldr x24,[x2,x28] // next b[i] (or b[0]) adds x20,x20,x10 mul x10,x14,x25 // lo(n[4..7]*a[0]*n0) adcs x21,x21,x11 mul x11,x15,x25 adcs x22,x22,x12 mul x12,x16,x25 adc x23,x23,x13 // can't overflow mul x13,x17,x25 adds x19,x19,x10 umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0) adcs x20,x20,x11 umulh x11,x15,x25 adcs x21,x21,x12 umulh x12,x16,x25 adcs x22,x22,x13 adcs x23,x23,x0 umulh x13,x17,x25 adc x0,xzr,xzr ldr x25,[sp,x28] // next t[0]*n0 str x19,[x26],#8 // result!!! adds x19,x20,x10 sub x10,x27,x1 // done yet? adcs x20,x21,x11 adcs x21,x22,x12 adcs x22,x23,x13 //adc x0,x0,xzr cbnz x28,Loop_mul4x_1st_tail sub x11,x27,x5 // rewinded x1 cbz x10,Lmul4x_proceed ldp x6,x7,[x1,#8*0] ldp x8,x9,[x1,#8*2] add x1,x1,#8*4 ldp x14,x15,[x3,#8*0] ldp x16,x17,[x3,#8*2] add x3,x3,#8*4 b Loop_mul4x_1st_tail .align 5 Lmul4x_proceed: ldr x24,[x2,#8*4]! // *++b adc x30,x0,xzr ldp x6,x7,[x11,#8*0] // a[0..3] sub x3,x3,x5 // rewind np ldp x8,x9,[x11,#8*2] add x1,x11,#8*4 stp x19,x20,[x26,#8*0] // result!!! ldp x19,x20,[sp,#8*4] // t[0..3] stp x21,x22,[x26,#8*2] // result!!! 
ldp x21,x22,[sp,#8*6] ldp x14,x15,[x3,#8*0] // n[0..3] mov x26,sp ldp x16,x17,[x3,#8*2] adds x3,x3,#8*4 // clear carry bit mov x0,xzr .align 4 Loop_mul4x_reduction: mul x10,x6,x24 // lo(a[0..3]*b[4]) adc x0,x0,xzr // modulo-scheduled mul x11,x7,x24 add x28,x28,#8 mul x12,x8,x24 and x28,x28,#31 mul x13,x9,x24 adds x19,x19,x10 umulh x10,x6,x24 // hi(a[0..3]*b[4]) adcs x20,x20,x11 mul x25,x19,x4 // t[0]*n0 adcs x21,x21,x12 umulh x11,x7,x24 adcs x22,x22,x13 umulh x12,x8,x24 adc x23,xzr,xzr umulh x13,x9,x24 ldr x24,[x2,x28] // next b[i] adds x20,x20,x10 // (*) mul x10,x14,x25 str x25,[x26],#8 // put aside t[0]*n0 for tail processing adcs x21,x21,x11 mul x11,x15,x25 // lo(n[0..3]*t[0]*n0 adcs x22,x22,x12 mul x12,x16,x25 adc x23,x23,x13 // can't overflow mul x13,x17,x25 // (*) adds xzr,x19,x10 subs xzr,x19,#1 // (*) umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0 adcs x19,x20,x11 umulh x11,x15,x25 adcs x20,x21,x12 umulh x12,x16,x25 adcs x21,x22,x13 umulh x13,x17,x25 adcs x22,x23,x0 adc x0,xzr,xzr adds x19,x19,x10 adcs x20,x20,x11 adcs x21,x21,x12 adcs x22,x22,x13 //adc x0,x0,xzr cbnz x28,Loop_mul4x_reduction adc x0,x0,xzr ldp x10,x11,[x26,#8*4] // t[4..7] ldp x12,x13,[x26,#8*6] ldp x6,x7,[x1,#8*0] // a[4..7] ldp x8,x9,[x1,#8*2] add x1,x1,#8*4 adds x19,x19,x10 adcs x20,x20,x11 adcs x21,x21,x12 adcs x22,x22,x13 //adc x0,x0,xzr ldr x25,[sp] // t[0]*n0 ldp x14,x15,[x3,#8*0] // n[4..7] ldp x16,x17,[x3,#8*2] add x3,x3,#8*4 .align 4 Loop_mul4x_tail: mul x10,x6,x24 // lo(a[4..7]*b[4]) adc x0,x0,xzr // modulo-scheduled mul x11,x7,x24 add x28,x28,#8 mul x12,x8,x24 and x28,x28,#31 mul x13,x9,x24 adds x19,x19,x10 umulh x10,x6,x24 // hi(a[4..7]*b[4]) adcs x20,x20,x11 umulh x11,x7,x24 adcs x21,x21,x12 umulh x12,x8,x24 adcs x22,x22,x13 umulh x13,x9,x24 adc x23,xzr,xzr ldr x24,[x2,x28] // next b[i] adds x20,x20,x10 mul x10,x14,x25 // lo(n[4..7]*t[0]*n0) adcs x21,x21,x11 mul x11,x15,x25 adcs x22,x22,x12 mul x12,x16,x25 adc x23,x23,x13 // can't overflow mul x13,x17,x25 adds x19,x19,x10 umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0) adcs x20,x20,x11 umulh x11,x15,x25 adcs x21,x21,x12 umulh x12,x16,x25 adcs x22,x22,x13 umulh x13,x17,x25 adcs x23,x23,x0 ldr x25,[sp,x28] // next a[0]*n0 adc x0,xzr,xzr str x19,[x26],#8 // result!!! adds x19,x20,x10 sub x10,x27,x1 // done yet? adcs x20,x21,x11 adcs x21,x22,x12 adcs x22,x23,x13 //adc x0,x0,xzr cbnz x28,Loop_mul4x_tail sub x11,x3,x5 // rewinded np? adc x0,x0,xzr cbz x10,Loop_mul4x_break ldp x10,x11,[x26,#8*4] ldp x12,x13,[x26,#8*6] ldp x6,x7,[x1,#8*0] ldp x8,x9,[x1,#8*2] add x1,x1,#8*4 adds x19,x19,x10 adcs x20,x20,x11 adcs x21,x21,x12 adcs x22,x22,x13 //adc x0,x0,xzr ldp x14,x15,[x3,#8*0] ldp x16,x17,[x3,#8*2] add x3,x3,#8*4 b Loop_mul4x_tail .align 4 Loop_mul4x_break: ldp x12,x13,[x29,#96] // pull rp and &b[num] adds x19,x19,x30 add x2,x2,#8*4 // bp++ adcs x20,x20,xzr sub x1,x1,x5 // rewind ap adcs x21,x21,xzr stp x19,x20,[x26,#8*0] // result!!! adcs x22,x22,xzr ldp x19,x20,[sp,#8*4] // t[0..3] adc x30,x0,xzr stp x21,x22,[x26,#8*2] // result!!! cmp x2,x13 // done yet? ldp x21,x22,[sp,#8*6] ldp x14,x15,[x11,#8*0] // n[0..3] ldp x16,x17,[x11,#8*2] add x3,x11,#8*4 b.eq Lmul4x_post ldr x24,[x2] ldp x6,x7,[x1,#8*0] // a[0..3] ldp x8,x9,[x1,#8*2] adds x1,x1,#8*4 // clear carry bit mov x0,xzr mov x26,sp b Loop_mul4x_reduction .align 4 Lmul4x_post: // Final step. We see if result is larger than modulus, and // if it is, subtract the modulus. But comparison implies // subtraction. So we subtract modulus, see if it borrowed, // and conditionally copy original value. 
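//
// A minimal sketch (editorial, not from the generator) of the idea stated
// above, for a num-word candidate result t and modulus N:
//
//     d, borrow = t - N;          // the sbcs chain below
//     result = borrow ? t : d;    // csel ..., lo in Lmul4x_cond_copy
//
// The selection uses csel rather than a branch, which also avoids a
// data-dependent branch on whether t < N.
//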
mov x0,x12 mov x27,x12 // x0 copy subs x10,x19,x14 add x26,sp,#8*8 sbcs x11,x20,x15 sub x28,x5,#8*4 Lmul4x_sub: sbcs x12,x21,x16 ldp x14,x15,[x3,#8*0] sub x28,x28,#8*4 ldp x19,x20,[x26,#8*0] sbcs x13,x22,x17 ldp x16,x17,[x3,#8*2] add x3,x3,#8*4 ldp x21,x22,[x26,#8*2] add x26,x26,#8*4 stp x10,x11,[x0,#8*0] sbcs x10,x19,x14 stp x12,x13,[x0,#8*2] add x0,x0,#8*4 sbcs x11,x20,x15 cbnz x28,Lmul4x_sub sbcs x12,x21,x16 mov x26,sp add x1,sp,#8*4 ldp x6,x7,[x27,#8*0] sbcs x13,x22,x17 stp x10,x11,[x0,#8*0] ldp x8,x9,[x27,#8*2] stp x12,x13,[x0,#8*2] ldp x19,x20,[x1,#8*0] ldp x21,x22,[x1,#8*2] sbcs xzr,x30,xzr // did it borrow? ldr x30,[x29,#8] // pull return address sub x28,x5,#8*4 Lmul4x_cond_copy: sub x28,x28,#8*4 csel x10,x19,x6,lo stp xzr,xzr,[x26,#8*0] csel x11,x20,x7,lo ldp x6,x7,[x27,#8*4] ldp x19,x20,[x1,#8*4] csel x12,x21,x8,lo stp xzr,xzr,[x26,#8*2] add x26,x26,#8*4 csel x13,x22,x9,lo ldp x8,x9,[x27,#8*6] ldp x21,x22,[x1,#8*6] add x1,x1,#8*4 stp x10,x11,[x27,#8*0] stp x12,x13,[x27,#8*2] add x27,x27,#8*4 cbnz x28,Lmul4x_cond_copy csel x10,x19,x6,lo stp xzr,xzr,[x26,#8*0] csel x11,x20,x7,lo stp xzr,xzr,[x26,#8*2] csel x12,x21,x8,lo stp xzr,xzr,[x26,#8*3] csel x13,x22,x9,lo stp xzr,xzr,[x26,#8*4] stp x10,x11,[x27,#8*0] stp x12,x13,[x27,#8*2] b Lmul4x_done .align 4 Lmul4x4_post_condition: adc x0,x0,xzr ldr x1,[x29,#96] // pull rp // x19-3,x0 hold result, x14-7 hold modulus subs x6,x19,x14 ldr x30,[x29,#8] // pull return address sbcs x7,x20,x15 stp xzr,xzr,[sp,#8*0] sbcs x8,x21,x16 stp xzr,xzr,[sp,#8*2] sbcs x9,x22,x17 stp xzr,xzr,[sp,#8*4] sbcs xzr,x0,xzr // did it borrow? stp xzr,xzr,[sp,#8*6] // x6-3 hold result-modulus csel x6,x19,x6,lo csel x7,x20,x7,lo csel x8,x21,x8,lo csel x9,x22,x9,lo stp x6,x7,[x1,#8*0] stp x8,x9,[x1,#8*2] Lmul4x_done: ldp x19,x20,[x29,#16] mov sp,x29 ldp x21,x22,[x29,#32] mov x0,#1 ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldr x29,[sp],#128 // x30 is popped earlier AARCH64_VALIDATE_LINK_REGISTER ret .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 4 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) ring-0.17.14/pregenerated/armv8-mont-linux64.S000064400000000000000000000742341046102023000170570ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) .text .globl bn_mul_mont_nohw .hidden bn_mul_mont_nohw .type bn_mul_mont_nohw,%function .align 5 bn_mul_mont_nohw: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-64]! add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] ldr x9,[x2],#8 // bp[0] sub x22,sp,x5,lsl#3 ldp x7,x8,[x1],#16 // ap[0..1] lsl x5,x5,#3 ldr x4,[x4] // *n0 and x22,x22,#-16 // ABI says so ldp x13,x14,[x3],#16 // np[0..1] mul x6,x7,x9 // ap[0]*bp[0] sub x21,x5,#16 // j=num-2 umulh x7,x7,x9 mul x10,x8,x9 // ap[1]*bp[0] umulh x11,x8,x9 mul x15,x6,x4 // "tp[0]"*n0 mov sp,x22 // alloca // (*) mul x12,x13,x15 // np[0]*m1 umulh x13,x13,x15 mul x16,x14,x15 // np[1]*m1 // (*) adds x12,x12,x6 // discarded // (*) As for removal of first multiplication and addition // instructions. 
The outcome of first addition is // guaranteed to be zero, which leaves two computationally // significant outcomes: it either carries or not. Then // question is when does it carry? Is there alternative // way to deduce it? If you follow operations, you can // observe that condition for carry is quite simple: // x6 being non-zero. So that carry can be calculated // by adding -1 to x6. That's what next instruction does. subs xzr,x6,#1 // (*) umulh x17,x14,x15 adc x13,x13,xzr cbz x21,.L1st_skip .L1st: ldr x8,[x1],#8 adds x6,x10,x7 sub x21,x21,#8 // j-- adc x7,x11,xzr ldr x14,[x3],#8 adds x12,x16,x13 mul x10,x8,x9 // ap[j]*bp[0] adc x13,x17,xzr umulh x11,x8,x9 adds x12,x12,x6 mul x16,x14,x15 // np[j]*m1 adc x13,x13,xzr umulh x17,x14,x15 str x12,[x22],#8 // tp[j-1] cbnz x21,.L1st .L1st_skip: adds x6,x10,x7 sub x1,x1,x5 // rewind x1 adc x7,x11,xzr adds x12,x16,x13 sub x3,x3,x5 // rewind x3 adc x13,x17,xzr adds x12,x12,x6 sub x20,x5,#8 // i=num-1 adcs x13,x13,x7 adc x19,xzr,xzr // upmost overflow bit stp x12,x13,[x22] .Louter: ldr x9,[x2],#8 // bp[i] ldp x7,x8,[x1],#16 ldr x23,[sp] // tp[0] add x22,sp,#8 mul x6,x7,x9 // ap[0]*bp[i] sub x21,x5,#16 // j=num-2 umulh x7,x7,x9 ldp x13,x14,[x3],#16 mul x10,x8,x9 // ap[1]*bp[i] adds x6,x6,x23 umulh x11,x8,x9 adc x7,x7,xzr mul x15,x6,x4 sub x20,x20,#8 // i-- // (*) mul x12,x13,x15 // np[0]*m1 umulh x13,x13,x15 mul x16,x14,x15 // np[1]*m1 // (*) adds x12,x12,x6 subs xzr,x6,#1 // (*) umulh x17,x14,x15 cbz x21,.Linner_skip .Linner: ldr x8,[x1],#8 adc x13,x13,xzr ldr x23,[x22],#8 // tp[j] adds x6,x10,x7 sub x21,x21,#8 // j-- adc x7,x11,xzr adds x12,x16,x13 ldr x14,[x3],#8 adc x13,x17,xzr mul x10,x8,x9 // ap[j]*bp[i] adds x6,x6,x23 umulh x11,x8,x9 adc x7,x7,xzr mul x16,x14,x15 // np[j]*m1 adds x12,x12,x6 umulh x17,x14,x15 str x12,[x22,#-16] // tp[j-1] cbnz x21,.Linner .Linner_skip: ldr x23,[x22],#8 // tp[j] adc x13,x13,xzr adds x6,x10,x7 sub x1,x1,x5 // rewind x1 adc x7,x11,xzr adds x12,x16,x13 sub x3,x3,x5 // rewind x3 adcs x13,x17,x19 adc x19,xzr,xzr adds x6,x6,x23 adc x7,x7,xzr adds x12,x12,x6 adcs x13,x13,x7 adc x19,x19,xzr // upmost overflow bit stp x12,x13,[x22,#-16] cbnz x20,.Louter // Final step. We see if result is larger than modulus, and // if it is, subtract the modulus. But comparison implies // subtraction. So we subtract modulus, see if it borrowed, // and conditionally copy original value. ldr x23,[sp] // tp[0] add x22,sp,#8 ldr x14,[x3],#8 // np[0] subs x21,x5,#8 // j=num-1 and clear borrow mov x1,x0 .Lsub: sbcs x8,x23,x14 // tp[j]-np[j] ldr x23,[x22],#8 sub x21,x21,#8 // j-- ldr x14,[x3],#8 str x8,[x1],#8 // rp[j]=tp[j]-np[j] cbnz x21,.Lsub sbcs x8,x23,x14 sbcs x19,x19,xzr // did it borrow? str x8,[x1],#8 // rp[num-1] ldr x23,[sp] // tp[0] add x22,sp,#8 ldr x8,[x0],#8 // rp[0] sub x5,x5,#8 // num-- nop .Lcond_copy: sub x5,x5,#8 // num-- csel x14,x23,x8,lo // did it borrow? ldr x23,[x22],#8 ldr x8,[x0],#8 str xzr,[x22,#-16] // wipe tp str x14,[x0,#-16] cbnz x5,.Lcond_copy csel x14,x23,x8,lo str xzr,[x22,#-8] // wipe tp str x14,[x0,#-8] ldp x19,x20,[x29,#16] mov sp,x29 ldp x21,x22,[x29,#32] mov x0,#1 ldp x23,x24,[x29,#48] ldr x29,[sp],#64 AARCH64_VALIDATE_LINK_REGISTER ret .size bn_mul_mont_nohw,.-bn_mul_mont_nohw .globl bn_sqr8x_mont .hidden bn_sqr8x_mont .type bn_sqr8x_mont,%function .align 5 bn_sqr8x_mont: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-128]! 
add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] stp x0,x3,[sp,#96] // offload rp and np ldp x6,x7,[x1,#8*0] ldp x8,x9,[x1,#8*2] ldp x10,x11,[x1,#8*4] ldp x12,x13,[x1,#8*6] sub x2,sp,x5,lsl#4 lsl x5,x5,#3 ldr x4,[x4] // *n0 mov sp,x2 // alloca sub x27,x5,#8*8 b .Lsqr8x_zero_start .Lsqr8x_zero: sub x27,x27,#8*8 stp xzr,xzr,[x2,#8*0] stp xzr,xzr,[x2,#8*2] stp xzr,xzr,[x2,#8*4] stp xzr,xzr,[x2,#8*6] .Lsqr8x_zero_start: stp xzr,xzr,[x2,#8*8] stp xzr,xzr,[x2,#8*10] stp xzr,xzr,[x2,#8*12] stp xzr,xzr,[x2,#8*14] add x2,x2,#8*16 cbnz x27,.Lsqr8x_zero add x3,x1,x5 add x1,x1,#8*8 mov x19,xzr mov x20,xzr mov x21,xzr mov x22,xzr mov x23,xzr mov x24,xzr mov x25,xzr mov x26,xzr mov x2,sp str x4,[x29,#112] // offload n0 // Multiply everything but a[i]*a[i] .align 4 .Lsqr8x_outer_loop: // a[1]a[0] (i) // a[2]a[0] // a[3]a[0] // a[4]a[0] // a[5]a[0] // a[6]a[0] // a[7]a[0] // a[2]a[1] (ii) // a[3]a[1] // a[4]a[1] // a[5]a[1] // a[6]a[1] // a[7]a[1] // a[3]a[2] (iii) // a[4]a[2] // a[5]a[2] // a[6]a[2] // a[7]a[2] // a[4]a[3] (iv) // a[5]a[3] // a[6]a[3] // a[7]a[3] // a[5]a[4] (v) // a[6]a[4] // a[7]a[4] // a[6]a[5] (vi) // a[7]a[5] // a[7]a[6] (vii) mul x14,x7,x6 // lo(a[1..7]*a[0]) (i) mul x15,x8,x6 mul x16,x9,x6 mul x17,x10,x6 adds x20,x20,x14 // t[1]+lo(a[1]*a[0]) mul x14,x11,x6 adcs x21,x21,x15 mul x15,x12,x6 adcs x22,x22,x16 mul x16,x13,x6 adcs x23,x23,x17 umulh x17,x7,x6 // hi(a[1..7]*a[0]) adcs x24,x24,x14 umulh x14,x8,x6 adcs x25,x25,x15 umulh x15,x9,x6 adcs x26,x26,x16 umulh x16,x10,x6 stp x19,x20,[x2],#8*2 // t[0..1] adc x19,xzr,xzr // t[8] adds x21,x21,x17 // t[2]+lo(a[1]*a[0]) umulh x17,x11,x6 adcs x22,x22,x14 umulh x14,x12,x6 adcs x23,x23,x15 umulh x15,x13,x6 adcs x24,x24,x16 mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii) adcs x25,x25,x17 mul x17,x9,x7 adcs x26,x26,x14 mul x14,x10,x7 adc x19,x19,x15 mul x15,x11,x7 adds x22,x22,x16 mul x16,x12,x7 adcs x23,x23,x17 mul x17,x13,x7 adcs x24,x24,x14 umulh x14,x8,x7 // hi(a[2..7]*a[1]) adcs x25,x25,x15 umulh x15,x9,x7 adcs x26,x26,x16 umulh x16,x10,x7 adcs x19,x19,x17 umulh x17,x11,x7 stp x21,x22,[x2],#8*2 // t[2..3] adc x20,xzr,xzr // t[9] adds x23,x23,x14 umulh x14,x12,x7 adcs x24,x24,x15 umulh x15,x13,x7 adcs x25,x25,x16 mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii) adcs x26,x26,x17 mul x17,x10,x8 adcs x19,x19,x14 mul x14,x11,x8 adc x20,x20,x15 mul x15,x12,x8 adds x24,x24,x16 mul x16,x13,x8 adcs x25,x25,x17 umulh x17,x9,x8 // hi(a[3..7]*a[2]) adcs x26,x26,x14 umulh x14,x10,x8 adcs x19,x19,x15 umulh x15,x11,x8 adcs x20,x20,x16 umulh x16,x12,x8 stp x23,x24,[x2],#8*2 // t[4..5] adc x21,xzr,xzr // t[10] adds x25,x25,x17 umulh x17,x13,x8 adcs x26,x26,x14 mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv) adcs x19,x19,x15 mul x15,x11,x9 adcs x20,x20,x16 mul x16,x12,x9 adc x21,x21,x17 mul x17,x13,x9 adds x26,x26,x14 umulh x14,x10,x9 // hi(a[4..7]*a[3]) adcs x19,x19,x15 umulh x15,x11,x9 adcs x20,x20,x16 umulh x16,x12,x9 adcs x21,x21,x17 umulh x17,x13,x9 stp x25,x26,[x2],#8*2 // t[6..7] adc x22,xzr,xzr // t[11] adds x19,x19,x14 mul x14,x11,x10 // lo(a[5..7]*a[4]) (v) adcs x20,x20,x15 mul x15,x12,x10 adcs x21,x21,x16 mul x16,x13,x10 adc x22,x22,x17 umulh x17,x11,x10 // hi(a[5..7]*a[4]) adds x20,x20,x14 umulh x14,x12,x10 adcs x21,x21,x15 umulh x15,x13,x10 adcs x22,x22,x16 mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi) adc x23,xzr,xzr // t[12] adds x21,x21,x17 mul x17,x13,x11 adcs x22,x22,x14 umulh x14,x12,x11 // hi(a[6..7]*a[5]) adc x23,x23,x15 umulh x15,x13,x11 adds x22,x22,x16 mul x16,x13,x12 // lo(a[7]*a[6]) (vii) 
adcs x23,x23,x17 umulh x17,x13,x12 // hi(a[7]*a[6]) adc x24,xzr,xzr // t[13] adds x23,x23,x14 sub x27,x3,x1 // done yet? adc x24,x24,x15 adds x24,x24,x16 sub x14,x3,x5 // rewinded ap adc x25,xzr,xzr // t[14] add x25,x25,x17 cbz x27,.Lsqr8x_outer_break mov x4,x6 ldp x6,x7,[x2,#8*0] ldp x8,x9,[x2,#8*2] ldp x10,x11,[x2,#8*4] ldp x12,x13,[x2,#8*6] adds x19,x19,x6 adcs x20,x20,x7 ldp x6,x7,[x1,#8*0] adcs x21,x21,x8 adcs x22,x22,x9 ldp x8,x9,[x1,#8*2] adcs x23,x23,x10 adcs x24,x24,x11 ldp x10,x11,[x1,#8*4] adcs x25,x25,x12 mov x0,x1 adcs x26,xzr,x13 ldp x12,x13,[x1,#8*6] add x1,x1,#8*8 //adc x28,xzr,xzr // moved below mov x27,#-8*8 // a[8]a[0] // a[9]a[0] // a[a]a[0] // a[b]a[0] // a[c]a[0] // a[d]a[0] // a[e]a[0] // a[f]a[0] // a[8]a[1] // a[f]a[1]........................ // a[8]a[2] // a[f]a[2]........................ // a[8]a[3] // a[f]a[3]........................ // a[8]a[4] // a[f]a[4]........................ // a[8]a[5] // a[f]a[5]........................ // a[8]a[6] // a[f]a[6]........................ // a[8]a[7] // a[f]a[7]........................ .Lsqr8x_mul: mul x14,x6,x4 adc x28,xzr,xzr // carry bit, modulo-scheduled mul x15,x7,x4 add x27,x27,#8 mul x16,x8,x4 mul x17,x9,x4 adds x19,x19,x14 mul x14,x10,x4 adcs x20,x20,x15 mul x15,x11,x4 adcs x21,x21,x16 mul x16,x12,x4 adcs x22,x22,x17 mul x17,x13,x4 adcs x23,x23,x14 umulh x14,x6,x4 adcs x24,x24,x15 umulh x15,x7,x4 adcs x25,x25,x16 umulh x16,x8,x4 adcs x26,x26,x17 umulh x17,x9,x4 adc x28,x28,xzr str x19,[x2],#8 adds x19,x20,x14 umulh x14,x10,x4 adcs x20,x21,x15 umulh x15,x11,x4 adcs x21,x22,x16 umulh x16,x12,x4 adcs x22,x23,x17 umulh x17,x13,x4 ldr x4,[x0,x27] adcs x23,x24,x14 adcs x24,x25,x15 adcs x25,x26,x16 adcs x26,x28,x17 //adc x28,xzr,xzr // moved above cbnz x27,.Lsqr8x_mul // note that carry flag is guaranteed // to be zero at this point cmp x1,x3 // done yet? b.eq .Lsqr8x_break ldp x6,x7,[x2,#8*0] ldp x8,x9,[x2,#8*2] ldp x10,x11,[x2,#8*4] ldp x12,x13,[x2,#8*6] adds x19,x19,x6 ldr x4,[x0,#-8*8] adcs x20,x20,x7 ldp x6,x7,[x1,#8*0] adcs x21,x21,x8 adcs x22,x22,x9 ldp x8,x9,[x1,#8*2] adcs x23,x23,x10 adcs x24,x24,x11 ldp x10,x11,[x1,#8*4] adcs x25,x25,x12 mov x27,#-8*8 adcs x26,x26,x13 ldp x12,x13,[x1,#8*6] add x1,x1,#8*8 //adc x28,xzr,xzr // moved above b .Lsqr8x_mul .align 4 .Lsqr8x_break: ldp x6,x7,[x0,#8*0] add x1,x0,#8*8 ldp x8,x9,[x0,#8*2] sub x14,x3,x1 // is it last iteration? 
ldp x10,x11,[x0,#8*4] sub x15,x2,x14 ldp x12,x13,[x0,#8*6] cbz x14,.Lsqr8x_outer_loop stp x19,x20,[x2,#8*0] ldp x19,x20,[x15,#8*0] stp x21,x22,[x2,#8*2] ldp x21,x22,[x15,#8*2] stp x23,x24,[x2,#8*4] ldp x23,x24,[x15,#8*4] stp x25,x26,[x2,#8*6] mov x2,x15 ldp x25,x26,[x15,#8*6] b .Lsqr8x_outer_loop .align 4 .Lsqr8x_outer_break: // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0] ldp x15,x16,[sp,#8*1] ldp x11,x13,[x14,#8*2] add x1,x14,#8*4 ldp x17,x14,[sp,#8*3] stp x19,x20,[x2,#8*0] mul x19,x7,x7 stp x21,x22,[x2,#8*2] umulh x7,x7,x7 stp x23,x24,[x2,#8*4] mul x8,x9,x9 stp x25,x26,[x2,#8*6] mov x2,sp umulh x9,x9,x9 adds x20,x7,x15,lsl#1 extr x15,x16,x15,#63 sub x27,x5,#8*4 .Lsqr4x_shift_n_add: adcs x21,x8,x15 extr x16,x17,x16,#63 sub x27,x27,#8*4 adcs x22,x9,x16 ldp x15,x16,[x2,#8*5] mul x10,x11,x11 ldp x7,x9,[x1],#8*2 umulh x11,x11,x11 mul x12,x13,x13 umulh x13,x13,x13 extr x17,x14,x17,#63 stp x19,x20,[x2,#8*0] adcs x23,x10,x17 extr x14,x15,x14,#63 stp x21,x22,[x2,#8*2] adcs x24,x11,x14 ldp x17,x14,[x2,#8*7] extr x15,x16,x15,#63 adcs x25,x12,x15 extr x16,x17,x16,#63 adcs x26,x13,x16 ldp x15,x16,[x2,#8*9] mul x6,x7,x7 ldp x11,x13,[x1],#8*2 umulh x7,x7,x7 mul x8,x9,x9 umulh x9,x9,x9 stp x23,x24,[x2,#8*4] extr x17,x14,x17,#63 stp x25,x26,[x2,#8*6] add x2,x2,#8*8 adcs x19,x6,x17 extr x14,x15,x14,#63 adcs x20,x7,x14 ldp x17,x14,[x2,#8*3] extr x15,x16,x15,#63 cbnz x27,.Lsqr4x_shift_n_add ldp x1,x4,[x29,#104] // pull np and n0 adcs x21,x8,x15 extr x16,x17,x16,#63 adcs x22,x9,x16 ldp x15,x16,[x2,#8*5] mul x10,x11,x11 umulh x11,x11,x11 stp x19,x20,[x2,#8*0] mul x12,x13,x13 umulh x13,x13,x13 stp x21,x22,[x2,#8*2] extr x17,x14,x17,#63 adcs x23,x10,x17 extr x14,x15,x14,#63 ldp x19,x20,[sp,#8*0] adcs x24,x11,x14 extr x15,x16,x15,#63 ldp x6,x7,[x1,#8*0] adcs x25,x12,x15 extr x16,xzr,x16,#63 ldp x8,x9,[x1,#8*2] adc x26,x13,x16 ldp x10,x11,[x1,#8*4] // Reduce by 512 bits per iteration mul x28,x4,x19 // t[0]*n0 ldp x12,x13,[x1,#8*6] add x3,x1,x5 ldp x21,x22,[sp,#8*2] stp x23,x24,[x2,#8*4] ldp x23,x24,[sp,#8*4] stp x25,x26,[x2,#8*6] ldp x25,x26,[sp,#8*6] add x1,x1,#8*8 mov x30,xzr // initial top-most carry mov x2,sp mov x27,#8 .Lsqr8x_reduction: // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0) mul x15,x7,x28 sub x27,x27,#1 mul x16,x8,x28 str x28,[x2],#8 // put aside t[0]*n0 for tail processing mul x17,x9,x28 // (*) adds xzr,x19,x14 subs xzr,x19,#1 // (*) mul x14,x10,x28 adcs x19,x20,x15 mul x15,x11,x28 adcs x20,x21,x16 mul x16,x12,x28 adcs x21,x22,x17 mul x17,x13,x28 adcs x22,x23,x14 umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0) adcs x23,x24,x15 umulh x15,x7,x28 adcs x24,x25,x16 umulh x16,x8,x28 adcs x25,x26,x17 umulh x17,x9,x28 adc x26,xzr,xzr adds x19,x19,x14 umulh x14,x10,x28 adcs x20,x20,x15 umulh x15,x11,x28 adcs x21,x21,x16 umulh x16,x12,x28 adcs x22,x22,x17 umulh x17,x13,x28 mul x28,x4,x19 // next t[0]*n0 adcs x23,x23,x14 adcs x24,x24,x15 adcs x25,x25,x16 adc x26,x26,x17 cbnz x27,.Lsqr8x_reduction ldp x14,x15,[x2,#8*0] ldp x16,x17,[x2,#8*2] mov x0,x2 sub x27,x3,x1 // done yet? 
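//
// Editorial note (an interpretation, not generator output): the
// .Lsqr4x_shift_n_add loop above doubles the accumulated cross products by
// shifting each word left one bit -- extr ...,#63 brings in the top bit
// shifted out of the word below -- and adds the diagonal mul/umulh
// a[i]*a[i] terms, giving sum(a[i]^2) + 2*sum_{i<j} a[i]*a[j], which the
// .Lsqr8x_reduction loop above then reduces mod N.
//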
adds x19,x19,x14 adcs x20,x20,x15 ldp x14,x15,[x2,#8*4] adcs x21,x21,x16 adcs x22,x22,x17 ldp x16,x17,[x2,#8*6] adcs x23,x23,x14 adcs x24,x24,x15 adcs x25,x25,x16 adcs x26,x26,x17 //adc x28,xzr,xzr // moved below cbz x27,.Lsqr8x8_post_condition ldr x4,[x2,#-8*8] ldp x6,x7,[x1,#8*0] ldp x8,x9,[x1,#8*2] ldp x10,x11,[x1,#8*4] mov x27,#-8*8 ldp x12,x13,[x1,#8*6] add x1,x1,#8*8 .Lsqr8x_tail: mul x14,x6,x4 adc x28,xzr,xzr // carry bit, modulo-scheduled mul x15,x7,x4 add x27,x27,#8 mul x16,x8,x4 mul x17,x9,x4 adds x19,x19,x14 mul x14,x10,x4 adcs x20,x20,x15 mul x15,x11,x4 adcs x21,x21,x16 mul x16,x12,x4 adcs x22,x22,x17 mul x17,x13,x4 adcs x23,x23,x14 umulh x14,x6,x4 adcs x24,x24,x15 umulh x15,x7,x4 adcs x25,x25,x16 umulh x16,x8,x4 adcs x26,x26,x17 umulh x17,x9,x4 adc x28,x28,xzr str x19,[x2],#8 adds x19,x20,x14 umulh x14,x10,x4 adcs x20,x21,x15 umulh x15,x11,x4 adcs x21,x22,x16 umulh x16,x12,x4 adcs x22,x23,x17 umulh x17,x13,x4 ldr x4,[x0,x27] adcs x23,x24,x14 adcs x24,x25,x15 adcs x25,x26,x16 adcs x26,x28,x17 //adc x28,xzr,xzr // moved above cbnz x27,.Lsqr8x_tail // note that carry flag is guaranteed // to be zero at this point ldp x6,x7,[x2,#8*0] sub x27,x3,x1 // done yet? sub x16,x3,x5 // rewinded np ldp x8,x9,[x2,#8*2] ldp x10,x11,[x2,#8*4] ldp x12,x13,[x2,#8*6] cbz x27,.Lsqr8x_tail_break ldr x4,[x0,#-8*8] adds x19,x19,x6 adcs x20,x20,x7 ldp x6,x7,[x1,#8*0] adcs x21,x21,x8 adcs x22,x22,x9 ldp x8,x9,[x1,#8*2] adcs x23,x23,x10 adcs x24,x24,x11 ldp x10,x11,[x1,#8*4] adcs x25,x25,x12 mov x27,#-8*8 adcs x26,x26,x13 ldp x12,x13,[x1,#8*6] add x1,x1,#8*8 //adc x28,xzr,xzr // moved above b .Lsqr8x_tail .align 4 .Lsqr8x_tail_break: ldr x4,[x29,#112] // pull n0 add x27,x2,#8*8 // end of current t[num] window subs xzr,x30,#1 // "move" top-most carry to carry bit adcs x14,x19,x6 adcs x15,x20,x7 ldp x19,x20,[x0,#8*0] adcs x21,x21,x8 ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0] adcs x22,x22,x9 ldp x8,x9,[x16,#8*2] adcs x23,x23,x10 adcs x24,x24,x11 ldp x10,x11,[x16,#8*4] adcs x25,x25,x12 adcs x26,x26,x13 ldp x12,x13,[x16,#8*6] add x1,x16,#8*8 adc x30,xzr,xzr // top-most carry mul x28,x4,x19 stp x14,x15,[x2,#8*0] stp x21,x22,[x2,#8*2] ldp x21,x22,[x0,#8*2] stp x23,x24,[x2,#8*4] ldp x23,x24,[x0,#8*4] cmp x27,x29 // did we hit the bottom? stp x25,x26,[x2,#8*6] mov x2,x0 // slide the window ldp x25,x26,[x0,#8*6] mov x27,#8 b.ne .Lsqr8x_reduction // Final step. We see if result is larger than modulus, and // if it is, subtract the modulus. But comparison implies // subtraction. So we subtract modulus, see if it borrowed, // and conditionally copy original value. ldr x0,[x29,#96] // pull rp add x2,x2,#8*8 subs x14,x19,x6 sbcs x15,x20,x7 sub x27,x5,#8*8 mov x3,x0 // x0 copy .Lsqr8x_sub: sbcs x16,x21,x8 ldp x6,x7,[x1,#8*0] sbcs x17,x22,x9 stp x14,x15,[x0,#8*0] sbcs x14,x23,x10 ldp x8,x9,[x1,#8*2] sbcs x15,x24,x11 stp x16,x17,[x0,#8*2] sbcs x16,x25,x12 ldp x10,x11,[x1,#8*4] sbcs x17,x26,x13 ldp x12,x13,[x1,#8*6] add x1,x1,#8*8 ldp x19,x20,[x2,#8*0] sub x27,x27,#8*8 ldp x21,x22,[x2,#8*2] ldp x23,x24,[x2,#8*4] ldp x25,x26,[x2,#8*6] add x2,x2,#8*8 stp x14,x15,[x0,#8*4] sbcs x14,x19,x6 stp x16,x17,[x0,#8*6] add x0,x0,#8*8 sbcs x15,x20,x7 cbnz x27,.Lsqr8x_sub sbcs x16,x21,x8 mov x2,sp add x1,sp,x5 ldp x6,x7,[x3,#8*0] sbcs x17,x22,x9 stp x14,x15,[x0,#8*0] sbcs x14,x23,x10 ldp x8,x9,[x3,#8*2] sbcs x15,x24,x11 stp x16,x17,[x0,#8*2] sbcs x16,x25,x12 ldp x19,x20,[x1,#8*0] sbcs x17,x26,x13 ldp x21,x22,[x1,#8*2] sbcs xzr,x30,xzr // did it borrow? 
ldr x30,[x29,#8] // pull return address stp x14,x15,[x0,#8*4] stp x16,x17,[x0,#8*6] sub x27,x5,#8*4 .Lsqr4x_cond_copy: sub x27,x27,#8*4 csel x14,x19,x6,lo stp xzr,xzr,[x2,#8*0] csel x15,x20,x7,lo ldp x6,x7,[x3,#8*4] ldp x19,x20,[x1,#8*4] csel x16,x21,x8,lo stp xzr,xzr,[x2,#8*2] add x2,x2,#8*4 csel x17,x22,x9,lo ldp x8,x9,[x3,#8*6] ldp x21,x22,[x1,#8*6] add x1,x1,#8*4 stp x14,x15,[x3,#8*0] stp x16,x17,[x3,#8*2] add x3,x3,#8*4 stp xzr,xzr,[x1,#8*0] stp xzr,xzr,[x1,#8*2] cbnz x27,.Lsqr4x_cond_copy csel x14,x19,x6,lo stp xzr,xzr,[x2,#8*0] csel x15,x20,x7,lo stp xzr,xzr,[x2,#8*2] csel x16,x21,x8,lo csel x17,x22,x9,lo stp x14,x15,[x3,#8*0] stp x16,x17,[x3,#8*2] b .Lsqr8x_done .align 4 .Lsqr8x8_post_condition: adc x28,xzr,xzr ldr x30,[x29,#8] // pull return address // x19-7,x28 hold result, x6-7 hold modulus subs x6,x19,x6 ldr x1,[x29,#96] // pull rp sbcs x7,x20,x7 stp xzr,xzr,[sp,#8*0] sbcs x8,x21,x8 stp xzr,xzr,[sp,#8*2] sbcs x9,x22,x9 stp xzr,xzr,[sp,#8*4] sbcs x10,x23,x10 stp xzr,xzr,[sp,#8*6] sbcs x11,x24,x11 stp xzr,xzr,[sp,#8*8] sbcs x12,x25,x12 stp xzr,xzr,[sp,#8*10] sbcs x13,x26,x13 stp xzr,xzr,[sp,#8*12] sbcs x28,x28,xzr // did it borrow? stp xzr,xzr,[sp,#8*14] // x6-7 hold result-modulus csel x6,x19,x6,lo csel x7,x20,x7,lo csel x8,x21,x8,lo csel x9,x22,x9,lo stp x6,x7,[x1,#8*0] csel x10,x23,x10,lo csel x11,x24,x11,lo stp x8,x9,[x1,#8*2] csel x12,x25,x12,lo csel x13,x26,x13,lo stp x10,x11,[x1,#8*4] stp x12,x13,[x1,#8*6] .Lsqr8x_done: ldp x19,x20,[x29,#16] mov sp,x29 ldp x21,x22,[x29,#32] mov x0,#1 ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldr x29,[sp],#128 // x30 is popped earlier AARCH64_VALIDATE_LINK_REGISTER ret .size bn_sqr8x_mont,.-bn_sqr8x_mont .globl bn_mul4x_mont .hidden bn_mul4x_mont .type bn_mul4x_mont,%function .align 5 bn_mul4x_mont: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-128]! 
add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] sub x26,sp,x5,lsl#3 lsl x5,x5,#3 ldr x4,[x4] // *n0 sub sp,x26,#8*4 // alloca add x10,x2,x5 add x27,x1,x5 stp x0,x10,[x29,#96] // offload rp and &b[num] ldr x24,[x2,#8*0] // b[0] ldp x6,x7,[x1,#8*0] // a[0..3] ldp x8,x9,[x1,#8*2] add x1,x1,#8*4 mov x19,xzr mov x20,xzr mov x21,xzr mov x22,xzr ldp x14,x15,[x3,#8*0] // n[0..3] ldp x16,x17,[x3,#8*2] adds x3,x3,#8*4 // clear carry bit mov x0,xzr mov x28,#0 mov x26,sp .Loop_mul4x_1st_reduction: mul x10,x6,x24 // lo(a[0..3]*b[0]) adc x0,x0,xzr // modulo-scheduled mul x11,x7,x24 add x28,x28,#8 mul x12,x8,x24 and x28,x28,#31 mul x13,x9,x24 adds x19,x19,x10 umulh x10,x6,x24 // hi(a[0..3]*b[0]) adcs x20,x20,x11 mul x25,x19,x4 // t[0]*n0 adcs x21,x21,x12 umulh x11,x7,x24 adcs x22,x22,x13 umulh x12,x8,x24 adc x23,xzr,xzr umulh x13,x9,x24 ldr x24,[x2,x28] // next b[i] (or b[0]) adds x20,x20,x10 // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0) str x25,[x26],#8 // put aside t[0]*n0 for tail processing adcs x21,x21,x11 mul x11,x15,x25 adcs x22,x22,x12 mul x12,x16,x25 adc x23,x23,x13 // can't overflow mul x13,x17,x25 // (*) adds xzr,x19,x10 subs xzr,x19,#1 // (*) umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0) adcs x19,x20,x11 umulh x11,x15,x25 adcs x20,x21,x12 umulh x12,x16,x25 adcs x21,x22,x13 umulh x13,x17,x25 adcs x22,x23,x0 adc x0,xzr,xzr adds x19,x19,x10 sub x10,x27,x1 adcs x20,x20,x11 adcs x21,x21,x12 adcs x22,x22,x13 //adc x0,x0,xzr cbnz x28,.Loop_mul4x_1st_reduction cbz x10,.Lmul4x4_post_condition ldp x6,x7,[x1,#8*0] // a[4..7] ldp x8,x9,[x1,#8*2] add x1,x1,#8*4 ldr x25,[sp] // a[0]*n0 ldp x14,x15,[x3,#8*0] // n[4..7] ldp x16,x17,[x3,#8*2] add x3,x3,#8*4 .Loop_mul4x_1st_tail: mul x10,x6,x24 // lo(a[4..7]*b[i]) adc x0,x0,xzr // modulo-scheduled mul x11,x7,x24 add x28,x28,#8 mul x12,x8,x24 and x28,x28,#31 mul x13,x9,x24 adds x19,x19,x10 umulh x10,x6,x24 // hi(a[4..7]*b[i]) adcs x20,x20,x11 umulh x11,x7,x24 adcs x21,x21,x12 umulh x12,x8,x24 adcs x22,x22,x13 umulh x13,x9,x24 adc x23,xzr,xzr ldr x24,[x2,x28] // next b[i] (or b[0]) adds x20,x20,x10 mul x10,x14,x25 // lo(n[4..7]*a[0]*n0) adcs x21,x21,x11 mul x11,x15,x25 adcs x22,x22,x12 mul x12,x16,x25 adc x23,x23,x13 // can't overflow mul x13,x17,x25 adds x19,x19,x10 umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0) adcs x20,x20,x11 umulh x11,x15,x25 adcs x21,x21,x12 umulh x12,x16,x25 adcs x22,x22,x13 adcs x23,x23,x0 umulh x13,x17,x25 adc x0,xzr,xzr ldr x25,[sp,x28] // next t[0]*n0 str x19,[x26],#8 // result!!! adds x19,x20,x10 sub x10,x27,x1 // done yet? adcs x20,x21,x11 adcs x21,x22,x12 adcs x22,x23,x13 //adc x0,x0,xzr cbnz x28,.Loop_mul4x_1st_tail sub x11,x27,x5 // rewinded x1 cbz x10,.Lmul4x_proceed ldp x6,x7,[x1,#8*0] ldp x8,x9,[x1,#8*2] add x1,x1,#8*4 ldp x14,x15,[x3,#8*0] ldp x16,x17,[x3,#8*2] add x3,x3,#8*4 b .Loop_mul4x_1st_tail .align 5 .Lmul4x_proceed: ldr x24,[x2,#8*4]! // *++b adc x30,x0,xzr ldp x6,x7,[x11,#8*0] // a[0..3] sub x3,x3,x5 // rewind np ldp x8,x9,[x11,#8*2] add x1,x11,#8*4 stp x19,x20,[x26,#8*0] // result!!! ldp x19,x20,[sp,#8*4] // t[0..3] stp x21,x22,[x26,#8*2] // result!!! 
ldp x21,x22,[sp,#8*6] ldp x14,x15,[x3,#8*0] // n[0..3] mov x26,sp ldp x16,x17,[x3,#8*2] adds x3,x3,#8*4 // clear carry bit mov x0,xzr .align 4 .Loop_mul4x_reduction: mul x10,x6,x24 // lo(a[0..3]*b[4]) adc x0,x0,xzr // modulo-scheduled mul x11,x7,x24 add x28,x28,#8 mul x12,x8,x24 and x28,x28,#31 mul x13,x9,x24 adds x19,x19,x10 umulh x10,x6,x24 // hi(a[0..3]*b[4]) adcs x20,x20,x11 mul x25,x19,x4 // t[0]*n0 adcs x21,x21,x12 umulh x11,x7,x24 adcs x22,x22,x13 umulh x12,x8,x24 adc x23,xzr,xzr umulh x13,x9,x24 ldr x24,[x2,x28] // next b[i] adds x20,x20,x10 // (*) mul x10,x14,x25 str x25,[x26],#8 // put aside t[0]*n0 for tail processing adcs x21,x21,x11 mul x11,x15,x25 // lo(n[0..3]*t[0]*n0 adcs x22,x22,x12 mul x12,x16,x25 adc x23,x23,x13 // can't overflow mul x13,x17,x25 // (*) adds xzr,x19,x10 subs xzr,x19,#1 // (*) umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0 adcs x19,x20,x11 umulh x11,x15,x25 adcs x20,x21,x12 umulh x12,x16,x25 adcs x21,x22,x13 umulh x13,x17,x25 adcs x22,x23,x0 adc x0,xzr,xzr adds x19,x19,x10 adcs x20,x20,x11 adcs x21,x21,x12 adcs x22,x22,x13 //adc x0,x0,xzr cbnz x28,.Loop_mul4x_reduction adc x0,x0,xzr ldp x10,x11,[x26,#8*4] // t[4..7] ldp x12,x13,[x26,#8*6] ldp x6,x7,[x1,#8*0] // a[4..7] ldp x8,x9,[x1,#8*2] add x1,x1,#8*4 adds x19,x19,x10 adcs x20,x20,x11 adcs x21,x21,x12 adcs x22,x22,x13 //adc x0,x0,xzr ldr x25,[sp] // t[0]*n0 ldp x14,x15,[x3,#8*0] // n[4..7] ldp x16,x17,[x3,#8*2] add x3,x3,#8*4 .align 4 .Loop_mul4x_tail: mul x10,x6,x24 // lo(a[4..7]*b[4]) adc x0,x0,xzr // modulo-scheduled mul x11,x7,x24 add x28,x28,#8 mul x12,x8,x24 and x28,x28,#31 mul x13,x9,x24 adds x19,x19,x10 umulh x10,x6,x24 // hi(a[4..7]*b[4]) adcs x20,x20,x11 umulh x11,x7,x24 adcs x21,x21,x12 umulh x12,x8,x24 adcs x22,x22,x13 umulh x13,x9,x24 adc x23,xzr,xzr ldr x24,[x2,x28] // next b[i] adds x20,x20,x10 mul x10,x14,x25 // lo(n[4..7]*t[0]*n0) adcs x21,x21,x11 mul x11,x15,x25 adcs x22,x22,x12 mul x12,x16,x25 adc x23,x23,x13 // can't overflow mul x13,x17,x25 adds x19,x19,x10 umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0) adcs x20,x20,x11 umulh x11,x15,x25 adcs x21,x21,x12 umulh x12,x16,x25 adcs x22,x22,x13 umulh x13,x17,x25 adcs x23,x23,x0 ldr x25,[sp,x28] // next a[0]*n0 adc x0,xzr,xzr str x19,[x26],#8 // result!!! adds x19,x20,x10 sub x10,x27,x1 // done yet? adcs x20,x21,x11 adcs x21,x22,x12 adcs x22,x23,x13 //adc x0,x0,xzr cbnz x28,.Loop_mul4x_tail sub x11,x3,x5 // rewinded np? adc x0,x0,xzr cbz x10,.Loop_mul4x_break ldp x10,x11,[x26,#8*4] ldp x12,x13,[x26,#8*6] ldp x6,x7,[x1,#8*0] ldp x8,x9,[x1,#8*2] add x1,x1,#8*4 adds x19,x19,x10 adcs x20,x20,x11 adcs x21,x21,x12 adcs x22,x22,x13 //adc x0,x0,xzr ldp x14,x15,[x3,#8*0] ldp x16,x17,[x3,#8*2] add x3,x3,#8*4 b .Loop_mul4x_tail .align 4 .Loop_mul4x_break: ldp x12,x13,[x29,#96] // pull rp and &b[num] adds x19,x19,x30 add x2,x2,#8*4 // bp++ adcs x20,x20,xzr sub x1,x1,x5 // rewind ap adcs x21,x21,xzr stp x19,x20,[x26,#8*0] // result!!! adcs x22,x22,xzr ldp x19,x20,[sp,#8*4] // t[0..3] adc x30,x0,xzr stp x21,x22,[x26,#8*2] // result!!! cmp x2,x13 // done yet? ldp x21,x22,[sp,#8*6] ldp x14,x15,[x11,#8*0] // n[0..3] ldp x16,x17,[x11,#8*2] add x3,x11,#8*4 b.eq .Lmul4x_post ldr x24,[x2] ldp x6,x7,[x1,#8*0] // a[0..3] ldp x8,x9,[x1,#8*2] adds x1,x1,#8*4 // clear carry bit mov x0,xzr mov x26,sp b .Loop_mul4x_reduction .align 4 .Lmul4x_post: // Final step. We see if result is larger than modulus, and // if it is, subtract the modulus. But comparison implies // subtraction. So we subtract modulus, see if it borrowed, // and conditionally copy original value. 
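The constant-time select that follows (the sbcs chain plus csel ...,lo) is easier to follow in higher-level form. Below is a minimal Rust sketch of the same "subtract the modulus, check the borrow, keep the original value on borrow" step, assuming little-endian 64-bit limbs; the function name and signature are hypothetical illustrations and are not part of ring's API or of the generated file.

    // A sketch of the final Montgomery step, assuming little-endian u64 limbs.
    // `t` is the unreduced result, `n` the modulus, `top_carry` the extra
    // carry word. Names are hypothetical, for illustration only.
    fn mont_reduce_final(t: &[u64], n: &[u64], top_carry: u64) -> Vec<u64> {
        // diff = t - n, limb by limb, tracking the borrow (the sbcs chain).
        let mut diff = vec![0u64; t.len()];
        let mut borrow = 0u64;
        for i in 0..t.len() {
            let (d, b1) = t[i].overflowing_sub(n[i]);
            let (d, b2) = d.overflowing_sub(borrow);
            diff[i] = d;
            borrow = (b1 | b2) as u64;
        }
        // Fold the running borrow into the top carry word ("did it borrow?").
        let (_, borrowed) = top_carry.overflowing_sub(borrow);
        // Branchless select, mirroring the csel ...,lo instructions: keep the
        // original limbs when the subtraction borrowed (t < n), otherwise
        // take the reduced difference.
        let mask = (borrowed as u64).wrapping_neg();
        (0..t.len()).map(|i| (t[i] & mask) | (diff[i] & !mask)).collect()
    }

The mask-based select plays the role of the conditional-copy loop in the assembly: both branches are computed unconditionally and the choice is made without a data-dependent branch.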
mov x0,x12 mov x27,x12 // x0 copy subs x10,x19,x14 add x26,sp,#8*8 sbcs x11,x20,x15 sub x28,x5,#8*4 .Lmul4x_sub: sbcs x12,x21,x16 ldp x14,x15,[x3,#8*0] sub x28,x28,#8*4 ldp x19,x20,[x26,#8*0] sbcs x13,x22,x17 ldp x16,x17,[x3,#8*2] add x3,x3,#8*4 ldp x21,x22,[x26,#8*2] add x26,x26,#8*4 stp x10,x11,[x0,#8*0] sbcs x10,x19,x14 stp x12,x13,[x0,#8*2] add x0,x0,#8*4 sbcs x11,x20,x15 cbnz x28,.Lmul4x_sub sbcs x12,x21,x16 mov x26,sp add x1,sp,#8*4 ldp x6,x7,[x27,#8*0] sbcs x13,x22,x17 stp x10,x11,[x0,#8*0] ldp x8,x9,[x27,#8*2] stp x12,x13,[x0,#8*2] ldp x19,x20,[x1,#8*0] ldp x21,x22,[x1,#8*2] sbcs xzr,x30,xzr // did it borrow? ldr x30,[x29,#8] // pull return address sub x28,x5,#8*4 .Lmul4x_cond_copy: sub x28,x28,#8*4 csel x10,x19,x6,lo stp xzr,xzr,[x26,#8*0] csel x11,x20,x7,lo ldp x6,x7,[x27,#8*4] ldp x19,x20,[x1,#8*4] csel x12,x21,x8,lo stp xzr,xzr,[x26,#8*2] add x26,x26,#8*4 csel x13,x22,x9,lo ldp x8,x9,[x27,#8*6] ldp x21,x22,[x1,#8*6] add x1,x1,#8*4 stp x10,x11,[x27,#8*0] stp x12,x13,[x27,#8*2] add x27,x27,#8*4 cbnz x28,.Lmul4x_cond_copy csel x10,x19,x6,lo stp xzr,xzr,[x26,#8*0] csel x11,x20,x7,lo stp xzr,xzr,[x26,#8*2] csel x12,x21,x8,lo stp xzr,xzr,[x26,#8*3] csel x13,x22,x9,lo stp xzr,xzr,[x26,#8*4] stp x10,x11,[x27,#8*0] stp x12,x13,[x27,#8*2] b .Lmul4x_done .align 4 .Lmul4x4_post_condition: adc x0,x0,xzr ldr x1,[x29,#96] // pull rp // x19-3,x0 hold result, x14-7 hold modulus subs x6,x19,x14 ldr x30,[x29,#8] // pull return address sbcs x7,x20,x15 stp xzr,xzr,[sp,#8*0] sbcs x8,x21,x16 stp xzr,xzr,[sp,#8*2] sbcs x9,x22,x17 stp xzr,xzr,[sp,#8*4] sbcs xzr,x0,xzr // did it borrow? stp xzr,xzr,[sp,#8*6] // x6-3 hold result-modulus csel x6,x19,x6,lo csel x7,x20,x7,lo csel x8,x21,x8,lo csel x9,x22,x9,lo stp x6,x7,[x1,#8*0] stp x8,x9,[x1,#8*2] .Lmul4x_done: ldp x19,x20,[x29,#16] mov sp,x29 ldp x21,x22,[x29,#32] mov x0,#1 ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldr x29,[sp],#128 // x30 is popped earlier AARCH64_VALIDATE_LINK_REGISTER ret .size bn_mul4x_mont,.-bn_mul4x_mont .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 4 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) ring-0.17.14/pregenerated/armv8-mont-win64.S000064400000000000000000000736721046102023000165220ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) .text .globl bn_mul_mont_nohw .def bn_mul_mont_nohw .type 32 .endef .align 5 bn_mul_mont_nohw: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-64]! add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] ldr x9,[x2],#8 // bp[0] sub x22,sp,x5,lsl#3 ldp x7,x8,[x1],#16 // ap[0..1] lsl x5,x5,#3 ldr x4,[x4] // *n0 and x22,x22,#-16 // ABI says so ldp x13,x14,[x3],#16 // np[0..1] mul x6,x7,x9 // ap[0]*bp[0] sub x21,x5,#16 // j=num-2 umulh x7,x7,x9 mul x10,x8,x9 // ap[1]*bp[0] umulh x11,x8,x9 mul x15,x6,x4 // "tp[0]"*n0 mov sp,x22 // alloca // (*) mul x12,x13,x15 // np[0]*m1 umulh x13,x13,x15 mul x16,x14,x15 // np[1]*m1 // (*) adds x12,x12,x6 // discarded // (*) As for removal of first multiplication and addition // instructions. 
The outcome of first addition is // guaranteed to be zero, which leaves two computationally // significant outcomes: it either carries or not. Then // question is when does it carry? Is there alternative // way to deduce it? If you follow operations, you can // observe that condition for carry is quite simple: // x6 being non-zero. So that carry can be calculated // by adding -1 to x6. That's what next instruction does. subs xzr,x6,#1 // (*) umulh x17,x14,x15 adc x13,x13,xzr cbz x21,L1st_skip L1st: ldr x8,[x1],#8 adds x6,x10,x7 sub x21,x21,#8 // j-- adc x7,x11,xzr ldr x14,[x3],#8 adds x12,x16,x13 mul x10,x8,x9 // ap[j]*bp[0] adc x13,x17,xzr umulh x11,x8,x9 adds x12,x12,x6 mul x16,x14,x15 // np[j]*m1 adc x13,x13,xzr umulh x17,x14,x15 str x12,[x22],#8 // tp[j-1] cbnz x21,L1st L1st_skip: adds x6,x10,x7 sub x1,x1,x5 // rewind x1 adc x7,x11,xzr adds x12,x16,x13 sub x3,x3,x5 // rewind x3 adc x13,x17,xzr adds x12,x12,x6 sub x20,x5,#8 // i=num-1 adcs x13,x13,x7 adc x19,xzr,xzr // upmost overflow bit stp x12,x13,[x22] Louter: ldr x9,[x2],#8 // bp[i] ldp x7,x8,[x1],#16 ldr x23,[sp] // tp[0] add x22,sp,#8 mul x6,x7,x9 // ap[0]*bp[i] sub x21,x5,#16 // j=num-2 umulh x7,x7,x9 ldp x13,x14,[x3],#16 mul x10,x8,x9 // ap[1]*bp[i] adds x6,x6,x23 umulh x11,x8,x9 adc x7,x7,xzr mul x15,x6,x4 sub x20,x20,#8 // i-- // (*) mul x12,x13,x15 // np[0]*m1 umulh x13,x13,x15 mul x16,x14,x15 // np[1]*m1 // (*) adds x12,x12,x6 subs xzr,x6,#1 // (*) umulh x17,x14,x15 cbz x21,Linner_skip Linner: ldr x8,[x1],#8 adc x13,x13,xzr ldr x23,[x22],#8 // tp[j] adds x6,x10,x7 sub x21,x21,#8 // j-- adc x7,x11,xzr adds x12,x16,x13 ldr x14,[x3],#8 adc x13,x17,xzr mul x10,x8,x9 // ap[j]*bp[i] adds x6,x6,x23 umulh x11,x8,x9 adc x7,x7,xzr mul x16,x14,x15 // np[j]*m1 adds x12,x12,x6 umulh x17,x14,x15 str x12,[x22,#-16] // tp[j-1] cbnz x21,Linner Linner_skip: ldr x23,[x22],#8 // tp[j] adc x13,x13,xzr adds x6,x10,x7 sub x1,x1,x5 // rewind x1 adc x7,x11,xzr adds x12,x16,x13 sub x3,x3,x5 // rewind x3 adcs x13,x17,x19 adc x19,xzr,xzr adds x6,x6,x23 adc x7,x7,xzr adds x12,x12,x6 adcs x13,x13,x7 adc x19,x19,xzr // upmost overflow bit stp x12,x13,[x22,#-16] cbnz x20,Louter // Final step. We see if result is larger than modulus, and // if it is, subtract the modulus. But comparison implies // subtraction. So we subtract modulus, see if it borrowed, // and conditionally copy original value. ldr x23,[sp] // tp[0] add x22,sp,#8 ldr x14,[x3],#8 // np[0] subs x21,x5,#8 // j=num-1 and clear borrow mov x1,x0 Lsub: sbcs x8,x23,x14 // tp[j]-np[j] ldr x23,[x22],#8 sub x21,x21,#8 // j-- ldr x14,[x3],#8 str x8,[x1],#8 // rp[j]=tp[j]-np[j] cbnz x21,Lsub sbcs x8,x23,x14 sbcs x19,x19,xzr // did it borrow? str x8,[x1],#8 // rp[num-1] ldr x23,[sp] // tp[0] add x22,sp,#8 ldr x8,[x0],#8 // rp[0] sub x5,x5,#8 // num-- nop Lcond_copy: sub x5,x5,#8 // num-- csel x14,x23,x8,lo // did it borrow? ldr x23,[x22],#8 ldr x8,[x0],#8 str xzr,[x22,#-16] // wipe tp str x14,[x0,#-16] cbnz x5,Lcond_copy csel x14,x23,x8,lo str xzr,[x22,#-8] // wipe tp str x14,[x0,#-8] ldp x19,x20,[x29,#16] mov sp,x29 ldp x21,x22,[x29,#32] mov x0,#1 ldp x23,x24,[x29,#48] ldr x29,[sp],#64 AARCH64_VALIDATE_LINK_REGISTER ret .globl bn_sqr8x_mont .def bn_sqr8x_mont .type 32 .endef .align 5 bn_sqr8x_mont: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-128]! 
add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] stp x0,x3,[sp,#96] // offload rp and np ldp x6,x7,[x1,#8*0] ldp x8,x9,[x1,#8*2] ldp x10,x11,[x1,#8*4] ldp x12,x13,[x1,#8*6] sub x2,sp,x5,lsl#4 lsl x5,x5,#3 ldr x4,[x4] // *n0 mov sp,x2 // alloca sub x27,x5,#8*8 b Lsqr8x_zero_start Lsqr8x_zero: sub x27,x27,#8*8 stp xzr,xzr,[x2,#8*0] stp xzr,xzr,[x2,#8*2] stp xzr,xzr,[x2,#8*4] stp xzr,xzr,[x2,#8*6] Lsqr8x_zero_start: stp xzr,xzr,[x2,#8*8] stp xzr,xzr,[x2,#8*10] stp xzr,xzr,[x2,#8*12] stp xzr,xzr,[x2,#8*14] add x2,x2,#8*16 cbnz x27,Lsqr8x_zero add x3,x1,x5 add x1,x1,#8*8 mov x19,xzr mov x20,xzr mov x21,xzr mov x22,xzr mov x23,xzr mov x24,xzr mov x25,xzr mov x26,xzr mov x2,sp str x4,[x29,#112] // offload n0 // Multiply everything but a[i]*a[i] .align 4 Lsqr8x_outer_loop: // a[1]a[0] (i) // a[2]a[0] // a[3]a[0] // a[4]a[0] // a[5]a[0] // a[6]a[0] // a[7]a[0] // a[2]a[1] (ii) // a[3]a[1] // a[4]a[1] // a[5]a[1] // a[6]a[1] // a[7]a[1] // a[3]a[2] (iii) // a[4]a[2] // a[5]a[2] // a[6]a[2] // a[7]a[2] // a[4]a[3] (iv) // a[5]a[3] // a[6]a[3] // a[7]a[3] // a[5]a[4] (v) // a[6]a[4] // a[7]a[4] // a[6]a[5] (vi) // a[7]a[5] // a[7]a[6] (vii) mul x14,x7,x6 // lo(a[1..7]*a[0]) (i) mul x15,x8,x6 mul x16,x9,x6 mul x17,x10,x6 adds x20,x20,x14 // t[1]+lo(a[1]*a[0]) mul x14,x11,x6 adcs x21,x21,x15 mul x15,x12,x6 adcs x22,x22,x16 mul x16,x13,x6 adcs x23,x23,x17 umulh x17,x7,x6 // hi(a[1..7]*a[0]) adcs x24,x24,x14 umulh x14,x8,x6 adcs x25,x25,x15 umulh x15,x9,x6 adcs x26,x26,x16 umulh x16,x10,x6 stp x19,x20,[x2],#8*2 // t[0..1] adc x19,xzr,xzr // t[8] adds x21,x21,x17 // t[2]+lo(a[1]*a[0]) umulh x17,x11,x6 adcs x22,x22,x14 umulh x14,x12,x6 adcs x23,x23,x15 umulh x15,x13,x6 adcs x24,x24,x16 mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii) adcs x25,x25,x17 mul x17,x9,x7 adcs x26,x26,x14 mul x14,x10,x7 adc x19,x19,x15 mul x15,x11,x7 adds x22,x22,x16 mul x16,x12,x7 adcs x23,x23,x17 mul x17,x13,x7 adcs x24,x24,x14 umulh x14,x8,x7 // hi(a[2..7]*a[1]) adcs x25,x25,x15 umulh x15,x9,x7 adcs x26,x26,x16 umulh x16,x10,x7 adcs x19,x19,x17 umulh x17,x11,x7 stp x21,x22,[x2],#8*2 // t[2..3] adc x20,xzr,xzr // t[9] adds x23,x23,x14 umulh x14,x12,x7 adcs x24,x24,x15 umulh x15,x13,x7 adcs x25,x25,x16 mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii) adcs x26,x26,x17 mul x17,x10,x8 adcs x19,x19,x14 mul x14,x11,x8 adc x20,x20,x15 mul x15,x12,x8 adds x24,x24,x16 mul x16,x13,x8 adcs x25,x25,x17 umulh x17,x9,x8 // hi(a[3..7]*a[2]) adcs x26,x26,x14 umulh x14,x10,x8 adcs x19,x19,x15 umulh x15,x11,x8 adcs x20,x20,x16 umulh x16,x12,x8 stp x23,x24,[x2],#8*2 // t[4..5] adc x21,xzr,xzr // t[10] adds x25,x25,x17 umulh x17,x13,x8 adcs x26,x26,x14 mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv) adcs x19,x19,x15 mul x15,x11,x9 adcs x20,x20,x16 mul x16,x12,x9 adc x21,x21,x17 mul x17,x13,x9 adds x26,x26,x14 umulh x14,x10,x9 // hi(a[4..7]*a[3]) adcs x19,x19,x15 umulh x15,x11,x9 adcs x20,x20,x16 umulh x16,x12,x9 adcs x21,x21,x17 umulh x17,x13,x9 stp x25,x26,[x2],#8*2 // t[6..7] adc x22,xzr,xzr // t[11] adds x19,x19,x14 mul x14,x11,x10 // lo(a[5..7]*a[4]) (v) adcs x20,x20,x15 mul x15,x12,x10 adcs x21,x21,x16 mul x16,x13,x10 adc x22,x22,x17 umulh x17,x11,x10 // hi(a[5..7]*a[4]) adds x20,x20,x14 umulh x14,x12,x10 adcs x21,x21,x15 umulh x15,x13,x10 adcs x22,x22,x16 mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi) adc x23,xzr,xzr // t[12] adds x21,x21,x17 mul x17,x13,x11 adcs x22,x22,x14 umulh x14,x12,x11 // hi(a[6..7]*a[5]) adc x23,x23,x15 umulh x15,x13,x11 adds x22,x22,x16 mul x16,x13,x12 // lo(a[7]*a[6]) (vii) adcs 
x23,x23,x17 umulh x17,x13,x12 // hi(a[7]*a[6]) adc x24,xzr,xzr // t[13] adds x23,x23,x14 sub x27,x3,x1 // done yet? adc x24,x24,x15 adds x24,x24,x16 sub x14,x3,x5 // rewinded ap adc x25,xzr,xzr // t[14] add x25,x25,x17 cbz x27,Lsqr8x_outer_break mov x4,x6 ldp x6,x7,[x2,#8*0] ldp x8,x9,[x2,#8*2] ldp x10,x11,[x2,#8*4] ldp x12,x13,[x2,#8*6] adds x19,x19,x6 adcs x20,x20,x7 ldp x6,x7,[x1,#8*0] adcs x21,x21,x8 adcs x22,x22,x9 ldp x8,x9,[x1,#8*2] adcs x23,x23,x10 adcs x24,x24,x11 ldp x10,x11,[x1,#8*4] adcs x25,x25,x12 mov x0,x1 adcs x26,xzr,x13 ldp x12,x13,[x1,#8*6] add x1,x1,#8*8 //adc x28,xzr,xzr // moved below mov x27,#-8*8 // a[8]a[0] // a[9]a[0] // a[a]a[0] // a[b]a[0] // a[c]a[0] // a[d]a[0] // a[e]a[0] // a[f]a[0] // a[8]a[1] // a[f]a[1]........................ // a[8]a[2] // a[f]a[2]........................ // a[8]a[3] // a[f]a[3]........................ // a[8]a[4] // a[f]a[4]........................ // a[8]a[5] // a[f]a[5]........................ // a[8]a[6] // a[f]a[6]........................ // a[8]a[7] // a[f]a[7]........................ Lsqr8x_mul: mul x14,x6,x4 adc x28,xzr,xzr // carry bit, modulo-scheduled mul x15,x7,x4 add x27,x27,#8 mul x16,x8,x4 mul x17,x9,x4 adds x19,x19,x14 mul x14,x10,x4 adcs x20,x20,x15 mul x15,x11,x4 adcs x21,x21,x16 mul x16,x12,x4 adcs x22,x22,x17 mul x17,x13,x4 adcs x23,x23,x14 umulh x14,x6,x4 adcs x24,x24,x15 umulh x15,x7,x4 adcs x25,x25,x16 umulh x16,x8,x4 adcs x26,x26,x17 umulh x17,x9,x4 adc x28,x28,xzr str x19,[x2],#8 adds x19,x20,x14 umulh x14,x10,x4 adcs x20,x21,x15 umulh x15,x11,x4 adcs x21,x22,x16 umulh x16,x12,x4 adcs x22,x23,x17 umulh x17,x13,x4 ldr x4,[x0,x27] adcs x23,x24,x14 adcs x24,x25,x15 adcs x25,x26,x16 adcs x26,x28,x17 //adc x28,xzr,xzr // moved above cbnz x27,Lsqr8x_mul // note that carry flag is guaranteed // to be zero at this point cmp x1,x3 // done yet? b.eq Lsqr8x_break ldp x6,x7,[x2,#8*0] ldp x8,x9,[x2,#8*2] ldp x10,x11,[x2,#8*4] ldp x12,x13,[x2,#8*6] adds x19,x19,x6 ldr x4,[x0,#-8*8] adcs x20,x20,x7 ldp x6,x7,[x1,#8*0] adcs x21,x21,x8 adcs x22,x22,x9 ldp x8,x9,[x1,#8*2] adcs x23,x23,x10 adcs x24,x24,x11 ldp x10,x11,[x1,#8*4] adcs x25,x25,x12 mov x27,#-8*8 adcs x26,x26,x13 ldp x12,x13,[x1,#8*6] add x1,x1,#8*8 //adc x28,xzr,xzr // moved above b Lsqr8x_mul .align 4 Lsqr8x_break: ldp x6,x7,[x0,#8*0] add x1,x0,#8*8 ldp x8,x9,[x0,#8*2] sub x14,x3,x1 // is it last iteration? 
ldp x10,x11,[x0,#8*4] sub x15,x2,x14 ldp x12,x13,[x0,#8*6] cbz x14,Lsqr8x_outer_loop stp x19,x20,[x2,#8*0] ldp x19,x20,[x15,#8*0] stp x21,x22,[x2,#8*2] ldp x21,x22,[x15,#8*2] stp x23,x24,[x2,#8*4] ldp x23,x24,[x15,#8*4] stp x25,x26,[x2,#8*6] mov x2,x15 ldp x25,x26,[x15,#8*6] b Lsqr8x_outer_loop .align 4 Lsqr8x_outer_break: // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0] ldp x15,x16,[sp,#8*1] ldp x11,x13,[x14,#8*2] add x1,x14,#8*4 ldp x17,x14,[sp,#8*3] stp x19,x20,[x2,#8*0] mul x19,x7,x7 stp x21,x22,[x2,#8*2] umulh x7,x7,x7 stp x23,x24,[x2,#8*4] mul x8,x9,x9 stp x25,x26,[x2,#8*6] mov x2,sp umulh x9,x9,x9 adds x20,x7,x15,lsl#1 extr x15,x16,x15,#63 sub x27,x5,#8*4 Lsqr4x_shift_n_add: adcs x21,x8,x15 extr x16,x17,x16,#63 sub x27,x27,#8*4 adcs x22,x9,x16 ldp x15,x16,[x2,#8*5] mul x10,x11,x11 ldp x7,x9,[x1],#8*2 umulh x11,x11,x11 mul x12,x13,x13 umulh x13,x13,x13 extr x17,x14,x17,#63 stp x19,x20,[x2,#8*0] adcs x23,x10,x17 extr x14,x15,x14,#63 stp x21,x22,[x2,#8*2] adcs x24,x11,x14 ldp x17,x14,[x2,#8*7] extr x15,x16,x15,#63 adcs x25,x12,x15 extr x16,x17,x16,#63 adcs x26,x13,x16 ldp x15,x16,[x2,#8*9] mul x6,x7,x7 ldp x11,x13,[x1],#8*2 umulh x7,x7,x7 mul x8,x9,x9 umulh x9,x9,x9 stp x23,x24,[x2,#8*4] extr x17,x14,x17,#63 stp x25,x26,[x2,#8*6] add x2,x2,#8*8 adcs x19,x6,x17 extr x14,x15,x14,#63 adcs x20,x7,x14 ldp x17,x14,[x2,#8*3] extr x15,x16,x15,#63 cbnz x27,Lsqr4x_shift_n_add ldp x1,x4,[x29,#104] // pull np and n0 adcs x21,x8,x15 extr x16,x17,x16,#63 adcs x22,x9,x16 ldp x15,x16,[x2,#8*5] mul x10,x11,x11 umulh x11,x11,x11 stp x19,x20,[x2,#8*0] mul x12,x13,x13 umulh x13,x13,x13 stp x21,x22,[x2,#8*2] extr x17,x14,x17,#63 adcs x23,x10,x17 extr x14,x15,x14,#63 ldp x19,x20,[sp,#8*0] adcs x24,x11,x14 extr x15,x16,x15,#63 ldp x6,x7,[x1,#8*0] adcs x25,x12,x15 extr x16,xzr,x16,#63 ldp x8,x9,[x1,#8*2] adc x26,x13,x16 ldp x10,x11,[x1,#8*4] // Reduce by 512 bits per iteration mul x28,x4,x19 // t[0]*n0 ldp x12,x13,[x1,#8*6] add x3,x1,x5 ldp x21,x22,[sp,#8*2] stp x23,x24,[x2,#8*4] ldp x23,x24,[sp,#8*4] stp x25,x26,[x2,#8*6] ldp x25,x26,[sp,#8*6] add x1,x1,#8*8 mov x30,xzr // initial top-most carry mov x2,sp mov x27,#8 Lsqr8x_reduction: // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0) mul x15,x7,x28 sub x27,x27,#1 mul x16,x8,x28 str x28,[x2],#8 // put aside t[0]*n0 for tail processing mul x17,x9,x28 // (*) adds xzr,x19,x14 subs xzr,x19,#1 // (*) mul x14,x10,x28 adcs x19,x20,x15 mul x15,x11,x28 adcs x20,x21,x16 mul x16,x12,x28 adcs x21,x22,x17 mul x17,x13,x28 adcs x22,x23,x14 umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0) adcs x23,x24,x15 umulh x15,x7,x28 adcs x24,x25,x16 umulh x16,x8,x28 adcs x25,x26,x17 umulh x17,x9,x28 adc x26,xzr,xzr adds x19,x19,x14 umulh x14,x10,x28 adcs x20,x20,x15 umulh x15,x11,x28 adcs x21,x21,x16 umulh x16,x12,x28 adcs x22,x22,x17 umulh x17,x13,x28 mul x28,x4,x19 // next t[0]*n0 adcs x23,x23,x14 adcs x24,x24,x15 adcs x25,x25,x16 adc x26,x26,x17 cbnz x27,Lsqr8x_reduction ldp x14,x15,[x2,#8*0] ldp x16,x17,[x2,#8*2] mov x0,x2 sub x27,x3,x1 // done yet? 
adds x19,x19,x14 adcs x20,x20,x15 ldp x14,x15,[x2,#8*4] adcs x21,x21,x16 adcs x22,x22,x17 ldp x16,x17,[x2,#8*6] adcs x23,x23,x14 adcs x24,x24,x15 adcs x25,x25,x16 adcs x26,x26,x17 //adc x28,xzr,xzr // moved below cbz x27,Lsqr8x8_post_condition ldr x4,[x2,#-8*8] ldp x6,x7,[x1,#8*0] ldp x8,x9,[x1,#8*2] ldp x10,x11,[x1,#8*4] mov x27,#-8*8 ldp x12,x13,[x1,#8*6] add x1,x1,#8*8 Lsqr8x_tail: mul x14,x6,x4 adc x28,xzr,xzr // carry bit, modulo-scheduled mul x15,x7,x4 add x27,x27,#8 mul x16,x8,x4 mul x17,x9,x4 adds x19,x19,x14 mul x14,x10,x4 adcs x20,x20,x15 mul x15,x11,x4 adcs x21,x21,x16 mul x16,x12,x4 adcs x22,x22,x17 mul x17,x13,x4 adcs x23,x23,x14 umulh x14,x6,x4 adcs x24,x24,x15 umulh x15,x7,x4 adcs x25,x25,x16 umulh x16,x8,x4 adcs x26,x26,x17 umulh x17,x9,x4 adc x28,x28,xzr str x19,[x2],#8 adds x19,x20,x14 umulh x14,x10,x4 adcs x20,x21,x15 umulh x15,x11,x4 adcs x21,x22,x16 umulh x16,x12,x4 adcs x22,x23,x17 umulh x17,x13,x4 ldr x4,[x0,x27] adcs x23,x24,x14 adcs x24,x25,x15 adcs x25,x26,x16 adcs x26,x28,x17 //adc x28,xzr,xzr // moved above cbnz x27,Lsqr8x_tail // note that carry flag is guaranteed // to be zero at this point ldp x6,x7,[x2,#8*0] sub x27,x3,x1 // done yet? sub x16,x3,x5 // rewinded np ldp x8,x9,[x2,#8*2] ldp x10,x11,[x2,#8*4] ldp x12,x13,[x2,#8*6] cbz x27,Lsqr8x_tail_break ldr x4,[x0,#-8*8] adds x19,x19,x6 adcs x20,x20,x7 ldp x6,x7,[x1,#8*0] adcs x21,x21,x8 adcs x22,x22,x9 ldp x8,x9,[x1,#8*2] adcs x23,x23,x10 adcs x24,x24,x11 ldp x10,x11,[x1,#8*4] adcs x25,x25,x12 mov x27,#-8*8 adcs x26,x26,x13 ldp x12,x13,[x1,#8*6] add x1,x1,#8*8 //adc x28,xzr,xzr // moved above b Lsqr8x_tail .align 4 Lsqr8x_tail_break: ldr x4,[x29,#112] // pull n0 add x27,x2,#8*8 // end of current t[num] window subs xzr,x30,#1 // "move" top-most carry to carry bit adcs x14,x19,x6 adcs x15,x20,x7 ldp x19,x20,[x0,#8*0] adcs x21,x21,x8 ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0] adcs x22,x22,x9 ldp x8,x9,[x16,#8*2] adcs x23,x23,x10 adcs x24,x24,x11 ldp x10,x11,[x16,#8*4] adcs x25,x25,x12 adcs x26,x26,x13 ldp x12,x13,[x16,#8*6] add x1,x16,#8*8 adc x30,xzr,xzr // top-most carry mul x28,x4,x19 stp x14,x15,[x2,#8*0] stp x21,x22,[x2,#8*2] ldp x21,x22,[x0,#8*2] stp x23,x24,[x2,#8*4] ldp x23,x24,[x0,#8*4] cmp x27,x29 // did we hit the bottom? stp x25,x26,[x2,#8*6] mov x2,x0 // slide the window ldp x25,x26,[x0,#8*6] mov x27,#8 b.ne Lsqr8x_reduction // Final step. We see if result is larger than modulus, and // if it is, subtract the modulus. But comparison implies // subtraction. So we subtract modulus, see if it borrowed, // and conditionally copy original value. ldr x0,[x29,#96] // pull rp add x2,x2,#8*8 subs x14,x19,x6 sbcs x15,x20,x7 sub x27,x5,#8*8 mov x3,x0 // x0 copy Lsqr8x_sub: sbcs x16,x21,x8 ldp x6,x7,[x1,#8*0] sbcs x17,x22,x9 stp x14,x15,[x0,#8*0] sbcs x14,x23,x10 ldp x8,x9,[x1,#8*2] sbcs x15,x24,x11 stp x16,x17,[x0,#8*2] sbcs x16,x25,x12 ldp x10,x11,[x1,#8*4] sbcs x17,x26,x13 ldp x12,x13,[x1,#8*6] add x1,x1,#8*8 ldp x19,x20,[x2,#8*0] sub x27,x27,#8*8 ldp x21,x22,[x2,#8*2] ldp x23,x24,[x2,#8*4] ldp x25,x26,[x2,#8*6] add x2,x2,#8*8 stp x14,x15,[x0,#8*4] sbcs x14,x19,x6 stp x16,x17,[x0,#8*6] add x0,x0,#8*8 sbcs x15,x20,x7 cbnz x27,Lsqr8x_sub sbcs x16,x21,x8 mov x2,sp add x1,sp,x5 ldp x6,x7,[x3,#8*0] sbcs x17,x22,x9 stp x14,x15,[x0,#8*0] sbcs x14,x23,x10 ldp x8,x9,[x3,#8*2] sbcs x15,x24,x11 stp x16,x17,[x0,#8*2] sbcs x16,x25,x12 ldp x19,x20,[x1,#8*0] sbcs x17,x26,x13 ldp x21,x22,[x1,#8*2] sbcs xzr,x30,xzr // did it borrow? 
ldr x30,[x29,#8] // pull return address stp x14,x15,[x0,#8*4] stp x16,x17,[x0,#8*6] sub x27,x5,#8*4 Lsqr4x_cond_copy: sub x27,x27,#8*4 csel x14,x19,x6,lo stp xzr,xzr,[x2,#8*0] csel x15,x20,x7,lo ldp x6,x7,[x3,#8*4] ldp x19,x20,[x1,#8*4] csel x16,x21,x8,lo stp xzr,xzr,[x2,#8*2] add x2,x2,#8*4 csel x17,x22,x9,lo ldp x8,x9,[x3,#8*6] ldp x21,x22,[x1,#8*6] add x1,x1,#8*4 stp x14,x15,[x3,#8*0] stp x16,x17,[x3,#8*2] add x3,x3,#8*4 stp xzr,xzr,[x1,#8*0] stp xzr,xzr,[x1,#8*2] cbnz x27,Lsqr4x_cond_copy csel x14,x19,x6,lo stp xzr,xzr,[x2,#8*0] csel x15,x20,x7,lo stp xzr,xzr,[x2,#8*2] csel x16,x21,x8,lo csel x17,x22,x9,lo stp x14,x15,[x3,#8*0] stp x16,x17,[x3,#8*2] b Lsqr8x_done .align 4 Lsqr8x8_post_condition: adc x28,xzr,xzr ldr x30,[x29,#8] // pull return address // x19-7,x28 hold result, x6-7 hold modulus subs x6,x19,x6 ldr x1,[x29,#96] // pull rp sbcs x7,x20,x7 stp xzr,xzr,[sp,#8*0] sbcs x8,x21,x8 stp xzr,xzr,[sp,#8*2] sbcs x9,x22,x9 stp xzr,xzr,[sp,#8*4] sbcs x10,x23,x10 stp xzr,xzr,[sp,#8*6] sbcs x11,x24,x11 stp xzr,xzr,[sp,#8*8] sbcs x12,x25,x12 stp xzr,xzr,[sp,#8*10] sbcs x13,x26,x13 stp xzr,xzr,[sp,#8*12] sbcs x28,x28,xzr // did it borrow? stp xzr,xzr,[sp,#8*14] // x6-7 hold result-modulus csel x6,x19,x6,lo csel x7,x20,x7,lo csel x8,x21,x8,lo csel x9,x22,x9,lo stp x6,x7,[x1,#8*0] csel x10,x23,x10,lo csel x11,x24,x11,lo stp x8,x9,[x1,#8*2] csel x12,x25,x12,lo csel x13,x26,x13,lo stp x10,x11,[x1,#8*4] stp x12,x13,[x1,#8*6] Lsqr8x_done: ldp x19,x20,[x29,#16] mov sp,x29 ldp x21,x22,[x29,#32] mov x0,#1 ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldr x29,[sp],#128 // x30 is popped earlier AARCH64_VALIDATE_LINK_REGISTER ret .globl bn_mul4x_mont .def bn_mul4x_mont .type 32 .endef .align 5 bn_mul4x_mont: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-128]! 
add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] sub x26,sp,x5,lsl#3 lsl x5,x5,#3 ldr x4,[x4] // *n0 sub sp,x26,#8*4 // alloca add x10,x2,x5 add x27,x1,x5 stp x0,x10,[x29,#96] // offload rp and &b[num] ldr x24,[x2,#8*0] // b[0] ldp x6,x7,[x1,#8*0] // a[0..3] ldp x8,x9,[x1,#8*2] add x1,x1,#8*4 mov x19,xzr mov x20,xzr mov x21,xzr mov x22,xzr ldp x14,x15,[x3,#8*0] // n[0..3] ldp x16,x17,[x3,#8*2] adds x3,x3,#8*4 // clear carry bit mov x0,xzr mov x28,#0 mov x26,sp Loop_mul4x_1st_reduction: mul x10,x6,x24 // lo(a[0..3]*b[0]) adc x0,x0,xzr // modulo-scheduled mul x11,x7,x24 add x28,x28,#8 mul x12,x8,x24 and x28,x28,#31 mul x13,x9,x24 adds x19,x19,x10 umulh x10,x6,x24 // hi(a[0..3]*b[0]) adcs x20,x20,x11 mul x25,x19,x4 // t[0]*n0 adcs x21,x21,x12 umulh x11,x7,x24 adcs x22,x22,x13 umulh x12,x8,x24 adc x23,xzr,xzr umulh x13,x9,x24 ldr x24,[x2,x28] // next b[i] (or b[0]) adds x20,x20,x10 // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0) str x25,[x26],#8 // put aside t[0]*n0 for tail processing adcs x21,x21,x11 mul x11,x15,x25 adcs x22,x22,x12 mul x12,x16,x25 adc x23,x23,x13 // can't overflow mul x13,x17,x25 // (*) adds xzr,x19,x10 subs xzr,x19,#1 // (*) umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0) adcs x19,x20,x11 umulh x11,x15,x25 adcs x20,x21,x12 umulh x12,x16,x25 adcs x21,x22,x13 umulh x13,x17,x25 adcs x22,x23,x0 adc x0,xzr,xzr adds x19,x19,x10 sub x10,x27,x1 adcs x20,x20,x11 adcs x21,x21,x12 adcs x22,x22,x13 //adc x0,x0,xzr cbnz x28,Loop_mul4x_1st_reduction cbz x10,Lmul4x4_post_condition ldp x6,x7,[x1,#8*0] // a[4..7] ldp x8,x9,[x1,#8*2] add x1,x1,#8*4 ldr x25,[sp] // a[0]*n0 ldp x14,x15,[x3,#8*0] // n[4..7] ldp x16,x17,[x3,#8*2] add x3,x3,#8*4 Loop_mul4x_1st_tail: mul x10,x6,x24 // lo(a[4..7]*b[i]) adc x0,x0,xzr // modulo-scheduled mul x11,x7,x24 add x28,x28,#8 mul x12,x8,x24 and x28,x28,#31 mul x13,x9,x24 adds x19,x19,x10 umulh x10,x6,x24 // hi(a[4..7]*b[i]) adcs x20,x20,x11 umulh x11,x7,x24 adcs x21,x21,x12 umulh x12,x8,x24 adcs x22,x22,x13 umulh x13,x9,x24 adc x23,xzr,xzr ldr x24,[x2,x28] // next b[i] (or b[0]) adds x20,x20,x10 mul x10,x14,x25 // lo(n[4..7]*a[0]*n0) adcs x21,x21,x11 mul x11,x15,x25 adcs x22,x22,x12 mul x12,x16,x25 adc x23,x23,x13 // can't overflow mul x13,x17,x25 adds x19,x19,x10 umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0) adcs x20,x20,x11 umulh x11,x15,x25 adcs x21,x21,x12 umulh x12,x16,x25 adcs x22,x22,x13 adcs x23,x23,x0 umulh x13,x17,x25 adc x0,xzr,xzr ldr x25,[sp,x28] // next t[0]*n0 str x19,[x26],#8 // result!!! adds x19,x20,x10 sub x10,x27,x1 // done yet? adcs x20,x21,x11 adcs x21,x22,x12 adcs x22,x23,x13 //adc x0,x0,xzr cbnz x28,Loop_mul4x_1st_tail sub x11,x27,x5 // rewinded x1 cbz x10,Lmul4x_proceed ldp x6,x7,[x1,#8*0] ldp x8,x9,[x1,#8*2] add x1,x1,#8*4 ldp x14,x15,[x3,#8*0] ldp x16,x17,[x3,#8*2] add x3,x3,#8*4 b Loop_mul4x_1st_tail .align 5 Lmul4x_proceed: ldr x24,[x2,#8*4]! // *++b adc x30,x0,xzr ldp x6,x7,[x11,#8*0] // a[0..3] sub x3,x3,x5 // rewind np ldp x8,x9,[x11,#8*2] add x1,x11,#8*4 stp x19,x20,[x26,#8*0] // result!!! ldp x19,x20,[sp,#8*4] // t[0..3] stp x21,x22,[x26,#8*2] // result!!! 
ldp x21,x22,[sp,#8*6] ldp x14,x15,[x3,#8*0] // n[0..3] mov x26,sp ldp x16,x17,[x3,#8*2] adds x3,x3,#8*4 // clear carry bit mov x0,xzr .align 4 Loop_mul4x_reduction: mul x10,x6,x24 // lo(a[0..3]*b[4]) adc x0,x0,xzr // modulo-scheduled mul x11,x7,x24 add x28,x28,#8 mul x12,x8,x24 and x28,x28,#31 mul x13,x9,x24 adds x19,x19,x10 umulh x10,x6,x24 // hi(a[0..3]*b[4]) adcs x20,x20,x11 mul x25,x19,x4 // t[0]*n0 adcs x21,x21,x12 umulh x11,x7,x24 adcs x22,x22,x13 umulh x12,x8,x24 adc x23,xzr,xzr umulh x13,x9,x24 ldr x24,[x2,x28] // next b[i] adds x20,x20,x10 // (*) mul x10,x14,x25 str x25,[x26],#8 // put aside t[0]*n0 for tail processing adcs x21,x21,x11 mul x11,x15,x25 // lo(n[0..3]*t[0]*n0 adcs x22,x22,x12 mul x12,x16,x25 adc x23,x23,x13 // can't overflow mul x13,x17,x25 // (*) adds xzr,x19,x10 subs xzr,x19,#1 // (*) umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0 adcs x19,x20,x11 umulh x11,x15,x25 adcs x20,x21,x12 umulh x12,x16,x25 adcs x21,x22,x13 umulh x13,x17,x25 adcs x22,x23,x0 adc x0,xzr,xzr adds x19,x19,x10 adcs x20,x20,x11 adcs x21,x21,x12 adcs x22,x22,x13 //adc x0,x0,xzr cbnz x28,Loop_mul4x_reduction adc x0,x0,xzr ldp x10,x11,[x26,#8*4] // t[4..7] ldp x12,x13,[x26,#8*6] ldp x6,x7,[x1,#8*0] // a[4..7] ldp x8,x9,[x1,#8*2] add x1,x1,#8*4 adds x19,x19,x10 adcs x20,x20,x11 adcs x21,x21,x12 adcs x22,x22,x13 //adc x0,x0,xzr ldr x25,[sp] // t[0]*n0 ldp x14,x15,[x3,#8*0] // n[4..7] ldp x16,x17,[x3,#8*2] add x3,x3,#8*4 .align 4 Loop_mul4x_tail: mul x10,x6,x24 // lo(a[4..7]*b[4]) adc x0,x0,xzr // modulo-scheduled mul x11,x7,x24 add x28,x28,#8 mul x12,x8,x24 and x28,x28,#31 mul x13,x9,x24 adds x19,x19,x10 umulh x10,x6,x24 // hi(a[4..7]*b[4]) adcs x20,x20,x11 umulh x11,x7,x24 adcs x21,x21,x12 umulh x12,x8,x24 adcs x22,x22,x13 umulh x13,x9,x24 adc x23,xzr,xzr ldr x24,[x2,x28] // next b[i] adds x20,x20,x10 mul x10,x14,x25 // lo(n[4..7]*t[0]*n0) adcs x21,x21,x11 mul x11,x15,x25 adcs x22,x22,x12 mul x12,x16,x25 adc x23,x23,x13 // can't overflow mul x13,x17,x25 adds x19,x19,x10 umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0) adcs x20,x20,x11 umulh x11,x15,x25 adcs x21,x21,x12 umulh x12,x16,x25 adcs x22,x22,x13 umulh x13,x17,x25 adcs x23,x23,x0 ldr x25,[sp,x28] // next a[0]*n0 adc x0,xzr,xzr str x19,[x26],#8 // result!!! adds x19,x20,x10 sub x10,x27,x1 // done yet? adcs x20,x21,x11 adcs x21,x22,x12 adcs x22,x23,x13 //adc x0,x0,xzr cbnz x28,Loop_mul4x_tail sub x11,x3,x5 // rewinded np? adc x0,x0,xzr cbz x10,Loop_mul4x_break ldp x10,x11,[x26,#8*4] ldp x12,x13,[x26,#8*6] ldp x6,x7,[x1,#8*0] ldp x8,x9,[x1,#8*2] add x1,x1,#8*4 adds x19,x19,x10 adcs x20,x20,x11 adcs x21,x21,x12 adcs x22,x22,x13 //adc x0,x0,xzr ldp x14,x15,[x3,#8*0] ldp x16,x17,[x3,#8*2] add x3,x3,#8*4 b Loop_mul4x_tail .align 4 Loop_mul4x_break: ldp x12,x13,[x29,#96] // pull rp and &b[num] adds x19,x19,x30 add x2,x2,#8*4 // bp++ adcs x20,x20,xzr sub x1,x1,x5 // rewind ap adcs x21,x21,xzr stp x19,x20,[x26,#8*0] // result!!! adcs x22,x22,xzr ldp x19,x20,[sp,#8*4] // t[0..3] adc x30,x0,xzr stp x21,x22,[x26,#8*2] // result!!! cmp x2,x13 // done yet? ldp x21,x22,[sp,#8*6] ldp x14,x15,[x11,#8*0] // n[0..3] ldp x16,x17,[x11,#8*2] add x3,x11,#8*4 b.eq Lmul4x_post ldr x24,[x2] ldp x6,x7,[x1,#8*0] // a[0..3] ldp x8,x9,[x1,#8*2] adds x1,x1,#8*4 // clear carry bit mov x0,xzr mov x26,sp b Loop_mul4x_reduction .align 4 Lmul4x_post: // Final step. We see if result is larger than modulus, and // if it is, subtract the modulus. But comparison implies // subtraction. So we subtract modulus, see if it borrowed, // and conditionally copy original value. 
mov x0,x12 mov x27,x12 // x0 copy subs x10,x19,x14 add x26,sp,#8*8 sbcs x11,x20,x15 sub x28,x5,#8*4 Lmul4x_sub: sbcs x12,x21,x16 ldp x14,x15,[x3,#8*0] sub x28,x28,#8*4 ldp x19,x20,[x26,#8*0] sbcs x13,x22,x17 ldp x16,x17,[x3,#8*2] add x3,x3,#8*4 ldp x21,x22,[x26,#8*2] add x26,x26,#8*4 stp x10,x11,[x0,#8*0] sbcs x10,x19,x14 stp x12,x13,[x0,#8*2] add x0,x0,#8*4 sbcs x11,x20,x15 cbnz x28,Lmul4x_sub sbcs x12,x21,x16 mov x26,sp add x1,sp,#8*4 ldp x6,x7,[x27,#8*0] sbcs x13,x22,x17 stp x10,x11,[x0,#8*0] ldp x8,x9,[x27,#8*2] stp x12,x13,[x0,#8*2] ldp x19,x20,[x1,#8*0] ldp x21,x22,[x1,#8*2] sbcs xzr,x30,xzr // did it borrow? ldr x30,[x29,#8] // pull return address sub x28,x5,#8*4 Lmul4x_cond_copy: sub x28,x28,#8*4 csel x10,x19,x6,lo stp xzr,xzr,[x26,#8*0] csel x11,x20,x7,lo ldp x6,x7,[x27,#8*4] ldp x19,x20,[x1,#8*4] csel x12,x21,x8,lo stp xzr,xzr,[x26,#8*2] add x26,x26,#8*4 csel x13,x22,x9,lo ldp x8,x9,[x27,#8*6] ldp x21,x22,[x1,#8*6] add x1,x1,#8*4 stp x10,x11,[x27,#8*0] stp x12,x13,[x27,#8*2] add x27,x27,#8*4 cbnz x28,Lmul4x_cond_copy csel x10,x19,x6,lo stp xzr,xzr,[x26,#8*0] csel x11,x20,x7,lo stp xzr,xzr,[x26,#8*2] csel x12,x21,x8,lo stp xzr,xzr,[x26,#8*3] csel x13,x22,x9,lo stp xzr,xzr,[x26,#8*4] stp x10,x11,[x27,#8*0] stp x12,x13,[x27,#8*2] b Lmul4x_done .align 4 Lmul4x4_post_condition: adc x0,x0,xzr ldr x1,[x29,#96] // pull rp // x19-3,x0 hold result, x14-7 hold modulus subs x6,x19,x14 ldr x30,[x29,#8] // pull return address sbcs x7,x20,x15 stp xzr,xzr,[sp,#8*0] sbcs x8,x21,x16 stp xzr,xzr,[sp,#8*2] sbcs x9,x22,x17 stp xzr,xzr,[sp,#8*4] sbcs xzr,x0,xzr // did it borrow? stp xzr,xzr,[sp,#8*6] // x6-3 hold result-modulus csel x6,x19,x6,lo csel x7,x20,x7,lo csel x8,x21,x8,lo csel x9,x22,x9,lo stp x6,x7,[x1,#8*0] stp x8,x9,[x1,#8*2] Lmul4x_done: ldp x19,x20,[x29,#16] mov sp,x29 ldp x21,x22,[x29,#32] mov x0,#1 ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldr x29,[sp],#128 // x30 is popped earlier AARCH64_VALIDATE_LINK_REGISTER ret .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 4 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) ring-0.17.14/pregenerated/bsaes-armv7-linux32.S000064400000000000000000000425711046102023000171700ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) @ Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. @ @ Licensed under the Apache License, Version 2.0 (the "License"); @ you may not use this file except in compliance with the License. @ You may obtain a copy of the License at @ @ https://www.apache.org/licenses/LICENSE-2.0 @ @ Unless required by applicable law or agreed to in writing, software @ distributed under the License is distributed on an "AS IS" BASIS, @ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @ See the License for the specific language governing permissions and @ limitations under the License. @ ==================================================================== @ Written by Andy Polyakov for the OpenSSL @ project. @ @ Specific modes and adaptation for Linux kernel by Ard Biesheuvel @ of Linaro. 
@ ==================================================================== @ Bit-sliced AES for ARM NEON @ @ February 2012. @ @ This implementation is direct adaptation of bsaes-x86_64 module for @ ARM NEON. Except that this module is endian-neutral [in sense that @ it can be compiled for either endianness] by courtesy of vld1.8's @ neutrality. Initial version doesn't implement interface to OpenSSL, @ only low-level primitives and unsupported entry points, just enough @ to collect performance results, which for Cortex-A8 core are: @ @ encrypt 19.5 cycles per byte processed with 128-bit key @ decrypt 22.1 cycles per byte processed with 128-bit key @ key conv. 440 cycles per 128-bit key/0.18 of 8x block @ @ Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7, @ which is [much] worse than anticipated (for further details see @ http://www.openssl.org/~appro/Snapdragon-S4.html). @ @ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code @ manages in 20.0 cycles]. @ @ When comparing to x86_64 results keep in mind that NEON unit is @ [mostly] single-issue and thus can't [fully] benefit from @ instruction-level parallelism. And when comparing to aes-armv4 @ results keep in mind key schedule conversion overhead (see @ bsaes-x86_64.pl for further details)... @ @ @ April-August 2013 @ Add CBC, CTR and XTS subroutines and adapt for kernel use; courtesy of Ard. #ifndef __KERNEL__ # define VFP_ABI_PUSH vstmdb sp!,{d8-d15} # define VFP_ABI_POP vldmia sp!,{d8-d15} # define VFP_ABI_FRAME 0x40 #else # define VFP_ABI_PUSH # define VFP_ABI_POP # define VFP_ABI_FRAME 0 # define BSAES_ASM_EXTENDED_KEY # define __ARM_MAX_ARCH__ 7 #endif #ifdef __thumb__ # define adrl adr #endif #if __ARM_MAX_ARCH__>=7 .arch armv7-a .fpu neon .text .syntax unified @ ARMv7-capable assembler is expected to handle this #if defined(__thumb2__) && !defined(__APPLE__) .thumb #else .code 32 # undef __thumb2__ #endif .type _bsaes_const,%object .align 6 _bsaes_const: .LM0ISR:@ InvShiftRows constants .quad 0x0a0e0206070b0f03, 0x0004080c0d010509 .LISR: .quad 0x0504070602010003, 0x0f0e0d0c080b0a09 .LISRM0: .quad 0x01040b0e0205080f, 0x0306090c00070a0d .LM0SR:@ ShiftRows constants .quad 0x0a0e02060f03070b, 0x0004080c05090d01 .LSR: .quad 0x0504070600030201, 0x0f0e0d0c0a09080b .LSRM0: .quad 0x0304090e00050a0f, 0x01060b0c0207080d .LM0: .quad 0x02060a0e03070b0f, 0x0004080c0105090d .LREVM0SR: .quad 0x090d01050c000408, 0x03070b0f060a0e02 .byte 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 6 .size _bsaes_const,.-_bsaes_const .type _bsaes_encrypt8,%function .align 4 _bsaes_encrypt8: adr r6,. 
vldmia r4!, {q9} @ round 0 key #if defined(__thumb2__) || defined(__APPLE__) adr r6,.LM0SR #else sub r6,r6,#_bsaes_encrypt8-.LM0SR #endif vldmia r6!, {q8} @ .LM0SR _bsaes_encrypt8_alt: veor q10, q0, q9 @ xor with round0 key veor q11, q1, q9 vtbl.8 d0, {q10}, d16 vtbl.8 d1, {q10}, d17 veor q12, q2, q9 vtbl.8 d2, {q11}, d16 vtbl.8 d3, {q11}, d17 veor q13, q3, q9 vtbl.8 d4, {q12}, d16 vtbl.8 d5, {q12}, d17 veor q14, q4, q9 vtbl.8 d6, {q13}, d16 vtbl.8 d7, {q13}, d17 veor q15, q5, q9 vtbl.8 d8, {q14}, d16 vtbl.8 d9, {q14}, d17 veor q10, q6, q9 vtbl.8 d10, {q15}, d16 vtbl.8 d11, {q15}, d17 veor q11, q7, q9 vtbl.8 d12, {q10}, d16 vtbl.8 d13, {q10}, d17 vtbl.8 d14, {q11}, d16 vtbl.8 d15, {q11}, d17 _bsaes_encrypt8_bitslice: vmov.i8 q8,#0x55 @ compose .LBS0 vmov.i8 q9,#0x33 @ compose .LBS1 vshr.u64 q10, q6, #1 vshr.u64 q11, q4, #1 veor q10, q10, q7 veor q11, q11, q5 vand q10, q10, q8 vand q11, q11, q8 veor q7, q7, q10 vshl.u64 q10, q10, #1 veor q5, q5, q11 vshl.u64 q11, q11, #1 veor q6, q6, q10 veor q4, q4, q11 vshr.u64 q10, q2, #1 vshr.u64 q11, q0, #1 veor q10, q10, q3 veor q11, q11, q1 vand q10, q10, q8 vand q11, q11, q8 veor q3, q3, q10 vshl.u64 q10, q10, #1 veor q1, q1, q11 vshl.u64 q11, q11, #1 veor q2, q2, q10 veor q0, q0, q11 vmov.i8 q8,#0x0f @ compose .LBS2 vshr.u64 q10, q5, #2 vshr.u64 q11, q4, #2 veor q10, q10, q7 veor q11, q11, q6 vand q10, q10, q9 vand q11, q11, q9 veor q7, q7, q10 vshl.u64 q10, q10, #2 veor q6, q6, q11 vshl.u64 q11, q11, #2 veor q5, q5, q10 veor q4, q4, q11 vshr.u64 q10, q1, #2 vshr.u64 q11, q0, #2 veor q10, q10, q3 veor q11, q11, q2 vand q10, q10, q9 vand q11, q11, q9 veor q3, q3, q10 vshl.u64 q10, q10, #2 veor q2, q2, q11 vshl.u64 q11, q11, #2 veor q1, q1, q10 veor q0, q0, q11 vshr.u64 q10, q3, #4 vshr.u64 q11, q2, #4 veor q10, q10, q7 veor q11, q11, q6 vand q10, q10, q8 vand q11, q11, q8 veor q7, q7, q10 vshl.u64 q10, q10, #4 veor q6, q6, q11 vshl.u64 q11, q11, #4 veor q3, q3, q10 veor q2, q2, q11 vshr.u64 q10, q1, #4 vshr.u64 q11, q0, #4 veor q10, q10, q5 veor q11, q11, q4 vand q10, q10, q8 vand q11, q11, q8 veor q5, q5, q10 vshl.u64 q10, q10, #4 veor q4, q4, q11 vshl.u64 q11, q11, #4 veor q1, q1, q10 veor q0, q0, q11 sub r5,r5,#1 b .Lenc_sbox .align 4 .Lenc_loop: vldmia r4!, {q8,q9,q10,q11} veor q8, q8, q0 veor q9, q9, q1 vtbl.8 d0, {q8}, d24 vtbl.8 d1, {q8}, d25 vldmia r4!, {q8} veor q10, q10, q2 vtbl.8 d2, {q9}, d24 vtbl.8 d3, {q9}, d25 vldmia r4!, {q9} veor q11, q11, q3 vtbl.8 d4, {q10}, d24 vtbl.8 d5, {q10}, d25 vldmia r4!, {q10} vtbl.8 d6, {q11}, d24 vtbl.8 d7, {q11}, d25 vldmia r4!, {q11} veor q8, q8, q4 veor q9, q9, q5 vtbl.8 d8, {q8}, d24 vtbl.8 d9, {q8}, d25 veor q10, q10, q6 vtbl.8 d10, {q9}, d24 vtbl.8 d11, {q9}, d25 veor q11, q11, q7 vtbl.8 d12, {q10}, d24 vtbl.8 d13, {q10}, d25 vtbl.8 d14, {q11}, d24 vtbl.8 d15, {q11}, d25 .Lenc_sbox: veor q2, q2, q1 veor q5, q5, q6 veor q3, q3, q0 veor q6, q6, q2 veor q5, q5, q0 veor q6, q6, q3 veor q3, q3, q7 veor q7, q7, q5 veor q3, q3, q4 veor q4, q4, q5 veor q2, q2, q7 veor q3, q3, q1 veor q1, q1, q5 veor q11, q7, q4 veor q10, q1, q2 veor q9, q5, q3 veor q13, q2, q4 vmov q8, q10 veor q12, q6, q0 vorr q10, q10, q9 veor q15, q11, q8 vand q14, q11, q12 vorr q11, q11, q12 veor q12, q12, q9 vand q8, q8, q9 veor q9, q3, q0 vand q15, q15, q12 vand q13, q13, q9 veor q9, q7, q1 veor q12, q5, q6 veor q11, q11, q13 veor q10, q10, q13 vand q13, q9, q12 vorr q9, q9, q12 veor q11, q11, q15 veor q8, q8, q13 veor q10, q10, q14 veor q9, q9, q15 veor q8, q8, q14 vand q12, q2, q3 veor q9, q9, q14 vand q13, q4, q0 vand q14, q1, q5 
vorr q15, q7, q6 veor q11, q11, q12 veor q9, q9, q14 veor q8, q8, q15 veor q10, q10, q13 @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3 @ new smaller inversion vand q14, q11, q9 vmov q12, q8 veor q13, q10, q14 veor q15, q8, q14 veor q14, q8, q14 @ q14=q15 vbsl q13, q9, q8 vbsl q15, q11, q10 veor q11, q11, q10 vbsl q12, q13, q14 vbsl q8, q14, q13 vand q14, q12, q15 veor q9, q9, q8 veor q14, q14, q11 veor q12, q6, q0 veor q8, q5, q3 veor q10, q15, q14 vand q10, q10, q6 veor q6, q6, q5 vand q11, q5, q15 vand q6, q6, q14 veor q5, q11, q10 veor q6, q6, q11 veor q15, q15, q13 veor q14, q14, q9 veor q11, q15, q14 veor q10, q13, q9 vand q11, q11, q12 vand q10, q10, q0 veor q12, q12, q8 veor q0, q0, q3 vand q8, q8, q15 vand q3, q3, q13 vand q12, q12, q14 vand q0, q0, q9 veor q8, q8, q12 veor q0, q0, q3 veor q12, q12, q11 veor q3, q3, q10 veor q6, q6, q12 veor q0, q0, q12 veor q5, q5, q8 veor q3, q3, q8 veor q12, q7, q4 veor q8, q1, q2 veor q11, q15, q14 veor q10, q13, q9 vand q11, q11, q12 vand q10, q10, q4 veor q12, q12, q8 veor q4, q4, q2 vand q8, q8, q15 vand q2, q2, q13 vand q12, q12, q14 vand q4, q4, q9 veor q8, q8, q12 veor q4, q4, q2 veor q12, q12, q11 veor q2, q2, q10 veor q15, q15, q13 veor q14, q14, q9 veor q10, q15, q14 vand q10, q10, q7 veor q7, q7, q1 vand q11, q1, q15 vand q7, q7, q14 veor q1, q11, q10 veor q7, q7, q11 veor q7, q7, q12 veor q4, q4, q12 veor q1, q1, q8 veor q2, q2, q8 veor q7, q7, q0 veor q1, q1, q6 veor q6, q6, q0 veor q4, q4, q7 veor q0, q0, q1 veor q1, q1, q5 veor q5, q5, q2 veor q2, q2, q3 veor q3, q3, q5 veor q4, q4, q5 veor q6, q6, q3 subs r5,r5,#1 bcc .Lenc_done vext.8 q8, q0, q0, #12 @ x0 <<< 32 vext.8 q9, q1, q1, #12 veor q0, q0, q8 @ x0 ^ (x0 <<< 32) vext.8 q10, q4, q4, #12 veor q1, q1, q9 vext.8 q11, q6, q6, #12 veor q4, q4, q10 vext.8 q12, q3, q3, #12 veor q6, q6, q11 vext.8 q13, q7, q7, #12 veor q3, q3, q12 vext.8 q14, q2, q2, #12 veor q7, q7, q13 vext.8 q15, q5, q5, #12 veor q2, q2, q14 veor q9, q9, q0 veor q5, q5, q15 vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64) veor q10, q10, q1 veor q8, q8, q5 veor q9, q9, q5 vext.8 q1, q1, q1, #8 veor q13, q13, q3 veor q0, q0, q8 veor q14, q14, q7 veor q1, q1, q9 vext.8 q8, q3, q3, #8 veor q12, q12, q6 vext.8 q9, q7, q7, #8 veor q15, q15, q2 vext.8 q3, q6, q6, #8 veor q11, q11, q4 vext.8 q7, q5, q5, #8 veor q12, q12, q5 vext.8 q6, q2, q2, #8 veor q11, q11, q5 vext.8 q2, q4, q4, #8 veor q5, q9, q13 veor q4, q8, q12 veor q3, q3, q11 veor q7, q7, q15 veor q6, q6, q14 @ vmov q4, q8 veor q2, q2, q10 @ vmov q5, q9 vldmia r6, {q12} @ .LSR ite eq @ Thumb2 thing, samity check in ARM addeq r6,r6,#0x10 bne .Lenc_loop vldmia r6, {q12} @ .LSRM0 b .Lenc_loop .align 4 .Lenc_done: vmov.i8 q8,#0x55 @ compose .LBS0 vmov.i8 q9,#0x33 @ compose .LBS1 vshr.u64 q10, q2, #1 vshr.u64 q11, q3, #1 veor q10, q10, q5 veor q11, q11, q7 vand q10, q10, q8 vand q11, q11, q8 veor q5, q5, q10 vshl.u64 q10, q10, #1 veor q7, q7, q11 vshl.u64 q11, q11, #1 veor q2, q2, q10 veor q3, q3, q11 vshr.u64 q10, q4, #1 vshr.u64 q11, q0, #1 veor q10, q10, q6 veor q11, q11, q1 vand q10, q10, q8 vand q11, q11, q8 veor q6, q6, q10 vshl.u64 q10, q10, #1 veor q1, q1, q11 vshl.u64 q11, q11, #1 veor q4, q4, q10 veor q0, q0, q11 vmov.i8 q8,#0x0f @ compose .LBS2 vshr.u64 q10, q7, #2 vshr.u64 q11, q3, #2 veor q10, q10, q5 veor q11, q11, q2 vand q10, q10, q9 vand q11, q11, q9 veor q5, q5, q10 vshl.u64 q10, q10, #2 veor q2, q2, q11 vshl.u64 q11, q11, #2 veor q7, q7, q10 veor q3, q3, q11 vshr.u64 q10, q1, #2 vshr.u64 q11, q0, #2 veor q10, q10, q6 veor q11, q11, q4 vand q10, q10, q9 
vand q11, q11, q9 veor q6, q6, q10 vshl.u64 q10, q10, #2 veor q4, q4, q11 vshl.u64 q11, q11, #2 veor q1, q1, q10 veor q0, q0, q11 vshr.u64 q10, q6, #4 vshr.u64 q11, q4, #4 veor q10, q10, q5 veor q11, q11, q2 vand q10, q10, q8 vand q11, q11, q8 veor q5, q5, q10 vshl.u64 q10, q10, #4 veor q2, q2, q11 vshl.u64 q11, q11, #4 veor q6, q6, q10 veor q4, q4, q11 vshr.u64 q10, q1, #4 vshr.u64 q11, q0, #4 veor q10, q10, q7 veor q11, q11, q3 vand q10, q10, q8 vand q11, q11, q8 veor q7, q7, q10 vshl.u64 q10, q10, #4 veor q3, q3, q11 vshl.u64 q11, q11, #4 veor q1, q1, q10 veor q0, q0, q11 vldmia r4, {q8} @ last round key veor q4, q4, q8 veor q6, q6, q8 veor q3, q3, q8 veor q7, q7, q8 veor q2, q2, q8 veor q5, q5, q8 veor q0, q0, q8 veor q1, q1, q8 bx lr .size _bsaes_encrypt8,.-_bsaes_encrypt8 .type _bsaes_key_convert,%function .align 4 _bsaes_key_convert: adr r6,. vld1.8 {q7}, [r4]! @ load round 0 key #if defined(__thumb2__) || defined(__APPLE__) adr r6,.LM0 #else sub r6,r6,#_bsaes_key_convert-.LM0 #endif vld1.8 {q15}, [r4]! @ load round 1 key vmov.i8 q8, #0x01 @ bit masks vmov.i8 q9, #0x02 vmov.i8 q10, #0x04 vmov.i8 q11, #0x08 vmov.i8 q12, #0x10 vmov.i8 q13, #0x20 vldmia r6, {q14} @ .LM0 #ifdef __ARMEL__ vrev32.8 q7, q7 vrev32.8 q15, q15 #endif sub r5,r5,#1 vstmia r12!, {q7} @ save round 0 key b .Lkey_loop .align 4 .Lkey_loop: vtbl.8 d14,{q15},d28 vtbl.8 d15,{q15},d29 vmov.i8 q6, #0x40 vmov.i8 q15, #0x80 vtst.8 q0, q7, q8 vtst.8 q1, q7, q9 vtst.8 q2, q7, q10 vtst.8 q3, q7, q11 vtst.8 q4, q7, q12 vtst.8 q5, q7, q13 vtst.8 q6, q7, q6 vtst.8 q7, q7, q15 vld1.8 {q15}, [r4]! @ load next round key vmvn q0, q0 @ "pnot" vmvn q1, q1 vmvn q5, q5 vmvn q6, q6 #ifdef __ARMEL__ vrev32.8 q15, q15 #endif subs r5,r5,#1 vstmia r12!,{q0,q1,q2,q3,q4,q5,q6,q7} @ write bit-sliced round key bne .Lkey_loop vmov.i8 q7,#0x63 @ compose .L63 @ don't save last round key bx lr .size _bsaes_key_convert,.-_bsaes_key_convert .globl bsaes_ctr32_encrypt_blocks .hidden bsaes_ctr32_encrypt_blocks .type bsaes_ctr32_encrypt_blocks,%function .align 5 bsaes_ctr32_encrypt_blocks: @ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this @ out to retain a constant-time implementation. 
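The "ctr32" in the name refers to the counter convention this routine implements: the 16-byte counter block is loaded once, and only its final 32-bit word is incremented for successive blocks, which is what the vrev32.8/vadd.u32 sequence computes eight blocks at a time. A minimal Rust sketch of that convention follows; the helper name is hypothetical and not part of ring's API, and the big-endian, wrap-mod-2^32 behaviour is an assumption based on the standard AES-CTR layout and the 32-bit lane arithmetic used here.

    // Increment the 32-bit counter held in the last four bytes of the block
    // (big-endian, as in standard AES-CTR); the nonce bytes are untouched and
    // the counter wraps modulo 2^32, matching 32-bit lane addition.
    fn ctr32_add(block: &mut [u8; 16], n: u32) {
        let ctr = u32::from_be_bytes(block[12..16].try_into().unwrap());
        block[12..16].copy_from_slice(&ctr.wrapping_add(n).to_be_bytes());
    }
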
mov ip, sp stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} VFP_ABI_PUSH ldr r8, [ip] @ ctr is 1st arg on the stack sub sp, sp, #0x10 @ scratch space to carry over the ctr mov r9, sp @ save sp ldr r10, [r3, #240] @ get # of rounds #ifndef BSAES_ASM_EXTENDED_KEY @ allocate the key schedule on the stack sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key add r12, #96 @ size of bit-sliced key schedule @ populate the key schedule mov r4, r3 @ pass key mov r5, r10 @ pass # of rounds mov sp, r12 @ sp is sp bl _bsaes_key_convert veor q7,q7,q15 @ fix up last round key vstmia r12, {q7} @ save last round key vld1.8 {q0}, [r8] @ load counter #ifdef __APPLE__ mov r8, #:lower16:(.LREVM0SR-.LM0) add r8, r6, r8 #else add r8, r6, #.LREVM0SR-.LM0 @ borrow r8 #endif vldmia sp, {q4} @ load round0 key #else ldr r12, [r3, #244] eors r12, #1 beq 0f @ populate the key schedule str r12, [r3, #244] mov r4, r3 @ pass key mov r5, r10 @ pass # of rounds add r12, r3, #248 @ pass key schedule bl _bsaes_key_convert veor q7,q7,q15 @ fix up last round key vstmia r12, {q7} @ save last round key .align 2 add r12, r3, #248 vld1.8 {q0}, [r8] @ load counter adrl r8, .LREVM0SR @ borrow r8 vldmia r12, {q4} @ load round0 key sub sp, #0x10 @ place for adjusted round0 key #endif vmov.i32 q8,#1 @ compose 1<<96 veor q9,q9,q9 vrev32.8 q0,q0 vext.8 q8,q9,q8,#4 vrev32.8 q4,q4 vadd.u32 q9,q8,q8 @ compose 2<<96 vstmia sp, {q4} @ save adjusted round0 key b .Lctr_enc_loop .align 4 .Lctr_enc_loop: vadd.u32 q10, q8, q9 @ compose 3<<96 vadd.u32 q1, q0, q8 @ +1 vadd.u32 q2, q0, q9 @ +2 vadd.u32 q3, q0, q10 @ +3 vadd.u32 q4, q1, q10 vadd.u32 q5, q2, q10 vadd.u32 q6, q3, q10 vadd.u32 q7, q4, q10 vadd.u32 q10, q5, q10 @ next counter @ Borrow prologue from _bsaes_encrypt8 to use the opportunity @ to flip byte order in 32-bit counter vldmia sp, {q9} @ load round0 key #ifndef BSAES_ASM_EXTENDED_KEY add r4, sp, #0x10 @ pass next round key #else add r4, r3, #264 #endif vldmia r8, {q8} @ .LREVM0SR mov r5, r10 @ pass rounds vstmia r9, {q10} @ save next counter #ifdef __APPLE__ mov r6, #:lower16:(.LREVM0SR-.LSR) sub r6, r8, r6 #else sub r6, r8, #.LREVM0SR-.LSR @ pass constants #endif bl _bsaes_encrypt8_alt subs r2, r2, #8 blo .Lctr_enc_loop_done vld1.8 {q8,q9}, [r0]! @ load input vld1.8 {q10,q11}, [r0]! veor q0, q8 veor q1, q9 vld1.8 {q12,q13}, [r0]! veor q4, q10 veor q6, q11 vld1.8 {q14,q15}, [r0]! veor q3, q12 vst1.8 {q0,q1}, [r1]! @ write output veor q7, q13 veor q2, q14 vst1.8 {q4}, [r1]! veor q5, q15 vst1.8 {q6}, [r1]! vmov.i32 q8, #1 @ compose 1<<96 vst1.8 {q3}, [r1]! veor q9, q9, q9 vst1.8 {q7}, [r1]! vext.8 q8, q9, q8, #4 vst1.8 {q2}, [r1]! vadd.u32 q9,q8,q8 @ compose 2<<96 vst1.8 {q5}, [r1]! vldmia r9, {q0} @ load counter bne .Lctr_enc_loop b .Lctr_enc_done .align 4 .Lctr_enc_loop_done: add r2, r2, #8 vld1.8 {q8}, [r0]! @ load input veor q0, q8 vst1.8 {q0}, [r1]! @ write output cmp r2, #2 blo .Lctr_enc_done vld1.8 {q9}, [r0]! veor q1, q9 vst1.8 {q1}, [r1]! beq .Lctr_enc_done vld1.8 {q10}, [r0]! veor q4, q10 vst1.8 {q4}, [r1]! cmp r2, #4 blo .Lctr_enc_done vld1.8 {q11}, [r0]! veor q6, q11 vst1.8 {q6}, [r1]! beq .Lctr_enc_done vld1.8 {q12}, [r0]! veor q3, q12 vst1.8 {q3}, [r1]! cmp r2, #6 blo .Lctr_enc_done vld1.8 {q13}, [r0]! veor q7, q13 vst1.8 {q7}, [r1]! beq .Lctr_enc_done vld1.8 {q14}, [r0] veor q2, q14 vst1.8 {q2}, [r1]! 
.Lctr_enc_done: vmov.i32 q0, #0 vmov.i32 q1, #0 #ifndef BSAES_ASM_EXTENDED_KEY .Lctr_enc_bzero:@ wipe key schedule [if any] vstmia sp!, {q0,q1} cmp sp, r9 bne .Lctr_enc_bzero #else vstmia sp, {q0,q1} #endif mov sp, r9 add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb VFP_ABI_POP ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return @ OpenSSL contains aes_nohw_* fallback code here. We patch this @ out to retain a constant-time implementation. .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) ring-0.17.14/pregenerated/chacha-armv4-linux32.S000064400000000000000000000701541046102023000172750ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) @ Silence ARMv8 deprecated IT instruction warnings. This file is used by both @ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. .arch armv7-a .text #if defined(__thumb2__) || defined(__clang__) .syntax unified #endif #if defined(__thumb2__) .thumb #else .code 32 #endif #if defined(__thumb2__) || defined(__clang__) #define ldrhsb ldrbhs #endif .align 5 .Lsigma: .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral .Lone: .long 1,0,0,0 .globl ChaCha20_ctr32_nohw .hidden ChaCha20_ctr32_nohw .type ChaCha20_ctr32_nohw,%function .align 5 ChaCha20_ctr32_nohw: ldr r12,[sp,#0] @ pull pointer to counter and nonce stmdb sp!,{r0,r1,r2,r4-r11,lr} adr r14,.Lsigma ldmia r12,{r4,r5,r6,r7} @ load counter and nonce sub sp,sp,#4*(16) @ off-load area stmdb sp!,{r4,r5,r6,r7} @ copy counter and nonce ldmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} @ load key ldmia r14,{r0,r1,r2,r3} @ load sigma stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} @ copy key stmdb sp!,{r0,r1,r2,r3} @ copy sigma str r10,[sp,#4*(16+10)] @ off-load "rx" str r11,[sp,#4*(16+11)] @ off-load "rx" b .Loop_outer_enter .align 4 .Loop_outer: ldmia sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ load key material str r11,[sp,#4*(32+2)] @ save len str r12, [sp,#4*(32+1)] @ save inp str r14, [sp,#4*(32+0)] @ save out .Loop_outer_enter: ldr r11, [sp,#4*(15)] ldr r12,[sp,#4*(12)] @ modulo-scheduled load ldr r10, [sp,#4*(13)] ldr r14,[sp,#4*(14)] str r11, [sp,#4*(16+15)] mov r11,#10 b .Loop .align 4 .Loop: subs r11,r11,#1 add r0,r0,r4 mov r12,r12,ror#16 add r1,r1,r5 mov r10,r10,ror#16 eor r12,r12,r0,ror#16 eor r10,r10,r1,ror#16 add r8,r8,r12 mov r4,r4,ror#20 add r9,r9,r10 mov r5,r5,ror#20 eor r4,r4,r8,ror#20 eor r5,r5,r9,ror#20 add r0,r0,r4 mov r12,r12,ror#24 add r1,r1,r5 mov r10,r10,ror#24 eor r12,r12,r0,ror#24 eor r10,r10,r1,ror#24 add r8,r8,r12 mov r4,r4,ror#25 add r9,r9,r10 mov r5,r5,ror#25 str r10,[sp,#4*(16+13)] ldr r10,[sp,#4*(16+15)] eor r4,r4,r8,ror#25 eor r5,r5,r9,ror#25 str r8,[sp,#4*(16+8)] ldr r8,[sp,#4*(16+10)] add r2,r2,r6 mov r14,r14,ror#16 str r9,[sp,#4*(16+9)] ldr r9,[sp,#4*(16+11)] add r3,r3,r7 mov r10,r10,ror#16 eor r14,r14,r2,ror#16 eor r10,r10,r3,ror#16 add r8,r8,r14 mov r6,r6,ror#20 add r9,r9,r10 mov r7,r7,ror#20 eor r6,r6,r8,ror#20 eor r7,r7,r9,ror#20 add r2,r2,r6 mov r14,r14,ror#24 add r3,r3,r7 mov r10,r10,ror#24 eor r14,r14,r2,ror#24 eor r10,r10,r3,ror#24 add r8,r8,r14 mov r6,r6,ror#25 add r9,r9,r10 mov r7,r7,ror#25 eor r6,r6,r8,ror#25 eor r7,r7,r9,ror#25 add r0,r0,r5 mov r10,r10,ror#16 add r1,r1,r6 mov r12,r12,ror#16 eor r10,r10,r0,ror#16 eor r12,r12,r1,ror#16 add r8,r8,r10 mov r5,r5,ror#20 add r9,r9,r12 mov r6,r6,ror#20 eor r5,r5,r8,ror#20 eor 
r6,r6,r9,ror#20 add r0,r0,r5 mov r10,r10,ror#24 add r1,r1,r6 mov r12,r12,ror#24 eor r10,r10,r0,ror#24 eor r12,r12,r1,ror#24 add r8,r8,r10 mov r5,r5,ror#25 str r10,[sp,#4*(16+15)] ldr r10,[sp,#4*(16+13)] add r9,r9,r12 mov r6,r6,ror#25 eor r5,r5,r8,ror#25 eor r6,r6,r9,ror#25 str r8,[sp,#4*(16+10)] ldr r8,[sp,#4*(16+8)] add r2,r2,r7 mov r10,r10,ror#16 str r9,[sp,#4*(16+11)] ldr r9,[sp,#4*(16+9)] add r3,r3,r4 mov r14,r14,ror#16 eor r10,r10,r2,ror#16 eor r14,r14,r3,ror#16 add r8,r8,r10 mov r7,r7,ror#20 add r9,r9,r14 mov r4,r4,ror#20 eor r7,r7,r8,ror#20 eor r4,r4,r9,ror#20 add r2,r2,r7 mov r10,r10,ror#24 add r3,r3,r4 mov r14,r14,ror#24 eor r10,r10,r2,ror#24 eor r14,r14,r3,ror#24 add r8,r8,r10 mov r7,r7,ror#25 add r9,r9,r14 mov r4,r4,ror#25 eor r7,r7,r8,ror#25 eor r4,r4,r9,ror#25 bne .Loop ldr r11,[sp,#4*(32+2)] @ load len str r8, [sp,#4*(16+8)] @ modulo-scheduled store str r9, [sp,#4*(16+9)] str r12,[sp,#4*(16+12)] str r10, [sp,#4*(16+13)] str r14,[sp,#4*(16+14)] @ at this point we have first half of 512-bit result in @ rx and second half at sp+4*(16+8) cmp r11,#64 @ done yet? #ifdef __thumb2__ itete lo #endif addlo r12,sp,#4*(0) @ shortcut or ... ldrhs r12,[sp,#4*(32+1)] @ ... load inp addlo r14,sp,#4*(0) @ shortcut or ... ldrhs r14,[sp,#4*(32+0)] @ ... load out ldr r8,[sp,#4*(0)] @ load key material ldr r9,[sp,#4*(1)] #if __ARM_ARCH>=6 || !defined(__ARMEB__) # if __ARM_ARCH<7 orr r10,r12,r14 tst r10,#3 @ are input and output aligned? ldr r10,[sp,#4*(2)] bne .Lunaligned cmp r11,#64 @ restore flags # else ldr r10,[sp,#4*(2)] # endif ldr r11,[sp,#4*(3)] add r0,r0,r8 @ accumulate key material add r1,r1,r9 # ifdef __thumb2__ itt hs # endif ldrhs r8,[r12],#16 @ load input ldrhs r9,[r12,#-12] add r2,r2,r10 add r3,r3,r11 # ifdef __thumb2__ itt hs # endif ldrhs r10,[r12,#-8] ldrhs r11,[r12,#-4] # if __ARM_ARCH>=6 && defined(__ARMEB__) rev r0,r0 rev r1,r1 rev r2,r2 rev r3,r3 # endif # ifdef __thumb2__ itt hs # endif eorhs r0,r0,r8 @ xor with input eorhs r1,r1,r9 add r8,sp,#4*(4) str r0,[r14],#16 @ store output # ifdef __thumb2__ itt hs # endif eorhs r2,r2,r10 eorhs r3,r3,r11 ldmia r8,{r8,r9,r10,r11} @ load key material str r1,[r14,#-12] str r2,[r14,#-8] str r3,[r14,#-4] add r4,r4,r8 @ accumulate key material add r5,r5,r9 # ifdef __thumb2__ itt hs # endif ldrhs r8,[r12],#16 @ load input ldrhs r9,[r12,#-12] add r6,r6,r10 add r7,r7,r11 # ifdef __thumb2__ itt hs # endif ldrhs r10,[r12,#-8] ldrhs r11,[r12,#-4] # if __ARM_ARCH>=6 && defined(__ARMEB__) rev r4,r4 rev r5,r5 rev r6,r6 rev r7,r7 # endif # ifdef __thumb2__ itt hs # endif eorhs r4,r4,r8 eorhs r5,r5,r9 add r8,sp,#4*(8) str r4,[r14],#16 @ store output # ifdef __thumb2__ itt hs # endif eorhs r6,r6,r10 eorhs r7,r7,r11 str r5,[r14,#-12] ldmia r8,{r8,r9,r10,r11} @ load key material str r6,[r14,#-8] add r0,sp,#4*(16+8) str r7,[r14,#-4] ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half add r0,r0,r8 @ accumulate key material add r1,r1,r9 # ifdef __thumb2__ itt hs # endif ldrhs r8,[r12],#16 @ load input ldrhs r9,[r12,#-12] # ifdef __thumb2__ itt hi # endif strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it add r2,r2,r10 add r3,r3,r11 # ifdef __thumb2__ itt hs # endif ldrhs r10,[r12,#-8] ldrhs r11,[r12,#-4] # if __ARM_ARCH>=6 && defined(__ARMEB__) rev r0,r0 rev r1,r1 rev r2,r2 rev r3,r3 # endif # ifdef __thumb2__ itt hs # endif eorhs r0,r0,r8 eorhs r1,r1,r9 add r8,sp,#4*(12) str r0,[r14],#16 @ store output # ifdef __thumb2__ itt hs # endif eorhs r2,r2,r10 eorhs r3,r3,r11 str r1,[r14,#-12] ldmia 
r8,{r8,r9,r10,r11} @ load key material str r2,[r14,#-8] str r3,[r14,#-4] add r4,r4,r8 @ accumulate key material add r5,r5,r9 # ifdef __thumb2__ itt hi # endif addhi r8,r8,#1 @ next counter value strhi r8,[sp,#4*(12)] @ save next counter value # ifdef __thumb2__ itt hs # endif ldrhs r8,[r12],#16 @ load input ldrhs r9,[r12,#-12] add r6,r6,r10 add r7,r7,r11 # ifdef __thumb2__ itt hs # endif ldrhs r10,[r12,#-8] ldrhs r11,[r12,#-4] # if __ARM_ARCH>=6 && defined(__ARMEB__) rev r4,r4 rev r5,r5 rev r6,r6 rev r7,r7 # endif # ifdef __thumb2__ itt hs # endif eorhs r4,r4,r8 eorhs r5,r5,r9 # ifdef __thumb2__ it ne # endif ldrne r8,[sp,#4*(32+2)] @ re-load len # ifdef __thumb2__ itt hs # endif eorhs r6,r6,r10 eorhs r7,r7,r11 str r4,[r14],#16 @ store output str r5,[r14,#-12] # ifdef __thumb2__ it hs # endif subhs r11,r8,#64 @ len-=64 str r6,[r14,#-8] str r7,[r14,#-4] bhi .Loop_outer beq .Ldone # if __ARM_ARCH<7 b .Ltail .align 4 .Lunaligned:@ unaligned endian-neutral path cmp r11,#64 @ restore flags # endif #endif #if __ARM_ARCH<7 ldr r11,[sp,#4*(3)] add r0,r0,r8 @ accumulate key material add r1,r1,r9 add r2,r2,r10 # ifdef __thumb2__ itete lo # endif eorlo r8,r8,r8 @ zero or ... ldrhsb r8,[r12],#16 @ ... load input eorlo r9,r9,r9 ldrhsb r9,[r12,#-12] add r3,r3,r11 # ifdef __thumb2__ itete lo # endif eorlo r10,r10,r10 ldrhsb r10,[r12,#-8] eorlo r11,r11,r11 ldrhsb r11,[r12,#-4] eor r0,r8,r0 @ xor with input (or zero) eor r1,r9,r1 # ifdef __thumb2__ itt hs # endif ldrhsb r8,[r12,#-15] @ load more input ldrhsb r9,[r12,#-11] eor r2,r10,r2 strb r0,[r14],#16 @ store output eor r3,r11,r3 # ifdef __thumb2__ itt hs # endif ldrhsb r10,[r12,#-7] ldrhsb r11,[r12,#-3] strb r1,[r14,#-12] eor r0,r8,r0,lsr#8 strb r2,[r14,#-8] eor r1,r9,r1,lsr#8 # ifdef __thumb2__ itt hs # endif ldrhsb r8,[r12,#-14] @ load more input ldrhsb r9,[r12,#-10] strb r3,[r14,#-4] eor r2,r10,r2,lsr#8 strb r0,[r14,#-15] eor r3,r11,r3,lsr#8 # ifdef __thumb2__ itt hs # endif ldrhsb r10,[r12,#-6] ldrhsb r11,[r12,#-2] strb r1,[r14,#-11] eor r0,r8,r0,lsr#8 strb r2,[r14,#-7] eor r1,r9,r1,lsr#8 # ifdef __thumb2__ itt hs # endif ldrhsb r8,[r12,#-13] @ load more input ldrhsb r9,[r12,#-9] strb r3,[r14,#-3] eor r2,r10,r2,lsr#8 strb r0,[r14,#-14] eor r3,r11,r3,lsr#8 # ifdef __thumb2__ itt hs # endif ldrhsb r10,[r12,#-5] ldrhsb r11,[r12,#-1] strb r1,[r14,#-10] strb r2,[r14,#-6] eor r0,r8,r0,lsr#8 strb r3,[r14,#-2] eor r1,r9,r1,lsr#8 strb r0,[r14,#-13] eor r2,r10,r2,lsr#8 strb r1,[r14,#-9] eor r3,r11,r3,lsr#8 strb r2,[r14,#-5] strb r3,[r14,#-1] add r8,sp,#4*(4+0) ldmia r8,{r8,r9,r10,r11} @ load key material add r0,sp,#4*(16+8) add r4,r4,r8 @ accumulate key material add r5,r5,r9 add r6,r6,r10 # ifdef __thumb2__ itete lo # endif eorlo r8,r8,r8 @ zero or ... ldrhsb r8,[r12],#16 @ ... 
load input eorlo r9,r9,r9 ldrhsb r9,[r12,#-12] add r7,r7,r11 # ifdef __thumb2__ itete lo # endif eorlo r10,r10,r10 ldrhsb r10,[r12,#-8] eorlo r11,r11,r11 ldrhsb r11,[r12,#-4] eor r4,r8,r4 @ xor with input (or zero) eor r5,r9,r5 # ifdef __thumb2__ itt hs # endif ldrhsb r8,[r12,#-15] @ load more input ldrhsb r9,[r12,#-11] eor r6,r10,r6 strb r4,[r14],#16 @ store output eor r7,r11,r7 # ifdef __thumb2__ itt hs # endif ldrhsb r10,[r12,#-7] ldrhsb r11,[r12,#-3] strb r5,[r14,#-12] eor r4,r8,r4,lsr#8 strb r6,[r14,#-8] eor r5,r9,r5,lsr#8 # ifdef __thumb2__ itt hs # endif ldrhsb r8,[r12,#-14] @ load more input ldrhsb r9,[r12,#-10] strb r7,[r14,#-4] eor r6,r10,r6,lsr#8 strb r4,[r14,#-15] eor r7,r11,r7,lsr#8 # ifdef __thumb2__ itt hs # endif ldrhsb r10,[r12,#-6] ldrhsb r11,[r12,#-2] strb r5,[r14,#-11] eor r4,r8,r4,lsr#8 strb r6,[r14,#-7] eor r5,r9,r5,lsr#8 # ifdef __thumb2__ itt hs # endif ldrhsb r8,[r12,#-13] @ load more input ldrhsb r9,[r12,#-9] strb r7,[r14,#-3] eor r6,r10,r6,lsr#8 strb r4,[r14,#-14] eor r7,r11,r7,lsr#8 # ifdef __thumb2__ itt hs # endif ldrhsb r10,[r12,#-5] ldrhsb r11,[r12,#-1] strb r5,[r14,#-10] strb r6,[r14,#-6] eor r4,r8,r4,lsr#8 strb r7,[r14,#-2] eor r5,r9,r5,lsr#8 strb r4,[r14,#-13] eor r6,r10,r6,lsr#8 strb r5,[r14,#-9] eor r7,r11,r7,lsr#8 strb r6,[r14,#-5] strb r7,[r14,#-1] add r8,sp,#4*(4+4) ldmia r8,{r8,r9,r10,r11} @ load key material ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half # ifdef __thumb2__ itt hi # endif strhi r10,[sp,#4*(16+10)] @ copy "rx" strhi r11,[sp,#4*(16+11)] @ copy "rx" add r0,r0,r8 @ accumulate key material add r1,r1,r9 add r2,r2,r10 # ifdef __thumb2__ itete lo # endif eorlo r8,r8,r8 @ zero or ... ldrhsb r8,[r12],#16 @ ... load input eorlo r9,r9,r9 ldrhsb r9,[r12,#-12] add r3,r3,r11 # ifdef __thumb2__ itete lo # endif eorlo r10,r10,r10 ldrhsb r10,[r12,#-8] eorlo r11,r11,r11 ldrhsb r11,[r12,#-4] eor r0,r8,r0 @ xor with input (or zero) eor r1,r9,r1 # ifdef __thumb2__ itt hs # endif ldrhsb r8,[r12,#-15] @ load more input ldrhsb r9,[r12,#-11] eor r2,r10,r2 strb r0,[r14],#16 @ store output eor r3,r11,r3 # ifdef __thumb2__ itt hs # endif ldrhsb r10,[r12,#-7] ldrhsb r11,[r12,#-3] strb r1,[r14,#-12] eor r0,r8,r0,lsr#8 strb r2,[r14,#-8] eor r1,r9,r1,lsr#8 # ifdef __thumb2__ itt hs # endif ldrhsb r8,[r12,#-14] @ load more input ldrhsb r9,[r12,#-10] strb r3,[r14,#-4] eor r2,r10,r2,lsr#8 strb r0,[r14,#-15] eor r3,r11,r3,lsr#8 # ifdef __thumb2__ itt hs # endif ldrhsb r10,[r12,#-6] ldrhsb r11,[r12,#-2] strb r1,[r14,#-11] eor r0,r8,r0,lsr#8 strb r2,[r14,#-7] eor r1,r9,r1,lsr#8 # ifdef __thumb2__ itt hs # endif ldrhsb r8,[r12,#-13] @ load more input ldrhsb r9,[r12,#-9] strb r3,[r14,#-3] eor r2,r10,r2,lsr#8 strb r0,[r14,#-14] eor r3,r11,r3,lsr#8 # ifdef __thumb2__ itt hs # endif ldrhsb r10,[r12,#-5] ldrhsb r11,[r12,#-1] strb r1,[r14,#-10] strb r2,[r14,#-6] eor r0,r8,r0,lsr#8 strb r3,[r14,#-2] eor r1,r9,r1,lsr#8 strb r0,[r14,#-13] eor r2,r10,r2,lsr#8 strb r1,[r14,#-9] eor r3,r11,r3,lsr#8 strb r2,[r14,#-5] strb r3,[r14,#-1] add r8,sp,#4*(4+8) ldmia r8,{r8,r9,r10,r11} @ load key material add r4,r4,r8 @ accumulate key material # ifdef __thumb2__ itt hi # endif addhi r8,r8,#1 @ next counter value strhi r8,[sp,#4*(12)] @ save next counter value add r5,r5,r9 add r6,r6,r10 # ifdef __thumb2__ itete lo # endif eorlo r8,r8,r8 @ zero or ... ldrhsb r8,[r12],#16 @ ... 
load input eorlo r9,r9,r9 ldrhsb r9,[r12,#-12] add r7,r7,r11 # ifdef __thumb2__ itete lo # endif eorlo r10,r10,r10 ldrhsb r10,[r12,#-8] eorlo r11,r11,r11 ldrhsb r11,[r12,#-4] eor r4,r8,r4 @ xor with input (or zero) eor r5,r9,r5 # ifdef __thumb2__ itt hs # endif ldrhsb r8,[r12,#-15] @ load more input ldrhsb r9,[r12,#-11] eor r6,r10,r6 strb r4,[r14],#16 @ store output eor r7,r11,r7 # ifdef __thumb2__ itt hs # endif ldrhsb r10,[r12,#-7] ldrhsb r11,[r12,#-3] strb r5,[r14,#-12] eor r4,r8,r4,lsr#8 strb r6,[r14,#-8] eor r5,r9,r5,lsr#8 # ifdef __thumb2__ itt hs # endif ldrhsb r8,[r12,#-14] @ load more input ldrhsb r9,[r12,#-10] strb r7,[r14,#-4] eor r6,r10,r6,lsr#8 strb r4,[r14,#-15] eor r7,r11,r7,lsr#8 # ifdef __thumb2__ itt hs # endif ldrhsb r10,[r12,#-6] ldrhsb r11,[r12,#-2] strb r5,[r14,#-11] eor r4,r8,r4,lsr#8 strb r6,[r14,#-7] eor r5,r9,r5,lsr#8 # ifdef __thumb2__ itt hs # endif ldrhsb r8,[r12,#-13] @ load more input ldrhsb r9,[r12,#-9] strb r7,[r14,#-3] eor r6,r10,r6,lsr#8 strb r4,[r14,#-14] eor r7,r11,r7,lsr#8 # ifdef __thumb2__ itt hs # endif ldrhsb r10,[r12,#-5] ldrhsb r11,[r12,#-1] strb r5,[r14,#-10] strb r6,[r14,#-6] eor r4,r8,r4,lsr#8 strb r7,[r14,#-2] eor r5,r9,r5,lsr#8 strb r4,[r14,#-13] eor r6,r10,r6,lsr#8 strb r5,[r14,#-9] eor r7,r11,r7,lsr#8 strb r6,[r14,#-5] strb r7,[r14,#-1] # ifdef __thumb2__ it ne # endif ldrne r8,[sp,#4*(32+2)] @ re-load len # ifdef __thumb2__ it hs # endif subhs r11,r8,#64 @ len-=64 bhi .Loop_outer beq .Ldone #endif .Ltail: ldr r12,[sp,#4*(32+1)] @ load inp add r9,sp,#4*(0) ldr r14,[sp,#4*(32+0)] @ load out .Loop_tail: ldrb r10,[r9],#1 @ read buffer on stack ldrb r11,[r12],#1 @ read input subs r8,r8,#1 eor r11,r11,r10 strb r11,[r14],#1 @ store output bne .Loop_tail .Ldone: add sp,sp,#4*(32+3) ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} .size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw #if __ARM_MAX_ARCH__>=7 .arch armv7-a .fpu neon .globl ChaCha20_ctr32_neon .hidden ChaCha20_ctr32_neon .type ChaCha20_ctr32_neon,%function .align 5 ChaCha20_ctr32_neon: ldr r12,[sp,#0] @ pull pointer to counter and nonce stmdb sp!,{r0,r1,r2,r4-r11,lr} adr r14,.Lsigma vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI spec says so stmdb sp!,{r0,r1,r2,r3} vld1.32 {q1,q2},[r3] @ load key ldmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} @ load key sub sp,sp,#4*(16+16) vld1.32 {q3},[r12] @ load counter and nonce add r12,sp,#4*8 ldmia r14,{r0,r1,r2,r3} @ load sigma vld1.32 {q0},[r14]! 
@ load sigma vld1.32 {q12},[r14] @ one vst1.32 {q2,q3},[r12] @ copy 1/2key|counter|nonce vst1.32 {q0,q1},[sp] @ copy sigma|1/2key str r10,[sp,#4*(16+10)] @ off-load "rx" str r11,[sp,#4*(16+11)] @ off-load "rx" vshl.i32 d26,d24,#1 @ two vstr d24,[sp,#4*(16+0)] vshl.i32 d28,d24,#2 @ four vstr d26,[sp,#4*(16+2)] vmov q4,q0 vstr d28,[sp,#4*(16+4)] vmov q8,q0 vmov q5,q1 vmov q9,q1 b .Loop_neon_enter .align 4 .Loop_neon_outer: ldmia sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ load key material cmp r11,#64*2 @ if len<=64*2 bls .Lbreak_neon @ switch to integer-only vmov q4,q0 str r11,[sp,#4*(32+2)] @ save len vmov q8,q0 str r12, [sp,#4*(32+1)] @ save inp vmov q5,q1 str r14, [sp,#4*(32+0)] @ save out vmov q9,q1 .Loop_neon_enter: ldr r11, [sp,#4*(15)] vadd.i32 q7,q3,q12 @ counter+1 ldr r12,[sp,#4*(12)] @ modulo-scheduled load vmov q6,q2 ldr r10, [sp,#4*(13)] vmov q10,q2 ldr r14,[sp,#4*(14)] vadd.i32 q11,q7,q12 @ counter+2 str r11, [sp,#4*(16+15)] mov r11,#10 add r12,r12,#3 @ counter+3 b .Loop_neon .align 4 .Loop_neon: subs r11,r11,#1 vadd.i32 q0,q0,q1 add r0,r0,r4 vadd.i32 q4,q4,q5 mov r12,r12,ror#16 vadd.i32 q8,q8,q9 add r1,r1,r5 veor q3,q3,q0 mov r10,r10,ror#16 veor q7,q7,q4 eor r12,r12,r0,ror#16 veor q11,q11,q8 eor r10,r10,r1,ror#16 vrev32.16 q3,q3 add r8,r8,r12 vrev32.16 q7,q7 mov r4,r4,ror#20 vrev32.16 q11,q11 add r9,r9,r10 vadd.i32 q2,q2,q3 mov r5,r5,ror#20 vadd.i32 q6,q6,q7 eor r4,r4,r8,ror#20 vadd.i32 q10,q10,q11 eor r5,r5,r9,ror#20 veor q12,q1,q2 add r0,r0,r4 veor q13,q5,q6 mov r12,r12,ror#24 veor q14,q9,q10 add r1,r1,r5 vshr.u32 q1,q12,#20 mov r10,r10,ror#24 vshr.u32 q5,q13,#20 eor r12,r12,r0,ror#24 vshr.u32 q9,q14,#20 eor r10,r10,r1,ror#24 vsli.32 q1,q12,#12 add r8,r8,r12 vsli.32 q5,q13,#12 mov r4,r4,ror#25 vsli.32 q9,q14,#12 add r9,r9,r10 vadd.i32 q0,q0,q1 mov r5,r5,ror#25 vadd.i32 q4,q4,q5 str r10,[sp,#4*(16+13)] vadd.i32 q8,q8,q9 ldr r10,[sp,#4*(16+15)] veor q12,q3,q0 eor r4,r4,r8,ror#25 veor q13,q7,q4 eor r5,r5,r9,ror#25 veor q14,q11,q8 str r8,[sp,#4*(16+8)] vshr.u32 q3,q12,#24 ldr r8,[sp,#4*(16+10)] vshr.u32 q7,q13,#24 add r2,r2,r6 vshr.u32 q11,q14,#24 mov r14,r14,ror#16 vsli.32 q3,q12,#8 str r9,[sp,#4*(16+9)] vsli.32 q7,q13,#8 ldr r9,[sp,#4*(16+11)] vsli.32 q11,q14,#8 add r3,r3,r7 vadd.i32 q2,q2,q3 mov r10,r10,ror#16 vadd.i32 q6,q6,q7 eor r14,r14,r2,ror#16 vadd.i32 q10,q10,q11 eor r10,r10,r3,ror#16 veor q12,q1,q2 add r8,r8,r14 veor q13,q5,q6 mov r6,r6,ror#20 veor q14,q9,q10 add r9,r9,r10 vshr.u32 q1,q12,#25 mov r7,r7,ror#20 vshr.u32 q5,q13,#25 eor r6,r6,r8,ror#20 vshr.u32 q9,q14,#25 eor r7,r7,r9,ror#20 vsli.32 q1,q12,#7 add r2,r2,r6 vsli.32 q5,q13,#7 mov r14,r14,ror#24 vsli.32 q9,q14,#7 add r3,r3,r7 vext.8 q2,q2,q2,#8 mov r10,r10,ror#24 vext.8 q6,q6,q6,#8 eor r14,r14,r2,ror#24 vext.8 q10,q10,q10,#8 eor r10,r10,r3,ror#24 vext.8 q1,q1,q1,#4 add r8,r8,r14 vext.8 q5,q5,q5,#4 mov r6,r6,ror#25 vext.8 q9,q9,q9,#4 add r9,r9,r10 vext.8 q3,q3,q3,#12 mov r7,r7,ror#25 vext.8 q7,q7,q7,#12 eor r6,r6,r8,ror#25 vext.8 q11,q11,q11,#12 eor r7,r7,r9,ror#25 vadd.i32 q0,q0,q1 add r0,r0,r5 vadd.i32 q4,q4,q5 mov r10,r10,ror#16 vadd.i32 q8,q8,q9 add r1,r1,r6 veor q3,q3,q0 mov r12,r12,ror#16 veor q7,q7,q4 eor r10,r10,r0,ror#16 veor q11,q11,q8 eor r12,r12,r1,ror#16 vrev32.16 q3,q3 add r8,r8,r10 vrev32.16 q7,q7 mov r5,r5,ror#20 vrev32.16 q11,q11 add r9,r9,r12 vadd.i32 q2,q2,q3 mov r6,r6,ror#20 vadd.i32 q6,q6,q7 eor r5,r5,r8,ror#20 vadd.i32 q10,q10,q11 eor r6,r6,r9,ror#20 veor q12,q1,q2 add r0,r0,r5 veor q13,q5,q6 mov r10,r10,ror#24 veor q14,q9,q10 add r1,r1,r6 vshr.u32 q1,q12,#20 mov r12,r12,ror#24 vshr.u32 
q5,q13,#20 eor r10,r10,r0,ror#24 vshr.u32 q9,q14,#20 eor r12,r12,r1,ror#24 vsli.32 q1,q12,#12 add r8,r8,r10 vsli.32 q5,q13,#12 mov r5,r5,ror#25 vsli.32 q9,q14,#12 str r10,[sp,#4*(16+15)] vadd.i32 q0,q0,q1 ldr r10,[sp,#4*(16+13)] vadd.i32 q4,q4,q5 add r9,r9,r12 vadd.i32 q8,q8,q9 mov r6,r6,ror#25 veor q12,q3,q0 eor r5,r5,r8,ror#25 veor q13,q7,q4 eor r6,r6,r9,ror#25 veor q14,q11,q8 str r8,[sp,#4*(16+10)] vshr.u32 q3,q12,#24 ldr r8,[sp,#4*(16+8)] vshr.u32 q7,q13,#24 add r2,r2,r7 vshr.u32 q11,q14,#24 mov r10,r10,ror#16 vsli.32 q3,q12,#8 str r9,[sp,#4*(16+11)] vsli.32 q7,q13,#8 ldr r9,[sp,#4*(16+9)] vsli.32 q11,q14,#8 add r3,r3,r4 vadd.i32 q2,q2,q3 mov r14,r14,ror#16 vadd.i32 q6,q6,q7 eor r10,r10,r2,ror#16 vadd.i32 q10,q10,q11 eor r14,r14,r3,ror#16 veor q12,q1,q2 add r8,r8,r10 veor q13,q5,q6 mov r7,r7,ror#20 veor q14,q9,q10 add r9,r9,r14 vshr.u32 q1,q12,#25 mov r4,r4,ror#20 vshr.u32 q5,q13,#25 eor r7,r7,r8,ror#20 vshr.u32 q9,q14,#25 eor r4,r4,r9,ror#20 vsli.32 q1,q12,#7 add r2,r2,r7 vsli.32 q5,q13,#7 mov r10,r10,ror#24 vsli.32 q9,q14,#7 add r3,r3,r4 vext.8 q2,q2,q2,#8 mov r14,r14,ror#24 vext.8 q6,q6,q6,#8 eor r10,r10,r2,ror#24 vext.8 q10,q10,q10,#8 eor r14,r14,r3,ror#24 vext.8 q1,q1,q1,#12 add r8,r8,r10 vext.8 q5,q5,q5,#12 mov r7,r7,ror#25 vext.8 q9,q9,q9,#12 add r9,r9,r14 vext.8 q3,q3,q3,#4 mov r4,r4,ror#25 vext.8 q7,q7,q7,#4 eor r7,r7,r8,ror#25 vext.8 q11,q11,q11,#4 eor r4,r4,r9,ror#25 bne .Loop_neon add r11,sp,#32 vld1.32 {q12,q13},[sp] @ load key material vld1.32 {q14,q15},[r11] ldr r11,[sp,#4*(32+2)] @ load len str r8, [sp,#4*(16+8)] @ modulo-scheduled store str r9, [sp,#4*(16+9)] str r12,[sp,#4*(16+12)] str r10, [sp,#4*(16+13)] str r14,[sp,#4*(16+14)] @ at this point we have first half of 512-bit result in @ rx and second half at sp+4*(16+8) ldr r12,[sp,#4*(32+1)] @ load inp ldr r14,[sp,#4*(32+0)] @ load out vadd.i32 q0,q0,q12 @ accumulate key material vadd.i32 q4,q4,q12 vadd.i32 q8,q8,q12 vldr d24,[sp,#4*(16+0)] @ one vadd.i32 q1,q1,q13 vadd.i32 q5,q5,q13 vadd.i32 q9,q9,q13 vldr d26,[sp,#4*(16+2)] @ two vadd.i32 q2,q2,q14 vadd.i32 q6,q6,q14 vadd.i32 q10,q10,q14 vadd.i32 d14,d14,d24 @ counter+1 vadd.i32 d22,d22,d26 @ counter+2 vadd.i32 q3,q3,q15 vadd.i32 q7,q7,q15 vadd.i32 q11,q11,q15 cmp r11,#64*4 blo .Ltail_neon vld1.8 {q12,q13},[r12]! @ load input mov r11,sp vld1.8 {q14,q15},[r12]! veor q0,q0,q12 @ xor with input veor q1,q1,q13 vld1.8 {q12,q13},[r12]! veor q2,q2,q14 veor q3,q3,q15 vld1.8 {q14,q15},[r12]! veor q4,q4,q12 vst1.8 {q0,q1},[r14]! @ store output veor q5,q5,q13 vld1.8 {q12,q13},[r12]! veor q6,q6,q14 vst1.8 {q2,q3},[r14]! veor q7,q7,q15 vld1.8 {q14,q15},[r12]! veor q8,q8,q12 vld1.32 {q0,q1},[r11]! @ load for next iteration veor d25,d25,d25 vldr d24,[sp,#4*(16+4)] @ four veor q9,q9,q13 vld1.32 {q2,q3},[r11] veor q10,q10,q14 vst1.8 {q4,q5},[r14]! veor q11,q11,q15 vst1.8 {q6,q7},[r14]! vadd.i32 d6,d6,d24 @ next counter value vldr d24,[sp,#4*(16+0)] @ one ldmia sp,{r8,r9,r10,r11} @ load key material add r0,r0,r8 @ accumulate key material ldr r8,[r12],#16 @ load input vst1.8 {q8,q9},[r14]! add r1,r1,r9 ldr r9,[r12,#-12] vst1.8 {q10,q11},[r14]! 
add r2,r2,r10 ldr r10,[r12,#-8] add r3,r3,r11 ldr r11,[r12,#-4] # ifdef __ARMEB__ rev r0,r0 rev r1,r1 rev r2,r2 rev r3,r3 # endif eor r0,r0,r8 @ xor with input add r8,sp,#4*(4) eor r1,r1,r9 str r0,[r14],#16 @ store output eor r2,r2,r10 str r1,[r14,#-12] eor r3,r3,r11 ldmia r8,{r8,r9,r10,r11} @ load key material str r2,[r14,#-8] str r3,[r14,#-4] add r4,r4,r8 @ accumulate key material ldr r8,[r12],#16 @ load input add r5,r5,r9 ldr r9,[r12,#-12] add r6,r6,r10 ldr r10,[r12,#-8] add r7,r7,r11 ldr r11,[r12,#-4] # ifdef __ARMEB__ rev r4,r4 rev r5,r5 rev r6,r6 rev r7,r7 # endif eor r4,r4,r8 add r8,sp,#4*(8) eor r5,r5,r9 str r4,[r14],#16 @ store output eor r6,r6,r10 str r5,[r14,#-12] eor r7,r7,r11 ldmia r8,{r8,r9,r10,r11} @ load key material str r6,[r14,#-8] add r0,sp,#4*(16+8) str r7,[r14,#-4] ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half add r0,r0,r8 @ accumulate key material ldr r8,[r12],#16 @ load input add r1,r1,r9 ldr r9,[r12,#-12] # ifdef __thumb2__ it hi # endif strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it add r2,r2,r10 ldr r10,[r12,#-8] # ifdef __thumb2__ it hi # endif strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it add r3,r3,r11 ldr r11,[r12,#-4] # ifdef __ARMEB__ rev r0,r0 rev r1,r1 rev r2,r2 rev r3,r3 # endif eor r0,r0,r8 add r8,sp,#4*(12) eor r1,r1,r9 str r0,[r14],#16 @ store output eor r2,r2,r10 str r1,[r14,#-12] eor r3,r3,r11 ldmia r8,{r8,r9,r10,r11} @ load key material str r2,[r14,#-8] str r3,[r14,#-4] add r4,r4,r8 @ accumulate key material add r8,r8,#4 @ next counter value add r5,r5,r9 str r8,[sp,#4*(12)] @ save next counter value ldr r8,[r12],#16 @ load input add r6,r6,r10 add r4,r4,#3 @ counter+3 ldr r9,[r12,#-12] add r7,r7,r11 ldr r10,[r12,#-8] ldr r11,[r12,#-4] # ifdef __ARMEB__ rev r4,r4 rev r5,r5 rev r6,r6 rev r7,r7 # endif eor r4,r4,r8 # ifdef __thumb2__ it hi # endif ldrhi r8,[sp,#4*(32+2)] @ re-load len eor r5,r5,r9 eor r6,r6,r10 str r4,[r14],#16 @ store output eor r7,r7,r11 str r5,[r14,#-12] sub r11,r8,#64*4 @ len-=64*4 str r6,[r14,#-8] str r7,[r14,#-4] bhi .Loop_neon_outer b .Ldone_neon .align 4 .Lbreak_neon: @ harmonize NEON and integer-only stack frames: load data @ from NEON frame, but save to integer-only one; distance @ between the two is 4*(32+4+16-32)=4*(20). str r11, [sp,#4*(20+32+2)] @ save len add r11,sp,#4*(32+4) str r12, [sp,#4*(20+32+1)] @ save inp str r14, [sp,#4*(20+32+0)] @ save out ldr r12,[sp,#4*(16+10)] ldr r14,[sp,#4*(16+11)] vldmia r11,{d8,d9,d10,d11,d12,d13,d14,d15} @ fulfill ABI requirement str r12,[sp,#4*(20+16+10)] @ copy "rx" str r14,[sp,#4*(20+16+11)] @ copy "rx" ldr r11, [sp,#4*(15)] ldr r12,[sp,#4*(12)] @ modulo-scheduled load ldr r10, [sp,#4*(13)] ldr r14,[sp,#4*(14)] str r11, [sp,#4*(20+16+15)] add r11,sp,#4*(20) vst1.32 {q0,q1},[r11]! @ copy key add sp,sp,#4*(20) @ switch frame vst1.32 {q2,q3},[r11] mov r11,#10 b .Loop @ go integer-only .align 4 .Ltail_neon: cmp r11,#64*3 bhs .L192_or_more_neon cmp r11,#64*2 bhs .L128_or_more_neon cmp r11,#64*1 bhs .L64_or_more_neon add r8,sp,#4*(8) vst1.8 {q0,q1},[sp] add r10,sp,#4*(0) vst1.8 {q2,q3},[r8] b .Loop_tail_neon .align 4 .L64_or_more_neon: vld1.8 {q12,q13},[r12]! vld1.8 {q14,q15},[r12]! veor q0,q0,q12 veor q1,q1,q13 veor q2,q2,q14 veor q3,q3,q15 vst1.8 {q0,q1},[r14]! vst1.8 {q2,q3},[r14]! beq .Ldone_neon add r8,sp,#4*(8) vst1.8 {q4,q5},[sp] add r10,sp,#4*(0) vst1.8 {q6,q7},[r8] sub r11,r11,#64*1 @ len-=64*1 b .Loop_tail_neon .align 4 .L128_or_more_neon: vld1.8 {q12,q13},[r12]! vld1.8 {q14,q15},[r12]! veor q0,q0,q12 veor q1,q1,q13 vld1.8 {q12,q13},[r12]! 
veor q2,q2,q14 veor q3,q3,q15 vld1.8 {q14,q15},[r12]! veor q4,q4,q12 veor q5,q5,q13 vst1.8 {q0,q1},[r14]! veor q6,q6,q14 vst1.8 {q2,q3},[r14]! veor q7,q7,q15 vst1.8 {q4,q5},[r14]! vst1.8 {q6,q7},[r14]! beq .Ldone_neon add r8,sp,#4*(8) vst1.8 {q8,q9},[sp] add r10,sp,#4*(0) vst1.8 {q10,q11},[r8] sub r11,r11,#64*2 @ len-=64*2 b .Loop_tail_neon .align 4 .L192_or_more_neon: vld1.8 {q12,q13},[r12]! vld1.8 {q14,q15},[r12]! veor q0,q0,q12 veor q1,q1,q13 vld1.8 {q12,q13},[r12]! veor q2,q2,q14 veor q3,q3,q15 vld1.8 {q14,q15},[r12]! veor q4,q4,q12 veor q5,q5,q13 vld1.8 {q12,q13},[r12]! veor q6,q6,q14 vst1.8 {q0,q1},[r14]! veor q7,q7,q15 vld1.8 {q14,q15},[r12]! veor q8,q8,q12 vst1.8 {q2,q3},[r14]! veor q9,q9,q13 vst1.8 {q4,q5},[r14]! veor q10,q10,q14 vst1.8 {q6,q7},[r14]! veor q11,q11,q15 vst1.8 {q8,q9},[r14]! vst1.8 {q10,q11},[r14]! beq .Ldone_neon ldmia sp,{r8,r9,r10,r11} @ load key material add r0,r0,r8 @ accumulate key material add r8,sp,#4*(4) add r1,r1,r9 add r2,r2,r10 add r3,r3,r11 ldmia r8,{r8,r9,r10,r11} @ load key material add r4,r4,r8 @ accumulate key material add r8,sp,#4*(8) add r5,r5,r9 add r6,r6,r10 add r7,r7,r11 ldmia r8,{r8,r9,r10,r11} @ load key material # ifdef __ARMEB__ rev r0,r0 rev r1,r1 rev r2,r2 rev r3,r3 rev r4,r4 rev r5,r5 rev r6,r6 rev r7,r7 # endif stmia sp,{r0,r1,r2,r3,r4,r5,r6,r7} add r0,sp,#4*(16+8) ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half add r0,r0,r8 @ accumulate key material add r8,sp,#4*(12) add r1,r1,r9 add r2,r2,r10 add r3,r3,r11 ldmia r8,{r8,r9,r10,r11} @ load key material add r4,r4,r8 @ accumulate key material add r8,sp,#4*(8) add r5,r5,r9 add r4,r4,#3 @ counter+3 add r6,r6,r10 add r7,r7,r11 ldr r11,[sp,#4*(32+2)] @ re-load len # ifdef __ARMEB__ rev r0,r0 rev r1,r1 rev r2,r2 rev r3,r3 rev r4,r4 rev r5,r5 rev r6,r6 rev r7,r7 # endif stmia r8,{r0,r1,r2,r3,r4,r5,r6,r7} add r10,sp,#4*(0) sub r11,r11,#64*3 @ len-=64*3 .Loop_tail_neon: ldrb r8,[r10],#1 @ read buffer on stack ldrb r9,[r12],#1 @ read input subs r11,r11,#1 eor r8,r8,r9 strb r8,[r14],#1 @ store output bne .Loop_tail_neon .Ldone_neon: add sp,sp,#4*(32+4) vldmia sp,{d8,d9,d10,d11,d12,d13,d14,d15} add sp,sp,#4*(16+3) ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} .size ChaCha20_ctr32_neon,.-ChaCha20_ctr32_neon #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) ring-0.17.14/pregenerated/chacha-armv8-ios64.S000064400000000000000000001163711046102023000167430ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) .section __TEXT,__const .align 5 Lsigma: .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral Lone: .long 1,0,0,0 .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .text .globl _ChaCha20_ctr32_nohw .private_extern _ChaCha20_ctr32_nohw .align 5 _ChaCha20_ctr32_nohw: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-96]! 
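// Argument registers, as used by the loads and stores below:
// x0 = output, x1 = input, x2 = length in bytes, x3 = 32-byte key, x4 = 16-byte counter-and-nonce block.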
add x29,sp,#0 adrp x5,Lsigma@PAGE add x5,x5,Lsigma@PAGEOFF stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] sub sp,sp,#64 ldp x22,x23,[x5] // load sigma ldp x24,x25,[x3] // load key ldp x26,x27,[x3,#16] ldp x28,x30,[x4] // load counter #ifdef __AARCH64EB__ ror x24,x24,#32 ror x25,x25,#32 ror x26,x26,#32 ror x27,x27,#32 ror x28,x28,#32 ror x30,x30,#32 #endif Loop_outer: mov w5,w22 // unpack key block lsr x6,x22,#32 mov w7,w23 lsr x8,x23,#32 mov w9,w24 lsr x10,x24,#32 mov w11,w25 lsr x12,x25,#32 mov w13,w26 lsr x14,x26,#32 mov w15,w27 lsr x16,x27,#32 mov w17,w28 lsr x19,x28,#32 mov w20,w30 lsr x21,x30,#32 mov x4,#10 subs x2,x2,#64 Loop: sub x4,x4,#1 add w5,w5,w9 add w6,w6,w10 add w7,w7,w11 add w8,w8,w12 eor w17,w17,w5 eor w19,w19,w6 eor w20,w20,w7 eor w21,w21,w8 ror w17,w17,#16 ror w19,w19,#16 ror w20,w20,#16 ror w21,w21,#16 add w13,w13,w17 add w14,w14,w19 add w15,w15,w20 add w16,w16,w21 eor w9,w9,w13 eor w10,w10,w14 eor w11,w11,w15 eor w12,w12,w16 ror w9,w9,#20 ror w10,w10,#20 ror w11,w11,#20 ror w12,w12,#20 add w5,w5,w9 add w6,w6,w10 add w7,w7,w11 add w8,w8,w12 eor w17,w17,w5 eor w19,w19,w6 eor w20,w20,w7 eor w21,w21,w8 ror w17,w17,#24 ror w19,w19,#24 ror w20,w20,#24 ror w21,w21,#24 add w13,w13,w17 add w14,w14,w19 add w15,w15,w20 add w16,w16,w21 eor w9,w9,w13 eor w10,w10,w14 eor w11,w11,w15 eor w12,w12,w16 ror w9,w9,#25 ror w10,w10,#25 ror w11,w11,#25 ror w12,w12,#25 add w5,w5,w10 add w6,w6,w11 add w7,w7,w12 add w8,w8,w9 eor w21,w21,w5 eor w17,w17,w6 eor w19,w19,w7 eor w20,w20,w8 ror w21,w21,#16 ror w17,w17,#16 ror w19,w19,#16 ror w20,w20,#16 add w15,w15,w21 add w16,w16,w17 add w13,w13,w19 add w14,w14,w20 eor w10,w10,w15 eor w11,w11,w16 eor w12,w12,w13 eor w9,w9,w14 ror w10,w10,#20 ror w11,w11,#20 ror w12,w12,#20 ror w9,w9,#20 add w5,w5,w10 add w6,w6,w11 add w7,w7,w12 add w8,w8,w9 eor w21,w21,w5 eor w17,w17,w6 eor w19,w19,w7 eor w20,w20,w8 ror w21,w21,#24 ror w17,w17,#24 ror w19,w19,#24 ror w20,w20,#24 add w15,w15,w21 add w16,w16,w17 add w13,w13,w19 add w14,w14,w20 eor w10,w10,w15 eor w11,w11,w16 eor w12,w12,w13 eor w9,w9,w14 ror w10,w10,#25 ror w11,w11,#25 ror w12,w12,#25 ror w9,w9,#25 cbnz x4,Loop add w5,w5,w22 // accumulate key block add x6,x6,x22,lsr#32 add w7,w7,w23 add x8,x8,x23,lsr#32 add w9,w9,w24 add x10,x10,x24,lsr#32 add w11,w11,w25 add x12,x12,x25,lsr#32 add w13,w13,w26 add x14,x14,x26,lsr#32 add w15,w15,w27 add x16,x16,x27,lsr#32 add w17,w17,w28 add x19,x19,x28,lsr#32 add w20,w20,w30 add x21,x21,x30,lsr#32 b.lo Ltail add x5,x5,x6,lsl#32 // pack add x7,x7,x8,lsl#32 ldp x6,x8,[x1,#0] // load input add x9,x9,x10,lsl#32 add x11,x11,x12,lsl#32 ldp x10,x12,[x1,#16] add x13,x13,x14,lsl#32 add x15,x15,x16,lsl#32 ldp x14,x16,[x1,#32] add x17,x17,x19,lsl#32 add x20,x20,x21,lsl#32 ldp x19,x21,[x1,#48] add x1,x1,#64 #ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 rev x9,x9 rev x11,x11 rev x13,x13 rev x15,x15 rev x17,x17 rev x20,x20 #endif eor x5,x5,x6 eor x7,x7,x8 eor x9,x9,x10 eor x11,x11,x12 eor x13,x13,x14 eor x15,x15,x16 eor x17,x17,x19 eor x20,x20,x21 stp x5,x7,[x0,#0] // store output add x28,x28,#1 // increment counter stp x9,x11,[x0,#16] stp x13,x15,[x0,#32] stp x17,x20,[x0,#48] add x0,x0,#64 b.hi Loop_outer ldp x19,x20,[x29,#16] add sp,sp,#64 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 AARCH64_VALIDATE_LINK_REGISTER ret .align 4 Ltail: add x2,x2,#64 Less_than_64: sub x0,x0,#1 add x1,x1,x2 add x0,x0,x2 add x4,sp,x2 neg x2,x2 add x5,x5,x6,lsl#32 // pack add 
x7,x7,x8,lsl#32 add x9,x9,x10,lsl#32 add x11,x11,x12,lsl#32 add x13,x13,x14,lsl#32 add x15,x15,x16,lsl#32 add x17,x17,x19,lsl#32 add x20,x20,x21,lsl#32 #ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 rev x9,x9 rev x11,x11 rev x13,x13 rev x15,x15 rev x17,x17 rev x20,x20 #endif stp x5,x7,[sp,#0] stp x9,x11,[sp,#16] stp x13,x15,[sp,#32] stp x17,x20,[sp,#48] Loop_tail: ldrb w10,[x1,x2] ldrb w11,[x4,x2] add x2,x2,#1 eor w10,w10,w11 strb w10,[x0,x2] cbnz x2,Loop_tail stp xzr,xzr,[sp,#0] stp xzr,xzr,[sp,#16] stp xzr,xzr,[sp,#32] stp xzr,xzr,[sp,#48] ldp x19,x20,[x29,#16] add sp,sp,#64 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 AARCH64_VALIDATE_LINK_REGISTER ret .globl _ChaCha20_ctr32_neon .private_extern _ChaCha20_ctr32_neon .align 5 _ChaCha20_ctr32_neon: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-96]! add x29,sp,#0 adrp x5,Lsigma@PAGE add x5,x5,Lsigma@PAGEOFF stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] cmp x2,#512 b.hs L512_or_more_neon sub sp,sp,#64 ldp x22,x23,[x5] // load sigma ld1 {v24.4s},[x5],#16 ldp x24,x25,[x3] // load key ldp x26,x27,[x3,#16] ld1 {v25.4s,v26.4s},[x3] ldp x28,x30,[x4] // load counter ld1 {v27.4s},[x4] ld1 {v31.4s},[x5] #ifdef __AARCH64EB__ rev64 v24.4s,v24.4s ror x24,x24,#32 ror x25,x25,#32 ror x26,x26,#32 ror x27,x27,#32 ror x28,x28,#32 ror x30,x30,#32 #endif add v27.4s,v27.4s,v31.4s // += 1 add v28.4s,v27.4s,v31.4s add v29.4s,v28.4s,v31.4s shl v31.4s,v31.4s,#2 // 1 -> 4 Loop_outer_neon: mov w5,w22 // unpack key block lsr x6,x22,#32 mov v0.16b,v24.16b mov w7,w23 lsr x8,x23,#32 mov v4.16b,v24.16b mov w9,w24 lsr x10,x24,#32 mov v16.16b,v24.16b mov w11,w25 mov v1.16b,v25.16b lsr x12,x25,#32 mov v5.16b,v25.16b mov w13,w26 mov v17.16b,v25.16b lsr x14,x26,#32 mov v3.16b,v27.16b mov w15,w27 mov v7.16b,v28.16b lsr x16,x27,#32 mov v19.16b,v29.16b mov w17,w28 mov v2.16b,v26.16b lsr x19,x28,#32 mov v6.16b,v26.16b mov w20,w30 mov v18.16b,v26.16b lsr x21,x30,#32 mov x4,#10 subs x2,x2,#256 Loop_neon: sub x4,x4,#1 add v0.4s,v0.4s,v1.4s add w5,w5,w9 add v4.4s,v4.4s,v5.4s add w6,w6,w10 add v16.4s,v16.4s,v17.4s add w7,w7,w11 eor v3.16b,v3.16b,v0.16b add w8,w8,w12 eor v7.16b,v7.16b,v4.16b eor w17,w17,w5 eor v19.16b,v19.16b,v16.16b eor w19,w19,w6 rev32 v3.8h,v3.8h eor w20,w20,w7 rev32 v7.8h,v7.8h eor w21,w21,w8 rev32 v19.8h,v19.8h ror w17,w17,#16 add v2.4s,v2.4s,v3.4s ror w19,w19,#16 add v6.4s,v6.4s,v7.4s ror w20,w20,#16 add v18.4s,v18.4s,v19.4s ror w21,w21,#16 eor v20.16b,v1.16b,v2.16b add w13,w13,w17 eor v21.16b,v5.16b,v6.16b add w14,w14,w19 eor v22.16b,v17.16b,v18.16b add w15,w15,w20 ushr v1.4s,v20.4s,#20 add w16,w16,w21 ushr v5.4s,v21.4s,#20 eor w9,w9,w13 ushr v17.4s,v22.4s,#20 eor w10,w10,w14 sli v1.4s,v20.4s,#12 eor w11,w11,w15 sli v5.4s,v21.4s,#12 eor w12,w12,w16 sli v17.4s,v22.4s,#12 ror w9,w9,#20 add v0.4s,v0.4s,v1.4s ror w10,w10,#20 add v4.4s,v4.4s,v5.4s ror w11,w11,#20 add v16.4s,v16.4s,v17.4s ror w12,w12,#20 eor v20.16b,v3.16b,v0.16b add w5,w5,w9 eor v21.16b,v7.16b,v4.16b add w6,w6,w10 eor v22.16b,v19.16b,v16.16b add w7,w7,w11 ushr v3.4s,v20.4s,#24 add w8,w8,w12 ushr v7.4s,v21.4s,#24 eor w17,w17,w5 ushr v19.4s,v22.4s,#24 eor w19,w19,w6 sli v3.4s,v20.4s,#8 eor w20,w20,w7 sli v7.4s,v21.4s,#8 eor w21,w21,w8 sli v19.4s,v22.4s,#8 ror w17,w17,#24 add v2.4s,v2.4s,v3.4s ror w19,w19,#24 add v6.4s,v6.4s,v7.4s ror w20,w20,#24 add v18.4s,v18.4s,v19.4s ror w21,w21,#24 eor v20.16b,v1.16b,v2.16b add w13,w13,w17 eor v21.16b,v5.16b,v6.16b add w14,w14,w19 eor 
v22.16b,v17.16b,v18.16b add w15,w15,w20 ushr v1.4s,v20.4s,#25 add w16,w16,w21 ushr v5.4s,v21.4s,#25 eor w9,w9,w13 ushr v17.4s,v22.4s,#25 eor w10,w10,w14 sli v1.4s,v20.4s,#7 eor w11,w11,w15 sli v5.4s,v21.4s,#7 eor w12,w12,w16 sli v17.4s,v22.4s,#7 ror w9,w9,#25 ext v2.16b,v2.16b,v2.16b,#8 ror w10,w10,#25 ext v6.16b,v6.16b,v6.16b,#8 ror w11,w11,#25 ext v18.16b,v18.16b,v18.16b,#8 ror w12,w12,#25 ext v3.16b,v3.16b,v3.16b,#12 ext v7.16b,v7.16b,v7.16b,#12 ext v19.16b,v19.16b,v19.16b,#12 ext v1.16b,v1.16b,v1.16b,#4 ext v5.16b,v5.16b,v5.16b,#4 ext v17.16b,v17.16b,v17.16b,#4 add v0.4s,v0.4s,v1.4s add w5,w5,w10 add v4.4s,v4.4s,v5.4s add w6,w6,w11 add v16.4s,v16.4s,v17.4s add w7,w7,w12 eor v3.16b,v3.16b,v0.16b add w8,w8,w9 eor v7.16b,v7.16b,v4.16b eor w21,w21,w5 eor v19.16b,v19.16b,v16.16b eor w17,w17,w6 rev32 v3.8h,v3.8h eor w19,w19,w7 rev32 v7.8h,v7.8h eor w20,w20,w8 rev32 v19.8h,v19.8h ror w21,w21,#16 add v2.4s,v2.4s,v3.4s ror w17,w17,#16 add v6.4s,v6.4s,v7.4s ror w19,w19,#16 add v18.4s,v18.4s,v19.4s ror w20,w20,#16 eor v20.16b,v1.16b,v2.16b add w15,w15,w21 eor v21.16b,v5.16b,v6.16b add w16,w16,w17 eor v22.16b,v17.16b,v18.16b add w13,w13,w19 ushr v1.4s,v20.4s,#20 add w14,w14,w20 ushr v5.4s,v21.4s,#20 eor w10,w10,w15 ushr v17.4s,v22.4s,#20 eor w11,w11,w16 sli v1.4s,v20.4s,#12 eor w12,w12,w13 sli v5.4s,v21.4s,#12 eor w9,w9,w14 sli v17.4s,v22.4s,#12 ror w10,w10,#20 add v0.4s,v0.4s,v1.4s ror w11,w11,#20 add v4.4s,v4.4s,v5.4s ror w12,w12,#20 add v16.4s,v16.4s,v17.4s ror w9,w9,#20 eor v20.16b,v3.16b,v0.16b add w5,w5,w10 eor v21.16b,v7.16b,v4.16b add w6,w6,w11 eor v22.16b,v19.16b,v16.16b add w7,w7,w12 ushr v3.4s,v20.4s,#24 add w8,w8,w9 ushr v7.4s,v21.4s,#24 eor w21,w21,w5 ushr v19.4s,v22.4s,#24 eor w17,w17,w6 sli v3.4s,v20.4s,#8 eor w19,w19,w7 sli v7.4s,v21.4s,#8 eor w20,w20,w8 sli v19.4s,v22.4s,#8 ror w21,w21,#24 add v2.4s,v2.4s,v3.4s ror w17,w17,#24 add v6.4s,v6.4s,v7.4s ror w19,w19,#24 add v18.4s,v18.4s,v19.4s ror w20,w20,#24 eor v20.16b,v1.16b,v2.16b add w15,w15,w21 eor v21.16b,v5.16b,v6.16b add w16,w16,w17 eor v22.16b,v17.16b,v18.16b add w13,w13,w19 ushr v1.4s,v20.4s,#25 add w14,w14,w20 ushr v5.4s,v21.4s,#25 eor w10,w10,w15 ushr v17.4s,v22.4s,#25 eor w11,w11,w16 sli v1.4s,v20.4s,#7 eor w12,w12,w13 sli v5.4s,v21.4s,#7 eor w9,w9,w14 sli v17.4s,v22.4s,#7 ror w10,w10,#25 ext v2.16b,v2.16b,v2.16b,#8 ror w11,w11,#25 ext v6.16b,v6.16b,v6.16b,#8 ror w12,w12,#25 ext v18.16b,v18.16b,v18.16b,#8 ror w9,w9,#25 ext v3.16b,v3.16b,v3.16b,#4 ext v7.16b,v7.16b,v7.16b,#4 ext v19.16b,v19.16b,v19.16b,#4 ext v1.16b,v1.16b,v1.16b,#12 ext v5.16b,v5.16b,v5.16b,#12 ext v17.16b,v17.16b,v17.16b,#12 cbnz x4,Loop_neon add w5,w5,w22 // accumulate key block add v0.4s,v0.4s,v24.4s add x6,x6,x22,lsr#32 add v4.4s,v4.4s,v24.4s add w7,w7,w23 add v16.4s,v16.4s,v24.4s add x8,x8,x23,lsr#32 add v2.4s,v2.4s,v26.4s add w9,w9,w24 add v6.4s,v6.4s,v26.4s add x10,x10,x24,lsr#32 add v18.4s,v18.4s,v26.4s add w11,w11,w25 add v3.4s,v3.4s,v27.4s add x12,x12,x25,lsr#32 add w13,w13,w26 add v7.4s,v7.4s,v28.4s add x14,x14,x26,lsr#32 add w15,w15,w27 add v19.4s,v19.4s,v29.4s add x16,x16,x27,lsr#32 add w17,w17,w28 add v1.4s,v1.4s,v25.4s add x19,x19,x28,lsr#32 add w20,w20,w30 add v5.4s,v5.4s,v25.4s add x21,x21,x30,lsr#32 add v17.4s,v17.4s,v25.4s b.lo Ltail_neon add x5,x5,x6,lsl#32 // pack add x7,x7,x8,lsl#32 ldp x6,x8,[x1,#0] // load input add x9,x9,x10,lsl#32 add x11,x11,x12,lsl#32 ldp x10,x12,[x1,#16] add x13,x13,x14,lsl#32 add x15,x15,x16,lsl#32 ldp x14,x16,[x1,#32] add x17,x17,x19,lsl#32 add x20,x20,x21,lsl#32 ldp x19,x21,[x1,#48] add x1,x1,#64 #ifdef 
__AARCH64EB__ rev x5,x5 rev x7,x7 rev x9,x9 rev x11,x11 rev x13,x13 rev x15,x15 rev x17,x17 rev x20,x20 #endif ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 eor x5,x5,x6 eor x7,x7,x8 eor x9,x9,x10 eor x11,x11,x12 eor x13,x13,x14 eor v0.16b,v0.16b,v20.16b eor x15,x15,x16 eor v1.16b,v1.16b,v21.16b eor x17,x17,x19 eor v2.16b,v2.16b,v22.16b eor x20,x20,x21 eor v3.16b,v3.16b,v23.16b ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 stp x5,x7,[x0,#0] // store output add x28,x28,#4 // increment counter stp x9,x11,[x0,#16] add v27.4s,v27.4s,v31.4s // += 4 stp x13,x15,[x0,#32] add v28.4s,v28.4s,v31.4s stp x17,x20,[x0,#48] add v29.4s,v29.4s,v31.4s add x0,x0,#64 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 eor v4.16b,v4.16b,v20.16b eor v5.16b,v5.16b,v21.16b eor v6.16b,v6.16b,v22.16b eor v7.16b,v7.16b,v23.16b st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 eor v16.16b,v16.16b,v0.16b eor v17.16b,v17.16b,v1.16b eor v18.16b,v18.16b,v2.16b eor v19.16b,v19.16b,v3.16b st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 b.hi Loop_outer_neon ldp x19,x20,[x29,#16] add sp,sp,#64 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 AARCH64_VALIDATE_LINK_REGISTER ret Ltail_neon: add x2,x2,#256 cmp x2,#64 b.lo Less_than_64 add x5,x5,x6,lsl#32 // pack add x7,x7,x8,lsl#32 ldp x6,x8,[x1,#0] // load input add x9,x9,x10,lsl#32 add x11,x11,x12,lsl#32 ldp x10,x12,[x1,#16] add x13,x13,x14,lsl#32 add x15,x15,x16,lsl#32 ldp x14,x16,[x1,#32] add x17,x17,x19,lsl#32 add x20,x20,x21,lsl#32 ldp x19,x21,[x1,#48] add x1,x1,#64 #ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 rev x9,x9 rev x11,x11 rev x13,x13 rev x15,x15 rev x17,x17 rev x20,x20 #endif eor x5,x5,x6 eor x7,x7,x8 eor x9,x9,x10 eor x11,x11,x12 eor x13,x13,x14 eor x15,x15,x16 eor x17,x17,x19 eor x20,x20,x21 stp x5,x7,[x0,#0] // store output add x28,x28,#4 // increment counter stp x9,x11,[x0,#16] stp x13,x15,[x0,#32] stp x17,x20,[x0,#48] add x0,x0,#64 b.eq Ldone_neon sub x2,x2,#64 cmp x2,#64 b.lo Less_than_128 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 eor v0.16b,v0.16b,v20.16b eor v1.16b,v1.16b,v21.16b eor v2.16b,v2.16b,v22.16b eor v3.16b,v3.16b,v23.16b st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 b.eq Ldone_neon sub x2,x2,#64 cmp x2,#64 b.lo Less_than_192 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 eor v4.16b,v4.16b,v20.16b eor v5.16b,v5.16b,v21.16b eor v6.16b,v6.16b,v22.16b eor v7.16b,v7.16b,v23.16b st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 b.eq Ldone_neon sub x2,x2,#64 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] b Last_neon Less_than_128: st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] b Last_neon Less_than_192: st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] b Last_neon .align 4 Last_neon: sub x0,x0,#1 add x1,x1,x2 add x0,x0,x2 add x4,sp,x2 neg x2,x2 Loop_tail_neon: ldrb w10,[x1,x2] ldrb w11,[x4,x2] add x2,x2,#1 eor w10,w10,w11 strb w10,[x0,x2] cbnz x2,Loop_tail_neon stp xzr,xzr,[sp,#0] stp xzr,xzr,[sp,#16] stp xzr,xzr,[sp,#32] stp xzr,xzr,[sp,#48] Ldone_neon: ldp x19,x20,[x29,#16] add sp,sp,#64 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 AARCH64_VALIDATE_LINK_REGISTER ret .align 5 ChaCha20_512_neon: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-96]! 
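// Wide code path for long inputs: ChaCha20_ctr32_neon branches to the L512_or_more_neon label below once at least 512 bytes of input remain.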
add x29,sp,#0 adrp x5,Lsigma@PAGE add x5,x5,Lsigma@PAGEOFF stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] L512_or_more_neon: sub sp,sp,#128+64 ldp x22,x23,[x5] // load sigma ld1 {v24.4s},[x5],#16 ldp x24,x25,[x3] // load key ldp x26,x27,[x3,#16] ld1 {v25.4s,v26.4s},[x3] ldp x28,x30,[x4] // load counter ld1 {v27.4s},[x4] ld1 {v31.4s},[x5] #ifdef __AARCH64EB__ rev64 v24.4s,v24.4s ror x24,x24,#32 ror x25,x25,#32 ror x26,x26,#32 ror x27,x27,#32 ror x28,x28,#32 ror x30,x30,#32 #endif add v27.4s,v27.4s,v31.4s // += 1 stp q24,q25,[sp,#0] // off-load key block, invariant part add v27.4s,v27.4s,v31.4s // not typo str q26,[sp,#32] add v28.4s,v27.4s,v31.4s add v29.4s,v28.4s,v31.4s add v30.4s,v29.4s,v31.4s shl v31.4s,v31.4s,#2 // 1 -> 4 stp d8,d9,[sp,#128+0] // meet ABI requirements stp d10,d11,[sp,#128+16] stp d12,d13,[sp,#128+32] stp d14,d15,[sp,#128+48] sub x2,x2,#512 // not typo Loop_outer_512_neon: mov v0.16b,v24.16b mov v4.16b,v24.16b mov v8.16b,v24.16b mov v12.16b,v24.16b mov v16.16b,v24.16b mov v20.16b,v24.16b mov v1.16b,v25.16b mov w5,w22 // unpack key block mov v5.16b,v25.16b lsr x6,x22,#32 mov v9.16b,v25.16b mov w7,w23 mov v13.16b,v25.16b lsr x8,x23,#32 mov v17.16b,v25.16b mov w9,w24 mov v21.16b,v25.16b lsr x10,x24,#32 mov v3.16b,v27.16b mov w11,w25 mov v7.16b,v28.16b lsr x12,x25,#32 mov v11.16b,v29.16b mov w13,w26 mov v15.16b,v30.16b lsr x14,x26,#32 mov v2.16b,v26.16b mov w15,w27 mov v6.16b,v26.16b lsr x16,x27,#32 add v19.4s,v3.4s,v31.4s // +4 mov w17,w28 add v23.4s,v7.4s,v31.4s // +4 lsr x19,x28,#32 mov v10.16b,v26.16b mov w20,w30 mov v14.16b,v26.16b lsr x21,x30,#32 mov v18.16b,v26.16b stp q27,q28,[sp,#48] // off-load key block, variable part mov v22.16b,v26.16b str q29,[sp,#80] mov x4,#5 subs x2,x2,#512 Loop_upper_neon: sub x4,x4,#1 add v0.4s,v0.4s,v1.4s add w5,w5,w9 add v4.4s,v4.4s,v5.4s add w6,w6,w10 add v8.4s,v8.4s,v9.4s add w7,w7,w11 add v12.4s,v12.4s,v13.4s add w8,w8,w12 add v16.4s,v16.4s,v17.4s eor w17,w17,w5 add v20.4s,v20.4s,v21.4s eor w19,w19,w6 eor v3.16b,v3.16b,v0.16b eor w20,w20,w7 eor v7.16b,v7.16b,v4.16b eor w21,w21,w8 eor v11.16b,v11.16b,v8.16b ror w17,w17,#16 eor v15.16b,v15.16b,v12.16b ror w19,w19,#16 eor v19.16b,v19.16b,v16.16b ror w20,w20,#16 eor v23.16b,v23.16b,v20.16b ror w21,w21,#16 rev32 v3.8h,v3.8h add w13,w13,w17 rev32 v7.8h,v7.8h add w14,w14,w19 rev32 v11.8h,v11.8h add w15,w15,w20 rev32 v15.8h,v15.8h add w16,w16,w21 rev32 v19.8h,v19.8h eor w9,w9,w13 rev32 v23.8h,v23.8h eor w10,w10,w14 add v2.4s,v2.4s,v3.4s eor w11,w11,w15 add v6.4s,v6.4s,v7.4s eor w12,w12,w16 add v10.4s,v10.4s,v11.4s ror w9,w9,#20 add v14.4s,v14.4s,v15.4s ror w10,w10,#20 add v18.4s,v18.4s,v19.4s ror w11,w11,#20 add v22.4s,v22.4s,v23.4s ror w12,w12,#20 eor v24.16b,v1.16b,v2.16b add w5,w5,w9 eor v25.16b,v5.16b,v6.16b add w6,w6,w10 eor v26.16b,v9.16b,v10.16b add w7,w7,w11 eor v27.16b,v13.16b,v14.16b add w8,w8,w12 eor v28.16b,v17.16b,v18.16b eor w17,w17,w5 eor v29.16b,v21.16b,v22.16b eor w19,w19,w6 ushr v1.4s,v24.4s,#20 eor w20,w20,w7 ushr v5.4s,v25.4s,#20 eor w21,w21,w8 ushr v9.4s,v26.4s,#20 ror w17,w17,#24 ushr v13.4s,v27.4s,#20 ror w19,w19,#24 ushr v17.4s,v28.4s,#20 ror w20,w20,#24 ushr v21.4s,v29.4s,#20 ror w21,w21,#24 sli v1.4s,v24.4s,#12 add w13,w13,w17 sli v5.4s,v25.4s,#12 add w14,w14,w19 sli v9.4s,v26.4s,#12 add w15,w15,w20 sli v13.4s,v27.4s,#12 add w16,w16,w21 sli v17.4s,v28.4s,#12 eor w9,w9,w13 sli v21.4s,v29.4s,#12 eor w10,w10,w14 add v0.4s,v0.4s,v1.4s eor w11,w11,w15 add v4.4s,v4.4s,v5.4s eor w12,w12,w16 add v8.4s,v8.4s,v9.4s ror 
w9,w9,#25 add v12.4s,v12.4s,v13.4s ror w10,w10,#25 add v16.4s,v16.4s,v17.4s ror w11,w11,#25 add v20.4s,v20.4s,v21.4s ror w12,w12,#25 eor v24.16b,v3.16b,v0.16b add w5,w5,w10 eor v25.16b,v7.16b,v4.16b add w6,w6,w11 eor v26.16b,v11.16b,v8.16b add w7,w7,w12 eor v27.16b,v15.16b,v12.16b add w8,w8,w9 eor v28.16b,v19.16b,v16.16b eor w21,w21,w5 eor v29.16b,v23.16b,v20.16b eor w17,w17,w6 ushr v3.4s,v24.4s,#24 eor w19,w19,w7 ushr v7.4s,v25.4s,#24 eor w20,w20,w8 ushr v11.4s,v26.4s,#24 ror w21,w21,#16 ushr v15.4s,v27.4s,#24 ror w17,w17,#16 ushr v19.4s,v28.4s,#24 ror w19,w19,#16 ushr v23.4s,v29.4s,#24 ror w20,w20,#16 sli v3.4s,v24.4s,#8 add w15,w15,w21 sli v7.4s,v25.4s,#8 add w16,w16,w17 sli v11.4s,v26.4s,#8 add w13,w13,w19 sli v15.4s,v27.4s,#8 add w14,w14,w20 sli v19.4s,v28.4s,#8 eor w10,w10,w15 sli v23.4s,v29.4s,#8 eor w11,w11,w16 add v2.4s,v2.4s,v3.4s eor w12,w12,w13 add v6.4s,v6.4s,v7.4s eor w9,w9,w14 add v10.4s,v10.4s,v11.4s ror w10,w10,#20 add v14.4s,v14.4s,v15.4s ror w11,w11,#20 add v18.4s,v18.4s,v19.4s ror w12,w12,#20 add v22.4s,v22.4s,v23.4s ror w9,w9,#20 eor v24.16b,v1.16b,v2.16b add w5,w5,w10 eor v25.16b,v5.16b,v6.16b add w6,w6,w11 eor v26.16b,v9.16b,v10.16b add w7,w7,w12 eor v27.16b,v13.16b,v14.16b add w8,w8,w9 eor v28.16b,v17.16b,v18.16b eor w21,w21,w5 eor v29.16b,v21.16b,v22.16b eor w17,w17,w6 ushr v1.4s,v24.4s,#25 eor w19,w19,w7 ushr v5.4s,v25.4s,#25 eor w20,w20,w8 ushr v9.4s,v26.4s,#25 ror w21,w21,#24 ushr v13.4s,v27.4s,#25 ror w17,w17,#24 ushr v17.4s,v28.4s,#25 ror w19,w19,#24 ushr v21.4s,v29.4s,#25 ror w20,w20,#24 sli v1.4s,v24.4s,#7 add w15,w15,w21 sli v5.4s,v25.4s,#7 add w16,w16,w17 sli v9.4s,v26.4s,#7 add w13,w13,w19 sli v13.4s,v27.4s,#7 add w14,w14,w20 sli v17.4s,v28.4s,#7 eor w10,w10,w15 sli v21.4s,v29.4s,#7 eor w11,w11,w16 ext v2.16b,v2.16b,v2.16b,#8 eor w12,w12,w13 ext v6.16b,v6.16b,v6.16b,#8 eor w9,w9,w14 ext v10.16b,v10.16b,v10.16b,#8 ror w10,w10,#25 ext v14.16b,v14.16b,v14.16b,#8 ror w11,w11,#25 ext v18.16b,v18.16b,v18.16b,#8 ror w12,w12,#25 ext v22.16b,v22.16b,v22.16b,#8 ror w9,w9,#25 ext v3.16b,v3.16b,v3.16b,#12 ext v7.16b,v7.16b,v7.16b,#12 ext v11.16b,v11.16b,v11.16b,#12 ext v15.16b,v15.16b,v15.16b,#12 ext v19.16b,v19.16b,v19.16b,#12 ext v23.16b,v23.16b,v23.16b,#12 ext v1.16b,v1.16b,v1.16b,#4 ext v5.16b,v5.16b,v5.16b,#4 ext v9.16b,v9.16b,v9.16b,#4 ext v13.16b,v13.16b,v13.16b,#4 ext v17.16b,v17.16b,v17.16b,#4 ext v21.16b,v21.16b,v21.16b,#4 add v0.4s,v0.4s,v1.4s add w5,w5,w9 add v4.4s,v4.4s,v5.4s add w6,w6,w10 add v8.4s,v8.4s,v9.4s add w7,w7,w11 add v12.4s,v12.4s,v13.4s add w8,w8,w12 add v16.4s,v16.4s,v17.4s eor w17,w17,w5 add v20.4s,v20.4s,v21.4s eor w19,w19,w6 eor v3.16b,v3.16b,v0.16b eor w20,w20,w7 eor v7.16b,v7.16b,v4.16b eor w21,w21,w8 eor v11.16b,v11.16b,v8.16b ror w17,w17,#16 eor v15.16b,v15.16b,v12.16b ror w19,w19,#16 eor v19.16b,v19.16b,v16.16b ror w20,w20,#16 eor v23.16b,v23.16b,v20.16b ror w21,w21,#16 rev32 v3.8h,v3.8h add w13,w13,w17 rev32 v7.8h,v7.8h add w14,w14,w19 rev32 v11.8h,v11.8h add w15,w15,w20 rev32 v15.8h,v15.8h add w16,w16,w21 rev32 v19.8h,v19.8h eor w9,w9,w13 rev32 v23.8h,v23.8h eor w10,w10,w14 add v2.4s,v2.4s,v3.4s eor w11,w11,w15 add v6.4s,v6.4s,v7.4s eor w12,w12,w16 add v10.4s,v10.4s,v11.4s ror w9,w9,#20 add v14.4s,v14.4s,v15.4s ror w10,w10,#20 add v18.4s,v18.4s,v19.4s ror w11,w11,#20 add v22.4s,v22.4s,v23.4s ror w12,w12,#20 eor v24.16b,v1.16b,v2.16b add w5,w5,w9 eor v25.16b,v5.16b,v6.16b add w6,w6,w10 eor v26.16b,v9.16b,v10.16b add w7,w7,w11 eor v27.16b,v13.16b,v14.16b add w8,w8,w12 eor v28.16b,v17.16b,v18.16b eor w17,w17,w5 eor 
v29.16b,v21.16b,v22.16b eor w19,w19,w6 ushr v1.4s,v24.4s,#20 eor w20,w20,w7 ushr v5.4s,v25.4s,#20 eor w21,w21,w8 ushr v9.4s,v26.4s,#20 ror w17,w17,#24 ushr v13.4s,v27.4s,#20 ror w19,w19,#24 ushr v17.4s,v28.4s,#20 ror w20,w20,#24 ushr v21.4s,v29.4s,#20 ror w21,w21,#24 sli v1.4s,v24.4s,#12 add w13,w13,w17 sli v5.4s,v25.4s,#12 add w14,w14,w19 sli v9.4s,v26.4s,#12 add w15,w15,w20 sli v13.4s,v27.4s,#12 add w16,w16,w21 sli v17.4s,v28.4s,#12 eor w9,w9,w13 sli v21.4s,v29.4s,#12 eor w10,w10,w14 add v0.4s,v0.4s,v1.4s eor w11,w11,w15 add v4.4s,v4.4s,v5.4s eor w12,w12,w16 add v8.4s,v8.4s,v9.4s ror w9,w9,#25 add v12.4s,v12.4s,v13.4s ror w10,w10,#25 add v16.4s,v16.4s,v17.4s ror w11,w11,#25 add v20.4s,v20.4s,v21.4s ror w12,w12,#25 eor v24.16b,v3.16b,v0.16b add w5,w5,w10 eor v25.16b,v7.16b,v4.16b add w6,w6,w11 eor v26.16b,v11.16b,v8.16b add w7,w7,w12 eor v27.16b,v15.16b,v12.16b add w8,w8,w9 eor v28.16b,v19.16b,v16.16b eor w21,w21,w5 eor v29.16b,v23.16b,v20.16b eor w17,w17,w6 ushr v3.4s,v24.4s,#24 eor w19,w19,w7 ushr v7.4s,v25.4s,#24 eor w20,w20,w8 ushr v11.4s,v26.4s,#24 ror w21,w21,#16 ushr v15.4s,v27.4s,#24 ror w17,w17,#16 ushr v19.4s,v28.4s,#24 ror w19,w19,#16 ushr v23.4s,v29.4s,#24 ror w20,w20,#16 sli v3.4s,v24.4s,#8 add w15,w15,w21 sli v7.4s,v25.4s,#8 add w16,w16,w17 sli v11.4s,v26.4s,#8 add w13,w13,w19 sli v15.4s,v27.4s,#8 add w14,w14,w20 sli v19.4s,v28.4s,#8 eor w10,w10,w15 sli v23.4s,v29.4s,#8 eor w11,w11,w16 add v2.4s,v2.4s,v3.4s eor w12,w12,w13 add v6.4s,v6.4s,v7.4s eor w9,w9,w14 add v10.4s,v10.4s,v11.4s ror w10,w10,#20 add v14.4s,v14.4s,v15.4s ror w11,w11,#20 add v18.4s,v18.4s,v19.4s ror w12,w12,#20 add v22.4s,v22.4s,v23.4s ror w9,w9,#20 eor v24.16b,v1.16b,v2.16b add w5,w5,w10 eor v25.16b,v5.16b,v6.16b add w6,w6,w11 eor v26.16b,v9.16b,v10.16b add w7,w7,w12 eor v27.16b,v13.16b,v14.16b add w8,w8,w9 eor v28.16b,v17.16b,v18.16b eor w21,w21,w5 eor v29.16b,v21.16b,v22.16b eor w17,w17,w6 ushr v1.4s,v24.4s,#25 eor w19,w19,w7 ushr v5.4s,v25.4s,#25 eor w20,w20,w8 ushr v9.4s,v26.4s,#25 ror w21,w21,#24 ushr v13.4s,v27.4s,#25 ror w17,w17,#24 ushr v17.4s,v28.4s,#25 ror w19,w19,#24 ushr v21.4s,v29.4s,#25 ror w20,w20,#24 sli v1.4s,v24.4s,#7 add w15,w15,w21 sli v5.4s,v25.4s,#7 add w16,w16,w17 sli v9.4s,v26.4s,#7 add w13,w13,w19 sli v13.4s,v27.4s,#7 add w14,w14,w20 sli v17.4s,v28.4s,#7 eor w10,w10,w15 sli v21.4s,v29.4s,#7 eor w11,w11,w16 ext v2.16b,v2.16b,v2.16b,#8 eor w12,w12,w13 ext v6.16b,v6.16b,v6.16b,#8 eor w9,w9,w14 ext v10.16b,v10.16b,v10.16b,#8 ror w10,w10,#25 ext v14.16b,v14.16b,v14.16b,#8 ror w11,w11,#25 ext v18.16b,v18.16b,v18.16b,#8 ror w12,w12,#25 ext v22.16b,v22.16b,v22.16b,#8 ror w9,w9,#25 ext v3.16b,v3.16b,v3.16b,#4 ext v7.16b,v7.16b,v7.16b,#4 ext v11.16b,v11.16b,v11.16b,#4 ext v15.16b,v15.16b,v15.16b,#4 ext v19.16b,v19.16b,v19.16b,#4 ext v23.16b,v23.16b,v23.16b,#4 ext v1.16b,v1.16b,v1.16b,#12 ext v5.16b,v5.16b,v5.16b,#12 ext v9.16b,v9.16b,v9.16b,#12 ext v13.16b,v13.16b,v13.16b,#12 ext v17.16b,v17.16b,v17.16b,#12 ext v21.16b,v21.16b,v21.16b,#12 cbnz x4,Loop_upper_neon add w5,w5,w22 // accumulate key block add x6,x6,x22,lsr#32 add w7,w7,w23 add x8,x8,x23,lsr#32 add w9,w9,w24 add x10,x10,x24,lsr#32 add w11,w11,w25 add x12,x12,x25,lsr#32 add w13,w13,w26 add x14,x14,x26,lsr#32 add w15,w15,w27 add x16,x16,x27,lsr#32 add w17,w17,w28 add x19,x19,x28,lsr#32 add w20,w20,w30 add x21,x21,x30,lsr#32 add x5,x5,x6,lsl#32 // pack add x7,x7,x8,lsl#32 ldp x6,x8,[x1,#0] // load input add x9,x9,x10,lsl#32 add x11,x11,x12,lsl#32 ldp x10,x12,[x1,#16] add x13,x13,x14,lsl#32 add x15,x15,x16,lsl#32 ldp x14,x16,[x1,#32] 
add x17,x17,x19,lsl#32 add x20,x20,x21,lsl#32 ldp x19,x21,[x1,#48] add x1,x1,#64 #ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 rev x9,x9 rev x11,x11 rev x13,x13 rev x15,x15 rev x17,x17 rev x20,x20 #endif eor x5,x5,x6 eor x7,x7,x8 eor x9,x9,x10 eor x11,x11,x12 eor x13,x13,x14 eor x15,x15,x16 eor x17,x17,x19 eor x20,x20,x21 stp x5,x7,[x0,#0] // store output add x28,x28,#1 // increment counter mov w5,w22 // unpack key block lsr x6,x22,#32 stp x9,x11,[x0,#16] mov w7,w23 lsr x8,x23,#32 stp x13,x15,[x0,#32] mov w9,w24 lsr x10,x24,#32 stp x17,x20,[x0,#48] add x0,x0,#64 mov w11,w25 lsr x12,x25,#32 mov w13,w26 lsr x14,x26,#32 mov w15,w27 lsr x16,x27,#32 mov w17,w28 lsr x19,x28,#32 mov w20,w30 lsr x21,x30,#32 mov x4,#5 Loop_lower_neon: sub x4,x4,#1 add v0.4s,v0.4s,v1.4s add w5,w5,w9 add v4.4s,v4.4s,v5.4s add w6,w6,w10 add v8.4s,v8.4s,v9.4s add w7,w7,w11 add v12.4s,v12.4s,v13.4s add w8,w8,w12 add v16.4s,v16.4s,v17.4s eor w17,w17,w5 add v20.4s,v20.4s,v21.4s eor w19,w19,w6 eor v3.16b,v3.16b,v0.16b eor w20,w20,w7 eor v7.16b,v7.16b,v4.16b eor w21,w21,w8 eor v11.16b,v11.16b,v8.16b ror w17,w17,#16 eor v15.16b,v15.16b,v12.16b ror w19,w19,#16 eor v19.16b,v19.16b,v16.16b ror w20,w20,#16 eor v23.16b,v23.16b,v20.16b ror w21,w21,#16 rev32 v3.8h,v3.8h add w13,w13,w17 rev32 v7.8h,v7.8h add w14,w14,w19 rev32 v11.8h,v11.8h add w15,w15,w20 rev32 v15.8h,v15.8h add w16,w16,w21 rev32 v19.8h,v19.8h eor w9,w9,w13 rev32 v23.8h,v23.8h eor w10,w10,w14 add v2.4s,v2.4s,v3.4s eor w11,w11,w15 add v6.4s,v6.4s,v7.4s eor w12,w12,w16 add v10.4s,v10.4s,v11.4s ror w9,w9,#20 add v14.4s,v14.4s,v15.4s ror w10,w10,#20 add v18.4s,v18.4s,v19.4s ror w11,w11,#20 add v22.4s,v22.4s,v23.4s ror w12,w12,#20 eor v24.16b,v1.16b,v2.16b add w5,w5,w9 eor v25.16b,v5.16b,v6.16b add w6,w6,w10 eor v26.16b,v9.16b,v10.16b add w7,w7,w11 eor v27.16b,v13.16b,v14.16b add w8,w8,w12 eor v28.16b,v17.16b,v18.16b eor w17,w17,w5 eor v29.16b,v21.16b,v22.16b eor w19,w19,w6 ushr v1.4s,v24.4s,#20 eor w20,w20,w7 ushr v5.4s,v25.4s,#20 eor w21,w21,w8 ushr v9.4s,v26.4s,#20 ror w17,w17,#24 ushr v13.4s,v27.4s,#20 ror w19,w19,#24 ushr v17.4s,v28.4s,#20 ror w20,w20,#24 ushr v21.4s,v29.4s,#20 ror w21,w21,#24 sli v1.4s,v24.4s,#12 add w13,w13,w17 sli v5.4s,v25.4s,#12 add w14,w14,w19 sli v9.4s,v26.4s,#12 add w15,w15,w20 sli v13.4s,v27.4s,#12 add w16,w16,w21 sli v17.4s,v28.4s,#12 eor w9,w9,w13 sli v21.4s,v29.4s,#12 eor w10,w10,w14 add v0.4s,v0.4s,v1.4s eor w11,w11,w15 add v4.4s,v4.4s,v5.4s eor w12,w12,w16 add v8.4s,v8.4s,v9.4s ror w9,w9,#25 add v12.4s,v12.4s,v13.4s ror w10,w10,#25 add v16.4s,v16.4s,v17.4s ror w11,w11,#25 add v20.4s,v20.4s,v21.4s ror w12,w12,#25 eor v24.16b,v3.16b,v0.16b add w5,w5,w10 eor v25.16b,v7.16b,v4.16b add w6,w6,w11 eor v26.16b,v11.16b,v8.16b add w7,w7,w12 eor v27.16b,v15.16b,v12.16b add w8,w8,w9 eor v28.16b,v19.16b,v16.16b eor w21,w21,w5 eor v29.16b,v23.16b,v20.16b eor w17,w17,w6 ushr v3.4s,v24.4s,#24 eor w19,w19,w7 ushr v7.4s,v25.4s,#24 eor w20,w20,w8 ushr v11.4s,v26.4s,#24 ror w21,w21,#16 ushr v15.4s,v27.4s,#24 ror w17,w17,#16 ushr v19.4s,v28.4s,#24 ror w19,w19,#16 ushr v23.4s,v29.4s,#24 ror w20,w20,#16 sli v3.4s,v24.4s,#8 add w15,w15,w21 sli v7.4s,v25.4s,#8 add w16,w16,w17 sli v11.4s,v26.4s,#8 add w13,w13,w19 sli v15.4s,v27.4s,#8 add w14,w14,w20 sli v19.4s,v28.4s,#8 eor w10,w10,w15 sli v23.4s,v29.4s,#8 eor w11,w11,w16 add v2.4s,v2.4s,v3.4s eor w12,w12,w13 add v6.4s,v6.4s,v7.4s eor w9,w9,w14 add v10.4s,v10.4s,v11.4s ror w10,w10,#20 add v14.4s,v14.4s,v15.4s ror w11,w11,#20 add v18.4s,v18.4s,v19.4s ror w12,w12,#20 add v22.4s,v22.4s,v23.4s ror w9,w9,#20 eor 
v24.16b,v1.16b,v2.16b add w5,w5,w10 eor v25.16b,v5.16b,v6.16b add w6,w6,w11 eor v26.16b,v9.16b,v10.16b add w7,w7,w12 eor v27.16b,v13.16b,v14.16b add w8,w8,w9 eor v28.16b,v17.16b,v18.16b eor w21,w21,w5 eor v29.16b,v21.16b,v22.16b eor w17,w17,w6 ushr v1.4s,v24.4s,#25 eor w19,w19,w7 ushr v5.4s,v25.4s,#25 eor w20,w20,w8 ushr v9.4s,v26.4s,#25 ror w21,w21,#24 ushr v13.4s,v27.4s,#25 ror w17,w17,#24 ushr v17.4s,v28.4s,#25 ror w19,w19,#24 ushr v21.4s,v29.4s,#25 ror w20,w20,#24 sli v1.4s,v24.4s,#7 add w15,w15,w21 sli v5.4s,v25.4s,#7 add w16,w16,w17 sli v9.4s,v26.4s,#7 add w13,w13,w19 sli v13.4s,v27.4s,#7 add w14,w14,w20 sli v17.4s,v28.4s,#7 eor w10,w10,w15 sli v21.4s,v29.4s,#7 eor w11,w11,w16 ext v2.16b,v2.16b,v2.16b,#8 eor w12,w12,w13 ext v6.16b,v6.16b,v6.16b,#8 eor w9,w9,w14 ext v10.16b,v10.16b,v10.16b,#8 ror w10,w10,#25 ext v14.16b,v14.16b,v14.16b,#8 ror w11,w11,#25 ext v18.16b,v18.16b,v18.16b,#8 ror w12,w12,#25 ext v22.16b,v22.16b,v22.16b,#8 ror w9,w9,#25 ext v3.16b,v3.16b,v3.16b,#12 ext v7.16b,v7.16b,v7.16b,#12 ext v11.16b,v11.16b,v11.16b,#12 ext v15.16b,v15.16b,v15.16b,#12 ext v19.16b,v19.16b,v19.16b,#12 ext v23.16b,v23.16b,v23.16b,#12 ext v1.16b,v1.16b,v1.16b,#4 ext v5.16b,v5.16b,v5.16b,#4 ext v9.16b,v9.16b,v9.16b,#4 ext v13.16b,v13.16b,v13.16b,#4 ext v17.16b,v17.16b,v17.16b,#4 ext v21.16b,v21.16b,v21.16b,#4 add v0.4s,v0.4s,v1.4s add w5,w5,w9 add v4.4s,v4.4s,v5.4s add w6,w6,w10 add v8.4s,v8.4s,v9.4s add w7,w7,w11 add v12.4s,v12.4s,v13.4s add w8,w8,w12 add v16.4s,v16.4s,v17.4s eor w17,w17,w5 add v20.4s,v20.4s,v21.4s eor w19,w19,w6 eor v3.16b,v3.16b,v0.16b eor w20,w20,w7 eor v7.16b,v7.16b,v4.16b eor w21,w21,w8 eor v11.16b,v11.16b,v8.16b ror w17,w17,#16 eor v15.16b,v15.16b,v12.16b ror w19,w19,#16 eor v19.16b,v19.16b,v16.16b ror w20,w20,#16 eor v23.16b,v23.16b,v20.16b ror w21,w21,#16 rev32 v3.8h,v3.8h add w13,w13,w17 rev32 v7.8h,v7.8h add w14,w14,w19 rev32 v11.8h,v11.8h add w15,w15,w20 rev32 v15.8h,v15.8h add w16,w16,w21 rev32 v19.8h,v19.8h eor w9,w9,w13 rev32 v23.8h,v23.8h eor w10,w10,w14 add v2.4s,v2.4s,v3.4s eor w11,w11,w15 add v6.4s,v6.4s,v7.4s eor w12,w12,w16 add v10.4s,v10.4s,v11.4s ror w9,w9,#20 add v14.4s,v14.4s,v15.4s ror w10,w10,#20 add v18.4s,v18.4s,v19.4s ror w11,w11,#20 add v22.4s,v22.4s,v23.4s ror w12,w12,#20 eor v24.16b,v1.16b,v2.16b add w5,w5,w9 eor v25.16b,v5.16b,v6.16b add w6,w6,w10 eor v26.16b,v9.16b,v10.16b add w7,w7,w11 eor v27.16b,v13.16b,v14.16b add w8,w8,w12 eor v28.16b,v17.16b,v18.16b eor w17,w17,w5 eor v29.16b,v21.16b,v22.16b eor w19,w19,w6 ushr v1.4s,v24.4s,#20 eor w20,w20,w7 ushr v5.4s,v25.4s,#20 eor w21,w21,w8 ushr v9.4s,v26.4s,#20 ror w17,w17,#24 ushr v13.4s,v27.4s,#20 ror w19,w19,#24 ushr v17.4s,v28.4s,#20 ror w20,w20,#24 ushr v21.4s,v29.4s,#20 ror w21,w21,#24 sli v1.4s,v24.4s,#12 add w13,w13,w17 sli v5.4s,v25.4s,#12 add w14,w14,w19 sli v9.4s,v26.4s,#12 add w15,w15,w20 sli v13.4s,v27.4s,#12 add w16,w16,w21 sli v17.4s,v28.4s,#12 eor w9,w9,w13 sli v21.4s,v29.4s,#12 eor w10,w10,w14 add v0.4s,v0.4s,v1.4s eor w11,w11,w15 add v4.4s,v4.4s,v5.4s eor w12,w12,w16 add v8.4s,v8.4s,v9.4s ror w9,w9,#25 add v12.4s,v12.4s,v13.4s ror w10,w10,#25 add v16.4s,v16.4s,v17.4s ror w11,w11,#25 add v20.4s,v20.4s,v21.4s ror w12,w12,#25 eor v24.16b,v3.16b,v0.16b add w5,w5,w10 eor v25.16b,v7.16b,v4.16b add w6,w6,w11 eor v26.16b,v11.16b,v8.16b add w7,w7,w12 eor v27.16b,v15.16b,v12.16b add w8,w8,w9 eor v28.16b,v19.16b,v16.16b eor w21,w21,w5 eor v29.16b,v23.16b,v20.16b eor w17,w17,w6 ushr v3.4s,v24.4s,#24 eor w19,w19,w7 ushr v7.4s,v25.4s,#24 eor w20,w20,w8 ushr v11.4s,v26.4s,#24 ror w21,w21,#16 
ushr v15.4s,v27.4s,#24 ror w17,w17,#16 ushr v19.4s,v28.4s,#24 ror w19,w19,#16 ushr v23.4s,v29.4s,#24 ror w20,w20,#16 sli v3.4s,v24.4s,#8 add w15,w15,w21 sli v7.4s,v25.4s,#8 add w16,w16,w17 sli v11.4s,v26.4s,#8 add w13,w13,w19 sli v15.4s,v27.4s,#8 add w14,w14,w20 sli v19.4s,v28.4s,#8 eor w10,w10,w15 sli v23.4s,v29.4s,#8 eor w11,w11,w16 add v2.4s,v2.4s,v3.4s eor w12,w12,w13 add v6.4s,v6.4s,v7.4s eor w9,w9,w14 add v10.4s,v10.4s,v11.4s ror w10,w10,#20 add v14.4s,v14.4s,v15.4s ror w11,w11,#20 add v18.4s,v18.4s,v19.4s ror w12,w12,#20 add v22.4s,v22.4s,v23.4s ror w9,w9,#20 eor v24.16b,v1.16b,v2.16b add w5,w5,w10 eor v25.16b,v5.16b,v6.16b add w6,w6,w11 eor v26.16b,v9.16b,v10.16b add w7,w7,w12 eor v27.16b,v13.16b,v14.16b add w8,w8,w9 eor v28.16b,v17.16b,v18.16b eor w21,w21,w5 eor v29.16b,v21.16b,v22.16b eor w17,w17,w6 ushr v1.4s,v24.4s,#25 eor w19,w19,w7 ushr v5.4s,v25.4s,#25 eor w20,w20,w8 ushr v9.4s,v26.4s,#25 ror w21,w21,#24 ushr v13.4s,v27.4s,#25 ror w17,w17,#24 ushr v17.4s,v28.4s,#25 ror w19,w19,#24 ushr v21.4s,v29.4s,#25 ror w20,w20,#24 sli v1.4s,v24.4s,#7 add w15,w15,w21 sli v5.4s,v25.4s,#7 add w16,w16,w17 sli v9.4s,v26.4s,#7 add w13,w13,w19 sli v13.4s,v27.4s,#7 add w14,w14,w20 sli v17.4s,v28.4s,#7 eor w10,w10,w15 sli v21.4s,v29.4s,#7 eor w11,w11,w16 ext v2.16b,v2.16b,v2.16b,#8 eor w12,w12,w13 ext v6.16b,v6.16b,v6.16b,#8 eor w9,w9,w14 ext v10.16b,v10.16b,v10.16b,#8 ror w10,w10,#25 ext v14.16b,v14.16b,v14.16b,#8 ror w11,w11,#25 ext v18.16b,v18.16b,v18.16b,#8 ror w12,w12,#25 ext v22.16b,v22.16b,v22.16b,#8 ror w9,w9,#25 ext v3.16b,v3.16b,v3.16b,#4 ext v7.16b,v7.16b,v7.16b,#4 ext v11.16b,v11.16b,v11.16b,#4 ext v15.16b,v15.16b,v15.16b,#4 ext v19.16b,v19.16b,v19.16b,#4 ext v23.16b,v23.16b,v23.16b,#4 ext v1.16b,v1.16b,v1.16b,#12 ext v5.16b,v5.16b,v5.16b,#12 ext v9.16b,v9.16b,v9.16b,#12 ext v13.16b,v13.16b,v13.16b,#12 ext v17.16b,v17.16b,v17.16b,#12 ext v21.16b,v21.16b,v21.16b,#12 cbnz x4,Loop_lower_neon add w5,w5,w22 // accumulate key block ldp q24,q25,[sp,#0] add x6,x6,x22,lsr#32 ldp q26,q27,[sp,#32] add w7,w7,w23 ldp q28,q29,[sp,#64] add x8,x8,x23,lsr#32 add v0.4s,v0.4s,v24.4s add w9,w9,w24 add v4.4s,v4.4s,v24.4s add x10,x10,x24,lsr#32 add v8.4s,v8.4s,v24.4s add w11,w11,w25 add v12.4s,v12.4s,v24.4s add x12,x12,x25,lsr#32 add v16.4s,v16.4s,v24.4s add w13,w13,w26 add v20.4s,v20.4s,v24.4s add x14,x14,x26,lsr#32 add v2.4s,v2.4s,v26.4s add w15,w15,w27 add v6.4s,v6.4s,v26.4s add x16,x16,x27,lsr#32 add v10.4s,v10.4s,v26.4s add w17,w17,w28 add v14.4s,v14.4s,v26.4s add x19,x19,x28,lsr#32 add v18.4s,v18.4s,v26.4s add w20,w20,w30 add v22.4s,v22.4s,v26.4s add x21,x21,x30,lsr#32 add v19.4s,v19.4s,v31.4s // +4 add x5,x5,x6,lsl#32 // pack add v23.4s,v23.4s,v31.4s // +4 add x7,x7,x8,lsl#32 add v3.4s,v3.4s,v27.4s ldp x6,x8,[x1,#0] // load input add v7.4s,v7.4s,v28.4s add x9,x9,x10,lsl#32 add v11.4s,v11.4s,v29.4s add x11,x11,x12,lsl#32 add v15.4s,v15.4s,v30.4s ldp x10,x12,[x1,#16] add v19.4s,v19.4s,v27.4s add x13,x13,x14,lsl#32 add v23.4s,v23.4s,v28.4s add x15,x15,x16,lsl#32 add v1.4s,v1.4s,v25.4s ldp x14,x16,[x1,#32] add v5.4s,v5.4s,v25.4s add x17,x17,x19,lsl#32 add v9.4s,v9.4s,v25.4s add x20,x20,x21,lsl#32 add v13.4s,v13.4s,v25.4s ldp x19,x21,[x1,#48] add v17.4s,v17.4s,v25.4s add x1,x1,#64 add v21.4s,v21.4s,v25.4s #ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 rev x9,x9 rev x11,x11 rev x13,x13 rev x15,x15 rev x17,x17 rev x20,x20 #endif ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 eor x5,x5,x6 eor x7,x7,x8 eor x9,x9,x10 eor x11,x11,x12 eor x13,x13,x14 eor v0.16b,v0.16b,v24.16b eor x15,x15,x16 eor 
v1.16b,v1.16b,v25.16b eor x17,x17,x19 eor v2.16b,v2.16b,v26.16b eor x20,x20,x21 eor v3.16b,v3.16b,v27.16b ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 stp x5,x7,[x0,#0] // store output add x28,x28,#7 // increment counter stp x9,x11,[x0,#16] stp x13,x15,[x0,#32] stp x17,x20,[x0,#48] add x0,x0,#64 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 eor v4.16b,v4.16b,v24.16b eor v5.16b,v5.16b,v25.16b eor v6.16b,v6.16b,v26.16b eor v7.16b,v7.16b,v27.16b st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 eor v8.16b,v8.16b,v0.16b ldp q24,q25,[sp,#0] eor v9.16b,v9.16b,v1.16b ldp q26,q27,[sp,#32] eor v10.16b,v10.16b,v2.16b eor v11.16b,v11.16b,v3.16b st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 eor v12.16b,v12.16b,v4.16b eor v13.16b,v13.16b,v5.16b eor v14.16b,v14.16b,v6.16b eor v15.16b,v15.16b,v7.16b st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 eor v16.16b,v16.16b,v8.16b eor v17.16b,v17.16b,v9.16b eor v18.16b,v18.16b,v10.16b eor v19.16b,v19.16b,v11.16b st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 shl v0.4s,v31.4s,#1 // 4 -> 8 eor v20.16b,v20.16b,v12.16b eor v21.16b,v21.16b,v13.16b eor v22.16b,v22.16b,v14.16b eor v23.16b,v23.16b,v15.16b st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 add v27.4s,v27.4s,v0.4s // += 8 add v28.4s,v28.4s,v0.4s add v29.4s,v29.4s,v0.4s add v30.4s,v30.4s,v0.4s b.hs Loop_outer_512_neon adds x2,x2,#512 ushr v0.4s,v31.4s,#2 // 4 -> 1 ldp d8,d9,[sp,#128+0] // meet ABI requirements ldp d10,d11,[sp,#128+16] ldp d12,d13,[sp,#128+32] ldp d14,d15,[sp,#128+48] stp q24,q31,[sp,#0] // wipe off-load area stp q24,q31,[sp,#32] stp q24,q31,[sp,#64] b.eq Ldone_512_neon cmp x2,#192 sub v27.4s,v27.4s,v0.4s // -= 1 sub v28.4s,v28.4s,v0.4s sub v29.4s,v29.4s,v0.4s add sp,sp,#128 b.hs Loop_outer_neon eor v25.16b,v25.16b,v25.16b eor v26.16b,v26.16b,v26.16b eor v27.16b,v27.16b,v27.16b eor v28.16b,v28.16b,v28.16b eor v29.16b,v29.16b,v29.16b eor v30.16b,v30.16b,v30.16b b Loop_outer Ldone_512_neon: ldp x19,x20,[x29,#16] add sp,sp,#128+64 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 AARCH64_VALIDATE_LINK_REGISTER ret #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) ring-0.17.14/pregenerated/chacha-armv8-linux64.S000064400000000000000000001167451046102023000173150ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) .section .rodata .align 5 .Lsigma: .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral .Lone: .long 1,0,0,0 .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .text .globl ChaCha20_ctr32_nohw .hidden ChaCha20_ctr32_nohw .type ChaCha20_ctr32_nohw,%function .align 5 ChaCha20_ctr32_nohw: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-96]! 
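// Arguments, per the load/store comments below: x0 = output, x1 = input, x2 = length in bytes,
// x3 = 256-bit key, x4 = 16-byte counter/nonce block.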
add x29,sp,#0 adrp x5,.Lsigma add x5,x5,:lo12:.Lsigma stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] sub sp,sp,#64 ldp x22,x23,[x5] // load sigma ldp x24,x25,[x3] // load key ldp x26,x27,[x3,#16] ldp x28,x30,[x4] // load counter #ifdef __AARCH64EB__ ror x24,x24,#32 ror x25,x25,#32 ror x26,x26,#32 ror x27,x27,#32 ror x28,x28,#32 ror x30,x30,#32 #endif .Loop_outer: mov w5,w22 // unpack key block lsr x6,x22,#32 mov w7,w23 lsr x8,x23,#32 mov w9,w24 lsr x10,x24,#32 mov w11,w25 lsr x12,x25,#32 mov w13,w26 lsr x14,x26,#32 mov w15,w27 lsr x16,x27,#32 mov w17,w28 lsr x19,x28,#32 mov w20,w30 lsr x21,x30,#32 mov x4,#10 subs x2,x2,#64 .Loop: sub x4,x4,#1 add w5,w5,w9 add w6,w6,w10 add w7,w7,w11 add w8,w8,w12 eor w17,w17,w5 eor w19,w19,w6 eor w20,w20,w7 eor w21,w21,w8 ror w17,w17,#16 ror w19,w19,#16 ror w20,w20,#16 ror w21,w21,#16 add w13,w13,w17 add w14,w14,w19 add w15,w15,w20 add w16,w16,w21 eor w9,w9,w13 eor w10,w10,w14 eor w11,w11,w15 eor w12,w12,w16 ror w9,w9,#20 ror w10,w10,#20 ror w11,w11,#20 ror w12,w12,#20 add w5,w5,w9 add w6,w6,w10 add w7,w7,w11 add w8,w8,w12 eor w17,w17,w5 eor w19,w19,w6 eor w20,w20,w7 eor w21,w21,w8 ror w17,w17,#24 ror w19,w19,#24 ror w20,w20,#24 ror w21,w21,#24 add w13,w13,w17 add w14,w14,w19 add w15,w15,w20 add w16,w16,w21 eor w9,w9,w13 eor w10,w10,w14 eor w11,w11,w15 eor w12,w12,w16 ror w9,w9,#25 ror w10,w10,#25 ror w11,w11,#25 ror w12,w12,#25 add w5,w5,w10 add w6,w6,w11 add w7,w7,w12 add w8,w8,w9 eor w21,w21,w5 eor w17,w17,w6 eor w19,w19,w7 eor w20,w20,w8 ror w21,w21,#16 ror w17,w17,#16 ror w19,w19,#16 ror w20,w20,#16 add w15,w15,w21 add w16,w16,w17 add w13,w13,w19 add w14,w14,w20 eor w10,w10,w15 eor w11,w11,w16 eor w12,w12,w13 eor w9,w9,w14 ror w10,w10,#20 ror w11,w11,#20 ror w12,w12,#20 ror w9,w9,#20 add w5,w5,w10 add w6,w6,w11 add w7,w7,w12 add w8,w8,w9 eor w21,w21,w5 eor w17,w17,w6 eor w19,w19,w7 eor w20,w20,w8 ror w21,w21,#24 ror w17,w17,#24 ror w19,w19,#24 ror w20,w20,#24 add w15,w15,w21 add w16,w16,w17 add w13,w13,w19 add w14,w14,w20 eor w10,w10,w15 eor w11,w11,w16 eor w12,w12,w13 eor w9,w9,w14 ror w10,w10,#25 ror w11,w11,#25 ror w12,w12,#25 ror w9,w9,#25 cbnz x4,.Loop add w5,w5,w22 // accumulate key block add x6,x6,x22,lsr#32 add w7,w7,w23 add x8,x8,x23,lsr#32 add w9,w9,w24 add x10,x10,x24,lsr#32 add w11,w11,w25 add x12,x12,x25,lsr#32 add w13,w13,w26 add x14,x14,x26,lsr#32 add w15,w15,w27 add x16,x16,x27,lsr#32 add w17,w17,w28 add x19,x19,x28,lsr#32 add w20,w20,w30 add x21,x21,x30,lsr#32 b.lo .Ltail add x5,x5,x6,lsl#32 // pack add x7,x7,x8,lsl#32 ldp x6,x8,[x1,#0] // load input add x9,x9,x10,lsl#32 add x11,x11,x12,lsl#32 ldp x10,x12,[x1,#16] add x13,x13,x14,lsl#32 add x15,x15,x16,lsl#32 ldp x14,x16,[x1,#32] add x17,x17,x19,lsl#32 add x20,x20,x21,lsl#32 ldp x19,x21,[x1,#48] add x1,x1,#64 #ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 rev x9,x9 rev x11,x11 rev x13,x13 rev x15,x15 rev x17,x17 rev x20,x20 #endif eor x5,x5,x6 eor x7,x7,x8 eor x9,x9,x10 eor x11,x11,x12 eor x13,x13,x14 eor x15,x15,x16 eor x17,x17,x19 eor x20,x20,x21 stp x5,x7,[x0,#0] // store output add x28,x28,#1 // increment counter stp x9,x11,[x0,#16] stp x13,x15,[x0,#32] stp x17,x20,[x0,#48] add x0,x0,#64 b.hi .Loop_outer ldp x19,x20,[x29,#16] add sp,sp,#64 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 AARCH64_VALIDATE_LINK_REGISTER ret .align 4 .Ltail: add x2,x2,#64 .Less_than_64: sub x0,x0,#1 add x1,x1,x2 add x0,x0,x2 add x4,sp,x2 neg x2,x2 add x5,x5,x6,lsl#32 // pack add 
x7,x7,x8,lsl#32 add x9,x9,x10,lsl#32 add x11,x11,x12,lsl#32 add x13,x13,x14,lsl#32 add x15,x15,x16,lsl#32 add x17,x17,x19,lsl#32 add x20,x20,x21,lsl#32 #ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 rev x9,x9 rev x11,x11 rev x13,x13 rev x15,x15 rev x17,x17 rev x20,x20 #endif stp x5,x7,[sp,#0] stp x9,x11,[sp,#16] stp x13,x15,[sp,#32] stp x17,x20,[sp,#48] .Loop_tail: ldrb w10,[x1,x2] ldrb w11,[x4,x2] add x2,x2,#1 eor w10,w10,w11 strb w10,[x0,x2] cbnz x2,.Loop_tail stp xzr,xzr,[sp,#0] stp xzr,xzr,[sp,#16] stp xzr,xzr,[sp,#32] stp xzr,xzr,[sp,#48] ldp x19,x20,[x29,#16] add sp,sp,#64 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 AARCH64_VALIDATE_LINK_REGISTER ret .size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw .globl ChaCha20_ctr32_neon .hidden ChaCha20_ctr32_neon .type ChaCha20_ctr32_neon,%function .align 5 ChaCha20_ctr32_neon: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-96]! add x29,sp,#0 adrp x5,.Lsigma add x5,x5,:lo12:.Lsigma stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] cmp x2,#512 b.hs .L512_or_more_neon sub sp,sp,#64 ldp x22,x23,[x5] // load sigma ld1 {v24.4s},[x5],#16 ldp x24,x25,[x3] // load key ldp x26,x27,[x3,#16] ld1 {v25.4s,v26.4s},[x3] ldp x28,x30,[x4] // load counter ld1 {v27.4s},[x4] ld1 {v31.4s},[x5] #ifdef __AARCH64EB__ rev64 v24.4s,v24.4s ror x24,x24,#32 ror x25,x25,#32 ror x26,x26,#32 ror x27,x27,#32 ror x28,x28,#32 ror x30,x30,#32 #endif add v27.4s,v27.4s,v31.4s // += 1 add v28.4s,v27.4s,v31.4s add v29.4s,v28.4s,v31.4s shl v31.4s,v31.4s,#2 // 1 -> 4 .Loop_outer_neon: mov w5,w22 // unpack key block lsr x6,x22,#32 mov v0.16b,v24.16b mov w7,w23 lsr x8,x23,#32 mov v4.16b,v24.16b mov w9,w24 lsr x10,x24,#32 mov v16.16b,v24.16b mov w11,w25 mov v1.16b,v25.16b lsr x12,x25,#32 mov v5.16b,v25.16b mov w13,w26 mov v17.16b,v25.16b lsr x14,x26,#32 mov v3.16b,v27.16b mov w15,w27 mov v7.16b,v28.16b lsr x16,x27,#32 mov v19.16b,v29.16b mov w17,w28 mov v2.16b,v26.16b lsr x19,x28,#32 mov v6.16b,v26.16b mov w20,w30 mov v18.16b,v26.16b lsr x21,x30,#32 mov x4,#10 subs x2,x2,#256 .Loop_neon: sub x4,x4,#1 add v0.4s,v0.4s,v1.4s add w5,w5,w9 add v4.4s,v4.4s,v5.4s add w6,w6,w10 add v16.4s,v16.4s,v17.4s add w7,w7,w11 eor v3.16b,v3.16b,v0.16b add w8,w8,w12 eor v7.16b,v7.16b,v4.16b eor w17,w17,w5 eor v19.16b,v19.16b,v16.16b eor w19,w19,w6 rev32 v3.8h,v3.8h eor w20,w20,w7 rev32 v7.8h,v7.8h eor w21,w21,w8 rev32 v19.8h,v19.8h ror w17,w17,#16 add v2.4s,v2.4s,v3.4s ror w19,w19,#16 add v6.4s,v6.4s,v7.4s ror w20,w20,#16 add v18.4s,v18.4s,v19.4s ror w21,w21,#16 eor v20.16b,v1.16b,v2.16b add w13,w13,w17 eor v21.16b,v5.16b,v6.16b add w14,w14,w19 eor v22.16b,v17.16b,v18.16b add w15,w15,w20 ushr v1.4s,v20.4s,#20 add w16,w16,w21 ushr v5.4s,v21.4s,#20 eor w9,w9,w13 ushr v17.4s,v22.4s,#20 eor w10,w10,w14 sli v1.4s,v20.4s,#12 eor w11,w11,w15 sli v5.4s,v21.4s,#12 eor w12,w12,w16 sli v17.4s,v22.4s,#12 ror w9,w9,#20 add v0.4s,v0.4s,v1.4s ror w10,w10,#20 add v4.4s,v4.4s,v5.4s ror w11,w11,#20 add v16.4s,v16.4s,v17.4s ror w12,w12,#20 eor v20.16b,v3.16b,v0.16b add w5,w5,w9 eor v21.16b,v7.16b,v4.16b add w6,w6,w10 eor v22.16b,v19.16b,v16.16b add w7,w7,w11 ushr v3.4s,v20.4s,#24 add w8,w8,w12 ushr v7.4s,v21.4s,#24 eor w17,w17,w5 ushr v19.4s,v22.4s,#24 eor w19,w19,w6 sli v3.4s,v20.4s,#8 eor w20,w20,w7 sli v7.4s,v21.4s,#8 eor w21,w21,w8 sli v19.4s,v22.4s,#8 ror w17,w17,#24 add v2.4s,v2.4s,v3.4s ror w19,w19,#24 add v6.4s,v6.4s,v7.4s ror w20,w20,#24 add v18.4s,v18.4s,v19.4s ror w21,w21,#24 eor v20.16b,v1.16b,v2.16b add 
w13,w13,w17 eor v21.16b,v5.16b,v6.16b add w14,w14,w19 eor v22.16b,v17.16b,v18.16b add w15,w15,w20 ushr v1.4s,v20.4s,#25 add w16,w16,w21 ushr v5.4s,v21.4s,#25 eor w9,w9,w13 ushr v17.4s,v22.4s,#25 eor w10,w10,w14 sli v1.4s,v20.4s,#7 eor w11,w11,w15 sli v5.4s,v21.4s,#7 eor w12,w12,w16 sli v17.4s,v22.4s,#7 ror w9,w9,#25 ext v2.16b,v2.16b,v2.16b,#8 ror w10,w10,#25 ext v6.16b,v6.16b,v6.16b,#8 ror w11,w11,#25 ext v18.16b,v18.16b,v18.16b,#8 ror w12,w12,#25 ext v3.16b,v3.16b,v3.16b,#12 ext v7.16b,v7.16b,v7.16b,#12 ext v19.16b,v19.16b,v19.16b,#12 ext v1.16b,v1.16b,v1.16b,#4 ext v5.16b,v5.16b,v5.16b,#4 ext v17.16b,v17.16b,v17.16b,#4 add v0.4s,v0.4s,v1.4s add w5,w5,w10 add v4.4s,v4.4s,v5.4s add w6,w6,w11 add v16.4s,v16.4s,v17.4s add w7,w7,w12 eor v3.16b,v3.16b,v0.16b add w8,w8,w9 eor v7.16b,v7.16b,v4.16b eor w21,w21,w5 eor v19.16b,v19.16b,v16.16b eor w17,w17,w6 rev32 v3.8h,v3.8h eor w19,w19,w7 rev32 v7.8h,v7.8h eor w20,w20,w8 rev32 v19.8h,v19.8h ror w21,w21,#16 add v2.4s,v2.4s,v3.4s ror w17,w17,#16 add v6.4s,v6.4s,v7.4s ror w19,w19,#16 add v18.4s,v18.4s,v19.4s ror w20,w20,#16 eor v20.16b,v1.16b,v2.16b add w15,w15,w21 eor v21.16b,v5.16b,v6.16b add w16,w16,w17 eor v22.16b,v17.16b,v18.16b add w13,w13,w19 ushr v1.4s,v20.4s,#20 add w14,w14,w20 ushr v5.4s,v21.4s,#20 eor w10,w10,w15 ushr v17.4s,v22.4s,#20 eor w11,w11,w16 sli v1.4s,v20.4s,#12 eor w12,w12,w13 sli v5.4s,v21.4s,#12 eor w9,w9,w14 sli v17.4s,v22.4s,#12 ror w10,w10,#20 add v0.4s,v0.4s,v1.4s ror w11,w11,#20 add v4.4s,v4.4s,v5.4s ror w12,w12,#20 add v16.4s,v16.4s,v17.4s ror w9,w9,#20 eor v20.16b,v3.16b,v0.16b add w5,w5,w10 eor v21.16b,v7.16b,v4.16b add w6,w6,w11 eor v22.16b,v19.16b,v16.16b add w7,w7,w12 ushr v3.4s,v20.4s,#24 add w8,w8,w9 ushr v7.4s,v21.4s,#24 eor w21,w21,w5 ushr v19.4s,v22.4s,#24 eor w17,w17,w6 sli v3.4s,v20.4s,#8 eor w19,w19,w7 sli v7.4s,v21.4s,#8 eor w20,w20,w8 sli v19.4s,v22.4s,#8 ror w21,w21,#24 add v2.4s,v2.4s,v3.4s ror w17,w17,#24 add v6.4s,v6.4s,v7.4s ror w19,w19,#24 add v18.4s,v18.4s,v19.4s ror w20,w20,#24 eor v20.16b,v1.16b,v2.16b add w15,w15,w21 eor v21.16b,v5.16b,v6.16b add w16,w16,w17 eor v22.16b,v17.16b,v18.16b add w13,w13,w19 ushr v1.4s,v20.4s,#25 add w14,w14,w20 ushr v5.4s,v21.4s,#25 eor w10,w10,w15 ushr v17.4s,v22.4s,#25 eor w11,w11,w16 sli v1.4s,v20.4s,#7 eor w12,w12,w13 sli v5.4s,v21.4s,#7 eor w9,w9,w14 sli v17.4s,v22.4s,#7 ror w10,w10,#25 ext v2.16b,v2.16b,v2.16b,#8 ror w11,w11,#25 ext v6.16b,v6.16b,v6.16b,#8 ror w12,w12,#25 ext v18.16b,v18.16b,v18.16b,#8 ror w9,w9,#25 ext v3.16b,v3.16b,v3.16b,#4 ext v7.16b,v7.16b,v7.16b,#4 ext v19.16b,v19.16b,v19.16b,#4 ext v1.16b,v1.16b,v1.16b,#12 ext v5.16b,v5.16b,v5.16b,#12 ext v17.16b,v17.16b,v17.16b,#12 cbnz x4,.Loop_neon add w5,w5,w22 // accumulate key block add v0.4s,v0.4s,v24.4s add x6,x6,x22,lsr#32 add v4.4s,v4.4s,v24.4s add w7,w7,w23 add v16.4s,v16.4s,v24.4s add x8,x8,x23,lsr#32 add v2.4s,v2.4s,v26.4s add w9,w9,w24 add v6.4s,v6.4s,v26.4s add x10,x10,x24,lsr#32 add v18.4s,v18.4s,v26.4s add w11,w11,w25 add v3.4s,v3.4s,v27.4s add x12,x12,x25,lsr#32 add w13,w13,w26 add v7.4s,v7.4s,v28.4s add x14,x14,x26,lsr#32 add w15,w15,w27 add v19.4s,v19.4s,v29.4s add x16,x16,x27,lsr#32 add w17,w17,w28 add v1.4s,v1.4s,v25.4s add x19,x19,x28,lsr#32 add w20,w20,w30 add v5.4s,v5.4s,v25.4s add x21,x21,x30,lsr#32 add v17.4s,v17.4s,v25.4s b.lo .Ltail_neon add x5,x5,x6,lsl#32 // pack add x7,x7,x8,lsl#32 ldp x6,x8,[x1,#0] // load input add x9,x9,x10,lsl#32 add x11,x11,x12,lsl#32 ldp x10,x12,[x1,#16] add x13,x13,x14,lsl#32 add x15,x15,x16,lsl#32 ldp x14,x16,[x1,#32] add x17,x17,x19,lsl#32 add 
x20,x20,x21,lsl#32 ldp x19,x21,[x1,#48] add x1,x1,#64 #ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 rev x9,x9 rev x11,x11 rev x13,x13 rev x15,x15 rev x17,x17 rev x20,x20 #endif ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 eor x5,x5,x6 eor x7,x7,x8 eor x9,x9,x10 eor x11,x11,x12 eor x13,x13,x14 eor v0.16b,v0.16b,v20.16b eor x15,x15,x16 eor v1.16b,v1.16b,v21.16b eor x17,x17,x19 eor v2.16b,v2.16b,v22.16b eor x20,x20,x21 eor v3.16b,v3.16b,v23.16b ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 stp x5,x7,[x0,#0] // store output add x28,x28,#4 // increment counter stp x9,x11,[x0,#16] add v27.4s,v27.4s,v31.4s // += 4 stp x13,x15,[x0,#32] add v28.4s,v28.4s,v31.4s stp x17,x20,[x0,#48] add v29.4s,v29.4s,v31.4s add x0,x0,#64 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 eor v4.16b,v4.16b,v20.16b eor v5.16b,v5.16b,v21.16b eor v6.16b,v6.16b,v22.16b eor v7.16b,v7.16b,v23.16b st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 eor v16.16b,v16.16b,v0.16b eor v17.16b,v17.16b,v1.16b eor v18.16b,v18.16b,v2.16b eor v19.16b,v19.16b,v3.16b st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 b.hi .Loop_outer_neon ldp x19,x20,[x29,#16] add sp,sp,#64 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 AARCH64_VALIDATE_LINK_REGISTER ret .Ltail_neon: add x2,x2,#256 cmp x2,#64 b.lo .Less_than_64 add x5,x5,x6,lsl#32 // pack add x7,x7,x8,lsl#32 ldp x6,x8,[x1,#0] // load input add x9,x9,x10,lsl#32 add x11,x11,x12,lsl#32 ldp x10,x12,[x1,#16] add x13,x13,x14,lsl#32 add x15,x15,x16,lsl#32 ldp x14,x16,[x1,#32] add x17,x17,x19,lsl#32 add x20,x20,x21,lsl#32 ldp x19,x21,[x1,#48] add x1,x1,#64 #ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 rev x9,x9 rev x11,x11 rev x13,x13 rev x15,x15 rev x17,x17 rev x20,x20 #endif eor x5,x5,x6 eor x7,x7,x8 eor x9,x9,x10 eor x11,x11,x12 eor x13,x13,x14 eor x15,x15,x16 eor x17,x17,x19 eor x20,x20,x21 stp x5,x7,[x0,#0] // store output add x28,x28,#4 // increment counter stp x9,x11,[x0,#16] stp x13,x15,[x0,#32] stp x17,x20,[x0,#48] add x0,x0,#64 b.eq .Ldone_neon sub x2,x2,#64 cmp x2,#64 b.lo .Less_than_128 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 eor v0.16b,v0.16b,v20.16b eor v1.16b,v1.16b,v21.16b eor v2.16b,v2.16b,v22.16b eor v3.16b,v3.16b,v23.16b st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 b.eq .Ldone_neon sub x2,x2,#64 cmp x2,#64 b.lo .Less_than_192 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 eor v4.16b,v4.16b,v20.16b eor v5.16b,v5.16b,v21.16b eor v6.16b,v6.16b,v22.16b eor v7.16b,v7.16b,v23.16b st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 b.eq .Ldone_neon sub x2,x2,#64 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] b .Last_neon .Less_than_128: st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] b .Last_neon .Less_than_192: st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] b .Last_neon .align 4 .Last_neon: sub x0,x0,#1 add x1,x1,x2 add x0,x0,x2 add x4,sp,x2 neg x2,x2 .Loop_tail_neon: ldrb w10,[x1,x2] ldrb w11,[x4,x2] add x2,x2,#1 eor w10,w10,w11 strb w10,[x0,x2] cbnz x2,.Loop_tail_neon stp xzr,xzr,[sp,#0] stp xzr,xzr,[sp,#16] stp xzr,xzr,[sp,#32] stp xzr,xzr,[sp,#48] .Ldone_neon: ldp x19,x20,[x29,#16] add sp,sp,#64 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 AARCH64_VALIDATE_LINK_REGISTER ret .size ChaCha20_ctr32_neon,.-ChaCha20_ctr32_neon .type ChaCha20_512_neon,%function .align 5 ChaCha20_512_neon: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-96]! 
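// Bulk path for lengths >= 512: every Loop_outer_512_neon pass emits eight 64-byte blocks --
// two from the scalar registers (which take two double rounds per loop iteration) and six held in v0-v23.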
add x29,sp,#0 adrp x5,.Lsigma add x5,x5,:lo12:.Lsigma stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] .L512_or_more_neon: sub sp,sp,#128+64 ldp x22,x23,[x5] // load sigma ld1 {v24.4s},[x5],#16 ldp x24,x25,[x3] // load key ldp x26,x27,[x3,#16] ld1 {v25.4s,v26.4s},[x3] ldp x28,x30,[x4] // load counter ld1 {v27.4s},[x4] ld1 {v31.4s},[x5] #ifdef __AARCH64EB__ rev64 v24.4s,v24.4s ror x24,x24,#32 ror x25,x25,#32 ror x26,x26,#32 ror x27,x27,#32 ror x28,x28,#32 ror x30,x30,#32 #endif add v27.4s,v27.4s,v31.4s // += 1 stp q24,q25,[sp,#0] // off-load key block, invariant part add v27.4s,v27.4s,v31.4s // not typo str q26,[sp,#32] add v28.4s,v27.4s,v31.4s add v29.4s,v28.4s,v31.4s add v30.4s,v29.4s,v31.4s shl v31.4s,v31.4s,#2 // 1 -> 4 stp d8,d9,[sp,#128+0] // meet ABI requirements stp d10,d11,[sp,#128+16] stp d12,d13,[sp,#128+32] stp d14,d15,[sp,#128+48] sub x2,x2,#512 // not typo .Loop_outer_512_neon: mov v0.16b,v24.16b mov v4.16b,v24.16b mov v8.16b,v24.16b mov v12.16b,v24.16b mov v16.16b,v24.16b mov v20.16b,v24.16b mov v1.16b,v25.16b mov w5,w22 // unpack key block mov v5.16b,v25.16b lsr x6,x22,#32 mov v9.16b,v25.16b mov w7,w23 mov v13.16b,v25.16b lsr x8,x23,#32 mov v17.16b,v25.16b mov w9,w24 mov v21.16b,v25.16b lsr x10,x24,#32 mov v3.16b,v27.16b mov w11,w25 mov v7.16b,v28.16b lsr x12,x25,#32 mov v11.16b,v29.16b mov w13,w26 mov v15.16b,v30.16b lsr x14,x26,#32 mov v2.16b,v26.16b mov w15,w27 mov v6.16b,v26.16b lsr x16,x27,#32 add v19.4s,v3.4s,v31.4s // +4 mov w17,w28 add v23.4s,v7.4s,v31.4s // +4 lsr x19,x28,#32 mov v10.16b,v26.16b mov w20,w30 mov v14.16b,v26.16b lsr x21,x30,#32 mov v18.16b,v26.16b stp q27,q28,[sp,#48] // off-load key block, variable part mov v22.16b,v26.16b str q29,[sp,#80] mov x4,#5 subs x2,x2,#512 .Loop_upper_neon: sub x4,x4,#1 add v0.4s,v0.4s,v1.4s add w5,w5,w9 add v4.4s,v4.4s,v5.4s add w6,w6,w10 add v8.4s,v8.4s,v9.4s add w7,w7,w11 add v12.4s,v12.4s,v13.4s add w8,w8,w12 add v16.4s,v16.4s,v17.4s eor w17,w17,w5 add v20.4s,v20.4s,v21.4s eor w19,w19,w6 eor v3.16b,v3.16b,v0.16b eor w20,w20,w7 eor v7.16b,v7.16b,v4.16b eor w21,w21,w8 eor v11.16b,v11.16b,v8.16b ror w17,w17,#16 eor v15.16b,v15.16b,v12.16b ror w19,w19,#16 eor v19.16b,v19.16b,v16.16b ror w20,w20,#16 eor v23.16b,v23.16b,v20.16b ror w21,w21,#16 rev32 v3.8h,v3.8h add w13,w13,w17 rev32 v7.8h,v7.8h add w14,w14,w19 rev32 v11.8h,v11.8h add w15,w15,w20 rev32 v15.8h,v15.8h add w16,w16,w21 rev32 v19.8h,v19.8h eor w9,w9,w13 rev32 v23.8h,v23.8h eor w10,w10,w14 add v2.4s,v2.4s,v3.4s eor w11,w11,w15 add v6.4s,v6.4s,v7.4s eor w12,w12,w16 add v10.4s,v10.4s,v11.4s ror w9,w9,#20 add v14.4s,v14.4s,v15.4s ror w10,w10,#20 add v18.4s,v18.4s,v19.4s ror w11,w11,#20 add v22.4s,v22.4s,v23.4s ror w12,w12,#20 eor v24.16b,v1.16b,v2.16b add w5,w5,w9 eor v25.16b,v5.16b,v6.16b add w6,w6,w10 eor v26.16b,v9.16b,v10.16b add w7,w7,w11 eor v27.16b,v13.16b,v14.16b add w8,w8,w12 eor v28.16b,v17.16b,v18.16b eor w17,w17,w5 eor v29.16b,v21.16b,v22.16b eor w19,w19,w6 ushr v1.4s,v24.4s,#20 eor w20,w20,w7 ushr v5.4s,v25.4s,#20 eor w21,w21,w8 ushr v9.4s,v26.4s,#20 ror w17,w17,#24 ushr v13.4s,v27.4s,#20 ror w19,w19,#24 ushr v17.4s,v28.4s,#20 ror w20,w20,#24 ushr v21.4s,v29.4s,#20 ror w21,w21,#24 sli v1.4s,v24.4s,#12 add w13,w13,w17 sli v5.4s,v25.4s,#12 add w14,w14,w19 sli v9.4s,v26.4s,#12 add w15,w15,w20 sli v13.4s,v27.4s,#12 add w16,w16,w21 sli v17.4s,v28.4s,#12 eor w9,w9,w13 sli v21.4s,v29.4s,#12 eor w10,w10,w14 add v0.4s,v0.4s,v1.4s eor w11,w11,w15 add v4.4s,v4.4s,v5.4s eor w12,w12,w16 add v8.4s,v8.4s,v9.4s ror 
w9,w9,#25 add v12.4s,v12.4s,v13.4s ror w10,w10,#25 add v16.4s,v16.4s,v17.4s ror w11,w11,#25 add v20.4s,v20.4s,v21.4s ror w12,w12,#25 eor v24.16b,v3.16b,v0.16b add w5,w5,w10 eor v25.16b,v7.16b,v4.16b add w6,w6,w11 eor v26.16b,v11.16b,v8.16b add w7,w7,w12 eor v27.16b,v15.16b,v12.16b add w8,w8,w9 eor v28.16b,v19.16b,v16.16b eor w21,w21,w5 eor v29.16b,v23.16b,v20.16b eor w17,w17,w6 ushr v3.4s,v24.4s,#24 eor w19,w19,w7 ushr v7.4s,v25.4s,#24 eor w20,w20,w8 ushr v11.4s,v26.4s,#24 ror w21,w21,#16 ushr v15.4s,v27.4s,#24 ror w17,w17,#16 ushr v19.4s,v28.4s,#24 ror w19,w19,#16 ushr v23.4s,v29.4s,#24 ror w20,w20,#16 sli v3.4s,v24.4s,#8 add w15,w15,w21 sli v7.4s,v25.4s,#8 add w16,w16,w17 sli v11.4s,v26.4s,#8 add w13,w13,w19 sli v15.4s,v27.4s,#8 add w14,w14,w20 sli v19.4s,v28.4s,#8 eor w10,w10,w15 sli v23.4s,v29.4s,#8 eor w11,w11,w16 add v2.4s,v2.4s,v3.4s eor w12,w12,w13 add v6.4s,v6.4s,v7.4s eor w9,w9,w14 add v10.4s,v10.4s,v11.4s ror w10,w10,#20 add v14.4s,v14.4s,v15.4s ror w11,w11,#20 add v18.4s,v18.4s,v19.4s ror w12,w12,#20 add v22.4s,v22.4s,v23.4s ror w9,w9,#20 eor v24.16b,v1.16b,v2.16b add w5,w5,w10 eor v25.16b,v5.16b,v6.16b add w6,w6,w11 eor v26.16b,v9.16b,v10.16b add w7,w7,w12 eor v27.16b,v13.16b,v14.16b add w8,w8,w9 eor v28.16b,v17.16b,v18.16b eor w21,w21,w5 eor v29.16b,v21.16b,v22.16b eor w17,w17,w6 ushr v1.4s,v24.4s,#25 eor w19,w19,w7 ushr v5.4s,v25.4s,#25 eor w20,w20,w8 ushr v9.4s,v26.4s,#25 ror w21,w21,#24 ushr v13.4s,v27.4s,#25 ror w17,w17,#24 ushr v17.4s,v28.4s,#25 ror w19,w19,#24 ushr v21.4s,v29.4s,#25 ror w20,w20,#24 sli v1.4s,v24.4s,#7 add w15,w15,w21 sli v5.4s,v25.4s,#7 add w16,w16,w17 sli v9.4s,v26.4s,#7 add w13,w13,w19 sli v13.4s,v27.4s,#7 add w14,w14,w20 sli v17.4s,v28.4s,#7 eor w10,w10,w15 sli v21.4s,v29.4s,#7 eor w11,w11,w16 ext v2.16b,v2.16b,v2.16b,#8 eor w12,w12,w13 ext v6.16b,v6.16b,v6.16b,#8 eor w9,w9,w14 ext v10.16b,v10.16b,v10.16b,#8 ror w10,w10,#25 ext v14.16b,v14.16b,v14.16b,#8 ror w11,w11,#25 ext v18.16b,v18.16b,v18.16b,#8 ror w12,w12,#25 ext v22.16b,v22.16b,v22.16b,#8 ror w9,w9,#25 ext v3.16b,v3.16b,v3.16b,#12 ext v7.16b,v7.16b,v7.16b,#12 ext v11.16b,v11.16b,v11.16b,#12 ext v15.16b,v15.16b,v15.16b,#12 ext v19.16b,v19.16b,v19.16b,#12 ext v23.16b,v23.16b,v23.16b,#12 ext v1.16b,v1.16b,v1.16b,#4 ext v5.16b,v5.16b,v5.16b,#4 ext v9.16b,v9.16b,v9.16b,#4 ext v13.16b,v13.16b,v13.16b,#4 ext v17.16b,v17.16b,v17.16b,#4 ext v21.16b,v21.16b,v21.16b,#4 add v0.4s,v0.4s,v1.4s add w5,w5,w9 add v4.4s,v4.4s,v5.4s add w6,w6,w10 add v8.4s,v8.4s,v9.4s add w7,w7,w11 add v12.4s,v12.4s,v13.4s add w8,w8,w12 add v16.4s,v16.4s,v17.4s eor w17,w17,w5 add v20.4s,v20.4s,v21.4s eor w19,w19,w6 eor v3.16b,v3.16b,v0.16b eor w20,w20,w7 eor v7.16b,v7.16b,v4.16b eor w21,w21,w8 eor v11.16b,v11.16b,v8.16b ror w17,w17,#16 eor v15.16b,v15.16b,v12.16b ror w19,w19,#16 eor v19.16b,v19.16b,v16.16b ror w20,w20,#16 eor v23.16b,v23.16b,v20.16b ror w21,w21,#16 rev32 v3.8h,v3.8h add w13,w13,w17 rev32 v7.8h,v7.8h add w14,w14,w19 rev32 v11.8h,v11.8h add w15,w15,w20 rev32 v15.8h,v15.8h add w16,w16,w21 rev32 v19.8h,v19.8h eor w9,w9,w13 rev32 v23.8h,v23.8h eor w10,w10,w14 add v2.4s,v2.4s,v3.4s eor w11,w11,w15 add v6.4s,v6.4s,v7.4s eor w12,w12,w16 add v10.4s,v10.4s,v11.4s ror w9,w9,#20 add v14.4s,v14.4s,v15.4s ror w10,w10,#20 add v18.4s,v18.4s,v19.4s ror w11,w11,#20 add v22.4s,v22.4s,v23.4s ror w12,w12,#20 eor v24.16b,v1.16b,v2.16b add w5,w5,w9 eor v25.16b,v5.16b,v6.16b add w6,w6,w10 eor v26.16b,v9.16b,v10.16b add w7,w7,w11 eor v27.16b,v13.16b,v14.16b add w8,w8,w12 eor v28.16b,v17.16b,v18.16b eor w17,w17,w5 eor 
v29.16b,v21.16b,v22.16b eor w19,w19,w6 ushr v1.4s,v24.4s,#20 eor w20,w20,w7 ushr v5.4s,v25.4s,#20 eor w21,w21,w8 ushr v9.4s,v26.4s,#20 ror w17,w17,#24 ushr v13.4s,v27.4s,#20 ror w19,w19,#24 ushr v17.4s,v28.4s,#20 ror w20,w20,#24 ushr v21.4s,v29.4s,#20 ror w21,w21,#24 sli v1.4s,v24.4s,#12 add w13,w13,w17 sli v5.4s,v25.4s,#12 add w14,w14,w19 sli v9.4s,v26.4s,#12 add w15,w15,w20 sli v13.4s,v27.4s,#12 add w16,w16,w21 sli v17.4s,v28.4s,#12 eor w9,w9,w13 sli v21.4s,v29.4s,#12 eor w10,w10,w14 add v0.4s,v0.4s,v1.4s eor w11,w11,w15 add v4.4s,v4.4s,v5.4s eor w12,w12,w16 add v8.4s,v8.4s,v9.4s ror w9,w9,#25 add v12.4s,v12.4s,v13.4s ror w10,w10,#25 add v16.4s,v16.4s,v17.4s ror w11,w11,#25 add v20.4s,v20.4s,v21.4s ror w12,w12,#25 eor v24.16b,v3.16b,v0.16b add w5,w5,w10 eor v25.16b,v7.16b,v4.16b add w6,w6,w11 eor v26.16b,v11.16b,v8.16b add w7,w7,w12 eor v27.16b,v15.16b,v12.16b add w8,w8,w9 eor v28.16b,v19.16b,v16.16b eor w21,w21,w5 eor v29.16b,v23.16b,v20.16b eor w17,w17,w6 ushr v3.4s,v24.4s,#24 eor w19,w19,w7 ushr v7.4s,v25.4s,#24 eor w20,w20,w8 ushr v11.4s,v26.4s,#24 ror w21,w21,#16 ushr v15.4s,v27.4s,#24 ror w17,w17,#16 ushr v19.4s,v28.4s,#24 ror w19,w19,#16 ushr v23.4s,v29.4s,#24 ror w20,w20,#16 sli v3.4s,v24.4s,#8 add w15,w15,w21 sli v7.4s,v25.4s,#8 add w16,w16,w17 sli v11.4s,v26.4s,#8 add w13,w13,w19 sli v15.4s,v27.4s,#8 add w14,w14,w20 sli v19.4s,v28.4s,#8 eor w10,w10,w15 sli v23.4s,v29.4s,#8 eor w11,w11,w16 add v2.4s,v2.4s,v3.4s eor w12,w12,w13 add v6.4s,v6.4s,v7.4s eor w9,w9,w14 add v10.4s,v10.4s,v11.4s ror w10,w10,#20 add v14.4s,v14.4s,v15.4s ror w11,w11,#20 add v18.4s,v18.4s,v19.4s ror w12,w12,#20 add v22.4s,v22.4s,v23.4s ror w9,w9,#20 eor v24.16b,v1.16b,v2.16b add w5,w5,w10 eor v25.16b,v5.16b,v6.16b add w6,w6,w11 eor v26.16b,v9.16b,v10.16b add w7,w7,w12 eor v27.16b,v13.16b,v14.16b add w8,w8,w9 eor v28.16b,v17.16b,v18.16b eor w21,w21,w5 eor v29.16b,v21.16b,v22.16b eor w17,w17,w6 ushr v1.4s,v24.4s,#25 eor w19,w19,w7 ushr v5.4s,v25.4s,#25 eor w20,w20,w8 ushr v9.4s,v26.4s,#25 ror w21,w21,#24 ushr v13.4s,v27.4s,#25 ror w17,w17,#24 ushr v17.4s,v28.4s,#25 ror w19,w19,#24 ushr v21.4s,v29.4s,#25 ror w20,w20,#24 sli v1.4s,v24.4s,#7 add w15,w15,w21 sli v5.4s,v25.4s,#7 add w16,w16,w17 sli v9.4s,v26.4s,#7 add w13,w13,w19 sli v13.4s,v27.4s,#7 add w14,w14,w20 sli v17.4s,v28.4s,#7 eor w10,w10,w15 sli v21.4s,v29.4s,#7 eor w11,w11,w16 ext v2.16b,v2.16b,v2.16b,#8 eor w12,w12,w13 ext v6.16b,v6.16b,v6.16b,#8 eor w9,w9,w14 ext v10.16b,v10.16b,v10.16b,#8 ror w10,w10,#25 ext v14.16b,v14.16b,v14.16b,#8 ror w11,w11,#25 ext v18.16b,v18.16b,v18.16b,#8 ror w12,w12,#25 ext v22.16b,v22.16b,v22.16b,#8 ror w9,w9,#25 ext v3.16b,v3.16b,v3.16b,#4 ext v7.16b,v7.16b,v7.16b,#4 ext v11.16b,v11.16b,v11.16b,#4 ext v15.16b,v15.16b,v15.16b,#4 ext v19.16b,v19.16b,v19.16b,#4 ext v23.16b,v23.16b,v23.16b,#4 ext v1.16b,v1.16b,v1.16b,#12 ext v5.16b,v5.16b,v5.16b,#12 ext v9.16b,v9.16b,v9.16b,#12 ext v13.16b,v13.16b,v13.16b,#12 ext v17.16b,v17.16b,v17.16b,#12 ext v21.16b,v21.16b,v21.16b,#12 cbnz x4,.Loop_upper_neon add w5,w5,w22 // accumulate key block add x6,x6,x22,lsr#32 add w7,w7,w23 add x8,x8,x23,lsr#32 add w9,w9,w24 add x10,x10,x24,lsr#32 add w11,w11,w25 add x12,x12,x25,lsr#32 add w13,w13,w26 add x14,x14,x26,lsr#32 add w15,w15,w27 add x16,x16,x27,lsr#32 add w17,w17,w28 add x19,x19,x28,lsr#32 add w20,w20,w30 add x21,x21,x30,lsr#32 add x5,x5,x6,lsl#32 // pack add x7,x7,x8,lsl#32 ldp x6,x8,[x1,#0] // load input add x9,x9,x10,lsl#32 add x11,x11,x12,lsl#32 ldp x10,x12,[x1,#16] add x13,x13,x14,lsl#32 add x15,x15,x16,lsl#32 ldp x14,x16,[x1,#32] 
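// The scalar registers finished a whole block during Loop_upper_neon: pack it, XOR it with the
// next 64 input bytes and store it, then reload the key block before entering Loop_lower_neon.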
add x17,x17,x19,lsl#32 add x20,x20,x21,lsl#32 ldp x19,x21,[x1,#48] add x1,x1,#64 #ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 rev x9,x9 rev x11,x11 rev x13,x13 rev x15,x15 rev x17,x17 rev x20,x20 #endif eor x5,x5,x6 eor x7,x7,x8 eor x9,x9,x10 eor x11,x11,x12 eor x13,x13,x14 eor x15,x15,x16 eor x17,x17,x19 eor x20,x20,x21 stp x5,x7,[x0,#0] // store output add x28,x28,#1 // increment counter mov w5,w22 // unpack key block lsr x6,x22,#32 stp x9,x11,[x0,#16] mov w7,w23 lsr x8,x23,#32 stp x13,x15,[x0,#32] mov w9,w24 lsr x10,x24,#32 stp x17,x20,[x0,#48] add x0,x0,#64 mov w11,w25 lsr x12,x25,#32 mov w13,w26 lsr x14,x26,#32 mov w15,w27 lsr x16,x27,#32 mov w17,w28 lsr x19,x28,#32 mov w20,w30 lsr x21,x30,#32 mov x4,#5 .Loop_lower_neon: sub x4,x4,#1 add v0.4s,v0.4s,v1.4s add w5,w5,w9 add v4.4s,v4.4s,v5.4s add w6,w6,w10 add v8.4s,v8.4s,v9.4s add w7,w7,w11 add v12.4s,v12.4s,v13.4s add w8,w8,w12 add v16.4s,v16.4s,v17.4s eor w17,w17,w5 add v20.4s,v20.4s,v21.4s eor w19,w19,w6 eor v3.16b,v3.16b,v0.16b eor w20,w20,w7 eor v7.16b,v7.16b,v4.16b eor w21,w21,w8 eor v11.16b,v11.16b,v8.16b ror w17,w17,#16 eor v15.16b,v15.16b,v12.16b ror w19,w19,#16 eor v19.16b,v19.16b,v16.16b ror w20,w20,#16 eor v23.16b,v23.16b,v20.16b ror w21,w21,#16 rev32 v3.8h,v3.8h add w13,w13,w17 rev32 v7.8h,v7.8h add w14,w14,w19 rev32 v11.8h,v11.8h add w15,w15,w20 rev32 v15.8h,v15.8h add w16,w16,w21 rev32 v19.8h,v19.8h eor w9,w9,w13 rev32 v23.8h,v23.8h eor w10,w10,w14 add v2.4s,v2.4s,v3.4s eor w11,w11,w15 add v6.4s,v6.4s,v7.4s eor w12,w12,w16 add v10.4s,v10.4s,v11.4s ror w9,w9,#20 add v14.4s,v14.4s,v15.4s ror w10,w10,#20 add v18.4s,v18.4s,v19.4s ror w11,w11,#20 add v22.4s,v22.4s,v23.4s ror w12,w12,#20 eor v24.16b,v1.16b,v2.16b add w5,w5,w9 eor v25.16b,v5.16b,v6.16b add w6,w6,w10 eor v26.16b,v9.16b,v10.16b add w7,w7,w11 eor v27.16b,v13.16b,v14.16b add w8,w8,w12 eor v28.16b,v17.16b,v18.16b eor w17,w17,w5 eor v29.16b,v21.16b,v22.16b eor w19,w19,w6 ushr v1.4s,v24.4s,#20 eor w20,w20,w7 ushr v5.4s,v25.4s,#20 eor w21,w21,w8 ushr v9.4s,v26.4s,#20 ror w17,w17,#24 ushr v13.4s,v27.4s,#20 ror w19,w19,#24 ushr v17.4s,v28.4s,#20 ror w20,w20,#24 ushr v21.4s,v29.4s,#20 ror w21,w21,#24 sli v1.4s,v24.4s,#12 add w13,w13,w17 sli v5.4s,v25.4s,#12 add w14,w14,w19 sli v9.4s,v26.4s,#12 add w15,w15,w20 sli v13.4s,v27.4s,#12 add w16,w16,w21 sli v17.4s,v28.4s,#12 eor w9,w9,w13 sli v21.4s,v29.4s,#12 eor w10,w10,w14 add v0.4s,v0.4s,v1.4s eor w11,w11,w15 add v4.4s,v4.4s,v5.4s eor w12,w12,w16 add v8.4s,v8.4s,v9.4s ror w9,w9,#25 add v12.4s,v12.4s,v13.4s ror w10,w10,#25 add v16.4s,v16.4s,v17.4s ror w11,w11,#25 add v20.4s,v20.4s,v21.4s ror w12,w12,#25 eor v24.16b,v3.16b,v0.16b add w5,w5,w10 eor v25.16b,v7.16b,v4.16b add w6,w6,w11 eor v26.16b,v11.16b,v8.16b add w7,w7,w12 eor v27.16b,v15.16b,v12.16b add w8,w8,w9 eor v28.16b,v19.16b,v16.16b eor w21,w21,w5 eor v29.16b,v23.16b,v20.16b eor w17,w17,w6 ushr v3.4s,v24.4s,#24 eor w19,w19,w7 ushr v7.4s,v25.4s,#24 eor w20,w20,w8 ushr v11.4s,v26.4s,#24 ror w21,w21,#16 ushr v15.4s,v27.4s,#24 ror w17,w17,#16 ushr v19.4s,v28.4s,#24 ror w19,w19,#16 ushr v23.4s,v29.4s,#24 ror w20,w20,#16 sli v3.4s,v24.4s,#8 add w15,w15,w21 sli v7.4s,v25.4s,#8 add w16,w16,w17 sli v11.4s,v26.4s,#8 add w13,w13,w19 sli v15.4s,v27.4s,#8 add w14,w14,w20 sli v19.4s,v28.4s,#8 eor w10,w10,w15 sli v23.4s,v29.4s,#8 eor w11,w11,w16 add v2.4s,v2.4s,v3.4s eor w12,w12,w13 add v6.4s,v6.4s,v7.4s eor w9,w9,w14 add v10.4s,v10.4s,v11.4s ror w10,w10,#20 add v14.4s,v14.4s,v15.4s ror w11,w11,#20 add v18.4s,v18.4s,v19.4s ror w12,w12,#20 add v22.4s,v22.4s,v23.4s ror w9,w9,#20 eor 
v24.16b,v1.16b,v2.16b add w5,w5,w10 eor v25.16b,v5.16b,v6.16b add w6,w6,w11 eor v26.16b,v9.16b,v10.16b add w7,w7,w12 eor v27.16b,v13.16b,v14.16b add w8,w8,w9 eor v28.16b,v17.16b,v18.16b eor w21,w21,w5 eor v29.16b,v21.16b,v22.16b eor w17,w17,w6 ushr v1.4s,v24.4s,#25 eor w19,w19,w7 ushr v5.4s,v25.4s,#25 eor w20,w20,w8 ushr v9.4s,v26.4s,#25 ror w21,w21,#24 ushr v13.4s,v27.4s,#25 ror w17,w17,#24 ushr v17.4s,v28.4s,#25 ror w19,w19,#24 ushr v21.4s,v29.4s,#25 ror w20,w20,#24 sli v1.4s,v24.4s,#7 add w15,w15,w21 sli v5.4s,v25.4s,#7 add w16,w16,w17 sli v9.4s,v26.4s,#7 add w13,w13,w19 sli v13.4s,v27.4s,#7 add w14,w14,w20 sli v17.4s,v28.4s,#7 eor w10,w10,w15 sli v21.4s,v29.4s,#7 eor w11,w11,w16 ext v2.16b,v2.16b,v2.16b,#8 eor w12,w12,w13 ext v6.16b,v6.16b,v6.16b,#8 eor w9,w9,w14 ext v10.16b,v10.16b,v10.16b,#8 ror w10,w10,#25 ext v14.16b,v14.16b,v14.16b,#8 ror w11,w11,#25 ext v18.16b,v18.16b,v18.16b,#8 ror w12,w12,#25 ext v22.16b,v22.16b,v22.16b,#8 ror w9,w9,#25 ext v3.16b,v3.16b,v3.16b,#12 ext v7.16b,v7.16b,v7.16b,#12 ext v11.16b,v11.16b,v11.16b,#12 ext v15.16b,v15.16b,v15.16b,#12 ext v19.16b,v19.16b,v19.16b,#12 ext v23.16b,v23.16b,v23.16b,#12 ext v1.16b,v1.16b,v1.16b,#4 ext v5.16b,v5.16b,v5.16b,#4 ext v9.16b,v9.16b,v9.16b,#4 ext v13.16b,v13.16b,v13.16b,#4 ext v17.16b,v17.16b,v17.16b,#4 ext v21.16b,v21.16b,v21.16b,#4 add v0.4s,v0.4s,v1.4s add w5,w5,w9 add v4.4s,v4.4s,v5.4s add w6,w6,w10 add v8.4s,v8.4s,v9.4s add w7,w7,w11 add v12.4s,v12.4s,v13.4s add w8,w8,w12 add v16.4s,v16.4s,v17.4s eor w17,w17,w5 add v20.4s,v20.4s,v21.4s eor w19,w19,w6 eor v3.16b,v3.16b,v0.16b eor w20,w20,w7 eor v7.16b,v7.16b,v4.16b eor w21,w21,w8 eor v11.16b,v11.16b,v8.16b ror w17,w17,#16 eor v15.16b,v15.16b,v12.16b ror w19,w19,#16 eor v19.16b,v19.16b,v16.16b ror w20,w20,#16 eor v23.16b,v23.16b,v20.16b ror w21,w21,#16 rev32 v3.8h,v3.8h add w13,w13,w17 rev32 v7.8h,v7.8h add w14,w14,w19 rev32 v11.8h,v11.8h add w15,w15,w20 rev32 v15.8h,v15.8h add w16,w16,w21 rev32 v19.8h,v19.8h eor w9,w9,w13 rev32 v23.8h,v23.8h eor w10,w10,w14 add v2.4s,v2.4s,v3.4s eor w11,w11,w15 add v6.4s,v6.4s,v7.4s eor w12,w12,w16 add v10.4s,v10.4s,v11.4s ror w9,w9,#20 add v14.4s,v14.4s,v15.4s ror w10,w10,#20 add v18.4s,v18.4s,v19.4s ror w11,w11,#20 add v22.4s,v22.4s,v23.4s ror w12,w12,#20 eor v24.16b,v1.16b,v2.16b add w5,w5,w9 eor v25.16b,v5.16b,v6.16b add w6,w6,w10 eor v26.16b,v9.16b,v10.16b add w7,w7,w11 eor v27.16b,v13.16b,v14.16b add w8,w8,w12 eor v28.16b,v17.16b,v18.16b eor w17,w17,w5 eor v29.16b,v21.16b,v22.16b eor w19,w19,w6 ushr v1.4s,v24.4s,#20 eor w20,w20,w7 ushr v5.4s,v25.4s,#20 eor w21,w21,w8 ushr v9.4s,v26.4s,#20 ror w17,w17,#24 ushr v13.4s,v27.4s,#20 ror w19,w19,#24 ushr v17.4s,v28.4s,#20 ror w20,w20,#24 ushr v21.4s,v29.4s,#20 ror w21,w21,#24 sli v1.4s,v24.4s,#12 add w13,w13,w17 sli v5.4s,v25.4s,#12 add w14,w14,w19 sli v9.4s,v26.4s,#12 add w15,w15,w20 sli v13.4s,v27.4s,#12 add w16,w16,w21 sli v17.4s,v28.4s,#12 eor w9,w9,w13 sli v21.4s,v29.4s,#12 eor w10,w10,w14 add v0.4s,v0.4s,v1.4s eor w11,w11,w15 add v4.4s,v4.4s,v5.4s eor w12,w12,w16 add v8.4s,v8.4s,v9.4s ror w9,w9,#25 add v12.4s,v12.4s,v13.4s ror w10,w10,#25 add v16.4s,v16.4s,v17.4s ror w11,w11,#25 add v20.4s,v20.4s,v21.4s ror w12,w12,#25 eor v24.16b,v3.16b,v0.16b add w5,w5,w10 eor v25.16b,v7.16b,v4.16b add w6,w6,w11 eor v26.16b,v11.16b,v8.16b add w7,w7,w12 eor v27.16b,v15.16b,v12.16b add w8,w8,w9 eor v28.16b,v19.16b,v16.16b eor w21,w21,w5 eor v29.16b,v23.16b,v20.16b eor w17,w17,w6 ushr v3.4s,v24.4s,#24 eor w19,w19,w7 ushr v7.4s,v25.4s,#24 eor w20,w20,w8 ushr v11.4s,v26.4s,#24 ror w21,w21,#16 
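// Loop_lower_neon gives the six NEON blocks their remaining five double rounds while the
// freshly reloaded scalar registers work through a complete block of their own.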
ushr v15.4s,v27.4s,#24 ror w17,w17,#16 ushr v19.4s,v28.4s,#24 ror w19,w19,#16 ushr v23.4s,v29.4s,#24 ror w20,w20,#16 sli v3.4s,v24.4s,#8 add w15,w15,w21 sli v7.4s,v25.4s,#8 add w16,w16,w17 sli v11.4s,v26.4s,#8 add w13,w13,w19 sli v15.4s,v27.4s,#8 add w14,w14,w20 sli v19.4s,v28.4s,#8 eor w10,w10,w15 sli v23.4s,v29.4s,#8 eor w11,w11,w16 add v2.4s,v2.4s,v3.4s eor w12,w12,w13 add v6.4s,v6.4s,v7.4s eor w9,w9,w14 add v10.4s,v10.4s,v11.4s ror w10,w10,#20 add v14.4s,v14.4s,v15.4s ror w11,w11,#20 add v18.4s,v18.4s,v19.4s ror w12,w12,#20 add v22.4s,v22.4s,v23.4s ror w9,w9,#20 eor v24.16b,v1.16b,v2.16b add w5,w5,w10 eor v25.16b,v5.16b,v6.16b add w6,w6,w11 eor v26.16b,v9.16b,v10.16b add w7,w7,w12 eor v27.16b,v13.16b,v14.16b add w8,w8,w9 eor v28.16b,v17.16b,v18.16b eor w21,w21,w5 eor v29.16b,v21.16b,v22.16b eor w17,w17,w6 ushr v1.4s,v24.4s,#25 eor w19,w19,w7 ushr v5.4s,v25.4s,#25 eor w20,w20,w8 ushr v9.4s,v26.4s,#25 ror w21,w21,#24 ushr v13.4s,v27.4s,#25 ror w17,w17,#24 ushr v17.4s,v28.4s,#25 ror w19,w19,#24 ushr v21.4s,v29.4s,#25 ror w20,w20,#24 sli v1.4s,v24.4s,#7 add w15,w15,w21 sli v5.4s,v25.4s,#7 add w16,w16,w17 sli v9.4s,v26.4s,#7 add w13,w13,w19 sli v13.4s,v27.4s,#7 add w14,w14,w20 sli v17.4s,v28.4s,#7 eor w10,w10,w15 sli v21.4s,v29.4s,#7 eor w11,w11,w16 ext v2.16b,v2.16b,v2.16b,#8 eor w12,w12,w13 ext v6.16b,v6.16b,v6.16b,#8 eor w9,w9,w14 ext v10.16b,v10.16b,v10.16b,#8 ror w10,w10,#25 ext v14.16b,v14.16b,v14.16b,#8 ror w11,w11,#25 ext v18.16b,v18.16b,v18.16b,#8 ror w12,w12,#25 ext v22.16b,v22.16b,v22.16b,#8 ror w9,w9,#25 ext v3.16b,v3.16b,v3.16b,#4 ext v7.16b,v7.16b,v7.16b,#4 ext v11.16b,v11.16b,v11.16b,#4 ext v15.16b,v15.16b,v15.16b,#4 ext v19.16b,v19.16b,v19.16b,#4 ext v23.16b,v23.16b,v23.16b,#4 ext v1.16b,v1.16b,v1.16b,#12 ext v5.16b,v5.16b,v5.16b,#12 ext v9.16b,v9.16b,v9.16b,#12 ext v13.16b,v13.16b,v13.16b,#12 ext v17.16b,v17.16b,v17.16b,#12 ext v21.16b,v21.16b,v21.16b,#12 cbnz x4,.Loop_lower_neon add w5,w5,w22 // accumulate key block ldp q24,q25,[sp,#0] add x6,x6,x22,lsr#32 ldp q26,q27,[sp,#32] add w7,w7,w23 ldp q28,q29,[sp,#64] add x8,x8,x23,lsr#32 add v0.4s,v0.4s,v24.4s add w9,w9,w24 add v4.4s,v4.4s,v24.4s add x10,x10,x24,lsr#32 add v8.4s,v8.4s,v24.4s add w11,w11,w25 add v12.4s,v12.4s,v24.4s add x12,x12,x25,lsr#32 add v16.4s,v16.4s,v24.4s add w13,w13,w26 add v20.4s,v20.4s,v24.4s add x14,x14,x26,lsr#32 add v2.4s,v2.4s,v26.4s add w15,w15,w27 add v6.4s,v6.4s,v26.4s add x16,x16,x27,lsr#32 add v10.4s,v10.4s,v26.4s add w17,w17,w28 add v14.4s,v14.4s,v26.4s add x19,x19,x28,lsr#32 add v18.4s,v18.4s,v26.4s add w20,w20,w30 add v22.4s,v22.4s,v26.4s add x21,x21,x30,lsr#32 add v19.4s,v19.4s,v31.4s // +4 add x5,x5,x6,lsl#32 // pack add v23.4s,v23.4s,v31.4s // +4 add x7,x7,x8,lsl#32 add v3.4s,v3.4s,v27.4s ldp x6,x8,[x1,#0] // load input add v7.4s,v7.4s,v28.4s add x9,x9,x10,lsl#32 add v11.4s,v11.4s,v29.4s add x11,x11,x12,lsl#32 add v15.4s,v15.4s,v30.4s ldp x10,x12,[x1,#16] add v19.4s,v19.4s,v27.4s add x13,x13,x14,lsl#32 add v23.4s,v23.4s,v28.4s add x15,x15,x16,lsl#32 add v1.4s,v1.4s,v25.4s ldp x14,x16,[x1,#32] add v5.4s,v5.4s,v25.4s add x17,x17,x19,lsl#32 add v9.4s,v9.4s,v25.4s add x20,x20,x21,lsl#32 add v13.4s,v13.4s,v25.4s ldp x19,x21,[x1,#48] add v17.4s,v17.4s,v25.4s add x1,x1,#64 add v21.4s,v21.4s,v25.4s #ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 rev x9,x9 rev x11,x11 rev x13,x13 rev x15,x15 rev x17,x17 rev x20,x20 #endif ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 eor x5,x5,x6 eor x7,x7,x8 eor x9,x9,x10 eor x11,x11,x12 eor x13,x13,x14 eor v0.16b,v0.16b,v24.16b eor x15,x15,x16 eor 
v1.16b,v1.16b,v25.16b eor x17,x17,x19 eor v2.16b,v2.16b,v26.16b eor x20,x20,x21 eor v3.16b,v3.16b,v27.16b ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 stp x5,x7,[x0,#0] // store output add x28,x28,#7 // increment counter stp x9,x11,[x0,#16] stp x13,x15,[x0,#32] stp x17,x20,[x0,#48] add x0,x0,#64 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 eor v4.16b,v4.16b,v24.16b eor v5.16b,v5.16b,v25.16b eor v6.16b,v6.16b,v26.16b eor v7.16b,v7.16b,v27.16b st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 eor v8.16b,v8.16b,v0.16b ldp q24,q25,[sp,#0] eor v9.16b,v9.16b,v1.16b ldp q26,q27,[sp,#32] eor v10.16b,v10.16b,v2.16b eor v11.16b,v11.16b,v3.16b st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 eor v12.16b,v12.16b,v4.16b eor v13.16b,v13.16b,v5.16b eor v14.16b,v14.16b,v6.16b eor v15.16b,v15.16b,v7.16b st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 eor v16.16b,v16.16b,v8.16b eor v17.16b,v17.16b,v9.16b eor v18.16b,v18.16b,v10.16b eor v19.16b,v19.16b,v11.16b st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 shl v0.4s,v31.4s,#1 // 4 -> 8 eor v20.16b,v20.16b,v12.16b eor v21.16b,v21.16b,v13.16b eor v22.16b,v22.16b,v14.16b eor v23.16b,v23.16b,v15.16b st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 add v27.4s,v27.4s,v0.4s // += 8 add v28.4s,v28.4s,v0.4s add v29.4s,v29.4s,v0.4s add v30.4s,v30.4s,v0.4s b.hs .Loop_outer_512_neon adds x2,x2,#512 ushr v0.4s,v31.4s,#2 // 4 -> 1 ldp d8,d9,[sp,#128+0] // meet ABI requirements ldp d10,d11,[sp,#128+16] ldp d12,d13,[sp,#128+32] ldp d14,d15,[sp,#128+48] stp q24,q31,[sp,#0] // wipe off-load area stp q24,q31,[sp,#32] stp q24,q31,[sp,#64] b.eq .Ldone_512_neon cmp x2,#192 sub v27.4s,v27.4s,v0.4s // -= 1 sub v28.4s,v28.4s,v0.4s sub v29.4s,v29.4s,v0.4s add sp,sp,#128 b.hs .Loop_outer_neon eor v25.16b,v25.16b,v25.16b eor v26.16b,v26.16b,v26.16b eor v27.16b,v27.16b,v27.16b eor v28.16b,v28.16b,v28.16b eor v29.16b,v29.16b,v29.16b eor v30.16b,v30.16b,v30.16b b .Loop_outer .Ldone_512_neon: ldp x19,x20,[x29,#16] add sp,sp,#128+64 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 AARCH64_VALIDATE_LINK_REGISTER ret .size ChaCha20_512_neon,.-ChaCha20_512_neon #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) ring-0.17.14/pregenerated/chacha-armv8-win64.S000064400000000000000000001164121046102023000167420ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) .section .rodata .align 5 Lsigma: .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral Lone: .long 1,0,0,0 .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .text .globl ChaCha20_ctr32_nohw .def ChaCha20_ctr32_nohw .type 32 .endef .align 5 ChaCha20_ctr32_nohw: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-96]! 
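// Scalar-only fallback: one 64-byte block per Loop_outer pass; Ltail/Loop_tail handle a final
// partial block by XOR-ing byte by byte against keystream spilled to the stack.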
add x29,sp,#0 adrp x5,Lsigma add x5,x5,:lo12:Lsigma stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] sub sp,sp,#64 ldp x22,x23,[x5] // load sigma ldp x24,x25,[x3] // load key ldp x26,x27,[x3,#16] ldp x28,x30,[x4] // load counter #ifdef __AARCH64EB__ ror x24,x24,#32 ror x25,x25,#32 ror x26,x26,#32 ror x27,x27,#32 ror x28,x28,#32 ror x30,x30,#32 #endif Loop_outer: mov w5,w22 // unpack key block lsr x6,x22,#32 mov w7,w23 lsr x8,x23,#32 mov w9,w24 lsr x10,x24,#32 mov w11,w25 lsr x12,x25,#32 mov w13,w26 lsr x14,x26,#32 mov w15,w27 lsr x16,x27,#32 mov w17,w28 lsr x19,x28,#32 mov w20,w30 lsr x21,x30,#32 mov x4,#10 subs x2,x2,#64 Loop: sub x4,x4,#1 add w5,w5,w9 add w6,w6,w10 add w7,w7,w11 add w8,w8,w12 eor w17,w17,w5 eor w19,w19,w6 eor w20,w20,w7 eor w21,w21,w8 ror w17,w17,#16 ror w19,w19,#16 ror w20,w20,#16 ror w21,w21,#16 add w13,w13,w17 add w14,w14,w19 add w15,w15,w20 add w16,w16,w21 eor w9,w9,w13 eor w10,w10,w14 eor w11,w11,w15 eor w12,w12,w16 ror w9,w9,#20 ror w10,w10,#20 ror w11,w11,#20 ror w12,w12,#20 add w5,w5,w9 add w6,w6,w10 add w7,w7,w11 add w8,w8,w12 eor w17,w17,w5 eor w19,w19,w6 eor w20,w20,w7 eor w21,w21,w8 ror w17,w17,#24 ror w19,w19,#24 ror w20,w20,#24 ror w21,w21,#24 add w13,w13,w17 add w14,w14,w19 add w15,w15,w20 add w16,w16,w21 eor w9,w9,w13 eor w10,w10,w14 eor w11,w11,w15 eor w12,w12,w16 ror w9,w9,#25 ror w10,w10,#25 ror w11,w11,#25 ror w12,w12,#25 add w5,w5,w10 add w6,w6,w11 add w7,w7,w12 add w8,w8,w9 eor w21,w21,w5 eor w17,w17,w6 eor w19,w19,w7 eor w20,w20,w8 ror w21,w21,#16 ror w17,w17,#16 ror w19,w19,#16 ror w20,w20,#16 add w15,w15,w21 add w16,w16,w17 add w13,w13,w19 add w14,w14,w20 eor w10,w10,w15 eor w11,w11,w16 eor w12,w12,w13 eor w9,w9,w14 ror w10,w10,#20 ror w11,w11,#20 ror w12,w12,#20 ror w9,w9,#20 add w5,w5,w10 add w6,w6,w11 add w7,w7,w12 add w8,w8,w9 eor w21,w21,w5 eor w17,w17,w6 eor w19,w19,w7 eor w20,w20,w8 ror w21,w21,#24 ror w17,w17,#24 ror w19,w19,#24 ror w20,w20,#24 add w15,w15,w21 add w16,w16,w17 add w13,w13,w19 add w14,w14,w20 eor w10,w10,w15 eor w11,w11,w16 eor w12,w12,w13 eor w9,w9,w14 ror w10,w10,#25 ror w11,w11,#25 ror w12,w12,#25 ror w9,w9,#25 cbnz x4,Loop add w5,w5,w22 // accumulate key block add x6,x6,x22,lsr#32 add w7,w7,w23 add x8,x8,x23,lsr#32 add w9,w9,w24 add x10,x10,x24,lsr#32 add w11,w11,w25 add x12,x12,x25,lsr#32 add w13,w13,w26 add x14,x14,x26,lsr#32 add w15,w15,w27 add x16,x16,x27,lsr#32 add w17,w17,w28 add x19,x19,x28,lsr#32 add w20,w20,w30 add x21,x21,x30,lsr#32 b.lo Ltail add x5,x5,x6,lsl#32 // pack add x7,x7,x8,lsl#32 ldp x6,x8,[x1,#0] // load input add x9,x9,x10,lsl#32 add x11,x11,x12,lsl#32 ldp x10,x12,[x1,#16] add x13,x13,x14,lsl#32 add x15,x15,x16,lsl#32 ldp x14,x16,[x1,#32] add x17,x17,x19,lsl#32 add x20,x20,x21,lsl#32 ldp x19,x21,[x1,#48] add x1,x1,#64 #ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 rev x9,x9 rev x11,x11 rev x13,x13 rev x15,x15 rev x17,x17 rev x20,x20 #endif eor x5,x5,x6 eor x7,x7,x8 eor x9,x9,x10 eor x11,x11,x12 eor x13,x13,x14 eor x15,x15,x16 eor x17,x17,x19 eor x20,x20,x21 stp x5,x7,[x0,#0] // store output add x28,x28,#1 // increment counter stp x9,x11,[x0,#16] stp x13,x15,[x0,#32] stp x17,x20,[x0,#48] add x0,x0,#64 b.hi Loop_outer ldp x19,x20,[x29,#16] add sp,sp,#64 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 AARCH64_VALIDATE_LINK_REGISTER ret .align 4 Ltail: add x2,x2,#64 Less_than_64: sub x0,x0,#1 add x1,x1,x2 add x0,x0,x2 add x4,sp,x2 neg x2,x2 add x5,x5,x6,lsl#32 // pack add x7,x7,x8,lsl#32 add 
x9,x9,x10,lsl#32 add x11,x11,x12,lsl#32 add x13,x13,x14,lsl#32 add x15,x15,x16,lsl#32 add x17,x17,x19,lsl#32 add x20,x20,x21,lsl#32 #ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 rev x9,x9 rev x11,x11 rev x13,x13 rev x15,x15 rev x17,x17 rev x20,x20 #endif stp x5,x7,[sp,#0] stp x9,x11,[sp,#16] stp x13,x15,[sp,#32] stp x17,x20,[sp,#48] Loop_tail: ldrb w10,[x1,x2] ldrb w11,[x4,x2] add x2,x2,#1 eor w10,w10,w11 strb w10,[x0,x2] cbnz x2,Loop_tail stp xzr,xzr,[sp,#0] stp xzr,xzr,[sp,#16] stp xzr,xzr,[sp,#32] stp xzr,xzr,[sp,#48] ldp x19,x20,[x29,#16] add sp,sp,#64 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 AARCH64_VALIDATE_LINK_REGISTER ret .globl ChaCha20_ctr32_neon .def ChaCha20_ctr32_neon .type 32 .endef .align 5 ChaCha20_ctr32_neon: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-96]! add x29,sp,#0 adrp x5,Lsigma add x5,x5,:lo12:Lsigma stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] cmp x2,#512 b.hs L512_or_more_neon sub sp,sp,#64 ldp x22,x23,[x5] // load sigma ld1 {v24.4s},[x5],#16 ldp x24,x25,[x3] // load key ldp x26,x27,[x3,#16] ld1 {v25.4s,v26.4s},[x3] ldp x28,x30,[x4] // load counter ld1 {v27.4s},[x4] ld1 {v31.4s},[x5] #ifdef __AARCH64EB__ rev64 v24.4s,v24.4s ror x24,x24,#32 ror x25,x25,#32 ror x26,x26,#32 ror x27,x27,#32 ror x28,x28,#32 ror x30,x30,#32 #endif add v27.4s,v27.4s,v31.4s // += 1 add v28.4s,v27.4s,v31.4s add v29.4s,v28.4s,v31.4s shl v31.4s,v31.4s,#2 // 1 -> 4 Loop_outer_neon: mov w5,w22 // unpack key block lsr x6,x22,#32 mov v0.16b,v24.16b mov w7,w23 lsr x8,x23,#32 mov v4.16b,v24.16b mov w9,w24 lsr x10,x24,#32 mov v16.16b,v24.16b mov w11,w25 mov v1.16b,v25.16b lsr x12,x25,#32 mov v5.16b,v25.16b mov w13,w26 mov v17.16b,v25.16b lsr x14,x26,#32 mov v3.16b,v27.16b mov w15,w27 mov v7.16b,v28.16b lsr x16,x27,#32 mov v19.16b,v29.16b mov w17,w28 mov v2.16b,v26.16b lsr x19,x28,#32 mov v6.16b,v26.16b mov w20,w30 mov v18.16b,v26.16b lsr x21,x30,#32 mov x4,#10 subs x2,x2,#256 Loop_neon: sub x4,x4,#1 add v0.4s,v0.4s,v1.4s add w5,w5,w9 add v4.4s,v4.4s,v5.4s add w6,w6,w10 add v16.4s,v16.4s,v17.4s add w7,w7,w11 eor v3.16b,v3.16b,v0.16b add w8,w8,w12 eor v7.16b,v7.16b,v4.16b eor w17,w17,w5 eor v19.16b,v19.16b,v16.16b eor w19,w19,w6 rev32 v3.8h,v3.8h eor w20,w20,w7 rev32 v7.8h,v7.8h eor w21,w21,w8 rev32 v19.8h,v19.8h ror w17,w17,#16 add v2.4s,v2.4s,v3.4s ror w19,w19,#16 add v6.4s,v6.4s,v7.4s ror w20,w20,#16 add v18.4s,v18.4s,v19.4s ror w21,w21,#16 eor v20.16b,v1.16b,v2.16b add w13,w13,w17 eor v21.16b,v5.16b,v6.16b add w14,w14,w19 eor v22.16b,v17.16b,v18.16b add w15,w15,w20 ushr v1.4s,v20.4s,#20 add w16,w16,w21 ushr v5.4s,v21.4s,#20 eor w9,w9,w13 ushr v17.4s,v22.4s,#20 eor w10,w10,w14 sli v1.4s,v20.4s,#12 eor w11,w11,w15 sli v5.4s,v21.4s,#12 eor w12,w12,w16 sli v17.4s,v22.4s,#12 ror w9,w9,#20 add v0.4s,v0.4s,v1.4s ror w10,w10,#20 add v4.4s,v4.4s,v5.4s ror w11,w11,#20 add v16.4s,v16.4s,v17.4s ror w12,w12,#20 eor v20.16b,v3.16b,v0.16b add w5,w5,w9 eor v21.16b,v7.16b,v4.16b add w6,w6,w10 eor v22.16b,v19.16b,v16.16b add w7,w7,w11 ushr v3.4s,v20.4s,#24 add w8,w8,w12 ushr v7.4s,v21.4s,#24 eor w17,w17,w5 ushr v19.4s,v22.4s,#24 eor w19,w19,w6 sli v3.4s,v20.4s,#8 eor w20,w20,w7 sli v7.4s,v21.4s,#8 eor w21,w21,w8 sli v19.4s,v22.4s,#8 ror w17,w17,#24 add v2.4s,v2.4s,v3.4s ror w19,w19,#24 add v6.4s,v6.4s,v7.4s ror w20,w20,#24 add v18.4s,v18.4s,v19.4s ror w21,w21,#24 eor v20.16b,v1.16b,v2.16b add w13,w13,w17 eor v21.16b,v5.16b,v6.16b add w14,w14,w19 eor v22.16b,v17.16b,v18.16b add w15,w15,w20 
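// Loop_neon (lengths below 512): one scalar block interleaved with three NEON blocks
// (v0-v3, v4-v7, v16-v19), i.e. 256 bytes and a counter step of 4 per outer iteration.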
ushr v1.4s,v20.4s,#25 add w16,w16,w21 ushr v5.4s,v21.4s,#25 eor w9,w9,w13 ushr v17.4s,v22.4s,#25 eor w10,w10,w14 sli v1.4s,v20.4s,#7 eor w11,w11,w15 sli v5.4s,v21.4s,#7 eor w12,w12,w16 sli v17.4s,v22.4s,#7 ror w9,w9,#25 ext v2.16b,v2.16b,v2.16b,#8 ror w10,w10,#25 ext v6.16b,v6.16b,v6.16b,#8 ror w11,w11,#25 ext v18.16b,v18.16b,v18.16b,#8 ror w12,w12,#25 ext v3.16b,v3.16b,v3.16b,#12 ext v7.16b,v7.16b,v7.16b,#12 ext v19.16b,v19.16b,v19.16b,#12 ext v1.16b,v1.16b,v1.16b,#4 ext v5.16b,v5.16b,v5.16b,#4 ext v17.16b,v17.16b,v17.16b,#4 add v0.4s,v0.4s,v1.4s add w5,w5,w10 add v4.4s,v4.4s,v5.4s add w6,w6,w11 add v16.4s,v16.4s,v17.4s add w7,w7,w12 eor v3.16b,v3.16b,v0.16b add w8,w8,w9 eor v7.16b,v7.16b,v4.16b eor w21,w21,w5 eor v19.16b,v19.16b,v16.16b eor w17,w17,w6 rev32 v3.8h,v3.8h eor w19,w19,w7 rev32 v7.8h,v7.8h eor w20,w20,w8 rev32 v19.8h,v19.8h ror w21,w21,#16 add v2.4s,v2.4s,v3.4s ror w17,w17,#16 add v6.4s,v6.4s,v7.4s ror w19,w19,#16 add v18.4s,v18.4s,v19.4s ror w20,w20,#16 eor v20.16b,v1.16b,v2.16b add w15,w15,w21 eor v21.16b,v5.16b,v6.16b add w16,w16,w17 eor v22.16b,v17.16b,v18.16b add w13,w13,w19 ushr v1.4s,v20.4s,#20 add w14,w14,w20 ushr v5.4s,v21.4s,#20 eor w10,w10,w15 ushr v17.4s,v22.4s,#20 eor w11,w11,w16 sli v1.4s,v20.4s,#12 eor w12,w12,w13 sli v5.4s,v21.4s,#12 eor w9,w9,w14 sli v17.4s,v22.4s,#12 ror w10,w10,#20 add v0.4s,v0.4s,v1.4s ror w11,w11,#20 add v4.4s,v4.4s,v5.4s ror w12,w12,#20 add v16.4s,v16.4s,v17.4s ror w9,w9,#20 eor v20.16b,v3.16b,v0.16b add w5,w5,w10 eor v21.16b,v7.16b,v4.16b add w6,w6,w11 eor v22.16b,v19.16b,v16.16b add w7,w7,w12 ushr v3.4s,v20.4s,#24 add w8,w8,w9 ushr v7.4s,v21.4s,#24 eor w21,w21,w5 ushr v19.4s,v22.4s,#24 eor w17,w17,w6 sli v3.4s,v20.4s,#8 eor w19,w19,w7 sli v7.4s,v21.4s,#8 eor w20,w20,w8 sli v19.4s,v22.4s,#8 ror w21,w21,#24 add v2.4s,v2.4s,v3.4s ror w17,w17,#24 add v6.4s,v6.4s,v7.4s ror w19,w19,#24 add v18.4s,v18.4s,v19.4s ror w20,w20,#24 eor v20.16b,v1.16b,v2.16b add w15,w15,w21 eor v21.16b,v5.16b,v6.16b add w16,w16,w17 eor v22.16b,v17.16b,v18.16b add w13,w13,w19 ushr v1.4s,v20.4s,#25 add w14,w14,w20 ushr v5.4s,v21.4s,#25 eor w10,w10,w15 ushr v17.4s,v22.4s,#25 eor w11,w11,w16 sli v1.4s,v20.4s,#7 eor w12,w12,w13 sli v5.4s,v21.4s,#7 eor w9,w9,w14 sli v17.4s,v22.4s,#7 ror w10,w10,#25 ext v2.16b,v2.16b,v2.16b,#8 ror w11,w11,#25 ext v6.16b,v6.16b,v6.16b,#8 ror w12,w12,#25 ext v18.16b,v18.16b,v18.16b,#8 ror w9,w9,#25 ext v3.16b,v3.16b,v3.16b,#4 ext v7.16b,v7.16b,v7.16b,#4 ext v19.16b,v19.16b,v19.16b,#4 ext v1.16b,v1.16b,v1.16b,#12 ext v5.16b,v5.16b,v5.16b,#12 ext v17.16b,v17.16b,v17.16b,#12 cbnz x4,Loop_neon add w5,w5,w22 // accumulate key block add v0.4s,v0.4s,v24.4s add x6,x6,x22,lsr#32 add v4.4s,v4.4s,v24.4s add w7,w7,w23 add v16.4s,v16.4s,v24.4s add x8,x8,x23,lsr#32 add v2.4s,v2.4s,v26.4s add w9,w9,w24 add v6.4s,v6.4s,v26.4s add x10,x10,x24,lsr#32 add v18.4s,v18.4s,v26.4s add w11,w11,w25 add v3.4s,v3.4s,v27.4s add x12,x12,x25,lsr#32 add w13,w13,w26 add v7.4s,v7.4s,v28.4s add x14,x14,x26,lsr#32 add w15,w15,w27 add v19.4s,v19.4s,v29.4s add x16,x16,x27,lsr#32 add w17,w17,w28 add v1.4s,v1.4s,v25.4s add x19,x19,x28,lsr#32 add w20,w20,w30 add v5.4s,v5.4s,v25.4s add x21,x21,x30,lsr#32 add v17.4s,v17.4s,v25.4s b.lo Ltail_neon add x5,x5,x6,lsl#32 // pack add x7,x7,x8,lsl#32 ldp x6,x8,[x1,#0] // load input add x9,x9,x10,lsl#32 add x11,x11,x12,lsl#32 ldp x10,x12,[x1,#16] add x13,x13,x14,lsl#32 add x15,x15,x16,lsl#32 ldp x14,x16,[x1,#32] add x17,x17,x19,lsl#32 add x20,x20,x21,lsl#32 ldp x19,x21,[x1,#48] add x1,x1,#64 #ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 rev x9,x9 
rev x11,x11 rev x13,x13 rev x15,x15 rev x17,x17 rev x20,x20 #endif ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 eor x5,x5,x6 eor x7,x7,x8 eor x9,x9,x10 eor x11,x11,x12 eor x13,x13,x14 eor v0.16b,v0.16b,v20.16b eor x15,x15,x16 eor v1.16b,v1.16b,v21.16b eor x17,x17,x19 eor v2.16b,v2.16b,v22.16b eor x20,x20,x21 eor v3.16b,v3.16b,v23.16b ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 stp x5,x7,[x0,#0] // store output add x28,x28,#4 // increment counter stp x9,x11,[x0,#16] add v27.4s,v27.4s,v31.4s // += 4 stp x13,x15,[x0,#32] add v28.4s,v28.4s,v31.4s stp x17,x20,[x0,#48] add v29.4s,v29.4s,v31.4s add x0,x0,#64 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 eor v4.16b,v4.16b,v20.16b eor v5.16b,v5.16b,v21.16b eor v6.16b,v6.16b,v22.16b eor v7.16b,v7.16b,v23.16b st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 eor v16.16b,v16.16b,v0.16b eor v17.16b,v17.16b,v1.16b eor v18.16b,v18.16b,v2.16b eor v19.16b,v19.16b,v3.16b st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 b.hi Loop_outer_neon ldp x19,x20,[x29,#16] add sp,sp,#64 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 AARCH64_VALIDATE_LINK_REGISTER ret Ltail_neon: add x2,x2,#256 cmp x2,#64 b.lo Less_than_64 add x5,x5,x6,lsl#32 // pack add x7,x7,x8,lsl#32 ldp x6,x8,[x1,#0] // load input add x9,x9,x10,lsl#32 add x11,x11,x12,lsl#32 ldp x10,x12,[x1,#16] add x13,x13,x14,lsl#32 add x15,x15,x16,lsl#32 ldp x14,x16,[x1,#32] add x17,x17,x19,lsl#32 add x20,x20,x21,lsl#32 ldp x19,x21,[x1,#48] add x1,x1,#64 #ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 rev x9,x9 rev x11,x11 rev x13,x13 rev x15,x15 rev x17,x17 rev x20,x20 #endif eor x5,x5,x6 eor x7,x7,x8 eor x9,x9,x10 eor x11,x11,x12 eor x13,x13,x14 eor x15,x15,x16 eor x17,x17,x19 eor x20,x20,x21 stp x5,x7,[x0,#0] // store output add x28,x28,#4 // increment counter stp x9,x11,[x0,#16] stp x13,x15,[x0,#32] stp x17,x20,[x0,#48] add x0,x0,#64 b.eq Ldone_neon sub x2,x2,#64 cmp x2,#64 b.lo Less_than_128 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 eor v0.16b,v0.16b,v20.16b eor v1.16b,v1.16b,v21.16b eor v2.16b,v2.16b,v22.16b eor v3.16b,v3.16b,v23.16b st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 b.eq Ldone_neon sub x2,x2,#64 cmp x2,#64 b.lo Less_than_192 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 eor v4.16b,v4.16b,v20.16b eor v5.16b,v5.16b,v21.16b eor v6.16b,v6.16b,v22.16b eor v7.16b,v7.16b,v23.16b st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 b.eq Ldone_neon sub x2,x2,#64 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] b Last_neon Less_than_128: st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] b Last_neon Less_than_192: st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] b Last_neon .align 4 Last_neon: sub x0,x0,#1 add x1,x1,x2 add x0,x0,x2 add x4,sp,x2 neg x2,x2 Loop_tail_neon: ldrb w10,[x1,x2] ldrb w11,[x4,x2] add x2,x2,#1 eor w10,w10,w11 strb w10,[x0,x2] cbnz x2,Loop_tail_neon stp xzr,xzr,[sp,#0] stp xzr,xzr,[sp,#16] stp xzr,xzr,[sp,#32] stp xzr,xzr,[sp,#48] Ldone_neon: ldp x19,x20,[x29,#16] add sp,sp,#64 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 AARCH64_VALIDATE_LINK_REGISTER ret .def ChaCha20_512_neon .type 32 .endef .align 5 ChaCha20_512_neon: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-96]! 
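// Windows build of the >=512-byte routine; apart from the unprefixed labels and .def/.endef
// metadata it mirrors the ELF version above.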
add x29,sp,#0 adrp x5,Lsigma add x5,x5,:lo12:Lsigma stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] L512_or_more_neon: sub sp,sp,#128+64 ldp x22,x23,[x5] // load sigma ld1 {v24.4s},[x5],#16 ldp x24,x25,[x3] // load key ldp x26,x27,[x3,#16] ld1 {v25.4s,v26.4s},[x3] ldp x28,x30,[x4] // load counter ld1 {v27.4s},[x4] ld1 {v31.4s},[x5] #ifdef __AARCH64EB__ rev64 v24.4s,v24.4s ror x24,x24,#32 ror x25,x25,#32 ror x26,x26,#32 ror x27,x27,#32 ror x28,x28,#32 ror x30,x30,#32 #endif add v27.4s,v27.4s,v31.4s // += 1 stp q24,q25,[sp,#0] // off-load key block, invariant part add v27.4s,v27.4s,v31.4s // not typo str q26,[sp,#32] add v28.4s,v27.4s,v31.4s add v29.4s,v28.4s,v31.4s add v30.4s,v29.4s,v31.4s shl v31.4s,v31.4s,#2 // 1 -> 4 stp d8,d9,[sp,#128+0] // meet ABI requirements stp d10,d11,[sp,#128+16] stp d12,d13,[sp,#128+32] stp d14,d15,[sp,#128+48] sub x2,x2,#512 // not typo Loop_outer_512_neon: mov v0.16b,v24.16b mov v4.16b,v24.16b mov v8.16b,v24.16b mov v12.16b,v24.16b mov v16.16b,v24.16b mov v20.16b,v24.16b mov v1.16b,v25.16b mov w5,w22 // unpack key block mov v5.16b,v25.16b lsr x6,x22,#32 mov v9.16b,v25.16b mov w7,w23 mov v13.16b,v25.16b lsr x8,x23,#32 mov v17.16b,v25.16b mov w9,w24 mov v21.16b,v25.16b lsr x10,x24,#32 mov v3.16b,v27.16b mov w11,w25 mov v7.16b,v28.16b lsr x12,x25,#32 mov v11.16b,v29.16b mov w13,w26 mov v15.16b,v30.16b lsr x14,x26,#32 mov v2.16b,v26.16b mov w15,w27 mov v6.16b,v26.16b lsr x16,x27,#32 add v19.4s,v3.4s,v31.4s // +4 mov w17,w28 add v23.4s,v7.4s,v31.4s // +4 lsr x19,x28,#32 mov v10.16b,v26.16b mov w20,w30 mov v14.16b,v26.16b lsr x21,x30,#32 mov v18.16b,v26.16b stp q27,q28,[sp,#48] // off-load key block, variable part mov v22.16b,v26.16b str q29,[sp,#80] mov x4,#5 subs x2,x2,#512 Loop_upper_neon: sub x4,x4,#1 add v0.4s,v0.4s,v1.4s add w5,w5,w9 add v4.4s,v4.4s,v5.4s add w6,w6,w10 add v8.4s,v8.4s,v9.4s add w7,w7,w11 add v12.4s,v12.4s,v13.4s add w8,w8,w12 add v16.4s,v16.4s,v17.4s eor w17,w17,w5 add v20.4s,v20.4s,v21.4s eor w19,w19,w6 eor v3.16b,v3.16b,v0.16b eor w20,w20,w7 eor v7.16b,v7.16b,v4.16b eor w21,w21,w8 eor v11.16b,v11.16b,v8.16b ror w17,w17,#16 eor v15.16b,v15.16b,v12.16b ror w19,w19,#16 eor v19.16b,v19.16b,v16.16b ror w20,w20,#16 eor v23.16b,v23.16b,v20.16b ror w21,w21,#16 rev32 v3.8h,v3.8h add w13,w13,w17 rev32 v7.8h,v7.8h add w14,w14,w19 rev32 v11.8h,v11.8h add w15,w15,w20 rev32 v15.8h,v15.8h add w16,w16,w21 rev32 v19.8h,v19.8h eor w9,w9,w13 rev32 v23.8h,v23.8h eor w10,w10,w14 add v2.4s,v2.4s,v3.4s eor w11,w11,w15 add v6.4s,v6.4s,v7.4s eor w12,w12,w16 add v10.4s,v10.4s,v11.4s ror w9,w9,#20 add v14.4s,v14.4s,v15.4s ror w10,w10,#20 add v18.4s,v18.4s,v19.4s ror w11,w11,#20 add v22.4s,v22.4s,v23.4s ror w12,w12,#20 eor v24.16b,v1.16b,v2.16b add w5,w5,w9 eor v25.16b,v5.16b,v6.16b add w6,w6,w10 eor v26.16b,v9.16b,v10.16b add w7,w7,w11 eor v27.16b,v13.16b,v14.16b add w8,w8,w12 eor v28.16b,v17.16b,v18.16b eor w17,w17,w5 eor v29.16b,v21.16b,v22.16b eor w19,w19,w6 ushr v1.4s,v24.4s,#20 eor w20,w20,w7 ushr v5.4s,v25.4s,#20 eor w21,w21,w8 ushr v9.4s,v26.4s,#20 ror w17,w17,#24 ushr v13.4s,v27.4s,#20 ror w19,w19,#24 ushr v17.4s,v28.4s,#20 ror w20,w20,#24 ushr v21.4s,v29.4s,#20 ror w21,w21,#24 sli v1.4s,v24.4s,#12 add w13,w13,w17 sli v5.4s,v25.4s,#12 add w14,w14,w19 sli v9.4s,v26.4s,#12 add w15,w15,w20 sli v13.4s,v27.4s,#12 add w16,w16,w21 sli v17.4s,v28.4s,#12 eor w9,w9,w13 sli v21.4s,v29.4s,#12 eor w10,w10,w14 add v0.4s,v0.4s,v1.4s eor w11,w11,w15 add v4.4s,v4.4s,v5.4s eor w12,w12,w16 add v8.4s,v8.4s,v9.4s ror 
w9,w9,#25 add v12.4s,v12.4s,v13.4s ror w10,w10,#25 add v16.4s,v16.4s,v17.4s ror w11,w11,#25 add v20.4s,v20.4s,v21.4s ror w12,w12,#25 eor v24.16b,v3.16b,v0.16b add w5,w5,w10 eor v25.16b,v7.16b,v4.16b add w6,w6,w11 eor v26.16b,v11.16b,v8.16b add w7,w7,w12 eor v27.16b,v15.16b,v12.16b add w8,w8,w9 eor v28.16b,v19.16b,v16.16b eor w21,w21,w5 eor v29.16b,v23.16b,v20.16b eor w17,w17,w6 ushr v3.4s,v24.4s,#24 eor w19,w19,w7 ushr v7.4s,v25.4s,#24 eor w20,w20,w8 ushr v11.4s,v26.4s,#24 ror w21,w21,#16 ushr v15.4s,v27.4s,#24 ror w17,w17,#16 ushr v19.4s,v28.4s,#24 ror w19,w19,#16 ushr v23.4s,v29.4s,#24 ror w20,w20,#16 sli v3.4s,v24.4s,#8 add w15,w15,w21 sli v7.4s,v25.4s,#8 add w16,w16,w17 sli v11.4s,v26.4s,#8 add w13,w13,w19 sli v15.4s,v27.4s,#8 add w14,w14,w20 sli v19.4s,v28.4s,#8 eor w10,w10,w15 sli v23.4s,v29.4s,#8 eor w11,w11,w16 add v2.4s,v2.4s,v3.4s eor w12,w12,w13 add v6.4s,v6.4s,v7.4s eor w9,w9,w14 add v10.4s,v10.4s,v11.4s ror w10,w10,#20 add v14.4s,v14.4s,v15.4s ror w11,w11,#20 add v18.4s,v18.4s,v19.4s ror w12,w12,#20 add v22.4s,v22.4s,v23.4s ror w9,w9,#20 eor v24.16b,v1.16b,v2.16b add w5,w5,w10 eor v25.16b,v5.16b,v6.16b add w6,w6,w11 eor v26.16b,v9.16b,v10.16b add w7,w7,w12 eor v27.16b,v13.16b,v14.16b add w8,w8,w9 eor v28.16b,v17.16b,v18.16b eor w21,w21,w5 eor v29.16b,v21.16b,v22.16b eor w17,w17,w6 ushr v1.4s,v24.4s,#25 eor w19,w19,w7 ushr v5.4s,v25.4s,#25 eor w20,w20,w8 ushr v9.4s,v26.4s,#25 ror w21,w21,#24 ushr v13.4s,v27.4s,#25 ror w17,w17,#24 ushr v17.4s,v28.4s,#25 ror w19,w19,#24 ushr v21.4s,v29.4s,#25 ror w20,w20,#24 sli v1.4s,v24.4s,#7 add w15,w15,w21 sli v5.4s,v25.4s,#7 add w16,w16,w17 sli v9.4s,v26.4s,#7 add w13,w13,w19 sli v13.4s,v27.4s,#7 add w14,w14,w20 sli v17.4s,v28.4s,#7 eor w10,w10,w15 sli v21.4s,v29.4s,#7 eor w11,w11,w16 ext v2.16b,v2.16b,v2.16b,#8 eor w12,w12,w13 ext v6.16b,v6.16b,v6.16b,#8 eor w9,w9,w14 ext v10.16b,v10.16b,v10.16b,#8 ror w10,w10,#25 ext v14.16b,v14.16b,v14.16b,#8 ror w11,w11,#25 ext v18.16b,v18.16b,v18.16b,#8 ror w12,w12,#25 ext v22.16b,v22.16b,v22.16b,#8 ror w9,w9,#25 ext v3.16b,v3.16b,v3.16b,#12 ext v7.16b,v7.16b,v7.16b,#12 ext v11.16b,v11.16b,v11.16b,#12 ext v15.16b,v15.16b,v15.16b,#12 ext v19.16b,v19.16b,v19.16b,#12 ext v23.16b,v23.16b,v23.16b,#12 ext v1.16b,v1.16b,v1.16b,#4 ext v5.16b,v5.16b,v5.16b,#4 ext v9.16b,v9.16b,v9.16b,#4 ext v13.16b,v13.16b,v13.16b,#4 ext v17.16b,v17.16b,v17.16b,#4 ext v21.16b,v21.16b,v21.16b,#4 add v0.4s,v0.4s,v1.4s add w5,w5,w9 add v4.4s,v4.4s,v5.4s add w6,w6,w10 add v8.4s,v8.4s,v9.4s add w7,w7,w11 add v12.4s,v12.4s,v13.4s add w8,w8,w12 add v16.4s,v16.4s,v17.4s eor w17,w17,w5 add v20.4s,v20.4s,v21.4s eor w19,w19,w6 eor v3.16b,v3.16b,v0.16b eor w20,w20,w7 eor v7.16b,v7.16b,v4.16b eor w21,w21,w8 eor v11.16b,v11.16b,v8.16b ror w17,w17,#16 eor v15.16b,v15.16b,v12.16b ror w19,w19,#16 eor v19.16b,v19.16b,v16.16b ror w20,w20,#16 eor v23.16b,v23.16b,v20.16b ror w21,w21,#16 rev32 v3.8h,v3.8h add w13,w13,w17 rev32 v7.8h,v7.8h add w14,w14,w19 rev32 v11.8h,v11.8h add w15,w15,w20 rev32 v15.8h,v15.8h add w16,w16,w21 rev32 v19.8h,v19.8h eor w9,w9,w13 rev32 v23.8h,v23.8h eor w10,w10,w14 add v2.4s,v2.4s,v3.4s eor w11,w11,w15 add v6.4s,v6.4s,v7.4s eor w12,w12,w16 add v10.4s,v10.4s,v11.4s ror w9,w9,#20 add v14.4s,v14.4s,v15.4s ror w10,w10,#20 add v18.4s,v18.4s,v19.4s ror w11,w11,#20 add v22.4s,v22.4s,v23.4s ror w12,w12,#20 eor v24.16b,v1.16b,v2.16b add w5,w5,w9 eor v25.16b,v5.16b,v6.16b add w6,w6,w10 eor v26.16b,v9.16b,v10.16b add w7,w7,w11 eor v27.16b,v13.16b,v14.16b add w8,w8,w12 eor v28.16b,v17.16b,v18.16b eor w17,w17,w5 eor 
v29.16b,v21.16b,v22.16b eor w19,w19,w6 ushr v1.4s,v24.4s,#20 eor w20,w20,w7 ushr v5.4s,v25.4s,#20 eor w21,w21,w8 ushr v9.4s,v26.4s,#20 ror w17,w17,#24 ushr v13.4s,v27.4s,#20 ror w19,w19,#24 ushr v17.4s,v28.4s,#20 ror w20,w20,#24 ushr v21.4s,v29.4s,#20 ror w21,w21,#24 sli v1.4s,v24.4s,#12 add w13,w13,w17 sli v5.4s,v25.4s,#12 add w14,w14,w19 sli v9.4s,v26.4s,#12 add w15,w15,w20 sli v13.4s,v27.4s,#12 add w16,w16,w21 sli v17.4s,v28.4s,#12 eor w9,w9,w13 sli v21.4s,v29.4s,#12 eor w10,w10,w14 add v0.4s,v0.4s,v1.4s eor w11,w11,w15 add v4.4s,v4.4s,v5.4s eor w12,w12,w16 add v8.4s,v8.4s,v9.4s ror w9,w9,#25 add v12.4s,v12.4s,v13.4s ror w10,w10,#25 add v16.4s,v16.4s,v17.4s ror w11,w11,#25 add v20.4s,v20.4s,v21.4s ror w12,w12,#25 eor v24.16b,v3.16b,v0.16b add w5,w5,w10 eor v25.16b,v7.16b,v4.16b add w6,w6,w11 eor v26.16b,v11.16b,v8.16b add w7,w7,w12 eor v27.16b,v15.16b,v12.16b add w8,w8,w9 eor v28.16b,v19.16b,v16.16b eor w21,w21,w5 eor v29.16b,v23.16b,v20.16b eor w17,w17,w6 ushr v3.4s,v24.4s,#24 eor w19,w19,w7 ushr v7.4s,v25.4s,#24 eor w20,w20,w8 ushr v11.4s,v26.4s,#24 ror w21,w21,#16 ushr v15.4s,v27.4s,#24 ror w17,w17,#16 ushr v19.4s,v28.4s,#24 ror w19,w19,#16 ushr v23.4s,v29.4s,#24 ror w20,w20,#16 sli v3.4s,v24.4s,#8 add w15,w15,w21 sli v7.4s,v25.4s,#8 add w16,w16,w17 sli v11.4s,v26.4s,#8 add w13,w13,w19 sli v15.4s,v27.4s,#8 add w14,w14,w20 sli v19.4s,v28.4s,#8 eor w10,w10,w15 sli v23.4s,v29.4s,#8 eor w11,w11,w16 add v2.4s,v2.4s,v3.4s eor w12,w12,w13 add v6.4s,v6.4s,v7.4s eor w9,w9,w14 add v10.4s,v10.4s,v11.4s ror w10,w10,#20 add v14.4s,v14.4s,v15.4s ror w11,w11,#20 add v18.4s,v18.4s,v19.4s ror w12,w12,#20 add v22.4s,v22.4s,v23.4s ror w9,w9,#20 eor v24.16b,v1.16b,v2.16b add w5,w5,w10 eor v25.16b,v5.16b,v6.16b add w6,w6,w11 eor v26.16b,v9.16b,v10.16b add w7,w7,w12 eor v27.16b,v13.16b,v14.16b add w8,w8,w9 eor v28.16b,v17.16b,v18.16b eor w21,w21,w5 eor v29.16b,v21.16b,v22.16b eor w17,w17,w6 ushr v1.4s,v24.4s,#25 eor w19,w19,w7 ushr v5.4s,v25.4s,#25 eor w20,w20,w8 ushr v9.4s,v26.4s,#25 ror w21,w21,#24 ushr v13.4s,v27.4s,#25 ror w17,w17,#24 ushr v17.4s,v28.4s,#25 ror w19,w19,#24 ushr v21.4s,v29.4s,#25 ror w20,w20,#24 sli v1.4s,v24.4s,#7 add w15,w15,w21 sli v5.4s,v25.4s,#7 add w16,w16,w17 sli v9.4s,v26.4s,#7 add w13,w13,w19 sli v13.4s,v27.4s,#7 add w14,w14,w20 sli v17.4s,v28.4s,#7 eor w10,w10,w15 sli v21.4s,v29.4s,#7 eor w11,w11,w16 ext v2.16b,v2.16b,v2.16b,#8 eor w12,w12,w13 ext v6.16b,v6.16b,v6.16b,#8 eor w9,w9,w14 ext v10.16b,v10.16b,v10.16b,#8 ror w10,w10,#25 ext v14.16b,v14.16b,v14.16b,#8 ror w11,w11,#25 ext v18.16b,v18.16b,v18.16b,#8 ror w12,w12,#25 ext v22.16b,v22.16b,v22.16b,#8 ror w9,w9,#25 ext v3.16b,v3.16b,v3.16b,#4 ext v7.16b,v7.16b,v7.16b,#4 ext v11.16b,v11.16b,v11.16b,#4 ext v15.16b,v15.16b,v15.16b,#4 ext v19.16b,v19.16b,v19.16b,#4 ext v23.16b,v23.16b,v23.16b,#4 ext v1.16b,v1.16b,v1.16b,#12 ext v5.16b,v5.16b,v5.16b,#12 ext v9.16b,v9.16b,v9.16b,#12 ext v13.16b,v13.16b,v13.16b,#12 ext v17.16b,v17.16b,v17.16b,#12 ext v21.16b,v21.16b,v21.16b,#12 cbnz x4,Loop_upper_neon add w5,w5,w22 // accumulate key block add x6,x6,x22,lsr#32 add w7,w7,w23 add x8,x8,x23,lsr#32 add w9,w9,w24 add x10,x10,x24,lsr#32 add w11,w11,w25 add x12,x12,x25,lsr#32 add w13,w13,w26 add x14,x14,x26,lsr#32 add w15,w15,w27 add x16,x16,x27,lsr#32 add w17,w17,w28 add x19,x19,x28,lsr#32 add w20,w20,w30 add x21,x21,x30,lsr#32 add x5,x5,x6,lsl#32 // pack add x7,x7,x8,lsl#32 ldp x6,x8,[x1,#0] // load input add x9,x9,x10,lsl#32 add x11,x11,x12,lsl#32 ldp x10,x12,[x1,#16] add x13,x13,x14,lsl#32 add x15,x15,x16,lsl#32 ldp x14,x16,[x1,#32] 
add x17,x17,x19,lsl#32 add x20,x20,x21,lsl#32 ldp x19,x21,[x1,#48] add x1,x1,#64 #ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 rev x9,x9 rev x11,x11 rev x13,x13 rev x15,x15 rev x17,x17 rev x20,x20 #endif eor x5,x5,x6 eor x7,x7,x8 eor x9,x9,x10 eor x11,x11,x12 eor x13,x13,x14 eor x15,x15,x16 eor x17,x17,x19 eor x20,x20,x21 stp x5,x7,[x0,#0] // store output add x28,x28,#1 // increment counter mov w5,w22 // unpack key block lsr x6,x22,#32 stp x9,x11,[x0,#16] mov w7,w23 lsr x8,x23,#32 stp x13,x15,[x0,#32] mov w9,w24 lsr x10,x24,#32 stp x17,x20,[x0,#48] add x0,x0,#64 mov w11,w25 lsr x12,x25,#32 mov w13,w26 lsr x14,x26,#32 mov w15,w27 lsr x16,x27,#32 mov w17,w28 lsr x19,x28,#32 mov w20,w30 lsr x21,x30,#32 mov x4,#5 Loop_lower_neon: sub x4,x4,#1 add v0.4s,v0.4s,v1.4s add w5,w5,w9 add v4.4s,v4.4s,v5.4s add w6,w6,w10 add v8.4s,v8.4s,v9.4s add w7,w7,w11 add v12.4s,v12.4s,v13.4s add w8,w8,w12 add v16.4s,v16.4s,v17.4s eor w17,w17,w5 add v20.4s,v20.4s,v21.4s eor w19,w19,w6 eor v3.16b,v3.16b,v0.16b eor w20,w20,w7 eor v7.16b,v7.16b,v4.16b eor w21,w21,w8 eor v11.16b,v11.16b,v8.16b ror w17,w17,#16 eor v15.16b,v15.16b,v12.16b ror w19,w19,#16 eor v19.16b,v19.16b,v16.16b ror w20,w20,#16 eor v23.16b,v23.16b,v20.16b ror w21,w21,#16 rev32 v3.8h,v3.8h add w13,w13,w17 rev32 v7.8h,v7.8h add w14,w14,w19 rev32 v11.8h,v11.8h add w15,w15,w20 rev32 v15.8h,v15.8h add w16,w16,w21 rev32 v19.8h,v19.8h eor w9,w9,w13 rev32 v23.8h,v23.8h eor w10,w10,w14 add v2.4s,v2.4s,v3.4s eor w11,w11,w15 add v6.4s,v6.4s,v7.4s eor w12,w12,w16 add v10.4s,v10.4s,v11.4s ror w9,w9,#20 add v14.4s,v14.4s,v15.4s ror w10,w10,#20 add v18.4s,v18.4s,v19.4s ror w11,w11,#20 add v22.4s,v22.4s,v23.4s ror w12,w12,#20 eor v24.16b,v1.16b,v2.16b add w5,w5,w9 eor v25.16b,v5.16b,v6.16b add w6,w6,w10 eor v26.16b,v9.16b,v10.16b add w7,w7,w11 eor v27.16b,v13.16b,v14.16b add w8,w8,w12 eor v28.16b,v17.16b,v18.16b eor w17,w17,w5 eor v29.16b,v21.16b,v22.16b eor w19,w19,w6 ushr v1.4s,v24.4s,#20 eor w20,w20,w7 ushr v5.4s,v25.4s,#20 eor w21,w21,w8 ushr v9.4s,v26.4s,#20 ror w17,w17,#24 ushr v13.4s,v27.4s,#20 ror w19,w19,#24 ushr v17.4s,v28.4s,#20 ror w20,w20,#24 ushr v21.4s,v29.4s,#20 ror w21,w21,#24 sli v1.4s,v24.4s,#12 add w13,w13,w17 sli v5.4s,v25.4s,#12 add w14,w14,w19 sli v9.4s,v26.4s,#12 add w15,w15,w20 sli v13.4s,v27.4s,#12 add w16,w16,w21 sli v17.4s,v28.4s,#12 eor w9,w9,w13 sli v21.4s,v29.4s,#12 eor w10,w10,w14 add v0.4s,v0.4s,v1.4s eor w11,w11,w15 add v4.4s,v4.4s,v5.4s eor w12,w12,w16 add v8.4s,v8.4s,v9.4s ror w9,w9,#25 add v12.4s,v12.4s,v13.4s ror w10,w10,#25 add v16.4s,v16.4s,v17.4s ror w11,w11,#25 add v20.4s,v20.4s,v21.4s ror w12,w12,#25 eor v24.16b,v3.16b,v0.16b add w5,w5,w10 eor v25.16b,v7.16b,v4.16b add w6,w6,w11 eor v26.16b,v11.16b,v8.16b add w7,w7,w12 eor v27.16b,v15.16b,v12.16b add w8,w8,w9 eor v28.16b,v19.16b,v16.16b eor w21,w21,w5 eor v29.16b,v23.16b,v20.16b eor w17,w17,w6 ushr v3.4s,v24.4s,#24 eor w19,w19,w7 ushr v7.4s,v25.4s,#24 eor w20,w20,w8 ushr v11.4s,v26.4s,#24 ror w21,w21,#16 ushr v15.4s,v27.4s,#24 ror w17,w17,#16 ushr v19.4s,v28.4s,#24 ror w19,w19,#16 ushr v23.4s,v29.4s,#24 ror w20,w20,#16 sli v3.4s,v24.4s,#8 add w15,w15,w21 sli v7.4s,v25.4s,#8 add w16,w16,w17 sli v11.4s,v26.4s,#8 add w13,w13,w19 sli v15.4s,v27.4s,#8 add w14,w14,w20 sli v19.4s,v28.4s,#8 eor w10,w10,w15 sli v23.4s,v29.4s,#8 eor w11,w11,w16 add v2.4s,v2.4s,v3.4s eor w12,w12,w13 add v6.4s,v6.4s,v7.4s eor w9,w9,w14 add v10.4s,v10.4s,v11.4s ror w10,w10,#20 add v14.4s,v14.4s,v15.4s ror w11,w11,#20 add v18.4s,v18.4s,v19.4s ror w12,w12,#20 add v22.4s,v22.4s,v23.4s ror w9,w9,#20 eor 
v24.16b,v1.16b,v2.16b add w5,w5,w10 eor v25.16b,v5.16b,v6.16b add w6,w6,w11 eor v26.16b,v9.16b,v10.16b add w7,w7,w12 eor v27.16b,v13.16b,v14.16b add w8,w8,w9 eor v28.16b,v17.16b,v18.16b eor w21,w21,w5 eor v29.16b,v21.16b,v22.16b eor w17,w17,w6 ushr v1.4s,v24.4s,#25 eor w19,w19,w7 ushr v5.4s,v25.4s,#25 eor w20,w20,w8 ushr v9.4s,v26.4s,#25 ror w21,w21,#24 ushr v13.4s,v27.4s,#25 ror w17,w17,#24 ushr v17.4s,v28.4s,#25 ror w19,w19,#24 ushr v21.4s,v29.4s,#25 ror w20,w20,#24 sli v1.4s,v24.4s,#7 add w15,w15,w21 sli v5.4s,v25.4s,#7 add w16,w16,w17 sli v9.4s,v26.4s,#7 add w13,w13,w19 sli v13.4s,v27.4s,#7 add w14,w14,w20 sli v17.4s,v28.4s,#7 eor w10,w10,w15 sli v21.4s,v29.4s,#7 eor w11,w11,w16 ext v2.16b,v2.16b,v2.16b,#8 eor w12,w12,w13 ext v6.16b,v6.16b,v6.16b,#8 eor w9,w9,w14 ext v10.16b,v10.16b,v10.16b,#8 ror w10,w10,#25 ext v14.16b,v14.16b,v14.16b,#8 ror w11,w11,#25 ext v18.16b,v18.16b,v18.16b,#8 ror w12,w12,#25 ext v22.16b,v22.16b,v22.16b,#8 ror w9,w9,#25 ext v3.16b,v3.16b,v3.16b,#12 ext v7.16b,v7.16b,v7.16b,#12 ext v11.16b,v11.16b,v11.16b,#12 ext v15.16b,v15.16b,v15.16b,#12 ext v19.16b,v19.16b,v19.16b,#12 ext v23.16b,v23.16b,v23.16b,#12 ext v1.16b,v1.16b,v1.16b,#4 ext v5.16b,v5.16b,v5.16b,#4 ext v9.16b,v9.16b,v9.16b,#4 ext v13.16b,v13.16b,v13.16b,#4 ext v17.16b,v17.16b,v17.16b,#4 ext v21.16b,v21.16b,v21.16b,#4 add v0.4s,v0.4s,v1.4s add w5,w5,w9 add v4.4s,v4.4s,v5.4s add w6,w6,w10 add v8.4s,v8.4s,v9.4s add w7,w7,w11 add v12.4s,v12.4s,v13.4s add w8,w8,w12 add v16.4s,v16.4s,v17.4s eor w17,w17,w5 add v20.4s,v20.4s,v21.4s eor w19,w19,w6 eor v3.16b,v3.16b,v0.16b eor w20,w20,w7 eor v7.16b,v7.16b,v4.16b eor w21,w21,w8 eor v11.16b,v11.16b,v8.16b ror w17,w17,#16 eor v15.16b,v15.16b,v12.16b ror w19,w19,#16 eor v19.16b,v19.16b,v16.16b ror w20,w20,#16 eor v23.16b,v23.16b,v20.16b ror w21,w21,#16 rev32 v3.8h,v3.8h add w13,w13,w17 rev32 v7.8h,v7.8h add w14,w14,w19 rev32 v11.8h,v11.8h add w15,w15,w20 rev32 v15.8h,v15.8h add w16,w16,w21 rev32 v19.8h,v19.8h eor w9,w9,w13 rev32 v23.8h,v23.8h eor w10,w10,w14 add v2.4s,v2.4s,v3.4s eor w11,w11,w15 add v6.4s,v6.4s,v7.4s eor w12,w12,w16 add v10.4s,v10.4s,v11.4s ror w9,w9,#20 add v14.4s,v14.4s,v15.4s ror w10,w10,#20 add v18.4s,v18.4s,v19.4s ror w11,w11,#20 add v22.4s,v22.4s,v23.4s ror w12,w12,#20 eor v24.16b,v1.16b,v2.16b add w5,w5,w9 eor v25.16b,v5.16b,v6.16b add w6,w6,w10 eor v26.16b,v9.16b,v10.16b add w7,w7,w11 eor v27.16b,v13.16b,v14.16b add w8,w8,w12 eor v28.16b,v17.16b,v18.16b eor w17,w17,w5 eor v29.16b,v21.16b,v22.16b eor w19,w19,w6 ushr v1.4s,v24.4s,#20 eor w20,w20,w7 ushr v5.4s,v25.4s,#20 eor w21,w21,w8 ushr v9.4s,v26.4s,#20 ror w17,w17,#24 ushr v13.4s,v27.4s,#20 ror w19,w19,#24 ushr v17.4s,v28.4s,#20 ror w20,w20,#24 ushr v21.4s,v29.4s,#20 ror w21,w21,#24 sli v1.4s,v24.4s,#12 add w13,w13,w17 sli v5.4s,v25.4s,#12 add w14,w14,w19 sli v9.4s,v26.4s,#12 add w15,w15,w20 sli v13.4s,v27.4s,#12 add w16,w16,w21 sli v17.4s,v28.4s,#12 eor w9,w9,w13 sli v21.4s,v29.4s,#12 eor w10,w10,w14 add v0.4s,v0.4s,v1.4s eor w11,w11,w15 add v4.4s,v4.4s,v5.4s eor w12,w12,w16 add v8.4s,v8.4s,v9.4s ror w9,w9,#25 add v12.4s,v12.4s,v13.4s ror w10,w10,#25 add v16.4s,v16.4s,v17.4s ror w11,w11,#25 add v20.4s,v20.4s,v21.4s ror w12,w12,#25 eor v24.16b,v3.16b,v0.16b add w5,w5,w10 eor v25.16b,v7.16b,v4.16b add w6,w6,w11 eor v26.16b,v11.16b,v8.16b add w7,w7,w12 eor v27.16b,v15.16b,v12.16b add w8,w8,w9 eor v28.16b,v19.16b,v16.16b eor w21,w21,w5 eor v29.16b,v23.16b,v20.16b eor w17,w17,w6 ushr v3.4s,v24.4s,#24 eor w19,w19,w7 ushr v7.4s,v25.4s,#24 eor w20,w20,w8 ushr v11.4s,v26.4s,#24 ror w21,w21,#16 
ushr v15.4s,v27.4s,#24 ror w17,w17,#16 ushr v19.4s,v28.4s,#24 ror w19,w19,#16 ushr v23.4s,v29.4s,#24 ror w20,w20,#16 sli v3.4s,v24.4s,#8 add w15,w15,w21 sli v7.4s,v25.4s,#8 add w16,w16,w17 sli v11.4s,v26.4s,#8 add w13,w13,w19 sli v15.4s,v27.4s,#8 add w14,w14,w20 sli v19.4s,v28.4s,#8 eor w10,w10,w15 sli v23.4s,v29.4s,#8 eor w11,w11,w16 add v2.4s,v2.4s,v3.4s eor w12,w12,w13 add v6.4s,v6.4s,v7.4s eor w9,w9,w14 add v10.4s,v10.4s,v11.4s ror w10,w10,#20 add v14.4s,v14.4s,v15.4s ror w11,w11,#20 add v18.4s,v18.4s,v19.4s ror w12,w12,#20 add v22.4s,v22.4s,v23.4s ror w9,w9,#20 eor v24.16b,v1.16b,v2.16b add w5,w5,w10 eor v25.16b,v5.16b,v6.16b add w6,w6,w11 eor v26.16b,v9.16b,v10.16b add w7,w7,w12 eor v27.16b,v13.16b,v14.16b add w8,w8,w9 eor v28.16b,v17.16b,v18.16b eor w21,w21,w5 eor v29.16b,v21.16b,v22.16b eor w17,w17,w6 ushr v1.4s,v24.4s,#25 eor w19,w19,w7 ushr v5.4s,v25.4s,#25 eor w20,w20,w8 ushr v9.4s,v26.4s,#25 ror w21,w21,#24 ushr v13.4s,v27.4s,#25 ror w17,w17,#24 ushr v17.4s,v28.4s,#25 ror w19,w19,#24 ushr v21.4s,v29.4s,#25 ror w20,w20,#24 sli v1.4s,v24.4s,#7 add w15,w15,w21 sli v5.4s,v25.4s,#7 add w16,w16,w17 sli v9.4s,v26.4s,#7 add w13,w13,w19 sli v13.4s,v27.4s,#7 add w14,w14,w20 sli v17.4s,v28.4s,#7 eor w10,w10,w15 sli v21.4s,v29.4s,#7 eor w11,w11,w16 ext v2.16b,v2.16b,v2.16b,#8 eor w12,w12,w13 ext v6.16b,v6.16b,v6.16b,#8 eor w9,w9,w14 ext v10.16b,v10.16b,v10.16b,#8 ror w10,w10,#25 ext v14.16b,v14.16b,v14.16b,#8 ror w11,w11,#25 ext v18.16b,v18.16b,v18.16b,#8 ror w12,w12,#25 ext v22.16b,v22.16b,v22.16b,#8 ror w9,w9,#25 ext v3.16b,v3.16b,v3.16b,#4 ext v7.16b,v7.16b,v7.16b,#4 ext v11.16b,v11.16b,v11.16b,#4 ext v15.16b,v15.16b,v15.16b,#4 ext v19.16b,v19.16b,v19.16b,#4 ext v23.16b,v23.16b,v23.16b,#4 ext v1.16b,v1.16b,v1.16b,#12 ext v5.16b,v5.16b,v5.16b,#12 ext v9.16b,v9.16b,v9.16b,#12 ext v13.16b,v13.16b,v13.16b,#12 ext v17.16b,v17.16b,v17.16b,#12 ext v21.16b,v21.16b,v21.16b,#12 cbnz x4,Loop_lower_neon add w5,w5,w22 // accumulate key block ldp q24,q25,[sp,#0] add x6,x6,x22,lsr#32 ldp q26,q27,[sp,#32] add w7,w7,w23 ldp q28,q29,[sp,#64] add x8,x8,x23,lsr#32 add v0.4s,v0.4s,v24.4s add w9,w9,w24 add v4.4s,v4.4s,v24.4s add x10,x10,x24,lsr#32 add v8.4s,v8.4s,v24.4s add w11,w11,w25 add v12.4s,v12.4s,v24.4s add x12,x12,x25,lsr#32 add v16.4s,v16.4s,v24.4s add w13,w13,w26 add v20.4s,v20.4s,v24.4s add x14,x14,x26,lsr#32 add v2.4s,v2.4s,v26.4s add w15,w15,w27 add v6.4s,v6.4s,v26.4s add x16,x16,x27,lsr#32 add v10.4s,v10.4s,v26.4s add w17,w17,w28 add v14.4s,v14.4s,v26.4s add x19,x19,x28,lsr#32 add v18.4s,v18.4s,v26.4s add w20,w20,w30 add v22.4s,v22.4s,v26.4s add x21,x21,x30,lsr#32 add v19.4s,v19.4s,v31.4s // +4 add x5,x5,x6,lsl#32 // pack add v23.4s,v23.4s,v31.4s // +4 add x7,x7,x8,lsl#32 add v3.4s,v3.4s,v27.4s ldp x6,x8,[x1,#0] // load input add v7.4s,v7.4s,v28.4s add x9,x9,x10,lsl#32 add v11.4s,v11.4s,v29.4s add x11,x11,x12,lsl#32 add v15.4s,v15.4s,v30.4s ldp x10,x12,[x1,#16] add v19.4s,v19.4s,v27.4s add x13,x13,x14,lsl#32 add v23.4s,v23.4s,v28.4s add x15,x15,x16,lsl#32 add v1.4s,v1.4s,v25.4s ldp x14,x16,[x1,#32] add v5.4s,v5.4s,v25.4s add x17,x17,x19,lsl#32 add v9.4s,v9.4s,v25.4s add x20,x20,x21,lsl#32 add v13.4s,v13.4s,v25.4s ldp x19,x21,[x1,#48] add v17.4s,v17.4s,v25.4s add x1,x1,#64 add v21.4s,v21.4s,v25.4s #ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 rev x9,x9 rev x11,x11 rev x13,x13 rev x15,x15 rev x17,x17 rev x20,x20 #endif ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 eor x5,x5,x6 eor x7,x7,x8 eor x9,x9,x10 eor x11,x11,x12 eor x13,x13,x14 eor v0.16b,v0.16b,v24.16b eor x15,x15,x16 eor 
v1.16b,v1.16b,v25.16b eor x17,x17,x19 eor v2.16b,v2.16b,v26.16b eor x20,x20,x21 eor v3.16b,v3.16b,v27.16b ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 stp x5,x7,[x0,#0] // store output add x28,x28,#7 // increment counter stp x9,x11,[x0,#16] stp x13,x15,[x0,#32] stp x17,x20,[x0,#48] add x0,x0,#64 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 eor v4.16b,v4.16b,v24.16b eor v5.16b,v5.16b,v25.16b eor v6.16b,v6.16b,v26.16b eor v7.16b,v7.16b,v27.16b st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 eor v8.16b,v8.16b,v0.16b ldp q24,q25,[sp,#0] eor v9.16b,v9.16b,v1.16b ldp q26,q27,[sp,#32] eor v10.16b,v10.16b,v2.16b eor v11.16b,v11.16b,v3.16b st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 eor v12.16b,v12.16b,v4.16b eor v13.16b,v13.16b,v5.16b eor v14.16b,v14.16b,v6.16b eor v15.16b,v15.16b,v7.16b st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 eor v16.16b,v16.16b,v8.16b eor v17.16b,v17.16b,v9.16b eor v18.16b,v18.16b,v10.16b eor v19.16b,v19.16b,v11.16b st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 shl v0.4s,v31.4s,#1 // 4 -> 8 eor v20.16b,v20.16b,v12.16b eor v21.16b,v21.16b,v13.16b eor v22.16b,v22.16b,v14.16b eor v23.16b,v23.16b,v15.16b st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 add v27.4s,v27.4s,v0.4s // += 8 add v28.4s,v28.4s,v0.4s add v29.4s,v29.4s,v0.4s add v30.4s,v30.4s,v0.4s b.hs Loop_outer_512_neon adds x2,x2,#512 ushr v0.4s,v31.4s,#2 // 4 -> 1 ldp d8,d9,[sp,#128+0] // meet ABI requirements ldp d10,d11,[sp,#128+16] ldp d12,d13,[sp,#128+32] ldp d14,d15,[sp,#128+48] stp q24,q31,[sp,#0] // wipe off-load area stp q24,q31,[sp,#32] stp q24,q31,[sp,#64] b.eq Ldone_512_neon cmp x2,#192 sub v27.4s,v27.4s,v0.4s // -= 1 sub v28.4s,v28.4s,v0.4s sub v29.4s,v29.4s,v0.4s add sp,sp,#128 b.hs Loop_outer_neon eor v25.16b,v25.16b,v25.16b eor v26.16b,v26.16b,v26.16b eor v27.16b,v27.16b,v27.16b eor v28.16b,v28.16b,v28.16b eor v29.16b,v29.16b,v29.16b eor v30.16b,v30.16b,v30.16b b Loop_outer Ldone_512_neon: ldp x19,x20,[x29,#16] add sp,sp,#128+64 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 AARCH64_VALIDATE_LINK_REGISTER ret #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) ring-0.17.14/pregenerated/chacha-x86-elf.S000064400000000000000000000305161046102023000161310ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
#include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) .text .globl ChaCha20_ctr32_ssse3 .hidden ChaCha20_ctr32_ssse3 .type ChaCha20_ctr32_ssse3,@function .align 16 ChaCha20_ctr32_ssse3: .L_ChaCha20_ctr32_ssse3_begin: pushl %ebp pushl %ebx pushl %esi pushl %edi call .Lpic_point .Lpic_point: popl %eax movl 20(%esp),%edi movl 24(%esp),%esi movl 28(%esp),%ecx movl 32(%esp),%edx movl 36(%esp),%ebx movl %esp,%ebp subl $524,%esp andl $-64,%esp movl %ebp,512(%esp) leal .Lssse3_data-.Lpic_point(%eax),%eax movdqu (%ebx),%xmm3 cmpl $256,%ecx jb .L0001x movl %edx,516(%esp) movl %ebx,520(%esp) subl $256,%ecx leal 384(%esp),%ebp movdqu (%edx),%xmm7 pshufd $0,%xmm3,%xmm0 pshufd $85,%xmm3,%xmm1 pshufd $170,%xmm3,%xmm2 pshufd $255,%xmm3,%xmm3 paddd 48(%eax),%xmm0 pshufd $0,%xmm7,%xmm4 pshufd $85,%xmm7,%xmm5 psubd 64(%eax),%xmm0 pshufd $170,%xmm7,%xmm6 pshufd $255,%xmm7,%xmm7 movdqa %xmm0,64(%ebp) movdqa %xmm1,80(%ebp) movdqa %xmm2,96(%ebp) movdqa %xmm3,112(%ebp) movdqu 16(%edx),%xmm3 movdqa %xmm4,-64(%ebp) movdqa %xmm5,-48(%ebp) movdqa %xmm6,-32(%ebp) movdqa %xmm7,-16(%ebp) movdqa 32(%eax),%xmm7 leal 128(%esp),%ebx pshufd $0,%xmm3,%xmm0 pshufd $85,%xmm3,%xmm1 pshufd $170,%xmm3,%xmm2 pshufd $255,%xmm3,%xmm3 pshufd $0,%xmm7,%xmm4 pshufd $85,%xmm7,%xmm5 pshufd $170,%xmm7,%xmm6 pshufd $255,%xmm7,%xmm7 movdqa %xmm0,(%ebp) movdqa %xmm1,16(%ebp) movdqa %xmm2,32(%ebp) movdqa %xmm3,48(%ebp) movdqa %xmm4,-128(%ebp) movdqa %xmm5,-112(%ebp) movdqa %xmm6,-96(%ebp) movdqa %xmm7,-80(%ebp) leal 128(%esi),%esi leal 128(%edi),%edi jmp .L001outer_loop .align 16 .L001outer_loop: movdqa -112(%ebp),%xmm1 movdqa -96(%ebp),%xmm2 movdqa -80(%ebp),%xmm3 movdqa -48(%ebp),%xmm5 movdqa -32(%ebp),%xmm6 movdqa -16(%ebp),%xmm7 movdqa %xmm1,-112(%ebx) movdqa %xmm2,-96(%ebx) movdqa %xmm3,-80(%ebx) movdqa %xmm5,-48(%ebx) movdqa %xmm6,-32(%ebx) movdqa %xmm7,-16(%ebx) movdqa 32(%ebp),%xmm2 movdqa 48(%ebp),%xmm3 movdqa 64(%ebp),%xmm4 movdqa 80(%ebp),%xmm5 movdqa 96(%ebp),%xmm6 movdqa 112(%ebp),%xmm7 paddd 64(%eax),%xmm4 movdqa %xmm2,32(%ebx) movdqa %xmm3,48(%ebx) movdqa %xmm4,64(%ebx) movdqa %xmm5,80(%ebx) movdqa %xmm6,96(%ebx) movdqa %xmm7,112(%ebx) movdqa %xmm4,64(%ebp) movdqa -128(%ebp),%xmm0 movdqa %xmm4,%xmm6 movdqa -64(%ebp),%xmm3 movdqa (%ebp),%xmm4 movdqa 16(%ebp),%xmm5 movl $10,%edx nop .align 16 .L002loop: paddd %xmm3,%xmm0 movdqa %xmm3,%xmm2 pxor %xmm0,%xmm6 pshufb (%eax),%xmm6 paddd %xmm6,%xmm4 pxor %xmm4,%xmm2 movdqa -48(%ebx),%xmm3 movdqa %xmm2,%xmm1 pslld $12,%xmm2 psrld $20,%xmm1 por %xmm1,%xmm2 movdqa -112(%ebx),%xmm1 paddd %xmm2,%xmm0 movdqa 80(%ebx),%xmm7 pxor %xmm0,%xmm6 movdqa %xmm0,-128(%ebx) pshufb 16(%eax),%xmm6 paddd %xmm6,%xmm4 movdqa %xmm6,64(%ebx) pxor %xmm4,%xmm2 paddd %xmm3,%xmm1 movdqa %xmm2,%xmm0 pslld $7,%xmm2 psrld $25,%xmm0 pxor %xmm1,%xmm7 por %xmm0,%xmm2 movdqa %xmm4,(%ebx) pshufb (%eax),%xmm7 movdqa %xmm2,-64(%ebx) paddd %xmm7,%xmm5 movdqa 32(%ebx),%xmm4 pxor %xmm5,%xmm3 movdqa -32(%ebx),%xmm2 movdqa %xmm3,%xmm0 pslld $12,%xmm3 psrld $20,%xmm0 por %xmm0,%xmm3 movdqa -96(%ebx),%xmm0 paddd %xmm3,%xmm1 movdqa 96(%ebx),%xmm6 pxor %xmm1,%xmm7 movdqa %xmm1,-112(%ebx) pshufb 16(%eax),%xmm7 paddd %xmm7,%xmm5 movdqa %xmm7,80(%ebx) pxor %xmm5,%xmm3 paddd %xmm2,%xmm0 movdqa %xmm3,%xmm1 pslld $7,%xmm3 psrld $25,%xmm1 pxor %xmm0,%xmm6 por %xmm1,%xmm3 movdqa %xmm5,16(%ebx) pshufb (%eax),%xmm6 movdqa %xmm3,-48(%ebx) paddd %xmm6,%xmm4 movdqa 48(%ebx),%xmm5 pxor %xmm4,%xmm2 movdqa -16(%ebx),%xmm3 movdqa %xmm2,%xmm1 pslld $12,%xmm2 psrld $20,%xmm1 por %xmm1,%xmm2 movdqa -80(%ebx),%xmm1 paddd 
%xmm2,%xmm0 movdqa 112(%ebx),%xmm7 pxor %xmm0,%xmm6 movdqa %xmm0,-96(%ebx) pshufb 16(%eax),%xmm6 paddd %xmm6,%xmm4 movdqa %xmm6,96(%ebx) pxor %xmm4,%xmm2 paddd %xmm3,%xmm1 movdqa %xmm2,%xmm0 pslld $7,%xmm2 psrld $25,%xmm0 pxor %xmm1,%xmm7 por %xmm0,%xmm2 pshufb (%eax),%xmm7 movdqa %xmm2,-32(%ebx) paddd %xmm7,%xmm5 pxor %xmm5,%xmm3 movdqa -48(%ebx),%xmm2 movdqa %xmm3,%xmm0 pslld $12,%xmm3 psrld $20,%xmm0 por %xmm0,%xmm3 movdqa -128(%ebx),%xmm0 paddd %xmm3,%xmm1 pxor %xmm1,%xmm7 movdqa %xmm1,-80(%ebx) pshufb 16(%eax),%xmm7 paddd %xmm7,%xmm5 movdqa %xmm7,%xmm6 pxor %xmm5,%xmm3 paddd %xmm2,%xmm0 movdqa %xmm3,%xmm1 pslld $7,%xmm3 psrld $25,%xmm1 pxor %xmm0,%xmm6 por %xmm1,%xmm3 pshufb (%eax),%xmm6 movdqa %xmm3,-16(%ebx) paddd %xmm6,%xmm4 pxor %xmm4,%xmm2 movdqa -32(%ebx),%xmm3 movdqa %xmm2,%xmm1 pslld $12,%xmm2 psrld $20,%xmm1 por %xmm1,%xmm2 movdqa -112(%ebx),%xmm1 paddd %xmm2,%xmm0 movdqa 64(%ebx),%xmm7 pxor %xmm0,%xmm6 movdqa %xmm0,-128(%ebx) pshufb 16(%eax),%xmm6 paddd %xmm6,%xmm4 movdqa %xmm6,112(%ebx) pxor %xmm4,%xmm2 paddd %xmm3,%xmm1 movdqa %xmm2,%xmm0 pslld $7,%xmm2 psrld $25,%xmm0 pxor %xmm1,%xmm7 por %xmm0,%xmm2 movdqa %xmm4,32(%ebx) pshufb (%eax),%xmm7 movdqa %xmm2,-48(%ebx) paddd %xmm7,%xmm5 movdqa (%ebx),%xmm4 pxor %xmm5,%xmm3 movdqa -16(%ebx),%xmm2 movdqa %xmm3,%xmm0 pslld $12,%xmm3 psrld $20,%xmm0 por %xmm0,%xmm3 movdqa -96(%ebx),%xmm0 paddd %xmm3,%xmm1 movdqa 80(%ebx),%xmm6 pxor %xmm1,%xmm7 movdqa %xmm1,-112(%ebx) pshufb 16(%eax),%xmm7 paddd %xmm7,%xmm5 movdqa %xmm7,64(%ebx) pxor %xmm5,%xmm3 paddd %xmm2,%xmm0 movdqa %xmm3,%xmm1 pslld $7,%xmm3 psrld $25,%xmm1 pxor %xmm0,%xmm6 por %xmm1,%xmm3 movdqa %xmm5,48(%ebx) pshufb (%eax),%xmm6 movdqa %xmm3,-32(%ebx) paddd %xmm6,%xmm4 movdqa 16(%ebx),%xmm5 pxor %xmm4,%xmm2 movdqa -64(%ebx),%xmm3 movdqa %xmm2,%xmm1 pslld $12,%xmm2 psrld $20,%xmm1 por %xmm1,%xmm2 movdqa -80(%ebx),%xmm1 paddd %xmm2,%xmm0 movdqa 96(%ebx),%xmm7 pxor %xmm0,%xmm6 movdqa %xmm0,-96(%ebx) pshufb 16(%eax),%xmm6 paddd %xmm6,%xmm4 movdqa %xmm6,80(%ebx) pxor %xmm4,%xmm2 paddd %xmm3,%xmm1 movdqa %xmm2,%xmm0 pslld $7,%xmm2 psrld $25,%xmm0 pxor %xmm1,%xmm7 por %xmm0,%xmm2 pshufb (%eax),%xmm7 movdqa %xmm2,-16(%ebx) paddd %xmm7,%xmm5 pxor %xmm5,%xmm3 movdqa %xmm3,%xmm0 pslld $12,%xmm3 psrld $20,%xmm0 por %xmm0,%xmm3 movdqa -128(%ebx),%xmm0 paddd %xmm3,%xmm1 movdqa 64(%ebx),%xmm6 pxor %xmm1,%xmm7 movdqa %xmm1,-80(%ebx) pshufb 16(%eax),%xmm7 paddd %xmm7,%xmm5 movdqa %xmm7,96(%ebx) pxor %xmm5,%xmm3 movdqa %xmm3,%xmm1 pslld $7,%xmm3 psrld $25,%xmm1 por %xmm1,%xmm3 decl %edx jnz .L002loop movdqa %xmm3,-64(%ebx) movdqa %xmm4,(%ebx) movdqa %xmm5,16(%ebx) movdqa %xmm6,64(%ebx) movdqa %xmm7,96(%ebx) movdqa -112(%ebx),%xmm1 movdqa -96(%ebx),%xmm2 movdqa -80(%ebx),%xmm3 paddd -128(%ebp),%xmm0 paddd -112(%ebp),%xmm1 paddd -96(%ebp),%xmm2 paddd -80(%ebp),%xmm3 movdqa %xmm0,%xmm6 punpckldq %xmm1,%xmm0 movdqa %xmm2,%xmm7 punpckldq %xmm3,%xmm2 punpckhdq %xmm1,%xmm6 punpckhdq %xmm3,%xmm7 movdqa %xmm0,%xmm1 punpcklqdq %xmm2,%xmm0 movdqa %xmm6,%xmm3 punpcklqdq %xmm7,%xmm6 punpckhqdq %xmm2,%xmm1 punpckhqdq %xmm7,%xmm3 movdqu -128(%esi),%xmm4 movdqu -64(%esi),%xmm5 movdqu (%esi),%xmm2 movdqu 64(%esi),%xmm7 leal 16(%esi),%esi pxor %xmm0,%xmm4 movdqa -64(%ebx),%xmm0 pxor %xmm1,%xmm5 movdqa -48(%ebx),%xmm1 pxor %xmm2,%xmm6 movdqa -32(%ebx),%xmm2 pxor %xmm3,%xmm7 movdqa -16(%ebx),%xmm3 movdqu %xmm4,-128(%edi) movdqu %xmm5,-64(%edi) movdqu %xmm6,(%edi) movdqu %xmm7,64(%edi) leal 16(%edi),%edi paddd -64(%ebp),%xmm0 paddd -48(%ebp),%xmm1 paddd -32(%ebp),%xmm2 paddd -16(%ebp),%xmm3 movdqa %xmm0,%xmm6 
punpckldq %xmm1,%xmm0 movdqa %xmm2,%xmm7 punpckldq %xmm3,%xmm2 punpckhdq %xmm1,%xmm6 punpckhdq %xmm3,%xmm7 movdqa %xmm0,%xmm1 punpcklqdq %xmm2,%xmm0 movdqa %xmm6,%xmm3 punpcklqdq %xmm7,%xmm6 punpckhqdq %xmm2,%xmm1 punpckhqdq %xmm7,%xmm3 movdqu -128(%esi),%xmm4 movdqu -64(%esi),%xmm5 movdqu (%esi),%xmm2 movdqu 64(%esi),%xmm7 leal 16(%esi),%esi pxor %xmm0,%xmm4 movdqa (%ebx),%xmm0 pxor %xmm1,%xmm5 movdqa 16(%ebx),%xmm1 pxor %xmm2,%xmm6 movdqa 32(%ebx),%xmm2 pxor %xmm3,%xmm7 movdqa 48(%ebx),%xmm3 movdqu %xmm4,-128(%edi) movdqu %xmm5,-64(%edi) movdqu %xmm6,(%edi) movdqu %xmm7,64(%edi) leal 16(%edi),%edi paddd (%ebp),%xmm0 paddd 16(%ebp),%xmm1 paddd 32(%ebp),%xmm2 paddd 48(%ebp),%xmm3 movdqa %xmm0,%xmm6 punpckldq %xmm1,%xmm0 movdqa %xmm2,%xmm7 punpckldq %xmm3,%xmm2 punpckhdq %xmm1,%xmm6 punpckhdq %xmm3,%xmm7 movdqa %xmm0,%xmm1 punpcklqdq %xmm2,%xmm0 movdqa %xmm6,%xmm3 punpcklqdq %xmm7,%xmm6 punpckhqdq %xmm2,%xmm1 punpckhqdq %xmm7,%xmm3 movdqu -128(%esi),%xmm4 movdqu -64(%esi),%xmm5 movdqu (%esi),%xmm2 movdqu 64(%esi),%xmm7 leal 16(%esi),%esi pxor %xmm0,%xmm4 movdqa 64(%ebx),%xmm0 pxor %xmm1,%xmm5 movdqa 80(%ebx),%xmm1 pxor %xmm2,%xmm6 movdqa 96(%ebx),%xmm2 pxor %xmm3,%xmm7 movdqa 112(%ebx),%xmm3 movdqu %xmm4,-128(%edi) movdqu %xmm5,-64(%edi) movdqu %xmm6,(%edi) movdqu %xmm7,64(%edi) leal 16(%edi),%edi paddd 64(%ebp),%xmm0 paddd 80(%ebp),%xmm1 paddd 96(%ebp),%xmm2 paddd 112(%ebp),%xmm3 movdqa %xmm0,%xmm6 punpckldq %xmm1,%xmm0 movdqa %xmm2,%xmm7 punpckldq %xmm3,%xmm2 punpckhdq %xmm1,%xmm6 punpckhdq %xmm3,%xmm7 movdqa %xmm0,%xmm1 punpcklqdq %xmm2,%xmm0 movdqa %xmm6,%xmm3 punpcklqdq %xmm7,%xmm6 punpckhqdq %xmm2,%xmm1 punpckhqdq %xmm7,%xmm3 movdqu -128(%esi),%xmm4 movdqu -64(%esi),%xmm5 movdqu (%esi),%xmm2 movdqu 64(%esi),%xmm7 leal 208(%esi),%esi pxor %xmm0,%xmm4 pxor %xmm1,%xmm5 pxor %xmm2,%xmm6 pxor %xmm3,%xmm7 movdqu %xmm4,-128(%edi) movdqu %xmm5,-64(%edi) movdqu %xmm6,(%edi) movdqu %xmm7,64(%edi) leal 208(%edi),%edi subl $256,%ecx jnc .L001outer_loop addl $256,%ecx jz .L003done movl 520(%esp),%ebx leal -128(%esi),%esi movl 516(%esp),%edx leal -128(%edi),%edi movd 64(%ebp),%xmm2 movdqu (%ebx),%xmm3 paddd 96(%eax),%xmm2 pand 112(%eax),%xmm3 por %xmm2,%xmm3 .L0001x: movdqa 32(%eax),%xmm0 movdqu (%edx),%xmm1 movdqu 16(%edx),%xmm2 movdqa (%eax),%xmm6 movdqa 16(%eax),%xmm7 movl %ebp,48(%esp) movdqa %xmm0,(%esp) movdqa %xmm1,16(%esp) movdqa %xmm2,32(%esp) movdqa %xmm3,48(%esp) movl $10,%edx jmp .L004loop1x .align 16 .L005outer1x: movdqa 80(%eax),%xmm3 movdqa (%esp),%xmm0 movdqa 16(%esp),%xmm1 movdqa 32(%esp),%xmm2 paddd 48(%esp),%xmm3 movl $10,%edx movdqa %xmm3,48(%esp) jmp .L004loop1x .align 16 .L004loop1x: paddd %xmm1,%xmm0 pxor %xmm0,%xmm3 .byte 102,15,56,0,222 paddd %xmm3,%xmm2 pxor %xmm2,%xmm1 movdqa %xmm1,%xmm4 psrld $20,%xmm1 pslld $12,%xmm4 por %xmm4,%xmm1 paddd %xmm1,%xmm0 pxor %xmm0,%xmm3 .byte 102,15,56,0,223 paddd %xmm3,%xmm2 pxor %xmm2,%xmm1 movdqa %xmm1,%xmm4 psrld $25,%xmm1 pslld $7,%xmm4 por %xmm4,%xmm1 pshufd $78,%xmm2,%xmm2 pshufd $57,%xmm1,%xmm1 pshufd $147,%xmm3,%xmm3 nop paddd %xmm1,%xmm0 pxor %xmm0,%xmm3 .byte 102,15,56,0,222 paddd %xmm3,%xmm2 pxor %xmm2,%xmm1 movdqa %xmm1,%xmm4 psrld $20,%xmm1 pslld $12,%xmm4 por %xmm4,%xmm1 paddd %xmm1,%xmm0 pxor %xmm0,%xmm3 .byte 102,15,56,0,223 paddd %xmm3,%xmm2 pxor %xmm2,%xmm1 movdqa %xmm1,%xmm4 psrld $25,%xmm1 pslld $7,%xmm4 por %xmm4,%xmm1 pshufd $78,%xmm2,%xmm2 pshufd $147,%xmm1,%xmm1 pshufd $57,%xmm3,%xmm3 decl %edx jnz .L004loop1x paddd (%esp),%xmm0 paddd 16(%esp),%xmm1 paddd 32(%esp),%xmm2 paddd 48(%esp),%xmm3 cmpl $64,%ecx jb 
.L006tail movdqu (%esi),%xmm4 movdqu 16(%esi),%xmm5 pxor %xmm4,%xmm0 movdqu 32(%esi),%xmm4 pxor %xmm5,%xmm1 movdqu 48(%esi),%xmm5 pxor %xmm4,%xmm2 pxor %xmm5,%xmm3 leal 64(%esi),%esi movdqu %xmm0,(%edi) movdqu %xmm1,16(%edi) movdqu %xmm2,32(%edi) movdqu %xmm3,48(%edi) leal 64(%edi),%edi subl $64,%ecx jnz .L005outer1x jmp .L003done .L006tail: movdqa %xmm0,(%esp) movdqa %xmm1,16(%esp) movdqa %xmm2,32(%esp) movdqa %xmm3,48(%esp) xorl %eax,%eax xorl %edx,%edx xorl %ebp,%ebp .L007tail_loop: movb (%esp,%ebp,1),%al movb (%esi,%ebp,1),%dl leal 1(%ebp),%ebp xorb %dl,%al movb %al,-1(%edi,%ebp,1) decl %ecx jnz .L007tail_loop .L003done: movl 512(%esp),%esp popl %edi popl %esi popl %ebx popl %ebp ret .size ChaCha20_ctr32_ssse3,.-.L_ChaCha20_ctr32_ssse3_begin .align 64 .Lssse3_data: .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 .byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 .long 1634760805,857760878,2036477234,1797285236 .long 0,1,2,3 .long 4,4,4,4 .long 1,0,0,0 .long 4,0,0,0 .long 0,-1,-1,-1 .align 64 .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54 .byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 .byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 .byte 114,103,62,0 #endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) ring-0.17.14/pregenerated/chacha-x86-win32n.asm000064400000000000000000000267241046102023000170670ustar 00000000000000; This file is generated from a similarly-named Perl script in the BoringSSL ; source tree. Do not edit by hand. %include "ring_core_generated/prefix_symbols_nasm.inc" %ifidn __OUTPUT_FORMAT__, win32 %ifidn __OUTPUT_FORMAT__,obj section code use32 class=code align=64 %elifidn __OUTPUT_FORMAT__,win32 $@feat.00 equ 1 section .text code align=64 %else section .text code %endif global _ChaCha20_ctr32_ssse3 align 16 _ChaCha20_ctr32_ssse3: L$_ChaCha20_ctr32_ssse3_begin: push ebp push ebx push esi push edi call L$pic_point L$pic_point: pop eax mov edi,DWORD [20+esp] mov esi,DWORD [24+esp] mov ecx,DWORD [28+esp] mov edx,DWORD [32+esp] mov ebx,DWORD [36+esp] mov ebp,esp sub esp,524 and esp,-64 mov DWORD [512+esp],ebp lea eax,[(L$ssse3_data-L$pic_point)+eax] movdqu xmm3,[ebx] cmp ecx,256 jb NEAR L$0001x mov DWORD [516+esp],edx mov DWORD [520+esp],ebx sub ecx,256 lea ebp,[384+esp] movdqu xmm7,[edx] pshufd xmm0,xmm3,0 pshufd xmm1,xmm3,85 pshufd xmm2,xmm3,170 pshufd xmm3,xmm3,255 paddd xmm0,[48+eax] pshufd xmm4,xmm7,0 pshufd xmm5,xmm7,85 psubd xmm0,[64+eax] pshufd xmm6,xmm7,170 pshufd xmm7,xmm7,255 movdqa [64+ebp],xmm0 movdqa [80+ebp],xmm1 movdqa [96+ebp],xmm2 movdqa [112+ebp],xmm3 movdqu xmm3,[16+edx] movdqa [ebp-64],xmm4 movdqa [ebp-48],xmm5 movdqa [ebp-32],xmm6 movdqa [ebp-16],xmm7 movdqa xmm7,[32+eax] lea ebx,[128+esp] pshufd xmm0,xmm3,0 pshufd xmm1,xmm3,85 pshufd xmm2,xmm3,170 pshufd xmm3,xmm3,255 pshufd xmm4,xmm7,0 pshufd xmm5,xmm7,85 pshufd xmm6,xmm7,170 pshufd xmm7,xmm7,255 movdqa [ebp],xmm0 movdqa [16+ebp],xmm1 movdqa [32+ebp],xmm2 movdqa [48+ebp],xmm3 movdqa [ebp-128],xmm4 movdqa [ebp-112],xmm5 movdqa [ebp-96],xmm6 movdqa [ebp-80],xmm7 lea esi,[128+esi] lea edi,[128+edi] jmp NEAR L$001outer_loop align 16 L$001outer_loop: movdqa xmm1,[ebp-112] movdqa xmm2,[ebp-96] movdqa xmm3,[ebp-80] movdqa xmm5,[ebp-48] movdqa xmm6,[ebp-32] movdqa xmm7,[ebp-16] movdqa [ebx-112],xmm1 movdqa [ebx-96],xmm2 movdqa [ebx-80],xmm3 movdqa [ebx-48],xmm5 movdqa [ebx-32],xmm6 movdqa [ebx-16],xmm7 movdqa xmm2,[32+ebp] movdqa xmm3,[48+ebp] movdqa xmm4,[64+ebp] movdqa xmm5,[80+ebp] movdqa xmm6,[96+ebp] movdqa xmm7,[112+ebp] paddd xmm4,[64+eax] movdqa 
[32+ebx],xmm2 movdqa [48+ebx],xmm3 movdqa [64+ebx],xmm4 movdqa [80+ebx],xmm5 movdqa [96+ebx],xmm6 movdqa [112+ebx],xmm7 movdqa [64+ebp],xmm4 movdqa xmm0,[ebp-128] movdqa xmm6,xmm4 movdqa xmm3,[ebp-64] movdqa xmm4,[ebp] movdqa xmm5,[16+ebp] mov edx,10 nop align 16 L$002loop: paddd xmm0,xmm3 movdqa xmm2,xmm3 pxor xmm6,xmm0 pshufb xmm6,[eax] paddd xmm4,xmm6 pxor xmm2,xmm4 movdqa xmm3,[ebx-48] movdqa xmm1,xmm2 pslld xmm2,12 psrld xmm1,20 por xmm2,xmm1 movdqa xmm1,[ebx-112] paddd xmm0,xmm2 movdqa xmm7,[80+ebx] pxor xmm6,xmm0 movdqa [ebx-128],xmm0 pshufb xmm6,[16+eax] paddd xmm4,xmm6 movdqa [64+ebx],xmm6 pxor xmm2,xmm4 paddd xmm1,xmm3 movdqa xmm0,xmm2 pslld xmm2,7 psrld xmm0,25 pxor xmm7,xmm1 por xmm2,xmm0 movdqa [ebx],xmm4 pshufb xmm7,[eax] movdqa [ebx-64],xmm2 paddd xmm5,xmm7 movdqa xmm4,[32+ebx] pxor xmm3,xmm5 movdqa xmm2,[ebx-32] movdqa xmm0,xmm3 pslld xmm3,12 psrld xmm0,20 por xmm3,xmm0 movdqa xmm0,[ebx-96] paddd xmm1,xmm3 movdqa xmm6,[96+ebx] pxor xmm7,xmm1 movdqa [ebx-112],xmm1 pshufb xmm7,[16+eax] paddd xmm5,xmm7 movdqa [80+ebx],xmm7 pxor xmm3,xmm5 paddd xmm0,xmm2 movdqa xmm1,xmm3 pslld xmm3,7 psrld xmm1,25 pxor xmm6,xmm0 por xmm3,xmm1 movdqa [16+ebx],xmm5 pshufb xmm6,[eax] movdqa [ebx-48],xmm3 paddd xmm4,xmm6 movdqa xmm5,[48+ebx] pxor xmm2,xmm4 movdqa xmm3,[ebx-16] movdqa xmm1,xmm2 pslld xmm2,12 psrld xmm1,20 por xmm2,xmm1 movdqa xmm1,[ebx-80] paddd xmm0,xmm2 movdqa xmm7,[112+ebx] pxor xmm6,xmm0 movdqa [ebx-96],xmm0 pshufb xmm6,[16+eax] paddd xmm4,xmm6 movdqa [96+ebx],xmm6 pxor xmm2,xmm4 paddd xmm1,xmm3 movdqa xmm0,xmm2 pslld xmm2,7 psrld xmm0,25 pxor xmm7,xmm1 por xmm2,xmm0 pshufb xmm7,[eax] movdqa [ebx-32],xmm2 paddd xmm5,xmm7 pxor xmm3,xmm5 movdqa xmm2,[ebx-48] movdqa xmm0,xmm3 pslld xmm3,12 psrld xmm0,20 por xmm3,xmm0 movdqa xmm0,[ebx-128] paddd xmm1,xmm3 pxor xmm7,xmm1 movdqa [ebx-80],xmm1 pshufb xmm7,[16+eax] paddd xmm5,xmm7 movdqa xmm6,xmm7 pxor xmm3,xmm5 paddd xmm0,xmm2 movdqa xmm1,xmm3 pslld xmm3,7 psrld xmm1,25 pxor xmm6,xmm0 por xmm3,xmm1 pshufb xmm6,[eax] movdqa [ebx-16],xmm3 paddd xmm4,xmm6 pxor xmm2,xmm4 movdqa xmm3,[ebx-32] movdqa xmm1,xmm2 pslld xmm2,12 psrld xmm1,20 por xmm2,xmm1 movdqa xmm1,[ebx-112] paddd xmm0,xmm2 movdqa xmm7,[64+ebx] pxor xmm6,xmm0 movdqa [ebx-128],xmm0 pshufb xmm6,[16+eax] paddd xmm4,xmm6 movdqa [112+ebx],xmm6 pxor xmm2,xmm4 paddd xmm1,xmm3 movdqa xmm0,xmm2 pslld xmm2,7 psrld xmm0,25 pxor xmm7,xmm1 por xmm2,xmm0 movdqa [32+ebx],xmm4 pshufb xmm7,[eax] movdqa [ebx-48],xmm2 paddd xmm5,xmm7 movdqa xmm4,[ebx] pxor xmm3,xmm5 movdqa xmm2,[ebx-16] movdqa xmm0,xmm3 pslld xmm3,12 psrld xmm0,20 por xmm3,xmm0 movdqa xmm0,[ebx-96] paddd xmm1,xmm3 movdqa xmm6,[80+ebx] pxor xmm7,xmm1 movdqa [ebx-112],xmm1 pshufb xmm7,[16+eax] paddd xmm5,xmm7 movdqa [64+ebx],xmm7 pxor xmm3,xmm5 paddd xmm0,xmm2 movdqa xmm1,xmm3 pslld xmm3,7 psrld xmm1,25 pxor xmm6,xmm0 por xmm3,xmm1 movdqa [48+ebx],xmm5 pshufb xmm6,[eax] movdqa [ebx-32],xmm3 paddd xmm4,xmm6 movdqa xmm5,[16+ebx] pxor xmm2,xmm4 movdqa xmm3,[ebx-64] movdqa xmm1,xmm2 pslld xmm2,12 psrld xmm1,20 por xmm2,xmm1 movdqa xmm1,[ebx-80] paddd xmm0,xmm2 movdqa xmm7,[96+ebx] pxor xmm6,xmm0 movdqa [ebx-96],xmm0 pshufb xmm6,[16+eax] paddd xmm4,xmm6 movdqa [80+ebx],xmm6 pxor xmm2,xmm4 paddd xmm1,xmm3 movdqa xmm0,xmm2 pslld xmm2,7 psrld xmm0,25 pxor xmm7,xmm1 por xmm2,xmm0 pshufb xmm7,[eax] movdqa [ebx-16],xmm2 paddd xmm5,xmm7 pxor xmm3,xmm5 movdqa xmm0,xmm3 pslld xmm3,12 psrld xmm0,20 por xmm3,xmm0 movdqa xmm0,[ebx-128] paddd xmm1,xmm3 movdqa xmm6,[64+ebx] pxor xmm7,xmm1 movdqa [ebx-80],xmm1 pshufb xmm7,[16+eax] paddd xmm5,xmm7 
movdqa [96+ebx],xmm7 pxor xmm3,xmm5 movdqa xmm1,xmm3 pslld xmm3,7 psrld xmm1,25 por xmm3,xmm1 dec edx jnz NEAR L$002loop movdqa [ebx-64],xmm3 movdqa [ebx],xmm4 movdqa [16+ebx],xmm5 movdqa [64+ebx],xmm6 movdqa [96+ebx],xmm7 movdqa xmm1,[ebx-112] movdqa xmm2,[ebx-96] movdqa xmm3,[ebx-80] paddd xmm0,[ebp-128] paddd xmm1,[ebp-112] paddd xmm2,[ebp-96] paddd xmm3,[ebp-80] movdqa xmm6,xmm0 punpckldq xmm0,xmm1 movdqa xmm7,xmm2 punpckldq xmm2,xmm3 punpckhdq xmm6,xmm1 punpckhdq xmm7,xmm3 movdqa xmm1,xmm0 punpcklqdq xmm0,xmm2 movdqa xmm3,xmm6 punpcklqdq xmm6,xmm7 punpckhqdq xmm1,xmm2 punpckhqdq xmm3,xmm7 movdqu xmm4,[esi-128] movdqu xmm5,[esi-64] movdqu xmm2,[esi] movdqu xmm7,[64+esi] lea esi,[16+esi] pxor xmm4,xmm0 movdqa xmm0,[ebx-64] pxor xmm5,xmm1 movdqa xmm1,[ebx-48] pxor xmm6,xmm2 movdqa xmm2,[ebx-32] pxor xmm7,xmm3 movdqa xmm3,[ebx-16] movdqu [edi-128],xmm4 movdqu [edi-64],xmm5 movdqu [edi],xmm6 movdqu [64+edi],xmm7 lea edi,[16+edi] paddd xmm0,[ebp-64] paddd xmm1,[ebp-48] paddd xmm2,[ebp-32] paddd xmm3,[ebp-16] movdqa xmm6,xmm0 punpckldq xmm0,xmm1 movdqa xmm7,xmm2 punpckldq xmm2,xmm3 punpckhdq xmm6,xmm1 punpckhdq xmm7,xmm3 movdqa xmm1,xmm0 punpcklqdq xmm0,xmm2 movdqa xmm3,xmm6 punpcklqdq xmm6,xmm7 punpckhqdq xmm1,xmm2 punpckhqdq xmm3,xmm7 movdqu xmm4,[esi-128] movdqu xmm5,[esi-64] movdqu xmm2,[esi] movdqu xmm7,[64+esi] lea esi,[16+esi] pxor xmm4,xmm0 movdqa xmm0,[ebx] pxor xmm5,xmm1 movdqa xmm1,[16+ebx] pxor xmm6,xmm2 movdqa xmm2,[32+ebx] pxor xmm7,xmm3 movdqa xmm3,[48+ebx] movdqu [edi-128],xmm4 movdqu [edi-64],xmm5 movdqu [edi],xmm6 movdqu [64+edi],xmm7 lea edi,[16+edi] paddd xmm0,[ebp] paddd xmm1,[16+ebp] paddd xmm2,[32+ebp] paddd xmm3,[48+ebp] movdqa xmm6,xmm0 punpckldq xmm0,xmm1 movdqa xmm7,xmm2 punpckldq xmm2,xmm3 punpckhdq xmm6,xmm1 punpckhdq xmm7,xmm3 movdqa xmm1,xmm0 punpcklqdq xmm0,xmm2 movdqa xmm3,xmm6 punpcklqdq xmm6,xmm7 punpckhqdq xmm1,xmm2 punpckhqdq xmm3,xmm7 movdqu xmm4,[esi-128] movdqu xmm5,[esi-64] movdqu xmm2,[esi] movdqu xmm7,[64+esi] lea esi,[16+esi] pxor xmm4,xmm0 movdqa xmm0,[64+ebx] pxor xmm5,xmm1 movdqa xmm1,[80+ebx] pxor xmm6,xmm2 movdqa xmm2,[96+ebx] pxor xmm7,xmm3 movdqa xmm3,[112+ebx] movdqu [edi-128],xmm4 movdqu [edi-64],xmm5 movdqu [edi],xmm6 movdqu [64+edi],xmm7 lea edi,[16+edi] paddd xmm0,[64+ebp] paddd xmm1,[80+ebp] paddd xmm2,[96+ebp] paddd xmm3,[112+ebp] movdqa xmm6,xmm0 punpckldq xmm0,xmm1 movdqa xmm7,xmm2 punpckldq xmm2,xmm3 punpckhdq xmm6,xmm1 punpckhdq xmm7,xmm3 movdqa xmm1,xmm0 punpcklqdq xmm0,xmm2 movdqa xmm3,xmm6 punpcklqdq xmm6,xmm7 punpckhqdq xmm1,xmm2 punpckhqdq xmm3,xmm7 movdqu xmm4,[esi-128] movdqu xmm5,[esi-64] movdqu xmm2,[esi] movdqu xmm7,[64+esi] lea esi,[208+esi] pxor xmm4,xmm0 pxor xmm5,xmm1 pxor xmm6,xmm2 pxor xmm7,xmm3 movdqu [edi-128],xmm4 movdqu [edi-64],xmm5 movdqu [edi],xmm6 movdqu [64+edi],xmm7 lea edi,[208+edi] sub ecx,256 jnc NEAR L$001outer_loop add ecx,256 jz NEAR L$003done mov ebx,DWORD [520+esp] lea esi,[esi-128] mov edx,DWORD [516+esp] lea edi,[edi-128] movd xmm2,DWORD [64+ebp] movdqu xmm3,[ebx] paddd xmm2,[96+eax] pand xmm3,[112+eax] por xmm3,xmm2 L$0001x: movdqa xmm0,[32+eax] movdqu xmm1,[edx] movdqu xmm2,[16+edx] movdqa xmm6,[eax] movdqa xmm7,[16+eax] mov DWORD [48+esp],ebp movdqa [esp],xmm0 movdqa [16+esp],xmm1 movdqa [32+esp],xmm2 movdqa [48+esp],xmm3 mov edx,10 jmp NEAR L$004loop1x align 16 L$005outer1x: movdqa xmm3,[80+eax] movdqa xmm0,[esp] movdqa xmm1,[16+esp] movdqa xmm2,[32+esp] paddd xmm3,[48+esp] mov edx,10 movdqa [48+esp],xmm3 jmp NEAR L$004loop1x align 16 L$004loop1x: paddd xmm0,xmm1 pxor xmm3,xmm0 db 
102,15,56,0,222 paddd xmm2,xmm3 pxor xmm1,xmm2 movdqa xmm4,xmm1 psrld xmm1,20 pslld xmm4,12 por xmm1,xmm4 paddd xmm0,xmm1 pxor xmm3,xmm0 db 102,15,56,0,223 paddd xmm2,xmm3 pxor xmm1,xmm2 movdqa xmm4,xmm1 psrld xmm1,25 pslld xmm4,7 por xmm1,xmm4 pshufd xmm2,xmm2,78 pshufd xmm1,xmm1,57 pshufd xmm3,xmm3,147 nop paddd xmm0,xmm1 pxor xmm3,xmm0 db 102,15,56,0,222 paddd xmm2,xmm3 pxor xmm1,xmm2 movdqa xmm4,xmm1 psrld xmm1,20 pslld xmm4,12 por xmm1,xmm4 paddd xmm0,xmm1 pxor xmm3,xmm0 db 102,15,56,0,223 paddd xmm2,xmm3 pxor xmm1,xmm2 movdqa xmm4,xmm1 psrld xmm1,25 pslld xmm4,7 por xmm1,xmm4 pshufd xmm2,xmm2,78 pshufd xmm1,xmm1,147 pshufd xmm3,xmm3,57 dec edx jnz NEAR L$004loop1x paddd xmm0,[esp] paddd xmm1,[16+esp] paddd xmm2,[32+esp] paddd xmm3,[48+esp] cmp ecx,64 jb NEAR L$006tail movdqu xmm4,[esi] movdqu xmm5,[16+esi] pxor xmm0,xmm4 movdqu xmm4,[32+esi] pxor xmm1,xmm5 movdqu xmm5,[48+esi] pxor xmm2,xmm4 pxor xmm3,xmm5 lea esi,[64+esi] movdqu [edi],xmm0 movdqu [16+edi],xmm1 movdqu [32+edi],xmm2 movdqu [48+edi],xmm3 lea edi,[64+edi] sub ecx,64 jnz NEAR L$005outer1x jmp NEAR L$003done L$006tail: movdqa [esp],xmm0 movdqa [16+esp],xmm1 movdqa [32+esp],xmm2 movdqa [48+esp],xmm3 xor eax,eax xor edx,edx xor ebp,ebp L$007tail_loop: mov al,BYTE [ebp*1+esp] mov dl,BYTE [ebp*1+esi] lea ebp,[1+ebp] xor al,dl mov BYTE [ebp*1+edi-1],al dec ecx jnz NEAR L$007tail_loop L$003done: mov esp,DWORD [512+esp] pop edi pop esi pop ebx pop ebp ret align 64 L$ssse3_data: db 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 db 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 dd 1634760805,857760878,2036477234,1797285236 dd 0,1,2,3 dd 4,4,4,4 dd 1,0,0,0 dd 4,0,0,0 dd 0,-1,-1,-1 align 64 db 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54 db 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 db 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 db 114,103,62,0 %else ; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 ret %endif ring-0.17.14/pregenerated/chacha-x86-win32n.o000064400000000000000000000213201046102023000165300ustar 00000000000000
[binary file: pregenerated COFF object produced by The Netwide Assembler 2.13.03 from pregenerated/chacha-x86-win32n.asm, exporting _ring_core_0_17_14__ChaCha20_ctr32_ssse3 with labels L$_ChaCha20_ctr32_ssse3_begin, L$pic_point, L$001outer_loop, L$002loop, L$0001x, L$005outer1x, L$004loop1x, L$006tail, L$007tail_loop, L$003done, L$ssse3_data; raw object contents omitted]
ring-0.17.14/pregenerated/chacha-x86_64-elf.S000064400000000000000000000745311046102023000164470ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) .text .section .rodata .align 64 .Lzero: .long 0,0,0,0 .Lone: .long 1,0,0,0 .Linc: .long 0,1,2,3 .Lfour: .long 4,4,4,4 .Lincy: .long 0,2,4,6,1,3,5,7 .Leight: .long 8,8,8,8,8,8,8,8 .Lrot16: .byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd .Lrot24: .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe .Lsigma: .byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0 .align 64 .Lzeroz: .long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 .Lfourz: .long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 .Lincz: .long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 .Lsixteen: .long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .text .globl ChaCha20_ctr32_nohw .hidden ChaCha20_ctr32_nohw .type ChaCha20_ctr32_nohw,@function .align 64 ChaCha20_ctr32_nohw: .cfi_startproc _CET_ENDBR pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset rbx,-16 pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset rbp,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset r15,-56 subq $64+24,%rsp .cfi_adjust_cfa_offset 88 .Lctr32_body: movdqu (%rcx),%xmm1 movdqu 16(%rcx),%xmm2 movdqu (%r8),%xmm3 movdqa .Lone(%rip),%xmm4 movdqa %xmm1,16(%rsp) movdqa %xmm2,32(%rsp) movdqa %xmm3,48(%rsp) movq %rdx,%rbp jmp .Loop_outer .align 32 .Loop_outer: movl $0x61707865,%eax movl $0x3320646e,%ebx movl $0x79622d32,%ecx movl $0x6b206574,%edx movl 16(%rsp),%r8d movl 20(%rsp),%r9d movl 24(%rsp),%r10d movl 28(%rsp),%r11d movd %xmm3,%r12d movl 52(%rsp),%r13d movl 56(%rsp),%r14d movl
60(%rsp),%r15d movq %rbp,64+0(%rsp) movl $10,%ebp movq %rsi,64+8(%rsp) .byte 102,72,15,126,214 movq %rdi,64+16(%rsp) movq %rsi,%rdi shrq $32,%rdi jmp .Loop .align 32 .Loop: addl %r8d,%eax xorl %eax,%r12d roll $16,%r12d addl %r9d,%ebx xorl %ebx,%r13d roll $16,%r13d addl %r12d,%esi xorl %esi,%r8d roll $12,%r8d addl %r13d,%edi xorl %edi,%r9d roll $12,%r9d addl %r8d,%eax xorl %eax,%r12d roll $8,%r12d addl %r9d,%ebx xorl %ebx,%r13d roll $8,%r13d addl %r12d,%esi xorl %esi,%r8d roll $7,%r8d addl %r13d,%edi xorl %edi,%r9d roll $7,%r9d movl %esi,32(%rsp) movl %edi,36(%rsp) movl 40(%rsp),%esi movl 44(%rsp),%edi addl %r10d,%ecx xorl %ecx,%r14d roll $16,%r14d addl %r11d,%edx xorl %edx,%r15d roll $16,%r15d addl %r14d,%esi xorl %esi,%r10d roll $12,%r10d addl %r15d,%edi xorl %edi,%r11d roll $12,%r11d addl %r10d,%ecx xorl %ecx,%r14d roll $8,%r14d addl %r11d,%edx xorl %edx,%r15d roll $8,%r15d addl %r14d,%esi xorl %esi,%r10d roll $7,%r10d addl %r15d,%edi xorl %edi,%r11d roll $7,%r11d addl %r9d,%eax xorl %eax,%r15d roll $16,%r15d addl %r10d,%ebx xorl %ebx,%r12d roll $16,%r12d addl %r15d,%esi xorl %esi,%r9d roll $12,%r9d addl %r12d,%edi xorl %edi,%r10d roll $12,%r10d addl %r9d,%eax xorl %eax,%r15d roll $8,%r15d addl %r10d,%ebx xorl %ebx,%r12d roll $8,%r12d addl %r15d,%esi xorl %esi,%r9d roll $7,%r9d addl %r12d,%edi xorl %edi,%r10d roll $7,%r10d movl %esi,40(%rsp) movl %edi,44(%rsp) movl 32(%rsp),%esi movl 36(%rsp),%edi addl %r11d,%ecx xorl %ecx,%r13d roll $16,%r13d addl %r8d,%edx xorl %edx,%r14d roll $16,%r14d addl %r13d,%esi xorl %esi,%r11d roll $12,%r11d addl %r14d,%edi xorl %edi,%r8d roll $12,%r8d addl %r11d,%ecx xorl %ecx,%r13d roll $8,%r13d addl %r8d,%edx xorl %edx,%r14d roll $8,%r14d addl %r13d,%esi xorl %esi,%r11d roll $7,%r11d addl %r14d,%edi xorl %edi,%r8d roll $7,%r8d decl %ebp jnz .Loop movl %edi,36(%rsp) movl %esi,32(%rsp) movq 64(%rsp),%rbp movdqa %xmm2,%xmm1 movq 64+8(%rsp),%rsi paddd %xmm4,%xmm3 movq 64+16(%rsp),%rdi addl $0x61707865,%eax addl $0x3320646e,%ebx addl $0x79622d32,%ecx addl $0x6b206574,%edx addl 16(%rsp),%r8d addl 20(%rsp),%r9d addl 24(%rsp),%r10d addl 28(%rsp),%r11d addl 48(%rsp),%r12d addl 52(%rsp),%r13d addl 56(%rsp),%r14d addl 60(%rsp),%r15d paddd 32(%rsp),%xmm1 cmpq $64,%rbp jb .Ltail xorl 0(%rsi),%eax xorl 4(%rsi),%ebx xorl 8(%rsi),%ecx xorl 12(%rsi),%edx xorl 16(%rsi),%r8d xorl 20(%rsi),%r9d xorl 24(%rsi),%r10d xorl 28(%rsi),%r11d movdqu 32(%rsi),%xmm0 xorl 48(%rsi),%r12d xorl 52(%rsi),%r13d xorl 56(%rsi),%r14d xorl 60(%rsi),%r15d leaq 64(%rsi),%rsi pxor %xmm1,%xmm0 movdqa %xmm2,32(%rsp) movd %xmm3,48(%rsp) movl %eax,0(%rdi) movl %ebx,4(%rdi) movl %ecx,8(%rdi) movl %edx,12(%rdi) movl %r8d,16(%rdi) movl %r9d,20(%rdi) movl %r10d,24(%rdi) movl %r11d,28(%rdi) movdqu %xmm0,32(%rdi) movl %r12d,48(%rdi) movl %r13d,52(%rdi) movl %r14d,56(%rdi) movl %r15d,60(%rdi) leaq 64(%rdi),%rdi subq $64,%rbp jnz .Loop_outer jmp .Ldone .align 16 .Ltail: movl %eax,0(%rsp) movl %ebx,4(%rsp) xorq %rbx,%rbx movl %ecx,8(%rsp) movl %edx,12(%rsp) movl %r8d,16(%rsp) movl %r9d,20(%rsp) movl %r10d,24(%rsp) movl %r11d,28(%rsp) movdqa %xmm1,32(%rsp) movl %r12d,48(%rsp) movl %r13d,52(%rsp) movl %r14d,56(%rsp) movl %r15d,60(%rsp) .Loop_tail: movzbl (%rsi,%rbx,1),%eax movzbl (%rsp,%rbx,1),%edx leaq 1(%rbx),%rbx xorl %edx,%eax movb %al,-1(%rdi,%rbx,1) decq %rbp jnz .Loop_tail .Ldone: leaq 64+24+48(%rsp),%rsi movq -48(%rsi),%r15 .cfi_restore r15 movq -40(%rsi),%r14 .cfi_restore r14 movq -32(%rsi),%r13 .cfi_restore r13 movq -24(%rsi),%r12 .cfi_restore r12 movq -16(%rsi),%rbp .cfi_restore rbp movq -8(%rsi),%rbx 
.cfi_restore rbx leaq (%rsi),%rsp .cfi_adjust_cfa_offset -136 .Lno_data: ret .cfi_endproc .size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw .globl ChaCha20_ctr32_ssse3_4x .hidden ChaCha20_ctr32_ssse3_4x .type ChaCha20_ctr32_ssse3_4x,@function .align 32 ChaCha20_ctr32_ssse3_4x: .cfi_startproc _CET_ENDBR movq %rsp,%r9 .cfi_def_cfa_register r9 subq $0x140+8,%rsp movdqa .Lsigma(%rip),%xmm11 movdqu (%rcx),%xmm15 movdqu 16(%rcx),%xmm7 movdqu (%r8),%xmm3 leaq 256(%rsp),%rcx leaq .Lrot16(%rip),%r10 leaq .Lrot24(%rip),%r11 pshufd $0x00,%xmm11,%xmm8 pshufd $0x55,%xmm11,%xmm9 movdqa %xmm8,64(%rsp) pshufd $0xaa,%xmm11,%xmm10 movdqa %xmm9,80(%rsp) pshufd $0xff,%xmm11,%xmm11 movdqa %xmm10,96(%rsp) movdqa %xmm11,112(%rsp) pshufd $0x00,%xmm15,%xmm12 pshufd $0x55,%xmm15,%xmm13 movdqa %xmm12,128-256(%rcx) pshufd $0xaa,%xmm15,%xmm14 movdqa %xmm13,144-256(%rcx) pshufd $0xff,%xmm15,%xmm15 movdqa %xmm14,160-256(%rcx) movdqa %xmm15,176-256(%rcx) pshufd $0x00,%xmm7,%xmm4 pshufd $0x55,%xmm7,%xmm5 movdqa %xmm4,192-256(%rcx) pshufd $0xaa,%xmm7,%xmm6 movdqa %xmm5,208-256(%rcx) pshufd $0xff,%xmm7,%xmm7 movdqa %xmm6,224-256(%rcx) movdqa %xmm7,240-256(%rcx) pshufd $0x00,%xmm3,%xmm0 pshufd $0x55,%xmm3,%xmm1 paddd .Linc(%rip),%xmm0 pshufd $0xaa,%xmm3,%xmm2 movdqa %xmm1,272-256(%rcx) pshufd $0xff,%xmm3,%xmm3 movdqa %xmm2,288-256(%rcx) movdqa %xmm3,304-256(%rcx) jmp .Loop_enter4x .align 32 .Loop_outer4x: movdqa 64(%rsp),%xmm8 movdqa 80(%rsp),%xmm9 movdqa 96(%rsp),%xmm10 movdqa 112(%rsp),%xmm11 movdqa 128-256(%rcx),%xmm12 movdqa 144-256(%rcx),%xmm13 movdqa 160-256(%rcx),%xmm14 movdqa 176-256(%rcx),%xmm15 movdqa 192-256(%rcx),%xmm4 movdqa 208-256(%rcx),%xmm5 movdqa 224-256(%rcx),%xmm6 movdqa 240-256(%rcx),%xmm7 movdqa 256-256(%rcx),%xmm0 movdqa 272-256(%rcx),%xmm1 movdqa 288-256(%rcx),%xmm2 movdqa 304-256(%rcx),%xmm3 paddd .Lfour(%rip),%xmm0 .Loop_enter4x: movdqa %xmm6,32(%rsp) movdqa %xmm7,48(%rsp) movdqa (%r10),%xmm7 movl $10,%eax movdqa %xmm0,256-256(%rcx) jmp .Loop4x .align 32 .Loop4x: paddd %xmm12,%xmm8 paddd %xmm13,%xmm9 pxor %xmm8,%xmm0 pxor %xmm9,%xmm1 .byte 102,15,56,0,199 .byte 102,15,56,0,207 paddd %xmm0,%xmm4 paddd %xmm1,%xmm5 pxor %xmm4,%xmm12 pxor %xmm5,%xmm13 movdqa %xmm12,%xmm6 pslld $12,%xmm12 psrld $20,%xmm6 movdqa %xmm13,%xmm7 pslld $12,%xmm13 por %xmm6,%xmm12 psrld $20,%xmm7 movdqa (%r11),%xmm6 por %xmm7,%xmm13 paddd %xmm12,%xmm8 paddd %xmm13,%xmm9 pxor %xmm8,%xmm0 pxor %xmm9,%xmm1 .byte 102,15,56,0,198 .byte 102,15,56,0,206 paddd %xmm0,%xmm4 paddd %xmm1,%xmm5 pxor %xmm4,%xmm12 pxor %xmm5,%xmm13 movdqa %xmm12,%xmm7 pslld $7,%xmm12 psrld $25,%xmm7 movdqa %xmm13,%xmm6 pslld $7,%xmm13 por %xmm7,%xmm12 psrld $25,%xmm6 movdqa (%r10),%xmm7 por %xmm6,%xmm13 movdqa %xmm4,0(%rsp) movdqa %xmm5,16(%rsp) movdqa 32(%rsp),%xmm4 movdqa 48(%rsp),%xmm5 paddd %xmm14,%xmm10 paddd %xmm15,%xmm11 pxor %xmm10,%xmm2 pxor %xmm11,%xmm3 .byte 102,15,56,0,215 .byte 102,15,56,0,223 paddd %xmm2,%xmm4 paddd %xmm3,%xmm5 pxor %xmm4,%xmm14 pxor %xmm5,%xmm15 movdqa %xmm14,%xmm6 pslld $12,%xmm14 psrld $20,%xmm6 movdqa %xmm15,%xmm7 pslld $12,%xmm15 por %xmm6,%xmm14 psrld $20,%xmm7 movdqa (%r11),%xmm6 por %xmm7,%xmm15 paddd %xmm14,%xmm10 paddd %xmm15,%xmm11 pxor %xmm10,%xmm2 pxor %xmm11,%xmm3 .byte 102,15,56,0,214 .byte 102,15,56,0,222 paddd %xmm2,%xmm4 paddd %xmm3,%xmm5 pxor %xmm4,%xmm14 pxor %xmm5,%xmm15 movdqa %xmm14,%xmm7 pslld $7,%xmm14 psrld $25,%xmm7 movdqa %xmm15,%xmm6 pslld $7,%xmm15 por %xmm7,%xmm14 psrld $25,%xmm6 movdqa (%r10),%xmm7 por %xmm6,%xmm15 paddd %xmm13,%xmm8 paddd %xmm14,%xmm9 pxor %xmm8,%xmm3 pxor %xmm9,%xmm0 .byte 
102,15,56,0,223 .byte 102,15,56,0,199 paddd %xmm3,%xmm4 paddd %xmm0,%xmm5 pxor %xmm4,%xmm13 pxor %xmm5,%xmm14 movdqa %xmm13,%xmm6 pslld $12,%xmm13 psrld $20,%xmm6 movdqa %xmm14,%xmm7 pslld $12,%xmm14 por %xmm6,%xmm13 psrld $20,%xmm7 movdqa (%r11),%xmm6 por %xmm7,%xmm14 paddd %xmm13,%xmm8 paddd %xmm14,%xmm9 pxor %xmm8,%xmm3 pxor %xmm9,%xmm0 .byte 102,15,56,0,222 .byte 102,15,56,0,198 paddd %xmm3,%xmm4 paddd %xmm0,%xmm5 pxor %xmm4,%xmm13 pxor %xmm5,%xmm14 movdqa %xmm13,%xmm7 pslld $7,%xmm13 psrld $25,%xmm7 movdqa %xmm14,%xmm6 pslld $7,%xmm14 por %xmm7,%xmm13 psrld $25,%xmm6 movdqa (%r10),%xmm7 por %xmm6,%xmm14 movdqa %xmm4,32(%rsp) movdqa %xmm5,48(%rsp) movdqa 0(%rsp),%xmm4 movdqa 16(%rsp),%xmm5 paddd %xmm15,%xmm10 paddd %xmm12,%xmm11 pxor %xmm10,%xmm1 pxor %xmm11,%xmm2 .byte 102,15,56,0,207 .byte 102,15,56,0,215 paddd %xmm1,%xmm4 paddd %xmm2,%xmm5 pxor %xmm4,%xmm15 pxor %xmm5,%xmm12 movdqa %xmm15,%xmm6 pslld $12,%xmm15 psrld $20,%xmm6 movdqa %xmm12,%xmm7 pslld $12,%xmm12 por %xmm6,%xmm15 psrld $20,%xmm7 movdqa (%r11),%xmm6 por %xmm7,%xmm12 paddd %xmm15,%xmm10 paddd %xmm12,%xmm11 pxor %xmm10,%xmm1 pxor %xmm11,%xmm2 .byte 102,15,56,0,206 .byte 102,15,56,0,214 paddd %xmm1,%xmm4 paddd %xmm2,%xmm5 pxor %xmm4,%xmm15 pxor %xmm5,%xmm12 movdqa %xmm15,%xmm7 pslld $7,%xmm15 psrld $25,%xmm7 movdqa %xmm12,%xmm6 pslld $7,%xmm12 por %xmm7,%xmm15 psrld $25,%xmm6 movdqa (%r10),%xmm7 por %xmm6,%xmm12 decl %eax jnz .Loop4x paddd 64(%rsp),%xmm8 paddd 80(%rsp),%xmm9 paddd 96(%rsp),%xmm10 paddd 112(%rsp),%xmm11 movdqa %xmm8,%xmm6 punpckldq %xmm9,%xmm8 movdqa %xmm10,%xmm7 punpckldq %xmm11,%xmm10 punpckhdq %xmm9,%xmm6 punpckhdq %xmm11,%xmm7 movdqa %xmm8,%xmm9 punpcklqdq %xmm10,%xmm8 movdqa %xmm6,%xmm11 punpcklqdq %xmm7,%xmm6 punpckhqdq %xmm10,%xmm9 punpckhqdq %xmm7,%xmm11 paddd 128-256(%rcx),%xmm12 paddd 144-256(%rcx),%xmm13 paddd 160-256(%rcx),%xmm14 paddd 176-256(%rcx),%xmm15 movdqa %xmm8,0(%rsp) movdqa %xmm9,16(%rsp) movdqa 32(%rsp),%xmm8 movdqa 48(%rsp),%xmm9 movdqa %xmm12,%xmm10 punpckldq %xmm13,%xmm12 movdqa %xmm14,%xmm7 punpckldq %xmm15,%xmm14 punpckhdq %xmm13,%xmm10 punpckhdq %xmm15,%xmm7 movdqa %xmm12,%xmm13 punpcklqdq %xmm14,%xmm12 movdqa %xmm10,%xmm15 punpcklqdq %xmm7,%xmm10 punpckhqdq %xmm14,%xmm13 punpckhqdq %xmm7,%xmm15 paddd 192-256(%rcx),%xmm4 paddd 208-256(%rcx),%xmm5 paddd 224-256(%rcx),%xmm8 paddd 240-256(%rcx),%xmm9 movdqa %xmm6,32(%rsp) movdqa %xmm11,48(%rsp) movdqa %xmm4,%xmm14 punpckldq %xmm5,%xmm4 movdqa %xmm8,%xmm7 punpckldq %xmm9,%xmm8 punpckhdq %xmm5,%xmm14 punpckhdq %xmm9,%xmm7 movdqa %xmm4,%xmm5 punpcklqdq %xmm8,%xmm4 movdqa %xmm14,%xmm9 punpcklqdq %xmm7,%xmm14 punpckhqdq %xmm8,%xmm5 punpckhqdq %xmm7,%xmm9 paddd 256-256(%rcx),%xmm0 paddd 272-256(%rcx),%xmm1 paddd 288-256(%rcx),%xmm2 paddd 304-256(%rcx),%xmm3 movdqa %xmm0,%xmm8 punpckldq %xmm1,%xmm0 movdqa %xmm2,%xmm7 punpckldq %xmm3,%xmm2 punpckhdq %xmm1,%xmm8 punpckhdq %xmm3,%xmm7 movdqa %xmm0,%xmm1 punpcklqdq %xmm2,%xmm0 movdqa %xmm8,%xmm3 punpcklqdq %xmm7,%xmm8 punpckhqdq %xmm2,%xmm1 punpckhqdq %xmm7,%xmm3 cmpq $256,%rdx jb .Ltail4x movdqu 0(%rsi),%xmm6 movdqu 16(%rsi),%xmm11 movdqu 32(%rsi),%xmm2 movdqu 48(%rsi),%xmm7 pxor 0(%rsp),%xmm6 pxor %xmm12,%xmm11 pxor %xmm4,%xmm2 pxor %xmm0,%xmm7 movdqu %xmm6,0(%rdi) movdqu 64(%rsi),%xmm6 movdqu %xmm11,16(%rdi) movdqu 80(%rsi),%xmm11 movdqu %xmm2,32(%rdi) movdqu 96(%rsi),%xmm2 movdqu %xmm7,48(%rdi) movdqu 112(%rsi),%xmm7 leaq 128(%rsi),%rsi pxor 16(%rsp),%xmm6 pxor %xmm13,%xmm11 pxor %xmm5,%xmm2 pxor %xmm1,%xmm7 movdqu %xmm6,64(%rdi) movdqu 0(%rsi),%xmm6 movdqu %xmm11,80(%rdi) movdqu 
16(%rsi),%xmm11 movdqu %xmm2,96(%rdi) movdqu 32(%rsi),%xmm2 movdqu %xmm7,112(%rdi) leaq 128(%rdi),%rdi movdqu 48(%rsi),%xmm7 pxor 32(%rsp),%xmm6 pxor %xmm10,%xmm11 pxor %xmm14,%xmm2 pxor %xmm8,%xmm7 movdqu %xmm6,0(%rdi) movdqu 64(%rsi),%xmm6 movdqu %xmm11,16(%rdi) movdqu 80(%rsi),%xmm11 movdqu %xmm2,32(%rdi) movdqu 96(%rsi),%xmm2 movdqu %xmm7,48(%rdi) movdqu 112(%rsi),%xmm7 leaq 128(%rsi),%rsi pxor 48(%rsp),%xmm6 pxor %xmm15,%xmm11 pxor %xmm9,%xmm2 pxor %xmm3,%xmm7 movdqu %xmm6,64(%rdi) movdqu %xmm11,80(%rdi) movdqu %xmm2,96(%rdi) movdqu %xmm7,112(%rdi) leaq 128(%rdi),%rdi subq $256,%rdx jnz .Loop_outer4x jmp .Ldone4x .Ltail4x: cmpq $192,%rdx jae .L192_or_more4x cmpq $128,%rdx jae .L128_or_more4x cmpq $64,%rdx jae .L64_or_more4x xorq %r10,%r10 movdqa %xmm12,16(%rsp) movdqa %xmm4,32(%rsp) movdqa %xmm0,48(%rsp) jmp .Loop_tail4x .align 32 .L64_or_more4x: movdqu 0(%rsi),%xmm6 movdqu 16(%rsi),%xmm11 movdqu 32(%rsi),%xmm2 movdqu 48(%rsi),%xmm7 pxor 0(%rsp),%xmm6 pxor %xmm12,%xmm11 pxor %xmm4,%xmm2 pxor %xmm0,%xmm7 movdqu %xmm6,0(%rdi) movdqu %xmm11,16(%rdi) movdqu %xmm2,32(%rdi) movdqu %xmm7,48(%rdi) je .Ldone4x movdqa 16(%rsp),%xmm6 leaq 64(%rsi),%rsi xorq %r10,%r10 movdqa %xmm6,0(%rsp) movdqa %xmm13,16(%rsp) leaq 64(%rdi),%rdi movdqa %xmm5,32(%rsp) subq $64,%rdx movdqa %xmm1,48(%rsp) jmp .Loop_tail4x .align 32 .L128_or_more4x: movdqu 0(%rsi),%xmm6 movdqu 16(%rsi),%xmm11 movdqu 32(%rsi),%xmm2 movdqu 48(%rsi),%xmm7 pxor 0(%rsp),%xmm6 pxor %xmm12,%xmm11 pxor %xmm4,%xmm2 pxor %xmm0,%xmm7 movdqu %xmm6,0(%rdi) movdqu 64(%rsi),%xmm6 movdqu %xmm11,16(%rdi) movdqu 80(%rsi),%xmm11 movdqu %xmm2,32(%rdi) movdqu 96(%rsi),%xmm2 movdqu %xmm7,48(%rdi) movdqu 112(%rsi),%xmm7 pxor 16(%rsp),%xmm6 pxor %xmm13,%xmm11 pxor %xmm5,%xmm2 pxor %xmm1,%xmm7 movdqu %xmm6,64(%rdi) movdqu %xmm11,80(%rdi) movdqu %xmm2,96(%rdi) movdqu %xmm7,112(%rdi) je .Ldone4x movdqa 32(%rsp),%xmm6 leaq 128(%rsi),%rsi xorq %r10,%r10 movdqa %xmm6,0(%rsp) movdqa %xmm10,16(%rsp) leaq 128(%rdi),%rdi movdqa %xmm14,32(%rsp) subq $128,%rdx movdqa %xmm8,48(%rsp) jmp .Loop_tail4x .align 32 .L192_or_more4x: movdqu 0(%rsi),%xmm6 movdqu 16(%rsi),%xmm11 movdqu 32(%rsi),%xmm2 movdqu 48(%rsi),%xmm7 pxor 0(%rsp),%xmm6 pxor %xmm12,%xmm11 pxor %xmm4,%xmm2 pxor %xmm0,%xmm7 movdqu %xmm6,0(%rdi) movdqu 64(%rsi),%xmm6 movdqu %xmm11,16(%rdi) movdqu 80(%rsi),%xmm11 movdqu %xmm2,32(%rdi) movdqu 96(%rsi),%xmm2 movdqu %xmm7,48(%rdi) movdqu 112(%rsi),%xmm7 leaq 128(%rsi),%rsi pxor 16(%rsp),%xmm6 pxor %xmm13,%xmm11 pxor %xmm5,%xmm2 pxor %xmm1,%xmm7 movdqu %xmm6,64(%rdi) movdqu 0(%rsi),%xmm6 movdqu %xmm11,80(%rdi) movdqu 16(%rsi),%xmm11 movdqu %xmm2,96(%rdi) movdqu 32(%rsi),%xmm2 movdqu %xmm7,112(%rdi) leaq 128(%rdi),%rdi movdqu 48(%rsi),%xmm7 pxor 32(%rsp),%xmm6 pxor %xmm10,%xmm11 pxor %xmm14,%xmm2 pxor %xmm8,%xmm7 movdqu %xmm6,0(%rdi) movdqu %xmm11,16(%rdi) movdqu %xmm2,32(%rdi) movdqu %xmm7,48(%rdi) je .Ldone4x movdqa 48(%rsp),%xmm6 leaq 64(%rsi),%rsi xorq %r10,%r10 movdqa %xmm6,0(%rsp) movdqa %xmm15,16(%rsp) leaq 64(%rdi),%rdi movdqa %xmm9,32(%rsp) subq $192,%rdx movdqa %xmm3,48(%rsp) .Loop_tail4x: movzbl (%rsi,%r10,1),%eax movzbl (%rsp,%r10,1),%ecx leaq 1(%r10),%r10 xorl %ecx,%eax movb %al,-1(%rdi,%r10,1) decq %rdx jnz .Loop_tail4x .Ldone4x: leaq (%r9),%rsp .cfi_def_cfa_register rsp .L4x_epilogue: ret .cfi_endproc .size ChaCha20_ctr32_ssse3_4x,.-ChaCha20_ctr32_ssse3_4x .globl ChaCha20_ctr32_avx2 .hidden ChaCha20_ctr32_avx2 .type ChaCha20_ctr32_avx2,@function .align 32 ChaCha20_ctr32_avx2: .cfi_startproc _CET_ENDBR movq %rsp,%r9 .cfi_def_cfa_register r9 subq 
$0x280+8,%rsp andq $-32,%rsp vzeroupper vbroadcasti128 .Lsigma(%rip),%ymm11 vbroadcasti128 (%rcx),%ymm3 vbroadcasti128 16(%rcx),%ymm15 vbroadcasti128 (%r8),%ymm7 leaq 256(%rsp),%rcx leaq 512(%rsp),%rax leaq .Lrot16(%rip),%r10 leaq .Lrot24(%rip),%r11 vpshufd $0x00,%ymm11,%ymm8 vpshufd $0x55,%ymm11,%ymm9 vmovdqa %ymm8,128-256(%rcx) vpshufd $0xaa,%ymm11,%ymm10 vmovdqa %ymm9,160-256(%rcx) vpshufd $0xff,%ymm11,%ymm11 vmovdqa %ymm10,192-256(%rcx) vmovdqa %ymm11,224-256(%rcx) vpshufd $0x00,%ymm3,%ymm0 vpshufd $0x55,%ymm3,%ymm1 vmovdqa %ymm0,256-256(%rcx) vpshufd $0xaa,%ymm3,%ymm2 vmovdqa %ymm1,288-256(%rcx) vpshufd $0xff,%ymm3,%ymm3 vmovdqa %ymm2,320-256(%rcx) vmovdqa %ymm3,352-256(%rcx) vpshufd $0x00,%ymm15,%ymm12 vpshufd $0x55,%ymm15,%ymm13 vmovdqa %ymm12,384-512(%rax) vpshufd $0xaa,%ymm15,%ymm14 vmovdqa %ymm13,416-512(%rax) vpshufd $0xff,%ymm15,%ymm15 vmovdqa %ymm14,448-512(%rax) vmovdqa %ymm15,480-512(%rax) vpshufd $0x00,%ymm7,%ymm4 vpshufd $0x55,%ymm7,%ymm5 vpaddd .Lincy(%rip),%ymm4,%ymm4 vpshufd $0xaa,%ymm7,%ymm6 vmovdqa %ymm5,544-512(%rax) vpshufd $0xff,%ymm7,%ymm7 vmovdqa %ymm6,576-512(%rax) vmovdqa %ymm7,608-512(%rax) jmp .Loop_enter8x .align 32 .Loop_outer8x: vmovdqa 128-256(%rcx),%ymm8 vmovdqa 160-256(%rcx),%ymm9 vmovdqa 192-256(%rcx),%ymm10 vmovdqa 224-256(%rcx),%ymm11 vmovdqa 256-256(%rcx),%ymm0 vmovdqa 288-256(%rcx),%ymm1 vmovdqa 320-256(%rcx),%ymm2 vmovdqa 352-256(%rcx),%ymm3 vmovdqa 384-512(%rax),%ymm12 vmovdqa 416-512(%rax),%ymm13 vmovdqa 448-512(%rax),%ymm14 vmovdqa 480-512(%rax),%ymm15 vmovdqa 512-512(%rax),%ymm4 vmovdqa 544-512(%rax),%ymm5 vmovdqa 576-512(%rax),%ymm6 vmovdqa 608-512(%rax),%ymm7 vpaddd .Leight(%rip),%ymm4,%ymm4 .Loop_enter8x: vmovdqa %ymm14,64(%rsp) vmovdqa %ymm15,96(%rsp) vbroadcasti128 (%r10),%ymm15 vmovdqa %ymm4,512-512(%rax) movl $10,%eax jmp .Loop8x .align 32 .Loop8x: vpaddd %ymm0,%ymm8,%ymm8 vpxor %ymm4,%ymm8,%ymm4 vpshufb %ymm15,%ymm4,%ymm4 vpaddd %ymm1,%ymm9,%ymm9 vpxor %ymm5,%ymm9,%ymm5 vpshufb %ymm15,%ymm5,%ymm5 vpaddd %ymm4,%ymm12,%ymm12 vpxor %ymm0,%ymm12,%ymm0 vpslld $12,%ymm0,%ymm14 vpsrld $20,%ymm0,%ymm0 vpor %ymm0,%ymm14,%ymm0 vbroadcasti128 (%r11),%ymm14 vpaddd %ymm5,%ymm13,%ymm13 vpxor %ymm1,%ymm13,%ymm1 vpslld $12,%ymm1,%ymm15 vpsrld $20,%ymm1,%ymm1 vpor %ymm1,%ymm15,%ymm1 vpaddd %ymm0,%ymm8,%ymm8 vpxor %ymm4,%ymm8,%ymm4 vpshufb %ymm14,%ymm4,%ymm4 vpaddd %ymm1,%ymm9,%ymm9 vpxor %ymm5,%ymm9,%ymm5 vpshufb %ymm14,%ymm5,%ymm5 vpaddd %ymm4,%ymm12,%ymm12 vpxor %ymm0,%ymm12,%ymm0 vpslld $7,%ymm0,%ymm15 vpsrld $25,%ymm0,%ymm0 vpor %ymm0,%ymm15,%ymm0 vbroadcasti128 (%r10),%ymm15 vpaddd %ymm5,%ymm13,%ymm13 vpxor %ymm1,%ymm13,%ymm1 vpslld $7,%ymm1,%ymm14 vpsrld $25,%ymm1,%ymm1 vpor %ymm1,%ymm14,%ymm1 vmovdqa %ymm12,0(%rsp) vmovdqa %ymm13,32(%rsp) vmovdqa 64(%rsp),%ymm12 vmovdqa 96(%rsp),%ymm13 vpaddd %ymm2,%ymm10,%ymm10 vpxor %ymm6,%ymm10,%ymm6 vpshufb %ymm15,%ymm6,%ymm6 vpaddd %ymm3,%ymm11,%ymm11 vpxor %ymm7,%ymm11,%ymm7 vpshufb %ymm15,%ymm7,%ymm7 vpaddd %ymm6,%ymm12,%ymm12 vpxor %ymm2,%ymm12,%ymm2 vpslld $12,%ymm2,%ymm14 vpsrld $20,%ymm2,%ymm2 vpor %ymm2,%ymm14,%ymm2 vbroadcasti128 (%r11),%ymm14 vpaddd %ymm7,%ymm13,%ymm13 vpxor %ymm3,%ymm13,%ymm3 vpslld $12,%ymm3,%ymm15 vpsrld $20,%ymm3,%ymm3 vpor %ymm3,%ymm15,%ymm3 vpaddd %ymm2,%ymm10,%ymm10 vpxor %ymm6,%ymm10,%ymm6 vpshufb %ymm14,%ymm6,%ymm6 vpaddd %ymm3,%ymm11,%ymm11 vpxor %ymm7,%ymm11,%ymm7 vpshufb %ymm14,%ymm7,%ymm7 vpaddd %ymm6,%ymm12,%ymm12 vpxor %ymm2,%ymm12,%ymm2 vpslld $7,%ymm2,%ymm15 vpsrld $25,%ymm2,%ymm2 vpor %ymm2,%ymm15,%ymm2 vbroadcasti128 (%r10),%ymm15 vpaddd %ymm7,%ymm13,%ymm13 vpxor 
%ymm3,%ymm13,%ymm3 vpslld $7,%ymm3,%ymm14 vpsrld $25,%ymm3,%ymm3 vpor %ymm3,%ymm14,%ymm3 vpaddd %ymm1,%ymm8,%ymm8 vpxor %ymm7,%ymm8,%ymm7 vpshufb %ymm15,%ymm7,%ymm7 vpaddd %ymm2,%ymm9,%ymm9 vpxor %ymm4,%ymm9,%ymm4 vpshufb %ymm15,%ymm4,%ymm4 vpaddd %ymm7,%ymm12,%ymm12 vpxor %ymm1,%ymm12,%ymm1 vpslld $12,%ymm1,%ymm14 vpsrld $20,%ymm1,%ymm1 vpor %ymm1,%ymm14,%ymm1 vbroadcasti128 (%r11),%ymm14 vpaddd %ymm4,%ymm13,%ymm13 vpxor %ymm2,%ymm13,%ymm2 vpslld $12,%ymm2,%ymm15 vpsrld $20,%ymm2,%ymm2 vpor %ymm2,%ymm15,%ymm2 vpaddd %ymm1,%ymm8,%ymm8 vpxor %ymm7,%ymm8,%ymm7 vpshufb %ymm14,%ymm7,%ymm7 vpaddd %ymm2,%ymm9,%ymm9 vpxor %ymm4,%ymm9,%ymm4 vpshufb %ymm14,%ymm4,%ymm4 vpaddd %ymm7,%ymm12,%ymm12 vpxor %ymm1,%ymm12,%ymm1 vpslld $7,%ymm1,%ymm15 vpsrld $25,%ymm1,%ymm1 vpor %ymm1,%ymm15,%ymm1 vbroadcasti128 (%r10),%ymm15 vpaddd %ymm4,%ymm13,%ymm13 vpxor %ymm2,%ymm13,%ymm2 vpslld $7,%ymm2,%ymm14 vpsrld $25,%ymm2,%ymm2 vpor %ymm2,%ymm14,%ymm2 vmovdqa %ymm12,64(%rsp) vmovdqa %ymm13,96(%rsp) vmovdqa 0(%rsp),%ymm12 vmovdqa 32(%rsp),%ymm13 vpaddd %ymm3,%ymm10,%ymm10 vpxor %ymm5,%ymm10,%ymm5 vpshufb %ymm15,%ymm5,%ymm5 vpaddd %ymm0,%ymm11,%ymm11 vpxor %ymm6,%ymm11,%ymm6 vpshufb %ymm15,%ymm6,%ymm6 vpaddd %ymm5,%ymm12,%ymm12 vpxor %ymm3,%ymm12,%ymm3 vpslld $12,%ymm3,%ymm14 vpsrld $20,%ymm3,%ymm3 vpor %ymm3,%ymm14,%ymm3 vbroadcasti128 (%r11),%ymm14 vpaddd %ymm6,%ymm13,%ymm13 vpxor %ymm0,%ymm13,%ymm0 vpslld $12,%ymm0,%ymm15 vpsrld $20,%ymm0,%ymm0 vpor %ymm0,%ymm15,%ymm0 vpaddd %ymm3,%ymm10,%ymm10 vpxor %ymm5,%ymm10,%ymm5 vpshufb %ymm14,%ymm5,%ymm5 vpaddd %ymm0,%ymm11,%ymm11 vpxor %ymm6,%ymm11,%ymm6 vpshufb %ymm14,%ymm6,%ymm6 vpaddd %ymm5,%ymm12,%ymm12 vpxor %ymm3,%ymm12,%ymm3 vpslld $7,%ymm3,%ymm15 vpsrld $25,%ymm3,%ymm3 vpor %ymm3,%ymm15,%ymm3 vbroadcasti128 (%r10),%ymm15 vpaddd %ymm6,%ymm13,%ymm13 vpxor %ymm0,%ymm13,%ymm0 vpslld $7,%ymm0,%ymm14 vpsrld $25,%ymm0,%ymm0 vpor %ymm0,%ymm14,%ymm0 decl %eax jnz .Loop8x leaq 512(%rsp),%rax vpaddd 128-256(%rcx),%ymm8,%ymm8 vpaddd 160-256(%rcx),%ymm9,%ymm9 vpaddd 192-256(%rcx),%ymm10,%ymm10 vpaddd 224-256(%rcx),%ymm11,%ymm11 vpunpckldq %ymm9,%ymm8,%ymm14 vpunpckldq %ymm11,%ymm10,%ymm15 vpunpckhdq %ymm9,%ymm8,%ymm8 vpunpckhdq %ymm11,%ymm10,%ymm10 vpunpcklqdq %ymm15,%ymm14,%ymm9 vpunpckhqdq %ymm15,%ymm14,%ymm14 vpunpcklqdq %ymm10,%ymm8,%ymm11 vpunpckhqdq %ymm10,%ymm8,%ymm8 vpaddd 256-256(%rcx),%ymm0,%ymm0 vpaddd 288-256(%rcx),%ymm1,%ymm1 vpaddd 320-256(%rcx),%ymm2,%ymm2 vpaddd 352-256(%rcx),%ymm3,%ymm3 vpunpckldq %ymm1,%ymm0,%ymm10 vpunpckldq %ymm3,%ymm2,%ymm15 vpunpckhdq %ymm1,%ymm0,%ymm0 vpunpckhdq %ymm3,%ymm2,%ymm2 vpunpcklqdq %ymm15,%ymm10,%ymm1 vpunpckhqdq %ymm15,%ymm10,%ymm10 vpunpcklqdq %ymm2,%ymm0,%ymm3 vpunpckhqdq %ymm2,%ymm0,%ymm0 vperm2i128 $0x20,%ymm1,%ymm9,%ymm15 vperm2i128 $0x31,%ymm1,%ymm9,%ymm1 vperm2i128 $0x20,%ymm10,%ymm14,%ymm9 vperm2i128 $0x31,%ymm10,%ymm14,%ymm10 vperm2i128 $0x20,%ymm3,%ymm11,%ymm14 vperm2i128 $0x31,%ymm3,%ymm11,%ymm3 vperm2i128 $0x20,%ymm0,%ymm8,%ymm11 vperm2i128 $0x31,%ymm0,%ymm8,%ymm0 vmovdqa %ymm15,0(%rsp) vmovdqa %ymm9,32(%rsp) vmovdqa 64(%rsp),%ymm15 vmovdqa 96(%rsp),%ymm9 vpaddd 384-512(%rax),%ymm12,%ymm12 vpaddd 416-512(%rax),%ymm13,%ymm13 vpaddd 448-512(%rax),%ymm15,%ymm15 vpaddd 480-512(%rax),%ymm9,%ymm9 vpunpckldq %ymm13,%ymm12,%ymm2 vpunpckldq %ymm9,%ymm15,%ymm8 vpunpckhdq %ymm13,%ymm12,%ymm12 vpunpckhdq %ymm9,%ymm15,%ymm15 vpunpcklqdq %ymm8,%ymm2,%ymm13 vpunpckhqdq %ymm8,%ymm2,%ymm2 vpunpcklqdq %ymm15,%ymm12,%ymm9 vpunpckhqdq %ymm15,%ymm12,%ymm12 vpaddd 512-512(%rax),%ymm4,%ymm4 vpaddd 544-512(%rax),%ymm5,%ymm5 vpaddd 
576-512(%rax),%ymm6,%ymm6 vpaddd 608-512(%rax),%ymm7,%ymm7 vpunpckldq %ymm5,%ymm4,%ymm15 vpunpckldq %ymm7,%ymm6,%ymm8 vpunpckhdq %ymm5,%ymm4,%ymm4 vpunpckhdq %ymm7,%ymm6,%ymm6 vpunpcklqdq %ymm8,%ymm15,%ymm5 vpunpckhqdq %ymm8,%ymm15,%ymm15 vpunpcklqdq %ymm6,%ymm4,%ymm7 vpunpckhqdq %ymm6,%ymm4,%ymm4 vperm2i128 $0x20,%ymm5,%ymm13,%ymm8 vperm2i128 $0x31,%ymm5,%ymm13,%ymm5 vperm2i128 $0x20,%ymm15,%ymm2,%ymm13 vperm2i128 $0x31,%ymm15,%ymm2,%ymm15 vperm2i128 $0x20,%ymm7,%ymm9,%ymm2 vperm2i128 $0x31,%ymm7,%ymm9,%ymm7 vperm2i128 $0x20,%ymm4,%ymm12,%ymm9 vperm2i128 $0x31,%ymm4,%ymm12,%ymm4 vmovdqa 0(%rsp),%ymm6 vmovdqa 32(%rsp),%ymm12 cmpq $512,%rdx jb .Ltail8x vpxor 0(%rsi),%ymm6,%ymm6 vpxor 32(%rsi),%ymm8,%ymm8 vpxor 64(%rsi),%ymm1,%ymm1 vpxor 96(%rsi),%ymm5,%ymm5 leaq 128(%rsi),%rsi vmovdqu %ymm6,0(%rdi) vmovdqu %ymm8,32(%rdi) vmovdqu %ymm1,64(%rdi) vmovdqu %ymm5,96(%rdi) leaq 128(%rdi),%rdi vpxor 0(%rsi),%ymm12,%ymm12 vpxor 32(%rsi),%ymm13,%ymm13 vpxor 64(%rsi),%ymm10,%ymm10 vpxor 96(%rsi),%ymm15,%ymm15 leaq 128(%rsi),%rsi vmovdqu %ymm12,0(%rdi) vmovdqu %ymm13,32(%rdi) vmovdqu %ymm10,64(%rdi) vmovdqu %ymm15,96(%rdi) leaq 128(%rdi),%rdi vpxor 0(%rsi),%ymm14,%ymm14 vpxor 32(%rsi),%ymm2,%ymm2 vpxor 64(%rsi),%ymm3,%ymm3 vpxor 96(%rsi),%ymm7,%ymm7 leaq 128(%rsi),%rsi vmovdqu %ymm14,0(%rdi) vmovdqu %ymm2,32(%rdi) vmovdqu %ymm3,64(%rdi) vmovdqu %ymm7,96(%rdi) leaq 128(%rdi),%rdi vpxor 0(%rsi),%ymm11,%ymm11 vpxor 32(%rsi),%ymm9,%ymm9 vpxor 64(%rsi),%ymm0,%ymm0 vpxor 96(%rsi),%ymm4,%ymm4 leaq 128(%rsi),%rsi vmovdqu %ymm11,0(%rdi) vmovdqu %ymm9,32(%rdi) vmovdqu %ymm0,64(%rdi) vmovdqu %ymm4,96(%rdi) leaq 128(%rdi),%rdi subq $512,%rdx jnz .Loop_outer8x jmp .Ldone8x .Ltail8x: cmpq $448,%rdx jae .L448_or_more8x cmpq $384,%rdx jae .L384_or_more8x cmpq $320,%rdx jae .L320_or_more8x cmpq $256,%rdx jae .L256_or_more8x cmpq $192,%rdx jae .L192_or_more8x cmpq $128,%rdx jae .L128_or_more8x cmpq $64,%rdx jae .L64_or_more8x xorq %r10,%r10 vmovdqa %ymm6,0(%rsp) vmovdqa %ymm8,32(%rsp) jmp .Loop_tail8x .align 32 .L64_or_more8x: vpxor 0(%rsi),%ymm6,%ymm6 vpxor 32(%rsi),%ymm8,%ymm8 vmovdqu %ymm6,0(%rdi) vmovdqu %ymm8,32(%rdi) je .Ldone8x leaq 64(%rsi),%rsi xorq %r10,%r10 vmovdqa %ymm1,0(%rsp) leaq 64(%rdi),%rdi subq $64,%rdx vmovdqa %ymm5,32(%rsp) jmp .Loop_tail8x .align 32 .L128_or_more8x: vpxor 0(%rsi),%ymm6,%ymm6 vpxor 32(%rsi),%ymm8,%ymm8 vpxor 64(%rsi),%ymm1,%ymm1 vpxor 96(%rsi),%ymm5,%ymm5 vmovdqu %ymm6,0(%rdi) vmovdqu %ymm8,32(%rdi) vmovdqu %ymm1,64(%rdi) vmovdqu %ymm5,96(%rdi) je .Ldone8x leaq 128(%rsi),%rsi xorq %r10,%r10 vmovdqa %ymm12,0(%rsp) leaq 128(%rdi),%rdi subq $128,%rdx vmovdqa %ymm13,32(%rsp) jmp .Loop_tail8x .align 32 .L192_or_more8x: vpxor 0(%rsi),%ymm6,%ymm6 vpxor 32(%rsi),%ymm8,%ymm8 vpxor 64(%rsi),%ymm1,%ymm1 vpxor 96(%rsi),%ymm5,%ymm5 vpxor 128(%rsi),%ymm12,%ymm12 vpxor 160(%rsi),%ymm13,%ymm13 vmovdqu %ymm6,0(%rdi) vmovdqu %ymm8,32(%rdi) vmovdqu %ymm1,64(%rdi) vmovdqu %ymm5,96(%rdi) vmovdqu %ymm12,128(%rdi) vmovdqu %ymm13,160(%rdi) je .Ldone8x leaq 192(%rsi),%rsi xorq %r10,%r10 vmovdqa %ymm10,0(%rsp) leaq 192(%rdi),%rdi subq $192,%rdx vmovdqa %ymm15,32(%rsp) jmp .Loop_tail8x .align 32 .L256_or_more8x: vpxor 0(%rsi),%ymm6,%ymm6 vpxor 32(%rsi),%ymm8,%ymm8 vpxor 64(%rsi),%ymm1,%ymm1 vpxor 96(%rsi),%ymm5,%ymm5 vpxor 128(%rsi),%ymm12,%ymm12 vpxor 160(%rsi),%ymm13,%ymm13 vpxor 192(%rsi),%ymm10,%ymm10 vpxor 224(%rsi),%ymm15,%ymm15 vmovdqu %ymm6,0(%rdi) vmovdqu %ymm8,32(%rdi) vmovdqu %ymm1,64(%rdi) vmovdqu %ymm5,96(%rdi) vmovdqu %ymm12,128(%rdi) vmovdqu %ymm13,160(%rdi) vmovdqu %ymm10,192(%rdi) vmovdqu 
%ymm15,224(%rdi) je .Ldone8x leaq 256(%rsi),%rsi xorq %r10,%r10 vmovdqa %ymm14,0(%rsp) leaq 256(%rdi),%rdi subq $256,%rdx vmovdqa %ymm2,32(%rsp) jmp .Loop_tail8x .align 32 .L320_or_more8x: vpxor 0(%rsi),%ymm6,%ymm6 vpxor 32(%rsi),%ymm8,%ymm8 vpxor 64(%rsi),%ymm1,%ymm1 vpxor 96(%rsi),%ymm5,%ymm5 vpxor 128(%rsi),%ymm12,%ymm12 vpxor 160(%rsi),%ymm13,%ymm13 vpxor 192(%rsi),%ymm10,%ymm10 vpxor 224(%rsi),%ymm15,%ymm15 vpxor 256(%rsi),%ymm14,%ymm14 vpxor 288(%rsi),%ymm2,%ymm2 vmovdqu %ymm6,0(%rdi) vmovdqu %ymm8,32(%rdi) vmovdqu %ymm1,64(%rdi) vmovdqu %ymm5,96(%rdi) vmovdqu %ymm12,128(%rdi) vmovdqu %ymm13,160(%rdi) vmovdqu %ymm10,192(%rdi) vmovdqu %ymm15,224(%rdi) vmovdqu %ymm14,256(%rdi) vmovdqu %ymm2,288(%rdi) je .Ldone8x leaq 320(%rsi),%rsi xorq %r10,%r10 vmovdqa %ymm3,0(%rsp) leaq 320(%rdi),%rdi subq $320,%rdx vmovdqa %ymm7,32(%rsp) jmp .Loop_tail8x .align 32 .L384_or_more8x: vpxor 0(%rsi),%ymm6,%ymm6 vpxor 32(%rsi),%ymm8,%ymm8 vpxor 64(%rsi),%ymm1,%ymm1 vpxor 96(%rsi),%ymm5,%ymm5 vpxor 128(%rsi),%ymm12,%ymm12 vpxor 160(%rsi),%ymm13,%ymm13 vpxor 192(%rsi),%ymm10,%ymm10 vpxor 224(%rsi),%ymm15,%ymm15 vpxor 256(%rsi),%ymm14,%ymm14 vpxor 288(%rsi),%ymm2,%ymm2 vpxor 320(%rsi),%ymm3,%ymm3 vpxor 352(%rsi),%ymm7,%ymm7 vmovdqu %ymm6,0(%rdi) vmovdqu %ymm8,32(%rdi) vmovdqu %ymm1,64(%rdi) vmovdqu %ymm5,96(%rdi) vmovdqu %ymm12,128(%rdi) vmovdqu %ymm13,160(%rdi) vmovdqu %ymm10,192(%rdi) vmovdqu %ymm15,224(%rdi) vmovdqu %ymm14,256(%rdi) vmovdqu %ymm2,288(%rdi) vmovdqu %ymm3,320(%rdi) vmovdqu %ymm7,352(%rdi) je .Ldone8x leaq 384(%rsi),%rsi xorq %r10,%r10 vmovdqa %ymm11,0(%rsp) leaq 384(%rdi),%rdi subq $384,%rdx vmovdqa %ymm9,32(%rsp) jmp .Loop_tail8x .align 32 .L448_or_more8x: vpxor 0(%rsi),%ymm6,%ymm6 vpxor 32(%rsi),%ymm8,%ymm8 vpxor 64(%rsi),%ymm1,%ymm1 vpxor 96(%rsi),%ymm5,%ymm5 vpxor 128(%rsi),%ymm12,%ymm12 vpxor 160(%rsi),%ymm13,%ymm13 vpxor 192(%rsi),%ymm10,%ymm10 vpxor 224(%rsi),%ymm15,%ymm15 vpxor 256(%rsi),%ymm14,%ymm14 vpxor 288(%rsi),%ymm2,%ymm2 vpxor 320(%rsi),%ymm3,%ymm3 vpxor 352(%rsi),%ymm7,%ymm7 vpxor 384(%rsi),%ymm11,%ymm11 vpxor 416(%rsi),%ymm9,%ymm9 vmovdqu %ymm6,0(%rdi) vmovdqu %ymm8,32(%rdi) vmovdqu %ymm1,64(%rdi) vmovdqu %ymm5,96(%rdi) vmovdqu %ymm12,128(%rdi) vmovdqu %ymm13,160(%rdi) vmovdqu %ymm10,192(%rdi) vmovdqu %ymm15,224(%rdi) vmovdqu %ymm14,256(%rdi) vmovdqu %ymm2,288(%rdi) vmovdqu %ymm3,320(%rdi) vmovdqu %ymm7,352(%rdi) vmovdqu %ymm11,384(%rdi) vmovdqu %ymm9,416(%rdi) je .Ldone8x leaq 448(%rsi),%rsi xorq %r10,%r10 vmovdqa %ymm0,0(%rsp) leaq 448(%rdi),%rdi subq $448,%rdx vmovdqa %ymm4,32(%rsp) .Loop_tail8x: movzbl (%rsi,%r10,1),%eax movzbl (%rsp,%r10,1),%ecx leaq 1(%r10),%r10 xorl %ecx,%eax movb %al,-1(%rdi,%r10,1) decq %rdx jnz .Loop_tail8x .Ldone8x: vzeroall leaq (%r9),%rsp .cfi_def_cfa_register rsp .L8x_epilogue: ret .cfi_endproc .size ChaCha20_ctr32_avx2,.-ChaCha20_ctr32_avx2 #endif ring-0.17.14/pregenerated/chacha-x86_64-macosx.S000064400000000000000000000731051046102023000171670ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
#include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) .text .section __DATA,__const .p2align 6 L$zero: .long 0,0,0,0 L$one: .long 1,0,0,0 L$inc: .long 0,1,2,3 L$four: .long 4,4,4,4 L$incy: .long 0,2,4,6,1,3,5,7 L$eight: .long 8,8,8,8,8,8,8,8 L$rot16: .byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd L$rot24: .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe L$sigma: .byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0 .p2align 6 L$zeroz: .long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 L$fourz: .long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 L$incz: .long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 L$sixteen: .long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .text .globl _ChaCha20_ctr32_nohw .private_extern _ChaCha20_ctr32_nohw .p2align 6 _ChaCha20_ctr32_nohw: _CET_ENDBR pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $64+24,%rsp L$ctr32_body: movdqu (%rcx),%xmm1 movdqu 16(%rcx),%xmm2 movdqu (%r8),%xmm3 movdqa L$one(%rip),%xmm4 movdqa %xmm1,16(%rsp) movdqa %xmm2,32(%rsp) movdqa %xmm3,48(%rsp) movq %rdx,%rbp jmp L$oop_outer .p2align 5 L$oop_outer: movl $0x61707865,%eax movl $0x3320646e,%ebx movl $0x79622d32,%ecx movl $0x6b206574,%edx movl 16(%rsp),%r8d movl 20(%rsp),%r9d movl 24(%rsp),%r10d movl 28(%rsp),%r11d movd %xmm3,%r12d movl 52(%rsp),%r13d movl 56(%rsp),%r14d movl 60(%rsp),%r15d movq %rbp,64+0(%rsp) movl $10,%ebp movq %rsi,64+8(%rsp) .byte 102,72,15,126,214 movq %rdi,64+16(%rsp) movq %rsi,%rdi shrq $32,%rdi jmp L$oop .p2align 5 L$oop: addl %r8d,%eax xorl %eax,%r12d roll $16,%r12d addl %r9d,%ebx xorl %ebx,%r13d roll $16,%r13d addl %r12d,%esi xorl %esi,%r8d roll $12,%r8d addl %r13d,%edi xorl %edi,%r9d roll $12,%r9d addl %r8d,%eax xorl %eax,%r12d roll $8,%r12d addl %r9d,%ebx xorl %ebx,%r13d roll $8,%r13d addl %r12d,%esi xorl %esi,%r8d roll $7,%r8d addl %r13d,%edi xorl %edi,%r9d roll $7,%r9d movl %esi,32(%rsp) movl %edi,36(%rsp) movl 40(%rsp),%esi movl 44(%rsp),%edi addl %r10d,%ecx xorl %ecx,%r14d roll $16,%r14d addl %r11d,%edx xorl %edx,%r15d roll $16,%r15d addl %r14d,%esi xorl %esi,%r10d roll $12,%r10d addl %r15d,%edi xorl %edi,%r11d roll $12,%r11d addl %r10d,%ecx xorl %ecx,%r14d roll $8,%r14d addl %r11d,%edx xorl %edx,%r15d roll $8,%r15d addl %r14d,%esi xorl %esi,%r10d roll $7,%r10d addl %r15d,%edi xorl %edi,%r11d roll $7,%r11d addl %r9d,%eax xorl %eax,%r15d roll $16,%r15d addl %r10d,%ebx xorl %ebx,%r12d roll $16,%r12d addl %r15d,%esi xorl %esi,%r9d roll $12,%r9d addl %r12d,%edi xorl %edi,%r10d roll $12,%r10d addl %r9d,%eax xorl %eax,%r15d roll $8,%r15d addl %r10d,%ebx xorl %ebx,%r12d roll $8,%r12d addl %r15d,%esi xorl %esi,%r9d roll $7,%r9d addl %r12d,%edi xorl %edi,%r10d roll $7,%r10d movl %esi,40(%rsp) movl %edi,44(%rsp) movl 32(%rsp),%esi movl 36(%rsp),%edi addl %r11d,%ecx xorl %ecx,%r13d roll $16,%r13d addl %r8d,%edx xorl %edx,%r14d roll $16,%r14d addl %r13d,%esi xorl %esi,%r11d roll $12,%r11d addl %r14d,%edi xorl %edi,%r8d roll $12,%r8d addl %r11d,%ecx xorl %ecx,%r13d roll $8,%r13d addl %r8d,%edx xorl %edx,%r14d roll $8,%r14d addl %r13d,%esi xorl %esi,%r11d roll $7,%r11d addl %r14d,%edi xorl %edi,%r8d roll $7,%r8d decl %ebp jnz L$oop movl %edi,36(%rsp) movl %esi,32(%rsp) movq 64(%rsp),%rbp movdqa %xmm2,%xmm1 movq 64+8(%rsp),%rsi paddd %xmm4,%xmm3 movq 64+16(%rsp),%rdi addl $0x61707865,%eax addl 
$0x3320646e,%ebx addl $0x79622d32,%ecx addl $0x6b206574,%edx addl 16(%rsp),%r8d addl 20(%rsp),%r9d addl 24(%rsp),%r10d addl 28(%rsp),%r11d addl 48(%rsp),%r12d addl 52(%rsp),%r13d addl 56(%rsp),%r14d addl 60(%rsp),%r15d paddd 32(%rsp),%xmm1 cmpq $64,%rbp jb L$tail xorl 0(%rsi),%eax xorl 4(%rsi),%ebx xorl 8(%rsi),%ecx xorl 12(%rsi),%edx xorl 16(%rsi),%r8d xorl 20(%rsi),%r9d xorl 24(%rsi),%r10d xorl 28(%rsi),%r11d movdqu 32(%rsi),%xmm0 xorl 48(%rsi),%r12d xorl 52(%rsi),%r13d xorl 56(%rsi),%r14d xorl 60(%rsi),%r15d leaq 64(%rsi),%rsi pxor %xmm1,%xmm0 movdqa %xmm2,32(%rsp) movd %xmm3,48(%rsp) movl %eax,0(%rdi) movl %ebx,4(%rdi) movl %ecx,8(%rdi) movl %edx,12(%rdi) movl %r8d,16(%rdi) movl %r9d,20(%rdi) movl %r10d,24(%rdi) movl %r11d,28(%rdi) movdqu %xmm0,32(%rdi) movl %r12d,48(%rdi) movl %r13d,52(%rdi) movl %r14d,56(%rdi) movl %r15d,60(%rdi) leaq 64(%rdi),%rdi subq $64,%rbp jnz L$oop_outer jmp L$done .p2align 4 L$tail: movl %eax,0(%rsp) movl %ebx,4(%rsp) xorq %rbx,%rbx movl %ecx,8(%rsp) movl %edx,12(%rsp) movl %r8d,16(%rsp) movl %r9d,20(%rsp) movl %r10d,24(%rsp) movl %r11d,28(%rsp) movdqa %xmm1,32(%rsp) movl %r12d,48(%rsp) movl %r13d,52(%rsp) movl %r14d,56(%rsp) movl %r15d,60(%rsp) L$oop_tail: movzbl (%rsi,%rbx,1),%eax movzbl (%rsp,%rbx,1),%edx leaq 1(%rbx),%rbx xorl %edx,%eax movb %al,-1(%rdi,%rbx,1) decq %rbp jnz L$oop_tail L$done: leaq 64+24+48(%rsp),%rsi movq -48(%rsi),%r15 movq -40(%rsi),%r14 movq -32(%rsi),%r13 movq -24(%rsi),%r12 movq -16(%rsi),%rbp movq -8(%rsi),%rbx leaq (%rsi),%rsp L$no_data: ret .globl _ChaCha20_ctr32_ssse3_4x .private_extern _ChaCha20_ctr32_ssse3_4x .p2align 5 _ChaCha20_ctr32_ssse3_4x: _CET_ENDBR movq %rsp,%r9 subq $0x140+8,%rsp movdqa L$sigma(%rip),%xmm11 movdqu (%rcx),%xmm15 movdqu 16(%rcx),%xmm7 movdqu (%r8),%xmm3 leaq 256(%rsp),%rcx leaq L$rot16(%rip),%r10 leaq L$rot24(%rip),%r11 pshufd $0x00,%xmm11,%xmm8 pshufd $0x55,%xmm11,%xmm9 movdqa %xmm8,64(%rsp) pshufd $0xaa,%xmm11,%xmm10 movdqa %xmm9,80(%rsp) pshufd $0xff,%xmm11,%xmm11 movdqa %xmm10,96(%rsp) movdqa %xmm11,112(%rsp) pshufd $0x00,%xmm15,%xmm12 pshufd $0x55,%xmm15,%xmm13 movdqa %xmm12,128-256(%rcx) pshufd $0xaa,%xmm15,%xmm14 movdqa %xmm13,144-256(%rcx) pshufd $0xff,%xmm15,%xmm15 movdqa %xmm14,160-256(%rcx) movdqa %xmm15,176-256(%rcx) pshufd $0x00,%xmm7,%xmm4 pshufd $0x55,%xmm7,%xmm5 movdqa %xmm4,192-256(%rcx) pshufd $0xaa,%xmm7,%xmm6 movdqa %xmm5,208-256(%rcx) pshufd $0xff,%xmm7,%xmm7 movdqa %xmm6,224-256(%rcx) movdqa %xmm7,240-256(%rcx) pshufd $0x00,%xmm3,%xmm0 pshufd $0x55,%xmm3,%xmm1 paddd L$inc(%rip),%xmm0 pshufd $0xaa,%xmm3,%xmm2 movdqa %xmm1,272-256(%rcx) pshufd $0xff,%xmm3,%xmm3 movdqa %xmm2,288-256(%rcx) movdqa %xmm3,304-256(%rcx) jmp L$oop_enter4x .p2align 5 L$oop_outer4x: movdqa 64(%rsp),%xmm8 movdqa 80(%rsp),%xmm9 movdqa 96(%rsp),%xmm10 movdqa 112(%rsp),%xmm11 movdqa 128-256(%rcx),%xmm12 movdqa 144-256(%rcx),%xmm13 movdqa 160-256(%rcx),%xmm14 movdqa 176-256(%rcx),%xmm15 movdqa 192-256(%rcx),%xmm4 movdqa 208-256(%rcx),%xmm5 movdqa 224-256(%rcx),%xmm6 movdqa 240-256(%rcx),%xmm7 movdqa 256-256(%rcx),%xmm0 movdqa 272-256(%rcx),%xmm1 movdqa 288-256(%rcx),%xmm2 movdqa 304-256(%rcx),%xmm3 paddd L$four(%rip),%xmm0 L$oop_enter4x: movdqa %xmm6,32(%rsp) movdqa %xmm7,48(%rsp) movdqa (%r10),%xmm7 movl $10,%eax movdqa %xmm0,256-256(%rcx) jmp L$oop4x .p2align 5 L$oop4x: paddd %xmm12,%xmm8 paddd %xmm13,%xmm9 pxor %xmm8,%xmm0 pxor %xmm9,%xmm1 .byte 102,15,56,0,199 .byte 102,15,56,0,207 paddd %xmm0,%xmm4 paddd %xmm1,%xmm5 pxor %xmm4,%xmm12 pxor %xmm5,%xmm13 movdqa %xmm12,%xmm6 pslld $12,%xmm12 psrld $20,%xmm6 
movdqa %xmm13,%xmm7 pslld $12,%xmm13 por %xmm6,%xmm12 psrld $20,%xmm7 movdqa (%r11),%xmm6 por %xmm7,%xmm13 paddd %xmm12,%xmm8 paddd %xmm13,%xmm9 pxor %xmm8,%xmm0 pxor %xmm9,%xmm1 .byte 102,15,56,0,198 .byte 102,15,56,0,206 paddd %xmm0,%xmm4 paddd %xmm1,%xmm5 pxor %xmm4,%xmm12 pxor %xmm5,%xmm13 movdqa %xmm12,%xmm7 pslld $7,%xmm12 psrld $25,%xmm7 movdqa %xmm13,%xmm6 pslld $7,%xmm13 por %xmm7,%xmm12 psrld $25,%xmm6 movdqa (%r10),%xmm7 por %xmm6,%xmm13 movdqa %xmm4,0(%rsp) movdqa %xmm5,16(%rsp) movdqa 32(%rsp),%xmm4 movdqa 48(%rsp),%xmm5 paddd %xmm14,%xmm10 paddd %xmm15,%xmm11 pxor %xmm10,%xmm2 pxor %xmm11,%xmm3 .byte 102,15,56,0,215 .byte 102,15,56,0,223 paddd %xmm2,%xmm4 paddd %xmm3,%xmm5 pxor %xmm4,%xmm14 pxor %xmm5,%xmm15 movdqa %xmm14,%xmm6 pslld $12,%xmm14 psrld $20,%xmm6 movdqa %xmm15,%xmm7 pslld $12,%xmm15 por %xmm6,%xmm14 psrld $20,%xmm7 movdqa (%r11),%xmm6 por %xmm7,%xmm15 paddd %xmm14,%xmm10 paddd %xmm15,%xmm11 pxor %xmm10,%xmm2 pxor %xmm11,%xmm3 .byte 102,15,56,0,214 .byte 102,15,56,0,222 paddd %xmm2,%xmm4 paddd %xmm3,%xmm5 pxor %xmm4,%xmm14 pxor %xmm5,%xmm15 movdqa %xmm14,%xmm7 pslld $7,%xmm14 psrld $25,%xmm7 movdqa %xmm15,%xmm6 pslld $7,%xmm15 por %xmm7,%xmm14 psrld $25,%xmm6 movdqa (%r10),%xmm7 por %xmm6,%xmm15 paddd %xmm13,%xmm8 paddd %xmm14,%xmm9 pxor %xmm8,%xmm3 pxor %xmm9,%xmm0 .byte 102,15,56,0,223 .byte 102,15,56,0,199 paddd %xmm3,%xmm4 paddd %xmm0,%xmm5 pxor %xmm4,%xmm13 pxor %xmm5,%xmm14 movdqa %xmm13,%xmm6 pslld $12,%xmm13 psrld $20,%xmm6 movdqa %xmm14,%xmm7 pslld $12,%xmm14 por %xmm6,%xmm13 psrld $20,%xmm7 movdqa (%r11),%xmm6 por %xmm7,%xmm14 paddd %xmm13,%xmm8 paddd %xmm14,%xmm9 pxor %xmm8,%xmm3 pxor %xmm9,%xmm0 .byte 102,15,56,0,222 .byte 102,15,56,0,198 paddd %xmm3,%xmm4 paddd %xmm0,%xmm5 pxor %xmm4,%xmm13 pxor %xmm5,%xmm14 movdqa %xmm13,%xmm7 pslld $7,%xmm13 psrld $25,%xmm7 movdqa %xmm14,%xmm6 pslld $7,%xmm14 por %xmm7,%xmm13 psrld $25,%xmm6 movdqa (%r10),%xmm7 por %xmm6,%xmm14 movdqa %xmm4,32(%rsp) movdqa %xmm5,48(%rsp) movdqa 0(%rsp),%xmm4 movdqa 16(%rsp),%xmm5 paddd %xmm15,%xmm10 paddd %xmm12,%xmm11 pxor %xmm10,%xmm1 pxor %xmm11,%xmm2 .byte 102,15,56,0,207 .byte 102,15,56,0,215 paddd %xmm1,%xmm4 paddd %xmm2,%xmm5 pxor %xmm4,%xmm15 pxor %xmm5,%xmm12 movdqa %xmm15,%xmm6 pslld $12,%xmm15 psrld $20,%xmm6 movdqa %xmm12,%xmm7 pslld $12,%xmm12 por %xmm6,%xmm15 psrld $20,%xmm7 movdqa (%r11),%xmm6 por %xmm7,%xmm12 paddd %xmm15,%xmm10 paddd %xmm12,%xmm11 pxor %xmm10,%xmm1 pxor %xmm11,%xmm2 .byte 102,15,56,0,206 .byte 102,15,56,0,214 paddd %xmm1,%xmm4 paddd %xmm2,%xmm5 pxor %xmm4,%xmm15 pxor %xmm5,%xmm12 movdqa %xmm15,%xmm7 pslld $7,%xmm15 psrld $25,%xmm7 movdqa %xmm12,%xmm6 pslld $7,%xmm12 por %xmm7,%xmm15 psrld $25,%xmm6 movdqa (%r10),%xmm7 por %xmm6,%xmm12 decl %eax jnz L$oop4x paddd 64(%rsp),%xmm8 paddd 80(%rsp),%xmm9 paddd 96(%rsp),%xmm10 paddd 112(%rsp),%xmm11 movdqa %xmm8,%xmm6 punpckldq %xmm9,%xmm8 movdqa %xmm10,%xmm7 punpckldq %xmm11,%xmm10 punpckhdq %xmm9,%xmm6 punpckhdq %xmm11,%xmm7 movdqa %xmm8,%xmm9 punpcklqdq %xmm10,%xmm8 movdqa %xmm6,%xmm11 punpcklqdq %xmm7,%xmm6 punpckhqdq %xmm10,%xmm9 punpckhqdq %xmm7,%xmm11 paddd 128-256(%rcx),%xmm12 paddd 144-256(%rcx),%xmm13 paddd 160-256(%rcx),%xmm14 paddd 176-256(%rcx),%xmm15 movdqa %xmm8,0(%rsp) movdqa %xmm9,16(%rsp) movdqa 32(%rsp),%xmm8 movdqa 48(%rsp),%xmm9 movdqa %xmm12,%xmm10 punpckldq %xmm13,%xmm12 movdqa %xmm14,%xmm7 punpckldq %xmm15,%xmm14 punpckhdq %xmm13,%xmm10 punpckhdq %xmm15,%xmm7 movdqa %xmm12,%xmm13 punpcklqdq %xmm14,%xmm12 movdqa %xmm10,%xmm15 punpcklqdq %xmm7,%xmm10 punpckhqdq %xmm14,%xmm13 
punpckhqdq %xmm7,%xmm15 paddd 192-256(%rcx),%xmm4 paddd 208-256(%rcx),%xmm5 paddd 224-256(%rcx),%xmm8 paddd 240-256(%rcx),%xmm9 movdqa %xmm6,32(%rsp) movdqa %xmm11,48(%rsp) movdqa %xmm4,%xmm14 punpckldq %xmm5,%xmm4 movdqa %xmm8,%xmm7 punpckldq %xmm9,%xmm8 punpckhdq %xmm5,%xmm14 punpckhdq %xmm9,%xmm7 movdqa %xmm4,%xmm5 punpcklqdq %xmm8,%xmm4 movdqa %xmm14,%xmm9 punpcklqdq %xmm7,%xmm14 punpckhqdq %xmm8,%xmm5 punpckhqdq %xmm7,%xmm9 paddd 256-256(%rcx),%xmm0 paddd 272-256(%rcx),%xmm1 paddd 288-256(%rcx),%xmm2 paddd 304-256(%rcx),%xmm3 movdqa %xmm0,%xmm8 punpckldq %xmm1,%xmm0 movdqa %xmm2,%xmm7 punpckldq %xmm3,%xmm2 punpckhdq %xmm1,%xmm8 punpckhdq %xmm3,%xmm7 movdqa %xmm0,%xmm1 punpcklqdq %xmm2,%xmm0 movdqa %xmm8,%xmm3 punpcklqdq %xmm7,%xmm8 punpckhqdq %xmm2,%xmm1 punpckhqdq %xmm7,%xmm3 cmpq $256,%rdx jb L$tail4x movdqu 0(%rsi),%xmm6 movdqu 16(%rsi),%xmm11 movdqu 32(%rsi),%xmm2 movdqu 48(%rsi),%xmm7 pxor 0(%rsp),%xmm6 pxor %xmm12,%xmm11 pxor %xmm4,%xmm2 pxor %xmm0,%xmm7 movdqu %xmm6,0(%rdi) movdqu 64(%rsi),%xmm6 movdqu %xmm11,16(%rdi) movdqu 80(%rsi),%xmm11 movdqu %xmm2,32(%rdi) movdqu 96(%rsi),%xmm2 movdqu %xmm7,48(%rdi) movdqu 112(%rsi),%xmm7 leaq 128(%rsi),%rsi pxor 16(%rsp),%xmm6 pxor %xmm13,%xmm11 pxor %xmm5,%xmm2 pxor %xmm1,%xmm7 movdqu %xmm6,64(%rdi) movdqu 0(%rsi),%xmm6 movdqu %xmm11,80(%rdi) movdqu 16(%rsi),%xmm11 movdqu %xmm2,96(%rdi) movdqu 32(%rsi),%xmm2 movdqu %xmm7,112(%rdi) leaq 128(%rdi),%rdi movdqu 48(%rsi),%xmm7 pxor 32(%rsp),%xmm6 pxor %xmm10,%xmm11 pxor %xmm14,%xmm2 pxor %xmm8,%xmm7 movdqu %xmm6,0(%rdi) movdqu 64(%rsi),%xmm6 movdqu %xmm11,16(%rdi) movdqu 80(%rsi),%xmm11 movdqu %xmm2,32(%rdi) movdqu 96(%rsi),%xmm2 movdqu %xmm7,48(%rdi) movdqu 112(%rsi),%xmm7 leaq 128(%rsi),%rsi pxor 48(%rsp),%xmm6 pxor %xmm15,%xmm11 pxor %xmm9,%xmm2 pxor %xmm3,%xmm7 movdqu %xmm6,64(%rdi) movdqu %xmm11,80(%rdi) movdqu %xmm2,96(%rdi) movdqu %xmm7,112(%rdi) leaq 128(%rdi),%rdi subq $256,%rdx jnz L$oop_outer4x jmp L$done4x L$tail4x: cmpq $192,%rdx jae L$192_or_more4x cmpq $128,%rdx jae L$128_or_more4x cmpq $64,%rdx jae L$64_or_more4x xorq %r10,%r10 movdqa %xmm12,16(%rsp) movdqa %xmm4,32(%rsp) movdqa %xmm0,48(%rsp) jmp L$oop_tail4x .p2align 5 L$64_or_more4x: movdqu 0(%rsi),%xmm6 movdqu 16(%rsi),%xmm11 movdqu 32(%rsi),%xmm2 movdqu 48(%rsi),%xmm7 pxor 0(%rsp),%xmm6 pxor %xmm12,%xmm11 pxor %xmm4,%xmm2 pxor %xmm0,%xmm7 movdqu %xmm6,0(%rdi) movdqu %xmm11,16(%rdi) movdqu %xmm2,32(%rdi) movdqu %xmm7,48(%rdi) je L$done4x movdqa 16(%rsp),%xmm6 leaq 64(%rsi),%rsi xorq %r10,%r10 movdqa %xmm6,0(%rsp) movdqa %xmm13,16(%rsp) leaq 64(%rdi),%rdi movdqa %xmm5,32(%rsp) subq $64,%rdx movdqa %xmm1,48(%rsp) jmp L$oop_tail4x .p2align 5 L$128_or_more4x: movdqu 0(%rsi),%xmm6 movdqu 16(%rsi),%xmm11 movdqu 32(%rsi),%xmm2 movdqu 48(%rsi),%xmm7 pxor 0(%rsp),%xmm6 pxor %xmm12,%xmm11 pxor %xmm4,%xmm2 pxor %xmm0,%xmm7 movdqu %xmm6,0(%rdi) movdqu 64(%rsi),%xmm6 movdqu %xmm11,16(%rdi) movdqu 80(%rsi),%xmm11 movdqu %xmm2,32(%rdi) movdqu 96(%rsi),%xmm2 movdqu %xmm7,48(%rdi) movdqu 112(%rsi),%xmm7 pxor 16(%rsp),%xmm6 pxor %xmm13,%xmm11 pxor %xmm5,%xmm2 pxor %xmm1,%xmm7 movdqu %xmm6,64(%rdi) movdqu %xmm11,80(%rdi) movdqu %xmm2,96(%rdi) movdqu %xmm7,112(%rdi) je L$done4x movdqa 32(%rsp),%xmm6 leaq 128(%rsi),%rsi xorq %r10,%r10 movdqa %xmm6,0(%rsp) movdqa %xmm10,16(%rsp) leaq 128(%rdi),%rdi movdqa %xmm14,32(%rsp) subq $128,%rdx movdqa %xmm8,48(%rsp) jmp L$oop_tail4x .p2align 5 L$192_or_more4x: movdqu 0(%rsi),%xmm6 movdqu 16(%rsi),%xmm11 movdqu 32(%rsi),%xmm2 movdqu 48(%rsi),%xmm7 pxor 0(%rsp),%xmm6 pxor %xmm12,%xmm11 pxor %xmm4,%xmm2 
pxor %xmm0,%xmm7 movdqu %xmm6,0(%rdi) movdqu 64(%rsi),%xmm6 movdqu %xmm11,16(%rdi) movdqu 80(%rsi),%xmm11 movdqu %xmm2,32(%rdi) movdqu 96(%rsi),%xmm2 movdqu %xmm7,48(%rdi) movdqu 112(%rsi),%xmm7 leaq 128(%rsi),%rsi pxor 16(%rsp),%xmm6 pxor %xmm13,%xmm11 pxor %xmm5,%xmm2 pxor %xmm1,%xmm7 movdqu %xmm6,64(%rdi) movdqu 0(%rsi),%xmm6 movdqu %xmm11,80(%rdi) movdqu 16(%rsi),%xmm11 movdqu %xmm2,96(%rdi) movdqu 32(%rsi),%xmm2 movdqu %xmm7,112(%rdi) leaq 128(%rdi),%rdi movdqu 48(%rsi),%xmm7 pxor 32(%rsp),%xmm6 pxor %xmm10,%xmm11 pxor %xmm14,%xmm2 pxor %xmm8,%xmm7 movdqu %xmm6,0(%rdi) movdqu %xmm11,16(%rdi) movdqu %xmm2,32(%rdi) movdqu %xmm7,48(%rdi) je L$done4x movdqa 48(%rsp),%xmm6 leaq 64(%rsi),%rsi xorq %r10,%r10 movdqa %xmm6,0(%rsp) movdqa %xmm15,16(%rsp) leaq 64(%rdi),%rdi movdqa %xmm9,32(%rsp) subq $192,%rdx movdqa %xmm3,48(%rsp) L$oop_tail4x: movzbl (%rsi,%r10,1),%eax movzbl (%rsp,%r10,1),%ecx leaq 1(%r10),%r10 xorl %ecx,%eax movb %al,-1(%rdi,%r10,1) decq %rdx jnz L$oop_tail4x L$done4x: leaq (%r9),%rsp L$4x_epilogue: ret .globl _ChaCha20_ctr32_avx2 .private_extern _ChaCha20_ctr32_avx2 .p2align 5 _ChaCha20_ctr32_avx2: _CET_ENDBR movq %rsp,%r9 subq $0x280+8,%rsp andq $-32,%rsp vzeroupper vbroadcasti128 L$sigma(%rip),%ymm11 vbroadcasti128 (%rcx),%ymm3 vbroadcasti128 16(%rcx),%ymm15 vbroadcasti128 (%r8),%ymm7 leaq 256(%rsp),%rcx leaq 512(%rsp),%rax leaq L$rot16(%rip),%r10 leaq L$rot24(%rip),%r11 vpshufd $0x00,%ymm11,%ymm8 vpshufd $0x55,%ymm11,%ymm9 vmovdqa %ymm8,128-256(%rcx) vpshufd $0xaa,%ymm11,%ymm10 vmovdqa %ymm9,160-256(%rcx) vpshufd $0xff,%ymm11,%ymm11 vmovdqa %ymm10,192-256(%rcx) vmovdqa %ymm11,224-256(%rcx) vpshufd $0x00,%ymm3,%ymm0 vpshufd $0x55,%ymm3,%ymm1 vmovdqa %ymm0,256-256(%rcx) vpshufd $0xaa,%ymm3,%ymm2 vmovdqa %ymm1,288-256(%rcx) vpshufd $0xff,%ymm3,%ymm3 vmovdqa %ymm2,320-256(%rcx) vmovdqa %ymm3,352-256(%rcx) vpshufd $0x00,%ymm15,%ymm12 vpshufd $0x55,%ymm15,%ymm13 vmovdqa %ymm12,384-512(%rax) vpshufd $0xaa,%ymm15,%ymm14 vmovdqa %ymm13,416-512(%rax) vpshufd $0xff,%ymm15,%ymm15 vmovdqa %ymm14,448-512(%rax) vmovdqa %ymm15,480-512(%rax) vpshufd $0x00,%ymm7,%ymm4 vpshufd $0x55,%ymm7,%ymm5 vpaddd L$incy(%rip),%ymm4,%ymm4 vpshufd $0xaa,%ymm7,%ymm6 vmovdqa %ymm5,544-512(%rax) vpshufd $0xff,%ymm7,%ymm7 vmovdqa %ymm6,576-512(%rax) vmovdqa %ymm7,608-512(%rax) jmp L$oop_enter8x .p2align 5 L$oop_outer8x: vmovdqa 128-256(%rcx),%ymm8 vmovdqa 160-256(%rcx),%ymm9 vmovdqa 192-256(%rcx),%ymm10 vmovdqa 224-256(%rcx),%ymm11 vmovdqa 256-256(%rcx),%ymm0 vmovdqa 288-256(%rcx),%ymm1 vmovdqa 320-256(%rcx),%ymm2 vmovdqa 352-256(%rcx),%ymm3 vmovdqa 384-512(%rax),%ymm12 vmovdqa 416-512(%rax),%ymm13 vmovdqa 448-512(%rax),%ymm14 vmovdqa 480-512(%rax),%ymm15 vmovdqa 512-512(%rax),%ymm4 vmovdqa 544-512(%rax),%ymm5 vmovdqa 576-512(%rax),%ymm6 vmovdqa 608-512(%rax),%ymm7 vpaddd L$eight(%rip),%ymm4,%ymm4 L$oop_enter8x: vmovdqa %ymm14,64(%rsp) vmovdqa %ymm15,96(%rsp) vbroadcasti128 (%r10),%ymm15 vmovdqa %ymm4,512-512(%rax) movl $10,%eax jmp L$oop8x .p2align 5 L$oop8x: vpaddd %ymm0,%ymm8,%ymm8 vpxor %ymm4,%ymm8,%ymm4 vpshufb %ymm15,%ymm4,%ymm4 vpaddd %ymm1,%ymm9,%ymm9 vpxor %ymm5,%ymm9,%ymm5 vpshufb %ymm15,%ymm5,%ymm5 vpaddd %ymm4,%ymm12,%ymm12 vpxor %ymm0,%ymm12,%ymm0 vpslld $12,%ymm0,%ymm14 vpsrld $20,%ymm0,%ymm0 vpor %ymm0,%ymm14,%ymm0 vbroadcasti128 (%r11),%ymm14 vpaddd %ymm5,%ymm13,%ymm13 vpxor %ymm1,%ymm13,%ymm1 vpslld $12,%ymm1,%ymm15 vpsrld $20,%ymm1,%ymm1 vpor %ymm1,%ymm15,%ymm1 vpaddd %ymm0,%ymm8,%ymm8 vpxor %ymm4,%ymm8,%ymm4 vpshufb %ymm14,%ymm4,%ymm4 vpaddd %ymm1,%ymm9,%ymm9 vpxor %ymm5,%ymm9,%ymm5 
vpshufb %ymm14,%ymm5,%ymm5 vpaddd %ymm4,%ymm12,%ymm12 vpxor %ymm0,%ymm12,%ymm0 vpslld $7,%ymm0,%ymm15 vpsrld $25,%ymm0,%ymm0 vpor %ymm0,%ymm15,%ymm0 vbroadcasti128 (%r10),%ymm15 vpaddd %ymm5,%ymm13,%ymm13 vpxor %ymm1,%ymm13,%ymm1 vpslld $7,%ymm1,%ymm14 vpsrld $25,%ymm1,%ymm1 vpor %ymm1,%ymm14,%ymm1 vmovdqa %ymm12,0(%rsp) vmovdqa %ymm13,32(%rsp) vmovdqa 64(%rsp),%ymm12 vmovdqa 96(%rsp),%ymm13 vpaddd %ymm2,%ymm10,%ymm10 vpxor %ymm6,%ymm10,%ymm6 vpshufb %ymm15,%ymm6,%ymm6 vpaddd %ymm3,%ymm11,%ymm11 vpxor %ymm7,%ymm11,%ymm7 vpshufb %ymm15,%ymm7,%ymm7 vpaddd %ymm6,%ymm12,%ymm12 vpxor %ymm2,%ymm12,%ymm2 vpslld $12,%ymm2,%ymm14 vpsrld $20,%ymm2,%ymm2 vpor %ymm2,%ymm14,%ymm2 vbroadcasti128 (%r11),%ymm14 vpaddd %ymm7,%ymm13,%ymm13 vpxor %ymm3,%ymm13,%ymm3 vpslld $12,%ymm3,%ymm15 vpsrld $20,%ymm3,%ymm3 vpor %ymm3,%ymm15,%ymm3 vpaddd %ymm2,%ymm10,%ymm10 vpxor %ymm6,%ymm10,%ymm6 vpshufb %ymm14,%ymm6,%ymm6 vpaddd %ymm3,%ymm11,%ymm11 vpxor %ymm7,%ymm11,%ymm7 vpshufb %ymm14,%ymm7,%ymm7 vpaddd %ymm6,%ymm12,%ymm12 vpxor %ymm2,%ymm12,%ymm2 vpslld $7,%ymm2,%ymm15 vpsrld $25,%ymm2,%ymm2 vpor %ymm2,%ymm15,%ymm2 vbroadcasti128 (%r10),%ymm15 vpaddd %ymm7,%ymm13,%ymm13 vpxor %ymm3,%ymm13,%ymm3 vpslld $7,%ymm3,%ymm14 vpsrld $25,%ymm3,%ymm3 vpor %ymm3,%ymm14,%ymm3 vpaddd %ymm1,%ymm8,%ymm8 vpxor %ymm7,%ymm8,%ymm7 vpshufb %ymm15,%ymm7,%ymm7 vpaddd %ymm2,%ymm9,%ymm9 vpxor %ymm4,%ymm9,%ymm4 vpshufb %ymm15,%ymm4,%ymm4 vpaddd %ymm7,%ymm12,%ymm12 vpxor %ymm1,%ymm12,%ymm1 vpslld $12,%ymm1,%ymm14 vpsrld $20,%ymm1,%ymm1 vpor %ymm1,%ymm14,%ymm1 vbroadcasti128 (%r11),%ymm14 vpaddd %ymm4,%ymm13,%ymm13 vpxor %ymm2,%ymm13,%ymm2 vpslld $12,%ymm2,%ymm15 vpsrld $20,%ymm2,%ymm2 vpor %ymm2,%ymm15,%ymm2 vpaddd %ymm1,%ymm8,%ymm8 vpxor %ymm7,%ymm8,%ymm7 vpshufb %ymm14,%ymm7,%ymm7 vpaddd %ymm2,%ymm9,%ymm9 vpxor %ymm4,%ymm9,%ymm4 vpshufb %ymm14,%ymm4,%ymm4 vpaddd %ymm7,%ymm12,%ymm12 vpxor %ymm1,%ymm12,%ymm1 vpslld $7,%ymm1,%ymm15 vpsrld $25,%ymm1,%ymm1 vpor %ymm1,%ymm15,%ymm1 vbroadcasti128 (%r10),%ymm15 vpaddd %ymm4,%ymm13,%ymm13 vpxor %ymm2,%ymm13,%ymm2 vpslld $7,%ymm2,%ymm14 vpsrld $25,%ymm2,%ymm2 vpor %ymm2,%ymm14,%ymm2 vmovdqa %ymm12,64(%rsp) vmovdqa %ymm13,96(%rsp) vmovdqa 0(%rsp),%ymm12 vmovdqa 32(%rsp),%ymm13 vpaddd %ymm3,%ymm10,%ymm10 vpxor %ymm5,%ymm10,%ymm5 vpshufb %ymm15,%ymm5,%ymm5 vpaddd %ymm0,%ymm11,%ymm11 vpxor %ymm6,%ymm11,%ymm6 vpshufb %ymm15,%ymm6,%ymm6 vpaddd %ymm5,%ymm12,%ymm12 vpxor %ymm3,%ymm12,%ymm3 vpslld $12,%ymm3,%ymm14 vpsrld $20,%ymm3,%ymm3 vpor %ymm3,%ymm14,%ymm3 vbroadcasti128 (%r11),%ymm14 vpaddd %ymm6,%ymm13,%ymm13 vpxor %ymm0,%ymm13,%ymm0 vpslld $12,%ymm0,%ymm15 vpsrld $20,%ymm0,%ymm0 vpor %ymm0,%ymm15,%ymm0 vpaddd %ymm3,%ymm10,%ymm10 vpxor %ymm5,%ymm10,%ymm5 vpshufb %ymm14,%ymm5,%ymm5 vpaddd %ymm0,%ymm11,%ymm11 vpxor %ymm6,%ymm11,%ymm6 vpshufb %ymm14,%ymm6,%ymm6 vpaddd %ymm5,%ymm12,%ymm12 vpxor %ymm3,%ymm12,%ymm3 vpslld $7,%ymm3,%ymm15 vpsrld $25,%ymm3,%ymm3 vpor %ymm3,%ymm15,%ymm3 vbroadcasti128 (%r10),%ymm15 vpaddd %ymm6,%ymm13,%ymm13 vpxor %ymm0,%ymm13,%ymm0 vpslld $7,%ymm0,%ymm14 vpsrld $25,%ymm0,%ymm0 vpor %ymm0,%ymm14,%ymm0 decl %eax jnz L$oop8x leaq 512(%rsp),%rax vpaddd 128-256(%rcx),%ymm8,%ymm8 vpaddd 160-256(%rcx),%ymm9,%ymm9 vpaddd 192-256(%rcx),%ymm10,%ymm10 vpaddd 224-256(%rcx),%ymm11,%ymm11 vpunpckldq %ymm9,%ymm8,%ymm14 vpunpckldq %ymm11,%ymm10,%ymm15 vpunpckhdq %ymm9,%ymm8,%ymm8 vpunpckhdq %ymm11,%ymm10,%ymm10 vpunpcklqdq %ymm15,%ymm14,%ymm9 vpunpckhqdq %ymm15,%ymm14,%ymm14 vpunpcklqdq %ymm10,%ymm8,%ymm11 vpunpckhqdq %ymm10,%ymm8,%ymm8 vpaddd 256-256(%rcx),%ymm0,%ymm0 vpaddd 
288-256(%rcx),%ymm1,%ymm1 vpaddd 320-256(%rcx),%ymm2,%ymm2 vpaddd 352-256(%rcx),%ymm3,%ymm3 vpunpckldq %ymm1,%ymm0,%ymm10 vpunpckldq %ymm3,%ymm2,%ymm15 vpunpckhdq %ymm1,%ymm0,%ymm0 vpunpckhdq %ymm3,%ymm2,%ymm2 vpunpcklqdq %ymm15,%ymm10,%ymm1 vpunpckhqdq %ymm15,%ymm10,%ymm10 vpunpcklqdq %ymm2,%ymm0,%ymm3 vpunpckhqdq %ymm2,%ymm0,%ymm0 vperm2i128 $0x20,%ymm1,%ymm9,%ymm15 vperm2i128 $0x31,%ymm1,%ymm9,%ymm1 vperm2i128 $0x20,%ymm10,%ymm14,%ymm9 vperm2i128 $0x31,%ymm10,%ymm14,%ymm10 vperm2i128 $0x20,%ymm3,%ymm11,%ymm14 vperm2i128 $0x31,%ymm3,%ymm11,%ymm3 vperm2i128 $0x20,%ymm0,%ymm8,%ymm11 vperm2i128 $0x31,%ymm0,%ymm8,%ymm0 vmovdqa %ymm15,0(%rsp) vmovdqa %ymm9,32(%rsp) vmovdqa 64(%rsp),%ymm15 vmovdqa 96(%rsp),%ymm9 vpaddd 384-512(%rax),%ymm12,%ymm12 vpaddd 416-512(%rax),%ymm13,%ymm13 vpaddd 448-512(%rax),%ymm15,%ymm15 vpaddd 480-512(%rax),%ymm9,%ymm9 vpunpckldq %ymm13,%ymm12,%ymm2 vpunpckldq %ymm9,%ymm15,%ymm8 vpunpckhdq %ymm13,%ymm12,%ymm12 vpunpckhdq %ymm9,%ymm15,%ymm15 vpunpcklqdq %ymm8,%ymm2,%ymm13 vpunpckhqdq %ymm8,%ymm2,%ymm2 vpunpcklqdq %ymm15,%ymm12,%ymm9 vpunpckhqdq %ymm15,%ymm12,%ymm12 vpaddd 512-512(%rax),%ymm4,%ymm4 vpaddd 544-512(%rax),%ymm5,%ymm5 vpaddd 576-512(%rax),%ymm6,%ymm6 vpaddd 608-512(%rax),%ymm7,%ymm7 vpunpckldq %ymm5,%ymm4,%ymm15 vpunpckldq %ymm7,%ymm6,%ymm8 vpunpckhdq %ymm5,%ymm4,%ymm4 vpunpckhdq %ymm7,%ymm6,%ymm6 vpunpcklqdq %ymm8,%ymm15,%ymm5 vpunpckhqdq %ymm8,%ymm15,%ymm15 vpunpcklqdq %ymm6,%ymm4,%ymm7 vpunpckhqdq %ymm6,%ymm4,%ymm4 vperm2i128 $0x20,%ymm5,%ymm13,%ymm8 vperm2i128 $0x31,%ymm5,%ymm13,%ymm5 vperm2i128 $0x20,%ymm15,%ymm2,%ymm13 vperm2i128 $0x31,%ymm15,%ymm2,%ymm15 vperm2i128 $0x20,%ymm7,%ymm9,%ymm2 vperm2i128 $0x31,%ymm7,%ymm9,%ymm7 vperm2i128 $0x20,%ymm4,%ymm12,%ymm9 vperm2i128 $0x31,%ymm4,%ymm12,%ymm4 vmovdqa 0(%rsp),%ymm6 vmovdqa 32(%rsp),%ymm12 cmpq $512,%rdx jb L$tail8x vpxor 0(%rsi),%ymm6,%ymm6 vpxor 32(%rsi),%ymm8,%ymm8 vpxor 64(%rsi),%ymm1,%ymm1 vpxor 96(%rsi),%ymm5,%ymm5 leaq 128(%rsi),%rsi vmovdqu %ymm6,0(%rdi) vmovdqu %ymm8,32(%rdi) vmovdqu %ymm1,64(%rdi) vmovdqu %ymm5,96(%rdi) leaq 128(%rdi),%rdi vpxor 0(%rsi),%ymm12,%ymm12 vpxor 32(%rsi),%ymm13,%ymm13 vpxor 64(%rsi),%ymm10,%ymm10 vpxor 96(%rsi),%ymm15,%ymm15 leaq 128(%rsi),%rsi vmovdqu %ymm12,0(%rdi) vmovdqu %ymm13,32(%rdi) vmovdqu %ymm10,64(%rdi) vmovdqu %ymm15,96(%rdi) leaq 128(%rdi),%rdi vpxor 0(%rsi),%ymm14,%ymm14 vpxor 32(%rsi),%ymm2,%ymm2 vpxor 64(%rsi),%ymm3,%ymm3 vpxor 96(%rsi),%ymm7,%ymm7 leaq 128(%rsi),%rsi vmovdqu %ymm14,0(%rdi) vmovdqu %ymm2,32(%rdi) vmovdqu %ymm3,64(%rdi) vmovdqu %ymm7,96(%rdi) leaq 128(%rdi),%rdi vpxor 0(%rsi),%ymm11,%ymm11 vpxor 32(%rsi),%ymm9,%ymm9 vpxor 64(%rsi),%ymm0,%ymm0 vpxor 96(%rsi),%ymm4,%ymm4 leaq 128(%rsi),%rsi vmovdqu %ymm11,0(%rdi) vmovdqu %ymm9,32(%rdi) vmovdqu %ymm0,64(%rdi) vmovdqu %ymm4,96(%rdi) leaq 128(%rdi),%rdi subq $512,%rdx jnz L$oop_outer8x jmp L$done8x L$tail8x: cmpq $448,%rdx jae L$448_or_more8x cmpq $384,%rdx jae L$384_or_more8x cmpq $320,%rdx jae L$320_or_more8x cmpq $256,%rdx jae L$256_or_more8x cmpq $192,%rdx jae L$192_or_more8x cmpq $128,%rdx jae L$128_or_more8x cmpq $64,%rdx jae L$64_or_more8x xorq %r10,%r10 vmovdqa %ymm6,0(%rsp) vmovdqa %ymm8,32(%rsp) jmp L$oop_tail8x .p2align 5 L$64_or_more8x: vpxor 0(%rsi),%ymm6,%ymm6 vpxor 32(%rsi),%ymm8,%ymm8 vmovdqu %ymm6,0(%rdi) vmovdqu %ymm8,32(%rdi) je L$done8x leaq 64(%rsi),%rsi xorq %r10,%r10 vmovdqa %ymm1,0(%rsp) leaq 64(%rdi),%rdi subq $64,%rdx vmovdqa %ymm5,32(%rsp) jmp L$oop_tail8x .p2align 5 L$128_or_more8x: vpxor 0(%rsi),%ymm6,%ymm6 vpxor 32(%rsi),%ymm8,%ymm8 vpxor 
64(%rsi),%ymm1,%ymm1 vpxor 96(%rsi),%ymm5,%ymm5 vmovdqu %ymm6,0(%rdi) vmovdqu %ymm8,32(%rdi) vmovdqu %ymm1,64(%rdi) vmovdqu %ymm5,96(%rdi) je L$done8x leaq 128(%rsi),%rsi xorq %r10,%r10 vmovdqa %ymm12,0(%rsp) leaq 128(%rdi),%rdi subq $128,%rdx vmovdqa %ymm13,32(%rsp) jmp L$oop_tail8x .p2align 5 L$192_or_more8x: vpxor 0(%rsi),%ymm6,%ymm6 vpxor 32(%rsi),%ymm8,%ymm8 vpxor 64(%rsi),%ymm1,%ymm1 vpxor 96(%rsi),%ymm5,%ymm5 vpxor 128(%rsi),%ymm12,%ymm12 vpxor 160(%rsi),%ymm13,%ymm13 vmovdqu %ymm6,0(%rdi) vmovdqu %ymm8,32(%rdi) vmovdqu %ymm1,64(%rdi) vmovdqu %ymm5,96(%rdi) vmovdqu %ymm12,128(%rdi) vmovdqu %ymm13,160(%rdi) je L$done8x leaq 192(%rsi),%rsi xorq %r10,%r10 vmovdqa %ymm10,0(%rsp) leaq 192(%rdi),%rdi subq $192,%rdx vmovdqa %ymm15,32(%rsp) jmp L$oop_tail8x .p2align 5 L$256_or_more8x: vpxor 0(%rsi),%ymm6,%ymm6 vpxor 32(%rsi),%ymm8,%ymm8 vpxor 64(%rsi),%ymm1,%ymm1 vpxor 96(%rsi),%ymm5,%ymm5 vpxor 128(%rsi),%ymm12,%ymm12 vpxor 160(%rsi),%ymm13,%ymm13 vpxor 192(%rsi),%ymm10,%ymm10 vpxor 224(%rsi),%ymm15,%ymm15 vmovdqu %ymm6,0(%rdi) vmovdqu %ymm8,32(%rdi) vmovdqu %ymm1,64(%rdi) vmovdqu %ymm5,96(%rdi) vmovdqu %ymm12,128(%rdi) vmovdqu %ymm13,160(%rdi) vmovdqu %ymm10,192(%rdi) vmovdqu %ymm15,224(%rdi) je L$done8x leaq 256(%rsi),%rsi xorq %r10,%r10 vmovdqa %ymm14,0(%rsp) leaq 256(%rdi),%rdi subq $256,%rdx vmovdqa %ymm2,32(%rsp) jmp L$oop_tail8x .p2align 5 L$320_or_more8x: vpxor 0(%rsi),%ymm6,%ymm6 vpxor 32(%rsi),%ymm8,%ymm8 vpxor 64(%rsi),%ymm1,%ymm1 vpxor 96(%rsi),%ymm5,%ymm5 vpxor 128(%rsi),%ymm12,%ymm12 vpxor 160(%rsi),%ymm13,%ymm13 vpxor 192(%rsi),%ymm10,%ymm10 vpxor 224(%rsi),%ymm15,%ymm15 vpxor 256(%rsi),%ymm14,%ymm14 vpxor 288(%rsi),%ymm2,%ymm2 vmovdqu %ymm6,0(%rdi) vmovdqu %ymm8,32(%rdi) vmovdqu %ymm1,64(%rdi) vmovdqu %ymm5,96(%rdi) vmovdqu %ymm12,128(%rdi) vmovdqu %ymm13,160(%rdi) vmovdqu %ymm10,192(%rdi) vmovdqu %ymm15,224(%rdi) vmovdqu %ymm14,256(%rdi) vmovdqu %ymm2,288(%rdi) je L$done8x leaq 320(%rsi),%rsi xorq %r10,%r10 vmovdqa %ymm3,0(%rsp) leaq 320(%rdi),%rdi subq $320,%rdx vmovdqa %ymm7,32(%rsp) jmp L$oop_tail8x .p2align 5 L$384_or_more8x: vpxor 0(%rsi),%ymm6,%ymm6 vpxor 32(%rsi),%ymm8,%ymm8 vpxor 64(%rsi),%ymm1,%ymm1 vpxor 96(%rsi),%ymm5,%ymm5 vpxor 128(%rsi),%ymm12,%ymm12 vpxor 160(%rsi),%ymm13,%ymm13 vpxor 192(%rsi),%ymm10,%ymm10 vpxor 224(%rsi),%ymm15,%ymm15 vpxor 256(%rsi),%ymm14,%ymm14 vpxor 288(%rsi),%ymm2,%ymm2 vpxor 320(%rsi),%ymm3,%ymm3 vpxor 352(%rsi),%ymm7,%ymm7 vmovdqu %ymm6,0(%rdi) vmovdqu %ymm8,32(%rdi) vmovdqu %ymm1,64(%rdi) vmovdqu %ymm5,96(%rdi) vmovdqu %ymm12,128(%rdi) vmovdqu %ymm13,160(%rdi) vmovdqu %ymm10,192(%rdi) vmovdqu %ymm15,224(%rdi) vmovdqu %ymm14,256(%rdi) vmovdqu %ymm2,288(%rdi) vmovdqu %ymm3,320(%rdi) vmovdqu %ymm7,352(%rdi) je L$done8x leaq 384(%rsi),%rsi xorq %r10,%r10 vmovdqa %ymm11,0(%rsp) leaq 384(%rdi),%rdi subq $384,%rdx vmovdqa %ymm9,32(%rsp) jmp L$oop_tail8x .p2align 5 L$448_or_more8x: vpxor 0(%rsi),%ymm6,%ymm6 vpxor 32(%rsi),%ymm8,%ymm8 vpxor 64(%rsi),%ymm1,%ymm1 vpxor 96(%rsi),%ymm5,%ymm5 vpxor 128(%rsi),%ymm12,%ymm12 vpxor 160(%rsi),%ymm13,%ymm13 vpxor 192(%rsi),%ymm10,%ymm10 vpxor 224(%rsi),%ymm15,%ymm15 vpxor 256(%rsi),%ymm14,%ymm14 vpxor 288(%rsi),%ymm2,%ymm2 vpxor 320(%rsi),%ymm3,%ymm3 vpxor 352(%rsi),%ymm7,%ymm7 vpxor 384(%rsi),%ymm11,%ymm11 vpxor 416(%rsi),%ymm9,%ymm9 vmovdqu %ymm6,0(%rdi) vmovdqu %ymm8,32(%rdi) vmovdqu %ymm1,64(%rdi) vmovdqu %ymm5,96(%rdi) vmovdqu %ymm12,128(%rdi) vmovdqu %ymm13,160(%rdi) vmovdqu %ymm10,192(%rdi) vmovdqu %ymm15,224(%rdi) vmovdqu %ymm14,256(%rdi) vmovdqu %ymm2,288(%rdi) vmovdqu %ymm3,320(%rdi) 
vmovdqu %ymm7,352(%rdi) vmovdqu %ymm11,384(%rdi) vmovdqu %ymm9,416(%rdi) je L$done8x leaq 448(%rsi),%rsi xorq %r10,%r10 vmovdqa %ymm0,0(%rsp) leaq 448(%rdi),%rdi subq $448,%rdx vmovdqa %ymm4,32(%rsp) L$oop_tail8x: movzbl (%rsi,%r10,1),%eax movzbl (%rsp,%r10,1),%ecx leaq 1(%r10),%r10 xorl %ecx,%eax movb %al,-1(%rdi,%r10,1) decq %rdx jnz L$oop_tail8x L$done8x: vzeroall leaq (%r9),%rsp L$8x_epilogue: ret #endif ring-0.17.14/pregenerated/chacha-x86_64-nasm.asm000064400000000000000000001106121046102023000172040ustar 00000000000000; This file is generated from a similarly-named Perl script in the BoringSSL ; source tree. Do not edit by hand. %ifidn __OUTPUT_FORMAT__, win64 default rel %define XMMWORD %define YMMWORD %define ZMMWORD %define _CET_ENDBR %include "ring_core_generated/prefix_symbols_nasm.inc" section .text code align=64 section .rdata rdata align=8 ALIGN 64 $L$zero: DD 0,0,0,0 $L$one: DD 1,0,0,0 $L$inc: DD 0,1,2,3 $L$four: DD 4,4,4,4 $L$incy: DD 0,2,4,6,1,3,5,7 $L$eight: DD 8,8,8,8,8,8,8,8 $L$rot16: DB 0x2,0x3,0x0,0x1,0x6,0x7,0x4,0x5,0xa,0xb,0x8,0x9,0xe,0xf,0xc,0xd $L$rot24: DB 0x3,0x0,0x1,0x2,0x7,0x4,0x5,0x6,0xb,0x8,0x9,0xa,0xf,0xc,0xd,0xe $L$sigma: DB 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107 DB 0 ALIGN 64 $L$zeroz: DD 0,0,0,0,1,0,0,0,2,0,0,0,3,0,0,0 $L$fourz: DD 4,0,0,0,4,0,0,0,4,0,0,0,4,0,0,0 $L$incz: DD 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 $L$sixteen: DD 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 DB 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54 DB 95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32 DB 98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115 DB 108,46,111,114,103,62,0 section .text global ChaCha20_ctr32_nohw ALIGN 64 ChaCha20_ctr32_nohw: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_ChaCha20_ctr32_nohw: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD[40+rsp] _CET_ENDBR push rbx push rbp push r12 push r13 push r14 push r15 sub rsp,64+24 $L$ctr32_body: movdqu xmm1,XMMWORD[rcx] movdqu xmm2,XMMWORD[16+rcx] movdqu xmm3,XMMWORD[r8] movdqa xmm4,XMMWORD[$L$one] movdqa XMMWORD[16+rsp],xmm1 movdqa XMMWORD[32+rsp],xmm2 movdqa XMMWORD[48+rsp],xmm3 mov rbp,rdx jmp NEAR $L$oop_outer ALIGN 32 $L$oop_outer: mov eax,0x61707865 mov ebx,0x3320646e mov ecx,0x79622d32 mov edx,0x6b206574 mov r8d,DWORD[16+rsp] mov r9d,DWORD[20+rsp] mov r10d,DWORD[24+rsp] mov r11d,DWORD[28+rsp] movd r12d,xmm3 mov r13d,DWORD[52+rsp] mov r14d,DWORD[56+rsp] mov r15d,DWORD[60+rsp] mov QWORD[((64+0))+rsp],rbp mov ebp,10 mov QWORD[((64+8))+rsp],rsi DB 102,72,15,126,214 mov QWORD[((64+16))+rsp],rdi mov rdi,rsi shr rdi,32 jmp NEAR $L$oop ALIGN 32 $L$oop: add eax,r8d xor r12d,eax rol r12d,16 add ebx,r9d xor r13d,ebx rol r13d,16 add esi,r12d xor r8d,esi rol r8d,12 add edi,r13d xor r9d,edi rol r9d,12 add eax,r8d xor r12d,eax rol r12d,8 add ebx,r9d xor r13d,ebx rol r13d,8 add esi,r12d xor r8d,esi rol r8d,7 add edi,r13d xor r9d,edi rol r9d,7 mov DWORD[32+rsp],esi mov DWORD[36+rsp],edi mov esi,DWORD[40+rsp] mov edi,DWORD[44+rsp] add ecx,r10d xor r14d,ecx rol r14d,16 add edx,r11d xor r15d,edx rol r15d,16 add esi,r14d xor r10d,esi rol r10d,12 add edi,r15d xor r11d,edi rol r11d,12 add ecx,r10d xor r14d,ecx rol r14d,8 add edx,r11d xor r15d,edx rol r15d,8 add esi,r14d xor r10d,esi rol r10d,7 add edi,r15d xor r11d,edi rol r11d,7 add eax,r9d xor r15d,eax rol r15d,16 add ebx,r10d xor r12d,ebx rol r12d,16 add esi,r15d xor r9d,esi rol r9d,12 add edi,r12d xor r10d,edi rol r10d,12 add eax,r9d xor r15d,eax rol r15d,8 add ebx,r10d xor r12d,ebx rol r12d,8 
add esi,r15d xor r9d,esi rol r9d,7 add edi,r12d xor r10d,edi rol r10d,7 mov DWORD[40+rsp],esi mov DWORD[44+rsp],edi mov esi,DWORD[32+rsp] mov edi,DWORD[36+rsp] add ecx,r11d xor r13d,ecx rol r13d,16 add edx,r8d xor r14d,edx rol r14d,16 add esi,r13d xor r11d,esi rol r11d,12 add edi,r14d xor r8d,edi rol r8d,12 add ecx,r11d xor r13d,ecx rol r13d,8 add edx,r8d xor r14d,edx rol r14d,8 add esi,r13d xor r11d,esi rol r11d,7 add edi,r14d xor r8d,edi rol r8d,7 dec ebp jnz NEAR $L$oop mov DWORD[36+rsp],edi mov DWORD[32+rsp],esi mov rbp,QWORD[64+rsp] movdqa xmm1,xmm2 mov rsi,QWORD[((64+8))+rsp] paddd xmm3,xmm4 mov rdi,QWORD[((64+16))+rsp] add eax,0x61707865 add ebx,0x3320646e add ecx,0x79622d32 add edx,0x6b206574 add r8d,DWORD[16+rsp] add r9d,DWORD[20+rsp] add r10d,DWORD[24+rsp] add r11d,DWORD[28+rsp] add r12d,DWORD[48+rsp] add r13d,DWORD[52+rsp] add r14d,DWORD[56+rsp] add r15d,DWORD[60+rsp] paddd xmm1,XMMWORD[32+rsp] cmp rbp,64 jb NEAR $L$tail xor eax,DWORD[rsi] xor ebx,DWORD[4+rsi] xor ecx,DWORD[8+rsi] xor edx,DWORD[12+rsi] xor r8d,DWORD[16+rsi] xor r9d,DWORD[20+rsi] xor r10d,DWORD[24+rsi] xor r11d,DWORD[28+rsi] movdqu xmm0,XMMWORD[32+rsi] xor r12d,DWORD[48+rsi] xor r13d,DWORD[52+rsi] xor r14d,DWORD[56+rsi] xor r15d,DWORD[60+rsi] lea rsi,[64+rsi] pxor xmm0,xmm1 movdqa XMMWORD[32+rsp],xmm2 movd DWORD[48+rsp],xmm3 mov DWORD[rdi],eax mov DWORD[4+rdi],ebx mov DWORD[8+rdi],ecx mov DWORD[12+rdi],edx mov DWORD[16+rdi],r8d mov DWORD[20+rdi],r9d mov DWORD[24+rdi],r10d mov DWORD[28+rdi],r11d movdqu XMMWORD[32+rdi],xmm0 mov DWORD[48+rdi],r12d mov DWORD[52+rdi],r13d mov DWORD[56+rdi],r14d mov DWORD[60+rdi],r15d lea rdi,[64+rdi] sub rbp,64 jnz NEAR $L$oop_outer jmp NEAR $L$done ALIGN 16 $L$tail: mov DWORD[rsp],eax mov DWORD[4+rsp],ebx xor rbx,rbx mov DWORD[8+rsp],ecx mov DWORD[12+rsp],edx mov DWORD[16+rsp],r8d mov DWORD[20+rsp],r9d mov DWORD[24+rsp],r10d mov DWORD[28+rsp],r11d movdqa XMMWORD[32+rsp],xmm1 mov DWORD[48+rsp],r12d mov DWORD[52+rsp],r13d mov DWORD[56+rsp],r14d mov DWORD[60+rsp],r15d $L$oop_tail: movzx eax,BYTE[rbx*1+rsi] movzx edx,BYTE[rbx*1+rsp] lea rbx,[1+rbx] xor eax,edx mov BYTE[((-1))+rbx*1+rdi],al dec rbp jnz NEAR $L$oop_tail $L$done: lea rsi,[((64+24+48))+rsp] mov r15,QWORD[((-48))+rsi] mov r14,QWORD[((-40))+rsi] mov r13,QWORD[((-32))+rsi] mov r12,QWORD[((-24))+rsi] mov rbp,QWORD[((-16))+rsi] mov rbx,QWORD[((-8))+rsi] lea rsp,[rsi] $L$no_data: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$SEH_end_ChaCha20_ctr32_nohw: global ChaCha20_ctr32_ssse3_4x ALIGN 32 ChaCha20_ctr32_ssse3_4x: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_ChaCha20_ctr32_ssse3_4x: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD[40+rsp] _CET_ENDBR mov r9,rsp sub rsp,0x140+168 movaps XMMWORD[(-168)+r9],xmm6 movaps XMMWORD[(-152)+r9],xmm7 movaps XMMWORD[(-136)+r9],xmm8 movaps XMMWORD[(-120)+r9],xmm9 movaps XMMWORD[(-104)+r9],xmm10 movaps XMMWORD[(-88)+r9],xmm11 movaps XMMWORD[(-72)+r9],xmm12 movaps XMMWORD[(-56)+r9],xmm13 movaps XMMWORD[(-40)+r9],xmm14 movaps XMMWORD[(-24)+r9],xmm15 $L$4x_body: movdqa xmm11,XMMWORD[$L$sigma] movdqu xmm15,XMMWORD[rcx] movdqu xmm7,XMMWORD[16+rcx] movdqu xmm3,XMMWORD[r8] lea rcx,[256+rsp] lea r10,[$L$rot16] lea r11,[$L$rot24] pshufd xmm8,xmm11,0x00 pshufd xmm9,xmm11,0x55 movdqa XMMWORD[64+rsp],xmm8 pshufd xmm10,xmm11,0xaa movdqa XMMWORD[80+rsp],xmm9 pshufd xmm11,xmm11,0xff movdqa XMMWORD[96+rsp],xmm10 movdqa XMMWORD[112+rsp],xmm11 pshufd xmm12,xmm15,0x00 pshufd xmm13,xmm15,0x55 movdqa XMMWORD[(128-256)+rcx],xmm12 pshufd 
xmm14,xmm15,0xaa movdqa XMMWORD[(144-256)+rcx],xmm13 pshufd xmm15,xmm15,0xff movdqa XMMWORD[(160-256)+rcx],xmm14 movdqa XMMWORD[(176-256)+rcx],xmm15 pshufd xmm4,xmm7,0x00 pshufd xmm5,xmm7,0x55 movdqa XMMWORD[(192-256)+rcx],xmm4 pshufd xmm6,xmm7,0xaa movdqa XMMWORD[(208-256)+rcx],xmm5 pshufd xmm7,xmm7,0xff movdqa XMMWORD[(224-256)+rcx],xmm6 movdqa XMMWORD[(240-256)+rcx],xmm7 pshufd xmm0,xmm3,0x00 pshufd xmm1,xmm3,0x55 paddd xmm0,XMMWORD[$L$inc] pshufd xmm2,xmm3,0xaa movdqa XMMWORD[(272-256)+rcx],xmm1 pshufd xmm3,xmm3,0xff movdqa XMMWORD[(288-256)+rcx],xmm2 movdqa XMMWORD[(304-256)+rcx],xmm3 jmp NEAR $L$oop_enter4x ALIGN 32 $L$oop_outer4x: movdqa xmm8,XMMWORD[64+rsp] movdqa xmm9,XMMWORD[80+rsp] movdqa xmm10,XMMWORD[96+rsp] movdqa xmm11,XMMWORD[112+rsp] movdqa xmm12,XMMWORD[((128-256))+rcx] movdqa xmm13,XMMWORD[((144-256))+rcx] movdqa xmm14,XMMWORD[((160-256))+rcx] movdqa xmm15,XMMWORD[((176-256))+rcx] movdqa xmm4,XMMWORD[((192-256))+rcx] movdqa xmm5,XMMWORD[((208-256))+rcx] movdqa xmm6,XMMWORD[((224-256))+rcx] movdqa xmm7,XMMWORD[((240-256))+rcx] movdqa xmm0,XMMWORD[((256-256))+rcx] movdqa xmm1,XMMWORD[((272-256))+rcx] movdqa xmm2,XMMWORD[((288-256))+rcx] movdqa xmm3,XMMWORD[((304-256))+rcx] paddd xmm0,XMMWORD[$L$four] $L$oop_enter4x: movdqa XMMWORD[32+rsp],xmm6 movdqa XMMWORD[48+rsp],xmm7 movdqa xmm7,XMMWORD[r10] mov eax,10 movdqa XMMWORD[(256-256)+rcx],xmm0 jmp NEAR $L$oop4x ALIGN 32 $L$oop4x: paddd xmm8,xmm12 paddd xmm9,xmm13 pxor xmm0,xmm8 pxor xmm1,xmm9 DB 102,15,56,0,199 DB 102,15,56,0,207 paddd xmm4,xmm0 paddd xmm5,xmm1 pxor xmm12,xmm4 pxor xmm13,xmm5 movdqa xmm6,xmm12 pslld xmm12,12 psrld xmm6,20 movdqa xmm7,xmm13 pslld xmm13,12 por xmm12,xmm6 psrld xmm7,20 movdqa xmm6,XMMWORD[r11] por xmm13,xmm7 paddd xmm8,xmm12 paddd xmm9,xmm13 pxor xmm0,xmm8 pxor xmm1,xmm9 DB 102,15,56,0,198 DB 102,15,56,0,206 paddd xmm4,xmm0 paddd xmm5,xmm1 pxor xmm12,xmm4 pxor xmm13,xmm5 movdqa xmm7,xmm12 pslld xmm12,7 psrld xmm7,25 movdqa xmm6,xmm13 pslld xmm13,7 por xmm12,xmm7 psrld xmm6,25 movdqa xmm7,XMMWORD[r10] por xmm13,xmm6 movdqa XMMWORD[rsp],xmm4 movdqa XMMWORD[16+rsp],xmm5 movdqa xmm4,XMMWORD[32+rsp] movdqa xmm5,XMMWORD[48+rsp] paddd xmm10,xmm14 paddd xmm11,xmm15 pxor xmm2,xmm10 pxor xmm3,xmm11 DB 102,15,56,0,215 DB 102,15,56,0,223 paddd xmm4,xmm2 paddd xmm5,xmm3 pxor xmm14,xmm4 pxor xmm15,xmm5 movdqa xmm6,xmm14 pslld xmm14,12 psrld xmm6,20 movdqa xmm7,xmm15 pslld xmm15,12 por xmm14,xmm6 psrld xmm7,20 movdqa xmm6,XMMWORD[r11] por xmm15,xmm7 paddd xmm10,xmm14 paddd xmm11,xmm15 pxor xmm2,xmm10 pxor xmm3,xmm11 DB 102,15,56,0,214 DB 102,15,56,0,222 paddd xmm4,xmm2 paddd xmm5,xmm3 pxor xmm14,xmm4 pxor xmm15,xmm5 movdqa xmm7,xmm14 pslld xmm14,7 psrld xmm7,25 movdqa xmm6,xmm15 pslld xmm15,7 por xmm14,xmm7 psrld xmm6,25 movdqa xmm7,XMMWORD[r10] por xmm15,xmm6 paddd xmm8,xmm13 paddd xmm9,xmm14 pxor xmm3,xmm8 pxor xmm0,xmm9 DB 102,15,56,0,223 DB 102,15,56,0,199 paddd xmm4,xmm3 paddd xmm5,xmm0 pxor xmm13,xmm4 pxor xmm14,xmm5 movdqa xmm6,xmm13 pslld xmm13,12 psrld xmm6,20 movdqa xmm7,xmm14 pslld xmm14,12 por xmm13,xmm6 psrld xmm7,20 movdqa xmm6,XMMWORD[r11] por xmm14,xmm7 paddd xmm8,xmm13 paddd xmm9,xmm14 pxor xmm3,xmm8 pxor xmm0,xmm9 DB 102,15,56,0,222 DB 102,15,56,0,198 paddd xmm4,xmm3 paddd xmm5,xmm0 pxor xmm13,xmm4 pxor xmm14,xmm5 movdqa xmm7,xmm13 pslld xmm13,7 psrld xmm7,25 movdqa xmm6,xmm14 pslld xmm14,7 por xmm13,xmm7 psrld xmm6,25 movdqa xmm7,XMMWORD[r10] por xmm14,xmm6 movdqa XMMWORD[32+rsp],xmm4 movdqa XMMWORD[48+rsp],xmm5 movdqa xmm4,XMMWORD[rsp] movdqa xmm5,XMMWORD[16+rsp] paddd xmm10,xmm15 paddd 
xmm11,xmm12 pxor xmm1,xmm10 pxor xmm2,xmm11 DB 102,15,56,0,207 DB 102,15,56,0,215 paddd xmm4,xmm1 paddd xmm5,xmm2 pxor xmm15,xmm4 pxor xmm12,xmm5 movdqa xmm6,xmm15 pslld xmm15,12 psrld xmm6,20 movdqa xmm7,xmm12 pslld xmm12,12 por xmm15,xmm6 psrld xmm7,20 movdqa xmm6,XMMWORD[r11] por xmm12,xmm7 paddd xmm10,xmm15 paddd xmm11,xmm12 pxor xmm1,xmm10 pxor xmm2,xmm11 DB 102,15,56,0,206 DB 102,15,56,0,214 paddd xmm4,xmm1 paddd xmm5,xmm2 pxor xmm15,xmm4 pxor xmm12,xmm5 movdqa xmm7,xmm15 pslld xmm15,7 psrld xmm7,25 movdqa xmm6,xmm12 pslld xmm12,7 por xmm15,xmm7 psrld xmm6,25 movdqa xmm7,XMMWORD[r10] por xmm12,xmm6 dec eax jnz NEAR $L$oop4x paddd xmm8,XMMWORD[64+rsp] paddd xmm9,XMMWORD[80+rsp] paddd xmm10,XMMWORD[96+rsp] paddd xmm11,XMMWORD[112+rsp] movdqa xmm6,xmm8 punpckldq xmm8,xmm9 movdqa xmm7,xmm10 punpckldq xmm10,xmm11 punpckhdq xmm6,xmm9 punpckhdq xmm7,xmm11 movdqa xmm9,xmm8 punpcklqdq xmm8,xmm10 movdqa xmm11,xmm6 punpcklqdq xmm6,xmm7 punpckhqdq xmm9,xmm10 punpckhqdq xmm11,xmm7 paddd xmm12,XMMWORD[((128-256))+rcx] paddd xmm13,XMMWORD[((144-256))+rcx] paddd xmm14,XMMWORD[((160-256))+rcx] paddd xmm15,XMMWORD[((176-256))+rcx] movdqa XMMWORD[rsp],xmm8 movdqa XMMWORD[16+rsp],xmm9 movdqa xmm8,XMMWORD[32+rsp] movdqa xmm9,XMMWORD[48+rsp] movdqa xmm10,xmm12 punpckldq xmm12,xmm13 movdqa xmm7,xmm14 punpckldq xmm14,xmm15 punpckhdq xmm10,xmm13 punpckhdq xmm7,xmm15 movdqa xmm13,xmm12 punpcklqdq xmm12,xmm14 movdqa xmm15,xmm10 punpcklqdq xmm10,xmm7 punpckhqdq xmm13,xmm14 punpckhqdq xmm15,xmm7 paddd xmm4,XMMWORD[((192-256))+rcx] paddd xmm5,XMMWORD[((208-256))+rcx] paddd xmm8,XMMWORD[((224-256))+rcx] paddd xmm9,XMMWORD[((240-256))+rcx] movdqa XMMWORD[32+rsp],xmm6 movdqa XMMWORD[48+rsp],xmm11 movdqa xmm14,xmm4 punpckldq xmm4,xmm5 movdqa xmm7,xmm8 punpckldq xmm8,xmm9 punpckhdq xmm14,xmm5 punpckhdq xmm7,xmm9 movdqa xmm5,xmm4 punpcklqdq xmm4,xmm8 movdqa xmm9,xmm14 punpcklqdq xmm14,xmm7 punpckhqdq xmm5,xmm8 punpckhqdq xmm9,xmm7 paddd xmm0,XMMWORD[((256-256))+rcx] paddd xmm1,XMMWORD[((272-256))+rcx] paddd xmm2,XMMWORD[((288-256))+rcx] paddd xmm3,XMMWORD[((304-256))+rcx] movdqa xmm8,xmm0 punpckldq xmm0,xmm1 movdqa xmm7,xmm2 punpckldq xmm2,xmm3 punpckhdq xmm8,xmm1 punpckhdq xmm7,xmm3 movdqa xmm1,xmm0 punpcklqdq xmm0,xmm2 movdqa xmm3,xmm8 punpcklqdq xmm8,xmm7 punpckhqdq xmm1,xmm2 punpckhqdq xmm3,xmm7 cmp rdx,64*4 jb NEAR $L$tail4x movdqu xmm6,XMMWORD[rsi] movdqu xmm11,XMMWORD[16+rsi] movdqu xmm2,XMMWORD[32+rsi] movdqu xmm7,XMMWORD[48+rsi] pxor xmm6,XMMWORD[rsp] pxor xmm11,xmm12 pxor xmm2,xmm4 pxor xmm7,xmm0 movdqu XMMWORD[rdi],xmm6 movdqu xmm6,XMMWORD[64+rsi] movdqu XMMWORD[16+rdi],xmm11 movdqu xmm11,XMMWORD[80+rsi] movdqu XMMWORD[32+rdi],xmm2 movdqu xmm2,XMMWORD[96+rsi] movdqu XMMWORD[48+rdi],xmm7 movdqu xmm7,XMMWORD[112+rsi] lea rsi,[128+rsi] pxor xmm6,XMMWORD[16+rsp] pxor xmm11,xmm13 pxor xmm2,xmm5 pxor xmm7,xmm1 movdqu XMMWORD[64+rdi],xmm6 movdqu xmm6,XMMWORD[rsi] movdqu XMMWORD[80+rdi],xmm11 movdqu xmm11,XMMWORD[16+rsi] movdqu XMMWORD[96+rdi],xmm2 movdqu xmm2,XMMWORD[32+rsi] movdqu XMMWORD[112+rdi],xmm7 lea rdi,[128+rdi] movdqu xmm7,XMMWORD[48+rsi] pxor xmm6,XMMWORD[32+rsp] pxor xmm11,xmm10 pxor xmm2,xmm14 pxor xmm7,xmm8 movdqu XMMWORD[rdi],xmm6 movdqu xmm6,XMMWORD[64+rsi] movdqu XMMWORD[16+rdi],xmm11 movdqu xmm11,XMMWORD[80+rsi] movdqu XMMWORD[32+rdi],xmm2 movdqu xmm2,XMMWORD[96+rsi] movdqu XMMWORD[48+rdi],xmm7 movdqu xmm7,XMMWORD[112+rsi] lea rsi,[128+rsi] pxor xmm6,XMMWORD[48+rsp] pxor xmm11,xmm15 pxor xmm2,xmm9 pxor xmm7,xmm3 movdqu XMMWORD[64+rdi],xmm6 movdqu XMMWORD[80+rdi],xmm11 movdqu XMMWORD[96+rdi],xmm2 
movdqu XMMWORD[112+rdi],xmm7 lea rdi,[128+rdi] sub rdx,64*4 jnz NEAR $L$oop_outer4x jmp NEAR $L$done4x $L$tail4x: cmp rdx,192 jae NEAR $L$192_or_more4x cmp rdx,128 jae NEAR $L$128_or_more4x cmp rdx,64 jae NEAR $L$64_or_more4x xor r10,r10 movdqa XMMWORD[16+rsp],xmm12 movdqa XMMWORD[32+rsp],xmm4 movdqa XMMWORD[48+rsp],xmm0 jmp NEAR $L$oop_tail4x ALIGN 32 $L$64_or_more4x: movdqu xmm6,XMMWORD[rsi] movdqu xmm11,XMMWORD[16+rsi] movdqu xmm2,XMMWORD[32+rsi] movdqu xmm7,XMMWORD[48+rsi] pxor xmm6,XMMWORD[rsp] pxor xmm11,xmm12 pxor xmm2,xmm4 pxor xmm7,xmm0 movdqu XMMWORD[rdi],xmm6 movdqu XMMWORD[16+rdi],xmm11 movdqu XMMWORD[32+rdi],xmm2 movdqu XMMWORD[48+rdi],xmm7 je NEAR $L$done4x movdqa xmm6,XMMWORD[16+rsp] lea rsi,[64+rsi] xor r10,r10 movdqa XMMWORD[rsp],xmm6 movdqa XMMWORD[16+rsp],xmm13 lea rdi,[64+rdi] movdqa XMMWORD[32+rsp],xmm5 sub rdx,64 movdqa XMMWORD[48+rsp],xmm1 jmp NEAR $L$oop_tail4x ALIGN 32 $L$128_or_more4x: movdqu xmm6,XMMWORD[rsi] movdqu xmm11,XMMWORD[16+rsi] movdqu xmm2,XMMWORD[32+rsi] movdqu xmm7,XMMWORD[48+rsi] pxor xmm6,XMMWORD[rsp] pxor xmm11,xmm12 pxor xmm2,xmm4 pxor xmm7,xmm0 movdqu XMMWORD[rdi],xmm6 movdqu xmm6,XMMWORD[64+rsi] movdqu XMMWORD[16+rdi],xmm11 movdqu xmm11,XMMWORD[80+rsi] movdqu XMMWORD[32+rdi],xmm2 movdqu xmm2,XMMWORD[96+rsi] movdqu XMMWORD[48+rdi],xmm7 movdqu xmm7,XMMWORD[112+rsi] pxor xmm6,XMMWORD[16+rsp] pxor xmm11,xmm13 pxor xmm2,xmm5 pxor xmm7,xmm1 movdqu XMMWORD[64+rdi],xmm6 movdqu XMMWORD[80+rdi],xmm11 movdqu XMMWORD[96+rdi],xmm2 movdqu XMMWORD[112+rdi],xmm7 je NEAR $L$done4x movdqa xmm6,XMMWORD[32+rsp] lea rsi,[128+rsi] xor r10,r10 movdqa XMMWORD[rsp],xmm6 movdqa XMMWORD[16+rsp],xmm10 lea rdi,[128+rdi] movdqa XMMWORD[32+rsp],xmm14 sub rdx,128 movdqa XMMWORD[48+rsp],xmm8 jmp NEAR $L$oop_tail4x ALIGN 32 $L$192_or_more4x: movdqu xmm6,XMMWORD[rsi] movdqu xmm11,XMMWORD[16+rsi] movdqu xmm2,XMMWORD[32+rsi] movdqu xmm7,XMMWORD[48+rsi] pxor xmm6,XMMWORD[rsp] pxor xmm11,xmm12 pxor xmm2,xmm4 pxor xmm7,xmm0 movdqu XMMWORD[rdi],xmm6 movdqu xmm6,XMMWORD[64+rsi] movdqu XMMWORD[16+rdi],xmm11 movdqu xmm11,XMMWORD[80+rsi] movdqu XMMWORD[32+rdi],xmm2 movdqu xmm2,XMMWORD[96+rsi] movdqu XMMWORD[48+rdi],xmm7 movdqu xmm7,XMMWORD[112+rsi] lea rsi,[128+rsi] pxor xmm6,XMMWORD[16+rsp] pxor xmm11,xmm13 pxor xmm2,xmm5 pxor xmm7,xmm1 movdqu XMMWORD[64+rdi],xmm6 movdqu xmm6,XMMWORD[rsi] movdqu XMMWORD[80+rdi],xmm11 movdqu xmm11,XMMWORD[16+rsi] movdqu XMMWORD[96+rdi],xmm2 movdqu xmm2,XMMWORD[32+rsi] movdqu XMMWORD[112+rdi],xmm7 lea rdi,[128+rdi] movdqu xmm7,XMMWORD[48+rsi] pxor xmm6,XMMWORD[32+rsp] pxor xmm11,xmm10 pxor xmm2,xmm14 pxor xmm7,xmm8 movdqu XMMWORD[rdi],xmm6 movdqu XMMWORD[16+rdi],xmm11 movdqu XMMWORD[32+rdi],xmm2 movdqu XMMWORD[48+rdi],xmm7 je NEAR $L$done4x movdqa xmm6,XMMWORD[48+rsp] lea rsi,[64+rsi] xor r10,r10 movdqa XMMWORD[rsp],xmm6 movdqa XMMWORD[16+rsp],xmm15 lea rdi,[64+rdi] movdqa XMMWORD[32+rsp],xmm9 sub rdx,192 movdqa XMMWORD[48+rsp],xmm3 $L$oop_tail4x: movzx eax,BYTE[r10*1+rsi] movzx ecx,BYTE[r10*1+rsp] lea r10,[1+r10] xor eax,ecx mov BYTE[((-1))+r10*1+rdi],al dec rdx jnz NEAR $L$oop_tail4x $L$done4x: movaps xmm6,XMMWORD[((-168))+r9] movaps xmm7,XMMWORD[((-152))+r9] movaps xmm8,XMMWORD[((-136))+r9] movaps xmm9,XMMWORD[((-120))+r9] movaps xmm10,XMMWORD[((-104))+r9] movaps xmm11,XMMWORD[((-88))+r9] movaps xmm12,XMMWORD[((-72))+r9] movaps xmm13,XMMWORD[((-56))+r9] movaps xmm14,XMMWORD[((-40))+r9] movaps xmm15,XMMWORD[((-24))+r9] lea rsp,[r9] $L$4x_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$SEH_end_ChaCha20_ctr32_ssse3_4x: global 
ChaCha20_ctr32_avx2 ALIGN 32 ChaCha20_ctr32_avx2: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_ChaCha20_ctr32_avx2: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD[40+rsp] _CET_ENDBR mov r9,rsp sub rsp,0x280+168 and rsp,-32 movaps XMMWORD[(-168)+r9],xmm6 movaps XMMWORD[(-152)+r9],xmm7 movaps XMMWORD[(-136)+r9],xmm8 movaps XMMWORD[(-120)+r9],xmm9 movaps XMMWORD[(-104)+r9],xmm10 movaps XMMWORD[(-88)+r9],xmm11 movaps XMMWORD[(-72)+r9],xmm12 movaps XMMWORD[(-56)+r9],xmm13 movaps XMMWORD[(-40)+r9],xmm14 movaps XMMWORD[(-24)+r9],xmm15 $L$8x_body: vzeroupper vbroadcasti128 ymm11,XMMWORD[$L$sigma] vbroadcasti128 ymm3,XMMWORD[rcx] vbroadcasti128 ymm15,XMMWORD[16+rcx] vbroadcasti128 ymm7,XMMWORD[r8] lea rcx,[256+rsp] lea rax,[512+rsp] lea r10,[$L$rot16] lea r11,[$L$rot24] vpshufd ymm8,ymm11,0x00 vpshufd ymm9,ymm11,0x55 vmovdqa YMMWORD[(128-256)+rcx],ymm8 vpshufd ymm10,ymm11,0xaa vmovdqa YMMWORD[(160-256)+rcx],ymm9 vpshufd ymm11,ymm11,0xff vmovdqa YMMWORD[(192-256)+rcx],ymm10 vmovdqa YMMWORD[(224-256)+rcx],ymm11 vpshufd ymm0,ymm3,0x00 vpshufd ymm1,ymm3,0x55 vmovdqa YMMWORD[(256-256)+rcx],ymm0 vpshufd ymm2,ymm3,0xaa vmovdqa YMMWORD[(288-256)+rcx],ymm1 vpshufd ymm3,ymm3,0xff vmovdqa YMMWORD[(320-256)+rcx],ymm2 vmovdqa YMMWORD[(352-256)+rcx],ymm3 vpshufd ymm12,ymm15,0x00 vpshufd ymm13,ymm15,0x55 vmovdqa YMMWORD[(384-512)+rax],ymm12 vpshufd ymm14,ymm15,0xaa vmovdqa YMMWORD[(416-512)+rax],ymm13 vpshufd ymm15,ymm15,0xff vmovdqa YMMWORD[(448-512)+rax],ymm14 vmovdqa YMMWORD[(480-512)+rax],ymm15 vpshufd ymm4,ymm7,0x00 vpshufd ymm5,ymm7,0x55 vpaddd ymm4,ymm4,YMMWORD[$L$incy] vpshufd ymm6,ymm7,0xaa vmovdqa YMMWORD[(544-512)+rax],ymm5 vpshufd ymm7,ymm7,0xff vmovdqa YMMWORD[(576-512)+rax],ymm6 vmovdqa YMMWORD[(608-512)+rax],ymm7 jmp NEAR $L$oop_enter8x ALIGN 32 $L$oop_outer8x: vmovdqa ymm8,YMMWORD[((128-256))+rcx] vmovdqa ymm9,YMMWORD[((160-256))+rcx] vmovdqa ymm10,YMMWORD[((192-256))+rcx] vmovdqa ymm11,YMMWORD[((224-256))+rcx] vmovdqa ymm0,YMMWORD[((256-256))+rcx] vmovdqa ymm1,YMMWORD[((288-256))+rcx] vmovdqa ymm2,YMMWORD[((320-256))+rcx] vmovdqa ymm3,YMMWORD[((352-256))+rcx] vmovdqa ymm12,YMMWORD[((384-512))+rax] vmovdqa ymm13,YMMWORD[((416-512))+rax] vmovdqa ymm14,YMMWORD[((448-512))+rax] vmovdqa ymm15,YMMWORD[((480-512))+rax] vmovdqa ymm4,YMMWORD[((512-512))+rax] vmovdqa ymm5,YMMWORD[((544-512))+rax] vmovdqa ymm6,YMMWORD[((576-512))+rax] vmovdqa ymm7,YMMWORD[((608-512))+rax] vpaddd ymm4,ymm4,YMMWORD[$L$eight] $L$oop_enter8x: vmovdqa YMMWORD[64+rsp],ymm14 vmovdqa YMMWORD[96+rsp],ymm15 vbroadcasti128 ymm15,XMMWORD[r10] vmovdqa YMMWORD[(512-512)+rax],ymm4 mov eax,10 jmp NEAR $L$oop8x ALIGN 32 $L$oop8x: vpaddd ymm8,ymm8,ymm0 vpxor ymm4,ymm8,ymm4 vpshufb ymm4,ymm4,ymm15 vpaddd ymm9,ymm9,ymm1 vpxor ymm5,ymm9,ymm5 vpshufb ymm5,ymm5,ymm15 vpaddd ymm12,ymm12,ymm4 vpxor ymm0,ymm12,ymm0 vpslld ymm14,ymm0,12 vpsrld ymm0,ymm0,20 vpor ymm0,ymm14,ymm0 vbroadcasti128 ymm14,XMMWORD[r11] vpaddd ymm13,ymm13,ymm5 vpxor ymm1,ymm13,ymm1 vpslld ymm15,ymm1,12 vpsrld ymm1,ymm1,20 vpor ymm1,ymm15,ymm1 vpaddd ymm8,ymm8,ymm0 vpxor ymm4,ymm8,ymm4 vpshufb ymm4,ymm4,ymm14 vpaddd ymm9,ymm9,ymm1 vpxor ymm5,ymm9,ymm5 vpshufb ymm5,ymm5,ymm14 vpaddd ymm12,ymm12,ymm4 vpxor ymm0,ymm12,ymm0 vpslld ymm15,ymm0,7 vpsrld ymm0,ymm0,25 vpor ymm0,ymm15,ymm0 vbroadcasti128 ymm15,XMMWORD[r10] vpaddd ymm13,ymm13,ymm5 vpxor ymm1,ymm13,ymm1 vpslld ymm14,ymm1,7 vpsrld ymm1,ymm1,25 vpor ymm1,ymm14,ymm1 vmovdqa YMMWORD[rsp],ymm12 vmovdqa YMMWORD[32+rsp],ymm13 vmovdqa ymm12,YMMWORD[64+rsp] vmovdqa 
ymm13,YMMWORD[96+rsp] vpaddd ymm10,ymm10,ymm2 vpxor ymm6,ymm10,ymm6 vpshufb ymm6,ymm6,ymm15 vpaddd ymm11,ymm11,ymm3 vpxor ymm7,ymm11,ymm7 vpshufb ymm7,ymm7,ymm15 vpaddd ymm12,ymm12,ymm6 vpxor ymm2,ymm12,ymm2 vpslld ymm14,ymm2,12 vpsrld ymm2,ymm2,20 vpor ymm2,ymm14,ymm2 vbroadcasti128 ymm14,XMMWORD[r11] vpaddd ymm13,ymm13,ymm7 vpxor ymm3,ymm13,ymm3 vpslld ymm15,ymm3,12 vpsrld ymm3,ymm3,20 vpor ymm3,ymm15,ymm3 vpaddd ymm10,ymm10,ymm2 vpxor ymm6,ymm10,ymm6 vpshufb ymm6,ymm6,ymm14 vpaddd ymm11,ymm11,ymm3 vpxor ymm7,ymm11,ymm7 vpshufb ymm7,ymm7,ymm14 vpaddd ymm12,ymm12,ymm6 vpxor ymm2,ymm12,ymm2 vpslld ymm15,ymm2,7 vpsrld ymm2,ymm2,25 vpor ymm2,ymm15,ymm2 vbroadcasti128 ymm15,XMMWORD[r10] vpaddd ymm13,ymm13,ymm7 vpxor ymm3,ymm13,ymm3 vpslld ymm14,ymm3,7 vpsrld ymm3,ymm3,25 vpor ymm3,ymm14,ymm3 vpaddd ymm8,ymm8,ymm1 vpxor ymm7,ymm8,ymm7 vpshufb ymm7,ymm7,ymm15 vpaddd ymm9,ymm9,ymm2 vpxor ymm4,ymm9,ymm4 vpshufb ymm4,ymm4,ymm15 vpaddd ymm12,ymm12,ymm7 vpxor ymm1,ymm12,ymm1 vpslld ymm14,ymm1,12 vpsrld ymm1,ymm1,20 vpor ymm1,ymm14,ymm1 vbroadcasti128 ymm14,XMMWORD[r11] vpaddd ymm13,ymm13,ymm4 vpxor ymm2,ymm13,ymm2 vpslld ymm15,ymm2,12 vpsrld ymm2,ymm2,20 vpor ymm2,ymm15,ymm2 vpaddd ymm8,ymm8,ymm1 vpxor ymm7,ymm8,ymm7 vpshufb ymm7,ymm7,ymm14 vpaddd ymm9,ymm9,ymm2 vpxor ymm4,ymm9,ymm4 vpshufb ymm4,ymm4,ymm14 vpaddd ymm12,ymm12,ymm7 vpxor ymm1,ymm12,ymm1 vpslld ymm15,ymm1,7 vpsrld ymm1,ymm1,25 vpor ymm1,ymm15,ymm1 vbroadcasti128 ymm15,XMMWORD[r10] vpaddd ymm13,ymm13,ymm4 vpxor ymm2,ymm13,ymm2 vpslld ymm14,ymm2,7 vpsrld ymm2,ymm2,25 vpor ymm2,ymm14,ymm2 vmovdqa YMMWORD[64+rsp],ymm12 vmovdqa YMMWORD[96+rsp],ymm13 vmovdqa ymm12,YMMWORD[rsp] vmovdqa ymm13,YMMWORD[32+rsp] vpaddd ymm10,ymm10,ymm3 vpxor ymm5,ymm10,ymm5 vpshufb ymm5,ymm5,ymm15 vpaddd ymm11,ymm11,ymm0 vpxor ymm6,ymm11,ymm6 vpshufb ymm6,ymm6,ymm15 vpaddd ymm12,ymm12,ymm5 vpxor ymm3,ymm12,ymm3 vpslld ymm14,ymm3,12 vpsrld ymm3,ymm3,20 vpor ymm3,ymm14,ymm3 vbroadcasti128 ymm14,XMMWORD[r11] vpaddd ymm13,ymm13,ymm6 vpxor ymm0,ymm13,ymm0 vpslld ymm15,ymm0,12 vpsrld ymm0,ymm0,20 vpor ymm0,ymm15,ymm0 vpaddd ymm10,ymm10,ymm3 vpxor ymm5,ymm10,ymm5 vpshufb ymm5,ymm5,ymm14 vpaddd ymm11,ymm11,ymm0 vpxor ymm6,ymm11,ymm6 vpshufb ymm6,ymm6,ymm14 vpaddd ymm12,ymm12,ymm5 vpxor ymm3,ymm12,ymm3 vpslld ymm15,ymm3,7 vpsrld ymm3,ymm3,25 vpor ymm3,ymm15,ymm3 vbroadcasti128 ymm15,XMMWORD[r10] vpaddd ymm13,ymm13,ymm6 vpxor ymm0,ymm13,ymm0 vpslld ymm14,ymm0,7 vpsrld ymm0,ymm0,25 vpor ymm0,ymm14,ymm0 dec eax jnz NEAR $L$oop8x lea rax,[512+rsp] vpaddd ymm8,ymm8,YMMWORD[((128-256))+rcx] vpaddd ymm9,ymm9,YMMWORD[((160-256))+rcx] vpaddd ymm10,ymm10,YMMWORD[((192-256))+rcx] vpaddd ymm11,ymm11,YMMWORD[((224-256))+rcx] vpunpckldq ymm14,ymm8,ymm9 vpunpckldq ymm15,ymm10,ymm11 vpunpckhdq ymm8,ymm8,ymm9 vpunpckhdq ymm10,ymm10,ymm11 vpunpcklqdq ymm9,ymm14,ymm15 vpunpckhqdq ymm14,ymm14,ymm15 vpunpcklqdq ymm11,ymm8,ymm10 vpunpckhqdq ymm8,ymm8,ymm10 vpaddd ymm0,ymm0,YMMWORD[((256-256))+rcx] vpaddd ymm1,ymm1,YMMWORD[((288-256))+rcx] vpaddd ymm2,ymm2,YMMWORD[((320-256))+rcx] vpaddd ymm3,ymm3,YMMWORD[((352-256))+rcx] vpunpckldq ymm10,ymm0,ymm1 vpunpckldq ymm15,ymm2,ymm3 vpunpckhdq ymm0,ymm0,ymm1 vpunpckhdq ymm2,ymm2,ymm3 vpunpcklqdq ymm1,ymm10,ymm15 vpunpckhqdq ymm10,ymm10,ymm15 vpunpcklqdq ymm3,ymm0,ymm2 vpunpckhqdq ymm0,ymm0,ymm2 vperm2i128 ymm15,ymm9,ymm1,0x20 vperm2i128 ymm1,ymm9,ymm1,0x31 vperm2i128 ymm9,ymm14,ymm10,0x20 vperm2i128 ymm10,ymm14,ymm10,0x31 vperm2i128 ymm14,ymm11,ymm3,0x20 vperm2i128 ymm3,ymm11,ymm3,0x31 vperm2i128 ymm11,ymm8,ymm0,0x20 vperm2i128 ymm0,ymm8,ymm0,0x31 
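; The interleave above (together with the matching group that follows for the
; second half of the state) is a pure transpose: after the ten double-rounds and
; the add of the saved input, each ymm register holds one of the 16 ChaCha20
; state words for 8 independent blocks, one block per 32-bit lane, while the
; output XOR below wants whole 64-byte blocks contiguous in registers. A rough
; Rust-style sketch of what the punpck/vperm2i128 network computes (names are
; illustrative only, nothing here is defined by this file or by ring's API):
;
; fn transpose_keystream(words: [[u32; 8]; 16]) -> [[u32; 16]; 8] {
;     // words[w][b] = state word w of block b, after rounds + final add
;     let mut blocks = [[0u32; 16]; 8];
;     for w in 0..16 {
;         for b in 0..8 {
;             blocks[b][w] = words[w][b];
;         }
;     }
;     blocks // blocks[b] is the 64-byte keystream block XORed into the input
; }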
vmovdqa YMMWORD[rsp],ymm15 vmovdqa YMMWORD[32+rsp],ymm9 vmovdqa ymm15,YMMWORD[64+rsp] vmovdqa ymm9,YMMWORD[96+rsp] vpaddd ymm12,ymm12,YMMWORD[((384-512))+rax] vpaddd ymm13,ymm13,YMMWORD[((416-512))+rax] vpaddd ymm15,ymm15,YMMWORD[((448-512))+rax] vpaddd ymm9,ymm9,YMMWORD[((480-512))+rax] vpunpckldq ymm2,ymm12,ymm13 vpunpckldq ymm8,ymm15,ymm9 vpunpckhdq ymm12,ymm12,ymm13 vpunpckhdq ymm15,ymm15,ymm9 vpunpcklqdq ymm13,ymm2,ymm8 vpunpckhqdq ymm2,ymm2,ymm8 vpunpcklqdq ymm9,ymm12,ymm15 vpunpckhqdq ymm12,ymm12,ymm15 vpaddd ymm4,ymm4,YMMWORD[((512-512))+rax] vpaddd ymm5,ymm5,YMMWORD[((544-512))+rax] vpaddd ymm6,ymm6,YMMWORD[((576-512))+rax] vpaddd ymm7,ymm7,YMMWORD[((608-512))+rax] vpunpckldq ymm15,ymm4,ymm5 vpunpckldq ymm8,ymm6,ymm7 vpunpckhdq ymm4,ymm4,ymm5 vpunpckhdq ymm6,ymm6,ymm7 vpunpcklqdq ymm5,ymm15,ymm8 vpunpckhqdq ymm15,ymm15,ymm8 vpunpcklqdq ymm7,ymm4,ymm6 vpunpckhqdq ymm4,ymm4,ymm6 vperm2i128 ymm8,ymm13,ymm5,0x20 vperm2i128 ymm5,ymm13,ymm5,0x31 vperm2i128 ymm13,ymm2,ymm15,0x20 vperm2i128 ymm15,ymm2,ymm15,0x31 vperm2i128 ymm2,ymm9,ymm7,0x20 vperm2i128 ymm7,ymm9,ymm7,0x31 vperm2i128 ymm9,ymm12,ymm4,0x20 vperm2i128 ymm4,ymm12,ymm4,0x31 vmovdqa ymm6,YMMWORD[rsp] vmovdqa ymm12,YMMWORD[32+rsp] cmp rdx,64*8 jb NEAR $L$tail8x vpxor ymm6,ymm6,YMMWORD[rsi] vpxor ymm8,ymm8,YMMWORD[32+rsi] vpxor ymm1,ymm1,YMMWORD[64+rsi] vpxor ymm5,ymm5,YMMWORD[96+rsi] lea rsi,[128+rsi] vmovdqu YMMWORD[rdi],ymm6 vmovdqu YMMWORD[32+rdi],ymm8 vmovdqu YMMWORD[64+rdi],ymm1 vmovdqu YMMWORD[96+rdi],ymm5 lea rdi,[128+rdi] vpxor ymm12,ymm12,YMMWORD[rsi] vpxor ymm13,ymm13,YMMWORD[32+rsi] vpxor ymm10,ymm10,YMMWORD[64+rsi] vpxor ymm15,ymm15,YMMWORD[96+rsi] lea rsi,[128+rsi] vmovdqu YMMWORD[rdi],ymm12 vmovdqu YMMWORD[32+rdi],ymm13 vmovdqu YMMWORD[64+rdi],ymm10 vmovdqu YMMWORD[96+rdi],ymm15 lea rdi,[128+rdi] vpxor ymm14,ymm14,YMMWORD[rsi] vpxor ymm2,ymm2,YMMWORD[32+rsi] vpxor ymm3,ymm3,YMMWORD[64+rsi] vpxor ymm7,ymm7,YMMWORD[96+rsi] lea rsi,[128+rsi] vmovdqu YMMWORD[rdi],ymm14 vmovdqu YMMWORD[32+rdi],ymm2 vmovdqu YMMWORD[64+rdi],ymm3 vmovdqu YMMWORD[96+rdi],ymm7 lea rdi,[128+rdi] vpxor ymm11,ymm11,YMMWORD[rsi] vpxor ymm9,ymm9,YMMWORD[32+rsi] vpxor ymm0,ymm0,YMMWORD[64+rsi] vpxor ymm4,ymm4,YMMWORD[96+rsi] lea rsi,[128+rsi] vmovdqu YMMWORD[rdi],ymm11 vmovdqu YMMWORD[32+rdi],ymm9 vmovdqu YMMWORD[64+rdi],ymm0 vmovdqu YMMWORD[96+rdi],ymm4 lea rdi,[128+rdi] sub rdx,64*8 jnz NEAR $L$oop_outer8x jmp NEAR $L$done8x $L$tail8x: cmp rdx,448 jae NEAR $L$448_or_more8x cmp rdx,384 jae NEAR $L$384_or_more8x cmp rdx,320 jae NEAR $L$320_or_more8x cmp rdx,256 jae NEAR $L$256_or_more8x cmp rdx,192 jae NEAR $L$192_or_more8x cmp rdx,128 jae NEAR $L$128_or_more8x cmp rdx,64 jae NEAR $L$64_or_more8x xor r10,r10 vmovdqa YMMWORD[rsp],ymm6 vmovdqa YMMWORD[32+rsp],ymm8 jmp NEAR $L$oop_tail8x ALIGN 32 $L$64_or_more8x: vpxor ymm6,ymm6,YMMWORD[rsi] vpxor ymm8,ymm8,YMMWORD[32+rsi] vmovdqu YMMWORD[rdi],ymm6 vmovdqu YMMWORD[32+rdi],ymm8 je NEAR $L$done8x lea rsi,[64+rsi] xor r10,r10 vmovdqa YMMWORD[rsp],ymm1 lea rdi,[64+rdi] sub rdx,64 vmovdqa YMMWORD[32+rsp],ymm5 jmp NEAR $L$oop_tail8x ALIGN 32 $L$128_or_more8x: vpxor ymm6,ymm6,YMMWORD[rsi] vpxor ymm8,ymm8,YMMWORD[32+rsi] vpxor ymm1,ymm1,YMMWORD[64+rsi] vpxor ymm5,ymm5,YMMWORD[96+rsi] vmovdqu YMMWORD[rdi],ymm6 vmovdqu YMMWORD[32+rdi],ymm8 vmovdqu YMMWORD[64+rdi],ymm1 vmovdqu YMMWORD[96+rdi],ymm5 je NEAR $L$done8x lea rsi,[128+rsi] xor r10,r10 vmovdqa YMMWORD[rsp],ymm12 lea rdi,[128+rdi] sub rdx,128 vmovdqa YMMWORD[32+rsp],ymm13 jmp NEAR $L$oop_tail8x ALIGN 32 $L$192_or_more8x: vpxor ymm6,ymm6,YMMWORD[rsi] 
vpxor ymm8,ymm8,YMMWORD[32+rsi] vpxor ymm1,ymm1,YMMWORD[64+rsi] vpxor ymm5,ymm5,YMMWORD[96+rsi] vpxor ymm12,ymm12,YMMWORD[128+rsi] vpxor ymm13,ymm13,YMMWORD[160+rsi] vmovdqu YMMWORD[rdi],ymm6 vmovdqu YMMWORD[32+rdi],ymm8 vmovdqu YMMWORD[64+rdi],ymm1 vmovdqu YMMWORD[96+rdi],ymm5 vmovdqu YMMWORD[128+rdi],ymm12 vmovdqu YMMWORD[160+rdi],ymm13 je NEAR $L$done8x lea rsi,[192+rsi] xor r10,r10 vmovdqa YMMWORD[rsp],ymm10 lea rdi,[192+rdi] sub rdx,192 vmovdqa YMMWORD[32+rsp],ymm15 jmp NEAR $L$oop_tail8x ALIGN 32 $L$256_or_more8x: vpxor ymm6,ymm6,YMMWORD[rsi] vpxor ymm8,ymm8,YMMWORD[32+rsi] vpxor ymm1,ymm1,YMMWORD[64+rsi] vpxor ymm5,ymm5,YMMWORD[96+rsi] vpxor ymm12,ymm12,YMMWORD[128+rsi] vpxor ymm13,ymm13,YMMWORD[160+rsi] vpxor ymm10,ymm10,YMMWORD[192+rsi] vpxor ymm15,ymm15,YMMWORD[224+rsi] vmovdqu YMMWORD[rdi],ymm6 vmovdqu YMMWORD[32+rdi],ymm8 vmovdqu YMMWORD[64+rdi],ymm1 vmovdqu YMMWORD[96+rdi],ymm5 vmovdqu YMMWORD[128+rdi],ymm12 vmovdqu YMMWORD[160+rdi],ymm13 vmovdqu YMMWORD[192+rdi],ymm10 vmovdqu YMMWORD[224+rdi],ymm15 je NEAR $L$done8x lea rsi,[256+rsi] xor r10,r10 vmovdqa YMMWORD[rsp],ymm14 lea rdi,[256+rdi] sub rdx,256 vmovdqa YMMWORD[32+rsp],ymm2 jmp NEAR $L$oop_tail8x ALIGN 32 $L$320_or_more8x: vpxor ymm6,ymm6,YMMWORD[rsi] vpxor ymm8,ymm8,YMMWORD[32+rsi] vpxor ymm1,ymm1,YMMWORD[64+rsi] vpxor ymm5,ymm5,YMMWORD[96+rsi] vpxor ymm12,ymm12,YMMWORD[128+rsi] vpxor ymm13,ymm13,YMMWORD[160+rsi] vpxor ymm10,ymm10,YMMWORD[192+rsi] vpxor ymm15,ymm15,YMMWORD[224+rsi] vpxor ymm14,ymm14,YMMWORD[256+rsi] vpxor ymm2,ymm2,YMMWORD[288+rsi] vmovdqu YMMWORD[rdi],ymm6 vmovdqu YMMWORD[32+rdi],ymm8 vmovdqu YMMWORD[64+rdi],ymm1 vmovdqu YMMWORD[96+rdi],ymm5 vmovdqu YMMWORD[128+rdi],ymm12 vmovdqu YMMWORD[160+rdi],ymm13 vmovdqu YMMWORD[192+rdi],ymm10 vmovdqu YMMWORD[224+rdi],ymm15 vmovdqu YMMWORD[256+rdi],ymm14 vmovdqu YMMWORD[288+rdi],ymm2 je NEAR $L$done8x lea rsi,[320+rsi] xor r10,r10 vmovdqa YMMWORD[rsp],ymm3 lea rdi,[320+rdi] sub rdx,320 vmovdqa YMMWORD[32+rsp],ymm7 jmp NEAR $L$oop_tail8x ALIGN 32 $L$384_or_more8x: vpxor ymm6,ymm6,YMMWORD[rsi] vpxor ymm8,ymm8,YMMWORD[32+rsi] vpxor ymm1,ymm1,YMMWORD[64+rsi] vpxor ymm5,ymm5,YMMWORD[96+rsi] vpxor ymm12,ymm12,YMMWORD[128+rsi] vpxor ymm13,ymm13,YMMWORD[160+rsi] vpxor ymm10,ymm10,YMMWORD[192+rsi] vpxor ymm15,ymm15,YMMWORD[224+rsi] vpxor ymm14,ymm14,YMMWORD[256+rsi] vpxor ymm2,ymm2,YMMWORD[288+rsi] vpxor ymm3,ymm3,YMMWORD[320+rsi] vpxor ymm7,ymm7,YMMWORD[352+rsi] vmovdqu YMMWORD[rdi],ymm6 vmovdqu YMMWORD[32+rdi],ymm8 vmovdqu YMMWORD[64+rdi],ymm1 vmovdqu YMMWORD[96+rdi],ymm5 vmovdqu YMMWORD[128+rdi],ymm12 vmovdqu YMMWORD[160+rdi],ymm13 vmovdqu YMMWORD[192+rdi],ymm10 vmovdqu YMMWORD[224+rdi],ymm15 vmovdqu YMMWORD[256+rdi],ymm14 vmovdqu YMMWORD[288+rdi],ymm2 vmovdqu YMMWORD[320+rdi],ymm3 vmovdqu YMMWORD[352+rdi],ymm7 je NEAR $L$done8x lea rsi,[384+rsi] xor r10,r10 vmovdqa YMMWORD[rsp],ymm11 lea rdi,[384+rdi] sub rdx,384 vmovdqa YMMWORD[32+rsp],ymm9 jmp NEAR $L$oop_tail8x ALIGN 32 $L$448_or_more8x: vpxor ymm6,ymm6,YMMWORD[rsi] vpxor ymm8,ymm8,YMMWORD[32+rsi] vpxor ymm1,ymm1,YMMWORD[64+rsi] vpxor ymm5,ymm5,YMMWORD[96+rsi] vpxor ymm12,ymm12,YMMWORD[128+rsi] vpxor ymm13,ymm13,YMMWORD[160+rsi] vpxor ymm10,ymm10,YMMWORD[192+rsi] vpxor ymm15,ymm15,YMMWORD[224+rsi] vpxor ymm14,ymm14,YMMWORD[256+rsi] vpxor ymm2,ymm2,YMMWORD[288+rsi] vpxor ymm3,ymm3,YMMWORD[320+rsi] vpxor ymm7,ymm7,YMMWORD[352+rsi] vpxor ymm11,ymm11,YMMWORD[384+rsi] vpxor ymm9,ymm9,YMMWORD[416+rsi] vmovdqu YMMWORD[rdi],ymm6 vmovdqu YMMWORD[32+rdi],ymm8 vmovdqu YMMWORD[64+rdi],ymm1 vmovdqu YMMWORD[96+rdi],ymm5 vmovdqu 
YMMWORD[128+rdi],ymm12 vmovdqu YMMWORD[160+rdi],ymm13 vmovdqu YMMWORD[192+rdi],ymm10 vmovdqu YMMWORD[224+rdi],ymm15 vmovdqu YMMWORD[256+rdi],ymm14 vmovdqu YMMWORD[288+rdi],ymm2 vmovdqu YMMWORD[320+rdi],ymm3 vmovdqu YMMWORD[352+rdi],ymm7 vmovdqu YMMWORD[384+rdi],ymm11 vmovdqu YMMWORD[416+rdi],ymm9 je NEAR $L$done8x lea rsi,[448+rsi] xor r10,r10 vmovdqa YMMWORD[rsp],ymm0 lea rdi,[448+rdi] sub rdx,448 vmovdqa YMMWORD[32+rsp],ymm4 $L$oop_tail8x: movzx eax,BYTE[r10*1+rsi] movzx ecx,BYTE[r10*1+rsp] lea r10,[1+r10] xor eax,ecx mov BYTE[((-1))+r10*1+rdi],al dec rdx jnz NEAR $L$oop_tail8x $L$done8x: vzeroall movaps xmm6,XMMWORD[((-168))+r9] movaps xmm7,XMMWORD[((-152))+r9] movaps xmm8,XMMWORD[((-136))+r9] movaps xmm9,XMMWORD[((-120))+r9] movaps xmm10,XMMWORD[((-104))+r9] movaps xmm11,XMMWORD[((-88))+r9] movaps xmm12,XMMWORD[((-72))+r9] movaps xmm13,XMMWORD[((-56))+r9] movaps xmm14,XMMWORD[((-40))+r9] movaps xmm15,XMMWORD[((-24))+r9] lea rsp,[r9] $L$8x_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$SEH_end_ChaCha20_ctr32_avx2: EXTERN __imp_RtlVirtualUnwind ALIGN 16 se_handler: push rsi push rdi push rbx push rbp push r12 push r13 push r14 push r15 pushfq sub rsp,64 mov rax,QWORD[120+r8] mov rbx,QWORD[248+r8] mov rsi,QWORD[8+r9] mov r11,QWORD[56+r9] lea r10,[$L$ctr32_body] cmp rbx,r10 jb NEAR $L$common_seh_tail mov rax,QWORD[152+r8] lea r10,[$L$no_data] cmp rbx,r10 jae NEAR $L$common_seh_tail lea rax,[((64+24+48))+rax] mov rbx,QWORD[((-8))+rax] mov rbp,QWORD[((-16))+rax] mov r12,QWORD[((-24))+rax] mov r13,QWORD[((-32))+rax] mov r14,QWORD[((-40))+rax] mov r15,QWORD[((-48))+rax] mov QWORD[144+r8],rbx mov QWORD[160+r8],rbp mov QWORD[216+r8],r12 mov QWORD[224+r8],r13 mov QWORD[232+r8],r14 mov QWORD[240+r8],r15 $L$common_seh_tail: mov rdi,QWORD[8+rax] mov rsi,QWORD[16+rax] mov QWORD[152+r8],rax mov QWORD[168+r8],rsi mov QWORD[176+r8],rdi mov rdi,QWORD[40+r9] mov rsi,r8 mov ecx,154 DD 0xa548f3fc mov rsi,r9 xor rcx,rcx mov rdx,QWORD[8+rsi] mov r8,QWORD[rsi] mov r9,QWORD[16+rsi] mov r10,QWORD[40+rsi] lea r11,[56+rsi] lea r12,[24+rsi] mov QWORD[32+rsp],r10 mov QWORD[40+rsp],r11 mov QWORD[48+rsp],r12 mov QWORD[56+rsp],rcx call QWORD[__imp_RtlVirtualUnwind] mov eax,1 add rsp,64 popfq pop r15 pop r14 pop r13 pop r12 pop rbp pop rbx pop rdi pop rsi ret ALIGN 16 ssse3_handler: push rsi push rdi push rbx push rbp push r12 push r13 push r14 push r15 pushfq sub rsp,64 mov rax,QWORD[120+r8] mov rbx,QWORD[248+r8] mov rsi,QWORD[8+r9] mov r11,QWORD[56+r9] mov r10d,DWORD[r11] lea r10,[r10*1+rsi] cmp rbx,r10 jb NEAR $L$common_seh_tail mov rax,QWORD[192+r8] mov r10d,DWORD[4+r11] lea r10,[r10*1+rsi] cmp rbx,r10 jae NEAR $L$common_seh_tail lea rsi,[((-40))+rax] lea rdi,[512+r8] mov ecx,4 DD 0xa548f3fc jmp NEAR $L$common_seh_tail ALIGN 16 full_handler: push rsi push rdi push rbx push rbp push r12 push r13 push r14 push r15 pushfq sub rsp,64 mov rax,QWORD[120+r8] mov rbx,QWORD[248+r8] mov rsi,QWORD[8+r9] mov r11,QWORD[56+r9] mov r10d,DWORD[r11] lea r10,[r10*1+rsi] cmp rbx,r10 jb NEAR $L$common_seh_tail mov rax,QWORD[192+r8] mov r10d,DWORD[4+r11] lea r10,[r10*1+rsi] cmp rbx,r10 jae NEAR $L$common_seh_tail lea rsi,[((-168))+rax] lea rdi,[512+r8] mov ecx,20 DD 0xa548f3fc jmp NEAR $L$common_seh_tail section .pdata rdata align=4 ALIGN 4 DD $L$SEH_begin_ChaCha20_ctr32_nohw wrt ..imagebase DD $L$SEH_end_ChaCha20_ctr32_nohw wrt ..imagebase DD $L$SEH_info_ChaCha20_ctr32_nohw wrt ..imagebase DD $L$SEH_begin_ChaCha20_ctr32_ssse3_4x wrt ..imagebase DD $L$SEH_end_ChaCha20_ctr32_ssse3_4x wrt ..imagebase DD 
$L$SEH_info_ChaCha20_ctr32_ssse3_4x wrt ..imagebase DD $L$SEH_begin_ChaCha20_ctr32_avx2 wrt ..imagebase DD $L$SEH_end_ChaCha20_ctr32_avx2 wrt ..imagebase DD $L$SEH_info_ChaCha20_ctr32_avx2 wrt ..imagebase section .xdata rdata align=8 ALIGN 8 $L$SEH_info_ChaCha20_ctr32_nohw: DB 9,0,0,0 DD se_handler wrt ..imagebase $L$SEH_info_ChaCha20_ctr32_ssse3_4x: DB 9,0,0,0 DD full_handler wrt ..imagebase DD $L$4x_body wrt ..imagebase,$L$4x_epilogue wrt ..imagebase $L$SEH_info_ChaCha20_ctr32_avx2: DB 9,0,0,0 DD full_handler wrt ..imagebase DD $L$8x_body wrt ..imagebase,$L$8x_epilogue wrt ..imagebase %else ; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 ret %endif
ring-0.17.14/pregenerated/chacha-x86_64-nasm.o [binary COFF object file: machine code, debug sections, and symbol table omitted]
ring-0.17.14/pregenerated/chacha20_poly1305_armv8-ios64.S000064400000000000000000002203641046102023000205410ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) .section __TEXT,__const .align 7 Lchacha20_consts: .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' Linc: .long 1,2,3,4 Lrol8: .byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 Lclamp: .quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC .text .align 6 Lpoly_hash_ad_internal: .cfi_startproc cbnz x4, Lpoly_hash_intro ret Lpoly_hash_intro: cmp x4, #16 b.lt Lpoly_hash_ad_tail ldp x11, x12, [x3], 16 adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most sub x4, x4, #16 b Lpoly_hash_ad_internal Lpoly_hash_ad_tail: cbz x4, Lpoly_hash_ad_ret eor v20.16b, v20.16b, v20.16b // Use T0 to load the AAD sub x4, x4, #1 Lpoly_hash_tail_16_compose: ext v20.16b, v20.16b, v20.16b, #15 ldrb w11, [x3, x4] mov v20.b[0], w11 subs x4, x4, #1 b.ge Lpoly_hash_tail_16_compose mov x11, v20.d[0] mov x12, v20.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
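// The two bracketed comments above describe the schoolbook product of the
// 130-bit accumulator [acc2:acc1:acc0] with the clamped key r = [r1:r0]; the
// instructions that follow finish that product and fold everything above bit
// 130 back in, using 2^130 = 5 (mod 2^130 - 5). A compact Rust-style sketch of
// the same update (illustrative names, not ring's API; the clamping of r is
// what keeps the u128 intermediate sums from overflowing):
//
// fn poly1305_mul_reduce(acc: [u64; 3], r0: u64, r1: u64) -> [u64; 3] {
//     let w = |a: u64, b: u64| (a as u128) * (b as u128);
//     // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
//     let m0 = w(acc[0], r0);
//     let m1 = w(acc[1], r0) + w(acc[0], r1) + (m0 >> 64);
//     let m2 = w(acc[2], r0) + w(acc[1], r1) + (m1 >> 64);
//     let m3 = w(acc[2], r1) + (m2 >> 64);
//     let (t0, t1, t2, t3) = (m0 as u64, m1 as u64, m2 as u64, m3 as u64);
//     // Keep the low 130 bits and add 5 * (t >> 130).
//     let c = ((t3 as u128) << 62) | ((t2 as u128) >> 2);
//     let lo = (t0 as u128) + 5 * c;
//     let mid = (t1 as u128) + (lo >> 64);
//     let hi = ((t2 & 3) as u128) + (mid >> 64);
//     [lo as u64, mid as u64, hi as u64] // the new acc2 is again at most 4
// }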
umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most Lpoly_hash_ad_ret: ret .cfi_endproc ///////////////////////////////// // // void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data); // .globl _chacha20_poly1305_seal .private_extern _chacha20_poly1305_seal .align 6 _chacha20_poly1305_seal: AARCH64_SIGN_LINK_REGISTER .cfi_startproc stp x29, x30, [sp, #-80]! .cfi_def_cfa_offset 80 .cfi_offset w30, -72 .cfi_offset w29, -80 mov x29, sp // We probably could do .cfi_def_cfa w29, 80 at this point, but since // we don't actually use the frame pointer like that, it's probably not // worth bothering. stp d8, d9, [sp, #16] stp d10, d11, [sp, #32] stp d12, d13, [sp, #48] stp d14, d15, [sp, #64] .cfi_offset b15, -8 .cfi_offset b14, -16 .cfi_offset b13, -24 .cfi_offset b12, -32 .cfi_offset b11, -40 .cfi_offset b10, -48 .cfi_offset b9, -56 .cfi_offset b8, -64 adrp x11, Lchacha20_consts@PAGE add x11, x11, Lchacha20_consts@PAGEOFF ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values ld1 {v28.16b - v30.16b}, [x5] mov x15, #1 // Prepare the Poly1305 state mov x8, #0 mov x9, #0 mov x10, #0 ldr x12, [x5, #56] // The total cipher text length includes extra_in_len add x12, x12, x2 mov v31.d[0], x4 // Store the input and aad lengths mov v31.d[1], x12 cmp x2, #128 b.le Lseal_128 // Optimization for smaller buffers // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext, // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically, // the fifth block (A4-D4) horizontally. 
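// As a rough sketch of the layout the comment above describes (Rust-style,
// illustrative; chacha20_block is assumed to be a plain RFC 8439 block function
// closed over the key and nonce, not a symbol defined in this file): the four
// "vertical" blocks get counters ctr+1..ctr+4, one per 32-bit lane (that is the
// add of Linc = {1,2,3,4} into v15 above), while the "horizontal" block keeps
// ctr unchanged and only supplies the one-time Poly1305 key, clamped later with
// the Lclamp mask held in v27.
//
// fn seal_prepare(ctr: u32, chacha20_block: impl Fn(u32) -> [u8; 64])
//     -> ([u8; 64], [[u8; 64]; 4]) {
//     let key_block = chacha20_block(ctr); // first 16 bytes -> r, next 16 -> s
//     let data_blocks = [
//         chacha20_block(ctr + 1),
//         chacha20_block(ctr + 2),
//         chacha20_block(ctr + 3),
//         chacha20_block(ctr + 4),
//     ];
//     (key_block, data_blocks)
// }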
ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] mov v4.16b, v24.16b ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 mov v9.16b, v28.16b ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 mov v14.16b, v29.16b ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] add v15.4s, v15.4s, v25.4s mov v19.16b, v30.16b sub x5, x5, #32 mov x6, #10 .align 5 Lseal_init_rounds: add v0.4s, v0.4s, v5.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s add v3.4s, v3.4s, v8.4s add v4.4s, v4.4s, v9.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b eor v18.16b, v18.16b, v3.16b eor v19.16b, v19.16b, v4.16b rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h rev32 v18.8h, v18.8h rev32 v19.8h, v19.8h add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s add v13.4s, v13.4s, v18.4s add v14.4s, v14.4s, v19.4s eor v5.16b, v5.16b, v10.16b eor v6.16b, v6.16b, v11.16b eor v7.16b, v7.16b, v12.16b eor v8.16b, v8.16b, v13.16b eor v9.16b, v9.16b, v14.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 ushr v5.4s, v6.4s, #20 sli v5.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 ushr v7.4s, v8.4s, #20 sli v7.4s, v8.4s, #12 ushr v8.4s, v9.4s, #20 sli v8.4s, v9.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, v5.4s add v2.4s, v2.4s, v6.4s add v3.4s, v3.4s, v7.4s add v4.4s, v4.4s, v8.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b eor v18.16b, v18.16b, v3.16b eor v19.16b, v19.16b, v4.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b tbl v18.16b, {v18.16b}, v26.16b tbl v19.16b, {v19.16b}, v26.16b add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s add v13.4s, v13.4s, v18.4s add v14.4s, v14.4s, v19.4s eor v20.16b, v20.16b, v10.16b eor v5.16b, v5.16b, v11.16b eor v6.16b, v6.16b, v12.16b eor v7.16b, v7.16b, v13.16b eor v8.16b, v8.16b, v14.16b ushr v9.4s, v8.4s, #25 sli v9.4s, v8.4s, #7 ushr v8.4s, v7.4s, #25 sli v8.4s, v7.4s, #7 ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v5.4s, #25 sli v6.4s, v5.4s, #7 ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v9.16b, v9.16b, v9.16b, #4 ext v14.16b, v14.16b, v14.16b, #8 ext v19.16b, v19.16b, v19.16b, #12 add v0.4s, v0.4s, v6.4s add v1.4s, v1.4s, v7.4s add v2.4s, v2.4s, v8.4s add v3.4s, v3.4s, v5.4s add v4.4s, v4.4s, v9.4s eor v18.16b, v18.16b, v0.16b eor v15.16b, v15.16b, v1.16b eor v16.16b, v16.16b, v2.16b eor v17.16b, v17.16b, v3.16b eor v19.16b, v19.16b, v4.16b rev32 v18.8h, v18.8h rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h rev32 v19.8h, v19.8h add v12.4s, v12.4s, v18.4s add v13.4s, v13.4s, v15.4s add v10.4s, v10.4s, v16.4s add v11.4s, v11.4s, v17.4s add v14.4s, v14.4s, v19.4s eor v6.16b, v6.16b, v12.16b eor v7.16b, v7.16b, v13.16b eor v8.16b, v8.16b, v10.16b eor v5.16b, v5.16b, v11.16b eor v9.16b, v9.16b, v14.16b ushr v20.4s, v6.4s, #20 sli v20.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 ushr v7.4s, v8.4s, #20 sli v7.4s, v8.4s, #12 ushr v8.4s, v5.4s, #20 sli v8.4s, v5.4s, #12 ushr v5.4s, v9.4s, #20 sli v5.4s, v9.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s add v3.4s, v3.4s, v8.4s add v4.4s, v4.4s, v5.4s eor v18.16b, v18.16b, v0.16b eor v15.16b, v15.16b, v1.16b eor v16.16b, v16.16b, v2.16b eor v17.16b, v17.16b, v3.16b eor v19.16b, v19.16b, v4.16b tbl v18.16b, {v18.16b}, v26.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b tbl v19.16b, {v19.16b}, v26.16b add v12.4s, v12.4s, v18.4s 
add v13.4s, v13.4s, v15.4s add v10.4s, v10.4s, v16.4s add v11.4s, v11.4s, v17.4s add v14.4s, v14.4s, v19.4s eor v20.16b, v20.16b, v12.16b eor v6.16b, v6.16b, v13.16b eor v7.16b, v7.16b, v10.16b eor v8.16b, v8.16b, v11.16b eor v5.16b, v5.16b, v14.16b ushr v9.4s, v5.4s, #25 sli v9.4s, v5.4s, #7 ushr v5.4s, v8.4s, #25 sli v5.4s, v8.4s, #7 ushr v8.4s, v7.4s, #25 sli v8.4s, v7.4s, #7 ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v20.4s, #25 sli v6.4s, v20.4s, #7 ext v9.16b, v9.16b, v9.16b, #12 ext v14.16b, v14.16b, v14.16b, #8 ext v19.16b, v19.16b, v19.16b, #4 subs x6, x6, #1 b.hi Lseal_init_rounds add v15.4s, v15.4s, v25.4s mov x11, #4 dup v20.4s, w11 add v25.4s, v25.4s, v20.4s zip1 v20.4s, v0.4s, v1.4s zip2 v21.4s, v0.4s, v1.4s zip1 v22.4s, v2.4s, v3.4s zip2 v23.4s, v2.4s, v3.4s zip1 v0.2d, v20.2d, v22.2d zip2 v1.2d, v20.2d, v22.2d zip1 v2.2d, v21.2d, v23.2d zip2 v3.2d, v21.2d, v23.2d zip1 v20.4s, v5.4s, v6.4s zip2 v21.4s, v5.4s, v6.4s zip1 v22.4s, v7.4s, v8.4s zip2 v23.4s, v7.4s, v8.4s zip1 v5.2d, v20.2d, v22.2d zip2 v6.2d, v20.2d, v22.2d zip1 v7.2d, v21.2d, v23.2d zip2 v8.2d, v21.2d, v23.2d zip1 v20.4s, v10.4s, v11.4s zip2 v21.4s, v10.4s, v11.4s zip1 v22.4s, v12.4s, v13.4s zip2 v23.4s, v12.4s, v13.4s zip1 v10.2d, v20.2d, v22.2d zip2 v11.2d, v20.2d, v22.2d zip1 v12.2d, v21.2d, v23.2d zip2 v13.2d, v21.2d, v23.2d zip1 v20.4s, v15.4s, v16.4s zip2 v21.4s, v15.4s, v16.4s zip1 v22.4s, v17.4s, v18.4s zip2 v23.4s, v17.4s, v18.4s zip1 v15.2d, v20.2d, v22.2d zip2 v16.2d, v20.2d, v22.2d zip1 v17.2d, v21.2d, v23.2d zip2 v18.2d, v21.2d, v23.2d add v4.4s, v4.4s, v24.4s add v9.4s, v9.4s, v28.4s and v4.16b, v4.16b, v27.16b add v0.4s, v0.4s, v24.4s add v5.4s, v5.4s, v28.4s add v10.4s, v10.4s, v29.4s add v15.4s, v15.4s, v30.4s add v1.4s, v1.4s, v24.4s add v6.4s, v6.4s, v28.4s add v11.4s, v11.4s, v29.4s add v16.4s, v16.4s, v30.4s add v2.4s, v2.4s, v24.4s add v7.4s, v7.4s, v28.4s add v12.4s, v12.4s, v29.4s add v17.4s, v17.4s, v30.4s add v3.4s, v3.4s, v24.4s add v8.4s, v8.4s, v28.4s add v13.4s, v13.4s, v29.4s add v18.4s, v18.4s, v30.4s mov x16, v4.d[0] // Move the R key to GPRs mov x17, v4.d[1] mov v27.16b, v9.16b // Store the S key bl Lpoly_hash_ad_internal mov x3, x0 cmp x2, #256 b.le Lseal_tail ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v0.16b eor v21.16b, v21.16b, v5.16b eor v22.16b, v22.16b, v10.16b eor v23.16b, v23.16b, v15.16b st1 {v20.16b - v23.16b}, [x0], #64 ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v1.16b eor v21.16b, v21.16b, v6.16b eor v22.16b, v22.16b, v11.16b eor v23.16b, v23.16b, v16.16b st1 {v20.16b - v23.16b}, [x0], #64 ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v2.16b eor v21.16b, v21.16b, v7.16b eor v22.16b, v22.16b, v12.16b eor v23.16b, v23.16b, v17.16b st1 {v20.16b - v23.16b}, [x0], #64 ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v3.16b eor v21.16b, v21.16b, v8.16b eor v22.16b, v22.16b, v13.16b eor v23.16b, v23.16b, v18.16b st1 {v20.16b - v23.16b}, [x0], #64 sub x2, x2, #256 mov x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds mov x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256 Lseal_main_loop: adrp x11, Lchacha20_consts@PAGE add x11, x11, Lchacha20_consts@PAGEOFF ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] mov v4.16b, v24.16b ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 mov v9.16b, v28.16b ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 mov v14.16b, v29.16b ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] add v15.4s, v15.4s, v25.4s mov 
v19.16b, v30.16b eor v20.16b, v20.16b, v20.16b //zero not v21.16b, v20.16b // -1 sub v21.4s, v25.4s, v21.4s // Add +1 ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) add v19.4s, v19.4s, v20.4s sub x5, x5, #32 .align 5 Lseal_main_loop_rounds: add v0.4s, v0.4s, v5.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s add v3.4s, v3.4s, v8.4s add v4.4s, v4.4s, v9.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b eor v18.16b, v18.16b, v3.16b eor v19.16b, v19.16b, v4.16b rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h rev32 v18.8h, v18.8h rev32 v19.8h, v19.8h add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s add v13.4s, v13.4s, v18.4s add v14.4s, v14.4s, v19.4s eor v5.16b, v5.16b, v10.16b eor v6.16b, v6.16b, v11.16b eor v7.16b, v7.16b, v12.16b eor v8.16b, v8.16b, v13.16b eor v9.16b, v9.16b, v14.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 ushr v5.4s, v6.4s, #20 sli v5.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 ushr v7.4s, v8.4s, #20 sli v7.4s, v8.4s, #12 ushr v8.4s, v9.4s, #20 sli v8.4s, v9.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, v5.4s add v2.4s, v2.4s, v6.4s add v3.4s, v3.4s, v7.4s add v4.4s, v4.4s, v8.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b eor v18.16b, v18.16b, v3.16b eor v19.16b, v19.16b, v4.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b tbl v18.16b, {v18.16b}, v26.16b tbl v19.16b, {v19.16b}, v26.16b add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s add v13.4s, v13.4s, v18.4s add v14.4s, v14.4s, v19.4s eor v20.16b, v20.16b, v10.16b eor v5.16b, v5.16b, v11.16b eor v6.16b, v6.16b, v12.16b eor v7.16b, v7.16b, v13.16b eor v8.16b, v8.16b, v14.16b ushr v9.4s, v8.4s, #25 sli v9.4s, v8.4s, #7 ushr v8.4s, v7.4s, #25 sli v8.4s, v7.4s, #7 ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v5.4s, #25 sli v6.4s, v5.4s, #7 ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v9.16b, v9.16b, v9.16b, #4 ext v14.16b, v14.16b, v14.16b, #8 ext v19.16b, v19.16b, v19.16b, #12 ldp x11, x12, [x3], 16 adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most add v0.4s, v0.4s, v6.4s add v1.4s, v1.4s, v7.4s add v2.4s, v2.4s, v8.4s add v3.4s, v3.4s, v5.4s add v4.4s, v4.4s, v9.4s eor v18.16b, v18.16b, v0.16b eor v15.16b, v15.16b, v1.16b eor v16.16b, v16.16b, v2.16b eor v17.16b, v17.16b, v3.16b eor v19.16b, v19.16b, v4.16b rev32 v18.8h, v18.8h rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h rev32 v19.8h, v19.8h add v12.4s, v12.4s, v18.4s add v13.4s, v13.4s, v15.4s add v10.4s, v10.4s, v16.4s add v11.4s, v11.4s, v17.4s add v14.4s, v14.4s, v19.4s eor v6.16b, v6.16b, v12.16b eor v7.16b, v7.16b, v13.16b eor v8.16b, v8.16b, v10.16b eor v5.16b, v5.16b, 
v11.16b eor v9.16b, v9.16b, v14.16b ushr v20.4s, v6.4s, #20 sli v20.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 ushr v7.4s, v8.4s, #20 sli v7.4s, v8.4s, #12 ushr v8.4s, v5.4s, #20 sli v8.4s, v5.4s, #12 ushr v5.4s, v9.4s, #20 sli v5.4s, v9.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s add v3.4s, v3.4s, v8.4s add v4.4s, v4.4s, v5.4s eor v18.16b, v18.16b, v0.16b eor v15.16b, v15.16b, v1.16b eor v16.16b, v16.16b, v2.16b eor v17.16b, v17.16b, v3.16b eor v19.16b, v19.16b, v4.16b tbl v18.16b, {v18.16b}, v26.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b tbl v19.16b, {v19.16b}, v26.16b add v12.4s, v12.4s, v18.4s add v13.4s, v13.4s, v15.4s add v10.4s, v10.4s, v16.4s add v11.4s, v11.4s, v17.4s add v14.4s, v14.4s, v19.4s eor v20.16b, v20.16b, v12.16b eor v6.16b, v6.16b, v13.16b eor v7.16b, v7.16b, v10.16b eor v8.16b, v8.16b, v11.16b eor v5.16b, v5.16b, v14.16b ushr v9.4s, v5.4s, #25 sli v9.4s, v5.4s, #7 ushr v5.4s, v8.4s, #25 sli v5.4s, v8.4s, #7 ushr v8.4s, v7.4s, #25 sli v8.4s, v7.4s, #7 ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v20.4s, #25 sli v6.4s, v20.4s, #7 ext v9.16b, v9.16b, v9.16b, #12 ext v14.16b, v14.16b, v14.16b, #8 ext v19.16b, v19.16b, v19.16b, #4 subs x6, x6, #1 b.ge Lseal_main_loop_rounds ldp x11, x12, [x3], 16 adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most subs x7, x7, #1 b.gt Lseal_main_loop_rounds eor v20.16b, v20.16b, v20.16b //zero not v21.16b, v20.16b // -1 sub v21.4s, v25.4s, v21.4s // Add +1 ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) add v19.4s, v19.4s, v20.4s add v15.4s, v15.4s, v25.4s mov x11, #5 dup v20.4s, w11 add v25.4s, v25.4s, v20.4s zip1 v20.4s, v0.4s, v1.4s zip2 v21.4s, v0.4s, v1.4s zip1 v22.4s, v2.4s, v3.4s zip2 v23.4s, v2.4s, v3.4s zip1 v0.2d, v20.2d, v22.2d zip2 v1.2d, v20.2d, v22.2d zip1 v2.2d, v21.2d, v23.2d zip2 v3.2d, v21.2d, v23.2d zip1 v20.4s, v5.4s, v6.4s zip2 v21.4s, v5.4s, v6.4s zip1 v22.4s, v7.4s, v8.4s zip2 v23.4s, v7.4s, v8.4s zip1 v5.2d, v20.2d, v22.2d zip2 v6.2d, v20.2d, v22.2d zip1 v7.2d, v21.2d, v23.2d zip2 v8.2d, v21.2d, v23.2d zip1 v20.4s, v10.4s, v11.4s zip2 v21.4s, v10.4s, v11.4s zip1 v22.4s, v12.4s, v13.4s zip2 v23.4s, v12.4s, v13.4s zip1 v10.2d, v20.2d, v22.2d zip2 v11.2d, v20.2d, v22.2d zip1 v12.2d, v21.2d, v23.2d zip2 v13.2d, v21.2d, v23.2d zip1 v20.4s, v15.4s, v16.4s zip2 v21.4s, v15.4s, v16.4s zip1 v22.4s, v17.4s, v18.4s zip2 v23.4s, v17.4s, v18.4s zip1 v15.2d, v20.2d, v22.2d zip2 v16.2d, v20.2d, v22.2d zip1 v17.2d, v21.2d, v23.2d zip2 v18.2d, v21.2d, v23.2d add v0.4s, v0.4s, v24.4s add v5.4s, v5.4s, v28.4s add v10.4s, v10.4s, v29.4s add v15.4s, v15.4s, v30.4s add v1.4s, v1.4s, v24.4s add v6.4s, v6.4s, v28.4s add v11.4s, v11.4s, v29.4s add v16.4s, v16.4s, v30.4s add v2.4s, v2.4s, v24.4s add v7.4s, 
v7.4s, v28.4s add v12.4s, v12.4s, v29.4s add v17.4s, v17.4s, v30.4s add v3.4s, v3.4s, v24.4s add v8.4s, v8.4s, v28.4s add v13.4s, v13.4s, v29.4s add v18.4s, v18.4s, v30.4s add v4.4s, v4.4s, v24.4s add v9.4s, v9.4s, v28.4s add v14.4s, v14.4s, v29.4s add v19.4s, v19.4s, v30.4s cmp x2, #320 b.le Lseal_tail ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v0.16b eor v21.16b, v21.16b, v5.16b eor v22.16b, v22.16b, v10.16b eor v23.16b, v23.16b, v15.16b st1 {v20.16b - v23.16b}, [x0], #64 ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v1.16b eor v21.16b, v21.16b, v6.16b eor v22.16b, v22.16b, v11.16b eor v23.16b, v23.16b, v16.16b st1 {v20.16b - v23.16b}, [x0], #64 ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v2.16b eor v21.16b, v21.16b, v7.16b eor v22.16b, v22.16b, v12.16b eor v23.16b, v23.16b, v17.16b st1 {v20.16b - v23.16b}, [x0], #64 ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v3.16b eor v21.16b, v21.16b, v8.16b eor v22.16b, v22.16b, v13.16b eor v23.16b, v23.16b, v18.16b st1 {v20.16b - v23.16b}, [x0], #64 ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v4.16b eor v21.16b, v21.16b, v9.16b eor v22.16b, v22.16b, v14.16b eor v23.16b, v23.16b, v19.16b st1 {v20.16b - v23.16b}, [x0], #64 sub x2, x2, #320 mov x6, #0 mov x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration b Lseal_main_loop Lseal_tail: // This part of the function handles the storage and authentication of the last [0,320) bytes // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data. cmp x2, #64 b.lt Lseal_tail_64 // Store and authenticate 64B blocks per iteration ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v0.16b eor v21.16b, v21.16b, v5.16b eor v22.16b, v22.16b, v10.16b eor v23.16b, v23.16b, v15.16b mov x11, v20.d[0] mov x12, v20.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most mov x11, v21.d[0] mov x12, v21.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most mov x11, v22.d[0] mov x12, v22.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, 
x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most mov x11, v23.d[0] mov x12, v23.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most st1 {v20.16b - v23.16b}, [x0], #64 sub x2, x2, #64 // Shift the state left by 64 bytes for the next iteration of the loop mov v0.16b, v1.16b mov v5.16b, v6.16b mov v10.16b, v11.16b mov v15.16b, v16.16b mov v1.16b, v2.16b mov v6.16b, v7.16b mov v11.16b, v12.16b mov v16.16b, v17.16b mov v2.16b, v3.16b mov v7.16b, v8.16b mov v12.16b, v13.16b mov v17.16b, v18.16b mov v3.16b, v4.16b mov v8.16b, v9.16b mov v13.16b, v14.16b mov v18.16b, v19.16b b Lseal_tail Lseal_tail_64: ldp x3, x4, [x5, #48] // extra_in_len and extra_in_ptr // Here we handle the last [0,64) bytes of plaintext cmp x2, #16 b.lt Lseal_tail_16 // Each iteration encrypt and authenticate a 16B block ld1 {v20.16b}, [x1], #16 eor v20.16b, v20.16b, v0.16b mov x11, v20.d[0] mov x12, v20.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most st1 {v20.16b}, [x0], #16 sub x2, x2, #16 // Shift the state left by 16 bytes for the next iteration of the loop mov v0.16b, v5.16b mov v5.16b, v10.16b mov v10.16b, v15.16b b Lseal_tail_64 Lseal_tail_16: // Here we handle the last [0,16) bytes of ciphertext that require a padded block cbz x2, Lseal_hash_extra eor v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes not v22.16b, v20.16b mov x6, x2 add x1, x1, x2 cbz x4, Lseal_tail_16_compose // No extra data to pad with, zero padding 
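// The compose sequence below builds one padded 16-byte block out of the last
// (<16) plaintext bytes plus however much extra_in fits, encrypts only the
// plaintext positions, and later feeds the whole block to Poly1305; any
// remaining extra_in is hashed separately under Lseal_hash_extra. A
// byte-oriented Rust-style restatement (illustrative names; the real code
// shifts bytes into v20 and masks the keystream with v21 instead of indexing):
//
// fn compose_tail_block(pt_rest: &[u8], keystream: &[u8; 16], extra_in: &[u8])
//     -> ([u8; 16], Vec<u8>) {
//     let mut block = [0u8; 16];
//     block[..pt_rest.len()].copy_from_slice(pt_rest);
//     let n = core::cmp::min(extra_in.len(), 16 - pt_rest.len());
//     block[pt_rest.len()..pt_rest.len() + n].copy_from_slice(&extra_in[..n]);
//     for i in 0..pt_rest.len() {
//         block[i] ^= keystream[i]; // ciphertext bytes, left in place for hashing
//     }
//     let ct_out = block[..pt_rest.len()].to_vec();
//     (block, ct_out) // `block` is what Poly1305 absorbs for this chunk
// }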
mov x7, #16 // We need to load some extra_in first for padding sub x7, x7, x2 cmp x4, x7 csel x7, x4, x7, lt // Load the minimum of extra_in_len and the amount needed to fill the register mov x12, x7 add x3, x3, x7 sub x4, x4, x7 Lseal_tail16_compose_extra_in: ext v20.16b, v20.16b, v20.16b, #15 ldrb w11, [x3, #-1]! mov v20.b[0], w11 subs x7, x7, #1 b.gt Lseal_tail16_compose_extra_in add x3, x3, x12 Lseal_tail_16_compose: ext v20.16b, v20.16b, v20.16b, #15 ldrb w11, [x1, #-1]! mov v20.b[0], w11 ext v21.16b, v22.16b, v21.16b, #15 subs x2, x2, #1 b.gt Lseal_tail_16_compose and v0.16b, v0.16b, v21.16b eor v20.16b, v20.16b, v0.16b mov v21.16b, v20.16b Lseal_tail_16_store: umov w11, v20.b[0] strb w11, [x0], #1 ext v20.16b, v20.16b, v20.16b, #1 subs x6, x6, #1 b.gt Lseal_tail_16_store // Hash in the final ct block concatenated with extra_in mov x11, v21.d[0] mov x12, v21.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most Lseal_hash_extra: cbz x4, Lseal_finalize Lseal_hash_extra_loop: cmp x4, #16 b.lt Lseal_hash_extra_tail ld1 {v20.16b}, [x3], #16 mov x11, v20.d[0] mov x12, v20.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most sub x4, x4, #16 b Lseal_hash_extra_loop Lseal_hash_extra_tail: cbz x4, Lseal_finalize eor v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext add x3, x3, x4 Lseal_hash_extra_load: ext v20.16b, v20.16b, v20.16b, #15 ldrb w11, [x3, #-1]! 
mov v20.b[0], w11 subs x4, x4, #1 b.gt Lseal_hash_extra_load // Hash in the final padded extra_in blcok mov x11, v20.d[0] mov x12, v20.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most Lseal_finalize: mov x11, v31.d[0] mov x12, v31.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most // Final reduction step sub x12, xzr, x15 orr x13, xzr, #3 subs x11, x8, #-5 sbcs x12, x9, x12 sbcs x13, x10, x13 csel x8, x11, x8, cs csel x9, x12, x9, cs csel x10, x13, x10, cs mov x11, v27.d[0] mov x12, v27.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 stp x8, x9, [x5] ldp d8, d9, [sp, #16] ldp d10, d11, [sp, #32] ldp d12, d13, [sp, #48] ldp d14, d15, [sp, #64] .cfi_restore b15 .cfi_restore b14 .cfi_restore b13 .cfi_restore b12 .cfi_restore b11 .cfi_restore b10 .cfi_restore b9 .cfi_restore b8 ldp x29, x30, [sp], 80 .cfi_restore w29 .cfi_restore w30 .cfi_def_cfa_offset 0 AARCH64_VALIDATE_LINK_REGISTER ret Lseal_128: // On some architectures preparing 5 blocks for small buffers is wasteful eor v25.16b, v25.16b, v25.16b mov x11, #1 mov v25.s[0], w11 mov v0.16b, v24.16b mov v1.16b, v24.16b mov v2.16b, v24.16b mov v5.16b, v28.16b mov v6.16b, v28.16b mov v7.16b, v28.16b mov v10.16b, v29.16b mov v11.16b, v29.16b mov v12.16b, v29.16b mov v17.16b, v30.16b add v15.4s, v17.4s, v25.4s add v16.4s, v15.4s, v25.4s mov x6, #10 Lseal_128_rounds: add v0.4s, v0.4s, v5.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v5.16b, v5.16b, v10.16b eor v6.16b, v6.16b, v11.16b eor v7.16b, v7.16b, v12.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 ushr v5.4s, v6.4s, #20 sli v5.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, v5.4s add v2.4s, v2.4s, v6.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b add 
v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v20.16b, v20.16b, v10.16b eor v5.16b, v5.16b, v11.16b eor v6.16b, v6.16b, v12.16b ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v5.4s, #25 sli v6.4s, v5.4s, #7 ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #4 ext v6.16b, v6.16b, v6.16b, #4 ext v7.16b, v7.16b, v7.16b, #4 ext v10.16b, v10.16b, v10.16b, #8 ext v11.16b, v11.16b, v11.16b, #8 ext v12.16b, v12.16b, v12.16b, #8 ext v15.16b, v15.16b, v15.16b, #12 ext v16.16b, v16.16b, v16.16b, #12 ext v17.16b, v17.16b, v17.16b, #12 add v0.4s, v0.4s, v5.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v5.16b, v5.16b, v10.16b eor v6.16b, v6.16b, v11.16b eor v7.16b, v7.16b, v12.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 ushr v5.4s, v6.4s, #20 sli v5.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, v5.4s add v2.4s, v2.4s, v6.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v20.16b, v20.16b, v10.16b eor v5.16b, v5.16b, v11.16b eor v6.16b, v6.16b, v12.16b ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v5.4s, #25 sli v6.4s, v5.4s, #7 ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #12 ext v6.16b, v6.16b, v6.16b, #12 ext v7.16b, v7.16b, v7.16b, #12 ext v10.16b, v10.16b, v10.16b, #8 ext v11.16b, v11.16b, v11.16b, #8 ext v12.16b, v12.16b, v12.16b, #8 ext v15.16b, v15.16b, v15.16b, #4 ext v16.16b, v16.16b, v16.16b, #4 ext v17.16b, v17.16b, v17.16b, #4 subs x6, x6, #1 b.hi Lseal_128_rounds add v0.4s, v0.4s, v24.4s add v1.4s, v1.4s, v24.4s add v2.4s, v2.4s, v24.4s add v5.4s, v5.4s, v28.4s add v6.4s, v6.4s, v28.4s add v7.4s, v7.4s, v28.4s // Only the first 32 bytes of the third block (counter = 0) are needed, // so skip updating v12 and v17. add v10.4s, v10.4s, v29.4s add v11.4s, v11.4s, v29.4s add v30.4s, v30.4s, v25.4s add v15.4s, v15.4s, v30.4s add v30.4s, v30.4s, v25.4s add v16.4s, v16.4s, v30.4s and v2.16b, v2.16b, v27.16b mov x16, v2.d[0] // Move the R key to GPRs mov x17, v2.d[1] mov v27.16b, v7.16b // Store the S key bl Lpoly_hash_ad_internal b Lseal_tail .cfi_endproc ///////////////////////////////// // // void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data); // .globl _chacha20_poly1305_open .private_extern _chacha20_poly1305_open .align 6 _chacha20_poly1305_open: AARCH64_SIGN_LINK_REGISTER .cfi_startproc stp x29, x30, [sp, #-80]! .cfi_def_cfa_offset 80 .cfi_offset w30, -72 .cfi_offset w29, -80 mov x29, sp // We probably could do .cfi_def_cfa w29, 80 at this point, but since // we don't actually use the frame pointer like that, it's probably not // worth bothering. 
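// A rough map of the open path that follows, pieced together from the labels and
// comments in this file (schematic only; pseudocode names are illustrative):
//
//   load CONSTS/INC/ROL8/CLAMP and the key/counter/nonce state from aead_data (x5)
//   run one ChaCha20 block on that state; clamp its first 16 bytes -> Poly1305 r,
//     keep the next 16 bytes -> Poly1305 s
//   hash the AAD via Lpoly_hash_ad_internal
//   while input remains: hash the ciphertext, generate keystream, pt = ct ^ stream
//   hash the final padded partial block (if any), then len_ad || len_in
//   tag = (acc mod 2^130 - 5) + s (mod 2^128), stored back through x5
//
// In the ChaCha20 rounds below, the vector rotates are encoded as: rev32 on .8h
// lanes = rotate-left 16, tbl with the Lrol8 table = rotate-left 8, and the
// ushr+sli pairs = rotate-left 12 and rotate-left 7.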
stp d8, d9, [sp, #16] stp d10, d11, [sp, #32] stp d12, d13, [sp, #48] stp d14, d15, [sp, #64] .cfi_offset b15, -8 .cfi_offset b14, -16 .cfi_offset b13, -24 .cfi_offset b12, -32 .cfi_offset b11, -40 .cfi_offset b10, -48 .cfi_offset b9, -56 .cfi_offset b8, -64 adrp x11, Lchacha20_consts@PAGE add x11, x11, Lchacha20_consts@PAGEOFF ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values ld1 {v28.16b - v30.16b}, [x5] mov x15, #1 // Prepare the Poly1305 state mov x8, #0 mov x9, #0 mov x10, #0 mov v31.d[0], x4 // Store the input and aad lengths mov v31.d[1], x2 cmp x2, #128 b.le Lopen_128 // Optimization for smaller buffers // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys mov v0.16b, v24.16b mov v5.16b, v28.16b mov v10.16b, v29.16b mov v15.16b, v30.16b mov x6, #10 .align 5 Lopen_init_rounds: add v0.4s, v0.4s, v5.4s eor v15.16b, v15.16b, v0.16b rev32 v15.8h, v15.8h add v10.4s, v10.4s, v15.4s eor v5.16b, v5.16b, v10.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 add v0.4s, v0.4s, v20.4s eor v15.16b, v15.16b, v0.16b tbl v15.16b, {v15.16b}, v26.16b add v10.4s, v10.4s, v15.4s eor v20.16b, v20.16b, v10.16b ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #4 ext v10.16b, v10.16b, v10.16b, #8 ext v15.16b, v15.16b, v15.16b, #12 add v0.4s, v0.4s, v5.4s eor v15.16b, v15.16b, v0.16b rev32 v15.8h, v15.8h add v10.4s, v10.4s, v15.4s eor v5.16b, v5.16b, v10.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 add v0.4s, v0.4s, v20.4s eor v15.16b, v15.16b, v0.16b tbl v15.16b, {v15.16b}, v26.16b add v10.4s, v10.4s, v15.4s eor v20.16b, v20.16b, v10.16b ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #12 ext v10.16b, v10.16b, v10.16b, #8 ext v15.16b, v15.16b, v15.16b, #4 subs x6, x6, #1 b.hi Lopen_init_rounds add v0.4s, v0.4s, v24.4s add v5.4s, v5.4s, v28.4s and v0.16b, v0.16b, v27.16b mov x16, v0.d[0] // Move the R key to GPRs mov x17, v0.d[1] mov v27.16b, v5.16b // Store the S key bl Lpoly_hash_ad_internal Lopen_ad_done: mov x3, x1 // Each iteration of the loop hash 320 bytes, and prepare stream for 320 bytes Lopen_main_loop: cmp x2, #192 b.lt Lopen_tail adrp x11, Lchacha20_consts@PAGE add x11, x11, Lchacha20_consts@PAGEOFF ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] mov v4.16b, v24.16b ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 mov v9.16b, v28.16b ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 mov v14.16b, v29.16b ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] sub x5, x5, #32 add v15.4s, v15.4s, v25.4s mov v19.16b, v30.16b eor v20.16b, v20.16b, v20.16b //zero not v21.16b, v20.16b // -1 sub v21.4s, v25.4s, v21.4s // Add +1 ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) add v19.4s, v19.4s, v20.4s lsr x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12 sub x4, x4, #10 mov x7, #10 subs x6, x7, x4 subs x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full cbz x7, Lopen_main_loop_rounds_short .align 5 Lopen_main_loop_rounds: ldp x11, x12, [x3], 16 adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 
adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most Lopen_main_loop_rounds_short: add v0.4s, v0.4s, v5.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s add v3.4s, v3.4s, v8.4s add v4.4s, v4.4s, v9.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b eor v18.16b, v18.16b, v3.16b eor v19.16b, v19.16b, v4.16b rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h rev32 v18.8h, v18.8h rev32 v19.8h, v19.8h add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s add v13.4s, v13.4s, v18.4s add v14.4s, v14.4s, v19.4s eor v5.16b, v5.16b, v10.16b eor v6.16b, v6.16b, v11.16b eor v7.16b, v7.16b, v12.16b eor v8.16b, v8.16b, v13.16b eor v9.16b, v9.16b, v14.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 ushr v5.4s, v6.4s, #20 sli v5.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 ushr v7.4s, v8.4s, #20 sli v7.4s, v8.4s, #12 ushr v8.4s, v9.4s, #20 sli v8.4s, v9.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, v5.4s add v2.4s, v2.4s, v6.4s add v3.4s, v3.4s, v7.4s add v4.4s, v4.4s, v8.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b eor v18.16b, v18.16b, v3.16b eor v19.16b, v19.16b, v4.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b tbl v18.16b, {v18.16b}, v26.16b tbl v19.16b, {v19.16b}, v26.16b add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s add v13.4s, v13.4s, v18.4s add v14.4s, v14.4s, v19.4s eor v20.16b, v20.16b, v10.16b eor v5.16b, v5.16b, v11.16b eor v6.16b, v6.16b, v12.16b eor v7.16b, v7.16b, v13.16b eor v8.16b, v8.16b, v14.16b ushr v9.4s, v8.4s, #25 sli v9.4s, v8.4s, #7 ushr v8.4s, v7.4s, #25 sli v8.4s, v7.4s, #7 ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v5.4s, #25 sli v6.4s, v5.4s, #7 ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v9.16b, v9.16b, v9.16b, #4 ext v14.16b, v14.16b, v14.16b, #8 ext v19.16b, v19.16b, v19.16b, #12 ldp x11, x12, [x3], 16 adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most add v0.4s, v0.4s, v6.4s add v1.4s, v1.4s, v7.4s add v2.4s, v2.4s, v8.4s add v3.4s, v3.4s, v5.4s add v4.4s, v4.4s, v9.4s eor v18.16b, v18.16b, v0.16b eor v15.16b, v15.16b, v1.16b eor v16.16b, v16.16b, v2.16b eor v17.16b, v17.16b, v3.16b eor v19.16b, v19.16b, v4.16b rev32 v18.8h, v18.8h rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h rev32 v19.8h, v19.8h add v12.4s, v12.4s, v18.4s add v13.4s, v13.4s, v15.4s add v10.4s, v10.4s, v16.4s add v11.4s, v11.4s, v17.4s add v14.4s, v14.4s, v19.4s eor 
v6.16b, v6.16b, v12.16b eor v7.16b, v7.16b, v13.16b eor v8.16b, v8.16b, v10.16b eor v5.16b, v5.16b, v11.16b eor v9.16b, v9.16b, v14.16b ushr v20.4s, v6.4s, #20 sli v20.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 ushr v7.4s, v8.4s, #20 sli v7.4s, v8.4s, #12 ushr v8.4s, v5.4s, #20 sli v8.4s, v5.4s, #12 ushr v5.4s, v9.4s, #20 sli v5.4s, v9.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s add v3.4s, v3.4s, v8.4s add v4.4s, v4.4s, v5.4s eor v18.16b, v18.16b, v0.16b eor v15.16b, v15.16b, v1.16b eor v16.16b, v16.16b, v2.16b eor v17.16b, v17.16b, v3.16b eor v19.16b, v19.16b, v4.16b tbl v18.16b, {v18.16b}, v26.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b tbl v19.16b, {v19.16b}, v26.16b add v12.4s, v12.4s, v18.4s add v13.4s, v13.4s, v15.4s add v10.4s, v10.4s, v16.4s add v11.4s, v11.4s, v17.4s add v14.4s, v14.4s, v19.4s eor v20.16b, v20.16b, v12.16b eor v6.16b, v6.16b, v13.16b eor v7.16b, v7.16b, v10.16b eor v8.16b, v8.16b, v11.16b eor v5.16b, v5.16b, v14.16b ushr v9.4s, v5.4s, #25 sli v9.4s, v5.4s, #7 ushr v5.4s, v8.4s, #25 sli v5.4s, v8.4s, #7 ushr v8.4s, v7.4s, #25 sli v8.4s, v7.4s, #7 ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v20.4s, #25 sli v6.4s, v20.4s, #7 ext v9.16b, v9.16b, v9.16b, #12 ext v14.16b, v14.16b, v14.16b, #8 ext v19.16b, v19.16b, v19.16b, #4 subs x7, x7, #1 b.gt Lopen_main_loop_rounds subs x6, x6, #1 b.ge Lopen_main_loop_rounds_short eor v20.16b, v20.16b, v20.16b //zero not v21.16b, v20.16b // -1 sub v21.4s, v25.4s, v21.4s // Add +1 ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) add v19.4s, v19.4s, v20.4s add v15.4s, v15.4s, v25.4s mov x11, #5 dup v20.4s, w11 add v25.4s, v25.4s, v20.4s zip1 v20.4s, v0.4s, v1.4s zip2 v21.4s, v0.4s, v1.4s zip1 v22.4s, v2.4s, v3.4s zip2 v23.4s, v2.4s, v3.4s zip1 v0.2d, v20.2d, v22.2d zip2 v1.2d, v20.2d, v22.2d zip1 v2.2d, v21.2d, v23.2d zip2 v3.2d, v21.2d, v23.2d zip1 v20.4s, v5.4s, v6.4s zip2 v21.4s, v5.4s, v6.4s zip1 v22.4s, v7.4s, v8.4s zip2 v23.4s, v7.4s, v8.4s zip1 v5.2d, v20.2d, v22.2d zip2 v6.2d, v20.2d, v22.2d zip1 v7.2d, v21.2d, v23.2d zip2 v8.2d, v21.2d, v23.2d zip1 v20.4s, v10.4s, v11.4s zip2 v21.4s, v10.4s, v11.4s zip1 v22.4s, v12.4s, v13.4s zip2 v23.4s, v12.4s, v13.4s zip1 v10.2d, v20.2d, v22.2d zip2 v11.2d, v20.2d, v22.2d zip1 v12.2d, v21.2d, v23.2d zip2 v13.2d, v21.2d, v23.2d zip1 v20.4s, v15.4s, v16.4s zip2 v21.4s, v15.4s, v16.4s zip1 v22.4s, v17.4s, v18.4s zip2 v23.4s, v17.4s, v18.4s zip1 v15.2d, v20.2d, v22.2d zip2 v16.2d, v20.2d, v22.2d zip1 v17.2d, v21.2d, v23.2d zip2 v18.2d, v21.2d, v23.2d add v0.4s, v0.4s, v24.4s add v5.4s, v5.4s, v28.4s add v10.4s, v10.4s, v29.4s add v15.4s, v15.4s, v30.4s add v1.4s, v1.4s, v24.4s add v6.4s, v6.4s, v28.4s add v11.4s, v11.4s, v29.4s add v16.4s, v16.4s, v30.4s add v2.4s, v2.4s, v24.4s add v7.4s, v7.4s, v28.4s add v12.4s, v12.4s, v29.4s add v17.4s, v17.4s, v30.4s add v3.4s, v3.4s, v24.4s add v8.4s, v8.4s, v28.4s add v13.4s, v13.4s, v29.4s add v18.4s, v18.4s, v30.4s add v4.4s, v4.4s, v24.4s add v9.4s, v9.4s, v28.4s add v14.4s, v14.4s, v29.4s add v19.4s, v19.4s, v30.4s // We can always safely store 192 bytes ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v0.16b eor v21.16b, v21.16b, v5.16b eor v22.16b, v22.16b, v10.16b eor v23.16b, v23.16b, v15.16b st1 {v20.16b - v23.16b}, [x0], #64 ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v1.16b eor v21.16b, v21.16b, v6.16b eor v22.16b, v22.16b, v11.16b eor v23.16b, v23.16b, v16.16b st1 {v20.16b - 
v23.16b}, [x0], #64 ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v2.16b eor v21.16b, v21.16b, v7.16b eor v22.16b, v22.16b, v12.16b eor v23.16b, v23.16b, v17.16b st1 {v20.16b - v23.16b}, [x0], #64 sub x2, x2, #192 mov v0.16b, v3.16b mov v5.16b, v8.16b mov v10.16b, v13.16b mov v15.16b, v18.16b cmp x2, #64 b.lt Lopen_tail_64_store ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v3.16b eor v21.16b, v21.16b, v8.16b eor v22.16b, v22.16b, v13.16b eor v23.16b, v23.16b, v18.16b st1 {v20.16b - v23.16b}, [x0], #64 sub x2, x2, #64 mov v0.16b, v4.16b mov v5.16b, v9.16b mov v10.16b, v14.16b mov v15.16b, v19.16b cmp x2, #64 b.lt Lopen_tail_64_store ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v4.16b eor v21.16b, v21.16b, v9.16b eor v22.16b, v22.16b, v14.16b eor v23.16b, v23.16b, v19.16b st1 {v20.16b - v23.16b}, [x0], #64 sub x2, x2, #64 b Lopen_main_loop Lopen_tail: cbz x2, Lopen_finalize lsr x4, x2, #4 // How many whole blocks we have to hash cmp x2, #64 b.le Lopen_tail_64 cmp x2, #128 b.le Lopen_tail_128 Lopen_tail_192: // We need three more blocks mov v0.16b, v24.16b mov v1.16b, v24.16b mov v2.16b, v24.16b mov v5.16b, v28.16b mov v6.16b, v28.16b mov v7.16b, v28.16b mov v10.16b, v29.16b mov v11.16b, v29.16b mov v12.16b, v29.16b mov v15.16b, v30.16b mov v16.16b, v30.16b mov v17.16b, v30.16b eor v23.16b, v23.16b, v23.16b eor v21.16b, v21.16b, v21.16b ins v23.s[0], v25.s[0] ins v21.d[0], x15 add v22.4s, v23.4s, v21.4s add v21.4s, v22.4s, v21.4s add v15.4s, v15.4s, v21.4s add v16.4s, v16.4s, v23.4s add v17.4s, v17.4s, v22.4s mov x7, #10 subs x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing sub x4, x4, x7 cbz x7, Lopen_tail_192_rounds_no_hash Lopen_tail_192_rounds: ldp x11, x12, [x3], 16 adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most Lopen_tail_192_rounds_no_hash: add v0.4s, v0.4s, v5.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v5.16b, v5.16b, v10.16b eor v6.16b, v6.16b, v11.16b eor v7.16b, v7.16b, v12.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 ushr v5.4s, v6.4s, #20 sli v5.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, v5.4s add v2.4s, v2.4s, v6.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v20.16b, v20.16b, v10.16b eor v5.16b, 
v5.16b, v11.16b eor v6.16b, v6.16b, v12.16b ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v5.4s, #25 sli v6.4s, v5.4s, #7 ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #4 ext v6.16b, v6.16b, v6.16b, #4 ext v7.16b, v7.16b, v7.16b, #4 ext v10.16b, v10.16b, v10.16b, #8 ext v11.16b, v11.16b, v11.16b, #8 ext v12.16b, v12.16b, v12.16b, #8 ext v15.16b, v15.16b, v15.16b, #12 ext v16.16b, v16.16b, v16.16b, #12 ext v17.16b, v17.16b, v17.16b, #12 add v0.4s, v0.4s, v5.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v5.16b, v5.16b, v10.16b eor v6.16b, v6.16b, v11.16b eor v7.16b, v7.16b, v12.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 ushr v5.4s, v6.4s, #20 sli v5.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, v5.4s add v2.4s, v2.4s, v6.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v20.16b, v20.16b, v10.16b eor v5.16b, v5.16b, v11.16b eor v6.16b, v6.16b, v12.16b ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v5.4s, #25 sli v6.4s, v5.4s, #7 ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #12 ext v6.16b, v6.16b, v6.16b, #12 ext v7.16b, v7.16b, v7.16b, #12 ext v10.16b, v10.16b, v10.16b, #8 ext v11.16b, v11.16b, v11.16b, #8 ext v12.16b, v12.16b, v12.16b, #8 ext v15.16b, v15.16b, v15.16b, #4 ext v16.16b, v16.16b, v16.16b, #4 ext v17.16b, v17.16b, v17.16b, #4 subs x7, x7, #1 b.gt Lopen_tail_192_rounds subs x6, x6, #1 b.ge Lopen_tail_192_rounds_no_hash // We hashed 160 bytes at most, may still have 32 bytes left Lopen_tail_192_hash: cbz x4, Lopen_tail_192_hash_done ldp x11, x12, [x3], 16 adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most sub x4, x4, #1 b Lopen_tail_192_hash Lopen_tail_192_hash_done: add v0.4s, v0.4s, v24.4s add v1.4s, v1.4s, v24.4s add v2.4s, v2.4s, v24.4s add v5.4s, v5.4s, v28.4s add v6.4s, v6.4s, v28.4s add v7.4s, v7.4s, v28.4s add v10.4s, v10.4s, v29.4s add v11.4s, v11.4s, v29.4s add v12.4s, v12.4s, v29.4s add v15.4s, v15.4s, v30.4s add v16.4s, v16.4s, v30.4s add v17.4s, v17.4s, v30.4s add v15.4s, v15.4s, v21.4s add v16.4s, v16.4s, v23.4s add v17.4s, v17.4s, v22.4s ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v1.16b eor v21.16b, v21.16b, v6.16b eor v22.16b, v22.16b, v11.16b eor v23.16b, v23.16b, v16.16b st1 {v20.16b - v23.16b}, [x0], #64 ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, 
v2.16b eor v21.16b, v21.16b, v7.16b eor v22.16b, v22.16b, v12.16b eor v23.16b, v23.16b, v17.16b st1 {v20.16b - v23.16b}, [x0], #64 sub x2, x2, #128 b Lopen_tail_64_store Lopen_tail_128: // We need two more blocks mov v0.16b, v24.16b mov v1.16b, v24.16b mov v5.16b, v28.16b mov v6.16b, v28.16b mov v10.16b, v29.16b mov v11.16b, v29.16b mov v15.16b, v30.16b mov v16.16b, v30.16b eor v23.16b, v23.16b, v23.16b eor v22.16b, v22.16b, v22.16b ins v23.s[0], v25.s[0] ins v22.d[0], x15 add v22.4s, v22.4s, v23.4s add v15.4s, v15.4s, v22.4s add v16.4s, v16.4s, v23.4s mov x6, #10 sub x6, x6, x4 Lopen_tail_128_rounds: add v0.4s, v0.4s, v5.4s eor v15.16b, v15.16b, v0.16b rev32 v15.8h, v15.8h add v10.4s, v10.4s, v15.4s eor v5.16b, v5.16b, v10.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 add v0.4s, v0.4s, v20.4s eor v15.16b, v15.16b, v0.16b tbl v15.16b, {v15.16b}, v26.16b add v10.4s, v10.4s, v15.4s eor v20.16b, v20.16b, v10.16b ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #4 ext v10.16b, v10.16b, v10.16b, #8 ext v15.16b, v15.16b, v15.16b, #12 add v1.4s, v1.4s, v6.4s eor v16.16b, v16.16b, v1.16b rev32 v16.8h, v16.8h add v11.4s, v11.4s, v16.4s eor v6.16b, v6.16b, v11.16b ushr v20.4s, v6.4s, #20 sli v20.4s, v6.4s, #12 add v1.4s, v1.4s, v20.4s eor v16.16b, v16.16b, v1.16b tbl v16.16b, {v16.16b}, v26.16b add v11.4s, v11.4s, v16.4s eor v20.16b, v20.16b, v11.16b ushr v6.4s, v20.4s, #25 sli v6.4s, v20.4s, #7 ext v6.16b, v6.16b, v6.16b, #4 ext v11.16b, v11.16b, v11.16b, #8 ext v16.16b, v16.16b, v16.16b, #12 add v0.4s, v0.4s, v5.4s eor v15.16b, v15.16b, v0.16b rev32 v15.8h, v15.8h add v10.4s, v10.4s, v15.4s eor v5.16b, v5.16b, v10.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 add v0.4s, v0.4s, v20.4s eor v15.16b, v15.16b, v0.16b tbl v15.16b, {v15.16b}, v26.16b add v10.4s, v10.4s, v15.4s eor v20.16b, v20.16b, v10.16b ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #12 ext v10.16b, v10.16b, v10.16b, #8 ext v15.16b, v15.16b, v15.16b, #4 add v1.4s, v1.4s, v6.4s eor v16.16b, v16.16b, v1.16b rev32 v16.8h, v16.8h add v11.4s, v11.4s, v16.4s eor v6.16b, v6.16b, v11.16b ushr v20.4s, v6.4s, #20 sli v20.4s, v6.4s, #12 add v1.4s, v1.4s, v20.4s eor v16.16b, v16.16b, v1.16b tbl v16.16b, {v16.16b}, v26.16b add v11.4s, v11.4s, v16.4s eor v20.16b, v20.16b, v11.16b ushr v6.4s, v20.4s, #25 sli v6.4s, v20.4s, #7 ext v6.16b, v6.16b, v6.16b, #12 ext v11.16b, v11.16b, v11.16b, #8 ext v16.16b, v16.16b, v16.16b, #4 subs x6, x6, #1 b.gt Lopen_tail_128_rounds cbz x4, Lopen_tail_128_rounds_done subs x4, x4, #1 ldp x11, x12, [x3], 16 adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most b Lopen_tail_128_rounds Lopen_tail_128_rounds_done: add v0.4s, v0.4s, v24.4s add v1.4s, v1.4s, v24.4s add v5.4s, v5.4s, v28.4s add v6.4s, v6.4s, v28.4s add v10.4s, v10.4s, v29.4s add v11.4s, v11.4s, v29.4s add 
v15.4s, v15.4s, v30.4s add v16.4s, v16.4s, v30.4s add v15.4s, v15.4s, v22.4s add v16.4s, v16.4s, v23.4s ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v1.16b eor v21.16b, v21.16b, v6.16b eor v22.16b, v22.16b, v11.16b eor v23.16b, v23.16b, v16.16b st1 {v20.16b - v23.16b}, [x0], #64 sub x2, x2, #64 b Lopen_tail_64_store Lopen_tail_64: // We just need a single block mov v0.16b, v24.16b mov v5.16b, v28.16b mov v10.16b, v29.16b mov v15.16b, v30.16b eor v23.16b, v23.16b, v23.16b ins v23.s[0], v25.s[0] add v15.4s, v15.4s, v23.4s mov x6, #10 sub x6, x6, x4 Lopen_tail_64_rounds: add v0.4s, v0.4s, v5.4s eor v15.16b, v15.16b, v0.16b rev32 v15.8h, v15.8h add v10.4s, v10.4s, v15.4s eor v5.16b, v5.16b, v10.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 add v0.4s, v0.4s, v20.4s eor v15.16b, v15.16b, v0.16b tbl v15.16b, {v15.16b}, v26.16b add v10.4s, v10.4s, v15.4s eor v20.16b, v20.16b, v10.16b ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #4 ext v10.16b, v10.16b, v10.16b, #8 ext v15.16b, v15.16b, v15.16b, #12 add v0.4s, v0.4s, v5.4s eor v15.16b, v15.16b, v0.16b rev32 v15.8h, v15.8h add v10.4s, v10.4s, v15.4s eor v5.16b, v5.16b, v10.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 add v0.4s, v0.4s, v20.4s eor v15.16b, v15.16b, v0.16b tbl v15.16b, {v15.16b}, v26.16b add v10.4s, v10.4s, v15.4s eor v20.16b, v20.16b, v10.16b ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #12 ext v10.16b, v10.16b, v10.16b, #8 ext v15.16b, v15.16b, v15.16b, #4 subs x6, x6, #1 b.gt Lopen_tail_64_rounds cbz x4, Lopen_tail_64_rounds_done subs x4, x4, #1 ldp x11, x12, [x3], 16 adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most b Lopen_tail_64_rounds Lopen_tail_64_rounds_done: add v0.4s, v0.4s, v24.4s add v5.4s, v5.4s, v28.4s add v10.4s, v10.4s, v29.4s add v15.4s, v15.4s, v30.4s add v15.4s, v15.4s, v23.4s Lopen_tail_64_store: cmp x2, #16 b.lt Lopen_tail_16 ld1 {v20.16b}, [x1], #16 eor v20.16b, v20.16b, v0.16b st1 {v20.16b}, [x0], #16 mov v0.16b, v5.16b mov v5.16b, v10.16b mov v10.16b, v15.16b sub x2, x2, #16 b Lopen_tail_64_store Lopen_tail_16: // Here we handle the last [0,16) bytes that require a padded block cbz x2, Lopen_finalize eor v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask not v22.16b, v20.16b add x7, x1, x2 mov x6, x2 Lopen_tail_16_compose: ext v20.16b, v20.16b, v20.16b, #15 ldrb w11, [x7, #-1]! 
mov v20.b[0], w11 ext v21.16b, v22.16b, v21.16b, #15 subs x2, x2, #1 b.gt Lopen_tail_16_compose and v20.16b, v20.16b, v21.16b // Hash in the final padded block mov x11, v20.d[0] mov x12, v20.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most eor v20.16b, v20.16b, v0.16b Lopen_tail_16_store: umov w11, v20.b[0] strb w11, [x0], #1 ext v20.16b, v20.16b, v20.16b, #1 subs x6, x6, #1 b.gt Lopen_tail_16_store Lopen_finalize: mov x11, v31.d[0] mov x12, v31.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most // Final reduction step sub x12, xzr, x15 orr x13, xzr, #3 subs x11, x8, #-5 sbcs x12, x9, x12 sbcs x13, x10, x13 csel x8, x11, x8, cs csel x9, x12, x9, cs csel x10, x13, x10, cs mov x11, v27.d[0] mov x12, v27.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 stp x8, x9, [x5] ldp d8, d9, [sp, #16] ldp d10, d11, [sp, #32] ldp d12, d13, [sp, #48] ldp d14, d15, [sp, #64] .cfi_restore b15 .cfi_restore b14 .cfi_restore b13 .cfi_restore b12 .cfi_restore b11 .cfi_restore b10 .cfi_restore b9 .cfi_restore b8 ldp x29, x30, [sp], 80 .cfi_restore w29 .cfi_restore w30 .cfi_def_cfa_offset 0 AARCH64_VALIDATE_LINK_REGISTER ret Lopen_128: // On some architectures preparing 5 blocks for small buffers is wasteful eor v25.16b, v25.16b, v25.16b mov x11, #1 mov v25.s[0], w11 mov v0.16b, v24.16b mov v1.16b, v24.16b mov v2.16b, v24.16b mov v5.16b, v28.16b mov v6.16b, v28.16b mov v7.16b, v28.16b mov v10.16b, v29.16b mov v11.16b, v29.16b mov v12.16b, v29.16b mov v17.16b, v30.16b add v15.4s, v17.4s, v25.4s add v16.4s, v15.4s, v25.4s mov x6, #10 Lopen_128_rounds: add v0.4s, v0.4s, v5.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v5.16b, v5.16b, v10.16b eor v6.16b, v6.16b, v11.16b eor v7.16b, v7.16b, v12.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 ushr v5.4s, v6.4s, #20 sli v5.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, 
v5.4s add v2.4s, v2.4s, v6.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v20.16b, v20.16b, v10.16b eor v5.16b, v5.16b, v11.16b eor v6.16b, v6.16b, v12.16b ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v5.4s, #25 sli v6.4s, v5.4s, #7 ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #4 ext v6.16b, v6.16b, v6.16b, #4 ext v7.16b, v7.16b, v7.16b, #4 ext v10.16b, v10.16b, v10.16b, #8 ext v11.16b, v11.16b, v11.16b, #8 ext v12.16b, v12.16b, v12.16b, #8 ext v15.16b, v15.16b, v15.16b, #12 ext v16.16b, v16.16b, v16.16b, #12 ext v17.16b, v17.16b, v17.16b, #12 add v0.4s, v0.4s, v5.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v5.16b, v5.16b, v10.16b eor v6.16b, v6.16b, v11.16b eor v7.16b, v7.16b, v12.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 ushr v5.4s, v6.4s, #20 sli v5.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, v5.4s add v2.4s, v2.4s, v6.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v20.16b, v20.16b, v10.16b eor v5.16b, v5.16b, v11.16b eor v6.16b, v6.16b, v12.16b ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v5.4s, #25 sli v6.4s, v5.4s, #7 ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #12 ext v6.16b, v6.16b, v6.16b, #12 ext v7.16b, v7.16b, v7.16b, #12 ext v10.16b, v10.16b, v10.16b, #8 ext v11.16b, v11.16b, v11.16b, #8 ext v12.16b, v12.16b, v12.16b, #8 ext v15.16b, v15.16b, v15.16b, #4 ext v16.16b, v16.16b, v16.16b, #4 ext v17.16b, v17.16b, v17.16b, #4 subs x6, x6, #1 b.hi Lopen_128_rounds add v0.4s, v0.4s, v24.4s add v1.4s, v1.4s, v24.4s add v2.4s, v2.4s, v24.4s add v5.4s, v5.4s, v28.4s add v6.4s, v6.4s, v28.4s add v7.4s, v7.4s, v28.4s add v10.4s, v10.4s, v29.4s add v11.4s, v11.4s, v29.4s add v30.4s, v30.4s, v25.4s add v15.4s, v15.4s, v30.4s add v30.4s, v30.4s, v25.4s add v16.4s, v16.4s, v30.4s and v2.16b, v2.16b, v27.16b mov x16, v2.d[0] // Move the R key to GPRs mov x17, v2.d[1] mov v27.16b, v7.16b // Store the S key bl Lpoly_hash_ad_internal Lopen_128_store: cmp x2, #64 b.lt Lopen_128_store_64 ld1 {v20.16b - v23.16b}, [x1], #64 mov x11, v20.d[0] mov x12, v20.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 
has the value of 4 at most mov x11, v21.d[0] mov x12, v21.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most mov x11, v22.d[0] mov x12, v22.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most mov x11, v23.d[0] mov x12, v23.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most eor v20.16b, v20.16b, v0.16b eor v21.16b, v21.16b, v5.16b eor v22.16b, v22.16b, v10.16b eor v23.16b, v23.16b, v15.16b st1 {v20.16b - v23.16b}, [x0], #64 sub x2, x2, #64 mov v0.16b, v1.16b mov v5.16b, v6.16b mov v10.16b, v11.16b mov v15.16b, v16.16b Lopen_128_store_64: lsr x4, x2, #4 mov x3, x1 Lopen_128_hash_64: cbz x4, Lopen_tail_64_store ldp x11, x12, [x3], 16 adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most sub x4, x4, #1 b 
Lopen_128_hash_64 .cfi_endproc #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) ring-0.17.14/pregenerated/chacha20_poly1305_armv8-linux64.S000064400000000000000000002211151046102023000211010ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) .section .rodata .align 7 .Lchacha20_consts: .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' .Linc: .long 1,2,3,4 .Lrol8: .byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 .Lclamp: .quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC .text .type .Lpoly_hash_ad_internal,%function .align 6 .Lpoly_hash_ad_internal: .cfi_startproc cbnz x4, .Lpoly_hash_intro ret .Lpoly_hash_intro: cmp x4, #16 b.lt .Lpoly_hash_ad_tail ldp x11, x12, [x3], 16 adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most sub x4, x4, #16 b .Lpoly_hash_ad_internal .Lpoly_hash_ad_tail: cbz x4, .Lpoly_hash_ad_ret eor v20.16b, v20.16b, v20.16b // Use T0 to load the AAD sub x4, x4, #1 .Lpoly_hash_tail_16_compose: ext v20.16b, v20.16b, v20.16b, #15 ldrb w11, [x3, x4] mov v20.b[0], w11 subs x4, x4, #1 b.ge .Lpoly_hash_tail_16_compose mov x11, v20.d[0] mov x12, v20.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most .Lpoly_hash_ad_ret: ret .cfi_endproc .size .Lpoly_hash_ad_internal, .-.Lpoly_hash_ad_internal ///////////////////////////////// // // void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data); // .globl chacha20_poly1305_seal .hidden chacha20_poly1305_seal .type chacha20_poly1305_seal,%function .align 6 chacha20_poly1305_seal: AARCH64_SIGN_LINK_REGISTER .cfi_startproc stp x29, x30, [sp, #-80]! .cfi_def_cfa_offset 80 .cfi_offset w30, -72 .cfi_offset w29, -80 mov x29, sp // We probably could do .cfi_def_cfa w29, 80 at this point, but since // we don't actually use the frame pointer like that, it's probably not // worth bothering. 
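// The repeated mul/umulh/adds ladders in the body below each perform one Poly1305
// update, h = ((h + m) * r) mod (2^130 - 5), with h held as three 64-bit limbs
// (x8:x9:x10 = acc0:acc1:acc2) and the clamped key halves r0, r1 kept in x16, x17.
// A rough sketch of the arithmetic (schematic only):
//
//   acc += m + (pad_bit << 128)                  // adds / adcs / adc with x15 = 1
//   (t3:t2:t1:t0) = (acc2:acc1:acc0) * (r1:r0)   // 64x64->128 schoolbook products
//   // 2^130 == 5 (mod 2^130 - 5), so the part above bit 130, c = (t3:t2) >> 2,
//   // is folded back in multiplied by 5 = 4 + 1 (the and #-4 / extr / lsr steps):
//   acc = (t1:t0) + ((t2 & 3) << 128) + 4*c + c  // lazy: leaves acc2 <= 4
//
// Full reduction to a unique representative happens once, at .Lseal_finalize /
// .Lopen_finalize, just before s is added to form the 128-bit tag.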
stp d8, d9, [sp, #16] stp d10, d11, [sp, #32] stp d12, d13, [sp, #48] stp d14, d15, [sp, #64] .cfi_offset b15, -8 .cfi_offset b14, -16 .cfi_offset b13, -24 .cfi_offset b12, -32 .cfi_offset b11, -40 .cfi_offset b10, -48 .cfi_offset b9, -56 .cfi_offset b8, -64 adrp x11, .Lchacha20_consts add x11, x11, :lo12:.Lchacha20_consts ld1 {v24.16b - v27.16b}, [x11] // .Load the CONSTS, INC, ROL8 and CLAMP values ld1 {v28.16b - v30.16b}, [x5] mov x15, #1 // Prepare the Poly1305 state mov x8, #0 mov x9, #0 mov x10, #0 ldr x12, [x5, #56] // The total cipher text length includes extra_in_len add x12, x12, x2 mov v31.d[0], x4 // Store the input and aad lengths mov v31.d[1], x12 cmp x2, #128 b.le .Lseal_128 // Optimization for smaller buffers // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext, // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically, // the fifth block (A4-D4) horizontally. ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] mov v4.16b, v24.16b ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 mov v9.16b, v28.16b ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 mov v14.16b, v29.16b ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] add v15.4s, v15.4s, v25.4s mov v19.16b, v30.16b sub x5, x5, #32 mov x6, #10 .align 5 .Lseal_init_rounds: add v0.4s, v0.4s, v5.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s add v3.4s, v3.4s, v8.4s add v4.4s, v4.4s, v9.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b eor v18.16b, v18.16b, v3.16b eor v19.16b, v19.16b, v4.16b rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h rev32 v18.8h, v18.8h rev32 v19.8h, v19.8h add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s add v13.4s, v13.4s, v18.4s add v14.4s, v14.4s, v19.4s eor v5.16b, v5.16b, v10.16b eor v6.16b, v6.16b, v11.16b eor v7.16b, v7.16b, v12.16b eor v8.16b, v8.16b, v13.16b eor v9.16b, v9.16b, v14.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 ushr v5.4s, v6.4s, #20 sli v5.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 ushr v7.4s, v8.4s, #20 sli v7.4s, v8.4s, #12 ushr v8.4s, v9.4s, #20 sli v8.4s, v9.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, v5.4s add v2.4s, v2.4s, v6.4s add v3.4s, v3.4s, v7.4s add v4.4s, v4.4s, v8.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b eor v18.16b, v18.16b, v3.16b eor v19.16b, v19.16b, v4.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b tbl v18.16b, {v18.16b}, v26.16b tbl v19.16b, {v19.16b}, v26.16b add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s add v13.4s, v13.4s, v18.4s add v14.4s, v14.4s, v19.4s eor v20.16b, v20.16b, v10.16b eor v5.16b, v5.16b, v11.16b eor v6.16b, v6.16b, v12.16b eor v7.16b, v7.16b, v13.16b eor v8.16b, v8.16b, v14.16b ushr v9.4s, v8.4s, #25 sli v9.4s, v8.4s, #7 ushr v8.4s, v7.4s, #25 sli v8.4s, v7.4s, #7 ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v5.4s, #25 sli v6.4s, v5.4s, #7 ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v9.16b, v9.16b, v9.16b, #4 ext v14.16b, v14.16b, v14.16b, #8 ext v19.16b, v19.16b, v19.16b, #12 add v0.4s, v0.4s, v6.4s add v1.4s, v1.4s, v7.4s add v2.4s, v2.4s, v8.4s add v3.4s, v3.4s, v5.4s add v4.4s, v4.4s, v9.4s eor v18.16b, v18.16b, v0.16b eor v15.16b, v15.16b, v1.16b eor v16.16b, v16.16b, v2.16b eor v17.16b, v17.16b, v3.16b eor v19.16b, v19.16b, v4.16b rev32 v18.8h, v18.8h rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, 
v17.8h rev32 v19.8h, v19.8h add v12.4s, v12.4s, v18.4s add v13.4s, v13.4s, v15.4s add v10.4s, v10.4s, v16.4s add v11.4s, v11.4s, v17.4s add v14.4s, v14.4s, v19.4s eor v6.16b, v6.16b, v12.16b eor v7.16b, v7.16b, v13.16b eor v8.16b, v8.16b, v10.16b eor v5.16b, v5.16b, v11.16b eor v9.16b, v9.16b, v14.16b ushr v20.4s, v6.4s, #20 sli v20.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 ushr v7.4s, v8.4s, #20 sli v7.4s, v8.4s, #12 ushr v8.4s, v5.4s, #20 sli v8.4s, v5.4s, #12 ushr v5.4s, v9.4s, #20 sli v5.4s, v9.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s add v3.4s, v3.4s, v8.4s add v4.4s, v4.4s, v5.4s eor v18.16b, v18.16b, v0.16b eor v15.16b, v15.16b, v1.16b eor v16.16b, v16.16b, v2.16b eor v17.16b, v17.16b, v3.16b eor v19.16b, v19.16b, v4.16b tbl v18.16b, {v18.16b}, v26.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b tbl v19.16b, {v19.16b}, v26.16b add v12.4s, v12.4s, v18.4s add v13.4s, v13.4s, v15.4s add v10.4s, v10.4s, v16.4s add v11.4s, v11.4s, v17.4s add v14.4s, v14.4s, v19.4s eor v20.16b, v20.16b, v12.16b eor v6.16b, v6.16b, v13.16b eor v7.16b, v7.16b, v10.16b eor v8.16b, v8.16b, v11.16b eor v5.16b, v5.16b, v14.16b ushr v9.4s, v5.4s, #25 sli v9.4s, v5.4s, #7 ushr v5.4s, v8.4s, #25 sli v5.4s, v8.4s, #7 ushr v8.4s, v7.4s, #25 sli v8.4s, v7.4s, #7 ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v20.4s, #25 sli v6.4s, v20.4s, #7 ext v9.16b, v9.16b, v9.16b, #12 ext v14.16b, v14.16b, v14.16b, #8 ext v19.16b, v19.16b, v19.16b, #4 subs x6, x6, #1 b.hi .Lseal_init_rounds add v15.4s, v15.4s, v25.4s mov x11, #4 dup v20.4s, w11 add v25.4s, v25.4s, v20.4s zip1 v20.4s, v0.4s, v1.4s zip2 v21.4s, v0.4s, v1.4s zip1 v22.4s, v2.4s, v3.4s zip2 v23.4s, v2.4s, v3.4s zip1 v0.2d, v20.2d, v22.2d zip2 v1.2d, v20.2d, v22.2d zip1 v2.2d, v21.2d, v23.2d zip2 v3.2d, v21.2d, v23.2d zip1 v20.4s, v5.4s, v6.4s zip2 v21.4s, v5.4s, v6.4s zip1 v22.4s, v7.4s, v8.4s zip2 v23.4s, v7.4s, v8.4s zip1 v5.2d, v20.2d, v22.2d zip2 v6.2d, v20.2d, v22.2d zip1 v7.2d, v21.2d, v23.2d zip2 v8.2d, v21.2d, v23.2d zip1 v20.4s, v10.4s, v11.4s zip2 v21.4s, v10.4s, v11.4s zip1 v22.4s, v12.4s, v13.4s zip2 v23.4s, v12.4s, v13.4s zip1 v10.2d, v20.2d, v22.2d zip2 v11.2d, v20.2d, v22.2d zip1 v12.2d, v21.2d, v23.2d zip2 v13.2d, v21.2d, v23.2d zip1 v20.4s, v15.4s, v16.4s zip2 v21.4s, v15.4s, v16.4s zip1 v22.4s, v17.4s, v18.4s zip2 v23.4s, v17.4s, v18.4s zip1 v15.2d, v20.2d, v22.2d zip2 v16.2d, v20.2d, v22.2d zip1 v17.2d, v21.2d, v23.2d zip2 v18.2d, v21.2d, v23.2d add v4.4s, v4.4s, v24.4s add v9.4s, v9.4s, v28.4s and v4.16b, v4.16b, v27.16b add v0.4s, v0.4s, v24.4s add v5.4s, v5.4s, v28.4s add v10.4s, v10.4s, v29.4s add v15.4s, v15.4s, v30.4s add v1.4s, v1.4s, v24.4s add v6.4s, v6.4s, v28.4s add v11.4s, v11.4s, v29.4s add v16.4s, v16.4s, v30.4s add v2.4s, v2.4s, v24.4s add v7.4s, v7.4s, v28.4s add v12.4s, v12.4s, v29.4s add v17.4s, v17.4s, v30.4s add v3.4s, v3.4s, v24.4s add v8.4s, v8.4s, v28.4s add v13.4s, v13.4s, v29.4s add v18.4s, v18.4s, v30.4s mov x16, v4.d[0] // Move the R key to GPRs mov x17, v4.d[1] mov v27.16b, v9.16b // Store the S key bl .Lpoly_hash_ad_internal mov x3, x0 cmp x2, #256 b.le .Lseal_tail ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v0.16b eor v21.16b, v21.16b, v5.16b eor v22.16b, v22.16b, v10.16b eor v23.16b, v23.16b, v15.16b st1 {v20.16b - v23.16b}, [x0], #64 ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v1.16b eor v21.16b, v21.16b, v6.16b eor v22.16b, v22.16b, v11.16b eor v23.16b, v23.16b, v16.16b st1 
{v20.16b - v23.16b}, [x0], #64 ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v2.16b eor v21.16b, v21.16b, v7.16b eor v22.16b, v22.16b, v12.16b eor v23.16b, v23.16b, v17.16b st1 {v20.16b - v23.16b}, [x0], #64 ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v3.16b eor v21.16b, v21.16b, v8.16b eor v22.16b, v22.16b, v13.16b eor v23.16b, v23.16b, v18.16b st1 {v20.16b - v23.16b}, [x0], #64 sub x2, x2, #256 mov x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds mov x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256 .Lseal_main_loop: adrp x11, .Lchacha20_consts add x11, x11, :lo12:.Lchacha20_consts ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] mov v4.16b, v24.16b ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 mov v9.16b, v28.16b ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 mov v14.16b, v29.16b ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] add v15.4s, v15.4s, v25.4s mov v19.16b, v30.16b eor v20.16b, v20.16b, v20.16b //zero not v21.16b, v20.16b // -1 sub v21.4s, v25.4s, v21.4s // Add +1 ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) add v19.4s, v19.4s, v20.4s sub x5, x5, #32 .align 5 .Lseal_main_loop_rounds: add v0.4s, v0.4s, v5.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s add v3.4s, v3.4s, v8.4s add v4.4s, v4.4s, v9.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b eor v18.16b, v18.16b, v3.16b eor v19.16b, v19.16b, v4.16b rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h rev32 v18.8h, v18.8h rev32 v19.8h, v19.8h add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s add v13.4s, v13.4s, v18.4s add v14.4s, v14.4s, v19.4s eor v5.16b, v5.16b, v10.16b eor v6.16b, v6.16b, v11.16b eor v7.16b, v7.16b, v12.16b eor v8.16b, v8.16b, v13.16b eor v9.16b, v9.16b, v14.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 ushr v5.4s, v6.4s, #20 sli v5.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 ushr v7.4s, v8.4s, #20 sli v7.4s, v8.4s, #12 ushr v8.4s, v9.4s, #20 sli v8.4s, v9.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, v5.4s add v2.4s, v2.4s, v6.4s add v3.4s, v3.4s, v7.4s add v4.4s, v4.4s, v8.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b eor v18.16b, v18.16b, v3.16b eor v19.16b, v19.16b, v4.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b tbl v18.16b, {v18.16b}, v26.16b tbl v19.16b, {v19.16b}, v26.16b add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s add v13.4s, v13.4s, v18.4s add v14.4s, v14.4s, v19.4s eor v20.16b, v20.16b, v10.16b eor v5.16b, v5.16b, v11.16b eor v6.16b, v6.16b, v12.16b eor v7.16b, v7.16b, v13.16b eor v8.16b, v8.16b, v14.16b ushr v9.4s, v8.4s, #25 sli v9.4s, v8.4s, #7 ushr v8.4s, v7.4s, #25 sli v8.4s, v7.4s, #7 ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v5.4s, #25 sli v6.4s, v5.4s, #7 ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v9.16b, v9.16b, v9.16b, #4 ext v14.16b, v14.16b, v14.16b, #8 ext v19.16b, v19.16b, v19.16b, #12 ldp x11, x12, [x3], 16 adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 
adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most add v0.4s, v0.4s, v6.4s add v1.4s, v1.4s, v7.4s add v2.4s, v2.4s, v8.4s add v3.4s, v3.4s, v5.4s add v4.4s, v4.4s, v9.4s eor v18.16b, v18.16b, v0.16b eor v15.16b, v15.16b, v1.16b eor v16.16b, v16.16b, v2.16b eor v17.16b, v17.16b, v3.16b eor v19.16b, v19.16b, v4.16b rev32 v18.8h, v18.8h rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h rev32 v19.8h, v19.8h add v12.4s, v12.4s, v18.4s add v13.4s, v13.4s, v15.4s add v10.4s, v10.4s, v16.4s add v11.4s, v11.4s, v17.4s add v14.4s, v14.4s, v19.4s eor v6.16b, v6.16b, v12.16b eor v7.16b, v7.16b, v13.16b eor v8.16b, v8.16b, v10.16b eor v5.16b, v5.16b, v11.16b eor v9.16b, v9.16b, v14.16b ushr v20.4s, v6.4s, #20 sli v20.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 ushr v7.4s, v8.4s, #20 sli v7.4s, v8.4s, #12 ushr v8.4s, v5.4s, #20 sli v8.4s, v5.4s, #12 ushr v5.4s, v9.4s, #20 sli v5.4s, v9.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s add v3.4s, v3.4s, v8.4s add v4.4s, v4.4s, v5.4s eor v18.16b, v18.16b, v0.16b eor v15.16b, v15.16b, v1.16b eor v16.16b, v16.16b, v2.16b eor v17.16b, v17.16b, v3.16b eor v19.16b, v19.16b, v4.16b tbl v18.16b, {v18.16b}, v26.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b tbl v19.16b, {v19.16b}, v26.16b add v12.4s, v12.4s, v18.4s add v13.4s, v13.4s, v15.4s add v10.4s, v10.4s, v16.4s add v11.4s, v11.4s, v17.4s add v14.4s, v14.4s, v19.4s eor v20.16b, v20.16b, v12.16b eor v6.16b, v6.16b, v13.16b eor v7.16b, v7.16b, v10.16b eor v8.16b, v8.16b, v11.16b eor v5.16b, v5.16b, v14.16b ushr v9.4s, v5.4s, #25 sli v9.4s, v5.4s, #7 ushr v5.4s, v8.4s, #25 sli v5.4s, v8.4s, #7 ushr v8.4s, v7.4s, #25 sli v8.4s, v7.4s, #7 ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v20.4s, #25 sli v6.4s, v20.4s, #7 ext v9.16b, v9.16b, v9.16b, #12 ext v14.16b, v14.16b, v14.16b, #8 ext v19.16b, v19.16b, v19.16b, #4 subs x6, x6, #1 b.ge .Lseal_main_loop_rounds ldp x11, x12, [x3], 16 adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most subs x7, x7, #1 b.gt .Lseal_main_loop_rounds eor v20.16b, v20.16b, v20.16b //zero not v21.16b, v20.16b // -1 sub v21.4s, v25.4s, v21.4s // Add +1 ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) add v19.4s, v19.4s, v20.4s add v15.4s, v15.4s, v25.4s mov x11, #5 dup v20.4s, w11 add v25.4s, v25.4s, v20.4s zip1 v20.4s, v0.4s, v1.4s zip2 v21.4s, v0.4s, v1.4s zip1 v22.4s, v2.4s, v3.4s zip2 v23.4s, v2.4s, v3.4s zip1 v0.2d, v20.2d, v22.2d zip2 
v1.2d, v20.2d, v22.2d zip1 v2.2d, v21.2d, v23.2d zip2 v3.2d, v21.2d, v23.2d zip1 v20.4s, v5.4s, v6.4s zip2 v21.4s, v5.4s, v6.4s zip1 v22.4s, v7.4s, v8.4s zip2 v23.4s, v7.4s, v8.4s zip1 v5.2d, v20.2d, v22.2d zip2 v6.2d, v20.2d, v22.2d zip1 v7.2d, v21.2d, v23.2d zip2 v8.2d, v21.2d, v23.2d zip1 v20.4s, v10.4s, v11.4s zip2 v21.4s, v10.4s, v11.4s zip1 v22.4s, v12.4s, v13.4s zip2 v23.4s, v12.4s, v13.4s zip1 v10.2d, v20.2d, v22.2d zip2 v11.2d, v20.2d, v22.2d zip1 v12.2d, v21.2d, v23.2d zip2 v13.2d, v21.2d, v23.2d zip1 v20.4s, v15.4s, v16.4s zip2 v21.4s, v15.4s, v16.4s zip1 v22.4s, v17.4s, v18.4s zip2 v23.4s, v17.4s, v18.4s zip1 v15.2d, v20.2d, v22.2d zip2 v16.2d, v20.2d, v22.2d zip1 v17.2d, v21.2d, v23.2d zip2 v18.2d, v21.2d, v23.2d add v0.4s, v0.4s, v24.4s add v5.4s, v5.4s, v28.4s add v10.4s, v10.4s, v29.4s add v15.4s, v15.4s, v30.4s add v1.4s, v1.4s, v24.4s add v6.4s, v6.4s, v28.4s add v11.4s, v11.4s, v29.4s add v16.4s, v16.4s, v30.4s add v2.4s, v2.4s, v24.4s add v7.4s, v7.4s, v28.4s add v12.4s, v12.4s, v29.4s add v17.4s, v17.4s, v30.4s add v3.4s, v3.4s, v24.4s add v8.4s, v8.4s, v28.4s add v13.4s, v13.4s, v29.4s add v18.4s, v18.4s, v30.4s add v4.4s, v4.4s, v24.4s add v9.4s, v9.4s, v28.4s add v14.4s, v14.4s, v29.4s add v19.4s, v19.4s, v30.4s cmp x2, #320 b.le .Lseal_tail ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v0.16b eor v21.16b, v21.16b, v5.16b eor v22.16b, v22.16b, v10.16b eor v23.16b, v23.16b, v15.16b st1 {v20.16b - v23.16b}, [x0], #64 ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v1.16b eor v21.16b, v21.16b, v6.16b eor v22.16b, v22.16b, v11.16b eor v23.16b, v23.16b, v16.16b st1 {v20.16b - v23.16b}, [x0], #64 ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v2.16b eor v21.16b, v21.16b, v7.16b eor v22.16b, v22.16b, v12.16b eor v23.16b, v23.16b, v17.16b st1 {v20.16b - v23.16b}, [x0], #64 ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v3.16b eor v21.16b, v21.16b, v8.16b eor v22.16b, v22.16b, v13.16b eor v23.16b, v23.16b, v18.16b st1 {v20.16b - v23.16b}, [x0], #64 ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v4.16b eor v21.16b, v21.16b, v9.16b eor v22.16b, v22.16b, v14.16b eor v23.16b, v23.16b, v19.16b st1 {v20.16b - v23.16b}, [x0], #64 sub x2, x2, #320 mov x6, #0 mov x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration b .Lseal_main_loop .Lseal_tail: // This part of the function handles the storage and authentication of the last [0,320) bytes // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data. 
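// Rough shape of the tail handling below (schematic summary of the labels that
// follow; pseudocode only):
//
//   while inl >= 64: ct = pt ^ next 64 bytes of stream; hash ct; store; shift state
//   while inl >= 16: ct = pt ^ next 16 bytes of stream; hash ct; store; shift state
//   if inl > 0:      build one padded 16-byte block from the leftover plaintext,
//                    topped up with extra_in bytes if available; encrypt and store
//                    only the plaintext part, then hash the whole combined block
//   hash any remaining extra_in (full blocks, then a zero-padded tail)
//   hash len_ad || len_ct (extra_in_len is already folded into len_ct), then
//   tag = (acc mod 2^130 - 5) + s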
cmp x2, #64 b.lt .Lseal_tail_64 // Store and authenticate 64B blocks per iteration ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v0.16b eor v21.16b, v21.16b, v5.16b eor v22.16b, v22.16b, v10.16b eor v23.16b, v23.16b, v15.16b mov x11, v20.d[0] mov x12, v20.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most mov x11, v21.d[0] mov x12, v21.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most mov x11, v22.d[0] mov x12, v22.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most mov x11, v23.d[0] mov x12, v23.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most st1 {v20.16b - v23.16b}, [x0], #64 sub x2, x2, #64 // Shift the state left by 64 bytes for the next iteration of the loop mov v0.16b, v1.16b 
mov v5.16b, v6.16b mov v10.16b, v11.16b mov v15.16b, v16.16b mov v1.16b, v2.16b mov v6.16b, v7.16b mov v11.16b, v12.16b mov v16.16b, v17.16b mov v2.16b, v3.16b mov v7.16b, v8.16b mov v12.16b, v13.16b mov v17.16b, v18.16b mov v3.16b, v4.16b mov v8.16b, v9.16b mov v13.16b, v14.16b mov v18.16b, v19.16b b .Lseal_tail .Lseal_tail_64: ldp x3, x4, [x5, #48] // extra_in_len and extra_in_ptr // Here we handle the last [0,64) bytes of plaintext cmp x2, #16 b.lt .Lseal_tail_16 // Each iteration encrypt and authenticate a 16B block ld1 {v20.16b}, [x1], #16 eor v20.16b, v20.16b, v0.16b mov x11, v20.d[0] mov x12, v20.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most st1 {v20.16b}, [x0], #16 sub x2, x2, #16 // Shift the state left by 16 bytes for the next iteration of the loop mov v0.16b, v5.16b mov v5.16b, v10.16b mov v10.16b, v15.16b b .Lseal_tail_64 .Lseal_tail_16: // Here we handle the last [0,16) bytes of ciphertext that require a padded block cbz x2, .Lseal_hash_extra eor v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes not v22.16b, v20.16b mov x6, x2 add x1, x1, x2 cbz x4, .Lseal_tail_16_compose // No extra data to pad with, zero padding mov x7, #16 // We need to load some extra_in first for padding sub x7, x7, x2 cmp x4, x7 csel x7, x4, x7, lt // .Load the minimum of extra_in_len and the amount needed to fill the register mov x12, x7 add x3, x3, x7 sub x4, x4, x7 .Lseal_tail16_compose_extra_in: ext v20.16b, v20.16b, v20.16b, #15 ldrb w11, [x3, #-1]! mov v20.b[0], w11 subs x7, x7, #1 b.gt .Lseal_tail16_compose_extra_in add x3, x3, x12 .Lseal_tail_16_compose: ext v20.16b, v20.16b, v20.16b, #15 ldrb w11, [x1, #-1]! 
mov v20.b[0], w11 ext v21.16b, v22.16b, v21.16b, #15 subs x2, x2, #1 b.gt .Lseal_tail_16_compose and v0.16b, v0.16b, v21.16b eor v20.16b, v20.16b, v0.16b mov v21.16b, v20.16b .Lseal_tail_16_store: umov w11, v20.b[0] strb w11, [x0], #1 ext v20.16b, v20.16b, v20.16b, #1 subs x6, x6, #1 b.gt .Lseal_tail_16_store // Hash in the final ct block concatenated with extra_in mov x11, v21.d[0] mov x12, v21.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most .Lseal_hash_extra: cbz x4, .Lseal_finalize .Lseal_hash_extra_loop: cmp x4, #16 b.lt .Lseal_hash_extra_tail ld1 {v20.16b}, [x3], #16 mov x11, v20.d[0] mov x12, v20.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most sub x4, x4, #16 b .Lseal_hash_extra_loop .Lseal_hash_extra_tail: cbz x4, .Lseal_finalize eor v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext add x3, x3, x4 .Lseal_hash_extra_load: ext v20.16b, v20.16b, v20.16b, #15 ldrb w11, [x3, #-1]! 
mov v20.b[0], w11 subs x4, x4, #1 b.gt .Lseal_hash_extra_load // Hash in the final padded extra_in blcok mov x11, v20.d[0] mov x12, v20.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most .Lseal_finalize: mov x11, v31.d[0] mov x12, v31.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most // Final reduction step sub x12, xzr, x15 orr x13, xzr, #3 subs x11, x8, #-5 sbcs x12, x9, x12 sbcs x13, x10, x13 csel x8, x11, x8, cs csel x9, x12, x9, cs csel x10, x13, x10, cs mov x11, v27.d[0] mov x12, v27.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 stp x8, x9, [x5] ldp d8, d9, [sp, #16] ldp d10, d11, [sp, #32] ldp d12, d13, [sp, #48] ldp d14, d15, [sp, #64] .cfi_restore b15 .cfi_restore b14 .cfi_restore b13 .cfi_restore b12 .cfi_restore b11 .cfi_restore b10 .cfi_restore b9 .cfi_restore b8 ldp x29, x30, [sp], 80 .cfi_restore w29 .cfi_restore w30 .cfi_def_cfa_offset 0 AARCH64_VALIDATE_LINK_REGISTER ret .Lseal_128: // On some architectures preparing 5 blocks for small buffers is wasteful eor v25.16b, v25.16b, v25.16b mov x11, #1 mov v25.s[0], w11 mov v0.16b, v24.16b mov v1.16b, v24.16b mov v2.16b, v24.16b mov v5.16b, v28.16b mov v6.16b, v28.16b mov v7.16b, v28.16b mov v10.16b, v29.16b mov v11.16b, v29.16b mov v12.16b, v29.16b mov v17.16b, v30.16b add v15.4s, v17.4s, v25.4s add v16.4s, v15.4s, v25.4s mov x6, #10 .Lseal_128_rounds: add v0.4s, v0.4s, v5.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v5.16b, v5.16b, v10.16b eor v6.16b, v6.16b, v11.16b eor v7.16b, v7.16b, v12.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 ushr v5.4s, v6.4s, #20 sli v5.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, v5.4s add v2.4s, v2.4s, v6.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b add 
v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v20.16b, v20.16b, v10.16b eor v5.16b, v5.16b, v11.16b eor v6.16b, v6.16b, v12.16b ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v5.4s, #25 sli v6.4s, v5.4s, #7 ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #4 ext v6.16b, v6.16b, v6.16b, #4 ext v7.16b, v7.16b, v7.16b, #4 ext v10.16b, v10.16b, v10.16b, #8 ext v11.16b, v11.16b, v11.16b, #8 ext v12.16b, v12.16b, v12.16b, #8 ext v15.16b, v15.16b, v15.16b, #12 ext v16.16b, v16.16b, v16.16b, #12 ext v17.16b, v17.16b, v17.16b, #12 add v0.4s, v0.4s, v5.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v5.16b, v5.16b, v10.16b eor v6.16b, v6.16b, v11.16b eor v7.16b, v7.16b, v12.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 ushr v5.4s, v6.4s, #20 sli v5.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, v5.4s add v2.4s, v2.4s, v6.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v20.16b, v20.16b, v10.16b eor v5.16b, v5.16b, v11.16b eor v6.16b, v6.16b, v12.16b ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v5.4s, #25 sli v6.4s, v5.4s, #7 ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #12 ext v6.16b, v6.16b, v6.16b, #12 ext v7.16b, v7.16b, v7.16b, #12 ext v10.16b, v10.16b, v10.16b, #8 ext v11.16b, v11.16b, v11.16b, #8 ext v12.16b, v12.16b, v12.16b, #8 ext v15.16b, v15.16b, v15.16b, #4 ext v16.16b, v16.16b, v16.16b, #4 ext v17.16b, v17.16b, v17.16b, #4 subs x6, x6, #1 b.hi .Lseal_128_rounds add v0.4s, v0.4s, v24.4s add v1.4s, v1.4s, v24.4s add v2.4s, v2.4s, v24.4s add v5.4s, v5.4s, v28.4s add v6.4s, v6.4s, v28.4s add v7.4s, v7.4s, v28.4s // Only the first 32 bytes of the third block (counter = 0) are needed, // so skip updating v12 and v17. add v10.4s, v10.4s, v29.4s add v11.4s, v11.4s, v29.4s add v30.4s, v30.4s, v25.4s add v15.4s, v15.4s, v30.4s add v30.4s, v30.4s, v25.4s add v16.4s, v16.4s, v30.4s and v2.16b, v2.16b, v27.16b mov x16, v2.d[0] // Move the R key to GPRs mov x17, v2.d[1] mov v27.16b, v7.16b // Store the S key bl .Lpoly_hash_ad_internal b .Lseal_tail .cfi_endproc .size chacha20_poly1305_seal,.-chacha20_poly1305_seal ///////////////////////////////// // // void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data); // .globl chacha20_poly1305_open .hidden chacha20_poly1305_open .type chacha20_poly1305_open,%function .align 6 chacha20_poly1305_open: AARCH64_SIGN_LINK_REGISTER .cfi_startproc stp x29, x30, [sp, #-80]! .cfi_def_cfa_offset 80 .cfi_offset w30, -72 .cfi_offset w29, -80 mov x29, sp // We probably could do .cfi_def_cfa w29, 80 at this point, but since // we don't actually use the frame pointer like that, it's probably not // worth bothering. 
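// (Annotation added for clarity; not emitted by the perlasm generator.)
// Per AAPCS64 the arguments declared above arrive as x0 = pt (output),
// x1 = ct (input), x2 = len_in, x3 = ad, x4 = len_ad and x5 = aead_data.
// The body below loads what appears to be the ChaCha20 key and counter/nonce
// words from [x5] into v28-v30, hashes the AD via .Lpoly_hash_ad_internal
// using x3/x4, and at .Lopen_finalize writes the 16-byte Poly1305 tag back
// through [x5] ("stp x8, x9, [x5]") for the caller to check.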
stp d8, d9, [sp, #16] stp d10, d11, [sp, #32] stp d12, d13, [sp, #48] stp d14, d15, [sp, #64] .cfi_offset b15, -8 .cfi_offset b14, -16 .cfi_offset b13, -24 .cfi_offset b12, -32 .cfi_offset b11, -40 .cfi_offset b10, -48 .cfi_offset b9, -56 .cfi_offset b8, -64 adrp x11, .Lchacha20_consts add x11, x11, :lo12:.Lchacha20_consts ld1 {v24.16b - v27.16b}, [x11] // .Load the CONSTS, INC, ROL8 and CLAMP values ld1 {v28.16b - v30.16b}, [x5] mov x15, #1 // Prepare the Poly1305 state mov x8, #0 mov x9, #0 mov x10, #0 mov v31.d[0], x4 // Store the input and aad lengths mov v31.d[1], x2 cmp x2, #128 b.le .Lopen_128 // Optimization for smaller buffers // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys mov v0.16b, v24.16b mov v5.16b, v28.16b mov v10.16b, v29.16b mov v15.16b, v30.16b mov x6, #10 .align 5 .Lopen_init_rounds: add v0.4s, v0.4s, v5.4s eor v15.16b, v15.16b, v0.16b rev32 v15.8h, v15.8h add v10.4s, v10.4s, v15.4s eor v5.16b, v5.16b, v10.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 add v0.4s, v0.4s, v20.4s eor v15.16b, v15.16b, v0.16b tbl v15.16b, {v15.16b}, v26.16b add v10.4s, v10.4s, v15.4s eor v20.16b, v20.16b, v10.16b ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #4 ext v10.16b, v10.16b, v10.16b, #8 ext v15.16b, v15.16b, v15.16b, #12 add v0.4s, v0.4s, v5.4s eor v15.16b, v15.16b, v0.16b rev32 v15.8h, v15.8h add v10.4s, v10.4s, v15.4s eor v5.16b, v5.16b, v10.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 add v0.4s, v0.4s, v20.4s eor v15.16b, v15.16b, v0.16b tbl v15.16b, {v15.16b}, v26.16b add v10.4s, v10.4s, v15.4s eor v20.16b, v20.16b, v10.16b ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #12 ext v10.16b, v10.16b, v10.16b, #8 ext v15.16b, v15.16b, v15.16b, #4 subs x6, x6, #1 b.hi .Lopen_init_rounds add v0.4s, v0.4s, v24.4s add v5.4s, v5.4s, v28.4s and v0.16b, v0.16b, v27.16b mov x16, v0.d[0] // Move the R key to GPRs mov x17, v0.d[1] mov v27.16b, v5.16b // Store the S key bl .Lpoly_hash_ad_internal .Lopen_ad_done: mov x3, x1 // Each iteration of the loop hash 320 bytes, and prepare stream for 320 bytes .Lopen_main_loop: cmp x2, #192 b.lt .Lopen_tail adrp x11, .Lchacha20_consts add x11, x11, :lo12:.Lchacha20_consts ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] mov v4.16b, v24.16b ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 mov v9.16b, v28.16b ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 mov v14.16b, v29.16b ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] sub x5, x5, #32 add v15.4s, v15.4s, v25.4s mov v19.16b, v30.16b eor v20.16b, v20.16b, v20.16b //zero not v21.16b, v20.16b // -1 sub v21.4s, v25.4s, v21.4s // Add +1 ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) add v19.4s, v19.4s, v20.4s lsr x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12 sub x4, x4, #10 mov x7, #10 subs x6, x7, x4 subs x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full cbz x7, .Lopen_main_loop_rounds_short .align 5 .Lopen_main_loop_rounds: ldp x11, x12, [x3], 16 adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 
adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most .Lopen_main_loop_rounds_short: add v0.4s, v0.4s, v5.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s add v3.4s, v3.4s, v8.4s add v4.4s, v4.4s, v9.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b eor v18.16b, v18.16b, v3.16b eor v19.16b, v19.16b, v4.16b rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h rev32 v18.8h, v18.8h rev32 v19.8h, v19.8h add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s add v13.4s, v13.4s, v18.4s add v14.4s, v14.4s, v19.4s eor v5.16b, v5.16b, v10.16b eor v6.16b, v6.16b, v11.16b eor v7.16b, v7.16b, v12.16b eor v8.16b, v8.16b, v13.16b eor v9.16b, v9.16b, v14.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 ushr v5.4s, v6.4s, #20 sli v5.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 ushr v7.4s, v8.4s, #20 sli v7.4s, v8.4s, #12 ushr v8.4s, v9.4s, #20 sli v8.4s, v9.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, v5.4s add v2.4s, v2.4s, v6.4s add v3.4s, v3.4s, v7.4s add v4.4s, v4.4s, v8.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b eor v18.16b, v18.16b, v3.16b eor v19.16b, v19.16b, v4.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b tbl v18.16b, {v18.16b}, v26.16b tbl v19.16b, {v19.16b}, v26.16b add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s add v13.4s, v13.4s, v18.4s add v14.4s, v14.4s, v19.4s eor v20.16b, v20.16b, v10.16b eor v5.16b, v5.16b, v11.16b eor v6.16b, v6.16b, v12.16b eor v7.16b, v7.16b, v13.16b eor v8.16b, v8.16b, v14.16b ushr v9.4s, v8.4s, #25 sli v9.4s, v8.4s, #7 ushr v8.4s, v7.4s, #25 sli v8.4s, v7.4s, #7 ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v5.4s, #25 sli v6.4s, v5.4s, #7 ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v9.16b, v9.16b, v9.16b, #4 ext v14.16b, v14.16b, v14.16b, #8 ext v19.16b, v19.16b, v19.16b, #12 ldp x11, x12, [x3], 16 adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most add v0.4s, v0.4s, v6.4s add v1.4s, v1.4s, v7.4s add v2.4s, v2.4s, v8.4s add v3.4s, v3.4s, v5.4s add v4.4s, v4.4s, v9.4s eor v18.16b, v18.16b, v0.16b eor v15.16b, v15.16b, v1.16b eor v16.16b, v16.16b, v2.16b eor v17.16b, v17.16b, v3.16b eor v19.16b, v19.16b, v4.16b rev32 v18.8h, v18.8h rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h rev32 v19.8h, v19.8h add v12.4s, v12.4s, v18.4s add v13.4s, v13.4s, v15.4s add v10.4s, v10.4s, v16.4s add v11.4s, v11.4s, v17.4s add v14.4s, v14.4s, v19.4s eor 
v6.16b, v6.16b, v12.16b eor v7.16b, v7.16b, v13.16b eor v8.16b, v8.16b, v10.16b eor v5.16b, v5.16b, v11.16b eor v9.16b, v9.16b, v14.16b ushr v20.4s, v6.4s, #20 sli v20.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 ushr v7.4s, v8.4s, #20 sli v7.4s, v8.4s, #12 ushr v8.4s, v5.4s, #20 sli v8.4s, v5.4s, #12 ushr v5.4s, v9.4s, #20 sli v5.4s, v9.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s add v3.4s, v3.4s, v8.4s add v4.4s, v4.4s, v5.4s eor v18.16b, v18.16b, v0.16b eor v15.16b, v15.16b, v1.16b eor v16.16b, v16.16b, v2.16b eor v17.16b, v17.16b, v3.16b eor v19.16b, v19.16b, v4.16b tbl v18.16b, {v18.16b}, v26.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b tbl v19.16b, {v19.16b}, v26.16b add v12.4s, v12.4s, v18.4s add v13.4s, v13.4s, v15.4s add v10.4s, v10.4s, v16.4s add v11.4s, v11.4s, v17.4s add v14.4s, v14.4s, v19.4s eor v20.16b, v20.16b, v12.16b eor v6.16b, v6.16b, v13.16b eor v7.16b, v7.16b, v10.16b eor v8.16b, v8.16b, v11.16b eor v5.16b, v5.16b, v14.16b ushr v9.4s, v5.4s, #25 sli v9.4s, v5.4s, #7 ushr v5.4s, v8.4s, #25 sli v5.4s, v8.4s, #7 ushr v8.4s, v7.4s, #25 sli v8.4s, v7.4s, #7 ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v20.4s, #25 sli v6.4s, v20.4s, #7 ext v9.16b, v9.16b, v9.16b, #12 ext v14.16b, v14.16b, v14.16b, #8 ext v19.16b, v19.16b, v19.16b, #4 subs x7, x7, #1 b.gt .Lopen_main_loop_rounds subs x6, x6, #1 b.ge .Lopen_main_loop_rounds_short eor v20.16b, v20.16b, v20.16b //zero not v21.16b, v20.16b // -1 sub v21.4s, v25.4s, v21.4s // Add +1 ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) add v19.4s, v19.4s, v20.4s add v15.4s, v15.4s, v25.4s mov x11, #5 dup v20.4s, w11 add v25.4s, v25.4s, v20.4s zip1 v20.4s, v0.4s, v1.4s zip2 v21.4s, v0.4s, v1.4s zip1 v22.4s, v2.4s, v3.4s zip2 v23.4s, v2.4s, v3.4s zip1 v0.2d, v20.2d, v22.2d zip2 v1.2d, v20.2d, v22.2d zip1 v2.2d, v21.2d, v23.2d zip2 v3.2d, v21.2d, v23.2d zip1 v20.4s, v5.4s, v6.4s zip2 v21.4s, v5.4s, v6.4s zip1 v22.4s, v7.4s, v8.4s zip2 v23.4s, v7.4s, v8.4s zip1 v5.2d, v20.2d, v22.2d zip2 v6.2d, v20.2d, v22.2d zip1 v7.2d, v21.2d, v23.2d zip2 v8.2d, v21.2d, v23.2d zip1 v20.4s, v10.4s, v11.4s zip2 v21.4s, v10.4s, v11.4s zip1 v22.4s, v12.4s, v13.4s zip2 v23.4s, v12.4s, v13.4s zip1 v10.2d, v20.2d, v22.2d zip2 v11.2d, v20.2d, v22.2d zip1 v12.2d, v21.2d, v23.2d zip2 v13.2d, v21.2d, v23.2d zip1 v20.4s, v15.4s, v16.4s zip2 v21.4s, v15.4s, v16.4s zip1 v22.4s, v17.4s, v18.4s zip2 v23.4s, v17.4s, v18.4s zip1 v15.2d, v20.2d, v22.2d zip2 v16.2d, v20.2d, v22.2d zip1 v17.2d, v21.2d, v23.2d zip2 v18.2d, v21.2d, v23.2d add v0.4s, v0.4s, v24.4s add v5.4s, v5.4s, v28.4s add v10.4s, v10.4s, v29.4s add v15.4s, v15.4s, v30.4s add v1.4s, v1.4s, v24.4s add v6.4s, v6.4s, v28.4s add v11.4s, v11.4s, v29.4s add v16.4s, v16.4s, v30.4s add v2.4s, v2.4s, v24.4s add v7.4s, v7.4s, v28.4s add v12.4s, v12.4s, v29.4s add v17.4s, v17.4s, v30.4s add v3.4s, v3.4s, v24.4s add v8.4s, v8.4s, v28.4s add v13.4s, v13.4s, v29.4s add v18.4s, v18.4s, v30.4s add v4.4s, v4.4s, v24.4s add v9.4s, v9.4s, v28.4s add v14.4s, v14.4s, v29.4s add v19.4s, v19.4s, v30.4s // We can always safely store 192 bytes ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v0.16b eor v21.16b, v21.16b, v5.16b eor v22.16b, v22.16b, v10.16b eor v23.16b, v23.16b, v15.16b st1 {v20.16b - v23.16b}, [x0], #64 ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v1.16b eor v21.16b, v21.16b, v6.16b eor v22.16b, v22.16b, v11.16b eor v23.16b, v23.16b, v16.16b st1 {v20.16b - 
v23.16b}, [x0], #64 ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v2.16b eor v21.16b, v21.16b, v7.16b eor v22.16b, v22.16b, v12.16b eor v23.16b, v23.16b, v17.16b st1 {v20.16b - v23.16b}, [x0], #64 sub x2, x2, #192 mov v0.16b, v3.16b mov v5.16b, v8.16b mov v10.16b, v13.16b mov v15.16b, v18.16b cmp x2, #64 b.lt .Lopen_tail_64_store ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v3.16b eor v21.16b, v21.16b, v8.16b eor v22.16b, v22.16b, v13.16b eor v23.16b, v23.16b, v18.16b st1 {v20.16b - v23.16b}, [x0], #64 sub x2, x2, #64 mov v0.16b, v4.16b mov v5.16b, v9.16b mov v10.16b, v14.16b mov v15.16b, v19.16b cmp x2, #64 b.lt .Lopen_tail_64_store ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v4.16b eor v21.16b, v21.16b, v9.16b eor v22.16b, v22.16b, v14.16b eor v23.16b, v23.16b, v19.16b st1 {v20.16b - v23.16b}, [x0], #64 sub x2, x2, #64 b .Lopen_main_loop .Lopen_tail: cbz x2, .Lopen_finalize lsr x4, x2, #4 // How many whole blocks we have to hash cmp x2, #64 b.le .Lopen_tail_64 cmp x2, #128 b.le .Lopen_tail_128 .Lopen_tail_192: // We need three more blocks mov v0.16b, v24.16b mov v1.16b, v24.16b mov v2.16b, v24.16b mov v5.16b, v28.16b mov v6.16b, v28.16b mov v7.16b, v28.16b mov v10.16b, v29.16b mov v11.16b, v29.16b mov v12.16b, v29.16b mov v15.16b, v30.16b mov v16.16b, v30.16b mov v17.16b, v30.16b eor v23.16b, v23.16b, v23.16b eor v21.16b, v21.16b, v21.16b ins v23.s[0], v25.s[0] ins v21.d[0], x15 add v22.4s, v23.4s, v21.4s add v21.4s, v22.4s, v21.4s add v15.4s, v15.4s, v21.4s add v16.4s, v16.4s, v23.4s add v17.4s, v17.4s, v22.4s mov x7, #10 subs x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing sub x4, x4, x7 cbz x7, .Lopen_tail_192_rounds_no_hash .Lopen_tail_192_rounds: ldp x11, x12, [x3], 16 adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most .Lopen_tail_192_rounds_no_hash: add v0.4s, v0.4s, v5.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v5.16b, v5.16b, v10.16b eor v6.16b, v6.16b, v11.16b eor v7.16b, v7.16b, v12.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 ushr v5.4s, v6.4s, #20 sli v5.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, v5.4s add v2.4s, v2.4s, v6.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v20.16b, v20.16b, v10.16b 
eor v5.16b, v5.16b, v11.16b eor v6.16b, v6.16b, v12.16b ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v5.4s, #25 sli v6.4s, v5.4s, #7 ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #4 ext v6.16b, v6.16b, v6.16b, #4 ext v7.16b, v7.16b, v7.16b, #4 ext v10.16b, v10.16b, v10.16b, #8 ext v11.16b, v11.16b, v11.16b, #8 ext v12.16b, v12.16b, v12.16b, #8 ext v15.16b, v15.16b, v15.16b, #12 ext v16.16b, v16.16b, v16.16b, #12 ext v17.16b, v17.16b, v17.16b, #12 add v0.4s, v0.4s, v5.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v5.16b, v5.16b, v10.16b eor v6.16b, v6.16b, v11.16b eor v7.16b, v7.16b, v12.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 ushr v5.4s, v6.4s, #20 sli v5.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, v5.4s add v2.4s, v2.4s, v6.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v20.16b, v20.16b, v10.16b eor v5.16b, v5.16b, v11.16b eor v6.16b, v6.16b, v12.16b ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v5.4s, #25 sli v6.4s, v5.4s, #7 ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #12 ext v6.16b, v6.16b, v6.16b, #12 ext v7.16b, v7.16b, v7.16b, #12 ext v10.16b, v10.16b, v10.16b, #8 ext v11.16b, v11.16b, v11.16b, #8 ext v12.16b, v12.16b, v12.16b, #8 ext v15.16b, v15.16b, v15.16b, #4 ext v16.16b, v16.16b, v16.16b, #4 ext v17.16b, v17.16b, v17.16b, #4 subs x7, x7, #1 b.gt .Lopen_tail_192_rounds subs x6, x6, #1 b.ge .Lopen_tail_192_rounds_no_hash // We hashed 160 bytes at most, may still have 32 bytes left .Lopen_tail_192_hash: cbz x4, .Lopen_tail_192_hash_done ldp x11, x12, [x3], 16 adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most sub x4, x4, #1 b .Lopen_tail_192_hash .Lopen_tail_192_hash_done: add v0.4s, v0.4s, v24.4s add v1.4s, v1.4s, v24.4s add v2.4s, v2.4s, v24.4s add v5.4s, v5.4s, v28.4s add v6.4s, v6.4s, v28.4s add v7.4s, v7.4s, v28.4s add v10.4s, v10.4s, v29.4s add v11.4s, v11.4s, v29.4s add v12.4s, v12.4s, v29.4s add v15.4s, v15.4s, v30.4s add v16.4s, v16.4s, v30.4s add v17.4s, v17.4s, v30.4s add v15.4s, v15.4s, v21.4s add v16.4s, v16.4s, v23.4s add v17.4s, v17.4s, v22.4s ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v1.16b eor v21.16b, v21.16b, v6.16b eor v22.16b, v22.16b, v11.16b eor v23.16b, v23.16b, v16.16b st1 {v20.16b - v23.16b}, [x0], #64 ld1 {v20.16b - v23.16b}, [x1], #64 eor 
v20.16b, v20.16b, v2.16b eor v21.16b, v21.16b, v7.16b eor v22.16b, v22.16b, v12.16b eor v23.16b, v23.16b, v17.16b st1 {v20.16b - v23.16b}, [x0], #64 sub x2, x2, #128 b .Lopen_tail_64_store .Lopen_tail_128: // We need two more blocks mov v0.16b, v24.16b mov v1.16b, v24.16b mov v5.16b, v28.16b mov v6.16b, v28.16b mov v10.16b, v29.16b mov v11.16b, v29.16b mov v15.16b, v30.16b mov v16.16b, v30.16b eor v23.16b, v23.16b, v23.16b eor v22.16b, v22.16b, v22.16b ins v23.s[0], v25.s[0] ins v22.d[0], x15 add v22.4s, v22.4s, v23.4s add v15.4s, v15.4s, v22.4s add v16.4s, v16.4s, v23.4s mov x6, #10 sub x6, x6, x4 .Lopen_tail_128_rounds: add v0.4s, v0.4s, v5.4s eor v15.16b, v15.16b, v0.16b rev32 v15.8h, v15.8h add v10.4s, v10.4s, v15.4s eor v5.16b, v5.16b, v10.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 add v0.4s, v0.4s, v20.4s eor v15.16b, v15.16b, v0.16b tbl v15.16b, {v15.16b}, v26.16b add v10.4s, v10.4s, v15.4s eor v20.16b, v20.16b, v10.16b ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #4 ext v10.16b, v10.16b, v10.16b, #8 ext v15.16b, v15.16b, v15.16b, #12 add v1.4s, v1.4s, v6.4s eor v16.16b, v16.16b, v1.16b rev32 v16.8h, v16.8h add v11.4s, v11.4s, v16.4s eor v6.16b, v6.16b, v11.16b ushr v20.4s, v6.4s, #20 sli v20.4s, v6.4s, #12 add v1.4s, v1.4s, v20.4s eor v16.16b, v16.16b, v1.16b tbl v16.16b, {v16.16b}, v26.16b add v11.4s, v11.4s, v16.4s eor v20.16b, v20.16b, v11.16b ushr v6.4s, v20.4s, #25 sli v6.4s, v20.4s, #7 ext v6.16b, v6.16b, v6.16b, #4 ext v11.16b, v11.16b, v11.16b, #8 ext v16.16b, v16.16b, v16.16b, #12 add v0.4s, v0.4s, v5.4s eor v15.16b, v15.16b, v0.16b rev32 v15.8h, v15.8h add v10.4s, v10.4s, v15.4s eor v5.16b, v5.16b, v10.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 add v0.4s, v0.4s, v20.4s eor v15.16b, v15.16b, v0.16b tbl v15.16b, {v15.16b}, v26.16b add v10.4s, v10.4s, v15.4s eor v20.16b, v20.16b, v10.16b ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #12 ext v10.16b, v10.16b, v10.16b, #8 ext v15.16b, v15.16b, v15.16b, #4 add v1.4s, v1.4s, v6.4s eor v16.16b, v16.16b, v1.16b rev32 v16.8h, v16.8h add v11.4s, v11.4s, v16.4s eor v6.16b, v6.16b, v11.16b ushr v20.4s, v6.4s, #20 sli v20.4s, v6.4s, #12 add v1.4s, v1.4s, v20.4s eor v16.16b, v16.16b, v1.16b tbl v16.16b, {v16.16b}, v26.16b add v11.4s, v11.4s, v16.4s eor v20.16b, v20.16b, v11.16b ushr v6.4s, v20.4s, #25 sli v6.4s, v20.4s, #7 ext v6.16b, v6.16b, v6.16b, #12 ext v11.16b, v11.16b, v11.16b, #8 ext v16.16b, v16.16b, v16.16b, #4 subs x6, x6, #1 b.gt .Lopen_tail_128_rounds cbz x4, .Lopen_tail_128_rounds_done subs x4, x4, #1 ldp x11, x12, [x3], 16 adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most b .Lopen_tail_128_rounds .Lopen_tail_128_rounds_done: add v0.4s, v0.4s, v24.4s add v1.4s, v1.4s, v24.4s add v5.4s, v5.4s, v28.4s add v6.4s, v6.4s, v28.4s add v10.4s, v10.4s, v29.4s add 
v11.4s, v11.4s, v29.4s add v15.4s, v15.4s, v30.4s add v16.4s, v16.4s, v30.4s add v15.4s, v15.4s, v22.4s add v16.4s, v16.4s, v23.4s ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v1.16b eor v21.16b, v21.16b, v6.16b eor v22.16b, v22.16b, v11.16b eor v23.16b, v23.16b, v16.16b st1 {v20.16b - v23.16b}, [x0], #64 sub x2, x2, #64 b .Lopen_tail_64_store .Lopen_tail_64: // We just need a single block mov v0.16b, v24.16b mov v5.16b, v28.16b mov v10.16b, v29.16b mov v15.16b, v30.16b eor v23.16b, v23.16b, v23.16b ins v23.s[0], v25.s[0] add v15.4s, v15.4s, v23.4s mov x6, #10 sub x6, x6, x4 .Lopen_tail_64_rounds: add v0.4s, v0.4s, v5.4s eor v15.16b, v15.16b, v0.16b rev32 v15.8h, v15.8h add v10.4s, v10.4s, v15.4s eor v5.16b, v5.16b, v10.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 add v0.4s, v0.4s, v20.4s eor v15.16b, v15.16b, v0.16b tbl v15.16b, {v15.16b}, v26.16b add v10.4s, v10.4s, v15.4s eor v20.16b, v20.16b, v10.16b ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #4 ext v10.16b, v10.16b, v10.16b, #8 ext v15.16b, v15.16b, v15.16b, #12 add v0.4s, v0.4s, v5.4s eor v15.16b, v15.16b, v0.16b rev32 v15.8h, v15.8h add v10.4s, v10.4s, v15.4s eor v5.16b, v5.16b, v10.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 add v0.4s, v0.4s, v20.4s eor v15.16b, v15.16b, v0.16b tbl v15.16b, {v15.16b}, v26.16b add v10.4s, v10.4s, v15.4s eor v20.16b, v20.16b, v10.16b ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #12 ext v10.16b, v10.16b, v10.16b, #8 ext v15.16b, v15.16b, v15.16b, #4 subs x6, x6, #1 b.gt .Lopen_tail_64_rounds cbz x4, .Lopen_tail_64_rounds_done subs x4, x4, #1 ldp x11, x12, [x3], 16 adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most b .Lopen_tail_64_rounds .Lopen_tail_64_rounds_done: add v0.4s, v0.4s, v24.4s add v5.4s, v5.4s, v28.4s add v10.4s, v10.4s, v29.4s add v15.4s, v15.4s, v30.4s add v15.4s, v15.4s, v23.4s .Lopen_tail_64_store: cmp x2, #16 b.lt .Lopen_tail_16 ld1 {v20.16b}, [x1], #16 eor v20.16b, v20.16b, v0.16b st1 {v20.16b}, [x0], #16 mov v0.16b, v5.16b mov v5.16b, v10.16b mov v10.16b, v15.16b sub x2, x2, #16 b .Lopen_tail_64_store .Lopen_tail_16: // Here we handle the last [0,16) bytes that require a padded block cbz x2, .Lopen_finalize eor v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask not v22.16b, v20.16b add x7, x1, x2 mov x6, x2 .Lopen_tail_16_compose: ext v20.16b, v20.16b, v20.16b, #15 ldrb w11, [x7, #-1]! 
mov v20.b[0], w11 ext v21.16b, v22.16b, v21.16b, #15 subs x2, x2, #1 b.gt .Lopen_tail_16_compose and v20.16b, v20.16b, v21.16b // Hash in the final padded block mov x11, v20.d[0] mov x12, v20.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most eor v20.16b, v20.16b, v0.16b .Lopen_tail_16_store: umov w11, v20.b[0] strb w11, [x0], #1 ext v20.16b, v20.16b, v20.16b, #1 subs x6, x6, #1 b.gt .Lopen_tail_16_store .Lopen_finalize: mov x11, v31.d[0] mov x12, v31.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most // Final reduction step sub x12, xzr, x15 orr x13, xzr, #3 subs x11, x8, #-5 sbcs x12, x9, x12 sbcs x13, x10, x13 csel x8, x11, x8, cs csel x9, x12, x9, cs csel x10, x13, x10, cs mov x11, v27.d[0] mov x12, v27.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 stp x8, x9, [x5] ldp d8, d9, [sp, #16] ldp d10, d11, [sp, #32] ldp d12, d13, [sp, #48] ldp d14, d15, [sp, #64] .cfi_restore b15 .cfi_restore b14 .cfi_restore b13 .cfi_restore b12 .cfi_restore b11 .cfi_restore b10 .cfi_restore b9 .cfi_restore b8 ldp x29, x30, [sp], 80 .cfi_restore w29 .cfi_restore w30 .cfi_def_cfa_offset 0 AARCH64_VALIDATE_LINK_REGISTER ret .Lopen_128: // On some architectures preparing 5 blocks for small buffers is wasteful eor v25.16b, v25.16b, v25.16b mov x11, #1 mov v25.s[0], w11 mov v0.16b, v24.16b mov v1.16b, v24.16b mov v2.16b, v24.16b mov v5.16b, v28.16b mov v6.16b, v28.16b mov v7.16b, v28.16b mov v10.16b, v29.16b mov v11.16b, v29.16b mov v12.16b, v29.16b mov v17.16b, v30.16b add v15.4s, v17.4s, v25.4s add v16.4s, v15.4s, v25.4s mov x6, #10 .Lopen_128_rounds: add v0.4s, v0.4s, v5.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v5.16b, v5.16b, v10.16b eor v6.16b, v6.16b, v11.16b eor v7.16b, v7.16b, v12.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 ushr v5.4s, v6.4s, #20 sli v5.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, 
v1.4s, v5.4s add v2.4s, v2.4s, v6.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v20.16b, v20.16b, v10.16b eor v5.16b, v5.16b, v11.16b eor v6.16b, v6.16b, v12.16b ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v5.4s, #25 sli v6.4s, v5.4s, #7 ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #4 ext v6.16b, v6.16b, v6.16b, #4 ext v7.16b, v7.16b, v7.16b, #4 ext v10.16b, v10.16b, v10.16b, #8 ext v11.16b, v11.16b, v11.16b, #8 ext v12.16b, v12.16b, v12.16b, #8 ext v15.16b, v15.16b, v15.16b, #12 ext v16.16b, v16.16b, v16.16b, #12 ext v17.16b, v17.16b, v17.16b, #12 add v0.4s, v0.4s, v5.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v5.16b, v5.16b, v10.16b eor v6.16b, v6.16b, v11.16b eor v7.16b, v7.16b, v12.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 ushr v5.4s, v6.4s, #20 sli v5.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, v5.4s add v2.4s, v2.4s, v6.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v20.16b, v20.16b, v10.16b eor v5.16b, v5.16b, v11.16b eor v6.16b, v6.16b, v12.16b ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v5.4s, #25 sli v6.4s, v5.4s, #7 ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #12 ext v6.16b, v6.16b, v6.16b, #12 ext v7.16b, v7.16b, v7.16b, #12 ext v10.16b, v10.16b, v10.16b, #8 ext v11.16b, v11.16b, v11.16b, #8 ext v12.16b, v12.16b, v12.16b, #8 ext v15.16b, v15.16b, v15.16b, #4 ext v16.16b, v16.16b, v16.16b, #4 ext v17.16b, v17.16b, v17.16b, #4 subs x6, x6, #1 b.hi .Lopen_128_rounds add v0.4s, v0.4s, v24.4s add v1.4s, v1.4s, v24.4s add v2.4s, v2.4s, v24.4s add v5.4s, v5.4s, v28.4s add v6.4s, v6.4s, v28.4s add v7.4s, v7.4s, v28.4s add v10.4s, v10.4s, v29.4s add v11.4s, v11.4s, v29.4s add v30.4s, v30.4s, v25.4s add v15.4s, v15.4s, v30.4s add v30.4s, v30.4s, v25.4s add v16.4s, v16.4s, v30.4s and v2.16b, v2.16b, v27.16b mov x16, v2.d[0] // Move the R key to GPRs mov x17, v2.d[1] mov v27.16b, v7.16b // Store the S key bl .Lpoly_hash_ad_internal .Lopen_128_store: cmp x2, #64 b.lt .Lopen_128_store_64 ld1 {v20.16b - v23.16b}, [x1], #64 mov x11, v20.d[0] mov x12, v20.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this 
point acc2 has the value of 4 at most mov x11, v21.d[0] mov x12, v21.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most mov x11, v22.d[0] mov x12, v22.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most mov x11, v23.d[0] mov x12, v23.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most eor v20.16b, v20.16b, v0.16b eor v21.16b, v21.16b, v5.16b eor v22.16b, v22.16b, v10.16b eor v23.16b, v23.16b, v15.16b st1 {v20.16b - v23.16b}, [x0], #64 sub x2, x2, #64 mov v0.16b, v1.16b mov v5.16b, v6.16b mov v10.16b, v11.16b mov v15.16b, v16.16b .Lopen_128_store_64: lsr x4, x2, #4 mov x3, x1 .Lopen_128_hash_64: cbz x4, .Lopen_tail_64_store ldp x11, x12, [x3], 16 adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most sub x4, 
x4, #1 b .Lopen_128_hash_64 .cfi_endproc .size chacha20_poly1305_open,.-chacha20_poly1305_open #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) ring-0.17.14/pregenerated/chacha20_poly1305_armv8-win64.S000064400000000000000000002204031046102023000205360ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) .section .rodata .align 7 Lchacha20_consts: .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' Linc: .long 1,2,3,4 Lrol8: .byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 Lclamp: .quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC .text .def Lpoly_hash_ad_internal .type 32 .endef .align 6 Lpoly_hash_ad_internal: .cfi_startproc cbnz x4, Lpoly_hash_intro ret Lpoly_hash_intro: cmp x4, #16 b.lt Lpoly_hash_ad_tail ldp x11, x12, [x3], 16 adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most sub x4, x4, #16 b Lpoly_hash_ad_internal Lpoly_hash_ad_tail: cbz x4, Lpoly_hash_ad_ret eor v20.16b, v20.16b, v20.16b // Use T0 to load the AAD sub x4, x4, #1 Lpoly_hash_tail_16_compose: ext v20.16b, v20.16b, v20.16b, #15 ldrb w11, [x3, x4] mov v20.b[0], w11 subs x4, x4, #1 b.ge Lpoly_hash_tail_16_compose mov x11, v20.d[0] mov x12, v20.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most Lpoly_hash_ad_ret: ret .cfi_endproc ///////////////////////////////// // // void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data); // .globl chacha20_poly1305_seal .def chacha20_poly1305_seal .type 32 .endef .align 6 chacha20_poly1305_seal: AARCH64_SIGN_LINK_REGISTER .cfi_startproc stp x29, x30, [sp, #-80]! .cfi_def_cfa_offset 80 .cfi_offset w30, -72 .cfi_offset w29, -80 mov x29, sp // We probably could do .cfi_def_cfa w29, 80 at this point, but since // we don't actually use the frame pointer like that, it's probably not // worth bothering. 
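// (Annotation added for clarity; not emitted by the perlasm generator.)
// This is the _WIN32 build of the same perlasm source as the __ELF__ file
// above: the instruction stream is essentially identical, only the label
// spelling (Lseal_* instead of .Lseal_*) and the COFF directives (.def/.endef
// in place of .type/.size) differ. Two of the constants in .rodata above are
// worth naming: Lrol8 is a tbl permutation that rotates each 32-bit lane left
// by 8 bits, and Lclamp is the standard Poly1305 clamp mask applied to r
// before it is moved to the GPRs.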
stp d8, d9, [sp, #16] stp d10, d11, [sp, #32] stp d12, d13, [sp, #48] stp d14, d15, [sp, #64] .cfi_offset b15, -8 .cfi_offset b14, -16 .cfi_offset b13, -24 .cfi_offset b12, -32 .cfi_offset b11, -40 .cfi_offset b10, -48 .cfi_offset b9, -56 .cfi_offset b8, -64 adrp x11, Lchacha20_consts add x11, x11, :lo12:Lchacha20_consts ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values ld1 {v28.16b - v30.16b}, [x5] mov x15, #1 // Prepare the Poly1305 state mov x8, #0 mov x9, #0 mov x10, #0 ldr x12, [x5, #56] // The total cipher text length includes extra_in_len add x12, x12, x2 mov v31.d[0], x4 // Store the input and aad lengths mov v31.d[1], x12 cmp x2, #128 b.le Lseal_128 // Optimization for smaller buffers // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext, // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically, // the fifth block (A4-D4) horizontally. ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] mov v4.16b, v24.16b ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 mov v9.16b, v28.16b ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 mov v14.16b, v29.16b ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] add v15.4s, v15.4s, v25.4s mov v19.16b, v30.16b sub x5, x5, #32 mov x6, #10 .align 5 Lseal_init_rounds: add v0.4s, v0.4s, v5.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s add v3.4s, v3.4s, v8.4s add v4.4s, v4.4s, v9.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b eor v18.16b, v18.16b, v3.16b eor v19.16b, v19.16b, v4.16b rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h rev32 v18.8h, v18.8h rev32 v19.8h, v19.8h add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s add v13.4s, v13.4s, v18.4s add v14.4s, v14.4s, v19.4s eor v5.16b, v5.16b, v10.16b eor v6.16b, v6.16b, v11.16b eor v7.16b, v7.16b, v12.16b eor v8.16b, v8.16b, v13.16b eor v9.16b, v9.16b, v14.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 ushr v5.4s, v6.4s, #20 sli v5.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 ushr v7.4s, v8.4s, #20 sli v7.4s, v8.4s, #12 ushr v8.4s, v9.4s, #20 sli v8.4s, v9.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, v5.4s add v2.4s, v2.4s, v6.4s add v3.4s, v3.4s, v7.4s add v4.4s, v4.4s, v8.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b eor v18.16b, v18.16b, v3.16b eor v19.16b, v19.16b, v4.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b tbl v18.16b, {v18.16b}, v26.16b tbl v19.16b, {v19.16b}, v26.16b add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s add v13.4s, v13.4s, v18.4s add v14.4s, v14.4s, v19.4s eor v20.16b, v20.16b, v10.16b eor v5.16b, v5.16b, v11.16b eor v6.16b, v6.16b, v12.16b eor v7.16b, v7.16b, v13.16b eor v8.16b, v8.16b, v14.16b ushr v9.4s, v8.4s, #25 sli v9.4s, v8.4s, #7 ushr v8.4s, v7.4s, #25 sli v8.4s, v7.4s, #7 ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v5.4s, #25 sli v6.4s, v5.4s, #7 ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v9.16b, v9.16b, v9.16b, #4 ext v14.16b, v14.16b, v14.16b, #8 ext v19.16b, v19.16b, v19.16b, #12 add v0.4s, v0.4s, v6.4s add v1.4s, v1.4s, v7.4s add v2.4s, v2.4s, v8.4s add v3.4s, v3.4s, v5.4s add v4.4s, v4.4s, v9.4s eor v18.16b, v18.16b, v0.16b eor v15.16b, v15.16b, v1.16b eor v16.16b, v16.16b, v2.16b eor v17.16b, v17.16b, v3.16b eor v19.16b, v19.16b, v4.16b rev32 v18.8h, v18.8h rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h 
rev32 v19.8h, v19.8h add v12.4s, v12.4s, v18.4s add v13.4s, v13.4s, v15.4s add v10.4s, v10.4s, v16.4s add v11.4s, v11.4s, v17.4s add v14.4s, v14.4s, v19.4s eor v6.16b, v6.16b, v12.16b eor v7.16b, v7.16b, v13.16b eor v8.16b, v8.16b, v10.16b eor v5.16b, v5.16b, v11.16b eor v9.16b, v9.16b, v14.16b ushr v20.4s, v6.4s, #20 sli v20.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 ushr v7.4s, v8.4s, #20 sli v7.4s, v8.4s, #12 ushr v8.4s, v5.4s, #20 sli v8.4s, v5.4s, #12 ushr v5.4s, v9.4s, #20 sli v5.4s, v9.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s add v3.4s, v3.4s, v8.4s add v4.4s, v4.4s, v5.4s eor v18.16b, v18.16b, v0.16b eor v15.16b, v15.16b, v1.16b eor v16.16b, v16.16b, v2.16b eor v17.16b, v17.16b, v3.16b eor v19.16b, v19.16b, v4.16b tbl v18.16b, {v18.16b}, v26.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b tbl v19.16b, {v19.16b}, v26.16b add v12.4s, v12.4s, v18.4s add v13.4s, v13.4s, v15.4s add v10.4s, v10.4s, v16.4s add v11.4s, v11.4s, v17.4s add v14.4s, v14.4s, v19.4s eor v20.16b, v20.16b, v12.16b eor v6.16b, v6.16b, v13.16b eor v7.16b, v7.16b, v10.16b eor v8.16b, v8.16b, v11.16b eor v5.16b, v5.16b, v14.16b ushr v9.4s, v5.4s, #25 sli v9.4s, v5.4s, #7 ushr v5.4s, v8.4s, #25 sli v5.4s, v8.4s, #7 ushr v8.4s, v7.4s, #25 sli v8.4s, v7.4s, #7 ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v20.4s, #25 sli v6.4s, v20.4s, #7 ext v9.16b, v9.16b, v9.16b, #12 ext v14.16b, v14.16b, v14.16b, #8 ext v19.16b, v19.16b, v19.16b, #4 subs x6, x6, #1 b.hi Lseal_init_rounds add v15.4s, v15.4s, v25.4s mov x11, #4 dup v20.4s, w11 add v25.4s, v25.4s, v20.4s zip1 v20.4s, v0.4s, v1.4s zip2 v21.4s, v0.4s, v1.4s zip1 v22.4s, v2.4s, v3.4s zip2 v23.4s, v2.4s, v3.4s zip1 v0.2d, v20.2d, v22.2d zip2 v1.2d, v20.2d, v22.2d zip1 v2.2d, v21.2d, v23.2d zip2 v3.2d, v21.2d, v23.2d zip1 v20.4s, v5.4s, v6.4s zip2 v21.4s, v5.4s, v6.4s zip1 v22.4s, v7.4s, v8.4s zip2 v23.4s, v7.4s, v8.4s zip1 v5.2d, v20.2d, v22.2d zip2 v6.2d, v20.2d, v22.2d zip1 v7.2d, v21.2d, v23.2d zip2 v8.2d, v21.2d, v23.2d zip1 v20.4s, v10.4s, v11.4s zip2 v21.4s, v10.4s, v11.4s zip1 v22.4s, v12.4s, v13.4s zip2 v23.4s, v12.4s, v13.4s zip1 v10.2d, v20.2d, v22.2d zip2 v11.2d, v20.2d, v22.2d zip1 v12.2d, v21.2d, v23.2d zip2 v13.2d, v21.2d, v23.2d zip1 v20.4s, v15.4s, v16.4s zip2 v21.4s, v15.4s, v16.4s zip1 v22.4s, v17.4s, v18.4s zip2 v23.4s, v17.4s, v18.4s zip1 v15.2d, v20.2d, v22.2d zip2 v16.2d, v20.2d, v22.2d zip1 v17.2d, v21.2d, v23.2d zip2 v18.2d, v21.2d, v23.2d add v4.4s, v4.4s, v24.4s add v9.4s, v9.4s, v28.4s and v4.16b, v4.16b, v27.16b add v0.4s, v0.4s, v24.4s add v5.4s, v5.4s, v28.4s add v10.4s, v10.4s, v29.4s add v15.4s, v15.4s, v30.4s add v1.4s, v1.4s, v24.4s add v6.4s, v6.4s, v28.4s add v11.4s, v11.4s, v29.4s add v16.4s, v16.4s, v30.4s add v2.4s, v2.4s, v24.4s add v7.4s, v7.4s, v28.4s add v12.4s, v12.4s, v29.4s add v17.4s, v17.4s, v30.4s add v3.4s, v3.4s, v24.4s add v8.4s, v8.4s, v28.4s add v13.4s, v13.4s, v29.4s add v18.4s, v18.4s, v30.4s mov x16, v4.d[0] // Move the R key to GPRs mov x17, v4.d[1] mov v27.16b, v9.16b // Store the S key bl Lpoly_hash_ad_internal mov x3, x0 cmp x2, #256 b.le Lseal_tail ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v0.16b eor v21.16b, v21.16b, v5.16b eor v22.16b, v22.16b, v10.16b eor v23.16b, v23.16b, v15.16b st1 {v20.16b - v23.16b}, [x0], #64 ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v1.16b eor v21.16b, v21.16b, v6.16b eor v22.16b, v22.16b, v11.16b eor v23.16b, v23.16b, v16.16b st1 {v20.16b 
- v23.16b}, [x0], #64 ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v2.16b eor v21.16b, v21.16b, v7.16b eor v22.16b, v22.16b, v12.16b eor v23.16b, v23.16b, v17.16b st1 {v20.16b - v23.16b}, [x0], #64 ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v3.16b eor v21.16b, v21.16b, v8.16b eor v22.16b, v22.16b, v13.16b eor v23.16b, v23.16b, v18.16b st1 {v20.16b - v23.16b}, [x0], #64 sub x2, x2, #256 mov x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds mov x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256 Lseal_main_loop: adrp x11, Lchacha20_consts add x11, x11, :lo12:Lchacha20_consts ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] mov v4.16b, v24.16b ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 mov v9.16b, v28.16b ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 mov v14.16b, v29.16b ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] add v15.4s, v15.4s, v25.4s mov v19.16b, v30.16b eor v20.16b, v20.16b, v20.16b //zero not v21.16b, v20.16b // -1 sub v21.4s, v25.4s, v21.4s // Add +1 ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) add v19.4s, v19.4s, v20.4s sub x5, x5, #32 .align 5 Lseal_main_loop_rounds: add v0.4s, v0.4s, v5.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s add v3.4s, v3.4s, v8.4s add v4.4s, v4.4s, v9.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b eor v18.16b, v18.16b, v3.16b eor v19.16b, v19.16b, v4.16b rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h rev32 v18.8h, v18.8h rev32 v19.8h, v19.8h add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s add v13.4s, v13.4s, v18.4s add v14.4s, v14.4s, v19.4s eor v5.16b, v5.16b, v10.16b eor v6.16b, v6.16b, v11.16b eor v7.16b, v7.16b, v12.16b eor v8.16b, v8.16b, v13.16b eor v9.16b, v9.16b, v14.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 ushr v5.4s, v6.4s, #20 sli v5.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 ushr v7.4s, v8.4s, #20 sli v7.4s, v8.4s, #12 ushr v8.4s, v9.4s, #20 sli v8.4s, v9.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, v5.4s add v2.4s, v2.4s, v6.4s add v3.4s, v3.4s, v7.4s add v4.4s, v4.4s, v8.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b eor v18.16b, v18.16b, v3.16b eor v19.16b, v19.16b, v4.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b tbl v18.16b, {v18.16b}, v26.16b tbl v19.16b, {v19.16b}, v26.16b add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s add v13.4s, v13.4s, v18.4s add v14.4s, v14.4s, v19.4s eor v20.16b, v20.16b, v10.16b eor v5.16b, v5.16b, v11.16b eor v6.16b, v6.16b, v12.16b eor v7.16b, v7.16b, v13.16b eor v8.16b, v8.16b, v14.16b ushr v9.4s, v8.4s, #25 sli v9.4s, v8.4s, #7 ushr v8.4s, v7.4s, #25 sli v8.4s, v7.4s, #7 ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v5.4s, #25 sli v6.4s, v5.4s, #7 ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v9.16b, v9.16b, v9.16b, #4 ext v14.16b, v14.16b, v14.16b, #8 ext v19.16b, v19.16b, v19.16b, #12 ldp x11, x12, [x3], 16 adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, 
x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most add v0.4s, v0.4s, v6.4s add v1.4s, v1.4s, v7.4s add v2.4s, v2.4s, v8.4s add v3.4s, v3.4s, v5.4s add v4.4s, v4.4s, v9.4s eor v18.16b, v18.16b, v0.16b eor v15.16b, v15.16b, v1.16b eor v16.16b, v16.16b, v2.16b eor v17.16b, v17.16b, v3.16b eor v19.16b, v19.16b, v4.16b rev32 v18.8h, v18.8h rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h rev32 v19.8h, v19.8h add v12.4s, v12.4s, v18.4s add v13.4s, v13.4s, v15.4s add v10.4s, v10.4s, v16.4s add v11.4s, v11.4s, v17.4s add v14.4s, v14.4s, v19.4s eor v6.16b, v6.16b, v12.16b eor v7.16b, v7.16b, v13.16b eor v8.16b, v8.16b, v10.16b eor v5.16b, v5.16b, v11.16b eor v9.16b, v9.16b, v14.16b ushr v20.4s, v6.4s, #20 sli v20.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 ushr v7.4s, v8.4s, #20 sli v7.4s, v8.4s, #12 ushr v8.4s, v5.4s, #20 sli v8.4s, v5.4s, #12 ushr v5.4s, v9.4s, #20 sli v5.4s, v9.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s add v3.4s, v3.4s, v8.4s add v4.4s, v4.4s, v5.4s eor v18.16b, v18.16b, v0.16b eor v15.16b, v15.16b, v1.16b eor v16.16b, v16.16b, v2.16b eor v17.16b, v17.16b, v3.16b eor v19.16b, v19.16b, v4.16b tbl v18.16b, {v18.16b}, v26.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b tbl v19.16b, {v19.16b}, v26.16b add v12.4s, v12.4s, v18.4s add v13.4s, v13.4s, v15.4s add v10.4s, v10.4s, v16.4s add v11.4s, v11.4s, v17.4s add v14.4s, v14.4s, v19.4s eor v20.16b, v20.16b, v12.16b eor v6.16b, v6.16b, v13.16b eor v7.16b, v7.16b, v10.16b eor v8.16b, v8.16b, v11.16b eor v5.16b, v5.16b, v14.16b ushr v9.4s, v5.4s, #25 sli v9.4s, v5.4s, #7 ushr v5.4s, v8.4s, #25 sli v5.4s, v8.4s, #7 ushr v8.4s, v7.4s, #25 sli v8.4s, v7.4s, #7 ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v20.4s, #25 sli v6.4s, v20.4s, #7 ext v9.16b, v9.16b, v9.16b, #12 ext v14.16b, v14.16b, v14.16b, #8 ext v19.16b, v19.16b, v19.16b, #4 subs x6, x6, #1 b.ge Lseal_main_loop_rounds ldp x11, x12, [x3], 16 adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most subs x7, x7, #1 b.gt Lseal_main_loop_rounds eor v20.16b, v20.16b, v20.16b //zero not v21.16b, v20.16b // -1 sub v21.4s, v25.4s, v21.4s // Add +1 ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) add v19.4s, v19.4s, v20.4s add v15.4s, v15.4s, v25.4s mov x11, #5 dup v20.4s, w11 add v25.4s, v25.4s, v20.4s zip1 v20.4s, v0.4s, v1.4s zip2 v21.4s, v0.4s, v1.4s zip1 v22.4s, v2.4s, v3.4s zip2 v23.4s, v2.4s, v3.4s zip1 v0.2d, v20.2d, v22.2d zip2 v1.2d, v20.2d, 
v22.2d zip1 v2.2d, v21.2d, v23.2d zip2 v3.2d, v21.2d, v23.2d zip1 v20.4s, v5.4s, v6.4s zip2 v21.4s, v5.4s, v6.4s zip1 v22.4s, v7.4s, v8.4s zip2 v23.4s, v7.4s, v8.4s zip1 v5.2d, v20.2d, v22.2d zip2 v6.2d, v20.2d, v22.2d zip1 v7.2d, v21.2d, v23.2d zip2 v8.2d, v21.2d, v23.2d zip1 v20.4s, v10.4s, v11.4s zip2 v21.4s, v10.4s, v11.4s zip1 v22.4s, v12.4s, v13.4s zip2 v23.4s, v12.4s, v13.4s zip1 v10.2d, v20.2d, v22.2d zip2 v11.2d, v20.2d, v22.2d zip1 v12.2d, v21.2d, v23.2d zip2 v13.2d, v21.2d, v23.2d zip1 v20.4s, v15.4s, v16.4s zip2 v21.4s, v15.4s, v16.4s zip1 v22.4s, v17.4s, v18.4s zip2 v23.4s, v17.4s, v18.4s zip1 v15.2d, v20.2d, v22.2d zip2 v16.2d, v20.2d, v22.2d zip1 v17.2d, v21.2d, v23.2d zip2 v18.2d, v21.2d, v23.2d add v0.4s, v0.4s, v24.4s add v5.4s, v5.4s, v28.4s add v10.4s, v10.4s, v29.4s add v15.4s, v15.4s, v30.4s add v1.4s, v1.4s, v24.4s add v6.4s, v6.4s, v28.4s add v11.4s, v11.4s, v29.4s add v16.4s, v16.4s, v30.4s add v2.4s, v2.4s, v24.4s add v7.4s, v7.4s, v28.4s add v12.4s, v12.4s, v29.4s add v17.4s, v17.4s, v30.4s add v3.4s, v3.4s, v24.4s add v8.4s, v8.4s, v28.4s add v13.4s, v13.4s, v29.4s add v18.4s, v18.4s, v30.4s add v4.4s, v4.4s, v24.4s add v9.4s, v9.4s, v28.4s add v14.4s, v14.4s, v29.4s add v19.4s, v19.4s, v30.4s cmp x2, #320 b.le Lseal_tail ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v0.16b eor v21.16b, v21.16b, v5.16b eor v22.16b, v22.16b, v10.16b eor v23.16b, v23.16b, v15.16b st1 {v20.16b - v23.16b}, [x0], #64 ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v1.16b eor v21.16b, v21.16b, v6.16b eor v22.16b, v22.16b, v11.16b eor v23.16b, v23.16b, v16.16b st1 {v20.16b - v23.16b}, [x0], #64 ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v2.16b eor v21.16b, v21.16b, v7.16b eor v22.16b, v22.16b, v12.16b eor v23.16b, v23.16b, v17.16b st1 {v20.16b - v23.16b}, [x0], #64 ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v3.16b eor v21.16b, v21.16b, v8.16b eor v22.16b, v22.16b, v13.16b eor v23.16b, v23.16b, v18.16b st1 {v20.16b - v23.16b}, [x0], #64 ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v4.16b eor v21.16b, v21.16b, v9.16b eor v22.16b, v22.16b, v14.16b eor v23.16b, v23.16b, v19.16b st1 {v20.16b - v23.16b}, [x0], #64 sub x2, x2, #320 mov x6, #0 mov x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration b Lseal_main_loop Lseal_tail: // This part of the function handles the storage and authentication of the last [0,320) bytes // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data. 
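// Tail processing, in order: Lseal_tail encrypts and hashes whole 64-byte
// blocks from the keystream prepared above, shifting the A/B/C/D registers
// down after each block; Lseal_tail_64 handles the remaining [0,64) bytes in
// 16-byte steps; Lseal_tail_16 composes the final partial block byte by byte,
// borrowing extra_in bytes (if any) as padding so that ciphertext||extra_in is
// absorbed as full 16-byte Poly1305 blocks; Lseal_hash_extra absorbs whatever
// extra_in remains; Lseal_finalize hashes the aad/ciphertext lengths kept in
// v31, reduces the accumulator modulo 2^130 - 5, adds the S key saved in v27
// and stores the 16-byte tag at [x5].
//
// The repeated mul/umulh blocks below are one Poly1305 step: the 16-byte block
// plus the 2^128 pad bit (x15) is added into the accumulator [x10:x9:x8], the
// sum is multiplied by the clamped key r = [x17:x16], and the product
// [t3:t2:t1:t0] is folded back using 2^130 = 5 (mod 2^130 - 5): the bits above
// 130 (call them H) are added back into the low limbs as 4*H + H, which is why
// acc2 stays only a few bits wide after every step.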
cmp x2, #64 b.lt Lseal_tail_64 // Store and authenticate 64B blocks per iteration ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v0.16b eor v21.16b, v21.16b, v5.16b eor v22.16b, v22.16b, v10.16b eor v23.16b, v23.16b, v15.16b mov x11, v20.d[0] mov x12, v20.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most mov x11, v21.d[0] mov x12, v21.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most mov x11, v22.d[0] mov x12, v22.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most mov x11, v23.d[0] mov x12, v23.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most st1 {v20.16b - v23.16b}, [x0], #64 sub x2, x2, #64 // Shift the state left by 64 bytes for the next iteration of the loop mov v0.16b, v1.16b 
mov v5.16b, v6.16b mov v10.16b, v11.16b mov v15.16b, v16.16b mov v1.16b, v2.16b mov v6.16b, v7.16b mov v11.16b, v12.16b mov v16.16b, v17.16b mov v2.16b, v3.16b mov v7.16b, v8.16b mov v12.16b, v13.16b mov v17.16b, v18.16b mov v3.16b, v4.16b mov v8.16b, v9.16b mov v13.16b, v14.16b mov v18.16b, v19.16b b Lseal_tail Lseal_tail_64: ldp x3, x4, [x5, #48] // extra_in_len and extra_in_ptr // Here we handle the last [0,64) bytes of plaintext cmp x2, #16 b.lt Lseal_tail_16 // Each iteration encrypt and authenticate a 16B block ld1 {v20.16b}, [x1], #16 eor v20.16b, v20.16b, v0.16b mov x11, v20.d[0] mov x12, v20.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most st1 {v20.16b}, [x0], #16 sub x2, x2, #16 // Shift the state left by 16 bytes for the next iteration of the loop mov v0.16b, v5.16b mov v5.16b, v10.16b mov v10.16b, v15.16b b Lseal_tail_64 Lseal_tail_16: // Here we handle the last [0,16) bytes of ciphertext that require a padded block cbz x2, Lseal_hash_extra eor v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes not v22.16b, v20.16b mov x6, x2 add x1, x1, x2 cbz x4, Lseal_tail_16_compose // No extra data to pad with, zero padding mov x7, #16 // We need to load some extra_in first for padding sub x7, x7, x2 cmp x4, x7 csel x7, x4, x7, lt // Load the minimum of extra_in_len and the amount needed to fill the register mov x12, x7 add x3, x3, x7 sub x4, x4, x7 Lseal_tail16_compose_extra_in: ext v20.16b, v20.16b, v20.16b, #15 ldrb w11, [x3, #-1]! mov v20.b[0], w11 subs x7, x7, #1 b.gt Lseal_tail16_compose_extra_in add x3, x3, x12 Lseal_tail_16_compose: ext v20.16b, v20.16b, v20.16b, #15 ldrb w11, [x1, #-1]! 
mov v20.b[0], w11 ext v21.16b, v22.16b, v21.16b, #15 subs x2, x2, #1 b.gt Lseal_tail_16_compose and v0.16b, v0.16b, v21.16b eor v20.16b, v20.16b, v0.16b mov v21.16b, v20.16b Lseal_tail_16_store: umov w11, v20.b[0] strb w11, [x0], #1 ext v20.16b, v20.16b, v20.16b, #1 subs x6, x6, #1 b.gt Lseal_tail_16_store // Hash in the final ct block concatenated with extra_in mov x11, v21.d[0] mov x12, v21.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most Lseal_hash_extra: cbz x4, Lseal_finalize Lseal_hash_extra_loop: cmp x4, #16 b.lt Lseal_hash_extra_tail ld1 {v20.16b}, [x3], #16 mov x11, v20.d[0] mov x12, v20.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most sub x4, x4, #16 b Lseal_hash_extra_loop Lseal_hash_extra_tail: cbz x4, Lseal_finalize eor v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext add x3, x3, x4 Lseal_hash_extra_load: ext v20.16b, v20.16b, v20.16b, #15 ldrb w11, [x3, #-1]! 
mov v20.b[0], w11 subs x4, x4, #1 b.gt Lseal_hash_extra_load // Hash in the final padded extra_in blcok mov x11, v20.d[0] mov x12, v20.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most Lseal_finalize: mov x11, v31.d[0] mov x12, v31.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most // Final reduction step sub x12, xzr, x15 orr x13, xzr, #3 subs x11, x8, #-5 sbcs x12, x9, x12 sbcs x13, x10, x13 csel x8, x11, x8, cs csel x9, x12, x9, cs csel x10, x13, x10, cs mov x11, v27.d[0] mov x12, v27.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 stp x8, x9, [x5] ldp d8, d9, [sp, #16] ldp d10, d11, [sp, #32] ldp d12, d13, [sp, #48] ldp d14, d15, [sp, #64] .cfi_restore b15 .cfi_restore b14 .cfi_restore b13 .cfi_restore b12 .cfi_restore b11 .cfi_restore b10 .cfi_restore b9 .cfi_restore b8 ldp x29, x30, [sp], 80 .cfi_restore w29 .cfi_restore w30 .cfi_def_cfa_offset 0 AARCH64_VALIDATE_LINK_REGISTER ret Lseal_128: // On some architectures preparing 5 blocks for small buffers is wasteful eor v25.16b, v25.16b, v25.16b mov x11, #1 mov v25.s[0], w11 mov v0.16b, v24.16b mov v1.16b, v24.16b mov v2.16b, v24.16b mov v5.16b, v28.16b mov v6.16b, v28.16b mov v7.16b, v28.16b mov v10.16b, v29.16b mov v11.16b, v29.16b mov v12.16b, v29.16b mov v17.16b, v30.16b add v15.4s, v17.4s, v25.4s add v16.4s, v15.4s, v25.4s mov x6, #10 Lseal_128_rounds: add v0.4s, v0.4s, v5.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v5.16b, v5.16b, v10.16b eor v6.16b, v6.16b, v11.16b eor v7.16b, v7.16b, v12.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 ushr v5.4s, v6.4s, #20 sli v5.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, v5.4s add v2.4s, v2.4s, v6.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b add 
v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v20.16b, v20.16b, v10.16b eor v5.16b, v5.16b, v11.16b eor v6.16b, v6.16b, v12.16b ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v5.4s, #25 sli v6.4s, v5.4s, #7 ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #4 ext v6.16b, v6.16b, v6.16b, #4 ext v7.16b, v7.16b, v7.16b, #4 ext v10.16b, v10.16b, v10.16b, #8 ext v11.16b, v11.16b, v11.16b, #8 ext v12.16b, v12.16b, v12.16b, #8 ext v15.16b, v15.16b, v15.16b, #12 ext v16.16b, v16.16b, v16.16b, #12 ext v17.16b, v17.16b, v17.16b, #12 add v0.4s, v0.4s, v5.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v5.16b, v5.16b, v10.16b eor v6.16b, v6.16b, v11.16b eor v7.16b, v7.16b, v12.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 ushr v5.4s, v6.4s, #20 sli v5.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, v5.4s add v2.4s, v2.4s, v6.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v20.16b, v20.16b, v10.16b eor v5.16b, v5.16b, v11.16b eor v6.16b, v6.16b, v12.16b ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v5.4s, #25 sli v6.4s, v5.4s, #7 ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #12 ext v6.16b, v6.16b, v6.16b, #12 ext v7.16b, v7.16b, v7.16b, #12 ext v10.16b, v10.16b, v10.16b, #8 ext v11.16b, v11.16b, v11.16b, #8 ext v12.16b, v12.16b, v12.16b, #8 ext v15.16b, v15.16b, v15.16b, #4 ext v16.16b, v16.16b, v16.16b, #4 ext v17.16b, v17.16b, v17.16b, #4 subs x6, x6, #1 b.hi Lseal_128_rounds add v0.4s, v0.4s, v24.4s add v1.4s, v1.4s, v24.4s add v2.4s, v2.4s, v24.4s add v5.4s, v5.4s, v28.4s add v6.4s, v6.4s, v28.4s add v7.4s, v7.4s, v28.4s // Only the first 32 bytes of the third block (counter = 0) are needed, // so skip updating v12 and v17. add v10.4s, v10.4s, v29.4s add v11.4s, v11.4s, v29.4s add v30.4s, v30.4s, v25.4s add v15.4s, v15.4s, v30.4s add v30.4s, v30.4s, v25.4s add v16.4s, v16.4s, v30.4s and v2.16b, v2.16b, v27.16b mov x16, v2.d[0] // Move the R key to GPRs mov x17, v2.d[1] mov v27.16b, v7.16b // Store the S key bl Lpoly_hash_ad_internal b Lseal_tail .cfi_endproc ///////////////////////////////// // // void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data); // .globl chacha20_poly1305_open .def chacha20_poly1305_open .type 32 .endef .align 6 chacha20_poly1305_open: AARCH64_SIGN_LINK_REGISTER .cfi_startproc stp x29, x30, [sp, #-80]! .cfi_def_cfa_offset 80 .cfi_offset w30, -72 .cfi_offset w29, -80 mov x29, sp // We probably could do .cfi_def_cfa w29, 80 at this point, but since // we don't actually use the frame pointer like that, it's probably not // worth bothering. 
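// Per AAPCS64 the arguments arrive as x0 = pt (output), x1 = ct (input),
// x2 = len_in, x3 = ad, x4 = len_ad, x5 = aead_data, mirroring the prototype
// above. The open path mirrors seal with the order of operations reversed:
// the Poly1305 R and S keys come from a single ChaCha20 block, the AAD and
// then the ciphertext are hashed (x3 is pointed at x1 once the AAD is done)
// before each block is decrypted into [x0], and the computed tag is written
// back through x5 rather than being checked here.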
stp d8, d9, [sp, #16] stp d10, d11, [sp, #32] stp d12, d13, [sp, #48] stp d14, d15, [sp, #64] .cfi_offset b15, -8 .cfi_offset b14, -16 .cfi_offset b13, -24 .cfi_offset b12, -32 .cfi_offset b11, -40 .cfi_offset b10, -48 .cfi_offset b9, -56 .cfi_offset b8, -64 adrp x11, Lchacha20_consts add x11, x11, :lo12:Lchacha20_consts ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values ld1 {v28.16b - v30.16b}, [x5] mov x15, #1 // Prepare the Poly1305 state mov x8, #0 mov x9, #0 mov x10, #0 mov v31.d[0], x4 // Store the input and aad lengths mov v31.d[1], x2 cmp x2, #128 b.le Lopen_128 // Optimization for smaller buffers // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys mov v0.16b, v24.16b mov v5.16b, v28.16b mov v10.16b, v29.16b mov v15.16b, v30.16b mov x6, #10 .align 5 Lopen_init_rounds: add v0.4s, v0.4s, v5.4s eor v15.16b, v15.16b, v0.16b rev32 v15.8h, v15.8h add v10.4s, v10.4s, v15.4s eor v5.16b, v5.16b, v10.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 add v0.4s, v0.4s, v20.4s eor v15.16b, v15.16b, v0.16b tbl v15.16b, {v15.16b}, v26.16b add v10.4s, v10.4s, v15.4s eor v20.16b, v20.16b, v10.16b ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #4 ext v10.16b, v10.16b, v10.16b, #8 ext v15.16b, v15.16b, v15.16b, #12 add v0.4s, v0.4s, v5.4s eor v15.16b, v15.16b, v0.16b rev32 v15.8h, v15.8h add v10.4s, v10.4s, v15.4s eor v5.16b, v5.16b, v10.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 add v0.4s, v0.4s, v20.4s eor v15.16b, v15.16b, v0.16b tbl v15.16b, {v15.16b}, v26.16b add v10.4s, v10.4s, v15.4s eor v20.16b, v20.16b, v10.16b ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #12 ext v10.16b, v10.16b, v10.16b, #8 ext v15.16b, v15.16b, v15.16b, #4 subs x6, x6, #1 b.hi Lopen_init_rounds add v0.4s, v0.4s, v24.4s add v5.4s, v5.4s, v28.4s and v0.16b, v0.16b, v27.16b mov x16, v0.d[0] // Move the R key to GPRs mov x17, v0.d[1] mov v27.16b, v5.16b // Store the S key bl Lpoly_hash_ad_internal Lopen_ad_done: mov x3, x1 // Each iteration of the loop hash 320 bytes, and prepare stream for 320 bytes Lopen_main_loop: cmp x2, #192 b.lt Lopen_tail adrp x11, Lchacha20_consts add x11, x11, :lo12:Lchacha20_consts ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] mov v4.16b, v24.16b ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 mov v9.16b, v28.16b ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 mov v14.16b, v29.16b ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] sub x5, x5, #32 add v15.4s, v15.4s, v25.4s mov v19.16b, v30.16b eor v20.16b, v20.16b, v20.16b //zero not v21.16b, v20.16b // -1 sub v21.4s, v25.4s, v21.4s // Add +1 ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) add v19.4s, v19.4s, v20.4s lsr x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12 sub x4, x4, #10 mov x7, #10 subs x6, x7, x4 subs x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full cbz x7, Lopen_main_loop_rounds_short .align 5 Lopen_main_loop_rounds: ldp x11, x12, [x3], 16 adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, 
xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most Lopen_main_loop_rounds_short: add v0.4s, v0.4s, v5.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s add v3.4s, v3.4s, v8.4s add v4.4s, v4.4s, v9.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b eor v18.16b, v18.16b, v3.16b eor v19.16b, v19.16b, v4.16b rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h rev32 v18.8h, v18.8h rev32 v19.8h, v19.8h add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s add v13.4s, v13.4s, v18.4s add v14.4s, v14.4s, v19.4s eor v5.16b, v5.16b, v10.16b eor v6.16b, v6.16b, v11.16b eor v7.16b, v7.16b, v12.16b eor v8.16b, v8.16b, v13.16b eor v9.16b, v9.16b, v14.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 ushr v5.4s, v6.4s, #20 sli v5.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 ushr v7.4s, v8.4s, #20 sli v7.4s, v8.4s, #12 ushr v8.4s, v9.4s, #20 sli v8.4s, v9.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, v5.4s add v2.4s, v2.4s, v6.4s add v3.4s, v3.4s, v7.4s add v4.4s, v4.4s, v8.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b eor v18.16b, v18.16b, v3.16b eor v19.16b, v19.16b, v4.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b tbl v18.16b, {v18.16b}, v26.16b tbl v19.16b, {v19.16b}, v26.16b add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s add v13.4s, v13.4s, v18.4s add v14.4s, v14.4s, v19.4s eor v20.16b, v20.16b, v10.16b eor v5.16b, v5.16b, v11.16b eor v6.16b, v6.16b, v12.16b eor v7.16b, v7.16b, v13.16b eor v8.16b, v8.16b, v14.16b ushr v9.4s, v8.4s, #25 sli v9.4s, v8.4s, #7 ushr v8.4s, v7.4s, #25 sli v8.4s, v7.4s, #7 ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v5.4s, #25 sli v6.4s, v5.4s, #7 ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v9.16b, v9.16b, v9.16b, #4 ext v14.16b, v14.16b, v14.16b, #8 ext v19.16b, v19.16b, v19.16b, #12 ldp x11, x12, [x3], 16 adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most add v0.4s, v0.4s, v6.4s add v1.4s, v1.4s, v7.4s add v2.4s, v2.4s, v8.4s add v3.4s, v3.4s, v5.4s add v4.4s, v4.4s, v9.4s eor v18.16b, v18.16b, v0.16b eor v15.16b, v15.16b, v1.16b eor v16.16b, v16.16b, v2.16b eor v17.16b, v17.16b, v3.16b eor v19.16b, v19.16b, v4.16b rev32 v18.8h, v18.8h rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h rev32 v19.8h, v19.8h add v12.4s, v12.4s, v18.4s add v13.4s, v13.4s, v15.4s add v10.4s, v10.4s, v16.4s add v11.4s, v11.4s, v17.4s add v14.4s, v14.4s, v19.4s eor v6.16b, v6.16b, 
v12.16b eor v7.16b, v7.16b, v13.16b eor v8.16b, v8.16b, v10.16b eor v5.16b, v5.16b, v11.16b eor v9.16b, v9.16b, v14.16b ushr v20.4s, v6.4s, #20 sli v20.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 ushr v7.4s, v8.4s, #20 sli v7.4s, v8.4s, #12 ushr v8.4s, v5.4s, #20 sli v8.4s, v5.4s, #12 ushr v5.4s, v9.4s, #20 sli v5.4s, v9.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s add v3.4s, v3.4s, v8.4s add v4.4s, v4.4s, v5.4s eor v18.16b, v18.16b, v0.16b eor v15.16b, v15.16b, v1.16b eor v16.16b, v16.16b, v2.16b eor v17.16b, v17.16b, v3.16b eor v19.16b, v19.16b, v4.16b tbl v18.16b, {v18.16b}, v26.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b tbl v19.16b, {v19.16b}, v26.16b add v12.4s, v12.4s, v18.4s add v13.4s, v13.4s, v15.4s add v10.4s, v10.4s, v16.4s add v11.4s, v11.4s, v17.4s add v14.4s, v14.4s, v19.4s eor v20.16b, v20.16b, v12.16b eor v6.16b, v6.16b, v13.16b eor v7.16b, v7.16b, v10.16b eor v8.16b, v8.16b, v11.16b eor v5.16b, v5.16b, v14.16b ushr v9.4s, v5.4s, #25 sli v9.4s, v5.4s, #7 ushr v5.4s, v8.4s, #25 sli v5.4s, v8.4s, #7 ushr v8.4s, v7.4s, #25 sli v8.4s, v7.4s, #7 ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v20.4s, #25 sli v6.4s, v20.4s, #7 ext v9.16b, v9.16b, v9.16b, #12 ext v14.16b, v14.16b, v14.16b, #8 ext v19.16b, v19.16b, v19.16b, #4 subs x7, x7, #1 b.gt Lopen_main_loop_rounds subs x6, x6, #1 b.ge Lopen_main_loop_rounds_short eor v20.16b, v20.16b, v20.16b //zero not v21.16b, v20.16b // -1 sub v21.4s, v25.4s, v21.4s // Add +1 ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) add v19.4s, v19.4s, v20.4s add v15.4s, v15.4s, v25.4s mov x11, #5 dup v20.4s, w11 add v25.4s, v25.4s, v20.4s zip1 v20.4s, v0.4s, v1.4s zip2 v21.4s, v0.4s, v1.4s zip1 v22.4s, v2.4s, v3.4s zip2 v23.4s, v2.4s, v3.4s zip1 v0.2d, v20.2d, v22.2d zip2 v1.2d, v20.2d, v22.2d zip1 v2.2d, v21.2d, v23.2d zip2 v3.2d, v21.2d, v23.2d zip1 v20.4s, v5.4s, v6.4s zip2 v21.4s, v5.4s, v6.4s zip1 v22.4s, v7.4s, v8.4s zip2 v23.4s, v7.4s, v8.4s zip1 v5.2d, v20.2d, v22.2d zip2 v6.2d, v20.2d, v22.2d zip1 v7.2d, v21.2d, v23.2d zip2 v8.2d, v21.2d, v23.2d zip1 v20.4s, v10.4s, v11.4s zip2 v21.4s, v10.4s, v11.4s zip1 v22.4s, v12.4s, v13.4s zip2 v23.4s, v12.4s, v13.4s zip1 v10.2d, v20.2d, v22.2d zip2 v11.2d, v20.2d, v22.2d zip1 v12.2d, v21.2d, v23.2d zip2 v13.2d, v21.2d, v23.2d zip1 v20.4s, v15.4s, v16.4s zip2 v21.4s, v15.4s, v16.4s zip1 v22.4s, v17.4s, v18.4s zip2 v23.4s, v17.4s, v18.4s zip1 v15.2d, v20.2d, v22.2d zip2 v16.2d, v20.2d, v22.2d zip1 v17.2d, v21.2d, v23.2d zip2 v18.2d, v21.2d, v23.2d add v0.4s, v0.4s, v24.4s add v5.4s, v5.4s, v28.4s add v10.4s, v10.4s, v29.4s add v15.4s, v15.4s, v30.4s add v1.4s, v1.4s, v24.4s add v6.4s, v6.4s, v28.4s add v11.4s, v11.4s, v29.4s add v16.4s, v16.4s, v30.4s add v2.4s, v2.4s, v24.4s add v7.4s, v7.4s, v28.4s add v12.4s, v12.4s, v29.4s add v17.4s, v17.4s, v30.4s add v3.4s, v3.4s, v24.4s add v8.4s, v8.4s, v28.4s add v13.4s, v13.4s, v29.4s add v18.4s, v18.4s, v30.4s add v4.4s, v4.4s, v24.4s add v9.4s, v9.4s, v28.4s add v14.4s, v14.4s, v29.4s add v19.4s, v19.4s, v30.4s // We can always safely store 192 bytes ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v0.16b eor v21.16b, v21.16b, v5.16b eor v22.16b, v22.16b, v10.16b eor v23.16b, v23.16b, v15.16b st1 {v20.16b - v23.16b}, [x0], #64 ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v1.16b eor v21.16b, v21.16b, v6.16b eor v22.16b, v22.16b, v11.16b eor v23.16b, v23.16b, v16.16b st1 {v20.16b - v23.16b}, [x0], #64 
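// Third of the three 64-byte blocks that can be written unconditionally: the
// loop is only entered while at least 192 bytes remain (cmp x2, #192 above).
// The fourth and fifth keystream blocks are applied below only while 64 or
// more bytes are still left; otherwise the leftover keystream is carried into
// Lopen_tail_64_store.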
ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v2.16b eor v21.16b, v21.16b, v7.16b eor v22.16b, v22.16b, v12.16b eor v23.16b, v23.16b, v17.16b st1 {v20.16b - v23.16b}, [x0], #64 sub x2, x2, #192 mov v0.16b, v3.16b mov v5.16b, v8.16b mov v10.16b, v13.16b mov v15.16b, v18.16b cmp x2, #64 b.lt Lopen_tail_64_store ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v3.16b eor v21.16b, v21.16b, v8.16b eor v22.16b, v22.16b, v13.16b eor v23.16b, v23.16b, v18.16b st1 {v20.16b - v23.16b}, [x0], #64 sub x2, x2, #64 mov v0.16b, v4.16b mov v5.16b, v9.16b mov v10.16b, v14.16b mov v15.16b, v19.16b cmp x2, #64 b.lt Lopen_tail_64_store ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v4.16b eor v21.16b, v21.16b, v9.16b eor v22.16b, v22.16b, v14.16b eor v23.16b, v23.16b, v19.16b st1 {v20.16b - v23.16b}, [x0], #64 sub x2, x2, #64 b Lopen_main_loop Lopen_tail: cbz x2, Lopen_finalize lsr x4, x2, #4 // How many whole blocks we have to hash cmp x2, #64 b.le Lopen_tail_64 cmp x2, #128 b.le Lopen_tail_128 Lopen_tail_192: // We need three more blocks mov v0.16b, v24.16b mov v1.16b, v24.16b mov v2.16b, v24.16b mov v5.16b, v28.16b mov v6.16b, v28.16b mov v7.16b, v28.16b mov v10.16b, v29.16b mov v11.16b, v29.16b mov v12.16b, v29.16b mov v15.16b, v30.16b mov v16.16b, v30.16b mov v17.16b, v30.16b eor v23.16b, v23.16b, v23.16b eor v21.16b, v21.16b, v21.16b ins v23.s[0], v25.s[0] ins v21.d[0], x15 add v22.4s, v23.4s, v21.4s add v21.4s, v22.4s, v21.4s add v15.4s, v15.4s, v21.4s add v16.4s, v16.4s, v23.4s add v17.4s, v17.4s, v22.4s mov x7, #10 subs x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing sub x4, x4, x7 cbz x7, Lopen_tail_192_rounds_no_hash Lopen_tail_192_rounds: ldp x11, x12, [x3], 16 adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most Lopen_tail_192_rounds_no_hash: add v0.4s, v0.4s, v5.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v5.16b, v5.16b, v10.16b eor v6.16b, v6.16b, v11.16b eor v7.16b, v7.16b, v12.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 ushr v5.4s, v6.4s, #20 sli v5.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, v5.4s add v2.4s, v2.4s, v6.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v20.16b, v20.16b, v10.16b eor v5.16b, v5.16b, v11.16b eor 
v6.16b, v6.16b, v12.16b ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v5.4s, #25 sli v6.4s, v5.4s, #7 ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #4 ext v6.16b, v6.16b, v6.16b, #4 ext v7.16b, v7.16b, v7.16b, #4 ext v10.16b, v10.16b, v10.16b, #8 ext v11.16b, v11.16b, v11.16b, #8 ext v12.16b, v12.16b, v12.16b, #8 ext v15.16b, v15.16b, v15.16b, #12 ext v16.16b, v16.16b, v16.16b, #12 ext v17.16b, v17.16b, v17.16b, #12 add v0.4s, v0.4s, v5.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v5.16b, v5.16b, v10.16b eor v6.16b, v6.16b, v11.16b eor v7.16b, v7.16b, v12.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 ushr v5.4s, v6.4s, #20 sli v5.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, v5.4s add v2.4s, v2.4s, v6.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v20.16b, v20.16b, v10.16b eor v5.16b, v5.16b, v11.16b eor v6.16b, v6.16b, v12.16b ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v5.4s, #25 sli v6.4s, v5.4s, #7 ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #12 ext v6.16b, v6.16b, v6.16b, #12 ext v7.16b, v7.16b, v7.16b, #12 ext v10.16b, v10.16b, v10.16b, #8 ext v11.16b, v11.16b, v11.16b, #8 ext v12.16b, v12.16b, v12.16b, #8 ext v15.16b, v15.16b, v15.16b, #4 ext v16.16b, v16.16b, v16.16b, #4 ext v17.16b, v17.16b, v17.16b, #4 subs x7, x7, #1 b.gt Lopen_tail_192_rounds subs x6, x6, #1 b.ge Lopen_tail_192_rounds_no_hash // We hashed 160 bytes at most, may still have 32 bytes left Lopen_tail_192_hash: cbz x4, Lopen_tail_192_hash_done ldp x11, x12, [x3], 16 adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most sub x4, x4, #1 b Lopen_tail_192_hash Lopen_tail_192_hash_done: add v0.4s, v0.4s, v24.4s add v1.4s, v1.4s, v24.4s add v2.4s, v2.4s, v24.4s add v5.4s, v5.4s, v28.4s add v6.4s, v6.4s, v28.4s add v7.4s, v7.4s, v28.4s add v10.4s, v10.4s, v29.4s add v11.4s, v11.4s, v29.4s add v12.4s, v12.4s, v29.4s add v15.4s, v15.4s, v30.4s add v16.4s, v16.4s, v30.4s add v17.4s, v17.4s, v30.4s add v15.4s, v15.4s, v21.4s add v16.4s, v16.4s, v23.4s add v17.4s, v17.4s, v22.4s ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v1.16b eor v21.16b, v21.16b, v6.16b eor v22.16b, v22.16b, v11.16b eor v23.16b, v23.16b, v16.16b st1 {v20.16b - v23.16b}, [x0], #64 ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v2.16b eor v21.16b, 
v21.16b, v7.16b eor v22.16b, v22.16b, v12.16b eor v23.16b, v23.16b, v17.16b st1 {v20.16b - v23.16b}, [x0], #64 sub x2, x2, #128 b Lopen_tail_64_store Lopen_tail_128: // We need two more blocks mov v0.16b, v24.16b mov v1.16b, v24.16b mov v5.16b, v28.16b mov v6.16b, v28.16b mov v10.16b, v29.16b mov v11.16b, v29.16b mov v15.16b, v30.16b mov v16.16b, v30.16b eor v23.16b, v23.16b, v23.16b eor v22.16b, v22.16b, v22.16b ins v23.s[0], v25.s[0] ins v22.d[0], x15 add v22.4s, v22.4s, v23.4s add v15.4s, v15.4s, v22.4s add v16.4s, v16.4s, v23.4s mov x6, #10 sub x6, x6, x4 Lopen_tail_128_rounds: add v0.4s, v0.4s, v5.4s eor v15.16b, v15.16b, v0.16b rev32 v15.8h, v15.8h add v10.4s, v10.4s, v15.4s eor v5.16b, v5.16b, v10.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 add v0.4s, v0.4s, v20.4s eor v15.16b, v15.16b, v0.16b tbl v15.16b, {v15.16b}, v26.16b add v10.4s, v10.4s, v15.4s eor v20.16b, v20.16b, v10.16b ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #4 ext v10.16b, v10.16b, v10.16b, #8 ext v15.16b, v15.16b, v15.16b, #12 add v1.4s, v1.4s, v6.4s eor v16.16b, v16.16b, v1.16b rev32 v16.8h, v16.8h add v11.4s, v11.4s, v16.4s eor v6.16b, v6.16b, v11.16b ushr v20.4s, v6.4s, #20 sli v20.4s, v6.4s, #12 add v1.4s, v1.4s, v20.4s eor v16.16b, v16.16b, v1.16b tbl v16.16b, {v16.16b}, v26.16b add v11.4s, v11.4s, v16.4s eor v20.16b, v20.16b, v11.16b ushr v6.4s, v20.4s, #25 sli v6.4s, v20.4s, #7 ext v6.16b, v6.16b, v6.16b, #4 ext v11.16b, v11.16b, v11.16b, #8 ext v16.16b, v16.16b, v16.16b, #12 add v0.4s, v0.4s, v5.4s eor v15.16b, v15.16b, v0.16b rev32 v15.8h, v15.8h add v10.4s, v10.4s, v15.4s eor v5.16b, v5.16b, v10.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 add v0.4s, v0.4s, v20.4s eor v15.16b, v15.16b, v0.16b tbl v15.16b, {v15.16b}, v26.16b add v10.4s, v10.4s, v15.4s eor v20.16b, v20.16b, v10.16b ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #12 ext v10.16b, v10.16b, v10.16b, #8 ext v15.16b, v15.16b, v15.16b, #4 add v1.4s, v1.4s, v6.4s eor v16.16b, v16.16b, v1.16b rev32 v16.8h, v16.8h add v11.4s, v11.4s, v16.4s eor v6.16b, v6.16b, v11.16b ushr v20.4s, v6.4s, #20 sli v20.4s, v6.4s, #12 add v1.4s, v1.4s, v20.4s eor v16.16b, v16.16b, v1.16b tbl v16.16b, {v16.16b}, v26.16b add v11.4s, v11.4s, v16.4s eor v20.16b, v20.16b, v11.16b ushr v6.4s, v20.4s, #25 sli v6.4s, v20.4s, #7 ext v6.16b, v6.16b, v6.16b, #12 ext v11.16b, v11.16b, v11.16b, #8 ext v16.16b, v16.16b, v16.16b, #4 subs x6, x6, #1 b.gt Lopen_tail_128_rounds cbz x4, Lopen_tail_128_rounds_done subs x4, x4, #1 ldp x11, x12, [x3], 16 adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most b Lopen_tail_128_rounds Lopen_tail_128_rounds_done: add v0.4s, v0.4s, v24.4s add v1.4s, v1.4s, v24.4s add v5.4s, v5.4s, v28.4s add v6.4s, v6.4s, v28.4s add v10.4s, v10.4s, v29.4s add v11.4s, v11.4s, v29.4s add v15.4s, v15.4s, 
v30.4s add v16.4s, v16.4s, v30.4s add v15.4s, v15.4s, v22.4s add v16.4s, v16.4s, v23.4s ld1 {v20.16b - v23.16b}, [x1], #64 eor v20.16b, v20.16b, v1.16b eor v21.16b, v21.16b, v6.16b eor v22.16b, v22.16b, v11.16b eor v23.16b, v23.16b, v16.16b st1 {v20.16b - v23.16b}, [x0], #64 sub x2, x2, #64 b Lopen_tail_64_store Lopen_tail_64: // We just need a single block mov v0.16b, v24.16b mov v5.16b, v28.16b mov v10.16b, v29.16b mov v15.16b, v30.16b eor v23.16b, v23.16b, v23.16b ins v23.s[0], v25.s[0] add v15.4s, v15.4s, v23.4s mov x6, #10 sub x6, x6, x4 Lopen_tail_64_rounds: add v0.4s, v0.4s, v5.4s eor v15.16b, v15.16b, v0.16b rev32 v15.8h, v15.8h add v10.4s, v10.4s, v15.4s eor v5.16b, v5.16b, v10.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 add v0.4s, v0.4s, v20.4s eor v15.16b, v15.16b, v0.16b tbl v15.16b, {v15.16b}, v26.16b add v10.4s, v10.4s, v15.4s eor v20.16b, v20.16b, v10.16b ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #4 ext v10.16b, v10.16b, v10.16b, #8 ext v15.16b, v15.16b, v15.16b, #12 add v0.4s, v0.4s, v5.4s eor v15.16b, v15.16b, v0.16b rev32 v15.8h, v15.8h add v10.4s, v10.4s, v15.4s eor v5.16b, v5.16b, v10.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 add v0.4s, v0.4s, v20.4s eor v15.16b, v15.16b, v0.16b tbl v15.16b, {v15.16b}, v26.16b add v10.4s, v10.4s, v15.4s eor v20.16b, v20.16b, v10.16b ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #12 ext v10.16b, v10.16b, v10.16b, #8 ext v15.16b, v15.16b, v15.16b, #4 subs x6, x6, #1 b.gt Lopen_tail_64_rounds cbz x4, Lopen_tail_64_rounds_done subs x4, x4, #1 ldp x11, x12, [x3], 16 adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most b Lopen_tail_64_rounds Lopen_tail_64_rounds_done: add v0.4s, v0.4s, v24.4s add v5.4s, v5.4s, v28.4s add v10.4s, v10.4s, v29.4s add v15.4s, v15.4s, v30.4s add v15.4s, v15.4s, v23.4s Lopen_tail_64_store: cmp x2, #16 b.lt Lopen_tail_16 ld1 {v20.16b}, [x1], #16 eor v20.16b, v20.16b, v0.16b st1 {v20.16b}, [x0], #16 mov v0.16b, v5.16b mov v5.16b, v10.16b mov v10.16b, v15.16b sub x2, x2, #16 b Lopen_tail_64_store Lopen_tail_16: // Here we handle the last [0,16) bytes that require a padded block cbz x2, Lopen_finalize eor v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask not v22.16b, v20.16b add x7, x1, x2 mov x6, x2 Lopen_tail_16_compose: ext v20.16b, v20.16b, v20.16b, #15 ldrb w11, [x7, #-1]! 
mov v20.b[0], w11 ext v21.16b, v22.16b, v21.16b, #15 subs x2, x2, #1 b.gt Lopen_tail_16_compose and v20.16b, v20.16b, v21.16b // Hash in the final padded block mov x11, v20.d[0] mov x12, v20.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most eor v20.16b, v20.16b, v0.16b Lopen_tail_16_store: umov w11, v20.b[0] strb w11, [x0], #1 ext v20.16b, v20.16b, v20.16b, #1 subs x6, x6, #1 b.gt Lopen_tail_16_store Lopen_finalize: mov x11, v31.d[0] mov x12, v31.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most // Final reduction step sub x12, xzr, x15 orr x13, xzr, #3 subs x11, x8, #-5 sbcs x12, x9, x12 sbcs x13, x10, x13 csel x8, x11, x8, cs csel x9, x12, x9, cs csel x10, x13, x10, cs mov x11, v27.d[0] mov x12, v27.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 stp x8, x9, [x5] ldp d8, d9, [sp, #16] ldp d10, d11, [sp, #32] ldp d12, d13, [sp, #48] ldp d14, d15, [sp, #64] .cfi_restore b15 .cfi_restore b14 .cfi_restore b13 .cfi_restore b12 .cfi_restore b11 .cfi_restore b10 .cfi_restore b9 .cfi_restore b8 ldp x29, x30, [sp], 80 .cfi_restore w29 .cfi_restore w30 .cfi_def_cfa_offset 0 AARCH64_VALIDATE_LINK_REGISTER ret Lopen_128: // On some architectures preparing 5 blocks for small buffers is wasteful eor v25.16b, v25.16b, v25.16b mov x11, #1 mov v25.s[0], w11 mov v0.16b, v24.16b mov v1.16b, v24.16b mov v2.16b, v24.16b mov v5.16b, v28.16b mov v6.16b, v28.16b mov v7.16b, v28.16b mov v10.16b, v29.16b mov v11.16b, v29.16b mov v12.16b, v29.16b mov v17.16b, v30.16b add v15.4s, v17.4s, v25.4s add v16.4s, v15.4s, v25.4s mov x6, #10 Lopen_128_rounds: add v0.4s, v0.4s, v5.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v5.16b, v5.16b, v10.16b eor v6.16b, v6.16b, v11.16b eor v7.16b, v7.16b, v12.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 ushr v5.4s, v6.4s, #20 sli v5.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, 
v5.4s add v2.4s, v2.4s, v6.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v20.16b, v20.16b, v10.16b eor v5.16b, v5.16b, v11.16b eor v6.16b, v6.16b, v12.16b ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v5.4s, #25 sli v6.4s, v5.4s, #7 ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #4 ext v6.16b, v6.16b, v6.16b, #4 ext v7.16b, v7.16b, v7.16b, #4 ext v10.16b, v10.16b, v10.16b, #8 ext v11.16b, v11.16b, v11.16b, #8 ext v12.16b, v12.16b, v12.16b, #8 ext v15.16b, v15.16b, v15.16b, #12 ext v16.16b, v16.16b, v16.16b, #12 ext v17.16b, v17.16b, v17.16b, #12 add v0.4s, v0.4s, v5.4s add v1.4s, v1.4s, v6.4s add v2.4s, v2.4s, v7.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b rev32 v15.8h, v15.8h rev32 v16.8h, v16.8h rev32 v17.8h, v17.8h add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v5.16b, v5.16b, v10.16b eor v6.16b, v6.16b, v11.16b eor v7.16b, v7.16b, v12.16b ushr v20.4s, v5.4s, #20 sli v20.4s, v5.4s, #12 ushr v5.4s, v6.4s, #20 sli v5.4s, v6.4s, #12 ushr v6.4s, v7.4s, #20 sli v6.4s, v7.4s, #12 add v0.4s, v0.4s, v20.4s add v1.4s, v1.4s, v5.4s add v2.4s, v2.4s, v6.4s eor v15.16b, v15.16b, v0.16b eor v16.16b, v16.16b, v1.16b eor v17.16b, v17.16b, v2.16b tbl v15.16b, {v15.16b}, v26.16b tbl v16.16b, {v16.16b}, v26.16b tbl v17.16b, {v17.16b}, v26.16b add v10.4s, v10.4s, v15.4s add v11.4s, v11.4s, v16.4s add v12.4s, v12.4s, v17.4s eor v20.16b, v20.16b, v10.16b eor v5.16b, v5.16b, v11.16b eor v6.16b, v6.16b, v12.16b ushr v7.4s, v6.4s, #25 sli v7.4s, v6.4s, #7 ushr v6.4s, v5.4s, #25 sli v6.4s, v5.4s, #7 ushr v5.4s, v20.4s, #25 sli v5.4s, v20.4s, #7 ext v5.16b, v5.16b, v5.16b, #12 ext v6.16b, v6.16b, v6.16b, #12 ext v7.16b, v7.16b, v7.16b, #12 ext v10.16b, v10.16b, v10.16b, #8 ext v11.16b, v11.16b, v11.16b, #8 ext v12.16b, v12.16b, v12.16b, #8 ext v15.16b, v15.16b, v15.16b, #4 ext v16.16b, v16.16b, v16.16b, #4 ext v17.16b, v17.16b, v17.16b, #4 subs x6, x6, #1 b.hi Lopen_128_rounds add v0.4s, v0.4s, v24.4s add v1.4s, v1.4s, v24.4s add v2.4s, v2.4s, v24.4s add v5.4s, v5.4s, v28.4s add v6.4s, v6.4s, v28.4s add v7.4s, v7.4s, v28.4s add v10.4s, v10.4s, v29.4s add v11.4s, v11.4s, v29.4s add v30.4s, v30.4s, v25.4s add v15.4s, v15.4s, v30.4s add v30.4s, v30.4s, v25.4s add v16.4s, v16.4s, v30.4s and v2.16b, v2.16b, v27.16b mov x16, v2.d[0] // Move the R key to GPRs mov x17, v2.d[1] mov v27.16b, v7.16b // Store the S key bl Lpoly_hash_ad_internal Lopen_128_store: cmp x2, #64 b.lt Lopen_128_store_64 ld1 {v20.16b - v23.16b}, [x1], #64 mov x11, v20.d[0] mov x12, v20.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 
has the value of 4 at most mov x11, v21.d[0] mov x12, v21.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most mov x11, v22.d[0] mov x12, v22.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most mov x11, v23.d[0] mov x12, v23.d[1] adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most eor v20.16b, v20.16b, v0.16b eor v21.16b, v21.16b, v5.16b eor v22.16b, v22.16b, v10.16b eor v23.16b, v23.16b, v15.16b st1 {v20.16b - v23.16b}, [x0], #64 sub x2, x2, #64 mov v0.16b, v1.16b mov v5.16b, v6.16b mov v10.16b, v11.16b mov v15.16b, v16.16b Lopen_128_store_64: lsr x4, x2, #4 mov x3, x1 Lopen_128_hash_64: cbz x4, Lopen_tail_64_store ldp x11, x12, [x3], 16 adds x8, x8, x11 adcs x9, x9, x12 adc x10, x10, x15 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 umulh x12, x8, x16 mul x13, x9, x16 umulh x14, x9, x16 adds x12, x12, x13 mul x13, x10, x16 adc x13, x13, x14 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] umulh x8, x8, x17 adds x12, x12, x14 mul x14, x9, x17 umulh x9, x9, x17 adcs x14, x14, x8 mul x10, x10, x17 adc x10, x10, x9 adds x13, x13, x14 adc x14, x10, xzr and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) and x8, x13, #-4 extr x13, x14, x13, #2 adds x8, x8, x11 lsr x11, x14, #2 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits adds x8, x8, x13 adcs x9, x9, x12 adc x10, x10, xzr // At this point acc2 has the value of 4 at most sub x4, x4, #1 b 
Lopen_128_hash_64 .cfi_endproc #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) ring-0.17.14/pregenerated/chacha20_poly1305_x86_64-elf.S000064400000000000000000005714111046102023000202460ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) .section .rodata .align 64 chacha20_poly1305_constants: .Lchacha20_consts: .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' .Lrol8: .byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 .byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 .Lrol16: .byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 .byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 .Lavx2_init: .long 0,0,0,0 .Lsse_inc: .long 1,0,0,0 .Lavx2_inc: .long 2,0,0,0,2,0,0,0 .Lclamp: .quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC .quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF .align 16 .Land_masks: .byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .text .type poly_hash_ad_internal,@function .align 64 poly_hash_ad_internal: .cfi_startproc .cfi_def_cfa rsp, 8 xorq %r10,%r10 xorq %r11,%r11 xorq %r12,%r12 cmpq $13,%r8 jne .Lhash_ad_loop .Lpoly_fast_tls_ad: movq (%rcx),%r10 movq 5(%rcx),%r11 shrq $24,%r11 movq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 ret .Lhash_ad_loop: cmpq $16,%r8 jb .Lhash_ad_tail addq 0+0(%rcx),%r10 adcq 8+0(%rcx),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 
mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rcx),%rcx subq $16,%r8 jmp .Lhash_ad_loop .Lhash_ad_tail: cmpq $0,%r8 je .Lhash_ad_done xorq %r13,%r13 xorq %r14,%r14 xorq %r15,%r15 addq %r8,%rcx .Lhash_ad_tail_loop: shldq $8,%r13,%r14 shlq $8,%r13 movzbq -1(%rcx),%r15 xorq %r15,%r13 decq %rcx decq %r8 jne .Lhash_ad_tail_loop addq %r13,%r10 adcq %r14,%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 .Lhash_ad_done: ret .cfi_endproc .size poly_hash_ad_internal, .-poly_hash_ad_internal .globl chacha20_poly1305_open_sse41 .hidden chacha20_poly1305_open_sse41 .type chacha20_poly1305_open_sse41,@function .align 64 chacha20_poly1305_open_sse41: .cfi_startproc _CET_ENDBR pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 pushq %r9 .cfi_adjust_cfa_offset 8 .cfi_offset %r9,-64 subq $288 + 0 + 32,%rsp .cfi_adjust_cfa_offset 288 + 32 leaq 32(%rsp),%rbp andq $-32,%rbp movq %rdx,%rbx movq %r8,0+0+32(%rbp) movq %rbx,8+0+32(%rbp) cmpq $128,%rbx jbe .Lopen_sse_128 movdqa .Lchacha20_consts(%rip),%xmm0 movdqu 0(%r9),%xmm4 movdqu 16(%r9),%xmm8 movdqu 32(%r9),%xmm12 movdqa %xmm12,%xmm7 movdqa %xmm4,0+48(%rbp) movdqa %xmm8,0+64(%rbp) movdqa %xmm12,0+96(%rbp) movq $10,%r10 .Lopen_sse_init_rounds: paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld $20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,4 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,12 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld $20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,12 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,4 decq %r10 jne .Lopen_sse_init_rounds paddd .Lchacha20_consts(%rip),%xmm0 paddd 0+48(%rbp),%xmm4 pand .Lclamp(%rip),%xmm0 movdqa %xmm0,0+0(%rbp) movdqa %xmm4,0+16(%rbp) movq %r8,%r8 call poly_hash_ad_internal .Lopen_sse_main_loop: cmpq $256,%rbx jb .Lopen_sse_tail movdqa .Lchacha20_consts(%rip),%xmm0 movdqa 0+48(%rbp),%xmm4 movdqa 0+64(%rbp),%xmm8 movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm5 movdqa 
%xmm8,%xmm9 movdqa %xmm0,%xmm2 movdqa %xmm4,%xmm6 movdqa %xmm8,%xmm10 movdqa %xmm0,%xmm3 movdqa %xmm4,%xmm7 movdqa %xmm8,%xmm11 movdqa 0+96(%rbp),%xmm15 paddd .Lsse_inc(%rip),%xmm15 movdqa %xmm15,%xmm14 paddd .Lsse_inc(%rip),%xmm14 movdqa %xmm14,%xmm13 paddd .Lsse_inc(%rip),%xmm13 movdqa %xmm13,%xmm12 paddd .Lsse_inc(%rip),%xmm12 movdqa %xmm12,0+96(%rbp) movdqa %xmm13,0+112(%rbp) movdqa %xmm14,0+128(%rbp) movdqa %xmm15,0+144(%rbp) movq $4,%rcx movq %rsi,%r8 .Lopen_sse_main_loop_rounds: movdqa %xmm8,0+80(%rbp) movdqa .Lrol16(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 paddd %xmm4,%xmm0 pxor %xmm3,%xmm15 pxor %xmm2,%xmm14 pxor %xmm1,%xmm13 pxor %xmm0,%xmm12 .byte 102,69,15,56,0,248 .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 paddd %xmm12,%xmm8 pxor %xmm11,%xmm7 addq 0+0(%r8),%r10 adcq 8+0(%r8),%r11 adcq $1,%r12 leaq 16(%r8),%r8 pxor %xmm10,%xmm6 pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm7 pxor %xmm8,%xmm7 movdqa %xmm6,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm6 pxor %xmm8,%xmm6 movdqa %xmm5,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm5 pxor %xmm8,%xmm5 movdqa %xmm4,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm4 pxor %xmm8,%xmm4 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movdqa .Lrol8(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 paddd %xmm4,%xmm0 pxor %xmm3,%xmm15 pxor %xmm2,%xmm14 pxor %xmm1,%xmm13 pxor %xmm0,%xmm12 .byte 102,69,15,56,0,248 .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 paddd %xmm12,%xmm8 pxor %xmm11,%xmm7 pxor %xmm10,%xmm6 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm7 pxor %xmm8,%xmm7 movdqa %xmm6,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm6 pxor %xmm8,%xmm6 movdqa %xmm5,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm5 pxor %xmm8,%xmm5 movdqa %xmm4,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm4 pxor %xmm8,%xmm4 movdqa 0+80(%rbp),%xmm8 imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 .byte 102,15,58,15,255,4 .byte 102,69,15,58,15,219,8 .byte 102,69,15,58,15,255,12 .byte 102,15,58,15,246,4 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,12 .byte 102,15,58,15,237,4 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,12 .byte 102,15,58,15,228,4 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,12 movdqa %xmm8,0+80(%rbp) movdqa .Lrol16(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 paddd %xmm4,%xmm0 pxor %xmm3,%xmm15 pxor %xmm2,%xmm14 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 pxor %xmm1,%xmm13 pxor %xmm0,%xmm12 .byte 102,69,15,56,0,248 .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 paddd %xmm12,%xmm8 pxor %xmm11,%xmm7 pxor %xmm10,%xmm6 pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm7 pxor %xmm8,%xmm7 movdqa 
%xmm6,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm6 pxor %xmm8,%xmm6 movdqa %xmm5,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm5 pxor %xmm8,%xmm5 movdqa %xmm4,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm4 pxor %xmm8,%xmm4 movdqa .Lrol8(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 paddd %xmm4,%xmm0 pxor %xmm3,%xmm15 pxor %xmm2,%xmm14 pxor %xmm1,%xmm13 pxor %xmm0,%xmm12 .byte 102,69,15,56,0,248 .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 paddd %xmm12,%xmm8 pxor %xmm11,%xmm7 pxor %xmm10,%xmm6 pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm7 pxor %xmm8,%xmm7 movdqa %xmm6,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm6 pxor %xmm8,%xmm6 movdqa %xmm5,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm5 pxor %xmm8,%xmm5 movdqa %xmm4,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm4 pxor %xmm8,%xmm4 movdqa 0+80(%rbp),%xmm8 .byte 102,15,58,15,255,12 .byte 102,69,15,58,15,219,8 .byte 102,69,15,58,15,255,4 .byte 102,15,58,15,246,12 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,4 .byte 102,15,58,15,237,12 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,4 .byte 102,15,58,15,228,12 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,4 decq %rcx jge .Lopen_sse_main_loop_rounds addq 0+0(%r8),%r10 adcq 8+0(%r8),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%r8),%r8 cmpq $-6,%rcx jg .Lopen_sse_main_loop_rounds paddd .Lchacha20_consts(%rip),%xmm3 paddd 0+48(%rbp),%xmm7 paddd 0+64(%rbp),%xmm11 paddd 0+144(%rbp),%xmm15 paddd .Lchacha20_consts(%rip),%xmm2 paddd 0+48(%rbp),%xmm6 paddd 0+64(%rbp),%xmm10 paddd 0+128(%rbp),%xmm14 paddd .Lchacha20_consts(%rip),%xmm1 paddd 0+48(%rbp),%xmm5 paddd 0+64(%rbp),%xmm9 paddd 0+112(%rbp),%xmm13 paddd .Lchacha20_consts(%rip),%xmm0 paddd 0+48(%rbp),%xmm4 paddd 0+64(%rbp),%xmm8 paddd 0+96(%rbp),%xmm12 movdqa %xmm12,0+80(%rbp) movdqu 0 + 0(%rsi),%xmm12 pxor %xmm3,%xmm12 movdqu %xmm12,0 + 0(%rdi) movdqu 16 + 0(%rsi),%xmm12 pxor %xmm7,%xmm12 movdqu %xmm12,16 + 0(%rdi) movdqu 32 + 0(%rsi),%xmm12 pxor %xmm11,%xmm12 movdqu %xmm12,32 + 0(%rdi) movdqu 48 + 0(%rsi),%xmm12 pxor %xmm15,%xmm12 movdqu %xmm12,48 + 0(%rdi) movdqu 0 + 64(%rsi),%xmm3 movdqu 16 + 64(%rsi),%xmm7 movdqu 32 + 64(%rsi),%xmm11 movdqu 48 + 64(%rsi),%xmm15 pxor %xmm3,%xmm2 pxor %xmm7,%xmm6 pxor %xmm11,%xmm10 pxor %xmm14,%xmm15 movdqu %xmm2,0 + 64(%rdi) movdqu %xmm6,16 + 64(%rdi) movdqu %xmm10,32 + 64(%rdi) movdqu %xmm15,48 + 64(%rdi) movdqu 0 + 128(%rsi),%xmm3 movdqu 16 + 128(%rsi),%xmm7 movdqu 32 + 128(%rsi),%xmm11 movdqu 48 + 128(%rsi),%xmm15 pxor %xmm3,%xmm1 pxor %xmm7,%xmm5 pxor %xmm11,%xmm9 pxor %xmm13,%xmm15 movdqu %xmm1,0 + 128(%rdi) movdqu %xmm5,16 + 128(%rdi) movdqu %xmm9,32 + 128(%rdi) movdqu %xmm15,48 + 128(%rdi) movdqu 0 + 192(%rsi),%xmm3 movdqu 16 + 192(%rsi),%xmm7 movdqu 32 + 192(%rsi),%xmm11 movdqu 48 + 192(%rsi),%xmm15 pxor %xmm3,%xmm0 pxor %xmm7,%xmm4 pxor %xmm11,%xmm8 pxor 0+80(%rbp),%xmm15 movdqu %xmm0,0 + 
192(%rdi) movdqu %xmm4,16 + 192(%rdi) movdqu %xmm8,32 + 192(%rdi) movdqu %xmm15,48 + 192(%rdi) leaq 256(%rsi),%rsi leaq 256(%rdi),%rdi subq $256,%rbx jmp .Lopen_sse_main_loop .Lopen_sse_tail: testq %rbx,%rbx jz .Lopen_sse_finalize cmpq $192,%rbx ja .Lopen_sse_tail_256 cmpq $128,%rbx ja .Lopen_sse_tail_192 cmpq $64,%rbx ja .Lopen_sse_tail_128 movdqa .Lchacha20_consts(%rip),%xmm0 movdqa 0+48(%rbp),%xmm4 movdqa 0+64(%rbp),%xmm8 movdqa 0+96(%rbp),%xmm12 paddd .Lsse_inc(%rip),%xmm12 movdqa %xmm12,0+96(%rbp) xorq %r8,%r8 movq %rbx,%rcx cmpq $16,%rcx jb .Lopen_sse_tail_64_rounds .Lopen_sse_tail_64_rounds_and_x1hash: addq 0+0(%rsi,%r8,1),%r10 adcq 8+0(%rsi,%r8,1),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 subq $16,%rcx .Lopen_sse_tail_64_rounds: addq $16,%r8 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld $20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,4 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,12 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld $20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,12 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,4 cmpq $16,%rcx jae .Lopen_sse_tail_64_rounds_and_x1hash cmpq $160,%r8 jne .Lopen_sse_tail_64_rounds paddd .Lchacha20_consts(%rip),%xmm0 paddd 0+48(%rbp),%xmm4 paddd 0+64(%rbp),%xmm8 paddd 0+96(%rbp),%xmm12 jmp .Lopen_sse_tail_64_dec_loop .Lopen_sse_tail_128: movdqa .Lchacha20_consts(%rip),%xmm0 movdqa 0+48(%rbp),%xmm4 movdqa 0+64(%rbp),%xmm8 movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm5 movdqa %xmm8,%xmm9 movdqa 0+96(%rbp),%xmm13 paddd .Lsse_inc(%rip),%xmm13 movdqa %xmm13,%xmm12 paddd .Lsse_inc(%rip),%xmm12 movdqa %xmm12,0+96(%rbp) movdqa %xmm13,0+112(%rbp) movq %rbx,%rcx andq $-16,%rcx xorq %r8,%r8 .Lopen_sse_tail_128_rounds_and_x1hash: addq 0+0(%rsi,%r8,1),%r10 adcq 8+0(%rsi,%r8,1),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 .Lopen_sse_tail_128_rounds: addq $16,%r8 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld 
$20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,4 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,12 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb .Lrol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $12,%xmm3 psrld $20,%xmm5 pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb .Lrol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $7,%xmm3 psrld $25,%xmm5 pxor %xmm3,%xmm5 .byte 102,15,58,15,237,4 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,12 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld $20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,12 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,4 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb .Lrol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $12,%xmm3 psrld $20,%xmm5 pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb .Lrol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $7,%xmm3 psrld $25,%xmm5 pxor %xmm3,%xmm5 .byte 102,15,58,15,237,12 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,4 cmpq %rcx,%r8 jb .Lopen_sse_tail_128_rounds_and_x1hash cmpq $160,%r8 jne .Lopen_sse_tail_128_rounds paddd .Lchacha20_consts(%rip),%xmm1 paddd 0+48(%rbp),%xmm5 paddd 0+64(%rbp),%xmm9 paddd 0+112(%rbp),%xmm13 paddd .Lchacha20_consts(%rip),%xmm0 paddd 0+48(%rbp),%xmm4 paddd 0+64(%rbp),%xmm8 paddd 0+96(%rbp),%xmm12 movdqu 0 + 0(%rsi),%xmm3 movdqu 16 + 0(%rsi),%xmm7 movdqu 32 + 0(%rsi),%xmm11 movdqu 48 + 0(%rsi),%xmm15 pxor %xmm3,%xmm1 pxor %xmm7,%xmm5 pxor %xmm11,%xmm9 pxor %xmm13,%xmm15 movdqu %xmm1,0 + 0(%rdi) movdqu %xmm5,16 + 0(%rdi) movdqu %xmm9,32 + 0(%rdi) movdqu %xmm15,48 + 0(%rdi) subq $64,%rbx leaq 64(%rsi),%rsi leaq 64(%rdi),%rdi jmp .Lopen_sse_tail_64_dec_loop .Lopen_sse_tail_192: movdqa .Lchacha20_consts(%rip),%xmm0 movdqa 0+48(%rbp),%xmm4 movdqa 0+64(%rbp),%xmm8 movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm5 movdqa %xmm8,%xmm9 movdqa %xmm0,%xmm2 movdqa %xmm4,%xmm6 movdqa %xmm8,%xmm10 movdqa 0+96(%rbp),%xmm14 paddd .Lsse_inc(%rip),%xmm14 movdqa %xmm14,%xmm13 paddd .Lsse_inc(%rip),%xmm13 movdqa %xmm13,%xmm12 paddd .Lsse_inc(%rip),%xmm12 movdqa %xmm12,0+96(%rbp) movdqa %xmm13,0+112(%rbp) movdqa %xmm14,0+128(%rbp) movq %rbx,%rcx movq $160,%r8 cmpq $160,%rcx cmovgq %r8,%rcx andq $-16,%rcx xorq %r8,%r8 .Lopen_sse_tail_192_rounds_and_x1hash: addq 0+0(%rsi,%r8,1),%r10 adcq 8+0(%rsi,%r8,1),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 .Lopen_sse_tail_192_rounds: addq $16,%r8 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 
pslld $12,%xmm3 psrld $20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,4 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,12 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb .Lrol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $12,%xmm3 psrld $20,%xmm5 pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb .Lrol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $7,%xmm3 psrld $25,%xmm5 pxor %xmm3,%xmm5 .byte 102,15,58,15,237,4 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,12 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb .Lrol16(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 pslld $12,%xmm3 psrld $20,%xmm6 pxor %xmm3,%xmm6 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb .Lrol8(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 pslld $7,%xmm3 psrld $25,%xmm6 pxor %xmm3,%xmm6 .byte 102,15,58,15,246,4 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,12 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld $20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,12 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,4 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb .Lrol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $12,%xmm3 psrld $20,%xmm5 pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb .Lrol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $7,%xmm3 psrld $25,%xmm5 pxor %xmm3,%xmm5 .byte 102,15,58,15,237,12 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,4 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb .Lrol16(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 pslld $12,%xmm3 psrld $20,%xmm6 pxor %xmm3,%xmm6 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb .Lrol8(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 pslld $7,%xmm3 psrld $25,%xmm6 pxor %xmm3,%xmm6 .byte 102,15,58,15,246,12 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,4 cmpq %rcx,%r8 jb .Lopen_sse_tail_192_rounds_and_x1hash cmpq $160,%r8 jne .Lopen_sse_tail_192_rounds cmpq $176,%rbx jb .Lopen_sse_tail_192_finish addq 0+160(%rsi),%r10 adcq 8+160(%rsi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 cmpq $192,%rbx jb .Lopen_sse_tail_192_finish addq 0+176(%rsi),%r10 adcq 8+176(%rsi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq 
%r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 .Lopen_sse_tail_192_finish: paddd .Lchacha20_consts(%rip),%xmm2 paddd 0+48(%rbp),%xmm6 paddd 0+64(%rbp),%xmm10 paddd 0+128(%rbp),%xmm14 paddd .Lchacha20_consts(%rip),%xmm1 paddd 0+48(%rbp),%xmm5 paddd 0+64(%rbp),%xmm9 paddd 0+112(%rbp),%xmm13 paddd .Lchacha20_consts(%rip),%xmm0 paddd 0+48(%rbp),%xmm4 paddd 0+64(%rbp),%xmm8 paddd 0+96(%rbp),%xmm12 movdqu 0 + 0(%rsi),%xmm3 movdqu 16 + 0(%rsi),%xmm7 movdqu 32 + 0(%rsi),%xmm11 movdqu 48 + 0(%rsi),%xmm15 pxor %xmm3,%xmm2 pxor %xmm7,%xmm6 pxor %xmm11,%xmm10 pxor %xmm14,%xmm15 movdqu %xmm2,0 + 0(%rdi) movdqu %xmm6,16 + 0(%rdi) movdqu %xmm10,32 + 0(%rdi) movdqu %xmm15,48 + 0(%rdi) movdqu 0 + 64(%rsi),%xmm3 movdqu 16 + 64(%rsi),%xmm7 movdqu 32 + 64(%rsi),%xmm11 movdqu 48 + 64(%rsi),%xmm15 pxor %xmm3,%xmm1 pxor %xmm7,%xmm5 pxor %xmm11,%xmm9 pxor %xmm13,%xmm15 movdqu %xmm1,0 + 64(%rdi) movdqu %xmm5,16 + 64(%rdi) movdqu %xmm9,32 + 64(%rdi) movdqu %xmm15,48 + 64(%rdi) subq $128,%rbx leaq 128(%rsi),%rsi leaq 128(%rdi),%rdi jmp .Lopen_sse_tail_64_dec_loop .Lopen_sse_tail_256: movdqa .Lchacha20_consts(%rip),%xmm0 movdqa 0+48(%rbp),%xmm4 movdqa 0+64(%rbp),%xmm8 movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm5 movdqa %xmm8,%xmm9 movdqa %xmm0,%xmm2 movdqa %xmm4,%xmm6 movdqa %xmm8,%xmm10 movdqa %xmm0,%xmm3 movdqa %xmm4,%xmm7 movdqa %xmm8,%xmm11 movdqa 0+96(%rbp),%xmm15 paddd .Lsse_inc(%rip),%xmm15 movdqa %xmm15,%xmm14 paddd .Lsse_inc(%rip),%xmm14 movdqa %xmm14,%xmm13 paddd .Lsse_inc(%rip),%xmm13 movdqa %xmm13,%xmm12 paddd .Lsse_inc(%rip),%xmm12 movdqa %xmm12,0+96(%rbp) movdqa %xmm13,0+112(%rbp) movdqa %xmm14,0+128(%rbp) movdqa %xmm15,0+144(%rbp) xorq %r8,%r8 .Lopen_sse_tail_256_rounds_and_x1hash: addq 0+0(%rsi,%r8,1),%r10 adcq 8+0(%rsi,%r8,1),%r11 adcq $1,%r12 movdqa %xmm11,0+80(%rbp) paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm11 pslld $12,%xmm11 psrld $20,%xmm4 pxor %xmm11,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm11 pslld $7,%xmm11 psrld $25,%xmm4 pxor %xmm11,%xmm4 .byte 102,15,58,15,228,4 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,12 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb .Lrol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm11 pslld $12,%xmm11 psrld $20,%xmm5 pxor %xmm11,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb .Lrol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm11 pslld $7,%xmm11 psrld $25,%xmm5 pxor %xmm11,%xmm5 .byte 102,15,58,15,237,4 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,12 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb .Lrol16(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm11 pslld $12,%xmm11 psrld $20,%xmm6 pxor %xmm11,%xmm6 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb .Lrol8(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm11 pslld $7,%xmm11 psrld $25,%xmm6 pxor %xmm11,%xmm6 .byte 102,15,58,15,246,4 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,12 movdqa 0+80(%rbp),%xmm11 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movdqa %xmm9,0+80(%rbp) paddd %xmm7,%xmm3 pxor %xmm3,%xmm15 pshufb .Lrol16(%rip),%xmm15 paddd %xmm15,%xmm11 pxor %xmm11,%xmm7 
movdqa %xmm7,%xmm9 pslld $12,%xmm9 psrld $20,%xmm7 pxor %xmm9,%xmm7 paddd %xmm7,%xmm3 pxor %xmm3,%xmm15 pshufb .Lrol8(%rip),%xmm15 paddd %xmm15,%xmm11 pxor %xmm11,%xmm7 movdqa %xmm7,%xmm9 pslld $7,%xmm9 psrld $25,%xmm7 pxor %xmm9,%xmm7 .byte 102,15,58,15,255,4 .byte 102,69,15,58,15,219,8 .byte 102,69,15,58,15,255,12 movdqa 0+80(%rbp),%xmm9 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx movdqa %xmm11,0+80(%rbp) paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm11 pslld $12,%xmm11 psrld $20,%xmm4 pxor %xmm11,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm11 pslld $7,%xmm11 psrld $25,%xmm4 pxor %xmm11,%xmm4 .byte 102,15,58,15,228,12 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,4 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb .Lrol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm11 pslld $12,%xmm11 psrld $20,%xmm5 pxor %xmm11,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb .Lrol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm11 pslld $7,%xmm11 psrld $25,%xmm5 pxor %xmm11,%xmm5 .byte 102,15,58,15,237,12 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,4 imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb .Lrol16(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm11 pslld $12,%xmm11 psrld $20,%xmm6 pxor %xmm11,%xmm6 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb .Lrol8(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm11 pslld $7,%xmm11 psrld $25,%xmm6 pxor %xmm11,%xmm6 .byte 102,15,58,15,246,12 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,4 movdqa 0+80(%rbp),%xmm11 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 movdqa %xmm9,0+80(%rbp) paddd %xmm7,%xmm3 pxor %xmm3,%xmm15 pshufb .Lrol16(%rip),%xmm15 paddd %xmm15,%xmm11 pxor %xmm11,%xmm7 movdqa %xmm7,%xmm9 pslld $12,%xmm9 psrld $20,%xmm7 pxor %xmm9,%xmm7 paddd %xmm7,%xmm3 pxor %xmm3,%xmm15 pshufb .Lrol8(%rip),%xmm15 paddd %xmm15,%xmm11 pxor %xmm11,%xmm7 movdqa %xmm7,%xmm9 pslld $7,%xmm9 psrld $25,%xmm7 pxor %xmm9,%xmm7 .byte 102,15,58,15,255,12 .byte 102,69,15,58,15,219,8 .byte 102,69,15,58,15,255,4 movdqa 0+80(%rbp),%xmm9 addq $16,%r8 cmpq $160,%r8 jb .Lopen_sse_tail_256_rounds_and_x1hash movq %rbx,%rcx andq $-16,%rcx .Lopen_sse_tail_256_hash: addq 0+0(%rsi,%r8,1),%r10 adcq 8+0(%rsi,%r8,1),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 addq $16,%r8 cmpq %rcx,%r8 jb .Lopen_sse_tail_256_hash paddd .Lchacha20_consts(%rip),%xmm3 paddd 0+48(%rbp),%xmm7 paddd 0+64(%rbp),%xmm11 paddd 0+144(%rbp),%xmm15 paddd .Lchacha20_consts(%rip),%xmm2 paddd 0+48(%rbp),%xmm6 paddd 0+64(%rbp),%xmm10 paddd 0+128(%rbp),%xmm14 paddd 
.Lchacha20_consts(%rip),%xmm1 paddd 0+48(%rbp),%xmm5 paddd 0+64(%rbp),%xmm9 paddd 0+112(%rbp),%xmm13 paddd .Lchacha20_consts(%rip),%xmm0 paddd 0+48(%rbp),%xmm4 paddd 0+64(%rbp),%xmm8 paddd 0+96(%rbp),%xmm12 movdqa %xmm12,0+80(%rbp) movdqu 0 + 0(%rsi),%xmm12 pxor %xmm3,%xmm12 movdqu %xmm12,0 + 0(%rdi) movdqu 16 + 0(%rsi),%xmm12 pxor %xmm7,%xmm12 movdqu %xmm12,16 + 0(%rdi) movdqu 32 + 0(%rsi),%xmm12 pxor %xmm11,%xmm12 movdqu %xmm12,32 + 0(%rdi) movdqu 48 + 0(%rsi),%xmm12 pxor %xmm15,%xmm12 movdqu %xmm12,48 + 0(%rdi) movdqu 0 + 64(%rsi),%xmm3 movdqu 16 + 64(%rsi),%xmm7 movdqu 32 + 64(%rsi),%xmm11 movdqu 48 + 64(%rsi),%xmm15 pxor %xmm3,%xmm2 pxor %xmm7,%xmm6 pxor %xmm11,%xmm10 pxor %xmm14,%xmm15 movdqu %xmm2,0 + 64(%rdi) movdqu %xmm6,16 + 64(%rdi) movdqu %xmm10,32 + 64(%rdi) movdqu %xmm15,48 + 64(%rdi) movdqu 0 + 128(%rsi),%xmm3 movdqu 16 + 128(%rsi),%xmm7 movdqu 32 + 128(%rsi),%xmm11 movdqu 48 + 128(%rsi),%xmm15 pxor %xmm3,%xmm1 pxor %xmm7,%xmm5 pxor %xmm11,%xmm9 pxor %xmm13,%xmm15 movdqu %xmm1,0 + 128(%rdi) movdqu %xmm5,16 + 128(%rdi) movdqu %xmm9,32 + 128(%rdi) movdqu %xmm15,48 + 128(%rdi) movdqa 0+80(%rbp),%xmm12 subq $192,%rbx leaq 192(%rsi),%rsi leaq 192(%rdi),%rdi .Lopen_sse_tail_64_dec_loop: cmpq $16,%rbx jb .Lopen_sse_tail_16_init subq $16,%rbx movdqu (%rsi),%xmm3 pxor %xmm3,%xmm0 movdqu %xmm0,(%rdi) leaq 16(%rsi),%rsi leaq 16(%rdi),%rdi movdqa %xmm4,%xmm0 movdqa %xmm8,%xmm4 movdqa %xmm12,%xmm8 jmp .Lopen_sse_tail_64_dec_loop .Lopen_sse_tail_16_init: movdqa %xmm0,%xmm1 .Lopen_sse_tail_16: testq %rbx,%rbx jz .Lopen_sse_finalize pxor %xmm3,%xmm3 leaq -1(%rsi,%rbx,1),%rsi movq %rbx,%r8 .Lopen_sse_tail_16_compose: pslldq $1,%xmm3 pinsrb $0,(%rsi),%xmm3 subq $1,%rsi subq $1,%r8 jnz .Lopen_sse_tail_16_compose .byte 102,73,15,126,221 pextrq $1,%xmm3,%r14 pxor %xmm1,%xmm3 .Lopen_sse_tail_16_extract: pextrb $0,%xmm3,(%rdi) psrldq $1,%xmm3 addq $1,%rdi subq $1,%rbx jne .Lopen_sse_tail_16_extract addq %r13,%r10 adcq %r14,%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 .Lopen_sse_finalize: addq 0+0+32(%rbp),%r10 adcq 8+0+32(%rbp),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 movq %r10,%r13 movq %r11,%r14 movq %r12,%r15 subq $-5,%r10 sbbq $-1,%r11 sbbq $3,%r12 cmovcq %r13,%r10 cmovcq %r14,%r11 cmovcq %r15,%r12 addq 0+0+16(%rbp),%r10 adcq 8+0+16(%rbp),%r11 .cfi_remember_state addq $288 + 0 + 32,%rsp .cfi_adjust_cfa_offset -(288 + 32) popq %r9 .cfi_adjust_cfa_offset -8 .cfi_restore %r9 movq %r10,(%r9) movq %r11,8(%r9) popq %r15 .cfi_adjust_cfa_offset -8 .cfi_restore %r15 popq 
%r14 .cfi_adjust_cfa_offset -8 .cfi_restore %r14 popq %r13 .cfi_adjust_cfa_offset -8 .cfi_restore %r13 popq %r12 .cfi_adjust_cfa_offset -8 .cfi_restore %r12 popq %rbx .cfi_adjust_cfa_offset -8 .cfi_restore %rbx popq %rbp .cfi_adjust_cfa_offset -8 .cfi_restore %rbp ret .Lopen_sse_128: .cfi_restore_state movdqu .Lchacha20_consts(%rip),%xmm0 movdqa %xmm0,%xmm1 movdqa %xmm0,%xmm2 movdqu 0(%r9),%xmm4 movdqa %xmm4,%xmm5 movdqa %xmm4,%xmm6 movdqu 16(%r9),%xmm8 movdqa %xmm8,%xmm9 movdqa %xmm8,%xmm10 movdqu 32(%r9),%xmm12 movdqa %xmm12,%xmm13 paddd .Lsse_inc(%rip),%xmm13 movdqa %xmm13,%xmm14 paddd .Lsse_inc(%rip),%xmm14 movdqa %xmm4,%xmm7 movdqa %xmm8,%xmm11 movdqa %xmm13,%xmm15 movq $10,%r10 .Lopen_sse_128_rounds: paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld $20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,4 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,12 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb .Lrol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $12,%xmm3 psrld $20,%xmm5 pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb .Lrol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $7,%xmm3 psrld $25,%xmm5 pxor %xmm3,%xmm5 .byte 102,15,58,15,237,4 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,12 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb .Lrol16(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 pslld $12,%xmm3 psrld $20,%xmm6 pxor %xmm3,%xmm6 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb .Lrol8(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 pslld $7,%xmm3 psrld $25,%xmm6 pxor %xmm3,%xmm6 .byte 102,15,58,15,246,4 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,12 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld $20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,12 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,4 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb .Lrol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $12,%xmm3 psrld $20,%xmm5 pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb .Lrol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $7,%xmm3 psrld $25,%xmm5 pxor %xmm3,%xmm5 .byte 102,15,58,15,237,12 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,4 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb .Lrol16(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 pslld $12,%xmm3 psrld $20,%xmm6 pxor %xmm3,%xmm6 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb .Lrol8(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 pslld $7,%xmm3 psrld $25,%xmm6 pxor %xmm3,%xmm6 .byte 102,15,58,15,246,12 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,4 decq %r10 jnz .Lopen_sse_128_rounds paddd .Lchacha20_consts(%rip),%xmm0 paddd .Lchacha20_consts(%rip),%xmm1 paddd .Lchacha20_consts(%rip),%xmm2 paddd %xmm7,%xmm4 paddd %xmm7,%xmm5 paddd %xmm7,%xmm6 paddd %xmm11,%xmm9 paddd %xmm11,%xmm10 paddd %xmm15,%xmm13 paddd .Lsse_inc(%rip),%xmm15 paddd %xmm15,%xmm14 pand .Lclamp(%rip),%xmm0 
movdqa %xmm0,0+0(%rbp) movdqa %xmm4,0+16(%rbp) movq %r8,%r8 call poly_hash_ad_internal .Lopen_sse_128_xor_hash: cmpq $16,%rbx jb .Lopen_sse_tail_16 subq $16,%rbx addq 0+0(%rsi),%r10 adcq 8+0(%rsi),%r11 adcq $1,%r12 movdqu 0(%rsi),%xmm3 pxor %xmm3,%xmm1 movdqu %xmm1,0(%rdi) leaq 16(%rsi),%rsi leaq 16(%rdi),%rdi movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 movdqa %xmm5,%xmm1 movdqa %xmm9,%xmm5 movdqa %xmm13,%xmm9 movdqa %xmm2,%xmm13 movdqa %xmm6,%xmm2 movdqa %xmm10,%xmm6 movdqa %xmm14,%xmm10 jmp .Lopen_sse_128_xor_hash .size chacha20_poly1305_open_sse41, .-chacha20_poly1305_open_sse41 .cfi_endproc .globl chacha20_poly1305_seal_sse41 .hidden chacha20_poly1305_seal_sse41 .type chacha20_poly1305_seal_sse41,@function .align 64 chacha20_poly1305_seal_sse41: .cfi_startproc _CET_ENDBR pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 pushq %r9 .cfi_adjust_cfa_offset 8 .cfi_offset %r9,-64 subq $288 + 0 + 32,%rsp .cfi_adjust_cfa_offset 288 + 32 leaq 32(%rsp),%rbp andq $-32,%rbp movq 56(%r9),%rbx addq %rdx,%rbx movq %r8,0+0+32(%rbp) movq %rbx,8+0+32(%rbp) movq %rdx,%rbx cmpq $128,%rbx jbe .Lseal_sse_128 movdqa .Lchacha20_consts(%rip),%xmm0 movdqu 0(%r9),%xmm4 movdqu 16(%r9),%xmm8 movdqu 32(%r9),%xmm12 movdqa %xmm0,%xmm1 movdqa %xmm0,%xmm2 movdqa %xmm0,%xmm3 movdqa %xmm4,%xmm5 movdqa %xmm4,%xmm6 movdqa %xmm4,%xmm7 movdqa %xmm8,%xmm9 movdqa %xmm8,%xmm10 movdqa %xmm8,%xmm11 movdqa %xmm12,%xmm15 paddd .Lsse_inc(%rip),%xmm12 movdqa %xmm12,%xmm14 paddd .Lsse_inc(%rip),%xmm12 movdqa %xmm12,%xmm13 paddd .Lsse_inc(%rip),%xmm12 movdqa %xmm4,0+48(%rbp) movdqa %xmm8,0+64(%rbp) movdqa %xmm12,0+96(%rbp) movdqa %xmm13,0+112(%rbp) movdqa %xmm14,0+128(%rbp) movdqa %xmm15,0+144(%rbp) movq $10,%r10 .Lseal_sse_init_rounds: movdqa %xmm8,0+80(%rbp) movdqa .Lrol16(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 paddd %xmm4,%xmm0 pxor %xmm3,%xmm15 pxor %xmm2,%xmm14 pxor %xmm1,%xmm13 pxor %xmm0,%xmm12 .byte 102,69,15,56,0,248 .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 paddd %xmm12,%xmm8 pxor %xmm11,%xmm7 pxor %xmm10,%xmm6 pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm7 pxor %xmm8,%xmm7 movdqa %xmm6,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm6 pxor %xmm8,%xmm6 movdqa %xmm5,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm5 pxor %xmm8,%xmm5 movdqa %xmm4,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm4 pxor %xmm8,%xmm4 movdqa .Lrol8(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 paddd %xmm4,%xmm0 pxor %xmm3,%xmm15 pxor %xmm2,%xmm14 pxor %xmm1,%xmm13 pxor %xmm0,%xmm12 .byte 102,69,15,56,0,248 .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 
102,69,15,56,0,224 movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 paddd %xmm12,%xmm8 pxor %xmm11,%xmm7 pxor %xmm10,%xmm6 pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm7 pxor %xmm8,%xmm7 movdqa %xmm6,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm6 pxor %xmm8,%xmm6 movdqa %xmm5,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm5 pxor %xmm8,%xmm5 movdqa %xmm4,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm4 pxor %xmm8,%xmm4 movdqa 0+80(%rbp),%xmm8 .byte 102,15,58,15,255,4 .byte 102,69,15,58,15,219,8 .byte 102,69,15,58,15,255,12 .byte 102,15,58,15,246,4 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,12 .byte 102,15,58,15,237,4 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,12 .byte 102,15,58,15,228,4 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,12 movdqa %xmm8,0+80(%rbp) movdqa .Lrol16(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 paddd %xmm4,%xmm0 pxor %xmm3,%xmm15 pxor %xmm2,%xmm14 pxor %xmm1,%xmm13 pxor %xmm0,%xmm12 .byte 102,69,15,56,0,248 .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 paddd %xmm12,%xmm8 pxor %xmm11,%xmm7 pxor %xmm10,%xmm6 pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm7 pxor %xmm8,%xmm7 movdqa %xmm6,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm6 pxor %xmm8,%xmm6 movdqa %xmm5,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm5 pxor %xmm8,%xmm5 movdqa %xmm4,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm4 pxor %xmm8,%xmm4 movdqa .Lrol8(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 paddd %xmm4,%xmm0 pxor %xmm3,%xmm15 pxor %xmm2,%xmm14 pxor %xmm1,%xmm13 pxor %xmm0,%xmm12 .byte 102,69,15,56,0,248 .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 paddd %xmm12,%xmm8 pxor %xmm11,%xmm7 pxor %xmm10,%xmm6 pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm7 pxor %xmm8,%xmm7 movdqa %xmm6,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm6 pxor %xmm8,%xmm6 movdqa %xmm5,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm5 pxor %xmm8,%xmm5 movdqa %xmm4,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm4 pxor %xmm8,%xmm4 movdqa 0+80(%rbp),%xmm8 .byte 102,15,58,15,255,12 .byte 102,69,15,58,15,219,8 .byte 102,69,15,58,15,255,4 .byte 102,15,58,15,246,12 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,4 .byte 102,15,58,15,237,12 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,4 .byte 102,15,58,15,228,12 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,4 decq %r10 jnz .Lseal_sse_init_rounds paddd .Lchacha20_consts(%rip),%xmm3 paddd 0+48(%rbp),%xmm7 paddd 0+64(%rbp),%xmm11 paddd 0+144(%rbp),%xmm15 paddd .Lchacha20_consts(%rip),%xmm2 paddd 0+48(%rbp),%xmm6 paddd 0+64(%rbp),%xmm10 paddd 0+128(%rbp),%xmm14 paddd .Lchacha20_consts(%rip),%xmm1 paddd 0+48(%rbp),%xmm5 paddd 0+64(%rbp),%xmm9 paddd 0+112(%rbp),%xmm13 paddd .Lchacha20_consts(%rip),%xmm0 paddd 0+48(%rbp),%xmm4 paddd 0+64(%rbp),%xmm8 paddd 0+96(%rbp),%xmm12 pand .Lclamp(%rip),%xmm3 movdqa %xmm3,0+0(%rbp) movdqa %xmm7,0+16(%rbp) movq %r8,%r8 call poly_hash_ad_internal movdqu 0 + 0(%rsi),%xmm3 movdqu 16 + 0(%rsi),%xmm7 movdqu 32 + 0(%rsi),%xmm11 movdqu 48 + 0(%rsi),%xmm15 pxor %xmm3,%xmm2 pxor %xmm7,%xmm6 pxor %xmm11,%xmm10 pxor %xmm14,%xmm15 movdqu %xmm2,0 + 0(%rdi) movdqu %xmm6,16 + 0(%rdi) movdqu %xmm10,32 + 
0(%rdi) movdqu %xmm15,48 + 0(%rdi) movdqu 0 + 64(%rsi),%xmm3 movdqu 16 + 64(%rsi),%xmm7 movdqu 32 + 64(%rsi),%xmm11 movdqu 48 + 64(%rsi),%xmm15 pxor %xmm3,%xmm1 pxor %xmm7,%xmm5 pxor %xmm11,%xmm9 pxor %xmm13,%xmm15 movdqu %xmm1,0 + 64(%rdi) movdqu %xmm5,16 + 64(%rdi) movdqu %xmm9,32 + 64(%rdi) movdqu %xmm15,48 + 64(%rdi) cmpq $192,%rbx ja .Lseal_sse_main_init movq $128,%rcx subq $128,%rbx leaq 128(%rsi),%rsi jmp .Lseal_sse_128_tail_hash .Lseal_sse_main_init: movdqu 0 + 128(%rsi),%xmm3 movdqu 16 + 128(%rsi),%xmm7 movdqu 32 + 128(%rsi),%xmm11 movdqu 48 + 128(%rsi),%xmm15 pxor %xmm3,%xmm0 pxor %xmm7,%xmm4 pxor %xmm11,%xmm8 pxor %xmm12,%xmm15 movdqu %xmm0,0 + 128(%rdi) movdqu %xmm4,16 + 128(%rdi) movdqu %xmm8,32 + 128(%rdi) movdqu %xmm15,48 + 128(%rdi) movq $192,%rcx subq $192,%rbx leaq 192(%rsi),%rsi movq $2,%rcx movq $8,%r8 cmpq $64,%rbx jbe .Lseal_sse_tail_64 cmpq $128,%rbx jbe .Lseal_sse_tail_128 cmpq $192,%rbx jbe .Lseal_sse_tail_192 .Lseal_sse_main_loop: movdqa .Lchacha20_consts(%rip),%xmm0 movdqa 0+48(%rbp),%xmm4 movdqa 0+64(%rbp),%xmm8 movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm5 movdqa %xmm8,%xmm9 movdqa %xmm0,%xmm2 movdqa %xmm4,%xmm6 movdqa %xmm8,%xmm10 movdqa %xmm0,%xmm3 movdqa %xmm4,%xmm7 movdqa %xmm8,%xmm11 movdqa 0+96(%rbp),%xmm15 paddd .Lsse_inc(%rip),%xmm15 movdqa %xmm15,%xmm14 paddd .Lsse_inc(%rip),%xmm14 movdqa %xmm14,%xmm13 paddd .Lsse_inc(%rip),%xmm13 movdqa %xmm13,%xmm12 paddd .Lsse_inc(%rip),%xmm12 movdqa %xmm12,0+96(%rbp) movdqa %xmm13,0+112(%rbp) movdqa %xmm14,0+128(%rbp) movdqa %xmm15,0+144(%rbp) .align 32 .Lseal_sse_main_rounds: movdqa %xmm8,0+80(%rbp) movdqa .Lrol16(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 paddd %xmm4,%xmm0 pxor %xmm3,%xmm15 pxor %xmm2,%xmm14 pxor %xmm1,%xmm13 pxor %xmm0,%xmm12 .byte 102,69,15,56,0,248 .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 paddd %xmm12,%xmm8 pxor %xmm11,%xmm7 addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 pxor %xmm10,%xmm6 pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm7 pxor %xmm8,%xmm7 movdqa %xmm6,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm6 pxor %xmm8,%xmm6 movdqa %xmm5,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm5 pxor %xmm8,%xmm5 movdqa %xmm4,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm4 pxor %xmm8,%xmm4 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movdqa .Lrol8(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 paddd %xmm4,%xmm0 pxor %xmm3,%xmm15 pxor %xmm2,%xmm14 pxor %xmm1,%xmm13 pxor %xmm0,%xmm12 .byte 102,69,15,56,0,248 .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 paddd %xmm12,%xmm8 pxor %xmm11,%xmm7 pxor %xmm10,%xmm6 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm7 pxor %xmm8,%xmm7 movdqa %xmm6,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm6 pxor %xmm8,%xmm6 movdqa %xmm5,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm5 pxor %xmm8,%xmm5 movdqa %xmm4,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm4 pxor %xmm8,%xmm4 movdqa 0+80(%rbp),%xmm8 imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 .byte 102,15,58,15,255,4 .byte 
102,69,15,58,15,219,8 .byte 102,69,15,58,15,255,12 .byte 102,15,58,15,246,4 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,12 .byte 102,15,58,15,237,4 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,12 .byte 102,15,58,15,228,4 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,12 movdqa %xmm8,0+80(%rbp) movdqa .Lrol16(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 paddd %xmm4,%xmm0 pxor %xmm3,%xmm15 pxor %xmm2,%xmm14 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 pxor %xmm1,%xmm13 pxor %xmm0,%xmm12 .byte 102,69,15,56,0,248 .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 paddd %xmm12,%xmm8 pxor %xmm11,%xmm7 pxor %xmm10,%xmm6 pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm7 pxor %xmm8,%xmm7 movdqa %xmm6,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm6 pxor %xmm8,%xmm6 movdqa %xmm5,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm5 pxor %xmm8,%xmm5 movdqa %xmm4,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm4 pxor %xmm8,%xmm4 movdqa .Lrol8(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 paddd %xmm4,%xmm0 pxor %xmm3,%xmm15 pxor %xmm2,%xmm14 pxor %xmm1,%xmm13 pxor %xmm0,%xmm12 .byte 102,69,15,56,0,248 .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 paddd %xmm12,%xmm8 pxor %xmm11,%xmm7 pxor %xmm10,%xmm6 pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm7 pxor %xmm8,%xmm7 movdqa %xmm6,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm6 pxor %xmm8,%xmm6 movdqa %xmm5,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm5 pxor %xmm8,%xmm5 movdqa %xmm4,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm4 pxor %xmm8,%xmm4 movdqa 0+80(%rbp),%xmm8 .byte 102,15,58,15,255,12 .byte 102,69,15,58,15,219,8 .byte 102,69,15,58,15,255,4 .byte 102,15,58,15,246,12 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,4 .byte 102,15,58,15,237,12 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,4 .byte 102,15,58,15,228,12 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,4 leaq 16(%rdi),%rdi decq %r8 jge .Lseal_sse_main_rounds addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rdi),%rdi decq %rcx jg .Lseal_sse_main_rounds paddd .Lchacha20_consts(%rip),%xmm3 paddd 0+48(%rbp),%xmm7 paddd 0+64(%rbp),%xmm11 paddd 0+144(%rbp),%xmm15 paddd .Lchacha20_consts(%rip),%xmm2 paddd 0+48(%rbp),%xmm6 paddd 0+64(%rbp),%xmm10 paddd 0+128(%rbp),%xmm14 paddd .Lchacha20_consts(%rip),%xmm1 paddd 0+48(%rbp),%xmm5 paddd 0+64(%rbp),%xmm9 paddd 0+112(%rbp),%xmm13 paddd .Lchacha20_consts(%rip),%xmm0 paddd 0+48(%rbp),%xmm4 paddd 0+64(%rbp),%xmm8 paddd 0+96(%rbp),%xmm12 movdqa %xmm14,0+80(%rbp) movdqa 
%xmm14,0+80(%rbp) movdqu 0 + 0(%rsi),%xmm14 pxor %xmm3,%xmm14 movdqu %xmm14,0 + 0(%rdi) movdqu 16 + 0(%rsi),%xmm14 pxor %xmm7,%xmm14 movdqu %xmm14,16 + 0(%rdi) movdqu 32 + 0(%rsi),%xmm14 pxor %xmm11,%xmm14 movdqu %xmm14,32 + 0(%rdi) movdqu 48 + 0(%rsi),%xmm14 pxor %xmm15,%xmm14 movdqu %xmm14,48 + 0(%rdi) movdqa 0+80(%rbp),%xmm14 movdqu 0 + 64(%rsi),%xmm3 movdqu 16 + 64(%rsi),%xmm7 movdqu 32 + 64(%rsi),%xmm11 movdqu 48 + 64(%rsi),%xmm15 pxor %xmm3,%xmm2 pxor %xmm7,%xmm6 pxor %xmm11,%xmm10 pxor %xmm14,%xmm15 movdqu %xmm2,0 + 64(%rdi) movdqu %xmm6,16 + 64(%rdi) movdqu %xmm10,32 + 64(%rdi) movdqu %xmm15,48 + 64(%rdi) movdqu 0 + 128(%rsi),%xmm3 movdqu 16 + 128(%rsi),%xmm7 movdqu 32 + 128(%rsi),%xmm11 movdqu 48 + 128(%rsi),%xmm15 pxor %xmm3,%xmm1 pxor %xmm7,%xmm5 pxor %xmm11,%xmm9 pxor %xmm13,%xmm15 movdqu %xmm1,0 + 128(%rdi) movdqu %xmm5,16 + 128(%rdi) movdqu %xmm9,32 + 128(%rdi) movdqu %xmm15,48 + 128(%rdi) cmpq $256,%rbx ja .Lseal_sse_main_loop_xor movq $192,%rcx subq $192,%rbx leaq 192(%rsi),%rsi jmp .Lseal_sse_128_tail_hash .Lseal_sse_main_loop_xor: movdqu 0 + 192(%rsi),%xmm3 movdqu 16 + 192(%rsi),%xmm7 movdqu 32 + 192(%rsi),%xmm11 movdqu 48 + 192(%rsi),%xmm15 pxor %xmm3,%xmm0 pxor %xmm7,%xmm4 pxor %xmm11,%xmm8 pxor %xmm12,%xmm15 movdqu %xmm0,0 + 192(%rdi) movdqu %xmm4,16 + 192(%rdi) movdqu %xmm8,32 + 192(%rdi) movdqu %xmm15,48 + 192(%rdi) leaq 256(%rsi),%rsi subq $256,%rbx movq $6,%rcx movq $4,%r8 cmpq $192,%rbx jg .Lseal_sse_main_loop movq %rbx,%rcx testq %rbx,%rbx je .Lseal_sse_128_tail_hash movq $6,%rcx cmpq $128,%rbx ja .Lseal_sse_tail_192 cmpq $64,%rbx ja .Lseal_sse_tail_128 .Lseal_sse_tail_64: movdqa .Lchacha20_consts(%rip),%xmm0 movdqa 0+48(%rbp),%xmm4 movdqa 0+64(%rbp),%xmm8 movdqa 0+96(%rbp),%xmm12 paddd .Lsse_inc(%rip),%xmm12 movdqa %xmm12,0+96(%rbp) .Lseal_sse_tail_64_rounds_and_x2hash: addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rdi),%rdi .Lseal_sse_tail_64_rounds_and_x1hash: paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld $20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,4 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,12 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld $20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,12 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,4 addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax 
movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rdi),%rdi decq %rcx jg .Lseal_sse_tail_64_rounds_and_x2hash decq %r8 jge .Lseal_sse_tail_64_rounds_and_x1hash paddd .Lchacha20_consts(%rip),%xmm0 paddd 0+48(%rbp),%xmm4 paddd 0+64(%rbp),%xmm8 paddd 0+96(%rbp),%xmm12 jmp .Lseal_sse_128_tail_xor .Lseal_sse_tail_128: movdqa .Lchacha20_consts(%rip),%xmm0 movdqa 0+48(%rbp),%xmm4 movdqa 0+64(%rbp),%xmm8 movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm5 movdqa %xmm8,%xmm9 movdqa 0+96(%rbp),%xmm13 paddd .Lsse_inc(%rip),%xmm13 movdqa %xmm13,%xmm12 paddd .Lsse_inc(%rip),%xmm12 movdqa %xmm12,0+96(%rbp) movdqa %xmm13,0+112(%rbp) .Lseal_sse_tail_128_rounds_and_x2hash: addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rdi),%rdi .Lseal_sse_tail_128_rounds_and_x1hash: paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld $20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,4 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,12 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb .Lrol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $12,%xmm3 psrld $20,%xmm5 pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb .Lrol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $7,%xmm3 psrld $25,%xmm5 pxor %xmm3,%xmm5 .byte 102,15,58,15,237,4 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,12 addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld $20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,12 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,4 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb 
.Lrol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $12,%xmm3 psrld $20,%xmm5 pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb .Lrol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $7,%xmm3 psrld $25,%xmm5 pxor %xmm3,%xmm5 .byte 102,15,58,15,237,12 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,4 leaq 16(%rdi),%rdi decq %rcx jg .Lseal_sse_tail_128_rounds_and_x2hash decq %r8 jge .Lseal_sse_tail_128_rounds_and_x1hash paddd .Lchacha20_consts(%rip),%xmm1 paddd 0+48(%rbp),%xmm5 paddd 0+64(%rbp),%xmm9 paddd 0+112(%rbp),%xmm13 paddd .Lchacha20_consts(%rip),%xmm0 paddd 0+48(%rbp),%xmm4 paddd 0+64(%rbp),%xmm8 paddd 0+96(%rbp),%xmm12 movdqu 0 + 0(%rsi),%xmm3 movdqu 16 + 0(%rsi),%xmm7 movdqu 32 + 0(%rsi),%xmm11 movdqu 48 + 0(%rsi),%xmm15 pxor %xmm3,%xmm1 pxor %xmm7,%xmm5 pxor %xmm11,%xmm9 pxor %xmm13,%xmm15 movdqu %xmm1,0 + 0(%rdi) movdqu %xmm5,16 + 0(%rdi) movdqu %xmm9,32 + 0(%rdi) movdqu %xmm15,48 + 0(%rdi) movq $64,%rcx subq $64,%rbx leaq 64(%rsi),%rsi jmp .Lseal_sse_128_tail_hash .Lseal_sse_tail_192: movdqa .Lchacha20_consts(%rip),%xmm0 movdqa 0+48(%rbp),%xmm4 movdqa 0+64(%rbp),%xmm8 movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm5 movdqa %xmm8,%xmm9 movdqa %xmm0,%xmm2 movdqa %xmm4,%xmm6 movdqa %xmm8,%xmm10 movdqa 0+96(%rbp),%xmm14 paddd .Lsse_inc(%rip),%xmm14 movdqa %xmm14,%xmm13 paddd .Lsse_inc(%rip),%xmm13 movdqa %xmm13,%xmm12 paddd .Lsse_inc(%rip),%xmm12 movdqa %xmm12,0+96(%rbp) movdqa %xmm13,0+112(%rbp) movdqa %xmm14,0+128(%rbp) .Lseal_sse_tail_192_rounds_and_x2hash: addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rdi),%rdi .Lseal_sse_tail_192_rounds_and_x1hash: paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld $20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,4 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,12 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb .Lrol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $12,%xmm3 psrld $20,%xmm5 pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb .Lrol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $7,%xmm3 psrld $25,%xmm5 pxor %xmm3,%xmm5 .byte 102,15,58,15,237,4 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,12 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb .Lrol16(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 pslld $12,%xmm3 psrld $20,%xmm6 pxor %xmm3,%xmm6 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb .Lrol8(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 pslld $7,%xmm3 psrld $25,%xmm6 pxor %xmm3,%xmm6 .byte 102,15,58,15,246,4 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,12 addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq 
%rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld $20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,12 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,4 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb .Lrol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $12,%xmm3 psrld $20,%xmm5 pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb .Lrol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $7,%xmm3 psrld $25,%xmm5 pxor %xmm3,%xmm5 .byte 102,15,58,15,237,12 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,4 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb .Lrol16(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 pslld $12,%xmm3 psrld $20,%xmm6 pxor %xmm3,%xmm6 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb .Lrol8(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 pslld $7,%xmm3 psrld $25,%xmm6 pxor %xmm3,%xmm6 .byte 102,15,58,15,246,12 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,4 leaq 16(%rdi),%rdi decq %rcx jg .Lseal_sse_tail_192_rounds_and_x2hash decq %r8 jge .Lseal_sse_tail_192_rounds_and_x1hash paddd .Lchacha20_consts(%rip),%xmm2 paddd 0+48(%rbp),%xmm6 paddd 0+64(%rbp),%xmm10 paddd 0+128(%rbp),%xmm14 paddd .Lchacha20_consts(%rip),%xmm1 paddd 0+48(%rbp),%xmm5 paddd 0+64(%rbp),%xmm9 paddd 0+112(%rbp),%xmm13 paddd .Lchacha20_consts(%rip),%xmm0 paddd 0+48(%rbp),%xmm4 paddd 0+64(%rbp),%xmm8 paddd 0+96(%rbp),%xmm12 movdqu 0 + 0(%rsi),%xmm3 movdqu 16 + 0(%rsi),%xmm7 movdqu 32 + 0(%rsi),%xmm11 movdqu 48 + 0(%rsi),%xmm15 pxor %xmm3,%xmm2 pxor %xmm7,%xmm6 pxor %xmm11,%xmm10 pxor %xmm14,%xmm15 movdqu %xmm2,0 + 0(%rdi) movdqu %xmm6,16 + 0(%rdi) movdqu %xmm10,32 + 0(%rdi) movdqu %xmm15,48 + 0(%rdi) movdqu 0 + 64(%rsi),%xmm3 movdqu 16 + 64(%rsi),%xmm7 movdqu 32 + 64(%rsi),%xmm11 movdqu 48 + 64(%rsi),%xmm15 pxor %xmm3,%xmm1 pxor %xmm7,%xmm5 pxor %xmm11,%xmm9 pxor %xmm13,%xmm15 movdqu %xmm1,0 + 64(%rdi) movdqu %xmm5,16 + 64(%rdi) movdqu %xmm9,32 + 64(%rdi) movdqu %xmm15,48 + 64(%rdi) movq $128,%rcx subq $128,%rbx leaq 128(%rsi),%rsi .Lseal_sse_128_tail_hash: cmpq $16,%rcx jb .Lseal_sse_128_tail_xor addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 subq $16,%rcx leaq 16(%rdi),%rdi jmp 
.Lseal_sse_128_tail_hash .Lseal_sse_128_tail_xor: cmpq $16,%rbx jb .Lseal_sse_tail_16 subq $16,%rbx movdqu 0(%rsi),%xmm3 pxor %xmm3,%xmm0 movdqu %xmm0,0(%rdi) addq 0(%rdi),%r10 adcq 8(%rdi),%r11 adcq $1,%r12 leaq 16(%rsi),%rsi leaq 16(%rdi),%rdi movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 movdqa %xmm4,%xmm0 movdqa %xmm8,%xmm4 movdqa %xmm12,%xmm8 movdqa %xmm1,%xmm12 movdqa %xmm5,%xmm1 movdqa %xmm9,%xmm5 movdqa %xmm13,%xmm9 jmp .Lseal_sse_128_tail_xor .Lseal_sse_tail_16: testq %rbx,%rbx jz .Lprocess_blocks_of_extra_in movq %rbx,%r8 movq %rbx,%rcx leaq -1(%rsi,%rbx,1),%rsi pxor %xmm15,%xmm15 .Lseal_sse_tail_16_compose: pslldq $1,%xmm15 pinsrb $0,(%rsi),%xmm15 leaq -1(%rsi),%rsi decq %rcx jne .Lseal_sse_tail_16_compose pxor %xmm0,%xmm15 movq %rbx,%rcx movdqu %xmm15,%xmm0 .Lseal_sse_tail_16_extract: pextrb $0,%xmm0,(%rdi) psrldq $1,%xmm0 addq $1,%rdi subq $1,%rcx jnz .Lseal_sse_tail_16_extract movq 288 + 0 + 32(%rsp),%r9 movq 56(%r9),%r14 movq 48(%r9),%r13 testq %r14,%r14 jz .Lprocess_partial_block movq $16,%r15 subq %rbx,%r15 cmpq %r15,%r14 jge .Lload_extra_in movq %r14,%r15 .Lload_extra_in: leaq -1(%r13,%r15,1),%rsi addq %r15,%r13 subq %r15,%r14 movq %r13,48(%r9) movq %r14,56(%r9) addq %r15,%r8 pxor %xmm11,%xmm11 .Lload_extra_load_loop: pslldq $1,%xmm11 pinsrb $0,(%rsi),%xmm11 leaq -1(%rsi),%rsi subq $1,%r15 jnz .Lload_extra_load_loop movq %rbx,%r15 .Lload_extra_shift_loop: pslldq $1,%xmm11 subq $1,%r15 jnz .Lload_extra_shift_loop leaq .Land_masks(%rip),%r15 shlq $4,%rbx pand -16(%r15,%rbx,1),%xmm15 por %xmm11,%xmm15 .byte 102,77,15,126,253 pextrq $1,%xmm15,%r14 addq %r13,%r10 adcq %r14,%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 .Lprocess_blocks_of_extra_in: movq 288+32+0 (%rsp),%r9 movq 48(%r9),%rsi movq 56(%r9),%r8 movq %r8,%rcx shrq $4,%r8 .Lprocess_extra_hash_loop: jz process_extra_in_trailer addq 0+0(%rsi),%r10 adcq 8+0(%rsi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rsi),%rsi subq $1,%r8 jmp .Lprocess_extra_hash_loop process_extra_in_trailer: andq 
$15,%rcx movq %rcx,%rbx jz .Ldo_length_block leaq -1(%rsi,%rcx,1),%rsi .Lprocess_extra_in_trailer_load: pslldq $1,%xmm15 pinsrb $0,(%rsi),%xmm15 leaq -1(%rsi),%rsi subq $1,%rcx jnz .Lprocess_extra_in_trailer_load .Lprocess_partial_block: leaq .Land_masks(%rip),%r15 shlq $4,%rbx pand -16(%r15,%rbx,1),%xmm15 .byte 102,77,15,126,253 pextrq $1,%xmm15,%r14 addq %r13,%r10 adcq %r14,%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 .Ldo_length_block: addq 0+0+32(%rbp),%r10 adcq 8+0+32(%rbp),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 movq %r10,%r13 movq %r11,%r14 movq %r12,%r15 subq $-5,%r10 sbbq $-1,%r11 sbbq $3,%r12 cmovcq %r13,%r10 cmovcq %r14,%r11 cmovcq %r15,%r12 addq 0+0+16(%rbp),%r10 adcq 8+0+16(%rbp),%r11 .cfi_remember_state addq $288 + 0 + 32,%rsp .cfi_adjust_cfa_offset -(288 + 32) popq %r9 .cfi_adjust_cfa_offset -8 .cfi_restore %r9 movq %r10,(%r9) movq %r11,8(%r9) popq %r15 .cfi_adjust_cfa_offset -8 .cfi_restore %r15 popq %r14 .cfi_adjust_cfa_offset -8 .cfi_restore %r14 popq %r13 .cfi_adjust_cfa_offset -8 .cfi_restore %r13 popq %r12 .cfi_adjust_cfa_offset -8 .cfi_restore %r12 popq %rbx .cfi_adjust_cfa_offset -8 .cfi_restore %rbx popq %rbp .cfi_adjust_cfa_offset -8 .cfi_restore %rbp ret .Lseal_sse_128: .cfi_restore_state movdqu .Lchacha20_consts(%rip),%xmm0 movdqa %xmm0,%xmm1 movdqa %xmm0,%xmm2 movdqu 0(%r9),%xmm4 movdqa %xmm4,%xmm5 movdqa %xmm4,%xmm6 movdqu 16(%r9),%xmm8 movdqa %xmm8,%xmm9 movdqa %xmm8,%xmm10 movdqu 32(%r9),%xmm14 movdqa %xmm14,%xmm12 paddd .Lsse_inc(%rip),%xmm12 movdqa %xmm12,%xmm13 paddd .Lsse_inc(%rip),%xmm13 movdqa %xmm4,%xmm7 movdqa %xmm8,%xmm11 movdqa %xmm12,%xmm15 movq $10,%r10 .Lseal_sse_128_rounds: paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld $20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,4 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,12 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb .Lrol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $12,%xmm3 psrld $20,%xmm5 pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb .Lrol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $7,%xmm3 psrld $25,%xmm5 pxor %xmm3,%xmm5 .byte 102,15,58,15,237,4 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,12 paddd %xmm6,%xmm2 pxor 
%xmm2,%xmm14 pshufb .Lrol16(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 pslld $12,%xmm3 psrld $20,%xmm6 pxor %xmm3,%xmm6 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb .Lrol8(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 pslld $7,%xmm3 psrld $25,%xmm6 pxor %xmm3,%xmm6 .byte 102,15,58,15,246,4 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,12 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld $20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb .Lrol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,12 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,4 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb .Lrol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $12,%xmm3 psrld $20,%xmm5 pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb .Lrol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $7,%xmm3 psrld $25,%xmm5 pxor %xmm3,%xmm5 .byte 102,15,58,15,237,12 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,4 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb .Lrol16(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 pslld $12,%xmm3 psrld $20,%xmm6 pxor %xmm3,%xmm6 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb .Lrol8(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 pslld $7,%xmm3 psrld $25,%xmm6 pxor %xmm3,%xmm6 .byte 102,15,58,15,246,12 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,4 decq %r10 jnz .Lseal_sse_128_rounds paddd .Lchacha20_consts(%rip),%xmm0 paddd .Lchacha20_consts(%rip),%xmm1 paddd .Lchacha20_consts(%rip),%xmm2 paddd %xmm7,%xmm4 paddd %xmm7,%xmm5 paddd %xmm7,%xmm6 paddd %xmm11,%xmm8 paddd %xmm11,%xmm9 paddd %xmm15,%xmm12 paddd .Lsse_inc(%rip),%xmm15 paddd %xmm15,%xmm13 pand .Lclamp(%rip),%xmm2 movdqa %xmm2,0+0(%rbp) movdqa %xmm6,0+16(%rbp) movq %r8,%r8 call poly_hash_ad_internal jmp .Lseal_sse_128_tail_xor .size chacha20_poly1305_seal_sse41, .-chacha20_poly1305_seal_sse41 .cfi_endproc .globl chacha20_poly1305_open_avx2 .hidden chacha20_poly1305_open_avx2 .type chacha20_poly1305_open_avx2,@function .align 64 chacha20_poly1305_open_avx2: .cfi_startproc _CET_ENDBR pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 pushq %r9 .cfi_adjust_cfa_offset 8 .cfi_offset %r9,-64 subq $288 + 0 + 32,%rsp .cfi_adjust_cfa_offset 288 + 32 leaq 32(%rsp),%rbp andq $-32,%rbp movq %rdx,%rbx movq %r8,0+0+32(%rbp) movq %rbx,8+0+32(%rbp) vzeroupper vmovdqa .Lchacha20_consts(%rip),%ymm0 vbroadcasti128 0(%r9),%ymm4 vbroadcasti128 16(%r9),%ymm8 vbroadcasti128 32(%r9),%ymm12 vpaddd .Lavx2_init(%rip),%ymm12,%ymm12 cmpq $192,%rbx jbe .Lopen_avx2_192 cmpq $320,%rbx jbe .Lopen_avx2_320 vmovdqa %ymm4,0+64(%rbp) vmovdqa %ymm8,0+96(%rbp) vmovdqa %ymm12,0+160(%rbp) movq $10,%r10 .Lopen_avx2_init_rounds: vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol8(%rip),%ymm12,%ymm12 
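/* ChaCha20 double-round, AVX2 form: the 16- and 8-bit rotations use the .Lrol16/.Lrol8 vpshufb tables, the 12- and 7-bit rotations use vpslld/vpsrld plus vpxor, and vpalignr re-diagonalizes the B/C/D rows between the column and diagonal halves of each round. */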
vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $12,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm4,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $4,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm4,%ymm4,%ymm4 decq %r10 jne .Lopen_avx2_init_rounds vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 vpaddd 0+64(%rbp),%ymm4,%ymm4 vpaddd 0+96(%rbp),%ymm8,%ymm8 vpaddd 0+160(%rbp),%ymm12,%ymm12 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 vpand .Lclamp(%rip),%ymm3,%ymm3 vmovdqa %ymm3,0+0(%rbp) vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 movq %r8,%r8 call poly_hash_ad_internal xorq %rcx,%rcx .Lopen_avx2_init_hash: addq 0+0(%rsi,%rcx,1),%r10 adcq 8+0(%rsi,%rcx,1),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 addq $16,%rcx cmpq $64,%rcx jne .Lopen_avx2_init_hash vpxor 0(%rsi),%ymm0,%ymm0 vpxor 32(%rsi),%ymm4,%ymm4 vmovdqu %ymm0,0(%rdi) vmovdqu %ymm4,32(%rdi) leaq 64(%rsi),%rsi leaq 64(%rdi),%rdi subq $64,%rbx .Lopen_avx2_main_loop: cmpq $512,%rbx jb .Lopen_avx2_main_loop_done vmovdqa .Lchacha20_consts(%rip),%ymm0 vmovdqa 0+64(%rbp),%ymm4 vmovdqa 0+96(%rbp),%ymm8 vmovdqa %ymm0,%ymm1 vmovdqa %ymm4,%ymm5 vmovdqa %ymm8,%ymm9 vmovdqa %ymm0,%ymm2 vmovdqa %ymm4,%ymm6 vmovdqa %ymm8,%ymm10 vmovdqa %ymm0,%ymm3 vmovdqa %ymm4,%ymm7 vmovdqa %ymm8,%ymm11 vmovdqa .Lavx2_inc(%rip),%ymm12 vpaddd 0+160(%rbp),%ymm12,%ymm15 vpaddd %ymm15,%ymm12,%ymm14 vpaddd %ymm14,%ymm12,%ymm13 vpaddd %ymm13,%ymm12,%ymm12 vmovdqa %ymm15,0+256(%rbp) vmovdqa %ymm14,0+224(%rbp) vmovdqa %ymm13,0+192(%rbp) vmovdqa %ymm12,0+160(%rbp) xorq %rcx,%rcx .Lopen_avx2_main_loop_rounds: addq 0+0(%rsi,%rcx,1),%r10 adcq 8+0(%rsi,%rcx,1),%r11 adcq $1,%r12 vmovdqa %ymm8,0+128(%rbp) vmovdqa .Lrol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld 
$32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $20,%ymm6,%ymm8 vpslld $32-20,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $20,%ymm5,%ymm8 vpslld $32-20,%ymm5,%ymm5 addq %rax,%r15 adcq %rdx,%r9 vpxor %ymm8,%ymm5,%ymm5 vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa .Lrol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 addq 0+16(%rsi,%rcx,1),%r10 adcq 8+16(%rsi,%rcx,1),%r11 adcq $1,%r12 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $25,%ymm7,%ymm8 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 vpslld $32-25,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $25,%ymm6,%ymm8 vpslld $32-25,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $25,%ymm5,%ymm8 vpslld $32-25,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa 0+128(%rbp),%ymm8 vpalignr $4,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $12,%ymm15,%ymm15,%ymm15 vpalignr $4,%ymm6,%ymm6,%ymm6 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $12,%ymm14,%ymm14,%ymm14 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx vpalignr $4,%ymm5,%ymm5,%ymm5 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $12,%ymm13,%ymm13,%ymm13 vpalignr $4,%ymm4,%ymm4,%ymm4 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm12,%ymm12,%ymm12 vmovdqa %ymm8,0+128(%rbp) vmovdqa .Lrol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 addq %rax,%r15 adcq %rdx,%r9 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $20,%ymm6,%ymm8 vpslld $32-20,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 addq 0+32(%rsi,%rcx,1),%r10 adcq 8+32(%rsi,%rcx,1),%r11 adcq $1,%r12 leaq 48(%rcx),%rcx vpsrld $20,%ymm5,%ymm8 vpslld $32-20,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa .Lrol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb 
%ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $25,%ymm7,%ymm8 vpslld $32-25,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $25,%ymm6,%ymm8 vpslld $32-25,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 addq %rax,%r15 adcq %rdx,%r9 vpsrld $25,%ymm5,%ymm8 vpslld $32-25,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa 0+128(%rbp),%ymm8 vpalignr $12,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $4,%ymm15,%ymm15,%ymm15 vpalignr $12,%ymm6,%ymm6,%ymm6 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $4,%ymm14,%ymm14,%ymm14 vpalignr $12,%ymm5,%ymm5,%ymm5 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $4,%ymm13,%ymm13,%ymm13 vpalignr $12,%ymm4,%ymm4,%ymm4 vpalignr $8,%ymm8,%ymm8,%ymm8 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 vpalignr $4,%ymm12,%ymm12,%ymm12 cmpq $60*8,%rcx jne .Lopen_avx2_main_loop_rounds vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3 vpaddd 0+64(%rbp),%ymm7,%ymm7 vpaddd 0+96(%rbp),%ymm11,%ymm11 vpaddd 0+256(%rbp),%ymm15,%ymm15 vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 vpaddd 0+64(%rbp),%ymm6,%ymm6 vpaddd 0+96(%rbp),%ymm10,%ymm10 vpaddd 0+224(%rbp),%ymm14,%ymm14 vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 vpaddd 0+64(%rbp),%ymm5,%ymm5 vpaddd 0+96(%rbp),%ymm9,%ymm9 vpaddd 0+192(%rbp),%ymm13,%ymm13 vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 vpaddd 0+64(%rbp),%ymm4,%ymm4 vpaddd 0+96(%rbp),%ymm8,%ymm8 vpaddd 0+160(%rbp),%ymm12,%ymm12 vmovdqa %ymm0,0+128(%rbp) addq 0+60*8(%rsi),%r10 adcq 8+60*8(%rsi),%r11 adcq $1,%r12 vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 vpxor 0+0(%rsi),%ymm0,%ymm0 vpxor 32+0(%rsi),%ymm3,%ymm3 vpxor 64+0(%rsi),%ymm7,%ymm7 vpxor 96+0(%rsi),%ymm11,%ymm11 vmovdqu %ymm0,0+0(%rdi) vmovdqu %ymm3,32+0(%rdi) vmovdqu %ymm7,64+0(%rdi) vmovdqu %ymm11,96+0(%rdi) vmovdqa 0+128(%rbp),%ymm0 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 vpxor 0+128(%rsi),%ymm3,%ymm3 vpxor 32+128(%rsi),%ymm2,%ymm2 vpxor 64+128(%rsi),%ymm6,%ymm6 vpxor 96+128(%rsi),%ymm10,%ymm10 vmovdqu %ymm3,0+128(%rdi) vmovdqu %ymm2,32+128(%rdi) vmovdqu %ymm6,64+128(%rdi) vmovdqu %ymm10,96+128(%rdi) addq 0+60*8+16(%rsi),%r10 adcq 
8+60*8+16(%rsi),%r11 adcq $1,%r12 vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 vpxor 0+256(%rsi),%ymm3,%ymm3 vpxor 32+256(%rsi),%ymm1,%ymm1 vpxor 64+256(%rsi),%ymm5,%ymm5 vpxor 96+256(%rsi),%ymm9,%ymm9 vmovdqu %ymm3,0+256(%rdi) vmovdqu %ymm1,32+256(%rdi) vmovdqu %ymm5,64+256(%rdi) vmovdqu %ymm9,96+256(%rdi) movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 vperm2i128 $0x13,%ymm0,%ymm4,%ymm4 vperm2i128 $0x02,%ymm8,%ymm12,%ymm0 vperm2i128 $0x13,%ymm8,%ymm12,%ymm8 vpxor 0+384(%rsi),%ymm3,%ymm3 vpxor 32+384(%rsi),%ymm0,%ymm0 vpxor 64+384(%rsi),%ymm4,%ymm4 vpxor 96+384(%rsi),%ymm8,%ymm8 vmovdqu %ymm3,0+384(%rdi) vmovdqu %ymm0,32+384(%rdi) vmovdqu %ymm4,64+384(%rdi) vmovdqu %ymm8,96+384(%rdi) leaq 512(%rsi),%rsi leaq 512(%rdi),%rdi subq $512,%rbx jmp .Lopen_avx2_main_loop .Lopen_avx2_main_loop_done: testq %rbx,%rbx vzeroupper je .Lopen_sse_finalize cmpq $384,%rbx ja .Lopen_avx2_tail_512 cmpq $256,%rbx ja .Lopen_avx2_tail_384 cmpq $128,%rbx ja .Lopen_avx2_tail_256 vmovdqa .Lchacha20_consts(%rip),%ymm0 vmovdqa 0+64(%rbp),%ymm4 vmovdqa 0+96(%rbp),%ymm8 vmovdqa .Lavx2_inc(%rip),%ymm12 vpaddd 0+160(%rbp),%ymm12,%ymm12 vmovdqa %ymm12,0+160(%rbp) xorq %r8,%r8 movq %rbx,%rcx andq $-16,%rcx testq %rcx,%rcx je .Lopen_avx2_tail_128_rounds .Lopen_avx2_tail_128_rounds_and_x1hash: addq 0+0(%rsi,%r8,1),%r10 adcq 8+0(%rsi,%r8,1),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 .Lopen_avx2_tail_128_rounds: addq $16,%r8 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $12,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm4,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $4,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 
vpalignr $12,%ymm4,%ymm4,%ymm4 cmpq %rcx,%r8 jb .Lopen_avx2_tail_128_rounds_and_x1hash cmpq $160,%r8 jne .Lopen_avx2_tail_128_rounds vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 vpaddd 0+64(%rbp),%ymm4,%ymm4 vpaddd 0+96(%rbp),%ymm8,%ymm8 vpaddd 0+160(%rbp),%ymm12,%ymm12 vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 vmovdqa %ymm3,%ymm8 jmp .Lopen_avx2_tail_128_xor .Lopen_avx2_tail_256: vmovdqa .Lchacha20_consts(%rip),%ymm0 vmovdqa 0+64(%rbp),%ymm4 vmovdqa 0+96(%rbp),%ymm8 vmovdqa %ymm0,%ymm1 vmovdqa %ymm4,%ymm5 vmovdqa %ymm8,%ymm9 vmovdqa .Lavx2_inc(%rip),%ymm12 vpaddd 0+160(%rbp),%ymm12,%ymm13 vpaddd %ymm13,%ymm12,%ymm12 vmovdqa %ymm12,0+160(%rbp) vmovdqa %ymm13,0+192(%rbp) movq %rbx,0+128(%rbp) movq %rbx,%rcx subq $128,%rcx shrq $4,%rcx movq $10,%r8 cmpq $10,%rcx cmovgq %r8,%rcx movq %rsi,%rbx xorq %r8,%r8 .Lopen_avx2_tail_256_rounds_and_x1hash: addq 0+0(%rbx),%r10 adcq 8+0(%rbx),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx addq %rax,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rbx),%rbx .Lopen_avx2_tail_256_rounds: vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $12,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb .Lrol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 vpslld $12,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb .Lrol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 vpsrld $25,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpalignr $12,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $4,%ymm5,%ymm5,%ymm5 incq %r8 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $4,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb .Lrol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 vpslld $12,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb .Lrol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 vpsrld $25,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpalignr $4,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr 
$12,%ymm5,%ymm5,%ymm5 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb .Lrol16(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpsrld $20,%ymm6,%ymm3 vpslld $12,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb .Lrol8(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpslld $7,%ymm6,%ymm3 vpsrld $25,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpalignr $4,%ymm14,%ymm14,%ymm14 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $12,%ymm6,%ymm6,%ymm6 cmpq %rcx,%r8 jb .Lopen_avx2_tail_256_rounds_and_x1hash cmpq $10,%r8 jne .Lopen_avx2_tail_256_rounds movq %rbx,%r8 subq %rsi,%rbx movq %rbx,%rcx movq 0+128(%rbp),%rbx .Lopen_avx2_tail_256_hash: addq $16,%rcx cmpq %rbx,%rcx jg .Lopen_avx2_tail_256_done addq 0+0(%r8),%r10 adcq 8+0(%r8),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx addq %rax,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%r8),%r8 jmp .Lopen_avx2_tail_256_hash .Lopen_avx2_tail_256_done: vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 vpaddd 0+64(%rbp),%ymm5,%ymm5 vpaddd 0+96(%rbp),%ymm9,%ymm9 vpaddd 0+192(%rbp),%ymm13,%ymm13 vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 vpaddd 0+64(%rbp),%ymm4,%ymm4 vpaddd 0+96(%rbp),%ymm8,%ymm8 vpaddd 0+160(%rbp),%ymm12,%ymm12 vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 vpxor 0+0(%rsi),%ymm3,%ymm3 vpxor 32+0(%rsi),%ymm1,%ymm1 vpxor 64+0(%rsi),%ymm5,%ymm5 vpxor 96+0(%rsi),%ymm9,%ymm9 vmovdqu %ymm3,0+0(%rdi) vmovdqu %ymm1,32+0(%rdi) vmovdqu %ymm5,64+0(%rdi) vmovdqu %ymm9,96+0(%rdi) vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 vmovdqa %ymm3,%ymm8 leaq 128(%rsi),%rsi leaq 128(%rdi),%rdi subq $128,%rbx jmp .Lopen_avx2_tail_128_xor .Lopen_avx2_tail_384: vmovdqa .Lchacha20_consts(%rip),%ymm0 vmovdqa 0+64(%rbp),%ymm4 vmovdqa 0+96(%rbp),%ymm8 vmovdqa %ymm0,%ymm1 vmovdqa %ymm4,%ymm5 vmovdqa %ymm8,%ymm9 vmovdqa %ymm0,%ymm2 vmovdqa %ymm4,%ymm6 vmovdqa %ymm8,%ymm10 vmovdqa .Lavx2_inc(%rip),%ymm12 vpaddd 0+160(%rbp),%ymm12,%ymm14 vpaddd %ymm14,%ymm12,%ymm13 vpaddd %ymm13,%ymm12,%ymm12 vmovdqa %ymm12,0+160(%rbp) vmovdqa %ymm13,0+192(%rbp) vmovdqa %ymm14,0+224(%rbp) movq %rbx,0+128(%rbp) movq %rbx,%rcx subq $256,%rcx shrq $4,%rcx addq $6,%rcx movq $10,%r8 cmpq $10,%rcx cmovgq %r8,%rcx movq %rsi,%rbx xorq %r8,%r8 .Lopen_avx2_tail_384_rounds_and_x2hash: addq 0+0(%rbx),%r10 adcq 8+0(%rbx),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx addq %rax,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rbx),%rbx .Lopen_avx2_tail_384_rounds_and_x1hash: vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb 
.Lrol16(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpsrld $20,%ymm6,%ymm3 vpslld $12,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb .Lrol8(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpslld $7,%ymm6,%ymm3 vpsrld $25,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpalignr $12,%ymm14,%ymm14,%ymm14 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $4,%ymm6,%ymm6,%ymm6 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb .Lrol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 vpslld $12,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb .Lrol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 vpsrld $25,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpalignr $12,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $4,%ymm5,%ymm5,%ymm5 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $12,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm4,%ymm4,%ymm4 addq 0+0(%rbx),%r10 adcq 8+0(%rbx),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rbx),%rbx incq %r8 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb .Lrol16(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpsrld $20,%ymm6,%ymm3 vpslld $12,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb .Lrol8(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpslld $7,%ymm6,%ymm3 vpsrld $25,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpalignr $4,%ymm14,%ymm14,%ymm14 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $12,%ymm6,%ymm6,%ymm6 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb .Lrol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 vpslld $12,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb .Lrol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 vpsrld $25,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpalignr $4,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $12,%ymm5,%ymm5,%ymm5 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr 
$4,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm4,%ymm4,%ymm4 cmpq %rcx,%r8 jb .Lopen_avx2_tail_384_rounds_and_x2hash cmpq $10,%r8 jne .Lopen_avx2_tail_384_rounds_and_x1hash movq %rbx,%r8 subq %rsi,%rbx movq %rbx,%rcx movq 0+128(%rbp),%rbx .Lopen_avx2_384_tail_hash: addq $16,%rcx cmpq %rbx,%rcx jg .Lopen_avx2_384_tail_done addq 0+0(%r8),%r10 adcq 8+0(%r8),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx addq %rax,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%r8),%r8 jmp .Lopen_avx2_384_tail_hash .Lopen_avx2_384_tail_done: vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 vpaddd 0+64(%rbp),%ymm6,%ymm6 vpaddd 0+96(%rbp),%ymm10,%ymm10 vpaddd 0+224(%rbp),%ymm14,%ymm14 vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 vpaddd 0+64(%rbp),%ymm5,%ymm5 vpaddd 0+96(%rbp),%ymm9,%ymm9 vpaddd 0+192(%rbp),%ymm13,%ymm13 vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 vpaddd 0+64(%rbp),%ymm4,%ymm4 vpaddd 0+96(%rbp),%ymm8,%ymm8 vpaddd 0+160(%rbp),%ymm12,%ymm12 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 vpxor 0+0(%rsi),%ymm3,%ymm3 vpxor 32+0(%rsi),%ymm2,%ymm2 vpxor 64+0(%rsi),%ymm6,%ymm6 vpxor 96+0(%rsi),%ymm10,%ymm10 vmovdqu %ymm3,0+0(%rdi) vmovdqu %ymm2,32+0(%rdi) vmovdqu %ymm6,64+0(%rdi) vmovdqu %ymm10,96+0(%rdi) vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 vpxor 0+128(%rsi),%ymm3,%ymm3 vpxor 32+128(%rsi),%ymm1,%ymm1 vpxor 64+128(%rsi),%ymm5,%ymm5 vpxor 96+128(%rsi),%ymm9,%ymm9 vmovdqu %ymm3,0+128(%rdi) vmovdqu %ymm1,32+128(%rdi) vmovdqu %ymm5,64+128(%rdi) vmovdqu %ymm9,96+128(%rdi) vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 vmovdqa %ymm3,%ymm8 leaq 256(%rsi),%rsi leaq 256(%rdi),%rdi subq $256,%rbx jmp .Lopen_avx2_tail_128_xor .Lopen_avx2_tail_512: vmovdqa .Lchacha20_consts(%rip),%ymm0 vmovdqa 0+64(%rbp),%ymm4 vmovdqa 0+96(%rbp),%ymm8 vmovdqa %ymm0,%ymm1 vmovdqa %ymm4,%ymm5 vmovdqa %ymm8,%ymm9 vmovdqa %ymm0,%ymm2 vmovdqa %ymm4,%ymm6 vmovdqa %ymm8,%ymm10 vmovdqa %ymm0,%ymm3 vmovdqa %ymm4,%ymm7 vmovdqa %ymm8,%ymm11 vmovdqa .Lavx2_inc(%rip),%ymm12 vpaddd 0+160(%rbp),%ymm12,%ymm15 vpaddd %ymm15,%ymm12,%ymm14 vpaddd %ymm14,%ymm12,%ymm13 vpaddd %ymm13,%ymm12,%ymm12 vmovdqa %ymm15,0+256(%rbp) vmovdqa %ymm14,0+224(%rbp) vmovdqa %ymm13,0+192(%rbp) vmovdqa %ymm12,0+160(%rbp) xorq %rcx,%rcx movq %rsi,%r8 .Lopen_avx2_tail_512_rounds_and_x2hash: addq 0+0(%r8),%r10 adcq 8+0(%r8),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq 
$0,%r12 leaq 16(%r8),%r8 .Lopen_avx2_tail_512_rounds_and_x1hash: vmovdqa %ymm8,0+128(%rbp) vmovdqa .Lrol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $20,%ymm6,%ymm8 vpslld $32-20,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $20,%ymm5,%ymm8 vpslld $32-20,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa .Lrol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 addq 0+0(%r8),%r10 adcq 8+0(%r8),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx addq %rax,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $25,%ymm7,%ymm8 vpslld $32-25,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $25,%ymm6,%ymm8 vpslld $32-25,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $25,%ymm5,%ymm8 vpslld $32-25,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa 0+128(%rbp),%ymm8 vpalignr $4,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $12,%ymm15,%ymm15,%ymm15 vpalignr $4,%ymm6,%ymm6,%ymm6 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $12,%ymm14,%ymm14,%ymm14 vpalignr $4,%ymm5,%ymm5,%ymm5 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $12,%ymm13,%ymm13,%ymm13 vpalignr $4,%ymm4,%ymm4,%ymm4 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm12,%ymm12,%ymm12 vmovdqa %ymm8,0+128(%rbp) vmovdqa .Lrol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 addq 0+16(%r8),%r10 adcq 8+16(%r8),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx addq %rax,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 32(%r8),%r8 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 
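/* open, AVX2 tail of up to 512 bytes: four 2-block ymm lanes run the ChaCha20 rounds while Poly1305 (accumulator in r10:r11:r12, input pointer in r8) absorbs the remaining ciphertext in the gaps between the vector instructions. */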
vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $20,%ymm6,%ymm8 vpslld $32-20,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $20,%ymm5,%ymm8 vpslld $32-20,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa .Lrol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $25,%ymm7,%ymm8 vpslld $32-25,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $25,%ymm6,%ymm8 vpslld $32-25,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $25,%ymm5,%ymm8 vpslld $32-25,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa 0+128(%rbp),%ymm8 vpalignr $12,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $4,%ymm15,%ymm15,%ymm15 vpalignr $12,%ymm6,%ymm6,%ymm6 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $4,%ymm14,%ymm14,%ymm14 vpalignr $12,%ymm5,%ymm5,%ymm5 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $4,%ymm13,%ymm13,%ymm13 vpalignr $12,%ymm4,%ymm4,%ymm4 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm12,%ymm12,%ymm12 incq %rcx cmpq $4,%rcx jl .Lopen_avx2_tail_512_rounds_and_x2hash cmpq $10,%rcx jne .Lopen_avx2_tail_512_rounds_and_x1hash movq %rbx,%rcx subq $384,%rcx andq $-16,%rcx .Lopen_avx2_tail_512_hash: testq %rcx,%rcx je .Lopen_avx2_tail_512_done addq 0+0(%r8),%r10 adcq 8+0(%r8),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx addq %rax,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%r8),%r8 subq $16,%rcx jmp .Lopen_avx2_tail_512_hash .Lopen_avx2_tail_512_done: vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3 vpaddd 0+64(%rbp),%ymm7,%ymm7 vpaddd 0+96(%rbp),%ymm11,%ymm11 vpaddd 0+256(%rbp),%ymm15,%ymm15 vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 vpaddd 0+64(%rbp),%ymm6,%ymm6 vpaddd 0+96(%rbp),%ymm10,%ymm10 vpaddd 0+224(%rbp),%ymm14,%ymm14 vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 vpaddd 0+64(%rbp),%ymm5,%ymm5 vpaddd 0+96(%rbp),%ymm9,%ymm9 vpaddd 0+192(%rbp),%ymm13,%ymm13 vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 vpaddd 0+64(%rbp),%ymm4,%ymm4 vpaddd 0+96(%rbp),%ymm8,%ymm8 vpaddd 0+160(%rbp),%ymm12,%ymm12 vmovdqa %ymm0,0+128(%rbp) vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 vpxor 0+0(%rsi),%ymm0,%ymm0 
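/* .Lopen_avx2_tail_512_done: the vperm2i128 shuffles above recombine the per-lane halves into contiguous 64-byte keystream blocks, which are XORed with the ciphertext at rsi and stored to rdi 128 bytes at a time. */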
vpxor 32+0(%rsi),%ymm3,%ymm3 vpxor 64+0(%rsi),%ymm7,%ymm7 vpxor 96+0(%rsi),%ymm11,%ymm11 vmovdqu %ymm0,0+0(%rdi) vmovdqu %ymm3,32+0(%rdi) vmovdqu %ymm7,64+0(%rdi) vmovdqu %ymm11,96+0(%rdi) vmovdqa 0+128(%rbp),%ymm0 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 vpxor 0+128(%rsi),%ymm3,%ymm3 vpxor 32+128(%rsi),%ymm2,%ymm2 vpxor 64+128(%rsi),%ymm6,%ymm6 vpxor 96+128(%rsi),%ymm10,%ymm10 vmovdqu %ymm3,0+128(%rdi) vmovdqu %ymm2,32+128(%rdi) vmovdqu %ymm6,64+128(%rdi) vmovdqu %ymm10,96+128(%rdi) vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 vpxor 0+256(%rsi),%ymm3,%ymm3 vpxor 32+256(%rsi),%ymm1,%ymm1 vpxor 64+256(%rsi),%ymm5,%ymm5 vpxor 96+256(%rsi),%ymm9,%ymm9 vmovdqu %ymm3,0+256(%rdi) vmovdqu %ymm1,32+256(%rdi) vmovdqu %ymm5,64+256(%rdi) vmovdqu %ymm9,96+256(%rdi) vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 vmovdqa %ymm3,%ymm8 leaq 384(%rsi),%rsi leaq 384(%rdi),%rdi subq $384,%rbx .Lopen_avx2_tail_128_xor: cmpq $32,%rbx jb .Lopen_avx2_tail_32_xor subq $32,%rbx vpxor (%rsi),%ymm0,%ymm0 vmovdqu %ymm0,(%rdi) leaq 32(%rsi),%rsi leaq 32(%rdi),%rdi vmovdqa %ymm4,%ymm0 vmovdqa %ymm8,%ymm4 vmovdqa %ymm12,%ymm8 jmp .Lopen_avx2_tail_128_xor .Lopen_avx2_tail_32_xor: cmpq $16,%rbx vmovdqa %xmm0,%xmm1 jb .Lopen_avx2_exit subq $16,%rbx vpxor (%rsi),%xmm0,%xmm1 vmovdqu %xmm1,(%rdi) leaq 16(%rsi),%rsi leaq 16(%rdi),%rdi vperm2i128 $0x11,%ymm0,%ymm0,%ymm0 vmovdqa %xmm0,%xmm1 .Lopen_avx2_exit: vzeroupper jmp .Lopen_sse_tail_16 .Lopen_avx2_192: vmovdqa %ymm0,%ymm1 vmovdqa %ymm0,%ymm2 vmovdqa %ymm4,%ymm5 vmovdqa %ymm4,%ymm6 vmovdqa %ymm8,%ymm9 vmovdqa %ymm8,%ymm10 vpaddd .Lavx2_inc(%rip),%ymm12,%ymm13 vmovdqa %ymm12,%ymm11 vmovdqa %ymm13,%ymm15 movq $10,%r10 .Lopen_avx2_192_rounds: vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $12,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb .Lrol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 vpslld $12,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb .Lrol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 vpsrld $25,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpalignr $12,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $4,%ymm5,%ymm5,%ymm5 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $4,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 
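/* .Lopen_avx2_192: short-input path (at most 192 bytes of ciphertext). Two ymm states, each holding two ChaCha20 blocks, go through ten double-rounds; the clamped low 32 bytes of the first block become the Poly1305 key before the data is hashed and decrypted. */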
vpshufb .Lrol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 vpslld $12,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb .Lrol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 vpsrld $25,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpalignr $4,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $12,%ymm5,%ymm5,%ymm5 decq %r10 jne .Lopen_avx2_192_rounds vpaddd %ymm2,%ymm0,%ymm0 vpaddd %ymm2,%ymm1,%ymm1 vpaddd %ymm6,%ymm4,%ymm4 vpaddd %ymm6,%ymm5,%ymm5 vpaddd %ymm10,%ymm8,%ymm8 vpaddd %ymm10,%ymm9,%ymm9 vpaddd %ymm11,%ymm12,%ymm12 vpaddd %ymm15,%ymm13,%ymm13 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 vpand .Lclamp(%rip),%ymm3,%ymm3 vmovdqa %ymm3,0+0(%rbp) vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 .Lopen_avx2_short: movq %r8,%r8 call poly_hash_ad_internal .Lopen_avx2_short_hash_and_xor_loop: cmpq $32,%rbx jb .Lopen_avx2_short_tail_32 subq $32,%rbx addq 0+0(%rsi),%r10 adcq 8+0(%rsi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 addq 0+16(%rsi),%r10 adcq 8+16(%rsi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 vpxor (%rsi),%ymm0,%ymm0 vmovdqu %ymm0,(%rdi) leaq 32(%rsi),%rsi leaq 32(%rdi),%rdi vmovdqa %ymm4,%ymm0 vmovdqa %ymm8,%ymm4 vmovdqa %ymm12,%ymm8 vmovdqa %ymm1,%ymm12 vmovdqa %ymm5,%ymm1 vmovdqa %ymm9,%ymm5 vmovdqa %ymm13,%ymm9 vmovdqa %ymm2,%ymm13 vmovdqa %ymm6,%ymm2 jmp .Lopen_avx2_short_hash_and_xor_loop .Lopen_avx2_short_tail_32: cmpq $16,%rbx vmovdqa %xmm0,%xmm1 jb .Lopen_avx2_short_tail_32_exit subq $16,%rbx addq 0+0(%rsi),%r10 adcq 8+0(%rsi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 vpxor (%rsi),%xmm0,%xmm3 vmovdqu %xmm3,(%rdi) leaq 16(%rsi),%rsi leaq 16(%rdi),%rdi vextracti128 $1,%ymm0,%xmm1 .Lopen_avx2_short_tail_32_exit: vzeroupper jmp 
.Lopen_sse_tail_16 .Lopen_avx2_320: vmovdqa %ymm0,%ymm1 vmovdqa %ymm0,%ymm2 vmovdqa %ymm4,%ymm5 vmovdqa %ymm4,%ymm6 vmovdqa %ymm8,%ymm9 vmovdqa %ymm8,%ymm10 vpaddd .Lavx2_inc(%rip),%ymm12,%ymm13 vpaddd .Lavx2_inc(%rip),%ymm13,%ymm14 vmovdqa %ymm4,%ymm7 vmovdqa %ymm8,%ymm11 vmovdqa %ymm12,0+160(%rbp) vmovdqa %ymm13,0+192(%rbp) vmovdqa %ymm14,0+224(%rbp) movq $10,%r10 .Lopen_avx2_320_rounds: vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $12,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb .Lrol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 vpslld $12,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb .Lrol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 vpsrld $25,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpalignr $12,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $4,%ymm5,%ymm5,%ymm5 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb .Lrol16(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpsrld $20,%ymm6,%ymm3 vpslld $12,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb .Lrol8(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpslld $7,%ymm6,%ymm3 vpsrld $25,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpalignr $12,%ymm14,%ymm14,%ymm14 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $4,%ymm6,%ymm6,%ymm6 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $4,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb .Lrol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 vpslld $12,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb .Lrol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 vpsrld $25,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpalignr $4,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $12,%ymm5,%ymm5,%ymm5 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb .Lrol16(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpsrld $20,%ymm6,%ymm3 vpslld $12,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb .Lrol8(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpslld $7,%ymm6,%ymm3 vpsrld $25,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpalignr $4,%ymm14,%ymm14,%ymm14 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $12,%ymm6,%ymm6,%ymm6 decq %r10 jne .Lopen_avx2_320_rounds vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 vpaddd 
.Lchacha20_consts(%rip),%ymm1,%ymm1 vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 vpaddd %ymm7,%ymm4,%ymm4 vpaddd %ymm7,%ymm5,%ymm5 vpaddd %ymm7,%ymm6,%ymm6 vpaddd %ymm11,%ymm8,%ymm8 vpaddd %ymm11,%ymm9,%ymm9 vpaddd %ymm11,%ymm10,%ymm10 vpaddd 0+160(%rbp),%ymm12,%ymm12 vpaddd 0+192(%rbp),%ymm13,%ymm13 vpaddd 0+224(%rbp),%ymm14,%ymm14 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 vpand .Lclamp(%rip),%ymm3,%ymm3 vmovdqa %ymm3,0+0(%rbp) vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 vperm2i128 $0x02,%ymm2,%ymm6,%ymm9 vperm2i128 $0x02,%ymm10,%ymm14,%ymm13 vperm2i128 $0x13,%ymm2,%ymm6,%ymm2 vperm2i128 $0x13,%ymm10,%ymm14,%ymm6 jmp .Lopen_avx2_short .size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2 .cfi_endproc .globl chacha20_poly1305_seal_avx2 .hidden chacha20_poly1305_seal_avx2 .type chacha20_poly1305_seal_avx2,@function .align 64 chacha20_poly1305_seal_avx2: .cfi_startproc _CET_ENDBR pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 pushq %r9 .cfi_adjust_cfa_offset 8 .cfi_offset %r9,-64 subq $288 + 0 + 32,%rsp .cfi_adjust_cfa_offset 288 + 32 leaq 32(%rsp),%rbp andq $-32,%rbp movq 56(%r9),%rbx addq %rdx,%rbx movq %r8,0+0+32(%rbp) movq %rbx,8+0+32(%rbp) movq %rdx,%rbx vzeroupper vmovdqa .Lchacha20_consts(%rip),%ymm0 vbroadcasti128 0(%r9),%ymm4 vbroadcasti128 16(%r9),%ymm8 vbroadcasti128 32(%r9),%ymm12 vpaddd .Lavx2_init(%rip),%ymm12,%ymm12 cmpq $192,%rbx jbe .Lseal_avx2_192 cmpq $320,%rbx jbe .Lseal_avx2_320 vmovdqa %ymm0,%ymm1 vmovdqa %ymm0,%ymm2 vmovdqa %ymm0,%ymm3 vmovdqa %ymm4,%ymm5 vmovdqa %ymm4,%ymm6 vmovdqa %ymm4,%ymm7 vmovdqa %ymm4,0+64(%rbp) vmovdqa %ymm8,%ymm9 vmovdqa %ymm8,%ymm10 vmovdqa %ymm8,%ymm11 vmovdqa %ymm8,0+96(%rbp) vmovdqa %ymm12,%ymm15 vpaddd .Lavx2_inc(%rip),%ymm15,%ymm14 vpaddd .Lavx2_inc(%rip),%ymm14,%ymm13 vpaddd .Lavx2_inc(%rip),%ymm13,%ymm12 vmovdqa %ymm12,0+160(%rbp) vmovdqa %ymm13,0+192(%rbp) vmovdqa %ymm14,0+224(%rbp) vmovdqa %ymm15,0+256(%rbp) movq $10,%r10 .Lseal_avx2_init_rounds: vmovdqa %ymm8,0+128(%rbp) vmovdqa .Lrol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $20,%ymm6,%ymm8 vpslld $32-20,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $20,%ymm5,%ymm8 vpslld $32-20,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa .Lrol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb 
%ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $25,%ymm7,%ymm8 vpslld $32-25,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $25,%ymm6,%ymm8 vpslld $32-25,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $25,%ymm5,%ymm8 vpslld $32-25,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa 0+128(%rbp),%ymm8 vpalignr $4,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $12,%ymm15,%ymm15,%ymm15 vpalignr $4,%ymm6,%ymm6,%ymm6 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $12,%ymm14,%ymm14,%ymm14 vpalignr $4,%ymm5,%ymm5,%ymm5 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $12,%ymm13,%ymm13,%ymm13 vpalignr $4,%ymm4,%ymm4,%ymm4 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm12,%ymm12,%ymm12 vmovdqa %ymm8,0+128(%rbp) vmovdqa .Lrol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $20,%ymm6,%ymm8 vpslld $32-20,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $20,%ymm5,%ymm8 vpslld $32-20,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa .Lrol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $25,%ymm7,%ymm8 vpslld $32-25,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $25,%ymm6,%ymm8 vpslld $32-25,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $25,%ymm5,%ymm8 vpslld $32-25,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa 0+128(%rbp),%ymm8 vpalignr $12,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $4,%ymm15,%ymm15,%ymm15 vpalignr $12,%ymm6,%ymm6,%ymm6 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $4,%ymm14,%ymm14,%ymm14 vpalignr $12,%ymm5,%ymm5,%ymm5 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $4,%ymm13,%ymm13,%ymm13 vpalignr $12,%ymm4,%ymm4,%ymm4 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm12,%ymm12,%ymm12 decq %r10 jnz .Lseal_avx2_init_rounds vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3 vpaddd 0+64(%rbp),%ymm7,%ymm7 vpaddd 0+96(%rbp),%ymm11,%ymm11 vpaddd 0+256(%rbp),%ymm15,%ymm15 vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 vpaddd 0+64(%rbp),%ymm6,%ymm6 vpaddd 0+96(%rbp),%ymm10,%ymm10 vpaddd 0+224(%rbp),%ymm14,%ymm14 vpaddd 
.Lchacha20_consts(%rip),%ymm1,%ymm1 vpaddd 0+64(%rbp),%ymm5,%ymm5 vpaddd 0+96(%rbp),%ymm9,%ymm9 vpaddd 0+192(%rbp),%ymm13,%ymm13 vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 vpaddd 0+64(%rbp),%ymm4,%ymm4 vpaddd 0+96(%rbp),%ymm8,%ymm8 vpaddd 0+160(%rbp),%ymm12,%ymm12 vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 vperm2i128 $0x02,%ymm3,%ymm7,%ymm15 vperm2i128 $0x13,%ymm3,%ymm7,%ymm3 vpand .Lclamp(%rip),%ymm15,%ymm15 vmovdqa %ymm15,0+0(%rbp) movq %r8,%r8 call poly_hash_ad_internal vpxor 0(%rsi),%ymm3,%ymm3 vpxor 32(%rsi),%ymm11,%ymm11 vmovdqu %ymm3,0(%rdi) vmovdqu %ymm11,32(%rdi) vperm2i128 $0x02,%ymm2,%ymm6,%ymm15 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 vpxor 0+64(%rsi),%ymm15,%ymm15 vpxor 32+64(%rsi),%ymm2,%ymm2 vpxor 64+64(%rsi),%ymm6,%ymm6 vpxor 96+64(%rsi),%ymm10,%ymm10 vmovdqu %ymm15,0+64(%rdi) vmovdqu %ymm2,32+64(%rdi) vmovdqu %ymm6,64+64(%rdi) vmovdqu %ymm10,96+64(%rdi) vperm2i128 $0x02,%ymm1,%ymm5,%ymm15 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 vpxor 0+192(%rsi),%ymm15,%ymm15 vpxor 32+192(%rsi),%ymm1,%ymm1 vpxor 64+192(%rsi),%ymm5,%ymm5 vpxor 96+192(%rsi),%ymm9,%ymm9 vmovdqu %ymm15,0+192(%rdi) vmovdqu %ymm1,32+192(%rdi) vmovdqu %ymm5,64+192(%rdi) vmovdqu %ymm9,96+192(%rdi) vperm2i128 $0x13,%ymm0,%ymm4,%ymm15 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 vmovdqa %ymm15,%ymm8 leaq 320(%rsi),%rsi subq $320,%rbx movq $320,%rcx cmpq $128,%rbx jbe .Lseal_avx2_short_hash_remainder vpxor 0(%rsi),%ymm0,%ymm0 vpxor 32(%rsi),%ymm4,%ymm4 vpxor 64(%rsi),%ymm8,%ymm8 vpxor 96(%rsi),%ymm12,%ymm12 vmovdqu %ymm0,320(%rdi) vmovdqu %ymm4,352(%rdi) vmovdqu %ymm8,384(%rdi) vmovdqu %ymm12,416(%rdi) leaq 128(%rsi),%rsi subq $128,%rbx movq $8,%rcx movq $2,%r8 cmpq $128,%rbx jbe .Lseal_avx2_tail_128 cmpq $256,%rbx jbe .Lseal_avx2_tail_256 cmpq $384,%rbx jbe .Lseal_avx2_tail_384 cmpq $512,%rbx jbe .Lseal_avx2_tail_512 vmovdqa .Lchacha20_consts(%rip),%ymm0 vmovdqa 0+64(%rbp),%ymm4 vmovdqa 0+96(%rbp),%ymm8 vmovdqa %ymm0,%ymm1 vmovdqa %ymm4,%ymm5 vmovdqa %ymm8,%ymm9 vmovdqa %ymm0,%ymm2 vmovdqa %ymm4,%ymm6 vmovdqa %ymm8,%ymm10 vmovdqa %ymm0,%ymm3 vmovdqa %ymm4,%ymm7 vmovdqa %ymm8,%ymm11 vmovdqa .Lavx2_inc(%rip),%ymm12 vpaddd 0+160(%rbp),%ymm12,%ymm15 vpaddd %ymm15,%ymm12,%ymm14 vpaddd %ymm14,%ymm12,%ymm13 vpaddd %ymm13,%ymm12,%ymm12 vmovdqa %ymm15,0+256(%rbp) vmovdqa %ymm14,0+224(%rbp) vmovdqa %ymm13,0+192(%rbp) vmovdqa %ymm12,0+160(%rbp) vmovdqa %ymm8,0+128(%rbp) vmovdqa .Lrol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $20,%ymm6,%ymm8 vpslld $32-20,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $20,%ymm5,%ymm8 vpslld $32-20,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa .Lrol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd 
%ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $25,%ymm7,%ymm8 vpslld $32-25,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $25,%ymm6,%ymm8 vpslld $32-25,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $25,%ymm5,%ymm8 vpslld $32-25,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa 0+128(%rbp),%ymm8 vpalignr $4,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $12,%ymm15,%ymm15,%ymm15 vpalignr $4,%ymm6,%ymm6,%ymm6 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $12,%ymm14,%ymm14,%ymm14 vpalignr $4,%ymm5,%ymm5,%ymm5 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $12,%ymm13,%ymm13,%ymm13 vpalignr $4,%ymm4,%ymm4,%ymm4 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm12,%ymm12,%ymm12 vmovdqa %ymm8,0+128(%rbp) vmovdqa .Lrol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $20,%ymm6,%ymm8 vpslld $32-20,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $20,%ymm5,%ymm8 vpslld $32-20,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa .Lrol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $25,%ymm7,%ymm8 vpslld $32-25,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $25,%ymm6,%ymm8 vpslld $32-25,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $25,%ymm5,%ymm8 vpslld $32-25,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa 0+128(%rbp),%ymm8 vpalignr $12,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $4,%ymm15,%ymm15,%ymm15 vpalignr $12,%ymm6,%ymm6,%ymm6 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $4,%ymm14,%ymm14,%ymm14 vpalignr $12,%ymm5,%ymm5,%ymm5 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $4,%ymm13,%ymm13,%ymm13 vpalignr $12,%ymm4,%ymm4,%ymm4 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm12,%ymm12,%ymm12 vmovdqa %ymm8,0+128(%rbp) vmovdqa .Lrol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor 
%ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $20,%ymm6,%ymm8 vpslld $32-20,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $20,%ymm5,%ymm8 vpslld $32-20,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa .Lrol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 subq $16,%rdi movq $9,%rcx jmp .Lseal_avx2_main_loop_rounds_entry .align 32 .Lseal_avx2_main_loop: vmovdqa .Lchacha20_consts(%rip),%ymm0 vmovdqa 0+64(%rbp),%ymm4 vmovdqa 0+96(%rbp),%ymm8 vmovdqa %ymm0,%ymm1 vmovdqa %ymm4,%ymm5 vmovdqa %ymm8,%ymm9 vmovdqa %ymm0,%ymm2 vmovdqa %ymm4,%ymm6 vmovdqa %ymm8,%ymm10 vmovdqa %ymm0,%ymm3 vmovdqa %ymm4,%ymm7 vmovdqa %ymm8,%ymm11 vmovdqa .Lavx2_inc(%rip),%ymm12 vpaddd 0+160(%rbp),%ymm12,%ymm15 vpaddd %ymm15,%ymm12,%ymm14 vpaddd %ymm14,%ymm12,%ymm13 vpaddd %ymm13,%ymm12,%ymm12 vmovdqa %ymm15,0+256(%rbp) vmovdqa %ymm14,0+224(%rbp) vmovdqa %ymm13,0+192(%rbp) vmovdqa %ymm12,0+160(%rbp) movq $10,%rcx .align 32 .Lseal_avx2_main_loop_rounds: addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 vmovdqa %ymm8,0+128(%rbp) vmovdqa .Lrol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $20,%ymm6,%ymm8 vpslld $32-20,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $20,%ymm5,%ymm8 vpslld $32-20,%ymm5,%ymm5 addq %rax,%r15 adcq %rdx,%r9 vpxor %ymm8,%ymm5,%ymm5 vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa .Lrol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 .Lseal_avx2_main_loop_rounds_entry: vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 addq 0+16(%rdi),%r10 adcq 8+16(%rdi),%r11 adcq $1,%r12 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 
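/*
 * .Lseal_avx2_main_loop_rounds interleaves the scalar Poly1305 update of
 * ciphertext already written at (%rdi) (mulxq multiplies by the clamped key
 * at 0(%rbp), followed by a partial reduction mod 2^130 - 5) with the AVX2
 * ChaCha20 rounds that produce the next 512 bytes of key stream.
 */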
vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $25,%ymm7,%ymm8 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 vpslld $32-25,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $25,%ymm6,%ymm8 vpslld $32-25,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $25,%ymm5,%ymm8 vpslld $32-25,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa 0+128(%rbp),%ymm8 vpalignr $4,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $12,%ymm15,%ymm15,%ymm15 vpalignr $4,%ymm6,%ymm6,%ymm6 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $12,%ymm14,%ymm14,%ymm14 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx vpalignr $4,%ymm5,%ymm5,%ymm5 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $12,%ymm13,%ymm13,%ymm13 vpalignr $4,%ymm4,%ymm4,%ymm4 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm12,%ymm12,%ymm12 vmovdqa %ymm8,0+128(%rbp) vmovdqa .Lrol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 addq %rax,%r15 adcq %rdx,%r9 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $20,%ymm6,%ymm8 vpslld $32-20,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 addq 0+32(%rdi),%r10 adcq 8+32(%rdi),%r11 adcq $1,%r12 leaq 48(%rdi),%rdi vpsrld $20,%ymm5,%ymm8 vpslld $32-20,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa .Lrol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $25,%ymm7,%ymm8 vpslld $32-25,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $25,%ymm6,%ymm8 vpslld $32-25,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 addq %rax,%r15 adcq %rdx,%r9 vpsrld $25,%ymm5,%ymm8 vpslld $32-25,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa 0+128(%rbp),%ymm8 vpalignr $12,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $4,%ymm15,%ymm15,%ymm15 vpalignr $12,%ymm6,%ymm6,%ymm6 
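/*
 * The vpalignr rotations by 4/8/12 bytes move the b/c/d rows of each state
 * onto the diagonals for the odd round and back to columns afterwards.
 */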
vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $4,%ymm14,%ymm14,%ymm14 vpalignr $12,%ymm5,%ymm5,%ymm5 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $4,%ymm13,%ymm13,%ymm13 vpalignr $12,%ymm4,%ymm4,%ymm4 vpalignr $8,%ymm8,%ymm8,%ymm8 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 vpalignr $4,%ymm12,%ymm12,%ymm12 decq %rcx jne .Lseal_avx2_main_loop_rounds vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3 vpaddd 0+64(%rbp),%ymm7,%ymm7 vpaddd 0+96(%rbp),%ymm11,%ymm11 vpaddd 0+256(%rbp),%ymm15,%ymm15 vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 vpaddd 0+64(%rbp),%ymm6,%ymm6 vpaddd 0+96(%rbp),%ymm10,%ymm10 vpaddd 0+224(%rbp),%ymm14,%ymm14 vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 vpaddd 0+64(%rbp),%ymm5,%ymm5 vpaddd 0+96(%rbp),%ymm9,%ymm9 vpaddd 0+192(%rbp),%ymm13,%ymm13 vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 vpaddd 0+64(%rbp),%ymm4,%ymm4 vpaddd 0+96(%rbp),%ymm8,%ymm8 vpaddd 0+160(%rbp),%ymm12,%ymm12 vmovdqa %ymm0,0+128(%rbp) addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx addq %rax,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 addq 0+16(%rdi),%r10 adcq 8+16(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx addq %rax,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 32(%rdi),%rdi vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 vpxor 0+0(%rsi),%ymm0,%ymm0 vpxor 32+0(%rsi),%ymm3,%ymm3 vpxor 64+0(%rsi),%ymm7,%ymm7 vpxor 96+0(%rsi),%ymm11,%ymm11 vmovdqu %ymm0,0+0(%rdi) vmovdqu %ymm3,32+0(%rdi) vmovdqu %ymm7,64+0(%rdi) vmovdqu %ymm11,96+0(%rdi) vmovdqa 0+128(%rbp),%ymm0 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 vpxor 0+128(%rsi),%ymm3,%ymm3 vpxor 32+128(%rsi),%ymm2,%ymm2 vpxor 64+128(%rsi),%ymm6,%ymm6 vpxor 96+128(%rsi),%ymm10,%ymm10 vmovdqu %ymm3,0+128(%rdi) vmovdqu %ymm2,32+128(%rdi) vmovdqu %ymm6,64+128(%rdi) vmovdqu %ymm10,96+128(%rdi) vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 vpxor 0+256(%rsi),%ymm3,%ymm3 vpxor 32+256(%rsi),%ymm1,%ymm1 vpxor 64+256(%rsi),%ymm5,%ymm5 vpxor 96+256(%rsi),%ymm9,%ymm9 vmovdqu %ymm3,0+256(%rdi) vmovdqu %ymm1,32+256(%rdi) vmovdqu %ymm5,64+256(%rdi) vmovdqu %ymm9,96+256(%rdi) vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 vperm2i128 $0x13,%ymm0,%ymm4,%ymm4 vperm2i128 $0x02,%ymm8,%ymm12,%ymm0 vperm2i128 $0x13,%ymm8,%ymm12,%ymm8 vpxor 0+384(%rsi),%ymm3,%ymm3 vpxor 32+384(%rsi),%ymm0,%ymm0 vpxor 64+384(%rsi),%ymm4,%ymm4 vpxor 
96+384(%rsi),%ymm8,%ymm8 vmovdqu %ymm3,0+384(%rdi) vmovdqu %ymm0,32+384(%rdi) vmovdqu %ymm4,64+384(%rdi) vmovdqu %ymm8,96+384(%rdi) leaq 512(%rsi),%rsi subq $512,%rbx cmpq $512,%rbx jg .Lseal_avx2_main_loop addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx addq %rax,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 addq 0+16(%rdi),%r10 adcq 8+16(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx addq %rax,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 32(%rdi),%rdi movq $10,%rcx xorq %r8,%r8 cmpq $384,%rbx ja .Lseal_avx2_tail_512 cmpq $256,%rbx ja .Lseal_avx2_tail_384 cmpq $128,%rbx ja .Lseal_avx2_tail_256 .Lseal_avx2_tail_128: vmovdqa .Lchacha20_consts(%rip),%ymm0 vmovdqa 0+64(%rbp),%ymm4 vmovdqa 0+96(%rbp),%ymm8 vmovdqa .Lavx2_inc(%rip),%ymm12 vpaddd 0+160(%rbp),%ymm12,%ymm12 vmovdqa %ymm12,0+160(%rbp) .Lseal_avx2_tail_128_rounds_and_3xhash: addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx addq %rax,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rdi),%rdi .Lseal_avx2_tail_128_rounds_and_2xhash: vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $12,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm4,%ymm4,%ymm4 addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx addq %rax,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor 
%ymm0,%ymm12,%ymm12 vpshufb .Lrol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $4,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm4,%ymm4,%ymm4 addq 0+16(%rdi),%r10 adcq 8+16(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx addq %rax,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 32(%rdi),%rdi decq %rcx jg .Lseal_avx2_tail_128_rounds_and_3xhash decq %r8 jge .Lseal_avx2_tail_128_rounds_and_2xhash vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 vpaddd 0+64(%rbp),%ymm4,%ymm4 vpaddd 0+96(%rbp),%ymm8,%ymm8 vpaddd 0+160(%rbp),%ymm12,%ymm12 vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 vmovdqa %ymm3,%ymm8 jmp .Lseal_avx2_short_loop .Lseal_avx2_tail_256: vmovdqa .Lchacha20_consts(%rip),%ymm0 vmovdqa 0+64(%rbp),%ymm4 vmovdqa 0+96(%rbp),%ymm8 vmovdqa %ymm0,%ymm1 vmovdqa %ymm4,%ymm5 vmovdqa %ymm8,%ymm9 vmovdqa .Lavx2_inc(%rip),%ymm12 vpaddd 0+160(%rbp),%ymm12,%ymm13 vpaddd %ymm13,%ymm12,%ymm12 vmovdqa %ymm12,0+160(%rbp) vmovdqa %ymm13,0+192(%rbp) .Lseal_avx2_tail_256_rounds_and_3xhash: addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rdi),%rdi .Lseal_avx2_tail_256_rounds_and_2xhash: vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $12,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb .Lrol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 vpslld $12,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb .Lrol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 vpsrld $25,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpalignr $12,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $4,%ymm5,%ymm5,%ymm5 addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq 
$0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $4,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb .Lrol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 vpslld $12,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb .Lrol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 vpsrld $25,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpalignr $4,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $12,%ymm5,%ymm5,%ymm5 addq 0+16(%rdi),%r10 adcq 8+16(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 32(%rdi),%rdi decq %rcx jg .Lseal_avx2_tail_256_rounds_and_3xhash decq %r8 jge .Lseal_avx2_tail_256_rounds_and_2xhash vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 vpaddd 0+64(%rbp),%ymm5,%ymm5 vpaddd 0+96(%rbp),%ymm9,%ymm9 vpaddd 0+192(%rbp),%ymm13,%ymm13 vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 vpaddd 0+64(%rbp),%ymm4,%ymm4 vpaddd 0+96(%rbp),%ymm8,%ymm8 vpaddd 0+160(%rbp),%ymm12,%ymm12 vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 vpxor 0+0(%rsi),%ymm3,%ymm3 vpxor 32+0(%rsi),%ymm1,%ymm1 vpxor 64+0(%rsi),%ymm5,%ymm5 vpxor 96+0(%rsi),%ymm9,%ymm9 vmovdqu %ymm3,0+0(%rdi) vmovdqu %ymm1,32+0(%rdi) vmovdqu %ymm5,64+0(%rdi) vmovdqu %ymm9,96+0(%rdi) vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 vmovdqa %ymm3,%ymm8 movq $128,%rcx leaq 128(%rsi),%rsi subq $128,%rbx jmp .Lseal_avx2_short_hash_remainder .Lseal_avx2_tail_384: vmovdqa .Lchacha20_consts(%rip),%ymm0 vmovdqa 0+64(%rbp),%ymm4 vmovdqa 0+96(%rbp),%ymm8 vmovdqa %ymm0,%ymm1 vmovdqa %ymm4,%ymm5 vmovdqa %ymm8,%ymm9 vmovdqa %ymm0,%ymm2 vmovdqa %ymm4,%ymm6 vmovdqa %ymm8,%ymm10 vmovdqa .Lavx2_inc(%rip),%ymm12 vpaddd 0+160(%rbp),%ymm12,%ymm14 vpaddd %ymm14,%ymm12,%ymm13 vpaddd %ymm13,%ymm12,%ymm12 vmovdqa %ymm12,0+160(%rbp) vmovdqa %ymm13,0+192(%rbp) vmovdqa %ymm14,0+224(%rbp) .Lseal_avx2_tail_384_rounds_and_3xhash: addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 
0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rdi),%rdi .Lseal_avx2_tail_384_rounds_and_2xhash: vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $12,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb .Lrol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 vpslld $12,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb .Lrol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 vpsrld $25,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpalignr $12,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $4,%ymm5,%ymm5,%ymm5 addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb .Lrol16(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpsrld $20,%ymm6,%ymm3 vpslld $12,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb .Lrol8(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpslld $7,%ymm6,%ymm3 vpsrld $25,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpalignr $12,%ymm14,%ymm14,%ymm14 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $4,%ymm6,%ymm6,%ymm6 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $4,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm4,%ymm4,%ymm4 addq 0+16(%rdi),%r10 adcq 8+16(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 
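/*
 * Each 16-byte block updates the Poly1305 accumulator in %r10:%r11:%r12 as
 *     acc = ((acc + block + 2^128) * r) mod (2^130 - 5),
 * with r read from the clamped key at 0(%rbp)/8(%rbp).  The andq/shrdq/addq
 * tail is the partial reduction: the bits above 2^130 (call them c) are
 * folded back in as 4*c + c, using 2^130 == 5 (mod 2^130 - 5).
 */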
adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb .Lrol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 vpslld $12,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb .Lrol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 vpsrld $25,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpalignr $4,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $12,%ymm5,%ymm5,%ymm5 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb .Lrol16(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpsrld $20,%ymm6,%ymm3 vpslld $12,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb .Lrol8(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpslld $7,%ymm6,%ymm3 vpsrld $25,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpalignr $4,%ymm14,%ymm14,%ymm14 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $12,%ymm6,%ymm6,%ymm6 leaq 32(%rdi),%rdi decq %rcx jg .Lseal_avx2_tail_384_rounds_and_3xhash decq %r8 jge .Lseal_avx2_tail_384_rounds_and_2xhash vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 vpaddd 0+64(%rbp),%ymm6,%ymm6 vpaddd 0+96(%rbp),%ymm10,%ymm10 vpaddd 0+224(%rbp),%ymm14,%ymm14 vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 vpaddd 0+64(%rbp),%ymm5,%ymm5 vpaddd 0+96(%rbp),%ymm9,%ymm9 vpaddd 0+192(%rbp),%ymm13,%ymm13 vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 vpaddd 0+64(%rbp),%ymm4,%ymm4 vpaddd 0+96(%rbp),%ymm8,%ymm8 vpaddd 0+160(%rbp),%ymm12,%ymm12 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 vpxor 0+0(%rsi),%ymm3,%ymm3 vpxor 32+0(%rsi),%ymm2,%ymm2 vpxor 64+0(%rsi),%ymm6,%ymm6 vpxor 96+0(%rsi),%ymm10,%ymm10 vmovdqu %ymm3,0+0(%rdi) vmovdqu %ymm2,32+0(%rdi) vmovdqu %ymm6,64+0(%rdi) vmovdqu %ymm10,96+0(%rdi) vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 vpxor 0+128(%rsi),%ymm3,%ymm3 vpxor 32+128(%rsi),%ymm1,%ymm1 vpxor 64+128(%rsi),%ymm5,%ymm5 vpxor 96+128(%rsi),%ymm9,%ymm9 vmovdqu %ymm3,0+128(%rdi) vmovdqu %ymm1,32+128(%rdi) vmovdqu %ymm5,64+128(%rdi) vmovdqu %ymm9,96+128(%rdi) vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 vmovdqa %ymm3,%ymm8 movq $256,%rcx leaq 256(%rsi),%rsi subq $256,%rbx jmp .Lseal_avx2_short_hash_remainder .Lseal_avx2_tail_512: vmovdqa .Lchacha20_consts(%rip),%ymm0 vmovdqa 0+64(%rbp),%ymm4 vmovdqa 0+96(%rbp),%ymm8 vmovdqa %ymm0,%ymm1 vmovdqa %ymm4,%ymm5 vmovdqa %ymm8,%ymm9 vmovdqa %ymm0,%ymm2 vmovdqa %ymm4,%ymm6 vmovdqa %ymm8,%ymm10 vmovdqa %ymm0,%ymm3 vmovdqa %ymm4,%ymm7 vmovdqa %ymm8,%ymm11 vmovdqa .Lavx2_inc(%rip),%ymm12 vpaddd 0+160(%rbp),%ymm12,%ymm15 vpaddd %ymm15,%ymm12,%ymm14 vpaddd %ymm14,%ymm12,%ymm13 vpaddd %ymm13,%ymm12,%ymm12 vmovdqa %ymm15,0+256(%rbp) vmovdqa %ymm14,0+224(%rbp) vmovdqa %ymm13,0+192(%rbp) vmovdqa %ymm12,0+160(%rbp) .Lseal_avx2_tail_512_rounds_and_3xhash: addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rdx mulxq 
%r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx addq %rax,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rdi),%rdi .Lseal_avx2_tail_512_rounds_and_2xhash: vmovdqa %ymm8,0+128(%rbp) vmovdqa .Lrol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $20,%ymm6,%ymm8 vpslld $32-20,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $20,%ymm5,%ymm8 vpslld $32-20,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa .Lrol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $25,%ymm7,%ymm8 vpslld $32-25,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx vpsrld $25,%ymm6,%ymm8 vpslld $32-25,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $25,%ymm5,%ymm8 vpslld $32-25,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa 0+128(%rbp),%ymm8 vpalignr $4,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $12,%ymm15,%ymm15,%ymm15 vpalignr $4,%ymm6,%ymm6,%ymm6 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $12,%ymm14,%ymm14,%ymm14 vpalignr $4,%ymm5,%ymm5,%ymm5 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $12,%ymm13,%ymm13,%ymm13 vpalignr $4,%ymm4,%ymm4,%ymm4 addq %rax,%r15 adcq %rdx,%r9 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm12,%ymm12,%ymm12 vmovdqa %ymm8,0+128(%rbp) vmovdqa .Lrol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 
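/*
 * .Lseal_avx2_tail_512: compute the key stream for the final (at most 512)
 * bytes while continuing to hash ciphertext already written at (%rdi);
 * %rcx and %r8 control how many ChaCha20 double rounds also absorb 48 or 32
 * bytes into Poly1305 before the writeback.
 */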
adcq %r9,%r11 adcq $0,%r12 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $20,%ymm6,%ymm8 vpslld $32-20,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $20,%ymm5,%ymm8 vpslld $32-20,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa .Lrol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 addq 0+16(%rdi),%r10 adcq 8+16(%rdi),%r11 adcq $1,%r12 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $25,%ymm7,%ymm8 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 vpslld $32-25,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $25,%ymm6,%ymm8 vpslld $32-25,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $25,%ymm5,%ymm8 vpslld $32-25,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa 0+128(%rbp),%ymm8 vpalignr $12,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $4,%ymm15,%ymm15,%ymm15 vpalignr $12,%ymm6,%ymm6,%ymm6 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $4,%ymm14,%ymm14,%ymm14 vpalignr $12,%ymm5,%ymm5,%ymm5 vpalignr $8,%ymm9,%ymm9,%ymm9 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx vpalignr $4,%ymm13,%ymm13,%ymm13 vpalignr $12,%ymm4,%ymm4,%ymm4 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm12,%ymm12,%ymm12 addq %rax,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 32(%rdi),%rdi decq %rcx jg .Lseal_avx2_tail_512_rounds_and_3xhash decq %r8 jge .Lseal_avx2_tail_512_rounds_and_2xhash vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3 vpaddd 0+64(%rbp),%ymm7,%ymm7 vpaddd 0+96(%rbp),%ymm11,%ymm11 vpaddd 0+256(%rbp),%ymm15,%ymm15 vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 vpaddd 0+64(%rbp),%ymm6,%ymm6 vpaddd 0+96(%rbp),%ymm10,%ymm10 vpaddd 0+224(%rbp),%ymm14,%ymm14 vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 vpaddd 0+64(%rbp),%ymm5,%ymm5 vpaddd 0+96(%rbp),%ymm9,%ymm9 vpaddd 0+192(%rbp),%ymm13,%ymm13 vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 vpaddd 0+64(%rbp),%ymm4,%ymm4 vpaddd 0+96(%rbp),%ymm8,%ymm8 vpaddd 0+160(%rbp),%ymm12,%ymm12 vmovdqa %ymm0,0+128(%rbp) vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 vpxor 0+0(%rsi),%ymm0,%ymm0 vpxor 32+0(%rsi),%ymm3,%ymm3 vpxor 64+0(%rsi),%ymm7,%ymm7 vpxor 96+0(%rsi),%ymm11,%ymm11 vmovdqu %ymm0,0+0(%rdi) vmovdqu %ymm3,32+0(%rdi) vmovdqu %ymm7,64+0(%rdi) vmovdqu %ymm11,96+0(%rdi) vmovdqa 0+128(%rbp),%ymm0 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 vpxor 0+128(%rsi),%ymm3,%ymm3 vpxor 32+128(%rsi),%ymm2,%ymm2 vpxor 
64+128(%rsi),%ymm6,%ymm6 vpxor 96+128(%rsi),%ymm10,%ymm10 vmovdqu %ymm3,0+128(%rdi) vmovdqu %ymm2,32+128(%rdi) vmovdqu %ymm6,64+128(%rdi) vmovdqu %ymm10,96+128(%rdi) vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 vpxor 0+256(%rsi),%ymm3,%ymm3 vpxor 32+256(%rsi),%ymm1,%ymm1 vpxor 64+256(%rsi),%ymm5,%ymm5 vpxor 96+256(%rsi),%ymm9,%ymm9 vmovdqu %ymm3,0+256(%rdi) vmovdqu %ymm1,32+256(%rdi) vmovdqu %ymm5,64+256(%rdi) vmovdqu %ymm9,96+256(%rdi) vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 vmovdqa %ymm3,%ymm8 movq $384,%rcx leaq 384(%rsi),%rsi subq $384,%rbx jmp .Lseal_avx2_short_hash_remainder .Lseal_avx2_320: vmovdqa %ymm0,%ymm1 vmovdqa %ymm0,%ymm2 vmovdqa %ymm4,%ymm5 vmovdqa %ymm4,%ymm6 vmovdqa %ymm8,%ymm9 vmovdqa %ymm8,%ymm10 vpaddd .Lavx2_inc(%rip),%ymm12,%ymm13 vpaddd .Lavx2_inc(%rip),%ymm13,%ymm14 vmovdqa %ymm4,%ymm7 vmovdqa %ymm8,%ymm11 vmovdqa %ymm12,0+160(%rbp) vmovdqa %ymm13,0+192(%rbp) vmovdqa %ymm14,0+224(%rbp) movq $10,%r10 .Lseal_avx2_320_rounds: vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $12,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb .Lrol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 vpslld $12,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb .Lrol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 vpsrld $25,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpalignr $12,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $4,%ymm5,%ymm5,%ymm5 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb .Lrol16(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpsrld $20,%ymm6,%ymm3 vpslld $12,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb .Lrol8(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpslld $7,%ymm6,%ymm3 vpsrld $25,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpalignr $12,%ymm14,%ymm14,%ymm14 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $4,%ymm6,%ymm6,%ymm6 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $4,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb .Lrol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 vpslld $12,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb .Lrol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld 
$7,%ymm5,%ymm3 vpsrld $25,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpalignr $4,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $12,%ymm5,%ymm5,%ymm5 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb .Lrol16(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpsrld $20,%ymm6,%ymm3 vpslld $12,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb .Lrol8(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpslld $7,%ymm6,%ymm3 vpsrld $25,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpalignr $4,%ymm14,%ymm14,%ymm14 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $12,%ymm6,%ymm6,%ymm6 decq %r10 jne .Lseal_avx2_320_rounds vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 vpaddd %ymm7,%ymm4,%ymm4 vpaddd %ymm7,%ymm5,%ymm5 vpaddd %ymm7,%ymm6,%ymm6 vpaddd %ymm11,%ymm8,%ymm8 vpaddd %ymm11,%ymm9,%ymm9 vpaddd %ymm11,%ymm10,%ymm10 vpaddd 0+160(%rbp),%ymm12,%ymm12 vpaddd 0+192(%rbp),%ymm13,%ymm13 vpaddd 0+224(%rbp),%ymm14,%ymm14 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 vpand .Lclamp(%rip),%ymm3,%ymm3 vmovdqa %ymm3,0+0(%rbp) vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 vperm2i128 $0x02,%ymm2,%ymm6,%ymm9 vperm2i128 $0x02,%ymm10,%ymm14,%ymm13 vperm2i128 $0x13,%ymm2,%ymm6,%ymm2 vperm2i128 $0x13,%ymm10,%ymm14,%ymm6 jmp .Lseal_avx2_short .Lseal_avx2_192: vmovdqa %ymm0,%ymm1 vmovdqa %ymm0,%ymm2 vmovdqa %ymm4,%ymm5 vmovdqa %ymm4,%ymm6 vmovdqa %ymm8,%ymm9 vmovdqa %ymm8,%ymm10 vpaddd .Lavx2_inc(%rip),%ymm12,%ymm13 vmovdqa %ymm12,%ymm11 vmovdqa %ymm13,%ymm15 movq $10,%r10 .Lseal_avx2_192_rounds: vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $12,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb .Lrol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 vpslld $12,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb .Lrol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 vpsrld $25,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpalignr $12,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $4,%ymm5,%ymm5,%ymm5 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb .Lrol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $4,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb .Lrol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 vpslld $12,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpaddd 
%ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb .Lrol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 vpsrld $25,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpalignr $4,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $12,%ymm5,%ymm5,%ymm5 decq %r10 jne .Lseal_avx2_192_rounds vpaddd %ymm2,%ymm0,%ymm0 vpaddd %ymm2,%ymm1,%ymm1 vpaddd %ymm6,%ymm4,%ymm4 vpaddd %ymm6,%ymm5,%ymm5 vpaddd %ymm10,%ymm8,%ymm8 vpaddd %ymm10,%ymm9,%ymm9 vpaddd %ymm11,%ymm12,%ymm12 vpaddd %ymm15,%ymm13,%ymm13 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 vpand .Lclamp(%rip),%ymm3,%ymm3 vmovdqa %ymm3,0+0(%rbp) vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 .Lseal_avx2_short: movq %r8,%r8 call poly_hash_ad_internal xorq %rcx,%rcx .Lseal_avx2_short_hash_remainder: cmpq $16,%rcx jb .Lseal_avx2_short_loop addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 subq $16,%rcx addq $16,%rdi jmp .Lseal_avx2_short_hash_remainder .Lseal_avx2_short_loop: cmpq $32,%rbx jb .Lseal_avx2_short_tail subq $32,%rbx vpxor (%rsi),%ymm0,%ymm0 vmovdqu %ymm0,(%rdi) leaq 32(%rsi),%rsi addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 addq 0+16(%rdi),%r10 adcq 8+16(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 32(%rdi),%rdi vmovdqa %ymm4,%ymm0 vmovdqa %ymm8,%ymm4 vmovdqa %ymm12,%ymm8 vmovdqa %ymm1,%ymm12 vmovdqa %ymm5,%ymm1 vmovdqa %ymm9,%ymm5 vmovdqa %ymm13,%ymm9 vmovdqa %ymm2,%ymm13 vmovdqa %ymm6,%ymm2 jmp .Lseal_avx2_short_loop .Lseal_avx2_short_tail: cmpq $16,%rbx jb .Lseal_avx2_exit subq $16,%rbx vpxor (%rsi),%xmm0,%xmm3 vmovdqu %xmm3,(%rdi) leaq 16(%rsi),%rsi addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 
addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rdi),%rdi vextracti128 $1,%ymm0,%xmm0 .Lseal_avx2_exit: vzeroupper jmp .Lseal_sse_tail_16 .cfi_endproc .size chacha20_poly1305_seal_avx2, .-chacha20_poly1305_seal_avx2 #endif ring-0.17.14/pregenerated/chacha20_poly1305_x86_64-macosx.S000064400000000000000000005641201046102023000207710ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) .section __DATA,__const .p2align 6 chacha20_poly1305_constants: L$chacha20_consts: .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' L$rol8: .byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 .byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 L$rol16: .byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 .byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 L$avx2_init: .long 0,0,0,0 L$sse_inc: .long 1,0,0,0 L$avx2_inc: .long 2,0,0,0,2,0,0,0 L$clamp: .quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC .quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF .p2align 4 L$and_masks: .byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .text .p2align 6 poly_hash_ad_internal: xorq %r10,%r10 xorq %r11,%r11 xorq %r12,%r12 cmpq $13,%r8 jne L$hash_ad_loop L$poly_fast_tls_ad: movq (%rcx),%r10 movq 5(%rcx),%r11 shrq $24,%r11 movq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq 
$3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 ret L$hash_ad_loop: cmpq $16,%r8 jb L$hash_ad_tail addq 0+0(%rcx),%r10 adcq 8+0(%rcx),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rcx),%rcx subq $16,%r8 jmp L$hash_ad_loop L$hash_ad_tail: cmpq $0,%r8 je L$hash_ad_done xorq %r13,%r13 xorq %r14,%r14 xorq %r15,%r15 addq %r8,%rcx L$hash_ad_tail_loop: shldq $8,%r13,%r14 shlq $8,%r13 movzbq -1(%rcx),%r15 xorq %r15,%r13 decq %rcx decq %r8 jne L$hash_ad_tail_loop addq %r13,%r10 adcq %r14,%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 L$hash_ad_done: ret .globl _chacha20_poly1305_open_sse41 .private_extern _chacha20_poly1305_open_sse41 .p2align 6 _chacha20_poly1305_open_sse41: _CET_ENDBR pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 pushq %r9 subq $288 + 0 + 32,%rsp leaq 32(%rsp),%rbp andq $-32,%rbp movq %rdx,%rbx movq %r8,0+0+32(%rbp) movq %rbx,8+0+32(%rbp) cmpq $128,%rbx jbe L$open_sse_128 movdqa L$chacha20_consts(%rip),%xmm0 movdqu 0(%r9),%xmm4 movdqu 16(%r9),%xmm8 movdqu 32(%r9),%xmm12 movdqa %xmm12,%xmm7 movdqa %xmm4,0+48(%rbp) movdqa %xmm8,0+64(%rbp) movdqa %xmm12,0+96(%rbp) movq $10,%r10 L$open_sse_init_rounds: paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld $20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,4 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,12 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld $20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,12 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,4 decq %r10 jne L$open_sse_init_rounds paddd L$chacha20_consts(%rip),%xmm0 paddd 0+48(%rbp),%xmm4 pand L$clamp(%rip),%xmm0 movdqa %xmm0,0+0(%rbp) movdqa %xmm4,0+16(%rbp) movq %r8,%r8 call poly_hash_ad_internal L$open_sse_main_loop: cmpq $256,%rbx jb L$open_sse_tail movdqa L$chacha20_consts(%rip),%xmm0 movdqa 0+48(%rbp),%xmm4 movdqa 0+64(%rbp),%xmm8 movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm5 movdqa %xmm8,%xmm9 movdqa 
%xmm0,%xmm2 movdqa %xmm4,%xmm6 movdqa %xmm8,%xmm10 movdqa %xmm0,%xmm3 movdqa %xmm4,%xmm7 movdqa %xmm8,%xmm11 movdqa 0+96(%rbp),%xmm15 paddd L$sse_inc(%rip),%xmm15 movdqa %xmm15,%xmm14 paddd L$sse_inc(%rip),%xmm14 movdqa %xmm14,%xmm13 paddd L$sse_inc(%rip),%xmm13 movdqa %xmm13,%xmm12 paddd L$sse_inc(%rip),%xmm12 movdqa %xmm12,0+96(%rbp) movdqa %xmm13,0+112(%rbp) movdqa %xmm14,0+128(%rbp) movdqa %xmm15,0+144(%rbp) movq $4,%rcx movq %rsi,%r8 L$open_sse_main_loop_rounds: movdqa %xmm8,0+80(%rbp) movdqa L$rol16(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 paddd %xmm4,%xmm0 pxor %xmm3,%xmm15 pxor %xmm2,%xmm14 pxor %xmm1,%xmm13 pxor %xmm0,%xmm12 .byte 102,69,15,56,0,248 .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 paddd %xmm12,%xmm8 pxor %xmm11,%xmm7 addq 0+0(%r8),%r10 adcq 8+0(%r8),%r11 adcq $1,%r12 leaq 16(%r8),%r8 pxor %xmm10,%xmm6 pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm7 pxor %xmm8,%xmm7 movdqa %xmm6,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm6 pxor %xmm8,%xmm6 movdqa %xmm5,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm5 pxor %xmm8,%xmm5 movdqa %xmm4,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm4 pxor %xmm8,%xmm4 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movdqa L$rol8(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 paddd %xmm4,%xmm0 pxor %xmm3,%xmm15 pxor %xmm2,%xmm14 pxor %xmm1,%xmm13 pxor %xmm0,%xmm12 .byte 102,69,15,56,0,248 .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 paddd %xmm12,%xmm8 pxor %xmm11,%xmm7 pxor %xmm10,%xmm6 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm7 pxor %xmm8,%xmm7 movdqa %xmm6,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm6 pxor %xmm8,%xmm6 movdqa %xmm5,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm5 pxor %xmm8,%xmm5 movdqa %xmm4,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm4 pxor %xmm8,%xmm4 movdqa 0+80(%rbp),%xmm8 imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 .byte 102,15,58,15,255,4 .byte 102,69,15,58,15,219,8 .byte 102,69,15,58,15,255,12 .byte 102,15,58,15,246,4 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,12 .byte 102,15,58,15,237,4 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,12 .byte 102,15,58,15,228,4 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,12 movdqa %xmm8,0+80(%rbp) movdqa L$rol16(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 paddd %xmm4,%xmm0 pxor %xmm3,%xmm15 pxor %xmm2,%xmm14 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 pxor %xmm1,%xmm13 pxor %xmm0,%xmm12 .byte 102,69,15,56,0,248 .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 paddd %xmm12,%xmm8 pxor %xmm11,%xmm7 pxor %xmm10,%xmm6 pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm7 pxor %xmm8,%xmm7 movdqa %xmm6,%xmm8 psrld 
$20,%xmm8 pslld $32-20,%xmm6 pxor %xmm8,%xmm6 movdqa %xmm5,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm5 pxor %xmm8,%xmm5 movdqa %xmm4,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm4 pxor %xmm8,%xmm4 movdqa L$rol8(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 paddd %xmm4,%xmm0 pxor %xmm3,%xmm15 pxor %xmm2,%xmm14 pxor %xmm1,%xmm13 pxor %xmm0,%xmm12 .byte 102,69,15,56,0,248 .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 paddd %xmm12,%xmm8 pxor %xmm11,%xmm7 pxor %xmm10,%xmm6 pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm7 pxor %xmm8,%xmm7 movdqa %xmm6,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm6 pxor %xmm8,%xmm6 movdqa %xmm5,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm5 pxor %xmm8,%xmm5 movdqa %xmm4,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm4 pxor %xmm8,%xmm4 movdqa 0+80(%rbp),%xmm8 .byte 102,15,58,15,255,12 .byte 102,69,15,58,15,219,8 .byte 102,69,15,58,15,255,4 .byte 102,15,58,15,246,12 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,4 .byte 102,15,58,15,237,12 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,4 .byte 102,15,58,15,228,12 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,4 decq %rcx jge L$open_sse_main_loop_rounds addq 0+0(%r8),%r10 adcq 8+0(%r8),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%r8),%r8 cmpq $-6,%rcx jg L$open_sse_main_loop_rounds paddd L$chacha20_consts(%rip),%xmm3 paddd 0+48(%rbp),%xmm7 paddd 0+64(%rbp),%xmm11 paddd 0+144(%rbp),%xmm15 paddd L$chacha20_consts(%rip),%xmm2 paddd 0+48(%rbp),%xmm6 paddd 0+64(%rbp),%xmm10 paddd 0+128(%rbp),%xmm14 paddd L$chacha20_consts(%rip),%xmm1 paddd 0+48(%rbp),%xmm5 paddd 0+64(%rbp),%xmm9 paddd 0+112(%rbp),%xmm13 paddd L$chacha20_consts(%rip),%xmm0 paddd 0+48(%rbp),%xmm4 paddd 0+64(%rbp),%xmm8 paddd 0+96(%rbp),%xmm12 movdqa %xmm12,0+80(%rbp) movdqu 0 + 0(%rsi),%xmm12 pxor %xmm3,%xmm12 movdqu %xmm12,0 + 0(%rdi) movdqu 16 + 0(%rsi),%xmm12 pxor %xmm7,%xmm12 movdqu %xmm12,16 + 0(%rdi) movdqu 32 + 0(%rsi),%xmm12 pxor %xmm11,%xmm12 movdqu %xmm12,32 + 0(%rdi) movdqu 48 + 0(%rsi),%xmm12 pxor %xmm15,%xmm12 movdqu %xmm12,48 + 0(%rdi) movdqu 0 + 64(%rsi),%xmm3 movdqu 16 + 64(%rsi),%xmm7 movdqu 32 + 64(%rsi),%xmm11 movdqu 48 + 64(%rsi),%xmm15 pxor %xmm3,%xmm2 pxor %xmm7,%xmm6 pxor %xmm11,%xmm10 pxor %xmm14,%xmm15 movdqu %xmm2,0 + 64(%rdi) movdqu %xmm6,16 + 64(%rdi) movdqu %xmm10,32 + 64(%rdi) movdqu %xmm15,48 + 64(%rdi) movdqu 0 + 128(%rsi),%xmm3 movdqu 16 + 128(%rsi),%xmm7 movdqu 32 + 128(%rsi),%xmm11 movdqu 48 + 128(%rsi),%xmm15 pxor %xmm3,%xmm1 pxor %xmm7,%xmm5 pxor %xmm11,%xmm9 pxor %xmm13,%xmm15 movdqu %xmm1,0 + 128(%rdi) movdqu %xmm5,16 + 128(%rdi) movdqu %xmm9,32 + 128(%rdi) movdqu %xmm15,48 + 128(%rdi) movdqu 0 + 192(%rsi),%xmm3 movdqu 16 + 192(%rsi),%xmm7 movdqu 32 + 192(%rsi),%xmm11 movdqu 48 + 192(%rsi),%xmm15 pxor %xmm3,%xmm0 pxor %xmm7,%xmm4 pxor %xmm11,%xmm8 pxor 0+80(%rbp),%xmm15 movdqu %xmm0,0 + 192(%rdi) movdqu %xmm4,16 
+ 192(%rdi) movdqu %xmm8,32 + 192(%rdi) movdqu %xmm15,48 + 192(%rdi) leaq 256(%rsi),%rsi leaq 256(%rdi),%rdi subq $256,%rbx jmp L$open_sse_main_loop L$open_sse_tail: testq %rbx,%rbx jz L$open_sse_finalize cmpq $192,%rbx ja L$open_sse_tail_256 cmpq $128,%rbx ja L$open_sse_tail_192 cmpq $64,%rbx ja L$open_sse_tail_128 movdqa L$chacha20_consts(%rip),%xmm0 movdqa 0+48(%rbp),%xmm4 movdqa 0+64(%rbp),%xmm8 movdqa 0+96(%rbp),%xmm12 paddd L$sse_inc(%rip),%xmm12 movdqa %xmm12,0+96(%rbp) xorq %r8,%r8 movq %rbx,%rcx cmpq $16,%rcx jb L$open_sse_tail_64_rounds L$open_sse_tail_64_rounds_and_x1hash: addq 0+0(%rsi,%r8,1),%r10 adcq 8+0(%rsi,%r8,1),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 subq $16,%rcx L$open_sse_tail_64_rounds: addq $16,%r8 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld $20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,4 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,12 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld $20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,12 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,4 cmpq $16,%rcx jae L$open_sse_tail_64_rounds_and_x1hash cmpq $160,%r8 jne L$open_sse_tail_64_rounds paddd L$chacha20_consts(%rip),%xmm0 paddd 0+48(%rbp),%xmm4 paddd 0+64(%rbp),%xmm8 paddd 0+96(%rbp),%xmm12 jmp L$open_sse_tail_64_dec_loop L$open_sse_tail_128: movdqa L$chacha20_consts(%rip),%xmm0 movdqa 0+48(%rbp),%xmm4 movdqa 0+64(%rbp),%xmm8 movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm5 movdqa %xmm8,%xmm9 movdqa 0+96(%rbp),%xmm13 paddd L$sse_inc(%rip),%xmm13 movdqa %xmm13,%xmm12 paddd L$sse_inc(%rip),%xmm12 movdqa %xmm12,0+96(%rbp) movdqa %xmm13,0+112(%rbp) movq %rbx,%rcx andq $-16,%rcx xorq %r8,%r8 L$open_sse_tail_128_rounds_and_x1hash: addq 0+0(%rsi,%r8,1),%r10 adcq 8+0(%rsi,%r8,1),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 L$open_sse_tail_128_rounds: addq $16,%r8 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld $20,%xmm4 pxor %xmm3,%xmm4 
paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,4 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,12 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb L$rol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $12,%xmm3 psrld $20,%xmm5 pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb L$rol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $7,%xmm3 psrld $25,%xmm5 pxor %xmm3,%xmm5 .byte 102,15,58,15,237,4 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,12 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld $20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,12 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,4 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb L$rol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $12,%xmm3 psrld $20,%xmm5 pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb L$rol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $7,%xmm3 psrld $25,%xmm5 pxor %xmm3,%xmm5 .byte 102,15,58,15,237,12 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,4 cmpq %rcx,%r8 jb L$open_sse_tail_128_rounds_and_x1hash cmpq $160,%r8 jne L$open_sse_tail_128_rounds paddd L$chacha20_consts(%rip),%xmm1 paddd 0+48(%rbp),%xmm5 paddd 0+64(%rbp),%xmm9 paddd 0+112(%rbp),%xmm13 paddd L$chacha20_consts(%rip),%xmm0 paddd 0+48(%rbp),%xmm4 paddd 0+64(%rbp),%xmm8 paddd 0+96(%rbp),%xmm12 movdqu 0 + 0(%rsi),%xmm3 movdqu 16 + 0(%rsi),%xmm7 movdqu 32 + 0(%rsi),%xmm11 movdqu 48 + 0(%rsi),%xmm15 pxor %xmm3,%xmm1 pxor %xmm7,%xmm5 pxor %xmm11,%xmm9 pxor %xmm13,%xmm15 movdqu %xmm1,0 + 0(%rdi) movdqu %xmm5,16 + 0(%rdi) movdqu %xmm9,32 + 0(%rdi) movdqu %xmm15,48 + 0(%rdi) subq $64,%rbx leaq 64(%rsi),%rsi leaq 64(%rdi),%rdi jmp L$open_sse_tail_64_dec_loop L$open_sse_tail_192: movdqa L$chacha20_consts(%rip),%xmm0 movdqa 0+48(%rbp),%xmm4 movdqa 0+64(%rbp),%xmm8 movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm5 movdqa %xmm8,%xmm9 movdqa %xmm0,%xmm2 movdqa %xmm4,%xmm6 movdqa %xmm8,%xmm10 movdqa 0+96(%rbp),%xmm14 paddd L$sse_inc(%rip),%xmm14 movdqa %xmm14,%xmm13 paddd L$sse_inc(%rip),%xmm13 movdqa %xmm13,%xmm12 paddd L$sse_inc(%rip),%xmm12 movdqa %xmm12,0+96(%rbp) movdqa %xmm13,0+112(%rbp) movdqa %xmm14,0+128(%rbp) movq %rbx,%rcx movq $160,%r8 cmpq $160,%rcx cmovgq %r8,%rcx andq $-16,%rcx xorq %r8,%r8 L$open_sse_tail_192_rounds_and_x1hash: addq 0+0(%rsi,%r8,1),%r10 adcq 8+0(%rsi,%r8,1),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 L$open_sse_tail_192_rounds: addq $16,%r8 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld 
$20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,4 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,12 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb L$rol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $12,%xmm3 psrld $20,%xmm5 pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb L$rol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $7,%xmm3 psrld $25,%xmm5 pxor %xmm3,%xmm5 .byte 102,15,58,15,237,4 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,12 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb L$rol16(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 pslld $12,%xmm3 psrld $20,%xmm6 pxor %xmm3,%xmm6 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb L$rol8(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 pslld $7,%xmm3 psrld $25,%xmm6 pxor %xmm3,%xmm6 .byte 102,15,58,15,246,4 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,12 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld $20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,12 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,4 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb L$rol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $12,%xmm3 psrld $20,%xmm5 pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb L$rol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $7,%xmm3 psrld $25,%xmm5 pxor %xmm3,%xmm5 .byte 102,15,58,15,237,12 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,4 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb L$rol16(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 pslld $12,%xmm3 psrld $20,%xmm6 pxor %xmm3,%xmm6 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb L$rol8(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 pslld $7,%xmm3 psrld $25,%xmm6 pxor %xmm3,%xmm6 .byte 102,15,58,15,246,12 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,4 cmpq %rcx,%r8 jb L$open_sse_tail_192_rounds_and_x1hash cmpq $160,%r8 jne L$open_sse_tail_192_rounds cmpq $176,%rbx jb L$open_sse_tail_192_finish addq 0+160(%rsi),%r10 adcq 8+160(%rsi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 cmpq $192,%rbx jb L$open_sse_tail_192_finish addq 0+176(%rsi),%r10 adcq 8+176(%rsi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 
adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 L$open_sse_tail_192_finish: paddd L$chacha20_consts(%rip),%xmm2 paddd 0+48(%rbp),%xmm6 paddd 0+64(%rbp),%xmm10 paddd 0+128(%rbp),%xmm14 paddd L$chacha20_consts(%rip),%xmm1 paddd 0+48(%rbp),%xmm5 paddd 0+64(%rbp),%xmm9 paddd 0+112(%rbp),%xmm13 paddd L$chacha20_consts(%rip),%xmm0 paddd 0+48(%rbp),%xmm4 paddd 0+64(%rbp),%xmm8 paddd 0+96(%rbp),%xmm12 movdqu 0 + 0(%rsi),%xmm3 movdqu 16 + 0(%rsi),%xmm7 movdqu 32 + 0(%rsi),%xmm11 movdqu 48 + 0(%rsi),%xmm15 pxor %xmm3,%xmm2 pxor %xmm7,%xmm6 pxor %xmm11,%xmm10 pxor %xmm14,%xmm15 movdqu %xmm2,0 + 0(%rdi) movdqu %xmm6,16 + 0(%rdi) movdqu %xmm10,32 + 0(%rdi) movdqu %xmm15,48 + 0(%rdi) movdqu 0 + 64(%rsi),%xmm3 movdqu 16 + 64(%rsi),%xmm7 movdqu 32 + 64(%rsi),%xmm11 movdqu 48 + 64(%rsi),%xmm15 pxor %xmm3,%xmm1 pxor %xmm7,%xmm5 pxor %xmm11,%xmm9 pxor %xmm13,%xmm15 movdqu %xmm1,0 + 64(%rdi) movdqu %xmm5,16 + 64(%rdi) movdqu %xmm9,32 + 64(%rdi) movdqu %xmm15,48 + 64(%rdi) subq $128,%rbx leaq 128(%rsi),%rsi leaq 128(%rdi),%rdi jmp L$open_sse_tail_64_dec_loop L$open_sse_tail_256: movdqa L$chacha20_consts(%rip),%xmm0 movdqa 0+48(%rbp),%xmm4 movdqa 0+64(%rbp),%xmm8 movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm5 movdqa %xmm8,%xmm9 movdqa %xmm0,%xmm2 movdqa %xmm4,%xmm6 movdqa %xmm8,%xmm10 movdqa %xmm0,%xmm3 movdqa %xmm4,%xmm7 movdqa %xmm8,%xmm11 movdqa 0+96(%rbp),%xmm15 paddd L$sse_inc(%rip),%xmm15 movdqa %xmm15,%xmm14 paddd L$sse_inc(%rip),%xmm14 movdqa %xmm14,%xmm13 paddd L$sse_inc(%rip),%xmm13 movdqa %xmm13,%xmm12 paddd L$sse_inc(%rip),%xmm12 movdqa %xmm12,0+96(%rbp) movdqa %xmm13,0+112(%rbp) movdqa %xmm14,0+128(%rbp) movdqa %xmm15,0+144(%rbp) xorq %r8,%r8 L$open_sse_tail_256_rounds_and_x1hash: addq 0+0(%rsi,%r8,1),%r10 adcq 8+0(%rsi,%r8,1),%r11 adcq $1,%r12 movdqa %xmm11,0+80(%rbp) paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm11 pslld $12,%xmm11 psrld $20,%xmm4 pxor %xmm11,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm11 pslld $7,%xmm11 psrld $25,%xmm4 pxor %xmm11,%xmm4 .byte 102,15,58,15,228,4 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,12 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb L$rol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm11 pslld $12,%xmm11 psrld $20,%xmm5 pxor %xmm11,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb L$rol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm11 pslld $7,%xmm11 psrld $25,%xmm5 pxor %xmm11,%xmm5 .byte 102,15,58,15,237,4 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,12 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb L$rol16(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm11 pslld $12,%xmm11 psrld $20,%xmm6 pxor %xmm11,%xmm6 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb L$rol8(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm11 pslld $7,%xmm11 psrld $25,%xmm6 pxor %xmm11,%xmm6 .byte 102,15,58,15,246,4 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,12 movdqa 0+80(%rbp),%xmm11 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movdqa %xmm9,0+80(%rbp) paddd %xmm7,%xmm3 pxor %xmm3,%xmm15 pshufb L$rol16(%rip),%xmm15 paddd %xmm15,%xmm11 pxor %xmm11,%xmm7 movdqa %xmm7,%xmm9 pslld 
$12,%xmm9 psrld $20,%xmm7 pxor %xmm9,%xmm7 paddd %xmm7,%xmm3 pxor %xmm3,%xmm15 pshufb L$rol8(%rip),%xmm15 paddd %xmm15,%xmm11 pxor %xmm11,%xmm7 movdqa %xmm7,%xmm9 pslld $7,%xmm9 psrld $25,%xmm7 pxor %xmm9,%xmm7 .byte 102,15,58,15,255,4 .byte 102,69,15,58,15,219,8 .byte 102,69,15,58,15,255,12 movdqa 0+80(%rbp),%xmm9 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx movdqa %xmm11,0+80(%rbp) paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm11 pslld $12,%xmm11 psrld $20,%xmm4 pxor %xmm11,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm11 pslld $7,%xmm11 psrld $25,%xmm4 pxor %xmm11,%xmm4 .byte 102,15,58,15,228,12 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,4 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb L$rol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm11 pslld $12,%xmm11 psrld $20,%xmm5 pxor %xmm11,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb L$rol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm11 pslld $7,%xmm11 psrld $25,%xmm5 pxor %xmm11,%xmm5 .byte 102,15,58,15,237,12 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,4 imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb L$rol16(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm11 pslld $12,%xmm11 psrld $20,%xmm6 pxor %xmm11,%xmm6 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb L$rol8(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm11 pslld $7,%xmm11 psrld $25,%xmm6 pxor %xmm11,%xmm6 .byte 102,15,58,15,246,12 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,4 movdqa 0+80(%rbp),%xmm11 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 movdqa %xmm9,0+80(%rbp) paddd %xmm7,%xmm3 pxor %xmm3,%xmm15 pshufb L$rol16(%rip),%xmm15 paddd %xmm15,%xmm11 pxor %xmm11,%xmm7 movdqa %xmm7,%xmm9 pslld $12,%xmm9 psrld $20,%xmm7 pxor %xmm9,%xmm7 paddd %xmm7,%xmm3 pxor %xmm3,%xmm15 pshufb L$rol8(%rip),%xmm15 paddd %xmm15,%xmm11 pxor %xmm11,%xmm7 movdqa %xmm7,%xmm9 pslld $7,%xmm9 psrld $25,%xmm7 pxor %xmm9,%xmm7 .byte 102,15,58,15,255,12 .byte 102,69,15,58,15,219,8 .byte 102,69,15,58,15,255,4 movdqa 0+80(%rbp),%xmm9 addq $16,%r8 cmpq $160,%r8 jb L$open_sse_tail_256_rounds_and_x1hash movq %rbx,%rcx andq $-16,%rcx L$open_sse_tail_256_hash: addq 0+0(%rsi,%r8,1),%r10 adcq 8+0(%rsi,%r8,1),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 addq $16,%r8 cmpq %rcx,%r8 jb L$open_sse_tail_256_hash paddd L$chacha20_consts(%rip),%xmm3 paddd 0+48(%rbp),%xmm7 paddd 0+64(%rbp),%xmm11 paddd 0+144(%rbp),%xmm15 paddd L$chacha20_consts(%rip),%xmm2 paddd 0+48(%rbp),%xmm6 paddd 0+64(%rbp),%xmm10 paddd 0+128(%rbp),%xmm14 paddd L$chacha20_consts(%rip),%xmm1 paddd 
0+48(%rbp),%xmm5 paddd 0+64(%rbp),%xmm9 paddd 0+112(%rbp),%xmm13 paddd L$chacha20_consts(%rip),%xmm0 paddd 0+48(%rbp),%xmm4 paddd 0+64(%rbp),%xmm8 paddd 0+96(%rbp),%xmm12 movdqa %xmm12,0+80(%rbp) movdqu 0 + 0(%rsi),%xmm12 pxor %xmm3,%xmm12 movdqu %xmm12,0 + 0(%rdi) movdqu 16 + 0(%rsi),%xmm12 pxor %xmm7,%xmm12 movdqu %xmm12,16 + 0(%rdi) movdqu 32 + 0(%rsi),%xmm12 pxor %xmm11,%xmm12 movdqu %xmm12,32 + 0(%rdi) movdqu 48 + 0(%rsi),%xmm12 pxor %xmm15,%xmm12 movdqu %xmm12,48 + 0(%rdi) movdqu 0 + 64(%rsi),%xmm3 movdqu 16 + 64(%rsi),%xmm7 movdqu 32 + 64(%rsi),%xmm11 movdqu 48 + 64(%rsi),%xmm15 pxor %xmm3,%xmm2 pxor %xmm7,%xmm6 pxor %xmm11,%xmm10 pxor %xmm14,%xmm15 movdqu %xmm2,0 + 64(%rdi) movdqu %xmm6,16 + 64(%rdi) movdqu %xmm10,32 + 64(%rdi) movdqu %xmm15,48 + 64(%rdi) movdqu 0 + 128(%rsi),%xmm3 movdqu 16 + 128(%rsi),%xmm7 movdqu 32 + 128(%rsi),%xmm11 movdqu 48 + 128(%rsi),%xmm15 pxor %xmm3,%xmm1 pxor %xmm7,%xmm5 pxor %xmm11,%xmm9 pxor %xmm13,%xmm15 movdqu %xmm1,0 + 128(%rdi) movdqu %xmm5,16 + 128(%rdi) movdqu %xmm9,32 + 128(%rdi) movdqu %xmm15,48 + 128(%rdi) movdqa 0+80(%rbp),%xmm12 subq $192,%rbx leaq 192(%rsi),%rsi leaq 192(%rdi),%rdi L$open_sse_tail_64_dec_loop: cmpq $16,%rbx jb L$open_sse_tail_16_init subq $16,%rbx movdqu (%rsi),%xmm3 pxor %xmm3,%xmm0 movdqu %xmm0,(%rdi) leaq 16(%rsi),%rsi leaq 16(%rdi),%rdi movdqa %xmm4,%xmm0 movdqa %xmm8,%xmm4 movdqa %xmm12,%xmm8 jmp L$open_sse_tail_64_dec_loop L$open_sse_tail_16_init: movdqa %xmm0,%xmm1 L$open_sse_tail_16: testq %rbx,%rbx jz L$open_sse_finalize pxor %xmm3,%xmm3 leaq -1(%rsi,%rbx,1),%rsi movq %rbx,%r8 L$open_sse_tail_16_compose: pslldq $1,%xmm3 pinsrb $0,(%rsi),%xmm3 subq $1,%rsi subq $1,%r8 jnz L$open_sse_tail_16_compose .byte 102,73,15,126,221 pextrq $1,%xmm3,%r14 pxor %xmm1,%xmm3 L$open_sse_tail_16_extract: pextrb $0,%xmm3,(%rdi) psrldq $1,%xmm3 addq $1,%rdi subq $1,%rbx jne L$open_sse_tail_16_extract addq %r13,%r10 adcq %r14,%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 L$open_sse_finalize: addq 0+0+32(%rbp),%r10 adcq 8+0+32(%rbp),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 movq %r10,%r13 movq %r11,%r14 movq %r12,%r15 subq $-5,%r10 sbbq $-1,%r11 sbbq $3,%r12 cmovcq %r13,%r10 cmovcq %r14,%r11 cmovcq %r15,%r12 addq 0+0+16(%rbp),%r10 adcq 8+0+16(%rbp),%r11 addq $288 + 0 + 32,%rsp popq %r9 movq %r10,(%r9) movq %r11,8(%r9) popq %r15 popq %r14 popq %r13 popq %r12 popq %rbx popq %rbp ret L$open_sse_128: movdqu L$chacha20_consts(%rip),%xmm0 movdqa %xmm0,%xmm1 movdqa %xmm0,%xmm2 movdqu 0(%r9),%xmm4 movdqa %xmm4,%xmm5 
movdqa %xmm4,%xmm6 movdqu 16(%r9),%xmm8 movdqa %xmm8,%xmm9 movdqa %xmm8,%xmm10 movdqu 32(%r9),%xmm12 movdqa %xmm12,%xmm13 paddd L$sse_inc(%rip),%xmm13 movdqa %xmm13,%xmm14 paddd L$sse_inc(%rip),%xmm14 movdqa %xmm4,%xmm7 movdqa %xmm8,%xmm11 movdqa %xmm13,%xmm15 movq $10,%r10 L$open_sse_128_rounds: paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld $20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,4 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,12 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb L$rol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $12,%xmm3 psrld $20,%xmm5 pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb L$rol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $7,%xmm3 psrld $25,%xmm5 pxor %xmm3,%xmm5 .byte 102,15,58,15,237,4 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,12 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb L$rol16(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 pslld $12,%xmm3 psrld $20,%xmm6 pxor %xmm3,%xmm6 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb L$rol8(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 pslld $7,%xmm3 psrld $25,%xmm6 pxor %xmm3,%xmm6 .byte 102,15,58,15,246,4 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,12 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld $20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,12 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,4 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb L$rol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $12,%xmm3 psrld $20,%xmm5 pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb L$rol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $7,%xmm3 psrld $25,%xmm5 pxor %xmm3,%xmm5 .byte 102,15,58,15,237,12 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,4 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb L$rol16(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 pslld $12,%xmm3 psrld $20,%xmm6 pxor %xmm3,%xmm6 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb L$rol8(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 pslld $7,%xmm3 psrld $25,%xmm6 pxor %xmm3,%xmm6 .byte 102,15,58,15,246,12 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,4 decq %r10 jnz L$open_sse_128_rounds paddd L$chacha20_consts(%rip),%xmm0 paddd L$chacha20_consts(%rip),%xmm1 paddd L$chacha20_consts(%rip),%xmm2 paddd %xmm7,%xmm4 paddd %xmm7,%xmm5 paddd %xmm7,%xmm6 paddd %xmm11,%xmm9 paddd %xmm11,%xmm10 paddd %xmm15,%xmm13 paddd L$sse_inc(%rip),%xmm15 paddd %xmm15,%xmm14 pand L$clamp(%rip),%xmm0 movdqa %xmm0,0+0(%rbp) movdqa %xmm4,0+16(%rbp) movq %r8,%r8 call poly_hash_ad_internal L$open_sse_128_xor_hash: cmpq $16,%rbx jb L$open_sse_tail_16 subq $16,%rbx addq 0+0(%rsi),%r10 adcq 8+0(%rsi),%r11 adcq $1,%r12 movdqu 0(%rsi),%xmm3 pxor %xmm3,%xmm1 movdqu %xmm1,0(%rdi) leaq 16(%rsi),%rsi leaq 16(%rdi),%rdi movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 
imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 movdqa %xmm5,%xmm1 movdqa %xmm9,%xmm5 movdqa %xmm13,%xmm9 movdqa %xmm2,%xmm13 movdqa %xmm6,%xmm2 movdqa %xmm10,%xmm6 movdqa %xmm14,%xmm10 jmp L$open_sse_128_xor_hash .globl _chacha20_poly1305_seal_sse41 .private_extern _chacha20_poly1305_seal_sse41 .p2align 6 _chacha20_poly1305_seal_sse41: _CET_ENDBR pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 pushq %r9 subq $288 + 0 + 32,%rsp leaq 32(%rsp),%rbp andq $-32,%rbp movq 56(%r9),%rbx addq %rdx,%rbx movq %r8,0+0+32(%rbp) movq %rbx,8+0+32(%rbp) movq %rdx,%rbx cmpq $128,%rbx jbe L$seal_sse_128 movdqa L$chacha20_consts(%rip),%xmm0 movdqu 0(%r9),%xmm4 movdqu 16(%r9),%xmm8 movdqu 32(%r9),%xmm12 movdqa %xmm0,%xmm1 movdqa %xmm0,%xmm2 movdqa %xmm0,%xmm3 movdqa %xmm4,%xmm5 movdqa %xmm4,%xmm6 movdqa %xmm4,%xmm7 movdqa %xmm8,%xmm9 movdqa %xmm8,%xmm10 movdqa %xmm8,%xmm11 movdqa %xmm12,%xmm15 paddd L$sse_inc(%rip),%xmm12 movdqa %xmm12,%xmm14 paddd L$sse_inc(%rip),%xmm12 movdqa %xmm12,%xmm13 paddd L$sse_inc(%rip),%xmm12 movdqa %xmm4,0+48(%rbp) movdqa %xmm8,0+64(%rbp) movdqa %xmm12,0+96(%rbp) movdqa %xmm13,0+112(%rbp) movdqa %xmm14,0+128(%rbp) movdqa %xmm15,0+144(%rbp) movq $10,%r10 L$seal_sse_init_rounds: movdqa %xmm8,0+80(%rbp) movdqa L$rol16(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 paddd %xmm4,%xmm0 pxor %xmm3,%xmm15 pxor %xmm2,%xmm14 pxor %xmm1,%xmm13 pxor %xmm0,%xmm12 .byte 102,69,15,56,0,248 .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 paddd %xmm12,%xmm8 pxor %xmm11,%xmm7 pxor %xmm10,%xmm6 pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm7 pxor %xmm8,%xmm7 movdqa %xmm6,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm6 pxor %xmm8,%xmm6 movdqa %xmm5,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm5 pxor %xmm8,%xmm5 movdqa %xmm4,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm4 pxor %xmm8,%xmm4 movdqa L$rol8(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 paddd %xmm4,%xmm0 pxor %xmm3,%xmm15 pxor %xmm2,%xmm14 pxor %xmm1,%xmm13 pxor %xmm0,%xmm12 .byte 102,69,15,56,0,248 .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 paddd %xmm12,%xmm8 pxor %xmm11,%xmm7 pxor %xmm10,%xmm6 pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm7 pxor %xmm8,%xmm7 movdqa %xmm6,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm6 pxor %xmm8,%xmm6 movdqa %xmm5,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm5 pxor %xmm8,%xmm5 movdqa %xmm4,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm4 pxor %xmm8,%xmm4 movdqa 0+80(%rbp),%xmm8 .byte 102,15,58,15,255,4 .byte 102,69,15,58,15,219,8 .byte 102,69,15,58,15,255,12 .byte 102,15,58,15,246,4 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,12 .byte 102,15,58,15,237,4 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,12 .byte 102,15,58,15,228,4 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,12 movdqa %xmm8,0+80(%rbp) movdqa L$rol16(%rip),%xmm8 paddd 
%xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 paddd %xmm4,%xmm0 pxor %xmm3,%xmm15 pxor %xmm2,%xmm14 pxor %xmm1,%xmm13 pxor %xmm0,%xmm12 .byte 102,69,15,56,0,248 .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 paddd %xmm12,%xmm8 pxor %xmm11,%xmm7 pxor %xmm10,%xmm6 pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm7 pxor %xmm8,%xmm7 movdqa %xmm6,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm6 pxor %xmm8,%xmm6 movdqa %xmm5,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm5 pxor %xmm8,%xmm5 movdqa %xmm4,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm4 pxor %xmm8,%xmm4 movdqa L$rol8(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 paddd %xmm4,%xmm0 pxor %xmm3,%xmm15 pxor %xmm2,%xmm14 pxor %xmm1,%xmm13 pxor %xmm0,%xmm12 .byte 102,69,15,56,0,248 .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 paddd %xmm12,%xmm8 pxor %xmm11,%xmm7 pxor %xmm10,%xmm6 pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm7 pxor %xmm8,%xmm7 movdqa %xmm6,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm6 pxor %xmm8,%xmm6 movdqa %xmm5,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm5 pxor %xmm8,%xmm5 movdqa %xmm4,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm4 pxor %xmm8,%xmm4 movdqa 0+80(%rbp),%xmm8 .byte 102,15,58,15,255,12 .byte 102,69,15,58,15,219,8 .byte 102,69,15,58,15,255,4 .byte 102,15,58,15,246,12 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,4 .byte 102,15,58,15,237,12 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,4 .byte 102,15,58,15,228,12 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,4 decq %r10 jnz L$seal_sse_init_rounds paddd L$chacha20_consts(%rip),%xmm3 paddd 0+48(%rbp),%xmm7 paddd 0+64(%rbp),%xmm11 paddd 0+144(%rbp),%xmm15 paddd L$chacha20_consts(%rip),%xmm2 paddd 0+48(%rbp),%xmm6 paddd 0+64(%rbp),%xmm10 paddd 0+128(%rbp),%xmm14 paddd L$chacha20_consts(%rip),%xmm1 paddd 0+48(%rbp),%xmm5 paddd 0+64(%rbp),%xmm9 paddd 0+112(%rbp),%xmm13 paddd L$chacha20_consts(%rip),%xmm0 paddd 0+48(%rbp),%xmm4 paddd 0+64(%rbp),%xmm8 paddd 0+96(%rbp),%xmm12 pand L$clamp(%rip),%xmm3 movdqa %xmm3,0+0(%rbp) movdqa %xmm7,0+16(%rbp) movq %r8,%r8 call poly_hash_ad_internal movdqu 0 + 0(%rsi),%xmm3 movdqu 16 + 0(%rsi),%xmm7 movdqu 32 + 0(%rsi),%xmm11 movdqu 48 + 0(%rsi),%xmm15 pxor %xmm3,%xmm2 pxor %xmm7,%xmm6 pxor %xmm11,%xmm10 pxor %xmm14,%xmm15 movdqu %xmm2,0 + 0(%rdi) movdqu %xmm6,16 + 0(%rdi) movdqu %xmm10,32 + 0(%rdi) movdqu %xmm15,48 + 0(%rdi) movdqu 0 + 64(%rsi),%xmm3 movdqu 16 + 64(%rsi),%xmm7 movdqu 32 + 64(%rsi),%xmm11 movdqu 48 + 64(%rsi),%xmm15 pxor %xmm3,%xmm1 pxor %xmm7,%xmm5 pxor %xmm11,%xmm9 pxor %xmm13,%xmm15 movdqu %xmm1,0 + 64(%rdi) movdqu %xmm5,16 + 64(%rdi) movdqu %xmm9,32 + 64(%rdi) movdqu %xmm15,48 + 64(%rdi) cmpq $192,%rbx ja L$seal_sse_main_init movq $128,%rcx subq $128,%rbx leaq 128(%rsi),%rsi jmp L$seal_sse_128_tail_hash L$seal_sse_main_init: movdqu 0 + 128(%rsi),%xmm3 movdqu 16 + 128(%rsi),%xmm7 movdqu 32 + 128(%rsi),%xmm11 movdqu 48 + 128(%rsi),%xmm15 pxor %xmm3,%xmm0 pxor %xmm7,%xmm4 pxor %xmm11,%xmm8 pxor %xmm12,%xmm15 movdqu %xmm0,0 + 128(%rdi) movdqu %xmm4,16 + 128(%rdi) movdqu %xmm8,32 + 128(%rdi) movdqu %xmm15,48 + 128(%rdi) movq $192,%rcx subq $192,%rbx leaq 192(%rsi),%rsi movq $2,%rcx movq $8,%r8 cmpq $64,%rbx jbe L$seal_sse_tail_64 cmpq $128,%rbx jbe L$seal_sse_tail_128 
cmpq $192,%rbx jbe L$seal_sse_tail_192 L$seal_sse_main_loop: movdqa L$chacha20_consts(%rip),%xmm0 movdqa 0+48(%rbp),%xmm4 movdqa 0+64(%rbp),%xmm8 movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm5 movdqa %xmm8,%xmm9 movdqa %xmm0,%xmm2 movdqa %xmm4,%xmm6 movdqa %xmm8,%xmm10 movdqa %xmm0,%xmm3 movdqa %xmm4,%xmm7 movdqa %xmm8,%xmm11 movdqa 0+96(%rbp),%xmm15 paddd L$sse_inc(%rip),%xmm15 movdqa %xmm15,%xmm14 paddd L$sse_inc(%rip),%xmm14 movdqa %xmm14,%xmm13 paddd L$sse_inc(%rip),%xmm13 movdqa %xmm13,%xmm12 paddd L$sse_inc(%rip),%xmm12 movdqa %xmm12,0+96(%rbp) movdqa %xmm13,0+112(%rbp) movdqa %xmm14,0+128(%rbp) movdqa %xmm15,0+144(%rbp) .p2align 5 L$seal_sse_main_rounds: movdqa %xmm8,0+80(%rbp) movdqa L$rol16(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 paddd %xmm4,%xmm0 pxor %xmm3,%xmm15 pxor %xmm2,%xmm14 pxor %xmm1,%xmm13 pxor %xmm0,%xmm12 .byte 102,69,15,56,0,248 .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 paddd %xmm12,%xmm8 pxor %xmm11,%xmm7 addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 pxor %xmm10,%xmm6 pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm7 pxor %xmm8,%xmm7 movdqa %xmm6,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm6 pxor %xmm8,%xmm6 movdqa %xmm5,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm5 pxor %xmm8,%xmm5 movdqa %xmm4,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm4 pxor %xmm8,%xmm4 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movdqa L$rol8(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 paddd %xmm4,%xmm0 pxor %xmm3,%xmm15 pxor %xmm2,%xmm14 pxor %xmm1,%xmm13 pxor %xmm0,%xmm12 .byte 102,69,15,56,0,248 .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 paddd %xmm12,%xmm8 pxor %xmm11,%xmm7 pxor %xmm10,%xmm6 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm7 pxor %xmm8,%xmm7 movdqa %xmm6,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm6 pxor %xmm8,%xmm6 movdqa %xmm5,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm5 pxor %xmm8,%xmm5 movdqa %xmm4,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm4 pxor %xmm8,%xmm4 movdqa 0+80(%rbp),%xmm8 imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 .byte 102,15,58,15,255,4 .byte 102,69,15,58,15,219,8 .byte 102,69,15,58,15,255,12 .byte 102,15,58,15,246,4 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,12 .byte 102,15,58,15,237,4 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,12 .byte 102,15,58,15,228,4 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,12 movdqa %xmm8,0+80(%rbp) movdqa L$rol16(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 paddd %xmm4,%xmm0 pxor %xmm3,%xmm15 pxor %xmm2,%xmm14 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 pxor %xmm1,%xmm13 pxor %xmm0,%xmm12 .byte 102,69,15,56,0,248 .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 paddd %xmm12,%xmm8 pxor 
%xmm11,%xmm7 pxor %xmm10,%xmm6 pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm7 pxor %xmm8,%xmm7 movdqa %xmm6,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm6 pxor %xmm8,%xmm6 movdqa %xmm5,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm5 pxor %xmm8,%xmm5 movdqa %xmm4,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm4 pxor %xmm8,%xmm4 movdqa L$rol8(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 paddd %xmm4,%xmm0 pxor %xmm3,%xmm15 pxor %xmm2,%xmm14 pxor %xmm1,%xmm13 pxor %xmm0,%xmm12 .byte 102,69,15,56,0,248 .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 paddd %xmm12,%xmm8 pxor %xmm11,%xmm7 pxor %xmm10,%xmm6 pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm7 pxor %xmm8,%xmm7 movdqa %xmm6,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm6 pxor %xmm8,%xmm6 movdqa %xmm5,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm5 pxor %xmm8,%xmm5 movdqa %xmm4,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm4 pxor %xmm8,%xmm4 movdqa 0+80(%rbp),%xmm8 .byte 102,15,58,15,255,12 .byte 102,69,15,58,15,219,8 .byte 102,69,15,58,15,255,4 .byte 102,15,58,15,246,12 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,4 .byte 102,15,58,15,237,12 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,4 .byte 102,15,58,15,228,12 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,4 leaq 16(%rdi),%rdi decq %r8 jge L$seal_sse_main_rounds addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rdi),%rdi decq %rcx jg L$seal_sse_main_rounds paddd L$chacha20_consts(%rip),%xmm3 paddd 0+48(%rbp),%xmm7 paddd 0+64(%rbp),%xmm11 paddd 0+144(%rbp),%xmm15 paddd L$chacha20_consts(%rip),%xmm2 paddd 0+48(%rbp),%xmm6 paddd 0+64(%rbp),%xmm10 paddd 0+128(%rbp),%xmm14 paddd L$chacha20_consts(%rip),%xmm1 paddd 0+48(%rbp),%xmm5 paddd 0+64(%rbp),%xmm9 paddd 0+112(%rbp),%xmm13 paddd L$chacha20_consts(%rip),%xmm0 paddd 0+48(%rbp),%xmm4 paddd 0+64(%rbp),%xmm8 paddd 0+96(%rbp),%xmm12 movdqa %xmm14,0+80(%rbp) movdqa %xmm14,0+80(%rbp) movdqu 0 + 0(%rsi),%xmm14 pxor %xmm3,%xmm14 movdqu %xmm14,0 + 0(%rdi) movdqu 16 + 0(%rsi),%xmm14 pxor %xmm7,%xmm14 movdqu %xmm14,16 + 0(%rdi) movdqu 32 + 0(%rsi),%xmm14 pxor %xmm11,%xmm14 movdqu %xmm14,32 + 0(%rdi) movdqu 48 + 0(%rsi),%xmm14 pxor %xmm15,%xmm14 movdqu %xmm14,48 + 0(%rdi) movdqa 0+80(%rbp),%xmm14 movdqu 0 + 64(%rsi),%xmm3 movdqu 16 + 64(%rsi),%xmm7 movdqu 32 + 64(%rsi),%xmm11 movdqu 48 + 64(%rsi),%xmm15 pxor %xmm3,%xmm2 pxor %xmm7,%xmm6 pxor %xmm11,%xmm10 pxor %xmm14,%xmm15 movdqu %xmm2,0 + 64(%rdi) movdqu %xmm6,16 + 64(%rdi) movdqu %xmm10,32 + 64(%rdi) movdqu %xmm15,48 + 64(%rdi) movdqu 0 + 128(%rsi),%xmm3 movdqu 16 + 128(%rsi),%xmm7 movdqu 32 + 128(%rsi),%xmm11 movdqu 48 + 128(%rsi),%xmm15 pxor %xmm3,%xmm1 pxor %xmm7,%xmm5 pxor %xmm11,%xmm9 pxor %xmm13,%xmm15 movdqu %xmm1,0 + 128(%rdi) movdqu %xmm5,16 + 128(%rdi) movdqu %xmm9,32 + 128(%rdi) movdqu %xmm15,48 
+ 128(%rdi) cmpq $256,%rbx ja L$seal_sse_main_loop_xor movq $192,%rcx subq $192,%rbx leaq 192(%rsi),%rsi jmp L$seal_sse_128_tail_hash L$seal_sse_main_loop_xor: movdqu 0 + 192(%rsi),%xmm3 movdqu 16 + 192(%rsi),%xmm7 movdqu 32 + 192(%rsi),%xmm11 movdqu 48 + 192(%rsi),%xmm15 pxor %xmm3,%xmm0 pxor %xmm7,%xmm4 pxor %xmm11,%xmm8 pxor %xmm12,%xmm15 movdqu %xmm0,0 + 192(%rdi) movdqu %xmm4,16 + 192(%rdi) movdqu %xmm8,32 + 192(%rdi) movdqu %xmm15,48 + 192(%rdi) leaq 256(%rsi),%rsi subq $256,%rbx movq $6,%rcx movq $4,%r8 cmpq $192,%rbx jg L$seal_sse_main_loop movq %rbx,%rcx testq %rbx,%rbx je L$seal_sse_128_tail_hash movq $6,%rcx cmpq $128,%rbx ja L$seal_sse_tail_192 cmpq $64,%rbx ja L$seal_sse_tail_128 L$seal_sse_tail_64: movdqa L$chacha20_consts(%rip),%xmm0 movdqa 0+48(%rbp),%xmm4 movdqa 0+64(%rbp),%xmm8 movdqa 0+96(%rbp),%xmm12 paddd L$sse_inc(%rip),%xmm12 movdqa %xmm12,0+96(%rbp) L$seal_sse_tail_64_rounds_and_x2hash: addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rdi),%rdi L$seal_sse_tail_64_rounds_and_x1hash: paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld $20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,4 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,12 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld $20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,12 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,4 addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rdi),%rdi decq %rcx jg L$seal_sse_tail_64_rounds_and_x2hash decq %r8 jge L$seal_sse_tail_64_rounds_and_x1hash paddd L$chacha20_consts(%rip),%xmm0 paddd 0+48(%rbp),%xmm4 paddd 0+64(%rbp),%xmm8 paddd 0+96(%rbp),%xmm12 jmp L$seal_sse_128_tail_xor L$seal_sse_tail_128: movdqa L$chacha20_consts(%rip),%xmm0 movdqa 0+48(%rbp),%xmm4 movdqa 0+64(%rbp),%xmm8 movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm5 movdqa %xmm8,%xmm9 movdqa 0+96(%rbp),%xmm13 paddd L$sse_inc(%rip),%xmm13 movdqa %xmm13,%xmm12 paddd L$sse_inc(%rip),%xmm12 movdqa 
%xmm12,0+96(%rbp) movdqa %xmm13,0+112(%rbp) L$seal_sse_tail_128_rounds_and_x2hash: addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rdi),%rdi L$seal_sse_tail_128_rounds_and_x1hash: paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld $20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,4 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,12 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb L$rol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $12,%xmm3 psrld $20,%xmm5 pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb L$rol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $7,%xmm3 psrld $25,%xmm5 pxor %xmm3,%xmm5 .byte 102,15,58,15,237,4 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,12 addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld $20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,12 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,4 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb L$rol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $12,%xmm3 psrld $20,%xmm5 pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb L$rol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $7,%xmm3 psrld $25,%xmm5 pxor %xmm3,%xmm5 .byte 102,15,58,15,237,12 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,4 leaq 16(%rdi),%rdi decq %rcx jg L$seal_sse_tail_128_rounds_and_x2hash decq %r8 jge L$seal_sse_tail_128_rounds_and_x1hash paddd L$chacha20_consts(%rip),%xmm1 paddd 0+48(%rbp),%xmm5 paddd 0+64(%rbp),%xmm9 paddd 0+112(%rbp),%xmm13 paddd L$chacha20_consts(%rip),%xmm0 paddd 0+48(%rbp),%xmm4 paddd 0+64(%rbp),%xmm8 paddd 0+96(%rbp),%xmm12 movdqu 0 + 0(%rsi),%xmm3 movdqu 16 + 0(%rsi),%xmm7 movdqu 32 + 0(%rsi),%xmm11 movdqu 48 + 0(%rsi),%xmm15 pxor %xmm3,%xmm1 pxor %xmm7,%xmm5 pxor %xmm11,%xmm9 pxor %xmm13,%xmm15 movdqu %xmm1,0 + 0(%rdi) movdqu 
%xmm5,16 + 0(%rdi) movdqu %xmm9,32 + 0(%rdi) movdqu %xmm15,48 + 0(%rdi) movq $64,%rcx subq $64,%rbx leaq 64(%rsi),%rsi jmp L$seal_sse_128_tail_hash L$seal_sse_tail_192: movdqa L$chacha20_consts(%rip),%xmm0 movdqa 0+48(%rbp),%xmm4 movdqa 0+64(%rbp),%xmm8 movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm5 movdqa %xmm8,%xmm9 movdqa %xmm0,%xmm2 movdqa %xmm4,%xmm6 movdqa %xmm8,%xmm10 movdqa 0+96(%rbp),%xmm14 paddd L$sse_inc(%rip),%xmm14 movdqa %xmm14,%xmm13 paddd L$sse_inc(%rip),%xmm13 movdqa %xmm13,%xmm12 paddd L$sse_inc(%rip),%xmm12 movdqa %xmm12,0+96(%rbp) movdqa %xmm13,0+112(%rbp) movdqa %xmm14,0+128(%rbp) L$seal_sse_tail_192_rounds_and_x2hash: addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rdi),%rdi L$seal_sse_tail_192_rounds_and_x1hash: paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld $20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,4 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,12 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb L$rol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $12,%xmm3 psrld $20,%xmm5 pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb L$rol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $7,%xmm3 psrld $25,%xmm5 pxor %xmm3,%xmm5 .byte 102,15,58,15,237,4 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,12 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb L$rol16(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 pslld $12,%xmm3 psrld $20,%xmm6 pxor %xmm3,%xmm6 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb L$rol8(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 pslld $7,%xmm3 psrld $25,%xmm6 pxor %xmm3,%xmm6 .byte 102,15,58,15,246,4 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,12 addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld $20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,12 .byte 102,69,15,58,15,192,8 .byte 
102,69,15,58,15,228,4 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb L$rol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $12,%xmm3 psrld $20,%xmm5 pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb L$rol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $7,%xmm3 psrld $25,%xmm5 pxor %xmm3,%xmm5 .byte 102,15,58,15,237,12 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,4 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb L$rol16(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 pslld $12,%xmm3 psrld $20,%xmm6 pxor %xmm3,%xmm6 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb L$rol8(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 pslld $7,%xmm3 psrld $25,%xmm6 pxor %xmm3,%xmm6 .byte 102,15,58,15,246,12 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,4 leaq 16(%rdi),%rdi decq %rcx jg L$seal_sse_tail_192_rounds_and_x2hash decq %r8 jge L$seal_sse_tail_192_rounds_and_x1hash paddd L$chacha20_consts(%rip),%xmm2 paddd 0+48(%rbp),%xmm6 paddd 0+64(%rbp),%xmm10 paddd 0+128(%rbp),%xmm14 paddd L$chacha20_consts(%rip),%xmm1 paddd 0+48(%rbp),%xmm5 paddd 0+64(%rbp),%xmm9 paddd 0+112(%rbp),%xmm13 paddd L$chacha20_consts(%rip),%xmm0 paddd 0+48(%rbp),%xmm4 paddd 0+64(%rbp),%xmm8 paddd 0+96(%rbp),%xmm12 movdqu 0 + 0(%rsi),%xmm3 movdqu 16 + 0(%rsi),%xmm7 movdqu 32 + 0(%rsi),%xmm11 movdqu 48 + 0(%rsi),%xmm15 pxor %xmm3,%xmm2 pxor %xmm7,%xmm6 pxor %xmm11,%xmm10 pxor %xmm14,%xmm15 movdqu %xmm2,0 + 0(%rdi) movdqu %xmm6,16 + 0(%rdi) movdqu %xmm10,32 + 0(%rdi) movdqu %xmm15,48 + 0(%rdi) movdqu 0 + 64(%rsi),%xmm3 movdqu 16 + 64(%rsi),%xmm7 movdqu 32 + 64(%rsi),%xmm11 movdqu 48 + 64(%rsi),%xmm15 pxor %xmm3,%xmm1 pxor %xmm7,%xmm5 pxor %xmm11,%xmm9 pxor %xmm13,%xmm15 movdqu %xmm1,0 + 64(%rdi) movdqu %xmm5,16 + 64(%rdi) movdqu %xmm9,32 + 64(%rdi) movdqu %xmm15,48 + 64(%rdi) movq $128,%rcx subq $128,%rbx leaq 128(%rsi),%rsi L$seal_sse_128_tail_hash: cmpq $16,%rcx jb L$seal_sse_128_tail_xor addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 subq $16,%rcx leaq 16(%rdi),%rdi jmp L$seal_sse_128_tail_hash L$seal_sse_128_tail_xor: cmpq $16,%rbx jb L$seal_sse_tail_16 subq $16,%rbx movdqu 0(%rsi),%xmm3 pxor %xmm3,%xmm0 movdqu %xmm0,0(%rdi) addq 0(%rdi),%r10 adcq 8(%rdi),%r11 adcq $1,%r12 leaq 16(%rsi),%rsi leaq 16(%rdi),%rdi movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 movdqa %xmm4,%xmm0 movdqa %xmm8,%xmm4 movdqa %xmm12,%xmm8 movdqa %xmm1,%xmm12 movdqa %xmm5,%xmm1 movdqa %xmm9,%xmm5 movdqa 
%xmm13,%xmm9 jmp L$seal_sse_128_tail_xor L$seal_sse_tail_16: testq %rbx,%rbx jz L$process_blocks_of_extra_in movq %rbx,%r8 movq %rbx,%rcx leaq -1(%rsi,%rbx,1),%rsi pxor %xmm15,%xmm15 L$seal_sse_tail_16_compose: pslldq $1,%xmm15 pinsrb $0,(%rsi),%xmm15 leaq -1(%rsi),%rsi decq %rcx jne L$seal_sse_tail_16_compose pxor %xmm0,%xmm15 movq %rbx,%rcx movdqu %xmm15,%xmm0 L$seal_sse_tail_16_extract: pextrb $0,%xmm0,(%rdi) psrldq $1,%xmm0 addq $1,%rdi subq $1,%rcx jnz L$seal_sse_tail_16_extract movq 288 + 0 + 32(%rsp),%r9 movq 56(%r9),%r14 movq 48(%r9),%r13 testq %r14,%r14 jz L$process_partial_block movq $16,%r15 subq %rbx,%r15 cmpq %r15,%r14 jge L$load_extra_in movq %r14,%r15 L$load_extra_in: leaq -1(%r13,%r15,1),%rsi addq %r15,%r13 subq %r15,%r14 movq %r13,48(%r9) movq %r14,56(%r9) addq %r15,%r8 pxor %xmm11,%xmm11 L$load_extra_load_loop: pslldq $1,%xmm11 pinsrb $0,(%rsi),%xmm11 leaq -1(%rsi),%rsi subq $1,%r15 jnz L$load_extra_load_loop movq %rbx,%r15 L$load_extra_shift_loop: pslldq $1,%xmm11 subq $1,%r15 jnz L$load_extra_shift_loop leaq L$and_masks(%rip),%r15 shlq $4,%rbx pand -16(%r15,%rbx,1),%xmm15 por %xmm11,%xmm15 .byte 102,77,15,126,253 pextrq $1,%xmm15,%r14 addq %r13,%r10 adcq %r14,%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 L$process_blocks_of_extra_in: movq 288+32+0 (%rsp),%r9 movq 48(%r9),%rsi movq 56(%r9),%r8 movq %r8,%rcx shrq $4,%r8 L$process_extra_hash_loop: jz process_extra_in_trailer addq 0+0(%rsi),%r10 adcq 8+0(%rsi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rsi),%rsi subq $1,%r8 jmp L$process_extra_hash_loop process_extra_in_trailer: andq $15,%rcx movq %rcx,%rbx jz L$do_length_block leaq -1(%rsi,%rcx,1),%rsi L$process_extra_in_trailer_load: pslldq $1,%xmm15 pinsrb $0,(%rsi),%xmm15 leaq -1(%rsi),%rsi subq $1,%rcx jnz L$process_extra_in_trailer_load L$process_partial_block: leaq L$and_masks(%rip),%r15 shlq $4,%rbx pand -16(%r15,%rbx,1),%xmm15 .byte 102,77,15,126,253 pextrq $1,%xmm15,%r14 addq %r13,%r10 adcq %r14,%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 
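/* Poly1305 step (scalar mulq form): the accumulator lives in %r10:%r11:%r12 and the
   key r (clamped when it was derived) at 0(%rbp)/8(%rbp).  After the multiply, the
   limbs above 2^130 are folded back using 2^130 = 5 (mod 2^130-5): the andq $-4 /
   shrdq $2 / add-with-carry sequence computes 4*t + t for t = h >> 130 and the
   adds around this comment fold that 5*t back into the low limbs. */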
adcq %r9,%r11 adcq $0,%r12 L$do_length_block: addq 0+0+32(%rbp),%r10 adcq 8+0+32(%rbp),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 movq %r10,%r13 movq %r11,%r14 movq %r12,%r15 subq $-5,%r10 sbbq $-1,%r11 sbbq $3,%r12 cmovcq %r13,%r10 cmovcq %r14,%r11 cmovcq %r15,%r12 addq 0+0+16(%rbp),%r10 adcq 8+0+16(%rbp),%r11 addq $288 + 0 + 32,%rsp popq %r9 movq %r10,(%r9) movq %r11,8(%r9) popq %r15 popq %r14 popq %r13 popq %r12 popq %rbx popq %rbp ret L$seal_sse_128: movdqu L$chacha20_consts(%rip),%xmm0 movdqa %xmm0,%xmm1 movdqa %xmm0,%xmm2 movdqu 0(%r9),%xmm4 movdqa %xmm4,%xmm5 movdqa %xmm4,%xmm6 movdqu 16(%r9),%xmm8 movdqa %xmm8,%xmm9 movdqa %xmm8,%xmm10 movdqu 32(%r9),%xmm14 movdqa %xmm14,%xmm12 paddd L$sse_inc(%rip),%xmm12 movdqa %xmm12,%xmm13 paddd L$sse_inc(%rip),%xmm13 movdqa %xmm4,%xmm7 movdqa %xmm8,%xmm11 movdqa %xmm12,%xmm15 movq $10,%r10 L$seal_sse_128_rounds: paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld $20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,4 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,12 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb L$rol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $12,%xmm3 psrld $20,%xmm5 pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb L$rol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $7,%xmm3 psrld $25,%xmm5 pxor %xmm3,%xmm5 .byte 102,15,58,15,237,4 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,12 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb L$rol16(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 pslld $12,%xmm3 psrld $20,%xmm6 pxor %xmm3,%xmm6 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb L$rol8(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 pslld $7,%xmm3 psrld $25,%xmm6 pxor %xmm3,%xmm6 .byte 102,15,58,15,246,4 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,12 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $12,%xmm3 psrld $20,%xmm4 pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 pslld $7,%xmm3 psrld $25,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,15,228,12 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,4 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb L$rol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $12,%xmm3 psrld $20,%xmm5 pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 pshufb L$rol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 pslld $7,%xmm3 psrld $25,%xmm5 pxor %xmm3,%xmm5 .byte 102,15,58,15,237,12 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,4 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb L$rol16(%rip),%xmm14 paddd 
%xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 pslld $12,%xmm3 psrld $20,%xmm6 pxor %xmm3,%xmm6 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 pshufb L$rol8(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 pslld $7,%xmm3 psrld $25,%xmm6 pxor %xmm3,%xmm6 .byte 102,15,58,15,246,12 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,4 decq %r10 jnz L$seal_sse_128_rounds paddd L$chacha20_consts(%rip),%xmm0 paddd L$chacha20_consts(%rip),%xmm1 paddd L$chacha20_consts(%rip),%xmm2 paddd %xmm7,%xmm4 paddd %xmm7,%xmm5 paddd %xmm7,%xmm6 paddd %xmm11,%xmm8 paddd %xmm11,%xmm9 paddd %xmm15,%xmm12 paddd L$sse_inc(%rip),%xmm15 paddd %xmm15,%xmm13 pand L$clamp(%rip),%xmm2 movdqa %xmm2,0+0(%rbp) movdqa %xmm6,0+16(%rbp) movq %r8,%r8 call poly_hash_ad_internal jmp L$seal_sse_128_tail_xor .globl _chacha20_poly1305_open_avx2 .private_extern _chacha20_poly1305_open_avx2 .p2align 6 _chacha20_poly1305_open_avx2: _CET_ENDBR pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 pushq %r9 subq $288 + 0 + 32,%rsp leaq 32(%rsp),%rbp andq $-32,%rbp movq %rdx,%rbx movq %r8,0+0+32(%rbp) movq %rbx,8+0+32(%rbp) vzeroupper vmovdqa L$chacha20_consts(%rip),%ymm0 vbroadcasti128 0(%r9),%ymm4 vbroadcasti128 16(%r9),%ymm8 vbroadcasti128 32(%r9),%ymm12 vpaddd L$avx2_init(%rip),%ymm12,%ymm12 cmpq $192,%rbx jbe L$open_avx2_192 cmpq $320,%rbx jbe L$open_avx2_320 vmovdqa %ymm4,0+64(%rbp) vmovdqa %ymm8,0+96(%rbp) vmovdqa %ymm12,0+160(%rbp) movq $10,%r10 L$open_avx2_init_rounds: vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $12,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm4,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $4,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm4,%ymm4,%ymm4 decq %r10 jne L$open_avx2_init_rounds vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 vpaddd 0+64(%rbp),%ymm4,%ymm4 vpaddd 0+96(%rbp),%ymm8,%ymm8 vpaddd 0+160(%rbp),%ymm12,%ymm12 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 vpand L$clamp(%rip),%ymm3,%ymm3 vmovdqa %ymm3,0+0(%rbp) vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 movq %r8,%r8 call poly_hash_ad_internal xorq %rcx,%rcx L$open_avx2_init_hash: addq 0+0(%rsi,%rcx,1),%r10 adcq 8+0(%rsi,%rcx,1),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 addq $16,%rcx cmpq $64,%rcx 
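/* L$open_avx2_init_hash: absorb the first 64 bytes of ciphertext into Poly1305,
   16 bytes per iteration (%rcx = 0,16,32,48).  The compare above and branch below
   loop until all 64 bytes are hashed; only then are they decrypted, by XORing the
   keystream left in %ymm0/%ymm4 by the init block. */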
jne L$open_avx2_init_hash vpxor 0(%rsi),%ymm0,%ymm0 vpxor 32(%rsi),%ymm4,%ymm4 vmovdqu %ymm0,0(%rdi) vmovdqu %ymm4,32(%rdi) leaq 64(%rsi),%rsi leaq 64(%rdi),%rdi subq $64,%rbx L$open_avx2_main_loop: cmpq $512,%rbx jb L$open_avx2_main_loop_done vmovdqa L$chacha20_consts(%rip),%ymm0 vmovdqa 0+64(%rbp),%ymm4 vmovdqa 0+96(%rbp),%ymm8 vmovdqa %ymm0,%ymm1 vmovdqa %ymm4,%ymm5 vmovdqa %ymm8,%ymm9 vmovdqa %ymm0,%ymm2 vmovdqa %ymm4,%ymm6 vmovdqa %ymm8,%ymm10 vmovdqa %ymm0,%ymm3 vmovdqa %ymm4,%ymm7 vmovdqa %ymm8,%ymm11 vmovdqa L$avx2_inc(%rip),%ymm12 vpaddd 0+160(%rbp),%ymm12,%ymm15 vpaddd %ymm15,%ymm12,%ymm14 vpaddd %ymm14,%ymm12,%ymm13 vpaddd %ymm13,%ymm12,%ymm12 vmovdqa %ymm15,0+256(%rbp) vmovdqa %ymm14,0+224(%rbp) vmovdqa %ymm13,0+192(%rbp) vmovdqa %ymm12,0+160(%rbp) xorq %rcx,%rcx L$open_avx2_main_loop_rounds: addq 0+0(%rsi,%rcx,1),%r10 adcq 8+0(%rsi,%rcx,1),%r11 adcq $1,%r12 vmovdqa %ymm8,0+128(%rbp) vmovdqa L$rol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $20,%ymm6,%ymm8 vpslld $32-20,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $20,%ymm5,%ymm8 vpslld $32-20,%ymm5,%ymm5 addq %rax,%r15 adcq %rdx,%r9 vpxor %ymm8,%ymm5,%ymm5 vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa L$rol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 addq 0+16(%rsi,%rcx,1),%r10 adcq 8+16(%rsi,%rcx,1),%r11 adcq $1,%r12 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $25,%ymm7,%ymm8 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 vpslld $32-25,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $25,%ymm6,%ymm8 vpslld $32-25,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $25,%ymm5,%ymm8 vpslld $32-25,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa 0+128(%rbp),%ymm8 vpalignr $4,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $12,%ymm15,%ymm15,%ymm15 vpalignr $4,%ymm6,%ymm6,%ymm6 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $12,%ymm14,%ymm14,%ymm14 movq 
8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx vpalignr $4,%ymm5,%ymm5,%ymm5 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $12,%ymm13,%ymm13,%ymm13 vpalignr $4,%ymm4,%ymm4,%ymm4 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm12,%ymm12,%ymm12 vmovdqa %ymm8,0+128(%rbp) vmovdqa L$rol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 addq %rax,%r15 adcq %rdx,%r9 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $20,%ymm6,%ymm8 vpslld $32-20,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 addq 0+32(%rsi,%rcx,1),%r10 adcq 8+32(%rsi,%rcx,1),%r11 adcq $1,%r12 leaq 48(%rcx),%rcx vpsrld $20,%ymm5,%ymm8 vpslld $32-20,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa L$rol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $25,%ymm7,%ymm8 vpslld $32-25,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $25,%ymm6,%ymm8 vpslld $32-25,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 addq %rax,%r15 adcq %rdx,%r9 vpsrld $25,%ymm5,%ymm8 vpslld $32-25,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa 0+128(%rbp),%ymm8 vpalignr $12,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $4,%ymm15,%ymm15,%ymm15 vpalignr $12,%ymm6,%ymm6,%ymm6 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $4,%ymm14,%ymm14,%ymm14 vpalignr $12,%ymm5,%ymm5,%ymm5 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $4,%ymm13,%ymm13,%ymm13 vpalignr $12,%ymm4,%ymm4,%ymm4 vpalignr $8,%ymm8,%ymm8,%ymm8 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 vpalignr $4,%ymm12,%ymm12,%ymm12 cmpq $60*8,%rcx jne L$open_avx2_main_loop_rounds vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3 vpaddd 0+64(%rbp),%ymm7,%ymm7 vpaddd 0+96(%rbp),%ymm11,%ymm11 vpaddd 0+256(%rbp),%ymm15,%ymm15 vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 vpaddd 0+64(%rbp),%ymm6,%ymm6 vpaddd 
0+96(%rbp),%ymm10,%ymm10 vpaddd 0+224(%rbp),%ymm14,%ymm14 vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 vpaddd 0+64(%rbp),%ymm5,%ymm5 vpaddd 0+96(%rbp),%ymm9,%ymm9 vpaddd 0+192(%rbp),%ymm13,%ymm13 vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 vpaddd 0+64(%rbp),%ymm4,%ymm4 vpaddd 0+96(%rbp),%ymm8,%ymm8 vpaddd 0+160(%rbp),%ymm12,%ymm12 vmovdqa %ymm0,0+128(%rbp) addq 0+60*8(%rsi),%r10 adcq 8+60*8(%rsi),%r11 adcq $1,%r12 vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 vpxor 0+0(%rsi),%ymm0,%ymm0 vpxor 32+0(%rsi),%ymm3,%ymm3 vpxor 64+0(%rsi),%ymm7,%ymm7 vpxor 96+0(%rsi),%ymm11,%ymm11 vmovdqu %ymm0,0+0(%rdi) vmovdqu %ymm3,32+0(%rdi) vmovdqu %ymm7,64+0(%rdi) vmovdqu %ymm11,96+0(%rdi) vmovdqa 0+128(%rbp),%ymm0 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 vpxor 0+128(%rsi),%ymm3,%ymm3 vpxor 32+128(%rsi),%ymm2,%ymm2 vpxor 64+128(%rsi),%ymm6,%ymm6 vpxor 96+128(%rsi),%ymm10,%ymm10 vmovdqu %ymm3,0+128(%rdi) vmovdqu %ymm2,32+128(%rdi) vmovdqu %ymm6,64+128(%rdi) vmovdqu %ymm10,96+128(%rdi) addq 0+60*8+16(%rsi),%r10 adcq 8+60*8+16(%rsi),%r11 adcq $1,%r12 vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 vpxor 0+256(%rsi),%ymm3,%ymm3 vpxor 32+256(%rsi),%ymm1,%ymm1 vpxor 64+256(%rsi),%ymm5,%ymm5 vpxor 96+256(%rsi),%ymm9,%ymm9 vmovdqu %ymm3,0+256(%rdi) vmovdqu %ymm1,32+256(%rdi) vmovdqu %ymm5,64+256(%rdi) vmovdqu %ymm9,96+256(%rdi) movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 vperm2i128 $0x13,%ymm0,%ymm4,%ymm4 vperm2i128 $0x02,%ymm8,%ymm12,%ymm0 vperm2i128 $0x13,%ymm8,%ymm12,%ymm8 vpxor 0+384(%rsi),%ymm3,%ymm3 vpxor 32+384(%rsi),%ymm0,%ymm0 vpxor 64+384(%rsi),%ymm4,%ymm4 vpxor 96+384(%rsi),%ymm8,%ymm8 vmovdqu %ymm3,0+384(%rdi) vmovdqu %ymm0,32+384(%rdi) vmovdqu %ymm4,64+384(%rdi) vmovdqu %ymm8,96+384(%rdi) leaq 512(%rsi),%rsi leaq 512(%rdi),%rdi subq $512,%rbx jmp L$open_avx2_main_loop L$open_avx2_main_loop_done: testq %rbx,%rbx vzeroupper je L$open_sse_finalize cmpq $384,%rbx ja L$open_avx2_tail_512 cmpq $256,%rbx ja L$open_avx2_tail_384 cmpq $128,%rbx ja L$open_avx2_tail_256 vmovdqa L$chacha20_consts(%rip),%ymm0 vmovdqa 0+64(%rbp),%ymm4 vmovdqa 0+96(%rbp),%ymm8 vmovdqa L$avx2_inc(%rip),%ymm12 vpaddd 0+160(%rbp),%ymm12,%ymm12 vmovdqa %ymm12,0+160(%rbp) xorq 
%r8,%r8 movq %rbx,%rcx andq $-16,%rcx testq %rcx,%rcx je L$open_avx2_tail_128_rounds L$open_avx2_tail_128_rounds_and_x1hash: addq 0+0(%rsi,%r8,1),%r10 adcq 8+0(%rsi,%r8,1),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 L$open_avx2_tail_128_rounds: addq $16,%r8 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $12,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm4,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $4,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm4,%ymm4,%ymm4 cmpq %rcx,%r8 jb L$open_avx2_tail_128_rounds_and_x1hash cmpq $160,%r8 jne L$open_avx2_tail_128_rounds vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 vpaddd 0+64(%rbp),%ymm4,%ymm4 vpaddd 0+96(%rbp),%ymm8,%ymm8 vpaddd 0+160(%rbp),%ymm12,%ymm12 vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 vmovdqa %ymm3,%ymm8 jmp L$open_avx2_tail_128_xor L$open_avx2_tail_256: vmovdqa L$chacha20_consts(%rip),%ymm0 vmovdqa 0+64(%rbp),%ymm4 vmovdqa 0+96(%rbp),%ymm8 vmovdqa %ymm0,%ymm1 vmovdqa %ymm4,%ymm5 vmovdqa %ymm8,%ymm9 vmovdqa L$avx2_inc(%rip),%ymm12 vpaddd 0+160(%rbp),%ymm12,%ymm13 vpaddd %ymm13,%ymm12,%ymm12 vmovdqa %ymm12,0+160(%rbp) vmovdqa %ymm13,0+192(%rbp) movq %rbx,0+128(%rbp) movq %rbx,%rcx subq $128,%rcx shrq $4,%rcx movq $10,%r8 cmpq $10,%rcx cmovgq %r8,%rcx movq %rsi,%rbx xorq %r8,%r8 L$open_avx2_tail_256_rounds_and_x1hash: addq 0+0(%rbx),%r10 adcq 8+0(%rbx),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx addq %rax,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rbx),%rbx L$open_avx2_tail_256_rounds: vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 
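/* Standard ChaCha20 quarter-round pattern on the ymm state rows (a=%ymm0.., b=%ymm4..,
   c=%ymm8.., d=%ymm12..): a += b; d ^= a; d <<<= 16 via the L$rol16 byte shuffle;
   c += d; b ^= c; b <<<= 12 via vpsrld $20 / vpslld $12 / vpxor; then the same
   pattern with rotates of 8 (L$rol8) and 7.  The vpalignr instructions rotate the
   rows between the column and diagonal halves of each double round. */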
vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $12,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb L$rol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 vpslld $12,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb L$rol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 vpsrld $25,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpalignr $12,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $4,%ymm5,%ymm5,%ymm5 incq %r8 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $4,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb L$rol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 vpslld $12,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb L$rol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 vpsrld $25,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpalignr $4,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $12,%ymm5,%ymm5,%ymm5 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb L$rol16(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpsrld $20,%ymm6,%ymm3 vpslld $12,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb L$rol8(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpslld $7,%ymm6,%ymm3 vpsrld $25,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpalignr $4,%ymm14,%ymm14,%ymm14 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $12,%ymm6,%ymm6,%ymm6 cmpq %rcx,%r8 jb L$open_avx2_tail_256_rounds_and_x1hash cmpq $10,%r8 jne L$open_avx2_tail_256_rounds movq %rbx,%r8 subq %rsi,%rbx movq %rbx,%rcx movq 0+128(%rbp),%rbx L$open_avx2_tail_256_hash: addq $16,%rcx cmpq %rbx,%rcx jg L$open_avx2_tail_256_done addq 0+0(%r8),%r10 adcq 8+0(%r8),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx addq %rax,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%r8),%r8 jmp L$open_avx2_tail_256_hash L$open_avx2_tail_256_done: vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 vpaddd 0+64(%rbp),%ymm5,%ymm5 vpaddd 0+96(%rbp),%ymm9,%ymm9 vpaddd 0+192(%rbp),%ymm13,%ymm13 vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 vpaddd 0+64(%rbp),%ymm4,%ymm4 vpaddd 0+96(%rbp),%ymm8,%ymm8 vpaddd 0+160(%rbp),%ymm12,%ymm12 vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 vperm2i128 
$0x13,%ymm9,%ymm13,%ymm9 vpxor 0+0(%rsi),%ymm3,%ymm3 vpxor 32+0(%rsi),%ymm1,%ymm1 vpxor 64+0(%rsi),%ymm5,%ymm5 vpxor 96+0(%rsi),%ymm9,%ymm9 vmovdqu %ymm3,0+0(%rdi) vmovdqu %ymm1,32+0(%rdi) vmovdqu %ymm5,64+0(%rdi) vmovdqu %ymm9,96+0(%rdi) vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 vmovdqa %ymm3,%ymm8 leaq 128(%rsi),%rsi leaq 128(%rdi),%rdi subq $128,%rbx jmp L$open_avx2_tail_128_xor L$open_avx2_tail_384: vmovdqa L$chacha20_consts(%rip),%ymm0 vmovdqa 0+64(%rbp),%ymm4 vmovdqa 0+96(%rbp),%ymm8 vmovdqa %ymm0,%ymm1 vmovdqa %ymm4,%ymm5 vmovdqa %ymm8,%ymm9 vmovdqa %ymm0,%ymm2 vmovdqa %ymm4,%ymm6 vmovdqa %ymm8,%ymm10 vmovdqa L$avx2_inc(%rip),%ymm12 vpaddd 0+160(%rbp),%ymm12,%ymm14 vpaddd %ymm14,%ymm12,%ymm13 vpaddd %ymm13,%ymm12,%ymm12 vmovdqa %ymm12,0+160(%rbp) vmovdqa %ymm13,0+192(%rbp) vmovdqa %ymm14,0+224(%rbp) movq %rbx,0+128(%rbp) movq %rbx,%rcx subq $256,%rcx shrq $4,%rcx addq $6,%rcx movq $10,%r8 cmpq $10,%rcx cmovgq %r8,%rcx movq %rsi,%rbx xorq %r8,%r8 L$open_avx2_tail_384_rounds_and_x2hash: addq 0+0(%rbx),%r10 adcq 8+0(%rbx),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx addq %rax,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rbx),%rbx L$open_avx2_tail_384_rounds_and_x1hash: vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb L$rol16(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpsrld $20,%ymm6,%ymm3 vpslld $12,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb L$rol8(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpslld $7,%ymm6,%ymm3 vpsrld $25,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpalignr $12,%ymm14,%ymm14,%ymm14 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $4,%ymm6,%ymm6,%ymm6 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb L$rol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 vpslld $12,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb L$rol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 vpsrld $25,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpalignr $12,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $4,%ymm5,%ymm5,%ymm5 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $12,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm4,%ymm4,%ymm4 addq 0+0(%rbx),%r10 adcq 8+0(%rbx),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 
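/* The Poly1305 update is interleaved with the ChaCha20 tail rounds to hide latency:
   the 16-byte ciphertext block at (%rbx) was added into %r10:%r11:%r12 above (the
   adcq $1 sets the 2^128 pad bit), and the surrounding mulq/imulq sequence multiplies
   by r and reduces mod 2^130-5.  The same computation also appears elsewhere in this
   file in a mulxq (BMI2) form. */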
addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rbx),%rbx incq %r8 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb L$rol16(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpsrld $20,%ymm6,%ymm3 vpslld $12,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb L$rol8(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpslld $7,%ymm6,%ymm3 vpsrld $25,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpalignr $4,%ymm14,%ymm14,%ymm14 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $12,%ymm6,%ymm6,%ymm6 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb L$rol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 vpslld $12,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb L$rol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 vpsrld $25,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpalignr $4,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $12,%ymm5,%ymm5,%ymm5 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $4,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm4,%ymm4,%ymm4 cmpq %rcx,%r8 jb L$open_avx2_tail_384_rounds_and_x2hash cmpq $10,%r8 jne L$open_avx2_tail_384_rounds_and_x1hash movq %rbx,%r8 subq %rsi,%rbx movq %rbx,%rcx movq 0+128(%rbp),%rbx L$open_avx2_384_tail_hash: addq $16,%rcx cmpq %rbx,%rcx jg L$open_avx2_384_tail_done addq 0+0(%r8),%r10 adcq 8+0(%r8),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx addq %rax,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%r8),%r8 jmp L$open_avx2_384_tail_hash L$open_avx2_384_tail_done: vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 vpaddd 0+64(%rbp),%ymm6,%ymm6 vpaddd 0+96(%rbp),%ymm10,%ymm10 vpaddd 0+224(%rbp),%ymm14,%ymm14 vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 vpaddd 0+64(%rbp),%ymm5,%ymm5 vpaddd 0+96(%rbp),%ymm9,%ymm9 vpaddd 0+192(%rbp),%ymm13,%ymm13 vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 vpaddd 0+64(%rbp),%ymm4,%ymm4 vpaddd 0+96(%rbp),%ymm8,%ymm8 vpaddd 0+160(%rbp),%ymm12,%ymm12 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 vpxor 0+0(%rsi),%ymm3,%ymm3 vpxor 32+0(%rsi),%ymm2,%ymm2 vpxor 64+0(%rsi),%ymm6,%ymm6 vpxor 96+0(%rsi),%ymm10,%ymm10 vmovdqu %ymm3,0+0(%rdi) vmovdqu %ymm2,32+0(%rdi) vmovdqu %ymm6,64+0(%rdi) vmovdqu %ymm10,96+0(%rdi) vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 vperm2i128 
$0x02,%ymm9,%ymm13,%ymm1 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 vpxor 0+128(%rsi),%ymm3,%ymm3 vpxor 32+128(%rsi),%ymm1,%ymm1 vpxor 64+128(%rsi),%ymm5,%ymm5 vpxor 96+128(%rsi),%ymm9,%ymm9 vmovdqu %ymm3,0+128(%rdi) vmovdqu %ymm1,32+128(%rdi) vmovdqu %ymm5,64+128(%rdi) vmovdqu %ymm9,96+128(%rdi) vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 vmovdqa %ymm3,%ymm8 leaq 256(%rsi),%rsi leaq 256(%rdi),%rdi subq $256,%rbx jmp L$open_avx2_tail_128_xor L$open_avx2_tail_512: vmovdqa L$chacha20_consts(%rip),%ymm0 vmovdqa 0+64(%rbp),%ymm4 vmovdqa 0+96(%rbp),%ymm8 vmovdqa %ymm0,%ymm1 vmovdqa %ymm4,%ymm5 vmovdqa %ymm8,%ymm9 vmovdqa %ymm0,%ymm2 vmovdqa %ymm4,%ymm6 vmovdqa %ymm8,%ymm10 vmovdqa %ymm0,%ymm3 vmovdqa %ymm4,%ymm7 vmovdqa %ymm8,%ymm11 vmovdqa L$avx2_inc(%rip),%ymm12 vpaddd 0+160(%rbp),%ymm12,%ymm15 vpaddd %ymm15,%ymm12,%ymm14 vpaddd %ymm14,%ymm12,%ymm13 vpaddd %ymm13,%ymm12,%ymm12 vmovdqa %ymm15,0+256(%rbp) vmovdqa %ymm14,0+224(%rbp) vmovdqa %ymm13,0+192(%rbp) vmovdqa %ymm12,0+160(%rbp) xorq %rcx,%rcx movq %rsi,%r8 L$open_avx2_tail_512_rounds_and_x2hash: addq 0+0(%r8),%r10 adcq 8+0(%r8),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%r8),%r8 L$open_avx2_tail_512_rounds_and_x1hash: vmovdqa %ymm8,0+128(%rbp) vmovdqa L$rol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $20,%ymm6,%ymm8 vpslld $32-20,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $20,%ymm5,%ymm8 vpslld $32-20,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa L$rol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 addq 0+0(%r8),%r10 adcq 8+0(%r8),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx addq %rax,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb 
%ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $25,%ymm7,%ymm8 vpslld $32-25,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $25,%ymm6,%ymm8 vpslld $32-25,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $25,%ymm5,%ymm8 vpslld $32-25,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa 0+128(%rbp),%ymm8 vpalignr $4,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $12,%ymm15,%ymm15,%ymm15 vpalignr $4,%ymm6,%ymm6,%ymm6 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $12,%ymm14,%ymm14,%ymm14 vpalignr $4,%ymm5,%ymm5,%ymm5 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $12,%ymm13,%ymm13,%ymm13 vpalignr $4,%ymm4,%ymm4,%ymm4 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm12,%ymm12,%ymm12 vmovdqa %ymm8,0+128(%rbp) vmovdqa L$rol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 addq 0+16(%r8),%r10 adcq 8+16(%r8),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx addq %rax,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 32(%r8),%r8 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $20,%ymm6,%ymm8 vpslld $32-20,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $20,%ymm5,%ymm8 vpslld $32-20,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa L$rol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $25,%ymm7,%ymm8 vpslld $32-25,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $25,%ymm6,%ymm8 vpslld $32-25,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $25,%ymm5,%ymm8 vpslld $32-25,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa 0+128(%rbp),%ymm8 vpalignr $12,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $4,%ymm15,%ymm15,%ymm15 vpalignr $12,%ymm6,%ymm6,%ymm6 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $4,%ymm14,%ymm14,%ymm14 vpalignr 
$12,%ymm5,%ymm5,%ymm5 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $4,%ymm13,%ymm13,%ymm13 vpalignr $12,%ymm4,%ymm4,%ymm4 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm12,%ymm12,%ymm12 incq %rcx cmpq $4,%rcx jl L$open_avx2_tail_512_rounds_and_x2hash cmpq $10,%rcx jne L$open_avx2_tail_512_rounds_and_x1hash movq %rbx,%rcx subq $384,%rcx andq $-16,%rcx L$open_avx2_tail_512_hash: testq %rcx,%rcx je L$open_avx2_tail_512_done addq 0+0(%r8),%r10 adcq 8+0(%r8),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx addq %rax,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%r8),%r8 subq $16,%rcx jmp L$open_avx2_tail_512_hash L$open_avx2_tail_512_done: vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3 vpaddd 0+64(%rbp),%ymm7,%ymm7 vpaddd 0+96(%rbp),%ymm11,%ymm11 vpaddd 0+256(%rbp),%ymm15,%ymm15 vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 vpaddd 0+64(%rbp),%ymm6,%ymm6 vpaddd 0+96(%rbp),%ymm10,%ymm10 vpaddd 0+224(%rbp),%ymm14,%ymm14 vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 vpaddd 0+64(%rbp),%ymm5,%ymm5 vpaddd 0+96(%rbp),%ymm9,%ymm9 vpaddd 0+192(%rbp),%ymm13,%ymm13 vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 vpaddd 0+64(%rbp),%ymm4,%ymm4 vpaddd 0+96(%rbp),%ymm8,%ymm8 vpaddd 0+160(%rbp),%ymm12,%ymm12 vmovdqa %ymm0,0+128(%rbp) vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 vpxor 0+0(%rsi),%ymm0,%ymm0 vpxor 32+0(%rsi),%ymm3,%ymm3 vpxor 64+0(%rsi),%ymm7,%ymm7 vpxor 96+0(%rsi),%ymm11,%ymm11 vmovdqu %ymm0,0+0(%rdi) vmovdqu %ymm3,32+0(%rdi) vmovdqu %ymm7,64+0(%rdi) vmovdqu %ymm11,96+0(%rdi) vmovdqa 0+128(%rbp),%ymm0 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 vpxor 0+128(%rsi),%ymm3,%ymm3 vpxor 32+128(%rsi),%ymm2,%ymm2 vpxor 64+128(%rsi),%ymm6,%ymm6 vpxor 96+128(%rsi),%ymm10,%ymm10 vmovdqu %ymm3,0+128(%rdi) vmovdqu %ymm2,32+128(%rdi) vmovdqu %ymm6,64+128(%rdi) vmovdqu %ymm10,96+128(%rdi) vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 vpxor 0+256(%rsi),%ymm3,%ymm3 vpxor 32+256(%rsi),%ymm1,%ymm1 vpxor 64+256(%rsi),%ymm5,%ymm5 vpxor 96+256(%rsi),%ymm9,%ymm9 vmovdqu %ymm3,0+256(%rdi) vmovdqu %ymm1,32+256(%rdi) vmovdqu %ymm5,64+256(%rdi) vmovdqu %ymm9,96+256(%rdi) vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 vmovdqa %ymm3,%ymm8 leaq 384(%rsi),%rsi leaq 384(%rdi),%rdi subq $384,%rbx L$open_avx2_tail_128_xor: cmpq $32,%rbx jb L$open_avx2_tail_32_xor subq $32,%rbx vpxor (%rsi),%ymm0,%ymm0 vmovdqu %ymm0,(%rdi) leaq 32(%rsi),%rsi leaq 32(%rdi),%rdi vmovdqa %ymm4,%ymm0 vmovdqa %ymm8,%ymm4 vmovdqa %ymm12,%ymm8 jmp L$open_avx2_tail_128_xor L$open_avx2_tail_32_xor: cmpq $16,%rbx vmovdqa %xmm0,%xmm1 jb L$open_avx2_exit subq $16,%rbx vpxor (%rsi),%xmm0,%xmm1 vmovdqu %xmm1,(%rdi) leaq 16(%rsi),%rsi leaq 16(%rdi),%rdi vperm2i128 $0x11,%ymm0,%ymm0,%ymm0 vmovdqa %xmm0,%xmm1 L$open_avx2_exit: vzeroupper jmp L$open_sse_tail_16 L$open_avx2_192: vmovdqa %ymm0,%ymm1 vmovdqa 
%ymm0,%ymm2 vmovdqa %ymm4,%ymm5 vmovdqa %ymm4,%ymm6 vmovdqa %ymm8,%ymm9 vmovdqa %ymm8,%ymm10 vpaddd L$avx2_inc(%rip),%ymm12,%ymm13 vmovdqa %ymm12,%ymm11 vmovdqa %ymm13,%ymm15 movq $10,%r10 L$open_avx2_192_rounds: vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $12,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb L$rol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 vpslld $12,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb L$rol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 vpsrld $25,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpalignr $12,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $4,%ymm5,%ymm5,%ymm5 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $4,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb L$rol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 vpslld $12,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb L$rol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 vpsrld $25,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpalignr $4,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $12,%ymm5,%ymm5,%ymm5 decq %r10 jne L$open_avx2_192_rounds vpaddd %ymm2,%ymm0,%ymm0 vpaddd %ymm2,%ymm1,%ymm1 vpaddd %ymm6,%ymm4,%ymm4 vpaddd %ymm6,%ymm5,%ymm5 vpaddd %ymm10,%ymm8,%ymm8 vpaddd %ymm10,%ymm9,%ymm9 vpaddd %ymm11,%ymm12,%ymm12 vpaddd %ymm15,%ymm13,%ymm13 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 vpand L$clamp(%rip),%ymm3,%ymm3 vmovdqa %ymm3,0+0(%rbp) vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 L$open_avx2_short: movq %r8,%r8 call poly_hash_ad_internal L$open_avx2_short_hash_and_xor_loop: cmpq $32,%rbx jb L$open_avx2_short_tail_32 subq $32,%rbx addq 0+0(%rsi),%r10 adcq 8+0(%rsi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 addq 
0+16(%rsi),%r10 adcq 8+16(%rsi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 vpxor (%rsi),%ymm0,%ymm0 vmovdqu %ymm0,(%rdi) leaq 32(%rsi),%rsi leaq 32(%rdi),%rdi vmovdqa %ymm4,%ymm0 vmovdqa %ymm8,%ymm4 vmovdqa %ymm12,%ymm8 vmovdqa %ymm1,%ymm12 vmovdqa %ymm5,%ymm1 vmovdqa %ymm9,%ymm5 vmovdqa %ymm13,%ymm9 vmovdqa %ymm2,%ymm13 vmovdqa %ymm6,%ymm2 jmp L$open_avx2_short_hash_and_xor_loop L$open_avx2_short_tail_32: cmpq $16,%rbx vmovdqa %xmm0,%xmm1 jb L$open_avx2_short_tail_32_exit subq $16,%rbx addq 0+0(%rsi),%r10 adcq 8+0(%rsi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 vpxor (%rsi),%xmm0,%xmm3 vmovdqu %xmm3,(%rdi) leaq 16(%rsi),%rsi leaq 16(%rdi),%rdi vextracti128 $1,%ymm0,%xmm1 L$open_avx2_short_tail_32_exit: vzeroupper jmp L$open_sse_tail_16 L$open_avx2_320: vmovdqa %ymm0,%ymm1 vmovdqa %ymm0,%ymm2 vmovdqa %ymm4,%ymm5 vmovdqa %ymm4,%ymm6 vmovdqa %ymm8,%ymm9 vmovdqa %ymm8,%ymm10 vpaddd L$avx2_inc(%rip),%ymm12,%ymm13 vpaddd L$avx2_inc(%rip),%ymm13,%ymm14 vmovdqa %ymm4,%ymm7 vmovdqa %ymm8,%ymm11 vmovdqa %ymm12,0+160(%rbp) vmovdqa %ymm13,0+192(%rbp) vmovdqa %ymm14,0+224(%rbp) movq $10,%r10 L$open_avx2_320_rounds: vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $12,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb L$rol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 vpslld $12,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb L$rol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 vpsrld $25,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpalignr $12,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $4,%ymm5,%ymm5,%ymm5 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb L$rol16(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpsrld $20,%ymm6,%ymm3 vpslld $12,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb L$rol8(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpslld $7,%ymm6,%ymm3 
vpsrld $25,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpalignr $12,%ymm14,%ymm14,%ymm14 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $4,%ymm6,%ymm6,%ymm6 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $4,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb L$rol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 vpslld $12,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb L$rol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 vpsrld $25,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpalignr $4,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $12,%ymm5,%ymm5,%ymm5 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb L$rol16(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpsrld $20,%ymm6,%ymm3 vpslld $12,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb L$rol8(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpslld $7,%ymm6,%ymm3 vpsrld $25,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpalignr $4,%ymm14,%ymm14,%ymm14 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $12,%ymm6,%ymm6,%ymm6 decq %r10 jne L$open_avx2_320_rounds vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 vpaddd %ymm7,%ymm4,%ymm4 vpaddd %ymm7,%ymm5,%ymm5 vpaddd %ymm7,%ymm6,%ymm6 vpaddd %ymm11,%ymm8,%ymm8 vpaddd %ymm11,%ymm9,%ymm9 vpaddd %ymm11,%ymm10,%ymm10 vpaddd 0+160(%rbp),%ymm12,%ymm12 vpaddd 0+192(%rbp),%ymm13,%ymm13 vpaddd 0+224(%rbp),%ymm14,%ymm14 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 vpand L$clamp(%rip),%ymm3,%ymm3 vmovdqa %ymm3,0+0(%rbp) vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 vperm2i128 $0x02,%ymm2,%ymm6,%ymm9 vperm2i128 $0x02,%ymm10,%ymm14,%ymm13 vperm2i128 $0x13,%ymm2,%ymm6,%ymm2 vperm2i128 $0x13,%ymm10,%ymm14,%ymm6 jmp L$open_avx2_short .globl _chacha20_poly1305_seal_avx2 .private_extern _chacha20_poly1305_seal_avx2 .p2align 6 _chacha20_poly1305_seal_avx2: _CET_ENDBR pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 pushq %r9 subq $288 + 0 + 32,%rsp leaq 32(%rsp),%rbp andq $-32,%rbp movq 56(%r9),%rbx addq %rdx,%rbx movq %r8,0+0+32(%rbp) movq %rbx,8+0+32(%rbp) movq %rdx,%rbx vzeroupper vmovdqa L$chacha20_consts(%rip),%ymm0 vbroadcasti128 0(%r9),%ymm4 vbroadcasti128 16(%r9),%ymm8 vbroadcasti128 32(%r9),%ymm12 vpaddd L$avx2_init(%rip),%ymm12,%ymm12 cmpq $192,%rbx jbe L$seal_avx2_192 cmpq $320,%rbx jbe L$seal_avx2_320 vmovdqa %ymm0,%ymm1 vmovdqa %ymm0,%ymm2 vmovdqa %ymm0,%ymm3 vmovdqa %ymm4,%ymm5 vmovdqa %ymm4,%ymm6 vmovdqa %ymm4,%ymm7 vmovdqa %ymm4,0+64(%rbp) vmovdqa %ymm8,%ymm9 vmovdqa %ymm8,%ymm10 vmovdqa %ymm8,%ymm11 vmovdqa %ymm8,0+96(%rbp) vmovdqa %ymm12,%ymm15 vpaddd L$avx2_inc(%rip),%ymm15,%ymm14 vpaddd L$avx2_inc(%rip),%ymm14,%ymm13 vpaddd L$avx2_inc(%rip),%ymm13,%ymm12 vmovdqa 
%ymm12,0+160(%rbp) vmovdqa %ymm13,0+192(%rbp) vmovdqa %ymm14,0+224(%rbp) vmovdqa %ymm15,0+256(%rbp) movq $10,%r10 L$seal_avx2_init_rounds: vmovdqa %ymm8,0+128(%rbp) vmovdqa L$rol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $20,%ymm6,%ymm8 vpslld $32-20,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $20,%ymm5,%ymm8 vpslld $32-20,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa L$rol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $25,%ymm7,%ymm8 vpslld $32-25,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $25,%ymm6,%ymm8 vpslld $32-25,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $25,%ymm5,%ymm8 vpslld $32-25,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa 0+128(%rbp),%ymm8 vpalignr $4,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $12,%ymm15,%ymm15,%ymm15 vpalignr $4,%ymm6,%ymm6,%ymm6 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $12,%ymm14,%ymm14,%ymm14 vpalignr $4,%ymm5,%ymm5,%ymm5 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $12,%ymm13,%ymm13,%ymm13 vpalignr $4,%ymm4,%ymm4,%ymm4 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm12,%ymm12,%ymm12 vmovdqa %ymm8,0+128(%rbp) vmovdqa L$rol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $20,%ymm6,%ymm8 vpslld $32-20,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $20,%ymm5,%ymm8 vpslld $32-20,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa L$rol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb 
%ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $25,%ymm7,%ymm8 vpslld $32-25,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $25,%ymm6,%ymm8 vpslld $32-25,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $25,%ymm5,%ymm8 vpslld $32-25,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa 0+128(%rbp),%ymm8 vpalignr $12,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $4,%ymm15,%ymm15,%ymm15 vpalignr $12,%ymm6,%ymm6,%ymm6 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $4,%ymm14,%ymm14,%ymm14 vpalignr $12,%ymm5,%ymm5,%ymm5 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $4,%ymm13,%ymm13,%ymm13 vpalignr $12,%ymm4,%ymm4,%ymm4 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm12,%ymm12,%ymm12 decq %r10 jnz L$seal_avx2_init_rounds vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3 vpaddd 0+64(%rbp),%ymm7,%ymm7 vpaddd 0+96(%rbp),%ymm11,%ymm11 vpaddd 0+256(%rbp),%ymm15,%ymm15 vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 vpaddd 0+64(%rbp),%ymm6,%ymm6 vpaddd 0+96(%rbp),%ymm10,%ymm10 vpaddd 0+224(%rbp),%ymm14,%ymm14 vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 vpaddd 0+64(%rbp),%ymm5,%ymm5 vpaddd 0+96(%rbp),%ymm9,%ymm9 vpaddd 0+192(%rbp),%ymm13,%ymm13 vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 vpaddd 0+64(%rbp),%ymm4,%ymm4 vpaddd 0+96(%rbp),%ymm8,%ymm8 vpaddd 0+160(%rbp),%ymm12,%ymm12 vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 vperm2i128 $0x02,%ymm3,%ymm7,%ymm15 vperm2i128 $0x13,%ymm3,%ymm7,%ymm3 vpand L$clamp(%rip),%ymm15,%ymm15 vmovdqa %ymm15,0+0(%rbp) movq %r8,%r8 call poly_hash_ad_internal vpxor 0(%rsi),%ymm3,%ymm3 vpxor 32(%rsi),%ymm11,%ymm11 vmovdqu %ymm3,0(%rdi) vmovdqu %ymm11,32(%rdi) vperm2i128 $0x02,%ymm2,%ymm6,%ymm15 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 vpxor 0+64(%rsi),%ymm15,%ymm15 vpxor 32+64(%rsi),%ymm2,%ymm2 vpxor 64+64(%rsi),%ymm6,%ymm6 vpxor 96+64(%rsi),%ymm10,%ymm10 vmovdqu %ymm15,0+64(%rdi) vmovdqu %ymm2,32+64(%rdi) vmovdqu %ymm6,64+64(%rdi) vmovdqu %ymm10,96+64(%rdi) vperm2i128 $0x02,%ymm1,%ymm5,%ymm15 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 vpxor 0+192(%rsi),%ymm15,%ymm15 vpxor 32+192(%rsi),%ymm1,%ymm1 vpxor 64+192(%rsi),%ymm5,%ymm5 vpxor 96+192(%rsi),%ymm9,%ymm9 vmovdqu %ymm15,0+192(%rdi) vmovdqu %ymm1,32+192(%rdi) vmovdqu %ymm5,64+192(%rdi) vmovdqu %ymm9,96+192(%rdi) vperm2i128 $0x13,%ymm0,%ymm4,%ymm15 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 vmovdqa %ymm15,%ymm8 leaq 320(%rsi),%rsi subq $320,%rbx movq $320,%rcx cmpq $128,%rbx jbe L$seal_avx2_short_hash_remainder vpxor 0(%rsi),%ymm0,%ymm0 vpxor 32(%rsi),%ymm4,%ymm4 vpxor 64(%rsi),%ymm8,%ymm8 vpxor 96(%rsi),%ymm12,%ymm12 vmovdqu %ymm0,320(%rdi) vmovdqu %ymm4,352(%rdi) vmovdqu %ymm8,384(%rdi) vmovdqu %ymm12,416(%rdi) leaq 128(%rsi),%rsi subq $128,%rbx movq $8,%rcx movq $2,%r8 cmpq $128,%rbx jbe L$seal_avx2_tail_128 cmpq $256,%rbx jbe L$seal_avx2_tail_256 cmpq $384,%rbx jbe L$seal_avx2_tail_384 cmpq $512,%rbx jbe L$seal_avx2_tail_512 vmovdqa L$chacha20_consts(%rip),%ymm0 vmovdqa 0+64(%rbp),%ymm4 vmovdqa 0+96(%rbp),%ymm8 vmovdqa %ymm0,%ymm1 vmovdqa %ymm4,%ymm5 vmovdqa %ymm8,%ymm9 vmovdqa %ymm0,%ymm2 vmovdqa %ymm4,%ymm6 vmovdqa %ymm8,%ymm10 vmovdqa %ymm0,%ymm3 vmovdqa %ymm4,%ymm7 vmovdqa 
%ymm8,%ymm11 vmovdqa L$avx2_inc(%rip),%ymm12 vpaddd 0+160(%rbp),%ymm12,%ymm15 vpaddd %ymm15,%ymm12,%ymm14 vpaddd %ymm14,%ymm12,%ymm13 vpaddd %ymm13,%ymm12,%ymm12 vmovdqa %ymm15,0+256(%rbp) vmovdqa %ymm14,0+224(%rbp) vmovdqa %ymm13,0+192(%rbp) vmovdqa %ymm12,0+160(%rbp) vmovdqa %ymm8,0+128(%rbp) vmovdqa L$rol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $20,%ymm6,%ymm8 vpslld $32-20,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $20,%ymm5,%ymm8 vpslld $32-20,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa L$rol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $25,%ymm7,%ymm8 vpslld $32-25,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $25,%ymm6,%ymm8 vpslld $32-25,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $25,%ymm5,%ymm8 vpslld $32-25,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa 0+128(%rbp),%ymm8 vpalignr $4,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $12,%ymm15,%ymm15,%ymm15 vpalignr $4,%ymm6,%ymm6,%ymm6 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $12,%ymm14,%ymm14,%ymm14 vpalignr $4,%ymm5,%ymm5,%ymm5 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $12,%ymm13,%ymm13,%ymm13 vpalignr $4,%ymm4,%ymm4,%ymm4 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm12,%ymm12,%ymm12 vmovdqa %ymm8,0+128(%rbp) vmovdqa L$rol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $20,%ymm6,%ymm8 vpslld $32-20,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $20,%ymm5,%ymm8 vpslld $32-20,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa L$rol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor 
%ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $25,%ymm7,%ymm8 vpslld $32-25,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $25,%ymm6,%ymm8 vpslld $32-25,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $25,%ymm5,%ymm8 vpslld $32-25,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa 0+128(%rbp),%ymm8 vpalignr $12,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $4,%ymm15,%ymm15,%ymm15 vpalignr $12,%ymm6,%ymm6,%ymm6 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $4,%ymm14,%ymm14,%ymm14 vpalignr $12,%ymm5,%ymm5,%ymm5 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $4,%ymm13,%ymm13,%ymm13 vpalignr $12,%ymm4,%ymm4,%ymm4 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm12,%ymm12,%ymm12 vmovdqa %ymm8,0+128(%rbp) vmovdqa L$rol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $20,%ymm6,%ymm8 vpslld $32-20,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $20,%ymm5,%ymm8 vpslld $32-20,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa L$rol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 subq $16,%rdi movq $9,%rcx jmp L$seal_avx2_main_loop_rounds_entry .p2align 5 L$seal_avx2_main_loop: vmovdqa L$chacha20_consts(%rip),%ymm0 vmovdqa 0+64(%rbp),%ymm4 vmovdqa 0+96(%rbp),%ymm8 vmovdqa %ymm0,%ymm1 vmovdqa %ymm4,%ymm5 vmovdqa %ymm8,%ymm9 vmovdqa %ymm0,%ymm2 vmovdqa %ymm4,%ymm6 vmovdqa %ymm8,%ymm10 vmovdqa %ymm0,%ymm3 vmovdqa %ymm4,%ymm7 vmovdqa %ymm8,%ymm11 vmovdqa L$avx2_inc(%rip),%ymm12 vpaddd 0+160(%rbp),%ymm12,%ymm15 vpaddd %ymm15,%ymm12,%ymm14 vpaddd %ymm14,%ymm12,%ymm13 vpaddd %ymm13,%ymm12,%ymm12 vmovdqa %ymm15,0+256(%rbp) vmovdqa %ymm14,0+224(%rbp) vmovdqa %ymm13,0+192(%rbp) vmovdqa %ymm12,0+160(%rbp) movq $10,%rcx .p2align 5 L$seal_avx2_main_loop_rounds: addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 vmovdqa %ymm8,0+128(%rbp) vmovdqa L$rol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 movq 8+0+0(%rbp),%rdx mulxq 
%r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $20,%ymm6,%ymm8 vpslld $32-20,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $20,%ymm5,%ymm8 vpslld $32-20,%ymm5,%ymm5 addq %rax,%r15 adcq %rdx,%r9 vpxor %ymm8,%ymm5,%ymm5 vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa L$rol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 L$seal_avx2_main_loop_rounds_entry: vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 addq 0+16(%rdi),%r10 adcq 8+16(%rdi),%r11 adcq $1,%r12 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $25,%ymm7,%ymm8 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 vpslld $32-25,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $25,%ymm6,%ymm8 vpslld $32-25,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $25,%ymm5,%ymm8 vpslld $32-25,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa 0+128(%rbp),%ymm8 vpalignr $4,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $12,%ymm15,%ymm15,%ymm15 vpalignr $4,%ymm6,%ymm6,%ymm6 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $12,%ymm14,%ymm14,%ymm14 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx vpalignr $4,%ymm5,%ymm5,%ymm5 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $12,%ymm13,%ymm13,%ymm13 vpalignr $4,%ymm4,%ymm4,%ymm4 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm12,%ymm12,%ymm12 vmovdqa %ymm8,0+128(%rbp) vmovdqa L$rol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 addq %rax,%r15 adcq %rdx,%r9 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $20,%ymm6,%ymm8 vpslld $32-20,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 addq 0+32(%rdi),%r10 adcq 8+32(%rdi),%r11 adcq $1,%r12 leaq 48(%rdi),%rdi vpsrld $20,%ymm5,%ymm8 vpslld $32-20,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa 
L$rol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $25,%ymm7,%ymm8 vpslld $32-25,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $25,%ymm6,%ymm8 vpslld $32-25,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 addq %rax,%r15 adcq %rdx,%r9 vpsrld $25,%ymm5,%ymm8 vpslld $32-25,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa 0+128(%rbp),%ymm8 vpalignr $12,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $4,%ymm15,%ymm15,%ymm15 vpalignr $12,%ymm6,%ymm6,%ymm6 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $4,%ymm14,%ymm14,%ymm14 vpalignr $12,%ymm5,%ymm5,%ymm5 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $4,%ymm13,%ymm13,%ymm13 vpalignr $12,%ymm4,%ymm4,%ymm4 vpalignr $8,%ymm8,%ymm8,%ymm8 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 vpalignr $4,%ymm12,%ymm12,%ymm12 decq %rcx jne L$seal_avx2_main_loop_rounds vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3 vpaddd 0+64(%rbp),%ymm7,%ymm7 vpaddd 0+96(%rbp),%ymm11,%ymm11 vpaddd 0+256(%rbp),%ymm15,%ymm15 vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 vpaddd 0+64(%rbp),%ymm6,%ymm6 vpaddd 0+96(%rbp),%ymm10,%ymm10 vpaddd 0+224(%rbp),%ymm14,%ymm14 vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 vpaddd 0+64(%rbp),%ymm5,%ymm5 vpaddd 0+96(%rbp),%ymm9,%ymm9 vpaddd 0+192(%rbp),%ymm13,%ymm13 vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 vpaddd 0+64(%rbp),%ymm4,%ymm4 vpaddd 0+96(%rbp),%ymm8,%ymm8 vpaddd 0+160(%rbp),%ymm12,%ymm12 vmovdqa %ymm0,0+128(%rbp) addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx addq %rax,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 addq 0+16(%rdi),%r10 adcq 8+16(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx addq %rax,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 32(%rdi),%rdi vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 
vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 vpxor 0+0(%rsi),%ymm0,%ymm0 vpxor 32+0(%rsi),%ymm3,%ymm3 vpxor 64+0(%rsi),%ymm7,%ymm7 vpxor 96+0(%rsi),%ymm11,%ymm11 vmovdqu %ymm0,0+0(%rdi) vmovdqu %ymm3,32+0(%rdi) vmovdqu %ymm7,64+0(%rdi) vmovdqu %ymm11,96+0(%rdi) vmovdqa 0+128(%rbp),%ymm0 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 vpxor 0+128(%rsi),%ymm3,%ymm3 vpxor 32+128(%rsi),%ymm2,%ymm2 vpxor 64+128(%rsi),%ymm6,%ymm6 vpxor 96+128(%rsi),%ymm10,%ymm10 vmovdqu %ymm3,0+128(%rdi) vmovdqu %ymm2,32+128(%rdi) vmovdqu %ymm6,64+128(%rdi) vmovdqu %ymm10,96+128(%rdi) vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 vpxor 0+256(%rsi),%ymm3,%ymm3 vpxor 32+256(%rsi),%ymm1,%ymm1 vpxor 64+256(%rsi),%ymm5,%ymm5 vpxor 96+256(%rsi),%ymm9,%ymm9 vmovdqu %ymm3,0+256(%rdi) vmovdqu %ymm1,32+256(%rdi) vmovdqu %ymm5,64+256(%rdi) vmovdqu %ymm9,96+256(%rdi) vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 vperm2i128 $0x13,%ymm0,%ymm4,%ymm4 vperm2i128 $0x02,%ymm8,%ymm12,%ymm0 vperm2i128 $0x13,%ymm8,%ymm12,%ymm8 vpxor 0+384(%rsi),%ymm3,%ymm3 vpxor 32+384(%rsi),%ymm0,%ymm0 vpxor 64+384(%rsi),%ymm4,%ymm4 vpxor 96+384(%rsi),%ymm8,%ymm8 vmovdqu %ymm3,0+384(%rdi) vmovdqu %ymm0,32+384(%rdi) vmovdqu %ymm4,64+384(%rdi) vmovdqu %ymm8,96+384(%rdi) leaq 512(%rsi),%rsi subq $512,%rbx cmpq $512,%rbx jg L$seal_avx2_main_loop addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx addq %rax,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 addq 0+16(%rdi),%r10 adcq 8+16(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx addq %rax,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 32(%rdi),%rdi movq $10,%rcx xorq %r8,%r8 cmpq $384,%rbx ja L$seal_avx2_tail_512 cmpq $256,%rbx ja L$seal_avx2_tail_384 cmpq $128,%rbx ja L$seal_avx2_tail_256 L$seal_avx2_tail_128: vmovdqa L$chacha20_consts(%rip),%ymm0 vmovdqa 0+64(%rbp),%ymm4 vmovdqa 0+96(%rbp),%ymm8 vmovdqa L$avx2_inc(%rip),%ymm12 vpaddd 0+160(%rbp),%ymm12,%ymm12 vmovdqa %ymm12,0+160(%rbp) L$seal_avx2_tail_128_rounds_and_3xhash: addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx addq %rax,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rdi),%rdi L$seal_avx2_tail_128_rounds_and_2xhash: vpaddd 
%ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $12,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm4,%ymm4,%ymm4 addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx addq %rax,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $4,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm4,%ymm4,%ymm4 addq 0+16(%rdi),%r10 adcq 8+16(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx addq %rax,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 32(%rdi),%rdi decq %rcx jg L$seal_avx2_tail_128_rounds_and_3xhash decq %r8 jge L$seal_avx2_tail_128_rounds_and_2xhash vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 vpaddd 0+64(%rbp),%ymm4,%ymm4 vpaddd 0+96(%rbp),%ymm8,%ymm8 vpaddd 0+160(%rbp),%ymm12,%ymm12 vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 vmovdqa %ymm3,%ymm8 jmp L$seal_avx2_short_loop L$seal_avx2_tail_256: vmovdqa L$chacha20_consts(%rip),%ymm0 vmovdqa 0+64(%rbp),%ymm4 vmovdqa 0+96(%rbp),%ymm8 vmovdqa %ymm0,%ymm1 vmovdqa %ymm4,%ymm5 vmovdqa %ymm8,%ymm9 vmovdqa L$avx2_inc(%rip),%ymm12 vpaddd 0+160(%rbp),%ymm12,%ymm13 vpaddd %ymm13,%ymm12,%ymm12 vmovdqa %ymm12,0+160(%rbp) vmovdqa %ymm13,0+192(%rbp) L$seal_avx2_tail_256_rounds_and_3xhash: addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rdi),%rdi L$seal_avx2_tail_256_rounds_and_2xhash: vpaddd 
%ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $12,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb L$rol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 vpslld $12,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb L$rol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 vpsrld $25,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpalignr $12,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $4,%ymm5,%ymm5,%ymm5 addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $4,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb L$rol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 vpslld $12,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb L$rol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 vpsrld $25,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpalignr $4,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $12,%ymm5,%ymm5,%ymm5 addq 0+16(%rdi),%r10 adcq 8+16(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 32(%rdi),%rdi decq %rcx jg L$seal_avx2_tail_256_rounds_and_3xhash decq %r8 jge L$seal_avx2_tail_256_rounds_and_2xhash vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 vpaddd 0+64(%rbp),%ymm5,%ymm5 vpaddd 0+96(%rbp),%ymm9,%ymm9 vpaddd 0+192(%rbp),%ymm13,%ymm13 vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 vpaddd 
0+64(%rbp),%ymm4,%ymm4 vpaddd 0+96(%rbp),%ymm8,%ymm8 vpaddd 0+160(%rbp),%ymm12,%ymm12 vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 vpxor 0+0(%rsi),%ymm3,%ymm3 vpxor 32+0(%rsi),%ymm1,%ymm1 vpxor 64+0(%rsi),%ymm5,%ymm5 vpxor 96+0(%rsi),%ymm9,%ymm9 vmovdqu %ymm3,0+0(%rdi) vmovdqu %ymm1,32+0(%rdi) vmovdqu %ymm5,64+0(%rdi) vmovdqu %ymm9,96+0(%rdi) vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 vmovdqa %ymm3,%ymm8 movq $128,%rcx leaq 128(%rsi),%rsi subq $128,%rbx jmp L$seal_avx2_short_hash_remainder L$seal_avx2_tail_384: vmovdqa L$chacha20_consts(%rip),%ymm0 vmovdqa 0+64(%rbp),%ymm4 vmovdqa 0+96(%rbp),%ymm8 vmovdqa %ymm0,%ymm1 vmovdqa %ymm4,%ymm5 vmovdqa %ymm8,%ymm9 vmovdqa %ymm0,%ymm2 vmovdqa %ymm4,%ymm6 vmovdqa %ymm8,%ymm10 vmovdqa L$avx2_inc(%rip),%ymm12 vpaddd 0+160(%rbp),%ymm12,%ymm14 vpaddd %ymm14,%ymm12,%ymm13 vpaddd %ymm13,%ymm12,%ymm12 vmovdqa %ymm12,0+160(%rbp) vmovdqa %ymm13,0+192(%rbp) vmovdqa %ymm14,0+224(%rbp) L$seal_avx2_tail_384_rounds_and_3xhash: addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rdi),%rdi L$seal_avx2_tail_384_rounds_and_2xhash: vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $12,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb L$rol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 vpslld $12,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb L$rol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 vpsrld $25,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpalignr $12,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $4,%ymm5,%ymm5,%ymm5 addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb L$rol16(%rip),%ymm14,%ymm14 vpaddd 
%ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpsrld $20,%ymm6,%ymm3 vpslld $12,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb L$rol8(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpslld $7,%ymm6,%ymm3 vpsrld $25,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpalignr $12,%ymm14,%ymm14,%ymm14 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $4,%ymm6,%ymm6,%ymm6 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $4,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm4,%ymm4,%ymm4 addq 0+16(%rdi),%r10 adcq 8+16(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb L$rol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 vpslld $12,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb L$rol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 vpsrld $25,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpalignr $4,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $12,%ymm5,%ymm5,%ymm5 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb L$rol16(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpsrld $20,%ymm6,%ymm3 vpslld $12,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb L$rol8(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpslld $7,%ymm6,%ymm3 vpsrld $25,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpalignr $4,%ymm14,%ymm14,%ymm14 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $12,%ymm6,%ymm6,%ymm6 leaq 32(%rdi),%rdi decq %rcx jg L$seal_avx2_tail_384_rounds_and_3xhash decq %r8 jge L$seal_avx2_tail_384_rounds_and_2xhash vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 vpaddd 0+64(%rbp),%ymm6,%ymm6 vpaddd 0+96(%rbp),%ymm10,%ymm10 vpaddd 0+224(%rbp),%ymm14,%ymm14 vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 vpaddd 0+64(%rbp),%ymm5,%ymm5 vpaddd 0+96(%rbp),%ymm9,%ymm9 vpaddd 0+192(%rbp),%ymm13,%ymm13 vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 vpaddd 0+64(%rbp),%ymm4,%ymm4 vpaddd 0+96(%rbp),%ymm8,%ymm8 vpaddd 0+160(%rbp),%ymm12,%ymm12 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 vpxor 0+0(%rsi),%ymm3,%ymm3 vpxor 32+0(%rsi),%ymm2,%ymm2 vpxor 64+0(%rsi),%ymm6,%ymm6 vpxor 96+0(%rsi),%ymm10,%ymm10 vmovdqu %ymm3,0+0(%rdi) vmovdqu %ymm2,32+0(%rdi) vmovdqu %ymm6,64+0(%rdi) vmovdqu %ymm10,96+0(%rdi) vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 
vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 vpxor 0+128(%rsi),%ymm3,%ymm3 vpxor 32+128(%rsi),%ymm1,%ymm1 vpxor 64+128(%rsi),%ymm5,%ymm5 vpxor 96+128(%rsi),%ymm9,%ymm9 vmovdqu %ymm3,0+128(%rdi) vmovdqu %ymm1,32+128(%rdi) vmovdqu %ymm5,64+128(%rdi) vmovdqu %ymm9,96+128(%rdi) vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 vmovdqa %ymm3,%ymm8 movq $256,%rcx leaq 256(%rsi),%rsi subq $256,%rbx jmp L$seal_avx2_short_hash_remainder L$seal_avx2_tail_512: vmovdqa L$chacha20_consts(%rip),%ymm0 vmovdqa 0+64(%rbp),%ymm4 vmovdqa 0+96(%rbp),%ymm8 vmovdqa %ymm0,%ymm1 vmovdqa %ymm4,%ymm5 vmovdqa %ymm8,%ymm9 vmovdqa %ymm0,%ymm2 vmovdqa %ymm4,%ymm6 vmovdqa %ymm8,%ymm10 vmovdqa %ymm0,%ymm3 vmovdqa %ymm4,%ymm7 vmovdqa %ymm8,%ymm11 vmovdqa L$avx2_inc(%rip),%ymm12 vpaddd 0+160(%rbp),%ymm12,%ymm15 vpaddd %ymm15,%ymm12,%ymm14 vpaddd %ymm14,%ymm12,%ymm13 vpaddd %ymm13,%ymm12,%ymm12 vmovdqa %ymm15,0+256(%rbp) vmovdqa %ymm14,0+224(%rbp) vmovdqa %ymm13,0+192(%rbp) vmovdqa %ymm12,0+160(%rbp) L$seal_avx2_tail_512_rounds_and_3xhash: addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx addq %rax,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rdi),%rdi L$seal_avx2_tail_512_rounds_and_2xhash: vmovdqa %ymm8,0+128(%rbp) vmovdqa L$rol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $20,%ymm6,%ymm8 vpslld $32-20,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $20,%ymm5,%ymm8 vpslld $32-20,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa L$rol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $25,%ymm7,%ymm8 vpslld $32-25,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx vpsrld 
$25,%ymm6,%ymm8 vpslld $32-25,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $25,%ymm5,%ymm8 vpslld $32-25,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa 0+128(%rbp),%ymm8 vpalignr $4,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $12,%ymm15,%ymm15,%ymm15 vpalignr $4,%ymm6,%ymm6,%ymm6 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $12,%ymm14,%ymm14,%ymm14 vpalignr $4,%ymm5,%ymm5,%ymm5 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $12,%ymm13,%ymm13,%ymm13 vpalignr $4,%ymm4,%ymm4,%ymm4 addq %rax,%r15 adcq %rdx,%r9 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm12,%ymm12,%ymm12 vmovdqa %ymm8,0+128(%rbp) vmovdqa L$rol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $20,%ymm6,%ymm8 vpslld $32-20,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $20,%ymm5,%ymm8 vpslld $32-20,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa L$rol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 addq 0+16(%rdi),%r10 adcq 8+16(%rdi),%r11 adcq $1,%r12 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 vmovdqa %ymm8,0+128(%rbp) vpsrld $25,%ymm7,%ymm8 movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 vpslld $32-25,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $25,%ymm6,%ymm8 vpslld $32-25,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $25,%ymm5,%ymm8 vpslld $32-25,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 vmovdqa 0+128(%rbp),%ymm8 vpalignr $12,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $4,%ymm15,%ymm15,%ymm15 vpalignr $12,%ymm6,%ymm6,%ymm6 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $4,%ymm14,%ymm14,%ymm14 vpalignr $12,%ymm5,%ymm5,%ymm5 vpalignr $8,%ymm9,%ymm9,%ymm9 movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx vpalignr $4,%ymm13,%ymm13,%ymm13 vpalignr $12,%ymm4,%ymm4,%ymm4 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm12,%ymm12,%ymm12 addq %rax,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 
adcq $0,%r12 leaq 32(%rdi),%rdi decq %rcx jg L$seal_avx2_tail_512_rounds_and_3xhash decq %r8 jge L$seal_avx2_tail_512_rounds_and_2xhash vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3 vpaddd 0+64(%rbp),%ymm7,%ymm7 vpaddd 0+96(%rbp),%ymm11,%ymm11 vpaddd 0+256(%rbp),%ymm15,%ymm15 vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 vpaddd 0+64(%rbp),%ymm6,%ymm6 vpaddd 0+96(%rbp),%ymm10,%ymm10 vpaddd 0+224(%rbp),%ymm14,%ymm14 vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 vpaddd 0+64(%rbp),%ymm5,%ymm5 vpaddd 0+96(%rbp),%ymm9,%ymm9 vpaddd 0+192(%rbp),%ymm13,%ymm13 vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 vpaddd 0+64(%rbp),%ymm4,%ymm4 vpaddd 0+96(%rbp),%ymm8,%ymm8 vpaddd 0+160(%rbp),%ymm12,%ymm12 vmovdqa %ymm0,0+128(%rbp) vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 vpxor 0+0(%rsi),%ymm0,%ymm0 vpxor 32+0(%rsi),%ymm3,%ymm3 vpxor 64+0(%rsi),%ymm7,%ymm7 vpxor 96+0(%rsi),%ymm11,%ymm11 vmovdqu %ymm0,0+0(%rdi) vmovdqu %ymm3,32+0(%rdi) vmovdqu %ymm7,64+0(%rdi) vmovdqu %ymm11,96+0(%rdi) vmovdqa 0+128(%rbp),%ymm0 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 vpxor 0+128(%rsi),%ymm3,%ymm3 vpxor 32+128(%rsi),%ymm2,%ymm2 vpxor 64+128(%rsi),%ymm6,%ymm6 vpxor 96+128(%rsi),%ymm10,%ymm10 vmovdqu %ymm3,0+128(%rdi) vmovdqu %ymm2,32+128(%rdi) vmovdqu %ymm6,64+128(%rdi) vmovdqu %ymm10,96+128(%rdi) vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 vpxor 0+256(%rsi),%ymm3,%ymm3 vpxor 32+256(%rsi),%ymm1,%ymm1 vpxor 64+256(%rsi),%ymm5,%ymm5 vpxor 96+256(%rsi),%ymm9,%ymm9 vmovdqu %ymm3,0+256(%rdi) vmovdqu %ymm1,32+256(%rdi) vmovdqu %ymm5,64+256(%rdi) vmovdqu %ymm9,96+256(%rdi) vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 vmovdqa %ymm3,%ymm8 movq $384,%rcx leaq 384(%rsi),%rsi subq $384,%rbx jmp L$seal_avx2_short_hash_remainder L$seal_avx2_320: vmovdqa %ymm0,%ymm1 vmovdqa %ymm0,%ymm2 vmovdqa %ymm4,%ymm5 vmovdqa %ymm4,%ymm6 vmovdqa %ymm8,%ymm9 vmovdqa %ymm8,%ymm10 vpaddd L$avx2_inc(%rip),%ymm12,%ymm13 vpaddd L$avx2_inc(%rip),%ymm13,%ymm14 vmovdqa %ymm4,%ymm7 vmovdqa %ymm8,%ymm11 vmovdqa %ymm12,0+160(%rbp) vmovdqa %ymm13,0+192(%rbp) vmovdqa %ymm14,0+224(%rbp) movq $10,%r10 L$seal_avx2_320_rounds: vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $12,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb L$rol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 vpslld $12,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb L$rol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 vpsrld $25,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpalignr $12,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $4,%ymm5,%ymm5,%ymm5 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb 
L$rol16(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpsrld $20,%ymm6,%ymm3 vpslld $12,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb L$rol8(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpslld $7,%ymm6,%ymm3 vpsrld $25,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpalignr $12,%ymm14,%ymm14,%ymm14 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $4,%ymm6,%ymm6,%ymm6 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $4,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb L$rol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 vpslld $12,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb L$rol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 vpsrld $25,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpalignr $4,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $12,%ymm5,%ymm5,%ymm5 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb L$rol16(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpsrld $20,%ymm6,%ymm3 vpslld $12,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 vpshufb L$rol8(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpslld $7,%ymm6,%ymm3 vpsrld $25,%ymm6,%ymm6 vpxor %ymm3,%ymm6,%ymm6 vpalignr $4,%ymm14,%ymm14,%ymm14 vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $12,%ymm6,%ymm6,%ymm6 decq %r10 jne L$seal_avx2_320_rounds vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 vpaddd %ymm7,%ymm4,%ymm4 vpaddd %ymm7,%ymm5,%ymm5 vpaddd %ymm7,%ymm6,%ymm6 vpaddd %ymm11,%ymm8,%ymm8 vpaddd %ymm11,%ymm9,%ymm9 vpaddd %ymm11,%ymm10,%ymm10 vpaddd 0+160(%rbp),%ymm12,%ymm12 vpaddd 0+192(%rbp),%ymm13,%ymm13 vpaddd 0+224(%rbp),%ymm14,%ymm14 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 vpand L$clamp(%rip),%ymm3,%ymm3 vmovdqa %ymm3,0+0(%rbp) vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 vperm2i128 $0x02,%ymm2,%ymm6,%ymm9 vperm2i128 $0x02,%ymm10,%ymm14,%ymm13 vperm2i128 $0x13,%ymm2,%ymm6,%ymm2 vperm2i128 $0x13,%ymm10,%ymm14,%ymm6 jmp L$seal_avx2_short L$seal_avx2_192: vmovdqa %ymm0,%ymm1 vmovdqa %ymm0,%ymm2 vmovdqa %ymm4,%ymm5 vmovdqa %ymm4,%ymm6 vmovdqa %ymm8,%ymm9 vmovdqa %ymm8,%ymm10 vpaddd L$avx2_inc(%rip),%ymm12,%ymm13 vmovdqa %ymm12,%ymm11 vmovdqa %ymm13,%ymm15 movq $10,%r10 L$seal_avx2_192_rounds: vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr 
$12,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb L$rol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 vpslld $12,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb L$rol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 vpsrld $25,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpalignr $12,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $4,%ymm5,%ymm5,%ymm5 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 vpslld $12,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 vpsrld $25,%ymm4,%ymm4 vpxor %ymm3,%ymm4,%ymm4 vpalignr $4,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb L$rol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 vpslld $12,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 vpshufb L$rol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 vpsrld $25,%ymm5,%ymm5 vpxor %ymm3,%ymm5,%ymm5 vpalignr $4,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $12,%ymm5,%ymm5,%ymm5 decq %r10 jne L$seal_avx2_192_rounds vpaddd %ymm2,%ymm0,%ymm0 vpaddd %ymm2,%ymm1,%ymm1 vpaddd %ymm6,%ymm4,%ymm4 vpaddd %ymm6,%ymm5,%ymm5 vpaddd %ymm10,%ymm8,%ymm8 vpaddd %ymm10,%ymm9,%ymm9 vpaddd %ymm11,%ymm12,%ymm12 vpaddd %ymm15,%ymm13,%ymm13 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 vpand L$clamp(%rip),%ymm3,%ymm3 vmovdqa %ymm3,0+0(%rbp) vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 L$seal_avx2_short: movq %r8,%r8 call poly_hash_ad_internal xorq %rcx,%rcx L$seal_avx2_short_hash_remainder: cmpq $16,%rcx jb L$seal_avx2_short_loop addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 subq $16,%rcx addq $16,%rdi jmp L$seal_avx2_short_hash_remainder L$seal_avx2_short_loop: cmpq $32,%rbx jb L$seal_avx2_short_tail subq $32,%rbx vpxor (%rsi),%ymm0,%ymm0 vmovdqu %ymm0,(%rdi) leaq 32(%rsi),%rsi addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq 
%r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 addq 0+16(%rdi),%r10 adcq 8+16(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 32(%rdi),%rdi vmovdqa %ymm4,%ymm0 vmovdqa %ymm8,%ymm4 vmovdqa %ymm12,%ymm8 vmovdqa %ymm1,%ymm12 vmovdqa %ymm5,%ymm1 vmovdqa %ymm9,%ymm5 vmovdqa %ymm13,%ymm9 vmovdqa %ymm2,%ymm13 vmovdqa %ymm6,%ymm2 jmp L$seal_avx2_short_loop L$seal_avx2_short_tail: cmpq $16,%rbx jb L$seal_avx2_exit subq $16,%rbx vpxor (%rsi),%xmm0,%xmm3 vmovdqu %xmm3,(%rdi) leaq 16(%rsi),%rsi addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 andq $3,%r12 movq %r15,%r13 andq $-4,%r13 movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 addq %r13,%r15 adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rdi),%rdi vextracti128 $1,%ymm0,%xmm0 L$seal_avx2_exit: vzeroupper jmp L$seal_sse_tail_16 #endif ring-0.17.14/pregenerated/chacha20_poly1305_x86_64-nasm.asm000064400000000000000000005620071046102023000210150ustar 00000000000000; This file is generated from a similarly-named Perl script in the BoringSSL ; source tree. Do not edit by hand. 
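;
; Editorial note, inferred from the labels and directives below: this is the
; win64/NASM counterpart of the AT&T-syntax listing above, covering the
; chacha20_poly1305_open_sse41 (decrypt) and chacha20_poly1305_seal_sse41
; (encrypt) entry points. The .rdata section that follows holds the ChaCha20
; "expand 32-byte k" constant, the rol8/rol16 byte-shuffle masks used for the
; quarter-round rotations, the SSE/AVX2 block-counter increments, the Poly1305
; clamp mask, and the byte masks used for partial final blocks.
; poly_hash_ad_internal absorbs the additional authenticated data into the
; Poly1305 accumulator before the bulk open/seal loops run.
;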
%ifidn __OUTPUT_FORMAT__, win64 default rel %define XMMWORD %define YMMWORD %define ZMMWORD %define _CET_ENDBR %include "ring_core_generated/prefix_symbols_nasm.inc" section .rdata rdata align=8 ALIGN 64 chacha20_poly1305_constants: $L$chacha20_consts: DB 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' DB 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' $L$rol8: DB 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 DB 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 $L$rol16: DB 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 DB 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 $L$avx2_init: DD 0,0,0,0 $L$sse_inc: DD 1,0,0,0 $L$avx2_inc: DD 2,0,0,0,2,0,0,0 $L$clamp: DQ 0x0FFFFFFC0FFFFFFF,0x0FFFFFFC0FFFFFFC DQ 0xFFFFFFFFFFFFFFFF,0xFFFFFFFFFFFFFFFF ALIGN 16 $L$and_masks: DB 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 DB 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 DB 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 DB 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 DB 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 DB 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00 DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00 DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00 DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00 DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00 DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00 DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff section .text code align=64 ALIGN 64 poly_hash_ad_internal: xor r10,r10 xor r11,r11 xor r12,r12 cmp r8,13 jne NEAR $L$hash_ad_loop $L$poly_fast_tls_ad: mov r10,QWORD[rcx] mov r11,QWORD[5+rcx] shr r11,24 mov r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 ret $L$hash_ad_loop: cmp r8,16 jb NEAR $L$hash_ad_tail add r10,QWORD[((0+0))+rcx] adc r11,QWORD[((8+0))+rcx] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 lea rcx,[16+rcx] sub r8,16 jmp NEAR $L$hash_ad_loop $L$hash_ad_tail: cmp r8,0 je NEAR $L$hash_ad_done xor r13,r13 xor r14,r14 xor r15,r15 add rcx,r8 
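; Tail of the AAD absorption (descriptive comment, inferred from the code
; below): the remaining 1-15 bytes are shifted into r13:r14 one byte at a
; time, zero-padding the block to 16 bytes, and then folded into the
; Poly1305 accumulator held in r10:r11:r12 as a single block.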
$L$hash_ad_tail_loop: shld r14,r13,8 shl r13,8 movzx r15,BYTE[((-1))+rcx] xor r13,r15 dec rcx dec r8 jne NEAR $L$hash_ad_tail_loop add r10,r13 adc r11,r14 adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 $L$hash_ad_done: ret global chacha20_poly1305_open_sse41 ALIGN 64 chacha20_poly1305_open_sse41: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_chacha20_poly1305_open_sse41: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD[40+rsp] mov r9,QWORD[48+rsp] _CET_ENDBR push rbp push rbx push r12 push r13 push r14 push r15 push r9 sub rsp,288 + 160 + 32 lea rbp,[32+rsp] and rbp,-32 movaps XMMWORD[(0+0)+rbp],xmm6 movaps XMMWORD[(16+0)+rbp],xmm7 movaps XMMWORD[(32+0)+rbp],xmm8 movaps XMMWORD[(48+0)+rbp],xmm9 movaps XMMWORD[(64+0)+rbp],xmm10 movaps XMMWORD[(80+0)+rbp],xmm11 movaps XMMWORD[(96+0)+rbp],xmm12 movaps XMMWORD[(112+0)+rbp],xmm13 movaps XMMWORD[(128+0)+rbp],xmm14 movaps XMMWORD[(144+0)+rbp],xmm15 mov rbx,rdx mov QWORD[((0+160+32))+rbp],r8 mov QWORD[((8+160+32))+rbp],rbx cmp rbx,128 jbe NEAR $L$open_sse_128 movdqa xmm0,XMMWORD[$L$chacha20_consts] movdqu xmm4,XMMWORD[r9] movdqu xmm8,XMMWORD[16+r9] movdqu xmm12,XMMWORD[32+r9] movdqa xmm7,xmm12 movdqa XMMWORD[(160+48)+rbp],xmm4 movdqa XMMWORD[(160+64)+rbp],xmm8 movdqa XMMWORD[(160+96)+rbp],xmm12 mov r10,10 $L$open_sse_init_rounds: paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol16] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,12 psrld xmm4,20 pxor xmm4,xmm3 paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol8] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,7 psrld xmm4,25 pxor xmm4,xmm3 DB 102,15,58,15,228,4 DB 102,69,15,58,15,192,8 DB 102,69,15,58,15,228,12 paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol16] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,12 psrld xmm4,20 pxor xmm4,xmm3 paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol8] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,7 psrld xmm4,25 pxor xmm4,xmm3 DB 102,15,58,15,228,12 DB 102,69,15,58,15,192,8 DB 102,69,15,58,15,228,4 dec r10 jne NEAR $L$open_sse_init_rounds paddd xmm0,XMMWORD[$L$chacha20_consts] paddd xmm4,XMMWORD[((160+48))+rbp] pand xmm0,XMMWORD[$L$clamp] movdqa XMMWORD[(160+0)+rbp],xmm0 movdqa XMMWORD[(160+16)+rbp],xmm4 mov r8,r8 call poly_hash_ad_internal $L$open_sse_main_loop: cmp rbx,16*16 jb NEAR $L$open_sse_tail movdqa xmm0,XMMWORD[$L$chacha20_consts] movdqa xmm4,XMMWORD[((160+48))+rbp] movdqa xmm8,XMMWORD[((160+64))+rbp] movdqa xmm1,xmm0 movdqa xmm5,xmm4 movdqa xmm9,xmm8 movdqa xmm2,xmm0 movdqa xmm6,xmm4 movdqa xmm10,xmm8 movdqa xmm3,xmm0 movdqa xmm7,xmm4 movdqa xmm11,xmm8 movdqa xmm15,XMMWORD[((160+96))+rbp] paddd xmm15,XMMWORD[$L$sse_inc] movdqa xmm14,xmm15 paddd xmm14,XMMWORD[$L$sse_inc] movdqa xmm13,xmm14 paddd xmm13,XMMWORD[$L$sse_inc] movdqa xmm12,xmm13 paddd xmm12,XMMWORD[$L$sse_inc] movdqa XMMWORD[(160+96)+rbp],xmm12 movdqa XMMWORD[(160+112)+rbp],xmm13 movdqa XMMWORD[(160+128)+rbp],xmm14 movdqa XMMWORD[(160+144)+rbp],xmm15 mov rcx,4 mov r8,rsi $L$open_sse_main_loop_rounds: movdqa 
XMMWORD[(160+80)+rbp],xmm8 movdqa xmm8,XMMWORD[$L$rol16] paddd xmm3,xmm7 paddd xmm2,xmm6 paddd xmm1,xmm5 paddd xmm0,xmm4 pxor xmm15,xmm3 pxor xmm14,xmm2 pxor xmm13,xmm1 pxor xmm12,xmm0 DB 102,69,15,56,0,248 DB 102,69,15,56,0,240 DB 102,69,15,56,0,232 DB 102,69,15,56,0,224 movdqa xmm8,XMMWORD[((160+80))+rbp] paddd xmm11,xmm15 paddd xmm10,xmm14 paddd xmm9,xmm13 paddd xmm8,xmm12 pxor xmm7,xmm11 add r10,QWORD[((0+0))+r8] adc r11,QWORD[((8+0))+r8] adc r12,1 lea r8,[16+r8] pxor xmm6,xmm10 pxor xmm5,xmm9 pxor xmm4,xmm8 movdqa XMMWORD[(160+80)+rbp],xmm8 movdqa xmm8,xmm7 psrld xmm8,20 pslld xmm7,32-20 pxor xmm7,xmm8 movdqa xmm8,xmm6 psrld xmm8,20 pslld xmm6,32-20 pxor xmm6,xmm8 movdqa xmm8,xmm5 psrld xmm8,20 pslld xmm5,32-20 pxor xmm5,xmm8 movdqa xmm8,xmm4 psrld xmm8,20 pslld xmm4,32-20 pxor xmm4,xmm8 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx movdqa xmm8,XMMWORD[$L$rol8] paddd xmm3,xmm7 paddd xmm2,xmm6 paddd xmm1,xmm5 paddd xmm0,xmm4 pxor xmm15,xmm3 pxor xmm14,xmm2 pxor xmm13,xmm1 pxor xmm12,xmm0 DB 102,69,15,56,0,248 DB 102,69,15,56,0,240 DB 102,69,15,56,0,232 DB 102,69,15,56,0,224 movdqa xmm8,XMMWORD[((160+80))+rbp] paddd xmm11,xmm15 paddd xmm10,xmm14 paddd xmm9,xmm13 paddd xmm8,xmm12 pxor xmm7,xmm11 pxor xmm6,xmm10 mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 pxor xmm5,xmm9 pxor xmm4,xmm8 movdqa XMMWORD[(160+80)+rbp],xmm8 movdqa xmm8,xmm7 psrld xmm8,25 pslld xmm7,32-25 pxor xmm7,xmm8 movdqa xmm8,xmm6 psrld xmm8,25 pslld xmm6,32-25 pxor xmm6,xmm8 movdqa xmm8,xmm5 psrld xmm8,25 pslld xmm5,32-25 pxor xmm5,xmm8 movdqa xmm8,xmm4 psrld xmm8,25 pslld xmm4,32-25 pxor xmm4,xmm8 movdqa xmm8,XMMWORD[((160+80))+rbp] imul r9,r12 add r15,r10 adc r9,rdx DB 102,15,58,15,255,4 DB 102,69,15,58,15,219,8 DB 102,69,15,58,15,255,12 DB 102,15,58,15,246,4 DB 102,69,15,58,15,210,8 DB 102,69,15,58,15,246,12 DB 102,15,58,15,237,4 DB 102,69,15,58,15,201,8 DB 102,69,15,58,15,237,12 DB 102,15,58,15,228,4 DB 102,69,15,58,15,192,8 DB 102,69,15,58,15,228,12 movdqa XMMWORD[(160+80)+rbp],xmm8 movdqa xmm8,XMMWORD[$L$rol16] paddd xmm3,xmm7 paddd xmm2,xmm6 paddd xmm1,xmm5 paddd xmm0,xmm4 pxor xmm15,xmm3 pxor xmm14,xmm2 mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 pxor xmm13,xmm1 pxor xmm12,xmm0 DB 102,69,15,56,0,248 DB 102,69,15,56,0,240 DB 102,69,15,56,0,232 DB 102,69,15,56,0,224 movdqa xmm8,XMMWORD[((160+80))+rbp] paddd xmm11,xmm15 paddd xmm10,xmm14 paddd xmm9,xmm13 paddd xmm8,xmm12 pxor xmm7,xmm11 pxor xmm6,xmm10 pxor xmm5,xmm9 pxor xmm4,xmm8 movdqa XMMWORD[(160+80)+rbp],xmm8 movdqa xmm8,xmm7 psrld xmm8,20 pslld xmm7,32-20 pxor xmm7,xmm8 movdqa xmm8,xmm6 psrld xmm8,20 pslld xmm6,32-20 pxor xmm6,xmm8 movdqa xmm8,xmm5 psrld xmm8,20 pslld xmm5,32-20 pxor xmm5,xmm8 movdqa xmm8,xmm4 psrld xmm8,20 pslld xmm4,32-20 pxor xmm4,xmm8 movdqa xmm8,XMMWORD[$L$rol8] paddd xmm3,xmm7 paddd xmm2,xmm6 paddd xmm1,xmm5 paddd xmm0,xmm4 pxor xmm15,xmm3 pxor xmm14,xmm2 pxor xmm13,xmm1 pxor xmm12,xmm0 DB 102,69,15,56,0,248 DB 102,69,15,56,0,240 DB 102,69,15,56,0,232 DB 102,69,15,56,0,224 movdqa xmm8,XMMWORD[((160+80))+rbp] paddd xmm11,xmm15 paddd xmm10,xmm14 paddd xmm9,xmm13 paddd xmm8,xmm12 pxor xmm7,xmm11 pxor xmm6,xmm10 pxor xmm5,xmm9 pxor xmm4,xmm8 movdqa XMMWORD[(160+80)+rbp],xmm8 movdqa xmm8,xmm7 psrld xmm8,25 pslld xmm7,32-25 pxor 
xmm7,xmm8 movdqa xmm8,xmm6 psrld xmm8,25 pslld xmm6,32-25 pxor xmm6,xmm8 movdqa xmm8,xmm5 psrld xmm8,25 pslld xmm5,32-25 pxor xmm5,xmm8 movdqa xmm8,xmm4 psrld xmm8,25 pslld xmm4,32-25 pxor xmm4,xmm8 movdqa xmm8,XMMWORD[((160+80))+rbp] DB 102,15,58,15,255,12 DB 102,69,15,58,15,219,8 DB 102,69,15,58,15,255,4 DB 102,15,58,15,246,12 DB 102,69,15,58,15,210,8 DB 102,69,15,58,15,246,4 DB 102,15,58,15,237,12 DB 102,69,15,58,15,201,8 DB 102,69,15,58,15,237,4 DB 102,15,58,15,228,12 DB 102,69,15,58,15,192,8 DB 102,69,15,58,15,228,4 dec rcx jge NEAR $L$open_sse_main_loop_rounds add r10,QWORD[((0+0))+r8] adc r11,QWORD[((8+0))+r8] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 lea r8,[16+r8] cmp rcx,-6 jg NEAR $L$open_sse_main_loop_rounds paddd xmm3,XMMWORD[$L$chacha20_consts] paddd xmm7,XMMWORD[((160+48))+rbp] paddd xmm11,XMMWORD[((160+64))+rbp] paddd xmm15,XMMWORD[((160+144))+rbp] paddd xmm2,XMMWORD[$L$chacha20_consts] paddd xmm6,XMMWORD[((160+48))+rbp] paddd xmm10,XMMWORD[((160+64))+rbp] paddd xmm14,XMMWORD[((160+128))+rbp] paddd xmm1,XMMWORD[$L$chacha20_consts] paddd xmm5,XMMWORD[((160+48))+rbp] paddd xmm9,XMMWORD[((160+64))+rbp] paddd xmm13,XMMWORD[((160+112))+rbp] paddd xmm0,XMMWORD[$L$chacha20_consts] paddd xmm4,XMMWORD[((160+48))+rbp] paddd xmm8,XMMWORD[((160+64))+rbp] paddd xmm12,XMMWORD[((160+96))+rbp] movdqa XMMWORD[(160+80)+rbp],xmm12 movdqu xmm12,XMMWORD[((0 + 0))+rsi] pxor xmm12,xmm3 movdqu XMMWORD[(0 + 0)+rdi],xmm12 movdqu xmm12,XMMWORD[((16 + 0))+rsi] pxor xmm12,xmm7 movdqu XMMWORD[(16 + 0)+rdi],xmm12 movdqu xmm12,XMMWORD[((32 + 0))+rsi] pxor xmm12,xmm11 movdqu XMMWORD[(32 + 0)+rdi],xmm12 movdqu xmm12,XMMWORD[((48 + 0))+rsi] pxor xmm12,xmm15 movdqu XMMWORD[(48 + 0)+rdi],xmm12 movdqu xmm3,XMMWORD[((0 + 64))+rsi] movdqu xmm7,XMMWORD[((16 + 64))+rsi] movdqu xmm11,XMMWORD[((32 + 64))+rsi] movdqu xmm15,XMMWORD[((48 + 64))+rsi] pxor xmm2,xmm3 pxor xmm6,xmm7 pxor xmm10,xmm11 pxor xmm15,xmm14 movdqu XMMWORD[(0 + 64)+rdi],xmm2 movdqu XMMWORD[(16 + 64)+rdi],xmm6 movdqu XMMWORD[(32 + 64)+rdi],xmm10 movdqu XMMWORD[(48 + 64)+rdi],xmm15 movdqu xmm3,XMMWORD[((0 + 128))+rsi] movdqu xmm7,XMMWORD[((16 + 128))+rsi] movdqu xmm11,XMMWORD[((32 + 128))+rsi] movdqu xmm15,XMMWORD[((48 + 128))+rsi] pxor xmm1,xmm3 pxor xmm5,xmm7 pxor xmm9,xmm11 pxor xmm15,xmm13 movdqu XMMWORD[(0 + 128)+rdi],xmm1 movdqu XMMWORD[(16 + 128)+rdi],xmm5 movdqu XMMWORD[(32 + 128)+rdi],xmm9 movdqu XMMWORD[(48 + 128)+rdi],xmm15 movdqu xmm3,XMMWORD[((0 + 192))+rsi] movdqu xmm7,XMMWORD[((16 + 192))+rsi] movdqu xmm11,XMMWORD[((32 + 192))+rsi] movdqu xmm15,XMMWORD[((48 + 192))+rsi] pxor xmm0,xmm3 pxor xmm4,xmm7 pxor xmm8,xmm11 pxor xmm15,XMMWORD[((160+80))+rbp] movdqu XMMWORD[(0 + 192)+rdi],xmm0 movdqu XMMWORD[(16 + 192)+rdi],xmm4 movdqu XMMWORD[(32 + 192)+rdi],xmm8 movdqu XMMWORD[(48 + 192)+rdi],xmm15 lea rsi,[256+rsi] lea rdi,[256+rdi] sub rbx,16*16 jmp NEAR $L$open_sse_main_loop $L$open_sse_tail: test rbx,rbx jz NEAR $L$open_sse_finalize cmp rbx,12*16 ja NEAR $L$open_sse_tail_256 cmp rbx,8*16 ja NEAR $L$open_sse_tail_192 cmp rbx,4*16 ja NEAR $L$open_sse_tail_128 movdqa 
xmm0,XMMWORD[$L$chacha20_consts] movdqa xmm4,XMMWORD[((160+48))+rbp] movdqa xmm8,XMMWORD[((160+64))+rbp] movdqa xmm12,XMMWORD[((160+96))+rbp] paddd xmm12,XMMWORD[$L$sse_inc] movdqa XMMWORD[(160+96)+rbp],xmm12 xor r8,r8 mov rcx,rbx cmp rcx,16 jb NEAR $L$open_sse_tail_64_rounds $L$open_sse_tail_64_rounds_and_x1hash: add r10,QWORD[((0+0))+r8*1+rsi] adc r11,QWORD[((8+0))+r8*1+rsi] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 sub rcx,16 $L$open_sse_tail_64_rounds: add r8,16 paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol16] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,12 psrld xmm4,20 pxor xmm4,xmm3 paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol8] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,7 psrld xmm4,25 pxor xmm4,xmm3 DB 102,15,58,15,228,4 DB 102,69,15,58,15,192,8 DB 102,69,15,58,15,228,12 paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol16] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,12 psrld xmm4,20 pxor xmm4,xmm3 paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol8] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,7 psrld xmm4,25 pxor xmm4,xmm3 DB 102,15,58,15,228,12 DB 102,69,15,58,15,192,8 DB 102,69,15,58,15,228,4 cmp rcx,16 jae NEAR $L$open_sse_tail_64_rounds_and_x1hash cmp r8,10*16 jne NEAR $L$open_sse_tail_64_rounds paddd xmm0,XMMWORD[$L$chacha20_consts] paddd xmm4,XMMWORD[((160+48))+rbp] paddd xmm8,XMMWORD[((160+64))+rbp] paddd xmm12,XMMWORD[((160+96))+rbp] jmp NEAR $L$open_sse_tail_64_dec_loop $L$open_sse_tail_128: movdqa xmm0,XMMWORD[$L$chacha20_consts] movdqa xmm4,XMMWORD[((160+48))+rbp] movdqa xmm8,XMMWORD[((160+64))+rbp] movdqa xmm1,xmm0 movdqa xmm5,xmm4 movdqa xmm9,xmm8 movdqa xmm13,XMMWORD[((160+96))+rbp] paddd xmm13,XMMWORD[$L$sse_inc] movdqa xmm12,xmm13 paddd xmm12,XMMWORD[$L$sse_inc] movdqa XMMWORD[(160+96)+rbp],xmm12 movdqa XMMWORD[(160+112)+rbp],xmm13 mov rcx,rbx and rcx,-16 xor r8,r8 $L$open_sse_tail_128_rounds_and_x1hash: add r10,QWORD[((0+0))+r8*1+rsi] adc r11,QWORD[((8+0))+r8*1+rsi] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 $L$open_sse_tail_128_rounds: add r8,16 paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol16] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,12 psrld xmm4,20 pxor xmm4,xmm3 paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol8] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,7 psrld xmm4,25 pxor xmm4,xmm3 DB 102,15,58,15,228,4 DB 102,69,15,58,15,192,8 DB 102,69,15,58,15,228,12 paddd xmm1,xmm5 pxor xmm13,xmm1 pshufb xmm13,XMMWORD[$L$rol16] paddd xmm9,xmm13 pxor xmm5,xmm9 movdqa xmm3,xmm5 pslld xmm3,12 psrld xmm5,20 
pxor xmm5,xmm3 paddd xmm1,xmm5 pxor xmm13,xmm1 pshufb xmm13,XMMWORD[$L$rol8] paddd xmm9,xmm13 pxor xmm5,xmm9 movdqa xmm3,xmm5 pslld xmm3,7 psrld xmm5,25 pxor xmm5,xmm3 DB 102,15,58,15,237,4 DB 102,69,15,58,15,201,8 DB 102,69,15,58,15,237,12 paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol16] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,12 psrld xmm4,20 pxor xmm4,xmm3 paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol8] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,7 psrld xmm4,25 pxor xmm4,xmm3 DB 102,15,58,15,228,12 DB 102,69,15,58,15,192,8 DB 102,69,15,58,15,228,4 paddd xmm1,xmm5 pxor xmm13,xmm1 pshufb xmm13,XMMWORD[$L$rol16] paddd xmm9,xmm13 pxor xmm5,xmm9 movdqa xmm3,xmm5 pslld xmm3,12 psrld xmm5,20 pxor xmm5,xmm3 paddd xmm1,xmm5 pxor xmm13,xmm1 pshufb xmm13,XMMWORD[$L$rol8] paddd xmm9,xmm13 pxor xmm5,xmm9 movdqa xmm3,xmm5 pslld xmm3,7 psrld xmm5,25 pxor xmm5,xmm3 DB 102,15,58,15,237,12 DB 102,69,15,58,15,201,8 DB 102,69,15,58,15,237,4 cmp r8,rcx jb NEAR $L$open_sse_tail_128_rounds_and_x1hash cmp r8,10*16 jne NEAR $L$open_sse_tail_128_rounds paddd xmm1,XMMWORD[$L$chacha20_consts] paddd xmm5,XMMWORD[((160+48))+rbp] paddd xmm9,XMMWORD[((160+64))+rbp] paddd xmm13,XMMWORD[((160+112))+rbp] paddd xmm0,XMMWORD[$L$chacha20_consts] paddd xmm4,XMMWORD[((160+48))+rbp] paddd xmm8,XMMWORD[((160+64))+rbp] paddd xmm12,XMMWORD[((160+96))+rbp] movdqu xmm3,XMMWORD[((0 + 0))+rsi] movdqu xmm7,XMMWORD[((16 + 0))+rsi] movdqu xmm11,XMMWORD[((32 + 0))+rsi] movdqu xmm15,XMMWORD[((48 + 0))+rsi] pxor xmm1,xmm3 pxor xmm5,xmm7 pxor xmm9,xmm11 pxor xmm15,xmm13 movdqu XMMWORD[(0 + 0)+rdi],xmm1 movdqu XMMWORD[(16 + 0)+rdi],xmm5 movdqu XMMWORD[(32 + 0)+rdi],xmm9 movdqu XMMWORD[(48 + 0)+rdi],xmm15 sub rbx,4*16 lea rsi,[64+rsi] lea rdi,[64+rdi] jmp NEAR $L$open_sse_tail_64_dec_loop $L$open_sse_tail_192: movdqa xmm0,XMMWORD[$L$chacha20_consts] movdqa xmm4,XMMWORD[((160+48))+rbp] movdqa xmm8,XMMWORD[((160+64))+rbp] movdqa xmm1,xmm0 movdqa xmm5,xmm4 movdqa xmm9,xmm8 movdqa xmm2,xmm0 movdqa xmm6,xmm4 movdqa xmm10,xmm8 movdqa xmm14,XMMWORD[((160+96))+rbp] paddd xmm14,XMMWORD[$L$sse_inc] movdqa xmm13,xmm14 paddd xmm13,XMMWORD[$L$sse_inc] movdqa xmm12,xmm13 paddd xmm12,XMMWORD[$L$sse_inc] movdqa XMMWORD[(160+96)+rbp],xmm12 movdqa XMMWORD[(160+112)+rbp],xmm13 movdqa XMMWORD[(160+128)+rbp],xmm14 mov rcx,rbx mov r8,10*16 cmp rcx,10*16 cmovg rcx,r8 and rcx,-16 xor r8,r8 $L$open_sse_tail_192_rounds_and_x1hash: add r10,QWORD[((0+0))+r8*1+rsi] adc r11,QWORD[((8+0))+r8*1+rsi] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 $L$open_sse_tail_192_rounds: add r8,16 paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol16] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,12 psrld xmm4,20 pxor xmm4,xmm3 paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol8] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,7 psrld xmm4,25 pxor xmm4,xmm3 DB 102,15,58,15,228,4 DB 102,69,15,58,15,192,8 DB 102,69,15,58,15,228,12 paddd xmm1,xmm5 pxor xmm13,xmm1 pshufb xmm13,XMMWORD[$L$rol16] paddd xmm9,xmm13 pxor xmm5,xmm9 movdqa 
xmm3,xmm5 pslld xmm3,12 psrld xmm5,20 pxor xmm5,xmm3 paddd xmm1,xmm5 pxor xmm13,xmm1 pshufb xmm13,XMMWORD[$L$rol8] paddd xmm9,xmm13 pxor xmm5,xmm9 movdqa xmm3,xmm5 pslld xmm3,7 psrld xmm5,25 pxor xmm5,xmm3 DB 102,15,58,15,237,4 DB 102,69,15,58,15,201,8 DB 102,69,15,58,15,237,12 paddd xmm2,xmm6 pxor xmm14,xmm2 pshufb xmm14,XMMWORD[$L$rol16] paddd xmm10,xmm14 pxor xmm6,xmm10 movdqa xmm3,xmm6 pslld xmm3,12 psrld xmm6,20 pxor xmm6,xmm3 paddd xmm2,xmm6 pxor xmm14,xmm2 pshufb xmm14,XMMWORD[$L$rol8] paddd xmm10,xmm14 pxor xmm6,xmm10 movdqa xmm3,xmm6 pslld xmm3,7 psrld xmm6,25 pxor xmm6,xmm3 DB 102,15,58,15,246,4 DB 102,69,15,58,15,210,8 DB 102,69,15,58,15,246,12 paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol16] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,12 psrld xmm4,20 pxor xmm4,xmm3 paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol8] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,7 psrld xmm4,25 pxor xmm4,xmm3 DB 102,15,58,15,228,12 DB 102,69,15,58,15,192,8 DB 102,69,15,58,15,228,4 paddd xmm1,xmm5 pxor xmm13,xmm1 pshufb xmm13,XMMWORD[$L$rol16] paddd xmm9,xmm13 pxor xmm5,xmm9 movdqa xmm3,xmm5 pslld xmm3,12 psrld xmm5,20 pxor xmm5,xmm3 paddd xmm1,xmm5 pxor xmm13,xmm1 pshufb xmm13,XMMWORD[$L$rol8] paddd xmm9,xmm13 pxor xmm5,xmm9 movdqa xmm3,xmm5 pslld xmm3,7 psrld xmm5,25 pxor xmm5,xmm3 DB 102,15,58,15,237,12 DB 102,69,15,58,15,201,8 DB 102,69,15,58,15,237,4 paddd xmm2,xmm6 pxor xmm14,xmm2 pshufb xmm14,XMMWORD[$L$rol16] paddd xmm10,xmm14 pxor xmm6,xmm10 movdqa xmm3,xmm6 pslld xmm3,12 psrld xmm6,20 pxor xmm6,xmm3 paddd xmm2,xmm6 pxor xmm14,xmm2 pshufb xmm14,XMMWORD[$L$rol8] paddd xmm10,xmm14 pxor xmm6,xmm10 movdqa xmm3,xmm6 pslld xmm3,7 psrld xmm6,25 pxor xmm6,xmm3 DB 102,15,58,15,246,12 DB 102,69,15,58,15,210,8 DB 102,69,15,58,15,246,4 cmp r8,rcx jb NEAR $L$open_sse_tail_192_rounds_and_x1hash cmp r8,10*16 jne NEAR $L$open_sse_tail_192_rounds cmp rbx,11*16 jb NEAR $L$open_sse_tail_192_finish add r10,QWORD[((0+160))+rsi] adc r11,QWORD[((8+160))+rsi] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 cmp rbx,12*16 jb NEAR $L$open_sse_tail_192_finish add r10,QWORD[((0+176))+rsi] adc r11,QWORD[((8+176))+rsi] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 $L$open_sse_tail_192_finish: paddd xmm2,XMMWORD[$L$chacha20_consts] paddd xmm6,XMMWORD[((160+48))+rbp] paddd xmm10,XMMWORD[((160+64))+rbp] paddd xmm14,XMMWORD[((160+128))+rbp] paddd xmm1,XMMWORD[$L$chacha20_consts] paddd xmm5,XMMWORD[((160+48))+rbp] paddd xmm9,XMMWORD[((160+64))+rbp] paddd xmm13,XMMWORD[((160+112))+rbp] paddd xmm0,XMMWORD[$L$chacha20_consts] paddd xmm4,XMMWORD[((160+48))+rbp] paddd 
xmm8,XMMWORD[((160+64))+rbp] paddd xmm12,XMMWORD[((160+96))+rbp] movdqu xmm3,XMMWORD[((0 + 0))+rsi] movdqu xmm7,XMMWORD[((16 + 0))+rsi] movdqu xmm11,XMMWORD[((32 + 0))+rsi] movdqu xmm15,XMMWORD[((48 + 0))+rsi] pxor xmm2,xmm3 pxor xmm6,xmm7 pxor xmm10,xmm11 pxor xmm15,xmm14 movdqu XMMWORD[(0 + 0)+rdi],xmm2 movdqu XMMWORD[(16 + 0)+rdi],xmm6 movdqu XMMWORD[(32 + 0)+rdi],xmm10 movdqu XMMWORD[(48 + 0)+rdi],xmm15 movdqu xmm3,XMMWORD[((0 + 64))+rsi] movdqu xmm7,XMMWORD[((16 + 64))+rsi] movdqu xmm11,XMMWORD[((32 + 64))+rsi] movdqu xmm15,XMMWORD[((48 + 64))+rsi] pxor xmm1,xmm3 pxor xmm5,xmm7 pxor xmm9,xmm11 pxor xmm15,xmm13 movdqu XMMWORD[(0 + 64)+rdi],xmm1 movdqu XMMWORD[(16 + 64)+rdi],xmm5 movdqu XMMWORD[(32 + 64)+rdi],xmm9 movdqu XMMWORD[(48 + 64)+rdi],xmm15 sub rbx,8*16 lea rsi,[128+rsi] lea rdi,[128+rdi] jmp NEAR $L$open_sse_tail_64_dec_loop $L$open_sse_tail_256: movdqa xmm0,XMMWORD[$L$chacha20_consts] movdqa xmm4,XMMWORD[((160+48))+rbp] movdqa xmm8,XMMWORD[((160+64))+rbp] movdqa xmm1,xmm0 movdqa xmm5,xmm4 movdqa xmm9,xmm8 movdqa xmm2,xmm0 movdqa xmm6,xmm4 movdqa xmm10,xmm8 movdqa xmm3,xmm0 movdqa xmm7,xmm4 movdqa xmm11,xmm8 movdqa xmm15,XMMWORD[((160+96))+rbp] paddd xmm15,XMMWORD[$L$sse_inc] movdqa xmm14,xmm15 paddd xmm14,XMMWORD[$L$sse_inc] movdqa xmm13,xmm14 paddd xmm13,XMMWORD[$L$sse_inc] movdqa xmm12,xmm13 paddd xmm12,XMMWORD[$L$sse_inc] movdqa XMMWORD[(160+96)+rbp],xmm12 movdqa XMMWORD[(160+112)+rbp],xmm13 movdqa XMMWORD[(160+128)+rbp],xmm14 movdqa XMMWORD[(160+144)+rbp],xmm15 xor r8,r8 $L$open_sse_tail_256_rounds_and_x1hash: add r10,QWORD[((0+0))+r8*1+rsi] adc r11,QWORD[((8+0))+r8*1+rsi] adc r12,1 movdqa XMMWORD[(160+80)+rbp],xmm11 paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol16] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm11,xmm4 pslld xmm11,12 psrld xmm4,20 pxor xmm4,xmm11 paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol8] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm11,xmm4 pslld xmm11,7 psrld xmm4,25 pxor xmm4,xmm11 DB 102,15,58,15,228,4 DB 102,69,15,58,15,192,8 DB 102,69,15,58,15,228,12 paddd xmm1,xmm5 pxor xmm13,xmm1 pshufb xmm13,XMMWORD[$L$rol16] paddd xmm9,xmm13 pxor xmm5,xmm9 movdqa xmm11,xmm5 pslld xmm11,12 psrld xmm5,20 pxor xmm5,xmm11 paddd xmm1,xmm5 pxor xmm13,xmm1 pshufb xmm13,XMMWORD[$L$rol8] paddd xmm9,xmm13 pxor xmm5,xmm9 movdqa xmm11,xmm5 pslld xmm11,7 psrld xmm5,25 pxor xmm5,xmm11 DB 102,15,58,15,237,4 DB 102,69,15,58,15,201,8 DB 102,69,15,58,15,237,12 paddd xmm2,xmm6 pxor xmm14,xmm2 pshufb xmm14,XMMWORD[$L$rol16] paddd xmm10,xmm14 pxor xmm6,xmm10 movdqa xmm11,xmm6 pslld xmm11,12 psrld xmm6,20 pxor xmm6,xmm11 paddd xmm2,xmm6 pxor xmm14,xmm2 pshufb xmm14,XMMWORD[$L$rol8] paddd xmm10,xmm14 pxor xmm6,xmm10 movdqa xmm11,xmm6 pslld xmm11,7 psrld xmm6,25 pxor xmm6,xmm11 DB 102,15,58,15,246,4 DB 102,69,15,58,15,210,8 DB 102,69,15,58,15,246,12 movdqa xmm11,XMMWORD[((160+80))+rbp] mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx movdqa XMMWORD[(160+80)+rbp],xmm9 paddd xmm3,xmm7 pxor xmm15,xmm3 pshufb xmm15,XMMWORD[$L$rol16] paddd xmm11,xmm15 pxor xmm7,xmm11 movdqa xmm9,xmm7 pslld xmm9,12 psrld xmm7,20 pxor xmm7,xmm9 paddd xmm3,xmm7 pxor xmm15,xmm3 pshufb xmm15,XMMWORD[$L$rol8] paddd xmm11,xmm15 pxor xmm7,xmm11 movdqa xmm9,xmm7 pslld xmm9,7 psrld xmm7,25 pxor xmm7,xmm9 DB 102,15,58,15,255,4 DB 102,69,15,58,15,219,8 DB 102,69,15,58,15,255,12 movdqa xmm9,XMMWORD[((160+80))+rbp] mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx 
mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 movdqa XMMWORD[(160+80)+rbp],xmm11 paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol16] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm11,xmm4 pslld xmm11,12 psrld xmm4,20 pxor xmm4,xmm11 paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol8] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm11,xmm4 pslld xmm11,7 psrld xmm4,25 pxor xmm4,xmm11 DB 102,15,58,15,228,12 DB 102,69,15,58,15,192,8 DB 102,69,15,58,15,228,4 paddd xmm1,xmm5 pxor xmm13,xmm1 pshufb xmm13,XMMWORD[$L$rol16] paddd xmm9,xmm13 pxor xmm5,xmm9 movdqa xmm11,xmm5 pslld xmm11,12 psrld xmm5,20 pxor xmm5,xmm11 paddd xmm1,xmm5 pxor xmm13,xmm1 pshufb xmm13,XMMWORD[$L$rol8] paddd xmm9,xmm13 pxor xmm5,xmm9 movdqa xmm11,xmm5 pslld xmm11,7 psrld xmm5,25 pxor xmm5,xmm11 DB 102,15,58,15,237,12 DB 102,69,15,58,15,201,8 DB 102,69,15,58,15,237,4 imul r9,r12 add r15,r10 adc r9,rdx paddd xmm2,xmm6 pxor xmm14,xmm2 pshufb xmm14,XMMWORD[$L$rol16] paddd xmm10,xmm14 pxor xmm6,xmm10 movdqa xmm11,xmm6 pslld xmm11,12 psrld xmm6,20 pxor xmm6,xmm11 paddd xmm2,xmm6 pxor xmm14,xmm2 pshufb xmm14,XMMWORD[$L$rol8] paddd xmm10,xmm14 pxor xmm6,xmm10 movdqa xmm11,xmm6 pslld xmm11,7 psrld xmm6,25 pxor xmm6,xmm11 DB 102,15,58,15,246,12 DB 102,69,15,58,15,210,8 DB 102,69,15,58,15,246,4 movdqa xmm11,XMMWORD[((160+80))+rbp] mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 movdqa XMMWORD[(160+80)+rbp],xmm9 paddd xmm3,xmm7 pxor xmm15,xmm3 pshufb xmm15,XMMWORD[$L$rol16] paddd xmm11,xmm15 pxor xmm7,xmm11 movdqa xmm9,xmm7 pslld xmm9,12 psrld xmm7,20 pxor xmm7,xmm9 paddd xmm3,xmm7 pxor xmm15,xmm3 pshufb xmm15,XMMWORD[$L$rol8] paddd xmm11,xmm15 pxor xmm7,xmm11 movdqa xmm9,xmm7 pslld xmm9,7 psrld xmm7,25 pxor xmm7,xmm9 DB 102,15,58,15,255,12 DB 102,69,15,58,15,219,8 DB 102,69,15,58,15,255,4 movdqa xmm9,XMMWORD[((160+80))+rbp] add r8,16 cmp r8,10*16 jb NEAR $L$open_sse_tail_256_rounds_and_x1hash mov rcx,rbx and rcx,-16 $L$open_sse_tail_256_hash: add r10,QWORD[((0+0))+r8*1+rsi] adc r11,QWORD[((8+0))+r8*1+rsi] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 add r8,16 cmp r8,rcx jb NEAR $L$open_sse_tail_256_hash paddd xmm3,XMMWORD[$L$chacha20_consts] paddd xmm7,XMMWORD[((160+48))+rbp] paddd xmm11,XMMWORD[((160+64))+rbp] paddd xmm15,XMMWORD[((160+144))+rbp] paddd xmm2,XMMWORD[$L$chacha20_consts] paddd xmm6,XMMWORD[((160+48))+rbp] paddd xmm10,XMMWORD[((160+64))+rbp] paddd xmm14,XMMWORD[((160+128))+rbp] paddd xmm1,XMMWORD[$L$chacha20_consts] paddd xmm5,XMMWORD[((160+48))+rbp] paddd xmm9,XMMWORD[((160+64))+rbp] paddd xmm13,XMMWORD[((160+112))+rbp] paddd xmm0,XMMWORD[$L$chacha20_consts] paddd xmm4,XMMWORD[((160+48))+rbp] paddd xmm8,XMMWORD[((160+64))+rbp] paddd xmm12,XMMWORD[((160+96))+rbp] movdqa XMMWORD[(160+80)+rbp],xmm12 movdqu xmm12,XMMWORD[((0 + 0))+rsi] pxor xmm12,xmm3 movdqu XMMWORD[(0 + 0)+rdi],xmm12 movdqu xmm12,XMMWORD[((16 + 0))+rsi] pxor xmm12,xmm7 movdqu XMMWORD[(16 + 0)+rdi],xmm12 movdqu xmm12,XMMWORD[((32 + 0))+rsi] pxor xmm12,xmm11 movdqu XMMWORD[(32 
+ 0)+rdi],xmm12 movdqu xmm12,XMMWORD[((48 + 0))+rsi] pxor xmm12,xmm15 movdqu XMMWORD[(48 + 0)+rdi],xmm12 movdqu xmm3,XMMWORD[((0 + 64))+rsi] movdqu xmm7,XMMWORD[((16 + 64))+rsi] movdqu xmm11,XMMWORD[((32 + 64))+rsi] movdqu xmm15,XMMWORD[((48 + 64))+rsi] pxor xmm2,xmm3 pxor xmm6,xmm7 pxor xmm10,xmm11 pxor xmm15,xmm14 movdqu XMMWORD[(0 + 64)+rdi],xmm2 movdqu XMMWORD[(16 + 64)+rdi],xmm6 movdqu XMMWORD[(32 + 64)+rdi],xmm10 movdqu XMMWORD[(48 + 64)+rdi],xmm15 movdqu xmm3,XMMWORD[((0 + 128))+rsi] movdqu xmm7,XMMWORD[((16 + 128))+rsi] movdqu xmm11,XMMWORD[((32 + 128))+rsi] movdqu xmm15,XMMWORD[((48 + 128))+rsi] pxor xmm1,xmm3 pxor xmm5,xmm7 pxor xmm9,xmm11 pxor xmm15,xmm13 movdqu XMMWORD[(0 + 128)+rdi],xmm1 movdqu XMMWORD[(16 + 128)+rdi],xmm5 movdqu XMMWORD[(32 + 128)+rdi],xmm9 movdqu XMMWORD[(48 + 128)+rdi],xmm15 movdqa xmm12,XMMWORD[((160+80))+rbp] sub rbx,12*16 lea rsi,[192+rsi] lea rdi,[192+rdi] $L$open_sse_tail_64_dec_loop: cmp rbx,16 jb NEAR $L$open_sse_tail_16_init sub rbx,16 movdqu xmm3,XMMWORD[rsi] pxor xmm0,xmm3 movdqu XMMWORD[rdi],xmm0 lea rsi,[16+rsi] lea rdi,[16+rdi] movdqa xmm0,xmm4 movdqa xmm4,xmm8 movdqa xmm8,xmm12 jmp NEAR $L$open_sse_tail_64_dec_loop $L$open_sse_tail_16_init: movdqa xmm1,xmm0 $L$open_sse_tail_16: test rbx,rbx jz NEAR $L$open_sse_finalize pxor xmm3,xmm3 lea rsi,[((-1))+rbx*1+rsi] mov r8,rbx $L$open_sse_tail_16_compose: pslldq xmm3,1 pinsrb xmm3,BYTE[rsi],0 sub rsi,1 sub r8,1 jnz NEAR $L$open_sse_tail_16_compose DB 102,73,15,126,221 pextrq r14,xmm3,1 pxor xmm3,xmm1 $L$open_sse_tail_16_extract: pextrb XMMWORD[rdi],xmm3,0 psrldq xmm3,1 add rdi,1 sub rbx,1 jne NEAR $L$open_sse_tail_16_extract add r10,r13 adc r11,r14 adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 $L$open_sse_finalize: add r10,QWORD[((0+160+32))+rbp] adc r11,QWORD[((8+160+32))+rbp] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 mov r13,r10 mov r14,r11 mov r15,r12 sub r10,-5 sbb r11,-1 sbb r12,3 cmovc r10,r13 cmovc r11,r14 cmovc r12,r15 add r10,QWORD[((0+160+16))+rbp] adc r11,QWORD[((8+160+16))+rbp] movaps xmm6,XMMWORD[((0+0))+rbp] movaps xmm7,XMMWORD[((16+0))+rbp] movaps xmm8,XMMWORD[((32+0))+rbp] movaps xmm9,XMMWORD[((48+0))+rbp] movaps xmm10,XMMWORD[((64+0))+rbp] movaps xmm11,XMMWORD[((80+0))+rbp] movaps xmm12,XMMWORD[((96+0))+rbp] movaps xmm13,XMMWORD[((112+0))+rbp] movaps xmm14,XMMWORD[((128+0))+rbp] movaps xmm15,XMMWORD[((144+0))+rbp] add rsp,288 + 160 + 32 pop r9 mov QWORD[r9],r10 mov QWORD[8+r9],r11 pop r15 pop r14 pop r13 pop r12 pop rbx pop rbp mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$open_sse_128: movdqu xmm0,XMMWORD[$L$chacha20_consts] movdqa xmm1,xmm0 movdqa xmm2,xmm0 movdqu 
xmm4,XMMWORD[r9] movdqa xmm5,xmm4 movdqa xmm6,xmm4 movdqu xmm8,XMMWORD[16+r9] movdqa xmm9,xmm8 movdqa xmm10,xmm8 movdqu xmm12,XMMWORD[32+r9] movdqa xmm13,xmm12 paddd xmm13,XMMWORD[$L$sse_inc] movdqa xmm14,xmm13 paddd xmm14,XMMWORD[$L$sse_inc] movdqa xmm7,xmm4 movdqa xmm11,xmm8 movdqa xmm15,xmm13 mov r10,10 $L$open_sse_128_rounds: paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol16] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,12 psrld xmm4,20 pxor xmm4,xmm3 paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol8] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,7 psrld xmm4,25 pxor xmm4,xmm3 DB 102,15,58,15,228,4 DB 102,69,15,58,15,192,8 DB 102,69,15,58,15,228,12 paddd xmm1,xmm5 pxor xmm13,xmm1 pshufb xmm13,XMMWORD[$L$rol16] paddd xmm9,xmm13 pxor xmm5,xmm9 movdqa xmm3,xmm5 pslld xmm3,12 psrld xmm5,20 pxor xmm5,xmm3 paddd xmm1,xmm5 pxor xmm13,xmm1 pshufb xmm13,XMMWORD[$L$rol8] paddd xmm9,xmm13 pxor xmm5,xmm9 movdqa xmm3,xmm5 pslld xmm3,7 psrld xmm5,25 pxor xmm5,xmm3 DB 102,15,58,15,237,4 DB 102,69,15,58,15,201,8 DB 102,69,15,58,15,237,12 paddd xmm2,xmm6 pxor xmm14,xmm2 pshufb xmm14,XMMWORD[$L$rol16] paddd xmm10,xmm14 pxor xmm6,xmm10 movdqa xmm3,xmm6 pslld xmm3,12 psrld xmm6,20 pxor xmm6,xmm3 paddd xmm2,xmm6 pxor xmm14,xmm2 pshufb xmm14,XMMWORD[$L$rol8] paddd xmm10,xmm14 pxor xmm6,xmm10 movdqa xmm3,xmm6 pslld xmm3,7 psrld xmm6,25 pxor xmm6,xmm3 DB 102,15,58,15,246,4 DB 102,69,15,58,15,210,8 DB 102,69,15,58,15,246,12 paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol16] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,12 psrld xmm4,20 pxor xmm4,xmm3 paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol8] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,7 psrld xmm4,25 pxor xmm4,xmm3 DB 102,15,58,15,228,12 DB 102,69,15,58,15,192,8 DB 102,69,15,58,15,228,4 paddd xmm1,xmm5 pxor xmm13,xmm1 pshufb xmm13,XMMWORD[$L$rol16] paddd xmm9,xmm13 pxor xmm5,xmm9 movdqa xmm3,xmm5 pslld xmm3,12 psrld xmm5,20 pxor xmm5,xmm3 paddd xmm1,xmm5 pxor xmm13,xmm1 pshufb xmm13,XMMWORD[$L$rol8] paddd xmm9,xmm13 pxor xmm5,xmm9 movdqa xmm3,xmm5 pslld xmm3,7 psrld xmm5,25 pxor xmm5,xmm3 DB 102,15,58,15,237,12 DB 102,69,15,58,15,201,8 DB 102,69,15,58,15,237,4 paddd xmm2,xmm6 pxor xmm14,xmm2 pshufb xmm14,XMMWORD[$L$rol16] paddd xmm10,xmm14 pxor xmm6,xmm10 movdqa xmm3,xmm6 pslld xmm3,12 psrld xmm6,20 pxor xmm6,xmm3 paddd xmm2,xmm6 pxor xmm14,xmm2 pshufb xmm14,XMMWORD[$L$rol8] paddd xmm10,xmm14 pxor xmm6,xmm10 movdqa xmm3,xmm6 pslld xmm3,7 psrld xmm6,25 pxor xmm6,xmm3 DB 102,15,58,15,246,12 DB 102,69,15,58,15,210,8 DB 102,69,15,58,15,246,4 dec r10 jnz NEAR $L$open_sse_128_rounds paddd xmm0,XMMWORD[$L$chacha20_consts] paddd xmm1,XMMWORD[$L$chacha20_consts] paddd xmm2,XMMWORD[$L$chacha20_consts] paddd xmm4,xmm7 paddd xmm5,xmm7 paddd xmm6,xmm7 paddd xmm9,xmm11 paddd xmm10,xmm11 paddd xmm13,xmm15 paddd xmm15,XMMWORD[$L$sse_inc] paddd xmm14,xmm15 pand xmm0,XMMWORD[$L$clamp] movdqa XMMWORD[(160+0)+rbp],xmm0 movdqa XMMWORD[(160+16)+rbp],xmm4 mov r8,r8 call poly_hash_ad_internal $L$open_sse_128_xor_hash: cmp rbx,16 jb NEAR $L$open_sse_tail_16 sub rbx,16 add r10,QWORD[((0+0))+rsi] adc r11,QWORD[((8+0))+rsi] adc r12,1 movdqu xmm3,XMMWORD[rsi] pxor xmm1,xmm3 movdqu XMMWORD[rdi],xmm1 lea rsi,[16+rsi] lea rdi,[16+rdi] mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov 
rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 movdqa xmm1,xmm5 movdqa xmm5,xmm9 movdqa xmm9,xmm13 movdqa xmm13,xmm2 movdqa xmm2,xmm6 movdqa xmm6,xmm10 movdqa xmm10,xmm14 jmp NEAR $L$open_sse_128_xor_hash $L$SEH_end_chacha20_poly1305_open_sse41: global chacha20_poly1305_seal_sse41 ALIGN 64 chacha20_poly1305_seal_sse41: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_chacha20_poly1305_seal_sse41: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD[40+rsp] mov r9,QWORD[48+rsp] _CET_ENDBR push rbp push rbx push r12 push r13 push r14 push r15 push r9 sub rsp,288 + 160 + 32 lea rbp,[32+rsp] and rbp,-32 movaps XMMWORD[(0+0)+rbp],xmm6 movaps XMMWORD[(16+0)+rbp],xmm7 movaps XMMWORD[(32+0)+rbp],xmm8 movaps XMMWORD[(48+0)+rbp],xmm9 movaps XMMWORD[(64+0)+rbp],xmm10 movaps XMMWORD[(80+0)+rbp],xmm11 movaps XMMWORD[(96+0)+rbp],xmm12 movaps XMMWORD[(112+0)+rbp],xmm13 movaps XMMWORD[(128+0)+rbp],xmm14 movaps XMMWORD[(144+0)+rbp],xmm15 mov rbx,QWORD[56+r9] add rbx,rdx mov QWORD[((0+160+32))+rbp],r8 mov QWORD[((8+160+32))+rbp],rbx mov rbx,rdx cmp rbx,128 jbe NEAR $L$seal_sse_128 movdqa xmm0,XMMWORD[$L$chacha20_consts] movdqu xmm4,XMMWORD[r9] movdqu xmm8,XMMWORD[16+r9] movdqu xmm12,XMMWORD[32+r9] movdqa xmm1,xmm0 movdqa xmm2,xmm0 movdqa xmm3,xmm0 movdqa xmm5,xmm4 movdqa xmm6,xmm4 movdqa xmm7,xmm4 movdqa xmm9,xmm8 movdqa xmm10,xmm8 movdqa xmm11,xmm8 movdqa xmm15,xmm12 paddd xmm12,XMMWORD[$L$sse_inc] movdqa xmm14,xmm12 paddd xmm12,XMMWORD[$L$sse_inc] movdqa xmm13,xmm12 paddd xmm12,XMMWORD[$L$sse_inc] movdqa XMMWORD[(160+48)+rbp],xmm4 movdqa XMMWORD[(160+64)+rbp],xmm8 movdqa XMMWORD[(160+96)+rbp],xmm12 movdqa XMMWORD[(160+112)+rbp],xmm13 movdqa XMMWORD[(160+128)+rbp],xmm14 movdqa XMMWORD[(160+144)+rbp],xmm15 mov r10,10 $L$seal_sse_init_rounds: movdqa XMMWORD[(160+80)+rbp],xmm8 movdqa xmm8,XMMWORD[$L$rol16] paddd xmm3,xmm7 paddd xmm2,xmm6 paddd xmm1,xmm5 paddd xmm0,xmm4 pxor xmm15,xmm3 pxor xmm14,xmm2 pxor xmm13,xmm1 pxor xmm12,xmm0 DB 102,69,15,56,0,248 DB 102,69,15,56,0,240 DB 102,69,15,56,0,232 DB 102,69,15,56,0,224 movdqa xmm8,XMMWORD[((160+80))+rbp] paddd xmm11,xmm15 paddd xmm10,xmm14 paddd xmm9,xmm13 paddd xmm8,xmm12 pxor xmm7,xmm11 pxor xmm6,xmm10 pxor xmm5,xmm9 pxor xmm4,xmm8 movdqa XMMWORD[(160+80)+rbp],xmm8 movdqa xmm8,xmm7 psrld xmm8,20 pslld xmm7,32-20 pxor xmm7,xmm8 movdqa xmm8,xmm6 psrld xmm8,20 pslld xmm6,32-20 pxor xmm6,xmm8 movdqa xmm8,xmm5 psrld xmm8,20 pslld xmm5,32-20 pxor xmm5,xmm8 movdqa xmm8,xmm4 psrld xmm8,20 pslld xmm4,32-20 pxor xmm4,xmm8 movdqa xmm8,XMMWORD[$L$rol8] paddd xmm3,xmm7 paddd xmm2,xmm6 paddd xmm1,xmm5 paddd xmm0,xmm4 pxor xmm15,xmm3 pxor xmm14,xmm2 pxor xmm13,xmm1 pxor xmm12,xmm0 DB 102,69,15,56,0,248 DB 102,69,15,56,0,240 DB 102,69,15,56,0,232 DB 102,69,15,56,0,224 movdqa xmm8,XMMWORD[((160+80))+rbp] paddd xmm11,xmm15 paddd xmm10,xmm14 paddd xmm9,xmm13 paddd xmm8,xmm12 pxor xmm7,xmm11 pxor xmm6,xmm10 pxor xmm5,xmm9 pxor xmm4,xmm8 movdqa XMMWORD[(160+80)+rbp],xmm8 movdqa xmm8,xmm7 psrld xmm8,25 pslld xmm7,32-25 pxor xmm7,xmm8 movdqa xmm8,xmm6 psrld xmm8,25 pslld xmm6,32-25 pxor xmm6,xmm8 movdqa xmm8,xmm5 psrld xmm8,25 pslld xmm5,32-25 pxor xmm5,xmm8 movdqa xmm8,xmm4 psrld xmm8,25 pslld xmm4,32-25 pxor xmm4,xmm8 movdqa xmm8,XMMWORD[((160+80))+rbp] DB 102,15,58,15,255,4 DB 102,69,15,58,15,219,8 DB 102,69,15,58,15,255,12 DB 
102,15,58,15,246,4 DB 102,69,15,58,15,210,8 DB 102,69,15,58,15,246,12 DB 102,15,58,15,237,4 DB 102,69,15,58,15,201,8 DB 102,69,15,58,15,237,12 DB 102,15,58,15,228,4 DB 102,69,15,58,15,192,8 DB 102,69,15,58,15,228,12 movdqa XMMWORD[(160+80)+rbp],xmm8 movdqa xmm8,XMMWORD[$L$rol16] paddd xmm3,xmm7 paddd xmm2,xmm6 paddd xmm1,xmm5 paddd xmm0,xmm4 pxor xmm15,xmm3 pxor xmm14,xmm2 pxor xmm13,xmm1 pxor xmm12,xmm0 DB 102,69,15,56,0,248 DB 102,69,15,56,0,240 DB 102,69,15,56,0,232 DB 102,69,15,56,0,224 movdqa xmm8,XMMWORD[((160+80))+rbp] paddd xmm11,xmm15 paddd xmm10,xmm14 paddd xmm9,xmm13 paddd xmm8,xmm12 pxor xmm7,xmm11 pxor xmm6,xmm10 pxor xmm5,xmm9 pxor xmm4,xmm8 movdqa XMMWORD[(160+80)+rbp],xmm8 movdqa xmm8,xmm7 psrld xmm8,20 pslld xmm7,32-20 pxor xmm7,xmm8 movdqa xmm8,xmm6 psrld xmm8,20 pslld xmm6,32-20 pxor xmm6,xmm8 movdqa xmm8,xmm5 psrld xmm8,20 pslld xmm5,32-20 pxor xmm5,xmm8 movdqa xmm8,xmm4 psrld xmm8,20 pslld xmm4,32-20 pxor xmm4,xmm8 movdqa xmm8,XMMWORD[$L$rol8] paddd xmm3,xmm7 paddd xmm2,xmm6 paddd xmm1,xmm5 paddd xmm0,xmm4 pxor xmm15,xmm3 pxor xmm14,xmm2 pxor xmm13,xmm1 pxor xmm12,xmm0 DB 102,69,15,56,0,248 DB 102,69,15,56,0,240 DB 102,69,15,56,0,232 DB 102,69,15,56,0,224 movdqa xmm8,XMMWORD[((160+80))+rbp] paddd xmm11,xmm15 paddd xmm10,xmm14 paddd xmm9,xmm13 paddd xmm8,xmm12 pxor xmm7,xmm11 pxor xmm6,xmm10 pxor xmm5,xmm9 pxor xmm4,xmm8 movdqa XMMWORD[(160+80)+rbp],xmm8 movdqa xmm8,xmm7 psrld xmm8,25 pslld xmm7,32-25 pxor xmm7,xmm8 movdqa xmm8,xmm6 psrld xmm8,25 pslld xmm6,32-25 pxor xmm6,xmm8 movdqa xmm8,xmm5 psrld xmm8,25 pslld xmm5,32-25 pxor xmm5,xmm8 movdqa xmm8,xmm4 psrld xmm8,25 pslld xmm4,32-25 pxor xmm4,xmm8 movdqa xmm8,XMMWORD[((160+80))+rbp] DB 102,15,58,15,255,12 DB 102,69,15,58,15,219,8 DB 102,69,15,58,15,255,4 DB 102,15,58,15,246,12 DB 102,69,15,58,15,210,8 DB 102,69,15,58,15,246,4 DB 102,15,58,15,237,12 DB 102,69,15,58,15,201,8 DB 102,69,15,58,15,237,4 DB 102,15,58,15,228,12 DB 102,69,15,58,15,192,8 DB 102,69,15,58,15,228,4 dec r10 jnz NEAR $L$seal_sse_init_rounds paddd xmm3,XMMWORD[$L$chacha20_consts] paddd xmm7,XMMWORD[((160+48))+rbp] paddd xmm11,XMMWORD[((160+64))+rbp] paddd xmm15,XMMWORD[((160+144))+rbp] paddd xmm2,XMMWORD[$L$chacha20_consts] paddd xmm6,XMMWORD[((160+48))+rbp] paddd xmm10,XMMWORD[((160+64))+rbp] paddd xmm14,XMMWORD[((160+128))+rbp] paddd xmm1,XMMWORD[$L$chacha20_consts] paddd xmm5,XMMWORD[((160+48))+rbp] paddd xmm9,XMMWORD[((160+64))+rbp] paddd xmm13,XMMWORD[((160+112))+rbp] paddd xmm0,XMMWORD[$L$chacha20_consts] paddd xmm4,XMMWORD[((160+48))+rbp] paddd xmm8,XMMWORD[((160+64))+rbp] paddd xmm12,XMMWORD[((160+96))+rbp] pand xmm3,XMMWORD[$L$clamp] movdqa XMMWORD[(160+0)+rbp],xmm3 movdqa XMMWORD[(160+16)+rbp],xmm7 mov r8,r8 call poly_hash_ad_internal movdqu xmm3,XMMWORD[((0 + 0))+rsi] movdqu xmm7,XMMWORD[((16 + 0))+rsi] movdqu xmm11,XMMWORD[((32 + 0))+rsi] movdqu xmm15,XMMWORD[((48 + 0))+rsi] pxor xmm2,xmm3 pxor xmm6,xmm7 pxor xmm10,xmm11 pxor xmm15,xmm14 movdqu XMMWORD[(0 + 0)+rdi],xmm2 movdqu XMMWORD[(16 + 0)+rdi],xmm6 movdqu XMMWORD[(32 + 0)+rdi],xmm10 movdqu XMMWORD[(48 + 0)+rdi],xmm15 movdqu xmm3,XMMWORD[((0 + 64))+rsi] movdqu xmm7,XMMWORD[((16 + 64))+rsi] movdqu xmm11,XMMWORD[((32 + 64))+rsi] movdqu xmm15,XMMWORD[((48 + 64))+rsi] pxor xmm1,xmm3 pxor xmm5,xmm7 pxor xmm9,xmm11 pxor xmm15,xmm13 movdqu XMMWORD[(0 + 64)+rdi],xmm1 movdqu XMMWORD[(16 + 64)+rdi],xmm5 movdqu XMMWORD[(32 + 64)+rdi],xmm9 movdqu XMMWORD[(48 + 64)+rdi],xmm15 cmp rbx,12*16 ja NEAR $L$seal_sse_main_init mov rcx,8*16 sub rbx,8*16 lea rsi,[128+rsi] jmp NEAR 
$L$seal_sse_128_tail_hash $L$seal_sse_main_init: movdqu xmm3,XMMWORD[((0 + 128))+rsi] movdqu xmm7,XMMWORD[((16 + 128))+rsi] movdqu xmm11,XMMWORD[((32 + 128))+rsi] movdqu xmm15,XMMWORD[((48 + 128))+rsi] pxor xmm0,xmm3 pxor xmm4,xmm7 pxor xmm8,xmm11 pxor xmm15,xmm12 movdqu XMMWORD[(0 + 128)+rdi],xmm0 movdqu XMMWORD[(16 + 128)+rdi],xmm4 movdqu XMMWORD[(32 + 128)+rdi],xmm8 movdqu XMMWORD[(48 + 128)+rdi],xmm15 mov rcx,12*16 sub rbx,12*16 lea rsi,[192+rsi] mov rcx,2 mov r8,8 cmp rbx,4*16 jbe NEAR $L$seal_sse_tail_64 cmp rbx,8*16 jbe NEAR $L$seal_sse_tail_128 cmp rbx,12*16 jbe NEAR $L$seal_sse_tail_192 $L$seal_sse_main_loop: movdqa xmm0,XMMWORD[$L$chacha20_consts] movdqa xmm4,XMMWORD[((160+48))+rbp] movdqa xmm8,XMMWORD[((160+64))+rbp] movdqa xmm1,xmm0 movdqa xmm5,xmm4 movdqa xmm9,xmm8 movdqa xmm2,xmm0 movdqa xmm6,xmm4 movdqa xmm10,xmm8 movdqa xmm3,xmm0 movdqa xmm7,xmm4 movdqa xmm11,xmm8 movdqa xmm15,XMMWORD[((160+96))+rbp] paddd xmm15,XMMWORD[$L$sse_inc] movdqa xmm14,xmm15 paddd xmm14,XMMWORD[$L$sse_inc] movdqa xmm13,xmm14 paddd xmm13,XMMWORD[$L$sse_inc] movdqa xmm12,xmm13 paddd xmm12,XMMWORD[$L$sse_inc] movdqa XMMWORD[(160+96)+rbp],xmm12 movdqa XMMWORD[(160+112)+rbp],xmm13 movdqa XMMWORD[(160+128)+rbp],xmm14 movdqa XMMWORD[(160+144)+rbp],xmm15 ALIGN 32 $L$seal_sse_main_rounds: movdqa XMMWORD[(160+80)+rbp],xmm8 movdqa xmm8,XMMWORD[$L$rol16] paddd xmm3,xmm7 paddd xmm2,xmm6 paddd xmm1,xmm5 paddd xmm0,xmm4 pxor xmm15,xmm3 pxor xmm14,xmm2 pxor xmm13,xmm1 pxor xmm12,xmm0 DB 102,69,15,56,0,248 DB 102,69,15,56,0,240 DB 102,69,15,56,0,232 DB 102,69,15,56,0,224 movdqa xmm8,XMMWORD[((160+80))+rbp] paddd xmm11,xmm15 paddd xmm10,xmm14 paddd xmm9,xmm13 paddd xmm8,xmm12 pxor xmm7,xmm11 add r10,QWORD[((0+0))+rdi] adc r11,QWORD[((8+0))+rdi] adc r12,1 pxor xmm6,xmm10 pxor xmm5,xmm9 pxor xmm4,xmm8 movdqa XMMWORD[(160+80)+rbp],xmm8 movdqa xmm8,xmm7 psrld xmm8,20 pslld xmm7,32-20 pxor xmm7,xmm8 movdqa xmm8,xmm6 psrld xmm8,20 pslld xmm6,32-20 pxor xmm6,xmm8 movdqa xmm8,xmm5 psrld xmm8,20 pslld xmm5,32-20 pxor xmm5,xmm8 movdqa xmm8,xmm4 psrld xmm8,20 pslld xmm4,32-20 pxor xmm4,xmm8 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx movdqa xmm8,XMMWORD[$L$rol8] paddd xmm3,xmm7 paddd xmm2,xmm6 paddd xmm1,xmm5 paddd xmm0,xmm4 pxor xmm15,xmm3 pxor xmm14,xmm2 pxor xmm13,xmm1 pxor xmm12,xmm0 DB 102,69,15,56,0,248 DB 102,69,15,56,0,240 DB 102,69,15,56,0,232 DB 102,69,15,56,0,224 movdqa xmm8,XMMWORD[((160+80))+rbp] paddd xmm11,xmm15 paddd xmm10,xmm14 paddd xmm9,xmm13 paddd xmm8,xmm12 pxor xmm7,xmm11 pxor xmm6,xmm10 mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 pxor xmm5,xmm9 pxor xmm4,xmm8 movdqa XMMWORD[(160+80)+rbp],xmm8 movdqa xmm8,xmm7 psrld xmm8,25 pslld xmm7,32-25 pxor xmm7,xmm8 movdqa xmm8,xmm6 psrld xmm8,25 pslld xmm6,32-25 pxor xmm6,xmm8 movdqa xmm8,xmm5 psrld xmm8,25 pslld xmm5,32-25 pxor xmm5,xmm8 movdqa xmm8,xmm4 psrld xmm8,25 pslld xmm4,32-25 pxor xmm4,xmm8 movdqa xmm8,XMMWORD[((160+80))+rbp] imul r9,r12 add r15,r10 adc r9,rdx DB 102,15,58,15,255,4 DB 102,69,15,58,15,219,8 DB 102,69,15,58,15,255,12 DB 102,15,58,15,246,4 DB 102,69,15,58,15,210,8 DB 102,69,15,58,15,246,12 DB 102,15,58,15,237,4 DB 102,69,15,58,15,201,8 DB 102,69,15,58,15,237,12 DB 102,15,58,15,228,4 DB 102,69,15,58,15,192,8 DB 102,69,15,58,15,228,12 movdqa XMMWORD[(160+80)+rbp],xmm8 movdqa xmm8,XMMWORD[$L$rol16] paddd xmm3,xmm7 paddd xmm2,xmm6 paddd 
xmm1,xmm5 paddd xmm0,xmm4 pxor xmm15,xmm3 pxor xmm14,xmm2 mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 pxor xmm13,xmm1 pxor xmm12,xmm0 DB 102,69,15,56,0,248 DB 102,69,15,56,0,240 DB 102,69,15,56,0,232 DB 102,69,15,56,0,224 movdqa xmm8,XMMWORD[((160+80))+rbp] paddd xmm11,xmm15 paddd xmm10,xmm14 paddd xmm9,xmm13 paddd xmm8,xmm12 pxor xmm7,xmm11 pxor xmm6,xmm10 pxor xmm5,xmm9 pxor xmm4,xmm8 movdqa XMMWORD[(160+80)+rbp],xmm8 movdqa xmm8,xmm7 psrld xmm8,20 pslld xmm7,32-20 pxor xmm7,xmm8 movdqa xmm8,xmm6 psrld xmm8,20 pslld xmm6,32-20 pxor xmm6,xmm8 movdqa xmm8,xmm5 psrld xmm8,20 pslld xmm5,32-20 pxor xmm5,xmm8 movdqa xmm8,xmm4 psrld xmm8,20 pslld xmm4,32-20 pxor xmm4,xmm8 movdqa xmm8,XMMWORD[$L$rol8] paddd xmm3,xmm7 paddd xmm2,xmm6 paddd xmm1,xmm5 paddd xmm0,xmm4 pxor xmm15,xmm3 pxor xmm14,xmm2 pxor xmm13,xmm1 pxor xmm12,xmm0 DB 102,69,15,56,0,248 DB 102,69,15,56,0,240 DB 102,69,15,56,0,232 DB 102,69,15,56,0,224 movdqa xmm8,XMMWORD[((160+80))+rbp] paddd xmm11,xmm15 paddd xmm10,xmm14 paddd xmm9,xmm13 paddd xmm8,xmm12 pxor xmm7,xmm11 pxor xmm6,xmm10 pxor xmm5,xmm9 pxor xmm4,xmm8 movdqa XMMWORD[(160+80)+rbp],xmm8 movdqa xmm8,xmm7 psrld xmm8,25 pslld xmm7,32-25 pxor xmm7,xmm8 movdqa xmm8,xmm6 psrld xmm8,25 pslld xmm6,32-25 pxor xmm6,xmm8 movdqa xmm8,xmm5 psrld xmm8,25 pslld xmm5,32-25 pxor xmm5,xmm8 movdqa xmm8,xmm4 psrld xmm8,25 pslld xmm4,32-25 pxor xmm4,xmm8 movdqa xmm8,XMMWORD[((160+80))+rbp] DB 102,15,58,15,255,12 DB 102,69,15,58,15,219,8 DB 102,69,15,58,15,255,4 DB 102,15,58,15,246,12 DB 102,69,15,58,15,210,8 DB 102,69,15,58,15,246,4 DB 102,15,58,15,237,12 DB 102,69,15,58,15,201,8 DB 102,69,15,58,15,237,4 DB 102,15,58,15,228,12 DB 102,69,15,58,15,192,8 DB 102,69,15,58,15,228,4 lea rdi,[16+rdi] dec r8 jge NEAR $L$seal_sse_main_rounds add r10,QWORD[((0+0))+rdi] adc r11,QWORD[((8+0))+rdi] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 lea rdi,[16+rdi] dec rcx jg NEAR $L$seal_sse_main_rounds paddd xmm3,XMMWORD[$L$chacha20_consts] paddd xmm7,XMMWORD[((160+48))+rbp] paddd xmm11,XMMWORD[((160+64))+rbp] paddd xmm15,XMMWORD[((160+144))+rbp] paddd xmm2,XMMWORD[$L$chacha20_consts] paddd xmm6,XMMWORD[((160+48))+rbp] paddd xmm10,XMMWORD[((160+64))+rbp] paddd xmm14,XMMWORD[((160+128))+rbp] paddd xmm1,XMMWORD[$L$chacha20_consts] paddd xmm5,XMMWORD[((160+48))+rbp] paddd xmm9,XMMWORD[((160+64))+rbp] paddd xmm13,XMMWORD[((160+112))+rbp] paddd xmm0,XMMWORD[$L$chacha20_consts] paddd xmm4,XMMWORD[((160+48))+rbp] paddd xmm8,XMMWORD[((160+64))+rbp] paddd xmm12,XMMWORD[((160+96))+rbp] movdqa XMMWORD[(160+80)+rbp],xmm14 movdqa XMMWORD[(160+80)+rbp],xmm14 movdqu xmm14,XMMWORD[((0 + 0))+rsi] pxor xmm14,xmm3 movdqu XMMWORD[(0 + 0)+rdi],xmm14 movdqu xmm14,XMMWORD[((16 + 0))+rsi] pxor xmm14,xmm7 movdqu XMMWORD[(16 + 0)+rdi],xmm14 movdqu xmm14,XMMWORD[((32 + 0))+rsi] pxor xmm14,xmm11 movdqu XMMWORD[(32 + 0)+rdi],xmm14 movdqu xmm14,XMMWORD[((48 + 0))+rsi] pxor xmm14,xmm15 movdqu XMMWORD[(48 + 0)+rdi],xmm14 movdqa xmm14,XMMWORD[((160+80))+rbp] movdqu xmm3,XMMWORD[((0 
+ 64))+rsi] movdqu xmm7,XMMWORD[((16 + 64))+rsi] movdqu xmm11,XMMWORD[((32 + 64))+rsi] movdqu xmm15,XMMWORD[((48 + 64))+rsi] pxor xmm2,xmm3 pxor xmm6,xmm7 pxor xmm10,xmm11 pxor xmm15,xmm14 movdqu XMMWORD[(0 + 64)+rdi],xmm2 movdqu XMMWORD[(16 + 64)+rdi],xmm6 movdqu XMMWORD[(32 + 64)+rdi],xmm10 movdqu XMMWORD[(48 + 64)+rdi],xmm15 movdqu xmm3,XMMWORD[((0 + 128))+rsi] movdqu xmm7,XMMWORD[((16 + 128))+rsi] movdqu xmm11,XMMWORD[((32 + 128))+rsi] movdqu xmm15,XMMWORD[((48 + 128))+rsi] pxor xmm1,xmm3 pxor xmm5,xmm7 pxor xmm9,xmm11 pxor xmm15,xmm13 movdqu XMMWORD[(0 + 128)+rdi],xmm1 movdqu XMMWORD[(16 + 128)+rdi],xmm5 movdqu XMMWORD[(32 + 128)+rdi],xmm9 movdqu XMMWORD[(48 + 128)+rdi],xmm15 cmp rbx,16*16 ja NEAR $L$seal_sse_main_loop_xor mov rcx,12*16 sub rbx,12*16 lea rsi,[192+rsi] jmp NEAR $L$seal_sse_128_tail_hash $L$seal_sse_main_loop_xor: movdqu xmm3,XMMWORD[((0 + 192))+rsi] movdqu xmm7,XMMWORD[((16 + 192))+rsi] movdqu xmm11,XMMWORD[((32 + 192))+rsi] movdqu xmm15,XMMWORD[((48 + 192))+rsi] pxor xmm0,xmm3 pxor xmm4,xmm7 pxor xmm8,xmm11 pxor xmm15,xmm12 movdqu XMMWORD[(0 + 192)+rdi],xmm0 movdqu XMMWORD[(16 + 192)+rdi],xmm4 movdqu XMMWORD[(32 + 192)+rdi],xmm8 movdqu XMMWORD[(48 + 192)+rdi],xmm15 lea rsi,[256+rsi] sub rbx,16*16 mov rcx,6 mov r8,4 cmp rbx,12*16 jg NEAR $L$seal_sse_main_loop mov rcx,rbx test rbx,rbx je NEAR $L$seal_sse_128_tail_hash mov rcx,6 cmp rbx,8*16 ja NEAR $L$seal_sse_tail_192 cmp rbx,4*16 ja NEAR $L$seal_sse_tail_128 $L$seal_sse_tail_64: movdqa xmm0,XMMWORD[$L$chacha20_consts] movdqa xmm4,XMMWORD[((160+48))+rbp] movdqa xmm8,XMMWORD[((160+64))+rbp] movdqa xmm12,XMMWORD[((160+96))+rbp] paddd xmm12,XMMWORD[$L$sse_inc] movdqa XMMWORD[(160+96)+rbp],xmm12 $L$seal_sse_tail_64_rounds_and_x2hash: add r10,QWORD[((0+0))+rdi] adc r11,QWORD[((8+0))+rdi] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 lea rdi,[16+rdi] $L$seal_sse_tail_64_rounds_and_x1hash: paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol16] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,12 psrld xmm4,20 pxor xmm4,xmm3 paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol8] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,7 psrld xmm4,25 pxor xmm4,xmm3 DB 102,15,58,15,228,4 DB 102,69,15,58,15,192,8 DB 102,69,15,58,15,228,12 paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol16] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,12 psrld xmm4,20 pxor xmm4,xmm3 paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol8] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,7 psrld xmm4,25 pxor xmm4,xmm3 DB 102,15,58,15,228,12 DB 102,69,15,58,15,192,8 DB 102,69,15,58,15,228,4 add r10,QWORD[((0+0))+rdi] adc r11,QWORD[((8+0))+rdi] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 
and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 lea rdi,[16+rdi] dec rcx jg NEAR $L$seal_sse_tail_64_rounds_and_x2hash dec r8 jge NEAR $L$seal_sse_tail_64_rounds_and_x1hash paddd xmm0,XMMWORD[$L$chacha20_consts] paddd xmm4,XMMWORD[((160+48))+rbp] paddd xmm8,XMMWORD[((160+64))+rbp] paddd xmm12,XMMWORD[((160+96))+rbp] jmp NEAR $L$seal_sse_128_tail_xor $L$seal_sse_tail_128: movdqa xmm0,XMMWORD[$L$chacha20_consts] movdqa xmm4,XMMWORD[((160+48))+rbp] movdqa xmm8,XMMWORD[((160+64))+rbp] movdqa xmm1,xmm0 movdqa xmm5,xmm4 movdqa xmm9,xmm8 movdqa xmm13,XMMWORD[((160+96))+rbp] paddd xmm13,XMMWORD[$L$sse_inc] movdqa xmm12,xmm13 paddd xmm12,XMMWORD[$L$sse_inc] movdqa XMMWORD[(160+96)+rbp],xmm12 movdqa XMMWORD[(160+112)+rbp],xmm13 $L$seal_sse_tail_128_rounds_and_x2hash: add r10,QWORD[((0+0))+rdi] adc r11,QWORD[((8+0))+rdi] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 lea rdi,[16+rdi] $L$seal_sse_tail_128_rounds_and_x1hash: paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol16] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,12 psrld xmm4,20 pxor xmm4,xmm3 paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol8] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,7 psrld xmm4,25 pxor xmm4,xmm3 DB 102,15,58,15,228,4 DB 102,69,15,58,15,192,8 DB 102,69,15,58,15,228,12 paddd xmm1,xmm5 pxor xmm13,xmm1 pshufb xmm13,XMMWORD[$L$rol16] paddd xmm9,xmm13 pxor xmm5,xmm9 movdqa xmm3,xmm5 pslld xmm3,12 psrld xmm5,20 pxor xmm5,xmm3 paddd xmm1,xmm5 pxor xmm13,xmm1 pshufb xmm13,XMMWORD[$L$rol8] paddd xmm9,xmm13 pxor xmm5,xmm9 movdqa xmm3,xmm5 pslld xmm3,7 psrld xmm5,25 pxor xmm5,xmm3 DB 102,15,58,15,237,4 DB 102,69,15,58,15,201,8 DB 102,69,15,58,15,237,12 add r10,QWORD[((0+0))+rdi] adc r11,QWORD[((8+0))+rdi] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol16] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,12 psrld xmm4,20 pxor xmm4,xmm3 paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol8] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,7 psrld xmm4,25 pxor xmm4,xmm3 DB 102,15,58,15,228,12 DB 102,69,15,58,15,192,8 DB 102,69,15,58,15,228,4 paddd xmm1,xmm5 pxor xmm13,xmm1 pshufb xmm13,XMMWORD[$L$rol16] paddd xmm9,xmm13 pxor xmm5,xmm9 movdqa xmm3,xmm5 pslld xmm3,12 psrld xmm5,20 pxor xmm5,xmm3 paddd xmm1,xmm5 pxor xmm13,xmm1 pshufb xmm13,XMMWORD[$L$rol8] paddd xmm9,xmm13 pxor xmm5,xmm9 movdqa xmm3,xmm5 pslld xmm3,7 psrld xmm5,25 pxor xmm5,xmm3 DB 102,15,58,15,237,12 DB 102,69,15,58,15,201,8 DB 102,69,15,58,15,237,4 lea rdi,[16+rdi] dec rcx jg NEAR 
$L$seal_sse_tail_128_rounds_and_x2hash dec r8 jge NEAR $L$seal_sse_tail_128_rounds_and_x1hash paddd xmm1,XMMWORD[$L$chacha20_consts] paddd xmm5,XMMWORD[((160+48))+rbp] paddd xmm9,XMMWORD[((160+64))+rbp] paddd xmm13,XMMWORD[((160+112))+rbp] paddd xmm0,XMMWORD[$L$chacha20_consts] paddd xmm4,XMMWORD[((160+48))+rbp] paddd xmm8,XMMWORD[((160+64))+rbp] paddd xmm12,XMMWORD[((160+96))+rbp] movdqu xmm3,XMMWORD[((0 + 0))+rsi] movdqu xmm7,XMMWORD[((16 + 0))+rsi] movdqu xmm11,XMMWORD[((32 + 0))+rsi] movdqu xmm15,XMMWORD[((48 + 0))+rsi] pxor xmm1,xmm3 pxor xmm5,xmm7 pxor xmm9,xmm11 pxor xmm15,xmm13 movdqu XMMWORD[(0 + 0)+rdi],xmm1 movdqu XMMWORD[(16 + 0)+rdi],xmm5 movdqu XMMWORD[(32 + 0)+rdi],xmm9 movdqu XMMWORD[(48 + 0)+rdi],xmm15 mov rcx,4*16 sub rbx,4*16 lea rsi,[64+rsi] jmp NEAR $L$seal_sse_128_tail_hash $L$seal_sse_tail_192: movdqa xmm0,XMMWORD[$L$chacha20_consts] movdqa xmm4,XMMWORD[((160+48))+rbp] movdqa xmm8,XMMWORD[((160+64))+rbp] movdqa xmm1,xmm0 movdqa xmm5,xmm4 movdqa xmm9,xmm8 movdqa xmm2,xmm0 movdqa xmm6,xmm4 movdqa xmm10,xmm8 movdqa xmm14,XMMWORD[((160+96))+rbp] paddd xmm14,XMMWORD[$L$sse_inc] movdqa xmm13,xmm14 paddd xmm13,XMMWORD[$L$sse_inc] movdqa xmm12,xmm13 paddd xmm12,XMMWORD[$L$sse_inc] movdqa XMMWORD[(160+96)+rbp],xmm12 movdqa XMMWORD[(160+112)+rbp],xmm13 movdqa XMMWORD[(160+128)+rbp],xmm14 $L$seal_sse_tail_192_rounds_and_x2hash: add r10,QWORD[((0+0))+rdi] adc r11,QWORD[((8+0))+rdi] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 lea rdi,[16+rdi] $L$seal_sse_tail_192_rounds_and_x1hash: paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol16] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,12 psrld xmm4,20 pxor xmm4,xmm3 paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol8] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,7 psrld xmm4,25 pxor xmm4,xmm3 DB 102,15,58,15,228,4 DB 102,69,15,58,15,192,8 DB 102,69,15,58,15,228,12 paddd xmm1,xmm5 pxor xmm13,xmm1 pshufb xmm13,XMMWORD[$L$rol16] paddd xmm9,xmm13 pxor xmm5,xmm9 movdqa xmm3,xmm5 pslld xmm3,12 psrld xmm5,20 pxor xmm5,xmm3 paddd xmm1,xmm5 pxor xmm13,xmm1 pshufb xmm13,XMMWORD[$L$rol8] paddd xmm9,xmm13 pxor xmm5,xmm9 movdqa xmm3,xmm5 pslld xmm3,7 psrld xmm5,25 pxor xmm5,xmm3 DB 102,15,58,15,237,4 DB 102,69,15,58,15,201,8 DB 102,69,15,58,15,237,12 paddd xmm2,xmm6 pxor xmm14,xmm2 pshufb xmm14,XMMWORD[$L$rol16] paddd xmm10,xmm14 pxor xmm6,xmm10 movdqa xmm3,xmm6 pslld xmm3,12 psrld xmm6,20 pxor xmm6,xmm3 paddd xmm2,xmm6 pxor xmm14,xmm2 pshufb xmm14,XMMWORD[$L$rol8] paddd xmm10,xmm14 pxor xmm6,xmm10 movdqa xmm3,xmm6 pslld xmm3,7 psrld xmm6,25 pxor xmm6,xmm3 DB 102,15,58,15,246,4 DB 102,69,15,58,15,210,8 DB 102,69,15,58,15,246,12 add r10,QWORD[((0+0))+rdi] adc r11,QWORD[((8+0))+rdi] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and 
r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol16] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,12 psrld xmm4,20 pxor xmm4,xmm3 paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol8] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,7 psrld xmm4,25 pxor xmm4,xmm3 DB 102,15,58,15,228,12 DB 102,69,15,58,15,192,8 DB 102,69,15,58,15,228,4 paddd xmm1,xmm5 pxor xmm13,xmm1 pshufb xmm13,XMMWORD[$L$rol16] paddd xmm9,xmm13 pxor xmm5,xmm9 movdqa xmm3,xmm5 pslld xmm3,12 psrld xmm5,20 pxor xmm5,xmm3 paddd xmm1,xmm5 pxor xmm13,xmm1 pshufb xmm13,XMMWORD[$L$rol8] paddd xmm9,xmm13 pxor xmm5,xmm9 movdqa xmm3,xmm5 pslld xmm3,7 psrld xmm5,25 pxor xmm5,xmm3 DB 102,15,58,15,237,12 DB 102,69,15,58,15,201,8 DB 102,69,15,58,15,237,4 paddd xmm2,xmm6 pxor xmm14,xmm2 pshufb xmm14,XMMWORD[$L$rol16] paddd xmm10,xmm14 pxor xmm6,xmm10 movdqa xmm3,xmm6 pslld xmm3,12 psrld xmm6,20 pxor xmm6,xmm3 paddd xmm2,xmm6 pxor xmm14,xmm2 pshufb xmm14,XMMWORD[$L$rol8] paddd xmm10,xmm14 pxor xmm6,xmm10 movdqa xmm3,xmm6 pslld xmm3,7 psrld xmm6,25 pxor xmm6,xmm3 DB 102,15,58,15,246,12 DB 102,69,15,58,15,210,8 DB 102,69,15,58,15,246,4 lea rdi,[16+rdi] dec rcx jg NEAR $L$seal_sse_tail_192_rounds_and_x2hash dec r8 jge NEAR $L$seal_sse_tail_192_rounds_and_x1hash paddd xmm2,XMMWORD[$L$chacha20_consts] paddd xmm6,XMMWORD[((160+48))+rbp] paddd xmm10,XMMWORD[((160+64))+rbp] paddd xmm14,XMMWORD[((160+128))+rbp] paddd xmm1,XMMWORD[$L$chacha20_consts] paddd xmm5,XMMWORD[((160+48))+rbp] paddd xmm9,XMMWORD[((160+64))+rbp] paddd xmm13,XMMWORD[((160+112))+rbp] paddd xmm0,XMMWORD[$L$chacha20_consts] paddd xmm4,XMMWORD[((160+48))+rbp] paddd xmm8,XMMWORD[((160+64))+rbp] paddd xmm12,XMMWORD[((160+96))+rbp] movdqu xmm3,XMMWORD[((0 + 0))+rsi] movdqu xmm7,XMMWORD[((16 + 0))+rsi] movdqu xmm11,XMMWORD[((32 + 0))+rsi] movdqu xmm15,XMMWORD[((48 + 0))+rsi] pxor xmm2,xmm3 pxor xmm6,xmm7 pxor xmm10,xmm11 pxor xmm15,xmm14 movdqu XMMWORD[(0 + 0)+rdi],xmm2 movdqu XMMWORD[(16 + 0)+rdi],xmm6 movdqu XMMWORD[(32 + 0)+rdi],xmm10 movdqu XMMWORD[(48 + 0)+rdi],xmm15 movdqu xmm3,XMMWORD[((0 + 64))+rsi] movdqu xmm7,XMMWORD[((16 + 64))+rsi] movdqu xmm11,XMMWORD[((32 + 64))+rsi] movdqu xmm15,XMMWORD[((48 + 64))+rsi] pxor xmm1,xmm3 pxor xmm5,xmm7 pxor xmm9,xmm11 pxor xmm15,xmm13 movdqu XMMWORD[(0 + 64)+rdi],xmm1 movdqu XMMWORD[(16 + 64)+rdi],xmm5 movdqu XMMWORD[(32 + 64)+rdi],xmm9 movdqu XMMWORD[(48 + 64)+rdi],xmm15 mov rcx,8*16 sub rbx,8*16 lea rsi,[128+rsi] $L$seal_sse_128_tail_hash: cmp rcx,16 jb NEAR $L$seal_sse_128_tail_xor add r10,QWORD[((0+0))+rdi] adc r11,QWORD[((8+0))+rdi] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 sub rcx,16 lea rdi,[16+rdi] jmp NEAR $L$seal_sse_128_tail_hash $L$seal_sse_128_tail_xor: cmp rbx,16 jb NEAR $L$seal_sse_tail_16 sub rbx,16 movdqu xmm3,XMMWORD[rsi] pxor xmm0,xmm3 movdqu XMMWORD[rdi],xmm0 add r10,QWORD[rdi] adc r11,QWORD[8+rdi] adc r12,1 lea rsi,[16+rsi] lea rdi,[16+rdi] mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov 
r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 movdqa xmm0,xmm4 movdqa xmm4,xmm8 movdqa xmm8,xmm12 movdqa xmm12,xmm1 movdqa xmm1,xmm5 movdqa xmm5,xmm9 movdqa xmm9,xmm13 jmp NEAR $L$seal_sse_128_tail_xor $L$seal_sse_tail_16: test rbx,rbx jz NEAR $L$process_blocks_of_extra_in mov r8,rbx mov rcx,rbx lea rsi,[((-1))+rbx*1+rsi] pxor xmm15,xmm15 $L$seal_sse_tail_16_compose: pslldq xmm15,1 pinsrb xmm15,BYTE[rsi],0 lea rsi,[((-1))+rsi] dec rcx jne NEAR $L$seal_sse_tail_16_compose pxor xmm15,xmm0 mov rcx,rbx movdqu xmm0,xmm15 $L$seal_sse_tail_16_extract: pextrb XMMWORD[rdi],xmm0,0 psrldq xmm0,1 add rdi,1 sub rcx,1 jnz NEAR $L$seal_sse_tail_16_extract mov r9,QWORD[((288 + 160 + 32))+rsp] mov r14,QWORD[56+r9] mov r13,QWORD[48+r9] test r14,r14 jz NEAR $L$process_partial_block mov r15,16 sub r15,rbx cmp r14,r15 jge NEAR $L$load_extra_in mov r15,r14 $L$load_extra_in: lea rsi,[((-1))+r15*1+r13] add r13,r15 sub r14,r15 mov QWORD[48+r9],r13 mov QWORD[56+r9],r14 add r8,r15 pxor xmm11,xmm11 $L$load_extra_load_loop: pslldq xmm11,1 pinsrb xmm11,BYTE[rsi],0 lea rsi,[((-1))+rsi] sub r15,1 jnz NEAR $L$load_extra_load_loop mov r15,rbx $L$load_extra_shift_loop: pslldq xmm11,1 sub r15,1 jnz NEAR $L$load_extra_shift_loop lea r15,[$L$and_masks] shl rbx,4 pand xmm15,XMMWORD[((-16))+rbx*1+r15] por xmm15,xmm11 DB 102,77,15,126,253 pextrq r14,xmm15,1 add r10,r13 adc r11,r14 adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 $L$process_blocks_of_extra_in: mov r9,QWORD[((288+32+160 ))+rsp] mov rsi,QWORD[48+r9] mov r8,QWORD[56+r9] mov rcx,r8 shr r8,4 $L$process_extra_hash_loop: jz NEAR process_extra_in_trailer add r10,QWORD[((0+0))+rsi] adc r11,QWORD[((8+0))+rsi] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 lea rsi,[16+rsi] sub r8,1 jmp NEAR $L$process_extra_hash_loop process_extra_in_trailer: and rcx,15 mov rbx,rcx jz NEAR $L$do_length_block lea rsi,[((-1))+rcx*1+rsi] $L$process_extra_in_trailer_load: pslldq xmm15,1 pinsrb xmm15,BYTE[rsi],0 lea rsi,[((-1))+rsi] sub rcx,1 jnz NEAR $L$process_extra_in_trailer_load $L$process_partial_block: lea r15,[$L$and_masks] shl rbx,4 pand xmm15,XMMWORD[((-16))+rbx*1+r15] DB 102,77,15,126,253 pextrq r14,xmm15,1 add r10,r13 adc r11,r14 adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov 
rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 $L$do_length_block: add r10,QWORD[((0+160+32))+rbp] adc r11,QWORD[((8+160+32))+rbp] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 mov r13,r10 mov r14,r11 mov r15,r12 sub r10,-5 sbb r11,-1 sbb r12,3 cmovc r10,r13 cmovc r11,r14 cmovc r12,r15 add r10,QWORD[((0+160+16))+rbp] adc r11,QWORD[((8+160+16))+rbp] movaps xmm6,XMMWORD[((0+0))+rbp] movaps xmm7,XMMWORD[((16+0))+rbp] movaps xmm8,XMMWORD[((32+0))+rbp] movaps xmm9,XMMWORD[((48+0))+rbp] movaps xmm10,XMMWORD[((64+0))+rbp] movaps xmm11,XMMWORD[((80+0))+rbp] movaps xmm12,XMMWORD[((96+0))+rbp] movaps xmm13,XMMWORD[((112+0))+rbp] movaps xmm14,XMMWORD[((128+0))+rbp] movaps xmm15,XMMWORD[((144+0))+rbp] add rsp,288 + 160 + 32 pop r9 mov QWORD[r9],r10 mov QWORD[8+r9],r11 pop r15 pop r14 pop r13 pop r12 pop rbx pop rbp mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$seal_sse_128: movdqu xmm0,XMMWORD[$L$chacha20_consts] movdqa xmm1,xmm0 movdqa xmm2,xmm0 movdqu xmm4,XMMWORD[r9] movdqa xmm5,xmm4 movdqa xmm6,xmm4 movdqu xmm8,XMMWORD[16+r9] movdqa xmm9,xmm8 movdqa xmm10,xmm8 movdqu xmm14,XMMWORD[32+r9] movdqa xmm12,xmm14 paddd xmm12,XMMWORD[$L$sse_inc] movdqa xmm13,xmm12 paddd xmm13,XMMWORD[$L$sse_inc] movdqa xmm7,xmm4 movdqa xmm11,xmm8 movdqa xmm15,xmm12 mov r10,10 $L$seal_sse_128_rounds: paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol16] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,12 psrld xmm4,20 pxor xmm4,xmm3 paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol8] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,7 psrld xmm4,25 pxor xmm4,xmm3 DB 102,15,58,15,228,4 DB 102,69,15,58,15,192,8 DB 102,69,15,58,15,228,12 paddd xmm1,xmm5 pxor xmm13,xmm1 pshufb xmm13,XMMWORD[$L$rol16] paddd xmm9,xmm13 pxor xmm5,xmm9 movdqa xmm3,xmm5 pslld xmm3,12 psrld xmm5,20 pxor xmm5,xmm3 paddd xmm1,xmm5 pxor xmm13,xmm1 pshufb xmm13,XMMWORD[$L$rol8] paddd xmm9,xmm13 pxor xmm5,xmm9 movdqa xmm3,xmm5 pslld xmm3,7 psrld xmm5,25 pxor xmm5,xmm3 DB 102,15,58,15,237,4 DB 102,69,15,58,15,201,8 DB 102,69,15,58,15,237,12 paddd xmm2,xmm6 pxor xmm14,xmm2 pshufb xmm14,XMMWORD[$L$rol16] paddd xmm10,xmm14 pxor xmm6,xmm10 movdqa xmm3,xmm6 pslld xmm3,12 psrld xmm6,20 pxor xmm6,xmm3 paddd xmm2,xmm6 pxor xmm14,xmm2 pshufb xmm14,XMMWORD[$L$rol8] paddd xmm10,xmm14 pxor xmm6,xmm10 movdqa xmm3,xmm6 pslld xmm3,7 psrld xmm6,25 pxor xmm6,xmm3 DB 102,15,58,15,246,4 DB 102,69,15,58,15,210,8 DB 102,69,15,58,15,246,12 paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol16] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,12 psrld xmm4,20 pxor xmm4,xmm3 paddd xmm0,xmm4 pxor xmm12,xmm0 pshufb xmm12,XMMWORD[$L$rol8] paddd xmm8,xmm12 pxor xmm4,xmm8 movdqa xmm3,xmm4 pslld xmm3,7 
psrld xmm4,25 pxor xmm4,xmm3 DB 102,15,58,15,228,12 DB 102,69,15,58,15,192,8 DB 102,69,15,58,15,228,4 paddd xmm1,xmm5 pxor xmm13,xmm1 pshufb xmm13,XMMWORD[$L$rol16] paddd xmm9,xmm13 pxor xmm5,xmm9 movdqa xmm3,xmm5 pslld xmm3,12 psrld xmm5,20 pxor xmm5,xmm3 paddd xmm1,xmm5 pxor xmm13,xmm1 pshufb xmm13,XMMWORD[$L$rol8] paddd xmm9,xmm13 pxor xmm5,xmm9 movdqa xmm3,xmm5 pslld xmm3,7 psrld xmm5,25 pxor xmm5,xmm3 DB 102,15,58,15,237,12 DB 102,69,15,58,15,201,8 DB 102,69,15,58,15,237,4 paddd xmm2,xmm6 pxor xmm14,xmm2 pshufb xmm14,XMMWORD[$L$rol16] paddd xmm10,xmm14 pxor xmm6,xmm10 movdqa xmm3,xmm6 pslld xmm3,12 psrld xmm6,20 pxor xmm6,xmm3 paddd xmm2,xmm6 pxor xmm14,xmm2 pshufb xmm14,XMMWORD[$L$rol8] paddd xmm10,xmm14 pxor xmm6,xmm10 movdqa xmm3,xmm6 pslld xmm3,7 psrld xmm6,25 pxor xmm6,xmm3 DB 102,15,58,15,246,12 DB 102,69,15,58,15,210,8 DB 102,69,15,58,15,246,4 dec r10 jnz NEAR $L$seal_sse_128_rounds paddd xmm0,XMMWORD[$L$chacha20_consts] paddd xmm1,XMMWORD[$L$chacha20_consts] paddd xmm2,XMMWORD[$L$chacha20_consts] paddd xmm4,xmm7 paddd xmm5,xmm7 paddd xmm6,xmm7 paddd xmm8,xmm11 paddd xmm9,xmm11 paddd xmm12,xmm15 paddd xmm15,XMMWORD[$L$sse_inc] paddd xmm13,xmm15 pand xmm2,XMMWORD[$L$clamp] movdqa XMMWORD[(160+0)+rbp],xmm2 movdqa XMMWORD[(160+16)+rbp],xmm6 mov r8,r8 call poly_hash_ad_internal jmp NEAR $L$seal_sse_128_tail_xor $L$SEH_end_chacha20_poly1305_seal_sse41: global chacha20_poly1305_open_avx2 ALIGN 64 chacha20_poly1305_open_avx2: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_chacha20_poly1305_open_avx2: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD[40+rsp] mov r9,QWORD[48+rsp] _CET_ENDBR push rbp push rbx push r12 push r13 push r14 push r15 push r9 sub rsp,288 + 160 + 32 lea rbp,[32+rsp] and rbp,-32 movaps XMMWORD[(0+0)+rbp],xmm6 movaps XMMWORD[(16+0)+rbp],xmm7 movaps XMMWORD[(32+0)+rbp],xmm8 movaps XMMWORD[(48+0)+rbp],xmm9 movaps XMMWORD[(64+0)+rbp],xmm10 movaps XMMWORD[(80+0)+rbp],xmm11 movaps XMMWORD[(96+0)+rbp],xmm12 movaps XMMWORD[(112+0)+rbp],xmm13 movaps XMMWORD[(128+0)+rbp],xmm14 movaps XMMWORD[(144+0)+rbp],xmm15 mov rbx,rdx mov QWORD[((0+160+32))+rbp],r8 mov QWORD[((8+160+32))+rbp],rbx vzeroupper vmovdqa ymm0,YMMWORD[$L$chacha20_consts] vbroadcasti128 ymm4,XMMWORD[r9] vbroadcasti128 ymm8,XMMWORD[16+r9] vbroadcasti128 ymm12,XMMWORD[32+r9] vpaddd ymm12,ymm12,YMMWORD[$L$avx2_init] cmp rbx,6*32 jbe NEAR $L$open_avx2_192 cmp rbx,10*32 jbe NEAR $L$open_avx2_320 vmovdqa YMMWORD[(160+64)+rbp],ymm4 vmovdqa YMMWORD[(160+96)+rbp],ymm8 vmovdqa YMMWORD[(160+160)+rbp],ymm12 mov r10,10 $L$open_avx2_init_rounds: vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol16] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpsrld ymm3,ymm4,20 vpslld ymm4,ymm4,12 vpxor ymm4,ymm4,ymm3 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol8] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpslld ymm3,ymm4,7 vpsrld ymm4,ymm4,25 vpxor ymm4,ymm4,ymm3 vpalignr ymm12,ymm12,ymm12,12 vpalignr ymm8,ymm8,ymm8,8 vpalignr ymm4,ymm4,ymm4,4 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol16] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpsrld ymm3,ymm4,20 vpslld ymm4,ymm4,12 vpxor ymm4,ymm4,ymm3 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol8] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpslld ymm3,ymm4,7 vpsrld ymm4,ymm4,25 vpxor ymm4,ymm4,ymm3 vpalignr ymm12,ymm12,ymm12,4 vpalignr ymm8,ymm8,ymm8,8 vpalignr ymm4,ymm4,ymm4,12 dec r10 jne NEAR 
$L$open_avx2_init_rounds vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] vperm2i128 ymm3,ymm4,ymm0,0x02 vpand ymm3,ymm3,YMMWORD[$L$clamp] vmovdqa YMMWORD[(160+0)+rbp],ymm3 vperm2i128 ymm0,ymm4,ymm0,0x13 vperm2i128 ymm4,ymm12,ymm8,0x13 mov r8,r8 call poly_hash_ad_internal xor rcx,rcx $L$open_avx2_init_hash: add r10,QWORD[((0+0))+rcx*1+rsi] adc r11,QWORD[((8+0))+rcx*1+rsi] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 add rcx,16 cmp rcx,2*32 jne NEAR $L$open_avx2_init_hash vpxor ymm0,ymm0,YMMWORD[rsi] vpxor ymm4,ymm4,YMMWORD[32+rsi] vmovdqu YMMWORD[rdi],ymm0 vmovdqu YMMWORD[32+rdi],ymm4 lea rsi,[64+rsi] lea rdi,[64+rdi] sub rbx,2*32 $L$open_avx2_main_loop: cmp rbx,16*32 jb NEAR $L$open_avx2_main_loop_done vmovdqa ymm0,YMMWORD[$L$chacha20_consts] vmovdqa ymm4,YMMWORD[((160+64))+rbp] vmovdqa ymm8,YMMWORD[((160+96))+rbp] vmovdqa ymm1,ymm0 vmovdqa ymm5,ymm4 vmovdqa ymm9,ymm8 vmovdqa ymm2,ymm0 vmovdqa ymm6,ymm4 vmovdqa ymm10,ymm8 vmovdqa ymm3,ymm0 vmovdqa ymm7,ymm4 vmovdqa ymm11,ymm8 vmovdqa ymm12,YMMWORD[$L$avx2_inc] vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp] vpaddd ymm14,ymm12,ymm15 vpaddd ymm13,ymm12,ymm14 vpaddd ymm12,ymm12,ymm13 vmovdqa YMMWORD[(160+256)+rbp],ymm15 vmovdqa YMMWORD[(160+224)+rbp],ymm14 vmovdqa YMMWORD[(160+192)+rbp],ymm13 vmovdqa YMMWORD[(160+160)+rbp],ymm12 xor rcx,rcx $L$open_avx2_main_loop_rounds: add r10,QWORD[((0+0))+rcx*1+rsi] adc r11,QWORD[((8+0))+rcx*1+rsi] adc r12,1 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vmovdqa ymm8,YMMWORD[$L$rol16] vpaddd ymm3,ymm3,ymm7 vpaddd ymm2,ymm2,ymm6 vpaddd ymm1,ymm1,ymm5 vpaddd ymm0,ymm0,ymm4 vpxor ymm15,ymm15,ymm3 vpxor ymm14,ymm14,ymm2 vpxor ymm13,ymm13,ymm1 vpxor ymm12,ymm12,ymm0 mov rdx,QWORD[((0+160+0))+rbp] mov r15,rdx mulx r14,r13,r10 mulx rdx,rax,r11 imul r15,r12 add r14,rax adc r15,rdx vpshufb ymm15,ymm15,ymm8 vpshufb ymm14,ymm14,ymm8 vpshufb ymm13,ymm13,ymm8 vpshufb ymm12,ymm12,ymm8 vpaddd ymm11,ymm11,ymm15 vpaddd ymm10,ymm10,ymm14 vpaddd ymm9,ymm9,ymm13 vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] vpxor ymm7,ymm7,ymm11 mov rdx,QWORD[((8+160+0))+rbp] mulx rax,r10,r10 add r14,r10 mulx r9,r11,r11 adc r15,r11 adc r9,0 imul rdx,r12 vpxor ymm6,ymm6,ymm10 vpxor ymm5,ymm5,ymm9 vpxor ymm4,ymm4,ymm8 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vpsrld ymm8,ymm7,20 vpslld ymm7,ymm7,32-20 vpxor ymm7,ymm7,ymm8 vpsrld ymm8,ymm6,20 vpslld ymm6,ymm6,32-20 vpxor ymm6,ymm6,ymm8 vpsrld ymm8,ymm5,20 vpslld ymm5,ymm5,32-20 add r15,rax adc r9,rdx vpxor ymm5,ymm5,ymm8 vpsrld ymm8,ymm4,20 vpslld ymm4,ymm4,32-20 vpxor ymm4,ymm4,ymm8 vmovdqa ymm8,YMMWORD[$L$rol8] vpaddd ymm3,ymm3,ymm7 vpaddd ymm2,ymm2,ymm6 vpaddd ymm1,ymm1,ymm5 vpaddd ymm0,ymm0,ymm4 vpxor ymm15,ymm15,ymm3 mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 vpxor ymm14,ymm14,ymm2 vpxor ymm13,ymm13,ymm1 vpxor ymm12,ymm12,ymm0 vpshufb ymm15,ymm15,ymm8 vpshufb ymm14,ymm14,ymm8 vpshufb ymm13,ymm13,ymm8 vpshufb ymm12,ymm12,ymm8 vpaddd 
ymm11,ymm11,ymm15 vpaddd ymm10,ymm10,ymm14 add r10,QWORD[((0+16))+rcx*1+rsi] adc r11,QWORD[((8+16))+rcx*1+rsi] adc r12,1 vpaddd ymm9,ymm9,ymm13 vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] vpxor ymm7,ymm7,ymm11 vpxor ymm6,ymm6,ymm10 vpxor ymm5,ymm5,ymm9 vpxor ymm4,ymm4,ymm8 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vpsrld ymm8,ymm7,25 mov rdx,QWORD[((0+160+0))+rbp] mov r15,rdx mulx r14,r13,r10 mulx rdx,rax,r11 imul r15,r12 add r14,rax adc r15,rdx vpslld ymm7,ymm7,32-25 vpxor ymm7,ymm7,ymm8 vpsrld ymm8,ymm6,25 vpslld ymm6,ymm6,32-25 vpxor ymm6,ymm6,ymm8 vpsrld ymm8,ymm5,25 vpslld ymm5,ymm5,32-25 vpxor ymm5,ymm5,ymm8 vpsrld ymm8,ymm4,25 vpslld ymm4,ymm4,32-25 vpxor ymm4,ymm4,ymm8 vmovdqa ymm8,YMMWORD[((160+128))+rbp] vpalignr ymm7,ymm7,ymm7,4 vpalignr ymm11,ymm11,ymm11,8 vpalignr ymm15,ymm15,ymm15,12 vpalignr ymm6,ymm6,ymm6,4 vpalignr ymm10,ymm10,ymm10,8 vpalignr ymm14,ymm14,ymm14,12 mov rdx,QWORD[((8+160+0))+rbp] mulx rax,r10,r10 add r14,r10 mulx r9,r11,r11 adc r15,r11 adc r9,0 imul rdx,r12 vpalignr ymm5,ymm5,ymm5,4 vpalignr ymm9,ymm9,ymm9,8 vpalignr ymm13,ymm13,ymm13,12 vpalignr ymm4,ymm4,ymm4,4 vpalignr ymm8,ymm8,ymm8,8 vpalignr ymm12,ymm12,ymm12,12 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vmovdqa ymm8,YMMWORD[$L$rol16] vpaddd ymm3,ymm3,ymm7 vpaddd ymm2,ymm2,ymm6 vpaddd ymm1,ymm1,ymm5 vpaddd ymm0,ymm0,ymm4 vpxor ymm15,ymm15,ymm3 vpxor ymm14,ymm14,ymm2 vpxor ymm13,ymm13,ymm1 vpxor ymm12,ymm12,ymm0 vpshufb ymm15,ymm15,ymm8 vpshufb ymm14,ymm14,ymm8 add r15,rax adc r9,rdx vpshufb ymm13,ymm13,ymm8 vpshufb ymm12,ymm12,ymm8 vpaddd ymm11,ymm11,ymm15 vpaddd ymm10,ymm10,ymm14 vpaddd ymm9,ymm9,ymm13 vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] vpxor ymm7,ymm7,ymm11 vpxor ymm6,ymm6,ymm10 vpxor ymm5,ymm5,ymm9 mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 vpxor ymm4,ymm4,ymm8 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vpsrld ymm8,ymm7,20 vpslld ymm7,ymm7,32-20 vpxor ymm7,ymm7,ymm8 vpsrld ymm8,ymm6,20 vpslld ymm6,ymm6,32-20 vpxor ymm6,ymm6,ymm8 add r10,QWORD[((0+32))+rcx*1+rsi] adc r11,QWORD[((8+32))+rcx*1+rsi] adc r12,1 lea rcx,[48+rcx] vpsrld ymm8,ymm5,20 vpslld ymm5,ymm5,32-20 vpxor ymm5,ymm5,ymm8 vpsrld ymm8,ymm4,20 vpslld ymm4,ymm4,32-20 vpxor ymm4,ymm4,ymm8 vmovdqa ymm8,YMMWORD[$L$rol8] vpaddd ymm3,ymm3,ymm7 vpaddd ymm2,ymm2,ymm6 vpaddd ymm1,ymm1,ymm5 vpaddd ymm0,ymm0,ymm4 vpxor ymm15,ymm15,ymm3 vpxor ymm14,ymm14,ymm2 vpxor ymm13,ymm13,ymm1 vpxor ymm12,ymm12,ymm0 vpshufb ymm15,ymm15,ymm8 vpshufb ymm14,ymm14,ymm8 vpshufb ymm13,ymm13,ymm8 mov rdx,QWORD[((0+160+0))+rbp] mov r15,rdx mulx r14,r13,r10 mulx rdx,rax,r11 imul r15,r12 add r14,rax adc r15,rdx vpshufb ymm12,ymm12,ymm8 vpaddd ymm11,ymm11,ymm15 vpaddd ymm10,ymm10,ymm14 vpaddd ymm9,ymm9,ymm13 vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] vpxor ymm7,ymm7,ymm11 vpxor ymm6,ymm6,ymm10 vpxor ymm5,ymm5,ymm9 mov rdx,QWORD[((8+160+0))+rbp] mulx rax,r10,r10 add r14,r10 mulx r9,r11,r11 adc r15,r11 adc r9,0 imul rdx,r12 vpxor ymm4,ymm4,ymm8 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vpsrld ymm8,ymm7,25 vpslld ymm7,ymm7,32-25 vpxor ymm7,ymm7,ymm8 vpsrld ymm8,ymm6,25 vpslld ymm6,ymm6,32-25 vpxor ymm6,ymm6,ymm8 add r15,rax adc r9,rdx vpsrld ymm8,ymm5,25 vpslld ymm5,ymm5,32-25 vpxor ymm5,ymm5,ymm8 vpsrld ymm8,ymm4,25 vpslld ymm4,ymm4,32-25 vpxor ymm4,ymm4,ymm8 vmovdqa ymm8,YMMWORD[((160+128))+rbp] vpalignr ymm7,ymm7,ymm7,12 vpalignr ymm11,ymm11,ymm11,8 vpalignr ymm15,ymm15,ymm15,4 vpalignr ymm6,ymm6,ymm6,12 vpalignr ymm10,ymm10,ymm10,8 vpalignr ymm14,ymm14,ymm14,4 vpalignr 
ymm5,ymm5,ymm5,12 vpalignr ymm9,ymm9,ymm9,8 vpalignr ymm13,ymm13,ymm13,4 vpalignr ymm4,ymm4,ymm4,12 vpalignr ymm8,ymm8,ymm8,8 mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 vpalignr ymm12,ymm12,ymm12,4 cmp rcx,10*6*8 jne NEAR $L$open_avx2_main_loop_rounds vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts] vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp] vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp] vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp] vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp] vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] vmovdqa YMMWORD[(160+128)+rbp],ymm0 add r10,QWORD[((0+480))+rsi] adc r11,QWORD[((8+480))+rsi] adc r12,1 vperm2i128 ymm0,ymm7,ymm3,0x02 vperm2i128 ymm7,ymm7,ymm3,0x13 vperm2i128 ymm3,ymm15,ymm11,0x02 vperm2i128 ymm11,ymm15,ymm11,0x13 vpxor ymm0,ymm0,YMMWORD[((0+0))+rsi] vpxor ymm3,ymm3,YMMWORD[((32+0))+rsi] vpxor ymm7,ymm7,YMMWORD[((64+0))+rsi] vpxor ymm11,ymm11,YMMWORD[((96+0))+rsi] vmovdqu YMMWORD[(0+0)+rdi],ymm0 vmovdqu YMMWORD[(32+0)+rdi],ymm3 vmovdqu YMMWORD[(64+0)+rdi],ymm7 vmovdqu YMMWORD[(96+0)+rdi],ymm11 vmovdqa ymm0,YMMWORD[((160+128))+rbp] mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 vperm2i128 ymm3,ymm6,ymm2,0x02 vperm2i128 ymm6,ymm6,ymm2,0x13 vperm2i128 ymm2,ymm14,ymm10,0x02 vperm2i128 ymm10,ymm14,ymm10,0x13 vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi] vpxor ymm2,ymm2,YMMWORD[((32+128))+rsi] vpxor ymm6,ymm6,YMMWORD[((64+128))+rsi] vpxor ymm10,ymm10,YMMWORD[((96+128))+rsi] vmovdqu YMMWORD[(0+128)+rdi],ymm3 vmovdqu YMMWORD[(32+128)+rdi],ymm2 vmovdqu YMMWORD[(64+128)+rdi],ymm6 vmovdqu YMMWORD[(96+128)+rdi],ymm10 add r10,QWORD[((0+480+16))+rsi] adc r11,QWORD[((8+480+16))+rsi] adc r12,1 vperm2i128 ymm3,ymm5,ymm1,0x02 vperm2i128 ymm5,ymm5,ymm1,0x13 vperm2i128 ymm1,ymm13,ymm9,0x02 vperm2i128 ymm9,ymm13,ymm9,0x13 vpxor ymm3,ymm3,YMMWORD[((0+256))+rsi] vpxor ymm1,ymm1,YMMWORD[((32+256))+rsi] vpxor ymm5,ymm5,YMMWORD[((64+256))+rsi] vpxor ymm9,ymm9,YMMWORD[((96+256))+rsi] vmovdqu YMMWORD[(0+256)+rdi],ymm3 vmovdqu YMMWORD[(32+256)+rdi],ymm1 vmovdqu YMMWORD[(64+256)+rdi],ymm5 vmovdqu YMMWORD[(96+256)+rdi],ymm9 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 vperm2i128 ymm3,ymm4,ymm0,0x02 
vperm2i128 ymm4,ymm4,ymm0,0x13 vperm2i128 ymm0,ymm12,ymm8,0x02 vperm2i128 ymm8,ymm12,ymm8,0x13 vpxor ymm3,ymm3,YMMWORD[((0+384))+rsi] vpxor ymm0,ymm0,YMMWORD[((32+384))+rsi] vpxor ymm4,ymm4,YMMWORD[((64+384))+rsi] vpxor ymm8,ymm8,YMMWORD[((96+384))+rsi] vmovdqu YMMWORD[(0+384)+rdi],ymm3 vmovdqu YMMWORD[(32+384)+rdi],ymm0 vmovdqu YMMWORD[(64+384)+rdi],ymm4 vmovdqu YMMWORD[(96+384)+rdi],ymm8 lea rsi,[512+rsi] lea rdi,[512+rdi] sub rbx,16*32 jmp NEAR $L$open_avx2_main_loop $L$open_avx2_main_loop_done: test rbx,rbx vzeroupper je NEAR $L$open_sse_finalize cmp rbx,12*32 ja NEAR $L$open_avx2_tail_512 cmp rbx,8*32 ja NEAR $L$open_avx2_tail_384 cmp rbx,4*32 ja NEAR $L$open_avx2_tail_256 vmovdqa ymm0,YMMWORD[$L$chacha20_consts] vmovdqa ymm4,YMMWORD[((160+64))+rbp] vmovdqa ymm8,YMMWORD[((160+96))+rbp] vmovdqa ymm12,YMMWORD[$L$avx2_inc] vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] vmovdqa YMMWORD[(160+160)+rbp],ymm12 xor r8,r8 mov rcx,rbx and rcx,-16 test rcx,rcx je NEAR $L$open_avx2_tail_128_rounds $L$open_avx2_tail_128_rounds_and_x1hash: add r10,QWORD[((0+0))+r8*1+rsi] adc r11,QWORD[((8+0))+r8*1+rsi] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 $L$open_avx2_tail_128_rounds: add r8,16 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol16] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpsrld ymm3,ymm4,20 vpslld ymm4,ymm4,12 vpxor ymm4,ymm4,ymm3 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol8] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpslld ymm3,ymm4,7 vpsrld ymm4,ymm4,25 vpxor ymm4,ymm4,ymm3 vpalignr ymm12,ymm12,ymm12,12 vpalignr ymm8,ymm8,ymm8,8 vpalignr ymm4,ymm4,ymm4,4 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol16] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpsrld ymm3,ymm4,20 vpslld ymm4,ymm4,12 vpxor ymm4,ymm4,ymm3 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol8] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpslld ymm3,ymm4,7 vpsrld ymm4,ymm4,25 vpxor ymm4,ymm4,ymm3 vpalignr ymm12,ymm12,ymm12,4 vpalignr ymm8,ymm8,ymm8,8 vpalignr ymm4,ymm4,ymm4,12 cmp r8,rcx jb NEAR $L$open_avx2_tail_128_rounds_and_x1hash cmp r8,160 jne NEAR $L$open_avx2_tail_128_rounds vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] vperm2i128 ymm3,ymm4,ymm0,0x13 vperm2i128 ymm0,ymm4,ymm0,0x02 vperm2i128 ymm4,ymm12,ymm8,0x02 vperm2i128 ymm12,ymm12,ymm8,0x13 vmovdqa ymm8,ymm3 jmp NEAR $L$open_avx2_tail_128_xor $L$open_avx2_tail_256: vmovdqa ymm0,YMMWORD[$L$chacha20_consts] vmovdqa ymm4,YMMWORD[((160+64))+rbp] vmovdqa ymm8,YMMWORD[((160+96))+rbp] vmovdqa ymm1,ymm0 vmovdqa ymm5,ymm4 vmovdqa ymm9,ymm8 vmovdqa ymm12,YMMWORD[$L$avx2_inc] vpaddd ymm13,ymm12,YMMWORD[((160+160))+rbp] vpaddd ymm12,ymm12,ymm13 vmovdqa YMMWORD[(160+160)+rbp],ymm12 vmovdqa YMMWORD[(160+192)+rbp],ymm13 mov QWORD[((160+128))+rbp],rbx mov rcx,rbx sub rcx,4*32 shr rcx,4 mov r8,10 cmp rcx,10 cmovg rcx,r8 mov rbx,rsi xor r8,r8 $L$open_avx2_tail_256_rounds_and_x1hash: add 
r10,QWORD[((0+0))+rbx] adc r11,QWORD[((8+0))+rbx] adc r12,1 mov rdx,QWORD[((0+160+0))+rbp] mov r15,rdx mulx r14,r13,r10 mulx rdx,rax,r11 imul r15,r12 add r14,rax adc r15,rdx mov rdx,QWORD[((8+160+0))+rbp] mulx rax,r10,r10 add r14,r10 mulx r9,r11,r11 adc r15,r11 adc r9,0 imul rdx,r12 add r15,rax adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 lea rbx,[16+rbx] $L$open_avx2_tail_256_rounds: vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol16] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpsrld ymm3,ymm4,20 vpslld ymm4,ymm4,12 vpxor ymm4,ymm4,ymm3 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol8] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpslld ymm3,ymm4,7 vpsrld ymm4,ymm4,25 vpxor ymm4,ymm4,ymm3 vpalignr ymm12,ymm12,ymm12,12 vpalignr ymm8,ymm8,ymm8,8 vpalignr ymm4,ymm4,ymm4,4 vpaddd ymm1,ymm1,ymm5 vpxor ymm13,ymm13,ymm1 vpshufb ymm13,ymm13,YMMWORD[$L$rol16] vpaddd ymm9,ymm9,ymm13 vpxor ymm5,ymm5,ymm9 vpsrld ymm3,ymm5,20 vpslld ymm5,ymm5,12 vpxor ymm5,ymm5,ymm3 vpaddd ymm1,ymm1,ymm5 vpxor ymm13,ymm13,ymm1 vpshufb ymm13,ymm13,YMMWORD[$L$rol8] vpaddd ymm9,ymm9,ymm13 vpxor ymm5,ymm5,ymm9 vpslld ymm3,ymm5,7 vpsrld ymm5,ymm5,25 vpxor ymm5,ymm5,ymm3 vpalignr ymm13,ymm13,ymm13,12 vpalignr ymm9,ymm9,ymm9,8 vpalignr ymm5,ymm5,ymm5,4 inc r8 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol16] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpsrld ymm3,ymm4,20 vpslld ymm4,ymm4,12 vpxor ymm4,ymm4,ymm3 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol8] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpslld ymm3,ymm4,7 vpsrld ymm4,ymm4,25 vpxor ymm4,ymm4,ymm3 vpalignr ymm12,ymm12,ymm12,4 vpalignr ymm8,ymm8,ymm8,8 vpalignr ymm4,ymm4,ymm4,12 vpaddd ymm1,ymm1,ymm5 vpxor ymm13,ymm13,ymm1 vpshufb ymm13,ymm13,YMMWORD[$L$rol16] vpaddd ymm9,ymm9,ymm13 vpxor ymm5,ymm5,ymm9 vpsrld ymm3,ymm5,20 vpslld ymm5,ymm5,12 vpxor ymm5,ymm5,ymm3 vpaddd ymm1,ymm1,ymm5 vpxor ymm13,ymm13,ymm1 vpshufb ymm13,ymm13,YMMWORD[$L$rol8] vpaddd ymm9,ymm9,ymm13 vpxor ymm5,ymm5,ymm9 vpslld ymm3,ymm5,7 vpsrld ymm5,ymm5,25 vpxor ymm5,ymm5,ymm3 vpalignr ymm13,ymm13,ymm13,4 vpalignr ymm9,ymm9,ymm9,8 vpalignr ymm5,ymm5,ymm5,12 vpaddd ymm2,ymm2,ymm6 vpxor ymm14,ymm14,ymm2 vpshufb ymm14,ymm14,YMMWORD[$L$rol16] vpaddd ymm10,ymm10,ymm14 vpxor ymm6,ymm6,ymm10 vpsrld ymm3,ymm6,20 vpslld ymm6,ymm6,12 vpxor ymm6,ymm6,ymm3 vpaddd ymm2,ymm2,ymm6 vpxor ymm14,ymm14,ymm2 vpshufb ymm14,ymm14,YMMWORD[$L$rol8] vpaddd ymm10,ymm10,ymm14 vpxor ymm6,ymm6,ymm10 vpslld ymm3,ymm6,7 vpsrld ymm6,ymm6,25 vpxor ymm6,ymm6,ymm3 vpalignr ymm14,ymm14,ymm14,4 vpalignr ymm10,ymm10,ymm10,8 vpalignr ymm6,ymm6,ymm6,12 cmp r8,rcx jb NEAR $L$open_avx2_tail_256_rounds_and_x1hash cmp r8,10 jne NEAR $L$open_avx2_tail_256_rounds mov r8,rbx sub rbx,rsi mov rcx,rbx mov rbx,QWORD[((160+128))+rbp] $L$open_avx2_tail_256_hash: add rcx,16 cmp rcx,rbx jg NEAR $L$open_avx2_tail_256_done add r10,QWORD[((0+0))+r8] adc r11,QWORD[((8+0))+r8] adc r12,1 mov rdx,QWORD[((0+160+0))+rbp] mov r15,rdx mulx r14,r13,r10 mulx rdx,rax,r11 imul r15,r12 add r14,rax adc r15,rdx mov rdx,QWORD[((8+160+0))+rbp] mulx rax,r10,r10 add r14,r10 mulx r9,r11,r11 adc r15,r11 adc r9,0 imul rdx,r12 add r15,rax adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 lea r8,[16+r8] jmp 
NEAR $L$open_avx2_tail_256_hash $L$open_avx2_tail_256_done: vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] vperm2i128 ymm3,ymm5,ymm1,0x02 vperm2i128 ymm5,ymm5,ymm1,0x13 vperm2i128 ymm1,ymm13,ymm9,0x02 vperm2i128 ymm9,ymm13,ymm9,0x13 vpxor ymm3,ymm3,YMMWORD[((0+0))+rsi] vpxor ymm1,ymm1,YMMWORD[((32+0))+rsi] vpxor ymm5,ymm5,YMMWORD[((64+0))+rsi] vpxor ymm9,ymm9,YMMWORD[((96+0))+rsi] vmovdqu YMMWORD[(0+0)+rdi],ymm3 vmovdqu YMMWORD[(32+0)+rdi],ymm1 vmovdqu YMMWORD[(64+0)+rdi],ymm5 vmovdqu YMMWORD[(96+0)+rdi],ymm9 vperm2i128 ymm3,ymm4,ymm0,0x13 vperm2i128 ymm0,ymm4,ymm0,0x02 vperm2i128 ymm4,ymm12,ymm8,0x02 vperm2i128 ymm12,ymm12,ymm8,0x13 vmovdqa ymm8,ymm3 lea rsi,[128+rsi] lea rdi,[128+rdi] sub rbx,4*32 jmp NEAR $L$open_avx2_tail_128_xor $L$open_avx2_tail_384: vmovdqa ymm0,YMMWORD[$L$chacha20_consts] vmovdqa ymm4,YMMWORD[((160+64))+rbp] vmovdqa ymm8,YMMWORD[((160+96))+rbp] vmovdqa ymm1,ymm0 vmovdqa ymm5,ymm4 vmovdqa ymm9,ymm8 vmovdqa ymm2,ymm0 vmovdqa ymm6,ymm4 vmovdqa ymm10,ymm8 vmovdqa ymm12,YMMWORD[$L$avx2_inc] vpaddd ymm14,ymm12,YMMWORD[((160+160))+rbp] vpaddd ymm13,ymm12,ymm14 vpaddd ymm12,ymm12,ymm13 vmovdqa YMMWORD[(160+160)+rbp],ymm12 vmovdqa YMMWORD[(160+192)+rbp],ymm13 vmovdqa YMMWORD[(160+224)+rbp],ymm14 mov QWORD[((160+128))+rbp],rbx mov rcx,rbx sub rcx,8*32 shr rcx,4 add rcx,6 mov r8,10 cmp rcx,10 cmovg rcx,r8 mov rbx,rsi xor r8,r8 $L$open_avx2_tail_384_rounds_and_x2hash: add r10,QWORD[((0+0))+rbx] adc r11,QWORD[((8+0))+rbx] adc r12,1 mov rdx,QWORD[((0+160+0))+rbp] mov r15,rdx mulx r14,r13,r10 mulx rdx,rax,r11 imul r15,r12 add r14,rax adc r15,rdx mov rdx,QWORD[((8+160+0))+rbp] mulx rax,r10,r10 add r14,r10 mulx r9,r11,r11 adc r15,r11 adc r9,0 imul rdx,r12 add r15,rax adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 lea rbx,[16+rbx] $L$open_avx2_tail_384_rounds_and_x1hash: vpaddd ymm2,ymm2,ymm6 vpxor ymm14,ymm14,ymm2 vpshufb ymm14,ymm14,YMMWORD[$L$rol16] vpaddd ymm10,ymm10,ymm14 vpxor ymm6,ymm6,ymm10 vpsrld ymm3,ymm6,20 vpslld ymm6,ymm6,12 vpxor ymm6,ymm6,ymm3 vpaddd ymm2,ymm2,ymm6 vpxor ymm14,ymm14,ymm2 vpshufb ymm14,ymm14,YMMWORD[$L$rol8] vpaddd ymm10,ymm10,ymm14 vpxor ymm6,ymm6,ymm10 vpslld ymm3,ymm6,7 vpsrld ymm6,ymm6,25 vpxor ymm6,ymm6,ymm3 vpalignr ymm14,ymm14,ymm14,12 vpalignr ymm10,ymm10,ymm10,8 vpalignr ymm6,ymm6,ymm6,4 vpaddd ymm1,ymm1,ymm5 vpxor ymm13,ymm13,ymm1 vpshufb ymm13,ymm13,YMMWORD[$L$rol16] vpaddd ymm9,ymm9,ymm13 vpxor ymm5,ymm5,ymm9 vpsrld ymm3,ymm5,20 vpslld ymm5,ymm5,12 vpxor ymm5,ymm5,ymm3 vpaddd ymm1,ymm1,ymm5 vpxor ymm13,ymm13,ymm1 vpshufb ymm13,ymm13,YMMWORD[$L$rol8] vpaddd ymm9,ymm9,ymm13 vpxor ymm5,ymm5,ymm9 vpslld ymm3,ymm5,7 vpsrld ymm5,ymm5,25 vpxor ymm5,ymm5,ymm3 vpalignr ymm13,ymm13,ymm13,12 vpalignr ymm9,ymm9,ymm9,8 vpalignr ymm5,ymm5,ymm5,4 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol16] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpsrld ymm3,ymm4,20 vpslld ymm4,ymm4,12 vpxor ymm4,ymm4,ymm3 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol8] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpslld ymm3,ymm4,7 vpsrld ymm4,ymm4,25 vpxor ymm4,ymm4,ymm3 vpalignr ymm12,ymm12,ymm12,12 
vpalignr ymm8,ymm8,ymm8,8 vpalignr ymm4,ymm4,ymm4,4 add r10,QWORD[((0+0))+rbx] adc r11,QWORD[((8+0))+rbx] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 lea rbx,[16+rbx] inc r8 vpaddd ymm2,ymm2,ymm6 vpxor ymm14,ymm14,ymm2 vpshufb ymm14,ymm14,YMMWORD[$L$rol16] vpaddd ymm10,ymm10,ymm14 vpxor ymm6,ymm6,ymm10 vpsrld ymm3,ymm6,20 vpslld ymm6,ymm6,12 vpxor ymm6,ymm6,ymm3 vpaddd ymm2,ymm2,ymm6 vpxor ymm14,ymm14,ymm2 vpshufb ymm14,ymm14,YMMWORD[$L$rol8] vpaddd ymm10,ymm10,ymm14 vpxor ymm6,ymm6,ymm10 vpslld ymm3,ymm6,7 vpsrld ymm6,ymm6,25 vpxor ymm6,ymm6,ymm3 vpalignr ymm14,ymm14,ymm14,4 vpalignr ymm10,ymm10,ymm10,8 vpalignr ymm6,ymm6,ymm6,12 vpaddd ymm1,ymm1,ymm5 vpxor ymm13,ymm13,ymm1 vpshufb ymm13,ymm13,YMMWORD[$L$rol16] vpaddd ymm9,ymm9,ymm13 vpxor ymm5,ymm5,ymm9 vpsrld ymm3,ymm5,20 vpslld ymm5,ymm5,12 vpxor ymm5,ymm5,ymm3 vpaddd ymm1,ymm1,ymm5 vpxor ymm13,ymm13,ymm1 vpshufb ymm13,ymm13,YMMWORD[$L$rol8] vpaddd ymm9,ymm9,ymm13 vpxor ymm5,ymm5,ymm9 vpslld ymm3,ymm5,7 vpsrld ymm5,ymm5,25 vpxor ymm5,ymm5,ymm3 vpalignr ymm13,ymm13,ymm13,4 vpalignr ymm9,ymm9,ymm9,8 vpalignr ymm5,ymm5,ymm5,12 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol16] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpsrld ymm3,ymm4,20 vpslld ymm4,ymm4,12 vpxor ymm4,ymm4,ymm3 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol8] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpslld ymm3,ymm4,7 vpsrld ymm4,ymm4,25 vpxor ymm4,ymm4,ymm3 vpalignr ymm12,ymm12,ymm12,4 vpalignr ymm8,ymm8,ymm8,8 vpalignr ymm4,ymm4,ymm4,12 cmp r8,rcx jb NEAR $L$open_avx2_tail_384_rounds_and_x2hash cmp r8,10 jne NEAR $L$open_avx2_tail_384_rounds_and_x1hash mov r8,rbx sub rbx,rsi mov rcx,rbx mov rbx,QWORD[((160+128))+rbp] $L$open_avx2_384_tail_hash: add rcx,16 cmp rcx,rbx jg NEAR $L$open_avx2_384_tail_done add r10,QWORD[((0+0))+r8] adc r11,QWORD[((8+0))+r8] adc r12,1 mov rdx,QWORD[((0+160+0))+rbp] mov r15,rdx mulx r14,r13,r10 mulx rdx,rax,r11 imul r15,r12 add r14,rax adc r15,rdx mov rdx,QWORD[((8+160+0))+rbp] mulx rax,r10,r10 add r14,r10 mulx r9,r11,r11 adc r15,r11 adc r9,0 imul rdx,r12 add r15,rax adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 lea r8,[16+r8] jmp NEAR $L$open_avx2_384_tail_hash $L$open_avx2_384_tail_done: vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp] vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] vperm2i128 ymm3,ymm6,ymm2,0x02 vperm2i128 ymm6,ymm6,ymm2,0x13 vperm2i128 ymm2,ymm14,ymm10,0x02 vperm2i128 ymm10,ymm14,ymm10,0x13 vpxor ymm3,ymm3,YMMWORD[((0+0))+rsi] vpxor ymm2,ymm2,YMMWORD[((32+0))+rsi] vpxor 
ymm6,ymm6,YMMWORD[((64+0))+rsi] vpxor ymm10,ymm10,YMMWORD[((96+0))+rsi] vmovdqu YMMWORD[(0+0)+rdi],ymm3 vmovdqu YMMWORD[(32+0)+rdi],ymm2 vmovdqu YMMWORD[(64+0)+rdi],ymm6 vmovdqu YMMWORD[(96+0)+rdi],ymm10 vperm2i128 ymm3,ymm5,ymm1,0x02 vperm2i128 ymm5,ymm5,ymm1,0x13 vperm2i128 ymm1,ymm13,ymm9,0x02 vperm2i128 ymm9,ymm13,ymm9,0x13 vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi] vpxor ymm1,ymm1,YMMWORD[((32+128))+rsi] vpxor ymm5,ymm5,YMMWORD[((64+128))+rsi] vpxor ymm9,ymm9,YMMWORD[((96+128))+rsi] vmovdqu YMMWORD[(0+128)+rdi],ymm3 vmovdqu YMMWORD[(32+128)+rdi],ymm1 vmovdqu YMMWORD[(64+128)+rdi],ymm5 vmovdqu YMMWORD[(96+128)+rdi],ymm9 vperm2i128 ymm3,ymm4,ymm0,0x13 vperm2i128 ymm0,ymm4,ymm0,0x02 vperm2i128 ymm4,ymm12,ymm8,0x02 vperm2i128 ymm12,ymm12,ymm8,0x13 vmovdqa ymm8,ymm3 lea rsi,[256+rsi] lea rdi,[256+rdi] sub rbx,8*32 jmp NEAR $L$open_avx2_tail_128_xor $L$open_avx2_tail_512: vmovdqa ymm0,YMMWORD[$L$chacha20_consts] vmovdqa ymm4,YMMWORD[((160+64))+rbp] vmovdqa ymm8,YMMWORD[((160+96))+rbp] vmovdqa ymm1,ymm0 vmovdqa ymm5,ymm4 vmovdqa ymm9,ymm8 vmovdqa ymm2,ymm0 vmovdqa ymm6,ymm4 vmovdqa ymm10,ymm8 vmovdqa ymm3,ymm0 vmovdqa ymm7,ymm4 vmovdqa ymm11,ymm8 vmovdqa ymm12,YMMWORD[$L$avx2_inc] vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp] vpaddd ymm14,ymm12,ymm15 vpaddd ymm13,ymm12,ymm14 vpaddd ymm12,ymm12,ymm13 vmovdqa YMMWORD[(160+256)+rbp],ymm15 vmovdqa YMMWORD[(160+224)+rbp],ymm14 vmovdqa YMMWORD[(160+192)+rbp],ymm13 vmovdqa YMMWORD[(160+160)+rbp],ymm12 xor rcx,rcx mov r8,rsi $L$open_avx2_tail_512_rounds_and_x2hash: add r10,QWORD[((0+0))+r8] adc r11,QWORD[((8+0))+r8] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 lea r8,[16+r8] $L$open_avx2_tail_512_rounds_and_x1hash: vmovdqa YMMWORD[(160+128)+rbp],ymm8 vmovdqa ymm8,YMMWORD[$L$rol16] vpaddd ymm3,ymm3,ymm7 vpaddd ymm2,ymm2,ymm6 vpaddd ymm1,ymm1,ymm5 vpaddd ymm0,ymm0,ymm4 vpxor ymm15,ymm15,ymm3 vpxor ymm14,ymm14,ymm2 vpxor ymm13,ymm13,ymm1 vpxor ymm12,ymm12,ymm0 vpshufb ymm15,ymm15,ymm8 vpshufb ymm14,ymm14,ymm8 vpshufb ymm13,ymm13,ymm8 vpshufb ymm12,ymm12,ymm8 vpaddd ymm11,ymm11,ymm15 vpaddd ymm10,ymm10,ymm14 vpaddd ymm9,ymm9,ymm13 vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] vpxor ymm7,ymm7,ymm11 vpxor ymm6,ymm6,ymm10 vpxor ymm5,ymm5,ymm9 vpxor ymm4,ymm4,ymm8 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vpsrld ymm8,ymm7,20 vpslld ymm7,ymm7,32-20 vpxor ymm7,ymm7,ymm8 vpsrld ymm8,ymm6,20 vpslld ymm6,ymm6,32-20 vpxor ymm6,ymm6,ymm8 vpsrld ymm8,ymm5,20 vpslld ymm5,ymm5,32-20 vpxor ymm5,ymm5,ymm8 vpsrld ymm8,ymm4,20 vpslld ymm4,ymm4,32-20 vpxor ymm4,ymm4,ymm8 vmovdqa ymm8,YMMWORD[$L$rol8] vpaddd ymm3,ymm3,ymm7 add r10,QWORD[((0+0))+r8] adc r11,QWORD[((8+0))+r8] adc r12,1 mov rdx,QWORD[((0+160+0))+rbp] mov r15,rdx mulx r14,r13,r10 mulx rdx,rax,r11 imul r15,r12 add r14,rax adc r15,rdx mov rdx,QWORD[((8+160+0))+rbp] mulx rax,r10,r10 add r14,r10 mulx r9,r11,r11 adc r15,r11 adc r9,0 imul rdx,r12 add r15,rax adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 vpaddd ymm2,ymm2,ymm6 vpaddd ymm1,ymm1,ymm5 vpaddd 
ymm0,ymm0,ymm4 vpxor ymm15,ymm15,ymm3 vpxor ymm14,ymm14,ymm2 vpxor ymm13,ymm13,ymm1 vpxor ymm12,ymm12,ymm0 vpshufb ymm15,ymm15,ymm8 vpshufb ymm14,ymm14,ymm8 vpshufb ymm13,ymm13,ymm8 vpshufb ymm12,ymm12,ymm8 vpaddd ymm11,ymm11,ymm15 vpaddd ymm10,ymm10,ymm14 vpaddd ymm9,ymm9,ymm13 vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] vpxor ymm7,ymm7,ymm11 vpxor ymm6,ymm6,ymm10 vpxor ymm5,ymm5,ymm9 vpxor ymm4,ymm4,ymm8 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vpsrld ymm8,ymm7,25 vpslld ymm7,ymm7,32-25 vpxor ymm7,ymm7,ymm8 vpsrld ymm8,ymm6,25 vpslld ymm6,ymm6,32-25 vpxor ymm6,ymm6,ymm8 vpsrld ymm8,ymm5,25 vpslld ymm5,ymm5,32-25 vpxor ymm5,ymm5,ymm8 vpsrld ymm8,ymm4,25 vpslld ymm4,ymm4,32-25 vpxor ymm4,ymm4,ymm8 vmovdqa ymm8,YMMWORD[((160+128))+rbp] vpalignr ymm7,ymm7,ymm7,4 vpalignr ymm11,ymm11,ymm11,8 vpalignr ymm15,ymm15,ymm15,12 vpalignr ymm6,ymm6,ymm6,4 vpalignr ymm10,ymm10,ymm10,8 vpalignr ymm14,ymm14,ymm14,12 vpalignr ymm5,ymm5,ymm5,4 vpalignr ymm9,ymm9,ymm9,8 vpalignr ymm13,ymm13,ymm13,12 vpalignr ymm4,ymm4,ymm4,4 vpalignr ymm8,ymm8,ymm8,8 vpalignr ymm12,ymm12,ymm12,12 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vmovdqa ymm8,YMMWORD[$L$rol16] vpaddd ymm3,ymm3,ymm7 add r10,QWORD[((0+16))+r8] adc r11,QWORD[((8+16))+r8] adc r12,1 mov rdx,QWORD[((0+160+0))+rbp] mov r15,rdx mulx r14,r13,r10 mulx rdx,rax,r11 imul r15,r12 add r14,rax adc r15,rdx mov rdx,QWORD[((8+160+0))+rbp] mulx rax,r10,r10 add r14,r10 mulx r9,r11,r11 adc r15,r11 adc r9,0 imul rdx,r12 add r15,rax adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 lea r8,[32+r8] vpaddd ymm2,ymm2,ymm6 vpaddd ymm1,ymm1,ymm5 vpaddd ymm0,ymm0,ymm4 vpxor ymm15,ymm15,ymm3 vpxor ymm14,ymm14,ymm2 vpxor ymm13,ymm13,ymm1 vpxor ymm12,ymm12,ymm0 vpshufb ymm15,ymm15,ymm8 vpshufb ymm14,ymm14,ymm8 vpshufb ymm13,ymm13,ymm8 vpshufb ymm12,ymm12,ymm8 vpaddd ymm11,ymm11,ymm15 vpaddd ymm10,ymm10,ymm14 vpaddd ymm9,ymm9,ymm13 vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] vpxor ymm7,ymm7,ymm11 vpxor ymm6,ymm6,ymm10 vpxor ymm5,ymm5,ymm9 vpxor ymm4,ymm4,ymm8 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vpsrld ymm8,ymm7,20 vpslld ymm7,ymm7,32-20 vpxor ymm7,ymm7,ymm8 vpsrld ymm8,ymm6,20 vpslld ymm6,ymm6,32-20 vpxor ymm6,ymm6,ymm8 vpsrld ymm8,ymm5,20 vpslld ymm5,ymm5,32-20 vpxor ymm5,ymm5,ymm8 vpsrld ymm8,ymm4,20 vpslld ymm4,ymm4,32-20 vpxor ymm4,ymm4,ymm8 vmovdqa ymm8,YMMWORD[$L$rol8] vpaddd ymm3,ymm3,ymm7 vpaddd ymm2,ymm2,ymm6 vpaddd ymm1,ymm1,ymm5 vpaddd ymm0,ymm0,ymm4 vpxor ymm15,ymm15,ymm3 vpxor ymm14,ymm14,ymm2 vpxor ymm13,ymm13,ymm1 vpxor ymm12,ymm12,ymm0 vpshufb ymm15,ymm15,ymm8 vpshufb ymm14,ymm14,ymm8 vpshufb ymm13,ymm13,ymm8 vpshufb ymm12,ymm12,ymm8 vpaddd ymm11,ymm11,ymm15 vpaddd ymm10,ymm10,ymm14 vpaddd ymm9,ymm9,ymm13 vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] vpxor ymm7,ymm7,ymm11 vpxor ymm6,ymm6,ymm10 vpxor ymm5,ymm5,ymm9 vpxor ymm4,ymm4,ymm8 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vpsrld ymm8,ymm7,25 vpslld ymm7,ymm7,32-25 vpxor ymm7,ymm7,ymm8 vpsrld ymm8,ymm6,25 vpslld ymm6,ymm6,32-25 vpxor ymm6,ymm6,ymm8 vpsrld ymm8,ymm5,25 vpslld ymm5,ymm5,32-25 vpxor ymm5,ymm5,ymm8 vpsrld ymm8,ymm4,25 vpslld ymm4,ymm4,32-25 vpxor ymm4,ymm4,ymm8 vmovdqa ymm8,YMMWORD[((160+128))+rbp] vpalignr ymm7,ymm7,ymm7,12 vpalignr ymm11,ymm11,ymm11,8 vpalignr ymm15,ymm15,ymm15,4 vpalignr ymm6,ymm6,ymm6,12 vpalignr ymm10,ymm10,ymm10,8 vpalignr ymm14,ymm14,ymm14,4 vpalignr ymm5,ymm5,ymm5,12 vpalignr ymm9,ymm9,ymm9,8 vpalignr ymm13,ymm13,ymm13,4 vpalignr ymm4,ymm4,ymm4,12 vpalignr ymm8,ymm8,ymm8,8 vpalignr 
ymm12,ymm12,ymm12,4 inc rcx cmp rcx,4 jl NEAR $L$open_avx2_tail_512_rounds_and_x2hash cmp rcx,10 jne NEAR $L$open_avx2_tail_512_rounds_and_x1hash mov rcx,rbx sub rcx,12*32 and rcx,-16 $L$open_avx2_tail_512_hash: test rcx,rcx je NEAR $L$open_avx2_tail_512_done add r10,QWORD[((0+0))+r8] adc r11,QWORD[((8+0))+r8] adc r12,1 mov rdx,QWORD[((0+160+0))+rbp] mov r15,rdx mulx r14,r13,r10 mulx rdx,rax,r11 imul r15,r12 add r14,rax adc r15,rdx mov rdx,QWORD[((8+160+0))+rbp] mulx rax,r10,r10 add r14,r10 mulx r9,r11,r11 adc r15,r11 adc r9,0 imul rdx,r12 add r15,rax adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 lea r8,[16+r8] sub rcx,2*8 jmp NEAR $L$open_avx2_tail_512_hash $L$open_avx2_tail_512_done: vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts] vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp] vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp] vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp] vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp] vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] vmovdqa YMMWORD[(160+128)+rbp],ymm0 vperm2i128 ymm0,ymm7,ymm3,0x02 vperm2i128 ymm7,ymm7,ymm3,0x13 vperm2i128 ymm3,ymm15,ymm11,0x02 vperm2i128 ymm11,ymm15,ymm11,0x13 vpxor ymm0,ymm0,YMMWORD[((0+0))+rsi] vpxor ymm3,ymm3,YMMWORD[((32+0))+rsi] vpxor ymm7,ymm7,YMMWORD[((64+0))+rsi] vpxor ymm11,ymm11,YMMWORD[((96+0))+rsi] vmovdqu YMMWORD[(0+0)+rdi],ymm0 vmovdqu YMMWORD[(32+0)+rdi],ymm3 vmovdqu YMMWORD[(64+0)+rdi],ymm7 vmovdqu YMMWORD[(96+0)+rdi],ymm11 vmovdqa ymm0,YMMWORD[((160+128))+rbp] vperm2i128 ymm3,ymm6,ymm2,0x02 vperm2i128 ymm6,ymm6,ymm2,0x13 vperm2i128 ymm2,ymm14,ymm10,0x02 vperm2i128 ymm10,ymm14,ymm10,0x13 vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi] vpxor ymm2,ymm2,YMMWORD[((32+128))+rsi] vpxor ymm6,ymm6,YMMWORD[((64+128))+rsi] vpxor ymm10,ymm10,YMMWORD[((96+128))+rsi] vmovdqu YMMWORD[(0+128)+rdi],ymm3 vmovdqu YMMWORD[(32+128)+rdi],ymm2 vmovdqu YMMWORD[(64+128)+rdi],ymm6 vmovdqu YMMWORD[(96+128)+rdi],ymm10 vperm2i128 ymm3,ymm5,ymm1,0x02 vperm2i128 ymm5,ymm5,ymm1,0x13 vperm2i128 ymm1,ymm13,ymm9,0x02 vperm2i128 ymm9,ymm13,ymm9,0x13 vpxor ymm3,ymm3,YMMWORD[((0+256))+rsi] vpxor ymm1,ymm1,YMMWORD[((32+256))+rsi] vpxor ymm5,ymm5,YMMWORD[((64+256))+rsi] vpxor ymm9,ymm9,YMMWORD[((96+256))+rsi] vmovdqu YMMWORD[(0+256)+rdi],ymm3 vmovdqu YMMWORD[(32+256)+rdi],ymm1 vmovdqu YMMWORD[(64+256)+rdi],ymm5 vmovdqu YMMWORD[(96+256)+rdi],ymm9 vperm2i128 ymm3,ymm4,ymm0,0x13 vperm2i128 ymm0,ymm4,ymm0,0x02 vperm2i128 ymm4,ymm12,ymm8,0x02 vperm2i128 ymm12,ymm12,ymm8,0x13 vmovdqa ymm8,ymm3 lea rsi,[384+rsi] lea rdi,[384+rdi] sub rbx,12*32 $L$open_avx2_tail_128_xor: cmp rbx,32 jb NEAR $L$open_avx2_tail_32_xor sub rbx,32 vpxor ymm0,ymm0,YMMWORD[rsi] vmovdqu YMMWORD[rdi],ymm0 lea rsi,[32+rsi] lea rdi,[32+rdi] vmovdqa ymm0,ymm4 vmovdqa ymm4,ymm8 vmovdqa ymm8,ymm12 jmp NEAR $L$open_avx2_tail_128_xor $L$open_avx2_tail_32_xor: cmp rbx,16 vmovdqa xmm1,xmm0 jb NEAR $L$open_avx2_exit sub rbx,16 vpxor xmm1,xmm0,XMMWORD[rsi] vmovdqu XMMWORD[rdi],xmm1 lea rsi,[16+rsi] lea rdi,[16+rdi] vperm2i128 ymm0,ymm0,ymm0,0x11 vmovdqa xmm1,xmm0 $L$open_avx2_exit: 
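; ---------------------------------------------------------------------------
; Editor annotation (not part of the generated perlasm output; descriptive
; only, inferred from the label names and surrounding code): the
; $L$open_avx2_192 block that follows appears to be the short-input path of
; the AVX2 "open" routine. It runs ten ChaCha20 double-rounds over two
; block-counter values, clamps the first 32 bytes of keystream with $L$clamp
; to form the Poly1305 key stored at 160+0(rbp), and then falls through to
; $L$open_avx2_short, which hashes and decrypts the remaining input 32 bytes
; at a time.
; ---------------------------------------------------------------------------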
vzeroupper jmp NEAR $L$open_sse_tail_16 $L$open_avx2_192: vmovdqa ymm1,ymm0 vmovdqa ymm2,ymm0 vmovdqa ymm5,ymm4 vmovdqa ymm6,ymm4 vmovdqa ymm9,ymm8 vmovdqa ymm10,ymm8 vpaddd ymm13,ymm12,YMMWORD[$L$avx2_inc] vmovdqa ymm11,ymm12 vmovdqa ymm15,ymm13 mov r10,10 $L$open_avx2_192_rounds: vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol16] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpsrld ymm3,ymm4,20 vpslld ymm4,ymm4,12 vpxor ymm4,ymm4,ymm3 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol8] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpslld ymm3,ymm4,7 vpsrld ymm4,ymm4,25 vpxor ymm4,ymm4,ymm3 vpalignr ymm12,ymm12,ymm12,12 vpalignr ymm8,ymm8,ymm8,8 vpalignr ymm4,ymm4,ymm4,4 vpaddd ymm1,ymm1,ymm5 vpxor ymm13,ymm13,ymm1 vpshufb ymm13,ymm13,YMMWORD[$L$rol16] vpaddd ymm9,ymm9,ymm13 vpxor ymm5,ymm5,ymm9 vpsrld ymm3,ymm5,20 vpslld ymm5,ymm5,12 vpxor ymm5,ymm5,ymm3 vpaddd ymm1,ymm1,ymm5 vpxor ymm13,ymm13,ymm1 vpshufb ymm13,ymm13,YMMWORD[$L$rol8] vpaddd ymm9,ymm9,ymm13 vpxor ymm5,ymm5,ymm9 vpslld ymm3,ymm5,7 vpsrld ymm5,ymm5,25 vpxor ymm5,ymm5,ymm3 vpalignr ymm13,ymm13,ymm13,12 vpalignr ymm9,ymm9,ymm9,8 vpalignr ymm5,ymm5,ymm5,4 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol16] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpsrld ymm3,ymm4,20 vpslld ymm4,ymm4,12 vpxor ymm4,ymm4,ymm3 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol8] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpslld ymm3,ymm4,7 vpsrld ymm4,ymm4,25 vpxor ymm4,ymm4,ymm3 vpalignr ymm12,ymm12,ymm12,4 vpalignr ymm8,ymm8,ymm8,8 vpalignr ymm4,ymm4,ymm4,12 vpaddd ymm1,ymm1,ymm5 vpxor ymm13,ymm13,ymm1 vpshufb ymm13,ymm13,YMMWORD[$L$rol16] vpaddd ymm9,ymm9,ymm13 vpxor ymm5,ymm5,ymm9 vpsrld ymm3,ymm5,20 vpslld ymm5,ymm5,12 vpxor ymm5,ymm5,ymm3 vpaddd ymm1,ymm1,ymm5 vpxor ymm13,ymm13,ymm1 vpshufb ymm13,ymm13,YMMWORD[$L$rol8] vpaddd ymm9,ymm9,ymm13 vpxor ymm5,ymm5,ymm9 vpslld ymm3,ymm5,7 vpsrld ymm5,ymm5,25 vpxor ymm5,ymm5,ymm3 vpalignr ymm13,ymm13,ymm13,4 vpalignr ymm9,ymm9,ymm9,8 vpalignr ymm5,ymm5,ymm5,12 dec r10 jne NEAR $L$open_avx2_192_rounds vpaddd ymm0,ymm0,ymm2 vpaddd ymm1,ymm1,ymm2 vpaddd ymm4,ymm4,ymm6 vpaddd ymm5,ymm5,ymm6 vpaddd ymm8,ymm8,ymm10 vpaddd ymm9,ymm9,ymm10 vpaddd ymm12,ymm12,ymm11 vpaddd ymm13,ymm13,ymm15 vperm2i128 ymm3,ymm4,ymm0,0x02 vpand ymm3,ymm3,YMMWORD[$L$clamp] vmovdqa YMMWORD[(160+0)+rbp],ymm3 vperm2i128 ymm0,ymm4,ymm0,0x13 vperm2i128 ymm4,ymm12,ymm8,0x13 vperm2i128 ymm8,ymm5,ymm1,0x02 vperm2i128 ymm12,ymm13,ymm9,0x02 vperm2i128 ymm1,ymm5,ymm1,0x13 vperm2i128 ymm5,ymm13,ymm9,0x13 $L$open_avx2_short: mov r8,r8 call poly_hash_ad_internal $L$open_avx2_short_hash_and_xor_loop: cmp rbx,32 jb NEAR $L$open_avx2_short_tail_32 sub rbx,32 add r10,QWORD[((0+0))+rsi] adc r11,QWORD[((8+0))+rsi] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 add r10,QWORD[((0+16))+rsi] adc r11,QWORD[((8+16))+rsi] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov 
rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 vpxor ymm0,ymm0,YMMWORD[rsi] vmovdqu YMMWORD[rdi],ymm0 lea rsi,[32+rsi] lea rdi,[32+rdi] vmovdqa ymm0,ymm4 vmovdqa ymm4,ymm8 vmovdqa ymm8,ymm12 vmovdqa ymm12,ymm1 vmovdqa ymm1,ymm5 vmovdqa ymm5,ymm9 vmovdqa ymm9,ymm13 vmovdqa ymm13,ymm2 vmovdqa ymm2,ymm6 jmp NEAR $L$open_avx2_short_hash_and_xor_loop $L$open_avx2_short_tail_32: cmp rbx,16 vmovdqa xmm1,xmm0 jb NEAR $L$open_avx2_short_tail_32_exit sub rbx,16 add r10,QWORD[((0+0))+rsi] adc r11,QWORD[((8+0))+rsi] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 vpxor xmm3,xmm0,XMMWORD[rsi] vmovdqu XMMWORD[rdi],xmm3 lea rsi,[16+rsi] lea rdi,[16+rdi] vextracti128 xmm1,ymm0,1 $L$open_avx2_short_tail_32_exit: vzeroupper jmp NEAR $L$open_sse_tail_16 $L$open_avx2_320: vmovdqa ymm1,ymm0 vmovdqa ymm2,ymm0 vmovdqa ymm5,ymm4 vmovdqa ymm6,ymm4 vmovdqa ymm9,ymm8 vmovdqa ymm10,ymm8 vpaddd ymm13,ymm12,YMMWORD[$L$avx2_inc] vpaddd ymm14,ymm13,YMMWORD[$L$avx2_inc] vmovdqa ymm7,ymm4 vmovdqa ymm11,ymm8 vmovdqa YMMWORD[(160+160)+rbp],ymm12 vmovdqa YMMWORD[(160+192)+rbp],ymm13 vmovdqa YMMWORD[(160+224)+rbp],ymm14 mov r10,10 $L$open_avx2_320_rounds: vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol16] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpsrld ymm3,ymm4,20 vpslld ymm4,ymm4,12 vpxor ymm4,ymm4,ymm3 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol8] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpslld ymm3,ymm4,7 vpsrld ymm4,ymm4,25 vpxor ymm4,ymm4,ymm3 vpalignr ymm12,ymm12,ymm12,12 vpalignr ymm8,ymm8,ymm8,8 vpalignr ymm4,ymm4,ymm4,4 vpaddd ymm1,ymm1,ymm5 vpxor ymm13,ymm13,ymm1 vpshufb ymm13,ymm13,YMMWORD[$L$rol16] vpaddd ymm9,ymm9,ymm13 vpxor ymm5,ymm5,ymm9 vpsrld ymm3,ymm5,20 vpslld ymm5,ymm5,12 vpxor ymm5,ymm5,ymm3 vpaddd ymm1,ymm1,ymm5 vpxor ymm13,ymm13,ymm1 vpshufb ymm13,ymm13,YMMWORD[$L$rol8] vpaddd ymm9,ymm9,ymm13 vpxor ymm5,ymm5,ymm9 vpslld ymm3,ymm5,7 vpsrld ymm5,ymm5,25 vpxor ymm5,ymm5,ymm3 vpalignr ymm13,ymm13,ymm13,12 vpalignr ymm9,ymm9,ymm9,8 vpalignr ymm5,ymm5,ymm5,4 vpaddd ymm2,ymm2,ymm6 vpxor ymm14,ymm14,ymm2 vpshufb ymm14,ymm14,YMMWORD[$L$rol16] vpaddd ymm10,ymm10,ymm14 vpxor ymm6,ymm6,ymm10 vpsrld ymm3,ymm6,20 vpslld ymm6,ymm6,12 vpxor ymm6,ymm6,ymm3 vpaddd ymm2,ymm2,ymm6 vpxor ymm14,ymm14,ymm2 vpshufb ymm14,ymm14,YMMWORD[$L$rol8] vpaddd ymm10,ymm10,ymm14 vpxor ymm6,ymm6,ymm10 vpslld ymm3,ymm6,7 vpsrld ymm6,ymm6,25 vpxor ymm6,ymm6,ymm3 vpalignr ymm14,ymm14,ymm14,12 vpalignr ymm10,ymm10,ymm10,8 vpalignr ymm6,ymm6,ymm6,4 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol16] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpsrld ymm3,ymm4,20 vpslld ymm4,ymm4,12 vpxor ymm4,ymm4,ymm3 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol8] vpaddd ymm8,ymm8,ymm12 vpxor 
ymm4,ymm4,ymm8 vpslld ymm3,ymm4,7 vpsrld ymm4,ymm4,25 vpxor ymm4,ymm4,ymm3 vpalignr ymm12,ymm12,ymm12,4 vpalignr ymm8,ymm8,ymm8,8 vpalignr ymm4,ymm4,ymm4,12 vpaddd ymm1,ymm1,ymm5 vpxor ymm13,ymm13,ymm1 vpshufb ymm13,ymm13,YMMWORD[$L$rol16] vpaddd ymm9,ymm9,ymm13 vpxor ymm5,ymm5,ymm9 vpsrld ymm3,ymm5,20 vpslld ymm5,ymm5,12 vpxor ymm5,ymm5,ymm3 vpaddd ymm1,ymm1,ymm5 vpxor ymm13,ymm13,ymm1 vpshufb ymm13,ymm13,YMMWORD[$L$rol8] vpaddd ymm9,ymm9,ymm13 vpxor ymm5,ymm5,ymm9 vpslld ymm3,ymm5,7 vpsrld ymm5,ymm5,25 vpxor ymm5,ymm5,ymm3 vpalignr ymm13,ymm13,ymm13,4 vpalignr ymm9,ymm9,ymm9,8 vpalignr ymm5,ymm5,ymm5,12 vpaddd ymm2,ymm2,ymm6 vpxor ymm14,ymm14,ymm2 vpshufb ymm14,ymm14,YMMWORD[$L$rol16] vpaddd ymm10,ymm10,ymm14 vpxor ymm6,ymm6,ymm10 vpsrld ymm3,ymm6,20 vpslld ymm6,ymm6,12 vpxor ymm6,ymm6,ymm3 vpaddd ymm2,ymm2,ymm6 vpxor ymm14,ymm14,ymm2 vpshufb ymm14,ymm14,YMMWORD[$L$rol8] vpaddd ymm10,ymm10,ymm14 vpxor ymm6,ymm6,ymm10 vpslld ymm3,ymm6,7 vpsrld ymm6,ymm6,25 vpxor ymm6,ymm6,ymm3 vpalignr ymm14,ymm14,ymm14,4 vpalignr ymm10,ymm10,ymm10,8 vpalignr ymm6,ymm6,ymm6,12 dec r10 jne NEAR $L$open_avx2_320_rounds vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] vpaddd ymm4,ymm4,ymm7 vpaddd ymm5,ymm5,ymm7 vpaddd ymm6,ymm6,ymm7 vpaddd ymm8,ymm8,ymm11 vpaddd ymm9,ymm9,ymm11 vpaddd ymm10,ymm10,ymm11 vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] vperm2i128 ymm3,ymm4,ymm0,0x02 vpand ymm3,ymm3,YMMWORD[$L$clamp] vmovdqa YMMWORD[(160+0)+rbp],ymm3 vperm2i128 ymm0,ymm4,ymm0,0x13 vperm2i128 ymm4,ymm12,ymm8,0x13 vperm2i128 ymm8,ymm5,ymm1,0x02 vperm2i128 ymm12,ymm13,ymm9,0x02 vperm2i128 ymm1,ymm5,ymm1,0x13 vperm2i128 ymm5,ymm13,ymm9,0x13 vperm2i128 ymm9,ymm6,ymm2,0x02 vperm2i128 ymm13,ymm14,ymm10,0x02 vperm2i128 ymm2,ymm6,ymm2,0x13 vperm2i128 ymm6,ymm14,ymm10,0x13 jmp NEAR $L$open_avx2_short $L$SEH_end_chacha20_poly1305_open_avx2: global chacha20_poly1305_seal_avx2 ALIGN 64 chacha20_poly1305_seal_avx2: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_chacha20_poly1305_seal_avx2: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD[40+rsp] mov r9,QWORD[48+rsp] _CET_ENDBR push rbp push rbx push r12 push r13 push r14 push r15 push r9 sub rsp,288 + 160 + 32 lea rbp,[32+rsp] and rbp,-32 movaps XMMWORD[(0+0)+rbp],xmm6 movaps XMMWORD[(16+0)+rbp],xmm7 movaps XMMWORD[(32+0)+rbp],xmm8 movaps XMMWORD[(48+0)+rbp],xmm9 movaps XMMWORD[(64+0)+rbp],xmm10 movaps XMMWORD[(80+0)+rbp],xmm11 movaps XMMWORD[(96+0)+rbp],xmm12 movaps XMMWORD[(112+0)+rbp],xmm13 movaps XMMWORD[(128+0)+rbp],xmm14 movaps XMMWORD[(144+0)+rbp],xmm15 mov rbx,QWORD[56+r9] add rbx,rdx mov QWORD[((0+160+32))+rbp],r8 mov QWORD[((8+160+32))+rbp],rbx mov rbx,rdx vzeroupper vmovdqa ymm0,YMMWORD[$L$chacha20_consts] vbroadcasti128 ymm4,XMMWORD[r9] vbroadcasti128 ymm8,XMMWORD[16+r9] vbroadcasti128 ymm12,XMMWORD[32+r9] vpaddd ymm12,ymm12,YMMWORD[$L$avx2_init] cmp rbx,6*32 jbe NEAR $L$seal_avx2_192 cmp rbx,10*32 jbe NEAR $L$seal_avx2_320 vmovdqa ymm1,ymm0 vmovdqa ymm2,ymm0 vmovdqa ymm3,ymm0 vmovdqa ymm5,ymm4 vmovdqa ymm6,ymm4 vmovdqa ymm7,ymm4 vmovdqa YMMWORD[(160+64)+rbp],ymm4 vmovdqa ymm9,ymm8 vmovdqa ymm10,ymm8 vmovdqa ymm11,ymm8 vmovdqa YMMWORD[(160+96)+rbp],ymm8 vmovdqa ymm15,ymm12 vpaddd ymm14,ymm15,YMMWORD[$L$avx2_inc] vpaddd ymm13,ymm14,YMMWORD[$L$avx2_inc] vpaddd ymm12,ymm13,YMMWORD[$L$avx2_inc] vmovdqa YMMWORD[(160+160)+rbp],ymm12 vmovdqa 
YMMWORD[(160+192)+rbp],ymm13 vmovdqa YMMWORD[(160+224)+rbp],ymm14 vmovdqa YMMWORD[(160+256)+rbp],ymm15 mov r10,10 $L$seal_avx2_init_rounds: vmovdqa YMMWORD[(160+128)+rbp],ymm8 vmovdqa ymm8,YMMWORD[$L$rol16] vpaddd ymm3,ymm3,ymm7 vpaddd ymm2,ymm2,ymm6 vpaddd ymm1,ymm1,ymm5 vpaddd ymm0,ymm0,ymm4 vpxor ymm15,ymm15,ymm3 vpxor ymm14,ymm14,ymm2 vpxor ymm13,ymm13,ymm1 vpxor ymm12,ymm12,ymm0 vpshufb ymm15,ymm15,ymm8 vpshufb ymm14,ymm14,ymm8 vpshufb ymm13,ymm13,ymm8 vpshufb ymm12,ymm12,ymm8 vpaddd ymm11,ymm11,ymm15 vpaddd ymm10,ymm10,ymm14 vpaddd ymm9,ymm9,ymm13 vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] vpxor ymm7,ymm7,ymm11 vpxor ymm6,ymm6,ymm10 vpxor ymm5,ymm5,ymm9 vpxor ymm4,ymm4,ymm8 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vpsrld ymm8,ymm7,20 vpslld ymm7,ymm7,32-20 vpxor ymm7,ymm7,ymm8 vpsrld ymm8,ymm6,20 vpslld ymm6,ymm6,32-20 vpxor ymm6,ymm6,ymm8 vpsrld ymm8,ymm5,20 vpslld ymm5,ymm5,32-20 vpxor ymm5,ymm5,ymm8 vpsrld ymm8,ymm4,20 vpslld ymm4,ymm4,32-20 vpxor ymm4,ymm4,ymm8 vmovdqa ymm8,YMMWORD[$L$rol8] vpaddd ymm3,ymm3,ymm7 vpaddd ymm2,ymm2,ymm6 vpaddd ymm1,ymm1,ymm5 vpaddd ymm0,ymm0,ymm4 vpxor ymm15,ymm15,ymm3 vpxor ymm14,ymm14,ymm2 vpxor ymm13,ymm13,ymm1 vpxor ymm12,ymm12,ymm0 vpshufb ymm15,ymm15,ymm8 vpshufb ymm14,ymm14,ymm8 vpshufb ymm13,ymm13,ymm8 vpshufb ymm12,ymm12,ymm8 vpaddd ymm11,ymm11,ymm15 vpaddd ymm10,ymm10,ymm14 vpaddd ymm9,ymm9,ymm13 vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] vpxor ymm7,ymm7,ymm11 vpxor ymm6,ymm6,ymm10 vpxor ymm5,ymm5,ymm9 vpxor ymm4,ymm4,ymm8 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vpsrld ymm8,ymm7,25 vpslld ymm7,ymm7,32-25 vpxor ymm7,ymm7,ymm8 vpsrld ymm8,ymm6,25 vpslld ymm6,ymm6,32-25 vpxor ymm6,ymm6,ymm8 vpsrld ymm8,ymm5,25 vpslld ymm5,ymm5,32-25 vpxor ymm5,ymm5,ymm8 vpsrld ymm8,ymm4,25 vpslld ymm4,ymm4,32-25 vpxor ymm4,ymm4,ymm8 vmovdqa ymm8,YMMWORD[((160+128))+rbp] vpalignr ymm7,ymm7,ymm7,4 vpalignr ymm11,ymm11,ymm11,8 vpalignr ymm15,ymm15,ymm15,12 vpalignr ymm6,ymm6,ymm6,4 vpalignr ymm10,ymm10,ymm10,8 vpalignr ymm14,ymm14,ymm14,12 vpalignr ymm5,ymm5,ymm5,4 vpalignr ymm9,ymm9,ymm9,8 vpalignr ymm13,ymm13,ymm13,12 vpalignr ymm4,ymm4,ymm4,4 vpalignr ymm8,ymm8,ymm8,8 vpalignr ymm12,ymm12,ymm12,12 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vmovdqa ymm8,YMMWORD[$L$rol16] vpaddd ymm3,ymm3,ymm7 vpaddd ymm2,ymm2,ymm6 vpaddd ymm1,ymm1,ymm5 vpaddd ymm0,ymm0,ymm4 vpxor ymm15,ymm15,ymm3 vpxor ymm14,ymm14,ymm2 vpxor ymm13,ymm13,ymm1 vpxor ymm12,ymm12,ymm0 vpshufb ymm15,ymm15,ymm8 vpshufb ymm14,ymm14,ymm8 vpshufb ymm13,ymm13,ymm8 vpshufb ymm12,ymm12,ymm8 vpaddd ymm11,ymm11,ymm15 vpaddd ymm10,ymm10,ymm14 vpaddd ymm9,ymm9,ymm13 vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] vpxor ymm7,ymm7,ymm11 vpxor ymm6,ymm6,ymm10 vpxor ymm5,ymm5,ymm9 vpxor ymm4,ymm4,ymm8 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vpsrld ymm8,ymm7,20 vpslld ymm7,ymm7,32-20 vpxor ymm7,ymm7,ymm8 vpsrld ymm8,ymm6,20 vpslld ymm6,ymm6,32-20 vpxor ymm6,ymm6,ymm8 vpsrld ymm8,ymm5,20 vpslld ymm5,ymm5,32-20 vpxor ymm5,ymm5,ymm8 vpsrld ymm8,ymm4,20 vpslld ymm4,ymm4,32-20 vpxor ymm4,ymm4,ymm8 vmovdqa ymm8,YMMWORD[$L$rol8] vpaddd ymm3,ymm3,ymm7 vpaddd ymm2,ymm2,ymm6 vpaddd ymm1,ymm1,ymm5 vpaddd ymm0,ymm0,ymm4 vpxor ymm15,ymm15,ymm3 vpxor ymm14,ymm14,ymm2 vpxor ymm13,ymm13,ymm1 vpxor ymm12,ymm12,ymm0 vpshufb ymm15,ymm15,ymm8 vpshufb ymm14,ymm14,ymm8 vpshufb ymm13,ymm13,ymm8 vpshufb ymm12,ymm12,ymm8 vpaddd ymm11,ymm11,ymm15 vpaddd ymm10,ymm10,ymm14 vpaddd ymm9,ymm9,ymm13 vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] vpxor ymm7,ymm7,ymm11 vpxor ymm6,ymm6,ymm10 vpxor ymm5,ymm5,ymm9 vpxor ymm4,ymm4,ymm8 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vpsrld 
ymm8,ymm7,25 vpslld ymm7,ymm7,32-25 vpxor ymm7,ymm7,ymm8 vpsrld ymm8,ymm6,25 vpslld ymm6,ymm6,32-25 vpxor ymm6,ymm6,ymm8 vpsrld ymm8,ymm5,25 vpslld ymm5,ymm5,32-25 vpxor ymm5,ymm5,ymm8 vpsrld ymm8,ymm4,25 vpslld ymm4,ymm4,32-25 vpxor ymm4,ymm4,ymm8 vmovdqa ymm8,YMMWORD[((160+128))+rbp] vpalignr ymm7,ymm7,ymm7,12 vpalignr ymm11,ymm11,ymm11,8 vpalignr ymm15,ymm15,ymm15,4 vpalignr ymm6,ymm6,ymm6,12 vpalignr ymm10,ymm10,ymm10,8 vpalignr ymm14,ymm14,ymm14,4 vpalignr ymm5,ymm5,ymm5,12 vpalignr ymm9,ymm9,ymm9,8 vpalignr ymm13,ymm13,ymm13,4 vpalignr ymm4,ymm4,ymm4,12 vpalignr ymm8,ymm8,ymm8,8 vpalignr ymm12,ymm12,ymm12,4 dec r10 jnz NEAR $L$seal_avx2_init_rounds vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts] vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp] vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp] vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp] vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp] vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] vperm2i128 ymm11,ymm15,ymm11,0x13 vperm2i128 ymm15,ymm7,ymm3,0x02 vperm2i128 ymm3,ymm7,ymm3,0x13 vpand ymm15,ymm15,YMMWORD[$L$clamp] vmovdqa YMMWORD[(160+0)+rbp],ymm15 mov r8,r8 call poly_hash_ad_internal vpxor ymm3,ymm3,YMMWORD[rsi] vpxor ymm11,ymm11,YMMWORD[32+rsi] vmovdqu YMMWORD[rdi],ymm3 vmovdqu YMMWORD[32+rdi],ymm11 vperm2i128 ymm15,ymm6,ymm2,0x02 vperm2i128 ymm6,ymm6,ymm2,0x13 vperm2i128 ymm2,ymm14,ymm10,0x02 vperm2i128 ymm10,ymm14,ymm10,0x13 vpxor ymm15,ymm15,YMMWORD[((0+64))+rsi] vpxor ymm2,ymm2,YMMWORD[((32+64))+rsi] vpxor ymm6,ymm6,YMMWORD[((64+64))+rsi] vpxor ymm10,ymm10,YMMWORD[((96+64))+rsi] vmovdqu YMMWORD[(0+64)+rdi],ymm15 vmovdqu YMMWORD[(32+64)+rdi],ymm2 vmovdqu YMMWORD[(64+64)+rdi],ymm6 vmovdqu YMMWORD[(96+64)+rdi],ymm10 vperm2i128 ymm15,ymm5,ymm1,0x02 vperm2i128 ymm5,ymm5,ymm1,0x13 vperm2i128 ymm1,ymm13,ymm9,0x02 vperm2i128 ymm9,ymm13,ymm9,0x13 vpxor ymm15,ymm15,YMMWORD[((0+192))+rsi] vpxor ymm1,ymm1,YMMWORD[((32+192))+rsi] vpxor ymm5,ymm5,YMMWORD[((64+192))+rsi] vpxor ymm9,ymm9,YMMWORD[((96+192))+rsi] vmovdqu YMMWORD[(0+192)+rdi],ymm15 vmovdqu YMMWORD[(32+192)+rdi],ymm1 vmovdqu YMMWORD[(64+192)+rdi],ymm5 vmovdqu YMMWORD[(96+192)+rdi],ymm9 vperm2i128 ymm15,ymm4,ymm0,0x13 vperm2i128 ymm0,ymm4,ymm0,0x02 vperm2i128 ymm4,ymm12,ymm8,0x02 vperm2i128 ymm12,ymm12,ymm8,0x13 vmovdqa ymm8,ymm15 lea rsi,[320+rsi] sub rbx,10*32 mov rcx,10*32 cmp rbx,4*32 jbe NEAR $L$seal_avx2_short_hash_remainder vpxor ymm0,ymm0,YMMWORD[rsi] vpxor ymm4,ymm4,YMMWORD[32+rsi] vpxor ymm8,ymm8,YMMWORD[64+rsi] vpxor ymm12,ymm12,YMMWORD[96+rsi] vmovdqu YMMWORD[320+rdi],ymm0 vmovdqu YMMWORD[352+rdi],ymm4 vmovdqu YMMWORD[384+rdi],ymm8 vmovdqu YMMWORD[416+rdi],ymm12 lea rsi,[128+rsi] sub rbx,4*32 mov rcx,8 mov r8,2 cmp rbx,4*32 jbe NEAR $L$seal_avx2_tail_128 cmp rbx,8*32 jbe NEAR $L$seal_avx2_tail_256 cmp rbx,12*32 jbe NEAR $L$seal_avx2_tail_384 cmp rbx,16*32 jbe NEAR $L$seal_avx2_tail_512 vmovdqa ymm0,YMMWORD[$L$chacha20_consts] vmovdqa ymm4,YMMWORD[((160+64))+rbp] vmovdqa ymm8,YMMWORD[((160+96))+rbp] vmovdqa ymm1,ymm0 vmovdqa ymm5,ymm4 vmovdqa ymm9,ymm8 vmovdqa ymm2,ymm0 vmovdqa ymm6,ymm4 vmovdqa ymm10,ymm8 vmovdqa ymm3,ymm0 vmovdqa ymm7,ymm4 vmovdqa ymm11,ymm8 vmovdqa 
ymm12,YMMWORD[$L$avx2_inc] vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp] vpaddd ymm14,ymm12,ymm15 vpaddd ymm13,ymm12,ymm14 vpaddd ymm12,ymm12,ymm13 vmovdqa YMMWORD[(160+256)+rbp],ymm15 vmovdqa YMMWORD[(160+224)+rbp],ymm14 vmovdqa YMMWORD[(160+192)+rbp],ymm13 vmovdqa YMMWORD[(160+160)+rbp],ymm12 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vmovdqa ymm8,YMMWORD[$L$rol16] vpaddd ymm3,ymm3,ymm7 vpaddd ymm2,ymm2,ymm6 vpaddd ymm1,ymm1,ymm5 vpaddd ymm0,ymm0,ymm4 vpxor ymm15,ymm15,ymm3 vpxor ymm14,ymm14,ymm2 vpxor ymm13,ymm13,ymm1 vpxor ymm12,ymm12,ymm0 vpshufb ymm15,ymm15,ymm8 vpshufb ymm14,ymm14,ymm8 vpshufb ymm13,ymm13,ymm8 vpshufb ymm12,ymm12,ymm8 vpaddd ymm11,ymm11,ymm15 vpaddd ymm10,ymm10,ymm14 vpaddd ymm9,ymm9,ymm13 vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] vpxor ymm7,ymm7,ymm11 vpxor ymm6,ymm6,ymm10 vpxor ymm5,ymm5,ymm9 vpxor ymm4,ymm4,ymm8 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vpsrld ymm8,ymm7,20 vpslld ymm7,ymm7,32-20 vpxor ymm7,ymm7,ymm8 vpsrld ymm8,ymm6,20 vpslld ymm6,ymm6,32-20 vpxor ymm6,ymm6,ymm8 vpsrld ymm8,ymm5,20 vpslld ymm5,ymm5,32-20 vpxor ymm5,ymm5,ymm8 vpsrld ymm8,ymm4,20 vpslld ymm4,ymm4,32-20 vpxor ymm4,ymm4,ymm8 vmovdqa ymm8,YMMWORD[$L$rol8] vpaddd ymm3,ymm3,ymm7 vpaddd ymm2,ymm2,ymm6 vpaddd ymm1,ymm1,ymm5 vpaddd ymm0,ymm0,ymm4 vpxor ymm15,ymm15,ymm3 vpxor ymm14,ymm14,ymm2 vpxor ymm13,ymm13,ymm1 vpxor ymm12,ymm12,ymm0 vpshufb ymm15,ymm15,ymm8 vpshufb ymm14,ymm14,ymm8 vpshufb ymm13,ymm13,ymm8 vpshufb ymm12,ymm12,ymm8 vpaddd ymm11,ymm11,ymm15 vpaddd ymm10,ymm10,ymm14 vpaddd ymm9,ymm9,ymm13 vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] vpxor ymm7,ymm7,ymm11 vpxor ymm6,ymm6,ymm10 vpxor ymm5,ymm5,ymm9 vpxor ymm4,ymm4,ymm8 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vpsrld ymm8,ymm7,25 vpslld ymm7,ymm7,32-25 vpxor ymm7,ymm7,ymm8 vpsrld ymm8,ymm6,25 vpslld ymm6,ymm6,32-25 vpxor ymm6,ymm6,ymm8 vpsrld ymm8,ymm5,25 vpslld ymm5,ymm5,32-25 vpxor ymm5,ymm5,ymm8 vpsrld ymm8,ymm4,25 vpslld ymm4,ymm4,32-25 vpxor ymm4,ymm4,ymm8 vmovdqa ymm8,YMMWORD[((160+128))+rbp] vpalignr ymm7,ymm7,ymm7,4 vpalignr ymm11,ymm11,ymm11,8 vpalignr ymm15,ymm15,ymm15,12 vpalignr ymm6,ymm6,ymm6,4 vpalignr ymm10,ymm10,ymm10,8 vpalignr ymm14,ymm14,ymm14,12 vpalignr ymm5,ymm5,ymm5,4 vpalignr ymm9,ymm9,ymm9,8 vpalignr ymm13,ymm13,ymm13,12 vpalignr ymm4,ymm4,ymm4,4 vpalignr ymm8,ymm8,ymm8,8 vpalignr ymm12,ymm12,ymm12,12 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vmovdqa ymm8,YMMWORD[$L$rol16] vpaddd ymm3,ymm3,ymm7 vpaddd ymm2,ymm2,ymm6 vpaddd ymm1,ymm1,ymm5 vpaddd ymm0,ymm0,ymm4 vpxor ymm15,ymm15,ymm3 vpxor ymm14,ymm14,ymm2 vpxor ymm13,ymm13,ymm1 vpxor ymm12,ymm12,ymm0 vpshufb ymm15,ymm15,ymm8 vpshufb ymm14,ymm14,ymm8 vpshufb ymm13,ymm13,ymm8 vpshufb ymm12,ymm12,ymm8 vpaddd ymm11,ymm11,ymm15 vpaddd ymm10,ymm10,ymm14 vpaddd ymm9,ymm9,ymm13 vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] vpxor ymm7,ymm7,ymm11 vpxor ymm6,ymm6,ymm10 vpxor ymm5,ymm5,ymm9 vpxor ymm4,ymm4,ymm8 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vpsrld ymm8,ymm7,20 vpslld ymm7,ymm7,32-20 vpxor ymm7,ymm7,ymm8 vpsrld ymm8,ymm6,20 vpslld ymm6,ymm6,32-20 vpxor ymm6,ymm6,ymm8 vpsrld ymm8,ymm5,20 vpslld ymm5,ymm5,32-20 vpxor ymm5,ymm5,ymm8 vpsrld ymm8,ymm4,20 vpslld ymm4,ymm4,32-20 vpxor ymm4,ymm4,ymm8 vmovdqa ymm8,YMMWORD[$L$rol8] vpaddd ymm3,ymm3,ymm7 vpaddd ymm2,ymm2,ymm6 vpaddd ymm1,ymm1,ymm5 vpaddd ymm0,ymm0,ymm4 vpxor ymm15,ymm15,ymm3 vpxor ymm14,ymm14,ymm2 vpxor ymm13,ymm13,ymm1 vpxor ymm12,ymm12,ymm0 vpshufb ymm15,ymm15,ymm8 vpshufb ymm14,ymm14,ymm8 vpshufb ymm13,ymm13,ymm8 vpshufb ymm12,ymm12,ymm8 vpaddd ymm11,ymm11,ymm15 vpaddd ymm10,ymm10,ymm14 vpaddd ymm9,ymm9,ymm13 vpaddd 
ymm8,ymm12,YMMWORD[((160+128))+rbp] vpxor ymm7,ymm7,ymm11 vpxor ymm6,ymm6,ymm10 vpxor ymm5,ymm5,ymm9 vpxor ymm4,ymm4,ymm8 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vpsrld ymm8,ymm7,25 vpslld ymm7,ymm7,32-25 vpxor ymm7,ymm7,ymm8 vpsrld ymm8,ymm6,25 vpslld ymm6,ymm6,32-25 vpxor ymm6,ymm6,ymm8 vpsrld ymm8,ymm5,25 vpslld ymm5,ymm5,32-25 vpxor ymm5,ymm5,ymm8 vpsrld ymm8,ymm4,25 vpslld ymm4,ymm4,32-25 vpxor ymm4,ymm4,ymm8 vmovdqa ymm8,YMMWORD[((160+128))+rbp] vpalignr ymm7,ymm7,ymm7,12 vpalignr ymm11,ymm11,ymm11,8 vpalignr ymm15,ymm15,ymm15,4 vpalignr ymm6,ymm6,ymm6,12 vpalignr ymm10,ymm10,ymm10,8 vpalignr ymm14,ymm14,ymm14,4 vpalignr ymm5,ymm5,ymm5,12 vpalignr ymm9,ymm9,ymm9,8 vpalignr ymm13,ymm13,ymm13,4 vpalignr ymm4,ymm4,ymm4,12 vpalignr ymm8,ymm8,ymm8,8 vpalignr ymm12,ymm12,ymm12,4 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vmovdqa ymm8,YMMWORD[$L$rol16] vpaddd ymm3,ymm3,ymm7 vpaddd ymm2,ymm2,ymm6 vpaddd ymm1,ymm1,ymm5 vpaddd ymm0,ymm0,ymm4 vpxor ymm15,ymm15,ymm3 vpxor ymm14,ymm14,ymm2 vpxor ymm13,ymm13,ymm1 vpxor ymm12,ymm12,ymm0 vpshufb ymm15,ymm15,ymm8 vpshufb ymm14,ymm14,ymm8 vpshufb ymm13,ymm13,ymm8 vpshufb ymm12,ymm12,ymm8 vpaddd ymm11,ymm11,ymm15 vpaddd ymm10,ymm10,ymm14 vpaddd ymm9,ymm9,ymm13 vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] vpxor ymm7,ymm7,ymm11 vpxor ymm6,ymm6,ymm10 vpxor ymm5,ymm5,ymm9 vpxor ymm4,ymm4,ymm8 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vpsrld ymm8,ymm7,20 vpslld ymm7,ymm7,32-20 vpxor ymm7,ymm7,ymm8 vpsrld ymm8,ymm6,20 vpslld ymm6,ymm6,32-20 vpxor ymm6,ymm6,ymm8 vpsrld ymm8,ymm5,20 vpslld ymm5,ymm5,32-20 vpxor ymm5,ymm5,ymm8 vpsrld ymm8,ymm4,20 vpslld ymm4,ymm4,32-20 vpxor ymm4,ymm4,ymm8 vmovdqa ymm8,YMMWORD[$L$rol8] vpaddd ymm3,ymm3,ymm7 vpaddd ymm2,ymm2,ymm6 vpaddd ymm1,ymm1,ymm5 vpaddd ymm0,ymm0,ymm4 vpxor ymm15,ymm15,ymm3 sub rdi,16 mov rcx,9 jmp NEAR $L$seal_avx2_main_loop_rounds_entry ALIGN 32 $L$seal_avx2_main_loop: vmovdqa ymm0,YMMWORD[$L$chacha20_consts] vmovdqa ymm4,YMMWORD[((160+64))+rbp] vmovdqa ymm8,YMMWORD[((160+96))+rbp] vmovdqa ymm1,ymm0 vmovdqa ymm5,ymm4 vmovdqa ymm9,ymm8 vmovdqa ymm2,ymm0 vmovdqa ymm6,ymm4 vmovdqa ymm10,ymm8 vmovdqa ymm3,ymm0 vmovdqa ymm7,ymm4 vmovdqa ymm11,ymm8 vmovdqa ymm12,YMMWORD[$L$avx2_inc] vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp] vpaddd ymm14,ymm12,ymm15 vpaddd ymm13,ymm12,ymm14 vpaddd ymm12,ymm12,ymm13 vmovdqa YMMWORD[(160+256)+rbp],ymm15 vmovdqa YMMWORD[(160+224)+rbp],ymm14 vmovdqa YMMWORD[(160+192)+rbp],ymm13 vmovdqa YMMWORD[(160+160)+rbp],ymm12 mov rcx,10 ALIGN 32 $L$seal_avx2_main_loop_rounds: add r10,QWORD[((0+0))+rdi] adc r11,QWORD[((8+0))+rdi] adc r12,1 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vmovdqa ymm8,YMMWORD[$L$rol16] vpaddd ymm3,ymm3,ymm7 vpaddd ymm2,ymm2,ymm6 vpaddd ymm1,ymm1,ymm5 vpaddd ymm0,ymm0,ymm4 vpxor ymm15,ymm15,ymm3 vpxor ymm14,ymm14,ymm2 vpxor ymm13,ymm13,ymm1 vpxor ymm12,ymm12,ymm0 mov rdx,QWORD[((0+160+0))+rbp] mov r15,rdx mulx r14,r13,r10 mulx rdx,rax,r11 imul r15,r12 add r14,rax adc r15,rdx vpshufb ymm15,ymm15,ymm8 vpshufb ymm14,ymm14,ymm8 vpshufb ymm13,ymm13,ymm8 vpshufb ymm12,ymm12,ymm8 vpaddd ymm11,ymm11,ymm15 vpaddd ymm10,ymm10,ymm14 vpaddd ymm9,ymm9,ymm13 vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] vpxor ymm7,ymm7,ymm11 mov rdx,QWORD[((8+160+0))+rbp] mulx rax,r10,r10 add r14,r10 mulx r9,r11,r11 adc r15,r11 adc r9,0 imul rdx,r12 vpxor ymm6,ymm6,ymm10 vpxor ymm5,ymm5,ymm9 vpxor ymm4,ymm4,ymm8 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vpsrld ymm8,ymm7,20 vpslld ymm7,ymm7,32-20 vpxor ymm7,ymm7,ymm8 vpsrld ymm8,ymm6,20 vpslld ymm6,ymm6,32-20 vpxor ymm6,ymm6,ymm8 vpsrld ymm8,ymm5,20 vpslld ymm5,ymm5,32-20 add r15,rax 
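; Editor annotation (descriptive only, inferred from the surrounding code):
; within $L$seal_avx2_main_loop_rounds the scalar Poly1305 update - the
; MULX-based multiply by the clamped key limbs held at 0+160+0(rbp) and
; 8+160+0(rbp), followed by the shrd/shr-by-2 partial reduction modulo
; 2^130-5 - is interleaved instruction by instruction with the AVX2 ChaCha20
; quarter-round vector ops, so the scalar and vector execution units proceed
; in parallel.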
adc r9,rdx vpxor ymm5,ymm5,ymm8 vpsrld ymm8,ymm4,20 vpslld ymm4,ymm4,32-20 vpxor ymm4,ymm4,ymm8 vmovdqa ymm8,YMMWORD[$L$rol8] vpaddd ymm3,ymm3,ymm7 vpaddd ymm2,ymm2,ymm6 vpaddd ymm1,ymm1,ymm5 vpaddd ymm0,ymm0,ymm4 vpxor ymm15,ymm15,ymm3 mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 $L$seal_avx2_main_loop_rounds_entry: vpxor ymm14,ymm14,ymm2 vpxor ymm13,ymm13,ymm1 vpxor ymm12,ymm12,ymm0 vpshufb ymm15,ymm15,ymm8 vpshufb ymm14,ymm14,ymm8 vpshufb ymm13,ymm13,ymm8 vpshufb ymm12,ymm12,ymm8 vpaddd ymm11,ymm11,ymm15 vpaddd ymm10,ymm10,ymm14 add r10,QWORD[((0+16))+rdi] adc r11,QWORD[((8+16))+rdi] adc r12,1 vpaddd ymm9,ymm9,ymm13 vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] vpxor ymm7,ymm7,ymm11 vpxor ymm6,ymm6,ymm10 vpxor ymm5,ymm5,ymm9 vpxor ymm4,ymm4,ymm8 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vpsrld ymm8,ymm7,25 mov rdx,QWORD[((0+160+0))+rbp] mov r15,rdx mulx r14,r13,r10 mulx rdx,rax,r11 imul r15,r12 add r14,rax adc r15,rdx vpslld ymm7,ymm7,32-25 vpxor ymm7,ymm7,ymm8 vpsrld ymm8,ymm6,25 vpslld ymm6,ymm6,32-25 vpxor ymm6,ymm6,ymm8 vpsrld ymm8,ymm5,25 vpslld ymm5,ymm5,32-25 vpxor ymm5,ymm5,ymm8 vpsrld ymm8,ymm4,25 vpslld ymm4,ymm4,32-25 vpxor ymm4,ymm4,ymm8 vmovdqa ymm8,YMMWORD[((160+128))+rbp] vpalignr ymm7,ymm7,ymm7,4 vpalignr ymm11,ymm11,ymm11,8 vpalignr ymm15,ymm15,ymm15,12 vpalignr ymm6,ymm6,ymm6,4 vpalignr ymm10,ymm10,ymm10,8 vpalignr ymm14,ymm14,ymm14,12 mov rdx,QWORD[((8+160+0))+rbp] mulx rax,r10,r10 add r14,r10 mulx r9,r11,r11 adc r15,r11 adc r9,0 imul rdx,r12 vpalignr ymm5,ymm5,ymm5,4 vpalignr ymm9,ymm9,ymm9,8 vpalignr ymm13,ymm13,ymm13,12 vpalignr ymm4,ymm4,ymm4,4 vpalignr ymm8,ymm8,ymm8,8 vpalignr ymm12,ymm12,ymm12,12 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vmovdqa ymm8,YMMWORD[$L$rol16] vpaddd ymm3,ymm3,ymm7 vpaddd ymm2,ymm2,ymm6 vpaddd ymm1,ymm1,ymm5 vpaddd ymm0,ymm0,ymm4 vpxor ymm15,ymm15,ymm3 vpxor ymm14,ymm14,ymm2 vpxor ymm13,ymm13,ymm1 vpxor ymm12,ymm12,ymm0 vpshufb ymm15,ymm15,ymm8 vpshufb ymm14,ymm14,ymm8 add r15,rax adc r9,rdx vpshufb ymm13,ymm13,ymm8 vpshufb ymm12,ymm12,ymm8 vpaddd ymm11,ymm11,ymm15 vpaddd ymm10,ymm10,ymm14 vpaddd ymm9,ymm9,ymm13 vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] vpxor ymm7,ymm7,ymm11 vpxor ymm6,ymm6,ymm10 vpxor ymm5,ymm5,ymm9 mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 vpxor ymm4,ymm4,ymm8 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vpsrld ymm8,ymm7,20 vpslld ymm7,ymm7,32-20 vpxor ymm7,ymm7,ymm8 vpsrld ymm8,ymm6,20 vpslld ymm6,ymm6,32-20 vpxor ymm6,ymm6,ymm8 add r10,QWORD[((0+32))+rdi] adc r11,QWORD[((8+32))+rdi] adc r12,1 lea rdi,[48+rdi] vpsrld ymm8,ymm5,20 vpslld ymm5,ymm5,32-20 vpxor ymm5,ymm5,ymm8 vpsrld ymm8,ymm4,20 vpslld ymm4,ymm4,32-20 vpxor ymm4,ymm4,ymm8 vmovdqa ymm8,YMMWORD[$L$rol8] vpaddd ymm3,ymm3,ymm7 vpaddd ymm2,ymm2,ymm6 vpaddd ymm1,ymm1,ymm5 vpaddd ymm0,ymm0,ymm4 vpxor ymm15,ymm15,ymm3 vpxor ymm14,ymm14,ymm2 vpxor ymm13,ymm13,ymm1 vpxor ymm12,ymm12,ymm0 vpshufb ymm15,ymm15,ymm8 vpshufb ymm14,ymm14,ymm8 vpshufb ymm13,ymm13,ymm8 mov rdx,QWORD[((0+160+0))+rbp] mov r15,rdx mulx r14,r13,r10 mulx rdx,rax,r11 imul r15,r12 add r14,rax adc r15,rdx vpshufb ymm12,ymm12,ymm8 vpaddd ymm11,ymm11,ymm15 vpaddd ymm10,ymm10,ymm14 vpaddd ymm9,ymm9,ymm13 vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] vpxor ymm7,ymm7,ymm11 vpxor ymm6,ymm6,ymm10 vpxor ymm5,ymm5,ymm9 mov rdx,QWORD[((8+160+0))+rbp] mulx rax,r10,r10 add r14,r10 mulx r9,r11,r11 adc 
r15,r11 adc r9,0 imul rdx,r12 vpxor ymm4,ymm4,ymm8 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vpsrld ymm8,ymm7,25 vpslld ymm7,ymm7,32-25 vpxor ymm7,ymm7,ymm8 vpsrld ymm8,ymm6,25 vpslld ymm6,ymm6,32-25 vpxor ymm6,ymm6,ymm8 add r15,rax adc r9,rdx vpsrld ymm8,ymm5,25 vpslld ymm5,ymm5,32-25 vpxor ymm5,ymm5,ymm8 vpsrld ymm8,ymm4,25 vpslld ymm4,ymm4,32-25 vpxor ymm4,ymm4,ymm8 vmovdqa ymm8,YMMWORD[((160+128))+rbp] vpalignr ymm7,ymm7,ymm7,12 vpalignr ymm11,ymm11,ymm11,8 vpalignr ymm15,ymm15,ymm15,4 vpalignr ymm6,ymm6,ymm6,12 vpalignr ymm10,ymm10,ymm10,8 vpalignr ymm14,ymm14,ymm14,4 vpalignr ymm5,ymm5,ymm5,12 vpalignr ymm9,ymm9,ymm9,8 vpalignr ymm13,ymm13,ymm13,4 vpalignr ymm4,ymm4,ymm4,12 vpalignr ymm8,ymm8,ymm8,8 mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 vpalignr ymm12,ymm12,ymm12,4 dec rcx jne NEAR $L$seal_avx2_main_loop_rounds vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts] vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp] vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp] vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp] vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp] vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] vmovdqa YMMWORD[(160+128)+rbp],ymm0 add r10,QWORD[((0+0))+rdi] adc r11,QWORD[((8+0))+rdi] adc r12,1 mov rdx,QWORD[((0+160+0))+rbp] mov r15,rdx mulx r14,r13,r10 mulx rdx,rax,r11 imul r15,r12 add r14,rax adc r15,rdx mov rdx,QWORD[((8+160+0))+rbp] mulx rax,r10,r10 add r14,r10 mulx r9,r11,r11 adc r15,r11 adc r9,0 imul rdx,r12 add r15,rax adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 add r10,QWORD[((0+16))+rdi] adc r11,QWORD[((8+16))+rdi] adc r12,1 mov rdx,QWORD[((0+160+0))+rbp] mov r15,rdx mulx r14,r13,r10 mulx rdx,rax,r11 imul r15,r12 add r14,rax adc r15,rdx mov rdx,QWORD[((8+160+0))+rbp] mulx rax,r10,r10 add r14,r10 mulx r9,r11,r11 adc r15,r11 adc r9,0 imul rdx,r12 add r15,rax adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 lea rdi,[32+rdi] vperm2i128 ymm0,ymm7,ymm3,0x02 vperm2i128 ymm7,ymm7,ymm3,0x13 vperm2i128 ymm3,ymm15,ymm11,0x02 vperm2i128 ymm11,ymm15,ymm11,0x13 vpxor ymm0,ymm0,YMMWORD[((0+0))+rsi] vpxor ymm3,ymm3,YMMWORD[((32+0))+rsi] vpxor ymm7,ymm7,YMMWORD[((64+0))+rsi] vpxor ymm11,ymm11,YMMWORD[((96+0))+rsi] vmovdqu YMMWORD[(0+0)+rdi],ymm0 vmovdqu YMMWORD[(32+0)+rdi],ymm3 vmovdqu YMMWORD[(64+0)+rdi],ymm7 vmovdqu YMMWORD[(96+0)+rdi],ymm11 vmovdqa ymm0,YMMWORD[((160+128))+rbp] vperm2i128 ymm3,ymm6,ymm2,0x02 vperm2i128 ymm6,ymm6,ymm2,0x13 vperm2i128 ymm2,ymm14,ymm10,0x02 vperm2i128 ymm10,ymm14,ymm10,0x13 vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi] vpxor ymm2,ymm2,YMMWORD[((32+128))+rsi] vpxor ymm6,ymm6,YMMWORD[((64+128))+rsi] vpxor ymm10,ymm10,YMMWORD[((96+128))+rsi] vmovdqu YMMWORD[(0+128)+rdi],ymm3 vmovdqu YMMWORD[(32+128)+rdi],ymm2 vmovdqu YMMWORD[(64+128)+rdi],ymm6 vmovdqu YMMWORD[(96+128)+rdi],ymm10 vperm2i128 ymm3,ymm5,ymm1,0x02 vperm2i128 
ymm5,ymm5,ymm1,0x13 vperm2i128 ymm1,ymm13,ymm9,0x02 vperm2i128 ymm9,ymm13,ymm9,0x13 vpxor ymm3,ymm3,YMMWORD[((0+256))+rsi] vpxor ymm1,ymm1,YMMWORD[((32+256))+rsi] vpxor ymm5,ymm5,YMMWORD[((64+256))+rsi] vpxor ymm9,ymm9,YMMWORD[((96+256))+rsi] vmovdqu YMMWORD[(0+256)+rdi],ymm3 vmovdqu YMMWORD[(32+256)+rdi],ymm1 vmovdqu YMMWORD[(64+256)+rdi],ymm5 vmovdqu YMMWORD[(96+256)+rdi],ymm9 vperm2i128 ymm3,ymm4,ymm0,0x02 vperm2i128 ymm4,ymm4,ymm0,0x13 vperm2i128 ymm0,ymm12,ymm8,0x02 vperm2i128 ymm8,ymm12,ymm8,0x13 vpxor ymm3,ymm3,YMMWORD[((0+384))+rsi] vpxor ymm0,ymm0,YMMWORD[((32+384))+rsi] vpxor ymm4,ymm4,YMMWORD[((64+384))+rsi] vpxor ymm8,ymm8,YMMWORD[((96+384))+rsi] vmovdqu YMMWORD[(0+384)+rdi],ymm3 vmovdqu YMMWORD[(32+384)+rdi],ymm0 vmovdqu YMMWORD[(64+384)+rdi],ymm4 vmovdqu YMMWORD[(96+384)+rdi],ymm8 lea rsi,[512+rsi] sub rbx,16*32 cmp rbx,16*32 jg NEAR $L$seal_avx2_main_loop add r10,QWORD[((0+0))+rdi] adc r11,QWORD[((8+0))+rdi] adc r12,1 mov rdx,QWORD[((0+160+0))+rbp] mov r15,rdx mulx r14,r13,r10 mulx rdx,rax,r11 imul r15,r12 add r14,rax adc r15,rdx mov rdx,QWORD[((8+160+0))+rbp] mulx rax,r10,r10 add r14,r10 mulx r9,r11,r11 adc r15,r11 adc r9,0 imul rdx,r12 add r15,rax adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 add r10,QWORD[((0+16))+rdi] adc r11,QWORD[((8+16))+rdi] adc r12,1 mov rdx,QWORD[((0+160+0))+rbp] mov r15,rdx mulx r14,r13,r10 mulx rdx,rax,r11 imul r15,r12 add r14,rax adc r15,rdx mov rdx,QWORD[((8+160+0))+rbp] mulx rax,r10,r10 add r14,r10 mulx r9,r11,r11 adc r15,r11 adc r9,0 imul rdx,r12 add r15,rax adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 lea rdi,[32+rdi] mov rcx,10 xor r8,r8 cmp rbx,12*32 ja NEAR $L$seal_avx2_tail_512 cmp rbx,8*32 ja NEAR $L$seal_avx2_tail_384 cmp rbx,4*32 ja NEAR $L$seal_avx2_tail_256 $L$seal_avx2_tail_128: vmovdqa ymm0,YMMWORD[$L$chacha20_consts] vmovdqa ymm4,YMMWORD[((160+64))+rbp] vmovdqa ymm8,YMMWORD[((160+96))+rbp] vmovdqa ymm12,YMMWORD[$L$avx2_inc] vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] vmovdqa YMMWORD[(160+160)+rbp],ymm12 $L$seal_avx2_tail_128_rounds_and_3xhash: add r10,QWORD[((0+0))+rdi] adc r11,QWORD[((8+0))+rdi] adc r12,1 mov rdx,QWORD[((0+160+0))+rbp] mov r15,rdx mulx r14,r13,r10 mulx rdx,rax,r11 imul r15,r12 add r14,rax adc r15,rdx mov rdx,QWORD[((8+160+0))+rbp] mulx rax,r10,r10 add r14,r10 mulx r9,r11,r11 adc r15,r11 adc r9,0 imul rdx,r12 add r15,rax adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 lea rdi,[16+rdi] $L$seal_avx2_tail_128_rounds_and_2xhash: vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol16] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpsrld ymm3,ymm4,20 vpslld ymm4,ymm4,12 vpxor ymm4,ymm4,ymm3 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol8] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpslld ymm3,ymm4,7 vpsrld ymm4,ymm4,25 vpxor ymm4,ymm4,ymm3 vpalignr ymm12,ymm12,ymm12,12 vpalignr ymm8,ymm8,ymm8,8 vpalignr ymm4,ymm4,ymm4,4 add r10,QWORD[((0+0))+rdi] adc r11,QWORD[((8+0))+rdi] adc r12,1 mov rdx,QWORD[((0+160+0))+rbp] mov r15,rdx mulx r14,r13,r10 mulx rdx,rax,r11 imul r15,r12 add r14,rax adc r15,rdx mov rdx,QWORD[((8+160+0))+rbp] mulx rax,r10,r10 add r14,r10 mulx r9,r11,r11 adc r15,r11 adc r9,0 imul 
rdx,r12 add r15,rax adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol16] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpsrld ymm3,ymm4,20 vpslld ymm4,ymm4,12 vpxor ymm4,ymm4,ymm3 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol8] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpslld ymm3,ymm4,7 vpsrld ymm4,ymm4,25 vpxor ymm4,ymm4,ymm3 vpalignr ymm12,ymm12,ymm12,4 vpalignr ymm8,ymm8,ymm8,8 vpalignr ymm4,ymm4,ymm4,12 add r10,QWORD[((0+16))+rdi] adc r11,QWORD[((8+16))+rdi] adc r12,1 mov rdx,QWORD[((0+160+0))+rbp] mov r15,rdx mulx r14,r13,r10 mulx rdx,rax,r11 imul r15,r12 add r14,rax adc r15,rdx mov rdx,QWORD[((8+160+0))+rbp] mulx rax,r10,r10 add r14,r10 mulx r9,r11,r11 adc r15,r11 adc r9,0 imul rdx,r12 add r15,rax adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 lea rdi,[32+rdi] dec rcx jg NEAR $L$seal_avx2_tail_128_rounds_and_3xhash dec r8 jge NEAR $L$seal_avx2_tail_128_rounds_and_2xhash vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] vperm2i128 ymm3,ymm4,ymm0,0x13 vperm2i128 ymm0,ymm4,ymm0,0x02 vperm2i128 ymm4,ymm12,ymm8,0x02 vperm2i128 ymm12,ymm12,ymm8,0x13 vmovdqa ymm8,ymm3 jmp NEAR $L$seal_avx2_short_loop $L$seal_avx2_tail_256: vmovdqa ymm0,YMMWORD[$L$chacha20_consts] vmovdqa ymm4,YMMWORD[((160+64))+rbp] vmovdqa ymm8,YMMWORD[((160+96))+rbp] vmovdqa ymm1,ymm0 vmovdqa ymm5,ymm4 vmovdqa ymm9,ymm8 vmovdqa ymm12,YMMWORD[$L$avx2_inc] vpaddd ymm13,ymm12,YMMWORD[((160+160))+rbp] vpaddd ymm12,ymm12,ymm13 vmovdqa YMMWORD[(160+160)+rbp],ymm12 vmovdqa YMMWORD[(160+192)+rbp],ymm13 $L$seal_avx2_tail_256_rounds_and_3xhash: add r10,QWORD[((0+0))+rdi] adc r11,QWORD[((8+0))+rdi] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 lea rdi,[16+rdi] $L$seal_avx2_tail_256_rounds_and_2xhash: vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol16] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpsrld ymm3,ymm4,20 vpslld ymm4,ymm4,12 vpxor ymm4,ymm4,ymm3 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol8] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpslld ymm3,ymm4,7 vpsrld ymm4,ymm4,25 vpxor ymm4,ymm4,ymm3 vpalignr ymm12,ymm12,ymm12,12 vpalignr ymm8,ymm8,ymm8,8 vpalignr ymm4,ymm4,ymm4,4 vpaddd ymm1,ymm1,ymm5 vpxor ymm13,ymm13,ymm1 vpshufb ymm13,ymm13,YMMWORD[$L$rol16] vpaddd ymm9,ymm9,ymm13 vpxor ymm5,ymm5,ymm9 vpsrld ymm3,ymm5,20 vpslld ymm5,ymm5,12 vpxor ymm5,ymm5,ymm3 vpaddd ymm1,ymm1,ymm5 vpxor ymm13,ymm13,ymm1 vpshufb ymm13,ymm13,YMMWORD[$L$rol8] vpaddd ymm9,ymm9,ymm13 vpxor ymm5,ymm5,ymm9 vpslld ymm3,ymm5,7 vpsrld ymm5,ymm5,25 vpxor ymm5,ymm5,ymm3 vpalignr ymm13,ymm13,ymm13,12 vpalignr ymm9,ymm9,ymm9,8 vpalignr ymm5,ymm5,ymm5,4 add 
r10,QWORD[((0+0))+rdi] adc r11,QWORD[((8+0))+rdi] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol16] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpsrld ymm3,ymm4,20 vpslld ymm4,ymm4,12 vpxor ymm4,ymm4,ymm3 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol8] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpslld ymm3,ymm4,7 vpsrld ymm4,ymm4,25 vpxor ymm4,ymm4,ymm3 vpalignr ymm12,ymm12,ymm12,4 vpalignr ymm8,ymm8,ymm8,8 vpalignr ymm4,ymm4,ymm4,12 vpaddd ymm1,ymm1,ymm5 vpxor ymm13,ymm13,ymm1 vpshufb ymm13,ymm13,YMMWORD[$L$rol16] vpaddd ymm9,ymm9,ymm13 vpxor ymm5,ymm5,ymm9 vpsrld ymm3,ymm5,20 vpslld ymm5,ymm5,12 vpxor ymm5,ymm5,ymm3 vpaddd ymm1,ymm1,ymm5 vpxor ymm13,ymm13,ymm1 vpshufb ymm13,ymm13,YMMWORD[$L$rol8] vpaddd ymm9,ymm9,ymm13 vpxor ymm5,ymm5,ymm9 vpslld ymm3,ymm5,7 vpsrld ymm5,ymm5,25 vpxor ymm5,ymm5,ymm3 vpalignr ymm13,ymm13,ymm13,4 vpalignr ymm9,ymm9,ymm9,8 vpalignr ymm5,ymm5,ymm5,12 add r10,QWORD[((0+16))+rdi] adc r11,QWORD[((8+16))+rdi] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 lea rdi,[32+rdi] dec rcx jg NEAR $L$seal_avx2_tail_256_rounds_and_3xhash dec r8 jge NEAR $L$seal_avx2_tail_256_rounds_and_2xhash vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] vperm2i128 ymm3,ymm5,ymm1,0x02 vperm2i128 ymm5,ymm5,ymm1,0x13 vperm2i128 ymm1,ymm13,ymm9,0x02 vperm2i128 ymm9,ymm13,ymm9,0x13 vpxor ymm3,ymm3,YMMWORD[((0+0))+rsi] vpxor ymm1,ymm1,YMMWORD[((32+0))+rsi] vpxor ymm5,ymm5,YMMWORD[((64+0))+rsi] vpxor ymm9,ymm9,YMMWORD[((96+0))+rsi] vmovdqu YMMWORD[(0+0)+rdi],ymm3 vmovdqu YMMWORD[(32+0)+rdi],ymm1 vmovdqu YMMWORD[(64+0)+rdi],ymm5 vmovdqu YMMWORD[(96+0)+rdi],ymm9 vperm2i128 ymm3,ymm4,ymm0,0x13 vperm2i128 ymm0,ymm4,ymm0,0x02 vperm2i128 ymm4,ymm12,ymm8,0x02 vperm2i128 ymm12,ymm12,ymm8,0x13 vmovdqa ymm8,ymm3 mov rcx,4*32 lea rsi,[128+rsi] sub rbx,4*32 jmp NEAR $L$seal_avx2_short_hash_remainder $L$seal_avx2_tail_384: vmovdqa ymm0,YMMWORD[$L$chacha20_consts] vmovdqa ymm4,YMMWORD[((160+64))+rbp] vmovdqa ymm8,YMMWORD[((160+96))+rbp] vmovdqa ymm1,ymm0 vmovdqa ymm5,ymm4 vmovdqa ymm9,ymm8 vmovdqa ymm2,ymm0 vmovdqa ymm6,ymm4 vmovdqa ymm10,ymm8 vmovdqa ymm12,YMMWORD[$L$avx2_inc] vpaddd ymm14,ymm12,YMMWORD[((160+160))+rbp] vpaddd ymm13,ymm12,ymm14 vpaddd ymm12,ymm12,ymm13 vmovdqa YMMWORD[(160+160)+rbp],ymm12 vmovdqa 
YMMWORD[(160+192)+rbp],ymm13 vmovdqa YMMWORD[(160+224)+rbp],ymm14 $L$seal_avx2_tail_384_rounds_and_3xhash: add r10,QWORD[((0+0))+rdi] adc r11,QWORD[((8+0))+rdi] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 lea rdi,[16+rdi] $L$seal_avx2_tail_384_rounds_and_2xhash: vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol16] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpsrld ymm3,ymm4,20 vpslld ymm4,ymm4,12 vpxor ymm4,ymm4,ymm3 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol8] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpslld ymm3,ymm4,7 vpsrld ymm4,ymm4,25 vpxor ymm4,ymm4,ymm3 vpalignr ymm12,ymm12,ymm12,12 vpalignr ymm8,ymm8,ymm8,8 vpalignr ymm4,ymm4,ymm4,4 vpaddd ymm1,ymm1,ymm5 vpxor ymm13,ymm13,ymm1 vpshufb ymm13,ymm13,YMMWORD[$L$rol16] vpaddd ymm9,ymm9,ymm13 vpxor ymm5,ymm5,ymm9 vpsrld ymm3,ymm5,20 vpslld ymm5,ymm5,12 vpxor ymm5,ymm5,ymm3 vpaddd ymm1,ymm1,ymm5 vpxor ymm13,ymm13,ymm1 vpshufb ymm13,ymm13,YMMWORD[$L$rol8] vpaddd ymm9,ymm9,ymm13 vpxor ymm5,ymm5,ymm9 vpslld ymm3,ymm5,7 vpsrld ymm5,ymm5,25 vpxor ymm5,ymm5,ymm3 vpalignr ymm13,ymm13,ymm13,12 vpalignr ymm9,ymm9,ymm9,8 vpalignr ymm5,ymm5,ymm5,4 add r10,QWORD[((0+0))+rdi] adc r11,QWORD[((8+0))+rdi] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 vpaddd ymm2,ymm2,ymm6 vpxor ymm14,ymm14,ymm2 vpshufb ymm14,ymm14,YMMWORD[$L$rol16] vpaddd ymm10,ymm10,ymm14 vpxor ymm6,ymm6,ymm10 vpsrld ymm3,ymm6,20 vpslld ymm6,ymm6,12 vpxor ymm6,ymm6,ymm3 vpaddd ymm2,ymm2,ymm6 vpxor ymm14,ymm14,ymm2 vpshufb ymm14,ymm14,YMMWORD[$L$rol8] vpaddd ymm10,ymm10,ymm14 vpxor ymm6,ymm6,ymm10 vpslld ymm3,ymm6,7 vpsrld ymm6,ymm6,25 vpxor ymm6,ymm6,ymm3 vpalignr ymm14,ymm14,ymm14,12 vpalignr ymm10,ymm10,ymm10,8 vpalignr ymm6,ymm6,ymm6,4 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol16] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpsrld ymm3,ymm4,20 vpslld ymm4,ymm4,12 vpxor ymm4,ymm4,ymm3 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol8] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpslld ymm3,ymm4,7 vpsrld ymm4,ymm4,25 vpxor ymm4,ymm4,ymm3 vpalignr ymm12,ymm12,ymm12,4 vpalignr ymm8,ymm8,ymm8,8 vpalignr ymm4,ymm4,ymm4,12 add r10,QWORD[((0+16))+rdi] adc r11,QWORD[((8+16))+rdi] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov 
r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 vpaddd ymm1,ymm1,ymm5 vpxor ymm13,ymm13,ymm1 vpshufb ymm13,ymm13,YMMWORD[$L$rol16] vpaddd ymm9,ymm9,ymm13 vpxor ymm5,ymm5,ymm9 vpsrld ymm3,ymm5,20 vpslld ymm5,ymm5,12 vpxor ymm5,ymm5,ymm3 vpaddd ymm1,ymm1,ymm5 vpxor ymm13,ymm13,ymm1 vpshufb ymm13,ymm13,YMMWORD[$L$rol8] vpaddd ymm9,ymm9,ymm13 vpxor ymm5,ymm5,ymm9 vpslld ymm3,ymm5,7 vpsrld ymm5,ymm5,25 vpxor ymm5,ymm5,ymm3 vpalignr ymm13,ymm13,ymm13,4 vpalignr ymm9,ymm9,ymm9,8 vpalignr ymm5,ymm5,ymm5,12 vpaddd ymm2,ymm2,ymm6 vpxor ymm14,ymm14,ymm2 vpshufb ymm14,ymm14,YMMWORD[$L$rol16] vpaddd ymm10,ymm10,ymm14 vpxor ymm6,ymm6,ymm10 vpsrld ymm3,ymm6,20 vpslld ymm6,ymm6,12 vpxor ymm6,ymm6,ymm3 vpaddd ymm2,ymm2,ymm6 vpxor ymm14,ymm14,ymm2 vpshufb ymm14,ymm14,YMMWORD[$L$rol8] vpaddd ymm10,ymm10,ymm14 vpxor ymm6,ymm6,ymm10 vpslld ymm3,ymm6,7 vpsrld ymm6,ymm6,25 vpxor ymm6,ymm6,ymm3 vpalignr ymm14,ymm14,ymm14,4 vpalignr ymm10,ymm10,ymm10,8 vpalignr ymm6,ymm6,ymm6,12 lea rdi,[32+rdi] dec rcx jg NEAR $L$seal_avx2_tail_384_rounds_and_3xhash dec r8 jge NEAR $L$seal_avx2_tail_384_rounds_and_2xhash vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp] vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] vperm2i128 ymm3,ymm6,ymm2,0x02 vperm2i128 ymm6,ymm6,ymm2,0x13 vperm2i128 ymm2,ymm14,ymm10,0x02 vperm2i128 ymm10,ymm14,ymm10,0x13 vpxor ymm3,ymm3,YMMWORD[((0+0))+rsi] vpxor ymm2,ymm2,YMMWORD[((32+0))+rsi] vpxor ymm6,ymm6,YMMWORD[((64+0))+rsi] vpxor ymm10,ymm10,YMMWORD[((96+0))+rsi] vmovdqu YMMWORD[(0+0)+rdi],ymm3 vmovdqu YMMWORD[(32+0)+rdi],ymm2 vmovdqu YMMWORD[(64+0)+rdi],ymm6 vmovdqu YMMWORD[(96+0)+rdi],ymm10 vperm2i128 ymm3,ymm5,ymm1,0x02 vperm2i128 ymm5,ymm5,ymm1,0x13 vperm2i128 ymm1,ymm13,ymm9,0x02 vperm2i128 ymm9,ymm13,ymm9,0x13 vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi] vpxor ymm1,ymm1,YMMWORD[((32+128))+rsi] vpxor ymm5,ymm5,YMMWORD[((64+128))+rsi] vpxor ymm9,ymm9,YMMWORD[((96+128))+rsi] vmovdqu YMMWORD[(0+128)+rdi],ymm3 vmovdqu YMMWORD[(32+128)+rdi],ymm1 vmovdqu YMMWORD[(64+128)+rdi],ymm5 vmovdqu YMMWORD[(96+128)+rdi],ymm9 vperm2i128 ymm3,ymm4,ymm0,0x13 vperm2i128 ymm0,ymm4,ymm0,0x02 vperm2i128 ymm4,ymm12,ymm8,0x02 vperm2i128 ymm12,ymm12,ymm8,0x13 vmovdqa ymm8,ymm3 mov rcx,8*32 lea rsi,[256+rsi] sub rbx,8*32 jmp NEAR $L$seal_avx2_short_hash_remainder $L$seal_avx2_tail_512: vmovdqa ymm0,YMMWORD[$L$chacha20_consts] vmovdqa ymm4,YMMWORD[((160+64))+rbp] vmovdqa ymm8,YMMWORD[((160+96))+rbp] vmovdqa ymm1,ymm0 vmovdqa ymm5,ymm4 vmovdqa ymm9,ymm8 vmovdqa ymm2,ymm0 vmovdqa ymm6,ymm4 vmovdqa ymm10,ymm8 vmovdqa ymm3,ymm0 vmovdqa ymm7,ymm4 vmovdqa ymm11,ymm8 vmovdqa ymm12,YMMWORD[$L$avx2_inc] vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp] vpaddd ymm14,ymm12,ymm15 vpaddd ymm13,ymm12,ymm14 vpaddd ymm12,ymm12,ymm13 vmovdqa YMMWORD[(160+256)+rbp],ymm15 vmovdqa YMMWORD[(160+224)+rbp],ymm14 vmovdqa YMMWORD[(160+192)+rbp],ymm13 vmovdqa YMMWORD[(160+160)+rbp],ymm12 $L$seal_avx2_tail_512_rounds_and_3xhash: add r10,QWORD[((0+0))+rdi] adc r11,QWORD[((8+0))+rdi] adc r12,1 mov rdx,QWORD[((0+160+0))+rbp] mov r15,rdx mulx r14,r13,r10 mulx rdx,rax,r11 imul r15,r12 add r14,rax 
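; Editor annotation (descriptive only): the $L$seal_avx2_tail_* paths handle
; the final partial batch of blocks; the "_3xhash"/"_2xhash" round variants
; keep absorbing ciphertext already written at rdi into the Poly1305 state
; while the remaining ChaCha20 rounds finish, before jumping to
; $L$seal_avx2_short_hash_remainder for the last bytes.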
adc r15,rdx mov rdx,QWORD[((8+160+0))+rbp] mulx rax,r10,r10 add r14,r10 mulx r9,r11,r11 adc r15,r11 adc r9,0 imul rdx,r12 add r15,rax adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 lea rdi,[16+rdi] $L$seal_avx2_tail_512_rounds_and_2xhash: vmovdqa YMMWORD[(160+128)+rbp],ymm8 vmovdqa ymm8,YMMWORD[$L$rol16] vpaddd ymm3,ymm3,ymm7 vpaddd ymm2,ymm2,ymm6 vpaddd ymm1,ymm1,ymm5 vpaddd ymm0,ymm0,ymm4 vpxor ymm15,ymm15,ymm3 vpxor ymm14,ymm14,ymm2 vpxor ymm13,ymm13,ymm1 vpxor ymm12,ymm12,ymm0 vpshufb ymm15,ymm15,ymm8 vpshufb ymm14,ymm14,ymm8 vpshufb ymm13,ymm13,ymm8 vpshufb ymm12,ymm12,ymm8 vpaddd ymm11,ymm11,ymm15 vpaddd ymm10,ymm10,ymm14 vpaddd ymm9,ymm9,ymm13 vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] vpxor ymm7,ymm7,ymm11 vpxor ymm6,ymm6,ymm10 add r10,QWORD[((0+0))+rdi] adc r11,QWORD[((8+0))+rdi] adc r12,1 vpxor ymm5,ymm5,ymm9 vpxor ymm4,ymm4,ymm8 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vpsrld ymm8,ymm7,20 vpslld ymm7,ymm7,32-20 vpxor ymm7,ymm7,ymm8 vpsrld ymm8,ymm6,20 vpslld ymm6,ymm6,32-20 vpxor ymm6,ymm6,ymm8 vpsrld ymm8,ymm5,20 vpslld ymm5,ymm5,32-20 vpxor ymm5,ymm5,ymm8 vpsrld ymm8,ymm4,20 vpslld ymm4,ymm4,32-20 vpxor ymm4,ymm4,ymm8 vmovdqa ymm8,YMMWORD[$L$rol8] vpaddd ymm3,ymm3,ymm7 vpaddd ymm2,ymm2,ymm6 vpaddd ymm1,ymm1,ymm5 vpaddd ymm0,ymm0,ymm4 mov rdx,QWORD[((0+160+0))+rbp] mov r15,rdx mulx r14,r13,r10 mulx rdx,rax,r11 imul r15,r12 add r14,rax adc r15,rdx vpxor ymm15,ymm15,ymm3 vpxor ymm14,ymm14,ymm2 vpxor ymm13,ymm13,ymm1 vpxor ymm12,ymm12,ymm0 vpshufb ymm15,ymm15,ymm8 vpshufb ymm14,ymm14,ymm8 vpshufb ymm13,ymm13,ymm8 vpshufb ymm12,ymm12,ymm8 vpaddd ymm11,ymm11,ymm15 vpaddd ymm10,ymm10,ymm14 vpaddd ymm9,ymm9,ymm13 vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] vpxor ymm7,ymm7,ymm11 vpxor ymm6,ymm6,ymm10 vpxor ymm5,ymm5,ymm9 vpxor ymm4,ymm4,ymm8 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vpsrld ymm8,ymm7,25 vpslld ymm7,ymm7,32-25 vpxor ymm7,ymm7,ymm8 mov rdx,QWORD[((8+160+0))+rbp] mulx rax,r10,r10 add r14,r10 mulx r9,r11,r11 adc r15,r11 adc r9,0 imul rdx,r12 vpsrld ymm8,ymm6,25 vpslld ymm6,ymm6,32-25 vpxor ymm6,ymm6,ymm8 vpsrld ymm8,ymm5,25 vpslld ymm5,ymm5,32-25 vpxor ymm5,ymm5,ymm8 vpsrld ymm8,ymm4,25 vpslld ymm4,ymm4,32-25 vpxor ymm4,ymm4,ymm8 vmovdqa ymm8,YMMWORD[((160+128))+rbp] vpalignr ymm7,ymm7,ymm7,4 vpalignr ymm11,ymm11,ymm11,8 vpalignr ymm15,ymm15,ymm15,12 vpalignr ymm6,ymm6,ymm6,4 vpalignr ymm10,ymm10,ymm10,8 vpalignr ymm14,ymm14,ymm14,12 vpalignr ymm5,ymm5,ymm5,4 vpalignr ymm9,ymm9,ymm9,8 vpalignr ymm13,ymm13,ymm13,12 vpalignr ymm4,ymm4,ymm4,4 add r15,rax adc r9,rdx vpalignr ymm8,ymm8,ymm8,8 vpalignr ymm12,ymm12,ymm12,12 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vmovdqa ymm8,YMMWORD[$L$rol16] vpaddd ymm3,ymm3,ymm7 vpaddd ymm2,ymm2,ymm6 vpaddd ymm1,ymm1,ymm5 vpaddd ymm0,ymm0,ymm4 vpxor ymm15,ymm15,ymm3 vpxor ymm14,ymm14,ymm2 vpxor ymm13,ymm13,ymm1 vpxor ymm12,ymm12,ymm0 vpshufb ymm15,ymm15,ymm8 vpshufb ymm14,ymm14,ymm8 vpshufb ymm13,ymm13,ymm8 vpshufb ymm12,ymm12,ymm8 vpaddd ymm11,ymm11,ymm15 vpaddd ymm10,ymm10,ymm14 vpaddd ymm9,ymm9,ymm13 vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 vpxor ymm7,ymm7,ymm11 vpxor ymm6,ymm6,ymm10 vpxor ymm5,ymm5,ymm9 vpxor ymm4,ymm4,ymm8 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vpsrld ymm8,ymm7,20 vpslld ymm7,ymm7,32-20 vpxor ymm7,ymm7,ymm8 vpsrld ymm8,ymm6,20 vpslld ymm6,ymm6,32-20 vpxor 
ymm6,ymm6,ymm8 vpsrld ymm8,ymm5,20 vpslld ymm5,ymm5,32-20 vpxor ymm5,ymm5,ymm8 vpsrld ymm8,ymm4,20 vpslld ymm4,ymm4,32-20 vpxor ymm4,ymm4,ymm8 vmovdqa ymm8,YMMWORD[$L$rol8] vpaddd ymm3,ymm3,ymm7 vpaddd ymm2,ymm2,ymm6 add r10,QWORD[((0+16))+rdi] adc r11,QWORD[((8+16))+rdi] adc r12,1 vpaddd ymm1,ymm1,ymm5 vpaddd ymm0,ymm0,ymm4 vpxor ymm15,ymm15,ymm3 vpxor ymm14,ymm14,ymm2 vpxor ymm13,ymm13,ymm1 vpxor ymm12,ymm12,ymm0 vpshufb ymm15,ymm15,ymm8 vpshufb ymm14,ymm14,ymm8 vpshufb ymm13,ymm13,ymm8 vpshufb ymm12,ymm12,ymm8 vpaddd ymm11,ymm11,ymm15 vpaddd ymm10,ymm10,ymm14 vpaddd ymm9,ymm9,ymm13 vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] vpxor ymm7,ymm7,ymm11 vpxor ymm6,ymm6,ymm10 vpxor ymm5,ymm5,ymm9 vpxor ymm4,ymm4,ymm8 vmovdqa YMMWORD[(160+128)+rbp],ymm8 vpsrld ymm8,ymm7,25 mov rdx,QWORD[((0+160+0))+rbp] mov r15,rdx mulx r14,r13,r10 mulx rdx,rax,r11 imul r15,r12 add r14,rax adc r15,rdx vpslld ymm7,ymm7,32-25 vpxor ymm7,ymm7,ymm8 vpsrld ymm8,ymm6,25 vpslld ymm6,ymm6,32-25 vpxor ymm6,ymm6,ymm8 vpsrld ymm8,ymm5,25 vpslld ymm5,ymm5,32-25 vpxor ymm5,ymm5,ymm8 vpsrld ymm8,ymm4,25 vpslld ymm4,ymm4,32-25 vpxor ymm4,ymm4,ymm8 vmovdqa ymm8,YMMWORD[((160+128))+rbp] vpalignr ymm7,ymm7,ymm7,12 vpalignr ymm11,ymm11,ymm11,8 vpalignr ymm15,ymm15,ymm15,4 vpalignr ymm6,ymm6,ymm6,12 vpalignr ymm10,ymm10,ymm10,8 vpalignr ymm14,ymm14,ymm14,4 vpalignr ymm5,ymm5,ymm5,12 vpalignr ymm9,ymm9,ymm9,8 mov rdx,QWORD[((8+160+0))+rbp] mulx rax,r10,r10 add r14,r10 mulx r9,r11,r11 adc r15,r11 adc r9,0 imul rdx,r12 vpalignr ymm13,ymm13,ymm13,4 vpalignr ymm4,ymm4,ymm4,12 vpalignr ymm8,ymm8,ymm8,8 vpalignr ymm12,ymm12,ymm12,4 add r15,rax adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 lea rdi,[32+rdi] dec rcx jg NEAR $L$seal_avx2_tail_512_rounds_and_3xhash dec r8 jge NEAR $L$seal_avx2_tail_512_rounds_and_2xhash vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts] vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp] vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp] vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp] vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp] vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] vmovdqa YMMWORD[(160+128)+rbp],ymm0 vperm2i128 ymm0,ymm7,ymm3,0x02 vperm2i128 ymm7,ymm7,ymm3,0x13 vperm2i128 ymm3,ymm15,ymm11,0x02 vperm2i128 ymm11,ymm15,ymm11,0x13 vpxor ymm0,ymm0,YMMWORD[((0+0))+rsi] vpxor ymm3,ymm3,YMMWORD[((32+0))+rsi] vpxor ymm7,ymm7,YMMWORD[((64+0))+rsi] vpxor ymm11,ymm11,YMMWORD[((96+0))+rsi] vmovdqu YMMWORD[(0+0)+rdi],ymm0 vmovdqu YMMWORD[(32+0)+rdi],ymm3 vmovdqu YMMWORD[(64+0)+rdi],ymm7 vmovdqu YMMWORD[(96+0)+rdi],ymm11 vmovdqa ymm0,YMMWORD[((160+128))+rbp] vperm2i128 ymm3,ymm6,ymm2,0x02 vperm2i128 ymm6,ymm6,ymm2,0x13 vperm2i128 ymm2,ymm14,ymm10,0x02 vperm2i128 ymm10,ymm14,ymm10,0x13 vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi] vpxor ymm2,ymm2,YMMWORD[((32+128))+rsi] vpxor ymm6,ymm6,YMMWORD[((64+128))+rsi] vpxor ymm10,ymm10,YMMWORD[((96+128))+rsi] vmovdqu YMMWORD[(0+128)+rdi],ymm3 vmovdqu YMMWORD[(32+128)+rdi],ymm2 vmovdqu YMMWORD[(64+128)+rdi],ymm6 vmovdqu YMMWORD[(96+128)+rdi],ymm10 vperm2i128 
ymm3,ymm5,ymm1,0x02 vperm2i128 ymm5,ymm5,ymm1,0x13 vperm2i128 ymm1,ymm13,ymm9,0x02 vperm2i128 ymm9,ymm13,ymm9,0x13 vpxor ymm3,ymm3,YMMWORD[((0+256))+rsi] vpxor ymm1,ymm1,YMMWORD[((32+256))+rsi] vpxor ymm5,ymm5,YMMWORD[((64+256))+rsi] vpxor ymm9,ymm9,YMMWORD[((96+256))+rsi] vmovdqu YMMWORD[(0+256)+rdi],ymm3 vmovdqu YMMWORD[(32+256)+rdi],ymm1 vmovdqu YMMWORD[(64+256)+rdi],ymm5 vmovdqu YMMWORD[(96+256)+rdi],ymm9 vperm2i128 ymm3,ymm4,ymm0,0x13 vperm2i128 ymm0,ymm4,ymm0,0x02 vperm2i128 ymm4,ymm12,ymm8,0x02 vperm2i128 ymm12,ymm12,ymm8,0x13 vmovdqa ymm8,ymm3 mov rcx,12*32 lea rsi,[384+rsi] sub rbx,12*32 jmp NEAR $L$seal_avx2_short_hash_remainder $L$seal_avx2_320: vmovdqa ymm1,ymm0 vmovdqa ymm2,ymm0 vmovdqa ymm5,ymm4 vmovdqa ymm6,ymm4 vmovdqa ymm9,ymm8 vmovdqa ymm10,ymm8 vpaddd ymm13,ymm12,YMMWORD[$L$avx2_inc] vpaddd ymm14,ymm13,YMMWORD[$L$avx2_inc] vmovdqa ymm7,ymm4 vmovdqa ymm11,ymm8 vmovdqa YMMWORD[(160+160)+rbp],ymm12 vmovdqa YMMWORD[(160+192)+rbp],ymm13 vmovdqa YMMWORD[(160+224)+rbp],ymm14 mov r10,10 $L$seal_avx2_320_rounds: vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol16] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpsrld ymm3,ymm4,20 vpslld ymm4,ymm4,12 vpxor ymm4,ymm4,ymm3 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol8] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpslld ymm3,ymm4,7 vpsrld ymm4,ymm4,25 vpxor ymm4,ymm4,ymm3 vpalignr ymm12,ymm12,ymm12,12 vpalignr ymm8,ymm8,ymm8,8 vpalignr ymm4,ymm4,ymm4,4 vpaddd ymm1,ymm1,ymm5 vpxor ymm13,ymm13,ymm1 vpshufb ymm13,ymm13,YMMWORD[$L$rol16] vpaddd ymm9,ymm9,ymm13 vpxor ymm5,ymm5,ymm9 vpsrld ymm3,ymm5,20 vpslld ymm5,ymm5,12 vpxor ymm5,ymm5,ymm3 vpaddd ymm1,ymm1,ymm5 vpxor ymm13,ymm13,ymm1 vpshufb ymm13,ymm13,YMMWORD[$L$rol8] vpaddd ymm9,ymm9,ymm13 vpxor ymm5,ymm5,ymm9 vpslld ymm3,ymm5,7 vpsrld ymm5,ymm5,25 vpxor ymm5,ymm5,ymm3 vpalignr ymm13,ymm13,ymm13,12 vpalignr ymm9,ymm9,ymm9,8 vpalignr ymm5,ymm5,ymm5,4 vpaddd ymm2,ymm2,ymm6 vpxor ymm14,ymm14,ymm2 vpshufb ymm14,ymm14,YMMWORD[$L$rol16] vpaddd ymm10,ymm10,ymm14 vpxor ymm6,ymm6,ymm10 vpsrld ymm3,ymm6,20 vpslld ymm6,ymm6,12 vpxor ymm6,ymm6,ymm3 vpaddd ymm2,ymm2,ymm6 vpxor ymm14,ymm14,ymm2 vpshufb ymm14,ymm14,YMMWORD[$L$rol8] vpaddd ymm10,ymm10,ymm14 vpxor ymm6,ymm6,ymm10 vpslld ymm3,ymm6,7 vpsrld ymm6,ymm6,25 vpxor ymm6,ymm6,ymm3 vpalignr ymm14,ymm14,ymm14,12 vpalignr ymm10,ymm10,ymm10,8 vpalignr ymm6,ymm6,ymm6,4 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol16] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpsrld ymm3,ymm4,20 vpslld ymm4,ymm4,12 vpxor ymm4,ymm4,ymm3 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol8] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpslld ymm3,ymm4,7 vpsrld ymm4,ymm4,25 vpxor ymm4,ymm4,ymm3 vpalignr ymm12,ymm12,ymm12,4 vpalignr ymm8,ymm8,ymm8,8 vpalignr ymm4,ymm4,ymm4,12 vpaddd ymm1,ymm1,ymm5 vpxor ymm13,ymm13,ymm1 vpshufb ymm13,ymm13,YMMWORD[$L$rol16] vpaddd ymm9,ymm9,ymm13 vpxor ymm5,ymm5,ymm9 vpsrld ymm3,ymm5,20 vpslld ymm5,ymm5,12 vpxor ymm5,ymm5,ymm3 vpaddd ymm1,ymm1,ymm5 vpxor ymm13,ymm13,ymm1 vpshufb ymm13,ymm13,YMMWORD[$L$rol8] vpaddd ymm9,ymm9,ymm13 vpxor ymm5,ymm5,ymm9 vpslld ymm3,ymm5,7 vpsrld ymm5,ymm5,25 vpxor ymm5,ymm5,ymm3 vpalignr ymm13,ymm13,ymm13,4 vpalignr ymm9,ymm9,ymm9,8 vpalignr ymm5,ymm5,ymm5,12 vpaddd ymm2,ymm2,ymm6 vpxor ymm14,ymm14,ymm2 vpshufb ymm14,ymm14,YMMWORD[$L$rol16] vpaddd ymm10,ymm10,ymm14 vpxor ymm6,ymm6,ymm10 vpsrld ymm3,ymm6,20 vpslld ymm6,ymm6,12 vpxor ymm6,ymm6,ymm3 vpaddd ymm2,ymm2,ymm6 vpxor 
ymm14,ymm14,ymm2 vpshufb ymm14,ymm14,YMMWORD[$L$rol8] vpaddd ymm10,ymm10,ymm14 vpxor ymm6,ymm6,ymm10 vpslld ymm3,ymm6,7 vpsrld ymm6,ymm6,25 vpxor ymm6,ymm6,ymm3 vpalignr ymm14,ymm14,ymm14,4 vpalignr ymm10,ymm10,ymm10,8 vpalignr ymm6,ymm6,ymm6,12 dec r10 jne NEAR $L$seal_avx2_320_rounds vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] vpaddd ymm4,ymm4,ymm7 vpaddd ymm5,ymm5,ymm7 vpaddd ymm6,ymm6,ymm7 vpaddd ymm8,ymm8,ymm11 vpaddd ymm9,ymm9,ymm11 vpaddd ymm10,ymm10,ymm11 vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] vperm2i128 ymm3,ymm4,ymm0,0x02 vpand ymm3,ymm3,YMMWORD[$L$clamp] vmovdqa YMMWORD[(160+0)+rbp],ymm3 vperm2i128 ymm0,ymm4,ymm0,0x13 vperm2i128 ymm4,ymm12,ymm8,0x13 vperm2i128 ymm8,ymm5,ymm1,0x02 vperm2i128 ymm12,ymm13,ymm9,0x02 vperm2i128 ymm1,ymm5,ymm1,0x13 vperm2i128 ymm5,ymm13,ymm9,0x13 vperm2i128 ymm9,ymm6,ymm2,0x02 vperm2i128 ymm13,ymm14,ymm10,0x02 vperm2i128 ymm2,ymm6,ymm2,0x13 vperm2i128 ymm6,ymm14,ymm10,0x13 jmp NEAR $L$seal_avx2_short $L$seal_avx2_192: vmovdqa ymm1,ymm0 vmovdqa ymm2,ymm0 vmovdqa ymm5,ymm4 vmovdqa ymm6,ymm4 vmovdqa ymm9,ymm8 vmovdqa ymm10,ymm8 vpaddd ymm13,ymm12,YMMWORD[$L$avx2_inc] vmovdqa ymm11,ymm12 vmovdqa ymm15,ymm13 mov r10,10 $L$seal_avx2_192_rounds: vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol16] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpsrld ymm3,ymm4,20 vpslld ymm4,ymm4,12 vpxor ymm4,ymm4,ymm3 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol8] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpslld ymm3,ymm4,7 vpsrld ymm4,ymm4,25 vpxor ymm4,ymm4,ymm3 vpalignr ymm12,ymm12,ymm12,12 vpalignr ymm8,ymm8,ymm8,8 vpalignr ymm4,ymm4,ymm4,4 vpaddd ymm1,ymm1,ymm5 vpxor ymm13,ymm13,ymm1 vpshufb ymm13,ymm13,YMMWORD[$L$rol16] vpaddd ymm9,ymm9,ymm13 vpxor ymm5,ymm5,ymm9 vpsrld ymm3,ymm5,20 vpslld ymm5,ymm5,12 vpxor ymm5,ymm5,ymm3 vpaddd ymm1,ymm1,ymm5 vpxor ymm13,ymm13,ymm1 vpshufb ymm13,ymm13,YMMWORD[$L$rol8] vpaddd ymm9,ymm9,ymm13 vpxor ymm5,ymm5,ymm9 vpslld ymm3,ymm5,7 vpsrld ymm5,ymm5,25 vpxor ymm5,ymm5,ymm3 vpalignr ymm13,ymm13,ymm13,12 vpalignr ymm9,ymm9,ymm9,8 vpalignr ymm5,ymm5,ymm5,4 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol16] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpsrld ymm3,ymm4,20 vpslld ymm4,ymm4,12 vpxor ymm4,ymm4,ymm3 vpaddd ymm0,ymm0,ymm4 vpxor ymm12,ymm12,ymm0 vpshufb ymm12,ymm12,YMMWORD[$L$rol8] vpaddd ymm8,ymm8,ymm12 vpxor ymm4,ymm4,ymm8 vpslld ymm3,ymm4,7 vpsrld ymm4,ymm4,25 vpxor ymm4,ymm4,ymm3 vpalignr ymm12,ymm12,ymm12,4 vpalignr ymm8,ymm8,ymm8,8 vpalignr ymm4,ymm4,ymm4,12 vpaddd ymm1,ymm1,ymm5 vpxor ymm13,ymm13,ymm1 vpshufb ymm13,ymm13,YMMWORD[$L$rol16] vpaddd ymm9,ymm9,ymm13 vpxor ymm5,ymm5,ymm9 vpsrld ymm3,ymm5,20 vpslld ymm5,ymm5,12 vpxor ymm5,ymm5,ymm3 vpaddd ymm1,ymm1,ymm5 vpxor ymm13,ymm13,ymm1 vpshufb ymm13,ymm13,YMMWORD[$L$rol8] vpaddd ymm9,ymm9,ymm13 vpxor ymm5,ymm5,ymm9 vpslld ymm3,ymm5,7 vpsrld ymm5,ymm5,25 vpxor ymm5,ymm5,ymm3 vpalignr ymm13,ymm13,ymm13,4 vpalignr ymm9,ymm9,ymm9,8 vpalignr ymm5,ymm5,ymm5,12 dec r10 jne NEAR $L$seal_avx2_192_rounds vpaddd ymm0,ymm0,ymm2 vpaddd ymm1,ymm1,ymm2 vpaddd ymm4,ymm4,ymm6 vpaddd ymm5,ymm5,ymm6 vpaddd ymm8,ymm8,ymm10 vpaddd ymm9,ymm9,ymm10 vpaddd ymm12,ymm12,ymm11 vpaddd ymm13,ymm13,ymm15 vperm2i128 ymm3,ymm4,ymm0,0x02 vpand ymm3,ymm3,YMMWORD[$L$clamp] vmovdqa YMMWORD[(160+0)+rbp],ymm3 vperm2i128 
ymm0,ymm4,ymm0,0x13 vperm2i128 ymm4,ymm12,ymm8,0x13 vperm2i128 ymm8,ymm5,ymm1,0x02 vperm2i128 ymm12,ymm13,ymm9,0x02 vperm2i128 ymm1,ymm5,ymm1,0x13 vperm2i128 ymm5,ymm13,ymm9,0x13 $L$seal_avx2_short: mov r8,r8 call poly_hash_ad_internal xor rcx,rcx $L$seal_avx2_short_hash_remainder: cmp rcx,16 jb NEAR $L$seal_avx2_short_loop add r10,QWORD[((0+0))+rdi] adc r11,QWORD[((8+0))+rdi] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 sub rcx,16 add rdi,16 jmp NEAR $L$seal_avx2_short_hash_remainder $L$seal_avx2_short_loop: cmp rbx,32 jb NEAR $L$seal_avx2_short_tail sub rbx,32 vpxor ymm0,ymm0,YMMWORD[rsi] vmovdqu YMMWORD[rdi],ymm0 lea rsi,[32+rsi] add r10,QWORD[((0+0))+rdi] adc r11,QWORD[((8+0))+rdi] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 add r10,QWORD[((0+16))+rdi] adc r11,QWORD[((8+16))+rdi] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 lea rdi,[32+rdi] vmovdqa ymm0,ymm4 vmovdqa ymm4,ymm8 vmovdqa ymm8,ymm12 vmovdqa ymm12,ymm1 vmovdqa ymm1,ymm5 vmovdqa ymm5,ymm9 vmovdqa ymm9,ymm13 vmovdqa ymm13,ymm2 vmovdqa ymm2,ymm6 jmp NEAR $L$seal_avx2_short_loop $L$seal_avx2_short_tail: cmp rbx,16 jb NEAR $L$seal_avx2_exit sub rbx,16 vpxor xmm3,xmm0,XMMWORD[rsi] vmovdqu XMMWORD[rdi],xmm3 lea rsi,[16+rsi] add r10,QWORD[((0+0))+rdi] adc r11,QWORD[((8+0))+rdi] adc r12,1 mov rax,QWORD[((0+160+0))+rbp] mov r15,rax mul r10 mov r13,rax mov r14,rdx mov rax,QWORD[((0+160+0))+rbp] mul r11 imul r15,r12 add r14,rax adc r15,rdx mov rax,QWORD[((8+160+0))+rbp] mov r9,rax mul r10 add r14,rax adc rdx,0 mov r10,rdx mov rax,QWORD[((8+160+0))+rbp] mul r11 add r15,rax adc rdx,0 imul r9,r12 add r15,r10 adc r9,rdx mov r10,r13 mov r11,r14 mov r12,r15 and r12,3 mov r13,r15 and r13,-4 mov r14,r9 shrd r15,r9,2 shr r9,2 add r15,r13 adc r9,r14 add r10,r15 adc r11,r9 adc r12,0 lea rdi,[16+rdi] vextracti128 xmm0,ymm0,1 $L$seal_avx2_exit: vzeroupper jmp NEAR $L$seal_sse_tail_16 $L$SEH_end_chacha20_poly1305_seal_avx2: %else ; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 ret %endif ring-0.17.14/pregenerated/chacha20_poly1305_x86_64-nasm.o000064400000000000000000003733251046102023000204760ustar 00000000000000dg.debug$S@B.debug$Tl'(@B.rdata()@p@.text)y 
[binary contents of chacha20_poly1305_x86_64-nasm.o omitted: COFF object assembled from chacha20_poly1305_x86_64-nasm.asm with NASM 2.13.03, containing only debug line tables, the assembled sections, and the symbol table for the SSE41/AVX2 open and seal entry points]
seal_avx2_main_loop_roundsL$seal_avx2_main_loop_rounds_entryL$seal_avx2_tail_128L$seal_avx2_tail_128_rounds_and_3xhashL$seal_avx2_tail_128_rounds_and_2xhashL$seal_avx2_tail_256L$seal_avx2_tail_256_rounds_and_3xhashL$seal_avx2_tail_256_rounds_and_2xhashL$seal_avx2_tail_384L$seal_avx2_tail_384_rounds_and_3xhashL$seal_avx2_tail_384_rounds_and_2xhashL$seal_avx2_tail_512L$seal_avx2_tail_512_rounds_and_3xhashL$seal_avx2_tail_512_rounds_and_2xhashL$seal_avx2_320L$seal_avx2_320_roundsL$seal_avx2_192L$seal_avx2_192_roundsL$seal_avx2_shortL$seal_avx2_short_hash_remainderL$seal_avx2_short_loopL$seal_avx2_short_tailL$seal_avx2_exitL$SEH_end_chacha20_poly1305_seal_avx2ring-0.17.14/pregenerated/ghash-armv4-linux32.S000064400000000000000000000141751046102023000171610ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) @ Silence ARMv8 deprecated IT instruction warnings. This file is used by both @ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL @ instructions are in aesv8-armx.pl.) .arch armv7-a .text #if defined(__thumb2__) || defined(__clang__) .syntax unified #define ldrplb ldrbpl #define ldrneb ldrbne #endif #if defined(__thumb2__) .thumb #else .code 32 #endif #if __ARM_MAX_ARCH__>=7 .arch armv7-a .fpu neon .globl gcm_init_neon .hidden gcm_init_neon .type gcm_init_neon,%function .align 4 gcm_init_neon: vld1.64 d7,[r1]! @ load H vmov.i8 q8,#0xe1 vld1.64 d6,[r1] vshl.i64 d17,#57 vshr.u64 d16,#63 @ t0=0xc2....01 vdup.8 q9,d7[7] vshr.u64 d26,d6,#63 vshr.s8 q9,#7 @ broadcast carry bit vshl.i64 q3,q3,#1 vand q8,q8,q9 vorr d7,d26 @ H<<<=1 veor q3,q3,q8 @ twisted H vstmia r0,{q3} bx lr @ bx lr .size gcm_init_neon,.-gcm_init_neon .globl gcm_gmult_neon .hidden gcm_gmult_neon .type gcm_gmult_neon,%function .align 4 gcm_gmult_neon: vld1.64 d7,[r0]! @ load Xi vld1.64 d6,[r0]! vmov.i64 d29,#0x0000ffffffffffff vldmia r1,{d26,d27} @ load twisted H vmov.i64 d30,#0x00000000ffffffff #ifdef __ARMEL__ vrev64.8 q3,q3 #endif vmov.i64 d31,#0x000000000000ffff veor d28,d26,d27 @ Karatsuba pre-processing mov r3,#16 b .Lgmult_neon .size gcm_gmult_neon,.-gcm_gmult_neon .globl gcm_ghash_neon .hidden gcm_ghash_neon .type gcm_ghash_neon,%function .align 4 gcm_ghash_neon: vld1.64 d1,[r0]! @ load Xi vld1.64 d0,[r0]! vmov.i64 d29,#0x0000ffffffffffff vldmia r1,{d26,d27} @ load twisted H vmov.i64 d30,#0x00000000ffffffff #ifdef __ARMEL__ vrev64.8 q0,q0 #endif vmov.i64 d31,#0x000000000000ffff veor d28,d26,d27 @ Karatsuba pre-processing .Loop_neon: vld1.64 d7,[r2]! @ load inp vld1.64 d6,[r2]! 
#ifdef __ARMEL__ vrev64.8 q3,q3 #endif veor q3,q0 @ inp^=Xi .Lgmult_neon: vext.8 d16, d26, d26, #1 @ A1 vmull.p8 q8, d16, d6 @ F = A1*B vext.8 d0, d6, d6, #1 @ B1 vmull.p8 q0, d26, d0 @ E = A*B1 vext.8 d18, d26, d26, #2 @ A2 vmull.p8 q9, d18, d6 @ H = A2*B vext.8 d22, d6, d6, #2 @ B2 vmull.p8 q11, d26, d22 @ G = A*B2 vext.8 d20, d26, d26, #3 @ A3 veor q8, q8, q0 @ L = E + F vmull.p8 q10, d20, d6 @ J = A3*B vext.8 d0, d6, d6, #3 @ B3 veor q9, q9, q11 @ M = G + H vmull.p8 q0, d26, d0 @ I = A*B3 veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 vand d17, d17, d29 vext.8 d22, d6, d6, #4 @ B4 veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 vand d19, d19, d30 vmull.p8 q11, d26, d22 @ K = A*B4 veor q10, q10, q0 @ N = I + J veor d16, d16, d17 veor d18, d18, d19 veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24 vand d21, d21, d31 vext.8 q8, q8, q8, #15 veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 vmov.i64 d23, #0 vext.8 q9, q9, q9, #14 veor d20, d20, d21 vmull.p8 q0, d26, d6 @ D = A*B vext.8 q11, q11, q11, #12 vext.8 q10, q10, q10, #13 veor q8, q8, q9 veor q10, q10, q11 veor q0, q0, q8 veor q0, q0, q10 veor d6,d6,d7 @ Karatsuba pre-processing vext.8 d16, d28, d28, #1 @ A1 vmull.p8 q8, d16, d6 @ F = A1*B vext.8 d2, d6, d6, #1 @ B1 vmull.p8 q1, d28, d2 @ E = A*B1 vext.8 d18, d28, d28, #2 @ A2 vmull.p8 q9, d18, d6 @ H = A2*B vext.8 d22, d6, d6, #2 @ B2 vmull.p8 q11, d28, d22 @ G = A*B2 vext.8 d20, d28, d28, #3 @ A3 veor q8, q8, q1 @ L = E + F vmull.p8 q10, d20, d6 @ J = A3*B vext.8 d2, d6, d6, #3 @ B3 veor q9, q9, q11 @ M = G + H vmull.p8 q1, d28, d2 @ I = A*B3 veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 vand d17, d17, d29 vext.8 d22, d6, d6, #4 @ B4 veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 vand d19, d19, d30 vmull.p8 q11, d28, d22 @ K = A*B4 veor q10, q10, q1 @ N = I + J veor d16, d16, d17 veor d18, d18, d19 veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24 vand d21, d21, d31 vext.8 q8, q8, q8, #15 veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 vmov.i64 d23, #0 vext.8 q9, q9, q9, #14 veor d20, d20, d21 vmull.p8 q1, d28, d6 @ D = A*B vext.8 q11, q11, q11, #12 vext.8 q10, q10, q10, #13 veor q8, q8, q9 veor q10, q10, q11 veor q1, q1, q8 veor q1, q1, q10 vext.8 d16, d27, d27, #1 @ A1 vmull.p8 q8, d16, d7 @ F = A1*B vext.8 d4, d7, d7, #1 @ B1 vmull.p8 q2, d27, d4 @ E = A*B1 vext.8 d18, d27, d27, #2 @ A2 vmull.p8 q9, d18, d7 @ H = A2*B vext.8 d22, d7, d7, #2 @ B2 vmull.p8 q11, d27, d22 @ G = A*B2 vext.8 d20, d27, d27, #3 @ A3 veor q8, q8, q2 @ L = E + F vmull.p8 q10, d20, d7 @ J = A3*B vext.8 d4, d7, d7, #3 @ B3 veor q9, q9, q11 @ M = G + H vmull.p8 q2, d27, d4 @ I = A*B3 veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 vand d17, d17, d29 vext.8 d22, d7, d7, #4 @ B4 veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 vand d19, d19, d30 vmull.p8 q11, d27, d22 @ K = A*B4 veor q10, q10, q2 @ N = I + J veor d16, d16, d17 veor d18, d18, d19 veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24 vand d21, d21, d31 vext.8 q8, q8, q8, #15 veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 vmov.i64 d23, #0 vext.8 q9, q9, q9, #14 veor d20, d20, d21 vmull.p8 q2, d27, d7 @ D = A*B vext.8 q11, q11, q11, #12 vext.8 q10, q10, q10, #13 veor q8, q8, q9 veor q10, q10, q11 veor q2, q2, q8 veor q2, q2, q10 veor q1,q1,q0 @ Karatsuba post-processing veor q1,q1,q2 veor d1,d1,d2 veor d4,d4,d3 @ Xh|Xl - 256-bit result @ equivalent of reduction_avx from ghash-x86_64.pl vshl.i64 q9,q0,#57 @ 1st phase vshl.i64 q10,q0,#62 veor q10,q10,q9 @ vshl.i64 q9,q0,#63 veor q10, q10, q9 @ veor d1,d1,d20 @ veor d4,d4,d21 vshr.u64 q10,q0,#1 @ 2nd phase veor q2,q2,q0 veor 
q0,q0,q10 @ vshr.u64 q10,q10,#6 vshr.u64 q0,q0,#1 @ veor q0,q0,q2 @ veor q0,q0,q10 @ subs r3,#16 bne .Loop_neon #ifdef __ARMEL__ vrev64.8 q0,q0 #endif sub r0,#16 vst1.64 d1,[r0]! @ write out Xi vst1.64 d0,[r0] bx lr @ bx lr .size gcm_ghash_neon,.-gcm_ghash_neon #endif .byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) ring-0.17.14/pregenerated/ghash-neon-armv8-ios64.S000064400000000000000000000251731046102023000175620ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) .text .globl _gcm_init_neon .private_extern _gcm_init_neon .align 4 _gcm_init_neon: AARCH64_VALID_CALL_TARGET // This function is adapted from gcm_init_v8. xC2 is t3. ld1 {v17.2d}, [x1] // load H movi v19.16b, #0xe1 shl v19.2d, v19.2d, #57 // 0xc2.0 ext v3.16b, v17.16b, v17.16b, #8 ushr v18.2d, v19.2d, #63 dup v17.4s, v17.s[1] ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01 ushr v18.2d, v3.2d, #63 sshr v17.4s, v17.4s, #31 // broadcast carry bit and v18.16b, v18.16b, v16.16b shl v3.2d, v3.2d, #1 ext v18.16b, v18.16b, v18.16b, #8 and v16.16b, v16.16b, v17.16b orr v3.16b, v3.16b, v18.16b // H<<<=1 eor v5.16b, v3.16b, v16.16b // twisted H st1 {v5.2d}, [x0] // store Htable[0] ret .globl _gcm_gmult_neon .private_extern _gcm_gmult_neon .align 4 _gcm_gmult_neon: AARCH64_VALID_CALL_TARGET ld1 {v3.16b}, [x0] // load Xi ld1 {v5.1d}, [x1], #8 // load twisted H ld1 {v6.1d}, [x1] adrp x9, Lmasks@PAGE // load constants add x9, x9, Lmasks@PAGEOFF ld1 {v24.2d, v25.2d}, [x9] rev64 v3.16b, v3.16b // byteswap Xi ext v3.16b, v3.16b, v3.16b, #8 eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing mov x3, #16 b Lgmult_neon .globl _gcm_ghash_neon .private_extern _gcm_ghash_neon .align 4 _gcm_ghash_neon: AARCH64_VALID_CALL_TARGET ld1 {v0.16b}, [x0] // load Xi ld1 {v5.1d}, [x1], #8 // load twisted H ld1 {v6.1d}, [x1] adrp x9, Lmasks@PAGE // load constants add x9, x9, Lmasks@PAGEOFF ld1 {v24.2d, v25.2d}, [x9] rev64 v0.16b, v0.16b // byteswap Xi ext v0.16b, v0.16b, v0.16b, #8 eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing Loop_neon: ld1 {v3.16b}, [x2], #16 // load inp rev64 v3.16b, v3.16b // byteswap inp ext v3.16b, v3.16b, v3.16b, #8 eor v3.16b, v3.16b, v0.16b // inp ^= Xi Lgmult_neon: // Split the input into v3 and v4. (The upper halves are unused, // so it is okay to leave them alone.) ins v4.d[0], v3.d[1] ext v16.8b, v5.8b, v5.8b, #1 // A1 pmull v16.8h, v16.8b, v3.8b // F = A1*B ext v0.8b, v3.8b, v3.8b, #1 // B1 pmull v0.8h, v5.8b, v0.8b // E = A*B1 ext v17.8b, v5.8b, v5.8b, #2 // A2 pmull v17.8h, v17.8b, v3.8b // H = A2*B ext v19.8b, v3.8b, v3.8b, #2 // B2 pmull v19.8h, v5.8b, v19.8b // G = A*B2 ext v18.8b, v5.8b, v5.8b, #3 // A3 eor v16.16b, v16.16b, v0.16b // L = E + F pmull v18.8h, v18.8b, v3.8b // J = A3*B ext v0.8b, v3.8b, v3.8b, #3 // B3 eor v17.16b, v17.16b, v19.16b // M = G + H pmull v0.8h, v5.8b, v0.8b // I = A*B3 // Here we diverge from the 32-bit version. 
It computes the following // (instructions reordered for clarity): // // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) // vand $t0#hi, $t0#hi, $k48 // veor $t0#lo, $t0#lo, $t0#hi // // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) // vand $t1#hi, $t1#hi, $k32 // veor $t1#lo, $t1#lo, $t1#hi // // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) // vand $t2#hi, $t2#hi, $k16 // veor $t2#lo, $t2#lo, $t2#hi // // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) // vmov.i64 $t3#hi, #0 // // $kN is a mask with the bottom N bits set. AArch64 cannot compute on // upper halves of SIMD registers, so we must split each half into // separate registers. To compensate, we pair computations up and // parallelize. ext v19.8b, v3.8b, v3.8b, #4 // B4 eor v18.16b, v18.16b, v0.16b // N = I + J pmull v19.8h, v5.8b, v19.8b // K = A*B4 // This can probably be scheduled more efficiently. For now, we just // pair up independent instructions. zip1 v20.2d, v16.2d, v17.2d zip1 v22.2d, v18.2d, v19.2d zip2 v21.2d, v16.2d, v17.2d zip2 v23.2d, v18.2d, v19.2d eor v20.16b, v20.16b, v21.16b eor v22.16b, v22.16b, v23.16b and v21.16b, v21.16b, v24.16b and v23.16b, v23.16b, v25.16b eor v20.16b, v20.16b, v21.16b eor v22.16b, v22.16b, v23.16b zip1 v16.2d, v20.2d, v21.2d zip1 v18.2d, v22.2d, v23.2d zip2 v17.2d, v20.2d, v21.2d zip2 v19.2d, v22.2d, v23.2d ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 pmull v0.8h, v5.8b, v3.8b // D = A*B ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 eor v16.16b, v16.16b, v17.16b eor v18.16b, v18.16b, v19.16b eor v0.16b, v0.16b, v16.16b eor v0.16b, v0.16b, v18.16b eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing ext v16.8b, v7.8b, v7.8b, #1 // A1 pmull v16.8h, v16.8b, v3.8b // F = A1*B ext v1.8b, v3.8b, v3.8b, #1 // B1 pmull v1.8h, v7.8b, v1.8b // E = A*B1 ext v17.8b, v7.8b, v7.8b, #2 // A2 pmull v17.8h, v17.8b, v3.8b // H = A2*B ext v19.8b, v3.8b, v3.8b, #2 // B2 pmull v19.8h, v7.8b, v19.8b // G = A*B2 ext v18.8b, v7.8b, v7.8b, #3 // A3 eor v16.16b, v16.16b, v1.16b // L = E + F pmull v18.8h, v18.8b, v3.8b // J = A3*B ext v1.8b, v3.8b, v3.8b, #3 // B3 eor v17.16b, v17.16b, v19.16b // M = G + H pmull v1.8h, v7.8b, v1.8b // I = A*B3 // Here we diverge from the 32-bit version. It computes the following // (instructions reordered for clarity): // // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) // vand $t0#hi, $t0#hi, $k48 // veor $t0#lo, $t0#lo, $t0#hi // // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) // vand $t1#hi, $t1#hi, $k32 // veor $t1#lo, $t1#lo, $t1#hi // // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) // vand $t2#hi, $t2#hi, $k16 // veor $t2#lo, $t2#lo, $t2#hi // // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) // vmov.i64 $t3#hi, #0 // // $kN is a mask with the bottom N bits set. AArch64 cannot compute on // upper halves of SIMD registers, so we must split each half into // separate registers. To compensate, we pair computations up and // parallelize. ext v19.8b, v3.8b, v3.8b, #4 // B4 eor v18.16b, v18.16b, v1.16b // N = I + J pmull v19.8h, v7.8b, v19.8b // K = A*B4 // This can probably be scheduled more efficiently. For now, we just // pair up independent instructions. 
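The mask-and-pair comments above describe how a wide carry-less product is assembled from 8-bit pmull pieces; one level up, the "Karatsuba pre-processing" XOR of the two halves of the twisted H exists so that three 64x64 carry-less multiplies (plus XORs) can replace four. As a point of reference only, here is a portable Rust sketch of a 64x64 carry-less multiply and the Karatsuba combination it feeds; the names are illustrative, and real code would use pmull/pclmulqdq rather than this bit loop.

// Carry-less (GF(2)[x]) 64x64 -> 128-bit multiply: the operation that
// vmull.p8 / pmull provide in hardware, written out bit by bit.
fn clmul64(a: u64, b: u64) -> u128 {
    let mut acc: u128 = 0;
    for i in 0..64 {
        if (b >> i) & 1 == 1 {
            acc ^= (a as u128) << i;
        }
    }
    acc
}

// One level of Karatsuba over GF(2): the middle term falls out of
// (a_lo ^ a_hi)*(b_lo ^ b_hi) ^ hi ^ lo, which is why the code XORs the two
// halves of the twisted H once up front ("Karatsuba pre-processing").
fn clmul128(a_lo: u64, a_hi: u64, b_lo: u64, b_hi: u64) -> (u128, u128) {
    let lo = clmul64(a_lo, b_lo);
    let hi = clmul64(a_hi, b_hi);
    let mid = clmul64(a_lo ^ a_hi, b_lo ^ b_hi) ^ hi ^ lo;
    // product = hi*x^128 ^ mid*x^64 ^ lo, returned as (low, high) halves
    (lo ^ (mid << 64), hi ^ (mid >> 64))
}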
zip1 v20.2d, v16.2d, v17.2d zip1 v22.2d, v18.2d, v19.2d zip2 v21.2d, v16.2d, v17.2d zip2 v23.2d, v18.2d, v19.2d eor v20.16b, v20.16b, v21.16b eor v22.16b, v22.16b, v23.16b and v21.16b, v21.16b, v24.16b and v23.16b, v23.16b, v25.16b eor v20.16b, v20.16b, v21.16b eor v22.16b, v22.16b, v23.16b zip1 v16.2d, v20.2d, v21.2d zip1 v18.2d, v22.2d, v23.2d zip2 v17.2d, v20.2d, v21.2d zip2 v19.2d, v22.2d, v23.2d ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 pmull v1.8h, v7.8b, v3.8b // D = A*B ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 eor v16.16b, v16.16b, v17.16b eor v18.16b, v18.16b, v19.16b eor v1.16b, v1.16b, v16.16b eor v1.16b, v1.16b, v18.16b ext v16.8b, v6.8b, v6.8b, #1 // A1 pmull v16.8h, v16.8b, v4.8b // F = A1*B ext v2.8b, v4.8b, v4.8b, #1 // B1 pmull v2.8h, v6.8b, v2.8b // E = A*B1 ext v17.8b, v6.8b, v6.8b, #2 // A2 pmull v17.8h, v17.8b, v4.8b // H = A2*B ext v19.8b, v4.8b, v4.8b, #2 // B2 pmull v19.8h, v6.8b, v19.8b // G = A*B2 ext v18.8b, v6.8b, v6.8b, #3 // A3 eor v16.16b, v16.16b, v2.16b // L = E + F pmull v18.8h, v18.8b, v4.8b // J = A3*B ext v2.8b, v4.8b, v4.8b, #3 // B3 eor v17.16b, v17.16b, v19.16b // M = G + H pmull v2.8h, v6.8b, v2.8b // I = A*B3 // Here we diverge from the 32-bit version. It computes the following // (instructions reordered for clarity): // // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) // vand $t0#hi, $t0#hi, $k48 // veor $t0#lo, $t0#lo, $t0#hi // // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) // vand $t1#hi, $t1#hi, $k32 // veor $t1#lo, $t1#lo, $t1#hi // // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) // vand $t2#hi, $t2#hi, $k16 // veor $t2#lo, $t2#lo, $t2#hi // // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) // vmov.i64 $t3#hi, #0 // // $kN is a mask with the bottom N bits set. AArch64 cannot compute on // upper halves of SIMD registers, so we must split each half into // separate registers. To compensate, we pair computations up and // parallelize. ext v19.8b, v4.8b, v4.8b, #4 // B4 eor v18.16b, v18.16b, v2.16b // N = I + J pmull v19.8h, v6.8b, v19.8b // K = A*B4 // This can probably be scheduled more efficiently. For now, we just // pair up independent instructions. zip1 v20.2d, v16.2d, v17.2d zip1 v22.2d, v18.2d, v19.2d zip2 v21.2d, v16.2d, v17.2d zip2 v23.2d, v18.2d, v19.2d eor v20.16b, v20.16b, v21.16b eor v22.16b, v22.16b, v23.16b and v21.16b, v21.16b, v24.16b and v23.16b, v23.16b, v25.16b eor v20.16b, v20.16b, v21.16b eor v22.16b, v22.16b, v23.16b zip1 v16.2d, v20.2d, v21.2d zip1 v18.2d, v22.2d, v23.2d zip2 v17.2d, v20.2d, v21.2d zip2 v19.2d, v22.2d, v23.2d ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 pmull v2.8h, v6.8b, v4.8b // D = A*B ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 eor v16.16b, v16.16b, v17.16b eor v18.16b, v18.16b, v19.16b eor v2.16b, v2.16b, v16.16b eor v2.16b, v2.16b, v18.16b ext v16.16b, v0.16b, v2.16b, #8 eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing eor v1.16b, v1.16b, v2.16b eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result // This is a no-op due to the ins instruction below. 
// ins v2.d[0], v1.d[1] // equivalent of reduction_avx from ghash-x86_64.pl shl v17.2d, v0.2d, #57 // 1st phase shl v18.2d, v0.2d, #62 eor v18.16b, v18.16b, v17.16b // shl v17.2d, v0.2d, #63 eor v18.16b, v18.16b, v17.16b // // Note Xm contains {Xl.d[1], Xh.d[0]}. eor v18.16b, v18.16b, v1.16b ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0] ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1] ushr v18.2d, v0.2d, #1 // 2nd phase eor v2.16b, v2.16b,v0.16b eor v0.16b, v0.16b,v18.16b // ushr v18.2d, v18.2d, #6 ushr v0.2d, v0.2d, #1 // eor v0.16b, v0.16b, v2.16b // eor v0.16b, v0.16b, v18.16b // subs x3, x3, #16 bne Loop_neon rev64 v0.16b, v0.16b // byteswap Xi and write ext v0.16b, v0.16b, v0.16b, #8 st1 {v0.16b}, [x0] ret .section __TEXT,__const .align 4 Lmasks: .quad 0x0000ffffffffffff // k48 .quad 0x00000000ffffffff // k32 .quad 0x000000000000ffff // k16 .quad 0x0000000000000000 // k0 .byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) ring-0.17.14/pregenerated/ghash-neon-armv8-linux64.S000064400000000000000000000254201046102023000201220ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) .text .globl gcm_init_neon .hidden gcm_init_neon .type gcm_init_neon,%function .align 4 gcm_init_neon: AARCH64_VALID_CALL_TARGET // This function is adapted from gcm_init_v8. xC2 is t3. ld1 {v17.2d}, [x1] // load H movi v19.16b, #0xe1 shl v19.2d, v19.2d, #57 // 0xc2.0 ext v3.16b, v17.16b, v17.16b, #8 ushr v18.2d, v19.2d, #63 dup v17.4s, v17.s[1] ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01 ushr v18.2d, v3.2d, #63 sshr v17.4s, v17.4s, #31 // broadcast carry bit and v18.16b, v18.16b, v16.16b shl v3.2d, v3.2d, #1 ext v18.16b, v18.16b, v18.16b, #8 and v16.16b, v16.16b, v17.16b orr v3.16b, v3.16b, v18.16b // H<<<=1 eor v5.16b, v3.16b, v16.16b // twisted H st1 {v5.2d}, [x0] // store Htable[0] ret .size gcm_init_neon,.-gcm_init_neon .globl gcm_gmult_neon .hidden gcm_gmult_neon .type gcm_gmult_neon,%function .align 4 gcm_gmult_neon: AARCH64_VALID_CALL_TARGET ld1 {v3.16b}, [x0] // load Xi ld1 {v5.1d}, [x1], #8 // load twisted H ld1 {v6.1d}, [x1] adrp x9, .Lmasks // load constants add x9, x9, :lo12:.Lmasks ld1 {v24.2d, v25.2d}, [x9] rev64 v3.16b, v3.16b // byteswap Xi ext v3.16b, v3.16b, v3.16b, #8 eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing mov x3, #16 b .Lgmult_neon .size gcm_gmult_neon,.-gcm_gmult_neon .globl gcm_ghash_neon .hidden gcm_ghash_neon .type gcm_ghash_neon,%function .align 4 gcm_ghash_neon: AARCH64_VALID_CALL_TARGET ld1 {v0.16b}, [x0] // load Xi ld1 {v5.1d}, [x1], #8 // load twisted H ld1 {v6.1d}, [x1] adrp x9, .Lmasks // load constants add x9, x9, :lo12:.Lmasks ld1 {v24.2d, v25.2d}, [x9] rev64 v0.16b, v0.16b // byteswap Xi ext v0.16b, v0.16b, v0.16b, #8 eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing .Loop_neon: ld1 {v3.16b}, [x2], #16 // load inp rev64 v3.16b, v3.16b // byteswap inp ext v3.16b, v3.16b, v3.16b, #8 eor v3.16b, v3.16b, v0.16b // inp ^= Xi .Lgmult_neon: // Split the input into v3 and v4. (The upper halves are unused, // so it is okay to leave them alone.) 
ins v4.d[0], v3.d[1] ext v16.8b, v5.8b, v5.8b, #1 // A1 pmull v16.8h, v16.8b, v3.8b // F = A1*B ext v0.8b, v3.8b, v3.8b, #1 // B1 pmull v0.8h, v5.8b, v0.8b // E = A*B1 ext v17.8b, v5.8b, v5.8b, #2 // A2 pmull v17.8h, v17.8b, v3.8b // H = A2*B ext v19.8b, v3.8b, v3.8b, #2 // B2 pmull v19.8h, v5.8b, v19.8b // G = A*B2 ext v18.8b, v5.8b, v5.8b, #3 // A3 eor v16.16b, v16.16b, v0.16b // L = E + F pmull v18.8h, v18.8b, v3.8b // J = A3*B ext v0.8b, v3.8b, v3.8b, #3 // B3 eor v17.16b, v17.16b, v19.16b // M = G + H pmull v0.8h, v5.8b, v0.8b // I = A*B3 // Here we diverge from the 32-bit version. It computes the following // (instructions reordered for clarity): // // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) // vand $t0#hi, $t0#hi, $k48 // veor $t0#lo, $t0#lo, $t0#hi // // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) // vand $t1#hi, $t1#hi, $k32 // veor $t1#lo, $t1#lo, $t1#hi // // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) // vand $t2#hi, $t2#hi, $k16 // veor $t2#lo, $t2#lo, $t2#hi // // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) // vmov.i64 $t3#hi, #0 // // $kN is a mask with the bottom N bits set. AArch64 cannot compute on // upper halves of SIMD registers, so we must split each half into // separate registers. To compensate, we pair computations up and // parallelize. ext v19.8b, v3.8b, v3.8b, #4 // B4 eor v18.16b, v18.16b, v0.16b // N = I + J pmull v19.8h, v5.8b, v19.8b // K = A*B4 // This can probably be scheduled more efficiently. For now, we just // pair up independent instructions. zip1 v20.2d, v16.2d, v17.2d zip1 v22.2d, v18.2d, v19.2d zip2 v21.2d, v16.2d, v17.2d zip2 v23.2d, v18.2d, v19.2d eor v20.16b, v20.16b, v21.16b eor v22.16b, v22.16b, v23.16b and v21.16b, v21.16b, v24.16b and v23.16b, v23.16b, v25.16b eor v20.16b, v20.16b, v21.16b eor v22.16b, v22.16b, v23.16b zip1 v16.2d, v20.2d, v21.2d zip1 v18.2d, v22.2d, v23.2d zip2 v17.2d, v20.2d, v21.2d zip2 v19.2d, v22.2d, v23.2d ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 pmull v0.8h, v5.8b, v3.8b // D = A*B ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 eor v16.16b, v16.16b, v17.16b eor v18.16b, v18.16b, v19.16b eor v0.16b, v0.16b, v16.16b eor v0.16b, v0.16b, v18.16b eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing ext v16.8b, v7.8b, v7.8b, #1 // A1 pmull v16.8h, v16.8b, v3.8b // F = A1*B ext v1.8b, v3.8b, v3.8b, #1 // B1 pmull v1.8h, v7.8b, v1.8b // E = A*B1 ext v17.8b, v7.8b, v7.8b, #2 // A2 pmull v17.8h, v17.8b, v3.8b // H = A2*B ext v19.8b, v3.8b, v3.8b, #2 // B2 pmull v19.8h, v7.8b, v19.8b // G = A*B2 ext v18.8b, v7.8b, v7.8b, #3 // A3 eor v16.16b, v16.16b, v1.16b // L = E + F pmull v18.8h, v18.8b, v3.8b // J = A3*B ext v1.8b, v3.8b, v3.8b, #3 // B3 eor v17.16b, v17.16b, v19.16b // M = G + H pmull v1.8h, v7.8b, v1.8b // I = A*B3 // Here we diverge from the 32-bit version. It computes the following // (instructions reordered for clarity): // // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) // vand $t0#hi, $t0#hi, $k48 // veor $t0#lo, $t0#lo, $t0#hi // // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) // vand $t1#hi, $t1#hi, $k32 // veor $t1#lo, $t1#lo, $t1#hi // // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) // vand $t2#hi, $t2#hi, $k16 // veor $t2#lo, $t2#lo, $t2#hi // // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) // vmov.i64 $t3#hi, #0 // // $kN is a mask with the bottom N bits set. 
AArch64 cannot compute on // upper halves of SIMD registers, so we must split each half into // separate registers. To compensate, we pair computations up and // parallelize. ext v19.8b, v3.8b, v3.8b, #4 // B4 eor v18.16b, v18.16b, v1.16b // N = I + J pmull v19.8h, v7.8b, v19.8b // K = A*B4 // This can probably be scheduled more efficiently. For now, we just // pair up independent instructions. zip1 v20.2d, v16.2d, v17.2d zip1 v22.2d, v18.2d, v19.2d zip2 v21.2d, v16.2d, v17.2d zip2 v23.2d, v18.2d, v19.2d eor v20.16b, v20.16b, v21.16b eor v22.16b, v22.16b, v23.16b and v21.16b, v21.16b, v24.16b and v23.16b, v23.16b, v25.16b eor v20.16b, v20.16b, v21.16b eor v22.16b, v22.16b, v23.16b zip1 v16.2d, v20.2d, v21.2d zip1 v18.2d, v22.2d, v23.2d zip2 v17.2d, v20.2d, v21.2d zip2 v19.2d, v22.2d, v23.2d ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 pmull v1.8h, v7.8b, v3.8b // D = A*B ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 eor v16.16b, v16.16b, v17.16b eor v18.16b, v18.16b, v19.16b eor v1.16b, v1.16b, v16.16b eor v1.16b, v1.16b, v18.16b ext v16.8b, v6.8b, v6.8b, #1 // A1 pmull v16.8h, v16.8b, v4.8b // F = A1*B ext v2.8b, v4.8b, v4.8b, #1 // B1 pmull v2.8h, v6.8b, v2.8b // E = A*B1 ext v17.8b, v6.8b, v6.8b, #2 // A2 pmull v17.8h, v17.8b, v4.8b // H = A2*B ext v19.8b, v4.8b, v4.8b, #2 // B2 pmull v19.8h, v6.8b, v19.8b // G = A*B2 ext v18.8b, v6.8b, v6.8b, #3 // A3 eor v16.16b, v16.16b, v2.16b // L = E + F pmull v18.8h, v18.8b, v4.8b // J = A3*B ext v2.8b, v4.8b, v4.8b, #3 // B3 eor v17.16b, v17.16b, v19.16b // M = G + H pmull v2.8h, v6.8b, v2.8b // I = A*B3 // Here we diverge from the 32-bit version. It computes the following // (instructions reordered for clarity): // // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) // vand $t0#hi, $t0#hi, $k48 // veor $t0#lo, $t0#lo, $t0#hi // // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) // vand $t1#hi, $t1#hi, $k32 // veor $t1#lo, $t1#lo, $t1#hi // // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) // vand $t2#hi, $t2#hi, $k16 // veor $t2#lo, $t2#lo, $t2#hi // // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) // vmov.i64 $t3#hi, #0 // // $kN is a mask with the bottom N bits set. AArch64 cannot compute on // upper halves of SIMD registers, so we must split each half into // separate registers. To compensate, we pair computations up and // parallelize. ext v19.8b, v4.8b, v4.8b, #4 // B4 eor v18.16b, v18.16b, v2.16b // N = I + J pmull v19.8h, v6.8b, v19.8b // K = A*B4 // This can probably be scheduled more efficiently. For now, we just // pair up independent instructions. 
zip1 v20.2d, v16.2d, v17.2d zip1 v22.2d, v18.2d, v19.2d zip2 v21.2d, v16.2d, v17.2d zip2 v23.2d, v18.2d, v19.2d eor v20.16b, v20.16b, v21.16b eor v22.16b, v22.16b, v23.16b and v21.16b, v21.16b, v24.16b and v23.16b, v23.16b, v25.16b eor v20.16b, v20.16b, v21.16b eor v22.16b, v22.16b, v23.16b zip1 v16.2d, v20.2d, v21.2d zip1 v18.2d, v22.2d, v23.2d zip2 v17.2d, v20.2d, v21.2d zip2 v19.2d, v22.2d, v23.2d ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 pmull v2.8h, v6.8b, v4.8b // D = A*B ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 eor v16.16b, v16.16b, v17.16b eor v18.16b, v18.16b, v19.16b eor v2.16b, v2.16b, v16.16b eor v2.16b, v2.16b, v18.16b ext v16.16b, v0.16b, v2.16b, #8 eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing eor v1.16b, v1.16b, v2.16b eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result // This is a no-op due to the ins instruction below. // ins v2.d[0], v1.d[1] // equivalent of reduction_avx from ghash-x86_64.pl shl v17.2d, v0.2d, #57 // 1st phase shl v18.2d, v0.2d, #62 eor v18.16b, v18.16b, v17.16b // shl v17.2d, v0.2d, #63 eor v18.16b, v18.16b, v17.16b // // Note Xm contains {Xl.d[1], Xh.d[0]}. eor v18.16b, v18.16b, v1.16b ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0] ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1] ushr v18.2d, v0.2d, #1 // 2nd phase eor v2.16b, v2.16b,v0.16b eor v0.16b, v0.16b,v18.16b // ushr v18.2d, v18.2d, #6 ushr v0.2d, v0.2d, #1 // eor v0.16b, v0.16b, v2.16b // eor v0.16b, v0.16b, v18.16b // subs x3, x3, #16 bne .Loop_neon rev64 v0.16b, v0.16b // byteswap Xi and write ext v0.16b, v0.16b, v0.16b, #8 st1 {v0.16b}, [x0] ret .size gcm_ghash_neon,.-gcm_ghash_neon .section .rodata .align 4 .Lmasks: .quad 0x0000ffffffffffff // k48 .quad 0x00000000ffffffff // k32 .quad 0x000000000000ffff // k16 .quad 0x0000000000000000 // k0 .byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) ring-0.17.14/pregenerated/ghash-neon-armv8-win64.S000064400000000000000000000251571046102023000175670ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) .text .globl gcm_init_neon .def gcm_init_neon .type 32 .endef .align 4 gcm_init_neon: AARCH64_VALID_CALL_TARGET // This function is adapted from gcm_init_v8. xC2 is t3. 
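// In outline (a description of the body below, using its own comment
// names; informal): the 128-bit hash key H is loaded, shifted left by
// one bit (the "H<<<=1" step, with the carry bit broadcast via sshr),
// and when that carry is set the 0xc2....01 constant (t0) is xor-ed
// in. The result is the "twisted H" stored as Htable[0], which
// gcm_gmult_neon and gcm_ghash_neon later load as their multiplier.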
ld1 {v17.2d}, [x1] // load H movi v19.16b, #0xe1 shl v19.2d, v19.2d, #57 // 0xc2.0 ext v3.16b, v17.16b, v17.16b, #8 ushr v18.2d, v19.2d, #63 dup v17.4s, v17.s[1] ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01 ushr v18.2d, v3.2d, #63 sshr v17.4s, v17.4s, #31 // broadcast carry bit and v18.16b, v18.16b, v16.16b shl v3.2d, v3.2d, #1 ext v18.16b, v18.16b, v18.16b, #8 and v16.16b, v16.16b, v17.16b orr v3.16b, v3.16b, v18.16b // H<<<=1 eor v5.16b, v3.16b, v16.16b // twisted H st1 {v5.2d}, [x0] // store Htable[0] ret .globl gcm_gmult_neon .def gcm_gmult_neon .type 32 .endef .align 4 gcm_gmult_neon: AARCH64_VALID_CALL_TARGET ld1 {v3.16b}, [x0] // load Xi ld1 {v5.1d}, [x1], #8 // load twisted H ld1 {v6.1d}, [x1] adrp x9, Lmasks // load constants add x9, x9, :lo12:Lmasks ld1 {v24.2d, v25.2d}, [x9] rev64 v3.16b, v3.16b // byteswap Xi ext v3.16b, v3.16b, v3.16b, #8 eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing mov x3, #16 b Lgmult_neon .globl gcm_ghash_neon .def gcm_ghash_neon .type 32 .endef .align 4 gcm_ghash_neon: AARCH64_VALID_CALL_TARGET ld1 {v0.16b}, [x0] // load Xi ld1 {v5.1d}, [x1], #8 // load twisted H ld1 {v6.1d}, [x1] adrp x9, Lmasks // load constants add x9, x9, :lo12:Lmasks ld1 {v24.2d, v25.2d}, [x9] rev64 v0.16b, v0.16b // byteswap Xi ext v0.16b, v0.16b, v0.16b, #8 eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing Loop_neon: ld1 {v3.16b}, [x2], #16 // load inp rev64 v3.16b, v3.16b // byteswap inp ext v3.16b, v3.16b, v3.16b, #8 eor v3.16b, v3.16b, v0.16b // inp ^= Xi Lgmult_neon: // Split the input into v3 and v4. (The upper halves are unused, // so it is okay to leave them alone.) ins v4.d[0], v3.d[1] ext v16.8b, v5.8b, v5.8b, #1 // A1 pmull v16.8h, v16.8b, v3.8b // F = A1*B ext v0.8b, v3.8b, v3.8b, #1 // B1 pmull v0.8h, v5.8b, v0.8b // E = A*B1 ext v17.8b, v5.8b, v5.8b, #2 // A2 pmull v17.8h, v17.8b, v3.8b // H = A2*B ext v19.8b, v3.8b, v3.8b, #2 // B2 pmull v19.8h, v5.8b, v19.8b // G = A*B2 ext v18.8b, v5.8b, v5.8b, #3 // A3 eor v16.16b, v16.16b, v0.16b // L = E + F pmull v18.8h, v18.8b, v3.8b // J = A3*B ext v0.8b, v3.8b, v3.8b, #3 // B3 eor v17.16b, v17.16b, v19.16b // M = G + H pmull v0.8h, v5.8b, v0.8b // I = A*B3 // Here we diverge from the 32-bit version. It computes the following // (instructions reordered for clarity): // // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) // vand $t0#hi, $t0#hi, $k48 // veor $t0#lo, $t0#lo, $t0#hi // // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) // vand $t1#hi, $t1#hi, $k32 // veor $t1#lo, $t1#lo, $t1#hi // // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) // vand $t2#hi, $t2#hi, $k16 // veor $t2#lo, $t2#lo, $t2#hi // // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) // vmov.i64 $t3#hi, #0 // // $kN is a mask with the bottom N bits set. AArch64 cannot compute on // upper halves of SIMD registers, so we must split each half into // separate registers. To compensate, we pair computations up and // parallelize. ext v19.8b, v3.8b, v3.8b, #4 // B4 eor v18.16b, v18.16b, v0.16b // N = I + J pmull v19.8h, v5.8b, v19.8b // K = A*B4 // This can probably be scheduled more efficiently. For now, we just // pair up independent instructions. 
zip1 v20.2d, v16.2d, v17.2d zip1 v22.2d, v18.2d, v19.2d zip2 v21.2d, v16.2d, v17.2d zip2 v23.2d, v18.2d, v19.2d eor v20.16b, v20.16b, v21.16b eor v22.16b, v22.16b, v23.16b and v21.16b, v21.16b, v24.16b and v23.16b, v23.16b, v25.16b eor v20.16b, v20.16b, v21.16b eor v22.16b, v22.16b, v23.16b zip1 v16.2d, v20.2d, v21.2d zip1 v18.2d, v22.2d, v23.2d zip2 v17.2d, v20.2d, v21.2d zip2 v19.2d, v22.2d, v23.2d ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 pmull v0.8h, v5.8b, v3.8b // D = A*B ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 eor v16.16b, v16.16b, v17.16b eor v18.16b, v18.16b, v19.16b eor v0.16b, v0.16b, v16.16b eor v0.16b, v0.16b, v18.16b eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing ext v16.8b, v7.8b, v7.8b, #1 // A1 pmull v16.8h, v16.8b, v3.8b // F = A1*B ext v1.8b, v3.8b, v3.8b, #1 // B1 pmull v1.8h, v7.8b, v1.8b // E = A*B1 ext v17.8b, v7.8b, v7.8b, #2 // A2 pmull v17.8h, v17.8b, v3.8b // H = A2*B ext v19.8b, v3.8b, v3.8b, #2 // B2 pmull v19.8h, v7.8b, v19.8b // G = A*B2 ext v18.8b, v7.8b, v7.8b, #3 // A3 eor v16.16b, v16.16b, v1.16b // L = E + F pmull v18.8h, v18.8b, v3.8b // J = A3*B ext v1.8b, v3.8b, v3.8b, #3 // B3 eor v17.16b, v17.16b, v19.16b // M = G + H pmull v1.8h, v7.8b, v1.8b // I = A*B3 // Here we diverge from the 32-bit version. It computes the following // (instructions reordered for clarity): // // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) // vand $t0#hi, $t0#hi, $k48 // veor $t0#lo, $t0#lo, $t0#hi // // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) // vand $t1#hi, $t1#hi, $k32 // veor $t1#lo, $t1#lo, $t1#hi // // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) // vand $t2#hi, $t2#hi, $k16 // veor $t2#lo, $t2#lo, $t2#hi // // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) // vmov.i64 $t3#hi, #0 // // $kN is a mask with the bottom N bits set. AArch64 cannot compute on // upper halves of SIMD registers, so we must split each half into // separate registers. To compensate, we pair computations up and // parallelize. ext v19.8b, v3.8b, v3.8b, #4 // B4 eor v18.16b, v18.16b, v1.16b // N = I + J pmull v19.8h, v7.8b, v19.8b // K = A*B4 // This can probably be scheduled more efficiently. For now, we just // pair up independent instructions. 
zip1 v20.2d, v16.2d, v17.2d zip1 v22.2d, v18.2d, v19.2d zip2 v21.2d, v16.2d, v17.2d zip2 v23.2d, v18.2d, v19.2d eor v20.16b, v20.16b, v21.16b eor v22.16b, v22.16b, v23.16b and v21.16b, v21.16b, v24.16b and v23.16b, v23.16b, v25.16b eor v20.16b, v20.16b, v21.16b eor v22.16b, v22.16b, v23.16b zip1 v16.2d, v20.2d, v21.2d zip1 v18.2d, v22.2d, v23.2d zip2 v17.2d, v20.2d, v21.2d zip2 v19.2d, v22.2d, v23.2d ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 pmull v1.8h, v7.8b, v3.8b // D = A*B ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 eor v16.16b, v16.16b, v17.16b eor v18.16b, v18.16b, v19.16b eor v1.16b, v1.16b, v16.16b eor v1.16b, v1.16b, v18.16b ext v16.8b, v6.8b, v6.8b, #1 // A1 pmull v16.8h, v16.8b, v4.8b // F = A1*B ext v2.8b, v4.8b, v4.8b, #1 // B1 pmull v2.8h, v6.8b, v2.8b // E = A*B1 ext v17.8b, v6.8b, v6.8b, #2 // A2 pmull v17.8h, v17.8b, v4.8b // H = A2*B ext v19.8b, v4.8b, v4.8b, #2 // B2 pmull v19.8h, v6.8b, v19.8b // G = A*B2 ext v18.8b, v6.8b, v6.8b, #3 // A3 eor v16.16b, v16.16b, v2.16b // L = E + F pmull v18.8h, v18.8b, v4.8b // J = A3*B ext v2.8b, v4.8b, v4.8b, #3 // B3 eor v17.16b, v17.16b, v19.16b // M = G + H pmull v2.8h, v6.8b, v2.8b // I = A*B3 // Here we diverge from the 32-bit version. It computes the following // (instructions reordered for clarity): // // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) // vand $t0#hi, $t0#hi, $k48 // veor $t0#lo, $t0#lo, $t0#hi // // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) // vand $t1#hi, $t1#hi, $k32 // veor $t1#lo, $t1#lo, $t1#hi // // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) // vand $t2#hi, $t2#hi, $k16 // veor $t2#lo, $t2#lo, $t2#hi // // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) // vmov.i64 $t3#hi, #0 // // $kN is a mask with the bottom N bits set. AArch64 cannot compute on // upper halves of SIMD registers, so we must split each half into // separate registers. To compensate, we pair computations up and // parallelize. ext v19.8b, v4.8b, v4.8b, #4 // B4 eor v18.16b, v18.16b, v2.16b // N = I + J pmull v19.8h, v6.8b, v19.8b // K = A*B4 // This can probably be scheduled more efficiently. For now, we just // pair up independent instructions. zip1 v20.2d, v16.2d, v17.2d zip1 v22.2d, v18.2d, v19.2d zip2 v21.2d, v16.2d, v17.2d zip2 v23.2d, v18.2d, v19.2d eor v20.16b, v20.16b, v21.16b eor v22.16b, v22.16b, v23.16b and v21.16b, v21.16b, v24.16b and v23.16b, v23.16b, v25.16b eor v20.16b, v20.16b, v21.16b eor v22.16b, v22.16b, v23.16b zip1 v16.2d, v20.2d, v21.2d zip1 v18.2d, v22.2d, v23.2d zip2 v17.2d, v20.2d, v21.2d zip2 v19.2d, v22.2d, v23.2d ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 pmull v2.8h, v6.8b, v4.8b // D = A*B ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 eor v16.16b, v16.16b, v17.16b eor v18.16b, v18.16b, v19.16b eor v2.16b, v2.16b, v16.16b eor v2.16b, v2.16b, v18.16b ext v16.16b, v0.16b, v2.16b, #8 eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing eor v1.16b, v1.16b, v2.16b eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result // This is a no-op due to the ins instruction below. 
// ins v2.d[0], v1.d[1] // equivalent of reduction_avx from ghash-x86_64.pl shl v17.2d, v0.2d, #57 // 1st phase shl v18.2d, v0.2d, #62 eor v18.16b, v18.16b, v17.16b // shl v17.2d, v0.2d, #63 eor v18.16b, v18.16b, v17.16b // // Note Xm contains {Xl.d[1], Xh.d[0]}. eor v18.16b, v18.16b, v1.16b ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0] ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1] ushr v18.2d, v0.2d, #1 // 2nd phase eor v2.16b, v2.16b,v0.16b eor v0.16b, v0.16b,v18.16b // ushr v18.2d, v18.2d, #6 ushr v0.2d, v0.2d, #1 // eor v0.16b, v0.16b, v2.16b // eor v0.16b, v0.16b, v18.16b // subs x3, x3, #16 bne Loop_neon rev64 v0.16b, v0.16b // byteswap Xi and write ext v0.16b, v0.16b, v0.16b, #8 st1 {v0.16b}, [x0] ret .section .rodata .align 4 Lmasks: .quad 0x0000ffffffffffff // k48 .quad 0x00000000ffffffff // k32 .quad 0x000000000000ffff // k16 .quad 0x0000000000000000 // k0 .byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) ring-0.17.14/pregenerated/ghash-x86-elf.S000064400000000000000000000126521046102023000160150ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) .text .globl gcm_init_clmul .hidden gcm_init_clmul .type gcm_init_clmul,@function .align 16 gcm_init_clmul: .L_gcm_init_clmul_begin: movl 4(%esp),%edx movl 8(%esp),%eax call .L000pic .L000pic: popl %ecx leal .Lbswap-.L000pic(%ecx),%ecx movdqu (%eax),%xmm2 pshufd $78,%xmm2,%xmm2 pshufd $255,%xmm2,%xmm4 movdqa %xmm2,%xmm3 psllq $1,%xmm2 pxor %xmm5,%xmm5 psrlq $63,%xmm3 pcmpgtd %xmm4,%xmm5 pslldq $8,%xmm3 por %xmm3,%xmm2 pand 16(%ecx),%xmm5 pxor %xmm5,%xmm2 movdqa %xmm2,%xmm0 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 pshufd $78,%xmm2,%xmm4 pxor %xmm0,%xmm3 pxor %xmm2,%xmm4 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,220,0 xorps %xmm0,%xmm3 xorps %xmm1,%xmm3 movdqa %xmm3,%xmm4 psrldq $8,%xmm3 pslldq $8,%xmm4 pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 movdqa %xmm0,%xmm3 pslldq $8,%xmm0 psrldq $8,%xmm3 pxor %xmm4,%xmm0 pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 pshufd $78,%xmm2,%xmm3 pshufd $78,%xmm0,%xmm4 pxor %xmm2,%xmm3 movdqu %xmm2,(%edx) pxor %xmm0,%xmm4 movdqu %xmm0,16(%edx) .byte 102,15,58,15,227,8 movdqu %xmm4,32(%edx) ret .size gcm_init_clmul,.-.L_gcm_init_clmul_begin .globl gcm_ghash_clmul .hidden gcm_ghash_clmul .type gcm_ghash_clmul,@function .align 16 gcm_ghash_clmul: .L_gcm_ghash_clmul_begin: pushl %ebp pushl %ebx pushl %esi pushl %edi movl 20(%esp),%eax movl 24(%esp),%edx movl 28(%esp),%esi movl 32(%esp),%ebx call .L001pic .L001pic: popl %ecx leal .Lbswap-.L001pic(%ecx),%ecx movdqu (%eax),%xmm0 movdqa (%ecx),%xmm5 movdqu (%edx),%xmm2 .byte 102,15,56,0,197 subl $16,%ebx jz .L002odd_tail movdqu (%esi),%xmm3 movdqu 16(%esi),%xmm6 .byte 102,15,56,0,221 .byte 102,15,56,0,245 movdqu 32(%edx),%xmm5 pxor %xmm3,%xmm0 pshufd $78,%xmm6,%xmm3 movdqa %xmm6,%xmm7 pxor %xmm6,%xmm3 leal 32(%esi),%esi .byte 102,15,58,68,242,0 .byte 102,15,58,68,250,17 .byte 
102,15,58,68,221,0 movups 16(%edx),%xmm2 nop subl $32,%ebx jbe .L003even_tail jmp .L004mod_loop .align 32 .L004mod_loop: pshufd $78,%xmm0,%xmm4 movdqa %xmm0,%xmm1 pxor %xmm0,%xmm4 nop .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,229,16 movups (%edx),%xmm2 xorps %xmm6,%xmm0 movdqa (%ecx),%xmm5 xorps %xmm7,%xmm1 movdqu (%esi),%xmm7 pxor %xmm0,%xmm3 movdqu 16(%esi),%xmm6 pxor %xmm1,%xmm3 .byte 102,15,56,0,253 pxor %xmm3,%xmm4 movdqa %xmm4,%xmm3 psrldq $8,%xmm4 pslldq $8,%xmm3 pxor %xmm4,%xmm1 pxor %xmm3,%xmm0 .byte 102,15,56,0,245 pxor %xmm7,%xmm1 movdqa %xmm6,%xmm7 movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 .byte 102,15,58,68,242,0 movups 32(%edx),%xmm5 psllq $57,%xmm0 movdqa %xmm0,%xmm3 pslldq $8,%xmm0 psrldq $8,%xmm3 pxor %xmm4,%xmm0 pxor %xmm3,%xmm1 pshufd $78,%xmm7,%xmm3 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 pxor %xmm7,%xmm3 pxor %xmm4,%xmm1 .byte 102,15,58,68,250,17 movups 16(%edx),%xmm2 pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 .byte 102,15,58,68,221,0 leal 32(%esi),%esi subl $32,%ebx ja .L004mod_loop .L003even_tail: pshufd $78,%xmm0,%xmm4 movdqa %xmm0,%xmm1 pxor %xmm0,%xmm4 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,229,16 movdqa (%ecx),%xmm5 xorps %xmm6,%xmm0 xorps %xmm7,%xmm1 pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 pxor %xmm3,%xmm4 movdqa %xmm4,%xmm3 psrldq $8,%xmm4 pslldq $8,%xmm3 pxor %xmm4,%xmm1 pxor %xmm3,%xmm0 movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 movdqa %xmm0,%xmm3 pslldq $8,%xmm0 psrldq $8,%xmm3 pxor %xmm4,%xmm0 pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 testl %ebx,%ebx jnz .L005done movups (%edx),%xmm2 .L002odd_tail: movdqu (%esi),%xmm3 .byte 102,15,56,0,221 pxor %xmm3,%xmm0 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 pshufd $78,%xmm2,%xmm4 pxor %xmm0,%xmm3 pxor %xmm2,%xmm4 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,220,0 xorps %xmm0,%xmm3 xorps %xmm1,%xmm3 movdqa %xmm3,%xmm4 psrldq $8,%xmm3 pslldq $8,%xmm4 pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 movdqa %xmm0,%xmm3 pslldq $8,%xmm0 psrldq $8,%xmm3 pxor %xmm4,%xmm0 pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 .L005done: .byte 102,15,56,0,197 movdqu %xmm0,(%eax) popl %edi popl %esi popl %ebx popl %ebp ret .size gcm_ghash_clmul,.-.L_gcm_ghash_clmul_begin .align 64 .Lbswap: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194 .byte 71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67 .byte 82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112 .byte 112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62 .byte 0 #endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) ring-0.17.14/pregenerated/ghash-x86-win32n.asm000064400000000000000000000116741046102023000167500ustar 00000000000000; This file is generated from a similarly-named Perl script in the BoringSSL ; source tree. Do not edit by hand. 
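; Informal note on the directives that follow: the %include pulls in
; ring's generated symbol-prefix definitions, which appears to be why
; the routines in the assembled object are named
; _ring_core_0_17_14__gcm_init_clmul and
; _ring_core_0_17_14__gcm_ghash_clmul rather than their bare names,
; and the %ifidn guard restricts the real code to the win32 output
; format (the %else branch only emits a ret to work around the NASM
; bug referenced there).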
%include "ring_core_generated/prefix_symbols_nasm.inc" %ifidn __OUTPUT_FORMAT__, win32 %ifidn __OUTPUT_FORMAT__,obj section code use32 class=code align=64 %elifidn __OUTPUT_FORMAT__,win32 $@feat.00 equ 1 section .text code align=64 %else section .text code %endif global _gcm_init_clmul align 16 _gcm_init_clmul: L$_gcm_init_clmul_begin: mov edx,DWORD [4+esp] mov eax,DWORD [8+esp] call L$000pic L$000pic: pop ecx lea ecx,[(L$bswap-L$000pic)+ecx] movdqu xmm2,[eax] pshufd xmm2,xmm2,78 pshufd xmm4,xmm2,255 movdqa xmm3,xmm2 psllq xmm2,1 pxor xmm5,xmm5 psrlq xmm3,63 pcmpgtd xmm5,xmm4 pslldq xmm3,8 por xmm2,xmm3 pand xmm5,[16+ecx] pxor xmm2,xmm5 movdqa xmm0,xmm2 movdqa xmm1,xmm0 pshufd xmm3,xmm0,78 pshufd xmm4,xmm2,78 pxor xmm3,xmm0 pxor xmm4,xmm2 db 102,15,58,68,194,0 db 102,15,58,68,202,17 db 102,15,58,68,220,0 xorps xmm3,xmm0 xorps xmm3,xmm1 movdqa xmm4,xmm3 psrldq xmm3,8 pslldq xmm4,8 pxor xmm1,xmm3 pxor xmm0,xmm4 movdqa xmm4,xmm0 movdqa xmm3,xmm0 psllq xmm0,5 pxor xmm3,xmm0 psllq xmm0,1 pxor xmm0,xmm3 psllq xmm0,57 movdqa xmm3,xmm0 pslldq xmm0,8 psrldq xmm3,8 pxor xmm0,xmm4 pxor xmm1,xmm3 movdqa xmm4,xmm0 psrlq xmm0,1 pxor xmm1,xmm4 pxor xmm4,xmm0 psrlq xmm0,5 pxor xmm0,xmm4 psrlq xmm0,1 pxor xmm0,xmm1 pshufd xmm3,xmm2,78 pshufd xmm4,xmm0,78 pxor xmm3,xmm2 movdqu [edx],xmm2 pxor xmm4,xmm0 movdqu [16+edx],xmm0 db 102,15,58,15,227,8 movdqu [32+edx],xmm4 ret global _gcm_ghash_clmul align 16 _gcm_ghash_clmul: L$_gcm_ghash_clmul_begin: push ebp push ebx push esi push edi mov eax,DWORD [20+esp] mov edx,DWORD [24+esp] mov esi,DWORD [28+esp] mov ebx,DWORD [32+esp] call L$001pic L$001pic: pop ecx lea ecx,[(L$bswap-L$001pic)+ecx] movdqu xmm0,[eax] movdqa xmm5,[ecx] movdqu xmm2,[edx] db 102,15,56,0,197 sub ebx,16 jz NEAR L$002odd_tail movdqu xmm3,[esi] movdqu xmm6,[16+esi] db 102,15,56,0,221 db 102,15,56,0,245 movdqu xmm5,[32+edx] pxor xmm0,xmm3 pshufd xmm3,xmm6,78 movdqa xmm7,xmm6 pxor xmm3,xmm6 lea esi,[32+esi] db 102,15,58,68,242,0 db 102,15,58,68,250,17 db 102,15,58,68,221,0 movups xmm2,[16+edx] nop sub ebx,32 jbe NEAR L$003even_tail jmp NEAR L$004mod_loop align 32 L$004mod_loop: pshufd xmm4,xmm0,78 movdqa xmm1,xmm0 pxor xmm4,xmm0 nop db 102,15,58,68,194,0 db 102,15,58,68,202,17 db 102,15,58,68,229,16 movups xmm2,[edx] xorps xmm0,xmm6 movdqa xmm5,[ecx] xorps xmm1,xmm7 movdqu xmm7,[esi] pxor xmm3,xmm0 movdqu xmm6,[16+esi] pxor xmm3,xmm1 db 102,15,56,0,253 pxor xmm4,xmm3 movdqa xmm3,xmm4 psrldq xmm4,8 pslldq xmm3,8 pxor xmm1,xmm4 pxor xmm0,xmm3 db 102,15,56,0,245 pxor xmm1,xmm7 movdqa xmm7,xmm6 movdqa xmm4,xmm0 movdqa xmm3,xmm0 psllq xmm0,5 pxor xmm3,xmm0 psllq xmm0,1 pxor xmm0,xmm3 db 102,15,58,68,242,0 movups xmm5,[32+edx] psllq xmm0,57 movdqa xmm3,xmm0 pslldq xmm0,8 psrldq xmm3,8 pxor xmm0,xmm4 pxor xmm1,xmm3 pshufd xmm3,xmm7,78 movdqa xmm4,xmm0 psrlq xmm0,1 pxor xmm3,xmm7 pxor xmm1,xmm4 db 102,15,58,68,250,17 movups xmm2,[16+edx] pxor xmm4,xmm0 psrlq xmm0,5 pxor xmm0,xmm4 psrlq xmm0,1 pxor xmm0,xmm1 db 102,15,58,68,221,0 lea esi,[32+esi] sub ebx,32 ja NEAR L$004mod_loop L$003even_tail: pshufd xmm4,xmm0,78 movdqa xmm1,xmm0 pxor xmm4,xmm0 db 102,15,58,68,194,0 db 102,15,58,68,202,17 db 102,15,58,68,229,16 movdqa xmm5,[ecx] xorps xmm0,xmm6 xorps xmm1,xmm7 pxor xmm3,xmm0 pxor xmm3,xmm1 pxor xmm4,xmm3 movdqa xmm3,xmm4 psrldq xmm4,8 pslldq xmm3,8 pxor xmm1,xmm4 pxor xmm0,xmm3 movdqa xmm4,xmm0 movdqa xmm3,xmm0 psllq xmm0,5 pxor xmm3,xmm0 psllq xmm0,1 pxor xmm0,xmm3 psllq xmm0,57 movdqa xmm3,xmm0 pslldq xmm0,8 psrldq xmm3,8 pxor xmm0,xmm4 pxor xmm1,xmm3 movdqa xmm4,xmm0 psrlq xmm0,1 pxor xmm1,xmm4 pxor 
xmm4,xmm0 psrlq xmm0,5 pxor xmm0,xmm4 psrlq xmm0,1 pxor xmm0,xmm1 test ebx,ebx jnz NEAR L$005done movups xmm2,[edx] L$002odd_tail: movdqu xmm3,[esi] db 102,15,56,0,221 pxor xmm0,xmm3 movdqa xmm1,xmm0 pshufd xmm3,xmm0,78 pshufd xmm4,xmm2,78 pxor xmm3,xmm0 pxor xmm4,xmm2 db 102,15,58,68,194,0 db 102,15,58,68,202,17 db 102,15,58,68,220,0 xorps xmm3,xmm0 xorps xmm3,xmm1 movdqa xmm4,xmm3 psrldq xmm3,8 pslldq xmm4,8 pxor xmm1,xmm3 pxor xmm0,xmm4 movdqa xmm4,xmm0 movdqa xmm3,xmm0 psllq xmm0,5 pxor xmm3,xmm0 psllq xmm0,1 pxor xmm0,xmm3 psllq xmm0,57 movdqa xmm3,xmm0 pslldq xmm0,8 psrldq xmm3,8 pxor xmm0,xmm4 pxor xmm1,xmm3 movdqa xmm4,xmm0 psrlq xmm0,1 pxor xmm1,xmm4 pxor xmm4,xmm0 psrlq xmm0,5 pxor xmm0,xmm4 psrlq xmm0,1 pxor xmm0,xmm1 L$005done: db 102,15,56,0,197 movdqu [eax],xmm0 pop edi pop esi pop ebx pop ebp ret align 64 L$bswap: db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 db 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194 db 71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67 db 82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112 db 112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62 db 0 %else ; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 ret %endif ring-0.17.14/pregenerated/ghash-x86-win32n.o000064400000000000000000000111341046102023000164150ustar 00000000000000Lg1.debug$S x @B.debug$T8h @B.text 1 p`5C:\Users\b\p\ring\pregenerated\ghash-x86-win32n.asm},~f/ "&+/48 =!A"F#J$N%R&W'\(`)d*j+p,v-y.|/0123456789:;<=>?@ABCDEFGHIJKLMN OPR U!V"W#X$Y(Z,[0\4]9_:`@aDbHcLdQeTfZg^hcihjmkrlvm{nopqrstuvwxy{|}~ "&*.27;@DJNSW\aeinrw{  $)-15:>BGKPTV\_chlpuz~   @ P `p8C:\Users\b\p\ring\pregenerated\ghash-x86-win32n.o4'The Netwide Assembler 2.13.03,_ring_core_0_17_14__gcm_init_clmul!L$_gcm_init_clmul_beginL$000pic-_ring_core_0_17_14__gcm_ghash_clmul"L$_gcm_ghash_clmul_beginL$001picL$004mod_loopL$003even_tailL$002odd_tailL$005doneL$bswapl p         D  H  h  l  |           T$D$Y3ofpNfpfofsffs?fffsffiffofofpNfpNfff:Df:Df:DWWfofsfsfffofofsffsffs9fofsfsfffofsfffsffsffpNfpNffBf:b ÐUSVWD$T$t$\$ Yofo)of8Ńoovf8f8oj ffpNfofލv f:Df:Df:DR  fpNfoff:Df:Df:DWfo)Wo>fovff8ffofsfsfff8ffofofofsffsff:Dj fs9fofsfsfffpNfofsfff:DRffsffsff:Dv fpNfoff:Df:Df:Dfo)WWffffofsfsfffofofsffsffs9fofsfsfffofsfffsffsfof8ffofpNfpNfff:Df:Df:DWWfofsfsfffofofsffsffs9fofsfsfffofsfffsffsff8_^[]Ð GHASH for x86, CRYPTOGAMS by .filegC:\Users\b\p\ring\.debug$S .debug$T8.text.absolut@feat.00'L$000pic ? c L$001pic9|_L$bswap@_ring_core_0_17_14__gcm_init_clmulL$_gcm_init_clmul_begin_ring_core_0_17_14__gcm_ghash_clmulL$_gcm_ghash_clmul_beginL$004mod_loopL$003even_tailL$002odd_tailL$005donering-0.17.14/pregenerated/ghash-x86_64-elf.S000064400000000000000000000527111046102023000163260ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
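// Informal overview of this generated file (derived from the code
// below, not from the generating script): gcm_init_clmul and
// gcm_init_avx precompute powers of the hash key H (plus xor-ed halves
// for the Karatsuba middle terms) into Htable; gcm_ghash_clmul folds
// up to four 16-byte blocks per .Lmod4_loop iteration (subq $0x40,%rcx)
// and gcm_ghash_avx up to eight per .Loop8x_avx iteration (subq
// $0x80,%rcx). The .byte sequences are pre-encoded instructions:
// 102,15,58,68,... is pclmulqdq (66 0F 3A 44) and 102,15,56,0,... is
// pshufb (66 0F 38 00).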
#include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) .text .globl gcm_init_clmul .hidden gcm_init_clmul .type gcm_init_clmul,@function .align 16 gcm_init_clmul: .cfi_startproc _CET_ENDBR .L_init_clmul: movdqu (%rsi),%xmm2 pshufd $78,%xmm2,%xmm2 pshufd $255,%xmm2,%xmm4 movdqa %xmm2,%xmm3 psllq $1,%xmm2 pxor %xmm5,%xmm5 psrlq $63,%xmm3 pcmpgtd %xmm4,%xmm5 pslldq $8,%xmm3 por %xmm3,%xmm2 pand .L0x1c2_polynomial(%rip),%xmm5 pxor %xmm5,%xmm2 pshufd $78,%xmm2,%xmm6 movdqa %xmm2,%xmm0 pxor %xmm2,%xmm6 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,222,0 pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 movdqa %xmm3,%xmm4 psrldq $8,%xmm3 pslldq $8,%xmm4 pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 movdqa %xmm0,%xmm3 pslldq $8,%xmm0 psrldq $8,%xmm3 pxor %xmm4,%xmm0 pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 pshufd $78,%xmm2,%xmm3 pshufd $78,%xmm0,%xmm4 pxor %xmm2,%xmm3 movdqu %xmm2,0(%rdi) pxor %xmm0,%xmm4 movdqu %xmm0,16(%rdi) .byte 102,15,58,15,227,8 movdqu %xmm4,32(%rdi) movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,222,0 pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 movdqa %xmm3,%xmm4 psrldq $8,%xmm3 pslldq $8,%xmm4 pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 movdqa %xmm0,%xmm3 pslldq $8,%xmm0 psrldq $8,%xmm3 pxor %xmm4,%xmm0 pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 movdqa %xmm0,%xmm5 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,222,0 pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 movdqa %xmm3,%xmm4 psrldq $8,%xmm3 pslldq $8,%xmm4 pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 movdqa %xmm0,%xmm3 pslldq $8,%xmm0 psrldq $8,%xmm3 pxor %xmm4,%xmm0 pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 pshufd $78,%xmm5,%xmm3 pshufd $78,%xmm0,%xmm4 pxor %xmm5,%xmm3 movdqu %xmm5,48(%rdi) pxor %xmm0,%xmm4 movdqu %xmm0,64(%rdi) .byte 102,15,58,15,227,8 movdqu %xmm4,80(%rdi) ret .cfi_endproc .size gcm_init_clmul,.-gcm_init_clmul .globl gcm_ghash_clmul .hidden gcm_ghash_clmul .type gcm_ghash_clmul,@function .align 32 gcm_ghash_clmul: .cfi_startproc _CET_ENDBR .L_ghash_clmul: movdqa .Lbswap_mask(%rip),%xmm10 movdqu (%rdi),%xmm0 movdqu (%rsi),%xmm2 movdqu 32(%rsi),%xmm7 .byte 102,65,15,56,0,194 subq $0x10,%rcx jz .Lodd_tail movdqu 16(%rsi),%xmm6 cmpq $0x30,%rcx jb .Lskip4x subq $0x30,%rcx movq $0xA040608020C0E000,%rax movdqu 48(%rsi),%xmm14 movdqu 64(%rsi),%xmm15 movdqu 48(%rdx),%xmm3 movdqu 32(%rdx),%xmm11 .byte 102,65,15,56,0,218 .byte 102,69,15,56,0,218 movdqa %xmm3,%xmm5 pshufd $78,%xmm3,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,68,218,0 .byte 102,15,58,68,234,17 .byte 102,15,58,68,231,0 movdqa %xmm11,%xmm13 pshufd $78,%xmm11,%xmm12 pxor %xmm11,%xmm12 .byte 102,68,15,58,68,222,0 .byte 102,68,15,58,68,238,17 .byte 102,68,15,58,68,231,16 xorps %xmm11,%xmm3 
xorps %xmm13,%xmm5 movups 80(%rsi),%xmm7 xorps %xmm12,%xmm4 movdqu 16(%rdx),%xmm11 movdqu 0(%rdx),%xmm8 .byte 102,69,15,56,0,218 .byte 102,69,15,56,0,194 movdqa %xmm11,%xmm13 pshufd $78,%xmm11,%xmm12 pxor %xmm8,%xmm0 pxor %xmm11,%xmm12 .byte 102,69,15,58,68,222,0 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm8 pxor %xmm0,%xmm8 .byte 102,69,15,58,68,238,17 .byte 102,68,15,58,68,231,0 xorps %xmm11,%xmm3 xorps %xmm13,%xmm5 leaq 64(%rdx),%rdx subq $0x40,%rcx jc .Ltail4x jmp .Lmod4_loop .align 32 .Lmod4_loop: .byte 102,65,15,58,68,199,0 xorps %xmm12,%xmm4 movdqu 48(%rdx),%xmm11 .byte 102,69,15,56,0,218 .byte 102,65,15,58,68,207,17 xorps %xmm3,%xmm0 movdqu 32(%rdx),%xmm3 movdqa %xmm11,%xmm13 .byte 102,68,15,58,68,199,16 pshufd $78,%xmm11,%xmm12 xorps %xmm5,%xmm1 pxor %xmm11,%xmm12 .byte 102,65,15,56,0,218 movups 32(%rsi),%xmm7 xorps %xmm4,%xmm8 .byte 102,68,15,58,68,218,0 pshufd $78,%xmm3,%xmm4 pxor %xmm0,%xmm8 movdqa %xmm3,%xmm5 pxor %xmm1,%xmm8 pxor %xmm3,%xmm4 movdqa %xmm8,%xmm9 .byte 102,68,15,58,68,234,17 pslldq $8,%xmm8 psrldq $8,%xmm9 pxor %xmm8,%xmm0 movdqa .L7_mask(%rip),%xmm8 pxor %xmm9,%xmm1 .byte 102,76,15,110,200 pand %xmm0,%xmm8 .byte 102,69,15,56,0,200 pxor %xmm0,%xmm9 .byte 102,68,15,58,68,231,0 psllq $57,%xmm9 movdqa %xmm9,%xmm8 pslldq $8,%xmm9 .byte 102,15,58,68,222,0 psrldq $8,%xmm8 pxor %xmm9,%xmm0 pxor %xmm8,%xmm1 movdqu 0(%rdx),%xmm8 movdqa %xmm0,%xmm9 psrlq $1,%xmm0 .byte 102,15,58,68,238,17 xorps %xmm11,%xmm3 movdqu 16(%rdx),%xmm11 .byte 102,69,15,56,0,218 .byte 102,15,58,68,231,16 xorps %xmm13,%xmm5 movups 80(%rsi),%xmm7 .byte 102,69,15,56,0,194 pxor %xmm9,%xmm1 pxor %xmm0,%xmm9 psrlq $5,%xmm0 movdqa %xmm11,%xmm13 pxor %xmm12,%xmm4 pshufd $78,%xmm11,%xmm12 pxor %xmm9,%xmm0 pxor %xmm8,%xmm1 pxor %xmm11,%xmm12 .byte 102,69,15,58,68,222,0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 movdqa %xmm0,%xmm1 .byte 102,69,15,58,68,238,17 xorps %xmm11,%xmm3 pshufd $78,%xmm0,%xmm8 pxor %xmm0,%xmm8 .byte 102,68,15,58,68,231,0 xorps %xmm13,%xmm5 leaq 64(%rdx),%rdx subq $0x40,%rcx jnc .Lmod4_loop .Ltail4x: .byte 102,65,15,58,68,199,0 .byte 102,65,15,58,68,207,17 .byte 102,68,15,58,68,199,16 xorps %xmm12,%xmm4 xorps %xmm3,%xmm0 xorps %xmm5,%xmm1 pxor %xmm0,%xmm1 pxor %xmm4,%xmm8 pxor %xmm1,%xmm8 pxor %xmm0,%xmm1 movdqa %xmm8,%xmm9 psrldq $8,%xmm8 pslldq $8,%xmm9 pxor %xmm8,%xmm1 pxor %xmm9,%xmm0 movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 movdqa %xmm0,%xmm3 pslldq $8,%xmm0 psrldq $8,%xmm3 pxor %xmm4,%xmm0 pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 addq $0x40,%rcx jz .Ldone movdqu 32(%rsi),%xmm7 subq $0x10,%rcx jz .Lodd_tail .Lskip4x: movdqu (%rdx),%xmm8 movdqu 16(%rdx),%xmm3 .byte 102,69,15,56,0,194 .byte 102,65,15,56,0,218 pxor %xmm8,%xmm0 movdqa %xmm3,%xmm5 pshufd $78,%xmm3,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,68,218,0 .byte 102,15,58,68,234,17 .byte 102,15,58,68,231,0 leaq 32(%rdx),%rdx nop subq $0x20,%rcx jbe .Leven_tail nop jmp .Lmod_loop .align 32 .Lmod_loop: movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm8 pshufd $78,%xmm0,%xmm4 pxor %xmm0,%xmm4 .byte 102,15,58,68,198,0 .byte 102,15,58,68,206,17 .byte 102,15,58,68,231,16 pxor %xmm3,%xmm0 pxor %xmm5,%xmm1 movdqu (%rdx),%xmm9 pxor %xmm0,%xmm8 .byte 102,69,15,56,0,202 movdqu 16(%rdx),%xmm3 pxor %xmm1,%xmm8 pxor %xmm9,%xmm1 pxor %xmm8,%xmm4 .byte 102,65,15,56,0,218 movdqa %xmm4,%xmm8 psrldq $8,%xmm8 pslldq $8,%xmm4 pxor %xmm8,%xmm1 pxor %xmm4,%xmm0 movdqa %xmm3,%xmm5 movdqa %xmm0,%xmm9 
movdqa %xmm0,%xmm8 psllq $5,%xmm0 pxor %xmm0,%xmm8 .byte 102,15,58,68,218,0 psllq $1,%xmm0 pxor %xmm8,%xmm0 psllq $57,%xmm0 movdqa %xmm0,%xmm8 pslldq $8,%xmm0 psrldq $8,%xmm8 pxor %xmm9,%xmm0 pshufd $78,%xmm5,%xmm4 pxor %xmm8,%xmm1 pxor %xmm5,%xmm4 movdqa %xmm0,%xmm9 psrlq $1,%xmm0 .byte 102,15,58,68,234,17 pxor %xmm9,%xmm1 pxor %xmm0,%xmm9 psrlq $5,%xmm0 pxor %xmm9,%xmm0 leaq 32(%rdx),%rdx psrlq $1,%xmm0 .byte 102,15,58,68,231,0 pxor %xmm1,%xmm0 subq $0x20,%rcx ja .Lmod_loop .Leven_tail: movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm8 pshufd $78,%xmm0,%xmm4 pxor %xmm0,%xmm4 .byte 102,15,58,68,198,0 .byte 102,15,58,68,206,17 .byte 102,15,58,68,231,16 pxor %xmm3,%xmm0 pxor %xmm5,%xmm1 pxor %xmm0,%xmm8 pxor %xmm1,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm8 psrldq $8,%xmm8 pslldq $8,%xmm4 pxor %xmm8,%xmm1 pxor %xmm4,%xmm0 movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 movdqa %xmm0,%xmm3 pslldq $8,%xmm0 psrldq $8,%xmm3 pxor %xmm4,%xmm0 pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 testq %rcx,%rcx jnz .Ldone .Lodd_tail: movdqu (%rdx),%xmm8 .byte 102,69,15,56,0,194 pxor %xmm8,%xmm0 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,223,0 pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 movdqa %xmm3,%xmm4 psrldq $8,%xmm3 pslldq $8,%xmm4 pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 movdqa %xmm0,%xmm3 pslldq $8,%xmm0 psrldq $8,%xmm3 pxor %xmm4,%xmm0 pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 .Ldone: .byte 102,65,15,56,0,194 movdqu %xmm0,(%rdi) ret .cfi_endproc .size gcm_ghash_clmul,.-gcm_ghash_clmul .globl gcm_init_avx .hidden gcm_init_avx .type gcm_init_avx,@function .align 32 gcm_init_avx: .cfi_startproc _CET_ENDBR vzeroupper vmovdqu (%rsi),%xmm2 vpshufd $78,%xmm2,%xmm2 vpshufd $255,%xmm2,%xmm4 vpsrlq $63,%xmm2,%xmm3 vpsllq $1,%xmm2,%xmm2 vpxor %xmm5,%xmm5,%xmm5 vpcmpgtd %xmm4,%xmm5,%xmm5 vpslldq $8,%xmm3,%xmm3 vpor %xmm3,%xmm2,%xmm2 vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5 vpxor %xmm5,%xmm2,%xmm2 vpunpckhqdq %xmm2,%xmm2,%xmm6 vmovdqa %xmm2,%xmm0 vpxor %xmm2,%xmm6,%xmm6 movq $4,%r10 jmp .Linit_start_avx .align 32 .Linit_loop_avx: vpalignr $8,%xmm3,%xmm4,%xmm5 vmovdqu %xmm5,-16(%rdi) vpunpckhqdq %xmm0,%xmm0,%xmm3 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 vpxor %xmm0,%xmm1,%xmm4 vpxor %xmm4,%xmm3,%xmm3 vpslldq $8,%xmm3,%xmm4 vpsrldq $8,%xmm3,%xmm3 vpxor %xmm4,%xmm0,%xmm0 vpxor %xmm3,%xmm1,%xmm1 vpsllq $57,%xmm0,%xmm3 vpsllq $62,%xmm0,%xmm4 vpxor %xmm3,%xmm4,%xmm4 vpsllq $63,%xmm0,%xmm3 vpxor %xmm3,%xmm4,%xmm4 vpslldq $8,%xmm4,%xmm3 vpsrldq $8,%xmm4,%xmm4 vpxor %xmm3,%xmm0,%xmm0 vpxor %xmm4,%xmm1,%xmm1 vpsrlq $1,%xmm0,%xmm4 vpxor %xmm0,%xmm1,%xmm1 vpxor %xmm4,%xmm0,%xmm0 vpsrlq $5,%xmm4,%xmm4 vpxor %xmm4,%xmm0,%xmm0 vpsrlq $1,%xmm0,%xmm0 vpxor %xmm1,%xmm0,%xmm0 .Linit_start_avx: vmovdqa %xmm0,%xmm5 vpunpckhqdq %xmm0,%xmm0,%xmm3 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 vpxor %xmm0,%xmm1,%xmm4 vpxor %xmm4,%xmm3,%xmm3 vpslldq $8,%xmm3,%xmm4 vpsrldq $8,%xmm3,%xmm3 vpxor %xmm4,%xmm0,%xmm0 vpxor 
%xmm3,%xmm1,%xmm1 vpsllq $57,%xmm0,%xmm3 vpsllq $62,%xmm0,%xmm4 vpxor %xmm3,%xmm4,%xmm4 vpsllq $63,%xmm0,%xmm3 vpxor %xmm3,%xmm4,%xmm4 vpslldq $8,%xmm4,%xmm3 vpsrldq $8,%xmm4,%xmm4 vpxor %xmm3,%xmm0,%xmm0 vpxor %xmm4,%xmm1,%xmm1 vpsrlq $1,%xmm0,%xmm4 vpxor %xmm0,%xmm1,%xmm1 vpxor %xmm4,%xmm0,%xmm0 vpsrlq $5,%xmm4,%xmm4 vpxor %xmm4,%xmm0,%xmm0 vpsrlq $1,%xmm0,%xmm0 vpxor %xmm1,%xmm0,%xmm0 vpshufd $78,%xmm5,%xmm3 vpshufd $78,%xmm0,%xmm4 vpxor %xmm5,%xmm3,%xmm3 vmovdqu %xmm5,0(%rdi) vpxor %xmm0,%xmm4,%xmm4 vmovdqu %xmm0,16(%rdi) leaq 48(%rdi),%rdi subq $1,%r10 jnz .Linit_loop_avx vpalignr $8,%xmm4,%xmm3,%xmm5 vmovdqu %xmm5,-16(%rdi) vzeroupper ret .cfi_endproc .size gcm_init_avx,.-gcm_init_avx .globl gcm_ghash_avx .hidden gcm_ghash_avx .type gcm_ghash_avx,@function .align 32 gcm_ghash_avx: .cfi_startproc _CET_ENDBR vzeroupper vmovdqu (%rdi),%xmm10 leaq .L0x1c2_polynomial(%rip),%r10 leaq 64(%rsi),%rsi vmovdqu .Lbswap_mask(%rip),%xmm13 vpshufb %xmm13,%xmm10,%xmm10 cmpq $0x80,%rcx jb .Lshort_avx subq $0x80,%rcx vmovdqu 112(%rdx),%xmm14 vmovdqu 0-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm14 vmovdqu 32-64(%rsi),%xmm7 vpunpckhqdq %xmm14,%xmm14,%xmm9 vmovdqu 96(%rdx),%xmm15 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpxor %xmm14,%xmm9,%xmm9 vpshufb %xmm13,%xmm15,%xmm15 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 16-64(%rsi),%xmm6 vpunpckhqdq %xmm15,%xmm15,%xmm8 vmovdqu 80(%rdx),%xmm14 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vpxor %xmm15,%xmm8,%xmm8 vpshufb %xmm13,%xmm14,%xmm14 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vmovdqu 48-64(%rsi),%xmm6 vpxor %xmm14,%xmm9,%xmm9 vmovdqu 64(%rdx),%xmm15 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 80-64(%rsi),%xmm7 vpshufb %xmm13,%xmm15,%xmm15 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpxor %xmm1,%xmm4,%xmm4 vpunpckhqdq %xmm15,%xmm15,%xmm8 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 64-64(%rsi),%xmm6 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vpxor %xmm15,%xmm8,%xmm8 vmovdqu 48(%rdx),%xmm14 vpxor %xmm3,%xmm0,%xmm0 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 vpxor %xmm4,%xmm1,%xmm1 vpshufb %xmm13,%xmm14,%xmm14 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vmovdqu 96-64(%rsi),%xmm6 vpxor %xmm5,%xmm2,%xmm2 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 128-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vmovdqu 32(%rdx),%xmm15 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpxor %xmm1,%xmm4,%xmm4 vpshufb %xmm13,%xmm15,%xmm15 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 112-64(%rsi),%xmm6 vpxor %xmm2,%xmm5,%xmm5 vpunpckhqdq %xmm15,%xmm15,%xmm8 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vpxor %xmm15,%xmm8,%xmm8 vmovdqu 16(%rdx),%xmm14 vpxor %xmm3,%xmm0,%xmm0 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 vpxor %xmm4,%xmm1,%xmm1 vpshufb %xmm13,%xmm14,%xmm14 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vmovdqu 144-64(%rsi),%xmm6 vpxor %xmm5,%xmm2,%xmm2 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 176-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vmovdqu (%rdx),%xmm15 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpxor %xmm1,%xmm4,%xmm4 vpshufb %xmm13,%xmm15,%xmm15 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 160-64(%rsi),%xmm6 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 leaq 128(%rdx),%rdx cmpq $0x80,%rcx jb .Ltail_avx vpxor %xmm10,%xmm15,%xmm15 subq $0x80,%rcx jmp .Loop8x_avx .align 32 .Loop8x_avx: vpunpckhqdq %xmm15,%xmm15,%xmm8 vmovdqu 112(%rdx),%xmm14 vpxor %xmm0,%xmm3,%xmm3 vpxor %xmm15,%xmm8,%xmm8 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10 vpshufb 
%xmm13,%xmm14,%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11 vmovdqu 0-64(%rsi),%xmm6 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12 vmovdqu 32-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vmovdqu 96(%rdx),%xmm15 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpxor %xmm3,%xmm10,%xmm10 vpshufb %xmm13,%xmm15,%xmm15 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vxorps %xmm4,%xmm11,%xmm11 vmovdqu 16-64(%rsi),%xmm6 vpunpckhqdq %xmm15,%xmm15,%xmm8 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vpxor %xmm5,%xmm12,%xmm12 vxorps %xmm15,%xmm8,%xmm8 vmovdqu 80(%rdx),%xmm14 vpxor %xmm10,%xmm12,%xmm12 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 vpxor %xmm11,%xmm12,%xmm12 vpslldq $8,%xmm12,%xmm9 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vpsrldq $8,%xmm12,%xmm12 vpxor %xmm9,%xmm10,%xmm10 vmovdqu 48-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm14 vxorps %xmm12,%xmm11,%xmm11 vpxor %xmm1,%xmm4,%xmm4 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 80-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vpxor %xmm2,%xmm5,%xmm5 vmovdqu 64(%rdx),%xmm15 vpalignr $8,%xmm10,%xmm10,%xmm12 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpshufb %xmm13,%xmm15,%xmm15 vpxor %xmm3,%xmm0,%xmm0 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 64-64(%rsi),%xmm6 vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm4,%xmm1,%xmm1 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vxorps %xmm15,%xmm8,%xmm8 vpxor %xmm5,%xmm2,%xmm2 vmovdqu 48(%rdx),%xmm14 vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 vpshufb %xmm13,%xmm14,%xmm14 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vmovdqu 96-64(%rsi),%xmm6 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 128-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vpxor %xmm2,%xmm5,%xmm5 vmovdqu 32(%rdx),%xmm15 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpshufb %xmm13,%xmm15,%xmm15 vpxor %xmm3,%xmm0,%xmm0 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 112-64(%rsi),%xmm6 vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm4,%xmm1,%xmm1 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vpxor %xmm15,%xmm8,%xmm8 vpxor %xmm5,%xmm2,%xmm2 vxorps %xmm12,%xmm10,%xmm10 vmovdqu 16(%rdx),%xmm14 vpalignr $8,%xmm10,%xmm10,%xmm12 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 vpshufb %xmm13,%xmm14,%xmm14 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vmovdqu 144-64(%rsi),%xmm6 vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 vxorps %xmm11,%xmm12,%xmm12 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 176-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vpxor %xmm2,%xmm5,%xmm5 vmovdqu (%rdx),%xmm15 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpshufb %xmm13,%xmm15,%xmm15 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 160-64(%rsi),%xmm6 vpxor %xmm12,%xmm15,%xmm15 vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 vpxor %xmm10,%xmm15,%xmm15 leaq 128(%rdx),%rdx subq $0x80,%rcx jnc .Loop8x_avx addq $0x80,%rcx jmp .Ltail_no_xor_avx .align 32 .Lshort_avx: vmovdqu -16(%rdx,%rcx,1),%xmm14 leaq (%rdx,%rcx,1),%rdx vmovdqu 0-64(%rsi),%xmm6 vmovdqu 32-64(%rsi),%xmm7 vpshufb %xmm13,%xmm14,%xmm15 vmovdqa %xmm0,%xmm3 vmovdqa %xmm1,%xmm4 vmovdqa %xmm2,%xmm5 subq $0x10,%rcx jz .Ltail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -32(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 16-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vpsrldq $8,%xmm7,%xmm7 subq $0x10,%rcx jz .Ltail_avx vpunpckhqdq 
%xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -48(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 48-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vmovdqu 80-64(%rsi),%xmm7 subq $0x10,%rcx jz .Ltail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -64(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 64-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vpsrldq $8,%xmm7,%xmm7 subq $0x10,%rcx jz .Ltail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -80(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 96-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vmovdqu 128-64(%rsi),%xmm7 subq $0x10,%rcx jz .Ltail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -96(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 112-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vpsrldq $8,%xmm7,%xmm7 subq $0x10,%rcx jz .Ltail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -112(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 144-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vmovq 184-64(%rsi),%xmm7 subq $0x10,%rcx jmp .Ltail_avx .align 32 .Ltail_avx: vpxor %xmm10,%xmm15,%xmm15 .Ltail_no_xor_avx: vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vmovdqu (%r10),%xmm12 vpxor %xmm0,%xmm3,%xmm10 vpxor %xmm1,%xmm4,%xmm11 vpxor %xmm2,%xmm5,%xmm5 vpxor %xmm10,%xmm5,%xmm5 vpxor %xmm11,%xmm5,%xmm5 vpslldq $8,%xmm5,%xmm9 vpsrldq $8,%xmm5,%xmm5 vpxor %xmm9,%xmm10,%xmm10 vpxor %xmm5,%xmm11,%xmm11 vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 vpalignr $8,%xmm10,%xmm10,%xmm10 vpxor %xmm9,%xmm10,%xmm10 vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 vpalignr $8,%xmm10,%xmm10,%xmm10 vpxor %xmm11,%xmm10,%xmm10 vpxor %xmm9,%xmm10,%xmm10 cmpq $0,%rcx jne .Lshort_avx vpshufb %xmm13,%xmm10,%xmm10 vmovdqu %xmm10,(%rdi) vzeroupper ret .cfi_endproc .size gcm_ghash_avx,.-gcm_ghash_avx .section .rodata .align 64 .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 .L0x1c2_polynomial: .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 .L7_mask: .long 7,0,7,0 .align 64 .byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 .text #endif ring-0.17.14/pregenerated/ghash-x86_64-macosx.S000064400000000000000000000522251046102023000170520ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
#include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) .text .globl _gcm_init_clmul .private_extern _gcm_init_clmul .p2align 4 _gcm_init_clmul: _CET_ENDBR L$_init_clmul: movdqu (%rsi),%xmm2 pshufd $78,%xmm2,%xmm2 pshufd $255,%xmm2,%xmm4 movdqa %xmm2,%xmm3 psllq $1,%xmm2 pxor %xmm5,%xmm5 psrlq $63,%xmm3 pcmpgtd %xmm4,%xmm5 pslldq $8,%xmm3 por %xmm3,%xmm2 pand L$0x1c2_polynomial(%rip),%xmm5 pxor %xmm5,%xmm2 pshufd $78,%xmm2,%xmm6 movdqa %xmm2,%xmm0 pxor %xmm2,%xmm6 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,222,0 pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 movdqa %xmm3,%xmm4 psrldq $8,%xmm3 pslldq $8,%xmm4 pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 movdqa %xmm0,%xmm3 pslldq $8,%xmm0 psrldq $8,%xmm3 pxor %xmm4,%xmm0 pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 pshufd $78,%xmm2,%xmm3 pshufd $78,%xmm0,%xmm4 pxor %xmm2,%xmm3 movdqu %xmm2,0(%rdi) pxor %xmm0,%xmm4 movdqu %xmm0,16(%rdi) .byte 102,15,58,15,227,8 movdqu %xmm4,32(%rdi) movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,222,0 pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 movdqa %xmm3,%xmm4 psrldq $8,%xmm3 pslldq $8,%xmm4 pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 movdqa %xmm0,%xmm3 pslldq $8,%xmm0 psrldq $8,%xmm3 pxor %xmm4,%xmm0 pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 movdqa %xmm0,%xmm5 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,222,0 pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 movdqa %xmm3,%xmm4 psrldq $8,%xmm3 pslldq $8,%xmm4 pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 movdqa %xmm0,%xmm3 pslldq $8,%xmm0 psrldq $8,%xmm3 pxor %xmm4,%xmm0 pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 pshufd $78,%xmm5,%xmm3 pshufd $78,%xmm0,%xmm4 pxor %xmm5,%xmm3 movdqu %xmm5,48(%rdi) pxor %xmm0,%xmm4 movdqu %xmm0,64(%rdi) .byte 102,15,58,15,227,8 movdqu %xmm4,80(%rdi) ret .globl _gcm_ghash_clmul .private_extern _gcm_ghash_clmul .p2align 5 _gcm_ghash_clmul: _CET_ENDBR L$_ghash_clmul: movdqa L$bswap_mask(%rip),%xmm10 movdqu (%rdi),%xmm0 movdqu (%rsi),%xmm2 movdqu 32(%rsi),%xmm7 .byte 102,65,15,56,0,194 subq $0x10,%rcx jz L$odd_tail movdqu 16(%rsi),%xmm6 cmpq $0x30,%rcx jb L$skip4x subq $0x30,%rcx movq $0xA040608020C0E000,%rax movdqu 48(%rsi),%xmm14 movdqu 64(%rsi),%xmm15 movdqu 48(%rdx),%xmm3 movdqu 32(%rdx),%xmm11 .byte 102,65,15,56,0,218 .byte 102,69,15,56,0,218 movdqa %xmm3,%xmm5 pshufd $78,%xmm3,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,68,218,0 .byte 102,15,58,68,234,17 .byte 102,15,58,68,231,0 movdqa %xmm11,%xmm13 pshufd $78,%xmm11,%xmm12 pxor %xmm11,%xmm12 .byte 102,68,15,58,68,222,0 .byte 102,68,15,58,68,238,17 .byte 102,68,15,58,68,231,16 xorps %xmm11,%xmm3 xorps %xmm13,%xmm5 movups 80(%rsi),%xmm7 xorps %xmm12,%xmm4 movdqu 16(%rdx),%xmm11 movdqu 0(%rdx),%xmm8 .byte 
102,69,15,56,0,218 .byte 102,69,15,56,0,194 movdqa %xmm11,%xmm13 pshufd $78,%xmm11,%xmm12 pxor %xmm8,%xmm0 pxor %xmm11,%xmm12 .byte 102,69,15,58,68,222,0 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm8 pxor %xmm0,%xmm8 .byte 102,69,15,58,68,238,17 .byte 102,68,15,58,68,231,0 xorps %xmm11,%xmm3 xorps %xmm13,%xmm5 leaq 64(%rdx),%rdx subq $0x40,%rcx jc L$tail4x jmp L$mod4_loop .p2align 5 L$mod4_loop: .byte 102,65,15,58,68,199,0 xorps %xmm12,%xmm4 movdqu 48(%rdx),%xmm11 .byte 102,69,15,56,0,218 .byte 102,65,15,58,68,207,17 xorps %xmm3,%xmm0 movdqu 32(%rdx),%xmm3 movdqa %xmm11,%xmm13 .byte 102,68,15,58,68,199,16 pshufd $78,%xmm11,%xmm12 xorps %xmm5,%xmm1 pxor %xmm11,%xmm12 .byte 102,65,15,56,0,218 movups 32(%rsi),%xmm7 xorps %xmm4,%xmm8 .byte 102,68,15,58,68,218,0 pshufd $78,%xmm3,%xmm4 pxor %xmm0,%xmm8 movdqa %xmm3,%xmm5 pxor %xmm1,%xmm8 pxor %xmm3,%xmm4 movdqa %xmm8,%xmm9 .byte 102,68,15,58,68,234,17 pslldq $8,%xmm8 psrldq $8,%xmm9 pxor %xmm8,%xmm0 movdqa L$7_mask(%rip),%xmm8 pxor %xmm9,%xmm1 .byte 102,76,15,110,200 pand %xmm0,%xmm8 .byte 102,69,15,56,0,200 pxor %xmm0,%xmm9 .byte 102,68,15,58,68,231,0 psllq $57,%xmm9 movdqa %xmm9,%xmm8 pslldq $8,%xmm9 .byte 102,15,58,68,222,0 psrldq $8,%xmm8 pxor %xmm9,%xmm0 pxor %xmm8,%xmm1 movdqu 0(%rdx),%xmm8 movdqa %xmm0,%xmm9 psrlq $1,%xmm0 .byte 102,15,58,68,238,17 xorps %xmm11,%xmm3 movdqu 16(%rdx),%xmm11 .byte 102,69,15,56,0,218 .byte 102,15,58,68,231,16 xorps %xmm13,%xmm5 movups 80(%rsi),%xmm7 .byte 102,69,15,56,0,194 pxor %xmm9,%xmm1 pxor %xmm0,%xmm9 psrlq $5,%xmm0 movdqa %xmm11,%xmm13 pxor %xmm12,%xmm4 pshufd $78,%xmm11,%xmm12 pxor %xmm9,%xmm0 pxor %xmm8,%xmm1 pxor %xmm11,%xmm12 .byte 102,69,15,58,68,222,0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 movdqa %xmm0,%xmm1 .byte 102,69,15,58,68,238,17 xorps %xmm11,%xmm3 pshufd $78,%xmm0,%xmm8 pxor %xmm0,%xmm8 .byte 102,68,15,58,68,231,0 xorps %xmm13,%xmm5 leaq 64(%rdx),%rdx subq $0x40,%rcx jnc L$mod4_loop L$tail4x: .byte 102,65,15,58,68,199,0 .byte 102,65,15,58,68,207,17 .byte 102,68,15,58,68,199,16 xorps %xmm12,%xmm4 xorps %xmm3,%xmm0 xorps %xmm5,%xmm1 pxor %xmm0,%xmm1 pxor %xmm4,%xmm8 pxor %xmm1,%xmm8 pxor %xmm0,%xmm1 movdqa %xmm8,%xmm9 psrldq $8,%xmm8 pslldq $8,%xmm9 pxor %xmm8,%xmm1 pxor %xmm9,%xmm0 movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 movdqa %xmm0,%xmm3 pslldq $8,%xmm0 psrldq $8,%xmm3 pxor %xmm4,%xmm0 pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 addq $0x40,%rcx jz L$done movdqu 32(%rsi),%xmm7 subq $0x10,%rcx jz L$odd_tail L$skip4x: movdqu (%rdx),%xmm8 movdqu 16(%rdx),%xmm3 .byte 102,69,15,56,0,194 .byte 102,65,15,56,0,218 pxor %xmm8,%xmm0 movdqa %xmm3,%xmm5 pshufd $78,%xmm3,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,68,218,0 .byte 102,15,58,68,234,17 .byte 102,15,58,68,231,0 leaq 32(%rdx),%rdx nop subq $0x20,%rcx jbe L$even_tail nop jmp L$mod_loop .p2align 5 L$mod_loop: movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm8 pshufd $78,%xmm0,%xmm4 pxor %xmm0,%xmm4 .byte 102,15,58,68,198,0 .byte 102,15,58,68,206,17 .byte 102,15,58,68,231,16 pxor %xmm3,%xmm0 pxor %xmm5,%xmm1 movdqu (%rdx),%xmm9 pxor %xmm0,%xmm8 .byte 102,69,15,56,0,202 movdqu 16(%rdx),%xmm3 pxor %xmm1,%xmm8 pxor %xmm9,%xmm1 pxor %xmm8,%xmm4 .byte 102,65,15,56,0,218 movdqa %xmm4,%xmm8 psrldq $8,%xmm8 pslldq $8,%xmm4 pxor %xmm8,%xmm1 pxor %xmm4,%xmm0 movdqa %xmm3,%xmm5 movdqa %xmm0,%xmm9 movdqa %xmm0,%xmm8 psllq $5,%xmm0 pxor %xmm0,%xmm8 .byte 102,15,58,68,218,0 psllq $1,%xmm0 pxor %xmm8,%xmm0 
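// First, left-shifting phase of the reduction modulo x^128 + x^7 + x^2 + x + 1:
// the psllq $5/$1/$57 chain with its XORs accumulates the operand times
// x^57 + x^62 + x^63 (per 64-bit lane), interleaved with the .byte-encoded
// PCLMULQDQs for the next two input blocks of L$mod_loop.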
psllq $57,%xmm0 movdqa %xmm0,%xmm8 pslldq $8,%xmm0 psrldq $8,%xmm8 pxor %xmm9,%xmm0 pshufd $78,%xmm5,%xmm4 pxor %xmm8,%xmm1 pxor %xmm5,%xmm4 movdqa %xmm0,%xmm9 psrlq $1,%xmm0 .byte 102,15,58,68,234,17 pxor %xmm9,%xmm1 pxor %xmm0,%xmm9 psrlq $5,%xmm0 pxor %xmm9,%xmm0 leaq 32(%rdx),%rdx psrlq $1,%xmm0 .byte 102,15,58,68,231,0 pxor %xmm1,%xmm0 subq $0x20,%rcx ja L$mod_loop L$even_tail: movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm8 pshufd $78,%xmm0,%xmm4 pxor %xmm0,%xmm4 .byte 102,15,58,68,198,0 .byte 102,15,58,68,206,17 .byte 102,15,58,68,231,16 pxor %xmm3,%xmm0 pxor %xmm5,%xmm1 pxor %xmm0,%xmm8 pxor %xmm1,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm8 psrldq $8,%xmm8 pslldq $8,%xmm4 pxor %xmm8,%xmm1 pxor %xmm4,%xmm0 movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 movdqa %xmm0,%xmm3 pslldq $8,%xmm0 psrldq $8,%xmm3 pxor %xmm4,%xmm0 pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 testq %rcx,%rcx jnz L$done L$odd_tail: movdqu (%rdx),%xmm8 .byte 102,69,15,56,0,194 pxor %xmm8,%xmm0 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,223,0 pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 movdqa %xmm3,%xmm4 psrldq $8,%xmm3 pslldq $8,%xmm4 pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 movdqa %xmm0,%xmm3 pslldq $8,%xmm0 psrldq $8,%xmm3 pxor %xmm4,%xmm0 pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 L$done: .byte 102,65,15,56,0,194 movdqu %xmm0,(%rdi) ret .globl _gcm_init_avx .private_extern _gcm_init_avx .p2align 5 _gcm_init_avx: _CET_ENDBR vzeroupper vmovdqu (%rsi),%xmm2 vpshufd $78,%xmm2,%xmm2 vpshufd $255,%xmm2,%xmm4 vpsrlq $63,%xmm2,%xmm3 vpsllq $1,%xmm2,%xmm2 vpxor %xmm5,%xmm5,%xmm5 vpcmpgtd %xmm4,%xmm5,%xmm5 vpslldq $8,%xmm3,%xmm3 vpor %xmm3,%xmm2,%xmm2 vpand L$0x1c2_polynomial(%rip),%xmm5,%xmm5 vpxor %xmm5,%xmm2,%xmm2 vpunpckhqdq %xmm2,%xmm2,%xmm6 vmovdqa %xmm2,%xmm0 vpxor %xmm2,%xmm6,%xmm6 movq $4,%r10 jmp L$init_start_avx .p2align 5 L$init_loop_avx: vpalignr $8,%xmm3,%xmm4,%xmm5 vmovdqu %xmm5,-16(%rdi) vpunpckhqdq %xmm0,%xmm0,%xmm3 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 vpxor %xmm0,%xmm1,%xmm4 vpxor %xmm4,%xmm3,%xmm3 vpslldq $8,%xmm3,%xmm4 vpsrldq $8,%xmm3,%xmm3 vpxor %xmm4,%xmm0,%xmm0 vpxor %xmm3,%xmm1,%xmm1 vpsllq $57,%xmm0,%xmm3 vpsllq $62,%xmm0,%xmm4 vpxor %xmm3,%xmm4,%xmm4 vpsllq $63,%xmm0,%xmm3 vpxor %xmm3,%xmm4,%xmm4 vpslldq $8,%xmm4,%xmm3 vpsrldq $8,%xmm4,%xmm4 vpxor %xmm3,%xmm0,%xmm0 vpxor %xmm4,%xmm1,%xmm1 vpsrlq $1,%xmm0,%xmm4 vpxor %xmm0,%xmm1,%xmm1 vpxor %xmm4,%xmm0,%xmm0 vpsrlq $5,%xmm4,%xmm4 vpxor %xmm4,%xmm0,%xmm0 vpsrlq $1,%xmm0,%xmm0 vpxor %xmm1,%xmm0,%xmm0 L$init_start_avx: vmovdqa %xmm0,%xmm5 vpunpckhqdq %xmm0,%xmm0,%xmm3 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 vpxor %xmm0,%xmm1,%xmm4 vpxor %xmm4,%xmm3,%xmm3 vpslldq $8,%xmm3,%xmm4 vpsrldq $8,%xmm3,%xmm3 vpxor %xmm4,%xmm0,%xmm0 vpxor %xmm3,%xmm1,%xmm1 vpsllq $57,%xmm0,%xmm3 vpsllq $62,%xmm0,%xmm4 vpxor %xmm3,%xmm4,%xmm4 vpsllq $63,%xmm0,%xmm3 vpxor %xmm3,%xmm4,%xmm4 vpslldq $8,%xmm4,%xmm3 vpsrldq $8,%xmm4,%xmm4 vpxor 
%xmm3,%xmm0,%xmm0 vpxor %xmm4,%xmm1,%xmm1 vpsrlq $1,%xmm0,%xmm4 vpxor %xmm0,%xmm1,%xmm1 vpxor %xmm4,%xmm0,%xmm0 vpsrlq $5,%xmm4,%xmm4 vpxor %xmm4,%xmm0,%xmm0 vpsrlq $1,%xmm0,%xmm0 vpxor %xmm1,%xmm0,%xmm0 vpshufd $78,%xmm5,%xmm3 vpshufd $78,%xmm0,%xmm4 vpxor %xmm5,%xmm3,%xmm3 vmovdqu %xmm5,0(%rdi) vpxor %xmm0,%xmm4,%xmm4 vmovdqu %xmm0,16(%rdi) leaq 48(%rdi),%rdi subq $1,%r10 jnz L$init_loop_avx vpalignr $8,%xmm4,%xmm3,%xmm5 vmovdqu %xmm5,-16(%rdi) vzeroupper ret .globl _gcm_ghash_avx .private_extern _gcm_ghash_avx .p2align 5 _gcm_ghash_avx: _CET_ENDBR vzeroupper vmovdqu (%rdi),%xmm10 leaq L$0x1c2_polynomial(%rip),%r10 leaq 64(%rsi),%rsi vmovdqu L$bswap_mask(%rip),%xmm13 vpshufb %xmm13,%xmm10,%xmm10 cmpq $0x80,%rcx jb L$short_avx subq $0x80,%rcx vmovdqu 112(%rdx),%xmm14 vmovdqu 0-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm14 vmovdqu 32-64(%rsi),%xmm7 vpunpckhqdq %xmm14,%xmm14,%xmm9 vmovdqu 96(%rdx),%xmm15 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpxor %xmm14,%xmm9,%xmm9 vpshufb %xmm13,%xmm15,%xmm15 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 16-64(%rsi),%xmm6 vpunpckhqdq %xmm15,%xmm15,%xmm8 vmovdqu 80(%rdx),%xmm14 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vpxor %xmm15,%xmm8,%xmm8 vpshufb %xmm13,%xmm14,%xmm14 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vmovdqu 48-64(%rsi),%xmm6 vpxor %xmm14,%xmm9,%xmm9 vmovdqu 64(%rdx),%xmm15 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 80-64(%rsi),%xmm7 vpshufb %xmm13,%xmm15,%xmm15 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpxor %xmm1,%xmm4,%xmm4 vpunpckhqdq %xmm15,%xmm15,%xmm8 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 64-64(%rsi),%xmm6 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vpxor %xmm15,%xmm8,%xmm8 vmovdqu 48(%rdx),%xmm14 vpxor %xmm3,%xmm0,%xmm0 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 vpxor %xmm4,%xmm1,%xmm1 vpshufb %xmm13,%xmm14,%xmm14 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vmovdqu 96-64(%rsi),%xmm6 vpxor %xmm5,%xmm2,%xmm2 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 128-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vmovdqu 32(%rdx),%xmm15 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpxor %xmm1,%xmm4,%xmm4 vpshufb %xmm13,%xmm15,%xmm15 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 112-64(%rsi),%xmm6 vpxor %xmm2,%xmm5,%xmm5 vpunpckhqdq %xmm15,%xmm15,%xmm8 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vpxor %xmm15,%xmm8,%xmm8 vmovdqu 16(%rdx),%xmm14 vpxor %xmm3,%xmm0,%xmm0 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 vpxor %xmm4,%xmm1,%xmm1 vpshufb %xmm13,%xmm14,%xmm14 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vmovdqu 144-64(%rsi),%xmm6 vpxor %xmm5,%xmm2,%xmm2 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 176-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vmovdqu (%rdx),%xmm15 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpxor %xmm1,%xmm4,%xmm4 vpshufb %xmm13,%xmm15,%xmm15 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 160-64(%rsi),%xmm6 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 leaq 128(%rdx),%rdx cmpq $0x80,%rcx jb L$tail_avx vpxor %xmm10,%xmm15,%xmm15 subq $0x80,%rcx jmp L$oop8x_avx .p2align 5 L$oop8x_avx: vpunpckhqdq %xmm15,%xmm15,%xmm8 vmovdqu 112(%rdx),%xmm14 vpxor %xmm0,%xmm3,%xmm3 vpxor %xmm15,%xmm8,%xmm8 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10 vpshufb %xmm13,%xmm14,%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11 vmovdqu 0-64(%rsi),%xmm6 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12 vmovdqu 32-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vmovdqu 
96(%rdx),%xmm15 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpxor %xmm3,%xmm10,%xmm10 vpshufb %xmm13,%xmm15,%xmm15 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vxorps %xmm4,%xmm11,%xmm11 vmovdqu 16-64(%rsi),%xmm6 vpunpckhqdq %xmm15,%xmm15,%xmm8 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vpxor %xmm5,%xmm12,%xmm12 vxorps %xmm15,%xmm8,%xmm8 vmovdqu 80(%rdx),%xmm14 vpxor %xmm10,%xmm12,%xmm12 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 vpxor %xmm11,%xmm12,%xmm12 vpslldq $8,%xmm12,%xmm9 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vpsrldq $8,%xmm12,%xmm12 vpxor %xmm9,%xmm10,%xmm10 vmovdqu 48-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm14 vxorps %xmm12,%xmm11,%xmm11 vpxor %xmm1,%xmm4,%xmm4 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 80-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vpxor %xmm2,%xmm5,%xmm5 vmovdqu 64(%rdx),%xmm15 vpalignr $8,%xmm10,%xmm10,%xmm12 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpshufb %xmm13,%xmm15,%xmm15 vpxor %xmm3,%xmm0,%xmm0 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 64-64(%rsi),%xmm6 vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm4,%xmm1,%xmm1 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vxorps %xmm15,%xmm8,%xmm8 vpxor %xmm5,%xmm2,%xmm2 vmovdqu 48(%rdx),%xmm14 vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 vpshufb %xmm13,%xmm14,%xmm14 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vmovdqu 96-64(%rsi),%xmm6 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 128-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vpxor %xmm2,%xmm5,%xmm5 vmovdqu 32(%rdx),%xmm15 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpshufb %xmm13,%xmm15,%xmm15 vpxor %xmm3,%xmm0,%xmm0 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 112-64(%rsi),%xmm6 vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm4,%xmm1,%xmm1 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vpxor %xmm15,%xmm8,%xmm8 vpxor %xmm5,%xmm2,%xmm2 vxorps %xmm12,%xmm10,%xmm10 vmovdqu 16(%rdx),%xmm14 vpalignr $8,%xmm10,%xmm10,%xmm12 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 vpshufb %xmm13,%xmm14,%xmm14 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vmovdqu 144-64(%rsi),%xmm6 vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 vxorps %xmm11,%xmm12,%xmm12 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 176-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vpxor %xmm2,%xmm5,%xmm5 vmovdqu (%rdx),%xmm15 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpshufb %xmm13,%xmm15,%xmm15 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 160-64(%rsi),%xmm6 vpxor %xmm12,%xmm15,%xmm15 vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 vpxor %xmm10,%xmm15,%xmm15 leaq 128(%rdx),%rdx subq $0x80,%rcx jnc L$oop8x_avx addq $0x80,%rcx jmp L$tail_no_xor_avx .p2align 5 L$short_avx: vmovdqu -16(%rdx,%rcx,1),%xmm14 leaq (%rdx,%rcx,1),%rdx vmovdqu 0-64(%rsi),%xmm6 vmovdqu 32-64(%rsi),%xmm7 vpshufb %xmm13,%xmm14,%xmm15 vmovdqa %xmm0,%xmm3 vmovdqa %xmm1,%xmm4 vmovdqa %xmm2,%xmm5 subq $0x10,%rcx jz L$tail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -32(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 16-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vpsrldq $8,%xmm7,%xmm7 subq $0x10,%rcx jz L$tail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -48(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 48-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor 
%xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vmovdqu 80-64(%rsi),%xmm7 subq $0x10,%rcx jz L$tail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -64(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 64-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vpsrldq $8,%xmm7,%xmm7 subq $0x10,%rcx jz L$tail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -80(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 96-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vmovdqu 128-64(%rsi),%xmm7 subq $0x10,%rcx jz L$tail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -96(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 112-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vpsrldq $8,%xmm7,%xmm7 subq $0x10,%rcx jz L$tail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -112(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 144-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vmovq 184-64(%rsi),%xmm7 subq $0x10,%rcx jmp L$tail_avx .p2align 5 L$tail_avx: vpxor %xmm10,%xmm15,%xmm15 L$tail_no_xor_avx: vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vmovdqu (%r10),%xmm12 vpxor %xmm0,%xmm3,%xmm10 vpxor %xmm1,%xmm4,%xmm11 vpxor %xmm2,%xmm5,%xmm5 vpxor %xmm10,%xmm5,%xmm5 vpxor %xmm11,%xmm5,%xmm5 vpslldq $8,%xmm5,%xmm9 vpsrldq $8,%xmm5,%xmm5 vpxor %xmm9,%xmm10,%xmm10 vpxor %xmm5,%xmm11,%xmm11 vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 vpalignr $8,%xmm10,%xmm10,%xmm10 vpxor %xmm9,%xmm10,%xmm10 vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 vpalignr $8,%xmm10,%xmm10,%xmm10 vpxor %xmm11,%xmm10,%xmm10 vpxor %xmm9,%xmm10,%xmm10 cmpq $0,%rcx jne L$short_avx vpshufb %xmm13,%xmm10,%xmm10 vmovdqu %xmm10,(%rdi) vzeroupper ret .section __DATA,__const .p2align 6 L$bswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 L$0x1c2_polynomial: .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 L$7_mask: .long 7,0,7,0 .p2align 6 .byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .p2align 6 .text #endif ring-0.17.14/pregenerated/ghash-x86_64-nasm.asm000064400000000000000000000637021046102023000170760ustar 00000000000000; This file is generated from a similarly-named Perl script in the BoringSSL ; source tree. Do not edit by hand. 
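; Win64/NASM build of the same PCLMULQDQ/AVX GHASH routines as above. The
; Windows x64 calling convention passes the first four arguments in
; rcx, rdx, r8 and r9 (instead of rdi, rsi, rdx, rcx in the SysV builds),
; and each function carries SEH prologue/unwind annotations because
; xmm6-xmm15 are callee-saved in this ABI.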
%ifidn __OUTPUT_FORMAT__, win64 default rel %define XMMWORD %define YMMWORD %define ZMMWORD %define _CET_ENDBR %include "ring_core_generated/prefix_symbols_nasm.inc" section .text code align=64 global gcm_init_clmul ALIGN 16 gcm_init_clmul: $L$SEH_begin_gcm_init_clmul_1: _CET_ENDBR $L$_init_clmul: sub rsp,0x18 $L$SEH_prologue_gcm_init_clmul_2: movaps XMMWORD[rsp],xmm6 $L$SEH_prologue_gcm_init_clmul_3: $L$SEH_endprologue_gcm_init_clmul_4: movdqu xmm2,XMMWORD[rdx] pshufd xmm2,xmm2,78 pshufd xmm4,xmm2,255 movdqa xmm3,xmm2 psllq xmm2,1 pxor xmm5,xmm5 psrlq xmm3,63 pcmpgtd xmm5,xmm4 pslldq xmm3,8 por xmm2,xmm3 pand xmm5,XMMWORD[$L$0x1c2_polynomial] pxor xmm2,xmm5 pshufd xmm6,xmm2,78 movdqa xmm0,xmm2 pxor xmm6,xmm2 movdqa xmm1,xmm0 pshufd xmm3,xmm0,78 pxor xmm3,xmm0 DB 102,15,58,68,194,0 DB 102,15,58,68,202,17 DB 102,15,58,68,222,0 pxor xmm3,xmm0 pxor xmm3,xmm1 movdqa xmm4,xmm3 psrldq xmm3,8 pslldq xmm4,8 pxor xmm1,xmm3 pxor xmm0,xmm4 movdqa xmm4,xmm0 movdqa xmm3,xmm0 psllq xmm0,5 pxor xmm3,xmm0 psllq xmm0,1 pxor xmm0,xmm3 psllq xmm0,57 movdqa xmm3,xmm0 pslldq xmm0,8 psrldq xmm3,8 pxor xmm0,xmm4 pxor xmm1,xmm3 movdqa xmm4,xmm0 psrlq xmm0,1 pxor xmm1,xmm4 pxor xmm4,xmm0 psrlq xmm0,5 pxor xmm0,xmm4 psrlq xmm0,1 pxor xmm0,xmm1 pshufd xmm3,xmm2,78 pshufd xmm4,xmm0,78 pxor xmm3,xmm2 movdqu XMMWORD[rcx],xmm2 pxor xmm4,xmm0 movdqu XMMWORD[16+rcx],xmm0 DB 102,15,58,15,227,8 movdqu XMMWORD[32+rcx],xmm4 movdqa xmm1,xmm0 pshufd xmm3,xmm0,78 pxor xmm3,xmm0 DB 102,15,58,68,194,0 DB 102,15,58,68,202,17 DB 102,15,58,68,222,0 pxor xmm3,xmm0 pxor xmm3,xmm1 movdqa xmm4,xmm3 psrldq xmm3,8 pslldq xmm4,8 pxor xmm1,xmm3 pxor xmm0,xmm4 movdqa xmm4,xmm0 movdqa xmm3,xmm0 psllq xmm0,5 pxor xmm3,xmm0 psllq xmm0,1 pxor xmm0,xmm3 psllq xmm0,57 movdqa xmm3,xmm0 pslldq xmm0,8 psrldq xmm3,8 pxor xmm0,xmm4 pxor xmm1,xmm3 movdqa xmm4,xmm0 psrlq xmm0,1 pxor xmm1,xmm4 pxor xmm4,xmm0 psrlq xmm0,5 pxor xmm0,xmm4 psrlq xmm0,1 pxor xmm0,xmm1 movdqa xmm5,xmm0 movdqa xmm1,xmm0 pshufd xmm3,xmm0,78 pxor xmm3,xmm0 DB 102,15,58,68,194,0 DB 102,15,58,68,202,17 DB 102,15,58,68,222,0 pxor xmm3,xmm0 pxor xmm3,xmm1 movdqa xmm4,xmm3 psrldq xmm3,8 pslldq xmm4,8 pxor xmm1,xmm3 pxor xmm0,xmm4 movdqa xmm4,xmm0 movdqa xmm3,xmm0 psllq xmm0,5 pxor xmm3,xmm0 psllq xmm0,1 pxor xmm0,xmm3 psllq xmm0,57 movdqa xmm3,xmm0 pslldq xmm0,8 psrldq xmm3,8 pxor xmm0,xmm4 pxor xmm1,xmm3 movdqa xmm4,xmm0 psrlq xmm0,1 pxor xmm1,xmm4 pxor xmm4,xmm0 psrlq xmm0,5 pxor xmm0,xmm4 psrlq xmm0,1 pxor xmm0,xmm1 pshufd xmm3,xmm5,78 pshufd xmm4,xmm0,78 pxor xmm3,xmm5 movdqu XMMWORD[48+rcx],xmm5 pxor xmm4,xmm0 movdqu XMMWORD[64+rcx],xmm0 DB 102,15,58,15,227,8 movdqu XMMWORD[80+rcx],xmm4 movaps xmm6,XMMWORD[rsp] lea rsp,[24+rsp] ret $L$SEH_end_gcm_init_clmul_5: global gcm_ghash_clmul ALIGN 32 gcm_ghash_clmul: $L$SEH_begin_gcm_ghash_clmul_1: _CET_ENDBR $L$_ghash_clmul: lea rax,[((-136))+rsp] lea rsp,[((-32))+rax] $L$SEH_prologue_gcm_ghash_clmul_2: movaps XMMWORD[(-32)+rax],xmm6 $L$SEH_prologue_gcm_ghash_clmul_3: movaps XMMWORD[(-16)+rax],xmm7 $L$SEH_prologue_gcm_ghash_clmul_4: movaps XMMWORD[rax],xmm8 $L$SEH_prologue_gcm_ghash_clmul_5: movaps XMMWORD[16+rax],xmm9 $L$SEH_prologue_gcm_ghash_clmul_6: movaps XMMWORD[32+rax],xmm10 $L$SEH_prologue_gcm_ghash_clmul_7: movaps XMMWORD[48+rax],xmm11 $L$SEH_prologue_gcm_ghash_clmul_8: movaps XMMWORD[64+rax],xmm12 $L$SEH_prologue_gcm_ghash_clmul_9: movaps XMMWORD[80+rax],xmm13 $L$SEH_prologue_gcm_ghash_clmul_10: movaps XMMWORD[96+rax],xmm14 $L$SEH_prologue_gcm_ghash_clmul_11: movaps XMMWORD[112+rax],xmm15 $L$SEH_prologue_gcm_ghash_clmul_12: 
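; The movaps stores above spill the nonvolatile registers xmm6-xmm15 into the
; stack space just reserved via rax; each $L$SEH_prologue_* label ties one save
; to the unwind records emitted in the .xdata section at the end of the file.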
$L$SEH_endprologue_gcm_ghash_clmul_13: movdqa xmm10,XMMWORD[$L$bswap_mask] movdqu xmm0,XMMWORD[rcx] movdqu xmm2,XMMWORD[rdx] movdqu xmm7,XMMWORD[32+rdx] DB 102,65,15,56,0,194 sub r9,0x10 jz NEAR $L$odd_tail movdqu xmm6,XMMWORD[16+rdx] cmp r9,0x30 jb NEAR $L$skip4x sub r9,0x30 mov rax,0xA040608020C0E000 movdqu xmm14,XMMWORD[48+rdx] movdqu xmm15,XMMWORD[64+rdx] movdqu xmm3,XMMWORD[48+r8] movdqu xmm11,XMMWORD[32+r8] DB 102,65,15,56,0,218 DB 102,69,15,56,0,218 movdqa xmm5,xmm3 pshufd xmm4,xmm3,78 pxor xmm4,xmm3 DB 102,15,58,68,218,0 DB 102,15,58,68,234,17 DB 102,15,58,68,231,0 movdqa xmm13,xmm11 pshufd xmm12,xmm11,78 pxor xmm12,xmm11 DB 102,68,15,58,68,222,0 DB 102,68,15,58,68,238,17 DB 102,68,15,58,68,231,16 xorps xmm3,xmm11 xorps xmm5,xmm13 movups xmm7,XMMWORD[80+rdx] xorps xmm4,xmm12 movdqu xmm11,XMMWORD[16+r8] movdqu xmm8,XMMWORD[r8] DB 102,69,15,56,0,218 DB 102,69,15,56,0,194 movdqa xmm13,xmm11 pshufd xmm12,xmm11,78 pxor xmm0,xmm8 pxor xmm12,xmm11 DB 102,69,15,58,68,222,0 movdqa xmm1,xmm0 pshufd xmm8,xmm0,78 pxor xmm8,xmm0 DB 102,69,15,58,68,238,17 DB 102,68,15,58,68,231,0 xorps xmm3,xmm11 xorps xmm5,xmm13 lea r8,[64+r8] sub r9,0x40 jc NEAR $L$tail4x jmp NEAR $L$mod4_loop ALIGN 32 $L$mod4_loop: DB 102,65,15,58,68,199,0 xorps xmm4,xmm12 movdqu xmm11,XMMWORD[48+r8] DB 102,69,15,56,0,218 DB 102,65,15,58,68,207,17 xorps xmm0,xmm3 movdqu xmm3,XMMWORD[32+r8] movdqa xmm13,xmm11 DB 102,68,15,58,68,199,16 pshufd xmm12,xmm11,78 xorps xmm1,xmm5 pxor xmm12,xmm11 DB 102,65,15,56,0,218 movups xmm7,XMMWORD[32+rdx] xorps xmm8,xmm4 DB 102,68,15,58,68,218,0 pshufd xmm4,xmm3,78 pxor xmm8,xmm0 movdqa xmm5,xmm3 pxor xmm8,xmm1 pxor xmm4,xmm3 movdqa xmm9,xmm8 DB 102,68,15,58,68,234,17 pslldq xmm8,8 psrldq xmm9,8 pxor xmm0,xmm8 movdqa xmm8,XMMWORD[$L$7_mask] pxor xmm1,xmm9 DB 102,76,15,110,200 pand xmm8,xmm0 DB 102,69,15,56,0,200 pxor xmm9,xmm0 DB 102,68,15,58,68,231,0 psllq xmm9,57 movdqa xmm8,xmm9 pslldq xmm9,8 DB 102,15,58,68,222,0 psrldq xmm8,8 pxor xmm0,xmm9 pxor xmm1,xmm8 movdqu xmm8,XMMWORD[r8] movdqa xmm9,xmm0 psrlq xmm0,1 DB 102,15,58,68,238,17 xorps xmm3,xmm11 movdqu xmm11,XMMWORD[16+r8] DB 102,69,15,56,0,218 DB 102,15,58,68,231,16 xorps xmm5,xmm13 movups xmm7,XMMWORD[80+rdx] DB 102,69,15,56,0,194 pxor xmm1,xmm9 pxor xmm9,xmm0 psrlq xmm0,5 movdqa xmm13,xmm11 pxor xmm4,xmm12 pshufd xmm12,xmm11,78 pxor xmm0,xmm9 pxor xmm1,xmm8 pxor xmm12,xmm11 DB 102,69,15,58,68,222,0 psrlq xmm0,1 pxor xmm0,xmm1 movdqa xmm1,xmm0 DB 102,69,15,58,68,238,17 xorps xmm3,xmm11 pshufd xmm8,xmm0,78 pxor xmm8,xmm0 DB 102,68,15,58,68,231,0 xorps xmm5,xmm13 lea r8,[64+r8] sub r9,0x40 jnc NEAR $L$mod4_loop $L$tail4x: DB 102,65,15,58,68,199,0 DB 102,65,15,58,68,207,17 DB 102,68,15,58,68,199,16 xorps xmm4,xmm12 xorps xmm0,xmm3 xorps xmm1,xmm5 pxor xmm1,xmm0 pxor xmm8,xmm4 pxor xmm8,xmm1 pxor xmm1,xmm0 movdqa xmm9,xmm8 psrldq xmm8,8 pslldq xmm9,8 pxor xmm1,xmm8 pxor xmm0,xmm9 movdqa xmm4,xmm0 movdqa xmm3,xmm0 psllq xmm0,5 pxor xmm3,xmm0 psllq xmm0,1 pxor xmm0,xmm3 psllq xmm0,57 movdqa xmm3,xmm0 pslldq xmm0,8 psrldq xmm3,8 pxor xmm0,xmm4 pxor xmm1,xmm3 movdqa xmm4,xmm0 psrlq xmm0,1 pxor xmm1,xmm4 pxor xmm4,xmm0 psrlq xmm0,5 pxor xmm0,xmm4 psrlq xmm0,1 pxor xmm0,xmm1 add r9,0x40 jz NEAR $L$done movdqu xmm7,XMMWORD[32+rdx] sub r9,0x10 jz NEAR $L$odd_tail $L$skip4x: movdqu xmm8,XMMWORD[r8] movdqu xmm3,XMMWORD[16+r8] DB 102,69,15,56,0,194 DB 102,65,15,56,0,218 pxor xmm0,xmm8 movdqa xmm5,xmm3 pshufd xmm4,xmm3,78 pxor xmm4,xmm3 DB 102,15,58,68,218,0 DB 102,15,58,68,234,17 DB 102,15,58,68,231,0 lea r8,[32+r8] nop sub r9,0x20 jbe NEAR 
$L$even_tail nop jmp NEAR $L$mod_loop ALIGN 32 $L$mod_loop: movdqa xmm1,xmm0 movdqa xmm8,xmm4 pshufd xmm4,xmm0,78 pxor xmm4,xmm0 DB 102,15,58,68,198,0 DB 102,15,58,68,206,17 DB 102,15,58,68,231,16 pxor xmm0,xmm3 pxor xmm1,xmm5 movdqu xmm9,XMMWORD[r8] pxor xmm8,xmm0 DB 102,69,15,56,0,202 movdqu xmm3,XMMWORD[16+r8] pxor xmm8,xmm1 pxor xmm1,xmm9 pxor xmm4,xmm8 DB 102,65,15,56,0,218 movdqa xmm8,xmm4 psrldq xmm8,8 pslldq xmm4,8 pxor xmm1,xmm8 pxor xmm0,xmm4 movdqa xmm5,xmm3 movdqa xmm9,xmm0 movdqa xmm8,xmm0 psllq xmm0,5 pxor xmm8,xmm0 DB 102,15,58,68,218,0 psllq xmm0,1 pxor xmm0,xmm8 psllq xmm0,57 movdqa xmm8,xmm0 pslldq xmm0,8 psrldq xmm8,8 pxor xmm0,xmm9 pshufd xmm4,xmm5,78 pxor xmm1,xmm8 pxor xmm4,xmm5 movdqa xmm9,xmm0 psrlq xmm0,1 DB 102,15,58,68,234,17 pxor xmm1,xmm9 pxor xmm9,xmm0 psrlq xmm0,5 pxor xmm0,xmm9 lea r8,[32+r8] psrlq xmm0,1 DB 102,15,58,68,231,0 pxor xmm0,xmm1 sub r9,0x20 ja NEAR $L$mod_loop $L$even_tail: movdqa xmm1,xmm0 movdqa xmm8,xmm4 pshufd xmm4,xmm0,78 pxor xmm4,xmm0 DB 102,15,58,68,198,0 DB 102,15,58,68,206,17 DB 102,15,58,68,231,16 pxor xmm0,xmm3 pxor xmm1,xmm5 pxor xmm8,xmm0 pxor xmm8,xmm1 pxor xmm4,xmm8 movdqa xmm8,xmm4 psrldq xmm8,8 pslldq xmm4,8 pxor xmm1,xmm8 pxor xmm0,xmm4 movdqa xmm4,xmm0 movdqa xmm3,xmm0 psllq xmm0,5 pxor xmm3,xmm0 psllq xmm0,1 pxor xmm0,xmm3 psllq xmm0,57 movdqa xmm3,xmm0 pslldq xmm0,8 psrldq xmm3,8 pxor xmm0,xmm4 pxor xmm1,xmm3 movdqa xmm4,xmm0 psrlq xmm0,1 pxor xmm1,xmm4 pxor xmm4,xmm0 psrlq xmm0,5 pxor xmm0,xmm4 psrlq xmm0,1 pxor xmm0,xmm1 test r9,r9 jnz NEAR $L$done $L$odd_tail: movdqu xmm8,XMMWORD[r8] DB 102,69,15,56,0,194 pxor xmm0,xmm8 movdqa xmm1,xmm0 pshufd xmm3,xmm0,78 pxor xmm3,xmm0 DB 102,15,58,68,194,0 DB 102,15,58,68,202,17 DB 102,15,58,68,223,0 pxor xmm3,xmm0 pxor xmm3,xmm1 movdqa xmm4,xmm3 psrldq xmm3,8 pslldq xmm4,8 pxor xmm1,xmm3 pxor xmm0,xmm4 movdqa xmm4,xmm0 movdqa xmm3,xmm0 psllq xmm0,5 pxor xmm3,xmm0 psllq xmm0,1 pxor xmm0,xmm3 psllq xmm0,57 movdqa xmm3,xmm0 pslldq xmm0,8 psrldq xmm3,8 pxor xmm0,xmm4 pxor xmm1,xmm3 movdqa xmm4,xmm0 psrlq xmm0,1 pxor xmm1,xmm4 pxor xmm4,xmm0 psrlq xmm0,5 pxor xmm0,xmm4 psrlq xmm0,1 pxor xmm0,xmm1 $L$done: DB 102,65,15,56,0,194 movdqu XMMWORD[rcx],xmm0 movaps xmm6,XMMWORD[rsp] movaps xmm7,XMMWORD[16+rsp] movaps xmm8,XMMWORD[32+rsp] movaps xmm9,XMMWORD[48+rsp] movaps xmm10,XMMWORD[64+rsp] movaps xmm11,XMMWORD[80+rsp] movaps xmm12,XMMWORD[96+rsp] movaps xmm13,XMMWORD[112+rsp] movaps xmm14,XMMWORD[128+rsp] movaps xmm15,XMMWORD[144+rsp] lea rsp,[168+rsp] ret $L$SEH_end_gcm_ghash_clmul_14: global gcm_init_avx ALIGN 32 gcm_init_avx: $L$SEH_begin_gcm_init_avx_1: _CET_ENDBR sub rsp,0x18 $L$SEH_prologue_gcm_init_avx_2: movaps XMMWORD[rsp],xmm6 $L$SEH_prologue_gcm_init_avx_3: $L$SEH_endprologue_gcm_init_avx_4: vzeroupper vmovdqu xmm2,XMMWORD[rdx] vpshufd xmm2,xmm2,78 vpshufd xmm4,xmm2,255 vpsrlq xmm3,xmm2,63 vpsllq xmm2,xmm2,1 vpxor xmm5,xmm5,xmm5 vpcmpgtd xmm5,xmm5,xmm4 vpslldq xmm3,xmm3,8 vpor xmm2,xmm2,xmm3 vpand xmm5,xmm5,XMMWORD[$L$0x1c2_polynomial] vpxor xmm2,xmm2,xmm5 vpunpckhqdq xmm6,xmm2,xmm2 vmovdqa xmm0,xmm2 vpxor xmm6,xmm6,xmm2 mov r10,4 jmp NEAR $L$init_start_avx ALIGN 32 $L$init_loop_avx: vpalignr xmm5,xmm4,xmm3,8 vmovdqu XMMWORD[(-16)+rcx],xmm5 vpunpckhqdq xmm3,xmm0,xmm0 vpxor xmm3,xmm3,xmm0 vpclmulqdq xmm1,xmm0,xmm2,0x11 vpclmulqdq xmm0,xmm0,xmm2,0x00 vpclmulqdq xmm3,xmm3,xmm6,0x00 vpxor xmm4,xmm1,xmm0 vpxor xmm3,xmm3,xmm4 vpslldq xmm4,xmm3,8 vpsrldq xmm3,xmm3,8 vpxor xmm0,xmm0,xmm4 vpxor xmm1,xmm1,xmm3 vpsllq xmm3,xmm0,57 vpsllq xmm4,xmm0,62 vpxor xmm4,xmm4,xmm3 vpsllq xmm3,xmm0,63 
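; First, left-shifting half of the reduction modulo x^128 + x^7 + x^2 + x + 1:
; the vpsllq shifts by 57, 62 and 63 accumulate xmm0 times x^57 + x^62 + x^63,
; which the vpslldq/vpsrldq split just below folds back into xmm0/xmm1.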
vpxor xmm4,xmm4,xmm3 vpslldq xmm3,xmm4,8 vpsrldq xmm4,xmm4,8 vpxor xmm0,xmm0,xmm3 vpxor xmm1,xmm1,xmm4 vpsrlq xmm4,xmm0,1 vpxor xmm1,xmm1,xmm0 vpxor xmm0,xmm0,xmm4 vpsrlq xmm4,xmm4,5 vpxor xmm0,xmm0,xmm4 vpsrlq xmm0,xmm0,1 vpxor xmm0,xmm0,xmm1 $L$init_start_avx: vmovdqa xmm5,xmm0 vpunpckhqdq xmm3,xmm0,xmm0 vpxor xmm3,xmm3,xmm0 vpclmulqdq xmm1,xmm0,xmm2,0x11 vpclmulqdq xmm0,xmm0,xmm2,0x00 vpclmulqdq xmm3,xmm3,xmm6,0x00 vpxor xmm4,xmm1,xmm0 vpxor xmm3,xmm3,xmm4 vpslldq xmm4,xmm3,8 vpsrldq xmm3,xmm3,8 vpxor xmm0,xmm0,xmm4 vpxor xmm1,xmm1,xmm3 vpsllq xmm3,xmm0,57 vpsllq xmm4,xmm0,62 vpxor xmm4,xmm4,xmm3 vpsllq xmm3,xmm0,63 vpxor xmm4,xmm4,xmm3 vpslldq xmm3,xmm4,8 vpsrldq xmm4,xmm4,8 vpxor xmm0,xmm0,xmm3 vpxor xmm1,xmm1,xmm4 vpsrlq xmm4,xmm0,1 vpxor xmm1,xmm1,xmm0 vpxor xmm0,xmm0,xmm4 vpsrlq xmm4,xmm4,5 vpxor xmm0,xmm0,xmm4 vpsrlq xmm0,xmm0,1 vpxor xmm0,xmm0,xmm1 vpshufd xmm3,xmm5,78 vpshufd xmm4,xmm0,78 vpxor xmm3,xmm3,xmm5 vmovdqu XMMWORD[rcx],xmm5 vpxor xmm4,xmm4,xmm0 vmovdqu XMMWORD[16+rcx],xmm0 lea rcx,[48+rcx] sub r10,1 jnz NEAR $L$init_loop_avx vpalignr xmm5,xmm3,xmm4,8 vmovdqu XMMWORD[(-16)+rcx],xmm5 vzeroupper movaps xmm6,XMMWORD[rsp] lea rsp,[24+rsp] ret $L$SEH_end_gcm_init_avx_5: global gcm_ghash_avx ALIGN 32 gcm_ghash_avx: $L$SEH_begin_gcm_ghash_avx_1: _CET_ENDBR lea rax,[((-136))+rsp] lea rsp,[((-32))+rax] $L$SEH_prologue_gcm_ghash_avx_2: movaps XMMWORD[(-32)+rax],xmm6 $L$SEH_prologue_gcm_ghash_avx_3: movaps XMMWORD[(-16)+rax],xmm7 $L$SEH_prologue_gcm_ghash_avx_4: movaps XMMWORD[rax],xmm8 $L$SEH_prologue_gcm_ghash_avx_5: movaps XMMWORD[16+rax],xmm9 $L$SEH_prologue_gcm_ghash_avx_6: movaps XMMWORD[32+rax],xmm10 $L$SEH_prologue_gcm_ghash_avx_7: movaps XMMWORD[48+rax],xmm11 $L$SEH_prologue_gcm_ghash_avx_8: movaps XMMWORD[64+rax],xmm12 $L$SEH_prologue_gcm_ghash_avx_9: movaps XMMWORD[80+rax],xmm13 $L$SEH_prologue_gcm_ghash_avx_10: movaps XMMWORD[96+rax],xmm14 $L$SEH_prologue_gcm_ghash_avx_11: movaps XMMWORD[112+rax],xmm15 $L$SEH_prologue_gcm_ghash_avx_12: $L$SEH_endprologue_gcm_ghash_avx_13: vzeroupper vmovdqu xmm10,XMMWORD[rcx] lea r10,[$L$0x1c2_polynomial] lea rdx,[64+rdx] vmovdqu xmm13,XMMWORD[$L$bswap_mask] vpshufb xmm10,xmm10,xmm13 cmp r9,0x80 jb NEAR $L$short_avx sub r9,0x80 vmovdqu xmm14,XMMWORD[112+r8] vmovdqu xmm6,XMMWORD[((0-64))+rdx] vpshufb xmm14,xmm14,xmm13 vmovdqu xmm7,XMMWORD[((32-64))+rdx] vpunpckhqdq xmm9,xmm14,xmm14 vmovdqu xmm15,XMMWORD[96+r8] vpclmulqdq xmm0,xmm14,xmm6,0x00 vpxor xmm9,xmm9,xmm14 vpshufb xmm15,xmm15,xmm13 vpclmulqdq xmm1,xmm14,xmm6,0x11 vmovdqu xmm6,XMMWORD[((16-64))+rdx] vpunpckhqdq xmm8,xmm15,xmm15 vmovdqu xmm14,XMMWORD[80+r8] vpclmulqdq xmm2,xmm9,xmm7,0x00 vpxor xmm8,xmm8,xmm15 vpshufb xmm14,xmm14,xmm13 vpclmulqdq xmm3,xmm15,xmm6,0x00 vpunpckhqdq xmm9,xmm14,xmm14 vpclmulqdq xmm4,xmm15,xmm6,0x11 vmovdqu xmm6,XMMWORD[((48-64))+rdx] vpxor xmm9,xmm9,xmm14 vmovdqu xmm15,XMMWORD[64+r8] vpclmulqdq xmm5,xmm8,xmm7,0x10 vmovdqu xmm7,XMMWORD[((80-64))+rdx] vpshufb xmm15,xmm15,xmm13 vpxor xmm3,xmm3,xmm0 vpclmulqdq xmm0,xmm14,xmm6,0x00 vpxor xmm4,xmm4,xmm1 vpunpckhqdq xmm8,xmm15,xmm15 vpclmulqdq xmm1,xmm14,xmm6,0x11 vmovdqu xmm6,XMMWORD[((64-64))+rdx] vpxor xmm5,xmm5,xmm2 vpclmulqdq xmm2,xmm9,xmm7,0x00 vpxor xmm8,xmm8,xmm15 vmovdqu xmm14,XMMWORD[48+r8] vpxor xmm0,xmm0,xmm3 vpclmulqdq xmm3,xmm15,xmm6,0x00 vpxor xmm1,xmm1,xmm4 vpshufb xmm14,xmm14,xmm13 vpclmulqdq xmm4,xmm15,xmm6,0x11 vmovdqu xmm6,XMMWORD[((96-64))+rdx] vpxor xmm2,xmm2,xmm5 vpunpckhqdq xmm9,xmm14,xmm14 vpclmulqdq xmm5,xmm8,xmm7,0x10 vmovdqu xmm7,XMMWORD[((128-64))+rdx] vpxor xmm9,xmm9,xmm14 vmovdqu 
xmm15,XMMWORD[32+r8] vpxor xmm3,xmm3,xmm0 vpclmulqdq xmm0,xmm14,xmm6,0x00 vpxor xmm4,xmm4,xmm1 vpshufb xmm15,xmm15,xmm13 vpclmulqdq xmm1,xmm14,xmm6,0x11 vmovdqu xmm6,XMMWORD[((112-64))+rdx] vpxor xmm5,xmm5,xmm2 vpunpckhqdq xmm8,xmm15,xmm15 vpclmulqdq xmm2,xmm9,xmm7,0x00 vpxor xmm8,xmm8,xmm15 vmovdqu xmm14,XMMWORD[16+r8] vpxor xmm0,xmm0,xmm3 vpclmulqdq xmm3,xmm15,xmm6,0x00 vpxor xmm1,xmm1,xmm4 vpshufb xmm14,xmm14,xmm13 vpclmulqdq xmm4,xmm15,xmm6,0x11 vmovdqu xmm6,XMMWORD[((144-64))+rdx] vpxor xmm2,xmm2,xmm5 vpunpckhqdq xmm9,xmm14,xmm14 vpclmulqdq xmm5,xmm8,xmm7,0x10 vmovdqu xmm7,XMMWORD[((176-64))+rdx] vpxor xmm9,xmm9,xmm14 vmovdqu xmm15,XMMWORD[r8] vpxor xmm3,xmm3,xmm0 vpclmulqdq xmm0,xmm14,xmm6,0x00 vpxor xmm4,xmm4,xmm1 vpshufb xmm15,xmm15,xmm13 vpclmulqdq xmm1,xmm14,xmm6,0x11 vmovdqu xmm6,XMMWORD[((160-64))+rdx] vpxor xmm5,xmm5,xmm2 vpclmulqdq xmm2,xmm9,xmm7,0x10 lea r8,[128+r8] cmp r9,0x80 jb NEAR $L$tail_avx vpxor xmm15,xmm15,xmm10 sub r9,0x80 jmp NEAR $L$oop8x_avx ALIGN 32 $L$oop8x_avx: vpunpckhqdq xmm8,xmm15,xmm15 vmovdqu xmm14,XMMWORD[112+r8] vpxor xmm3,xmm3,xmm0 vpxor xmm8,xmm8,xmm15 vpclmulqdq xmm10,xmm15,xmm6,0x00 vpshufb xmm14,xmm14,xmm13 vpxor xmm4,xmm4,xmm1 vpclmulqdq xmm11,xmm15,xmm6,0x11 vmovdqu xmm6,XMMWORD[((0-64))+rdx] vpunpckhqdq xmm9,xmm14,xmm14 vpxor xmm5,xmm5,xmm2 vpclmulqdq xmm12,xmm8,xmm7,0x00 vmovdqu xmm7,XMMWORD[((32-64))+rdx] vpxor xmm9,xmm9,xmm14 vmovdqu xmm15,XMMWORD[96+r8] vpclmulqdq xmm0,xmm14,xmm6,0x00 vpxor xmm10,xmm10,xmm3 vpshufb xmm15,xmm15,xmm13 vpclmulqdq xmm1,xmm14,xmm6,0x11 vxorps xmm11,xmm11,xmm4 vmovdqu xmm6,XMMWORD[((16-64))+rdx] vpunpckhqdq xmm8,xmm15,xmm15 vpclmulqdq xmm2,xmm9,xmm7,0x00 vpxor xmm12,xmm12,xmm5 vxorps xmm8,xmm8,xmm15 vmovdqu xmm14,XMMWORD[80+r8] vpxor xmm12,xmm12,xmm10 vpclmulqdq xmm3,xmm15,xmm6,0x00 vpxor xmm12,xmm12,xmm11 vpslldq xmm9,xmm12,8 vpxor xmm3,xmm3,xmm0 vpclmulqdq xmm4,xmm15,xmm6,0x11 vpsrldq xmm12,xmm12,8 vpxor xmm10,xmm10,xmm9 vmovdqu xmm6,XMMWORD[((48-64))+rdx] vpshufb xmm14,xmm14,xmm13 vxorps xmm11,xmm11,xmm12 vpxor xmm4,xmm4,xmm1 vpunpckhqdq xmm9,xmm14,xmm14 vpclmulqdq xmm5,xmm8,xmm7,0x10 vmovdqu xmm7,XMMWORD[((80-64))+rdx] vpxor xmm9,xmm9,xmm14 vpxor xmm5,xmm5,xmm2 vmovdqu xmm15,XMMWORD[64+r8] vpalignr xmm12,xmm10,xmm10,8 vpclmulqdq xmm0,xmm14,xmm6,0x00 vpshufb xmm15,xmm15,xmm13 vpxor xmm0,xmm0,xmm3 vpclmulqdq xmm1,xmm14,xmm6,0x11 vmovdqu xmm6,XMMWORD[((64-64))+rdx] vpunpckhqdq xmm8,xmm15,xmm15 vpxor xmm1,xmm1,xmm4 vpclmulqdq xmm2,xmm9,xmm7,0x00 vxorps xmm8,xmm8,xmm15 vpxor xmm2,xmm2,xmm5 vmovdqu xmm14,XMMWORD[48+r8] vpclmulqdq xmm10,xmm10,XMMWORD[r10],0x10 vpclmulqdq xmm3,xmm15,xmm6,0x00 vpshufb xmm14,xmm14,xmm13 vpxor xmm3,xmm3,xmm0 vpclmulqdq xmm4,xmm15,xmm6,0x11 vmovdqu xmm6,XMMWORD[((96-64))+rdx] vpunpckhqdq xmm9,xmm14,xmm14 vpxor xmm4,xmm4,xmm1 vpclmulqdq xmm5,xmm8,xmm7,0x10 vmovdqu xmm7,XMMWORD[((128-64))+rdx] vpxor xmm9,xmm9,xmm14 vpxor xmm5,xmm5,xmm2 vmovdqu xmm15,XMMWORD[32+r8] vpclmulqdq xmm0,xmm14,xmm6,0x00 vpshufb xmm15,xmm15,xmm13 vpxor xmm0,xmm0,xmm3 vpclmulqdq xmm1,xmm14,xmm6,0x11 vmovdqu xmm6,XMMWORD[((112-64))+rdx] vpunpckhqdq xmm8,xmm15,xmm15 vpxor xmm1,xmm1,xmm4 vpclmulqdq xmm2,xmm9,xmm7,0x00 vpxor xmm8,xmm8,xmm15 vpxor xmm2,xmm2,xmm5 vxorps xmm10,xmm10,xmm12 vmovdqu xmm14,XMMWORD[16+r8] vpalignr xmm12,xmm10,xmm10,8 vpclmulqdq xmm3,xmm15,xmm6,0x00 vpshufb xmm14,xmm14,xmm13 vpxor xmm3,xmm3,xmm0 vpclmulqdq xmm4,xmm15,xmm6,0x11 vmovdqu xmm6,XMMWORD[((144-64))+rdx] vpclmulqdq xmm10,xmm10,XMMWORD[r10],0x10 vxorps xmm12,xmm12,xmm11 vpunpckhqdq xmm9,xmm14,xmm14 vpxor xmm4,xmm4,xmm1 vpclmulqdq 
xmm5,xmm8,xmm7,0x10 vmovdqu xmm7,XMMWORD[((176-64))+rdx] vpxor xmm9,xmm9,xmm14 vpxor xmm5,xmm5,xmm2 vmovdqu xmm15,XMMWORD[r8] vpclmulqdq xmm0,xmm14,xmm6,0x00 vpshufb xmm15,xmm15,xmm13 vpclmulqdq xmm1,xmm14,xmm6,0x11 vmovdqu xmm6,XMMWORD[((160-64))+rdx] vpxor xmm15,xmm15,xmm12 vpclmulqdq xmm2,xmm9,xmm7,0x10 vpxor xmm15,xmm15,xmm10 lea r8,[128+r8] sub r9,0x80 jnc NEAR $L$oop8x_avx add r9,0x80 jmp NEAR $L$tail_no_xor_avx ALIGN 32 $L$short_avx: vmovdqu xmm14,XMMWORD[((-16))+r9*1+r8] lea r8,[r9*1+r8] vmovdqu xmm6,XMMWORD[((0-64))+rdx] vmovdqu xmm7,XMMWORD[((32-64))+rdx] vpshufb xmm15,xmm14,xmm13 vmovdqa xmm3,xmm0 vmovdqa xmm4,xmm1 vmovdqa xmm5,xmm2 sub r9,0x10 jz NEAR $L$tail_avx vpunpckhqdq xmm8,xmm15,xmm15 vpxor xmm3,xmm3,xmm0 vpclmulqdq xmm0,xmm15,xmm6,0x00 vpxor xmm8,xmm8,xmm15 vmovdqu xmm14,XMMWORD[((-32))+r8] vpxor xmm4,xmm4,xmm1 vpclmulqdq xmm1,xmm15,xmm6,0x11 vmovdqu xmm6,XMMWORD[((16-64))+rdx] vpshufb xmm15,xmm14,xmm13 vpxor xmm5,xmm5,xmm2 vpclmulqdq xmm2,xmm8,xmm7,0x00 vpsrldq xmm7,xmm7,8 sub r9,0x10 jz NEAR $L$tail_avx vpunpckhqdq xmm8,xmm15,xmm15 vpxor xmm3,xmm3,xmm0 vpclmulqdq xmm0,xmm15,xmm6,0x00 vpxor xmm8,xmm8,xmm15 vmovdqu xmm14,XMMWORD[((-48))+r8] vpxor xmm4,xmm4,xmm1 vpclmulqdq xmm1,xmm15,xmm6,0x11 vmovdqu xmm6,XMMWORD[((48-64))+rdx] vpshufb xmm15,xmm14,xmm13 vpxor xmm5,xmm5,xmm2 vpclmulqdq xmm2,xmm8,xmm7,0x00 vmovdqu xmm7,XMMWORD[((80-64))+rdx] sub r9,0x10 jz NEAR $L$tail_avx vpunpckhqdq xmm8,xmm15,xmm15 vpxor xmm3,xmm3,xmm0 vpclmulqdq xmm0,xmm15,xmm6,0x00 vpxor xmm8,xmm8,xmm15 vmovdqu xmm14,XMMWORD[((-64))+r8] vpxor xmm4,xmm4,xmm1 vpclmulqdq xmm1,xmm15,xmm6,0x11 vmovdqu xmm6,XMMWORD[((64-64))+rdx] vpshufb xmm15,xmm14,xmm13 vpxor xmm5,xmm5,xmm2 vpclmulqdq xmm2,xmm8,xmm7,0x00 vpsrldq xmm7,xmm7,8 sub r9,0x10 jz NEAR $L$tail_avx vpunpckhqdq xmm8,xmm15,xmm15 vpxor xmm3,xmm3,xmm0 vpclmulqdq xmm0,xmm15,xmm6,0x00 vpxor xmm8,xmm8,xmm15 vmovdqu xmm14,XMMWORD[((-80))+r8] vpxor xmm4,xmm4,xmm1 vpclmulqdq xmm1,xmm15,xmm6,0x11 vmovdqu xmm6,XMMWORD[((96-64))+rdx] vpshufb xmm15,xmm14,xmm13 vpxor xmm5,xmm5,xmm2 vpclmulqdq xmm2,xmm8,xmm7,0x00 vmovdqu xmm7,XMMWORD[((128-64))+rdx] sub r9,0x10 jz NEAR $L$tail_avx vpunpckhqdq xmm8,xmm15,xmm15 vpxor xmm3,xmm3,xmm0 vpclmulqdq xmm0,xmm15,xmm6,0x00 vpxor xmm8,xmm8,xmm15 vmovdqu xmm14,XMMWORD[((-96))+r8] vpxor xmm4,xmm4,xmm1 vpclmulqdq xmm1,xmm15,xmm6,0x11 vmovdqu xmm6,XMMWORD[((112-64))+rdx] vpshufb xmm15,xmm14,xmm13 vpxor xmm5,xmm5,xmm2 vpclmulqdq xmm2,xmm8,xmm7,0x00 vpsrldq xmm7,xmm7,8 sub r9,0x10 jz NEAR $L$tail_avx vpunpckhqdq xmm8,xmm15,xmm15 vpxor xmm3,xmm3,xmm0 vpclmulqdq xmm0,xmm15,xmm6,0x00 vpxor xmm8,xmm8,xmm15 vmovdqu xmm14,XMMWORD[((-112))+r8] vpxor xmm4,xmm4,xmm1 vpclmulqdq xmm1,xmm15,xmm6,0x11 vmovdqu xmm6,XMMWORD[((144-64))+rdx] vpshufb xmm15,xmm14,xmm13 vpxor xmm5,xmm5,xmm2 vpclmulqdq xmm2,xmm8,xmm7,0x00 vmovq xmm7,QWORD[((184-64))+rdx] sub r9,0x10 jmp NEAR $L$tail_avx ALIGN 32 $L$tail_avx: vpxor xmm15,xmm15,xmm10 $L$tail_no_xor_avx: vpunpckhqdq xmm8,xmm15,xmm15 vpxor xmm3,xmm3,xmm0 vpclmulqdq xmm0,xmm15,xmm6,0x00 vpxor xmm8,xmm8,xmm15 vpxor xmm4,xmm4,xmm1 vpclmulqdq xmm1,xmm15,xmm6,0x11 vpxor xmm5,xmm5,xmm2 vpclmulqdq xmm2,xmm8,xmm7,0x00 vmovdqu xmm12,XMMWORD[r10] vpxor xmm10,xmm3,xmm0 vpxor xmm11,xmm4,xmm1 vpxor xmm5,xmm5,xmm2 vpxor xmm5,xmm5,xmm10 vpxor xmm5,xmm5,xmm11 vpslldq xmm9,xmm5,8 vpsrldq xmm5,xmm5,8 vpxor xmm10,xmm10,xmm9 vpxor xmm11,xmm11,xmm5 vpclmulqdq xmm9,xmm10,xmm12,0x10 vpalignr xmm10,xmm10,xmm10,8 vpxor xmm10,xmm10,xmm9 vpclmulqdq xmm9,xmm10,xmm12,0x10 vpalignr xmm10,xmm10,xmm10,8 vpxor xmm10,xmm10,xmm11 vpxor 
xmm10,xmm10,xmm9 cmp r9,0 jne NEAR $L$short_avx vpshufb xmm10,xmm10,xmm13 vmovdqu XMMWORD[rcx],xmm10 vzeroupper movaps xmm6,XMMWORD[rsp] movaps xmm7,XMMWORD[16+rsp] movaps xmm8,XMMWORD[32+rsp] movaps xmm9,XMMWORD[48+rsp] movaps xmm10,XMMWORD[64+rsp] movaps xmm11,XMMWORD[80+rsp] movaps xmm12,XMMWORD[96+rsp] movaps xmm13,XMMWORD[112+rsp] movaps xmm14,XMMWORD[128+rsp] movaps xmm15,XMMWORD[144+rsp] lea rsp,[168+rsp] ret $L$SEH_end_gcm_ghash_avx_14: section .rdata rdata align=8 ALIGN 64 $L$bswap_mask: DB 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 $L$0x1c2_polynomial: DB 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 $L$7_mask: DD 7,0,7,0 ALIGN 64 DB 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52 DB 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 DB 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 DB 114,103,62,0 ALIGN 64 section .text section .pdata rdata align=4 ALIGN 4 DD $L$SEH_begin_gcm_init_clmul_1 wrt ..imagebase DD $L$SEH_end_gcm_init_clmul_5 wrt ..imagebase DD $L$SEH_info_gcm_init_clmul_0 wrt ..imagebase DD $L$SEH_begin_gcm_ghash_clmul_1 wrt ..imagebase DD $L$SEH_end_gcm_ghash_clmul_14 wrt ..imagebase DD $L$SEH_info_gcm_ghash_clmul_0 wrt ..imagebase DD $L$SEH_begin_gcm_init_avx_1 wrt ..imagebase DD $L$SEH_end_gcm_init_avx_5 wrt ..imagebase DD $L$SEH_info_gcm_init_avx_0 wrt ..imagebase DD $L$SEH_begin_gcm_ghash_avx_1 wrt ..imagebase DD $L$SEH_end_gcm_ghash_avx_14 wrt ..imagebase DD $L$SEH_info_gcm_ghash_avx_0 wrt ..imagebase section .xdata rdata align=8 ALIGN 4 $L$SEH_info_gcm_init_clmul_0: DB 1 DB $L$SEH_endprologue_gcm_init_clmul_4-$L$SEH_begin_gcm_init_clmul_1 DB 3 DB 0 DB $L$SEH_prologue_gcm_init_clmul_3-$L$SEH_begin_gcm_init_clmul_1 DB 104 DW 0 DB $L$SEH_prologue_gcm_init_clmul_2-$L$SEH_begin_gcm_init_clmul_1 DB 34 DW 0 $L$SEH_info_gcm_ghash_clmul_0: DB 1 DB $L$SEH_endprologue_gcm_ghash_clmul_13-$L$SEH_begin_gcm_ghash_clmul_1 DB 22 DB 0 DB $L$SEH_prologue_gcm_ghash_clmul_12-$L$SEH_begin_gcm_ghash_clmul_1 DB 248 DW 9 DB $L$SEH_prologue_gcm_ghash_clmul_11-$L$SEH_begin_gcm_ghash_clmul_1 DB 232 DW 8 DB $L$SEH_prologue_gcm_ghash_clmul_10-$L$SEH_begin_gcm_ghash_clmul_1 DB 216 DW 7 DB $L$SEH_prologue_gcm_ghash_clmul_9-$L$SEH_begin_gcm_ghash_clmul_1 DB 200 DW 6 DB $L$SEH_prologue_gcm_ghash_clmul_8-$L$SEH_begin_gcm_ghash_clmul_1 DB 184 DW 5 DB $L$SEH_prologue_gcm_ghash_clmul_7-$L$SEH_begin_gcm_ghash_clmul_1 DB 168 DW 4 DB $L$SEH_prologue_gcm_ghash_clmul_6-$L$SEH_begin_gcm_ghash_clmul_1 DB 152 DW 3 DB $L$SEH_prologue_gcm_ghash_clmul_5-$L$SEH_begin_gcm_ghash_clmul_1 DB 136 DW 2 DB $L$SEH_prologue_gcm_ghash_clmul_4-$L$SEH_begin_gcm_ghash_clmul_1 DB 120 DW 1 DB $L$SEH_prologue_gcm_ghash_clmul_3-$L$SEH_begin_gcm_ghash_clmul_1 DB 104 DW 0 DB $L$SEH_prologue_gcm_ghash_clmul_2-$L$SEH_begin_gcm_ghash_clmul_1 DB 1 DW 21 $L$SEH_info_gcm_init_avx_0: DB 1 DB $L$SEH_endprologue_gcm_init_avx_4-$L$SEH_begin_gcm_init_avx_1 DB 3 DB 0 DB $L$SEH_prologue_gcm_init_avx_3-$L$SEH_begin_gcm_init_avx_1 DB 104 DW 0 DB $L$SEH_prologue_gcm_init_avx_2-$L$SEH_begin_gcm_init_avx_1 DB 34 DW 0 $L$SEH_info_gcm_ghash_avx_0: DB 1 DB $L$SEH_endprologue_gcm_ghash_avx_13-$L$SEH_begin_gcm_ghash_avx_1 DB 22 DB 0 DB $L$SEH_prologue_gcm_ghash_avx_12-$L$SEH_begin_gcm_ghash_avx_1 DB 248 DW 9 DB $L$SEH_prologue_gcm_ghash_avx_11-$L$SEH_begin_gcm_ghash_avx_1 DB 232 DW 8 DB $L$SEH_prologue_gcm_ghash_avx_10-$L$SEH_begin_gcm_ghash_avx_1 DB 216 DW 7 DB $L$SEH_prologue_gcm_ghash_avx_9-$L$SEH_begin_gcm_ghash_avx_1 DB 200 DW 6 DB $L$SEH_prologue_gcm_ghash_avx_8-$L$SEH_begin_gcm_ghash_avx_1 DB 184 DW 5 DB 
$L$SEH_prologue_gcm_ghash_avx_7-$L$SEH_begin_gcm_ghash_avx_1 DB 168 DW 4 DB $L$SEH_prologue_gcm_ghash_avx_6-$L$SEH_begin_gcm_ghash_avx_1 DB 152 DW 3 DB $L$SEH_prologue_gcm_ghash_avx_5-$L$SEH_begin_gcm_ghash_avx_1 DB 136 DW 2 DB $L$SEH_prologue_gcm_ghash_avx_4-$L$SEH_begin_gcm_ghash_avx_1 DB 120 DW 1 DB $L$SEH_prologue_gcm_ghash_avx_3-$L$SEH_begin_gcm_ghash_avx_1 DB 104 DW 0 DB $L$SEH_prologue_gcm_ghash_avx_2-$L$SEH_begin_gcm_ghash_avx_1 DB 1 DW 21 %else ; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 ret %endif
[ring-0.17.14/pregenerated/ghash-x86_64-nasm.o: pregenerated binary COFF object, assembled from ghash-x86_64-nasm.asm with NASM 2.13.03; raw object-file bytes not reproducible as text]
ring-0.17.14/pregenerated/ghashv8-armx-ios64.S000064400000000000000000000100521046102023000170030ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand.
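// AArch64 build of GHASH using the Crypto extension's PMULL/PMULL2 polynomial
// multiply. Each 128x128-bit product is assembled Karatsuba-style from three
// 64x64 PMULLs (lo*lo, hi*hi and (lo^hi)*(lo^hi)), matching the
// "Karatsuba pre/post-processing" comments in the code below.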
#include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) #if __ARM_MAX_ARCH__>=7 .text .globl _gcm_init_clmul .private_extern _gcm_init_clmul .align 4 _gcm_init_clmul: AARCH64_VALID_CALL_TARGET ld1 {v17.2d},[x1] //load input H movi v19.16b,#0xe1 shl v19.2d,v19.2d,#57 //0xc2.0 ext v3.16b,v17.16b,v17.16b,#8 ushr v18.2d,v19.2d,#63 dup v17.4s,v17.s[1] ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01 ushr v18.2d,v3.2d,#63 sshr v17.4s,v17.4s,#31 //broadcast carry bit and v18.16b,v18.16b,v16.16b shl v3.2d,v3.2d,#1 ext v18.16b,v18.16b,v18.16b,#8 and v16.16b,v16.16b,v17.16b orr v3.16b,v3.16b,v18.16b //H<<<=1 eor v20.16b,v3.16b,v16.16b //twisted H st1 {v20.2d},[x0],#16 //store Htable[0] //calculate H^2 ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing pmull v0.1q,v20.1d,v20.1d eor v16.16b,v16.16b,v20.16b pmull2 v2.1q,v20.2d,v20.2d pmull v1.1q,v16.1d,v16.1d ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing eor v18.16b,v0.16b,v2.16b eor v1.16b,v1.16b,v17.16b eor v1.16b,v1.16b,v18.16b pmull v18.1q,v0.1d,v19.1d //1st phase ins v2.d[0],v1.d[1] ins v1.d[1],v0.d[0] eor v0.16b,v1.16b,v18.16b ext v18.16b,v0.16b,v0.16b,#8 //2nd phase pmull v0.1q,v0.1d,v19.1d eor v18.16b,v18.16b,v2.16b eor v22.16b,v0.16b,v18.16b ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing eor v17.16b,v17.16b,v22.16b ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2] //calculate H^3 and H^4 pmull v0.1q,v20.1d, v22.1d pmull v5.1q,v22.1d,v22.1d pmull2 v2.1q,v20.2d, v22.2d pmull2 v7.1q,v22.2d,v22.2d pmull v1.1q,v16.1d,v17.1d pmull v6.1q,v17.1d,v17.1d ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing ext v17.16b,v5.16b,v7.16b,#8 eor v18.16b,v0.16b,v2.16b eor v1.16b,v1.16b,v16.16b eor v4.16b,v5.16b,v7.16b eor v6.16b,v6.16b,v17.16b eor v1.16b,v1.16b,v18.16b pmull v18.1q,v0.1d,v19.1d //1st phase eor v6.16b,v6.16b,v4.16b pmull v4.1q,v5.1d,v19.1d ins v2.d[0],v1.d[1] ins v7.d[0],v6.d[1] ins v1.d[1],v0.d[0] ins v6.d[1],v5.d[0] eor v0.16b,v1.16b,v18.16b eor v5.16b,v6.16b,v4.16b ext v18.16b,v0.16b,v0.16b,#8 //2nd phase ext v4.16b,v5.16b,v5.16b,#8 pmull v0.1q,v0.1d,v19.1d pmull v5.1q,v5.1d,v19.1d eor v18.16b,v18.16b,v2.16b eor v4.16b,v4.16b,v7.16b eor v20.16b, v0.16b,v18.16b //H^3 eor v22.16b,v5.16b,v4.16b //H^4 ext v16.16b,v20.16b, v20.16b,#8 //Karatsuba pre-processing ext v17.16b,v22.16b,v22.16b,#8 eor v16.16b,v16.16b,v20.16b eor v17.16b,v17.16b,v22.16b ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed st1 {v20.2d,v21.2d,v22.2d},[x0] //store Htable[3..5] ret .globl _gcm_gmult_clmul .private_extern _gcm_gmult_clmul .align 4 _gcm_gmult_clmul: AARCH64_VALID_CALL_TARGET ld1 {v17.2d},[x0] //load Xi movi v19.16b,#0xe1 ld1 {v20.2d,v21.2d},[x1] //load twisted H, ... 
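// v19 holds 0xe1 in every byte; the shl by 57 below turns each 64-bit lane
// into 0xc200000000000000, the reduction constant used by both PMULL-based
// reduction phases further down.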
shl v19.2d,v19.2d,#57 #ifndef __AARCH64EB__ rev64 v17.16b,v17.16b #endif ext v3.16b,v17.16b,v17.16b,#8 pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing eor v18.16b,v0.16b,v2.16b eor v1.16b,v1.16b,v17.16b eor v1.16b,v1.16b,v18.16b pmull v18.1q,v0.1d,v19.1d //1st phase of reduction ins v2.d[0],v1.d[1] ins v1.d[1],v0.d[0] eor v0.16b,v1.16b,v18.16b ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction pmull v0.1q,v0.1d,v19.1d eor v18.16b,v18.16b,v2.16b eor v0.16b,v0.16b,v18.16b #ifndef __AARCH64EB__ rev64 v0.16b,v0.16b #endif ext v0.16b,v0.16b,v0.16b,#8 st1 {v0.2d},[x0] //write out Xi ret .byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) ring-0.17.14/pregenerated/ghashv8-armx-linux64.S000064400000000000000000000102551046102023000173550ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) #if __ARM_MAX_ARCH__>=7 .text .arch armv8-a+crypto .globl gcm_init_clmul .hidden gcm_init_clmul .type gcm_init_clmul,%function .align 4 gcm_init_clmul: AARCH64_VALID_CALL_TARGET ld1 {v17.2d},[x1] //load input H movi v19.16b,#0xe1 shl v19.2d,v19.2d,#57 //0xc2.0 ext v3.16b,v17.16b,v17.16b,#8 ushr v18.2d,v19.2d,#63 dup v17.4s,v17.s[1] ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01 ushr v18.2d,v3.2d,#63 sshr v17.4s,v17.4s,#31 //broadcast carry bit and v18.16b,v18.16b,v16.16b shl v3.2d,v3.2d,#1 ext v18.16b,v18.16b,v18.16b,#8 and v16.16b,v16.16b,v17.16b orr v3.16b,v3.16b,v18.16b //H<<<=1 eor v20.16b,v3.16b,v16.16b //twisted H st1 {v20.2d},[x0],#16 //store Htable[0] //calculate H^2 ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing pmull v0.1q,v20.1d,v20.1d eor v16.16b,v16.16b,v20.16b pmull2 v2.1q,v20.2d,v20.2d pmull v1.1q,v16.1d,v16.1d ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing eor v18.16b,v0.16b,v2.16b eor v1.16b,v1.16b,v17.16b eor v1.16b,v1.16b,v18.16b pmull v18.1q,v0.1d,v19.1d //1st phase ins v2.d[0],v1.d[1] ins v1.d[1],v0.d[0] eor v0.16b,v1.16b,v18.16b ext v18.16b,v0.16b,v0.16b,#8 //2nd phase pmull v0.1q,v0.1d,v19.1d eor v18.16b,v18.16b,v2.16b eor v22.16b,v0.16b,v18.16b ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing eor v17.16b,v17.16b,v22.16b ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2] //calculate H^3 and H^4 pmull v0.1q,v20.1d, v22.1d pmull v5.1q,v22.1d,v22.1d pmull2 v2.1q,v20.2d, v22.2d pmull2 v7.1q,v22.2d,v22.2d pmull v1.1q,v16.1d,v17.1d pmull v6.1q,v17.1d,v17.1d ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing ext v17.16b,v5.16b,v7.16b,#8 eor v18.16b,v0.16b,v2.16b eor v1.16b,v1.16b,v16.16b eor v4.16b,v5.16b,v7.16b eor v6.16b,v6.16b,v17.16b eor v1.16b,v1.16b,v18.16b pmull v18.1q,v0.1d,v19.1d //1st phase eor v6.16b,v6.16b,v4.16b pmull v4.1q,v5.1d,v19.1d ins v2.d[0],v1.d[1] ins v7.d[0],v6.d[1] ins v1.d[1],v0.d[0] ins v6.d[1],v5.d[0] eor v0.16b,v1.16b,v18.16b eor v5.16b,v6.16b,v4.16b ext v18.16b,v0.16b,v0.16b,#8 //2nd phase ext v4.16b,v5.16b,v5.16b,#8 pmull v0.1q,v0.1d,v19.1d pmull v5.1q,v5.1d,v19.1d eor v18.16b,v18.16b,v2.16b 
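// Second reduction phase for the two products computed in parallel; just
// below, v20 becomes H^3 and v22 becomes H^4, stored together with the
// packed Karatsuba halves in v21 as Htable[3..5].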
eor v4.16b,v4.16b,v7.16b eor v20.16b, v0.16b,v18.16b //H^3 eor v22.16b,v5.16b,v4.16b //H^4 ext v16.16b,v20.16b, v20.16b,#8 //Karatsuba pre-processing ext v17.16b,v22.16b,v22.16b,#8 eor v16.16b,v16.16b,v20.16b eor v17.16b,v17.16b,v22.16b ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed st1 {v20.2d,v21.2d,v22.2d},[x0] //store Htable[3..5] ret .size gcm_init_clmul,.-gcm_init_clmul .globl gcm_gmult_clmul .hidden gcm_gmult_clmul .type gcm_gmult_clmul,%function .align 4 gcm_gmult_clmul: AARCH64_VALID_CALL_TARGET ld1 {v17.2d},[x0] //load Xi movi v19.16b,#0xe1 ld1 {v20.2d,v21.2d},[x1] //load twisted H, ... shl v19.2d,v19.2d,#57 #ifndef __AARCH64EB__ rev64 v17.16b,v17.16b #endif ext v3.16b,v17.16b,v17.16b,#8 pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing eor v18.16b,v0.16b,v2.16b eor v1.16b,v1.16b,v17.16b eor v1.16b,v1.16b,v18.16b pmull v18.1q,v0.1d,v19.1d //1st phase of reduction ins v2.d[0],v1.d[1] ins v1.d[1],v0.d[0] eor v0.16b,v1.16b,v18.16b ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction pmull v0.1q,v0.1d,v19.1d eor v18.16b,v18.16b,v2.16b eor v0.16b,v0.16b,v18.16b #ifndef __AARCH64EB__ rev64 v0.16b,v0.16b #endif ext v0.16b,v0.16b,v0.16b,#8 st1 {v0.2d},[x0] //write out Xi ret .size gcm_gmult_clmul,.-gcm_gmult_clmul .byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) ring-0.17.14/pregenerated/ghashv8-armx-win64.S000064400000000000000000000101021046102023000170020ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
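// Same PMULL-based GHASH routines as the __ELF__ build above, assembled for
// _WIN32; only the symbol directives differ (.def/.endef in place of
// .type/.size, and no .hidden).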
#include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) #if __ARM_MAX_ARCH__>=7 .text .arch armv8-a+crypto .globl gcm_init_clmul .def gcm_init_clmul .type 32 .endef .align 4 gcm_init_clmul: AARCH64_VALID_CALL_TARGET ld1 {v17.2d},[x1] //load input H movi v19.16b,#0xe1 shl v19.2d,v19.2d,#57 //0xc2.0 ext v3.16b,v17.16b,v17.16b,#8 ushr v18.2d,v19.2d,#63 dup v17.4s,v17.s[1] ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01 ushr v18.2d,v3.2d,#63 sshr v17.4s,v17.4s,#31 //broadcast carry bit and v18.16b,v18.16b,v16.16b shl v3.2d,v3.2d,#1 ext v18.16b,v18.16b,v18.16b,#8 and v16.16b,v16.16b,v17.16b orr v3.16b,v3.16b,v18.16b //H<<<=1 eor v20.16b,v3.16b,v16.16b //twisted H st1 {v20.2d},[x0],#16 //store Htable[0] //calculate H^2 ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing pmull v0.1q,v20.1d,v20.1d eor v16.16b,v16.16b,v20.16b pmull2 v2.1q,v20.2d,v20.2d pmull v1.1q,v16.1d,v16.1d ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing eor v18.16b,v0.16b,v2.16b eor v1.16b,v1.16b,v17.16b eor v1.16b,v1.16b,v18.16b pmull v18.1q,v0.1d,v19.1d //1st phase ins v2.d[0],v1.d[1] ins v1.d[1],v0.d[0] eor v0.16b,v1.16b,v18.16b ext v18.16b,v0.16b,v0.16b,#8 //2nd phase pmull v0.1q,v0.1d,v19.1d eor v18.16b,v18.16b,v2.16b eor v22.16b,v0.16b,v18.16b ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing eor v17.16b,v17.16b,v22.16b ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2] //calculate H^3 and H^4 pmull v0.1q,v20.1d, v22.1d pmull v5.1q,v22.1d,v22.1d pmull2 v2.1q,v20.2d, v22.2d pmull2 v7.1q,v22.2d,v22.2d pmull v1.1q,v16.1d,v17.1d pmull v6.1q,v17.1d,v17.1d ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing ext v17.16b,v5.16b,v7.16b,#8 eor v18.16b,v0.16b,v2.16b eor v1.16b,v1.16b,v16.16b eor v4.16b,v5.16b,v7.16b eor v6.16b,v6.16b,v17.16b eor v1.16b,v1.16b,v18.16b pmull v18.1q,v0.1d,v19.1d //1st phase eor v6.16b,v6.16b,v4.16b pmull v4.1q,v5.1d,v19.1d ins v2.d[0],v1.d[1] ins v7.d[0],v6.d[1] ins v1.d[1],v0.d[0] ins v6.d[1],v5.d[0] eor v0.16b,v1.16b,v18.16b eor v5.16b,v6.16b,v4.16b ext v18.16b,v0.16b,v0.16b,#8 //2nd phase ext v4.16b,v5.16b,v5.16b,#8 pmull v0.1q,v0.1d,v19.1d pmull v5.1q,v5.1d,v19.1d eor v18.16b,v18.16b,v2.16b eor v4.16b,v4.16b,v7.16b eor v20.16b, v0.16b,v18.16b //H^3 eor v22.16b,v5.16b,v4.16b //H^4 ext v16.16b,v20.16b, v20.16b,#8 //Karatsuba pre-processing ext v17.16b,v22.16b,v22.16b,#8 eor v16.16b,v16.16b,v20.16b eor v17.16b,v17.16b,v22.16b ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed st1 {v20.2d,v21.2d,v22.2d},[x0] //store Htable[3..5] ret .globl gcm_gmult_clmul .def gcm_gmult_clmul .type 32 .endef .align 4 gcm_gmult_clmul: AARCH64_VALID_CALL_TARGET ld1 {v17.2d},[x0] //load Xi movi v19.16b,#0xe1 ld1 {v20.2d,v21.2d},[x1] //load twisted H, ... 
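// One GHASH multiplication: byte-reverse Xi on little-endian targets, form
// the three PMULL/PMULL2 products of the Karatsuba scheme, recombine them,
// then run the two-phase reduction with the 0xC2...00 constant built in v19
// so the result is again a 128-bit field element before it is written back
// to Xi.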
shl v19.2d,v19.2d,#57 #ifndef __AARCH64EB__ rev64 v17.16b,v17.16b #endif ext v3.16b,v17.16b,v17.16b,#8 pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing eor v18.16b,v0.16b,v2.16b eor v1.16b,v1.16b,v17.16b eor v1.16b,v1.16b,v18.16b pmull v18.1q,v0.1d,v19.1d //1st phase of reduction ins v2.d[0],v1.d[1] ins v1.d[1],v0.d[0] eor v0.16b,v1.16b,v18.16b ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction pmull v0.1q,v0.1d,v19.1d eor v18.16b,v18.16b,v2.16b eor v0.16b,v0.16b,v18.16b #ifndef __AARCH64EB__ rev64 v0.16b,v0.16b #endif ext v0.16b,v0.16b,v0.16b,#8 st1 {v0.2d},[x0] //write out Xi ret .byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) ring-0.17.14/pregenerated/p256-armv8-asm-ios64.S000064400000000000000000001051111046102023000167740ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) .section __TEXT,__const .align 5 Lpoly: .quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 LRR: // 2^512 mod P precomputed for NIST P256 polynomial .quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd Lone_mont: .quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe Lone: .quad 1,0,0,0 Lord: .quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 LordK: .quad 0xccd1c8aaee00bc4f .byte 69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .text // void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4], // const BN_ULONG x2[4]); .globl _ecp_nistz256_mul_mont .private_extern _ecp_nistz256_mul_mont .align 4 _ecp_nistz256_mul_mont: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-32]! add x29,sp,#0 stp x19,x20,[sp,#16] ldr x3,[x2] // bp[0] ldp x4,x5,[x1] ldp x6,x7,[x1,#16] adrp x13,Lpoly@PAGE add x13,x13,Lpoly@PAGEOFF ldr x12,[x13,#8] ldr x13,[x13,#24] bl __ecp_nistz256_mul_mont ldp x19,x20,[sp,#16] ldp x29,x30,[sp],#32 AARCH64_VALIDATE_LINK_REGISTER ret // void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); .globl _ecp_nistz256_sqr_mont .private_extern _ecp_nistz256_sqr_mont .align 4 _ecp_nistz256_sqr_mont: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-32]! add x29,sp,#0 stp x19,x20,[sp,#16] ldp x4,x5,[x1] ldp x6,x7,[x1,#16] adrp x13,Lpoly@PAGE add x13,x13,Lpoly@PAGEOFF ldr x12,[x13,#8] ldr x13,[x13,#24] bl __ecp_nistz256_sqr_mont ldp x19,x20,[sp,#16] ldp x29,x30,[sp],#32 AARCH64_VALIDATE_LINK_REGISTER ret // void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]); .globl _ecp_nistz256_neg .private_extern _ecp_nistz256_neg .align 4 _ecp_nistz256_neg: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! 
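// Negation is computed as 0 - a mod p: the accumulator x14-x17 is zeroed,
// x2 is pointed at the input, the two non-trivial limbs of Lpoly are loaded
// into x12/x13, and __ecp_nistz256_sub_from performs the modular
// subtraction.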
add x29,sp,#0 mov x2,x1 mov x14,xzr // a = 0 mov x15,xzr mov x16,xzr mov x17,xzr adrp x13,Lpoly@PAGE add x13,x13,Lpoly@PAGEOFF ldr x12,[x13,#8] ldr x13,[x13,#24] bl __ecp_nistz256_sub_from ldp x29,x30,[sp],#16 AARCH64_VALIDATE_LINK_REGISTER ret // note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded // to x4-x7 and b[0] - to x3 .align 4 __ecp_nistz256_mul_mont: mul x14,x4,x3 // a[0]*b[0] umulh x8,x4,x3 mul x15,x5,x3 // a[1]*b[0] umulh x9,x5,x3 mul x16,x6,x3 // a[2]*b[0] umulh x10,x6,x3 mul x17,x7,x3 // a[3]*b[0] umulh x11,x7,x3 ldr x3,[x2,#8] // b[1] adds x15,x15,x8 // accumulate high parts of multiplication lsl x8,x14,#32 adcs x16,x16,x9 lsr x9,x14,#32 adcs x17,x17,x10 adc x19,xzr,x11 mov x20,xzr subs x10,x14,x8 // "*0xffff0001" sbc x11,x14,x9 adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] mul x8,x4,x3 // lo(a[0]*b[i]) adcs x15,x16,x9 mul x9,x5,x3 // lo(a[1]*b[i]) adcs x16,x17,x10 // +=acc[0]*0xffff0001 mul x10,x6,x3 // lo(a[2]*b[i]) adcs x17,x19,x11 mul x11,x7,x3 // lo(a[3]*b[i]) adc x19,x20,xzr adds x14,x14,x8 // accumulate low parts of multiplication umulh x8,x4,x3 // hi(a[0]*b[i]) adcs x15,x15,x9 umulh x9,x5,x3 // hi(a[1]*b[i]) adcs x16,x16,x10 umulh x10,x6,x3 // hi(a[2]*b[i]) adcs x17,x17,x11 umulh x11,x7,x3 // hi(a[3]*b[i]) adc x19,x19,xzr ldr x3,[x2,#8*(1+1)] // b[1+1] adds x15,x15,x8 // accumulate high parts of multiplication lsl x8,x14,#32 adcs x16,x16,x9 lsr x9,x14,#32 adcs x17,x17,x10 adcs x19,x19,x11 adc x20,xzr,xzr subs x10,x14,x8 // "*0xffff0001" sbc x11,x14,x9 adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] mul x8,x4,x3 // lo(a[0]*b[i]) adcs x15,x16,x9 mul x9,x5,x3 // lo(a[1]*b[i]) adcs x16,x17,x10 // +=acc[0]*0xffff0001 mul x10,x6,x3 // lo(a[2]*b[i]) adcs x17,x19,x11 mul x11,x7,x3 // lo(a[3]*b[i]) adc x19,x20,xzr adds x14,x14,x8 // accumulate low parts of multiplication umulh x8,x4,x3 // hi(a[0]*b[i]) adcs x15,x15,x9 umulh x9,x5,x3 // hi(a[1]*b[i]) adcs x16,x16,x10 umulh x10,x6,x3 // hi(a[2]*b[i]) adcs x17,x17,x11 umulh x11,x7,x3 // hi(a[3]*b[i]) adc x19,x19,xzr ldr x3,[x2,#8*(2+1)] // b[2+1] adds x15,x15,x8 // accumulate high parts of multiplication lsl x8,x14,#32 adcs x16,x16,x9 lsr x9,x14,#32 adcs x17,x17,x10 adcs x19,x19,x11 adc x20,xzr,xzr subs x10,x14,x8 // "*0xffff0001" sbc x11,x14,x9 adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] mul x8,x4,x3 // lo(a[0]*b[i]) adcs x15,x16,x9 mul x9,x5,x3 // lo(a[1]*b[i]) adcs x16,x17,x10 // +=acc[0]*0xffff0001 mul x10,x6,x3 // lo(a[2]*b[i]) adcs x17,x19,x11 mul x11,x7,x3 // lo(a[3]*b[i]) adc x19,x20,xzr adds x14,x14,x8 // accumulate low parts of multiplication umulh x8,x4,x3 // hi(a[0]*b[i]) adcs x15,x15,x9 umulh x9,x5,x3 // hi(a[1]*b[i]) adcs x16,x16,x10 umulh x10,x6,x3 // hi(a[2]*b[i]) adcs x17,x17,x11 umulh x11,x7,x3 // hi(a[3]*b[i]) adc x19,x19,xzr adds x15,x15,x8 // accumulate high parts of multiplication lsl x8,x14,#32 adcs x16,x16,x9 lsr x9,x14,#32 adcs x17,x17,x10 adcs x19,x19,x11 adc x20,xzr,xzr // last reduction subs x10,x14,x8 // "*0xffff0001" sbc x11,x14,x9 adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] adcs x15,x16,x9 adcs x16,x17,x10 // +=acc[0]*0xffff0001 adcs x17,x19,x11 adc x19,x20,xzr adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus sbcs x9,x15,x12 sbcs x10,x16,xzr sbcs x11,x17,x13 sbcs xzr,x19,xzr // did it borrow? csel x14,x14,x8,lo // ret = borrow ? 
ret : ret-modulus csel x15,x15,x9,lo csel x16,x16,x10,lo stp x14,x15,[x0] csel x17,x17,x11,lo stp x16,x17,[x0,#16] ret // note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded // to x4-x7 .align 4 __ecp_nistz256_sqr_mont: // | | | | | |a1*a0| | // | | | | |a2*a0| | | // | |a3*a2|a3*a0| | | | // | | | |a2*a1| | | | // | | |a3*a1| | | | | // *| | | | | | | | 2| // +|a3*a3|a2*a2|a1*a1|a0*a0| // |--+--+--+--+--+--+--+--| // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow // // "can't overflow" below mark carrying into high part of // multiplication result, which can't overflow, because it // can never be all ones. mul x15,x5,x4 // a[1]*a[0] umulh x9,x5,x4 mul x16,x6,x4 // a[2]*a[0] umulh x10,x6,x4 mul x17,x7,x4 // a[3]*a[0] umulh x19,x7,x4 adds x16,x16,x9 // accumulate high parts of multiplication mul x8,x6,x5 // a[2]*a[1] umulh x9,x6,x5 adcs x17,x17,x10 mul x10,x7,x5 // a[3]*a[1] umulh x11,x7,x5 adc x19,x19,xzr // can't overflow mul x20,x7,x6 // a[3]*a[2] umulh x1,x7,x6 adds x9,x9,x10 // accumulate high parts of multiplication mul x14,x4,x4 // a[0]*a[0] adc x10,x11,xzr // can't overflow adds x17,x17,x8 // accumulate low parts of multiplication umulh x4,x4,x4 adcs x19,x19,x9 mul x9,x5,x5 // a[1]*a[1] adcs x20,x20,x10 umulh x5,x5,x5 adc x1,x1,xzr // can't overflow adds x15,x15,x15 // acc[1-6]*=2 mul x10,x6,x6 // a[2]*a[2] adcs x16,x16,x16 umulh x6,x6,x6 adcs x17,x17,x17 mul x11,x7,x7 // a[3]*a[3] adcs x19,x19,x19 umulh x7,x7,x7 adcs x20,x20,x20 adcs x1,x1,x1 adc x2,xzr,xzr adds x15,x15,x4 // +a[i]*a[i] adcs x16,x16,x9 adcs x17,x17,x5 adcs x19,x19,x10 adcs x20,x20,x6 lsl x8,x14,#32 adcs x1,x1,x11 lsr x9,x14,#32 adc x2,x2,x7 subs x10,x14,x8 // "*0xffff0001" sbc x11,x14,x9 adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] adcs x15,x16,x9 lsl x8,x14,#32 adcs x16,x17,x10 // +=acc[0]*0xffff0001 lsr x9,x14,#32 adc x17,x11,xzr // can't overflow subs x10,x14,x8 // "*0xffff0001" sbc x11,x14,x9 adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] adcs x15,x16,x9 lsl x8,x14,#32 adcs x16,x17,x10 // +=acc[0]*0xffff0001 lsr x9,x14,#32 adc x17,x11,xzr // can't overflow subs x10,x14,x8 // "*0xffff0001" sbc x11,x14,x9 adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] adcs x15,x16,x9 lsl x8,x14,#32 adcs x16,x17,x10 // +=acc[0]*0xffff0001 lsr x9,x14,#32 adc x17,x11,xzr // can't overflow subs x10,x14,x8 // "*0xffff0001" sbc x11,x14,x9 adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] adcs x15,x16,x9 adcs x16,x17,x10 // +=acc[0]*0xffff0001 adc x17,x11,xzr // can't overflow adds x14,x14,x19 // accumulate upper half adcs x15,x15,x20 adcs x16,x16,x1 adcs x17,x17,x2 adc x19,xzr,xzr adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus sbcs x9,x15,x12 sbcs x10,x16,xzr sbcs x11,x17,x13 sbcs xzr,x19,xzr // did it borrow? csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus csel x15,x15,x9,lo csel x16,x16,x10,lo stp x14,x15,[x0] csel x17,x17,x11,lo stp x16,x17,[x0,#16] ret // Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to // x4-x7 and x8-x11. This is done because it's used in multiple // contexts, e.g. in multiplication by 2 and 3... .align 4 __ecp_nistz256_add_to: adds x14,x14,x8 // ret = a+b adcs x15,x15,x9 adcs x16,x16,x10 adcs x17,x17,x11 adc x1,xzr,xzr // zap x1 adds x8,x14,#1 // subs x8,x4,#-1 // tmp = ret-modulus sbcs x9,x15,x12 sbcs x10,x16,xzr sbcs x11,x17,x13 sbcs xzr,x1,xzr // did subtraction borrow? csel x14,x14,x8,lo // ret = borrow ? 
ret : ret-modulus csel x15,x15,x9,lo csel x16,x16,x10,lo stp x14,x15,[x0] csel x17,x17,x11,lo stp x16,x17,[x0,#16] ret .align 4 __ecp_nistz256_sub_from: ldp x8,x9,[x2] ldp x10,x11,[x2,#16] subs x14,x14,x8 // ret = a-b sbcs x15,x15,x9 sbcs x16,x16,x10 sbcs x17,x17,x11 sbc x1,xzr,xzr // zap x1 subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus adcs x9,x15,x12 adcs x10,x16,xzr adc x11,x17,x13 cmp x1,xzr // did subtraction borrow? csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret csel x15,x15,x9,eq csel x16,x16,x10,eq stp x14,x15,[x0] csel x17,x17,x11,eq stp x16,x17,[x0,#16] ret .align 4 __ecp_nistz256_sub_morf: ldp x8,x9,[x2] ldp x10,x11,[x2,#16] subs x14,x8,x14 // ret = b-a sbcs x15,x9,x15 sbcs x16,x10,x16 sbcs x17,x11,x17 sbc x1,xzr,xzr // zap x1 subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus adcs x9,x15,x12 adcs x10,x16,xzr adc x11,x17,x13 cmp x1,xzr // did subtraction borrow? csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret csel x15,x15,x9,eq csel x16,x16,x10,eq stp x14,x15,[x0] csel x17,x17,x11,eq stp x16,x17,[x0,#16] ret .align 4 __ecp_nistz256_div_by_2: subs x8,x14,#1 // adds x8,x4,#-1 // tmp = a+modulus adcs x9,x15,x12 adcs x10,x16,xzr adcs x11,x17,x13 adc x1,xzr,xzr // zap x1 tst x14,#1 // is a even? csel x14,x14,x8,eq // ret = even ? a : a+modulus csel x15,x15,x9,eq csel x16,x16,x10,eq csel x17,x17,x11,eq csel x1,xzr,x1,eq lsr x14,x14,#1 // ret >>= 1 orr x14,x14,x15,lsl#63 lsr x15,x15,#1 orr x15,x15,x16,lsl#63 lsr x16,x16,#1 orr x16,x16,x17,lsl#63 lsr x17,x17,#1 stp x14,x15,[x0] orr x17,x17,x1,lsl#63 stp x16,x17,[x0,#16] ret .globl _ecp_nistz256_point_double .private_extern _ecp_nistz256_point_double .align 5 _ecp_nistz256_point_double: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-96]! add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] sub sp,sp,#32*4 Ldouble_shortcut: ldp x14,x15,[x1,#32] mov x21,x0 ldp x16,x17,[x1,#48] mov x22,x1 adrp x13,Lpoly@PAGE add x13,x13,Lpoly@PAGEOFF ldr x12,[x13,#8] mov x8,x14 ldr x13,[x13,#24] mov x9,x15 ldp x4,x5,[x22,#64] // forward load for p256_sqr_mont mov x10,x16 mov x11,x17 ldp x6,x7,[x22,#64+16] add x0,sp,#0 bl __ecp_nistz256_add_to // p256_mul_by_2(S, in_y); add x0,sp,#64 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z); ldp x8,x9,[x22] ldp x10,x11,[x22,#16] mov x4,x14 // put Zsqr aside for p256_sub mov x5,x15 mov x6,x16 mov x7,x17 add x0,sp,#32 bl __ecp_nistz256_add_to // p256_add(M, Zsqr, in_x); add x2,x22,#0 mov x14,x4 // restore Zsqr mov x15,x5 ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont mov x16,x6 mov x17,x7 ldp x6,x7,[sp,#0+16] add x0,sp,#64 bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr); add x0,sp,#0 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S); ldr x3,[x22,#32] ldp x4,x5,[x22,#64] ldp x6,x7,[x22,#64+16] add x2,x22,#32 add x0,sp,#96 bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y); mov x8,x14 mov x9,x15 ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont mov x10,x16 mov x11,x17 ldp x6,x7,[sp,#0+16] add x0,x21,#64 bl __ecp_nistz256_add_to // p256_mul_by_2(res_z, tmp0); add x0,sp,#96 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S); ldr x3,[sp,#64] // forward load for p256_mul_mont ldp x4,x5,[sp,#32] ldp x6,x7,[sp,#32+16] add x0,x21,#32 bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0); add x2,sp,#64 add x0,sp,#32 bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr); mov x8,x14 // duplicate M mov x9,x15 mov x10,x16 mov x11,x17 mov x4,x14 // put M aside mov x5,x15 mov x6,x16 mov x7,x17 add x0,sp,#32 bl __ecp_nistz256_add_to mov x8,x4 // restore M mov x9,x5 ldr 
x3,[x22] // forward load for p256_mul_mont mov x10,x6 ldp x4,x5,[sp,#0] mov x11,x7 ldp x6,x7,[sp,#0+16] bl __ecp_nistz256_add_to // p256_mul_by_3(M, M); add x2,x22,#0 add x0,sp,#0 bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x); mov x8,x14 mov x9,x15 ldp x4,x5,[sp,#32] // forward load for p256_sqr_mont mov x10,x16 mov x11,x17 ldp x6,x7,[sp,#32+16] add x0,sp,#96 bl __ecp_nistz256_add_to // p256_mul_by_2(tmp0, S); add x0,x21,#0 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M); add x2,sp,#96 bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0); add x2,sp,#0 add x0,sp,#0 bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x); ldr x3,[sp,#32] mov x4,x14 // copy S mov x5,x15 mov x6,x16 mov x7,x17 add x2,sp,#32 bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M); add x2,x21,#32 add x0,x21,#32 bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y); add sp,x29,#0 // destroy frame ldp x19,x20,[x29,#16] ldp x21,x22,[x29,#32] ldp x29,x30,[sp],#96 AARCH64_VALIDATE_LINK_REGISTER ret .globl _ecp_nistz256_point_add .private_extern _ecp_nistz256_point_add .align 5 _ecp_nistz256_point_add: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-96]! add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] sub sp,sp,#32*12 ldp x4,x5,[x2,#64] // in2_z ldp x6,x7,[x2,#64+16] mov x21,x0 mov x22,x1 mov x23,x2 adrp x13,Lpoly@PAGE add x13,x13,Lpoly@PAGEOFF ldr x12,[x13,#8] ldr x13,[x13,#24] orr x8,x4,x5 orr x10,x6,x7 orr x25,x8,x10 cmp x25,#0 csetm x25,ne // ~in2infty add x0,sp,#192 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z); ldp x4,x5,[x22,#64] // in1_z ldp x6,x7,[x22,#64+16] orr x8,x4,x5 orr x10,x6,x7 orr x24,x8,x10 cmp x24,#0 csetm x24,ne // ~in1infty add x0,sp,#128 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); ldr x3,[x23,#64] ldp x4,x5,[sp,#192] ldp x6,x7,[sp,#192+16] add x2,x23,#64 add x0,sp,#320 bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z); ldr x3,[x22,#64] ldp x4,x5,[sp,#128] ldp x6,x7,[sp,#128+16] add x2,x22,#64 add x0,sp,#352 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); ldr x3,[x22,#32] ldp x4,x5,[sp,#320] ldp x6,x7,[sp,#320+16] add x2,x22,#32 add x0,sp,#320 bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y); ldr x3,[x23,#32] ldp x4,x5,[sp,#352] ldp x6,x7,[sp,#352+16] add x2,x23,#32 add x0,sp,#352 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); add x2,sp,#320 ldr x3,[sp,#192] // forward load for p256_mul_mont ldp x4,x5,[x22] ldp x6,x7,[x22,#16] add x0,sp,#160 bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1); orr x14,x14,x15 // see if result is zero orr x16,x16,x17 orr x26,x14,x16 // ~is_equal(S1,S2) add x2,sp,#192 add x0,sp,#256 bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr); ldr x3,[sp,#128] ldp x4,x5,[x23] ldp x6,x7,[x23,#16] add x2,sp,#128 add x0,sp,#288 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr); add x2,sp,#256 ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont ldp x6,x7,[sp,#160+16] add x0,sp,#96 bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1); orr x14,x14,x15 // see if result is zero orr x16,x16,x17 orr x14,x14,x16 // ~is_equal(U1,U2) mvn x27,x24 // -1/0 -> 0/-1 mvn x28,x25 // -1/0 -> 0/-1 orr x14,x14,x27 orr x14,x14,x28 orr x14,x14,x26 cbnz x14,Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2)) Ladd_double: mov x1,x22 mov x0,x21 ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] add sp,sp,#256 // #256 is from #32*(12-4). 
difference in stack frames b Ldouble_shortcut .align 4 Ladd_proceed: add x0,sp,#192 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); ldr x3,[x22,#64] ldp x4,x5,[sp,#96] ldp x6,x7,[sp,#96+16] add x2,x22,#64 add x0,sp,#64 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); ldp x4,x5,[sp,#96] ldp x6,x7,[sp,#96+16] add x0,sp,#128 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); ldr x3,[x23,#64] ldp x4,x5,[sp,#64] ldp x6,x7,[sp,#64+16] add x2,x23,#64 add x0,sp,#64 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z); ldr x3,[sp,#96] ldp x4,x5,[sp,#128] ldp x6,x7,[sp,#128+16] add x2,sp,#96 add x0,sp,#224 bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); ldr x3,[sp,#128] ldp x4,x5,[sp,#256] ldp x6,x7,[sp,#256+16] add x2,sp,#128 add x0,sp,#288 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr); mov x8,x14 mov x9,x15 mov x10,x16 mov x11,x17 add x0,sp,#128 bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); add x2,sp,#192 add x0,sp,#0 bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); add x2,sp,#224 bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); add x2,sp,#288 ldr x3,[sp,#224] // forward load for p256_mul_mont ldp x4,x5,[sp,#320] ldp x6,x7,[sp,#320+16] add x0,sp,#32 bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); add x2,sp,#224 add x0,sp,#352 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub); ldr x3,[sp,#160] ldp x4,x5,[sp,#32] ldp x6,x7,[sp,#32+16] add x2,sp,#160 add x0,sp,#32 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); add x2,sp,#352 bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); ldp x4,x5,[sp,#0] // res ldp x6,x7,[sp,#0+16] ldp x8,x9,[x23] // in2 ldp x10,x11,[x23,#16] ldp x14,x15,[x22,#0] // in1 cmp x24,#0 // ~, remember? ldp x16,x17,[x22,#0+16] csel x8,x4,x8,ne csel x9,x5,x9,ne ldp x4,x5,[sp,#0+0+32] // res csel x10,x6,x10,ne csel x11,x7,x11,ne cmp x25,#0 // ~, remember? ldp x6,x7,[sp,#0+0+48] csel x14,x8,x14,ne csel x15,x9,x15,ne ldp x8,x9,[x23,#0+32] // in2 csel x16,x10,x16,ne csel x17,x11,x17,ne ldp x10,x11,[x23,#0+48] stp x14,x15,[x21,#0] stp x16,x17,[x21,#0+16] ldp x14,x15,[x22,#32] // in1 cmp x24,#0 // ~, remember? ldp x16,x17,[x22,#32+16] csel x8,x4,x8,ne csel x9,x5,x9,ne ldp x4,x5,[sp,#0+32+32] // res csel x10,x6,x10,ne csel x11,x7,x11,ne cmp x25,#0 // ~, remember? ldp x6,x7,[sp,#0+32+48] csel x14,x8,x14,ne csel x15,x9,x15,ne ldp x8,x9,[x23,#32+32] // in2 csel x16,x10,x16,ne csel x17,x11,x17,ne ldp x10,x11,[x23,#32+48] stp x14,x15,[x21,#32] stp x16,x17,[x21,#32+16] ldp x14,x15,[x22,#64] // in1 cmp x24,#0 // ~, remember? ldp x16,x17,[x22,#64+16] csel x8,x4,x8,ne csel x9,x5,x9,ne csel x10,x6,x10,ne csel x11,x7,x11,ne cmp x25,#0 // ~, remember? csel x14,x8,x14,ne csel x15,x9,x15,ne csel x16,x10,x16,ne csel x17,x11,x17,ne stp x14,x15,[x21,#64] stp x16,x17,[x21,#64+16] Ladd_done: add sp,x29,#0 // destroy frame ldp x19,x20,[x29,#16] ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 AARCH64_VALIDATE_LINK_REGISTER ret .globl _ecp_nistz256_point_add_affine .private_extern _ecp_nistz256_point_add_affine .align 5 _ecp_nistz256_point_add_affine: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-80]! 
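// Mixed addition: the second input is an affine point, so its Z coordinate
// is implicitly 1 and the Z2^2/Z2^3 work of the general case is skipped.
// ~in1infty/~in2infty are built as all-ones/all-zero masks (orr + csetm),
// and the final csel chains pick the computed sum, the affine input (with
// Lone_mont as its Z), or the Jacobian input without branching.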
add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] sub sp,sp,#32*10 mov x21,x0 mov x22,x1 mov x23,x2 adrp x13,Lpoly@PAGE add x13,x13,Lpoly@PAGEOFF ldr x12,[x13,#8] ldr x13,[x13,#24] ldp x4,x5,[x1,#64] // in1_z ldp x6,x7,[x1,#64+16] orr x8,x4,x5 orr x10,x6,x7 orr x24,x8,x10 cmp x24,#0 csetm x24,ne // ~in1infty ldp x14,x15,[x2] // in2_x ldp x16,x17,[x2,#16] ldp x8,x9,[x2,#32] // in2_y ldp x10,x11,[x2,#48] orr x14,x14,x15 orr x16,x16,x17 orr x8,x8,x9 orr x10,x10,x11 orr x14,x14,x16 orr x8,x8,x10 orr x25,x14,x8 cmp x25,#0 csetm x25,ne // ~in2infty add x0,sp,#128 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); mov x4,x14 mov x5,x15 mov x6,x16 mov x7,x17 ldr x3,[x23] add x2,x23,#0 add x0,sp,#96 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x); add x2,x22,#0 ldr x3,[x22,#64] // forward load for p256_mul_mont ldp x4,x5,[sp,#128] ldp x6,x7,[sp,#128+16] add x0,sp,#160 bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x); add x2,x22,#64 add x0,sp,#128 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); ldr x3,[x22,#64] ldp x4,x5,[sp,#160] ldp x6,x7,[sp,#160+16] add x2,x22,#64 add x0,sp,#64 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); ldr x3,[x23,#32] ldp x4,x5,[sp,#128] ldp x6,x7,[sp,#128+16] add x2,x23,#32 add x0,sp,#128 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); add x2,x22,#32 ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont ldp x6,x7,[sp,#160+16] add x0,sp,#192 bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y); add x0,sp,#224 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); ldp x4,x5,[sp,#192] ldp x6,x7,[sp,#192+16] add x0,sp,#288 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); ldr x3,[sp,#160] ldp x4,x5,[sp,#224] ldp x6,x7,[sp,#224+16] add x2,sp,#160 add x0,sp,#256 bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); ldr x3,[x22] ldp x4,x5,[sp,#224] ldp x6,x7,[sp,#224+16] add x2,x22,#0 add x0,sp,#96 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr); mov x8,x14 mov x9,x15 mov x10,x16 mov x11,x17 add x0,sp,#224 bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); add x2,sp,#288 add x0,sp,#0 bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); add x2,sp,#256 bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); add x2,sp,#96 ldr x3,[x22,#32] // forward load for p256_mul_mont ldp x4,x5,[sp,#256] ldp x6,x7,[sp,#256+16] add x0,sp,#32 bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); add x2,x22,#32 add x0,sp,#128 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub); ldr x3,[sp,#192] ldp x4,x5,[sp,#32] ldp x6,x7,[sp,#32+16] add x2,sp,#192 add x0,sp,#32 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); add x2,sp,#128 bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); ldp x4,x5,[sp,#0] // res ldp x6,x7,[sp,#0+16] ldp x8,x9,[x23] // in2 ldp x10,x11,[x23,#16] ldp x14,x15,[x22,#0] // in1 cmp x24,#0 // ~, remember? ldp x16,x17,[x22,#0+16] csel x8,x4,x8,ne csel x9,x5,x9,ne ldp x4,x5,[sp,#0+0+32] // res csel x10,x6,x10,ne csel x11,x7,x11,ne cmp x25,#0 // ~, remember? ldp x6,x7,[sp,#0+0+48] csel x14,x8,x14,ne csel x15,x9,x15,ne ldp x8,x9,[x23,#0+32] // in2 csel x16,x10,x16,ne csel x17,x11,x17,ne ldp x10,x11,[x23,#0+48] stp x14,x15,[x21,#0] stp x16,x17,[x21,#0+16] adrp x23,Lone_mont@PAGE-64 add x23,x23,Lone_mont@PAGEOFF-64 ldp x14,x15,[x22,#32] // in1 cmp x24,#0 // ~, remember? 
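// x24 = ~in1infty and x25 = ~in2infty drive the selection above and below:
// for each group of limbs the result is taken from the computed sum,
// replaced by in2 (in2_x, in2_y, Lone_mont) when in1 is at infinity, or by
// in1 when in2 is at infinity -- all via cmp/csel rather than branches.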
ldp x16,x17,[x22,#32+16] csel x8,x4,x8,ne csel x9,x5,x9,ne ldp x4,x5,[sp,#0+32+32] // res csel x10,x6,x10,ne csel x11,x7,x11,ne cmp x25,#0 // ~, remember? ldp x6,x7,[sp,#0+32+48] csel x14,x8,x14,ne csel x15,x9,x15,ne ldp x8,x9,[x23,#32+32] // in2 csel x16,x10,x16,ne csel x17,x11,x17,ne ldp x10,x11,[x23,#32+48] stp x14,x15,[x21,#32] stp x16,x17,[x21,#32+16] ldp x14,x15,[x22,#64] // in1 cmp x24,#0 // ~, remember? ldp x16,x17,[x22,#64+16] csel x8,x4,x8,ne csel x9,x5,x9,ne csel x10,x6,x10,ne csel x11,x7,x11,ne cmp x25,#0 // ~, remember? csel x14,x8,x14,ne csel x15,x9,x15,ne csel x16,x10,x16,ne csel x17,x11,x17,ne stp x14,x15,[x21,#64] stp x16,x17,[x21,#64+16] add sp,x29,#0 // destroy frame ldp x19,x20,[x29,#16] ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x29,x30,[sp],#80 AARCH64_VALIDATE_LINK_REGISTER ret //////////////////////////////////////////////////////////////////////// // void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4], // uint64_t b[4]); .globl _ecp_nistz256_ord_mul_mont .private_extern _ecp_nistz256_ord_mul_mont .align 4 _ecp_nistz256_ord_mul_mont: AARCH64_VALID_CALL_TARGET // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. stp x29,x30,[sp,#-64]! add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] adrp x23,Lord@PAGE add x23,x23,Lord@PAGEOFF ldr x3,[x2] // bp[0] ldp x4,x5,[x1] ldp x6,x7,[x1,#16] ldp x12,x13,[x23,#0] ldp x21,x22,[x23,#16] ldr x23,[x23,#32] mul x14,x4,x3 // a[0]*b[0] umulh x8,x4,x3 mul x15,x5,x3 // a[1]*b[0] umulh x9,x5,x3 mul x16,x6,x3 // a[2]*b[0] umulh x10,x6,x3 mul x17,x7,x3 // a[3]*b[0] umulh x19,x7,x3 mul x24,x14,x23 adds x15,x15,x8 // accumulate high parts of multiplication adcs x16,x16,x9 adcs x17,x17,x10 adc x19,x19,xzr mov x20,xzr ldr x3,[x2,#8*1] // b[i] lsl x8,x24,#32 subs x16,x16,x24 lsr x9,x24,#32 sbcs x17,x17,x8 sbcs x19,x19,x9 sbc x20,x20,xzr subs xzr,x14,#1 umulh x9,x12,x24 mul x10,x13,x24 umulh x11,x13,x24 adcs x10,x10,x9 mul x8,x4,x3 adc x11,x11,xzr mul x9,x5,x3 adds x14,x15,x10 mul x10,x6,x3 adcs x15,x16,x11 mul x11,x7,x3 adcs x16,x17,x24 adcs x17,x19,x24 adc x19,x20,xzr adds x14,x14,x8 // accumulate low parts umulh x8,x4,x3 adcs x15,x15,x9 umulh x9,x5,x3 adcs x16,x16,x10 umulh x10,x6,x3 adcs x17,x17,x11 umulh x11,x7,x3 adc x19,x19,xzr mul x24,x14,x23 adds x15,x15,x8 // accumulate high parts adcs x16,x16,x9 adcs x17,x17,x10 adcs x19,x19,x11 adc x20,xzr,xzr ldr x3,[x2,#8*2] // b[i] lsl x8,x24,#32 subs x16,x16,x24 lsr x9,x24,#32 sbcs x17,x17,x8 sbcs x19,x19,x9 sbc x20,x20,xzr subs xzr,x14,#1 umulh x9,x12,x24 mul x10,x13,x24 umulh x11,x13,x24 adcs x10,x10,x9 mul x8,x4,x3 adc x11,x11,xzr mul x9,x5,x3 adds x14,x15,x10 mul x10,x6,x3 adcs x15,x16,x11 mul x11,x7,x3 adcs x16,x17,x24 adcs x17,x19,x24 adc x19,x20,xzr adds x14,x14,x8 // accumulate low parts umulh x8,x4,x3 adcs x15,x15,x9 umulh x9,x5,x3 adcs x16,x16,x10 umulh x10,x6,x3 adcs x17,x17,x11 umulh x11,x7,x3 adc x19,x19,xzr mul x24,x14,x23 adds x15,x15,x8 // accumulate high parts adcs x16,x16,x9 adcs x17,x17,x10 adcs x19,x19,x11 adc x20,xzr,xzr ldr x3,[x2,#8*3] // b[i] lsl x8,x24,#32 subs x16,x16,x24 lsr x9,x24,#32 sbcs x17,x17,x8 sbcs x19,x19,x9 sbc x20,x20,xzr subs xzr,x14,#1 umulh x9,x12,x24 mul x10,x13,x24 umulh x11,x13,x24 adcs x10,x10,x9 mul x8,x4,x3 adc x11,x11,xzr mul x9,x5,x3 adds x14,x15,x10 mul x10,x6,x3 adcs x15,x16,x11 mul x11,x7,x3 adcs x16,x17,x24 adcs x17,x19,x24 adc x19,x20,xzr adds x14,x14,x8 // accumulate low parts umulh x8,x4,x3 adcs x15,x15,x9 umulh x9,x5,x3 adcs x16,x16,x10 umulh x10,x6,x3 
adcs x17,x17,x11 umulh x11,x7,x3 adc x19,x19,xzr mul x24,x14,x23 adds x15,x15,x8 // accumulate high parts adcs x16,x16,x9 adcs x17,x17,x10 adcs x19,x19,x11 adc x20,xzr,xzr lsl x8,x24,#32 // last reduction subs x16,x16,x24 lsr x9,x24,#32 sbcs x17,x17,x8 sbcs x19,x19,x9 sbc x20,x20,xzr subs xzr,x14,#1 umulh x9,x12,x24 mul x10,x13,x24 umulh x11,x13,x24 adcs x10,x10,x9 adc x11,x11,xzr adds x14,x15,x10 adcs x15,x16,x11 adcs x16,x17,x24 adcs x17,x19,x24 adc x19,x20,xzr subs x8,x14,x12 // ret -= modulus sbcs x9,x15,x13 sbcs x10,x16,x21 sbcs x11,x17,x22 sbcs xzr,x19,xzr csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus csel x15,x15,x9,lo csel x16,x16,x10,lo stp x14,x15,[x0] csel x17,x17,x11,lo stp x16,x17,[x0,#16] ldp x19,x20,[sp,#16] ldp x21,x22,[sp,#32] ldp x23,x24,[sp,#48] ldr x29,[sp],#64 ret //////////////////////////////////////////////////////////////////////// // void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4], // uint64_t rep); .globl _ecp_nistz256_ord_sqr_mont .private_extern _ecp_nistz256_ord_sqr_mont .align 4 _ecp_nistz256_ord_sqr_mont: AARCH64_VALID_CALL_TARGET // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. stp x29,x30,[sp,#-64]! add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] adrp x23,Lord@PAGE add x23,x23,Lord@PAGEOFF ldp x4,x5,[x1] ldp x6,x7,[x1,#16] ldp x12,x13,[x23,#0] ldp x21,x22,[x23,#16] ldr x23,[x23,#32] b Loop_ord_sqr .align 4 Loop_ord_sqr: sub x2,x2,#1 //////////////////////////////////////////////////////////////// // | | | | | |a1*a0| | // | | | | |a2*a0| | | // | |a3*a2|a3*a0| | | | // | | | |a2*a1| | | | // | | |a3*a1| | | | | // *| | | | | | | | 2| // +|a3*a3|a2*a2|a1*a1|a0*a0| // |--+--+--+--+--+--+--+--| // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow // // "can't overflow" below mark carrying into high part of // multiplication result, which can't overflow, because it // can never be all ones. 
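// Each Loop_ord_sqr iteration squares a[0..3] (off-diagonal products
// doubled, diagonal squares added), then performs four Montgomery
// reduction steps modulo the group order n: m = acc[0]*ordK is chosen so
// that acc[0] + m*n is divisible by 2^64, that multiple of n is added and
// the low limb dropped. A final conditional subtraction (csel) keeps the
// result below n, and the loop repeats for the rep count passed in x2.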
mul x15,x5,x4 // a[1]*a[0] umulh x9,x5,x4 mul x16,x6,x4 // a[2]*a[0] umulh x10,x6,x4 mul x17,x7,x4 // a[3]*a[0] umulh x19,x7,x4 adds x16,x16,x9 // accumulate high parts of multiplication mul x8,x6,x5 // a[2]*a[1] umulh x9,x6,x5 adcs x17,x17,x10 mul x10,x7,x5 // a[3]*a[1] umulh x11,x7,x5 adc x19,x19,xzr // can't overflow mul x20,x7,x6 // a[3]*a[2] umulh x1,x7,x6 adds x9,x9,x10 // accumulate high parts of multiplication mul x14,x4,x4 // a[0]*a[0] adc x10,x11,xzr // can't overflow adds x17,x17,x8 // accumulate low parts of multiplication umulh x4,x4,x4 adcs x19,x19,x9 mul x9,x5,x5 // a[1]*a[1] adcs x20,x20,x10 umulh x5,x5,x5 adc x1,x1,xzr // can't overflow adds x15,x15,x15 // acc[1-6]*=2 mul x10,x6,x6 // a[2]*a[2] adcs x16,x16,x16 umulh x6,x6,x6 adcs x17,x17,x17 mul x11,x7,x7 // a[3]*a[3] adcs x19,x19,x19 umulh x7,x7,x7 adcs x20,x20,x20 adcs x1,x1,x1 adc x3,xzr,xzr adds x15,x15,x4 // +a[i]*a[i] mul x24,x14,x23 adcs x16,x16,x9 adcs x17,x17,x5 adcs x19,x19,x10 adcs x20,x20,x6 adcs x1,x1,x11 adc x3,x3,x7 subs xzr,x14,#1 umulh x9,x12,x24 mul x10,x13,x24 umulh x11,x13,x24 adcs x10,x10,x9 adc x11,x11,xzr adds x14,x15,x10 adcs x15,x16,x11 adcs x16,x17,x24 adc x17,xzr,x24 // can't overflow mul x11,x14,x23 lsl x8,x24,#32 subs x15,x15,x24 lsr x9,x24,#32 sbcs x16,x16,x8 sbc x17,x17,x9 // can't borrow subs xzr,x14,#1 umulh x9,x12,x11 mul x10,x13,x11 umulh x24,x13,x11 adcs x10,x10,x9 adc x24,x24,xzr adds x14,x15,x10 adcs x15,x16,x24 adcs x16,x17,x11 adc x17,xzr,x11 // can't overflow mul x24,x14,x23 lsl x8,x11,#32 subs x15,x15,x11 lsr x9,x11,#32 sbcs x16,x16,x8 sbc x17,x17,x9 // can't borrow subs xzr,x14,#1 umulh x9,x12,x24 mul x10,x13,x24 umulh x11,x13,x24 adcs x10,x10,x9 adc x11,x11,xzr adds x14,x15,x10 adcs x15,x16,x11 adcs x16,x17,x24 adc x17,xzr,x24 // can't overflow mul x11,x14,x23 lsl x8,x24,#32 subs x15,x15,x24 lsr x9,x24,#32 sbcs x16,x16,x8 sbc x17,x17,x9 // can't borrow subs xzr,x14,#1 umulh x9,x12,x11 mul x10,x13,x11 umulh x24,x13,x11 adcs x10,x10,x9 adc x24,x24,xzr adds x14,x15,x10 adcs x15,x16,x24 adcs x16,x17,x11 adc x17,xzr,x11 // can't overflow lsl x8,x11,#32 subs x15,x15,x11 lsr x9,x11,#32 sbcs x16,x16,x8 sbc x17,x17,x9 // can't borrow adds x14,x14,x19 // accumulate upper half adcs x15,x15,x20 adcs x16,x16,x1 adcs x17,x17,x3 adc x19,xzr,xzr subs x8,x14,x12 // ret -= modulus sbcs x9,x15,x13 sbcs x10,x16,x21 sbcs x11,x17,x22 sbcs xzr,x19,xzr csel x4,x14,x8,lo // ret = borrow ? ret : ret-modulus csel x5,x15,x9,lo csel x6,x16,x10,lo csel x7,x17,x11,lo cbnz x2,Loop_ord_sqr stp x4,x5,[x0] stp x6,x7,[x0,#16] ldp x19,x20,[sp,#16] ldp x21,x22,[sp,#32] ldp x23,x24,[sp,#48] ldr x29,[sp],#64 ret //////////////////////////////////////////////////////////////////////// // void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index); .globl _ecp_nistz256_select_w5 .private_extern _ecp_nistz256_select_w5 .align 4 _ecp_nistz256_select_w5: AARCH64_VALID_CALL_TARGET // x10 := x0 // w9 := 0; loop counter and incremented internal index mov x10, x0 mov w9, #0 // [v16-v21] := 0 movi v16.16b, #0 movi v17.16b, #0 movi v18.16b, #0 movi v19.16b, #0 movi v20.16b, #0 movi v21.16b, #0 Lselect_w5_loop: // Loop 16 times. // Increment index (loop counter); tested at the end of the loop add w9, w9, #1 // [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1 // and advance x1 to point to the next entry ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64 // x11 := (w9 == w2)? All 1s : All 0s cmp w9, w2 csetm x11, eq // continue loading ... 
ld1 {v26.2d, v27.2d}, [x1],#32 // duplicate mask_64 into Mask (all 0s or all 1s) dup v3.2d, x11 // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19] // i.e., values in output registers will remain the same if w9 != w2 bit v16.16b, v22.16b, v3.16b bit v17.16b, v23.16b, v3.16b bit v18.16b, v24.16b, v3.16b bit v19.16b, v25.16b, v3.16b bit v20.16b, v26.16b, v3.16b bit v21.16b, v27.16b, v3.16b // If bit #4 is not 0 (i.e. idx_ctr < 16) loop back tbz w9, #4, Lselect_w5_loop // Write [v16-v21] to memory at the output pointer st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64 st1 {v20.2d, v21.2d}, [x10] ret //////////////////////////////////////////////////////////////////////// // void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index); .globl _ecp_nistz256_select_w7 .private_extern _ecp_nistz256_select_w7 .align 4 _ecp_nistz256_select_w7: AARCH64_VALID_CALL_TARGET // w9 := 0; loop counter and incremented internal index mov w9, #0 // [v16-v21] := 0 movi v16.16b, #0 movi v17.16b, #0 movi v18.16b, #0 movi v19.16b, #0 Lselect_w7_loop: // Loop 64 times. // Increment index (loop counter); tested at the end of the loop add w9, w9, #1 // [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1 // and advance x1 to point to the next entry ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64 // x11 := (w9 == w2)? All 1s : All 0s cmp w9, w2 csetm x11, eq // duplicate mask_64 into Mask (all 0s or all 1s) dup v3.2d, x11 // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19] // i.e., values in output registers will remain the same if w9 != w2 bit v16.16b, v22.16b, v3.16b bit v17.16b, v23.16b, v3.16b bit v18.16b, v24.16b, v3.16b bit v19.16b, v25.16b, v3.16b // If bit #6 is not 0 (i.e. idx_ctr < 64) loop back tbz w9, #6, Lselect_w7_loop // Write [v16-v19] to memory at the output pointer st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0] ret #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) ring-0.17.14/pregenerated/p256-armv8-asm-linux64.S000064400000000000000000001076121046102023000173510ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) .section .rodata .align 5 .Lpoly: .quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 .LRR: // 2^512 mod P precomputed for NIST P256 polynomial .quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd .Lone_mont: .quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe .Lone: .quad 1,0,0,0 .Lord: .quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 .LordK: .quad 0xccd1c8aaee00bc4f .byte 69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .text // void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4], // const BN_ULONG x2[4]); .globl ecp_nistz256_mul_mont .hidden ecp_nistz256_mul_mont .type ecp_nistz256_mul_mont,%function .align 4 ecp_nistz256_mul_mont: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-32]! 
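// Montgomery multiplication modulo the P-256 prime. The low limb of Lpoly
// is 2^64-1, so each reduction step's Montgomery factor is simply acc[0];
// the "*0xffff0001" / "+=acc[0]<<96" steps in the inner routine add
// acc[0]*p using only shifts, additions and subtractions thanks to p's
// sparse limbs, and a final csel-based conditional subtraction keeps the
// result below p.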
add x29,sp,#0 stp x19,x20,[sp,#16] ldr x3,[x2] // bp[0] ldp x4,x5,[x1] ldp x6,x7,[x1,#16] adrp x13,.Lpoly add x13,x13,:lo12:.Lpoly ldr x12,[x13,#8] ldr x13,[x13,#24] bl __ecp_nistz256_mul_mont ldp x19,x20,[sp,#16] ldp x29,x30,[sp],#32 AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont // void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); .globl ecp_nistz256_sqr_mont .hidden ecp_nistz256_sqr_mont .type ecp_nistz256_sqr_mont,%function .align 4 ecp_nistz256_sqr_mont: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-32]! add x29,sp,#0 stp x19,x20,[sp,#16] ldp x4,x5,[x1] ldp x6,x7,[x1,#16] adrp x13,.Lpoly add x13,x13,:lo12:.Lpoly ldr x12,[x13,#8] ldr x13,[x13,#24] bl __ecp_nistz256_sqr_mont ldp x19,x20,[sp,#16] ldp x29,x30,[sp],#32 AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont // void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]); .globl ecp_nistz256_neg .hidden ecp_nistz256_neg .type ecp_nistz256_neg,%function .align 4 ecp_nistz256_neg: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 mov x2,x1 mov x14,xzr // a = 0 mov x15,xzr mov x16,xzr mov x17,xzr adrp x13,.Lpoly add x13,x13,:lo12:.Lpoly ldr x12,[x13,#8] ldr x13,[x13,#24] bl __ecp_nistz256_sub_from ldp x29,x30,[sp],#16 AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_neg,.-ecp_nistz256_neg // note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded // to x4-x7 and b[0] - to x3 .type __ecp_nistz256_mul_mont,%function .align 4 __ecp_nistz256_mul_mont: mul x14,x4,x3 // a[0]*b[0] umulh x8,x4,x3 mul x15,x5,x3 // a[1]*b[0] umulh x9,x5,x3 mul x16,x6,x3 // a[2]*b[0] umulh x10,x6,x3 mul x17,x7,x3 // a[3]*b[0] umulh x11,x7,x3 ldr x3,[x2,#8] // b[1] adds x15,x15,x8 // accumulate high parts of multiplication lsl x8,x14,#32 adcs x16,x16,x9 lsr x9,x14,#32 adcs x17,x17,x10 adc x19,xzr,x11 mov x20,xzr subs x10,x14,x8 // "*0xffff0001" sbc x11,x14,x9 adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] mul x8,x4,x3 // lo(a[0]*b[i]) adcs x15,x16,x9 mul x9,x5,x3 // lo(a[1]*b[i]) adcs x16,x17,x10 // +=acc[0]*0xffff0001 mul x10,x6,x3 // lo(a[2]*b[i]) adcs x17,x19,x11 mul x11,x7,x3 // lo(a[3]*b[i]) adc x19,x20,xzr adds x14,x14,x8 // accumulate low parts of multiplication umulh x8,x4,x3 // hi(a[0]*b[i]) adcs x15,x15,x9 umulh x9,x5,x3 // hi(a[1]*b[i]) adcs x16,x16,x10 umulh x10,x6,x3 // hi(a[2]*b[i]) adcs x17,x17,x11 umulh x11,x7,x3 // hi(a[3]*b[i]) adc x19,x19,xzr ldr x3,[x2,#8*(1+1)] // b[1+1] adds x15,x15,x8 // accumulate high parts of multiplication lsl x8,x14,#32 adcs x16,x16,x9 lsr x9,x14,#32 adcs x17,x17,x10 adcs x19,x19,x11 adc x20,xzr,xzr subs x10,x14,x8 // "*0xffff0001" sbc x11,x14,x9 adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] mul x8,x4,x3 // lo(a[0]*b[i]) adcs x15,x16,x9 mul x9,x5,x3 // lo(a[1]*b[i]) adcs x16,x17,x10 // +=acc[0]*0xffff0001 mul x10,x6,x3 // lo(a[2]*b[i]) adcs x17,x19,x11 mul x11,x7,x3 // lo(a[3]*b[i]) adc x19,x20,xzr adds x14,x14,x8 // accumulate low parts of multiplication umulh x8,x4,x3 // hi(a[0]*b[i]) adcs x15,x15,x9 umulh x9,x5,x3 // hi(a[1]*b[i]) adcs x16,x16,x10 umulh x10,x6,x3 // hi(a[2]*b[i]) adcs x17,x17,x11 umulh x11,x7,x3 // hi(a[3]*b[i]) adc x19,x19,xzr ldr x3,[x2,#8*(2+1)] // b[2+1] adds x15,x15,x8 // accumulate high parts of multiplication lsl x8,x14,#32 adcs x16,x16,x9 lsr x9,x14,#32 adcs x17,x17,x10 adcs x19,x19,x11 adc x20,xzr,xzr subs x10,x14,x8 // "*0xffff0001" sbc x11,x14,x9 adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] mul x8,x4,x3 // lo(a[0]*b[i]) adcs x15,x16,x9 mul x9,x5,x3 // lo(a[1]*b[i]) 
adcs x16,x17,x10 // +=acc[0]*0xffff0001 mul x10,x6,x3 // lo(a[2]*b[i]) adcs x17,x19,x11 mul x11,x7,x3 // lo(a[3]*b[i]) adc x19,x20,xzr adds x14,x14,x8 // accumulate low parts of multiplication umulh x8,x4,x3 // hi(a[0]*b[i]) adcs x15,x15,x9 umulh x9,x5,x3 // hi(a[1]*b[i]) adcs x16,x16,x10 umulh x10,x6,x3 // hi(a[2]*b[i]) adcs x17,x17,x11 umulh x11,x7,x3 // hi(a[3]*b[i]) adc x19,x19,xzr adds x15,x15,x8 // accumulate high parts of multiplication lsl x8,x14,#32 adcs x16,x16,x9 lsr x9,x14,#32 adcs x17,x17,x10 adcs x19,x19,x11 adc x20,xzr,xzr // last reduction subs x10,x14,x8 // "*0xffff0001" sbc x11,x14,x9 adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] adcs x15,x16,x9 adcs x16,x17,x10 // +=acc[0]*0xffff0001 adcs x17,x19,x11 adc x19,x20,xzr adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus sbcs x9,x15,x12 sbcs x10,x16,xzr sbcs x11,x17,x13 sbcs xzr,x19,xzr // did it borrow? csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus csel x15,x15,x9,lo csel x16,x16,x10,lo stp x14,x15,[x0] csel x17,x17,x11,lo stp x16,x17,[x0,#16] ret .size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont // note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded // to x4-x7 .type __ecp_nistz256_sqr_mont,%function .align 4 __ecp_nistz256_sqr_mont: // | | | | | |a1*a0| | // | | | | |a2*a0| | | // | |a3*a2|a3*a0| | | | // | | | |a2*a1| | | | // | | |a3*a1| | | | | // *| | | | | | | | 2| // +|a3*a3|a2*a2|a1*a1|a0*a0| // |--+--+--+--+--+--+--+--| // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow // // "can't overflow" below mark carrying into high part of // multiplication result, which can't overflow, because it // can never be all ones. mul x15,x5,x4 // a[1]*a[0] umulh x9,x5,x4 mul x16,x6,x4 // a[2]*a[0] umulh x10,x6,x4 mul x17,x7,x4 // a[3]*a[0] umulh x19,x7,x4 adds x16,x16,x9 // accumulate high parts of multiplication mul x8,x6,x5 // a[2]*a[1] umulh x9,x6,x5 adcs x17,x17,x10 mul x10,x7,x5 // a[3]*a[1] umulh x11,x7,x5 adc x19,x19,xzr // can't overflow mul x20,x7,x6 // a[3]*a[2] umulh x1,x7,x6 adds x9,x9,x10 // accumulate high parts of multiplication mul x14,x4,x4 // a[0]*a[0] adc x10,x11,xzr // can't overflow adds x17,x17,x8 // accumulate low parts of multiplication umulh x4,x4,x4 adcs x19,x19,x9 mul x9,x5,x5 // a[1]*a[1] adcs x20,x20,x10 umulh x5,x5,x5 adc x1,x1,xzr // can't overflow adds x15,x15,x15 // acc[1-6]*=2 mul x10,x6,x6 // a[2]*a[2] adcs x16,x16,x16 umulh x6,x6,x6 adcs x17,x17,x17 mul x11,x7,x7 // a[3]*a[3] adcs x19,x19,x19 umulh x7,x7,x7 adcs x20,x20,x20 adcs x1,x1,x1 adc x2,xzr,xzr adds x15,x15,x4 // +a[i]*a[i] adcs x16,x16,x9 adcs x17,x17,x5 adcs x19,x19,x10 adcs x20,x20,x6 lsl x8,x14,#32 adcs x1,x1,x11 lsr x9,x14,#32 adc x2,x2,x7 subs x10,x14,x8 // "*0xffff0001" sbc x11,x14,x9 adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] adcs x15,x16,x9 lsl x8,x14,#32 adcs x16,x17,x10 // +=acc[0]*0xffff0001 lsr x9,x14,#32 adc x17,x11,xzr // can't overflow subs x10,x14,x8 // "*0xffff0001" sbc x11,x14,x9 adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] adcs x15,x16,x9 lsl x8,x14,#32 adcs x16,x17,x10 // +=acc[0]*0xffff0001 lsr x9,x14,#32 adc x17,x11,xzr // can't overflow subs x10,x14,x8 // "*0xffff0001" sbc x11,x14,x9 adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] adcs x15,x16,x9 lsl x8,x14,#32 adcs x16,x17,x10 // +=acc[0]*0xffff0001 lsr x9,x14,#32 adc x17,x11,xzr // can't overflow subs x10,x14,x8 // "*0xffff0001" sbc x11,x14,x9 adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] adcs x15,x16,x9 adcs x16,x17,x10 // +=acc[0]*0xffff0001 adc x17,x11,xzr // can't overflow adds x14,x14,x19 // accumulate upper 
half adcs x15,x15,x20 adcs x16,x16,x1 adcs x17,x17,x2 adc x19,xzr,xzr adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus sbcs x9,x15,x12 sbcs x10,x16,xzr sbcs x11,x17,x13 sbcs xzr,x19,xzr // did it borrow? csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus csel x15,x15,x9,lo csel x16,x16,x10,lo stp x14,x15,[x0] csel x17,x17,x11,lo stp x16,x17,[x0,#16] ret .size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont // Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to // x4-x7 and x8-x11. This is done because it's used in multiple // contexts, e.g. in multiplication by 2 and 3... .type __ecp_nistz256_add_to,%function .align 4 __ecp_nistz256_add_to: adds x14,x14,x8 // ret = a+b adcs x15,x15,x9 adcs x16,x16,x10 adcs x17,x17,x11 adc x1,xzr,xzr // zap x1 adds x8,x14,#1 // subs x8,x4,#-1 // tmp = ret-modulus sbcs x9,x15,x12 sbcs x10,x16,xzr sbcs x11,x17,x13 sbcs xzr,x1,xzr // did subtraction borrow? csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus csel x15,x15,x9,lo csel x16,x16,x10,lo stp x14,x15,[x0] csel x17,x17,x11,lo stp x16,x17,[x0,#16] ret .size __ecp_nistz256_add_to,.-__ecp_nistz256_add_to .type __ecp_nistz256_sub_from,%function .align 4 __ecp_nistz256_sub_from: ldp x8,x9,[x2] ldp x10,x11,[x2,#16] subs x14,x14,x8 // ret = a-b sbcs x15,x15,x9 sbcs x16,x16,x10 sbcs x17,x17,x11 sbc x1,xzr,xzr // zap x1 subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus adcs x9,x15,x12 adcs x10,x16,xzr adc x11,x17,x13 cmp x1,xzr // did subtraction borrow? csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret csel x15,x15,x9,eq csel x16,x16,x10,eq stp x14,x15,[x0] csel x17,x17,x11,eq stp x16,x17,[x0,#16] ret .size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from .type __ecp_nistz256_sub_morf,%function .align 4 __ecp_nistz256_sub_morf: ldp x8,x9,[x2] ldp x10,x11,[x2,#16] subs x14,x8,x14 // ret = b-a sbcs x15,x9,x15 sbcs x16,x10,x16 sbcs x17,x11,x17 sbc x1,xzr,xzr // zap x1 subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus adcs x9,x15,x12 adcs x10,x16,xzr adc x11,x17,x13 cmp x1,xzr // did subtraction borrow? csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret csel x15,x15,x9,eq csel x16,x16,x10,eq stp x14,x15,[x0] csel x17,x17,x11,eq stp x16,x17,[x0,#16] ret .size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf .type __ecp_nistz256_div_by_2,%function .align 4 __ecp_nistz256_div_by_2: subs x8,x14,#1 // adds x8,x4,#-1 // tmp = a+modulus adcs x9,x15,x12 adcs x10,x16,xzr adcs x11,x17,x13 adc x1,xzr,xzr // zap x1 tst x14,#1 // is a even? csel x14,x14,x8,eq // ret = even ? a : a+modulus csel x15,x15,x9,eq csel x16,x16,x10,eq csel x17,x17,x11,eq csel x1,xzr,x1,eq lsr x14,x14,#1 // ret >>= 1 orr x14,x14,x15,lsl#63 lsr x15,x15,#1 orr x15,x15,x16,lsl#63 lsr x16,x16,#1 orr x16,x16,x17,lsl#63 lsr x17,x17,#1 stp x14,x15,[x0] orr x17,x17,x1,lsl#63 stp x16,x17,[x0,#16] ret .size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2 .globl ecp_nistz256_point_double .hidden ecp_nistz256_point_double .type ecp_nistz256_point_double,%function .align 5 ecp_nistz256_point_double: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-96]! 
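// Jacobian point doubling; the comments at each bl site below name the
// field operations of the doubling formulas (S, M, Zsqr, tmp0,
// res_x/res_y/res_z). .Ldouble_shortcut is also the entry point used by
// point_add when it detects that both inputs are the same finite point.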
add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] sub sp,sp,#32*4 .Ldouble_shortcut: ldp x14,x15,[x1,#32] mov x21,x0 ldp x16,x17,[x1,#48] mov x22,x1 adrp x13,.Lpoly add x13,x13,:lo12:.Lpoly ldr x12,[x13,#8] mov x8,x14 ldr x13,[x13,#24] mov x9,x15 ldp x4,x5,[x22,#64] // forward load for p256_sqr_mont mov x10,x16 mov x11,x17 ldp x6,x7,[x22,#64+16] add x0,sp,#0 bl __ecp_nistz256_add_to // p256_mul_by_2(S, in_y); add x0,sp,#64 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z); ldp x8,x9,[x22] ldp x10,x11,[x22,#16] mov x4,x14 // put Zsqr aside for p256_sub mov x5,x15 mov x6,x16 mov x7,x17 add x0,sp,#32 bl __ecp_nistz256_add_to // p256_add(M, Zsqr, in_x); add x2,x22,#0 mov x14,x4 // restore Zsqr mov x15,x5 ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont mov x16,x6 mov x17,x7 ldp x6,x7,[sp,#0+16] add x0,sp,#64 bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr); add x0,sp,#0 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S); ldr x3,[x22,#32] ldp x4,x5,[x22,#64] ldp x6,x7,[x22,#64+16] add x2,x22,#32 add x0,sp,#96 bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y); mov x8,x14 mov x9,x15 ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont mov x10,x16 mov x11,x17 ldp x6,x7,[sp,#0+16] add x0,x21,#64 bl __ecp_nistz256_add_to // p256_mul_by_2(res_z, tmp0); add x0,sp,#96 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S); ldr x3,[sp,#64] // forward load for p256_mul_mont ldp x4,x5,[sp,#32] ldp x6,x7,[sp,#32+16] add x0,x21,#32 bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0); add x2,sp,#64 add x0,sp,#32 bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr); mov x8,x14 // duplicate M mov x9,x15 mov x10,x16 mov x11,x17 mov x4,x14 // put M aside mov x5,x15 mov x6,x16 mov x7,x17 add x0,sp,#32 bl __ecp_nistz256_add_to mov x8,x4 // restore M mov x9,x5 ldr x3,[x22] // forward load for p256_mul_mont mov x10,x6 ldp x4,x5,[sp,#0] mov x11,x7 ldp x6,x7,[sp,#0+16] bl __ecp_nistz256_add_to // p256_mul_by_3(M, M); add x2,x22,#0 add x0,sp,#0 bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x); mov x8,x14 mov x9,x15 ldp x4,x5,[sp,#32] // forward load for p256_sqr_mont mov x10,x16 mov x11,x17 ldp x6,x7,[sp,#32+16] add x0,sp,#96 bl __ecp_nistz256_add_to // p256_mul_by_2(tmp0, S); add x0,x21,#0 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M); add x2,sp,#96 bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0); add x2,sp,#0 add x0,sp,#0 bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x); ldr x3,[sp,#32] mov x4,x14 // copy S mov x5,x15 mov x6,x16 mov x7,x17 add x2,sp,#32 bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M); add x2,x21,#32 add x0,x21,#32 bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y); add sp,x29,#0 // destroy frame ldp x19,x20,[x29,#16] ldp x21,x22,[x29,#32] ldp x29,x30,[sp],#96 AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_point_double,.-ecp_nistz256_point_double .globl ecp_nistz256_point_add .hidden ecp_nistz256_point_add .type ecp_nistz256_point_add,%function .align 5 ecp_nistz256_point_add: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-96]! 
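// Full Jacobian point addition. ~in1infty/~in2infty are computed as csetm
// masks from the Z coordinates, the equal-inputs case (U1==U2, S1==S2,
// both points finite) falls through to .Ladd_double and re-enters the
// doubling code, and the final cmp/csel cascade substitutes in1 or in2 for
// the computed result when the other input is at infinity.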
add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] sub sp,sp,#32*12 ldp x4,x5,[x2,#64] // in2_z ldp x6,x7,[x2,#64+16] mov x21,x0 mov x22,x1 mov x23,x2 adrp x13,.Lpoly add x13,x13,:lo12:.Lpoly ldr x12,[x13,#8] ldr x13,[x13,#24] orr x8,x4,x5 orr x10,x6,x7 orr x25,x8,x10 cmp x25,#0 csetm x25,ne // ~in2infty add x0,sp,#192 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z); ldp x4,x5,[x22,#64] // in1_z ldp x6,x7,[x22,#64+16] orr x8,x4,x5 orr x10,x6,x7 orr x24,x8,x10 cmp x24,#0 csetm x24,ne // ~in1infty add x0,sp,#128 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); ldr x3,[x23,#64] ldp x4,x5,[sp,#192] ldp x6,x7,[sp,#192+16] add x2,x23,#64 add x0,sp,#320 bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z); ldr x3,[x22,#64] ldp x4,x5,[sp,#128] ldp x6,x7,[sp,#128+16] add x2,x22,#64 add x0,sp,#352 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); ldr x3,[x22,#32] ldp x4,x5,[sp,#320] ldp x6,x7,[sp,#320+16] add x2,x22,#32 add x0,sp,#320 bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y); ldr x3,[x23,#32] ldp x4,x5,[sp,#352] ldp x6,x7,[sp,#352+16] add x2,x23,#32 add x0,sp,#352 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); add x2,sp,#320 ldr x3,[sp,#192] // forward load for p256_mul_mont ldp x4,x5,[x22] ldp x6,x7,[x22,#16] add x0,sp,#160 bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1); orr x14,x14,x15 // see if result is zero orr x16,x16,x17 orr x26,x14,x16 // ~is_equal(S1,S2) add x2,sp,#192 add x0,sp,#256 bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr); ldr x3,[sp,#128] ldp x4,x5,[x23] ldp x6,x7,[x23,#16] add x2,sp,#128 add x0,sp,#288 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr); add x2,sp,#256 ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont ldp x6,x7,[sp,#160+16] add x0,sp,#96 bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1); orr x14,x14,x15 // see if result is zero orr x16,x16,x17 orr x14,x14,x16 // ~is_equal(U1,U2) mvn x27,x24 // -1/0 -> 0/-1 mvn x28,x25 // -1/0 -> 0/-1 orr x14,x14,x27 orr x14,x14,x28 orr x14,x14,x26 cbnz x14,.Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2)) .Ladd_double: mov x1,x22 mov x0,x21 ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] add sp,sp,#256 // #256 is from #32*(12-4). 
difference in stack frames b .Ldouble_shortcut .align 4 .Ladd_proceed: add x0,sp,#192 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); ldr x3,[x22,#64] ldp x4,x5,[sp,#96] ldp x6,x7,[sp,#96+16] add x2,x22,#64 add x0,sp,#64 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); ldp x4,x5,[sp,#96] ldp x6,x7,[sp,#96+16] add x0,sp,#128 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); ldr x3,[x23,#64] ldp x4,x5,[sp,#64] ldp x6,x7,[sp,#64+16] add x2,x23,#64 add x0,sp,#64 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z); ldr x3,[sp,#96] ldp x4,x5,[sp,#128] ldp x6,x7,[sp,#128+16] add x2,sp,#96 add x0,sp,#224 bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); ldr x3,[sp,#128] ldp x4,x5,[sp,#256] ldp x6,x7,[sp,#256+16] add x2,sp,#128 add x0,sp,#288 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr); mov x8,x14 mov x9,x15 mov x10,x16 mov x11,x17 add x0,sp,#128 bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); add x2,sp,#192 add x0,sp,#0 bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); add x2,sp,#224 bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); add x2,sp,#288 ldr x3,[sp,#224] // forward load for p256_mul_mont ldp x4,x5,[sp,#320] ldp x6,x7,[sp,#320+16] add x0,sp,#32 bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); add x2,sp,#224 add x0,sp,#352 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub); ldr x3,[sp,#160] ldp x4,x5,[sp,#32] ldp x6,x7,[sp,#32+16] add x2,sp,#160 add x0,sp,#32 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); add x2,sp,#352 bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); ldp x4,x5,[sp,#0] // res ldp x6,x7,[sp,#0+16] ldp x8,x9,[x23] // in2 ldp x10,x11,[x23,#16] ldp x14,x15,[x22,#0] // in1 cmp x24,#0 // ~, remember? ldp x16,x17,[x22,#0+16] csel x8,x4,x8,ne csel x9,x5,x9,ne ldp x4,x5,[sp,#0+0+32] // res csel x10,x6,x10,ne csel x11,x7,x11,ne cmp x25,#0 // ~, remember? ldp x6,x7,[sp,#0+0+48] csel x14,x8,x14,ne csel x15,x9,x15,ne ldp x8,x9,[x23,#0+32] // in2 csel x16,x10,x16,ne csel x17,x11,x17,ne ldp x10,x11,[x23,#0+48] stp x14,x15,[x21,#0] stp x16,x17,[x21,#0+16] ldp x14,x15,[x22,#32] // in1 cmp x24,#0 // ~, remember? ldp x16,x17,[x22,#32+16] csel x8,x4,x8,ne csel x9,x5,x9,ne ldp x4,x5,[sp,#0+32+32] // res csel x10,x6,x10,ne csel x11,x7,x11,ne cmp x25,#0 // ~, remember? ldp x6,x7,[sp,#0+32+48] csel x14,x8,x14,ne csel x15,x9,x15,ne ldp x8,x9,[x23,#32+32] // in2 csel x16,x10,x16,ne csel x17,x11,x17,ne ldp x10,x11,[x23,#32+48] stp x14,x15,[x21,#32] stp x16,x17,[x21,#32+16] ldp x14,x15,[x22,#64] // in1 cmp x24,#0 // ~, remember? ldp x16,x17,[x22,#64+16] csel x8,x4,x8,ne csel x9,x5,x9,ne csel x10,x6,x10,ne csel x11,x7,x11,ne cmp x25,#0 // ~, remember? csel x14,x8,x14,ne csel x15,x9,x15,ne csel x16,x10,x16,ne csel x17,x11,x17,ne stp x14,x15,[x21,#64] stp x16,x17,[x21,#64+16] .Ladd_done: add sp,x29,#0 // destroy frame ldp x19,x20,[x29,#16] ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_point_add,.-ecp_nistz256_point_add .globl ecp_nistz256_point_add_affine .hidden ecp_nistz256_point_add_affine .type ecp_nistz256_point_add_affine,%function .align 5 ecp_nistz256_point_add_affine: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-80]! 
add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] sub sp,sp,#32*10 mov x21,x0 mov x22,x1 mov x23,x2 adrp x13,.Lpoly add x13,x13,:lo12:.Lpoly ldr x12,[x13,#8] ldr x13,[x13,#24] ldp x4,x5,[x1,#64] // in1_z ldp x6,x7,[x1,#64+16] orr x8,x4,x5 orr x10,x6,x7 orr x24,x8,x10 cmp x24,#0 csetm x24,ne // ~in1infty ldp x14,x15,[x2] // in2_x ldp x16,x17,[x2,#16] ldp x8,x9,[x2,#32] // in2_y ldp x10,x11,[x2,#48] orr x14,x14,x15 orr x16,x16,x17 orr x8,x8,x9 orr x10,x10,x11 orr x14,x14,x16 orr x8,x8,x10 orr x25,x14,x8 cmp x25,#0 csetm x25,ne // ~in2infty add x0,sp,#128 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); mov x4,x14 mov x5,x15 mov x6,x16 mov x7,x17 ldr x3,[x23] add x2,x23,#0 add x0,sp,#96 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x); add x2,x22,#0 ldr x3,[x22,#64] // forward load for p256_mul_mont ldp x4,x5,[sp,#128] ldp x6,x7,[sp,#128+16] add x0,sp,#160 bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x); add x2,x22,#64 add x0,sp,#128 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); ldr x3,[x22,#64] ldp x4,x5,[sp,#160] ldp x6,x7,[sp,#160+16] add x2,x22,#64 add x0,sp,#64 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); ldr x3,[x23,#32] ldp x4,x5,[sp,#128] ldp x6,x7,[sp,#128+16] add x2,x23,#32 add x0,sp,#128 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); add x2,x22,#32 ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont ldp x6,x7,[sp,#160+16] add x0,sp,#192 bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y); add x0,sp,#224 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); ldp x4,x5,[sp,#192] ldp x6,x7,[sp,#192+16] add x0,sp,#288 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); ldr x3,[sp,#160] ldp x4,x5,[sp,#224] ldp x6,x7,[sp,#224+16] add x2,sp,#160 add x0,sp,#256 bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); ldr x3,[x22] ldp x4,x5,[sp,#224] ldp x6,x7,[sp,#224+16] add x2,x22,#0 add x0,sp,#96 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr); mov x8,x14 mov x9,x15 mov x10,x16 mov x11,x17 add x0,sp,#224 bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); add x2,sp,#288 add x0,sp,#0 bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); add x2,sp,#256 bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); add x2,sp,#96 ldr x3,[x22,#32] // forward load for p256_mul_mont ldp x4,x5,[sp,#256] ldp x6,x7,[sp,#256+16] add x0,sp,#32 bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); add x2,x22,#32 add x0,sp,#128 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub); ldr x3,[sp,#192] ldp x4,x5,[sp,#32] ldp x6,x7,[sp,#32+16] add x2,sp,#192 add x0,sp,#32 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); add x2,sp,#128 bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); ldp x4,x5,[sp,#0] // res ldp x6,x7,[sp,#0+16] ldp x8,x9,[x23] // in2 ldp x10,x11,[x23,#16] ldp x14,x15,[x22,#0] // in1 cmp x24,#0 // ~, remember? ldp x16,x17,[x22,#0+16] csel x8,x4,x8,ne csel x9,x5,x9,ne ldp x4,x5,[sp,#0+0+32] // res csel x10,x6,x10,ne csel x11,x7,x11,ne cmp x25,#0 // ~, remember? ldp x6,x7,[sp,#0+0+48] csel x14,x8,x14,ne csel x15,x9,x15,ne ldp x8,x9,[x23,#0+32] // in2 csel x16,x10,x16,ne csel x17,x11,x17,ne ldp x10,x11,[x23,#0+48] stp x14,x15,[x21,#0] stp x16,x17,[x21,#0+16] adrp x23,.Lone_mont-64 add x23,x23,:lo12:.Lone_mont-64 ldp x14,x15,[x22,#32] // in1 cmp x24,#0 // ~, remember? 
ldp x16,x17,[x22,#32+16] csel x8,x4,x8,ne csel x9,x5,x9,ne ldp x4,x5,[sp,#0+32+32] // res csel x10,x6,x10,ne csel x11,x7,x11,ne cmp x25,#0 // ~, remember? ldp x6,x7,[sp,#0+32+48] csel x14,x8,x14,ne csel x15,x9,x15,ne ldp x8,x9,[x23,#32+32] // in2 csel x16,x10,x16,ne csel x17,x11,x17,ne ldp x10,x11,[x23,#32+48] stp x14,x15,[x21,#32] stp x16,x17,[x21,#32+16] ldp x14,x15,[x22,#64] // in1 cmp x24,#0 // ~, remember? ldp x16,x17,[x22,#64+16] csel x8,x4,x8,ne csel x9,x5,x9,ne csel x10,x6,x10,ne csel x11,x7,x11,ne cmp x25,#0 // ~, remember? csel x14,x8,x14,ne csel x15,x9,x15,ne csel x16,x10,x16,ne csel x17,x11,x17,ne stp x14,x15,[x21,#64] stp x16,x17,[x21,#64+16] add sp,x29,#0 // destroy frame ldp x19,x20,[x29,#16] ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x29,x30,[sp],#80 AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine //////////////////////////////////////////////////////////////////////// // void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4], // uint64_t b[4]); .globl ecp_nistz256_ord_mul_mont .hidden ecp_nistz256_ord_mul_mont .type ecp_nistz256_ord_mul_mont,%function .align 4 ecp_nistz256_ord_mul_mont: AARCH64_VALID_CALL_TARGET // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. stp x29,x30,[sp,#-64]! add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] adrp x23,.Lord add x23,x23,:lo12:.Lord ldr x3,[x2] // bp[0] ldp x4,x5,[x1] ldp x6,x7,[x1,#16] ldp x12,x13,[x23,#0] ldp x21,x22,[x23,#16] ldr x23,[x23,#32] mul x14,x4,x3 // a[0]*b[0] umulh x8,x4,x3 mul x15,x5,x3 // a[1]*b[0] umulh x9,x5,x3 mul x16,x6,x3 // a[2]*b[0] umulh x10,x6,x3 mul x17,x7,x3 // a[3]*b[0] umulh x19,x7,x3 mul x24,x14,x23 adds x15,x15,x8 // accumulate high parts of multiplication adcs x16,x16,x9 adcs x17,x17,x10 adc x19,x19,xzr mov x20,xzr ldr x3,[x2,#8*1] // b[i] lsl x8,x24,#32 subs x16,x16,x24 lsr x9,x24,#32 sbcs x17,x17,x8 sbcs x19,x19,x9 sbc x20,x20,xzr subs xzr,x14,#1 umulh x9,x12,x24 mul x10,x13,x24 umulh x11,x13,x24 adcs x10,x10,x9 mul x8,x4,x3 adc x11,x11,xzr mul x9,x5,x3 adds x14,x15,x10 mul x10,x6,x3 adcs x15,x16,x11 mul x11,x7,x3 adcs x16,x17,x24 adcs x17,x19,x24 adc x19,x20,xzr adds x14,x14,x8 // accumulate low parts umulh x8,x4,x3 adcs x15,x15,x9 umulh x9,x5,x3 adcs x16,x16,x10 umulh x10,x6,x3 adcs x17,x17,x11 umulh x11,x7,x3 adc x19,x19,xzr mul x24,x14,x23 adds x15,x15,x8 // accumulate high parts adcs x16,x16,x9 adcs x17,x17,x10 adcs x19,x19,x11 adc x20,xzr,xzr ldr x3,[x2,#8*2] // b[i] lsl x8,x24,#32 subs x16,x16,x24 lsr x9,x24,#32 sbcs x17,x17,x8 sbcs x19,x19,x9 sbc x20,x20,xzr subs xzr,x14,#1 umulh x9,x12,x24 mul x10,x13,x24 umulh x11,x13,x24 adcs x10,x10,x9 mul x8,x4,x3 adc x11,x11,xzr mul x9,x5,x3 adds x14,x15,x10 mul x10,x6,x3 adcs x15,x16,x11 mul x11,x7,x3 adcs x16,x17,x24 adcs x17,x19,x24 adc x19,x20,xzr adds x14,x14,x8 // accumulate low parts umulh x8,x4,x3 adcs x15,x15,x9 umulh x9,x5,x3 adcs x16,x16,x10 umulh x10,x6,x3 adcs x17,x17,x11 umulh x11,x7,x3 adc x19,x19,xzr mul x24,x14,x23 adds x15,x15,x8 // accumulate high parts adcs x16,x16,x9 adcs x17,x17,x10 adcs x19,x19,x11 adc x20,xzr,xzr ldr x3,[x2,#8*3] // b[i] lsl x8,x24,#32 subs x16,x16,x24 lsr x9,x24,#32 sbcs x17,x17,x8 sbcs x19,x19,x9 sbc x20,x20,xzr subs xzr,x14,#1 umulh x9,x12,x24 mul x10,x13,x24 umulh x11,x13,x24 adcs x10,x10,x9 mul x8,x4,x3 adc x11,x11,xzr mul x9,x5,x3 adds x14,x15,x10 mul x10,x6,x3 adcs x15,x16,x11 mul x11,x7,x3 adcs x16,x17,x24 adcs x17,x19,x24 adc x19,x20,xzr adds x14,x14,x8 // 
accumulate low parts umulh x8,x4,x3 adcs x15,x15,x9 umulh x9,x5,x3 adcs x16,x16,x10 umulh x10,x6,x3 adcs x17,x17,x11 umulh x11,x7,x3 adc x19,x19,xzr mul x24,x14,x23 adds x15,x15,x8 // accumulate high parts adcs x16,x16,x9 adcs x17,x17,x10 adcs x19,x19,x11 adc x20,xzr,xzr lsl x8,x24,#32 // last reduction subs x16,x16,x24 lsr x9,x24,#32 sbcs x17,x17,x8 sbcs x19,x19,x9 sbc x20,x20,xzr subs xzr,x14,#1 umulh x9,x12,x24 mul x10,x13,x24 umulh x11,x13,x24 adcs x10,x10,x9 adc x11,x11,xzr adds x14,x15,x10 adcs x15,x16,x11 adcs x16,x17,x24 adcs x17,x19,x24 adc x19,x20,xzr subs x8,x14,x12 // ret -= modulus sbcs x9,x15,x13 sbcs x10,x16,x21 sbcs x11,x17,x22 sbcs xzr,x19,xzr csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus csel x15,x15,x9,lo csel x16,x16,x10,lo stp x14,x15,[x0] csel x17,x17,x11,lo stp x16,x17,[x0,#16] ldp x19,x20,[sp,#16] ldp x21,x22,[sp,#32] ldp x23,x24,[sp,#48] ldr x29,[sp],#64 ret .size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont //////////////////////////////////////////////////////////////////////// // void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4], // uint64_t rep); .globl ecp_nistz256_ord_sqr_mont .hidden ecp_nistz256_ord_sqr_mont .type ecp_nistz256_ord_sqr_mont,%function .align 4 ecp_nistz256_ord_sqr_mont: AARCH64_VALID_CALL_TARGET // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. stp x29,x30,[sp,#-64]! add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] adrp x23,.Lord add x23,x23,:lo12:.Lord ldp x4,x5,[x1] ldp x6,x7,[x1,#16] ldp x12,x13,[x23,#0] ldp x21,x22,[x23,#16] ldr x23,[x23,#32] b .Loop_ord_sqr .align 4 .Loop_ord_sqr: sub x2,x2,#1 //////////////////////////////////////////////////////////////// // | | | | | |a1*a0| | // | | | | |a2*a0| | | // | |a3*a2|a3*a0| | | | // | | | |a2*a1| | | | // | | |a3*a1| | | | | // *| | | | | | | | 2| // +|a3*a3|a2*a2|a1*a1|a0*a0| // |--+--+--+--+--+--+--+--| // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow // // "can't overflow" below mark carrying into high part of // multiplication result, which can't overflow, because it // can never be all ones. 
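// Summary of the loop body below: the off-diagonal products a[i]*a[j]
// (i < j) are computed, doubled, and added to the diagonal squares
// a[i]*a[i], giving the full 512-bit a^2 with the low half in x14..x17
// and the high half in x19,x20,x1,x3. The low half is then folded in by
// four word-by-word Montgomery reduction steps modulo the group order
// .Lord, each roughly of the form
//     m = acc[0] * .LordK mod 2^64;  acc = (acc + m * ord) >> 64
// where .LordK is the usual Montgomery constant -ord^-1 mod 2^64, so the
// low limb cancels. After the upper half is accumulated, one conditional
// subtraction of the order (csel on the borrow) keeps the result reduced
// without branching; the whole body repeats x2 (rep) times.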
mul x15,x5,x4 // a[1]*a[0] umulh x9,x5,x4 mul x16,x6,x4 // a[2]*a[0] umulh x10,x6,x4 mul x17,x7,x4 // a[3]*a[0] umulh x19,x7,x4 adds x16,x16,x9 // accumulate high parts of multiplication mul x8,x6,x5 // a[2]*a[1] umulh x9,x6,x5 adcs x17,x17,x10 mul x10,x7,x5 // a[3]*a[1] umulh x11,x7,x5 adc x19,x19,xzr // can't overflow mul x20,x7,x6 // a[3]*a[2] umulh x1,x7,x6 adds x9,x9,x10 // accumulate high parts of multiplication mul x14,x4,x4 // a[0]*a[0] adc x10,x11,xzr // can't overflow adds x17,x17,x8 // accumulate low parts of multiplication umulh x4,x4,x4 adcs x19,x19,x9 mul x9,x5,x5 // a[1]*a[1] adcs x20,x20,x10 umulh x5,x5,x5 adc x1,x1,xzr // can't overflow adds x15,x15,x15 // acc[1-6]*=2 mul x10,x6,x6 // a[2]*a[2] adcs x16,x16,x16 umulh x6,x6,x6 adcs x17,x17,x17 mul x11,x7,x7 // a[3]*a[3] adcs x19,x19,x19 umulh x7,x7,x7 adcs x20,x20,x20 adcs x1,x1,x1 adc x3,xzr,xzr adds x15,x15,x4 // +a[i]*a[i] mul x24,x14,x23 adcs x16,x16,x9 adcs x17,x17,x5 adcs x19,x19,x10 adcs x20,x20,x6 adcs x1,x1,x11 adc x3,x3,x7 subs xzr,x14,#1 umulh x9,x12,x24 mul x10,x13,x24 umulh x11,x13,x24 adcs x10,x10,x9 adc x11,x11,xzr adds x14,x15,x10 adcs x15,x16,x11 adcs x16,x17,x24 adc x17,xzr,x24 // can't overflow mul x11,x14,x23 lsl x8,x24,#32 subs x15,x15,x24 lsr x9,x24,#32 sbcs x16,x16,x8 sbc x17,x17,x9 // can't borrow subs xzr,x14,#1 umulh x9,x12,x11 mul x10,x13,x11 umulh x24,x13,x11 adcs x10,x10,x9 adc x24,x24,xzr adds x14,x15,x10 adcs x15,x16,x24 adcs x16,x17,x11 adc x17,xzr,x11 // can't overflow mul x24,x14,x23 lsl x8,x11,#32 subs x15,x15,x11 lsr x9,x11,#32 sbcs x16,x16,x8 sbc x17,x17,x9 // can't borrow subs xzr,x14,#1 umulh x9,x12,x24 mul x10,x13,x24 umulh x11,x13,x24 adcs x10,x10,x9 adc x11,x11,xzr adds x14,x15,x10 adcs x15,x16,x11 adcs x16,x17,x24 adc x17,xzr,x24 // can't overflow mul x11,x14,x23 lsl x8,x24,#32 subs x15,x15,x24 lsr x9,x24,#32 sbcs x16,x16,x8 sbc x17,x17,x9 // can't borrow subs xzr,x14,#1 umulh x9,x12,x11 mul x10,x13,x11 umulh x24,x13,x11 adcs x10,x10,x9 adc x24,x24,xzr adds x14,x15,x10 adcs x15,x16,x24 adcs x16,x17,x11 adc x17,xzr,x11 // can't overflow lsl x8,x11,#32 subs x15,x15,x11 lsr x9,x11,#32 sbcs x16,x16,x8 sbc x17,x17,x9 // can't borrow adds x14,x14,x19 // accumulate upper half adcs x15,x15,x20 adcs x16,x16,x1 adcs x17,x17,x3 adc x19,xzr,xzr subs x8,x14,x12 // ret -= modulus sbcs x9,x15,x13 sbcs x10,x16,x21 sbcs x11,x17,x22 sbcs xzr,x19,xzr csel x4,x14,x8,lo // ret = borrow ? ret : ret-modulus csel x5,x15,x9,lo csel x6,x16,x10,lo csel x7,x17,x11,lo cbnz x2,.Loop_ord_sqr stp x4,x5,[x0] stp x6,x7,[x0,#16] ldp x19,x20,[sp,#16] ldp x21,x22,[sp,#32] ldp x23,x24,[sp,#48] ldr x29,[sp],#64 ret .size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont //////////////////////////////////////////////////////////////////////// // void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index); .globl ecp_nistz256_select_w5 .hidden ecp_nistz256_select_w5 .type ecp_nistz256_select_w5,%function .align 4 ecp_nistz256_select_w5: AARCH64_VALID_CALL_TARGET // x10 := x0 // w9 := 0; loop counter and incremented internal index mov x10, x0 mov w9, #0 // [v16-v21] := 0 movi v16.16b, #0 movi v17.16b, #0 movi v18.16b, #0 movi v19.16b, #0 movi v20.16b, #0 movi v21.16b, #0 .Lselect_w5_loop: // Loop 16 times. // Increment index (loop counter); tested at the end of the loop add w9, w9, #1 // [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1 // and advance x1 to point to the next entry ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64 // x11 := (w9 == w2)? 
All 1s : All 0s cmp w9, w2 csetm x11, eq // continue loading ... ld1 {v26.2d, v27.2d}, [x1],#32 // duplicate mask_64 into Mask (all 0s or all 1s) dup v3.2d, x11 // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19] // i.e., values in output registers will remain the same if w9 != w2 bit v16.16b, v22.16b, v3.16b bit v17.16b, v23.16b, v3.16b bit v18.16b, v24.16b, v3.16b bit v19.16b, v25.16b, v3.16b bit v20.16b, v26.16b, v3.16b bit v21.16b, v27.16b, v3.16b // If bit #4 is not 0 (i.e. idx_ctr < 16) loop back tbz w9, #4, .Lselect_w5_loop // Write [v16-v21] to memory at the output pointer st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64 st1 {v20.2d, v21.2d}, [x10] ret .size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5 //////////////////////////////////////////////////////////////////////// // void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index); .globl ecp_nistz256_select_w7 .hidden ecp_nistz256_select_w7 .type ecp_nistz256_select_w7,%function .align 4 ecp_nistz256_select_w7: AARCH64_VALID_CALL_TARGET // w9 := 0; loop counter and incremented internal index mov w9, #0 // [v16-v21] := 0 movi v16.16b, #0 movi v17.16b, #0 movi v18.16b, #0 movi v19.16b, #0 .Lselect_w7_loop: // Loop 64 times. // Increment index (loop counter); tested at the end of the loop add w9, w9, #1 // [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1 // and advance x1 to point to the next entry ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64 // x11 := (w9 == w2)? All 1s : All 0s cmp w9, w2 csetm x11, eq // duplicate mask_64 into Mask (all 0s or all 1s) dup v3.2d, x11 // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19] // i.e., values in output registers will remain the same if w9 != w2 bit v16.16b, v22.16b, v3.16b bit v17.16b, v23.16b, v3.16b bit v18.16b, v24.16b, v3.16b bit v19.16b, v25.16b, v3.16b // If bit #6 is not 0 (i.e. idx_ctr < 64) loop back tbz w9, #6, .Lselect_w7_loop // Write [v16-v19] to memory at the output pointer st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0] ret .size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) ring-0.17.14/pregenerated/p256-armv8-asm-win64.S000064400000000000000000001055071046102023000170100ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) .section .rodata .align 5 Lpoly: .quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 LRR: // 2^512 mod P precomputed for NIST P256 polynomial .quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd Lone_mont: .quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe Lone: .quad 1,0,0,0 Lord: .quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 LordK: .quad 0xccd1c8aaee00bc4f .byte 69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .text // void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4], // const BN_ULONG x2[4]); .globl ecp_nistz256_mul_mont .def ecp_nistz256_mul_mont .type 32 .endef .align 4 ecp_nistz256_mul_mont: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-32]! 
add x29,sp,#0 stp x19,x20,[sp,#16] ldr x3,[x2] // bp[0] ldp x4,x5,[x1] ldp x6,x7,[x1,#16] adrp x13,Lpoly add x13,x13,:lo12:Lpoly ldr x12,[x13,#8] ldr x13,[x13,#24] bl __ecp_nistz256_mul_mont ldp x19,x20,[sp,#16] ldp x29,x30,[sp],#32 AARCH64_VALIDATE_LINK_REGISTER ret // void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); .globl ecp_nistz256_sqr_mont .def ecp_nistz256_sqr_mont .type 32 .endef .align 4 ecp_nistz256_sqr_mont: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-32]! add x29,sp,#0 stp x19,x20,[sp,#16] ldp x4,x5,[x1] ldp x6,x7,[x1,#16] adrp x13,Lpoly add x13,x13,:lo12:Lpoly ldr x12,[x13,#8] ldr x13,[x13,#24] bl __ecp_nistz256_sqr_mont ldp x19,x20,[sp,#16] ldp x29,x30,[sp],#32 AARCH64_VALIDATE_LINK_REGISTER ret // void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]); .globl ecp_nistz256_neg .def ecp_nistz256_neg .type 32 .endef .align 4 ecp_nistz256_neg: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 mov x2,x1 mov x14,xzr // a = 0 mov x15,xzr mov x16,xzr mov x17,xzr adrp x13,Lpoly add x13,x13,:lo12:Lpoly ldr x12,[x13,#8] ldr x13,[x13,#24] bl __ecp_nistz256_sub_from ldp x29,x30,[sp],#16 AARCH64_VALIDATE_LINK_REGISTER ret // note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded // to x4-x7 and b[0] - to x3 .def __ecp_nistz256_mul_mont .type 32 .endef .align 4 __ecp_nistz256_mul_mont: mul x14,x4,x3 // a[0]*b[0] umulh x8,x4,x3 mul x15,x5,x3 // a[1]*b[0] umulh x9,x5,x3 mul x16,x6,x3 // a[2]*b[0] umulh x10,x6,x3 mul x17,x7,x3 // a[3]*b[0] umulh x11,x7,x3 ldr x3,[x2,#8] // b[1] adds x15,x15,x8 // accumulate high parts of multiplication lsl x8,x14,#32 adcs x16,x16,x9 lsr x9,x14,#32 adcs x17,x17,x10 adc x19,xzr,x11 mov x20,xzr subs x10,x14,x8 // "*0xffff0001" sbc x11,x14,x9 adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] mul x8,x4,x3 // lo(a[0]*b[i]) adcs x15,x16,x9 mul x9,x5,x3 // lo(a[1]*b[i]) adcs x16,x17,x10 // +=acc[0]*0xffff0001 mul x10,x6,x3 // lo(a[2]*b[i]) adcs x17,x19,x11 mul x11,x7,x3 // lo(a[3]*b[i]) adc x19,x20,xzr adds x14,x14,x8 // accumulate low parts of multiplication umulh x8,x4,x3 // hi(a[0]*b[i]) adcs x15,x15,x9 umulh x9,x5,x3 // hi(a[1]*b[i]) adcs x16,x16,x10 umulh x10,x6,x3 // hi(a[2]*b[i]) adcs x17,x17,x11 umulh x11,x7,x3 // hi(a[3]*b[i]) adc x19,x19,xzr ldr x3,[x2,#8*(1+1)] // b[1+1] adds x15,x15,x8 // accumulate high parts of multiplication lsl x8,x14,#32 adcs x16,x16,x9 lsr x9,x14,#32 adcs x17,x17,x10 adcs x19,x19,x11 adc x20,xzr,xzr subs x10,x14,x8 // "*0xffff0001" sbc x11,x14,x9 adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] mul x8,x4,x3 // lo(a[0]*b[i]) adcs x15,x16,x9 mul x9,x5,x3 // lo(a[1]*b[i]) adcs x16,x17,x10 // +=acc[0]*0xffff0001 mul x10,x6,x3 // lo(a[2]*b[i]) adcs x17,x19,x11 mul x11,x7,x3 // lo(a[3]*b[i]) adc x19,x20,xzr adds x14,x14,x8 // accumulate low parts of multiplication umulh x8,x4,x3 // hi(a[0]*b[i]) adcs x15,x15,x9 umulh x9,x5,x3 // hi(a[1]*b[i]) adcs x16,x16,x10 umulh x10,x6,x3 // hi(a[2]*b[i]) adcs x17,x17,x11 umulh x11,x7,x3 // hi(a[3]*b[i]) adc x19,x19,xzr ldr x3,[x2,#8*(2+1)] // b[2+1] adds x15,x15,x8 // accumulate high parts of multiplication lsl x8,x14,#32 adcs x16,x16,x9 lsr x9,x14,#32 adcs x17,x17,x10 adcs x19,x19,x11 adc x20,xzr,xzr subs x10,x14,x8 // "*0xffff0001" sbc x11,x14,x9 adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] mul x8,x4,x3 // lo(a[0]*b[i]) adcs x15,x16,x9 mul x9,x5,x3 // lo(a[1]*b[i]) adcs x16,x17,x10 // +=acc[0]*0xffff0001 mul x10,x6,x3 // lo(a[2]*b[i]) adcs x17,x19,x11 mul x11,x7,x3 // lo(a[3]*b[i]) adc x19,x20,xzr adds x14,x14,x8 // accumulate low parts of 
multiplication umulh x8,x4,x3 // hi(a[0]*b[i]) adcs x15,x15,x9 umulh x9,x5,x3 // hi(a[1]*b[i]) adcs x16,x16,x10 umulh x10,x6,x3 // hi(a[2]*b[i]) adcs x17,x17,x11 umulh x11,x7,x3 // hi(a[3]*b[i]) adc x19,x19,xzr adds x15,x15,x8 // accumulate high parts of multiplication lsl x8,x14,#32 adcs x16,x16,x9 lsr x9,x14,#32 adcs x17,x17,x10 adcs x19,x19,x11 adc x20,xzr,xzr // last reduction subs x10,x14,x8 // "*0xffff0001" sbc x11,x14,x9 adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] adcs x15,x16,x9 adcs x16,x17,x10 // +=acc[0]*0xffff0001 adcs x17,x19,x11 adc x19,x20,xzr adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus sbcs x9,x15,x12 sbcs x10,x16,xzr sbcs x11,x17,x13 sbcs xzr,x19,xzr // did it borrow? csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus csel x15,x15,x9,lo csel x16,x16,x10,lo stp x14,x15,[x0] csel x17,x17,x11,lo stp x16,x17,[x0,#16] ret // note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded // to x4-x7 .def __ecp_nistz256_sqr_mont .type 32 .endef .align 4 __ecp_nistz256_sqr_mont: // | | | | | |a1*a0| | // | | | | |a2*a0| | | // | |a3*a2|a3*a0| | | | // | | | |a2*a1| | | | // | | |a3*a1| | | | | // *| | | | | | | | 2| // +|a3*a3|a2*a2|a1*a1|a0*a0| // |--+--+--+--+--+--+--+--| // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow // // "can't overflow" below mark carrying into high part of // multiplication result, which can't overflow, because it // can never be all ones. mul x15,x5,x4 // a[1]*a[0] umulh x9,x5,x4 mul x16,x6,x4 // a[2]*a[0] umulh x10,x6,x4 mul x17,x7,x4 // a[3]*a[0] umulh x19,x7,x4 adds x16,x16,x9 // accumulate high parts of multiplication mul x8,x6,x5 // a[2]*a[1] umulh x9,x6,x5 adcs x17,x17,x10 mul x10,x7,x5 // a[3]*a[1] umulh x11,x7,x5 adc x19,x19,xzr // can't overflow mul x20,x7,x6 // a[3]*a[2] umulh x1,x7,x6 adds x9,x9,x10 // accumulate high parts of multiplication mul x14,x4,x4 // a[0]*a[0] adc x10,x11,xzr // can't overflow adds x17,x17,x8 // accumulate low parts of multiplication umulh x4,x4,x4 adcs x19,x19,x9 mul x9,x5,x5 // a[1]*a[1] adcs x20,x20,x10 umulh x5,x5,x5 adc x1,x1,xzr // can't overflow adds x15,x15,x15 // acc[1-6]*=2 mul x10,x6,x6 // a[2]*a[2] adcs x16,x16,x16 umulh x6,x6,x6 adcs x17,x17,x17 mul x11,x7,x7 // a[3]*a[3] adcs x19,x19,x19 umulh x7,x7,x7 adcs x20,x20,x20 adcs x1,x1,x1 adc x2,xzr,xzr adds x15,x15,x4 // +a[i]*a[i] adcs x16,x16,x9 adcs x17,x17,x5 adcs x19,x19,x10 adcs x20,x20,x6 lsl x8,x14,#32 adcs x1,x1,x11 lsr x9,x14,#32 adc x2,x2,x7 subs x10,x14,x8 // "*0xffff0001" sbc x11,x14,x9 adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] adcs x15,x16,x9 lsl x8,x14,#32 adcs x16,x17,x10 // +=acc[0]*0xffff0001 lsr x9,x14,#32 adc x17,x11,xzr // can't overflow subs x10,x14,x8 // "*0xffff0001" sbc x11,x14,x9 adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] adcs x15,x16,x9 lsl x8,x14,#32 adcs x16,x17,x10 // +=acc[0]*0xffff0001 lsr x9,x14,#32 adc x17,x11,xzr // can't overflow subs x10,x14,x8 // "*0xffff0001" sbc x11,x14,x9 adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] adcs x15,x16,x9 lsl x8,x14,#32 adcs x16,x17,x10 // +=acc[0]*0xffff0001 lsr x9,x14,#32 adc x17,x11,xzr // can't overflow subs x10,x14,x8 // "*0xffff0001" sbc x11,x14,x9 adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] adcs x15,x16,x9 adcs x16,x17,x10 // +=acc[0]*0xffff0001 adc x17,x11,xzr // can't overflow adds x14,x14,x19 // accumulate upper half adcs x15,x15,x20 adcs x16,x16,x1 adcs x17,x17,x2 adc x19,xzr,xzr adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus sbcs x9,x15,x12 sbcs x10,x16,xzr sbcs x11,x17,x13 sbcs xzr,x19,xzr // did it borrow? 
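// The value in x14..x17 plus the carry in x19 is at most one modulus
// above the prime, so a single conditional subtraction finishes the
// reduction. Because Lpoly[0] is all-ones, subtracting it equals adding 1,
// which is why the chain starts with "adds ...,#1"; Lpoly[2] is zero,
// hence the "sbcs ...,xzr" limb. The csel sequence below keeps the
// unreduced value when the subtraction borrowed (condition "lo"), so the
// epilogue stays branch-free.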
csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus csel x15,x15,x9,lo csel x16,x16,x10,lo stp x14,x15,[x0] csel x17,x17,x11,lo stp x16,x17,[x0,#16] ret // Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to // x4-x7 and x8-x11. This is done because it's used in multiple // contexts, e.g. in multiplication by 2 and 3... .def __ecp_nistz256_add_to .type 32 .endef .align 4 __ecp_nistz256_add_to: adds x14,x14,x8 // ret = a+b adcs x15,x15,x9 adcs x16,x16,x10 adcs x17,x17,x11 adc x1,xzr,xzr // zap x1 adds x8,x14,#1 // subs x8,x4,#-1 // tmp = ret-modulus sbcs x9,x15,x12 sbcs x10,x16,xzr sbcs x11,x17,x13 sbcs xzr,x1,xzr // did subtraction borrow? csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus csel x15,x15,x9,lo csel x16,x16,x10,lo stp x14,x15,[x0] csel x17,x17,x11,lo stp x16,x17,[x0,#16] ret .def __ecp_nistz256_sub_from .type 32 .endef .align 4 __ecp_nistz256_sub_from: ldp x8,x9,[x2] ldp x10,x11,[x2,#16] subs x14,x14,x8 // ret = a-b sbcs x15,x15,x9 sbcs x16,x16,x10 sbcs x17,x17,x11 sbc x1,xzr,xzr // zap x1 subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus adcs x9,x15,x12 adcs x10,x16,xzr adc x11,x17,x13 cmp x1,xzr // did subtraction borrow? csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret csel x15,x15,x9,eq csel x16,x16,x10,eq stp x14,x15,[x0] csel x17,x17,x11,eq stp x16,x17,[x0,#16] ret .def __ecp_nistz256_sub_morf .type 32 .endef .align 4 __ecp_nistz256_sub_morf: ldp x8,x9,[x2] ldp x10,x11,[x2,#16] subs x14,x8,x14 // ret = b-a sbcs x15,x9,x15 sbcs x16,x10,x16 sbcs x17,x11,x17 sbc x1,xzr,xzr // zap x1 subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus adcs x9,x15,x12 adcs x10,x16,xzr adc x11,x17,x13 cmp x1,xzr // did subtraction borrow? csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret csel x15,x15,x9,eq csel x16,x16,x10,eq stp x14,x15,[x0] csel x17,x17,x11,eq stp x16,x17,[x0,#16] ret .def __ecp_nistz256_div_by_2 .type 32 .endef .align 4 __ecp_nistz256_div_by_2: subs x8,x14,#1 // adds x8,x4,#-1 // tmp = a+modulus adcs x9,x15,x12 adcs x10,x16,xzr adcs x11,x17,x13 adc x1,xzr,xzr // zap x1 tst x14,#1 // is a even? csel x14,x14,x8,eq // ret = even ? a : a+modulus csel x15,x15,x9,eq csel x16,x16,x10,eq csel x17,x17,x11,eq csel x1,xzr,x1,eq lsr x14,x14,#1 // ret >>= 1 orr x14,x14,x15,lsl#63 lsr x15,x15,#1 orr x15,x15,x16,lsl#63 lsr x16,x16,#1 orr x16,x16,x17,lsl#63 lsr x17,x17,#1 stp x14,x15,[x0] orr x17,x17,x1,lsl#63 stp x16,x17,[x0,#16] ret .globl ecp_nistz256_point_double .def ecp_nistz256_point_double .type 32 .endef .align 5 ecp_nistz256_point_double: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-96]! 
add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] sub sp,sp,#32*4 Ldouble_shortcut: ldp x14,x15,[x1,#32] mov x21,x0 ldp x16,x17,[x1,#48] mov x22,x1 adrp x13,Lpoly add x13,x13,:lo12:Lpoly ldr x12,[x13,#8] mov x8,x14 ldr x13,[x13,#24] mov x9,x15 ldp x4,x5,[x22,#64] // forward load for p256_sqr_mont mov x10,x16 mov x11,x17 ldp x6,x7,[x22,#64+16] add x0,sp,#0 bl __ecp_nistz256_add_to // p256_mul_by_2(S, in_y); add x0,sp,#64 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z); ldp x8,x9,[x22] ldp x10,x11,[x22,#16] mov x4,x14 // put Zsqr aside for p256_sub mov x5,x15 mov x6,x16 mov x7,x17 add x0,sp,#32 bl __ecp_nistz256_add_to // p256_add(M, Zsqr, in_x); add x2,x22,#0 mov x14,x4 // restore Zsqr mov x15,x5 ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont mov x16,x6 mov x17,x7 ldp x6,x7,[sp,#0+16] add x0,sp,#64 bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr); add x0,sp,#0 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S); ldr x3,[x22,#32] ldp x4,x5,[x22,#64] ldp x6,x7,[x22,#64+16] add x2,x22,#32 add x0,sp,#96 bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y); mov x8,x14 mov x9,x15 ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont mov x10,x16 mov x11,x17 ldp x6,x7,[sp,#0+16] add x0,x21,#64 bl __ecp_nistz256_add_to // p256_mul_by_2(res_z, tmp0); add x0,sp,#96 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S); ldr x3,[sp,#64] // forward load for p256_mul_mont ldp x4,x5,[sp,#32] ldp x6,x7,[sp,#32+16] add x0,x21,#32 bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0); add x2,sp,#64 add x0,sp,#32 bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr); mov x8,x14 // duplicate M mov x9,x15 mov x10,x16 mov x11,x17 mov x4,x14 // put M aside mov x5,x15 mov x6,x16 mov x7,x17 add x0,sp,#32 bl __ecp_nistz256_add_to mov x8,x4 // restore M mov x9,x5 ldr x3,[x22] // forward load for p256_mul_mont mov x10,x6 ldp x4,x5,[sp,#0] mov x11,x7 ldp x6,x7,[sp,#0+16] bl __ecp_nistz256_add_to // p256_mul_by_3(M, M); add x2,x22,#0 add x0,sp,#0 bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x); mov x8,x14 mov x9,x15 ldp x4,x5,[sp,#32] // forward load for p256_sqr_mont mov x10,x16 mov x11,x17 ldp x6,x7,[sp,#32+16] add x0,sp,#96 bl __ecp_nistz256_add_to // p256_mul_by_2(tmp0, S); add x0,x21,#0 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M); add x2,sp,#96 bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0); add x2,sp,#0 add x0,sp,#0 bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x); ldr x3,[sp,#32] mov x4,x14 // copy S mov x5,x15 mov x6,x16 mov x7,x17 add x2,sp,#32 bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M); add x2,x21,#32 add x0,x21,#32 bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y); add sp,x29,#0 // destroy frame ldp x19,x20,[x29,#16] ldp x21,x22,[x29,#32] ldp x29,x30,[sp],#96 AARCH64_VALIDATE_LINK_REGISTER ret .globl ecp_nistz256_point_add .def ecp_nistz256_point_add .type 32 .endef .align 5 ecp_nistz256_point_add: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-96]! 
add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] sub sp,sp,#32*12 ldp x4,x5,[x2,#64] // in2_z ldp x6,x7,[x2,#64+16] mov x21,x0 mov x22,x1 mov x23,x2 adrp x13,Lpoly add x13,x13,:lo12:Lpoly ldr x12,[x13,#8] ldr x13,[x13,#24] orr x8,x4,x5 orr x10,x6,x7 orr x25,x8,x10 cmp x25,#0 csetm x25,ne // ~in2infty add x0,sp,#192 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z); ldp x4,x5,[x22,#64] // in1_z ldp x6,x7,[x22,#64+16] orr x8,x4,x5 orr x10,x6,x7 orr x24,x8,x10 cmp x24,#0 csetm x24,ne // ~in1infty add x0,sp,#128 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); ldr x3,[x23,#64] ldp x4,x5,[sp,#192] ldp x6,x7,[sp,#192+16] add x2,x23,#64 add x0,sp,#320 bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z); ldr x3,[x22,#64] ldp x4,x5,[sp,#128] ldp x6,x7,[sp,#128+16] add x2,x22,#64 add x0,sp,#352 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); ldr x3,[x22,#32] ldp x4,x5,[sp,#320] ldp x6,x7,[sp,#320+16] add x2,x22,#32 add x0,sp,#320 bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y); ldr x3,[x23,#32] ldp x4,x5,[sp,#352] ldp x6,x7,[sp,#352+16] add x2,x23,#32 add x0,sp,#352 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); add x2,sp,#320 ldr x3,[sp,#192] // forward load for p256_mul_mont ldp x4,x5,[x22] ldp x6,x7,[x22,#16] add x0,sp,#160 bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1); orr x14,x14,x15 // see if result is zero orr x16,x16,x17 orr x26,x14,x16 // ~is_equal(S1,S2) add x2,sp,#192 add x0,sp,#256 bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr); ldr x3,[sp,#128] ldp x4,x5,[x23] ldp x6,x7,[x23,#16] add x2,sp,#128 add x0,sp,#288 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr); add x2,sp,#256 ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont ldp x6,x7,[sp,#160+16] add x0,sp,#96 bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1); orr x14,x14,x15 // see if result is zero orr x16,x16,x17 orr x14,x14,x16 // ~is_equal(U1,U2) mvn x27,x24 // -1/0 -> 0/-1 mvn x28,x25 // -1/0 -> 0/-1 orr x14,x14,x27 orr x14,x14,x28 orr x14,x14,x26 cbnz x14,Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2)) Ladd_double: mov x1,x22 mov x0,x21 ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] add sp,sp,#256 // #256 is from #32*(12-4). 
difference in stack frames b Ldouble_shortcut .align 4 Ladd_proceed: add x0,sp,#192 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); ldr x3,[x22,#64] ldp x4,x5,[sp,#96] ldp x6,x7,[sp,#96+16] add x2,x22,#64 add x0,sp,#64 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); ldp x4,x5,[sp,#96] ldp x6,x7,[sp,#96+16] add x0,sp,#128 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); ldr x3,[x23,#64] ldp x4,x5,[sp,#64] ldp x6,x7,[sp,#64+16] add x2,x23,#64 add x0,sp,#64 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z); ldr x3,[sp,#96] ldp x4,x5,[sp,#128] ldp x6,x7,[sp,#128+16] add x2,sp,#96 add x0,sp,#224 bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); ldr x3,[sp,#128] ldp x4,x5,[sp,#256] ldp x6,x7,[sp,#256+16] add x2,sp,#128 add x0,sp,#288 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr); mov x8,x14 mov x9,x15 mov x10,x16 mov x11,x17 add x0,sp,#128 bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); add x2,sp,#192 add x0,sp,#0 bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); add x2,sp,#224 bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); add x2,sp,#288 ldr x3,[sp,#224] // forward load for p256_mul_mont ldp x4,x5,[sp,#320] ldp x6,x7,[sp,#320+16] add x0,sp,#32 bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); add x2,sp,#224 add x0,sp,#352 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub); ldr x3,[sp,#160] ldp x4,x5,[sp,#32] ldp x6,x7,[sp,#32+16] add x2,sp,#160 add x0,sp,#32 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); add x2,sp,#352 bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); ldp x4,x5,[sp,#0] // res ldp x6,x7,[sp,#0+16] ldp x8,x9,[x23] // in2 ldp x10,x11,[x23,#16] ldp x14,x15,[x22,#0] // in1 cmp x24,#0 // ~, remember? ldp x16,x17,[x22,#0+16] csel x8,x4,x8,ne csel x9,x5,x9,ne ldp x4,x5,[sp,#0+0+32] // res csel x10,x6,x10,ne csel x11,x7,x11,ne cmp x25,#0 // ~, remember? ldp x6,x7,[sp,#0+0+48] csel x14,x8,x14,ne csel x15,x9,x15,ne ldp x8,x9,[x23,#0+32] // in2 csel x16,x10,x16,ne csel x17,x11,x17,ne ldp x10,x11,[x23,#0+48] stp x14,x15,[x21,#0] stp x16,x17,[x21,#0+16] ldp x14,x15,[x22,#32] // in1 cmp x24,#0 // ~, remember? ldp x16,x17,[x22,#32+16] csel x8,x4,x8,ne csel x9,x5,x9,ne ldp x4,x5,[sp,#0+32+32] // res csel x10,x6,x10,ne csel x11,x7,x11,ne cmp x25,#0 // ~, remember? ldp x6,x7,[sp,#0+32+48] csel x14,x8,x14,ne csel x15,x9,x15,ne ldp x8,x9,[x23,#32+32] // in2 csel x16,x10,x16,ne csel x17,x11,x17,ne ldp x10,x11,[x23,#32+48] stp x14,x15,[x21,#32] stp x16,x17,[x21,#32+16] ldp x14,x15,[x22,#64] // in1 cmp x24,#0 // ~, remember? ldp x16,x17,[x22,#64+16] csel x8,x4,x8,ne csel x9,x5,x9,ne csel x10,x6,x10,ne csel x11,x7,x11,ne cmp x25,#0 // ~, remember? csel x14,x8,x14,ne csel x15,x9,x15,ne csel x16,x10,x16,ne csel x17,x11,x17,ne stp x14,x15,[x21,#64] stp x16,x17,[x21,#64+16] Ladd_done: add sp,x29,#0 // destroy frame ldp x19,x20,[x29,#16] ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 AARCH64_VALIDATE_LINK_REGISTER ret .globl ecp_nistz256_point_add_affine .def ecp_nistz256_point_add_affine .type 32 .endef .align 5 ecp_nistz256_point_add_affine: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-80]! 
add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] sub sp,sp,#32*10 mov x21,x0 mov x22,x1 mov x23,x2 adrp x13,Lpoly add x13,x13,:lo12:Lpoly ldr x12,[x13,#8] ldr x13,[x13,#24] ldp x4,x5,[x1,#64] // in1_z ldp x6,x7,[x1,#64+16] orr x8,x4,x5 orr x10,x6,x7 orr x24,x8,x10 cmp x24,#0 csetm x24,ne // ~in1infty ldp x14,x15,[x2] // in2_x ldp x16,x17,[x2,#16] ldp x8,x9,[x2,#32] // in2_y ldp x10,x11,[x2,#48] orr x14,x14,x15 orr x16,x16,x17 orr x8,x8,x9 orr x10,x10,x11 orr x14,x14,x16 orr x8,x8,x10 orr x25,x14,x8 cmp x25,#0 csetm x25,ne // ~in2infty add x0,sp,#128 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); mov x4,x14 mov x5,x15 mov x6,x16 mov x7,x17 ldr x3,[x23] add x2,x23,#0 add x0,sp,#96 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x); add x2,x22,#0 ldr x3,[x22,#64] // forward load for p256_mul_mont ldp x4,x5,[sp,#128] ldp x6,x7,[sp,#128+16] add x0,sp,#160 bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x); add x2,x22,#64 add x0,sp,#128 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); ldr x3,[x22,#64] ldp x4,x5,[sp,#160] ldp x6,x7,[sp,#160+16] add x2,x22,#64 add x0,sp,#64 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); ldr x3,[x23,#32] ldp x4,x5,[sp,#128] ldp x6,x7,[sp,#128+16] add x2,x23,#32 add x0,sp,#128 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); add x2,x22,#32 ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont ldp x6,x7,[sp,#160+16] add x0,sp,#192 bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y); add x0,sp,#224 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); ldp x4,x5,[sp,#192] ldp x6,x7,[sp,#192+16] add x0,sp,#288 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); ldr x3,[sp,#160] ldp x4,x5,[sp,#224] ldp x6,x7,[sp,#224+16] add x2,sp,#160 add x0,sp,#256 bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); ldr x3,[x22] ldp x4,x5,[sp,#224] ldp x6,x7,[sp,#224+16] add x2,x22,#0 add x0,sp,#96 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr); mov x8,x14 mov x9,x15 mov x10,x16 mov x11,x17 add x0,sp,#224 bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); add x2,sp,#288 add x0,sp,#0 bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); add x2,sp,#256 bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); add x2,sp,#96 ldr x3,[x22,#32] // forward load for p256_mul_mont ldp x4,x5,[sp,#256] ldp x6,x7,[sp,#256+16] add x0,sp,#32 bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); add x2,x22,#32 add x0,sp,#128 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub); ldr x3,[sp,#192] ldp x4,x5,[sp,#32] ldp x6,x7,[sp,#32+16] add x2,sp,#192 add x0,sp,#32 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); add x2,sp,#128 bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); ldp x4,x5,[sp,#0] // res ldp x6,x7,[sp,#0+16] ldp x8,x9,[x23] // in2 ldp x10,x11,[x23,#16] ldp x14,x15,[x22,#0] // in1 cmp x24,#0 // ~, remember? ldp x16,x17,[x22,#0+16] csel x8,x4,x8,ne csel x9,x5,x9,ne ldp x4,x5,[sp,#0+0+32] // res csel x10,x6,x10,ne csel x11,x7,x11,ne cmp x25,#0 // ~, remember? ldp x6,x7,[sp,#0+0+48] csel x14,x8,x14,ne csel x15,x9,x15,ne ldp x8,x9,[x23,#0+32] // in2 csel x16,x10,x16,ne csel x17,x11,x17,ne ldp x10,x11,[x23,#0+48] stp x14,x15,[x21,#0] stp x16,x17,[x21,#0+16] adrp x23,Lone_mont-64 add x23,x23,:lo12:Lone_mont-64 ldp x14,x15,[x22,#32] // in1 cmp x24,#0 // ~, remember? ldp x16,x17,[x22,#32+16] csel x8,x4,x8,ne csel x9,x5,x9,ne ldp x4,x5,[sp,#0+32+32] // res csel x10,x6,x10,ne csel x11,x7,x11,ne cmp x25,#0 // ~, remember? 
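// x25 is the "~in2infty" all-ones mask: when in2 is the point at infinity
// the csel pairs below fall back to the in1 limbs; otherwise they keep the
// value already selected under the x24 mask. All loads and stores run
// unconditionally, so the selection is constant-time.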
ldp x6,x7,[sp,#0+32+48] csel x14,x8,x14,ne csel x15,x9,x15,ne ldp x8,x9,[x23,#32+32] // in2 csel x16,x10,x16,ne csel x17,x11,x17,ne ldp x10,x11,[x23,#32+48] stp x14,x15,[x21,#32] stp x16,x17,[x21,#32+16] ldp x14,x15,[x22,#64] // in1 cmp x24,#0 // ~, remember? ldp x16,x17,[x22,#64+16] csel x8,x4,x8,ne csel x9,x5,x9,ne csel x10,x6,x10,ne csel x11,x7,x11,ne cmp x25,#0 // ~, remember? csel x14,x8,x14,ne csel x15,x9,x15,ne csel x16,x10,x16,ne csel x17,x11,x17,ne stp x14,x15,[x21,#64] stp x16,x17,[x21,#64+16] add sp,x29,#0 // destroy frame ldp x19,x20,[x29,#16] ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x29,x30,[sp],#80 AARCH64_VALIDATE_LINK_REGISTER ret //////////////////////////////////////////////////////////////////////// // void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4], // uint64_t b[4]); .globl ecp_nistz256_ord_mul_mont .def ecp_nistz256_ord_mul_mont .type 32 .endef .align 4 ecp_nistz256_ord_mul_mont: AARCH64_VALID_CALL_TARGET // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. stp x29,x30,[sp,#-64]! add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] adrp x23,Lord add x23,x23,:lo12:Lord ldr x3,[x2] // bp[0] ldp x4,x5,[x1] ldp x6,x7,[x1,#16] ldp x12,x13,[x23,#0] ldp x21,x22,[x23,#16] ldr x23,[x23,#32] mul x14,x4,x3 // a[0]*b[0] umulh x8,x4,x3 mul x15,x5,x3 // a[1]*b[0] umulh x9,x5,x3 mul x16,x6,x3 // a[2]*b[0] umulh x10,x6,x3 mul x17,x7,x3 // a[3]*b[0] umulh x19,x7,x3 mul x24,x14,x23 adds x15,x15,x8 // accumulate high parts of multiplication adcs x16,x16,x9 adcs x17,x17,x10 adc x19,x19,xzr mov x20,xzr ldr x3,[x2,#8*1] // b[i] lsl x8,x24,#32 subs x16,x16,x24 lsr x9,x24,#32 sbcs x17,x17,x8 sbcs x19,x19,x9 sbc x20,x20,xzr subs xzr,x14,#1 umulh x9,x12,x24 mul x10,x13,x24 umulh x11,x13,x24 adcs x10,x10,x9 mul x8,x4,x3 adc x11,x11,xzr mul x9,x5,x3 adds x14,x15,x10 mul x10,x6,x3 adcs x15,x16,x11 mul x11,x7,x3 adcs x16,x17,x24 adcs x17,x19,x24 adc x19,x20,xzr adds x14,x14,x8 // accumulate low parts umulh x8,x4,x3 adcs x15,x15,x9 umulh x9,x5,x3 adcs x16,x16,x10 umulh x10,x6,x3 adcs x17,x17,x11 umulh x11,x7,x3 adc x19,x19,xzr mul x24,x14,x23 adds x15,x15,x8 // accumulate high parts adcs x16,x16,x9 adcs x17,x17,x10 adcs x19,x19,x11 adc x20,xzr,xzr ldr x3,[x2,#8*2] // b[i] lsl x8,x24,#32 subs x16,x16,x24 lsr x9,x24,#32 sbcs x17,x17,x8 sbcs x19,x19,x9 sbc x20,x20,xzr subs xzr,x14,#1 umulh x9,x12,x24 mul x10,x13,x24 umulh x11,x13,x24 adcs x10,x10,x9 mul x8,x4,x3 adc x11,x11,xzr mul x9,x5,x3 adds x14,x15,x10 mul x10,x6,x3 adcs x15,x16,x11 mul x11,x7,x3 adcs x16,x17,x24 adcs x17,x19,x24 adc x19,x20,xzr adds x14,x14,x8 // accumulate low parts umulh x8,x4,x3 adcs x15,x15,x9 umulh x9,x5,x3 adcs x16,x16,x10 umulh x10,x6,x3 adcs x17,x17,x11 umulh x11,x7,x3 adc x19,x19,xzr mul x24,x14,x23 adds x15,x15,x8 // accumulate high parts adcs x16,x16,x9 adcs x17,x17,x10 adcs x19,x19,x11 adc x20,xzr,xzr ldr x3,[x2,#8*3] // b[i] lsl x8,x24,#32 subs x16,x16,x24 lsr x9,x24,#32 sbcs x17,x17,x8 sbcs x19,x19,x9 sbc x20,x20,xzr subs xzr,x14,#1 umulh x9,x12,x24 mul x10,x13,x24 umulh x11,x13,x24 adcs x10,x10,x9 mul x8,x4,x3 adc x11,x11,xzr mul x9,x5,x3 adds x14,x15,x10 mul x10,x6,x3 adcs x15,x16,x11 mul x11,x7,x3 adcs x16,x17,x24 adcs x17,x19,x24 adc x19,x20,xzr adds x14,x14,x8 // accumulate low parts umulh x8,x4,x3 adcs x15,x15,x9 umulh x9,x5,x3 adcs x16,x16,x10 umulh x10,x6,x3 adcs x17,x17,x11 umulh x11,x7,x3 adc x19,x19,xzr mul x24,x14,x23 adds x15,x15,x8 // accumulate high parts adcs x16,x16,x9 adcs x17,x17,x10 adcs x19,x19,x11 adc 
x20,xzr,xzr lsl x8,x24,#32 // last reduction subs x16,x16,x24 lsr x9,x24,#32 sbcs x17,x17,x8 sbcs x19,x19,x9 sbc x20,x20,xzr subs xzr,x14,#1 umulh x9,x12,x24 mul x10,x13,x24 umulh x11,x13,x24 adcs x10,x10,x9 adc x11,x11,xzr adds x14,x15,x10 adcs x15,x16,x11 adcs x16,x17,x24 adcs x17,x19,x24 adc x19,x20,xzr subs x8,x14,x12 // ret -= modulus sbcs x9,x15,x13 sbcs x10,x16,x21 sbcs x11,x17,x22 sbcs xzr,x19,xzr csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus csel x15,x15,x9,lo csel x16,x16,x10,lo stp x14,x15,[x0] csel x17,x17,x11,lo stp x16,x17,[x0,#16] ldp x19,x20,[sp,#16] ldp x21,x22,[sp,#32] ldp x23,x24,[sp,#48] ldr x29,[sp],#64 ret //////////////////////////////////////////////////////////////////////// // void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4], // uint64_t rep); .globl ecp_nistz256_ord_sqr_mont .def ecp_nistz256_ord_sqr_mont .type 32 .endef .align 4 ecp_nistz256_ord_sqr_mont: AARCH64_VALID_CALL_TARGET // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. stp x29,x30,[sp,#-64]! add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] adrp x23,Lord add x23,x23,:lo12:Lord ldp x4,x5,[x1] ldp x6,x7,[x1,#16] ldp x12,x13,[x23,#0] ldp x21,x22,[x23,#16] ldr x23,[x23,#32] b Loop_ord_sqr .align 4 Loop_ord_sqr: sub x2,x2,#1 //////////////////////////////////////////////////////////////// // | | | | | |a1*a0| | // | | | | |a2*a0| | | // | |a3*a2|a3*a0| | | | // | | | |a2*a1| | | | // | | |a3*a1| | | | | // *| | | | | | | | 2| // +|a3*a3|a2*a2|a1*a1|a0*a0| // |--+--+--+--+--+--+--+--| // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow // // "can't overflow" below mark carrying into high part of // multiplication result, which can't overflow, because it // can never be all ones. mul x15,x5,x4 // a[1]*a[0] umulh x9,x5,x4 mul x16,x6,x4 // a[2]*a[0] umulh x10,x6,x4 mul x17,x7,x4 // a[3]*a[0] umulh x19,x7,x4 adds x16,x16,x9 // accumulate high parts of multiplication mul x8,x6,x5 // a[2]*a[1] umulh x9,x6,x5 adcs x17,x17,x10 mul x10,x7,x5 // a[3]*a[1] umulh x11,x7,x5 adc x19,x19,xzr // can't overflow mul x20,x7,x6 // a[3]*a[2] umulh x1,x7,x6 adds x9,x9,x10 // accumulate high parts of multiplication mul x14,x4,x4 // a[0]*a[0] adc x10,x11,xzr // can't overflow adds x17,x17,x8 // accumulate low parts of multiplication umulh x4,x4,x4 adcs x19,x19,x9 mul x9,x5,x5 // a[1]*a[1] adcs x20,x20,x10 umulh x5,x5,x5 adc x1,x1,xzr // can't overflow adds x15,x15,x15 // acc[1-6]*=2 mul x10,x6,x6 // a[2]*a[2] adcs x16,x16,x16 umulh x6,x6,x6 adcs x17,x17,x17 mul x11,x7,x7 // a[3]*a[3] adcs x19,x19,x19 umulh x7,x7,x7 adcs x20,x20,x20 adcs x1,x1,x1 adc x3,xzr,xzr adds x15,x15,x4 // +a[i]*a[i] mul x24,x14,x23 adcs x16,x16,x9 adcs x17,x17,x5 adcs x19,x19,x10 adcs x20,x20,x6 adcs x1,x1,x11 adc x3,x3,x7 subs xzr,x14,#1 umulh x9,x12,x24 mul x10,x13,x24 umulh x11,x13,x24 adcs x10,x10,x9 adc x11,x11,xzr adds x14,x15,x10 adcs x15,x16,x11 adcs x16,x17,x24 adc x17,xzr,x24 // can't overflow mul x11,x14,x23 lsl x8,x24,#32 subs x15,x15,x24 lsr x9,x24,#32 sbcs x16,x16,x8 sbc x17,x17,x9 // can't borrow subs xzr,x14,#1 umulh x9,x12,x11 mul x10,x13,x11 umulh x24,x13,x11 adcs x10,x10,x9 adc x24,x24,xzr adds x14,x15,x10 adcs x15,x16,x24 adcs x16,x17,x11 adc x17,xzr,x11 // can't overflow mul x24,x14,x23 lsl x8,x11,#32 subs x15,x15,x11 lsr x9,x11,#32 sbcs x16,x16,x8 sbc x17,x17,x9 // can't borrow subs xzr,x14,#1 umulh x9,x12,x24 mul x10,x13,x24 umulh x11,x13,x24 adcs x10,x10,x9 adc x11,x11,xzr adds x14,x15,x10 adcs x15,x16,x11 adcs x16,x17,x24 adc x17,xzr,x24 // can't 
overflow mul x11,x14,x23 lsl x8,x24,#32 subs x15,x15,x24 lsr x9,x24,#32 sbcs x16,x16,x8 sbc x17,x17,x9 // can't borrow subs xzr,x14,#1 umulh x9,x12,x11 mul x10,x13,x11 umulh x24,x13,x11 adcs x10,x10,x9 adc x24,x24,xzr adds x14,x15,x10 adcs x15,x16,x24 adcs x16,x17,x11 adc x17,xzr,x11 // can't overflow lsl x8,x11,#32 subs x15,x15,x11 lsr x9,x11,#32 sbcs x16,x16,x8 sbc x17,x17,x9 // can't borrow adds x14,x14,x19 // accumulate upper half adcs x15,x15,x20 adcs x16,x16,x1 adcs x17,x17,x3 adc x19,xzr,xzr subs x8,x14,x12 // ret -= modulus sbcs x9,x15,x13 sbcs x10,x16,x21 sbcs x11,x17,x22 sbcs xzr,x19,xzr csel x4,x14,x8,lo // ret = borrow ? ret : ret-modulus csel x5,x15,x9,lo csel x6,x16,x10,lo csel x7,x17,x11,lo cbnz x2,Loop_ord_sqr stp x4,x5,[x0] stp x6,x7,[x0,#16] ldp x19,x20,[sp,#16] ldp x21,x22,[sp,#32] ldp x23,x24,[sp,#48] ldr x29,[sp],#64 ret //////////////////////////////////////////////////////////////////////// // void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index); .globl ecp_nistz256_select_w5 .def ecp_nistz256_select_w5 .type 32 .endef .align 4 ecp_nistz256_select_w5: AARCH64_VALID_CALL_TARGET // x10 := x0 // w9 := 0; loop counter and incremented internal index mov x10, x0 mov w9, #0 // [v16-v21] := 0 movi v16.16b, #0 movi v17.16b, #0 movi v18.16b, #0 movi v19.16b, #0 movi v20.16b, #0 movi v21.16b, #0 Lselect_w5_loop: // Loop 16 times. // Increment index (loop counter); tested at the end of the loop add w9, w9, #1 // [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1 // and advance x1 to point to the next entry ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64 // x11 := (w9 == w2)? All 1s : All 0s cmp w9, w2 csetm x11, eq // continue loading ... ld1 {v26.2d, v27.2d}, [x1],#32 // duplicate mask_64 into Mask (all 0s or all 1s) dup v3.2d, x11 // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19] // i.e., values in output registers will remain the same if w9 != w2 bit v16.16b, v22.16b, v3.16b bit v17.16b, v23.16b, v3.16b bit v18.16b, v24.16b, v3.16b bit v19.16b, v25.16b, v3.16b bit v20.16b, v26.16b, v3.16b bit v21.16b, v27.16b, v3.16b // If bit #4 is not 0 (i.e. idx_ctr < 16) loop back tbz w9, #4, Lselect_w5_loop // Write [v16-v21] to memory at the output pointer st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64 st1 {v20.2d, v21.2d}, [x10] ret //////////////////////////////////////////////////////////////////////// // void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index); .globl ecp_nistz256_select_w7 .def ecp_nistz256_select_w7 .type 32 .endef .align 4 ecp_nistz256_select_w7: AARCH64_VALID_CALL_TARGET // w9 := 0; loop counter and incremented internal index mov w9, #0 // [v16-v21] := 0 movi v16.16b, #0 movi v17.16b, #0 movi v18.16b, #0 movi v19.16b, #0 Lselect_w7_loop: // Loop 64 times. // Increment index (loop counter); tested at the end of the loop add w9, w9, #1 // [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1 // and advance x1 to point to the next entry ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64 // x11 := (w9 == w2)? All 1s : All 0s cmp w9, w2 csetm x11, eq // duplicate mask_64 into Mask (all 0s or all 1s) dup v3.2d, x11 // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19] // i.e., values in output registers will remain the same if w9 != w2 bit v16.16b, v22.16b, v3.16b bit v17.16b, v23.16b, v3.16b bit v18.16b, v24.16b, v3.16b bit v19.16b, v25.16b, v3.16b // If bit #6 is not 0 (i.e. 
idx_ctr < 64) loop back tbz w9, #6, Lselect_w7_loop // Write [v16-v19] to memory at the output pointer st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0] ret #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) ring-0.17.14/pregenerated/p256-x86_64-asm-elf.S000064400000000000000000002314151046102023000165060ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) .text .section .rodata .align 64 .Lpoly: .quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001 .LOne: .long 1,1,1,1,1,1,1,1 .LTwo: .long 2,2,2,2,2,2,2,2 .LThree: .long 3,3,3,3,3,3,3,3 .LONE_mont: .quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe .Lord: .quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000 .LordK: .quad 0xccd1c8aaee00bc4f .text .globl ecp_nistz256_neg .hidden ecp_nistz256_neg .type ecp_nistz256_neg,@function .align 32 ecp_nistz256_neg: .cfi_startproc _CET_ENDBR pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-16 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-24 .Lneg_body: xorq %r8,%r8 xorq %r9,%r9 xorq %r10,%r10 xorq %r11,%r11 xorq %r13,%r13 subq 0(%rsi),%r8 sbbq 8(%rsi),%r9 sbbq 16(%rsi),%r10 movq %r8,%rax sbbq 24(%rsi),%r11 leaq .Lpoly(%rip),%rsi movq %r9,%rdx sbbq $0,%r13 addq 0(%rsi),%r8 movq %r10,%rcx adcq 8(%rsi),%r9 adcq 16(%rsi),%r10 movq %r11,%r12 adcq 24(%rsi),%r11 testq %r13,%r13 cmovzq %rax,%r8 cmovzq %rdx,%r9 movq %r8,0(%rdi) cmovzq %rcx,%r10 movq %r9,8(%rdi) cmovzq %r12,%r11 movq %r10,16(%rdi) movq %r11,24(%rdi) movq 0(%rsp),%r13 .cfi_restore %r13 movq 8(%rsp),%r12 .cfi_restore %r12 leaq 16(%rsp),%rsp .cfi_adjust_cfa_offset -16 .Lneg_epilogue: ret .cfi_endproc .size ecp_nistz256_neg,.-ecp_nistz256_neg .globl ecp_nistz256_ord_mul_mont_nohw .hidden ecp_nistz256_ord_mul_mont_nohw .type ecp_nistz256_ord_mul_mont_nohw,@function .align 32 ecp_nistz256_ord_mul_mont_nohw: .cfi_startproc _CET_ENDBR pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 .Lord_mul_body: movq 0(%rdx),%rax movq %rdx,%rbx leaq .Lord(%rip),%r14 movq .LordK(%rip),%r15 movq %rax,%rcx mulq 0(%rsi) movq %rax,%r8 movq %rcx,%rax movq %rdx,%r9 mulq 8(%rsi) addq %rax,%r9 movq %rcx,%rax adcq $0,%rdx movq %rdx,%r10 mulq 16(%rsi) addq %rax,%r10 movq %rcx,%rax adcq $0,%rdx movq %r8,%r13 imulq %r15,%r8 movq %rdx,%r11 mulq 24(%rsi) addq %rax,%r11 movq %r8,%rax adcq $0,%rdx movq %rdx,%r12 mulq 0(%r14) movq %r8,%rbp addq %rax,%r13 movq %r8,%rax adcq $0,%rdx movq %rdx,%rcx subq %r8,%r10 sbbq $0,%r8 mulq 8(%r14) addq %rcx,%r9 adcq $0,%rdx addq %rax,%r9 movq %rbp,%rax adcq %rdx,%r10 movq %rbp,%rdx adcq $0,%r8 shlq $32,%rax shrq $32,%rdx subq %rax,%r11 movq 8(%rbx),%rax sbbq %rdx,%rbp addq %r8,%r11 adcq %rbp,%r12 adcq $0,%r13 movq %rax,%rcx mulq 0(%rsi) addq %rax,%r9 movq %rcx,%rax adcq $0,%rdx movq %rdx,%rbp mulq 8(%rsi) addq %rbp,%r10 adcq $0,%rdx addq %rax,%r10 movq %rcx,%rax adcq $0,%rdx movq %rdx,%rbp mulq 16(%rsi) addq %rbp,%r11 adcq $0,%rdx addq %rax,%r11 movq %rcx,%rax adcq $0,%rdx movq %r9,%rcx imulq %r15,%r9 movq %rdx,%rbp mulq 24(%rsi) addq %rbp,%r12 adcq $0,%rdx xorq %r8,%r8 addq 
%rax,%r12 movq %r9,%rax adcq %rdx,%r13 adcq $0,%r8 mulq 0(%r14) movq %r9,%rbp addq %rax,%rcx movq %r9,%rax adcq %rdx,%rcx subq %r9,%r11 sbbq $0,%r9 mulq 8(%r14) addq %rcx,%r10 adcq $0,%rdx addq %rax,%r10 movq %rbp,%rax adcq %rdx,%r11 movq %rbp,%rdx adcq $0,%r9 shlq $32,%rax shrq $32,%rdx subq %rax,%r12 movq 16(%rbx),%rax sbbq %rdx,%rbp addq %r9,%r12 adcq %rbp,%r13 adcq $0,%r8 movq %rax,%rcx mulq 0(%rsi) addq %rax,%r10 movq %rcx,%rax adcq $0,%rdx movq %rdx,%rbp mulq 8(%rsi) addq %rbp,%r11 adcq $0,%rdx addq %rax,%r11 movq %rcx,%rax adcq $0,%rdx movq %rdx,%rbp mulq 16(%rsi) addq %rbp,%r12 adcq $0,%rdx addq %rax,%r12 movq %rcx,%rax adcq $0,%rdx movq %r10,%rcx imulq %r15,%r10 movq %rdx,%rbp mulq 24(%rsi) addq %rbp,%r13 adcq $0,%rdx xorq %r9,%r9 addq %rax,%r13 movq %r10,%rax adcq %rdx,%r8 adcq $0,%r9 mulq 0(%r14) movq %r10,%rbp addq %rax,%rcx movq %r10,%rax adcq %rdx,%rcx subq %r10,%r12 sbbq $0,%r10 mulq 8(%r14) addq %rcx,%r11 adcq $0,%rdx addq %rax,%r11 movq %rbp,%rax adcq %rdx,%r12 movq %rbp,%rdx adcq $0,%r10 shlq $32,%rax shrq $32,%rdx subq %rax,%r13 movq 24(%rbx),%rax sbbq %rdx,%rbp addq %r10,%r13 adcq %rbp,%r8 adcq $0,%r9 movq %rax,%rcx mulq 0(%rsi) addq %rax,%r11 movq %rcx,%rax adcq $0,%rdx movq %rdx,%rbp mulq 8(%rsi) addq %rbp,%r12 adcq $0,%rdx addq %rax,%r12 movq %rcx,%rax adcq $0,%rdx movq %rdx,%rbp mulq 16(%rsi) addq %rbp,%r13 adcq $0,%rdx addq %rax,%r13 movq %rcx,%rax adcq $0,%rdx movq %r11,%rcx imulq %r15,%r11 movq %rdx,%rbp mulq 24(%rsi) addq %rbp,%r8 adcq $0,%rdx xorq %r10,%r10 addq %rax,%r8 movq %r11,%rax adcq %rdx,%r9 adcq $0,%r10 mulq 0(%r14) movq %r11,%rbp addq %rax,%rcx movq %r11,%rax adcq %rdx,%rcx subq %r11,%r13 sbbq $0,%r11 mulq 8(%r14) addq %rcx,%r12 adcq $0,%rdx addq %rax,%r12 movq %rbp,%rax adcq %rdx,%r13 movq %rbp,%rdx adcq $0,%r11 shlq $32,%rax shrq $32,%rdx subq %rax,%r8 sbbq %rdx,%rbp addq %r11,%r8 adcq %rbp,%r9 adcq $0,%r10 movq %r12,%rsi subq 0(%r14),%r12 movq %r13,%r11 sbbq 8(%r14),%r13 movq %r8,%rcx sbbq 16(%r14),%r8 movq %r9,%rbp sbbq 24(%r14),%r9 sbbq $0,%r10 cmovcq %rsi,%r12 cmovcq %r11,%r13 cmovcq %rcx,%r8 cmovcq %rbp,%r9 movq %r12,0(%rdi) movq %r13,8(%rdi) movq %r8,16(%rdi) movq %r9,24(%rdi) movq 0(%rsp),%r15 .cfi_restore %r15 movq 8(%rsp),%r14 .cfi_restore %r14 movq 16(%rsp),%r13 .cfi_restore %r13 movq 24(%rsp),%r12 .cfi_restore %r12 movq 32(%rsp),%rbx .cfi_restore %rbx movq 40(%rsp),%rbp .cfi_restore %rbp leaq 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lord_mul_epilogue: ret .cfi_endproc .size ecp_nistz256_ord_mul_mont_nohw,.-ecp_nistz256_ord_mul_mont_nohw .globl ecp_nistz256_ord_sqr_mont_nohw .hidden ecp_nistz256_ord_sqr_mont_nohw .type ecp_nistz256_ord_sqr_mont_nohw,@function .align 32 ecp_nistz256_ord_sqr_mont_nohw: .cfi_startproc _CET_ENDBR pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 .Lord_sqr_body: movq 0(%rsi),%r8 movq 8(%rsi),%rax movq 16(%rsi),%r14 movq 24(%rsi),%r15 leaq .Lord(%rip),%rsi movq %rdx,%rbx jmp .Loop_ord_sqr .align 32 .Loop_ord_sqr: movq %rax,%rbp mulq %r8 movq %rax,%r9 .byte 102,72,15,110,205 movq %r14,%rax movq %rdx,%r10 mulq %r8 addq %rax,%r10 movq %r15,%rax .byte 102,73,15,110,214 adcq $0,%rdx movq %rdx,%r11 mulq %r8 addq %rax,%r11 movq %r15,%rax .byte 102,73,15,110,223 adcq $0,%rdx movq %rdx,%r12 mulq %r14 movq %rax,%r13 movq %r14,%rax 
movq %rdx,%r14 mulq %rbp addq %rax,%r11 movq %r15,%rax adcq $0,%rdx movq %rdx,%r15 mulq %rbp addq %rax,%r12 adcq $0,%rdx addq %r15,%r12 adcq %rdx,%r13 adcq $0,%r14 xorq %r15,%r15 movq %r8,%rax addq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 adcq %r12,%r12 adcq %r13,%r13 adcq %r14,%r14 adcq $0,%r15 mulq %rax movq %rax,%r8 .byte 102,72,15,126,200 movq %rdx,%rbp mulq %rax addq %rbp,%r9 adcq %rax,%r10 .byte 102,72,15,126,208 adcq $0,%rdx movq %rdx,%rbp mulq %rax addq %rbp,%r11 adcq %rax,%r12 .byte 102,72,15,126,216 adcq $0,%rdx movq %rdx,%rbp movq %r8,%rcx imulq 32(%rsi),%r8 mulq %rax addq %rbp,%r13 adcq %rax,%r14 movq 0(%rsi),%rax adcq %rdx,%r15 mulq %r8 movq %r8,%rbp addq %rax,%rcx movq 8(%rsi),%rax adcq %rdx,%rcx subq %r8,%r10 sbbq $0,%rbp mulq %r8 addq %rcx,%r9 adcq $0,%rdx addq %rax,%r9 movq %r8,%rax adcq %rdx,%r10 movq %r8,%rdx adcq $0,%rbp movq %r9,%rcx imulq 32(%rsi),%r9 shlq $32,%rax shrq $32,%rdx subq %rax,%r11 movq 0(%rsi),%rax sbbq %rdx,%r8 addq %rbp,%r11 adcq $0,%r8 mulq %r9 movq %r9,%rbp addq %rax,%rcx movq 8(%rsi),%rax adcq %rdx,%rcx subq %r9,%r11 sbbq $0,%rbp mulq %r9 addq %rcx,%r10 adcq $0,%rdx addq %rax,%r10 movq %r9,%rax adcq %rdx,%r11 movq %r9,%rdx adcq $0,%rbp movq %r10,%rcx imulq 32(%rsi),%r10 shlq $32,%rax shrq $32,%rdx subq %rax,%r8 movq 0(%rsi),%rax sbbq %rdx,%r9 addq %rbp,%r8 adcq $0,%r9 mulq %r10 movq %r10,%rbp addq %rax,%rcx movq 8(%rsi),%rax adcq %rdx,%rcx subq %r10,%r8 sbbq $0,%rbp mulq %r10 addq %rcx,%r11 adcq $0,%rdx addq %rax,%r11 movq %r10,%rax adcq %rdx,%r8 movq %r10,%rdx adcq $0,%rbp movq %r11,%rcx imulq 32(%rsi),%r11 shlq $32,%rax shrq $32,%rdx subq %rax,%r9 movq 0(%rsi),%rax sbbq %rdx,%r10 addq %rbp,%r9 adcq $0,%r10 mulq %r11 movq %r11,%rbp addq %rax,%rcx movq 8(%rsi),%rax adcq %rdx,%rcx subq %r11,%r9 sbbq $0,%rbp mulq %r11 addq %rcx,%r8 adcq $0,%rdx addq %rax,%r8 movq %r11,%rax adcq %rdx,%r9 movq %r11,%rdx adcq $0,%rbp shlq $32,%rax shrq $32,%rdx subq %rax,%r10 sbbq %rdx,%r11 addq %rbp,%r10 adcq $0,%r11 xorq %rdx,%rdx addq %r12,%r8 adcq %r13,%r9 movq %r8,%r12 adcq %r14,%r10 adcq %r15,%r11 movq %r9,%rax adcq $0,%rdx subq 0(%rsi),%r8 movq %r10,%r14 sbbq 8(%rsi),%r9 sbbq 16(%rsi),%r10 movq %r11,%r15 sbbq 24(%rsi),%r11 sbbq $0,%rdx cmovcq %r12,%r8 cmovncq %r9,%rax cmovncq %r10,%r14 cmovncq %r11,%r15 decq %rbx jnz .Loop_ord_sqr movq %r8,0(%rdi) movq %rax,8(%rdi) pxor %xmm1,%xmm1 movq %r14,16(%rdi) pxor %xmm2,%xmm2 movq %r15,24(%rdi) pxor %xmm3,%xmm3 movq 0(%rsp),%r15 .cfi_restore %r15 movq 8(%rsp),%r14 .cfi_restore %r14 movq 16(%rsp),%r13 .cfi_restore %r13 movq 24(%rsp),%r12 .cfi_restore %r12 movq 32(%rsp),%rbx .cfi_restore %rbx movq 40(%rsp),%rbp .cfi_restore %rbp leaq 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lord_sqr_epilogue: ret .cfi_endproc .size ecp_nistz256_ord_sqr_mont_nohw,.-ecp_nistz256_ord_sqr_mont_nohw .globl ecp_nistz256_ord_mul_mont_adx .hidden ecp_nistz256_ord_mul_mont_adx .type ecp_nistz256_ord_mul_mont_adx,@function .align 32 ecp_nistz256_ord_mul_mont_adx: .cfi_startproc .Lecp_nistz256_ord_mul_mont_adx: _CET_ENDBR pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 .Lord_mulx_body: movq %rdx,%rbx movq 0(%rdx),%rdx movq 0(%rsi),%r9 movq 8(%rsi),%r10 movq 16(%rsi),%r11 movq 24(%rsi),%r12 leaq -128(%rsi),%rsi leaq .Lord-128(%rip),%r14 movq .LordK(%rip),%r15 mulxq 
%r9,%r8,%r9 mulxq %r10,%rcx,%r10 mulxq %r11,%rbp,%r11 addq %rcx,%r9 mulxq %r12,%rcx,%r12 movq %r8,%rdx mulxq %r15,%rdx,%rax adcq %rbp,%r10 adcq %rcx,%r11 adcq $0,%r12 xorq %r13,%r13 mulxq 0+128(%r14),%rcx,%rbp adcxq %rcx,%r8 adoxq %rbp,%r9 mulxq 8+128(%r14),%rcx,%rbp adcxq %rcx,%r9 adoxq %rbp,%r10 mulxq 16+128(%r14),%rcx,%rbp adcxq %rcx,%r10 adoxq %rbp,%r11 mulxq 24+128(%r14),%rcx,%rbp movq 8(%rbx),%rdx adcxq %rcx,%r11 adoxq %rbp,%r12 adcxq %r8,%r12 adoxq %r8,%r13 adcq $0,%r13 mulxq 0+128(%rsi),%rcx,%rbp adcxq %rcx,%r9 adoxq %rbp,%r10 mulxq 8+128(%rsi),%rcx,%rbp adcxq %rcx,%r10 adoxq %rbp,%r11 mulxq 16+128(%rsi),%rcx,%rbp adcxq %rcx,%r11 adoxq %rbp,%r12 mulxq 24+128(%rsi),%rcx,%rbp movq %r9,%rdx mulxq %r15,%rdx,%rax adcxq %rcx,%r12 adoxq %rbp,%r13 adcxq %r8,%r13 adoxq %r8,%r8 adcq $0,%r8 mulxq 0+128(%r14),%rcx,%rbp adcxq %rcx,%r9 adoxq %rbp,%r10 mulxq 8+128(%r14),%rcx,%rbp adcxq %rcx,%r10 adoxq %rbp,%r11 mulxq 16+128(%r14),%rcx,%rbp adcxq %rcx,%r11 adoxq %rbp,%r12 mulxq 24+128(%r14),%rcx,%rbp movq 16(%rbx),%rdx adcxq %rcx,%r12 adoxq %rbp,%r13 adcxq %r9,%r13 adoxq %r9,%r8 adcq $0,%r8 mulxq 0+128(%rsi),%rcx,%rbp adcxq %rcx,%r10 adoxq %rbp,%r11 mulxq 8+128(%rsi),%rcx,%rbp adcxq %rcx,%r11 adoxq %rbp,%r12 mulxq 16+128(%rsi),%rcx,%rbp adcxq %rcx,%r12 adoxq %rbp,%r13 mulxq 24+128(%rsi),%rcx,%rbp movq %r10,%rdx mulxq %r15,%rdx,%rax adcxq %rcx,%r13 adoxq %rbp,%r8 adcxq %r9,%r8 adoxq %r9,%r9 adcq $0,%r9 mulxq 0+128(%r14),%rcx,%rbp adcxq %rcx,%r10 adoxq %rbp,%r11 mulxq 8+128(%r14),%rcx,%rbp adcxq %rcx,%r11 adoxq %rbp,%r12 mulxq 16+128(%r14),%rcx,%rbp adcxq %rcx,%r12 adoxq %rbp,%r13 mulxq 24+128(%r14),%rcx,%rbp movq 24(%rbx),%rdx adcxq %rcx,%r13 adoxq %rbp,%r8 adcxq %r10,%r8 adoxq %r10,%r9 adcq $0,%r9 mulxq 0+128(%rsi),%rcx,%rbp adcxq %rcx,%r11 adoxq %rbp,%r12 mulxq 8+128(%rsi),%rcx,%rbp adcxq %rcx,%r12 adoxq %rbp,%r13 mulxq 16+128(%rsi),%rcx,%rbp adcxq %rcx,%r13 adoxq %rbp,%r8 mulxq 24+128(%rsi),%rcx,%rbp movq %r11,%rdx mulxq %r15,%rdx,%rax adcxq %rcx,%r8 adoxq %rbp,%r9 adcxq %r10,%r9 adoxq %r10,%r10 adcq $0,%r10 mulxq 0+128(%r14),%rcx,%rbp adcxq %rcx,%r11 adoxq %rbp,%r12 mulxq 8+128(%r14),%rcx,%rbp adcxq %rcx,%r12 adoxq %rbp,%r13 mulxq 16+128(%r14),%rcx,%rbp adcxq %rcx,%r13 adoxq %rbp,%r8 mulxq 24+128(%r14),%rcx,%rbp leaq 128(%r14),%r14 movq %r12,%rbx adcxq %rcx,%r8 adoxq %rbp,%r9 movq %r13,%rdx adcxq %r11,%r9 adoxq %r11,%r10 adcq $0,%r10 movq %r8,%rcx subq 0(%r14),%r12 sbbq 8(%r14),%r13 sbbq 16(%r14),%r8 movq %r9,%rbp sbbq 24(%r14),%r9 sbbq $0,%r10 cmovcq %rbx,%r12 cmovcq %rdx,%r13 cmovcq %rcx,%r8 cmovcq %rbp,%r9 movq %r12,0(%rdi) movq %r13,8(%rdi) movq %r8,16(%rdi) movq %r9,24(%rdi) movq 0(%rsp),%r15 .cfi_restore %r15 movq 8(%rsp),%r14 .cfi_restore %r14 movq 16(%rsp),%r13 .cfi_restore %r13 movq 24(%rsp),%r12 .cfi_restore %r12 movq 32(%rsp),%rbx .cfi_restore %rbx movq 40(%rsp),%rbp .cfi_restore %rbp leaq 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lord_mulx_epilogue: ret .cfi_endproc .size ecp_nistz256_ord_mul_mont_adx,.-ecp_nistz256_ord_mul_mont_adx .globl ecp_nistz256_ord_sqr_mont_adx .hidden ecp_nistz256_ord_sqr_mont_adx .type ecp_nistz256_ord_sqr_mont_adx,@function .align 32 ecp_nistz256_ord_sqr_mont_adx: .cfi_startproc _CET_ENDBR .Lecp_nistz256_ord_sqr_mont_adx: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset 
%r15,-56 .Lord_sqrx_body: movq %rdx,%rbx movq 0(%rsi),%rdx movq 8(%rsi),%r14 movq 16(%rsi),%r15 movq 24(%rsi),%r8 leaq .Lord(%rip),%rsi jmp .Loop_ord_sqrx .align 32 .Loop_ord_sqrx: mulxq %r14,%r9,%r10 mulxq %r15,%rcx,%r11 movq %rdx,%rax .byte 102,73,15,110,206 mulxq %r8,%rbp,%r12 movq %r14,%rdx addq %rcx,%r10 .byte 102,73,15,110,215 adcq %rbp,%r11 adcq $0,%r12 xorq %r13,%r13 mulxq %r15,%rcx,%rbp adcxq %rcx,%r11 adoxq %rbp,%r12 mulxq %r8,%rcx,%rbp movq %r15,%rdx adcxq %rcx,%r12 adoxq %rbp,%r13 adcq $0,%r13 mulxq %r8,%rcx,%r14 movq %rax,%rdx .byte 102,73,15,110,216 xorq %r15,%r15 adcxq %r9,%r9 adoxq %rcx,%r13 adcxq %r10,%r10 adoxq %r15,%r14 mulxq %rdx,%r8,%rbp .byte 102,72,15,126,202 adcxq %r11,%r11 adoxq %rbp,%r9 adcxq %r12,%r12 mulxq %rdx,%rcx,%rax .byte 102,72,15,126,210 adcxq %r13,%r13 adoxq %rcx,%r10 adcxq %r14,%r14 mulxq %rdx,%rcx,%rbp .byte 0x67 .byte 102,72,15,126,218 adoxq %rax,%r11 adcxq %r15,%r15 adoxq %rcx,%r12 adoxq %rbp,%r13 mulxq %rdx,%rcx,%rax adoxq %rcx,%r14 adoxq %rax,%r15 movq %r8,%rdx mulxq 32(%rsi),%rdx,%rcx xorq %rax,%rax mulxq 0(%rsi),%rcx,%rbp adcxq %rcx,%r8 adoxq %rbp,%r9 mulxq 8(%rsi),%rcx,%rbp adcxq %rcx,%r9 adoxq %rbp,%r10 mulxq 16(%rsi),%rcx,%rbp adcxq %rcx,%r10 adoxq %rbp,%r11 mulxq 24(%rsi),%rcx,%rbp adcxq %rcx,%r11 adoxq %rbp,%r8 adcxq %rax,%r8 movq %r9,%rdx mulxq 32(%rsi),%rdx,%rcx mulxq 0(%rsi),%rcx,%rbp adoxq %rcx,%r9 adcxq %rbp,%r10 mulxq 8(%rsi),%rcx,%rbp adoxq %rcx,%r10 adcxq %rbp,%r11 mulxq 16(%rsi),%rcx,%rbp adoxq %rcx,%r11 adcxq %rbp,%r8 mulxq 24(%rsi),%rcx,%rbp adoxq %rcx,%r8 adcxq %rbp,%r9 adoxq %rax,%r9 movq %r10,%rdx mulxq 32(%rsi),%rdx,%rcx mulxq 0(%rsi),%rcx,%rbp adcxq %rcx,%r10 adoxq %rbp,%r11 mulxq 8(%rsi),%rcx,%rbp adcxq %rcx,%r11 adoxq %rbp,%r8 mulxq 16(%rsi),%rcx,%rbp adcxq %rcx,%r8 adoxq %rbp,%r9 mulxq 24(%rsi),%rcx,%rbp adcxq %rcx,%r9 adoxq %rbp,%r10 adcxq %rax,%r10 movq %r11,%rdx mulxq 32(%rsi),%rdx,%rcx mulxq 0(%rsi),%rcx,%rbp adoxq %rcx,%r11 adcxq %rbp,%r8 mulxq 8(%rsi),%rcx,%rbp adoxq %rcx,%r8 adcxq %rbp,%r9 mulxq 16(%rsi),%rcx,%rbp adoxq %rcx,%r9 adcxq %rbp,%r10 mulxq 24(%rsi),%rcx,%rbp adoxq %rcx,%r10 adcxq %rbp,%r11 adoxq %rax,%r11 addq %r8,%r12 adcq %r13,%r9 movq %r12,%rdx adcq %r14,%r10 adcq %r15,%r11 movq %r9,%r14 adcq $0,%rax subq 0(%rsi),%r12 movq %r10,%r15 sbbq 8(%rsi),%r9 sbbq 16(%rsi),%r10 movq %r11,%r8 sbbq 24(%rsi),%r11 sbbq $0,%rax cmovncq %r12,%rdx cmovncq %r9,%r14 cmovncq %r10,%r15 cmovncq %r11,%r8 decq %rbx jnz .Loop_ord_sqrx movq %rdx,0(%rdi) movq %r14,8(%rdi) pxor %xmm1,%xmm1 movq %r15,16(%rdi) pxor %xmm2,%xmm2 movq %r8,24(%rdi) pxor %xmm3,%xmm3 movq 0(%rsp),%r15 .cfi_restore %r15 movq 8(%rsp),%r14 .cfi_restore %r14 movq 16(%rsp),%r13 .cfi_restore %r13 movq 24(%rsp),%r12 .cfi_restore %r12 movq 32(%rsp),%rbx .cfi_restore %rbx movq 40(%rsp),%rbp .cfi_restore %rbp leaq 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lord_sqrx_epilogue: ret .cfi_endproc .size ecp_nistz256_ord_sqr_mont_adx,.-ecp_nistz256_ord_sqr_mont_adx .globl ecp_nistz256_mul_mont_nohw .hidden ecp_nistz256_mul_mont_nohw .type ecp_nistz256_mul_mont_nohw,@function .align 32 ecp_nistz256_mul_mont_nohw: .cfi_startproc _CET_ENDBR pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 .Lmul_body: movq %rdx,%rbx movq 0(%rdx),%rax movq 0(%rsi),%r9 movq 8(%rsi),%r10 movq 16(%rsi),%r11 
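/* ecp_nistz256_mul_mont_nohw loads a[3] below and calls
   __ecp_nistz256_mul_montq: a 4x4 schoolbook multiplication interleaved
   with Montgomery reduction modulo the field prime at .Lpoly.  Because
   p = 2^256 - 2^224 + 2^192 + 2^96 - 1, each reduction step needs only
   32-bit shifts plus one multiply by .Lpoly+24 rather than a full row of
   multiplications. */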
movq 24(%rsi),%r12 call __ecp_nistz256_mul_montq movq 0(%rsp),%r15 .cfi_restore %r15 movq 8(%rsp),%r14 .cfi_restore %r14 movq 16(%rsp),%r13 .cfi_restore %r13 movq 24(%rsp),%r12 .cfi_restore %r12 movq 32(%rsp),%rbx .cfi_restore %rbx movq 40(%rsp),%rbp .cfi_restore %rbp leaq 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lmul_epilogue: ret .cfi_endproc .size ecp_nistz256_mul_mont_nohw,.-ecp_nistz256_mul_mont_nohw .type __ecp_nistz256_mul_montq,@function .align 32 __ecp_nistz256_mul_montq: .cfi_startproc movq %rax,%rbp mulq %r9 movq .Lpoly+8(%rip),%r14 movq %rax,%r8 movq %rbp,%rax movq %rdx,%r9 mulq %r10 movq .Lpoly+24(%rip),%r15 addq %rax,%r9 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r10 mulq %r11 addq %rax,%r10 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r11 mulq %r12 addq %rax,%r11 movq %r8,%rax adcq $0,%rdx xorq %r13,%r13 movq %rdx,%r12 movq %r8,%rbp shlq $32,%r8 mulq %r15 shrq $32,%rbp addq %r8,%r9 adcq %rbp,%r10 adcq %rax,%r11 movq 8(%rbx),%rax adcq %rdx,%r12 adcq $0,%r13 xorq %r8,%r8 movq %rax,%rbp mulq 0(%rsi) addq %rax,%r9 movq %rbp,%rax adcq $0,%rdx movq %rdx,%rcx mulq 8(%rsi) addq %rcx,%r10 adcq $0,%rdx addq %rax,%r10 movq %rbp,%rax adcq $0,%rdx movq %rdx,%rcx mulq 16(%rsi) addq %rcx,%r11 adcq $0,%rdx addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx movq %rdx,%rcx mulq 24(%rsi) addq %rcx,%r12 adcq $0,%rdx addq %rax,%r12 movq %r9,%rax adcq %rdx,%r13 adcq $0,%r8 movq %r9,%rbp shlq $32,%r9 mulq %r15 shrq $32,%rbp addq %r9,%r10 adcq %rbp,%r11 adcq %rax,%r12 movq 16(%rbx),%rax adcq %rdx,%r13 adcq $0,%r8 xorq %r9,%r9 movq %rax,%rbp mulq 0(%rsi) addq %rax,%r10 movq %rbp,%rax adcq $0,%rdx movq %rdx,%rcx mulq 8(%rsi) addq %rcx,%r11 adcq $0,%rdx addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx movq %rdx,%rcx mulq 16(%rsi) addq %rcx,%r12 adcq $0,%rdx addq %rax,%r12 movq %rbp,%rax adcq $0,%rdx movq %rdx,%rcx mulq 24(%rsi) addq %rcx,%r13 adcq $0,%rdx addq %rax,%r13 movq %r10,%rax adcq %rdx,%r8 adcq $0,%r9 movq %r10,%rbp shlq $32,%r10 mulq %r15 shrq $32,%rbp addq %r10,%r11 adcq %rbp,%r12 adcq %rax,%r13 movq 24(%rbx),%rax adcq %rdx,%r8 adcq $0,%r9 xorq %r10,%r10 movq %rax,%rbp mulq 0(%rsi) addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx movq %rdx,%rcx mulq 8(%rsi) addq %rcx,%r12 adcq $0,%rdx addq %rax,%r12 movq %rbp,%rax adcq $0,%rdx movq %rdx,%rcx mulq 16(%rsi) addq %rcx,%r13 adcq $0,%rdx addq %rax,%r13 movq %rbp,%rax adcq $0,%rdx movq %rdx,%rcx mulq 24(%rsi) addq %rcx,%r8 adcq $0,%rdx addq %rax,%r8 movq %r11,%rax adcq %rdx,%r9 adcq $0,%r10 movq %r11,%rbp shlq $32,%r11 mulq %r15 shrq $32,%rbp addq %r11,%r12 adcq %rbp,%r13 movq %r12,%rcx adcq %rax,%r8 adcq %rdx,%r9 movq %r13,%rbp adcq $0,%r10 subq $-1,%r12 movq %r8,%rbx sbbq %r14,%r13 sbbq $0,%r8 movq %r9,%rdx sbbq %r15,%r9 sbbq $0,%r10 cmovcq %rcx,%r12 cmovcq %rbp,%r13 movq %r12,0(%rdi) cmovcq %rbx,%r8 movq %r13,8(%rdi) cmovcq %rdx,%r9 movq %r8,16(%rdi) movq %r9,24(%rdi) ret .cfi_endproc .size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq .globl ecp_nistz256_sqr_mont_nohw .hidden ecp_nistz256_sqr_mont_nohw .type ecp_nistz256_sqr_mont_nohw,@function .align 32 ecp_nistz256_sqr_mont_nohw: .cfi_startproc _CET_ENDBR pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 .Lsqr_body: movq 0(%rsi),%rax movq 8(%rsi),%r14 movq 16(%rsi),%r15 movq 24(%rsi),%r8 call __ecp_nistz256_sqr_montq movq 
0(%rsp),%r15 .cfi_restore %r15 movq 8(%rsp),%r14 .cfi_restore %r14 movq 16(%rsp),%r13 .cfi_restore %r13 movq 24(%rsp),%r12 .cfi_restore %r12 movq 32(%rsp),%rbx .cfi_restore %rbx movq 40(%rsp),%rbp .cfi_restore %rbp leaq 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lsqr_epilogue: ret .cfi_endproc .size ecp_nistz256_sqr_mont_nohw,.-ecp_nistz256_sqr_mont_nohw .type __ecp_nistz256_sqr_montq,@function .align 32 __ecp_nistz256_sqr_montq: .cfi_startproc movq %rax,%r13 mulq %r14 movq %rax,%r9 movq %r15,%rax movq %rdx,%r10 mulq %r13 addq %rax,%r10 movq %r8,%rax adcq $0,%rdx movq %rdx,%r11 mulq %r13 addq %rax,%r11 movq %r15,%rax adcq $0,%rdx movq %rdx,%r12 mulq %r14 addq %rax,%r11 movq %r8,%rax adcq $0,%rdx movq %rdx,%rbp mulq %r14 addq %rax,%r12 movq %r8,%rax adcq $0,%rdx addq %rbp,%r12 movq %rdx,%r13 adcq $0,%r13 mulq %r15 xorq %r15,%r15 addq %rax,%r13 movq 0(%rsi),%rax movq %rdx,%r14 adcq $0,%r14 addq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 adcq %r12,%r12 adcq %r13,%r13 adcq %r14,%r14 adcq $0,%r15 mulq %rax movq %rax,%r8 movq 8(%rsi),%rax movq %rdx,%rcx mulq %rax addq %rcx,%r9 adcq %rax,%r10 movq 16(%rsi),%rax adcq $0,%rdx movq %rdx,%rcx mulq %rax addq %rcx,%r11 adcq %rax,%r12 movq 24(%rsi),%rax adcq $0,%rdx movq %rdx,%rcx mulq %rax addq %rcx,%r13 adcq %rax,%r14 movq %r8,%rax adcq %rdx,%r15 movq .Lpoly+8(%rip),%rsi movq .Lpoly+24(%rip),%rbp movq %r8,%rcx shlq $32,%r8 mulq %rbp shrq $32,%rcx addq %r8,%r9 adcq %rcx,%r10 adcq %rax,%r11 movq %r9,%rax adcq $0,%rdx movq %r9,%rcx shlq $32,%r9 movq %rdx,%r8 mulq %rbp shrq $32,%rcx addq %r9,%r10 adcq %rcx,%r11 adcq %rax,%r8 movq %r10,%rax adcq $0,%rdx movq %r10,%rcx shlq $32,%r10 movq %rdx,%r9 mulq %rbp shrq $32,%rcx addq %r10,%r11 adcq %rcx,%r8 adcq %rax,%r9 movq %r11,%rax adcq $0,%rdx movq %r11,%rcx shlq $32,%r11 movq %rdx,%r10 mulq %rbp shrq $32,%rcx addq %r11,%r8 adcq %rcx,%r9 adcq %rax,%r10 adcq $0,%rdx xorq %r11,%r11 addq %r8,%r12 adcq %r9,%r13 movq %r12,%r8 adcq %r10,%r14 adcq %rdx,%r15 movq %r13,%r9 adcq $0,%r11 subq $-1,%r12 movq %r14,%r10 sbbq %rsi,%r13 sbbq $0,%r14 movq %r15,%rcx sbbq %rbp,%r15 sbbq $0,%r11 cmovcq %r8,%r12 cmovcq %r9,%r13 movq %r12,0(%rdi) cmovcq %r10,%r14 movq %r13,8(%rdi) cmovcq %rcx,%r15 movq %r14,16(%rdi) movq %r15,24(%rdi) ret .cfi_endproc .size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq .globl ecp_nistz256_mul_mont_adx .hidden ecp_nistz256_mul_mont_adx .type ecp_nistz256_mul_mont_adx,@function .align 32 ecp_nistz256_mul_mont_adx: .cfi_startproc _CET_ENDBR pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 .Lmulx_body: movq %rdx,%rbx movq 0(%rdx),%rdx movq 0(%rsi),%r9 movq 8(%rsi),%r10 movq 16(%rsi),%r11 movq 24(%rsi),%r12 leaq -128(%rsi),%rsi call __ecp_nistz256_mul_montx movq 0(%rsp),%r15 .cfi_restore %r15 movq 8(%rsp),%r14 .cfi_restore %r14 movq 16(%rsp),%r13 .cfi_restore %r13 movq 24(%rsp),%r12 .cfi_restore %r12 movq 32(%rsp),%rbx .cfi_restore %rbx movq 40(%rsp),%rbp .cfi_restore %rbp leaq 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lmulx_epilogue: ret .cfi_endproc .size ecp_nistz256_mul_mont_adx,.-ecp_nistz256_mul_mont_adx .type __ecp_nistz256_mul_montx,@function .align 32 __ecp_nistz256_mul_montx: .cfi_startproc mulxq %r9,%r8,%r9 mulxq %r10,%rcx,%r10 movq $32,%r14 xorq %r13,%r13 mulxq %r11,%rbp,%r11 movq .Lpoly+24(%rip),%r15 adcq 
%rcx,%r9 mulxq %r12,%rcx,%r12 movq %r8,%rdx adcq %rbp,%r10 shlxq %r14,%r8,%rbp adcq %rcx,%r11 shrxq %r14,%r8,%rcx adcq $0,%r12 addq %rbp,%r9 adcq %rcx,%r10 mulxq %r15,%rcx,%rbp movq 8(%rbx),%rdx adcq %rcx,%r11 adcq %rbp,%r12 adcq $0,%r13 xorq %r8,%r8 mulxq 0+128(%rsi),%rcx,%rbp adcxq %rcx,%r9 adoxq %rbp,%r10 mulxq 8+128(%rsi),%rcx,%rbp adcxq %rcx,%r10 adoxq %rbp,%r11 mulxq 16+128(%rsi),%rcx,%rbp adcxq %rcx,%r11 adoxq %rbp,%r12 mulxq 24+128(%rsi),%rcx,%rbp movq %r9,%rdx adcxq %rcx,%r12 shlxq %r14,%r9,%rcx adoxq %rbp,%r13 shrxq %r14,%r9,%rbp adcxq %r8,%r13 adoxq %r8,%r8 adcq $0,%r8 addq %rcx,%r10 adcq %rbp,%r11 mulxq %r15,%rcx,%rbp movq 16(%rbx),%rdx adcq %rcx,%r12 adcq %rbp,%r13 adcq $0,%r8 xorq %r9,%r9 mulxq 0+128(%rsi),%rcx,%rbp adcxq %rcx,%r10 adoxq %rbp,%r11 mulxq 8+128(%rsi),%rcx,%rbp adcxq %rcx,%r11 adoxq %rbp,%r12 mulxq 16+128(%rsi),%rcx,%rbp adcxq %rcx,%r12 adoxq %rbp,%r13 mulxq 24+128(%rsi),%rcx,%rbp movq %r10,%rdx adcxq %rcx,%r13 shlxq %r14,%r10,%rcx adoxq %rbp,%r8 shrxq %r14,%r10,%rbp adcxq %r9,%r8 adoxq %r9,%r9 adcq $0,%r9 addq %rcx,%r11 adcq %rbp,%r12 mulxq %r15,%rcx,%rbp movq 24(%rbx),%rdx adcq %rcx,%r13 adcq %rbp,%r8 adcq $0,%r9 xorq %r10,%r10 mulxq 0+128(%rsi),%rcx,%rbp adcxq %rcx,%r11 adoxq %rbp,%r12 mulxq 8+128(%rsi),%rcx,%rbp adcxq %rcx,%r12 adoxq %rbp,%r13 mulxq 16+128(%rsi),%rcx,%rbp adcxq %rcx,%r13 adoxq %rbp,%r8 mulxq 24+128(%rsi),%rcx,%rbp movq %r11,%rdx adcxq %rcx,%r8 shlxq %r14,%r11,%rcx adoxq %rbp,%r9 shrxq %r14,%r11,%rbp adcxq %r10,%r9 adoxq %r10,%r10 adcq $0,%r10 addq %rcx,%r12 adcq %rbp,%r13 mulxq %r15,%rcx,%rbp movq %r12,%rbx movq .Lpoly+8(%rip),%r14 adcq %rcx,%r8 movq %r13,%rdx adcq %rbp,%r9 adcq $0,%r10 xorl %eax,%eax movq %r8,%rcx sbbq $-1,%r12 sbbq %r14,%r13 sbbq $0,%r8 movq %r9,%rbp sbbq %r15,%r9 sbbq $0,%r10 cmovcq %rbx,%r12 cmovcq %rdx,%r13 movq %r12,0(%rdi) cmovcq %rcx,%r8 movq %r13,8(%rdi) cmovcq %rbp,%r9 movq %r8,16(%rdi) movq %r9,24(%rdi) ret .cfi_endproc .size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx .globl ecp_nistz256_sqr_mont_adx .hidden ecp_nistz256_sqr_mont_adx .type ecp_nistz256_sqr_mont_adx,@function .align 32 ecp_nistz256_sqr_mont_adx: .cfi_startproc _CET_ENDBR pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 .Lsqrx_body: movq 0(%rsi),%rdx movq 8(%rsi),%r14 movq 16(%rsi),%r15 movq 24(%rsi),%r8 leaq -128(%rsi),%rsi call __ecp_nistz256_sqr_montx movq 0(%rsp),%r15 .cfi_restore %r15 movq 8(%rsp),%r14 .cfi_restore %r14 movq 16(%rsp),%r13 .cfi_restore %r13 movq 24(%rsp),%r12 .cfi_restore %r12 movq 32(%rsp),%rbx .cfi_restore %rbx movq 40(%rsp),%rbp .cfi_restore %rbp leaq 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lsqrx_epilogue: ret .cfi_endproc .size ecp_nistz256_sqr_mont_adx,.-ecp_nistz256_sqr_mont_adx .type __ecp_nistz256_sqr_montx,@function .align 32 __ecp_nistz256_sqr_montx: .cfi_startproc mulxq %r14,%r9,%r10 mulxq %r15,%rcx,%r11 xorl %eax,%eax adcq %rcx,%r10 mulxq %r8,%rbp,%r12 movq %r14,%rdx adcq %rbp,%r11 adcq $0,%r12 xorq %r13,%r13 mulxq %r15,%rcx,%rbp adcxq %rcx,%r11 adoxq %rbp,%r12 mulxq %r8,%rcx,%rbp movq %r15,%rdx adcxq %rcx,%r12 adoxq %rbp,%r13 adcq $0,%r13 mulxq %r8,%rcx,%r14 movq 0+128(%rsi),%rdx xorq %r15,%r15 adcxq %r9,%r9 adoxq %rcx,%r13 adcxq %r10,%r10 adoxq %r15,%r14 mulxq %rdx,%r8,%rbp movq 8+128(%rsi),%rdx adcxq %r11,%r11 adoxq 
%rbp,%r9 adcxq %r12,%r12 mulxq %rdx,%rcx,%rax movq 16+128(%rsi),%rdx adcxq %r13,%r13 adoxq %rcx,%r10 adcxq %r14,%r14 .byte 0x67 mulxq %rdx,%rcx,%rbp movq 24+128(%rsi),%rdx adoxq %rax,%r11 adcxq %r15,%r15 adoxq %rcx,%r12 movq $32,%rsi adoxq %rbp,%r13 .byte 0x67,0x67 mulxq %rdx,%rcx,%rax movq .Lpoly+24(%rip),%rdx adoxq %rcx,%r14 shlxq %rsi,%r8,%rcx adoxq %rax,%r15 shrxq %rsi,%r8,%rax movq %rdx,%rbp addq %rcx,%r9 adcq %rax,%r10 mulxq %r8,%rcx,%r8 adcq %rcx,%r11 shlxq %rsi,%r9,%rcx adcq $0,%r8 shrxq %rsi,%r9,%rax addq %rcx,%r10 adcq %rax,%r11 mulxq %r9,%rcx,%r9 adcq %rcx,%r8 shlxq %rsi,%r10,%rcx adcq $0,%r9 shrxq %rsi,%r10,%rax addq %rcx,%r11 adcq %rax,%r8 mulxq %r10,%rcx,%r10 adcq %rcx,%r9 shlxq %rsi,%r11,%rcx adcq $0,%r10 shrxq %rsi,%r11,%rax addq %rcx,%r8 adcq %rax,%r9 mulxq %r11,%rcx,%r11 adcq %rcx,%r10 adcq $0,%r11 xorq %rdx,%rdx addq %r8,%r12 movq .Lpoly+8(%rip),%rsi adcq %r9,%r13 movq %r12,%r8 adcq %r10,%r14 adcq %r11,%r15 movq %r13,%r9 adcq $0,%rdx subq $-1,%r12 movq %r14,%r10 sbbq %rsi,%r13 sbbq $0,%r14 movq %r15,%r11 sbbq %rbp,%r15 sbbq $0,%rdx cmovcq %r8,%r12 cmovcq %r9,%r13 movq %r12,0(%rdi) cmovcq %r10,%r14 movq %r13,8(%rdi) cmovcq %r11,%r15 movq %r14,16(%rdi) movq %r15,24(%rdi) ret .cfi_endproc .size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx .globl ecp_nistz256_select_w5_nohw .hidden ecp_nistz256_select_w5_nohw .type ecp_nistz256_select_w5_nohw,@function .align 32 ecp_nistz256_select_w5_nohw: .cfi_startproc _CET_ENDBR movdqa .LOne(%rip),%xmm0 movd %edx,%xmm1 pxor %xmm2,%xmm2 pxor %xmm3,%xmm3 pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 pxor %xmm6,%xmm6 pxor %xmm7,%xmm7 movdqa %xmm0,%xmm8 pshufd $0,%xmm1,%xmm1 movq $16,%rax .Lselect_loop_sse_w5: movdqa %xmm8,%xmm15 paddd %xmm0,%xmm8 pcmpeqd %xmm1,%xmm15 movdqa 0(%rsi),%xmm9 movdqa 16(%rsi),%xmm10 movdqa 32(%rsi),%xmm11 movdqa 48(%rsi),%xmm12 movdqa 64(%rsi),%xmm13 movdqa 80(%rsi),%xmm14 leaq 96(%rsi),%rsi pand %xmm15,%xmm9 pand %xmm15,%xmm10 por %xmm9,%xmm2 pand %xmm15,%xmm11 por %xmm10,%xmm3 pand %xmm15,%xmm12 por %xmm11,%xmm4 pand %xmm15,%xmm13 por %xmm12,%xmm5 pand %xmm15,%xmm14 por %xmm13,%xmm6 por %xmm14,%xmm7 decq %rax jnz .Lselect_loop_sse_w5 movdqu %xmm2,0(%rdi) movdqu %xmm3,16(%rdi) movdqu %xmm4,32(%rdi) movdqu %xmm5,48(%rdi) movdqu %xmm6,64(%rdi) movdqu %xmm7,80(%rdi) ret .cfi_endproc .LSEH_end_ecp_nistz256_select_w5_nohw: .size ecp_nistz256_select_w5_nohw,.-ecp_nistz256_select_w5_nohw .globl ecp_nistz256_select_w7_nohw .hidden ecp_nistz256_select_w7_nohw .type ecp_nistz256_select_w7_nohw,@function .align 32 ecp_nistz256_select_w7_nohw: .cfi_startproc _CET_ENDBR movdqa .LOne(%rip),%xmm8 movd %edx,%xmm1 pxor %xmm2,%xmm2 pxor %xmm3,%xmm3 pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 movdqa %xmm8,%xmm0 pshufd $0,%xmm1,%xmm1 movq $64,%rax .Lselect_loop_sse_w7: movdqa %xmm8,%xmm15 paddd %xmm0,%xmm8 movdqa 0(%rsi),%xmm9 movdqa 16(%rsi),%xmm10 pcmpeqd %xmm1,%xmm15 movdqa 32(%rsi),%xmm11 movdqa 48(%rsi),%xmm12 leaq 64(%rsi),%rsi pand %xmm15,%xmm9 pand %xmm15,%xmm10 por %xmm9,%xmm2 pand %xmm15,%xmm11 por %xmm10,%xmm3 pand %xmm15,%xmm12 por %xmm11,%xmm4 prefetcht0 255(%rsi) por %xmm12,%xmm5 decq %rax jnz .Lselect_loop_sse_w7 movdqu %xmm2,0(%rdi) movdqu %xmm3,16(%rdi) movdqu %xmm4,32(%rdi) movdqu %xmm5,48(%rdi) ret .cfi_endproc .LSEH_end_ecp_nistz256_select_w7_nohw: .size ecp_nistz256_select_w7_nohw,.-ecp_nistz256_select_w7_nohw .globl ecp_nistz256_select_w5_avx2 .hidden ecp_nistz256_select_w5_avx2 .type ecp_nistz256_select_w5_avx2,@function .align 32 ecp_nistz256_select_w5_avx2: .cfi_startproc _CET_ENDBR vzeroupper vmovdqa .LTwo(%rip),%ymm0 
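/* The ecp_nistz256_select_w5/w7 routines (SSE2 above, AVX2 here and
   below) are constant-time table lookups: every entry of the precomputed
   table is loaded, compared (pcmpeqd/vpcmpeqd) against a broadcast copy
   of the requested index, masked and accumulated, so neither the memory
   access pattern nor the running time depends on which entry is
   selected. */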
vpxor %ymm2,%ymm2,%ymm2 vpxor %ymm3,%ymm3,%ymm3 vpxor %ymm4,%ymm4,%ymm4 vmovdqa .LOne(%rip),%ymm5 vmovdqa .LTwo(%rip),%ymm10 vmovd %edx,%xmm1 vpermd %ymm1,%ymm2,%ymm1 movq $8,%rax .Lselect_loop_avx2_w5: vmovdqa 0(%rsi),%ymm6 vmovdqa 32(%rsi),%ymm7 vmovdqa 64(%rsi),%ymm8 vmovdqa 96(%rsi),%ymm11 vmovdqa 128(%rsi),%ymm12 vmovdqa 160(%rsi),%ymm13 vpcmpeqd %ymm1,%ymm5,%ymm9 vpcmpeqd %ymm1,%ymm10,%ymm14 vpaddd %ymm0,%ymm5,%ymm5 vpaddd %ymm0,%ymm10,%ymm10 leaq 192(%rsi),%rsi vpand %ymm9,%ymm6,%ymm6 vpand %ymm9,%ymm7,%ymm7 vpand %ymm9,%ymm8,%ymm8 vpand %ymm14,%ymm11,%ymm11 vpand %ymm14,%ymm12,%ymm12 vpand %ymm14,%ymm13,%ymm13 vpxor %ymm6,%ymm2,%ymm2 vpxor %ymm7,%ymm3,%ymm3 vpxor %ymm8,%ymm4,%ymm4 vpxor %ymm11,%ymm2,%ymm2 vpxor %ymm12,%ymm3,%ymm3 vpxor %ymm13,%ymm4,%ymm4 decq %rax jnz .Lselect_loop_avx2_w5 vmovdqu %ymm2,0(%rdi) vmovdqu %ymm3,32(%rdi) vmovdqu %ymm4,64(%rdi) vzeroupper ret .cfi_endproc .LSEH_end_ecp_nistz256_select_w5_avx2: .size ecp_nistz256_select_w5_avx2,.-ecp_nistz256_select_w5_avx2 .globl ecp_nistz256_select_w7_avx2 .hidden ecp_nistz256_select_w7_avx2 .type ecp_nistz256_select_w7_avx2,@function .align 32 ecp_nistz256_select_w7_avx2: .cfi_startproc _CET_ENDBR vzeroupper vmovdqa .LThree(%rip),%ymm0 vpxor %ymm2,%ymm2,%ymm2 vpxor %ymm3,%ymm3,%ymm3 vmovdqa .LOne(%rip),%ymm4 vmovdqa .LTwo(%rip),%ymm8 vmovdqa .LThree(%rip),%ymm12 vmovd %edx,%xmm1 vpermd %ymm1,%ymm2,%ymm1 movq $21,%rax .Lselect_loop_avx2_w7: vmovdqa 0(%rsi),%ymm5 vmovdqa 32(%rsi),%ymm6 vmovdqa 64(%rsi),%ymm9 vmovdqa 96(%rsi),%ymm10 vmovdqa 128(%rsi),%ymm13 vmovdqa 160(%rsi),%ymm14 vpcmpeqd %ymm1,%ymm4,%ymm7 vpcmpeqd %ymm1,%ymm8,%ymm11 vpcmpeqd %ymm1,%ymm12,%ymm15 vpaddd %ymm0,%ymm4,%ymm4 vpaddd %ymm0,%ymm8,%ymm8 vpaddd %ymm0,%ymm12,%ymm12 leaq 192(%rsi),%rsi vpand %ymm7,%ymm5,%ymm5 vpand %ymm7,%ymm6,%ymm6 vpand %ymm11,%ymm9,%ymm9 vpand %ymm11,%ymm10,%ymm10 vpand %ymm15,%ymm13,%ymm13 vpand %ymm15,%ymm14,%ymm14 vpxor %ymm5,%ymm2,%ymm2 vpxor %ymm6,%ymm3,%ymm3 vpxor %ymm9,%ymm2,%ymm2 vpxor %ymm10,%ymm3,%ymm3 vpxor %ymm13,%ymm2,%ymm2 vpxor %ymm14,%ymm3,%ymm3 decq %rax jnz .Lselect_loop_avx2_w7 vmovdqa 0(%rsi),%ymm5 vmovdqa 32(%rsi),%ymm6 vpcmpeqd %ymm1,%ymm4,%ymm7 vpand %ymm7,%ymm5,%ymm5 vpand %ymm7,%ymm6,%ymm6 vpxor %ymm5,%ymm2,%ymm2 vpxor %ymm6,%ymm3,%ymm3 vmovdqu %ymm2,0(%rdi) vmovdqu %ymm3,32(%rdi) vzeroupper ret .cfi_endproc .LSEH_end_ecp_nistz256_select_w7_avx2: .size ecp_nistz256_select_w7_avx2,.-ecp_nistz256_select_w7_avx2 .type __ecp_nistz256_add_toq,@function .align 32 __ecp_nistz256_add_toq: .cfi_startproc xorq %r11,%r11 addq 0(%rbx),%r12 adcq 8(%rbx),%r13 movq %r12,%rax adcq 16(%rbx),%r8 adcq 24(%rbx),%r9 movq %r13,%rbp adcq $0,%r11 subq $-1,%r12 movq %r8,%rcx sbbq %r14,%r13 sbbq $0,%r8 movq %r9,%r10 sbbq %r15,%r9 sbbq $0,%r11 cmovcq %rax,%r12 cmovcq %rbp,%r13 movq %r12,0(%rdi) cmovcq %rcx,%r8 movq %r13,8(%rdi) cmovcq %r10,%r9 movq %r8,16(%rdi) movq %r9,24(%rdi) ret .cfi_endproc .size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq .type __ecp_nistz256_sub_fromq,@function .align 32 __ecp_nistz256_sub_fromq: .cfi_startproc subq 0(%rbx),%r12 sbbq 8(%rbx),%r13 movq %r12,%rax sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 movq %r13,%rbp sbbq %r11,%r11 addq $-1,%r12 movq %r8,%rcx adcq %r14,%r13 adcq $0,%r8 movq %r9,%r10 adcq %r15,%r9 testq %r11,%r11 cmovzq %rax,%r12 cmovzq %rbp,%r13 movq %r12,0(%rdi) cmovzq %rcx,%r8 movq %r13,8(%rdi) cmovzq %r10,%r9 movq %r8,16(%rdi) movq %r9,24(%rdi) ret .cfi_endproc .size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq .type __ecp_nistz256_subq,@function .align 32 __ecp_nistz256_subq: 
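/* Like __ecp_nistz256_add_toq and __ecp_nistz256_sub_fromq above, this
   helper works modulo .Lpoly: it subtracts and then conditionally adds
   the prime back with cmov instead of a branch when the subtraction
   borrowed, keeping the result in [0, p) without data-dependent control
   flow. */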
.cfi_startproc subq %r12,%rax sbbq %r13,%rbp movq %rax,%r12 sbbq %r8,%rcx sbbq %r9,%r10 movq %rbp,%r13 sbbq %r11,%r11 addq $-1,%rax movq %rcx,%r8 adcq %r14,%rbp adcq $0,%rcx movq %r10,%r9 adcq %r15,%r10 testq %r11,%r11 cmovnzq %rax,%r12 cmovnzq %rbp,%r13 cmovnzq %rcx,%r8 cmovnzq %r10,%r9 ret .cfi_endproc .size __ecp_nistz256_subq,.-__ecp_nistz256_subq .type __ecp_nistz256_mul_by_2q,@function .align 32 __ecp_nistz256_mul_by_2q: .cfi_startproc xorq %r11,%r11 addq %r12,%r12 adcq %r13,%r13 movq %r12,%rax adcq %r8,%r8 adcq %r9,%r9 movq %r13,%rbp adcq $0,%r11 subq $-1,%r12 movq %r8,%rcx sbbq %r14,%r13 sbbq $0,%r8 movq %r9,%r10 sbbq %r15,%r9 sbbq $0,%r11 cmovcq %rax,%r12 cmovcq %rbp,%r13 movq %r12,0(%rdi) cmovcq %rcx,%r8 movq %r13,8(%rdi) cmovcq %r10,%r9 movq %r8,16(%rdi) movq %r9,24(%rdi) ret .cfi_endproc .size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q .globl ecp_nistz256_point_double_nohw .hidden ecp_nistz256_point_double_nohw .type ecp_nistz256_point_double_nohw,@function .align 32 ecp_nistz256_point_double_nohw: .cfi_startproc _CET_ENDBR pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $160+8,%rsp .cfi_adjust_cfa_offset 32*5+8 .Lpoint_doubleq_body: .Lpoint_double_shortcutq: movdqu 0(%rsi),%xmm0 movq %rsi,%rbx movdqu 16(%rsi),%xmm1 movq 32+0(%rsi),%r12 movq 32+8(%rsi),%r13 movq 32+16(%rsi),%r8 movq 32+24(%rsi),%r9 movq .Lpoly+8(%rip),%r14 movq .Lpoly+24(%rip),%r15 movdqa %xmm0,96(%rsp) movdqa %xmm1,96+16(%rsp) leaq 32(%rdi),%r10 leaq 64(%rdi),%r11 .byte 102,72,15,110,199 .byte 102,73,15,110,202 .byte 102,73,15,110,211 leaq 0(%rsp),%rdi call __ecp_nistz256_mul_by_2q movq 64+0(%rsi),%rax movq 64+8(%rsi),%r14 movq 64+16(%rsi),%r15 movq 64+24(%rsi),%r8 leaq 64-0(%rsi),%rsi leaq 64(%rsp),%rdi call __ecp_nistz256_sqr_montq movq 0+0(%rsp),%rax movq 8+0(%rsp),%r14 leaq 0+0(%rsp),%rsi movq 16+0(%rsp),%r15 movq 24+0(%rsp),%r8 leaq 0(%rsp),%rdi call __ecp_nistz256_sqr_montq movq 32(%rbx),%rax movq 64+0(%rbx),%r9 movq 64+8(%rbx),%r10 movq 64+16(%rbx),%r11 movq 64+24(%rbx),%r12 leaq 64-0(%rbx),%rsi leaq 32(%rbx),%rbx .byte 102,72,15,126,215 call __ecp_nistz256_mul_montq call __ecp_nistz256_mul_by_2q movq 96+0(%rsp),%r12 movq 96+8(%rsp),%r13 leaq 64(%rsp),%rbx movq 96+16(%rsp),%r8 movq 96+24(%rsp),%r9 leaq 32(%rsp),%rdi call __ecp_nistz256_add_toq movq 96+0(%rsp),%r12 movq 96+8(%rsp),%r13 leaq 64(%rsp),%rbx movq 96+16(%rsp),%r8 movq 96+24(%rsp),%r9 leaq 64(%rsp),%rdi call __ecp_nistz256_sub_fromq movq 0+0(%rsp),%rax movq 8+0(%rsp),%r14 leaq 0+0(%rsp),%rsi movq 16+0(%rsp),%r15 movq 24+0(%rsp),%r8 .byte 102,72,15,126,207 call __ecp_nistz256_sqr_montq xorq %r9,%r9 movq %r12,%rax addq $-1,%r12 movq %r13,%r10 adcq %rsi,%r13 movq %r14,%rcx adcq $0,%r14 movq %r15,%r8 adcq %rbp,%r15 adcq $0,%r9 xorq %rsi,%rsi testq $1,%rax cmovzq %rax,%r12 cmovzq %r10,%r13 cmovzq %rcx,%r14 cmovzq %r8,%r15 cmovzq %rsi,%r9 movq %r13,%rax shrq $1,%r12 shlq $63,%rax movq %r14,%r10 shrq $1,%r13 orq %rax,%r12 shlq $63,%r10 movq %r15,%rcx shrq $1,%r14 orq %r10,%r13 shlq $63,%rcx movq %r12,0(%rdi) shrq $1,%r15 movq %r13,8(%rdi) shlq $63,%r9 orq %rcx,%r14 orq %r9,%r15 movq %r14,16(%rdi) movq %r15,24(%rdi) movq 64(%rsp),%rax leaq 64(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq 0+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 
24+32(%rsp),%r12 leaq 32(%rsp),%rdi call __ecp_nistz256_mul_montq leaq 128(%rsp),%rdi call __ecp_nistz256_mul_by_2q leaq 32(%rsp),%rbx leaq 32(%rsp),%rdi call __ecp_nistz256_add_toq movq 96(%rsp),%rax leaq 96(%rsp),%rbx movq 0+0(%rsp),%r9 movq 8+0(%rsp),%r10 leaq 0+0(%rsp),%rsi movq 16+0(%rsp),%r11 movq 24+0(%rsp),%r12 leaq 0(%rsp),%rdi call __ecp_nistz256_mul_montq leaq 128(%rsp),%rdi call __ecp_nistz256_mul_by_2q movq 0+32(%rsp),%rax movq 8+32(%rsp),%r14 leaq 0+32(%rsp),%rsi movq 16+32(%rsp),%r15 movq 24+32(%rsp),%r8 .byte 102,72,15,126,199 call __ecp_nistz256_sqr_montq leaq 128(%rsp),%rbx movq %r14,%r8 movq %r15,%r9 movq %rsi,%r14 movq %rbp,%r15 call __ecp_nistz256_sub_fromq movq 0+0(%rsp),%rax movq 0+8(%rsp),%rbp movq 0+16(%rsp),%rcx movq 0+24(%rsp),%r10 leaq 0(%rsp),%rdi call __ecp_nistz256_subq movq 32(%rsp),%rax leaq 32(%rsp),%rbx movq %r12,%r14 xorl %ecx,%ecx movq %r12,0+0(%rsp) movq %r13,%r10 movq %r13,0+8(%rsp) cmovzq %r8,%r11 movq %r8,0+16(%rsp) leaq 0-0(%rsp),%rsi cmovzq %r9,%r12 movq %r9,0+24(%rsp) movq %r14,%r9 leaq 0(%rsp),%rdi call __ecp_nistz256_mul_montq .byte 102,72,15,126,203 .byte 102,72,15,126,207 call __ecp_nistz256_sub_fromq leaq 160+56(%rsp),%rsi .cfi_def_cfa %rsi,8 movq -48(%rsi),%r15 .cfi_restore %r15 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbx .cfi_restore %rbx movq -8(%rsi),%rbp .cfi_restore %rbp leaq (%rsi),%rsp .cfi_def_cfa_register %rsp .Lpoint_doubleq_epilogue: ret .cfi_endproc .size ecp_nistz256_point_double_nohw,.-ecp_nistz256_point_double_nohw .globl ecp_nistz256_point_add_nohw .hidden ecp_nistz256_point_add_nohw .type ecp_nistz256_point_add_nohw,@function .align 32 ecp_nistz256_point_add_nohw: .cfi_startproc _CET_ENDBR pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $576+8,%rsp .cfi_adjust_cfa_offset 32*18+8 .Lpoint_addq_body: movdqu 0(%rsi),%xmm0 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 movdqu 48(%rsi),%xmm3 movdqu 64(%rsi),%xmm4 movdqu 80(%rsi),%xmm5 movq %rsi,%rbx movq %rdx,%rsi movdqa %xmm0,384(%rsp) movdqa %xmm1,384+16(%rsp) movdqa %xmm2,416(%rsp) movdqa %xmm3,416+16(%rsp) movdqa %xmm4,448(%rsp) movdqa %xmm5,448+16(%rsp) por %xmm4,%xmm5 movdqu 0(%rsi),%xmm0 pshufd $0xb1,%xmm5,%xmm3 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 por %xmm3,%xmm5 movdqu 48(%rsi),%xmm3 movq 64+0(%rsi),%rax movq 64+8(%rsi),%r14 movq 64+16(%rsi),%r15 movq 64+24(%rsi),%r8 movdqa %xmm0,480(%rsp) pshufd $0x1e,%xmm5,%xmm4 movdqa %xmm1,480+16(%rsp) movdqu 64(%rsi),%xmm0 movdqu 80(%rsi),%xmm1 movdqa %xmm2,512(%rsp) movdqa %xmm3,512+16(%rsp) por %xmm4,%xmm5 pxor %xmm4,%xmm4 por %xmm0,%xmm1 .byte 102,72,15,110,199 leaq 64-0(%rsi),%rsi movq %rax,544+0(%rsp) movq %r14,544+8(%rsp) movq %r15,544+16(%rsp) movq %r8,544+24(%rsp) leaq 96(%rsp),%rdi call __ecp_nistz256_sqr_montq pcmpeqd %xmm4,%xmm5 pshufd $0xb1,%xmm1,%xmm4 por %xmm1,%xmm4 pshufd $0,%xmm5,%xmm5 pshufd $0x1e,%xmm4,%xmm3 por %xmm3,%xmm4 pxor %xmm3,%xmm3 pcmpeqd %xmm3,%xmm4 pshufd $0,%xmm4,%xmm4 movq 64+0(%rbx),%rax movq 64+8(%rbx),%r14 movq 64+16(%rbx),%r15 movq 64+24(%rbx),%r8 .byte 102,72,15,110,203 leaq 64-0(%rbx),%rsi leaq 32(%rsp),%rdi call __ecp_nistz256_sqr_montq movq 544(%rsp),%rax leaq 544(%rsp),%rbx movq 0+96(%rsp),%r9 movq 
8+96(%rsp),%r10 leaq 0+96(%rsp),%rsi movq 16+96(%rsp),%r11 movq 24+96(%rsp),%r12 leaq 224(%rsp),%rdi call __ecp_nistz256_mul_montq movq 448(%rsp),%rax leaq 448(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq 0+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 256(%rsp),%rdi call __ecp_nistz256_mul_montq movq 416(%rsp),%rax leaq 416(%rsp),%rbx movq 0+224(%rsp),%r9 movq 8+224(%rsp),%r10 leaq 0+224(%rsp),%rsi movq 16+224(%rsp),%r11 movq 24+224(%rsp),%r12 leaq 224(%rsp),%rdi call __ecp_nistz256_mul_montq movq 512(%rsp),%rax leaq 512(%rsp),%rbx movq 0+256(%rsp),%r9 movq 8+256(%rsp),%r10 leaq 0+256(%rsp),%rsi movq 16+256(%rsp),%r11 movq 24+256(%rsp),%r12 leaq 256(%rsp),%rdi call __ecp_nistz256_mul_montq leaq 224(%rsp),%rbx leaq 64(%rsp),%rdi call __ecp_nistz256_sub_fromq orq %r13,%r12 movdqa %xmm4,%xmm2 orq %r8,%r12 orq %r9,%r12 por %xmm5,%xmm2 .byte 102,73,15,110,220 movq 384(%rsp),%rax leaq 384(%rsp),%rbx movq 0+96(%rsp),%r9 movq 8+96(%rsp),%r10 leaq 0+96(%rsp),%rsi movq 16+96(%rsp),%r11 movq 24+96(%rsp),%r12 leaq 160(%rsp),%rdi call __ecp_nistz256_mul_montq movq 480(%rsp),%rax leaq 480(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq 0+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 192(%rsp),%rdi call __ecp_nistz256_mul_montq leaq 160(%rsp),%rbx leaq 0(%rsp),%rdi call __ecp_nistz256_sub_fromq orq %r13,%r12 orq %r8,%r12 orq %r9,%r12 .byte 102,73,15,126,208 .byte 102,73,15,126,217 orq %r8,%r12 .byte 0x3e jnz .Ladd_proceedq testq %r9,%r9 jz .Ladd_doubleq .byte 102,72,15,126,199 pxor %xmm0,%xmm0 movdqu %xmm0,0(%rdi) movdqu %xmm0,16(%rdi) movdqu %xmm0,32(%rdi) movdqu %xmm0,48(%rdi) movdqu %xmm0,64(%rdi) movdqu %xmm0,80(%rdi) jmp .Ladd_doneq .align 32 .Ladd_doubleq: .byte 102,72,15,126,206 .byte 102,72,15,126,199 addq $416,%rsp .cfi_adjust_cfa_offset -416 jmp .Lpoint_double_shortcutq .cfi_adjust_cfa_offset 416 .align 32 .Ladd_proceedq: movq 0+64(%rsp),%rax movq 8+64(%rsp),%r14 leaq 0+64(%rsp),%rsi movq 16+64(%rsp),%r15 movq 24+64(%rsp),%r8 leaq 96(%rsp),%rdi call __ecp_nistz256_sqr_montq movq 448(%rsp),%rax leaq 448(%rsp),%rbx movq 0+0(%rsp),%r9 movq 8+0(%rsp),%r10 leaq 0+0(%rsp),%rsi movq 16+0(%rsp),%r11 movq 24+0(%rsp),%r12 leaq 352(%rsp),%rdi call __ecp_nistz256_mul_montq movq 0+0(%rsp),%rax movq 8+0(%rsp),%r14 leaq 0+0(%rsp),%rsi movq 16+0(%rsp),%r15 movq 24+0(%rsp),%r8 leaq 32(%rsp),%rdi call __ecp_nistz256_sqr_montq movq 544(%rsp),%rax leaq 544(%rsp),%rbx movq 0+352(%rsp),%r9 movq 8+352(%rsp),%r10 leaq 0+352(%rsp),%rsi movq 16+352(%rsp),%r11 movq 24+352(%rsp),%r12 leaq 352(%rsp),%rdi call __ecp_nistz256_mul_montq movq 0(%rsp),%rax leaq 0(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq 0+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 128(%rsp),%rdi call __ecp_nistz256_mul_montq movq 160(%rsp),%rax leaq 160(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq 0+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 192(%rsp),%rdi call __ecp_nistz256_mul_montq xorq %r11,%r11 addq %r12,%r12 leaq 96(%rsp),%rsi adcq %r13,%r13 movq %r12,%rax adcq %r8,%r8 adcq %r9,%r9 movq %r13,%rbp adcq $0,%r11 subq $-1,%r12 movq %r8,%rcx sbbq %r14,%r13 sbbq $0,%r8 movq %r9,%r10 sbbq %r15,%r9 sbbq $0,%r11 cmovcq %rax,%r12 movq 0(%rsi),%rax cmovcq %rbp,%r13 movq 8(%rsi),%rbp cmovcq %rcx,%r8 movq 16(%rsi),%rcx cmovcq %r10,%r9 movq 24(%rsi),%r10 call __ecp_nistz256_subq leaq 128(%rsp),%rbx leaq 288(%rsp),%rdi call __ecp_nistz256_sub_fromq movq 192+0(%rsp),%rax movq 192+8(%rsp),%rbp movq 192+16(%rsp),%rcx movq 192+24(%rsp),%r10 leaq 
320(%rsp),%rdi call __ecp_nistz256_subq movq %r12,0(%rdi) movq %r13,8(%rdi) movq %r8,16(%rdi) movq %r9,24(%rdi) movq 128(%rsp),%rax leaq 128(%rsp),%rbx movq 0+224(%rsp),%r9 movq 8+224(%rsp),%r10 leaq 0+224(%rsp),%rsi movq 16+224(%rsp),%r11 movq 24+224(%rsp),%r12 leaq 256(%rsp),%rdi call __ecp_nistz256_mul_montq movq 320(%rsp),%rax leaq 320(%rsp),%rbx movq 0+64(%rsp),%r9 movq 8+64(%rsp),%r10 leaq 0+64(%rsp),%rsi movq 16+64(%rsp),%r11 movq 24+64(%rsp),%r12 leaq 320(%rsp),%rdi call __ecp_nistz256_mul_montq leaq 256(%rsp),%rbx leaq 320(%rsp),%rdi call __ecp_nistz256_sub_fromq .byte 102,72,15,126,199 movdqa %xmm5,%xmm0 movdqa %xmm5,%xmm1 pandn 352(%rsp),%xmm0 movdqa %xmm5,%xmm2 pandn 352+16(%rsp),%xmm1 movdqa %xmm5,%xmm3 pand 544(%rsp),%xmm2 pand 544+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqa %xmm4,%xmm0 movdqa %xmm4,%xmm1 pandn %xmm2,%xmm0 movdqa %xmm4,%xmm2 pandn %xmm3,%xmm1 movdqa %xmm4,%xmm3 pand 448(%rsp),%xmm2 pand 448+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqu %xmm2,64(%rdi) movdqu %xmm3,80(%rdi) movdqa %xmm5,%xmm0 movdqa %xmm5,%xmm1 pandn 288(%rsp),%xmm0 movdqa %xmm5,%xmm2 pandn 288+16(%rsp),%xmm1 movdqa %xmm5,%xmm3 pand 480(%rsp),%xmm2 pand 480+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqa %xmm4,%xmm0 movdqa %xmm4,%xmm1 pandn %xmm2,%xmm0 movdqa %xmm4,%xmm2 pandn %xmm3,%xmm1 movdqa %xmm4,%xmm3 pand 384(%rsp),%xmm2 pand 384+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqu %xmm2,0(%rdi) movdqu %xmm3,16(%rdi) movdqa %xmm5,%xmm0 movdqa %xmm5,%xmm1 pandn 320(%rsp),%xmm0 movdqa %xmm5,%xmm2 pandn 320+16(%rsp),%xmm1 movdqa %xmm5,%xmm3 pand 512(%rsp),%xmm2 pand 512+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqa %xmm4,%xmm0 movdqa %xmm4,%xmm1 pandn %xmm2,%xmm0 movdqa %xmm4,%xmm2 pandn %xmm3,%xmm1 movdqa %xmm4,%xmm3 pand 416(%rsp),%xmm2 pand 416+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqu %xmm2,32(%rdi) movdqu %xmm3,48(%rdi) .Ladd_doneq: leaq 576+56(%rsp),%rsi .cfi_def_cfa %rsi,8 movq -48(%rsi),%r15 .cfi_restore %r15 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbx .cfi_restore %rbx movq -8(%rsi),%rbp .cfi_restore %rbp leaq (%rsi),%rsp .cfi_def_cfa_register %rsp .Lpoint_addq_epilogue: ret .cfi_endproc .size ecp_nistz256_point_add_nohw,.-ecp_nistz256_point_add_nohw .globl ecp_nistz256_point_add_affine_nohw .hidden ecp_nistz256_point_add_affine_nohw .type ecp_nistz256_point_add_affine_nohw,@function .align 32 ecp_nistz256_point_add_affine_nohw: .cfi_startproc _CET_ENDBR pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $480+8,%rsp .cfi_adjust_cfa_offset 32*15+8 .Ladd_affineq_body: movdqu 0(%rsi),%xmm0 movq %rdx,%rbx movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 movdqu 48(%rsi),%xmm3 movdqu 64(%rsi),%xmm4 movdqu 80(%rsi),%xmm5 movq 64+0(%rsi),%rax movq 64+8(%rsi),%r14 movq 64+16(%rsi),%r15 movq 64+24(%rsi),%r8 movdqa %xmm0,320(%rsp) movdqa %xmm1,320+16(%rsp) movdqa %xmm2,352(%rsp) movdqa %xmm3,352+16(%rsp) movdqa %xmm4,384(%rsp) movdqa %xmm5,384+16(%rsp) por %xmm4,%xmm5 movdqu 0(%rbx),%xmm0 pshufd $0xb1,%xmm5,%xmm3 movdqu 16(%rbx),%xmm1 movdqu 32(%rbx),%xmm2 por %xmm3,%xmm5 movdqu 48(%rbx),%xmm3 movdqa %xmm0,416(%rsp) pshufd $0x1e,%xmm5,%xmm4 movdqa %xmm1,416+16(%rsp) por 
%xmm0,%xmm1 .byte 102,72,15,110,199 movdqa %xmm2,448(%rsp) movdqa %xmm3,448+16(%rsp) por %xmm2,%xmm3 por %xmm4,%xmm5 pxor %xmm4,%xmm4 por %xmm1,%xmm3 leaq 64-0(%rsi),%rsi leaq 32(%rsp),%rdi call __ecp_nistz256_sqr_montq pcmpeqd %xmm4,%xmm5 pshufd $0xb1,%xmm3,%xmm4 movq 0(%rbx),%rax movq %r12,%r9 por %xmm3,%xmm4 pshufd $0,%xmm5,%xmm5 pshufd $0x1e,%xmm4,%xmm3 movq %r13,%r10 por %xmm3,%xmm4 pxor %xmm3,%xmm3 movq %r14,%r11 pcmpeqd %xmm3,%xmm4 pshufd $0,%xmm4,%xmm4 leaq 32-0(%rsp),%rsi movq %r15,%r12 leaq 0(%rsp),%rdi call __ecp_nistz256_mul_montq leaq 320(%rsp),%rbx leaq 64(%rsp),%rdi call __ecp_nistz256_sub_fromq movq 384(%rsp),%rax leaq 384(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq 0+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 32(%rsp),%rdi call __ecp_nistz256_mul_montq movq 384(%rsp),%rax leaq 384(%rsp),%rbx movq 0+64(%rsp),%r9 movq 8+64(%rsp),%r10 leaq 0+64(%rsp),%rsi movq 16+64(%rsp),%r11 movq 24+64(%rsp),%r12 leaq 288(%rsp),%rdi call __ecp_nistz256_mul_montq movq 448(%rsp),%rax leaq 448(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq 0+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 32(%rsp),%rdi call __ecp_nistz256_mul_montq leaq 352(%rsp),%rbx leaq 96(%rsp),%rdi call __ecp_nistz256_sub_fromq movq 0+64(%rsp),%rax movq 8+64(%rsp),%r14 leaq 0+64(%rsp),%rsi movq 16+64(%rsp),%r15 movq 24+64(%rsp),%r8 leaq 128(%rsp),%rdi call __ecp_nistz256_sqr_montq movq 0+96(%rsp),%rax movq 8+96(%rsp),%r14 leaq 0+96(%rsp),%rsi movq 16+96(%rsp),%r15 movq 24+96(%rsp),%r8 leaq 192(%rsp),%rdi call __ecp_nistz256_sqr_montq movq 128(%rsp),%rax leaq 128(%rsp),%rbx movq 0+64(%rsp),%r9 movq 8+64(%rsp),%r10 leaq 0+64(%rsp),%rsi movq 16+64(%rsp),%r11 movq 24+64(%rsp),%r12 leaq 160(%rsp),%rdi call __ecp_nistz256_mul_montq movq 320(%rsp),%rax leaq 320(%rsp),%rbx movq 0+128(%rsp),%r9 movq 8+128(%rsp),%r10 leaq 0+128(%rsp),%rsi movq 16+128(%rsp),%r11 movq 24+128(%rsp),%r12 leaq 0(%rsp),%rdi call __ecp_nistz256_mul_montq xorq %r11,%r11 addq %r12,%r12 leaq 192(%rsp),%rsi adcq %r13,%r13 movq %r12,%rax adcq %r8,%r8 adcq %r9,%r9 movq %r13,%rbp adcq $0,%r11 subq $-1,%r12 movq %r8,%rcx sbbq %r14,%r13 sbbq $0,%r8 movq %r9,%r10 sbbq %r15,%r9 sbbq $0,%r11 cmovcq %rax,%r12 movq 0(%rsi),%rax cmovcq %rbp,%r13 movq 8(%rsi),%rbp cmovcq %rcx,%r8 movq 16(%rsi),%rcx cmovcq %r10,%r9 movq 24(%rsi),%r10 call __ecp_nistz256_subq leaq 160(%rsp),%rbx leaq 224(%rsp),%rdi call __ecp_nistz256_sub_fromq movq 0+0(%rsp),%rax movq 0+8(%rsp),%rbp movq 0+16(%rsp),%rcx movq 0+24(%rsp),%r10 leaq 64(%rsp),%rdi call __ecp_nistz256_subq movq %r12,0(%rdi) movq %r13,8(%rdi) movq %r8,16(%rdi) movq %r9,24(%rdi) movq 352(%rsp),%rax leaq 352(%rsp),%rbx movq 0+160(%rsp),%r9 movq 8+160(%rsp),%r10 leaq 0+160(%rsp),%rsi movq 16+160(%rsp),%r11 movq 24+160(%rsp),%r12 leaq 32(%rsp),%rdi call __ecp_nistz256_mul_montq movq 96(%rsp),%rax leaq 96(%rsp),%rbx movq 0+64(%rsp),%r9 movq 8+64(%rsp),%r10 leaq 0+64(%rsp),%rsi movq 16+64(%rsp),%r11 movq 24+64(%rsp),%r12 leaq 64(%rsp),%rdi call __ecp_nistz256_mul_montq leaq 32(%rsp),%rbx leaq 256(%rsp),%rdi call __ecp_nistz256_sub_fromq .byte 102,72,15,126,199 movdqa %xmm5,%xmm0 movdqa %xmm5,%xmm1 pandn 288(%rsp),%xmm0 movdqa %xmm5,%xmm2 pandn 288+16(%rsp),%xmm1 movdqa %xmm5,%xmm3 pand .LONE_mont(%rip),%xmm2 pand .LONE_mont+16(%rip),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqa %xmm4,%xmm0 movdqa %xmm4,%xmm1 pandn %xmm2,%xmm0 movdqa %xmm4,%xmm2 pandn %xmm3,%xmm1 movdqa %xmm4,%xmm3 pand 384(%rsp),%xmm2 pand 384+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqu %xmm2,64(%rdi) 
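/* Result selection for point_add_affine: the pandn/pand/por sequences
   blend the freshly computed coordinates with one of the inputs (or with
   .LONE_mont for the Z coordinate), steered by the infinity masks built
   earlier in %xmm4/%xmm5, so the choice is made without branching on
   secret data. */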
movdqu %xmm3,80(%rdi) movdqa %xmm5,%xmm0 movdqa %xmm5,%xmm1 pandn 224(%rsp),%xmm0 movdqa %xmm5,%xmm2 pandn 224+16(%rsp),%xmm1 movdqa %xmm5,%xmm3 pand 416(%rsp),%xmm2 pand 416+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqa %xmm4,%xmm0 movdqa %xmm4,%xmm1 pandn %xmm2,%xmm0 movdqa %xmm4,%xmm2 pandn %xmm3,%xmm1 movdqa %xmm4,%xmm3 pand 320(%rsp),%xmm2 pand 320+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqu %xmm2,0(%rdi) movdqu %xmm3,16(%rdi) movdqa %xmm5,%xmm0 movdqa %xmm5,%xmm1 pandn 256(%rsp),%xmm0 movdqa %xmm5,%xmm2 pandn 256+16(%rsp),%xmm1 movdqa %xmm5,%xmm3 pand 448(%rsp),%xmm2 pand 448+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqa %xmm4,%xmm0 movdqa %xmm4,%xmm1 pandn %xmm2,%xmm0 movdqa %xmm4,%xmm2 pandn %xmm3,%xmm1 movdqa %xmm4,%xmm3 pand 352(%rsp),%xmm2 pand 352+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqu %xmm2,32(%rdi) movdqu %xmm3,48(%rdi) leaq 480+56(%rsp),%rsi .cfi_def_cfa %rsi,8 movq -48(%rsi),%r15 .cfi_restore %r15 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbx .cfi_restore %rbx movq -8(%rsi),%rbp .cfi_restore %rbp leaq (%rsi),%rsp .cfi_def_cfa_register %rsp .Ladd_affineq_epilogue: ret .cfi_endproc .size ecp_nistz256_point_add_affine_nohw,.-ecp_nistz256_point_add_affine_nohw .type __ecp_nistz256_add_tox,@function .align 32 __ecp_nistz256_add_tox: .cfi_startproc xorq %r11,%r11 adcq 0(%rbx),%r12 adcq 8(%rbx),%r13 movq %r12,%rax adcq 16(%rbx),%r8 adcq 24(%rbx),%r9 movq %r13,%rbp adcq $0,%r11 xorq %r10,%r10 sbbq $-1,%r12 movq %r8,%rcx sbbq %r14,%r13 sbbq $0,%r8 movq %r9,%r10 sbbq %r15,%r9 sbbq $0,%r11 cmovcq %rax,%r12 cmovcq %rbp,%r13 movq %r12,0(%rdi) cmovcq %rcx,%r8 movq %r13,8(%rdi) cmovcq %r10,%r9 movq %r8,16(%rdi) movq %r9,24(%rdi) ret .cfi_endproc .size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox .type __ecp_nistz256_sub_fromx,@function .align 32 __ecp_nistz256_sub_fromx: .cfi_startproc xorq %r11,%r11 sbbq 0(%rbx),%r12 sbbq 8(%rbx),%r13 movq %r12,%rax sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 movq %r13,%rbp sbbq $0,%r11 xorq %r10,%r10 adcq $-1,%r12 movq %r8,%rcx adcq %r14,%r13 adcq $0,%r8 movq %r9,%r10 adcq %r15,%r9 btq $0,%r11 cmovncq %rax,%r12 cmovncq %rbp,%r13 movq %r12,0(%rdi) cmovncq %rcx,%r8 movq %r13,8(%rdi) cmovncq %r10,%r9 movq %r8,16(%rdi) movq %r9,24(%rdi) ret .cfi_endproc .size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx .type __ecp_nistz256_subx,@function .align 32 __ecp_nistz256_subx: .cfi_startproc xorq %r11,%r11 sbbq %r12,%rax sbbq %r13,%rbp movq %rax,%r12 sbbq %r8,%rcx sbbq %r9,%r10 movq %rbp,%r13 sbbq $0,%r11 xorq %r9,%r9 adcq $-1,%rax movq %rcx,%r8 adcq %r14,%rbp adcq $0,%rcx movq %r10,%r9 adcq %r15,%r10 btq $0,%r11 cmovcq %rax,%r12 cmovcq %rbp,%r13 cmovcq %rcx,%r8 cmovcq %r10,%r9 ret .cfi_endproc .size __ecp_nistz256_subx,.-__ecp_nistz256_subx .type __ecp_nistz256_mul_by_2x,@function .align 32 __ecp_nistz256_mul_by_2x: .cfi_startproc xorq %r11,%r11 adcq %r12,%r12 adcq %r13,%r13 movq %r12,%rax adcq %r8,%r8 adcq %r9,%r9 movq %r13,%rbp adcq $0,%r11 xorq %r10,%r10 sbbq $-1,%r12 movq %r8,%rcx sbbq %r14,%r13 sbbq $0,%r8 movq %r9,%r10 sbbq %r15,%r9 sbbq $0,%r11 cmovcq %rax,%r12 cmovcq %rbp,%r13 movq %r12,0(%rdi) cmovcq %rcx,%r8 movq %r13,8(%rdi) cmovcq %r10,%r9 movq %r8,16(%rdi) movq %r9,24(%rdi) ret .cfi_endproc .size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x .globl ecp_nistz256_point_double_adx .hidden ecp_nistz256_point_double_adx .type ecp_nistz256_point_double_adx,@function .align 32 ecp_nistz256_point_double_adx: .cfi_startproc 
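/* ecp_nistz256_point_double_adx and the other _adx entry points repeat
   the _nohw point arithmetic using mulx/adcx/adox (BMI2 and ADX), which
   lets two independent carry chains run in parallel inside the Montgomery
   routines; the _nohw/_adx pair is meant to be chosen by the caller
   according to the CPU's capabilities. */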
_CET_ENDBR pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $160+8,%rsp .cfi_adjust_cfa_offset 32*5+8 .Lpoint_doublex_body: .Lpoint_double_shortcutx: movdqu 0(%rsi),%xmm0 movq %rsi,%rbx movdqu 16(%rsi),%xmm1 movq 32+0(%rsi),%r12 movq 32+8(%rsi),%r13 movq 32+16(%rsi),%r8 movq 32+24(%rsi),%r9 movq .Lpoly+8(%rip),%r14 movq .Lpoly+24(%rip),%r15 movdqa %xmm0,96(%rsp) movdqa %xmm1,96+16(%rsp) leaq 32(%rdi),%r10 leaq 64(%rdi),%r11 .byte 102,72,15,110,199 .byte 102,73,15,110,202 .byte 102,73,15,110,211 leaq 0(%rsp),%rdi call __ecp_nistz256_mul_by_2x movq 64+0(%rsi),%rdx movq 64+8(%rsi),%r14 movq 64+16(%rsi),%r15 movq 64+24(%rsi),%r8 leaq 64-128(%rsi),%rsi leaq 64(%rsp),%rdi call __ecp_nistz256_sqr_montx movq 0+0(%rsp),%rdx movq 8+0(%rsp),%r14 leaq -128+0(%rsp),%rsi movq 16+0(%rsp),%r15 movq 24+0(%rsp),%r8 leaq 0(%rsp),%rdi call __ecp_nistz256_sqr_montx movq 32(%rbx),%rdx movq 64+0(%rbx),%r9 movq 64+8(%rbx),%r10 movq 64+16(%rbx),%r11 movq 64+24(%rbx),%r12 leaq 64-128(%rbx),%rsi leaq 32(%rbx),%rbx .byte 102,72,15,126,215 call __ecp_nistz256_mul_montx call __ecp_nistz256_mul_by_2x movq 96+0(%rsp),%r12 movq 96+8(%rsp),%r13 leaq 64(%rsp),%rbx movq 96+16(%rsp),%r8 movq 96+24(%rsp),%r9 leaq 32(%rsp),%rdi call __ecp_nistz256_add_tox movq 96+0(%rsp),%r12 movq 96+8(%rsp),%r13 leaq 64(%rsp),%rbx movq 96+16(%rsp),%r8 movq 96+24(%rsp),%r9 leaq 64(%rsp),%rdi call __ecp_nistz256_sub_fromx movq 0+0(%rsp),%rdx movq 8+0(%rsp),%r14 leaq -128+0(%rsp),%rsi movq 16+0(%rsp),%r15 movq 24+0(%rsp),%r8 .byte 102,72,15,126,207 call __ecp_nistz256_sqr_montx xorq %r9,%r9 movq %r12,%rax addq $-1,%r12 movq %r13,%r10 adcq %rsi,%r13 movq %r14,%rcx adcq $0,%r14 movq %r15,%r8 adcq %rbp,%r15 adcq $0,%r9 xorq %rsi,%rsi testq $1,%rax cmovzq %rax,%r12 cmovzq %r10,%r13 cmovzq %rcx,%r14 cmovzq %r8,%r15 cmovzq %rsi,%r9 movq %r13,%rax shrq $1,%r12 shlq $63,%rax movq %r14,%r10 shrq $1,%r13 orq %rax,%r12 shlq $63,%r10 movq %r15,%rcx shrq $1,%r14 orq %r10,%r13 shlq $63,%rcx movq %r12,0(%rdi) shrq $1,%r15 movq %r13,8(%rdi) shlq $63,%r9 orq %rcx,%r14 orq %r9,%r15 movq %r14,16(%rdi) movq %r15,24(%rdi) movq 64(%rsp),%rdx leaq 64(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq -128+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 32(%rsp),%rdi call __ecp_nistz256_mul_montx leaq 128(%rsp),%rdi call __ecp_nistz256_mul_by_2x leaq 32(%rsp),%rbx leaq 32(%rsp),%rdi call __ecp_nistz256_add_tox movq 96(%rsp),%rdx leaq 96(%rsp),%rbx movq 0+0(%rsp),%r9 movq 8+0(%rsp),%r10 leaq -128+0(%rsp),%rsi movq 16+0(%rsp),%r11 movq 24+0(%rsp),%r12 leaq 0(%rsp),%rdi call __ecp_nistz256_mul_montx leaq 128(%rsp),%rdi call __ecp_nistz256_mul_by_2x movq 0+32(%rsp),%rdx movq 8+32(%rsp),%r14 leaq -128+32(%rsp),%rsi movq 16+32(%rsp),%r15 movq 24+32(%rsp),%r8 .byte 102,72,15,126,199 call __ecp_nistz256_sqr_montx leaq 128(%rsp),%rbx movq %r14,%r8 movq %r15,%r9 movq %rsi,%r14 movq %rbp,%r15 call __ecp_nistz256_sub_fromx movq 0+0(%rsp),%rax movq 0+8(%rsp),%rbp movq 0+16(%rsp),%rcx movq 0+24(%rsp),%r10 leaq 0(%rsp),%rdi call __ecp_nistz256_subx movq 32(%rsp),%rdx leaq 32(%rsp),%rbx movq %r12,%r14 xorl %ecx,%ecx movq %r12,0+0(%rsp) movq %r13,%r10 movq %r13,0+8(%rsp) cmovzq %r8,%r11 movq %r8,0+16(%rsp) leaq 0-128(%rsp),%rsi cmovzq %r9,%r12 movq %r9,0+24(%rsp) movq 
%r14,%r9 leaq 0(%rsp),%rdi call __ecp_nistz256_mul_montx .byte 102,72,15,126,203 .byte 102,72,15,126,207 call __ecp_nistz256_sub_fromx leaq 160+56(%rsp),%rsi .cfi_def_cfa %rsi,8 movq -48(%rsi),%r15 .cfi_restore %r15 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbx .cfi_restore %rbx movq -8(%rsi),%rbp .cfi_restore %rbp leaq (%rsi),%rsp .cfi_def_cfa_register %rsp .Lpoint_doublex_epilogue: ret .cfi_endproc .size ecp_nistz256_point_double_adx,.-ecp_nistz256_point_double_adx .globl ecp_nistz256_point_add_adx .hidden ecp_nistz256_point_add_adx .type ecp_nistz256_point_add_adx,@function .align 32 ecp_nistz256_point_add_adx: .cfi_startproc _CET_ENDBR pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $576+8,%rsp .cfi_adjust_cfa_offset 32*18+8 .Lpoint_addx_body: movdqu 0(%rsi),%xmm0 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 movdqu 48(%rsi),%xmm3 movdqu 64(%rsi),%xmm4 movdqu 80(%rsi),%xmm5 movq %rsi,%rbx movq %rdx,%rsi movdqa %xmm0,384(%rsp) movdqa %xmm1,384+16(%rsp) movdqa %xmm2,416(%rsp) movdqa %xmm3,416+16(%rsp) movdqa %xmm4,448(%rsp) movdqa %xmm5,448+16(%rsp) por %xmm4,%xmm5 movdqu 0(%rsi),%xmm0 pshufd $0xb1,%xmm5,%xmm3 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 por %xmm3,%xmm5 movdqu 48(%rsi),%xmm3 movq 64+0(%rsi),%rdx movq 64+8(%rsi),%r14 movq 64+16(%rsi),%r15 movq 64+24(%rsi),%r8 movdqa %xmm0,480(%rsp) pshufd $0x1e,%xmm5,%xmm4 movdqa %xmm1,480+16(%rsp) movdqu 64(%rsi),%xmm0 movdqu 80(%rsi),%xmm1 movdqa %xmm2,512(%rsp) movdqa %xmm3,512+16(%rsp) por %xmm4,%xmm5 pxor %xmm4,%xmm4 por %xmm0,%xmm1 .byte 102,72,15,110,199 leaq 64-128(%rsi),%rsi movq %rdx,544+0(%rsp) movq %r14,544+8(%rsp) movq %r15,544+16(%rsp) movq %r8,544+24(%rsp) leaq 96(%rsp),%rdi call __ecp_nistz256_sqr_montx pcmpeqd %xmm4,%xmm5 pshufd $0xb1,%xmm1,%xmm4 por %xmm1,%xmm4 pshufd $0,%xmm5,%xmm5 pshufd $0x1e,%xmm4,%xmm3 por %xmm3,%xmm4 pxor %xmm3,%xmm3 pcmpeqd %xmm3,%xmm4 pshufd $0,%xmm4,%xmm4 movq 64+0(%rbx),%rdx movq 64+8(%rbx),%r14 movq 64+16(%rbx),%r15 movq 64+24(%rbx),%r8 .byte 102,72,15,110,203 leaq 64-128(%rbx),%rsi leaq 32(%rsp),%rdi call __ecp_nistz256_sqr_montx movq 544(%rsp),%rdx leaq 544(%rsp),%rbx movq 0+96(%rsp),%r9 movq 8+96(%rsp),%r10 leaq -128+96(%rsp),%rsi movq 16+96(%rsp),%r11 movq 24+96(%rsp),%r12 leaq 224(%rsp),%rdi call __ecp_nistz256_mul_montx movq 448(%rsp),%rdx leaq 448(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq -128+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 256(%rsp),%rdi call __ecp_nistz256_mul_montx movq 416(%rsp),%rdx leaq 416(%rsp),%rbx movq 0+224(%rsp),%r9 movq 8+224(%rsp),%r10 leaq -128+224(%rsp),%rsi movq 16+224(%rsp),%r11 movq 24+224(%rsp),%r12 leaq 224(%rsp),%rdi call __ecp_nistz256_mul_montx movq 512(%rsp),%rdx leaq 512(%rsp),%rbx movq 0+256(%rsp),%r9 movq 8+256(%rsp),%r10 leaq -128+256(%rsp),%rsi movq 16+256(%rsp),%r11 movq 24+256(%rsp),%r12 leaq 256(%rsp),%rdi call __ecp_nistz256_mul_montx leaq 224(%rsp),%rbx leaq 64(%rsp),%rdi call __ecp_nistz256_sub_fromx orq %r13,%r12 movdqa %xmm4,%xmm2 orq %r8,%r12 orq %r9,%r12 por %xmm5,%xmm2 .byte 102,73,15,110,220 movq 384(%rsp),%rdx leaq 384(%rsp),%rbx movq 0+96(%rsp),%r9 movq 8+96(%rsp),%r10 leaq -128+96(%rsp),%rsi movq 16+96(%rsp),%r11 movq 
24+96(%rsp),%r12 leaq 160(%rsp),%rdi call __ecp_nistz256_mul_montx movq 480(%rsp),%rdx leaq 480(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq -128+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 192(%rsp),%rdi call __ecp_nistz256_mul_montx leaq 160(%rsp),%rbx leaq 0(%rsp),%rdi call __ecp_nistz256_sub_fromx orq %r13,%r12 orq %r8,%r12 orq %r9,%r12 .byte 102,73,15,126,208 .byte 102,73,15,126,217 orq %r8,%r12 .byte 0x3e jnz .Ladd_proceedx testq %r9,%r9 jz .Ladd_doublex .byte 102,72,15,126,199 pxor %xmm0,%xmm0 movdqu %xmm0,0(%rdi) movdqu %xmm0,16(%rdi) movdqu %xmm0,32(%rdi) movdqu %xmm0,48(%rdi) movdqu %xmm0,64(%rdi) movdqu %xmm0,80(%rdi) jmp .Ladd_donex .align 32 .Ladd_doublex: .byte 102,72,15,126,206 .byte 102,72,15,126,199 addq $416,%rsp .cfi_adjust_cfa_offset -416 jmp .Lpoint_double_shortcutx .cfi_adjust_cfa_offset 416 .align 32 .Ladd_proceedx: movq 0+64(%rsp),%rdx movq 8+64(%rsp),%r14 leaq -128+64(%rsp),%rsi movq 16+64(%rsp),%r15 movq 24+64(%rsp),%r8 leaq 96(%rsp),%rdi call __ecp_nistz256_sqr_montx movq 448(%rsp),%rdx leaq 448(%rsp),%rbx movq 0+0(%rsp),%r9 movq 8+0(%rsp),%r10 leaq -128+0(%rsp),%rsi movq 16+0(%rsp),%r11 movq 24+0(%rsp),%r12 leaq 352(%rsp),%rdi call __ecp_nistz256_mul_montx movq 0+0(%rsp),%rdx movq 8+0(%rsp),%r14 leaq -128+0(%rsp),%rsi movq 16+0(%rsp),%r15 movq 24+0(%rsp),%r8 leaq 32(%rsp),%rdi call __ecp_nistz256_sqr_montx movq 544(%rsp),%rdx leaq 544(%rsp),%rbx movq 0+352(%rsp),%r9 movq 8+352(%rsp),%r10 leaq -128+352(%rsp),%rsi movq 16+352(%rsp),%r11 movq 24+352(%rsp),%r12 leaq 352(%rsp),%rdi call __ecp_nistz256_mul_montx movq 0(%rsp),%rdx leaq 0(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq -128+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 128(%rsp),%rdi call __ecp_nistz256_mul_montx movq 160(%rsp),%rdx leaq 160(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq -128+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 192(%rsp),%rdi call __ecp_nistz256_mul_montx xorq %r11,%r11 addq %r12,%r12 leaq 96(%rsp),%rsi adcq %r13,%r13 movq %r12,%rax adcq %r8,%r8 adcq %r9,%r9 movq %r13,%rbp adcq $0,%r11 subq $-1,%r12 movq %r8,%rcx sbbq %r14,%r13 sbbq $0,%r8 movq %r9,%r10 sbbq %r15,%r9 sbbq $0,%r11 cmovcq %rax,%r12 movq 0(%rsi),%rax cmovcq %rbp,%r13 movq 8(%rsi),%rbp cmovcq %rcx,%r8 movq 16(%rsi),%rcx cmovcq %r10,%r9 movq 24(%rsi),%r10 call __ecp_nistz256_subx leaq 128(%rsp),%rbx leaq 288(%rsp),%rdi call __ecp_nistz256_sub_fromx movq 192+0(%rsp),%rax movq 192+8(%rsp),%rbp movq 192+16(%rsp),%rcx movq 192+24(%rsp),%r10 leaq 320(%rsp),%rdi call __ecp_nistz256_subx movq %r12,0(%rdi) movq %r13,8(%rdi) movq %r8,16(%rdi) movq %r9,24(%rdi) movq 128(%rsp),%rdx leaq 128(%rsp),%rbx movq 0+224(%rsp),%r9 movq 8+224(%rsp),%r10 leaq -128+224(%rsp),%rsi movq 16+224(%rsp),%r11 movq 24+224(%rsp),%r12 leaq 256(%rsp),%rdi call __ecp_nistz256_mul_montx movq 320(%rsp),%rdx leaq 320(%rsp),%rbx movq 0+64(%rsp),%r9 movq 8+64(%rsp),%r10 leaq -128+64(%rsp),%rsi movq 16+64(%rsp),%r11 movq 24+64(%rsp),%r12 leaq 320(%rsp),%rdi call __ecp_nistz256_mul_montx leaq 256(%rsp),%rbx leaq 320(%rsp),%rdi call __ecp_nistz256_sub_fromx .byte 102,72,15,126,199 movdqa %xmm5,%xmm0 movdqa %xmm5,%xmm1 pandn 352(%rsp),%xmm0 movdqa %xmm5,%xmm2 pandn 352+16(%rsp),%xmm1 movdqa %xmm5,%xmm3 pand 544(%rsp),%xmm2 pand 544+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqa %xmm4,%xmm0 movdqa %xmm4,%xmm1 pandn %xmm2,%xmm0 movdqa %xmm4,%xmm2 pandn %xmm3,%xmm1 movdqa %xmm4,%xmm3 pand 448(%rsp),%xmm2 pand 448+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqu 
%xmm2,64(%rdi) movdqu %xmm3,80(%rdi) movdqa %xmm5,%xmm0 movdqa %xmm5,%xmm1 pandn 288(%rsp),%xmm0 movdqa %xmm5,%xmm2 pandn 288+16(%rsp),%xmm1 movdqa %xmm5,%xmm3 pand 480(%rsp),%xmm2 pand 480+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqa %xmm4,%xmm0 movdqa %xmm4,%xmm1 pandn %xmm2,%xmm0 movdqa %xmm4,%xmm2 pandn %xmm3,%xmm1 movdqa %xmm4,%xmm3 pand 384(%rsp),%xmm2 pand 384+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqu %xmm2,0(%rdi) movdqu %xmm3,16(%rdi) movdqa %xmm5,%xmm0 movdqa %xmm5,%xmm1 pandn 320(%rsp),%xmm0 movdqa %xmm5,%xmm2 pandn 320+16(%rsp),%xmm1 movdqa %xmm5,%xmm3 pand 512(%rsp),%xmm2 pand 512+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqa %xmm4,%xmm0 movdqa %xmm4,%xmm1 pandn %xmm2,%xmm0 movdqa %xmm4,%xmm2 pandn %xmm3,%xmm1 movdqa %xmm4,%xmm3 pand 416(%rsp),%xmm2 pand 416+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqu %xmm2,32(%rdi) movdqu %xmm3,48(%rdi) .Ladd_donex: leaq 576+56(%rsp),%rsi .cfi_def_cfa %rsi,8 movq -48(%rsi),%r15 .cfi_restore %r15 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbx .cfi_restore %rbx movq -8(%rsi),%rbp .cfi_restore %rbp leaq (%rsi),%rsp .cfi_def_cfa_register %rsp .Lpoint_addx_epilogue: ret .cfi_endproc .size ecp_nistz256_point_add_adx,.-ecp_nistz256_point_add_adx .globl ecp_nistz256_point_add_affine_adx .hidden ecp_nistz256_point_add_affine_adx .type ecp_nistz256_point_add_affine_adx,@function .align 32 ecp_nistz256_point_add_affine_adx: .cfi_startproc _CET_ENDBR pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $480+8,%rsp .cfi_adjust_cfa_offset 32*15+8 .Ladd_affinex_body: movdqu 0(%rsi),%xmm0 movq %rdx,%rbx movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 movdqu 48(%rsi),%xmm3 movdqu 64(%rsi),%xmm4 movdqu 80(%rsi),%xmm5 movq 64+0(%rsi),%rdx movq 64+8(%rsi),%r14 movq 64+16(%rsi),%r15 movq 64+24(%rsi),%r8 movdqa %xmm0,320(%rsp) movdqa %xmm1,320+16(%rsp) movdqa %xmm2,352(%rsp) movdqa %xmm3,352+16(%rsp) movdqa %xmm4,384(%rsp) movdqa %xmm5,384+16(%rsp) por %xmm4,%xmm5 movdqu 0(%rbx),%xmm0 pshufd $0xb1,%xmm5,%xmm3 movdqu 16(%rbx),%xmm1 movdqu 32(%rbx),%xmm2 por %xmm3,%xmm5 movdqu 48(%rbx),%xmm3 movdqa %xmm0,416(%rsp) pshufd $0x1e,%xmm5,%xmm4 movdqa %xmm1,416+16(%rsp) por %xmm0,%xmm1 .byte 102,72,15,110,199 movdqa %xmm2,448(%rsp) movdqa %xmm3,448+16(%rsp) por %xmm2,%xmm3 por %xmm4,%xmm5 pxor %xmm4,%xmm4 por %xmm1,%xmm3 leaq 64-128(%rsi),%rsi leaq 32(%rsp),%rdi call __ecp_nistz256_sqr_montx pcmpeqd %xmm4,%xmm5 pshufd $0xb1,%xmm3,%xmm4 movq 0(%rbx),%rdx movq %r12,%r9 por %xmm3,%xmm4 pshufd $0,%xmm5,%xmm5 pshufd $0x1e,%xmm4,%xmm3 movq %r13,%r10 por %xmm3,%xmm4 pxor %xmm3,%xmm3 movq %r14,%r11 pcmpeqd %xmm3,%xmm4 pshufd $0,%xmm4,%xmm4 leaq 32-128(%rsp),%rsi movq %r15,%r12 leaq 0(%rsp),%rdi call __ecp_nistz256_mul_montx leaq 320(%rsp),%rbx leaq 64(%rsp),%rdi call __ecp_nistz256_sub_fromx movq 384(%rsp),%rdx leaq 384(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq -128+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 32(%rsp),%rdi call __ecp_nistz256_mul_montx movq 384(%rsp),%rdx leaq 384(%rsp),%rbx movq 0+64(%rsp),%r9 movq 8+64(%rsp),%r10 leaq -128+64(%rsp),%rsi movq 16+64(%rsp),%r11 movq 24+64(%rsp),%r12 leaq 288(%rsp),%rdi call 
__ecp_nistz256_mul_montx movq 448(%rsp),%rdx leaq 448(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq -128+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 32(%rsp),%rdi call __ecp_nistz256_mul_montx leaq 352(%rsp),%rbx leaq 96(%rsp),%rdi call __ecp_nistz256_sub_fromx movq 0+64(%rsp),%rdx movq 8+64(%rsp),%r14 leaq -128+64(%rsp),%rsi movq 16+64(%rsp),%r15 movq 24+64(%rsp),%r8 leaq 128(%rsp),%rdi call __ecp_nistz256_sqr_montx movq 0+96(%rsp),%rdx movq 8+96(%rsp),%r14 leaq -128+96(%rsp),%rsi movq 16+96(%rsp),%r15 movq 24+96(%rsp),%r8 leaq 192(%rsp),%rdi call __ecp_nistz256_sqr_montx movq 128(%rsp),%rdx leaq 128(%rsp),%rbx movq 0+64(%rsp),%r9 movq 8+64(%rsp),%r10 leaq -128+64(%rsp),%rsi movq 16+64(%rsp),%r11 movq 24+64(%rsp),%r12 leaq 160(%rsp),%rdi call __ecp_nistz256_mul_montx movq 320(%rsp),%rdx leaq 320(%rsp),%rbx movq 0+128(%rsp),%r9 movq 8+128(%rsp),%r10 leaq -128+128(%rsp),%rsi movq 16+128(%rsp),%r11 movq 24+128(%rsp),%r12 leaq 0(%rsp),%rdi call __ecp_nistz256_mul_montx xorq %r11,%r11 addq %r12,%r12 leaq 192(%rsp),%rsi adcq %r13,%r13 movq %r12,%rax adcq %r8,%r8 adcq %r9,%r9 movq %r13,%rbp adcq $0,%r11 subq $-1,%r12 movq %r8,%rcx sbbq %r14,%r13 sbbq $0,%r8 movq %r9,%r10 sbbq %r15,%r9 sbbq $0,%r11 cmovcq %rax,%r12 movq 0(%rsi),%rax cmovcq %rbp,%r13 movq 8(%rsi),%rbp cmovcq %rcx,%r8 movq 16(%rsi),%rcx cmovcq %r10,%r9 movq 24(%rsi),%r10 call __ecp_nistz256_subx leaq 160(%rsp),%rbx leaq 224(%rsp),%rdi call __ecp_nistz256_sub_fromx movq 0+0(%rsp),%rax movq 0+8(%rsp),%rbp movq 0+16(%rsp),%rcx movq 0+24(%rsp),%r10 leaq 64(%rsp),%rdi call __ecp_nistz256_subx movq %r12,0(%rdi) movq %r13,8(%rdi) movq %r8,16(%rdi) movq %r9,24(%rdi) movq 352(%rsp),%rdx leaq 352(%rsp),%rbx movq 0+160(%rsp),%r9 movq 8+160(%rsp),%r10 leaq -128+160(%rsp),%rsi movq 16+160(%rsp),%r11 movq 24+160(%rsp),%r12 leaq 32(%rsp),%rdi call __ecp_nistz256_mul_montx movq 96(%rsp),%rdx leaq 96(%rsp),%rbx movq 0+64(%rsp),%r9 movq 8+64(%rsp),%r10 leaq -128+64(%rsp),%rsi movq 16+64(%rsp),%r11 movq 24+64(%rsp),%r12 leaq 64(%rsp),%rdi call __ecp_nistz256_mul_montx leaq 32(%rsp),%rbx leaq 256(%rsp),%rdi call __ecp_nistz256_sub_fromx .byte 102,72,15,126,199 movdqa %xmm5,%xmm0 movdqa %xmm5,%xmm1 pandn 288(%rsp),%xmm0 movdqa %xmm5,%xmm2 pandn 288+16(%rsp),%xmm1 movdqa %xmm5,%xmm3 pand .LONE_mont(%rip),%xmm2 pand .LONE_mont+16(%rip),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqa %xmm4,%xmm0 movdqa %xmm4,%xmm1 pandn %xmm2,%xmm0 movdqa %xmm4,%xmm2 pandn %xmm3,%xmm1 movdqa %xmm4,%xmm3 pand 384(%rsp),%xmm2 pand 384+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqu %xmm2,64(%rdi) movdqu %xmm3,80(%rdi) movdqa %xmm5,%xmm0 movdqa %xmm5,%xmm1 pandn 224(%rsp),%xmm0 movdqa %xmm5,%xmm2 pandn 224+16(%rsp),%xmm1 movdqa %xmm5,%xmm3 pand 416(%rsp),%xmm2 pand 416+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqa %xmm4,%xmm0 movdqa %xmm4,%xmm1 pandn %xmm2,%xmm0 movdqa %xmm4,%xmm2 pandn %xmm3,%xmm1 movdqa %xmm4,%xmm3 pand 320(%rsp),%xmm2 pand 320+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqu %xmm2,0(%rdi) movdqu %xmm3,16(%rdi) movdqa %xmm5,%xmm0 movdqa %xmm5,%xmm1 pandn 256(%rsp),%xmm0 movdqa %xmm5,%xmm2 pandn 256+16(%rsp),%xmm1 movdqa %xmm5,%xmm3 pand 448(%rsp),%xmm2 pand 448+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqa %xmm4,%xmm0 movdqa %xmm4,%xmm1 pandn %xmm2,%xmm0 movdqa %xmm4,%xmm2 pandn %xmm3,%xmm1 movdqa %xmm4,%xmm3 pand 352(%rsp),%xmm2 pand 352+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqu %xmm2,32(%rdi) movdqu %xmm3,48(%rdi) leaq 480+56(%rsp),%rsi .cfi_def_cfa %rsi,8 movq -48(%rsi),%r15 .cfi_restore 
%r15 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbx .cfi_restore %rbx movq -8(%rsi),%rbp .cfi_restore %rbp leaq (%rsi),%rsp .cfi_def_cfa_register %rsp .Ladd_affinex_epilogue: ret .cfi_endproc .size ecp_nistz256_point_add_affine_adx,.-ecp_nistz256_point_add_affine_adx #endif ring-0.17.14/pregenerated/p256-x86_64-asm-macosx.S000064400000000000000000002061431046102023000172320ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) .text .section __DATA,__const .p2align 6 L$poly: .quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001 L$One: .long 1,1,1,1,1,1,1,1 L$Two: .long 2,2,2,2,2,2,2,2 L$Three: .long 3,3,3,3,3,3,3,3 L$ONE_mont: .quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe L$ord: .quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000 L$ordK: .quad 0xccd1c8aaee00bc4f .text .globl _ecp_nistz256_neg .private_extern _ecp_nistz256_neg .p2align 5 _ecp_nistz256_neg: _CET_ENDBR pushq %r12 pushq %r13 L$neg_body: xorq %r8,%r8 xorq %r9,%r9 xorq %r10,%r10 xorq %r11,%r11 xorq %r13,%r13 subq 0(%rsi),%r8 sbbq 8(%rsi),%r9 sbbq 16(%rsi),%r10 movq %r8,%rax sbbq 24(%rsi),%r11 leaq L$poly(%rip),%rsi movq %r9,%rdx sbbq $0,%r13 addq 0(%rsi),%r8 movq %r10,%rcx adcq 8(%rsi),%r9 adcq 16(%rsi),%r10 movq %r11,%r12 adcq 24(%rsi),%r11 testq %r13,%r13 cmovzq %rax,%r8 cmovzq %rdx,%r9 movq %r8,0(%rdi) cmovzq %rcx,%r10 movq %r9,8(%rdi) cmovzq %r12,%r11 movq %r10,16(%rdi) movq %r11,24(%rdi) movq 0(%rsp),%r13 movq 8(%rsp),%r12 leaq 16(%rsp),%rsp L$neg_epilogue: ret .globl _ecp_nistz256_ord_mul_mont_nohw .private_extern _ecp_nistz256_ord_mul_mont_nohw .p2align 5 _ecp_nistz256_ord_mul_mont_nohw: _CET_ENDBR pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 L$ord_mul_body: movq 0(%rdx),%rax movq %rdx,%rbx leaq L$ord(%rip),%r14 movq L$ordK(%rip),%r15 movq %rax,%rcx mulq 0(%rsi) movq %rax,%r8 movq %rcx,%rax movq %rdx,%r9 mulq 8(%rsi) addq %rax,%r9 movq %rcx,%rax adcq $0,%rdx movq %rdx,%r10 mulq 16(%rsi) addq %rax,%r10 movq %rcx,%rax adcq $0,%rdx movq %r8,%r13 imulq %r15,%r8 movq %rdx,%r11 mulq 24(%rsi) addq %rax,%r11 movq %r8,%rax adcq $0,%rdx movq %rdx,%r12 mulq 0(%r14) movq %r8,%rbp addq %rax,%r13 movq %r8,%rax adcq $0,%rdx movq %rdx,%rcx subq %r8,%r10 sbbq $0,%r8 mulq 8(%r14) addq %rcx,%r9 adcq $0,%rdx addq %rax,%r9 movq %rbp,%rax adcq %rdx,%r10 movq %rbp,%rdx adcq $0,%r8 shlq $32,%rax shrq $32,%rdx subq %rax,%r11 movq 8(%rbx),%rax sbbq %rdx,%rbp addq %r8,%r11 adcq %rbp,%r12 adcq $0,%r13 movq %rax,%rcx mulq 0(%rsi) addq %rax,%r9 movq %rcx,%rax adcq $0,%rdx movq %rdx,%rbp mulq 8(%rsi) addq %rbp,%r10 adcq $0,%rdx addq %rax,%r10 movq %rcx,%rax adcq $0,%rdx movq %rdx,%rbp mulq 16(%rsi) addq %rbp,%r11 adcq $0,%rdx addq %rax,%r11 movq %rcx,%rax adcq $0,%rdx movq %r9,%rcx imulq %r15,%r9 movq %rdx,%rbp mulq 24(%rsi) addq %rbp,%r12 adcq $0,%rdx xorq %r8,%r8 addq %rax,%r12 movq %r9,%rax adcq %rdx,%r13 adcq $0,%r8 mulq 0(%r14) movq %r9,%rbp addq %rax,%rcx movq %r9,%rax adcq %rdx,%rcx subq %r9,%r11 sbbq $0,%r9 mulq 8(%r14) addq %rcx,%r10 adcq $0,%rdx addq %rax,%r10 movq %rbp,%rax adcq %rdx,%r11 movq %rbp,%rdx adcq $0,%r9 shlq $32,%rax shrq $32,%rdx subq %rax,%r12 movq 16(%rbx),%rax sbbq %rdx,%rbp addq %r9,%r12 adcq %rbp,%r13 adcq $0,%r8 movq %rax,%rcx mulq 0(%rsi) addq 
%rax,%r10 movq %rcx,%rax adcq $0,%rdx movq %rdx,%rbp mulq 8(%rsi) addq %rbp,%r11 adcq $0,%rdx addq %rax,%r11 movq %rcx,%rax adcq $0,%rdx movq %rdx,%rbp mulq 16(%rsi) addq %rbp,%r12 adcq $0,%rdx addq %rax,%r12 movq %rcx,%rax adcq $0,%rdx movq %r10,%rcx imulq %r15,%r10 movq %rdx,%rbp mulq 24(%rsi) addq %rbp,%r13 adcq $0,%rdx xorq %r9,%r9 addq %rax,%r13 movq %r10,%rax adcq %rdx,%r8 adcq $0,%r9 mulq 0(%r14) movq %r10,%rbp addq %rax,%rcx movq %r10,%rax adcq %rdx,%rcx subq %r10,%r12 sbbq $0,%r10 mulq 8(%r14) addq %rcx,%r11 adcq $0,%rdx addq %rax,%r11 movq %rbp,%rax adcq %rdx,%r12 movq %rbp,%rdx adcq $0,%r10 shlq $32,%rax shrq $32,%rdx subq %rax,%r13 movq 24(%rbx),%rax sbbq %rdx,%rbp addq %r10,%r13 adcq %rbp,%r8 adcq $0,%r9 movq %rax,%rcx mulq 0(%rsi) addq %rax,%r11 movq %rcx,%rax adcq $0,%rdx movq %rdx,%rbp mulq 8(%rsi) addq %rbp,%r12 adcq $0,%rdx addq %rax,%r12 movq %rcx,%rax adcq $0,%rdx movq %rdx,%rbp mulq 16(%rsi) addq %rbp,%r13 adcq $0,%rdx addq %rax,%r13 movq %rcx,%rax adcq $0,%rdx movq %r11,%rcx imulq %r15,%r11 movq %rdx,%rbp mulq 24(%rsi) addq %rbp,%r8 adcq $0,%rdx xorq %r10,%r10 addq %rax,%r8 movq %r11,%rax adcq %rdx,%r9 adcq $0,%r10 mulq 0(%r14) movq %r11,%rbp addq %rax,%rcx movq %r11,%rax adcq %rdx,%rcx subq %r11,%r13 sbbq $0,%r11 mulq 8(%r14) addq %rcx,%r12 adcq $0,%rdx addq %rax,%r12 movq %rbp,%rax adcq %rdx,%r13 movq %rbp,%rdx adcq $0,%r11 shlq $32,%rax shrq $32,%rdx subq %rax,%r8 sbbq %rdx,%rbp addq %r11,%r8 adcq %rbp,%r9 adcq $0,%r10 movq %r12,%rsi subq 0(%r14),%r12 movq %r13,%r11 sbbq 8(%r14),%r13 movq %r8,%rcx sbbq 16(%r14),%r8 movq %r9,%rbp sbbq 24(%r14),%r9 sbbq $0,%r10 cmovcq %rsi,%r12 cmovcq %r11,%r13 cmovcq %rcx,%r8 cmovcq %rbp,%r9 movq %r12,0(%rdi) movq %r13,8(%rdi) movq %r8,16(%rdi) movq %r9,24(%rdi) movq 0(%rsp),%r15 movq 8(%rsp),%r14 movq 16(%rsp),%r13 movq 24(%rsp),%r12 movq 32(%rsp),%rbx movq 40(%rsp),%rbp leaq 48(%rsp),%rsp L$ord_mul_epilogue: ret .globl _ecp_nistz256_ord_sqr_mont_nohw .private_extern _ecp_nistz256_ord_sqr_mont_nohw .p2align 5 _ecp_nistz256_ord_sqr_mont_nohw: _CET_ENDBR pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 L$ord_sqr_body: movq 0(%rsi),%r8 movq 8(%rsi),%rax movq 16(%rsi),%r14 movq 24(%rsi),%r15 leaq L$ord(%rip),%rsi movq %rdx,%rbx jmp L$oop_ord_sqr .p2align 5 L$oop_ord_sqr: movq %rax,%rbp mulq %r8 movq %rax,%r9 .byte 102,72,15,110,205 movq %r14,%rax movq %rdx,%r10 mulq %r8 addq %rax,%r10 movq %r15,%rax .byte 102,73,15,110,214 adcq $0,%rdx movq %rdx,%r11 mulq %r8 addq %rax,%r11 movq %r15,%rax .byte 102,73,15,110,223 adcq $0,%rdx movq %rdx,%r12 mulq %r14 movq %rax,%r13 movq %r14,%rax movq %rdx,%r14 mulq %rbp addq %rax,%r11 movq %r15,%rax adcq $0,%rdx movq %rdx,%r15 mulq %rbp addq %rax,%r12 adcq $0,%rdx addq %r15,%r12 adcq %rdx,%r13 adcq $0,%r14 xorq %r15,%r15 movq %r8,%rax addq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 adcq %r12,%r12 adcq %r13,%r13 adcq %r14,%r14 adcq $0,%r15 mulq %rax movq %rax,%r8 .byte 102,72,15,126,200 movq %rdx,%rbp mulq %rax addq %rbp,%r9 adcq %rax,%r10 .byte 102,72,15,126,208 adcq $0,%rdx movq %rdx,%rbp mulq %rax addq %rbp,%r11 adcq %rax,%r12 .byte 102,72,15,126,216 adcq $0,%rdx movq %rdx,%rbp movq %r8,%rcx imulq 32(%rsi),%r8 mulq %rax addq %rbp,%r13 adcq %rax,%r14 movq 0(%rsi),%rax adcq %rdx,%r15 mulq %r8 movq %r8,%rbp addq %rax,%rcx movq 8(%rsi),%rax adcq %rdx,%rcx subq %r8,%r10 sbbq $0,%rbp mulq %r8 addq %rcx,%r9 adcq $0,%rdx addq %rax,%r9 movq %r8,%rax adcq %rdx,%r10 movq %r8,%rdx adcq $0,%rbp movq %r9,%rcx imulq 32(%rsi),%r9 shlq $32,%rax shrq $32,%rdx subq %rax,%r11 movq 0(%rsi),%rax sbbq %rdx,%r8 
addq %rbp,%r11 adcq $0,%r8 mulq %r9 movq %r9,%rbp addq %rax,%rcx movq 8(%rsi),%rax adcq %rdx,%rcx subq %r9,%r11 sbbq $0,%rbp mulq %r9 addq %rcx,%r10 adcq $0,%rdx addq %rax,%r10 movq %r9,%rax adcq %rdx,%r11 movq %r9,%rdx adcq $0,%rbp movq %r10,%rcx imulq 32(%rsi),%r10 shlq $32,%rax shrq $32,%rdx subq %rax,%r8 movq 0(%rsi),%rax sbbq %rdx,%r9 addq %rbp,%r8 adcq $0,%r9 mulq %r10 movq %r10,%rbp addq %rax,%rcx movq 8(%rsi),%rax adcq %rdx,%rcx subq %r10,%r8 sbbq $0,%rbp mulq %r10 addq %rcx,%r11 adcq $0,%rdx addq %rax,%r11 movq %r10,%rax adcq %rdx,%r8 movq %r10,%rdx adcq $0,%rbp movq %r11,%rcx imulq 32(%rsi),%r11 shlq $32,%rax shrq $32,%rdx subq %rax,%r9 movq 0(%rsi),%rax sbbq %rdx,%r10 addq %rbp,%r9 adcq $0,%r10 mulq %r11 movq %r11,%rbp addq %rax,%rcx movq 8(%rsi),%rax adcq %rdx,%rcx subq %r11,%r9 sbbq $0,%rbp mulq %r11 addq %rcx,%r8 adcq $0,%rdx addq %rax,%r8 movq %r11,%rax adcq %rdx,%r9 movq %r11,%rdx adcq $0,%rbp shlq $32,%rax shrq $32,%rdx subq %rax,%r10 sbbq %rdx,%r11 addq %rbp,%r10 adcq $0,%r11 xorq %rdx,%rdx addq %r12,%r8 adcq %r13,%r9 movq %r8,%r12 adcq %r14,%r10 adcq %r15,%r11 movq %r9,%rax adcq $0,%rdx subq 0(%rsi),%r8 movq %r10,%r14 sbbq 8(%rsi),%r9 sbbq 16(%rsi),%r10 movq %r11,%r15 sbbq 24(%rsi),%r11 sbbq $0,%rdx cmovcq %r12,%r8 cmovncq %r9,%rax cmovncq %r10,%r14 cmovncq %r11,%r15 decq %rbx jnz L$oop_ord_sqr movq %r8,0(%rdi) movq %rax,8(%rdi) pxor %xmm1,%xmm1 movq %r14,16(%rdi) pxor %xmm2,%xmm2 movq %r15,24(%rdi) pxor %xmm3,%xmm3 movq 0(%rsp),%r15 movq 8(%rsp),%r14 movq 16(%rsp),%r13 movq 24(%rsp),%r12 movq 32(%rsp),%rbx movq 40(%rsp),%rbp leaq 48(%rsp),%rsp L$ord_sqr_epilogue: ret .globl _ecp_nistz256_ord_mul_mont_adx .private_extern _ecp_nistz256_ord_mul_mont_adx .p2align 5 _ecp_nistz256_ord_mul_mont_adx: L$ecp_nistz256_ord_mul_mont_adx: _CET_ENDBR pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 L$ord_mulx_body: movq %rdx,%rbx movq 0(%rdx),%rdx movq 0(%rsi),%r9 movq 8(%rsi),%r10 movq 16(%rsi),%r11 movq 24(%rsi),%r12 leaq -128(%rsi),%rsi leaq L$ord-128(%rip),%r14 movq L$ordK(%rip),%r15 mulxq %r9,%r8,%r9 mulxq %r10,%rcx,%r10 mulxq %r11,%rbp,%r11 addq %rcx,%r9 mulxq %r12,%rcx,%r12 movq %r8,%rdx mulxq %r15,%rdx,%rax adcq %rbp,%r10 adcq %rcx,%r11 adcq $0,%r12 xorq %r13,%r13 mulxq 0+128(%r14),%rcx,%rbp adcxq %rcx,%r8 adoxq %rbp,%r9 mulxq 8+128(%r14),%rcx,%rbp adcxq %rcx,%r9 adoxq %rbp,%r10 mulxq 16+128(%r14),%rcx,%rbp adcxq %rcx,%r10 adoxq %rbp,%r11 mulxq 24+128(%r14),%rcx,%rbp movq 8(%rbx),%rdx adcxq %rcx,%r11 adoxq %rbp,%r12 adcxq %r8,%r12 adoxq %r8,%r13 adcq $0,%r13 mulxq 0+128(%rsi),%rcx,%rbp adcxq %rcx,%r9 adoxq %rbp,%r10 mulxq 8+128(%rsi),%rcx,%rbp adcxq %rcx,%r10 adoxq %rbp,%r11 mulxq 16+128(%rsi),%rcx,%rbp adcxq %rcx,%r11 adoxq %rbp,%r12 mulxq 24+128(%rsi),%rcx,%rbp movq %r9,%rdx mulxq %r15,%rdx,%rax adcxq %rcx,%r12 adoxq %rbp,%r13 adcxq %r8,%r13 adoxq %r8,%r8 adcq $0,%r8 mulxq 0+128(%r14),%rcx,%rbp adcxq %rcx,%r9 adoxq %rbp,%r10 mulxq 8+128(%r14),%rcx,%rbp adcxq %rcx,%r10 adoxq %rbp,%r11 mulxq 16+128(%r14),%rcx,%rbp adcxq %rcx,%r11 adoxq %rbp,%r12 mulxq 24+128(%r14),%rcx,%rbp movq 16(%rbx),%rdx adcxq %rcx,%r12 adoxq %rbp,%r13 adcxq %r9,%r13 adoxq %r9,%r8 adcq $0,%r8 mulxq 0+128(%rsi),%rcx,%rbp adcxq %rcx,%r10 adoxq %rbp,%r11 mulxq 8+128(%rsi),%rcx,%rbp adcxq %rcx,%r11 adoxq %rbp,%r12 mulxq 16+128(%rsi),%rcx,%rbp adcxq %rcx,%r12 adoxq %rbp,%r13 mulxq 24+128(%rsi),%rcx,%rbp movq %r10,%rdx mulxq %r15,%rdx,%rax adcxq %rcx,%r13 adoxq %rbp,%r8 adcxq %r9,%r8 adoxq %r9,%r9 adcq $0,%r9 mulxq 0+128(%r14),%rcx,%rbp adcxq %rcx,%r10 adoxq %rbp,%r11 mulxq 
8+128(%r14),%rcx,%rbp adcxq %rcx,%r11 adoxq %rbp,%r12 mulxq 16+128(%r14),%rcx,%rbp adcxq %rcx,%r12 adoxq %rbp,%r13 mulxq 24+128(%r14),%rcx,%rbp movq 24(%rbx),%rdx adcxq %rcx,%r13 adoxq %rbp,%r8 adcxq %r10,%r8 adoxq %r10,%r9 adcq $0,%r9 mulxq 0+128(%rsi),%rcx,%rbp adcxq %rcx,%r11 adoxq %rbp,%r12 mulxq 8+128(%rsi),%rcx,%rbp adcxq %rcx,%r12 adoxq %rbp,%r13 mulxq 16+128(%rsi),%rcx,%rbp adcxq %rcx,%r13 adoxq %rbp,%r8 mulxq 24+128(%rsi),%rcx,%rbp movq %r11,%rdx mulxq %r15,%rdx,%rax adcxq %rcx,%r8 adoxq %rbp,%r9 adcxq %r10,%r9 adoxq %r10,%r10 adcq $0,%r10 mulxq 0+128(%r14),%rcx,%rbp adcxq %rcx,%r11 adoxq %rbp,%r12 mulxq 8+128(%r14),%rcx,%rbp adcxq %rcx,%r12 adoxq %rbp,%r13 mulxq 16+128(%r14),%rcx,%rbp adcxq %rcx,%r13 adoxq %rbp,%r8 mulxq 24+128(%r14),%rcx,%rbp leaq 128(%r14),%r14 movq %r12,%rbx adcxq %rcx,%r8 adoxq %rbp,%r9 movq %r13,%rdx adcxq %r11,%r9 adoxq %r11,%r10 adcq $0,%r10 movq %r8,%rcx subq 0(%r14),%r12 sbbq 8(%r14),%r13 sbbq 16(%r14),%r8 movq %r9,%rbp sbbq 24(%r14),%r9 sbbq $0,%r10 cmovcq %rbx,%r12 cmovcq %rdx,%r13 cmovcq %rcx,%r8 cmovcq %rbp,%r9 movq %r12,0(%rdi) movq %r13,8(%rdi) movq %r8,16(%rdi) movq %r9,24(%rdi) movq 0(%rsp),%r15 movq 8(%rsp),%r14 movq 16(%rsp),%r13 movq 24(%rsp),%r12 movq 32(%rsp),%rbx movq 40(%rsp),%rbp leaq 48(%rsp),%rsp L$ord_mulx_epilogue: ret .globl _ecp_nistz256_ord_sqr_mont_adx .private_extern _ecp_nistz256_ord_sqr_mont_adx .p2align 5 _ecp_nistz256_ord_sqr_mont_adx: _CET_ENDBR L$ecp_nistz256_ord_sqr_mont_adx: pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 L$ord_sqrx_body: movq %rdx,%rbx movq 0(%rsi),%rdx movq 8(%rsi),%r14 movq 16(%rsi),%r15 movq 24(%rsi),%r8 leaq L$ord(%rip),%rsi jmp L$oop_ord_sqrx .p2align 5 L$oop_ord_sqrx: mulxq %r14,%r9,%r10 mulxq %r15,%rcx,%r11 movq %rdx,%rax .byte 102,73,15,110,206 mulxq %r8,%rbp,%r12 movq %r14,%rdx addq %rcx,%r10 .byte 102,73,15,110,215 adcq %rbp,%r11 adcq $0,%r12 xorq %r13,%r13 mulxq %r15,%rcx,%rbp adcxq %rcx,%r11 adoxq %rbp,%r12 mulxq %r8,%rcx,%rbp movq %r15,%rdx adcxq %rcx,%r12 adoxq %rbp,%r13 adcq $0,%r13 mulxq %r8,%rcx,%r14 movq %rax,%rdx .byte 102,73,15,110,216 xorq %r15,%r15 adcxq %r9,%r9 adoxq %rcx,%r13 adcxq %r10,%r10 adoxq %r15,%r14 mulxq %rdx,%r8,%rbp .byte 102,72,15,126,202 adcxq %r11,%r11 adoxq %rbp,%r9 adcxq %r12,%r12 mulxq %rdx,%rcx,%rax .byte 102,72,15,126,210 adcxq %r13,%r13 adoxq %rcx,%r10 adcxq %r14,%r14 mulxq %rdx,%rcx,%rbp .byte 0x67 .byte 102,72,15,126,218 adoxq %rax,%r11 adcxq %r15,%r15 adoxq %rcx,%r12 adoxq %rbp,%r13 mulxq %rdx,%rcx,%rax adoxq %rcx,%r14 adoxq %rax,%r15 movq %r8,%rdx mulxq 32(%rsi),%rdx,%rcx xorq %rax,%rax mulxq 0(%rsi),%rcx,%rbp adcxq %rcx,%r8 adoxq %rbp,%r9 mulxq 8(%rsi),%rcx,%rbp adcxq %rcx,%r9 adoxq %rbp,%r10 mulxq 16(%rsi),%rcx,%rbp adcxq %rcx,%r10 adoxq %rbp,%r11 mulxq 24(%rsi),%rcx,%rbp adcxq %rcx,%r11 adoxq %rbp,%r8 adcxq %rax,%r8 movq %r9,%rdx mulxq 32(%rsi),%rdx,%rcx mulxq 0(%rsi),%rcx,%rbp adoxq %rcx,%r9 adcxq %rbp,%r10 mulxq 8(%rsi),%rcx,%rbp adoxq %rcx,%r10 adcxq %rbp,%r11 mulxq 16(%rsi),%rcx,%rbp adoxq %rcx,%r11 adcxq %rbp,%r8 mulxq 24(%rsi),%rcx,%rbp adoxq %rcx,%r8 adcxq %rbp,%r9 adoxq %rax,%r9 movq %r10,%rdx mulxq 32(%rsi),%rdx,%rcx mulxq 0(%rsi),%rcx,%rbp adcxq %rcx,%r10 adoxq %rbp,%r11 mulxq 8(%rsi),%rcx,%rbp adcxq %rcx,%r11 adoxq %rbp,%r8 mulxq 16(%rsi),%rcx,%rbp adcxq %rcx,%r8 adoxq %rbp,%r9 mulxq 24(%rsi),%rcx,%rbp adcxq %rcx,%r9 adoxq %rbp,%r10 adcxq %rax,%r10 movq %r11,%rdx mulxq 32(%rsi),%rdx,%rcx mulxq 0(%rsi),%rcx,%rbp adoxq %rcx,%r11 adcxq %rbp,%r8 mulxq 8(%rsi),%rcx,%rbp adoxq %rcx,%r8 adcxq %rbp,%r9 mulxq 16(%rsi),%rcx,%rbp adoxq 
%rcx,%r9 adcxq %rbp,%r10 mulxq 24(%rsi),%rcx,%rbp adoxq %rcx,%r10 adcxq %rbp,%r11 adoxq %rax,%r11 addq %r8,%r12 adcq %r13,%r9 movq %r12,%rdx adcq %r14,%r10 adcq %r15,%r11 movq %r9,%r14 adcq $0,%rax subq 0(%rsi),%r12 movq %r10,%r15 sbbq 8(%rsi),%r9 sbbq 16(%rsi),%r10 movq %r11,%r8 sbbq 24(%rsi),%r11 sbbq $0,%rax cmovncq %r12,%rdx cmovncq %r9,%r14 cmovncq %r10,%r15 cmovncq %r11,%r8 decq %rbx jnz L$oop_ord_sqrx movq %rdx,0(%rdi) movq %r14,8(%rdi) pxor %xmm1,%xmm1 movq %r15,16(%rdi) pxor %xmm2,%xmm2 movq %r8,24(%rdi) pxor %xmm3,%xmm3 movq 0(%rsp),%r15 movq 8(%rsp),%r14 movq 16(%rsp),%r13 movq 24(%rsp),%r12 movq 32(%rsp),%rbx movq 40(%rsp),%rbp leaq 48(%rsp),%rsp L$ord_sqrx_epilogue: ret .globl _ecp_nistz256_mul_mont_nohw .private_extern _ecp_nistz256_mul_mont_nohw .p2align 5 _ecp_nistz256_mul_mont_nohw: _CET_ENDBR pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 L$mul_body: movq %rdx,%rbx movq 0(%rdx),%rax movq 0(%rsi),%r9 movq 8(%rsi),%r10 movq 16(%rsi),%r11 movq 24(%rsi),%r12 call __ecp_nistz256_mul_montq movq 0(%rsp),%r15 movq 8(%rsp),%r14 movq 16(%rsp),%r13 movq 24(%rsp),%r12 movq 32(%rsp),%rbx movq 40(%rsp),%rbp leaq 48(%rsp),%rsp L$mul_epilogue: ret .p2align 5 __ecp_nistz256_mul_montq: movq %rax,%rbp mulq %r9 movq L$poly+8(%rip),%r14 movq %rax,%r8 movq %rbp,%rax movq %rdx,%r9 mulq %r10 movq L$poly+24(%rip),%r15 addq %rax,%r9 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r10 mulq %r11 addq %rax,%r10 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r11 mulq %r12 addq %rax,%r11 movq %r8,%rax adcq $0,%rdx xorq %r13,%r13 movq %rdx,%r12 movq %r8,%rbp shlq $32,%r8 mulq %r15 shrq $32,%rbp addq %r8,%r9 adcq %rbp,%r10 adcq %rax,%r11 movq 8(%rbx),%rax adcq %rdx,%r12 adcq $0,%r13 xorq %r8,%r8 movq %rax,%rbp mulq 0(%rsi) addq %rax,%r9 movq %rbp,%rax adcq $0,%rdx movq %rdx,%rcx mulq 8(%rsi) addq %rcx,%r10 adcq $0,%rdx addq %rax,%r10 movq %rbp,%rax adcq $0,%rdx movq %rdx,%rcx mulq 16(%rsi) addq %rcx,%r11 adcq $0,%rdx addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx movq %rdx,%rcx mulq 24(%rsi) addq %rcx,%r12 adcq $0,%rdx addq %rax,%r12 movq %r9,%rax adcq %rdx,%r13 adcq $0,%r8 movq %r9,%rbp shlq $32,%r9 mulq %r15 shrq $32,%rbp addq %r9,%r10 adcq %rbp,%r11 adcq %rax,%r12 movq 16(%rbx),%rax adcq %rdx,%r13 adcq $0,%r8 xorq %r9,%r9 movq %rax,%rbp mulq 0(%rsi) addq %rax,%r10 movq %rbp,%rax adcq $0,%rdx movq %rdx,%rcx mulq 8(%rsi) addq %rcx,%r11 adcq $0,%rdx addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx movq %rdx,%rcx mulq 16(%rsi) addq %rcx,%r12 adcq $0,%rdx addq %rax,%r12 movq %rbp,%rax adcq $0,%rdx movq %rdx,%rcx mulq 24(%rsi) addq %rcx,%r13 adcq $0,%rdx addq %rax,%r13 movq %r10,%rax adcq %rdx,%r8 adcq $0,%r9 movq %r10,%rbp shlq $32,%r10 mulq %r15 shrq $32,%rbp addq %r10,%r11 adcq %rbp,%r12 adcq %rax,%r13 movq 24(%rbx),%rax adcq %rdx,%r8 adcq $0,%r9 xorq %r10,%r10 movq %rax,%rbp mulq 0(%rsi) addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx movq %rdx,%rcx mulq 8(%rsi) addq %rcx,%r12 adcq $0,%rdx addq %rax,%r12 movq %rbp,%rax adcq $0,%rdx movq %rdx,%rcx mulq 16(%rsi) addq %rcx,%r13 adcq $0,%rdx addq %rax,%r13 movq %rbp,%rax adcq $0,%rdx movq %rdx,%rcx mulq 24(%rsi) addq %rcx,%r8 adcq $0,%rdx addq %rax,%r8 movq %r11,%rax adcq %rdx,%r9 adcq $0,%r10 movq %r11,%rbp shlq $32,%r11 mulq %r15 shrq $32,%rbp addq %r11,%r12 adcq %rbp,%r13 movq %r12,%rcx adcq %rax,%r8 adcq %rdx,%r9 movq %r13,%rbp adcq $0,%r10 subq $-1,%r12 movq %r8,%rbx sbbq %r14,%r13 sbbq $0,%r8 movq %r9,%rdx sbbq %r15,%r9 sbbq $0,%r10 cmovcq %rcx,%r12 cmovcq %rbp,%r13 movq %r12,0(%rdi) cmovcq %rbx,%r8 movq %r13,8(%rdi) cmovcq %rdx,%r9 movq %r8,16(%rdi) movq 
%r9,24(%rdi) ret .globl _ecp_nistz256_sqr_mont_nohw .private_extern _ecp_nistz256_sqr_mont_nohw .p2align 5 _ecp_nistz256_sqr_mont_nohw: _CET_ENDBR pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 L$sqr_body: movq 0(%rsi),%rax movq 8(%rsi),%r14 movq 16(%rsi),%r15 movq 24(%rsi),%r8 call __ecp_nistz256_sqr_montq movq 0(%rsp),%r15 movq 8(%rsp),%r14 movq 16(%rsp),%r13 movq 24(%rsp),%r12 movq 32(%rsp),%rbx movq 40(%rsp),%rbp leaq 48(%rsp),%rsp L$sqr_epilogue: ret .p2align 5 __ecp_nistz256_sqr_montq: movq %rax,%r13 mulq %r14 movq %rax,%r9 movq %r15,%rax movq %rdx,%r10 mulq %r13 addq %rax,%r10 movq %r8,%rax adcq $0,%rdx movq %rdx,%r11 mulq %r13 addq %rax,%r11 movq %r15,%rax adcq $0,%rdx movq %rdx,%r12 mulq %r14 addq %rax,%r11 movq %r8,%rax adcq $0,%rdx movq %rdx,%rbp mulq %r14 addq %rax,%r12 movq %r8,%rax adcq $0,%rdx addq %rbp,%r12 movq %rdx,%r13 adcq $0,%r13 mulq %r15 xorq %r15,%r15 addq %rax,%r13 movq 0(%rsi),%rax movq %rdx,%r14 adcq $0,%r14 addq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 adcq %r12,%r12 adcq %r13,%r13 adcq %r14,%r14 adcq $0,%r15 mulq %rax movq %rax,%r8 movq 8(%rsi),%rax movq %rdx,%rcx mulq %rax addq %rcx,%r9 adcq %rax,%r10 movq 16(%rsi),%rax adcq $0,%rdx movq %rdx,%rcx mulq %rax addq %rcx,%r11 adcq %rax,%r12 movq 24(%rsi),%rax adcq $0,%rdx movq %rdx,%rcx mulq %rax addq %rcx,%r13 adcq %rax,%r14 movq %r8,%rax adcq %rdx,%r15 movq L$poly+8(%rip),%rsi movq L$poly+24(%rip),%rbp movq %r8,%rcx shlq $32,%r8 mulq %rbp shrq $32,%rcx addq %r8,%r9 adcq %rcx,%r10 adcq %rax,%r11 movq %r9,%rax adcq $0,%rdx movq %r9,%rcx shlq $32,%r9 movq %rdx,%r8 mulq %rbp shrq $32,%rcx addq %r9,%r10 adcq %rcx,%r11 adcq %rax,%r8 movq %r10,%rax adcq $0,%rdx movq %r10,%rcx shlq $32,%r10 movq %rdx,%r9 mulq %rbp shrq $32,%rcx addq %r10,%r11 adcq %rcx,%r8 adcq %rax,%r9 movq %r11,%rax adcq $0,%rdx movq %r11,%rcx shlq $32,%r11 movq %rdx,%r10 mulq %rbp shrq $32,%rcx addq %r11,%r8 adcq %rcx,%r9 adcq %rax,%r10 adcq $0,%rdx xorq %r11,%r11 addq %r8,%r12 adcq %r9,%r13 movq %r12,%r8 adcq %r10,%r14 adcq %rdx,%r15 movq %r13,%r9 adcq $0,%r11 subq $-1,%r12 movq %r14,%r10 sbbq %rsi,%r13 sbbq $0,%r14 movq %r15,%rcx sbbq %rbp,%r15 sbbq $0,%r11 cmovcq %r8,%r12 cmovcq %r9,%r13 movq %r12,0(%rdi) cmovcq %r10,%r14 movq %r13,8(%rdi) cmovcq %rcx,%r15 movq %r14,16(%rdi) movq %r15,24(%rdi) ret .globl _ecp_nistz256_mul_mont_adx .private_extern _ecp_nistz256_mul_mont_adx .p2align 5 _ecp_nistz256_mul_mont_adx: _CET_ENDBR pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 L$mulx_body: movq %rdx,%rbx movq 0(%rdx),%rdx movq 0(%rsi),%r9 movq 8(%rsi),%r10 movq 16(%rsi),%r11 movq 24(%rsi),%r12 leaq -128(%rsi),%rsi call __ecp_nistz256_mul_montx movq 0(%rsp),%r15 movq 8(%rsp),%r14 movq 16(%rsp),%r13 movq 24(%rsp),%r12 movq 32(%rsp),%rbx movq 40(%rsp),%rbp leaq 48(%rsp),%rsp L$mulx_epilogue: ret .p2align 5 __ecp_nistz256_mul_montx: mulxq %r9,%r8,%r9 mulxq %r10,%rcx,%r10 movq $32,%r14 xorq %r13,%r13 mulxq %r11,%rbp,%r11 movq L$poly+24(%rip),%r15 adcq %rcx,%r9 mulxq %r12,%rcx,%r12 movq %r8,%rdx adcq %rbp,%r10 shlxq %r14,%r8,%rbp adcq %rcx,%r11 shrxq %r14,%r8,%rcx adcq $0,%r12 addq %rbp,%r9 adcq %rcx,%r10 mulxq %r15,%rcx,%rbp movq 8(%rbx),%rdx adcq %rcx,%r11 adcq %rbp,%r12 adcq $0,%r13 xorq %r8,%r8 mulxq 0+128(%rsi),%rcx,%rbp adcxq %rcx,%r9 adoxq %rbp,%r10 mulxq 8+128(%rsi),%rcx,%rbp adcxq %rcx,%r10 adoxq %rbp,%r11 mulxq 16+128(%rsi),%rcx,%rbp adcxq %rcx,%r11 adoxq %rbp,%r12 mulxq 24+128(%rsi),%rcx,%rbp movq %r9,%rdx adcxq %rcx,%r12 shlxq %r14,%r9,%rcx adoxq %rbp,%r13 shrxq %r14,%r9,%rbp adcxq %r8,%r13 adoxq %r8,%r8 adcq $0,%r8 
addq %rcx,%r10 adcq %rbp,%r11 mulxq %r15,%rcx,%rbp movq 16(%rbx),%rdx adcq %rcx,%r12 adcq %rbp,%r13 adcq $0,%r8 xorq %r9,%r9 mulxq 0+128(%rsi),%rcx,%rbp adcxq %rcx,%r10 adoxq %rbp,%r11 mulxq 8+128(%rsi),%rcx,%rbp adcxq %rcx,%r11 adoxq %rbp,%r12 mulxq 16+128(%rsi),%rcx,%rbp adcxq %rcx,%r12 adoxq %rbp,%r13 mulxq 24+128(%rsi),%rcx,%rbp movq %r10,%rdx adcxq %rcx,%r13 shlxq %r14,%r10,%rcx adoxq %rbp,%r8 shrxq %r14,%r10,%rbp adcxq %r9,%r8 adoxq %r9,%r9 adcq $0,%r9 addq %rcx,%r11 adcq %rbp,%r12 mulxq %r15,%rcx,%rbp movq 24(%rbx),%rdx adcq %rcx,%r13 adcq %rbp,%r8 adcq $0,%r9 xorq %r10,%r10 mulxq 0+128(%rsi),%rcx,%rbp adcxq %rcx,%r11 adoxq %rbp,%r12 mulxq 8+128(%rsi),%rcx,%rbp adcxq %rcx,%r12 adoxq %rbp,%r13 mulxq 16+128(%rsi),%rcx,%rbp adcxq %rcx,%r13 adoxq %rbp,%r8 mulxq 24+128(%rsi),%rcx,%rbp movq %r11,%rdx adcxq %rcx,%r8 shlxq %r14,%r11,%rcx adoxq %rbp,%r9 shrxq %r14,%r11,%rbp adcxq %r10,%r9 adoxq %r10,%r10 adcq $0,%r10 addq %rcx,%r12 adcq %rbp,%r13 mulxq %r15,%rcx,%rbp movq %r12,%rbx movq L$poly+8(%rip),%r14 adcq %rcx,%r8 movq %r13,%rdx adcq %rbp,%r9 adcq $0,%r10 xorl %eax,%eax movq %r8,%rcx sbbq $-1,%r12 sbbq %r14,%r13 sbbq $0,%r8 movq %r9,%rbp sbbq %r15,%r9 sbbq $0,%r10 cmovcq %rbx,%r12 cmovcq %rdx,%r13 movq %r12,0(%rdi) cmovcq %rcx,%r8 movq %r13,8(%rdi) cmovcq %rbp,%r9 movq %r8,16(%rdi) movq %r9,24(%rdi) ret .globl _ecp_nistz256_sqr_mont_adx .private_extern _ecp_nistz256_sqr_mont_adx .p2align 5 _ecp_nistz256_sqr_mont_adx: _CET_ENDBR pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 L$sqrx_body: movq 0(%rsi),%rdx movq 8(%rsi),%r14 movq 16(%rsi),%r15 movq 24(%rsi),%r8 leaq -128(%rsi),%rsi call __ecp_nistz256_sqr_montx movq 0(%rsp),%r15 movq 8(%rsp),%r14 movq 16(%rsp),%r13 movq 24(%rsp),%r12 movq 32(%rsp),%rbx movq 40(%rsp),%rbp leaq 48(%rsp),%rsp L$sqrx_epilogue: ret .p2align 5 __ecp_nistz256_sqr_montx: mulxq %r14,%r9,%r10 mulxq %r15,%rcx,%r11 xorl %eax,%eax adcq %rcx,%r10 mulxq %r8,%rbp,%r12 movq %r14,%rdx adcq %rbp,%r11 adcq $0,%r12 xorq %r13,%r13 mulxq %r15,%rcx,%rbp adcxq %rcx,%r11 adoxq %rbp,%r12 mulxq %r8,%rcx,%rbp movq %r15,%rdx adcxq %rcx,%r12 adoxq %rbp,%r13 adcq $0,%r13 mulxq %r8,%rcx,%r14 movq 0+128(%rsi),%rdx xorq %r15,%r15 adcxq %r9,%r9 adoxq %rcx,%r13 adcxq %r10,%r10 adoxq %r15,%r14 mulxq %rdx,%r8,%rbp movq 8+128(%rsi),%rdx adcxq %r11,%r11 adoxq %rbp,%r9 adcxq %r12,%r12 mulxq %rdx,%rcx,%rax movq 16+128(%rsi),%rdx adcxq %r13,%r13 adoxq %rcx,%r10 adcxq %r14,%r14 .byte 0x67 mulxq %rdx,%rcx,%rbp movq 24+128(%rsi),%rdx adoxq %rax,%r11 adcxq %r15,%r15 adoxq %rcx,%r12 movq $32,%rsi adoxq %rbp,%r13 .byte 0x67,0x67 mulxq %rdx,%rcx,%rax movq L$poly+24(%rip),%rdx adoxq %rcx,%r14 shlxq %rsi,%r8,%rcx adoxq %rax,%r15 shrxq %rsi,%r8,%rax movq %rdx,%rbp addq %rcx,%r9 adcq %rax,%r10 mulxq %r8,%rcx,%r8 adcq %rcx,%r11 shlxq %rsi,%r9,%rcx adcq $0,%r8 shrxq %rsi,%r9,%rax addq %rcx,%r10 adcq %rax,%r11 mulxq %r9,%rcx,%r9 adcq %rcx,%r8 shlxq %rsi,%r10,%rcx adcq $0,%r9 shrxq %rsi,%r10,%rax addq %rcx,%r11 adcq %rax,%r8 mulxq %r10,%rcx,%r10 adcq %rcx,%r9 shlxq %rsi,%r11,%rcx adcq $0,%r10 shrxq %rsi,%r11,%rax addq %rcx,%r8 adcq %rax,%r9 mulxq %r11,%rcx,%r11 adcq %rcx,%r10 adcq $0,%r11 xorq %rdx,%rdx addq %r8,%r12 movq L$poly+8(%rip),%rsi adcq %r9,%r13 movq %r12,%r8 adcq %r10,%r14 adcq %r11,%r15 movq %r13,%r9 adcq $0,%rdx subq $-1,%r12 movq %r14,%r10 sbbq %rsi,%r13 sbbq $0,%r14 movq %r15,%r11 sbbq %rbp,%r15 sbbq $0,%rdx cmovcq %r8,%r12 cmovcq %r9,%r13 movq %r12,0(%rdi) cmovcq %r10,%r14 movq %r13,8(%rdi) cmovcq %r11,%r15 movq %r14,16(%rdi) movq %r15,24(%rdi) ret .globl 
_ecp_nistz256_select_w5_nohw .private_extern _ecp_nistz256_select_w5_nohw .p2align 5 _ecp_nistz256_select_w5_nohw: _CET_ENDBR movdqa L$One(%rip),%xmm0 movd %edx,%xmm1 pxor %xmm2,%xmm2 pxor %xmm3,%xmm3 pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 pxor %xmm6,%xmm6 pxor %xmm7,%xmm7 movdqa %xmm0,%xmm8 pshufd $0,%xmm1,%xmm1 movq $16,%rax L$select_loop_sse_w5: movdqa %xmm8,%xmm15 paddd %xmm0,%xmm8 pcmpeqd %xmm1,%xmm15 movdqa 0(%rsi),%xmm9 movdqa 16(%rsi),%xmm10 movdqa 32(%rsi),%xmm11 movdqa 48(%rsi),%xmm12 movdqa 64(%rsi),%xmm13 movdqa 80(%rsi),%xmm14 leaq 96(%rsi),%rsi pand %xmm15,%xmm9 pand %xmm15,%xmm10 por %xmm9,%xmm2 pand %xmm15,%xmm11 por %xmm10,%xmm3 pand %xmm15,%xmm12 por %xmm11,%xmm4 pand %xmm15,%xmm13 por %xmm12,%xmm5 pand %xmm15,%xmm14 por %xmm13,%xmm6 por %xmm14,%xmm7 decq %rax jnz L$select_loop_sse_w5 movdqu %xmm2,0(%rdi) movdqu %xmm3,16(%rdi) movdqu %xmm4,32(%rdi) movdqu %xmm5,48(%rdi) movdqu %xmm6,64(%rdi) movdqu %xmm7,80(%rdi) ret L$SEH_end_ecp_nistz256_select_w5_nohw: .globl _ecp_nistz256_select_w7_nohw .private_extern _ecp_nistz256_select_w7_nohw .p2align 5 _ecp_nistz256_select_w7_nohw: _CET_ENDBR movdqa L$One(%rip),%xmm8 movd %edx,%xmm1 pxor %xmm2,%xmm2 pxor %xmm3,%xmm3 pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 movdqa %xmm8,%xmm0 pshufd $0,%xmm1,%xmm1 movq $64,%rax L$select_loop_sse_w7: movdqa %xmm8,%xmm15 paddd %xmm0,%xmm8 movdqa 0(%rsi),%xmm9 movdqa 16(%rsi),%xmm10 pcmpeqd %xmm1,%xmm15 movdqa 32(%rsi),%xmm11 movdqa 48(%rsi),%xmm12 leaq 64(%rsi),%rsi pand %xmm15,%xmm9 pand %xmm15,%xmm10 por %xmm9,%xmm2 pand %xmm15,%xmm11 por %xmm10,%xmm3 pand %xmm15,%xmm12 por %xmm11,%xmm4 prefetcht0 255(%rsi) por %xmm12,%xmm5 decq %rax jnz L$select_loop_sse_w7 movdqu %xmm2,0(%rdi) movdqu %xmm3,16(%rdi) movdqu %xmm4,32(%rdi) movdqu %xmm5,48(%rdi) ret L$SEH_end_ecp_nistz256_select_w7_nohw: .globl _ecp_nistz256_select_w5_avx2 .private_extern _ecp_nistz256_select_w5_avx2 .p2align 5 _ecp_nistz256_select_w5_avx2: _CET_ENDBR vzeroupper vmovdqa L$Two(%rip),%ymm0 vpxor %ymm2,%ymm2,%ymm2 vpxor %ymm3,%ymm3,%ymm3 vpxor %ymm4,%ymm4,%ymm4 vmovdqa L$One(%rip),%ymm5 vmovdqa L$Two(%rip),%ymm10 vmovd %edx,%xmm1 vpermd %ymm1,%ymm2,%ymm1 movq $8,%rax L$select_loop_avx2_w5: vmovdqa 0(%rsi),%ymm6 vmovdqa 32(%rsi),%ymm7 vmovdqa 64(%rsi),%ymm8 vmovdqa 96(%rsi),%ymm11 vmovdqa 128(%rsi),%ymm12 vmovdqa 160(%rsi),%ymm13 vpcmpeqd %ymm1,%ymm5,%ymm9 vpcmpeqd %ymm1,%ymm10,%ymm14 vpaddd %ymm0,%ymm5,%ymm5 vpaddd %ymm0,%ymm10,%ymm10 leaq 192(%rsi),%rsi vpand %ymm9,%ymm6,%ymm6 vpand %ymm9,%ymm7,%ymm7 vpand %ymm9,%ymm8,%ymm8 vpand %ymm14,%ymm11,%ymm11 vpand %ymm14,%ymm12,%ymm12 vpand %ymm14,%ymm13,%ymm13 vpxor %ymm6,%ymm2,%ymm2 vpxor %ymm7,%ymm3,%ymm3 vpxor %ymm8,%ymm4,%ymm4 vpxor %ymm11,%ymm2,%ymm2 vpxor %ymm12,%ymm3,%ymm3 vpxor %ymm13,%ymm4,%ymm4 decq %rax jnz L$select_loop_avx2_w5 vmovdqu %ymm2,0(%rdi) vmovdqu %ymm3,32(%rdi) vmovdqu %ymm4,64(%rdi) vzeroupper ret L$SEH_end_ecp_nistz256_select_w5_avx2: .globl _ecp_nistz256_select_w7_avx2 .private_extern _ecp_nistz256_select_w7_avx2 .p2align 5 _ecp_nistz256_select_w7_avx2: _CET_ENDBR vzeroupper vmovdqa L$Three(%rip),%ymm0 vpxor %ymm2,%ymm2,%ymm2 vpxor %ymm3,%ymm3,%ymm3 vmovdqa L$One(%rip),%ymm4 vmovdqa L$Two(%rip),%ymm8 vmovdqa L$Three(%rip),%ymm12 vmovd %edx,%xmm1 vpermd %ymm1,%ymm2,%ymm1 movq $21,%rax L$select_loop_avx2_w7: vmovdqa 0(%rsi),%ymm5 vmovdqa 32(%rsi),%ymm6 vmovdqa 64(%rsi),%ymm9 vmovdqa 96(%rsi),%ymm10 vmovdqa 128(%rsi),%ymm13 vmovdqa 160(%rsi),%ymm14 vpcmpeqd %ymm1,%ymm4,%ymm7 vpcmpeqd %ymm1,%ymm8,%ymm11 vpcmpeqd %ymm1,%ymm12,%ymm15 vpaddd %ymm0,%ymm4,%ymm4 vpaddd %ymm0,%ymm8,%ymm8 
vpaddd %ymm0,%ymm12,%ymm12 leaq 192(%rsi),%rsi vpand %ymm7,%ymm5,%ymm5 vpand %ymm7,%ymm6,%ymm6 vpand %ymm11,%ymm9,%ymm9 vpand %ymm11,%ymm10,%ymm10 vpand %ymm15,%ymm13,%ymm13 vpand %ymm15,%ymm14,%ymm14 vpxor %ymm5,%ymm2,%ymm2 vpxor %ymm6,%ymm3,%ymm3 vpxor %ymm9,%ymm2,%ymm2 vpxor %ymm10,%ymm3,%ymm3 vpxor %ymm13,%ymm2,%ymm2 vpxor %ymm14,%ymm3,%ymm3 decq %rax jnz L$select_loop_avx2_w7 vmovdqa 0(%rsi),%ymm5 vmovdqa 32(%rsi),%ymm6 vpcmpeqd %ymm1,%ymm4,%ymm7 vpand %ymm7,%ymm5,%ymm5 vpand %ymm7,%ymm6,%ymm6 vpxor %ymm5,%ymm2,%ymm2 vpxor %ymm6,%ymm3,%ymm3 vmovdqu %ymm2,0(%rdi) vmovdqu %ymm3,32(%rdi) vzeroupper ret L$SEH_end_ecp_nistz256_select_w7_avx2: .p2align 5 __ecp_nistz256_add_toq: xorq %r11,%r11 addq 0(%rbx),%r12 adcq 8(%rbx),%r13 movq %r12,%rax adcq 16(%rbx),%r8 adcq 24(%rbx),%r9 movq %r13,%rbp adcq $0,%r11 subq $-1,%r12 movq %r8,%rcx sbbq %r14,%r13 sbbq $0,%r8 movq %r9,%r10 sbbq %r15,%r9 sbbq $0,%r11 cmovcq %rax,%r12 cmovcq %rbp,%r13 movq %r12,0(%rdi) cmovcq %rcx,%r8 movq %r13,8(%rdi) cmovcq %r10,%r9 movq %r8,16(%rdi) movq %r9,24(%rdi) ret .p2align 5 __ecp_nistz256_sub_fromq: subq 0(%rbx),%r12 sbbq 8(%rbx),%r13 movq %r12,%rax sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 movq %r13,%rbp sbbq %r11,%r11 addq $-1,%r12 movq %r8,%rcx adcq %r14,%r13 adcq $0,%r8 movq %r9,%r10 adcq %r15,%r9 testq %r11,%r11 cmovzq %rax,%r12 cmovzq %rbp,%r13 movq %r12,0(%rdi) cmovzq %rcx,%r8 movq %r13,8(%rdi) cmovzq %r10,%r9 movq %r8,16(%rdi) movq %r9,24(%rdi) ret .p2align 5 __ecp_nistz256_subq: subq %r12,%rax sbbq %r13,%rbp movq %rax,%r12 sbbq %r8,%rcx sbbq %r9,%r10 movq %rbp,%r13 sbbq %r11,%r11 addq $-1,%rax movq %rcx,%r8 adcq %r14,%rbp adcq $0,%rcx movq %r10,%r9 adcq %r15,%r10 testq %r11,%r11 cmovnzq %rax,%r12 cmovnzq %rbp,%r13 cmovnzq %rcx,%r8 cmovnzq %r10,%r9 ret .p2align 5 __ecp_nistz256_mul_by_2q: xorq %r11,%r11 addq %r12,%r12 adcq %r13,%r13 movq %r12,%rax adcq %r8,%r8 adcq %r9,%r9 movq %r13,%rbp adcq $0,%r11 subq $-1,%r12 movq %r8,%rcx sbbq %r14,%r13 sbbq $0,%r8 movq %r9,%r10 sbbq %r15,%r9 sbbq $0,%r11 cmovcq %rax,%r12 cmovcq %rbp,%r13 movq %r12,0(%rdi) cmovcq %rcx,%r8 movq %r13,8(%rdi) cmovcq %r10,%r9 movq %r8,16(%rdi) movq %r9,24(%rdi) ret .globl _ecp_nistz256_point_double_nohw .private_extern _ecp_nistz256_point_double_nohw .p2align 5 _ecp_nistz256_point_double_nohw: _CET_ENDBR pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $160+8,%rsp L$point_doubleq_body: L$point_double_shortcutq: movdqu 0(%rsi),%xmm0 movq %rsi,%rbx movdqu 16(%rsi),%xmm1 movq 32+0(%rsi),%r12 movq 32+8(%rsi),%r13 movq 32+16(%rsi),%r8 movq 32+24(%rsi),%r9 movq L$poly+8(%rip),%r14 movq L$poly+24(%rip),%r15 movdqa %xmm0,96(%rsp) movdqa %xmm1,96+16(%rsp) leaq 32(%rdi),%r10 leaq 64(%rdi),%r11 .byte 102,72,15,110,199 .byte 102,73,15,110,202 .byte 102,73,15,110,211 leaq 0(%rsp),%rdi call __ecp_nistz256_mul_by_2q movq 64+0(%rsi),%rax movq 64+8(%rsi),%r14 movq 64+16(%rsi),%r15 movq 64+24(%rsi),%r8 leaq 64-0(%rsi),%rsi leaq 64(%rsp),%rdi call __ecp_nistz256_sqr_montq movq 0+0(%rsp),%rax movq 8+0(%rsp),%r14 leaq 0+0(%rsp),%rsi movq 16+0(%rsp),%r15 movq 24+0(%rsp),%r8 leaq 0(%rsp),%rdi call __ecp_nistz256_sqr_montq movq 32(%rbx),%rax movq 64+0(%rbx),%r9 movq 64+8(%rbx),%r10 movq 64+16(%rbx),%r11 movq 64+24(%rbx),%r12 leaq 64-0(%rbx),%rsi leaq 32(%rbx),%rbx .byte 102,72,15,126,215 call __ecp_nistz256_mul_montq call __ecp_nistz256_mul_by_2q movq 96+0(%rsp),%r12 movq 96+8(%rsp),%r13 leaq 64(%rsp),%rbx movq 96+16(%rsp),%r8 movq 96+24(%rsp),%r9 leaq 32(%rsp),%rdi call __ecp_nistz256_add_toq movq 96+0(%rsp),%r12 movq 96+8(%rsp),%r13 leaq 
64(%rsp),%rbx movq 96+16(%rsp),%r8 movq 96+24(%rsp),%r9 leaq 64(%rsp),%rdi call __ecp_nistz256_sub_fromq movq 0+0(%rsp),%rax movq 8+0(%rsp),%r14 leaq 0+0(%rsp),%rsi movq 16+0(%rsp),%r15 movq 24+0(%rsp),%r8 .byte 102,72,15,126,207 call __ecp_nistz256_sqr_montq xorq %r9,%r9 movq %r12,%rax addq $-1,%r12 movq %r13,%r10 adcq %rsi,%r13 movq %r14,%rcx adcq $0,%r14 movq %r15,%r8 adcq %rbp,%r15 adcq $0,%r9 xorq %rsi,%rsi testq $1,%rax cmovzq %rax,%r12 cmovzq %r10,%r13 cmovzq %rcx,%r14 cmovzq %r8,%r15 cmovzq %rsi,%r9 movq %r13,%rax shrq $1,%r12 shlq $63,%rax movq %r14,%r10 shrq $1,%r13 orq %rax,%r12 shlq $63,%r10 movq %r15,%rcx shrq $1,%r14 orq %r10,%r13 shlq $63,%rcx movq %r12,0(%rdi) shrq $1,%r15 movq %r13,8(%rdi) shlq $63,%r9 orq %rcx,%r14 orq %r9,%r15 movq %r14,16(%rdi) movq %r15,24(%rdi) movq 64(%rsp),%rax leaq 64(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq 0+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 32(%rsp),%rdi call __ecp_nistz256_mul_montq leaq 128(%rsp),%rdi call __ecp_nistz256_mul_by_2q leaq 32(%rsp),%rbx leaq 32(%rsp),%rdi call __ecp_nistz256_add_toq movq 96(%rsp),%rax leaq 96(%rsp),%rbx movq 0+0(%rsp),%r9 movq 8+0(%rsp),%r10 leaq 0+0(%rsp),%rsi movq 16+0(%rsp),%r11 movq 24+0(%rsp),%r12 leaq 0(%rsp),%rdi call __ecp_nistz256_mul_montq leaq 128(%rsp),%rdi call __ecp_nistz256_mul_by_2q movq 0+32(%rsp),%rax movq 8+32(%rsp),%r14 leaq 0+32(%rsp),%rsi movq 16+32(%rsp),%r15 movq 24+32(%rsp),%r8 .byte 102,72,15,126,199 call __ecp_nistz256_sqr_montq leaq 128(%rsp),%rbx movq %r14,%r8 movq %r15,%r9 movq %rsi,%r14 movq %rbp,%r15 call __ecp_nistz256_sub_fromq movq 0+0(%rsp),%rax movq 0+8(%rsp),%rbp movq 0+16(%rsp),%rcx movq 0+24(%rsp),%r10 leaq 0(%rsp),%rdi call __ecp_nistz256_subq movq 32(%rsp),%rax leaq 32(%rsp),%rbx movq %r12,%r14 xorl %ecx,%ecx movq %r12,0+0(%rsp) movq %r13,%r10 movq %r13,0+8(%rsp) cmovzq %r8,%r11 movq %r8,0+16(%rsp) leaq 0-0(%rsp),%rsi cmovzq %r9,%r12 movq %r9,0+24(%rsp) movq %r14,%r9 leaq 0(%rsp),%rdi call __ecp_nistz256_mul_montq .byte 102,72,15,126,203 .byte 102,72,15,126,207 call __ecp_nistz256_sub_fromq leaq 160+56(%rsp),%rsi movq -48(%rsi),%r15 movq -40(%rsi),%r14 movq -32(%rsi),%r13 movq -24(%rsi),%r12 movq -16(%rsi),%rbx movq -8(%rsi),%rbp leaq (%rsi),%rsp L$point_doubleq_epilogue: ret .globl _ecp_nistz256_point_add_nohw .private_extern _ecp_nistz256_point_add_nohw .p2align 5 _ecp_nistz256_point_add_nohw: _CET_ENDBR pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $576+8,%rsp L$point_addq_body: movdqu 0(%rsi),%xmm0 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 movdqu 48(%rsi),%xmm3 movdqu 64(%rsi),%xmm4 movdqu 80(%rsi),%xmm5 movq %rsi,%rbx movq %rdx,%rsi movdqa %xmm0,384(%rsp) movdqa %xmm1,384+16(%rsp) movdqa %xmm2,416(%rsp) movdqa %xmm3,416+16(%rsp) movdqa %xmm4,448(%rsp) movdqa %xmm5,448+16(%rsp) por %xmm4,%xmm5 movdqu 0(%rsi),%xmm0 pshufd $0xb1,%xmm5,%xmm3 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 por %xmm3,%xmm5 movdqu 48(%rsi),%xmm3 movq 64+0(%rsi),%rax movq 64+8(%rsi),%r14 movq 64+16(%rsi),%r15 movq 64+24(%rsi),%r8 movdqa %xmm0,480(%rsp) pshufd $0x1e,%xmm5,%xmm4 movdqa %xmm1,480+16(%rsp) movdqu 64(%rsi),%xmm0 movdqu 80(%rsi),%xmm1 movdqa %xmm2,512(%rsp) movdqa %xmm3,512+16(%rsp) por %xmm4,%xmm5 pxor %xmm4,%xmm4 por %xmm0,%xmm1 .byte 102,72,15,110,199 leaq 64-0(%rsi),%rsi movq %rax,544+0(%rsp) movq %r14,544+8(%rsp) movq %r15,544+16(%rsp) movq %r8,544+24(%rsp) leaq 96(%rsp),%rdi call __ecp_nistz256_sqr_montq pcmpeqd %xmm4,%xmm5 pshufd $0xb1,%xmm1,%xmm4 por %xmm1,%xmm4 pshufd $0,%xmm5,%xmm5 pshufd 
$0x1e,%xmm4,%xmm3 por %xmm3,%xmm4 pxor %xmm3,%xmm3 pcmpeqd %xmm3,%xmm4 pshufd $0,%xmm4,%xmm4 movq 64+0(%rbx),%rax movq 64+8(%rbx),%r14 movq 64+16(%rbx),%r15 movq 64+24(%rbx),%r8 .byte 102,72,15,110,203 leaq 64-0(%rbx),%rsi leaq 32(%rsp),%rdi call __ecp_nistz256_sqr_montq movq 544(%rsp),%rax leaq 544(%rsp),%rbx movq 0+96(%rsp),%r9 movq 8+96(%rsp),%r10 leaq 0+96(%rsp),%rsi movq 16+96(%rsp),%r11 movq 24+96(%rsp),%r12 leaq 224(%rsp),%rdi call __ecp_nistz256_mul_montq movq 448(%rsp),%rax leaq 448(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq 0+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 256(%rsp),%rdi call __ecp_nistz256_mul_montq movq 416(%rsp),%rax leaq 416(%rsp),%rbx movq 0+224(%rsp),%r9 movq 8+224(%rsp),%r10 leaq 0+224(%rsp),%rsi movq 16+224(%rsp),%r11 movq 24+224(%rsp),%r12 leaq 224(%rsp),%rdi call __ecp_nistz256_mul_montq movq 512(%rsp),%rax leaq 512(%rsp),%rbx movq 0+256(%rsp),%r9 movq 8+256(%rsp),%r10 leaq 0+256(%rsp),%rsi movq 16+256(%rsp),%r11 movq 24+256(%rsp),%r12 leaq 256(%rsp),%rdi call __ecp_nistz256_mul_montq leaq 224(%rsp),%rbx leaq 64(%rsp),%rdi call __ecp_nistz256_sub_fromq orq %r13,%r12 movdqa %xmm4,%xmm2 orq %r8,%r12 orq %r9,%r12 por %xmm5,%xmm2 .byte 102,73,15,110,220 movq 384(%rsp),%rax leaq 384(%rsp),%rbx movq 0+96(%rsp),%r9 movq 8+96(%rsp),%r10 leaq 0+96(%rsp),%rsi movq 16+96(%rsp),%r11 movq 24+96(%rsp),%r12 leaq 160(%rsp),%rdi call __ecp_nistz256_mul_montq movq 480(%rsp),%rax leaq 480(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq 0+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 192(%rsp),%rdi call __ecp_nistz256_mul_montq leaq 160(%rsp),%rbx leaq 0(%rsp),%rdi call __ecp_nistz256_sub_fromq orq %r13,%r12 orq %r8,%r12 orq %r9,%r12 .byte 102,73,15,126,208 .byte 102,73,15,126,217 orq %r8,%r12 .byte 0x3e jnz L$add_proceedq testq %r9,%r9 jz L$add_doubleq .byte 102,72,15,126,199 pxor %xmm0,%xmm0 movdqu %xmm0,0(%rdi) movdqu %xmm0,16(%rdi) movdqu %xmm0,32(%rdi) movdqu %xmm0,48(%rdi) movdqu %xmm0,64(%rdi) movdqu %xmm0,80(%rdi) jmp L$add_doneq .p2align 5 L$add_doubleq: .byte 102,72,15,126,206 .byte 102,72,15,126,199 addq $416,%rsp jmp L$point_double_shortcutq .p2align 5 L$add_proceedq: movq 0+64(%rsp),%rax movq 8+64(%rsp),%r14 leaq 0+64(%rsp),%rsi movq 16+64(%rsp),%r15 movq 24+64(%rsp),%r8 leaq 96(%rsp),%rdi call __ecp_nistz256_sqr_montq movq 448(%rsp),%rax leaq 448(%rsp),%rbx movq 0+0(%rsp),%r9 movq 8+0(%rsp),%r10 leaq 0+0(%rsp),%rsi movq 16+0(%rsp),%r11 movq 24+0(%rsp),%r12 leaq 352(%rsp),%rdi call __ecp_nistz256_mul_montq movq 0+0(%rsp),%rax movq 8+0(%rsp),%r14 leaq 0+0(%rsp),%rsi movq 16+0(%rsp),%r15 movq 24+0(%rsp),%r8 leaq 32(%rsp),%rdi call __ecp_nistz256_sqr_montq movq 544(%rsp),%rax leaq 544(%rsp),%rbx movq 0+352(%rsp),%r9 movq 8+352(%rsp),%r10 leaq 0+352(%rsp),%rsi movq 16+352(%rsp),%r11 movq 24+352(%rsp),%r12 leaq 352(%rsp),%rdi call __ecp_nistz256_mul_montq movq 0(%rsp),%rax leaq 0(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq 0+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 128(%rsp),%rdi call __ecp_nistz256_mul_montq movq 160(%rsp),%rax leaq 160(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq 0+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 192(%rsp),%rdi call __ecp_nistz256_mul_montq xorq %r11,%r11 addq %r12,%r12 leaq 96(%rsp),%rsi adcq %r13,%r13 movq %r12,%rax adcq %r8,%r8 adcq %r9,%r9 movq %r13,%rbp adcq $0,%r11 subq $-1,%r12 movq %r8,%rcx sbbq %r14,%r13 sbbq $0,%r8 movq %r9,%r10 sbbq %r15,%r9 sbbq $0,%r11 cmovcq %rax,%r12 movq 0(%rsi),%rax cmovcq %rbp,%r13 
movq 8(%rsi),%rbp cmovcq %rcx,%r8 movq 16(%rsi),%rcx cmovcq %r10,%r9 movq 24(%rsi),%r10 call __ecp_nistz256_subq leaq 128(%rsp),%rbx leaq 288(%rsp),%rdi call __ecp_nistz256_sub_fromq movq 192+0(%rsp),%rax movq 192+8(%rsp),%rbp movq 192+16(%rsp),%rcx movq 192+24(%rsp),%r10 leaq 320(%rsp),%rdi call __ecp_nistz256_subq movq %r12,0(%rdi) movq %r13,8(%rdi) movq %r8,16(%rdi) movq %r9,24(%rdi) movq 128(%rsp),%rax leaq 128(%rsp),%rbx movq 0+224(%rsp),%r9 movq 8+224(%rsp),%r10 leaq 0+224(%rsp),%rsi movq 16+224(%rsp),%r11 movq 24+224(%rsp),%r12 leaq 256(%rsp),%rdi call __ecp_nistz256_mul_montq movq 320(%rsp),%rax leaq 320(%rsp),%rbx movq 0+64(%rsp),%r9 movq 8+64(%rsp),%r10 leaq 0+64(%rsp),%rsi movq 16+64(%rsp),%r11 movq 24+64(%rsp),%r12 leaq 320(%rsp),%rdi call __ecp_nistz256_mul_montq leaq 256(%rsp),%rbx leaq 320(%rsp),%rdi call __ecp_nistz256_sub_fromq .byte 102,72,15,126,199 movdqa %xmm5,%xmm0 movdqa %xmm5,%xmm1 pandn 352(%rsp),%xmm0 movdqa %xmm5,%xmm2 pandn 352+16(%rsp),%xmm1 movdqa %xmm5,%xmm3 pand 544(%rsp),%xmm2 pand 544+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqa %xmm4,%xmm0 movdqa %xmm4,%xmm1 pandn %xmm2,%xmm0 movdqa %xmm4,%xmm2 pandn %xmm3,%xmm1 movdqa %xmm4,%xmm3 pand 448(%rsp),%xmm2 pand 448+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqu %xmm2,64(%rdi) movdqu %xmm3,80(%rdi) movdqa %xmm5,%xmm0 movdqa %xmm5,%xmm1 pandn 288(%rsp),%xmm0 movdqa %xmm5,%xmm2 pandn 288+16(%rsp),%xmm1 movdqa %xmm5,%xmm3 pand 480(%rsp),%xmm2 pand 480+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqa %xmm4,%xmm0 movdqa %xmm4,%xmm1 pandn %xmm2,%xmm0 movdqa %xmm4,%xmm2 pandn %xmm3,%xmm1 movdqa %xmm4,%xmm3 pand 384(%rsp),%xmm2 pand 384+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqu %xmm2,0(%rdi) movdqu %xmm3,16(%rdi) movdqa %xmm5,%xmm0 movdqa %xmm5,%xmm1 pandn 320(%rsp),%xmm0 movdqa %xmm5,%xmm2 pandn 320+16(%rsp),%xmm1 movdqa %xmm5,%xmm3 pand 512(%rsp),%xmm2 pand 512+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqa %xmm4,%xmm0 movdqa %xmm4,%xmm1 pandn %xmm2,%xmm0 movdqa %xmm4,%xmm2 pandn %xmm3,%xmm1 movdqa %xmm4,%xmm3 pand 416(%rsp),%xmm2 pand 416+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqu %xmm2,32(%rdi) movdqu %xmm3,48(%rdi) L$add_doneq: leaq 576+56(%rsp),%rsi movq -48(%rsi),%r15 movq -40(%rsi),%r14 movq -32(%rsi),%r13 movq -24(%rsi),%r12 movq -16(%rsi),%rbx movq -8(%rsi),%rbp leaq (%rsi),%rsp L$point_addq_epilogue: ret .globl _ecp_nistz256_point_add_affine_nohw .private_extern _ecp_nistz256_point_add_affine_nohw .p2align 5 _ecp_nistz256_point_add_affine_nohw: _CET_ENDBR pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $480+8,%rsp L$add_affineq_body: movdqu 0(%rsi),%xmm0 movq %rdx,%rbx movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 movdqu 48(%rsi),%xmm3 movdqu 64(%rsi),%xmm4 movdqu 80(%rsi),%xmm5 movq 64+0(%rsi),%rax movq 64+8(%rsi),%r14 movq 64+16(%rsi),%r15 movq 64+24(%rsi),%r8 movdqa %xmm0,320(%rsp) movdqa %xmm1,320+16(%rsp) movdqa %xmm2,352(%rsp) movdqa %xmm3,352+16(%rsp) movdqa %xmm4,384(%rsp) movdqa %xmm5,384+16(%rsp) por %xmm4,%xmm5 movdqu 0(%rbx),%xmm0 pshufd $0xb1,%xmm5,%xmm3 movdqu 16(%rbx),%xmm1 movdqu 32(%rbx),%xmm2 por %xmm3,%xmm5 movdqu 48(%rbx),%xmm3 movdqa %xmm0,416(%rsp) pshufd $0x1e,%xmm5,%xmm4 movdqa %xmm1,416+16(%rsp) por %xmm0,%xmm1 .byte 102,72,15,110,199 movdqa %xmm2,448(%rsp) movdqa %xmm3,448+16(%rsp) por %xmm2,%xmm3 por %xmm4,%xmm5 pxor %xmm4,%xmm4 por %xmm1,%xmm3 leaq 64-0(%rsi),%rsi leaq 32(%rsp),%rdi call __ecp_nistz256_sqr_montq pcmpeqd %xmm4,%xmm5 pshufd $0xb1,%xmm3,%xmm4 movq 0(%rbx),%rax movq %r12,%r9 por %xmm3,%xmm4 pshufd 
$0,%xmm5,%xmm5 pshufd $0x1e,%xmm4,%xmm3 movq %r13,%r10 por %xmm3,%xmm4 pxor %xmm3,%xmm3 movq %r14,%r11 pcmpeqd %xmm3,%xmm4 pshufd $0,%xmm4,%xmm4 leaq 32-0(%rsp),%rsi movq %r15,%r12 leaq 0(%rsp),%rdi call __ecp_nistz256_mul_montq leaq 320(%rsp),%rbx leaq 64(%rsp),%rdi call __ecp_nistz256_sub_fromq movq 384(%rsp),%rax leaq 384(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq 0+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 32(%rsp),%rdi call __ecp_nistz256_mul_montq movq 384(%rsp),%rax leaq 384(%rsp),%rbx movq 0+64(%rsp),%r9 movq 8+64(%rsp),%r10 leaq 0+64(%rsp),%rsi movq 16+64(%rsp),%r11 movq 24+64(%rsp),%r12 leaq 288(%rsp),%rdi call __ecp_nistz256_mul_montq movq 448(%rsp),%rax leaq 448(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq 0+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 32(%rsp),%rdi call __ecp_nistz256_mul_montq leaq 352(%rsp),%rbx leaq 96(%rsp),%rdi call __ecp_nistz256_sub_fromq movq 0+64(%rsp),%rax movq 8+64(%rsp),%r14 leaq 0+64(%rsp),%rsi movq 16+64(%rsp),%r15 movq 24+64(%rsp),%r8 leaq 128(%rsp),%rdi call __ecp_nistz256_sqr_montq movq 0+96(%rsp),%rax movq 8+96(%rsp),%r14 leaq 0+96(%rsp),%rsi movq 16+96(%rsp),%r15 movq 24+96(%rsp),%r8 leaq 192(%rsp),%rdi call __ecp_nistz256_sqr_montq movq 128(%rsp),%rax leaq 128(%rsp),%rbx movq 0+64(%rsp),%r9 movq 8+64(%rsp),%r10 leaq 0+64(%rsp),%rsi movq 16+64(%rsp),%r11 movq 24+64(%rsp),%r12 leaq 160(%rsp),%rdi call __ecp_nistz256_mul_montq movq 320(%rsp),%rax leaq 320(%rsp),%rbx movq 0+128(%rsp),%r9 movq 8+128(%rsp),%r10 leaq 0+128(%rsp),%rsi movq 16+128(%rsp),%r11 movq 24+128(%rsp),%r12 leaq 0(%rsp),%rdi call __ecp_nistz256_mul_montq xorq %r11,%r11 addq %r12,%r12 leaq 192(%rsp),%rsi adcq %r13,%r13 movq %r12,%rax adcq %r8,%r8 adcq %r9,%r9 movq %r13,%rbp adcq $0,%r11 subq $-1,%r12 movq %r8,%rcx sbbq %r14,%r13 sbbq $0,%r8 movq %r9,%r10 sbbq %r15,%r9 sbbq $0,%r11 cmovcq %rax,%r12 movq 0(%rsi),%rax cmovcq %rbp,%r13 movq 8(%rsi),%rbp cmovcq %rcx,%r8 movq 16(%rsi),%rcx cmovcq %r10,%r9 movq 24(%rsi),%r10 call __ecp_nistz256_subq leaq 160(%rsp),%rbx leaq 224(%rsp),%rdi call __ecp_nistz256_sub_fromq movq 0+0(%rsp),%rax movq 0+8(%rsp),%rbp movq 0+16(%rsp),%rcx movq 0+24(%rsp),%r10 leaq 64(%rsp),%rdi call __ecp_nistz256_subq movq %r12,0(%rdi) movq %r13,8(%rdi) movq %r8,16(%rdi) movq %r9,24(%rdi) movq 352(%rsp),%rax leaq 352(%rsp),%rbx movq 0+160(%rsp),%r9 movq 8+160(%rsp),%r10 leaq 0+160(%rsp),%rsi movq 16+160(%rsp),%r11 movq 24+160(%rsp),%r12 leaq 32(%rsp),%rdi call __ecp_nistz256_mul_montq movq 96(%rsp),%rax leaq 96(%rsp),%rbx movq 0+64(%rsp),%r9 movq 8+64(%rsp),%r10 leaq 0+64(%rsp),%rsi movq 16+64(%rsp),%r11 movq 24+64(%rsp),%r12 leaq 64(%rsp),%rdi call __ecp_nistz256_mul_montq leaq 32(%rsp),%rbx leaq 256(%rsp),%rdi call __ecp_nistz256_sub_fromq .byte 102,72,15,126,199 movdqa %xmm5,%xmm0 movdqa %xmm5,%xmm1 pandn 288(%rsp),%xmm0 movdqa %xmm5,%xmm2 pandn 288+16(%rsp),%xmm1 movdqa %xmm5,%xmm3 pand L$ONE_mont(%rip),%xmm2 pand L$ONE_mont+16(%rip),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqa %xmm4,%xmm0 movdqa %xmm4,%xmm1 pandn %xmm2,%xmm0 movdqa %xmm4,%xmm2 pandn %xmm3,%xmm1 movdqa %xmm4,%xmm3 pand 384(%rsp),%xmm2 pand 384+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqu %xmm2,64(%rdi) movdqu %xmm3,80(%rdi) movdqa %xmm5,%xmm0 movdqa %xmm5,%xmm1 pandn 224(%rsp),%xmm0 movdqa %xmm5,%xmm2 pandn 224+16(%rsp),%xmm1 movdqa %xmm5,%xmm3 pand 416(%rsp),%xmm2 pand 416+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqa %xmm4,%xmm0 movdqa %xmm4,%xmm1 pandn %xmm2,%xmm0 movdqa %xmm4,%xmm2 pandn %xmm3,%xmm1 
movdqa %xmm4,%xmm3 pand 320(%rsp),%xmm2 pand 320+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqu %xmm2,0(%rdi) movdqu %xmm3,16(%rdi) movdqa %xmm5,%xmm0 movdqa %xmm5,%xmm1 pandn 256(%rsp),%xmm0 movdqa %xmm5,%xmm2 pandn 256+16(%rsp),%xmm1 movdqa %xmm5,%xmm3 pand 448(%rsp),%xmm2 pand 448+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqa %xmm4,%xmm0 movdqa %xmm4,%xmm1 pandn %xmm2,%xmm0 movdqa %xmm4,%xmm2 pandn %xmm3,%xmm1 movdqa %xmm4,%xmm3 pand 352(%rsp),%xmm2 pand 352+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqu %xmm2,32(%rdi) movdqu %xmm3,48(%rdi) leaq 480+56(%rsp),%rsi movq -48(%rsi),%r15 movq -40(%rsi),%r14 movq -32(%rsi),%r13 movq -24(%rsi),%r12 movq -16(%rsi),%rbx movq -8(%rsi),%rbp leaq (%rsi),%rsp L$add_affineq_epilogue: ret .p2align 5 __ecp_nistz256_add_tox: xorq %r11,%r11 adcq 0(%rbx),%r12 adcq 8(%rbx),%r13 movq %r12,%rax adcq 16(%rbx),%r8 adcq 24(%rbx),%r9 movq %r13,%rbp adcq $0,%r11 xorq %r10,%r10 sbbq $-1,%r12 movq %r8,%rcx sbbq %r14,%r13 sbbq $0,%r8 movq %r9,%r10 sbbq %r15,%r9 sbbq $0,%r11 cmovcq %rax,%r12 cmovcq %rbp,%r13 movq %r12,0(%rdi) cmovcq %rcx,%r8 movq %r13,8(%rdi) cmovcq %r10,%r9 movq %r8,16(%rdi) movq %r9,24(%rdi) ret .p2align 5 __ecp_nistz256_sub_fromx: xorq %r11,%r11 sbbq 0(%rbx),%r12 sbbq 8(%rbx),%r13 movq %r12,%rax sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 movq %r13,%rbp sbbq $0,%r11 xorq %r10,%r10 adcq $-1,%r12 movq %r8,%rcx adcq %r14,%r13 adcq $0,%r8 movq %r9,%r10 adcq %r15,%r9 btq $0,%r11 cmovncq %rax,%r12 cmovncq %rbp,%r13 movq %r12,0(%rdi) cmovncq %rcx,%r8 movq %r13,8(%rdi) cmovncq %r10,%r9 movq %r8,16(%rdi) movq %r9,24(%rdi) ret .p2align 5 __ecp_nistz256_subx: xorq %r11,%r11 sbbq %r12,%rax sbbq %r13,%rbp movq %rax,%r12 sbbq %r8,%rcx sbbq %r9,%r10 movq %rbp,%r13 sbbq $0,%r11 xorq %r9,%r9 adcq $-1,%rax movq %rcx,%r8 adcq %r14,%rbp adcq $0,%rcx movq %r10,%r9 adcq %r15,%r10 btq $0,%r11 cmovcq %rax,%r12 cmovcq %rbp,%r13 cmovcq %rcx,%r8 cmovcq %r10,%r9 ret .p2align 5 __ecp_nistz256_mul_by_2x: xorq %r11,%r11 adcq %r12,%r12 adcq %r13,%r13 movq %r12,%rax adcq %r8,%r8 adcq %r9,%r9 movq %r13,%rbp adcq $0,%r11 xorq %r10,%r10 sbbq $-1,%r12 movq %r8,%rcx sbbq %r14,%r13 sbbq $0,%r8 movq %r9,%r10 sbbq %r15,%r9 sbbq $0,%r11 cmovcq %rax,%r12 cmovcq %rbp,%r13 movq %r12,0(%rdi) cmovcq %rcx,%r8 movq %r13,8(%rdi) cmovcq %r10,%r9 movq %r8,16(%rdi) movq %r9,24(%rdi) ret .globl _ecp_nistz256_point_double_adx .private_extern _ecp_nistz256_point_double_adx .p2align 5 _ecp_nistz256_point_double_adx: _CET_ENDBR pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $160+8,%rsp L$point_doublex_body: L$point_double_shortcutx: movdqu 0(%rsi),%xmm0 movq %rsi,%rbx movdqu 16(%rsi),%xmm1 movq 32+0(%rsi),%r12 movq 32+8(%rsi),%r13 movq 32+16(%rsi),%r8 movq 32+24(%rsi),%r9 movq L$poly+8(%rip),%r14 movq L$poly+24(%rip),%r15 movdqa %xmm0,96(%rsp) movdqa %xmm1,96+16(%rsp) leaq 32(%rdi),%r10 leaq 64(%rdi),%r11 .byte 102,72,15,110,199 .byte 102,73,15,110,202 .byte 102,73,15,110,211 leaq 0(%rsp),%rdi call __ecp_nistz256_mul_by_2x movq 64+0(%rsi),%rdx movq 64+8(%rsi),%r14 movq 64+16(%rsi),%r15 movq 64+24(%rsi),%r8 leaq 64-128(%rsi),%rsi leaq 64(%rsp),%rdi call __ecp_nistz256_sqr_montx movq 0+0(%rsp),%rdx movq 8+0(%rsp),%r14 leaq -128+0(%rsp),%rsi movq 16+0(%rsp),%r15 movq 24+0(%rsp),%r8 leaq 0(%rsp),%rdi call __ecp_nistz256_sqr_montx movq 32(%rbx),%rdx movq 64+0(%rbx),%r9 movq 64+8(%rbx),%r10 movq 64+16(%rbx),%r11 movq 64+24(%rbx),%r12 leaq 64-128(%rbx),%rsi leaq 32(%rbx),%rbx .byte 102,72,15,126,215 call __ecp_nistz256_mul_montx call __ecp_nistz256_mul_by_2x movq 
96+0(%rsp),%r12 movq 96+8(%rsp),%r13 leaq 64(%rsp),%rbx movq 96+16(%rsp),%r8 movq 96+24(%rsp),%r9 leaq 32(%rsp),%rdi call __ecp_nistz256_add_tox movq 96+0(%rsp),%r12 movq 96+8(%rsp),%r13 leaq 64(%rsp),%rbx movq 96+16(%rsp),%r8 movq 96+24(%rsp),%r9 leaq 64(%rsp),%rdi call __ecp_nistz256_sub_fromx movq 0+0(%rsp),%rdx movq 8+0(%rsp),%r14 leaq -128+0(%rsp),%rsi movq 16+0(%rsp),%r15 movq 24+0(%rsp),%r8 .byte 102,72,15,126,207 call __ecp_nistz256_sqr_montx xorq %r9,%r9 movq %r12,%rax addq $-1,%r12 movq %r13,%r10 adcq %rsi,%r13 movq %r14,%rcx adcq $0,%r14 movq %r15,%r8 adcq %rbp,%r15 adcq $0,%r9 xorq %rsi,%rsi testq $1,%rax cmovzq %rax,%r12 cmovzq %r10,%r13 cmovzq %rcx,%r14 cmovzq %r8,%r15 cmovzq %rsi,%r9 movq %r13,%rax shrq $1,%r12 shlq $63,%rax movq %r14,%r10 shrq $1,%r13 orq %rax,%r12 shlq $63,%r10 movq %r15,%rcx shrq $1,%r14 orq %r10,%r13 shlq $63,%rcx movq %r12,0(%rdi) shrq $1,%r15 movq %r13,8(%rdi) shlq $63,%r9 orq %rcx,%r14 orq %r9,%r15 movq %r14,16(%rdi) movq %r15,24(%rdi) movq 64(%rsp),%rdx leaq 64(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq -128+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 32(%rsp),%rdi call __ecp_nistz256_mul_montx leaq 128(%rsp),%rdi call __ecp_nistz256_mul_by_2x leaq 32(%rsp),%rbx leaq 32(%rsp),%rdi call __ecp_nistz256_add_tox movq 96(%rsp),%rdx leaq 96(%rsp),%rbx movq 0+0(%rsp),%r9 movq 8+0(%rsp),%r10 leaq -128+0(%rsp),%rsi movq 16+0(%rsp),%r11 movq 24+0(%rsp),%r12 leaq 0(%rsp),%rdi call __ecp_nistz256_mul_montx leaq 128(%rsp),%rdi call __ecp_nistz256_mul_by_2x movq 0+32(%rsp),%rdx movq 8+32(%rsp),%r14 leaq -128+32(%rsp),%rsi movq 16+32(%rsp),%r15 movq 24+32(%rsp),%r8 .byte 102,72,15,126,199 call __ecp_nistz256_sqr_montx leaq 128(%rsp),%rbx movq %r14,%r8 movq %r15,%r9 movq %rsi,%r14 movq %rbp,%r15 call __ecp_nistz256_sub_fromx movq 0+0(%rsp),%rax movq 0+8(%rsp),%rbp movq 0+16(%rsp),%rcx movq 0+24(%rsp),%r10 leaq 0(%rsp),%rdi call __ecp_nistz256_subx movq 32(%rsp),%rdx leaq 32(%rsp),%rbx movq %r12,%r14 xorl %ecx,%ecx movq %r12,0+0(%rsp) movq %r13,%r10 movq %r13,0+8(%rsp) cmovzq %r8,%r11 movq %r8,0+16(%rsp) leaq 0-128(%rsp),%rsi cmovzq %r9,%r12 movq %r9,0+24(%rsp) movq %r14,%r9 leaq 0(%rsp),%rdi call __ecp_nistz256_mul_montx .byte 102,72,15,126,203 .byte 102,72,15,126,207 call __ecp_nistz256_sub_fromx leaq 160+56(%rsp),%rsi movq -48(%rsi),%r15 movq -40(%rsi),%r14 movq -32(%rsi),%r13 movq -24(%rsi),%r12 movq -16(%rsi),%rbx movq -8(%rsi),%rbp leaq (%rsi),%rsp L$point_doublex_epilogue: ret .globl _ecp_nistz256_point_add_adx .private_extern _ecp_nistz256_point_add_adx .p2align 5 _ecp_nistz256_point_add_adx: _CET_ENDBR pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $576+8,%rsp L$point_addx_body: movdqu 0(%rsi),%xmm0 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 movdqu 48(%rsi),%xmm3 movdqu 64(%rsi),%xmm4 movdqu 80(%rsi),%xmm5 movq %rsi,%rbx movq %rdx,%rsi movdqa %xmm0,384(%rsp) movdqa %xmm1,384+16(%rsp) movdqa %xmm2,416(%rsp) movdqa %xmm3,416+16(%rsp) movdqa %xmm4,448(%rsp) movdqa %xmm5,448+16(%rsp) por %xmm4,%xmm5 movdqu 0(%rsi),%xmm0 pshufd $0xb1,%xmm5,%xmm3 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 por %xmm3,%xmm5 movdqu 48(%rsi),%xmm3 movq 64+0(%rsi),%rdx movq 64+8(%rsi),%r14 movq 64+16(%rsi),%r15 movq 64+24(%rsi),%r8 movdqa %xmm0,480(%rsp) pshufd $0x1e,%xmm5,%xmm4 movdqa %xmm1,480+16(%rsp) movdqu 64(%rsi),%xmm0 movdqu 80(%rsi),%xmm1 movdqa %xmm2,512(%rsp) movdqa %xmm3,512+16(%rsp) por %xmm4,%xmm5 pxor %xmm4,%xmm4 por %xmm0,%xmm1 .byte 102,72,15,110,199 leaq 64-128(%rsi),%rsi movq %rdx,544+0(%rsp) movq 
%r14,544+8(%rsp) movq %r15,544+16(%rsp) movq %r8,544+24(%rsp) leaq 96(%rsp),%rdi call __ecp_nistz256_sqr_montx pcmpeqd %xmm4,%xmm5 pshufd $0xb1,%xmm1,%xmm4 por %xmm1,%xmm4 pshufd $0,%xmm5,%xmm5 pshufd $0x1e,%xmm4,%xmm3 por %xmm3,%xmm4 pxor %xmm3,%xmm3 pcmpeqd %xmm3,%xmm4 pshufd $0,%xmm4,%xmm4 movq 64+0(%rbx),%rdx movq 64+8(%rbx),%r14 movq 64+16(%rbx),%r15 movq 64+24(%rbx),%r8 .byte 102,72,15,110,203 leaq 64-128(%rbx),%rsi leaq 32(%rsp),%rdi call __ecp_nistz256_sqr_montx movq 544(%rsp),%rdx leaq 544(%rsp),%rbx movq 0+96(%rsp),%r9 movq 8+96(%rsp),%r10 leaq -128+96(%rsp),%rsi movq 16+96(%rsp),%r11 movq 24+96(%rsp),%r12 leaq 224(%rsp),%rdi call __ecp_nistz256_mul_montx movq 448(%rsp),%rdx leaq 448(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq -128+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 256(%rsp),%rdi call __ecp_nistz256_mul_montx movq 416(%rsp),%rdx leaq 416(%rsp),%rbx movq 0+224(%rsp),%r9 movq 8+224(%rsp),%r10 leaq -128+224(%rsp),%rsi movq 16+224(%rsp),%r11 movq 24+224(%rsp),%r12 leaq 224(%rsp),%rdi call __ecp_nistz256_mul_montx movq 512(%rsp),%rdx leaq 512(%rsp),%rbx movq 0+256(%rsp),%r9 movq 8+256(%rsp),%r10 leaq -128+256(%rsp),%rsi movq 16+256(%rsp),%r11 movq 24+256(%rsp),%r12 leaq 256(%rsp),%rdi call __ecp_nistz256_mul_montx leaq 224(%rsp),%rbx leaq 64(%rsp),%rdi call __ecp_nistz256_sub_fromx orq %r13,%r12 movdqa %xmm4,%xmm2 orq %r8,%r12 orq %r9,%r12 por %xmm5,%xmm2 .byte 102,73,15,110,220 movq 384(%rsp),%rdx leaq 384(%rsp),%rbx movq 0+96(%rsp),%r9 movq 8+96(%rsp),%r10 leaq -128+96(%rsp),%rsi movq 16+96(%rsp),%r11 movq 24+96(%rsp),%r12 leaq 160(%rsp),%rdi call __ecp_nistz256_mul_montx movq 480(%rsp),%rdx leaq 480(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq -128+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 192(%rsp),%rdi call __ecp_nistz256_mul_montx leaq 160(%rsp),%rbx leaq 0(%rsp),%rdi call __ecp_nistz256_sub_fromx orq %r13,%r12 orq %r8,%r12 orq %r9,%r12 .byte 102,73,15,126,208 .byte 102,73,15,126,217 orq %r8,%r12 .byte 0x3e jnz L$add_proceedx testq %r9,%r9 jz L$add_doublex .byte 102,72,15,126,199 pxor %xmm0,%xmm0 movdqu %xmm0,0(%rdi) movdqu %xmm0,16(%rdi) movdqu %xmm0,32(%rdi) movdqu %xmm0,48(%rdi) movdqu %xmm0,64(%rdi) movdqu %xmm0,80(%rdi) jmp L$add_donex .p2align 5 L$add_doublex: .byte 102,72,15,126,206 .byte 102,72,15,126,199 addq $416,%rsp jmp L$point_double_shortcutx .p2align 5 L$add_proceedx: movq 0+64(%rsp),%rdx movq 8+64(%rsp),%r14 leaq -128+64(%rsp),%rsi movq 16+64(%rsp),%r15 movq 24+64(%rsp),%r8 leaq 96(%rsp),%rdi call __ecp_nistz256_sqr_montx movq 448(%rsp),%rdx leaq 448(%rsp),%rbx movq 0+0(%rsp),%r9 movq 8+0(%rsp),%r10 leaq -128+0(%rsp),%rsi movq 16+0(%rsp),%r11 movq 24+0(%rsp),%r12 leaq 352(%rsp),%rdi call __ecp_nistz256_mul_montx movq 0+0(%rsp),%rdx movq 8+0(%rsp),%r14 leaq -128+0(%rsp),%rsi movq 16+0(%rsp),%r15 movq 24+0(%rsp),%r8 leaq 32(%rsp),%rdi call __ecp_nistz256_sqr_montx movq 544(%rsp),%rdx leaq 544(%rsp),%rbx movq 0+352(%rsp),%r9 movq 8+352(%rsp),%r10 leaq -128+352(%rsp),%rsi movq 16+352(%rsp),%r11 movq 24+352(%rsp),%r12 leaq 352(%rsp),%rdi call __ecp_nistz256_mul_montx movq 0(%rsp),%rdx leaq 0(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq -128+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 128(%rsp),%rdi call __ecp_nistz256_mul_montx movq 160(%rsp),%rdx leaq 160(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq -128+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 192(%rsp),%rdi call __ecp_nistz256_mul_montx xorq %r11,%r11 addq %r12,%r12 leaq 
96(%rsp),%rsi adcq %r13,%r13 movq %r12,%rax adcq %r8,%r8 adcq %r9,%r9 movq %r13,%rbp adcq $0,%r11 subq $-1,%r12 movq %r8,%rcx sbbq %r14,%r13 sbbq $0,%r8 movq %r9,%r10 sbbq %r15,%r9 sbbq $0,%r11 cmovcq %rax,%r12 movq 0(%rsi),%rax cmovcq %rbp,%r13 movq 8(%rsi),%rbp cmovcq %rcx,%r8 movq 16(%rsi),%rcx cmovcq %r10,%r9 movq 24(%rsi),%r10 call __ecp_nistz256_subx leaq 128(%rsp),%rbx leaq 288(%rsp),%rdi call __ecp_nistz256_sub_fromx movq 192+0(%rsp),%rax movq 192+8(%rsp),%rbp movq 192+16(%rsp),%rcx movq 192+24(%rsp),%r10 leaq 320(%rsp),%rdi call __ecp_nistz256_subx movq %r12,0(%rdi) movq %r13,8(%rdi) movq %r8,16(%rdi) movq %r9,24(%rdi) movq 128(%rsp),%rdx leaq 128(%rsp),%rbx movq 0+224(%rsp),%r9 movq 8+224(%rsp),%r10 leaq -128+224(%rsp),%rsi movq 16+224(%rsp),%r11 movq 24+224(%rsp),%r12 leaq 256(%rsp),%rdi call __ecp_nistz256_mul_montx movq 320(%rsp),%rdx leaq 320(%rsp),%rbx movq 0+64(%rsp),%r9 movq 8+64(%rsp),%r10 leaq -128+64(%rsp),%rsi movq 16+64(%rsp),%r11 movq 24+64(%rsp),%r12 leaq 320(%rsp),%rdi call __ecp_nistz256_mul_montx leaq 256(%rsp),%rbx leaq 320(%rsp),%rdi call __ecp_nistz256_sub_fromx .byte 102,72,15,126,199 movdqa %xmm5,%xmm0 movdqa %xmm5,%xmm1 pandn 352(%rsp),%xmm0 movdqa %xmm5,%xmm2 pandn 352+16(%rsp),%xmm1 movdqa %xmm5,%xmm3 pand 544(%rsp),%xmm2 pand 544+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqa %xmm4,%xmm0 movdqa %xmm4,%xmm1 pandn %xmm2,%xmm0 movdqa %xmm4,%xmm2 pandn %xmm3,%xmm1 movdqa %xmm4,%xmm3 pand 448(%rsp),%xmm2 pand 448+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqu %xmm2,64(%rdi) movdqu %xmm3,80(%rdi) movdqa %xmm5,%xmm0 movdqa %xmm5,%xmm1 pandn 288(%rsp),%xmm0 movdqa %xmm5,%xmm2 pandn 288+16(%rsp),%xmm1 movdqa %xmm5,%xmm3 pand 480(%rsp),%xmm2 pand 480+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqa %xmm4,%xmm0 movdqa %xmm4,%xmm1 pandn %xmm2,%xmm0 movdqa %xmm4,%xmm2 pandn %xmm3,%xmm1 movdqa %xmm4,%xmm3 pand 384(%rsp),%xmm2 pand 384+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqu %xmm2,0(%rdi) movdqu %xmm3,16(%rdi) movdqa %xmm5,%xmm0 movdqa %xmm5,%xmm1 pandn 320(%rsp),%xmm0 movdqa %xmm5,%xmm2 pandn 320+16(%rsp),%xmm1 movdqa %xmm5,%xmm3 pand 512(%rsp),%xmm2 pand 512+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqa %xmm4,%xmm0 movdqa %xmm4,%xmm1 pandn %xmm2,%xmm0 movdqa %xmm4,%xmm2 pandn %xmm3,%xmm1 movdqa %xmm4,%xmm3 pand 416(%rsp),%xmm2 pand 416+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqu %xmm2,32(%rdi) movdqu %xmm3,48(%rdi) L$add_donex: leaq 576+56(%rsp),%rsi movq -48(%rsi),%r15 movq -40(%rsi),%r14 movq -32(%rsi),%r13 movq -24(%rsi),%r12 movq -16(%rsi),%rbx movq -8(%rsi),%rbp leaq (%rsi),%rsp L$point_addx_epilogue: ret .globl _ecp_nistz256_point_add_affine_adx .private_extern _ecp_nistz256_point_add_affine_adx .p2align 5 _ecp_nistz256_point_add_affine_adx: _CET_ENDBR pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $480+8,%rsp L$add_affinex_body: movdqu 0(%rsi),%xmm0 movq %rdx,%rbx movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 movdqu 48(%rsi),%xmm3 movdqu 64(%rsi),%xmm4 movdqu 80(%rsi),%xmm5 movq 64+0(%rsi),%rdx movq 64+8(%rsi),%r14 movq 64+16(%rsi),%r15 movq 64+24(%rsi),%r8 movdqa %xmm0,320(%rsp) movdqa %xmm1,320+16(%rsp) movdqa %xmm2,352(%rsp) movdqa %xmm3,352+16(%rsp) movdqa %xmm4,384(%rsp) movdqa %xmm5,384+16(%rsp) por %xmm4,%xmm5 movdqu 0(%rbx),%xmm0 pshufd $0xb1,%xmm5,%xmm3 movdqu 16(%rbx),%xmm1 movdqu 32(%rbx),%xmm2 por %xmm3,%xmm5 movdqu 48(%rbx),%xmm3 movdqa %xmm0,416(%rsp) pshufd $0x1e,%xmm5,%xmm4 movdqa %xmm1,416+16(%rsp) por %xmm0,%xmm1 .byte 102,72,15,110,199 movdqa %xmm2,448(%rsp) movdqa 
%xmm3,448+16(%rsp) por %xmm2,%xmm3 por %xmm4,%xmm5 pxor %xmm4,%xmm4 por %xmm1,%xmm3 leaq 64-128(%rsi),%rsi leaq 32(%rsp),%rdi call __ecp_nistz256_sqr_montx pcmpeqd %xmm4,%xmm5 pshufd $0xb1,%xmm3,%xmm4 movq 0(%rbx),%rdx movq %r12,%r9 por %xmm3,%xmm4 pshufd $0,%xmm5,%xmm5 pshufd $0x1e,%xmm4,%xmm3 movq %r13,%r10 por %xmm3,%xmm4 pxor %xmm3,%xmm3 movq %r14,%r11 pcmpeqd %xmm3,%xmm4 pshufd $0,%xmm4,%xmm4 leaq 32-128(%rsp),%rsi movq %r15,%r12 leaq 0(%rsp),%rdi call __ecp_nistz256_mul_montx leaq 320(%rsp),%rbx leaq 64(%rsp),%rdi call __ecp_nistz256_sub_fromx movq 384(%rsp),%rdx leaq 384(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq -128+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 32(%rsp),%rdi call __ecp_nistz256_mul_montx movq 384(%rsp),%rdx leaq 384(%rsp),%rbx movq 0+64(%rsp),%r9 movq 8+64(%rsp),%r10 leaq -128+64(%rsp),%rsi movq 16+64(%rsp),%r11 movq 24+64(%rsp),%r12 leaq 288(%rsp),%rdi call __ecp_nistz256_mul_montx movq 448(%rsp),%rdx leaq 448(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq -128+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 32(%rsp),%rdi call __ecp_nistz256_mul_montx leaq 352(%rsp),%rbx leaq 96(%rsp),%rdi call __ecp_nistz256_sub_fromx movq 0+64(%rsp),%rdx movq 8+64(%rsp),%r14 leaq -128+64(%rsp),%rsi movq 16+64(%rsp),%r15 movq 24+64(%rsp),%r8 leaq 128(%rsp),%rdi call __ecp_nistz256_sqr_montx movq 0+96(%rsp),%rdx movq 8+96(%rsp),%r14 leaq -128+96(%rsp),%rsi movq 16+96(%rsp),%r15 movq 24+96(%rsp),%r8 leaq 192(%rsp),%rdi call __ecp_nistz256_sqr_montx movq 128(%rsp),%rdx leaq 128(%rsp),%rbx movq 0+64(%rsp),%r9 movq 8+64(%rsp),%r10 leaq -128+64(%rsp),%rsi movq 16+64(%rsp),%r11 movq 24+64(%rsp),%r12 leaq 160(%rsp),%rdi call __ecp_nistz256_mul_montx movq 320(%rsp),%rdx leaq 320(%rsp),%rbx movq 0+128(%rsp),%r9 movq 8+128(%rsp),%r10 leaq -128+128(%rsp),%rsi movq 16+128(%rsp),%r11 movq 24+128(%rsp),%r12 leaq 0(%rsp),%rdi call __ecp_nistz256_mul_montx xorq %r11,%r11 addq %r12,%r12 leaq 192(%rsp),%rsi adcq %r13,%r13 movq %r12,%rax adcq %r8,%r8 adcq %r9,%r9 movq %r13,%rbp adcq $0,%r11 subq $-1,%r12 movq %r8,%rcx sbbq %r14,%r13 sbbq $0,%r8 movq %r9,%r10 sbbq %r15,%r9 sbbq $0,%r11 cmovcq %rax,%r12 movq 0(%rsi),%rax cmovcq %rbp,%r13 movq 8(%rsi),%rbp cmovcq %rcx,%r8 movq 16(%rsi),%rcx cmovcq %r10,%r9 movq 24(%rsi),%r10 call __ecp_nistz256_subx leaq 160(%rsp),%rbx leaq 224(%rsp),%rdi call __ecp_nistz256_sub_fromx movq 0+0(%rsp),%rax movq 0+8(%rsp),%rbp movq 0+16(%rsp),%rcx movq 0+24(%rsp),%r10 leaq 64(%rsp),%rdi call __ecp_nistz256_subx movq %r12,0(%rdi) movq %r13,8(%rdi) movq %r8,16(%rdi) movq %r9,24(%rdi) movq 352(%rsp),%rdx leaq 352(%rsp),%rbx movq 0+160(%rsp),%r9 movq 8+160(%rsp),%r10 leaq -128+160(%rsp),%rsi movq 16+160(%rsp),%r11 movq 24+160(%rsp),%r12 leaq 32(%rsp),%rdi call __ecp_nistz256_mul_montx movq 96(%rsp),%rdx leaq 96(%rsp),%rbx movq 0+64(%rsp),%r9 movq 8+64(%rsp),%r10 leaq -128+64(%rsp),%rsi movq 16+64(%rsp),%r11 movq 24+64(%rsp),%r12 leaq 64(%rsp),%rdi call __ecp_nistz256_mul_montx leaq 32(%rsp),%rbx leaq 256(%rsp),%rdi call __ecp_nistz256_sub_fromx .byte 102,72,15,126,199 movdqa %xmm5,%xmm0 movdqa %xmm5,%xmm1 pandn 288(%rsp),%xmm0 movdqa %xmm5,%xmm2 pandn 288+16(%rsp),%xmm1 movdqa %xmm5,%xmm3 pand L$ONE_mont(%rip),%xmm2 pand L$ONE_mont+16(%rip),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqa %xmm4,%xmm0 movdqa %xmm4,%xmm1 pandn %xmm2,%xmm0 movdqa %xmm4,%xmm2 pandn %xmm3,%xmm1 movdqa %xmm4,%xmm3 pand 384(%rsp),%xmm2 pand 384+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqu %xmm2,64(%rdi) movdqu %xmm3,80(%rdi) movdqa 
%xmm5,%xmm0 movdqa %xmm5,%xmm1 pandn 224(%rsp),%xmm0 movdqa %xmm5,%xmm2 pandn 224+16(%rsp),%xmm1 movdqa %xmm5,%xmm3 pand 416(%rsp),%xmm2 pand 416+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqa %xmm4,%xmm0 movdqa %xmm4,%xmm1 pandn %xmm2,%xmm0 movdqa %xmm4,%xmm2 pandn %xmm3,%xmm1 movdqa %xmm4,%xmm3 pand 320(%rsp),%xmm2 pand 320+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqu %xmm2,0(%rdi) movdqu %xmm3,16(%rdi) movdqa %xmm5,%xmm0 movdqa %xmm5,%xmm1 pandn 256(%rsp),%xmm0 movdqa %xmm5,%xmm2 pandn 256+16(%rsp),%xmm1 movdqa %xmm5,%xmm3 pand 448(%rsp),%xmm2 pand 448+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqa %xmm4,%xmm0 movdqa %xmm4,%xmm1 pandn %xmm2,%xmm0 movdqa %xmm4,%xmm2 pandn %xmm3,%xmm1 movdqa %xmm4,%xmm3 pand 352(%rsp),%xmm2 pand 352+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqu %xmm2,32(%rdi) movdqu %xmm3,48(%rdi) leaq 480+56(%rsp),%rsi movq -48(%rsi),%r15 movq -40(%rsi),%r14 movq -32(%rsi),%r13 movq -24(%rsi),%r12 movq -16(%rsi),%rbx movq -8(%rsi),%rbp leaq (%rsi),%rsp L$add_affinex_epilogue: ret #endif ring-0.17.14/pregenerated/p256-x86_64-asm-nasm.asm000064400000000000000000002425211046102023000172540ustar 00000000000000; This file is generated from a similarly-named Perl script in the BoringSSL ; source tree. Do not edit by hand. %ifidn __OUTPUT_FORMAT__, win64 default rel %define XMMWORD %define YMMWORD %define ZMMWORD %define _CET_ENDBR %include "ring_core_generated/prefix_symbols_nasm.inc" section .text code align=64 section .rdata rdata align=8 ALIGN 64 $L$poly: DQ 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 $L$One: DD 1,1,1,1,1,1,1,1 $L$Two: DD 2,2,2,2,2,2,2,2 $L$Three: DD 3,3,3,3,3,3,3,3 $L$ONE_mont: DQ 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe $L$ord: DQ 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 $L$ordK: DQ 0xccd1c8aaee00bc4f section .text global ecp_nistz256_neg ALIGN 32 ecp_nistz256_neg: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_ecp_nistz256_neg: mov rdi,rcx mov rsi,rdx _CET_ENDBR push r12 push r13 $L$neg_body: xor r8,r8 xor r9,r9 xor r10,r10 xor r11,r11 xor r13,r13 sub r8,QWORD[rsi] sbb r9,QWORD[8+rsi] sbb r10,QWORD[16+rsi] mov rax,r8 sbb r11,QWORD[24+rsi] lea rsi,[$L$poly] mov rdx,r9 sbb r13,0 add r8,QWORD[rsi] mov rcx,r10 adc r9,QWORD[8+rsi] adc r10,QWORD[16+rsi] mov r12,r11 adc r11,QWORD[24+rsi] test r13,r13 cmovz r8,rax cmovz r9,rdx mov QWORD[rdi],r8 cmovz r10,rcx mov QWORD[8+rdi],r9 cmovz r11,r12 mov QWORD[16+rdi],r10 mov QWORD[24+rdi],r11 mov r13,QWORD[rsp] mov r12,QWORD[8+rsp] lea rsp,[16+rsp] $L$neg_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$SEH_end_ecp_nistz256_neg: global ecp_nistz256_ord_mul_mont_nohw ALIGN 32 ecp_nistz256_ord_mul_mont_nohw: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_ecp_nistz256_ord_mul_mont_nohw: mov rdi,rcx mov rsi,rdx mov rdx,r8 _CET_ENDBR push rbp push rbx push r12 push r13 push r14 push r15 $L$ord_mul_body: mov rax,QWORD[rdx] mov rbx,rdx lea r14,[$L$ord] mov r15,QWORD[$L$ordK] mov rcx,rax mul QWORD[rsi] mov r8,rax mov rax,rcx mov r9,rdx mul QWORD[8+rsi] add r9,rax mov rax,rcx adc rdx,0 mov r10,rdx mul QWORD[16+rsi] add r10,rax mov rax,rcx adc rdx,0 mov r13,r8 imul r8,r15 mov r11,rdx mul QWORD[24+rsi] add r11,rax mov rax,r8 adc rdx,0 mov r12,rdx mul QWORD[r14] mov rbp,r8 add r13,rax mov rax,r8 adc rdx,0 mov rcx,rdx sub r10,r8 sbb r8,0 mul QWORD[8+r14] add r9,rcx adc rdx,0 add r9,rax mov rax,rbp adc 
r10,rdx mov rdx,rbp adc r8,0 shl rax,32 shr rdx,32 sub r11,rax mov rax,QWORD[8+rbx] sbb rbp,rdx add r11,r8 adc r12,rbp adc r13,0 mov rcx,rax mul QWORD[rsi] add r9,rax mov rax,rcx adc rdx,0 mov rbp,rdx mul QWORD[8+rsi] add r10,rbp adc rdx,0 add r10,rax mov rax,rcx adc rdx,0 mov rbp,rdx mul QWORD[16+rsi] add r11,rbp adc rdx,0 add r11,rax mov rax,rcx adc rdx,0 mov rcx,r9 imul r9,r15 mov rbp,rdx mul QWORD[24+rsi] add r12,rbp adc rdx,0 xor r8,r8 add r12,rax mov rax,r9 adc r13,rdx adc r8,0 mul QWORD[r14] mov rbp,r9 add rcx,rax mov rax,r9 adc rcx,rdx sub r11,r9 sbb r9,0 mul QWORD[8+r14] add r10,rcx adc rdx,0 add r10,rax mov rax,rbp adc r11,rdx mov rdx,rbp adc r9,0 shl rax,32 shr rdx,32 sub r12,rax mov rax,QWORD[16+rbx] sbb rbp,rdx add r12,r9 adc r13,rbp adc r8,0 mov rcx,rax mul QWORD[rsi] add r10,rax mov rax,rcx adc rdx,0 mov rbp,rdx mul QWORD[8+rsi] add r11,rbp adc rdx,0 add r11,rax mov rax,rcx adc rdx,0 mov rbp,rdx mul QWORD[16+rsi] add r12,rbp adc rdx,0 add r12,rax mov rax,rcx adc rdx,0 mov rcx,r10 imul r10,r15 mov rbp,rdx mul QWORD[24+rsi] add r13,rbp adc rdx,0 xor r9,r9 add r13,rax mov rax,r10 adc r8,rdx adc r9,0 mul QWORD[r14] mov rbp,r10 add rcx,rax mov rax,r10 adc rcx,rdx sub r12,r10 sbb r10,0 mul QWORD[8+r14] add r11,rcx adc rdx,0 add r11,rax mov rax,rbp adc r12,rdx mov rdx,rbp adc r10,0 shl rax,32 shr rdx,32 sub r13,rax mov rax,QWORD[24+rbx] sbb rbp,rdx add r13,r10 adc r8,rbp adc r9,0 mov rcx,rax mul QWORD[rsi] add r11,rax mov rax,rcx adc rdx,0 mov rbp,rdx mul QWORD[8+rsi] add r12,rbp adc rdx,0 add r12,rax mov rax,rcx adc rdx,0 mov rbp,rdx mul QWORD[16+rsi] add r13,rbp adc rdx,0 add r13,rax mov rax,rcx adc rdx,0 mov rcx,r11 imul r11,r15 mov rbp,rdx mul QWORD[24+rsi] add r8,rbp adc rdx,0 xor r10,r10 add r8,rax mov rax,r11 adc r9,rdx adc r10,0 mul QWORD[r14] mov rbp,r11 add rcx,rax mov rax,r11 adc rcx,rdx sub r13,r11 sbb r11,0 mul QWORD[8+r14] add r12,rcx adc rdx,0 add r12,rax mov rax,rbp adc r13,rdx mov rdx,rbp adc r11,0 shl rax,32 shr rdx,32 sub r8,rax sbb rbp,rdx add r8,r11 adc r9,rbp adc r10,0 mov rsi,r12 sub r12,QWORD[r14] mov r11,r13 sbb r13,QWORD[8+r14] mov rcx,r8 sbb r8,QWORD[16+r14] mov rbp,r9 sbb r9,QWORD[24+r14] sbb r10,0 cmovc r12,rsi cmovc r13,r11 cmovc r8,rcx cmovc r9,rbp mov QWORD[rdi],r12 mov QWORD[8+rdi],r13 mov QWORD[16+rdi],r8 mov QWORD[24+rdi],r9 mov r15,QWORD[rsp] mov r14,QWORD[8+rsp] mov r13,QWORD[16+rsp] mov r12,QWORD[24+rsp] mov rbx,QWORD[32+rsp] mov rbp,QWORD[40+rsp] lea rsp,[48+rsp] $L$ord_mul_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$SEH_end_ecp_nistz256_ord_mul_mont_nohw: global ecp_nistz256_ord_sqr_mont_nohw ALIGN 32 ecp_nistz256_ord_sqr_mont_nohw: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_ecp_nistz256_ord_sqr_mont_nohw: mov rdi,rcx mov rsi,rdx mov rdx,r8 _CET_ENDBR push rbp push rbx push r12 push r13 push r14 push r15 $L$ord_sqr_body: mov r8,QWORD[rsi] mov rax,QWORD[8+rsi] mov r14,QWORD[16+rsi] mov r15,QWORD[24+rsi] lea rsi,[$L$ord] mov rbx,rdx jmp NEAR $L$oop_ord_sqr ALIGN 32 $L$oop_ord_sqr: mov rbp,rax mul r8 mov r9,rax DB 102,72,15,110,205 mov rax,r14 mov r10,rdx mul r8 add r10,rax mov rax,r15 DB 102,73,15,110,214 adc rdx,0 mov r11,rdx mul r8 add r11,rax mov rax,r15 DB 102,73,15,110,223 adc rdx,0 mov r12,rdx mul r14 mov r13,rax mov rax,r14 mov r14,rdx mul rbp add r11,rax mov rax,r15 adc rdx,0 mov r15,rdx mul rbp add r12,rax adc rdx,0 add r12,r15 adc r13,rdx adc r14,0 xor r15,r15 mov rax,r8 add r9,r9 adc r10,r10 adc r11,r11 adc r12,r12 adc r13,r13 adc r14,r14 adc r15,0 mul rax mov 
r8,rax DB 102,72,15,126,200 mov rbp,rdx mul rax add r9,rbp adc r10,rax DB 102,72,15,126,208 adc rdx,0 mov rbp,rdx mul rax add r11,rbp adc r12,rax DB 102,72,15,126,216 adc rdx,0 mov rbp,rdx mov rcx,r8 imul r8,QWORD[32+rsi] mul rax add r13,rbp adc r14,rax mov rax,QWORD[rsi] adc r15,rdx mul r8 mov rbp,r8 add rcx,rax mov rax,QWORD[8+rsi] adc rcx,rdx sub r10,r8 sbb rbp,0 mul r8 add r9,rcx adc rdx,0 add r9,rax mov rax,r8 adc r10,rdx mov rdx,r8 adc rbp,0 mov rcx,r9 imul r9,QWORD[32+rsi] shl rax,32 shr rdx,32 sub r11,rax mov rax,QWORD[rsi] sbb r8,rdx add r11,rbp adc r8,0 mul r9 mov rbp,r9 add rcx,rax mov rax,QWORD[8+rsi] adc rcx,rdx sub r11,r9 sbb rbp,0 mul r9 add r10,rcx adc rdx,0 add r10,rax mov rax,r9 adc r11,rdx mov rdx,r9 adc rbp,0 mov rcx,r10 imul r10,QWORD[32+rsi] shl rax,32 shr rdx,32 sub r8,rax mov rax,QWORD[rsi] sbb r9,rdx add r8,rbp adc r9,0 mul r10 mov rbp,r10 add rcx,rax mov rax,QWORD[8+rsi] adc rcx,rdx sub r8,r10 sbb rbp,0 mul r10 add r11,rcx adc rdx,0 add r11,rax mov rax,r10 adc r8,rdx mov rdx,r10 adc rbp,0 mov rcx,r11 imul r11,QWORD[32+rsi] shl rax,32 shr rdx,32 sub r9,rax mov rax,QWORD[rsi] sbb r10,rdx add r9,rbp adc r10,0 mul r11 mov rbp,r11 add rcx,rax mov rax,QWORD[8+rsi] adc rcx,rdx sub r9,r11 sbb rbp,0 mul r11 add r8,rcx adc rdx,0 add r8,rax mov rax,r11 adc r9,rdx mov rdx,r11 adc rbp,0 shl rax,32 shr rdx,32 sub r10,rax sbb r11,rdx add r10,rbp adc r11,0 xor rdx,rdx add r8,r12 adc r9,r13 mov r12,r8 adc r10,r14 adc r11,r15 mov rax,r9 adc rdx,0 sub r8,QWORD[rsi] mov r14,r10 sbb r9,QWORD[8+rsi] sbb r10,QWORD[16+rsi] mov r15,r11 sbb r11,QWORD[24+rsi] sbb rdx,0 cmovc r8,r12 cmovnc rax,r9 cmovnc r14,r10 cmovnc r15,r11 dec rbx jnz NEAR $L$oop_ord_sqr mov QWORD[rdi],r8 mov QWORD[8+rdi],rax pxor xmm1,xmm1 mov QWORD[16+rdi],r14 pxor xmm2,xmm2 mov QWORD[24+rdi],r15 pxor xmm3,xmm3 mov r15,QWORD[rsp] mov r14,QWORD[8+rsp] mov r13,QWORD[16+rsp] mov r12,QWORD[24+rsp] mov rbx,QWORD[32+rsp] mov rbp,QWORD[40+rsp] lea rsp,[48+rsp] $L$ord_sqr_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$SEH_end_ecp_nistz256_ord_sqr_mont_nohw: global ecp_nistz256_ord_mul_mont_adx ALIGN 32 ecp_nistz256_ord_mul_mont_adx: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_ecp_nistz256_ord_mul_mont_adx: mov rdi,rcx mov rsi,rdx mov rdx,r8 $L$ecp_nistz256_ord_mul_mont_adx: _CET_ENDBR push rbp push rbx push r12 push r13 push r14 push r15 $L$ord_mulx_body: mov rbx,rdx mov rdx,QWORD[rdx] mov r9,QWORD[rsi] mov r10,QWORD[8+rsi] mov r11,QWORD[16+rsi] mov r12,QWORD[24+rsi] lea rsi,[((-128))+rsi] lea r14,[(($L$ord-128))] mov r15,QWORD[$L$ordK] mulx r9,r8,r9 mulx r10,rcx,r10 mulx r11,rbp,r11 add r9,rcx mulx r12,rcx,r12 mov rdx,r8 mulx rax,rdx,r15 adc r10,rbp adc r11,rcx adc r12,0 xor r13,r13 mulx rbp,rcx,QWORD[((0+128))+r14] adcx r8,rcx adox r9,rbp mulx rbp,rcx,QWORD[((8+128))+r14] adcx r9,rcx adox r10,rbp mulx rbp,rcx,QWORD[((16+128))+r14] adcx r10,rcx adox r11,rbp mulx rbp,rcx,QWORD[((24+128))+r14] mov rdx,QWORD[8+rbx] adcx r11,rcx adox r12,rbp adcx r12,r8 adox r13,r8 adc r13,0 mulx rbp,rcx,QWORD[((0+128))+rsi] adcx r9,rcx adox r10,rbp mulx rbp,rcx,QWORD[((8+128))+rsi] adcx r10,rcx adox r11,rbp mulx rbp,rcx,QWORD[((16+128))+rsi] adcx r11,rcx adox r12,rbp mulx rbp,rcx,QWORD[((24+128))+rsi] mov rdx,r9 mulx rax,rdx,r15 adcx r12,rcx adox r13,rbp adcx r13,r8 adox r8,r8 adc r8,0 mulx rbp,rcx,QWORD[((0+128))+r14] adcx r9,rcx adox r10,rbp mulx rbp,rcx,QWORD[((8+128))+r14] adcx r10,rcx adox r11,rbp mulx rbp,rcx,QWORD[((16+128))+r14] adcx r11,rcx adox r12,rbp mulx 
rbp,rcx,QWORD[((24+128))+r14] mov rdx,QWORD[16+rbx] adcx r12,rcx adox r13,rbp adcx r13,r9 adox r8,r9 adc r8,0 mulx rbp,rcx,QWORD[((0+128))+rsi] adcx r10,rcx adox r11,rbp mulx rbp,rcx,QWORD[((8+128))+rsi] adcx r11,rcx adox r12,rbp mulx rbp,rcx,QWORD[((16+128))+rsi] adcx r12,rcx adox r13,rbp mulx rbp,rcx,QWORD[((24+128))+rsi] mov rdx,r10 mulx rax,rdx,r15 adcx r13,rcx adox r8,rbp adcx r8,r9 adox r9,r9 adc r9,0 mulx rbp,rcx,QWORD[((0+128))+r14] adcx r10,rcx adox r11,rbp mulx rbp,rcx,QWORD[((8+128))+r14] adcx r11,rcx adox r12,rbp mulx rbp,rcx,QWORD[((16+128))+r14] adcx r12,rcx adox r13,rbp mulx rbp,rcx,QWORD[((24+128))+r14] mov rdx,QWORD[24+rbx] adcx r13,rcx adox r8,rbp adcx r8,r10 adox r9,r10 adc r9,0 mulx rbp,rcx,QWORD[((0+128))+rsi] adcx r11,rcx adox r12,rbp mulx rbp,rcx,QWORD[((8+128))+rsi] adcx r12,rcx adox r13,rbp mulx rbp,rcx,QWORD[((16+128))+rsi] adcx r13,rcx adox r8,rbp mulx rbp,rcx,QWORD[((24+128))+rsi] mov rdx,r11 mulx rax,rdx,r15 adcx r8,rcx adox r9,rbp adcx r9,r10 adox r10,r10 adc r10,0 mulx rbp,rcx,QWORD[((0+128))+r14] adcx r11,rcx adox r12,rbp mulx rbp,rcx,QWORD[((8+128))+r14] adcx r12,rcx adox r13,rbp mulx rbp,rcx,QWORD[((16+128))+r14] adcx r13,rcx adox r8,rbp mulx rbp,rcx,QWORD[((24+128))+r14] lea r14,[128+r14] mov rbx,r12 adcx r8,rcx adox r9,rbp mov rdx,r13 adcx r9,r11 adox r10,r11 adc r10,0 mov rcx,r8 sub r12,QWORD[r14] sbb r13,QWORD[8+r14] sbb r8,QWORD[16+r14] mov rbp,r9 sbb r9,QWORD[24+r14] sbb r10,0 cmovc r12,rbx cmovc r13,rdx cmovc r8,rcx cmovc r9,rbp mov QWORD[rdi],r12 mov QWORD[8+rdi],r13 mov QWORD[16+rdi],r8 mov QWORD[24+rdi],r9 mov r15,QWORD[rsp] mov r14,QWORD[8+rsp] mov r13,QWORD[16+rsp] mov r12,QWORD[24+rsp] mov rbx,QWORD[32+rsp] mov rbp,QWORD[40+rsp] lea rsp,[48+rsp] $L$ord_mulx_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$SEH_end_ecp_nistz256_ord_mul_mont_adx: global ecp_nistz256_ord_sqr_mont_adx ALIGN 32 ecp_nistz256_ord_sqr_mont_adx: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_ecp_nistz256_ord_sqr_mont_adx: mov rdi,rcx mov rsi,rdx mov rdx,r8 _CET_ENDBR $L$ecp_nistz256_ord_sqr_mont_adx: push rbp push rbx push r12 push r13 push r14 push r15 $L$ord_sqrx_body: mov rbx,rdx mov rdx,QWORD[rsi] mov r14,QWORD[8+rsi] mov r15,QWORD[16+rsi] mov r8,QWORD[24+rsi] lea rsi,[$L$ord] jmp NEAR $L$oop_ord_sqrx ALIGN 32 $L$oop_ord_sqrx: mulx r10,r9,r14 mulx r11,rcx,r15 mov rax,rdx DB 102,73,15,110,206 mulx r12,rbp,r8 mov rdx,r14 add r10,rcx DB 102,73,15,110,215 adc r11,rbp adc r12,0 xor r13,r13 mulx rbp,rcx,r15 adcx r11,rcx adox r12,rbp mulx rbp,rcx,r8 mov rdx,r15 adcx r12,rcx adox r13,rbp adc r13,0 mulx r14,rcx,r8 mov rdx,rax DB 102,73,15,110,216 xor r15,r15 adcx r9,r9 adox r13,rcx adcx r10,r10 adox r14,r15 mulx rbp,r8,rdx DB 102,72,15,126,202 adcx r11,r11 adox r9,rbp adcx r12,r12 mulx rax,rcx,rdx DB 102,72,15,126,210 adcx r13,r13 adox r10,rcx adcx r14,r14 mulx rbp,rcx,rdx DB 0x67 DB 102,72,15,126,218 adox r11,rax adcx r15,r15 adox r12,rcx adox r13,rbp mulx rax,rcx,rdx adox r14,rcx adox r15,rax mov rdx,r8 mulx rcx,rdx,QWORD[32+rsi] xor rax,rax mulx rbp,rcx,QWORD[rsi] adcx r8,rcx adox r9,rbp mulx rbp,rcx,QWORD[8+rsi] adcx r9,rcx adox r10,rbp mulx rbp,rcx,QWORD[16+rsi] adcx r10,rcx adox r11,rbp mulx rbp,rcx,QWORD[24+rsi] adcx r11,rcx adox r8,rbp adcx r8,rax mov rdx,r9 mulx rcx,rdx,QWORD[32+rsi] mulx rbp,rcx,QWORD[rsi] adox r9,rcx adcx r10,rbp mulx rbp,rcx,QWORD[8+rsi] adox r10,rcx adcx r11,rbp mulx rbp,rcx,QWORD[16+rsi] adox r11,rcx adcx r8,rbp mulx rbp,rcx,QWORD[24+rsi] adox r8,rcx adcx r9,rbp adox r9,rax mov 
rdx,r10 mulx rcx,rdx,QWORD[32+rsi] mulx rbp,rcx,QWORD[rsi] adcx r10,rcx adox r11,rbp mulx rbp,rcx,QWORD[8+rsi] adcx r11,rcx adox r8,rbp mulx rbp,rcx,QWORD[16+rsi] adcx r8,rcx adox r9,rbp mulx rbp,rcx,QWORD[24+rsi] adcx r9,rcx adox r10,rbp adcx r10,rax mov rdx,r11 mulx rcx,rdx,QWORD[32+rsi] mulx rbp,rcx,QWORD[rsi] adox r11,rcx adcx r8,rbp mulx rbp,rcx,QWORD[8+rsi] adox r8,rcx adcx r9,rbp mulx rbp,rcx,QWORD[16+rsi] adox r9,rcx adcx r10,rbp mulx rbp,rcx,QWORD[24+rsi] adox r10,rcx adcx r11,rbp adox r11,rax add r12,r8 adc r9,r13 mov rdx,r12 adc r10,r14 adc r11,r15 mov r14,r9 adc rax,0 sub r12,QWORD[rsi] mov r15,r10 sbb r9,QWORD[8+rsi] sbb r10,QWORD[16+rsi] mov r8,r11 sbb r11,QWORD[24+rsi] sbb rax,0 cmovnc rdx,r12 cmovnc r14,r9 cmovnc r15,r10 cmovnc r8,r11 dec rbx jnz NEAR $L$oop_ord_sqrx mov QWORD[rdi],rdx mov QWORD[8+rdi],r14 pxor xmm1,xmm1 mov QWORD[16+rdi],r15 pxor xmm2,xmm2 mov QWORD[24+rdi],r8 pxor xmm3,xmm3 mov r15,QWORD[rsp] mov r14,QWORD[8+rsp] mov r13,QWORD[16+rsp] mov r12,QWORD[24+rsp] mov rbx,QWORD[32+rsp] mov rbp,QWORD[40+rsp] lea rsp,[48+rsp] $L$ord_sqrx_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$SEH_end_ecp_nistz256_ord_sqr_mont_adx: global ecp_nistz256_mul_mont_nohw ALIGN 32 ecp_nistz256_mul_mont_nohw: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_ecp_nistz256_mul_mont_nohw: mov rdi,rcx mov rsi,rdx mov rdx,r8 _CET_ENDBR push rbp push rbx push r12 push r13 push r14 push r15 $L$mul_body: mov rbx,rdx mov rax,QWORD[rdx] mov r9,QWORD[rsi] mov r10,QWORD[8+rsi] mov r11,QWORD[16+rsi] mov r12,QWORD[24+rsi] call __ecp_nistz256_mul_montq mov r15,QWORD[rsp] mov r14,QWORD[8+rsp] mov r13,QWORD[16+rsp] mov r12,QWORD[24+rsp] mov rbx,QWORD[32+rsp] mov rbp,QWORD[40+rsp] lea rsp,[48+rsp] $L$mul_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$SEH_end_ecp_nistz256_mul_mont_nohw: ALIGN 32 __ecp_nistz256_mul_montq: mov rbp,rax mul r9 mov r14,QWORD[(($L$poly+8))] mov r8,rax mov rax,rbp mov r9,rdx mul r10 mov r15,QWORD[(($L$poly+24))] add r9,rax mov rax,rbp adc rdx,0 mov r10,rdx mul r11 add r10,rax mov rax,rbp adc rdx,0 mov r11,rdx mul r12 add r11,rax mov rax,r8 adc rdx,0 xor r13,r13 mov r12,rdx mov rbp,r8 shl r8,32 mul r15 shr rbp,32 add r9,r8 adc r10,rbp adc r11,rax mov rax,QWORD[8+rbx] adc r12,rdx adc r13,0 xor r8,r8 mov rbp,rax mul QWORD[rsi] add r9,rax mov rax,rbp adc rdx,0 mov rcx,rdx mul QWORD[8+rsi] add r10,rcx adc rdx,0 add r10,rax mov rax,rbp adc rdx,0 mov rcx,rdx mul QWORD[16+rsi] add r11,rcx adc rdx,0 add r11,rax mov rax,rbp adc rdx,0 mov rcx,rdx mul QWORD[24+rsi] add r12,rcx adc rdx,0 add r12,rax mov rax,r9 adc r13,rdx adc r8,0 mov rbp,r9 shl r9,32 mul r15 shr rbp,32 add r10,r9 adc r11,rbp adc r12,rax mov rax,QWORD[16+rbx] adc r13,rdx adc r8,0 xor r9,r9 mov rbp,rax mul QWORD[rsi] add r10,rax mov rax,rbp adc rdx,0 mov rcx,rdx mul QWORD[8+rsi] add r11,rcx adc rdx,0 add r11,rax mov rax,rbp adc rdx,0 mov rcx,rdx mul QWORD[16+rsi] add r12,rcx adc rdx,0 add r12,rax mov rax,rbp adc rdx,0 mov rcx,rdx mul QWORD[24+rsi] add r13,rcx adc rdx,0 add r13,rax mov rax,r10 adc r8,rdx adc r9,0 mov rbp,r10 shl r10,32 mul r15 shr rbp,32 add r11,r10 adc r12,rbp adc r13,rax mov rax,QWORD[24+rbx] adc r8,rdx adc r9,0 xor r10,r10 mov rbp,rax mul QWORD[rsi] add r11,rax mov rax,rbp adc rdx,0 mov rcx,rdx mul QWORD[8+rsi] add r12,rcx adc rdx,0 add r12,rax mov rax,rbp adc rdx,0 mov rcx,rdx mul QWORD[16+rsi] add r13,rcx adc rdx,0 add r13,rax mov rax,rbp adc rdx,0 mov rcx,rdx mul QWORD[24+rsi] add r8,rcx adc rdx,0 add r8,rax 
mov rax,r11 adc r9,rdx adc r10,0 mov rbp,r11 shl r11,32 mul r15 shr rbp,32 add r12,r11 adc r13,rbp mov rcx,r12 adc r8,rax adc r9,rdx mov rbp,r13 adc r10,0 sub r12,-1 mov rbx,r8 sbb r13,r14 sbb r8,0 mov rdx,r9 sbb r9,r15 sbb r10,0 cmovc r12,rcx cmovc r13,rbp mov QWORD[rdi],r12 cmovc r8,rbx mov QWORD[8+rdi],r13 cmovc r9,rdx mov QWORD[16+rdi],r8 mov QWORD[24+rdi],r9 ret global ecp_nistz256_sqr_mont_nohw ALIGN 32 ecp_nistz256_sqr_mont_nohw: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_ecp_nistz256_sqr_mont_nohw: mov rdi,rcx mov rsi,rdx _CET_ENDBR push rbp push rbx push r12 push r13 push r14 push r15 $L$sqr_body: mov rax,QWORD[rsi] mov r14,QWORD[8+rsi] mov r15,QWORD[16+rsi] mov r8,QWORD[24+rsi] call __ecp_nistz256_sqr_montq mov r15,QWORD[rsp] mov r14,QWORD[8+rsp] mov r13,QWORD[16+rsp] mov r12,QWORD[24+rsp] mov rbx,QWORD[32+rsp] mov rbp,QWORD[40+rsp] lea rsp,[48+rsp] $L$sqr_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$SEH_end_ecp_nistz256_sqr_mont_nohw: ALIGN 32 __ecp_nistz256_sqr_montq: mov r13,rax mul r14 mov r9,rax mov rax,r15 mov r10,rdx mul r13 add r10,rax mov rax,r8 adc rdx,0 mov r11,rdx mul r13 add r11,rax mov rax,r15 adc rdx,0 mov r12,rdx mul r14 add r11,rax mov rax,r8 adc rdx,0 mov rbp,rdx mul r14 add r12,rax mov rax,r8 adc rdx,0 add r12,rbp mov r13,rdx adc r13,0 mul r15 xor r15,r15 add r13,rax mov rax,QWORD[rsi] mov r14,rdx adc r14,0 add r9,r9 adc r10,r10 adc r11,r11 adc r12,r12 adc r13,r13 adc r14,r14 adc r15,0 mul rax mov r8,rax mov rax,QWORD[8+rsi] mov rcx,rdx mul rax add r9,rcx adc r10,rax mov rax,QWORD[16+rsi] adc rdx,0 mov rcx,rdx mul rax add r11,rcx adc r12,rax mov rax,QWORD[24+rsi] adc rdx,0 mov rcx,rdx mul rax add r13,rcx adc r14,rax mov rax,r8 adc r15,rdx mov rsi,QWORD[(($L$poly+8))] mov rbp,QWORD[(($L$poly+24))] mov rcx,r8 shl r8,32 mul rbp shr rcx,32 add r9,r8 adc r10,rcx adc r11,rax mov rax,r9 adc rdx,0 mov rcx,r9 shl r9,32 mov r8,rdx mul rbp shr rcx,32 add r10,r9 adc r11,rcx adc r8,rax mov rax,r10 adc rdx,0 mov rcx,r10 shl r10,32 mov r9,rdx mul rbp shr rcx,32 add r11,r10 adc r8,rcx adc r9,rax mov rax,r11 adc rdx,0 mov rcx,r11 shl r11,32 mov r10,rdx mul rbp shr rcx,32 add r8,r11 adc r9,rcx adc r10,rax adc rdx,0 xor r11,r11 add r12,r8 adc r13,r9 mov r8,r12 adc r14,r10 adc r15,rdx mov r9,r13 adc r11,0 sub r12,-1 mov r10,r14 sbb r13,rsi sbb r14,0 mov rcx,r15 sbb r15,rbp sbb r11,0 cmovc r12,r8 cmovc r13,r9 mov QWORD[rdi],r12 cmovc r14,r10 mov QWORD[8+rdi],r13 cmovc r15,rcx mov QWORD[16+rdi],r14 mov QWORD[24+rdi],r15 ret global ecp_nistz256_mul_mont_adx ALIGN 32 ecp_nistz256_mul_mont_adx: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_ecp_nistz256_mul_mont_adx: mov rdi,rcx mov rsi,rdx mov rdx,r8 _CET_ENDBR push rbp push rbx push r12 push r13 push r14 push r15 $L$mulx_body: mov rbx,rdx mov rdx,QWORD[rdx] mov r9,QWORD[rsi] mov r10,QWORD[8+rsi] mov r11,QWORD[16+rsi] mov r12,QWORD[24+rsi] lea rsi,[((-128))+rsi] call __ecp_nistz256_mul_montx mov r15,QWORD[rsp] mov r14,QWORD[8+rsp] mov r13,QWORD[16+rsp] mov r12,QWORD[24+rsp] mov rbx,QWORD[32+rsp] mov rbp,QWORD[40+rsp] lea rsp,[48+rsp] $L$mulx_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$SEH_end_ecp_nistz256_mul_mont_adx: ALIGN 32 __ecp_nistz256_mul_montx: mulx r9,r8,r9 mulx r10,rcx,r10 mov r14,32 xor r13,r13 mulx r11,rbp,r11 mov r15,QWORD[(($L$poly+24))] adc r9,rcx mulx r12,rcx,r12 mov rdx,r8 adc r10,rbp shlx rbp,r8,r14 adc r11,rcx shrx rcx,r8,r14 adc r12,0 add r9,rbp adc r10,rcx mulx rbp,rcx,r15 
mov rdx,QWORD[8+rbx] adc r11,rcx adc r12,rbp adc r13,0 xor r8,r8 mulx rbp,rcx,QWORD[((0+128))+rsi] adcx r9,rcx adox r10,rbp mulx rbp,rcx,QWORD[((8+128))+rsi] adcx r10,rcx adox r11,rbp mulx rbp,rcx,QWORD[((16+128))+rsi] adcx r11,rcx adox r12,rbp mulx rbp,rcx,QWORD[((24+128))+rsi] mov rdx,r9 adcx r12,rcx shlx rcx,r9,r14 adox r13,rbp shrx rbp,r9,r14 adcx r13,r8 adox r8,r8 adc r8,0 add r10,rcx adc r11,rbp mulx rbp,rcx,r15 mov rdx,QWORD[16+rbx] adc r12,rcx adc r13,rbp adc r8,0 xor r9,r9 mulx rbp,rcx,QWORD[((0+128))+rsi] adcx r10,rcx adox r11,rbp mulx rbp,rcx,QWORD[((8+128))+rsi] adcx r11,rcx adox r12,rbp mulx rbp,rcx,QWORD[((16+128))+rsi] adcx r12,rcx adox r13,rbp mulx rbp,rcx,QWORD[((24+128))+rsi] mov rdx,r10 adcx r13,rcx shlx rcx,r10,r14 adox r8,rbp shrx rbp,r10,r14 adcx r8,r9 adox r9,r9 adc r9,0 add r11,rcx adc r12,rbp mulx rbp,rcx,r15 mov rdx,QWORD[24+rbx] adc r13,rcx adc r8,rbp adc r9,0 xor r10,r10 mulx rbp,rcx,QWORD[((0+128))+rsi] adcx r11,rcx adox r12,rbp mulx rbp,rcx,QWORD[((8+128))+rsi] adcx r12,rcx adox r13,rbp mulx rbp,rcx,QWORD[((16+128))+rsi] adcx r13,rcx adox r8,rbp mulx rbp,rcx,QWORD[((24+128))+rsi] mov rdx,r11 adcx r8,rcx shlx rcx,r11,r14 adox r9,rbp shrx rbp,r11,r14 adcx r9,r10 adox r10,r10 adc r10,0 add r12,rcx adc r13,rbp mulx rbp,rcx,r15 mov rbx,r12 mov r14,QWORD[(($L$poly+8))] adc r8,rcx mov rdx,r13 adc r9,rbp adc r10,0 xor eax,eax mov rcx,r8 sbb r12,-1 sbb r13,r14 sbb r8,0 mov rbp,r9 sbb r9,r15 sbb r10,0 cmovc r12,rbx cmovc r13,rdx mov QWORD[rdi],r12 cmovc r8,rcx mov QWORD[8+rdi],r13 cmovc r9,rbp mov QWORD[16+rdi],r8 mov QWORD[24+rdi],r9 ret global ecp_nistz256_sqr_mont_adx ALIGN 32 ecp_nistz256_sqr_mont_adx: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_ecp_nistz256_sqr_mont_adx: mov rdi,rcx mov rsi,rdx _CET_ENDBR push rbp push rbx push r12 push r13 push r14 push r15 $L$sqrx_body: mov rdx,QWORD[rsi] mov r14,QWORD[8+rsi] mov r15,QWORD[16+rsi] mov r8,QWORD[24+rsi] lea rsi,[((-128))+rsi] call __ecp_nistz256_sqr_montx mov r15,QWORD[rsp] mov r14,QWORD[8+rsp] mov r13,QWORD[16+rsp] mov r12,QWORD[24+rsp] mov rbx,QWORD[32+rsp] mov rbp,QWORD[40+rsp] lea rsp,[48+rsp] $L$sqrx_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$SEH_end_ecp_nistz256_sqr_mont_adx: ALIGN 32 __ecp_nistz256_sqr_montx: mulx r10,r9,r14 mulx r11,rcx,r15 xor eax,eax adc r10,rcx mulx r12,rbp,r8 mov rdx,r14 adc r11,rbp adc r12,0 xor r13,r13 mulx rbp,rcx,r15 adcx r11,rcx adox r12,rbp mulx rbp,rcx,r8 mov rdx,r15 adcx r12,rcx adox r13,rbp adc r13,0 mulx r14,rcx,r8 mov rdx,QWORD[((0+128))+rsi] xor r15,r15 adcx r9,r9 adox r13,rcx adcx r10,r10 adox r14,r15 mulx rbp,r8,rdx mov rdx,QWORD[((8+128))+rsi] adcx r11,r11 adox r9,rbp adcx r12,r12 mulx rax,rcx,rdx mov rdx,QWORD[((16+128))+rsi] adcx r13,r13 adox r10,rcx adcx r14,r14 DB 0x67 mulx rbp,rcx,rdx mov rdx,QWORD[((24+128))+rsi] adox r11,rax adcx r15,r15 adox r12,rcx mov rsi,32 adox r13,rbp DB 0x67,0x67 mulx rax,rcx,rdx mov rdx,QWORD[(($L$poly+24))] adox r14,rcx shlx rcx,r8,rsi adox r15,rax shrx rax,r8,rsi mov rbp,rdx add r9,rcx adc r10,rax mulx r8,rcx,r8 adc r11,rcx shlx rcx,r9,rsi adc r8,0 shrx rax,r9,rsi add r10,rcx adc r11,rax mulx r9,rcx,r9 adc r8,rcx shlx rcx,r10,rsi adc r9,0 shrx rax,r10,rsi add r11,rcx adc r8,rax mulx r10,rcx,r10 adc r9,rcx shlx rcx,r11,rsi adc r10,0 shrx rax,r11,rsi add r8,rcx adc r9,rax mulx r11,rcx,r11 adc r10,rcx adc r11,0 xor rdx,rdx add r12,r8 mov rsi,QWORD[(($L$poly+8))] adc r13,r9 mov r8,r12 adc r14,r10 adc r15,r11 mov r9,r13 adc rdx,0 sub r12,-1 mov r10,r14 sbb r13,rsi sbb 
r14,0 mov r11,r15 sbb r15,rbp sbb rdx,0 cmovc r12,r8 cmovc r13,r9 mov QWORD[rdi],r12 cmovc r14,r10 mov QWORD[8+rdi],r13 cmovc r15,r11 mov QWORD[16+rdi],r14 mov QWORD[24+rdi],r15 ret global ecp_nistz256_select_w5_nohw ALIGN 32 ecp_nistz256_select_w5_nohw: _CET_ENDBR lea rax,[((-136))+rsp] $L$SEH_begin_ecp_nistz256_select_w5_nohw: DB 0x48,0x8d,0x60,0xe0 DB 0x0f,0x29,0x70,0xe0 DB 0x0f,0x29,0x78,0xf0 DB 0x44,0x0f,0x29,0x00 DB 0x44,0x0f,0x29,0x48,0x10 DB 0x44,0x0f,0x29,0x50,0x20 DB 0x44,0x0f,0x29,0x58,0x30 DB 0x44,0x0f,0x29,0x60,0x40 DB 0x44,0x0f,0x29,0x68,0x50 DB 0x44,0x0f,0x29,0x70,0x60 DB 0x44,0x0f,0x29,0x78,0x70 movdqa xmm0,XMMWORD[$L$One] movd xmm1,r8d pxor xmm2,xmm2 pxor xmm3,xmm3 pxor xmm4,xmm4 pxor xmm5,xmm5 pxor xmm6,xmm6 pxor xmm7,xmm7 movdqa xmm8,xmm0 pshufd xmm1,xmm1,0 mov rax,16 $L$select_loop_sse_w5: movdqa xmm15,xmm8 paddd xmm8,xmm0 pcmpeqd xmm15,xmm1 movdqa xmm9,XMMWORD[rdx] movdqa xmm10,XMMWORD[16+rdx] movdqa xmm11,XMMWORD[32+rdx] movdqa xmm12,XMMWORD[48+rdx] movdqa xmm13,XMMWORD[64+rdx] movdqa xmm14,XMMWORD[80+rdx] lea rdx,[96+rdx] pand xmm9,xmm15 pand xmm10,xmm15 por xmm2,xmm9 pand xmm11,xmm15 por xmm3,xmm10 pand xmm12,xmm15 por xmm4,xmm11 pand xmm13,xmm15 por xmm5,xmm12 pand xmm14,xmm15 por xmm6,xmm13 por xmm7,xmm14 dec rax jnz NEAR $L$select_loop_sse_w5 movdqu XMMWORD[rcx],xmm2 movdqu XMMWORD[16+rcx],xmm3 movdqu XMMWORD[32+rcx],xmm4 movdqu XMMWORD[48+rcx],xmm5 movdqu XMMWORD[64+rcx],xmm6 movdqu XMMWORD[80+rcx],xmm7 movaps xmm6,XMMWORD[rsp] movaps xmm7,XMMWORD[16+rsp] movaps xmm8,XMMWORD[32+rsp] movaps xmm9,XMMWORD[48+rsp] movaps xmm10,XMMWORD[64+rsp] movaps xmm11,XMMWORD[80+rsp] movaps xmm12,XMMWORD[96+rsp] movaps xmm13,XMMWORD[112+rsp] movaps xmm14,XMMWORD[128+rsp] movaps xmm15,XMMWORD[144+rsp] lea rsp,[168+rsp] ret $L$SEH_end_ecp_nistz256_select_w5_nohw: global ecp_nistz256_select_w7_nohw ALIGN 32 ecp_nistz256_select_w7_nohw: _CET_ENDBR lea rax,[((-136))+rsp] $L$SEH_begin_ecp_nistz256_select_w7_nohw: DB 0x48,0x8d,0x60,0xe0 DB 0x0f,0x29,0x70,0xe0 DB 0x0f,0x29,0x78,0xf0 DB 0x44,0x0f,0x29,0x00 DB 0x44,0x0f,0x29,0x48,0x10 DB 0x44,0x0f,0x29,0x50,0x20 DB 0x44,0x0f,0x29,0x58,0x30 DB 0x44,0x0f,0x29,0x60,0x40 DB 0x44,0x0f,0x29,0x68,0x50 DB 0x44,0x0f,0x29,0x70,0x60 DB 0x44,0x0f,0x29,0x78,0x70 movdqa xmm8,XMMWORD[$L$One] movd xmm1,r8d pxor xmm2,xmm2 pxor xmm3,xmm3 pxor xmm4,xmm4 pxor xmm5,xmm5 movdqa xmm0,xmm8 pshufd xmm1,xmm1,0 mov rax,64 $L$select_loop_sse_w7: movdqa xmm15,xmm8 paddd xmm8,xmm0 movdqa xmm9,XMMWORD[rdx] movdqa xmm10,XMMWORD[16+rdx] pcmpeqd xmm15,xmm1 movdqa xmm11,XMMWORD[32+rdx] movdqa xmm12,XMMWORD[48+rdx] lea rdx,[64+rdx] pand xmm9,xmm15 pand xmm10,xmm15 por xmm2,xmm9 pand xmm11,xmm15 por xmm3,xmm10 pand xmm12,xmm15 por xmm4,xmm11 prefetcht0 [255+rdx] por xmm5,xmm12 dec rax jnz NEAR $L$select_loop_sse_w7 movdqu XMMWORD[rcx],xmm2 movdqu XMMWORD[16+rcx],xmm3 movdqu XMMWORD[32+rcx],xmm4 movdqu XMMWORD[48+rcx],xmm5 movaps xmm6,XMMWORD[rsp] movaps xmm7,XMMWORD[16+rsp] movaps xmm8,XMMWORD[32+rsp] movaps xmm9,XMMWORD[48+rsp] movaps xmm10,XMMWORD[64+rsp] movaps xmm11,XMMWORD[80+rsp] movaps xmm12,XMMWORD[96+rsp] movaps xmm13,XMMWORD[112+rsp] movaps xmm14,XMMWORD[128+rsp] movaps xmm15,XMMWORD[144+rsp] lea rsp,[168+rsp] ret $L$SEH_end_ecp_nistz256_select_w7_nohw: global ecp_nistz256_select_w5_avx2 ALIGN 32 ecp_nistz256_select_w5_avx2: _CET_ENDBR vzeroupper lea rax,[((-136))+rsp] mov r11,rsp $L$SEH_begin_ecp_nistz256_select_w5_avx2: DB 0x48,0x8d,0x60,0xe0 DB 0xc5,0xf8,0x29,0x70,0xe0 DB 0xc5,0xf8,0x29,0x78,0xf0 DB 0xc5,0x78,0x29,0x40,0x00 DB 0xc5,0x78,0x29,0x48,0x10 DB 
0xc5,0x78,0x29,0x50,0x20 DB 0xc5,0x78,0x29,0x58,0x30 DB 0xc5,0x78,0x29,0x60,0x40 DB 0xc5,0x78,0x29,0x68,0x50 DB 0xc5,0x78,0x29,0x70,0x60 DB 0xc5,0x78,0x29,0x78,0x70 vmovdqa ymm0,YMMWORD[$L$Two] vpxor ymm2,ymm2,ymm2 vpxor ymm3,ymm3,ymm3 vpxor ymm4,ymm4,ymm4 vmovdqa ymm5,YMMWORD[$L$One] vmovdqa ymm10,YMMWORD[$L$Two] vmovd xmm1,r8d vpermd ymm1,ymm2,ymm1 mov rax,8 $L$select_loop_avx2_w5: vmovdqa ymm6,YMMWORD[rdx] vmovdqa ymm7,YMMWORD[32+rdx] vmovdqa ymm8,YMMWORD[64+rdx] vmovdqa ymm11,YMMWORD[96+rdx] vmovdqa ymm12,YMMWORD[128+rdx] vmovdqa ymm13,YMMWORD[160+rdx] vpcmpeqd ymm9,ymm5,ymm1 vpcmpeqd ymm14,ymm10,ymm1 vpaddd ymm5,ymm5,ymm0 vpaddd ymm10,ymm10,ymm0 lea rdx,[192+rdx] vpand ymm6,ymm6,ymm9 vpand ymm7,ymm7,ymm9 vpand ymm8,ymm8,ymm9 vpand ymm11,ymm11,ymm14 vpand ymm12,ymm12,ymm14 vpand ymm13,ymm13,ymm14 vpxor ymm2,ymm2,ymm6 vpxor ymm3,ymm3,ymm7 vpxor ymm4,ymm4,ymm8 vpxor ymm2,ymm2,ymm11 vpxor ymm3,ymm3,ymm12 vpxor ymm4,ymm4,ymm13 dec rax jnz NEAR $L$select_loop_avx2_w5 vmovdqu YMMWORD[rcx],ymm2 vmovdqu YMMWORD[32+rcx],ymm3 vmovdqu YMMWORD[64+rcx],ymm4 vzeroupper movaps xmm6,XMMWORD[rsp] movaps xmm7,XMMWORD[16+rsp] movaps xmm8,XMMWORD[32+rsp] movaps xmm9,XMMWORD[48+rsp] movaps xmm10,XMMWORD[64+rsp] movaps xmm11,XMMWORD[80+rsp] movaps xmm12,XMMWORD[96+rsp] movaps xmm13,XMMWORD[112+rsp] movaps xmm14,XMMWORD[128+rsp] movaps xmm15,XMMWORD[144+rsp] lea rsp,[r11] ret $L$SEH_end_ecp_nistz256_select_w5_avx2: global ecp_nistz256_select_w7_avx2 ALIGN 32 ecp_nistz256_select_w7_avx2: _CET_ENDBR vzeroupper mov r11,rsp lea rax,[((-136))+rsp] $L$SEH_begin_ecp_nistz256_select_w7_avx2: DB 0x48,0x8d,0x60,0xe0 DB 0xc5,0xf8,0x29,0x70,0xe0 DB 0xc5,0xf8,0x29,0x78,0xf0 DB 0xc5,0x78,0x29,0x40,0x00 DB 0xc5,0x78,0x29,0x48,0x10 DB 0xc5,0x78,0x29,0x50,0x20 DB 0xc5,0x78,0x29,0x58,0x30 DB 0xc5,0x78,0x29,0x60,0x40 DB 0xc5,0x78,0x29,0x68,0x50 DB 0xc5,0x78,0x29,0x70,0x60 DB 0xc5,0x78,0x29,0x78,0x70 vmovdqa ymm0,YMMWORD[$L$Three] vpxor ymm2,ymm2,ymm2 vpxor ymm3,ymm3,ymm3 vmovdqa ymm4,YMMWORD[$L$One] vmovdqa ymm8,YMMWORD[$L$Two] vmovdqa ymm12,YMMWORD[$L$Three] vmovd xmm1,r8d vpermd ymm1,ymm2,ymm1 mov rax,21 $L$select_loop_avx2_w7: vmovdqa ymm5,YMMWORD[rdx] vmovdqa ymm6,YMMWORD[32+rdx] vmovdqa ymm9,YMMWORD[64+rdx] vmovdqa ymm10,YMMWORD[96+rdx] vmovdqa ymm13,YMMWORD[128+rdx] vmovdqa ymm14,YMMWORD[160+rdx] vpcmpeqd ymm7,ymm4,ymm1 vpcmpeqd ymm11,ymm8,ymm1 vpcmpeqd ymm15,ymm12,ymm1 vpaddd ymm4,ymm4,ymm0 vpaddd ymm8,ymm8,ymm0 vpaddd ymm12,ymm12,ymm0 lea rdx,[192+rdx] vpand ymm5,ymm5,ymm7 vpand ymm6,ymm6,ymm7 vpand ymm9,ymm9,ymm11 vpand ymm10,ymm10,ymm11 vpand ymm13,ymm13,ymm15 vpand ymm14,ymm14,ymm15 vpxor ymm2,ymm2,ymm5 vpxor ymm3,ymm3,ymm6 vpxor ymm2,ymm2,ymm9 vpxor ymm3,ymm3,ymm10 vpxor ymm2,ymm2,ymm13 vpxor ymm3,ymm3,ymm14 dec rax jnz NEAR $L$select_loop_avx2_w7 vmovdqa ymm5,YMMWORD[rdx] vmovdqa ymm6,YMMWORD[32+rdx] vpcmpeqd ymm7,ymm4,ymm1 vpand ymm5,ymm5,ymm7 vpand ymm6,ymm6,ymm7 vpxor ymm2,ymm2,ymm5 vpxor ymm3,ymm3,ymm6 vmovdqu YMMWORD[rcx],ymm2 vmovdqu YMMWORD[32+rcx],ymm3 vzeroupper movaps xmm6,XMMWORD[rsp] movaps xmm7,XMMWORD[16+rsp] movaps xmm8,XMMWORD[32+rsp] movaps xmm9,XMMWORD[48+rsp] movaps xmm10,XMMWORD[64+rsp] movaps xmm11,XMMWORD[80+rsp] movaps xmm12,XMMWORD[96+rsp] movaps xmm13,XMMWORD[112+rsp] movaps xmm14,XMMWORD[128+rsp] movaps xmm15,XMMWORD[144+rsp] lea rsp,[r11] ret $L$SEH_end_ecp_nistz256_select_w7_avx2: ALIGN 32 __ecp_nistz256_add_toq: xor r11,r11 add r12,QWORD[rbx] adc r13,QWORD[8+rbx] mov rax,r12 adc r8,QWORD[16+rbx] adc r9,QWORD[24+rbx] mov rbp,r13 adc r11,0 sub r12,-1 mov rcx,r8 sbb r13,r14 sbb r8,0 mov 
r10,r9 sbb r9,r15 sbb r11,0 cmovc r12,rax cmovc r13,rbp mov QWORD[rdi],r12 cmovc r8,rcx mov QWORD[8+rdi],r13 cmovc r9,r10 mov QWORD[16+rdi],r8 mov QWORD[24+rdi],r9 ret ALIGN 32 __ecp_nistz256_sub_fromq: sub r12,QWORD[rbx] sbb r13,QWORD[8+rbx] mov rax,r12 sbb r8,QWORD[16+rbx] sbb r9,QWORD[24+rbx] mov rbp,r13 sbb r11,r11 add r12,-1 mov rcx,r8 adc r13,r14 adc r8,0 mov r10,r9 adc r9,r15 test r11,r11 cmovz r12,rax cmovz r13,rbp mov QWORD[rdi],r12 cmovz r8,rcx mov QWORD[8+rdi],r13 cmovz r9,r10 mov QWORD[16+rdi],r8 mov QWORD[24+rdi],r9 ret ALIGN 32 __ecp_nistz256_subq: sub rax,r12 sbb rbp,r13 mov r12,rax sbb rcx,r8 sbb r10,r9 mov r13,rbp sbb r11,r11 add rax,-1 mov r8,rcx adc rbp,r14 adc rcx,0 mov r9,r10 adc r10,r15 test r11,r11 cmovnz r12,rax cmovnz r13,rbp cmovnz r8,rcx cmovnz r9,r10 ret ALIGN 32 __ecp_nistz256_mul_by_2q: xor r11,r11 add r12,r12 adc r13,r13 mov rax,r12 adc r8,r8 adc r9,r9 mov rbp,r13 adc r11,0 sub r12,-1 mov rcx,r8 sbb r13,r14 sbb r8,0 mov r10,r9 sbb r9,r15 sbb r11,0 cmovc r12,rax cmovc r13,rbp mov QWORD[rdi],r12 cmovc r8,rcx mov QWORD[8+rdi],r13 cmovc r9,r10 mov QWORD[16+rdi],r8 mov QWORD[24+rdi],r9 ret global ecp_nistz256_point_double_nohw ALIGN 32 ecp_nistz256_point_double_nohw: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_ecp_nistz256_point_double_nohw: mov rdi,rcx mov rsi,rdx _CET_ENDBR push rbp push rbx push r12 push r13 push r14 push r15 sub rsp,32*5+8 $L$point_doubleq_body: $L$point_double_shortcutq: movdqu xmm0,XMMWORD[rsi] mov rbx,rsi movdqu xmm1,XMMWORD[16+rsi] mov r12,QWORD[((32+0))+rsi] mov r13,QWORD[((32+8))+rsi] mov r8,QWORD[((32+16))+rsi] mov r9,QWORD[((32+24))+rsi] mov r14,QWORD[(($L$poly+8))] mov r15,QWORD[(($L$poly+24))] movdqa XMMWORD[96+rsp],xmm0 movdqa XMMWORD[(96+16)+rsp],xmm1 lea r10,[32+rdi] lea r11,[64+rdi] DB 102,72,15,110,199 DB 102,73,15,110,202 DB 102,73,15,110,211 lea rdi,[rsp] call __ecp_nistz256_mul_by_2q mov rax,QWORD[((64+0))+rsi] mov r14,QWORD[((64+8))+rsi] mov r15,QWORD[((64+16))+rsi] mov r8,QWORD[((64+24))+rsi] lea rsi,[((64-0))+rsi] lea rdi,[64+rsp] call __ecp_nistz256_sqr_montq mov rax,QWORD[((0+0))+rsp] mov r14,QWORD[((8+0))+rsp] lea rsi,[((0+0))+rsp] mov r15,QWORD[((16+0))+rsp] mov r8,QWORD[((24+0))+rsp] lea rdi,[rsp] call __ecp_nistz256_sqr_montq mov rax,QWORD[32+rbx] mov r9,QWORD[((64+0))+rbx] mov r10,QWORD[((64+8))+rbx] mov r11,QWORD[((64+16))+rbx] mov r12,QWORD[((64+24))+rbx] lea rsi,[((64-0))+rbx] lea rbx,[32+rbx] DB 102,72,15,126,215 call __ecp_nistz256_mul_montq call __ecp_nistz256_mul_by_2q mov r12,QWORD[((96+0))+rsp] mov r13,QWORD[((96+8))+rsp] lea rbx,[64+rsp] mov r8,QWORD[((96+16))+rsp] mov r9,QWORD[((96+24))+rsp] lea rdi,[32+rsp] call __ecp_nistz256_add_toq mov r12,QWORD[((96+0))+rsp] mov r13,QWORD[((96+8))+rsp] lea rbx,[64+rsp] mov r8,QWORD[((96+16))+rsp] mov r9,QWORD[((96+24))+rsp] lea rdi,[64+rsp] call __ecp_nistz256_sub_fromq mov rax,QWORD[((0+0))+rsp] mov r14,QWORD[((8+0))+rsp] lea rsi,[((0+0))+rsp] mov r15,QWORD[((16+0))+rsp] mov r8,QWORD[((24+0))+rsp] DB 102,72,15,126,207 call __ecp_nistz256_sqr_montq xor r9,r9 mov rax,r12 add r12,-1 mov r10,r13 adc r13,rsi mov rcx,r14 adc r14,0 mov r8,r15 adc r15,rbp adc r9,0 xor rsi,rsi test rax,1 cmovz r12,rax cmovz r13,r10 cmovz r14,rcx cmovz r15,r8 cmovz r9,rsi mov rax,r13 shr r12,1 shl rax,63 mov r10,r14 shr r13,1 or r12,rax shl r10,63 mov rcx,r15 shr r14,1 or r13,r10 shl rcx,63 mov QWORD[rdi],r12 shr r15,1 mov QWORD[8+rdi],r13 shl r9,63 or r14,rcx or r15,r9 mov QWORD[16+rdi],r14 mov QWORD[24+rdi],r15 mov rax,QWORD[64+rsp] lea rbx,[64+rsp] 
mov r9,QWORD[((0+32))+rsp] mov r10,QWORD[((8+32))+rsp] lea rsi,[((0+32))+rsp] mov r11,QWORD[((16+32))+rsp] mov r12,QWORD[((24+32))+rsp] lea rdi,[32+rsp] call __ecp_nistz256_mul_montq lea rdi,[128+rsp] call __ecp_nistz256_mul_by_2q lea rbx,[32+rsp] lea rdi,[32+rsp] call __ecp_nistz256_add_toq mov rax,QWORD[96+rsp] lea rbx,[96+rsp] mov r9,QWORD[((0+0))+rsp] mov r10,QWORD[((8+0))+rsp] lea rsi,[((0+0))+rsp] mov r11,QWORD[((16+0))+rsp] mov r12,QWORD[((24+0))+rsp] lea rdi,[rsp] call __ecp_nistz256_mul_montq lea rdi,[128+rsp] call __ecp_nistz256_mul_by_2q mov rax,QWORD[((0+32))+rsp] mov r14,QWORD[((8+32))+rsp] lea rsi,[((0+32))+rsp] mov r15,QWORD[((16+32))+rsp] mov r8,QWORD[((24+32))+rsp] DB 102,72,15,126,199 call __ecp_nistz256_sqr_montq lea rbx,[128+rsp] mov r8,r14 mov r9,r15 mov r14,rsi mov r15,rbp call __ecp_nistz256_sub_fromq mov rax,QWORD[((0+0))+rsp] mov rbp,QWORD[((0+8))+rsp] mov rcx,QWORD[((0+16))+rsp] mov r10,QWORD[((0+24))+rsp] lea rdi,[rsp] call __ecp_nistz256_subq mov rax,QWORD[32+rsp] lea rbx,[32+rsp] mov r14,r12 xor ecx,ecx mov QWORD[((0+0))+rsp],r12 mov r10,r13 mov QWORD[((0+8))+rsp],r13 cmovz r11,r8 mov QWORD[((0+16))+rsp],r8 lea rsi,[((0-0))+rsp] cmovz r12,r9 mov QWORD[((0+24))+rsp],r9 mov r9,r14 lea rdi,[rsp] call __ecp_nistz256_mul_montq DB 102,72,15,126,203 DB 102,72,15,126,207 call __ecp_nistz256_sub_fromq lea rsi,[((160+56))+rsp] mov r15,QWORD[((-48))+rsi] mov r14,QWORD[((-40))+rsi] mov r13,QWORD[((-32))+rsi] mov r12,QWORD[((-24))+rsi] mov rbx,QWORD[((-16))+rsi] mov rbp,QWORD[((-8))+rsi] lea rsp,[rsi] $L$point_doubleq_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$SEH_end_ecp_nistz256_point_double_nohw: global ecp_nistz256_point_add_nohw ALIGN 32 ecp_nistz256_point_add_nohw: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_ecp_nistz256_point_add_nohw: mov rdi,rcx mov rsi,rdx mov rdx,r8 _CET_ENDBR push rbp push rbx push r12 push r13 push r14 push r15 sub rsp,32*18+8 $L$point_addq_body: movdqu xmm0,XMMWORD[rsi] movdqu xmm1,XMMWORD[16+rsi] movdqu xmm2,XMMWORD[32+rsi] movdqu xmm3,XMMWORD[48+rsi] movdqu xmm4,XMMWORD[64+rsi] movdqu xmm5,XMMWORD[80+rsi] mov rbx,rsi mov rsi,rdx movdqa XMMWORD[384+rsp],xmm0 movdqa XMMWORD[(384+16)+rsp],xmm1 movdqa XMMWORD[416+rsp],xmm2 movdqa XMMWORD[(416+16)+rsp],xmm3 movdqa XMMWORD[448+rsp],xmm4 movdqa XMMWORD[(448+16)+rsp],xmm5 por xmm5,xmm4 movdqu xmm0,XMMWORD[rsi] pshufd xmm3,xmm5,0xb1 movdqu xmm1,XMMWORD[16+rsi] movdqu xmm2,XMMWORD[32+rsi] por xmm5,xmm3 movdqu xmm3,XMMWORD[48+rsi] mov rax,QWORD[((64+0))+rsi] mov r14,QWORD[((64+8))+rsi] mov r15,QWORD[((64+16))+rsi] mov r8,QWORD[((64+24))+rsi] movdqa XMMWORD[480+rsp],xmm0 pshufd xmm4,xmm5,0x1e movdqa XMMWORD[(480+16)+rsp],xmm1 movdqu xmm0,XMMWORD[64+rsi] movdqu xmm1,XMMWORD[80+rsi] movdqa XMMWORD[512+rsp],xmm2 movdqa XMMWORD[(512+16)+rsp],xmm3 por xmm5,xmm4 pxor xmm4,xmm4 por xmm1,xmm0 DB 102,72,15,110,199 lea rsi,[((64-0))+rsi] mov QWORD[((544+0))+rsp],rax mov QWORD[((544+8))+rsp],r14 mov QWORD[((544+16))+rsp],r15 mov QWORD[((544+24))+rsp],r8 lea rdi,[96+rsp] call __ecp_nistz256_sqr_montq pcmpeqd xmm5,xmm4 pshufd xmm4,xmm1,0xb1 por xmm4,xmm1 pshufd xmm5,xmm5,0 pshufd xmm3,xmm4,0x1e por xmm4,xmm3 pxor xmm3,xmm3 pcmpeqd xmm4,xmm3 pshufd xmm4,xmm4,0 mov rax,QWORD[((64+0))+rbx] mov r14,QWORD[((64+8))+rbx] mov r15,QWORD[((64+16))+rbx] mov r8,QWORD[((64+24))+rbx] DB 102,72,15,110,203 lea rsi,[((64-0))+rbx] lea rdi,[32+rsp] call __ecp_nistz256_sqr_montq mov rax,QWORD[544+rsp] lea rbx,[544+rsp] mov r9,QWORD[((0+96))+rsp] mov 
r10,QWORD[((8+96))+rsp] lea rsi,[((0+96))+rsp] mov r11,QWORD[((16+96))+rsp] mov r12,QWORD[((24+96))+rsp] lea rdi,[224+rsp] call __ecp_nistz256_mul_montq mov rax,QWORD[448+rsp] lea rbx,[448+rsp] mov r9,QWORD[((0+32))+rsp] mov r10,QWORD[((8+32))+rsp] lea rsi,[((0+32))+rsp] mov r11,QWORD[((16+32))+rsp] mov r12,QWORD[((24+32))+rsp] lea rdi,[256+rsp] call __ecp_nistz256_mul_montq mov rax,QWORD[416+rsp] lea rbx,[416+rsp] mov r9,QWORD[((0+224))+rsp] mov r10,QWORD[((8+224))+rsp] lea rsi,[((0+224))+rsp] mov r11,QWORD[((16+224))+rsp] mov r12,QWORD[((24+224))+rsp] lea rdi,[224+rsp] call __ecp_nistz256_mul_montq mov rax,QWORD[512+rsp] lea rbx,[512+rsp] mov r9,QWORD[((0+256))+rsp] mov r10,QWORD[((8+256))+rsp] lea rsi,[((0+256))+rsp] mov r11,QWORD[((16+256))+rsp] mov r12,QWORD[((24+256))+rsp] lea rdi,[256+rsp] call __ecp_nistz256_mul_montq lea rbx,[224+rsp] lea rdi,[64+rsp] call __ecp_nistz256_sub_fromq or r12,r13 movdqa xmm2,xmm4 or r12,r8 or r12,r9 por xmm2,xmm5 DB 102,73,15,110,220 mov rax,QWORD[384+rsp] lea rbx,[384+rsp] mov r9,QWORD[((0+96))+rsp] mov r10,QWORD[((8+96))+rsp] lea rsi,[((0+96))+rsp] mov r11,QWORD[((16+96))+rsp] mov r12,QWORD[((24+96))+rsp] lea rdi,[160+rsp] call __ecp_nistz256_mul_montq mov rax,QWORD[480+rsp] lea rbx,[480+rsp] mov r9,QWORD[((0+32))+rsp] mov r10,QWORD[((8+32))+rsp] lea rsi,[((0+32))+rsp] mov r11,QWORD[((16+32))+rsp] mov r12,QWORD[((24+32))+rsp] lea rdi,[192+rsp] call __ecp_nistz256_mul_montq lea rbx,[160+rsp] lea rdi,[rsp] call __ecp_nistz256_sub_fromq or r12,r13 or r12,r8 or r12,r9 DB 102,73,15,126,208 DB 102,73,15,126,217 or r12,r8 DB 0x3e jnz NEAR $L$add_proceedq test r9,r9 jz NEAR $L$add_doubleq DB 102,72,15,126,199 pxor xmm0,xmm0 movdqu XMMWORD[rdi],xmm0 movdqu XMMWORD[16+rdi],xmm0 movdqu XMMWORD[32+rdi],xmm0 movdqu XMMWORD[48+rdi],xmm0 movdqu XMMWORD[64+rdi],xmm0 movdqu XMMWORD[80+rdi],xmm0 jmp NEAR $L$add_doneq ALIGN 32 $L$add_doubleq: DB 102,72,15,126,206 DB 102,72,15,126,199 add rsp,416 jmp NEAR $L$point_double_shortcutq ALIGN 32 $L$add_proceedq: mov rax,QWORD[((0+64))+rsp] mov r14,QWORD[((8+64))+rsp] lea rsi,[((0+64))+rsp] mov r15,QWORD[((16+64))+rsp] mov r8,QWORD[((24+64))+rsp] lea rdi,[96+rsp] call __ecp_nistz256_sqr_montq mov rax,QWORD[448+rsp] lea rbx,[448+rsp] mov r9,QWORD[((0+0))+rsp] mov r10,QWORD[((8+0))+rsp] lea rsi,[((0+0))+rsp] mov r11,QWORD[((16+0))+rsp] mov r12,QWORD[((24+0))+rsp] lea rdi,[352+rsp] call __ecp_nistz256_mul_montq mov rax,QWORD[((0+0))+rsp] mov r14,QWORD[((8+0))+rsp] lea rsi,[((0+0))+rsp] mov r15,QWORD[((16+0))+rsp] mov r8,QWORD[((24+0))+rsp] lea rdi,[32+rsp] call __ecp_nistz256_sqr_montq mov rax,QWORD[544+rsp] lea rbx,[544+rsp] mov r9,QWORD[((0+352))+rsp] mov r10,QWORD[((8+352))+rsp] lea rsi,[((0+352))+rsp] mov r11,QWORD[((16+352))+rsp] mov r12,QWORD[((24+352))+rsp] lea rdi,[352+rsp] call __ecp_nistz256_mul_montq mov rax,QWORD[rsp] lea rbx,[rsp] mov r9,QWORD[((0+32))+rsp] mov r10,QWORD[((8+32))+rsp] lea rsi,[((0+32))+rsp] mov r11,QWORD[((16+32))+rsp] mov r12,QWORD[((24+32))+rsp] lea rdi,[128+rsp] call __ecp_nistz256_mul_montq mov rax,QWORD[160+rsp] lea rbx,[160+rsp] mov r9,QWORD[((0+32))+rsp] mov r10,QWORD[((8+32))+rsp] lea rsi,[((0+32))+rsp] mov r11,QWORD[((16+32))+rsp] mov r12,QWORD[((24+32))+rsp] lea rdi,[192+rsp] call __ecp_nistz256_mul_montq xor r11,r11 add r12,r12 lea rsi,[96+rsp] adc r13,r13 mov rax,r12 adc r8,r8 adc r9,r9 mov rbp,r13 adc r11,0 sub r12,-1 mov rcx,r8 sbb r13,r14 sbb r8,0 mov r10,r9 sbb r9,r15 sbb r11,0 cmovc r12,rax mov rax,QWORD[rsi] cmovc r13,rbp mov rbp,QWORD[8+rsi] cmovc r8,rcx mov rcx,QWORD[16+rsi] cmovc 
r9,r10 mov r10,QWORD[24+rsi] call __ecp_nistz256_subq lea rbx,[128+rsp] lea rdi,[288+rsp] call __ecp_nistz256_sub_fromq mov rax,QWORD[((192+0))+rsp] mov rbp,QWORD[((192+8))+rsp] mov rcx,QWORD[((192+16))+rsp] mov r10,QWORD[((192+24))+rsp] lea rdi,[320+rsp] call __ecp_nistz256_subq mov QWORD[rdi],r12 mov QWORD[8+rdi],r13 mov QWORD[16+rdi],r8 mov QWORD[24+rdi],r9 mov rax,QWORD[128+rsp] lea rbx,[128+rsp] mov r9,QWORD[((0+224))+rsp] mov r10,QWORD[((8+224))+rsp] lea rsi,[((0+224))+rsp] mov r11,QWORD[((16+224))+rsp] mov r12,QWORD[((24+224))+rsp] lea rdi,[256+rsp] call __ecp_nistz256_mul_montq mov rax,QWORD[320+rsp] lea rbx,[320+rsp] mov r9,QWORD[((0+64))+rsp] mov r10,QWORD[((8+64))+rsp] lea rsi,[((0+64))+rsp] mov r11,QWORD[((16+64))+rsp] mov r12,QWORD[((24+64))+rsp] lea rdi,[320+rsp] call __ecp_nistz256_mul_montq lea rbx,[256+rsp] lea rdi,[320+rsp] call __ecp_nistz256_sub_fromq DB 102,72,15,126,199 movdqa xmm0,xmm5 movdqa xmm1,xmm5 pandn xmm0,XMMWORD[352+rsp] movdqa xmm2,xmm5 pandn xmm1,XMMWORD[((352+16))+rsp] movdqa xmm3,xmm5 pand xmm2,XMMWORD[544+rsp] pand xmm3,XMMWORD[((544+16))+rsp] por xmm2,xmm0 por xmm3,xmm1 movdqa xmm0,xmm4 movdqa xmm1,xmm4 pandn xmm0,xmm2 movdqa xmm2,xmm4 pandn xmm1,xmm3 movdqa xmm3,xmm4 pand xmm2,XMMWORD[448+rsp] pand xmm3,XMMWORD[((448+16))+rsp] por xmm2,xmm0 por xmm3,xmm1 movdqu XMMWORD[64+rdi],xmm2 movdqu XMMWORD[80+rdi],xmm3 movdqa xmm0,xmm5 movdqa xmm1,xmm5 pandn xmm0,XMMWORD[288+rsp] movdqa xmm2,xmm5 pandn xmm1,XMMWORD[((288+16))+rsp] movdqa xmm3,xmm5 pand xmm2,XMMWORD[480+rsp] pand xmm3,XMMWORD[((480+16))+rsp] por xmm2,xmm0 por xmm3,xmm1 movdqa xmm0,xmm4 movdqa xmm1,xmm4 pandn xmm0,xmm2 movdqa xmm2,xmm4 pandn xmm1,xmm3 movdqa xmm3,xmm4 pand xmm2,XMMWORD[384+rsp] pand xmm3,XMMWORD[((384+16))+rsp] por xmm2,xmm0 por xmm3,xmm1 movdqu XMMWORD[rdi],xmm2 movdqu XMMWORD[16+rdi],xmm3 movdqa xmm0,xmm5 movdqa xmm1,xmm5 pandn xmm0,XMMWORD[320+rsp] movdqa xmm2,xmm5 pandn xmm1,XMMWORD[((320+16))+rsp] movdqa xmm3,xmm5 pand xmm2,XMMWORD[512+rsp] pand xmm3,XMMWORD[((512+16))+rsp] por xmm2,xmm0 por xmm3,xmm1 movdqa xmm0,xmm4 movdqa xmm1,xmm4 pandn xmm0,xmm2 movdqa xmm2,xmm4 pandn xmm1,xmm3 movdqa xmm3,xmm4 pand xmm2,XMMWORD[416+rsp] pand xmm3,XMMWORD[((416+16))+rsp] por xmm2,xmm0 por xmm3,xmm1 movdqu XMMWORD[32+rdi],xmm2 movdqu XMMWORD[48+rdi],xmm3 $L$add_doneq: lea rsi,[((576+56))+rsp] mov r15,QWORD[((-48))+rsi] mov r14,QWORD[((-40))+rsi] mov r13,QWORD[((-32))+rsi] mov r12,QWORD[((-24))+rsi] mov rbx,QWORD[((-16))+rsi] mov rbp,QWORD[((-8))+rsi] lea rsp,[rsi] $L$point_addq_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$SEH_end_ecp_nistz256_point_add_nohw: global ecp_nistz256_point_add_affine_nohw ALIGN 32 ecp_nistz256_point_add_affine_nohw: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_ecp_nistz256_point_add_affine_nohw: mov rdi,rcx mov rsi,rdx mov rdx,r8 _CET_ENDBR push rbp push rbx push r12 push r13 push r14 push r15 sub rsp,32*15+8 $L$add_affineq_body: movdqu xmm0,XMMWORD[rsi] mov rbx,rdx movdqu xmm1,XMMWORD[16+rsi] movdqu xmm2,XMMWORD[32+rsi] movdqu xmm3,XMMWORD[48+rsi] movdqu xmm4,XMMWORD[64+rsi] movdqu xmm5,XMMWORD[80+rsi] mov rax,QWORD[((64+0))+rsi] mov r14,QWORD[((64+8))+rsi] mov r15,QWORD[((64+16))+rsi] mov r8,QWORD[((64+24))+rsi] movdqa XMMWORD[320+rsp],xmm0 movdqa XMMWORD[(320+16)+rsp],xmm1 movdqa XMMWORD[352+rsp],xmm2 movdqa XMMWORD[(352+16)+rsp],xmm3 movdqa XMMWORD[384+rsp],xmm4 movdqa XMMWORD[(384+16)+rsp],xmm5 por xmm5,xmm4 movdqu xmm0,XMMWORD[rbx] pshufd xmm3,xmm5,0xb1 movdqu xmm1,XMMWORD[16+rbx] 
movdqu xmm2,XMMWORD[32+rbx] por xmm5,xmm3 movdqu xmm3,XMMWORD[48+rbx] movdqa XMMWORD[416+rsp],xmm0 pshufd xmm4,xmm5,0x1e movdqa XMMWORD[(416+16)+rsp],xmm1 por xmm1,xmm0 DB 102,72,15,110,199 movdqa XMMWORD[448+rsp],xmm2 movdqa XMMWORD[(448+16)+rsp],xmm3 por xmm3,xmm2 por xmm5,xmm4 pxor xmm4,xmm4 por xmm3,xmm1 lea rsi,[((64-0))+rsi] lea rdi,[32+rsp] call __ecp_nistz256_sqr_montq pcmpeqd xmm5,xmm4 pshufd xmm4,xmm3,0xb1 mov rax,QWORD[rbx] mov r9,r12 por xmm4,xmm3 pshufd xmm5,xmm5,0 pshufd xmm3,xmm4,0x1e mov r10,r13 por xmm4,xmm3 pxor xmm3,xmm3 mov r11,r14 pcmpeqd xmm4,xmm3 pshufd xmm4,xmm4,0 lea rsi,[((32-0))+rsp] mov r12,r15 lea rdi,[rsp] call __ecp_nistz256_mul_montq lea rbx,[320+rsp] lea rdi,[64+rsp] call __ecp_nistz256_sub_fromq mov rax,QWORD[384+rsp] lea rbx,[384+rsp] mov r9,QWORD[((0+32))+rsp] mov r10,QWORD[((8+32))+rsp] lea rsi,[((0+32))+rsp] mov r11,QWORD[((16+32))+rsp] mov r12,QWORD[((24+32))+rsp] lea rdi,[32+rsp] call __ecp_nistz256_mul_montq mov rax,QWORD[384+rsp] lea rbx,[384+rsp] mov r9,QWORD[((0+64))+rsp] mov r10,QWORD[((8+64))+rsp] lea rsi,[((0+64))+rsp] mov r11,QWORD[((16+64))+rsp] mov r12,QWORD[((24+64))+rsp] lea rdi,[288+rsp] call __ecp_nistz256_mul_montq mov rax,QWORD[448+rsp] lea rbx,[448+rsp] mov r9,QWORD[((0+32))+rsp] mov r10,QWORD[((8+32))+rsp] lea rsi,[((0+32))+rsp] mov r11,QWORD[((16+32))+rsp] mov r12,QWORD[((24+32))+rsp] lea rdi,[32+rsp] call __ecp_nistz256_mul_montq lea rbx,[352+rsp] lea rdi,[96+rsp] call __ecp_nistz256_sub_fromq mov rax,QWORD[((0+64))+rsp] mov r14,QWORD[((8+64))+rsp] lea rsi,[((0+64))+rsp] mov r15,QWORD[((16+64))+rsp] mov r8,QWORD[((24+64))+rsp] lea rdi,[128+rsp] call __ecp_nistz256_sqr_montq mov rax,QWORD[((0+96))+rsp] mov r14,QWORD[((8+96))+rsp] lea rsi,[((0+96))+rsp] mov r15,QWORD[((16+96))+rsp] mov r8,QWORD[((24+96))+rsp] lea rdi,[192+rsp] call __ecp_nistz256_sqr_montq mov rax,QWORD[128+rsp] lea rbx,[128+rsp] mov r9,QWORD[((0+64))+rsp] mov r10,QWORD[((8+64))+rsp] lea rsi,[((0+64))+rsp] mov r11,QWORD[((16+64))+rsp] mov r12,QWORD[((24+64))+rsp] lea rdi,[160+rsp] call __ecp_nistz256_mul_montq mov rax,QWORD[320+rsp] lea rbx,[320+rsp] mov r9,QWORD[((0+128))+rsp] mov r10,QWORD[((8+128))+rsp] lea rsi,[((0+128))+rsp] mov r11,QWORD[((16+128))+rsp] mov r12,QWORD[((24+128))+rsp] lea rdi,[rsp] call __ecp_nistz256_mul_montq xor r11,r11 add r12,r12 lea rsi,[192+rsp] adc r13,r13 mov rax,r12 adc r8,r8 adc r9,r9 mov rbp,r13 adc r11,0 sub r12,-1 mov rcx,r8 sbb r13,r14 sbb r8,0 mov r10,r9 sbb r9,r15 sbb r11,0 cmovc r12,rax mov rax,QWORD[rsi] cmovc r13,rbp mov rbp,QWORD[8+rsi] cmovc r8,rcx mov rcx,QWORD[16+rsi] cmovc r9,r10 mov r10,QWORD[24+rsi] call __ecp_nistz256_subq lea rbx,[160+rsp] lea rdi,[224+rsp] call __ecp_nistz256_sub_fromq mov rax,QWORD[((0+0))+rsp] mov rbp,QWORD[((0+8))+rsp] mov rcx,QWORD[((0+16))+rsp] mov r10,QWORD[((0+24))+rsp] lea rdi,[64+rsp] call __ecp_nistz256_subq mov QWORD[rdi],r12 mov QWORD[8+rdi],r13 mov QWORD[16+rdi],r8 mov QWORD[24+rdi],r9 mov rax,QWORD[352+rsp] lea rbx,[352+rsp] mov r9,QWORD[((0+160))+rsp] mov r10,QWORD[((8+160))+rsp] lea rsi,[((0+160))+rsp] mov r11,QWORD[((16+160))+rsp] mov r12,QWORD[((24+160))+rsp] lea rdi,[32+rsp] call __ecp_nistz256_mul_montq mov rax,QWORD[96+rsp] lea rbx,[96+rsp] mov r9,QWORD[((0+64))+rsp] mov r10,QWORD[((8+64))+rsp] lea rsi,[((0+64))+rsp] mov r11,QWORD[((16+64))+rsp] mov r12,QWORD[((24+64))+rsp] lea rdi,[64+rsp] call __ecp_nistz256_mul_montq lea rbx,[32+rsp] lea rdi,[256+rsp] call __ecp_nistz256_sub_fromq DB 102,72,15,126,199 movdqa xmm0,xmm5 movdqa xmm1,xmm5 pandn xmm0,XMMWORD[288+rsp] movdqa 
xmm2,xmm5 pandn xmm1,XMMWORD[((288+16))+rsp] movdqa xmm3,xmm5 pand xmm2,XMMWORD[$L$ONE_mont] pand xmm3,XMMWORD[(($L$ONE_mont+16))] por xmm2,xmm0 por xmm3,xmm1 movdqa xmm0,xmm4 movdqa xmm1,xmm4 pandn xmm0,xmm2 movdqa xmm2,xmm4 pandn xmm1,xmm3 movdqa xmm3,xmm4 pand xmm2,XMMWORD[384+rsp] pand xmm3,XMMWORD[((384+16))+rsp] por xmm2,xmm0 por xmm3,xmm1 movdqu XMMWORD[64+rdi],xmm2 movdqu XMMWORD[80+rdi],xmm3 movdqa xmm0,xmm5 movdqa xmm1,xmm5 pandn xmm0,XMMWORD[224+rsp] movdqa xmm2,xmm5 pandn xmm1,XMMWORD[((224+16))+rsp] movdqa xmm3,xmm5 pand xmm2,XMMWORD[416+rsp] pand xmm3,XMMWORD[((416+16))+rsp] por xmm2,xmm0 por xmm3,xmm1 movdqa xmm0,xmm4 movdqa xmm1,xmm4 pandn xmm0,xmm2 movdqa xmm2,xmm4 pandn xmm1,xmm3 movdqa xmm3,xmm4 pand xmm2,XMMWORD[320+rsp] pand xmm3,XMMWORD[((320+16))+rsp] por xmm2,xmm0 por xmm3,xmm1 movdqu XMMWORD[rdi],xmm2 movdqu XMMWORD[16+rdi],xmm3 movdqa xmm0,xmm5 movdqa xmm1,xmm5 pandn xmm0,XMMWORD[256+rsp] movdqa xmm2,xmm5 pandn xmm1,XMMWORD[((256+16))+rsp] movdqa xmm3,xmm5 pand xmm2,XMMWORD[448+rsp] pand xmm3,XMMWORD[((448+16))+rsp] por xmm2,xmm0 por xmm3,xmm1 movdqa xmm0,xmm4 movdqa xmm1,xmm4 pandn xmm0,xmm2 movdqa xmm2,xmm4 pandn xmm1,xmm3 movdqa xmm3,xmm4 pand xmm2,XMMWORD[352+rsp] pand xmm3,XMMWORD[((352+16))+rsp] por xmm2,xmm0 por xmm3,xmm1 movdqu XMMWORD[32+rdi],xmm2 movdqu XMMWORD[48+rdi],xmm3 lea rsi,[((480+56))+rsp] mov r15,QWORD[((-48))+rsi] mov r14,QWORD[((-40))+rsi] mov r13,QWORD[((-32))+rsi] mov r12,QWORD[((-24))+rsi] mov rbx,QWORD[((-16))+rsi] mov rbp,QWORD[((-8))+rsi] lea rsp,[rsi] $L$add_affineq_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$SEH_end_ecp_nistz256_point_add_affine_nohw: ALIGN 32 __ecp_nistz256_add_tox: xor r11,r11 adc r12,QWORD[rbx] adc r13,QWORD[8+rbx] mov rax,r12 adc r8,QWORD[16+rbx] adc r9,QWORD[24+rbx] mov rbp,r13 adc r11,0 xor r10,r10 sbb r12,-1 mov rcx,r8 sbb r13,r14 sbb r8,0 mov r10,r9 sbb r9,r15 sbb r11,0 cmovc r12,rax cmovc r13,rbp mov QWORD[rdi],r12 cmovc r8,rcx mov QWORD[8+rdi],r13 cmovc r9,r10 mov QWORD[16+rdi],r8 mov QWORD[24+rdi],r9 ret ALIGN 32 __ecp_nistz256_sub_fromx: xor r11,r11 sbb r12,QWORD[rbx] sbb r13,QWORD[8+rbx] mov rax,r12 sbb r8,QWORD[16+rbx] sbb r9,QWORD[24+rbx] mov rbp,r13 sbb r11,0 xor r10,r10 adc r12,-1 mov rcx,r8 adc r13,r14 adc r8,0 mov r10,r9 adc r9,r15 bt r11,0 cmovnc r12,rax cmovnc r13,rbp mov QWORD[rdi],r12 cmovnc r8,rcx mov QWORD[8+rdi],r13 cmovnc r9,r10 mov QWORD[16+rdi],r8 mov QWORD[24+rdi],r9 ret ALIGN 32 __ecp_nistz256_subx: xor r11,r11 sbb rax,r12 sbb rbp,r13 mov r12,rax sbb rcx,r8 sbb r10,r9 mov r13,rbp sbb r11,0 xor r9,r9 adc rax,-1 mov r8,rcx adc rbp,r14 adc rcx,0 mov r9,r10 adc r10,r15 bt r11,0 cmovc r12,rax cmovc r13,rbp cmovc r8,rcx cmovc r9,r10 ret ALIGN 32 __ecp_nistz256_mul_by_2x: xor r11,r11 adc r12,r12 adc r13,r13 mov rax,r12 adc r8,r8 adc r9,r9 mov rbp,r13 adc r11,0 xor r10,r10 sbb r12,-1 mov rcx,r8 sbb r13,r14 sbb r8,0 mov r10,r9 sbb r9,r15 sbb r11,0 cmovc r12,rax cmovc r13,rbp mov QWORD[rdi],r12 cmovc r8,rcx mov QWORD[8+rdi],r13 cmovc r9,r10 mov QWORD[16+rdi],r8 mov QWORD[24+rdi],r9 ret global ecp_nistz256_point_double_adx ALIGN 32 ecp_nistz256_point_double_adx: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_ecp_nistz256_point_double_adx: mov rdi,rcx mov rsi,rdx _CET_ENDBR push rbp push rbx push r12 push r13 push r14 push r15 sub rsp,32*5+8 $L$point_doublex_body: $L$point_double_shortcutx: movdqu xmm0,XMMWORD[rsi] mov rbx,rsi movdqu xmm1,XMMWORD[16+rsi] mov r12,QWORD[((32+0))+rsi] mov r13,QWORD[((32+8))+rsi] mov 
r8,QWORD[((32+16))+rsi] mov r9,QWORD[((32+24))+rsi] mov r14,QWORD[(($L$poly+8))] mov r15,QWORD[(($L$poly+24))] movdqa XMMWORD[96+rsp],xmm0 movdqa XMMWORD[(96+16)+rsp],xmm1 lea r10,[32+rdi] lea r11,[64+rdi] DB 102,72,15,110,199 DB 102,73,15,110,202 DB 102,73,15,110,211 lea rdi,[rsp] call __ecp_nistz256_mul_by_2x mov rdx,QWORD[((64+0))+rsi] mov r14,QWORD[((64+8))+rsi] mov r15,QWORD[((64+16))+rsi] mov r8,QWORD[((64+24))+rsi] lea rsi,[((64-128))+rsi] lea rdi,[64+rsp] call __ecp_nistz256_sqr_montx mov rdx,QWORD[((0+0))+rsp] mov r14,QWORD[((8+0))+rsp] lea rsi,[((-128+0))+rsp] mov r15,QWORD[((16+0))+rsp] mov r8,QWORD[((24+0))+rsp] lea rdi,[rsp] call __ecp_nistz256_sqr_montx mov rdx,QWORD[32+rbx] mov r9,QWORD[((64+0))+rbx] mov r10,QWORD[((64+8))+rbx] mov r11,QWORD[((64+16))+rbx] mov r12,QWORD[((64+24))+rbx] lea rsi,[((64-128))+rbx] lea rbx,[32+rbx] DB 102,72,15,126,215 call __ecp_nistz256_mul_montx call __ecp_nistz256_mul_by_2x mov r12,QWORD[((96+0))+rsp] mov r13,QWORD[((96+8))+rsp] lea rbx,[64+rsp] mov r8,QWORD[((96+16))+rsp] mov r9,QWORD[((96+24))+rsp] lea rdi,[32+rsp] call __ecp_nistz256_add_tox mov r12,QWORD[((96+0))+rsp] mov r13,QWORD[((96+8))+rsp] lea rbx,[64+rsp] mov r8,QWORD[((96+16))+rsp] mov r9,QWORD[((96+24))+rsp] lea rdi,[64+rsp] call __ecp_nistz256_sub_fromx mov rdx,QWORD[((0+0))+rsp] mov r14,QWORD[((8+0))+rsp] lea rsi,[((-128+0))+rsp] mov r15,QWORD[((16+0))+rsp] mov r8,QWORD[((24+0))+rsp] DB 102,72,15,126,207 call __ecp_nistz256_sqr_montx xor r9,r9 mov rax,r12 add r12,-1 mov r10,r13 adc r13,rsi mov rcx,r14 adc r14,0 mov r8,r15 adc r15,rbp adc r9,0 xor rsi,rsi test rax,1 cmovz r12,rax cmovz r13,r10 cmovz r14,rcx cmovz r15,r8 cmovz r9,rsi mov rax,r13 shr r12,1 shl rax,63 mov r10,r14 shr r13,1 or r12,rax shl r10,63 mov rcx,r15 shr r14,1 or r13,r10 shl rcx,63 mov QWORD[rdi],r12 shr r15,1 mov QWORD[8+rdi],r13 shl r9,63 or r14,rcx or r15,r9 mov QWORD[16+rdi],r14 mov QWORD[24+rdi],r15 mov rdx,QWORD[64+rsp] lea rbx,[64+rsp] mov r9,QWORD[((0+32))+rsp] mov r10,QWORD[((8+32))+rsp] lea rsi,[((-128+32))+rsp] mov r11,QWORD[((16+32))+rsp] mov r12,QWORD[((24+32))+rsp] lea rdi,[32+rsp] call __ecp_nistz256_mul_montx lea rdi,[128+rsp] call __ecp_nistz256_mul_by_2x lea rbx,[32+rsp] lea rdi,[32+rsp] call __ecp_nistz256_add_tox mov rdx,QWORD[96+rsp] lea rbx,[96+rsp] mov r9,QWORD[((0+0))+rsp] mov r10,QWORD[((8+0))+rsp] lea rsi,[((-128+0))+rsp] mov r11,QWORD[((16+0))+rsp] mov r12,QWORD[((24+0))+rsp] lea rdi,[rsp] call __ecp_nistz256_mul_montx lea rdi,[128+rsp] call __ecp_nistz256_mul_by_2x mov rdx,QWORD[((0+32))+rsp] mov r14,QWORD[((8+32))+rsp] lea rsi,[((-128+32))+rsp] mov r15,QWORD[((16+32))+rsp] mov r8,QWORD[((24+32))+rsp] DB 102,72,15,126,199 call __ecp_nistz256_sqr_montx lea rbx,[128+rsp] mov r8,r14 mov r9,r15 mov r14,rsi mov r15,rbp call __ecp_nistz256_sub_fromx mov rax,QWORD[((0+0))+rsp] mov rbp,QWORD[((0+8))+rsp] mov rcx,QWORD[((0+16))+rsp] mov r10,QWORD[((0+24))+rsp] lea rdi,[rsp] call __ecp_nistz256_subx mov rdx,QWORD[32+rsp] lea rbx,[32+rsp] mov r14,r12 xor ecx,ecx mov QWORD[((0+0))+rsp],r12 mov r10,r13 mov QWORD[((0+8))+rsp],r13 cmovz r11,r8 mov QWORD[((0+16))+rsp],r8 lea rsi,[((0-128))+rsp] cmovz r12,r9 mov QWORD[((0+24))+rsp],r9 mov r9,r14 lea rdi,[rsp] call __ecp_nistz256_mul_montx DB 102,72,15,126,203 DB 102,72,15,126,207 call __ecp_nistz256_sub_fromx lea rsi,[((160+56))+rsp] mov r15,QWORD[((-48))+rsi] mov r14,QWORD[((-40))+rsi] mov r13,QWORD[((-32))+rsi] mov r12,QWORD[((-24))+rsi] mov rbx,QWORD[((-16))+rsi] mov rbp,QWORD[((-8))+rsi] lea rsp,[rsi] $L$point_doublex_epilogue: mov 
rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$SEH_end_ecp_nistz256_point_double_adx: global ecp_nistz256_point_add_adx ALIGN 32 ecp_nistz256_point_add_adx: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_ecp_nistz256_point_add_adx: mov rdi,rcx mov rsi,rdx mov rdx,r8 _CET_ENDBR push rbp push rbx push r12 push r13 push r14 push r15 sub rsp,32*18+8 $L$point_addx_body: movdqu xmm0,XMMWORD[rsi] movdqu xmm1,XMMWORD[16+rsi] movdqu xmm2,XMMWORD[32+rsi] movdqu xmm3,XMMWORD[48+rsi] movdqu xmm4,XMMWORD[64+rsi] movdqu xmm5,XMMWORD[80+rsi] mov rbx,rsi mov rsi,rdx movdqa XMMWORD[384+rsp],xmm0 movdqa XMMWORD[(384+16)+rsp],xmm1 movdqa XMMWORD[416+rsp],xmm2 movdqa XMMWORD[(416+16)+rsp],xmm3 movdqa XMMWORD[448+rsp],xmm4 movdqa XMMWORD[(448+16)+rsp],xmm5 por xmm5,xmm4 movdqu xmm0,XMMWORD[rsi] pshufd xmm3,xmm5,0xb1 movdqu xmm1,XMMWORD[16+rsi] movdqu xmm2,XMMWORD[32+rsi] por xmm5,xmm3 movdqu xmm3,XMMWORD[48+rsi] mov rdx,QWORD[((64+0))+rsi] mov r14,QWORD[((64+8))+rsi] mov r15,QWORD[((64+16))+rsi] mov r8,QWORD[((64+24))+rsi] movdqa XMMWORD[480+rsp],xmm0 pshufd xmm4,xmm5,0x1e movdqa XMMWORD[(480+16)+rsp],xmm1 movdqu xmm0,XMMWORD[64+rsi] movdqu xmm1,XMMWORD[80+rsi] movdqa XMMWORD[512+rsp],xmm2 movdqa XMMWORD[(512+16)+rsp],xmm3 por xmm5,xmm4 pxor xmm4,xmm4 por xmm1,xmm0 DB 102,72,15,110,199 lea rsi,[((64-128))+rsi] mov QWORD[((544+0))+rsp],rdx mov QWORD[((544+8))+rsp],r14 mov QWORD[((544+16))+rsp],r15 mov QWORD[((544+24))+rsp],r8 lea rdi,[96+rsp] call __ecp_nistz256_sqr_montx pcmpeqd xmm5,xmm4 pshufd xmm4,xmm1,0xb1 por xmm4,xmm1 pshufd xmm5,xmm5,0 pshufd xmm3,xmm4,0x1e por xmm4,xmm3 pxor xmm3,xmm3 pcmpeqd xmm4,xmm3 pshufd xmm4,xmm4,0 mov rdx,QWORD[((64+0))+rbx] mov r14,QWORD[((64+8))+rbx] mov r15,QWORD[((64+16))+rbx] mov r8,QWORD[((64+24))+rbx] DB 102,72,15,110,203 lea rsi,[((64-128))+rbx] lea rdi,[32+rsp] call __ecp_nistz256_sqr_montx mov rdx,QWORD[544+rsp] lea rbx,[544+rsp] mov r9,QWORD[((0+96))+rsp] mov r10,QWORD[((8+96))+rsp] lea rsi,[((-128+96))+rsp] mov r11,QWORD[((16+96))+rsp] mov r12,QWORD[((24+96))+rsp] lea rdi,[224+rsp] call __ecp_nistz256_mul_montx mov rdx,QWORD[448+rsp] lea rbx,[448+rsp] mov r9,QWORD[((0+32))+rsp] mov r10,QWORD[((8+32))+rsp] lea rsi,[((-128+32))+rsp] mov r11,QWORD[((16+32))+rsp] mov r12,QWORD[((24+32))+rsp] lea rdi,[256+rsp] call __ecp_nistz256_mul_montx mov rdx,QWORD[416+rsp] lea rbx,[416+rsp] mov r9,QWORD[((0+224))+rsp] mov r10,QWORD[((8+224))+rsp] lea rsi,[((-128+224))+rsp] mov r11,QWORD[((16+224))+rsp] mov r12,QWORD[((24+224))+rsp] lea rdi,[224+rsp] call __ecp_nistz256_mul_montx mov rdx,QWORD[512+rsp] lea rbx,[512+rsp] mov r9,QWORD[((0+256))+rsp] mov r10,QWORD[((8+256))+rsp] lea rsi,[((-128+256))+rsp] mov r11,QWORD[((16+256))+rsp] mov r12,QWORD[((24+256))+rsp] lea rdi,[256+rsp] call __ecp_nistz256_mul_montx lea rbx,[224+rsp] lea rdi,[64+rsp] call __ecp_nistz256_sub_fromx or r12,r13 movdqa xmm2,xmm4 or r12,r8 or r12,r9 por xmm2,xmm5 DB 102,73,15,110,220 mov rdx,QWORD[384+rsp] lea rbx,[384+rsp] mov r9,QWORD[((0+96))+rsp] mov r10,QWORD[((8+96))+rsp] lea rsi,[((-128+96))+rsp] mov r11,QWORD[((16+96))+rsp] mov r12,QWORD[((24+96))+rsp] lea rdi,[160+rsp] call __ecp_nistz256_mul_montx mov rdx,QWORD[480+rsp] lea rbx,[480+rsp] mov r9,QWORD[((0+32))+rsp] mov r10,QWORD[((8+32))+rsp] lea rsi,[((-128+32))+rsp] mov r11,QWORD[((16+32))+rsp] mov r12,QWORD[((24+32))+rsp] lea rdi,[192+rsp] call __ecp_nistz256_mul_montx lea rbx,[160+rsp] lea rdi,[rsp] call __ecp_nistz256_sub_fromx or r12,r13 or r12,r8 or r12,r9 DB 102,73,15,126,208 DB 102,73,15,126,217 
or r12,r8 DB 0x3e jnz NEAR $L$add_proceedx test r9,r9 jz NEAR $L$add_doublex DB 102,72,15,126,199 pxor xmm0,xmm0 movdqu XMMWORD[rdi],xmm0 movdqu XMMWORD[16+rdi],xmm0 movdqu XMMWORD[32+rdi],xmm0 movdqu XMMWORD[48+rdi],xmm0 movdqu XMMWORD[64+rdi],xmm0 movdqu XMMWORD[80+rdi],xmm0 jmp NEAR $L$add_donex ALIGN 32 $L$add_doublex: DB 102,72,15,126,206 DB 102,72,15,126,199 add rsp,416 jmp NEAR $L$point_double_shortcutx ALIGN 32 $L$add_proceedx: mov rdx,QWORD[((0+64))+rsp] mov r14,QWORD[((8+64))+rsp] lea rsi,[((-128+64))+rsp] mov r15,QWORD[((16+64))+rsp] mov r8,QWORD[((24+64))+rsp] lea rdi,[96+rsp] call __ecp_nistz256_sqr_montx mov rdx,QWORD[448+rsp] lea rbx,[448+rsp] mov r9,QWORD[((0+0))+rsp] mov r10,QWORD[((8+0))+rsp] lea rsi,[((-128+0))+rsp] mov r11,QWORD[((16+0))+rsp] mov r12,QWORD[((24+0))+rsp] lea rdi,[352+rsp] call __ecp_nistz256_mul_montx mov rdx,QWORD[((0+0))+rsp] mov r14,QWORD[((8+0))+rsp] lea rsi,[((-128+0))+rsp] mov r15,QWORD[((16+0))+rsp] mov r8,QWORD[((24+0))+rsp] lea rdi,[32+rsp] call __ecp_nistz256_sqr_montx mov rdx,QWORD[544+rsp] lea rbx,[544+rsp] mov r9,QWORD[((0+352))+rsp] mov r10,QWORD[((8+352))+rsp] lea rsi,[((-128+352))+rsp] mov r11,QWORD[((16+352))+rsp] mov r12,QWORD[((24+352))+rsp] lea rdi,[352+rsp] call __ecp_nistz256_mul_montx mov rdx,QWORD[rsp] lea rbx,[rsp] mov r9,QWORD[((0+32))+rsp] mov r10,QWORD[((8+32))+rsp] lea rsi,[((-128+32))+rsp] mov r11,QWORD[((16+32))+rsp] mov r12,QWORD[((24+32))+rsp] lea rdi,[128+rsp] call __ecp_nistz256_mul_montx mov rdx,QWORD[160+rsp] lea rbx,[160+rsp] mov r9,QWORD[((0+32))+rsp] mov r10,QWORD[((8+32))+rsp] lea rsi,[((-128+32))+rsp] mov r11,QWORD[((16+32))+rsp] mov r12,QWORD[((24+32))+rsp] lea rdi,[192+rsp] call __ecp_nistz256_mul_montx xor r11,r11 add r12,r12 lea rsi,[96+rsp] adc r13,r13 mov rax,r12 adc r8,r8 adc r9,r9 mov rbp,r13 adc r11,0 sub r12,-1 mov rcx,r8 sbb r13,r14 sbb r8,0 mov r10,r9 sbb r9,r15 sbb r11,0 cmovc r12,rax mov rax,QWORD[rsi] cmovc r13,rbp mov rbp,QWORD[8+rsi] cmovc r8,rcx mov rcx,QWORD[16+rsi] cmovc r9,r10 mov r10,QWORD[24+rsi] call __ecp_nistz256_subx lea rbx,[128+rsp] lea rdi,[288+rsp] call __ecp_nistz256_sub_fromx mov rax,QWORD[((192+0))+rsp] mov rbp,QWORD[((192+8))+rsp] mov rcx,QWORD[((192+16))+rsp] mov r10,QWORD[((192+24))+rsp] lea rdi,[320+rsp] call __ecp_nistz256_subx mov QWORD[rdi],r12 mov QWORD[8+rdi],r13 mov QWORD[16+rdi],r8 mov QWORD[24+rdi],r9 mov rdx,QWORD[128+rsp] lea rbx,[128+rsp] mov r9,QWORD[((0+224))+rsp] mov r10,QWORD[((8+224))+rsp] lea rsi,[((-128+224))+rsp] mov r11,QWORD[((16+224))+rsp] mov r12,QWORD[((24+224))+rsp] lea rdi,[256+rsp] call __ecp_nistz256_mul_montx mov rdx,QWORD[320+rsp] lea rbx,[320+rsp] mov r9,QWORD[((0+64))+rsp] mov r10,QWORD[((8+64))+rsp] lea rsi,[((-128+64))+rsp] mov r11,QWORD[((16+64))+rsp] mov r12,QWORD[((24+64))+rsp] lea rdi,[320+rsp] call __ecp_nistz256_mul_montx lea rbx,[256+rsp] lea rdi,[320+rsp] call __ecp_nistz256_sub_fromx DB 102,72,15,126,199 movdqa xmm0,xmm5 movdqa xmm1,xmm5 pandn xmm0,XMMWORD[352+rsp] movdqa xmm2,xmm5 pandn xmm1,XMMWORD[((352+16))+rsp] movdqa xmm3,xmm5 pand xmm2,XMMWORD[544+rsp] pand xmm3,XMMWORD[((544+16))+rsp] por xmm2,xmm0 por xmm3,xmm1 movdqa xmm0,xmm4 movdqa xmm1,xmm4 pandn xmm0,xmm2 movdqa xmm2,xmm4 pandn xmm1,xmm3 movdqa xmm3,xmm4 pand xmm2,XMMWORD[448+rsp] pand xmm3,XMMWORD[((448+16))+rsp] por xmm2,xmm0 por xmm3,xmm1 movdqu XMMWORD[64+rdi],xmm2 movdqu XMMWORD[80+rdi],xmm3 movdqa xmm0,xmm5 movdqa xmm1,xmm5 pandn xmm0,XMMWORD[288+rsp] movdqa xmm2,xmm5 pandn xmm1,XMMWORD[((288+16))+rsp] movdqa xmm3,xmm5 pand xmm2,XMMWORD[480+rsp] pand 
xmm3,XMMWORD[((480+16))+rsp] por xmm2,xmm0 por xmm3,xmm1 movdqa xmm0,xmm4 movdqa xmm1,xmm4 pandn xmm0,xmm2 movdqa xmm2,xmm4 pandn xmm1,xmm3 movdqa xmm3,xmm4 pand xmm2,XMMWORD[384+rsp] pand xmm3,XMMWORD[((384+16))+rsp] por xmm2,xmm0 por xmm3,xmm1 movdqu XMMWORD[rdi],xmm2 movdqu XMMWORD[16+rdi],xmm3 movdqa xmm0,xmm5 movdqa xmm1,xmm5 pandn xmm0,XMMWORD[320+rsp] movdqa xmm2,xmm5 pandn xmm1,XMMWORD[((320+16))+rsp] movdqa xmm3,xmm5 pand xmm2,XMMWORD[512+rsp] pand xmm3,XMMWORD[((512+16))+rsp] por xmm2,xmm0 por xmm3,xmm1 movdqa xmm0,xmm4 movdqa xmm1,xmm4 pandn xmm0,xmm2 movdqa xmm2,xmm4 pandn xmm1,xmm3 movdqa xmm3,xmm4 pand xmm2,XMMWORD[416+rsp] pand xmm3,XMMWORD[((416+16))+rsp] por xmm2,xmm0 por xmm3,xmm1 movdqu XMMWORD[32+rdi],xmm2 movdqu XMMWORD[48+rdi],xmm3 $L$add_donex: lea rsi,[((576+56))+rsp] mov r15,QWORD[((-48))+rsi] mov r14,QWORD[((-40))+rsi] mov r13,QWORD[((-32))+rsi] mov r12,QWORD[((-24))+rsi] mov rbx,QWORD[((-16))+rsi] mov rbp,QWORD[((-8))+rsi] lea rsp,[rsi] $L$point_addx_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$SEH_end_ecp_nistz256_point_add_adx: global ecp_nistz256_point_add_affine_adx ALIGN 32 ecp_nistz256_point_add_affine_adx: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_ecp_nistz256_point_add_affine_adx: mov rdi,rcx mov rsi,rdx mov rdx,r8 _CET_ENDBR push rbp push rbx push r12 push r13 push r14 push r15 sub rsp,32*15+8 $L$add_affinex_body: movdqu xmm0,XMMWORD[rsi] mov rbx,rdx movdqu xmm1,XMMWORD[16+rsi] movdqu xmm2,XMMWORD[32+rsi] movdqu xmm3,XMMWORD[48+rsi] movdqu xmm4,XMMWORD[64+rsi] movdqu xmm5,XMMWORD[80+rsi] mov rdx,QWORD[((64+0))+rsi] mov r14,QWORD[((64+8))+rsi] mov r15,QWORD[((64+16))+rsi] mov r8,QWORD[((64+24))+rsi] movdqa XMMWORD[320+rsp],xmm0 movdqa XMMWORD[(320+16)+rsp],xmm1 movdqa XMMWORD[352+rsp],xmm2 movdqa XMMWORD[(352+16)+rsp],xmm3 movdqa XMMWORD[384+rsp],xmm4 movdqa XMMWORD[(384+16)+rsp],xmm5 por xmm5,xmm4 movdqu xmm0,XMMWORD[rbx] pshufd xmm3,xmm5,0xb1 movdqu xmm1,XMMWORD[16+rbx] movdqu xmm2,XMMWORD[32+rbx] por xmm5,xmm3 movdqu xmm3,XMMWORD[48+rbx] movdqa XMMWORD[416+rsp],xmm0 pshufd xmm4,xmm5,0x1e movdqa XMMWORD[(416+16)+rsp],xmm1 por xmm1,xmm0 DB 102,72,15,110,199 movdqa XMMWORD[448+rsp],xmm2 movdqa XMMWORD[(448+16)+rsp],xmm3 por xmm3,xmm2 por xmm5,xmm4 pxor xmm4,xmm4 por xmm3,xmm1 lea rsi,[((64-128))+rsi] lea rdi,[32+rsp] call __ecp_nistz256_sqr_montx pcmpeqd xmm5,xmm4 pshufd xmm4,xmm3,0xb1 mov rdx,QWORD[rbx] mov r9,r12 por xmm4,xmm3 pshufd xmm5,xmm5,0 pshufd xmm3,xmm4,0x1e mov r10,r13 por xmm4,xmm3 pxor xmm3,xmm3 mov r11,r14 pcmpeqd xmm4,xmm3 pshufd xmm4,xmm4,0 lea rsi,[((32-128))+rsp] mov r12,r15 lea rdi,[rsp] call __ecp_nistz256_mul_montx lea rbx,[320+rsp] lea rdi,[64+rsp] call __ecp_nistz256_sub_fromx mov rdx,QWORD[384+rsp] lea rbx,[384+rsp] mov r9,QWORD[((0+32))+rsp] mov r10,QWORD[((8+32))+rsp] lea rsi,[((-128+32))+rsp] mov r11,QWORD[((16+32))+rsp] mov r12,QWORD[((24+32))+rsp] lea rdi,[32+rsp] call __ecp_nistz256_mul_montx mov rdx,QWORD[384+rsp] lea rbx,[384+rsp] mov r9,QWORD[((0+64))+rsp] mov r10,QWORD[((8+64))+rsp] lea rsi,[((-128+64))+rsp] mov r11,QWORD[((16+64))+rsp] mov r12,QWORD[((24+64))+rsp] lea rdi,[288+rsp] call __ecp_nistz256_mul_montx mov rdx,QWORD[448+rsp] lea rbx,[448+rsp] mov r9,QWORD[((0+32))+rsp] mov r10,QWORD[((8+32))+rsp] lea rsi,[((-128+32))+rsp] mov r11,QWORD[((16+32))+rsp] mov r12,QWORD[((24+32))+rsp] lea rdi,[32+rsp] call __ecp_nistz256_mul_montx lea rbx,[352+rsp] lea rdi,[96+rsp] call __ecp_nistz256_sub_fromx mov rdx,QWORD[((0+64))+rsp] mov 
r14,QWORD[((8+64))+rsp] lea rsi,[((-128+64))+rsp] mov r15,QWORD[((16+64))+rsp] mov r8,QWORD[((24+64))+rsp] lea rdi,[128+rsp] call __ecp_nistz256_sqr_montx mov rdx,QWORD[((0+96))+rsp] mov r14,QWORD[((8+96))+rsp] lea rsi,[((-128+96))+rsp] mov r15,QWORD[((16+96))+rsp] mov r8,QWORD[((24+96))+rsp] lea rdi,[192+rsp] call __ecp_nistz256_sqr_montx mov rdx,QWORD[128+rsp] lea rbx,[128+rsp] mov r9,QWORD[((0+64))+rsp] mov r10,QWORD[((8+64))+rsp] lea rsi,[((-128+64))+rsp] mov r11,QWORD[((16+64))+rsp] mov r12,QWORD[((24+64))+rsp] lea rdi,[160+rsp] call __ecp_nistz256_mul_montx mov rdx,QWORD[320+rsp] lea rbx,[320+rsp] mov r9,QWORD[((0+128))+rsp] mov r10,QWORD[((8+128))+rsp] lea rsi,[((-128+128))+rsp] mov r11,QWORD[((16+128))+rsp] mov r12,QWORD[((24+128))+rsp] lea rdi,[rsp] call __ecp_nistz256_mul_montx xor r11,r11 add r12,r12 lea rsi,[192+rsp] adc r13,r13 mov rax,r12 adc r8,r8 adc r9,r9 mov rbp,r13 adc r11,0 sub r12,-1 mov rcx,r8 sbb r13,r14 sbb r8,0 mov r10,r9 sbb r9,r15 sbb r11,0 cmovc r12,rax mov rax,QWORD[rsi] cmovc r13,rbp mov rbp,QWORD[8+rsi] cmovc r8,rcx mov rcx,QWORD[16+rsi] cmovc r9,r10 mov r10,QWORD[24+rsi] call __ecp_nistz256_subx lea rbx,[160+rsp] lea rdi,[224+rsp] call __ecp_nistz256_sub_fromx mov rax,QWORD[((0+0))+rsp] mov rbp,QWORD[((0+8))+rsp] mov rcx,QWORD[((0+16))+rsp] mov r10,QWORD[((0+24))+rsp] lea rdi,[64+rsp] call __ecp_nistz256_subx mov QWORD[rdi],r12 mov QWORD[8+rdi],r13 mov QWORD[16+rdi],r8 mov QWORD[24+rdi],r9 mov rdx,QWORD[352+rsp] lea rbx,[352+rsp] mov r9,QWORD[((0+160))+rsp] mov r10,QWORD[((8+160))+rsp] lea rsi,[((-128+160))+rsp] mov r11,QWORD[((16+160))+rsp] mov r12,QWORD[((24+160))+rsp] lea rdi,[32+rsp] call __ecp_nistz256_mul_montx mov rdx,QWORD[96+rsp] lea rbx,[96+rsp] mov r9,QWORD[((0+64))+rsp] mov r10,QWORD[((8+64))+rsp] lea rsi,[((-128+64))+rsp] mov r11,QWORD[((16+64))+rsp] mov r12,QWORD[((24+64))+rsp] lea rdi,[64+rsp] call __ecp_nistz256_mul_montx lea rbx,[32+rsp] lea rdi,[256+rsp] call __ecp_nistz256_sub_fromx DB 102,72,15,126,199 movdqa xmm0,xmm5 movdqa xmm1,xmm5 pandn xmm0,XMMWORD[288+rsp] movdqa xmm2,xmm5 pandn xmm1,XMMWORD[((288+16))+rsp] movdqa xmm3,xmm5 pand xmm2,XMMWORD[$L$ONE_mont] pand xmm3,XMMWORD[(($L$ONE_mont+16))] por xmm2,xmm0 por xmm3,xmm1 movdqa xmm0,xmm4 movdqa xmm1,xmm4 pandn xmm0,xmm2 movdqa xmm2,xmm4 pandn xmm1,xmm3 movdqa xmm3,xmm4 pand xmm2,XMMWORD[384+rsp] pand xmm3,XMMWORD[((384+16))+rsp] por xmm2,xmm0 por xmm3,xmm1 movdqu XMMWORD[64+rdi],xmm2 movdqu XMMWORD[80+rdi],xmm3 movdqa xmm0,xmm5 movdqa xmm1,xmm5 pandn xmm0,XMMWORD[224+rsp] movdqa xmm2,xmm5 pandn xmm1,XMMWORD[((224+16))+rsp] movdqa xmm3,xmm5 pand xmm2,XMMWORD[416+rsp] pand xmm3,XMMWORD[((416+16))+rsp] por xmm2,xmm0 por xmm3,xmm1 movdqa xmm0,xmm4 movdqa xmm1,xmm4 pandn xmm0,xmm2 movdqa xmm2,xmm4 pandn xmm1,xmm3 movdqa xmm3,xmm4 pand xmm2,XMMWORD[320+rsp] pand xmm3,XMMWORD[((320+16))+rsp] por xmm2,xmm0 por xmm3,xmm1 movdqu XMMWORD[rdi],xmm2 movdqu XMMWORD[16+rdi],xmm3 movdqa xmm0,xmm5 movdqa xmm1,xmm5 pandn xmm0,XMMWORD[256+rsp] movdqa xmm2,xmm5 pandn xmm1,XMMWORD[((256+16))+rsp] movdqa xmm3,xmm5 pand xmm2,XMMWORD[448+rsp] pand xmm3,XMMWORD[((448+16))+rsp] por xmm2,xmm0 por xmm3,xmm1 movdqa xmm0,xmm4 movdqa xmm1,xmm4 pandn xmm0,xmm2 movdqa xmm2,xmm4 pandn xmm1,xmm3 movdqa xmm3,xmm4 pand xmm2,XMMWORD[352+rsp] pand xmm3,XMMWORD[((352+16))+rsp] por xmm2,xmm0 por xmm3,xmm1 movdqu XMMWORD[32+rdi],xmm2 movdqu XMMWORD[48+rdi],xmm3 lea rsi,[((480+56))+rsp] mov r15,QWORD[((-48))+rsi] mov r14,QWORD[((-40))+rsi] mov r13,QWORD[((-32))+rsi] mov r12,QWORD[((-24))+rsi] mov rbx,QWORD[((-16))+rsi] mov 
rbp,QWORD[((-8))+rsi] lea rsp,[rsi] $L$add_affinex_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$SEH_end_ecp_nistz256_point_add_affine_adx: EXTERN __imp_RtlVirtualUnwind ALIGN 16 short_handler: push rsi push rdi push rbx push rbp push r12 push r13 push r14 push r15 pushfq sub rsp,64 mov rax,QWORD[120+r8] mov rbx,QWORD[248+r8] mov rsi,QWORD[8+r9] mov r11,QWORD[56+r9] mov r10d,DWORD[r11] lea r10,[r10*1+rsi] cmp rbx,r10 jb NEAR $L$common_seh_tail mov rax,QWORD[152+r8] mov r10d,DWORD[4+r11] lea r10,[r10*1+rsi] cmp rbx,r10 jae NEAR $L$common_seh_tail lea rax,[16+rax] mov r12,QWORD[((-8))+rax] mov r13,QWORD[((-16))+rax] mov QWORD[216+r8],r12 mov QWORD[224+r8],r13 jmp NEAR $L$common_seh_tail ALIGN 16 full_handler: push rsi push rdi push rbx push rbp push r12 push r13 push r14 push r15 pushfq sub rsp,64 mov rax,QWORD[120+r8] mov rbx,QWORD[248+r8] mov rsi,QWORD[8+r9] mov r11,QWORD[56+r9] mov r10d,DWORD[r11] lea r10,[r10*1+rsi] cmp rbx,r10 jb NEAR $L$common_seh_tail mov rax,QWORD[152+r8] mov r10d,DWORD[4+r11] lea r10,[r10*1+rsi] cmp rbx,r10 jae NEAR $L$common_seh_tail mov r10d,DWORD[8+r11] lea rax,[r10*1+rax] mov rbp,QWORD[((-8))+rax] mov rbx,QWORD[((-16))+rax] mov r12,QWORD[((-24))+rax] mov r13,QWORD[((-32))+rax] mov r14,QWORD[((-40))+rax] mov r15,QWORD[((-48))+rax] mov QWORD[144+r8],rbx mov QWORD[160+r8],rbp mov QWORD[216+r8],r12 mov QWORD[224+r8],r13 mov QWORD[232+r8],r14 mov QWORD[240+r8],r15 $L$common_seh_tail: mov rdi,QWORD[8+rax] mov rsi,QWORD[16+rax] mov QWORD[152+r8],rax mov QWORD[168+r8],rsi mov QWORD[176+r8],rdi mov rdi,QWORD[40+r9] mov rsi,r8 mov ecx,154 DD 0xa548f3fc mov rsi,r9 xor rcx,rcx mov rdx,QWORD[8+rsi] mov r8,QWORD[rsi] mov r9,QWORD[16+rsi] mov r10,QWORD[40+rsi] lea r11,[56+rsi] lea r12,[24+rsi] mov QWORD[32+rsp],r10 mov QWORD[40+rsp],r11 mov QWORD[48+rsp],r12 mov QWORD[56+rsp],rcx call QWORD[__imp_RtlVirtualUnwind] mov eax,1 add rsp,64 popfq pop r15 pop r14 pop r13 pop r12 pop rbp pop rbx pop rdi pop rsi ret section .pdata rdata align=4 ALIGN 4 DD $L$SEH_begin_ecp_nistz256_neg wrt ..imagebase DD $L$SEH_end_ecp_nistz256_neg wrt ..imagebase DD $L$SEH_info_ecp_nistz256_neg wrt ..imagebase DD $L$SEH_begin_ecp_nistz256_ord_mul_mont_nohw wrt ..imagebase DD $L$SEH_end_ecp_nistz256_ord_mul_mont_nohw wrt ..imagebase DD $L$SEH_info_ecp_nistz256_ord_mul_mont_nohw wrt ..imagebase DD $L$SEH_begin_ecp_nistz256_ord_sqr_mont_nohw wrt ..imagebase DD $L$SEH_end_ecp_nistz256_ord_sqr_mont_nohw wrt ..imagebase DD $L$SEH_info_ecp_nistz256_ord_sqr_mont_nohw wrt ..imagebase DD $L$SEH_begin_ecp_nistz256_ord_mul_mont_adx wrt ..imagebase DD $L$SEH_end_ecp_nistz256_ord_mul_mont_adx wrt ..imagebase DD $L$SEH_info_ecp_nistz256_ord_mul_mont_adx wrt ..imagebase DD $L$SEH_begin_ecp_nistz256_ord_sqr_mont_adx wrt ..imagebase DD $L$SEH_end_ecp_nistz256_ord_sqr_mont_adx wrt ..imagebase DD $L$SEH_info_ecp_nistz256_ord_sqr_mont_adx wrt ..imagebase DD $L$SEH_begin_ecp_nistz256_mul_mont_nohw wrt ..imagebase DD $L$SEH_end_ecp_nistz256_mul_mont_nohw wrt ..imagebase DD $L$SEH_info_ecp_nistz256_mul_mont_nohw wrt ..imagebase DD $L$SEH_begin_ecp_nistz256_sqr_mont_nohw wrt ..imagebase DD $L$SEH_end_ecp_nistz256_sqr_mont_nohw wrt ..imagebase DD $L$SEH_info_ecp_nistz256_sqr_mont_nohw wrt ..imagebase DD $L$SEH_begin_ecp_nistz256_mul_mont_adx wrt ..imagebase DD $L$SEH_end_ecp_nistz256_mul_mont_adx wrt ..imagebase DD $L$SEH_info_ecp_nistz256_mul_mont_adx wrt ..imagebase DD $L$SEH_begin_ecp_nistz256_sqr_mont_adx wrt ..imagebase DD $L$SEH_end_ecp_nistz256_sqr_mont_adx wrt ..imagebase DD 
$L$SEH_info_ecp_nistz256_sqr_mont_adx wrt ..imagebase DD $L$SEH_begin_ecp_nistz256_select_w5_nohw wrt ..imagebase DD $L$SEH_end_ecp_nistz256_select_w5_nohw wrt ..imagebase DD $L$SEH_info_ecp_nistz256_select_wX_nohw wrt ..imagebase DD $L$SEH_begin_ecp_nistz256_select_w7_nohw wrt ..imagebase DD $L$SEH_end_ecp_nistz256_select_w7_nohw wrt ..imagebase DD $L$SEH_info_ecp_nistz256_select_wX_nohw wrt ..imagebase DD $L$SEH_begin_ecp_nistz256_select_w5_avx2 wrt ..imagebase DD $L$SEH_end_ecp_nistz256_select_w5_avx2 wrt ..imagebase DD $L$SEH_info_ecp_nistz256_select_wX_avx2 wrt ..imagebase DD $L$SEH_begin_ecp_nistz256_select_w7_avx2 wrt ..imagebase DD $L$SEH_end_ecp_nistz256_select_w7_avx2 wrt ..imagebase DD $L$SEH_info_ecp_nistz256_select_wX_avx2 wrt ..imagebase DD $L$SEH_begin_ecp_nistz256_point_double_nohw wrt ..imagebase DD $L$SEH_end_ecp_nistz256_point_double_nohw wrt ..imagebase DD $L$SEH_info_ecp_nistz256_point_double_nohw wrt ..imagebase DD $L$SEH_begin_ecp_nistz256_point_add_nohw wrt ..imagebase DD $L$SEH_end_ecp_nistz256_point_add_nohw wrt ..imagebase DD $L$SEH_info_ecp_nistz256_point_add_nohw wrt ..imagebase DD $L$SEH_begin_ecp_nistz256_point_add_affine_nohw wrt ..imagebase DD $L$SEH_end_ecp_nistz256_point_add_affine_nohw wrt ..imagebase DD $L$SEH_info_ecp_nistz256_point_add_affine_nohw wrt ..imagebase DD $L$SEH_begin_ecp_nistz256_point_double_adx wrt ..imagebase DD $L$SEH_end_ecp_nistz256_point_double_adx wrt ..imagebase DD $L$SEH_info_ecp_nistz256_point_double_adx wrt ..imagebase DD $L$SEH_begin_ecp_nistz256_point_add_adx wrt ..imagebase DD $L$SEH_end_ecp_nistz256_point_add_adx wrt ..imagebase DD $L$SEH_info_ecp_nistz256_point_add_adx wrt ..imagebase DD $L$SEH_begin_ecp_nistz256_point_add_affine_adx wrt ..imagebase DD $L$SEH_end_ecp_nistz256_point_add_affine_adx wrt ..imagebase DD $L$SEH_info_ecp_nistz256_point_add_affine_adx wrt ..imagebase section .xdata rdata align=8 ALIGN 8 $L$SEH_info_ecp_nistz256_neg: DB 9,0,0,0 DD short_handler wrt ..imagebase DD $L$neg_body wrt ..imagebase,$L$neg_epilogue wrt ..imagebase $L$SEH_info_ecp_nistz256_ord_mul_mont_nohw: DB 9,0,0,0 DD full_handler wrt ..imagebase DD $L$ord_mul_body wrt ..imagebase,$L$ord_mul_epilogue wrt ..imagebase DD 48,0 $L$SEH_info_ecp_nistz256_ord_sqr_mont_nohw: DB 9,0,0,0 DD full_handler wrt ..imagebase DD $L$ord_sqr_body wrt ..imagebase,$L$ord_sqr_epilogue wrt ..imagebase DD 48,0 $L$SEH_info_ecp_nistz256_ord_mul_mont_adx: DB 9,0,0,0 DD full_handler wrt ..imagebase DD $L$ord_mulx_body wrt ..imagebase,$L$ord_mulx_epilogue wrt ..imagebase DD 48,0 $L$SEH_info_ecp_nistz256_ord_sqr_mont_adx: DB 9,0,0,0 DD full_handler wrt ..imagebase DD $L$ord_sqrx_body wrt ..imagebase,$L$ord_sqrx_epilogue wrt ..imagebase DD 48,0 $L$SEH_info_ecp_nistz256_mul_mont_nohw: DB 9,0,0,0 DD full_handler wrt ..imagebase DD $L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase DD 48,0 $L$SEH_info_ecp_nistz256_sqr_mont_nohw: DB 9,0,0,0 DD full_handler wrt ..imagebase DD $L$sqr_body wrt ..imagebase,$L$sqr_epilogue wrt ..imagebase DD 48,0 $L$SEH_info_ecp_nistz256_mul_mont_adx: DB 9,0,0,0 DD full_handler wrt ..imagebase DD $L$mulx_body wrt ..imagebase,$L$mulx_epilogue wrt ..imagebase DD 48,0 $L$SEH_info_ecp_nistz256_sqr_mont_adx: DB 9,0,0,0 DD full_handler wrt ..imagebase DD $L$sqrx_body wrt ..imagebase,$L$sqrx_epilogue wrt ..imagebase DD 48,0 $L$SEH_info_ecp_nistz256_select_wX_nohw: DB 0x01,0x33,0x16,0x00 DB 0x33,0xf8,0x09,0x00 DB 0x2e,0xe8,0x08,0x00 DB 0x29,0xd8,0x07,0x00 DB 0x24,0xc8,0x06,0x00 DB 0x1f,0xb8,0x05,0x00 DB 0x1a,0xa8,0x04,0x00 DB 
0x15,0x98,0x03,0x00 DB 0x10,0x88,0x02,0x00 DB 0x0c,0x78,0x01,0x00 DB 0x08,0x68,0x00,0x00 DB 0x04,0x01,0x15,0x00 ALIGN 8 $L$SEH_info_ecp_nistz256_select_wX_avx2: DB 0x01,0x36,0x17,0x0b DB 0x36,0xf8,0x09,0x00 DB 0x31,0xe8,0x08,0x00 DB 0x2c,0xd8,0x07,0x00 DB 0x27,0xc8,0x06,0x00 DB 0x22,0xb8,0x05,0x00 DB 0x1d,0xa8,0x04,0x00 DB 0x18,0x98,0x03,0x00 DB 0x13,0x88,0x02,0x00 DB 0x0e,0x78,0x01,0x00 DB 0x09,0x68,0x00,0x00 DB 0x04,0x01,0x15,0x00 DB 0x00,0xb3,0x00,0x00 ALIGN 8 $L$SEH_info_ecp_nistz256_point_double_nohw: DB 9,0,0,0 DD full_handler wrt ..imagebase DD $L$point_doubleq_body wrt ..imagebase,$L$point_doubleq_epilogue wrt ..imagebase DD 32*5+56,0 $L$SEH_info_ecp_nistz256_point_add_nohw: DB 9,0,0,0 DD full_handler wrt ..imagebase DD $L$point_addq_body wrt ..imagebase,$L$point_addq_epilogue wrt ..imagebase DD 32*18+56,0 $L$SEH_info_ecp_nistz256_point_add_affine_nohw: DB 9,0,0,0 DD full_handler wrt ..imagebase DD $L$add_affineq_body wrt ..imagebase,$L$add_affineq_epilogue wrt ..imagebase DD 32*15+56,0 ALIGN 8 $L$SEH_info_ecp_nistz256_point_double_adx: DB 9,0,0,0 DD full_handler wrt ..imagebase DD $L$point_doublex_body wrt ..imagebase,$L$point_doublex_epilogue wrt ..imagebase DD 32*5+56,0 $L$SEH_info_ecp_nistz256_point_add_adx: DB 9,0,0,0 DD full_handler wrt ..imagebase DD $L$point_addx_body wrt ..imagebase,$L$point_addx_epilogue wrt ..imagebase DD 32*18+56,0 $L$SEH_info_ecp_nistz256_point_add_affine_adx: DB 9,0,0,0 DD full_handler wrt ..imagebase DD $L$add_affinex_body wrt ..imagebase,$L$add_affinex_epilogue wrt ..imagebase DD 32*15+56,0 %else ; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 ret %endif ring-0.17.14/pregenerated/p256-x86_64-asm-nasm.o000064400000000000000000001773451046102023000167450ustar 00000000000000dg .debug$S,0@B.debug$T\N@B.textBNO! p`.rdataa@p@.pdataaE9@0@.xdataG-@@@9C:\Users\b\p\ring\pregenerated\p256-x86_64-asm-nasm.asm6)pa]rB?r,- . 0168<=> ?#@&B)C-D1E4F8G?HBIFKILLMPNTOWP[Q^SbTfUiVmWqXuYyZ}\^`cdeprstvwx}   #&)-037;>BEHKNRVZ]adgjnqtwz~    # &),037:>ADHKORVY\ `"c#g%j&n'q(u)x*{+~,-01234679:;<=>?@BCDEFHIJMNOPQRTUVWXYZ\]^_ `acdfg h#i'j*k-l0m3n7q:r=s@tCuFwIxMzQ{T|X}[~^adhlpsvy|   #'+/69>@CFINQTWZ]bfilorw{~    !"#$&'()*+-. 0 1234789":&;)=,>0@3A6B:C=D@ECFFGJIMJRLVMZN]O`PcRfSjVmWpXsYwZz\}]_`abcdefhiklmnoqruvwxy{|~ #&)-0369=AEHKNRUX[^adgknquy|   #&)-1 5 9 @ GLQVY^afilps| !"$%&()*+,-.1235679:;=#>&?+@1A7C=DCEGHPIVJ\LeMkNqPzQRTUVWXYZ]^_abcefgijkl m o p q t$ u* v0 x9 y? zE |N }T ~Z c g m s y                          " ( . 7 > A G M P V \ ` c f j n q u y }                                       % * - 2 7 : = B E I L Q W ] b e k q u z }   ! " # $ ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : = > @ A B C& D, E2 F8 G> HD IJ JP KV L\ Mb Pe Qk Sp Tv U| V W X Y Z [ \ ] ^ _ b c e f g h i j k l m n o p q t u w x" y( z. {4 |: }@ ~F L R X ^ d g j m p s v z }                                 #&)-15:>CHMRW\afg     %&'()*+,-./345678: ;<=>?@ B$C'D+E.F1G5H8J<K?LCMFNIOLPPTSUWVZW^XaYdZg[k\n]r^ubxc{d~efgijklmnoqrstuvwyz{|}~  $'+.158<?CFILPSWZ^adgjmptx{~    #&),/258 <!?#B$E%H&L'O*R+U,X-\._0b1e2h3l4o5r6v9y:|;<=>@ABCDEFHIJKMNOPQRTUVWXY[\]^_abghijklmn ostuvw x#y&z){,|037:=ADGJMQTX[^behkorux{~  %*+@EJPSX_bgjmruz~     !"#$%'() -.0123 4$5'90:6;<=E>K?QAZB`CfEoFrGxH}IJLMNRSUVWXYZ^_`bcdfghjklm noqr!s%w(x+z0{3|:}=~@CGILPSWZ]aeilptx| %*,/47:>AFLRWZ`fjovy      !$),15:=@EHM Q!V$Y%\'a(d)i*m+r.u/x1}2356789:;<=?@ABCDEGHIJKLMNPW[]^_`abcd efgh#i(k,l0m4n8o<p@rEsJuOxTyYz^|c}i~ou{&./@HLPTX]bglqv{ !&,28>DJS\de                         ! # $ "& &' *( 1* 6+ ;, @- E. J/ O1 S2 W3 \4 a5 f6 k8 n9 t; x< }= > ? 
rL$common_seh_tailL$SEH_info_ecp_nistz256_negL$SEH_info_ecp_nistz256_ord_mul_mont_nohwL$SEH_info_ecp_nistz256_ord_sqr_mont_nohwL$SEH_info_ecp_nistz256_ord_mul_mont_adxL$SEH_info_ecp_nistz256_ord_sqr_mont_adxL$SEH_info_ecp_nistz256_mul_mont_nohwL$SEH_info_ecp_nistz256_sqr_mont_nohwL$SEH_info_ecp_nistz256_mul_mont_adxL$SEH_info_ecp_nistz256_sqr_mont_adxL$SEH_info_ecp_nistz256_select_wX_nohwL$SEH_info_ecp_nistz256_select_wX_avx2L$SEH_info_ecp_nistz256_point_double_nohwL$SEH_info_ecp_nistz256_point_add_nohwL$SEH_info_ecp_nistz256_point_add_affine_nohwL$SEH_info_ecp_nistz256_point_double_adxL$SEH_info_ecp_nistz256_point_add_adxL$SEH_info_ecp_nistz256_point_add_affine_adxring-0.17.14/pregenerated/ring_core_generated/prefix_symbols.h000064400000000000000000000264631046102023000226200ustar 00000000000000 #ifndef ring_core_generated_PREFIX_SYMBOLS_H #define ring_core_generated_PREFIX_SYMBOLS_H #define ecp_nistz256_point_double p256_point_double #define ecp_nistz256_point_add p256_point_add #define ecp_nistz256_point_add_affine p256_point_add_affine #define ecp_nistz256_ord_mul_mont p256_scalar_mul_mont #define ecp_nistz256_ord_sqr_mont p256_scalar_sqr_rep_mont #define ecp_nistz256_mul_mont p256_mul_mont #define ecp_nistz256_sqr_mont p256_sqr_mont #define adx_bmi2_available ring_core_0_17_14__adx_bmi2_available #define avx2_available ring_core_0_17_14__avx2_available #define CRYPTO_memcmp ring_core_0_17_14__CRYPTO_memcmp #define CRYPTO_poly1305_finish ring_core_0_17_14__CRYPTO_poly1305_finish #define CRYPTO_poly1305_finish_neon ring_core_0_17_14__CRYPTO_poly1305_finish_neon #define CRYPTO_poly1305_init ring_core_0_17_14__CRYPTO_poly1305_init #define CRYPTO_poly1305_init_neon ring_core_0_17_14__CRYPTO_poly1305_init_neon #define CRYPTO_poly1305_update ring_core_0_17_14__CRYPTO_poly1305_update #define CRYPTO_poly1305_update_neon ring_core_0_17_14__CRYPTO_poly1305_update_neon #define ChaCha20_ctr32 ring_core_0_17_14__ChaCha20_ctr32 #define ChaCha20_ctr32_avx2 ring_core_0_17_14__ChaCha20_ctr32_avx2 #define ChaCha20_ctr32_neon ring_core_0_17_14__ChaCha20_ctr32_neon #define ChaCha20_ctr32_nohw ring_core_0_17_14__ChaCha20_ctr32_nohw #define ChaCha20_ctr32_ssse3 ring_core_0_17_14__ChaCha20_ctr32_ssse3 #define ChaCha20_ctr32_ssse3_4x ring_core_0_17_14__ChaCha20_ctr32_ssse3_4x #define LIMB_is_zero ring_core_0_17_14__LIMB_is_zero #define LIMBS_add_mod ring_core_0_17_14__LIMBS_add_mod #define LIMBS_are_zero ring_core_0_17_14__LIMBS_are_zero #define LIMBS_equal ring_core_0_17_14__LIMBS_equal #define LIMBS_less_than ring_core_0_17_14__LIMBS_less_than #define LIMBS_reduce_once ring_core_0_17_14__LIMBS_reduce_once #define LIMBS_select_512_32 ring_core_0_17_14__LIMBS_select_512_32 #define LIMBS_shl_mod ring_core_0_17_14__LIMBS_shl_mod #define LIMBS_sub_mod ring_core_0_17_14__LIMBS_sub_mod #define LIMBS_window5_split_window ring_core_0_17_14__LIMBS_window5_split_window #define LIMBS_window5_unsplit_window ring_core_0_17_14__LIMBS_window5_unsplit_window #define LIMB_shr ring_core_0_17_14__LIMB_shr #define OPENSSL_cpuid_setup ring_core_0_17_14__OPENSSL_cpuid_setup #define aes_gcm_dec_kernel ring_core_0_17_14__aes_gcm_dec_kernel #define aes_gcm_dec_update_vaes_avx2 ring_core_0_17_14__aes_gcm_dec_update_vaes_avx2 #define aes_gcm_enc_kernel ring_core_0_17_14__aes_gcm_enc_kernel #define aes_gcm_enc_update_vaes_avx2 ring_core_0_17_14__aes_gcm_enc_update_vaes_avx2 #define aes_hw_ctr32_encrypt_blocks ring_core_0_17_14__aes_hw_ctr32_encrypt_blocks #define aes_hw_set_encrypt_key ring_core_0_17_14__aes_hw_set_encrypt_key #define 
aes_hw_set_encrypt_key_alt ring_core_0_17_14__aes_hw_set_encrypt_key_alt #define aes_hw_set_encrypt_key_base ring_core_0_17_14__aes_hw_set_encrypt_key_base #define aes_nohw_ctr32_encrypt_blocks ring_core_0_17_14__aes_nohw_ctr32_encrypt_blocks #define aes_nohw_encrypt ring_core_0_17_14__aes_nohw_encrypt #define aes_nohw_set_encrypt_key ring_core_0_17_14__aes_nohw_set_encrypt_key #define aesni_gcm_decrypt ring_core_0_17_14__aesni_gcm_decrypt #define aesni_gcm_encrypt ring_core_0_17_14__aesni_gcm_encrypt #define bn_from_montgomery_in_place ring_core_0_17_14__bn_from_montgomery_in_place #define bn_gather5 ring_core_0_17_14__bn_gather5 #define bn_mul_mont ring_core_0_17_14__bn_mul_mont #define bn_mul_mont_nohw ring_core_0_17_14__bn_mul_mont_nohw #define bn_mul4x_mont ring_core_0_17_14__bn_mul4x_mont #define bn_mulx4x_mont ring_core_0_17_14__bn_mulx4x_mont #define bn_mul8x_mont_neon ring_core_0_17_14__bn_mul8x_mont_neon #define bn_mul4x_mont_gather5 ring_core_0_17_14__bn_mul4x_mont_gather5 #define bn_mulx4x_mont_gather5 ring_core_0_17_14__bn_mulx4x_mont_gather5 #define bn_neg_inv_mod_r_u64 ring_core_0_17_14__bn_neg_inv_mod_r_u64 #define bn_power5_nohw ring_core_0_17_14__bn_power5_nohw #define bn_powerx5 ring_core_0_17_14__bn_powerx5 #define bn_scatter5 ring_core_0_17_14__bn_scatter5 #define bn_sqr8x_internal ring_core_0_17_14__bn_sqr8x_internal #define bn_sqr8x_mont ring_core_0_17_14__bn_sqr8x_mont #define bn_sqrx8x_internal ring_core_0_17_14__bn_sqrx8x_internal #define bsaes_ctr32_encrypt_blocks ring_core_0_17_14__bsaes_ctr32_encrypt_blocks #define bssl_constant_time_test_conditional_memcpy ring_core_0_17_14__bssl_constant_time_test_conditional_memcpy #define bssl_constant_time_test_conditional_memxor ring_core_0_17_14__bssl_constant_time_test_conditional_memxor #define bssl_constant_time_test_main ring_core_0_17_14__bssl_constant_time_test_main #define chacha20_poly1305_open ring_core_0_17_14__chacha20_poly1305_open #define chacha20_poly1305_open_avx2 ring_core_0_17_14__chacha20_poly1305_open_avx2 #define chacha20_poly1305_open_sse41 ring_core_0_17_14__chacha20_poly1305_open_sse41 #define chacha20_poly1305_seal ring_core_0_17_14__chacha20_poly1305_seal #define chacha20_poly1305_seal_avx2 ring_core_0_17_14__chacha20_poly1305_seal_avx2 #define chacha20_poly1305_seal_sse41 ring_core_0_17_14__chacha20_poly1305_seal_sse41 #define ecp_nistz256_mul_mont_adx ring_core_0_17_14__ecp_nistz256_mul_mont_adx #define ecp_nistz256_mul_mont_nohw ring_core_0_17_14__ecp_nistz256_mul_mont_nohw #define ecp_nistz256_ord_mul_mont_adx ring_core_0_17_14__ecp_nistz256_ord_mul_mont_adx #define ecp_nistz256_ord_mul_mont_nohw ring_core_0_17_14__ecp_nistz256_ord_mul_mont_nohw #define ecp_nistz256_ord_sqr_mont_adx ring_core_0_17_14__ecp_nistz256_ord_sqr_mont_adx #define ecp_nistz256_ord_sqr_mont_nohw ring_core_0_17_14__ecp_nistz256_ord_sqr_mont_nohw #define ecp_nistz256_point_add_adx ring_core_0_17_14__ecp_nistz256_point_add_adx #define ecp_nistz256_point_add_nohw ring_core_0_17_14__ecp_nistz256_point_add_nohw #define ecp_nistz256_point_add_affine_adx ring_core_0_17_14__ecp_nistz256_point_add_affine_adx #define ecp_nistz256_point_add_affine_nohw ring_core_0_17_14__ecp_nistz256_point_add_affine_nohw #define ecp_nistz256_point_double_adx ring_core_0_17_14__ecp_nistz256_point_double_adx #define ecp_nistz256_point_double_nohw ring_core_0_17_14__ecp_nistz256_point_double_nohw #define ecp_nistz256_select_w5_avx2 ring_core_0_17_14__ecp_nistz256_select_w5_avx2 #define ecp_nistz256_select_w5_nohw 
ring_core_0_17_14__ecp_nistz256_select_w5_nohw #define ecp_nistz256_select_w7_avx2 ring_core_0_17_14__ecp_nistz256_select_w7_avx2 #define ecp_nistz256_select_w7_nohw ring_core_0_17_14__ecp_nistz256_select_w7_nohw #define ecp_nistz256_sqr_mont_adx ring_core_0_17_14__ecp_nistz256_sqr_mont_adx #define ecp_nistz256_sqr_mont_nohw ring_core_0_17_14__ecp_nistz256_sqr_mont_nohw #define fiat_curve25519_adx_mul ring_core_0_17_14__fiat_curve25519_adx_mul #define fiat_curve25519_adx_square ring_core_0_17_14__fiat_curve25519_adx_square #define gcm_ghash_avx ring_core_0_17_14__gcm_ghash_avx #define gcm_ghash_clmul ring_core_0_17_14__gcm_ghash_clmul #define gcm_ghash_neon ring_core_0_17_14__gcm_ghash_neon #define gcm_ghash_vpclmulqdq_avx2_1 ring_core_0_17_14__gcm_ghash_vpclmulqdq_avx2_1 #define gcm_gmult_clmul ring_core_0_17_14__gcm_gmult_clmul #define gcm_gmult_neon ring_core_0_17_14__gcm_gmult_neon #define gcm_init_avx ring_core_0_17_14__gcm_init_avx #define gcm_init_clmul ring_core_0_17_14__gcm_init_clmul #define gcm_init_neon ring_core_0_17_14__gcm_init_neon #define gcm_init_vpclmulqdq_avx2 ring_core_0_17_14__gcm_init_vpclmulqdq_avx2 #define k25519Precomp ring_core_0_17_14__k25519Precomp #define limbs_mul_add_limb ring_core_0_17_14__limbs_mul_add_limb #define little_endian_bytes_from_scalar ring_core_0_17_14__little_endian_bytes_from_scalar #define ecp_nistz256_neg ring_core_0_17_14__ecp_nistz256_neg #define ecp_nistz256_select_w5 ring_core_0_17_14__ecp_nistz256_select_w5 #define ecp_nistz256_select_w7 ring_core_0_17_14__ecp_nistz256_select_w7 #define neon_available ring_core_0_17_14__neon_available #define p256_mul_mont ring_core_0_17_14__p256_mul_mont #define p256_point_add ring_core_0_17_14__p256_point_add #define p256_point_add_affine ring_core_0_17_14__p256_point_add_affine #define p256_point_double ring_core_0_17_14__p256_point_double #define p256_point_mul ring_core_0_17_14__p256_point_mul #define p256_point_mul_base ring_core_0_17_14__p256_point_mul_base #define p256_point_mul_base_vartime ring_core_0_17_14__p256_point_mul_base_vartime #define p256_scalar_mul_mont ring_core_0_17_14__p256_scalar_mul_mont #define p256_scalar_sqr_rep_mont ring_core_0_17_14__p256_scalar_sqr_rep_mont #define p256_sqr_mont ring_core_0_17_14__p256_sqr_mont #define p384_elem_div_by_2 ring_core_0_17_14__p384_elem_div_by_2 #define p384_elem_mul_mont ring_core_0_17_14__p384_elem_mul_mont #define p384_elem_neg ring_core_0_17_14__p384_elem_neg #define p384_elem_sub ring_core_0_17_14__p384_elem_sub #define p384_point_add ring_core_0_17_14__p384_point_add #define p384_point_double ring_core_0_17_14__p384_point_double #define p384_point_mul ring_core_0_17_14__p384_point_mul #define p384_scalar_mul_mont ring_core_0_17_14__p384_scalar_mul_mont #define openssl_poly1305_neon2_addmulmod ring_core_0_17_14__openssl_poly1305_neon2_addmulmod #define openssl_poly1305_neon2_blocks ring_core_0_17_14__openssl_poly1305_neon2_blocks #define sha256_block_data_order ring_core_0_17_14__sha256_block_data_order #define sha256_block_data_order_avx ring_core_0_17_14__sha256_block_data_order_avx #define sha256_block_data_order_ssse3 ring_core_0_17_14__sha256_block_data_order_ssse3 #define sha256_block_data_order_hw ring_core_0_17_14__sha256_block_data_order_hw #define sha256_block_data_order_neon ring_core_0_17_14__sha256_block_data_order_neon #define sha256_block_data_order_nohw ring_core_0_17_14__sha256_block_data_order_nohw #define sha512_block_data_order ring_core_0_17_14__sha512_block_data_order #define sha512_block_data_order_avx 
ring_core_0_17_14__sha512_block_data_order_avx #define sha512_block_data_order_hw ring_core_0_17_14__sha512_block_data_order_hw #define sha512_block_data_order_neon ring_core_0_17_14__sha512_block_data_order_neon #define sha512_block_data_order_nohw ring_core_0_17_14__sha512_block_data_order_nohw #define vpaes_ctr32_encrypt_blocks ring_core_0_17_14__vpaes_ctr32_encrypt_blocks #define vpaes_encrypt ring_core_0_17_14__vpaes_encrypt #define vpaes_encrypt_key_to_bsaes ring_core_0_17_14__vpaes_encrypt_key_to_bsaes #define vpaes_set_encrypt_key ring_core_0_17_14__vpaes_set_encrypt_key #define x25519_NEON ring_core_0_17_14__x25519_NEON #define x25519_fe_invert ring_core_0_17_14__x25519_fe_invert #define x25519_fe_isnegative ring_core_0_17_14__x25519_fe_isnegative #define x25519_fe_mul_ttt ring_core_0_17_14__x25519_fe_mul_ttt #define x25519_fe_neg ring_core_0_17_14__x25519_fe_neg #define x25519_fe_tobytes ring_core_0_17_14__x25519_fe_tobytes #define x25519_ge_double_scalarmult_vartime ring_core_0_17_14__x25519_ge_double_scalarmult_vartime #define x25519_ge_frombytes_vartime ring_core_0_17_14__x25519_ge_frombytes_vartime #define x25519_ge_scalarmult_base ring_core_0_17_14__x25519_ge_scalarmult_base #define x25519_ge_scalarmult_base_adx ring_core_0_17_14__x25519_ge_scalarmult_base_adx #define x25519_public_from_private_generic_masked ring_core_0_17_14__x25519_public_from_private_generic_masked #define x25519_sc_mask ring_core_0_17_14__x25519_sc_mask #define x25519_sc_muladd ring_core_0_17_14__x25519_sc_muladd #define x25519_sc_reduce ring_core_0_17_14__x25519_sc_reduce #define x25519_scalar_mult_adx ring_core_0_17_14__x25519_scalar_mult_adx #define x25519_scalar_mult_generic_masked ring_core_0_17_14__x25519_scalar_mult_generic_masked #endif ring-0.17.14/pregenerated/ring_core_generated/prefix_symbols_asm.h000064400000000000000000000555631046102023000234630ustar 00000000000000 #ifndef ring_core_generated_PREFIX_SYMBOLS_ASM_H #define ring_core_generated_PREFIX_SYMBOLS_ASM_H #if defined(__APPLE__) #define _ecp_nistz256_point_double _p256_point_double #define _ecp_nistz256_point_add _p256_point_add #define _ecp_nistz256_point_add_affine _p256_point_add_affine #define _ecp_nistz256_ord_mul_mont _p256_scalar_mul_mont #define _ecp_nistz256_ord_sqr_mont _p256_scalar_sqr_rep_mont #define _ecp_nistz256_mul_mont _p256_mul_mont #define _ecp_nistz256_sqr_mont _p256_sqr_mont #define _adx_bmi2_available _ring_core_0_17_14__adx_bmi2_available #define _avx2_available _ring_core_0_17_14__avx2_available #define _CRYPTO_memcmp _ring_core_0_17_14__CRYPTO_memcmp #define _CRYPTO_poly1305_finish _ring_core_0_17_14__CRYPTO_poly1305_finish #define _CRYPTO_poly1305_finish_neon _ring_core_0_17_14__CRYPTO_poly1305_finish_neon #define _CRYPTO_poly1305_init _ring_core_0_17_14__CRYPTO_poly1305_init #define _CRYPTO_poly1305_init_neon _ring_core_0_17_14__CRYPTO_poly1305_init_neon #define _CRYPTO_poly1305_update _ring_core_0_17_14__CRYPTO_poly1305_update #define _CRYPTO_poly1305_update_neon _ring_core_0_17_14__CRYPTO_poly1305_update_neon #define _ChaCha20_ctr32 _ring_core_0_17_14__ChaCha20_ctr32 #define _ChaCha20_ctr32_avx2 _ring_core_0_17_14__ChaCha20_ctr32_avx2 #define _ChaCha20_ctr32_neon _ring_core_0_17_14__ChaCha20_ctr32_neon #define _ChaCha20_ctr32_nohw _ring_core_0_17_14__ChaCha20_ctr32_nohw #define _ChaCha20_ctr32_ssse3 _ring_core_0_17_14__ChaCha20_ctr32_ssse3 #define _ChaCha20_ctr32_ssse3_4x _ring_core_0_17_14__ChaCha20_ctr32_ssse3_4x #define _LIMB_is_zero _ring_core_0_17_14__LIMB_is_zero #define _LIMBS_add_mod 
_ring_core_0_17_14__LIMBS_add_mod #define _LIMBS_are_zero _ring_core_0_17_14__LIMBS_are_zero #define _LIMBS_equal _ring_core_0_17_14__LIMBS_equal #define _LIMBS_less_than _ring_core_0_17_14__LIMBS_less_than #define _LIMBS_reduce_once _ring_core_0_17_14__LIMBS_reduce_once #define _LIMBS_select_512_32 _ring_core_0_17_14__LIMBS_select_512_32 #define _LIMBS_shl_mod _ring_core_0_17_14__LIMBS_shl_mod #define _LIMBS_sub_mod _ring_core_0_17_14__LIMBS_sub_mod #define _LIMBS_window5_split_window _ring_core_0_17_14__LIMBS_window5_split_window #define _LIMBS_window5_unsplit_window _ring_core_0_17_14__LIMBS_window5_unsplit_window #define _LIMB_shr _ring_core_0_17_14__LIMB_shr #define _OPENSSL_cpuid_setup _ring_core_0_17_14__OPENSSL_cpuid_setup #define _aes_gcm_dec_kernel _ring_core_0_17_14__aes_gcm_dec_kernel #define _aes_gcm_dec_update_vaes_avx2 _ring_core_0_17_14__aes_gcm_dec_update_vaes_avx2 #define _aes_gcm_enc_kernel _ring_core_0_17_14__aes_gcm_enc_kernel #define _aes_gcm_enc_update_vaes_avx2 _ring_core_0_17_14__aes_gcm_enc_update_vaes_avx2 #define _aes_hw_ctr32_encrypt_blocks _ring_core_0_17_14__aes_hw_ctr32_encrypt_blocks #define _aes_hw_set_encrypt_key _ring_core_0_17_14__aes_hw_set_encrypt_key #define _aes_hw_set_encrypt_key_alt _ring_core_0_17_14__aes_hw_set_encrypt_key_alt #define _aes_hw_set_encrypt_key_base _ring_core_0_17_14__aes_hw_set_encrypt_key_base #define _aes_nohw_ctr32_encrypt_blocks _ring_core_0_17_14__aes_nohw_ctr32_encrypt_blocks #define _aes_nohw_encrypt _ring_core_0_17_14__aes_nohw_encrypt #define _aes_nohw_set_encrypt_key _ring_core_0_17_14__aes_nohw_set_encrypt_key #define _aesni_gcm_decrypt _ring_core_0_17_14__aesni_gcm_decrypt #define _aesni_gcm_encrypt _ring_core_0_17_14__aesni_gcm_encrypt #define _bn_from_montgomery_in_place _ring_core_0_17_14__bn_from_montgomery_in_place #define _bn_gather5 _ring_core_0_17_14__bn_gather5 #define _bn_mul_mont _ring_core_0_17_14__bn_mul_mont #define _bn_mul_mont_nohw _ring_core_0_17_14__bn_mul_mont_nohw #define _bn_mul4x_mont _ring_core_0_17_14__bn_mul4x_mont #define _bn_mulx4x_mont _ring_core_0_17_14__bn_mulx4x_mont #define _bn_mul8x_mont_neon _ring_core_0_17_14__bn_mul8x_mont_neon #define _bn_mul4x_mont_gather5 _ring_core_0_17_14__bn_mul4x_mont_gather5 #define _bn_mulx4x_mont_gather5 _ring_core_0_17_14__bn_mulx4x_mont_gather5 #define _bn_neg_inv_mod_r_u64 _ring_core_0_17_14__bn_neg_inv_mod_r_u64 #define _bn_power5_nohw _ring_core_0_17_14__bn_power5_nohw #define _bn_powerx5 _ring_core_0_17_14__bn_powerx5 #define _bn_scatter5 _ring_core_0_17_14__bn_scatter5 #define _bn_sqr8x_internal _ring_core_0_17_14__bn_sqr8x_internal #define _bn_sqr8x_mont _ring_core_0_17_14__bn_sqr8x_mont #define _bn_sqrx8x_internal _ring_core_0_17_14__bn_sqrx8x_internal #define _bsaes_ctr32_encrypt_blocks _ring_core_0_17_14__bsaes_ctr32_encrypt_blocks #define _bssl_constant_time_test_conditional_memcpy _ring_core_0_17_14__bssl_constant_time_test_conditional_memcpy #define _bssl_constant_time_test_conditional_memxor _ring_core_0_17_14__bssl_constant_time_test_conditional_memxor #define _bssl_constant_time_test_main _ring_core_0_17_14__bssl_constant_time_test_main #define _chacha20_poly1305_open _ring_core_0_17_14__chacha20_poly1305_open #define _chacha20_poly1305_open_avx2 _ring_core_0_17_14__chacha20_poly1305_open_avx2 #define _chacha20_poly1305_open_sse41 _ring_core_0_17_14__chacha20_poly1305_open_sse41 #define _chacha20_poly1305_seal _ring_core_0_17_14__chacha20_poly1305_seal #define _chacha20_poly1305_seal_avx2 _ring_core_0_17_14__chacha20_poly1305_seal_avx2 
#define _chacha20_poly1305_seal_sse41 _ring_core_0_17_14__chacha20_poly1305_seal_sse41 #define _ecp_nistz256_mul_mont_adx _ring_core_0_17_14__ecp_nistz256_mul_mont_adx #define _ecp_nistz256_mul_mont_nohw _ring_core_0_17_14__ecp_nistz256_mul_mont_nohw #define _ecp_nistz256_ord_mul_mont_adx _ring_core_0_17_14__ecp_nistz256_ord_mul_mont_adx #define _ecp_nistz256_ord_mul_mont_nohw _ring_core_0_17_14__ecp_nistz256_ord_mul_mont_nohw #define _ecp_nistz256_ord_sqr_mont_adx _ring_core_0_17_14__ecp_nistz256_ord_sqr_mont_adx #define _ecp_nistz256_ord_sqr_mont_nohw _ring_core_0_17_14__ecp_nistz256_ord_sqr_mont_nohw #define _ecp_nistz256_point_add_adx _ring_core_0_17_14__ecp_nistz256_point_add_adx #define _ecp_nistz256_point_add_nohw _ring_core_0_17_14__ecp_nistz256_point_add_nohw #define _ecp_nistz256_point_add_affine_adx _ring_core_0_17_14__ecp_nistz256_point_add_affine_adx #define _ecp_nistz256_point_add_affine_nohw _ring_core_0_17_14__ecp_nistz256_point_add_affine_nohw #define _ecp_nistz256_point_double_adx _ring_core_0_17_14__ecp_nistz256_point_double_adx #define _ecp_nistz256_point_double_nohw _ring_core_0_17_14__ecp_nistz256_point_double_nohw #define _ecp_nistz256_select_w5_avx2 _ring_core_0_17_14__ecp_nistz256_select_w5_avx2 #define _ecp_nistz256_select_w5_nohw _ring_core_0_17_14__ecp_nistz256_select_w5_nohw #define _ecp_nistz256_select_w7_avx2 _ring_core_0_17_14__ecp_nistz256_select_w7_avx2 #define _ecp_nistz256_select_w7_nohw _ring_core_0_17_14__ecp_nistz256_select_w7_nohw #define _ecp_nistz256_sqr_mont_adx _ring_core_0_17_14__ecp_nistz256_sqr_mont_adx #define _ecp_nistz256_sqr_mont_nohw _ring_core_0_17_14__ecp_nistz256_sqr_mont_nohw #define _fiat_curve25519_adx_mul _ring_core_0_17_14__fiat_curve25519_adx_mul #define _fiat_curve25519_adx_square _ring_core_0_17_14__fiat_curve25519_adx_square #define _gcm_ghash_avx _ring_core_0_17_14__gcm_ghash_avx #define _gcm_ghash_clmul _ring_core_0_17_14__gcm_ghash_clmul #define _gcm_ghash_neon _ring_core_0_17_14__gcm_ghash_neon #define _gcm_ghash_vpclmulqdq_avx2_1 _ring_core_0_17_14__gcm_ghash_vpclmulqdq_avx2_1 #define _gcm_gmult_clmul _ring_core_0_17_14__gcm_gmult_clmul #define _gcm_gmult_neon _ring_core_0_17_14__gcm_gmult_neon #define _gcm_init_avx _ring_core_0_17_14__gcm_init_avx #define _gcm_init_clmul _ring_core_0_17_14__gcm_init_clmul #define _gcm_init_neon _ring_core_0_17_14__gcm_init_neon #define _gcm_init_vpclmulqdq_avx2 _ring_core_0_17_14__gcm_init_vpclmulqdq_avx2 #define _k25519Precomp _ring_core_0_17_14__k25519Precomp #define _limbs_mul_add_limb _ring_core_0_17_14__limbs_mul_add_limb #define _little_endian_bytes_from_scalar _ring_core_0_17_14__little_endian_bytes_from_scalar #define _ecp_nistz256_neg _ring_core_0_17_14__ecp_nistz256_neg #define _ecp_nistz256_select_w5 _ring_core_0_17_14__ecp_nistz256_select_w5 #define _ecp_nistz256_select_w7 _ring_core_0_17_14__ecp_nistz256_select_w7 #define _neon_available _ring_core_0_17_14__neon_available #define _p256_mul_mont _ring_core_0_17_14__p256_mul_mont #define _p256_point_add _ring_core_0_17_14__p256_point_add #define _p256_point_add_affine _ring_core_0_17_14__p256_point_add_affine #define _p256_point_double _ring_core_0_17_14__p256_point_double #define _p256_point_mul _ring_core_0_17_14__p256_point_mul #define _p256_point_mul_base _ring_core_0_17_14__p256_point_mul_base #define _p256_point_mul_base_vartime _ring_core_0_17_14__p256_point_mul_base_vartime #define _p256_scalar_mul_mont _ring_core_0_17_14__p256_scalar_mul_mont #define _p256_scalar_sqr_rep_mont 
_ring_core_0_17_14__p256_scalar_sqr_rep_mont #define _p256_sqr_mont _ring_core_0_17_14__p256_sqr_mont #define _p384_elem_div_by_2 _ring_core_0_17_14__p384_elem_div_by_2 #define _p384_elem_mul_mont _ring_core_0_17_14__p384_elem_mul_mont #define _p384_elem_neg _ring_core_0_17_14__p384_elem_neg #define _p384_elem_sub _ring_core_0_17_14__p384_elem_sub #define _p384_point_add _ring_core_0_17_14__p384_point_add #define _p384_point_double _ring_core_0_17_14__p384_point_double #define _p384_point_mul _ring_core_0_17_14__p384_point_mul #define _p384_scalar_mul_mont _ring_core_0_17_14__p384_scalar_mul_mont #define _openssl_poly1305_neon2_addmulmod _ring_core_0_17_14__openssl_poly1305_neon2_addmulmod #define _openssl_poly1305_neon2_blocks _ring_core_0_17_14__openssl_poly1305_neon2_blocks #define _sha256_block_data_order _ring_core_0_17_14__sha256_block_data_order #define _sha256_block_data_order_avx _ring_core_0_17_14__sha256_block_data_order_avx #define _sha256_block_data_order_ssse3 _ring_core_0_17_14__sha256_block_data_order_ssse3 #define _sha256_block_data_order_hw _ring_core_0_17_14__sha256_block_data_order_hw #define _sha256_block_data_order_neon _ring_core_0_17_14__sha256_block_data_order_neon #define _sha256_block_data_order_nohw _ring_core_0_17_14__sha256_block_data_order_nohw #define _sha512_block_data_order _ring_core_0_17_14__sha512_block_data_order #define _sha512_block_data_order_avx _ring_core_0_17_14__sha512_block_data_order_avx #define _sha512_block_data_order_hw _ring_core_0_17_14__sha512_block_data_order_hw #define _sha512_block_data_order_neon _ring_core_0_17_14__sha512_block_data_order_neon #define _sha512_block_data_order_nohw _ring_core_0_17_14__sha512_block_data_order_nohw #define _vpaes_ctr32_encrypt_blocks _ring_core_0_17_14__vpaes_ctr32_encrypt_blocks #define _vpaes_encrypt _ring_core_0_17_14__vpaes_encrypt #define _vpaes_encrypt_key_to_bsaes _ring_core_0_17_14__vpaes_encrypt_key_to_bsaes #define _vpaes_set_encrypt_key _ring_core_0_17_14__vpaes_set_encrypt_key #define _x25519_NEON _ring_core_0_17_14__x25519_NEON #define _x25519_fe_invert _ring_core_0_17_14__x25519_fe_invert #define _x25519_fe_isnegative _ring_core_0_17_14__x25519_fe_isnegative #define _x25519_fe_mul_ttt _ring_core_0_17_14__x25519_fe_mul_ttt #define _x25519_fe_neg _ring_core_0_17_14__x25519_fe_neg #define _x25519_fe_tobytes _ring_core_0_17_14__x25519_fe_tobytes #define _x25519_ge_double_scalarmult_vartime _ring_core_0_17_14__x25519_ge_double_scalarmult_vartime #define _x25519_ge_frombytes_vartime _ring_core_0_17_14__x25519_ge_frombytes_vartime #define _x25519_ge_scalarmult_base _ring_core_0_17_14__x25519_ge_scalarmult_base #define _x25519_ge_scalarmult_base_adx _ring_core_0_17_14__x25519_ge_scalarmult_base_adx #define _x25519_public_from_private_generic_masked _ring_core_0_17_14__x25519_public_from_private_generic_masked #define _x25519_sc_mask _ring_core_0_17_14__x25519_sc_mask #define _x25519_sc_muladd _ring_core_0_17_14__x25519_sc_muladd #define _x25519_sc_reduce _ring_core_0_17_14__x25519_sc_reduce #define _x25519_scalar_mult_adx _ring_core_0_17_14__x25519_scalar_mult_adx #define _x25519_scalar_mult_generic_masked _ring_core_0_17_14__x25519_scalar_mult_generic_masked #else #define ecp_nistz256_point_double p256_point_double #define ecp_nistz256_point_add p256_point_add #define ecp_nistz256_point_add_affine p256_point_add_affine #define ecp_nistz256_ord_mul_mont p256_scalar_mul_mont #define ecp_nistz256_ord_sqr_mont p256_scalar_sqr_rep_mont #define ecp_nistz256_mul_mont p256_mul_mont #define 
ecp_nistz256_sqr_mont p256_sqr_mont #define adx_bmi2_available ring_core_0_17_14__adx_bmi2_available #define avx2_available ring_core_0_17_14__avx2_available #define CRYPTO_memcmp ring_core_0_17_14__CRYPTO_memcmp #define CRYPTO_poly1305_finish ring_core_0_17_14__CRYPTO_poly1305_finish #define CRYPTO_poly1305_finish_neon ring_core_0_17_14__CRYPTO_poly1305_finish_neon #define CRYPTO_poly1305_init ring_core_0_17_14__CRYPTO_poly1305_init #define CRYPTO_poly1305_init_neon ring_core_0_17_14__CRYPTO_poly1305_init_neon #define CRYPTO_poly1305_update ring_core_0_17_14__CRYPTO_poly1305_update #define CRYPTO_poly1305_update_neon ring_core_0_17_14__CRYPTO_poly1305_update_neon #define ChaCha20_ctr32 ring_core_0_17_14__ChaCha20_ctr32 #define ChaCha20_ctr32_avx2 ring_core_0_17_14__ChaCha20_ctr32_avx2 #define ChaCha20_ctr32_neon ring_core_0_17_14__ChaCha20_ctr32_neon #define ChaCha20_ctr32_nohw ring_core_0_17_14__ChaCha20_ctr32_nohw #define ChaCha20_ctr32_ssse3 ring_core_0_17_14__ChaCha20_ctr32_ssse3 #define ChaCha20_ctr32_ssse3_4x ring_core_0_17_14__ChaCha20_ctr32_ssse3_4x #define LIMB_is_zero ring_core_0_17_14__LIMB_is_zero #define LIMBS_add_mod ring_core_0_17_14__LIMBS_add_mod #define LIMBS_are_zero ring_core_0_17_14__LIMBS_are_zero #define LIMBS_equal ring_core_0_17_14__LIMBS_equal #define LIMBS_less_than ring_core_0_17_14__LIMBS_less_than #define LIMBS_reduce_once ring_core_0_17_14__LIMBS_reduce_once #define LIMBS_select_512_32 ring_core_0_17_14__LIMBS_select_512_32 #define LIMBS_shl_mod ring_core_0_17_14__LIMBS_shl_mod #define LIMBS_sub_mod ring_core_0_17_14__LIMBS_sub_mod #define LIMBS_window5_split_window ring_core_0_17_14__LIMBS_window5_split_window #define LIMBS_window5_unsplit_window ring_core_0_17_14__LIMBS_window5_unsplit_window #define LIMB_shr ring_core_0_17_14__LIMB_shr #define OPENSSL_cpuid_setup ring_core_0_17_14__OPENSSL_cpuid_setup #define aes_gcm_dec_kernel ring_core_0_17_14__aes_gcm_dec_kernel #define aes_gcm_dec_update_vaes_avx2 ring_core_0_17_14__aes_gcm_dec_update_vaes_avx2 #define aes_gcm_enc_kernel ring_core_0_17_14__aes_gcm_enc_kernel #define aes_gcm_enc_update_vaes_avx2 ring_core_0_17_14__aes_gcm_enc_update_vaes_avx2 #define aes_hw_ctr32_encrypt_blocks ring_core_0_17_14__aes_hw_ctr32_encrypt_blocks #define aes_hw_set_encrypt_key ring_core_0_17_14__aes_hw_set_encrypt_key #define aes_hw_set_encrypt_key_alt ring_core_0_17_14__aes_hw_set_encrypt_key_alt #define aes_hw_set_encrypt_key_base ring_core_0_17_14__aes_hw_set_encrypt_key_base #define aes_nohw_ctr32_encrypt_blocks ring_core_0_17_14__aes_nohw_ctr32_encrypt_blocks #define aes_nohw_encrypt ring_core_0_17_14__aes_nohw_encrypt #define aes_nohw_set_encrypt_key ring_core_0_17_14__aes_nohw_set_encrypt_key #define aesni_gcm_decrypt ring_core_0_17_14__aesni_gcm_decrypt #define aesni_gcm_encrypt ring_core_0_17_14__aesni_gcm_encrypt #define bn_from_montgomery_in_place ring_core_0_17_14__bn_from_montgomery_in_place #define bn_gather5 ring_core_0_17_14__bn_gather5 #define bn_mul_mont ring_core_0_17_14__bn_mul_mont #define bn_mul_mont_nohw ring_core_0_17_14__bn_mul_mont_nohw #define bn_mul4x_mont ring_core_0_17_14__bn_mul4x_mont #define bn_mulx4x_mont ring_core_0_17_14__bn_mulx4x_mont #define bn_mul8x_mont_neon ring_core_0_17_14__bn_mul8x_mont_neon #define bn_mul4x_mont_gather5 ring_core_0_17_14__bn_mul4x_mont_gather5 #define bn_mulx4x_mont_gather5 ring_core_0_17_14__bn_mulx4x_mont_gather5 #define bn_neg_inv_mod_r_u64 ring_core_0_17_14__bn_neg_inv_mod_r_u64 #define bn_power5_nohw ring_core_0_17_14__bn_power5_nohw #define bn_powerx5 
ring_core_0_17_14__bn_powerx5 #define bn_scatter5 ring_core_0_17_14__bn_scatter5 #define bn_sqr8x_internal ring_core_0_17_14__bn_sqr8x_internal #define bn_sqr8x_mont ring_core_0_17_14__bn_sqr8x_mont #define bn_sqrx8x_internal ring_core_0_17_14__bn_sqrx8x_internal #define bsaes_ctr32_encrypt_blocks ring_core_0_17_14__bsaes_ctr32_encrypt_blocks #define bssl_constant_time_test_conditional_memcpy ring_core_0_17_14__bssl_constant_time_test_conditional_memcpy #define bssl_constant_time_test_conditional_memxor ring_core_0_17_14__bssl_constant_time_test_conditional_memxor #define bssl_constant_time_test_main ring_core_0_17_14__bssl_constant_time_test_main #define chacha20_poly1305_open ring_core_0_17_14__chacha20_poly1305_open #define chacha20_poly1305_open_avx2 ring_core_0_17_14__chacha20_poly1305_open_avx2 #define chacha20_poly1305_open_sse41 ring_core_0_17_14__chacha20_poly1305_open_sse41 #define chacha20_poly1305_seal ring_core_0_17_14__chacha20_poly1305_seal #define chacha20_poly1305_seal_avx2 ring_core_0_17_14__chacha20_poly1305_seal_avx2 #define chacha20_poly1305_seal_sse41 ring_core_0_17_14__chacha20_poly1305_seal_sse41 #define ecp_nistz256_mul_mont_adx ring_core_0_17_14__ecp_nistz256_mul_mont_adx #define ecp_nistz256_mul_mont_nohw ring_core_0_17_14__ecp_nistz256_mul_mont_nohw #define ecp_nistz256_ord_mul_mont_adx ring_core_0_17_14__ecp_nistz256_ord_mul_mont_adx #define ecp_nistz256_ord_mul_mont_nohw ring_core_0_17_14__ecp_nistz256_ord_mul_mont_nohw #define ecp_nistz256_ord_sqr_mont_adx ring_core_0_17_14__ecp_nistz256_ord_sqr_mont_adx #define ecp_nistz256_ord_sqr_mont_nohw ring_core_0_17_14__ecp_nistz256_ord_sqr_mont_nohw #define ecp_nistz256_point_add_adx ring_core_0_17_14__ecp_nistz256_point_add_adx #define ecp_nistz256_point_add_nohw ring_core_0_17_14__ecp_nistz256_point_add_nohw #define ecp_nistz256_point_add_affine_adx ring_core_0_17_14__ecp_nistz256_point_add_affine_adx #define ecp_nistz256_point_add_affine_nohw ring_core_0_17_14__ecp_nistz256_point_add_affine_nohw #define ecp_nistz256_point_double_adx ring_core_0_17_14__ecp_nistz256_point_double_adx #define ecp_nistz256_point_double_nohw ring_core_0_17_14__ecp_nistz256_point_double_nohw #define ecp_nistz256_select_w5_avx2 ring_core_0_17_14__ecp_nistz256_select_w5_avx2 #define ecp_nistz256_select_w5_nohw ring_core_0_17_14__ecp_nistz256_select_w5_nohw #define ecp_nistz256_select_w7_avx2 ring_core_0_17_14__ecp_nistz256_select_w7_avx2 #define ecp_nistz256_select_w7_nohw ring_core_0_17_14__ecp_nistz256_select_w7_nohw #define ecp_nistz256_sqr_mont_adx ring_core_0_17_14__ecp_nistz256_sqr_mont_adx #define ecp_nistz256_sqr_mont_nohw ring_core_0_17_14__ecp_nistz256_sqr_mont_nohw #define fiat_curve25519_adx_mul ring_core_0_17_14__fiat_curve25519_adx_mul #define fiat_curve25519_adx_square ring_core_0_17_14__fiat_curve25519_adx_square #define gcm_ghash_avx ring_core_0_17_14__gcm_ghash_avx #define gcm_ghash_clmul ring_core_0_17_14__gcm_ghash_clmul #define gcm_ghash_neon ring_core_0_17_14__gcm_ghash_neon #define gcm_ghash_vpclmulqdq_avx2_1 ring_core_0_17_14__gcm_ghash_vpclmulqdq_avx2_1 #define gcm_gmult_clmul ring_core_0_17_14__gcm_gmult_clmul #define gcm_gmult_neon ring_core_0_17_14__gcm_gmult_neon #define gcm_init_avx ring_core_0_17_14__gcm_init_avx #define gcm_init_clmul ring_core_0_17_14__gcm_init_clmul #define gcm_init_neon ring_core_0_17_14__gcm_init_neon #define gcm_init_vpclmulqdq_avx2 ring_core_0_17_14__gcm_init_vpclmulqdq_avx2 #define k25519Precomp ring_core_0_17_14__k25519Precomp #define limbs_mul_add_limb 
ring_core_0_17_14__limbs_mul_add_limb #define little_endian_bytes_from_scalar ring_core_0_17_14__little_endian_bytes_from_scalar #define ecp_nistz256_neg ring_core_0_17_14__ecp_nistz256_neg #define ecp_nistz256_select_w5 ring_core_0_17_14__ecp_nistz256_select_w5 #define ecp_nistz256_select_w7 ring_core_0_17_14__ecp_nistz256_select_w7 #define neon_available ring_core_0_17_14__neon_available #define p256_mul_mont ring_core_0_17_14__p256_mul_mont #define p256_point_add ring_core_0_17_14__p256_point_add #define p256_point_add_affine ring_core_0_17_14__p256_point_add_affine #define p256_point_double ring_core_0_17_14__p256_point_double #define p256_point_mul ring_core_0_17_14__p256_point_mul #define p256_point_mul_base ring_core_0_17_14__p256_point_mul_base #define p256_point_mul_base_vartime ring_core_0_17_14__p256_point_mul_base_vartime #define p256_scalar_mul_mont ring_core_0_17_14__p256_scalar_mul_mont #define p256_scalar_sqr_rep_mont ring_core_0_17_14__p256_scalar_sqr_rep_mont #define p256_sqr_mont ring_core_0_17_14__p256_sqr_mont #define p384_elem_div_by_2 ring_core_0_17_14__p384_elem_div_by_2 #define p384_elem_mul_mont ring_core_0_17_14__p384_elem_mul_mont #define p384_elem_neg ring_core_0_17_14__p384_elem_neg #define p384_elem_sub ring_core_0_17_14__p384_elem_sub #define p384_point_add ring_core_0_17_14__p384_point_add #define p384_point_double ring_core_0_17_14__p384_point_double #define p384_point_mul ring_core_0_17_14__p384_point_mul #define p384_scalar_mul_mont ring_core_0_17_14__p384_scalar_mul_mont #define openssl_poly1305_neon2_addmulmod ring_core_0_17_14__openssl_poly1305_neon2_addmulmod #define openssl_poly1305_neon2_blocks ring_core_0_17_14__openssl_poly1305_neon2_blocks #define sha256_block_data_order ring_core_0_17_14__sha256_block_data_order #define sha256_block_data_order_avx ring_core_0_17_14__sha256_block_data_order_avx #define sha256_block_data_order_ssse3 ring_core_0_17_14__sha256_block_data_order_ssse3 #define sha256_block_data_order_hw ring_core_0_17_14__sha256_block_data_order_hw #define sha256_block_data_order_neon ring_core_0_17_14__sha256_block_data_order_neon #define sha256_block_data_order_nohw ring_core_0_17_14__sha256_block_data_order_nohw #define sha512_block_data_order ring_core_0_17_14__sha512_block_data_order #define sha512_block_data_order_avx ring_core_0_17_14__sha512_block_data_order_avx #define sha512_block_data_order_hw ring_core_0_17_14__sha512_block_data_order_hw #define sha512_block_data_order_neon ring_core_0_17_14__sha512_block_data_order_neon #define sha512_block_data_order_nohw ring_core_0_17_14__sha512_block_data_order_nohw #define vpaes_ctr32_encrypt_blocks ring_core_0_17_14__vpaes_ctr32_encrypt_blocks #define vpaes_encrypt ring_core_0_17_14__vpaes_encrypt #define vpaes_encrypt_key_to_bsaes ring_core_0_17_14__vpaes_encrypt_key_to_bsaes #define vpaes_set_encrypt_key ring_core_0_17_14__vpaes_set_encrypt_key #define x25519_NEON ring_core_0_17_14__x25519_NEON #define x25519_fe_invert ring_core_0_17_14__x25519_fe_invert #define x25519_fe_isnegative ring_core_0_17_14__x25519_fe_isnegative #define x25519_fe_mul_ttt ring_core_0_17_14__x25519_fe_mul_ttt #define x25519_fe_neg ring_core_0_17_14__x25519_fe_neg #define x25519_fe_tobytes ring_core_0_17_14__x25519_fe_tobytes #define x25519_ge_double_scalarmult_vartime ring_core_0_17_14__x25519_ge_double_scalarmult_vartime #define x25519_ge_frombytes_vartime ring_core_0_17_14__x25519_ge_frombytes_vartime #define x25519_ge_scalarmult_base ring_core_0_17_14__x25519_ge_scalarmult_base #define 
x25519_ge_scalarmult_base_adx ring_core_0_17_14__x25519_ge_scalarmult_base_adx #define x25519_public_from_private_generic_masked ring_core_0_17_14__x25519_public_from_private_generic_masked #define x25519_sc_mask ring_core_0_17_14__x25519_sc_mask #define x25519_sc_muladd ring_core_0_17_14__x25519_sc_muladd #define x25519_sc_reduce ring_core_0_17_14__x25519_sc_reduce #define x25519_scalar_mult_adx ring_core_0_17_14__x25519_scalar_mult_adx #define x25519_scalar_mult_generic_masked ring_core_0_17_14__x25519_scalar_mult_generic_masked #endif #endif ring-0.17.14/pregenerated/ring_core_generated/prefix_symbols_nasm.inc000064400000000000000000000556011046102023000241540ustar 00000000000000 %ifndef ring_core_generated_PREFIX_SYMBOLS_NASM_INC %define ring_core_generated_PREFIX_SYMBOLS_NASM_INC %ifidn __OUTPUT_FORMAT__,win32 %define _ecp_nistz256_point_double _p256_point_double %define _ecp_nistz256_point_add _p256_point_add %define _ecp_nistz256_point_add_affine _p256_point_add_affine %define _ecp_nistz256_ord_mul_mont _p256_scalar_mul_mont %define _ecp_nistz256_ord_sqr_mont _p256_scalar_sqr_rep_mont %define _ecp_nistz256_mul_mont _p256_mul_mont %define _ecp_nistz256_sqr_mont _p256_sqr_mont %define _adx_bmi2_available _ring_core_0_17_14__adx_bmi2_available %define _avx2_available _ring_core_0_17_14__avx2_available %define _CRYPTO_memcmp _ring_core_0_17_14__CRYPTO_memcmp %define _CRYPTO_poly1305_finish _ring_core_0_17_14__CRYPTO_poly1305_finish %define _CRYPTO_poly1305_finish_neon _ring_core_0_17_14__CRYPTO_poly1305_finish_neon %define _CRYPTO_poly1305_init _ring_core_0_17_14__CRYPTO_poly1305_init %define _CRYPTO_poly1305_init_neon _ring_core_0_17_14__CRYPTO_poly1305_init_neon %define _CRYPTO_poly1305_update _ring_core_0_17_14__CRYPTO_poly1305_update %define _CRYPTO_poly1305_update_neon _ring_core_0_17_14__CRYPTO_poly1305_update_neon %define _ChaCha20_ctr32 _ring_core_0_17_14__ChaCha20_ctr32 %define _ChaCha20_ctr32_avx2 _ring_core_0_17_14__ChaCha20_ctr32_avx2 %define _ChaCha20_ctr32_neon _ring_core_0_17_14__ChaCha20_ctr32_neon %define _ChaCha20_ctr32_nohw _ring_core_0_17_14__ChaCha20_ctr32_nohw %define _ChaCha20_ctr32_ssse3 _ring_core_0_17_14__ChaCha20_ctr32_ssse3 %define _ChaCha20_ctr32_ssse3_4x _ring_core_0_17_14__ChaCha20_ctr32_ssse3_4x %define _LIMB_is_zero _ring_core_0_17_14__LIMB_is_zero %define _LIMBS_add_mod _ring_core_0_17_14__LIMBS_add_mod %define _LIMBS_are_zero _ring_core_0_17_14__LIMBS_are_zero %define _LIMBS_equal _ring_core_0_17_14__LIMBS_equal %define _LIMBS_less_than _ring_core_0_17_14__LIMBS_less_than %define _LIMBS_reduce_once _ring_core_0_17_14__LIMBS_reduce_once %define _LIMBS_select_512_32 _ring_core_0_17_14__LIMBS_select_512_32 %define _LIMBS_shl_mod _ring_core_0_17_14__LIMBS_shl_mod %define _LIMBS_sub_mod _ring_core_0_17_14__LIMBS_sub_mod %define _LIMBS_window5_split_window _ring_core_0_17_14__LIMBS_window5_split_window %define _LIMBS_window5_unsplit_window _ring_core_0_17_14__LIMBS_window5_unsplit_window %define _LIMB_shr _ring_core_0_17_14__LIMB_shr %define _OPENSSL_cpuid_setup _ring_core_0_17_14__OPENSSL_cpuid_setup %define _aes_gcm_dec_kernel _ring_core_0_17_14__aes_gcm_dec_kernel %define _aes_gcm_dec_update_vaes_avx2 _ring_core_0_17_14__aes_gcm_dec_update_vaes_avx2 %define _aes_gcm_enc_kernel _ring_core_0_17_14__aes_gcm_enc_kernel %define _aes_gcm_enc_update_vaes_avx2 _ring_core_0_17_14__aes_gcm_enc_update_vaes_avx2 %define _aes_hw_ctr32_encrypt_blocks _ring_core_0_17_14__aes_hw_ctr32_encrypt_blocks %define _aes_hw_set_encrypt_key 
_ring_core_0_17_14__aes_hw_set_encrypt_key %define _aes_hw_set_encrypt_key_alt _ring_core_0_17_14__aes_hw_set_encrypt_key_alt %define _aes_hw_set_encrypt_key_base _ring_core_0_17_14__aes_hw_set_encrypt_key_base %define _aes_nohw_ctr32_encrypt_blocks _ring_core_0_17_14__aes_nohw_ctr32_encrypt_blocks %define _aes_nohw_encrypt _ring_core_0_17_14__aes_nohw_encrypt %define _aes_nohw_set_encrypt_key _ring_core_0_17_14__aes_nohw_set_encrypt_key %define _aesni_gcm_decrypt _ring_core_0_17_14__aesni_gcm_decrypt %define _aesni_gcm_encrypt _ring_core_0_17_14__aesni_gcm_encrypt %define _bn_from_montgomery_in_place _ring_core_0_17_14__bn_from_montgomery_in_place %define _bn_gather5 _ring_core_0_17_14__bn_gather5 %define _bn_mul_mont _ring_core_0_17_14__bn_mul_mont %define _bn_mul_mont_nohw _ring_core_0_17_14__bn_mul_mont_nohw %define _bn_mul4x_mont _ring_core_0_17_14__bn_mul4x_mont %define _bn_mulx4x_mont _ring_core_0_17_14__bn_mulx4x_mont %define _bn_mul8x_mont_neon _ring_core_0_17_14__bn_mul8x_mont_neon %define _bn_mul4x_mont_gather5 _ring_core_0_17_14__bn_mul4x_mont_gather5 %define _bn_mulx4x_mont_gather5 _ring_core_0_17_14__bn_mulx4x_mont_gather5 %define _bn_neg_inv_mod_r_u64 _ring_core_0_17_14__bn_neg_inv_mod_r_u64 %define _bn_power5_nohw _ring_core_0_17_14__bn_power5_nohw %define _bn_powerx5 _ring_core_0_17_14__bn_powerx5 %define _bn_scatter5 _ring_core_0_17_14__bn_scatter5 %define _bn_sqr8x_internal _ring_core_0_17_14__bn_sqr8x_internal %define _bn_sqr8x_mont _ring_core_0_17_14__bn_sqr8x_mont %define _bn_sqrx8x_internal _ring_core_0_17_14__bn_sqrx8x_internal %define _bsaes_ctr32_encrypt_blocks _ring_core_0_17_14__bsaes_ctr32_encrypt_blocks %define _bssl_constant_time_test_conditional_memcpy _ring_core_0_17_14__bssl_constant_time_test_conditional_memcpy %define _bssl_constant_time_test_conditional_memxor _ring_core_0_17_14__bssl_constant_time_test_conditional_memxor %define _bssl_constant_time_test_main _ring_core_0_17_14__bssl_constant_time_test_main %define _chacha20_poly1305_open _ring_core_0_17_14__chacha20_poly1305_open %define _chacha20_poly1305_open_avx2 _ring_core_0_17_14__chacha20_poly1305_open_avx2 %define _chacha20_poly1305_open_sse41 _ring_core_0_17_14__chacha20_poly1305_open_sse41 %define _chacha20_poly1305_seal _ring_core_0_17_14__chacha20_poly1305_seal %define _chacha20_poly1305_seal_avx2 _ring_core_0_17_14__chacha20_poly1305_seal_avx2 %define _chacha20_poly1305_seal_sse41 _ring_core_0_17_14__chacha20_poly1305_seal_sse41 %define _ecp_nistz256_mul_mont_adx _ring_core_0_17_14__ecp_nistz256_mul_mont_adx %define _ecp_nistz256_mul_mont_nohw _ring_core_0_17_14__ecp_nistz256_mul_mont_nohw %define _ecp_nistz256_ord_mul_mont_adx _ring_core_0_17_14__ecp_nistz256_ord_mul_mont_adx %define _ecp_nistz256_ord_mul_mont_nohw _ring_core_0_17_14__ecp_nistz256_ord_mul_mont_nohw %define _ecp_nistz256_ord_sqr_mont_adx _ring_core_0_17_14__ecp_nistz256_ord_sqr_mont_adx %define _ecp_nistz256_ord_sqr_mont_nohw _ring_core_0_17_14__ecp_nistz256_ord_sqr_mont_nohw %define _ecp_nistz256_point_add_adx _ring_core_0_17_14__ecp_nistz256_point_add_adx %define _ecp_nistz256_point_add_nohw _ring_core_0_17_14__ecp_nistz256_point_add_nohw %define _ecp_nistz256_point_add_affine_adx _ring_core_0_17_14__ecp_nistz256_point_add_affine_adx %define _ecp_nistz256_point_add_affine_nohw _ring_core_0_17_14__ecp_nistz256_point_add_affine_nohw %define _ecp_nistz256_point_double_adx _ring_core_0_17_14__ecp_nistz256_point_double_adx %define _ecp_nistz256_point_double_nohw _ring_core_0_17_14__ecp_nistz256_point_double_nohw %define 
_ecp_nistz256_select_w5_avx2 _ring_core_0_17_14__ecp_nistz256_select_w5_avx2 %define _ecp_nistz256_select_w5_nohw _ring_core_0_17_14__ecp_nistz256_select_w5_nohw %define _ecp_nistz256_select_w7_avx2 _ring_core_0_17_14__ecp_nistz256_select_w7_avx2 %define _ecp_nistz256_select_w7_nohw _ring_core_0_17_14__ecp_nistz256_select_w7_nohw %define _ecp_nistz256_sqr_mont_adx _ring_core_0_17_14__ecp_nistz256_sqr_mont_adx %define _ecp_nistz256_sqr_mont_nohw _ring_core_0_17_14__ecp_nistz256_sqr_mont_nohw %define _fiat_curve25519_adx_mul _ring_core_0_17_14__fiat_curve25519_adx_mul %define _fiat_curve25519_adx_square _ring_core_0_17_14__fiat_curve25519_adx_square %define _gcm_ghash_avx _ring_core_0_17_14__gcm_ghash_avx %define _gcm_ghash_clmul _ring_core_0_17_14__gcm_ghash_clmul %define _gcm_ghash_neon _ring_core_0_17_14__gcm_ghash_neon %define _gcm_ghash_vpclmulqdq_avx2_1 _ring_core_0_17_14__gcm_ghash_vpclmulqdq_avx2_1 %define _gcm_gmult_clmul _ring_core_0_17_14__gcm_gmult_clmul %define _gcm_gmult_neon _ring_core_0_17_14__gcm_gmult_neon %define _gcm_init_avx _ring_core_0_17_14__gcm_init_avx %define _gcm_init_clmul _ring_core_0_17_14__gcm_init_clmul %define _gcm_init_neon _ring_core_0_17_14__gcm_init_neon %define _gcm_init_vpclmulqdq_avx2 _ring_core_0_17_14__gcm_init_vpclmulqdq_avx2 %define _k25519Precomp _ring_core_0_17_14__k25519Precomp %define _limbs_mul_add_limb _ring_core_0_17_14__limbs_mul_add_limb %define _little_endian_bytes_from_scalar _ring_core_0_17_14__little_endian_bytes_from_scalar %define _ecp_nistz256_neg _ring_core_0_17_14__ecp_nistz256_neg %define _ecp_nistz256_select_w5 _ring_core_0_17_14__ecp_nistz256_select_w5 %define _ecp_nistz256_select_w7 _ring_core_0_17_14__ecp_nistz256_select_w7 %define _neon_available _ring_core_0_17_14__neon_available %define _p256_mul_mont _ring_core_0_17_14__p256_mul_mont %define _p256_point_add _ring_core_0_17_14__p256_point_add %define _p256_point_add_affine _ring_core_0_17_14__p256_point_add_affine %define _p256_point_double _ring_core_0_17_14__p256_point_double %define _p256_point_mul _ring_core_0_17_14__p256_point_mul %define _p256_point_mul_base _ring_core_0_17_14__p256_point_mul_base %define _p256_point_mul_base_vartime _ring_core_0_17_14__p256_point_mul_base_vartime %define _p256_scalar_mul_mont _ring_core_0_17_14__p256_scalar_mul_mont %define _p256_scalar_sqr_rep_mont _ring_core_0_17_14__p256_scalar_sqr_rep_mont %define _p256_sqr_mont _ring_core_0_17_14__p256_sqr_mont %define _p384_elem_div_by_2 _ring_core_0_17_14__p384_elem_div_by_2 %define _p384_elem_mul_mont _ring_core_0_17_14__p384_elem_mul_mont %define _p384_elem_neg _ring_core_0_17_14__p384_elem_neg %define _p384_elem_sub _ring_core_0_17_14__p384_elem_sub %define _p384_point_add _ring_core_0_17_14__p384_point_add %define _p384_point_double _ring_core_0_17_14__p384_point_double %define _p384_point_mul _ring_core_0_17_14__p384_point_mul %define _p384_scalar_mul_mont _ring_core_0_17_14__p384_scalar_mul_mont %define _openssl_poly1305_neon2_addmulmod _ring_core_0_17_14__openssl_poly1305_neon2_addmulmod %define _openssl_poly1305_neon2_blocks _ring_core_0_17_14__openssl_poly1305_neon2_blocks %define _sha256_block_data_order _ring_core_0_17_14__sha256_block_data_order %define _sha256_block_data_order_avx _ring_core_0_17_14__sha256_block_data_order_avx %define _sha256_block_data_order_ssse3 _ring_core_0_17_14__sha256_block_data_order_ssse3 %define _sha256_block_data_order_hw _ring_core_0_17_14__sha256_block_data_order_hw %define _sha256_block_data_order_neon 
_ring_core_0_17_14__sha256_block_data_order_neon %define _sha256_block_data_order_nohw _ring_core_0_17_14__sha256_block_data_order_nohw %define _sha512_block_data_order _ring_core_0_17_14__sha512_block_data_order %define _sha512_block_data_order_avx _ring_core_0_17_14__sha512_block_data_order_avx %define _sha512_block_data_order_hw _ring_core_0_17_14__sha512_block_data_order_hw %define _sha512_block_data_order_neon _ring_core_0_17_14__sha512_block_data_order_neon %define _sha512_block_data_order_nohw _ring_core_0_17_14__sha512_block_data_order_nohw %define _vpaes_ctr32_encrypt_blocks _ring_core_0_17_14__vpaes_ctr32_encrypt_blocks %define _vpaes_encrypt _ring_core_0_17_14__vpaes_encrypt %define _vpaes_encrypt_key_to_bsaes _ring_core_0_17_14__vpaes_encrypt_key_to_bsaes %define _vpaes_set_encrypt_key _ring_core_0_17_14__vpaes_set_encrypt_key %define _x25519_NEON _ring_core_0_17_14__x25519_NEON %define _x25519_fe_invert _ring_core_0_17_14__x25519_fe_invert %define _x25519_fe_isnegative _ring_core_0_17_14__x25519_fe_isnegative %define _x25519_fe_mul_ttt _ring_core_0_17_14__x25519_fe_mul_ttt %define _x25519_fe_neg _ring_core_0_17_14__x25519_fe_neg %define _x25519_fe_tobytes _ring_core_0_17_14__x25519_fe_tobytes %define _x25519_ge_double_scalarmult_vartime _ring_core_0_17_14__x25519_ge_double_scalarmult_vartime %define _x25519_ge_frombytes_vartime _ring_core_0_17_14__x25519_ge_frombytes_vartime %define _x25519_ge_scalarmult_base _ring_core_0_17_14__x25519_ge_scalarmult_base %define _x25519_ge_scalarmult_base_adx _ring_core_0_17_14__x25519_ge_scalarmult_base_adx %define _x25519_public_from_private_generic_masked _ring_core_0_17_14__x25519_public_from_private_generic_masked %define _x25519_sc_mask _ring_core_0_17_14__x25519_sc_mask %define _x25519_sc_muladd _ring_core_0_17_14__x25519_sc_muladd %define _x25519_sc_reduce _ring_core_0_17_14__x25519_sc_reduce %define _x25519_scalar_mult_adx _ring_core_0_17_14__x25519_scalar_mult_adx %define _x25519_scalar_mult_generic_masked _ring_core_0_17_14__x25519_scalar_mult_generic_masked %else %define ecp_nistz256_point_double p256_point_double %define ecp_nistz256_point_add p256_point_add %define ecp_nistz256_point_add_affine p256_point_add_affine %define ecp_nistz256_ord_mul_mont p256_scalar_mul_mont %define ecp_nistz256_ord_sqr_mont p256_scalar_sqr_rep_mont %define ecp_nistz256_mul_mont p256_mul_mont %define ecp_nistz256_sqr_mont p256_sqr_mont %define adx_bmi2_available ring_core_0_17_14__adx_bmi2_available %define avx2_available ring_core_0_17_14__avx2_available %define CRYPTO_memcmp ring_core_0_17_14__CRYPTO_memcmp %define CRYPTO_poly1305_finish ring_core_0_17_14__CRYPTO_poly1305_finish %define CRYPTO_poly1305_finish_neon ring_core_0_17_14__CRYPTO_poly1305_finish_neon %define CRYPTO_poly1305_init ring_core_0_17_14__CRYPTO_poly1305_init %define CRYPTO_poly1305_init_neon ring_core_0_17_14__CRYPTO_poly1305_init_neon %define CRYPTO_poly1305_update ring_core_0_17_14__CRYPTO_poly1305_update %define CRYPTO_poly1305_update_neon ring_core_0_17_14__CRYPTO_poly1305_update_neon %define ChaCha20_ctr32 ring_core_0_17_14__ChaCha20_ctr32 %define ChaCha20_ctr32_avx2 ring_core_0_17_14__ChaCha20_ctr32_avx2 %define ChaCha20_ctr32_neon ring_core_0_17_14__ChaCha20_ctr32_neon %define ChaCha20_ctr32_nohw ring_core_0_17_14__ChaCha20_ctr32_nohw %define ChaCha20_ctr32_ssse3 ring_core_0_17_14__ChaCha20_ctr32_ssse3 %define ChaCha20_ctr32_ssse3_4x ring_core_0_17_14__ChaCha20_ctr32_ssse3_4x %define LIMB_is_zero ring_core_0_17_14__LIMB_is_zero %define LIMBS_add_mod 
ring_core_0_17_14__LIMBS_add_mod %define LIMBS_are_zero ring_core_0_17_14__LIMBS_are_zero %define LIMBS_equal ring_core_0_17_14__LIMBS_equal %define LIMBS_less_than ring_core_0_17_14__LIMBS_less_than %define LIMBS_reduce_once ring_core_0_17_14__LIMBS_reduce_once %define LIMBS_select_512_32 ring_core_0_17_14__LIMBS_select_512_32 %define LIMBS_shl_mod ring_core_0_17_14__LIMBS_shl_mod %define LIMBS_sub_mod ring_core_0_17_14__LIMBS_sub_mod %define LIMBS_window5_split_window ring_core_0_17_14__LIMBS_window5_split_window %define LIMBS_window5_unsplit_window ring_core_0_17_14__LIMBS_window5_unsplit_window %define LIMB_shr ring_core_0_17_14__LIMB_shr %define OPENSSL_cpuid_setup ring_core_0_17_14__OPENSSL_cpuid_setup %define aes_gcm_dec_kernel ring_core_0_17_14__aes_gcm_dec_kernel %define aes_gcm_dec_update_vaes_avx2 ring_core_0_17_14__aes_gcm_dec_update_vaes_avx2 %define aes_gcm_enc_kernel ring_core_0_17_14__aes_gcm_enc_kernel %define aes_gcm_enc_update_vaes_avx2 ring_core_0_17_14__aes_gcm_enc_update_vaes_avx2 %define aes_hw_ctr32_encrypt_blocks ring_core_0_17_14__aes_hw_ctr32_encrypt_blocks %define aes_hw_set_encrypt_key ring_core_0_17_14__aes_hw_set_encrypt_key %define aes_hw_set_encrypt_key_alt ring_core_0_17_14__aes_hw_set_encrypt_key_alt %define aes_hw_set_encrypt_key_base ring_core_0_17_14__aes_hw_set_encrypt_key_base %define aes_nohw_ctr32_encrypt_blocks ring_core_0_17_14__aes_nohw_ctr32_encrypt_blocks %define aes_nohw_encrypt ring_core_0_17_14__aes_nohw_encrypt %define aes_nohw_set_encrypt_key ring_core_0_17_14__aes_nohw_set_encrypt_key %define aesni_gcm_decrypt ring_core_0_17_14__aesni_gcm_decrypt %define aesni_gcm_encrypt ring_core_0_17_14__aesni_gcm_encrypt %define bn_from_montgomery_in_place ring_core_0_17_14__bn_from_montgomery_in_place %define bn_gather5 ring_core_0_17_14__bn_gather5 %define bn_mul_mont ring_core_0_17_14__bn_mul_mont %define bn_mul_mont_nohw ring_core_0_17_14__bn_mul_mont_nohw %define bn_mul4x_mont ring_core_0_17_14__bn_mul4x_mont %define bn_mulx4x_mont ring_core_0_17_14__bn_mulx4x_mont %define bn_mul8x_mont_neon ring_core_0_17_14__bn_mul8x_mont_neon %define bn_mul4x_mont_gather5 ring_core_0_17_14__bn_mul4x_mont_gather5 %define bn_mulx4x_mont_gather5 ring_core_0_17_14__bn_mulx4x_mont_gather5 %define bn_neg_inv_mod_r_u64 ring_core_0_17_14__bn_neg_inv_mod_r_u64 %define bn_power5_nohw ring_core_0_17_14__bn_power5_nohw %define bn_powerx5 ring_core_0_17_14__bn_powerx5 %define bn_scatter5 ring_core_0_17_14__bn_scatter5 %define bn_sqr8x_internal ring_core_0_17_14__bn_sqr8x_internal %define bn_sqr8x_mont ring_core_0_17_14__bn_sqr8x_mont %define bn_sqrx8x_internal ring_core_0_17_14__bn_sqrx8x_internal %define bsaes_ctr32_encrypt_blocks ring_core_0_17_14__bsaes_ctr32_encrypt_blocks %define bssl_constant_time_test_conditional_memcpy ring_core_0_17_14__bssl_constant_time_test_conditional_memcpy %define bssl_constant_time_test_conditional_memxor ring_core_0_17_14__bssl_constant_time_test_conditional_memxor %define bssl_constant_time_test_main ring_core_0_17_14__bssl_constant_time_test_main %define chacha20_poly1305_open ring_core_0_17_14__chacha20_poly1305_open %define chacha20_poly1305_open_avx2 ring_core_0_17_14__chacha20_poly1305_open_avx2 %define chacha20_poly1305_open_sse41 ring_core_0_17_14__chacha20_poly1305_open_sse41 %define chacha20_poly1305_seal ring_core_0_17_14__chacha20_poly1305_seal %define chacha20_poly1305_seal_avx2 ring_core_0_17_14__chacha20_poly1305_seal_avx2 %define chacha20_poly1305_seal_sse41 ring_core_0_17_14__chacha20_poly1305_seal_sse41 %define 
ecp_nistz256_mul_mont_adx ring_core_0_17_14__ecp_nistz256_mul_mont_adx %define ecp_nistz256_mul_mont_nohw ring_core_0_17_14__ecp_nistz256_mul_mont_nohw %define ecp_nistz256_ord_mul_mont_adx ring_core_0_17_14__ecp_nistz256_ord_mul_mont_adx %define ecp_nistz256_ord_mul_mont_nohw ring_core_0_17_14__ecp_nistz256_ord_mul_mont_nohw %define ecp_nistz256_ord_sqr_mont_adx ring_core_0_17_14__ecp_nistz256_ord_sqr_mont_adx %define ecp_nistz256_ord_sqr_mont_nohw ring_core_0_17_14__ecp_nistz256_ord_sqr_mont_nohw %define ecp_nistz256_point_add_adx ring_core_0_17_14__ecp_nistz256_point_add_adx %define ecp_nistz256_point_add_nohw ring_core_0_17_14__ecp_nistz256_point_add_nohw %define ecp_nistz256_point_add_affine_adx ring_core_0_17_14__ecp_nistz256_point_add_affine_adx %define ecp_nistz256_point_add_affine_nohw ring_core_0_17_14__ecp_nistz256_point_add_affine_nohw %define ecp_nistz256_point_double_adx ring_core_0_17_14__ecp_nistz256_point_double_adx %define ecp_nistz256_point_double_nohw ring_core_0_17_14__ecp_nistz256_point_double_nohw %define ecp_nistz256_select_w5_avx2 ring_core_0_17_14__ecp_nistz256_select_w5_avx2 %define ecp_nistz256_select_w5_nohw ring_core_0_17_14__ecp_nistz256_select_w5_nohw %define ecp_nistz256_select_w7_avx2 ring_core_0_17_14__ecp_nistz256_select_w7_avx2 %define ecp_nistz256_select_w7_nohw ring_core_0_17_14__ecp_nistz256_select_w7_nohw %define ecp_nistz256_sqr_mont_adx ring_core_0_17_14__ecp_nistz256_sqr_mont_adx %define ecp_nistz256_sqr_mont_nohw ring_core_0_17_14__ecp_nistz256_sqr_mont_nohw %define fiat_curve25519_adx_mul ring_core_0_17_14__fiat_curve25519_adx_mul %define fiat_curve25519_adx_square ring_core_0_17_14__fiat_curve25519_adx_square %define gcm_ghash_avx ring_core_0_17_14__gcm_ghash_avx %define gcm_ghash_clmul ring_core_0_17_14__gcm_ghash_clmul %define gcm_ghash_neon ring_core_0_17_14__gcm_ghash_neon %define gcm_ghash_vpclmulqdq_avx2_1 ring_core_0_17_14__gcm_ghash_vpclmulqdq_avx2_1 %define gcm_gmult_clmul ring_core_0_17_14__gcm_gmult_clmul %define gcm_gmult_neon ring_core_0_17_14__gcm_gmult_neon %define gcm_init_avx ring_core_0_17_14__gcm_init_avx %define gcm_init_clmul ring_core_0_17_14__gcm_init_clmul %define gcm_init_neon ring_core_0_17_14__gcm_init_neon %define gcm_init_vpclmulqdq_avx2 ring_core_0_17_14__gcm_init_vpclmulqdq_avx2 %define k25519Precomp ring_core_0_17_14__k25519Precomp %define limbs_mul_add_limb ring_core_0_17_14__limbs_mul_add_limb %define little_endian_bytes_from_scalar ring_core_0_17_14__little_endian_bytes_from_scalar %define ecp_nistz256_neg ring_core_0_17_14__ecp_nistz256_neg %define ecp_nistz256_select_w5 ring_core_0_17_14__ecp_nistz256_select_w5 %define ecp_nistz256_select_w7 ring_core_0_17_14__ecp_nistz256_select_w7 %define neon_available ring_core_0_17_14__neon_available %define p256_mul_mont ring_core_0_17_14__p256_mul_mont %define p256_point_add ring_core_0_17_14__p256_point_add %define p256_point_add_affine ring_core_0_17_14__p256_point_add_affine %define p256_point_double ring_core_0_17_14__p256_point_double %define p256_point_mul ring_core_0_17_14__p256_point_mul %define p256_point_mul_base ring_core_0_17_14__p256_point_mul_base %define p256_point_mul_base_vartime ring_core_0_17_14__p256_point_mul_base_vartime %define p256_scalar_mul_mont ring_core_0_17_14__p256_scalar_mul_mont %define p256_scalar_sqr_rep_mont ring_core_0_17_14__p256_scalar_sqr_rep_mont %define p256_sqr_mont ring_core_0_17_14__p256_sqr_mont %define p384_elem_div_by_2 ring_core_0_17_14__p384_elem_div_by_2 %define p384_elem_mul_mont ring_core_0_17_14__p384_elem_mul_mont 
%define p384_elem_neg ring_core_0_17_14__p384_elem_neg %define p384_elem_sub ring_core_0_17_14__p384_elem_sub %define p384_point_add ring_core_0_17_14__p384_point_add %define p384_point_double ring_core_0_17_14__p384_point_double %define p384_point_mul ring_core_0_17_14__p384_point_mul %define p384_scalar_mul_mont ring_core_0_17_14__p384_scalar_mul_mont %define openssl_poly1305_neon2_addmulmod ring_core_0_17_14__openssl_poly1305_neon2_addmulmod %define openssl_poly1305_neon2_blocks ring_core_0_17_14__openssl_poly1305_neon2_blocks %define sha256_block_data_order ring_core_0_17_14__sha256_block_data_order %define sha256_block_data_order_avx ring_core_0_17_14__sha256_block_data_order_avx %define sha256_block_data_order_ssse3 ring_core_0_17_14__sha256_block_data_order_ssse3 %define sha256_block_data_order_hw ring_core_0_17_14__sha256_block_data_order_hw %define sha256_block_data_order_neon ring_core_0_17_14__sha256_block_data_order_neon %define sha256_block_data_order_nohw ring_core_0_17_14__sha256_block_data_order_nohw %define sha512_block_data_order ring_core_0_17_14__sha512_block_data_order %define sha512_block_data_order_avx ring_core_0_17_14__sha512_block_data_order_avx %define sha512_block_data_order_hw ring_core_0_17_14__sha512_block_data_order_hw %define sha512_block_data_order_neon ring_core_0_17_14__sha512_block_data_order_neon %define sha512_block_data_order_nohw ring_core_0_17_14__sha512_block_data_order_nohw %define vpaes_ctr32_encrypt_blocks ring_core_0_17_14__vpaes_ctr32_encrypt_blocks %define vpaes_encrypt ring_core_0_17_14__vpaes_encrypt %define vpaes_encrypt_key_to_bsaes ring_core_0_17_14__vpaes_encrypt_key_to_bsaes %define vpaes_set_encrypt_key ring_core_0_17_14__vpaes_set_encrypt_key %define x25519_NEON ring_core_0_17_14__x25519_NEON %define x25519_fe_invert ring_core_0_17_14__x25519_fe_invert %define x25519_fe_isnegative ring_core_0_17_14__x25519_fe_isnegative %define x25519_fe_mul_ttt ring_core_0_17_14__x25519_fe_mul_ttt %define x25519_fe_neg ring_core_0_17_14__x25519_fe_neg %define x25519_fe_tobytes ring_core_0_17_14__x25519_fe_tobytes %define x25519_ge_double_scalarmult_vartime ring_core_0_17_14__x25519_ge_double_scalarmult_vartime %define x25519_ge_frombytes_vartime ring_core_0_17_14__x25519_ge_frombytes_vartime %define x25519_ge_scalarmult_base ring_core_0_17_14__x25519_ge_scalarmult_base %define x25519_ge_scalarmult_base_adx ring_core_0_17_14__x25519_ge_scalarmult_base_adx %define x25519_public_from_private_generic_masked ring_core_0_17_14__x25519_public_from_private_generic_masked %define x25519_sc_mask ring_core_0_17_14__x25519_sc_mask %define x25519_sc_muladd ring_core_0_17_14__x25519_sc_muladd %define x25519_sc_reduce ring_core_0_17_14__x25519_sc_reduce %define x25519_scalar_mult_adx ring_core_0_17_14__x25519_scalar_mult_adx %define x25519_scalar_mult_generic_masked ring_core_0_17_14__x25519_scalar_mult_generic_masked %endif %endif ring-0.17.14/pregenerated/sha256-armv4-linux32.S000064400000000000000000001654401046102023000171010ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) @ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. @ @ Licensed under the Apache License, Version 2.0 (the "License"); @ you may not use this file except in compliance with the License. 
@ You may obtain a copy of the License at @ @ https://www.apache.org/licenses/LICENSE-2.0 @ @ Unless required by applicable law or agreed to in writing, software @ distributed under the License is distributed on an "AS IS" BASIS, @ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @ See the License for the specific language governing permissions and @ limitations under the License. @ ==================================================================== @ Written by Andy Polyakov for the OpenSSL @ project. @ ==================================================================== @ SHA256 block procedure for ARMv4. May 2007. @ Performance is ~2x better than gcc 3.4 generated code and in "abso- @ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per @ byte [on single-issue Xscale PXA250 core]. @ July 2010. @ @ Rescheduling for dual-issue pipeline resulted in 22% improvement on @ Cortex A8 core and ~20 cycles per processed byte. @ February 2011. @ @ Profiler-assisted and platform-specific optimization resulted in 16% @ improvement on Cortex A8 core and ~15.4 cycles per processed byte. @ September 2013. @ @ Add NEON implementation. On Cortex A8 it was measured to process one @ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon @ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only @ code (meaning that latter performs sub-optimally, nothing was done @ about it). @ May 2014. @ @ Add ARMv8 code path performing at 2.0 cpb on Apple A7. #ifdef __KERNEL__ # define __ARM_ARCH __LINUX_ARM_ARCH__ # define __ARM_MAX_ARCH__ 7 #endif @ Silence ARMv8 deprecated IT instruction warnings. This file is used by both @ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those @ instructions are manually-encoded. (See unsha256.) 
.arch armv7-a .text #if defined(__thumb2__) .syntax unified .thumb #else .code 32 #endif .type K256,%object .align 5 K256: .word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .size K256,.-K256 .word 0 @ terminator .align 5 .globl sha256_block_data_order_nohw .hidden sha256_block_data_order_nohw .type sha256_block_data_order_nohw,%function sha256_block_data_order_nohw: add r2,r1,r2,lsl#6 @ len to point at the end of inp stmdb sp!,{r0,r1,r2,r4-r11,lr} ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11} adr r14,K256 sub sp,sp,#16*4 @ alloca(X[16]) .Loop: # if __ARM_ARCH>=7 ldr r2,[r1],#4 # else ldrb r2,[r1,#3] # endif eor r3,r5,r6 @ magic eor r12,r12,r12 #if __ARM_ARCH>=7 @ ldr r2,[r1],#4 @ 0 # if 0==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r8,r8,ror#5 add r4,r4,r12 @ h+=Maj(a,b,c) from the past eor r0,r0,r8,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 0 add r4,r4,r12 @ h+=Maj(a,b,c) from the past ldrb r12,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r12,lsl#8 ldrb r12,[r1],#4 orr r2,r2,r0,lsl#16 # if 0==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r8,r8,ror#5 orr r2,r2,r12,lsl#24 eor r0,r0,r8,ror#19 @ Sigma1(e) #endif ldr r12,[r14],#4 @ *K256++ add r11,r11,r2 @ h+=X[i] str r2,[sp,#0*4] eor r2,r9,r10 add r11,r11,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r8 add r11,r11,r12 @ h+=K256[i] eor r2,r2,r10 @ Ch(e,f,g) eor r0,r4,r4,ror#11 add r11,r11,r2 @ h+=Ch(e,f,g) #if 0==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? #endif #if 0<15 # if __ARM_ARCH>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r4,r5 @ a^b, b^c in next round #else ldr r2,[sp,#2*4] @ from future BODY_16_xx eor r12,r4,r5 @ a^b, b^c in next round ldr r1,[sp,#15*4] @ from future BODY_16_xx #endif eor r0,r0,r4,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r7,r7,r11 @ d+=h eor r3,r3,r5 @ Maj(a,b,c) add r11,r11,r0,ror#2 @ h+=Sigma0(a) @ add r11,r11,r3 @ h+=Maj(a,b,c) #if __ARM_ARCH>=7 @ ldr r2,[r1],#4 @ 1 # if 1==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r7,r7,ror#5 add r11,r11,r3 @ h+=Maj(a,b,c) from the past eor r0,r0,r7,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 1 add r11,r11,r3 @ h+=Maj(a,b,c) from the past ldrb r3,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r3,lsl#8 ldrb r3,[r1],#4 orr r2,r2,r0,lsl#16 # if 1==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r7,r7,ror#5 orr r2,r2,r3,lsl#24 eor r0,r0,r7,ror#19 @ Sigma1(e) #endif ldr r3,[r14],#4 @ *K256++ add r10,r10,r2 @ h+=X[i] str r2,[sp,#1*4] eor r2,r8,r9 add r10,r10,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r7 add r10,r10,r3 @ h+=K256[i] eor r2,r2,r9 @ Ch(e,f,g) eor r0,r11,r11,ror#11 add r10,r10,r2 @ h+=Ch(e,f,g) #if 1==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? 
#endif #if 1<15 # if __ARM_ARCH>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r11,r4 @ a^b, b^c in next round #else ldr r2,[sp,#3*4] @ from future BODY_16_xx eor r3,r11,r4 @ a^b, b^c in next round ldr r1,[sp,#0*4] @ from future BODY_16_xx #endif eor r0,r0,r11,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r6,r6,r10 @ d+=h eor r12,r12,r4 @ Maj(a,b,c) add r10,r10,r0,ror#2 @ h+=Sigma0(a) @ add r10,r10,r12 @ h+=Maj(a,b,c) #if __ARM_ARCH>=7 @ ldr r2,[r1],#4 @ 2 # if 2==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r6,r6,ror#5 add r10,r10,r12 @ h+=Maj(a,b,c) from the past eor r0,r0,r6,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 2 add r10,r10,r12 @ h+=Maj(a,b,c) from the past ldrb r12,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r12,lsl#8 ldrb r12,[r1],#4 orr r2,r2,r0,lsl#16 # if 2==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r6,r6,ror#5 orr r2,r2,r12,lsl#24 eor r0,r0,r6,ror#19 @ Sigma1(e) #endif ldr r12,[r14],#4 @ *K256++ add r9,r9,r2 @ h+=X[i] str r2,[sp,#2*4] eor r2,r7,r8 add r9,r9,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r6 add r9,r9,r12 @ h+=K256[i] eor r2,r2,r8 @ Ch(e,f,g) eor r0,r10,r10,ror#11 add r9,r9,r2 @ h+=Ch(e,f,g) #if 2==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? #endif #if 2<15 # if __ARM_ARCH>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r10,r11 @ a^b, b^c in next round #else ldr r2,[sp,#4*4] @ from future BODY_16_xx eor r12,r10,r11 @ a^b, b^c in next round ldr r1,[sp,#1*4] @ from future BODY_16_xx #endif eor r0,r0,r10,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r5,r5,r9 @ d+=h eor r3,r3,r11 @ Maj(a,b,c) add r9,r9,r0,ror#2 @ h+=Sigma0(a) @ add r9,r9,r3 @ h+=Maj(a,b,c) #if __ARM_ARCH>=7 @ ldr r2,[r1],#4 @ 3 # if 3==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r5,r5,ror#5 add r9,r9,r3 @ h+=Maj(a,b,c) from the past eor r0,r0,r5,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 3 add r9,r9,r3 @ h+=Maj(a,b,c) from the past ldrb r3,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r3,lsl#8 ldrb r3,[r1],#4 orr r2,r2,r0,lsl#16 # if 3==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r5,r5,ror#5 orr r2,r2,r3,lsl#24 eor r0,r0,r5,ror#19 @ Sigma1(e) #endif ldr r3,[r14],#4 @ *K256++ add r8,r8,r2 @ h+=X[i] str r2,[sp,#3*4] eor r2,r6,r7 add r8,r8,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r5 add r8,r8,r3 @ h+=K256[i] eor r2,r2,r7 @ Ch(e,f,g) eor r0,r9,r9,ror#11 add r8,r8,r2 @ h+=Ch(e,f,g) #if 3==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? 
#endif #if 3<15 # if __ARM_ARCH>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r9,r10 @ a^b, b^c in next round #else ldr r2,[sp,#5*4] @ from future BODY_16_xx eor r3,r9,r10 @ a^b, b^c in next round ldr r1,[sp,#2*4] @ from future BODY_16_xx #endif eor r0,r0,r9,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r4,r4,r8 @ d+=h eor r12,r12,r10 @ Maj(a,b,c) add r8,r8,r0,ror#2 @ h+=Sigma0(a) @ add r8,r8,r12 @ h+=Maj(a,b,c) #if __ARM_ARCH>=7 @ ldr r2,[r1],#4 @ 4 # if 4==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r4,r4,ror#5 add r8,r8,r12 @ h+=Maj(a,b,c) from the past eor r0,r0,r4,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 4 add r8,r8,r12 @ h+=Maj(a,b,c) from the past ldrb r12,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r12,lsl#8 ldrb r12,[r1],#4 orr r2,r2,r0,lsl#16 # if 4==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r4,r4,ror#5 orr r2,r2,r12,lsl#24 eor r0,r0,r4,ror#19 @ Sigma1(e) #endif ldr r12,[r14],#4 @ *K256++ add r7,r7,r2 @ h+=X[i] str r2,[sp,#4*4] eor r2,r5,r6 add r7,r7,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r4 add r7,r7,r12 @ h+=K256[i] eor r2,r2,r6 @ Ch(e,f,g) eor r0,r8,r8,ror#11 add r7,r7,r2 @ h+=Ch(e,f,g) #if 4==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? #endif #if 4<15 # if __ARM_ARCH>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r8,r9 @ a^b, b^c in next round #else ldr r2,[sp,#6*4] @ from future BODY_16_xx eor r12,r8,r9 @ a^b, b^c in next round ldr r1,[sp,#3*4] @ from future BODY_16_xx #endif eor r0,r0,r8,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r11,r11,r7 @ d+=h eor r3,r3,r9 @ Maj(a,b,c) add r7,r7,r0,ror#2 @ h+=Sigma0(a) @ add r7,r7,r3 @ h+=Maj(a,b,c) #if __ARM_ARCH>=7 @ ldr r2,[r1],#4 @ 5 # if 5==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r11,r11,ror#5 add r7,r7,r3 @ h+=Maj(a,b,c) from the past eor r0,r0,r11,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 5 add r7,r7,r3 @ h+=Maj(a,b,c) from the past ldrb r3,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r3,lsl#8 ldrb r3,[r1],#4 orr r2,r2,r0,lsl#16 # if 5==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r11,r11,ror#5 orr r2,r2,r3,lsl#24 eor r0,r0,r11,ror#19 @ Sigma1(e) #endif ldr r3,[r14],#4 @ *K256++ add r6,r6,r2 @ h+=X[i] str r2,[sp,#5*4] eor r2,r4,r5 add r6,r6,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r11 add r6,r6,r3 @ h+=K256[i] eor r2,r2,r5 @ Ch(e,f,g) eor r0,r7,r7,ror#11 add r6,r6,r2 @ h+=Ch(e,f,g) #if 5==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? 
#endif #if 5<15 # if __ARM_ARCH>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r7,r8 @ a^b, b^c in next round #else ldr r2,[sp,#7*4] @ from future BODY_16_xx eor r3,r7,r8 @ a^b, b^c in next round ldr r1,[sp,#4*4] @ from future BODY_16_xx #endif eor r0,r0,r7,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r10,r10,r6 @ d+=h eor r12,r12,r8 @ Maj(a,b,c) add r6,r6,r0,ror#2 @ h+=Sigma0(a) @ add r6,r6,r12 @ h+=Maj(a,b,c) #if __ARM_ARCH>=7 @ ldr r2,[r1],#4 @ 6 # if 6==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r10,r10,ror#5 add r6,r6,r12 @ h+=Maj(a,b,c) from the past eor r0,r0,r10,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 6 add r6,r6,r12 @ h+=Maj(a,b,c) from the past ldrb r12,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r12,lsl#8 ldrb r12,[r1],#4 orr r2,r2,r0,lsl#16 # if 6==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r10,r10,ror#5 orr r2,r2,r12,lsl#24 eor r0,r0,r10,ror#19 @ Sigma1(e) #endif ldr r12,[r14],#4 @ *K256++ add r5,r5,r2 @ h+=X[i] str r2,[sp,#6*4] eor r2,r11,r4 add r5,r5,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r10 add r5,r5,r12 @ h+=K256[i] eor r2,r2,r4 @ Ch(e,f,g) eor r0,r6,r6,ror#11 add r5,r5,r2 @ h+=Ch(e,f,g) #if 6==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? #endif #if 6<15 # if __ARM_ARCH>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r6,r7 @ a^b, b^c in next round #else ldr r2,[sp,#8*4] @ from future BODY_16_xx eor r12,r6,r7 @ a^b, b^c in next round ldr r1,[sp,#5*4] @ from future BODY_16_xx #endif eor r0,r0,r6,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r9,r9,r5 @ d+=h eor r3,r3,r7 @ Maj(a,b,c) add r5,r5,r0,ror#2 @ h+=Sigma0(a) @ add r5,r5,r3 @ h+=Maj(a,b,c) #if __ARM_ARCH>=7 @ ldr r2,[r1],#4 @ 7 # if 7==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r9,r9,ror#5 add r5,r5,r3 @ h+=Maj(a,b,c) from the past eor r0,r0,r9,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 7 add r5,r5,r3 @ h+=Maj(a,b,c) from the past ldrb r3,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r3,lsl#8 ldrb r3,[r1],#4 orr r2,r2,r0,lsl#16 # if 7==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r9,r9,ror#5 orr r2,r2,r3,lsl#24 eor r0,r0,r9,ror#19 @ Sigma1(e) #endif ldr r3,[r14],#4 @ *K256++ add r4,r4,r2 @ h+=X[i] str r2,[sp,#7*4] eor r2,r10,r11 add r4,r4,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r9 add r4,r4,r3 @ h+=K256[i] eor r2,r2,r11 @ Ch(e,f,g) eor r0,r5,r5,ror#11 add r4,r4,r2 @ h+=Ch(e,f,g) #if 7==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? 
#endif #if 7<15 # if __ARM_ARCH>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r5,r6 @ a^b, b^c in next round #else ldr r2,[sp,#9*4] @ from future BODY_16_xx eor r3,r5,r6 @ a^b, b^c in next round ldr r1,[sp,#6*4] @ from future BODY_16_xx #endif eor r0,r0,r5,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r8,r8,r4 @ d+=h eor r12,r12,r6 @ Maj(a,b,c) add r4,r4,r0,ror#2 @ h+=Sigma0(a) @ add r4,r4,r12 @ h+=Maj(a,b,c) #if __ARM_ARCH>=7 @ ldr r2,[r1],#4 @ 8 # if 8==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r8,r8,ror#5 add r4,r4,r12 @ h+=Maj(a,b,c) from the past eor r0,r0,r8,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 8 add r4,r4,r12 @ h+=Maj(a,b,c) from the past ldrb r12,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r12,lsl#8 ldrb r12,[r1],#4 orr r2,r2,r0,lsl#16 # if 8==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r8,r8,ror#5 orr r2,r2,r12,lsl#24 eor r0,r0,r8,ror#19 @ Sigma1(e) #endif ldr r12,[r14],#4 @ *K256++ add r11,r11,r2 @ h+=X[i] str r2,[sp,#8*4] eor r2,r9,r10 add r11,r11,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r8 add r11,r11,r12 @ h+=K256[i] eor r2,r2,r10 @ Ch(e,f,g) eor r0,r4,r4,ror#11 add r11,r11,r2 @ h+=Ch(e,f,g) #if 8==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? #endif #if 8<15 # if __ARM_ARCH>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r4,r5 @ a^b, b^c in next round #else ldr r2,[sp,#10*4] @ from future BODY_16_xx eor r12,r4,r5 @ a^b, b^c in next round ldr r1,[sp,#7*4] @ from future BODY_16_xx #endif eor r0,r0,r4,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r7,r7,r11 @ d+=h eor r3,r3,r5 @ Maj(a,b,c) add r11,r11,r0,ror#2 @ h+=Sigma0(a) @ add r11,r11,r3 @ h+=Maj(a,b,c) #if __ARM_ARCH>=7 @ ldr r2,[r1],#4 @ 9 # if 9==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r7,r7,ror#5 add r11,r11,r3 @ h+=Maj(a,b,c) from the past eor r0,r0,r7,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 9 add r11,r11,r3 @ h+=Maj(a,b,c) from the past ldrb r3,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r3,lsl#8 ldrb r3,[r1],#4 orr r2,r2,r0,lsl#16 # if 9==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r7,r7,ror#5 orr r2,r2,r3,lsl#24 eor r0,r0,r7,ror#19 @ Sigma1(e) #endif ldr r3,[r14],#4 @ *K256++ add r10,r10,r2 @ h+=X[i] str r2,[sp,#9*4] eor r2,r8,r9 add r10,r10,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r7 add r10,r10,r3 @ h+=K256[i] eor r2,r2,r9 @ Ch(e,f,g) eor r0,r11,r11,ror#11 add r10,r10,r2 @ h+=Ch(e,f,g) #if 9==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? 
#endif #if 9<15 # if __ARM_ARCH>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r11,r4 @ a^b, b^c in next round #else ldr r2,[sp,#11*4] @ from future BODY_16_xx eor r3,r11,r4 @ a^b, b^c in next round ldr r1,[sp,#8*4] @ from future BODY_16_xx #endif eor r0,r0,r11,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r6,r6,r10 @ d+=h eor r12,r12,r4 @ Maj(a,b,c) add r10,r10,r0,ror#2 @ h+=Sigma0(a) @ add r10,r10,r12 @ h+=Maj(a,b,c) #if __ARM_ARCH>=7 @ ldr r2,[r1],#4 @ 10 # if 10==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r6,r6,ror#5 add r10,r10,r12 @ h+=Maj(a,b,c) from the past eor r0,r0,r6,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 10 add r10,r10,r12 @ h+=Maj(a,b,c) from the past ldrb r12,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r12,lsl#8 ldrb r12,[r1],#4 orr r2,r2,r0,lsl#16 # if 10==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r6,r6,ror#5 orr r2,r2,r12,lsl#24 eor r0,r0,r6,ror#19 @ Sigma1(e) #endif ldr r12,[r14],#4 @ *K256++ add r9,r9,r2 @ h+=X[i] str r2,[sp,#10*4] eor r2,r7,r8 add r9,r9,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r6 add r9,r9,r12 @ h+=K256[i] eor r2,r2,r8 @ Ch(e,f,g) eor r0,r10,r10,ror#11 add r9,r9,r2 @ h+=Ch(e,f,g) #if 10==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? #endif #if 10<15 # if __ARM_ARCH>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r10,r11 @ a^b, b^c in next round #else ldr r2,[sp,#12*4] @ from future BODY_16_xx eor r12,r10,r11 @ a^b, b^c in next round ldr r1,[sp,#9*4] @ from future BODY_16_xx #endif eor r0,r0,r10,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r5,r5,r9 @ d+=h eor r3,r3,r11 @ Maj(a,b,c) add r9,r9,r0,ror#2 @ h+=Sigma0(a) @ add r9,r9,r3 @ h+=Maj(a,b,c) #if __ARM_ARCH>=7 @ ldr r2,[r1],#4 @ 11 # if 11==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r5,r5,ror#5 add r9,r9,r3 @ h+=Maj(a,b,c) from the past eor r0,r0,r5,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 11 add r9,r9,r3 @ h+=Maj(a,b,c) from the past ldrb r3,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r3,lsl#8 ldrb r3,[r1],#4 orr r2,r2,r0,lsl#16 # if 11==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r5,r5,ror#5 orr r2,r2,r3,lsl#24 eor r0,r0,r5,ror#19 @ Sigma1(e) #endif ldr r3,[r14],#4 @ *K256++ add r8,r8,r2 @ h+=X[i] str r2,[sp,#11*4] eor r2,r6,r7 add r8,r8,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r5 add r8,r8,r3 @ h+=K256[i] eor r2,r2,r7 @ Ch(e,f,g) eor r0,r9,r9,ror#11 add r8,r8,r2 @ h+=Ch(e,f,g) #if 11==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? 
#endif #if 11<15 # if __ARM_ARCH>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r9,r10 @ a^b, b^c in next round #else ldr r2,[sp,#13*4] @ from future BODY_16_xx eor r3,r9,r10 @ a^b, b^c in next round ldr r1,[sp,#10*4] @ from future BODY_16_xx #endif eor r0,r0,r9,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r4,r4,r8 @ d+=h eor r12,r12,r10 @ Maj(a,b,c) add r8,r8,r0,ror#2 @ h+=Sigma0(a) @ add r8,r8,r12 @ h+=Maj(a,b,c) #if __ARM_ARCH>=7 @ ldr r2,[r1],#4 @ 12 # if 12==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r4,r4,ror#5 add r8,r8,r12 @ h+=Maj(a,b,c) from the past eor r0,r0,r4,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 12 add r8,r8,r12 @ h+=Maj(a,b,c) from the past ldrb r12,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r12,lsl#8 ldrb r12,[r1],#4 orr r2,r2,r0,lsl#16 # if 12==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r4,r4,ror#5 orr r2,r2,r12,lsl#24 eor r0,r0,r4,ror#19 @ Sigma1(e) #endif ldr r12,[r14],#4 @ *K256++ add r7,r7,r2 @ h+=X[i] str r2,[sp,#12*4] eor r2,r5,r6 add r7,r7,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r4 add r7,r7,r12 @ h+=K256[i] eor r2,r2,r6 @ Ch(e,f,g) eor r0,r8,r8,ror#11 add r7,r7,r2 @ h+=Ch(e,f,g) #if 12==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? #endif #if 12<15 # if __ARM_ARCH>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r8,r9 @ a^b, b^c in next round #else ldr r2,[sp,#14*4] @ from future BODY_16_xx eor r12,r8,r9 @ a^b, b^c in next round ldr r1,[sp,#11*4] @ from future BODY_16_xx #endif eor r0,r0,r8,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r11,r11,r7 @ d+=h eor r3,r3,r9 @ Maj(a,b,c) add r7,r7,r0,ror#2 @ h+=Sigma0(a) @ add r7,r7,r3 @ h+=Maj(a,b,c) #if __ARM_ARCH>=7 @ ldr r2,[r1],#4 @ 13 # if 13==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r11,r11,ror#5 add r7,r7,r3 @ h+=Maj(a,b,c) from the past eor r0,r0,r11,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 13 add r7,r7,r3 @ h+=Maj(a,b,c) from the past ldrb r3,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r3,lsl#8 ldrb r3,[r1],#4 orr r2,r2,r0,lsl#16 # if 13==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r11,r11,ror#5 orr r2,r2,r3,lsl#24 eor r0,r0,r11,ror#19 @ Sigma1(e) #endif ldr r3,[r14],#4 @ *K256++ add r6,r6,r2 @ h+=X[i] str r2,[sp,#13*4] eor r2,r4,r5 add r6,r6,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r11 add r6,r6,r3 @ h+=K256[i] eor r2,r2,r5 @ Ch(e,f,g) eor r0,r7,r7,ror#11 add r6,r6,r2 @ h+=Ch(e,f,g) #if 13==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? 
#endif #if 13<15 # if __ARM_ARCH>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r7,r8 @ a^b, b^c in next round #else ldr r2,[sp,#15*4] @ from future BODY_16_xx eor r3,r7,r8 @ a^b, b^c in next round ldr r1,[sp,#12*4] @ from future BODY_16_xx #endif eor r0,r0,r7,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r10,r10,r6 @ d+=h eor r12,r12,r8 @ Maj(a,b,c) add r6,r6,r0,ror#2 @ h+=Sigma0(a) @ add r6,r6,r12 @ h+=Maj(a,b,c) #if __ARM_ARCH>=7 @ ldr r2,[r1],#4 @ 14 # if 14==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r10,r10,ror#5 add r6,r6,r12 @ h+=Maj(a,b,c) from the past eor r0,r0,r10,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 14 add r6,r6,r12 @ h+=Maj(a,b,c) from the past ldrb r12,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r12,lsl#8 ldrb r12,[r1],#4 orr r2,r2,r0,lsl#16 # if 14==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r10,r10,ror#5 orr r2,r2,r12,lsl#24 eor r0,r0,r10,ror#19 @ Sigma1(e) #endif ldr r12,[r14],#4 @ *K256++ add r5,r5,r2 @ h+=X[i] str r2,[sp,#14*4] eor r2,r11,r4 add r5,r5,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r10 add r5,r5,r12 @ h+=K256[i] eor r2,r2,r4 @ Ch(e,f,g) eor r0,r6,r6,ror#11 add r5,r5,r2 @ h+=Ch(e,f,g) #if 14==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? #endif #if 14<15 # if __ARM_ARCH>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r6,r7 @ a^b, b^c in next round #else ldr r2,[sp,#0*4] @ from future BODY_16_xx eor r12,r6,r7 @ a^b, b^c in next round ldr r1,[sp,#13*4] @ from future BODY_16_xx #endif eor r0,r0,r6,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r9,r9,r5 @ d+=h eor r3,r3,r7 @ Maj(a,b,c) add r5,r5,r0,ror#2 @ h+=Sigma0(a) @ add r5,r5,r3 @ h+=Maj(a,b,c) #if __ARM_ARCH>=7 @ ldr r2,[r1],#4 @ 15 # if 15==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r9,r9,ror#5 add r5,r5,r3 @ h+=Maj(a,b,c) from the past eor r0,r0,r9,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 15 add r5,r5,r3 @ h+=Maj(a,b,c) from the past ldrb r3,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r3,lsl#8 ldrb r3,[r1],#4 orr r2,r2,r0,lsl#16 # if 15==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r9,r9,ror#5 orr r2,r2,r3,lsl#24 eor r0,r0,r9,ror#19 @ Sigma1(e) #endif ldr r3,[r14],#4 @ *K256++ add r4,r4,r2 @ h+=X[i] str r2,[sp,#15*4] eor r2,r10,r11 add r4,r4,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r9 add r4,r4,r3 @ h+=K256[i] eor r2,r2,r11 @ Ch(e,f,g) eor r0,r5,r5,ror#11 add r4,r4,r2 @ h+=Ch(e,f,g) #if 15==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? 
#endif #if 15<15 # if __ARM_ARCH>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r5,r6 @ a^b, b^c in next round #else ldr r2,[sp,#1*4] @ from future BODY_16_xx eor r3,r5,r6 @ a^b, b^c in next round ldr r1,[sp,#14*4] @ from future BODY_16_xx #endif eor r0,r0,r5,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r8,r8,r4 @ d+=h eor r12,r12,r6 @ Maj(a,b,c) add r4,r4,r0,ror#2 @ h+=Sigma0(a) @ add r4,r4,r12 @ h+=Maj(a,b,c) .Lrounds_16_xx: @ ldr r2,[sp,#1*4] @ 16 @ ldr r1,[sp,#14*4] mov r0,r2,ror#7 add r4,r4,r12 @ h+=Maj(a,b,c) from the past mov r12,r1,ror#17 eor r0,r0,r2,ror#18 eor r12,r12,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#0*4] eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#9*4] add r12,r12,r0 eor r0,r8,r8,ror#5 @ from BODY_00_15 add r2,r2,r12 eor r0,r0,r8,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r12,[r14],#4 @ *K256++ add r11,r11,r2 @ h+=X[i] str r2,[sp,#0*4] eor r2,r9,r10 add r11,r11,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r8 add r11,r11,r12 @ h+=K256[i] eor r2,r2,r10 @ Ch(e,f,g) eor r0,r4,r4,ror#11 add r11,r11,r2 @ h+=Ch(e,f,g) #if 16==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? #endif #if 16<15 # if __ARM_ARCH>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r4,r5 @ a^b, b^c in next round #else ldr r2,[sp,#2*4] @ from future BODY_16_xx eor r12,r4,r5 @ a^b, b^c in next round ldr r1,[sp,#15*4] @ from future BODY_16_xx #endif eor r0,r0,r4,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r7,r7,r11 @ d+=h eor r3,r3,r5 @ Maj(a,b,c) add r11,r11,r0,ror#2 @ h+=Sigma0(a) @ add r11,r11,r3 @ h+=Maj(a,b,c) @ ldr r2,[sp,#2*4] @ 17 @ ldr r1,[sp,#15*4] mov r0,r2,ror#7 add r11,r11,r3 @ h+=Maj(a,b,c) from the past mov r3,r1,ror#17 eor r0,r0,r2,ror#18 eor r3,r3,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#1*4] eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#10*4] add r3,r3,r0 eor r0,r7,r7,ror#5 @ from BODY_00_15 add r2,r2,r3 eor r0,r0,r7,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r3,[r14],#4 @ *K256++ add r10,r10,r2 @ h+=X[i] str r2,[sp,#1*4] eor r2,r8,r9 add r10,r10,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r7 add r10,r10,r3 @ h+=K256[i] eor r2,r2,r9 @ Ch(e,f,g) eor r0,r11,r11,ror#11 add r10,r10,r2 @ h+=Ch(e,f,g) #if 17==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? #endif #if 17<15 # if __ARM_ARCH>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r11,r4 @ a^b, b^c in next round #else ldr r2,[sp,#3*4] @ from future BODY_16_xx eor r3,r11,r4 @ a^b, b^c in next round ldr r1,[sp,#0*4] @ from future BODY_16_xx #endif eor r0,r0,r11,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r6,r6,r10 @ d+=h eor r12,r12,r4 @ Maj(a,b,c) add r10,r10,r0,ror#2 @ h+=Sigma0(a) @ add r10,r10,r12 @ h+=Maj(a,b,c) @ ldr r2,[sp,#3*4] @ 18 @ ldr r1,[sp,#0*4] mov r0,r2,ror#7 add r10,r10,r12 @ h+=Maj(a,b,c) from the past mov r12,r1,ror#17 eor r0,r0,r2,ror#18 eor r12,r12,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#2*4] eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#11*4] add r12,r12,r0 eor r0,r6,r6,ror#5 @ from BODY_00_15 add r2,r2,r12 eor r0,r0,r6,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r12,[r14],#4 @ *K256++ add r9,r9,r2 @ h+=X[i] str r2,[sp,#2*4] eor r2,r7,r8 add r9,r9,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r6 add r9,r9,r12 @ h+=K256[i] eor r2,r2,r8 @ Ch(e,f,g) eor r0,r10,r10,ror#11 add r9,r9,r2 @ h+=Ch(e,f,g) #if 18==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? 
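@
@ Note: from round 16 onwards (.Lrounds_16_xx) the message schedule is kept
@ in a 16-word circular buffer at sp[0..15].  Each round overwrites one
@ slot:
@     X[i&15] += sigma0(X[(i+1)&15]) + X[(i+9)&15] + sigma1(X[(i+14)&15])
@ where sigma0(x) = (x ror 7) ^ (x ror 18) ^ (x >> 3) and
@       sigma1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10).
@ This is the FIPS 180-4 recurrence
@     W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
@ rewritten relative to the slot being replaced; the fresh word then feeds
@ the same round function as rounds 0-15.
@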
#endif #if 18<15 # if __ARM_ARCH>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r10,r11 @ a^b, b^c in next round #else ldr r2,[sp,#4*4] @ from future BODY_16_xx eor r12,r10,r11 @ a^b, b^c in next round ldr r1,[sp,#1*4] @ from future BODY_16_xx #endif eor r0,r0,r10,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r5,r5,r9 @ d+=h eor r3,r3,r11 @ Maj(a,b,c) add r9,r9,r0,ror#2 @ h+=Sigma0(a) @ add r9,r9,r3 @ h+=Maj(a,b,c) @ ldr r2,[sp,#4*4] @ 19 @ ldr r1,[sp,#1*4] mov r0,r2,ror#7 add r9,r9,r3 @ h+=Maj(a,b,c) from the past mov r3,r1,ror#17 eor r0,r0,r2,ror#18 eor r3,r3,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#3*4] eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#12*4] add r3,r3,r0 eor r0,r5,r5,ror#5 @ from BODY_00_15 add r2,r2,r3 eor r0,r0,r5,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r3,[r14],#4 @ *K256++ add r8,r8,r2 @ h+=X[i] str r2,[sp,#3*4] eor r2,r6,r7 add r8,r8,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r5 add r8,r8,r3 @ h+=K256[i] eor r2,r2,r7 @ Ch(e,f,g) eor r0,r9,r9,ror#11 add r8,r8,r2 @ h+=Ch(e,f,g) #if 19==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? #endif #if 19<15 # if __ARM_ARCH>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r9,r10 @ a^b, b^c in next round #else ldr r2,[sp,#5*4] @ from future BODY_16_xx eor r3,r9,r10 @ a^b, b^c in next round ldr r1,[sp,#2*4] @ from future BODY_16_xx #endif eor r0,r0,r9,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r4,r4,r8 @ d+=h eor r12,r12,r10 @ Maj(a,b,c) add r8,r8,r0,ror#2 @ h+=Sigma0(a) @ add r8,r8,r12 @ h+=Maj(a,b,c) @ ldr r2,[sp,#5*4] @ 20 @ ldr r1,[sp,#2*4] mov r0,r2,ror#7 add r8,r8,r12 @ h+=Maj(a,b,c) from the past mov r12,r1,ror#17 eor r0,r0,r2,ror#18 eor r12,r12,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#4*4] eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#13*4] add r12,r12,r0 eor r0,r4,r4,ror#5 @ from BODY_00_15 add r2,r2,r12 eor r0,r0,r4,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r12,[r14],#4 @ *K256++ add r7,r7,r2 @ h+=X[i] str r2,[sp,#4*4] eor r2,r5,r6 add r7,r7,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r4 add r7,r7,r12 @ h+=K256[i] eor r2,r2,r6 @ Ch(e,f,g) eor r0,r8,r8,ror#11 add r7,r7,r2 @ h+=Ch(e,f,g) #if 20==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? #endif #if 20<15 # if __ARM_ARCH>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r8,r9 @ a^b, b^c in next round #else ldr r2,[sp,#6*4] @ from future BODY_16_xx eor r12,r8,r9 @ a^b, b^c in next round ldr r1,[sp,#3*4] @ from future BODY_16_xx #endif eor r0,r0,r8,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r11,r11,r7 @ d+=h eor r3,r3,r9 @ Maj(a,b,c) add r7,r7,r0,ror#2 @ h+=Sigma0(a) @ add r7,r7,r3 @ h+=Maj(a,b,c) @ ldr r2,[sp,#6*4] @ 21 @ ldr r1,[sp,#3*4] mov r0,r2,ror#7 add r7,r7,r3 @ h+=Maj(a,b,c) from the past mov r3,r1,ror#17 eor r0,r0,r2,ror#18 eor r3,r3,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#5*4] eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#14*4] add r3,r3,r0 eor r0,r11,r11,ror#5 @ from BODY_00_15 add r2,r2,r3 eor r0,r0,r11,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r3,[r14],#4 @ *K256++ add r6,r6,r2 @ h+=X[i] str r2,[sp,#5*4] eor r2,r4,r5 add r6,r6,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r11 add r6,r6,r3 @ h+=K256[i] eor r2,r2,r5 @ Ch(e,f,g) eor r0,r7,r7,ror#11 add r6,r6,r2 @ h+=Ch(e,f,g) #if 21==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? 
#endif #if 21<15 # if __ARM_ARCH>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r7,r8 @ a^b, b^c in next round #else ldr r2,[sp,#7*4] @ from future BODY_16_xx eor r3,r7,r8 @ a^b, b^c in next round ldr r1,[sp,#4*4] @ from future BODY_16_xx #endif eor r0,r0,r7,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r10,r10,r6 @ d+=h eor r12,r12,r8 @ Maj(a,b,c) add r6,r6,r0,ror#2 @ h+=Sigma0(a) @ add r6,r6,r12 @ h+=Maj(a,b,c) @ ldr r2,[sp,#7*4] @ 22 @ ldr r1,[sp,#4*4] mov r0,r2,ror#7 add r6,r6,r12 @ h+=Maj(a,b,c) from the past mov r12,r1,ror#17 eor r0,r0,r2,ror#18 eor r12,r12,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#6*4] eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#15*4] add r12,r12,r0 eor r0,r10,r10,ror#5 @ from BODY_00_15 add r2,r2,r12 eor r0,r0,r10,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r12,[r14],#4 @ *K256++ add r5,r5,r2 @ h+=X[i] str r2,[sp,#6*4] eor r2,r11,r4 add r5,r5,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r10 add r5,r5,r12 @ h+=K256[i] eor r2,r2,r4 @ Ch(e,f,g) eor r0,r6,r6,ror#11 add r5,r5,r2 @ h+=Ch(e,f,g) #if 22==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? #endif #if 22<15 # if __ARM_ARCH>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r6,r7 @ a^b, b^c in next round #else ldr r2,[sp,#8*4] @ from future BODY_16_xx eor r12,r6,r7 @ a^b, b^c in next round ldr r1,[sp,#5*4] @ from future BODY_16_xx #endif eor r0,r0,r6,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r9,r9,r5 @ d+=h eor r3,r3,r7 @ Maj(a,b,c) add r5,r5,r0,ror#2 @ h+=Sigma0(a) @ add r5,r5,r3 @ h+=Maj(a,b,c) @ ldr r2,[sp,#8*4] @ 23 @ ldr r1,[sp,#5*4] mov r0,r2,ror#7 add r5,r5,r3 @ h+=Maj(a,b,c) from the past mov r3,r1,ror#17 eor r0,r0,r2,ror#18 eor r3,r3,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#7*4] eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#0*4] add r3,r3,r0 eor r0,r9,r9,ror#5 @ from BODY_00_15 add r2,r2,r3 eor r0,r0,r9,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r3,[r14],#4 @ *K256++ add r4,r4,r2 @ h+=X[i] str r2,[sp,#7*4] eor r2,r10,r11 add r4,r4,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r9 add r4,r4,r3 @ h+=K256[i] eor r2,r2,r11 @ Ch(e,f,g) eor r0,r5,r5,ror#11 add r4,r4,r2 @ h+=Ch(e,f,g) #if 23==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? #endif #if 23<15 # if __ARM_ARCH>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r5,r6 @ a^b, b^c in next round #else ldr r2,[sp,#9*4] @ from future BODY_16_xx eor r3,r5,r6 @ a^b, b^c in next round ldr r1,[sp,#6*4] @ from future BODY_16_xx #endif eor r0,r0,r5,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r8,r8,r4 @ d+=h eor r12,r12,r6 @ Maj(a,b,c) add r4,r4,r0,ror#2 @ h+=Sigma0(a) @ add r4,r4,r12 @ h+=Maj(a,b,c) @ ldr r2,[sp,#9*4] @ 24 @ ldr r1,[sp,#6*4] mov r0,r2,ror#7 add r4,r4,r12 @ h+=Maj(a,b,c) from the past mov r12,r1,ror#17 eor r0,r0,r2,ror#18 eor r12,r12,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#8*4] eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#1*4] add r12,r12,r0 eor r0,r8,r8,ror#5 @ from BODY_00_15 add r2,r2,r12 eor r0,r0,r8,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r12,[r14],#4 @ *K256++ add r11,r11,r2 @ h+=X[i] str r2,[sp,#8*4] eor r2,r9,r10 add r11,r11,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r8 add r11,r11,r12 @ h+=K256[i] eor r2,r2,r10 @ Ch(e,f,g) eor r0,r4,r4,ror#11 add r11,r11,r2 @ h+=Ch(e,f,g) #if 24==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? 
#endif #if 24<15 # if __ARM_ARCH>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r4,r5 @ a^b, b^c in next round #else ldr r2,[sp,#10*4] @ from future BODY_16_xx eor r12,r4,r5 @ a^b, b^c in next round ldr r1,[sp,#7*4] @ from future BODY_16_xx #endif eor r0,r0,r4,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r7,r7,r11 @ d+=h eor r3,r3,r5 @ Maj(a,b,c) add r11,r11,r0,ror#2 @ h+=Sigma0(a) @ add r11,r11,r3 @ h+=Maj(a,b,c) @ ldr r2,[sp,#10*4] @ 25 @ ldr r1,[sp,#7*4] mov r0,r2,ror#7 add r11,r11,r3 @ h+=Maj(a,b,c) from the past mov r3,r1,ror#17 eor r0,r0,r2,ror#18 eor r3,r3,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#9*4] eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#2*4] add r3,r3,r0 eor r0,r7,r7,ror#5 @ from BODY_00_15 add r2,r2,r3 eor r0,r0,r7,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r3,[r14],#4 @ *K256++ add r10,r10,r2 @ h+=X[i] str r2,[sp,#9*4] eor r2,r8,r9 add r10,r10,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r7 add r10,r10,r3 @ h+=K256[i] eor r2,r2,r9 @ Ch(e,f,g) eor r0,r11,r11,ror#11 add r10,r10,r2 @ h+=Ch(e,f,g) #if 25==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? #endif #if 25<15 # if __ARM_ARCH>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r11,r4 @ a^b, b^c in next round #else ldr r2,[sp,#11*4] @ from future BODY_16_xx eor r3,r11,r4 @ a^b, b^c in next round ldr r1,[sp,#8*4] @ from future BODY_16_xx #endif eor r0,r0,r11,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r6,r6,r10 @ d+=h eor r12,r12,r4 @ Maj(a,b,c) add r10,r10,r0,ror#2 @ h+=Sigma0(a) @ add r10,r10,r12 @ h+=Maj(a,b,c) @ ldr r2,[sp,#11*4] @ 26 @ ldr r1,[sp,#8*4] mov r0,r2,ror#7 add r10,r10,r12 @ h+=Maj(a,b,c) from the past mov r12,r1,ror#17 eor r0,r0,r2,ror#18 eor r12,r12,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#10*4] eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#3*4] add r12,r12,r0 eor r0,r6,r6,ror#5 @ from BODY_00_15 add r2,r2,r12 eor r0,r0,r6,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r12,[r14],#4 @ *K256++ add r9,r9,r2 @ h+=X[i] str r2,[sp,#10*4] eor r2,r7,r8 add r9,r9,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r6 add r9,r9,r12 @ h+=K256[i] eor r2,r2,r8 @ Ch(e,f,g) eor r0,r10,r10,ror#11 add r9,r9,r2 @ h+=Ch(e,f,g) #if 26==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? #endif #if 26<15 # if __ARM_ARCH>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r10,r11 @ a^b, b^c in next round #else ldr r2,[sp,#12*4] @ from future BODY_16_xx eor r12,r10,r11 @ a^b, b^c in next round ldr r1,[sp,#9*4] @ from future BODY_16_xx #endif eor r0,r0,r10,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r5,r5,r9 @ d+=h eor r3,r3,r11 @ Maj(a,b,c) add r9,r9,r0,ror#2 @ h+=Sigma0(a) @ add r9,r9,r3 @ h+=Maj(a,b,c) @ ldr r2,[sp,#12*4] @ 27 @ ldr r1,[sp,#9*4] mov r0,r2,ror#7 add r9,r9,r3 @ h+=Maj(a,b,c) from the past mov r3,r1,ror#17 eor r0,r0,r2,ror#18 eor r3,r3,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#11*4] eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#4*4] add r3,r3,r0 eor r0,r5,r5,ror#5 @ from BODY_00_15 add r2,r2,r3 eor r0,r0,r5,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r3,[r14],#4 @ *K256++ add r8,r8,r2 @ h+=X[i] str r2,[sp,#11*4] eor r2,r6,r7 add r8,r8,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r5 add r8,r8,r3 @ h+=K256[i] eor r2,r2,r7 @ Ch(e,f,g) eor r0,r9,r9,ror#11 add r8,r8,r2 @ h+=Ch(e,f,g) #if 27==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? 
#endif #if 27<15 # if __ARM_ARCH>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r9,r10 @ a^b, b^c in next round #else ldr r2,[sp,#13*4] @ from future BODY_16_xx eor r3,r9,r10 @ a^b, b^c in next round ldr r1,[sp,#10*4] @ from future BODY_16_xx #endif eor r0,r0,r9,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r4,r4,r8 @ d+=h eor r12,r12,r10 @ Maj(a,b,c) add r8,r8,r0,ror#2 @ h+=Sigma0(a) @ add r8,r8,r12 @ h+=Maj(a,b,c) @ ldr r2,[sp,#13*4] @ 28 @ ldr r1,[sp,#10*4] mov r0,r2,ror#7 add r8,r8,r12 @ h+=Maj(a,b,c) from the past mov r12,r1,ror#17 eor r0,r0,r2,ror#18 eor r12,r12,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#12*4] eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#5*4] add r12,r12,r0 eor r0,r4,r4,ror#5 @ from BODY_00_15 add r2,r2,r12 eor r0,r0,r4,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r12,[r14],#4 @ *K256++ add r7,r7,r2 @ h+=X[i] str r2,[sp,#12*4] eor r2,r5,r6 add r7,r7,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r4 add r7,r7,r12 @ h+=K256[i] eor r2,r2,r6 @ Ch(e,f,g) eor r0,r8,r8,ror#11 add r7,r7,r2 @ h+=Ch(e,f,g) #if 28==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? #endif #if 28<15 # if __ARM_ARCH>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r8,r9 @ a^b, b^c in next round #else ldr r2,[sp,#14*4] @ from future BODY_16_xx eor r12,r8,r9 @ a^b, b^c in next round ldr r1,[sp,#11*4] @ from future BODY_16_xx #endif eor r0,r0,r8,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r11,r11,r7 @ d+=h eor r3,r3,r9 @ Maj(a,b,c) add r7,r7,r0,ror#2 @ h+=Sigma0(a) @ add r7,r7,r3 @ h+=Maj(a,b,c) @ ldr r2,[sp,#14*4] @ 29 @ ldr r1,[sp,#11*4] mov r0,r2,ror#7 add r7,r7,r3 @ h+=Maj(a,b,c) from the past mov r3,r1,ror#17 eor r0,r0,r2,ror#18 eor r3,r3,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#13*4] eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#6*4] add r3,r3,r0 eor r0,r11,r11,ror#5 @ from BODY_00_15 add r2,r2,r3 eor r0,r0,r11,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r3,[r14],#4 @ *K256++ add r6,r6,r2 @ h+=X[i] str r2,[sp,#13*4] eor r2,r4,r5 add r6,r6,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r11 add r6,r6,r3 @ h+=K256[i] eor r2,r2,r5 @ Ch(e,f,g) eor r0,r7,r7,ror#11 add r6,r6,r2 @ h+=Ch(e,f,g) #if 29==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? #endif #if 29<15 # if __ARM_ARCH>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r7,r8 @ a^b, b^c in next round #else ldr r2,[sp,#15*4] @ from future BODY_16_xx eor r3,r7,r8 @ a^b, b^c in next round ldr r1,[sp,#12*4] @ from future BODY_16_xx #endif eor r0,r0,r7,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r10,r10,r6 @ d+=h eor r12,r12,r8 @ Maj(a,b,c) add r6,r6,r0,ror#2 @ h+=Sigma0(a) @ add r6,r6,r12 @ h+=Maj(a,b,c) @ ldr r2,[sp,#15*4] @ 30 @ ldr r1,[sp,#12*4] mov r0,r2,ror#7 add r6,r6,r12 @ h+=Maj(a,b,c) from the past mov r12,r1,ror#17 eor r0,r0,r2,ror#18 eor r12,r12,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#14*4] eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#7*4] add r12,r12,r0 eor r0,r10,r10,ror#5 @ from BODY_00_15 add r2,r2,r12 eor r0,r0,r10,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r12,[r14],#4 @ *K256++ add r5,r5,r2 @ h+=X[i] str r2,[sp,#14*4] eor r2,r11,r4 add r5,r5,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r10 add r5,r5,r12 @ h+=K256[i] eor r2,r2,r4 @ Ch(e,f,g) eor r0,r6,r6,ror#11 add r5,r5,r2 @ h+=Ch(e,f,g) #if 30==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? 
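@
@ Note: the "and rX,rX,#0xff / cmp rX,#0xf2" pair emitted for round 31
@ tests the low byte of the K256 word most recently fetched through r14.
@ Only the last constant, 0xc67178f2, ends in 0xf2, so the
@ "bne .Lrounds_16_xx" below runs the 16-round body for rounds 16-31,
@ 32-47 and 48-63 and then falls through without a separate loop counter.
@ After that the eight working registers are added back into the hash
@ state (ctx), r14 is rewound by 256 bytes to the start of K256, and the
@ outer .Loop repeats until inp reaches inp+len.
@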
#endif #if 30<15 # if __ARM_ARCH>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r6,r7 @ a^b, b^c in next round #else ldr r2,[sp,#0*4] @ from future BODY_16_xx eor r12,r6,r7 @ a^b, b^c in next round ldr r1,[sp,#13*4] @ from future BODY_16_xx #endif eor r0,r0,r6,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r9,r9,r5 @ d+=h eor r3,r3,r7 @ Maj(a,b,c) add r5,r5,r0,ror#2 @ h+=Sigma0(a) @ add r5,r5,r3 @ h+=Maj(a,b,c) @ ldr r2,[sp,#0*4] @ 31 @ ldr r1,[sp,#13*4] mov r0,r2,ror#7 add r5,r5,r3 @ h+=Maj(a,b,c) from the past mov r3,r1,ror#17 eor r0,r0,r2,ror#18 eor r3,r3,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#15*4] eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#8*4] add r3,r3,r0 eor r0,r9,r9,ror#5 @ from BODY_00_15 add r2,r2,r3 eor r0,r0,r9,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r3,[r14],#4 @ *K256++ add r4,r4,r2 @ h+=X[i] str r2,[sp,#15*4] eor r2,r10,r11 add r4,r4,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r9 add r4,r4,r3 @ h+=K256[i] eor r2,r2,r11 @ Ch(e,f,g) eor r0,r5,r5,ror#11 add r4,r4,r2 @ h+=Ch(e,f,g) #if 31==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? #endif #if 31<15 # if __ARM_ARCH>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r5,r6 @ a^b, b^c in next round #else ldr r2,[sp,#1*4] @ from future BODY_16_xx eor r3,r5,r6 @ a^b, b^c in next round ldr r1,[sp,#14*4] @ from future BODY_16_xx #endif eor r0,r0,r5,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r8,r8,r4 @ d+=h eor r12,r12,r6 @ Maj(a,b,c) add r4,r4,r0,ror#2 @ h+=Sigma0(a) @ add r4,r4,r12 @ h+=Maj(a,b,c) #if __ARM_ARCH>=7 ite eq @ Thumb2 thing, sanity check in ARM #endif ldreq r3,[sp,#16*4] @ pull ctx bne .Lrounds_16_xx add r4,r4,r12 @ h+=Maj(a,b,c) from the past ldr r0,[r3,#0] ldr r2,[r3,#4] ldr r12,[r3,#8] add r4,r4,r0 ldr r0,[r3,#12] add r5,r5,r2 ldr r2,[r3,#16] add r6,r6,r12 ldr r12,[r3,#20] add r7,r7,r0 ldr r0,[r3,#24] add r8,r8,r2 ldr r2,[r3,#28] add r9,r9,r12 ldr r1,[sp,#17*4] @ pull inp ldr r12,[sp,#18*4] @ pull inp+len add r10,r10,r0 add r11,r11,r2 stmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} cmp r1,r12 sub r14,r14,#256 @ rewind Ktbl bne .Loop add sp,sp,#19*4 @ destroy frame #if __ARM_ARCH>=5 ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} #else ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr} tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet .word 0xe12fff1e @ interoperable with Thumb ISA:-) #endif .size sha256_block_data_order_nohw,.-sha256_block_data_order_nohw #if __ARM_MAX_ARCH__>=7 .arch armv7-a .fpu neon .LK256_shortcut_neon: @ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode. #if defined(__thumb2__) .word K256-(.LK256_add_neon+4) #else .word K256-(.LK256_add_neon+8) #endif .globl sha256_block_data_order_neon .hidden sha256_block_data_order_neon .type sha256_block_data_order_neon,%function .align 5 .skip 16 sha256_block_data_order_neon: stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} sub r11,sp,#16*4+16 @ K256 is just at the boundary of being easily referenced by an ADR from @ this function. In Arm mode, when building with __ARM_ARCH=6, it does @ not fit. By moving code around, we could make it fit, but this is too @ fragile. For simplicity, just load the offset from @ .LK256_shortcut_neon. @ @ TODO(davidben): adrl would avoid a load, but clang-assembler does not @ support it. We might be able to emulate it with a macro, but Android's @ did not work when I tried it. 
@ https://android.googlesource.com/platform/ndk/+/refs/heads/main/docs/ClangMigration.md#arm ldr r14,.LK256_shortcut_neon .LK256_add_neon: add r14,pc,r14 bic r11,r11,#15 @ align for 128-bit stores mov r12,sp mov sp,r11 @ alloca add r2,r1,r2,lsl#6 @ len to point at the end of inp vld1.8 {q0},[r1]! vld1.8 {q1},[r1]! vld1.8 {q2},[r1]! vld1.8 {q3},[r1]! vld1.32 {q8},[r14,:128]! vld1.32 {q9},[r14,:128]! vld1.32 {q10},[r14,:128]! vld1.32 {q11},[r14,:128]! vrev32.8 q0,q0 @ yes, even on str r0,[sp,#64] vrev32.8 q1,q1 @ big-endian str r1,[sp,#68] mov r1,sp vrev32.8 q2,q2 str r2,[sp,#72] vrev32.8 q3,q3 str r12,[sp,#76] @ save original sp vadd.i32 q8,q8,q0 vadd.i32 q9,q9,q1 vst1.32 {q8},[r1,:128]! vadd.i32 q10,q10,q2 vst1.32 {q9},[r1,:128]! vadd.i32 q11,q11,q3 vst1.32 {q10},[r1,:128]! vst1.32 {q11},[r1,:128]! ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11} sub r1,r1,#64 ldr r2,[sp,#0] eor r12,r12,r12 eor r3,r5,r6 b .L_00_48 .align 4 .L_00_48: vext.8 q8,q0,q1,#4 add r11,r11,r2 eor r2,r9,r10 eor r0,r8,r8,ror#5 vext.8 q9,q2,q3,#4 add r4,r4,r12 and r2,r2,r8 eor r12,r0,r8,ror#19 vshr.u32 q10,q8,#7 eor r0,r4,r4,ror#11 eor r2,r2,r10 vadd.i32 q0,q0,q9 add r11,r11,r12,ror#6 eor r12,r4,r5 vshr.u32 q9,q8,#3 eor r0,r0,r4,ror#20 add r11,r11,r2 vsli.32 q10,q8,#25 ldr r2,[sp,#4] and r3,r3,r12 vshr.u32 q11,q8,#18 add r7,r7,r11 add r11,r11,r0,ror#2 eor r3,r3,r5 veor q9,q9,q10 add r10,r10,r2 vsli.32 q11,q8,#14 eor r2,r8,r9 eor r0,r7,r7,ror#5 vshr.u32 d24,d7,#17 add r11,r11,r3 and r2,r2,r7 veor q9,q9,q11 eor r3,r0,r7,ror#19 eor r0,r11,r11,ror#11 vsli.32 d24,d7,#15 eor r2,r2,r9 add r10,r10,r3,ror#6 vshr.u32 d25,d7,#10 eor r3,r11,r4 eor r0,r0,r11,ror#20 vadd.i32 q0,q0,q9 add r10,r10,r2 ldr r2,[sp,#8] veor d25,d25,d24 and r12,r12,r3 add r6,r6,r10 vshr.u32 d24,d7,#19 add r10,r10,r0,ror#2 eor r12,r12,r4 vsli.32 d24,d7,#13 add r9,r9,r2 eor r2,r7,r8 veor d25,d25,d24 eor r0,r6,r6,ror#5 add r10,r10,r12 vadd.i32 d0,d0,d25 and r2,r2,r6 eor r12,r0,r6,ror#19 vshr.u32 d24,d0,#17 eor r0,r10,r10,ror#11 eor r2,r2,r8 vsli.32 d24,d0,#15 add r9,r9,r12,ror#6 eor r12,r10,r11 vshr.u32 d25,d0,#10 eor r0,r0,r10,ror#20 add r9,r9,r2 veor d25,d25,d24 ldr r2,[sp,#12] and r3,r3,r12 vshr.u32 d24,d0,#19 add r5,r5,r9 add r9,r9,r0,ror#2 eor r3,r3,r11 vld1.32 {q8},[r14,:128]! add r8,r8,r2 vsli.32 d24,d0,#13 eor r2,r6,r7 eor r0,r5,r5,ror#5 veor d25,d25,d24 add r9,r9,r3 and r2,r2,r5 vadd.i32 d1,d1,d25 eor r3,r0,r5,ror#19 eor r0,r9,r9,ror#11 vadd.i32 q8,q8,q0 eor r2,r2,r7 add r8,r8,r3,ror#6 eor r3,r9,r10 eor r0,r0,r9,ror#20 add r8,r8,r2 ldr r2,[sp,#16] and r12,r12,r3 add r4,r4,r8 vst1.32 {q8},[r1,:128]! 
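@
@ Note: the NEON entry point reuses the scalar round code but prepares the
@ schedule with vector instructions.  A 64-byte block is loaded into q0-q3
@ (vld1.8), byte-swapped to big-endian word order (vrev32.8), and
@ pre-added to the first 16 K256 constants (vadd.i32 with q8-q11), so the
@ integer rounds in this function only read ready-made X[i]+K256[i] words
@ from the 16-word stack area with "ldr r2,[sp,#...]".  sp itself is
@ switched to a 128-bit-aligned alloca, and ctx, inp, the end-of-input
@ pointer and the original sp are kept at sp+64, +68, +72 and +76.
@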
add r8,r8,r0,ror#2 eor r12,r12,r10 vext.8 q8,q1,q2,#4 add r7,r7,r2 eor r2,r5,r6 eor r0,r4,r4,ror#5 vext.8 q9,q3,q0,#4 add r8,r8,r12 and r2,r2,r4 eor r12,r0,r4,ror#19 vshr.u32 q10,q8,#7 eor r0,r8,r8,ror#11 eor r2,r2,r6 vadd.i32 q1,q1,q9 add r7,r7,r12,ror#6 eor r12,r8,r9 vshr.u32 q9,q8,#3 eor r0,r0,r8,ror#20 add r7,r7,r2 vsli.32 q10,q8,#25 ldr r2,[sp,#20] and r3,r3,r12 vshr.u32 q11,q8,#18 add r11,r11,r7 add r7,r7,r0,ror#2 eor r3,r3,r9 veor q9,q9,q10 add r6,r6,r2 vsli.32 q11,q8,#14 eor r2,r4,r5 eor r0,r11,r11,ror#5 vshr.u32 d24,d1,#17 add r7,r7,r3 and r2,r2,r11 veor q9,q9,q11 eor r3,r0,r11,ror#19 eor r0,r7,r7,ror#11 vsli.32 d24,d1,#15 eor r2,r2,r5 add r6,r6,r3,ror#6 vshr.u32 d25,d1,#10 eor r3,r7,r8 eor r0,r0,r7,ror#20 vadd.i32 q1,q1,q9 add r6,r6,r2 ldr r2,[sp,#24] veor d25,d25,d24 and r12,r12,r3 add r10,r10,r6 vshr.u32 d24,d1,#19 add r6,r6,r0,ror#2 eor r12,r12,r8 vsli.32 d24,d1,#13 add r5,r5,r2 eor r2,r11,r4 veor d25,d25,d24 eor r0,r10,r10,ror#5 add r6,r6,r12 vadd.i32 d2,d2,d25 and r2,r2,r10 eor r12,r0,r10,ror#19 vshr.u32 d24,d2,#17 eor r0,r6,r6,ror#11 eor r2,r2,r4 vsli.32 d24,d2,#15 add r5,r5,r12,ror#6 eor r12,r6,r7 vshr.u32 d25,d2,#10 eor r0,r0,r6,ror#20 add r5,r5,r2 veor d25,d25,d24 ldr r2,[sp,#28] and r3,r3,r12 vshr.u32 d24,d2,#19 add r9,r9,r5 add r5,r5,r0,ror#2 eor r3,r3,r7 vld1.32 {q8},[r14,:128]! add r4,r4,r2 vsli.32 d24,d2,#13 eor r2,r10,r11 eor r0,r9,r9,ror#5 veor d25,d25,d24 add r5,r5,r3 and r2,r2,r9 vadd.i32 d3,d3,d25 eor r3,r0,r9,ror#19 eor r0,r5,r5,ror#11 vadd.i32 q8,q8,q1 eor r2,r2,r11 add r4,r4,r3,ror#6 eor r3,r5,r6 eor r0,r0,r5,ror#20 add r4,r4,r2 ldr r2,[sp,#32] and r12,r12,r3 add r8,r8,r4 vst1.32 {q8},[r1,:128]! add r4,r4,r0,ror#2 eor r12,r12,r6 vext.8 q8,q2,q3,#4 add r11,r11,r2 eor r2,r9,r10 eor r0,r8,r8,ror#5 vext.8 q9,q0,q1,#4 add r4,r4,r12 and r2,r2,r8 eor r12,r0,r8,ror#19 vshr.u32 q10,q8,#7 eor r0,r4,r4,ror#11 eor r2,r2,r10 vadd.i32 q2,q2,q9 add r11,r11,r12,ror#6 eor r12,r4,r5 vshr.u32 q9,q8,#3 eor r0,r0,r4,ror#20 add r11,r11,r2 vsli.32 q10,q8,#25 ldr r2,[sp,#36] and r3,r3,r12 vshr.u32 q11,q8,#18 add r7,r7,r11 add r11,r11,r0,ror#2 eor r3,r3,r5 veor q9,q9,q10 add r10,r10,r2 vsli.32 q11,q8,#14 eor r2,r8,r9 eor r0,r7,r7,ror#5 vshr.u32 d24,d3,#17 add r11,r11,r3 and r2,r2,r7 veor q9,q9,q11 eor r3,r0,r7,ror#19 eor r0,r11,r11,ror#11 vsli.32 d24,d3,#15 eor r2,r2,r9 add r10,r10,r3,ror#6 vshr.u32 d25,d3,#10 eor r3,r11,r4 eor r0,r0,r11,ror#20 vadd.i32 q2,q2,q9 add r10,r10,r2 ldr r2,[sp,#40] veor d25,d25,d24 and r12,r12,r3 add r6,r6,r10 vshr.u32 d24,d3,#19 add r10,r10,r0,ror#2 eor r12,r12,r4 vsli.32 d24,d3,#13 add r9,r9,r2 eor r2,r7,r8 veor d25,d25,d24 eor r0,r6,r6,ror#5 add r10,r10,r12 vadd.i32 d4,d4,d25 and r2,r2,r6 eor r12,r0,r6,ror#19 vshr.u32 d24,d4,#17 eor r0,r10,r10,ror#11 eor r2,r2,r8 vsli.32 d24,d4,#15 add r9,r9,r12,ror#6 eor r12,r10,r11 vshr.u32 d25,d4,#10 eor r0,r0,r10,ror#20 add r9,r9,r2 veor d25,d25,d24 ldr r2,[sp,#44] and r3,r3,r12 vshr.u32 d24,d4,#19 add r5,r5,r9 add r9,r9,r0,ror#2 eor r3,r3,r11 vld1.32 {q8},[r14,:128]! add r8,r8,r2 vsli.32 d24,d4,#13 eor r2,r6,r7 eor r0,r5,r5,ror#5 veor d25,d25,d24 add r9,r9,r3 and r2,r2,r5 vadd.i32 d5,d5,d25 eor r3,r0,r5,ror#19 eor r0,r9,r9,ror#11 vadd.i32 q8,q8,q2 eor r2,r2,r7 add r8,r8,r3,ror#6 eor r3,r9,r10 eor r0,r0,r9,ror#20 add r8,r8,r2 ldr r2,[sp,#48] and r12,r12,r3 add r4,r4,r8 vst1.32 {q8},[r1,:128]! 
add r8,r8,r0,ror#2 eor r12,r12,r10 vext.8 q8,q3,q0,#4 add r7,r7,r2 eor r2,r5,r6 eor r0,r4,r4,ror#5 vext.8 q9,q1,q2,#4 add r8,r8,r12 and r2,r2,r4 eor r12,r0,r4,ror#19 vshr.u32 q10,q8,#7 eor r0,r8,r8,ror#11 eor r2,r2,r6 vadd.i32 q3,q3,q9 add r7,r7,r12,ror#6 eor r12,r8,r9 vshr.u32 q9,q8,#3 eor r0,r0,r8,ror#20 add r7,r7,r2 vsli.32 q10,q8,#25 ldr r2,[sp,#52] and r3,r3,r12 vshr.u32 q11,q8,#18 add r11,r11,r7 add r7,r7,r0,ror#2 eor r3,r3,r9 veor q9,q9,q10 add r6,r6,r2 vsli.32 q11,q8,#14 eor r2,r4,r5 eor r0,r11,r11,ror#5 vshr.u32 d24,d5,#17 add r7,r7,r3 and r2,r2,r11 veor q9,q9,q11 eor r3,r0,r11,ror#19 eor r0,r7,r7,ror#11 vsli.32 d24,d5,#15 eor r2,r2,r5 add r6,r6,r3,ror#6 vshr.u32 d25,d5,#10 eor r3,r7,r8 eor r0,r0,r7,ror#20 vadd.i32 q3,q3,q9 add r6,r6,r2 ldr r2,[sp,#56] veor d25,d25,d24 and r12,r12,r3 add r10,r10,r6 vshr.u32 d24,d5,#19 add r6,r6,r0,ror#2 eor r12,r12,r8 vsli.32 d24,d5,#13 add r5,r5,r2 eor r2,r11,r4 veor d25,d25,d24 eor r0,r10,r10,ror#5 add r6,r6,r12 vadd.i32 d6,d6,d25 and r2,r2,r10 eor r12,r0,r10,ror#19 vshr.u32 d24,d6,#17 eor r0,r6,r6,ror#11 eor r2,r2,r4 vsli.32 d24,d6,#15 add r5,r5,r12,ror#6 eor r12,r6,r7 vshr.u32 d25,d6,#10 eor r0,r0,r6,ror#20 add r5,r5,r2 veor d25,d25,d24 ldr r2,[sp,#60] and r3,r3,r12 vshr.u32 d24,d6,#19 add r9,r9,r5 add r5,r5,r0,ror#2 eor r3,r3,r7 vld1.32 {q8},[r14,:128]! add r4,r4,r2 vsli.32 d24,d6,#13 eor r2,r10,r11 eor r0,r9,r9,ror#5 veor d25,d25,d24 add r5,r5,r3 and r2,r2,r9 vadd.i32 d7,d7,d25 eor r3,r0,r9,ror#19 eor r0,r5,r5,ror#11 vadd.i32 q8,q8,q3 eor r2,r2,r11 add r4,r4,r3,ror#6 eor r3,r5,r6 eor r0,r0,r5,ror#20 add r4,r4,r2 ldr r2,[r14] and r12,r12,r3 add r8,r8,r4 vst1.32 {q8},[r1,:128]! add r4,r4,r0,ror#2 eor r12,r12,r6 teq r2,#0 @ check for K256 terminator ldr r2,[sp,#0] sub r1,r1,#64 bne .L_00_48 ldr r1,[sp,#68] ldr r0,[sp,#72] sub r14,r14,#256 @ rewind r14 teq r1,r0 it eq subeq r1,r1,#64 @ avoid SEGV vld1.8 {q0},[r1]! @ load next input block vld1.8 {q1},[r1]! vld1.8 {q2},[r1]! vld1.8 {q3},[r1]! it ne strne r1,[sp,#68] mov r1,sp add r11,r11,r2 eor r2,r9,r10 eor r0,r8,r8,ror#5 add r4,r4,r12 vld1.32 {q8},[r14,:128]! and r2,r2,r8 eor r12,r0,r8,ror#19 eor r0,r4,r4,ror#11 eor r2,r2,r10 vrev32.8 q0,q0 add r11,r11,r12,ror#6 eor r12,r4,r5 eor r0,r0,r4,ror#20 add r11,r11,r2 vadd.i32 q8,q8,q0 ldr r2,[sp,#4] and r3,r3,r12 add r7,r7,r11 add r11,r11,r0,ror#2 eor r3,r3,r5 add r10,r10,r2 eor r2,r8,r9 eor r0,r7,r7,ror#5 add r11,r11,r3 and r2,r2,r7 eor r3,r0,r7,ror#19 eor r0,r11,r11,ror#11 eor r2,r2,r9 add r10,r10,r3,ror#6 eor r3,r11,r4 eor r0,r0,r11,ror#20 add r10,r10,r2 ldr r2,[sp,#8] and r12,r12,r3 add r6,r6,r10 add r10,r10,r0,ror#2 eor r12,r12,r4 add r9,r9,r2 eor r2,r7,r8 eor r0,r6,r6,ror#5 add r10,r10,r12 and r2,r2,r6 eor r12,r0,r6,ror#19 eor r0,r10,r10,ror#11 eor r2,r2,r8 add r9,r9,r12,ror#6 eor r12,r10,r11 eor r0,r0,r10,ror#20 add r9,r9,r2 ldr r2,[sp,#12] and r3,r3,r12 add r5,r5,r9 add r9,r9,r0,ror#2 eor r3,r3,r11 add r8,r8,r2 eor r2,r6,r7 eor r0,r5,r5,ror#5 add r9,r9,r3 and r2,r2,r5 eor r3,r0,r5,ror#19 eor r0,r9,r9,ror#11 eor r2,r2,r7 add r8,r8,r3,ror#6 eor r3,r9,r10 eor r0,r0,r9,ror#20 add r8,r8,r2 ldr r2,[sp,#16] and r12,r12,r3 add r4,r4,r8 add r8,r8,r0,ror#2 eor r12,r12,r10 vst1.32 {q8},[r1,:128]! add r7,r7,r2 eor r2,r5,r6 eor r0,r4,r4,ror#5 add r8,r8,r12 vld1.32 {q8},[r14,:128]! 
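@
@ Note: inside .L_00_48 the schedule advances four words per q register.
@ vext.8 gathers X[i+1..i+4]; vshr.u32/vsli.32 build the ror-7, ror-18 and
@ >>3 terms of sigma0 (a 32-bit rotate is emulated as a right shift merged
@ with a left-shift-insert of the complementary amount), and the d24/d25
@ half-registers do the same for sigma1 of the two newest words.  Each
@ updated vector is added to the next four K256 constants and stored for
@ the scalar rounds.  The loop above exits once the word read past the 64
@ constants is zero ("teq r2,#0 @ check for K256 terminator"), and r14 is
@ rewound by 256 bytes before the next block.
@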
and r2,r2,r4 eor r12,r0,r4,ror#19 eor r0,r8,r8,ror#11 eor r2,r2,r6 vrev32.8 q1,q1 add r7,r7,r12,ror#6 eor r12,r8,r9 eor r0,r0,r8,ror#20 add r7,r7,r2 vadd.i32 q8,q8,q1 ldr r2,[sp,#20] and r3,r3,r12 add r11,r11,r7 add r7,r7,r0,ror#2 eor r3,r3,r9 add r6,r6,r2 eor r2,r4,r5 eor r0,r11,r11,ror#5 add r7,r7,r3 and r2,r2,r11 eor r3,r0,r11,ror#19 eor r0,r7,r7,ror#11 eor r2,r2,r5 add r6,r6,r3,ror#6 eor r3,r7,r8 eor r0,r0,r7,ror#20 add r6,r6,r2 ldr r2,[sp,#24] and r12,r12,r3 add r10,r10,r6 add r6,r6,r0,ror#2 eor r12,r12,r8 add r5,r5,r2 eor r2,r11,r4 eor r0,r10,r10,ror#5 add r6,r6,r12 and r2,r2,r10 eor r12,r0,r10,ror#19 eor r0,r6,r6,ror#11 eor r2,r2,r4 add r5,r5,r12,ror#6 eor r12,r6,r7 eor r0,r0,r6,ror#20 add r5,r5,r2 ldr r2,[sp,#28] and r3,r3,r12 add r9,r9,r5 add r5,r5,r0,ror#2 eor r3,r3,r7 add r4,r4,r2 eor r2,r10,r11 eor r0,r9,r9,ror#5 add r5,r5,r3 and r2,r2,r9 eor r3,r0,r9,ror#19 eor r0,r5,r5,ror#11 eor r2,r2,r11 add r4,r4,r3,ror#6 eor r3,r5,r6 eor r0,r0,r5,ror#20 add r4,r4,r2 ldr r2,[sp,#32] and r12,r12,r3 add r8,r8,r4 add r4,r4,r0,ror#2 eor r12,r12,r6 vst1.32 {q8},[r1,:128]! add r11,r11,r2 eor r2,r9,r10 eor r0,r8,r8,ror#5 add r4,r4,r12 vld1.32 {q8},[r14,:128]! and r2,r2,r8 eor r12,r0,r8,ror#19 eor r0,r4,r4,ror#11 eor r2,r2,r10 vrev32.8 q2,q2 add r11,r11,r12,ror#6 eor r12,r4,r5 eor r0,r0,r4,ror#20 add r11,r11,r2 vadd.i32 q8,q8,q2 ldr r2,[sp,#36] and r3,r3,r12 add r7,r7,r11 add r11,r11,r0,ror#2 eor r3,r3,r5 add r10,r10,r2 eor r2,r8,r9 eor r0,r7,r7,ror#5 add r11,r11,r3 and r2,r2,r7 eor r3,r0,r7,ror#19 eor r0,r11,r11,ror#11 eor r2,r2,r9 add r10,r10,r3,ror#6 eor r3,r11,r4 eor r0,r0,r11,ror#20 add r10,r10,r2 ldr r2,[sp,#40] and r12,r12,r3 add r6,r6,r10 add r10,r10,r0,ror#2 eor r12,r12,r4 add r9,r9,r2 eor r2,r7,r8 eor r0,r6,r6,ror#5 add r10,r10,r12 and r2,r2,r6 eor r12,r0,r6,ror#19 eor r0,r10,r10,ror#11 eor r2,r2,r8 add r9,r9,r12,ror#6 eor r12,r10,r11 eor r0,r0,r10,ror#20 add r9,r9,r2 ldr r2,[sp,#44] and r3,r3,r12 add r5,r5,r9 add r9,r9,r0,ror#2 eor r3,r3,r11 add r8,r8,r2 eor r2,r6,r7 eor r0,r5,r5,ror#5 add r9,r9,r3 and r2,r2,r5 eor r3,r0,r5,ror#19 eor r0,r9,r9,ror#11 eor r2,r2,r7 add r8,r8,r3,ror#6 eor r3,r9,r10 eor r0,r0,r9,ror#20 add r8,r8,r2 ldr r2,[sp,#48] and r12,r12,r3 add r4,r4,r8 add r8,r8,r0,ror#2 eor r12,r12,r10 vst1.32 {q8},[r1,:128]! add r7,r7,r2 eor r2,r5,r6 eor r0,r4,r4,ror#5 add r8,r8,r12 vld1.32 {q8},[r14,:128]! and r2,r2,r4 eor r12,r0,r4,ror#19 eor r0,r8,r8,ror#11 eor r2,r2,r6 vrev32.8 q3,q3 add r7,r7,r12,ror#6 eor r12,r8,r9 eor r0,r0,r8,ror#20 add r7,r7,r2 vadd.i32 q8,q8,q3 ldr r2,[sp,#52] and r3,r3,r12 add r11,r11,r7 add r7,r7,r0,ror#2 eor r3,r3,r9 add r6,r6,r2 eor r2,r4,r5 eor r0,r11,r11,ror#5 add r7,r7,r3 and r2,r2,r11 eor r3,r0,r11,ror#19 eor r0,r7,r7,ror#11 eor r2,r2,r5 add r6,r6,r3,ror#6 eor r3,r7,r8 eor r0,r0,r7,ror#20 add r6,r6,r2 ldr r2,[sp,#56] and r12,r12,r3 add r10,r10,r6 add r6,r6,r0,ror#2 eor r12,r12,r8 add r5,r5,r2 eor r2,r11,r4 eor r0,r10,r10,ror#5 add r6,r6,r12 and r2,r2,r10 eor r12,r0,r10,ror#19 eor r0,r6,r6,ror#11 eor r2,r2,r4 add r5,r5,r12,ror#6 eor r12,r6,r7 eor r0,r0,r6,ror#20 add r5,r5,r2 ldr r2,[sp,#60] and r3,r3,r12 add r9,r9,r5 add r5,r5,r0,ror#2 eor r3,r3,r7 add r4,r4,r2 eor r2,r10,r11 eor r0,r9,r9,ror#5 add r5,r5,r3 and r2,r2,r9 eor r3,r0,r9,ror#19 eor r0,r5,r5,ror#11 eor r2,r2,r11 add r4,r4,r3,ror#6 eor r3,r5,r6 eor r0,r0,r5,ror#20 add r4,r4,r2 ldr r2,[sp,#64] and r12,r12,r3 add r8,r8,r4 add r4,r4,r0,ror#2 eor r12,r12,r6 vst1.32 {q8},[r1,:128]! 
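@
@ Note: the 16 rounds above needed no schedule update, so they are
@ interleaved with loading the next 64-byte input block, byte-swapping it
@ and pre-adding K256; the earlier "subeq r1,r1,#64 @ avoid SEGV" makes
@ that speculative load re-read the final block instead of running past
@ the end of the input.  Below, the working variables are added back into
@ the context ("@ accumulate"), and the condition flags from the earlier
@ "teq r1,r0" steer the it/ittte blocks either into another pass over
@ .L_00_48 or into restoring the caller's sp and returning.
@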
ldr r0,[r2,#0] add r4,r4,r12 @ h+=Maj(a,b,c) from the past ldr r12,[r2,#4] ldr r3,[r2,#8] ldr r1,[r2,#12] add r4,r4,r0 @ accumulate ldr r0,[r2,#16] add r5,r5,r12 ldr r12,[r2,#20] add r6,r6,r3 ldr r3,[r2,#24] add r7,r7,r1 ldr r1,[r2,#28] add r8,r8,r0 str r4,[r2],#4 add r9,r9,r12 str r5,[r2],#4 add r10,r10,r3 str r6,[r2],#4 add r11,r11,r1 str r7,[r2],#4 stmia r2,{r8,r9,r10,r11} ittte ne movne r1,sp ldrne r2,[sp,#0] eorne r12,r12,r12 ldreq sp,[sp,#76] @ restore original sp itt ne eorne r3,r5,r6 bne .L_00_48 ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} .size sha256_block_data_order_neon,.-sha256_block_data_order_neon #endif .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) ring-0.17.14/pregenerated/sha256-armv8-ios64.S000064400000000000000000001023461046102023000165410ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) // Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // ==================================================================== // Written by Andy Polyakov for the OpenSSL // project. // ==================================================================== // // SHA256/512 for ARMv8. // // Performance in cycles per processed byte and improvement coefficient // over code generated with "default" compiler: // // SHA256-hw SHA256(*) SHA512 // Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) // Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) // Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) // Denver 2.01 10.5 (+26%) 6.70 (+8%) // X-Gene 20.0 (+100%) 12.8 (+300%(***)) // Mongoose 2.36 13.0 (+50%) 8.36 (+33%) // Kryo 1.92 17.4 (+30%) 11.2 (+8%) // // (*) Software SHA256 results are of lesser relevance, presented // mostly for informational purposes. // (**) The result is a trade-off: it's possible to improve it by // 10% (or by 1 cycle per round), but at the cost of 20% loss // on Cortex-A53 (or by 4 cycles per round). // (***) Super-impressive coefficients over gcc-generated code are // indication of some compiler "pathology", most notably code // generated with -mgeneral-regs-only is significantly faster // and the gap is only 40-90%. #ifndef __KERNEL__ #endif .text .globl _sha256_block_data_order_nohw .private_extern _sha256_block_data_order_nohw .align 6 _sha256_block_data_order_nohw: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-128]! 
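//
// Note: this AArch64 implementation mirrors the 32-bit code above.  After
// the prologue the working variables a..h are loaded into w20-w27, x30
// walks the K256 table (located with adrp/add), and message words are
// fetched in pairs with ldp and byte-swapped with rev on little-endian
// builds.  Each round computes Sigma1(e), Ch(e,f,g), Sigma0(a) and
// Maj(a,b,c) as annotated in the per-round comments, with the
// "h+=Sigma0(a)" add deferred into the next round.  From Loop_16_xx on,
// the 16-word schedule window lives almost entirely in registers
// (w3-w15 and w0-w2), with four words at a time parked in the 16-byte
// stack area reserved by "sub sp,sp,#4*4".
//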
add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] sub sp,sp,#4*4 ldp w20,w21,[x0] // load context ldp w22,w23,[x0,#2*4] ldp w24,w25,[x0,#4*4] add x2,x1,x2,lsl#6 // end of input ldp w26,w27,[x0,#6*4] adrp x30,LK256@PAGE add x30,x30,LK256@PAGEOFF stp x0,x2,[x29,#96] Loop: ldp w3,w4,[x1],#2*4 ldr w19,[x30],#4 // *K++ eor w28,w21,w22 // magic seed str x1,[x29,#112] #ifndef __AARCH64EB__ rev w3,w3 // 0 #endif ror w16,w24,#6 add w27,w27,w19 // h+=K[i] eor w6,w24,w24,ror#14 and w17,w25,w24 bic w19,w26,w24 add w27,w27,w3 // h+=X[i] orr w17,w17,w19 // Ch(e,f,g) eor w19,w20,w21 // a^b, b^c in next round eor w16,w16,w6,ror#11 // Sigma1(e) ror w6,w20,#2 add w27,w27,w17 // h+=Ch(e,f,g) eor w17,w20,w20,ror#9 add w27,w27,w16 // h+=Sigma1(e) and w28,w28,w19 // (b^c)&=(a^b) add w23,w23,w27 // d+=h eor w28,w28,w21 // Maj(a,b,c) eor w17,w6,w17,ror#13 // Sigma0(a) add w27,w27,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round //add w27,w27,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w4,w4 // 1 #endif ldp w5,w6,[x1],#2*4 add w27,w27,w17 // h+=Sigma0(a) ror w16,w23,#6 add w26,w26,w28 // h+=K[i] eor w7,w23,w23,ror#14 and w17,w24,w23 bic w28,w25,w23 add w26,w26,w4 // h+=X[i] orr w17,w17,w28 // Ch(e,f,g) eor w28,w27,w20 // a^b, b^c in next round eor w16,w16,w7,ror#11 // Sigma1(e) ror w7,w27,#2 add w26,w26,w17 // h+=Ch(e,f,g) eor w17,w27,w27,ror#9 add w26,w26,w16 // h+=Sigma1(e) and w19,w19,w28 // (b^c)&=(a^b) add w22,w22,w26 // d+=h eor w19,w19,w20 // Maj(a,b,c) eor w17,w7,w17,ror#13 // Sigma0(a) add w26,w26,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round //add w26,w26,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w5,w5 // 2 #endif add w26,w26,w17 // h+=Sigma0(a) ror w16,w22,#6 add w25,w25,w19 // h+=K[i] eor w8,w22,w22,ror#14 and w17,w23,w22 bic w19,w24,w22 add w25,w25,w5 // h+=X[i] orr w17,w17,w19 // Ch(e,f,g) eor w19,w26,w27 // a^b, b^c in next round eor w16,w16,w8,ror#11 // Sigma1(e) ror w8,w26,#2 add w25,w25,w17 // h+=Ch(e,f,g) eor w17,w26,w26,ror#9 add w25,w25,w16 // h+=Sigma1(e) and w28,w28,w19 // (b^c)&=(a^b) add w21,w21,w25 // d+=h eor w28,w28,w27 // Maj(a,b,c) eor w17,w8,w17,ror#13 // Sigma0(a) add w25,w25,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round //add w25,w25,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w6,w6 // 3 #endif ldp w7,w8,[x1],#2*4 add w25,w25,w17 // h+=Sigma0(a) ror w16,w21,#6 add w24,w24,w28 // h+=K[i] eor w9,w21,w21,ror#14 and w17,w22,w21 bic w28,w23,w21 add w24,w24,w6 // h+=X[i] orr w17,w17,w28 // Ch(e,f,g) eor w28,w25,w26 // a^b, b^c in next round eor w16,w16,w9,ror#11 // Sigma1(e) ror w9,w25,#2 add w24,w24,w17 // h+=Ch(e,f,g) eor w17,w25,w25,ror#9 add w24,w24,w16 // h+=Sigma1(e) and w19,w19,w28 // (b^c)&=(a^b) add w20,w20,w24 // d+=h eor w19,w19,w26 // Maj(a,b,c) eor w17,w9,w17,ror#13 // Sigma0(a) add w24,w24,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round //add w24,w24,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w7,w7 // 4 #endif add w24,w24,w17 // h+=Sigma0(a) ror w16,w20,#6 add w23,w23,w19 // h+=K[i] eor w10,w20,w20,ror#14 and w17,w21,w20 bic w19,w22,w20 add w23,w23,w7 // h+=X[i] orr w17,w17,w19 // Ch(e,f,g) eor w19,w24,w25 // a^b, b^c in next round eor w16,w16,w10,ror#11 // Sigma1(e) ror w10,w24,#2 add w23,w23,w17 // h+=Ch(e,f,g) eor w17,w24,w24,ror#9 add w23,w23,w16 // h+=Sigma1(e) and w28,w28,w19 // (b^c)&=(a^b) add w27,w27,w23 // d+=h eor w28,w28,w25 // Maj(a,b,c) eor w17,w10,w17,ror#13 // Sigma0(a) add w23,w23,w28 // h+=Maj(a,b,c) ldr 
w28,[x30],#4 // *K++, w19 in next round //add w23,w23,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w8,w8 // 5 #endif ldp w9,w10,[x1],#2*4 add w23,w23,w17 // h+=Sigma0(a) ror w16,w27,#6 add w22,w22,w28 // h+=K[i] eor w11,w27,w27,ror#14 and w17,w20,w27 bic w28,w21,w27 add w22,w22,w8 // h+=X[i] orr w17,w17,w28 // Ch(e,f,g) eor w28,w23,w24 // a^b, b^c in next round eor w16,w16,w11,ror#11 // Sigma1(e) ror w11,w23,#2 add w22,w22,w17 // h+=Ch(e,f,g) eor w17,w23,w23,ror#9 add w22,w22,w16 // h+=Sigma1(e) and w19,w19,w28 // (b^c)&=(a^b) add w26,w26,w22 // d+=h eor w19,w19,w24 // Maj(a,b,c) eor w17,w11,w17,ror#13 // Sigma0(a) add w22,w22,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round //add w22,w22,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w9,w9 // 6 #endif add w22,w22,w17 // h+=Sigma0(a) ror w16,w26,#6 add w21,w21,w19 // h+=K[i] eor w12,w26,w26,ror#14 and w17,w27,w26 bic w19,w20,w26 add w21,w21,w9 // h+=X[i] orr w17,w17,w19 // Ch(e,f,g) eor w19,w22,w23 // a^b, b^c in next round eor w16,w16,w12,ror#11 // Sigma1(e) ror w12,w22,#2 add w21,w21,w17 // h+=Ch(e,f,g) eor w17,w22,w22,ror#9 add w21,w21,w16 // h+=Sigma1(e) and w28,w28,w19 // (b^c)&=(a^b) add w25,w25,w21 // d+=h eor w28,w28,w23 // Maj(a,b,c) eor w17,w12,w17,ror#13 // Sigma0(a) add w21,w21,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round //add w21,w21,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w10,w10 // 7 #endif ldp w11,w12,[x1],#2*4 add w21,w21,w17 // h+=Sigma0(a) ror w16,w25,#6 add w20,w20,w28 // h+=K[i] eor w13,w25,w25,ror#14 and w17,w26,w25 bic w28,w27,w25 add w20,w20,w10 // h+=X[i] orr w17,w17,w28 // Ch(e,f,g) eor w28,w21,w22 // a^b, b^c in next round eor w16,w16,w13,ror#11 // Sigma1(e) ror w13,w21,#2 add w20,w20,w17 // h+=Ch(e,f,g) eor w17,w21,w21,ror#9 add w20,w20,w16 // h+=Sigma1(e) and w19,w19,w28 // (b^c)&=(a^b) add w24,w24,w20 // d+=h eor w19,w19,w22 // Maj(a,b,c) eor w17,w13,w17,ror#13 // Sigma0(a) add w20,w20,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round //add w20,w20,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w11,w11 // 8 #endif add w20,w20,w17 // h+=Sigma0(a) ror w16,w24,#6 add w27,w27,w19 // h+=K[i] eor w14,w24,w24,ror#14 and w17,w25,w24 bic w19,w26,w24 add w27,w27,w11 // h+=X[i] orr w17,w17,w19 // Ch(e,f,g) eor w19,w20,w21 // a^b, b^c in next round eor w16,w16,w14,ror#11 // Sigma1(e) ror w14,w20,#2 add w27,w27,w17 // h+=Ch(e,f,g) eor w17,w20,w20,ror#9 add w27,w27,w16 // h+=Sigma1(e) and w28,w28,w19 // (b^c)&=(a^b) add w23,w23,w27 // d+=h eor w28,w28,w21 // Maj(a,b,c) eor w17,w14,w17,ror#13 // Sigma0(a) add w27,w27,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round //add w27,w27,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w12,w12 // 9 #endif ldp w13,w14,[x1],#2*4 add w27,w27,w17 // h+=Sigma0(a) ror w16,w23,#6 add w26,w26,w28 // h+=K[i] eor w15,w23,w23,ror#14 and w17,w24,w23 bic w28,w25,w23 add w26,w26,w12 // h+=X[i] orr w17,w17,w28 // Ch(e,f,g) eor w28,w27,w20 // a^b, b^c in next round eor w16,w16,w15,ror#11 // Sigma1(e) ror w15,w27,#2 add w26,w26,w17 // h+=Ch(e,f,g) eor w17,w27,w27,ror#9 add w26,w26,w16 // h+=Sigma1(e) and w19,w19,w28 // (b^c)&=(a^b) add w22,w22,w26 // d+=h eor w19,w19,w20 // Maj(a,b,c) eor w17,w15,w17,ror#13 // Sigma0(a) add w26,w26,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round //add w26,w26,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w13,w13 // 10 #endif add w26,w26,w17 // h+=Sigma0(a) ror w16,w22,#6 add w25,w25,w19 // h+=K[i] eor w0,w22,w22,ror#14 and w17,w23,w22 bic w19,w24,w22 add w25,w25,w13 // h+=X[i] orr 
w17,w17,w19 // Ch(e,f,g) eor w19,w26,w27 // a^b, b^c in next round eor w16,w16,w0,ror#11 // Sigma1(e) ror w0,w26,#2 add w25,w25,w17 // h+=Ch(e,f,g) eor w17,w26,w26,ror#9 add w25,w25,w16 // h+=Sigma1(e) and w28,w28,w19 // (b^c)&=(a^b) add w21,w21,w25 // d+=h eor w28,w28,w27 // Maj(a,b,c) eor w17,w0,w17,ror#13 // Sigma0(a) add w25,w25,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round //add w25,w25,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w14,w14 // 11 #endif ldp w15,w0,[x1],#2*4 add w25,w25,w17 // h+=Sigma0(a) str w6,[sp,#12] ror w16,w21,#6 add w24,w24,w28 // h+=K[i] eor w6,w21,w21,ror#14 and w17,w22,w21 bic w28,w23,w21 add w24,w24,w14 // h+=X[i] orr w17,w17,w28 // Ch(e,f,g) eor w28,w25,w26 // a^b, b^c in next round eor w16,w16,w6,ror#11 // Sigma1(e) ror w6,w25,#2 add w24,w24,w17 // h+=Ch(e,f,g) eor w17,w25,w25,ror#9 add w24,w24,w16 // h+=Sigma1(e) and w19,w19,w28 // (b^c)&=(a^b) add w20,w20,w24 // d+=h eor w19,w19,w26 // Maj(a,b,c) eor w17,w6,w17,ror#13 // Sigma0(a) add w24,w24,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round //add w24,w24,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w15,w15 // 12 #endif add w24,w24,w17 // h+=Sigma0(a) str w7,[sp,#0] ror w16,w20,#6 add w23,w23,w19 // h+=K[i] eor w7,w20,w20,ror#14 and w17,w21,w20 bic w19,w22,w20 add w23,w23,w15 // h+=X[i] orr w17,w17,w19 // Ch(e,f,g) eor w19,w24,w25 // a^b, b^c in next round eor w16,w16,w7,ror#11 // Sigma1(e) ror w7,w24,#2 add w23,w23,w17 // h+=Ch(e,f,g) eor w17,w24,w24,ror#9 add w23,w23,w16 // h+=Sigma1(e) and w28,w28,w19 // (b^c)&=(a^b) add w27,w27,w23 // d+=h eor w28,w28,w25 // Maj(a,b,c) eor w17,w7,w17,ror#13 // Sigma0(a) add w23,w23,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round //add w23,w23,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w0,w0 // 13 #endif ldp w1,w2,[x1] add w23,w23,w17 // h+=Sigma0(a) str w8,[sp,#4] ror w16,w27,#6 add w22,w22,w28 // h+=K[i] eor w8,w27,w27,ror#14 and w17,w20,w27 bic w28,w21,w27 add w22,w22,w0 // h+=X[i] orr w17,w17,w28 // Ch(e,f,g) eor w28,w23,w24 // a^b, b^c in next round eor w16,w16,w8,ror#11 // Sigma1(e) ror w8,w23,#2 add w22,w22,w17 // h+=Ch(e,f,g) eor w17,w23,w23,ror#9 add w22,w22,w16 // h+=Sigma1(e) and w19,w19,w28 // (b^c)&=(a^b) add w26,w26,w22 // d+=h eor w19,w19,w24 // Maj(a,b,c) eor w17,w8,w17,ror#13 // Sigma0(a) add w22,w22,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round //add w22,w22,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w1,w1 // 14 #endif ldr w6,[sp,#12] add w22,w22,w17 // h+=Sigma0(a) str w9,[sp,#8] ror w16,w26,#6 add w21,w21,w19 // h+=K[i] eor w9,w26,w26,ror#14 and w17,w27,w26 bic w19,w20,w26 add w21,w21,w1 // h+=X[i] orr w17,w17,w19 // Ch(e,f,g) eor w19,w22,w23 // a^b, b^c in next round eor w16,w16,w9,ror#11 // Sigma1(e) ror w9,w22,#2 add w21,w21,w17 // h+=Ch(e,f,g) eor w17,w22,w22,ror#9 add w21,w21,w16 // h+=Sigma1(e) and w28,w28,w19 // (b^c)&=(a^b) add w25,w25,w21 // d+=h eor w28,w28,w23 // Maj(a,b,c) eor w17,w9,w17,ror#13 // Sigma0(a) add w21,w21,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round //add w21,w21,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w2,w2 // 15 #endif ldr w7,[sp,#0] add w21,w21,w17 // h+=Sigma0(a) str w10,[sp,#12] ror w16,w25,#6 add w20,w20,w28 // h+=K[i] ror w9,w4,#7 and w17,w26,w25 ror w8,w1,#17 bic w28,w27,w25 ror w10,w21,#2 add w20,w20,w2 // h+=X[i] eor w16,w16,w25,ror#11 eor w9,w9,w4,ror#18 orr w17,w17,w28 // Ch(e,f,g) eor w28,w21,w22 // a^b, b^c in next round eor w16,w16,w25,ror#25 // Sigma1(e) eor w10,w10,w21,ror#13 add w20,w20,w17 // 
h+=Ch(e,f,g) and w19,w19,w28 // (b^c)&=(a^b) eor w8,w8,w1,ror#19 eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) add w20,w20,w16 // h+=Sigma1(e) eor w19,w19,w22 // Maj(a,b,c) eor w17,w10,w21,ror#22 // Sigma0(a) eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) add w3,w3,w12 add w24,w24,w20 // d+=h add w20,w20,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round add w3,w3,w9 add w20,w20,w17 // h+=Sigma0(a) add w3,w3,w8 Loop_16_xx: ldr w8,[sp,#4] str w11,[sp,#0] ror w16,w24,#6 add w27,w27,w19 // h+=K[i] ror w10,w5,#7 and w17,w25,w24 ror w9,w2,#17 bic w19,w26,w24 ror w11,w20,#2 add w27,w27,w3 // h+=X[i] eor w16,w16,w24,ror#11 eor w10,w10,w5,ror#18 orr w17,w17,w19 // Ch(e,f,g) eor w19,w20,w21 // a^b, b^c in next round eor w16,w16,w24,ror#25 // Sigma1(e) eor w11,w11,w20,ror#13 add w27,w27,w17 // h+=Ch(e,f,g) and w28,w28,w19 // (b^c)&=(a^b) eor w9,w9,w2,ror#19 eor w10,w10,w5,lsr#3 // sigma0(X[i+1]) add w27,w27,w16 // h+=Sigma1(e) eor w28,w28,w21 // Maj(a,b,c) eor w17,w11,w20,ror#22 // Sigma0(a) eor w9,w9,w2,lsr#10 // sigma1(X[i+14]) add w4,w4,w13 add w23,w23,w27 // d+=h add w27,w27,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round add w4,w4,w10 add w27,w27,w17 // h+=Sigma0(a) add w4,w4,w9 ldr w9,[sp,#8] str w12,[sp,#4] ror w16,w23,#6 add w26,w26,w28 // h+=K[i] ror w11,w6,#7 and w17,w24,w23 ror w10,w3,#17 bic w28,w25,w23 ror w12,w27,#2 add w26,w26,w4 // h+=X[i] eor w16,w16,w23,ror#11 eor w11,w11,w6,ror#18 orr w17,w17,w28 // Ch(e,f,g) eor w28,w27,w20 // a^b, b^c in next round eor w16,w16,w23,ror#25 // Sigma1(e) eor w12,w12,w27,ror#13 add w26,w26,w17 // h+=Ch(e,f,g) and w19,w19,w28 // (b^c)&=(a^b) eor w10,w10,w3,ror#19 eor w11,w11,w6,lsr#3 // sigma0(X[i+1]) add w26,w26,w16 // h+=Sigma1(e) eor w19,w19,w20 // Maj(a,b,c) eor w17,w12,w27,ror#22 // Sigma0(a) eor w10,w10,w3,lsr#10 // sigma1(X[i+14]) add w5,w5,w14 add w22,w22,w26 // d+=h add w26,w26,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round add w5,w5,w11 add w26,w26,w17 // h+=Sigma0(a) add w5,w5,w10 ldr w10,[sp,#12] str w13,[sp,#8] ror w16,w22,#6 add w25,w25,w19 // h+=K[i] ror w12,w7,#7 and w17,w23,w22 ror w11,w4,#17 bic w19,w24,w22 ror w13,w26,#2 add w25,w25,w5 // h+=X[i] eor w16,w16,w22,ror#11 eor w12,w12,w7,ror#18 orr w17,w17,w19 // Ch(e,f,g) eor w19,w26,w27 // a^b, b^c in next round eor w16,w16,w22,ror#25 // Sigma1(e) eor w13,w13,w26,ror#13 add w25,w25,w17 // h+=Ch(e,f,g) and w28,w28,w19 // (b^c)&=(a^b) eor w11,w11,w4,ror#19 eor w12,w12,w7,lsr#3 // sigma0(X[i+1]) add w25,w25,w16 // h+=Sigma1(e) eor w28,w28,w27 // Maj(a,b,c) eor w17,w13,w26,ror#22 // Sigma0(a) eor w11,w11,w4,lsr#10 // sigma1(X[i+14]) add w6,w6,w15 add w21,w21,w25 // d+=h add w25,w25,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round add w6,w6,w12 add w25,w25,w17 // h+=Sigma0(a) add w6,w6,w11 ldr w11,[sp,#0] str w14,[sp,#12] ror w16,w21,#6 add w24,w24,w28 // h+=K[i] ror w13,w8,#7 and w17,w22,w21 ror w12,w5,#17 bic w28,w23,w21 ror w14,w25,#2 add w24,w24,w6 // h+=X[i] eor w16,w16,w21,ror#11 eor w13,w13,w8,ror#18 orr w17,w17,w28 // Ch(e,f,g) eor w28,w25,w26 // a^b, b^c in next round eor w16,w16,w21,ror#25 // Sigma1(e) eor w14,w14,w25,ror#13 add w24,w24,w17 // h+=Ch(e,f,g) and w19,w19,w28 // (b^c)&=(a^b) eor w12,w12,w5,ror#19 eor w13,w13,w8,lsr#3 // sigma0(X[i+1]) add w24,w24,w16 // h+=Sigma1(e) eor w19,w19,w26 // Maj(a,b,c) eor w17,w14,w25,ror#22 // Sigma0(a) eor w12,w12,w5,lsr#10 // sigma1(X[i+14]) add w7,w7,w0 add w20,w20,w24 // d+=h add w24,w24,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round add w7,w7,w13 add w24,w24,w17 // 
h+=Sigma0(a) add w7,w7,w12 ldr w12,[sp,#4] str w15,[sp,#0] ror w16,w20,#6 add w23,w23,w19 // h+=K[i] ror w14,w9,#7 and w17,w21,w20 ror w13,w6,#17 bic w19,w22,w20 ror w15,w24,#2 add w23,w23,w7 // h+=X[i] eor w16,w16,w20,ror#11 eor w14,w14,w9,ror#18 orr w17,w17,w19 // Ch(e,f,g) eor w19,w24,w25 // a^b, b^c in next round eor w16,w16,w20,ror#25 // Sigma1(e) eor w15,w15,w24,ror#13 add w23,w23,w17 // h+=Ch(e,f,g) and w28,w28,w19 // (b^c)&=(a^b) eor w13,w13,w6,ror#19 eor w14,w14,w9,lsr#3 // sigma0(X[i+1]) add w23,w23,w16 // h+=Sigma1(e) eor w28,w28,w25 // Maj(a,b,c) eor w17,w15,w24,ror#22 // Sigma0(a) eor w13,w13,w6,lsr#10 // sigma1(X[i+14]) add w8,w8,w1 add w27,w27,w23 // d+=h add w23,w23,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round add w8,w8,w14 add w23,w23,w17 // h+=Sigma0(a) add w8,w8,w13 ldr w13,[sp,#8] str w0,[sp,#4] ror w16,w27,#6 add w22,w22,w28 // h+=K[i] ror w15,w10,#7 and w17,w20,w27 ror w14,w7,#17 bic w28,w21,w27 ror w0,w23,#2 add w22,w22,w8 // h+=X[i] eor w16,w16,w27,ror#11 eor w15,w15,w10,ror#18 orr w17,w17,w28 // Ch(e,f,g) eor w28,w23,w24 // a^b, b^c in next round eor w16,w16,w27,ror#25 // Sigma1(e) eor w0,w0,w23,ror#13 add w22,w22,w17 // h+=Ch(e,f,g) and w19,w19,w28 // (b^c)&=(a^b) eor w14,w14,w7,ror#19 eor w15,w15,w10,lsr#3 // sigma0(X[i+1]) add w22,w22,w16 // h+=Sigma1(e) eor w19,w19,w24 // Maj(a,b,c) eor w17,w0,w23,ror#22 // Sigma0(a) eor w14,w14,w7,lsr#10 // sigma1(X[i+14]) add w9,w9,w2 add w26,w26,w22 // d+=h add w22,w22,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round add w9,w9,w15 add w22,w22,w17 // h+=Sigma0(a) add w9,w9,w14 ldr w14,[sp,#12] str w1,[sp,#8] ror w16,w26,#6 add w21,w21,w19 // h+=K[i] ror w0,w11,#7 and w17,w27,w26 ror w15,w8,#17 bic w19,w20,w26 ror w1,w22,#2 add w21,w21,w9 // h+=X[i] eor w16,w16,w26,ror#11 eor w0,w0,w11,ror#18 orr w17,w17,w19 // Ch(e,f,g) eor w19,w22,w23 // a^b, b^c in next round eor w16,w16,w26,ror#25 // Sigma1(e) eor w1,w1,w22,ror#13 add w21,w21,w17 // h+=Ch(e,f,g) and w28,w28,w19 // (b^c)&=(a^b) eor w15,w15,w8,ror#19 eor w0,w0,w11,lsr#3 // sigma0(X[i+1]) add w21,w21,w16 // h+=Sigma1(e) eor w28,w28,w23 // Maj(a,b,c) eor w17,w1,w22,ror#22 // Sigma0(a) eor w15,w15,w8,lsr#10 // sigma1(X[i+14]) add w10,w10,w3 add w25,w25,w21 // d+=h add w21,w21,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round add w10,w10,w0 add w21,w21,w17 // h+=Sigma0(a) add w10,w10,w15 ldr w15,[sp,#0] str w2,[sp,#12] ror w16,w25,#6 add w20,w20,w28 // h+=K[i] ror w1,w12,#7 and w17,w26,w25 ror w0,w9,#17 bic w28,w27,w25 ror w2,w21,#2 add w20,w20,w10 // h+=X[i] eor w16,w16,w25,ror#11 eor w1,w1,w12,ror#18 orr w17,w17,w28 // Ch(e,f,g) eor w28,w21,w22 // a^b, b^c in next round eor w16,w16,w25,ror#25 // Sigma1(e) eor w2,w2,w21,ror#13 add w20,w20,w17 // h+=Ch(e,f,g) and w19,w19,w28 // (b^c)&=(a^b) eor w0,w0,w9,ror#19 eor w1,w1,w12,lsr#3 // sigma0(X[i+1]) add w20,w20,w16 // h+=Sigma1(e) eor w19,w19,w22 // Maj(a,b,c) eor w17,w2,w21,ror#22 // Sigma0(a) eor w0,w0,w9,lsr#10 // sigma1(X[i+14]) add w11,w11,w4 add w24,w24,w20 // d+=h add w20,w20,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round add w11,w11,w1 add w20,w20,w17 // h+=Sigma0(a) add w11,w11,w0 ldr w0,[sp,#4] str w3,[sp,#0] ror w16,w24,#6 add w27,w27,w19 // h+=K[i] ror w2,w13,#7 and w17,w25,w24 ror w1,w10,#17 bic w19,w26,w24 ror w3,w20,#2 add w27,w27,w11 // h+=X[i] eor w16,w16,w24,ror#11 eor w2,w2,w13,ror#18 orr w17,w17,w19 // Ch(e,f,g) eor w19,w20,w21 // a^b, b^c in next round eor w16,w16,w24,ror#25 // Sigma1(e) eor w3,w3,w20,ror#13 add w27,w27,w17 // h+=Ch(e,f,g) 
and w28,w28,w19 // (b^c)&=(a^b) eor w1,w1,w10,ror#19 eor w2,w2,w13,lsr#3 // sigma0(X[i+1]) add w27,w27,w16 // h+=Sigma1(e) eor w28,w28,w21 // Maj(a,b,c) eor w17,w3,w20,ror#22 // Sigma0(a) eor w1,w1,w10,lsr#10 // sigma1(X[i+14]) add w12,w12,w5 add w23,w23,w27 // d+=h add w27,w27,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round add w12,w12,w2 add w27,w27,w17 // h+=Sigma0(a) add w12,w12,w1 ldr w1,[sp,#8] str w4,[sp,#4] ror w16,w23,#6 add w26,w26,w28 // h+=K[i] ror w3,w14,#7 and w17,w24,w23 ror w2,w11,#17 bic w28,w25,w23 ror w4,w27,#2 add w26,w26,w12 // h+=X[i] eor w16,w16,w23,ror#11 eor w3,w3,w14,ror#18 orr w17,w17,w28 // Ch(e,f,g) eor w28,w27,w20 // a^b, b^c in next round eor w16,w16,w23,ror#25 // Sigma1(e) eor w4,w4,w27,ror#13 add w26,w26,w17 // h+=Ch(e,f,g) and w19,w19,w28 // (b^c)&=(a^b) eor w2,w2,w11,ror#19 eor w3,w3,w14,lsr#3 // sigma0(X[i+1]) add w26,w26,w16 // h+=Sigma1(e) eor w19,w19,w20 // Maj(a,b,c) eor w17,w4,w27,ror#22 // Sigma0(a) eor w2,w2,w11,lsr#10 // sigma1(X[i+14]) add w13,w13,w6 add w22,w22,w26 // d+=h add w26,w26,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round add w13,w13,w3 add w26,w26,w17 // h+=Sigma0(a) add w13,w13,w2 ldr w2,[sp,#12] str w5,[sp,#8] ror w16,w22,#6 add w25,w25,w19 // h+=K[i] ror w4,w15,#7 and w17,w23,w22 ror w3,w12,#17 bic w19,w24,w22 ror w5,w26,#2 add w25,w25,w13 // h+=X[i] eor w16,w16,w22,ror#11 eor w4,w4,w15,ror#18 orr w17,w17,w19 // Ch(e,f,g) eor w19,w26,w27 // a^b, b^c in next round eor w16,w16,w22,ror#25 // Sigma1(e) eor w5,w5,w26,ror#13 add w25,w25,w17 // h+=Ch(e,f,g) and w28,w28,w19 // (b^c)&=(a^b) eor w3,w3,w12,ror#19 eor w4,w4,w15,lsr#3 // sigma0(X[i+1]) add w25,w25,w16 // h+=Sigma1(e) eor w28,w28,w27 // Maj(a,b,c) eor w17,w5,w26,ror#22 // Sigma0(a) eor w3,w3,w12,lsr#10 // sigma1(X[i+14]) add w14,w14,w7 add w21,w21,w25 // d+=h add w25,w25,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round add w14,w14,w4 add w25,w25,w17 // h+=Sigma0(a) add w14,w14,w3 ldr w3,[sp,#0] str w6,[sp,#12] ror w16,w21,#6 add w24,w24,w28 // h+=K[i] ror w5,w0,#7 and w17,w22,w21 ror w4,w13,#17 bic w28,w23,w21 ror w6,w25,#2 add w24,w24,w14 // h+=X[i] eor w16,w16,w21,ror#11 eor w5,w5,w0,ror#18 orr w17,w17,w28 // Ch(e,f,g) eor w28,w25,w26 // a^b, b^c in next round eor w16,w16,w21,ror#25 // Sigma1(e) eor w6,w6,w25,ror#13 add w24,w24,w17 // h+=Ch(e,f,g) and w19,w19,w28 // (b^c)&=(a^b) eor w4,w4,w13,ror#19 eor w5,w5,w0,lsr#3 // sigma0(X[i+1]) add w24,w24,w16 // h+=Sigma1(e) eor w19,w19,w26 // Maj(a,b,c) eor w17,w6,w25,ror#22 // Sigma0(a) eor w4,w4,w13,lsr#10 // sigma1(X[i+14]) add w15,w15,w8 add w20,w20,w24 // d+=h add w24,w24,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round add w15,w15,w5 add w24,w24,w17 // h+=Sigma0(a) add w15,w15,w4 ldr w4,[sp,#4] str w7,[sp,#0] ror w16,w20,#6 add w23,w23,w19 // h+=K[i] ror w6,w1,#7 and w17,w21,w20 ror w5,w14,#17 bic w19,w22,w20 ror w7,w24,#2 add w23,w23,w15 // h+=X[i] eor w16,w16,w20,ror#11 eor w6,w6,w1,ror#18 orr w17,w17,w19 // Ch(e,f,g) eor w19,w24,w25 // a^b, b^c in next round eor w16,w16,w20,ror#25 // Sigma1(e) eor w7,w7,w24,ror#13 add w23,w23,w17 // h+=Ch(e,f,g) and w28,w28,w19 // (b^c)&=(a^b) eor w5,w5,w14,ror#19 eor w6,w6,w1,lsr#3 // sigma0(X[i+1]) add w23,w23,w16 // h+=Sigma1(e) eor w28,w28,w25 // Maj(a,b,c) eor w17,w7,w24,ror#22 // Sigma0(a) eor w5,w5,w14,lsr#10 // sigma1(X[i+14]) add w0,w0,w9 add w27,w27,w23 // d+=h add w23,w23,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round add w0,w0,w6 add w23,w23,w17 // h+=Sigma0(a) add w0,w0,w5 ldr w5,[sp,#8] str 
w8,[sp,#4] ror w16,w27,#6 add w22,w22,w28 // h+=K[i] ror w7,w2,#7 and w17,w20,w27 ror w6,w15,#17 bic w28,w21,w27 ror w8,w23,#2 add w22,w22,w0 // h+=X[i] eor w16,w16,w27,ror#11 eor w7,w7,w2,ror#18 orr w17,w17,w28 // Ch(e,f,g) eor w28,w23,w24 // a^b, b^c in next round eor w16,w16,w27,ror#25 // Sigma1(e) eor w8,w8,w23,ror#13 add w22,w22,w17 // h+=Ch(e,f,g) and w19,w19,w28 // (b^c)&=(a^b) eor w6,w6,w15,ror#19 eor w7,w7,w2,lsr#3 // sigma0(X[i+1]) add w22,w22,w16 // h+=Sigma1(e) eor w19,w19,w24 // Maj(a,b,c) eor w17,w8,w23,ror#22 // Sigma0(a) eor w6,w6,w15,lsr#10 // sigma1(X[i+14]) add w1,w1,w10 add w26,w26,w22 // d+=h add w22,w22,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round add w1,w1,w7 add w22,w22,w17 // h+=Sigma0(a) add w1,w1,w6 ldr w6,[sp,#12] str w9,[sp,#8] ror w16,w26,#6 add w21,w21,w19 // h+=K[i] ror w8,w3,#7 and w17,w27,w26 ror w7,w0,#17 bic w19,w20,w26 ror w9,w22,#2 add w21,w21,w1 // h+=X[i] eor w16,w16,w26,ror#11 eor w8,w8,w3,ror#18 orr w17,w17,w19 // Ch(e,f,g) eor w19,w22,w23 // a^b, b^c in next round eor w16,w16,w26,ror#25 // Sigma1(e) eor w9,w9,w22,ror#13 add w21,w21,w17 // h+=Ch(e,f,g) and w28,w28,w19 // (b^c)&=(a^b) eor w7,w7,w0,ror#19 eor w8,w8,w3,lsr#3 // sigma0(X[i+1]) add w21,w21,w16 // h+=Sigma1(e) eor w28,w28,w23 // Maj(a,b,c) eor w17,w9,w22,ror#22 // Sigma0(a) eor w7,w7,w0,lsr#10 // sigma1(X[i+14]) add w2,w2,w11 add w25,w25,w21 // d+=h add w21,w21,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round add w2,w2,w8 add w21,w21,w17 // h+=Sigma0(a) add w2,w2,w7 ldr w7,[sp,#0] str w10,[sp,#12] ror w16,w25,#6 add w20,w20,w28 // h+=K[i] ror w9,w4,#7 and w17,w26,w25 ror w8,w1,#17 bic w28,w27,w25 ror w10,w21,#2 add w20,w20,w2 // h+=X[i] eor w16,w16,w25,ror#11 eor w9,w9,w4,ror#18 orr w17,w17,w28 // Ch(e,f,g) eor w28,w21,w22 // a^b, b^c in next round eor w16,w16,w25,ror#25 // Sigma1(e) eor w10,w10,w21,ror#13 add w20,w20,w17 // h+=Ch(e,f,g) and w19,w19,w28 // (b^c)&=(a^b) eor w8,w8,w1,ror#19 eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) add w20,w20,w16 // h+=Sigma1(e) eor w19,w19,w22 // Maj(a,b,c) eor w17,w10,w21,ror#22 // Sigma0(a) eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) add w3,w3,w12 add w24,w24,w20 // d+=h add w20,w20,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round add w3,w3,w9 add w20,w20,w17 // h+=Sigma0(a) add w3,w3,w8 cbnz w19,Loop_16_xx ldp x0,x2,[x29,#96] ldr x1,[x29,#112] sub x30,x30,#260 // rewind ldp w3,w4,[x0] ldp w5,w6,[x0,#2*4] add x1,x1,#14*4 // advance input pointer ldp w7,w8,[x0,#4*4] add w20,w20,w3 ldp w9,w10,[x0,#6*4] add w21,w21,w4 add w22,w22,w5 add w23,w23,w6 stp w20,w21,[x0] add w24,w24,w7 add w25,w25,w8 stp w22,w23,[x0,#2*4] add w26,w26,w9 add w27,w27,w10 cmp x1,x2 stp w24,w25,[x0,#4*4] stp w26,w27,[x0,#6*4] b.ne Loop ldp x19,x20,[x29,#16] add sp,sp,#4*4 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#128 AARCH64_VALIDATE_LINK_REGISTER ret .section __TEXT,__const .align 6 LK256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0 //terminator .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 .text #ifndef __KERNEL__ .globl _sha256_block_data_order_hw .private_extern _sha256_block_data_order_hw .align 6 _sha256_block_data_order_hw: // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. AARCH64_VALID_CALL_TARGET stp x29,x30,[sp,#-16]! add x29,sp,#0 ld1 {v0.4s,v1.4s},[x0] adrp x3,LK256@PAGE add x3,x3,LK256@PAGEOFF Loop_hw: ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 sub x2,x2,#1 ld1 {v16.4s},[x3],#16 rev32 v4.16b,v4.16b rev32 v5.16b,v5.16b rev32 v6.16b,v6.16b rev32 v7.16b,v7.16b orr v18.16b,v0.16b,v0.16b // offload orr v19.16b,v1.16b,v1.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v4.4s .long 0x5e2828a4 //sha256su0 v4.16b,v5.16b orr v2.16b,v0.16b,v0.16b .long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v5.4s .long 0x5e2828c5 //sha256su0 v5.16b,v6.16b orr v2.16b,v0.16b,v0.16b .long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v6.4s .long 0x5e2828e6 //sha256su0 v6.16b,v7.16b orr v2.16b,v0.16b,v0.16b .long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v7.4s .long 0x5e282887 //sha256su0 v7.16b,v4.16b orr v2.16b,v0.16b,v0.16b .long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v4.4s .long 0x5e2828a4 //sha256su0 v4.16b,v5.16b orr v2.16b,v0.16b,v0.16b .long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v5.4s .long 0x5e2828c5 //sha256su0 v5.16b,v6.16b orr v2.16b,v0.16b,v0.16b .long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v6.4s .long 0x5e2828e6 //sha256su0 v6.16b,v7.16b orr v2.16b,v0.16b,v0.16b .long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v7.4s .long 0x5e282887 //sha256su0 v7.16b,v4.16b orr v2.16b,v0.16b,v0.16b .long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v4.4s .long 0x5e2828a4 //sha256su0 v4.16b,v5.16b orr v2.16b,v0.16b,v0.16b .long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v5.4s .long 0x5e2828c5 //sha256su0 v5.16b,v6.16b orr v2.16b,v0.16b,v0.16b .long 0x5e114020 //sha256h 
v0.16b,v1.16b,v17.4s .long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v6.4s .long 0x5e2828e6 //sha256su0 v6.16b,v7.16b orr v2.16b,v0.16b,v0.16b .long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v7.4s .long 0x5e282887 //sha256su0 v7.16b,v4.16b orr v2.16b,v0.16b,v0.16b .long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v4.4s orr v2.16b,v0.16b,v0.16b .long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v5.4s orr v2.16b,v0.16b,v0.16b .long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s ld1 {v17.4s},[x3] add v16.4s,v16.4s,v6.4s sub x3,x3,#64*4-16 // rewind orr v2.16b,v0.16b,v0.16b .long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s add v17.4s,v17.4s,v7.4s orr v2.16b,v0.16b,v0.16b .long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s add v0.4s,v0.4s,v18.4s add v1.4s,v1.4s,v19.4s cbnz x2,Loop_hw st1 {v0.4s,v1.4s},[x0] ldr x29,[sp],#16 ret #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) ring-0.17.14/pregenerated/sha256-armv8-linux64.S000064400000000000000000001026771046102023000171150ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) // Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // ==================================================================== // Written by Andy Polyakov for the OpenSSL // project. // ==================================================================== // // SHA256/512 for ARMv8. // // Performance in cycles per processed byte and improvement coefficient // over code generated with "default" compiler: // // SHA256-hw SHA256(*) SHA512 // Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) // Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) // Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) // Denver 2.01 10.5 (+26%) 6.70 (+8%) // X-Gene 20.0 (+100%) 12.8 (+300%(***)) // Mongoose 2.36 13.0 (+50%) 8.36 (+33%) // Kryo 1.92 17.4 (+30%) 11.2 (+8%) // // (*) Software SHA256 results are of lesser relevance, presented // mostly for informational purposes. // (**) The result is a trade-off: it's possible to improve it by // 10% (or by 1 cycle per round), but at the cost of 20% loss // on Cortex-A53 (or by 4 cycles per round). 
// (***) Super-impressive coefficients over gcc-generated code are // indication of some compiler "pathology", most notably code // generated with -mgeneral-regs-only is significantly faster // and the gap is only 40-90%. #ifndef __KERNEL__ #endif .text .globl sha256_block_data_order_nohw .hidden sha256_block_data_order_nohw .type sha256_block_data_order_nohw,%function .align 6 sha256_block_data_order_nohw: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-128]! add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] sub sp,sp,#4*4 ldp w20,w21,[x0] // load context ldp w22,w23,[x0,#2*4] ldp w24,w25,[x0,#4*4] add x2,x1,x2,lsl#6 // end of input ldp w26,w27,[x0,#6*4] adrp x30,.LK256 add x30,x30,:lo12:.LK256 stp x0,x2,[x29,#96] .Loop: ldp w3,w4,[x1],#2*4 ldr w19,[x30],#4 // *K++ eor w28,w21,w22 // magic seed str x1,[x29,#112] #ifndef __AARCH64EB__ rev w3,w3 // 0 #endif ror w16,w24,#6 add w27,w27,w19 // h+=K[i] eor w6,w24,w24,ror#14 and w17,w25,w24 bic w19,w26,w24 add w27,w27,w3 // h+=X[i] orr w17,w17,w19 // Ch(e,f,g) eor w19,w20,w21 // a^b, b^c in next round eor w16,w16,w6,ror#11 // Sigma1(e) ror w6,w20,#2 add w27,w27,w17 // h+=Ch(e,f,g) eor w17,w20,w20,ror#9 add w27,w27,w16 // h+=Sigma1(e) and w28,w28,w19 // (b^c)&=(a^b) add w23,w23,w27 // d+=h eor w28,w28,w21 // Maj(a,b,c) eor w17,w6,w17,ror#13 // Sigma0(a) add w27,w27,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round //add w27,w27,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w4,w4 // 1 #endif ldp w5,w6,[x1],#2*4 add w27,w27,w17 // h+=Sigma0(a) ror w16,w23,#6 add w26,w26,w28 // h+=K[i] eor w7,w23,w23,ror#14 and w17,w24,w23 bic w28,w25,w23 add w26,w26,w4 // h+=X[i] orr w17,w17,w28 // Ch(e,f,g) eor w28,w27,w20 // a^b, b^c in next round eor w16,w16,w7,ror#11 // Sigma1(e) ror w7,w27,#2 add w26,w26,w17 // h+=Ch(e,f,g) eor w17,w27,w27,ror#9 add w26,w26,w16 // h+=Sigma1(e) and w19,w19,w28 // (b^c)&=(a^b) add w22,w22,w26 // d+=h eor w19,w19,w20 // Maj(a,b,c) eor w17,w7,w17,ror#13 // Sigma0(a) add w26,w26,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round //add w26,w26,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w5,w5 // 2 #endif add w26,w26,w17 // h+=Sigma0(a) ror w16,w22,#6 add w25,w25,w19 // h+=K[i] eor w8,w22,w22,ror#14 and w17,w23,w22 bic w19,w24,w22 add w25,w25,w5 // h+=X[i] orr w17,w17,w19 // Ch(e,f,g) eor w19,w26,w27 // a^b, b^c in next round eor w16,w16,w8,ror#11 // Sigma1(e) ror w8,w26,#2 add w25,w25,w17 // h+=Ch(e,f,g) eor w17,w26,w26,ror#9 add w25,w25,w16 // h+=Sigma1(e) and w28,w28,w19 // (b^c)&=(a^b) add w21,w21,w25 // d+=h eor w28,w28,w27 // Maj(a,b,c) eor w17,w8,w17,ror#13 // Sigma0(a) add w25,w25,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round //add w25,w25,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w6,w6 // 3 #endif ldp w7,w8,[x1],#2*4 add w25,w25,w17 // h+=Sigma0(a) ror w16,w21,#6 add w24,w24,w28 // h+=K[i] eor w9,w21,w21,ror#14 and w17,w22,w21 bic w28,w23,w21 add w24,w24,w6 // h+=X[i] orr w17,w17,w28 // Ch(e,f,g) eor w28,w25,w26 // a^b, b^c in next round eor w16,w16,w9,ror#11 // Sigma1(e) ror w9,w25,#2 add w24,w24,w17 // h+=Ch(e,f,g) eor w17,w25,w25,ror#9 add w24,w24,w16 // h+=Sigma1(e) and w19,w19,w28 // (b^c)&=(a^b) add w20,w20,w24 // d+=h eor w19,w19,w26 // Maj(a,b,c) eor w17,w9,w17,ror#13 // Sigma0(a) add w24,w24,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round //add w24,w24,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w7,w7 // 4 #endif add w24,w24,w17 // h+=Sigma0(a) ror w16,w20,#6 add w23,w23,w19 // 
h+=K[i] eor w10,w20,w20,ror#14 and w17,w21,w20 bic w19,w22,w20 add w23,w23,w7 // h+=X[i] orr w17,w17,w19 // Ch(e,f,g) eor w19,w24,w25 // a^b, b^c in next round eor w16,w16,w10,ror#11 // Sigma1(e) ror w10,w24,#2 add w23,w23,w17 // h+=Ch(e,f,g) eor w17,w24,w24,ror#9 add w23,w23,w16 // h+=Sigma1(e) and w28,w28,w19 // (b^c)&=(a^b) add w27,w27,w23 // d+=h eor w28,w28,w25 // Maj(a,b,c) eor w17,w10,w17,ror#13 // Sigma0(a) add w23,w23,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round //add w23,w23,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w8,w8 // 5 #endif ldp w9,w10,[x1],#2*4 add w23,w23,w17 // h+=Sigma0(a) ror w16,w27,#6 add w22,w22,w28 // h+=K[i] eor w11,w27,w27,ror#14 and w17,w20,w27 bic w28,w21,w27 add w22,w22,w8 // h+=X[i] orr w17,w17,w28 // Ch(e,f,g) eor w28,w23,w24 // a^b, b^c in next round eor w16,w16,w11,ror#11 // Sigma1(e) ror w11,w23,#2 add w22,w22,w17 // h+=Ch(e,f,g) eor w17,w23,w23,ror#9 add w22,w22,w16 // h+=Sigma1(e) and w19,w19,w28 // (b^c)&=(a^b) add w26,w26,w22 // d+=h eor w19,w19,w24 // Maj(a,b,c) eor w17,w11,w17,ror#13 // Sigma0(a) add w22,w22,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round //add w22,w22,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w9,w9 // 6 #endif add w22,w22,w17 // h+=Sigma0(a) ror w16,w26,#6 add w21,w21,w19 // h+=K[i] eor w12,w26,w26,ror#14 and w17,w27,w26 bic w19,w20,w26 add w21,w21,w9 // h+=X[i] orr w17,w17,w19 // Ch(e,f,g) eor w19,w22,w23 // a^b, b^c in next round eor w16,w16,w12,ror#11 // Sigma1(e) ror w12,w22,#2 add w21,w21,w17 // h+=Ch(e,f,g) eor w17,w22,w22,ror#9 add w21,w21,w16 // h+=Sigma1(e) and w28,w28,w19 // (b^c)&=(a^b) add w25,w25,w21 // d+=h eor w28,w28,w23 // Maj(a,b,c) eor w17,w12,w17,ror#13 // Sigma0(a) add w21,w21,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round //add w21,w21,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w10,w10 // 7 #endif ldp w11,w12,[x1],#2*4 add w21,w21,w17 // h+=Sigma0(a) ror w16,w25,#6 add w20,w20,w28 // h+=K[i] eor w13,w25,w25,ror#14 and w17,w26,w25 bic w28,w27,w25 add w20,w20,w10 // h+=X[i] orr w17,w17,w28 // Ch(e,f,g) eor w28,w21,w22 // a^b, b^c in next round eor w16,w16,w13,ror#11 // Sigma1(e) ror w13,w21,#2 add w20,w20,w17 // h+=Ch(e,f,g) eor w17,w21,w21,ror#9 add w20,w20,w16 // h+=Sigma1(e) and w19,w19,w28 // (b^c)&=(a^b) add w24,w24,w20 // d+=h eor w19,w19,w22 // Maj(a,b,c) eor w17,w13,w17,ror#13 // Sigma0(a) add w20,w20,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round //add w20,w20,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w11,w11 // 8 #endif add w20,w20,w17 // h+=Sigma0(a) ror w16,w24,#6 add w27,w27,w19 // h+=K[i] eor w14,w24,w24,ror#14 and w17,w25,w24 bic w19,w26,w24 add w27,w27,w11 // h+=X[i] orr w17,w17,w19 // Ch(e,f,g) eor w19,w20,w21 // a^b, b^c in next round eor w16,w16,w14,ror#11 // Sigma1(e) ror w14,w20,#2 add w27,w27,w17 // h+=Ch(e,f,g) eor w17,w20,w20,ror#9 add w27,w27,w16 // h+=Sigma1(e) and w28,w28,w19 // (b^c)&=(a^b) add w23,w23,w27 // d+=h eor w28,w28,w21 // Maj(a,b,c) eor w17,w14,w17,ror#13 // Sigma0(a) add w27,w27,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round //add w27,w27,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w12,w12 // 9 #endif ldp w13,w14,[x1],#2*4 add w27,w27,w17 // h+=Sigma0(a) ror w16,w23,#6 add w26,w26,w28 // h+=K[i] eor w15,w23,w23,ror#14 and w17,w24,w23 bic w28,w25,w23 add w26,w26,w12 // h+=X[i] orr w17,w17,w28 // Ch(e,f,g) eor w28,w27,w20 // a^b, b^c in next round eor w16,w16,w15,ror#11 // Sigma1(e) ror w15,w27,#2 add w26,w26,w17 // h+=Ch(e,f,g) eor w17,w27,w27,ror#9 add w26,w26,w16 
// h+=Sigma1(e) and w19,w19,w28 // (b^c)&=(a^b) add w22,w22,w26 // d+=h eor w19,w19,w20 // Maj(a,b,c) eor w17,w15,w17,ror#13 // Sigma0(a) add w26,w26,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round //add w26,w26,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w13,w13 // 10 #endif add w26,w26,w17 // h+=Sigma0(a) ror w16,w22,#6 add w25,w25,w19 // h+=K[i] eor w0,w22,w22,ror#14 and w17,w23,w22 bic w19,w24,w22 add w25,w25,w13 // h+=X[i] orr w17,w17,w19 // Ch(e,f,g) eor w19,w26,w27 // a^b, b^c in next round eor w16,w16,w0,ror#11 // Sigma1(e) ror w0,w26,#2 add w25,w25,w17 // h+=Ch(e,f,g) eor w17,w26,w26,ror#9 add w25,w25,w16 // h+=Sigma1(e) and w28,w28,w19 // (b^c)&=(a^b) add w21,w21,w25 // d+=h eor w28,w28,w27 // Maj(a,b,c) eor w17,w0,w17,ror#13 // Sigma0(a) add w25,w25,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round //add w25,w25,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w14,w14 // 11 #endif ldp w15,w0,[x1],#2*4 add w25,w25,w17 // h+=Sigma0(a) str w6,[sp,#12] ror w16,w21,#6 add w24,w24,w28 // h+=K[i] eor w6,w21,w21,ror#14 and w17,w22,w21 bic w28,w23,w21 add w24,w24,w14 // h+=X[i] orr w17,w17,w28 // Ch(e,f,g) eor w28,w25,w26 // a^b, b^c in next round eor w16,w16,w6,ror#11 // Sigma1(e) ror w6,w25,#2 add w24,w24,w17 // h+=Ch(e,f,g) eor w17,w25,w25,ror#9 add w24,w24,w16 // h+=Sigma1(e) and w19,w19,w28 // (b^c)&=(a^b) add w20,w20,w24 // d+=h eor w19,w19,w26 // Maj(a,b,c) eor w17,w6,w17,ror#13 // Sigma0(a) add w24,w24,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round //add w24,w24,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w15,w15 // 12 #endif add w24,w24,w17 // h+=Sigma0(a) str w7,[sp,#0] ror w16,w20,#6 add w23,w23,w19 // h+=K[i] eor w7,w20,w20,ror#14 and w17,w21,w20 bic w19,w22,w20 add w23,w23,w15 // h+=X[i] orr w17,w17,w19 // Ch(e,f,g) eor w19,w24,w25 // a^b, b^c in next round eor w16,w16,w7,ror#11 // Sigma1(e) ror w7,w24,#2 add w23,w23,w17 // h+=Ch(e,f,g) eor w17,w24,w24,ror#9 add w23,w23,w16 // h+=Sigma1(e) and w28,w28,w19 // (b^c)&=(a^b) add w27,w27,w23 // d+=h eor w28,w28,w25 // Maj(a,b,c) eor w17,w7,w17,ror#13 // Sigma0(a) add w23,w23,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round //add w23,w23,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w0,w0 // 13 #endif ldp w1,w2,[x1] add w23,w23,w17 // h+=Sigma0(a) str w8,[sp,#4] ror w16,w27,#6 add w22,w22,w28 // h+=K[i] eor w8,w27,w27,ror#14 and w17,w20,w27 bic w28,w21,w27 add w22,w22,w0 // h+=X[i] orr w17,w17,w28 // Ch(e,f,g) eor w28,w23,w24 // a^b, b^c in next round eor w16,w16,w8,ror#11 // Sigma1(e) ror w8,w23,#2 add w22,w22,w17 // h+=Ch(e,f,g) eor w17,w23,w23,ror#9 add w22,w22,w16 // h+=Sigma1(e) and w19,w19,w28 // (b^c)&=(a^b) add w26,w26,w22 // d+=h eor w19,w19,w24 // Maj(a,b,c) eor w17,w8,w17,ror#13 // Sigma0(a) add w22,w22,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round //add w22,w22,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w1,w1 // 14 #endif ldr w6,[sp,#12] add w22,w22,w17 // h+=Sigma0(a) str w9,[sp,#8] ror w16,w26,#6 add w21,w21,w19 // h+=K[i] eor w9,w26,w26,ror#14 and w17,w27,w26 bic w19,w20,w26 add w21,w21,w1 // h+=X[i] orr w17,w17,w19 // Ch(e,f,g) eor w19,w22,w23 // a^b, b^c in next round eor w16,w16,w9,ror#11 // Sigma1(e) ror w9,w22,#2 add w21,w21,w17 // h+=Ch(e,f,g) eor w17,w22,w22,ror#9 add w21,w21,w16 // h+=Sigma1(e) and w28,w28,w19 // (b^c)&=(a^b) add w25,w25,w21 // d+=h eor w28,w28,w23 // Maj(a,b,c) eor w17,w9,w17,ror#13 // Sigma0(a) add w21,w21,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round //add w21,w21,w17 // 
h+=Sigma0(a) #ifndef __AARCH64EB__ rev w2,w2 // 15 #endif ldr w7,[sp,#0] add w21,w21,w17 // h+=Sigma0(a) str w10,[sp,#12] ror w16,w25,#6 add w20,w20,w28 // h+=K[i] ror w9,w4,#7 and w17,w26,w25 ror w8,w1,#17 bic w28,w27,w25 ror w10,w21,#2 add w20,w20,w2 // h+=X[i] eor w16,w16,w25,ror#11 eor w9,w9,w4,ror#18 orr w17,w17,w28 // Ch(e,f,g) eor w28,w21,w22 // a^b, b^c in next round eor w16,w16,w25,ror#25 // Sigma1(e) eor w10,w10,w21,ror#13 add w20,w20,w17 // h+=Ch(e,f,g) and w19,w19,w28 // (b^c)&=(a^b) eor w8,w8,w1,ror#19 eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) add w20,w20,w16 // h+=Sigma1(e) eor w19,w19,w22 // Maj(a,b,c) eor w17,w10,w21,ror#22 // Sigma0(a) eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) add w3,w3,w12 add w24,w24,w20 // d+=h add w20,w20,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round add w3,w3,w9 add w20,w20,w17 // h+=Sigma0(a) add w3,w3,w8 .Loop_16_xx: ldr w8,[sp,#4] str w11,[sp,#0] ror w16,w24,#6 add w27,w27,w19 // h+=K[i] ror w10,w5,#7 and w17,w25,w24 ror w9,w2,#17 bic w19,w26,w24 ror w11,w20,#2 add w27,w27,w3 // h+=X[i] eor w16,w16,w24,ror#11 eor w10,w10,w5,ror#18 orr w17,w17,w19 // Ch(e,f,g) eor w19,w20,w21 // a^b, b^c in next round eor w16,w16,w24,ror#25 // Sigma1(e) eor w11,w11,w20,ror#13 add w27,w27,w17 // h+=Ch(e,f,g) and w28,w28,w19 // (b^c)&=(a^b) eor w9,w9,w2,ror#19 eor w10,w10,w5,lsr#3 // sigma0(X[i+1]) add w27,w27,w16 // h+=Sigma1(e) eor w28,w28,w21 // Maj(a,b,c) eor w17,w11,w20,ror#22 // Sigma0(a) eor w9,w9,w2,lsr#10 // sigma1(X[i+14]) add w4,w4,w13 add w23,w23,w27 // d+=h add w27,w27,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round add w4,w4,w10 add w27,w27,w17 // h+=Sigma0(a) add w4,w4,w9 ldr w9,[sp,#8] str w12,[sp,#4] ror w16,w23,#6 add w26,w26,w28 // h+=K[i] ror w11,w6,#7 and w17,w24,w23 ror w10,w3,#17 bic w28,w25,w23 ror w12,w27,#2 add w26,w26,w4 // h+=X[i] eor w16,w16,w23,ror#11 eor w11,w11,w6,ror#18 orr w17,w17,w28 // Ch(e,f,g) eor w28,w27,w20 // a^b, b^c in next round eor w16,w16,w23,ror#25 // Sigma1(e) eor w12,w12,w27,ror#13 add w26,w26,w17 // h+=Ch(e,f,g) and w19,w19,w28 // (b^c)&=(a^b) eor w10,w10,w3,ror#19 eor w11,w11,w6,lsr#3 // sigma0(X[i+1]) add w26,w26,w16 // h+=Sigma1(e) eor w19,w19,w20 // Maj(a,b,c) eor w17,w12,w27,ror#22 // Sigma0(a) eor w10,w10,w3,lsr#10 // sigma1(X[i+14]) add w5,w5,w14 add w22,w22,w26 // d+=h add w26,w26,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round add w5,w5,w11 add w26,w26,w17 // h+=Sigma0(a) add w5,w5,w10 ldr w10,[sp,#12] str w13,[sp,#8] ror w16,w22,#6 add w25,w25,w19 // h+=K[i] ror w12,w7,#7 and w17,w23,w22 ror w11,w4,#17 bic w19,w24,w22 ror w13,w26,#2 add w25,w25,w5 // h+=X[i] eor w16,w16,w22,ror#11 eor w12,w12,w7,ror#18 orr w17,w17,w19 // Ch(e,f,g) eor w19,w26,w27 // a^b, b^c in next round eor w16,w16,w22,ror#25 // Sigma1(e) eor w13,w13,w26,ror#13 add w25,w25,w17 // h+=Ch(e,f,g) and w28,w28,w19 // (b^c)&=(a^b) eor w11,w11,w4,ror#19 eor w12,w12,w7,lsr#3 // sigma0(X[i+1]) add w25,w25,w16 // h+=Sigma1(e) eor w28,w28,w27 // Maj(a,b,c) eor w17,w13,w26,ror#22 // Sigma0(a) eor w11,w11,w4,lsr#10 // sigma1(X[i+14]) add w6,w6,w15 add w21,w21,w25 // d+=h add w25,w25,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round add w6,w6,w12 add w25,w25,w17 // h+=Sigma0(a) add w6,w6,w11 ldr w11,[sp,#0] str w14,[sp,#12] ror w16,w21,#6 add w24,w24,w28 // h+=K[i] ror w13,w8,#7 and w17,w22,w21 ror w12,w5,#17 bic w28,w23,w21 ror w14,w25,#2 add w24,w24,w6 // h+=X[i] eor w16,w16,w21,ror#11 eor w13,w13,w8,ror#18 orr w17,w17,w28 // Ch(e,f,g) eor w28,w25,w26 // a^b, b^c in next round eor 
w16,w16,w21,ror#25 // Sigma1(e) eor w14,w14,w25,ror#13 add w24,w24,w17 // h+=Ch(e,f,g) and w19,w19,w28 // (b^c)&=(a^b) eor w12,w12,w5,ror#19 eor w13,w13,w8,lsr#3 // sigma0(X[i+1]) add w24,w24,w16 // h+=Sigma1(e) eor w19,w19,w26 // Maj(a,b,c) eor w17,w14,w25,ror#22 // Sigma0(a) eor w12,w12,w5,lsr#10 // sigma1(X[i+14]) add w7,w7,w0 add w20,w20,w24 // d+=h add w24,w24,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round add w7,w7,w13 add w24,w24,w17 // h+=Sigma0(a) add w7,w7,w12 ldr w12,[sp,#4] str w15,[sp,#0] ror w16,w20,#6 add w23,w23,w19 // h+=K[i] ror w14,w9,#7 and w17,w21,w20 ror w13,w6,#17 bic w19,w22,w20 ror w15,w24,#2 add w23,w23,w7 // h+=X[i] eor w16,w16,w20,ror#11 eor w14,w14,w9,ror#18 orr w17,w17,w19 // Ch(e,f,g) eor w19,w24,w25 // a^b, b^c in next round eor w16,w16,w20,ror#25 // Sigma1(e) eor w15,w15,w24,ror#13 add w23,w23,w17 // h+=Ch(e,f,g) and w28,w28,w19 // (b^c)&=(a^b) eor w13,w13,w6,ror#19 eor w14,w14,w9,lsr#3 // sigma0(X[i+1]) add w23,w23,w16 // h+=Sigma1(e) eor w28,w28,w25 // Maj(a,b,c) eor w17,w15,w24,ror#22 // Sigma0(a) eor w13,w13,w6,lsr#10 // sigma1(X[i+14]) add w8,w8,w1 add w27,w27,w23 // d+=h add w23,w23,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round add w8,w8,w14 add w23,w23,w17 // h+=Sigma0(a) add w8,w8,w13 ldr w13,[sp,#8] str w0,[sp,#4] ror w16,w27,#6 add w22,w22,w28 // h+=K[i] ror w15,w10,#7 and w17,w20,w27 ror w14,w7,#17 bic w28,w21,w27 ror w0,w23,#2 add w22,w22,w8 // h+=X[i] eor w16,w16,w27,ror#11 eor w15,w15,w10,ror#18 orr w17,w17,w28 // Ch(e,f,g) eor w28,w23,w24 // a^b, b^c in next round eor w16,w16,w27,ror#25 // Sigma1(e) eor w0,w0,w23,ror#13 add w22,w22,w17 // h+=Ch(e,f,g) and w19,w19,w28 // (b^c)&=(a^b) eor w14,w14,w7,ror#19 eor w15,w15,w10,lsr#3 // sigma0(X[i+1]) add w22,w22,w16 // h+=Sigma1(e) eor w19,w19,w24 // Maj(a,b,c) eor w17,w0,w23,ror#22 // Sigma0(a) eor w14,w14,w7,lsr#10 // sigma1(X[i+14]) add w9,w9,w2 add w26,w26,w22 // d+=h add w22,w22,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round add w9,w9,w15 add w22,w22,w17 // h+=Sigma0(a) add w9,w9,w14 ldr w14,[sp,#12] str w1,[sp,#8] ror w16,w26,#6 add w21,w21,w19 // h+=K[i] ror w0,w11,#7 and w17,w27,w26 ror w15,w8,#17 bic w19,w20,w26 ror w1,w22,#2 add w21,w21,w9 // h+=X[i] eor w16,w16,w26,ror#11 eor w0,w0,w11,ror#18 orr w17,w17,w19 // Ch(e,f,g) eor w19,w22,w23 // a^b, b^c in next round eor w16,w16,w26,ror#25 // Sigma1(e) eor w1,w1,w22,ror#13 add w21,w21,w17 // h+=Ch(e,f,g) and w28,w28,w19 // (b^c)&=(a^b) eor w15,w15,w8,ror#19 eor w0,w0,w11,lsr#3 // sigma0(X[i+1]) add w21,w21,w16 // h+=Sigma1(e) eor w28,w28,w23 // Maj(a,b,c) eor w17,w1,w22,ror#22 // Sigma0(a) eor w15,w15,w8,lsr#10 // sigma1(X[i+14]) add w10,w10,w3 add w25,w25,w21 // d+=h add w21,w21,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round add w10,w10,w0 add w21,w21,w17 // h+=Sigma0(a) add w10,w10,w15 ldr w15,[sp,#0] str w2,[sp,#12] ror w16,w25,#6 add w20,w20,w28 // h+=K[i] ror w1,w12,#7 and w17,w26,w25 ror w0,w9,#17 bic w28,w27,w25 ror w2,w21,#2 add w20,w20,w10 // h+=X[i] eor w16,w16,w25,ror#11 eor w1,w1,w12,ror#18 orr w17,w17,w28 // Ch(e,f,g) eor w28,w21,w22 // a^b, b^c in next round eor w16,w16,w25,ror#25 // Sigma1(e) eor w2,w2,w21,ror#13 add w20,w20,w17 // h+=Ch(e,f,g) and w19,w19,w28 // (b^c)&=(a^b) eor w0,w0,w9,ror#19 eor w1,w1,w12,lsr#3 // sigma0(X[i+1]) add w20,w20,w16 // h+=Sigma1(e) eor w19,w19,w22 // Maj(a,b,c) eor w17,w2,w21,ror#22 // Sigma0(a) eor w0,w0,w9,lsr#10 // sigma1(X[i+14]) add w11,w11,w4 add w24,w24,w20 // d+=h add w20,w20,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // 
*K++, w28 in next round add w11,w11,w1 add w20,w20,w17 // h+=Sigma0(a) add w11,w11,w0 ldr w0,[sp,#4] str w3,[sp,#0] ror w16,w24,#6 add w27,w27,w19 // h+=K[i] ror w2,w13,#7 and w17,w25,w24 ror w1,w10,#17 bic w19,w26,w24 ror w3,w20,#2 add w27,w27,w11 // h+=X[i] eor w16,w16,w24,ror#11 eor w2,w2,w13,ror#18 orr w17,w17,w19 // Ch(e,f,g) eor w19,w20,w21 // a^b, b^c in next round eor w16,w16,w24,ror#25 // Sigma1(e) eor w3,w3,w20,ror#13 add w27,w27,w17 // h+=Ch(e,f,g) and w28,w28,w19 // (b^c)&=(a^b) eor w1,w1,w10,ror#19 eor w2,w2,w13,lsr#3 // sigma0(X[i+1]) add w27,w27,w16 // h+=Sigma1(e) eor w28,w28,w21 // Maj(a,b,c) eor w17,w3,w20,ror#22 // Sigma0(a) eor w1,w1,w10,lsr#10 // sigma1(X[i+14]) add w12,w12,w5 add w23,w23,w27 // d+=h add w27,w27,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round add w12,w12,w2 add w27,w27,w17 // h+=Sigma0(a) add w12,w12,w1 ldr w1,[sp,#8] str w4,[sp,#4] ror w16,w23,#6 add w26,w26,w28 // h+=K[i] ror w3,w14,#7 and w17,w24,w23 ror w2,w11,#17 bic w28,w25,w23 ror w4,w27,#2 add w26,w26,w12 // h+=X[i] eor w16,w16,w23,ror#11 eor w3,w3,w14,ror#18 orr w17,w17,w28 // Ch(e,f,g) eor w28,w27,w20 // a^b, b^c in next round eor w16,w16,w23,ror#25 // Sigma1(e) eor w4,w4,w27,ror#13 add w26,w26,w17 // h+=Ch(e,f,g) and w19,w19,w28 // (b^c)&=(a^b) eor w2,w2,w11,ror#19 eor w3,w3,w14,lsr#3 // sigma0(X[i+1]) add w26,w26,w16 // h+=Sigma1(e) eor w19,w19,w20 // Maj(a,b,c) eor w17,w4,w27,ror#22 // Sigma0(a) eor w2,w2,w11,lsr#10 // sigma1(X[i+14]) add w13,w13,w6 add w22,w22,w26 // d+=h add w26,w26,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round add w13,w13,w3 add w26,w26,w17 // h+=Sigma0(a) add w13,w13,w2 ldr w2,[sp,#12] str w5,[sp,#8] ror w16,w22,#6 add w25,w25,w19 // h+=K[i] ror w4,w15,#7 and w17,w23,w22 ror w3,w12,#17 bic w19,w24,w22 ror w5,w26,#2 add w25,w25,w13 // h+=X[i] eor w16,w16,w22,ror#11 eor w4,w4,w15,ror#18 orr w17,w17,w19 // Ch(e,f,g) eor w19,w26,w27 // a^b, b^c in next round eor w16,w16,w22,ror#25 // Sigma1(e) eor w5,w5,w26,ror#13 add w25,w25,w17 // h+=Ch(e,f,g) and w28,w28,w19 // (b^c)&=(a^b) eor w3,w3,w12,ror#19 eor w4,w4,w15,lsr#3 // sigma0(X[i+1]) add w25,w25,w16 // h+=Sigma1(e) eor w28,w28,w27 // Maj(a,b,c) eor w17,w5,w26,ror#22 // Sigma0(a) eor w3,w3,w12,lsr#10 // sigma1(X[i+14]) add w14,w14,w7 add w21,w21,w25 // d+=h add w25,w25,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round add w14,w14,w4 add w25,w25,w17 // h+=Sigma0(a) add w14,w14,w3 ldr w3,[sp,#0] str w6,[sp,#12] ror w16,w21,#6 add w24,w24,w28 // h+=K[i] ror w5,w0,#7 and w17,w22,w21 ror w4,w13,#17 bic w28,w23,w21 ror w6,w25,#2 add w24,w24,w14 // h+=X[i] eor w16,w16,w21,ror#11 eor w5,w5,w0,ror#18 orr w17,w17,w28 // Ch(e,f,g) eor w28,w25,w26 // a^b, b^c in next round eor w16,w16,w21,ror#25 // Sigma1(e) eor w6,w6,w25,ror#13 add w24,w24,w17 // h+=Ch(e,f,g) and w19,w19,w28 // (b^c)&=(a^b) eor w4,w4,w13,ror#19 eor w5,w5,w0,lsr#3 // sigma0(X[i+1]) add w24,w24,w16 // h+=Sigma1(e) eor w19,w19,w26 // Maj(a,b,c) eor w17,w6,w25,ror#22 // Sigma0(a) eor w4,w4,w13,lsr#10 // sigma1(X[i+14]) add w15,w15,w8 add w20,w20,w24 // d+=h add w24,w24,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round add w15,w15,w5 add w24,w24,w17 // h+=Sigma0(a) add w15,w15,w4 ldr w4,[sp,#4] str w7,[sp,#0] ror w16,w20,#6 add w23,w23,w19 // h+=K[i] ror w6,w1,#7 and w17,w21,w20 ror w5,w14,#17 bic w19,w22,w20 ror w7,w24,#2 add w23,w23,w15 // h+=X[i] eor w16,w16,w20,ror#11 eor w6,w6,w1,ror#18 orr w17,w17,w19 // Ch(e,f,g) eor w19,w24,w25 // a^b, b^c in next round eor w16,w16,w20,ror#25 // Sigma1(e) eor 
w7,w7,w24,ror#13 add w23,w23,w17 // h+=Ch(e,f,g) and w28,w28,w19 // (b^c)&=(a^b) eor w5,w5,w14,ror#19 eor w6,w6,w1,lsr#3 // sigma0(X[i+1]) add w23,w23,w16 // h+=Sigma1(e) eor w28,w28,w25 // Maj(a,b,c) eor w17,w7,w24,ror#22 // Sigma0(a) eor w5,w5,w14,lsr#10 // sigma1(X[i+14]) add w0,w0,w9 add w27,w27,w23 // d+=h add w23,w23,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round add w0,w0,w6 add w23,w23,w17 // h+=Sigma0(a) add w0,w0,w5 ldr w5,[sp,#8] str w8,[sp,#4] ror w16,w27,#6 add w22,w22,w28 // h+=K[i] ror w7,w2,#7 and w17,w20,w27 ror w6,w15,#17 bic w28,w21,w27 ror w8,w23,#2 add w22,w22,w0 // h+=X[i] eor w16,w16,w27,ror#11 eor w7,w7,w2,ror#18 orr w17,w17,w28 // Ch(e,f,g) eor w28,w23,w24 // a^b, b^c in next round eor w16,w16,w27,ror#25 // Sigma1(e) eor w8,w8,w23,ror#13 add w22,w22,w17 // h+=Ch(e,f,g) and w19,w19,w28 // (b^c)&=(a^b) eor w6,w6,w15,ror#19 eor w7,w7,w2,lsr#3 // sigma0(X[i+1]) add w22,w22,w16 // h+=Sigma1(e) eor w19,w19,w24 // Maj(a,b,c) eor w17,w8,w23,ror#22 // Sigma0(a) eor w6,w6,w15,lsr#10 // sigma1(X[i+14]) add w1,w1,w10 add w26,w26,w22 // d+=h add w22,w22,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round add w1,w1,w7 add w22,w22,w17 // h+=Sigma0(a) add w1,w1,w6 ldr w6,[sp,#12] str w9,[sp,#8] ror w16,w26,#6 add w21,w21,w19 // h+=K[i] ror w8,w3,#7 and w17,w27,w26 ror w7,w0,#17 bic w19,w20,w26 ror w9,w22,#2 add w21,w21,w1 // h+=X[i] eor w16,w16,w26,ror#11 eor w8,w8,w3,ror#18 orr w17,w17,w19 // Ch(e,f,g) eor w19,w22,w23 // a^b, b^c in next round eor w16,w16,w26,ror#25 // Sigma1(e) eor w9,w9,w22,ror#13 add w21,w21,w17 // h+=Ch(e,f,g) and w28,w28,w19 // (b^c)&=(a^b) eor w7,w7,w0,ror#19 eor w8,w8,w3,lsr#3 // sigma0(X[i+1]) add w21,w21,w16 // h+=Sigma1(e) eor w28,w28,w23 // Maj(a,b,c) eor w17,w9,w22,ror#22 // Sigma0(a) eor w7,w7,w0,lsr#10 // sigma1(X[i+14]) add w2,w2,w11 add w25,w25,w21 // d+=h add w21,w21,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round add w2,w2,w8 add w21,w21,w17 // h+=Sigma0(a) add w2,w2,w7 ldr w7,[sp,#0] str w10,[sp,#12] ror w16,w25,#6 add w20,w20,w28 // h+=K[i] ror w9,w4,#7 and w17,w26,w25 ror w8,w1,#17 bic w28,w27,w25 ror w10,w21,#2 add w20,w20,w2 // h+=X[i] eor w16,w16,w25,ror#11 eor w9,w9,w4,ror#18 orr w17,w17,w28 // Ch(e,f,g) eor w28,w21,w22 // a^b, b^c in next round eor w16,w16,w25,ror#25 // Sigma1(e) eor w10,w10,w21,ror#13 add w20,w20,w17 // h+=Ch(e,f,g) and w19,w19,w28 // (b^c)&=(a^b) eor w8,w8,w1,ror#19 eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) add w20,w20,w16 // h+=Sigma1(e) eor w19,w19,w22 // Maj(a,b,c) eor w17,w10,w21,ror#22 // Sigma0(a) eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) add w3,w3,w12 add w24,w24,w20 // d+=h add w20,w20,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round add w3,w3,w9 add w20,w20,w17 // h+=Sigma0(a) add w3,w3,w8 cbnz w19,.Loop_16_xx ldp x0,x2,[x29,#96] ldr x1,[x29,#112] sub x30,x30,#260 // rewind ldp w3,w4,[x0] ldp w5,w6,[x0,#2*4] add x1,x1,#14*4 // advance input pointer ldp w7,w8,[x0,#4*4] add w20,w20,w3 ldp w9,w10,[x0,#6*4] add w21,w21,w4 add w22,w22,w5 add w23,w23,w6 stp w20,w21,[x0] add w24,w24,w7 add w25,w25,w8 stp w22,w23,[x0,#2*4] add w26,w26,w9 add w27,w27,w10 cmp x1,x2 stp w24,w25,[x0,#4*4] stp w26,w27,[x0,#6*4] b.ne .Loop ldp x19,x20,[x29,#16] add sp,sp,#4*4 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#128 AARCH64_VALIDATE_LINK_REGISTER ret .size sha256_block_data_order_nohw,.-sha256_block_data_order_nohw .section .rodata .align 6 .type .LK256,%object .LK256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0 //terminator .size .LK256,.-.LK256 .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 .text #ifndef __KERNEL__ .globl sha256_block_data_order_hw .hidden sha256_block_data_order_hw .type sha256_block_data_order_hw,%function .align 6 sha256_block_data_order_hw: // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. AARCH64_VALID_CALL_TARGET stp x29,x30,[sp,#-16]! add x29,sp,#0 ld1 {v0.4s,v1.4s},[x0] adrp x3,.LK256 add x3,x3,:lo12:.LK256 .Loop_hw: ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 sub x2,x2,#1 ld1 {v16.4s},[x3],#16 rev32 v4.16b,v4.16b rev32 v5.16b,v5.16b rev32 v6.16b,v6.16b rev32 v7.16b,v7.16b orr v18.16b,v0.16b,v0.16b // offload orr v19.16b,v1.16b,v1.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v4.4s .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v5.4s .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v6.4s .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v7.4s .inst 0x5e282887 //sha256su0 v7.16b,v4.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v4.4s .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v5.4s .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v6.4s .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v7.4s .inst 0x5e282887 
//sha256su0 v7.16b,v4.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v4.4s .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v5.4s .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v6.4s .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v7.4s .inst 0x5e282887 //sha256su0 v7.16b,v4.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v4.4s orr v2.16b,v0.16b,v0.16b .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v5.4s orr v2.16b,v0.16b,v0.16b .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s ld1 {v17.4s},[x3] add v16.4s,v16.4s,v6.4s sub x3,x3,#64*4-16 // rewind orr v2.16b,v0.16b,v0.16b .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s add v17.4s,v17.4s,v7.4s orr v2.16b,v0.16b,v0.16b .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s add v0.4s,v0.4s,v18.4s add v1.4s,v1.4s,v19.4s cbnz x2,.Loop_hw st1 {v0.4s,v1.4s},[x0] ldr x29,[sp],#16 ret .size sha256_block_data_order_hw,.-sha256_block_data_order_hw #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) ring-0.17.14/pregenerated/sha256-armv8-win64.S000064400000000000000000001023251046102023000165410ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) // Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // ==================================================================== // Written by Andy Polyakov for the OpenSSL // project. // ==================================================================== // // SHA256/512 for ARMv8. 
// // Performance in cycles per processed byte and improvement coefficient // over code generated with "default" compiler: // // SHA256-hw SHA256(*) SHA512 // Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) // Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) // Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) // Denver 2.01 10.5 (+26%) 6.70 (+8%) // X-Gene 20.0 (+100%) 12.8 (+300%(***)) // Mongoose 2.36 13.0 (+50%) 8.36 (+33%) // Kryo 1.92 17.4 (+30%) 11.2 (+8%) // // (*) Software SHA256 results are of lesser relevance, presented // mostly for informational purposes. // (**) The result is a trade-off: it's possible to improve it by // 10% (or by 1 cycle per round), but at the cost of 20% loss // on Cortex-A53 (or by 4 cycles per round). // (***) Super-impressive coefficients over gcc-generated code are // indication of some compiler "pathology", most notably code // generated with -mgeneral-regs-only is significantly faster // and the gap is only 40-90%. #ifndef __KERNEL__ #endif .text .globl sha256_block_data_order_nohw .def sha256_block_data_order_nohw .type 32 .endef .align 6 sha256_block_data_order_nohw: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-128]! add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] sub sp,sp,#4*4 ldp w20,w21,[x0] // load context ldp w22,w23,[x0,#2*4] ldp w24,w25,[x0,#4*4] add x2,x1,x2,lsl#6 // end of input ldp w26,w27,[x0,#6*4] adrp x30,LK256 add x30,x30,:lo12:LK256 stp x0,x2,[x29,#96] Loop: ldp w3,w4,[x1],#2*4 ldr w19,[x30],#4 // *K++ eor w28,w21,w22 // magic seed str x1,[x29,#112] #ifndef __AARCH64EB__ rev w3,w3 // 0 #endif ror w16,w24,#6 add w27,w27,w19 // h+=K[i] eor w6,w24,w24,ror#14 and w17,w25,w24 bic w19,w26,w24 add w27,w27,w3 // h+=X[i] orr w17,w17,w19 // Ch(e,f,g) eor w19,w20,w21 // a^b, b^c in next round eor w16,w16,w6,ror#11 // Sigma1(e) ror w6,w20,#2 add w27,w27,w17 // h+=Ch(e,f,g) eor w17,w20,w20,ror#9 add w27,w27,w16 // h+=Sigma1(e) and w28,w28,w19 // (b^c)&=(a^b) add w23,w23,w27 // d+=h eor w28,w28,w21 // Maj(a,b,c) eor w17,w6,w17,ror#13 // Sigma0(a) add w27,w27,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round //add w27,w27,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w4,w4 // 1 #endif ldp w5,w6,[x1],#2*4 add w27,w27,w17 // h+=Sigma0(a) ror w16,w23,#6 add w26,w26,w28 // h+=K[i] eor w7,w23,w23,ror#14 and w17,w24,w23 bic w28,w25,w23 add w26,w26,w4 // h+=X[i] orr w17,w17,w28 // Ch(e,f,g) eor w28,w27,w20 // a^b, b^c in next round eor w16,w16,w7,ror#11 // Sigma1(e) ror w7,w27,#2 add w26,w26,w17 // h+=Ch(e,f,g) eor w17,w27,w27,ror#9 add w26,w26,w16 // h+=Sigma1(e) and w19,w19,w28 // (b^c)&=(a^b) add w22,w22,w26 // d+=h eor w19,w19,w20 // Maj(a,b,c) eor w17,w7,w17,ror#13 // Sigma0(a) add w26,w26,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round //add w26,w26,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w5,w5 // 2 #endif add w26,w26,w17 // h+=Sigma0(a) ror w16,w22,#6 add w25,w25,w19 // h+=K[i] eor w8,w22,w22,ror#14 and w17,w23,w22 bic w19,w24,w22 add w25,w25,w5 // h+=X[i] orr w17,w17,w19 // Ch(e,f,g) eor w19,w26,w27 // a^b, b^c in next round eor w16,w16,w8,ror#11 // Sigma1(e) ror w8,w26,#2 add w25,w25,w17 // h+=Ch(e,f,g) eor w17,w26,w26,ror#9 add w25,w25,w16 // h+=Sigma1(e) and w28,w28,w19 // (b^c)&=(a^b) add w21,w21,w25 // d+=h eor w28,w28,w27 // Maj(a,b,c) eor w17,w8,w17,ror#13 // Sigma0(a) add w25,w25,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round //add w25,w25,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w6,w6 // 3 #endif ldp w7,w8,[x1],#2*4 add 
w25,w25,w17 // h+=Sigma0(a) ror w16,w21,#6 add w24,w24,w28 // h+=K[i] eor w9,w21,w21,ror#14 and w17,w22,w21 bic w28,w23,w21 add w24,w24,w6 // h+=X[i] orr w17,w17,w28 // Ch(e,f,g) eor w28,w25,w26 // a^b, b^c in next round eor w16,w16,w9,ror#11 // Sigma1(e) ror w9,w25,#2 add w24,w24,w17 // h+=Ch(e,f,g) eor w17,w25,w25,ror#9 add w24,w24,w16 // h+=Sigma1(e) and w19,w19,w28 // (b^c)&=(a^b) add w20,w20,w24 // d+=h eor w19,w19,w26 // Maj(a,b,c) eor w17,w9,w17,ror#13 // Sigma0(a) add w24,w24,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round //add w24,w24,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w7,w7 // 4 #endif add w24,w24,w17 // h+=Sigma0(a) ror w16,w20,#6 add w23,w23,w19 // h+=K[i] eor w10,w20,w20,ror#14 and w17,w21,w20 bic w19,w22,w20 add w23,w23,w7 // h+=X[i] orr w17,w17,w19 // Ch(e,f,g) eor w19,w24,w25 // a^b, b^c in next round eor w16,w16,w10,ror#11 // Sigma1(e) ror w10,w24,#2 add w23,w23,w17 // h+=Ch(e,f,g) eor w17,w24,w24,ror#9 add w23,w23,w16 // h+=Sigma1(e) and w28,w28,w19 // (b^c)&=(a^b) add w27,w27,w23 // d+=h eor w28,w28,w25 // Maj(a,b,c) eor w17,w10,w17,ror#13 // Sigma0(a) add w23,w23,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round //add w23,w23,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w8,w8 // 5 #endif ldp w9,w10,[x1],#2*4 add w23,w23,w17 // h+=Sigma0(a) ror w16,w27,#6 add w22,w22,w28 // h+=K[i] eor w11,w27,w27,ror#14 and w17,w20,w27 bic w28,w21,w27 add w22,w22,w8 // h+=X[i] orr w17,w17,w28 // Ch(e,f,g) eor w28,w23,w24 // a^b, b^c in next round eor w16,w16,w11,ror#11 // Sigma1(e) ror w11,w23,#2 add w22,w22,w17 // h+=Ch(e,f,g) eor w17,w23,w23,ror#9 add w22,w22,w16 // h+=Sigma1(e) and w19,w19,w28 // (b^c)&=(a^b) add w26,w26,w22 // d+=h eor w19,w19,w24 // Maj(a,b,c) eor w17,w11,w17,ror#13 // Sigma0(a) add w22,w22,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round //add w22,w22,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w9,w9 // 6 #endif add w22,w22,w17 // h+=Sigma0(a) ror w16,w26,#6 add w21,w21,w19 // h+=K[i] eor w12,w26,w26,ror#14 and w17,w27,w26 bic w19,w20,w26 add w21,w21,w9 // h+=X[i] orr w17,w17,w19 // Ch(e,f,g) eor w19,w22,w23 // a^b, b^c in next round eor w16,w16,w12,ror#11 // Sigma1(e) ror w12,w22,#2 add w21,w21,w17 // h+=Ch(e,f,g) eor w17,w22,w22,ror#9 add w21,w21,w16 // h+=Sigma1(e) and w28,w28,w19 // (b^c)&=(a^b) add w25,w25,w21 // d+=h eor w28,w28,w23 // Maj(a,b,c) eor w17,w12,w17,ror#13 // Sigma0(a) add w21,w21,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round //add w21,w21,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w10,w10 // 7 #endif ldp w11,w12,[x1],#2*4 add w21,w21,w17 // h+=Sigma0(a) ror w16,w25,#6 add w20,w20,w28 // h+=K[i] eor w13,w25,w25,ror#14 and w17,w26,w25 bic w28,w27,w25 add w20,w20,w10 // h+=X[i] orr w17,w17,w28 // Ch(e,f,g) eor w28,w21,w22 // a^b, b^c in next round eor w16,w16,w13,ror#11 // Sigma1(e) ror w13,w21,#2 add w20,w20,w17 // h+=Ch(e,f,g) eor w17,w21,w21,ror#9 add w20,w20,w16 // h+=Sigma1(e) and w19,w19,w28 // (b^c)&=(a^b) add w24,w24,w20 // d+=h eor w19,w19,w22 // Maj(a,b,c) eor w17,w13,w17,ror#13 // Sigma0(a) add w20,w20,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round //add w20,w20,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w11,w11 // 8 #endif add w20,w20,w17 // h+=Sigma0(a) ror w16,w24,#6 add w27,w27,w19 // h+=K[i] eor w14,w24,w24,ror#14 and w17,w25,w24 bic w19,w26,w24 add w27,w27,w11 // h+=X[i] orr w17,w17,w19 // Ch(e,f,g) eor w19,w20,w21 // a^b, b^c in next round eor w16,w16,w14,ror#11 // Sigma1(e) ror w14,w20,#2 add w27,w27,w17 // h+=Ch(e,f,g) eor 
w17,w20,w20,ror#9 add w27,w27,w16 // h+=Sigma1(e) and w28,w28,w19 // (b^c)&=(a^b) add w23,w23,w27 // d+=h eor w28,w28,w21 // Maj(a,b,c) eor w17,w14,w17,ror#13 // Sigma0(a) add w27,w27,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round //add w27,w27,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w12,w12 // 9 #endif ldp w13,w14,[x1],#2*4 add w27,w27,w17 // h+=Sigma0(a) ror w16,w23,#6 add w26,w26,w28 // h+=K[i] eor w15,w23,w23,ror#14 and w17,w24,w23 bic w28,w25,w23 add w26,w26,w12 // h+=X[i] orr w17,w17,w28 // Ch(e,f,g) eor w28,w27,w20 // a^b, b^c in next round eor w16,w16,w15,ror#11 // Sigma1(e) ror w15,w27,#2 add w26,w26,w17 // h+=Ch(e,f,g) eor w17,w27,w27,ror#9 add w26,w26,w16 // h+=Sigma1(e) and w19,w19,w28 // (b^c)&=(a^b) add w22,w22,w26 // d+=h eor w19,w19,w20 // Maj(a,b,c) eor w17,w15,w17,ror#13 // Sigma0(a) add w26,w26,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round //add w26,w26,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w13,w13 // 10 #endif add w26,w26,w17 // h+=Sigma0(a) ror w16,w22,#6 add w25,w25,w19 // h+=K[i] eor w0,w22,w22,ror#14 and w17,w23,w22 bic w19,w24,w22 add w25,w25,w13 // h+=X[i] orr w17,w17,w19 // Ch(e,f,g) eor w19,w26,w27 // a^b, b^c in next round eor w16,w16,w0,ror#11 // Sigma1(e) ror w0,w26,#2 add w25,w25,w17 // h+=Ch(e,f,g) eor w17,w26,w26,ror#9 add w25,w25,w16 // h+=Sigma1(e) and w28,w28,w19 // (b^c)&=(a^b) add w21,w21,w25 // d+=h eor w28,w28,w27 // Maj(a,b,c) eor w17,w0,w17,ror#13 // Sigma0(a) add w25,w25,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round //add w25,w25,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w14,w14 // 11 #endif ldp w15,w0,[x1],#2*4 add w25,w25,w17 // h+=Sigma0(a) str w6,[sp,#12] ror w16,w21,#6 add w24,w24,w28 // h+=K[i] eor w6,w21,w21,ror#14 and w17,w22,w21 bic w28,w23,w21 add w24,w24,w14 // h+=X[i] orr w17,w17,w28 // Ch(e,f,g) eor w28,w25,w26 // a^b, b^c in next round eor w16,w16,w6,ror#11 // Sigma1(e) ror w6,w25,#2 add w24,w24,w17 // h+=Ch(e,f,g) eor w17,w25,w25,ror#9 add w24,w24,w16 // h+=Sigma1(e) and w19,w19,w28 // (b^c)&=(a^b) add w20,w20,w24 // d+=h eor w19,w19,w26 // Maj(a,b,c) eor w17,w6,w17,ror#13 // Sigma0(a) add w24,w24,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round //add w24,w24,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w15,w15 // 12 #endif add w24,w24,w17 // h+=Sigma0(a) str w7,[sp,#0] ror w16,w20,#6 add w23,w23,w19 // h+=K[i] eor w7,w20,w20,ror#14 and w17,w21,w20 bic w19,w22,w20 add w23,w23,w15 // h+=X[i] orr w17,w17,w19 // Ch(e,f,g) eor w19,w24,w25 // a^b, b^c in next round eor w16,w16,w7,ror#11 // Sigma1(e) ror w7,w24,#2 add w23,w23,w17 // h+=Ch(e,f,g) eor w17,w24,w24,ror#9 add w23,w23,w16 // h+=Sigma1(e) and w28,w28,w19 // (b^c)&=(a^b) add w27,w27,w23 // d+=h eor w28,w28,w25 // Maj(a,b,c) eor w17,w7,w17,ror#13 // Sigma0(a) add w23,w23,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round //add w23,w23,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w0,w0 // 13 #endif ldp w1,w2,[x1] add w23,w23,w17 // h+=Sigma0(a) str w8,[sp,#4] ror w16,w27,#6 add w22,w22,w28 // h+=K[i] eor w8,w27,w27,ror#14 and w17,w20,w27 bic w28,w21,w27 add w22,w22,w0 // h+=X[i] orr w17,w17,w28 // Ch(e,f,g) eor w28,w23,w24 // a^b, b^c in next round eor w16,w16,w8,ror#11 // Sigma1(e) ror w8,w23,#2 add w22,w22,w17 // h+=Ch(e,f,g) eor w17,w23,w23,ror#9 add w22,w22,w16 // h+=Sigma1(e) and w19,w19,w28 // (b^c)&=(a^b) add w26,w26,w22 // d+=h eor w19,w19,w24 // Maj(a,b,c) eor w17,w8,w17,ror#13 // Sigma0(a) add w22,w22,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round 
//add w22,w22,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w1,w1 // 14 #endif ldr w6,[sp,#12] add w22,w22,w17 // h+=Sigma0(a) str w9,[sp,#8] ror w16,w26,#6 add w21,w21,w19 // h+=K[i] eor w9,w26,w26,ror#14 and w17,w27,w26 bic w19,w20,w26 add w21,w21,w1 // h+=X[i] orr w17,w17,w19 // Ch(e,f,g) eor w19,w22,w23 // a^b, b^c in next round eor w16,w16,w9,ror#11 // Sigma1(e) ror w9,w22,#2 add w21,w21,w17 // h+=Ch(e,f,g) eor w17,w22,w22,ror#9 add w21,w21,w16 // h+=Sigma1(e) and w28,w28,w19 // (b^c)&=(a^b) add w25,w25,w21 // d+=h eor w28,w28,w23 // Maj(a,b,c) eor w17,w9,w17,ror#13 // Sigma0(a) add w21,w21,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round //add w21,w21,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w2,w2 // 15 #endif ldr w7,[sp,#0] add w21,w21,w17 // h+=Sigma0(a) str w10,[sp,#12] ror w16,w25,#6 add w20,w20,w28 // h+=K[i] ror w9,w4,#7 and w17,w26,w25 ror w8,w1,#17 bic w28,w27,w25 ror w10,w21,#2 add w20,w20,w2 // h+=X[i] eor w16,w16,w25,ror#11 eor w9,w9,w4,ror#18 orr w17,w17,w28 // Ch(e,f,g) eor w28,w21,w22 // a^b, b^c in next round eor w16,w16,w25,ror#25 // Sigma1(e) eor w10,w10,w21,ror#13 add w20,w20,w17 // h+=Ch(e,f,g) and w19,w19,w28 // (b^c)&=(a^b) eor w8,w8,w1,ror#19 eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) add w20,w20,w16 // h+=Sigma1(e) eor w19,w19,w22 // Maj(a,b,c) eor w17,w10,w21,ror#22 // Sigma0(a) eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) add w3,w3,w12 add w24,w24,w20 // d+=h add w20,w20,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round add w3,w3,w9 add w20,w20,w17 // h+=Sigma0(a) add w3,w3,w8 Loop_16_xx: ldr w8,[sp,#4] str w11,[sp,#0] ror w16,w24,#6 add w27,w27,w19 // h+=K[i] ror w10,w5,#7 and w17,w25,w24 ror w9,w2,#17 bic w19,w26,w24 ror w11,w20,#2 add w27,w27,w3 // h+=X[i] eor w16,w16,w24,ror#11 eor w10,w10,w5,ror#18 orr w17,w17,w19 // Ch(e,f,g) eor w19,w20,w21 // a^b, b^c in next round eor w16,w16,w24,ror#25 // Sigma1(e) eor w11,w11,w20,ror#13 add w27,w27,w17 // h+=Ch(e,f,g) and w28,w28,w19 // (b^c)&=(a^b) eor w9,w9,w2,ror#19 eor w10,w10,w5,lsr#3 // sigma0(X[i+1]) add w27,w27,w16 // h+=Sigma1(e) eor w28,w28,w21 // Maj(a,b,c) eor w17,w11,w20,ror#22 // Sigma0(a) eor w9,w9,w2,lsr#10 // sigma1(X[i+14]) add w4,w4,w13 add w23,w23,w27 // d+=h add w27,w27,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round add w4,w4,w10 add w27,w27,w17 // h+=Sigma0(a) add w4,w4,w9 ldr w9,[sp,#8] str w12,[sp,#4] ror w16,w23,#6 add w26,w26,w28 // h+=K[i] ror w11,w6,#7 and w17,w24,w23 ror w10,w3,#17 bic w28,w25,w23 ror w12,w27,#2 add w26,w26,w4 // h+=X[i] eor w16,w16,w23,ror#11 eor w11,w11,w6,ror#18 orr w17,w17,w28 // Ch(e,f,g) eor w28,w27,w20 // a^b, b^c in next round eor w16,w16,w23,ror#25 // Sigma1(e) eor w12,w12,w27,ror#13 add w26,w26,w17 // h+=Ch(e,f,g) and w19,w19,w28 // (b^c)&=(a^b) eor w10,w10,w3,ror#19 eor w11,w11,w6,lsr#3 // sigma0(X[i+1]) add w26,w26,w16 // h+=Sigma1(e) eor w19,w19,w20 // Maj(a,b,c) eor w17,w12,w27,ror#22 // Sigma0(a) eor w10,w10,w3,lsr#10 // sigma1(X[i+14]) add w5,w5,w14 add w22,w22,w26 // d+=h add w26,w26,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round add w5,w5,w11 add w26,w26,w17 // h+=Sigma0(a) add w5,w5,w10 ldr w10,[sp,#12] str w13,[sp,#8] ror w16,w22,#6 add w25,w25,w19 // h+=K[i] ror w12,w7,#7 and w17,w23,w22 ror w11,w4,#17 bic w19,w24,w22 ror w13,w26,#2 add w25,w25,w5 // h+=X[i] eor w16,w16,w22,ror#11 eor w12,w12,w7,ror#18 orr w17,w17,w19 // Ch(e,f,g) eor w19,w26,w27 // a^b, b^c in next round eor w16,w16,w22,ror#25 // Sigma1(e) eor w13,w13,w26,ror#13 add w25,w25,w17 // h+=Ch(e,f,g) and w28,w28,w19 // 
(b^c)&=(a^b) eor w11,w11,w4,ror#19 eor w12,w12,w7,lsr#3 // sigma0(X[i+1]) add w25,w25,w16 // h+=Sigma1(e) eor w28,w28,w27 // Maj(a,b,c) eor w17,w13,w26,ror#22 // Sigma0(a) eor w11,w11,w4,lsr#10 // sigma1(X[i+14]) add w6,w6,w15 add w21,w21,w25 // d+=h add w25,w25,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round add w6,w6,w12 add w25,w25,w17 // h+=Sigma0(a) add w6,w6,w11 ldr w11,[sp,#0] str w14,[sp,#12] ror w16,w21,#6 add w24,w24,w28 // h+=K[i] ror w13,w8,#7 and w17,w22,w21 ror w12,w5,#17 bic w28,w23,w21 ror w14,w25,#2 add w24,w24,w6 // h+=X[i] eor w16,w16,w21,ror#11 eor w13,w13,w8,ror#18 orr w17,w17,w28 // Ch(e,f,g) eor w28,w25,w26 // a^b, b^c in next round eor w16,w16,w21,ror#25 // Sigma1(e) eor w14,w14,w25,ror#13 add w24,w24,w17 // h+=Ch(e,f,g) and w19,w19,w28 // (b^c)&=(a^b) eor w12,w12,w5,ror#19 eor w13,w13,w8,lsr#3 // sigma0(X[i+1]) add w24,w24,w16 // h+=Sigma1(e) eor w19,w19,w26 // Maj(a,b,c) eor w17,w14,w25,ror#22 // Sigma0(a) eor w12,w12,w5,lsr#10 // sigma1(X[i+14]) add w7,w7,w0 add w20,w20,w24 // d+=h add w24,w24,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round add w7,w7,w13 add w24,w24,w17 // h+=Sigma0(a) add w7,w7,w12 ldr w12,[sp,#4] str w15,[sp,#0] ror w16,w20,#6 add w23,w23,w19 // h+=K[i] ror w14,w9,#7 and w17,w21,w20 ror w13,w6,#17 bic w19,w22,w20 ror w15,w24,#2 add w23,w23,w7 // h+=X[i] eor w16,w16,w20,ror#11 eor w14,w14,w9,ror#18 orr w17,w17,w19 // Ch(e,f,g) eor w19,w24,w25 // a^b, b^c in next round eor w16,w16,w20,ror#25 // Sigma1(e) eor w15,w15,w24,ror#13 add w23,w23,w17 // h+=Ch(e,f,g) and w28,w28,w19 // (b^c)&=(a^b) eor w13,w13,w6,ror#19 eor w14,w14,w9,lsr#3 // sigma0(X[i+1]) add w23,w23,w16 // h+=Sigma1(e) eor w28,w28,w25 // Maj(a,b,c) eor w17,w15,w24,ror#22 // Sigma0(a) eor w13,w13,w6,lsr#10 // sigma1(X[i+14]) add w8,w8,w1 add w27,w27,w23 // d+=h add w23,w23,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round add w8,w8,w14 add w23,w23,w17 // h+=Sigma0(a) add w8,w8,w13 ldr w13,[sp,#8] str w0,[sp,#4] ror w16,w27,#6 add w22,w22,w28 // h+=K[i] ror w15,w10,#7 and w17,w20,w27 ror w14,w7,#17 bic w28,w21,w27 ror w0,w23,#2 add w22,w22,w8 // h+=X[i] eor w16,w16,w27,ror#11 eor w15,w15,w10,ror#18 orr w17,w17,w28 // Ch(e,f,g) eor w28,w23,w24 // a^b, b^c in next round eor w16,w16,w27,ror#25 // Sigma1(e) eor w0,w0,w23,ror#13 add w22,w22,w17 // h+=Ch(e,f,g) and w19,w19,w28 // (b^c)&=(a^b) eor w14,w14,w7,ror#19 eor w15,w15,w10,lsr#3 // sigma0(X[i+1]) add w22,w22,w16 // h+=Sigma1(e) eor w19,w19,w24 // Maj(a,b,c) eor w17,w0,w23,ror#22 // Sigma0(a) eor w14,w14,w7,lsr#10 // sigma1(X[i+14]) add w9,w9,w2 add w26,w26,w22 // d+=h add w22,w22,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round add w9,w9,w15 add w22,w22,w17 // h+=Sigma0(a) add w9,w9,w14 ldr w14,[sp,#12] str w1,[sp,#8] ror w16,w26,#6 add w21,w21,w19 // h+=K[i] ror w0,w11,#7 and w17,w27,w26 ror w15,w8,#17 bic w19,w20,w26 ror w1,w22,#2 add w21,w21,w9 // h+=X[i] eor w16,w16,w26,ror#11 eor w0,w0,w11,ror#18 orr w17,w17,w19 // Ch(e,f,g) eor w19,w22,w23 // a^b, b^c in next round eor w16,w16,w26,ror#25 // Sigma1(e) eor w1,w1,w22,ror#13 add w21,w21,w17 // h+=Ch(e,f,g) and w28,w28,w19 // (b^c)&=(a^b) eor w15,w15,w8,ror#19 eor w0,w0,w11,lsr#3 // sigma0(X[i+1]) add w21,w21,w16 // h+=Sigma1(e) eor w28,w28,w23 // Maj(a,b,c) eor w17,w1,w22,ror#22 // Sigma0(a) eor w15,w15,w8,lsr#10 // sigma1(X[i+14]) add w10,w10,w3 add w25,w25,w21 // d+=h add w21,w21,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round add w10,w10,w0 add w21,w21,w17 // h+=Sigma0(a) add w10,w10,w15 ldr 
w15,[sp,#0] str w2,[sp,#12] ror w16,w25,#6 add w20,w20,w28 // h+=K[i] ror w1,w12,#7 and w17,w26,w25 ror w0,w9,#17 bic w28,w27,w25 ror w2,w21,#2 add w20,w20,w10 // h+=X[i] eor w16,w16,w25,ror#11 eor w1,w1,w12,ror#18 orr w17,w17,w28 // Ch(e,f,g) eor w28,w21,w22 // a^b, b^c in next round eor w16,w16,w25,ror#25 // Sigma1(e) eor w2,w2,w21,ror#13 add w20,w20,w17 // h+=Ch(e,f,g) and w19,w19,w28 // (b^c)&=(a^b) eor w0,w0,w9,ror#19 eor w1,w1,w12,lsr#3 // sigma0(X[i+1]) add w20,w20,w16 // h+=Sigma1(e) eor w19,w19,w22 // Maj(a,b,c) eor w17,w2,w21,ror#22 // Sigma0(a) eor w0,w0,w9,lsr#10 // sigma1(X[i+14]) add w11,w11,w4 add w24,w24,w20 // d+=h add w20,w20,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round add w11,w11,w1 add w20,w20,w17 // h+=Sigma0(a) add w11,w11,w0 ldr w0,[sp,#4] str w3,[sp,#0] ror w16,w24,#6 add w27,w27,w19 // h+=K[i] ror w2,w13,#7 and w17,w25,w24 ror w1,w10,#17 bic w19,w26,w24 ror w3,w20,#2 add w27,w27,w11 // h+=X[i] eor w16,w16,w24,ror#11 eor w2,w2,w13,ror#18 orr w17,w17,w19 // Ch(e,f,g) eor w19,w20,w21 // a^b, b^c in next round eor w16,w16,w24,ror#25 // Sigma1(e) eor w3,w3,w20,ror#13 add w27,w27,w17 // h+=Ch(e,f,g) and w28,w28,w19 // (b^c)&=(a^b) eor w1,w1,w10,ror#19 eor w2,w2,w13,lsr#3 // sigma0(X[i+1]) add w27,w27,w16 // h+=Sigma1(e) eor w28,w28,w21 // Maj(a,b,c) eor w17,w3,w20,ror#22 // Sigma0(a) eor w1,w1,w10,lsr#10 // sigma1(X[i+14]) add w12,w12,w5 add w23,w23,w27 // d+=h add w27,w27,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round add w12,w12,w2 add w27,w27,w17 // h+=Sigma0(a) add w12,w12,w1 ldr w1,[sp,#8] str w4,[sp,#4] ror w16,w23,#6 add w26,w26,w28 // h+=K[i] ror w3,w14,#7 and w17,w24,w23 ror w2,w11,#17 bic w28,w25,w23 ror w4,w27,#2 add w26,w26,w12 // h+=X[i] eor w16,w16,w23,ror#11 eor w3,w3,w14,ror#18 orr w17,w17,w28 // Ch(e,f,g) eor w28,w27,w20 // a^b, b^c in next round eor w16,w16,w23,ror#25 // Sigma1(e) eor w4,w4,w27,ror#13 add w26,w26,w17 // h+=Ch(e,f,g) and w19,w19,w28 // (b^c)&=(a^b) eor w2,w2,w11,ror#19 eor w3,w3,w14,lsr#3 // sigma0(X[i+1]) add w26,w26,w16 // h+=Sigma1(e) eor w19,w19,w20 // Maj(a,b,c) eor w17,w4,w27,ror#22 // Sigma0(a) eor w2,w2,w11,lsr#10 // sigma1(X[i+14]) add w13,w13,w6 add w22,w22,w26 // d+=h add w26,w26,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round add w13,w13,w3 add w26,w26,w17 // h+=Sigma0(a) add w13,w13,w2 ldr w2,[sp,#12] str w5,[sp,#8] ror w16,w22,#6 add w25,w25,w19 // h+=K[i] ror w4,w15,#7 and w17,w23,w22 ror w3,w12,#17 bic w19,w24,w22 ror w5,w26,#2 add w25,w25,w13 // h+=X[i] eor w16,w16,w22,ror#11 eor w4,w4,w15,ror#18 orr w17,w17,w19 // Ch(e,f,g) eor w19,w26,w27 // a^b, b^c in next round eor w16,w16,w22,ror#25 // Sigma1(e) eor w5,w5,w26,ror#13 add w25,w25,w17 // h+=Ch(e,f,g) and w28,w28,w19 // (b^c)&=(a^b) eor w3,w3,w12,ror#19 eor w4,w4,w15,lsr#3 // sigma0(X[i+1]) add w25,w25,w16 // h+=Sigma1(e) eor w28,w28,w27 // Maj(a,b,c) eor w17,w5,w26,ror#22 // Sigma0(a) eor w3,w3,w12,lsr#10 // sigma1(X[i+14]) add w14,w14,w7 add w21,w21,w25 // d+=h add w25,w25,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round add w14,w14,w4 add w25,w25,w17 // h+=Sigma0(a) add w14,w14,w3 ldr w3,[sp,#0] str w6,[sp,#12] ror w16,w21,#6 add w24,w24,w28 // h+=K[i] ror w5,w0,#7 and w17,w22,w21 ror w4,w13,#17 bic w28,w23,w21 ror w6,w25,#2 add w24,w24,w14 // h+=X[i] eor w16,w16,w21,ror#11 eor w5,w5,w0,ror#18 orr w17,w17,w28 // Ch(e,f,g) eor w28,w25,w26 // a^b, b^c in next round eor w16,w16,w21,ror#25 // Sigma1(e) eor w6,w6,w25,ror#13 add w24,w24,w17 // h+=Ch(e,f,g) and w19,w19,w28 // (b^c)&=(a^b) eor 
w4,w4,w13,ror#19 eor w5,w5,w0,lsr#3 // sigma0(X[i+1]) add w24,w24,w16 // h+=Sigma1(e) eor w19,w19,w26 // Maj(a,b,c) eor w17,w6,w25,ror#22 // Sigma0(a) eor w4,w4,w13,lsr#10 // sigma1(X[i+14]) add w15,w15,w8 add w20,w20,w24 // d+=h add w24,w24,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round add w15,w15,w5 add w24,w24,w17 // h+=Sigma0(a) add w15,w15,w4 ldr w4,[sp,#4] str w7,[sp,#0] ror w16,w20,#6 add w23,w23,w19 // h+=K[i] ror w6,w1,#7 and w17,w21,w20 ror w5,w14,#17 bic w19,w22,w20 ror w7,w24,#2 add w23,w23,w15 // h+=X[i] eor w16,w16,w20,ror#11 eor w6,w6,w1,ror#18 orr w17,w17,w19 // Ch(e,f,g) eor w19,w24,w25 // a^b, b^c in next round eor w16,w16,w20,ror#25 // Sigma1(e) eor w7,w7,w24,ror#13 add w23,w23,w17 // h+=Ch(e,f,g) and w28,w28,w19 // (b^c)&=(a^b) eor w5,w5,w14,ror#19 eor w6,w6,w1,lsr#3 // sigma0(X[i+1]) add w23,w23,w16 // h+=Sigma1(e) eor w28,w28,w25 // Maj(a,b,c) eor w17,w7,w24,ror#22 // Sigma0(a) eor w5,w5,w14,lsr#10 // sigma1(X[i+14]) add w0,w0,w9 add w27,w27,w23 // d+=h add w23,w23,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round add w0,w0,w6 add w23,w23,w17 // h+=Sigma0(a) add w0,w0,w5 ldr w5,[sp,#8] str w8,[sp,#4] ror w16,w27,#6 add w22,w22,w28 // h+=K[i] ror w7,w2,#7 and w17,w20,w27 ror w6,w15,#17 bic w28,w21,w27 ror w8,w23,#2 add w22,w22,w0 // h+=X[i] eor w16,w16,w27,ror#11 eor w7,w7,w2,ror#18 orr w17,w17,w28 // Ch(e,f,g) eor w28,w23,w24 // a^b, b^c in next round eor w16,w16,w27,ror#25 // Sigma1(e) eor w8,w8,w23,ror#13 add w22,w22,w17 // h+=Ch(e,f,g) and w19,w19,w28 // (b^c)&=(a^b) eor w6,w6,w15,ror#19 eor w7,w7,w2,lsr#3 // sigma0(X[i+1]) add w22,w22,w16 // h+=Sigma1(e) eor w19,w19,w24 // Maj(a,b,c) eor w17,w8,w23,ror#22 // Sigma0(a) eor w6,w6,w15,lsr#10 // sigma1(X[i+14]) add w1,w1,w10 add w26,w26,w22 // d+=h add w22,w22,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round add w1,w1,w7 add w22,w22,w17 // h+=Sigma0(a) add w1,w1,w6 ldr w6,[sp,#12] str w9,[sp,#8] ror w16,w26,#6 add w21,w21,w19 // h+=K[i] ror w8,w3,#7 and w17,w27,w26 ror w7,w0,#17 bic w19,w20,w26 ror w9,w22,#2 add w21,w21,w1 // h+=X[i] eor w16,w16,w26,ror#11 eor w8,w8,w3,ror#18 orr w17,w17,w19 // Ch(e,f,g) eor w19,w22,w23 // a^b, b^c in next round eor w16,w16,w26,ror#25 // Sigma1(e) eor w9,w9,w22,ror#13 add w21,w21,w17 // h+=Ch(e,f,g) and w28,w28,w19 // (b^c)&=(a^b) eor w7,w7,w0,ror#19 eor w8,w8,w3,lsr#3 // sigma0(X[i+1]) add w21,w21,w16 // h+=Sigma1(e) eor w28,w28,w23 // Maj(a,b,c) eor w17,w9,w22,ror#22 // Sigma0(a) eor w7,w7,w0,lsr#10 // sigma1(X[i+14]) add w2,w2,w11 add w25,w25,w21 // d+=h add w21,w21,w28 // h+=Maj(a,b,c) ldr w28,[x30],#4 // *K++, w19 in next round add w2,w2,w8 add w21,w21,w17 // h+=Sigma0(a) add w2,w2,w7 ldr w7,[sp,#0] str w10,[sp,#12] ror w16,w25,#6 add w20,w20,w28 // h+=K[i] ror w9,w4,#7 and w17,w26,w25 ror w8,w1,#17 bic w28,w27,w25 ror w10,w21,#2 add w20,w20,w2 // h+=X[i] eor w16,w16,w25,ror#11 eor w9,w9,w4,ror#18 orr w17,w17,w28 // Ch(e,f,g) eor w28,w21,w22 // a^b, b^c in next round eor w16,w16,w25,ror#25 // Sigma1(e) eor w10,w10,w21,ror#13 add w20,w20,w17 // h+=Ch(e,f,g) and w19,w19,w28 // (b^c)&=(a^b) eor w8,w8,w1,ror#19 eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) add w20,w20,w16 // h+=Sigma1(e) eor w19,w19,w22 // Maj(a,b,c) eor w17,w10,w21,ror#22 // Sigma0(a) eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) add w3,w3,w12 add w24,w24,w20 // d+=h add w20,w20,w19 // h+=Maj(a,b,c) ldr w19,[x30],#4 // *K++, w28 in next round add w3,w3,w9 add w20,w20,w17 // h+=Sigma0(a) add w3,w3,w8 cbnz w19,Loop_16_xx ldp x0,x2,[x29,#96] ldr x1,[x29,#112] sub x30,x30,#260 // rewind 
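// The .long 0 terminator appended to LK256 is what ends Loop_16_xx: the final
// "ldr w19,[x30],#4" loads zero, so the cbnz above falls through. x30 has just
// been rewound past the 65-word table (260 bytes), and the state pointer (x0),
// end pointer (x2), and input pointer (x1) were reloaded from the frame. Below,
// the previous H[0..7] are reloaded from x0, added into w20-w27, and stored back;
// x1 is compared against x2 to decide whether to process another block via Loop,
// after which the callee-saved registers and the 128-byte frame are restored.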
ldp w3,w4,[x0] ldp w5,w6,[x0,#2*4] add x1,x1,#14*4 // advance input pointer ldp w7,w8,[x0,#4*4] add w20,w20,w3 ldp w9,w10,[x0,#6*4] add w21,w21,w4 add w22,w22,w5 add w23,w23,w6 stp w20,w21,[x0] add w24,w24,w7 add w25,w25,w8 stp w22,w23,[x0,#2*4] add w26,w26,w9 add w27,w27,w10 cmp x1,x2 stp w24,w25,[x0,#4*4] stp w26,w27,[x0,#6*4] b.ne Loop ldp x19,x20,[x29,#16] add sp,sp,#4*4 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#128 AARCH64_VALIDATE_LINK_REGISTER ret .section .rodata .align 6 LK256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0 //terminator .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 .text #ifndef __KERNEL__ .globl sha256_block_data_order_hw .def sha256_block_data_order_hw .type 32 .endef .align 6 sha256_block_data_order_hw: // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. AARCH64_VALID_CALL_TARGET stp x29,x30,[sp,#-16]! 
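// sha256_block_data_order_hw uses the Armv8 Cryptographic Extension: v0/v1 hold
// the eight-word state, v4-v7 hold one 64-byte message block (byte-reversed with
// rev32 on little-endian), v16/v17 carry the round constants streamed from LK256
// via x3, and v18/v19 keep a copy of the state for the final feed-forward. The
// SHA-256 instructions themselves (sha256h, sha256h2, sha256su0, sha256su1) are
// emitted as raw .long opcodes with the mnemonic in a trailing comment, most
// likely so the file still assembles with toolchains that do not accept those
// mnemonics.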
add x29,sp,#0 ld1 {v0.4s,v1.4s},[x0] adrp x3,LK256 add x3,x3,:lo12:LK256 Loop_hw: ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 sub x2,x2,#1 ld1 {v16.4s},[x3],#16 rev32 v4.16b,v4.16b rev32 v5.16b,v5.16b rev32 v6.16b,v6.16b rev32 v7.16b,v7.16b orr v18.16b,v0.16b,v0.16b // offload orr v19.16b,v1.16b,v1.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v4.4s .long 0x5e2828a4 //sha256su0 v4.16b,v5.16b orr v2.16b,v0.16b,v0.16b .long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v5.4s .long 0x5e2828c5 //sha256su0 v5.16b,v6.16b orr v2.16b,v0.16b,v0.16b .long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v6.4s .long 0x5e2828e6 //sha256su0 v6.16b,v7.16b orr v2.16b,v0.16b,v0.16b .long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v7.4s .long 0x5e282887 //sha256su0 v7.16b,v4.16b orr v2.16b,v0.16b,v0.16b .long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v4.4s .long 0x5e2828a4 //sha256su0 v4.16b,v5.16b orr v2.16b,v0.16b,v0.16b .long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v5.4s .long 0x5e2828c5 //sha256su0 v5.16b,v6.16b orr v2.16b,v0.16b,v0.16b .long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v6.4s .long 0x5e2828e6 //sha256su0 v6.16b,v7.16b orr v2.16b,v0.16b,v0.16b .long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v7.4s .long 0x5e282887 //sha256su0 v7.16b,v4.16b orr v2.16b,v0.16b,v0.16b .long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v4.4s .long 0x5e2828a4 //sha256su0 v4.16b,v5.16b orr v2.16b,v0.16b,v0.16b .long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v5.4s .long 0x5e2828c5 //sha256su0 v5.16b,v6.16b orr v2.16b,v0.16b,v0.16b .long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v6.4s .long 0x5e2828e6 //sha256su0 v6.16b,v7.16b orr v2.16b,v0.16b,v0.16b .long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v7.4s .long 0x5e282887 //sha256su0 v7.16b,v4.16b orr v2.16b,v0.16b,v0.16b .long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v4.4s orr v2.16b,v0.16b,v0.16b .long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s 
.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v5.4s orr v2.16b,v0.16b,v0.16b .long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s ld1 {v17.4s},[x3] add v16.4s,v16.4s,v6.4s sub x3,x3,#64*4-16 // rewind orr v2.16b,v0.16b,v0.16b .long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s add v17.4s,v17.4s,v7.4s orr v2.16b,v0.16b,v0.16b .long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s add v0.4s,v0.4s,v18.4s add v1.4s,v1.4s,v19.4s cbnz x2,Loop_hw st1 {v0.4s,v1.4s},[x0] ldr x29,[sp],#16 ret #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) ring-0.17.14/pregenerated/sha256-x86_64-elf.S000064400000000000000000002120231046102023000162360ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) .text .globl sha256_block_data_order_nohw .hidden sha256_block_data_order_nohw .type sha256_block_data_order_nohw,@function .align 16 sha256_block_data_order_nohw: .cfi_startproc _CET_ENDBR movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 shlq $4,%rdx subq $64+32,%rsp leaq (%rsi,%rdx,4),%rdx andq $-64,%rsp movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) movq %rax,88(%rsp) .cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 .Lprologue: movl 0(%rdi),%eax movl 4(%rdi),%ebx movl 8(%rdi),%ecx movl 12(%rdi),%edx movl 16(%rdi),%r8d movl 20(%rdi),%r9d movl 24(%rdi),%r10d movl 28(%rdi),%r11d jmp .Lloop .align 16 .Lloop: movl %ebx,%edi leaq K256(%rip),%rbp xorl %ecx,%edi movl 0(%rsi),%r12d movl %r8d,%r13d movl %eax,%r14d bswapl %r12d rorl $14,%r13d movl %r9d,%r15d xorl %r8d,%r13d rorl $9,%r14d xorl %r10d,%r15d movl %r12d,0(%rsp) xorl %eax,%r14d andl %r8d,%r15d rorl $5,%r13d addl %r11d,%r12d xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d addl %r15d,%r12d movl %eax,%r15d addl (%rbp),%r12d xorl %eax,%r14d xorl %ebx,%r15d rorl $6,%r13d movl %ebx,%r11d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d leaq 4(%rbp),%rbp addl %r14d,%r11d movl 4(%rsi),%r12d movl %edx,%r13d movl %r11d,%r14d bswapl %r12d rorl $14,%r13d movl %r8d,%edi xorl %edx,%r13d rorl $9,%r14d xorl %r9d,%edi movl %r12d,4(%rsp) xorl %r11d,%r14d andl %edx,%edi rorl $5,%r13d addl %r10d,%r12d xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d addl %edi,%r12d movl %r11d,%edi addl (%rbp),%r12d xorl %r11d,%r14d xorl %eax,%edi rorl $6,%r13d movl %eax,%r10d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d leaq 4(%rbp),%rbp addl %r14d,%r10d movl 8(%rsi),%r12d movl %ecx,%r13d movl %r10d,%r14d bswapl %r12d rorl $14,%r13d movl %edx,%r15d xorl %ecx,%r13d rorl $9,%r14d xorl %r8d,%r15d movl %r12d,8(%rsp) xorl %r10d,%r14d andl %ecx,%r15d rorl $5,%r13d addl %r9d,%r12d xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r15d addl (%rbp),%r12d xorl %r10d,%r14d xorl %r11d,%r15d rorl $6,%r13d movl %r11d,%r9d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d leaq 4(%rbp),%rbp addl %r14d,%r9d movl 12(%rsi),%r12d movl %ebx,%r13d movl %r9d,%r14d bswapl %r12d rorl $14,%r13d 
movl %ecx,%edi xorl %ebx,%r13d rorl $9,%r14d xorl %edx,%edi movl %r12d,12(%rsp) xorl %r9d,%r14d andl %ebx,%edi rorl $5,%r13d addl %r8d,%r12d xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d addl %edi,%r12d movl %r9d,%edi addl (%rbp),%r12d xorl %r9d,%r14d xorl %r10d,%edi rorl $6,%r13d movl %r10d,%r8d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d leaq 20(%rbp),%rbp addl %r14d,%r8d movl 16(%rsi),%r12d movl %eax,%r13d movl %r8d,%r14d bswapl %r12d rorl $14,%r13d movl %ebx,%r15d xorl %eax,%r13d rorl $9,%r14d xorl %ecx,%r15d movl %r12d,16(%rsp) xorl %r8d,%r14d andl %eax,%r15d rorl $5,%r13d addl %edx,%r12d xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d addl %r15d,%r12d movl %r8d,%r15d addl (%rbp),%r12d xorl %r8d,%r14d xorl %r9d,%r15d rorl $6,%r13d movl %r9d,%edx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx leaq 4(%rbp),%rbp addl %r14d,%edx movl 20(%rsi),%r12d movl %r11d,%r13d movl %edx,%r14d bswapl %r12d rorl $14,%r13d movl %eax,%edi xorl %r11d,%r13d rorl $9,%r14d xorl %ebx,%edi movl %r12d,20(%rsp) xorl %edx,%r14d andl %r11d,%edi rorl $5,%r13d addl %ecx,%r12d xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d addl %edi,%r12d movl %edx,%edi addl (%rbp),%r12d xorl %edx,%r14d xorl %r8d,%edi rorl $6,%r13d movl %r8d,%ecx andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx leaq 4(%rbp),%rbp addl %r14d,%ecx movl 24(%rsi),%r12d movl %r10d,%r13d movl %ecx,%r14d bswapl %r12d rorl $14,%r13d movl %r11d,%r15d xorl %r10d,%r13d rorl $9,%r14d xorl %eax,%r15d movl %r12d,24(%rsp) xorl %ecx,%r14d andl %r10d,%r15d rorl $5,%r13d addl %ebx,%r12d xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r15d addl (%rbp),%r12d xorl %ecx,%r14d xorl %edx,%r15d rorl $6,%r13d movl %edx,%ebx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx leaq 4(%rbp),%rbp addl %r14d,%ebx movl 28(%rsi),%r12d movl %r9d,%r13d movl %ebx,%r14d bswapl %r12d rorl $14,%r13d movl %r10d,%edi xorl %r9d,%r13d rorl $9,%r14d xorl %r11d,%edi movl %r12d,28(%rsp) xorl %ebx,%r14d andl %r9d,%edi rorl $5,%r13d addl %eax,%r12d xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d addl %edi,%r12d movl %ebx,%edi addl (%rbp),%r12d xorl %ebx,%r14d xorl %ecx,%edi rorl $6,%r13d movl %ecx,%eax andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax leaq 20(%rbp),%rbp addl %r14d,%eax movl 32(%rsi),%r12d movl %r8d,%r13d movl %eax,%r14d bswapl %r12d rorl $14,%r13d movl %r9d,%r15d xorl %r8d,%r13d rorl $9,%r14d xorl %r10d,%r15d movl %r12d,32(%rsp) xorl %eax,%r14d andl %r8d,%r15d rorl $5,%r13d addl %r11d,%r12d xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d addl %r15d,%r12d movl %eax,%r15d addl (%rbp),%r12d xorl %eax,%r14d xorl %ebx,%r15d rorl $6,%r13d movl %ebx,%r11d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d leaq 4(%rbp),%rbp addl %r14d,%r11d movl 36(%rsi),%r12d movl %edx,%r13d movl %r11d,%r14d bswapl %r12d rorl $14,%r13d movl %r8d,%edi xorl %edx,%r13d rorl $9,%r14d xorl %r9d,%edi movl %r12d,36(%rsp) xorl %r11d,%r14d andl %edx,%edi rorl $5,%r13d addl %r10d,%r12d xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d addl %edi,%r12d movl %r11d,%edi addl (%rbp),%r12d xorl %r11d,%r14d xorl %eax,%edi rorl $6,%r13d movl %eax,%r10d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d leaq 4(%rbp),%rbp addl %r14d,%r10d movl 40(%rsi),%r12d movl %ecx,%r13d movl 
%r10d,%r14d bswapl %r12d rorl $14,%r13d movl %edx,%r15d xorl %ecx,%r13d rorl $9,%r14d xorl %r8d,%r15d movl %r12d,40(%rsp) xorl %r10d,%r14d andl %ecx,%r15d rorl $5,%r13d addl %r9d,%r12d xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r15d addl (%rbp),%r12d xorl %r10d,%r14d xorl %r11d,%r15d rorl $6,%r13d movl %r11d,%r9d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d leaq 4(%rbp),%rbp addl %r14d,%r9d movl 44(%rsi),%r12d movl %ebx,%r13d movl %r9d,%r14d bswapl %r12d rorl $14,%r13d movl %ecx,%edi xorl %ebx,%r13d rorl $9,%r14d xorl %edx,%edi movl %r12d,44(%rsp) xorl %r9d,%r14d andl %ebx,%edi rorl $5,%r13d addl %r8d,%r12d xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d addl %edi,%r12d movl %r9d,%edi addl (%rbp),%r12d xorl %r9d,%r14d xorl %r10d,%edi rorl $6,%r13d movl %r10d,%r8d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d leaq 20(%rbp),%rbp addl %r14d,%r8d movl 48(%rsi),%r12d movl %eax,%r13d movl %r8d,%r14d bswapl %r12d rorl $14,%r13d movl %ebx,%r15d xorl %eax,%r13d rorl $9,%r14d xorl %ecx,%r15d movl %r12d,48(%rsp) xorl %r8d,%r14d andl %eax,%r15d rorl $5,%r13d addl %edx,%r12d xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d addl %r15d,%r12d movl %r8d,%r15d addl (%rbp),%r12d xorl %r8d,%r14d xorl %r9d,%r15d rorl $6,%r13d movl %r9d,%edx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx leaq 4(%rbp),%rbp addl %r14d,%edx movl 52(%rsi),%r12d movl %r11d,%r13d movl %edx,%r14d bswapl %r12d rorl $14,%r13d movl %eax,%edi xorl %r11d,%r13d rorl $9,%r14d xorl %ebx,%edi movl %r12d,52(%rsp) xorl %edx,%r14d andl %r11d,%edi rorl $5,%r13d addl %ecx,%r12d xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d addl %edi,%r12d movl %edx,%edi addl (%rbp),%r12d xorl %edx,%r14d xorl %r8d,%edi rorl $6,%r13d movl %r8d,%ecx andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx leaq 4(%rbp),%rbp addl %r14d,%ecx movl 56(%rsi),%r12d movl %r10d,%r13d movl %ecx,%r14d bswapl %r12d rorl $14,%r13d movl %r11d,%r15d xorl %r10d,%r13d rorl $9,%r14d xorl %eax,%r15d movl %r12d,56(%rsp) xorl %ecx,%r14d andl %r10d,%r15d rorl $5,%r13d addl %ebx,%r12d xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r15d addl (%rbp),%r12d xorl %ecx,%r14d xorl %edx,%r15d rorl $6,%r13d movl %edx,%ebx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx leaq 4(%rbp),%rbp addl %r14d,%ebx movl 60(%rsi),%r12d movl %r9d,%r13d movl %ebx,%r14d bswapl %r12d rorl $14,%r13d movl %r10d,%edi xorl %r9d,%r13d rorl $9,%r14d xorl %r11d,%edi movl %r12d,60(%rsp) xorl %ebx,%r14d andl %r9d,%edi rorl $5,%r13d addl %eax,%r12d xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d addl %edi,%r12d movl %ebx,%edi addl (%rbp),%r12d xorl %ebx,%r14d xorl %ecx,%edi rorl $6,%r13d movl %ecx,%eax andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax leaq 20(%rbp),%rbp jmp .Lrounds_16_xx .align 16 .Lrounds_16_xx: movl 4(%rsp),%r13d movl 56(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%eax movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 36(%rsp),%r12d addl 0(%rsp),%r12d movl %r8d,%r13d addl %r15d,%r12d movl %eax,%r14d rorl $14,%r13d movl %r9d,%r15d xorl %r8d,%r13d rorl $9,%r14d xorl %r10d,%r15d movl %r12d,0(%rsp) xorl %eax,%r14d andl %r8d,%r15d rorl $5,%r13d addl %r11d,%r12d 
xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d addl %r15d,%r12d movl %eax,%r15d addl (%rbp),%r12d xorl %eax,%r14d xorl %ebx,%r15d rorl $6,%r13d movl %ebx,%r11d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d leaq 4(%rbp),%rbp movl 8(%rsp),%r13d movl 60(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r11d movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 40(%rsp),%r12d addl 4(%rsp),%r12d movl %edx,%r13d addl %edi,%r12d movl %r11d,%r14d rorl $14,%r13d movl %r8d,%edi xorl %edx,%r13d rorl $9,%r14d xorl %r9d,%edi movl %r12d,4(%rsp) xorl %r11d,%r14d andl %edx,%edi rorl $5,%r13d addl %r10d,%r12d xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d addl %edi,%r12d movl %r11d,%edi addl (%rbp),%r12d xorl %r11d,%r14d xorl %eax,%edi rorl $6,%r13d movl %eax,%r10d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d leaq 4(%rbp),%rbp movl 12(%rsp),%r13d movl 0(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r10d movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 44(%rsp),%r12d addl 8(%rsp),%r12d movl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r14d rorl $14,%r13d movl %edx,%r15d xorl %ecx,%r13d rorl $9,%r14d xorl %r8d,%r15d movl %r12d,8(%rsp) xorl %r10d,%r14d andl %ecx,%r15d rorl $5,%r13d addl %r9d,%r12d xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r15d addl (%rbp),%r12d xorl %r10d,%r14d xorl %r11d,%r15d rorl $6,%r13d movl %r11d,%r9d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d leaq 4(%rbp),%rbp movl 16(%rsp),%r13d movl 4(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r9d movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 48(%rsp),%r12d addl 12(%rsp),%r12d movl %ebx,%r13d addl %edi,%r12d movl %r9d,%r14d rorl $14,%r13d movl %ecx,%edi xorl %ebx,%r13d rorl $9,%r14d xorl %edx,%edi movl %r12d,12(%rsp) xorl %r9d,%r14d andl %ebx,%edi rorl $5,%r13d addl %r8d,%r12d xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d addl %edi,%r12d movl %r9d,%edi addl (%rbp),%r12d xorl %r9d,%r14d xorl %r10d,%edi rorl $6,%r13d movl %r10d,%r8d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d leaq 20(%rbp),%rbp movl 20(%rsp),%r13d movl 8(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r8d movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 52(%rsp),%r12d addl 16(%rsp),%r12d movl %eax,%r13d addl %r15d,%r12d movl %r8d,%r14d rorl $14,%r13d movl %ebx,%r15d xorl %eax,%r13d rorl $9,%r14d xorl %ecx,%r15d movl %r12d,16(%rsp) xorl %r8d,%r14d andl %eax,%r15d rorl $5,%r13d addl %edx,%r12d xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d addl %r15d,%r12d movl %r8d,%r15d addl (%rbp),%r12d xorl %r8d,%r14d xorl %r9d,%r15d rorl $6,%r13d movl %r9d,%edx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx leaq 4(%rbp),%rbp movl 24(%rsp),%r13d movl 12(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%edx movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl 
%r14d,%edi addl 56(%rsp),%r12d addl 20(%rsp),%r12d movl %r11d,%r13d addl %edi,%r12d movl %edx,%r14d rorl $14,%r13d movl %eax,%edi xorl %r11d,%r13d rorl $9,%r14d xorl %ebx,%edi movl %r12d,20(%rsp) xorl %edx,%r14d andl %r11d,%edi rorl $5,%r13d addl %ecx,%r12d xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d addl %edi,%r12d movl %edx,%edi addl (%rbp),%r12d xorl %edx,%r14d xorl %r8d,%edi rorl $6,%r13d movl %r8d,%ecx andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx leaq 4(%rbp),%rbp movl 28(%rsp),%r13d movl 16(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%ecx movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 60(%rsp),%r12d addl 24(%rsp),%r12d movl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r14d rorl $14,%r13d movl %r11d,%r15d xorl %r10d,%r13d rorl $9,%r14d xorl %eax,%r15d movl %r12d,24(%rsp) xorl %ecx,%r14d andl %r10d,%r15d rorl $5,%r13d addl %ebx,%r12d xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r15d addl (%rbp),%r12d xorl %ecx,%r14d xorl %edx,%r15d rorl $6,%r13d movl %edx,%ebx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx leaq 4(%rbp),%rbp movl 32(%rsp),%r13d movl 20(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%ebx movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 0(%rsp),%r12d addl 28(%rsp),%r12d movl %r9d,%r13d addl %edi,%r12d movl %ebx,%r14d rorl $14,%r13d movl %r10d,%edi xorl %r9d,%r13d rorl $9,%r14d xorl %r11d,%edi movl %r12d,28(%rsp) xorl %ebx,%r14d andl %r9d,%edi rorl $5,%r13d addl %eax,%r12d xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d addl %edi,%r12d movl %ebx,%edi addl (%rbp),%r12d xorl %ebx,%r14d xorl %ecx,%edi rorl $6,%r13d movl %ecx,%eax andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax leaq 20(%rbp),%rbp movl 36(%rsp),%r13d movl 24(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%eax movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 4(%rsp),%r12d addl 32(%rsp),%r12d movl %r8d,%r13d addl %r15d,%r12d movl %eax,%r14d rorl $14,%r13d movl %r9d,%r15d xorl %r8d,%r13d rorl $9,%r14d xorl %r10d,%r15d movl %r12d,32(%rsp) xorl %eax,%r14d andl %r8d,%r15d rorl $5,%r13d addl %r11d,%r12d xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d addl %r15d,%r12d movl %eax,%r15d addl (%rbp),%r12d xorl %eax,%r14d xorl %ebx,%r15d rorl $6,%r13d movl %ebx,%r11d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d leaq 4(%rbp),%rbp movl 40(%rsp),%r13d movl 28(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r11d movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 8(%rsp),%r12d addl 36(%rsp),%r12d movl %edx,%r13d addl %edi,%r12d movl %r11d,%r14d rorl $14,%r13d movl %r8d,%edi xorl %edx,%r13d rorl $9,%r14d xorl %r9d,%edi movl %r12d,36(%rsp) xorl %r11d,%r14d andl %edx,%edi rorl $5,%r13d addl %r10d,%r12d xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d addl %edi,%r12d movl %r11d,%edi addl (%rbp),%r12d xorl %r11d,%r14d xorl %eax,%edi rorl $6,%r13d movl %eax,%r10d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r10d addl %r12d,%ecx addl 
%r12d,%r10d leaq 4(%rbp),%rbp movl 44(%rsp),%r13d movl 32(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r10d movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 12(%rsp),%r12d addl 40(%rsp),%r12d movl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r14d rorl $14,%r13d movl %edx,%r15d xorl %ecx,%r13d rorl $9,%r14d xorl %r8d,%r15d movl %r12d,40(%rsp) xorl %r10d,%r14d andl %ecx,%r15d rorl $5,%r13d addl %r9d,%r12d xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r15d addl (%rbp),%r12d xorl %r10d,%r14d xorl %r11d,%r15d rorl $6,%r13d movl %r11d,%r9d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d leaq 4(%rbp),%rbp movl 48(%rsp),%r13d movl 36(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r9d movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 16(%rsp),%r12d addl 44(%rsp),%r12d movl %ebx,%r13d addl %edi,%r12d movl %r9d,%r14d rorl $14,%r13d movl %ecx,%edi xorl %ebx,%r13d rorl $9,%r14d xorl %edx,%edi movl %r12d,44(%rsp) xorl %r9d,%r14d andl %ebx,%edi rorl $5,%r13d addl %r8d,%r12d xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d addl %edi,%r12d movl %r9d,%edi addl (%rbp),%r12d xorl %r9d,%r14d xorl %r10d,%edi rorl $6,%r13d movl %r10d,%r8d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d leaq 20(%rbp),%rbp movl 52(%rsp),%r13d movl 40(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r8d movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 20(%rsp),%r12d addl 48(%rsp),%r12d movl %eax,%r13d addl %r15d,%r12d movl %r8d,%r14d rorl $14,%r13d movl %ebx,%r15d xorl %eax,%r13d rorl $9,%r14d xorl %ecx,%r15d movl %r12d,48(%rsp) xorl %r8d,%r14d andl %eax,%r15d rorl $5,%r13d addl %edx,%r12d xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d addl %r15d,%r12d movl %r8d,%r15d addl (%rbp),%r12d xorl %r8d,%r14d xorl %r9d,%r15d rorl $6,%r13d movl %r9d,%edx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx leaq 4(%rbp),%rbp movl 56(%rsp),%r13d movl 44(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%edx movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 24(%rsp),%r12d addl 52(%rsp),%r12d movl %r11d,%r13d addl %edi,%r12d movl %edx,%r14d rorl $14,%r13d movl %eax,%edi xorl %r11d,%r13d rorl $9,%r14d xorl %ebx,%edi movl %r12d,52(%rsp) xorl %edx,%r14d andl %r11d,%edi rorl $5,%r13d addl %ecx,%r12d xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d addl %edi,%r12d movl %edx,%edi addl (%rbp),%r12d xorl %edx,%r14d xorl %r8d,%edi rorl $6,%r13d movl %r8d,%ecx andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx leaq 4(%rbp),%rbp movl 60(%rsp),%r13d movl 48(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%ecx movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 28(%rsp),%r12d addl 56(%rsp),%r12d movl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r14d rorl $14,%r13d movl %r11d,%r15d xorl %r10d,%r13d rorl $9,%r14d xorl %eax,%r15d movl %r12d,56(%rsp) xorl %ecx,%r14d andl %r10d,%r15d rorl 
$5,%r13d addl %ebx,%r12d xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r15d addl (%rbp),%r12d xorl %ecx,%r14d xorl %edx,%r15d rorl $6,%r13d movl %edx,%ebx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx leaq 4(%rbp),%rbp movl 0(%rsp),%r13d movl 52(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%ebx movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 32(%rsp),%r12d addl 60(%rsp),%r12d movl %r9d,%r13d addl %edi,%r12d movl %ebx,%r14d rorl $14,%r13d movl %r10d,%edi xorl %r9d,%r13d rorl $9,%r14d xorl %r11d,%edi movl %r12d,60(%rsp) xorl %ebx,%r14d andl %r9d,%edi rorl $5,%r13d addl %eax,%r12d xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d addl %edi,%r12d movl %ebx,%edi addl (%rbp),%r12d xorl %ebx,%r14d xorl %ecx,%edi rorl $6,%r13d movl %ecx,%eax andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax leaq 20(%rbp),%rbp cmpb $0,3(%rbp) jnz .Lrounds_16_xx movq 64+0(%rsp),%rdi addl %r14d,%eax leaq 64(%rsi),%rsi addl 0(%rdi),%eax addl 4(%rdi),%ebx addl 8(%rdi),%ecx addl 12(%rdi),%edx addl 16(%rdi),%r8d addl 20(%rdi),%r9d addl 24(%rdi),%r10d addl 28(%rdi),%r11d cmpq 64+16(%rsp),%rsi movl %eax,0(%rdi) movl %ebx,4(%rdi) movl %ecx,8(%rdi) movl %edx,12(%rdi) movl %r8d,16(%rdi) movl %r9d,20(%rdi) movl %r10d,24(%rdi) movl %r11d,28(%rdi) jb .Lloop movq 88(%rsp),%rsi .cfi_def_cfa %rsi,8 movq -48(%rsi),%r15 .cfi_restore %r15 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbp .cfi_restore %rbp movq -8(%rsi),%rbx .cfi_restore %rbx leaq (%rsi),%rsp .cfi_def_cfa_register %rsp .Lepilogue: ret .cfi_endproc .size sha256_block_data_order_nohw,.-sha256_block_data_order_nohw .section .rodata .align 64 .type K256,@object K256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 
0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .text .globl sha256_block_data_order_hw .hidden sha256_block_data_order_hw .type sha256_block_data_order_hw,@function .align 64 sha256_block_data_order_hw: .cfi_startproc _CET_ENDBR leaq K256+128(%rip),%rcx movdqu (%rdi),%xmm1 movdqu 16(%rdi),%xmm2 movdqa 512-128(%rcx),%xmm7 pshufd $0x1b,%xmm1,%xmm0 pshufd $0xb1,%xmm1,%xmm1 pshufd $0x1b,%xmm2,%xmm2 movdqa %xmm7,%xmm8 .byte 102,15,58,15,202,8 punpcklqdq %xmm0,%xmm2 jmp .Loop_shaext .align 16 .Loop_shaext: movdqu (%rsi),%xmm3 movdqu 16(%rsi),%xmm4 movdqu 32(%rsi),%xmm5 .byte 102,15,56,0,223 movdqu 48(%rsi),%xmm6 movdqa 0-128(%rcx),%xmm0 paddd %xmm3,%xmm0 .byte 102,15,56,0,231 movdqa %xmm2,%xmm10 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 nop movdqa %xmm1,%xmm9 .byte 15,56,203,202 movdqa 32-128(%rcx),%xmm0 paddd %xmm4,%xmm0 .byte 102,15,56,0,239 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 leaq 64(%rsi),%rsi .byte 15,56,204,220 .byte 15,56,203,202 movdqa 64-128(%rcx),%xmm0 paddd %xmm5,%xmm0 .byte 102,15,56,0,247 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop paddd %xmm7,%xmm3 .byte 15,56,204,229 .byte 15,56,203,202 movdqa 96-128(%rcx),%xmm0 paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop paddd %xmm7,%xmm4 .byte 15,56,204,238 .byte 15,56,203,202 movdqa 128-128(%rcx),%xmm0 paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop paddd %xmm7,%xmm5 .byte 15,56,204,243 .byte 15,56,203,202 movdqa 160-128(%rcx),%xmm0 paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 nop paddd %xmm7,%xmm6 .byte 15,56,204,220 .byte 15,56,203,202 movdqa 192-128(%rcx),%xmm0 paddd %xmm5,%xmm0 .byte 15,56,205,245 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop paddd %xmm7,%xmm3 .byte 15,56,204,229 .byte 15,56,203,202 movdqa 224-128(%rcx),%xmm0 paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop paddd %xmm7,%xmm4 .byte 15,56,204,238 .byte 15,56,203,202 movdqa 256-128(%rcx),%xmm0 paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop paddd %xmm7,%xmm5 .byte 15,56,204,243 .byte 15,56,203,202 movdqa 288-128(%rcx),%xmm0 paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 nop paddd %xmm7,%xmm6 .byte 15,56,204,220 .byte 15,56,203,202 movdqa 320-128(%rcx),%xmm0 paddd %xmm5,%xmm0 .byte 15,56,205,245 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop paddd %xmm7,%xmm3 .byte 15,56,204,229 .byte 15,56,203,202 movdqa 352-128(%rcx),%xmm0 paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop paddd 
%xmm7,%xmm4 .byte 15,56,204,238 .byte 15,56,203,202 movdqa 384-128(%rcx),%xmm0 paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop paddd %xmm7,%xmm5 .byte 15,56,204,243 .byte 15,56,203,202 movdqa 416-128(%rcx),%xmm0 paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 .byte 15,56,203,202 paddd %xmm7,%xmm6 movdqa 448-128(%rcx),%xmm0 paddd %xmm5,%xmm0 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 .byte 15,56,205,245 movdqa %xmm8,%xmm7 .byte 15,56,203,202 movdqa 480-128(%rcx),%xmm0 paddd %xmm6,%xmm0 nop .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 decq %rdx nop .byte 15,56,203,202 paddd %xmm10,%xmm2 paddd %xmm9,%xmm1 jnz .Loop_shaext pshufd $0xb1,%xmm2,%xmm2 pshufd $0x1b,%xmm1,%xmm7 pshufd $0xb1,%xmm1,%xmm1 punpckhqdq %xmm2,%xmm1 .byte 102,15,58,15,215,8 movdqu %xmm1,(%rdi) movdqu %xmm2,16(%rdi) ret .cfi_endproc .size sha256_block_data_order_hw,.-sha256_block_data_order_hw .globl sha256_block_data_order_ssse3 .hidden sha256_block_data_order_ssse3 .type sha256_block_data_order_ssse3,@function .align 64 sha256_block_data_order_ssse3: .cfi_startproc _CET_ENDBR movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 shlq $4,%rdx subq $96,%rsp leaq (%rsi,%rdx,4),%rdx andq $-64,%rsp movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) movq %rax,88(%rsp) .cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 .Lprologue_ssse3: movl 0(%rdi),%eax movl 4(%rdi),%ebx movl 8(%rdi),%ecx movl 12(%rdi),%edx movl 16(%rdi),%r8d movl 20(%rdi),%r9d movl 24(%rdi),%r10d movl 28(%rdi),%r11d jmp .Lloop_ssse3 .align 16 .Lloop_ssse3: movdqa K256+512(%rip),%xmm7 movdqu 0(%rsi),%xmm0 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 .byte 102,15,56,0,199 movdqu 48(%rsi),%xmm3 leaq K256(%rip),%rbp .byte 102,15,56,0,207 movdqa 0(%rbp),%xmm4 movdqa 32(%rbp),%xmm5 .byte 102,15,56,0,215 paddd %xmm0,%xmm4 movdqa 64(%rbp),%xmm6 .byte 102,15,56,0,223 movdqa 96(%rbp),%xmm7 paddd %xmm1,%xmm5 paddd %xmm2,%xmm6 paddd %xmm3,%xmm7 movdqa %xmm4,0(%rsp) movl %eax,%r14d movdqa %xmm5,16(%rsp) movl %ebx,%edi movdqa %xmm6,32(%rsp) xorl %ecx,%edi movdqa %xmm7,48(%rsp) movl %r8d,%r13d jmp .Lssse3_00_47 .align 16 .Lssse3_00_47: subq $-128,%rbp rorl $14,%r13d movdqa %xmm1,%xmm4 movl %r14d,%eax movl %r9d,%r12d movdqa %xmm3,%xmm7 rorl $9,%r14d xorl %r8d,%r13d xorl %r10d,%r12d rorl $5,%r13d xorl %eax,%r14d .byte 102,15,58,15,224,4 andl %r8d,%r12d xorl %r8d,%r13d .byte 102,15,58,15,250,4 addl 0(%rsp),%r11d movl %eax,%r15d xorl %r10d,%r12d rorl $11,%r14d movdqa %xmm4,%xmm5 xorl %ebx,%r15d addl %r12d,%r11d movdqa %xmm4,%xmm6 rorl $6,%r13d andl %r15d,%edi psrld $3,%xmm4 xorl %eax,%r14d addl %r13d,%r11d xorl %ebx,%edi paddd %xmm7,%xmm0 rorl $2,%r14d addl %r11d,%edx psrld $7,%xmm6 addl %edi,%r11d movl %edx,%r13d pshufd $250,%xmm3,%xmm7 addl %r11d,%r14d rorl $14,%r13d pslld $14,%xmm5 movl %r14d,%r11d movl %r8d,%r12d pxor %xmm6,%xmm4 rorl $9,%r14d xorl %edx,%r13d xorl %r9d,%r12d rorl $5,%r13d psrld $11,%xmm6 xorl %r11d,%r14d pxor %xmm5,%xmm4 andl %edx,%r12d xorl %edx,%r13d pslld $11,%xmm5 addl 4(%rsp),%r10d movl %r11d,%edi pxor %xmm6,%xmm4 xorl %r9d,%r12d rorl $11,%r14d movdqa %xmm7,%xmm6 xorl %eax,%edi addl %r12d,%r10d pxor %xmm5,%xmm4 rorl $6,%r13d andl %edi,%r15d xorl %r11d,%r14d psrld $10,%xmm7 addl %r13d,%r10d xorl 
%eax,%r15d paddd %xmm4,%xmm0 rorl $2,%r14d addl %r10d,%ecx psrlq $17,%xmm6 addl %r15d,%r10d movl %ecx,%r13d addl %r10d,%r14d pxor %xmm6,%xmm7 rorl $14,%r13d movl %r14d,%r10d movl %edx,%r12d rorl $9,%r14d psrlq $2,%xmm6 xorl %ecx,%r13d xorl %r8d,%r12d pxor %xmm6,%xmm7 rorl $5,%r13d xorl %r10d,%r14d andl %ecx,%r12d pshufd $128,%xmm7,%xmm7 xorl %ecx,%r13d addl 8(%rsp),%r9d movl %r10d,%r15d psrldq $8,%xmm7 xorl %r8d,%r12d rorl $11,%r14d xorl %r11d,%r15d addl %r12d,%r9d rorl $6,%r13d paddd %xmm7,%xmm0 andl %r15d,%edi xorl %r10d,%r14d addl %r13d,%r9d pshufd $80,%xmm0,%xmm7 xorl %r11d,%edi rorl $2,%r14d addl %r9d,%ebx movdqa %xmm7,%xmm6 addl %edi,%r9d movl %ebx,%r13d psrld $10,%xmm7 addl %r9d,%r14d rorl $14,%r13d psrlq $17,%xmm6 movl %r14d,%r9d movl %ecx,%r12d pxor %xmm6,%xmm7 rorl $9,%r14d xorl %ebx,%r13d xorl %edx,%r12d rorl $5,%r13d xorl %r9d,%r14d psrlq $2,%xmm6 andl %ebx,%r12d xorl %ebx,%r13d addl 12(%rsp),%r8d pxor %xmm6,%xmm7 movl %r9d,%edi xorl %edx,%r12d rorl $11,%r14d pshufd $8,%xmm7,%xmm7 xorl %r10d,%edi addl %r12d,%r8d movdqa 0(%rbp),%xmm6 rorl $6,%r13d andl %edi,%r15d pslldq $8,%xmm7 xorl %r9d,%r14d addl %r13d,%r8d xorl %r10d,%r15d paddd %xmm7,%xmm0 rorl $2,%r14d addl %r8d,%eax addl %r15d,%r8d paddd %xmm0,%xmm6 movl %eax,%r13d addl %r8d,%r14d movdqa %xmm6,0(%rsp) rorl $14,%r13d movdqa %xmm2,%xmm4 movl %r14d,%r8d movl %ebx,%r12d movdqa %xmm0,%xmm7 rorl $9,%r14d xorl %eax,%r13d xorl %ecx,%r12d rorl $5,%r13d xorl %r8d,%r14d .byte 102,15,58,15,225,4 andl %eax,%r12d xorl %eax,%r13d .byte 102,15,58,15,251,4 addl 16(%rsp),%edx movl %r8d,%r15d xorl %ecx,%r12d rorl $11,%r14d movdqa %xmm4,%xmm5 xorl %r9d,%r15d addl %r12d,%edx movdqa %xmm4,%xmm6 rorl $6,%r13d andl %r15d,%edi psrld $3,%xmm4 xorl %r8d,%r14d addl %r13d,%edx xorl %r9d,%edi paddd %xmm7,%xmm1 rorl $2,%r14d addl %edx,%r11d psrld $7,%xmm6 addl %edi,%edx movl %r11d,%r13d pshufd $250,%xmm0,%xmm7 addl %edx,%r14d rorl $14,%r13d pslld $14,%xmm5 movl %r14d,%edx movl %eax,%r12d pxor %xmm6,%xmm4 rorl $9,%r14d xorl %r11d,%r13d xorl %ebx,%r12d rorl $5,%r13d psrld $11,%xmm6 xorl %edx,%r14d pxor %xmm5,%xmm4 andl %r11d,%r12d xorl %r11d,%r13d pslld $11,%xmm5 addl 20(%rsp),%ecx movl %edx,%edi pxor %xmm6,%xmm4 xorl %ebx,%r12d rorl $11,%r14d movdqa %xmm7,%xmm6 xorl %r8d,%edi addl %r12d,%ecx pxor %xmm5,%xmm4 rorl $6,%r13d andl %edi,%r15d xorl %edx,%r14d psrld $10,%xmm7 addl %r13d,%ecx xorl %r8d,%r15d paddd %xmm4,%xmm1 rorl $2,%r14d addl %ecx,%r10d psrlq $17,%xmm6 addl %r15d,%ecx movl %r10d,%r13d addl %ecx,%r14d pxor %xmm6,%xmm7 rorl $14,%r13d movl %r14d,%ecx movl %r11d,%r12d rorl $9,%r14d psrlq $2,%xmm6 xorl %r10d,%r13d xorl %eax,%r12d pxor %xmm6,%xmm7 rorl $5,%r13d xorl %ecx,%r14d andl %r10d,%r12d pshufd $128,%xmm7,%xmm7 xorl %r10d,%r13d addl 24(%rsp),%ebx movl %ecx,%r15d psrldq $8,%xmm7 xorl %eax,%r12d rorl $11,%r14d xorl %edx,%r15d addl %r12d,%ebx rorl $6,%r13d paddd %xmm7,%xmm1 andl %r15d,%edi xorl %ecx,%r14d addl %r13d,%ebx pshufd $80,%xmm1,%xmm7 xorl %edx,%edi rorl $2,%r14d addl %ebx,%r9d movdqa %xmm7,%xmm6 addl %edi,%ebx movl %r9d,%r13d psrld $10,%xmm7 addl %ebx,%r14d rorl $14,%r13d psrlq $17,%xmm6 movl %r14d,%ebx movl %r10d,%r12d pxor %xmm6,%xmm7 rorl $9,%r14d xorl %r9d,%r13d xorl %r11d,%r12d rorl $5,%r13d xorl %ebx,%r14d psrlq $2,%xmm6 andl %r9d,%r12d xorl %r9d,%r13d addl 28(%rsp),%eax pxor %xmm6,%xmm7 movl %ebx,%edi xorl %r11d,%r12d rorl $11,%r14d pshufd $8,%xmm7,%xmm7 xorl %ecx,%edi addl %r12d,%eax movdqa 32(%rbp),%xmm6 rorl $6,%r13d andl %edi,%r15d pslldq $8,%xmm7 xorl %ebx,%r14d addl %r13d,%eax xorl %ecx,%r15d paddd %xmm7,%xmm1 rorl 
$2,%r14d addl %eax,%r8d addl %r15d,%eax paddd %xmm1,%xmm6 movl %r8d,%r13d addl %eax,%r14d movdqa %xmm6,16(%rsp) rorl $14,%r13d movdqa %xmm3,%xmm4 movl %r14d,%eax movl %r9d,%r12d movdqa %xmm1,%xmm7 rorl $9,%r14d xorl %r8d,%r13d xorl %r10d,%r12d rorl $5,%r13d xorl %eax,%r14d .byte 102,15,58,15,226,4 andl %r8d,%r12d xorl %r8d,%r13d .byte 102,15,58,15,248,4 addl 32(%rsp),%r11d movl %eax,%r15d xorl %r10d,%r12d rorl $11,%r14d movdqa %xmm4,%xmm5 xorl %ebx,%r15d addl %r12d,%r11d movdqa %xmm4,%xmm6 rorl $6,%r13d andl %r15d,%edi psrld $3,%xmm4 xorl %eax,%r14d addl %r13d,%r11d xorl %ebx,%edi paddd %xmm7,%xmm2 rorl $2,%r14d addl %r11d,%edx psrld $7,%xmm6 addl %edi,%r11d movl %edx,%r13d pshufd $250,%xmm1,%xmm7 addl %r11d,%r14d rorl $14,%r13d pslld $14,%xmm5 movl %r14d,%r11d movl %r8d,%r12d pxor %xmm6,%xmm4 rorl $9,%r14d xorl %edx,%r13d xorl %r9d,%r12d rorl $5,%r13d psrld $11,%xmm6 xorl %r11d,%r14d pxor %xmm5,%xmm4 andl %edx,%r12d xorl %edx,%r13d pslld $11,%xmm5 addl 36(%rsp),%r10d movl %r11d,%edi pxor %xmm6,%xmm4 xorl %r9d,%r12d rorl $11,%r14d movdqa %xmm7,%xmm6 xorl %eax,%edi addl %r12d,%r10d pxor %xmm5,%xmm4 rorl $6,%r13d andl %edi,%r15d xorl %r11d,%r14d psrld $10,%xmm7 addl %r13d,%r10d xorl %eax,%r15d paddd %xmm4,%xmm2 rorl $2,%r14d addl %r10d,%ecx psrlq $17,%xmm6 addl %r15d,%r10d movl %ecx,%r13d addl %r10d,%r14d pxor %xmm6,%xmm7 rorl $14,%r13d movl %r14d,%r10d movl %edx,%r12d rorl $9,%r14d psrlq $2,%xmm6 xorl %ecx,%r13d xorl %r8d,%r12d pxor %xmm6,%xmm7 rorl $5,%r13d xorl %r10d,%r14d andl %ecx,%r12d pshufd $128,%xmm7,%xmm7 xorl %ecx,%r13d addl 40(%rsp),%r9d movl %r10d,%r15d psrldq $8,%xmm7 xorl %r8d,%r12d rorl $11,%r14d xorl %r11d,%r15d addl %r12d,%r9d rorl $6,%r13d paddd %xmm7,%xmm2 andl %r15d,%edi xorl %r10d,%r14d addl %r13d,%r9d pshufd $80,%xmm2,%xmm7 xorl %r11d,%edi rorl $2,%r14d addl %r9d,%ebx movdqa %xmm7,%xmm6 addl %edi,%r9d movl %ebx,%r13d psrld $10,%xmm7 addl %r9d,%r14d rorl $14,%r13d psrlq $17,%xmm6 movl %r14d,%r9d movl %ecx,%r12d pxor %xmm6,%xmm7 rorl $9,%r14d xorl %ebx,%r13d xorl %edx,%r12d rorl $5,%r13d xorl %r9d,%r14d psrlq $2,%xmm6 andl %ebx,%r12d xorl %ebx,%r13d addl 44(%rsp),%r8d pxor %xmm6,%xmm7 movl %r9d,%edi xorl %edx,%r12d rorl $11,%r14d pshufd $8,%xmm7,%xmm7 xorl %r10d,%edi addl %r12d,%r8d movdqa 64(%rbp),%xmm6 rorl $6,%r13d andl %edi,%r15d pslldq $8,%xmm7 xorl %r9d,%r14d addl %r13d,%r8d xorl %r10d,%r15d paddd %xmm7,%xmm2 rorl $2,%r14d addl %r8d,%eax addl %r15d,%r8d paddd %xmm2,%xmm6 movl %eax,%r13d addl %r8d,%r14d movdqa %xmm6,32(%rsp) rorl $14,%r13d movdqa %xmm0,%xmm4 movl %r14d,%r8d movl %ebx,%r12d movdqa %xmm2,%xmm7 rorl $9,%r14d xorl %eax,%r13d xorl %ecx,%r12d rorl $5,%r13d xorl %r8d,%r14d .byte 102,15,58,15,227,4 andl %eax,%r12d xorl %eax,%r13d .byte 102,15,58,15,249,4 addl 48(%rsp),%edx movl %r8d,%r15d xorl %ecx,%r12d rorl $11,%r14d movdqa %xmm4,%xmm5 xorl %r9d,%r15d addl %r12d,%edx movdqa %xmm4,%xmm6 rorl $6,%r13d andl %r15d,%edi psrld $3,%xmm4 xorl %r8d,%r14d addl %r13d,%edx xorl %r9d,%edi paddd %xmm7,%xmm3 rorl $2,%r14d addl %edx,%r11d psrld $7,%xmm6 addl %edi,%edx movl %r11d,%r13d pshufd $250,%xmm2,%xmm7 addl %edx,%r14d rorl $14,%r13d pslld $14,%xmm5 movl %r14d,%edx movl %eax,%r12d pxor %xmm6,%xmm4 rorl $9,%r14d xorl %r11d,%r13d xorl %ebx,%r12d rorl $5,%r13d psrld $11,%xmm6 xorl %edx,%r14d pxor %xmm5,%xmm4 andl %r11d,%r12d xorl %r11d,%r13d pslld $11,%xmm5 addl 52(%rsp),%ecx movl %edx,%edi pxor %xmm6,%xmm4 xorl %ebx,%r12d rorl $11,%r14d movdqa %xmm7,%xmm6 xorl %r8d,%edi addl %r12d,%ecx pxor %xmm5,%xmm4 rorl $6,%r13d andl %edi,%r15d xorl %edx,%r14d psrld $10,%xmm7 addl 
%r13d,%ecx xorl %r8d,%r15d paddd %xmm4,%xmm3 rorl $2,%r14d addl %ecx,%r10d psrlq $17,%xmm6 addl %r15d,%ecx movl %r10d,%r13d addl %ecx,%r14d pxor %xmm6,%xmm7 rorl $14,%r13d movl %r14d,%ecx movl %r11d,%r12d rorl $9,%r14d psrlq $2,%xmm6 xorl %r10d,%r13d xorl %eax,%r12d pxor %xmm6,%xmm7 rorl $5,%r13d xorl %ecx,%r14d andl %r10d,%r12d pshufd $128,%xmm7,%xmm7 xorl %r10d,%r13d addl 56(%rsp),%ebx movl %ecx,%r15d psrldq $8,%xmm7 xorl %eax,%r12d rorl $11,%r14d xorl %edx,%r15d addl %r12d,%ebx rorl $6,%r13d paddd %xmm7,%xmm3 andl %r15d,%edi xorl %ecx,%r14d addl %r13d,%ebx pshufd $80,%xmm3,%xmm7 xorl %edx,%edi rorl $2,%r14d addl %ebx,%r9d movdqa %xmm7,%xmm6 addl %edi,%ebx movl %r9d,%r13d psrld $10,%xmm7 addl %ebx,%r14d rorl $14,%r13d psrlq $17,%xmm6 movl %r14d,%ebx movl %r10d,%r12d pxor %xmm6,%xmm7 rorl $9,%r14d xorl %r9d,%r13d xorl %r11d,%r12d rorl $5,%r13d xorl %ebx,%r14d psrlq $2,%xmm6 andl %r9d,%r12d xorl %r9d,%r13d addl 60(%rsp),%eax pxor %xmm6,%xmm7 movl %ebx,%edi xorl %r11d,%r12d rorl $11,%r14d pshufd $8,%xmm7,%xmm7 xorl %ecx,%edi addl %r12d,%eax movdqa 96(%rbp),%xmm6 rorl $6,%r13d andl %edi,%r15d pslldq $8,%xmm7 xorl %ebx,%r14d addl %r13d,%eax xorl %ecx,%r15d paddd %xmm7,%xmm3 rorl $2,%r14d addl %eax,%r8d addl %r15d,%eax paddd %xmm3,%xmm6 movl %r8d,%r13d addl %eax,%r14d movdqa %xmm6,48(%rsp) cmpb $0,131(%rbp) jne .Lssse3_00_47 rorl $14,%r13d movl %r14d,%eax movl %r9d,%r12d rorl $9,%r14d xorl %r8d,%r13d xorl %r10d,%r12d rorl $5,%r13d xorl %eax,%r14d andl %r8d,%r12d xorl %r8d,%r13d addl 0(%rsp),%r11d movl %eax,%r15d xorl %r10d,%r12d rorl $11,%r14d xorl %ebx,%r15d addl %r12d,%r11d rorl $6,%r13d andl %r15d,%edi xorl %eax,%r14d addl %r13d,%r11d xorl %ebx,%edi rorl $2,%r14d addl %r11d,%edx addl %edi,%r11d movl %edx,%r13d addl %r11d,%r14d rorl $14,%r13d movl %r14d,%r11d movl %r8d,%r12d rorl $9,%r14d xorl %edx,%r13d xorl %r9d,%r12d rorl $5,%r13d xorl %r11d,%r14d andl %edx,%r12d xorl %edx,%r13d addl 4(%rsp),%r10d movl %r11d,%edi xorl %r9d,%r12d rorl $11,%r14d xorl %eax,%edi addl %r12d,%r10d rorl $6,%r13d andl %edi,%r15d xorl %r11d,%r14d addl %r13d,%r10d xorl %eax,%r15d rorl $2,%r14d addl %r10d,%ecx addl %r15d,%r10d movl %ecx,%r13d addl %r10d,%r14d rorl $14,%r13d movl %r14d,%r10d movl %edx,%r12d rorl $9,%r14d xorl %ecx,%r13d xorl %r8d,%r12d rorl $5,%r13d xorl %r10d,%r14d andl %ecx,%r12d xorl %ecx,%r13d addl 8(%rsp),%r9d movl %r10d,%r15d xorl %r8d,%r12d rorl $11,%r14d xorl %r11d,%r15d addl %r12d,%r9d rorl $6,%r13d andl %r15d,%edi xorl %r10d,%r14d addl %r13d,%r9d xorl %r11d,%edi rorl $2,%r14d addl %r9d,%ebx addl %edi,%r9d movl %ebx,%r13d addl %r9d,%r14d rorl $14,%r13d movl %r14d,%r9d movl %ecx,%r12d rorl $9,%r14d xorl %ebx,%r13d xorl %edx,%r12d rorl $5,%r13d xorl %r9d,%r14d andl %ebx,%r12d xorl %ebx,%r13d addl 12(%rsp),%r8d movl %r9d,%edi xorl %edx,%r12d rorl $11,%r14d xorl %r10d,%edi addl %r12d,%r8d rorl $6,%r13d andl %edi,%r15d xorl %r9d,%r14d addl %r13d,%r8d xorl %r10d,%r15d rorl $2,%r14d addl %r8d,%eax addl %r15d,%r8d movl %eax,%r13d addl %r8d,%r14d rorl $14,%r13d movl %r14d,%r8d movl %ebx,%r12d rorl $9,%r14d xorl %eax,%r13d xorl %ecx,%r12d rorl $5,%r13d xorl %r8d,%r14d andl %eax,%r12d xorl %eax,%r13d addl 16(%rsp),%edx movl %r8d,%r15d xorl %ecx,%r12d rorl $11,%r14d xorl %r9d,%r15d addl %r12d,%edx rorl $6,%r13d andl %r15d,%edi xorl %r8d,%r14d addl %r13d,%edx xorl %r9d,%edi rorl $2,%r14d addl %edx,%r11d addl %edi,%edx movl %r11d,%r13d addl %edx,%r14d rorl $14,%r13d movl %r14d,%edx movl %eax,%r12d rorl $9,%r14d xorl %r11d,%r13d xorl %ebx,%r12d rorl $5,%r13d xorl %edx,%r14d andl %r11d,%r12d xorl 
%r11d,%r13d addl 20(%rsp),%ecx movl %edx,%edi xorl %ebx,%r12d rorl $11,%r14d xorl %r8d,%edi addl %r12d,%ecx rorl $6,%r13d andl %edi,%r15d xorl %edx,%r14d addl %r13d,%ecx xorl %r8d,%r15d rorl $2,%r14d addl %ecx,%r10d addl %r15d,%ecx movl %r10d,%r13d addl %ecx,%r14d rorl $14,%r13d movl %r14d,%ecx movl %r11d,%r12d rorl $9,%r14d xorl %r10d,%r13d xorl %eax,%r12d rorl $5,%r13d xorl %ecx,%r14d andl %r10d,%r12d xorl %r10d,%r13d addl 24(%rsp),%ebx movl %ecx,%r15d xorl %eax,%r12d rorl $11,%r14d xorl %edx,%r15d addl %r12d,%ebx rorl $6,%r13d andl %r15d,%edi xorl %ecx,%r14d addl %r13d,%ebx xorl %edx,%edi rorl $2,%r14d addl %ebx,%r9d addl %edi,%ebx movl %r9d,%r13d addl %ebx,%r14d rorl $14,%r13d movl %r14d,%ebx movl %r10d,%r12d rorl $9,%r14d xorl %r9d,%r13d xorl %r11d,%r12d rorl $5,%r13d xorl %ebx,%r14d andl %r9d,%r12d xorl %r9d,%r13d addl 28(%rsp),%eax movl %ebx,%edi xorl %r11d,%r12d rorl $11,%r14d xorl %ecx,%edi addl %r12d,%eax rorl $6,%r13d andl %edi,%r15d xorl %ebx,%r14d addl %r13d,%eax xorl %ecx,%r15d rorl $2,%r14d addl %eax,%r8d addl %r15d,%eax movl %r8d,%r13d addl %eax,%r14d rorl $14,%r13d movl %r14d,%eax movl %r9d,%r12d rorl $9,%r14d xorl %r8d,%r13d xorl %r10d,%r12d rorl $5,%r13d xorl %eax,%r14d andl %r8d,%r12d xorl %r8d,%r13d addl 32(%rsp),%r11d movl %eax,%r15d xorl %r10d,%r12d rorl $11,%r14d xorl %ebx,%r15d addl %r12d,%r11d rorl $6,%r13d andl %r15d,%edi xorl %eax,%r14d addl %r13d,%r11d xorl %ebx,%edi rorl $2,%r14d addl %r11d,%edx addl %edi,%r11d movl %edx,%r13d addl %r11d,%r14d rorl $14,%r13d movl %r14d,%r11d movl %r8d,%r12d rorl $9,%r14d xorl %edx,%r13d xorl %r9d,%r12d rorl $5,%r13d xorl %r11d,%r14d andl %edx,%r12d xorl %edx,%r13d addl 36(%rsp),%r10d movl %r11d,%edi xorl %r9d,%r12d rorl $11,%r14d xorl %eax,%edi addl %r12d,%r10d rorl $6,%r13d andl %edi,%r15d xorl %r11d,%r14d addl %r13d,%r10d xorl %eax,%r15d rorl $2,%r14d addl %r10d,%ecx addl %r15d,%r10d movl %ecx,%r13d addl %r10d,%r14d rorl $14,%r13d movl %r14d,%r10d movl %edx,%r12d rorl $9,%r14d xorl %ecx,%r13d xorl %r8d,%r12d rorl $5,%r13d xorl %r10d,%r14d andl %ecx,%r12d xorl %ecx,%r13d addl 40(%rsp),%r9d movl %r10d,%r15d xorl %r8d,%r12d rorl $11,%r14d xorl %r11d,%r15d addl %r12d,%r9d rorl $6,%r13d andl %r15d,%edi xorl %r10d,%r14d addl %r13d,%r9d xorl %r11d,%edi rorl $2,%r14d addl %r9d,%ebx addl %edi,%r9d movl %ebx,%r13d addl %r9d,%r14d rorl $14,%r13d movl %r14d,%r9d movl %ecx,%r12d rorl $9,%r14d xorl %ebx,%r13d xorl %edx,%r12d rorl $5,%r13d xorl %r9d,%r14d andl %ebx,%r12d xorl %ebx,%r13d addl 44(%rsp),%r8d movl %r9d,%edi xorl %edx,%r12d rorl $11,%r14d xorl %r10d,%edi addl %r12d,%r8d rorl $6,%r13d andl %edi,%r15d xorl %r9d,%r14d addl %r13d,%r8d xorl %r10d,%r15d rorl $2,%r14d addl %r8d,%eax addl %r15d,%r8d movl %eax,%r13d addl %r8d,%r14d rorl $14,%r13d movl %r14d,%r8d movl %ebx,%r12d rorl $9,%r14d xorl %eax,%r13d xorl %ecx,%r12d rorl $5,%r13d xorl %r8d,%r14d andl %eax,%r12d xorl %eax,%r13d addl 48(%rsp),%edx movl %r8d,%r15d xorl %ecx,%r12d rorl $11,%r14d xorl %r9d,%r15d addl %r12d,%edx rorl $6,%r13d andl %r15d,%edi xorl %r8d,%r14d addl %r13d,%edx xorl %r9d,%edi rorl $2,%r14d addl %edx,%r11d addl %edi,%edx movl %r11d,%r13d addl %edx,%r14d rorl $14,%r13d movl %r14d,%edx movl %eax,%r12d rorl $9,%r14d xorl %r11d,%r13d xorl %ebx,%r12d rorl $5,%r13d xorl %edx,%r14d andl %r11d,%r12d xorl %r11d,%r13d addl 52(%rsp),%ecx movl %edx,%edi xorl %ebx,%r12d rorl $11,%r14d xorl %r8d,%edi addl %r12d,%ecx rorl $6,%r13d andl %edi,%r15d xorl %edx,%r14d addl %r13d,%ecx xorl %r8d,%r15d rorl $2,%r14d addl %ecx,%r10d addl %r15d,%ecx movl %r10d,%r13d addl %ecx,%r14d 
rorl $14,%r13d movl %r14d,%ecx movl %r11d,%r12d rorl $9,%r14d xorl %r10d,%r13d xorl %eax,%r12d rorl $5,%r13d xorl %ecx,%r14d andl %r10d,%r12d xorl %r10d,%r13d addl 56(%rsp),%ebx movl %ecx,%r15d xorl %eax,%r12d rorl $11,%r14d xorl %edx,%r15d addl %r12d,%ebx rorl $6,%r13d andl %r15d,%edi xorl %ecx,%r14d addl %r13d,%ebx xorl %edx,%edi rorl $2,%r14d addl %ebx,%r9d addl %edi,%ebx movl %r9d,%r13d addl %ebx,%r14d rorl $14,%r13d movl %r14d,%ebx movl %r10d,%r12d rorl $9,%r14d xorl %r9d,%r13d xorl %r11d,%r12d rorl $5,%r13d xorl %ebx,%r14d andl %r9d,%r12d xorl %r9d,%r13d addl 60(%rsp),%eax movl %ebx,%edi xorl %r11d,%r12d rorl $11,%r14d xorl %ecx,%edi addl %r12d,%eax rorl $6,%r13d andl %edi,%r15d xorl %ebx,%r14d addl %r13d,%eax xorl %ecx,%r15d rorl $2,%r14d addl %eax,%r8d addl %r15d,%eax movl %r8d,%r13d addl %eax,%r14d movq 64+0(%rsp),%rdi movl %r14d,%eax addl 0(%rdi),%eax leaq 64(%rsi),%rsi addl 4(%rdi),%ebx addl 8(%rdi),%ecx addl 12(%rdi),%edx addl 16(%rdi),%r8d addl 20(%rdi),%r9d addl 24(%rdi),%r10d addl 28(%rdi),%r11d cmpq 64+16(%rsp),%rsi movl %eax,0(%rdi) movl %ebx,4(%rdi) movl %ecx,8(%rdi) movl %edx,12(%rdi) movl %r8d,16(%rdi) movl %r9d,20(%rdi) movl %r10d,24(%rdi) movl %r11d,28(%rdi) jb .Lloop_ssse3 movq 88(%rsp),%rsi .cfi_def_cfa %rsi,8 movq -48(%rsi),%r15 .cfi_restore %r15 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbp .cfi_restore %rbp movq -8(%rsi),%rbx .cfi_restore %rbx leaq (%rsi),%rsp .cfi_def_cfa_register %rsp .Lepilogue_ssse3: ret .cfi_endproc .size sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3 .globl sha256_block_data_order_avx .hidden sha256_block_data_order_avx .type sha256_block_data_order_avx,@function .align 64 sha256_block_data_order_avx: .cfi_startproc _CET_ENDBR movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 shlq $4,%rdx subq $96,%rsp leaq (%rsi,%rdx,4),%rdx andq $-64,%rsp movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) movq %rax,88(%rsp) .cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 .Lprologue_avx: vzeroupper movl 0(%rdi),%eax movl 4(%rdi),%ebx movl 8(%rdi),%ecx movl 12(%rdi),%edx movl 16(%rdi),%r8d movl 20(%rdi),%r9d movl 24(%rdi),%r10d movl 28(%rdi),%r11d vmovdqa K256+512+32(%rip),%xmm8 vmovdqa K256+512+64(%rip),%xmm9 jmp .Lloop_avx .align 16 .Lloop_avx: vmovdqa K256+512(%rip),%xmm7 vmovdqu 0(%rsi),%xmm0 vmovdqu 16(%rsi),%xmm1 vmovdqu 32(%rsi),%xmm2 vmovdqu 48(%rsi),%xmm3 vpshufb %xmm7,%xmm0,%xmm0 leaq K256(%rip),%rbp vpshufb %xmm7,%xmm1,%xmm1 vpshufb %xmm7,%xmm2,%xmm2 vpaddd 0(%rbp),%xmm0,%xmm4 vpshufb %xmm7,%xmm3,%xmm3 vpaddd 32(%rbp),%xmm1,%xmm5 vpaddd 64(%rbp),%xmm2,%xmm6 vpaddd 96(%rbp),%xmm3,%xmm7 vmovdqa %xmm4,0(%rsp) movl %eax,%r14d vmovdqa %xmm5,16(%rsp) movl %ebx,%edi vmovdqa %xmm6,32(%rsp) xorl %ecx,%edi vmovdqa %xmm7,48(%rsp) movl %r8d,%r13d jmp .Lavx_00_47 .align 16 .Lavx_00_47: subq $-128,%rbp vpalignr $4,%xmm0,%xmm1,%xmm4 shrdl $14,%r13d,%r13d movl %r14d,%eax movl %r9d,%r12d vpalignr $4,%xmm2,%xmm3,%xmm7 shrdl $9,%r14d,%r14d xorl %r8d,%r13d xorl %r10d,%r12d vpsrld $7,%xmm4,%xmm6 shrdl $5,%r13d,%r13d xorl %eax,%r14d andl %r8d,%r12d vpaddd %xmm7,%xmm0,%xmm0 xorl %r8d,%r13d addl 0(%rsp),%r11d movl %eax,%r15d vpsrld $3,%xmm4,%xmm7 xorl %r10d,%r12d shrdl $11,%r14d,%r14d xorl %ebx,%r15d vpslld $14,%xmm4,%xmm5 addl %r12d,%r11d shrdl $6,%r13d,%r13d andl 
%r15d,%edi vpxor %xmm6,%xmm7,%xmm4 xorl %eax,%r14d addl %r13d,%r11d xorl %ebx,%edi vpshufd $250,%xmm3,%xmm7 shrdl $2,%r14d,%r14d addl %r11d,%edx addl %edi,%r11d vpsrld $11,%xmm6,%xmm6 movl %edx,%r13d addl %r11d,%r14d shrdl $14,%r13d,%r13d vpxor %xmm5,%xmm4,%xmm4 movl %r14d,%r11d movl %r8d,%r12d shrdl $9,%r14d,%r14d vpslld $11,%xmm5,%xmm5 xorl %edx,%r13d xorl %r9d,%r12d shrdl $5,%r13d,%r13d vpxor %xmm6,%xmm4,%xmm4 xorl %r11d,%r14d andl %edx,%r12d xorl %edx,%r13d vpsrld $10,%xmm7,%xmm6 addl 4(%rsp),%r10d movl %r11d,%edi xorl %r9d,%r12d vpxor %xmm5,%xmm4,%xmm4 shrdl $11,%r14d,%r14d xorl %eax,%edi addl %r12d,%r10d vpsrlq $17,%xmm7,%xmm7 shrdl $6,%r13d,%r13d andl %edi,%r15d xorl %r11d,%r14d vpaddd %xmm4,%xmm0,%xmm0 addl %r13d,%r10d xorl %eax,%r15d shrdl $2,%r14d,%r14d vpxor %xmm7,%xmm6,%xmm6 addl %r10d,%ecx addl %r15d,%r10d movl %ecx,%r13d vpsrlq $2,%xmm7,%xmm7 addl %r10d,%r14d shrdl $14,%r13d,%r13d movl %r14d,%r10d vpxor %xmm7,%xmm6,%xmm6 movl %edx,%r12d shrdl $9,%r14d,%r14d xorl %ecx,%r13d vpshufb %xmm8,%xmm6,%xmm6 xorl %r8d,%r12d shrdl $5,%r13d,%r13d xorl %r10d,%r14d vpaddd %xmm6,%xmm0,%xmm0 andl %ecx,%r12d xorl %ecx,%r13d addl 8(%rsp),%r9d vpshufd $80,%xmm0,%xmm7 movl %r10d,%r15d xorl %r8d,%r12d shrdl $11,%r14d,%r14d vpsrld $10,%xmm7,%xmm6 xorl %r11d,%r15d addl %r12d,%r9d shrdl $6,%r13d,%r13d vpsrlq $17,%xmm7,%xmm7 andl %r15d,%edi xorl %r10d,%r14d addl %r13d,%r9d vpxor %xmm7,%xmm6,%xmm6 xorl %r11d,%edi shrdl $2,%r14d,%r14d addl %r9d,%ebx vpsrlq $2,%xmm7,%xmm7 addl %edi,%r9d movl %ebx,%r13d addl %r9d,%r14d vpxor %xmm7,%xmm6,%xmm6 shrdl $14,%r13d,%r13d movl %r14d,%r9d movl %ecx,%r12d vpshufb %xmm9,%xmm6,%xmm6 shrdl $9,%r14d,%r14d xorl %ebx,%r13d xorl %edx,%r12d vpaddd %xmm6,%xmm0,%xmm0 shrdl $5,%r13d,%r13d xorl %r9d,%r14d andl %ebx,%r12d vpaddd 0(%rbp),%xmm0,%xmm6 xorl %ebx,%r13d addl 12(%rsp),%r8d movl %r9d,%edi xorl %edx,%r12d shrdl $11,%r14d,%r14d xorl %r10d,%edi addl %r12d,%r8d shrdl $6,%r13d,%r13d andl %edi,%r15d xorl %r9d,%r14d addl %r13d,%r8d xorl %r10d,%r15d shrdl $2,%r14d,%r14d addl %r8d,%eax addl %r15d,%r8d movl %eax,%r13d addl %r8d,%r14d vmovdqa %xmm6,0(%rsp) vpalignr $4,%xmm1,%xmm2,%xmm4 shrdl $14,%r13d,%r13d movl %r14d,%r8d movl %ebx,%r12d vpalignr $4,%xmm3,%xmm0,%xmm7 shrdl $9,%r14d,%r14d xorl %eax,%r13d xorl %ecx,%r12d vpsrld $7,%xmm4,%xmm6 shrdl $5,%r13d,%r13d xorl %r8d,%r14d andl %eax,%r12d vpaddd %xmm7,%xmm1,%xmm1 xorl %eax,%r13d addl 16(%rsp),%edx movl %r8d,%r15d vpsrld $3,%xmm4,%xmm7 xorl %ecx,%r12d shrdl $11,%r14d,%r14d xorl %r9d,%r15d vpslld $14,%xmm4,%xmm5 addl %r12d,%edx shrdl $6,%r13d,%r13d andl %r15d,%edi vpxor %xmm6,%xmm7,%xmm4 xorl %r8d,%r14d addl %r13d,%edx xorl %r9d,%edi vpshufd $250,%xmm0,%xmm7 shrdl $2,%r14d,%r14d addl %edx,%r11d addl %edi,%edx vpsrld $11,%xmm6,%xmm6 movl %r11d,%r13d addl %edx,%r14d shrdl $14,%r13d,%r13d vpxor %xmm5,%xmm4,%xmm4 movl %r14d,%edx movl %eax,%r12d shrdl $9,%r14d,%r14d vpslld $11,%xmm5,%xmm5 xorl %r11d,%r13d xorl %ebx,%r12d shrdl $5,%r13d,%r13d vpxor %xmm6,%xmm4,%xmm4 xorl %edx,%r14d andl %r11d,%r12d xorl %r11d,%r13d vpsrld $10,%xmm7,%xmm6 addl 20(%rsp),%ecx movl %edx,%edi xorl %ebx,%r12d vpxor %xmm5,%xmm4,%xmm4 shrdl $11,%r14d,%r14d xorl %r8d,%edi addl %r12d,%ecx vpsrlq $17,%xmm7,%xmm7 shrdl $6,%r13d,%r13d andl %edi,%r15d xorl %edx,%r14d vpaddd %xmm4,%xmm1,%xmm1 addl %r13d,%ecx xorl %r8d,%r15d shrdl $2,%r14d,%r14d vpxor %xmm7,%xmm6,%xmm6 addl %ecx,%r10d addl %r15d,%ecx movl %r10d,%r13d vpsrlq $2,%xmm7,%xmm7 addl %ecx,%r14d shrdl $14,%r13d,%r13d movl %r14d,%ecx vpxor %xmm7,%xmm6,%xmm6 movl %r11d,%r12d shrdl $9,%r14d,%r14d xorl 
%r10d,%r13d vpshufb %xmm8,%xmm6,%xmm6 xorl %eax,%r12d shrdl $5,%r13d,%r13d xorl %ecx,%r14d vpaddd %xmm6,%xmm1,%xmm1 andl %r10d,%r12d xorl %r10d,%r13d addl 24(%rsp),%ebx vpshufd $80,%xmm1,%xmm7 movl %ecx,%r15d xorl %eax,%r12d shrdl $11,%r14d,%r14d vpsrld $10,%xmm7,%xmm6 xorl %edx,%r15d addl %r12d,%ebx shrdl $6,%r13d,%r13d vpsrlq $17,%xmm7,%xmm7 andl %r15d,%edi xorl %ecx,%r14d addl %r13d,%ebx vpxor %xmm7,%xmm6,%xmm6 xorl %edx,%edi shrdl $2,%r14d,%r14d addl %ebx,%r9d vpsrlq $2,%xmm7,%xmm7 addl %edi,%ebx movl %r9d,%r13d addl %ebx,%r14d vpxor %xmm7,%xmm6,%xmm6 shrdl $14,%r13d,%r13d movl %r14d,%ebx movl %r10d,%r12d vpshufb %xmm9,%xmm6,%xmm6 shrdl $9,%r14d,%r14d xorl %r9d,%r13d xorl %r11d,%r12d vpaddd %xmm6,%xmm1,%xmm1 shrdl $5,%r13d,%r13d xorl %ebx,%r14d andl %r9d,%r12d vpaddd 32(%rbp),%xmm1,%xmm6 xorl %r9d,%r13d addl 28(%rsp),%eax movl %ebx,%edi xorl %r11d,%r12d shrdl $11,%r14d,%r14d xorl %ecx,%edi addl %r12d,%eax shrdl $6,%r13d,%r13d andl %edi,%r15d xorl %ebx,%r14d addl %r13d,%eax xorl %ecx,%r15d shrdl $2,%r14d,%r14d addl %eax,%r8d addl %r15d,%eax movl %r8d,%r13d addl %eax,%r14d vmovdqa %xmm6,16(%rsp) vpalignr $4,%xmm2,%xmm3,%xmm4 shrdl $14,%r13d,%r13d movl %r14d,%eax movl %r9d,%r12d vpalignr $4,%xmm0,%xmm1,%xmm7 shrdl $9,%r14d,%r14d xorl %r8d,%r13d xorl %r10d,%r12d vpsrld $7,%xmm4,%xmm6 shrdl $5,%r13d,%r13d xorl %eax,%r14d andl %r8d,%r12d vpaddd %xmm7,%xmm2,%xmm2 xorl %r8d,%r13d addl 32(%rsp),%r11d movl %eax,%r15d vpsrld $3,%xmm4,%xmm7 xorl %r10d,%r12d shrdl $11,%r14d,%r14d xorl %ebx,%r15d vpslld $14,%xmm4,%xmm5 addl %r12d,%r11d shrdl $6,%r13d,%r13d andl %r15d,%edi vpxor %xmm6,%xmm7,%xmm4 xorl %eax,%r14d addl %r13d,%r11d xorl %ebx,%edi vpshufd $250,%xmm1,%xmm7 shrdl $2,%r14d,%r14d addl %r11d,%edx addl %edi,%r11d vpsrld $11,%xmm6,%xmm6 movl %edx,%r13d addl %r11d,%r14d shrdl $14,%r13d,%r13d vpxor %xmm5,%xmm4,%xmm4 movl %r14d,%r11d movl %r8d,%r12d shrdl $9,%r14d,%r14d vpslld $11,%xmm5,%xmm5 xorl %edx,%r13d xorl %r9d,%r12d shrdl $5,%r13d,%r13d vpxor %xmm6,%xmm4,%xmm4 xorl %r11d,%r14d andl %edx,%r12d xorl %edx,%r13d vpsrld $10,%xmm7,%xmm6 addl 36(%rsp),%r10d movl %r11d,%edi xorl %r9d,%r12d vpxor %xmm5,%xmm4,%xmm4 shrdl $11,%r14d,%r14d xorl %eax,%edi addl %r12d,%r10d vpsrlq $17,%xmm7,%xmm7 shrdl $6,%r13d,%r13d andl %edi,%r15d xorl %r11d,%r14d vpaddd %xmm4,%xmm2,%xmm2 addl %r13d,%r10d xorl %eax,%r15d shrdl $2,%r14d,%r14d vpxor %xmm7,%xmm6,%xmm6 addl %r10d,%ecx addl %r15d,%r10d movl %ecx,%r13d vpsrlq $2,%xmm7,%xmm7 addl %r10d,%r14d shrdl $14,%r13d,%r13d movl %r14d,%r10d vpxor %xmm7,%xmm6,%xmm6 movl %edx,%r12d shrdl $9,%r14d,%r14d xorl %ecx,%r13d vpshufb %xmm8,%xmm6,%xmm6 xorl %r8d,%r12d shrdl $5,%r13d,%r13d xorl %r10d,%r14d vpaddd %xmm6,%xmm2,%xmm2 andl %ecx,%r12d xorl %ecx,%r13d addl 40(%rsp),%r9d vpshufd $80,%xmm2,%xmm7 movl %r10d,%r15d xorl %r8d,%r12d shrdl $11,%r14d,%r14d vpsrld $10,%xmm7,%xmm6 xorl %r11d,%r15d addl %r12d,%r9d shrdl $6,%r13d,%r13d vpsrlq $17,%xmm7,%xmm7 andl %r15d,%edi xorl %r10d,%r14d addl %r13d,%r9d vpxor %xmm7,%xmm6,%xmm6 xorl %r11d,%edi shrdl $2,%r14d,%r14d addl %r9d,%ebx vpsrlq $2,%xmm7,%xmm7 addl %edi,%r9d movl %ebx,%r13d addl %r9d,%r14d vpxor %xmm7,%xmm6,%xmm6 shrdl $14,%r13d,%r13d movl %r14d,%r9d movl %ecx,%r12d vpshufb %xmm9,%xmm6,%xmm6 shrdl $9,%r14d,%r14d xorl %ebx,%r13d xorl %edx,%r12d vpaddd %xmm6,%xmm2,%xmm2 shrdl $5,%r13d,%r13d xorl %r9d,%r14d andl %ebx,%r12d vpaddd 64(%rbp),%xmm2,%xmm6 xorl %ebx,%r13d addl 44(%rsp),%r8d movl %r9d,%edi xorl %edx,%r12d shrdl $11,%r14d,%r14d xorl %r10d,%edi addl %r12d,%r8d shrdl $6,%r13d,%r13d andl %edi,%r15d xorl %r9d,%r14d addl 
%r13d,%r8d xorl %r10d,%r15d shrdl $2,%r14d,%r14d addl %r8d,%eax addl %r15d,%r8d movl %eax,%r13d addl %r8d,%r14d vmovdqa %xmm6,32(%rsp) vpalignr $4,%xmm3,%xmm0,%xmm4 shrdl $14,%r13d,%r13d movl %r14d,%r8d movl %ebx,%r12d vpalignr $4,%xmm1,%xmm2,%xmm7 shrdl $9,%r14d,%r14d xorl %eax,%r13d xorl %ecx,%r12d vpsrld $7,%xmm4,%xmm6 shrdl $5,%r13d,%r13d xorl %r8d,%r14d andl %eax,%r12d vpaddd %xmm7,%xmm3,%xmm3 xorl %eax,%r13d addl 48(%rsp),%edx movl %r8d,%r15d vpsrld $3,%xmm4,%xmm7 xorl %ecx,%r12d shrdl $11,%r14d,%r14d xorl %r9d,%r15d vpslld $14,%xmm4,%xmm5 addl %r12d,%edx shrdl $6,%r13d,%r13d andl %r15d,%edi vpxor %xmm6,%xmm7,%xmm4 xorl %r8d,%r14d addl %r13d,%edx xorl %r9d,%edi vpshufd $250,%xmm2,%xmm7 shrdl $2,%r14d,%r14d addl %edx,%r11d addl %edi,%edx vpsrld $11,%xmm6,%xmm6 movl %r11d,%r13d addl %edx,%r14d shrdl $14,%r13d,%r13d vpxor %xmm5,%xmm4,%xmm4 movl %r14d,%edx movl %eax,%r12d shrdl $9,%r14d,%r14d vpslld $11,%xmm5,%xmm5 xorl %r11d,%r13d xorl %ebx,%r12d shrdl $5,%r13d,%r13d vpxor %xmm6,%xmm4,%xmm4 xorl %edx,%r14d andl %r11d,%r12d xorl %r11d,%r13d vpsrld $10,%xmm7,%xmm6 addl 52(%rsp),%ecx movl %edx,%edi xorl %ebx,%r12d vpxor %xmm5,%xmm4,%xmm4 shrdl $11,%r14d,%r14d xorl %r8d,%edi addl %r12d,%ecx vpsrlq $17,%xmm7,%xmm7 shrdl $6,%r13d,%r13d andl %edi,%r15d xorl %edx,%r14d vpaddd %xmm4,%xmm3,%xmm3 addl %r13d,%ecx xorl %r8d,%r15d shrdl $2,%r14d,%r14d vpxor %xmm7,%xmm6,%xmm6 addl %ecx,%r10d addl %r15d,%ecx movl %r10d,%r13d vpsrlq $2,%xmm7,%xmm7 addl %ecx,%r14d shrdl $14,%r13d,%r13d movl %r14d,%ecx vpxor %xmm7,%xmm6,%xmm6 movl %r11d,%r12d shrdl $9,%r14d,%r14d xorl %r10d,%r13d vpshufb %xmm8,%xmm6,%xmm6 xorl %eax,%r12d shrdl $5,%r13d,%r13d xorl %ecx,%r14d vpaddd %xmm6,%xmm3,%xmm3 andl %r10d,%r12d xorl %r10d,%r13d addl 56(%rsp),%ebx vpshufd $80,%xmm3,%xmm7 movl %ecx,%r15d xorl %eax,%r12d shrdl $11,%r14d,%r14d vpsrld $10,%xmm7,%xmm6 xorl %edx,%r15d addl %r12d,%ebx shrdl $6,%r13d,%r13d vpsrlq $17,%xmm7,%xmm7 andl %r15d,%edi xorl %ecx,%r14d addl %r13d,%ebx vpxor %xmm7,%xmm6,%xmm6 xorl %edx,%edi shrdl $2,%r14d,%r14d addl %ebx,%r9d vpsrlq $2,%xmm7,%xmm7 addl %edi,%ebx movl %r9d,%r13d addl %ebx,%r14d vpxor %xmm7,%xmm6,%xmm6 shrdl $14,%r13d,%r13d movl %r14d,%ebx movl %r10d,%r12d vpshufb %xmm9,%xmm6,%xmm6 shrdl $9,%r14d,%r14d xorl %r9d,%r13d xorl %r11d,%r12d vpaddd %xmm6,%xmm3,%xmm3 shrdl $5,%r13d,%r13d xorl %ebx,%r14d andl %r9d,%r12d vpaddd 96(%rbp),%xmm3,%xmm6 xorl %r9d,%r13d addl 60(%rsp),%eax movl %ebx,%edi xorl %r11d,%r12d shrdl $11,%r14d,%r14d xorl %ecx,%edi addl %r12d,%eax shrdl $6,%r13d,%r13d andl %edi,%r15d xorl %ebx,%r14d addl %r13d,%eax xorl %ecx,%r15d shrdl $2,%r14d,%r14d addl %eax,%r8d addl %r15d,%eax movl %r8d,%r13d addl %eax,%r14d vmovdqa %xmm6,48(%rsp) cmpb $0,131(%rbp) jne .Lavx_00_47 shrdl $14,%r13d,%r13d movl %r14d,%eax movl %r9d,%r12d shrdl $9,%r14d,%r14d xorl %r8d,%r13d xorl %r10d,%r12d shrdl $5,%r13d,%r13d xorl %eax,%r14d andl %r8d,%r12d xorl %r8d,%r13d addl 0(%rsp),%r11d movl %eax,%r15d xorl %r10d,%r12d shrdl $11,%r14d,%r14d xorl %ebx,%r15d addl %r12d,%r11d shrdl $6,%r13d,%r13d andl %r15d,%edi xorl %eax,%r14d addl %r13d,%r11d xorl %ebx,%edi shrdl $2,%r14d,%r14d addl %r11d,%edx addl %edi,%r11d movl %edx,%r13d addl %r11d,%r14d shrdl $14,%r13d,%r13d movl %r14d,%r11d movl %r8d,%r12d shrdl $9,%r14d,%r14d xorl %edx,%r13d xorl %r9d,%r12d shrdl $5,%r13d,%r13d xorl %r11d,%r14d andl %edx,%r12d xorl %edx,%r13d addl 4(%rsp),%r10d movl %r11d,%edi xorl %r9d,%r12d shrdl $11,%r14d,%r14d xorl %eax,%edi addl %r12d,%r10d shrdl $6,%r13d,%r13d andl %edi,%r15d xorl %r11d,%r14d addl %r13d,%r10d xorl %eax,%r15d 
shrdl $2,%r14d,%r14d addl %r10d,%ecx addl %r15d,%r10d movl %ecx,%r13d addl %r10d,%r14d shrdl $14,%r13d,%r13d movl %r14d,%r10d movl %edx,%r12d shrdl $9,%r14d,%r14d xorl %ecx,%r13d xorl %r8d,%r12d shrdl $5,%r13d,%r13d xorl %r10d,%r14d andl %ecx,%r12d xorl %ecx,%r13d addl 8(%rsp),%r9d movl %r10d,%r15d xorl %r8d,%r12d shrdl $11,%r14d,%r14d xorl %r11d,%r15d addl %r12d,%r9d shrdl $6,%r13d,%r13d andl %r15d,%edi xorl %r10d,%r14d addl %r13d,%r9d xorl %r11d,%edi shrdl $2,%r14d,%r14d addl %r9d,%ebx addl %edi,%r9d movl %ebx,%r13d addl %r9d,%r14d shrdl $14,%r13d,%r13d movl %r14d,%r9d movl %ecx,%r12d shrdl $9,%r14d,%r14d xorl %ebx,%r13d xorl %edx,%r12d shrdl $5,%r13d,%r13d xorl %r9d,%r14d andl %ebx,%r12d xorl %ebx,%r13d addl 12(%rsp),%r8d movl %r9d,%edi xorl %edx,%r12d shrdl $11,%r14d,%r14d xorl %r10d,%edi addl %r12d,%r8d shrdl $6,%r13d,%r13d andl %edi,%r15d xorl %r9d,%r14d addl %r13d,%r8d xorl %r10d,%r15d shrdl $2,%r14d,%r14d addl %r8d,%eax addl %r15d,%r8d movl %eax,%r13d addl %r8d,%r14d shrdl $14,%r13d,%r13d movl %r14d,%r8d movl %ebx,%r12d shrdl $9,%r14d,%r14d xorl %eax,%r13d xorl %ecx,%r12d shrdl $5,%r13d,%r13d xorl %r8d,%r14d andl %eax,%r12d xorl %eax,%r13d addl 16(%rsp),%edx movl %r8d,%r15d xorl %ecx,%r12d shrdl $11,%r14d,%r14d xorl %r9d,%r15d addl %r12d,%edx shrdl $6,%r13d,%r13d andl %r15d,%edi xorl %r8d,%r14d addl %r13d,%edx xorl %r9d,%edi shrdl $2,%r14d,%r14d addl %edx,%r11d addl %edi,%edx movl %r11d,%r13d addl %edx,%r14d shrdl $14,%r13d,%r13d movl %r14d,%edx movl %eax,%r12d shrdl $9,%r14d,%r14d xorl %r11d,%r13d xorl %ebx,%r12d shrdl $5,%r13d,%r13d xorl %edx,%r14d andl %r11d,%r12d xorl %r11d,%r13d addl 20(%rsp),%ecx movl %edx,%edi xorl %ebx,%r12d shrdl $11,%r14d,%r14d xorl %r8d,%edi addl %r12d,%ecx shrdl $6,%r13d,%r13d andl %edi,%r15d xorl %edx,%r14d addl %r13d,%ecx xorl %r8d,%r15d shrdl $2,%r14d,%r14d addl %ecx,%r10d addl %r15d,%ecx movl %r10d,%r13d addl %ecx,%r14d shrdl $14,%r13d,%r13d movl %r14d,%ecx movl %r11d,%r12d shrdl $9,%r14d,%r14d xorl %r10d,%r13d xorl %eax,%r12d shrdl $5,%r13d,%r13d xorl %ecx,%r14d andl %r10d,%r12d xorl %r10d,%r13d addl 24(%rsp),%ebx movl %ecx,%r15d xorl %eax,%r12d shrdl $11,%r14d,%r14d xorl %edx,%r15d addl %r12d,%ebx shrdl $6,%r13d,%r13d andl %r15d,%edi xorl %ecx,%r14d addl %r13d,%ebx xorl %edx,%edi shrdl $2,%r14d,%r14d addl %ebx,%r9d addl %edi,%ebx movl %r9d,%r13d addl %ebx,%r14d shrdl $14,%r13d,%r13d movl %r14d,%ebx movl %r10d,%r12d shrdl $9,%r14d,%r14d xorl %r9d,%r13d xorl %r11d,%r12d shrdl $5,%r13d,%r13d xorl %ebx,%r14d andl %r9d,%r12d xorl %r9d,%r13d addl 28(%rsp),%eax movl %ebx,%edi xorl %r11d,%r12d shrdl $11,%r14d,%r14d xorl %ecx,%edi addl %r12d,%eax shrdl $6,%r13d,%r13d andl %edi,%r15d xorl %ebx,%r14d addl %r13d,%eax xorl %ecx,%r15d shrdl $2,%r14d,%r14d addl %eax,%r8d addl %r15d,%eax movl %r8d,%r13d addl %eax,%r14d shrdl $14,%r13d,%r13d movl %r14d,%eax movl %r9d,%r12d shrdl $9,%r14d,%r14d xorl %r8d,%r13d xorl %r10d,%r12d shrdl $5,%r13d,%r13d xorl %eax,%r14d andl %r8d,%r12d xorl %r8d,%r13d addl 32(%rsp),%r11d movl %eax,%r15d xorl %r10d,%r12d shrdl $11,%r14d,%r14d xorl %ebx,%r15d addl %r12d,%r11d shrdl $6,%r13d,%r13d andl %r15d,%edi xorl %eax,%r14d addl %r13d,%r11d xorl %ebx,%edi shrdl $2,%r14d,%r14d addl %r11d,%edx addl %edi,%r11d movl %edx,%r13d addl %r11d,%r14d shrdl $14,%r13d,%r13d movl %r14d,%r11d movl %r8d,%r12d shrdl $9,%r14d,%r14d xorl %edx,%r13d xorl %r9d,%r12d shrdl $5,%r13d,%r13d xorl %r11d,%r14d andl %edx,%r12d xorl %edx,%r13d addl 36(%rsp),%r10d movl %r11d,%edi xorl %r9d,%r12d shrdl $11,%r14d,%r14d xorl %eax,%edi addl %r12d,%r10d shrdl 
$6,%r13d,%r13d andl %edi,%r15d xorl %r11d,%r14d addl %r13d,%r10d xorl %eax,%r15d shrdl $2,%r14d,%r14d addl %r10d,%ecx addl %r15d,%r10d movl %ecx,%r13d addl %r10d,%r14d shrdl $14,%r13d,%r13d movl %r14d,%r10d movl %edx,%r12d shrdl $9,%r14d,%r14d xorl %ecx,%r13d xorl %r8d,%r12d shrdl $5,%r13d,%r13d xorl %r10d,%r14d andl %ecx,%r12d xorl %ecx,%r13d addl 40(%rsp),%r9d movl %r10d,%r15d xorl %r8d,%r12d shrdl $11,%r14d,%r14d xorl %r11d,%r15d addl %r12d,%r9d shrdl $6,%r13d,%r13d andl %r15d,%edi xorl %r10d,%r14d addl %r13d,%r9d xorl %r11d,%edi shrdl $2,%r14d,%r14d addl %r9d,%ebx addl %edi,%r9d movl %ebx,%r13d addl %r9d,%r14d shrdl $14,%r13d,%r13d movl %r14d,%r9d movl %ecx,%r12d shrdl $9,%r14d,%r14d xorl %ebx,%r13d xorl %edx,%r12d shrdl $5,%r13d,%r13d xorl %r9d,%r14d andl %ebx,%r12d xorl %ebx,%r13d addl 44(%rsp),%r8d movl %r9d,%edi xorl %edx,%r12d shrdl $11,%r14d,%r14d xorl %r10d,%edi addl %r12d,%r8d shrdl $6,%r13d,%r13d andl %edi,%r15d xorl %r9d,%r14d addl %r13d,%r8d xorl %r10d,%r15d shrdl $2,%r14d,%r14d addl %r8d,%eax addl %r15d,%r8d movl %eax,%r13d addl %r8d,%r14d shrdl $14,%r13d,%r13d movl %r14d,%r8d movl %ebx,%r12d shrdl $9,%r14d,%r14d xorl %eax,%r13d xorl %ecx,%r12d shrdl $5,%r13d,%r13d xorl %r8d,%r14d andl %eax,%r12d xorl %eax,%r13d addl 48(%rsp),%edx movl %r8d,%r15d xorl %ecx,%r12d shrdl $11,%r14d,%r14d xorl %r9d,%r15d addl %r12d,%edx shrdl $6,%r13d,%r13d andl %r15d,%edi xorl %r8d,%r14d addl %r13d,%edx xorl %r9d,%edi shrdl $2,%r14d,%r14d addl %edx,%r11d addl %edi,%edx movl %r11d,%r13d addl %edx,%r14d shrdl $14,%r13d,%r13d movl %r14d,%edx movl %eax,%r12d shrdl $9,%r14d,%r14d xorl %r11d,%r13d xorl %ebx,%r12d shrdl $5,%r13d,%r13d xorl %edx,%r14d andl %r11d,%r12d xorl %r11d,%r13d addl 52(%rsp),%ecx movl %edx,%edi xorl %ebx,%r12d shrdl $11,%r14d,%r14d xorl %r8d,%edi addl %r12d,%ecx shrdl $6,%r13d,%r13d andl %edi,%r15d xorl %edx,%r14d addl %r13d,%ecx xorl %r8d,%r15d shrdl $2,%r14d,%r14d addl %ecx,%r10d addl %r15d,%ecx movl %r10d,%r13d addl %ecx,%r14d shrdl $14,%r13d,%r13d movl %r14d,%ecx movl %r11d,%r12d shrdl $9,%r14d,%r14d xorl %r10d,%r13d xorl %eax,%r12d shrdl $5,%r13d,%r13d xorl %ecx,%r14d andl %r10d,%r12d xorl %r10d,%r13d addl 56(%rsp),%ebx movl %ecx,%r15d xorl %eax,%r12d shrdl $11,%r14d,%r14d xorl %edx,%r15d addl %r12d,%ebx shrdl $6,%r13d,%r13d andl %r15d,%edi xorl %ecx,%r14d addl %r13d,%ebx xorl %edx,%edi shrdl $2,%r14d,%r14d addl %ebx,%r9d addl %edi,%ebx movl %r9d,%r13d addl %ebx,%r14d shrdl $14,%r13d,%r13d movl %r14d,%ebx movl %r10d,%r12d shrdl $9,%r14d,%r14d xorl %r9d,%r13d xorl %r11d,%r12d shrdl $5,%r13d,%r13d xorl %ebx,%r14d andl %r9d,%r12d xorl %r9d,%r13d addl 60(%rsp),%eax movl %ebx,%edi xorl %r11d,%r12d shrdl $11,%r14d,%r14d xorl %ecx,%edi addl %r12d,%eax shrdl $6,%r13d,%r13d andl %edi,%r15d xorl %ebx,%r14d addl %r13d,%eax xorl %ecx,%r15d shrdl $2,%r14d,%r14d addl %eax,%r8d addl %r15d,%eax movl %r8d,%r13d addl %eax,%r14d movq 64+0(%rsp),%rdi movl %r14d,%eax addl 0(%rdi),%eax leaq 64(%rsi),%rsi addl 4(%rdi),%ebx addl 8(%rdi),%ecx addl 12(%rdi),%edx addl 16(%rdi),%r8d addl 20(%rdi),%r9d addl 24(%rdi),%r10d addl 28(%rdi),%r11d cmpq 64+16(%rsp),%rsi movl %eax,0(%rdi) movl %ebx,4(%rdi) movl %ecx,8(%rdi) movl %edx,12(%rdi) movl %r8d,16(%rdi) movl %r9d,20(%rdi) movl %r10d,24(%rdi) movl %r11d,28(%rdi) jb .Lloop_avx movq 88(%rsp),%rsi .cfi_def_cfa %rsi,8 vzeroupper movq -48(%rsi),%r15 .cfi_restore %r15 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbp .cfi_restore %rbp movq -8(%rsi),%rbx .cfi_restore %rbx leaq 
(%rsi),%rsp .cfi_def_cfa_register %rsp .Lepilogue_avx: ret .cfi_endproc .size sha256_block_data_order_avx,.-sha256_block_data_order_avx #endif ring-0.17.14/pregenerated/sha256-x86_64-macosx.S000064400000000000000000002070341046102023000167700ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) .text .globl _sha256_block_data_order_nohw .private_extern _sha256_block_data_order_nohw .p2align 4 _sha256_block_data_order_nohw: _CET_ENDBR movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 shlq $4,%rdx subq $64+32,%rsp leaq (%rsi,%rdx,4),%rdx andq $-64,%rsp movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) movq %rax,88(%rsp) L$prologue: movl 0(%rdi),%eax movl 4(%rdi),%ebx movl 8(%rdi),%ecx movl 12(%rdi),%edx movl 16(%rdi),%r8d movl 20(%rdi),%r9d movl 24(%rdi),%r10d movl 28(%rdi),%r11d jmp L$loop .p2align 4 L$loop: movl %ebx,%edi leaq K256(%rip),%rbp xorl %ecx,%edi movl 0(%rsi),%r12d movl %r8d,%r13d movl %eax,%r14d bswapl %r12d rorl $14,%r13d movl %r9d,%r15d xorl %r8d,%r13d rorl $9,%r14d xorl %r10d,%r15d movl %r12d,0(%rsp) xorl %eax,%r14d andl %r8d,%r15d rorl $5,%r13d addl %r11d,%r12d xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d addl %r15d,%r12d movl %eax,%r15d addl (%rbp),%r12d xorl %eax,%r14d xorl %ebx,%r15d rorl $6,%r13d movl %ebx,%r11d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d leaq 4(%rbp),%rbp addl %r14d,%r11d movl 4(%rsi),%r12d movl %edx,%r13d movl %r11d,%r14d bswapl %r12d rorl $14,%r13d movl %r8d,%edi xorl %edx,%r13d rorl $9,%r14d xorl %r9d,%edi movl %r12d,4(%rsp) xorl %r11d,%r14d andl %edx,%edi rorl $5,%r13d addl %r10d,%r12d xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d addl %edi,%r12d movl %r11d,%edi addl (%rbp),%r12d xorl %r11d,%r14d xorl %eax,%edi rorl $6,%r13d movl %eax,%r10d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d leaq 4(%rbp),%rbp addl %r14d,%r10d movl 8(%rsi),%r12d movl %ecx,%r13d movl %r10d,%r14d bswapl %r12d rorl $14,%r13d movl %edx,%r15d xorl %ecx,%r13d rorl $9,%r14d xorl %r8d,%r15d movl %r12d,8(%rsp) xorl %r10d,%r14d andl %ecx,%r15d rorl $5,%r13d addl %r9d,%r12d xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r15d addl (%rbp),%r12d xorl %r10d,%r14d xorl %r11d,%r15d rorl $6,%r13d movl %r11d,%r9d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d leaq 4(%rbp),%rbp addl %r14d,%r9d movl 12(%rsi),%r12d movl %ebx,%r13d movl %r9d,%r14d bswapl %r12d rorl $14,%r13d movl %ecx,%edi xorl %ebx,%r13d rorl $9,%r14d xorl %edx,%edi movl %r12d,12(%rsp) xorl %r9d,%r14d andl %ebx,%edi rorl $5,%r13d addl %r8d,%r12d xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d addl %edi,%r12d movl %r9d,%edi addl (%rbp),%r12d xorl %r9d,%r14d xorl %r10d,%edi rorl $6,%r13d movl %r10d,%r8d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d leaq 20(%rbp),%rbp addl %r14d,%r8d movl 16(%rsi),%r12d movl %eax,%r13d movl %r8d,%r14d bswapl %r12d rorl $14,%r13d movl %ebx,%r15d xorl %eax,%r13d rorl $9,%r14d xorl %ecx,%r15d movl %r12d,16(%rsp) xorl %r8d,%r14d andl %eax,%r15d rorl $5,%r13d addl %edx,%r12d xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d addl %r15d,%r12d movl %r8d,%r15d addl (%rbp),%r12d xorl %r8d,%r14d xorl %r9d,%r15d rorl $6,%r13d movl %r9d,%edx andl %r15d,%edi rorl $2,%r14d addl 
%r13d,%r12d xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx leaq 4(%rbp),%rbp addl %r14d,%edx movl 20(%rsi),%r12d movl %r11d,%r13d movl %edx,%r14d bswapl %r12d rorl $14,%r13d movl %eax,%edi xorl %r11d,%r13d rorl $9,%r14d xorl %ebx,%edi movl %r12d,20(%rsp) xorl %edx,%r14d andl %r11d,%edi rorl $5,%r13d addl %ecx,%r12d xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d addl %edi,%r12d movl %edx,%edi addl (%rbp),%r12d xorl %edx,%r14d xorl %r8d,%edi rorl $6,%r13d movl %r8d,%ecx andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx leaq 4(%rbp),%rbp addl %r14d,%ecx movl 24(%rsi),%r12d movl %r10d,%r13d movl %ecx,%r14d bswapl %r12d rorl $14,%r13d movl %r11d,%r15d xorl %r10d,%r13d rorl $9,%r14d xorl %eax,%r15d movl %r12d,24(%rsp) xorl %ecx,%r14d andl %r10d,%r15d rorl $5,%r13d addl %ebx,%r12d xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r15d addl (%rbp),%r12d xorl %ecx,%r14d xorl %edx,%r15d rorl $6,%r13d movl %edx,%ebx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx leaq 4(%rbp),%rbp addl %r14d,%ebx movl 28(%rsi),%r12d movl %r9d,%r13d movl %ebx,%r14d bswapl %r12d rorl $14,%r13d movl %r10d,%edi xorl %r9d,%r13d rorl $9,%r14d xorl %r11d,%edi movl %r12d,28(%rsp) xorl %ebx,%r14d andl %r9d,%edi rorl $5,%r13d addl %eax,%r12d xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d addl %edi,%r12d movl %ebx,%edi addl (%rbp),%r12d xorl %ebx,%r14d xorl %ecx,%edi rorl $6,%r13d movl %ecx,%eax andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax leaq 20(%rbp),%rbp addl %r14d,%eax movl 32(%rsi),%r12d movl %r8d,%r13d movl %eax,%r14d bswapl %r12d rorl $14,%r13d movl %r9d,%r15d xorl %r8d,%r13d rorl $9,%r14d xorl %r10d,%r15d movl %r12d,32(%rsp) xorl %eax,%r14d andl %r8d,%r15d rorl $5,%r13d addl %r11d,%r12d xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d addl %r15d,%r12d movl %eax,%r15d addl (%rbp),%r12d xorl %eax,%r14d xorl %ebx,%r15d rorl $6,%r13d movl %ebx,%r11d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d leaq 4(%rbp),%rbp addl %r14d,%r11d movl 36(%rsi),%r12d movl %edx,%r13d movl %r11d,%r14d bswapl %r12d rorl $14,%r13d movl %r8d,%edi xorl %edx,%r13d rorl $9,%r14d xorl %r9d,%edi movl %r12d,36(%rsp) xorl %r11d,%r14d andl %edx,%edi rorl $5,%r13d addl %r10d,%r12d xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d addl %edi,%r12d movl %r11d,%edi addl (%rbp),%r12d xorl %r11d,%r14d xorl %eax,%edi rorl $6,%r13d movl %eax,%r10d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d leaq 4(%rbp),%rbp addl %r14d,%r10d movl 40(%rsi),%r12d movl %ecx,%r13d movl %r10d,%r14d bswapl %r12d rorl $14,%r13d movl %edx,%r15d xorl %ecx,%r13d rorl $9,%r14d xorl %r8d,%r15d movl %r12d,40(%rsp) xorl %r10d,%r14d andl %ecx,%r15d rorl $5,%r13d addl %r9d,%r12d xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r15d addl (%rbp),%r12d xorl %r10d,%r14d xorl %r11d,%r15d rorl $6,%r13d movl %r11d,%r9d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d leaq 4(%rbp),%rbp addl %r14d,%r9d movl 44(%rsi),%r12d movl %ebx,%r13d movl %r9d,%r14d bswapl %r12d rorl $14,%r13d movl %ecx,%edi xorl %ebx,%r13d rorl $9,%r14d xorl %edx,%edi movl %r12d,44(%rsp) xorl %r9d,%r14d andl %ebx,%edi rorl $5,%r13d addl %r8d,%r12d xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d addl %edi,%r12d movl %r9d,%edi addl (%rbp),%r12d xorl %r9d,%r14d xorl %r10d,%edi rorl $6,%r13d movl 
%r10d,%r8d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d leaq 20(%rbp),%rbp addl %r14d,%r8d movl 48(%rsi),%r12d movl %eax,%r13d movl %r8d,%r14d bswapl %r12d rorl $14,%r13d movl %ebx,%r15d xorl %eax,%r13d rorl $9,%r14d xorl %ecx,%r15d movl %r12d,48(%rsp) xorl %r8d,%r14d andl %eax,%r15d rorl $5,%r13d addl %edx,%r12d xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d addl %r15d,%r12d movl %r8d,%r15d addl (%rbp),%r12d xorl %r8d,%r14d xorl %r9d,%r15d rorl $6,%r13d movl %r9d,%edx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx leaq 4(%rbp),%rbp addl %r14d,%edx movl 52(%rsi),%r12d movl %r11d,%r13d movl %edx,%r14d bswapl %r12d rorl $14,%r13d movl %eax,%edi xorl %r11d,%r13d rorl $9,%r14d xorl %ebx,%edi movl %r12d,52(%rsp) xorl %edx,%r14d andl %r11d,%edi rorl $5,%r13d addl %ecx,%r12d xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d addl %edi,%r12d movl %edx,%edi addl (%rbp),%r12d xorl %edx,%r14d xorl %r8d,%edi rorl $6,%r13d movl %r8d,%ecx andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx leaq 4(%rbp),%rbp addl %r14d,%ecx movl 56(%rsi),%r12d movl %r10d,%r13d movl %ecx,%r14d bswapl %r12d rorl $14,%r13d movl %r11d,%r15d xorl %r10d,%r13d rorl $9,%r14d xorl %eax,%r15d movl %r12d,56(%rsp) xorl %ecx,%r14d andl %r10d,%r15d rorl $5,%r13d addl %ebx,%r12d xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r15d addl (%rbp),%r12d xorl %ecx,%r14d xorl %edx,%r15d rorl $6,%r13d movl %edx,%ebx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx leaq 4(%rbp),%rbp addl %r14d,%ebx movl 60(%rsi),%r12d movl %r9d,%r13d movl %ebx,%r14d bswapl %r12d rorl $14,%r13d movl %r10d,%edi xorl %r9d,%r13d rorl $9,%r14d xorl %r11d,%edi movl %r12d,60(%rsp) xorl %ebx,%r14d andl %r9d,%edi rorl $5,%r13d addl %eax,%r12d xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d addl %edi,%r12d movl %ebx,%edi addl (%rbp),%r12d xorl %ebx,%r14d xorl %ecx,%edi rorl $6,%r13d movl %ecx,%eax andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax leaq 20(%rbp),%rbp jmp L$rounds_16_xx .p2align 4 L$rounds_16_xx: movl 4(%rsp),%r13d movl 56(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%eax movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 36(%rsp),%r12d addl 0(%rsp),%r12d movl %r8d,%r13d addl %r15d,%r12d movl %eax,%r14d rorl $14,%r13d movl %r9d,%r15d xorl %r8d,%r13d rorl $9,%r14d xorl %r10d,%r15d movl %r12d,0(%rsp) xorl %eax,%r14d andl %r8d,%r15d rorl $5,%r13d addl %r11d,%r12d xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d addl %r15d,%r12d movl %eax,%r15d addl (%rbp),%r12d xorl %eax,%r14d xorl %ebx,%r15d rorl $6,%r13d movl %ebx,%r11d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d leaq 4(%rbp),%rbp movl 8(%rsp),%r13d movl 60(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r11d movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 40(%rsp),%r12d addl 4(%rsp),%r12d movl %edx,%r13d addl %edi,%r12d movl %r11d,%r14d rorl $14,%r13d movl %r8d,%edi xorl %edx,%r13d rorl $9,%r14d xorl %r9d,%edi movl %r12d,4(%rsp) xorl %r11d,%r14d andl %edx,%edi rorl $5,%r13d addl %r10d,%r12d xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d addl %edi,%r12d movl %r11d,%edi addl 
(%rbp),%r12d xorl %r11d,%r14d xorl %eax,%edi rorl $6,%r13d movl %eax,%r10d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d leaq 4(%rbp),%rbp movl 12(%rsp),%r13d movl 0(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r10d movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 44(%rsp),%r12d addl 8(%rsp),%r12d movl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r14d rorl $14,%r13d movl %edx,%r15d xorl %ecx,%r13d rorl $9,%r14d xorl %r8d,%r15d movl %r12d,8(%rsp) xorl %r10d,%r14d andl %ecx,%r15d rorl $5,%r13d addl %r9d,%r12d xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r15d addl (%rbp),%r12d xorl %r10d,%r14d xorl %r11d,%r15d rorl $6,%r13d movl %r11d,%r9d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d leaq 4(%rbp),%rbp movl 16(%rsp),%r13d movl 4(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r9d movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 48(%rsp),%r12d addl 12(%rsp),%r12d movl %ebx,%r13d addl %edi,%r12d movl %r9d,%r14d rorl $14,%r13d movl %ecx,%edi xorl %ebx,%r13d rorl $9,%r14d xorl %edx,%edi movl %r12d,12(%rsp) xorl %r9d,%r14d andl %ebx,%edi rorl $5,%r13d addl %r8d,%r12d xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d addl %edi,%r12d movl %r9d,%edi addl (%rbp),%r12d xorl %r9d,%r14d xorl %r10d,%edi rorl $6,%r13d movl %r10d,%r8d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d leaq 20(%rbp),%rbp movl 20(%rsp),%r13d movl 8(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r8d movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 52(%rsp),%r12d addl 16(%rsp),%r12d movl %eax,%r13d addl %r15d,%r12d movl %r8d,%r14d rorl $14,%r13d movl %ebx,%r15d xorl %eax,%r13d rorl $9,%r14d xorl %ecx,%r15d movl %r12d,16(%rsp) xorl %r8d,%r14d andl %eax,%r15d rorl $5,%r13d addl %edx,%r12d xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d addl %r15d,%r12d movl %r8d,%r15d addl (%rbp),%r12d xorl %r8d,%r14d xorl %r9d,%r15d rorl $6,%r13d movl %r9d,%edx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx leaq 4(%rbp),%rbp movl 24(%rsp),%r13d movl 12(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%edx movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 56(%rsp),%r12d addl 20(%rsp),%r12d movl %r11d,%r13d addl %edi,%r12d movl %edx,%r14d rorl $14,%r13d movl %eax,%edi xorl %r11d,%r13d rorl $9,%r14d xorl %ebx,%edi movl %r12d,20(%rsp) xorl %edx,%r14d andl %r11d,%edi rorl $5,%r13d addl %ecx,%r12d xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d addl %edi,%r12d movl %edx,%edi addl (%rbp),%r12d xorl %edx,%r14d xorl %r8d,%edi rorl $6,%r13d movl %r8d,%ecx andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx leaq 4(%rbp),%rbp movl 28(%rsp),%r13d movl 16(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%ecx movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 60(%rsp),%r12d addl 24(%rsp),%r12d movl %r10d,%r13d addl %r15d,%r12d movl 
%ecx,%r14d rorl $14,%r13d movl %r11d,%r15d xorl %r10d,%r13d rorl $9,%r14d xorl %eax,%r15d movl %r12d,24(%rsp) xorl %ecx,%r14d andl %r10d,%r15d rorl $5,%r13d addl %ebx,%r12d xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r15d addl (%rbp),%r12d xorl %ecx,%r14d xorl %edx,%r15d rorl $6,%r13d movl %edx,%ebx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx leaq 4(%rbp),%rbp movl 32(%rsp),%r13d movl 20(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%ebx movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 0(%rsp),%r12d addl 28(%rsp),%r12d movl %r9d,%r13d addl %edi,%r12d movl %ebx,%r14d rorl $14,%r13d movl %r10d,%edi xorl %r9d,%r13d rorl $9,%r14d xorl %r11d,%edi movl %r12d,28(%rsp) xorl %ebx,%r14d andl %r9d,%edi rorl $5,%r13d addl %eax,%r12d xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d addl %edi,%r12d movl %ebx,%edi addl (%rbp),%r12d xorl %ebx,%r14d xorl %ecx,%edi rorl $6,%r13d movl %ecx,%eax andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax leaq 20(%rbp),%rbp movl 36(%rsp),%r13d movl 24(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%eax movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 4(%rsp),%r12d addl 32(%rsp),%r12d movl %r8d,%r13d addl %r15d,%r12d movl %eax,%r14d rorl $14,%r13d movl %r9d,%r15d xorl %r8d,%r13d rorl $9,%r14d xorl %r10d,%r15d movl %r12d,32(%rsp) xorl %eax,%r14d andl %r8d,%r15d rorl $5,%r13d addl %r11d,%r12d xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d addl %r15d,%r12d movl %eax,%r15d addl (%rbp),%r12d xorl %eax,%r14d xorl %ebx,%r15d rorl $6,%r13d movl %ebx,%r11d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d leaq 4(%rbp),%rbp movl 40(%rsp),%r13d movl 28(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r11d movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 8(%rsp),%r12d addl 36(%rsp),%r12d movl %edx,%r13d addl %edi,%r12d movl %r11d,%r14d rorl $14,%r13d movl %r8d,%edi xorl %edx,%r13d rorl $9,%r14d xorl %r9d,%edi movl %r12d,36(%rsp) xorl %r11d,%r14d andl %edx,%edi rorl $5,%r13d addl %r10d,%r12d xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d addl %edi,%r12d movl %r11d,%edi addl (%rbp),%r12d xorl %r11d,%r14d xorl %eax,%edi rorl $6,%r13d movl %eax,%r10d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d leaq 4(%rbp),%rbp movl 44(%rsp),%r13d movl 32(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r10d movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 12(%rsp),%r12d addl 40(%rsp),%r12d movl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r14d rorl $14,%r13d movl %edx,%r15d xorl %ecx,%r13d rorl $9,%r14d xorl %r8d,%r15d movl %r12d,40(%rsp) xorl %r10d,%r14d andl %ecx,%r15d rorl $5,%r13d addl %r9d,%r12d xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r15d addl (%rbp),%r12d xorl %r10d,%r14d xorl %r11d,%r15d rorl $6,%r13d movl %r11d,%r9d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d leaq 4(%rbp),%rbp movl 48(%rsp),%r13d movl 36(%rsp),%edi movl %r13d,%r12d 
rorl $11,%r13d addl %r14d,%r9d movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 16(%rsp),%r12d addl 44(%rsp),%r12d movl %ebx,%r13d addl %edi,%r12d movl %r9d,%r14d rorl $14,%r13d movl %ecx,%edi xorl %ebx,%r13d rorl $9,%r14d xorl %edx,%edi movl %r12d,44(%rsp) xorl %r9d,%r14d andl %ebx,%edi rorl $5,%r13d addl %r8d,%r12d xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d addl %edi,%r12d movl %r9d,%edi addl (%rbp),%r12d xorl %r9d,%r14d xorl %r10d,%edi rorl $6,%r13d movl %r10d,%r8d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d leaq 20(%rbp),%rbp movl 52(%rsp),%r13d movl 40(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r8d movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 20(%rsp),%r12d addl 48(%rsp),%r12d movl %eax,%r13d addl %r15d,%r12d movl %r8d,%r14d rorl $14,%r13d movl %ebx,%r15d xorl %eax,%r13d rorl $9,%r14d xorl %ecx,%r15d movl %r12d,48(%rsp) xorl %r8d,%r14d andl %eax,%r15d rorl $5,%r13d addl %edx,%r12d xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d addl %r15d,%r12d movl %r8d,%r15d addl (%rbp),%r12d xorl %r8d,%r14d xorl %r9d,%r15d rorl $6,%r13d movl %r9d,%edx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx leaq 4(%rbp),%rbp movl 56(%rsp),%r13d movl 44(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%edx movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 24(%rsp),%r12d addl 52(%rsp),%r12d movl %r11d,%r13d addl %edi,%r12d movl %edx,%r14d rorl $14,%r13d movl %eax,%edi xorl %r11d,%r13d rorl $9,%r14d xorl %ebx,%edi movl %r12d,52(%rsp) xorl %edx,%r14d andl %r11d,%edi rorl $5,%r13d addl %ecx,%r12d xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d addl %edi,%r12d movl %edx,%edi addl (%rbp),%r12d xorl %edx,%r14d xorl %r8d,%edi rorl $6,%r13d movl %r8d,%ecx andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx leaq 4(%rbp),%rbp movl 60(%rsp),%r13d movl 48(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%ecx movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 28(%rsp),%r12d addl 56(%rsp),%r12d movl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r14d rorl $14,%r13d movl %r11d,%r15d xorl %r10d,%r13d rorl $9,%r14d xorl %eax,%r15d movl %r12d,56(%rsp) xorl %ecx,%r14d andl %r10d,%r15d rorl $5,%r13d addl %ebx,%r12d xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r15d addl (%rbp),%r12d xorl %ecx,%r14d xorl %edx,%r15d rorl $6,%r13d movl %edx,%ebx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx leaq 4(%rbp),%rbp movl 0(%rsp),%r13d movl 52(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%ebx movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 32(%rsp),%r12d addl 60(%rsp),%r12d movl %r9d,%r13d addl %edi,%r12d movl %ebx,%r14d rorl $14,%r13d movl %r10d,%edi xorl %r9d,%r13d rorl $9,%r14d xorl %r11d,%edi movl %r12d,60(%rsp) xorl %ebx,%r14d andl %r9d,%edi rorl $5,%r13d addl %eax,%r12d xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d addl %edi,%r12d movl %ebx,%edi addl 
(%rbp),%r12d xorl %ebx,%r14d xorl %ecx,%edi rorl $6,%r13d movl %ecx,%eax andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax leaq 20(%rbp),%rbp cmpb $0,3(%rbp) jnz L$rounds_16_xx movq 64+0(%rsp),%rdi addl %r14d,%eax leaq 64(%rsi),%rsi addl 0(%rdi),%eax addl 4(%rdi),%ebx addl 8(%rdi),%ecx addl 12(%rdi),%edx addl 16(%rdi),%r8d addl 20(%rdi),%r9d addl 24(%rdi),%r10d addl 28(%rdi),%r11d cmpq 64+16(%rsp),%rsi movl %eax,0(%rdi) movl %ebx,4(%rdi) movl %ecx,8(%rdi) movl %edx,12(%rdi) movl %r8d,16(%rdi) movl %r9d,20(%rdi) movl %r10d,24(%rdi) movl %r11d,28(%rdi) jb L$loop movq 88(%rsp),%rsi movq -48(%rsi),%r15 movq -40(%rsi),%r14 movq -32(%rsi),%r13 movq -24(%rsi),%r12 movq -16(%rsi),%rbp movq -8(%rsi),%rbx leaq (%rsi),%rsp L$epilogue: ret .section __DATA,__const .p2align 6 K256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .text .globl _sha256_block_data_order_hw .private_extern _sha256_block_data_order_hw .p2align 6 _sha256_block_data_order_hw: _CET_ENDBR leaq K256+128(%rip),%rcx movdqu (%rdi),%xmm1 movdqu 16(%rdi),%xmm2 movdqa 512-128(%rcx),%xmm7 pshufd $0x1b,%xmm1,%xmm0 pshufd $0xb1,%xmm1,%xmm1 pshufd $0x1b,%xmm2,%xmm2 movdqa %xmm7,%xmm8 .byte 102,15,58,15,202,8 punpcklqdq %xmm0,%xmm2 jmp L$oop_shaext .p2align 4 L$oop_shaext: movdqu (%rsi),%xmm3 movdqu 16(%rsi),%xmm4 movdqu 32(%rsi),%xmm5 .byte 102,15,56,0,223 movdqu 48(%rsi),%xmm6 movdqa 0-128(%rcx),%xmm0 paddd %xmm3,%xmm0 .byte 102,15,56,0,231 
movdqa %xmm2,%xmm10 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 nop movdqa %xmm1,%xmm9 .byte 15,56,203,202 movdqa 32-128(%rcx),%xmm0 paddd %xmm4,%xmm0 .byte 102,15,56,0,239 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 leaq 64(%rsi),%rsi .byte 15,56,204,220 .byte 15,56,203,202 movdqa 64-128(%rcx),%xmm0 paddd %xmm5,%xmm0 .byte 102,15,56,0,247 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop paddd %xmm7,%xmm3 .byte 15,56,204,229 .byte 15,56,203,202 movdqa 96-128(%rcx),%xmm0 paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop paddd %xmm7,%xmm4 .byte 15,56,204,238 .byte 15,56,203,202 movdqa 128-128(%rcx),%xmm0 paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop paddd %xmm7,%xmm5 .byte 15,56,204,243 .byte 15,56,203,202 movdqa 160-128(%rcx),%xmm0 paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 nop paddd %xmm7,%xmm6 .byte 15,56,204,220 .byte 15,56,203,202 movdqa 192-128(%rcx),%xmm0 paddd %xmm5,%xmm0 .byte 15,56,205,245 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop paddd %xmm7,%xmm3 .byte 15,56,204,229 .byte 15,56,203,202 movdqa 224-128(%rcx),%xmm0 paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop paddd %xmm7,%xmm4 .byte 15,56,204,238 .byte 15,56,203,202 movdqa 256-128(%rcx),%xmm0 paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop paddd %xmm7,%xmm5 .byte 15,56,204,243 .byte 15,56,203,202 movdqa 288-128(%rcx),%xmm0 paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 nop paddd %xmm7,%xmm6 .byte 15,56,204,220 .byte 15,56,203,202 movdqa 320-128(%rcx),%xmm0 paddd %xmm5,%xmm0 .byte 15,56,205,245 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop paddd %xmm7,%xmm3 .byte 15,56,204,229 .byte 15,56,203,202 movdqa 352-128(%rcx),%xmm0 paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop paddd %xmm7,%xmm4 .byte 15,56,204,238 .byte 15,56,203,202 movdqa 384-128(%rcx),%xmm0 paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop paddd %xmm7,%xmm5 .byte 15,56,204,243 .byte 15,56,203,202 movdqa 416-128(%rcx),%xmm0 paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 .byte 15,56,203,202 paddd %xmm7,%xmm6 movdqa 448-128(%rcx),%xmm0 paddd %xmm5,%xmm0 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 .byte 15,56,205,245 movdqa %xmm8,%xmm7 .byte 15,56,203,202 movdqa 480-128(%rcx),%xmm0 paddd %xmm6,%xmm0 nop .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 decq %rdx nop .byte 15,56,203,202 paddd %xmm10,%xmm2 paddd %xmm9,%xmm1 jnz L$oop_shaext pshufd $0xb1,%xmm2,%xmm2 pshufd $0x1b,%xmm1,%xmm7 pshufd $0xb1,%xmm1,%xmm1 punpckhqdq %xmm2,%xmm1 .byte 102,15,58,15,215,8 movdqu %xmm1,(%rdi) movdqu %xmm2,16(%rdi) ret .globl _sha256_block_data_order_ssse3 .private_extern _sha256_block_data_order_ssse3 .p2align 6 _sha256_block_data_order_ssse3: _CET_ENDBR movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 
pushq %r15 shlq $4,%rdx subq $96,%rsp leaq (%rsi,%rdx,4),%rdx andq $-64,%rsp movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) movq %rax,88(%rsp) L$prologue_ssse3: movl 0(%rdi),%eax movl 4(%rdi),%ebx movl 8(%rdi),%ecx movl 12(%rdi),%edx movl 16(%rdi),%r8d movl 20(%rdi),%r9d movl 24(%rdi),%r10d movl 28(%rdi),%r11d jmp L$loop_ssse3 .p2align 4 L$loop_ssse3: movdqa K256+512(%rip),%xmm7 movdqu 0(%rsi),%xmm0 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 .byte 102,15,56,0,199 movdqu 48(%rsi),%xmm3 leaq K256(%rip),%rbp .byte 102,15,56,0,207 movdqa 0(%rbp),%xmm4 movdqa 32(%rbp),%xmm5 .byte 102,15,56,0,215 paddd %xmm0,%xmm4 movdqa 64(%rbp),%xmm6 .byte 102,15,56,0,223 movdqa 96(%rbp),%xmm7 paddd %xmm1,%xmm5 paddd %xmm2,%xmm6 paddd %xmm3,%xmm7 movdqa %xmm4,0(%rsp) movl %eax,%r14d movdqa %xmm5,16(%rsp) movl %ebx,%edi movdqa %xmm6,32(%rsp) xorl %ecx,%edi movdqa %xmm7,48(%rsp) movl %r8d,%r13d jmp L$ssse3_00_47 .p2align 4 L$ssse3_00_47: subq $-128,%rbp rorl $14,%r13d movdqa %xmm1,%xmm4 movl %r14d,%eax movl %r9d,%r12d movdqa %xmm3,%xmm7 rorl $9,%r14d xorl %r8d,%r13d xorl %r10d,%r12d rorl $5,%r13d xorl %eax,%r14d .byte 102,15,58,15,224,4 andl %r8d,%r12d xorl %r8d,%r13d .byte 102,15,58,15,250,4 addl 0(%rsp),%r11d movl %eax,%r15d xorl %r10d,%r12d rorl $11,%r14d movdqa %xmm4,%xmm5 xorl %ebx,%r15d addl %r12d,%r11d movdqa %xmm4,%xmm6 rorl $6,%r13d andl %r15d,%edi psrld $3,%xmm4 xorl %eax,%r14d addl %r13d,%r11d xorl %ebx,%edi paddd %xmm7,%xmm0 rorl $2,%r14d addl %r11d,%edx psrld $7,%xmm6 addl %edi,%r11d movl %edx,%r13d pshufd $250,%xmm3,%xmm7 addl %r11d,%r14d rorl $14,%r13d pslld $14,%xmm5 movl %r14d,%r11d movl %r8d,%r12d pxor %xmm6,%xmm4 rorl $9,%r14d xorl %edx,%r13d xorl %r9d,%r12d rorl $5,%r13d psrld $11,%xmm6 xorl %r11d,%r14d pxor %xmm5,%xmm4 andl %edx,%r12d xorl %edx,%r13d pslld $11,%xmm5 addl 4(%rsp),%r10d movl %r11d,%edi pxor %xmm6,%xmm4 xorl %r9d,%r12d rorl $11,%r14d movdqa %xmm7,%xmm6 xorl %eax,%edi addl %r12d,%r10d pxor %xmm5,%xmm4 rorl $6,%r13d andl %edi,%r15d xorl %r11d,%r14d psrld $10,%xmm7 addl %r13d,%r10d xorl %eax,%r15d paddd %xmm4,%xmm0 rorl $2,%r14d addl %r10d,%ecx psrlq $17,%xmm6 addl %r15d,%r10d movl %ecx,%r13d addl %r10d,%r14d pxor %xmm6,%xmm7 rorl $14,%r13d movl %r14d,%r10d movl %edx,%r12d rorl $9,%r14d psrlq $2,%xmm6 xorl %ecx,%r13d xorl %r8d,%r12d pxor %xmm6,%xmm7 rorl $5,%r13d xorl %r10d,%r14d andl %ecx,%r12d pshufd $128,%xmm7,%xmm7 xorl %ecx,%r13d addl 8(%rsp),%r9d movl %r10d,%r15d psrldq $8,%xmm7 xorl %r8d,%r12d rorl $11,%r14d xorl %r11d,%r15d addl %r12d,%r9d rorl $6,%r13d paddd %xmm7,%xmm0 andl %r15d,%edi xorl %r10d,%r14d addl %r13d,%r9d pshufd $80,%xmm0,%xmm7 xorl %r11d,%edi rorl $2,%r14d addl %r9d,%ebx movdqa %xmm7,%xmm6 addl %edi,%r9d movl %ebx,%r13d psrld $10,%xmm7 addl %r9d,%r14d rorl $14,%r13d psrlq $17,%xmm6 movl %r14d,%r9d movl %ecx,%r12d pxor %xmm6,%xmm7 rorl $9,%r14d xorl %ebx,%r13d xorl %edx,%r12d rorl $5,%r13d xorl %r9d,%r14d psrlq $2,%xmm6 andl %ebx,%r12d xorl %ebx,%r13d addl 12(%rsp),%r8d pxor %xmm6,%xmm7 movl %r9d,%edi xorl %edx,%r12d rorl $11,%r14d pshufd $8,%xmm7,%xmm7 xorl %r10d,%edi addl %r12d,%r8d movdqa 0(%rbp),%xmm6 rorl $6,%r13d andl %edi,%r15d pslldq $8,%xmm7 xorl %r9d,%r14d addl %r13d,%r8d xorl %r10d,%r15d paddd %xmm7,%xmm0 rorl $2,%r14d addl %r8d,%eax addl %r15d,%r8d paddd %xmm0,%xmm6 movl %eax,%r13d addl %r8d,%r14d movdqa %xmm6,0(%rsp) rorl $14,%r13d movdqa %xmm2,%xmm4 movl %r14d,%r8d movl %ebx,%r12d movdqa %xmm0,%xmm7 rorl $9,%r14d xorl %eax,%r13d xorl %ecx,%r12d rorl $5,%r13d xorl %r8d,%r14d .byte 102,15,58,15,225,4 andl %eax,%r12d xorl 
%eax,%r13d .byte 102,15,58,15,251,4 addl 16(%rsp),%edx movl %r8d,%r15d xorl %ecx,%r12d rorl $11,%r14d movdqa %xmm4,%xmm5 xorl %r9d,%r15d addl %r12d,%edx movdqa %xmm4,%xmm6 rorl $6,%r13d andl %r15d,%edi psrld $3,%xmm4 xorl %r8d,%r14d addl %r13d,%edx xorl %r9d,%edi paddd %xmm7,%xmm1 rorl $2,%r14d addl %edx,%r11d psrld $7,%xmm6 addl %edi,%edx movl %r11d,%r13d pshufd $250,%xmm0,%xmm7 addl %edx,%r14d rorl $14,%r13d pslld $14,%xmm5 movl %r14d,%edx movl %eax,%r12d pxor %xmm6,%xmm4 rorl $9,%r14d xorl %r11d,%r13d xorl %ebx,%r12d rorl $5,%r13d psrld $11,%xmm6 xorl %edx,%r14d pxor %xmm5,%xmm4 andl %r11d,%r12d xorl %r11d,%r13d pslld $11,%xmm5 addl 20(%rsp),%ecx movl %edx,%edi pxor %xmm6,%xmm4 xorl %ebx,%r12d rorl $11,%r14d movdqa %xmm7,%xmm6 xorl %r8d,%edi addl %r12d,%ecx pxor %xmm5,%xmm4 rorl $6,%r13d andl %edi,%r15d xorl %edx,%r14d psrld $10,%xmm7 addl %r13d,%ecx xorl %r8d,%r15d paddd %xmm4,%xmm1 rorl $2,%r14d addl %ecx,%r10d psrlq $17,%xmm6 addl %r15d,%ecx movl %r10d,%r13d addl %ecx,%r14d pxor %xmm6,%xmm7 rorl $14,%r13d movl %r14d,%ecx movl %r11d,%r12d rorl $9,%r14d psrlq $2,%xmm6 xorl %r10d,%r13d xorl %eax,%r12d pxor %xmm6,%xmm7 rorl $5,%r13d xorl %ecx,%r14d andl %r10d,%r12d pshufd $128,%xmm7,%xmm7 xorl %r10d,%r13d addl 24(%rsp),%ebx movl %ecx,%r15d psrldq $8,%xmm7 xorl %eax,%r12d rorl $11,%r14d xorl %edx,%r15d addl %r12d,%ebx rorl $6,%r13d paddd %xmm7,%xmm1 andl %r15d,%edi xorl %ecx,%r14d addl %r13d,%ebx pshufd $80,%xmm1,%xmm7 xorl %edx,%edi rorl $2,%r14d addl %ebx,%r9d movdqa %xmm7,%xmm6 addl %edi,%ebx movl %r9d,%r13d psrld $10,%xmm7 addl %ebx,%r14d rorl $14,%r13d psrlq $17,%xmm6 movl %r14d,%ebx movl %r10d,%r12d pxor %xmm6,%xmm7 rorl $9,%r14d xorl %r9d,%r13d xorl %r11d,%r12d rorl $5,%r13d xorl %ebx,%r14d psrlq $2,%xmm6 andl %r9d,%r12d xorl %r9d,%r13d addl 28(%rsp),%eax pxor %xmm6,%xmm7 movl %ebx,%edi xorl %r11d,%r12d rorl $11,%r14d pshufd $8,%xmm7,%xmm7 xorl %ecx,%edi addl %r12d,%eax movdqa 32(%rbp),%xmm6 rorl $6,%r13d andl %edi,%r15d pslldq $8,%xmm7 xorl %ebx,%r14d addl %r13d,%eax xorl %ecx,%r15d paddd %xmm7,%xmm1 rorl $2,%r14d addl %eax,%r8d addl %r15d,%eax paddd %xmm1,%xmm6 movl %r8d,%r13d addl %eax,%r14d movdqa %xmm6,16(%rsp) rorl $14,%r13d movdqa %xmm3,%xmm4 movl %r14d,%eax movl %r9d,%r12d movdqa %xmm1,%xmm7 rorl $9,%r14d xorl %r8d,%r13d xorl %r10d,%r12d rorl $5,%r13d xorl %eax,%r14d .byte 102,15,58,15,226,4 andl %r8d,%r12d xorl %r8d,%r13d .byte 102,15,58,15,248,4 addl 32(%rsp),%r11d movl %eax,%r15d xorl %r10d,%r12d rorl $11,%r14d movdqa %xmm4,%xmm5 xorl %ebx,%r15d addl %r12d,%r11d movdqa %xmm4,%xmm6 rorl $6,%r13d andl %r15d,%edi psrld $3,%xmm4 xorl %eax,%r14d addl %r13d,%r11d xorl %ebx,%edi paddd %xmm7,%xmm2 rorl $2,%r14d addl %r11d,%edx psrld $7,%xmm6 addl %edi,%r11d movl %edx,%r13d pshufd $250,%xmm1,%xmm7 addl %r11d,%r14d rorl $14,%r13d pslld $14,%xmm5 movl %r14d,%r11d movl %r8d,%r12d pxor %xmm6,%xmm4 rorl $9,%r14d xorl %edx,%r13d xorl %r9d,%r12d rorl $5,%r13d psrld $11,%xmm6 xorl %r11d,%r14d pxor %xmm5,%xmm4 andl %edx,%r12d xorl %edx,%r13d pslld $11,%xmm5 addl 36(%rsp),%r10d movl %r11d,%edi pxor %xmm6,%xmm4 xorl %r9d,%r12d rorl $11,%r14d movdqa %xmm7,%xmm6 xorl %eax,%edi addl %r12d,%r10d pxor %xmm5,%xmm4 rorl $6,%r13d andl %edi,%r15d xorl %r11d,%r14d psrld $10,%xmm7 addl %r13d,%r10d xorl %eax,%r15d paddd %xmm4,%xmm2 rorl $2,%r14d addl %r10d,%ecx psrlq $17,%xmm6 addl %r15d,%r10d movl %ecx,%r13d addl %r10d,%r14d pxor %xmm6,%xmm7 rorl $14,%r13d movl %r14d,%r10d movl %edx,%r12d rorl $9,%r14d psrlq $2,%xmm6 xorl %ecx,%r13d xorl %r8d,%r12d pxor %xmm6,%xmm7 rorl $5,%r13d xorl %r10d,%r14d andl 
%ecx,%r12d pshufd $128,%xmm7,%xmm7 xorl %ecx,%r13d addl 40(%rsp),%r9d movl %r10d,%r15d psrldq $8,%xmm7 xorl %r8d,%r12d rorl $11,%r14d xorl %r11d,%r15d addl %r12d,%r9d rorl $6,%r13d paddd %xmm7,%xmm2 andl %r15d,%edi xorl %r10d,%r14d addl %r13d,%r9d pshufd $80,%xmm2,%xmm7 xorl %r11d,%edi rorl $2,%r14d addl %r9d,%ebx movdqa %xmm7,%xmm6 addl %edi,%r9d movl %ebx,%r13d psrld $10,%xmm7 addl %r9d,%r14d rorl $14,%r13d psrlq $17,%xmm6 movl %r14d,%r9d movl %ecx,%r12d pxor %xmm6,%xmm7 rorl $9,%r14d xorl %ebx,%r13d xorl %edx,%r12d rorl $5,%r13d xorl %r9d,%r14d psrlq $2,%xmm6 andl %ebx,%r12d xorl %ebx,%r13d addl 44(%rsp),%r8d pxor %xmm6,%xmm7 movl %r9d,%edi xorl %edx,%r12d rorl $11,%r14d pshufd $8,%xmm7,%xmm7 xorl %r10d,%edi addl %r12d,%r8d movdqa 64(%rbp),%xmm6 rorl $6,%r13d andl %edi,%r15d pslldq $8,%xmm7 xorl %r9d,%r14d addl %r13d,%r8d xorl %r10d,%r15d paddd %xmm7,%xmm2 rorl $2,%r14d addl %r8d,%eax addl %r15d,%r8d paddd %xmm2,%xmm6 movl %eax,%r13d addl %r8d,%r14d movdqa %xmm6,32(%rsp) rorl $14,%r13d movdqa %xmm0,%xmm4 movl %r14d,%r8d movl %ebx,%r12d movdqa %xmm2,%xmm7 rorl $9,%r14d xorl %eax,%r13d xorl %ecx,%r12d rorl $5,%r13d xorl %r8d,%r14d .byte 102,15,58,15,227,4 andl %eax,%r12d xorl %eax,%r13d .byte 102,15,58,15,249,4 addl 48(%rsp),%edx movl %r8d,%r15d xorl %ecx,%r12d rorl $11,%r14d movdqa %xmm4,%xmm5 xorl %r9d,%r15d addl %r12d,%edx movdqa %xmm4,%xmm6 rorl $6,%r13d andl %r15d,%edi psrld $3,%xmm4 xorl %r8d,%r14d addl %r13d,%edx xorl %r9d,%edi paddd %xmm7,%xmm3 rorl $2,%r14d addl %edx,%r11d psrld $7,%xmm6 addl %edi,%edx movl %r11d,%r13d pshufd $250,%xmm2,%xmm7 addl %edx,%r14d rorl $14,%r13d pslld $14,%xmm5 movl %r14d,%edx movl %eax,%r12d pxor %xmm6,%xmm4 rorl $9,%r14d xorl %r11d,%r13d xorl %ebx,%r12d rorl $5,%r13d psrld $11,%xmm6 xorl %edx,%r14d pxor %xmm5,%xmm4 andl %r11d,%r12d xorl %r11d,%r13d pslld $11,%xmm5 addl 52(%rsp),%ecx movl %edx,%edi pxor %xmm6,%xmm4 xorl %ebx,%r12d rorl $11,%r14d movdqa %xmm7,%xmm6 xorl %r8d,%edi addl %r12d,%ecx pxor %xmm5,%xmm4 rorl $6,%r13d andl %edi,%r15d xorl %edx,%r14d psrld $10,%xmm7 addl %r13d,%ecx xorl %r8d,%r15d paddd %xmm4,%xmm3 rorl $2,%r14d addl %ecx,%r10d psrlq $17,%xmm6 addl %r15d,%ecx movl %r10d,%r13d addl %ecx,%r14d pxor %xmm6,%xmm7 rorl $14,%r13d movl %r14d,%ecx movl %r11d,%r12d rorl $9,%r14d psrlq $2,%xmm6 xorl %r10d,%r13d xorl %eax,%r12d pxor %xmm6,%xmm7 rorl $5,%r13d xorl %ecx,%r14d andl %r10d,%r12d pshufd $128,%xmm7,%xmm7 xorl %r10d,%r13d addl 56(%rsp),%ebx movl %ecx,%r15d psrldq $8,%xmm7 xorl %eax,%r12d rorl $11,%r14d xorl %edx,%r15d addl %r12d,%ebx rorl $6,%r13d paddd %xmm7,%xmm3 andl %r15d,%edi xorl %ecx,%r14d addl %r13d,%ebx pshufd $80,%xmm3,%xmm7 xorl %edx,%edi rorl $2,%r14d addl %ebx,%r9d movdqa %xmm7,%xmm6 addl %edi,%ebx movl %r9d,%r13d psrld $10,%xmm7 addl %ebx,%r14d rorl $14,%r13d psrlq $17,%xmm6 movl %r14d,%ebx movl %r10d,%r12d pxor %xmm6,%xmm7 rorl $9,%r14d xorl %r9d,%r13d xorl %r11d,%r12d rorl $5,%r13d xorl %ebx,%r14d psrlq $2,%xmm6 andl %r9d,%r12d xorl %r9d,%r13d addl 60(%rsp),%eax pxor %xmm6,%xmm7 movl %ebx,%edi xorl %r11d,%r12d rorl $11,%r14d pshufd $8,%xmm7,%xmm7 xorl %ecx,%edi addl %r12d,%eax movdqa 96(%rbp),%xmm6 rorl $6,%r13d andl %edi,%r15d pslldq $8,%xmm7 xorl %ebx,%r14d addl %r13d,%eax xorl %ecx,%r15d paddd %xmm7,%xmm3 rorl $2,%r14d addl %eax,%r8d addl %r15d,%eax paddd %xmm3,%xmm6 movl %r8d,%r13d addl %eax,%r14d movdqa %xmm6,48(%rsp) cmpb $0,131(%rbp) jne L$ssse3_00_47 rorl $14,%r13d movl %r14d,%eax movl %r9d,%r12d rorl $9,%r14d xorl %r8d,%r13d xorl %r10d,%r12d rorl $5,%r13d xorl %eax,%r14d andl %r8d,%r12d xorl %r8d,%r13d addl 
0(%rsp),%r11d movl %eax,%r15d xorl %r10d,%r12d rorl $11,%r14d xorl %ebx,%r15d addl %r12d,%r11d rorl $6,%r13d andl %r15d,%edi xorl %eax,%r14d addl %r13d,%r11d xorl %ebx,%edi rorl $2,%r14d addl %r11d,%edx addl %edi,%r11d movl %edx,%r13d addl %r11d,%r14d rorl $14,%r13d movl %r14d,%r11d movl %r8d,%r12d rorl $9,%r14d xorl %edx,%r13d xorl %r9d,%r12d rorl $5,%r13d xorl %r11d,%r14d andl %edx,%r12d xorl %edx,%r13d addl 4(%rsp),%r10d movl %r11d,%edi xorl %r9d,%r12d rorl $11,%r14d xorl %eax,%edi addl %r12d,%r10d rorl $6,%r13d andl %edi,%r15d xorl %r11d,%r14d addl %r13d,%r10d xorl %eax,%r15d rorl $2,%r14d addl %r10d,%ecx addl %r15d,%r10d movl %ecx,%r13d addl %r10d,%r14d rorl $14,%r13d movl %r14d,%r10d movl %edx,%r12d rorl $9,%r14d xorl %ecx,%r13d xorl %r8d,%r12d rorl $5,%r13d xorl %r10d,%r14d andl %ecx,%r12d xorl %ecx,%r13d addl 8(%rsp),%r9d movl %r10d,%r15d xorl %r8d,%r12d rorl $11,%r14d xorl %r11d,%r15d addl %r12d,%r9d rorl $6,%r13d andl %r15d,%edi xorl %r10d,%r14d addl %r13d,%r9d xorl %r11d,%edi rorl $2,%r14d addl %r9d,%ebx addl %edi,%r9d movl %ebx,%r13d addl %r9d,%r14d rorl $14,%r13d movl %r14d,%r9d movl %ecx,%r12d rorl $9,%r14d xorl %ebx,%r13d xorl %edx,%r12d rorl $5,%r13d xorl %r9d,%r14d andl %ebx,%r12d xorl %ebx,%r13d addl 12(%rsp),%r8d movl %r9d,%edi xorl %edx,%r12d rorl $11,%r14d xorl %r10d,%edi addl %r12d,%r8d rorl $6,%r13d andl %edi,%r15d xorl %r9d,%r14d addl %r13d,%r8d xorl %r10d,%r15d rorl $2,%r14d addl %r8d,%eax addl %r15d,%r8d movl %eax,%r13d addl %r8d,%r14d rorl $14,%r13d movl %r14d,%r8d movl %ebx,%r12d rorl $9,%r14d xorl %eax,%r13d xorl %ecx,%r12d rorl $5,%r13d xorl %r8d,%r14d andl %eax,%r12d xorl %eax,%r13d addl 16(%rsp),%edx movl %r8d,%r15d xorl %ecx,%r12d rorl $11,%r14d xorl %r9d,%r15d addl %r12d,%edx rorl $6,%r13d andl %r15d,%edi xorl %r8d,%r14d addl %r13d,%edx xorl %r9d,%edi rorl $2,%r14d addl %edx,%r11d addl %edi,%edx movl %r11d,%r13d addl %edx,%r14d rorl $14,%r13d movl %r14d,%edx movl %eax,%r12d rorl $9,%r14d xorl %r11d,%r13d xorl %ebx,%r12d rorl $5,%r13d xorl %edx,%r14d andl %r11d,%r12d xorl %r11d,%r13d addl 20(%rsp),%ecx movl %edx,%edi xorl %ebx,%r12d rorl $11,%r14d xorl %r8d,%edi addl %r12d,%ecx rorl $6,%r13d andl %edi,%r15d xorl %edx,%r14d addl %r13d,%ecx xorl %r8d,%r15d rorl $2,%r14d addl %ecx,%r10d addl %r15d,%ecx movl %r10d,%r13d addl %ecx,%r14d rorl $14,%r13d movl %r14d,%ecx movl %r11d,%r12d rorl $9,%r14d xorl %r10d,%r13d xorl %eax,%r12d rorl $5,%r13d xorl %ecx,%r14d andl %r10d,%r12d xorl %r10d,%r13d addl 24(%rsp),%ebx movl %ecx,%r15d xorl %eax,%r12d rorl $11,%r14d xorl %edx,%r15d addl %r12d,%ebx rorl $6,%r13d andl %r15d,%edi xorl %ecx,%r14d addl %r13d,%ebx xorl %edx,%edi rorl $2,%r14d addl %ebx,%r9d addl %edi,%ebx movl %r9d,%r13d addl %ebx,%r14d rorl $14,%r13d movl %r14d,%ebx movl %r10d,%r12d rorl $9,%r14d xorl %r9d,%r13d xorl %r11d,%r12d rorl $5,%r13d xorl %ebx,%r14d andl %r9d,%r12d xorl %r9d,%r13d addl 28(%rsp),%eax movl %ebx,%edi xorl %r11d,%r12d rorl $11,%r14d xorl %ecx,%edi addl %r12d,%eax rorl $6,%r13d andl %edi,%r15d xorl %ebx,%r14d addl %r13d,%eax xorl %ecx,%r15d rorl $2,%r14d addl %eax,%r8d addl %r15d,%eax movl %r8d,%r13d addl %eax,%r14d rorl $14,%r13d movl %r14d,%eax movl %r9d,%r12d rorl $9,%r14d xorl %r8d,%r13d xorl %r10d,%r12d rorl $5,%r13d xorl %eax,%r14d andl %r8d,%r12d xorl %r8d,%r13d addl 32(%rsp),%r11d movl %eax,%r15d xorl %r10d,%r12d rorl $11,%r14d xorl %ebx,%r15d addl %r12d,%r11d rorl $6,%r13d andl %r15d,%edi xorl %eax,%r14d addl %r13d,%r11d xorl %ebx,%edi rorl $2,%r14d addl %r11d,%edx addl %edi,%r11d movl %edx,%r13d addl %r11d,%r14d rorl $14,%r13d 
movl %r14d,%r11d movl %r8d,%r12d rorl $9,%r14d xorl %edx,%r13d xorl %r9d,%r12d rorl $5,%r13d xorl %r11d,%r14d andl %edx,%r12d xorl %edx,%r13d addl 36(%rsp),%r10d movl %r11d,%edi xorl %r9d,%r12d rorl $11,%r14d xorl %eax,%edi addl %r12d,%r10d rorl $6,%r13d andl %edi,%r15d xorl %r11d,%r14d addl %r13d,%r10d xorl %eax,%r15d rorl $2,%r14d addl %r10d,%ecx addl %r15d,%r10d movl %ecx,%r13d addl %r10d,%r14d rorl $14,%r13d movl %r14d,%r10d movl %edx,%r12d rorl $9,%r14d xorl %ecx,%r13d xorl %r8d,%r12d rorl $5,%r13d xorl %r10d,%r14d andl %ecx,%r12d xorl %ecx,%r13d addl 40(%rsp),%r9d movl %r10d,%r15d xorl %r8d,%r12d rorl $11,%r14d xorl %r11d,%r15d addl %r12d,%r9d rorl $6,%r13d andl %r15d,%edi xorl %r10d,%r14d addl %r13d,%r9d xorl %r11d,%edi rorl $2,%r14d addl %r9d,%ebx addl %edi,%r9d movl %ebx,%r13d addl %r9d,%r14d rorl $14,%r13d movl %r14d,%r9d movl %ecx,%r12d rorl $9,%r14d xorl %ebx,%r13d xorl %edx,%r12d rorl $5,%r13d xorl %r9d,%r14d andl %ebx,%r12d xorl %ebx,%r13d addl 44(%rsp),%r8d movl %r9d,%edi xorl %edx,%r12d rorl $11,%r14d xorl %r10d,%edi addl %r12d,%r8d rorl $6,%r13d andl %edi,%r15d xorl %r9d,%r14d addl %r13d,%r8d xorl %r10d,%r15d rorl $2,%r14d addl %r8d,%eax addl %r15d,%r8d movl %eax,%r13d addl %r8d,%r14d rorl $14,%r13d movl %r14d,%r8d movl %ebx,%r12d rorl $9,%r14d xorl %eax,%r13d xorl %ecx,%r12d rorl $5,%r13d xorl %r8d,%r14d andl %eax,%r12d xorl %eax,%r13d addl 48(%rsp),%edx movl %r8d,%r15d xorl %ecx,%r12d rorl $11,%r14d xorl %r9d,%r15d addl %r12d,%edx rorl $6,%r13d andl %r15d,%edi xorl %r8d,%r14d addl %r13d,%edx xorl %r9d,%edi rorl $2,%r14d addl %edx,%r11d addl %edi,%edx movl %r11d,%r13d addl %edx,%r14d rorl $14,%r13d movl %r14d,%edx movl %eax,%r12d rorl $9,%r14d xorl %r11d,%r13d xorl %ebx,%r12d rorl $5,%r13d xorl %edx,%r14d andl %r11d,%r12d xorl %r11d,%r13d addl 52(%rsp),%ecx movl %edx,%edi xorl %ebx,%r12d rorl $11,%r14d xorl %r8d,%edi addl %r12d,%ecx rorl $6,%r13d andl %edi,%r15d xorl %edx,%r14d addl %r13d,%ecx xorl %r8d,%r15d rorl $2,%r14d addl %ecx,%r10d addl %r15d,%ecx movl %r10d,%r13d addl %ecx,%r14d rorl $14,%r13d movl %r14d,%ecx movl %r11d,%r12d rorl $9,%r14d xorl %r10d,%r13d xorl %eax,%r12d rorl $5,%r13d xorl %ecx,%r14d andl %r10d,%r12d xorl %r10d,%r13d addl 56(%rsp),%ebx movl %ecx,%r15d xorl %eax,%r12d rorl $11,%r14d xorl %edx,%r15d addl %r12d,%ebx rorl $6,%r13d andl %r15d,%edi xorl %ecx,%r14d addl %r13d,%ebx xorl %edx,%edi rorl $2,%r14d addl %ebx,%r9d addl %edi,%ebx movl %r9d,%r13d addl %ebx,%r14d rorl $14,%r13d movl %r14d,%ebx movl %r10d,%r12d rorl $9,%r14d xorl %r9d,%r13d xorl %r11d,%r12d rorl $5,%r13d xorl %ebx,%r14d andl %r9d,%r12d xorl %r9d,%r13d addl 60(%rsp),%eax movl %ebx,%edi xorl %r11d,%r12d rorl $11,%r14d xorl %ecx,%edi addl %r12d,%eax rorl $6,%r13d andl %edi,%r15d xorl %ebx,%r14d addl %r13d,%eax xorl %ecx,%r15d rorl $2,%r14d addl %eax,%r8d addl %r15d,%eax movl %r8d,%r13d addl %eax,%r14d movq 64+0(%rsp),%rdi movl %r14d,%eax addl 0(%rdi),%eax leaq 64(%rsi),%rsi addl 4(%rdi),%ebx addl 8(%rdi),%ecx addl 12(%rdi),%edx addl 16(%rdi),%r8d addl 20(%rdi),%r9d addl 24(%rdi),%r10d addl 28(%rdi),%r11d cmpq 64+16(%rsp),%rsi movl %eax,0(%rdi) movl %ebx,4(%rdi) movl %ecx,8(%rdi) movl %edx,12(%rdi) movl %r8d,16(%rdi) movl %r9d,20(%rdi) movl %r10d,24(%rdi) movl %r11d,28(%rdi) jb L$loop_ssse3 movq 88(%rsp),%rsi movq -48(%rsi),%r15 movq -40(%rsi),%r14 movq -32(%rsi),%r13 movq -24(%rsi),%r12 movq -16(%rsi),%rbp movq -8(%rsi),%rbx leaq (%rsi),%rsp L$epilogue_ssse3: ret .globl _sha256_block_data_order_avx .private_extern _sha256_block_data_order_avx .p2align 6 _sha256_block_data_order_avx: 
_CET_ENDBR movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 shlq $4,%rdx subq $96,%rsp leaq (%rsi,%rdx,4),%rdx andq $-64,%rsp movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) movq %rax,88(%rsp) L$prologue_avx: vzeroupper movl 0(%rdi),%eax movl 4(%rdi),%ebx movl 8(%rdi),%ecx movl 12(%rdi),%edx movl 16(%rdi),%r8d movl 20(%rdi),%r9d movl 24(%rdi),%r10d movl 28(%rdi),%r11d vmovdqa K256+512+32(%rip),%xmm8 vmovdqa K256+512+64(%rip),%xmm9 jmp L$loop_avx .p2align 4 L$loop_avx: vmovdqa K256+512(%rip),%xmm7 vmovdqu 0(%rsi),%xmm0 vmovdqu 16(%rsi),%xmm1 vmovdqu 32(%rsi),%xmm2 vmovdqu 48(%rsi),%xmm3 vpshufb %xmm7,%xmm0,%xmm0 leaq K256(%rip),%rbp vpshufb %xmm7,%xmm1,%xmm1 vpshufb %xmm7,%xmm2,%xmm2 vpaddd 0(%rbp),%xmm0,%xmm4 vpshufb %xmm7,%xmm3,%xmm3 vpaddd 32(%rbp),%xmm1,%xmm5 vpaddd 64(%rbp),%xmm2,%xmm6 vpaddd 96(%rbp),%xmm3,%xmm7 vmovdqa %xmm4,0(%rsp) movl %eax,%r14d vmovdqa %xmm5,16(%rsp) movl %ebx,%edi vmovdqa %xmm6,32(%rsp) xorl %ecx,%edi vmovdqa %xmm7,48(%rsp) movl %r8d,%r13d jmp L$avx_00_47 .p2align 4 L$avx_00_47: subq $-128,%rbp vpalignr $4,%xmm0,%xmm1,%xmm4 shrdl $14,%r13d,%r13d movl %r14d,%eax movl %r9d,%r12d vpalignr $4,%xmm2,%xmm3,%xmm7 shrdl $9,%r14d,%r14d xorl %r8d,%r13d xorl %r10d,%r12d vpsrld $7,%xmm4,%xmm6 shrdl $5,%r13d,%r13d xorl %eax,%r14d andl %r8d,%r12d vpaddd %xmm7,%xmm0,%xmm0 xorl %r8d,%r13d addl 0(%rsp),%r11d movl %eax,%r15d vpsrld $3,%xmm4,%xmm7 xorl %r10d,%r12d shrdl $11,%r14d,%r14d xorl %ebx,%r15d vpslld $14,%xmm4,%xmm5 addl %r12d,%r11d shrdl $6,%r13d,%r13d andl %r15d,%edi vpxor %xmm6,%xmm7,%xmm4 xorl %eax,%r14d addl %r13d,%r11d xorl %ebx,%edi vpshufd $250,%xmm3,%xmm7 shrdl $2,%r14d,%r14d addl %r11d,%edx addl %edi,%r11d vpsrld $11,%xmm6,%xmm6 movl %edx,%r13d addl %r11d,%r14d shrdl $14,%r13d,%r13d vpxor %xmm5,%xmm4,%xmm4 movl %r14d,%r11d movl %r8d,%r12d shrdl $9,%r14d,%r14d vpslld $11,%xmm5,%xmm5 xorl %edx,%r13d xorl %r9d,%r12d shrdl $5,%r13d,%r13d vpxor %xmm6,%xmm4,%xmm4 xorl %r11d,%r14d andl %edx,%r12d xorl %edx,%r13d vpsrld $10,%xmm7,%xmm6 addl 4(%rsp),%r10d movl %r11d,%edi xorl %r9d,%r12d vpxor %xmm5,%xmm4,%xmm4 shrdl $11,%r14d,%r14d xorl %eax,%edi addl %r12d,%r10d vpsrlq $17,%xmm7,%xmm7 shrdl $6,%r13d,%r13d andl %edi,%r15d xorl %r11d,%r14d vpaddd %xmm4,%xmm0,%xmm0 addl %r13d,%r10d xorl %eax,%r15d shrdl $2,%r14d,%r14d vpxor %xmm7,%xmm6,%xmm6 addl %r10d,%ecx addl %r15d,%r10d movl %ecx,%r13d vpsrlq $2,%xmm7,%xmm7 addl %r10d,%r14d shrdl $14,%r13d,%r13d movl %r14d,%r10d vpxor %xmm7,%xmm6,%xmm6 movl %edx,%r12d shrdl $9,%r14d,%r14d xorl %ecx,%r13d vpshufb %xmm8,%xmm6,%xmm6 xorl %r8d,%r12d shrdl $5,%r13d,%r13d xorl %r10d,%r14d vpaddd %xmm6,%xmm0,%xmm0 andl %ecx,%r12d xorl %ecx,%r13d addl 8(%rsp),%r9d vpshufd $80,%xmm0,%xmm7 movl %r10d,%r15d xorl %r8d,%r12d shrdl $11,%r14d,%r14d vpsrld $10,%xmm7,%xmm6 xorl %r11d,%r15d addl %r12d,%r9d shrdl $6,%r13d,%r13d vpsrlq $17,%xmm7,%xmm7 andl %r15d,%edi xorl %r10d,%r14d addl %r13d,%r9d vpxor %xmm7,%xmm6,%xmm6 xorl %r11d,%edi shrdl $2,%r14d,%r14d addl %r9d,%ebx vpsrlq $2,%xmm7,%xmm7 addl %edi,%r9d movl %ebx,%r13d addl %r9d,%r14d vpxor %xmm7,%xmm6,%xmm6 shrdl $14,%r13d,%r13d movl %r14d,%r9d movl %ecx,%r12d vpshufb %xmm9,%xmm6,%xmm6 shrdl $9,%r14d,%r14d xorl %ebx,%r13d xorl %edx,%r12d vpaddd %xmm6,%xmm0,%xmm0 shrdl $5,%r13d,%r13d xorl %r9d,%r14d andl %ebx,%r12d vpaddd 0(%rbp),%xmm0,%xmm6 xorl %ebx,%r13d addl 12(%rsp),%r8d movl %r9d,%edi xorl %edx,%r12d shrdl $11,%r14d,%r14d xorl %r10d,%edi addl %r12d,%r8d shrdl $6,%r13d,%r13d andl %edi,%r15d xorl %r9d,%r14d addl %r13d,%r8d xorl %r10d,%r15d shrdl 
$2,%r14d,%r14d addl %r8d,%eax addl %r15d,%r8d movl %eax,%r13d addl %r8d,%r14d vmovdqa %xmm6,0(%rsp) vpalignr $4,%xmm1,%xmm2,%xmm4 shrdl $14,%r13d,%r13d movl %r14d,%r8d movl %ebx,%r12d vpalignr $4,%xmm3,%xmm0,%xmm7 shrdl $9,%r14d,%r14d xorl %eax,%r13d xorl %ecx,%r12d vpsrld $7,%xmm4,%xmm6 shrdl $5,%r13d,%r13d xorl %r8d,%r14d andl %eax,%r12d vpaddd %xmm7,%xmm1,%xmm1 xorl %eax,%r13d addl 16(%rsp),%edx movl %r8d,%r15d vpsrld $3,%xmm4,%xmm7 xorl %ecx,%r12d shrdl $11,%r14d,%r14d xorl %r9d,%r15d vpslld $14,%xmm4,%xmm5 addl %r12d,%edx shrdl $6,%r13d,%r13d andl %r15d,%edi vpxor %xmm6,%xmm7,%xmm4 xorl %r8d,%r14d addl %r13d,%edx xorl %r9d,%edi vpshufd $250,%xmm0,%xmm7 shrdl $2,%r14d,%r14d addl %edx,%r11d addl %edi,%edx vpsrld $11,%xmm6,%xmm6 movl %r11d,%r13d addl %edx,%r14d shrdl $14,%r13d,%r13d vpxor %xmm5,%xmm4,%xmm4 movl %r14d,%edx movl %eax,%r12d shrdl $9,%r14d,%r14d vpslld $11,%xmm5,%xmm5 xorl %r11d,%r13d xorl %ebx,%r12d shrdl $5,%r13d,%r13d vpxor %xmm6,%xmm4,%xmm4 xorl %edx,%r14d andl %r11d,%r12d xorl %r11d,%r13d vpsrld $10,%xmm7,%xmm6 addl 20(%rsp),%ecx movl %edx,%edi xorl %ebx,%r12d vpxor %xmm5,%xmm4,%xmm4 shrdl $11,%r14d,%r14d xorl %r8d,%edi addl %r12d,%ecx vpsrlq $17,%xmm7,%xmm7 shrdl $6,%r13d,%r13d andl %edi,%r15d xorl %edx,%r14d vpaddd %xmm4,%xmm1,%xmm1 addl %r13d,%ecx xorl %r8d,%r15d shrdl $2,%r14d,%r14d vpxor %xmm7,%xmm6,%xmm6 addl %ecx,%r10d addl %r15d,%ecx movl %r10d,%r13d vpsrlq $2,%xmm7,%xmm7 addl %ecx,%r14d shrdl $14,%r13d,%r13d movl %r14d,%ecx vpxor %xmm7,%xmm6,%xmm6 movl %r11d,%r12d shrdl $9,%r14d,%r14d xorl %r10d,%r13d vpshufb %xmm8,%xmm6,%xmm6 xorl %eax,%r12d shrdl $5,%r13d,%r13d xorl %ecx,%r14d vpaddd %xmm6,%xmm1,%xmm1 andl %r10d,%r12d xorl %r10d,%r13d addl 24(%rsp),%ebx vpshufd $80,%xmm1,%xmm7 movl %ecx,%r15d xorl %eax,%r12d shrdl $11,%r14d,%r14d vpsrld $10,%xmm7,%xmm6 xorl %edx,%r15d addl %r12d,%ebx shrdl $6,%r13d,%r13d vpsrlq $17,%xmm7,%xmm7 andl %r15d,%edi xorl %ecx,%r14d addl %r13d,%ebx vpxor %xmm7,%xmm6,%xmm6 xorl %edx,%edi shrdl $2,%r14d,%r14d addl %ebx,%r9d vpsrlq $2,%xmm7,%xmm7 addl %edi,%ebx movl %r9d,%r13d addl %ebx,%r14d vpxor %xmm7,%xmm6,%xmm6 shrdl $14,%r13d,%r13d movl %r14d,%ebx movl %r10d,%r12d vpshufb %xmm9,%xmm6,%xmm6 shrdl $9,%r14d,%r14d xorl %r9d,%r13d xorl %r11d,%r12d vpaddd %xmm6,%xmm1,%xmm1 shrdl $5,%r13d,%r13d xorl %ebx,%r14d andl %r9d,%r12d vpaddd 32(%rbp),%xmm1,%xmm6 xorl %r9d,%r13d addl 28(%rsp),%eax movl %ebx,%edi xorl %r11d,%r12d shrdl $11,%r14d,%r14d xorl %ecx,%edi addl %r12d,%eax shrdl $6,%r13d,%r13d andl %edi,%r15d xorl %ebx,%r14d addl %r13d,%eax xorl %ecx,%r15d shrdl $2,%r14d,%r14d addl %eax,%r8d addl %r15d,%eax movl %r8d,%r13d addl %eax,%r14d vmovdqa %xmm6,16(%rsp) vpalignr $4,%xmm2,%xmm3,%xmm4 shrdl $14,%r13d,%r13d movl %r14d,%eax movl %r9d,%r12d vpalignr $4,%xmm0,%xmm1,%xmm7 shrdl $9,%r14d,%r14d xorl %r8d,%r13d xorl %r10d,%r12d vpsrld $7,%xmm4,%xmm6 shrdl $5,%r13d,%r13d xorl %eax,%r14d andl %r8d,%r12d vpaddd %xmm7,%xmm2,%xmm2 xorl %r8d,%r13d addl 32(%rsp),%r11d movl %eax,%r15d vpsrld $3,%xmm4,%xmm7 xorl %r10d,%r12d shrdl $11,%r14d,%r14d xorl %ebx,%r15d vpslld $14,%xmm4,%xmm5 addl %r12d,%r11d shrdl $6,%r13d,%r13d andl %r15d,%edi vpxor %xmm6,%xmm7,%xmm4 xorl %eax,%r14d addl %r13d,%r11d xorl %ebx,%edi vpshufd $250,%xmm1,%xmm7 shrdl $2,%r14d,%r14d addl %r11d,%edx addl %edi,%r11d vpsrld $11,%xmm6,%xmm6 movl %edx,%r13d addl %r11d,%r14d shrdl $14,%r13d,%r13d vpxor %xmm5,%xmm4,%xmm4 movl %r14d,%r11d movl %r8d,%r12d shrdl $9,%r14d,%r14d vpslld $11,%xmm5,%xmm5 xorl %edx,%r13d xorl %r9d,%r12d shrdl $5,%r13d,%r13d vpxor %xmm6,%xmm4,%xmm4 xorl 
%r11d,%r14d andl %edx,%r12d xorl %edx,%r13d vpsrld $10,%xmm7,%xmm6 addl 36(%rsp),%r10d movl %r11d,%edi xorl %r9d,%r12d vpxor %xmm5,%xmm4,%xmm4 shrdl $11,%r14d,%r14d xorl %eax,%edi addl %r12d,%r10d vpsrlq $17,%xmm7,%xmm7 shrdl $6,%r13d,%r13d andl %edi,%r15d xorl %r11d,%r14d vpaddd %xmm4,%xmm2,%xmm2 addl %r13d,%r10d xorl %eax,%r15d shrdl $2,%r14d,%r14d vpxor %xmm7,%xmm6,%xmm6 addl %r10d,%ecx addl %r15d,%r10d movl %ecx,%r13d vpsrlq $2,%xmm7,%xmm7 addl %r10d,%r14d shrdl $14,%r13d,%r13d movl %r14d,%r10d vpxor %xmm7,%xmm6,%xmm6 movl %edx,%r12d shrdl $9,%r14d,%r14d xorl %ecx,%r13d vpshufb %xmm8,%xmm6,%xmm6 xorl %r8d,%r12d shrdl $5,%r13d,%r13d xorl %r10d,%r14d vpaddd %xmm6,%xmm2,%xmm2 andl %ecx,%r12d xorl %ecx,%r13d addl 40(%rsp),%r9d vpshufd $80,%xmm2,%xmm7 movl %r10d,%r15d xorl %r8d,%r12d shrdl $11,%r14d,%r14d vpsrld $10,%xmm7,%xmm6 xorl %r11d,%r15d addl %r12d,%r9d shrdl $6,%r13d,%r13d vpsrlq $17,%xmm7,%xmm7 andl %r15d,%edi xorl %r10d,%r14d addl %r13d,%r9d vpxor %xmm7,%xmm6,%xmm6 xorl %r11d,%edi shrdl $2,%r14d,%r14d addl %r9d,%ebx vpsrlq $2,%xmm7,%xmm7 addl %edi,%r9d movl %ebx,%r13d addl %r9d,%r14d vpxor %xmm7,%xmm6,%xmm6 shrdl $14,%r13d,%r13d movl %r14d,%r9d movl %ecx,%r12d vpshufb %xmm9,%xmm6,%xmm6 shrdl $9,%r14d,%r14d xorl %ebx,%r13d xorl %edx,%r12d vpaddd %xmm6,%xmm2,%xmm2 shrdl $5,%r13d,%r13d xorl %r9d,%r14d andl %ebx,%r12d vpaddd 64(%rbp),%xmm2,%xmm6 xorl %ebx,%r13d addl 44(%rsp),%r8d movl %r9d,%edi xorl %edx,%r12d shrdl $11,%r14d,%r14d xorl %r10d,%edi addl %r12d,%r8d shrdl $6,%r13d,%r13d andl %edi,%r15d xorl %r9d,%r14d addl %r13d,%r8d xorl %r10d,%r15d shrdl $2,%r14d,%r14d addl %r8d,%eax addl %r15d,%r8d movl %eax,%r13d addl %r8d,%r14d vmovdqa %xmm6,32(%rsp) vpalignr $4,%xmm3,%xmm0,%xmm4 shrdl $14,%r13d,%r13d movl %r14d,%r8d movl %ebx,%r12d vpalignr $4,%xmm1,%xmm2,%xmm7 shrdl $9,%r14d,%r14d xorl %eax,%r13d xorl %ecx,%r12d vpsrld $7,%xmm4,%xmm6 shrdl $5,%r13d,%r13d xorl %r8d,%r14d andl %eax,%r12d vpaddd %xmm7,%xmm3,%xmm3 xorl %eax,%r13d addl 48(%rsp),%edx movl %r8d,%r15d vpsrld $3,%xmm4,%xmm7 xorl %ecx,%r12d shrdl $11,%r14d,%r14d xorl %r9d,%r15d vpslld $14,%xmm4,%xmm5 addl %r12d,%edx shrdl $6,%r13d,%r13d andl %r15d,%edi vpxor %xmm6,%xmm7,%xmm4 xorl %r8d,%r14d addl %r13d,%edx xorl %r9d,%edi vpshufd $250,%xmm2,%xmm7 shrdl $2,%r14d,%r14d addl %edx,%r11d addl %edi,%edx vpsrld $11,%xmm6,%xmm6 movl %r11d,%r13d addl %edx,%r14d shrdl $14,%r13d,%r13d vpxor %xmm5,%xmm4,%xmm4 movl %r14d,%edx movl %eax,%r12d shrdl $9,%r14d,%r14d vpslld $11,%xmm5,%xmm5 xorl %r11d,%r13d xorl %ebx,%r12d shrdl $5,%r13d,%r13d vpxor %xmm6,%xmm4,%xmm4 xorl %edx,%r14d andl %r11d,%r12d xorl %r11d,%r13d vpsrld $10,%xmm7,%xmm6 addl 52(%rsp),%ecx movl %edx,%edi xorl %ebx,%r12d vpxor %xmm5,%xmm4,%xmm4 shrdl $11,%r14d,%r14d xorl %r8d,%edi addl %r12d,%ecx vpsrlq $17,%xmm7,%xmm7 shrdl $6,%r13d,%r13d andl %edi,%r15d xorl %edx,%r14d vpaddd %xmm4,%xmm3,%xmm3 addl %r13d,%ecx xorl %r8d,%r15d shrdl $2,%r14d,%r14d vpxor %xmm7,%xmm6,%xmm6 addl %ecx,%r10d addl %r15d,%ecx movl %r10d,%r13d vpsrlq $2,%xmm7,%xmm7 addl %ecx,%r14d shrdl $14,%r13d,%r13d movl %r14d,%ecx vpxor %xmm7,%xmm6,%xmm6 movl %r11d,%r12d shrdl $9,%r14d,%r14d xorl %r10d,%r13d vpshufb %xmm8,%xmm6,%xmm6 xorl %eax,%r12d shrdl $5,%r13d,%r13d xorl %ecx,%r14d vpaddd %xmm6,%xmm3,%xmm3 andl %r10d,%r12d xorl %r10d,%r13d addl 56(%rsp),%ebx vpshufd $80,%xmm3,%xmm7 movl %ecx,%r15d xorl %eax,%r12d shrdl $11,%r14d,%r14d vpsrld $10,%xmm7,%xmm6 xorl %edx,%r15d addl %r12d,%ebx shrdl $6,%r13d,%r13d vpsrlq $17,%xmm7,%xmm7 andl %r15d,%edi xorl %ecx,%r14d addl %r13d,%ebx vpxor %xmm7,%xmm6,%xmm6 xorl 
%edx,%edi shrdl $2,%r14d,%r14d addl %ebx,%r9d vpsrlq $2,%xmm7,%xmm7 addl %edi,%ebx movl %r9d,%r13d addl %ebx,%r14d vpxor %xmm7,%xmm6,%xmm6 shrdl $14,%r13d,%r13d movl %r14d,%ebx movl %r10d,%r12d vpshufb %xmm9,%xmm6,%xmm6 shrdl $9,%r14d,%r14d xorl %r9d,%r13d xorl %r11d,%r12d vpaddd %xmm6,%xmm3,%xmm3 shrdl $5,%r13d,%r13d xorl %ebx,%r14d andl %r9d,%r12d vpaddd 96(%rbp),%xmm3,%xmm6 xorl %r9d,%r13d addl 60(%rsp),%eax movl %ebx,%edi xorl %r11d,%r12d shrdl $11,%r14d,%r14d xorl %ecx,%edi addl %r12d,%eax shrdl $6,%r13d,%r13d andl %edi,%r15d xorl %ebx,%r14d addl %r13d,%eax xorl %ecx,%r15d shrdl $2,%r14d,%r14d addl %eax,%r8d addl %r15d,%eax movl %r8d,%r13d addl %eax,%r14d vmovdqa %xmm6,48(%rsp) cmpb $0,131(%rbp) jne L$avx_00_47 shrdl $14,%r13d,%r13d movl %r14d,%eax movl %r9d,%r12d shrdl $9,%r14d,%r14d xorl %r8d,%r13d xorl %r10d,%r12d shrdl $5,%r13d,%r13d xorl %eax,%r14d andl %r8d,%r12d xorl %r8d,%r13d addl 0(%rsp),%r11d movl %eax,%r15d xorl %r10d,%r12d shrdl $11,%r14d,%r14d xorl %ebx,%r15d addl %r12d,%r11d shrdl $6,%r13d,%r13d andl %r15d,%edi xorl %eax,%r14d addl %r13d,%r11d xorl %ebx,%edi shrdl $2,%r14d,%r14d addl %r11d,%edx addl %edi,%r11d movl %edx,%r13d addl %r11d,%r14d shrdl $14,%r13d,%r13d movl %r14d,%r11d movl %r8d,%r12d shrdl $9,%r14d,%r14d xorl %edx,%r13d xorl %r9d,%r12d shrdl $5,%r13d,%r13d xorl %r11d,%r14d andl %edx,%r12d xorl %edx,%r13d addl 4(%rsp),%r10d movl %r11d,%edi xorl %r9d,%r12d shrdl $11,%r14d,%r14d xorl %eax,%edi addl %r12d,%r10d shrdl $6,%r13d,%r13d andl %edi,%r15d xorl %r11d,%r14d addl %r13d,%r10d xorl %eax,%r15d shrdl $2,%r14d,%r14d addl %r10d,%ecx addl %r15d,%r10d movl %ecx,%r13d addl %r10d,%r14d shrdl $14,%r13d,%r13d movl %r14d,%r10d movl %edx,%r12d shrdl $9,%r14d,%r14d xorl %ecx,%r13d xorl %r8d,%r12d shrdl $5,%r13d,%r13d xorl %r10d,%r14d andl %ecx,%r12d xorl %ecx,%r13d addl 8(%rsp),%r9d movl %r10d,%r15d xorl %r8d,%r12d shrdl $11,%r14d,%r14d xorl %r11d,%r15d addl %r12d,%r9d shrdl $6,%r13d,%r13d andl %r15d,%edi xorl %r10d,%r14d addl %r13d,%r9d xorl %r11d,%edi shrdl $2,%r14d,%r14d addl %r9d,%ebx addl %edi,%r9d movl %ebx,%r13d addl %r9d,%r14d shrdl $14,%r13d,%r13d movl %r14d,%r9d movl %ecx,%r12d shrdl $9,%r14d,%r14d xorl %ebx,%r13d xorl %edx,%r12d shrdl $5,%r13d,%r13d xorl %r9d,%r14d andl %ebx,%r12d xorl %ebx,%r13d addl 12(%rsp),%r8d movl %r9d,%edi xorl %edx,%r12d shrdl $11,%r14d,%r14d xorl %r10d,%edi addl %r12d,%r8d shrdl $6,%r13d,%r13d andl %edi,%r15d xorl %r9d,%r14d addl %r13d,%r8d xorl %r10d,%r15d shrdl $2,%r14d,%r14d addl %r8d,%eax addl %r15d,%r8d movl %eax,%r13d addl %r8d,%r14d shrdl $14,%r13d,%r13d movl %r14d,%r8d movl %ebx,%r12d shrdl $9,%r14d,%r14d xorl %eax,%r13d xorl %ecx,%r12d shrdl $5,%r13d,%r13d xorl %r8d,%r14d andl %eax,%r12d xorl %eax,%r13d addl 16(%rsp),%edx movl %r8d,%r15d xorl %ecx,%r12d shrdl $11,%r14d,%r14d xorl %r9d,%r15d addl %r12d,%edx shrdl $6,%r13d,%r13d andl %r15d,%edi xorl %r8d,%r14d addl %r13d,%edx xorl %r9d,%edi shrdl $2,%r14d,%r14d addl %edx,%r11d addl %edi,%edx movl %r11d,%r13d addl %edx,%r14d shrdl $14,%r13d,%r13d movl %r14d,%edx movl %eax,%r12d shrdl $9,%r14d,%r14d xorl %r11d,%r13d xorl %ebx,%r12d shrdl $5,%r13d,%r13d xorl %edx,%r14d andl %r11d,%r12d xorl %r11d,%r13d addl 20(%rsp),%ecx movl %edx,%edi xorl %ebx,%r12d shrdl $11,%r14d,%r14d xorl %r8d,%edi addl %r12d,%ecx shrdl $6,%r13d,%r13d andl %edi,%r15d xorl %edx,%r14d addl %r13d,%ecx xorl %r8d,%r15d shrdl $2,%r14d,%r14d addl %ecx,%r10d addl %r15d,%ecx movl %r10d,%r13d addl %ecx,%r14d shrdl $14,%r13d,%r13d movl %r14d,%ecx movl %r11d,%r12d shrdl $9,%r14d,%r14d xorl %r10d,%r13d xorl %eax,%r12d 
shrdl $5,%r13d,%r13d xorl %ecx,%r14d andl %r10d,%r12d xorl %r10d,%r13d addl 24(%rsp),%ebx movl %ecx,%r15d xorl %eax,%r12d shrdl $11,%r14d,%r14d xorl %edx,%r15d addl %r12d,%ebx shrdl $6,%r13d,%r13d andl %r15d,%edi xorl %ecx,%r14d addl %r13d,%ebx xorl %edx,%edi shrdl $2,%r14d,%r14d addl %ebx,%r9d addl %edi,%ebx movl %r9d,%r13d addl %ebx,%r14d shrdl $14,%r13d,%r13d movl %r14d,%ebx movl %r10d,%r12d shrdl $9,%r14d,%r14d xorl %r9d,%r13d xorl %r11d,%r12d shrdl $5,%r13d,%r13d xorl %ebx,%r14d andl %r9d,%r12d xorl %r9d,%r13d addl 28(%rsp),%eax movl %ebx,%edi xorl %r11d,%r12d shrdl $11,%r14d,%r14d xorl %ecx,%edi addl %r12d,%eax shrdl $6,%r13d,%r13d andl %edi,%r15d xorl %ebx,%r14d addl %r13d,%eax xorl %ecx,%r15d shrdl $2,%r14d,%r14d addl %eax,%r8d addl %r15d,%eax movl %r8d,%r13d addl %eax,%r14d shrdl $14,%r13d,%r13d movl %r14d,%eax movl %r9d,%r12d shrdl $9,%r14d,%r14d xorl %r8d,%r13d xorl %r10d,%r12d shrdl $5,%r13d,%r13d xorl %eax,%r14d andl %r8d,%r12d xorl %r8d,%r13d addl 32(%rsp),%r11d movl %eax,%r15d xorl %r10d,%r12d shrdl $11,%r14d,%r14d xorl %ebx,%r15d addl %r12d,%r11d shrdl $6,%r13d,%r13d andl %r15d,%edi xorl %eax,%r14d addl %r13d,%r11d xorl %ebx,%edi shrdl $2,%r14d,%r14d addl %r11d,%edx addl %edi,%r11d movl %edx,%r13d addl %r11d,%r14d shrdl $14,%r13d,%r13d movl %r14d,%r11d movl %r8d,%r12d shrdl $9,%r14d,%r14d xorl %edx,%r13d xorl %r9d,%r12d shrdl $5,%r13d,%r13d xorl %r11d,%r14d andl %edx,%r12d xorl %edx,%r13d addl 36(%rsp),%r10d movl %r11d,%edi xorl %r9d,%r12d shrdl $11,%r14d,%r14d xorl %eax,%edi addl %r12d,%r10d shrdl $6,%r13d,%r13d andl %edi,%r15d xorl %r11d,%r14d addl %r13d,%r10d xorl %eax,%r15d shrdl $2,%r14d,%r14d addl %r10d,%ecx addl %r15d,%r10d movl %ecx,%r13d addl %r10d,%r14d shrdl $14,%r13d,%r13d movl %r14d,%r10d movl %edx,%r12d shrdl $9,%r14d,%r14d xorl %ecx,%r13d xorl %r8d,%r12d shrdl $5,%r13d,%r13d xorl %r10d,%r14d andl %ecx,%r12d xorl %ecx,%r13d addl 40(%rsp),%r9d movl %r10d,%r15d xorl %r8d,%r12d shrdl $11,%r14d,%r14d xorl %r11d,%r15d addl %r12d,%r9d shrdl $6,%r13d,%r13d andl %r15d,%edi xorl %r10d,%r14d addl %r13d,%r9d xorl %r11d,%edi shrdl $2,%r14d,%r14d addl %r9d,%ebx addl %edi,%r9d movl %ebx,%r13d addl %r9d,%r14d shrdl $14,%r13d,%r13d movl %r14d,%r9d movl %ecx,%r12d shrdl $9,%r14d,%r14d xorl %ebx,%r13d xorl %edx,%r12d shrdl $5,%r13d,%r13d xorl %r9d,%r14d andl %ebx,%r12d xorl %ebx,%r13d addl 44(%rsp),%r8d movl %r9d,%edi xorl %edx,%r12d shrdl $11,%r14d,%r14d xorl %r10d,%edi addl %r12d,%r8d shrdl $6,%r13d,%r13d andl %edi,%r15d xorl %r9d,%r14d addl %r13d,%r8d xorl %r10d,%r15d shrdl $2,%r14d,%r14d addl %r8d,%eax addl %r15d,%r8d movl %eax,%r13d addl %r8d,%r14d shrdl $14,%r13d,%r13d movl %r14d,%r8d movl %ebx,%r12d shrdl $9,%r14d,%r14d xorl %eax,%r13d xorl %ecx,%r12d shrdl $5,%r13d,%r13d xorl %r8d,%r14d andl %eax,%r12d xorl %eax,%r13d addl 48(%rsp),%edx movl %r8d,%r15d xorl %ecx,%r12d shrdl $11,%r14d,%r14d xorl %r9d,%r15d addl %r12d,%edx shrdl $6,%r13d,%r13d andl %r15d,%edi xorl %r8d,%r14d addl %r13d,%edx xorl %r9d,%edi shrdl $2,%r14d,%r14d addl %edx,%r11d addl %edi,%edx movl %r11d,%r13d addl %edx,%r14d shrdl $14,%r13d,%r13d movl %r14d,%edx movl %eax,%r12d shrdl $9,%r14d,%r14d xorl %r11d,%r13d xorl %ebx,%r12d shrdl $5,%r13d,%r13d xorl %edx,%r14d andl %r11d,%r12d xorl %r11d,%r13d addl 52(%rsp),%ecx movl %edx,%edi xorl %ebx,%r12d shrdl $11,%r14d,%r14d xorl %r8d,%edi addl %r12d,%ecx shrdl $6,%r13d,%r13d andl %edi,%r15d xorl %edx,%r14d addl %r13d,%ecx xorl %r8d,%r15d shrdl $2,%r14d,%r14d addl %ecx,%r10d addl %r15d,%ecx movl %r10d,%r13d addl %ecx,%r14d shrdl $14,%r13d,%r13d movl %r14d,%ecx 
movl %r11d,%r12d shrdl $9,%r14d,%r14d xorl %r10d,%r13d xorl %eax,%r12d shrdl $5,%r13d,%r13d xorl %ecx,%r14d andl %r10d,%r12d xorl %r10d,%r13d addl 56(%rsp),%ebx movl %ecx,%r15d xorl %eax,%r12d shrdl $11,%r14d,%r14d xorl %edx,%r15d addl %r12d,%ebx shrdl $6,%r13d,%r13d andl %r15d,%edi xorl %ecx,%r14d addl %r13d,%ebx xorl %edx,%edi shrdl $2,%r14d,%r14d addl %ebx,%r9d addl %edi,%ebx movl %r9d,%r13d addl %ebx,%r14d shrdl $14,%r13d,%r13d movl %r14d,%ebx movl %r10d,%r12d shrdl $9,%r14d,%r14d xorl %r9d,%r13d xorl %r11d,%r12d shrdl $5,%r13d,%r13d xorl %ebx,%r14d andl %r9d,%r12d xorl %r9d,%r13d addl 60(%rsp),%eax movl %ebx,%edi xorl %r11d,%r12d shrdl $11,%r14d,%r14d xorl %ecx,%edi addl %r12d,%eax shrdl $6,%r13d,%r13d andl %edi,%r15d xorl %ebx,%r14d addl %r13d,%eax xorl %ecx,%r15d shrdl $2,%r14d,%r14d addl %eax,%r8d addl %r15d,%eax movl %r8d,%r13d addl %eax,%r14d movq 64+0(%rsp),%rdi movl %r14d,%eax addl 0(%rdi),%eax leaq 64(%rsi),%rsi addl 4(%rdi),%ebx addl 8(%rdi),%ecx addl 12(%rdi),%edx addl 16(%rdi),%r8d addl 20(%rdi),%r9d addl 24(%rdi),%r10d addl 28(%rdi),%r11d cmpq 64+16(%rsp),%rsi movl %eax,0(%rdi) movl %ebx,4(%rdi) movl %ecx,8(%rdi) movl %edx,12(%rdi) movl %r8d,16(%rdi) movl %r9d,20(%rdi) movl %r10d,24(%rdi) movl %r11d,28(%rdi) jb L$loop_avx movq 88(%rsp),%rsi vzeroupper movq -48(%rsi),%r15 movq -40(%rsi),%r14 movq -32(%rsi),%r13 movq -24(%rsi),%r12 movq -16(%rsi),%rbp movq -8(%rsi),%rbx leaq (%rsi),%rsp L$epilogue_avx: ret #endif ring-0.17.14/pregenerated/sha256-x86_64-nasm.asm000064400000000000000000002014601046102023000170070ustar 00000000000000; This file is generated from a similarly-named Perl script in the BoringSSL ; source tree. Do not edit by hand. %ifidn __OUTPUT_FORMAT__, win64 default rel %define XMMWORD %define YMMWORD %define ZMMWORD %define _CET_ENDBR %include "ring_core_generated/prefix_symbols_nasm.inc" section .text code align=64 global sha256_block_data_order_nohw ALIGN 16 sha256_block_data_order_nohw: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_sha256_block_data_order_nohw: mov rdi,rcx mov rsi,rdx mov rdx,r8 _CET_ENDBR mov rax,rsp push rbx push rbp push r12 push r13 push r14 push r15 shl rdx,4 sub rsp,16*4+4*8 lea rdx,[rdx*4+rsi] and rsp,-64 mov QWORD[((64+0))+rsp],rdi mov QWORD[((64+8))+rsp],rsi mov QWORD[((64+16))+rsp],rdx mov QWORD[88+rsp],rax $L$prologue: mov eax,DWORD[rdi] mov ebx,DWORD[4+rdi] mov ecx,DWORD[8+rdi] mov edx,DWORD[12+rdi] mov r8d,DWORD[16+rdi] mov r9d,DWORD[20+rdi] mov r10d,DWORD[24+rdi] mov r11d,DWORD[28+rdi] jmp NEAR $L$loop ALIGN 16 $L$loop: mov edi,ebx lea rbp,[K256] xor edi,ecx mov r12d,DWORD[rsi] mov r13d,r8d mov r14d,eax bswap r12d ror r13d,14 mov r15d,r9d xor r13d,r8d ror r14d,9 xor r15d,r10d mov DWORD[rsp],r12d xor r14d,eax and r15d,r8d ror r13d,5 add r12d,r11d xor r15d,r10d ror r14d,11 xor r13d,r8d add r12d,r15d mov r15d,eax add r12d,DWORD[rbp] xor r14d,eax xor r15d,ebx ror r13d,6 mov r11d,ebx and edi,r15d ror r14d,2 add r12d,r13d xor r11d,edi add edx,r12d add r11d,r12d lea rbp,[4+rbp] add r11d,r14d mov r12d,DWORD[4+rsi] mov r13d,edx mov r14d,r11d bswap r12d ror r13d,14 mov edi,r8d xor r13d,edx ror r14d,9 xor edi,r9d mov DWORD[4+rsp],r12d xor r14d,r11d and edi,edx ror r13d,5 add r12d,r10d xor edi,r9d ror r14d,11 xor r13d,edx add r12d,edi mov edi,r11d add r12d,DWORD[rbp] xor r14d,r11d xor edi,eax ror r13d,6 mov r10d,eax and r15d,edi ror r14d,2 add r12d,r13d xor r10d,r15d add ecx,r12d add r10d,r12d lea rbp,[4+rbp] add r10d,r14d mov r12d,DWORD[8+rsi] mov r13d,ecx mov r14d,r10d bswap r12d ror r13d,14 mov 
r15d,edx xor r13d,ecx ror r14d,9 xor r15d,r8d mov DWORD[8+rsp],r12d xor r14d,r10d and r15d,ecx ror r13d,5 add r12d,r9d xor r15d,r8d ror r14d,11 xor r13d,ecx add r12d,r15d mov r15d,r10d add r12d,DWORD[rbp] xor r14d,r10d xor r15d,r11d ror r13d,6 mov r9d,r11d and edi,r15d ror r14d,2 add r12d,r13d xor r9d,edi add ebx,r12d add r9d,r12d lea rbp,[4+rbp] add r9d,r14d mov r12d,DWORD[12+rsi] mov r13d,ebx mov r14d,r9d bswap r12d ror r13d,14 mov edi,ecx xor r13d,ebx ror r14d,9 xor edi,edx mov DWORD[12+rsp],r12d xor r14d,r9d and edi,ebx ror r13d,5 add r12d,r8d xor edi,edx ror r14d,11 xor r13d,ebx add r12d,edi mov edi,r9d add r12d,DWORD[rbp] xor r14d,r9d xor edi,r10d ror r13d,6 mov r8d,r10d and r15d,edi ror r14d,2 add r12d,r13d xor r8d,r15d add eax,r12d add r8d,r12d lea rbp,[20+rbp] add r8d,r14d mov r12d,DWORD[16+rsi] mov r13d,eax mov r14d,r8d bswap r12d ror r13d,14 mov r15d,ebx xor r13d,eax ror r14d,9 xor r15d,ecx mov DWORD[16+rsp],r12d xor r14d,r8d and r15d,eax ror r13d,5 add r12d,edx xor r15d,ecx ror r14d,11 xor r13d,eax add r12d,r15d mov r15d,r8d add r12d,DWORD[rbp] xor r14d,r8d xor r15d,r9d ror r13d,6 mov edx,r9d and edi,r15d ror r14d,2 add r12d,r13d xor edx,edi add r11d,r12d add edx,r12d lea rbp,[4+rbp] add edx,r14d mov r12d,DWORD[20+rsi] mov r13d,r11d mov r14d,edx bswap r12d ror r13d,14 mov edi,eax xor r13d,r11d ror r14d,9 xor edi,ebx mov DWORD[20+rsp],r12d xor r14d,edx and edi,r11d ror r13d,5 add r12d,ecx xor edi,ebx ror r14d,11 xor r13d,r11d add r12d,edi mov edi,edx add r12d,DWORD[rbp] xor r14d,edx xor edi,r8d ror r13d,6 mov ecx,r8d and r15d,edi ror r14d,2 add r12d,r13d xor ecx,r15d add r10d,r12d add ecx,r12d lea rbp,[4+rbp] add ecx,r14d mov r12d,DWORD[24+rsi] mov r13d,r10d mov r14d,ecx bswap r12d ror r13d,14 mov r15d,r11d xor r13d,r10d ror r14d,9 xor r15d,eax mov DWORD[24+rsp],r12d xor r14d,ecx and r15d,r10d ror r13d,5 add r12d,ebx xor r15d,eax ror r14d,11 xor r13d,r10d add r12d,r15d mov r15d,ecx add r12d,DWORD[rbp] xor r14d,ecx xor r15d,edx ror r13d,6 mov ebx,edx and edi,r15d ror r14d,2 add r12d,r13d xor ebx,edi add r9d,r12d add ebx,r12d lea rbp,[4+rbp] add ebx,r14d mov r12d,DWORD[28+rsi] mov r13d,r9d mov r14d,ebx bswap r12d ror r13d,14 mov edi,r10d xor r13d,r9d ror r14d,9 xor edi,r11d mov DWORD[28+rsp],r12d xor r14d,ebx and edi,r9d ror r13d,5 add r12d,eax xor edi,r11d ror r14d,11 xor r13d,r9d add r12d,edi mov edi,ebx add r12d,DWORD[rbp] xor r14d,ebx xor edi,ecx ror r13d,6 mov eax,ecx and r15d,edi ror r14d,2 add r12d,r13d xor eax,r15d add r8d,r12d add eax,r12d lea rbp,[20+rbp] add eax,r14d mov r12d,DWORD[32+rsi] mov r13d,r8d mov r14d,eax bswap r12d ror r13d,14 mov r15d,r9d xor r13d,r8d ror r14d,9 xor r15d,r10d mov DWORD[32+rsp],r12d xor r14d,eax and r15d,r8d ror r13d,5 add r12d,r11d xor r15d,r10d ror r14d,11 xor r13d,r8d add r12d,r15d mov r15d,eax add r12d,DWORD[rbp] xor r14d,eax xor r15d,ebx ror r13d,6 mov r11d,ebx and edi,r15d ror r14d,2 add r12d,r13d xor r11d,edi add edx,r12d add r11d,r12d lea rbp,[4+rbp] add r11d,r14d mov r12d,DWORD[36+rsi] mov r13d,edx mov r14d,r11d bswap r12d ror r13d,14 mov edi,r8d xor r13d,edx ror r14d,9 xor edi,r9d mov DWORD[36+rsp],r12d xor r14d,r11d and edi,edx ror r13d,5 add r12d,r10d xor edi,r9d ror r14d,11 xor r13d,edx add r12d,edi mov edi,r11d add r12d,DWORD[rbp] xor r14d,r11d xor edi,eax ror r13d,6 mov r10d,eax and r15d,edi ror r14d,2 add r12d,r13d xor r10d,r15d add ecx,r12d add r10d,r12d lea rbp,[4+rbp] add r10d,r14d mov r12d,DWORD[40+rsi] mov r13d,ecx mov r14d,r10d bswap r12d ror r13d,14 mov r15d,edx xor r13d,ecx ror r14d,9 xor r15d,r8d mov 
DWORD[40+rsp],r12d xor r14d,r10d and r15d,ecx ror r13d,5 add r12d,r9d xor r15d,r8d ror r14d,11 xor r13d,ecx add r12d,r15d mov r15d,r10d add r12d,DWORD[rbp] xor r14d,r10d xor r15d,r11d ror r13d,6 mov r9d,r11d and edi,r15d ror r14d,2 add r12d,r13d xor r9d,edi add ebx,r12d add r9d,r12d lea rbp,[4+rbp] add r9d,r14d mov r12d,DWORD[44+rsi] mov r13d,ebx mov r14d,r9d bswap r12d ror r13d,14 mov edi,ecx xor r13d,ebx ror r14d,9 xor edi,edx mov DWORD[44+rsp],r12d xor r14d,r9d and edi,ebx ror r13d,5 add r12d,r8d xor edi,edx ror r14d,11 xor r13d,ebx add r12d,edi mov edi,r9d add r12d,DWORD[rbp] xor r14d,r9d xor edi,r10d ror r13d,6 mov r8d,r10d and r15d,edi ror r14d,2 add r12d,r13d xor r8d,r15d add eax,r12d add r8d,r12d lea rbp,[20+rbp] add r8d,r14d mov r12d,DWORD[48+rsi] mov r13d,eax mov r14d,r8d bswap r12d ror r13d,14 mov r15d,ebx xor r13d,eax ror r14d,9 xor r15d,ecx mov DWORD[48+rsp],r12d xor r14d,r8d and r15d,eax ror r13d,5 add r12d,edx xor r15d,ecx ror r14d,11 xor r13d,eax add r12d,r15d mov r15d,r8d add r12d,DWORD[rbp] xor r14d,r8d xor r15d,r9d ror r13d,6 mov edx,r9d and edi,r15d ror r14d,2 add r12d,r13d xor edx,edi add r11d,r12d add edx,r12d lea rbp,[4+rbp] add edx,r14d mov r12d,DWORD[52+rsi] mov r13d,r11d mov r14d,edx bswap r12d ror r13d,14 mov edi,eax xor r13d,r11d ror r14d,9 xor edi,ebx mov DWORD[52+rsp],r12d xor r14d,edx and edi,r11d ror r13d,5 add r12d,ecx xor edi,ebx ror r14d,11 xor r13d,r11d add r12d,edi mov edi,edx add r12d,DWORD[rbp] xor r14d,edx xor edi,r8d ror r13d,6 mov ecx,r8d and r15d,edi ror r14d,2 add r12d,r13d xor ecx,r15d add r10d,r12d add ecx,r12d lea rbp,[4+rbp] add ecx,r14d mov r12d,DWORD[56+rsi] mov r13d,r10d mov r14d,ecx bswap r12d ror r13d,14 mov r15d,r11d xor r13d,r10d ror r14d,9 xor r15d,eax mov DWORD[56+rsp],r12d xor r14d,ecx and r15d,r10d ror r13d,5 add r12d,ebx xor r15d,eax ror r14d,11 xor r13d,r10d add r12d,r15d mov r15d,ecx add r12d,DWORD[rbp] xor r14d,ecx xor r15d,edx ror r13d,6 mov ebx,edx and edi,r15d ror r14d,2 add r12d,r13d xor ebx,edi add r9d,r12d add ebx,r12d lea rbp,[4+rbp] add ebx,r14d mov r12d,DWORD[60+rsi] mov r13d,r9d mov r14d,ebx bswap r12d ror r13d,14 mov edi,r10d xor r13d,r9d ror r14d,9 xor edi,r11d mov DWORD[60+rsp],r12d xor r14d,ebx and edi,r9d ror r13d,5 add r12d,eax xor edi,r11d ror r14d,11 xor r13d,r9d add r12d,edi mov edi,ebx add r12d,DWORD[rbp] xor r14d,ebx xor edi,ecx ror r13d,6 mov eax,ecx and r15d,edi ror r14d,2 add r12d,r13d xor eax,r15d add r8d,r12d add eax,r12d lea rbp,[20+rbp] jmp NEAR $L$rounds_16_xx ALIGN 16 $L$rounds_16_xx: mov r13d,DWORD[4+rsp] mov r15d,DWORD[56+rsp] mov r12d,r13d ror r13d,11 add eax,r14d mov r14d,r15d ror r15d,2 xor r13d,r12d shr r12d,3 ror r13d,7 xor r15d,r14d shr r14d,10 ror r15d,17 xor r12d,r13d xor r15d,r14d add r12d,DWORD[36+rsp] add r12d,DWORD[rsp] mov r13d,r8d add r12d,r15d mov r14d,eax ror r13d,14 mov r15d,r9d xor r13d,r8d ror r14d,9 xor r15d,r10d mov DWORD[rsp],r12d xor r14d,eax and r15d,r8d ror r13d,5 add r12d,r11d xor r15d,r10d ror r14d,11 xor r13d,r8d add r12d,r15d mov r15d,eax add r12d,DWORD[rbp] xor r14d,eax xor r15d,ebx ror r13d,6 mov r11d,ebx and edi,r15d ror r14d,2 add r12d,r13d xor r11d,edi add edx,r12d add r11d,r12d lea rbp,[4+rbp] mov r13d,DWORD[8+rsp] mov edi,DWORD[60+rsp] mov r12d,r13d ror r13d,11 add r11d,r14d mov r14d,edi ror edi,2 xor r13d,r12d shr r12d,3 ror r13d,7 xor edi,r14d shr r14d,10 ror edi,17 xor r12d,r13d xor edi,r14d add r12d,DWORD[40+rsp] add r12d,DWORD[4+rsp] mov r13d,edx add r12d,edi mov r14d,r11d ror r13d,14 mov edi,r8d xor r13d,edx ror r14d,9 xor edi,r9d mov DWORD[4+rsp],r12d 
xor r14d,r11d and edi,edx ror r13d,5 add r12d,r10d xor edi,r9d ror r14d,11 xor r13d,edx add r12d,edi mov edi,r11d add r12d,DWORD[rbp] xor r14d,r11d xor edi,eax ror r13d,6 mov r10d,eax and r15d,edi ror r14d,2 add r12d,r13d xor r10d,r15d add ecx,r12d add r10d,r12d lea rbp,[4+rbp] mov r13d,DWORD[12+rsp] mov r15d,DWORD[rsp] mov r12d,r13d ror r13d,11 add r10d,r14d mov r14d,r15d ror r15d,2 xor r13d,r12d shr r12d,3 ror r13d,7 xor r15d,r14d shr r14d,10 ror r15d,17 xor r12d,r13d xor r15d,r14d add r12d,DWORD[44+rsp] add r12d,DWORD[8+rsp] mov r13d,ecx add r12d,r15d mov r14d,r10d ror r13d,14 mov r15d,edx xor r13d,ecx ror r14d,9 xor r15d,r8d mov DWORD[8+rsp],r12d xor r14d,r10d and r15d,ecx ror r13d,5 add r12d,r9d xor r15d,r8d ror r14d,11 xor r13d,ecx add r12d,r15d mov r15d,r10d add r12d,DWORD[rbp] xor r14d,r10d xor r15d,r11d ror r13d,6 mov r9d,r11d and edi,r15d ror r14d,2 add r12d,r13d xor r9d,edi add ebx,r12d add r9d,r12d lea rbp,[4+rbp] mov r13d,DWORD[16+rsp] mov edi,DWORD[4+rsp] mov r12d,r13d ror r13d,11 add r9d,r14d mov r14d,edi ror edi,2 xor r13d,r12d shr r12d,3 ror r13d,7 xor edi,r14d shr r14d,10 ror edi,17 xor r12d,r13d xor edi,r14d add r12d,DWORD[48+rsp] add r12d,DWORD[12+rsp] mov r13d,ebx add r12d,edi mov r14d,r9d ror r13d,14 mov edi,ecx xor r13d,ebx ror r14d,9 xor edi,edx mov DWORD[12+rsp],r12d xor r14d,r9d and edi,ebx ror r13d,5 add r12d,r8d xor edi,edx ror r14d,11 xor r13d,ebx add r12d,edi mov edi,r9d add r12d,DWORD[rbp] xor r14d,r9d xor edi,r10d ror r13d,6 mov r8d,r10d and r15d,edi ror r14d,2 add r12d,r13d xor r8d,r15d add eax,r12d add r8d,r12d lea rbp,[20+rbp] mov r13d,DWORD[20+rsp] mov r15d,DWORD[8+rsp] mov r12d,r13d ror r13d,11 add r8d,r14d mov r14d,r15d ror r15d,2 xor r13d,r12d shr r12d,3 ror r13d,7 xor r15d,r14d shr r14d,10 ror r15d,17 xor r12d,r13d xor r15d,r14d add r12d,DWORD[52+rsp] add r12d,DWORD[16+rsp] mov r13d,eax add r12d,r15d mov r14d,r8d ror r13d,14 mov r15d,ebx xor r13d,eax ror r14d,9 xor r15d,ecx mov DWORD[16+rsp],r12d xor r14d,r8d and r15d,eax ror r13d,5 add r12d,edx xor r15d,ecx ror r14d,11 xor r13d,eax add r12d,r15d mov r15d,r8d add r12d,DWORD[rbp] xor r14d,r8d xor r15d,r9d ror r13d,6 mov edx,r9d and edi,r15d ror r14d,2 add r12d,r13d xor edx,edi add r11d,r12d add edx,r12d lea rbp,[4+rbp] mov r13d,DWORD[24+rsp] mov edi,DWORD[12+rsp] mov r12d,r13d ror r13d,11 add edx,r14d mov r14d,edi ror edi,2 xor r13d,r12d shr r12d,3 ror r13d,7 xor edi,r14d shr r14d,10 ror edi,17 xor r12d,r13d xor edi,r14d add r12d,DWORD[56+rsp] add r12d,DWORD[20+rsp] mov r13d,r11d add r12d,edi mov r14d,edx ror r13d,14 mov edi,eax xor r13d,r11d ror r14d,9 xor edi,ebx mov DWORD[20+rsp],r12d xor r14d,edx and edi,r11d ror r13d,5 add r12d,ecx xor edi,ebx ror r14d,11 xor r13d,r11d add r12d,edi mov edi,edx add r12d,DWORD[rbp] xor r14d,edx xor edi,r8d ror r13d,6 mov ecx,r8d and r15d,edi ror r14d,2 add r12d,r13d xor ecx,r15d add r10d,r12d add ecx,r12d lea rbp,[4+rbp] mov r13d,DWORD[28+rsp] mov r15d,DWORD[16+rsp] mov r12d,r13d ror r13d,11 add ecx,r14d mov r14d,r15d ror r15d,2 xor r13d,r12d shr r12d,3 ror r13d,7 xor r15d,r14d shr r14d,10 ror r15d,17 xor r12d,r13d xor r15d,r14d add r12d,DWORD[60+rsp] add r12d,DWORD[24+rsp] mov r13d,r10d add r12d,r15d mov r14d,ecx ror r13d,14 mov r15d,r11d xor r13d,r10d ror r14d,9 xor r15d,eax mov DWORD[24+rsp],r12d xor r14d,ecx and r15d,r10d ror r13d,5 add r12d,ebx xor r15d,eax ror r14d,11 xor r13d,r10d add r12d,r15d mov r15d,ecx add r12d,DWORD[rbp] xor r14d,ecx xor r15d,edx ror r13d,6 mov ebx,edx and edi,r15d ror r14d,2 add r12d,r13d xor ebx,edi add r9d,r12d add ebx,r12d lea 
rbp,[4+rbp] mov r13d,DWORD[32+rsp] mov edi,DWORD[20+rsp] mov r12d,r13d ror r13d,11 add ebx,r14d mov r14d,edi ror edi,2 xor r13d,r12d shr r12d,3 ror r13d,7 xor edi,r14d shr r14d,10 ror edi,17 xor r12d,r13d xor edi,r14d add r12d,DWORD[rsp] add r12d,DWORD[28+rsp] mov r13d,r9d add r12d,edi mov r14d,ebx ror r13d,14 mov edi,r10d xor r13d,r9d ror r14d,9 xor edi,r11d mov DWORD[28+rsp],r12d xor r14d,ebx and edi,r9d ror r13d,5 add r12d,eax xor edi,r11d ror r14d,11 xor r13d,r9d add r12d,edi mov edi,ebx add r12d,DWORD[rbp] xor r14d,ebx xor edi,ecx ror r13d,6 mov eax,ecx and r15d,edi ror r14d,2 add r12d,r13d xor eax,r15d add r8d,r12d add eax,r12d lea rbp,[20+rbp] mov r13d,DWORD[36+rsp] mov r15d,DWORD[24+rsp] mov r12d,r13d ror r13d,11 add eax,r14d mov r14d,r15d ror r15d,2 xor r13d,r12d shr r12d,3 ror r13d,7 xor r15d,r14d shr r14d,10 ror r15d,17 xor r12d,r13d xor r15d,r14d add r12d,DWORD[4+rsp] add r12d,DWORD[32+rsp] mov r13d,r8d add r12d,r15d mov r14d,eax ror r13d,14 mov r15d,r9d xor r13d,r8d ror r14d,9 xor r15d,r10d mov DWORD[32+rsp],r12d xor r14d,eax and r15d,r8d ror r13d,5 add r12d,r11d xor r15d,r10d ror r14d,11 xor r13d,r8d add r12d,r15d mov r15d,eax add r12d,DWORD[rbp] xor r14d,eax xor r15d,ebx ror r13d,6 mov r11d,ebx and edi,r15d ror r14d,2 add r12d,r13d xor r11d,edi add edx,r12d add r11d,r12d lea rbp,[4+rbp] mov r13d,DWORD[40+rsp] mov edi,DWORD[28+rsp] mov r12d,r13d ror r13d,11 add r11d,r14d mov r14d,edi ror edi,2 xor r13d,r12d shr r12d,3 ror r13d,7 xor edi,r14d shr r14d,10 ror edi,17 xor r12d,r13d xor edi,r14d add r12d,DWORD[8+rsp] add r12d,DWORD[36+rsp] mov r13d,edx add r12d,edi mov r14d,r11d ror r13d,14 mov edi,r8d xor r13d,edx ror r14d,9 xor edi,r9d mov DWORD[36+rsp],r12d xor r14d,r11d and edi,edx ror r13d,5 add r12d,r10d xor edi,r9d ror r14d,11 xor r13d,edx add r12d,edi mov edi,r11d add r12d,DWORD[rbp] xor r14d,r11d xor edi,eax ror r13d,6 mov r10d,eax and r15d,edi ror r14d,2 add r12d,r13d xor r10d,r15d add ecx,r12d add r10d,r12d lea rbp,[4+rbp] mov r13d,DWORD[44+rsp] mov r15d,DWORD[32+rsp] mov r12d,r13d ror r13d,11 add r10d,r14d mov r14d,r15d ror r15d,2 xor r13d,r12d shr r12d,3 ror r13d,7 xor r15d,r14d shr r14d,10 ror r15d,17 xor r12d,r13d xor r15d,r14d add r12d,DWORD[12+rsp] add r12d,DWORD[40+rsp] mov r13d,ecx add r12d,r15d mov r14d,r10d ror r13d,14 mov r15d,edx xor r13d,ecx ror r14d,9 xor r15d,r8d mov DWORD[40+rsp],r12d xor r14d,r10d and r15d,ecx ror r13d,5 add r12d,r9d xor r15d,r8d ror r14d,11 xor r13d,ecx add r12d,r15d mov r15d,r10d add r12d,DWORD[rbp] xor r14d,r10d xor r15d,r11d ror r13d,6 mov r9d,r11d and edi,r15d ror r14d,2 add r12d,r13d xor r9d,edi add ebx,r12d add r9d,r12d lea rbp,[4+rbp] mov r13d,DWORD[48+rsp] mov edi,DWORD[36+rsp] mov r12d,r13d ror r13d,11 add r9d,r14d mov r14d,edi ror edi,2 xor r13d,r12d shr r12d,3 ror r13d,7 xor edi,r14d shr r14d,10 ror edi,17 xor r12d,r13d xor edi,r14d add r12d,DWORD[16+rsp] add r12d,DWORD[44+rsp] mov r13d,ebx add r12d,edi mov r14d,r9d ror r13d,14 mov edi,ecx xor r13d,ebx ror r14d,9 xor edi,edx mov DWORD[44+rsp],r12d xor r14d,r9d and edi,ebx ror r13d,5 add r12d,r8d xor edi,edx ror r14d,11 xor r13d,ebx add r12d,edi mov edi,r9d add r12d,DWORD[rbp] xor r14d,r9d xor edi,r10d ror r13d,6 mov r8d,r10d and r15d,edi ror r14d,2 add r12d,r13d xor r8d,r15d add eax,r12d add r8d,r12d lea rbp,[20+rbp] mov r13d,DWORD[52+rsp] mov r15d,DWORD[40+rsp] mov r12d,r13d ror r13d,11 add r8d,r14d mov r14d,r15d ror r15d,2 xor r13d,r12d shr r12d,3 ror r13d,7 xor r15d,r14d shr r14d,10 ror r15d,17 xor r12d,r13d xor r15d,r14d add r12d,DWORD[20+rsp] add r12d,DWORD[48+rsp] mov 
r13d,eax add r12d,r15d mov r14d,r8d ror r13d,14 mov r15d,ebx xor r13d,eax ror r14d,9 xor r15d,ecx mov DWORD[48+rsp],r12d xor r14d,r8d and r15d,eax ror r13d,5 add r12d,edx xor r15d,ecx ror r14d,11 xor r13d,eax add r12d,r15d mov r15d,r8d add r12d,DWORD[rbp] xor r14d,r8d xor r15d,r9d ror r13d,6 mov edx,r9d and edi,r15d ror r14d,2 add r12d,r13d xor edx,edi add r11d,r12d add edx,r12d lea rbp,[4+rbp] mov r13d,DWORD[56+rsp] mov edi,DWORD[44+rsp] mov r12d,r13d ror r13d,11 add edx,r14d mov r14d,edi ror edi,2 xor r13d,r12d shr r12d,3 ror r13d,7 xor edi,r14d shr r14d,10 ror edi,17 xor r12d,r13d xor edi,r14d add r12d,DWORD[24+rsp] add r12d,DWORD[52+rsp] mov r13d,r11d add r12d,edi mov r14d,edx ror r13d,14 mov edi,eax xor r13d,r11d ror r14d,9 xor edi,ebx mov DWORD[52+rsp],r12d xor r14d,edx and edi,r11d ror r13d,5 add r12d,ecx xor edi,ebx ror r14d,11 xor r13d,r11d add r12d,edi mov edi,edx add r12d,DWORD[rbp] xor r14d,edx xor edi,r8d ror r13d,6 mov ecx,r8d and r15d,edi ror r14d,2 add r12d,r13d xor ecx,r15d add r10d,r12d add ecx,r12d lea rbp,[4+rbp] mov r13d,DWORD[60+rsp] mov r15d,DWORD[48+rsp] mov r12d,r13d ror r13d,11 add ecx,r14d mov r14d,r15d ror r15d,2 xor r13d,r12d shr r12d,3 ror r13d,7 xor r15d,r14d shr r14d,10 ror r15d,17 xor r12d,r13d xor r15d,r14d add r12d,DWORD[28+rsp] add r12d,DWORD[56+rsp] mov r13d,r10d add r12d,r15d mov r14d,ecx ror r13d,14 mov r15d,r11d xor r13d,r10d ror r14d,9 xor r15d,eax mov DWORD[56+rsp],r12d xor r14d,ecx and r15d,r10d ror r13d,5 add r12d,ebx xor r15d,eax ror r14d,11 xor r13d,r10d add r12d,r15d mov r15d,ecx add r12d,DWORD[rbp] xor r14d,ecx xor r15d,edx ror r13d,6 mov ebx,edx and edi,r15d ror r14d,2 add r12d,r13d xor ebx,edi add r9d,r12d add ebx,r12d lea rbp,[4+rbp] mov r13d,DWORD[rsp] mov edi,DWORD[52+rsp] mov r12d,r13d ror r13d,11 add ebx,r14d mov r14d,edi ror edi,2 xor r13d,r12d shr r12d,3 ror r13d,7 xor edi,r14d shr r14d,10 ror edi,17 xor r12d,r13d xor edi,r14d add r12d,DWORD[32+rsp] add r12d,DWORD[60+rsp] mov r13d,r9d add r12d,edi mov r14d,ebx ror r13d,14 mov edi,r10d xor r13d,r9d ror r14d,9 xor edi,r11d mov DWORD[60+rsp],r12d xor r14d,ebx and edi,r9d ror r13d,5 add r12d,eax xor edi,r11d ror r14d,11 xor r13d,r9d add r12d,edi mov edi,ebx add r12d,DWORD[rbp] xor r14d,ebx xor edi,ecx ror r13d,6 mov eax,ecx and r15d,edi ror r14d,2 add r12d,r13d xor eax,r15d add r8d,r12d add eax,r12d lea rbp,[20+rbp] cmp BYTE[3+rbp],0 jnz NEAR $L$rounds_16_xx mov rdi,QWORD[((64+0))+rsp] add eax,r14d lea rsi,[64+rsi] add eax,DWORD[rdi] add ebx,DWORD[4+rdi] add ecx,DWORD[8+rdi] add edx,DWORD[12+rdi] add r8d,DWORD[16+rdi] add r9d,DWORD[20+rdi] add r10d,DWORD[24+rdi] add r11d,DWORD[28+rdi] cmp rsi,QWORD[((64+16))+rsp] mov DWORD[rdi],eax mov DWORD[4+rdi],ebx mov DWORD[8+rdi],ecx mov DWORD[12+rdi],edx mov DWORD[16+rdi],r8d mov DWORD[20+rdi],r9d mov DWORD[24+rdi],r10d mov DWORD[28+rdi],r11d jb NEAR $L$loop mov rsi,QWORD[88+rsp] mov r15,QWORD[((-48))+rsi] mov r14,QWORD[((-40))+rsi] mov r13,QWORD[((-32))+rsi] mov r12,QWORD[((-24))+rsi] mov rbp,QWORD[((-16))+rsi] mov rbx,QWORD[((-8))+rsi] lea rsp,[rsi] $L$epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$SEH_end_sha256_block_data_order_nohw: section .rdata rdata align=8 ALIGN 64 K256: DD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 DD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 DD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 DD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 DD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 DD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 DD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 DD 
0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 DD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc DD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc DD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da DD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da DD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 DD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 DD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 DD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 DD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 DD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 DD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 DD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 DD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 DD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 DD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 DD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 DD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 DD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 DD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 DD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 DD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 DD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 DD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 DD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 DD 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f DD 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f DD 0x03020100,0x0b0a0908,0xffffffff,0xffffffff DD 0x03020100,0x0b0a0908,0xffffffff,0xffffffff DD 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 DD 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 DB 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 DB 110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54 DB 52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 DB 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 DB 111,114,103,62,0 section .text global sha256_block_data_order_hw ALIGN 64 sha256_block_data_order_hw: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_sha256_block_data_order_hw: mov rdi,rcx mov rsi,rdx mov rdx,r8 _CET_ENDBR lea rsp,[((-88))+rsp] movaps XMMWORD[(-8-80)+rax],xmm6 movaps XMMWORD[(-8-64)+rax],xmm7 movaps XMMWORD[(-8-48)+rax],xmm8 movaps XMMWORD[(-8-32)+rax],xmm9 movaps XMMWORD[(-8-16)+rax],xmm10 $L$prologue_shaext: lea rcx,[((K256+128))] movdqu xmm1,XMMWORD[rdi] movdqu xmm2,XMMWORD[16+rdi] movdqa xmm7,XMMWORD[((512-128))+rcx] pshufd xmm0,xmm1,0x1b pshufd xmm1,xmm1,0xb1 pshufd xmm2,xmm2,0x1b movdqa xmm8,xmm7 DB 102,15,58,15,202,8 punpcklqdq xmm2,xmm0 jmp NEAR $L$oop_shaext ALIGN 16 $L$oop_shaext: movdqu xmm3,XMMWORD[rsi] movdqu xmm4,XMMWORD[16+rsi] movdqu xmm5,XMMWORD[32+rsi] DB 102,15,56,0,223 movdqu xmm6,XMMWORD[48+rsi] movdqa xmm0,XMMWORD[((0-128))+rcx] paddd xmm0,xmm3 DB 102,15,56,0,231 movdqa xmm10,xmm2 DB 15,56,203,209 pshufd xmm0,xmm0,0x0e nop movdqa xmm9,xmm1 DB 15,56,203,202 movdqa xmm0,XMMWORD[((32-128))+rcx] paddd xmm0,xmm4 DB 102,15,56,0,239 DB 15,56,203,209 pshufd xmm0,xmm0,0x0e lea rsi,[64+rsi] DB 15,56,204,220 DB 15,56,203,202 movdqa xmm0,XMMWORD[((64-128))+rcx] paddd xmm0,xmm5 DB 102,15,56,0,247 DB 15,56,203,209 pshufd xmm0,xmm0,0x0e movdqa xmm7,xmm6 DB 102,15,58,15,253,4 nop paddd xmm3,xmm7 DB 15,56,204,229 DB 15,56,203,202 movdqa xmm0,XMMWORD[((96-128))+rcx] paddd xmm0,xmm6 DB 15,56,205,222 DB 15,56,203,209 pshufd xmm0,xmm0,0x0e movdqa xmm7,xmm3 DB 102,15,58,15,254,4 nop paddd xmm4,xmm7 DB 15,56,204,238 DB 15,56,203,202 movdqa xmm0,XMMWORD[((128-128))+rcx] paddd xmm0,xmm3 DB 15,56,205,227 DB 15,56,203,209 pshufd xmm0,xmm0,0x0e movdqa xmm7,xmm4 DB 102,15,58,15,251,4 nop paddd xmm5,xmm7 DB 15,56,204,243 DB 15,56,203,202 movdqa xmm0,XMMWORD[((160-128))+rcx] paddd xmm0,xmm4 
DB 15,56,205,236 DB 15,56,203,209 pshufd xmm0,xmm0,0x0e movdqa xmm7,xmm5 DB 102,15,58,15,252,4 nop paddd xmm6,xmm7 DB 15,56,204,220 DB 15,56,203,202 movdqa xmm0,XMMWORD[((192-128))+rcx] paddd xmm0,xmm5 DB 15,56,205,245 DB 15,56,203,209 pshufd xmm0,xmm0,0x0e movdqa xmm7,xmm6 DB 102,15,58,15,253,4 nop paddd xmm3,xmm7 DB 15,56,204,229 DB 15,56,203,202 movdqa xmm0,XMMWORD[((224-128))+rcx] paddd xmm0,xmm6 DB 15,56,205,222 DB 15,56,203,209 pshufd xmm0,xmm0,0x0e movdqa xmm7,xmm3 DB 102,15,58,15,254,4 nop paddd xmm4,xmm7 DB 15,56,204,238 DB 15,56,203,202 movdqa xmm0,XMMWORD[((256-128))+rcx] paddd xmm0,xmm3 DB 15,56,205,227 DB 15,56,203,209 pshufd xmm0,xmm0,0x0e movdqa xmm7,xmm4 DB 102,15,58,15,251,4 nop paddd xmm5,xmm7 DB 15,56,204,243 DB 15,56,203,202 movdqa xmm0,XMMWORD[((288-128))+rcx] paddd xmm0,xmm4 DB 15,56,205,236 DB 15,56,203,209 pshufd xmm0,xmm0,0x0e movdqa xmm7,xmm5 DB 102,15,58,15,252,4 nop paddd xmm6,xmm7 DB 15,56,204,220 DB 15,56,203,202 movdqa xmm0,XMMWORD[((320-128))+rcx] paddd xmm0,xmm5 DB 15,56,205,245 DB 15,56,203,209 pshufd xmm0,xmm0,0x0e movdqa xmm7,xmm6 DB 102,15,58,15,253,4 nop paddd xmm3,xmm7 DB 15,56,204,229 DB 15,56,203,202 movdqa xmm0,XMMWORD[((352-128))+rcx] paddd xmm0,xmm6 DB 15,56,205,222 DB 15,56,203,209 pshufd xmm0,xmm0,0x0e movdqa xmm7,xmm3 DB 102,15,58,15,254,4 nop paddd xmm4,xmm7 DB 15,56,204,238 DB 15,56,203,202 movdqa xmm0,XMMWORD[((384-128))+rcx] paddd xmm0,xmm3 DB 15,56,205,227 DB 15,56,203,209 pshufd xmm0,xmm0,0x0e movdqa xmm7,xmm4 DB 102,15,58,15,251,4 nop paddd xmm5,xmm7 DB 15,56,204,243 DB 15,56,203,202 movdqa xmm0,XMMWORD[((416-128))+rcx] paddd xmm0,xmm4 DB 15,56,205,236 DB 15,56,203,209 pshufd xmm0,xmm0,0x0e movdqa xmm7,xmm5 DB 102,15,58,15,252,4 DB 15,56,203,202 paddd xmm6,xmm7 movdqa xmm0,XMMWORD[((448-128))+rcx] paddd xmm0,xmm5 DB 15,56,203,209 pshufd xmm0,xmm0,0x0e DB 15,56,205,245 movdqa xmm7,xmm8 DB 15,56,203,202 movdqa xmm0,XMMWORD[((480-128))+rcx] paddd xmm0,xmm6 nop DB 15,56,203,209 pshufd xmm0,xmm0,0x0e dec rdx nop DB 15,56,203,202 paddd xmm2,xmm10 paddd xmm1,xmm9 jnz NEAR $L$oop_shaext pshufd xmm2,xmm2,0xb1 pshufd xmm7,xmm1,0x1b pshufd xmm1,xmm1,0xb1 punpckhqdq xmm1,xmm2 DB 102,15,58,15,215,8 movdqu XMMWORD[rdi],xmm1 movdqu XMMWORD[16+rdi],xmm2 movaps xmm6,XMMWORD[((-8-80))+rax] movaps xmm7,XMMWORD[((-8-64))+rax] movaps xmm8,XMMWORD[((-8-48))+rax] movaps xmm9,XMMWORD[((-8-32))+rax] movaps xmm10,XMMWORD[((-8-16))+rax] mov rsp,rax $L$epilogue_shaext: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$SEH_end_sha256_block_data_order_hw: global sha256_block_data_order_ssse3 ALIGN 64 sha256_block_data_order_ssse3: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_sha256_block_data_order_ssse3: mov rdi,rcx mov rsi,rdx mov rdx,r8 _CET_ENDBR mov rax,rsp push rbx push rbp push r12 push r13 push r14 push r15 shl rdx,4 sub rsp,160 lea rdx,[rdx*4+rsi] and rsp,-64 mov QWORD[((64+0))+rsp],rdi mov QWORD[((64+8))+rsp],rsi mov QWORD[((64+16))+rsp],rdx mov QWORD[88+rsp],rax movaps XMMWORD[(64+32)+rsp],xmm6 movaps XMMWORD[(64+48)+rsp],xmm7 movaps XMMWORD[(64+64)+rsp],xmm8 movaps XMMWORD[(64+80)+rsp],xmm9 $L$prologue_ssse3: mov eax,DWORD[rdi] mov ebx,DWORD[4+rdi] mov ecx,DWORD[8+rdi] mov edx,DWORD[12+rdi] mov r8d,DWORD[16+rdi] mov r9d,DWORD[20+rdi] mov r10d,DWORD[24+rdi] mov r11d,DWORD[28+rdi] jmp NEAR $L$loop_ssse3 ALIGN 16 $L$loop_ssse3: movdqa xmm7,XMMWORD[((K256+512))] movdqu xmm0,XMMWORD[rsi] movdqu xmm1,XMMWORD[16+rsi] movdqu xmm2,XMMWORD[32+rsi] DB 102,15,56,0,199 movdqu xmm3,XMMWORD[48+rsi] lea rbp,[K256] 
DB 102,15,56,0,207 movdqa xmm4,XMMWORD[rbp] movdqa xmm5,XMMWORD[32+rbp] DB 102,15,56,0,215 paddd xmm4,xmm0 movdqa xmm6,XMMWORD[64+rbp] DB 102,15,56,0,223 movdqa xmm7,XMMWORD[96+rbp] paddd xmm5,xmm1 paddd xmm6,xmm2 paddd xmm7,xmm3 movdqa XMMWORD[rsp],xmm4 mov r14d,eax movdqa XMMWORD[16+rsp],xmm5 mov edi,ebx movdqa XMMWORD[32+rsp],xmm6 xor edi,ecx movdqa XMMWORD[48+rsp],xmm7 mov r13d,r8d jmp NEAR $L$ssse3_00_47 ALIGN 16 $L$ssse3_00_47: sub rbp,-128 ror r13d,14 movdqa xmm4,xmm1 mov eax,r14d mov r12d,r9d movdqa xmm7,xmm3 ror r14d,9 xor r13d,r8d xor r12d,r10d ror r13d,5 xor r14d,eax DB 102,15,58,15,224,4 and r12d,r8d xor r13d,r8d DB 102,15,58,15,250,4 add r11d,DWORD[rsp] mov r15d,eax xor r12d,r10d ror r14d,11 movdqa xmm5,xmm4 xor r15d,ebx add r11d,r12d movdqa xmm6,xmm4 ror r13d,6 and edi,r15d psrld xmm4,3 xor r14d,eax add r11d,r13d xor edi,ebx paddd xmm0,xmm7 ror r14d,2 add edx,r11d psrld xmm6,7 add r11d,edi mov r13d,edx pshufd xmm7,xmm3,250 add r14d,r11d ror r13d,14 pslld xmm5,14 mov r11d,r14d mov r12d,r8d pxor xmm4,xmm6 ror r14d,9 xor r13d,edx xor r12d,r9d ror r13d,5 psrld xmm6,11 xor r14d,r11d pxor xmm4,xmm5 and r12d,edx xor r13d,edx pslld xmm5,11 add r10d,DWORD[4+rsp] mov edi,r11d pxor xmm4,xmm6 xor r12d,r9d ror r14d,11 movdqa xmm6,xmm7 xor edi,eax add r10d,r12d pxor xmm4,xmm5 ror r13d,6 and r15d,edi xor r14d,r11d psrld xmm7,10 add r10d,r13d xor r15d,eax paddd xmm0,xmm4 ror r14d,2 add ecx,r10d psrlq xmm6,17 add r10d,r15d mov r13d,ecx add r14d,r10d pxor xmm7,xmm6 ror r13d,14 mov r10d,r14d mov r12d,edx ror r14d,9 psrlq xmm6,2 xor r13d,ecx xor r12d,r8d pxor xmm7,xmm6 ror r13d,5 xor r14d,r10d and r12d,ecx pshufd xmm7,xmm7,128 xor r13d,ecx add r9d,DWORD[8+rsp] mov r15d,r10d psrldq xmm7,8 xor r12d,r8d ror r14d,11 xor r15d,r11d add r9d,r12d ror r13d,6 paddd xmm0,xmm7 and edi,r15d xor r14d,r10d add r9d,r13d pshufd xmm7,xmm0,80 xor edi,r11d ror r14d,2 add ebx,r9d movdqa xmm6,xmm7 add r9d,edi mov r13d,ebx psrld xmm7,10 add r14d,r9d ror r13d,14 psrlq xmm6,17 mov r9d,r14d mov r12d,ecx pxor xmm7,xmm6 ror r14d,9 xor r13d,ebx xor r12d,edx ror r13d,5 xor r14d,r9d psrlq xmm6,2 and r12d,ebx xor r13d,ebx add r8d,DWORD[12+rsp] pxor xmm7,xmm6 mov edi,r9d xor r12d,edx ror r14d,11 pshufd xmm7,xmm7,8 xor edi,r10d add r8d,r12d movdqa xmm6,XMMWORD[rbp] ror r13d,6 and r15d,edi pslldq xmm7,8 xor r14d,r9d add r8d,r13d xor r15d,r10d paddd xmm0,xmm7 ror r14d,2 add eax,r8d add r8d,r15d paddd xmm6,xmm0 mov r13d,eax add r14d,r8d movdqa XMMWORD[rsp],xmm6 ror r13d,14 movdqa xmm4,xmm2 mov r8d,r14d mov r12d,ebx movdqa xmm7,xmm0 ror r14d,9 xor r13d,eax xor r12d,ecx ror r13d,5 xor r14d,r8d DB 102,15,58,15,225,4 and r12d,eax xor r13d,eax DB 102,15,58,15,251,4 add edx,DWORD[16+rsp] mov r15d,r8d xor r12d,ecx ror r14d,11 movdqa xmm5,xmm4 xor r15d,r9d add edx,r12d movdqa xmm6,xmm4 ror r13d,6 and edi,r15d psrld xmm4,3 xor r14d,r8d add edx,r13d xor edi,r9d paddd xmm1,xmm7 ror r14d,2 add r11d,edx psrld xmm6,7 add edx,edi mov r13d,r11d pshufd xmm7,xmm0,250 add r14d,edx ror r13d,14 pslld xmm5,14 mov edx,r14d mov r12d,eax pxor xmm4,xmm6 ror r14d,9 xor r13d,r11d xor r12d,ebx ror r13d,5 psrld xmm6,11 xor r14d,edx pxor xmm4,xmm5 and r12d,r11d xor r13d,r11d pslld xmm5,11 add ecx,DWORD[20+rsp] mov edi,edx pxor xmm4,xmm6 xor r12d,ebx ror r14d,11 movdqa xmm6,xmm7 xor edi,r8d add ecx,r12d pxor xmm4,xmm5 ror r13d,6 and r15d,edi xor r14d,edx psrld xmm7,10 add ecx,r13d xor r15d,r8d paddd xmm1,xmm4 ror r14d,2 add r10d,ecx psrlq xmm6,17 add ecx,r15d mov r13d,r10d add r14d,ecx pxor xmm7,xmm6 ror r13d,14 mov ecx,r14d mov r12d,r11d ror r14d,9 psrlq xmm6,2 
xor r13d,r10d xor r12d,eax pxor xmm7,xmm6 ror r13d,5 xor r14d,ecx and r12d,r10d pshufd xmm7,xmm7,128 xor r13d,r10d add ebx,DWORD[24+rsp] mov r15d,ecx psrldq xmm7,8 xor r12d,eax ror r14d,11 xor r15d,edx add ebx,r12d ror r13d,6 paddd xmm1,xmm7 and edi,r15d xor r14d,ecx add ebx,r13d pshufd xmm7,xmm1,80 xor edi,edx ror r14d,2 add r9d,ebx movdqa xmm6,xmm7 add ebx,edi mov r13d,r9d psrld xmm7,10 add r14d,ebx ror r13d,14 psrlq xmm6,17 mov ebx,r14d mov r12d,r10d pxor xmm7,xmm6 ror r14d,9 xor r13d,r9d xor r12d,r11d ror r13d,5 xor r14d,ebx psrlq xmm6,2 and r12d,r9d xor r13d,r9d add eax,DWORD[28+rsp] pxor xmm7,xmm6 mov edi,ebx xor r12d,r11d ror r14d,11 pshufd xmm7,xmm7,8 xor edi,ecx add eax,r12d movdqa xmm6,XMMWORD[32+rbp] ror r13d,6 and r15d,edi pslldq xmm7,8 xor r14d,ebx add eax,r13d xor r15d,ecx paddd xmm1,xmm7 ror r14d,2 add r8d,eax add eax,r15d paddd xmm6,xmm1 mov r13d,r8d add r14d,eax movdqa XMMWORD[16+rsp],xmm6 ror r13d,14 movdqa xmm4,xmm3 mov eax,r14d mov r12d,r9d movdqa xmm7,xmm1 ror r14d,9 xor r13d,r8d xor r12d,r10d ror r13d,5 xor r14d,eax DB 102,15,58,15,226,4 and r12d,r8d xor r13d,r8d DB 102,15,58,15,248,4 add r11d,DWORD[32+rsp] mov r15d,eax xor r12d,r10d ror r14d,11 movdqa xmm5,xmm4 xor r15d,ebx add r11d,r12d movdqa xmm6,xmm4 ror r13d,6 and edi,r15d psrld xmm4,3 xor r14d,eax add r11d,r13d xor edi,ebx paddd xmm2,xmm7 ror r14d,2 add edx,r11d psrld xmm6,7 add r11d,edi mov r13d,edx pshufd xmm7,xmm1,250 add r14d,r11d ror r13d,14 pslld xmm5,14 mov r11d,r14d mov r12d,r8d pxor xmm4,xmm6 ror r14d,9 xor r13d,edx xor r12d,r9d ror r13d,5 psrld xmm6,11 xor r14d,r11d pxor xmm4,xmm5 and r12d,edx xor r13d,edx pslld xmm5,11 add r10d,DWORD[36+rsp] mov edi,r11d pxor xmm4,xmm6 xor r12d,r9d ror r14d,11 movdqa xmm6,xmm7 xor edi,eax add r10d,r12d pxor xmm4,xmm5 ror r13d,6 and r15d,edi xor r14d,r11d psrld xmm7,10 add r10d,r13d xor r15d,eax paddd xmm2,xmm4 ror r14d,2 add ecx,r10d psrlq xmm6,17 add r10d,r15d mov r13d,ecx add r14d,r10d pxor xmm7,xmm6 ror r13d,14 mov r10d,r14d mov r12d,edx ror r14d,9 psrlq xmm6,2 xor r13d,ecx xor r12d,r8d pxor xmm7,xmm6 ror r13d,5 xor r14d,r10d and r12d,ecx pshufd xmm7,xmm7,128 xor r13d,ecx add r9d,DWORD[40+rsp] mov r15d,r10d psrldq xmm7,8 xor r12d,r8d ror r14d,11 xor r15d,r11d add r9d,r12d ror r13d,6 paddd xmm2,xmm7 and edi,r15d xor r14d,r10d add r9d,r13d pshufd xmm7,xmm2,80 xor edi,r11d ror r14d,2 add ebx,r9d movdqa xmm6,xmm7 add r9d,edi mov r13d,ebx psrld xmm7,10 add r14d,r9d ror r13d,14 psrlq xmm6,17 mov r9d,r14d mov r12d,ecx pxor xmm7,xmm6 ror r14d,9 xor r13d,ebx xor r12d,edx ror r13d,5 xor r14d,r9d psrlq xmm6,2 and r12d,ebx xor r13d,ebx add r8d,DWORD[44+rsp] pxor xmm7,xmm6 mov edi,r9d xor r12d,edx ror r14d,11 pshufd xmm7,xmm7,8 xor edi,r10d add r8d,r12d movdqa xmm6,XMMWORD[64+rbp] ror r13d,6 and r15d,edi pslldq xmm7,8 xor r14d,r9d add r8d,r13d xor r15d,r10d paddd xmm2,xmm7 ror r14d,2 add eax,r8d add r8d,r15d paddd xmm6,xmm2 mov r13d,eax add r14d,r8d movdqa XMMWORD[32+rsp],xmm6 ror r13d,14 movdqa xmm4,xmm0 mov r8d,r14d mov r12d,ebx movdqa xmm7,xmm2 ror r14d,9 xor r13d,eax xor r12d,ecx ror r13d,5 xor r14d,r8d DB 102,15,58,15,227,4 and r12d,eax xor r13d,eax DB 102,15,58,15,249,4 add edx,DWORD[48+rsp] mov r15d,r8d xor r12d,ecx ror r14d,11 movdqa xmm5,xmm4 xor r15d,r9d add edx,r12d movdqa xmm6,xmm4 ror r13d,6 and edi,r15d psrld xmm4,3 xor r14d,r8d add edx,r13d xor edi,r9d paddd xmm3,xmm7 ror r14d,2 add r11d,edx psrld xmm6,7 add edx,edi mov r13d,r11d pshufd xmm7,xmm2,250 add r14d,edx ror r13d,14 pslld xmm5,14 mov edx,r14d mov r12d,eax pxor xmm4,xmm6 ror r14d,9 xor r13d,r11d xor 
r12d,ebx ror r13d,5 psrld xmm6,11 xor r14d,edx pxor xmm4,xmm5 and r12d,r11d xor r13d,r11d pslld xmm5,11 add ecx,DWORD[52+rsp] mov edi,edx pxor xmm4,xmm6 xor r12d,ebx ror r14d,11 movdqa xmm6,xmm7 xor edi,r8d add ecx,r12d pxor xmm4,xmm5 ror r13d,6 and r15d,edi xor r14d,edx psrld xmm7,10 add ecx,r13d xor r15d,r8d paddd xmm3,xmm4 ror r14d,2 add r10d,ecx psrlq xmm6,17 add ecx,r15d mov r13d,r10d add r14d,ecx pxor xmm7,xmm6 ror r13d,14 mov ecx,r14d mov r12d,r11d ror r14d,9 psrlq xmm6,2 xor r13d,r10d xor r12d,eax pxor xmm7,xmm6 ror r13d,5 xor r14d,ecx and r12d,r10d pshufd xmm7,xmm7,128 xor r13d,r10d add ebx,DWORD[56+rsp] mov r15d,ecx psrldq xmm7,8 xor r12d,eax ror r14d,11 xor r15d,edx add ebx,r12d ror r13d,6 paddd xmm3,xmm7 and edi,r15d xor r14d,ecx add ebx,r13d pshufd xmm7,xmm3,80 xor edi,edx ror r14d,2 add r9d,ebx movdqa xmm6,xmm7 add ebx,edi mov r13d,r9d psrld xmm7,10 add r14d,ebx ror r13d,14 psrlq xmm6,17 mov ebx,r14d mov r12d,r10d pxor xmm7,xmm6 ror r14d,9 xor r13d,r9d xor r12d,r11d ror r13d,5 xor r14d,ebx psrlq xmm6,2 and r12d,r9d xor r13d,r9d add eax,DWORD[60+rsp] pxor xmm7,xmm6 mov edi,ebx xor r12d,r11d ror r14d,11 pshufd xmm7,xmm7,8 xor edi,ecx add eax,r12d movdqa xmm6,XMMWORD[96+rbp] ror r13d,6 and r15d,edi pslldq xmm7,8 xor r14d,ebx add eax,r13d xor r15d,ecx paddd xmm3,xmm7 ror r14d,2 add r8d,eax add eax,r15d paddd xmm6,xmm3 mov r13d,r8d add r14d,eax movdqa XMMWORD[48+rsp],xmm6 cmp BYTE[131+rbp],0 jne NEAR $L$ssse3_00_47 ror r13d,14 mov eax,r14d mov r12d,r9d ror r14d,9 xor r13d,r8d xor r12d,r10d ror r13d,5 xor r14d,eax and r12d,r8d xor r13d,r8d add r11d,DWORD[rsp] mov r15d,eax xor r12d,r10d ror r14d,11 xor r15d,ebx add r11d,r12d ror r13d,6 and edi,r15d xor r14d,eax add r11d,r13d xor edi,ebx ror r14d,2 add edx,r11d add r11d,edi mov r13d,edx add r14d,r11d ror r13d,14 mov r11d,r14d mov r12d,r8d ror r14d,9 xor r13d,edx xor r12d,r9d ror r13d,5 xor r14d,r11d and r12d,edx xor r13d,edx add r10d,DWORD[4+rsp] mov edi,r11d xor r12d,r9d ror r14d,11 xor edi,eax add r10d,r12d ror r13d,6 and r15d,edi xor r14d,r11d add r10d,r13d xor r15d,eax ror r14d,2 add ecx,r10d add r10d,r15d mov r13d,ecx add r14d,r10d ror r13d,14 mov r10d,r14d mov r12d,edx ror r14d,9 xor r13d,ecx xor r12d,r8d ror r13d,5 xor r14d,r10d and r12d,ecx xor r13d,ecx add r9d,DWORD[8+rsp] mov r15d,r10d xor r12d,r8d ror r14d,11 xor r15d,r11d add r9d,r12d ror r13d,6 and edi,r15d xor r14d,r10d add r9d,r13d xor edi,r11d ror r14d,2 add ebx,r9d add r9d,edi mov r13d,ebx add r14d,r9d ror r13d,14 mov r9d,r14d mov r12d,ecx ror r14d,9 xor r13d,ebx xor r12d,edx ror r13d,5 xor r14d,r9d and r12d,ebx xor r13d,ebx add r8d,DWORD[12+rsp] mov edi,r9d xor r12d,edx ror r14d,11 xor edi,r10d add r8d,r12d ror r13d,6 and r15d,edi xor r14d,r9d add r8d,r13d xor r15d,r10d ror r14d,2 add eax,r8d add r8d,r15d mov r13d,eax add r14d,r8d ror r13d,14 mov r8d,r14d mov r12d,ebx ror r14d,9 xor r13d,eax xor r12d,ecx ror r13d,5 xor r14d,r8d and r12d,eax xor r13d,eax add edx,DWORD[16+rsp] mov r15d,r8d xor r12d,ecx ror r14d,11 xor r15d,r9d add edx,r12d ror r13d,6 and edi,r15d xor r14d,r8d add edx,r13d xor edi,r9d ror r14d,2 add r11d,edx add edx,edi mov r13d,r11d add r14d,edx ror r13d,14 mov edx,r14d mov r12d,eax ror r14d,9 xor r13d,r11d xor r12d,ebx ror r13d,5 xor r14d,edx and r12d,r11d xor r13d,r11d add ecx,DWORD[20+rsp] mov edi,edx xor r12d,ebx ror r14d,11 xor edi,r8d add ecx,r12d ror r13d,6 and r15d,edi xor r14d,edx add ecx,r13d xor r15d,r8d ror r14d,2 add r10d,ecx add ecx,r15d mov r13d,r10d add r14d,ecx ror r13d,14 mov ecx,r14d mov r12d,r11d ror r14d,9 xor r13d,r10d xor 
r12d,eax ror r13d,5 xor r14d,ecx and r12d,r10d xor r13d,r10d add ebx,DWORD[24+rsp] mov r15d,ecx xor r12d,eax ror r14d,11 xor r15d,edx add ebx,r12d ror r13d,6 and edi,r15d xor r14d,ecx add ebx,r13d xor edi,edx ror r14d,2 add r9d,ebx add ebx,edi mov r13d,r9d add r14d,ebx ror r13d,14 mov ebx,r14d mov r12d,r10d ror r14d,9 xor r13d,r9d xor r12d,r11d ror r13d,5 xor r14d,ebx and r12d,r9d xor r13d,r9d add eax,DWORD[28+rsp] mov edi,ebx xor r12d,r11d ror r14d,11 xor edi,ecx add eax,r12d ror r13d,6 and r15d,edi xor r14d,ebx add eax,r13d xor r15d,ecx ror r14d,2 add r8d,eax add eax,r15d mov r13d,r8d add r14d,eax ror r13d,14 mov eax,r14d mov r12d,r9d ror r14d,9 xor r13d,r8d xor r12d,r10d ror r13d,5 xor r14d,eax and r12d,r8d xor r13d,r8d add r11d,DWORD[32+rsp] mov r15d,eax xor r12d,r10d ror r14d,11 xor r15d,ebx add r11d,r12d ror r13d,6 and edi,r15d xor r14d,eax add r11d,r13d xor edi,ebx ror r14d,2 add edx,r11d add r11d,edi mov r13d,edx add r14d,r11d ror r13d,14 mov r11d,r14d mov r12d,r8d ror r14d,9 xor r13d,edx xor r12d,r9d ror r13d,5 xor r14d,r11d and r12d,edx xor r13d,edx add r10d,DWORD[36+rsp] mov edi,r11d xor r12d,r9d ror r14d,11 xor edi,eax add r10d,r12d ror r13d,6 and r15d,edi xor r14d,r11d add r10d,r13d xor r15d,eax ror r14d,2 add ecx,r10d add r10d,r15d mov r13d,ecx add r14d,r10d ror r13d,14 mov r10d,r14d mov r12d,edx ror r14d,9 xor r13d,ecx xor r12d,r8d ror r13d,5 xor r14d,r10d and r12d,ecx xor r13d,ecx add r9d,DWORD[40+rsp] mov r15d,r10d xor r12d,r8d ror r14d,11 xor r15d,r11d add r9d,r12d ror r13d,6 and edi,r15d xor r14d,r10d add r9d,r13d xor edi,r11d ror r14d,2 add ebx,r9d add r9d,edi mov r13d,ebx add r14d,r9d ror r13d,14 mov r9d,r14d mov r12d,ecx ror r14d,9 xor r13d,ebx xor r12d,edx ror r13d,5 xor r14d,r9d and r12d,ebx xor r13d,ebx add r8d,DWORD[44+rsp] mov edi,r9d xor r12d,edx ror r14d,11 xor edi,r10d add r8d,r12d ror r13d,6 and r15d,edi xor r14d,r9d add r8d,r13d xor r15d,r10d ror r14d,2 add eax,r8d add r8d,r15d mov r13d,eax add r14d,r8d ror r13d,14 mov r8d,r14d mov r12d,ebx ror r14d,9 xor r13d,eax xor r12d,ecx ror r13d,5 xor r14d,r8d and r12d,eax xor r13d,eax add edx,DWORD[48+rsp] mov r15d,r8d xor r12d,ecx ror r14d,11 xor r15d,r9d add edx,r12d ror r13d,6 and edi,r15d xor r14d,r8d add edx,r13d xor edi,r9d ror r14d,2 add r11d,edx add edx,edi mov r13d,r11d add r14d,edx ror r13d,14 mov edx,r14d mov r12d,eax ror r14d,9 xor r13d,r11d xor r12d,ebx ror r13d,5 xor r14d,edx and r12d,r11d xor r13d,r11d add ecx,DWORD[52+rsp] mov edi,edx xor r12d,ebx ror r14d,11 xor edi,r8d add ecx,r12d ror r13d,6 and r15d,edi xor r14d,edx add ecx,r13d xor r15d,r8d ror r14d,2 add r10d,ecx add ecx,r15d mov r13d,r10d add r14d,ecx ror r13d,14 mov ecx,r14d mov r12d,r11d ror r14d,9 xor r13d,r10d xor r12d,eax ror r13d,5 xor r14d,ecx and r12d,r10d xor r13d,r10d add ebx,DWORD[56+rsp] mov r15d,ecx xor r12d,eax ror r14d,11 xor r15d,edx add ebx,r12d ror r13d,6 and edi,r15d xor r14d,ecx add ebx,r13d xor edi,edx ror r14d,2 add r9d,ebx add ebx,edi mov r13d,r9d add r14d,ebx ror r13d,14 mov ebx,r14d mov r12d,r10d ror r14d,9 xor r13d,r9d xor r12d,r11d ror r13d,5 xor r14d,ebx and r12d,r9d xor r13d,r9d add eax,DWORD[60+rsp] mov edi,ebx xor r12d,r11d ror r14d,11 xor edi,ecx add eax,r12d ror r13d,6 and r15d,edi xor r14d,ebx add eax,r13d xor r15d,ecx ror r14d,2 add r8d,eax add eax,r15d mov r13d,r8d add r14d,eax mov rdi,QWORD[((64+0))+rsp] mov eax,r14d add eax,DWORD[rdi] lea rsi,[64+rsi] add ebx,DWORD[4+rdi] add ecx,DWORD[8+rdi] add edx,DWORD[12+rdi] add r8d,DWORD[16+rdi] add r9d,DWORD[20+rdi] add r10d,DWORD[24+rdi] add r11d,DWORD[28+rdi] cmp 
rsi,QWORD[((64+16))+rsp] mov DWORD[rdi],eax mov DWORD[4+rdi],ebx mov DWORD[8+rdi],ecx mov DWORD[12+rdi],edx mov DWORD[16+rdi],r8d mov DWORD[20+rdi],r9d mov DWORD[24+rdi],r10d mov DWORD[28+rdi],r11d jb NEAR $L$loop_ssse3 mov rsi,QWORD[88+rsp] movaps xmm6,XMMWORD[((64+32))+rsp] movaps xmm7,XMMWORD[((64+48))+rsp] movaps xmm8,XMMWORD[((64+64))+rsp] movaps xmm9,XMMWORD[((64+80))+rsp] mov r15,QWORD[((-48))+rsi] mov r14,QWORD[((-40))+rsi] mov r13,QWORD[((-32))+rsi] mov r12,QWORD[((-24))+rsi] mov rbp,QWORD[((-16))+rsi] mov rbx,QWORD[((-8))+rsi] lea rsp,[rsi] $L$epilogue_ssse3: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$SEH_end_sha256_block_data_order_ssse3: global sha256_block_data_order_avx ALIGN 64 sha256_block_data_order_avx: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_sha256_block_data_order_avx: mov rdi,rcx mov rsi,rdx mov rdx,r8 _CET_ENDBR mov rax,rsp push rbx push rbp push r12 push r13 push r14 push r15 shl rdx,4 sub rsp,160 lea rdx,[rdx*4+rsi] and rsp,-64 mov QWORD[((64+0))+rsp],rdi mov QWORD[((64+8))+rsp],rsi mov QWORD[((64+16))+rsp],rdx mov QWORD[88+rsp],rax movaps XMMWORD[(64+32)+rsp],xmm6 movaps XMMWORD[(64+48)+rsp],xmm7 movaps XMMWORD[(64+64)+rsp],xmm8 movaps XMMWORD[(64+80)+rsp],xmm9 $L$prologue_avx: vzeroupper mov eax,DWORD[rdi] mov ebx,DWORD[4+rdi] mov ecx,DWORD[8+rdi] mov edx,DWORD[12+rdi] mov r8d,DWORD[16+rdi] mov r9d,DWORD[20+rdi] mov r10d,DWORD[24+rdi] mov r11d,DWORD[28+rdi] vmovdqa xmm8,XMMWORD[((K256+512+32))] vmovdqa xmm9,XMMWORD[((K256+512+64))] jmp NEAR $L$loop_avx ALIGN 16 $L$loop_avx: vmovdqa xmm7,XMMWORD[((K256+512))] vmovdqu xmm0,XMMWORD[rsi] vmovdqu xmm1,XMMWORD[16+rsi] vmovdqu xmm2,XMMWORD[32+rsi] vmovdqu xmm3,XMMWORD[48+rsi] vpshufb xmm0,xmm0,xmm7 lea rbp,[K256] vpshufb xmm1,xmm1,xmm7 vpshufb xmm2,xmm2,xmm7 vpaddd xmm4,xmm0,XMMWORD[rbp] vpshufb xmm3,xmm3,xmm7 vpaddd xmm5,xmm1,XMMWORD[32+rbp] vpaddd xmm6,xmm2,XMMWORD[64+rbp] vpaddd xmm7,xmm3,XMMWORD[96+rbp] vmovdqa XMMWORD[rsp],xmm4 mov r14d,eax vmovdqa XMMWORD[16+rsp],xmm5 mov edi,ebx vmovdqa XMMWORD[32+rsp],xmm6 xor edi,ecx vmovdqa XMMWORD[48+rsp],xmm7 mov r13d,r8d jmp NEAR $L$avx_00_47 ALIGN 16 $L$avx_00_47: sub rbp,-128 vpalignr xmm4,xmm1,xmm0,4 shrd r13d,r13d,14 mov eax,r14d mov r12d,r9d vpalignr xmm7,xmm3,xmm2,4 shrd r14d,r14d,9 xor r13d,r8d xor r12d,r10d vpsrld xmm6,xmm4,7 shrd r13d,r13d,5 xor r14d,eax and r12d,r8d vpaddd xmm0,xmm0,xmm7 xor r13d,r8d add r11d,DWORD[rsp] mov r15d,eax vpsrld xmm7,xmm4,3 xor r12d,r10d shrd r14d,r14d,11 xor r15d,ebx vpslld xmm5,xmm4,14 add r11d,r12d shrd r13d,r13d,6 and edi,r15d vpxor xmm4,xmm7,xmm6 xor r14d,eax add r11d,r13d xor edi,ebx vpshufd xmm7,xmm3,250 shrd r14d,r14d,2 add edx,r11d add r11d,edi vpsrld xmm6,xmm6,11 mov r13d,edx add r14d,r11d shrd r13d,r13d,14 vpxor xmm4,xmm4,xmm5 mov r11d,r14d mov r12d,r8d shrd r14d,r14d,9 vpslld xmm5,xmm5,11 xor r13d,edx xor r12d,r9d shrd r13d,r13d,5 vpxor xmm4,xmm4,xmm6 xor r14d,r11d and r12d,edx xor r13d,edx vpsrld xmm6,xmm7,10 add r10d,DWORD[4+rsp] mov edi,r11d xor r12d,r9d vpxor xmm4,xmm4,xmm5 shrd r14d,r14d,11 xor edi,eax add r10d,r12d vpsrlq xmm7,xmm7,17 shrd r13d,r13d,6 and r15d,edi xor r14d,r11d vpaddd xmm0,xmm0,xmm4 add r10d,r13d xor r15d,eax shrd r14d,r14d,2 vpxor xmm6,xmm6,xmm7 add ecx,r10d add r10d,r15d mov r13d,ecx vpsrlq xmm7,xmm7,2 add r14d,r10d shrd r13d,r13d,14 mov r10d,r14d vpxor xmm6,xmm6,xmm7 mov r12d,edx shrd r14d,r14d,9 xor r13d,ecx vpshufb xmm6,xmm6,xmm8 xor r12d,r8d shrd r13d,r13d,5 xor r14d,r10d vpaddd xmm0,xmm0,xmm6 and r12d,ecx xor r13d,ecx add 
r9d,DWORD[8+rsp] vpshufd xmm7,xmm0,80 mov r15d,r10d xor r12d,r8d shrd r14d,r14d,11 vpsrld xmm6,xmm7,10 xor r15d,r11d add r9d,r12d shrd r13d,r13d,6 vpsrlq xmm7,xmm7,17 and edi,r15d xor r14d,r10d add r9d,r13d vpxor xmm6,xmm6,xmm7 xor edi,r11d shrd r14d,r14d,2 add ebx,r9d vpsrlq xmm7,xmm7,2 add r9d,edi mov r13d,ebx add r14d,r9d vpxor xmm6,xmm6,xmm7 shrd r13d,r13d,14 mov r9d,r14d mov r12d,ecx vpshufb xmm6,xmm6,xmm9 shrd r14d,r14d,9 xor r13d,ebx xor r12d,edx vpaddd xmm0,xmm0,xmm6 shrd r13d,r13d,5 xor r14d,r9d and r12d,ebx vpaddd xmm6,xmm0,XMMWORD[rbp] xor r13d,ebx add r8d,DWORD[12+rsp] mov edi,r9d xor r12d,edx shrd r14d,r14d,11 xor edi,r10d add r8d,r12d shrd r13d,r13d,6 and r15d,edi xor r14d,r9d add r8d,r13d xor r15d,r10d shrd r14d,r14d,2 add eax,r8d add r8d,r15d mov r13d,eax add r14d,r8d vmovdqa XMMWORD[rsp],xmm6 vpalignr xmm4,xmm2,xmm1,4 shrd r13d,r13d,14 mov r8d,r14d mov r12d,ebx vpalignr xmm7,xmm0,xmm3,4 shrd r14d,r14d,9 xor r13d,eax xor r12d,ecx vpsrld xmm6,xmm4,7 shrd r13d,r13d,5 xor r14d,r8d and r12d,eax vpaddd xmm1,xmm1,xmm7 xor r13d,eax add edx,DWORD[16+rsp] mov r15d,r8d vpsrld xmm7,xmm4,3 xor r12d,ecx shrd r14d,r14d,11 xor r15d,r9d vpslld xmm5,xmm4,14 add edx,r12d shrd r13d,r13d,6 and edi,r15d vpxor xmm4,xmm7,xmm6 xor r14d,r8d add edx,r13d xor edi,r9d vpshufd xmm7,xmm0,250 shrd r14d,r14d,2 add r11d,edx add edx,edi vpsrld xmm6,xmm6,11 mov r13d,r11d add r14d,edx shrd r13d,r13d,14 vpxor xmm4,xmm4,xmm5 mov edx,r14d mov r12d,eax shrd r14d,r14d,9 vpslld xmm5,xmm5,11 xor r13d,r11d xor r12d,ebx shrd r13d,r13d,5 vpxor xmm4,xmm4,xmm6 xor r14d,edx and r12d,r11d xor r13d,r11d vpsrld xmm6,xmm7,10 add ecx,DWORD[20+rsp] mov edi,edx xor r12d,ebx vpxor xmm4,xmm4,xmm5 shrd r14d,r14d,11 xor edi,r8d add ecx,r12d vpsrlq xmm7,xmm7,17 shrd r13d,r13d,6 and r15d,edi xor r14d,edx vpaddd xmm1,xmm1,xmm4 add ecx,r13d xor r15d,r8d shrd r14d,r14d,2 vpxor xmm6,xmm6,xmm7 add r10d,ecx add ecx,r15d mov r13d,r10d vpsrlq xmm7,xmm7,2 add r14d,ecx shrd r13d,r13d,14 mov ecx,r14d vpxor xmm6,xmm6,xmm7 mov r12d,r11d shrd r14d,r14d,9 xor r13d,r10d vpshufb xmm6,xmm6,xmm8 xor r12d,eax shrd r13d,r13d,5 xor r14d,ecx vpaddd xmm1,xmm1,xmm6 and r12d,r10d xor r13d,r10d add ebx,DWORD[24+rsp] vpshufd xmm7,xmm1,80 mov r15d,ecx xor r12d,eax shrd r14d,r14d,11 vpsrld xmm6,xmm7,10 xor r15d,edx add ebx,r12d shrd r13d,r13d,6 vpsrlq xmm7,xmm7,17 and edi,r15d xor r14d,ecx add ebx,r13d vpxor xmm6,xmm6,xmm7 xor edi,edx shrd r14d,r14d,2 add r9d,ebx vpsrlq xmm7,xmm7,2 add ebx,edi mov r13d,r9d add r14d,ebx vpxor xmm6,xmm6,xmm7 shrd r13d,r13d,14 mov ebx,r14d mov r12d,r10d vpshufb xmm6,xmm6,xmm9 shrd r14d,r14d,9 xor r13d,r9d xor r12d,r11d vpaddd xmm1,xmm1,xmm6 shrd r13d,r13d,5 xor r14d,ebx and r12d,r9d vpaddd xmm6,xmm1,XMMWORD[32+rbp] xor r13d,r9d add eax,DWORD[28+rsp] mov edi,ebx xor r12d,r11d shrd r14d,r14d,11 xor edi,ecx add eax,r12d shrd r13d,r13d,6 and r15d,edi xor r14d,ebx add eax,r13d xor r15d,ecx shrd r14d,r14d,2 add r8d,eax add eax,r15d mov r13d,r8d add r14d,eax vmovdqa XMMWORD[16+rsp],xmm6 vpalignr xmm4,xmm3,xmm2,4 shrd r13d,r13d,14 mov eax,r14d mov r12d,r9d vpalignr xmm7,xmm1,xmm0,4 shrd r14d,r14d,9 xor r13d,r8d xor r12d,r10d vpsrld xmm6,xmm4,7 shrd r13d,r13d,5 xor r14d,eax and r12d,r8d vpaddd xmm2,xmm2,xmm7 xor r13d,r8d add r11d,DWORD[32+rsp] mov r15d,eax vpsrld xmm7,xmm4,3 xor r12d,r10d shrd r14d,r14d,11 xor r15d,ebx vpslld xmm5,xmm4,14 add r11d,r12d shrd r13d,r13d,6 and edi,r15d vpxor xmm4,xmm7,xmm6 xor r14d,eax add r11d,r13d xor edi,ebx vpshufd xmm7,xmm1,250 shrd r14d,r14d,2 add edx,r11d add r11d,edi vpsrld xmm6,xmm6,11 mov r13d,edx add 
r14d,r11d shrd r13d,r13d,14 vpxor xmm4,xmm4,xmm5 mov r11d,r14d mov r12d,r8d shrd r14d,r14d,9 vpslld xmm5,xmm5,11 xor r13d,edx xor r12d,r9d shrd r13d,r13d,5 vpxor xmm4,xmm4,xmm6 xor r14d,r11d and r12d,edx xor r13d,edx vpsrld xmm6,xmm7,10 add r10d,DWORD[36+rsp] mov edi,r11d xor r12d,r9d vpxor xmm4,xmm4,xmm5 shrd r14d,r14d,11 xor edi,eax add r10d,r12d vpsrlq xmm7,xmm7,17 shrd r13d,r13d,6 and r15d,edi xor r14d,r11d vpaddd xmm2,xmm2,xmm4 add r10d,r13d xor r15d,eax shrd r14d,r14d,2 vpxor xmm6,xmm6,xmm7 add ecx,r10d add r10d,r15d mov r13d,ecx vpsrlq xmm7,xmm7,2 add r14d,r10d shrd r13d,r13d,14 mov r10d,r14d vpxor xmm6,xmm6,xmm7 mov r12d,edx shrd r14d,r14d,9 xor r13d,ecx vpshufb xmm6,xmm6,xmm8 xor r12d,r8d shrd r13d,r13d,5 xor r14d,r10d vpaddd xmm2,xmm2,xmm6 and r12d,ecx xor r13d,ecx add r9d,DWORD[40+rsp] vpshufd xmm7,xmm2,80 mov r15d,r10d xor r12d,r8d shrd r14d,r14d,11 vpsrld xmm6,xmm7,10 xor r15d,r11d add r9d,r12d shrd r13d,r13d,6 vpsrlq xmm7,xmm7,17 and edi,r15d xor r14d,r10d add r9d,r13d vpxor xmm6,xmm6,xmm7 xor edi,r11d shrd r14d,r14d,2 add ebx,r9d vpsrlq xmm7,xmm7,2 add r9d,edi mov r13d,ebx add r14d,r9d vpxor xmm6,xmm6,xmm7 shrd r13d,r13d,14 mov r9d,r14d mov r12d,ecx vpshufb xmm6,xmm6,xmm9 shrd r14d,r14d,9 xor r13d,ebx xor r12d,edx vpaddd xmm2,xmm2,xmm6 shrd r13d,r13d,5 xor r14d,r9d and r12d,ebx vpaddd xmm6,xmm2,XMMWORD[64+rbp] xor r13d,ebx add r8d,DWORD[44+rsp] mov edi,r9d xor r12d,edx shrd r14d,r14d,11 xor edi,r10d add r8d,r12d shrd r13d,r13d,6 and r15d,edi xor r14d,r9d add r8d,r13d xor r15d,r10d shrd r14d,r14d,2 add eax,r8d add r8d,r15d mov r13d,eax add r14d,r8d vmovdqa XMMWORD[32+rsp],xmm6 vpalignr xmm4,xmm0,xmm3,4 shrd r13d,r13d,14 mov r8d,r14d mov r12d,ebx vpalignr xmm7,xmm2,xmm1,4 shrd r14d,r14d,9 xor r13d,eax xor r12d,ecx vpsrld xmm6,xmm4,7 shrd r13d,r13d,5 xor r14d,r8d and r12d,eax vpaddd xmm3,xmm3,xmm7 xor r13d,eax add edx,DWORD[48+rsp] mov r15d,r8d vpsrld xmm7,xmm4,3 xor r12d,ecx shrd r14d,r14d,11 xor r15d,r9d vpslld xmm5,xmm4,14 add edx,r12d shrd r13d,r13d,6 and edi,r15d vpxor xmm4,xmm7,xmm6 xor r14d,r8d add edx,r13d xor edi,r9d vpshufd xmm7,xmm2,250 shrd r14d,r14d,2 add r11d,edx add edx,edi vpsrld xmm6,xmm6,11 mov r13d,r11d add r14d,edx shrd r13d,r13d,14 vpxor xmm4,xmm4,xmm5 mov edx,r14d mov r12d,eax shrd r14d,r14d,9 vpslld xmm5,xmm5,11 xor r13d,r11d xor r12d,ebx shrd r13d,r13d,5 vpxor xmm4,xmm4,xmm6 xor r14d,edx and r12d,r11d xor r13d,r11d vpsrld xmm6,xmm7,10 add ecx,DWORD[52+rsp] mov edi,edx xor r12d,ebx vpxor xmm4,xmm4,xmm5 shrd r14d,r14d,11 xor edi,r8d add ecx,r12d vpsrlq xmm7,xmm7,17 shrd r13d,r13d,6 and r15d,edi xor r14d,edx vpaddd xmm3,xmm3,xmm4 add ecx,r13d xor r15d,r8d shrd r14d,r14d,2 vpxor xmm6,xmm6,xmm7 add r10d,ecx add ecx,r15d mov r13d,r10d vpsrlq xmm7,xmm7,2 add r14d,ecx shrd r13d,r13d,14 mov ecx,r14d vpxor xmm6,xmm6,xmm7 mov r12d,r11d shrd r14d,r14d,9 xor r13d,r10d vpshufb xmm6,xmm6,xmm8 xor r12d,eax shrd r13d,r13d,5 xor r14d,ecx vpaddd xmm3,xmm3,xmm6 and r12d,r10d xor r13d,r10d add ebx,DWORD[56+rsp] vpshufd xmm7,xmm3,80 mov r15d,ecx xor r12d,eax shrd r14d,r14d,11 vpsrld xmm6,xmm7,10 xor r15d,edx add ebx,r12d shrd r13d,r13d,6 vpsrlq xmm7,xmm7,17 and edi,r15d xor r14d,ecx add ebx,r13d vpxor xmm6,xmm6,xmm7 xor edi,edx shrd r14d,r14d,2 add r9d,ebx vpsrlq xmm7,xmm7,2 add ebx,edi mov r13d,r9d add r14d,ebx vpxor xmm6,xmm6,xmm7 shrd r13d,r13d,14 mov ebx,r14d mov r12d,r10d vpshufb xmm6,xmm6,xmm9 shrd r14d,r14d,9 xor r13d,r9d xor r12d,r11d vpaddd xmm3,xmm3,xmm6 shrd r13d,r13d,5 xor r14d,ebx and r12d,r9d vpaddd xmm6,xmm3,XMMWORD[96+rbp] xor r13d,r9d add eax,DWORD[60+rsp] 
mov edi,ebx xor r12d,r11d shrd r14d,r14d,11 xor edi,ecx add eax,r12d shrd r13d,r13d,6 and r15d,edi xor r14d,ebx add eax,r13d xor r15d,ecx shrd r14d,r14d,2 add r8d,eax add eax,r15d mov r13d,r8d add r14d,eax vmovdqa XMMWORD[48+rsp],xmm6 cmp BYTE[131+rbp],0 jne NEAR $L$avx_00_47 shrd r13d,r13d,14 mov eax,r14d mov r12d,r9d shrd r14d,r14d,9 xor r13d,r8d xor r12d,r10d shrd r13d,r13d,5 xor r14d,eax and r12d,r8d xor r13d,r8d add r11d,DWORD[rsp] mov r15d,eax xor r12d,r10d shrd r14d,r14d,11 xor r15d,ebx add r11d,r12d shrd r13d,r13d,6 and edi,r15d xor r14d,eax add r11d,r13d xor edi,ebx shrd r14d,r14d,2 add edx,r11d add r11d,edi mov r13d,edx add r14d,r11d shrd r13d,r13d,14 mov r11d,r14d mov r12d,r8d shrd r14d,r14d,9 xor r13d,edx xor r12d,r9d shrd r13d,r13d,5 xor r14d,r11d and r12d,edx xor r13d,edx add r10d,DWORD[4+rsp] mov edi,r11d xor r12d,r9d shrd r14d,r14d,11 xor edi,eax add r10d,r12d shrd r13d,r13d,6 and r15d,edi xor r14d,r11d add r10d,r13d xor r15d,eax shrd r14d,r14d,2 add ecx,r10d add r10d,r15d mov r13d,ecx add r14d,r10d shrd r13d,r13d,14 mov r10d,r14d mov r12d,edx shrd r14d,r14d,9 xor r13d,ecx xor r12d,r8d shrd r13d,r13d,5 xor r14d,r10d and r12d,ecx xor r13d,ecx add r9d,DWORD[8+rsp] mov r15d,r10d xor r12d,r8d shrd r14d,r14d,11 xor r15d,r11d add r9d,r12d shrd r13d,r13d,6 and edi,r15d xor r14d,r10d add r9d,r13d xor edi,r11d shrd r14d,r14d,2 add ebx,r9d add r9d,edi mov r13d,ebx add r14d,r9d shrd r13d,r13d,14 mov r9d,r14d mov r12d,ecx shrd r14d,r14d,9 xor r13d,ebx xor r12d,edx shrd r13d,r13d,5 xor r14d,r9d and r12d,ebx xor r13d,ebx add r8d,DWORD[12+rsp] mov edi,r9d xor r12d,edx shrd r14d,r14d,11 xor edi,r10d add r8d,r12d shrd r13d,r13d,6 and r15d,edi xor r14d,r9d add r8d,r13d xor r15d,r10d shrd r14d,r14d,2 add eax,r8d add r8d,r15d mov r13d,eax add r14d,r8d shrd r13d,r13d,14 mov r8d,r14d mov r12d,ebx shrd r14d,r14d,9 xor r13d,eax xor r12d,ecx shrd r13d,r13d,5 xor r14d,r8d and r12d,eax xor r13d,eax add edx,DWORD[16+rsp] mov r15d,r8d xor r12d,ecx shrd r14d,r14d,11 xor r15d,r9d add edx,r12d shrd r13d,r13d,6 and edi,r15d xor r14d,r8d add edx,r13d xor edi,r9d shrd r14d,r14d,2 add r11d,edx add edx,edi mov r13d,r11d add r14d,edx shrd r13d,r13d,14 mov edx,r14d mov r12d,eax shrd r14d,r14d,9 xor r13d,r11d xor r12d,ebx shrd r13d,r13d,5 xor r14d,edx and r12d,r11d xor r13d,r11d add ecx,DWORD[20+rsp] mov edi,edx xor r12d,ebx shrd r14d,r14d,11 xor edi,r8d add ecx,r12d shrd r13d,r13d,6 and r15d,edi xor r14d,edx add ecx,r13d xor r15d,r8d shrd r14d,r14d,2 add r10d,ecx add ecx,r15d mov r13d,r10d add r14d,ecx shrd r13d,r13d,14 mov ecx,r14d mov r12d,r11d shrd r14d,r14d,9 xor r13d,r10d xor r12d,eax shrd r13d,r13d,5 xor r14d,ecx and r12d,r10d xor r13d,r10d add ebx,DWORD[24+rsp] mov r15d,ecx xor r12d,eax shrd r14d,r14d,11 xor r15d,edx add ebx,r12d shrd r13d,r13d,6 and edi,r15d xor r14d,ecx add ebx,r13d xor edi,edx shrd r14d,r14d,2 add r9d,ebx add ebx,edi mov r13d,r9d add r14d,ebx shrd r13d,r13d,14 mov ebx,r14d mov r12d,r10d shrd r14d,r14d,9 xor r13d,r9d xor r12d,r11d shrd r13d,r13d,5 xor r14d,ebx and r12d,r9d xor r13d,r9d add eax,DWORD[28+rsp] mov edi,ebx xor r12d,r11d shrd r14d,r14d,11 xor edi,ecx add eax,r12d shrd r13d,r13d,6 and r15d,edi xor r14d,ebx add eax,r13d xor r15d,ecx shrd r14d,r14d,2 add r8d,eax add eax,r15d mov r13d,r8d add r14d,eax shrd r13d,r13d,14 mov eax,r14d mov r12d,r9d shrd r14d,r14d,9 xor r13d,r8d xor r12d,r10d shrd r13d,r13d,5 xor r14d,eax and r12d,r8d xor r13d,r8d add r11d,DWORD[32+rsp] mov r15d,eax xor r12d,r10d shrd r14d,r14d,11 xor r15d,ebx add r11d,r12d shrd r13d,r13d,6 and edi,r15d xor r14d,eax 
add r11d,r13d xor edi,ebx shrd r14d,r14d,2 add edx,r11d add r11d,edi mov r13d,edx add r14d,r11d shrd r13d,r13d,14 mov r11d,r14d mov r12d,r8d shrd r14d,r14d,9 xor r13d,edx xor r12d,r9d shrd r13d,r13d,5 xor r14d,r11d and r12d,edx xor r13d,edx add r10d,DWORD[36+rsp] mov edi,r11d xor r12d,r9d shrd r14d,r14d,11 xor edi,eax add r10d,r12d shrd r13d,r13d,6 and r15d,edi xor r14d,r11d add r10d,r13d xor r15d,eax shrd r14d,r14d,2 add ecx,r10d add r10d,r15d mov r13d,ecx add r14d,r10d shrd r13d,r13d,14 mov r10d,r14d mov r12d,edx shrd r14d,r14d,9 xor r13d,ecx xor r12d,r8d shrd r13d,r13d,5 xor r14d,r10d and r12d,ecx xor r13d,ecx add r9d,DWORD[40+rsp] mov r15d,r10d xor r12d,r8d shrd r14d,r14d,11 xor r15d,r11d add r9d,r12d shrd r13d,r13d,6 and edi,r15d xor r14d,r10d add r9d,r13d xor edi,r11d shrd r14d,r14d,2 add ebx,r9d add r9d,edi mov r13d,ebx add r14d,r9d shrd r13d,r13d,14 mov r9d,r14d mov r12d,ecx shrd r14d,r14d,9 xor r13d,ebx xor r12d,edx shrd r13d,r13d,5 xor r14d,r9d and r12d,ebx xor r13d,ebx add r8d,DWORD[44+rsp] mov edi,r9d xor r12d,edx shrd r14d,r14d,11 xor edi,r10d add r8d,r12d shrd r13d,r13d,6 and r15d,edi xor r14d,r9d add r8d,r13d xor r15d,r10d shrd r14d,r14d,2 add eax,r8d add r8d,r15d mov r13d,eax add r14d,r8d shrd r13d,r13d,14 mov r8d,r14d mov r12d,ebx shrd r14d,r14d,9 xor r13d,eax xor r12d,ecx shrd r13d,r13d,5 xor r14d,r8d and r12d,eax xor r13d,eax add edx,DWORD[48+rsp] mov r15d,r8d xor r12d,ecx shrd r14d,r14d,11 xor r15d,r9d add edx,r12d shrd r13d,r13d,6 and edi,r15d xor r14d,r8d add edx,r13d xor edi,r9d shrd r14d,r14d,2 add r11d,edx add edx,edi mov r13d,r11d add r14d,edx shrd r13d,r13d,14 mov edx,r14d mov r12d,eax shrd r14d,r14d,9 xor r13d,r11d xor r12d,ebx shrd r13d,r13d,5 xor r14d,edx and r12d,r11d xor r13d,r11d add ecx,DWORD[52+rsp] mov edi,edx xor r12d,ebx shrd r14d,r14d,11 xor edi,r8d add ecx,r12d shrd r13d,r13d,6 and r15d,edi xor r14d,edx add ecx,r13d xor r15d,r8d shrd r14d,r14d,2 add r10d,ecx add ecx,r15d mov r13d,r10d add r14d,ecx shrd r13d,r13d,14 mov ecx,r14d mov r12d,r11d shrd r14d,r14d,9 xor r13d,r10d xor r12d,eax shrd r13d,r13d,5 xor r14d,ecx and r12d,r10d xor r13d,r10d add ebx,DWORD[56+rsp] mov r15d,ecx xor r12d,eax shrd r14d,r14d,11 xor r15d,edx add ebx,r12d shrd r13d,r13d,6 and edi,r15d xor r14d,ecx add ebx,r13d xor edi,edx shrd r14d,r14d,2 add r9d,ebx add ebx,edi mov r13d,r9d add r14d,ebx shrd r13d,r13d,14 mov ebx,r14d mov r12d,r10d shrd r14d,r14d,9 xor r13d,r9d xor r12d,r11d shrd r13d,r13d,5 xor r14d,ebx and r12d,r9d xor r13d,r9d add eax,DWORD[60+rsp] mov edi,ebx xor r12d,r11d shrd r14d,r14d,11 xor edi,ecx add eax,r12d shrd r13d,r13d,6 and r15d,edi xor r14d,ebx add eax,r13d xor r15d,ecx shrd r14d,r14d,2 add r8d,eax add eax,r15d mov r13d,r8d add r14d,eax mov rdi,QWORD[((64+0))+rsp] mov eax,r14d add eax,DWORD[rdi] lea rsi,[64+rsi] add ebx,DWORD[4+rdi] add ecx,DWORD[8+rdi] add edx,DWORD[12+rdi] add r8d,DWORD[16+rdi] add r9d,DWORD[20+rdi] add r10d,DWORD[24+rdi] add r11d,DWORD[28+rdi] cmp rsi,QWORD[((64+16))+rsp] mov DWORD[rdi],eax mov DWORD[4+rdi],ebx mov DWORD[8+rdi],ecx mov DWORD[12+rdi],edx mov DWORD[16+rdi],r8d mov DWORD[20+rdi],r9d mov DWORD[24+rdi],r10d mov DWORD[28+rdi],r11d jb NEAR $L$loop_avx mov rsi,QWORD[88+rsp] vzeroupper movaps xmm6,XMMWORD[((64+32))+rsp] movaps xmm7,XMMWORD[((64+48))+rsp] movaps xmm8,XMMWORD[((64+64))+rsp] movaps xmm9,XMMWORD[((64+80))+rsp] mov r15,QWORD[((-48))+rsi] mov r14,QWORD[((-40))+rsi] mov r13,QWORD[((-32))+rsi] mov r12,QWORD[((-24))+rsi] mov rbp,QWORD[((-16))+rsi] mov rbx,QWORD[((-8))+rsi] lea rsp,[rsi] $L$epilogue_avx: mov 
rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$SEH_end_sha256_block_data_order_avx: EXTERN __imp_RtlVirtualUnwind ALIGN 16 se_handler: push rsi push rdi push rbx push rbp push r12 push r13 push r14 push r15 pushfq sub rsp,64 mov rax,QWORD[120+r8] mov rbx,QWORD[248+r8] mov rsi,QWORD[8+r9] mov r11,QWORD[56+r9] mov r10d,DWORD[r11] lea r10,[r10*1+rsi] cmp rbx,r10 jb NEAR $L$in_prologue mov rax,QWORD[152+r8] mov r10d,DWORD[4+r11] lea r10,[r10*1+rsi] cmp rbx,r10 jae NEAR $L$in_prologue mov rsi,rax mov rax,QWORD[((64+24))+rax] mov rbx,QWORD[((-8))+rax] mov rbp,QWORD[((-16))+rax] mov r12,QWORD[((-24))+rax] mov r13,QWORD[((-32))+rax] mov r14,QWORD[((-40))+rax] mov r15,QWORD[((-48))+rax] mov QWORD[144+r8],rbx mov QWORD[160+r8],rbp mov QWORD[216+r8],r12 mov QWORD[224+r8],r13 mov QWORD[232+r8],r14 mov QWORD[240+r8],r15 lea r10,[$L$epilogue] cmp rbx,r10 jb NEAR $L$in_prologue lea rsi,[((64+32))+rsi] lea rdi,[512+r8] mov ecx,8 DD 0xa548f3fc $L$in_prologue: mov rdi,QWORD[8+rax] mov rsi,QWORD[16+rax] mov QWORD[152+r8],rax mov QWORD[168+r8],rsi mov QWORD[176+r8],rdi mov rdi,QWORD[40+r9] mov rsi,r8 mov ecx,154 DD 0xa548f3fc mov rsi,r9 xor rcx,rcx mov rdx,QWORD[8+rsi] mov r8,QWORD[rsi] mov r9,QWORD[16+rsi] mov r10,QWORD[40+rsi] lea r11,[56+rsi] lea r12,[24+rsi] mov QWORD[32+rsp],r10 mov QWORD[40+rsp],r11 mov QWORD[48+rsp],r12 mov QWORD[56+rsp],rcx call QWORD[__imp_RtlVirtualUnwind] mov eax,1 add rsp,64 popfq pop r15 pop r14 pop r13 pop r12 pop rbp pop rbx pop rdi pop rsi ret ALIGN 16 shaext_handler: push rsi push rdi push rbx push rbp push r12 push r13 push r14 push r15 pushfq sub rsp,64 mov rax,QWORD[120+r8] mov rbx,QWORD[248+r8] lea r10,[$L$prologue_shaext] cmp rbx,r10 jb NEAR $L$in_prologue lea r10,[$L$epilogue_shaext] cmp rbx,r10 jae NEAR $L$in_prologue lea rsi,[((-8-80))+rax] lea rdi,[512+r8] mov ecx,10 DD 0xa548f3fc jmp NEAR $L$in_prologue section .pdata rdata align=4 ALIGN 4 DD $L$SEH_begin_sha256_block_data_order_nohw wrt ..imagebase DD $L$SEH_end_sha256_block_data_order_nohw wrt ..imagebase DD $L$SEH_info_sha256_block_data_order_nohw wrt ..imagebase DD $L$SEH_begin_sha256_block_data_order_hw wrt ..imagebase DD $L$SEH_end_sha256_block_data_order_hw wrt ..imagebase DD $L$SEH_info_sha256_block_data_order_hw wrt ..imagebase DD $L$SEH_begin_sha256_block_data_order_ssse3 wrt ..imagebase DD $L$SEH_end_sha256_block_data_order_ssse3 wrt ..imagebase DD $L$SEH_info_sha256_block_data_order_ssse3 wrt ..imagebase DD $L$SEH_begin_sha256_block_data_order_avx wrt ..imagebase DD $L$SEH_end_sha256_block_data_order_avx wrt ..imagebase DD $L$SEH_info_sha256_block_data_order_avx wrt ..imagebase section .xdata rdata align=8 ALIGN 8 $L$SEH_info_sha256_block_data_order_nohw: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$prologue wrt ..imagebase,$L$epilogue wrt ..imagebase $L$SEH_info_sha256_block_data_order_hw: DB 9,0,0,0 DD shaext_handler wrt ..imagebase $L$SEH_info_sha256_block_data_order_ssse3: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$prologue_ssse3 wrt ..imagebase,$L$epilogue_ssse3 wrt ..imagebase $L$SEH_info_sha256_block_data_order_avx: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase %else ; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 ret %endif
ring-0.17.14/pregenerated/sha256-x86_64-nasm.o000064400000000000000000001410751046102023000164700ustar 00000000000000
[binary COFF object file: output of The Netwide Assembler 2.13.03 for sha256-x86_64-nasm.asm, defining ring_core_0_17_14__sha256_block_data_order_nohw, _hw, _ssse3 and _avx together with their Win64 SEH unwind records; raw object bytes not reproduced]
ring-0.17.14/pregenerated/sha512-armv4-linux32.S000064400000000000000000001235501046102023000170700ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) @ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. @ @ Licensed under the Apache License, Version 2.0 (the "License"); @ you may not use this file except in compliance with the License. @ You may obtain a copy of the License at @ @ https://www.apache.org/licenses/LICENSE-2.0 @ @ Unless required by applicable law or agreed to in writing, software @ distributed under the License is distributed on an "AS IS" BASIS, @ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @ See the License for the specific language governing permissions and @ limitations under the License. @ ==================================================================== @ Written by Andy Polyakov for the OpenSSL @ project. @ ==================================================================== @ SHA512 block procedure for ARMv4. September 2007. @ This code is ~4.5 (four and a half) times faster than code generated @ by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue @ Xscale PXA250 core]. @ @ July 2010.
@ @ Rescheduling for dual-issue pipeline resulted in 6% improvement on @ Cortex A8 core and ~40 cycles per processed byte. @ February 2011. @ @ Profiler-assisted and platform-specific optimization resulted in 7% @ improvement on Coxtex A8 core and ~38 cycles per byte. @ March 2011. @ @ Add NEON implementation. On Cortex A8 it was measured to process @ one byte in 23.3 cycles or ~60% faster than integer-only code. @ August 2012. @ @ Improve NEON performance by 12% on Snapdragon S4. In absolute @ terms it's 22.6 cycles per byte, which is disappointing result. @ Technical writers asserted that 3-way S4 pipeline can sustain @ multiple NEON instructions per cycle, but dual NEON issue could @ not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html @ for further details. On side note Cortex-A15 processes one byte in @ 16 cycles. @ Byte order [in]dependence. ========================================= @ @ Originally caller was expected to maintain specific *dword* order in @ h[0-7], namely with most significant dword at *lower* address, which @ was reflected in below two parameters as 0 and 4. Now caller is @ expected to maintain native byte order for whole 64-bit values. #ifndef __KERNEL__ # define VFP_ABI_PUSH vstmdb sp!,{d8-d15} # define VFP_ABI_POP vldmia sp!,{d8-d15} #else # define __ARM_MAX_ARCH__ 7 # define VFP_ABI_PUSH # define VFP_ABI_POP #endif @ Silence ARMv8 deprecated IT instruction warnings. This file is used by both @ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. .arch armv7-a #ifdef __ARMEL__ # define LO 0 # define HI 4 # define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1 #else # define HI 0 # define LO 4 # define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 #endif .text #if defined(__thumb2__) .syntax unified .thumb # define adrl adr #else .code 32 #endif .type K512,%object .align 5 K512: WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd) WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc) WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019) WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118) WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe) WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2) WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1) WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694) WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3) WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65) WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483) WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5) WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210) WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4) WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725) WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70) WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926) WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df) WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8) WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b) WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001) WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30) WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910) WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8) WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53) WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8) WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb) WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3) WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60) WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec) WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9) WORD64(0xbef9a3f7,0xb2c67915, 
0xc67178f2,0xe372532b) WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207) WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178) WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6) WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b) WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493) WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) .size K512,.-K512 .globl sha512_block_data_order_nohw .hidden sha512_block_data_order_nohw .type sha512_block_data_order_nohw,%function sha512_block_data_order_nohw: add r2,r1,r2,lsl#7 @ len to point at the end of inp stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} adr r14,K512 sub sp,sp,#9*8 ldr r7,[r0,#32+LO] ldr r8,[r0,#32+HI] ldr r9, [r0,#48+LO] ldr r10, [r0,#48+HI] ldr r11, [r0,#56+LO] ldr r12, [r0,#56+HI] .Loop: str r9, [sp,#48+0] str r10, [sp,#48+4] str r11, [sp,#56+0] str r12, [sp,#56+4] ldr r5,[r0,#0+LO] ldr r6,[r0,#0+HI] ldr r3,[r0,#8+LO] ldr r4,[r0,#8+HI] ldr r9, [r0,#16+LO] ldr r10, [r0,#16+HI] ldr r11, [r0,#24+LO] ldr r12, [r0,#24+HI] str r3,[sp,#8+0] str r4,[sp,#8+4] str r9, [sp,#16+0] str r10, [sp,#16+4] str r11, [sp,#24+0] str r12, [sp,#24+4] ldr r3,[r0,#40+LO] ldr r4,[r0,#40+HI] str r3,[sp,#40+0] str r4,[sp,#40+4] .L00_15: #if __ARM_ARCH<7 ldrb r3,[r1,#7] ldrb r9, [r1,#6] ldrb r10, [r1,#5] ldrb r11, [r1,#4] ldrb r4,[r1,#3] ldrb r12, [r1,#2] orr r3,r3,r9,lsl#8 ldrb r9, [r1,#1] orr r3,r3,r10,lsl#16 ldrb r10, [r1],#8 orr r3,r3,r11,lsl#24 orr r4,r4,r12,lsl#8 orr r4,r4,r9,lsl#16 orr r4,r4,r10,lsl#24 #else ldr r3,[r1,#4] ldr r4,[r1],#8 #ifdef __ARMEL__ rev r3,r3 rev r4,r4 #endif #endif @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 mov r9,r7,lsr#14 str r3,[sp,#64+0] mov r10,r8,lsr#14 str r4,[sp,#64+4] eor r9,r9,r8,lsl#18 ldr r11,[sp,#56+0] @ h.lo eor r10,r10,r7,lsl#18 ldr r12,[sp,#56+4] @ h.hi eor r9,r9,r7,lsr#18 eor r10,r10,r8,lsr#18 eor r9,r9,r8,lsl#14 eor r10,r10,r7,lsl#14 eor r9,r9,r8,lsr#9 eor r10,r10,r7,lsr#9 eor r9,r9,r7,lsl#23 eor r10,r10,r8,lsl#23 @ Sigma1(e) adds r3,r3,r9 ldr r9,[sp,#40+0] @ f.lo adc r4,r4,r10 @ T += Sigma1(e) ldr r10,[sp,#40+4] @ f.hi adds r3,r3,r11 ldr r11,[sp,#48+0] @ g.lo adc r4,r4,r12 @ T += h ldr r12,[sp,#48+4] @ g.hi eor r9,r9,r11 str r7,[sp,#32+0] eor r10,r10,r12 str r8,[sp,#32+4] and r9,r9,r7 str r5,[sp,#0+0] and r10,r10,r8 str r6,[sp,#0+4] eor r9,r9,r11 ldr r11,[r14,#LO] @ K[i].lo eor r10,r10,r12 @ Ch(e,f,g) ldr r12,[r14,#HI] @ K[i].hi adds r3,r3,r9 ldr r7,[sp,#24+0] @ d.lo adc r4,r4,r10 @ T += Ch(e,f,g) ldr r8,[sp,#24+4] @ d.hi adds r3,r3,r11 and r9,r11,#0xff adc r4,r4,r12 @ T += K[i] adds r7,r7,r3 ldr r11,[sp,#8+0] @ b.lo adc r8,r8,r4 @ d += T teq r9,#148 ldr r12,[sp,#16+0] @ c.lo #if __ARM_ARCH>=7 it eq @ Thumb2 thing, sanity check in ARM #endif orreq r14,r14,#1 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 mov r9,r5,lsr#28 mov r10,r6,lsr#28 eor r9,r9,r6,lsl#4 eor r10,r10,r5,lsl#4 eor r9,r9,r6,lsr#2 eor r10,r10,r5,lsr#2 eor r9,r9,r5,lsl#30 eor r10,r10,r6,lsl#30 eor r9,r9,r6,lsr#7 eor r10,r10,r5,lsr#7 eor r9,r9,r5,lsl#25 eor r10,r10,r6,lsl#25 @ Sigma0(a) adds r3,r3,r9 and r9,r5,r11 adc r4,r4,r10 @ T += Sigma0(a) ldr r10,[sp,#8+4] @ b.hi orr r5,r5,r11 ldr r11,[sp,#16+4] @ c.hi and r5,r5,r12 and r12,r6,r10 orr r6,r6,r10 orr r5,r5,r9 @ Maj(a,b,c).lo and r6,r6,r11 adds r5,r5,r3 orr r6,r6,r12 @ 
Maj(a,b,c).hi sub sp,sp,#8 adc r6,r6,r4 @ h += T tst r14,#1 add r14,r14,#8 tst r14,#1 beq .L00_15 ldr r9,[sp,#184+0] ldr r10,[sp,#184+4] bic r14,r14,#1 .L16_79: @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 mov r3,r9,lsr#1 ldr r11,[sp,#80+0] mov r4,r10,lsr#1 ldr r12,[sp,#80+4] eor r3,r3,r10,lsl#31 eor r4,r4,r9,lsl#31 eor r3,r3,r9,lsr#8 eor r4,r4,r10,lsr#8 eor r3,r3,r10,lsl#24 eor r4,r4,r9,lsl#24 eor r3,r3,r9,lsr#7 eor r4,r4,r10,lsr#7 eor r3,r3,r10,lsl#25 @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26 @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6 mov r9,r11,lsr#19 mov r10,r12,lsr#19 eor r9,r9,r12,lsl#13 eor r10,r10,r11,lsl#13 eor r9,r9,r12,lsr#29 eor r10,r10,r11,lsr#29 eor r9,r9,r11,lsl#3 eor r10,r10,r12,lsl#3 eor r9,r9,r11,lsr#6 eor r10,r10,r12,lsr#6 ldr r11,[sp,#120+0] eor r9,r9,r12,lsl#26 ldr r12,[sp,#120+4] adds r3,r3,r9 ldr r9,[sp,#192+0] adc r4,r4,r10 ldr r10,[sp,#192+4] adds r3,r3,r11 adc r4,r4,r12 adds r3,r3,r9 adc r4,r4,r10 @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 mov r9,r7,lsr#14 str r3,[sp,#64+0] mov r10,r8,lsr#14 str r4,[sp,#64+4] eor r9,r9,r8,lsl#18 ldr r11,[sp,#56+0] @ h.lo eor r10,r10,r7,lsl#18 ldr r12,[sp,#56+4] @ h.hi eor r9,r9,r7,lsr#18 eor r10,r10,r8,lsr#18 eor r9,r9,r8,lsl#14 eor r10,r10,r7,lsl#14 eor r9,r9,r8,lsr#9 eor r10,r10,r7,lsr#9 eor r9,r9,r7,lsl#23 eor r10,r10,r8,lsl#23 @ Sigma1(e) adds r3,r3,r9 ldr r9,[sp,#40+0] @ f.lo adc r4,r4,r10 @ T += Sigma1(e) ldr r10,[sp,#40+4] @ f.hi adds r3,r3,r11 ldr r11,[sp,#48+0] @ g.lo adc r4,r4,r12 @ T += h ldr r12,[sp,#48+4] @ g.hi eor r9,r9,r11 str r7,[sp,#32+0] eor r10,r10,r12 str r8,[sp,#32+4] and r9,r9,r7 str r5,[sp,#0+0] and r10,r10,r8 str r6,[sp,#0+4] eor r9,r9,r11 ldr r11,[r14,#LO] @ K[i].lo eor r10,r10,r12 @ Ch(e,f,g) ldr r12,[r14,#HI] @ K[i].hi adds r3,r3,r9 ldr r7,[sp,#24+0] @ d.lo adc r4,r4,r10 @ T += Ch(e,f,g) ldr r8,[sp,#24+4] @ d.hi adds r3,r3,r11 and r9,r11,#0xff adc r4,r4,r12 @ T += K[i] adds r7,r7,r3 ldr r11,[sp,#8+0] @ b.lo adc r8,r8,r4 @ d += T teq r9,#23 ldr r12,[sp,#16+0] @ c.lo #if __ARM_ARCH>=7 it eq @ Thumb2 thing, sanity check in ARM #endif orreq r14,r14,#1 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 mov r9,r5,lsr#28 mov r10,r6,lsr#28 eor r9,r9,r6,lsl#4 eor r10,r10,r5,lsl#4 eor r9,r9,r6,lsr#2 eor r10,r10,r5,lsr#2 eor r9,r9,r5,lsl#30 eor r10,r10,r6,lsl#30 eor r9,r9,r6,lsr#7 eor r10,r10,r5,lsr#7 eor r9,r9,r5,lsl#25 eor r10,r10,r6,lsl#25 @ Sigma0(a) adds r3,r3,r9 and r9,r5,r11 adc r4,r4,r10 @ T += Sigma0(a) ldr r10,[sp,#8+4] @ b.hi orr r5,r5,r11 ldr r11,[sp,#16+4] @ c.hi and r5,r5,r12 and r12,r6,r10 orr r6,r6,r10 orr r5,r5,r9 @ Maj(a,b,c).lo and r6,r6,r11 adds r5,r5,r3 orr r6,r6,r12 @ Maj(a,b,c).hi sub sp,sp,#8 adc r6,r6,r4 @ h += T tst r14,#1 add r14,r14,#8 #if __ARM_ARCH>=7 ittt eq @ Thumb2 thing, sanity check in ARM #endif ldreq r9,[sp,#184+0] ldreq r10,[sp,#184+4] beq .L16_79 bic r14,r14,#1 ldr r3,[sp,#8+0] ldr r4,[sp,#8+4] ldr r9, [r0,#0+LO] ldr r10, [r0,#0+HI] ldr r11, [r0,#8+LO] ldr r12, [r0,#8+HI] adds r9,r5,r9 str r9, [r0,#0+LO] adc r10,r6,r10 str r10, [r0,#0+HI] adds r11,r3,r11 str r11, [r0,#8+LO] adc r12,r4,r12 str r12, [r0,#8+HI] ldr r5,[sp,#16+0] ldr r6,[sp,#16+4] ldr r3,[sp,#24+0] ldr r4,[sp,#24+4] ldr r9, [r0,#16+LO] ldr r10, 
[r0,#16+HI] ldr r11, [r0,#24+LO] ldr r12, [r0,#24+HI] adds r9,r5,r9 str r9, [r0,#16+LO] adc r10,r6,r10 str r10, [r0,#16+HI] adds r11,r3,r11 str r11, [r0,#24+LO] adc r12,r4,r12 str r12, [r0,#24+HI] ldr r3,[sp,#40+0] ldr r4,[sp,#40+4] ldr r9, [r0,#32+LO] ldr r10, [r0,#32+HI] ldr r11, [r0,#40+LO] ldr r12, [r0,#40+HI] adds r7,r7,r9 str r7,[r0,#32+LO] adc r8,r8,r10 str r8,[r0,#32+HI] adds r11,r3,r11 str r11, [r0,#40+LO] adc r12,r4,r12 str r12, [r0,#40+HI] ldr r5,[sp,#48+0] ldr r6,[sp,#48+4] ldr r3,[sp,#56+0] ldr r4,[sp,#56+4] ldr r9, [r0,#48+LO] ldr r10, [r0,#48+HI] ldr r11, [r0,#56+LO] ldr r12, [r0,#56+HI] adds r9,r5,r9 str r9, [r0,#48+LO] adc r10,r6,r10 str r10, [r0,#48+HI] adds r11,r3,r11 str r11, [r0,#56+LO] adc r12,r4,r12 str r12, [r0,#56+HI] add sp,sp,#640 sub r14,r14,#640 teq r1,r2 bne .Loop add sp,sp,#8*9 @ destroy frame #if __ARM_ARCH>=5 ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} #else ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet .word 0xe12fff1e @ interoperable with Thumb ISA:-) #endif .size sha512_block_data_order_nohw,.-sha512_block_data_order_nohw #if __ARM_MAX_ARCH__>=7 .arch armv7-a .fpu neon .globl sha512_block_data_order_neon .hidden sha512_block_data_order_neon .type sha512_block_data_order_neon,%function .align 4 sha512_block_data_order_neon: dmb @ errata #451034 on early Cortex A8 add r2,r1,r2,lsl#7 @ len to point at the end of inp adr r3,K512 VFP_ABI_PUSH vldmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ load context .Loop_neon: vshr.u64 d24,d20,#14 @ 0 #if 0<16 vld1.64 {d0},[r1]! @ handles unaligned #endif vshr.u64 d25,d20,#18 #if 0>0 vadd.i64 d16,d30 @ h+=Maj from the past #endif vshr.u64 d26,d20,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d20,#50 vsli.64 d25,d20,#46 vmov d29,d20 vsli.64 d26,d20,#23 #if 0<16 && defined(__ARMEL__) vrev64.8 d0,d0 #endif veor d25,d24 vbsl d29,d21,d22 @ Ch(e,f,g) vshr.u64 d24,d16,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d23 vshr.u64 d25,d16,#34 vsli.64 d24,d16,#36 vadd.i64 d27,d26 vshr.u64 d26,d16,#39 vadd.i64 d28,d0 vsli.64 d25,d16,#30 veor d30,d16,d17 vsli.64 d26,d16,#25 veor d23,d24,d25 vadd.i64 d27,d28 vbsl d30,d18,d17 @ Maj(a,b,c) veor d23,d26 @ Sigma0(a) vadd.i64 d19,d27 vadd.i64 d30,d27 @ vadd.i64 d23,d30 vshr.u64 d24,d19,#14 @ 1 #if 1<16 vld1.64 {d1},[r1]! @ handles unaligned #endif vshr.u64 d25,d19,#18 #if 1>0 vadd.i64 d23,d30 @ h+=Maj from the past #endif vshr.u64 d26,d19,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d19,#50 vsli.64 d25,d19,#46 vmov d29,d19 vsli.64 d26,d19,#23 #if 1<16 && defined(__ARMEL__) vrev64.8 d1,d1 #endif veor d25,d24 vbsl d29,d20,d21 @ Ch(e,f,g) vshr.u64 d24,d23,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d22 vshr.u64 d25,d23,#34 vsli.64 d24,d23,#36 vadd.i64 d27,d26 vshr.u64 d26,d23,#39 vadd.i64 d28,d1 vsli.64 d25,d23,#30 veor d30,d23,d16 vsli.64 d26,d23,#25 veor d22,d24,d25 vadd.i64 d27,d28 vbsl d30,d17,d16 @ Maj(a,b,c) veor d22,d26 @ Sigma0(a) vadd.i64 d18,d27 vadd.i64 d30,d27 @ vadd.i64 d22,d30 vshr.u64 d24,d18,#14 @ 2 #if 2<16 vld1.64 {d2},[r1]! @ handles unaligned #endif vshr.u64 d25,d18,#18 #if 2>0 vadd.i64 d22,d30 @ h+=Maj from the past #endif vshr.u64 d26,d18,#41 vld1.64 {d28},[r3,:64]! 
@ K[i++] vsli.64 d24,d18,#50 vsli.64 d25,d18,#46 vmov d29,d18 vsli.64 d26,d18,#23 #if 2<16 && defined(__ARMEL__) vrev64.8 d2,d2 #endif veor d25,d24 vbsl d29,d19,d20 @ Ch(e,f,g) vshr.u64 d24,d22,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d21 vshr.u64 d25,d22,#34 vsli.64 d24,d22,#36 vadd.i64 d27,d26 vshr.u64 d26,d22,#39 vadd.i64 d28,d2 vsli.64 d25,d22,#30 veor d30,d22,d23 vsli.64 d26,d22,#25 veor d21,d24,d25 vadd.i64 d27,d28 vbsl d30,d16,d23 @ Maj(a,b,c) veor d21,d26 @ Sigma0(a) vadd.i64 d17,d27 vadd.i64 d30,d27 @ vadd.i64 d21,d30 vshr.u64 d24,d17,#14 @ 3 #if 3<16 vld1.64 {d3},[r1]! @ handles unaligned #endif vshr.u64 d25,d17,#18 #if 3>0 vadd.i64 d21,d30 @ h+=Maj from the past #endif vshr.u64 d26,d17,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d17,#50 vsli.64 d25,d17,#46 vmov d29,d17 vsli.64 d26,d17,#23 #if 3<16 && defined(__ARMEL__) vrev64.8 d3,d3 #endif veor d25,d24 vbsl d29,d18,d19 @ Ch(e,f,g) vshr.u64 d24,d21,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d20 vshr.u64 d25,d21,#34 vsli.64 d24,d21,#36 vadd.i64 d27,d26 vshr.u64 d26,d21,#39 vadd.i64 d28,d3 vsli.64 d25,d21,#30 veor d30,d21,d22 vsli.64 d26,d21,#25 veor d20,d24,d25 vadd.i64 d27,d28 vbsl d30,d23,d22 @ Maj(a,b,c) veor d20,d26 @ Sigma0(a) vadd.i64 d16,d27 vadd.i64 d30,d27 @ vadd.i64 d20,d30 vshr.u64 d24,d16,#14 @ 4 #if 4<16 vld1.64 {d4},[r1]! @ handles unaligned #endif vshr.u64 d25,d16,#18 #if 4>0 vadd.i64 d20,d30 @ h+=Maj from the past #endif vshr.u64 d26,d16,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d16,#50 vsli.64 d25,d16,#46 vmov d29,d16 vsli.64 d26,d16,#23 #if 4<16 && defined(__ARMEL__) vrev64.8 d4,d4 #endif veor d25,d24 vbsl d29,d17,d18 @ Ch(e,f,g) vshr.u64 d24,d20,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d19 vshr.u64 d25,d20,#34 vsli.64 d24,d20,#36 vadd.i64 d27,d26 vshr.u64 d26,d20,#39 vadd.i64 d28,d4 vsli.64 d25,d20,#30 veor d30,d20,d21 vsli.64 d26,d20,#25 veor d19,d24,d25 vadd.i64 d27,d28 vbsl d30,d22,d21 @ Maj(a,b,c) veor d19,d26 @ Sigma0(a) vadd.i64 d23,d27 vadd.i64 d30,d27 @ vadd.i64 d19,d30 vshr.u64 d24,d23,#14 @ 5 #if 5<16 vld1.64 {d5},[r1]! @ handles unaligned #endif vshr.u64 d25,d23,#18 #if 5>0 vadd.i64 d19,d30 @ h+=Maj from the past #endif vshr.u64 d26,d23,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d23,#50 vsli.64 d25,d23,#46 vmov d29,d23 vsli.64 d26,d23,#23 #if 5<16 && defined(__ARMEL__) vrev64.8 d5,d5 #endif veor d25,d24 vbsl d29,d16,d17 @ Ch(e,f,g) vshr.u64 d24,d19,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d18 vshr.u64 d25,d19,#34 vsli.64 d24,d19,#36 vadd.i64 d27,d26 vshr.u64 d26,d19,#39 vadd.i64 d28,d5 vsli.64 d25,d19,#30 veor d30,d19,d20 vsli.64 d26,d19,#25 veor d18,d24,d25 vadd.i64 d27,d28 vbsl d30,d21,d20 @ Maj(a,b,c) veor d18,d26 @ Sigma0(a) vadd.i64 d22,d27 vadd.i64 d30,d27 @ vadd.i64 d18,d30 vshr.u64 d24,d22,#14 @ 6 #if 6<16 vld1.64 {d6},[r1]! @ handles unaligned #endif vshr.u64 d25,d22,#18 #if 6>0 vadd.i64 d18,d30 @ h+=Maj from the past #endif vshr.u64 d26,d22,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d22,#50 vsli.64 d25,d22,#46 vmov d29,d22 vsli.64 d26,d22,#23 #if 6<16 && defined(__ARMEL__) vrev64.8 d6,d6 #endif veor d25,d24 vbsl d29,d23,d16 @ Ch(e,f,g) vshr.u64 d24,d18,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d17 vshr.u64 d25,d18,#34 vsli.64 d24,d18,#36 vadd.i64 d27,d26 vshr.u64 d26,d18,#39 vadd.i64 d28,d6 vsli.64 d25,d18,#30 veor d30,d18,d19 vsli.64 d26,d18,#25 veor d17,d24,d25 vadd.i64 d27,d28 vbsl d30,d20,d19 @ Maj(a,b,c) veor d17,d26 @ Sigma0(a) vadd.i64 d21,d27 vadd.i64 d30,d27 @ vadd.i64 d17,d30 vshr.u64 d24,d21,#14 @ 7 #if 7<16 vld1.64 {d7},[r1]! 
@ handles unaligned #endif vshr.u64 d25,d21,#18 #if 7>0 vadd.i64 d17,d30 @ h+=Maj from the past #endif vshr.u64 d26,d21,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d21,#50 vsli.64 d25,d21,#46 vmov d29,d21 vsli.64 d26,d21,#23 #if 7<16 && defined(__ARMEL__) vrev64.8 d7,d7 #endif veor d25,d24 vbsl d29,d22,d23 @ Ch(e,f,g) vshr.u64 d24,d17,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d16 vshr.u64 d25,d17,#34 vsli.64 d24,d17,#36 vadd.i64 d27,d26 vshr.u64 d26,d17,#39 vadd.i64 d28,d7 vsli.64 d25,d17,#30 veor d30,d17,d18 vsli.64 d26,d17,#25 veor d16,d24,d25 vadd.i64 d27,d28 vbsl d30,d19,d18 @ Maj(a,b,c) veor d16,d26 @ Sigma0(a) vadd.i64 d20,d27 vadd.i64 d30,d27 @ vadd.i64 d16,d30 vshr.u64 d24,d20,#14 @ 8 #if 8<16 vld1.64 {d8},[r1]! @ handles unaligned #endif vshr.u64 d25,d20,#18 #if 8>0 vadd.i64 d16,d30 @ h+=Maj from the past #endif vshr.u64 d26,d20,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d20,#50 vsli.64 d25,d20,#46 vmov d29,d20 vsli.64 d26,d20,#23 #if 8<16 && defined(__ARMEL__) vrev64.8 d8,d8 #endif veor d25,d24 vbsl d29,d21,d22 @ Ch(e,f,g) vshr.u64 d24,d16,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d23 vshr.u64 d25,d16,#34 vsli.64 d24,d16,#36 vadd.i64 d27,d26 vshr.u64 d26,d16,#39 vadd.i64 d28,d8 vsli.64 d25,d16,#30 veor d30,d16,d17 vsli.64 d26,d16,#25 veor d23,d24,d25 vadd.i64 d27,d28 vbsl d30,d18,d17 @ Maj(a,b,c) veor d23,d26 @ Sigma0(a) vadd.i64 d19,d27 vadd.i64 d30,d27 @ vadd.i64 d23,d30 vshr.u64 d24,d19,#14 @ 9 #if 9<16 vld1.64 {d9},[r1]! @ handles unaligned #endif vshr.u64 d25,d19,#18 #if 9>0 vadd.i64 d23,d30 @ h+=Maj from the past #endif vshr.u64 d26,d19,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d19,#50 vsli.64 d25,d19,#46 vmov d29,d19 vsli.64 d26,d19,#23 #if 9<16 && defined(__ARMEL__) vrev64.8 d9,d9 #endif veor d25,d24 vbsl d29,d20,d21 @ Ch(e,f,g) vshr.u64 d24,d23,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d22 vshr.u64 d25,d23,#34 vsli.64 d24,d23,#36 vadd.i64 d27,d26 vshr.u64 d26,d23,#39 vadd.i64 d28,d9 vsli.64 d25,d23,#30 veor d30,d23,d16 vsli.64 d26,d23,#25 veor d22,d24,d25 vadd.i64 d27,d28 vbsl d30,d17,d16 @ Maj(a,b,c) veor d22,d26 @ Sigma0(a) vadd.i64 d18,d27 vadd.i64 d30,d27 @ vadd.i64 d22,d30 vshr.u64 d24,d18,#14 @ 10 #if 10<16 vld1.64 {d10},[r1]! @ handles unaligned #endif vshr.u64 d25,d18,#18 #if 10>0 vadd.i64 d22,d30 @ h+=Maj from the past #endif vshr.u64 d26,d18,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d18,#50 vsli.64 d25,d18,#46 vmov d29,d18 vsli.64 d26,d18,#23 #if 10<16 && defined(__ARMEL__) vrev64.8 d10,d10 #endif veor d25,d24 vbsl d29,d19,d20 @ Ch(e,f,g) vshr.u64 d24,d22,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d21 vshr.u64 d25,d22,#34 vsli.64 d24,d22,#36 vadd.i64 d27,d26 vshr.u64 d26,d22,#39 vadd.i64 d28,d10 vsli.64 d25,d22,#30 veor d30,d22,d23 vsli.64 d26,d22,#25 veor d21,d24,d25 vadd.i64 d27,d28 vbsl d30,d16,d23 @ Maj(a,b,c) veor d21,d26 @ Sigma0(a) vadd.i64 d17,d27 vadd.i64 d30,d27 @ vadd.i64 d21,d30 vshr.u64 d24,d17,#14 @ 11 #if 11<16 vld1.64 {d11},[r1]! @ handles unaligned #endif vshr.u64 d25,d17,#18 #if 11>0 vadd.i64 d21,d30 @ h+=Maj from the past #endif vshr.u64 d26,d17,#41 vld1.64 {d28},[r3,:64]! 
@ K[i++] vsli.64 d24,d17,#50 vsli.64 d25,d17,#46 vmov d29,d17 vsli.64 d26,d17,#23 #if 11<16 && defined(__ARMEL__) vrev64.8 d11,d11 #endif veor d25,d24 vbsl d29,d18,d19 @ Ch(e,f,g) vshr.u64 d24,d21,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d20 vshr.u64 d25,d21,#34 vsli.64 d24,d21,#36 vadd.i64 d27,d26 vshr.u64 d26,d21,#39 vadd.i64 d28,d11 vsli.64 d25,d21,#30 veor d30,d21,d22 vsli.64 d26,d21,#25 veor d20,d24,d25 vadd.i64 d27,d28 vbsl d30,d23,d22 @ Maj(a,b,c) veor d20,d26 @ Sigma0(a) vadd.i64 d16,d27 vadd.i64 d30,d27 @ vadd.i64 d20,d30 vshr.u64 d24,d16,#14 @ 12 #if 12<16 vld1.64 {d12},[r1]! @ handles unaligned #endif vshr.u64 d25,d16,#18 #if 12>0 vadd.i64 d20,d30 @ h+=Maj from the past #endif vshr.u64 d26,d16,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d16,#50 vsli.64 d25,d16,#46 vmov d29,d16 vsli.64 d26,d16,#23 #if 12<16 && defined(__ARMEL__) vrev64.8 d12,d12 #endif veor d25,d24 vbsl d29,d17,d18 @ Ch(e,f,g) vshr.u64 d24,d20,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d19 vshr.u64 d25,d20,#34 vsli.64 d24,d20,#36 vadd.i64 d27,d26 vshr.u64 d26,d20,#39 vadd.i64 d28,d12 vsli.64 d25,d20,#30 veor d30,d20,d21 vsli.64 d26,d20,#25 veor d19,d24,d25 vadd.i64 d27,d28 vbsl d30,d22,d21 @ Maj(a,b,c) veor d19,d26 @ Sigma0(a) vadd.i64 d23,d27 vadd.i64 d30,d27 @ vadd.i64 d19,d30 vshr.u64 d24,d23,#14 @ 13 #if 13<16 vld1.64 {d13},[r1]! @ handles unaligned #endif vshr.u64 d25,d23,#18 #if 13>0 vadd.i64 d19,d30 @ h+=Maj from the past #endif vshr.u64 d26,d23,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d23,#50 vsli.64 d25,d23,#46 vmov d29,d23 vsli.64 d26,d23,#23 #if 13<16 && defined(__ARMEL__) vrev64.8 d13,d13 #endif veor d25,d24 vbsl d29,d16,d17 @ Ch(e,f,g) vshr.u64 d24,d19,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d18 vshr.u64 d25,d19,#34 vsli.64 d24,d19,#36 vadd.i64 d27,d26 vshr.u64 d26,d19,#39 vadd.i64 d28,d13 vsli.64 d25,d19,#30 veor d30,d19,d20 vsli.64 d26,d19,#25 veor d18,d24,d25 vadd.i64 d27,d28 vbsl d30,d21,d20 @ Maj(a,b,c) veor d18,d26 @ Sigma0(a) vadd.i64 d22,d27 vadd.i64 d30,d27 @ vadd.i64 d18,d30 vshr.u64 d24,d22,#14 @ 14 #if 14<16 vld1.64 {d14},[r1]! @ handles unaligned #endif vshr.u64 d25,d22,#18 #if 14>0 vadd.i64 d18,d30 @ h+=Maj from the past #endif vshr.u64 d26,d22,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d22,#50 vsli.64 d25,d22,#46 vmov d29,d22 vsli.64 d26,d22,#23 #if 14<16 && defined(__ARMEL__) vrev64.8 d14,d14 #endif veor d25,d24 vbsl d29,d23,d16 @ Ch(e,f,g) vshr.u64 d24,d18,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d17 vshr.u64 d25,d18,#34 vsli.64 d24,d18,#36 vadd.i64 d27,d26 vshr.u64 d26,d18,#39 vadd.i64 d28,d14 vsli.64 d25,d18,#30 veor d30,d18,d19 vsli.64 d26,d18,#25 veor d17,d24,d25 vadd.i64 d27,d28 vbsl d30,d20,d19 @ Maj(a,b,c) veor d17,d26 @ Sigma0(a) vadd.i64 d21,d27 vadd.i64 d30,d27 @ vadd.i64 d17,d30 vshr.u64 d24,d21,#14 @ 15 #if 15<16 vld1.64 {d15},[r1]! @ handles unaligned #endif vshr.u64 d25,d21,#18 #if 15>0 vadd.i64 d17,d30 @ h+=Maj from the past #endif vshr.u64 d26,d21,#41 vld1.64 {d28},[r3,:64]! 
@ K[i++] vsli.64 d24,d21,#50 vsli.64 d25,d21,#46 vmov d29,d21 vsli.64 d26,d21,#23 #if 15<16 && defined(__ARMEL__) vrev64.8 d15,d15 #endif veor d25,d24 vbsl d29,d22,d23 @ Ch(e,f,g) vshr.u64 d24,d17,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d16 vshr.u64 d25,d17,#34 vsli.64 d24,d17,#36 vadd.i64 d27,d26 vshr.u64 d26,d17,#39 vadd.i64 d28,d15 vsli.64 d25,d17,#30 veor d30,d17,d18 vsli.64 d26,d17,#25 veor d16,d24,d25 vadd.i64 d27,d28 vbsl d30,d19,d18 @ Maj(a,b,c) veor d16,d26 @ Sigma0(a) vadd.i64 d20,d27 vadd.i64 d30,d27 @ vadd.i64 d16,d30 mov r12,#4 .L16_79_neon: subs r12,#1 vshr.u64 q12,q7,#19 vshr.u64 q13,q7,#61 vadd.i64 d16,d30 @ h+=Maj from the past vshr.u64 q15,q7,#6 vsli.64 q12,q7,#45 vext.8 q14,q0,q1,#8 @ X[i+1] vsli.64 q13,q7,#3 veor q15,q12 vshr.u64 q12,q14,#1 veor q15,q13 @ sigma1(X[i+14]) vshr.u64 q13,q14,#8 vadd.i64 q0,q15 vshr.u64 q15,q14,#7 vsli.64 q12,q14,#63 vsli.64 q13,q14,#56 vext.8 q14,q4,q5,#8 @ X[i+9] veor q15,q12 vshr.u64 d24,d20,#14 @ from NEON_00_15 vadd.i64 q0,q14 vshr.u64 d25,d20,#18 @ from NEON_00_15 veor q15,q13 @ sigma0(X[i+1]) vshr.u64 d26,d20,#41 @ from NEON_00_15 vadd.i64 q0,q15 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d20,#50 vsli.64 d25,d20,#46 vmov d29,d20 vsli.64 d26,d20,#23 #if 16<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d21,d22 @ Ch(e,f,g) vshr.u64 d24,d16,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d23 vshr.u64 d25,d16,#34 vsli.64 d24,d16,#36 vadd.i64 d27,d26 vshr.u64 d26,d16,#39 vadd.i64 d28,d0 vsli.64 d25,d16,#30 veor d30,d16,d17 vsli.64 d26,d16,#25 veor d23,d24,d25 vadd.i64 d27,d28 vbsl d30,d18,d17 @ Maj(a,b,c) veor d23,d26 @ Sigma0(a) vadd.i64 d19,d27 vadd.i64 d30,d27 @ vadd.i64 d23,d30 vshr.u64 d24,d19,#14 @ 17 #if 17<16 vld1.64 {d1},[r1]! @ handles unaligned #endif vshr.u64 d25,d19,#18 #if 17>0 vadd.i64 d23,d30 @ h+=Maj from the past #endif vshr.u64 d26,d19,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d19,#50 vsli.64 d25,d19,#46 vmov d29,d19 vsli.64 d26,d19,#23 #if 17<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d20,d21 @ Ch(e,f,g) vshr.u64 d24,d23,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d22 vshr.u64 d25,d23,#34 vsli.64 d24,d23,#36 vadd.i64 d27,d26 vshr.u64 d26,d23,#39 vadd.i64 d28,d1 vsli.64 d25,d23,#30 veor d30,d23,d16 vsli.64 d26,d23,#25 veor d22,d24,d25 vadd.i64 d27,d28 vbsl d30,d17,d16 @ Maj(a,b,c) veor d22,d26 @ Sigma0(a) vadd.i64 d18,d27 vadd.i64 d30,d27 @ vadd.i64 d22,d30 vshr.u64 q12,q0,#19 vshr.u64 q13,q0,#61 vadd.i64 d22,d30 @ h+=Maj from the past vshr.u64 q15,q0,#6 vsli.64 q12,q0,#45 vext.8 q14,q1,q2,#8 @ X[i+1] vsli.64 q13,q0,#3 veor q15,q12 vshr.u64 q12,q14,#1 veor q15,q13 @ sigma1(X[i+14]) vshr.u64 q13,q14,#8 vadd.i64 q1,q15 vshr.u64 q15,q14,#7 vsli.64 q12,q14,#63 vsli.64 q13,q14,#56 vext.8 q14,q5,q6,#8 @ X[i+9] veor q15,q12 vshr.u64 d24,d18,#14 @ from NEON_00_15 vadd.i64 q1,q14 vshr.u64 d25,d18,#18 @ from NEON_00_15 veor q15,q13 @ sigma0(X[i+1]) vshr.u64 d26,d18,#41 @ from NEON_00_15 vadd.i64 q1,q15 vld1.64 {d28},[r3,:64]! 
@ K[i++] vsli.64 d24,d18,#50 vsli.64 d25,d18,#46 vmov d29,d18 vsli.64 d26,d18,#23 #if 18<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d19,d20 @ Ch(e,f,g) vshr.u64 d24,d22,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d21 vshr.u64 d25,d22,#34 vsli.64 d24,d22,#36 vadd.i64 d27,d26 vshr.u64 d26,d22,#39 vadd.i64 d28,d2 vsli.64 d25,d22,#30 veor d30,d22,d23 vsli.64 d26,d22,#25 veor d21,d24,d25 vadd.i64 d27,d28 vbsl d30,d16,d23 @ Maj(a,b,c) veor d21,d26 @ Sigma0(a) vadd.i64 d17,d27 vadd.i64 d30,d27 @ vadd.i64 d21,d30 vshr.u64 d24,d17,#14 @ 19 #if 19<16 vld1.64 {d3},[r1]! @ handles unaligned #endif vshr.u64 d25,d17,#18 #if 19>0 vadd.i64 d21,d30 @ h+=Maj from the past #endif vshr.u64 d26,d17,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d17,#50 vsli.64 d25,d17,#46 vmov d29,d17 vsli.64 d26,d17,#23 #if 19<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d18,d19 @ Ch(e,f,g) vshr.u64 d24,d21,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d20 vshr.u64 d25,d21,#34 vsli.64 d24,d21,#36 vadd.i64 d27,d26 vshr.u64 d26,d21,#39 vadd.i64 d28,d3 vsli.64 d25,d21,#30 veor d30,d21,d22 vsli.64 d26,d21,#25 veor d20,d24,d25 vadd.i64 d27,d28 vbsl d30,d23,d22 @ Maj(a,b,c) veor d20,d26 @ Sigma0(a) vadd.i64 d16,d27 vadd.i64 d30,d27 @ vadd.i64 d20,d30 vshr.u64 q12,q1,#19 vshr.u64 q13,q1,#61 vadd.i64 d20,d30 @ h+=Maj from the past vshr.u64 q15,q1,#6 vsli.64 q12,q1,#45 vext.8 q14,q2,q3,#8 @ X[i+1] vsli.64 q13,q1,#3 veor q15,q12 vshr.u64 q12,q14,#1 veor q15,q13 @ sigma1(X[i+14]) vshr.u64 q13,q14,#8 vadd.i64 q2,q15 vshr.u64 q15,q14,#7 vsli.64 q12,q14,#63 vsli.64 q13,q14,#56 vext.8 q14,q6,q7,#8 @ X[i+9] veor q15,q12 vshr.u64 d24,d16,#14 @ from NEON_00_15 vadd.i64 q2,q14 vshr.u64 d25,d16,#18 @ from NEON_00_15 veor q15,q13 @ sigma0(X[i+1]) vshr.u64 d26,d16,#41 @ from NEON_00_15 vadd.i64 q2,q15 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d16,#50 vsli.64 d25,d16,#46 vmov d29,d16 vsli.64 d26,d16,#23 #if 20<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d17,d18 @ Ch(e,f,g) vshr.u64 d24,d20,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d19 vshr.u64 d25,d20,#34 vsli.64 d24,d20,#36 vadd.i64 d27,d26 vshr.u64 d26,d20,#39 vadd.i64 d28,d4 vsli.64 d25,d20,#30 veor d30,d20,d21 vsli.64 d26,d20,#25 veor d19,d24,d25 vadd.i64 d27,d28 vbsl d30,d22,d21 @ Maj(a,b,c) veor d19,d26 @ Sigma0(a) vadd.i64 d23,d27 vadd.i64 d30,d27 @ vadd.i64 d19,d30 vshr.u64 d24,d23,#14 @ 21 #if 21<16 vld1.64 {d5},[r1]! @ handles unaligned #endif vshr.u64 d25,d23,#18 #if 21>0 vadd.i64 d19,d30 @ h+=Maj from the past #endif vshr.u64 d26,d23,#41 vld1.64 {d28},[r3,:64]! 
@ K[i++] vsli.64 d24,d23,#50 vsli.64 d25,d23,#46 vmov d29,d23 vsli.64 d26,d23,#23 #if 21<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d16,d17 @ Ch(e,f,g) vshr.u64 d24,d19,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d18 vshr.u64 d25,d19,#34 vsli.64 d24,d19,#36 vadd.i64 d27,d26 vshr.u64 d26,d19,#39 vadd.i64 d28,d5 vsli.64 d25,d19,#30 veor d30,d19,d20 vsli.64 d26,d19,#25 veor d18,d24,d25 vadd.i64 d27,d28 vbsl d30,d21,d20 @ Maj(a,b,c) veor d18,d26 @ Sigma0(a) vadd.i64 d22,d27 vadd.i64 d30,d27 @ vadd.i64 d18,d30 vshr.u64 q12,q2,#19 vshr.u64 q13,q2,#61 vadd.i64 d18,d30 @ h+=Maj from the past vshr.u64 q15,q2,#6 vsli.64 q12,q2,#45 vext.8 q14,q3,q4,#8 @ X[i+1] vsli.64 q13,q2,#3 veor q15,q12 vshr.u64 q12,q14,#1 veor q15,q13 @ sigma1(X[i+14]) vshr.u64 q13,q14,#8 vadd.i64 q3,q15 vshr.u64 q15,q14,#7 vsli.64 q12,q14,#63 vsli.64 q13,q14,#56 vext.8 q14,q7,q0,#8 @ X[i+9] veor q15,q12 vshr.u64 d24,d22,#14 @ from NEON_00_15 vadd.i64 q3,q14 vshr.u64 d25,d22,#18 @ from NEON_00_15 veor q15,q13 @ sigma0(X[i+1]) vshr.u64 d26,d22,#41 @ from NEON_00_15 vadd.i64 q3,q15 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d22,#50 vsli.64 d25,d22,#46 vmov d29,d22 vsli.64 d26,d22,#23 #if 22<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d23,d16 @ Ch(e,f,g) vshr.u64 d24,d18,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d17 vshr.u64 d25,d18,#34 vsli.64 d24,d18,#36 vadd.i64 d27,d26 vshr.u64 d26,d18,#39 vadd.i64 d28,d6 vsli.64 d25,d18,#30 veor d30,d18,d19 vsli.64 d26,d18,#25 veor d17,d24,d25 vadd.i64 d27,d28 vbsl d30,d20,d19 @ Maj(a,b,c) veor d17,d26 @ Sigma0(a) vadd.i64 d21,d27 vadd.i64 d30,d27 @ vadd.i64 d17,d30 vshr.u64 d24,d21,#14 @ 23 #if 23<16 vld1.64 {d7},[r1]! @ handles unaligned #endif vshr.u64 d25,d21,#18 #if 23>0 vadd.i64 d17,d30 @ h+=Maj from the past #endif vshr.u64 d26,d21,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d21,#50 vsli.64 d25,d21,#46 vmov d29,d21 vsli.64 d26,d21,#23 #if 23<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d22,d23 @ Ch(e,f,g) vshr.u64 d24,d17,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d16 vshr.u64 d25,d17,#34 vsli.64 d24,d17,#36 vadd.i64 d27,d26 vshr.u64 d26,d17,#39 vadd.i64 d28,d7 vsli.64 d25,d17,#30 veor d30,d17,d18 vsli.64 d26,d17,#25 veor d16,d24,d25 vadd.i64 d27,d28 vbsl d30,d19,d18 @ Maj(a,b,c) veor d16,d26 @ Sigma0(a) vadd.i64 d20,d27 vadd.i64 d30,d27 @ vadd.i64 d16,d30 vshr.u64 q12,q3,#19 vshr.u64 q13,q3,#61 vadd.i64 d16,d30 @ h+=Maj from the past vshr.u64 q15,q3,#6 vsli.64 q12,q3,#45 vext.8 q14,q4,q5,#8 @ X[i+1] vsli.64 q13,q3,#3 veor q15,q12 vshr.u64 q12,q14,#1 veor q15,q13 @ sigma1(X[i+14]) vshr.u64 q13,q14,#8 vadd.i64 q4,q15 vshr.u64 q15,q14,#7 vsli.64 q12,q14,#63 vsli.64 q13,q14,#56 vext.8 q14,q0,q1,#8 @ X[i+9] veor q15,q12 vshr.u64 d24,d20,#14 @ from NEON_00_15 vadd.i64 q4,q14 vshr.u64 d25,d20,#18 @ from NEON_00_15 veor q15,q13 @ sigma0(X[i+1]) vshr.u64 d26,d20,#41 @ from NEON_00_15 vadd.i64 q4,q15 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d20,#50 vsli.64 d25,d20,#46 vmov d29,d20 vsli.64 d26,d20,#23 #if 24<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d21,d22 @ Ch(e,f,g) vshr.u64 d24,d16,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d23 vshr.u64 d25,d16,#34 vsli.64 d24,d16,#36 vadd.i64 d27,d26 vshr.u64 d26,d16,#39 vadd.i64 d28,d8 vsli.64 d25,d16,#30 veor d30,d16,d17 vsli.64 d26,d16,#25 veor d23,d24,d25 vadd.i64 d27,d28 vbsl d30,d18,d17 @ Maj(a,b,c) veor d23,d26 @ Sigma0(a) vadd.i64 d19,d27 vadd.i64 d30,d27 @ vadd.i64 d23,d30 vshr.u64 d24,d19,#14 @ 25 #if 25<16 vld1.64 {d9},[r1]! 
@ handles unaligned #endif vshr.u64 d25,d19,#18 #if 25>0 vadd.i64 d23,d30 @ h+=Maj from the past #endif vshr.u64 d26,d19,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d19,#50 vsli.64 d25,d19,#46 vmov d29,d19 vsli.64 d26,d19,#23 #if 25<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d20,d21 @ Ch(e,f,g) vshr.u64 d24,d23,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d22 vshr.u64 d25,d23,#34 vsli.64 d24,d23,#36 vadd.i64 d27,d26 vshr.u64 d26,d23,#39 vadd.i64 d28,d9 vsli.64 d25,d23,#30 veor d30,d23,d16 vsli.64 d26,d23,#25 veor d22,d24,d25 vadd.i64 d27,d28 vbsl d30,d17,d16 @ Maj(a,b,c) veor d22,d26 @ Sigma0(a) vadd.i64 d18,d27 vadd.i64 d30,d27 @ vadd.i64 d22,d30 vshr.u64 q12,q4,#19 vshr.u64 q13,q4,#61 vadd.i64 d22,d30 @ h+=Maj from the past vshr.u64 q15,q4,#6 vsli.64 q12,q4,#45 vext.8 q14,q5,q6,#8 @ X[i+1] vsli.64 q13,q4,#3 veor q15,q12 vshr.u64 q12,q14,#1 veor q15,q13 @ sigma1(X[i+14]) vshr.u64 q13,q14,#8 vadd.i64 q5,q15 vshr.u64 q15,q14,#7 vsli.64 q12,q14,#63 vsli.64 q13,q14,#56 vext.8 q14,q1,q2,#8 @ X[i+9] veor q15,q12 vshr.u64 d24,d18,#14 @ from NEON_00_15 vadd.i64 q5,q14 vshr.u64 d25,d18,#18 @ from NEON_00_15 veor q15,q13 @ sigma0(X[i+1]) vshr.u64 d26,d18,#41 @ from NEON_00_15 vadd.i64 q5,q15 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d18,#50 vsli.64 d25,d18,#46 vmov d29,d18 vsli.64 d26,d18,#23 #if 26<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d19,d20 @ Ch(e,f,g) vshr.u64 d24,d22,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d21 vshr.u64 d25,d22,#34 vsli.64 d24,d22,#36 vadd.i64 d27,d26 vshr.u64 d26,d22,#39 vadd.i64 d28,d10 vsli.64 d25,d22,#30 veor d30,d22,d23 vsli.64 d26,d22,#25 veor d21,d24,d25 vadd.i64 d27,d28 vbsl d30,d16,d23 @ Maj(a,b,c) veor d21,d26 @ Sigma0(a) vadd.i64 d17,d27 vadd.i64 d30,d27 @ vadd.i64 d21,d30 vshr.u64 d24,d17,#14 @ 27 #if 27<16 vld1.64 {d11},[r1]! @ handles unaligned #endif vshr.u64 d25,d17,#18 #if 27>0 vadd.i64 d21,d30 @ h+=Maj from the past #endif vshr.u64 d26,d17,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d17,#50 vsli.64 d25,d17,#46 vmov d29,d17 vsli.64 d26,d17,#23 #if 27<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d18,d19 @ Ch(e,f,g) vshr.u64 d24,d21,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d20 vshr.u64 d25,d21,#34 vsli.64 d24,d21,#36 vadd.i64 d27,d26 vshr.u64 d26,d21,#39 vadd.i64 d28,d11 vsli.64 d25,d21,#30 veor d30,d21,d22 vsli.64 d26,d21,#25 veor d20,d24,d25 vadd.i64 d27,d28 vbsl d30,d23,d22 @ Maj(a,b,c) veor d20,d26 @ Sigma0(a) vadd.i64 d16,d27 vadd.i64 d30,d27 @ vadd.i64 d20,d30 vshr.u64 q12,q5,#19 vshr.u64 q13,q5,#61 vadd.i64 d20,d30 @ h+=Maj from the past vshr.u64 q15,q5,#6 vsli.64 q12,q5,#45 vext.8 q14,q6,q7,#8 @ X[i+1] vsli.64 q13,q5,#3 veor q15,q12 vshr.u64 q12,q14,#1 veor q15,q13 @ sigma1(X[i+14]) vshr.u64 q13,q14,#8 vadd.i64 q6,q15 vshr.u64 q15,q14,#7 vsli.64 q12,q14,#63 vsli.64 q13,q14,#56 vext.8 q14,q2,q3,#8 @ X[i+9] veor q15,q12 vshr.u64 d24,d16,#14 @ from NEON_00_15 vadd.i64 q6,q14 vshr.u64 d25,d16,#18 @ from NEON_00_15 veor q15,q13 @ sigma0(X[i+1]) vshr.u64 d26,d16,#41 @ from NEON_00_15 vadd.i64 q6,q15 vld1.64 {d28},[r3,:64]! 
@ K[i++] vsli.64 d24,d16,#50 vsli.64 d25,d16,#46 vmov d29,d16 vsli.64 d26,d16,#23 #if 28<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d17,d18 @ Ch(e,f,g) vshr.u64 d24,d20,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d19 vshr.u64 d25,d20,#34 vsli.64 d24,d20,#36 vadd.i64 d27,d26 vshr.u64 d26,d20,#39 vadd.i64 d28,d12 vsli.64 d25,d20,#30 veor d30,d20,d21 vsli.64 d26,d20,#25 veor d19,d24,d25 vadd.i64 d27,d28 vbsl d30,d22,d21 @ Maj(a,b,c) veor d19,d26 @ Sigma0(a) vadd.i64 d23,d27 vadd.i64 d30,d27 @ vadd.i64 d19,d30 vshr.u64 d24,d23,#14 @ 29 #if 29<16 vld1.64 {d13},[r1]! @ handles unaligned #endif vshr.u64 d25,d23,#18 #if 29>0 vadd.i64 d19,d30 @ h+=Maj from the past #endif vshr.u64 d26,d23,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d23,#50 vsli.64 d25,d23,#46 vmov d29,d23 vsli.64 d26,d23,#23 #if 29<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d16,d17 @ Ch(e,f,g) vshr.u64 d24,d19,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d18 vshr.u64 d25,d19,#34 vsli.64 d24,d19,#36 vadd.i64 d27,d26 vshr.u64 d26,d19,#39 vadd.i64 d28,d13 vsli.64 d25,d19,#30 veor d30,d19,d20 vsli.64 d26,d19,#25 veor d18,d24,d25 vadd.i64 d27,d28 vbsl d30,d21,d20 @ Maj(a,b,c) veor d18,d26 @ Sigma0(a) vadd.i64 d22,d27 vadd.i64 d30,d27 @ vadd.i64 d18,d30 vshr.u64 q12,q6,#19 vshr.u64 q13,q6,#61 vadd.i64 d18,d30 @ h+=Maj from the past vshr.u64 q15,q6,#6 vsli.64 q12,q6,#45 vext.8 q14,q7,q0,#8 @ X[i+1] vsli.64 q13,q6,#3 veor q15,q12 vshr.u64 q12,q14,#1 veor q15,q13 @ sigma1(X[i+14]) vshr.u64 q13,q14,#8 vadd.i64 q7,q15 vshr.u64 q15,q14,#7 vsli.64 q12,q14,#63 vsli.64 q13,q14,#56 vext.8 q14,q3,q4,#8 @ X[i+9] veor q15,q12 vshr.u64 d24,d22,#14 @ from NEON_00_15 vadd.i64 q7,q14 vshr.u64 d25,d22,#18 @ from NEON_00_15 veor q15,q13 @ sigma0(X[i+1]) vshr.u64 d26,d22,#41 @ from NEON_00_15 vadd.i64 q7,q15 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d22,#50 vsli.64 d25,d22,#46 vmov d29,d22 vsli.64 d26,d22,#23 #if 30<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d23,d16 @ Ch(e,f,g) vshr.u64 d24,d18,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d17 vshr.u64 d25,d18,#34 vsli.64 d24,d18,#36 vadd.i64 d27,d26 vshr.u64 d26,d18,#39 vadd.i64 d28,d14 vsli.64 d25,d18,#30 veor d30,d18,d19 vsli.64 d26,d18,#25 veor d17,d24,d25 vadd.i64 d27,d28 vbsl d30,d20,d19 @ Maj(a,b,c) veor d17,d26 @ Sigma0(a) vadd.i64 d21,d27 vadd.i64 d30,d27 @ vadd.i64 d17,d30 vshr.u64 d24,d21,#14 @ 31 #if 31<16 vld1.64 {d15},[r1]! @ handles unaligned #endif vshr.u64 d25,d21,#18 #if 31>0 vadd.i64 d17,d30 @ h+=Maj from the past #endif vshr.u64 d26,d21,#41 vld1.64 {d28},[r3,:64]! 
@ K[i++] vsli.64 d24,d21,#50 vsli.64 d25,d21,#46 vmov d29,d21 vsli.64 d26,d21,#23 #if 31<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d22,d23 @ Ch(e,f,g) vshr.u64 d24,d17,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d16 vshr.u64 d25,d17,#34 vsli.64 d24,d17,#36 vadd.i64 d27,d26 vshr.u64 d26,d17,#39 vadd.i64 d28,d15 vsli.64 d25,d17,#30 veor d30,d17,d18 vsli.64 d26,d17,#25 veor d16,d24,d25 vadd.i64 d27,d28 vbsl d30,d19,d18 @ Maj(a,b,c) veor d16,d26 @ Sigma0(a) vadd.i64 d20,d27 vadd.i64 d30,d27 @ vadd.i64 d16,d30 bne .L16_79_neon vadd.i64 d16,d30 @ h+=Maj from the past vldmia r0,{d24,d25,d26,d27,d28,d29,d30,d31} @ load context to temp vadd.i64 q8,q12 @ vectorized accumulate vadd.i64 q9,q13 vadd.i64 q10,q14 vadd.i64 q11,q15 vstmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ save context teq r1,r2 sub r3,#640 @ rewind K512 bne .Loop_neon VFP_ABI_POP bx lr @ .word 0xe12fff1e .size sha512_block_data_order_neon,.-sha512_block_data_order_neon #endif .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) ring-0.17.14/pregenerated/sha512-armv8-ios64.S000064400000000000000000001376001046102023000165350ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) // Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // ==================================================================== // Written by Andy Polyakov for the OpenSSL // project. // ==================================================================== // // SHA256/512 for ARMv8. // // Performance in cycles per processed byte and improvement coefficient // over code generated with "default" compiler: // // SHA256-hw SHA256(*) SHA512 // Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) // Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) // Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) // Denver 2.01 10.5 (+26%) 6.70 (+8%) // X-Gene 20.0 (+100%) 12.8 (+300%(***)) // Mongoose 2.36 13.0 (+50%) 8.36 (+33%) // Kryo 1.92 17.4 (+30%) 11.2 (+8%) // // (*) Software SHA256 results are of lesser relevance, presented // mostly for informational purposes. // (**) The result is a trade-off: it's possible to improve it by // 10% (or by 1 cycle per round), but at the cost of 20% loss // on Cortex-A53 (or by 4 cycles per round). // (***) Super-impressive coefficients over gcc-generated code are // indication of some compiler "pathology", most notably code // generated with -mgeneral-regs-only is significantly faster // and the gap is only 40-90%. 
#ifndef __KERNEL__ #endif .text .globl _sha512_block_data_order_nohw .private_extern _sha512_block_data_order_nohw .align 6 _sha512_block_data_order_nohw: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-128]! add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] sub sp,sp,#4*8 ldp x20,x21,[x0] // load context ldp x22,x23,[x0,#2*8] ldp x24,x25,[x0,#4*8] add x2,x1,x2,lsl#7 // end of input ldp x26,x27,[x0,#6*8] adrp x30,LK512@PAGE add x30,x30,LK512@PAGEOFF stp x0,x2,[x29,#96] Loop: ldp x3,x4,[x1],#2*8 ldr x19,[x30],#8 // *K++ eor x28,x21,x22 // magic seed str x1,[x29,#112] #ifndef __AARCH64EB__ rev x3,x3 // 0 #endif ror x16,x24,#14 add x27,x27,x19 // h+=K[i] eor x6,x24,x24,ror#23 and x17,x25,x24 bic x19,x26,x24 add x27,x27,x3 // h+=X[i] orr x17,x17,x19 // Ch(e,f,g) eor x19,x20,x21 // a^b, b^c in next round eor x16,x16,x6,ror#18 // Sigma1(e) ror x6,x20,#28 add x27,x27,x17 // h+=Ch(e,f,g) eor x17,x20,x20,ror#5 add x27,x27,x16 // h+=Sigma1(e) and x28,x28,x19 // (b^c)&=(a^b) add x23,x23,x27 // d+=h eor x28,x28,x21 // Maj(a,b,c) eor x17,x6,x17,ror#34 // Sigma0(a) add x27,x27,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round //add x27,x27,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x4,x4 // 1 #endif ldp x5,x6,[x1],#2*8 add x27,x27,x17 // h+=Sigma0(a) ror x16,x23,#14 add x26,x26,x28 // h+=K[i] eor x7,x23,x23,ror#23 and x17,x24,x23 bic x28,x25,x23 add x26,x26,x4 // h+=X[i] orr x17,x17,x28 // Ch(e,f,g) eor x28,x27,x20 // a^b, b^c in next round eor x16,x16,x7,ror#18 // Sigma1(e) ror x7,x27,#28 add x26,x26,x17 // h+=Ch(e,f,g) eor x17,x27,x27,ror#5 add x26,x26,x16 // h+=Sigma1(e) and x19,x19,x28 // (b^c)&=(a^b) add x22,x22,x26 // d+=h eor x19,x19,x20 // Maj(a,b,c) eor x17,x7,x17,ror#34 // Sigma0(a) add x26,x26,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round //add x26,x26,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x5,x5 // 2 #endif add x26,x26,x17 // h+=Sigma0(a) ror x16,x22,#14 add x25,x25,x19 // h+=K[i] eor x8,x22,x22,ror#23 and x17,x23,x22 bic x19,x24,x22 add x25,x25,x5 // h+=X[i] orr x17,x17,x19 // Ch(e,f,g) eor x19,x26,x27 // a^b, b^c in next round eor x16,x16,x8,ror#18 // Sigma1(e) ror x8,x26,#28 add x25,x25,x17 // h+=Ch(e,f,g) eor x17,x26,x26,ror#5 add x25,x25,x16 // h+=Sigma1(e) and x28,x28,x19 // (b^c)&=(a^b) add x21,x21,x25 // d+=h eor x28,x28,x27 // Maj(a,b,c) eor x17,x8,x17,ror#34 // Sigma0(a) add x25,x25,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round //add x25,x25,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x6,x6 // 3 #endif ldp x7,x8,[x1],#2*8 add x25,x25,x17 // h+=Sigma0(a) ror x16,x21,#14 add x24,x24,x28 // h+=K[i] eor x9,x21,x21,ror#23 and x17,x22,x21 bic x28,x23,x21 add x24,x24,x6 // h+=X[i] orr x17,x17,x28 // Ch(e,f,g) eor x28,x25,x26 // a^b, b^c in next round eor x16,x16,x9,ror#18 // Sigma1(e) ror x9,x25,#28 add x24,x24,x17 // h+=Ch(e,f,g) eor x17,x25,x25,ror#5 add x24,x24,x16 // h+=Sigma1(e) and x19,x19,x28 // (b^c)&=(a^b) add x20,x20,x24 // d+=h eor x19,x19,x26 // Maj(a,b,c) eor x17,x9,x17,ror#34 // Sigma0(a) add x24,x24,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round //add x24,x24,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x7,x7 // 4 #endif add x24,x24,x17 // h+=Sigma0(a) ror x16,x20,#14 add x23,x23,x19 // h+=K[i] eor x10,x20,x20,ror#23 and x17,x21,x20 bic x19,x22,x20 add x23,x23,x7 // h+=X[i] orr x17,x17,x19 // Ch(e,f,g) eor x19,x24,x25 // a^b, b^c in next round eor x16,x16,x10,ror#18 // Sigma1(e) ror x10,x24,#28 add x23,x23,x17 // h+=Ch(e,f,g) 
eor x17,x24,x24,ror#5 add x23,x23,x16 // h+=Sigma1(e) and x28,x28,x19 // (b^c)&=(a^b) add x27,x27,x23 // d+=h eor x28,x28,x25 // Maj(a,b,c) eor x17,x10,x17,ror#34 // Sigma0(a) add x23,x23,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round //add x23,x23,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x8,x8 // 5 #endif ldp x9,x10,[x1],#2*8 add x23,x23,x17 // h+=Sigma0(a) ror x16,x27,#14 add x22,x22,x28 // h+=K[i] eor x11,x27,x27,ror#23 and x17,x20,x27 bic x28,x21,x27 add x22,x22,x8 // h+=X[i] orr x17,x17,x28 // Ch(e,f,g) eor x28,x23,x24 // a^b, b^c in next round eor x16,x16,x11,ror#18 // Sigma1(e) ror x11,x23,#28 add x22,x22,x17 // h+=Ch(e,f,g) eor x17,x23,x23,ror#5 add x22,x22,x16 // h+=Sigma1(e) and x19,x19,x28 // (b^c)&=(a^b) add x26,x26,x22 // d+=h eor x19,x19,x24 // Maj(a,b,c) eor x17,x11,x17,ror#34 // Sigma0(a) add x22,x22,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round //add x22,x22,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x9,x9 // 6 #endif add x22,x22,x17 // h+=Sigma0(a) ror x16,x26,#14 add x21,x21,x19 // h+=K[i] eor x12,x26,x26,ror#23 and x17,x27,x26 bic x19,x20,x26 add x21,x21,x9 // h+=X[i] orr x17,x17,x19 // Ch(e,f,g) eor x19,x22,x23 // a^b, b^c in next round eor x16,x16,x12,ror#18 // Sigma1(e) ror x12,x22,#28 add x21,x21,x17 // h+=Ch(e,f,g) eor x17,x22,x22,ror#5 add x21,x21,x16 // h+=Sigma1(e) and x28,x28,x19 // (b^c)&=(a^b) add x25,x25,x21 // d+=h eor x28,x28,x23 // Maj(a,b,c) eor x17,x12,x17,ror#34 // Sigma0(a) add x21,x21,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round //add x21,x21,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x10,x10 // 7 #endif ldp x11,x12,[x1],#2*8 add x21,x21,x17 // h+=Sigma0(a) ror x16,x25,#14 add x20,x20,x28 // h+=K[i] eor x13,x25,x25,ror#23 and x17,x26,x25 bic x28,x27,x25 add x20,x20,x10 // h+=X[i] orr x17,x17,x28 // Ch(e,f,g) eor x28,x21,x22 // a^b, b^c in next round eor x16,x16,x13,ror#18 // Sigma1(e) ror x13,x21,#28 add x20,x20,x17 // h+=Ch(e,f,g) eor x17,x21,x21,ror#5 add x20,x20,x16 // h+=Sigma1(e) and x19,x19,x28 // (b^c)&=(a^b) add x24,x24,x20 // d+=h eor x19,x19,x22 // Maj(a,b,c) eor x17,x13,x17,ror#34 // Sigma0(a) add x20,x20,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round //add x20,x20,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x11,x11 // 8 #endif add x20,x20,x17 // h+=Sigma0(a) ror x16,x24,#14 add x27,x27,x19 // h+=K[i] eor x14,x24,x24,ror#23 and x17,x25,x24 bic x19,x26,x24 add x27,x27,x11 // h+=X[i] orr x17,x17,x19 // Ch(e,f,g) eor x19,x20,x21 // a^b, b^c in next round eor x16,x16,x14,ror#18 // Sigma1(e) ror x14,x20,#28 add x27,x27,x17 // h+=Ch(e,f,g) eor x17,x20,x20,ror#5 add x27,x27,x16 // h+=Sigma1(e) and x28,x28,x19 // (b^c)&=(a^b) add x23,x23,x27 // d+=h eor x28,x28,x21 // Maj(a,b,c) eor x17,x14,x17,ror#34 // Sigma0(a) add x27,x27,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round //add x27,x27,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x12,x12 // 9 #endif ldp x13,x14,[x1],#2*8 add x27,x27,x17 // h+=Sigma0(a) ror x16,x23,#14 add x26,x26,x28 // h+=K[i] eor x15,x23,x23,ror#23 and x17,x24,x23 bic x28,x25,x23 add x26,x26,x12 // h+=X[i] orr x17,x17,x28 // Ch(e,f,g) eor x28,x27,x20 // a^b, b^c in next round eor x16,x16,x15,ror#18 // Sigma1(e) ror x15,x27,#28 add x26,x26,x17 // h+=Ch(e,f,g) eor x17,x27,x27,ror#5 add x26,x26,x16 // h+=Sigma1(e) and x19,x19,x28 // (b^c)&=(a^b) add x22,x22,x26 // d+=h eor x19,x19,x20 // Maj(a,b,c) eor x17,x15,x17,ror#34 // Sigma0(a) add x26,x26,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round //add x26,x26,x17 
// h+=Sigma0(a) #ifndef __AARCH64EB__ rev x13,x13 // 10 #endif add x26,x26,x17 // h+=Sigma0(a) ror x16,x22,#14 add x25,x25,x19 // h+=K[i] eor x0,x22,x22,ror#23 and x17,x23,x22 bic x19,x24,x22 add x25,x25,x13 // h+=X[i] orr x17,x17,x19 // Ch(e,f,g) eor x19,x26,x27 // a^b, b^c in next round eor x16,x16,x0,ror#18 // Sigma1(e) ror x0,x26,#28 add x25,x25,x17 // h+=Ch(e,f,g) eor x17,x26,x26,ror#5 add x25,x25,x16 // h+=Sigma1(e) and x28,x28,x19 // (b^c)&=(a^b) add x21,x21,x25 // d+=h eor x28,x28,x27 // Maj(a,b,c) eor x17,x0,x17,ror#34 // Sigma0(a) add x25,x25,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round //add x25,x25,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x14,x14 // 11 #endif ldp x15,x0,[x1],#2*8 add x25,x25,x17 // h+=Sigma0(a) str x6,[sp,#24] ror x16,x21,#14 add x24,x24,x28 // h+=K[i] eor x6,x21,x21,ror#23 and x17,x22,x21 bic x28,x23,x21 add x24,x24,x14 // h+=X[i] orr x17,x17,x28 // Ch(e,f,g) eor x28,x25,x26 // a^b, b^c in next round eor x16,x16,x6,ror#18 // Sigma1(e) ror x6,x25,#28 add x24,x24,x17 // h+=Ch(e,f,g) eor x17,x25,x25,ror#5 add x24,x24,x16 // h+=Sigma1(e) and x19,x19,x28 // (b^c)&=(a^b) add x20,x20,x24 // d+=h eor x19,x19,x26 // Maj(a,b,c) eor x17,x6,x17,ror#34 // Sigma0(a) add x24,x24,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round //add x24,x24,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x15,x15 // 12 #endif add x24,x24,x17 // h+=Sigma0(a) str x7,[sp,#0] ror x16,x20,#14 add x23,x23,x19 // h+=K[i] eor x7,x20,x20,ror#23 and x17,x21,x20 bic x19,x22,x20 add x23,x23,x15 // h+=X[i] orr x17,x17,x19 // Ch(e,f,g) eor x19,x24,x25 // a^b, b^c in next round eor x16,x16,x7,ror#18 // Sigma1(e) ror x7,x24,#28 add x23,x23,x17 // h+=Ch(e,f,g) eor x17,x24,x24,ror#5 add x23,x23,x16 // h+=Sigma1(e) and x28,x28,x19 // (b^c)&=(a^b) add x27,x27,x23 // d+=h eor x28,x28,x25 // Maj(a,b,c) eor x17,x7,x17,ror#34 // Sigma0(a) add x23,x23,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round //add x23,x23,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x0,x0 // 13 #endif ldp x1,x2,[x1] add x23,x23,x17 // h+=Sigma0(a) str x8,[sp,#8] ror x16,x27,#14 add x22,x22,x28 // h+=K[i] eor x8,x27,x27,ror#23 and x17,x20,x27 bic x28,x21,x27 add x22,x22,x0 // h+=X[i] orr x17,x17,x28 // Ch(e,f,g) eor x28,x23,x24 // a^b, b^c in next round eor x16,x16,x8,ror#18 // Sigma1(e) ror x8,x23,#28 add x22,x22,x17 // h+=Ch(e,f,g) eor x17,x23,x23,ror#5 add x22,x22,x16 // h+=Sigma1(e) and x19,x19,x28 // (b^c)&=(a^b) add x26,x26,x22 // d+=h eor x19,x19,x24 // Maj(a,b,c) eor x17,x8,x17,ror#34 // Sigma0(a) add x22,x22,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round //add x22,x22,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x1,x1 // 14 #endif ldr x6,[sp,#24] add x22,x22,x17 // h+=Sigma0(a) str x9,[sp,#16] ror x16,x26,#14 add x21,x21,x19 // h+=K[i] eor x9,x26,x26,ror#23 and x17,x27,x26 bic x19,x20,x26 add x21,x21,x1 // h+=X[i] orr x17,x17,x19 // Ch(e,f,g) eor x19,x22,x23 // a^b, b^c in next round eor x16,x16,x9,ror#18 // Sigma1(e) ror x9,x22,#28 add x21,x21,x17 // h+=Ch(e,f,g) eor x17,x22,x22,ror#5 add x21,x21,x16 // h+=Sigma1(e) and x28,x28,x19 // (b^c)&=(a^b) add x25,x25,x21 // d+=h eor x28,x28,x23 // Maj(a,b,c) eor x17,x9,x17,ror#34 // Sigma0(a) add x21,x21,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round //add x21,x21,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x2,x2 // 15 #endif ldr x7,[sp,#0] add x21,x21,x17 // h+=Sigma0(a) str x10,[sp,#24] ror x16,x25,#14 add x20,x20,x28 // h+=K[i] ror x9,x4,#1 and x17,x26,x25 ror x8,x1,#19 bic x28,x27,x25 ror 
x10,x21,#28 add x20,x20,x2 // h+=X[i] eor x16,x16,x25,ror#18 eor x9,x9,x4,ror#8 orr x17,x17,x28 // Ch(e,f,g) eor x28,x21,x22 // a^b, b^c in next round eor x16,x16,x25,ror#41 // Sigma1(e) eor x10,x10,x21,ror#34 add x20,x20,x17 // h+=Ch(e,f,g) and x19,x19,x28 // (b^c)&=(a^b) eor x8,x8,x1,ror#61 eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) add x20,x20,x16 // h+=Sigma1(e) eor x19,x19,x22 // Maj(a,b,c) eor x17,x10,x21,ror#39 // Sigma0(a) eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) add x3,x3,x12 add x24,x24,x20 // d+=h add x20,x20,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round add x3,x3,x9 add x20,x20,x17 // h+=Sigma0(a) add x3,x3,x8 Loop_16_xx: ldr x8,[sp,#8] str x11,[sp,#0] ror x16,x24,#14 add x27,x27,x19 // h+=K[i] ror x10,x5,#1 and x17,x25,x24 ror x9,x2,#19 bic x19,x26,x24 ror x11,x20,#28 add x27,x27,x3 // h+=X[i] eor x16,x16,x24,ror#18 eor x10,x10,x5,ror#8 orr x17,x17,x19 // Ch(e,f,g) eor x19,x20,x21 // a^b, b^c in next round eor x16,x16,x24,ror#41 // Sigma1(e) eor x11,x11,x20,ror#34 add x27,x27,x17 // h+=Ch(e,f,g) and x28,x28,x19 // (b^c)&=(a^b) eor x9,x9,x2,ror#61 eor x10,x10,x5,lsr#7 // sigma0(X[i+1]) add x27,x27,x16 // h+=Sigma1(e) eor x28,x28,x21 // Maj(a,b,c) eor x17,x11,x20,ror#39 // Sigma0(a) eor x9,x9,x2,lsr#6 // sigma1(X[i+14]) add x4,x4,x13 add x23,x23,x27 // d+=h add x27,x27,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round add x4,x4,x10 add x27,x27,x17 // h+=Sigma0(a) add x4,x4,x9 ldr x9,[sp,#16] str x12,[sp,#8] ror x16,x23,#14 add x26,x26,x28 // h+=K[i] ror x11,x6,#1 and x17,x24,x23 ror x10,x3,#19 bic x28,x25,x23 ror x12,x27,#28 add x26,x26,x4 // h+=X[i] eor x16,x16,x23,ror#18 eor x11,x11,x6,ror#8 orr x17,x17,x28 // Ch(e,f,g) eor x28,x27,x20 // a^b, b^c in next round eor x16,x16,x23,ror#41 // Sigma1(e) eor x12,x12,x27,ror#34 add x26,x26,x17 // h+=Ch(e,f,g) and x19,x19,x28 // (b^c)&=(a^b) eor x10,x10,x3,ror#61 eor x11,x11,x6,lsr#7 // sigma0(X[i+1]) add x26,x26,x16 // h+=Sigma1(e) eor x19,x19,x20 // Maj(a,b,c) eor x17,x12,x27,ror#39 // Sigma0(a) eor x10,x10,x3,lsr#6 // sigma1(X[i+14]) add x5,x5,x14 add x22,x22,x26 // d+=h add x26,x26,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round add x5,x5,x11 add x26,x26,x17 // h+=Sigma0(a) add x5,x5,x10 ldr x10,[sp,#24] str x13,[sp,#16] ror x16,x22,#14 add x25,x25,x19 // h+=K[i] ror x12,x7,#1 and x17,x23,x22 ror x11,x4,#19 bic x19,x24,x22 ror x13,x26,#28 add x25,x25,x5 // h+=X[i] eor x16,x16,x22,ror#18 eor x12,x12,x7,ror#8 orr x17,x17,x19 // Ch(e,f,g) eor x19,x26,x27 // a^b, b^c in next round eor x16,x16,x22,ror#41 // Sigma1(e) eor x13,x13,x26,ror#34 add x25,x25,x17 // h+=Ch(e,f,g) and x28,x28,x19 // (b^c)&=(a^b) eor x11,x11,x4,ror#61 eor x12,x12,x7,lsr#7 // sigma0(X[i+1]) add x25,x25,x16 // h+=Sigma1(e) eor x28,x28,x27 // Maj(a,b,c) eor x17,x13,x26,ror#39 // Sigma0(a) eor x11,x11,x4,lsr#6 // sigma1(X[i+14]) add x6,x6,x15 add x21,x21,x25 // d+=h add x25,x25,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round add x6,x6,x12 add x25,x25,x17 // h+=Sigma0(a) add x6,x6,x11 ldr x11,[sp,#0] str x14,[sp,#24] ror x16,x21,#14 add x24,x24,x28 // h+=K[i] ror x13,x8,#1 and x17,x22,x21 ror x12,x5,#19 bic x28,x23,x21 ror x14,x25,#28 add x24,x24,x6 // h+=X[i] eor x16,x16,x21,ror#18 eor x13,x13,x8,ror#8 orr x17,x17,x28 // Ch(e,f,g) eor x28,x25,x26 // a^b, b^c in next round eor x16,x16,x21,ror#41 // Sigma1(e) eor x14,x14,x25,ror#34 add x24,x24,x17 // h+=Ch(e,f,g) and x19,x19,x28 // (b^c)&=(a^b) eor x12,x12,x5,ror#61 eor x13,x13,x8,lsr#7 // sigma0(X[i+1]) add x24,x24,x16 // h+=Sigma1(e) eor x19,x19,x26 // 
Maj(a,b,c) eor x17,x14,x25,ror#39 // Sigma0(a) eor x12,x12,x5,lsr#6 // sigma1(X[i+14]) add x7,x7,x0 add x20,x20,x24 // d+=h add x24,x24,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round add x7,x7,x13 add x24,x24,x17 // h+=Sigma0(a) add x7,x7,x12 ldr x12,[sp,#8] str x15,[sp,#0] ror x16,x20,#14 add x23,x23,x19 // h+=K[i] ror x14,x9,#1 and x17,x21,x20 ror x13,x6,#19 bic x19,x22,x20 ror x15,x24,#28 add x23,x23,x7 // h+=X[i] eor x16,x16,x20,ror#18 eor x14,x14,x9,ror#8 orr x17,x17,x19 // Ch(e,f,g) eor x19,x24,x25 // a^b, b^c in next round eor x16,x16,x20,ror#41 // Sigma1(e) eor x15,x15,x24,ror#34 add x23,x23,x17 // h+=Ch(e,f,g) and x28,x28,x19 // (b^c)&=(a^b) eor x13,x13,x6,ror#61 eor x14,x14,x9,lsr#7 // sigma0(X[i+1]) add x23,x23,x16 // h+=Sigma1(e) eor x28,x28,x25 // Maj(a,b,c) eor x17,x15,x24,ror#39 // Sigma0(a) eor x13,x13,x6,lsr#6 // sigma1(X[i+14]) add x8,x8,x1 add x27,x27,x23 // d+=h add x23,x23,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round add x8,x8,x14 add x23,x23,x17 // h+=Sigma0(a) add x8,x8,x13 ldr x13,[sp,#16] str x0,[sp,#8] ror x16,x27,#14 add x22,x22,x28 // h+=K[i] ror x15,x10,#1 and x17,x20,x27 ror x14,x7,#19 bic x28,x21,x27 ror x0,x23,#28 add x22,x22,x8 // h+=X[i] eor x16,x16,x27,ror#18 eor x15,x15,x10,ror#8 orr x17,x17,x28 // Ch(e,f,g) eor x28,x23,x24 // a^b, b^c in next round eor x16,x16,x27,ror#41 // Sigma1(e) eor x0,x0,x23,ror#34 add x22,x22,x17 // h+=Ch(e,f,g) and x19,x19,x28 // (b^c)&=(a^b) eor x14,x14,x7,ror#61 eor x15,x15,x10,lsr#7 // sigma0(X[i+1]) add x22,x22,x16 // h+=Sigma1(e) eor x19,x19,x24 // Maj(a,b,c) eor x17,x0,x23,ror#39 // Sigma0(a) eor x14,x14,x7,lsr#6 // sigma1(X[i+14]) add x9,x9,x2 add x26,x26,x22 // d+=h add x22,x22,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round add x9,x9,x15 add x22,x22,x17 // h+=Sigma0(a) add x9,x9,x14 ldr x14,[sp,#24] str x1,[sp,#16] ror x16,x26,#14 add x21,x21,x19 // h+=K[i] ror x0,x11,#1 and x17,x27,x26 ror x15,x8,#19 bic x19,x20,x26 ror x1,x22,#28 add x21,x21,x9 // h+=X[i] eor x16,x16,x26,ror#18 eor x0,x0,x11,ror#8 orr x17,x17,x19 // Ch(e,f,g) eor x19,x22,x23 // a^b, b^c in next round eor x16,x16,x26,ror#41 // Sigma1(e) eor x1,x1,x22,ror#34 add x21,x21,x17 // h+=Ch(e,f,g) and x28,x28,x19 // (b^c)&=(a^b) eor x15,x15,x8,ror#61 eor x0,x0,x11,lsr#7 // sigma0(X[i+1]) add x21,x21,x16 // h+=Sigma1(e) eor x28,x28,x23 // Maj(a,b,c) eor x17,x1,x22,ror#39 // Sigma0(a) eor x15,x15,x8,lsr#6 // sigma1(X[i+14]) add x10,x10,x3 add x25,x25,x21 // d+=h add x21,x21,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round add x10,x10,x0 add x21,x21,x17 // h+=Sigma0(a) add x10,x10,x15 ldr x15,[sp,#0] str x2,[sp,#24] ror x16,x25,#14 add x20,x20,x28 // h+=K[i] ror x1,x12,#1 and x17,x26,x25 ror x0,x9,#19 bic x28,x27,x25 ror x2,x21,#28 add x20,x20,x10 // h+=X[i] eor x16,x16,x25,ror#18 eor x1,x1,x12,ror#8 orr x17,x17,x28 // Ch(e,f,g) eor x28,x21,x22 // a^b, b^c in next round eor x16,x16,x25,ror#41 // Sigma1(e) eor x2,x2,x21,ror#34 add x20,x20,x17 // h+=Ch(e,f,g) and x19,x19,x28 // (b^c)&=(a^b) eor x0,x0,x9,ror#61 eor x1,x1,x12,lsr#7 // sigma0(X[i+1]) add x20,x20,x16 // h+=Sigma1(e) eor x19,x19,x22 // Maj(a,b,c) eor x17,x2,x21,ror#39 // Sigma0(a) eor x0,x0,x9,lsr#6 // sigma1(X[i+14]) add x11,x11,x4 add x24,x24,x20 // d+=h add x20,x20,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round add x11,x11,x1 add x20,x20,x17 // h+=Sigma0(a) add x11,x11,x0 ldr x0,[sp,#8] str x3,[sp,#0] ror x16,x24,#14 add x27,x27,x19 // h+=K[i] ror x2,x13,#1 and x17,x25,x24 ror x1,x10,#19 bic x19,x26,x24 ror 
x3,x20,#28 add x27,x27,x11 // h+=X[i] eor x16,x16,x24,ror#18 eor x2,x2,x13,ror#8 orr x17,x17,x19 // Ch(e,f,g) eor x19,x20,x21 // a^b, b^c in next round eor x16,x16,x24,ror#41 // Sigma1(e) eor x3,x3,x20,ror#34 add x27,x27,x17 // h+=Ch(e,f,g) and x28,x28,x19 // (b^c)&=(a^b) eor x1,x1,x10,ror#61 eor x2,x2,x13,lsr#7 // sigma0(X[i+1]) add x27,x27,x16 // h+=Sigma1(e) eor x28,x28,x21 // Maj(a,b,c) eor x17,x3,x20,ror#39 // Sigma0(a) eor x1,x1,x10,lsr#6 // sigma1(X[i+14]) add x12,x12,x5 add x23,x23,x27 // d+=h add x27,x27,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round add x12,x12,x2 add x27,x27,x17 // h+=Sigma0(a) add x12,x12,x1 ldr x1,[sp,#16] str x4,[sp,#8] ror x16,x23,#14 add x26,x26,x28 // h+=K[i] ror x3,x14,#1 and x17,x24,x23 ror x2,x11,#19 bic x28,x25,x23 ror x4,x27,#28 add x26,x26,x12 // h+=X[i] eor x16,x16,x23,ror#18 eor x3,x3,x14,ror#8 orr x17,x17,x28 // Ch(e,f,g) eor x28,x27,x20 // a^b, b^c in next round eor x16,x16,x23,ror#41 // Sigma1(e) eor x4,x4,x27,ror#34 add x26,x26,x17 // h+=Ch(e,f,g) and x19,x19,x28 // (b^c)&=(a^b) eor x2,x2,x11,ror#61 eor x3,x3,x14,lsr#7 // sigma0(X[i+1]) add x26,x26,x16 // h+=Sigma1(e) eor x19,x19,x20 // Maj(a,b,c) eor x17,x4,x27,ror#39 // Sigma0(a) eor x2,x2,x11,lsr#6 // sigma1(X[i+14]) add x13,x13,x6 add x22,x22,x26 // d+=h add x26,x26,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round add x13,x13,x3 add x26,x26,x17 // h+=Sigma0(a) add x13,x13,x2 ldr x2,[sp,#24] str x5,[sp,#16] ror x16,x22,#14 add x25,x25,x19 // h+=K[i] ror x4,x15,#1 and x17,x23,x22 ror x3,x12,#19 bic x19,x24,x22 ror x5,x26,#28 add x25,x25,x13 // h+=X[i] eor x16,x16,x22,ror#18 eor x4,x4,x15,ror#8 orr x17,x17,x19 // Ch(e,f,g) eor x19,x26,x27 // a^b, b^c in next round eor x16,x16,x22,ror#41 // Sigma1(e) eor x5,x5,x26,ror#34 add x25,x25,x17 // h+=Ch(e,f,g) and x28,x28,x19 // (b^c)&=(a^b) eor x3,x3,x12,ror#61 eor x4,x4,x15,lsr#7 // sigma0(X[i+1]) add x25,x25,x16 // h+=Sigma1(e) eor x28,x28,x27 // Maj(a,b,c) eor x17,x5,x26,ror#39 // Sigma0(a) eor x3,x3,x12,lsr#6 // sigma1(X[i+14]) add x14,x14,x7 add x21,x21,x25 // d+=h add x25,x25,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round add x14,x14,x4 add x25,x25,x17 // h+=Sigma0(a) add x14,x14,x3 ldr x3,[sp,#0] str x6,[sp,#24] ror x16,x21,#14 add x24,x24,x28 // h+=K[i] ror x5,x0,#1 and x17,x22,x21 ror x4,x13,#19 bic x28,x23,x21 ror x6,x25,#28 add x24,x24,x14 // h+=X[i] eor x16,x16,x21,ror#18 eor x5,x5,x0,ror#8 orr x17,x17,x28 // Ch(e,f,g) eor x28,x25,x26 // a^b, b^c in next round eor x16,x16,x21,ror#41 // Sigma1(e) eor x6,x6,x25,ror#34 add x24,x24,x17 // h+=Ch(e,f,g) and x19,x19,x28 // (b^c)&=(a^b) eor x4,x4,x13,ror#61 eor x5,x5,x0,lsr#7 // sigma0(X[i+1]) add x24,x24,x16 // h+=Sigma1(e) eor x19,x19,x26 // Maj(a,b,c) eor x17,x6,x25,ror#39 // Sigma0(a) eor x4,x4,x13,lsr#6 // sigma1(X[i+14]) add x15,x15,x8 add x20,x20,x24 // d+=h add x24,x24,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round add x15,x15,x5 add x24,x24,x17 // h+=Sigma0(a) add x15,x15,x4 ldr x4,[sp,#8] str x7,[sp,#0] ror x16,x20,#14 add x23,x23,x19 // h+=K[i] ror x6,x1,#1 and x17,x21,x20 ror x5,x14,#19 bic x19,x22,x20 ror x7,x24,#28 add x23,x23,x15 // h+=X[i] eor x16,x16,x20,ror#18 eor x6,x6,x1,ror#8 orr x17,x17,x19 // Ch(e,f,g) eor x19,x24,x25 // a^b, b^c in next round eor x16,x16,x20,ror#41 // Sigma1(e) eor x7,x7,x24,ror#34 add x23,x23,x17 // h+=Ch(e,f,g) and x28,x28,x19 // (b^c)&=(a^b) eor x5,x5,x14,ror#61 eor x6,x6,x1,lsr#7 // sigma0(X[i+1]) add x23,x23,x16 // h+=Sigma1(e) eor x28,x28,x25 // Maj(a,b,c) eor x17,x7,x24,ror#39 // 
Sigma0(a) eor x5,x5,x14,lsr#6 // sigma1(X[i+14]) add x0,x0,x9 add x27,x27,x23 // d+=h add x23,x23,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round add x0,x0,x6 add x23,x23,x17 // h+=Sigma0(a) add x0,x0,x5 ldr x5,[sp,#16] str x8,[sp,#8] ror x16,x27,#14 add x22,x22,x28 // h+=K[i] ror x7,x2,#1 and x17,x20,x27 ror x6,x15,#19 bic x28,x21,x27 ror x8,x23,#28 add x22,x22,x0 // h+=X[i] eor x16,x16,x27,ror#18 eor x7,x7,x2,ror#8 orr x17,x17,x28 // Ch(e,f,g) eor x28,x23,x24 // a^b, b^c in next round eor x16,x16,x27,ror#41 // Sigma1(e) eor x8,x8,x23,ror#34 add x22,x22,x17 // h+=Ch(e,f,g) and x19,x19,x28 // (b^c)&=(a^b) eor x6,x6,x15,ror#61 eor x7,x7,x2,lsr#7 // sigma0(X[i+1]) add x22,x22,x16 // h+=Sigma1(e) eor x19,x19,x24 // Maj(a,b,c) eor x17,x8,x23,ror#39 // Sigma0(a) eor x6,x6,x15,lsr#6 // sigma1(X[i+14]) add x1,x1,x10 add x26,x26,x22 // d+=h add x22,x22,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round add x1,x1,x7 add x22,x22,x17 // h+=Sigma0(a) add x1,x1,x6 ldr x6,[sp,#24] str x9,[sp,#16] ror x16,x26,#14 add x21,x21,x19 // h+=K[i] ror x8,x3,#1 and x17,x27,x26 ror x7,x0,#19 bic x19,x20,x26 ror x9,x22,#28 add x21,x21,x1 // h+=X[i] eor x16,x16,x26,ror#18 eor x8,x8,x3,ror#8 orr x17,x17,x19 // Ch(e,f,g) eor x19,x22,x23 // a^b, b^c in next round eor x16,x16,x26,ror#41 // Sigma1(e) eor x9,x9,x22,ror#34 add x21,x21,x17 // h+=Ch(e,f,g) and x28,x28,x19 // (b^c)&=(a^b) eor x7,x7,x0,ror#61 eor x8,x8,x3,lsr#7 // sigma0(X[i+1]) add x21,x21,x16 // h+=Sigma1(e) eor x28,x28,x23 // Maj(a,b,c) eor x17,x9,x22,ror#39 // Sigma0(a) eor x7,x7,x0,lsr#6 // sigma1(X[i+14]) add x2,x2,x11 add x25,x25,x21 // d+=h add x21,x21,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round add x2,x2,x8 add x21,x21,x17 // h+=Sigma0(a) add x2,x2,x7 ldr x7,[sp,#0] str x10,[sp,#24] ror x16,x25,#14 add x20,x20,x28 // h+=K[i] ror x9,x4,#1 and x17,x26,x25 ror x8,x1,#19 bic x28,x27,x25 ror x10,x21,#28 add x20,x20,x2 // h+=X[i] eor x16,x16,x25,ror#18 eor x9,x9,x4,ror#8 orr x17,x17,x28 // Ch(e,f,g) eor x28,x21,x22 // a^b, b^c in next round eor x16,x16,x25,ror#41 // Sigma1(e) eor x10,x10,x21,ror#34 add x20,x20,x17 // h+=Ch(e,f,g) and x19,x19,x28 // (b^c)&=(a^b) eor x8,x8,x1,ror#61 eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) add x20,x20,x16 // h+=Sigma1(e) eor x19,x19,x22 // Maj(a,b,c) eor x17,x10,x21,ror#39 // Sigma0(a) eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) add x3,x3,x12 add x24,x24,x20 // d+=h add x20,x20,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round add x3,x3,x9 add x20,x20,x17 // h+=Sigma0(a) add x3,x3,x8 cbnz x19,Loop_16_xx ldp x0,x2,[x29,#96] ldr x1,[x29,#112] sub x30,x30,#648 // rewind ldp x3,x4,[x0] ldp x5,x6,[x0,#2*8] add x1,x1,#14*8 // advance input pointer ldp x7,x8,[x0,#4*8] add x20,x20,x3 ldp x9,x10,[x0,#6*8] add x21,x21,x4 add x22,x22,x5 add x23,x23,x6 stp x20,x21,[x0] add x24,x24,x7 add x25,x25,x8 stp x22,x23,[x0,#2*8] add x26,x26,x9 add x27,x27,x10 cmp x1,x2 stp x24,x25,[x0,#4*8] stp x26,x27,[x0,#6*8] b.ne Loop ldp x19,x20,[x29,#16] add sp,sp,#4*8 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#128 AARCH64_VALIDATE_LINK_REGISTER ret .section __TEXT,__const .align 6 LK512: .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 
0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x81c2c92e47edaee6,0x92722c851482353b .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .quad 0 // terminator .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 .text #ifndef __KERNEL__ .globl _sha512_block_data_order_hw .private_extern _sha512_block_data_order_hw .align 6 _sha512_block_data_order_hw: // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. AARCH64_VALID_CALL_TARGET stp x29,x30,[sp,#-16]! 
add x29,sp,#0 ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context adrp x3,LK512@PAGE add x3,x3,LK512@PAGEOFF rev64 v16.16b,v16.16b rev64 v17.16b,v17.16b rev64 v18.16b,v18.16b rev64 v19.16b,v19.16b rev64 v20.16b,v20.16b rev64 v21.16b,v21.16b rev64 v22.16b,v22.16b rev64 v23.16b,v23.16b b Loop_hw .align 4 Loop_hw: ld1 {v24.2d},[x3],#16 subs x2,x2,#1 sub x4,x1,#128 orr v26.16b,v0.16b,v0.16b // offload orr v27.16b,v1.16b,v1.16b orr v28.16b,v2.16b,v2.16b orr v29.16b,v3.16b,v3.16b csel x1,x1,x4,ne // conditional rewind add v24.2d,v24.2d,v16.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v2.16b,v3.16b,#8 ext v6.16b,v1.16b,v2.16b,#8 add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" .long 0xcec08230 //sha512su0 v16.16b,v17.16b ext v7.16b,v20.16b,v21.16b,#8 .long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b .long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b add v4.2d,v1.2d,v3.2d // "D + T1" .long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b add v25.2d,v25.2d,v17.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v4.16b,v2.16b,#8 ext v6.16b,v0.16b,v4.16b,#8 add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" .long 0xcec08251 //sha512su0 v17.16b,v18.16b ext v7.16b,v21.16b,v22.16b,#8 .long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b .long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b add v1.2d,v0.2d,v2.2d // "D + T1" .long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b add v24.2d,v24.2d,v18.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v1.16b,v4.16b,#8 ext v6.16b,v3.16b,v1.16b,#8 add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" .long 0xcec08272 //sha512su0 v18.16b,v19.16b ext v7.16b,v22.16b,v23.16b,#8 .long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b .long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b add v0.2d,v3.2d,v4.2d // "D + T1" .long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b add v25.2d,v25.2d,v19.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v0.16b,v1.16b,#8 ext v6.16b,v2.16b,v0.16b,#8 add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" .long 0xcec08293 //sha512su0 v19.16b,v20.16b ext v7.16b,v23.16b,v16.16b,#8 .long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b .long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b add v3.2d,v2.2d,v1.2d // "D + T1" .long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b add v24.2d,v24.2d,v20.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v3.16b,v0.16b,#8 ext v6.16b,v4.16b,v3.16b,#8 add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" .long 0xcec082b4 //sha512su0 v20.16b,v21.16b ext v7.16b,v16.16b,v17.16b,#8 .long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b .long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b add v2.2d,v4.2d,v0.2d // "D + T1" .long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b add v25.2d,v25.2d,v21.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v2.16b,v3.16b,#8 ext v6.16b,v1.16b,v2.16b,#8 add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" .long 0xcec082d5 //sha512su0 v21.16b,v22.16b ext v7.16b,v17.16b,v18.16b,#8 .long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b .long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b add v4.2d,v1.2d,v3.2d // "D + T1" .long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b add v24.2d,v24.2d,v22.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v4.16b,v2.16b,#8 ext v6.16b,v0.16b,v4.16b,#8 add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" .long 0xcec082f6 //sha512su0 v22.16b,v23.16b ext v7.16b,v18.16b,v19.16b,#8 .long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b .long 
0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b add v1.2d,v0.2d,v2.2d // "D + T1" .long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b add v25.2d,v25.2d,v23.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v1.16b,v4.16b,#8 ext v6.16b,v3.16b,v1.16b,#8 add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" .long 0xcec08217 //sha512su0 v23.16b,v16.16b ext v7.16b,v19.16b,v20.16b,#8 .long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b .long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b add v0.2d,v3.2d,v4.2d // "D + T1" .long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b add v24.2d,v24.2d,v16.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v0.16b,v1.16b,#8 ext v6.16b,v2.16b,v0.16b,#8 add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" .long 0xcec08230 //sha512su0 v16.16b,v17.16b ext v7.16b,v20.16b,v21.16b,#8 .long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b .long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b add v3.2d,v2.2d,v1.2d // "D + T1" .long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b add v25.2d,v25.2d,v17.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v3.16b,v0.16b,#8 ext v6.16b,v4.16b,v3.16b,#8 add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" .long 0xcec08251 //sha512su0 v17.16b,v18.16b ext v7.16b,v21.16b,v22.16b,#8 .long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b .long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b add v2.2d,v4.2d,v0.2d // "D + T1" .long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b add v24.2d,v24.2d,v18.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v2.16b,v3.16b,#8 ext v6.16b,v1.16b,v2.16b,#8 add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" .long 0xcec08272 //sha512su0 v18.16b,v19.16b ext v7.16b,v22.16b,v23.16b,#8 .long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b .long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b add v4.2d,v1.2d,v3.2d // "D + T1" .long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b add v25.2d,v25.2d,v19.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v4.16b,v2.16b,#8 ext v6.16b,v0.16b,v4.16b,#8 add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" .long 0xcec08293 //sha512su0 v19.16b,v20.16b ext v7.16b,v23.16b,v16.16b,#8 .long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b .long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b add v1.2d,v0.2d,v2.2d // "D + T1" .long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b add v24.2d,v24.2d,v20.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v1.16b,v4.16b,#8 ext v6.16b,v3.16b,v1.16b,#8 add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" .long 0xcec082b4 //sha512su0 v20.16b,v21.16b ext v7.16b,v16.16b,v17.16b,#8 .long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b .long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b add v0.2d,v3.2d,v4.2d // "D + T1" .long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b add v25.2d,v25.2d,v21.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v0.16b,v1.16b,#8 ext v6.16b,v2.16b,v0.16b,#8 add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" .long 0xcec082d5 //sha512su0 v21.16b,v22.16b ext v7.16b,v17.16b,v18.16b,#8 .long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b .long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b add v3.2d,v2.2d,v1.2d // "D + T1" .long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b add v24.2d,v24.2d,v22.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v3.16b,v0.16b,#8 ext v6.16b,v4.16b,v3.16b,#8 add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" .long 0xcec082f6 //sha512su0 v22.16b,v23.16b ext v7.16b,v18.16b,v19.16b,#8 .long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b .long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b add 
v2.2d,v4.2d,v0.2d // "D + T1" .long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b add v25.2d,v25.2d,v23.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v2.16b,v3.16b,#8 ext v6.16b,v1.16b,v2.16b,#8 add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" .long 0xcec08217 //sha512su0 v23.16b,v16.16b ext v7.16b,v19.16b,v20.16b,#8 .long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b .long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b add v4.2d,v1.2d,v3.2d // "D + T1" .long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b add v24.2d,v24.2d,v16.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v4.16b,v2.16b,#8 ext v6.16b,v0.16b,v4.16b,#8 add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" .long 0xcec08230 //sha512su0 v16.16b,v17.16b ext v7.16b,v20.16b,v21.16b,#8 .long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b .long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b add v1.2d,v0.2d,v2.2d // "D + T1" .long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b add v25.2d,v25.2d,v17.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v1.16b,v4.16b,#8 ext v6.16b,v3.16b,v1.16b,#8 add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" .long 0xcec08251 //sha512su0 v17.16b,v18.16b ext v7.16b,v21.16b,v22.16b,#8 .long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b .long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b add v0.2d,v3.2d,v4.2d // "D + T1" .long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b add v24.2d,v24.2d,v18.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v0.16b,v1.16b,#8 ext v6.16b,v2.16b,v0.16b,#8 add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" .long 0xcec08272 //sha512su0 v18.16b,v19.16b ext v7.16b,v22.16b,v23.16b,#8 .long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b .long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b add v3.2d,v2.2d,v1.2d // "D + T1" .long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b add v25.2d,v25.2d,v19.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v3.16b,v0.16b,#8 ext v6.16b,v4.16b,v3.16b,#8 add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" .long 0xcec08293 //sha512su0 v19.16b,v20.16b ext v7.16b,v23.16b,v16.16b,#8 .long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b .long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b add v2.2d,v4.2d,v0.2d // "D + T1" .long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b add v24.2d,v24.2d,v20.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v2.16b,v3.16b,#8 ext v6.16b,v1.16b,v2.16b,#8 add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" .long 0xcec082b4 //sha512su0 v20.16b,v21.16b ext v7.16b,v16.16b,v17.16b,#8 .long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b .long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b add v4.2d,v1.2d,v3.2d // "D + T1" .long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b add v25.2d,v25.2d,v21.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v4.16b,v2.16b,#8 ext v6.16b,v0.16b,v4.16b,#8 add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" .long 0xcec082d5 //sha512su0 v21.16b,v22.16b ext v7.16b,v17.16b,v18.16b,#8 .long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b .long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b add v1.2d,v0.2d,v2.2d // "D + T1" .long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b add v24.2d,v24.2d,v22.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v1.16b,v4.16b,#8 ext v6.16b,v3.16b,v1.16b,#8 add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" .long 0xcec082f6 //sha512su0 v22.16b,v23.16b ext v7.16b,v18.16b,v19.16b,#8 .long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b .long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b add v0.2d,v3.2d,v4.2d // "D + T1" .long 0xce628464 //sha512h2 
v4.16b,v3.16b,v2.16b add v25.2d,v25.2d,v23.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v0.16b,v1.16b,#8 ext v6.16b,v2.16b,v0.16b,#8 add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" .long 0xcec08217 //sha512su0 v23.16b,v16.16b ext v7.16b,v19.16b,v20.16b,#8 .long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b .long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b add v3.2d,v2.2d,v1.2d // "D + T1" .long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b add v24.2d,v24.2d,v16.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v3.16b,v0.16b,#8 ext v6.16b,v4.16b,v3.16b,#8 add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" .long 0xcec08230 //sha512su0 v16.16b,v17.16b ext v7.16b,v20.16b,v21.16b,#8 .long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b .long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b add v2.2d,v4.2d,v0.2d // "D + T1" .long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b add v25.2d,v25.2d,v17.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v2.16b,v3.16b,#8 ext v6.16b,v1.16b,v2.16b,#8 add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" .long 0xcec08251 //sha512su0 v17.16b,v18.16b ext v7.16b,v21.16b,v22.16b,#8 .long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b .long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b add v4.2d,v1.2d,v3.2d // "D + T1" .long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b add v24.2d,v24.2d,v18.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v4.16b,v2.16b,#8 ext v6.16b,v0.16b,v4.16b,#8 add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" .long 0xcec08272 //sha512su0 v18.16b,v19.16b ext v7.16b,v22.16b,v23.16b,#8 .long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b .long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b add v1.2d,v0.2d,v2.2d // "D + T1" .long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b add v25.2d,v25.2d,v19.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v1.16b,v4.16b,#8 ext v6.16b,v3.16b,v1.16b,#8 add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" .long 0xcec08293 //sha512su0 v19.16b,v20.16b ext v7.16b,v23.16b,v16.16b,#8 .long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b .long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b add v0.2d,v3.2d,v4.2d // "D + T1" .long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b add v24.2d,v24.2d,v20.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v0.16b,v1.16b,#8 ext v6.16b,v2.16b,v0.16b,#8 add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" .long 0xcec082b4 //sha512su0 v20.16b,v21.16b ext v7.16b,v16.16b,v17.16b,#8 .long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b .long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b add v3.2d,v2.2d,v1.2d // "D + T1" .long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b add v25.2d,v25.2d,v21.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v3.16b,v0.16b,#8 ext v6.16b,v4.16b,v3.16b,#8 add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" .long 0xcec082d5 //sha512su0 v21.16b,v22.16b ext v7.16b,v17.16b,v18.16b,#8 .long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b .long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b add v2.2d,v4.2d,v0.2d // "D + T1" .long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b add v24.2d,v24.2d,v22.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v2.16b,v3.16b,#8 ext v6.16b,v1.16b,v2.16b,#8 add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" .long 0xcec082f6 //sha512su0 v22.16b,v23.16b ext v7.16b,v18.16b,v19.16b,#8 .long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b .long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b add v4.2d,v1.2d,v3.2d // "D + T1" .long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b add v25.2d,v25.2d,v23.2d ld1 
{v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v4.16b,v2.16b,#8 ext v6.16b,v0.16b,v4.16b,#8 add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" .long 0xcec08217 //sha512su0 v23.16b,v16.16b ext v7.16b,v19.16b,v20.16b,#8 .long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b .long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b add v1.2d,v0.2d,v2.2d // "D + T1" .long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b ld1 {v25.2d},[x3],#16 add v24.2d,v24.2d,v16.2d ld1 {v16.16b},[x1],#16 // load next input ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v1.16b,v4.16b,#8 ext v6.16b,v3.16b,v1.16b,#8 add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" .long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b rev64 v16.16b,v16.16b add v0.2d,v3.2d,v4.2d // "D + T1" .long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b ld1 {v24.2d},[x3],#16 add v25.2d,v25.2d,v17.2d ld1 {v17.16b},[x1],#16 // load next input ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v0.16b,v1.16b,#8 ext v6.16b,v2.16b,v0.16b,#8 add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" .long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b rev64 v17.16b,v17.16b add v3.2d,v2.2d,v1.2d // "D + T1" .long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b ld1 {v25.2d},[x3],#16 add v24.2d,v24.2d,v18.2d ld1 {v18.16b},[x1],#16 // load next input ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v3.16b,v0.16b,#8 ext v6.16b,v4.16b,v3.16b,#8 add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" .long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b rev64 v18.16b,v18.16b add v2.2d,v4.2d,v0.2d // "D + T1" .long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b ld1 {v24.2d},[x3],#16 add v25.2d,v25.2d,v19.2d ld1 {v19.16b},[x1],#16 // load next input ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v2.16b,v3.16b,#8 ext v6.16b,v1.16b,v2.16b,#8 add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" .long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b rev64 v19.16b,v19.16b add v4.2d,v1.2d,v3.2d // "D + T1" .long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b ld1 {v25.2d},[x3],#16 add v24.2d,v24.2d,v20.2d ld1 {v20.16b},[x1],#16 // load next input ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v4.16b,v2.16b,#8 ext v6.16b,v0.16b,v4.16b,#8 add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" .long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b rev64 v20.16b,v20.16b add v1.2d,v0.2d,v2.2d // "D + T1" .long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b ld1 {v24.2d},[x3],#16 add v25.2d,v25.2d,v21.2d ld1 {v21.16b},[x1],#16 // load next input ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v1.16b,v4.16b,#8 ext v6.16b,v3.16b,v1.16b,#8 add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" .long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b rev64 v21.16b,v21.16b add v0.2d,v3.2d,v4.2d // "D + T1" .long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b ld1 {v25.2d},[x3],#16 add v24.2d,v24.2d,v22.2d ld1 {v22.16b},[x1],#16 // load next input ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v0.16b,v1.16b,#8 ext v6.16b,v2.16b,v0.16b,#8 add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" .long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b rev64 v22.16b,v22.16b add v3.2d,v2.2d,v1.2d // "D + T1" .long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b sub x3,x3,#80*8 // rewind add v25.2d,v25.2d,v23.2d ld1 {v23.16b},[x1],#16 // load next input ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v3.16b,v0.16b,#8 ext v6.16b,v4.16b,v3.16b,#8 add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" .long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b rev64 v23.16b,v23.16b add v2.2d,v4.2d,v0.2d // "D + T1" .long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b add v0.2d,v0.2d,v26.2d // accumulate add v1.2d,v1.2d,v27.2d add v2.2d,v2.2d,v28.2d add v3.2d,v3.2d,v29.2d cbnz x2,Loop_hw st1 
{v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context ldr x29,[sp],#16 ret #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) ring-0.17.14/pregenerated/sha512-armv8-linux64.S000064400000000000000000001401321046102023000170740ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) // Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // ==================================================================== // Written by Andy Polyakov for the OpenSSL // project. // ==================================================================== // // SHA256/512 for ARMv8. // // Performance in cycles per processed byte and improvement coefficient // over code generated with "default" compiler: // // SHA256-hw SHA256(*) SHA512 // Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) // Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) // Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) // Denver 2.01 10.5 (+26%) 6.70 (+8%) // X-Gene 20.0 (+100%) 12.8 (+300%(***)) // Mongoose 2.36 13.0 (+50%) 8.36 (+33%) // Kryo 1.92 17.4 (+30%) 11.2 (+8%) // // (*) Software SHA256 results are of lesser relevance, presented // mostly for informational purposes. // (**) The result is a trade-off: it's possible to improve it by // 10% (or by 1 cycle per round), but at the cost of 20% loss // on Cortex-A53 (or by 4 cycles per round). // (***) Super-impressive coefficients over gcc-generated code are // indication of some compiler "pathology", most notably code // generated with -mgeneral-regs-only is significantly faster // and the gap is only 40-90%. #ifndef __KERNEL__ #endif .text .globl sha512_block_data_order_nohw .hidden sha512_block_data_order_nohw .type sha512_block_data_order_nohw,%function .align 6 sha512_block_data_order_nohw: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-128]! 
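// Round structure of the scalar (nohw) path below, summarized from the
// inline comments: the eight 64-bit state words a..h live in x20-x27 and
// one 128-byte block is consumed per iteration of .Loop.  Each of the 80
// rounds follows the SHA-512 recurrence annotated in the code:
//   T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
//   d += T1;  new a = T1 + Sigma0(a) + Maj(a,b,c)
// where Sigma1(e) = ror64(e,14) ^ ror64(e,18) ^ ror64(e,41) and
// Sigma0(a) = ror64(a,28) ^ ror64(a,34) ^ ror64(a,39), matching the
// ror #14/#18/#41 and #28/#34/#39 operands used below.  From round 16 on,
// the message schedule in .Loop_16_xx extends X[] with
// sigma0(X[i+1]) = ror64(x,1) ^ ror64(x,8) ^ (x >> 7) and
// sigma1(X[i+14]) = ror64(x,19) ^ ror64(x,61) ^ (x >> 6).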
add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] sub sp,sp,#4*8 ldp x20,x21,[x0] // load context ldp x22,x23,[x0,#2*8] ldp x24,x25,[x0,#4*8] add x2,x1,x2,lsl#7 // end of input ldp x26,x27,[x0,#6*8] adrp x30,.LK512 add x30,x30,:lo12:.LK512 stp x0,x2,[x29,#96] .Loop: ldp x3,x4,[x1],#2*8 ldr x19,[x30],#8 // *K++ eor x28,x21,x22 // magic seed str x1,[x29,#112] #ifndef __AARCH64EB__ rev x3,x3 // 0 #endif ror x16,x24,#14 add x27,x27,x19 // h+=K[i] eor x6,x24,x24,ror#23 and x17,x25,x24 bic x19,x26,x24 add x27,x27,x3 // h+=X[i] orr x17,x17,x19 // Ch(e,f,g) eor x19,x20,x21 // a^b, b^c in next round eor x16,x16,x6,ror#18 // Sigma1(e) ror x6,x20,#28 add x27,x27,x17 // h+=Ch(e,f,g) eor x17,x20,x20,ror#5 add x27,x27,x16 // h+=Sigma1(e) and x28,x28,x19 // (b^c)&=(a^b) add x23,x23,x27 // d+=h eor x28,x28,x21 // Maj(a,b,c) eor x17,x6,x17,ror#34 // Sigma0(a) add x27,x27,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round //add x27,x27,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x4,x4 // 1 #endif ldp x5,x6,[x1],#2*8 add x27,x27,x17 // h+=Sigma0(a) ror x16,x23,#14 add x26,x26,x28 // h+=K[i] eor x7,x23,x23,ror#23 and x17,x24,x23 bic x28,x25,x23 add x26,x26,x4 // h+=X[i] orr x17,x17,x28 // Ch(e,f,g) eor x28,x27,x20 // a^b, b^c in next round eor x16,x16,x7,ror#18 // Sigma1(e) ror x7,x27,#28 add x26,x26,x17 // h+=Ch(e,f,g) eor x17,x27,x27,ror#5 add x26,x26,x16 // h+=Sigma1(e) and x19,x19,x28 // (b^c)&=(a^b) add x22,x22,x26 // d+=h eor x19,x19,x20 // Maj(a,b,c) eor x17,x7,x17,ror#34 // Sigma0(a) add x26,x26,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round //add x26,x26,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x5,x5 // 2 #endif add x26,x26,x17 // h+=Sigma0(a) ror x16,x22,#14 add x25,x25,x19 // h+=K[i] eor x8,x22,x22,ror#23 and x17,x23,x22 bic x19,x24,x22 add x25,x25,x5 // h+=X[i] orr x17,x17,x19 // Ch(e,f,g) eor x19,x26,x27 // a^b, b^c in next round eor x16,x16,x8,ror#18 // Sigma1(e) ror x8,x26,#28 add x25,x25,x17 // h+=Ch(e,f,g) eor x17,x26,x26,ror#5 add x25,x25,x16 // h+=Sigma1(e) and x28,x28,x19 // (b^c)&=(a^b) add x21,x21,x25 // d+=h eor x28,x28,x27 // Maj(a,b,c) eor x17,x8,x17,ror#34 // Sigma0(a) add x25,x25,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round //add x25,x25,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x6,x6 // 3 #endif ldp x7,x8,[x1],#2*8 add x25,x25,x17 // h+=Sigma0(a) ror x16,x21,#14 add x24,x24,x28 // h+=K[i] eor x9,x21,x21,ror#23 and x17,x22,x21 bic x28,x23,x21 add x24,x24,x6 // h+=X[i] orr x17,x17,x28 // Ch(e,f,g) eor x28,x25,x26 // a^b, b^c in next round eor x16,x16,x9,ror#18 // Sigma1(e) ror x9,x25,#28 add x24,x24,x17 // h+=Ch(e,f,g) eor x17,x25,x25,ror#5 add x24,x24,x16 // h+=Sigma1(e) and x19,x19,x28 // (b^c)&=(a^b) add x20,x20,x24 // d+=h eor x19,x19,x26 // Maj(a,b,c) eor x17,x9,x17,ror#34 // Sigma0(a) add x24,x24,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round //add x24,x24,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x7,x7 // 4 #endif add x24,x24,x17 // h+=Sigma0(a) ror x16,x20,#14 add x23,x23,x19 // h+=K[i] eor x10,x20,x20,ror#23 and x17,x21,x20 bic x19,x22,x20 add x23,x23,x7 // h+=X[i] orr x17,x17,x19 // Ch(e,f,g) eor x19,x24,x25 // a^b, b^c in next round eor x16,x16,x10,ror#18 // Sigma1(e) ror x10,x24,#28 add x23,x23,x17 // h+=Ch(e,f,g) eor x17,x24,x24,ror#5 add x23,x23,x16 // h+=Sigma1(e) and x28,x28,x19 // (b^c)&=(a^b) add x27,x27,x23 // d+=h eor x28,x28,x25 // Maj(a,b,c) eor x17,x10,x17,ror#34 // Sigma0(a) add x23,x23,x28 // h+=Maj(a,b,c) 
ldr x28,[x30],#8 // *K++, x19 in next round //add x23,x23,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x8,x8 // 5 #endif ldp x9,x10,[x1],#2*8 add x23,x23,x17 // h+=Sigma0(a) ror x16,x27,#14 add x22,x22,x28 // h+=K[i] eor x11,x27,x27,ror#23 and x17,x20,x27 bic x28,x21,x27 add x22,x22,x8 // h+=X[i] orr x17,x17,x28 // Ch(e,f,g) eor x28,x23,x24 // a^b, b^c in next round eor x16,x16,x11,ror#18 // Sigma1(e) ror x11,x23,#28 add x22,x22,x17 // h+=Ch(e,f,g) eor x17,x23,x23,ror#5 add x22,x22,x16 // h+=Sigma1(e) and x19,x19,x28 // (b^c)&=(a^b) add x26,x26,x22 // d+=h eor x19,x19,x24 // Maj(a,b,c) eor x17,x11,x17,ror#34 // Sigma0(a) add x22,x22,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round //add x22,x22,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x9,x9 // 6 #endif add x22,x22,x17 // h+=Sigma0(a) ror x16,x26,#14 add x21,x21,x19 // h+=K[i] eor x12,x26,x26,ror#23 and x17,x27,x26 bic x19,x20,x26 add x21,x21,x9 // h+=X[i] orr x17,x17,x19 // Ch(e,f,g) eor x19,x22,x23 // a^b, b^c in next round eor x16,x16,x12,ror#18 // Sigma1(e) ror x12,x22,#28 add x21,x21,x17 // h+=Ch(e,f,g) eor x17,x22,x22,ror#5 add x21,x21,x16 // h+=Sigma1(e) and x28,x28,x19 // (b^c)&=(a^b) add x25,x25,x21 // d+=h eor x28,x28,x23 // Maj(a,b,c) eor x17,x12,x17,ror#34 // Sigma0(a) add x21,x21,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round //add x21,x21,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x10,x10 // 7 #endif ldp x11,x12,[x1],#2*8 add x21,x21,x17 // h+=Sigma0(a) ror x16,x25,#14 add x20,x20,x28 // h+=K[i] eor x13,x25,x25,ror#23 and x17,x26,x25 bic x28,x27,x25 add x20,x20,x10 // h+=X[i] orr x17,x17,x28 // Ch(e,f,g) eor x28,x21,x22 // a^b, b^c in next round eor x16,x16,x13,ror#18 // Sigma1(e) ror x13,x21,#28 add x20,x20,x17 // h+=Ch(e,f,g) eor x17,x21,x21,ror#5 add x20,x20,x16 // h+=Sigma1(e) and x19,x19,x28 // (b^c)&=(a^b) add x24,x24,x20 // d+=h eor x19,x19,x22 // Maj(a,b,c) eor x17,x13,x17,ror#34 // Sigma0(a) add x20,x20,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round //add x20,x20,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x11,x11 // 8 #endif add x20,x20,x17 // h+=Sigma0(a) ror x16,x24,#14 add x27,x27,x19 // h+=K[i] eor x14,x24,x24,ror#23 and x17,x25,x24 bic x19,x26,x24 add x27,x27,x11 // h+=X[i] orr x17,x17,x19 // Ch(e,f,g) eor x19,x20,x21 // a^b, b^c in next round eor x16,x16,x14,ror#18 // Sigma1(e) ror x14,x20,#28 add x27,x27,x17 // h+=Ch(e,f,g) eor x17,x20,x20,ror#5 add x27,x27,x16 // h+=Sigma1(e) and x28,x28,x19 // (b^c)&=(a^b) add x23,x23,x27 // d+=h eor x28,x28,x21 // Maj(a,b,c) eor x17,x14,x17,ror#34 // Sigma0(a) add x27,x27,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round //add x27,x27,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x12,x12 // 9 #endif ldp x13,x14,[x1],#2*8 add x27,x27,x17 // h+=Sigma0(a) ror x16,x23,#14 add x26,x26,x28 // h+=K[i] eor x15,x23,x23,ror#23 and x17,x24,x23 bic x28,x25,x23 add x26,x26,x12 // h+=X[i] orr x17,x17,x28 // Ch(e,f,g) eor x28,x27,x20 // a^b, b^c in next round eor x16,x16,x15,ror#18 // Sigma1(e) ror x15,x27,#28 add x26,x26,x17 // h+=Ch(e,f,g) eor x17,x27,x27,ror#5 add x26,x26,x16 // h+=Sigma1(e) and x19,x19,x28 // (b^c)&=(a^b) add x22,x22,x26 // d+=h eor x19,x19,x20 // Maj(a,b,c) eor x17,x15,x17,ror#34 // Sigma0(a) add x26,x26,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round //add x26,x26,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x13,x13 // 10 #endif add x26,x26,x17 // h+=Sigma0(a) ror x16,x22,#14 add x25,x25,x19 // h+=K[i] eor x0,x22,x22,ror#23 and x17,x23,x22 bic x19,x24,x22 add x25,x25,x13 
// h+=X[i] orr x17,x17,x19 // Ch(e,f,g) eor x19,x26,x27 // a^b, b^c in next round eor x16,x16,x0,ror#18 // Sigma1(e) ror x0,x26,#28 add x25,x25,x17 // h+=Ch(e,f,g) eor x17,x26,x26,ror#5 add x25,x25,x16 // h+=Sigma1(e) and x28,x28,x19 // (b^c)&=(a^b) add x21,x21,x25 // d+=h eor x28,x28,x27 // Maj(a,b,c) eor x17,x0,x17,ror#34 // Sigma0(a) add x25,x25,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round //add x25,x25,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x14,x14 // 11 #endif ldp x15,x0,[x1],#2*8 add x25,x25,x17 // h+=Sigma0(a) str x6,[sp,#24] ror x16,x21,#14 add x24,x24,x28 // h+=K[i] eor x6,x21,x21,ror#23 and x17,x22,x21 bic x28,x23,x21 add x24,x24,x14 // h+=X[i] orr x17,x17,x28 // Ch(e,f,g) eor x28,x25,x26 // a^b, b^c in next round eor x16,x16,x6,ror#18 // Sigma1(e) ror x6,x25,#28 add x24,x24,x17 // h+=Ch(e,f,g) eor x17,x25,x25,ror#5 add x24,x24,x16 // h+=Sigma1(e) and x19,x19,x28 // (b^c)&=(a^b) add x20,x20,x24 // d+=h eor x19,x19,x26 // Maj(a,b,c) eor x17,x6,x17,ror#34 // Sigma0(a) add x24,x24,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round //add x24,x24,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x15,x15 // 12 #endif add x24,x24,x17 // h+=Sigma0(a) str x7,[sp,#0] ror x16,x20,#14 add x23,x23,x19 // h+=K[i] eor x7,x20,x20,ror#23 and x17,x21,x20 bic x19,x22,x20 add x23,x23,x15 // h+=X[i] orr x17,x17,x19 // Ch(e,f,g) eor x19,x24,x25 // a^b, b^c in next round eor x16,x16,x7,ror#18 // Sigma1(e) ror x7,x24,#28 add x23,x23,x17 // h+=Ch(e,f,g) eor x17,x24,x24,ror#5 add x23,x23,x16 // h+=Sigma1(e) and x28,x28,x19 // (b^c)&=(a^b) add x27,x27,x23 // d+=h eor x28,x28,x25 // Maj(a,b,c) eor x17,x7,x17,ror#34 // Sigma0(a) add x23,x23,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round //add x23,x23,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x0,x0 // 13 #endif ldp x1,x2,[x1] add x23,x23,x17 // h+=Sigma0(a) str x8,[sp,#8] ror x16,x27,#14 add x22,x22,x28 // h+=K[i] eor x8,x27,x27,ror#23 and x17,x20,x27 bic x28,x21,x27 add x22,x22,x0 // h+=X[i] orr x17,x17,x28 // Ch(e,f,g) eor x28,x23,x24 // a^b, b^c in next round eor x16,x16,x8,ror#18 // Sigma1(e) ror x8,x23,#28 add x22,x22,x17 // h+=Ch(e,f,g) eor x17,x23,x23,ror#5 add x22,x22,x16 // h+=Sigma1(e) and x19,x19,x28 // (b^c)&=(a^b) add x26,x26,x22 // d+=h eor x19,x19,x24 // Maj(a,b,c) eor x17,x8,x17,ror#34 // Sigma0(a) add x22,x22,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round //add x22,x22,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x1,x1 // 14 #endif ldr x6,[sp,#24] add x22,x22,x17 // h+=Sigma0(a) str x9,[sp,#16] ror x16,x26,#14 add x21,x21,x19 // h+=K[i] eor x9,x26,x26,ror#23 and x17,x27,x26 bic x19,x20,x26 add x21,x21,x1 // h+=X[i] orr x17,x17,x19 // Ch(e,f,g) eor x19,x22,x23 // a^b, b^c in next round eor x16,x16,x9,ror#18 // Sigma1(e) ror x9,x22,#28 add x21,x21,x17 // h+=Ch(e,f,g) eor x17,x22,x22,ror#5 add x21,x21,x16 // h+=Sigma1(e) and x28,x28,x19 // (b^c)&=(a^b) add x25,x25,x21 // d+=h eor x28,x28,x23 // Maj(a,b,c) eor x17,x9,x17,ror#34 // Sigma0(a) add x21,x21,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round //add x21,x21,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x2,x2 // 15 #endif ldr x7,[sp,#0] add x21,x21,x17 // h+=Sigma0(a) str x10,[sp,#24] ror x16,x25,#14 add x20,x20,x28 // h+=K[i] ror x9,x4,#1 and x17,x26,x25 ror x8,x1,#19 bic x28,x27,x25 ror x10,x21,#28 add x20,x20,x2 // h+=X[i] eor x16,x16,x25,ror#18 eor x9,x9,x4,ror#8 orr x17,x17,x28 // Ch(e,f,g) eor x28,x21,x22 // a^b, b^c in next round eor x16,x16,x25,ror#41 // Sigma1(e) eor x10,x10,x21,ror#34 add 
x20,x20,x17 // h+=Ch(e,f,g) and x19,x19,x28 // (b^c)&=(a^b) eor x8,x8,x1,ror#61 eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) add x20,x20,x16 // h+=Sigma1(e) eor x19,x19,x22 // Maj(a,b,c) eor x17,x10,x21,ror#39 // Sigma0(a) eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) add x3,x3,x12 add x24,x24,x20 // d+=h add x20,x20,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round add x3,x3,x9 add x20,x20,x17 // h+=Sigma0(a) add x3,x3,x8 .Loop_16_xx: ldr x8,[sp,#8] str x11,[sp,#0] ror x16,x24,#14 add x27,x27,x19 // h+=K[i] ror x10,x5,#1 and x17,x25,x24 ror x9,x2,#19 bic x19,x26,x24 ror x11,x20,#28 add x27,x27,x3 // h+=X[i] eor x16,x16,x24,ror#18 eor x10,x10,x5,ror#8 orr x17,x17,x19 // Ch(e,f,g) eor x19,x20,x21 // a^b, b^c in next round eor x16,x16,x24,ror#41 // Sigma1(e) eor x11,x11,x20,ror#34 add x27,x27,x17 // h+=Ch(e,f,g) and x28,x28,x19 // (b^c)&=(a^b) eor x9,x9,x2,ror#61 eor x10,x10,x5,lsr#7 // sigma0(X[i+1]) add x27,x27,x16 // h+=Sigma1(e) eor x28,x28,x21 // Maj(a,b,c) eor x17,x11,x20,ror#39 // Sigma0(a) eor x9,x9,x2,lsr#6 // sigma1(X[i+14]) add x4,x4,x13 add x23,x23,x27 // d+=h add x27,x27,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round add x4,x4,x10 add x27,x27,x17 // h+=Sigma0(a) add x4,x4,x9 ldr x9,[sp,#16] str x12,[sp,#8] ror x16,x23,#14 add x26,x26,x28 // h+=K[i] ror x11,x6,#1 and x17,x24,x23 ror x10,x3,#19 bic x28,x25,x23 ror x12,x27,#28 add x26,x26,x4 // h+=X[i] eor x16,x16,x23,ror#18 eor x11,x11,x6,ror#8 orr x17,x17,x28 // Ch(e,f,g) eor x28,x27,x20 // a^b, b^c in next round eor x16,x16,x23,ror#41 // Sigma1(e) eor x12,x12,x27,ror#34 add x26,x26,x17 // h+=Ch(e,f,g) and x19,x19,x28 // (b^c)&=(a^b) eor x10,x10,x3,ror#61 eor x11,x11,x6,lsr#7 // sigma0(X[i+1]) add x26,x26,x16 // h+=Sigma1(e) eor x19,x19,x20 // Maj(a,b,c) eor x17,x12,x27,ror#39 // Sigma0(a) eor x10,x10,x3,lsr#6 // sigma1(X[i+14]) add x5,x5,x14 add x22,x22,x26 // d+=h add x26,x26,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round add x5,x5,x11 add x26,x26,x17 // h+=Sigma0(a) add x5,x5,x10 ldr x10,[sp,#24] str x13,[sp,#16] ror x16,x22,#14 add x25,x25,x19 // h+=K[i] ror x12,x7,#1 and x17,x23,x22 ror x11,x4,#19 bic x19,x24,x22 ror x13,x26,#28 add x25,x25,x5 // h+=X[i] eor x16,x16,x22,ror#18 eor x12,x12,x7,ror#8 orr x17,x17,x19 // Ch(e,f,g) eor x19,x26,x27 // a^b, b^c in next round eor x16,x16,x22,ror#41 // Sigma1(e) eor x13,x13,x26,ror#34 add x25,x25,x17 // h+=Ch(e,f,g) and x28,x28,x19 // (b^c)&=(a^b) eor x11,x11,x4,ror#61 eor x12,x12,x7,lsr#7 // sigma0(X[i+1]) add x25,x25,x16 // h+=Sigma1(e) eor x28,x28,x27 // Maj(a,b,c) eor x17,x13,x26,ror#39 // Sigma0(a) eor x11,x11,x4,lsr#6 // sigma1(X[i+14]) add x6,x6,x15 add x21,x21,x25 // d+=h add x25,x25,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round add x6,x6,x12 add x25,x25,x17 // h+=Sigma0(a) add x6,x6,x11 ldr x11,[sp,#0] str x14,[sp,#24] ror x16,x21,#14 add x24,x24,x28 // h+=K[i] ror x13,x8,#1 and x17,x22,x21 ror x12,x5,#19 bic x28,x23,x21 ror x14,x25,#28 add x24,x24,x6 // h+=X[i] eor x16,x16,x21,ror#18 eor x13,x13,x8,ror#8 orr x17,x17,x28 // Ch(e,f,g) eor x28,x25,x26 // a^b, b^c in next round eor x16,x16,x21,ror#41 // Sigma1(e) eor x14,x14,x25,ror#34 add x24,x24,x17 // h+=Ch(e,f,g) and x19,x19,x28 // (b^c)&=(a^b) eor x12,x12,x5,ror#61 eor x13,x13,x8,lsr#7 // sigma0(X[i+1]) add x24,x24,x16 // h+=Sigma1(e) eor x19,x19,x26 // Maj(a,b,c) eor x17,x14,x25,ror#39 // Sigma0(a) eor x12,x12,x5,lsr#6 // sigma1(X[i+14]) add x7,x7,x0 add x20,x20,x24 // d+=h add x24,x24,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round add x7,x7,x13 add 
x24,x24,x17 // h+=Sigma0(a) add x7,x7,x12 ldr x12,[sp,#8] str x15,[sp,#0] ror x16,x20,#14 add x23,x23,x19 // h+=K[i] ror x14,x9,#1 and x17,x21,x20 ror x13,x6,#19 bic x19,x22,x20 ror x15,x24,#28 add x23,x23,x7 // h+=X[i] eor x16,x16,x20,ror#18 eor x14,x14,x9,ror#8 orr x17,x17,x19 // Ch(e,f,g) eor x19,x24,x25 // a^b, b^c in next round eor x16,x16,x20,ror#41 // Sigma1(e) eor x15,x15,x24,ror#34 add x23,x23,x17 // h+=Ch(e,f,g) and x28,x28,x19 // (b^c)&=(a^b) eor x13,x13,x6,ror#61 eor x14,x14,x9,lsr#7 // sigma0(X[i+1]) add x23,x23,x16 // h+=Sigma1(e) eor x28,x28,x25 // Maj(a,b,c) eor x17,x15,x24,ror#39 // Sigma0(a) eor x13,x13,x6,lsr#6 // sigma1(X[i+14]) add x8,x8,x1 add x27,x27,x23 // d+=h add x23,x23,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round add x8,x8,x14 add x23,x23,x17 // h+=Sigma0(a) add x8,x8,x13 ldr x13,[sp,#16] str x0,[sp,#8] ror x16,x27,#14 add x22,x22,x28 // h+=K[i] ror x15,x10,#1 and x17,x20,x27 ror x14,x7,#19 bic x28,x21,x27 ror x0,x23,#28 add x22,x22,x8 // h+=X[i] eor x16,x16,x27,ror#18 eor x15,x15,x10,ror#8 orr x17,x17,x28 // Ch(e,f,g) eor x28,x23,x24 // a^b, b^c in next round eor x16,x16,x27,ror#41 // Sigma1(e) eor x0,x0,x23,ror#34 add x22,x22,x17 // h+=Ch(e,f,g) and x19,x19,x28 // (b^c)&=(a^b) eor x14,x14,x7,ror#61 eor x15,x15,x10,lsr#7 // sigma0(X[i+1]) add x22,x22,x16 // h+=Sigma1(e) eor x19,x19,x24 // Maj(a,b,c) eor x17,x0,x23,ror#39 // Sigma0(a) eor x14,x14,x7,lsr#6 // sigma1(X[i+14]) add x9,x9,x2 add x26,x26,x22 // d+=h add x22,x22,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round add x9,x9,x15 add x22,x22,x17 // h+=Sigma0(a) add x9,x9,x14 ldr x14,[sp,#24] str x1,[sp,#16] ror x16,x26,#14 add x21,x21,x19 // h+=K[i] ror x0,x11,#1 and x17,x27,x26 ror x15,x8,#19 bic x19,x20,x26 ror x1,x22,#28 add x21,x21,x9 // h+=X[i] eor x16,x16,x26,ror#18 eor x0,x0,x11,ror#8 orr x17,x17,x19 // Ch(e,f,g) eor x19,x22,x23 // a^b, b^c in next round eor x16,x16,x26,ror#41 // Sigma1(e) eor x1,x1,x22,ror#34 add x21,x21,x17 // h+=Ch(e,f,g) and x28,x28,x19 // (b^c)&=(a^b) eor x15,x15,x8,ror#61 eor x0,x0,x11,lsr#7 // sigma0(X[i+1]) add x21,x21,x16 // h+=Sigma1(e) eor x28,x28,x23 // Maj(a,b,c) eor x17,x1,x22,ror#39 // Sigma0(a) eor x15,x15,x8,lsr#6 // sigma1(X[i+14]) add x10,x10,x3 add x25,x25,x21 // d+=h add x21,x21,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round add x10,x10,x0 add x21,x21,x17 // h+=Sigma0(a) add x10,x10,x15 ldr x15,[sp,#0] str x2,[sp,#24] ror x16,x25,#14 add x20,x20,x28 // h+=K[i] ror x1,x12,#1 and x17,x26,x25 ror x0,x9,#19 bic x28,x27,x25 ror x2,x21,#28 add x20,x20,x10 // h+=X[i] eor x16,x16,x25,ror#18 eor x1,x1,x12,ror#8 orr x17,x17,x28 // Ch(e,f,g) eor x28,x21,x22 // a^b, b^c in next round eor x16,x16,x25,ror#41 // Sigma1(e) eor x2,x2,x21,ror#34 add x20,x20,x17 // h+=Ch(e,f,g) and x19,x19,x28 // (b^c)&=(a^b) eor x0,x0,x9,ror#61 eor x1,x1,x12,lsr#7 // sigma0(X[i+1]) add x20,x20,x16 // h+=Sigma1(e) eor x19,x19,x22 // Maj(a,b,c) eor x17,x2,x21,ror#39 // Sigma0(a) eor x0,x0,x9,lsr#6 // sigma1(X[i+14]) add x11,x11,x4 add x24,x24,x20 // d+=h add x20,x20,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round add x11,x11,x1 add x20,x20,x17 // h+=Sigma0(a) add x11,x11,x0 ldr x0,[sp,#8] str x3,[sp,#0] ror x16,x24,#14 add x27,x27,x19 // h+=K[i] ror x2,x13,#1 and x17,x25,x24 ror x1,x10,#19 bic x19,x26,x24 ror x3,x20,#28 add x27,x27,x11 // h+=X[i] eor x16,x16,x24,ror#18 eor x2,x2,x13,ror#8 orr x17,x17,x19 // Ch(e,f,g) eor x19,x20,x21 // a^b, b^c in next round eor x16,x16,x24,ror#41 // Sigma1(e) eor x3,x3,x20,ror#34 add x27,x27,x17 
// h+=Ch(e,f,g) and x28,x28,x19 // (b^c)&=(a^b) eor x1,x1,x10,ror#61 eor x2,x2,x13,lsr#7 // sigma0(X[i+1]) add x27,x27,x16 // h+=Sigma1(e) eor x28,x28,x21 // Maj(a,b,c) eor x17,x3,x20,ror#39 // Sigma0(a) eor x1,x1,x10,lsr#6 // sigma1(X[i+14]) add x12,x12,x5 add x23,x23,x27 // d+=h add x27,x27,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round add x12,x12,x2 add x27,x27,x17 // h+=Sigma0(a) add x12,x12,x1 ldr x1,[sp,#16] str x4,[sp,#8] ror x16,x23,#14 add x26,x26,x28 // h+=K[i] ror x3,x14,#1 and x17,x24,x23 ror x2,x11,#19 bic x28,x25,x23 ror x4,x27,#28 add x26,x26,x12 // h+=X[i] eor x16,x16,x23,ror#18 eor x3,x3,x14,ror#8 orr x17,x17,x28 // Ch(e,f,g) eor x28,x27,x20 // a^b, b^c in next round eor x16,x16,x23,ror#41 // Sigma1(e) eor x4,x4,x27,ror#34 add x26,x26,x17 // h+=Ch(e,f,g) and x19,x19,x28 // (b^c)&=(a^b) eor x2,x2,x11,ror#61 eor x3,x3,x14,lsr#7 // sigma0(X[i+1]) add x26,x26,x16 // h+=Sigma1(e) eor x19,x19,x20 // Maj(a,b,c) eor x17,x4,x27,ror#39 // Sigma0(a) eor x2,x2,x11,lsr#6 // sigma1(X[i+14]) add x13,x13,x6 add x22,x22,x26 // d+=h add x26,x26,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round add x13,x13,x3 add x26,x26,x17 // h+=Sigma0(a) add x13,x13,x2 ldr x2,[sp,#24] str x5,[sp,#16] ror x16,x22,#14 add x25,x25,x19 // h+=K[i] ror x4,x15,#1 and x17,x23,x22 ror x3,x12,#19 bic x19,x24,x22 ror x5,x26,#28 add x25,x25,x13 // h+=X[i] eor x16,x16,x22,ror#18 eor x4,x4,x15,ror#8 orr x17,x17,x19 // Ch(e,f,g) eor x19,x26,x27 // a^b, b^c in next round eor x16,x16,x22,ror#41 // Sigma1(e) eor x5,x5,x26,ror#34 add x25,x25,x17 // h+=Ch(e,f,g) and x28,x28,x19 // (b^c)&=(a^b) eor x3,x3,x12,ror#61 eor x4,x4,x15,lsr#7 // sigma0(X[i+1]) add x25,x25,x16 // h+=Sigma1(e) eor x28,x28,x27 // Maj(a,b,c) eor x17,x5,x26,ror#39 // Sigma0(a) eor x3,x3,x12,lsr#6 // sigma1(X[i+14]) add x14,x14,x7 add x21,x21,x25 // d+=h add x25,x25,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round add x14,x14,x4 add x25,x25,x17 // h+=Sigma0(a) add x14,x14,x3 ldr x3,[sp,#0] str x6,[sp,#24] ror x16,x21,#14 add x24,x24,x28 // h+=K[i] ror x5,x0,#1 and x17,x22,x21 ror x4,x13,#19 bic x28,x23,x21 ror x6,x25,#28 add x24,x24,x14 // h+=X[i] eor x16,x16,x21,ror#18 eor x5,x5,x0,ror#8 orr x17,x17,x28 // Ch(e,f,g) eor x28,x25,x26 // a^b, b^c in next round eor x16,x16,x21,ror#41 // Sigma1(e) eor x6,x6,x25,ror#34 add x24,x24,x17 // h+=Ch(e,f,g) and x19,x19,x28 // (b^c)&=(a^b) eor x4,x4,x13,ror#61 eor x5,x5,x0,lsr#7 // sigma0(X[i+1]) add x24,x24,x16 // h+=Sigma1(e) eor x19,x19,x26 // Maj(a,b,c) eor x17,x6,x25,ror#39 // Sigma0(a) eor x4,x4,x13,lsr#6 // sigma1(X[i+14]) add x15,x15,x8 add x20,x20,x24 // d+=h add x24,x24,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round add x15,x15,x5 add x24,x24,x17 // h+=Sigma0(a) add x15,x15,x4 ldr x4,[sp,#8] str x7,[sp,#0] ror x16,x20,#14 add x23,x23,x19 // h+=K[i] ror x6,x1,#1 and x17,x21,x20 ror x5,x14,#19 bic x19,x22,x20 ror x7,x24,#28 add x23,x23,x15 // h+=X[i] eor x16,x16,x20,ror#18 eor x6,x6,x1,ror#8 orr x17,x17,x19 // Ch(e,f,g) eor x19,x24,x25 // a^b, b^c in next round eor x16,x16,x20,ror#41 // Sigma1(e) eor x7,x7,x24,ror#34 add x23,x23,x17 // h+=Ch(e,f,g) and x28,x28,x19 // (b^c)&=(a^b) eor x5,x5,x14,ror#61 eor x6,x6,x1,lsr#7 // sigma0(X[i+1]) add x23,x23,x16 // h+=Sigma1(e) eor x28,x28,x25 // Maj(a,b,c) eor x17,x7,x24,ror#39 // Sigma0(a) eor x5,x5,x14,lsr#6 // sigma1(X[i+14]) add x0,x0,x9 add x27,x27,x23 // d+=h add x23,x23,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round add x0,x0,x6 add x23,x23,x17 // h+=Sigma0(a) add x0,x0,x5 ldr 
x5,[sp,#16] str x8,[sp,#8] ror x16,x27,#14 add x22,x22,x28 // h+=K[i] ror x7,x2,#1 and x17,x20,x27 ror x6,x15,#19 bic x28,x21,x27 ror x8,x23,#28 add x22,x22,x0 // h+=X[i] eor x16,x16,x27,ror#18 eor x7,x7,x2,ror#8 orr x17,x17,x28 // Ch(e,f,g) eor x28,x23,x24 // a^b, b^c in next round eor x16,x16,x27,ror#41 // Sigma1(e) eor x8,x8,x23,ror#34 add x22,x22,x17 // h+=Ch(e,f,g) and x19,x19,x28 // (b^c)&=(a^b) eor x6,x6,x15,ror#61 eor x7,x7,x2,lsr#7 // sigma0(X[i+1]) add x22,x22,x16 // h+=Sigma1(e) eor x19,x19,x24 // Maj(a,b,c) eor x17,x8,x23,ror#39 // Sigma0(a) eor x6,x6,x15,lsr#6 // sigma1(X[i+14]) add x1,x1,x10 add x26,x26,x22 // d+=h add x22,x22,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round add x1,x1,x7 add x22,x22,x17 // h+=Sigma0(a) add x1,x1,x6 ldr x6,[sp,#24] str x9,[sp,#16] ror x16,x26,#14 add x21,x21,x19 // h+=K[i] ror x8,x3,#1 and x17,x27,x26 ror x7,x0,#19 bic x19,x20,x26 ror x9,x22,#28 add x21,x21,x1 // h+=X[i] eor x16,x16,x26,ror#18 eor x8,x8,x3,ror#8 orr x17,x17,x19 // Ch(e,f,g) eor x19,x22,x23 // a^b, b^c in next round eor x16,x16,x26,ror#41 // Sigma1(e) eor x9,x9,x22,ror#34 add x21,x21,x17 // h+=Ch(e,f,g) and x28,x28,x19 // (b^c)&=(a^b) eor x7,x7,x0,ror#61 eor x8,x8,x3,lsr#7 // sigma0(X[i+1]) add x21,x21,x16 // h+=Sigma1(e) eor x28,x28,x23 // Maj(a,b,c) eor x17,x9,x22,ror#39 // Sigma0(a) eor x7,x7,x0,lsr#6 // sigma1(X[i+14]) add x2,x2,x11 add x25,x25,x21 // d+=h add x21,x21,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round add x2,x2,x8 add x21,x21,x17 // h+=Sigma0(a) add x2,x2,x7 ldr x7,[sp,#0] str x10,[sp,#24] ror x16,x25,#14 add x20,x20,x28 // h+=K[i] ror x9,x4,#1 and x17,x26,x25 ror x8,x1,#19 bic x28,x27,x25 ror x10,x21,#28 add x20,x20,x2 // h+=X[i] eor x16,x16,x25,ror#18 eor x9,x9,x4,ror#8 orr x17,x17,x28 // Ch(e,f,g) eor x28,x21,x22 // a^b, b^c in next round eor x16,x16,x25,ror#41 // Sigma1(e) eor x10,x10,x21,ror#34 add x20,x20,x17 // h+=Ch(e,f,g) and x19,x19,x28 // (b^c)&=(a^b) eor x8,x8,x1,ror#61 eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) add x20,x20,x16 // h+=Sigma1(e) eor x19,x19,x22 // Maj(a,b,c) eor x17,x10,x21,ror#39 // Sigma0(a) eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) add x3,x3,x12 add x24,x24,x20 // d+=h add x20,x20,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round add x3,x3,x9 add x20,x20,x17 // h+=Sigma0(a) add x3,x3,x8 cbnz x19,.Loop_16_xx ldp x0,x2,[x29,#96] ldr x1,[x29,#112] sub x30,x30,#648 // rewind ldp x3,x4,[x0] ldp x5,x6,[x0,#2*8] add x1,x1,#14*8 // advance input pointer ldp x7,x8,[x0,#4*8] add x20,x20,x3 ldp x9,x10,[x0,#6*8] add x21,x21,x4 add x22,x22,x5 add x23,x23,x6 stp x20,x21,[x0] add x24,x24,x7 add x25,x25,x8 stp x22,x23,[x0,#2*8] add x26,x26,x9 add x27,x27,x10 cmp x1,x2 stp x24,x25,[x0,#4*8] stp x26,x27,[x0,#6*8] b.ne .Loop ldp x19,x20,[x29,#16] add sp,sp,#4*8 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#128 AARCH64_VALIDATE_LINK_REGISTER ret .size sha512_block_data_order_nohw,.-sha512_block_data_order_nohw .section .rodata .align 6 .type .LK512,%object .LK512: .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 
0x5cb0a9dcbd41fbd4,0x76f988da831153b5 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x81c2c92e47edaee6,0x92722c851482353b .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .quad 0 // terminator .size .LK512,.-.LK512 .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 .text #ifndef __KERNEL__ .globl sha512_block_data_order_hw .hidden sha512_block_data_order_hw .type sha512_block_data_order_hw,%function .align 6 sha512_block_data_order_hw: // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. AARCH64_VALID_CALL_TARGET stp x29,x30,[sp,#-16]! 
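// Register usage in the SHA-512-extension (hw) path below, per the inline
// comments: v0-v3 hold the eight hash words ("load context"/"store
// context"), v16-v23 hold the sixteen 64-bit message words of the current
// block (rev64 byte-swaps the big-endian input), v24/v25 stream the round
// constants from .LK512 through x3 (rewound by 80*8 bytes at the end of
// each block), and v26-v29 keep a copy of the incoming state ("offload")
// that is added back at the end ("accumulate").  The .inst words are the
// raw encodings of the sha512su0/sha512su1 (message schedule) and
// sha512h/sha512h2 (round) instructions named in the comments next to
// them, so the file assembles even with toolchains that lack these
// mnemonics.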
add x29,sp,#0 ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context adrp x3,.LK512 add x3,x3,:lo12:.LK512 rev64 v16.16b,v16.16b rev64 v17.16b,v17.16b rev64 v18.16b,v18.16b rev64 v19.16b,v19.16b rev64 v20.16b,v20.16b rev64 v21.16b,v21.16b rev64 v22.16b,v22.16b rev64 v23.16b,v23.16b b .Loop_hw .align 4 .Loop_hw: ld1 {v24.2d},[x3],#16 subs x2,x2,#1 sub x4,x1,#128 orr v26.16b,v0.16b,v0.16b // offload orr v27.16b,v1.16b,v1.16b orr v28.16b,v2.16b,v2.16b orr v29.16b,v3.16b,v3.16b csel x1,x1,x4,ne // conditional rewind add v24.2d,v24.2d,v16.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v2.16b,v3.16b,#8 ext v6.16b,v1.16b,v2.16b,#8 add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" .inst 0xcec08230 //sha512su0 v16.16b,v17.16b ext v7.16b,v20.16b,v21.16b,#8 .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b .inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b add v4.2d,v1.2d,v3.2d // "D + T1" .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b add v25.2d,v25.2d,v17.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v4.16b,v2.16b,#8 ext v6.16b,v0.16b,v4.16b,#8 add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" .inst 0xcec08251 //sha512su0 v17.16b,v18.16b ext v7.16b,v21.16b,v22.16b,#8 .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b .inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b add v1.2d,v0.2d,v2.2d // "D + T1" .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b add v24.2d,v24.2d,v18.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v1.16b,v4.16b,#8 ext v6.16b,v3.16b,v1.16b,#8 add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" .inst 0xcec08272 //sha512su0 v18.16b,v19.16b ext v7.16b,v22.16b,v23.16b,#8 .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b .inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b add v0.2d,v3.2d,v4.2d // "D + T1" .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b add v25.2d,v25.2d,v19.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v0.16b,v1.16b,#8 ext v6.16b,v2.16b,v0.16b,#8 add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" .inst 0xcec08293 //sha512su0 v19.16b,v20.16b ext v7.16b,v23.16b,v16.16b,#8 .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b .inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b add v3.2d,v2.2d,v1.2d // "D + T1" .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b add v24.2d,v24.2d,v20.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v3.16b,v0.16b,#8 ext v6.16b,v4.16b,v3.16b,#8 add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" .inst 0xcec082b4 //sha512su0 v20.16b,v21.16b ext v7.16b,v16.16b,v17.16b,#8 .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b .inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b add v2.2d,v4.2d,v0.2d // "D + T1" .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b add v25.2d,v25.2d,v21.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v2.16b,v3.16b,#8 ext v6.16b,v1.16b,v2.16b,#8 add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" .inst 0xcec082d5 //sha512su0 v21.16b,v22.16b ext v7.16b,v17.16b,v18.16b,#8 .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b .inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b add v4.2d,v1.2d,v3.2d // "D + T1" .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b add v24.2d,v24.2d,v22.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v4.16b,v2.16b,#8 ext v6.16b,v0.16b,v4.16b,#8 add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" .inst 0xcec082f6 //sha512su0 v22.16b,v23.16b ext v7.16b,v18.16b,v19.16b,#8 .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b .inst 
0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b add v1.2d,v0.2d,v2.2d // "D + T1" .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b add v25.2d,v25.2d,v23.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v1.16b,v4.16b,#8 ext v6.16b,v3.16b,v1.16b,#8 add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" .inst 0xcec08217 //sha512su0 v23.16b,v16.16b ext v7.16b,v19.16b,v20.16b,#8 .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b .inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b add v0.2d,v3.2d,v4.2d // "D + T1" .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b add v24.2d,v24.2d,v16.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v0.16b,v1.16b,#8 ext v6.16b,v2.16b,v0.16b,#8 add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" .inst 0xcec08230 //sha512su0 v16.16b,v17.16b ext v7.16b,v20.16b,v21.16b,#8 .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b .inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b add v3.2d,v2.2d,v1.2d // "D + T1" .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b add v25.2d,v25.2d,v17.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v3.16b,v0.16b,#8 ext v6.16b,v4.16b,v3.16b,#8 add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" .inst 0xcec08251 //sha512su0 v17.16b,v18.16b ext v7.16b,v21.16b,v22.16b,#8 .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b .inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b add v2.2d,v4.2d,v0.2d // "D + T1" .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b add v24.2d,v24.2d,v18.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v2.16b,v3.16b,#8 ext v6.16b,v1.16b,v2.16b,#8 add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" .inst 0xcec08272 //sha512su0 v18.16b,v19.16b ext v7.16b,v22.16b,v23.16b,#8 .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b .inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b add v4.2d,v1.2d,v3.2d // "D + T1" .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b add v25.2d,v25.2d,v19.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v4.16b,v2.16b,#8 ext v6.16b,v0.16b,v4.16b,#8 add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" .inst 0xcec08293 //sha512su0 v19.16b,v20.16b ext v7.16b,v23.16b,v16.16b,#8 .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b .inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b add v1.2d,v0.2d,v2.2d // "D + T1" .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b add v24.2d,v24.2d,v20.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v1.16b,v4.16b,#8 ext v6.16b,v3.16b,v1.16b,#8 add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" .inst 0xcec082b4 //sha512su0 v20.16b,v21.16b ext v7.16b,v16.16b,v17.16b,#8 .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b .inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b add v0.2d,v3.2d,v4.2d // "D + T1" .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b add v25.2d,v25.2d,v21.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v0.16b,v1.16b,#8 ext v6.16b,v2.16b,v0.16b,#8 add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" .inst 0xcec082d5 //sha512su0 v21.16b,v22.16b ext v7.16b,v17.16b,v18.16b,#8 .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b .inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b add v3.2d,v2.2d,v1.2d // "D + T1" .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b add v24.2d,v24.2d,v22.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v3.16b,v0.16b,#8 ext v6.16b,v4.16b,v3.16b,#8 add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" .inst 0xcec082f6 //sha512su0 v22.16b,v23.16b ext v7.16b,v18.16b,v19.16b,#8 .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b .inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b add 
v2.2d,v4.2d,v0.2d // "D + T1" .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b add v25.2d,v25.2d,v23.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v2.16b,v3.16b,#8 ext v6.16b,v1.16b,v2.16b,#8 add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" .inst 0xcec08217 //sha512su0 v23.16b,v16.16b ext v7.16b,v19.16b,v20.16b,#8 .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b .inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b add v4.2d,v1.2d,v3.2d // "D + T1" .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b add v24.2d,v24.2d,v16.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v4.16b,v2.16b,#8 ext v6.16b,v0.16b,v4.16b,#8 add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" .inst 0xcec08230 //sha512su0 v16.16b,v17.16b ext v7.16b,v20.16b,v21.16b,#8 .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b .inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b add v1.2d,v0.2d,v2.2d // "D + T1" .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b add v25.2d,v25.2d,v17.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v1.16b,v4.16b,#8 ext v6.16b,v3.16b,v1.16b,#8 add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" .inst 0xcec08251 //sha512su0 v17.16b,v18.16b ext v7.16b,v21.16b,v22.16b,#8 .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b .inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b add v0.2d,v3.2d,v4.2d // "D + T1" .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b add v24.2d,v24.2d,v18.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v0.16b,v1.16b,#8 ext v6.16b,v2.16b,v0.16b,#8 add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" .inst 0xcec08272 //sha512su0 v18.16b,v19.16b ext v7.16b,v22.16b,v23.16b,#8 .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b .inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b add v3.2d,v2.2d,v1.2d // "D + T1" .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b add v25.2d,v25.2d,v19.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v3.16b,v0.16b,#8 ext v6.16b,v4.16b,v3.16b,#8 add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" .inst 0xcec08293 //sha512su0 v19.16b,v20.16b ext v7.16b,v23.16b,v16.16b,#8 .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b .inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b add v2.2d,v4.2d,v0.2d // "D + T1" .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b add v24.2d,v24.2d,v20.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v2.16b,v3.16b,#8 ext v6.16b,v1.16b,v2.16b,#8 add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" .inst 0xcec082b4 //sha512su0 v20.16b,v21.16b ext v7.16b,v16.16b,v17.16b,#8 .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b .inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b add v4.2d,v1.2d,v3.2d // "D + T1" .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b add v25.2d,v25.2d,v21.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v4.16b,v2.16b,#8 ext v6.16b,v0.16b,v4.16b,#8 add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" .inst 0xcec082d5 //sha512su0 v21.16b,v22.16b ext v7.16b,v17.16b,v18.16b,#8 .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b .inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b add v1.2d,v0.2d,v2.2d // "D + T1" .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b add v24.2d,v24.2d,v22.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v1.16b,v4.16b,#8 ext v6.16b,v3.16b,v1.16b,#8 add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" .inst 0xcec082f6 //sha512su0 v22.16b,v23.16b ext v7.16b,v18.16b,v19.16b,#8 .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b .inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b add v0.2d,v3.2d,v4.2d // "D + T1" .inst 0xce628464 //sha512h2 
v4.16b,v3.16b,v2.16b add v25.2d,v25.2d,v23.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v0.16b,v1.16b,#8 ext v6.16b,v2.16b,v0.16b,#8 add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" .inst 0xcec08217 //sha512su0 v23.16b,v16.16b ext v7.16b,v19.16b,v20.16b,#8 .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b .inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b add v3.2d,v2.2d,v1.2d // "D + T1" .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b add v24.2d,v24.2d,v16.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v3.16b,v0.16b,#8 ext v6.16b,v4.16b,v3.16b,#8 add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" .inst 0xcec08230 //sha512su0 v16.16b,v17.16b ext v7.16b,v20.16b,v21.16b,#8 .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b .inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b add v2.2d,v4.2d,v0.2d // "D + T1" .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b add v25.2d,v25.2d,v17.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v2.16b,v3.16b,#8 ext v6.16b,v1.16b,v2.16b,#8 add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" .inst 0xcec08251 //sha512su0 v17.16b,v18.16b ext v7.16b,v21.16b,v22.16b,#8 .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b .inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b add v4.2d,v1.2d,v3.2d // "D + T1" .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b add v24.2d,v24.2d,v18.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v4.16b,v2.16b,#8 ext v6.16b,v0.16b,v4.16b,#8 add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" .inst 0xcec08272 //sha512su0 v18.16b,v19.16b ext v7.16b,v22.16b,v23.16b,#8 .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b .inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b add v1.2d,v0.2d,v2.2d // "D + T1" .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b add v25.2d,v25.2d,v19.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v1.16b,v4.16b,#8 ext v6.16b,v3.16b,v1.16b,#8 add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" .inst 0xcec08293 //sha512su0 v19.16b,v20.16b ext v7.16b,v23.16b,v16.16b,#8 .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b .inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b add v0.2d,v3.2d,v4.2d // "D + T1" .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b add v24.2d,v24.2d,v20.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v0.16b,v1.16b,#8 ext v6.16b,v2.16b,v0.16b,#8 add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" .inst 0xcec082b4 //sha512su0 v20.16b,v21.16b ext v7.16b,v16.16b,v17.16b,#8 .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b .inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b add v3.2d,v2.2d,v1.2d // "D + T1" .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b add v25.2d,v25.2d,v21.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v3.16b,v0.16b,#8 ext v6.16b,v4.16b,v3.16b,#8 add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" .inst 0xcec082d5 //sha512su0 v21.16b,v22.16b ext v7.16b,v17.16b,v18.16b,#8 .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b .inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b add v2.2d,v4.2d,v0.2d // "D + T1" .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b add v24.2d,v24.2d,v22.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v2.16b,v3.16b,#8 ext v6.16b,v1.16b,v2.16b,#8 add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" .inst 0xcec082f6 //sha512su0 v22.16b,v23.16b ext v7.16b,v18.16b,v19.16b,#8 .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b .inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b add v4.2d,v1.2d,v3.2d // "D + T1" .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b add v25.2d,v25.2d,v23.2d ld1 
{v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v4.16b,v2.16b,#8 ext v6.16b,v0.16b,v4.16b,#8 add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" .inst 0xcec08217 //sha512su0 v23.16b,v16.16b ext v7.16b,v19.16b,v20.16b,#8 .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b .inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b add v1.2d,v0.2d,v2.2d // "D + T1" .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b ld1 {v25.2d},[x3],#16 add v24.2d,v24.2d,v16.2d ld1 {v16.16b},[x1],#16 // load next input ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v1.16b,v4.16b,#8 ext v6.16b,v3.16b,v1.16b,#8 add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b rev64 v16.16b,v16.16b add v0.2d,v3.2d,v4.2d // "D + T1" .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b ld1 {v24.2d},[x3],#16 add v25.2d,v25.2d,v17.2d ld1 {v17.16b},[x1],#16 // load next input ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v0.16b,v1.16b,#8 ext v6.16b,v2.16b,v0.16b,#8 add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b rev64 v17.16b,v17.16b add v3.2d,v2.2d,v1.2d // "D + T1" .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b ld1 {v25.2d},[x3],#16 add v24.2d,v24.2d,v18.2d ld1 {v18.16b},[x1],#16 // load next input ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v3.16b,v0.16b,#8 ext v6.16b,v4.16b,v3.16b,#8 add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b rev64 v18.16b,v18.16b add v2.2d,v4.2d,v0.2d // "D + T1" .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b ld1 {v24.2d},[x3],#16 add v25.2d,v25.2d,v19.2d ld1 {v19.16b},[x1],#16 // load next input ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v2.16b,v3.16b,#8 ext v6.16b,v1.16b,v2.16b,#8 add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b rev64 v19.16b,v19.16b add v4.2d,v1.2d,v3.2d // "D + T1" .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b ld1 {v25.2d},[x3],#16 add v24.2d,v24.2d,v20.2d ld1 {v20.16b},[x1],#16 // load next input ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v4.16b,v2.16b,#8 ext v6.16b,v0.16b,v4.16b,#8 add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b rev64 v20.16b,v20.16b add v1.2d,v0.2d,v2.2d // "D + T1" .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b ld1 {v24.2d},[x3],#16 add v25.2d,v25.2d,v21.2d ld1 {v21.16b},[x1],#16 // load next input ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v1.16b,v4.16b,#8 ext v6.16b,v3.16b,v1.16b,#8 add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b rev64 v21.16b,v21.16b add v0.2d,v3.2d,v4.2d // "D + T1" .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b ld1 {v25.2d},[x3],#16 add v24.2d,v24.2d,v22.2d ld1 {v22.16b},[x1],#16 // load next input ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v0.16b,v1.16b,#8 ext v6.16b,v2.16b,v0.16b,#8 add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b rev64 v22.16b,v22.16b add v3.2d,v2.2d,v1.2d // "D + T1" .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b sub x3,x3,#80*8 // rewind add v25.2d,v25.2d,v23.2d ld1 {v23.16b},[x1],#16 // load next input ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v3.16b,v0.16b,#8 ext v6.16b,v4.16b,v3.16b,#8 add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b rev64 v23.16b,v23.16b add v2.2d,v4.2d,v0.2d // "D + T1" .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b add v0.2d,v0.2d,v26.2d // accumulate add v1.2d,v1.2d,v27.2d add v2.2d,v2.2d,v28.2d add v3.2d,v3.2d,v29.2d cbnz x2,.Loop_hw st1 
{v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context ldr x29,[sp],#16 ret .size sha512_block_data_order_hw,.-sha512_block_data_order_hw #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) ring-0.17.14/pregenerated/sha512-armv8-win64.S000064400000000000000000001375571046102023000165530ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) // Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // ==================================================================== // Written by Andy Polyakov for the OpenSSL // project. // ==================================================================== // // SHA256/512 for ARMv8. // // Performance in cycles per processed byte and improvement coefficient // over code generated with "default" compiler: // // SHA256-hw SHA256(*) SHA512 // Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) // Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) // Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) // Denver 2.01 10.5 (+26%) 6.70 (+8%) // X-Gene 20.0 (+100%) 12.8 (+300%(***)) // Mongoose 2.36 13.0 (+50%) 8.36 (+33%) // Kryo 1.92 17.4 (+30%) 11.2 (+8%) // // (*) Software SHA256 results are of lesser relevance, presented // mostly for informational purposes. // (**) The result is a trade-off: it's possible to improve it by // 10% (or by 1 cycle per round), but at the cost of 20% loss // on Cortex-A53 (or by 4 cycles per round). // (***) Super-impressive coefficients over gcc-generated code are // indication of some compiler "pathology", most notably code // generated with -mgeneral-regs-only is significantly faster // and the gap is only 40-90%. #ifndef __KERNEL__ #endif .text .globl sha512_block_data_order_nohw .def sha512_block_data_order_nohw .type 32 .endef .align 6 sha512_block_data_order_nohw: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-128]! 
add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] sub sp,sp,#4*8 ldp x20,x21,[x0] // load context ldp x22,x23,[x0,#2*8] ldp x24,x25,[x0,#4*8] add x2,x1,x2,lsl#7 // end of input ldp x26,x27,[x0,#6*8] adrp x30,LK512 add x30,x30,:lo12:LK512 stp x0,x2,[x29,#96] Loop: ldp x3,x4,[x1],#2*8 ldr x19,[x30],#8 // *K++ eor x28,x21,x22 // magic seed str x1,[x29,#112] #ifndef __AARCH64EB__ rev x3,x3 // 0 #endif ror x16,x24,#14 add x27,x27,x19 // h+=K[i] eor x6,x24,x24,ror#23 and x17,x25,x24 bic x19,x26,x24 add x27,x27,x3 // h+=X[i] orr x17,x17,x19 // Ch(e,f,g) eor x19,x20,x21 // a^b, b^c in next round eor x16,x16,x6,ror#18 // Sigma1(e) ror x6,x20,#28 add x27,x27,x17 // h+=Ch(e,f,g) eor x17,x20,x20,ror#5 add x27,x27,x16 // h+=Sigma1(e) and x28,x28,x19 // (b^c)&=(a^b) add x23,x23,x27 // d+=h eor x28,x28,x21 // Maj(a,b,c) eor x17,x6,x17,ror#34 // Sigma0(a) add x27,x27,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round //add x27,x27,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x4,x4 // 1 #endif ldp x5,x6,[x1],#2*8 add x27,x27,x17 // h+=Sigma0(a) ror x16,x23,#14 add x26,x26,x28 // h+=K[i] eor x7,x23,x23,ror#23 and x17,x24,x23 bic x28,x25,x23 add x26,x26,x4 // h+=X[i] orr x17,x17,x28 // Ch(e,f,g) eor x28,x27,x20 // a^b, b^c in next round eor x16,x16,x7,ror#18 // Sigma1(e) ror x7,x27,#28 add x26,x26,x17 // h+=Ch(e,f,g) eor x17,x27,x27,ror#5 add x26,x26,x16 // h+=Sigma1(e) and x19,x19,x28 // (b^c)&=(a^b) add x22,x22,x26 // d+=h eor x19,x19,x20 // Maj(a,b,c) eor x17,x7,x17,ror#34 // Sigma0(a) add x26,x26,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round //add x26,x26,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x5,x5 // 2 #endif add x26,x26,x17 // h+=Sigma0(a) ror x16,x22,#14 add x25,x25,x19 // h+=K[i] eor x8,x22,x22,ror#23 and x17,x23,x22 bic x19,x24,x22 add x25,x25,x5 // h+=X[i] orr x17,x17,x19 // Ch(e,f,g) eor x19,x26,x27 // a^b, b^c in next round eor x16,x16,x8,ror#18 // Sigma1(e) ror x8,x26,#28 add x25,x25,x17 // h+=Ch(e,f,g) eor x17,x26,x26,ror#5 add x25,x25,x16 // h+=Sigma1(e) and x28,x28,x19 // (b^c)&=(a^b) add x21,x21,x25 // d+=h eor x28,x28,x27 // Maj(a,b,c) eor x17,x8,x17,ror#34 // Sigma0(a) add x25,x25,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round //add x25,x25,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x6,x6 // 3 #endif ldp x7,x8,[x1],#2*8 add x25,x25,x17 // h+=Sigma0(a) ror x16,x21,#14 add x24,x24,x28 // h+=K[i] eor x9,x21,x21,ror#23 and x17,x22,x21 bic x28,x23,x21 add x24,x24,x6 // h+=X[i] orr x17,x17,x28 // Ch(e,f,g) eor x28,x25,x26 // a^b, b^c in next round eor x16,x16,x9,ror#18 // Sigma1(e) ror x9,x25,#28 add x24,x24,x17 // h+=Ch(e,f,g) eor x17,x25,x25,ror#5 add x24,x24,x16 // h+=Sigma1(e) and x19,x19,x28 // (b^c)&=(a^b) add x20,x20,x24 // d+=h eor x19,x19,x26 // Maj(a,b,c) eor x17,x9,x17,ror#34 // Sigma0(a) add x24,x24,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round //add x24,x24,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x7,x7 // 4 #endif add x24,x24,x17 // h+=Sigma0(a) ror x16,x20,#14 add x23,x23,x19 // h+=K[i] eor x10,x20,x20,ror#23 and x17,x21,x20 bic x19,x22,x20 add x23,x23,x7 // h+=X[i] orr x17,x17,x19 // Ch(e,f,g) eor x19,x24,x25 // a^b, b^c in next round eor x16,x16,x10,ror#18 // Sigma1(e) ror x10,x24,#28 add x23,x23,x17 // h+=Ch(e,f,g) eor x17,x24,x24,ror#5 add x23,x23,x16 // h+=Sigma1(e) and x28,x28,x19 // (b^c)&=(a^b) add x27,x27,x23 // d+=h eor x28,x28,x25 // Maj(a,b,c) eor x17,x10,x17,ror#34 // Sigma0(a) add x23,x23,x28 // h+=Maj(a,b,c) ldr 
x28,[x30],#8 // *K++, x19 in next round //add x23,x23,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x8,x8 // 5 #endif ldp x9,x10,[x1],#2*8 add x23,x23,x17 // h+=Sigma0(a) ror x16,x27,#14 add x22,x22,x28 // h+=K[i] eor x11,x27,x27,ror#23 and x17,x20,x27 bic x28,x21,x27 add x22,x22,x8 // h+=X[i] orr x17,x17,x28 // Ch(e,f,g) eor x28,x23,x24 // a^b, b^c in next round eor x16,x16,x11,ror#18 // Sigma1(e) ror x11,x23,#28 add x22,x22,x17 // h+=Ch(e,f,g) eor x17,x23,x23,ror#5 add x22,x22,x16 // h+=Sigma1(e) and x19,x19,x28 // (b^c)&=(a^b) add x26,x26,x22 // d+=h eor x19,x19,x24 // Maj(a,b,c) eor x17,x11,x17,ror#34 // Sigma0(a) add x22,x22,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round //add x22,x22,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x9,x9 // 6 #endif add x22,x22,x17 // h+=Sigma0(a) ror x16,x26,#14 add x21,x21,x19 // h+=K[i] eor x12,x26,x26,ror#23 and x17,x27,x26 bic x19,x20,x26 add x21,x21,x9 // h+=X[i] orr x17,x17,x19 // Ch(e,f,g) eor x19,x22,x23 // a^b, b^c in next round eor x16,x16,x12,ror#18 // Sigma1(e) ror x12,x22,#28 add x21,x21,x17 // h+=Ch(e,f,g) eor x17,x22,x22,ror#5 add x21,x21,x16 // h+=Sigma1(e) and x28,x28,x19 // (b^c)&=(a^b) add x25,x25,x21 // d+=h eor x28,x28,x23 // Maj(a,b,c) eor x17,x12,x17,ror#34 // Sigma0(a) add x21,x21,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round //add x21,x21,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x10,x10 // 7 #endif ldp x11,x12,[x1],#2*8 add x21,x21,x17 // h+=Sigma0(a) ror x16,x25,#14 add x20,x20,x28 // h+=K[i] eor x13,x25,x25,ror#23 and x17,x26,x25 bic x28,x27,x25 add x20,x20,x10 // h+=X[i] orr x17,x17,x28 // Ch(e,f,g) eor x28,x21,x22 // a^b, b^c in next round eor x16,x16,x13,ror#18 // Sigma1(e) ror x13,x21,#28 add x20,x20,x17 // h+=Ch(e,f,g) eor x17,x21,x21,ror#5 add x20,x20,x16 // h+=Sigma1(e) and x19,x19,x28 // (b^c)&=(a^b) add x24,x24,x20 // d+=h eor x19,x19,x22 // Maj(a,b,c) eor x17,x13,x17,ror#34 // Sigma0(a) add x20,x20,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round //add x20,x20,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x11,x11 // 8 #endif add x20,x20,x17 // h+=Sigma0(a) ror x16,x24,#14 add x27,x27,x19 // h+=K[i] eor x14,x24,x24,ror#23 and x17,x25,x24 bic x19,x26,x24 add x27,x27,x11 // h+=X[i] orr x17,x17,x19 // Ch(e,f,g) eor x19,x20,x21 // a^b, b^c in next round eor x16,x16,x14,ror#18 // Sigma1(e) ror x14,x20,#28 add x27,x27,x17 // h+=Ch(e,f,g) eor x17,x20,x20,ror#5 add x27,x27,x16 // h+=Sigma1(e) and x28,x28,x19 // (b^c)&=(a^b) add x23,x23,x27 // d+=h eor x28,x28,x21 // Maj(a,b,c) eor x17,x14,x17,ror#34 // Sigma0(a) add x27,x27,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round //add x27,x27,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x12,x12 // 9 #endif ldp x13,x14,[x1],#2*8 add x27,x27,x17 // h+=Sigma0(a) ror x16,x23,#14 add x26,x26,x28 // h+=K[i] eor x15,x23,x23,ror#23 and x17,x24,x23 bic x28,x25,x23 add x26,x26,x12 // h+=X[i] orr x17,x17,x28 // Ch(e,f,g) eor x28,x27,x20 // a^b, b^c in next round eor x16,x16,x15,ror#18 // Sigma1(e) ror x15,x27,#28 add x26,x26,x17 // h+=Ch(e,f,g) eor x17,x27,x27,ror#5 add x26,x26,x16 // h+=Sigma1(e) and x19,x19,x28 // (b^c)&=(a^b) add x22,x22,x26 // d+=h eor x19,x19,x20 // Maj(a,b,c) eor x17,x15,x17,ror#34 // Sigma0(a) add x26,x26,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round //add x26,x26,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x13,x13 // 10 #endif add x26,x26,x17 // h+=Sigma0(a) ror x16,x22,#14 add x25,x25,x19 // h+=K[i] eor x0,x22,x22,ror#23 and x17,x23,x22 bic x19,x24,x22 add x25,x25,x13 // 
h+=X[i] orr x17,x17,x19 // Ch(e,f,g) eor x19,x26,x27 // a^b, b^c in next round eor x16,x16,x0,ror#18 // Sigma1(e) ror x0,x26,#28 add x25,x25,x17 // h+=Ch(e,f,g) eor x17,x26,x26,ror#5 add x25,x25,x16 // h+=Sigma1(e) and x28,x28,x19 // (b^c)&=(a^b) add x21,x21,x25 // d+=h eor x28,x28,x27 // Maj(a,b,c) eor x17,x0,x17,ror#34 // Sigma0(a) add x25,x25,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round //add x25,x25,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x14,x14 // 11 #endif ldp x15,x0,[x1],#2*8 add x25,x25,x17 // h+=Sigma0(a) str x6,[sp,#24] ror x16,x21,#14 add x24,x24,x28 // h+=K[i] eor x6,x21,x21,ror#23 and x17,x22,x21 bic x28,x23,x21 add x24,x24,x14 // h+=X[i] orr x17,x17,x28 // Ch(e,f,g) eor x28,x25,x26 // a^b, b^c in next round eor x16,x16,x6,ror#18 // Sigma1(e) ror x6,x25,#28 add x24,x24,x17 // h+=Ch(e,f,g) eor x17,x25,x25,ror#5 add x24,x24,x16 // h+=Sigma1(e) and x19,x19,x28 // (b^c)&=(a^b) add x20,x20,x24 // d+=h eor x19,x19,x26 // Maj(a,b,c) eor x17,x6,x17,ror#34 // Sigma0(a) add x24,x24,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round //add x24,x24,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x15,x15 // 12 #endif add x24,x24,x17 // h+=Sigma0(a) str x7,[sp,#0] ror x16,x20,#14 add x23,x23,x19 // h+=K[i] eor x7,x20,x20,ror#23 and x17,x21,x20 bic x19,x22,x20 add x23,x23,x15 // h+=X[i] orr x17,x17,x19 // Ch(e,f,g) eor x19,x24,x25 // a^b, b^c in next round eor x16,x16,x7,ror#18 // Sigma1(e) ror x7,x24,#28 add x23,x23,x17 // h+=Ch(e,f,g) eor x17,x24,x24,ror#5 add x23,x23,x16 // h+=Sigma1(e) and x28,x28,x19 // (b^c)&=(a^b) add x27,x27,x23 // d+=h eor x28,x28,x25 // Maj(a,b,c) eor x17,x7,x17,ror#34 // Sigma0(a) add x23,x23,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round //add x23,x23,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x0,x0 // 13 #endif ldp x1,x2,[x1] add x23,x23,x17 // h+=Sigma0(a) str x8,[sp,#8] ror x16,x27,#14 add x22,x22,x28 // h+=K[i] eor x8,x27,x27,ror#23 and x17,x20,x27 bic x28,x21,x27 add x22,x22,x0 // h+=X[i] orr x17,x17,x28 // Ch(e,f,g) eor x28,x23,x24 // a^b, b^c in next round eor x16,x16,x8,ror#18 // Sigma1(e) ror x8,x23,#28 add x22,x22,x17 // h+=Ch(e,f,g) eor x17,x23,x23,ror#5 add x22,x22,x16 // h+=Sigma1(e) and x19,x19,x28 // (b^c)&=(a^b) add x26,x26,x22 // d+=h eor x19,x19,x24 // Maj(a,b,c) eor x17,x8,x17,ror#34 // Sigma0(a) add x22,x22,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round //add x22,x22,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x1,x1 // 14 #endif ldr x6,[sp,#24] add x22,x22,x17 // h+=Sigma0(a) str x9,[sp,#16] ror x16,x26,#14 add x21,x21,x19 // h+=K[i] eor x9,x26,x26,ror#23 and x17,x27,x26 bic x19,x20,x26 add x21,x21,x1 // h+=X[i] orr x17,x17,x19 // Ch(e,f,g) eor x19,x22,x23 // a^b, b^c in next round eor x16,x16,x9,ror#18 // Sigma1(e) ror x9,x22,#28 add x21,x21,x17 // h+=Ch(e,f,g) eor x17,x22,x22,ror#5 add x21,x21,x16 // h+=Sigma1(e) and x28,x28,x19 // (b^c)&=(a^b) add x25,x25,x21 // d+=h eor x28,x28,x23 // Maj(a,b,c) eor x17,x9,x17,ror#34 // Sigma0(a) add x21,x21,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round //add x21,x21,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x2,x2 // 15 #endif ldr x7,[sp,#0] add x21,x21,x17 // h+=Sigma0(a) str x10,[sp,#24] ror x16,x25,#14 add x20,x20,x28 // h+=K[i] ror x9,x4,#1 and x17,x26,x25 ror x8,x1,#19 bic x28,x27,x25 ror x10,x21,#28 add x20,x20,x2 // h+=X[i] eor x16,x16,x25,ror#18 eor x9,x9,x4,ror#8 orr x17,x17,x28 // Ch(e,f,g) eor x28,x21,x22 // a^b, b^c in next round eor x16,x16,x25,ror#41 // Sigma1(e) eor x10,x10,x21,ror#34 add 
x20,x20,x17 // h+=Ch(e,f,g) and x19,x19,x28 // (b^c)&=(a^b) eor x8,x8,x1,ror#61 eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) add x20,x20,x16 // h+=Sigma1(e) eor x19,x19,x22 // Maj(a,b,c) eor x17,x10,x21,ror#39 // Sigma0(a) eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) add x3,x3,x12 add x24,x24,x20 // d+=h add x20,x20,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round add x3,x3,x9 add x20,x20,x17 // h+=Sigma0(a) add x3,x3,x8 Loop_16_xx: ldr x8,[sp,#8] str x11,[sp,#0] ror x16,x24,#14 add x27,x27,x19 // h+=K[i] ror x10,x5,#1 and x17,x25,x24 ror x9,x2,#19 bic x19,x26,x24 ror x11,x20,#28 add x27,x27,x3 // h+=X[i] eor x16,x16,x24,ror#18 eor x10,x10,x5,ror#8 orr x17,x17,x19 // Ch(e,f,g) eor x19,x20,x21 // a^b, b^c in next round eor x16,x16,x24,ror#41 // Sigma1(e) eor x11,x11,x20,ror#34 add x27,x27,x17 // h+=Ch(e,f,g) and x28,x28,x19 // (b^c)&=(a^b) eor x9,x9,x2,ror#61 eor x10,x10,x5,lsr#7 // sigma0(X[i+1]) add x27,x27,x16 // h+=Sigma1(e) eor x28,x28,x21 // Maj(a,b,c) eor x17,x11,x20,ror#39 // Sigma0(a) eor x9,x9,x2,lsr#6 // sigma1(X[i+14]) add x4,x4,x13 add x23,x23,x27 // d+=h add x27,x27,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round add x4,x4,x10 add x27,x27,x17 // h+=Sigma0(a) add x4,x4,x9 ldr x9,[sp,#16] str x12,[sp,#8] ror x16,x23,#14 add x26,x26,x28 // h+=K[i] ror x11,x6,#1 and x17,x24,x23 ror x10,x3,#19 bic x28,x25,x23 ror x12,x27,#28 add x26,x26,x4 // h+=X[i] eor x16,x16,x23,ror#18 eor x11,x11,x6,ror#8 orr x17,x17,x28 // Ch(e,f,g) eor x28,x27,x20 // a^b, b^c in next round eor x16,x16,x23,ror#41 // Sigma1(e) eor x12,x12,x27,ror#34 add x26,x26,x17 // h+=Ch(e,f,g) and x19,x19,x28 // (b^c)&=(a^b) eor x10,x10,x3,ror#61 eor x11,x11,x6,lsr#7 // sigma0(X[i+1]) add x26,x26,x16 // h+=Sigma1(e) eor x19,x19,x20 // Maj(a,b,c) eor x17,x12,x27,ror#39 // Sigma0(a) eor x10,x10,x3,lsr#6 // sigma1(X[i+14]) add x5,x5,x14 add x22,x22,x26 // d+=h add x26,x26,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round add x5,x5,x11 add x26,x26,x17 // h+=Sigma0(a) add x5,x5,x10 ldr x10,[sp,#24] str x13,[sp,#16] ror x16,x22,#14 add x25,x25,x19 // h+=K[i] ror x12,x7,#1 and x17,x23,x22 ror x11,x4,#19 bic x19,x24,x22 ror x13,x26,#28 add x25,x25,x5 // h+=X[i] eor x16,x16,x22,ror#18 eor x12,x12,x7,ror#8 orr x17,x17,x19 // Ch(e,f,g) eor x19,x26,x27 // a^b, b^c in next round eor x16,x16,x22,ror#41 // Sigma1(e) eor x13,x13,x26,ror#34 add x25,x25,x17 // h+=Ch(e,f,g) and x28,x28,x19 // (b^c)&=(a^b) eor x11,x11,x4,ror#61 eor x12,x12,x7,lsr#7 // sigma0(X[i+1]) add x25,x25,x16 // h+=Sigma1(e) eor x28,x28,x27 // Maj(a,b,c) eor x17,x13,x26,ror#39 // Sigma0(a) eor x11,x11,x4,lsr#6 // sigma1(X[i+14]) add x6,x6,x15 add x21,x21,x25 // d+=h add x25,x25,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round add x6,x6,x12 add x25,x25,x17 // h+=Sigma0(a) add x6,x6,x11 ldr x11,[sp,#0] str x14,[sp,#24] ror x16,x21,#14 add x24,x24,x28 // h+=K[i] ror x13,x8,#1 and x17,x22,x21 ror x12,x5,#19 bic x28,x23,x21 ror x14,x25,#28 add x24,x24,x6 // h+=X[i] eor x16,x16,x21,ror#18 eor x13,x13,x8,ror#8 orr x17,x17,x28 // Ch(e,f,g) eor x28,x25,x26 // a^b, b^c in next round eor x16,x16,x21,ror#41 // Sigma1(e) eor x14,x14,x25,ror#34 add x24,x24,x17 // h+=Ch(e,f,g) and x19,x19,x28 // (b^c)&=(a^b) eor x12,x12,x5,ror#61 eor x13,x13,x8,lsr#7 // sigma0(X[i+1]) add x24,x24,x16 // h+=Sigma1(e) eor x19,x19,x26 // Maj(a,b,c) eor x17,x14,x25,ror#39 // Sigma0(a) eor x12,x12,x5,lsr#6 // sigma1(X[i+14]) add x7,x7,x0 add x20,x20,x24 // d+=h add x24,x24,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round add x7,x7,x13 add 
x24,x24,x17 // h+=Sigma0(a) add x7,x7,x12 ldr x12,[sp,#8] str x15,[sp,#0] ror x16,x20,#14 add x23,x23,x19 // h+=K[i] ror x14,x9,#1 and x17,x21,x20 ror x13,x6,#19 bic x19,x22,x20 ror x15,x24,#28 add x23,x23,x7 // h+=X[i] eor x16,x16,x20,ror#18 eor x14,x14,x9,ror#8 orr x17,x17,x19 // Ch(e,f,g) eor x19,x24,x25 // a^b, b^c in next round eor x16,x16,x20,ror#41 // Sigma1(e) eor x15,x15,x24,ror#34 add x23,x23,x17 // h+=Ch(e,f,g) and x28,x28,x19 // (b^c)&=(a^b) eor x13,x13,x6,ror#61 eor x14,x14,x9,lsr#7 // sigma0(X[i+1]) add x23,x23,x16 // h+=Sigma1(e) eor x28,x28,x25 // Maj(a,b,c) eor x17,x15,x24,ror#39 // Sigma0(a) eor x13,x13,x6,lsr#6 // sigma1(X[i+14]) add x8,x8,x1 add x27,x27,x23 // d+=h add x23,x23,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round add x8,x8,x14 add x23,x23,x17 // h+=Sigma0(a) add x8,x8,x13 ldr x13,[sp,#16] str x0,[sp,#8] ror x16,x27,#14 add x22,x22,x28 // h+=K[i] ror x15,x10,#1 and x17,x20,x27 ror x14,x7,#19 bic x28,x21,x27 ror x0,x23,#28 add x22,x22,x8 // h+=X[i] eor x16,x16,x27,ror#18 eor x15,x15,x10,ror#8 orr x17,x17,x28 // Ch(e,f,g) eor x28,x23,x24 // a^b, b^c in next round eor x16,x16,x27,ror#41 // Sigma1(e) eor x0,x0,x23,ror#34 add x22,x22,x17 // h+=Ch(e,f,g) and x19,x19,x28 // (b^c)&=(a^b) eor x14,x14,x7,ror#61 eor x15,x15,x10,lsr#7 // sigma0(X[i+1]) add x22,x22,x16 // h+=Sigma1(e) eor x19,x19,x24 // Maj(a,b,c) eor x17,x0,x23,ror#39 // Sigma0(a) eor x14,x14,x7,lsr#6 // sigma1(X[i+14]) add x9,x9,x2 add x26,x26,x22 // d+=h add x22,x22,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round add x9,x9,x15 add x22,x22,x17 // h+=Sigma0(a) add x9,x9,x14 ldr x14,[sp,#24] str x1,[sp,#16] ror x16,x26,#14 add x21,x21,x19 // h+=K[i] ror x0,x11,#1 and x17,x27,x26 ror x15,x8,#19 bic x19,x20,x26 ror x1,x22,#28 add x21,x21,x9 // h+=X[i] eor x16,x16,x26,ror#18 eor x0,x0,x11,ror#8 orr x17,x17,x19 // Ch(e,f,g) eor x19,x22,x23 // a^b, b^c in next round eor x16,x16,x26,ror#41 // Sigma1(e) eor x1,x1,x22,ror#34 add x21,x21,x17 // h+=Ch(e,f,g) and x28,x28,x19 // (b^c)&=(a^b) eor x15,x15,x8,ror#61 eor x0,x0,x11,lsr#7 // sigma0(X[i+1]) add x21,x21,x16 // h+=Sigma1(e) eor x28,x28,x23 // Maj(a,b,c) eor x17,x1,x22,ror#39 // Sigma0(a) eor x15,x15,x8,lsr#6 // sigma1(X[i+14]) add x10,x10,x3 add x25,x25,x21 // d+=h add x21,x21,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round add x10,x10,x0 add x21,x21,x17 // h+=Sigma0(a) add x10,x10,x15 ldr x15,[sp,#0] str x2,[sp,#24] ror x16,x25,#14 add x20,x20,x28 // h+=K[i] ror x1,x12,#1 and x17,x26,x25 ror x0,x9,#19 bic x28,x27,x25 ror x2,x21,#28 add x20,x20,x10 // h+=X[i] eor x16,x16,x25,ror#18 eor x1,x1,x12,ror#8 orr x17,x17,x28 // Ch(e,f,g) eor x28,x21,x22 // a^b, b^c in next round eor x16,x16,x25,ror#41 // Sigma1(e) eor x2,x2,x21,ror#34 add x20,x20,x17 // h+=Ch(e,f,g) and x19,x19,x28 // (b^c)&=(a^b) eor x0,x0,x9,ror#61 eor x1,x1,x12,lsr#7 // sigma0(X[i+1]) add x20,x20,x16 // h+=Sigma1(e) eor x19,x19,x22 // Maj(a,b,c) eor x17,x2,x21,ror#39 // Sigma0(a) eor x0,x0,x9,lsr#6 // sigma1(X[i+14]) add x11,x11,x4 add x24,x24,x20 // d+=h add x20,x20,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round add x11,x11,x1 add x20,x20,x17 // h+=Sigma0(a) add x11,x11,x0 ldr x0,[sp,#8] str x3,[sp,#0] ror x16,x24,#14 add x27,x27,x19 // h+=K[i] ror x2,x13,#1 and x17,x25,x24 ror x1,x10,#19 bic x19,x26,x24 ror x3,x20,#28 add x27,x27,x11 // h+=X[i] eor x16,x16,x24,ror#18 eor x2,x2,x13,ror#8 orr x17,x17,x19 // Ch(e,f,g) eor x19,x20,x21 // a^b, b^c in next round eor x16,x16,x24,ror#41 // Sigma1(e) eor x3,x3,x20,ror#34 add x27,x27,x17 
// h+=Ch(e,f,g) and x28,x28,x19 // (b^c)&=(a^b) eor x1,x1,x10,ror#61 eor x2,x2,x13,lsr#7 // sigma0(X[i+1]) add x27,x27,x16 // h+=Sigma1(e) eor x28,x28,x21 // Maj(a,b,c) eor x17,x3,x20,ror#39 // Sigma0(a) eor x1,x1,x10,lsr#6 // sigma1(X[i+14]) add x12,x12,x5 add x23,x23,x27 // d+=h add x27,x27,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round add x12,x12,x2 add x27,x27,x17 // h+=Sigma0(a) add x12,x12,x1 ldr x1,[sp,#16] str x4,[sp,#8] ror x16,x23,#14 add x26,x26,x28 // h+=K[i] ror x3,x14,#1 and x17,x24,x23 ror x2,x11,#19 bic x28,x25,x23 ror x4,x27,#28 add x26,x26,x12 // h+=X[i] eor x16,x16,x23,ror#18 eor x3,x3,x14,ror#8 orr x17,x17,x28 // Ch(e,f,g) eor x28,x27,x20 // a^b, b^c in next round eor x16,x16,x23,ror#41 // Sigma1(e) eor x4,x4,x27,ror#34 add x26,x26,x17 // h+=Ch(e,f,g) and x19,x19,x28 // (b^c)&=(a^b) eor x2,x2,x11,ror#61 eor x3,x3,x14,lsr#7 // sigma0(X[i+1]) add x26,x26,x16 // h+=Sigma1(e) eor x19,x19,x20 // Maj(a,b,c) eor x17,x4,x27,ror#39 // Sigma0(a) eor x2,x2,x11,lsr#6 // sigma1(X[i+14]) add x13,x13,x6 add x22,x22,x26 // d+=h add x26,x26,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round add x13,x13,x3 add x26,x26,x17 // h+=Sigma0(a) add x13,x13,x2 ldr x2,[sp,#24] str x5,[sp,#16] ror x16,x22,#14 add x25,x25,x19 // h+=K[i] ror x4,x15,#1 and x17,x23,x22 ror x3,x12,#19 bic x19,x24,x22 ror x5,x26,#28 add x25,x25,x13 // h+=X[i] eor x16,x16,x22,ror#18 eor x4,x4,x15,ror#8 orr x17,x17,x19 // Ch(e,f,g) eor x19,x26,x27 // a^b, b^c in next round eor x16,x16,x22,ror#41 // Sigma1(e) eor x5,x5,x26,ror#34 add x25,x25,x17 // h+=Ch(e,f,g) and x28,x28,x19 // (b^c)&=(a^b) eor x3,x3,x12,ror#61 eor x4,x4,x15,lsr#7 // sigma0(X[i+1]) add x25,x25,x16 // h+=Sigma1(e) eor x28,x28,x27 // Maj(a,b,c) eor x17,x5,x26,ror#39 // Sigma0(a) eor x3,x3,x12,lsr#6 // sigma1(X[i+14]) add x14,x14,x7 add x21,x21,x25 // d+=h add x25,x25,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round add x14,x14,x4 add x25,x25,x17 // h+=Sigma0(a) add x14,x14,x3 ldr x3,[sp,#0] str x6,[sp,#24] ror x16,x21,#14 add x24,x24,x28 // h+=K[i] ror x5,x0,#1 and x17,x22,x21 ror x4,x13,#19 bic x28,x23,x21 ror x6,x25,#28 add x24,x24,x14 // h+=X[i] eor x16,x16,x21,ror#18 eor x5,x5,x0,ror#8 orr x17,x17,x28 // Ch(e,f,g) eor x28,x25,x26 // a^b, b^c in next round eor x16,x16,x21,ror#41 // Sigma1(e) eor x6,x6,x25,ror#34 add x24,x24,x17 // h+=Ch(e,f,g) and x19,x19,x28 // (b^c)&=(a^b) eor x4,x4,x13,ror#61 eor x5,x5,x0,lsr#7 // sigma0(X[i+1]) add x24,x24,x16 // h+=Sigma1(e) eor x19,x19,x26 // Maj(a,b,c) eor x17,x6,x25,ror#39 // Sigma0(a) eor x4,x4,x13,lsr#6 // sigma1(X[i+14]) add x15,x15,x8 add x20,x20,x24 // d+=h add x24,x24,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round add x15,x15,x5 add x24,x24,x17 // h+=Sigma0(a) add x15,x15,x4 ldr x4,[sp,#8] str x7,[sp,#0] ror x16,x20,#14 add x23,x23,x19 // h+=K[i] ror x6,x1,#1 and x17,x21,x20 ror x5,x14,#19 bic x19,x22,x20 ror x7,x24,#28 add x23,x23,x15 // h+=X[i] eor x16,x16,x20,ror#18 eor x6,x6,x1,ror#8 orr x17,x17,x19 // Ch(e,f,g) eor x19,x24,x25 // a^b, b^c in next round eor x16,x16,x20,ror#41 // Sigma1(e) eor x7,x7,x24,ror#34 add x23,x23,x17 // h+=Ch(e,f,g) and x28,x28,x19 // (b^c)&=(a^b) eor x5,x5,x14,ror#61 eor x6,x6,x1,lsr#7 // sigma0(X[i+1]) add x23,x23,x16 // h+=Sigma1(e) eor x28,x28,x25 // Maj(a,b,c) eor x17,x7,x24,ror#39 // Sigma0(a) eor x5,x5,x14,lsr#6 // sigma1(X[i+14]) add x0,x0,x9 add x27,x27,x23 // d+=h add x23,x23,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round add x0,x0,x6 add x23,x23,x17 // h+=Sigma0(a) add x0,x0,x5 ldr 
x5,[sp,#16] str x8,[sp,#8] ror x16,x27,#14 add x22,x22,x28 // h+=K[i] ror x7,x2,#1 and x17,x20,x27 ror x6,x15,#19 bic x28,x21,x27 ror x8,x23,#28 add x22,x22,x0 // h+=X[i] eor x16,x16,x27,ror#18 eor x7,x7,x2,ror#8 orr x17,x17,x28 // Ch(e,f,g) eor x28,x23,x24 // a^b, b^c in next round eor x16,x16,x27,ror#41 // Sigma1(e) eor x8,x8,x23,ror#34 add x22,x22,x17 // h+=Ch(e,f,g) and x19,x19,x28 // (b^c)&=(a^b) eor x6,x6,x15,ror#61 eor x7,x7,x2,lsr#7 // sigma0(X[i+1]) add x22,x22,x16 // h+=Sigma1(e) eor x19,x19,x24 // Maj(a,b,c) eor x17,x8,x23,ror#39 // Sigma0(a) eor x6,x6,x15,lsr#6 // sigma1(X[i+14]) add x1,x1,x10 add x26,x26,x22 // d+=h add x22,x22,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round add x1,x1,x7 add x22,x22,x17 // h+=Sigma0(a) add x1,x1,x6 ldr x6,[sp,#24] str x9,[sp,#16] ror x16,x26,#14 add x21,x21,x19 // h+=K[i] ror x8,x3,#1 and x17,x27,x26 ror x7,x0,#19 bic x19,x20,x26 ror x9,x22,#28 add x21,x21,x1 // h+=X[i] eor x16,x16,x26,ror#18 eor x8,x8,x3,ror#8 orr x17,x17,x19 // Ch(e,f,g) eor x19,x22,x23 // a^b, b^c in next round eor x16,x16,x26,ror#41 // Sigma1(e) eor x9,x9,x22,ror#34 add x21,x21,x17 // h+=Ch(e,f,g) and x28,x28,x19 // (b^c)&=(a^b) eor x7,x7,x0,ror#61 eor x8,x8,x3,lsr#7 // sigma0(X[i+1]) add x21,x21,x16 // h+=Sigma1(e) eor x28,x28,x23 // Maj(a,b,c) eor x17,x9,x22,ror#39 // Sigma0(a) eor x7,x7,x0,lsr#6 // sigma1(X[i+14]) add x2,x2,x11 add x25,x25,x21 // d+=h add x21,x21,x28 // h+=Maj(a,b,c) ldr x28,[x30],#8 // *K++, x19 in next round add x2,x2,x8 add x21,x21,x17 // h+=Sigma0(a) add x2,x2,x7 ldr x7,[sp,#0] str x10,[sp,#24] ror x16,x25,#14 add x20,x20,x28 // h+=K[i] ror x9,x4,#1 and x17,x26,x25 ror x8,x1,#19 bic x28,x27,x25 ror x10,x21,#28 add x20,x20,x2 // h+=X[i] eor x16,x16,x25,ror#18 eor x9,x9,x4,ror#8 orr x17,x17,x28 // Ch(e,f,g) eor x28,x21,x22 // a^b, b^c in next round eor x16,x16,x25,ror#41 // Sigma1(e) eor x10,x10,x21,ror#34 add x20,x20,x17 // h+=Ch(e,f,g) and x19,x19,x28 // (b^c)&=(a^b) eor x8,x8,x1,ror#61 eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) add x20,x20,x16 // h+=Sigma1(e) eor x19,x19,x22 // Maj(a,b,c) eor x17,x10,x21,ror#39 // Sigma0(a) eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) add x3,x3,x12 add x24,x24,x20 // d+=h add x20,x20,x19 // h+=Maj(a,b,c) ldr x19,[x30],#8 // *K++, x28 in next round add x3,x3,x9 add x20,x20,x17 // h+=Sigma0(a) add x3,x3,x8 cbnz x19,Loop_16_xx ldp x0,x2,[x29,#96] ldr x1,[x29,#112] sub x30,x30,#648 // rewind ldp x3,x4,[x0] ldp x5,x6,[x0,#2*8] add x1,x1,#14*8 // advance input pointer ldp x7,x8,[x0,#4*8] add x20,x20,x3 ldp x9,x10,[x0,#6*8] add x21,x21,x4 add x22,x22,x5 add x23,x23,x6 stp x20,x21,[x0] add x24,x24,x7 add x25,x25,x8 stp x22,x23,[x0,#2*8] add x26,x26,x9 add x27,x27,x10 cmp x1,x2 stp x24,x25,[x0,#4*8] stp x26,x27,[x0,#6*8] b.ne Loop ldp x19,x20,[x29,#16] add sp,sp,#4*8 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#128 AARCH64_VALIDATE_LINK_REGISTER ret .section .rodata .align 6 LK512: .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 
0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x81c2c92e47edaee6,0x92722c851482353b .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .quad 0 // terminator .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 .text #ifndef __KERNEL__ .globl sha512_block_data_order_hw .def sha512_block_data_order_hw .type 32 .endef .align 6 sha512_block_data_order_hw: // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. AARCH64_VALID_CALL_TARGET stp x29,x30,[sp,#-16]! add x29,sp,#0 ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context adrp x3,LK512 add x3,x3,:lo12:LK512 rev64 v16.16b,v16.16b rev64 v17.16b,v17.16b rev64 v18.16b,v18.16b rev64 v19.16b,v19.16b rev64 v20.16b,v20.16b rev64 v21.16b,v21.16b rev64 v22.16b,v22.16b rev64 v23.16b,v23.16b b Loop_hw .align 4 Loop_hw: ld1 {v24.2d},[x3],#16 subs x2,x2,#1 sub x4,x1,#128 orr v26.16b,v0.16b,v0.16b // offload orr v27.16b,v1.16b,v1.16b orr v28.16b,v2.16b,v2.16b orr v29.16b,v3.16b,v3.16b csel x1,x1,x4,ne // conditional rewind add v24.2d,v24.2d,v16.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v2.16b,v3.16b,#8 ext v6.16b,v1.16b,v2.16b,#8 add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" .long 0xcec08230 //sha512su0 v16.16b,v17.16b ext v7.16b,v20.16b,v21.16b,#8 .long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b .long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b add v4.2d,v1.2d,v3.2d // "D + T1" .long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b add v25.2d,v25.2d,v17.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v4.16b,v2.16b,#8 ext v6.16b,v0.16b,v4.16b,#8 add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" .long 0xcec08251 //sha512su0 v17.16b,v18.16b ext v7.16b,v21.16b,v22.16b,#8 .long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b .long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b add v1.2d,v0.2d,v2.2d // "D + T1" .long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b add v24.2d,v24.2d,v18.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v1.16b,v4.16b,#8 ext v6.16b,v3.16b,v1.16b,#8 add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" .long 0xcec08272 //sha512su0 v18.16b,v19.16b ext v7.16b,v22.16b,v23.16b,#8 .long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b .long 0xce678a32 
//sha512su1 v18.16b,v17.16b,v7.16b add v0.2d,v3.2d,v4.2d // "D + T1" .long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b add v25.2d,v25.2d,v19.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v0.16b,v1.16b,#8 ext v6.16b,v2.16b,v0.16b,#8 add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" .long 0xcec08293 //sha512su0 v19.16b,v20.16b ext v7.16b,v23.16b,v16.16b,#8 .long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b .long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b add v3.2d,v2.2d,v1.2d // "D + T1" .long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b add v24.2d,v24.2d,v20.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v3.16b,v0.16b,#8 ext v6.16b,v4.16b,v3.16b,#8 add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" .long 0xcec082b4 //sha512su0 v20.16b,v21.16b ext v7.16b,v16.16b,v17.16b,#8 .long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b .long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b add v2.2d,v4.2d,v0.2d // "D + T1" .long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b add v25.2d,v25.2d,v21.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v2.16b,v3.16b,#8 ext v6.16b,v1.16b,v2.16b,#8 add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" .long 0xcec082d5 //sha512su0 v21.16b,v22.16b ext v7.16b,v17.16b,v18.16b,#8 .long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b .long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b add v4.2d,v1.2d,v3.2d // "D + T1" .long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b add v24.2d,v24.2d,v22.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v4.16b,v2.16b,#8 ext v6.16b,v0.16b,v4.16b,#8 add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" .long 0xcec082f6 //sha512su0 v22.16b,v23.16b ext v7.16b,v18.16b,v19.16b,#8 .long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b .long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b add v1.2d,v0.2d,v2.2d // "D + T1" .long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b add v25.2d,v25.2d,v23.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v1.16b,v4.16b,#8 ext v6.16b,v3.16b,v1.16b,#8 add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" .long 0xcec08217 //sha512su0 v23.16b,v16.16b ext v7.16b,v19.16b,v20.16b,#8 .long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b .long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b add v0.2d,v3.2d,v4.2d // "D + T1" .long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b add v24.2d,v24.2d,v16.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v0.16b,v1.16b,#8 ext v6.16b,v2.16b,v0.16b,#8 add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" .long 0xcec08230 //sha512su0 v16.16b,v17.16b ext v7.16b,v20.16b,v21.16b,#8 .long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b .long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b add v3.2d,v2.2d,v1.2d // "D + T1" .long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b add v25.2d,v25.2d,v17.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v3.16b,v0.16b,#8 ext v6.16b,v4.16b,v3.16b,#8 add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" .long 0xcec08251 //sha512su0 v17.16b,v18.16b ext v7.16b,v21.16b,v22.16b,#8 .long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b .long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b add v2.2d,v4.2d,v0.2d // "D + T1" .long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b add v24.2d,v24.2d,v18.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v2.16b,v3.16b,#8 ext v6.16b,v1.16b,v2.16b,#8 add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" .long 0xcec08272 //sha512su0 v18.16b,v19.16b ext v7.16b,v22.16b,v23.16b,#8 .long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b .long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b add v4.2d,v1.2d,v3.2d 
// "D + T1" .long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b add v25.2d,v25.2d,v19.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v4.16b,v2.16b,#8 ext v6.16b,v0.16b,v4.16b,#8 add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" .long 0xcec08293 //sha512su0 v19.16b,v20.16b ext v7.16b,v23.16b,v16.16b,#8 .long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b .long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b add v1.2d,v0.2d,v2.2d // "D + T1" .long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b add v24.2d,v24.2d,v20.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v1.16b,v4.16b,#8 ext v6.16b,v3.16b,v1.16b,#8 add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" .long 0xcec082b4 //sha512su0 v20.16b,v21.16b ext v7.16b,v16.16b,v17.16b,#8 .long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b .long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b add v0.2d,v3.2d,v4.2d // "D + T1" .long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b add v25.2d,v25.2d,v21.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v0.16b,v1.16b,#8 ext v6.16b,v2.16b,v0.16b,#8 add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" .long 0xcec082d5 //sha512su0 v21.16b,v22.16b ext v7.16b,v17.16b,v18.16b,#8 .long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b .long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b add v3.2d,v2.2d,v1.2d // "D + T1" .long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b add v24.2d,v24.2d,v22.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v3.16b,v0.16b,#8 ext v6.16b,v4.16b,v3.16b,#8 add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" .long 0xcec082f6 //sha512su0 v22.16b,v23.16b ext v7.16b,v18.16b,v19.16b,#8 .long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b .long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b add v2.2d,v4.2d,v0.2d // "D + T1" .long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b add v25.2d,v25.2d,v23.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v2.16b,v3.16b,#8 ext v6.16b,v1.16b,v2.16b,#8 add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" .long 0xcec08217 //sha512su0 v23.16b,v16.16b ext v7.16b,v19.16b,v20.16b,#8 .long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b .long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b add v4.2d,v1.2d,v3.2d // "D + T1" .long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b add v24.2d,v24.2d,v16.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v4.16b,v2.16b,#8 ext v6.16b,v0.16b,v4.16b,#8 add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" .long 0xcec08230 //sha512su0 v16.16b,v17.16b ext v7.16b,v20.16b,v21.16b,#8 .long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b .long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b add v1.2d,v0.2d,v2.2d // "D + T1" .long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b add v25.2d,v25.2d,v17.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v1.16b,v4.16b,#8 ext v6.16b,v3.16b,v1.16b,#8 add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" .long 0xcec08251 //sha512su0 v17.16b,v18.16b ext v7.16b,v21.16b,v22.16b,#8 .long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b .long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b add v0.2d,v3.2d,v4.2d // "D + T1" .long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b add v24.2d,v24.2d,v18.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v0.16b,v1.16b,#8 ext v6.16b,v2.16b,v0.16b,#8 add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" .long 0xcec08272 //sha512su0 v18.16b,v19.16b ext v7.16b,v22.16b,v23.16b,#8 .long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b .long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b add v3.2d,v2.2d,v1.2d // "D + T1" .long 0xce648441 //sha512h2 
v1.16b,v2.16b,v4.16b add v25.2d,v25.2d,v19.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v3.16b,v0.16b,#8 ext v6.16b,v4.16b,v3.16b,#8 add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" .long 0xcec08293 //sha512su0 v19.16b,v20.16b ext v7.16b,v23.16b,v16.16b,#8 .long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b .long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b add v2.2d,v4.2d,v0.2d // "D + T1" .long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b add v24.2d,v24.2d,v20.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v2.16b,v3.16b,#8 ext v6.16b,v1.16b,v2.16b,#8 add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" .long 0xcec082b4 //sha512su0 v20.16b,v21.16b ext v7.16b,v16.16b,v17.16b,#8 .long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b .long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b add v4.2d,v1.2d,v3.2d // "D + T1" .long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b add v25.2d,v25.2d,v21.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v4.16b,v2.16b,#8 ext v6.16b,v0.16b,v4.16b,#8 add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" .long 0xcec082d5 //sha512su0 v21.16b,v22.16b ext v7.16b,v17.16b,v18.16b,#8 .long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b .long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b add v1.2d,v0.2d,v2.2d // "D + T1" .long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b add v24.2d,v24.2d,v22.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v1.16b,v4.16b,#8 ext v6.16b,v3.16b,v1.16b,#8 add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" .long 0xcec082f6 //sha512su0 v22.16b,v23.16b ext v7.16b,v18.16b,v19.16b,#8 .long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b .long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b add v0.2d,v3.2d,v4.2d // "D + T1" .long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b add v25.2d,v25.2d,v23.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v0.16b,v1.16b,#8 ext v6.16b,v2.16b,v0.16b,#8 add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" .long 0xcec08217 //sha512su0 v23.16b,v16.16b ext v7.16b,v19.16b,v20.16b,#8 .long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b .long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b add v3.2d,v2.2d,v1.2d // "D + T1" .long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b add v24.2d,v24.2d,v16.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v3.16b,v0.16b,#8 ext v6.16b,v4.16b,v3.16b,#8 add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" .long 0xcec08230 //sha512su0 v16.16b,v17.16b ext v7.16b,v20.16b,v21.16b,#8 .long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b .long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b add v2.2d,v4.2d,v0.2d // "D + T1" .long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b add v25.2d,v25.2d,v17.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v2.16b,v3.16b,#8 ext v6.16b,v1.16b,v2.16b,#8 add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" .long 0xcec08251 //sha512su0 v17.16b,v18.16b ext v7.16b,v21.16b,v22.16b,#8 .long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b .long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b add v4.2d,v1.2d,v3.2d // "D + T1" .long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b add v24.2d,v24.2d,v18.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v4.16b,v2.16b,#8 ext v6.16b,v0.16b,v4.16b,#8 add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" .long 0xcec08272 //sha512su0 v18.16b,v19.16b ext v7.16b,v22.16b,v23.16b,#8 .long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b .long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b add v1.2d,v0.2d,v2.2d // "D + T1" .long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b add v25.2d,v25.2d,v19.2d ld1 
{v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v1.16b,v4.16b,#8 ext v6.16b,v3.16b,v1.16b,#8 add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" .long 0xcec08293 //sha512su0 v19.16b,v20.16b ext v7.16b,v23.16b,v16.16b,#8 .long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b .long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b add v0.2d,v3.2d,v4.2d // "D + T1" .long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b add v24.2d,v24.2d,v20.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v0.16b,v1.16b,#8 ext v6.16b,v2.16b,v0.16b,#8 add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" .long 0xcec082b4 //sha512su0 v20.16b,v21.16b ext v7.16b,v16.16b,v17.16b,#8 .long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b .long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b add v3.2d,v2.2d,v1.2d // "D + T1" .long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b add v25.2d,v25.2d,v21.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v3.16b,v0.16b,#8 ext v6.16b,v4.16b,v3.16b,#8 add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" .long 0xcec082d5 //sha512su0 v21.16b,v22.16b ext v7.16b,v17.16b,v18.16b,#8 .long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b .long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b add v2.2d,v4.2d,v0.2d // "D + T1" .long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b add v24.2d,v24.2d,v22.2d ld1 {v25.2d},[x3],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v2.16b,v3.16b,#8 ext v6.16b,v1.16b,v2.16b,#8 add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" .long 0xcec082f6 //sha512su0 v22.16b,v23.16b ext v7.16b,v18.16b,v19.16b,#8 .long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b .long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b add v4.2d,v1.2d,v3.2d // "D + T1" .long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b add v25.2d,v25.2d,v23.2d ld1 {v24.2d},[x3],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v4.16b,v2.16b,#8 ext v6.16b,v0.16b,v4.16b,#8 add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" .long 0xcec08217 //sha512su0 v23.16b,v16.16b ext v7.16b,v19.16b,v20.16b,#8 .long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b .long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b add v1.2d,v0.2d,v2.2d // "D + T1" .long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b ld1 {v25.2d},[x3],#16 add v24.2d,v24.2d,v16.2d ld1 {v16.16b},[x1],#16 // load next input ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v1.16b,v4.16b,#8 ext v6.16b,v3.16b,v1.16b,#8 add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" .long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b rev64 v16.16b,v16.16b add v0.2d,v3.2d,v4.2d // "D + T1" .long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b ld1 {v24.2d},[x3],#16 add v25.2d,v25.2d,v17.2d ld1 {v17.16b},[x1],#16 // load next input ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v0.16b,v1.16b,#8 ext v6.16b,v2.16b,v0.16b,#8 add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" .long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b rev64 v17.16b,v17.16b add v3.2d,v2.2d,v1.2d // "D + T1" .long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b ld1 {v25.2d},[x3],#16 add v24.2d,v24.2d,v18.2d ld1 {v18.16b},[x1],#16 // load next input ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v3.16b,v0.16b,#8 ext v6.16b,v4.16b,v3.16b,#8 add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" .long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b rev64 v18.16b,v18.16b add v2.2d,v4.2d,v0.2d // "D + T1" .long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b ld1 {v24.2d},[x3],#16 add v25.2d,v25.2d,v19.2d ld1 {v19.16b},[x1],#16 // load next input ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v2.16b,v3.16b,#8 ext v6.16b,v1.16b,v2.16b,#8 add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" .long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b rev64 
v19.16b,v19.16b add v4.2d,v1.2d,v3.2d // "D + T1" .long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b ld1 {v25.2d},[x3],#16 add v24.2d,v24.2d,v20.2d ld1 {v20.16b},[x1],#16 // load next input ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v4.16b,v2.16b,#8 ext v6.16b,v0.16b,v4.16b,#8 add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" .long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b rev64 v20.16b,v20.16b add v1.2d,v0.2d,v2.2d // "D + T1" .long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b ld1 {v24.2d},[x3],#16 add v25.2d,v25.2d,v21.2d ld1 {v21.16b},[x1],#16 // load next input ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v1.16b,v4.16b,#8 ext v6.16b,v3.16b,v1.16b,#8 add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" .long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b rev64 v21.16b,v21.16b add v0.2d,v3.2d,v4.2d // "D + T1" .long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b ld1 {v25.2d},[x3],#16 add v24.2d,v24.2d,v22.2d ld1 {v22.16b},[x1],#16 // load next input ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v0.16b,v1.16b,#8 ext v6.16b,v2.16b,v0.16b,#8 add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" .long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b rev64 v22.16b,v22.16b add v3.2d,v2.2d,v1.2d // "D + T1" .long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b sub x3,x3,#80*8 // rewind add v25.2d,v25.2d,v23.2d ld1 {v23.16b},[x1],#16 // load next input ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v3.16b,v0.16b,#8 ext v6.16b,v4.16b,v3.16b,#8 add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" .long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b rev64 v23.16b,v23.16b add v2.2d,v4.2d,v0.2d // "D + T1" .long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b add v0.2d,v0.2d,v26.2d // accumulate add v1.2d,v1.2d,v27.2d add v2.2d,v2.2d,v28.2d add v3.2d,v3.2d,v29.2d cbnz x2,Loop_hw st1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context ldr x29,[sp],#16 ret #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) ring-0.17.14/pregenerated/sha512-x86_64-elf.S000064400000000000000000001370051046102023000162370ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
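/*
 * Overview of the scalar implementation in this file, as read from the code
 * below:
 *   - sha512_block_data_order_nohw takes the state pointer in %rdi, the
 *     input in %rsi and the block count in %rdx; the count is converted to
 *     an end-of-input pointer and saved, together with %rdi and %rsi, just
 *     above a 128-byte stack frame.
 *   - the eight working variables are kept in %rax,%rbx,%rcx,%rdx,%r8..%r11;
 *     each input qword is byte-swapped (bswapq) into %r12 and stored in the
 *     128-byte stack area, which .Lrounds_16_xx then updates in place as the
 *     16-entry message schedule; %rbp walks the K512 constant table.
 *   - the rotate/xor chains accumulated in %r13 and %r14 are the SHA-512
 *     Sigma1(e) and Sigma0(a) terms, and %r15/%rdi alternate as scratch for
 *     the Ch(e,f,g) and Maj(a,b,c) terms of the standard round function.
 */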
#include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) .text .globl sha512_block_data_order_nohw .hidden sha512_block_data_order_nohw .type sha512_block_data_order_nohw,@function .align 16 sha512_block_data_order_nohw: .cfi_startproc _CET_ENDBR movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 shlq $4,%rdx subq $128+32,%rsp leaq (%rsi,%rdx,8),%rdx andq $-64,%rsp movq %rdi,128+0(%rsp) movq %rsi,128+8(%rsp) movq %rdx,128+16(%rsp) movq %rax,152(%rsp) .cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 .Lprologue: movq 0(%rdi),%rax movq 8(%rdi),%rbx movq 16(%rdi),%rcx movq 24(%rdi),%rdx movq 32(%rdi),%r8 movq 40(%rdi),%r9 movq 48(%rdi),%r10 movq 56(%rdi),%r11 jmp .Lloop .align 16 .Lloop: movq %rbx,%rdi leaq K512(%rip),%rbp xorq %rcx,%rdi movq 0(%rsi),%r12 movq %r8,%r13 movq %rax,%r14 bswapq %r12 rorq $23,%r13 movq %r9,%r15 xorq %r8,%r13 rorq $5,%r14 xorq %r10,%r15 movq %r12,0(%rsp) xorq %rax,%r14 andq %r8,%r15 rorq $4,%r13 addq %r11,%r12 xorq %r10,%r15 rorq $6,%r14 xorq %r8,%r13 addq %r15,%r12 movq %rax,%r15 addq (%rbp),%r12 xorq %rax,%r14 xorq %rbx,%r15 rorq $14,%r13 movq %rbx,%r11 andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%r11 addq %r12,%rdx addq %r12,%r11 leaq 8(%rbp),%rbp addq %r14,%r11 movq 8(%rsi),%r12 movq %rdx,%r13 movq %r11,%r14 bswapq %r12 rorq $23,%r13 movq %r8,%rdi xorq %rdx,%r13 rorq $5,%r14 xorq %r9,%rdi movq %r12,8(%rsp) xorq %r11,%r14 andq %rdx,%rdi rorq $4,%r13 addq %r10,%r12 xorq %r9,%rdi rorq $6,%r14 xorq %rdx,%r13 addq %rdi,%r12 movq %r11,%rdi addq (%rbp),%r12 xorq %r11,%r14 xorq %rax,%rdi rorq $14,%r13 movq %rax,%r10 andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%r10 addq %r12,%rcx addq %r12,%r10 leaq 24(%rbp),%rbp addq %r14,%r10 movq 16(%rsi),%r12 movq %rcx,%r13 movq %r10,%r14 bswapq %r12 rorq $23,%r13 movq %rdx,%r15 xorq %rcx,%r13 rorq $5,%r14 xorq %r8,%r15 movq %r12,16(%rsp) xorq %r10,%r14 andq %rcx,%r15 rorq $4,%r13 addq %r9,%r12 xorq %r8,%r15 rorq $6,%r14 xorq %rcx,%r13 addq %r15,%r12 movq %r10,%r15 addq (%rbp),%r12 xorq %r10,%r14 xorq %r11,%r15 rorq $14,%r13 movq %r11,%r9 andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%r9 addq %r12,%rbx addq %r12,%r9 leaq 8(%rbp),%rbp addq %r14,%r9 movq 24(%rsi),%r12 movq %rbx,%r13 movq %r9,%r14 bswapq %r12 rorq $23,%r13 movq %rcx,%rdi xorq %rbx,%r13 rorq $5,%r14 xorq %rdx,%rdi movq %r12,24(%rsp) xorq %r9,%r14 andq %rbx,%rdi rorq $4,%r13 addq %r8,%r12 xorq %rdx,%rdi rorq $6,%r14 xorq %rbx,%r13 addq %rdi,%r12 movq %r9,%rdi addq (%rbp),%r12 xorq %r9,%r14 xorq %r10,%rdi rorq $14,%r13 movq %r10,%r8 andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%r8 addq %r12,%rax addq %r12,%r8 leaq 24(%rbp),%rbp addq %r14,%r8 movq 32(%rsi),%r12 movq %rax,%r13 movq %r8,%r14 bswapq %r12 rorq $23,%r13 movq %rbx,%r15 xorq %rax,%r13 rorq $5,%r14 xorq %rcx,%r15 movq %r12,32(%rsp) xorq %r8,%r14 andq %rax,%r15 rorq $4,%r13 addq %rdx,%r12 xorq %rcx,%r15 rorq $6,%r14 xorq %rax,%r13 addq %r15,%r12 movq %r8,%r15 addq (%rbp),%r12 xorq %r8,%r14 xorq %r9,%r15 rorq $14,%r13 movq %r9,%rdx andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%rdx addq %r12,%r11 addq %r12,%rdx leaq 8(%rbp),%rbp addq %r14,%rdx movq 40(%rsi),%r12 movq %r11,%r13 movq %rdx,%r14 bswapq %r12 rorq $23,%r13 movq %rax,%rdi xorq %r11,%r13 rorq $5,%r14 xorq %rbx,%rdi movq %r12,40(%rsp) xorq %rdx,%r14 andq %r11,%rdi rorq $4,%r13 addq %rcx,%r12 xorq 
%rbx,%rdi rorq $6,%r14 xorq %r11,%r13 addq %rdi,%r12 movq %rdx,%rdi addq (%rbp),%r12 xorq %rdx,%r14 xorq %r8,%rdi rorq $14,%r13 movq %r8,%rcx andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%rcx addq %r12,%r10 addq %r12,%rcx leaq 24(%rbp),%rbp addq %r14,%rcx movq 48(%rsi),%r12 movq %r10,%r13 movq %rcx,%r14 bswapq %r12 rorq $23,%r13 movq %r11,%r15 xorq %r10,%r13 rorq $5,%r14 xorq %rax,%r15 movq %r12,48(%rsp) xorq %rcx,%r14 andq %r10,%r15 rorq $4,%r13 addq %rbx,%r12 xorq %rax,%r15 rorq $6,%r14 xorq %r10,%r13 addq %r15,%r12 movq %rcx,%r15 addq (%rbp),%r12 xorq %rcx,%r14 xorq %rdx,%r15 rorq $14,%r13 movq %rdx,%rbx andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%rbx addq %r12,%r9 addq %r12,%rbx leaq 8(%rbp),%rbp addq %r14,%rbx movq 56(%rsi),%r12 movq %r9,%r13 movq %rbx,%r14 bswapq %r12 rorq $23,%r13 movq %r10,%rdi xorq %r9,%r13 rorq $5,%r14 xorq %r11,%rdi movq %r12,56(%rsp) xorq %rbx,%r14 andq %r9,%rdi rorq $4,%r13 addq %rax,%r12 xorq %r11,%rdi rorq $6,%r14 xorq %r9,%r13 addq %rdi,%r12 movq %rbx,%rdi addq (%rbp),%r12 xorq %rbx,%r14 xorq %rcx,%rdi rorq $14,%r13 movq %rcx,%rax andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%rax addq %r12,%r8 addq %r12,%rax leaq 24(%rbp),%rbp addq %r14,%rax movq 64(%rsi),%r12 movq %r8,%r13 movq %rax,%r14 bswapq %r12 rorq $23,%r13 movq %r9,%r15 xorq %r8,%r13 rorq $5,%r14 xorq %r10,%r15 movq %r12,64(%rsp) xorq %rax,%r14 andq %r8,%r15 rorq $4,%r13 addq %r11,%r12 xorq %r10,%r15 rorq $6,%r14 xorq %r8,%r13 addq %r15,%r12 movq %rax,%r15 addq (%rbp),%r12 xorq %rax,%r14 xorq %rbx,%r15 rorq $14,%r13 movq %rbx,%r11 andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%r11 addq %r12,%rdx addq %r12,%r11 leaq 8(%rbp),%rbp addq %r14,%r11 movq 72(%rsi),%r12 movq %rdx,%r13 movq %r11,%r14 bswapq %r12 rorq $23,%r13 movq %r8,%rdi xorq %rdx,%r13 rorq $5,%r14 xorq %r9,%rdi movq %r12,72(%rsp) xorq %r11,%r14 andq %rdx,%rdi rorq $4,%r13 addq %r10,%r12 xorq %r9,%rdi rorq $6,%r14 xorq %rdx,%r13 addq %rdi,%r12 movq %r11,%rdi addq (%rbp),%r12 xorq %r11,%r14 xorq %rax,%rdi rorq $14,%r13 movq %rax,%r10 andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%r10 addq %r12,%rcx addq %r12,%r10 leaq 24(%rbp),%rbp addq %r14,%r10 movq 80(%rsi),%r12 movq %rcx,%r13 movq %r10,%r14 bswapq %r12 rorq $23,%r13 movq %rdx,%r15 xorq %rcx,%r13 rorq $5,%r14 xorq %r8,%r15 movq %r12,80(%rsp) xorq %r10,%r14 andq %rcx,%r15 rorq $4,%r13 addq %r9,%r12 xorq %r8,%r15 rorq $6,%r14 xorq %rcx,%r13 addq %r15,%r12 movq %r10,%r15 addq (%rbp),%r12 xorq %r10,%r14 xorq %r11,%r15 rorq $14,%r13 movq %r11,%r9 andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%r9 addq %r12,%rbx addq %r12,%r9 leaq 8(%rbp),%rbp addq %r14,%r9 movq 88(%rsi),%r12 movq %rbx,%r13 movq %r9,%r14 bswapq %r12 rorq $23,%r13 movq %rcx,%rdi xorq %rbx,%r13 rorq $5,%r14 xorq %rdx,%rdi movq %r12,88(%rsp) xorq %r9,%r14 andq %rbx,%rdi rorq $4,%r13 addq %r8,%r12 xorq %rdx,%rdi rorq $6,%r14 xorq %rbx,%r13 addq %rdi,%r12 movq %r9,%rdi addq (%rbp),%r12 xorq %r9,%r14 xorq %r10,%rdi rorq $14,%r13 movq %r10,%r8 andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%r8 addq %r12,%rax addq %r12,%r8 leaq 24(%rbp),%rbp addq %r14,%r8 movq 96(%rsi),%r12 movq %rax,%r13 movq %r8,%r14 bswapq %r12 rorq $23,%r13 movq %rbx,%r15 xorq %rax,%r13 rorq $5,%r14 xorq %rcx,%r15 movq %r12,96(%rsp) xorq %r8,%r14 andq %rax,%r15 rorq $4,%r13 addq %rdx,%r12 xorq %rcx,%r15 rorq $6,%r14 xorq %rax,%r13 addq %r15,%r12 movq %r8,%r15 addq (%rbp),%r12 xorq %r8,%r14 xorq %r9,%r15 rorq $14,%r13 movq %r9,%rdx andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%rdx addq %r12,%r11 
addq %r12,%rdx leaq 8(%rbp),%rbp addq %r14,%rdx movq 104(%rsi),%r12 movq %r11,%r13 movq %rdx,%r14 bswapq %r12 rorq $23,%r13 movq %rax,%rdi xorq %r11,%r13 rorq $5,%r14 xorq %rbx,%rdi movq %r12,104(%rsp) xorq %rdx,%r14 andq %r11,%rdi rorq $4,%r13 addq %rcx,%r12 xorq %rbx,%rdi rorq $6,%r14 xorq %r11,%r13 addq %rdi,%r12 movq %rdx,%rdi addq (%rbp),%r12 xorq %rdx,%r14 xorq %r8,%rdi rorq $14,%r13 movq %r8,%rcx andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%rcx addq %r12,%r10 addq %r12,%rcx leaq 24(%rbp),%rbp addq %r14,%rcx movq 112(%rsi),%r12 movq %r10,%r13 movq %rcx,%r14 bswapq %r12 rorq $23,%r13 movq %r11,%r15 xorq %r10,%r13 rorq $5,%r14 xorq %rax,%r15 movq %r12,112(%rsp) xorq %rcx,%r14 andq %r10,%r15 rorq $4,%r13 addq %rbx,%r12 xorq %rax,%r15 rorq $6,%r14 xorq %r10,%r13 addq %r15,%r12 movq %rcx,%r15 addq (%rbp),%r12 xorq %rcx,%r14 xorq %rdx,%r15 rorq $14,%r13 movq %rdx,%rbx andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%rbx addq %r12,%r9 addq %r12,%rbx leaq 8(%rbp),%rbp addq %r14,%rbx movq 120(%rsi),%r12 movq %r9,%r13 movq %rbx,%r14 bswapq %r12 rorq $23,%r13 movq %r10,%rdi xorq %r9,%r13 rorq $5,%r14 xorq %r11,%rdi movq %r12,120(%rsp) xorq %rbx,%r14 andq %r9,%rdi rorq $4,%r13 addq %rax,%r12 xorq %r11,%rdi rorq $6,%r14 xorq %r9,%r13 addq %rdi,%r12 movq %rbx,%rdi addq (%rbp),%r12 xorq %rbx,%r14 xorq %rcx,%rdi rorq $14,%r13 movq %rcx,%rax andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%rax addq %r12,%r8 addq %r12,%rax leaq 24(%rbp),%rbp jmp .Lrounds_16_xx .align 16 .Lrounds_16_xx: movq 8(%rsp),%r13 movq 112(%rsp),%r15 movq %r13,%r12 rorq $7,%r13 addq %r14,%rax movq %r15,%r14 rorq $42,%r15 xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%r15 shrq $6,%r14 rorq $19,%r15 xorq %r13,%r12 xorq %r14,%r15 addq 72(%rsp),%r12 addq 0(%rsp),%r12 movq %r8,%r13 addq %r15,%r12 movq %rax,%r14 rorq $23,%r13 movq %r9,%r15 xorq %r8,%r13 rorq $5,%r14 xorq %r10,%r15 movq %r12,0(%rsp) xorq %rax,%r14 andq %r8,%r15 rorq $4,%r13 addq %r11,%r12 xorq %r10,%r15 rorq $6,%r14 xorq %r8,%r13 addq %r15,%r12 movq %rax,%r15 addq (%rbp),%r12 xorq %rax,%r14 xorq %rbx,%r15 rorq $14,%r13 movq %rbx,%r11 andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%r11 addq %r12,%rdx addq %r12,%r11 leaq 8(%rbp),%rbp movq 16(%rsp),%r13 movq 120(%rsp),%rdi movq %r13,%r12 rorq $7,%r13 addq %r14,%r11 movq %rdi,%r14 rorq $42,%rdi xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%rdi shrq $6,%r14 rorq $19,%rdi xorq %r13,%r12 xorq %r14,%rdi addq 80(%rsp),%r12 addq 8(%rsp),%r12 movq %rdx,%r13 addq %rdi,%r12 movq %r11,%r14 rorq $23,%r13 movq %r8,%rdi xorq %rdx,%r13 rorq $5,%r14 xorq %r9,%rdi movq %r12,8(%rsp) xorq %r11,%r14 andq %rdx,%rdi rorq $4,%r13 addq %r10,%r12 xorq %r9,%rdi rorq $6,%r14 xorq %rdx,%r13 addq %rdi,%r12 movq %r11,%rdi addq (%rbp),%r12 xorq %r11,%r14 xorq %rax,%rdi rorq $14,%r13 movq %rax,%r10 andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%r10 addq %r12,%rcx addq %r12,%r10 leaq 24(%rbp),%rbp movq 24(%rsp),%r13 movq 0(%rsp),%r15 movq %r13,%r12 rorq $7,%r13 addq %r14,%r10 movq %r15,%r14 rorq $42,%r15 xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%r15 shrq $6,%r14 rorq $19,%r15 xorq %r13,%r12 xorq %r14,%r15 addq 88(%rsp),%r12 addq 16(%rsp),%r12 movq %rcx,%r13 addq %r15,%r12 movq %r10,%r14 rorq $23,%r13 movq %rdx,%r15 xorq %rcx,%r13 rorq $5,%r14 xorq %r8,%r15 movq %r12,16(%rsp) xorq %r10,%r14 andq %rcx,%r15 rorq $4,%r13 addq %r9,%r12 xorq %r8,%r15 rorq $6,%r14 xorq %rcx,%r13 addq %r15,%r12 movq %r10,%r15 addq (%rbp),%r12 xorq %r10,%r14 xorq %r11,%r15 rorq $14,%r13 movq %r11,%r9 andq %r15,%rdi rorq 
$28,%r14 addq %r13,%r12 xorq %rdi,%r9 addq %r12,%rbx addq %r12,%r9 leaq 8(%rbp),%rbp movq 32(%rsp),%r13 movq 8(%rsp),%rdi movq %r13,%r12 rorq $7,%r13 addq %r14,%r9 movq %rdi,%r14 rorq $42,%rdi xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%rdi shrq $6,%r14 rorq $19,%rdi xorq %r13,%r12 xorq %r14,%rdi addq 96(%rsp),%r12 addq 24(%rsp),%r12 movq %rbx,%r13 addq %rdi,%r12 movq %r9,%r14 rorq $23,%r13 movq %rcx,%rdi xorq %rbx,%r13 rorq $5,%r14 xorq %rdx,%rdi movq %r12,24(%rsp) xorq %r9,%r14 andq %rbx,%rdi rorq $4,%r13 addq %r8,%r12 xorq %rdx,%rdi rorq $6,%r14 xorq %rbx,%r13 addq %rdi,%r12 movq %r9,%rdi addq (%rbp),%r12 xorq %r9,%r14 xorq %r10,%rdi rorq $14,%r13 movq %r10,%r8 andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%r8 addq %r12,%rax addq %r12,%r8 leaq 24(%rbp),%rbp movq 40(%rsp),%r13 movq 16(%rsp),%r15 movq %r13,%r12 rorq $7,%r13 addq %r14,%r8 movq %r15,%r14 rorq $42,%r15 xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%r15 shrq $6,%r14 rorq $19,%r15 xorq %r13,%r12 xorq %r14,%r15 addq 104(%rsp),%r12 addq 32(%rsp),%r12 movq %rax,%r13 addq %r15,%r12 movq %r8,%r14 rorq $23,%r13 movq %rbx,%r15 xorq %rax,%r13 rorq $5,%r14 xorq %rcx,%r15 movq %r12,32(%rsp) xorq %r8,%r14 andq %rax,%r15 rorq $4,%r13 addq %rdx,%r12 xorq %rcx,%r15 rorq $6,%r14 xorq %rax,%r13 addq %r15,%r12 movq %r8,%r15 addq (%rbp),%r12 xorq %r8,%r14 xorq %r9,%r15 rorq $14,%r13 movq %r9,%rdx andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%rdx addq %r12,%r11 addq %r12,%rdx leaq 8(%rbp),%rbp movq 48(%rsp),%r13 movq 24(%rsp),%rdi movq %r13,%r12 rorq $7,%r13 addq %r14,%rdx movq %rdi,%r14 rorq $42,%rdi xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%rdi shrq $6,%r14 rorq $19,%rdi xorq %r13,%r12 xorq %r14,%rdi addq 112(%rsp),%r12 addq 40(%rsp),%r12 movq %r11,%r13 addq %rdi,%r12 movq %rdx,%r14 rorq $23,%r13 movq %rax,%rdi xorq %r11,%r13 rorq $5,%r14 xorq %rbx,%rdi movq %r12,40(%rsp) xorq %rdx,%r14 andq %r11,%rdi rorq $4,%r13 addq %rcx,%r12 xorq %rbx,%rdi rorq $6,%r14 xorq %r11,%r13 addq %rdi,%r12 movq %rdx,%rdi addq (%rbp),%r12 xorq %rdx,%r14 xorq %r8,%rdi rorq $14,%r13 movq %r8,%rcx andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%rcx addq %r12,%r10 addq %r12,%rcx leaq 24(%rbp),%rbp movq 56(%rsp),%r13 movq 32(%rsp),%r15 movq %r13,%r12 rorq $7,%r13 addq %r14,%rcx movq %r15,%r14 rorq $42,%r15 xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%r15 shrq $6,%r14 rorq $19,%r15 xorq %r13,%r12 xorq %r14,%r15 addq 120(%rsp),%r12 addq 48(%rsp),%r12 movq %r10,%r13 addq %r15,%r12 movq %rcx,%r14 rorq $23,%r13 movq %r11,%r15 xorq %r10,%r13 rorq $5,%r14 xorq %rax,%r15 movq %r12,48(%rsp) xorq %rcx,%r14 andq %r10,%r15 rorq $4,%r13 addq %rbx,%r12 xorq %rax,%r15 rorq $6,%r14 xorq %r10,%r13 addq %r15,%r12 movq %rcx,%r15 addq (%rbp),%r12 xorq %rcx,%r14 xorq %rdx,%r15 rorq $14,%r13 movq %rdx,%rbx andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%rbx addq %r12,%r9 addq %r12,%rbx leaq 8(%rbp),%rbp movq 64(%rsp),%r13 movq 40(%rsp),%rdi movq %r13,%r12 rorq $7,%r13 addq %r14,%rbx movq %rdi,%r14 rorq $42,%rdi xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%rdi shrq $6,%r14 rorq $19,%rdi xorq %r13,%r12 xorq %r14,%rdi addq 0(%rsp),%r12 addq 56(%rsp),%r12 movq %r9,%r13 addq %rdi,%r12 movq %rbx,%r14 rorq $23,%r13 movq %r10,%rdi xorq %r9,%r13 rorq $5,%r14 xorq %r11,%rdi movq %r12,56(%rsp) xorq %rbx,%r14 andq %r9,%rdi rorq $4,%r13 addq %rax,%r12 xorq %r11,%rdi rorq $6,%r14 xorq %r9,%r13 addq %rdi,%r12 movq %rbx,%rdi addq (%rbp),%r12 xorq %rbx,%r14 xorq %rcx,%rdi rorq $14,%r13 movq %rcx,%rax andq %rdi,%r15 rorq $28,%r14 addq 
%r13,%r12 xorq %r15,%rax addq %r12,%r8 addq %r12,%rax leaq 24(%rbp),%rbp movq 72(%rsp),%r13 movq 48(%rsp),%r15 movq %r13,%r12 rorq $7,%r13 addq %r14,%rax movq %r15,%r14 rorq $42,%r15 xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%r15 shrq $6,%r14 rorq $19,%r15 xorq %r13,%r12 xorq %r14,%r15 addq 8(%rsp),%r12 addq 64(%rsp),%r12 movq %r8,%r13 addq %r15,%r12 movq %rax,%r14 rorq $23,%r13 movq %r9,%r15 xorq %r8,%r13 rorq $5,%r14 xorq %r10,%r15 movq %r12,64(%rsp) xorq %rax,%r14 andq %r8,%r15 rorq $4,%r13 addq %r11,%r12 xorq %r10,%r15 rorq $6,%r14 xorq %r8,%r13 addq %r15,%r12 movq %rax,%r15 addq (%rbp),%r12 xorq %rax,%r14 xorq %rbx,%r15 rorq $14,%r13 movq %rbx,%r11 andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%r11 addq %r12,%rdx addq %r12,%r11 leaq 8(%rbp),%rbp movq 80(%rsp),%r13 movq 56(%rsp),%rdi movq %r13,%r12 rorq $7,%r13 addq %r14,%r11 movq %rdi,%r14 rorq $42,%rdi xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%rdi shrq $6,%r14 rorq $19,%rdi xorq %r13,%r12 xorq %r14,%rdi addq 16(%rsp),%r12 addq 72(%rsp),%r12 movq %rdx,%r13 addq %rdi,%r12 movq %r11,%r14 rorq $23,%r13 movq %r8,%rdi xorq %rdx,%r13 rorq $5,%r14 xorq %r9,%rdi movq %r12,72(%rsp) xorq %r11,%r14 andq %rdx,%rdi rorq $4,%r13 addq %r10,%r12 xorq %r9,%rdi rorq $6,%r14 xorq %rdx,%r13 addq %rdi,%r12 movq %r11,%rdi addq (%rbp),%r12 xorq %r11,%r14 xorq %rax,%rdi rorq $14,%r13 movq %rax,%r10 andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%r10 addq %r12,%rcx addq %r12,%r10 leaq 24(%rbp),%rbp movq 88(%rsp),%r13 movq 64(%rsp),%r15 movq %r13,%r12 rorq $7,%r13 addq %r14,%r10 movq %r15,%r14 rorq $42,%r15 xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%r15 shrq $6,%r14 rorq $19,%r15 xorq %r13,%r12 xorq %r14,%r15 addq 24(%rsp),%r12 addq 80(%rsp),%r12 movq %rcx,%r13 addq %r15,%r12 movq %r10,%r14 rorq $23,%r13 movq %rdx,%r15 xorq %rcx,%r13 rorq $5,%r14 xorq %r8,%r15 movq %r12,80(%rsp) xorq %r10,%r14 andq %rcx,%r15 rorq $4,%r13 addq %r9,%r12 xorq %r8,%r15 rorq $6,%r14 xorq %rcx,%r13 addq %r15,%r12 movq %r10,%r15 addq (%rbp),%r12 xorq %r10,%r14 xorq %r11,%r15 rorq $14,%r13 movq %r11,%r9 andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%r9 addq %r12,%rbx addq %r12,%r9 leaq 8(%rbp),%rbp movq 96(%rsp),%r13 movq 72(%rsp),%rdi movq %r13,%r12 rorq $7,%r13 addq %r14,%r9 movq %rdi,%r14 rorq $42,%rdi xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%rdi shrq $6,%r14 rorq $19,%rdi xorq %r13,%r12 xorq %r14,%rdi addq 32(%rsp),%r12 addq 88(%rsp),%r12 movq %rbx,%r13 addq %rdi,%r12 movq %r9,%r14 rorq $23,%r13 movq %rcx,%rdi xorq %rbx,%r13 rorq $5,%r14 xorq %rdx,%rdi movq %r12,88(%rsp) xorq %r9,%r14 andq %rbx,%rdi rorq $4,%r13 addq %r8,%r12 xorq %rdx,%rdi rorq $6,%r14 xorq %rbx,%r13 addq %rdi,%r12 movq %r9,%rdi addq (%rbp),%r12 xorq %r9,%r14 xorq %r10,%rdi rorq $14,%r13 movq %r10,%r8 andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%r8 addq %r12,%rax addq %r12,%r8 leaq 24(%rbp),%rbp movq 104(%rsp),%r13 movq 80(%rsp),%r15 movq %r13,%r12 rorq $7,%r13 addq %r14,%r8 movq %r15,%r14 rorq $42,%r15 xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%r15 shrq $6,%r14 rorq $19,%r15 xorq %r13,%r12 xorq %r14,%r15 addq 40(%rsp),%r12 addq 96(%rsp),%r12 movq %rax,%r13 addq %r15,%r12 movq %r8,%r14 rorq $23,%r13 movq %rbx,%r15 xorq %rax,%r13 rorq $5,%r14 xorq %rcx,%r15 movq %r12,96(%rsp) xorq %r8,%r14 andq %rax,%r15 rorq $4,%r13 addq %rdx,%r12 xorq %rcx,%r15 rorq $6,%r14 xorq %rax,%r13 addq %r15,%r12 movq %r8,%r15 addq (%rbp),%r12 xorq %r8,%r14 xorq %r9,%r15 rorq $14,%r13 movq %r9,%rdx andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%rdx 
addq %r12,%r11 addq %r12,%rdx leaq 8(%rbp),%rbp movq 112(%rsp),%r13 movq 88(%rsp),%rdi movq %r13,%r12 rorq $7,%r13 addq %r14,%rdx movq %rdi,%r14 rorq $42,%rdi xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%rdi shrq $6,%r14 rorq $19,%rdi xorq %r13,%r12 xorq %r14,%rdi addq 48(%rsp),%r12 addq 104(%rsp),%r12 movq %r11,%r13 addq %rdi,%r12 movq %rdx,%r14 rorq $23,%r13 movq %rax,%rdi xorq %r11,%r13 rorq $5,%r14 xorq %rbx,%rdi movq %r12,104(%rsp) xorq %rdx,%r14 andq %r11,%rdi rorq $4,%r13 addq %rcx,%r12 xorq %rbx,%rdi rorq $6,%r14 xorq %r11,%r13 addq %rdi,%r12 movq %rdx,%rdi addq (%rbp),%r12 xorq %rdx,%r14 xorq %r8,%rdi rorq $14,%r13 movq %r8,%rcx andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%rcx addq %r12,%r10 addq %r12,%rcx leaq 24(%rbp),%rbp movq 120(%rsp),%r13 movq 96(%rsp),%r15 movq %r13,%r12 rorq $7,%r13 addq %r14,%rcx movq %r15,%r14 rorq $42,%r15 xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%r15 shrq $6,%r14 rorq $19,%r15 xorq %r13,%r12 xorq %r14,%r15 addq 56(%rsp),%r12 addq 112(%rsp),%r12 movq %r10,%r13 addq %r15,%r12 movq %rcx,%r14 rorq $23,%r13 movq %r11,%r15 xorq %r10,%r13 rorq $5,%r14 xorq %rax,%r15 movq %r12,112(%rsp) xorq %rcx,%r14 andq %r10,%r15 rorq $4,%r13 addq %rbx,%r12 xorq %rax,%r15 rorq $6,%r14 xorq %r10,%r13 addq %r15,%r12 movq %rcx,%r15 addq (%rbp),%r12 xorq %rcx,%r14 xorq %rdx,%r15 rorq $14,%r13 movq %rdx,%rbx andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%rbx addq %r12,%r9 addq %r12,%rbx leaq 8(%rbp),%rbp movq 0(%rsp),%r13 movq 104(%rsp),%rdi movq %r13,%r12 rorq $7,%r13 addq %r14,%rbx movq %rdi,%r14 rorq $42,%rdi xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%rdi shrq $6,%r14 rorq $19,%rdi xorq %r13,%r12 xorq %r14,%rdi addq 64(%rsp),%r12 addq 120(%rsp),%r12 movq %r9,%r13 addq %rdi,%r12 movq %rbx,%r14 rorq $23,%r13 movq %r10,%rdi xorq %r9,%r13 rorq $5,%r14 xorq %r11,%rdi movq %r12,120(%rsp) xorq %rbx,%r14 andq %r9,%rdi rorq $4,%r13 addq %rax,%r12 xorq %r11,%rdi rorq $6,%r14 xorq %r9,%r13 addq %rdi,%r12 movq %rbx,%rdi addq (%rbp),%r12 xorq %rbx,%r14 xorq %rcx,%rdi rorq $14,%r13 movq %rcx,%rax andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%rax addq %r12,%r8 addq %r12,%rax leaq 24(%rbp),%rbp cmpb $0,7(%rbp) jnz .Lrounds_16_xx movq 128+0(%rsp),%rdi addq %r14,%rax leaq 128(%rsi),%rsi addq 0(%rdi),%rax addq 8(%rdi),%rbx addq 16(%rdi),%rcx addq 24(%rdi),%rdx addq 32(%rdi),%r8 addq 40(%rdi),%r9 addq 48(%rdi),%r10 addq 56(%rdi),%r11 cmpq 128+16(%rsp),%rsi movq %rax,0(%rdi) movq %rbx,8(%rdi) movq %rcx,16(%rdi) movq %rdx,24(%rdi) movq %r8,32(%rdi) movq %r9,40(%rdi) movq %r10,48(%rdi) movq %r11,56(%rdi) jb .Lloop movq 152(%rsp),%rsi .cfi_def_cfa %rsi,8 movq -48(%rsi),%r15 .cfi_restore %r15 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbp .cfi_restore %rbp movq -8(%rsi),%rbx .cfi_restore %rbx leaq (%rsi),%rsp .cfi_def_cfa_register %rsp .Lepilogue: ret .cfi_endproc .size sha512_block_data_order_nohw,.-sha512_block_data_order_nohw .section .rodata .align 64 .type K512,@object K512: .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 
0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x81c2c92e47edaee6,0x92722c851482353b .quad 0x81c2c92e47edaee6,0x92722c851482353b .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .quad 0x0001020304050607,0x08090a0b0c0d0e0f .quad 0x0001020304050607,0x08090a0b0c0d0e0f .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .text .globl sha512_block_data_order_avx .hidden sha512_block_data_order_avx .type sha512_block_data_order_avx,@function .align 64 sha512_block_data_order_avx: .cfi_startproc _CET_ENDBR movq 
%rsp,%rax .cfi_def_cfa_register %rax pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 shlq $4,%rdx subq $160,%rsp leaq (%rsi,%rdx,8),%rdx andq $-64,%rsp movq %rdi,128+0(%rsp) movq %rsi,128+8(%rsp) movq %rdx,128+16(%rsp) movq %rax,152(%rsp) .cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 .Lprologue_avx: vzeroupper movq 0(%rdi),%rax movq 8(%rdi),%rbx movq 16(%rdi),%rcx movq 24(%rdi),%rdx movq 32(%rdi),%r8 movq 40(%rdi),%r9 movq 48(%rdi),%r10 movq 56(%rdi),%r11 jmp .Lloop_avx .align 16 .Lloop_avx: vmovdqa K512+1280(%rip),%xmm11 vmovdqu 0(%rsi),%xmm0 leaq K512+128(%rip),%rbp vmovdqu 16(%rsi),%xmm1 vmovdqu 32(%rsi),%xmm2 vpshufb %xmm11,%xmm0,%xmm0 vmovdqu 48(%rsi),%xmm3 vpshufb %xmm11,%xmm1,%xmm1 vmovdqu 64(%rsi),%xmm4 vpshufb %xmm11,%xmm2,%xmm2 vmovdqu 80(%rsi),%xmm5 vpshufb %xmm11,%xmm3,%xmm3 vmovdqu 96(%rsi),%xmm6 vpshufb %xmm11,%xmm4,%xmm4 vmovdqu 112(%rsi),%xmm7 vpshufb %xmm11,%xmm5,%xmm5 vpaddq -128(%rbp),%xmm0,%xmm8 vpshufb %xmm11,%xmm6,%xmm6 vpaddq -96(%rbp),%xmm1,%xmm9 vpshufb %xmm11,%xmm7,%xmm7 vpaddq -64(%rbp),%xmm2,%xmm10 vpaddq -32(%rbp),%xmm3,%xmm11 vmovdqa %xmm8,0(%rsp) vpaddq 0(%rbp),%xmm4,%xmm8 vmovdqa %xmm9,16(%rsp) vpaddq 32(%rbp),%xmm5,%xmm9 vmovdqa %xmm10,32(%rsp) vpaddq 64(%rbp),%xmm6,%xmm10 vmovdqa %xmm11,48(%rsp) vpaddq 96(%rbp),%xmm7,%xmm11 vmovdqa %xmm8,64(%rsp) movq %rax,%r14 vmovdqa %xmm9,80(%rsp) movq %rbx,%rdi vmovdqa %xmm10,96(%rsp) xorq %rcx,%rdi vmovdqa %xmm11,112(%rsp) movq %r8,%r13 jmp .Lavx_00_47 .align 16 .Lavx_00_47: addq $256,%rbp vpalignr $8,%xmm0,%xmm1,%xmm8 shrdq $23,%r13,%r13 movq %r14,%rax vpalignr $8,%xmm4,%xmm5,%xmm11 movq %r9,%r12 shrdq $5,%r14,%r14 vpsrlq $1,%xmm8,%xmm10 xorq %r8,%r13 xorq %r10,%r12 vpaddq %xmm11,%xmm0,%xmm0 shrdq $4,%r13,%r13 xorq %rax,%r14 vpsrlq $7,%xmm8,%xmm11 andq %r8,%r12 xorq %r8,%r13 vpsllq $56,%xmm8,%xmm9 addq 0(%rsp),%r11 movq %rax,%r15 vpxor %xmm10,%xmm11,%xmm8 xorq %r10,%r12 shrdq $6,%r14,%r14 vpsrlq $7,%xmm10,%xmm10 xorq %rbx,%r15 addq %r12,%r11 vpxor %xmm9,%xmm8,%xmm8 shrdq $14,%r13,%r13 andq %r15,%rdi vpsllq $7,%xmm9,%xmm9 xorq %rax,%r14 addq %r13,%r11 vpxor %xmm10,%xmm8,%xmm8 xorq %rbx,%rdi shrdq $28,%r14,%r14 vpsrlq $6,%xmm7,%xmm11 addq %r11,%rdx addq %rdi,%r11 vpxor %xmm9,%xmm8,%xmm8 movq %rdx,%r13 addq %r11,%r14 vpsllq $3,%xmm7,%xmm10 shrdq $23,%r13,%r13 movq %r14,%r11 vpaddq %xmm8,%xmm0,%xmm0 movq %r8,%r12 shrdq $5,%r14,%r14 vpsrlq $19,%xmm7,%xmm9 xorq %rdx,%r13 xorq %r9,%r12 vpxor %xmm10,%xmm11,%xmm11 shrdq $4,%r13,%r13 xorq %r11,%r14 vpsllq $42,%xmm10,%xmm10 andq %rdx,%r12 xorq %rdx,%r13 vpxor %xmm9,%xmm11,%xmm11 addq 8(%rsp),%r10 movq %r11,%rdi vpsrlq $42,%xmm9,%xmm9 xorq %r9,%r12 shrdq $6,%r14,%r14 vpxor %xmm10,%xmm11,%xmm11 xorq %rax,%rdi addq %r12,%r10 vpxor %xmm9,%xmm11,%xmm11 shrdq $14,%r13,%r13 andq %rdi,%r15 vpaddq %xmm11,%xmm0,%xmm0 xorq %r11,%r14 addq %r13,%r10 vpaddq -128(%rbp),%xmm0,%xmm10 xorq %rax,%r15 shrdq $28,%r14,%r14 addq %r10,%rcx addq %r15,%r10 movq %rcx,%r13 addq %r10,%r14 vmovdqa %xmm10,0(%rsp) vpalignr $8,%xmm1,%xmm2,%xmm8 shrdq $23,%r13,%r13 movq %r14,%r10 vpalignr $8,%xmm5,%xmm6,%xmm11 movq %rdx,%r12 shrdq $5,%r14,%r14 vpsrlq $1,%xmm8,%xmm10 xorq %rcx,%r13 xorq %r8,%r12 vpaddq %xmm11,%xmm1,%xmm1 shrdq $4,%r13,%r13 xorq %r10,%r14 vpsrlq $7,%xmm8,%xmm11 andq %rcx,%r12 xorq %rcx,%r13 vpsllq $56,%xmm8,%xmm9 addq 16(%rsp),%r9 movq %r10,%r15 vpxor %xmm10,%xmm11,%xmm8 xorq %r8,%r12 shrdq $6,%r14,%r14 vpsrlq $7,%xmm10,%xmm10 xorq %r11,%r15 addq 
%r12,%r9 vpxor %xmm9,%xmm8,%xmm8 shrdq $14,%r13,%r13 andq %r15,%rdi vpsllq $7,%xmm9,%xmm9 xorq %r10,%r14 addq %r13,%r9 vpxor %xmm10,%xmm8,%xmm8 xorq %r11,%rdi shrdq $28,%r14,%r14 vpsrlq $6,%xmm0,%xmm11 addq %r9,%rbx addq %rdi,%r9 vpxor %xmm9,%xmm8,%xmm8 movq %rbx,%r13 addq %r9,%r14 vpsllq $3,%xmm0,%xmm10 shrdq $23,%r13,%r13 movq %r14,%r9 vpaddq %xmm8,%xmm1,%xmm1 movq %rcx,%r12 shrdq $5,%r14,%r14 vpsrlq $19,%xmm0,%xmm9 xorq %rbx,%r13 xorq %rdx,%r12 vpxor %xmm10,%xmm11,%xmm11 shrdq $4,%r13,%r13 xorq %r9,%r14 vpsllq $42,%xmm10,%xmm10 andq %rbx,%r12 xorq %rbx,%r13 vpxor %xmm9,%xmm11,%xmm11 addq 24(%rsp),%r8 movq %r9,%rdi vpsrlq $42,%xmm9,%xmm9 xorq %rdx,%r12 shrdq $6,%r14,%r14 vpxor %xmm10,%xmm11,%xmm11 xorq %r10,%rdi addq %r12,%r8 vpxor %xmm9,%xmm11,%xmm11 shrdq $14,%r13,%r13 andq %rdi,%r15 vpaddq %xmm11,%xmm1,%xmm1 xorq %r9,%r14 addq %r13,%r8 vpaddq -96(%rbp),%xmm1,%xmm10 xorq %r10,%r15 shrdq $28,%r14,%r14 addq %r8,%rax addq %r15,%r8 movq %rax,%r13 addq %r8,%r14 vmovdqa %xmm10,16(%rsp) vpalignr $8,%xmm2,%xmm3,%xmm8 shrdq $23,%r13,%r13 movq %r14,%r8 vpalignr $8,%xmm6,%xmm7,%xmm11 movq %rbx,%r12 shrdq $5,%r14,%r14 vpsrlq $1,%xmm8,%xmm10 xorq %rax,%r13 xorq %rcx,%r12 vpaddq %xmm11,%xmm2,%xmm2 shrdq $4,%r13,%r13 xorq %r8,%r14 vpsrlq $7,%xmm8,%xmm11 andq %rax,%r12 xorq %rax,%r13 vpsllq $56,%xmm8,%xmm9 addq 32(%rsp),%rdx movq %r8,%r15 vpxor %xmm10,%xmm11,%xmm8 xorq %rcx,%r12 shrdq $6,%r14,%r14 vpsrlq $7,%xmm10,%xmm10 xorq %r9,%r15 addq %r12,%rdx vpxor %xmm9,%xmm8,%xmm8 shrdq $14,%r13,%r13 andq %r15,%rdi vpsllq $7,%xmm9,%xmm9 xorq %r8,%r14 addq %r13,%rdx vpxor %xmm10,%xmm8,%xmm8 xorq %r9,%rdi shrdq $28,%r14,%r14 vpsrlq $6,%xmm1,%xmm11 addq %rdx,%r11 addq %rdi,%rdx vpxor %xmm9,%xmm8,%xmm8 movq %r11,%r13 addq %rdx,%r14 vpsllq $3,%xmm1,%xmm10 shrdq $23,%r13,%r13 movq %r14,%rdx vpaddq %xmm8,%xmm2,%xmm2 movq %rax,%r12 shrdq $5,%r14,%r14 vpsrlq $19,%xmm1,%xmm9 xorq %r11,%r13 xorq %rbx,%r12 vpxor %xmm10,%xmm11,%xmm11 shrdq $4,%r13,%r13 xorq %rdx,%r14 vpsllq $42,%xmm10,%xmm10 andq %r11,%r12 xorq %r11,%r13 vpxor %xmm9,%xmm11,%xmm11 addq 40(%rsp),%rcx movq %rdx,%rdi vpsrlq $42,%xmm9,%xmm9 xorq %rbx,%r12 shrdq $6,%r14,%r14 vpxor %xmm10,%xmm11,%xmm11 xorq %r8,%rdi addq %r12,%rcx vpxor %xmm9,%xmm11,%xmm11 shrdq $14,%r13,%r13 andq %rdi,%r15 vpaddq %xmm11,%xmm2,%xmm2 xorq %rdx,%r14 addq %r13,%rcx vpaddq -64(%rbp),%xmm2,%xmm10 xorq %r8,%r15 shrdq $28,%r14,%r14 addq %rcx,%r10 addq %r15,%rcx movq %r10,%r13 addq %rcx,%r14 vmovdqa %xmm10,32(%rsp) vpalignr $8,%xmm3,%xmm4,%xmm8 shrdq $23,%r13,%r13 movq %r14,%rcx vpalignr $8,%xmm7,%xmm0,%xmm11 movq %r11,%r12 shrdq $5,%r14,%r14 vpsrlq $1,%xmm8,%xmm10 xorq %r10,%r13 xorq %rax,%r12 vpaddq %xmm11,%xmm3,%xmm3 shrdq $4,%r13,%r13 xorq %rcx,%r14 vpsrlq $7,%xmm8,%xmm11 andq %r10,%r12 xorq %r10,%r13 vpsllq $56,%xmm8,%xmm9 addq 48(%rsp),%rbx movq %rcx,%r15 vpxor %xmm10,%xmm11,%xmm8 xorq %rax,%r12 shrdq $6,%r14,%r14 vpsrlq $7,%xmm10,%xmm10 xorq %rdx,%r15 addq %r12,%rbx vpxor %xmm9,%xmm8,%xmm8 shrdq $14,%r13,%r13 andq %r15,%rdi vpsllq $7,%xmm9,%xmm9 xorq %rcx,%r14 addq %r13,%rbx vpxor %xmm10,%xmm8,%xmm8 xorq %rdx,%rdi shrdq $28,%r14,%r14 vpsrlq $6,%xmm2,%xmm11 addq %rbx,%r9 addq %rdi,%rbx vpxor %xmm9,%xmm8,%xmm8 movq %r9,%r13 addq %rbx,%r14 vpsllq $3,%xmm2,%xmm10 shrdq $23,%r13,%r13 movq %r14,%rbx vpaddq %xmm8,%xmm3,%xmm3 movq %r10,%r12 shrdq $5,%r14,%r14 vpsrlq $19,%xmm2,%xmm9 xorq %r9,%r13 xorq %r11,%r12 vpxor %xmm10,%xmm11,%xmm11 shrdq $4,%r13,%r13 xorq %rbx,%r14 vpsllq $42,%xmm10,%xmm10 andq %r9,%r12 xorq %r9,%r13 vpxor %xmm9,%xmm11,%xmm11 addq 56(%rsp),%rax movq %rbx,%rdi vpsrlq 
$42,%xmm9,%xmm9 xorq %r11,%r12 shrdq $6,%r14,%r14 vpxor %xmm10,%xmm11,%xmm11 xorq %rcx,%rdi addq %r12,%rax vpxor %xmm9,%xmm11,%xmm11 shrdq $14,%r13,%r13 andq %rdi,%r15 vpaddq %xmm11,%xmm3,%xmm3 xorq %rbx,%r14 addq %r13,%rax vpaddq -32(%rbp),%xmm3,%xmm10 xorq %rcx,%r15 shrdq $28,%r14,%r14 addq %rax,%r8 addq %r15,%rax movq %r8,%r13 addq %rax,%r14 vmovdqa %xmm10,48(%rsp) vpalignr $8,%xmm4,%xmm5,%xmm8 shrdq $23,%r13,%r13 movq %r14,%rax vpalignr $8,%xmm0,%xmm1,%xmm11 movq %r9,%r12 shrdq $5,%r14,%r14 vpsrlq $1,%xmm8,%xmm10 xorq %r8,%r13 xorq %r10,%r12 vpaddq %xmm11,%xmm4,%xmm4 shrdq $4,%r13,%r13 xorq %rax,%r14 vpsrlq $7,%xmm8,%xmm11 andq %r8,%r12 xorq %r8,%r13 vpsllq $56,%xmm8,%xmm9 addq 64(%rsp),%r11 movq %rax,%r15 vpxor %xmm10,%xmm11,%xmm8 xorq %r10,%r12 shrdq $6,%r14,%r14 vpsrlq $7,%xmm10,%xmm10 xorq %rbx,%r15 addq %r12,%r11 vpxor %xmm9,%xmm8,%xmm8 shrdq $14,%r13,%r13 andq %r15,%rdi vpsllq $7,%xmm9,%xmm9 xorq %rax,%r14 addq %r13,%r11 vpxor %xmm10,%xmm8,%xmm8 xorq %rbx,%rdi shrdq $28,%r14,%r14 vpsrlq $6,%xmm3,%xmm11 addq %r11,%rdx addq %rdi,%r11 vpxor %xmm9,%xmm8,%xmm8 movq %rdx,%r13 addq %r11,%r14 vpsllq $3,%xmm3,%xmm10 shrdq $23,%r13,%r13 movq %r14,%r11 vpaddq %xmm8,%xmm4,%xmm4 movq %r8,%r12 shrdq $5,%r14,%r14 vpsrlq $19,%xmm3,%xmm9 xorq %rdx,%r13 xorq %r9,%r12 vpxor %xmm10,%xmm11,%xmm11 shrdq $4,%r13,%r13 xorq %r11,%r14 vpsllq $42,%xmm10,%xmm10 andq %rdx,%r12 xorq %rdx,%r13 vpxor %xmm9,%xmm11,%xmm11 addq 72(%rsp),%r10 movq %r11,%rdi vpsrlq $42,%xmm9,%xmm9 xorq %r9,%r12 shrdq $6,%r14,%r14 vpxor %xmm10,%xmm11,%xmm11 xorq %rax,%rdi addq %r12,%r10 vpxor %xmm9,%xmm11,%xmm11 shrdq $14,%r13,%r13 andq %rdi,%r15 vpaddq %xmm11,%xmm4,%xmm4 xorq %r11,%r14 addq %r13,%r10 vpaddq 0(%rbp),%xmm4,%xmm10 xorq %rax,%r15 shrdq $28,%r14,%r14 addq %r10,%rcx addq %r15,%r10 movq %rcx,%r13 addq %r10,%r14 vmovdqa %xmm10,64(%rsp) vpalignr $8,%xmm5,%xmm6,%xmm8 shrdq $23,%r13,%r13 movq %r14,%r10 vpalignr $8,%xmm1,%xmm2,%xmm11 movq %rdx,%r12 shrdq $5,%r14,%r14 vpsrlq $1,%xmm8,%xmm10 xorq %rcx,%r13 xorq %r8,%r12 vpaddq %xmm11,%xmm5,%xmm5 shrdq $4,%r13,%r13 xorq %r10,%r14 vpsrlq $7,%xmm8,%xmm11 andq %rcx,%r12 xorq %rcx,%r13 vpsllq $56,%xmm8,%xmm9 addq 80(%rsp),%r9 movq %r10,%r15 vpxor %xmm10,%xmm11,%xmm8 xorq %r8,%r12 shrdq $6,%r14,%r14 vpsrlq $7,%xmm10,%xmm10 xorq %r11,%r15 addq %r12,%r9 vpxor %xmm9,%xmm8,%xmm8 shrdq $14,%r13,%r13 andq %r15,%rdi vpsllq $7,%xmm9,%xmm9 xorq %r10,%r14 addq %r13,%r9 vpxor %xmm10,%xmm8,%xmm8 xorq %r11,%rdi shrdq $28,%r14,%r14 vpsrlq $6,%xmm4,%xmm11 addq %r9,%rbx addq %rdi,%r9 vpxor %xmm9,%xmm8,%xmm8 movq %rbx,%r13 addq %r9,%r14 vpsllq $3,%xmm4,%xmm10 shrdq $23,%r13,%r13 movq %r14,%r9 vpaddq %xmm8,%xmm5,%xmm5 movq %rcx,%r12 shrdq $5,%r14,%r14 vpsrlq $19,%xmm4,%xmm9 xorq %rbx,%r13 xorq %rdx,%r12 vpxor %xmm10,%xmm11,%xmm11 shrdq $4,%r13,%r13 xorq %r9,%r14 vpsllq $42,%xmm10,%xmm10 andq %rbx,%r12 xorq %rbx,%r13 vpxor %xmm9,%xmm11,%xmm11 addq 88(%rsp),%r8 movq %r9,%rdi vpsrlq $42,%xmm9,%xmm9 xorq %rdx,%r12 shrdq $6,%r14,%r14 vpxor %xmm10,%xmm11,%xmm11 xorq %r10,%rdi addq %r12,%r8 vpxor %xmm9,%xmm11,%xmm11 shrdq $14,%r13,%r13 andq %rdi,%r15 vpaddq %xmm11,%xmm5,%xmm5 xorq %r9,%r14 addq %r13,%r8 vpaddq 32(%rbp),%xmm5,%xmm10 xorq %r10,%r15 shrdq $28,%r14,%r14 addq %r8,%rax addq %r15,%r8 movq %rax,%r13 addq %r8,%r14 vmovdqa %xmm10,80(%rsp) vpalignr $8,%xmm6,%xmm7,%xmm8 shrdq $23,%r13,%r13 movq %r14,%r8 vpalignr $8,%xmm2,%xmm3,%xmm11 movq %rbx,%r12 shrdq $5,%r14,%r14 vpsrlq $1,%xmm8,%xmm10 xorq %rax,%r13 xorq %rcx,%r12 vpaddq %xmm11,%xmm6,%xmm6 shrdq $4,%r13,%r13 xorq %r8,%r14 vpsrlq $7,%xmm8,%xmm11 andq 
%rax,%r12 xorq %rax,%r13 vpsllq $56,%xmm8,%xmm9 addq 96(%rsp),%rdx movq %r8,%r15 vpxor %xmm10,%xmm11,%xmm8 xorq %rcx,%r12 shrdq $6,%r14,%r14 vpsrlq $7,%xmm10,%xmm10 xorq %r9,%r15 addq %r12,%rdx vpxor %xmm9,%xmm8,%xmm8 shrdq $14,%r13,%r13 andq %r15,%rdi vpsllq $7,%xmm9,%xmm9 xorq %r8,%r14 addq %r13,%rdx vpxor %xmm10,%xmm8,%xmm8 xorq %r9,%rdi shrdq $28,%r14,%r14 vpsrlq $6,%xmm5,%xmm11 addq %rdx,%r11 addq %rdi,%rdx vpxor %xmm9,%xmm8,%xmm8 movq %r11,%r13 addq %rdx,%r14 vpsllq $3,%xmm5,%xmm10 shrdq $23,%r13,%r13 movq %r14,%rdx vpaddq %xmm8,%xmm6,%xmm6 movq %rax,%r12 shrdq $5,%r14,%r14 vpsrlq $19,%xmm5,%xmm9 xorq %r11,%r13 xorq %rbx,%r12 vpxor %xmm10,%xmm11,%xmm11 shrdq $4,%r13,%r13 xorq %rdx,%r14 vpsllq $42,%xmm10,%xmm10 andq %r11,%r12 xorq %r11,%r13 vpxor %xmm9,%xmm11,%xmm11 addq 104(%rsp),%rcx movq %rdx,%rdi vpsrlq $42,%xmm9,%xmm9 xorq %rbx,%r12 shrdq $6,%r14,%r14 vpxor %xmm10,%xmm11,%xmm11 xorq %r8,%rdi addq %r12,%rcx vpxor %xmm9,%xmm11,%xmm11 shrdq $14,%r13,%r13 andq %rdi,%r15 vpaddq %xmm11,%xmm6,%xmm6 xorq %rdx,%r14 addq %r13,%rcx vpaddq 64(%rbp),%xmm6,%xmm10 xorq %r8,%r15 shrdq $28,%r14,%r14 addq %rcx,%r10 addq %r15,%rcx movq %r10,%r13 addq %rcx,%r14 vmovdqa %xmm10,96(%rsp) vpalignr $8,%xmm7,%xmm0,%xmm8 shrdq $23,%r13,%r13 movq %r14,%rcx vpalignr $8,%xmm3,%xmm4,%xmm11 movq %r11,%r12 shrdq $5,%r14,%r14 vpsrlq $1,%xmm8,%xmm10 xorq %r10,%r13 xorq %rax,%r12 vpaddq %xmm11,%xmm7,%xmm7 shrdq $4,%r13,%r13 xorq %rcx,%r14 vpsrlq $7,%xmm8,%xmm11 andq %r10,%r12 xorq %r10,%r13 vpsllq $56,%xmm8,%xmm9 addq 112(%rsp),%rbx movq %rcx,%r15 vpxor %xmm10,%xmm11,%xmm8 xorq %rax,%r12 shrdq $6,%r14,%r14 vpsrlq $7,%xmm10,%xmm10 xorq %rdx,%r15 addq %r12,%rbx vpxor %xmm9,%xmm8,%xmm8 shrdq $14,%r13,%r13 andq %r15,%rdi vpsllq $7,%xmm9,%xmm9 xorq %rcx,%r14 addq %r13,%rbx vpxor %xmm10,%xmm8,%xmm8 xorq %rdx,%rdi shrdq $28,%r14,%r14 vpsrlq $6,%xmm6,%xmm11 addq %rbx,%r9 addq %rdi,%rbx vpxor %xmm9,%xmm8,%xmm8 movq %r9,%r13 addq %rbx,%r14 vpsllq $3,%xmm6,%xmm10 shrdq $23,%r13,%r13 movq %r14,%rbx vpaddq %xmm8,%xmm7,%xmm7 movq %r10,%r12 shrdq $5,%r14,%r14 vpsrlq $19,%xmm6,%xmm9 xorq %r9,%r13 xorq %r11,%r12 vpxor %xmm10,%xmm11,%xmm11 shrdq $4,%r13,%r13 xorq %rbx,%r14 vpsllq $42,%xmm10,%xmm10 andq %r9,%r12 xorq %r9,%r13 vpxor %xmm9,%xmm11,%xmm11 addq 120(%rsp),%rax movq %rbx,%rdi vpsrlq $42,%xmm9,%xmm9 xorq %r11,%r12 shrdq $6,%r14,%r14 vpxor %xmm10,%xmm11,%xmm11 xorq %rcx,%rdi addq %r12,%rax vpxor %xmm9,%xmm11,%xmm11 shrdq $14,%r13,%r13 andq %rdi,%r15 vpaddq %xmm11,%xmm7,%xmm7 xorq %rbx,%r14 addq %r13,%rax vpaddq 96(%rbp),%xmm7,%xmm10 xorq %rcx,%r15 shrdq $28,%r14,%r14 addq %rax,%r8 addq %r15,%rax movq %r8,%r13 addq %rax,%r14 vmovdqa %xmm10,112(%rsp) cmpb $0,135(%rbp) jne .Lavx_00_47 shrdq $23,%r13,%r13 movq %r14,%rax movq %r9,%r12 shrdq $5,%r14,%r14 xorq %r8,%r13 xorq %r10,%r12 shrdq $4,%r13,%r13 xorq %rax,%r14 andq %r8,%r12 xorq %r8,%r13 addq 0(%rsp),%r11 movq %rax,%r15 xorq %r10,%r12 shrdq $6,%r14,%r14 xorq %rbx,%r15 addq %r12,%r11 shrdq $14,%r13,%r13 andq %r15,%rdi xorq %rax,%r14 addq %r13,%r11 xorq %rbx,%rdi shrdq $28,%r14,%r14 addq %r11,%rdx addq %rdi,%r11 movq %rdx,%r13 addq %r11,%r14 shrdq $23,%r13,%r13 movq %r14,%r11 movq %r8,%r12 shrdq $5,%r14,%r14 xorq %rdx,%r13 xorq %r9,%r12 shrdq $4,%r13,%r13 xorq %r11,%r14 andq %rdx,%r12 xorq %rdx,%r13 addq 8(%rsp),%r10 movq %r11,%rdi xorq %r9,%r12 shrdq $6,%r14,%r14 xorq %rax,%rdi addq %r12,%r10 shrdq $14,%r13,%r13 andq %rdi,%r15 xorq %r11,%r14 addq %r13,%r10 xorq %rax,%r15 shrdq $28,%r14,%r14 addq %r10,%rcx addq %r15,%r10 movq %rcx,%r13 addq %r10,%r14 shrdq $23,%r13,%r13 movq 
%r14,%r10 movq %rdx,%r12 shrdq $5,%r14,%r14 xorq %rcx,%r13 xorq %r8,%r12 shrdq $4,%r13,%r13 xorq %r10,%r14 andq %rcx,%r12 xorq %rcx,%r13 addq 16(%rsp),%r9 movq %r10,%r15 xorq %r8,%r12 shrdq $6,%r14,%r14 xorq %r11,%r15 addq %r12,%r9 shrdq $14,%r13,%r13 andq %r15,%rdi xorq %r10,%r14 addq %r13,%r9 xorq %r11,%rdi shrdq $28,%r14,%r14 addq %r9,%rbx addq %rdi,%r9 movq %rbx,%r13 addq %r9,%r14 shrdq $23,%r13,%r13 movq %r14,%r9 movq %rcx,%r12 shrdq $5,%r14,%r14 xorq %rbx,%r13 xorq %rdx,%r12 shrdq $4,%r13,%r13 xorq %r9,%r14 andq %rbx,%r12 xorq %rbx,%r13 addq 24(%rsp),%r8 movq %r9,%rdi xorq %rdx,%r12 shrdq $6,%r14,%r14 xorq %r10,%rdi addq %r12,%r8 shrdq $14,%r13,%r13 andq %rdi,%r15 xorq %r9,%r14 addq %r13,%r8 xorq %r10,%r15 shrdq $28,%r14,%r14 addq %r8,%rax addq %r15,%r8 movq %rax,%r13 addq %r8,%r14 shrdq $23,%r13,%r13 movq %r14,%r8 movq %rbx,%r12 shrdq $5,%r14,%r14 xorq %rax,%r13 xorq %rcx,%r12 shrdq $4,%r13,%r13 xorq %r8,%r14 andq %rax,%r12 xorq %rax,%r13 addq 32(%rsp),%rdx movq %r8,%r15 xorq %rcx,%r12 shrdq $6,%r14,%r14 xorq %r9,%r15 addq %r12,%rdx shrdq $14,%r13,%r13 andq %r15,%rdi xorq %r8,%r14 addq %r13,%rdx xorq %r9,%rdi shrdq $28,%r14,%r14 addq %rdx,%r11 addq %rdi,%rdx movq %r11,%r13 addq %rdx,%r14 shrdq $23,%r13,%r13 movq %r14,%rdx movq %rax,%r12 shrdq $5,%r14,%r14 xorq %r11,%r13 xorq %rbx,%r12 shrdq $4,%r13,%r13 xorq %rdx,%r14 andq %r11,%r12 xorq %r11,%r13 addq 40(%rsp),%rcx movq %rdx,%rdi xorq %rbx,%r12 shrdq $6,%r14,%r14 xorq %r8,%rdi addq %r12,%rcx shrdq $14,%r13,%r13 andq %rdi,%r15 xorq %rdx,%r14 addq %r13,%rcx xorq %r8,%r15 shrdq $28,%r14,%r14 addq %rcx,%r10 addq %r15,%rcx movq %r10,%r13 addq %rcx,%r14 shrdq $23,%r13,%r13 movq %r14,%rcx movq %r11,%r12 shrdq $5,%r14,%r14 xorq %r10,%r13 xorq %rax,%r12 shrdq $4,%r13,%r13 xorq %rcx,%r14 andq %r10,%r12 xorq %r10,%r13 addq 48(%rsp),%rbx movq %rcx,%r15 xorq %rax,%r12 shrdq $6,%r14,%r14 xorq %rdx,%r15 addq %r12,%rbx shrdq $14,%r13,%r13 andq %r15,%rdi xorq %rcx,%r14 addq %r13,%rbx xorq %rdx,%rdi shrdq $28,%r14,%r14 addq %rbx,%r9 addq %rdi,%rbx movq %r9,%r13 addq %rbx,%r14 shrdq $23,%r13,%r13 movq %r14,%rbx movq %r10,%r12 shrdq $5,%r14,%r14 xorq %r9,%r13 xorq %r11,%r12 shrdq $4,%r13,%r13 xorq %rbx,%r14 andq %r9,%r12 xorq %r9,%r13 addq 56(%rsp),%rax movq %rbx,%rdi xorq %r11,%r12 shrdq $6,%r14,%r14 xorq %rcx,%rdi addq %r12,%rax shrdq $14,%r13,%r13 andq %rdi,%r15 xorq %rbx,%r14 addq %r13,%rax xorq %rcx,%r15 shrdq $28,%r14,%r14 addq %rax,%r8 addq %r15,%rax movq %r8,%r13 addq %rax,%r14 shrdq $23,%r13,%r13 movq %r14,%rax movq %r9,%r12 shrdq $5,%r14,%r14 xorq %r8,%r13 xorq %r10,%r12 shrdq $4,%r13,%r13 xorq %rax,%r14 andq %r8,%r12 xorq %r8,%r13 addq 64(%rsp),%r11 movq %rax,%r15 xorq %r10,%r12 shrdq $6,%r14,%r14 xorq %rbx,%r15 addq %r12,%r11 shrdq $14,%r13,%r13 andq %r15,%rdi xorq %rax,%r14 addq %r13,%r11 xorq %rbx,%rdi shrdq $28,%r14,%r14 addq %r11,%rdx addq %rdi,%r11 movq %rdx,%r13 addq %r11,%r14 shrdq $23,%r13,%r13 movq %r14,%r11 movq %r8,%r12 shrdq $5,%r14,%r14 xorq %rdx,%r13 xorq %r9,%r12 shrdq $4,%r13,%r13 xorq %r11,%r14 andq %rdx,%r12 xorq %rdx,%r13 addq 72(%rsp),%r10 movq %r11,%rdi xorq %r9,%r12 shrdq $6,%r14,%r14 xorq %rax,%rdi addq %r12,%r10 shrdq $14,%r13,%r13 andq %rdi,%r15 xorq %r11,%r14 addq %r13,%r10 xorq %rax,%r15 shrdq $28,%r14,%r14 addq %r10,%rcx addq %r15,%r10 movq %rcx,%r13 addq %r10,%r14 shrdq $23,%r13,%r13 movq %r14,%r10 movq %rdx,%r12 shrdq $5,%r14,%r14 xorq %rcx,%r13 xorq %r8,%r12 shrdq $4,%r13,%r13 xorq %r10,%r14 andq %rcx,%r12 xorq %rcx,%r13 addq 80(%rsp),%r9 movq %r10,%r15 xorq %r8,%r12 shrdq $6,%r14,%r14 xorq %r11,%r15 addq 
%r12,%r9 shrdq $14,%r13,%r13 andq %r15,%rdi xorq %r10,%r14 addq %r13,%r9 xorq %r11,%rdi shrdq $28,%r14,%r14 addq %r9,%rbx addq %rdi,%r9 movq %rbx,%r13 addq %r9,%r14 shrdq $23,%r13,%r13 movq %r14,%r9 movq %rcx,%r12 shrdq $5,%r14,%r14 xorq %rbx,%r13 xorq %rdx,%r12 shrdq $4,%r13,%r13 xorq %r9,%r14 andq %rbx,%r12 xorq %rbx,%r13 addq 88(%rsp),%r8 movq %r9,%rdi xorq %rdx,%r12 shrdq $6,%r14,%r14 xorq %r10,%rdi addq %r12,%r8 shrdq $14,%r13,%r13 andq %rdi,%r15 xorq %r9,%r14 addq %r13,%r8 xorq %r10,%r15 shrdq $28,%r14,%r14 addq %r8,%rax addq %r15,%r8 movq %rax,%r13 addq %r8,%r14 shrdq $23,%r13,%r13 movq %r14,%r8 movq %rbx,%r12 shrdq $5,%r14,%r14 xorq %rax,%r13 xorq %rcx,%r12 shrdq $4,%r13,%r13 xorq %r8,%r14 andq %rax,%r12 xorq %rax,%r13 addq 96(%rsp),%rdx movq %r8,%r15 xorq %rcx,%r12 shrdq $6,%r14,%r14 xorq %r9,%r15 addq %r12,%rdx shrdq $14,%r13,%r13 andq %r15,%rdi xorq %r8,%r14 addq %r13,%rdx xorq %r9,%rdi shrdq $28,%r14,%r14 addq %rdx,%r11 addq %rdi,%rdx movq %r11,%r13 addq %rdx,%r14 shrdq $23,%r13,%r13 movq %r14,%rdx movq %rax,%r12 shrdq $5,%r14,%r14 xorq %r11,%r13 xorq %rbx,%r12 shrdq $4,%r13,%r13 xorq %rdx,%r14 andq %r11,%r12 xorq %r11,%r13 addq 104(%rsp),%rcx movq %rdx,%rdi xorq %rbx,%r12 shrdq $6,%r14,%r14 xorq %r8,%rdi addq %r12,%rcx shrdq $14,%r13,%r13 andq %rdi,%r15 xorq %rdx,%r14 addq %r13,%rcx xorq %r8,%r15 shrdq $28,%r14,%r14 addq %rcx,%r10 addq %r15,%rcx movq %r10,%r13 addq %rcx,%r14 shrdq $23,%r13,%r13 movq %r14,%rcx movq %r11,%r12 shrdq $5,%r14,%r14 xorq %r10,%r13 xorq %rax,%r12 shrdq $4,%r13,%r13 xorq %rcx,%r14 andq %r10,%r12 xorq %r10,%r13 addq 112(%rsp),%rbx movq %rcx,%r15 xorq %rax,%r12 shrdq $6,%r14,%r14 xorq %rdx,%r15 addq %r12,%rbx shrdq $14,%r13,%r13 andq %r15,%rdi xorq %rcx,%r14 addq %r13,%rbx xorq %rdx,%rdi shrdq $28,%r14,%r14 addq %rbx,%r9 addq %rdi,%rbx movq %r9,%r13 addq %rbx,%r14 shrdq $23,%r13,%r13 movq %r14,%rbx movq %r10,%r12 shrdq $5,%r14,%r14 xorq %r9,%r13 xorq %r11,%r12 shrdq $4,%r13,%r13 xorq %rbx,%r14 andq %r9,%r12 xorq %r9,%r13 addq 120(%rsp),%rax movq %rbx,%rdi xorq %r11,%r12 shrdq $6,%r14,%r14 xorq %rcx,%rdi addq %r12,%rax shrdq $14,%r13,%r13 andq %rdi,%r15 xorq %rbx,%r14 addq %r13,%rax xorq %rcx,%r15 shrdq $28,%r14,%r14 addq %rax,%r8 addq %r15,%rax movq %r8,%r13 addq %rax,%r14 movq 128+0(%rsp),%rdi movq %r14,%rax addq 0(%rdi),%rax leaq 128(%rsi),%rsi addq 8(%rdi),%rbx addq 16(%rdi),%rcx addq 24(%rdi),%rdx addq 32(%rdi),%r8 addq 40(%rdi),%r9 addq 48(%rdi),%r10 addq 56(%rdi),%r11 cmpq 128+16(%rsp),%rsi movq %rax,0(%rdi) movq %rbx,8(%rdi) movq %rcx,16(%rdi) movq %rdx,24(%rdi) movq %r8,32(%rdi) movq %r9,40(%rdi) movq %r10,48(%rdi) movq %r11,56(%rdi) jb .Lloop_avx movq 152(%rsp),%rsi .cfi_def_cfa %rsi,8 vzeroupper movq -48(%rsi),%r15 .cfi_restore %r15 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbp .cfi_restore %rbp movq -8(%rsi),%rbx .cfi_restore %rbx leaq (%rsi),%rsp .cfi_def_cfa_register %rsp .Lepilogue_avx: ret .cfi_endproc .size sha512_block_data_order_avx,.-sha512_block_data_order_avx #endif ring-0.17.14/pregenerated/sha512-x86_64-macosx.S000064400000000000000000001351321046102023000167620ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
#include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) .text .globl _sha512_block_data_order_nohw .private_extern _sha512_block_data_order_nohw .p2align 4 _sha512_block_data_order_nohw: _CET_ENDBR movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 shlq $4,%rdx subq $128+32,%rsp leaq (%rsi,%rdx,8),%rdx andq $-64,%rsp movq %rdi,128+0(%rsp) movq %rsi,128+8(%rsp) movq %rdx,128+16(%rsp) movq %rax,152(%rsp) L$prologue: movq 0(%rdi),%rax movq 8(%rdi),%rbx movq 16(%rdi),%rcx movq 24(%rdi),%rdx movq 32(%rdi),%r8 movq 40(%rdi),%r9 movq 48(%rdi),%r10 movq 56(%rdi),%r11 jmp L$loop .p2align 4 L$loop: movq %rbx,%rdi leaq K512(%rip),%rbp xorq %rcx,%rdi movq 0(%rsi),%r12 movq %r8,%r13 movq %rax,%r14 bswapq %r12 rorq $23,%r13 movq %r9,%r15 xorq %r8,%r13 rorq $5,%r14 xorq %r10,%r15 movq %r12,0(%rsp) xorq %rax,%r14 andq %r8,%r15 rorq $4,%r13 addq %r11,%r12 xorq %r10,%r15 rorq $6,%r14 xorq %r8,%r13 addq %r15,%r12 movq %rax,%r15 addq (%rbp),%r12 xorq %rax,%r14 xorq %rbx,%r15 rorq $14,%r13 movq %rbx,%r11 andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%r11 addq %r12,%rdx addq %r12,%r11 leaq 8(%rbp),%rbp addq %r14,%r11 movq 8(%rsi),%r12 movq %rdx,%r13 movq %r11,%r14 bswapq %r12 rorq $23,%r13 movq %r8,%rdi xorq %rdx,%r13 rorq $5,%r14 xorq %r9,%rdi movq %r12,8(%rsp) xorq %r11,%r14 andq %rdx,%rdi rorq $4,%r13 addq %r10,%r12 xorq %r9,%rdi rorq $6,%r14 xorq %rdx,%r13 addq %rdi,%r12 movq %r11,%rdi addq (%rbp),%r12 xorq %r11,%r14 xorq %rax,%rdi rorq $14,%r13 movq %rax,%r10 andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%r10 addq %r12,%rcx addq %r12,%r10 leaq 24(%rbp),%rbp addq %r14,%r10 movq 16(%rsi),%r12 movq %rcx,%r13 movq %r10,%r14 bswapq %r12 rorq $23,%r13 movq %rdx,%r15 xorq %rcx,%r13 rorq $5,%r14 xorq %r8,%r15 movq %r12,16(%rsp) xorq %r10,%r14 andq %rcx,%r15 rorq $4,%r13 addq %r9,%r12 xorq %r8,%r15 rorq $6,%r14 xorq %rcx,%r13 addq %r15,%r12 movq %r10,%r15 addq (%rbp),%r12 xorq %r10,%r14 xorq %r11,%r15 rorq $14,%r13 movq %r11,%r9 andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%r9 addq %r12,%rbx addq %r12,%r9 leaq 8(%rbp),%rbp addq %r14,%r9 movq 24(%rsi),%r12 movq %rbx,%r13 movq %r9,%r14 bswapq %r12 rorq $23,%r13 movq %rcx,%rdi xorq %rbx,%r13 rorq $5,%r14 xorq %rdx,%rdi movq %r12,24(%rsp) xorq %r9,%r14 andq %rbx,%rdi rorq $4,%r13 addq %r8,%r12 xorq %rdx,%rdi rorq $6,%r14 xorq %rbx,%r13 addq %rdi,%r12 movq %r9,%rdi addq (%rbp),%r12 xorq %r9,%r14 xorq %r10,%rdi rorq $14,%r13 movq %r10,%r8 andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%r8 addq %r12,%rax addq %r12,%r8 leaq 24(%rbp),%rbp addq %r14,%r8 movq 32(%rsi),%r12 movq %rax,%r13 movq %r8,%r14 bswapq %r12 rorq $23,%r13 movq %rbx,%r15 xorq %rax,%r13 rorq $5,%r14 xorq %rcx,%r15 movq %r12,32(%rsp) xorq %r8,%r14 andq %rax,%r15 rorq $4,%r13 addq %rdx,%r12 xorq %rcx,%r15 rorq $6,%r14 xorq %rax,%r13 addq %r15,%r12 movq %r8,%r15 addq (%rbp),%r12 xorq %r8,%r14 xorq %r9,%r15 rorq $14,%r13 movq %r9,%rdx andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%rdx addq %r12,%r11 addq %r12,%rdx leaq 8(%rbp),%rbp addq %r14,%rdx movq 40(%rsi),%r12 movq %r11,%r13 movq %rdx,%r14 bswapq %r12 rorq $23,%r13 movq %rax,%rdi xorq %r11,%r13 rorq $5,%r14 xorq %rbx,%rdi movq %r12,40(%rsp) xorq %rdx,%r14 andq %r11,%rdi rorq $4,%r13 addq %rcx,%r12 xorq %rbx,%rdi rorq $6,%r14 xorq %r11,%r13 addq %rdi,%r12 movq %rdx,%rdi addq (%rbp),%r12 xorq %rdx,%r14 xorq %r8,%rdi rorq $14,%r13 movq %r8,%rcx andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%rcx addq %r12,%r10 addq %r12,%rcx leaq 24(%rbp),%rbp addq 
%r14,%rcx movq 48(%rsi),%r12 movq %r10,%r13 movq %rcx,%r14 bswapq %r12 rorq $23,%r13 movq %r11,%r15 xorq %r10,%r13 rorq $5,%r14 xorq %rax,%r15 movq %r12,48(%rsp) xorq %rcx,%r14 andq %r10,%r15 rorq $4,%r13 addq %rbx,%r12 xorq %rax,%r15 rorq $6,%r14 xorq %r10,%r13 addq %r15,%r12 movq %rcx,%r15 addq (%rbp),%r12 xorq %rcx,%r14 xorq %rdx,%r15 rorq $14,%r13 movq %rdx,%rbx andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%rbx addq %r12,%r9 addq %r12,%rbx leaq 8(%rbp),%rbp addq %r14,%rbx movq 56(%rsi),%r12 movq %r9,%r13 movq %rbx,%r14 bswapq %r12 rorq $23,%r13 movq %r10,%rdi xorq %r9,%r13 rorq $5,%r14 xorq %r11,%rdi movq %r12,56(%rsp) xorq %rbx,%r14 andq %r9,%rdi rorq $4,%r13 addq %rax,%r12 xorq %r11,%rdi rorq $6,%r14 xorq %r9,%r13 addq %rdi,%r12 movq %rbx,%rdi addq (%rbp),%r12 xorq %rbx,%r14 xorq %rcx,%rdi rorq $14,%r13 movq %rcx,%rax andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%rax addq %r12,%r8 addq %r12,%rax leaq 24(%rbp),%rbp addq %r14,%rax movq 64(%rsi),%r12 movq %r8,%r13 movq %rax,%r14 bswapq %r12 rorq $23,%r13 movq %r9,%r15 xorq %r8,%r13 rorq $5,%r14 xorq %r10,%r15 movq %r12,64(%rsp) xorq %rax,%r14 andq %r8,%r15 rorq $4,%r13 addq %r11,%r12 xorq %r10,%r15 rorq $6,%r14 xorq %r8,%r13 addq %r15,%r12 movq %rax,%r15 addq (%rbp),%r12 xorq %rax,%r14 xorq %rbx,%r15 rorq $14,%r13 movq %rbx,%r11 andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%r11 addq %r12,%rdx addq %r12,%r11 leaq 8(%rbp),%rbp addq %r14,%r11 movq 72(%rsi),%r12 movq %rdx,%r13 movq %r11,%r14 bswapq %r12 rorq $23,%r13 movq %r8,%rdi xorq %rdx,%r13 rorq $5,%r14 xorq %r9,%rdi movq %r12,72(%rsp) xorq %r11,%r14 andq %rdx,%rdi rorq $4,%r13 addq %r10,%r12 xorq %r9,%rdi rorq $6,%r14 xorq %rdx,%r13 addq %rdi,%r12 movq %r11,%rdi addq (%rbp),%r12 xorq %r11,%r14 xorq %rax,%rdi rorq $14,%r13 movq %rax,%r10 andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%r10 addq %r12,%rcx addq %r12,%r10 leaq 24(%rbp),%rbp addq %r14,%r10 movq 80(%rsi),%r12 movq %rcx,%r13 movq %r10,%r14 bswapq %r12 rorq $23,%r13 movq %rdx,%r15 xorq %rcx,%r13 rorq $5,%r14 xorq %r8,%r15 movq %r12,80(%rsp) xorq %r10,%r14 andq %rcx,%r15 rorq $4,%r13 addq %r9,%r12 xorq %r8,%r15 rorq $6,%r14 xorq %rcx,%r13 addq %r15,%r12 movq %r10,%r15 addq (%rbp),%r12 xorq %r10,%r14 xorq %r11,%r15 rorq $14,%r13 movq %r11,%r9 andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%r9 addq %r12,%rbx addq %r12,%r9 leaq 8(%rbp),%rbp addq %r14,%r9 movq 88(%rsi),%r12 movq %rbx,%r13 movq %r9,%r14 bswapq %r12 rorq $23,%r13 movq %rcx,%rdi xorq %rbx,%r13 rorq $5,%r14 xorq %rdx,%rdi movq %r12,88(%rsp) xorq %r9,%r14 andq %rbx,%rdi rorq $4,%r13 addq %r8,%r12 xorq %rdx,%rdi rorq $6,%r14 xorq %rbx,%r13 addq %rdi,%r12 movq %r9,%rdi addq (%rbp),%r12 xorq %r9,%r14 xorq %r10,%rdi rorq $14,%r13 movq %r10,%r8 andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%r8 addq %r12,%rax addq %r12,%r8 leaq 24(%rbp),%rbp addq %r14,%r8 movq 96(%rsi),%r12 movq %rax,%r13 movq %r8,%r14 bswapq %r12 rorq $23,%r13 movq %rbx,%r15 xorq %rax,%r13 rorq $5,%r14 xorq %rcx,%r15 movq %r12,96(%rsp) xorq %r8,%r14 andq %rax,%r15 rorq $4,%r13 addq %rdx,%r12 xorq %rcx,%r15 rorq $6,%r14 xorq %rax,%r13 addq %r15,%r12 movq %r8,%r15 addq (%rbp),%r12 xorq %r8,%r14 xorq %r9,%r15 rorq $14,%r13 movq %r9,%rdx andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%rdx addq %r12,%r11 addq %r12,%rdx leaq 8(%rbp),%rbp addq %r14,%rdx movq 104(%rsi),%r12 movq %r11,%r13 movq %rdx,%r14 bswapq %r12 rorq $23,%r13 movq %rax,%rdi xorq %r11,%r13 rorq $5,%r14 xorq %rbx,%rdi movq %r12,104(%rsp) xorq %rdx,%r14 andq %r11,%rdi rorq $4,%r13 addq 
%rcx,%r12 xorq %rbx,%rdi rorq $6,%r14 xorq %r11,%r13 addq %rdi,%r12 movq %rdx,%rdi addq (%rbp),%r12 xorq %rdx,%r14 xorq %r8,%rdi rorq $14,%r13 movq %r8,%rcx andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%rcx addq %r12,%r10 addq %r12,%rcx leaq 24(%rbp),%rbp addq %r14,%rcx movq 112(%rsi),%r12 movq %r10,%r13 movq %rcx,%r14 bswapq %r12 rorq $23,%r13 movq %r11,%r15 xorq %r10,%r13 rorq $5,%r14 xorq %rax,%r15 movq %r12,112(%rsp) xorq %rcx,%r14 andq %r10,%r15 rorq $4,%r13 addq %rbx,%r12 xorq %rax,%r15 rorq $6,%r14 xorq %r10,%r13 addq %r15,%r12 movq %rcx,%r15 addq (%rbp),%r12 xorq %rcx,%r14 xorq %rdx,%r15 rorq $14,%r13 movq %rdx,%rbx andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%rbx addq %r12,%r9 addq %r12,%rbx leaq 8(%rbp),%rbp addq %r14,%rbx movq 120(%rsi),%r12 movq %r9,%r13 movq %rbx,%r14 bswapq %r12 rorq $23,%r13 movq %r10,%rdi xorq %r9,%r13 rorq $5,%r14 xorq %r11,%rdi movq %r12,120(%rsp) xorq %rbx,%r14 andq %r9,%rdi rorq $4,%r13 addq %rax,%r12 xorq %r11,%rdi rorq $6,%r14 xorq %r9,%r13 addq %rdi,%r12 movq %rbx,%rdi addq (%rbp),%r12 xorq %rbx,%r14 xorq %rcx,%rdi rorq $14,%r13 movq %rcx,%rax andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%rax addq %r12,%r8 addq %r12,%rax leaq 24(%rbp),%rbp jmp L$rounds_16_xx .p2align 4 L$rounds_16_xx: movq 8(%rsp),%r13 movq 112(%rsp),%r15 movq %r13,%r12 rorq $7,%r13 addq %r14,%rax movq %r15,%r14 rorq $42,%r15 xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%r15 shrq $6,%r14 rorq $19,%r15 xorq %r13,%r12 xorq %r14,%r15 addq 72(%rsp),%r12 addq 0(%rsp),%r12 movq %r8,%r13 addq %r15,%r12 movq %rax,%r14 rorq $23,%r13 movq %r9,%r15 xorq %r8,%r13 rorq $5,%r14 xorq %r10,%r15 movq %r12,0(%rsp) xorq %rax,%r14 andq %r8,%r15 rorq $4,%r13 addq %r11,%r12 xorq %r10,%r15 rorq $6,%r14 xorq %r8,%r13 addq %r15,%r12 movq %rax,%r15 addq (%rbp),%r12 xorq %rax,%r14 xorq %rbx,%r15 rorq $14,%r13 movq %rbx,%r11 andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%r11 addq %r12,%rdx addq %r12,%r11 leaq 8(%rbp),%rbp movq 16(%rsp),%r13 movq 120(%rsp),%rdi movq %r13,%r12 rorq $7,%r13 addq %r14,%r11 movq %rdi,%r14 rorq $42,%rdi xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%rdi shrq $6,%r14 rorq $19,%rdi xorq %r13,%r12 xorq %r14,%rdi addq 80(%rsp),%r12 addq 8(%rsp),%r12 movq %rdx,%r13 addq %rdi,%r12 movq %r11,%r14 rorq $23,%r13 movq %r8,%rdi xorq %rdx,%r13 rorq $5,%r14 xorq %r9,%rdi movq %r12,8(%rsp) xorq %r11,%r14 andq %rdx,%rdi rorq $4,%r13 addq %r10,%r12 xorq %r9,%rdi rorq $6,%r14 xorq %rdx,%r13 addq %rdi,%r12 movq %r11,%rdi addq (%rbp),%r12 xorq %r11,%r14 xorq %rax,%rdi rorq $14,%r13 movq %rax,%r10 andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%r10 addq %r12,%rcx addq %r12,%r10 leaq 24(%rbp),%rbp movq 24(%rsp),%r13 movq 0(%rsp),%r15 movq %r13,%r12 rorq $7,%r13 addq %r14,%r10 movq %r15,%r14 rorq $42,%r15 xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%r15 shrq $6,%r14 rorq $19,%r15 xorq %r13,%r12 xorq %r14,%r15 addq 88(%rsp),%r12 addq 16(%rsp),%r12 movq %rcx,%r13 addq %r15,%r12 movq %r10,%r14 rorq $23,%r13 movq %rdx,%r15 xorq %rcx,%r13 rorq $5,%r14 xorq %r8,%r15 movq %r12,16(%rsp) xorq %r10,%r14 andq %rcx,%r15 rorq $4,%r13 addq %r9,%r12 xorq %r8,%r15 rorq $6,%r14 xorq %rcx,%r13 addq %r15,%r12 movq %r10,%r15 addq (%rbp),%r12 xorq %r10,%r14 xorq %r11,%r15 rorq $14,%r13 movq %r11,%r9 andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%r9 addq %r12,%rbx addq %r12,%r9 leaq 8(%rbp),%rbp movq 32(%rsp),%r13 movq 8(%rsp),%rdi movq %r13,%r12 rorq $7,%r13 addq %r14,%r9 movq %rdi,%r14 rorq $42,%rdi xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%rdi 
shrq $6,%r14 rorq $19,%rdi xorq %r13,%r12 xorq %r14,%rdi addq 96(%rsp),%r12 addq 24(%rsp),%r12 movq %rbx,%r13 addq %rdi,%r12 movq %r9,%r14 rorq $23,%r13 movq %rcx,%rdi xorq %rbx,%r13 rorq $5,%r14 xorq %rdx,%rdi movq %r12,24(%rsp) xorq %r9,%r14 andq %rbx,%rdi rorq $4,%r13 addq %r8,%r12 xorq %rdx,%rdi rorq $6,%r14 xorq %rbx,%r13 addq %rdi,%r12 movq %r9,%rdi addq (%rbp),%r12 xorq %r9,%r14 xorq %r10,%rdi rorq $14,%r13 movq %r10,%r8 andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%r8 addq %r12,%rax addq %r12,%r8 leaq 24(%rbp),%rbp movq 40(%rsp),%r13 movq 16(%rsp),%r15 movq %r13,%r12 rorq $7,%r13 addq %r14,%r8 movq %r15,%r14 rorq $42,%r15 xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%r15 shrq $6,%r14 rorq $19,%r15 xorq %r13,%r12 xorq %r14,%r15 addq 104(%rsp),%r12 addq 32(%rsp),%r12 movq %rax,%r13 addq %r15,%r12 movq %r8,%r14 rorq $23,%r13 movq %rbx,%r15 xorq %rax,%r13 rorq $5,%r14 xorq %rcx,%r15 movq %r12,32(%rsp) xorq %r8,%r14 andq %rax,%r15 rorq $4,%r13 addq %rdx,%r12 xorq %rcx,%r15 rorq $6,%r14 xorq %rax,%r13 addq %r15,%r12 movq %r8,%r15 addq (%rbp),%r12 xorq %r8,%r14 xorq %r9,%r15 rorq $14,%r13 movq %r9,%rdx andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%rdx addq %r12,%r11 addq %r12,%rdx leaq 8(%rbp),%rbp movq 48(%rsp),%r13 movq 24(%rsp),%rdi movq %r13,%r12 rorq $7,%r13 addq %r14,%rdx movq %rdi,%r14 rorq $42,%rdi xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%rdi shrq $6,%r14 rorq $19,%rdi xorq %r13,%r12 xorq %r14,%rdi addq 112(%rsp),%r12 addq 40(%rsp),%r12 movq %r11,%r13 addq %rdi,%r12 movq %rdx,%r14 rorq $23,%r13 movq %rax,%rdi xorq %r11,%r13 rorq $5,%r14 xorq %rbx,%rdi movq %r12,40(%rsp) xorq %rdx,%r14 andq %r11,%rdi rorq $4,%r13 addq %rcx,%r12 xorq %rbx,%rdi rorq $6,%r14 xorq %r11,%r13 addq %rdi,%r12 movq %rdx,%rdi addq (%rbp),%r12 xorq %rdx,%r14 xorq %r8,%rdi rorq $14,%r13 movq %r8,%rcx andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%rcx addq %r12,%r10 addq %r12,%rcx leaq 24(%rbp),%rbp movq 56(%rsp),%r13 movq 32(%rsp),%r15 movq %r13,%r12 rorq $7,%r13 addq %r14,%rcx movq %r15,%r14 rorq $42,%r15 xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%r15 shrq $6,%r14 rorq $19,%r15 xorq %r13,%r12 xorq %r14,%r15 addq 120(%rsp),%r12 addq 48(%rsp),%r12 movq %r10,%r13 addq %r15,%r12 movq %rcx,%r14 rorq $23,%r13 movq %r11,%r15 xorq %r10,%r13 rorq $5,%r14 xorq %rax,%r15 movq %r12,48(%rsp) xorq %rcx,%r14 andq %r10,%r15 rorq $4,%r13 addq %rbx,%r12 xorq %rax,%r15 rorq $6,%r14 xorq %r10,%r13 addq %r15,%r12 movq %rcx,%r15 addq (%rbp),%r12 xorq %rcx,%r14 xorq %rdx,%r15 rorq $14,%r13 movq %rdx,%rbx andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%rbx addq %r12,%r9 addq %r12,%rbx leaq 8(%rbp),%rbp movq 64(%rsp),%r13 movq 40(%rsp),%rdi movq %r13,%r12 rorq $7,%r13 addq %r14,%rbx movq %rdi,%r14 rorq $42,%rdi xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%rdi shrq $6,%r14 rorq $19,%rdi xorq %r13,%r12 xorq %r14,%rdi addq 0(%rsp),%r12 addq 56(%rsp),%r12 movq %r9,%r13 addq %rdi,%r12 movq %rbx,%r14 rorq $23,%r13 movq %r10,%rdi xorq %r9,%r13 rorq $5,%r14 xorq %r11,%rdi movq %r12,56(%rsp) xorq %rbx,%r14 andq %r9,%rdi rorq $4,%r13 addq %rax,%r12 xorq %r11,%rdi rorq $6,%r14 xorq %r9,%r13 addq %rdi,%r12 movq %rbx,%rdi addq (%rbp),%r12 xorq %rbx,%r14 xorq %rcx,%rdi rorq $14,%r13 movq %rcx,%rax andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%rax addq %r12,%r8 addq %r12,%rax leaq 24(%rbp),%rbp movq 72(%rsp),%r13 movq 48(%rsp),%r15 movq %r13,%r12 rorq $7,%r13 addq %r14,%rax movq %r15,%r14 rorq $42,%r15 xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%r15 shrq $6,%r14 rorq 
$19,%r15 xorq %r13,%r12 xorq %r14,%r15 addq 8(%rsp),%r12 addq 64(%rsp),%r12 movq %r8,%r13 addq %r15,%r12 movq %rax,%r14 rorq $23,%r13 movq %r9,%r15 xorq %r8,%r13 rorq $5,%r14 xorq %r10,%r15 movq %r12,64(%rsp) xorq %rax,%r14 andq %r8,%r15 rorq $4,%r13 addq %r11,%r12 xorq %r10,%r15 rorq $6,%r14 xorq %r8,%r13 addq %r15,%r12 movq %rax,%r15 addq (%rbp),%r12 xorq %rax,%r14 xorq %rbx,%r15 rorq $14,%r13 movq %rbx,%r11 andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%r11 addq %r12,%rdx addq %r12,%r11 leaq 8(%rbp),%rbp movq 80(%rsp),%r13 movq 56(%rsp),%rdi movq %r13,%r12 rorq $7,%r13 addq %r14,%r11 movq %rdi,%r14 rorq $42,%rdi xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%rdi shrq $6,%r14 rorq $19,%rdi xorq %r13,%r12 xorq %r14,%rdi addq 16(%rsp),%r12 addq 72(%rsp),%r12 movq %rdx,%r13 addq %rdi,%r12 movq %r11,%r14 rorq $23,%r13 movq %r8,%rdi xorq %rdx,%r13 rorq $5,%r14 xorq %r9,%rdi movq %r12,72(%rsp) xorq %r11,%r14 andq %rdx,%rdi rorq $4,%r13 addq %r10,%r12 xorq %r9,%rdi rorq $6,%r14 xorq %rdx,%r13 addq %rdi,%r12 movq %r11,%rdi addq (%rbp),%r12 xorq %r11,%r14 xorq %rax,%rdi rorq $14,%r13 movq %rax,%r10 andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%r10 addq %r12,%rcx addq %r12,%r10 leaq 24(%rbp),%rbp movq 88(%rsp),%r13 movq 64(%rsp),%r15 movq %r13,%r12 rorq $7,%r13 addq %r14,%r10 movq %r15,%r14 rorq $42,%r15 xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%r15 shrq $6,%r14 rorq $19,%r15 xorq %r13,%r12 xorq %r14,%r15 addq 24(%rsp),%r12 addq 80(%rsp),%r12 movq %rcx,%r13 addq %r15,%r12 movq %r10,%r14 rorq $23,%r13 movq %rdx,%r15 xorq %rcx,%r13 rorq $5,%r14 xorq %r8,%r15 movq %r12,80(%rsp) xorq %r10,%r14 andq %rcx,%r15 rorq $4,%r13 addq %r9,%r12 xorq %r8,%r15 rorq $6,%r14 xorq %rcx,%r13 addq %r15,%r12 movq %r10,%r15 addq (%rbp),%r12 xorq %r10,%r14 xorq %r11,%r15 rorq $14,%r13 movq %r11,%r9 andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%r9 addq %r12,%rbx addq %r12,%r9 leaq 8(%rbp),%rbp movq 96(%rsp),%r13 movq 72(%rsp),%rdi movq %r13,%r12 rorq $7,%r13 addq %r14,%r9 movq %rdi,%r14 rorq $42,%rdi xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%rdi shrq $6,%r14 rorq $19,%rdi xorq %r13,%r12 xorq %r14,%rdi addq 32(%rsp),%r12 addq 88(%rsp),%r12 movq %rbx,%r13 addq %rdi,%r12 movq %r9,%r14 rorq $23,%r13 movq %rcx,%rdi xorq %rbx,%r13 rorq $5,%r14 xorq %rdx,%rdi movq %r12,88(%rsp) xorq %r9,%r14 andq %rbx,%rdi rorq $4,%r13 addq %r8,%r12 xorq %rdx,%rdi rorq $6,%r14 xorq %rbx,%r13 addq %rdi,%r12 movq %r9,%rdi addq (%rbp),%r12 xorq %r9,%r14 xorq %r10,%rdi rorq $14,%r13 movq %r10,%r8 andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%r8 addq %r12,%rax addq %r12,%r8 leaq 24(%rbp),%rbp movq 104(%rsp),%r13 movq 80(%rsp),%r15 movq %r13,%r12 rorq $7,%r13 addq %r14,%r8 movq %r15,%r14 rorq $42,%r15 xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%r15 shrq $6,%r14 rorq $19,%r15 xorq %r13,%r12 xorq %r14,%r15 addq 40(%rsp),%r12 addq 96(%rsp),%r12 movq %rax,%r13 addq %r15,%r12 movq %r8,%r14 rorq $23,%r13 movq %rbx,%r15 xorq %rax,%r13 rorq $5,%r14 xorq %rcx,%r15 movq %r12,96(%rsp) xorq %r8,%r14 andq %rax,%r15 rorq $4,%r13 addq %rdx,%r12 xorq %rcx,%r15 rorq $6,%r14 xorq %rax,%r13 addq %r15,%r12 movq %r8,%r15 addq (%rbp),%r12 xorq %r8,%r14 xorq %r9,%r15 rorq $14,%r13 movq %r9,%rdx andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%rdx addq %r12,%r11 addq %r12,%rdx leaq 8(%rbp),%rbp movq 112(%rsp),%r13 movq 88(%rsp),%rdi movq %r13,%r12 rorq $7,%r13 addq %r14,%rdx movq %rdi,%r14 rorq $42,%rdi xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%rdi shrq $6,%r14 rorq $19,%rdi xorq %r13,%r12 
xorq %r14,%rdi addq 48(%rsp),%r12 addq 104(%rsp),%r12 movq %r11,%r13 addq %rdi,%r12 movq %rdx,%r14 rorq $23,%r13 movq %rax,%rdi xorq %r11,%r13 rorq $5,%r14 xorq %rbx,%rdi movq %r12,104(%rsp) xorq %rdx,%r14 andq %r11,%rdi rorq $4,%r13 addq %rcx,%r12 xorq %rbx,%rdi rorq $6,%r14 xorq %r11,%r13 addq %rdi,%r12 movq %rdx,%rdi addq (%rbp),%r12 xorq %rdx,%r14 xorq %r8,%rdi rorq $14,%r13 movq %r8,%rcx andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%rcx addq %r12,%r10 addq %r12,%rcx leaq 24(%rbp),%rbp movq 120(%rsp),%r13 movq 96(%rsp),%r15 movq %r13,%r12 rorq $7,%r13 addq %r14,%rcx movq %r15,%r14 rorq $42,%r15 xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%r15 shrq $6,%r14 rorq $19,%r15 xorq %r13,%r12 xorq %r14,%r15 addq 56(%rsp),%r12 addq 112(%rsp),%r12 movq %r10,%r13 addq %r15,%r12 movq %rcx,%r14 rorq $23,%r13 movq %r11,%r15 xorq %r10,%r13 rorq $5,%r14 xorq %rax,%r15 movq %r12,112(%rsp) xorq %rcx,%r14 andq %r10,%r15 rorq $4,%r13 addq %rbx,%r12 xorq %rax,%r15 rorq $6,%r14 xorq %r10,%r13 addq %r15,%r12 movq %rcx,%r15 addq (%rbp),%r12 xorq %rcx,%r14 xorq %rdx,%r15 rorq $14,%r13 movq %rdx,%rbx andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%rbx addq %r12,%r9 addq %r12,%rbx leaq 8(%rbp),%rbp movq 0(%rsp),%r13 movq 104(%rsp),%rdi movq %r13,%r12 rorq $7,%r13 addq %r14,%rbx movq %rdi,%r14 rorq $42,%rdi xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%rdi shrq $6,%r14 rorq $19,%rdi xorq %r13,%r12 xorq %r14,%rdi addq 64(%rsp),%r12 addq 120(%rsp),%r12 movq %r9,%r13 addq %rdi,%r12 movq %rbx,%r14 rorq $23,%r13 movq %r10,%rdi xorq %r9,%r13 rorq $5,%r14 xorq %r11,%rdi movq %r12,120(%rsp) xorq %rbx,%r14 andq %r9,%rdi rorq $4,%r13 addq %rax,%r12 xorq %r11,%rdi rorq $6,%r14 xorq %r9,%r13 addq %rdi,%r12 movq %rbx,%rdi addq (%rbp),%r12 xorq %rbx,%r14 xorq %rcx,%rdi rorq $14,%r13 movq %rcx,%rax andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%rax addq %r12,%r8 addq %r12,%rax leaq 24(%rbp),%rbp cmpb $0,7(%rbp) jnz L$rounds_16_xx movq 128+0(%rsp),%rdi addq %r14,%rax leaq 128(%rsi),%rsi addq 0(%rdi),%rax addq 8(%rdi),%rbx addq 16(%rdi),%rcx addq 24(%rdi),%rdx addq 32(%rdi),%r8 addq 40(%rdi),%r9 addq 48(%rdi),%r10 addq 56(%rdi),%r11 cmpq 128+16(%rsp),%rsi movq %rax,0(%rdi) movq %rbx,8(%rdi) movq %rcx,16(%rdi) movq %rdx,24(%rdi) movq %r8,32(%rdi) movq %r9,40(%rdi) movq %r10,48(%rdi) movq %r11,56(%rdi) jb L$loop movq 152(%rsp),%rsi movq -48(%rsi),%r15 movq -40(%rsi),%r14 movq -32(%rsi),%r13 movq -24(%rsi),%r12 movq -16(%rsi),%rbp movq -8(%rsi),%rbx leaq (%rsi),%rsp L$epilogue: ret .section __DATA,__const .p2align 6 K512: .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 
0x5cb0a9dcbd41fbd4,0x76f988da831153b5 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x81c2c92e47edaee6,0x92722c851482353b .quad 0x81c2c92e47edaee6,0x92722c851482353b .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .quad 0x0001020304050607,0x08090a0b0c0d0e0f .quad 0x0001020304050607,0x08090a0b0c0d0e0f .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .text .globl _sha512_block_data_order_avx .private_extern _sha512_block_data_order_avx .p2align 6 _sha512_block_data_order_avx: _CET_ENDBR movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 shlq $4,%rdx subq $160,%rsp leaq (%rsi,%rdx,8),%rdx andq $-64,%rsp movq %rdi,128+0(%rsp) movq %rsi,128+8(%rsp) movq %rdx,128+16(%rsp) movq %rax,152(%rsp) L$prologue_avx: vzeroupper movq 0(%rdi),%rax movq 8(%rdi),%rbx movq 16(%rdi),%rcx movq 24(%rdi),%rdx movq 32(%rdi),%r8 movq 40(%rdi),%r9 movq 48(%rdi),%r10 movq 56(%rdi),%r11 jmp L$loop_avx .p2align 4 L$loop_avx: vmovdqa K512+1280(%rip),%xmm11 vmovdqu 0(%rsi),%xmm0 leaq K512+128(%rip),%rbp vmovdqu 
16(%rsi),%xmm1 vmovdqu 32(%rsi),%xmm2 vpshufb %xmm11,%xmm0,%xmm0 vmovdqu 48(%rsi),%xmm3 vpshufb %xmm11,%xmm1,%xmm1 vmovdqu 64(%rsi),%xmm4 vpshufb %xmm11,%xmm2,%xmm2 vmovdqu 80(%rsi),%xmm5 vpshufb %xmm11,%xmm3,%xmm3 vmovdqu 96(%rsi),%xmm6 vpshufb %xmm11,%xmm4,%xmm4 vmovdqu 112(%rsi),%xmm7 vpshufb %xmm11,%xmm5,%xmm5 vpaddq -128(%rbp),%xmm0,%xmm8 vpshufb %xmm11,%xmm6,%xmm6 vpaddq -96(%rbp),%xmm1,%xmm9 vpshufb %xmm11,%xmm7,%xmm7 vpaddq -64(%rbp),%xmm2,%xmm10 vpaddq -32(%rbp),%xmm3,%xmm11 vmovdqa %xmm8,0(%rsp) vpaddq 0(%rbp),%xmm4,%xmm8 vmovdqa %xmm9,16(%rsp) vpaddq 32(%rbp),%xmm5,%xmm9 vmovdqa %xmm10,32(%rsp) vpaddq 64(%rbp),%xmm6,%xmm10 vmovdqa %xmm11,48(%rsp) vpaddq 96(%rbp),%xmm7,%xmm11 vmovdqa %xmm8,64(%rsp) movq %rax,%r14 vmovdqa %xmm9,80(%rsp) movq %rbx,%rdi vmovdqa %xmm10,96(%rsp) xorq %rcx,%rdi vmovdqa %xmm11,112(%rsp) movq %r8,%r13 jmp L$avx_00_47 .p2align 4 L$avx_00_47: addq $256,%rbp vpalignr $8,%xmm0,%xmm1,%xmm8 shrdq $23,%r13,%r13 movq %r14,%rax vpalignr $8,%xmm4,%xmm5,%xmm11 movq %r9,%r12 shrdq $5,%r14,%r14 vpsrlq $1,%xmm8,%xmm10 xorq %r8,%r13 xorq %r10,%r12 vpaddq %xmm11,%xmm0,%xmm0 shrdq $4,%r13,%r13 xorq %rax,%r14 vpsrlq $7,%xmm8,%xmm11 andq %r8,%r12 xorq %r8,%r13 vpsllq $56,%xmm8,%xmm9 addq 0(%rsp),%r11 movq %rax,%r15 vpxor %xmm10,%xmm11,%xmm8 xorq %r10,%r12 shrdq $6,%r14,%r14 vpsrlq $7,%xmm10,%xmm10 xorq %rbx,%r15 addq %r12,%r11 vpxor %xmm9,%xmm8,%xmm8 shrdq $14,%r13,%r13 andq %r15,%rdi vpsllq $7,%xmm9,%xmm9 xorq %rax,%r14 addq %r13,%r11 vpxor %xmm10,%xmm8,%xmm8 xorq %rbx,%rdi shrdq $28,%r14,%r14 vpsrlq $6,%xmm7,%xmm11 addq %r11,%rdx addq %rdi,%r11 vpxor %xmm9,%xmm8,%xmm8 movq %rdx,%r13 addq %r11,%r14 vpsllq $3,%xmm7,%xmm10 shrdq $23,%r13,%r13 movq %r14,%r11 vpaddq %xmm8,%xmm0,%xmm0 movq %r8,%r12 shrdq $5,%r14,%r14 vpsrlq $19,%xmm7,%xmm9 xorq %rdx,%r13 xorq %r9,%r12 vpxor %xmm10,%xmm11,%xmm11 shrdq $4,%r13,%r13 xorq %r11,%r14 vpsllq $42,%xmm10,%xmm10 andq %rdx,%r12 xorq %rdx,%r13 vpxor %xmm9,%xmm11,%xmm11 addq 8(%rsp),%r10 movq %r11,%rdi vpsrlq $42,%xmm9,%xmm9 xorq %r9,%r12 shrdq $6,%r14,%r14 vpxor %xmm10,%xmm11,%xmm11 xorq %rax,%rdi addq %r12,%r10 vpxor %xmm9,%xmm11,%xmm11 shrdq $14,%r13,%r13 andq %rdi,%r15 vpaddq %xmm11,%xmm0,%xmm0 xorq %r11,%r14 addq %r13,%r10 vpaddq -128(%rbp),%xmm0,%xmm10 xorq %rax,%r15 shrdq $28,%r14,%r14 addq %r10,%rcx addq %r15,%r10 movq %rcx,%r13 addq %r10,%r14 vmovdqa %xmm10,0(%rsp) vpalignr $8,%xmm1,%xmm2,%xmm8 shrdq $23,%r13,%r13 movq %r14,%r10 vpalignr $8,%xmm5,%xmm6,%xmm11 movq %rdx,%r12 shrdq $5,%r14,%r14 vpsrlq $1,%xmm8,%xmm10 xorq %rcx,%r13 xorq %r8,%r12 vpaddq %xmm11,%xmm1,%xmm1 shrdq $4,%r13,%r13 xorq %r10,%r14 vpsrlq $7,%xmm8,%xmm11 andq %rcx,%r12 xorq %rcx,%r13 vpsllq $56,%xmm8,%xmm9 addq 16(%rsp),%r9 movq %r10,%r15 vpxor %xmm10,%xmm11,%xmm8 xorq %r8,%r12 shrdq $6,%r14,%r14 vpsrlq $7,%xmm10,%xmm10 xorq %r11,%r15 addq %r12,%r9 vpxor %xmm9,%xmm8,%xmm8 shrdq $14,%r13,%r13 andq %r15,%rdi vpsllq $7,%xmm9,%xmm9 xorq %r10,%r14 addq %r13,%r9 vpxor %xmm10,%xmm8,%xmm8 xorq %r11,%rdi shrdq $28,%r14,%r14 vpsrlq $6,%xmm0,%xmm11 addq %r9,%rbx addq %rdi,%r9 vpxor %xmm9,%xmm8,%xmm8 movq %rbx,%r13 addq %r9,%r14 vpsllq $3,%xmm0,%xmm10 shrdq $23,%r13,%r13 movq %r14,%r9 vpaddq %xmm8,%xmm1,%xmm1 movq %rcx,%r12 shrdq $5,%r14,%r14 vpsrlq $19,%xmm0,%xmm9 xorq %rbx,%r13 xorq %rdx,%r12 vpxor %xmm10,%xmm11,%xmm11 shrdq $4,%r13,%r13 xorq %r9,%r14 vpsllq $42,%xmm10,%xmm10 andq %rbx,%r12 xorq %rbx,%r13 vpxor %xmm9,%xmm11,%xmm11 addq 24(%rsp),%r8 movq %r9,%rdi vpsrlq $42,%xmm9,%xmm9 xorq %rdx,%r12 shrdq $6,%r14,%r14 vpxor %xmm10,%xmm11,%xmm11 xorq %r10,%rdi addq 
%r12,%r8 vpxor %xmm9,%xmm11,%xmm11 shrdq $14,%r13,%r13 andq %rdi,%r15 vpaddq %xmm11,%xmm1,%xmm1 xorq %r9,%r14 addq %r13,%r8 vpaddq -96(%rbp),%xmm1,%xmm10 xorq %r10,%r15 shrdq $28,%r14,%r14 addq %r8,%rax addq %r15,%r8 movq %rax,%r13 addq %r8,%r14 vmovdqa %xmm10,16(%rsp) vpalignr $8,%xmm2,%xmm3,%xmm8 shrdq $23,%r13,%r13 movq %r14,%r8 vpalignr $8,%xmm6,%xmm7,%xmm11 movq %rbx,%r12 shrdq $5,%r14,%r14 vpsrlq $1,%xmm8,%xmm10 xorq %rax,%r13 xorq %rcx,%r12 vpaddq %xmm11,%xmm2,%xmm2 shrdq $4,%r13,%r13 xorq %r8,%r14 vpsrlq $7,%xmm8,%xmm11 andq %rax,%r12 xorq %rax,%r13 vpsllq $56,%xmm8,%xmm9 addq 32(%rsp),%rdx movq %r8,%r15 vpxor %xmm10,%xmm11,%xmm8 xorq %rcx,%r12 shrdq $6,%r14,%r14 vpsrlq $7,%xmm10,%xmm10 xorq %r9,%r15 addq %r12,%rdx vpxor %xmm9,%xmm8,%xmm8 shrdq $14,%r13,%r13 andq %r15,%rdi vpsllq $7,%xmm9,%xmm9 xorq %r8,%r14 addq %r13,%rdx vpxor %xmm10,%xmm8,%xmm8 xorq %r9,%rdi shrdq $28,%r14,%r14 vpsrlq $6,%xmm1,%xmm11 addq %rdx,%r11 addq %rdi,%rdx vpxor %xmm9,%xmm8,%xmm8 movq %r11,%r13 addq %rdx,%r14 vpsllq $3,%xmm1,%xmm10 shrdq $23,%r13,%r13 movq %r14,%rdx vpaddq %xmm8,%xmm2,%xmm2 movq %rax,%r12 shrdq $5,%r14,%r14 vpsrlq $19,%xmm1,%xmm9 xorq %r11,%r13 xorq %rbx,%r12 vpxor %xmm10,%xmm11,%xmm11 shrdq $4,%r13,%r13 xorq %rdx,%r14 vpsllq $42,%xmm10,%xmm10 andq %r11,%r12 xorq %r11,%r13 vpxor %xmm9,%xmm11,%xmm11 addq 40(%rsp),%rcx movq %rdx,%rdi vpsrlq $42,%xmm9,%xmm9 xorq %rbx,%r12 shrdq $6,%r14,%r14 vpxor %xmm10,%xmm11,%xmm11 xorq %r8,%rdi addq %r12,%rcx vpxor %xmm9,%xmm11,%xmm11 shrdq $14,%r13,%r13 andq %rdi,%r15 vpaddq %xmm11,%xmm2,%xmm2 xorq %rdx,%r14 addq %r13,%rcx vpaddq -64(%rbp),%xmm2,%xmm10 xorq %r8,%r15 shrdq $28,%r14,%r14 addq %rcx,%r10 addq %r15,%rcx movq %r10,%r13 addq %rcx,%r14 vmovdqa %xmm10,32(%rsp) vpalignr $8,%xmm3,%xmm4,%xmm8 shrdq $23,%r13,%r13 movq %r14,%rcx vpalignr $8,%xmm7,%xmm0,%xmm11 movq %r11,%r12 shrdq $5,%r14,%r14 vpsrlq $1,%xmm8,%xmm10 xorq %r10,%r13 xorq %rax,%r12 vpaddq %xmm11,%xmm3,%xmm3 shrdq $4,%r13,%r13 xorq %rcx,%r14 vpsrlq $7,%xmm8,%xmm11 andq %r10,%r12 xorq %r10,%r13 vpsllq $56,%xmm8,%xmm9 addq 48(%rsp),%rbx movq %rcx,%r15 vpxor %xmm10,%xmm11,%xmm8 xorq %rax,%r12 shrdq $6,%r14,%r14 vpsrlq $7,%xmm10,%xmm10 xorq %rdx,%r15 addq %r12,%rbx vpxor %xmm9,%xmm8,%xmm8 shrdq $14,%r13,%r13 andq %r15,%rdi vpsllq $7,%xmm9,%xmm9 xorq %rcx,%r14 addq %r13,%rbx vpxor %xmm10,%xmm8,%xmm8 xorq %rdx,%rdi shrdq $28,%r14,%r14 vpsrlq $6,%xmm2,%xmm11 addq %rbx,%r9 addq %rdi,%rbx vpxor %xmm9,%xmm8,%xmm8 movq %r9,%r13 addq %rbx,%r14 vpsllq $3,%xmm2,%xmm10 shrdq $23,%r13,%r13 movq %r14,%rbx vpaddq %xmm8,%xmm3,%xmm3 movq %r10,%r12 shrdq $5,%r14,%r14 vpsrlq $19,%xmm2,%xmm9 xorq %r9,%r13 xorq %r11,%r12 vpxor %xmm10,%xmm11,%xmm11 shrdq $4,%r13,%r13 xorq %rbx,%r14 vpsllq $42,%xmm10,%xmm10 andq %r9,%r12 xorq %r9,%r13 vpxor %xmm9,%xmm11,%xmm11 addq 56(%rsp),%rax movq %rbx,%rdi vpsrlq $42,%xmm9,%xmm9 xorq %r11,%r12 shrdq $6,%r14,%r14 vpxor %xmm10,%xmm11,%xmm11 xorq %rcx,%rdi addq %r12,%rax vpxor %xmm9,%xmm11,%xmm11 shrdq $14,%r13,%r13 andq %rdi,%r15 vpaddq %xmm11,%xmm3,%xmm3 xorq %rbx,%r14 addq %r13,%rax vpaddq -32(%rbp),%xmm3,%xmm10 xorq %rcx,%r15 shrdq $28,%r14,%r14 addq %rax,%r8 addq %r15,%rax movq %r8,%r13 addq %rax,%r14 vmovdqa %xmm10,48(%rsp) vpalignr $8,%xmm4,%xmm5,%xmm8 shrdq $23,%r13,%r13 movq %r14,%rax vpalignr $8,%xmm0,%xmm1,%xmm11 movq %r9,%r12 shrdq $5,%r14,%r14 vpsrlq $1,%xmm8,%xmm10 xorq %r8,%r13 xorq %r10,%r12 vpaddq %xmm11,%xmm4,%xmm4 shrdq $4,%r13,%r13 xorq %rax,%r14 vpsrlq $7,%xmm8,%xmm11 andq %r8,%r12 xorq %r8,%r13 vpsllq $56,%xmm8,%xmm9 addq 64(%rsp),%r11 movq %rax,%r15 vpxor 
%xmm10,%xmm11,%xmm8 xorq %r10,%r12 shrdq $6,%r14,%r14 vpsrlq $7,%xmm10,%xmm10 xorq %rbx,%r15 addq %r12,%r11 vpxor %xmm9,%xmm8,%xmm8 shrdq $14,%r13,%r13 andq %r15,%rdi vpsllq $7,%xmm9,%xmm9 xorq %rax,%r14 addq %r13,%r11 vpxor %xmm10,%xmm8,%xmm8 xorq %rbx,%rdi shrdq $28,%r14,%r14 vpsrlq $6,%xmm3,%xmm11 addq %r11,%rdx addq %rdi,%r11 vpxor %xmm9,%xmm8,%xmm8 movq %rdx,%r13 addq %r11,%r14 vpsllq $3,%xmm3,%xmm10 shrdq $23,%r13,%r13 movq %r14,%r11 vpaddq %xmm8,%xmm4,%xmm4 movq %r8,%r12 shrdq $5,%r14,%r14 vpsrlq $19,%xmm3,%xmm9 xorq %rdx,%r13 xorq %r9,%r12 vpxor %xmm10,%xmm11,%xmm11 shrdq $4,%r13,%r13 xorq %r11,%r14 vpsllq $42,%xmm10,%xmm10 andq %rdx,%r12 xorq %rdx,%r13 vpxor %xmm9,%xmm11,%xmm11 addq 72(%rsp),%r10 movq %r11,%rdi vpsrlq $42,%xmm9,%xmm9 xorq %r9,%r12 shrdq $6,%r14,%r14 vpxor %xmm10,%xmm11,%xmm11 xorq %rax,%rdi addq %r12,%r10 vpxor %xmm9,%xmm11,%xmm11 shrdq $14,%r13,%r13 andq %rdi,%r15 vpaddq %xmm11,%xmm4,%xmm4 xorq %r11,%r14 addq %r13,%r10 vpaddq 0(%rbp),%xmm4,%xmm10 xorq %rax,%r15 shrdq $28,%r14,%r14 addq %r10,%rcx addq %r15,%r10 movq %rcx,%r13 addq %r10,%r14 vmovdqa %xmm10,64(%rsp) vpalignr $8,%xmm5,%xmm6,%xmm8 shrdq $23,%r13,%r13 movq %r14,%r10 vpalignr $8,%xmm1,%xmm2,%xmm11 movq %rdx,%r12 shrdq $5,%r14,%r14 vpsrlq $1,%xmm8,%xmm10 xorq %rcx,%r13 xorq %r8,%r12 vpaddq %xmm11,%xmm5,%xmm5 shrdq $4,%r13,%r13 xorq %r10,%r14 vpsrlq $7,%xmm8,%xmm11 andq %rcx,%r12 xorq %rcx,%r13 vpsllq $56,%xmm8,%xmm9 addq 80(%rsp),%r9 movq %r10,%r15 vpxor %xmm10,%xmm11,%xmm8 xorq %r8,%r12 shrdq $6,%r14,%r14 vpsrlq $7,%xmm10,%xmm10 xorq %r11,%r15 addq %r12,%r9 vpxor %xmm9,%xmm8,%xmm8 shrdq $14,%r13,%r13 andq %r15,%rdi vpsllq $7,%xmm9,%xmm9 xorq %r10,%r14 addq %r13,%r9 vpxor %xmm10,%xmm8,%xmm8 xorq %r11,%rdi shrdq $28,%r14,%r14 vpsrlq $6,%xmm4,%xmm11 addq %r9,%rbx addq %rdi,%r9 vpxor %xmm9,%xmm8,%xmm8 movq %rbx,%r13 addq %r9,%r14 vpsllq $3,%xmm4,%xmm10 shrdq $23,%r13,%r13 movq %r14,%r9 vpaddq %xmm8,%xmm5,%xmm5 movq %rcx,%r12 shrdq $5,%r14,%r14 vpsrlq $19,%xmm4,%xmm9 xorq %rbx,%r13 xorq %rdx,%r12 vpxor %xmm10,%xmm11,%xmm11 shrdq $4,%r13,%r13 xorq %r9,%r14 vpsllq $42,%xmm10,%xmm10 andq %rbx,%r12 xorq %rbx,%r13 vpxor %xmm9,%xmm11,%xmm11 addq 88(%rsp),%r8 movq %r9,%rdi vpsrlq $42,%xmm9,%xmm9 xorq %rdx,%r12 shrdq $6,%r14,%r14 vpxor %xmm10,%xmm11,%xmm11 xorq %r10,%rdi addq %r12,%r8 vpxor %xmm9,%xmm11,%xmm11 shrdq $14,%r13,%r13 andq %rdi,%r15 vpaddq %xmm11,%xmm5,%xmm5 xorq %r9,%r14 addq %r13,%r8 vpaddq 32(%rbp),%xmm5,%xmm10 xorq %r10,%r15 shrdq $28,%r14,%r14 addq %r8,%rax addq %r15,%r8 movq %rax,%r13 addq %r8,%r14 vmovdqa %xmm10,80(%rsp) vpalignr $8,%xmm6,%xmm7,%xmm8 shrdq $23,%r13,%r13 movq %r14,%r8 vpalignr $8,%xmm2,%xmm3,%xmm11 movq %rbx,%r12 shrdq $5,%r14,%r14 vpsrlq $1,%xmm8,%xmm10 xorq %rax,%r13 xorq %rcx,%r12 vpaddq %xmm11,%xmm6,%xmm6 shrdq $4,%r13,%r13 xorq %r8,%r14 vpsrlq $7,%xmm8,%xmm11 andq %rax,%r12 xorq %rax,%r13 vpsllq $56,%xmm8,%xmm9 addq 96(%rsp),%rdx movq %r8,%r15 vpxor %xmm10,%xmm11,%xmm8 xorq %rcx,%r12 shrdq $6,%r14,%r14 vpsrlq $7,%xmm10,%xmm10 xorq %r9,%r15 addq %r12,%rdx vpxor %xmm9,%xmm8,%xmm8 shrdq $14,%r13,%r13 andq %r15,%rdi vpsllq $7,%xmm9,%xmm9 xorq %r8,%r14 addq %r13,%rdx vpxor %xmm10,%xmm8,%xmm8 xorq %r9,%rdi shrdq $28,%r14,%r14 vpsrlq $6,%xmm5,%xmm11 addq %rdx,%r11 addq %rdi,%rdx vpxor %xmm9,%xmm8,%xmm8 movq %r11,%r13 addq %rdx,%r14 vpsllq $3,%xmm5,%xmm10 shrdq $23,%r13,%r13 movq %r14,%rdx vpaddq %xmm8,%xmm6,%xmm6 movq %rax,%r12 shrdq $5,%r14,%r14 vpsrlq $19,%xmm5,%xmm9 xorq %r11,%r13 xorq %rbx,%r12 vpxor %xmm10,%xmm11,%xmm11 shrdq $4,%r13,%r13 xorq %rdx,%r14 vpsllq $42,%xmm10,%xmm10 
andq %r11,%r12 xorq %r11,%r13 vpxor %xmm9,%xmm11,%xmm11 addq 104(%rsp),%rcx movq %rdx,%rdi vpsrlq $42,%xmm9,%xmm9 xorq %rbx,%r12 shrdq $6,%r14,%r14 vpxor %xmm10,%xmm11,%xmm11 xorq %r8,%rdi addq %r12,%rcx vpxor %xmm9,%xmm11,%xmm11 shrdq $14,%r13,%r13 andq %rdi,%r15 vpaddq %xmm11,%xmm6,%xmm6 xorq %rdx,%r14 addq %r13,%rcx vpaddq 64(%rbp),%xmm6,%xmm10 xorq %r8,%r15 shrdq $28,%r14,%r14 addq %rcx,%r10 addq %r15,%rcx movq %r10,%r13 addq %rcx,%r14 vmovdqa %xmm10,96(%rsp) vpalignr $8,%xmm7,%xmm0,%xmm8 shrdq $23,%r13,%r13 movq %r14,%rcx vpalignr $8,%xmm3,%xmm4,%xmm11 movq %r11,%r12 shrdq $5,%r14,%r14 vpsrlq $1,%xmm8,%xmm10 xorq %r10,%r13 xorq %rax,%r12 vpaddq %xmm11,%xmm7,%xmm7 shrdq $4,%r13,%r13 xorq %rcx,%r14 vpsrlq $7,%xmm8,%xmm11 andq %r10,%r12 xorq %r10,%r13 vpsllq $56,%xmm8,%xmm9 addq 112(%rsp),%rbx movq %rcx,%r15 vpxor %xmm10,%xmm11,%xmm8 xorq %rax,%r12 shrdq $6,%r14,%r14 vpsrlq $7,%xmm10,%xmm10 xorq %rdx,%r15 addq %r12,%rbx vpxor %xmm9,%xmm8,%xmm8 shrdq $14,%r13,%r13 andq %r15,%rdi vpsllq $7,%xmm9,%xmm9 xorq %rcx,%r14 addq %r13,%rbx vpxor %xmm10,%xmm8,%xmm8 xorq %rdx,%rdi shrdq $28,%r14,%r14 vpsrlq $6,%xmm6,%xmm11 addq %rbx,%r9 addq %rdi,%rbx vpxor %xmm9,%xmm8,%xmm8 movq %r9,%r13 addq %rbx,%r14 vpsllq $3,%xmm6,%xmm10 shrdq $23,%r13,%r13 movq %r14,%rbx vpaddq %xmm8,%xmm7,%xmm7 movq %r10,%r12 shrdq $5,%r14,%r14 vpsrlq $19,%xmm6,%xmm9 xorq %r9,%r13 xorq %r11,%r12 vpxor %xmm10,%xmm11,%xmm11 shrdq $4,%r13,%r13 xorq %rbx,%r14 vpsllq $42,%xmm10,%xmm10 andq %r9,%r12 xorq %r9,%r13 vpxor %xmm9,%xmm11,%xmm11 addq 120(%rsp),%rax movq %rbx,%rdi vpsrlq $42,%xmm9,%xmm9 xorq %r11,%r12 shrdq $6,%r14,%r14 vpxor %xmm10,%xmm11,%xmm11 xorq %rcx,%rdi addq %r12,%rax vpxor %xmm9,%xmm11,%xmm11 shrdq $14,%r13,%r13 andq %rdi,%r15 vpaddq %xmm11,%xmm7,%xmm7 xorq %rbx,%r14 addq %r13,%rax vpaddq 96(%rbp),%xmm7,%xmm10 xorq %rcx,%r15 shrdq $28,%r14,%r14 addq %rax,%r8 addq %r15,%rax movq %r8,%r13 addq %rax,%r14 vmovdqa %xmm10,112(%rsp) cmpb $0,135(%rbp) jne L$avx_00_47 shrdq $23,%r13,%r13 movq %r14,%rax movq %r9,%r12 shrdq $5,%r14,%r14 xorq %r8,%r13 xorq %r10,%r12 shrdq $4,%r13,%r13 xorq %rax,%r14 andq %r8,%r12 xorq %r8,%r13 addq 0(%rsp),%r11 movq %rax,%r15 xorq %r10,%r12 shrdq $6,%r14,%r14 xorq %rbx,%r15 addq %r12,%r11 shrdq $14,%r13,%r13 andq %r15,%rdi xorq %rax,%r14 addq %r13,%r11 xorq %rbx,%rdi shrdq $28,%r14,%r14 addq %r11,%rdx addq %rdi,%r11 movq %rdx,%r13 addq %r11,%r14 shrdq $23,%r13,%r13 movq %r14,%r11 movq %r8,%r12 shrdq $5,%r14,%r14 xorq %rdx,%r13 xorq %r9,%r12 shrdq $4,%r13,%r13 xorq %r11,%r14 andq %rdx,%r12 xorq %rdx,%r13 addq 8(%rsp),%r10 movq %r11,%rdi xorq %r9,%r12 shrdq $6,%r14,%r14 xorq %rax,%rdi addq %r12,%r10 shrdq $14,%r13,%r13 andq %rdi,%r15 xorq %r11,%r14 addq %r13,%r10 xorq %rax,%r15 shrdq $28,%r14,%r14 addq %r10,%rcx addq %r15,%r10 movq %rcx,%r13 addq %r10,%r14 shrdq $23,%r13,%r13 movq %r14,%r10 movq %rdx,%r12 shrdq $5,%r14,%r14 xorq %rcx,%r13 xorq %r8,%r12 shrdq $4,%r13,%r13 xorq %r10,%r14 andq %rcx,%r12 xorq %rcx,%r13 addq 16(%rsp),%r9 movq %r10,%r15 xorq %r8,%r12 shrdq $6,%r14,%r14 xorq %r11,%r15 addq %r12,%r9 shrdq $14,%r13,%r13 andq %r15,%rdi xorq %r10,%r14 addq %r13,%r9 xorq %r11,%rdi shrdq $28,%r14,%r14 addq %r9,%rbx addq %rdi,%r9 movq %rbx,%r13 addq %r9,%r14 shrdq $23,%r13,%r13 movq %r14,%r9 movq %rcx,%r12 shrdq $5,%r14,%r14 xorq %rbx,%r13 xorq %rdx,%r12 shrdq $4,%r13,%r13 xorq %r9,%r14 andq %rbx,%r12 xorq %rbx,%r13 addq 24(%rsp),%r8 movq %r9,%rdi xorq %rdx,%r12 shrdq $6,%r14,%r14 xorq %r10,%rdi addq %r12,%r8 shrdq $14,%r13,%r13 andq %rdi,%r15 xorq %r9,%r14 addq %r13,%r8 xorq %r10,%r15 shrdq 
$28,%r14,%r14 addq %r8,%rax addq %r15,%r8 movq %rax,%r13 addq %r8,%r14 shrdq $23,%r13,%r13 movq %r14,%r8 movq %rbx,%r12 shrdq $5,%r14,%r14 xorq %rax,%r13 xorq %rcx,%r12 shrdq $4,%r13,%r13 xorq %r8,%r14 andq %rax,%r12 xorq %rax,%r13 addq 32(%rsp),%rdx movq %r8,%r15 xorq %rcx,%r12 shrdq $6,%r14,%r14 xorq %r9,%r15 addq %r12,%rdx shrdq $14,%r13,%r13 andq %r15,%rdi xorq %r8,%r14 addq %r13,%rdx xorq %r9,%rdi shrdq $28,%r14,%r14 addq %rdx,%r11 addq %rdi,%rdx movq %r11,%r13 addq %rdx,%r14 shrdq $23,%r13,%r13 movq %r14,%rdx movq %rax,%r12 shrdq $5,%r14,%r14 xorq %r11,%r13 xorq %rbx,%r12 shrdq $4,%r13,%r13 xorq %rdx,%r14 andq %r11,%r12 xorq %r11,%r13 addq 40(%rsp),%rcx movq %rdx,%rdi xorq %rbx,%r12 shrdq $6,%r14,%r14 xorq %r8,%rdi addq %r12,%rcx shrdq $14,%r13,%r13 andq %rdi,%r15 xorq %rdx,%r14 addq %r13,%rcx xorq %r8,%r15 shrdq $28,%r14,%r14 addq %rcx,%r10 addq %r15,%rcx movq %r10,%r13 addq %rcx,%r14 shrdq $23,%r13,%r13 movq %r14,%rcx movq %r11,%r12 shrdq $5,%r14,%r14 xorq %r10,%r13 xorq %rax,%r12 shrdq $4,%r13,%r13 xorq %rcx,%r14 andq %r10,%r12 xorq %r10,%r13 addq 48(%rsp),%rbx movq %rcx,%r15 xorq %rax,%r12 shrdq $6,%r14,%r14 xorq %rdx,%r15 addq %r12,%rbx shrdq $14,%r13,%r13 andq %r15,%rdi xorq %rcx,%r14 addq %r13,%rbx xorq %rdx,%rdi shrdq $28,%r14,%r14 addq %rbx,%r9 addq %rdi,%rbx movq %r9,%r13 addq %rbx,%r14 shrdq $23,%r13,%r13 movq %r14,%rbx movq %r10,%r12 shrdq $5,%r14,%r14 xorq %r9,%r13 xorq %r11,%r12 shrdq $4,%r13,%r13 xorq %rbx,%r14 andq %r9,%r12 xorq %r9,%r13 addq 56(%rsp),%rax movq %rbx,%rdi xorq %r11,%r12 shrdq $6,%r14,%r14 xorq %rcx,%rdi addq %r12,%rax shrdq $14,%r13,%r13 andq %rdi,%r15 xorq %rbx,%r14 addq %r13,%rax xorq %rcx,%r15 shrdq $28,%r14,%r14 addq %rax,%r8 addq %r15,%rax movq %r8,%r13 addq %rax,%r14 shrdq $23,%r13,%r13 movq %r14,%rax movq %r9,%r12 shrdq $5,%r14,%r14 xorq %r8,%r13 xorq %r10,%r12 shrdq $4,%r13,%r13 xorq %rax,%r14 andq %r8,%r12 xorq %r8,%r13 addq 64(%rsp),%r11 movq %rax,%r15 xorq %r10,%r12 shrdq $6,%r14,%r14 xorq %rbx,%r15 addq %r12,%r11 shrdq $14,%r13,%r13 andq %r15,%rdi xorq %rax,%r14 addq %r13,%r11 xorq %rbx,%rdi shrdq $28,%r14,%r14 addq %r11,%rdx addq %rdi,%r11 movq %rdx,%r13 addq %r11,%r14 shrdq $23,%r13,%r13 movq %r14,%r11 movq %r8,%r12 shrdq $5,%r14,%r14 xorq %rdx,%r13 xorq %r9,%r12 shrdq $4,%r13,%r13 xorq %r11,%r14 andq %rdx,%r12 xorq %rdx,%r13 addq 72(%rsp),%r10 movq %r11,%rdi xorq %r9,%r12 shrdq $6,%r14,%r14 xorq %rax,%rdi addq %r12,%r10 shrdq $14,%r13,%r13 andq %rdi,%r15 xorq %r11,%r14 addq %r13,%r10 xorq %rax,%r15 shrdq $28,%r14,%r14 addq %r10,%rcx addq %r15,%r10 movq %rcx,%r13 addq %r10,%r14 shrdq $23,%r13,%r13 movq %r14,%r10 movq %rdx,%r12 shrdq $5,%r14,%r14 xorq %rcx,%r13 xorq %r8,%r12 shrdq $4,%r13,%r13 xorq %r10,%r14 andq %rcx,%r12 xorq %rcx,%r13 addq 80(%rsp),%r9 movq %r10,%r15 xorq %r8,%r12 shrdq $6,%r14,%r14 xorq %r11,%r15 addq %r12,%r9 shrdq $14,%r13,%r13 andq %r15,%rdi xorq %r10,%r14 addq %r13,%r9 xorq %r11,%rdi shrdq $28,%r14,%r14 addq %r9,%rbx addq %rdi,%r9 movq %rbx,%r13 addq %r9,%r14 shrdq $23,%r13,%r13 movq %r14,%r9 movq %rcx,%r12 shrdq $5,%r14,%r14 xorq %rbx,%r13 xorq %rdx,%r12 shrdq $4,%r13,%r13 xorq %r9,%r14 andq %rbx,%r12 xorq %rbx,%r13 addq 88(%rsp),%r8 movq %r9,%rdi xorq %rdx,%r12 shrdq $6,%r14,%r14 xorq %r10,%rdi addq %r12,%r8 shrdq $14,%r13,%r13 andq %rdi,%r15 xorq %r9,%r14 addq %r13,%r8 xorq %r10,%r15 shrdq $28,%r14,%r14 addq %r8,%rax addq %r15,%r8 movq %rax,%r13 addq %r8,%r14 shrdq $23,%r13,%r13 movq %r14,%r8 movq %rbx,%r12 shrdq $5,%r14,%r14 xorq %rax,%r13 xorq %rcx,%r12 shrdq $4,%r13,%r13 xorq %r8,%r14 andq %rax,%r12 xorq 
%rax,%r13 addq 96(%rsp),%rdx movq %r8,%r15 xorq %rcx,%r12 shrdq $6,%r14,%r14 xorq %r9,%r15 addq %r12,%rdx shrdq $14,%r13,%r13 andq %r15,%rdi xorq %r8,%r14 addq %r13,%rdx xorq %r9,%rdi shrdq $28,%r14,%r14 addq %rdx,%r11 addq %rdi,%rdx movq %r11,%r13 addq %rdx,%r14 shrdq $23,%r13,%r13 movq %r14,%rdx movq %rax,%r12 shrdq $5,%r14,%r14 xorq %r11,%r13 xorq %rbx,%r12 shrdq $4,%r13,%r13 xorq %rdx,%r14 andq %r11,%r12 xorq %r11,%r13 addq 104(%rsp),%rcx movq %rdx,%rdi xorq %rbx,%r12 shrdq $6,%r14,%r14 xorq %r8,%rdi addq %r12,%rcx shrdq $14,%r13,%r13 andq %rdi,%r15 xorq %rdx,%r14 addq %r13,%rcx xorq %r8,%r15 shrdq $28,%r14,%r14 addq %rcx,%r10 addq %r15,%rcx movq %r10,%r13 addq %rcx,%r14 shrdq $23,%r13,%r13 movq %r14,%rcx movq %r11,%r12 shrdq $5,%r14,%r14 xorq %r10,%r13 xorq %rax,%r12 shrdq $4,%r13,%r13 xorq %rcx,%r14 andq %r10,%r12 xorq %r10,%r13 addq 112(%rsp),%rbx movq %rcx,%r15 xorq %rax,%r12 shrdq $6,%r14,%r14 xorq %rdx,%r15 addq %r12,%rbx shrdq $14,%r13,%r13 andq %r15,%rdi xorq %rcx,%r14 addq %r13,%rbx xorq %rdx,%rdi shrdq $28,%r14,%r14 addq %rbx,%r9 addq %rdi,%rbx movq %r9,%r13 addq %rbx,%r14 shrdq $23,%r13,%r13 movq %r14,%rbx movq %r10,%r12 shrdq $5,%r14,%r14 xorq %r9,%r13 xorq %r11,%r12 shrdq $4,%r13,%r13 xorq %rbx,%r14 andq %r9,%r12 xorq %r9,%r13 addq 120(%rsp),%rax movq %rbx,%rdi xorq %r11,%r12 shrdq $6,%r14,%r14 xorq %rcx,%rdi addq %r12,%rax shrdq $14,%r13,%r13 andq %rdi,%r15 xorq %rbx,%r14 addq %r13,%rax xorq %rcx,%r15 shrdq $28,%r14,%r14 addq %rax,%r8 addq %r15,%rax movq %r8,%r13 addq %rax,%r14 movq 128+0(%rsp),%rdi movq %r14,%rax addq 0(%rdi),%rax leaq 128(%rsi),%rsi addq 8(%rdi),%rbx addq 16(%rdi),%rcx addq 24(%rdi),%rdx addq 32(%rdi),%r8 addq 40(%rdi),%r9 addq 48(%rdi),%r10 addq 56(%rdi),%r11 cmpq 128+16(%rsp),%rsi movq %rax,0(%rdi) movq %rbx,8(%rdi) movq %rcx,16(%rdi) movq %rdx,24(%rdi) movq %r8,32(%rdi) movq %r9,40(%rdi) movq %r10,48(%rdi) movq %r11,56(%rdi) jb L$loop_avx movq 152(%rsp),%rsi vzeroupper movq -48(%rsi),%r15 movq -40(%rsi),%r14 movq -32(%rsi),%r13 movq -24(%rsi),%r12 movq -16(%rsi),%rbp movq -8(%rsi),%rbx leaq (%rsi),%rsp L$epilogue_avx: ret #endif ring-0.17.14/pregenerated/sha512-x86_64-nasm.asm000064400000000000000000001303431046102023000170030ustar 00000000000000; This file is generated from a similarly-named Perl script in the BoringSSL ; source tree. Do not edit by hand. 
%ifidn __OUTPUT_FORMAT__, win64 default rel %define XMMWORD %define YMMWORD %define ZMMWORD %define _CET_ENDBR %include "ring_core_generated/prefix_symbols_nasm.inc" section .text code align=64 global sha512_block_data_order_nohw ALIGN 16 sha512_block_data_order_nohw: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_sha512_block_data_order_nohw: mov rdi,rcx mov rsi,rdx mov rdx,r8 _CET_ENDBR mov rax,rsp push rbx push rbp push r12 push r13 push r14 push r15 shl rdx,4 sub rsp,16*8+4*8 lea rdx,[rdx*8+rsi] and rsp,-64 mov QWORD[((128+0))+rsp],rdi mov QWORD[((128+8))+rsp],rsi mov QWORD[((128+16))+rsp],rdx mov QWORD[152+rsp],rax $L$prologue: mov rax,QWORD[rdi] mov rbx,QWORD[8+rdi] mov rcx,QWORD[16+rdi] mov rdx,QWORD[24+rdi] mov r8,QWORD[32+rdi] mov r9,QWORD[40+rdi] mov r10,QWORD[48+rdi] mov r11,QWORD[56+rdi] jmp NEAR $L$loop ALIGN 16 $L$loop: mov rdi,rbx lea rbp,[K512] xor rdi,rcx mov r12,QWORD[rsi] mov r13,r8 mov r14,rax bswap r12 ror r13,23 mov r15,r9 xor r13,r8 ror r14,5 xor r15,r10 mov QWORD[rsp],r12 xor r14,rax and r15,r8 ror r13,4 add r12,r11 xor r15,r10 ror r14,6 xor r13,r8 add r12,r15 mov r15,rax add r12,QWORD[rbp] xor r14,rax xor r15,rbx ror r13,14 mov r11,rbx and rdi,r15 ror r14,28 add r12,r13 xor r11,rdi add rdx,r12 add r11,r12 lea rbp,[8+rbp] add r11,r14 mov r12,QWORD[8+rsi] mov r13,rdx mov r14,r11 bswap r12 ror r13,23 mov rdi,r8 xor r13,rdx ror r14,5 xor rdi,r9 mov QWORD[8+rsp],r12 xor r14,r11 and rdi,rdx ror r13,4 add r12,r10 xor rdi,r9 ror r14,6 xor r13,rdx add r12,rdi mov rdi,r11 add r12,QWORD[rbp] xor r14,r11 xor rdi,rax ror r13,14 mov r10,rax and r15,rdi ror r14,28 add r12,r13 xor r10,r15 add rcx,r12 add r10,r12 lea rbp,[24+rbp] add r10,r14 mov r12,QWORD[16+rsi] mov r13,rcx mov r14,r10 bswap r12 ror r13,23 mov r15,rdx xor r13,rcx ror r14,5 xor r15,r8 mov QWORD[16+rsp],r12 xor r14,r10 and r15,rcx ror r13,4 add r12,r9 xor r15,r8 ror r14,6 xor r13,rcx add r12,r15 mov r15,r10 add r12,QWORD[rbp] xor r14,r10 xor r15,r11 ror r13,14 mov r9,r11 and rdi,r15 ror r14,28 add r12,r13 xor r9,rdi add rbx,r12 add r9,r12 lea rbp,[8+rbp] add r9,r14 mov r12,QWORD[24+rsi] mov r13,rbx mov r14,r9 bswap r12 ror r13,23 mov rdi,rcx xor r13,rbx ror r14,5 xor rdi,rdx mov QWORD[24+rsp],r12 xor r14,r9 and rdi,rbx ror r13,4 add r12,r8 xor rdi,rdx ror r14,6 xor r13,rbx add r12,rdi mov rdi,r9 add r12,QWORD[rbp] xor r14,r9 xor rdi,r10 ror r13,14 mov r8,r10 and r15,rdi ror r14,28 add r12,r13 xor r8,r15 add rax,r12 add r8,r12 lea rbp,[24+rbp] add r8,r14 mov r12,QWORD[32+rsi] mov r13,rax mov r14,r8 bswap r12 ror r13,23 mov r15,rbx xor r13,rax ror r14,5 xor r15,rcx mov QWORD[32+rsp],r12 xor r14,r8 and r15,rax ror r13,4 add r12,rdx xor r15,rcx ror r14,6 xor r13,rax add r12,r15 mov r15,r8 add r12,QWORD[rbp] xor r14,r8 xor r15,r9 ror r13,14 mov rdx,r9 and rdi,r15 ror r14,28 add r12,r13 xor rdx,rdi add r11,r12 add rdx,r12 lea rbp,[8+rbp] add rdx,r14 mov r12,QWORD[40+rsi] mov r13,r11 mov r14,rdx bswap r12 ror r13,23 mov rdi,rax xor r13,r11 ror r14,5 xor rdi,rbx mov QWORD[40+rsp],r12 xor r14,rdx and rdi,r11 ror r13,4 add r12,rcx xor rdi,rbx ror r14,6 xor r13,r11 add r12,rdi mov rdi,rdx add r12,QWORD[rbp] xor r14,rdx xor rdi,r8 ror r13,14 mov rcx,r8 and r15,rdi ror r14,28 add r12,r13 xor rcx,r15 add r10,r12 add rcx,r12 lea rbp,[24+rbp] add rcx,r14 mov r12,QWORD[48+rsi] mov r13,r10 mov r14,rcx bswap r12 ror r13,23 mov r15,r11 xor r13,r10 ror r14,5 xor r15,rax mov QWORD[48+rsp],r12 xor r14,rcx and r15,r10 ror r13,4 add r12,rbx xor r15,rax ror r14,6 xor r13,r10 add r12,r15 mov r15,rcx add 
r12,QWORD[rbp] xor r14,rcx xor r15,rdx ror r13,14 mov rbx,rdx and rdi,r15 ror r14,28 add r12,r13 xor rbx,rdi add r9,r12 add rbx,r12 lea rbp,[8+rbp] add rbx,r14 mov r12,QWORD[56+rsi] mov r13,r9 mov r14,rbx bswap r12 ror r13,23 mov rdi,r10 xor r13,r9 ror r14,5 xor rdi,r11 mov QWORD[56+rsp],r12 xor r14,rbx and rdi,r9 ror r13,4 add r12,rax xor rdi,r11 ror r14,6 xor r13,r9 add r12,rdi mov rdi,rbx add r12,QWORD[rbp] xor r14,rbx xor rdi,rcx ror r13,14 mov rax,rcx and r15,rdi ror r14,28 add r12,r13 xor rax,r15 add r8,r12 add rax,r12 lea rbp,[24+rbp] add rax,r14 mov r12,QWORD[64+rsi] mov r13,r8 mov r14,rax bswap r12 ror r13,23 mov r15,r9 xor r13,r8 ror r14,5 xor r15,r10 mov QWORD[64+rsp],r12 xor r14,rax and r15,r8 ror r13,4 add r12,r11 xor r15,r10 ror r14,6 xor r13,r8 add r12,r15 mov r15,rax add r12,QWORD[rbp] xor r14,rax xor r15,rbx ror r13,14 mov r11,rbx and rdi,r15 ror r14,28 add r12,r13 xor r11,rdi add rdx,r12 add r11,r12 lea rbp,[8+rbp] add r11,r14 mov r12,QWORD[72+rsi] mov r13,rdx mov r14,r11 bswap r12 ror r13,23 mov rdi,r8 xor r13,rdx ror r14,5 xor rdi,r9 mov QWORD[72+rsp],r12 xor r14,r11 and rdi,rdx ror r13,4 add r12,r10 xor rdi,r9 ror r14,6 xor r13,rdx add r12,rdi mov rdi,r11 add r12,QWORD[rbp] xor r14,r11 xor rdi,rax ror r13,14 mov r10,rax and r15,rdi ror r14,28 add r12,r13 xor r10,r15 add rcx,r12 add r10,r12 lea rbp,[24+rbp] add r10,r14 mov r12,QWORD[80+rsi] mov r13,rcx mov r14,r10 bswap r12 ror r13,23 mov r15,rdx xor r13,rcx ror r14,5 xor r15,r8 mov QWORD[80+rsp],r12 xor r14,r10 and r15,rcx ror r13,4 add r12,r9 xor r15,r8 ror r14,6 xor r13,rcx add r12,r15 mov r15,r10 add r12,QWORD[rbp] xor r14,r10 xor r15,r11 ror r13,14 mov r9,r11 and rdi,r15 ror r14,28 add r12,r13 xor r9,rdi add rbx,r12 add r9,r12 lea rbp,[8+rbp] add r9,r14 mov r12,QWORD[88+rsi] mov r13,rbx mov r14,r9 bswap r12 ror r13,23 mov rdi,rcx xor r13,rbx ror r14,5 xor rdi,rdx mov QWORD[88+rsp],r12 xor r14,r9 and rdi,rbx ror r13,4 add r12,r8 xor rdi,rdx ror r14,6 xor r13,rbx add r12,rdi mov rdi,r9 add r12,QWORD[rbp] xor r14,r9 xor rdi,r10 ror r13,14 mov r8,r10 and r15,rdi ror r14,28 add r12,r13 xor r8,r15 add rax,r12 add r8,r12 lea rbp,[24+rbp] add r8,r14 mov r12,QWORD[96+rsi] mov r13,rax mov r14,r8 bswap r12 ror r13,23 mov r15,rbx xor r13,rax ror r14,5 xor r15,rcx mov QWORD[96+rsp],r12 xor r14,r8 and r15,rax ror r13,4 add r12,rdx xor r15,rcx ror r14,6 xor r13,rax add r12,r15 mov r15,r8 add r12,QWORD[rbp] xor r14,r8 xor r15,r9 ror r13,14 mov rdx,r9 and rdi,r15 ror r14,28 add r12,r13 xor rdx,rdi add r11,r12 add rdx,r12 lea rbp,[8+rbp] add rdx,r14 mov r12,QWORD[104+rsi] mov r13,r11 mov r14,rdx bswap r12 ror r13,23 mov rdi,rax xor r13,r11 ror r14,5 xor rdi,rbx mov QWORD[104+rsp],r12 xor r14,rdx and rdi,r11 ror r13,4 add r12,rcx xor rdi,rbx ror r14,6 xor r13,r11 add r12,rdi mov rdi,rdx add r12,QWORD[rbp] xor r14,rdx xor rdi,r8 ror r13,14 mov rcx,r8 and r15,rdi ror r14,28 add r12,r13 xor rcx,r15 add r10,r12 add rcx,r12 lea rbp,[24+rbp] add rcx,r14 mov r12,QWORD[112+rsi] mov r13,r10 mov r14,rcx bswap r12 ror r13,23 mov r15,r11 xor r13,r10 ror r14,5 xor r15,rax mov QWORD[112+rsp],r12 xor r14,rcx and r15,r10 ror r13,4 add r12,rbx xor r15,rax ror r14,6 xor r13,r10 add r12,r15 mov r15,rcx add r12,QWORD[rbp] xor r14,rcx xor r15,rdx ror r13,14 mov rbx,rdx and rdi,r15 ror r14,28 add r12,r13 xor rbx,rdi add r9,r12 add rbx,r12 lea rbp,[8+rbp] add rbx,r14 mov r12,QWORD[120+rsi] mov r13,r9 mov r14,rbx bswap r12 ror r13,23 mov rdi,r10 xor r13,r9 ror r14,5 xor rdi,r11 mov QWORD[120+rsp],r12 xor r14,rbx and rdi,r9 ror r13,4 add r12,rax xor rdi,r11 
ror r14,6 xor r13,r9 add r12,rdi mov rdi,rbx add r12,QWORD[rbp] xor r14,rbx xor rdi,rcx ror r13,14 mov rax,rcx and r15,rdi ror r14,28 add r12,r13 xor rax,r15 add r8,r12 add rax,r12 lea rbp,[24+rbp] jmp NEAR $L$rounds_16_xx ALIGN 16 $L$rounds_16_xx: mov r13,QWORD[8+rsp] mov r15,QWORD[112+rsp] mov r12,r13 ror r13,7 add rax,r14 mov r14,r15 ror r15,42 xor r13,r12 shr r12,7 ror r13,1 xor r15,r14 shr r14,6 ror r15,19 xor r12,r13 xor r15,r14 add r12,QWORD[72+rsp] add r12,QWORD[rsp] mov r13,r8 add r12,r15 mov r14,rax ror r13,23 mov r15,r9 xor r13,r8 ror r14,5 xor r15,r10 mov QWORD[rsp],r12 xor r14,rax and r15,r8 ror r13,4 add r12,r11 xor r15,r10 ror r14,6 xor r13,r8 add r12,r15 mov r15,rax add r12,QWORD[rbp] xor r14,rax xor r15,rbx ror r13,14 mov r11,rbx and rdi,r15 ror r14,28 add r12,r13 xor r11,rdi add rdx,r12 add r11,r12 lea rbp,[8+rbp] mov r13,QWORD[16+rsp] mov rdi,QWORD[120+rsp] mov r12,r13 ror r13,7 add r11,r14 mov r14,rdi ror rdi,42 xor r13,r12 shr r12,7 ror r13,1 xor rdi,r14 shr r14,6 ror rdi,19 xor r12,r13 xor rdi,r14 add r12,QWORD[80+rsp] add r12,QWORD[8+rsp] mov r13,rdx add r12,rdi mov r14,r11 ror r13,23 mov rdi,r8 xor r13,rdx ror r14,5 xor rdi,r9 mov QWORD[8+rsp],r12 xor r14,r11 and rdi,rdx ror r13,4 add r12,r10 xor rdi,r9 ror r14,6 xor r13,rdx add r12,rdi mov rdi,r11 add r12,QWORD[rbp] xor r14,r11 xor rdi,rax ror r13,14 mov r10,rax and r15,rdi ror r14,28 add r12,r13 xor r10,r15 add rcx,r12 add r10,r12 lea rbp,[24+rbp] mov r13,QWORD[24+rsp] mov r15,QWORD[rsp] mov r12,r13 ror r13,7 add r10,r14 mov r14,r15 ror r15,42 xor r13,r12 shr r12,7 ror r13,1 xor r15,r14 shr r14,6 ror r15,19 xor r12,r13 xor r15,r14 add r12,QWORD[88+rsp] add r12,QWORD[16+rsp] mov r13,rcx add r12,r15 mov r14,r10 ror r13,23 mov r15,rdx xor r13,rcx ror r14,5 xor r15,r8 mov QWORD[16+rsp],r12 xor r14,r10 and r15,rcx ror r13,4 add r12,r9 xor r15,r8 ror r14,6 xor r13,rcx add r12,r15 mov r15,r10 add r12,QWORD[rbp] xor r14,r10 xor r15,r11 ror r13,14 mov r9,r11 and rdi,r15 ror r14,28 add r12,r13 xor r9,rdi add rbx,r12 add r9,r12 lea rbp,[8+rbp] mov r13,QWORD[32+rsp] mov rdi,QWORD[8+rsp] mov r12,r13 ror r13,7 add r9,r14 mov r14,rdi ror rdi,42 xor r13,r12 shr r12,7 ror r13,1 xor rdi,r14 shr r14,6 ror rdi,19 xor r12,r13 xor rdi,r14 add r12,QWORD[96+rsp] add r12,QWORD[24+rsp] mov r13,rbx add r12,rdi mov r14,r9 ror r13,23 mov rdi,rcx xor r13,rbx ror r14,5 xor rdi,rdx mov QWORD[24+rsp],r12 xor r14,r9 and rdi,rbx ror r13,4 add r12,r8 xor rdi,rdx ror r14,6 xor r13,rbx add r12,rdi mov rdi,r9 add r12,QWORD[rbp] xor r14,r9 xor rdi,r10 ror r13,14 mov r8,r10 and r15,rdi ror r14,28 add r12,r13 xor r8,r15 add rax,r12 add r8,r12 lea rbp,[24+rbp] mov r13,QWORD[40+rsp] mov r15,QWORD[16+rsp] mov r12,r13 ror r13,7 add r8,r14 mov r14,r15 ror r15,42 xor r13,r12 shr r12,7 ror r13,1 xor r15,r14 shr r14,6 ror r15,19 xor r12,r13 xor r15,r14 add r12,QWORD[104+rsp] add r12,QWORD[32+rsp] mov r13,rax add r12,r15 mov r14,r8 ror r13,23 mov r15,rbx xor r13,rax ror r14,5 xor r15,rcx mov QWORD[32+rsp],r12 xor r14,r8 and r15,rax ror r13,4 add r12,rdx xor r15,rcx ror r14,6 xor r13,rax add r12,r15 mov r15,r8 add r12,QWORD[rbp] xor r14,r8 xor r15,r9 ror r13,14 mov rdx,r9 and rdi,r15 ror r14,28 add r12,r13 xor rdx,rdi add r11,r12 add rdx,r12 lea rbp,[8+rbp] mov r13,QWORD[48+rsp] mov rdi,QWORD[24+rsp] mov r12,r13 ror r13,7 add rdx,r14 mov r14,rdi ror rdi,42 xor r13,r12 shr r12,7 ror r13,1 xor rdi,r14 shr r14,6 ror rdi,19 xor r12,r13 xor rdi,r14 add r12,QWORD[112+rsp] add r12,QWORD[40+rsp] mov r13,r11 add r12,rdi mov r14,rdx ror r13,23 mov rdi,rax xor r13,r11 ror 
r14,5 xor rdi,rbx mov QWORD[40+rsp],r12 xor r14,rdx and rdi,r11 ror r13,4 add r12,rcx xor rdi,rbx ror r14,6 xor r13,r11 add r12,rdi mov rdi,rdx add r12,QWORD[rbp] xor r14,rdx xor rdi,r8 ror r13,14 mov rcx,r8 and r15,rdi ror r14,28 add r12,r13 xor rcx,r15 add r10,r12 add rcx,r12 lea rbp,[24+rbp] mov r13,QWORD[56+rsp] mov r15,QWORD[32+rsp] mov r12,r13 ror r13,7 add rcx,r14 mov r14,r15 ror r15,42 xor r13,r12 shr r12,7 ror r13,1 xor r15,r14 shr r14,6 ror r15,19 xor r12,r13 xor r15,r14 add r12,QWORD[120+rsp] add r12,QWORD[48+rsp] mov r13,r10 add r12,r15 mov r14,rcx ror r13,23 mov r15,r11 xor r13,r10 ror r14,5 xor r15,rax mov QWORD[48+rsp],r12 xor r14,rcx and r15,r10 ror r13,4 add r12,rbx xor r15,rax ror r14,6 xor r13,r10 add r12,r15 mov r15,rcx add r12,QWORD[rbp] xor r14,rcx xor r15,rdx ror r13,14 mov rbx,rdx and rdi,r15 ror r14,28 add r12,r13 xor rbx,rdi add r9,r12 add rbx,r12 lea rbp,[8+rbp] mov r13,QWORD[64+rsp] mov rdi,QWORD[40+rsp] mov r12,r13 ror r13,7 add rbx,r14 mov r14,rdi ror rdi,42 xor r13,r12 shr r12,7 ror r13,1 xor rdi,r14 shr r14,6 ror rdi,19 xor r12,r13 xor rdi,r14 add r12,QWORD[rsp] add r12,QWORD[56+rsp] mov r13,r9 add r12,rdi mov r14,rbx ror r13,23 mov rdi,r10 xor r13,r9 ror r14,5 xor rdi,r11 mov QWORD[56+rsp],r12 xor r14,rbx and rdi,r9 ror r13,4 add r12,rax xor rdi,r11 ror r14,6 xor r13,r9 add r12,rdi mov rdi,rbx add r12,QWORD[rbp] xor r14,rbx xor rdi,rcx ror r13,14 mov rax,rcx and r15,rdi ror r14,28 add r12,r13 xor rax,r15 add r8,r12 add rax,r12 lea rbp,[24+rbp] mov r13,QWORD[72+rsp] mov r15,QWORD[48+rsp] mov r12,r13 ror r13,7 add rax,r14 mov r14,r15 ror r15,42 xor r13,r12 shr r12,7 ror r13,1 xor r15,r14 shr r14,6 ror r15,19 xor r12,r13 xor r15,r14 add r12,QWORD[8+rsp] add r12,QWORD[64+rsp] mov r13,r8 add r12,r15 mov r14,rax ror r13,23 mov r15,r9 xor r13,r8 ror r14,5 xor r15,r10 mov QWORD[64+rsp],r12 xor r14,rax and r15,r8 ror r13,4 add r12,r11 xor r15,r10 ror r14,6 xor r13,r8 add r12,r15 mov r15,rax add r12,QWORD[rbp] xor r14,rax xor r15,rbx ror r13,14 mov r11,rbx and rdi,r15 ror r14,28 add r12,r13 xor r11,rdi add rdx,r12 add r11,r12 lea rbp,[8+rbp] mov r13,QWORD[80+rsp] mov rdi,QWORD[56+rsp] mov r12,r13 ror r13,7 add r11,r14 mov r14,rdi ror rdi,42 xor r13,r12 shr r12,7 ror r13,1 xor rdi,r14 shr r14,6 ror rdi,19 xor r12,r13 xor rdi,r14 add r12,QWORD[16+rsp] add r12,QWORD[72+rsp] mov r13,rdx add r12,rdi mov r14,r11 ror r13,23 mov rdi,r8 xor r13,rdx ror r14,5 xor rdi,r9 mov QWORD[72+rsp],r12 xor r14,r11 and rdi,rdx ror r13,4 add r12,r10 xor rdi,r9 ror r14,6 xor r13,rdx add r12,rdi mov rdi,r11 add r12,QWORD[rbp] xor r14,r11 xor rdi,rax ror r13,14 mov r10,rax and r15,rdi ror r14,28 add r12,r13 xor r10,r15 add rcx,r12 add r10,r12 lea rbp,[24+rbp] mov r13,QWORD[88+rsp] mov r15,QWORD[64+rsp] mov r12,r13 ror r13,7 add r10,r14 mov r14,r15 ror r15,42 xor r13,r12 shr r12,7 ror r13,1 xor r15,r14 shr r14,6 ror r15,19 xor r12,r13 xor r15,r14 add r12,QWORD[24+rsp] add r12,QWORD[80+rsp] mov r13,rcx add r12,r15 mov r14,r10 ror r13,23 mov r15,rdx xor r13,rcx ror r14,5 xor r15,r8 mov QWORD[80+rsp],r12 xor r14,r10 and r15,rcx ror r13,4 add r12,r9 xor r15,r8 ror r14,6 xor r13,rcx add r12,r15 mov r15,r10 add r12,QWORD[rbp] xor r14,r10 xor r15,r11 ror r13,14 mov r9,r11 and rdi,r15 ror r14,28 add r12,r13 xor r9,rdi add rbx,r12 add r9,r12 lea rbp,[8+rbp] mov r13,QWORD[96+rsp] mov rdi,QWORD[72+rsp] mov r12,r13 ror r13,7 add r9,r14 mov r14,rdi ror rdi,42 xor r13,r12 shr r12,7 ror r13,1 xor rdi,r14 shr r14,6 ror rdi,19 xor r12,r13 xor rdi,r14 add r12,QWORD[32+rsp] add r12,QWORD[88+rsp] mov r13,rbx add 
r12,rdi mov r14,r9 ror r13,23 mov rdi,rcx xor r13,rbx ror r14,5 xor rdi,rdx mov QWORD[88+rsp],r12 xor r14,r9 and rdi,rbx ror r13,4 add r12,r8 xor rdi,rdx ror r14,6 xor r13,rbx add r12,rdi mov rdi,r9 add r12,QWORD[rbp] xor r14,r9 xor rdi,r10 ror r13,14 mov r8,r10 and r15,rdi ror r14,28 add r12,r13 xor r8,r15 add rax,r12 add r8,r12 lea rbp,[24+rbp] mov r13,QWORD[104+rsp] mov r15,QWORD[80+rsp] mov r12,r13 ror r13,7 add r8,r14 mov r14,r15 ror r15,42 xor r13,r12 shr r12,7 ror r13,1 xor r15,r14 shr r14,6 ror r15,19 xor r12,r13 xor r15,r14 add r12,QWORD[40+rsp] add r12,QWORD[96+rsp] mov r13,rax add r12,r15 mov r14,r8 ror r13,23 mov r15,rbx xor r13,rax ror r14,5 xor r15,rcx mov QWORD[96+rsp],r12 xor r14,r8 and r15,rax ror r13,4 add r12,rdx xor r15,rcx ror r14,6 xor r13,rax add r12,r15 mov r15,r8 add r12,QWORD[rbp] xor r14,r8 xor r15,r9 ror r13,14 mov rdx,r9 and rdi,r15 ror r14,28 add r12,r13 xor rdx,rdi add r11,r12 add rdx,r12 lea rbp,[8+rbp] mov r13,QWORD[112+rsp] mov rdi,QWORD[88+rsp] mov r12,r13 ror r13,7 add rdx,r14 mov r14,rdi ror rdi,42 xor r13,r12 shr r12,7 ror r13,1 xor rdi,r14 shr r14,6 ror rdi,19 xor r12,r13 xor rdi,r14 add r12,QWORD[48+rsp] add r12,QWORD[104+rsp] mov r13,r11 add r12,rdi mov r14,rdx ror r13,23 mov rdi,rax xor r13,r11 ror r14,5 xor rdi,rbx mov QWORD[104+rsp],r12 xor r14,rdx and rdi,r11 ror r13,4 add r12,rcx xor rdi,rbx ror r14,6 xor r13,r11 add r12,rdi mov rdi,rdx add r12,QWORD[rbp] xor r14,rdx xor rdi,r8 ror r13,14 mov rcx,r8 and r15,rdi ror r14,28 add r12,r13 xor rcx,r15 add r10,r12 add rcx,r12 lea rbp,[24+rbp] mov r13,QWORD[120+rsp] mov r15,QWORD[96+rsp] mov r12,r13 ror r13,7 add rcx,r14 mov r14,r15 ror r15,42 xor r13,r12 shr r12,7 ror r13,1 xor r15,r14 shr r14,6 ror r15,19 xor r12,r13 xor r15,r14 add r12,QWORD[56+rsp] add r12,QWORD[112+rsp] mov r13,r10 add r12,r15 mov r14,rcx ror r13,23 mov r15,r11 xor r13,r10 ror r14,5 xor r15,rax mov QWORD[112+rsp],r12 xor r14,rcx and r15,r10 ror r13,4 add r12,rbx xor r15,rax ror r14,6 xor r13,r10 add r12,r15 mov r15,rcx add r12,QWORD[rbp] xor r14,rcx xor r15,rdx ror r13,14 mov rbx,rdx and rdi,r15 ror r14,28 add r12,r13 xor rbx,rdi add r9,r12 add rbx,r12 lea rbp,[8+rbp] mov r13,QWORD[rsp] mov rdi,QWORD[104+rsp] mov r12,r13 ror r13,7 add rbx,r14 mov r14,rdi ror rdi,42 xor r13,r12 shr r12,7 ror r13,1 xor rdi,r14 shr r14,6 ror rdi,19 xor r12,r13 xor rdi,r14 add r12,QWORD[64+rsp] add r12,QWORD[120+rsp] mov r13,r9 add r12,rdi mov r14,rbx ror r13,23 mov rdi,r10 xor r13,r9 ror r14,5 xor rdi,r11 mov QWORD[120+rsp],r12 xor r14,rbx and rdi,r9 ror r13,4 add r12,rax xor rdi,r11 ror r14,6 xor r13,r9 add r12,rdi mov rdi,rbx add r12,QWORD[rbp] xor r14,rbx xor rdi,rcx ror r13,14 mov rax,rcx and r15,rdi ror r14,28 add r12,r13 xor rax,r15 add r8,r12 add rax,r12 lea rbp,[24+rbp] cmp BYTE[7+rbp],0 jnz NEAR $L$rounds_16_xx mov rdi,QWORD[((128+0))+rsp] add rax,r14 lea rsi,[128+rsi] add rax,QWORD[rdi] add rbx,QWORD[8+rdi] add rcx,QWORD[16+rdi] add rdx,QWORD[24+rdi] add r8,QWORD[32+rdi] add r9,QWORD[40+rdi] add r10,QWORD[48+rdi] add r11,QWORD[56+rdi] cmp rsi,QWORD[((128+16))+rsp] mov QWORD[rdi],rax mov QWORD[8+rdi],rbx mov QWORD[16+rdi],rcx mov QWORD[24+rdi],rdx mov QWORD[32+rdi],r8 mov QWORD[40+rdi],r9 mov QWORD[48+rdi],r10 mov QWORD[56+rdi],r11 jb NEAR $L$loop mov rsi,QWORD[152+rsp] mov r15,QWORD[((-48))+rsi] mov r14,QWORD[((-40))+rsi] mov r13,QWORD[((-32))+rsi] mov r12,QWORD[((-24))+rsi] mov rbp,QWORD[((-16))+rsi] mov rbx,QWORD[((-8))+rsi] lea rsp,[rsi] $L$epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret 
$L$SEH_end_sha512_block_data_order_nohw: section .rdata rdata align=8 ALIGN 64 K512: DQ 0x428a2f98d728ae22,0x7137449123ef65cd DQ 0x428a2f98d728ae22,0x7137449123ef65cd DQ 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc DQ 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc DQ 0x3956c25bf348b538,0x59f111f1b605d019 DQ 0x3956c25bf348b538,0x59f111f1b605d019 DQ 0x923f82a4af194f9b,0xab1c5ed5da6d8118 DQ 0x923f82a4af194f9b,0xab1c5ed5da6d8118 DQ 0xd807aa98a3030242,0x12835b0145706fbe DQ 0xd807aa98a3030242,0x12835b0145706fbe DQ 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 DQ 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 DQ 0x72be5d74f27b896f,0x80deb1fe3b1696b1 DQ 0x72be5d74f27b896f,0x80deb1fe3b1696b1 DQ 0x9bdc06a725c71235,0xc19bf174cf692694 DQ 0x9bdc06a725c71235,0xc19bf174cf692694 DQ 0xe49b69c19ef14ad2,0xefbe4786384f25e3 DQ 0xe49b69c19ef14ad2,0xefbe4786384f25e3 DQ 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 DQ 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 DQ 0x2de92c6f592b0275,0x4a7484aa6ea6e483 DQ 0x2de92c6f592b0275,0x4a7484aa6ea6e483 DQ 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 DQ 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 DQ 0x983e5152ee66dfab,0xa831c66d2db43210 DQ 0x983e5152ee66dfab,0xa831c66d2db43210 DQ 0xb00327c898fb213f,0xbf597fc7beef0ee4 DQ 0xb00327c898fb213f,0xbf597fc7beef0ee4 DQ 0xc6e00bf33da88fc2,0xd5a79147930aa725 DQ 0xc6e00bf33da88fc2,0xd5a79147930aa725 DQ 0x06ca6351e003826f,0x142929670a0e6e70 DQ 0x06ca6351e003826f,0x142929670a0e6e70 DQ 0x27b70a8546d22ffc,0x2e1b21385c26c926 DQ 0x27b70a8546d22ffc,0x2e1b21385c26c926 DQ 0x4d2c6dfc5ac42aed,0x53380d139d95b3df DQ 0x4d2c6dfc5ac42aed,0x53380d139d95b3df DQ 0x650a73548baf63de,0x766a0abb3c77b2a8 DQ 0x650a73548baf63de,0x766a0abb3c77b2a8 DQ 0x81c2c92e47edaee6,0x92722c851482353b DQ 0x81c2c92e47edaee6,0x92722c851482353b DQ 0xa2bfe8a14cf10364,0xa81a664bbc423001 DQ 0xa2bfe8a14cf10364,0xa81a664bbc423001 DQ 0xc24b8b70d0f89791,0xc76c51a30654be30 DQ 0xc24b8b70d0f89791,0xc76c51a30654be30 DQ 0xd192e819d6ef5218,0xd69906245565a910 DQ 0xd192e819d6ef5218,0xd69906245565a910 DQ 0xf40e35855771202a,0x106aa07032bbd1b8 DQ 0xf40e35855771202a,0x106aa07032bbd1b8 DQ 0x19a4c116b8d2d0c8,0x1e376c085141ab53 DQ 0x19a4c116b8d2d0c8,0x1e376c085141ab53 DQ 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 DQ 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 DQ 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb DQ 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb DQ 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 DQ 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 DQ 0x748f82ee5defb2fc,0x78a5636f43172f60 DQ 0x748f82ee5defb2fc,0x78a5636f43172f60 DQ 0x84c87814a1f0ab72,0x8cc702081a6439ec DQ 0x84c87814a1f0ab72,0x8cc702081a6439ec DQ 0x90befffa23631e28,0xa4506cebde82bde9 DQ 0x90befffa23631e28,0xa4506cebde82bde9 DQ 0xbef9a3f7b2c67915,0xc67178f2e372532b DQ 0xbef9a3f7b2c67915,0xc67178f2e372532b DQ 0xca273eceea26619c,0xd186b8c721c0c207 DQ 0xca273eceea26619c,0xd186b8c721c0c207 DQ 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 DQ 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 DQ 0x06f067aa72176fba,0x0a637dc5a2c898a6 DQ 0x06f067aa72176fba,0x0a637dc5a2c898a6 DQ 0x113f9804bef90dae,0x1b710b35131c471b DQ 0x113f9804bef90dae,0x1b710b35131c471b DQ 0x28db77f523047d84,0x32caab7b40c72493 DQ 0x28db77f523047d84,0x32caab7b40c72493 DQ 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c DQ 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c DQ 0x4cc5d4becb3e42b6,0x597f299cfc657e2a DQ 0x4cc5d4becb3e42b6,0x597f299cfc657e2a DQ 0x5fcb6fab3ad6faec,0x6c44198c4a475817 DQ 0x5fcb6fab3ad6faec,0x6c44198c4a475817 DQ 0x0001020304050607,0x08090a0b0c0d0e0f DQ 0x0001020304050607,0x08090a0b0c0d0e0f DB 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97 DB 
110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54 DB 52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 DB 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 DB 111,114,103,62,0 section .text global sha512_block_data_order_avx ALIGN 64 sha512_block_data_order_avx: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_sha512_block_data_order_avx: mov rdi,rcx mov rsi,rdx mov rdx,r8 _CET_ENDBR mov rax,rsp push rbx push rbp push r12 push r13 push r14 push r15 shl rdx,4 sub rsp,256 lea rdx,[rdx*8+rsi] and rsp,-64 mov QWORD[((128+0))+rsp],rdi mov QWORD[((128+8))+rsp],rsi mov QWORD[((128+16))+rsp],rdx mov QWORD[152+rsp],rax movaps XMMWORD[(128+32)+rsp],xmm6 movaps XMMWORD[(128+48)+rsp],xmm7 movaps XMMWORD[(128+64)+rsp],xmm8 movaps XMMWORD[(128+80)+rsp],xmm9 movaps XMMWORD[(128+96)+rsp],xmm10 movaps XMMWORD[(128+112)+rsp],xmm11 $L$prologue_avx: vzeroupper mov rax,QWORD[rdi] mov rbx,QWORD[8+rdi] mov rcx,QWORD[16+rdi] mov rdx,QWORD[24+rdi] mov r8,QWORD[32+rdi] mov r9,QWORD[40+rdi] mov r10,QWORD[48+rdi] mov r11,QWORD[56+rdi] jmp NEAR $L$loop_avx ALIGN 16 $L$loop_avx: vmovdqa xmm11,XMMWORD[((K512+1280))] vmovdqu xmm0,XMMWORD[rsi] lea rbp,[((K512+128))] vmovdqu xmm1,XMMWORD[16+rsi] vmovdqu xmm2,XMMWORD[32+rsi] vpshufb xmm0,xmm0,xmm11 vmovdqu xmm3,XMMWORD[48+rsi] vpshufb xmm1,xmm1,xmm11 vmovdqu xmm4,XMMWORD[64+rsi] vpshufb xmm2,xmm2,xmm11 vmovdqu xmm5,XMMWORD[80+rsi] vpshufb xmm3,xmm3,xmm11 vmovdqu xmm6,XMMWORD[96+rsi] vpshufb xmm4,xmm4,xmm11 vmovdqu xmm7,XMMWORD[112+rsi] vpshufb xmm5,xmm5,xmm11 vpaddq xmm8,xmm0,XMMWORD[((-128))+rbp] vpshufb xmm6,xmm6,xmm11 vpaddq xmm9,xmm1,XMMWORD[((-96))+rbp] vpshufb xmm7,xmm7,xmm11 vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp] vpaddq xmm11,xmm3,XMMWORD[((-32))+rbp] vmovdqa XMMWORD[rsp],xmm8 vpaddq xmm8,xmm4,XMMWORD[rbp] vmovdqa XMMWORD[16+rsp],xmm9 vpaddq xmm9,xmm5,XMMWORD[32+rbp] vmovdqa XMMWORD[32+rsp],xmm10 vpaddq xmm10,xmm6,XMMWORD[64+rbp] vmovdqa XMMWORD[48+rsp],xmm11 vpaddq xmm11,xmm7,XMMWORD[96+rbp] vmovdqa XMMWORD[64+rsp],xmm8 mov r14,rax vmovdqa XMMWORD[80+rsp],xmm9 mov rdi,rbx vmovdqa XMMWORD[96+rsp],xmm10 xor rdi,rcx vmovdqa XMMWORD[112+rsp],xmm11 mov r13,r8 jmp NEAR $L$avx_00_47 ALIGN 16 $L$avx_00_47: add rbp,256 vpalignr xmm8,xmm1,xmm0,8 shrd r13,r13,23 mov rax,r14 vpalignr xmm11,xmm5,xmm4,8 mov r12,r9 shrd r14,r14,5 vpsrlq xmm10,xmm8,1 xor r13,r8 xor r12,r10 vpaddq xmm0,xmm0,xmm11 shrd r13,r13,4 xor r14,rax vpsrlq xmm11,xmm8,7 and r12,r8 xor r13,r8 vpsllq xmm9,xmm8,56 add r11,QWORD[rsp] mov r15,rax vpxor xmm8,xmm11,xmm10 xor r12,r10 shrd r14,r14,6 vpsrlq xmm10,xmm10,7 xor r15,rbx add r11,r12 vpxor xmm8,xmm8,xmm9 shrd r13,r13,14 and rdi,r15 vpsllq xmm9,xmm9,7 xor r14,rax add r11,r13 vpxor xmm8,xmm8,xmm10 xor rdi,rbx shrd r14,r14,28 vpsrlq xmm11,xmm7,6 add rdx,r11 add r11,rdi vpxor xmm8,xmm8,xmm9 mov r13,rdx add r14,r11 vpsllq xmm10,xmm7,3 shrd r13,r13,23 mov r11,r14 vpaddq xmm0,xmm0,xmm8 mov r12,r8 shrd r14,r14,5 vpsrlq xmm9,xmm7,19 xor r13,rdx xor r12,r9 vpxor xmm11,xmm11,xmm10 shrd r13,r13,4 xor r14,r11 vpsllq xmm10,xmm10,42 and r12,rdx xor r13,rdx vpxor xmm11,xmm11,xmm9 add r10,QWORD[8+rsp] mov rdi,r11 vpsrlq xmm9,xmm9,42 xor r12,r9 shrd r14,r14,6 vpxor xmm11,xmm11,xmm10 xor rdi,rax add r10,r12 vpxor xmm11,xmm11,xmm9 shrd r13,r13,14 and r15,rdi vpaddq xmm0,xmm0,xmm11 xor r14,r11 add r10,r13 vpaddq xmm10,xmm0,XMMWORD[((-128))+rbp] xor r15,rax shrd r14,r14,28 add rcx,r10 add r10,r15 mov r13,rcx add r14,r10 vmovdqa XMMWORD[rsp],xmm10 vpalignr xmm8,xmm2,xmm1,8 shrd r13,r13,23 mov r10,r14 vpalignr xmm11,xmm6,xmm5,8 mov 
r12,rdx shrd r14,r14,5 vpsrlq xmm10,xmm8,1 xor r13,rcx xor r12,r8 vpaddq xmm1,xmm1,xmm11 shrd r13,r13,4 xor r14,r10 vpsrlq xmm11,xmm8,7 and r12,rcx xor r13,rcx vpsllq xmm9,xmm8,56 add r9,QWORD[16+rsp] mov r15,r10 vpxor xmm8,xmm11,xmm10 xor r12,r8 shrd r14,r14,6 vpsrlq xmm10,xmm10,7 xor r15,r11 add r9,r12 vpxor xmm8,xmm8,xmm9 shrd r13,r13,14 and rdi,r15 vpsllq xmm9,xmm9,7 xor r14,r10 add r9,r13 vpxor xmm8,xmm8,xmm10 xor rdi,r11 shrd r14,r14,28 vpsrlq xmm11,xmm0,6 add rbx,r9 add r9,rdi vpxor xmm8,xmm8,xmm9 mov r13,rbx add r14,r9 vpsllq xmm10,xmm0,3 shrd r13,r13,23 mov r9,r14 vpaddq xmm1,xmm1,xmm8 mov r12,rcx shrd r14,r14,5 vpsrlq xmm9,xmm0,19 xor r13,rbx xor r12,rdx vpxor xmm11,xmm11,xmm10 shrd r13,r13,4 xor r14,r9 vpsllq xmm10,xmm10,42 and r12,rbx xor r13,rbx vpxor xmm11,xmm11,xmm9 add r8,QWORD[24+rsp] mov rdi,r9 vpsrlq xmm9,xmm9,42 xor r12,rdx shrd r14,r14,6 vpxor xmm11,xmm11,xmm10 xor rdi,r10 add r8,r12 vpxor xmm11,xmm11,xmm9 shrd r13,r13,14 and r15,rdi vpaddq xmm1,xmm1,xmm11 xor r14,r9 add r8,r13 vpaddq xmm10,xmm1,XMMWORD[((-96))+rbp] xor r15,r10 shrd r14,r14,28 add rax,r8 add r8,r15 mov r13,rax add r14,r8 vmovdqa XMMWORD[16+rsp],xmm10 vpalignr xmm8,xmm3,xmm2,8 shrd r13,r13,23 mov r8,r14 vpalignr xmm11,xmm7,xmm6,8 mov r12,rbx shrd r14,r14,5 vpsrlq xmm10,xmm8,1 xor r13,rax xor r12,rcx vpaddq xmm2,xmm2,xmm11 shrd r13,r13,4 xor r14,r8 vpsrlq xmm11,xmm8,7 and r12,rax xor r13,rax vpsllq xmm9,xmm8,56 add rdx,QWORD[32+rsp] mov r15,r8 vpxor xmm8,xmm11,xmm10 xor r12,rcx shrd r14,r14,6 vpsrlq xmm10,xmm10,7 xor r15,r9 add rdx,r12 vpxor xmm8,xmm8,xmm9 shrd r13,r13,14 and rdi,r15 vpsllq xmm9,xmm9,7 xor r14,r8 add rdx,r13 vpxor xmm8,xmm8,xmm10 xor rdi,r9 shrd r14,r14,28 vpsrlq xmm11,xmm1,6 add r11,rdx add rdx,rdi vpxor xmm8,xmm8,xmm9 mov r13,r11 add r14,rdx vpsllq xmm10,xmm1,3 shrd r13,r13,23 mov rdx,r14 vpaddq xmm2,xmm2,xmm8 mov r12,rax shrd r14,r14,5 vpsrlq xmm9,xmm1,19 xor r13,r11 xor r12,rbx vpxor xmm11,xmm11,xmm10 shrd r13,r13,4 xor r14,rdx vpsllq xmm10,xmm10,42 and r12,r11 xor r13,r11 vpxor xmm11,xmm11,xmm9 add rcx,QWORD[40+rsp] mov rdi,rdx vpsrlq xmm9,xmm9,42 xor r12,rbx shrd r14,r14,6 vpxor xmm11,xmm11,xmm10 xor rdi,r8 add rcx,r12 vpxor xmm11,xmm11,xmm9 shrd r13,r13,14 and r15,rdi vpaddq xmm2,xmm2,xmm11 xor r14,rdx add rcx,r13 vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp] xor r15,r8 shrd r14,r14,28 add r10,rcx add rcx,r15 mov r13,r10 add r14,rcx vmovdqa XMMWORD[32+rsp],xmm10 vpalignr xmm8,xmm4,xmm3,8 shrd r13,r13,23 mov rcx,r14 vpalignr xmm11,xmm0,xmm7,8 mov r12,r11 shrd r14,r14,5 vpsrlq xmm10,xmm8,1 xor r13,r10 xor r12,rax vpaddq xmm3,xmm3,xmm11 shrd r13,r13,4 xor r14,rcx vpsrlq xmm11,xmm8,7 and r12,r10 xor r13,r10 vpsllq xmm9,xmm8,56 add rbx,QWORD[48+rsp] mov r15,rcx vpxor xmm8,xmm11,xmm10 xor r12,rax shrd r14,r14,6 vpsrlq xmm10,xmm10,7 xor r15,rdx add rbx,r12 vpxor xmm8,xmm8,xmm9 shrd r13,r13,14 and rdi,r15 vpsllq xmm9,xmm9,7 xor r14,rcx add rbx,r13 vpxor xmm8,xmm8,xmm10 xor rdi,rdx shrd r14,r14,28 vpsrlq xmm11,xmm2,6 add r9,rbx add rbx,rdi vpxor xmm8,xmm8,xmm9 mov r13,r9 add r14,rbx vpsllq xmm10,xmm2,3 shrd r13,r13,23 mov rbx,r14 vpaddq xmm3,xmm3,xmm8 mov r12,r10 shrd r14,r14,5 vpsrlq xmm9,xmm2,19 xor r13,r9 xor r12,r11 vpxor xmm11,xmm11,xmm10 shrd r13,r13,4 xor r14,rbx vpsllq xmm10,xmm10,42 and r12,r9 xor r13,r9 vpxor xmm11,xmm11,xmm9 add rax,QWORD[56+rsp] mov rdi,rbx vpsrlq xmm9,xmm9,42 xor r12,r11 shrd r14,r14,6 vpxor xmm11,xmm11,xmm10 xor rdi,rcx add rax,r12 vpxor xmm11,xmm11,xmm9 shrd r13,r13,14 and r15,rdi vpaddq xmm3,xmm3,xmm11 xor r14,rbx add rax,r13 vpaddq xmm10,xmm3,XMMWORD[((-32))+rbp] 
xor r15,rcx shrd r14,r14,28 add r8,rax add rax,r15 mov r13,r8 add r14,rax vmovdqa XMMWORD[48+rsp],xmm10 vpalignr xmm8,xmm5,xmm4,8 shrd r13,r13,23 mov rax,r14 vpalignr xmm11,xmm1,xmm0,8 mov r12,r9 shrd r14,r14,5 vpsrlq xmm10,xmm8,1 xor r13,r8 xor r12,r10 vpaddq xmm4,xmm4,xmm11 shrd r13,r13,4 xor r14,rax vpsrlq xmm11,xmm8,7 and r12,r8 xor r13,r8 vpsllq xmm9,xmm8,56 add r11,QWORD[64+rsp] mov r15,rax vpxor xmm8,xmm11,xmm10 xor r12,r10 shrd r14,r14,6 vpsrlq xmm10,xmm10,7 xor r15,rbx add r11,r12 vpxor xmm8,xmm8,xmm9 shrd r13,r13,14 and rdi,r15 vpsllq xmm9,xmm9,7 xor r14,rax add r11,r13 vpxor xmm8,xmm8,xmm10 xor rdi,rbx shrd r14,r14,28 vpsrlq xmm11,xmm3,6 add rdx,r11 add r11,rdi vpxor xmm8,xmm8,xmm9 mov r13,rdx add r14,r11 vpsllq xmm10,xmm3,3 shrd r13,r13,23 mov r11,r14 vpaddq xmm4,xmm4,xmm8 mov r12,r8 shrd r14,r14,5 vpsrlq xmm9,xmm3,19 xor r13,rdx xor r12,r9 vpxor xmm11,xmm11,xmm10 shrd r13,r13,4 xor r14,r11 vpsllq xmm10,xmm10,42 and r12,rdx xor r13,rdx vpxor xmm11,xmm11,xmm9 add r10,QWORD[72+rsp] mov rdi,r11 vpsrlq xmm9,xmm9,42 xor r12,r9 shrd r14,r14,6 vpxor xmm11,xmm11,xmm10 xor rdi,rax add r10,r12 vpxor xmm11,xmm11,xmm9 shrd r13,r13,14 and r15,rdi vpaddq xmm4,xmm4,xmm11 xor r14,r11 add r10,r13 vpaddq xmm10,xmm4,XMMWORD[rbp] xor r15,rax shrd r14,r14,28 add rcx,r10 add r10,r15 mov r13,rcx add r14,r10 vmovdqa XMMWORD[64+rsp],xmm10 vpalignr xmm8,xmm6,xmm5,8 shrd r13,r13,23 mov r10,r14 vpalignr xmm11,xmm2,xmm1,8 mov r12,rdx shrd r14,r14,5 vpsrlq xmm10,xmm8,1 xor r13,rcx xor r12,r8 vpaddq xmm5,xmm5,xmm11 shrd r13,r13,4 xor r14,r10 vpsrlq xmm11,xmm8,7 and r12,rcx xor r13,rcx vpsllq xmm9,xmm8,56 add r9,QWORD[80+rsp] mov r15,r10 vpxor xmm8,xmm11,xmm10 xor r12,r8 shrd r14,r14,6 vpsrlq xmm10,xmm10,7 xor r15,r11 add r9,r12 vpxor xmm8,xmm8,xmm9 shrd r13,r13,14 and rdi,r15 vpsllq xmm9,xmm9,7 xor r14,r10 add r9,r13 vpxor xmm8,xmm8,xmm10 xor rdi,r11 shrd r14,r14,28 vpsrlq xmm11,xmm4,6 add rbx,r9 add r9,rdi vpxor xmm8,xmm8,xmm9 mov r13,rbx add r14,r9 vpsllq xmm10,xmm4,3 shrd r13,r13,23 mov r9,r14 vpaddq xmm5,xmm5,xmm8 mov r12,rcx shrd r14,r14,5 vpsrlq xmm9,xmm4,19 xor r13,rbx xor r12,rdx vpxor xmm11,xmm11,xmm10 shrd r13,r13,4 xor r14,r9 vpsllq xmm10,xmm10,42 and r12,rbx xor r13,rbx vpxor xmm11,xmm11,xmm9 add r8,QWORD[88+rsp] mov rdi,r9 vpsrlq xmm9,xmm9,42 xor r12,rdx shrd r14,r14,6 vpxor xmm11,xmm11,xmm10 xor rdi,r10 add r8,r12 vpxor xmm11,xmm11,xmm9 shrd r13,r13,14 and r15,rdi vpaddq xmm5,xmm5,xmm11 xor r14,r9 add r8,r13 vpaddq xmm10,xmm5,XMMWORD[32+rbp] xor r15,r10 shrd r14,r14,28 add rax,r8 add r8,r15 mov r13,rax add r14,r8 vmovdqa XMMWORD[80+rsp],xmm10 vpalignr xmm8,xmm7,xmm6,8 shrd r13,r13,23 mov r8,r14 vpalignr xmm11,xmm3,xmm2,8 mov r12,rbx shrd r14,r14,5 vpsrlq xmm10,xmm8,1 xor r13,rax xor r12,rcx vpaddq xmm6,xmm6,xmm11 shrd r13,r13,4 xor r14,r8 vpsrlq xmm11,xmm8,7 and r12,rax xor r13,rax vpsllq xmm9,xmm8,56 add rdx,QWORD[96+rsp] mov r15,r8 vpxor xmm8,xmm11,xmm10 xor r12,rcx shrd r14,r14,6 vpsrlq xmm10,xmm10,7 xor r15,r9 add rdx,r12 vpxor xmm8,xmm8,xmm9 shrd r13,r13,14 and rdi,r15 vpsllq xmm9,xmm9,7 xor r14,r8 add rdx,r13 vpxor xmm8,xmm8,xmm10 xor rdi,r9 shrd r14,r14,28 vpsrlq xmm11,xmm5,6 add r11,rdx add rdx,rdi vpxor xmm8,xmm8,xmm9 mov r13,r11 add r14,rdx vpsllq xmm10,xmm5,3 shrd r13,r13,23 mov rdx,r14 vpaddq xmm6,xmm6,xmm8 mov r12,rax shrd r14,r14,5 vpsrlq xmm9,xmm5,19 xor r13,r11 xor r12,rbx vpxor xmm11,xmm11,xmm10 shrd r13,r13,4 xor r14,rdx vpsllq xmm10,xmm10,42 and r12,r11 xor r13,r11 vpxor xmm11,xmm11,xmm9 add rcx,QWORD[104+rsp] mov rdi,rdx vpsrlq xmm9,xmm9,42 xor r12,rbx shrd r14,r14,6 vpxor 
xmm11,xmm11,xmm10 xor rdi,r8 add rcx,r12 vpxor xmm11,xmm11,xmm9 shrd r13,r13,14 and r15,rdi vpaddq xmm6,xmm6,xmm11 xor r14,rdx add rcx,r13 vpaddq xmm10,xmm6,XMMWORD[64+rbp] xor r15,r8 shrd r14,r14,28 add r10,rcx add rcx,r15 mov r13,r10 add r14,rcx vmovdqa XMMWORD[96+rsp],xmm10 vpalignr xmm8,xmm0,xmm7,8 shrd r13,r13,23 mov rcx,r14 vpalignr xmm11,xmm4,xmm3,8 mov r12,r11 shrd r14,r14,5 vpsrlq xmm10,xmm8,1 xor r13,r10 xor r12,rax vpaddq xmm7,xmm7,xmm11 shrd r13,r13,4 xor r14,rcx vpsrlq xmm11,xmm8,7 and r12,r10 xor r13,r10 vpsllq xmm9,xmm8,56 add rbx,QWORD[112+rsp] mov r15,rcx vpxor xmm8,xmm11,xmm10 xor r12,rax shrd r14,r14,6 vpsrlq xmm10,xmm10,7 xor r15,rdx add rbx,r12 vpxor xmm8,xmm8,xmm9 shrd r13,r13,14 and rdi,r15 vpsllq xmm9,xmm9,7 xor r14,rcx add rbx,r13 vpxor xmm8,xmm8,xmm10 xor rdi,rdx shrd r14,r14,28 vpsrlq xmm11,xmm6,6 add r9,rbx add rbx,rdi vpxor xmm8,xmm8,xmm9 mov r13,r9 add r14,rbx vpsllq xmm10,xmm6,3 shrd r13,r13,23 mov rbx,r14 vpaddq xmm7,xmm7,xmm8 mov r12,r10 shrd r14,r14,5 vpsrlq xmm9,xmm6,19 xor r13,r9 xor r12,r11 vpxor xmm11,xmm11,xmm10 shrd r13,r13,4 xor r14,rbx vpsllq xmm10,xmm10,42 and r12,r9 xor r13,r9 vpxor xmm11,xmm11,xmm9 add rax,QWORD[120+rsp] mov rdi,rbx vpsrlq xmm9,xmm9,42 xor r12,r11 shrd r14,r14,6 vpxor xmm11,xmm11,xmm10 xor rdi,rcx add rax,r12 vpxor xmm11,xmm11,xmm9 shrd r13,r13,14 and r15,rdi vpaddq xmm7,xmm7,xmm11 xor r14,rbx add rax,r13 vpaddq xmm10,xmm7,XMMWORD[96+rbp] xor r15,rcx shrd r14,r14,28 add r8,rax add rax,r15 mov r13,r8 add r14,rax vmovdqa XMMWORD[112+rsp],xmm10 cmp BYTE[135+rbp],0 jne NEAR $L$avx_00_47 shrd r13,r13,23 mov rax,r14 mov r12,r9 shrd r14,r14,5 xor r13,r8 xor r12,r10 shrd r13,r13,4 xor r14,rax and r12,r8 xor r13,r8 add r11,QWORD[rsp] mov r15,rax xor r12,r10 shrd r14,r14,6 xor r15,rbx add r11,r12 shrd r13,r13,14 and rdi,r15 xor r14,rax add r11,r13 xor rdi,rbx shrd r14,r14,28 add rdx,r11 add r11,rdi mov r13,rdx add r14,r11 shrd r13,r13,23 mov r11,r14 mov r12,r8 shrd r14,r14,5 xor r13,rdx xor r12,r9 shrd r13,r13,4 xor r14,r11 and r12,rdx xor r13,rdx add r10,QWORD[8+rsp] mov rdi,r11 xor r12,r9 shrd r14,r14,6 xor rdi,rax add r10,r12 shrd r13,r13,14 and r15,rdi xor r14,r11 add r10,r13 xor r15,rax shrd r14,r14,28 add rcx,r10 add r10,r15 mov r13,rcx add r14,r10 shrd r13,r13,23 mov r10,r14 mov r12,rdx shrd r14,r14,5 xor r13,rcx xor r12,r8 shrd r13,r13,4 xor r14,r10 and r12,rcx xor r13,rcx add r9,QWORD[16+rsp] mov r15,r10 xor r12,r8 shrd r14,r14,6 xor r15,r11 add r9,r12 shrd r13,r13,14 and rdi,r15 xor r14,r10 add r9,r13 xor rdi,r11 shrd r14,r14,28 add rbx,r9 add r9,rdi mov r13,rbx add r14,r9 shrd r13,r13,23 mov r9,r14 mov r12,rcx shrd r14,r14,5 xor r13,rbx xor r12,rdx shrd r13,r13,4 xor r14,r9 and r12,rbx xor r13,rbx add r8,QWORD[24+rsp] mov rdi,r9 xor r12,rdx shrd r14,r14,6 xor rdi,r10 add r8,r12 shrd r13,r13,14 and r15,rdi xor r14,r9 add r8,r13 xor r15,r10 shrd r14,r14,28 add rax,r8 add r8,r15 mov r13,rax add r14,r8 shrd r13,r13,23 mov r8,r14 mov r12,rbx shrd r14,r14,5 xor r13,rax xor r12,rcx shrd r13,r13,4 xor r14,r8 and r12,rax xor r13,rax add rdx,QWORD[32+rsp] mov r15,r8 xor r12,rcx shrd r14,r14,6 xor r15,r9 add rdx,r12 shrd r13,r13,14 and rdi,r15 xor r14,r8 add rdx,r13 xor rdi,r9 shrd r14,r14,28 add r11,rdx add rdx,rdi mov r13,r11 add r14,rdx shrd r13,r13,23 mov rdx,r14 mov r12,rax shrd r14,r14,5 xor r13,r11 xor r12,rbx shrd r13,r13,4 xor r14,rdx and r12,r11 xor r13,r11 add rcx,QWORD[40+rsp] mov rdi,rdx xor r12,rbx shrd r14,r14,6 xor rdi,r8 add rcx,r12 shrd r13,r13,14 and r15,rdi xor r14,rdx add rcx,r13 xor r15,r8 shrd r14,r14,28 add r10,rcx 
add rcx,r15 mov r13,r10 add r14,rcx shrd r13,r13,23 mov rcx,r14 mov r12,r11 shrd r14,r14,5 xor r13,r10 xor r12,rax shrd r13,r13,4 xor r14,rcx and r12,r10 xor r13,r10 add rbx,QWORD[48+rsp] mov r15,rcx xor r12,rax shrd r14,r14,6 xor r15,rdx add rbx,r12 shrd r13,r13,14 and rdi,r15 xor r14,rcx add rbx,r13 xor rdi,rdx shrd r14,r14,28 add r9,rbx add rbx,rdi mov r13,r9 add r14,rbx shrd r13,r13,23 mov rbx,r14 mov r12,r10 shrd r14,r14,5 xor r13,r9 xor r12,r11 shrd r13,r13,4 xor r14,rbx and r12,r9 xor r13,r9 add rax,QWORD[56+rsp] mov rdi,rbx xor r12,r11 shrd r14,r14,6 xor rdi,rcx add rax,r12 shrd r13,r13,14 and r15,rdi xor r14,rbx add rax,r13 xor r15,rcx shrd r14,r14,28 add r8,rax add rax,r15 mov r13,r8 add r14,rax shrd r13,r13,23 mov rax,r14 mov r12,r9 shrd r14,r14,5 xor r13,r8 xor r12,r10 shrd r13,r13,4 xor r14,rax and r12,r8 xor r13,r8 add r11,QWORD[64+rsp] mov r15,rax xor r12,r10 shrd r14,r14,6 xor r15,rbx add r11,r12 shrd r13,r13,14 and rdi,r15 xor r14,rax add r11,r13 xor rdi,rbx shrd r14,r14,28 add rdx,r11 add r11,rdi mov r13,rdx add r14,r11 shrd r13,r13,23 mov r11,r14 mov r12,r8 shrd r14,r14,5 xor r13,rdx xor r12,r9 shrd r13,r13,4 xor r14,r11 and r12,rdx xor r13,rdx add r10,QWORD[72+rsp] mov rdi,r11 xor r12,r9 shrd r14,r14,6 xor rdi,rax add r10,r12 shrd r13,r13,14 and r15,rdi xor r14,r11 add r10,r13 xor r15,rax shrd r14,r14,28 add rcx,r10 add r10,r15 mov r13,rcx add r14,r10 shrd r13,r13,23 mov r10,r14 mov r12,rdx shrd r14,r14,5 xor r13,rcx xor r12,r8 shrd r13,r13,4 xor r14,r10 and r12,rcx xor r13,rcx add r9,QWORD[80+rsp] mov r15,r10 xor r12,r8 shrd r14,r14,6 xor r15,r11 add r9,r12 shrd r13,r13,14 and rdi,r15 xor r14,r10 add r9,r13 xor rdi,r11 shrd r14,r14,28 add rbx,r9 add r9,rdi mov r13,rbx add r14,r9 shrd r13,r13,23 mov r9,r14 mov r12,rcx shrd r14,r14,5 xor r13,rbx xor r12,rdx shrd r13,r13,4 xor r14,r9 and r12,rbx xor r13,rbx add r8,QWORD[88+rsp] mov rdi,r9 xor r12,rdx shrd r14,r14,6 xor rdi,r10 add r8,r12 shrd r13,r13,14 and r15,rdi xor r14,r9 add r8,r13 xor r15,r10 shrd r14,r14,28 add rax,r8 add r8,r15 mov r13,rax add r14,r8 shrd r13,r13,23 mov r8,r14 mov r12,rbx shrd r14,r14,5 xor r13,rax xor r12,rcx shrd r13,r13,4 xor r14,r8 and r12,rax xor r13,rax add rdx,QWORD[96+rsp] mov r15,r8 xor r12,rcx shrd r14,r14,6 xor r15,r9 add rdx,r12 shrd r13,r13,14 and rdi,r15 xor r14,r8 add rdx,r13 xor rdi,r9 shrd r14,r14,28 add r11,rdx add rdx,rdi mov r13,r11 add r14,rdx shrd r13,r13,23 mov rdx,r14 mov r12,rax shrd r14,r14,5 xor r13,r11 xor r12,rbx shrd r13,r13,4 xor r14,rdx and r12,r11 xor r13,r11 add rcx,QWORD[104+rsp] mov rdi,rdx xor r12,rbx shrd r14,r14,6 xor rdi,r8 add rcx,r12 shrd r13,r13,14 and r15,rdi xor r14,rdx add rcx,r13 xor r15,r8 shrd r14,r14,28 add r10,rcx add rcx,r15 mov r13,r10 add r14,rcx shrd r13,r13,23 mov rcx,r14 mov r12,r11 shrd r14,r14,5 xor r13,r10 xor r12,rax shrd r13,r13,4 xor r14,rcx and r12,r10 xor r13,r10 add rbx,QWORD[112+rsp] mov r15,rcx xor r12,rax shrd r14,r14,6 xor r15,rdx add rbx,r12 shrd r13,r13,14 and rdi,r15 xor r14,rcx add rbx,r13 xor rdi,rdx shrd r14,r14,28 add r9,rbx add rbx,rdi mov r13,r9 add r14,rbx shrd r13,r13,23 mov rbx,r14 mov r12,r10 shrd r14,r14,5 xor r13,r9 xor r12,r11 shrd r13,r13,4 xor r14,rbx and r12,r9 xor r13,r9 add rax,QWORD[120+rsp] mov rdi,rbx xor r12,r11 shrd r14,r14,6 xor rdi,rcx add rax,r12 shrd r13,r13,14 and r15,rdi xor r14,rbx add rax,r13 xor r15,rcx shrd r14,r14,28 add r8,rax add rax,r15 mov r13,r8 add r14,rax mov rdi,QWORD[((128+0))+rsp] mov rax,r14 add rax,QWORD[rdi] lea rsi,[128+rsi] add rbx,QWORD[8+rdi] add rcx,QWORD[16+rdi] add 
rdx,QWORD[24+rdi] add r8,QWORD[32+rdi] add r9,QWORD[40+rdi] add r10,QWORD[48+rdi] add r11,QWORD[56+rdi] cmp rsi,QWORD[((128+16))+rsp] mov QWORD[rdi],rax mov QWORD[8+rdi],rbx mov QWORD[16+rdi],rcx mov QWORD[24+rdi],rdx mov QWORD[32+rdi],r8 mov QWORD[40+rdi],r9 mov QWORD[48+rdi],r10 mov QWORD[56+rdi],r11 jb NEAR $L$loop_avx mov rsi,QWORD[152+rsp] vzeroupper movaps xmm6,XMMWORD[((128+32))+rsp] movaps xmm7,XMMWORD[((128+48))+rsp] movaps xmm8,XMMWORD[((128+64))+rsp] movaps xmm9,XMMWORD[((128+80))+rsp] movaps xmm10,XMMWORD[((128+96))+rsp] movaps xmm11,XMMWORD[((128+112))+rsp] mov r15,QWORD[((-48))+rsi] mov r14,QWORD[((-40))+rsi] mov r13,QWORD[((-32))+rsi] mov r12,QWORD[((-24))+rsi] mov rbp,QWORD[((-16))+rsi] mov rbx,QWORD[((-8))+rsi] lea rsp,[rsi] $L$epilogue_avx: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$SEH_end_sha512_block_data_order_avx: EXTERN __imp_RtlVirtualUnwind ALIGN 16 se_handler: push rsi push rdi push rbx push rbp push r12 push r13 push r14 push r15 pushfq sub rsp,64 mov rax,QWORD[120+r8] mov rbx,QWORD[248+r8] mov rsi,QWORD[8+r9] mov r11,QWORD[56+r9] mov r10d,DWORD[r11] lea r10,[r10*1+rsi] cmp rbx,r10 jb NEAR $L$in_prologue mov rax,QWORD[152+r8] mov r10d,DWORD[4+r11] lea r10,[r10*1+rsi] cmp rbx,r10 jae NEAR $L$in_prologue mov rsi,rax mov rax,QWORD[((128+24))+rax] mov rbx,QWORD[((-8))+rax] mov rbp,QWORD[((-16))+rax] mov r12,QWORD[((-24))+rax] mov r13,QWORD[((-32))+rax] mov r14,QWORD[((-40))+rax] mov r15,QWORD[((-48))+rax] mov QWORD[144+r8],rbx mov QWORD[160+r8],rbp mov QWORD[216+r8],r12 mov QWORD[224+r8],r13 mov QWORD[232+r8],r14 mov QWORD[240+r8],r15 lea r10,[$L$epilogue] cmp rbx,r10 jb NEAR $L$in_prologue lea rsi,[((128+32))+rsi] lea rdi,[512+r8] mov ecx,12 DD 0xa548f3fc $L$in_prologue: mov rdi,QWORD[8+rax] mov rsi,QWORD[16+rax] mov QWORD[152+r8],rax mov QWORD[168+r8],rsi mov QWORD[176+r8],rdi mov rdi,QWORD[40+r9] mov rsi,r8 mov ecx,154 DD 0xa548f3fc mov rsi,r9 xor rcx,rcx mov rdx,QWORD[8+rsi] mov r8,QWORD[rsi] mov r9,QWORD[16+rsi] mov r10,QWORD[40+rsi] lea r11,[56+rsi] lea r12,[24+rsi] mov QWORD[32+rsp],r10 mov QWORD[40+rsp],r11 mov QWORD[48+rsp],r12 mov QWORD[56+rsp],rcx call QWORD[__imp_RtlVirtualUnwind] mov eax,1 add rsp,64 popfq pop r15 pop r14 pop r13 pop r12 pop rbp pop rbx pop rdi pop rsi ret section .pdata rdata align=4 ALIGN 4 DD $L$SEH_begin_sha512_block_data_order_nohw wrt ..imagebase DD $L$SEH_end_sha512_block_data_order_nohw wrt ..imagebase DD $L$SEH_info_sha512_block_data_order_nohw wrt ..imagebase DD $L$SEH_begin_sha512_block_data_order_avx wrt ..imagebase DD $L$SEH_end_sha512_block_data_order_avx wrt ..imagebase DD $L$SEH_info_sha512_block_data_order_avx wrt ..imagebase section .xdata rdata align=8 ALIGN 8 $L$SEH_info_sha512_block_data_order_nohw: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$prologue wrt ..imagebase,$L$epilogue wrt ..imagebase $L$SEH_info_sha512_block_data_order_avx: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase %else ; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 ret %endif ring-0.17.14/pregenerated/sha512-x86_64-nasm.o000064400000000000000000001032771046102023000164670ustar 00000000000000dgo#.debug$SST(@B.debug$T8`VV@B.text%V2| p`.rdataeZ|@p@.pdataׁ@0@.xdata 3@@@7C:\Users\b\p\ring\pregenerated\sha512-x86_64-nasm.asm mzkO@P% 4P   "$&(!*#,'-..2/60>1F2N3V7Y8]9a:e;i<m=q>u?zACDEFGHIJKMNOQRSUVWYZ[]^_abcefgijkmnopqrst v wxz{|~#&)-036:=@DGJNQTWZ^aehknrux|   $'*-047;>ADHKNRUZ]`dgjnqtw{~     "#$&'(*+,./0234678: ; 
[non-text data omitted: the remainder of pregenerated/sha512-x86_64-nasm.o is a binary object file emitted by the Netwide Assembler 2.13.03. It contains the assembled SHA-512 code, the CodeView debug/line-number tables for sha512-x86_64-nasm.asm, the K512 constant table, and the symbols ring_core_0_17_14__sha512_block_data_order_nohw, ring_core_0_17_14__sha512_block_data_order_avx, and se_handler; it does not render as text.]
ring-0.17.14/pregenerated/vpaes-armv7-linux32.S000064400000000000000000000551241046102023000172070ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand.
#include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) .syntax unified .arch armv7-a .fpu neon #if defined(__thumb2__) .thumb #else .code 32 #endif .text .type _vpaes_consts,%object .align 7 @ totally strategic alignment _vpaes_consts: .Lk_mc_forward:@ mc_forward .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 .quad 0x080B0A0904070605, 0x000302010C0F0E0D .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 .quad 0x000302010C0F0E0D, 0x080B0A0904070605 .Lk_mc_backward:@ mc_backward .quad 0x0605040702010003, 0x0E0D0C0F0A09080B .quad 0x020100030E0D0C0F, 0x0A09080B06050407 .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 .quad 0x0A09080B06050407, 0x020100030E0D0C0F .Lk_sr:@ sr .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 .quad 0x030E09040F0A0500, 0x0B06010C07020D08 .quad 0x0F060D040B020900, 0x070E050C030A0108 .quad 0x0B0E0104070A0D00, 0x0306090C0F020508 @ @ "Hot" constants @ .Lk_inv:@ inv, inva .quad 0x0E05060F0D080180, 0x040703090A0B0C02 .quad 0x01040A060F0B0780, 0x030D0E0C02050809 .Lk_ipt:@ input transform (lo, hi) .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 .Lk_sbo:@ sbou, sbot .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA .Lk_sb1:@ sb1u, sb1t .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 .Lk_sb2:@ sb2u, sb2t .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD .byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,55,32,78,69,79,78,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 .align 2 .size _vpaes_consts,.-_vpaes_consts .align 6 @@ @@ _aes_preheat @@ @@ Fills q9-q15 as specified below. @@ .type _vpaes_preheat,%function .align 4 _vpaes_preheat: adr r10, .Lk_inv vmov.i8 q9, #0x0f @ .Lk_s0F vld1.64 {q10,q11}, [r10]! @ .Lk_inv add r10, r10, #64 @ Skip .Lk_ipt, .Lk_sbo vld1.64 {q12,q13}, [r10]! @ .Lk_sb1 vld1.64 {q14,q15}, [r10] @ .Lk_sb2 bx lr @@ @@ _aes_encrypt_core @@ @@ AES-encrypt q0. @@ @@ Inputs: @@ q0 = input @@ q9-q15 as in _vpaes_preheat @@ [r2] = scheduled keys @@ @@ Output in q0 @@ Clobbers q1-q5, r8-r11 @@ Preserves q6-q8 so you get some local vectors @@ @@ .type _vpaes_encrypt_core,%function .align 4 _vpaes_encrypt_core: mov r9, r2 ldr r8, [r2,#240] @ pull rounds adr r11, .Lk_ipt @ vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo @ vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi vld1.64 {q2, q3}, [r11] adr r11, .Lk_mc_forward+16 vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 # round0 key vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 vtbl.8 d2, {q2}, d2 @ vpshufb %xmm1, %xmm2, %xmm1 vtbl.8 d3, {q2}, d3 vtbl.8 d4, {q3}, d0 @ vpshufb %xmm0, %xmm3, %xmm2 vtbl.8 d5, {q3}, d1 veor q0, q1, q5 @ vpxor %xmm5, %xmm1, %xmm0 veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 @ .Lenc_entry ends with a bnz instruction which is normally paired with @ subs in .Lenc_loop. tst r8, r8 b .Lenc_entry .align 4 .Lenc_loop: @ middle of middle round add r10, r11, #0x40 vtbl.8 d8, {q13}, d4 @ vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u vtbl.8 d9, {q13}, d5 vld1.64 {q1}, [r11]! 
@ vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] vtbl.8 d0, {q12}, d6 @ vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t vtbl.8 d1, {q12}, d7 veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k vtbl.8 d10, {q15}, d4 @ vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u vtbl.8 d11, {q15}, d5 veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A vtbl.8 d4, {q14}, d6 @ vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t vtbl.8 d5, {q14}, d7 vld1.64 {q4}, [r10] @ vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] vtbl.8 d6, {q0}, d2 @ vpshufb %xmm1, %xmm0, %xmm3 # 0 = B vtbl.8 d7, {q0}, d3 veor q2, q2, q5 @ vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A @ Write to q5 instead of q0, so the table and destination registers do @ not overlap. vtbl.8 d10, {q0}, d8 @ vpshufb %xmm4, %xmm0, %xmm0 # 3 = D vtbl.8 d11, {q0}, d9 veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B vtbl.8 d8, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C vtbl.8 d9, {q3}, d3 @ Here we restore the original q0/q5 usage. veor q0, q5, q3 @ vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D and r11, r11, #~(1<<6) @ and $0x30, %r11 # ... mod 4 veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D subs r8, r8, #1 @ nr-- .Lenc_entry: @ top of round vand q1, q0, q9 @ vpand %xmm0, %xmm9, %xmm1 # 0 = k vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i vtbl.8 d10, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k vtbl.8 d11, {q11}, d3 veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i vtbl.8 d7, {q10}, d1 vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j vtbl.8 d9, {q10}, d3 veor q3, q3, q5 @ vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k vtbl.8 d4, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak vtbl.8 d5, {q10}, d7 vtbl.8 d6, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak vtbl.8 d7, {q10}, d9 veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 bne .Lenc_loop @ middle of last round add r10, r11, #0x80 adr r11, .Lk_sbo @ Read to q1 instead of q4, so the vtbl.8 instruction below does not @ overlap table and destination registers. vld1.64 {q1}, [r11]! @ vmovdqa -0x60(%r10), %xmm4 # 3 : sbou vld1.64 {q0}, [r11] @ vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 vtbl.8 d8, {q1}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou vtbl.8 d9, {q1}, d5 vld1.64 {q1}, [r10] @ vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] @ Write to q2 instead of q0 below, to avoid overlapping table and @ destination registers. vtbl.8 d4, {q0}, d6 @ vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t vtbl.8 d5, {q0}, d7 veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k veor q2, q2, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A @ Here we restore the original q0/q2 usage. vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0 vtbl.8 d1, {q2}, d3 bx lr .size _vpaes_encrypt_core,.-_vpaes_encrypt_core @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ @@ @@ @@ AES key schedule @@ @@ @@ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ @ This function diverges from both x86_64 and armv7 in which constants are @ pinned. x86_64 has a common preheat function for all operations. aarch64 @ separates them because it has enough registers to pin nearly all constants. @ armv7 does not have enough registers, but needing explicit loads and stores @ also complicates using x86_64's register allocation directly. 
@ @ We pin some constants for convenience and leave q14 and q15 free to load @ others on demand. @ @ Key schedule constants @ .type _vpaes_key_consts,%object .align 4 _vpaes_key_consts: .Lk_rcon:@ rcon .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 .Lk_opt:@ output transform .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 .Lk_deskew:@ deskew tables: inverts the sbox's "skew" .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 .size _vpaes_key_consts,.-_vpaes_key_consts .type _vpaes_key_preheat,%function .align 4 _vpaes_key_preheat: adr r11, .Lk_rcon vmov.i8 q12, #0x5b @ .Lk_s63 adr r10, .Lk_inv @ Must be aligned to 8 mod 16. vmov.i8 q9, #0x0f @ .Lk_s0F vld1.64 {q10,q11}, [r10] @ .Lk_inv vld1.64 {q8}, [r11] @ .Lk_rcon bx lr .size _vpaes_key_preheat,.-_vpaes_key_preheat .type _vpaes_schedule_core,%function .align 4 _vpaes_schedule_core: @ We only need to save lr, but ARM requires an 8-byte stack alignment, @ so save an extra register. stmdb sp!, {r3,lr} bl _vpaes_key_preheat @ load the tables adr r11, .Lk_ipt @ Must be aligned to 8 mod 16. vld1.64 {q0}, [r0]! @ vmovdqu (%rdi), %xmm0 # load key (unaligned) @ input transform @ Use q4 here rather than q3 so .Lschedule_am_decrypting does not @ overlap table and destination. vmov q4, q0 @ vmovdqa %xmm0, %xmm3 bl _vpaes_schedule_transform adr r10, .Lk_sr @ Must be aligned to 8 mod 16. vmov q7, q0 @ vmovdqa %xmm0, %xmm7 add r8, r8, r10 @ encrypting, output zeroth round key after transform vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx) @ *ring*: Decryption removed. .Lschedule_go: cmp r1, #192 @ cmp $192, %esi bhi .Lschedule_256 @ 128: fall though @@ @@ .schedule_128 @@ @@ 128-bit specific part of key schedule. @@ @@ This schedule is really simple, because all its parts @@ are accomplished by the subroutines. @@ .Lschedule_128: mov r0, #10 @ mov $10, %esi .Loop_schedule_128: bl _vpaes_schedule_round subs r0, r0, #1 @ dec %esi beq .Lschedule_mangle_last bl _vpaes_schedule_mangle @ write output b .Loop_schedule_128 @@ @@ .aes_schedule_256 @@ @@ 256-bit specific part of key schedule. @@ @@ The structure here is very similar to the 128-bit @@ schedule, but with an additional "low side" in @@ q6. The low side's rounds are the same as the @@ high side's, except no rcon and no rotation. @@ .align 4 .Lschedule_256: vld1.64 {q0}, [r0] @ vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) bl _vpaes_schedule_transform @ input transform mov r0, #7 @ mov $7, %esi .Loop_schedule_256: bl _vpaes_schedule_mangle @ output low result vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 @ high round bl _vpaes_schedule_round subs r0, r0, #1 @ dec %esi beq .Lschedule_mangle_last bl _vpaes_schedule_mangle @ low round. swap xmm7 and xmm6 vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0 vmov.i8 q4, #0 vmov q5, q7 @ vmovdqa %xmm7, %xmm5 vmov q7, q6 @ vmovdqa %xmm6, %xmm7 bl _vpaes_schedule_low_round vmov q7, q5 @ vmovdqa %xmm5, %xmm7 b .Loop_schedule_256 @@ @@ .aes_schedule_mangle_last @@ @@ Mangler for last round of key schedule @@ Mangles q0 @@ when encrypting, outputs out(q0) ^ 63 @@ when decrypting, outputs unskew(q0) @@ @@ Always called right before return... 
jumps to cleanup and exits @@ .align 4 .Lschedule_mangle_last: @ schedule last round key from xmm0 adr r11, .Lk_deskew @ lea .Lk_deskew(%rip),%r11 # prepare to deskew @ encrypting vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10),%xmm1 adr r11, .Lk_opt @ lea .Lk_opt(%rip), %r11 # prepare to output transform add r2, r2, #32 @ add $32, %rdx vmov q2, q0 vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0 # output permute vtbl.8 d1, {q2}, d3 .Lschedule_mangle_last_dec: sub r2, r2, #16 @ add $-16, %rdx veor q0, q0, q12 @ vpxor .Lk_s63(%rip), %xmm0, %xmm0 bl _vpaes_schedule_transform @ output transform vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx) # save last key @ cleanup veor q0, q0, q0 @ vpxor %xmm0, %xmm0, %xmm0 veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1 veor q2, q2, q2 @ vpxor %xmm2, %xmm2, %xmm2 veor q3, q3, q3 @ vpxor %xmm3, %xmm3, %xmm3 veor q4, q4, q4 @ vpxor %xmm4, %xmm4, %xmm4 veor q5, q5, q5 @ vpxor %xmm5, %xmm5, %xmm5 veor q6, q6, q6 @ vpxor %xmm6, %xmm6, %xmm6 veor q7, q7, q7 @ vpxor %xmm7, %xmm7, %xmm7 ldmia sp!, {r3,pc} @ return .size _vpaes_schedule_core,.-_vpaes_schedule_core @@ @@ .aes_schedule_round @@ @@ Runs one main round of the key schedule on q0, q7 @@ @@ Specifically, runs subbytes on the high dword of q0 @@ then rotates it by one byte and xors into the low dword of @@ q7. @@ @@ Adds rcon from low byte of q8, then rotates q8 for @@ next rcon. @@ @@ Smears the dwords of q7 by xoring the low into the @@ second low, result into third, result into highest. @@ @@ Returns results in q7 = q0. @@ Clobbers q1-q4, r11. @@ .type _vpaes_schedule_round,%function .align 4 _vpaes_schedule_round: @ extract rcon from xmm8 vmov.i8 q4, #0 @ vpxor %xmm4, %xmm4, %xmm4 vext.8 q1, q8, q4, #15 @ vpalignr $15, %xmm8, %xmm4, %xmm1 vext.8 q8, q8, q8, #15 @ vpalignr $15, %xmm8, %xmm8, %xmm8 veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7 @ rotate vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0 vext.8 q0, q0, q0, #1 @ vpalignr $1, %xmm0, %xmm0, %xmm0 @ fall through... @ low round: same as high round, but no rotation and no rcon. _vpaes_schedule_low_round: @ The x86_64 version pins .Lk_sb1 in %xmm13 and .Lk_sb1+16 in %xmm12. @ We pin other values in _vpaes_key_preheat, so load them now. 
adr r11, .Lk_sb1 vld1.64 {q14,q15}, [r11] @ smear xmm7 vext.8 q1, q4, q7, #12 @ vpslldq $4, %xmm7, %xmm1 veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7 vext.8 q4, q4, q7, #8 @ vpslldq $8, %xmm7, %xmm4 @ subbytes vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 # 0 = k vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i veor q7, q7, q4 @ vpxor %xmm4, %xmm7, %xmm7 vtbl.8 d4, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k vtbl.8 d5, {q11}, d3 veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i vtbl.8 d7, {q10}, d1 veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j vtbl.8 d9, {q10}, d3 veor q7, q7, q12 @ vpxor .Lk_s63(%rip), %xmm7, %xmm7 vtbl.8 d6, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak vtbl.8 d7, {q10}, d7 veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k vtbl.8 d4, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak vtbl.8 d5, {q10}, d9 veor q3, q3, q1 @ vpxor %xmm1, %xmm3, %xmm3 # 2 = io veor q2, q2, q0 @ vpxor %xmm0, %xmm2, %xmm2 # 3 = jo vtbl.8 d8, {q15}, d6 @ vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou vtbl.8 d9, {q15}, d7 vtbl.8 d2, {q14}, d4 @ vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t vtbl.8 d3, {q14}, d5 veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output @ add in smeared stuff veor q0, q1, q7 @ vpxor %xmm7, %xmm1, %xmm0 veor q7, q1, q7 @ vmovdqa %xmm0, %xmm7 bx lr .size _vpaes_schedule_round,.-_vpaes_schedule_round @@ @@ .aes_schedule_transform @@ @@ Linear-transform q0 according to tables at [r11] @@ @@ Requires that q9 = 0x0F0F... as in preheat @@ Output in q0 @@ Clobbers q1, q2, q14, q15 @@ .type _vpaes_schedule_transform,%function .align 4 _vpaes_schedule_transform: vld1.64 {q14,q15}, [r11] @ vmovdqa (%r11), %xmm2 # lo @ vmovdqa 16(%r11), %xmm1 # hi vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 vtbl.8 d4, {q14}, d2 @ vpshufb %xmm1, %xmm2, %xmm2 vtbl.8 d5, {q14}, d3 vtbl.8 d0, {q15}, d0 @ vpshufb %xmm0, %xmm1, %xmm0 vtbl.8 d1, {q15}, d1 veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 bx lr .size _vpaes_schedule_transform,.-_vpaes_schedule_transform @@ @@ .aes_schedule_mangle @@ @@ Mangles q0 from (basis-transformed) standard version @@ to our version. @@ @@ On encrypt, @@ xor with 0x63 @@ multiply by circulant 0,1,1,1 @@ apply shiftrows transform @@ @@ On decrypt, @@ xor with 0x63 @@ multiply by "inverse mixcolumns" circulant E,B,D,9 @@ deskew @@ apply shiftrows transform @@ @@ @@ Writes out to [r2], and increments or decrements it @@ Keeps track of round number mod 4 in r8 @@ Preserves q0 @@ Clobbers q1-q5 @@ .type _vpaes_schedule_mangle,%function .align 4 _vpaes_schedule_mangle: tst r3, r3 vmov q4, q0 @ vmovdqa %xmm0, %xmm4 # save xmm0 for later adr r11, .Lk_mc_forward @ Must be aligned to 8 mod 16. vld1.64 {q5}, [r11] @ vmovdqa .Lk_mc_forward(%rip),%xmm5 @ encrypting @ Write to q2 so we do not overlap table and destination below. veor q2, q0, q12 @ vpxor .Lk_s63(%rip), %xmm0, %xmm4 add r2, r2, #16 @ add $16, %rdx vtbl.8 d8, {q2}, d10 @ vpshufb %xmm5, %xmm4, %xmm4 vtbl.8 d9, {q2}, d11 vtbl.8 d2, {q4}, d10 @ vpshufb %xmm5, %xmm4, %xmm1 vtbl.8 d3, {q4}, d11 vtbl.8 d6, {q1}, d10 @ vpshufb %xmm5, %xmm1, %xmm3 vtbl.8 d7, {q1}, d11 veor q4, q4, q1 @ vpxor %xmm1, %xmm4, %xmm4 vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 veor q3, q3, q4 @ vpxor %xmm4, %xmm3, %xmm3 .Lschedule_mangle_both: @ Write to q2 so table and destination do not overlap. 
vtbl.8 d4, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 vtbl.8 d5, {q3}, d3 add r8, r8, #64-16 @ add $-16, %r8 and r8, r8, #~(1<<6) @ and $0x30, %r8 vst1.64 {q2}, [r2] @ vmovdqu %xmm3, (%rdx) bx lr .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle .globl vpaes_set_encrypt_key .hidden vpaes_set_encrypt_key .type vpaes_set_encrypt_key,%function .align 4 vpaes_set_encrypt_key: stmdb sp!, {r7,r8,r9,r10,r11, lr} vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} lsr r9, r1, #5 @ shr $5,%eax add r9, r9, #5 @ $5,%eax str r9, [r2,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; mov r3, #0 @ mov $0,%ecx mov r8, #0x30 @ mov $0x30,%r8d bl _vpaes_schedule_core eor r0, r0, r0 vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return .size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key @ Additional constants for converting to bsaes. .type _vpaes_convert_consts,%object .align 4 _vpaes_convert_consts: @ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear @ transform in the AES S-box. 0x63 is incorporated into the low half of the @ table. This was computed with the following script: @ @ def u64s_to_u128(x, y): @ return x | (y << 64) @ def u128_to_u64s(w): @ return w & ((1<<64)-1), w >> 64 @ def get_byte(w, i): @ return (w >> (i*8)) & 0xff @ def apply_table(table, b): @ lo = b & 0xf @ hi = b >> 4 @ return get_byte(table[0], lo) ^ get_byte(table[1], hi) @ def opt(b): @ table = [ @ u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808), @ u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0), @ ] @ return apply_table(table, b) @ def rot_byte(b, n): @ return 0xff & ((b << n) | (b >> (8-n))) @ def skew(x): @ return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^ @ rot_byte(x, 4)) @ table = [0, 0] @ for i in range(16): @ table[0] |= (skew(opt(i)) ^ 0x63) << (i*8) @ table[1] |= skew(opt(i<<4)) << (i*8) @ print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[0])) @ print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[1])) .Lk_opt_then_skew: .quad 0x9cb8436798bc4763, 0x6440bb9f6044bf9b .quad 0x1f30062936192f00, 0xb49bad829db284ab @ void vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes); .globl vpaes_encrypt_key_to_bsaes .hidden vpaes_encrypt_key_to_bsaes .type vpaes_encrypt_key_to_bsaes,%function .align 4 vpaes_encrypt_key_to_bsaes: stmdb sp!, {r11, lr} @ See _vpaes_schedule_core for the key schedule logic. In particular, @ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper), @ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last @ contain the transformations not in the bsaes representation. This @ function inverts those transforms. @ @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key @ representation, which does not match the other aes_nohw_* @ implementations. The ARM aes_nohw_* stores each 32-bit word @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the @ cost of extra REV and VREV32 operations in little-endian ARM. vmov.i8 q9, #0x0f @ Required by _vpaes_schedule_transform adr r2, .Lk_mc_forward @ Must be aligned to 8 mod 16. add r3, r2, 0x90 @ .Lk_sr+0x10-.Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression) vld1.64 {q12}, [r2] vmov.i8 q10, #0x5b @ .Lk_s63 from vpaes-x86_64 adr r11, .Lk_opt @ Must be aligned to 8 mod 16. vmov.i8 q11, #0x63 @ .LK_s63 without .Lk_ipt applied @ vpaes stores one fewer round count than bsaes, but the number of keys @ is the same. 
ldr r2, [r1,#240] add r2, r2, #1 str r2, [r0,#240] @ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt). @ Invert this with .Lk_opt. vld1.64 {q0}, [r1]! bl _vpaes_schedule_transform vrev32.8 q0, q0 vst1.64 {q0}, [r0]! @ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied, @ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63, @ multiplies by the circulant 0,1,1,1, then applies ShiftRows. .Loop_enc_key_to_bsaes: vld1.64 {q0}, [r1]! @ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle @ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30. @ We use r3 rather than r8 to avoid a callee-saved register. vld1.64 {q1}, [r3] vtbl.8 d4, {q0}, d2 vtbl.8 d5, {q0}, d3 add r3, r3, #16 and r3, r3, #~(1<<6) vmov q0, q2 @ Handle the last key differently. subs r2, r2, #1 beq .Loop_enc_key_to_bsaes_last @ Multiply by the circulant. This is its own inverse. vtbl.8 d2, {q0}, d24 vtbl.8 d3, {q0}, d25 vmov q0, q1 vtbl.8 d4, {q1}, d24 vtbl.8 d5, {q1}, d25 veor q0, q0, q2 vtbl.8 d2, {q2}, d24 vtbl.8 d3, {q2}, d25 veor q0, q0, q1 @ XOR and finish. veor q0, q0, q10 bl _vpaes_schedule_transform vrev32.8 q0, q0 vst1.64 {q0}, [r0]! b .Loop_enc_key_to_bsaes .Loop_enc_key_to_bsaes_last: @ The final key does not have a basis transform (note @ .Lschedule_mangle_last inverts the original transform). It only XORs @ 0x63 and applies ShiftRows. The latter was already inverted in the @ loop. Note that, because we act on the original representation, we use @ q11, not q10. veor q0, q0, q11 vrev32.8 q0, q0 vst1.64 {q0}, [r0] @ Wipe registers which contained key material. veor q0, q0, q0 veor q1, q1, q1 veor q2, q2, q2 ldmia sp!, {r11, pc} @ return .size vpaes_encrypt_key_to_bsaes,.-vpaes_encrypt_key_to_bsaes .globl vpaes_ctr32_encrypt_blocks .hidden vpaes_ctr32_encrypt_blocks .type vpaes_ctr32_encrypt_blocks,%function .align 4 vpaes_ctr32_encrypt_blocks: mov ip, sp stmdb sp!, {r7,r8,r9,r10,r11, lr} @ This function uses q4-q7 (d8-d15), which are callee-saved. vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} cmp r2, #0 @ r8 is passed on the stack. ldr r8, [ip] beq .Lctr32_done @ _vpaes_encrypt_core expects the key in r2, so swap r2 and r3. mov r9, r3 mov r3, r2 mov r2, r9 @ Load the IV and counter portion. ldr r7, [r8, #12] vld1.8 {q7}, [r8] bl _vpaes_preheat rev r7, r7 @ The counter is big-endian. .Lctr32_loop: vmov q0, q7 vld1.8 {q6}, [r0]! @ .Load input ahead of time bl _vpaes_encrypt_core veor q0, q0, q6 @ XOR input and result vst1.8 {q0}, [r1]! subs r3, r3, #1 @ Update the counter. add r7, r7, #1 rev r9, r7 vmov.32 d15[1], r9 bne .Lctr32_loop .Lctr32_done: vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return .size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks #endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) ring-0.17.14/pregenerated/vpaes-armv8-ios64.S000064400000000000000000000611161046102023000166460ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
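// The AArch64 code below follows the same vector-permute structure as the
// ARMv7 file above: each byte of the state is split into its low and high
// nibble, each nibble indexes a 16-entry table via tbl, and the two results
// are combined with eor. A rough sketch of one such lookup, in the same
// Python-style comment notation already used for _vpaes_convert_consts in
// the ARMv7 file (get_byte and apply_table are the helpers defined there;
// this sketch is illustrative, not generator output):
//
//   def apply_table(table, b):
//       lo = b & 0xf
//       hi = b >> 4
//       return get_byte(table[0], lo) ^ get_byte(table[1], hi)
//
// One tbl instruction evaluates sixteen of these 4-bit lookups at once, which
// is how the input/output transforms and the S-box decomposition are computed
// with table permutes rather than byte-indexed loads.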
#include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) .section __TEXT,__const .align 7 // totally strategic alignment _vpaes_consts: Lk_mc_forward: // mc_forward .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 .quad 0x080B0A0904070605, 0x000302010C0F0E0D .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 .quad 0x000302010C0F0E0D, 0x080B0A0904070605 Lk_mc_backward: // mc_backward .quad 0x0605040702010003, 0x0E0D0C0F0A09080B .quad 0x020100030E0D0C0F, 0x0A09080B06050407 .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 .quad 0x0A09080B06050407, 0x020100030E0D0C0F Lk_sr: // sr .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 .quad 0x030E09040F0A0500, 0x0B06010C07020D08 .quad 0x0F060D040B020900, 0x070E050C030A0108 .quad 0x0B0E0104070A0D00, 0x0306090C0F020508 // // "Hot" constants // Lk_inv: // inv, inva .quad 0x0E05060F0D080180, 0x040703090A0B0C02 .quad 0x01040A060F0B0780, 0x030D0E0C02050809 Lk_ipt: // input transform (lo, hi) .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 Lk_sbo: // sbou, sbot .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA Lk_sb1: // sb1u, sb1t .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 Lk_sb2: // sb2u, sb2t .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD // // Key schedule constants // Lk_dksd: // decryption key schedule: invskew x*D .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E Lk_dksb: // decryption key schedule: invskew x*B .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 Lk_dkse: // decryption key schedule: invskew x*E + 0x63 .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 Lk_dks9: // decryption key schedule: invskew x*9 .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE Lk_rcon: // rcon .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 Lk_opt: // output transform .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 Lk_deskew: // deskew tables: inverts the sbox's "skew" .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 .byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 .align 2 .align 6 .text ## ## _aes_preheat ## ## Fills register %r10 -> .aes_consts (so you can -fPIC) ## and %xmm9-%xmm15 as specified below. ## .align 4 _vpaes_encrypt_preheat: adrp x10, Lk_inv@PAGE add x10, x10, Lk_inv@PAGEOFF movi v17.16b, #0x0f ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // Lk_ipt, Lk_sbo ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // Lk_sb1, Lk_sb2 ret ## ## _aes_encrypt_core ## ## AES-encrypt %xmm0. 
## ## Inputs: ## %xmm0 = input ## %xmm9-%xmm15 as in _vpaes_preheat ## (%rdx) = scheduled keys ## ## Output in %xmm0 ## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax ## Preserves %xmm6 - %xmm8 so you get some local vectors ## ## .align 4 _vpaes_encrypt_core: mov x9, x2 ldr w8, [x2,#240] // pull rounds adrp x11, Lk_mc_forward@PAGE+16 add x11, x11, Lk_mc_forward@PAGEOFF+16 // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 b Lenc_entry .align 4 Lenc_loop: // middle of middle round add x10, x11, #0x40 tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[] tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[] tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D and x11, x11, #~(1<<6) // and $0x30, %r11 # ... 
mod 4 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D sub w8, w8, #1 // nr-- Lenc_entry: // top of round and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 cbnz w8, Lenc_loop // middle of last round add x10, x11, #0x80 // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[] tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 ret .align 4 _vpaes_encrypt_2x: mov x9, x2 ldr w8, [x2,#240] // pull rounds adrp x11, Lk_mc_forward@PAGE+16 add x11, x11, Lk_mc_forward@PAGEOFF+16 // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 and v9.16b, v15.16b, v17.16b ushr v8.16b, v15.16b, #4 tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 tbl v9.16b, {v20.16b}, v9.16b // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 tbl v10.16b, {v21.16b}, v8.16b eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 eor v8.16b, v9.16b, v16.16b eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 eor v8.16b, v8.16b, v10.16b b Lenc_2x_entry .align 4 Lenc_2x_loop: // middle of middle round add x10, x11, #0x40 tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u tbl v12.16b, {v25.16b}, v10.16b ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[] tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t tbl v8.16b, {v24.16b}, v11.16b eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k eor v12.16b, v12.16b, v16.16b tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u tbl v13.16b, {v27.16b}, v10.16b eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A eor v8.16b, v8.16b, v12.16b tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t tbl v10.16b, {v26.16b}, v11.16b ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[] tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B tbl v11.16b, {v8.16b}, v1.16b eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A eor v10.16b, v10.16b, v13.16b tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D tbl v8.16b, {v8.16b}, v4.16b eor v3.16b, v3.16b, v2.16b // 
vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B eor v11.16b, v11.16b, v10.16b tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C tbl v12.16b, {v11.16b},v1.16b eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D eor v8.16b, v8.16b, v11.16b and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D eor v8.16b, v8.16b, v12.16b sub w8, w8, #1 // nr-- Lenc_2x_entry: // top of round and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i and v9.16b, v8.16b, v17.16b ushr v8.16b, v8.16b, #4 tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k tbl v13.16b, {v19.16b},v9.16b eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j eor v9.16b, v9.16b, v8.16b tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i tbl v11.16b, {v18.16b},v8.16b tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j tbl v12.16b, {v18.16b},v9.16b eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k eor v11.16b, v11.16b, v13.16b eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k eor v12.16b, v12.16b, v13.16b tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak tbl v10.16b, {v18.16b},v11.16b tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak tbl v11.16b, {v18.16b},v12.16b eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io eor v10.16b, v10.16b, v9.16b eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo eor v11.16b, v11.16b, v8.16b ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 cbnz w8, Lenc_2x_loop // middle of last round add x10, x11, #0x80 // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou tbl v12.16b, {v22.16b}, v10.16b ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[] tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t tbl v8.16b, {v23.16b}, v11.16b eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k eor v12.16b, v12.16b, v16.16b eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A eor v8.16b, v8.16b, v12.16b tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 tbl v1.16b, {v8.16b},v1.16b ret ######################################################## ## ## ## AES key schedule ## ## ## ######################################################## .align 4 _vpaes_key_preheat: adrp x10, Lk_inv@PAGE add x10, x10, Lk_inv@PAGEOFF movi v16.16b, #0x5b // Lk_s63 adrp x11, Lk_sb1@PAGE add x11, x11, Lk_sb1@PAGEOFF movi v17.16b, #0x0f // Lk_s0F ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // Lk_inv, Lk_ipt adrp x10, Lk_dksd@PAGE add x10, x10, Lk_dksd@PAGEOFF ld1 {v22.2d,v23.2d}, [x11] // Lk_sb1 adrp x11, Lk_mc_forward@PAGE add x11, x11, Lk_mc_forward@PAGEOFF ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // Lk_dksd, Lk_dksb ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // Lk_dkse, Lk_dks9 ld1 {v8.2d}, [x10] // Lk_rcon ld1 {v9.2d}, [x11] // Lk_mc_forward[0] ret .align 4 _vpaes_schedule_core: AARCH64_SIGN_LINK_REGISTER stp x29, x30, [sp,#-16]! 
add x29,sp,#0 bl _vpaes_key_preheat // load the tables ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned) // input transform mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3 bl _vpaes_schedule_transform mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 adrp x10, Lk_sr@PAGE // lea Lk_sr(%rip),%r10 add x10, x10, Lk_sr@PAGEOFF add x8, x8, x10 // encrypting, output zeroth round key after transform st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) cmp w1, #192 // cmp $192, %esi b.hi Lschedule_256 b.eq Lschedule_192 // 128: fall though ## ## .schedule_128 ## ## 128-bit specific part of key schedule. ## ## This schedule is really simple, because all its parts ## are accomplished by the subroutines. ## Lschedule_128: mov x0, #10 // mov $10, %esi Loop_schedule_128: sub x0, x0, #1 // dec %esi bl _vpaes_schedule_round cbz x0, Lschedule_mangle_last bl _vpaes_schedule_mangle // write output b Loop_schedule_128 ## ## .aes_schedule_192 ## ## 192-bit specific part of key schedule. ## ## The main body of this schedule is the same as the 128-bit ## schedule, but with more smearing. The long, high side is ## stored in %xmm7 as before, and the short, low side is in ## the high bits of %xmm6. ## ## This schedule is somewhat nastier, however, because each ## round produces 192 bits of key material, or 1.5 round keys. ## Therefore, on each cycle we do 2 rounds and produce 3 round ## keys. ## .align 4 Lschedule_192: sub x0, x0, #8 ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) bl _vpaes_schedule_transform // input transform mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4 ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros mov x0, #4 // mov $4, %esi Loop_schedule_192: sub x0, x0, #1 // dec %esi bl _vpaes_schedule_round ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0 bl _vpaes_schedule_mangle // save key n bl _vpaes_schedule_192_smear bl _vpaes_schedule_mangle // save key n+1 bl _vpaes_schedule_round cbz x0, Lschedule_mangle_last bl _vpaes_schedule_mangle // save key n+2 bl _vpaes_schedule_192_smear b Loop_schedule_192 ## ## .aes_schedule_256 ## ## 256-bit specific part of key schedule. ## ## The structure here is very similar to the 128-bit ## schedule, but with an additional "low side" in ## %xmm6. The low side's rounds are the same as the ## high side's, except no rcon and no rotation. ## .align 4 Lschedule_256: ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) bl _vpaes_schedule_transform // input transform mov x0, #7 // mov $7, %esi Loop_schedule_256: sub x0, x0, #1 // dec %esi bl _vpaes_schedule_mangle // output low result mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 // high round bl _vpaes_schedule_round cbz x0, Lschedule_mangle_last bl _vpaes_schedule_mangle // low round. swap xmm7 and xmm6 dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 movi v4.16b, #0 mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5 mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7 bl _vpaes_schedule_low_round mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7 b Loop_schedule_256 ## ## .aes_schedule_mangle_last ## ## Mangler for last round of key schedule ## Mangles %xmm0 ## when encrypting, outputs out(%xmm0) ^ 63 ## when decrypting, outputs unskew(%xmm0) ## ## Always called right before return... 
jumps to cleanup and exits ## .align 4 Lschedule_mangle_last: // schedule last round key from xmm0 adrp x11, Lk_deskew@PAGE // lea Lk_deskew(%rip),%r11 # prepare to deskew add x11, x11, Lk_deskew@PAGEOFF cbnz w3, Lschedule_mangle_last_dec // encrypting ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 adrp x11, Lk_opt@PAGE // lea Lk_opt(%rip), %r11 # prepare to output transform add x11, x11, Lk_opt@PAGEOFF add x2, x2, #32 // add $32, %rdx tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute Lschedule_mangle_last_dec: ld1 {v20.2d,v21.2d}, [x11] // reload constants sub x2, x2, #16 // add $-16, %rdx eor v0.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm0 bl _vpaes_schedule_transform // output transform st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key // cleanup eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2 eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3 eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 ldp x29, x30, [sp],#16 AARCH64_VALIDATE_LINK_REGISTER ret ## ## .aes_schedule_192_smear ## ## Smear the short, low side in the 192-bit key schedule. ## ## Inputs: ## %xmm7: high side, b a x y ## %xmm6: low side, d c 0 0 ## %xmm13: 0 ## ## Outputs: ## %xmm6: b+c+d b+c 0 0 ## %xmm0: b+c+d b+c b a ## .align 4 _vpaes_schedule_192_smear: movi v1.16b, #0 dup v0.4s, v7.s[3] ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0 ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros ret ## ## .aes_schedule_round ## ## Runs one main round of the key schedule on %xmm0, %xmm7 ## ## Specifically, runs subbytes on the high dword of %xmm0 ## then rotates it by one byte and xors into the low dword of ## %xmm7. ## ## Adds rcon from low byte of %xmm8, then rotates %xmm8 for ## next rcon. ## ## Smears the dwords of %xmm7 by xoring the low into the ## second low, result into third, result into highest. ## ## Returns results in %xmm7 = %xmm0. ## Clobbers %xmm1-%xmm4, %r11. ## .align 4 _vpaes_schedule_round: // extract rcon from xmm8 movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4 ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1 ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 // rotate dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0 // fall through... // low round: same as high round, but no rotation and no rcon. 
_vpaes_schedule_low_round: // smear xmm7 ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4 // subbytes and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7 tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j eor v7.16b, v7.16b, v16.16b // vpxor Lk_s63(%rip), %xmm7, %xmm7 tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output // add in smeared stuff eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0 eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7 ret ## ## .aes_schedule_transform ## ## Linear-transform %xmm0 according to tables at (%r11) ## ## Requires that %xmm9 = 0x0F0F... as in preheat ## Output in %xmm0 ## Clobbers %xmm1, %xmm2 ## .align 4 _vpaes_schedule_transform: and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 // vmovdqa (%r11), %xmm2 # lo tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 // vmovdqa 16(%r11), %xmm1 # hi tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 ret ## ## .aes_schedule_mangle ## ## Mangle xmm0 from (basis-transformed) standard version ## to our version. ## ## On encrypt, ## xor with 0x63 ## multiply by circulant 0,1,1,1 ## apply shiftrows transform ## ## On decrypt, ## xor with 0x63 ## multiply by "inverse mixcolumns" circulant E,B,D,9 ## deskew ## apply shiftrows transform ## ## ## Writes out to (%rdx), and increments or decrements it ## Keeps track of round number mod 4 in %r8 ## Preserves xmm0 ## Clobbers xmm1-xmm5 ## .align 4 _vpaes_schedule_mangle: mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later // vmovdqa .Lk_mc_forward(%rip),%xmm5 // encrypting eor v4.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm4 add x2, x2, #16 // add $16, %rdx tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 Lschedule_mangle_both: tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 add x8, x8, #48 // add $-16, %r8 and x8, x8, #~(1<<6) // and $0x30, %r8 st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) ret .globl _vpaes_set_encrypt_key .private_extern _vpaes_set_encrypt_key .align 4 _vpaes_set_encrypt_key: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! 
add x29,sp,#0 stp d8,d9,[sp,#-16]! // ABI spec says so lsr w9, w1, #5 // shr $5,%eax add w9, w9, #5 // $5,%eax str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; mov w3, #0 // mov $0,%ecx mov x8, #0x30 // mov $0x30,%r8d bl _vpaes_schedule_core eor x0, x0, x0 ldp d8,d9,[sp],#16 ldp x29,x30,[sp],#16 AARCH64_VALIDATE_LINK_REGISTER ret .globl _vpaes_ctr32_encrypt_blocks .private_extern _vpaes_ctr32_encrypt_blocks .align 4 _vpaes_ctr32_encrypt_blocks: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 stp d8,d9,[sp,#-16]! // ABI spec says so stp d10,d11,[sp,#-16]! stp d12,d13,[sp,#-16]! stp d14,d15,[sp,#-16]! cbz x2, Lctr32_done // Note, unlike the other functions, x2 here is measured in blocks, // not bytes. mov x17, x2 mov x2, x3 // Load the IV and counter portion. ldr w6, [x4, #12] ld1 {v7.16b}, [x4] bl _vpaes_encrypt_preheat tst x17, #1 rev w6, w6 // The counter is big-endian. b.eq Lctr32_prep_loop // Handle one block so the remaining block count is even for // _vpaes_encrypt_2x. ld1 {v6.16b}, [x0], #16 // Load input ahead of time bl _vpaes_encrypt_core eor v0.16b, v0.16b, v6.16b // XOR input and result st1 {v0.16b}, [x1], #16 subs x17, x17, #1 // Update the counter. add w6, w6, #1 rev w7, w6 mov v7.s[3], w7 b.ls Lctr32_done Lctr32_prep_loop: // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x // uses v14 and v15. mov v15.16b, v7.16b mov v14.16b, v7.16b add w6, w6, #1 rev w7, w6 mov v15.s[3], w7 Lctr32_loop: ld1 {v6.16b,v7.16b}, [x0], #32 // Load input ahead of time bl _vpaes_encrypt_2x eor v0.16b, v0.16b, v6.16b // XOR input and result eor v1.16b, v1.16b, v7.16b // XOR input and result (#2) st1 {v0.16b,v1.16b}, [x1], #32 subs x17, x17, #2 // Update the counter. add w7, w6, #1 add w6, w6, #2 rev w7, w7 mov v14.s[3], w7 rev w7, w6 mov v15.s[3], w7 b.hi Lctr32_loop Lctr32_done: ldp d14,d15,[sp],#16 ldp d12,d13,[sp],#16 ldp d10,d11,[sp],#16 ldp d8,d9,[sp],#16 ldp x29,x30,[sp],#16 AARCH64_VALIDATE_LINK_REGISTER ret #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) ring-0.17.14/pregenerated/vpaes-armv8-linux64.S000064400000000000000000000631361046102023000172170ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
#include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) .section .rodata .type _vpaes_consts,%object .align 7 // totally strategic alignment _vpaes_consts: .Lk_mc_forward: // mc_forward .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 .quad 0x080B0A0904070605, 0x000302010C0F0E0D .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 .quad 0x000302010C0F0E0D, 0x080B0A0904070605 .Lk_mc_backward: // mc_backward .quad 0x0605040702010003, 0x0E0D0C0F0A09080B .quad 0x020100030E0D0C0F, 0x0A09080B06050407 .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 .quad 0x0A09080B06050407, 0x020100030E0D0C0F .Lk_sr: // sr .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 .quad 0x030E09040F0A0500, 0x0B06010C07020D08 .quad 0x0F060D040B020900, 0x070E050C030A0108 .quad 0x0B0E0104070A0D00, 0x0306090C0F020508 // // "Hot" constants // .Lk_inv: // inv, inva .quad 0x0E05060F0D080180, 0x040703090A0B0C02 .quad 0x01040A060F0B0780, 0x030D0E0C02050809 .Lk_ipt: // input transform (lo, hi) .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 .Lk_sbo: // sbou, sbot .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA .Lk_sb1: // sb1u, sb1t .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 .Lk_sb2: // sb2u, sb2t .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD // // Key schedule constants // .Lk_dksd: // decryption key schedule: invskew x*D .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E .Lk_dksb: // decryption key schedule: invskew x*B .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 .Lk_dkse: // decryption key schedule: invskew x*E + 0x63 .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 .Lk_dks9: // decryption key schedule: invskew x*9 .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE .Lk_rcon: // rcon .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 .Lk_opt: // output transform .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 .Lk_deskew: // deskew tables: inverts the sbox's "skew" .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 .byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 .align 2 .size _vpaes_consts,.-_vpaes_consts .align 6 .text ## ## _aes_preheat ## ## Fills register %r10 -> .aes_consts (so you can -fPIC) ## and %xmm9-%xmm15 as specified below. ## .type _vpaes_encrypt_preheat,%function .align 4 _vpaes_encrypt_preheat: adrp x10, .Lk_inv add x10, x10, :lo12:.Lk_inv movi v17.16b, #0x0f ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // .Lk_sb1, .Lk_sb2 ret .size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat ## ## _aes_encrypt_core ## ## AES-encrypt %xmm0. 
## ## Inputs: ## %xmm0 = input ## %xmm9-%xmm15 as in _vpaes_preheat ## (%rdx) = scheduled keys ## ## Output in %xmm0 ## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax ## Preserves %xmm6 - %xmm8 so you get some local vectors ## ## .type _vpaes_encrypt_core,%function .align 4 _vpaes_encrypt_core: mov x9, x2 ldr w8, [x2,#240] // pull rounds adrp x11, .Lk_mc_forward+16 add x11, x11, :lo12:.Lk_mc_forward+16 // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 b .Lenc_entry .align 4 .Lenc_loop: // middle of middle round add x10, x11, #0x40 tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D and x11, x11, #~(1<<6) // and $0x30, %r11 # ... 
mod 4 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D sub w8, w8, #1 // nr-- .Lenc_entry: // top of round and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 cbnz w8, .Lenc_loop // middle of last round add x10, x11, #0x80 // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 ret .size _vpaes_encrypt_core,.-_vpaes_encrypt_core .type _vpaes_encrypt_2x,%function .align 4 _vpaes_encrypt_2x: mov x9, x2 ldr w8, [x2,#240] // pull rounds adrp x11, .Lk_mc_forward+16 add x11, x11, :lo12:.Lk_mc_forward+16 // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 and v9.16b, v15.16b, v17.16b ushr v8.16b, v15.16b, #4 tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 tbl v9.16b, {v20.16b}, v9.16b // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 tbl v10.16b, {v21.16b}, v8.16b eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 eor v8.16b, v9.16b, v16.16b eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 eor v8.16b, v8.16b, v10.16b b .Lenc_2x_entry .align 4 .Lenc_2x_loop: // middle of middle round add x10, x11, #0x40 tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u tbl v12.16b, {v25.16b}, v10.16b ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t tbl v8.16b, {v24.16b}, v11.16b eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k eor v12.16b, v12.16b, v16.16b tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u tbl v13.16b, {v27.16b}, v10.16b eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A eor v8.16b, v8.16b, v12.16b tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t tbl v10.16b, {v26.16b}, v11.16b ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B tbl v11.16b, {v8.16b}, v1.16b eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A eor v10.16b, v10.16b, v13.16b tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, 
%xmm0, %xmm0 # 3 = D tbl v8.16b, {v8.16b}, v4.16b eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B eor v11.16b, v11.16b, v10.16b tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C tbl v12.16b, {v11.16b},v1.16b eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D eor v8.16b, v8.16b, v11.16b and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D eor v8.16b, v8.16b, v12.16b sub w8, w8, #1 // nr-- .Lenc_2x_entry: // top of round and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i and v9.16b, v8.16b, v17.16b ushr v8.16b, v8.16b, #4 tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k tbl v13.16b, {v19.16b},v9.16b eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j eor v9.16b, v9.16b, v8.16b tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i tbl v11.16b, {v18.16b},v8.16b tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j tbl v12.16b, {v18.16b},v9.16b eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k eor v11.16b, v11.16b, v13.16b eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k eor v12.16b, v12.16b, v13.16b tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak tbl v10.16b, {v18.16b},v11.16b tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak tbl v11.16b, {v18.16b},v12.16b eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io eor v10.16b, v10.16b, v9.16b eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo eor v11.16b, v11.16b, v8.16b ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 cbnz w8, .Lenc_2x_loop // middle of last round add x10, x11, #0x80 // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou tbl v12.16b, {v22.16b}, v10.16b ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t tbl v8.16b, {v23.16b}, v11.16b eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k eor v12.16b, v12.16b, v16.16b eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A eor v8.16b, v8.16b, v12.16b tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 tbl v1.16b, {v8.16b},v1.16b ret .size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x ######################################################## ## ## ## AES key schedule ## ## ## ######################################################## .type _vpaes_key_preheat,%function .align 4 _vpaes_key_preheat: adrp x10, .Lk_inv add x10, x10, :lo12:.Lk_inv movi v16.16b, #0x5b // .Lk_s63 adrp x11, .Lk_sb1 add x11, x11, :lo12:.Lk_sb1 movi v17.16b, #0x0f // .Lk_s0F ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // .Lk_inv, .Lk_ipt adrp x10, .Lk_dksd add x10, x10, :lo12:.Lk_dksd ld1 {v22.2d,v23.2d}, [x11] // .Lk_sb1 adrp x11, .Lk_mc_forward add x11, x11, :lo12:.Lk_mc_forward ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9 ld1 {v8.2d}, [x10] // .Lk_rcon ld1 {v9.2d}, [x11] // .Lk_mc_forward[0] ret .size _vpaes_key_preheat,.-_vpaes_key_preheat .type _vpaes_schedule_core,%function .align 4 _vpaes_schedule_core: AARCH64_SIGN_LINK_REGISTER stp x29, x30, [sp,#-16]! 
add x29,sp,#0 bl _vpaes_key_preheat // load the tables ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned) // input transform mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3 bl _vpaes_schedule_transform mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 adrp x10, .Lk_sr // lea .Lk_sr(%rip),%r10 add x10, x10, :lo12:.Lk_sr add x8, x8, x10 // encrypting, output zeroth round key after transform st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) cmp w1, #192 // cmp $192, %esi b.hi .Lschedule_256 b.eq .Lschedule_192 // 128: fall though ## ## .schedule_128 ## ## 128-bit specific part of key schedule. ## ## This schedule is really simple, because all its parts ## are accomplished by the subroutines. ## .Lschedule_128: mov x0, #10 // mov $10, %esi .Loop_schedule_128: sub x0, x0, #1 // dec %esi bl _vpaes_schedule_round cbz x0, .Lschedule_mangle_last bl _vpaes_schedule_mangle // write output b .Loop_schedule_128 ## ## .aes_schedule_192 ## ## 192-bit specific part of key schedule. ## ## The main body of this schedule is the same as the 128-bit ## schedule, but with more smearing. The long, high side is ## stored in %xmm7 as before, and the short, low side is in ## the high bits of %xmm6. ## ## This schedule is somewhat nastier, however, because each ## round produces 192 bits of key material, or 1.5 round keys. ## Therefore, on each cycle we do 2 rounds and produce 3 round ## keys. ## .align 4 .Lschedule_192: sub x0, x0, #8 ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) bl _vpaes_schedule_transform // input transform mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4 ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros mov x0, #4 // mov $4, %esi .Loop_schedule_192: sub x0, x0, #1 // dec %esi bl _vpaes_schedule_round ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0 bl _vpaes_schedule_mangle // save key n bl _vpaes_schedule_192_smear bl _vpaes_schedule_mangle // save key n+1 bl _vpaes_schedule_round cbz x0, .Lschedule_mangle_last bl _vpaes_schedule_mangle // save key n+2 bl _vpaes_schedule_192_smear b .Loop_schedule_192 ## ## .aes_schedule_256 ## ## 256-bit specific part of key schedule. ## ## The structure here is very similar to the 128-bit ## schedule, but with an additional "low side" in ## %xmm6. The low side's rounds are the same as the ## high side's, except no rcon and no rotation. ## .align 4 .Lschedule_256: ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) bl _vpaes_schedule_transform // input transform mov x0, #7 // mov $7, %esi .Loop_schedule_256: sub x0, x0, #1 // dec %esi bl _vpaes_schedule_mangle // output low result mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 // high round bl _vpaes_schedule_round cbz x0, .Lschedule_mangle_last bl _vpaes_schedule_mangle // low round. swap xmm7 and xmm6 dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 movi v4.16b, #0 mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5 mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7 bl _vpaes_schedule_low_round mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7 b .Loop_schedule_256 ## ## .aes_schedule_mangle_last ## ## Mangler for last round of key schedule ## Mangles %xmm0 ## when encrypting, outputs out(%xmm0) ^ 63 ## when decrypting, outputs unskew(%xmm0) ## ## Always called right before return... 
jumps to cleanup and exits ## .align 4 .Lschedule_mangle_last: // schedule last round key from xmm0 adrp x11, .Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew add x11, x11, :lo12:.Lk_deskew cbnz w3, .Lschedule_mangle_last_dec // encrypting ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 adrp x11, .Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform add x11, x11, :lo12:.Lk_opt add x2, x2, #32 // add $32, %rdx tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute .Lschedule_mangle_last_dec: ld1 {v20.2d,v21.2d}, [x11] // reload constants sub x2, x2, #16 // add $-16, %rdx eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0 bl _vpaes_schedule_transform // output transform st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key // cleanup eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2 eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3 eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 ldp x29, x30, [sp],#16 AARCH64_VALIDATE_LINK_REGISTER ret .size _vpaes_schedule_core,.-_vpaes_schedule_core ## ## .aes_schedule_192_smear ## ## Smear the short, low side in the 192-bit key schedule. ## ## Inputs: ## %xmm7: high side, b a x y ## %xmm6: low side, d c 0 0 ## %xmm13: 0 ## ## Outputs: ## %xmm6: b+c+d b+c 0 0 ## %xmm0: b+c+d b+c b a ## .type _vpaes_schedule_192_smear,%function .align 4 _vpaes_schedule_192_smear: movi v1.16b, #0 dup v0.4s, v7.s[3] ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0 ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros ret .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear ## ## .aes_schedule_round ## ## Runs one main round of the key schedule on %xmm0, %xmm7 ## ## Specifically, runs subbytes on the high dword of %xmm0 ## then rotates it by one byte and xors into the low dword of ## %xmm7. ## ## Adds rcon from low byte of %xmm8, then rotates %xmm8 for ## next rcon. ## ## Smears the dwords of %xmm7 by xoring the low into the ## second low, result into third, result into highest. ## ## Returns results in %xmm7 = %xmm0. ## Clobbers %xmm1-%xmm4, %r11. ## .type _vpaes_schedule_round,%function .align 4 _vpaes_schedule_round: // extract rcon from xmm8 movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4 ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1 ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 // rotate dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0 // fall through... // low round: same as high round, but no rotation and no rcon. 
_vpaes_schedule_low_round: // smear xmm7 ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4 // subbytes and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7 tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7 tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output // add in smeared stuff eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0 eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7 ret .size _vpaes_schedule_round,.-_vpaes_schedule_round ## ## .aes_schedule_transform ## ## Linear-transform %xmm0 according to tables at (%r11) ## ## Requires that %xmm9 = 0x0F0F... as in preheat ## Output in %xmm0 ## Clobbers %xmm1, %xmm2 ## .type _vpaes_schedule_transform,%function .align 4 _vpaes_schedule_transform: and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 // vmovdqa (%r11), %xmm2 # lo tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 // vmovdqa 16(%r11), %xmm1 # hi tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 ret .size _vpaes_schedule_transform,.-_vpaes_schedule_transform ## ## .aes_schedule_mangle ## ## Mangle xmm0 from (basis-transformed) standard version ## to our version. 
## ## On encrypt, ## xor with 0x63 ## multiply by circulant 0,1,1,1 ## apply shiftrows transform ## ## On decrypt, ## xor with 0x63 ## multiply by "inverse mixcolumns" circulant E,B,D,9 ## deskew ## apply shiftrows transform ## ## ## Writes out to (%rdx), and increments or decrements it ## Keeps track of round number mod 4 in %r8 ## Preserves xmm0 ## Clobbers xmm1-xmm5 ## .type _vpaes_schedule_mangle,%function .align 4 _vpaes_schedule_mangle: mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later // vmovdqa .Lk_mc_forward(%rip),%xmm5 // encrypting eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4 add x2, x2, #16 // add $16, %rdx tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 .Lschedule_mangle_both: tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 add x8, x8, #48 // add $-16, %r8 and x8, x8, #~(1<<6) // and $0x30, %r8 st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) ret .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle .globl vpaes_set_encrypt_key .hidden vpaes_set_encrypt_key .type vpaes_set_encrypt_key,%function .align 4 vpaes_set_encrypt_key: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 stp d8,d9,[sp,#-16]! // ABI spec says so lsr w9, w1, #5 // shr $5,%eax add w9, w9, #5 // $5,%eax str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; mov w3, #0 // mov $0,%ecx mov x8, #0x30 // mov $0x30,%r8d bl _vpaes_schedule_core eor x0, x0, x0 ldp d8,d9,[sp],#16 ldp x29,x30,[sp],#16 AARCH64_VALIDATE_LINK_REGISTER ret .size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key .globl vpaes_ctr32_encrypt_blocks .hidden vpaes_ctr32_encrypt_blocks .type vpaes_ctr32_encrypt_blocks,%function .align 4 vpaes_ctr32_encrypt_blocks: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 stp d8,d9,[sp,#-16]! // ABI spec says so stp d10,d11,[sp,#-16]! stp d12,d13,[sp,#-16]! stp d14,d15,[sp,#-16]! cbz x2, .Lctr32_done // Note, unlike the other functions, x2 here is measured in blocks, // not bytes. mov x17, x2 mov x2, x3 // Load the IV and counter portion. ldr w6, [x4, #12] ld1 {v7.16b}, [x4] bl _vpaes_encrypt_preheat tst x17, #1 rev w6, w6 // The counter is big-endian. b.eq .Lctr32_prep_loop // Handle one block so the remaining block count is even for // _vpaes_encrypt_2x. ld1 {v6.16b}, [x0], #16 // .Load input ahead of time bl _vpaes_encrypt_core eor v0.16b, v0.16b, v6.16b // XOR input and result st1 {v0.16b}, [x1], #16 subs x17, x17, #1 // Update the counter. add w6, w6, #1 rev w7, w6 mov v7.s[3], w7 b.ls .Lctr32_done .Lctr32_prep_loop: // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x // uses v14 and v15. mov v15.16b, v7.16b mov v14.16b, v7.16b add w6, w6, #1 rev w7, w6 mov v15.s[3], w7 .Lctr32_loop: ld1 {v6.16b,v7.16b}, [x0], #32 // .Load input ahead of time bl _vpaes_encrypt_2x eor v0.16b, v0.16b, v6.16b // XOR input and result eor v1.16b, v1.16b, v7.16b // XOR input and result (#2) st1 {v0.16b,v1.16b}, [x1], #32 subs x17, x17, #2 // Update the counter. 
add w7, w6, #1 add w6, w6, #2 rev w7, w7 mov v14.s[3], w7 rev w7, w6 mov v15.s[3], w7 b.hi .Lctr32_loop .Lctr32_done: ldp d14,d15,[sp],#16 ldp d12,d13,[sp],#16 ldp d10,d11,[sp],#16 ldp d8,d9,[sp],#16 ldp x29,x30,[sp],#16 AARCH64_VALIDATE_LINK_REGISTER ret .size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) ring-0.17.14/pregenerated/vpaes-armv8-win64.S000064400000000000000000000616321046102023000166540ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) .section .rodata .align 7 // totally strategic alignment _vpaes_consts: Lk_mc_forward: // mc_forward .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 .quad 0x080B0A0904070605, 0x000302010C0F0E0D .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 .quad 0x000302010C0F0E0D, 0x080B0A0904070605 Lk_mc_backward: // mc_backward .quad 0x0605040702010003, 0x0E0D0C0F0A09080B .quad 0x020100030E0D0C0F, 0x0A09080B06050407 .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 .quad 0x0A09080B06050407, 0x020100030E0D0C0F Lk_sr: // sr .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 .quad 0x030E09040F0A0500, 0x0B06010C07020D08 .quad 0x0F060D040B020900, 0x070E050C030A0108 .quad 0x0B0E0104070A0D00, 0x0306090C0F020508 // // "Hot" constants // Lk_inv: // inv, inva .quad 0x0E05060F0D080180, 0x040703090A0B0C02 .quad 0x01040A060F0B0780, 0x030D0E0C02050809 Lk_ipt: // input transform (lo, hi) .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 Lk_sbo: // sbou, sbot .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA Lk_sb1: // sb1u, sb1t .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 Lk_sb2: // sb2u, sb2t .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD // // Key schedule constants // Lk_dksd: // decryption key schedule: invskew x*D .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E Lk_dksb: // decryption key schedule: invskew x*B .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 Lk_dkse: // decryption key schedule: invskew x*E + 0x63 .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 Lk_dks9: // decryption key schedule: invskew x*9 .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE Lk_rcon: // rcon .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 Lk_opt: // output transform .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 Lk_deskew: // deskew tables: inverts the sbox's "skew" .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 .byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 .align 2 .align 6 .text ## ## _aes_preheat ## ## Fills register %r10 -> .aes_consts (so you can -fPIC) ## and %xmm9-%xmm15 as specified below. 
## .def _vpaes_encrypt_preheat .type 32 .endef .align 4 _vpaes_encrypt_preheat: adrp x10, Lk_inv add x10, x10, :lo12:Lk_inv movi v17.16b, #0x0f ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // Lk_ipt, Lk_sbo ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // Lk_sb1, Lk_sb2 ret ## ## _aes_encrypt_core ## ## AES-encrypt %xmm0. ## ## Inputs: ## %xmm0 = input ## %xmm9-%xmm15 as in _vpaes_preheat ## (%rdx) = scheduled keys ## ## Output in %xmm0 ## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax ## Preserves %xmm6 - %xmm8 so you get some local vectors ## ## .def _vpaes_encrypt_core .type 32 .endef .align 4 _vpaes_encrypt_core: mov x9, x2 ldr w8, [x2,#240] // pull rounds adrp x11, Lk_mc_forward+16 add x11, x11, :lo12:Lk_mc_forward+16 // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 b Lenc_entry .align 4 Lenc_loop: // middle of middle round add x10, x11, #0x40 tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[] tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[] tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D and x11, x11, #~(1<<6) // and $0x30, %r11 # ... 
mod 4 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D sub w8, w8, #1 // nr-- Lenc_entry: // top of round and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 cbnz w8, Lenc_loop // middle of last round add x10, x11, #0x80 // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[] tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 ret .def _vpaes_encrypt_2x .type 32 .endef .align 4 _vpaes_encrypt_2x: mov x9, x2 ldr w8, [x2,#240] // pull rounds adrp x11, Lk_mc_forward+16 add x11, x11, :lo12:Lk_mc_forward+16 // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 and v9.16b, v15.16b, v17.16b ushr v8.16b, v15.16b, #4 tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 tbl v9.16b, {v20.16b}, v9.16b // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 tbl v10.16b, {v21.16b}, v8.16b eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 eor v8.16b, v9.16b, v16.16b eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 eor v8.16b, v8.16b, v10.16b b Lenc_2x_entry .align 4 Lenc_2x_loop: // middle of middle round add x10, x11, #0x40 tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u tbl v12.16b, {v25.16b}, v10.16b ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[] tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t tbl v8.16b, {v24.16b}, v11.16b eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k eor v12.16b, v12.16b, v16.16b tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u tbl v13.16b, {v27.16b}, v10.16b eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A eor v8.16b, v8.16b, v12.16b tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t tbl v10.16b, {v26.16b}, v11.16b ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[] tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B tbl v11.16b, {v8.16b}, v1.16b eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A eor v10.16b, v10.16b, v13.16b tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D tbl v8.16b, {v8.16b}, v4.16b 
eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B eor v11.16b, v11.16b, v10.16b tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C tbl v12.16b, {v11.16b},v1.16b eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D eor v8.16b, v8.16b, v11.16b and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D eor v8.16b, v8.16b, v12.16b sub w8, w8, #1 // nr-- Lenc_2x_entry: // top of round and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i and v9.16b, v8.16b, v17.16b ushr v8.16b, v8.16b, #4 tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k tbl v13.16b, {v19.16b},v9.16b eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j eor v9.16b, v9.16b, v8.16b tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i tbl v11.16b, {v18.16b},v8.16b tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j tbl v12.16b, {v18.16b},v9.16b eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k eor v11.16b, v11.16b, v13.16b eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k eor v12.16b, v12.16b, v13.16b tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak tbl v10.16b, {v18.16b},v11.16b tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak tbl v11.16b, {v18.16b},v12.16b eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io eor v10.16b, v10.16b, v9.16b eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo eor v11.16b, v11.16b, v8.16b ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 cbnz w8, Lenc_2x_loop // middle of last round add x10, x11, #0x80 // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou tbl v12.16b, {v22.16b}, v10.16b ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[] tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t tbl v8.16b, {v23.16b}, v11.16b eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k eor v12.16b, v12.16b, v16.16b eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A eor v8.16b, v8.16b, v12.16b tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 tbl v1.16b, {v8.16b},v1.16b ret ######################################################## ## ## ## AES key schedule ## ## ## ######################################################## .def _vpaes_key_preheat .type 32 .endef .align 4 _vpaes_key_preheat: adrp x10, Lk_inv add x10, x10, :lo12:Lk_inv movi v16.16b, #0x5b // Lk_s63 adrp x11, Lk_sb1 add x11, x11, :lo12:Lk_sb1 movi v17.16b, #0x0f // Lk_s0F ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // Lk_inv, Lk_ipt adrp x10, Lk_dksd add x10, x10, :lo12:Lk_dksd ld1 {v22.2d,v23.2d}, [x11] // Lk_sb1 adrp x11, Lk_mc_forward add x11, x11, :lo12:Lk_mc_forward ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // Lk_dksd, Lk_dksb ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // Lk_dkse, Lk_dks9 ld1 {v8.2d}, [x10] // Lk_rcon ld1 {v9.2d}, [x11] // Lk_mc_forward[0] ret .def _vpaes_schedule_core .type 32 .endef .align 4 _vpaes_schedule_core: AARCH64_SIGN_LINK_REGISTER stp x29, x30, [sp,#-16]! 
add x29,sp,#0 bl _vpaes_key_preheat // load the tables ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned) // input transform mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3 bl _vpaes_schedule_transform mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 adrp x10, Lk_sr // lea Lk_sr(%rip),%r10 add x10, x10, :lo12:Lk_sr add x8, x8, x10 // encrypting, output zeroth round key after transform st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) cmp w1, #192 // cmp $192, %esi b.hi Lschedule_256 b.eq Lschedule_192 // 128: fall though ## ## .schedule_128 ## ## 128-bit specific part of key schedule. ## ## This schedule is really simple, because all its parts ## are accomplished by the subroutines. ## Lschedule_128: mov x0, #10 // mov $10, %esi Loop_schedule_128: sub x0, x0, #1 // dec %esi bl _vpaes_schedule_round cbz x0, Lschedule_mangle_last bl _vpaes_schedule_mangle // write output b Loop_schedule_128 ## ## .aes_schedule_192 ## ## 192-bit specific part of key schedule. ## ## The main body of this schedule is the same as the 128-bit ## schedule, but with more smearing. The long, high side is ## stored in %xmm7 as before, and the short, low side is in ## the high bits of %xmm6. ## ## This schedule is somewhat nastier, however, because each ## round produces 192 bits of key material, or 1.5 round keys. ## Therefore, on each cycle we do 2 rounds and produce 3 round ## keys. ## .align 4 Lschedule_192: sub x0, x0, #8 ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) bl _vpaes_schedule_transform // input transform mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4 ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros mov x0, #4 // mov $4, %esi Loop_schedule_192: sub x0, x0, #1 // dec %esi bl _vpaes_schedule_round ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0 bl _vpaes_schedule_mangle // save key n bl _vpaes_schedule_192_smear bl _vpaes_schedule_mangle // save key n+1 bl _vpaes_schedule_round cbz x0, Lschedule_mangle_last bl _vpaes_schedule_mangle // save key n+2 bl _vpaes_schedule_192_smear b Loop_schedule_192 ## ## .aes_schedule_256 ## ## 256-bit specific part of key schedule. ## ## The structure here is very similar to the 128-bit ## schedule, but with an additional "low side" in ## %xmm6. The low side's rounds are the same as the ## high side's, except no rcon and no rotation. ## .align 4 Lschedule_256: ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) bl _vpaes_schedule_transform // input transform mov x0, #7 // mov $7, %esi Loop_schedule_256: sub x0, x0, #1 // dec %esi bl _vpaes_schedule_mangle // output low result mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 // high round bl _vpaes_schedule_round cbz x0, Lschedule_mangle_last bl _vpaes_schedule_mangle // low round. swap xmm7 and xmm6 dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 movi v4.16b, #0 mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5 mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7 bl _vpaes_schedule_low_round mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7 b Loop_schedule_256 ## ## .aes_schedule_mangle_last ## ## Mangler for last round of key schedule ## Mangles %xmm0 ## when encrypting, outputs out(%xmm0) ^ 63 ## when decrypting, outputs unskew(%xmm0) ## ## Always called right before return... 
jumps to cleanup and exits ## .align 4 Lschedule_mangle_last: // schedule last round key from xmm0 adrp x11, Lk_deskew // lea Lk_deskew(%rip),%r11 # prepare to deskew add x11, x11, :lo12:Lk_deskew cbnz w3, Lschedule_mangle_last_dec // encrypting ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 adrp x11, Lk_opt // lea Lk_opt(%rip), %r11 # prepare to output transform add x11, x11, :lo12:Lk_opt add x2, x2, #32 // add $32, %rdx tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute Lschedule_mangle_last_dec: ld1 {v20.2d,v21.2d}, [x11] // reload constants sub x2, x2, #16 // add $-16, %rdx eor v0.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm0 bl _vpaes_schedule_transform // output transform st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key // cleanup eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2 eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3 eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 ldp x29, x30, [sp],#16 AARCH64_VALIDATE_LINK_REGISTER ret ## ## .aes_schedule_192_smear ## ## Smear the short, low side in the 192-bit key schedule. ## ## Inputs: ## %xmm7: high side, b a x y ## %xmm6: low side, d c 0 0 ## %xmm13: 0 ## ## Outputs: ## %xmm6: b+c+d b+c 0 0 ## %xmm0: b+c+d b+c b a ## .def _vpaes_schedule_192_smear .type 32 .endef .align 4 _vpaes_schedule_192_smear: movi v1.16b, #0 dup v0.4s, v7.s[3] ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0 ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros ret ## ## .aes_schedule_round ## ## Runs one main round of the key schedule on %xmm0, %xmm7 ## ## Specifically, runs subbytes on the high dword of %xmm0 ## then rotates it by one byte and xors into the low dword of ## %xmm7. ## ## Adds rcon from low byte of %xmm8, then rotates %xmm8 for ## next rcon. ## ## Smears the dwords of %xmm7 by xoring the low into the ## second low, result into third, result into highest. ## ## Returns results in %xmm7 = %xmm0. ## Clobbers %xmm1-%xmm4, %r11. ## .def _vpaes_schedule_round .type 32 .endef .align 4 _vpaes_schedule_round: // extract rcon from xmm8 movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4 ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1 ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 // rotate dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0 // fall through... // low round: same as high round, but no rotation and no rcon. 
_vpaes_schedule_low_round: // smear xmm7 ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4 // subbytes and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7 tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j eor v7.16b, v7.16b, v16.16b // vpxor Lk_s63(%rip), %xmm7, %xmm7 tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output // add in smeared stuff eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0 eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7 ret ## ## .aes_schedule_transform ## ## Linear-transform %xmm0 according to tables at (%r11) ## ## Requires that %xmm9 = 0x0F0F... as in preheat ## Output in %xmm0 ## Clobbers %xmm1, %xmm2 ## .def _vpaes_schedule_transform .type 32 .endef .align 4 _vpaes_schedule_transform: and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 // vmovdqa (%r11), %xmm2 # lo tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 // vmovdqa 16(%r11), %xmm1 # hi tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 ret ## ## .aes_schedule_mangle ## ## Mangle xmm0 from (basis-transformed) standard version ## to our version. 
## ## On encrypt, ## xor with 0x63 ## multiply by circulant 0,1,1,1 ## apply shiftrows transform ## ## On decrypt, ## xor with 0x63 ## multiply by "inverse mixcolumns" circulant E,B,D,9 ## deskew ## apply shiftrows transform ## ## ## Writes out to (%rdx), and increments or decrements it ## Keeps track of round number mod 4 in %r8 ## Preserves xmm0 ## Clobbers xmm1-xmm5 ## .def _vpaes_schedule_mangle .type 32 .endef .align 4 _vpaes_schedule_mangle: mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later // vmovdqa .Lk_mc_forward(%rip),%xmm5 // encrypting eor v4.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm4 add x2, x2, #16 // add $16, %rdx tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 Lschedule_mangle_both: tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 add x8, x8, #48 // add $-16, %r8 and x8, x8, #~(1<<6) // and $0x30, %r8 st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) ret .globl vpaes_set_encrypt_key .def vpaes_set_encrypt_key .type 32 .endef .align 4 vpaes_set_encrypt_key: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 stp d8,d9,[sp,#-16]! // ABI spec says so lsr w9, w1, #5 // shr $5,%eax add w9, w9, #5 // $5,%eax str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; mov w3, #0 // mov $0,%ecx mov x8, #0x30 // mov $0x30,%r8d bl _vpaes_schedule_core eor x0, x0, x0 ldp d8,d9,[sp],#16 ldp x29,x30,[sp],#16 AARCH64_VALIDATE_LINK_REGISTER ret .globl vpaes_ctr32_encrypt_blocks .def vpaes_ctr32_encrypt_blocks .type 32 .endef .align 4 vpaes_ctr32_encrypt_blocks: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 stp d8,d9,[sp,#-16]! // ABI spec says so stp d10,d11,[sp,#-16]! stp d12,d13,[sp,#-16]! stp d14,d15,[sp,#-16]! cbz x2, Lctr32_done // Note, unlike the other functions, x2 here is measured in blocks, // not bytes. mov x17, x2 mov x2, x3 // Load the IV and counter portion. ldr w6, [x4, #12] ld1 {v7.16b}, [x4] bl _vpaes_encrypt_preheat tst x17, #1 rev w6, w6 // The counter is big-endian. b.eq Lctr32_prep_loop // Handle one block so the remaining block count is even for // _vpaes_encrypt_2x. ld1 {v6.16b}, [x0], #16 // Load input ahead of time bl _vpaes_encrypt_core eor v0.16b, v0.16b, v6.16b // XOR input and result st1 {v0.16b}, [x1], #16 subs x17, x17, #1 // Update the counter. add w6, w6, #1 rev w7, w6 mov v7.s[3], w7 b.ls Lctr32_done Lctr32_prep_loop: // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x // uses v14 and v15. mov v15.16b, v7.16b mov v14.16b, v7.16b add w6, w6, #1 rev w7, w6 mov v15.s[3], w7 Lctr32_loop: ld1 {v6.16b,v7.16b}, [x0], #32 // Load input ahead of time bl _vpaes_encrypt_2x eor v0.16b, v0.16b, v6.16b // XOR input and result eor v1.16b, v1.16b, v7.16b // XOR input and result (#2) st1 {v0.16b,v1.16b}, [x1], #32 subs x17, x17, #2 // Update the counter. 
add w7, w6, #1 add w6, w6, #2 rev w7, w7 mov v14.s[3], w7 rev w7, w6 mov v15.s[3], w7 b.hi Lctr32_loop Lctr32_done: ldp d14,d15,[sp],#16 ldp d12,d13,[sp],#16 ldp d10,d11,[sp],#16 ldp d8,d9,[sp],#16 ldp x29,x30,[sp],#16 AARCH64_VALIDATE_LINK_REGISTER ret #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) ring-0.17.14/pregenerated/vpaes-x86-elf.S000064400000000000000000000233461046102023000160430ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) .text #ifdef BORINGSSL_DISPATCH_TEST #endif .align 64 .L_vpaes_consts: .long 218628480,235210255,168496130,67568393 .long 252381056,17041926,33884169,51187212 .long 252645135,252645135,252645135,252645135 .long 1512730624,3266504856,1377990664,3401244816 .long 830229760,1275146365,2969422977,3447763452 .long 3411033600,2979783055,338359620,2782886510 .long 4209124096,907596821,221174255,1006095553 .long 191964160,3799684038,3164090317,1589111125 .long 182528256,1777043520,2877432650,3265356744 .long 1874708224,3503451415,3305285752,363511674 .long 1606117888,3487855781,1093350906,2384367825 .long 197121,67569157,134941193,202313229 .long 67569157,134941193,202313229,197121 .long 134941193,202313229,197121,67569157 .long 202313229,197121,67569157,134941193 .long 33619971,100992007,168364043,235736079 .long 235736079,33619971,100992007,168364043 .long 168364043,235736079,33619971,100992007 .long 100992007,168364043,235736079,33619971 .long 50462976,117835012,185207048,252579084 .long 252314880,51251460,117574920,184942860 .long 184682752,252054788,50987272,118359308 .long 118099200,185467140,251790600,50727180 .long 2946363062,528716217,1300004225,1881839624 .long 1532713819,1532713819,1532713819,1532713819 .long 3602276352,4288629033,3737020424,4153884961 .long 1354558464,32357713,2958822624,3775749553 .long 1201988352,132424512,1572796698,503232858 .long 2213177600,1597421020,4103937655,675398315 .byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105 .byte 111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83 .byte 83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117 .byte 114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105 .byte 118,101,114,115,105,116,121,41,0 .align 64 .hidden _vpaes_preheat .type _vpaes_preheat,@function .align 16 _vpaes_preheat: addl (%esp),%ebp movdqa -48(%ebp),%xmm7 movdqa -16(%ebp),%xmm6 ret .size _vpaes_preheat,.-_vpaes_preheat .hidden _vpaes_encrypt_core .type _vpaes_encrypt_core,@function .align 16 _vpaes_encrypt_core: movl $16,%ecx movl 240(%edx),%eax movdqa %xmm6,%xmm1 movdqa (%ebp),%xmm2 pandn %xmm0,%xmm1 pand %xmm6,%xmm0 movdqu (%edx),%xmm5 .byte 102,15,56,0,208 movdqa 16(%ebp),%xmm0 pxor %xmm5,%xmm2 psrld $4,%xmm1 addl $16,%edx .byte 102,15,56,0,193 leal 192(%ebp),%ebx pxor %xmm2,%xmm0 jmp .L000enc_entry .align 16 .L001enc_loop: movdqa 32(%ebp),%xmm4 movdqa 48(%ebp),%xmm0 .byte 102,15,56,0,226 .byte 102,15,56,0,195 pxor %xmm5,%xmm4 movdqa 64(%ebp),%xmm5 pxor %xmm4,%xmm0 movdqa -64(%ebx,%ecx,1),%xmm1 .byte 102,15,56,0,234 movdqa 80(%ebp),%xmm2 movdqa (%ebx,%ecx,1),%xmm4 .byte 102,15,56,0,211 movdqa %xmm0,%xmm3 pxor %xmm5,%xmm2 .byte 102,15,56,0,193 addl $16,%edx pxor %xmm2,%xmm0 .byte 102,15,56,0,220 addl $16,%ecx pxor %xmm0,%xmm3 .byte 102,15,56,0,193 andl $48,%ecx subl $1,%eax pxor %xmm3,%xmm0 .L000enc_entry: movdqa %xmm6,%xmm1 movdqa -32(%ebp),%xmm5 pandn %xmm0,%xmm1 psrld $4,%xmm1 pand %xmm6,%xmm0 .byte 
102,15,56,0,232 movdqa %xmm7,%xmm3 pxor %xmm1,%xmm0 .byte 102,15,56,0,217 movdqa %xmm7,%xmm4 pxor %xmm5,%xmm3 .byte 102,15,56,0,224 movdqa %xmm7,%xmm2 pxor %xmm5,%xmm4 .byte 102,15,56,0,211 movdqa %xmm7,%xmm3 pxor %xmm0,%xmm2 .byte 102,15,56,0,220 movdqu (%edx),%xmm5 pxor %xmm1,%xmm3 jnz .L001enc_loop movdqa 96(%ebp),%xmm4 movdqa 112(%ebp),%xmm0 .byte 102,15,56,0,226 pxor %xmm5,%xmm4 .byte 102,15,56,0,195 movdqa 64(%ebx,%ecx,1),%xmm1 pxor %xmm4,%xmm0 .byte 102,15,56,0,193 ret .size _vpaes_encrypt_core,.-_vpaes_encrypt_core .hidden _vpaes_schedule_core .type _vpaes_schedule_core,@function .align 16 _vpaes_schedule_core: addl (%esp),%ebp movdqu (%esi),%xmm0 movdqa 320(%ebp),%xmm2 movdqa %xmm0,%xmm3 leal (%ebp),%ebx movdqa %xmm2,4(%esp) call _vpaes_schedule_transform movdqa %xmm0,%xmm7 testl %edi,%edi jnz .L002schedule_am_decrypting movdqu %xmm0,(%edx) jmp .L003schedule_go .L002schedule_am_decrypting: movdqa 256(%ebp,%ecx,1),%xmm1 .byte 102,15,56,0,217 movdqu %xmm3,(%edx) xorl $48,%ecx .L003schedule_go: cmpl $192,%eax ja .L004schedule_256 .L005schedule_128: movl $10,%eax .L006loop_schedule_128: call _vpaes_schedule_round decl %eax jz .L007schedule_mangle_last call _vpaes_schedule_mangle jmp .L006loop_schedule_128 .align 16 .L004schedule_256: movdqu 16(%esi),%xmm0 call _vpaes_schedule_transform movl $7,%eax .L008loop_schedule_256: call _vpaes_schedule_mangle movdqa %xmm0,%xmm6 call _vpaes_schedule_round decl %eax jz .L007schedule_mangle_last call _vpaes_schedule_mangle pshufd $255,%xmm0,%xmm0 movdqa %xmm7,20(%esp) movdqa %xmm6,%xmm7 call .L_vpaes_schedule_low_round movdqa 20(%esp),%xmm7 jmp .L008loop_schedule_256 .align 16 .L007schedule_mangle_last: leal 384(%ebp),%ebx testl %edi,%edi jnz .L009schedule_mangle_last_dec movdqa 256(%ebp,%ecx,1),%xmm1 .byte 102,15,56,0,193 leal 352(%ebp),%ebx addl $32,%edx .L009schedule_mangle_last_dec: addl $-16,%edx pxor 336(%ebp),%xmm0 call _vpaes_schedule_transform movdqu %xmm0,(%edx) pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 pxor %xmm3,%xmm3 pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 pxor %xmm6,%xmm6 pxor %xmm7,%xmm7 ret .size _vpaes_schedule_core,.-_vpaes_schedule_core .hidden _vpaes_schedule_round .type _vpaes_schedule_round,@function .align 16 _vpaes_schedule_round: movdqa 8(%esp),%xmm2 pxor %xmm1,%xmm1 .byte 102,15,58,15,202,15 .byte 102,15,58,15,210,15 pxor %xmm1,%xmm7 pshufd $255,%xmm0,%xmm0 .byte 102,15,58,15,192,1 movdqa %xmm2,8(%esp) .L_vpaes_schedule_low_round: movdqa %xmm7,%xmm1 pslldq $4,%xmm7 pxor %xmm1,%xmm7 movdqa %xmm7,%xmm1 pslldq $8,%xmm7 pxor %xmm1,%xmm7 pxor 336(%ebp),%xmm7 movdqa -16(%ebp),%xmm4 movdqa -48(%ebp),%xmm5 movdqa %xmm4,%xmm1 pandn %xmm0,%xmm1 psrld $4,%xmm1 pand %xmm4,%xmm0 movdqa -32(%ebp),%xmm2 .byte 102,15,56,0,208 pxor %xmm1,%xmm0 movdqa %xmm5,%xmm3 .byte 102,15,56,0,217 pxor %xmm2,%xmm3 movdqa %xmm5,%xmm4 .byte 102,15,56,0,224 pxor %xmm2,%xmm4 movdqa %xmm5,%xmm2 .byte 102,15,56,0,211 pxor %xmm0,%xmm2 movdqa %xmm5,%xmm3 .byte 102,15,56,0,220 pxor %xmm1,%xmm3 movdqa 32(%ebp),%xmm4 .byte 102,15,56,0,226 movdqa 48(%ebp),%xmm0 .byte 102,15,56,0,195 pxor %xmm4,%xmm0 pxor %xmm7,%xmm0 movdqa %xmm0,%xmm7 ret .size _vpaes_schedule_round,.-_vpaes_schedule_round .hidden _vpaes_schedule_transform .type _vpaes_schedule_transform,@function .align 16 _vpaes_schedule_transform: movdqa -16(%ebp),%xmm2 movdqa %xmm2,%xmm1 pandn %xmm0,%xmm1 psrld $4,%xmm1 pand %xmm2,%xmm0 movdqa (%ebx),%xmm2 .byte 102,15,56,0,208 movdqa 16(%ebx),%xmm0 .byte 102,15,56,0,193 pxor %xmm2,%xmm0 ret .size _vpaes_schedule_transform,.-_vpaes_schedule_transform 
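// A note on the routine that follows, summarizing the behaviour documented in
// the AArch64 _vpaes_schedule_mangle earlier in this archive: on encrypt the
// round key is XORed with 0x63 and multiplied by the circulant 0,1,1,1 (three
// successive mc_forward byte shuffles); on decrypt it is multiplied by the
// "inverse mixcolumns" circulant E,B,D,9 and deskewed. Both paths finish with
// the shift-rows permutation selected by %ecx, store the result to (%edx), and
// step %edx forward (encrypt) or backward (decrypt).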
.hidden _vpaes_schedule_mangle .type _vpaes_schedule_mangle,@function .align 16 _vpaes_schedule_mangle: movdqa %xmm0,%xmm4 movdqa 128(%ebp),%xmm5 testl %edi,%edi jnz .L010schedule_mangle_dec addl $16,%edx pxor 336(%ebp),%xmm4 .byte 102,15,56,0,229 movdqa %xmm4,%xmm3 .byte 102,15,56,0,229 pxor %xmm4,%xmm3 .byte 102,15,56,0,229 pxor %xmm4,%xmm3 jmp .L011schedule_mangle_both .align 16 .L010schedule_mangle_dec: movdqa -16(%ebp),%xmm2 leal (%ebp),%esi movdqa %xmm2,%xmm1 pandn %xmm4,%xmm1 psrld $4,%xmm1 pand %xmm2,%xmm4 movdqa (%esi),%xmm2 .byte 102,15,56,0,212 movdqa 16(%esi),%xmm3 .byte 102,15,56,0,217 pxor %xmm2,%xmm3 .byte 102,15,56,0,221 movdqa 32(%esi),%xmm2 .byte 102,15,56,0,212 pxor %xmm3,%xmm2 movdqa 48(%esi),%xmm3 .byte 102,15,56,0,217 pxor %xmm2,%xmm3 .byte 102,15,56,0,221 movdqa 64(%esi),%xmm2 .byte 102,15,56,0,212 pxor %xmm3,%xmm2 movdqa 80(%esi),%xmm3 .byte 102,15,56,0,217 pxor %xmm2,%xmm3 .byte 102,15,56,0,221 movdqa 96(%esi),%xmm2 .byte 102,15,56,0,212 pxor %xmm3,%xmm2 movdqa 112(%esi),%xmm3 .byte 102,15,56,0,217 pxor %xmm2,%xmm3 addl $-16,%edx .L011schedule_mangle_both: movdqa 256(%ebp,%ecx,1),%xmm1 .byte 102,15,56,0,217 addl $-16,%ecx andl $48,%ecx movdqu %xmm3,(%edx) ret .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle .globl vpaes_set_encrypt_key .hidden vpaes_set_encrypt_key .type vpaes_set_encrypt_key,@function .align 16 vpaes_set_encrypt_key: .L_vpaes_set_encrypt_key_begin: pushl %ebp pushl %ebx pushl %esi pushl %edi #ifdef BORINGSSL_DISPATCH_TEST pushl %ebx pushl %edx call .L012pic_for_function_hit .L012pic_for_function_hit: popl %ebx leal BORINGSSL_function_hit+5-.L012pic_for_function_hit(%ebx),%ebx movl $1,%edx movb %dl,(%ebx) popl %edx popl %ebx #endif movl 20(%esp),%esi leal -56(%esp),%ebx movl 24(%esp),%eax andl $-16,%ebx movl 28(%esp),%edx xchgl %esp,%ebx movl %ebx,48(%esp) movl %eax,%ebx shrl $5,%ebx addl $5,%ebx movl %ebx,240(%edx) movl $48,%ecx movl $0,%edi leal .L_vpaes_consts+0x30-.L013pic_point,%ebp call _vpaes_schedule_core .L013pic_point: movl 48(%esp),%esp xorl %eax,%eax popl %edi popl %esi popl %ebx popl %ebp ret .size vpaes_set_encrypt_key,.-.L_vpaes_set_encrypt_key_begin .globl vpaes_encrypt .hidden vpaes_encrypt .type vpaes_encrypt,@function .align 16 vpaes_encrypt: .L_vpaes_encrypt_begin: pushl %ebp pushl %ebx pushl %esi pushl %edi #ifdef BORINGSSL_DISPATCH_TEST pushl %ebx pushl %edx call .L014pic_for_function_hit .L014pic_for_function_hit: popl %ebx leal BORINGSSL_function_hit+4-.L014pic_for_function_hit(%ebx),%ebx movl $1,%edx movb %dl,(%ebx) popl %edx popl %ebx #endif leal .L_vpaes_consts+0x30-.L015pic_point,%ebp call _vpaes_preheat .L015pic_point: movl 20(%esp),%esi leal -56(%esp),%ebx movl 24(%esp),%edi andl $-16,%ebx movl 28(%esp),%edx xchgl %esp,%ebx movl %ebx,48(%esp) movdqu (%esi),%xmm0 call _vpaes_encrypt_core movdqu %xmm0,(%edi) movl 48(%esp),%esp popl %edi popl %esi popl %ebx popl %ebp ret .size vpaes_encrypt,.-.L_vpaes_encrypt_begin #endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) ring-0.17.14/pregenerated/vpaes-x86-win32n.asm000064400000000000000000000207511046102023000167700ustar 00000000000000; This file is generated from a similarly-named Perl script in the BoringSSL ; source tree. Do not edit by hand. 
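; Note on the key-schedule entry point below: as in the other vpaes
; implementations in this archive, _vpaes_set_encrypt_key stores
; AES_KEY->rounds = nbits/32 + 5 at byte offset 240 of the key structure
; (the shr 5 / add 5 sequence) and then calls __vpaes_schedule_core with
; ecx = 48 and the decrypt flag edi = 0.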
%include "ring_core_generated/prefix_symbols_nasm.inc" %ifidn __OUTPUT_FORMAT__, win32 %ifidn __OUTPUT_FORMAT__,obj section code use32 class=code align=64 %elifidn __OUTPUT_FORMAT__,win32 $@feat.00 equ 1 section .text code align=64 %else section .text code %endif %ifdef BORINGSSL_DISPATCH_TEST extern _BORINGSSL_function_hit %endif align 64 L$_vpaes_consts: dd 218628480,235210255,168496130,67568393 dd 252381056,17041926,33884169,51187212 dd 252645135,252645135,252645135,252645135 dd 1512730624,3266504856,1377990664,3401244816 dd 830229760,1275146365,2969422977,3447763452 dd 3411033600,2979783055,338359620,2782886510 dd 4209124096,907596821,221174255,1006095553 dd 191964160,3799684038,3164090317,1589111125 dd 182528256,1777043520,2877432650,3265356744 dd 1874708224,3503451415,3305285752,363511674 dd 1606117888,3487855781,1093350906,2384367825 dd 197121,67569157,134941193,202313229 dd 67569157,134941193,202313229,197121 dd 134941193,202313229,197121,67569157 dd 202313229,197121,67569157,134941193 dd 33619971,100992007,168364043,235736079 dd 235736079,33619971,100992007,168364043 dd 168364043,235736079,33619971,100992007 dd 100992007,168364043,235736079,33619971 dd 50462976,117835012,185207048,252579084 dd 252314880,51251460,117574920,184942860 dd 184682752,252054788,50987272,118359308 dd 118099200,185467140,251790600,50727180 dd 2946363062,528716217,1300004225,1881839624 dd 1532713819,1532713819,1532713819,1532713819 dd 3602276352,4288629033,3737020424,4153884961 dd 1354558464,32357713,2958822624,3775749553 dd 1201988352,132424512,1572796698,503232858 dd 2213177600,1597421020,4103937655,675398315 db 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105 db 111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83 db 83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117 db 114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105 db 118,101,114,115,105,116,121,41,0 align 64 align 16 __vpaes_preheat: add ebp,DWORD [esp] movdqa xmm7,[ebp-48] movdqa xmm6,[ebp-16] ret align 16 __vpaes_encrypt_core: mov ecx,16 mov eax,DWORD [240+edx] movdqa xmm1,xmm6 movdqa xmm2,[ebp] pandn xmm1,xmm0 pand xmm0,xmm6 movdqu xmm5,[edx] db 102,15,56,0,208 movdqa xmm0,[16+ebp] pxor xmm2,xmm5 psrld xmm1,4 add edx,16 db 102,15,56,0,193 lea ebx,[192+ebp] pxor xmm0,xmm2 jmp NEAR L$000enc_entry align 16 L$001enc_loop: movdqa xmm4,[32+ebp] movdqa xmm0,[48+ebp] db 102,15,56,0,226 db 102,15,56,0,195 pxor xmm4,xmm5 movdqa xmm5,[64+ebp] pxor xmm0,xmm4 movdqa xmm1,[ecx*1+ebx-64] db 102,15,56,0,234 movdqa xmm2,[80+ebp] movdqa xmm4,[ecx*1+ebx] db 102,15,56,0,211 movdqa xmm3,xmm0 pxor xmm2,xmm5 db 102,15,56,0,193 add edx,16 pxor xmm0,xmm2 db 102,15,56,0,220 add ecx,16 pxor xmm3,xmm0 db 102,15,56,0,193 and ecx,48 sub eax,1 pxor xmm0,xmm3 L$000enc_entry: movdqa xmm1,xmm6 movdqa xmm5,[ebp-32] pandn xmm1,xmm0 psrld xmm1,4 pand xmm0,xmm6 db 102,15,56,0,232 movdqa xmm3,xmm7 pxor xmm0,xmm1 db 102,15,56,0,217 movdqa xmm4,xmm7 pxor xmm3,xmm5 db 102,15,56,0,224 movdqa xmm2,xmm7 pxor xmm4,xmm5 db 102,15,56,0,211 movdqa xmm3,xmm7 pxor xmm2,xmm0 db 102,15,56,0,220 movdqu xmm5,[edx] pxor xmm3,xmm1 jnz NEAR L$001enc_loop movdqa xmm4,[96+ebp] movdqa xmm0,[112+ebp] db 102,15,56,0,226 pxor xmm4,xmm5 db 102,15,56,0,195 movdqa xmm1,[64+ecx*1+ebx] pxor xmm0,xmm4 db 102,15,56,0,193 ret align 16 __vpaes_schedule_core: add ebp,DWORD [esp] movdqu xmm0,[esi] movdqa xmm2,[320+ebp] movdqa xmm3,xmm0 lea ebx,[ebp] movdqa [4+esp],xmm2 call __vpaes_schedule_transform movdqa xmm7,xmm0 test edi,edi jnz NEAR L$002schedule_am_decrypting movdqu [edx],xmm0 jmp NEAR 
L$003schedule_go L$002schedule_am_decrypting: movdqa xmm1,[256+ecx*1+ebp] db 102,15,56,0,217 movdqu [edx],xmm3 xor ecx,48 L$003schedule_go: cmp eax,192 ja NEAR L$004schedule_256 L$005schedule_128: mov eax,10 L$006loop_schedule_128: call __vpaes_schedule_round dec eax jz NEAR L$007schedule_mangle_last call __vpaes_schedule_mangle jmp NEAR L$006loop_schedule_128 align 16 L$004schedule_256: movdqu xmm0,[16+esi] call __vpaes_schedule_transform mov eax,7 L$008loop_schedule_256: call __vpaes_schedule_mangle movdqa xmm6,xmm0 call __vpaes_schedule_round dec eax jz NEAR L$007schedule_mangle_last call __vpaes_schedule_mangle pshufd xmm0,xmm0,255 movdqa [20+esp],xmm7 movdqa xmm7,xmm6 call L$_vpaes_schedule_low_round movdqa xmm7,[20+esp] jmp NEAR L$008loop_schedule_256 align 16 L$007schedule_mangle_last: lea ebx,[384+ebp] test edi,edi jnz NEAR L$009schedule_mangle_last_dec movdqa xmm1,[256+ecx*1+ebp] db 102,15,56,0,193 lea ebx,[352+ebp] add edx,32 L$009schedule_mangle_last_dec: add edx,-16 pxor xmm0,[336+ebp] call __vpaes_schedule_transform movdqu [edx],xmm0 pxor xmm0,xmm0 pxor xmm1,xmm1 pxor xmm2,xmm2 pxor xmm3,xmm3 pxor xmm4,xmm4 pxor xmm5,xmm5 pxor xmm6,xmm6 pxor xmm7,xmm7 ret align 16 __vpaes_schedule_round: movdqa xmm2,[8+esp] pxor xmm1,xmm1 db 102,15,58,15,202,15 db 102,15,58,15,210,15 pxor xmm7,xmm1 pshufd xmm0,xmm0,255 db 102,15,58,15,192,1 movdqa [8+esp],xmm2 L$_vpaes_schedule_low_round: movdqa xmm1,xmm7 pslldq xmm7,4 pxor xmm7,xmm1 movdqa xmm1,xmm7 pslldq xmm7,8 pxor xmm7,xmm1 pxor xmm7,[336+ebp] movdqa xmm4,[ebp-16] movdqa xmm5,[ebp-48] movdqa xmm1,xmm4 pandn xmm1,xmm0 psrld xmm1,4 pand xmm0,xmm4 movdqa xmm2,[ebp-32] db 102,15,56,0,208 pxor xmm0,xmm1 movdqa xmm3,xmm5 db 102,15,56,0,217 pxor xmm3,xmm2 movdqa xmm4,xmm5 db 102,15,56,0,224 pxor xmm4,xmm2 movdqa xmm2,xmm5 db 102,15,56,0,211 pxor xmm2,xmm0 movdqa xmm3,xmm5 db 102,15,56,0,220 pxor xmm3,xmm1 movdqa xmm4,[32+ebp] db 102,15,56,0,226 movdqa xmm0,[48+ebp] db 102,15,56,0,195 pxor xmm0,xmm4 pxor xmm0,xmm7 movdqa xmm7,xmm0 ret align 16 __vpaes_schedule_transform: movdqa xmm2,[ebp-16] movdqa xmm1,xmm2 pandn xmm1,xmm0 psrld xmm1,4 pand xmm0,xmm2 movdqa xmm2,[ebx] db 102,15,56,0,208 movdqa xmm0,[16+ebx] db 102,15,56,0,193 pxor xmm0,xmm2 ret align 16 __vpaes_schedule_mangle: movdqa xmm4,xmm0 movdqa xmm5,[128+ebp] test edi,edi jnz NEAR L$010schedule_mangle_dec add edx,16 pxor xmm4,[336+ebp] db 102,15,56,0,229 movdqa xmm3,xmm4 db 102,15,56,0,229 pxor xmm3,xmm4 db 102,15,56,0,229 pxor xmm3,xmm4 jmp NEAR L$011schedule_mangle_both align 16 L$010schedule_mangle_dec: movdqa xmm2,[ebp-16] lea esi,[ebp] movdqa xmm1,xmm2 pandn xmm1,xmm4 psrld xmm1,4 pand xmm4,xmm2 movdqa xmm2,[esi] db 102,15,56,0,212 movdqa xmm3,[16+esi] db 102,15,56,0,217 pxor xmm3,xmm2 db 102,15,56,0,221 movdqa xmm2,[32+esi] db 102,15,56,0,212 pxor xmm2,xmm3 movdqa xmm3,[48+esi] db 102,15,56,0,217 pxor xmm3,xmm2 db 102,15,56,0,221 movdqa xmm2,[64+esi] db 102,15,56,0,212 pxor xmm2,xmm3 movdqa xmm3,[80+esi] db 102,15,56,0,217 pxor xmm3,xmm2 db 102,15,56,0,221 movdqa xmm2,[96+esi] db 102,15,56,0,212 pxor xmm2,xmm3 movdqa xmm3,[112+esi] db 102,15,56,0,217 pxor xmm3,xmm2 add edx,-16 L$011schedule_mangle_both: movdqa xmm1,[256+ecx*1+ebp] db 102,15,56,0,217 add ecx,-16 and ecx,48 movdqu [edx],xmm3 ret global _vpaes_set_encrypt_key align 16 _vpaes_set_encrypt_key: L$_vpaes_set_encrypt_key_begin: push ebp push ebx push esi push edi %ifdef BORINGSSL_DISPATCH_TEST push ebx push edx call L$012pic_for_function_hit L$012pic_for_function_hit: pop ebx lea 
ebx,[(_BORINGSSL_function_hit+5-L$012pic_for_function_hit)+ebx] mov edx,1 mov BYTE [ebx],dl pop edx pop ebx %endif mov esi,DWORD [20+esp] lea ebx,[esp-56] mov eax,DWORD [24+esp] and ebx,-16 mov edx,DWORD [28+esp] xchg ebx,esp mov DWORD [48+esp],ebx mov ebx,eax shr ebx,5 add ebx,5 mov DWORD [240+edx],ebx mov ecx,48 mov edi,0 lea ebp,[(L$_vpaes_consts+0x30-L$013pic_point)] call __vpaes_schedule_core L$013pic_point: mov esp,DWORD [48+esp] xor eax,eax pop edi pop esi pop ebx pop ebp ret global _vpaes_encrypt align 16 _vpaes_encrypt: L$_vpaes_encrypt_begin: push ebp push ebx push esi push edi %ifdef BORINGSSL_DISPATCH_TEST push ebx push edx call L$014pic_for_function_hit L$014pic_for_function_hit: pop ebx lea ebx,[(_BORINGSSL_function_hit+4-L$014pic_for_function_hit)+ebx] mov edx,1 mov BYTE [ebx],dl pop edx pop ebx %endif lea ebp,[(L$_vpaes_consts+0x30-L$015pic_point)] call __vpaes_preheat L$015pic_point: mov esi,DWORD [20+esp] lea ebx,[esp-56] mov edi,DWORD [24+esp] and ebx,-16 mov edx,DWORD [28+esp] xchg ebx,esp mov DWORD [48+esp],ebx movdqu xmm0,[esi] call __vpaes_encrypt_core movdqu [edi],xmm0 mov esp,DWORD [48+esp] pop edi pop esi pop ebx pop ebp ret %else ; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 ret %endif ring-0.17.14/pregenerated/vpaes-x86-win32n.o000064400000000000000000000167461046102023000164570ustar 00000000000000Lg$.debug$Sp6@B.debug$T8@B.text> p`5C:\Users\b\p\ring\pregenerated\vpaes-x86-win32n.asm,5f/~ >N|  0@P`p !"#$ %0&@'P(`)p*+,-./012345@8C9H:M;N<P>U?[@_AdBhClDpEuFzG~HIJKLMNPQRSTUVWXYZ[\]^_`abcdefg ijklm n%o)p-q2r6s:t?uCvGwLxPyTzY{]|a}g~lqvz$(-.49>DHMSX`fhnw| $(-26:?CGLPTY]afjoty~      !""#'$,%0&5':(?)C*H+M,Q-V.[/`0d1i2n3r4w5|6789:;=>?@ABDGHIJWXYZ[\]^_`abcdeghijklmorstu "$(,159:;<=8C:\Users\b\p\ring\pregenerated\vpaes-x86-win32n.o4'The Netwide Assembler 2.13.03L$_vpaes_consts__vpaes_preheat__vpaes_encrypt_coreL$001enc_loopL$000enc_entry__vpaes_schedule_core%L$002schedule_am_decryptingL$003schedule_goL$005schedule_128 L$006loop_schedule_128L$004schedule_256 L$008loop_schedule_256#L$007schedule_mangle_last'L$009schedule_mangle_last_dec __vpaes_schedule_round%L$_vpaes_schedule_low_round$__vpaes_schedule_transform!__vpaes_schedule_mangle"L$010schedule_mangle_dec#L$011schedule_mangle_both3_ring_core_0_17_14__vpaes_set_encrypt_key(L$_vpaes_set_encrypt_key_beginL$013pic_point+_ring_core_0_17_14__vpaes_encrypt L$_vpaes_encrypt_beginL$015pic_pointl p p t         A  E  ]  a  z  ~           )  -  K  O  r  v          9 = c g }! ! " " # #        p*Zx"RM|1}0L>PˏᛱD*nzߥ#6. 
;$q Ɠz/U^) @iJ#cǽomxzj_t5+Aѐ                                    }|M*p[[[[[[[[[[[[[[[[`)Ih!APQ \] G@]Ziܵ6_wA(Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University),$fo}fouÐfofoUffo*f8foEffrf8fpfoe foE0f8f8ffom@ffoL f8foUPfo$ f8foff8ff8܃ff80ffofomffrff8foff8foff8foff8foff8o*f9foe`foEpf8ff8foL @ff8Ð,$ofo@fo؍]fT$fo fo f80=* HioFvfoH,fpf|$fofo|$fo f8` fPffffffffÐfoT$ff:f:ffpf:fT$fofsffofsffPfoefomfoffrffoUf8ffof8ffof8ffof8ffof8ffoe f8foE0f8fffoÐfoUfoffrffof8foCf8fÐfofo,fPf8fof8ff8ffoUufoffrffof8fo^f8ff8foV f8ffo^0f8ff8foV@f8ffo^Pf8ff8foV`f8ffo^pf8fڃfo f8ك0ÐUSVWt$\$ȋD$T$܉\$00-@d$01_^[]ÐUSVW-!1t$\$ȋ|$T$܉\$0od$0_^[].filegC:\Users\b\p\ring\.debug$S6.debug$T8.text>.absolut@feat.00@$P9G Vl`#:Vq6M\L$_vpaes_consts__vpaes_preheat__vpaes_encrypt_coreL$001enc_loopL$000enc_entry__vpaes_schedule_coreL$002schedule_am_decryptingL$003schedule_goL$005schedule_128L$006loop_schedule_128L$004schedule_256L$008loop_schedule_256L$007schedule_mangle_lastL$009schedule_mangle_last_dec__vpaes_schedule_roundL$_vpaes_schedule_low_round__vpaes_schedule_transform__vpaes_schedule_mangleL$010schedule_mangle_decL$011schedule_mangle_both_ring_core_0_17_14__vpaes_set_encrypt_keyL$_vpaes_set_encrypt_key_beginL$013pic_point_ring_core_0_17_14__vpaes_encryptL$_vpaes_encrypt_beginL$015pic_pointring-0.17.14/pregenerated/vpaes-x86_64-elf.S000064400000000000000000000276121046102023000163540ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) .text .type _vpaes_encrypt_core,@function .align 16 _vpaes_encrypt_core: .cfi_startproc movq %rdx,%r9 movq $16,%r11 movl 240(%rdx),%eax movdqa %xmm9,%xmm1 movdqa .Lk_ipt(%rip),%xmm2 pandn %xmm0,%xmm1 movdqu (%r9),%xmm5 psrld $4,%xmm1 pand %xmm9,%xmm0 .byte 102,15,56,0,208 movdqa .Lk_ipt+16(%rip),%xmm0 .byte 102,15,56,0,193 pxor %xmm5,%xmm2 addq $16,%r9 pxor %xmm2,%xmm0 leaq .Lk_mc_backward(%rip),%r10 jmp .Lenc_entry .align 16 .Lenc_loop: movdqa %xmm13,%xmm4 movdqa %xmm12,%xmm0 .byte 102,15,56,0,226 .byte 102,15,56,0,195 pxor %xmm5,%xmm4 movdqa %xmm15,%xmm5 pxor %xmm4,%xmm0 movdqa -64(%r11,%r10,1),%xmm1 .byte 102,15,56,0,234 movdqa (%r11,%r10,1),%xmm4 movdqa %xmm14,%xmm2 .byte 102,15,56,0,211 movdqa %xmm0,%xmm3 pxor %xmm5,%xmm2 .byte 102,15,56,0,193 addq $16,%r9 pxor %xmm2,%xmm0 .byte 102,15,56,0,220 addq $16,%r11 pxor %xmm0,%xmm3 .byte 102,15,56,0,193 andq $0x30,%r11 subq $1,%rax pxor %xmm3,%xmm0 .Lenc_entry: movdqa %xmm9,%xmm1 movdqa %xmm11,%xmm5 pandn %xmm0,%xmm1 psrld $4,%xmm1 pand %xmm9,%xmm0 .byte 102,15,56,0,232 movdqa %xmm10,%xmm3 pxor %xmm1,%xmm0 .byte 102,15,56,0,217 movdqa %xmm10,%xmm4 pxor %xmm5,%xmm3 .byte 102,15,56,0,224 movdqa %xmm10,%xmm2 pxor %xmm5,%xmm4 .byte 102,15,56,0,211 movdqa %xmm10,%xmm3 pxor %xmm0,%xmm2 .byte 102,15,56,0,220 movdqu (%r9),%xmm5 pxor %xmm1,%xmm3 jnz .Lenc_loop movdqa -96(%r10),%xmm4 movdqa -80(%r10),%xmm0 .byte 102,15,56,0,226 pxor %xmm5,%xmm4 .byte 102,15,56,0,195 movdqa 64(%r11,%r10,1),%xmm1 pxor %xmm4,%xmm0 .byte 102,15,56,0,193 ret .cfi_endproc .size _vpaes_encrypt_core,.-_vpaes_encrypt_core .type _vpaes_encrypt_core_2x,@function .align 16 _vpaes_encrypt_core_2x: .cfi_startproc movq %rdx,%r9 movq $16,%r11 movl 240(%rdx),%eax movdqa %xmm9,%xmm1 movdqa %xmm9,%xmm7 movdqa .Lk_ipt(%rip),%xmm2 movdqa %xmm2,%xmm8 pandn %xmm0,%xmm1 pandn %xmm6,%xmm7 movdqu (%r9),%xmm5 psrld $4,%xmm1 psrld $4,%xmm7 pand %xmm9,%xmm0 pand %xmm9,%xmm6 .byte 102,15,56,0,208 .byte 102,68,15,56,0,198 
movdqa .Lk_ipt+16(%rip),%xmm0 movdqa %xmm0,%xmm6 .byte 102,15,56,0,193 .byte 102,15,56,0,247 pxor %xmm5,%xmm2 pxor %xmm5,%xmm8 addq $16,%r9 pxor %xmm2,%xmm0 pxor %xmm8,%xmm6 leaq .Lk_mc_backward(%rip),%r10 jmp .Lenc2x_entry .align 16 .Lenc2x_loop: movdqa .Lk_sb1(%rip),%xmm4 movdqa .Lk_sb1+16(%rip),%xmm0 movdqa %xmm4,%xmm12 movdqa %xmm0,%xmm6 .byte 102,15,56,0,226 .byte 102,69,15,56,0,224 .byte 102,15,56,0,195 .byte 102,65,15,56,0,243 pxor %xmm5,%xmm4 pxor %xmm5,%xmm12 movdqa .Lk_sb2(%rip),%xmm5 movdqa %xmm5,%xmm13 pxor %xmm4,%xmm0 pxor %xmm12,%xmm6 movdqa -64(%r11,%r10,1),%xmm1 .byte 102,15,56,0,234 .byte 102,69,15,56,0,232 movdqa (%r11,%r10,1),%xmm4 movdqa .Lk_sb2+16(%rip),%xmm2 movdqa %xmm2,%xmm8 .byte 102,15,56,0,211 .byte 102,69,15,56,0,195 movdqa %xmm0,%xmm3 movdqa %xmm6,%xmm11 pxor %xmm5,%xmm2 pxor %xmm13,%xmm8 .byte 102,15,56,0,193 .byte 102,15,56,0,241 addq $16,%r9 pxor %xmm2,%xmm0 pxor %xmm8,%xmm6 .byte 102,15,56,0,220 .byte 102,68,15,56,0,220 addq $16,%r11 pxor %xmm0,%xmm3 pxor %xmm6,%xmm11 .byte 102,15,56,0,193 .byte 102,15,56,0,241 andq $0x30,%r11 subq $1,%rax pxor %xmm3,%xmm0 pxor %xmm11,%xmm6 .Lenc2x_entry: movdqa %xmm9,%xmm1 movdqa %xmm9,%xmm7 movdqa .Lk_inv+16(%rip),%xmm5 movdqa %xmm5,%xmm13 pandn %xmm0,%xmm1 pandn %xmm6,%xmm7 psrld $4,%xmm1 psrld $4,%xmm7 pand %xmm9,%xmm0 pand %xmm9,%xmm6 .byte 102,15,56,0,232 .byte 102,68,15,56,0,238 movdqa %xmm10,%xmm3 movdqa %xmm10,%xmm11 pxor %xmm1,%xmm0 pxor %xmm7,%xmm6 .byte 102,15,56,0,217 .byte 102,68,15,56,0,223 movdqa %xmm10,%xmm4 movdqa %xmm10,%xmm12 pxor %xmm5,%xmm3 pxor %xmm13,%xmm11 .byte 102,15,56,0,224 .byte 102,68,15,56,0,230 movdqa %xmm10,%xmm2 movdqa %xmm10,%xmm8 pxor %xmm5,%xmm4 pxor %xmm13,%xmm12 .byte 102,15,56,0,211 .byte 102,69,15,56,0,195 movdqa %xmm10,%xmm3 movdqa %xmm10,%xmm11 pxor %xmm0,%xmm2 pxor %xmm6,%xmm8 .byte 102,15,56,0,220 .byte 102,69,15,56,0,220 movdqu (%r9),%xmm5 pxor %xmm1,%xmm3 pxor %xmm7,%xmm11 jnz .Lenc2x_loop movdqa -96(%r10),%xmm4 movdqa -80(%r10),%xmm0 movdqa %xmm4,%xmm12 movdqa %xmm0,%xmm6 .byte 102,15,56,0,226 .byte 102,69,15,56,0,224 pxor %xmm5,%xmm4 pxor %xmm5,%xmm12 .byte 102,15,56,0,195 .byte 102,65,15,56,0,243 movdqa 64(%r11,%r10,1),%xmm1 pxor %xmm4,%xmm0 pxor %xmm12,%xmm6 .byte 102,15,56,0,193 .byte 102,15,56,0,241 ret .cfi_endproc .size _vpaes_encrypt_core_2x,.-_vpaes_encrypt_core_2x .type _vpaes_schedule_core,@function .align 16 _vpaes_schedule_core: .cfi_startproc call _vpaes_preheat movdqa .Lk_rcon(%rip),%xmm8 movdqu (%rdi),%xmm0 movdqa %xmm0,%xmm3 leaq .Lk_ipt(%rip),%r11 call _vpaes_schedule_transform movdqa %xmm0,%xmm7 leaq .Lk_sr(%rip),%r10 movdqu %xmm0,(%rdx) .Lschedule_go: cmpl $192,%esi ja .Lschedule_256 .Lschedule_128: movl $10,%esi .Loop_schedule_128: call _vpaes_schedule_round decq %rsi jz .Lschedule_mangle_last call _vpaes_schedule_mangle jmp .Loop_schedule_128 .align 16 .Lschedule_256: movdqu 16(%rdi),%xmm0 call _vpaes_schedule_transform movl $7,%esi .Loop_schedule_256: call _vpaes_schedule_mangle movdqa %xmm0,%xmm6 call _vpaes_schedule_round decq %rsi jz .Lschedule_mangle_last call _vpaes_schedule_mangle pshufd $0xFF,%xmm0,%xmm0 movdqa %xmm7,%xmm5 movdqa %xmm6,%xmm7 call _vpaes_schedule_low_round movdqa %xmm5,%xmm7 jmp .Loop_schedule_256 .align 16 .Lschedule_mangle_last: leaq .Lk_deskew(%rip),%r11 movdqa (%r8,%r10,1),%xmm1 .byte 102,15,56,0,193 leaq .Lk_opt(%rip),%r11 addq $32,%rdx .Lschedule_mangle_last_dec: addq $-16,%rdx pxor .Lk_s63(%rip),%xmm0 call _vpaes_schedule_transform movdqu %xmm0,(%rdx) pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 pxor %xmm3,%xmm3 pxor 
%xmm4,%xmm4 pxor %xmm5,%xmm5 pxor %xmm6,%xmm6 pxor %xmm7,%xmm7 ret .cfi_endproc .size _vpaes_schedule_core,.-_vpaes_schedule_core .type _vpaes_schedule_round,@function .align 16 _vpaes_schedule_round: .cfi_startproc pxor %xmm1,%xmm1 .byte 102,65,15,58,15,200,15 .byte 102,69,15,58,15,192,15 pxor %xmm1,%xmm7 pshufd $0xFF,%xmm0,%xmm0 .byte 102,15,58,15,192,1 _vpaes_schedule_low_round: movdqa %xmm7,%xmm1 pslldq $4,%xmm7 pxor %xmm1,%xmm7 movdqa %xmm7,%xmm1 pslldq $8,%xmm7 pxor %xmm1,%xmm7 pxor .Lk_s63(%rip),%xmm7 movdqa %xmm9,%xmm1 pandn %xmm0,%xmm1 psrld $4,%xmm1 pand %xmm9,%xmm0 movdqa %xmm11,%xmm2 .byte 102,15,56,0,208 pxor %xmm1,%xmm0 movdqa %xmm10,%xmm3 .byte 102,15,56,0,217 pxor %xmm2,%xmm3 movdqa %xmm10,%xmm4 .byte 102,15,56,0,224 pxor %xmm2,%xmm4 movdqa %xmm10,%xmm2 .byte 102,15,56,0,211 pxor %xmm0,%xmm2 movdqa %xmm10,%xmm3 .byte 102,15,56,0,220 pxor %xmm1,%xmm3 movdqa %xmm13,%xmm4 .byte 102,15,56,0,226 movdqa %xmm12,%xmm0 .byte 102,15,56,0,195 pxor %xmm4,%xmm0 pxor %xmm7,%xmm0 movdqa %xmm0,%xmm7 ret .cfi_endproc .size _vpaes_schedule_round,.-_vpaes_schedule_round .type _vpaes_schedule_transform,@function .align 16 _vpaes_schedule_transform: .cfi_startproc movdqa %xmm9,%xmm1 pandn %xmm0,%xmm1 psrld $4,%xmm1 pand %xmm9,%xmm0 movdqa (%r11),%xmm2 .byte 102,15,56,0,208 movdqa 16(%r11),%xmm0 .byte 102,15,56,0,193 pxor %xmm2,%xmm0 ret .cfi_endproc .size _vpaes_schedule_transform,.-_vpaes_schedule_transform .type _vpaes_schedule_mangle,@function .align 16 _vpaes_schedule_mangle: .cfi_startproc movdqa %xmm0,%xmm4 movdqa .Lk_mc_forward(%rip),%xmm5 addq $16,%rdx pxor .Lk_s63(%rip),%xmm4 .byte 102,15,56,0,229 movdqa %xmm4,%xmm3 .byte 102,15,56,0,229 pxor %xmm4,%xmm3 .byte 102,15,56,0,229 pxor %xmm4,%xmm3 .Lschedule_mangle_both: movdqa (%r8,%r10,1),%xmm1 .byte 102,15,56,0,217 addq $-16,%r8 andq $0x30,%r8 movdqu %xmm3,(%rdx) ret .cfi_endproc .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle .globl vpaes_set_encrypt_key .hidden vpaes_set_encrypt_key .type vpaes_set_encrypt_key,@function .align 16 vpaes_set_encrypt_key: .cfi_startproc _CET_ENDBR #ifdef BORINGSSL_DISPATCH_TEST .extern BORINGSSL_function_hit .hidden BORINGSSL_function_hit movb $1,BORINGSSL_function_hit+5(%rip) #endif movl %esi,%eax shrl $5,%eax addl $5,%eax movl %eax,240(%rdx) movl $0,%ecx movl $0x30,%r8d call _vpaes_schedule_core xorl %eax,%eax ret .cfi_endproc .size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key .globl vpaes_ctr32_encrypt_blocks .hidden vpaes_ctr32_encrypt_blocks .type vpaes_ctr32_encrypt_blocks,@function .align 16 vpaes_ctr32_encrypt_blocks: .cfi_startproc _CET_ENDBR xchgq %rcx,%rdx testq %rcx,%rcx jz .Lctr32_abort movdqu (%r8),%xmm0 movdqa .Lctr_add_one(%rip),%xmm8 subq %rdi,%rsi call _vpaes_preheat movdqa %xmm0,%xmm6 pshufb .Lrev_ctr(%rip),%xmm6 testq $1,%rcx jz .Lctr32_prep_loop movdqu (%rdi),%xmm7 call _vpaes_encrypt_core pxor %xmm7,%xmm0 paddd %xmm8,%xmm6 movdqu %xmm0,(%rsi,%rdi,1) subq $1,%rcx leaq 16(%rdi),%rdi jz .Lctr32_done .Lctr32_prep_loop: movdqa %xmm6,%xmm14 movdqa %xmm6,%xmm15 paddd %xmm8,%xmm15 .Lctr32_loop: movdqa .Lrev_ctr(%rip),%xmm1 movdqa %xmm14,%xmm0 movdqa %xmm15,%xmm6 .byte 102,15,56,0,193 .byte 102,15,56,0,241 call _vpaes_encrypt_core_2x movdqu (%rdi),%xmm1 movdqu 16(%rdi),%xmm2 movdqa .Lctr_add_two(%rip),%xmm3 pxor %xmm1,%xmm0 pxor %xmm2,%xmm6 paddd %xmm3,%xmm14 paddd %xmm3,%xmm15 movdqu %xmm0,(%rsi,%rdi,1) movdqu %xmm6,16(%rsi,%rdi,1) subq $2,%rcx leaq 32(%rdi),%rdi jnz .Lctr32_loop .Lctr32_done: .Lctr32_abort: ret .cfi_endproc .size 
vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks .type _vpaes_preheat,@function .align 16 _vpaes_preheat: .cfi_startproc leaq .Lk_s0F(%rip),%r10 movdqa -32(%r10),%xmm10 movdqa -16(%r10),%xmm11 movdqa 0(%r10),%xmm9 movdqa 48(%r10),%xmm13 movdqa 64(%r10),%xmm12 movdqa 80(%r10),%xmm15 movdqa 96(%r10),%xmm14 ret .cfi_endproc .size _vpaes_preheat,.-_vpaes_preheat .type _vpaes_consts,@object .section .rodata .align 64 _vpaes_consts: .Lk_inv: .quad 0x0E05060F0D080180, 0x040703090A0B0C02 .quad 0x01040A060F0B0780, 0x030D0E0C02050809 .Lk_s0F: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F .Lk_ipt: .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 .Lk_sb1: .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF .Lk_sb2: .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A .Lk_sbo: .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA .Lk_mc_forward: .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 .quad 0x080B0A0904070605, 0x000302010C0F0E0D .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 .quad 0x000302010C0F0E0D, 0x080B0A0904070605 .Lk_mc_backward: .quad 0x0605040702010003, 0x0E0D0C0F0A09080B .quad 0x020100030E0D0C0F, 0x0A09080B06050407 .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 .quad 0x0A09080B06050407, 0x020100030E0D0C0F .Lk_sr: .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 .quad 0x030E09040F0A0500, 0x0B06010C07020D08 .quad 0x0F060D040B020900, 0x070E050C030A0108 .quad 0x0B0E0104070A0D00, 0x0306090C0F020508 .Lk_rcon: .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 .Lk_s63: .quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B .Lk_opt: .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 .Lk_deskew: .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 .Lrev_ctr: .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 .Lctr_add_one: .quad 0x0000000000000000, 0x0000000100000000 .Lctr_add_two: .quad 0x0000000000000000, 0x0000000200000000 .byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 .align 64 .size _vpaes_consts,.-_vpaes_consts .text #endif ring-0.17.14/pregenerated/vpaes-x86_64-macosx.S000064400000000000000000000254471046102023000171040ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
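The vpaes_ctr32_encrypt_blocks entry points in this archive treat the last 32-bit word of the 16-byte IV as a big-endian block counter: the AArch64 code byte-reverses it with rev before incrementing, and the x86_64 code uses the rev_ctr shuffle together with the ctr_add_one/ctr_add_two constants. A minimal Rust sketch of that counter handling follows; next_counter_block is a hypothetical helper, not part of ring's API.

// Hypothetical illustration only: derive the i-th CTR32 block from a 16-byte
// IV whose final word is a big-endian counter that wraps modulo 2^32, which is
// the convention the assembly below follows.
fn next_counter_block(iv: &[u8; 16], i: u32) -> [u8; 16] {
    let mut block = *iv;
    let ctr = u32::from_be_bytes([iv[12], iv[13], iv[14], iv[15]]).wrapping_add(i);
    block[12..16].copy_from_slice(&ctr.to_be_bytes());
    block
}

The wrapping add matches CTR32 semantics: only the low 32 bits of the counter ever change, and the rest of the IV is carried through untouched.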
#include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) .text .p2align 4 _vpaes_encrypt_core: movq %rdx,%r9 movq $16,%r11 movl 240(%rdx),%eax movdqa %xmm9,%xmm1 movdqa L$k_ipt(%rip),%xmm2 pandn %xmm0,%xmm1 movdqu (%r9),%xmm5 psrld $4,%xmm1 pand %xmm9,%xmm0 .byte 102,15,56,0,208 movdqa L$k_ipt+16(%rip),%xmm0 .byte 102,15,56,0,193 pxor %xmm5,%xmm2 addq $16,%r9 pxor %xmm2,%xmm0 leaq L$k_mc_backward(%rip),%r10 jmp L$enc_entry .p2align 4 L$enc_loop: movdqa %xmm13,%xmm4 movdqa %xmm12,%xmm0 .byte 102,15,56,0,226 .byte 102,15,56,0,195 pxor %xmm5,%xmm4 movdqa %xmm15,%xmm5 pxor %xmm4,%xmm0 movdqa -64(%r11,%r10,1),%xmm1 .byte 102,15,56,0,234 movdqa (%r11,%r10,1),%xmm4 movdqa %xmm14,%xmm2 .byte 102,15,56,0,211 movdqa %xmm0,%xmm3 pxor %xmm5,%xmm2 .byte 102,15,56,0,193 addq $16,%r9 pxor %xmm2,%xmm0 .byte 102,15,56,0,220 addq $16,%r11 pxor %xmm0,%xmm3 .byte 102,15,56,0,193 andq $0x30,%r11 subq $1,%rax pxor %xmm3,%xmm0 L$enc_entry: movdqa %xmm9,%xmm1 movdqa %xmm11,%xmm5 pandn %xmm0,%xmm1 psrld $4,%xmm1 pand %xmm9,%xmm0 .byte 102,15,56,0,232 movdqa %xmm10,%xmm3 pxor %xmm1,%xmm0 .byte 102,15,56,0,217 movdqa %xmm10,%xmm4 pxor %xmm5,%xmm3 .byte 102,15,56,0,224 movdqa %xmm10,%xmm2 pxor %xmm5,%xmm4 .byte 102,15,56,0,211 movdqa %xmm10,%xmm3 pxor %xmm0,%xmm2 .byte 102,15,56,0,220 movdqu (%r9),%xmm5 pxor %xmm1,%xmm3 jnz L$enc_loop movdqa -96(%r10),%xmm4 movdqa -80(%r10),%xmm0 .byte 102,15,56,0,226 pxor %xmm5,%xmm4 .byte 102,15,56,0,195 movdqa 64(%r11,%r10,1),%xmm1 pxor %xmm4,%xmm0 .byte 102,15,56,0,193 ret .p2align 4 _vpaes_encrypt_core_2x: movq %rdx,%r9 movq $16,%r11 movl 240(%rdx),%eax movdqa %xmm9,%xmm1 movdqa %xmm9,%xmm7 movdqa L$k_ipt(%rip),%xmm2 movdqa %xmm2,%xmm8 pandn %xmm0,%xmm1 pandn %xmm6,%xmm7 movdqu (%r9),%xmm5 psrld $4,%xmm1 psrld $4,%xmm7 pand %xmm9,%xmm0 pand %xmm9,%xmm6 .byte 102,15,56,0,208 .byte 102,68,15,56,0,198 movdqa L$k_ipt+16(%rip),%xmm0 movdqa %xmm0,%xmm6 .byte 102,15,56,0,193 .byte 102,15,56,0,247 pxor %xmm5,%xmm2 pxor %xmm5,%xmm8 addq $16,%r9 pxor %xmm2,%xmm0 pxor %xmm8,%xmm6 leaq L$k_mc_backward(%rip),%r10 jmp L$enc2x_entry .p2align 4 L$enc2x_loop: movdqa L$k_sb1(%rip),%xmm4 movdqa L$k_sb1+16(%rip),%xmm0 movdqa %xmm4,%xmm12 movdqa %xmm0,%xmm6 .byte 102,15,56,0,226 .byte 102,69,15,56,0,224 .byte 102,15,56,0,195 .byte 102,65,15,56,0,243 pxor %xmm5,%xmm4 pxor %xmm5,%xmm12 movdqa L$k_sb2(%rip),%xmm5 movdqa %xmm5,%xmm13 pxor %xmm4,%xmm0 pxor %xmm12,%xmm6 movdqa -64(%r11,%r10,1),%xmm1 .byte 102,15,56,0,234 .byte 102,69,15,56,0,232 movdqa (%r11,%r10,1),%xmm4 movdqa L$k_sb2+16(%rip),%xmm2 movdqa %xmm2,%xmm8 .byte 102,15,56,0,211 .byte 102,69,15,56,0,195 movdqa %xmm0,%xmm3 movdqa %xmm6,%xmm11 pxor %xmm5,%xmm2 pxor %xmm13,%xmm8 .byte 102,15,56,0,193 .byte 102,15,56,0,241 addq $16,%r9 pxor %xmm2,%xmm0 pxor %xmm8,%xmm6 .byte 102,15,56,0,220 .byte 102,68,15,56,0,220 addq $16,%r11 pxor %xmm0,%xmm3 pxor %xmm6,%xmm11 .byte 102,15,56,0,193 .byte 102,15,56,0,241 andq $0x30,%r11 subq $1,%rax pxor %xmm3,%xmm0 pxor %xmm11,%xmm6 L$enc2x_entry: movdqa %xmm9,%xmm1 movdqa %xmm9,%xmm7 movdqa L$k_inv+16(%rip),%xmm5 movdqa %xmm5,%xmm13 pandn %xmm0,%xmm1 pandn %xmm6,%xmm7 psrld $4,%xmm1 psrld $4,%xmm7 pand %xmm9,%xmm0 pand %xmm9,%xmm6 .byte 102,15,56,0,232 .byte 102,68,15,56,0,238 movdqa %xmm10,%xmm3 movdqa %xmm10,%xmm11 pxor %xmm1,%xmm0 pxor %xmm7,%xmm6 .byte 102,15,56,0,217 .byte 102,68,15,56,0,223 movdqa %xmm10,%xmm4 movdqa %xmm10,%xmm12 pxor %xmm5,%xmm3 pxor %xmm13,%xmm11 .byte 102,15,56,0,224 .byte 102,68,15,56,0,230 movdqa %xmm10,%xmm2 movdqa %xmm10,%xmm8 pxor %xmm5,%xmm4 pxor 
%xmm13,%xmm12 .byte 102,15,56,0,211 .byte 102,69,15,56,0,195 movdqa %xmm10,%xmm3 movdqa %xmm10,%xmm11 pxor %xmm0,%xmm2 pxor %xmm6,%xmm8 .byte 102,15,56,0,220 .byte 102,69,15,56,0,220 movdqu (%r9),%xmm5 pxor %xmm1,%xmm3 pxor %xmm7,%xmm11 jnz L$enc2x_loop movdqa -96(%r10),%xmm4 movdqa -80(%r10),%xmm0 movdqa %xmm4,%xmm12 movdqa %xmm0,%xmm6 .byte 102,15,56,0,226 .byte 102,69,15,56,0,224 pxor %xmm5,%xmm4 pxor %xmm5,%xmm12 .byte 102,15,56,0,195 .byte 102,65,15,56,0,243 movdqa 64(%r11,%r10,1),%xmm1 pxor %xmm4,%xmm0 pxor %xmm12,%xmm6 .byte 102,15,56,0,193 .byte 102,15,56,0,241 ret .p2align 4 _vpaes_schedule_core: call _vpaes_preheat movdqa L$k_rcon(%rip),%xmm8 movdqu (%rdi),%xmm0 movdqa %xmm0,%xmm3 leaq L$k_ipt(%rip),%r11 call _vpaes_schedule_transform movdqa %xmm0,%xmm7 leaq L$k_sr(%rip),%r10 movdqu %xmm0,(%rdx) L$schedule_go: cmpl $192,%esi ja L$schedule_256 L$schedule_128: movl $10,%esi L$oop_schedule_128: call _vpaes_schedule_round decq %rsi jz L$schedule_mangle_last call _vpaes_schedule_mangle jmp L$oop_schedule_128 .p2align 4 L$schedule_256: movdqu 16(%rdi),%xmm0 call _vpaes_schedule_transform movl $7,%esi L$oop_schedule_256: call _vpaes_schedule_mangle movdqa %xmm0,%xmm6 call _vpaes_schedule_round decq %rsi jz L$schedule_mangle_last call _vpaes_schedule_mangle pshufd $0xFF,%xmm0,%xmm0 movdqa %xmm7,%xmm5 movdqa %xmm6,%xmm7 call _vpaes_schedule_low_round movdqa %xmm5,%xmm7 jmp L$oop_schedule_256 .p2align 4 L$schedule_mangle_last: leaq L$k_deskew(%rip),%r11 movdqa (%r8,%r10,1),%xmm1 .byte 102,15,56,0,193 leaq L$k_opt(%rip),%r11 addq $32,%rdx L$schedule_mangle_last_dec: addq $-16,%rdx pxor L$k_s63(%rip),%xmm0 call _vpaes_schedule_transform movdqu %xmm0,(%rdx) pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 pxor %xmm3,%xmm3 pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 pxor %xmm6,%xmm6 pxor %xmm7,%xmm7 ret .p2align 4 _vpaes_schedule_round: pxor %xmm1,%xmm1 .byte 102,65,15,58,15,200,15 .byte 102,69,15,58,15,192,15 pxor %xmm1,%xmm7 pshufd $0xFF,%xmm0,%xmm0 .byte 102,15,58,15,192,1 _vpaes_schedule_low_round: movdqa %xmm7,%xmm1 pslldq $4,%xmm7 pxor %xmm1,%xmm7 movdqa %xmm7,%xmm1 pslldq $8,%xmm7 pxor %xmm1,%xmm7 pxor L$k_s63(%rip),%xmm7 movdqa %xmm9,%xmm1 pandn %xmm0,%xmm1 psrld $4,%xmm1 pand %xmm9,%xmm0 movdqa %xmm11,%xmm2 .byte 102,15,56,0,208 pxor %xmm1,%xmm0 movdqa %xmm10,%xmm3 .byte 102,15,56,0,217 pxor %xmm2,%xmm3 movdqa %xmm10,%xmm4 .byte 102,15,56,0,224 pxor %xmm2,%xmm4 movdqa %xmm10,%xmm2 .byte 102,15,56,0,211 pxor %xmm0,%xmm2 movdqa %xmm10,%xmm3 .byte 102,15,56,0,220 pxor %xmm1,%xmm3 movdqa %xmm13,%xmm4 .byte 102,15,56,0,226 movdqa %xmm12,%xmm0 .byte 102,15,56,0,195 pxor %xmm4,%xmm0 pxor %xmm7,%xmm0 movdqa %xmm0,%xmm7 ret .p2align 4 _vpaes_schedule_transform: movdqa %xmm9,%xmm1 pandn %xmm0,%xmm1 psrld $4,%xmm1 pand %xmm9,%xmm0 movdqa (%r11),%xmm2 .byte 102,15,56,0,208 movdqa 16(%r11),%xmm0 .byte 102,15,56,0,193 pxor %xmm2,%xmm0 ret .p2align 4 _vpaes_schedule_mangle: movdqa %xmm0,%xmm4 movdqa L$k_mc_forward(%rip),%xmm5 addq $16,%rdx pxor L$k_s63(%rip),%xmm4 .byte 102,15,56,0,229 movdqa %xmm4,%xmm3 .byte 102,15,56,0,229 pxor %xmm4,%xmm3 .byte 102,15,56,0,229 pxor %xmm4,%xmm3 L$schedule_mangle_both: movdqa (%r8,%r10,1),%xmm1 .byte 102,15,56,0,217 addq $-16,%r8 andq $0x30,%r8 movdqu %xmm3,(%rdx) ret .globl _vpaes_set_encrypt_key .private_extern _vpaes_set_encrypt_key .p2align 4 _vpaes_set_encrypt_key: _CET_ENDBR #ifdef BORINGSSL_DISPATCH_TEST movb $1,_BORINGSSL_function_hit+5(%rip) #endif movl %esi,%eax shrl $5,%eax addl $5,%eax movl %eax,240(%rdx) movl $0,%ecx movl $0x30,%r8d call _vpaes_schedule_core 
xorl %eax,%eax ret .globl _vpaes_ctr32_encrypt_blocks .private_extern _vpaes_ctr32_encrypt_blocks .p2align 4 _vpaes_ctr32_encrypt_blocks: _CET_ENDBR xchgq %rcx,%rdx testq %rcx,%rcx jz L$ctr32_abort movdqu (%r8),%xmm0 movdqa L$ctr_add_one(%rip),%xmm8 subq %rdi,%rsi call _vpaes_preheat movdqa %xmm0,%xmm6 pshufb L$rev_ctr(%rip),%xmm6 testq $1,%rcx jz L$ctr32_prep_loop movdqu (%rdi),%xmm7 call _vpaes_encrypt_core pxor %xmm7,%xmm0 paddd %xmm8,%xmm6 movdqu %xmm0,(%rsi,%rdi,1) subq $1,%rcx leaq 16(%rdi),%rdi jz L$ctr32_done L$ctr32_prep_loop: movdqa %xmm6,%xmm14 movdqa %xmm6,%xmm15 paddd %xmm8,%xmm15 L$ctr32_loop: movdqa L$rev_ctr(%rip),%xmm1 movdqa %xmm14,%xmm0 movdqa %xmm15,%xmm6 .byte 102,15,56,0,193 .byte 102,15,56,0,241 call _vpaes_encrypt_core_2x movdqu (%rdi),%xmm1 movdqu 16(%rdi),%xmm2 movdqa L$ctr_add_two(%rip),%xmm3 pxor %xmm1,%xmm0 pxor %xmm2,%xmm6 paddd %xmm3,%xmm14 paddd %xmm3,%xmm15 movdqu %xmm0,(%rsi,%rdi,1) movdqu %xmm6,16(%rsi,%rdi,1) subq $2,%rcx leaq 32(%rdi),%rdi jnz L$ctr32_loop L$ctr32_done: L$ctr32_abort: ret .p2align 4 _vpaes_preheat: leaq L$k_s0F(%rip),%r10 movdqa -32(%r10),%xmm10 movdqa -16(%r10),%xmm11 movdqa 0(%r10),%xmm9 movdqa 48(%r10),%xmm13 movdqa 64(%r10),%xmm12 movdqa 80(%r10),%xmm15 movdqa 96(%r10),%xmm14 ret .section __DATA,__const .p2align 6 _vpaes_consts: L$k_inv: .quad 0x0E05060F0D080180, 0x040703090A0B0C02 .quad 0x01040A060F0B0780, 0x030D0E0C02050809 L$k_s0F: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F L$k_ipt: .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 L$k_sb1: .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF L$k_sb2: .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A L$k_sbo: .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA L$k_mc_forward: .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 .quad 0x080B0A0904070605, 0x000302010C0F0E0D .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 .quad 0x000302010C0F0E0D, 0x080B0A0904070605 L$k_mc_backward: .quad 0x0605040702010003, 0x0E0D0C0F0A09080B .quad 0x020100030E0D0C0F, 0x0A09080B06050407 .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 .quad 0x0A09080B06050407, 0x020100030E0D0C0F L$k_sr: .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 .quad 0x030E09040F0A0500, 0x0B06010C07020D08 .quad 0x0F060D040B020900, 0x070E050C030A0108 .quad 0x0B0E0104070A0D00, 0x0306090C0F020508 L$k_rcon: .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 L$k_s63: .quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B L$k_opt: .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 L$k_deskew: .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 L$rev_ctr: .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 L$ctr_add_one: .quad 0x0000000000000000, 0x0000000100000000 L$ctr_add_two: .quad 0x0000000000000000, 0x0000000200000000 .byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 .p2align 6 .text #endif ring-0.17.14/pregenerated/vpaes-x86_64-nasm.asm000064400000000000000000000342121046102023000171140ustar 00000000000000; This file is generated from a similarly-named Perl script in the BoringSSL ; source tree. Do not edit by hand. 
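The _vpaes_schedule_mangle routine in the listing below implements the step documented in the AArch64 source earlier in this archive: XOR the round key with 0x63, then multiply by the circulant 0,1,1,1, i.e. XOR three successive one-byte column rotations of the masked key (the mc_forward shuffles), before the final shift-rows permutation. A rough Rust sketch of that transform follows, ignoring the trailing shift-rows step; schedule_mangle_enc is illustrative only and not a function in ring.

// Sketch only: the encrypt-side "mangle" of the vpaes key schedule. The real
// code additionally applies a shift-rows permutation selected by the round
// counter before storing the result.
fn schedule_mangle_enc(key: [u8; 16]) -> [u8; 16] {
    // Rotate each 4-byte column left by one byte (the mc_forward shuffle).
    fn rot(x: [u8; 16]) -> [u8; 16] {
        let mut y = [0u8; 16];
        for col in 0..4 {
            for j in 0..4 {
                y[4 * col + j] = x[4 * col + (j + 1) % 4];
            }
        }
        y
    }
    // XOR with 0x63, then multiply by the circulant (0,1,1,1):
    // out = rot(t) ^ rot^2(t) ^ rot^3(t).
    let mut t = key;
    for b in t.iter_mut() {
        *b ^= 0x63;
    }
    let r1 = rot(t);
    let r2 = rot(r1);
    let r3 = rot(r2);
    let mut out = [0u8; 16];
    for i in 0..16 {
        out[i] = r1[i] ^ r2[i] ^ r3[i];
    }
    out
}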
%ifidn __OUTPUT_FORMAT__, win64 default rel %define XMMWORD %define YMMWORD %define ZMMWORD %define _CET_ENDBR %include "ring_core_generated/prefix_symbols_nasm.inc" section .text code align=64 ALIGN 16 _vpaes_encrypt_core: mov r9,rdx mov r11,16 mov eax,DWORD[240+rdx] movdqa xmm1,xmm9 movdqa xmm2,XMMWORD[$L$k_ipt] pandn xmm1,xmm0 movdqu xmm5,XMMWORD[r9] psrld xmm1,4 pand xmm0,xmm9 DB 102,15,56,0,208 movdqa xmm0,XMMWORD[(($L$k_ipt+16))] DB 102,15,56,0,193 pxor xmm2,xmm5 add r9,16 pxor xmm0,xmm2 lea r10,[$L$k_mc_backward] jmp NEAR $L$enc_entry ALIGN 16 $L$enc_loop: movdqa xmm4,xmm13 movdqa xmm0,xmm12 DB 102,15,56,0,226 DB 102,15,56,0,195 pxor xmm4,xmm5 movdqa xmm5,xmm15 pxor xmm0,xmm4 movdqa xmm1,XMMWORD[((-64))+r10*1+r11] DB 102,15,56,0,234 movdqa xmm4,XMMWORD[r10*1+r11] movdqa xmm2,xmm14 DB 102,15,56,0,211 movdqa xmm3,xmm0 pxor xmm2,xmm5 DB 102,15,56,0,193 add r9,16 pxor xmm0,xmm2 DB 102,15,56,0,220 add r11,16 pxor xmm3,xmm0 DB 102,15,56,0,193 and r11,0x30 sub rax,1 pxor xmm0,xmm3 $L$enc_entry: movdqa xmm1,xmm9 movdqa xmm5,xmm11 pandn xmm1,xmm0 psrld xmm1,4 pand xmm0,xmm9 DB 102,15,56,0,232 movdqa xmm3,xmm10 pxor xmm0,xmm1 DB 102,15,56,0,217 movdqa xmm4,xmm10 pxor xmm3,xmm5 DB 102,15,56,0,224 movdqa xmm2,xmm10 pxor xmm4,xmm5 DB 102,15,56,0,211 movdqa xmm3,xmm10 pxor xmm2,xmm0 DB 102,15,56,0,220 movdqu xmm5,XMMWORD[r9] pxor xmm3,xmm1 jnz NEAR $L$enc_loop movdqa xmm4,XMMWORD[((-96))+r10] movdqa xmm0,XMMWORD[((-80))+r10] DB 102,15,56,0,226 pxor xmm4,xmm5 DB 102,15,56,0,195 movdqa xmm1,XMMWORD[64+r10*1+r11] pxor xmm0,xmm4 DB 102,15,56,0,193 ret ALIGN 16 _vpaes_encrypt_core_2x: mov r9,rdx mov r11,16 mov eax,DWORD[240+rdx] movdqa xmm1,xmm9 movdqa xmm7,xmm9 movdqa xmm2,XMMWORD[$L$k_ipt] movdqa xmm8,xmm2 pandn xmm1,xmm0 pandn xmm7,xmm6 movdqu xmm5,XMMWORD[r9] psrld xmm1,4 psrld xmm7,4 pand xmm0,xmm9 pand xmm6,xmm9 DB 102,15,56,0,208 DB 102,68,15,56,0,198 movdqa xmm0,XMMWORD[(($L$k_ipt+16))] movdqa xmm6,xmm0 DB 102,15,56,0,193 DB 102,15,56,0,247 pxor xmm2,xmm5 pxor xmm8,xmm5 add r9,16 pxor xmm0,xmm2 pxor xmm6,xmm8 lea r10,[$L$k_mc_backward] jmp NEAR $L$enc2x_entry ALIGN 16 $L$enc2x_loop: movdqa xmm4,XMMWORD[$L$k_sb1] movdqa xmm0,XMMWORD[(($L$k_sb1+16))] movdqa xmm12,xmm4 movdqa xmm6,xmm0 DB 102,15,56,0,226 DB 102,69,15,56,0,224 DB 102,15,56,0,195 DB 102,65,15,56,0,243 pxor xmm4,xmm5 pxor xmm12,xmm5 movdqa xmm5,XMMWORD[$L$k_sb2] movdqa xmm13,xmm5 pxor xmm0,xmm4 pxor xmm6,xmm12 movdqa xmm1,XMMWORD[((-64))+r10*1+r11] DB 102,15,56,0,234 DB 102,69,15,56,0,232 movdqa xmm4,XMMWORD[r10*1+r11] movdqa xmm2,XMMWORD[(($L$k_sb2+16))] movdqa xmm8,xmm2 DB 102,15,56,0,211 DB 102,69,15,56,0,195 movdqa xmm3,xmm0 movdqa xmm11,xmm6 pxor xmm2,xmm5 pxor xmm8,xmm13 DB 102,15,56,0,193 DB 102,15,56,0,241 add r9,16 pxor xmm0,xmm2 pxor xmm6,xmm8 DB 102,15,56,0,220 DB 102,68,15,56,0,220 add r11,16 pxor xmm3,xmm0 pxor xmm11,xmm6 DB 102,15,56,0,193 DB 102,15,56,0,241 and r11,0x30 sub rax,1 pxor xmm0,xmm3 pxor xmm6,xmm11 $L$enc2x_entry: movdqa xmm1,xmm9 movdqa xmm7,xmm9 movdqa xmm5,XMMWORD[(($L$k_inv+16))] movdqa xmm13,xmm5 pandn xmm1,xmm0 pandn xmm7,xmm6 psrld xmm1,4 psrld xmm7,4 pand xmm0,xmm9 pand xmm6,xmm9 DB 102,15,56,0,232 DB 102,68,15,56,0,238 movdqa xmm3,xmm10 movdqa xmm11,xmm10 pxor xmm0,xmm1 pxor xmm6,xmm7 DB 102,15,56,0,217 DB 102,68,15,56,0,223 movdqa xmm4,xmm10 movdqa xmm12,xmm10 pxor xmm3,xmm5 pxor xmm11,xmm13 DB 102,15,56,0,224 DB 102,68,15,56,0,230 movdqa xmm2,xmm10 movdqa xmm8,xmm10 pxor xmm4,xmm5 pxor xmm12,xmm13 DB 102,15,56,0,211 DB 102,69,15,56,0,195 movdqa xmm3,xmm10 movdqa xmm11,xmm10 pxor xmm2,xmm0 pxor 
xmm8,xmm6 DB 102,15,56,0,220 DB 102,69,15,56,0,220 movdqu xmm5,XMMWORD[r9] pxor xmm3,xmm1 pxor xmm11,xmm7 jnz NEAR $L$enc2x_loop movdqa xmm4,XMMWORD[((-96))+r10] movdqa xmm0,XMMWORD[((-80))+r10] movdqa xmm12,xmm4 movdqa xmm6,xmm0 DB 102,15,56,0,226 DB 102,69,15,56,0,224 pxor xmm4,xmm5 pxor xmm12,xmm5 DB 102,15,56,0,195 DB 102,65,15,56,0,243 movdqa xmm1,XMMWORD[64+r10*1+r11] pxor xmm0,xmm4 pxor xmm6,xmm12 DB 102,15,56,0,193 DB 102,15,56,0,241 ret ALIGN 16 _vpaes_schedule_core: call _vpaes_preheat movdqa xmm8,XMMWORD[$L$k_rcon] movdqu xmm0,XMMWORD[rdi] movdqa xmm3,xmm0 lea r11,[$L$k_ipt] call _vpaes_schedule_transform movdqa xmm7,xmm0 lea r10,[$L$k_sr] movdqu XMMWORD[rdx],xmm0 $L$schedule_go: cmp esi,192 ja NEAR $L$schedule_256 $L$schedule_128: mov esi,10 $L$oop_schedule_128: call _vpaes_schedule_round dec rsi jz NEAR $L$schedule_mangle_last call _vpaes_schedule_mangle jmp NEAR $L$oop_schedule_128 ALIGN 16 $L$schedule_256: movdqu xmm0,XMMWORD[16+rdi] call _vpaes_schedule_transform mov esi,7 $L$oop_schedule_256: call _vpaes_schedule_mangle movdqa xmm6,xmm0 call _vpaes_schedule_round dec rsi jz NEAR $L$schedule_mangle_last call _vpaes_schedule_mangle pshufd xmm0,xmm0,0xFF movdqa xmm5,xmm7 movdqa xmm7,xmm6 call _vpaes_schedule_low_round movdqa xmm7,xmm5 jmp NEAR $L$oop_schedule_256 ALIGN 16 $L$schedule_mangle_last: lea r11,[$L$k_deskew] movdqa xmm1,XMMWORD[r10*1+r8] DB 102,15,56,0,193 lea r11,[$L$k_opt] add rdx,32 $L$schedule_mangle_last_dec: add rdx,-16 pxor xmm0,XMMWORD[$L$k_s63] call _vpaes_schedule_transform movdqu XMMWORD[rdx],xmm0 pxor xmm0,xmm0 pxor xmm1,xmm1 pxor xmm2,xmm2 pxor xmm3,xmm3 pxor xmm4,xmm4 pxor xmm5,xmm5 pxor xmm6,xmm6 pxor xmm7,xmm7 ret ALIGN 16 _vpaes_schedule_round: pxor xmm1,xmm1 DB 102,65,15,58,15,200,15 DB 102,69,15,58,15,192,15 pxor xmm7,xmm1 pshufd xmm0,xmm0,0xFF DB 102,15,58,15,192,1 _vpaes_schedule_low_round: movdqa xmm1,xmm7 pslldq xmm7,4 pxor xmm7,xmm1 movdqa xmm1,xmm7 pslldq xmm7,8 pxor xmm7,xmm1 pxor xmm7,XMMWORD[$L$k_s63] movdqa xmm1,xmm9 pandn xmm1,xmm0 psrld xmm1,4 pand xmm0,xmm9 movdqa xmm2,xmm11 DB 102,15,56,0,208 pxor xmm0,xmm1 movdqa xmm3,xmm10 DB 102,15,56,0,217 pxor xmm3,xmm2 movdqa xmm4,xmm10 DB 102,15,56,0,224 pxor xmm4,xmm2 movdqa xmm2,xmm10 DB 102,15,56,0,211 pxor xmm2,xmm0 movdqa xmm3,xmm10 DB 102,15,56,0,220 pxor xmm3,xmm1 movdqa xmm4,xmm13 DB 102,15,56,0,226 movdqa xmm0,xmm12 DB 102,15,56,0,195 pxor xmm0,xmm4 pxor xmm0,xmm7 movdqa xmm7,xmm0 ret ALIGN 16 _vpaes_schedule_transform: movdqa xmm1,xmm9 pandn xmm1,xmm0 psrld xmm1,4 pand xmm0,xmm9 movdqa xmm2,XMMWORD[r11] DB 102,15,56,0,208 movdqa xmm0,XMMWORD[16+r11] DB 102,15,56,0,193 pxor xmm0,xmm2 ret ALIGN 16 _vpaes_schedule_mangle: movdqa xmm4,xmm0 movdqa xmm5,XMMWORD[$L$k_mc_forward] add rdx,16 pxor xmm4,XMMWORD[$L$k_s63] DB 102,15,56,0,229 movdqa xmm3,xmm4 DB 102,15,56,0,229 pxor xmm3,xmm4 DB 102,15,56,0,229 pxor xmm3,xmm4 $L$schedule_mangle_both: movdqa xmm1,XMMWORD[r10*1+r8] DB 102,15,56,0,217 add r8,-16 and r8,0x30 movdqu XMMWORD[rdx],xmm3 ret global vpaes_set_encrypt_key ALIGN 16 vpaes_set_encrypt_key: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_vpaes_set_encrypt_key: mov rdi,rcx mov rsi,rdx mov rdx,r8 _CET_ENDBR %ifdef BORINGSSL_DISPATCH_TEST EXTERN BORINGSSL_function_hit mov BYTE[((BORINGSSL_function_hit+5))],1 %endif lea rsp,[((-184))+rsp] movaps XMMWORD[16+rsp],xmm6 movaps XMMWORD[32+rsp],xmm7 movaps XMMWORD[48+rsp],xmm8 movaps XMMWORD[64+rsp],xmm9 movaps XMMWORD[80+rsp],xmm10 movaps XMMWORD[96+rsp],xmm11 movaps XMMWORD[112+rsp],xmm12 movaps 
XMMWORD[128+rsp],xmm13 movaps XMMWORD[144+rsp],xmm14 movaps XMMWORD[160+rsp],xmm15 $L$enc_key_body: mov eax,esi shr eax,5 add eax,5 mov DWORD[240+rdx],eax mov ecx,0 mov r8d,0x30 call _vpaes_schedule_core movaps xmm6,XMMWORD[16+rsp] movaps xmm7,XMMWORD[32+rsp] movaps xmm8,XMMWORD[48+rsp] movaps xmm9,XMMWORD[64+rsp] movaps xmm10,XMMWORD[80+rsp] movaps xmm11,XMMWORD[96+rsp] movaps xmm12,XMMWORD[112+rsp] movaps xmm13,XMMWORD[128+rsp] movaps xmm14,XMMWORD[144+rsp] movaps xmm15,XMMWORD[160+rsp] lea rsp,[184+rsp] $L$enc_key_epilogue: xor eax,eax mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$SEH_end_vpaes_set_encrypt_key: global vpaes_ctr32_encrypt_blocks ALIGN 16 vpaes_ctr32_encrypt_blocks: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_vpaes_ctr32_encrypt_blocks: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD[40+rsp] _CET_ENDBR xchg rdx,rcx test rcx,rcx jz NEAR $L$ctr32_abort lea rsp,[((-184))+rsp] movaps XMMWORD[16+rsp],xmm6 movaps XMMWORD[32+rsp],xmm7 movaps XMMWORD[48+rsp],xmm8 movaps XMMWORD[64+rsp],xmm9 movaps XMMWORD[80+rsp],xmm10 movaps XMMWORD[96+rsp],xmm11 movaps XMMWORD[112+rsp],xmm12 movaps XMMWORD[128+rsp],xmm13 movaps XMMWORD[144+rsp],xmm14 movaps XMMWORD[160+rsp],xmm15 $L$ctr32_body: movdqu xmm0,XMMWORD[r8] movdqa xmm8,XMMWORD[$L$ctr_add_one] sub rsi,rdi call _vpaes_preheat movdqa xmm6,xmm0 pshufb xmm6,XMMWORD[$L$rev_ctr] test rcx,1 jz NEAR $L$ctr32_prep_loop movdqu xmm7,XMMWORD[rdi] call _vpaes_encrypt_core pxor xmm0,xmm7 paddd xmm6,xmm8 movdqu XMMWORD[rdi*1+rsi],xmm0 sub rcx,1 lea rdi,[16+rdi] jz NEAR $L$ctr32_done $L$ctr32_prep_loop: movdqa xmm14,xmm6 movdqa xmm15,xmm6 paddd xmm15,xmm8 $L$ctr32_loop: movdqa xmm1,XMMWORD[$L$rev_ctr] movdqa xmm0,xmm14 movdqa xmm6,xmm15 DB 102,15,56,0,193 DB 102,15,56,0,241 call _vpaes_encrypt_core_2x movdqu xmm1,XMMWORD[rdi] movdqu xmm2,XMMWORD[16+rdi] movdqa xmm3,XMMWORD[$L$ctr_add_two] pxor xmm0,xmm1 pxor xmm6,xmm2 paddd xmm14,xmm3 paddd xmm15,xmm3 movdqu XMMWORD[rdi*1+rsi],xmm0 movdqu XMMWORD[16+rdi*1+rsi],xmm6 sub rcx,2 lea rdi,[32+rdi] jnz NEAR $L$ctr32_loop $L$ctr32_done: movaps xmm6,XMMWORD[16+rsp] movaps xmm7,XMMWORD[32+rsp] movaps xmm8,XMMWORD[48+rsp] movaps xmm9,XMMWORD[64+rsp] movaps xmm10,XMMWORD[80+rsp] movaps xmm11,XMMWORD[96+rsp] movaps xmm12,XMMWORD[112+rsp] movaps xmm13,XMMWORD[128+rsp] movaps xmm14,XMMWORD[144+rsp] movaps xmm15,XMMWORD[160+rsp] lea rsp,[184+rsp] $L$ctr32_epilogue: $L$ctr32_abort: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$SEH_end_vpaes_ctr32_encrypt_blocks: ALIGN 16 _vpaes_preheat: lea r10,[$L$k_s0F] movdqa xmm10,XMMWORD[((-32))+r10] movdqa xmm11,XMMWORD[((-16))+r10] movdqa xmm9,XMMWORD[r10] movdqa xmm13,XMMWORD[48+r10] movdqa xmm12,XMMWORD[64+r10] movdqa xmm15,XMMWORD[80+r10] movdqa xmm14,XMMWORD[96+r10] ret section .rdata rdata align=8 ALIGN 64 _vpaes_consts: $L$k_inv: DQ 0x0E05060F0D080180,0x040703090A0B0C02 DQ 0x01040A060F0B0780,0x030D0E0C02050809 $L$k_s0F: DQ 0x0F0F0F0F0F0F0F0F,0x0F0F0F0F0F0F0F0F $L$k_ipt: DQ 0xC2B2E8985A2A7000,0xCABAE09052227808 DQ 0x4C01307D317C4D00,0xCD80B1FCB0FDCC81 $L$k_sb1: DQ 0xB19BE18FCB503E00,0xA5DF7A6E142AF544 DQ 0x3618D415FAE22300,0x3BF7CCC10D2ED9EF $L$k_sb2: DQ 0xE27A93C60B712400,0x5EB7E955BC982FCD DQ 0x69EB88400AE12900,0xC2A163C8AB82234A $L$k_sbo: DQ 0xD0D26D176FBDC700,0x15AABF7AC502A878 DQ 0xCFE474A55FBB6A00,0x8E1E90D1412B35FA $L$k_mc_forward: DQ 0x0407060500030201,0x0C0F0E0D080B0A09 DQ 0x080B0A0904070605,0x000302010C0F0E0D DQ 0x0C0F0E0D080B0A09,0x0407060500030201 DQ 
0x000302010C0F0E0D,0x080B0A0904070605 $L$k_mc_backward: DQ 0x0605040702010003,0x0E0D0C0F0A09080B DQ 0x020100030E0D0C0F,0x0A09080B06050407 DQ 0x0E0D0C0F0A09080B,0x0605040702010003 DQ 0x0A09080B06050407,0x020100030E0D0C0F $L$k_sr: DQ 0x0706050403020100,0x0F0E0D0C0B0A0908 DQ 0x030E09040F0A0500,0x0B06010C07020D08 DQ 0x0F060D040B020900,0x070E050C030A0108 DQ 0x0B0E0104070A0D00,0x0306090C0F020508 $L$k_rcon: DQ 0x1F8391B9AF9DEEB6,0x702A98084D7C7D81 $L$k_s63: DQ 0x5B5B5B5B5B5B5B5B,0x5B5B5B5B5B5B5B5B $L$k_opt: DQ 0xFF9F4929D6B66000,0xF7974121DEBE6808 DQ 0x01EDBD5150BCEC00,0xE10D5DB1B05C0CE0 $L$k_deskew: DQ 0x07E4A34047A4E300,0x1DFEB95A5DBEF91A DQ 0x5F36B5DC83EA6900,0x2841C2ABF49D1E77 $L$rev_ctr: DQ 0x0706050403020100,0x0c0d0e0f0b0a0908 $L$ctr_add_one: DQ 0x0000000000000000,0x0000000100000000 $L$ctr_add_two: DQ 0x0000000000000000,0x0000000200000000 DB 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105 DB 111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54 DB 52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97 DB 109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32 DB 85,110,105,118,101,114,115,105,116,121,41,0 ALIGN 64 section .text EXTERN __imp_RtlVirtualUnwind ALIGN 16 se_handler: push rsi push rdi push rbx push rbp push r12 push r13 push r14 push r15 pushfq sub rsp,64 mov rax,QWORD[120+r8] mov rbx,QWORD[248+r8] mov rsi,QWORD[8+r9] mov r11,QWORD[56+r9] mov r10d,DWORD[r11] lea r10,[r10*1+rsi] cmp rbx,r10 jb NEAR $L$in_prologue mov rax,QWORD[152+r8] mov r10d,DWORD[4+r11] lea r10,[r10*1+rsi] cmp rbx,r10 jae NEAR $L$in_prologue lea rsi,[16+rax] lea rdi,[512+r8] mov ecx,20 DD 0xa548f3fc lea rax,[184+rax] $L$in_prologue: mov rdi,QWORD[8+rax] mov rsi,QWORD[16+rax] mov QWORD[152+r8],rax mov QWORD[168+r8],rsi mov QWORD[176+r8],rdi mov rdi,QWORD[40+r9] mov rsi,r8 mov ecx,154 DD 0xa548f3fc mov rsi,r9 xor rcx,rcx mov rdx,QWORD[8+rsi] mov r8,QWORD[rsi] mov r9,QWORD[16+rsi] mov r10,QWORD[40+rsi] lea r11,[56+rsi] lea r12,[24+rsi] mov QWORD[32+rsp],r10 mov QWORD[40+rsp],r11 mov QWORD[48+rsp],r12 mov QWORD[56+rsp],rcx call QWORD[__imp_RtlVirtualUnwind] mov eax,1 add rsp,64 popfq pop r15 pop r14 pop r13 pop r12 pop rbp pop rbx pop rdi pop rsi ret section .pdata rdata align=4 ALIGN 4 DD $L$SEH_begin_vpaes_set_encrypt_key wrt ..imagebase DD $L$SEH_end_vpaes_set_encrypt_key wrt ..imagebase DD $L$SEH_info_vpaes_set_encrypt_key wrt ..imagebase DD $L$SEH_begin_vpaes_ctr32_encrypt_blocks wrt ..imagebase DD $L$SEH_end_vpaes_ctr32_encrypt_blocks wrt ..imagebase DD $L$SEH_info_vpaes_ctr32_encrypt_blocks wrt ..imagebase section .xdata rdata align=8 ALIGN 8 $L$SEH_info_vpaes_set_encrypt_key: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$enc_key_body wrt ..imagebase,$L$enc_key_epilogue wrt ..imagebase $L$SEH_info_vpaes_ctr32_encrypt_blocks: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$ctr32_body wrt ..imagebase,$L$ctr32_epilogue wrt ..imagebase %else ; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 ret %endif ring-0.17.14/pregenerated/vpaes-x86_64-nasm.o000064400000000000000000000316651046102023000166030ustar 00000000000000dg+G.debug$S`dp@B.debug$T8@B.text & p`.rdata'R*@p@.pdataR*j*@0@.xdata **@@@6C:\Users\b\p\ring\pregenerated\vpaes-x86_64-nasm.asm|?@ABCDEFGHIJKLMNRSTUVWXYZ[\] ^_`ab c%d*e.f4i:j@kElImNnUoYp^q_`cioty #(059>EJPV^chnrw{ %*/38=CHMQV[afkoty     *23478 9:<?!B'C-P2S7T:U@VEWJcPeUfZg_jdkhnmoppvq{tuvwxz !%*.27;CHLQV[`dinrw| %(),-. 
/0123#6)7.8296::;;D@FEGJHMJPKSLVV^WcXhYnZt[z\]^_`bcdeghijklmnopqrst v wxy~ %*-0369>ADJRW\bhntz #+/38=BHLPV[`flrx~HJKLMNOPQRS U V X Y [ \ ] ^$ `+ b/ c3 d6 e< g@ hG iL jP kW n[ o_ pf qm rt tx u{ v w y z { | } ~                   9C:\Users\b\p\ring\pregenerated\vpaes-x86_64-nasm.o4'The Netwide Assembler 2.13.03_vpaes_encrypt_coreL$enc_loopL$enc_entry _vpaes_encrypt_core_2xL$enc2x_loopL$enc2x_entry_vpaes_schedule_coreL$schedule_goL$schedule_128L$oop_schedule_128L$schedule_256L$oop_schedule_256 L$schedule_mangle_last$L$schedule_mangle_last_dec_vpaes_schedule_round#_vpaes_schedule_low_round#_vpaes_schedule_transform _vpaes_schedule_mangle L$schedule_mangle_both2ring_core_0_17_14__vpaes_set_encrypt_key+L$SEH_begin_vpaes_set_encrypt_keyL$enc_key_bodyL$enc_key_epilogue)L$SEH_end_vpaes_set_encrypt_key7ring_core_0_17_14__vpaes_ctr32_encrypt_blocks0L$SEH_begin_vpaes_ctr32_encrypt_blocksL$ctr32_bodyL$ctr32_prep_loopL$ctr32_loopL$ctr32_doneL$ctr32_epilogueL$ctr32_abort.L$SEH_end_vpaes_ctr32_encrypt_blocks_vpaes_preheat _vpaes_consts #L$k_inv #L$k_s0F #L$k_ipt #L$k_sb1 #L$k_sb2 #L$k_sbo #L$k_mc_forward #L$k_mc_backward #L$k_sr #L$k_rcon #L$k_s63 #L$k_opt #L$k_deskew #L$rev_ctr #L$ctr_add_one #L$ctr_add_twose_handlerL$in_prologue-  L$SEH_info_vpaes_set_encrypt_key2  L$SEH_info_vpaes_ctr32_encrypt_blocksl p       - 1 O S g k             ) - K O q u     ! ! " " # $# T$ X$ % % & & ' ' ( ( ) !) O* S* g+ k+ , , - - . . / / 0 0 1 1 72 ;2 S3 W3 i4 m4 5 5 6 6 7 7 8 8 9 9 : : ; ; '< +< >= B= T> X> j? n? @ @ A A B B C C D D E E 1F 5F IAfAofo0fAo)frfAf8fo@f8fIfLwfAofAof8f8ffAoffCoLf8fCo$fAof8foff8Iff8Iff8I0HffAofAoffrfAf8fAoff8fAoff8fAoff8fAoff8Ao)f,fAobfAoBf8ff8fCoL@ff8ÐIAfAofAofo0fDoffAo)frfrfAfAf8fD8fo@fof8f8ffDIffALfo%Pfo`fDofof8fE8f8fA8ffDfo-pfDoffAfCoLf8fE8fCo$fofDof8fE8fofDoffEf8f8IffAf8fD8IffDf8f8I0HffAfAofAofo-fDofffrfrfAfAf8fD8fAofEofff8fD8fAofEoffEf8fD8fAofEoffEf8fE8fAofEoffDf8fE8Ao)ffD]fAobfAoBfDofof8fE8ffDf8fA8fCoL@ffAf8f8ÐfDopofoL0foL0# H`oGffoH*ufpfofofoLfCo f8LH HfffffffffÐffA:fE:ffpf:fofsffofsff=fAoffrfAfAof8ffAof8ffAof8ffAof8ffAof8ffAof8fAof8fffoÐfAoffrfAfAof8fAoCf8fÐfofo-Hf%f8fof8ff8ffCo f8II0ÐH|$Ht$HHHLH$H)t$)|$ D)D$0D)L$@D)T$PD)\$`D)d$pD)$D)$D)$A01(t$(|$ D(D$0D(L$@D(T$PD(\$`D(d$pD($D($D($H$1H|$Ht$ÐH|$Ht$HHHLLLD$(HHWH$H)t$)|$ D)D$0D)L$@D)T$PD)\$`D)d$pD)$D)$D)$AofDoH)fof85H%o?2ffA>HHlfDofDofEfo fAofAof8f8FooWfofffDfD>t>HH (t$(|$ D(D$0D(L$@D(T$PD(\$`D(d$pD($D($D($H$H|$Ht$ÐL fEoRfEoZfEo fEoj0fEob@fEozPfEor`ÐVWSUATAUAVAWH@I@xIIqMY8ENL93IESNL9HpIHHHxHpIIIIy(LƹHLH1HVLLNLV(L^8LfLT$ L\$(Ld$0HL$8H@A_A^A]A\][_^8P},Z ?'        p*Zx"RM|1}0L>PˏᛱD*nzߥ#6. 
;$q Ɠz/U^) @iJ#cǽomxzj_t5+Aѐ                                    }|M*p[[[[[[[[[[[[[[[[`)Ih!APQ \] G@]Ziܵ6_wA(  Vector Permutation AES for x86_64/SSSE3, Mike Hamburg (Stanford University)M-       .filegC:\Users\b\p\ring\.debug$S`p.debug$T8.text .rdata.pdata.xdata .absolut/`:F`]jx!-2P_'!A[r#@M  D-kxVL$k_invL$k_s0F L$k_ipt0L$k_sb1PL$k_sb2pL$k_sboL$k_sr0L$k_rconpL$k_s63L$k_opt$/9GU`W n__imp_RtlVirtualUnwind_vpaes_encrypt_coreL$enc_loopL$enc_entry_vpaes_encrypt_core_2xL$enc2x_loopL$enc2x_entry_vpaes_schedule_coreL$schedule_goL$schedule_128L$oop_schedule_128L$schedule_256L$oop_schedule_256L$schedule_mangle_lastL$schedule_mangle_last_dec_vpaes_schedule_round_vpaes_schedule_low_round_vpaes_schedule_transform_vpaes_schedule_mangleL$schedule_mangle_bothring_core_0_17_14__vpaes_set_encrypt_keyL$SEH_begin_vpaes_set_encrypt_keyL$enc_key_bodyL$enc_key_epilogueL$SEH_end_vpaes_set_encrypt_keyring_core_0_17_14__vpaes_ctr32_encrypt_blocksL$SEH_begin_vpaes_ctr32_encrypt_blocksL$ctr32_bodyL$ctr32_prep_loopL$ctr32_loopL$ctr32_doneL$ctr32_epilogueL$ctr32_abortL$SEH_end_vpaes_ctr32_encrypt_blocks_vpaes_preheat_vpaes_constsL$k_mc_forwardL$k_mc_backwardL$k_deskewL$rev_ctrL$ctr_add_oneL$ctr_add_twose_handlerL$in_prologueL$SEH_info_vpaes_set_encrypt_keyL$SEH_info_vpaes_ctr32_encrypt_blocksring-0.17.14/pregenerated/x86-mont-elf.S000064400000000000000000000102051046102023000156700ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) .text .globl bn_mul_mont .hidden bn_mul_mont .type bn_mul_mont,@function .align 16 bn_mul_mont: .L_bn_mul_mont_begin: pushl %ebp pushl %ebx pushl %esi pushl %edi xorl %eax,%eax movl 40(%esp),%edi leal 20(%esp),%esi leal 24(%esp),%edx addl $2,%edi negl %edi leal -32(%esp,%edi,4),%ebp negl %edi movl %ebp,%eax subl %edx,%eax andl $2047,%eax subl %eax,%ebp xorl %ebp,%edx andl $2048,%edx xorl $2048,%edx subl %edx,%ebp andl $-64,%ebp movl %esp,%eax subl %ebp,%eax andl $-4096,%eax movl %esp,%edx leal (%ebp,%eax,1),%esp movl (%esp),%eax cmpl %ebp,%esp ja .L000page_walk jmp .L001page_walk_done .align 16 .L000page_walk: leal -4096(%esp),%esp movl (%esp),%eax cmpl %ebp,%esp ja .L000page_walk .L001page_walk_done: movl (%esi),%eax movl 4(%esi),%ebx movl 8(%esi),%ecx movl 12(%esi),%ebp movl 16(%esi),%esi movl (%esi),%esi movl %eax,4(%esp) movl %ebx,8(%esp) movl %ecx,12(%esp) movl %ebp,16(%esp) movl %esi,20(%esp) leal -3(%edi),%ebx movl %edx,24(%esp) movl $-1,%eax movd %eax,%mm7 movl 8(%esp),%esi movl 12(%esp),%edi movl 16(%esp),%ebp xorl %edx,%edx xorl %ecx,%ecx movd (%edi),%mm4 movd (%esi),%mm5 movd (%ebp),%mm3 pmuludq %mm4,%mm5 movq %mm5,%mm2 movq %mm5,%mm0 pand %mm7,%mm0 pmuludq 20(%esp),%mm5 pmuludq %mm5,%mm3 paddq %mm0,%mm3 movd 4(%ebp),%mm1 movd 4(%esi),%mm0 psrlq $32,%mm2 psrlq $32,%mm3 incl %ecx .align 16 .L0021st: pmuludq %mm4,%mm0 pmuludq %mm5,%mm1 paddq %mm0,%mm2 paddq %mm1,%mm3 movq %mm2,%mm0 pand %mm7,%mm0 movd 4(%ebp,%ecx,4),%mm1 paddq %mm0,%mm3 movd 4(%esi,%ecx,4),%mm0 psrlq $32,%mm2 movd %mm3,28(%esp,%ecx,4) psrlq $32,%mm3 leal 1(%ecx),%ecx cmpl %ebx,%ecx jl .L0021st pmuludq %mm4,%mm0 pmuludq %mm5,%mm1 paddq %mm0,%mm2 paddq %mm1,%mm3 movq %mm2,%mm0 pand %mm7,%mm0 paddq %mm0,%mm3 movd %mm3,28(%esp,%ecx,4) psrlq $32,%mm2 psrlq $32,%mm3 paddq %mm2,%mm3 movq %mm3,32(%esp,%ebx,4) incl %edx .L003outer: xorl %ecx,%ecx movd (%edi,%edx,4),%mm4 movd (%esi),%mm5 movd 32(%esp),%mm6 movd (%ebp),%mm3 pmuludq %mm4,%mm5 paddq %mm6,%mm5 movq 
%mm5,%mm0 movq %mm5,%mm2 pand %mm7,%mm0 pmuludq 20(%esp),%mm5 pmuludq %mm5,%mm3 paddq %mm0,%mm3 movd 36(%esp),%mm6 movd 4(%ebp),%mm1 movd 4(%esi),%mm0 psrlq $32,%mm2 psrlq $32,%mm3 paddq %mm6,%mm2 incl %ecx decl %ebx .L004inner: pmuludq %mm4,%mm0 pmuludq %mm5,%mm1 paddq %mm0,%mm2 paddq %mm1,%mm3 movq %mm2,%mm0 movd 36(%esp,%ecx,4),%mm6 pand %mm7,%mm0 movd 4(%ebp,%ecx,4),%mm1 paddq %mm0,%mm3 movd 4(%esi,%ecx,4),%mm0 psrlq $32,%mm2 movd %mm3,28(%esp,%ecx,4) psrlq $32,%mm3 paddq %mm6,%mm2 decl %ebx leal 1(%ecx),%ecx jnz .L004inner movl %ecx,%ebx pmuludq %mm4,%mm0 pmuludq %mm5,%mm1 paddq %mm0,%mm2 paddq %mm1,%mm3 movq %mm2,%mm0 pand %mm7,%mm0 paddq %mm0,%mm3 movd %mm3,28(%esp,%ecx,4) psrlq $32,%mm2 psrlq $32,%mm3 movd 36(%esp,%ebx,4),%mm6 paddq %mm2,%mm3 paddq %mm6,%mm3 movq %mm3,32(%esp,%ebx,4) leal 1(%edx),%edx cmpl %ebx,%edx jle .L003outer emms jmp .L005common_tail .align 16 .L005common_tail: movl 16(%esp),%ebp movl 4(%esp),%edi leal 32(%esp),%esi movl (%esi),%eax movl %ebx,%ecx xorl %edx,%edx .align 16 .L006sub: sbbl (%ebp,%edx,4),%eax movl %eax,(%edi,%edx,4) decl %ecx movl 4(%esi,%edx,4),%eax leal 1(%edx),%edx jge .L006sub sbbl $0,%eax movl $-1,%edx xorl %eax,%edx jmp .L007copy .align 16 .L007copy: movl 32(%esp,%ebx,4),%esi movl (%edi,%ebx,4),%ebp movl %ecx,32(%esp,%ebx,4) andl %eax,%esi andl %edx,%ebp orl %esi,%ebp movl %ebp,(%edi,%ebx,4) decl %ebx jge .L007copy movl 24(%esp),%esp movl $1,%eax popl %edi popl %esi popl %ebx popl %ebp ret .size bn_mul_mont,.-.L_bn_mul_mont_begin .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 .byte 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56 .byte 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 .byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 .byte 111,114,103,62,0 #endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) ring-0.17.14/pregenerated/x86-mont-win32n.asm000064400000000000000000000102051046102023000166200ustar 00000000000000; This file is generated from a similarly-named Perl script in the BoringSSL ; source tree. Do not edit by hand. 
%include "ring_core_generated/prefix_symbols_nasm.inc" %ifidn __OUTPUT_FORMAT__, win32 %ifidn __OUTPUT_FORMAT__,obj section code use32 class=code align=64 %elifidn __OUTPUT_FORMAT__,win32 $@feat.00 equ 1 section .text code align=64 %else section .text code %endif global _bn_mul_mont align 16 _bn_mul_mont: L$_bn_mul_mont_begin: push ebp push ebx push esi push edi xor eax,eax mov edi,DWORD [40+esp] lea esi,[20+esp] lea edx,[24+esp] add edi,2 neg edi lea ebp,[edi*4+esp-32] neg edi mov eax,ebp sub eax,edx and eax,2047 sub ebp,eax xor edx,ebp and edx,2048 xor edx,2048 sub ebp,edx and ebp,-64 mov eax,esp sub eax,ebp and eax,-4096 mov edx,esp lea esp,[eax*1+ebp] mov eax,DWORD [esp] cmp esp,ebp ja NEAR L$000page_walk jmp NEAR L$001page_walk_done align 16 L$000page_walk: lea esp,[esp-4096] mov eax,DWORD [esp] cmp esp,ebp ja NEAR L$000page_walk L$001page_walk_done: mov eax,DWORD [esi] mov ebx,DWORD [4+esi] mov ecx,DWORD [8+esi] mov ebp,DWORD [12+esi] mov esi,DWORD [16+esi] mov esi,DWORD [esi] mov DWORD [4+esp],eax mov DWORD [8+esp],ebx mov DWORD [12+esp],ecx mov DWORD [16+esp],ebp mov DWORD [20+esp],esi lea ebx,[edi-3] mov DWORD [24+esp],edx mov eax,-1 movd mm7,eax mov esi,DWORD [8+esp] mov edi,DWORD [12+esp] mov ebp,DWORD [16+esp] xor edx,edx xor ecx,ecx movd mm4,DWORD [edi] movd mm5,DWORD [esi] movd mm3,DWORD [ebp] pmuludq mm5,mm4 movq mm2,mm5 movq mm0,mm5 pand mm0,mm7 pmuludq mm5,[20+esp] pmuludq mm3,mm5 paddq mm3,mm0 movd mm1,DWORD [4+ebp] movd mm0,DWORD [4+esi] psrlq mm2,32 psrlq mm3,32 inc ecx align 16 L$0021st: pmuludq mm0,mm4 pmuludq mm1,mm5 paddq mm2,mm0 paddq mm3,mm1 movq mm0,mm2 pand mm0,mm7 movd mm1,DWORD [4+ecx*4+ebp] paddq mm3,mm0 movd mm0,DWORD [4+ecx*4+esi] psrlq mm2,32 movd DWORD [28+ecx*4+esp],mm3 psrlq mm3,32 lea ecx,[1+ecx] cmp ecx,ebx jl NEAR L$0021st pmuludq mm0,mm4 pmuludq mm1,mm5 paddq mm2,mm0 paddq mm3,mm1 movq mm0,mm2 pand mm0,mm7 paddq mm3,mm0 movd DWORD [28+ecx*4+esp],mm3 psrlq mm2,32 psrlq mm3,32 paddq mm3,mm2 movq [32+ebx*4+esp],mm3 inc edx L$003outer: xor ecx,ecx movd mm4,DWORD [edx*4+edi] movd mm5,DWORD [esi] movd mm6,DWORD [32+esp] movd mm3,DWORD [ebp] pmuludq mm5,mm4 paddq mm5,mm6 movq mm0,mm5 movq mm2,mm5 pand mm0,mm7 pmuludq mm5,[20+esp] pmuludq mm3,mm5 paddq mm3,mm0 movd mm6,DWORD [36+esp] movd mm1,DWORD [4+ebp] movd mm0,DWORD [4+esi] psrlq mm2,32 psrlq mm3,32 paddq mm2,mm6 inc ecx dec ebx L$004inner: pmuludq mm0,mm4 pmuludq mm1,mm5 paddq mm2,mm0 paddq mm3,mm1 movq mm0,mm2 movd mm6,DWORD [36+ecx*4+esp] pand mm0,mm7 movd mm1,DWORD [4+ecx*4+ebp] paddq mm3,mm0 movd mm0,DWORD [4+ecx*4+esi] psrlq mm2,32 movd DWORD [28+ecx*4+esp],mm3 psrlq mm3,32 paddq mm2,mm6 dec ebx lea ecx,[1+ecx] jnz NEAR L$004inner mov ebx,ecx pmuludq mm0,mm4 pmuludq mm1,mm5 paddq mm2,mm0 paddq mm3,mm1 movq mm0,mm2 pand mm0,mm7 paddq mm3,mm0 movd DWORD [28+ecx*4+esp],mm3 psrlq mm2,32 psrlq mm3,32 movd mm6,DWORD [36+ebx*4+esp] paddq mm3,mm2 paddq mm3,mm6 movq [32+ebx*4+esp],mm3 lea edx,[1+edx] cmp edx,ebx jle NEAR L$003outer emms jmp NEAR L$005common_tail align 16 L$005common_tail: mov ebp,DWORD [16+esp] mov edi,DWORD [4+esp] lea esi,[32+esp] mov eax,DWORD [esi] mov ecx,ebx xor edx,edx align 16 L$006sub: sbb eax,DWORD [edx*4+ebp] mov DWORD [edx*4+edi],eax dec ecx mov eax,DWORD [4+edx*4+esi] lea edx,[1+edx] jge NEAR L$006sub sbb eax,0 mov edx,-1 xor edx,eax jmp NEAR L$007copy align 16 L$007copy: mov esi,DWORD [32+ebx*4+esp] mov ebp,DWORD [ebx*4+edi] mov DWORD [32+ebx*4+esp],ecx and esi,eax and ebp,edx or ebp,esi mov DWORD [ebx*4+edi],ebp dec ebx jge NEAR L$007copy mov esp,DWORD [24+esp] mov eax,1 
pop edi pop esi pop ebx pop ebp ret db 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 db 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56 db 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 db 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 db 111,114,103,62,0 %else ; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 ret %endif ring-0.17.14/pregenerated/x86-mont-win32n.o000064400000000000000000000072011046102023000163000ustar 00000000000000Lg .debug$S(@B.debug$T @B.text p`4C:\Users\b\p\ring\pregenerated\x86-mont-win32n.asm4r ~@4 ! &!("*#0$6%8&;'=(?)D*F+J,M-O.U/Z0`2g3j4l5r7t8w9z:};<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ\]^_`abc defghi!j'k*l-m0n3o6p9q<rAsEtIuLvQwRyTzX{[|`}d~gjmpsx{~   $(,.02@DGHLOUX]_dptw{}x7C:\Users\b\p\ring\pregenerated\x86-mont-win32n.o4'The Netwide Assembler 2.13.03)_ring_core_0_17_14__bn_mul_montL$_bn_mul_mont_beginL$000page_walkL$001page_walk_doneL$0021stL$003outerL$004innerL$005common_tailL$006subL$007copyh l # ' N R n r               USVW1|$(t$T$ߍl߉)%)1)Ճ)%d$9 $$9^Nn v6D$\$L$ l$t$_T$nt$|$ l$11n'n.n]ool$nMnFs s AonLnDs ~\s I9o~\s s \ B1n$n.nt$ n]ool$nt$$nMnFs s AKont$nLnDs ~\s KIo~\s s nt$\ R9=wl$|$t$ 1ҐDIDR1 t ,L !! ,Kd$_^[]Montgomery Multiplication for x86, CRYPTOGAMS by .filegC:\Users\b\p\ring\.debug$S(.debug$T.text.absolut@feat.00$9`HrL$0021st\Rgr L$006sub@p_ring_core_0_17_14__bn_mul_montL$_bn_mul_mont_beginL$000page_walkL$001page_walk_doneL$003outerL$004innerL$005common_tailL$007copyring-0.17.14/pregenerated/x86_64-mont-elf.S000064400000000000000000000477571046102023000162270ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) .text .globl bn_mul_mont_nohw .hidden bn_mul_mont_nohw .type bn_mul_mont_nohw,@function .align 16 bn_mul_mont_nohw: .cfi_startproc _CET_ENDBR movl %r9d,%r9d movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 negq %r9 movq %rsp,%r11 leaq -16(%rsp,%r9,8),%r10 negq %r9 andq $-1024,%r10 subq %r10,%r11 andq $-4096,%r11 leaq (%r10,%r11,1),%rsp movq (%rsp),%r11 cmpq %r10,%rsp ja .Lmul_page_walk jmp .Lmul_page_walk_done .align 16 .Lmul_page_walk: leaq -4096(%rsp),%rsp movq (%rsp),%r11 cmpq %r10,%rsp ja .Lmul_page_walk .Lmul_page_walk_done: movq %rax,8(%rsp,%r9,8) .cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08 .Lmul_body: movq %rdx,%r12 movq (%r8),%r8 movq (%r12),%rbx movq (%rsi),%rax xorq %r14,%r14 xorq %r15,%r15 movq %r8,%rbp mulq %rbx movq %rax,%r10 movq (%rcx),%rax imulq %r10,%rbp movq %rdx,%r11 mulq %rbp addq %rax,%r10 movq 8(%rsi),%rax adcq $0,%rdx movq %rdx,%r13 leaq 1(%r15),%r15 jmp .L1st_enter .align 16 .L1st: addq %rax,%r13 movq (%rsi,%r15,8),%rax adcq $0,%rdx addq %r11,%r13 movq %r10,%r11 adcq $0,%rdx movq %r13,-16(%rsp,%r15,8) movq %rdx,%r13 .L1st_enter: mulq %rbx addq %rax,%r11 movq (%rcx,%r15,8),%rax adcq $0,%rdx leaq 1(%r15),%r15 movq %rdx,%r10 mulq %rbp cmpq %r9,%r15 jne .L1st addq %rax,%r13 movq (%rsi),%rax adcq $0,%rdx addq %r11,%r13 adcq $0,%rdx movq %r13,-16(%rsp,%r15,8) movq %rdx,%r13 movq %r10,%r11 xorq %rdx,%rdx addq %r11,%r13 adcq $0,%rdx movq %r13,-8(%rsp,%r9,8) movq %rdx,(%rsp,%r9,8) leaq 1(%r14),%r14 jmp .Louter .align 16 .Louter: movq (%r12,%r14,8),%rbx xorq %r15,%r15 movq %r8,%rbp movq (%rsp),%r10 mulq %rbx addq 
%rax,%r10 movq (%rcx),%rax adcq $0,%rdx imulq %r10,%rbp movq %rdx,%r11 mulq %rbp addq %rax,%r10 movq 8(%rsi),%rax adcq $0,%rdx movq 8(%rsp),%r10 movq %rdx,%r13 leaq 1(%r15),%r15 jmp .Linner_enter .align 16 .Linner: addq %rax,%r13 movq (%rsi,%r15,8),%rax adcq $0,%rdx addq %r10,%r13 movq (%rsp,%r15,8),%r10 adcq $0,%rdx movq %r13,-16(%rsp,%r15,8) movq %rdx,%r13 .Linner_enter: mulq %rbx addq %rax,%r11 movq (%rcx,%r15,8),%rax adcq $0,%rdx addq %r11,%r10 movq %rdx,%r11 adcq $0,%r11 leaq 1(%r15),%r15 mulq %rbp cmpq %r9,%r15 jne .Linner addq %rax,%r13 movq (%rsi),%rax adcq $0,%rdx addq %r10,%r13 movq (%rsp,%r15,8),%r10 adcq $0,%rdx movq %r13,-16(%rsp,%r15,8) movq %rdx,%r13 xorq %rdx,%rdx addq %r11,%r13 adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %r13,-8(%rsp,%r9,8) movq %rdx,(%rsp,%r9,8) leaq 1(%r14),%r14 cmpq %r9,%r14 jb .Louter xorq %r14,%r14 movq (%rsp),%rax movq %r9,%r15 .align 16 .Lsub: sbbq (%rcx,%r14,8),%rax movq %rax,(%rdi,%r14,8) movq 8(%rsp,%r14,8),%rax leaq 1(%r14),%r14 decq %r15 jnz .Lsub sbbq $0,%rax movq $-1,%rbx xorq %rax,%rbx xorq %r14,%r14 movq %r9,%r15 .Lcopy: movq (%rdi,%r14,8),%rcx movq (%rsp,%r14,8),%rdx andq %rbx,%rcx andq %rax,%rdx movq %r9,(%rsp,%r14,8) orq %rcx,%rdx movq %rdx,(%rdi,%r14,8) leaq 1(%r14),%r14 subq $1,%r15 jnz .Lcopy movq 8(%rsp,%r9,8),%rsi .cfi_def_cfa %rsi,8 movq $1,%rax movq -48(%rsi),%r15 .cfi_restore %r15 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbp .cfi_restore %rbp movq -8(%rsi),%rbx .cfi_restore %rbx leaq (%rsi),%rsp .cfi_def_cfa_register %rsp .Lmul_epilogue: ret .cfi_endproc .size bn_mul_mont_nohw,.-bn_mul_mont_nohw .globl bn_mul4x_mont .hidden bn_mul4x_mont .type bn_mul4x_mont,@function .align 16 bn_mul4x_mont: .cfi_startproc _CET_ENDBR movl %r9d,%r9d movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 negq %r9 movq %rsp,%r11 leaq -32(%rsp,%r9,8),%r10 negq %r9 andq $-1024,%r10 subq %r10,%r11 andq $-4096,%r11 leaq (%r10,%r11,1),%rsp movq (%rsp),%r11 cmpq %r10,%rsp ja .Lmul4x_page_walk jmp .Lmul4x_page_walk_done .Lmul4x_page_walk: leaq -4096(%rsp),%rsp movq (%rsp),%r11 cmpq %r10,%rsp ja .Lmul4x_page_walk .Lmul4x_page_walk_done: movq %rax,8(%rsp,%r9,8) .cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08 .Lmul4x_body: movq %rdi,16(%rsp,%r9,8) movq %rdx,%r12 movq (%r8),%r8 movq (%r12),%rbx movq (%rsi),%rax xorq %r14,%r14 xorq %r15,%r15 movq %r8,%rbp mulq %rbx movq %rax,%r10 movq (%rcx),%rax imulq %r10,%rbp movq %rdx,%r11 mulq %rbp addq %rax,%r10 movq 8(%rsi),%rax adcq $0,%rdx movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq 8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq 16(%rsi),%rax adcq $0,%rdx addq %r11,%rdi leaq 4(%r15),%r15 adcq $0,%rdx movq %rdi,(%rsp) movq %rdx,%r13 jmp .L1st4x .align 16 .L1st4x: mulq %rbx addq %rax,%r10 movq -16(%rcx,%r15,8),%rax adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq -8(%rsi,%r15,8),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %r13,-24(%rsp,%r15,8) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq -8(%rcx,%r15,8),%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq (%rsi,%r15,8),%rax adcq $0,%rdx addq %r11,%rdi adcq $0,%rdx movq %rdi,-16(%rsp,%r15,8) movq %rdx,%r13 mulq %rbx addq %rax,%r10 movq (%rcx,%r15,8),%rax adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq 8(%rsi,%r15,8),%rax 
adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %r13,-8(%rsp,%r15,8) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq 8(%rcx,%r15,8),%rax adcq $0,%rdx leaq 4(%r15),%r15 movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq -16(%rsi,%r15,8),%rax adcq $0,%rdx addq %r11,%rdi adcq $0,%rdx movq %rdi,-32(%rsp,%r15,8) movq %rdx,%r13 cmpq %r9,%r15 jb .L1st4x mulq %rbx addq %rax,%r10 movq -16(%rcx,%r15,8),%rax adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq -8(%rsi,%r15,8),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %r13,-24(%rsp,%r15,8) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq -8(%rcx,%r15,8),%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq (%rsi),%rax adcq $0,%rdx addq %r11,%rdi adcq $0,%rdx movq %rdi,-16(%rsp,%r15,8) movq %rdx,%r13 xorq %rdi,%rdi addq %r10,%r13 adcq $0,%rdi movq %r13,-8(%rsp,%r15,8) movq %rdi,(%rsp,%r15,8) leaq 1(%r14),%r14 .align 4 .Louter4x: movq (%r12,%r14,8),%rbx xorq %r15,%r15 movq (%rsp),%r10 movq %r8,%rbp mulq %rbx addq %rax,%r10 movq (%rcx),%rax adcq $0,%rdx imulq %r10,%rbp movq %rdx,%r11 mulq %rbp addq %rax,%r10 movq 8(%rsi),%rax adcq $0,%rdx movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq 8(%rcx),%rax adcq $0,%rdx addq 8(%rsp),%r11 adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq 16(%rsi),%rax adcq $0,%rdx addq %r11,%rdi leaq 4(%r15),%r15 adcq $0,%rdx movq %rdi,(%rsp) movq %rdx,%r13 jmp .Linner4x .align 16 .Linner4x: mulq %rbx addq %rax,%r10 movq -16(%rcx,%r15,8),%rax adcq $0,%rdx addq -16(%rsp,%r15,8),%r10 adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq -8(%rsi,%r15,8),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %r13,-24(%rsp,%r15,8) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq -8(%rcx,%r15,8),%rax adcq $0,%rdx addq -8(%rsp,%r15,8),%r11 adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq (%rsi,%r15,8),%rax adcq $0,%rdx addq %r11,%rdi adcq $0,%rdx movq %rdi,-16(%rsp,%r15,8) movq %rdx,%r13 mulq %rbx addq %rax,%r10 movq (%rcx,%r15,8),%rax adcq $0,%rdx addq (%rsp,%r15,8),%r10 adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq 8(%rsi,%r15,8),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %r13,-8(%rsp,%r15,8) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq 8(%rcx,%r15,8),%rax adcq $0,%rdx addq 8(%rsp,%r15,8),%r11 adcq $0,%rdx leaq 4(%r15),%r15 movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq -16(%rsi,%r15,8),%rax adcq $0,%rdx addq %r11,%rdi adcq $0,%rdx movq %rdi,-32(%rsp,%r15,8) movq %rdx,%r13 cmpq %r9,%r15 jb .Linner4x mulq %rbx addq %rax,%r10 movq -16(%rcx,%r15,8),%rax adcq $0,%rdx addq -16(%rsp,%r15,8),%r10 adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq -8(%rsi,%r15,8),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %r13,-24(%rsp,%r15,8) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq -8(%rcx,%r15,8),%rax adcq $0,%rdx addq -8(%rsp,%r15,8),%r11 adcq $0,%rdx leaq 1(%r14),%r14 movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq (%rsi),%rax adcq $0,%rdx addq %r11,%rdi adcq $0,%rdx movq %rdi,-16(%rsp,%r15,8) movq %rdx,%r13 xorq %rdi,%rdi addq %r10,%r13 adcq $0,%rdi addq (%rsp,%r9,8),%r13 adcq $0,%rdi movq %r13,-8(%rsp,%r15,8) movq %rdi,(%rsp,%r15,8) cmpq %r9,%r14 jb .Louter4x movq 16(%rsp,%r9,8),%rdi leaq -4(%r9),%r15 movq 0(%rsp),%rax movq 8(%rsp),%rdx shrq $2,%r15 leaq (%rsp),%rsi xorq %r14,%r14 subq 0(%rcx),%rax movq 16(%rsi),%rbx movq 24(%rsi),%rbp sbbq 8(%rcx),%rdx .Lsub4x: movq %rax,0(%rdi,%r14,8) movq %rdx,8(%rdi,%r14,8) sbbq 16(%rcx,%r14,8),%rbx movq 32(%rsi,%r14,8),%rax movq 40(%rsi,%r14,8),%rdx sbbq 24(%rcx,%r14,8),%rbp movq %rbx,16(%rdi,%r14,8) movq %rbp,24(%rdi,%r14,8) sbbq 32(%rcx,%r14,8),%rax movq 
48(%rsi,%r14,8),%rbx movq 56(%rsi,%r14,8),%rbp sbbq 40(%rcx,%r14,8),%rdx leaq 4(%r14),%r14 decq %r15 jnz .Lsub4x movq %rax,0(%rdi,%r14,8) movq 32(%rsi,%r14,8),%rax sbbq 16(%rcx,%r14,8),%rbx movq %rdx,8(%rdi,%r14,8) sbbq 24(%rcx,%r14,8),%rbp movq %rbx,16(%rdi,%r14,8) sbbq $0,%rax movq %rbp,24(%rdi,%r14,8) pxor %xmm0,%xmm0 .byte 102,72,15,110,224 pcmpeqd %xmm5,%xmm5 pshufd $0,%xmm4,%xmm4 movq %r9,%r15 pxor %xmm4,%xmm5 shrq $2,%r15 xorl %eax,%eax jmp .Lcopy4x .align 16 .Lcopy4x: movdqa (%rsp,%rax,1),%xmm1 movdqu (%rdi,%rax,1),%xmm2 pand %xmm4,%xmm1 pand %xmm5,%xmm2 movdqa 16(%rsp,%rax,1),%xmm3 movdqa %xmm0,(%rsp,%rax,1) por %xmm2,%xmm1 movdqu 16(%rdi,%rax,1),%xmm2 movdqu %xmm1,(%rdi,%rax,1) pand %xmm4,%xmm3 pand %xmm5,%xmm2 movdqa %xmm0,16(%rsp,%rax,1) por %xmm2,%xmm3 movdqu %xmm3,16(%rdi,%rax,1) leaq 32(%rax),%rax decq %r15 jnz .Lcopy4x movq 8(%rsp,%r9,8),%rsi .cfi_def_cfa %rsi, 8 movq $1,%rax movq -48(%rsi),%r15 .cfi_restore %r15 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbp .cfi_restore %rbp movq -8(%rsi),%rbx .cfi_restore %rbx leaq (%rsi),%rsp .cfi_def_cfa_register %rsp .Lmul4x_epilogue: ret .cfi_endproc .size bn_mul4x_mont,.-bn_mul4x_mont .extern bn_sqrx8x_internal .hidden bn_sqrx8x_internal .extern bn_sqr8x_internal .hidden bn_sqr8x_internal .globl bn_sqr8x_mont .hidden bn_sqr8x_mont .type bn_sqr8x_mont,@function .align 32 bn_sqr8x_mont: .cfi_startproc _CET_ENDBR movl %r9d,%r9d movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 .Lsqr8x_prologue: movl %r9d,%r10d shll $3,%r9d shlq $3+2,%r10 negq %r9 leaq -64(%rsp,%r9,2),%r11 movq %rsp,%rbp movq (%r8),%r8 subq %rsi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb .Lsqr8x_sp_alt subq %r11,%rbp leaq -64(%rbp,%r9,2),%rbp jmp .Lsqr8x_sp_done .align 32 .Lsqr8x_sp_alt: leaq 4096-64(,%r9,2),%r10 leaq -64(%rbp,%r9,2),%rbp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 subq %r11,%rbp .Lsqr8x_sp_done: andq $-64,%rbp movq %rsp,%r11 subq %rbp,%r11 andq $-4096,%r11 leaq (%r11,%rbp,1),%rsp movq (%rsp),%r10 cmpq %rbp,%rsp ja .Lsqr8x_page_walk jmp .Lsqr8x_page_walk_done .align 16 .Lsqr8x_page_walk: leaq -4096(%rsp),%rsp movq (%rsp),%r10 cmpq %rbp,%rsp ja .Lsqr8x_page_walk .Lsqr8x_page_walk_done: movq %r9,%r10 negq %r9 movq %r8,32(%rsp) movq %rax,40(%rsp) .cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 .Lsqr8x_body: .byte 102,72,15,110,209 pxor %xmm0,%xmm0 .byte 102,72,15,110,207 .byte 102,73,15,110,218 testq %rdx,%rdx jz .Lsqr8x_nox call bn_sqrx8x_internal leaq (%r8,%rcx,1),%rbx movq %rcx,%r9 movq %rcx,%rdx .byte 102,72,15,126,207 sarq $3+2,%rcx jmp .Lsqr8x_sub .align 32 .Lsqr8x_nox: call bn_sqr8x_internal leaq (%rdi,%r9,1),%rbx movq %r9,%rcx movq %r9,%rdx .byte 102,72,15,126,207 sarq $3+2,%rcx jmp .Lsqr8x_sub .align 32 .Lsqr8x_sub: movq 0(%rbx),%r12 movq 8(%rbx),%r13 movq 16(%rbx),%r14 movq 24(%rbx),%r15 leaq 32(%rbx),%rbx sbbq 0(%rbp),%r12 sbbq 8(%rbp),%r13 sbbq 16(%rbp),%r14 sbbq 24(%rbp),%r15 leaq 32(%rbp),%rbp movq %r12,0(%rdi) movq %r13,8(%rdi) movq %r14,16(%rdi) movq %r15,24(%rdi) leaq 32(%rdi),%rdi incq %rcx jnz .Lsqr8x_sub sbbq $0,%rax leaq (%rbx,%r9,1),%rbx leaq (%rdi,%r9,1),%rdi .byte 102,72,15,110,200 pxor %xmm0,%xmm0 pshufd $0,%xmm1,%xmm1 movq 40(%rsp),%rsi .cfi_def_cfa %rsi,8 jmp .Lsqr8x_cond_copy .align 32 .Lsqr8x_cond_copy: movdqa 0(%rbx),%xmm2 movdqa 16(%rbx),%xmm3 leaq 32(%rbx),%rbx 
movdqu 0(%rdi),%xmm4 movdqu 16(%rdi),%xmm5 leaq 32(%rdi),%rdi movdqa %xmm0,-32(%rbx) movdqa %xmm0,-16(%rbx) movdqa %xmm0,-32(%rbx,%rdx,1) movdqa %xmm0,-16(%rbx,%rdx,1) pcmpeqd %xmm1,%xmm0 pand %xmm1,%xmm2 pand %xmm1,%xmm3 pand %xmm0,%xmm4 pand %xmm0,%xmm5 pxor %xmm0,%xmm0 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqu %xmm4,-32(%rdi) movdqu %xmm5,-16(%rdi) addq $32,%r9 jnz .Lsqr8x_cond_copy movq $1,%rax movq -48(%rsi),%r15 .cfi_restore %r15 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbp .cfi_restore %rbp movq -8(%rsi),%rbx .cfi_restore %rbx leaq (%rsi),%rsp .cfi_def_cfa_register %rsp .Lsqr8x_epilogue: ret .cfi_endproc .size bn_sqr8x_mont,.-bn_sqr8x_mont .globl bn_mulx4x_mont .hidden bn_mulx4x_mont .type bn_mulx4x_mont,@function .align 32 bn_mulx4x_mont: .cfi_startproc _CET_ENDBR movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 .Lmulx4x_prologue: shll $3,%r9d xorq %r10,%r10 subq %r9,%r10 movq (%r8),%r8 leaq -72(%rsp,%r10,1),%rbp andq $-128,%rbp movq %rsp,%r11 subq %rbp,%r11 andq $-4096,%r11 leaq (%r11,%rbp,1),%rsp movq (%rsp),%r10 cmpq %rbp,%rsp ja .Lmulx4x_page_walk jmp .Lmulx4x_page_walk_done .align 16 .Lmulx4x_page_walk: leaq -4096(%rsp),%rsp movq (%rsp),%r10 cmpq %rbp,%rsp ja .Lmulx4x_page_walk .Lmulx4x_page_walk_done: leaq (%rdx,%r9,1),%r10 movq %r9,0(%rsp) shrq $5,%r9 movq %r10,16(%rsp) subq $1,%r9 movq %r8,24(%rsp) movq %rdi,32(%rsp) movq %rax,40(%rsp) .cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 movq %r9,48(%rsp) jmp .Lmulx4x_body .align 32 .Lmulx4x_body: leaq 8(%rdx),%rdi movq (%rdx),%rdx leaq 64+32(%rsp),%rbx movq %rdx,%r9 mulxq 0(%rsi),%r8,%rax mulxq 8(%rsi),%r11,%r14 addq %rax,%r11 movq %rdi,8(%rsp) mulxq 16(%rsi),%r12,%r13 adcq %r14,%r12 adcq $0,%r13 movq %r8,%rdi imulq 24(%rsp),%r8 xorq %rbp,%rbp mulxq 24(%rsi),%rax,%r14 movq %r8,%rdx leaq 32(%rsi),%rsi adcxq %rax,%r13 adcxq %rbp,%r14 mulxq 0(%rcx),%rax,%r10 adcxq %rax,%rdi adoxq %r11,%r10 mulxq 8(%rcx),%rax,%r11 adcxq %rax,%r10 adoxq %r12,%r11 .byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 movq 48(%rsp),%rdi movq %r10,-32(%rbx) adcxq %rax,%r11 adoxq %r13,%r12 mulxq 24(%rcx),%rax,%r15 movq %r9,%rdx movq %r11,-24(%rbx) adcxq %rax,%r12 adoxq %rbp,%r15 leaq 32(%rcx),%rcx movq %r12,-16(%rbx) jmp .Lmulx4x_1st .align 32 .Lmulx4x_1st: adcxq %rbp,%r15 mulxq 0(%rsi),%r10,%rax adcxq %r14,%r10 mulxq 8(%rsi),%r11,%r14 adcxq %rax,%r11 mulxq 16(%rsi),%r12,%rax adcxq %r14,%r12 mulxq 24(%rsi),%r13,%r14 .byte 0x67,0x67 movq %r8,%rdx adcxq %rax,%r13 adcxq %rbp,%r14 leaq 32(%rsi),%rsi leaq 32(%rbx),%rbx adoxq %r15,%r10 mulxq 0(%rcx),%rax,%r15 adcxq %rax,%r10 adoxq %r15,%r11 mulxq 8(%rcx),%rax,%r15 adcxq %rax,%r11 adoxq %r15,%r12 mulxq 16(%rcx),%rax,%r15 movq %r10,-40(%rbx) adcxq %rax,%r12 movq %r11,-32(%rbx) adoxq %r15,%r13 mulxq 24(%rcx),%rax,%r15 movq %r9,%rdx movq %r12,-24(%rbx) adcxq %rax,%r13 adoxq %rbp,%r15 leaq 32(%rcx),%rcx movq %r13,-16(%rbx) decq %rdi jnz .Lmulx4x_1st movq 0(%rsp),%rax movq 8(%rsp),%rdi adcq %rbp,%r15 addq %r15,%r14 sbbq %r15,%r15 movq %r14,-8(%rbx) jmp .Lmulx4x_outer .align 32 .Lmulx4x_outer: movq (%rdi),%rdx leaq 8(%rdi),%rdi subq %rax,%rsi movq %r15,(%rbx) leaq 64+32(%rsp),%rbx subq %rax,%rcx mulxq 0(%rsi),%r8,%r11 xorl %ebp,%ebp movq %rdx,%r9 mulxq 8(%rsi),%r14,%r12 adoxq -32(%rbx),%r8 adcxq %r14,%r11 mulxq 16(%rsi),%r15,%r13 adoxq -24(%rbx),%r11 
adcxq %r15,%r12 adoxq -16(%rbx),%r12 adcxq %rbp,%r13 adoxq %rbp,%r13 movq %rdi,8(%rsp) movq %r8,%r15 imulq 24(%rsp),%r8 xorl %ebp,%ebp mulxq 24(%rsi),%rax,%r14 movq %r8,%rdx adcxq %rax,%r13 adoxq -8(%rbx),%r13 adcxq %rbp,%r14 leaq 32(%rsi),%rsi adoxq %rbp,%r14 mulxq 0(%rcx),%rax,%r10 adcxq %rax,%r15 adoxq %r11,%r10 mulxq 8(%rcx),%rax,%r11 adcxq %rax,%r10 adoxq %r12,%r11 mulxq 16(%rcx),%rax,%r12 movq %r10,-32(%rbx) adcxq %rax,%r11 adoxq %r13,%r12 mulxq 24(%rcx),%rax,%r15 movq %r9,%rdx movq %r11,-24(%rbx) leaq 32(%rcx),%rcx adcxq %rax,%r12 adoxq %rbp,%r15 movq 48(%rsp),%rdi movq %r12,-16(%rbx) jmp .Lmulx4x_inner .align 32 .Lmulx4x_inner: mulxq 0(%rsi),%r10,%rax adcxq %rbp,%r15 adoxq %r14,%r10 mulxq 8(%rsi),%r11,%r14 adcxq 0(%rbx),%r10 adoxq %rax,%r11 mulxq 16(%rsi),%r12,%rax adcxq 8(%rbx),%r11 adoxq %r14,%r12 mulxq 24(%rsi),%r13,%r14 movq %r8,%rdx adcxq 16(%rbx),%r12 adoxq %rax,%r13 adcxq 24(%rbx),%r13 adoxq %rbp,%r14 leaq 32(%rsi),%rsi leaq 32(%rbx),%rbx adcxq %rbp,%r14 adoxq %r15,%r10 mulxq 0(%rcx),%rax,%r15 adcxq %rax,%r10 adoxq %r15,%r11 mulxq 8(%rcx),%rax,%r15 adcxq %rax,%r11 adoxq %r15,%r12 mulxq 16(%rcx),%rax,%r15 movq %r10,-40(%rbx) adcxq %rax,%r12 adoxq %r15,%r13 mulxq 24(%rcx),%rax,%r15 movq %r9,%rdx movq %r11,-32(%rbx) movq %r12,-24(%rbx) adcxq %rax,%r13 adoxq %rbp,%r15 leaq 32(%rcx),%rcx movq %r13,-16(%rbx) decq %rdi jnz .Lmulx4x_inner movq 0(%rsp),%rax movq 8(%rsp),%rdi adcq %rbp,%r15 subq 0(%rbx),%rbp adcq %r15,%r14 sbbq %r15,%r15 movq %r14,-8(%rbx) cmpq 16(%rsp),%rdi jne .Lmulx4x_outer leaq 64(%rsp),%rbx subq %rax,%rcx negq %r15 movq %rax,%rdx shrq $3+2,%rax movq 32(%rsp),%rdi jmp .Lmulx4x_sub .align 32 .Lmulx4x_sub: movq 0(%rbx),%r11 movq 8(%rbx),%r12 movq 16(%rbx),%r13 movq 24(%rbx),%r14 leaq 32(%rbx),%rbx sbbq 0(%rcx),%r11 sbbq 8(%rcx),%r12 sbbq 16(%rcx),%r13 sbbq 24(%rcx),%r14 leaq 32(%rcx),%rcx movq %r11,0(%rdi) movq %r12,8(%rdi) movq %r13,16(%rdi) movq %r14,24(%rdi) leaq 32(%rdi),%rdi decq %rax jnz .Lmulx4x_sub sbbq $0,%r15 leaq 64(%rsp),%rbx subq %rdx,%rdi .byte 102,73,15,110,207 pxor %xmm0,%xmm0 pshufd $0,%xmm1,%xmm1 movq 40(%rsp),%rsi .cfi_def_cfa %rsi,8 jmp .Lmulx4x_cond_copy .align 32 .Lmulx4x_cond_copy: movdqa 0(%rbx),%xmm2 movdqa 16(%rbx),%xmm3 leaq 32(%rbx),%rbx movdqu 0(%rdi),%xmm4 movdqu 16(%rdi),%xmm5 leaq 32(%rdi),%rdi movdqa %xmm0,-32(%rbx) movdqa %xmm0,-16(%rbx) pcmpeqd %xmm1,%xmm0 pand %xmm1,%xmm2 pand %xmm1,%xmm3 pand %xmm0,%xmm4 pand %xmm0,%xmm5 pxor %xmm0,%xmm0 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqu %xmm4,-32(%rdi) movdqu %xmm5,-16(%rdi) subq $32,%rdx jnz .Lmulx4x_cond_copy movq %rdx,(%rbx) movq $1,%rax movq -48(%rsi),%r15 .cfi_restore %r15 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbp .cfi_restore %rbp movq -8(%rsi),%rbx .cfi_restore %rbx leaq (%rsi),%rsp .cfi_def_cfa_register %rsp .Lmulx4x_epilogue: ret .cfi_endproc .size bn_mulx4x_mont,.-bn_mulx4x_mont .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 16 #endif ring-0.17.14/pregenerated/x86_64-mont-macosx.S000064400000000000000000000443271046102023000167410ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
#include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) .text .globl _bn_mul_mont_nohw .private_extern _bn_mul_mont_nohw .p2align 4 _bn_mul_mont_nohw: _CET_ENDBR movl %r9d,%r9d movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 negq %r9 movq %rsp,%r11 leaq -16(%rsp,%r9,8),%r10 negq %r9 andq $-1024,%r10 subq %r10,%r11 andq $-4096,%r11 leaq (%r10,%r11,1),%rsp movq (%rsp),%r11 cmpq %r10,%rsp ja L$mul_page_walk jmp L$mul_page_walk_done .p2align 4 L$mul_page_walk: leaq -4096(%rsp),%rsp movq (%rsp),%r11 cmpq %r10,%rsp ja L$mul_page_walk L$mul_page_walk_done: movq %rax,8(%rsp,%r9,8) L$mul_body: movq %rdx,%r12 movq (%r8),%r8 movq (%r12),%rbx movq (%rsi),%rax xorq %r14,%r14 xorq %r15,%r15 movq %r8,%rbp mulq %rbx movq %rax,%r10 movq (%rcx),%rax imulq %r10,%rbp movq %rdx,%r11 mulq %rbp addq %rax,%r10 movq 8(%rsi),%rax adcq $0,%rdx movq %rdx,%r13 leaq 1(%r15),%r15 jmp L$1st_enter .p2align 4 L$1st: addq %rax,%r13 movq (%rsi,%r15,8),%rax adcq $0,%rdx addq %r11,%r13 movq %r10,%r11 adcq $0,%rdx movq %r13,-16(%rsp,%r15,8) movq %rdx,%r13 L$1st_enter: mulq %rbx addq %rax,%r11 movq (%rcx,%r15,8),%rax adcq $0,%rdx leaq 1(%r15),%r15 movq %rdx,%r10 mulq %rbp cmpq %r9,%r15 jne L$1st addq %rax,%r13 movq (%rsi),%rax adcq $0,%rdx addq %r11,%r13 adcq $0,%rdx movq %r13,-16(%rsp,%r15,8) movq %rdx,%r13 movq %r10,%r11 xorq %rdx,%rdx addq %r11,%r13 adcq $0,%rdx movq %r13,-8(%rsp,%r9,8) movq %rdx,(%rsp,%r9,8) leaq 1(%r14),%r14 jmp L$outer .p2align 4 L$outer: movq (%r12,%r14,8),%rbx xorq %r15,%r15 movq %r8,%rbp movq (%rsp),%r10 mulq %rbx addq %rax,%r10 movq (%rcx),%rax adcq $0,%rdx imulq %r10,%rbp movq %rdx,%r11 mulq %rbp addq %rax,%r10 movq 8(%rsi),%rax adcq $0,%rdx movq 8(%rsp),%r10 movq %rdx,%r13 leaq 1(%r15),%r15 jmp L$inner_enter .p2align 4 L$inner: addq %rax,%r13 movq (%rsi,%r15,8),%rax adcq $0,%rdx addq %r10,%r13 movq (%rsp,%r15,8),%r10 adcq $0,%rdx movq %r13,-16(%rsp,%r15,8) movq %rdx,%r13 L$inner_enter: mulq %rbx addq %rax,%r11 movq (%rcx,%r15,8),%rax adcq $0,%rdx addq %r11,%r10 movq %rdx,%r11 adcq $0,%r11 leaq 1(%r15),%r15 mulq %rbp cmpq %r9,%r15 jne L$inner addq %rax,%r13 movq (%rsi),%rax adcq $0,%rdx addq %r10,%r13 movq (%rsp,%r15,8),%r10 adcq $0,%rdx movq %r13,-16(%rsp,%r15,8) movq %rdx,%r13 xorq %rdx,%rdx addq %r11,%r13 adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %r13,-8(%rsp,%r9,8) movq %rdx,(%rsp,%r9,8) leaq 1(%r14),%r14 cmpq %r9,%r14 jb L$outer xorq %r14,%r14 movq (%rsp),%rax movq %r9,%r15 .p2align 4 L$sub: sbbq (%rcx,%r14,8),%rax movq %rax,(%rdi,%r14,8) movq 8(%rsp,%r14,8),%rax leaq 1(%r14),%r14 decq %r15 jnz L$sub sbbq $0,%rax movq $-1,%rbx xorq %rax,%rbx xorq %r14,%r14 movq %r9,%r15 L$copy: movq (%rdi,%r14,8),%rcx movq (%rsp,%r14,8),%rdx andq %rbx,%rcx andq %rax,%rdx movq %r9,(%rsp,%r14,8) orq %rcx,%rdx movq %rdx,(%rdi,%r14,8) leaq 1(%r14),%r14 subq $1,%r15 jnz L$copy movq 8(%rsp,%r9,8),%rsi movq $1,%rax movq -48(%rsi),%r15 movq -40(%rsi),%r14 movq -32(%rsi),%r13 movq -24(%rsi),%r12 movq -16(%rsi),%rbp movq -8(%rsi),%rbx leaq (%rsi),%rsp L$mul_epilogue: ret .globl _bn_mul4x_mont .private_extern _bn_mul4x_mont .p2align 4 _bn_mul4x_mont: _CET_ENDBR movl %r9d,%r9d movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 negq %r9 movq %rsp,%r11 leaq -32(%rsp,%r9,8),%r10 negq %r9 andq $-1024,%r10 subq %r10,%r11 andq $-4096,%r11 leaq (%r10,%r11,1),%rsp movq (%rsp),%r11 cmpq %r10,%rsp ja L$mul4x_page_walk jmp L$mul4x_page_walk_done L$mul4x_page_walk: leaq -4096(%rsp),%rsp movq (%rsp),%r11 cmpq %r10,%rsp ja L$mul4x_page_walk 
L$mul4x_page_walk_done: movq %rax,8(%rsp,%r9,8) L$mul4x_body: movq %rdi,16(%rsp,%r9,8) movq %rdx,%r12 movq (%r8),%r8 movq (%r12),%rbx movq (%rsi),%rax xorq %r14,%r14 xorq %r15,%r15 movq %r8,%rbp mulq %rbx movq %rax,%r10 movq (%rcx),%rax imulq %r10,%rbp movq %rdx,%r11 mulq %rbp addq %rax,%r10 movq 8(%rsi),%rax adcq $0,%rdx movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq 8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq 16(%rsi),%rax adcq $0,%rdx addq %r11,%rdi leaq 4(%r15),%r15 adcq $0,%rdx movq %rdi,(%rsp) movq %rdx,%r13 jmp L$1st4x .p2align 4 L$1st4x: mulq %rbx addq %rax,%r10 movq -16(%rcx,%r15,8),%rax adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq -8(%rsi,%r15,8),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %r13,-24(%rsp,%r15,8) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq -8(%rcx,%r15,8),%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq (%rsi,%r15,8),%rax adcq $0,%rdx addq %r11,%rdi adcq $0,%rdx movq %rdi,-16(%rsp,%r15,8) movq %rdx,%r13 mulq %rbx addq %rax,%r10 movq (%rcx,%r15,8),%rax adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq 8(%rsi,%r15,8),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %r13,-8(%rsp,%r15,8) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq 8(%rcx,%r15,8),%rax adcq $0,%rdx leaq 4(%r15),%r15 movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq -16(%rsi,%r15,8),%rax adcq $0,%rdx addq %r11,%rdi adcq $0,%rdx movq %rdi,-32(%rsp,%r15,8) movq %rdx,%r13 cmpq %r9,%r15 jb L$1st4x mulq %rbx addq %rax,%r10 movq -16(%rcx,%r15,8),%rax adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq -8(%rsi,%r15,8),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %r13,-24(%rsp,%r15,8) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq -8(%rcx,%r15,8),%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq (%rsi),%rax adcq $0,%rdx addq %r11,%rdi adcq $0,%rdx movq %rdi,-16(%rsp,%r15,8) movq %rdx,%r13 xorq %rdi,%rdi addq %r10,%r13 adcq $0,%rdi movq %r13,-8(%rsp,%r15,8) movq %rdi,(%rsp,%r15,8) leaq 1(%r14),%r14 .p2align 2 L$outer4x: movq (%r12,%r14,8),%rbx xorq %r15,%r15 movq (%rsp),%r10 movq %r8,%rbp mulq %rbx addq %rax,%r10 movq (%rcx),%rax adcq $0,%rdx imulq %r10,%rbp movq %rdx,%r11 mulq %rbp addq %rax,%r10 movq 8(%rsi),%rax adcq $0,%rdx movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq 8(%rcx),%rax adcq $0,%rdx addq 8(%rsp),%r11 adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq 16(%rsi),%rax adcq $0,%rdx addq %r11,%rdi leaq 4(%r15),%r15 adcq $0,%rdx movq %rdi,(%rsp) movq %rdx,%r13 jmp L$inner4x .p2align 4 L$inner4x: mulq %rbx addq %rax,%r10 movq -16(%rcx,%r15,8),%rax adcq $0,%rdx addq -16(%rsp,%r15,8),%r10 adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq -8(%rsi,%r15,8),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %r13,-24(%rsp,%r15,8) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq -8(%rcx,%r15,8),%rax adcq $0,%rdx addq -8(%rsp,%r15,8),%r11 adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq (%rsi,%r15,8),%rax adcq $0,%rdx addq %r11,%rdi adcq $0,%rdx movq %rdi,-16(%rsp,%r15,8) movq %rdx,%r13 mulq %rbx addq %rax,%r10 movq (%rcx,%r15,8),%rax adcq $0,%rdx addq (%rsp,%r15,8),%r10 adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq 8(%rsi,%r15,8),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %r13,-8(%rsp,%r15,8) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq 8(%rcx,%r15,8),%rax adcq $0,%rdx addq 8(%rsp,%r15,8),%r11 adcq $0,%rdx leaq 4(%r15),%r15 movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq -16(%rsi,%r15,8),%rax adcq $0,%rdx addq %r11,%rdi adcq $0,%rdx movq %rdi,-32(%rsp,%r15,8) movq %rdx,%r13 cmpq %r9,%r15 jb 
L$inner4x mulq %rbx addq %rax,%r10 movq -16(%rcx,%r15,8),%rax adcq $0,%rdx addq -16(%rsp,%r15,8),%r10 adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq -8(%rsi,%r15,8),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %r13,-24(%rsp,%r15,8) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq -8(%rcx,%r15,8),%rax adcq $0,%rdx addq -8(%rsp,%r15,8),%r11 adcq $0,%rdx leaq 1(%r14),%r14 movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq (%rsi),%rax adcq $0,%rdx addq %r11,%rdi adcq $0,%rdx movq %rdi,-16(%rsp,%r15,8) movq %rdx,%r13 xorq %rdi,%rdi addq %r10,%r13 adcq $0,%rdi addq (%rsp,%r9,8),%r13 adcq $0,%rdi movq %r13,-8(%rsp,%r15,8) movq %rdi,(%rsp,%r15,8) cmpq %r9,%r14 jb L$outer4x movq 16(%rsp,%r9,8),%rdi leaq -4(%r9),%r15 movq 0(%rsp),%rax movq 8(%rsp),%rdx shrq $2,%r15 leaq (%rsp),%rsi xorq %r14,%r14 subq 0(%rcx),%rax movq 16(%rsi),%rbx movq 24(%rsi),%rbp sbbq 8(%rcx),%rdx L$sub4x: movq %rax,0(%rdi,%r14,8) movq %rdx,8(%rdi,%r14,8) sbbq 16(%rcx,%r14,8),%rbx movq 32(%rsi,%r14,8),%rax movq 40(%rsi,%r14,8),%rdx sbbq 24(%rcx,%r14,8),%rbp movq %rbx,16(%rdi,%r14,8) movq %rbp,24(%rdi,%r14,8) sbbq 32(%rcx,%r14,8),%rax movq 48(%rsi,%r14,8),%rbx movq 56(%rsi,%r14,8),%rbp sbbq 40(%rcx,%r14,8),%rdx leaq 4(%r14),%r14 decq %r15 jnz L$sub4x movq %rax,0(%rdi,%r14,8) movq 32(%rsi,%r14,8),%rax sbbq 16(%rcx,%r14,8),%rbx movq %rdx,8(%rdi,%r14,8) sbbq 24(%rcx,%r14,8),%rbp movq %rbx,16(%rdi,%r14,8) sbbq $0,%rax movq %rbp,24(%rdi,%r14,8) pxor %xmm0,%xmm0 .byte 102,72,15,110,224 pcmpeqd %xmm5,%xmm5 pshufd $0,%xmm4,%xmm4 movq %r9,%r15 pxor %xmm4,%xmm5 shrq $2,%r15 xorl %eax,%eax jmp L$copy4x .p2align 4 L$copy4x: movdqa (%rsp,%rax,1),%xmm1 movdqu (%rdi,%rax,1),%xmm2 pand %xmm4,%xmm1 pand %xmm5,%xmm2 movdqa 16(%rsp,%rax,1),%xmm3 movdqa %xmm0,(%rsp,%rax,1) por %xmm2,%xmm1 movdqu 16(%rdi,%rax,1),%xmm2 movdqu %xmm1,(%rdi,%rax,1) pand %xmm4,%xmm3 pand %xmm5,%xmm2 movdqa %xmm0,16(%rsp,%rax,1) por %xmm2,%xmm3 movdqu %xmm3,16(%rdi,%rax,1) leaq 32(%rax),%rax decq %r15 jnz L$copy4x movq 8(%rsp,%r9,8),%rsi movq $1,%rax movq -48(%rsi),%r15 movq -40(%rsi),%r14 movq -32(%rsi),%r13 movq -24(%rsi),%r12 movq -16(%rsi),%rbp movq -8(%rsi),%rbx leaq (%rsi),%rsp L$mul4x_epilogue: ret .globl _bn_sqr8x_mont .private_extern _bn_sqr8x_mont .p2align 5 _bn_sqr8x_mont: _CET_ENDBR movl %r9d,%r9d movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 L$sqr8x_prologue: movl %r9d,%r10d shll $3,%r9d shlq $3+2,%r10 negq %r9 leaq -64(%rsp,%r9,2),%r11 movq %rsp,%rbp movq (%r8),%r8 subq %rsi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb L$sqr8x_sp_alt subq %r11,%rbp leaq -64(%rbp,%r9,2),%rbp jmp L$sqr8x_sp_done .p2align 5 L$sqr8x_sp_alt: leaq 4096-64(,%r9,2),%r10 leaq -64(%rbp,%r9,2),%rbp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 subq %r11,%rbp L$sqr8x_sp_done: andq $-64,%rbp movq %rsp,%r11 subq %rbp,%r11 andq $-4096,%r11 leaq (%r11,%rbp,1),%rsp movq (%rsp),%r10 cmpq %rbp,%rsp ja L$sqr8x_page_walk jmp L$sqr8x_page_walk_done .p2align 4 L$sqr8x_page_walk: leaq -4096(%rsp),%rsp movq (%rsp),%r10 cmpq %rbp,%rsp ja L$sqr8x_page_walk L$sqr8x_page_walk_done: movq %r9,%r10 negq %r9 movq %r8,32(%rsp) movq %rax,40(%rsp) L$sqr8x_body: .byte 102,72,15,110,209 pxor %xmm0,%xmm0 .byte 102,72,15,110,207 .byte 102,73,15,110,218 testq %rdx,%rdx jz L$sqr8x_nox call _bn_sqrx8x_internal leaq (%r8,%rcx,1),%rbx movq %rcx,%r9 movq %rcx,%rdx .byte 102,72,15,126,207 sarq $3+2,%rcx jmp L$sqr8x_sub .p2align 5 L$sqr8x_nox: call _bn_sqr8x_internal leaq (%rdi,%r9,1),%rbx movq %r9,%rcx movq %r9,%rdx .byte 102,72,15,126,207 sarq $3+2,%rcx jmp L$sqr8x_sub .p2align 5 
L$sqr8x_sub: movq 0(%rbx),%r12 movq 8(%rbx),%r13 movq 16(%rbx),%r14 movq 24(%rbx),%r15 leaq 32(%rbx),%rbx sbbq 0(%rbp),%r12 sbbq 8(%rbp),%r13 sbbq 16(%rbp),%r14 sbbq 24(%rbp),%r15 leaq 32(%rbp),%rbp movq %r12,0(%rdi) movq %r13,8(%rdi) movq %r14,16(%rdi) movq %r15,24(%rdi) leaq 32(%rdi),%rdi incq %rcx jnz L$sqr8x_sub sbbq $0,%rax leaq (%rbx,%r9,1),%rbx leaq (%rdi,%r9,1),%rdi .byte 102,72,15,110,200 pxor %xmm0,%xmm0 pshufd $0,%xmm1,%xmm1 movq 40(%rsp),%rsi jmp L$sqr8x_cond_copy .p2align 5 L$sqr8x_cond_copy: movdqa 0(%rbx),%xmm2 movdqa 16(%rbx),%xmm3 leaq 32(%rbx),%rbx movdqu 0(%rdi),%xmm4 movdqu 16(%rdi),%xmm5 leaq 32(%rdi),%rdi movdqa %xmm0,-32(%rbx) movdqa %xmm0,-16(%rbx) movdqa %xmm0,-32(%rbx,%rdx,1) movdqa %xmm0,-16(%rbx,%rdx,1) pcmpeqd %xmm1,%xmm0 pand %xmm1,%xmm2 pand %xmm1,%xmm3 pand %xmm0,%xmm4 pand %xmm0,%xmm5 pxor %xmm0,%xmm0 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqu %xmm4,-32(%rdi) movdqu %xmm5,-16(%rdi) addq $32,%r9 jnz L$sqr8x_cond_copy movq $1,%rax movq -48(%rsi),%r15 movq -40(%rsi),%r14 movq -32(%rsi),%r13 movq -24(%rsi),%r12 movq -16(%rsi),%rbp movq -8(%rsi),%rbx leaq (%rsi),%rsp L$sqr8x_epilogue: ret .globl _bn_mulx4x_mont .private_extern _bn_mulx4x_mont .p2align 5 _bn_mulx4x_mont: _CET_ENDBR movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 L$mulx4x_prologue: shll $3,%r9d xorq %r10,%r10 subq %r9,%r10 movq (%r8),%r8 leaq -72(%rsp,%r10,1),%rbp andq $-128,%rbp movq %rsp,%r11 subq %rbp,%r11 andq $-4096,%r11 leaq (%r11,%rbp,1),%rsp movq (%rsp),%r10 cmpq %rbp,%rsp ja L$mulx4x_page_walk jmp L$mulx4x_page_walk_done .p2align 4 L$mulx4x_page_walk: leaq -4096(%rsp),%rsp movq (%rsp),%r10 cmpq %rbp,%rsp ja L$mulx4x_page_walk L$mulx4x_page_walk_done: leaq (%rdx,%r9,1),%r10 movq %r9,0(%rsp) shrq $5,%r9 movq %r10,16(%rsp) subq $1,%r9 movq %r8,24(%rsp) movq %rdi,32(%rsp) movq %rax,40(%rsp) movq %r9,48(%rsp) jmp L$mulx4x_body .p2align 5 L$mulx4x_body: leaq 8(%rdx),%rdi movq (%rdx),%rdx leaq 64+32(%rsp),%rbx movq %rdx,%r9 mulxq 0(%rsi),%r8,%rax mulxq 8(%rsi),%r11,%r14 addq %rax,%r11 movq %rdi,8(%rsp) mulxq 16(%rsi),%r12,%r13 adcq %r14,%r12 adcq $0,%r13 movq %r8,%rdi imulq 24(%rsp),%r8 xorq %rbp,%rbp mulxq 24(%rsi),%rax,%r14 movq %r8,%rdx leaq 32(%rsi),%rsi adcxq %rax,%r13 adcxq %rbp,%r14 mulxq 0(%rcx),%rax,%r10 adcxq %rax,%rdi adoxq %r11,%r10 mulxq 8(%rcx),%rax,%r11 adcxq %rax,%r10 adoxq %r12,%r11 .byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 movq 48(%rsp),%rdi movq %r10,-32(%rbx) adcxq %rax,%r11 adoxq %r13,%r12 mulxq 24(%rcx),%rax,%r15 movq %r9,%rdx movq %r11,-24(%rbx) adcxq %rax,%r12 adoxq %rbp,%r15 leaq 32(%rcx),%rcx movq %r12,-16(%rbx) jmp L$mulx4x_1st .p2align 5 L$mulx4x_1st: adcxq %rbp,%r15 mulxq 0(%rsi),%r10,%rax adcxq %r14,%r10 mulxq 8(%rsi),%r11,%r14 adcxq %rax,%r11 mulxq 16(%rsi),%r12,%rax adcxq %r14,%r12 mulxq 24(%rsi),%r13,%r14 .byte 0x67,0x67 movq %r8,%rdx adcxq %rax,%r13 adcxq %rbp,%r14 leaq 32(%rsi),%rsi leaq 32(%rbx),%rbx adoxq %r15,%r10 mulxq 0(%rcx),%rax,%r15 adcxq %rax,%r10 adoxq %r15,%r11 mulxq 8(%rcx),%rax,%r15 adcxq %rax,%r11 adoxq %r15,%r12 mulxq 16(%rcx),%rax,%r15 movq %r10,-40(%rbx) adcxq %rax,%r12 movq %r11,-32(%rbx) adoxq %r15,%r13 mulxq 24(%rcx),%rax,%r15 movq %r9,%rdx movq %r12,-24(%rbx) adcxq %rax,%r13 adoxq %rbp,%r15 leaq 32(%rcx),%rcx movq %r13,-16(%rbx) decq %rdi jnz L$mulx4x_1st movq 0(%rsp),%rax movq 8(%rsp),%rdi adcq %rbp,%r15 addq %r15,%r14 sbbq %r15,%r15 movq %r14,-8(%rbx) jmp L$mulx4x_outer .p2align 5 L$mulx4x_outer: movq (%rdi),%rdx leaq 8(%rdi),%rdi subq %rax,%rsi movq %r15,(%rbx) leaq 64+32(%rsp),%rbx subq %rax,%rcx 
mulxq 0(%rsi),%r8,%r11 xorl %ebp,%ebp movq %rdx,%r9 mulxq 8(%rsi),%r14,%r12 adoxq -32(%rbx),%r8 adcxq %r14,%r11 mulxq 16(%rsi),%r15,%r13 adoxq -24(%rbx),%r11 adcxq %r15,%r12 adoxq -16(%rbx),%r12 adcxq %rbp,%r13 adoxq %rbp,%r13 movq %rdi,8(%rsp) movq %r8,%r15 imulq 24(%rsp),%r8 xorl %ebp,%ebp mulxq 24(%rsi),%rax,%r14 movq %r8,%rdx adcxq %rax,%r13 adoxq -8(%rbx),%r13 adcxq %rbp,%r14 leaq 32(%rsi),%rsi adoxq %rbp,%r14 mulxq 0(%rcx),%rax,%r10 adcxq %rax,%r15 adoxq %r11,%r10 mulxq 8(%rcx),%rax,%r11 adcxq %rax,%r10 adoxq %r12,%r11 mulxq 16(%rcx),%rax,%r12 movq %r10,-32(%rbx) adcxq %rax,%r11 adoxq %r13,%r12 mulxq 24(%rcx),%rax,%r15 movq %r9,%rdx movq %r11,-24(%rbx) leaq 32(%rcx),%rcx adcxq %rax,%r12 adoxq %rbp,%r15 movq 48(%rsp),%rdi movq %r12,-16(%rbx) jmp L$mulx4x_inner .p2align 5 L$mulx4x_inner: mulxq 0(%rsi),%r10,%rax adcxq %rbp,%r15 adoxq %r14,%r10 mulxq 8(%rsi),%r11,%r14 adcxq 0(%rbx),%r10 adoxq %rax,%r11 mulxq 16(%rsi),%r12,%rax adcxq 8(%rbx),%r11 adoxq %r14,%r12 mulxq 24(%rsi),%r13,%r14 movq %r8,%rdx adcxq 16(%rbx),%r12 adoxq %rax,%r13 adcxq 24(%rbx),%r13 adoxq %rbp,%r14 leaq 32(%rsi),%rsi leaq 32(%rbx),%rbx adcxq %rbp,%r14 adoxq %r15,%r10 mulxq 0(%rcx),%rax,%r15 adcxq %rax,%r10 adoxq %r15,%r11 mulxq 8(%rcx),%rax,%r15 adcxq %rax,%r11 adoxq %r15,%r12 mulxq 16(%rcx),%rax,%r15 movq %r10,-40(%rbx) adcxq %rax,%r12 adoxq %r15,%r13 mulxq 24(%rcx),%rax,%r15 movq %r9,%rdx movq %r11,-32(%rbx) movq %r12,-24(%rbx) adcxq %rax,%r13 adoxq %rbp,%r15 leaq 32(%rcx),%rcx movq %r13,-16(%rbx) decq %rdi jnz L$mulx4x_inner movq 0(%rsp),%rax movq 8(%rsp),%rdi adcq %rbp,%r15 subq 0(%rbx),%rbp adcq %r15,%r14 sbbq %r15,%r15 movq %r14,-8(%rbx) cmpq 16(%rsp),%rdi jne L$mulx4x_outer leaq 64(%rsp),%rbx subq %rax,%rcx negq %r15 movq %rax,%rdx shrq $3+2,%rax movq 32(%rsp),%rdi jmp L$mulx4x_sub .p2align 5 L$mulx4x_sub: movq 0(%rbx),%r11 movq 8(%rbx),%r12 movq 16(%rbx),%r13 movq 24(%rbx),%r14 leaq 32(%rbx),%rbx sbbq 0(%rcx),%r11 sbbq 8(%rcx),%r12 sbbq 16(%rcx),%r13 sbbq 24(%rcx),%r14 leaq 32(%rcx),%rcx movq %r11,0(%rdi) movq %r12,8(%rdi) movq %r13,16(%rdi) movq %r14,24(%rdi) leaq 32(%rdi),%rdi decq %rax jnz L$mulx4x_sub sbbq $0,%r15 leaq 64(%rsp),%rbx subq %rdx,%rdi .byte 102,73,15,110,207 pxor %xmm0,%xmm0 pshufd $0,%xmm1,%xmm1 movq 40(%rsp),%rsi jmp L$mulx4x_cond_copy .p2align 5 L$mulx4x_cond_copy: movdqa 0(%rbx),%xmm2 movdqa 16(%rbx),%xmm3 leaq 32(%rbx),%rbx movdqu 0(%rdi),%xmm4 movdqu 16(%rdi),%xmm5 leaq 32(%rdi),%rdi movdqa %xmm0,-32(%rbx) movdqa %xmm0,-16(%rbx) pcmpeqd %xmm1,%xmm0 pand %xmm1,%xmm2 pand %xmm1,%xmm3 pand %xmm0,%xmm4 pand %xmm0,%xmm5 pxor %xmm0,%xmm0 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqu %xmm4,-32(%rdi) movdqu %xmm5,-16(%rdi) subq $32,%rdx jnz L$mulx4x_cond_copy movq %rdx,(%rbx) movq $1,%rax movq -48(%rsi),%r15 movq -40(%rsi),%r14 movq -32(%rsi),%r13 movq -24(%rsi),%r12 movq -16(%rsi),%rbp movq -8(%rsi),%rbx leaq (%rsi),%rsp L$mulx4x_epilogue: ret .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .p2align 4 #endif ring-0.17.14/pregenerated/x86_64-mont-nasm.asm000064400000000000000000000546401046102023000167620ustar 00000000000000; This file is generated from a similarly-named Perl script in the BoringSSL ; source tree. Do not edit by hand. 
%ifidn __OUTPUT_FORMAT__, win64 default rel %define XMMWORD %define YMMWORD %define ZMMWORD %define _CET_ENDBR %include "ring_core_generated/prefix_symbols_nasm.inc" section .text code align=64 global bn_mul_mont_nohw ALIGN 16 bn_mul_mont_nohw: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_bn_mul_mont_nohw: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD[40+rsp] mov r9,QWORD[48+rsp] _CET_ENDBR mov r9d,r9d mov rax,rsp push rbx push rbp push r12 push r13 push r14 push r15 neg r9 mov r11,rsp lea r10,[((-16))+r9*8+rsp] neg r9 and r10,-1024 sub r11,r10 and r11,-4096 lea rsp,[r11*1+r10] mov r11,QWORD[rsp] cmp rsp,r10 ja NEAR $L$mul_page_walk jmp NEAR $L$mul_page_walk_done ALIGN 16 $L$mul_page_walk: lea rsp,[((-4096))+rsp] mov r11,QWORD[rsp] cmp rsp,r10 ja NEAR $L$mul_page_walk $L$mul_page_walk_done: mov QWORD[8+r9*8+rsp],rax $L$mul_body: mov r12,rdx mov r8,QWORD[r8] mov rbx,QWORD[r12] mov rax,QWORD[rsi] xor r14,r14 xor r15,r15 mov rbp,r8 mul rbx mov r10,rax mov rax,QWORD[rcx] imul rbp,r10 mov r11,rdx mul rbp add r10,rax mov rax,QWORD[8+rsi] adc rdx,0 mov r13,rdx lea r15,[1+r15] jmp NEAR $L$1st_enter ALIGN 16 $L$1st: add r13,rax mov rax,QWORD[r15*8+rsi] adc rdx,0 add r13,r11 mov r11,r10 adc rdx,0 mov QWORD[((-16))+r15*8+rsp],r13 mov r13,rdx $L$1st_enter: mul rbx add r11,rax mov rax,QWORD[r15*8+rcx] adc rdx,0 lea r15,[1+r15] mov r10,rdx mul rbp cmp r15,r9 jne NEAR $L$1st add r13,rax mov rax,QWORD[rsi] adc rdx,0 add r13,r11 adc rdx,0 mov QWORD[((-16))+r15*8+rsp],r13 mov r13,rdx mov r11,r10 xor rdx,rdx add r13,r11 adc rdx,0 mov QWORD[((-8))+r9*8+rsp],r13 mov QWORD[r9*8+rsp],rdx lea r14,[1+r14] jmp NEAR $L$outer ALIGN 16 $L$outer: mov rbx,QWORD[r14*8+r12] xor r15,r15 mov rbp,r8 mov r10,QWORD[rsp] mul rbx add r10,rax mov rax,QWORD[rcx] adc rdx,0 imul rbp,r10 mov r11,rdx mul rbp add r10,rax mov rax,QWORD[8+rsi] adc rdx,0 mov r10,QWORD[8+rsp] mov r13,rdx lea r15,[1+r15] jmp NEAR $L$inner_enter ALIGN 16 $L$inner: add r13,rax mov rax,QWORD[r15*8+rsi] adc rdx,0 add r13,r10 mov r10,QWORD[r15*8+rsp] adc rdx,0 mov QWORD[((-16))+r15*8+rsp],r13 mov r13,rdx $L$inner_enter: mul rbx add r11,rax mov rax,QWORD[r15*8+rcx] adc rdx,0 add r10,r11 mov r11,rdx adc r11,0 lea r15,[1+r15] mul rbp cmp r15,r9 jne NEAR $L$inner add r13,rax mov rax,QWORD[rsi] adc rdx,0 add r13,r10 mov r10,QWORD[r15*8+rsp] adc rdx,0 mov QWORD[((-16))+r15*8+rsp],r13 mov r13,rdx xor rdx,rdx add r13,r11 adc rdx,0 add r13,r10 adc rdx,0 mov QWORD[((-8))+r9*8+rsp],r13 mov QWORD[r9*8+rsp],rdx lea r14,[1+r14] cmp r14,r9 jb NEAR $L$outer xor r14,r14 mov rax,QWORD[rsp] mov r15,r9 ALIGN 16 $L$sub: sbb rax,QWORD[r14*8+rcx] mov QWORD[r14*8+rdi],rax mov rax,QWORD[8+r14*8+rsp] lea r14,[1+r14] dec r15 jnz NEAR $L$sub sbb rax,0 mov rbx,-1 xor rbx,rax xor r14,r14 mov r15,r9 $L$copy: mov rcx,QWORD[r14*8+rdi] mov rdx,QWORD[r14*8+rsp] and rcx,rbx and rdx,rax mov QWORD[r14*8+rsp],r9 or rdx,rcx mov QWORD[r14*8+rdi],rdx lea r14,[1+r14] sub r15,1 jnz NEAR $L$copy mov rsi,QWORD[8+r9*8+rsp] mov rax,1 mov r15,QWORD[((-48))+rsi] mov r14,QWORD[((-40))+rsi] mov r13,QWORD[((-32))+rsi] mov r12,QWORD[((-24))+rsi] mov rbp,QWORD[((-16))+rsi] mov rbx,QWORD[((-8))+rsi] lea rsp,[rsi] $L$mul_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$SEH_end_bn_mul_mont_nohw: global bn_mul4x_mont ALIGN 16 bn_mul4x_mont: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_bn_mul4x_mont: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD[40+rsp] mov r9,QWORD[48+rsp] _CET_ENDBR mov 
r9d,r9d mov rax,rsp push rbx push rbp push r12 push r13 push r14 push r15 neg r9 mov r11,rsp lea r10,[((-32))+r9*8+rsp] neg r9 and r10,-1024 sub r11,r10 and r11,-4096 lea rsp,[r11*1+r10] mov r11,QWORD[rsp] cmp rsp,r10 ja NEAR $L$mul4x_page_walk jmp NEAR $L$mul4x_page_walk_done $L$mul4x_page_walk: lea rsp,[((-4096))+rsp] mov r11,QWORD[rsp] cmp rsp,r10 ja NEAR $L$mul4x_page_walk $L$mul4x_page_walk_done: mov QWORD[8+r9*8+rsp],rax $L$mul4x_body: mov QWORD[16+r9*8+rsp],rdi mov r12,rdx mov r8,QWORD[r8] mov rbx,QWORD[r12] mov rax,QWORD[rsi] xor r14,r14 xor r15,r15 mov rbp,r8 mul rbx mov r10,rax mov rax,QWORD[rcx] imul rbp,r10 mov r11,rdx mul rbp add r10,rax mov rax,QWORD[8+rsi] adc rdx,0 mov rdi,rdx mul rbx add r11,rax mov rax,QWORD[8+rcx] adc rdx,0 mov r10,rdx mul rbp add rdi,rax mov rax,QWORD[16+rsi] adc rdx,0 add rdi,r11 lea r15,[4+r15] adc rdx,0 mov QWORD[rsp],rdi mov r13,rdx jmp NEAR $L$1st4x ALIGN 16 $L$1st4x: mul rbx add r10,rax mov rax,QWORD[((-16))+r15*8+rcx] adc rdx,0 mov r11,rdx mul rbp add r13,rax mov rax,QWORD[((-8))+r15*8+rsi] adc rdx,0 add r13,r10 adc rdx,0 mov QWORD[((-24))+r15*8+rsp],r13 mov rdi,rdx mul rbx add r11,rax mov rax,QWORD[((-8))+r15*8+rcx] adc rdx,0 mov r10,rdx mul rbp add rdi,rax mov rax,QWORD[r15*8+rsi] adc rdx,0 add rdi,r11 adc rdx,0 mov QWORD[((-16))+r15*8+rsp],rdi mov r13,rdx mul rbx add r10,rax mov rax,QWORD[r15*8+rcx] adc rdx,0 mov r11,rdx mul rbp add r13,rax mov rax,QWORD[8+r15*8+rsi] adc rdx,0 add r13,r10 adc rdx,0 mov QWORD[((-8))+r15*8+rsp],r13 mov rdi,rdx mul rbx add r11,rax mov rax,QWORD[8+r15*8+rcx] adc rdx,0 lea r15,[4+r15] mov r10,rdx mul rbp add rdi,rax mov rax,QWORD[((-16))+r15*8+rsi] adc rdx,0 add rdi,r11 adc rdx,0 mov QWORD[((-32))+r15*8+rsp],rdi mov r13,rdx cmp r15,r9 jb NEAR $L$1st4x mul rbx add r10,rax mov rax,QWORD[((-16))+r15*8+rcx] adc rdx,0 mov r11,rdx mul rbp add r13,rax mov rax,QWORD[((-8))+r15*8+rsi] adc rdx,0 add r13,r10 adc rdx,0 mov QWORD[((-24))+r15*8+rsp],r13 mov rdi,rdx mul rbx add r11,rax mov rax,QWORD[((-8))+r15*8+rcx] adc rdx,0 mov r10,rdx mul rbp add rdi,rax mov rax,QWORD[rsi] adc rdx,0 add rdi,r11 adc rdx,0 mov QWORD[((-16))+r15*8+rsp],rdi mov r13,rdx xor rdi,rdi add r13,r10 adc rdi,0 mov QWORD[((-8))+r15*8+rsp],r13 mov QWORD[r15*8+rsp],rdi lea r14,[1+r14] ALIGN 4 $L$outer4x: mov rbx,QWORD[r14*8+r12] xor r15,r15 mov r10,QWORD[rsp] mov rbp,r8 mul rbx add r10,rax mov rax,QWORD[rcx] adc rdx,0 imul rbp,r10 mov r11,rdx mul rbp add r10,rax mov rax,QWORD[8+rsi] adc rdx,0 mov rdi,rdx mul rbx add r11,rax mov rax,QWORD[8+rcx] adc rdx,0 add r11,QWORD[8+rsp] adc rdx,0 mov r10,rdx mul rbp add rdi,rax mov rax,QWORD[16+rsi] adc rdx,0 add rdi,r11 lea r15,[4+r15] adc rdx,0 mov QWORD[rsp],rdi mov r13,rdx jmp NEAR $L$inner4x ALIGN 16 $L$inner4x: mul rbx add r10,rax mov rax,QWORD[((-16))+r15*8+rcx] adc rdx,0 add r10,QWORD[((-16))+r15*8+rsp] adc rdx,0 mov r11,rdx mul rbp add r13,rax mov rax,QWORD[((-8))+r15*8+rsi] adc rdx,0 add r13,r10 adc rdx,0 mov QWORD[((-24))+r15*8+rsp],r13 mov rdi,rdx mul rbx add r11,rax mov rax,QWORD[((-8))+r15*8+rcx] adc rdx,0 add r11,QWORD[((-8))+r15*8+rsp] adc rdx,0 mov r10,rdx mul rbp add rdi,rax mov rax,QWORD[r15*8+rsi] adc rdx,0 add rdi,r11 adc rdx,0 mov QWORD[((-16))+r15*8+rsp],rdi mov r13,rdx mul rbx add r10,rax mov rax,QWORD[r15*8+rcx] adc rdx,0 add r10,QWORD[r15*8+rsp] adc rdx,0 mov r11,rdx mul rbp add r13,rax mov rax,QWORD[8+r15*8+rsi] adc rdx,0 add r13,r10 adc rdx,0 mov QWORD[((-8))+r15*8+rsp],r13 mov rdi,rdx mul rbx add r11,rax mov rax,QWORD[8+r15*8+rcx] adc rdx,0 add r11,QWORD[8+r15*8+rsp] adc rdx,0 lea 
r15,[4+r15] mov r10,rdx mul rbp add rdi,rax mov rax,QWORD[((-16))+r15*8+rsi] adc rdx,0 add rdi,r11 adc rdx,0 mov QWORD[((-32))+r15*8+rsp],rdi mov r13,rdx cmp r15,r9 jb NEAR $L$inner4x mul rbx add r10,rax mov rax,QWORD[((-16))+r15*8+rcx] adc rdx,0 add r10,QWORD[((-16))+r15*8+rsp] adc rdx,0 mov r11,rdx mul rbp add r13,rax mov rax,QWORD[((-8))+r15*8+rsi] adc rdx,0 add r13,r10 adc rdx,0 mov QWORD[((-24))+r15*8+rsp],r13 mov rdi,rdx mul rbx add r11,rax mov rax,QWORD[((-8))+r15*8+rcx] adc rdx,0 add r11,QWORD[((-8))+r15*8+rsp] adc rdx,0 lea r14,[1+r14] mov r10,rdx mul rbp add rdi,rax mov rax,QWORD[rsi] adc rdx,0 add rdi,r11 adc rdx,0 mov QWORD[((-16))+r15*8+rsp],rdi mov r13,rdx xor rdi,rdi add r13,r10 adc rdi,0 add r13,QWORD[r9*8+rsp] adc rdi,0 mov QWORD[((-8))+r15*8+rsp],r13 mov QWORD[r15*8+rsp],rdi cmp r14,r9 jb NEAR $L$outer4x mov rdi,QWORD[16+r9*8+rsp] lea r15,[((-4))+r9] mov rax,QWORD[rsp] mov rdx,QWORD[8+rsp] shr r15,2 lea rsi,[rsp] xor r14,r14 sub rax,QWORD[rcx] mov rbx,QWORD[16+rsi] mov rbp,QWORD[24+rsi] sbb rdx,QWORD[8+rcx] $L$sub4x: mov QWORD[r14*8+rdi],rax mov QWORD[8+r14*8+rdi],rdx sbb rbx,QWORD[16+r14*8+rcx] mov rax,QWORD[32+r14*8+rsi] mov rdx,QWORD[40+r14*8+rsi] sbb rbp,QWORD[24+r14*8+rcx] mov QWORD[16+r14*8+rdi],rbx mov QWORD[24+r14*8+rdi],rbp sbb rax,QWORD[32+r14*8+rcx] mov rbx,QWORD[48+r14*8+rsi] mov rbp,QWORD[56+r14*8+rsi] sbb rdx,QWORD[40+r14*8+rcx] lea r14,[4+r14] dec r15 jnz NEAR $L$sub4x mov QWORD[r14*8+rdi],rax mov rax,QWORD[32+r14*8+rsi] sbb rbx,QWORD[16+r14*8+rcx] mov QWORD[8+r14*8+rdi],rdx sbb rbp,QWORD[24+r14*8+rcx] mov QWORD[16+r14*8+rdi],rbx sbb rax,0 mov QWORD[24+r14*8+rdi],rbp pxor xmm0,xmm0 DB 102,72,15,110,224 pcmpeqd xmm5,xmm5 pshufd xmm4,xmm4,0 mov r15,r9 pxor xmm5,xmm4 shr r15,2 xor eax,eax jmp NEAR $L$copy4x ALIGN 16 $L$copy4x: movdqa xmm1,XMMWORD[rax*1+rsp] movdqu xmm2,XMMWORD[rax*1+rdi] pand xmm1,xmm4 pand xmm2,xmm5 movdqa xmm3,XMMWORD[16+rax*1+rsp] movdqa XMMWORD[rax*1+rsp],xmm0 por xmm1,xmm2 movdqu xmm2,XMMWORD[16+rax*1+rdi] movdqu XMMWORD[rax*1+rdi],xmm1 pand xmm3,xmm4 pand xmm2,xmm5 movdqa XMMWORD[16+rax*1+rsp],xmm0 por xmm3,xmm2 movdqu XMMWORD[16+rax*1+rdi],xmm3 lea rax,[32+rax] dec r15 jnz NEAR $L$copy4x mov rsi,QWORD[8+r9*8+rsp] mov rax,1 mov r15,QWORD[((-48))+rsi] mov r14,QWORD[((-40))+rsi] mov r13,QWORD[((-32))+rsi] mov r12,QWORD[((-24))+rsi] mov rbp,QWORD[((-16))+rsi] mov rbx,QWORD[((-8))+rsi] lea rsp,[rsi] $L$mul4x_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$SEH_end_bn_mul4x_mont: EXTERN bn_sqrx8x_internal EXTERN bn_sqr8x_internal global bn_sqr8x_mont ALIGN 32 bn_sqr8x_mont: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_bn_sqr8x_mont: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD[40+rsp] mov r9,QWORD[48+rsp] _CET_ENDBR mov r9d,r9d mov rax,rsp push rbx push rbp push r12 push r13 push r14 push r15 $L$sqr8x_prologue: mov r10d,r9d shl r9d,3 shl r10,3+2 neg r9 lea r11,[((-64))+r9*2+rsp] mov rbp,rsp mov r8,QWORD[r8] sub r11,rsi and r11,4095 cmp r10,r11 jb NEAR $L$sqr8x_sp_alt sub rbp,r11 lea rbp,[((-64))+r9*2+rbp] jmp NEAR $L$sqr8x_sp_done ALIGN 32 $L$sqr8x_sp_alt: lea r10,[((4096-64))+r9*2] lea rbp,[((-64))+r9*2+rbp] sub r11,r10 mov r10,0 cmovc r11,r10 sub rbp,r11 $L$sqr8x_sp_done: and rbp,-64 mov r11,rsp sub r11,rbp and r11,-4096 lea rsp,[rbp*1+r11] mov r10,QWORD[rsp] cmp rsp,rbp ja NEAR $L$sqr8x_page_walk jmp NEAR $L$sqr8x_page_walk_done ALIGN 16 $L$sqr8x_page_walk: lea rsp,[((-4096))+rsp] mov r10,QWORD[rsp] cmp rsp,rbp ja NEAR $L$sqr8x_page_walk 
$L$sqr8x_page_walk_done: mov r10,r9 neg r9 mov QWORD[32+rsp],r8 mov QWORD[40+rsp],rax $L$sqr8x_body: DB 102,72,15,110,209 pxor xmm0,xmm0 DB 102,72,15,110,207 DB 102,73,15,110,218 test rdx,rdx jz NEAR $L$sqr8x_nox call bn_sqrx8x_internal lea rbx,[rcx*1+r8] mov r9,rcx mov rdx,rcx DB 102,72,15,126,207 sar rcx,3+2 jmp NEAR $L$sqr8x_sub ALIGN 32 $L$sqr8x_nox: call bn_sqr8x_internal lea rbx,[r9*1+rdi] mov rcx,r9 mov rdx,r9 DB 102,72,15,126,207 sar rcx,3+2 jmp NEAR $L$sqr8x_sub ALIGN 32 $L$sqr8x_sub: mov r12,QWORD[rbx] mov r13,QWORD[8+rbx] mov r14,QWORD[16+rbx] mov r15,QWORD[24+rbx] lea rbx,[32+rbx] sbb r12,QWORD[rbp] sbb r13,QWORD[8+rbp] sbb r14,QWORD[16+rbp] sbb r15,QWORD[24+rbp] lea rbp,[32+rbp] mov QWORD[rdi],r12 mov QWORD[8+rdi],r13 mov QWORD[16+rdi],r14 mov QWORD[24+rdi],r15 lea rdi,[32+rdi] inc rcx jnz NEAR $L$sqr8x_sub sbb rax,0 lea rbx,[r9*1+rbx] lea rdi,[r9*1+rdi] DB 102,72,15,110,200 pxor xmm0,xmm0 pshufd xmm1,xmm1,0 mov rsi,QWORD[40+rsp] jmp NEAR $L$sqr8x_cond_copy ALIGN 32 $L$sqr8x_cond_copy: movdqa xmm2,XMMWORD[rbx] movdqa xmm3,XMMWORD[16+rbx] lea rbx,[32+rbx] movdqu xmm4,XMMWORD[rdi] movdqu xmm5,XMMWORD[16+rdi] lea rdi,[32+rdi] movdqa XMMWORD[(-32)+rbx],xmm0 movdqa XMMWORD[(-16)+rbx],xmm0 movdqa XMMWORD[(-32)+rdx*1+rbx],xmm0 movdqa XMMWORD[(-16)+rdx*1+rbx],xmm0 pcmpeqd xmm0,xmm1 pand xmm2,xmm1 pand xmm3,xmm1 pand xmm4,xmm0 pand xmm5,xmm0 pxor xmm0,xmm0 por xmm4,xmm2 por xmm5,xmm3 movdqu XMMWORD[(-32)+rdi],xmm4 movdqu XMMWORD[(-16)+rdi],xmm5 add r9,32 jnz NEAR $L$sqr8x_cond_copy mov rax,1 mov r15,QWORD[((-48))+rsi] mov r14,QWORD[((-40))+rsi] mov r13,QWORD[((-32))+rsi] mov r12,QWORD[((-24))+rsi] mov rbp,QWORD[((-16))+rsi] mov rbx,QWORD[((-8))+rsi] lea rsp,[rsi] $L$sqr8x_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$SEH_end_bn_sqr8x_mont: global bn_mulx4x_mont ALIGN 32 bn_mulx4x_mont: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_bn_mulx4x_mont: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD[40+rsp] mov r9,QWORD[48+rsp] _CET_ENDBR mov rax,rsp push rbx push rbp push r12 push r13 push r14 push r15 $L$mulx4x_prologue: shl r9d,3 xor r10,r10 sub r10,r9 mov r8,QWORD[r8] lea rbp,[((-72))+r10*1+rsp] and rbp,-128 mov r11,rsp sub r11,rbp and r11,-4096 lea rsp,[rbp*1+r11] mov r10,QWORD[rsp] cmp rsp,rbp ja NEAR $L$mulx4x_page_walk jmp NEAR $L$mulx4x_page_walk_done ALIGN 16 $L$mulx4x_page_walk: lea rsp,[((-4096))+rsp] mov r10,QWORD[rsp] cmp rsp,rbp ja NEAR $L$mulx4x_page_walk $L$mulx4x_page_walk_done: lea r10,[r9*1+rdx] mov QWORD[rsp],r9 shr r9,5 mov QWORD[16+rsp],r10 sub r9,1 mov QWORD[24+rsp],r8 mov QWORD[32+rsp],rdi mov QWORD[40+rsp],rax mov QWORD[48+rsp],r9 jmp NEAR $L$mulx4x_body ALIGN 32 $L$mulx4x_body: lea rdi,[8+rdx] mov rdx,QWORD[rdx] lea rbx,[((64+32))+rsp] mov r9,rdx mulx rax,r8,QWORD[rsi] mulx r14,r11,QWORD[8+rsi] add r11,rax mov QWORD[8+rsp],rdi mulx r13,r12,QWORD[16+rsi] adc r12,r14 adc r13,0 mov rdi,r8 imul r8,QWORD[24+rsp] xor rbp,rbp mulx r14,rax,QWORD[24+rsi] mov rdx,r8 lea rsi,[32+rsi] adcx r13,rax adcx r14,rbp mulx r10,rax,QWORD[rcx] adcx rdi,rax adox r10,r11 mulx r11,rax,QWORD[8+rcx] adcx r10,rax adox r11,r12 DB 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 mov rdi,QWORD[48+rsp] mov QWORD[((-32))+rbx],r10 adcx r11,rax adox r12,r13 mulx r15,rax,QWORD[24+rcx] mov rdx,r9 mov QWORD[((-24))+rbx],r11 adcx r12,rax adox r15,rbp lea rcx,[32+rcx] mov QWORD[((-16))+rbx],r12 jmp NEAR $L$mulx4x_1st ALIGN 32 $L$mulx4x_1st: adcx r15,rbp mulx rax,r10,QWORD[rsi] adcx r10,r14 mulx r14,r11,QWORD[8+rsi] adcx r11,rax 
mulx rax,r12,QWORD[16+rsi] adcx r12,r14 mulx r14,r13,QWORD[24+rsi] DB 0x67,0x67 mov rdx,r8 adcx r13,rax adcx r14,rbp lea rsi,[32+rsi] lea rbx,[32+rbx] adox r10,r15 mulx r15,rax,QWORD[rcx] adcx r10,rax adox r11,r15 mulx r15,rax,QWORD[8+rcx] adcx r11,rax adox r12,r15 mulx r15,rax,QWORD[16+rcx] mov QWORD[((-40))+rbx],r10 adcx r12,rax mov QWORD[((-32))+rbx],r11 adox r13,r15 mulx r15,rax,QWORD[24+rcx] mov rdx,r9 mov QWORD[((-24))+rbx],r12 adcx r13,rax adox r15,rbp lea rcx,[32+rcx] mov QWORD[((-16))+rbx],r13 dec rdi jnz NEAR $L$mulx4x_1st mov rax,QWORD[rsp] mov rdi,QWORD[8+rsp] adc r15,rbp add r14,r15 sbb r15,r15 mov QWORD[((-8))+rbx],r14 jmp NEAR $L$mulx4x_outer ALIGN 32 $L$mulx4x_outer: mov rdx,QWORD[rdi] lea rdi,[8+rdi] sub rsi,rax mov QWORD[rbx],r15 lea rbx,[((64+32))+rsp] sub rcx,rax mulx r11,r8,QWORD[rsi] xor ebp,ebp mov r9,rdx mulx r12,r14,QWORD[8+rsi] adox r8,QWORD[((-32))+rbx] adcx r11,r14 mulx r13,r15,QWORD[16+rsi] adox r11,QWORD[((-24))+rbx] adcx r12,r15 adox r12,QWORD[((-16))+rbx] adcx r13,rbp adox r13,rbp mov QWORD[8+rsp],rdi mov r15,r8 imul r8,QWORD[24+rsp] xor ebp,ebp mulx r14,rax,QWORD[24+rsi] mov rdx,r8 adcx r13,rax adox r13,QWORD[((-8))+rbx] adcx r14,rbp lea rsi,[32+rsi] adox r14,rbp mulx r10,rax,QWORD[rcx] adcx r15,rax adox r10,r11 mulx r11,rax,QWORD[8+rcx] adcx r10,rax adox r11,r12 mulx r12,rax,QWORD[16+rcx] mov QWORD[((-32))+rbx],r10 adcx r11,rax adox r12,r13 mulx r15,rax,QWORD[24+rcx] mov rdx,r9 mov QWORD[((-24))+rbx],r11 lea rcx,[32+rcx] adcx r12,rax adox r15,rbp mov rdi,QWORD[48+rsp] mov QWORD[((-16))+rbx],r12 jmp NEAR $L$mulx4x_inner ALIGN 32 $L$mulx4x_inner: mulx rax,r10,QWORD[rsi] adcx r15,rbp adox r10,r14 mulx r14,r11,QWORD[8+rsi] adcx r10,QWORD[rbx] adox r11,rax mulx rax,r12,QWORD[16+rsi] adcx r11,QWORD[8+rbx] adox r12,r14 mulx r14,r13,QWORD[24+rsi] mov rdx,r8 adcx r12,QWORD[16+rbx] adox r13,rax adcx r13,QWORD[24+rbx] adox r14,rbp lea rsi,[32+rsi] lea rbx,[32+rbx] adcx r14,rbp adox r10,r15 mulx r15,rax,QWORD[rcx] adcx r10,rax adox r11,r15 mulx r15,rax,QWORD[8+rcx] adcx r11,rax adox r12,r15 mulx r15,rax,QWORD[16+rcx] mov QWORD[((-40))+rbx],r10 adcx r12,rax adox r13,r15 mulx r15,rax,QWORD[24+rcx] mov rdx,r9 mov QWORD[((-32))+rbx],r11 mov QWORD[((-24))+rbx],r12 adcx r13,rax adox r15,rbp lea rcx,[32+rcx] mov QWORD[((-16))+rbx],r13 dec rdi jnz NEAR $L$mulx4x_inner mov rax,QWORD[rsp] mov rdi,QWORD[8+rsp] adc r15,rbp sub rbp,QWORD[rbx] adc r14,r15 sbb r15,r15 mov QWORD[((-8))+rbx],r14 cmp rdi,QWORD[16+rsp] jne NEAR $L$mulx4x_outer lea rbx,[64+rsp] sub rcx,rax neg r15 mov rdx,rax shr rax,3+2 mov rdi,QWORD[32+rsp] jmp NEAR $L$mulx4x_sub ALIGN 32 $L$mulx4x_sub: mov r11,QWORD[rbx] mov r12,QWORD[8+rbx] mov r13,QWORD[16+rbx] mov r14,QWORD[24+rbx] lea rbx,[32+rbx] sbb r11,QWORD[rcx] sbb r12,QWORD[8+rcx] sbb r13,QWORD[16+rcx] sbb r14,QWORD[24+rcx] lea rcx,[32+rcx] mov QWORD[rdi],r11 mov QWORD[8+rdi],r12 mov QWORD[16+rdi],r13 mov QWORD[24+rdi],r14 lea rdi,[32+rdi] dec rax jnz NEAR $L$mulx4x_sub sbb r15,0 lea rbx,[64+rsp] sub rdi,rdx DB 102,73,15,110,207 pxor xmm0,xmm0 pshufd xmm1,xmm1,0 mov rsi,QWORD[40+rsp] jmp NEAR $L$mulx4x_cond_copy ALIGN 32 $L$mulx4x_cond_copy: movdqa xmm2,XMMWORD[rbx] movdqa xmm3,XMMWORD[16+rbx] lea rbx,[32+rbx] movdqu xmm4,XMMWORD[rdi] movdqu xmm5,XMMWORD[16+rdi] lea rdi,[32+rdi] movdqa XMMWORD[(-32)+rbx],xmm0 movdqa XMMWORD[(-16)+rbx],xmm0 pcmpeqd xmm0,xmm1 pand xmm2,xmm1 pand xmm3,xmm1 pand xmm4,xmm0 pand xmm5,xmm0 pxor xmm0,xmm0 por xmm4,xmm2 por xmm5,xmm3 movdqu XMMWORD[(-32)+rdi],xmm4 movdqu XMMWORD[(-16)+rdi],xmm5 sub rdx,32 jnz NEAR $L$mulx4x_cond_copy 
mov QWORD[rbx],rdx mov rax,1 mov r15,QWORD[((-48))+rsi] mov r14,QWORD[((-40))+rsi] mov r13,QWORD[((-32))+rsi] mov r12,QWORD[((-24))+rsi] mov rbp,QWORD[((-16))+rsi] mov rbx,QWORD[((-8))+rsi] lea rsp,[rsi] $L$mulx4x_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$SEH_end_bn_mulx4x_mont: DB 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 DB 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56 DB 54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83 DB 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 DB 115,108,46,111,114,103,62,0 ALIGN 16 EXTERN __imp_RtlVirtualUnwind ALIGN 16 mul_handler: push rsi push rdi push rbx push rbp push r12 push r13 push r14 push r15 pushfq sub rsp,64 mov rax,QWORD[120+r8] mov rbx,QWORD[248+r8] mov rsi,QWORD[8+r9] mov r11,QWORD[56+r9] mov r10d,DWORD[r11] lea r10,[r10*1+rsi] cmp rbx,r10 jb NEAR $L$common_seh_tail mov rax,QWORD[152+r8] mov r10d,DWORD[4+r11] lea r10,[r10*1+rsi] cmp rbx,r10 jae NEAR $L$common_seh_tail mov r10,QWORD[192+r8] mov rax,QWORD[8+r10*8+rax] jmp NEAR $L$common_pop_regs ALIGN 16 sqr_handler: push rsi push rdi push rbx push rbp push r12 push r13 push r14 push r15 pushfq sub rsp,64 mov rax,QWORD[120+r8] mov rbx,QWORD[248+r8] mov rsi,QWORD[8+r9] mov r11,QWORD[56+r9] mov r10d,DWORD[r11] lea r10,[r10*1+rsi] cmp rbx,r10 jb NEAR $L$common_seh_tail mov r10d,DWORD[4+r11] lea r10,[r10*1+rsi] cmp rbx,r10 jb NEAR $L$common_pop_regs mov rax,QWORD[152+r8] mov r10d,DWORD[8+r11] lea r10,[r10*1+rsi] cmp rbx,r10 jae NEAR $L$common_seh_tail mov rax,QWORD[40+rax] $L$common_pop_regs: mov rbx,QWORD[((-8))+rax] mov rbp,QWORD[((-16))+rax] mov r12,QWORD[((-24))+rax] mov r13,QWORD[((-32))+rax] mov r14,QWORD[((-40))+rax] mov r15,QWORD[((-48))+rax] mov QWORD[144+r8],rbx mov QWORD[160+r8],rbp mov QWORD[216+r8],r12 mov QWORD[224+r8],r13 mov QWORD[232+r8],r14 mov QWORD[240+r8],r15 $L$common_seh_tail: mov rdi,QWORD[8+rax] mov rsi,QWORD[16+rax] mov QWORD[152+r8],rax mov QWORD[168+r8],rsi mov QWORD[176+r8],rdi mov rdi,QWORD[40+r9] mov rsi,r8 mov ecx,154 DD 0xa548f3fc mov rsi,r9 xor rcx,rcx mov rdx,QWORD[8+rsi] mov r8,QWORD[rsi] mov r9,QWORD[16+rsi] mov r10,QWORD[40+rsi] lea r11,[56+rsi] lea r12,[24+rsi] mov QWORD[32+rsp],r10 mov QWORD[40+rsp],r11 mov QWORD[48+rsp],r12 mov QWORD[56+rsp],rcx call QWORD[__imp_RtlVirtualUnwind] mov eax,1 add rsp,64 popfq pop r15 pop r14 pop r13 pop r12 pop rbp pop rbx pop rdi pop rsi ret section .pdata rdata align=4 ALIGN 4 DD $L$SEH_begin_bn_mul_mont_nohw wrt ..imagebase DD $L$SEH_end_bn_mul_mont_nohw wrt ..imagebase DD $L$SEH_info_bn_mul_mont_nohw wrt ..imagebase DD $L$SEH_begin_bn_mul4x_mont wrt ..imagebase DD $L$SEH_end_bn_mul4x_mont wrt ..imagebase DD $L$SEH_info_bn_mul4x_mont wrt ..imagebase DD $L$SEH_begin_bn_sqr8x_mont wrt ..imagebase DD $L$SEH_end_bn_sqr8x_mont wrt ..imagebase DD $L$SEH_info_bn_sqr8x_mont wrt ..imagebase DD $L$SEH_begin_bn_mulx4x_mont wrt ..imagebase DD $L$SEH_end_bn_mulx4x_mont wrt ..imagebase DD $L$SEH_info_bn_mulx4x_mont wrt ..imagebase section .xdata rdata align=8 ALIGN 8 $L$SEH_info_bn_mul_mont_nohw: DB 9,0,0,0 DD mul_handler wrt ..imagebase DD $L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase $L$SEH_info_bn_mul4x_mont: DB 9,0,0,0 DD mul_handler wrt ..imagebase DD $L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase $L$SEH_info_bn_sqr8x_mont: DB 9,0,0,0 DD sqr_handler wrt ..imagebase DD $L$sqr8x_prologue wrt ..imagebase,$L$sqr8x_body wrt ..imagebase,$L$sqr8x_epilogue wrt ..imagebase ALIGN 8 $L$SEH_info_bn_mulx4x_mont: DB 9,0,0,0 DD sqr_handler wrt 
..imagebase DD $L$mulx4x_prologue wrt ..imagebase,$L$mulx4x_body wrt ..imagebase,$L$mulx4x_epilogue wrt ..imagebase ALIGN 8 %else ; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 ret %endif
ring-0.17.14/pregenerated/x86_64-mont-nasm.o [pregenerated Windows (COFF) object assembled from the x86_64-mont-nasm.asm source above; its binary contents are not representable as text and are omitted here. The readable debug/symbol records in it name "The Netwide Assembler 2.13.03", the source path C:\Users\b\p\ring\pregenerated\x86_64-mont-nasm.asm, and the same ring_core_0_17_14__bn_*_mont symbols and L$ labels that are defined in the assembly above.]
ring-0.17.14/pregenerated/x86_64-mont5-elf.S000064400000000000000000001436141046102023000163010ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand.
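// The entry points defined below -- bn_mul4x_mont_gather5, bn_power5_nohw,
// bn_sqr8x_internal, bn_mulx4x_mont_gather5, bn_powerx5, bn_sqrx8x_internal,
// bn_scatter5 and bn_gather5 -- are the "mont5" Montgomery multiplication,
// squaring and exponentiation helpers. Table lookups are performed by masking
// and OR-ing across the whole power table with SSE2 (pcmpeqd/pand/por against
// the .Linc index pattern) rather than by a secret-indexed load; this is the
// "scatter/gather" scheme named in the trailing CRYPTOGAMS banner, and the "5"
// presumably refers to the 2^5 = 32-entry table that bn_scatter5/bn_gather5
// walk with a 256-byte stride.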
#include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) .text .globl bn_mul4x_mont_gather5 .hidden bn_mul4x_mont_gather5 .type bn_mul4x_mont_gather5,@function .align 32 bn_mul4x_mont_gather5: .cfi_startproc _CET_ENDBR .byte 0x67 movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 .Lmul4x_prologue: .byte 0x67 shll $3,%r9d leaq (%r9,%r9,2),%r10 negq %r9 leaq -320(%rsp,%r9,2),%r11 movq %rsp,%rbp subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb .Lmul4xsp_alt subq %r11,%rbp leaq -320(%rbp,%r9,2),%rbp jmp .Lmul4xsp_done .align 32 .Lmul4xsp_alt: leaq 4096-320(,%r9,2),%r10 leaq -320(%rbp,%r9,2),%rbp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 subq %r11,%rbp .Lmul4xsp_done: andq $-64,%rbp movq %rsp,%r11 subq %rbp,%r11 andq $-4096,%r11 leaq (%r11,%rbp,1),%rsp movq (%rsp),%r10 cmpq %rbp,%rsp ja .Lmul4x_page_walk jmp .Lmul4x_page_walk_done .Lmul4x_page_walk: leaq -4096(%rsp),%rsp movq (%rsp),%r10 cmpq %rbp,%rsp ja .Lmul4x_page_walk .Lmul4x_page_walk_done: negq %r9 movq %rax,40(%rsp) .cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 .Lmul4x_body: call mul4x_internal movq 40(%rsp),%rsi .cfi_def_cfa %rsi,8 movq $1,%rax movq -48(%rsi),%r15 .cfi_restore %r15 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbp .cfi_restore %rbp movq -8(%rsi),%rbx .cfi_restore %rbx leaq (%rsi),%rsp .cfi_def_cfa_register %rsp .Lmul4x_epilogue: ret .cfi_endproc .size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 .type mul4x_internal,@function .align 32 mul4x_internal: .cfi_startproc shlq $5,%r9 movd 8(%rax),%xmm5 leaq .Linc(%rip),%rax leaq 128(%rdx,%r9,1),%r13 shrq $5,%r9 movdqa 0(%rax),%xmm0 movdqa 16(%rax),%xmm1 leaq 88-112(%rsp,%r9,1),%r10 leaq 128(%rdx),%r12 pshufd $0,%xmm5,%xmm5 movdqa %xmm1,%xmm4 .byte 0x67,0x67 movdqa %xmm1,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 .byte 0x67 movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,112(%r10) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,128(%r10) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 movdqa %xmm2,144(%r10) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,160(%r10) movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,176(%r10) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,192(%r10) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 movdqa %xmm2,208(%r10) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,224(%r10) movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,240(%r10) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,256(%r10) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 movdqa %xmm2,272(%r10) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,288(%r10) movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,304(%r10) paddd %xmm2,%xmm3 .byte 0x67 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,320(%r10) pcmpeqd %xmm5,%xmm3 movdqa %xmm2,336(%r10) pand 64(%r12),%xmm0 pand 80(%r12),%xmm1 pand 96(%r12),%xmm2 movdqa %xmm3,352(%r10) pand 112(%r12),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 movdqa -128(%r12),%xmm4 movdqa -112(%r12),%xmm5 movdqa -96(%r12),%xmm2 pand 112(%r10),%xmm4 movdqa -80(%r12),%xmm3 pand 128(%r10),%xmm5 por %xmm4,%xmm0 pand 144(%r10),%xmm2 por %xmm5,%xmm1 pand 
160(%r10),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 movdqa -64(%r12),%xmm4 movdqa -48(%r12),%xmm5 movdqa -32(%r12),%xmm2 pand 176(%r10),%xmm4 movdqa -16(%r12),%xmm3 pand 192(%r10),%xmm5 por %xmm4,%xmm0 pand 208(%r10),%xmm2 por %xmm5,%xmm1 pand 224(%r10),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 movdqa 0(%r12),%xmm4 movdqa 16(%r12),%xmm5 movdqa 32(%r12),%xmm2 pand 240(%r10),%xmm4 movdqa 48(%r12),%xmm3 pand 256(%r10),%xmm5 por %xmm4,%xmm0 pand 272(%r10),%xmm2 por %xmm5,%xmm1 pand 288(%r10),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 por %xmm1,%xmm0 pshufd $0x4e,%xmm0,%xmm1 por %xmm1,%xmm0 leaq 256(%r12),%r12 .byte 102,72,15,126,195 movq %r13,16+8(%rsp) movq %rdi,56+8(%rsp) movq (%r8),%r8 movq (%rsi),%rax leaq (%rsi,%r9,1),%rsi negq %r9 movq %r8,%rbp mulq %rbx movq %rax,%r10 movq (%rcx),%rax imulq %r10,%rbp leaq 64+8(%rsp),%r14 movq %rdx,%r11 mulq %rbp addq %rax,%r10 movq 8(%rsi,%r9,1),%rax adcq $0,%rdx movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq 8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq 16(%rsi,%r9,1),%rax adcq $0,%rdx addq %r11,%rdi leaq 32(%r9),%r15 leaq 32(%rcx),%rcx adcq $0,%rdx movq %rdi,(%r14) movq %rdx,%r13 jmp .L1st4x .align 32 .L1st4x: mulq %rbx addq %rax,%r10 movq -16(%rcx),%rax leaq 32(%r14),%r14 adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq -8(%rsi,%r15,1),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %r13,-24(%r14) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq -8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq (%rsi,%r15,1),%rax adcq $0,%rdx addq %r11,%rdi adcq $0,%rdx movq %rdi,-16(%r14) movq %rdx,%r13 mulq %rbx addq %rax,%r10 movq 0(%rcx),%rax adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq 8(%rsi,%r15,1),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %r13,-8(%r14) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq 8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq 16(%rsi,%r15,1),%rax adcq $0,%rdx addq %r11,%rdi leaq 32(%rcx),%rcx adcq $0,%rdx movq %rdi,(%r14) movq %rdx,%r13 addq $32,%r15 jnz .L1st4x mulq %rbx addq %rax,%r10 movq -16(%rcx),%rax leaq 32(%r14),%r14 adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq -8(%rsi),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %r13,-24(%r14) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq -8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq (%rsi,%r9,1),%rax adcq $0,%rdx addq %r11,%rdi adcq $0,%rdx movq %rdi,-16(%r14) movq %rdx,%r13 leaq (%rcx,%r9,1),%rcx xorq %rdi,%rdi addq %r10,%r13 adcq $0,%rdi movq %r13,-8(%r14) jmp .Louter4x .align 32 .Louter4x: leaq 16+128(%r14),%rdx pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 movdqa -128(%r12),%xmm0 movdqa -112(%r12),%xmm1 movdqa -96(%r12),%xmm2 movdqa -80(%r12),%xmm3 pand -128(%rdx),%xmm0 pand -112(%rdx),%xmm1 por %xmm0,%xmm4 pand -96(%rdx),%xmm2 por %xmm1,%xmm5 pand -80(%rdx),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqa -64(%r12),%xmm0 movdqa -48(%r12),%xmm1 movdqa -32(%r12),%xmm2 movdqa -16(%r12),%xmm3 pand -64(%rdx),%xmm0 pand -48(%rdx),%xmm1 por %xmm0,%xmm4 pand -32(%rdx),%xmm2 por %xmm1,%xmm5 pand -16(%rdx),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqa 0(%r12),%xmm0 movdqa 16(%r12),%xmm1 movdqa 32(%r12),%xmm2 movdqa 48(%r12),%xmm3 pand 0(%rdx),%xmm0 pand 16(%rdx),%xmm1 por %xmm0,%xmm4 pand 32(%rdx),%xmm2 por %xmm1,%xmm5 pand 48(%rdx),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqa 64(%r12),%xmm0 movdqa 80(%r12),%xmm1 movdqa 96(%r12),%xmm2 movdqa 112(%r12),%xmm3 pand 64(%rdx),%xmm0 pand 80(%rdx),%xmm1 por %xmm0,%xmm4 pand 96(%rdx),%xmm2 por %xmm1,%xmm5 pand 112(%rdx),%xmm3 por %xmm2,%xmm4 por 
%xmm3,%xmm5 por %xmm5,%xmm4 pshufd $0x4e,%xmm4,%xmm0 por %xmm4,%xmm0 leaq 256(%r12),%r12 .byte 102,72,15,126,195 movq (%r14,%r9,1),%r10 movq %r8,%rbp mulq %rbx addq %rax,%r10 movq (%rcx),%rax adcq $0,%rdx imulq %r10,%rbp movq %rdx,%r11 movq %rdi,(%r14) leaq (%r14,%r9,1),%r14 mulq %rbp addq %rax,%r10 movq 8(%rsi,%r9,1),%rax adcq $0,%rdx movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq 8(%rcx),%rax adcq $0,%rdx addq 8(%r14),%r11 adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq 16(%rsi,%r9,1),%rax adcq $0,%rdx addq %r11,%rdi leaq 32(%r9),%r15 leaq 32(%rcx),%rcx adcq $0,%rdx movq %rdx,%r13 jmp .Linner4x .align 32 .Linner4x: mulq %rbx addq %rax,%r10 movq -16(%rcx),%rax adcq $0,%rdx addq 16(%r14),%r10 leaq 32(%r14),%r14 adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq -8(%rsi,%r15,1),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %rdi,-32(%r14) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq -8(%rcx),%rax adcq $0,%rdx addq -8(%r14),%r11 adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq (%rsi,%r15,1),%rax adcq $0,%rdx addq %r11,%rdi adcq $0,%rdx movq %r13,-24(%r14) movq %rdx,%r13 mulq %rbx addq %rax,%r10 movq 0(%rcx),%rax adcq $0,%rdx addq (%r14),%r10 adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq 8(%rsi,%r15,1),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %rdi,-16(%r14) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq 8(%rcx),%rax adcq $0,%rdx addq 8(%r14),%r11 adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq 16(%rsi,%r15,1),%rax adcq $0,%rdx addq %r11,%rdi leaq 32(%rcx),%rcx adcq $0,%rdx movq %r13,-8(%r14) movq %rdx,%r13 addq $32,%r15 jnz .Linner4x mulq %rbx addq %rax,%r10 movq -16(%rcx),%rax adcq $0,%rdx addq 16(%r14),%r10 leaq 32(%r14),%r14 adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq -8(%rsi),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %rdi,-32(%r14) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq %rbp,%rax movq -8(%rcx),%rbp adcq $0,%rdx addq -8(%r14),%r11 adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq (%rsi,%r9,1),%rax adcq $0,%rdx addq %r11,%rdi adcq $0,%rdx movq %r13,-24(%r14) movq %rdx,%r13 movq %rdi,-16(%r14) leaq (%rcx,%r9,1),%rcx xorq %rdi,%rdi addq %r10,%r13 adcq $0,%rdi addq (%r14),%r13 adcq $0,%rdi movq %r13,-8(%r14) cmpq 16+8(%rsp),%r12 jb .Louter4x xorq %rax,%rax subq %r13,%rbp adcq %r15,%r15 orq %r15,%rdi subq %rdi,%rax leaq (%r14,%r9,1),%rbx movq (%rcx),%r12 leaq (%rcx),%rbp movq %r9,%rcx sarq $3+2,%rcx movq 56+8(%rsp),%rdi decq %r12 xorq %r10,%r10 movq 8(%rbp),%r13 movq 16(%rbp),%r14 movq 24(%rbp),%r15 jmp .Lsqr4x_sub_entry .cfi_endproc .size mul4x_internal,.-mul4x_internal .globl bn_power5_nohw .hidden bn_power5_nohw .type bn_power5_nohw,@function .align 32 bn_power5_nohw: .cfi_startproc _CET_ENDBR movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 .Lpower5_prologue: shll $3,%r9d leal (%r9,%r9,2),%r10d negq %r9 movq (%r8),%r8 leaq -320(%rsp,%r9,2),%r11 movq %rsp,%rbp subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb .Lpwr_sp_alt subq %r11,%rbp leaq -320(%rbp,%r9,2),%rbp jmp .Lpwr_sp_done .align 32 .Lpwr_sp_alt: leaq 4096-320(,%r9,2),%r10 leaq -320(%rbp,%r9,2),%rbp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 subq %r11,%rbp .Lpwr_sp_done: andq $-64,%rbp movq %rsp,%r11 subq %rbp,%r11 andq $-4096,%r11 leaq (%r11,%rbp,1),%rsp movq (%rsp),%r10 cmpq %rbp,%rsp ja .Lpwr_page_walk jmp .Lpwr_page_walk_done .Lpwr_page_walk: leaq 
-4096(%rsp),%rsp movq (%rsp),%r10 cmpq %rbp,%rsp ja .Lpwr_page_walk .Lpwr_page_walk_done: movq %r9,%r10 negq %r9 movq %r8,32(%rsp) movq %rax,40(%rsp) .cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 .Lpower5_body: .byte 102,72,15,110,207 .byte 102,72,15,110,209 .byte 102,73,15,110,218 .byte 102,72,15,110,226 call __bn_sqr8x_internal call __bn_post4x_internal call __bn_sqr8x_internal call __bn_post4x_internal call __bn_sqr8x_internal call __bn_post4x_internal call __bn_sqr8x_internal call __bn_post4x_internal call __bn_sqr8x_internal call __bn_post4x_internal .byte 102,72,15,126,209 .byte 102,72,15,126,226 movq %rsi,%rdi movq 40(%rsp),%rax leaq 32(%rsp),%r8 call mul4x_internal movq 40(%rsp),%rsi .cfi_def_cfa %rsi,8 movq $1,%rax movq -48(%rsi),%r15 .cfi_restore %r15 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbp .cfi_restore %rbp movq -8(%rsi),%rbx .cfi_restore %rbx leaq (%rsi),%rsp .cfi_def_cfa_register %rsp .Lpower5_epilogue: ret .cfi_endproc .size bn_power5_nohw,.-bn_power5_nohw .globl bn_sqr8x_internal .hidden bn_sqr8x_internal .hidden bn_sqr8x_internal .type bn_sqr8x_internal,@function .align 32 bn_sqr8x_internal: __bn_sqr8x_internal: .cfi_startproc _CET_ENDBR leaq 32(%r10),%rbp leaq (%rsi,%r9,1),%rsi movq %r9,%rcx movq -32(%rsi,%rbp,1),%r14 leaq 48+8(%rsp,%r9,2),%rdi movq -24(%rsi,%rbp,1),%rax leaq -32(%rdi,%rbp,1),%rdi movq -16(%rsi,%rbp,1),%rbx movq %rax,%r15 mulq %r14 movq %rax,%r10 movq %rbx,%rax movq %rdx,%r11 movq %r10,-24(%rdi,%rbp,1) mulq %r14 addq %rax,%r11 movq %rbx,%rax adcq $0,%rdx movq %r11,-16(%rdi,%rbp,1) movq %rdx,%r10 movq -8(%rsi,%rbp,1),%rbx mulq %r15 movq %rax,%r12 movq %rbx,%rax movq %rdx,%r13 leaq (%rbp),%rcx mulq %r14 addq %rax,%r10 movq %rbx,%rax movq %rdx,%r11 adcq $0,%r11 addq %r12,%r10 adcq $0,%r11 movq %r10,-8(%rdi,%rcx,1) jmp .Lsqr4x_1st .align 32 .Lsqr4x_1st: movq (%rsi,%rcx,1),%rbx mulq %r15 addq %rax,%r13 movq %rbx,%rax movq %rdx,%r12 adcq $0,%r12 mulq %r14 addq %rax,%r11 movq %rbx,%rax movq 8(%rsi,%rcx,1),%rbx movq %rdx,%r10 adcq $0,%r10 addq %r13,%r11 adcq $0,%r10 mulq %r15 addq %rax,%r12 movq %rbx,%rax movq %r11,(%rdi,%rcx,1) movq %rdx,%r13 adcq $0,%r13 mulq %r14 addq %rax,%r10 movq %rbx,%rax movq 16(%rsi,%rcx,1),%rbx movq %rdx,%r11 adcq $0,%r11 addq %r12,%r10 adcq $0,%r11 mulq %r15 addq %rax,%r13 movq %rbx,%rax movq %r10,8(%rdi,%rcx,1) movq %rdx,%r12 adcq $0,%r12 mulq %r14 addq %rax,%r11 movq %rbx,%rax movq 24(%rsi,%rcx,1),%rbx movq %rdx,%r10 adcq $0,%r10 addq %r13,%r11 adcq $0,%r10 mulq %r15 addq %rax,%r12 movq %rbx,%rax movq %r11,16(%rdi,%rcx,1) movq %rdx,%r13 adcq $0,%r13 leaq 32(%rcx),%rcx mulq %r14 addq %rax,%r10 movq %rbx,%rax movq %rdx,%r11 adcq $0,%r11 addq %r12,%r10 adcq $0,%r11 movq %r10,-8(%rdi,%rcx,1) cmpq $0,%rcx jne .Lsqr4x_1st mulq %r15 addq %rax,%r13 leaq 16(%rbp),%rbp adcq $0,%rdx addq %r11,%r13 adcq $0,%rdx movq %r13,(%rdi) movq %rdx,%r12 movq %rdx,8(%rdi) jmp .Lsqr4x_outer .align 32 .Lsqr4x_outer: movq -32(%rsi,%rbp,1),%r14 leaq 48+8(%rsp,%r9,2),%rdi movq -24(%rsi,%rbp,1),%rax leaq -32(%rdi,%rbp,1),%rdi movq -16(%rsi,%rbp,1),%rbx movq %rax,%r15 mulq %r14 movq -24(%rdi,%rbp,1),%r10 addq %rax,%r10 movq %rbx,%rax adcq $0,%rdx movq %r10,-24(%rdi,%rbp,1) movq %rdx,%r11 mulq %r14 addq %rax,%r11 movq %rbx,%rax adcq $0,%rdx addq -16(%rdi,%rbp,1),%r11 movq %rdx,%r10 adcq $0,%r10 movq %r11,-16(%rdi,%rbp,1) xorq %r12,%r12 movq -8(%rsi,%rbp,1),%rbx mulq %r15 addq %rax,%r12 movq %rbx,%rax adcq $0,%rdx addq -8(%rdi,%rbp,1),%r12 movq %rdx,%r13 adcq $0,%r13 mulq 
%r14 addq %rax,%r10 movq %rbx,%rax adcq $0,%rdx addq %r12,%r10 movq %rdx,%r11 adcq $0,%r11 movq %r10,-8(%rdi,%rbp,1) leaq (%rbp),%rcx jmp .Lsqr4x_inner .align 32 .Lsqr4x_inner: movq (%rsi,%rcx,1),%rbx mulq %r15 addq %rax,%r13 movq %rbx,%rax movq %rdx,%r12 adcq $0,%r12 addq (%rdi,%rcx,1),%r13 adcq $0,%r12 .byte 0x67 mulq %r14 addq %rax,%r11 movq %rbx,%rax movq 8(%rsi,%rcx,1),%rbx movq %rdx,%r10 adcq $0,%r10 addq %r13,%r11 adcq $0,%r10 mulq %r15 addq %rax,%r12 movq %r11,(%rdi,%rcx,1) movq %rbx,%rax movq %rdx,%r13 adcq $0,%r13 addq 8(%rdi,%rcx,1),%r12 leaq 16(%rcx),%rcx adcq $0,%r13 mulq %r14 addq %rax,%r10 movq %rbx,%rax adcq $0,%rdx addq %r12,%r10 movq %rdx,%r11 adcq $0,%r11 movq %r10,-8(%rdi,%rcx,1) cmpq $0,%rcx jne .Lsqr4x_inner .byte 0x67 mulq %r15 addq %rax,%r13 adcq $0,%rdx addq %r11,%r13 adcq $0,%rdx movq %r13,(%rdi) movq %rdx,%r12 movq %rdx,8(%rdi) addq $16,%rbp jnz .Lsqr4x_outer movq -32(%rsi),%r14 leaq 48+8(%rsp,%r9,2),%rdi movq -24(%rsi),%rax leaq -32(%rdi,%rbp,1),%rdi movq -16(%rsi),%rbx movq %rax,%r15 mulq %r14 addq %rax,%r10 movq %rbx,%rax movq %rdx,%r11 adcq $0,%r11 mulq %r14 addq %rax,%r11 movq %rbx,%rax movq %r10,-24(%rdi) movq %rdx,%r10 adcq $0,%r10 addq %r13,%r11 movq -8(%rsi),%rbx adcq $0,%r10 mulq %r15 addq %rax,%r12 movq %rbx,%rax movq %r11,-16(%rdi) movq %rdx,%r13 adcq $0,%r13 mulq %r14 addq %rax,%r10 movq %rbx,%rax movq %rdx,%r11 adcq $0,%r11 addq %r12,%r10 adcq $0,%r11 movq %r10,-8(%rdi) mulq %r15 addq %rax,%r13 movq -16(%rsi),%rax adcq $0,%rdx addq %r11,%r13 adcq $0,%rdx movq %r13,(%rdi) movq %rdx,%r12 movq %rdx,8(%rdi) mulq %rbx addq $16,%rbp xorq %r14,%r14 subq %r9,%rbp xorq %r15,%r15 addq %r12,%rax adcq $0,%rdx movq %rax,8(%rdi) movq %rdx,16(%rdi) movq %r15,24(%rdi) movq -16(%rsi,%rbp,1),%rax leaq 48+8(%rsp),%rdi xorq %r10,%r10 movq 8(%rdi),%r11 leaq (%r14,%r10,2),%r12 shrq $63,%r10 leaq (%rcx,%r11,2),%r13 shrq $63,%r11 orq %r10,%r13 movq 16(%rdi),%r10 movq %r11,%r14 mulq %rax negq %r15 movq 24(%rdi),%r11 adcq %rax,%r12 movq -8(%rsi,%rbp,1),%rax movq %r12,(%rdi) adcq %rdx,%r13 leaq (%r14,%r10,2),%rbx movq %r13,8(%rdi) sbbq %r15,%r15 shrq $63,%r10 leaq (%rcx,%r11,2),%r8 shrq $63,%r11 orq %r10,%r8 movq 32(%rdi),%r10 movq %r11,%r14 mulq %rax negq %r15 movq 40(%rdi),%r11 adcq %rax,%rbx movq 0(%rsi,%rbp,1),%rax movq %rbx,16(%rdi) adcq %rdx,%r8 leaq 16(%rbp),%rbp movq %r8,24(%rdi) sbbq %r15,%r15 leaq 64(%rdi),%rdi jmp .Lsqr4x_shift_n_add .align 32 .Lsqr4x_shift_n_add: leaq (%r14,%r10,2),%r12 shrq $63,%r10 leaq (%rcx,%r11,2),%r13 shrq $63,%r11 orq %r10,%r13 movq -16(%rdi),%r10 movq %r11,%r14 mulq %rax negq %r15 movq -8(%rdi),%r11 adcq %rax,%r12 movq -8(%rsi,%rbp,1),%rax movq %r12,-32(%rdi) adcq %rdx,%r13 leaq (%r14,%r10,2),%rbx movq %r13,-24(%rdi) sbbq %r15,%r15 shrq $63,%r10 leaq (%rcx,%r11,2),%r8 shrq $63,%r11 orq %r10,%r8 movq 0(%rdi),%r10 movq %r11,%r14 mulq %rax negq %r15 movq 8(%rdi),%r11 adcq %rax,%rbx movq 0(%rsi,%rbp,1),%rax movq %rbx,-16(%rdi) adcq %rdx,%r8 leaq (%r14,%r10,2),%r12 movq %r8,-8(%rdi) sbbq %r15,%r15 shrq $63,%r10 leaq (%rcx,%r11,2),%r13 shrq $63,%r11 orq %r10,%r13 movq 16(%rdi),%r10 movq %r11,%r14 mulq %rax negq %r15 movq 24(%rdi),%r11 adcq %rax,%r12 movq 8(%rsi,%rbp,1),%rax movq %r12,0(%rdi) adcq %rdx,%r13 leaq (%r14,%r10,2),%rbx movq %r13,8(%rdi) sbbq %r15,%r15 shrq $63,%r10 leaq (%rcx,%r11,2),%r8 shrq $63,%r11 orq %r10,%r8 movq 32(%rdi),%r10 movq %r11,%r14 mulq %rax negq %r15 movq 40(%rdi),%r11 adcq %rax,%rbx movq 16(%rsi,%rbp,1),%rax movq %rbx,16(%rdi) adcq %rdx,%r8 movq %r8,24(%rdi) sbbq %r15,%r15 leaq 64(%rdi),%rdi addq $32,%rbp jnz 
.Lsqr4x_shift_n_add leaq (%r14,%r10,2),%r12 .byte 0x67 shrq $63,%r10 leaq (%rcx,%r11,2),%r13 shrq $63,%r11 orq %r10,%r13 movq -16(%rdi),%r10 movq %r11,%r14 mulq %rax negq %r15 movq -8(%rdi),%r11 adcq %rax,%r12 movq -8(%rsi),%rax movq %r12,-32(%rdi) adcq %rdx,%r13 leaq (%r14,%r10,2),%rbx movq %r13,-24(%rdi) sbbq %r15,%r15 shrq $63,%r10 leaq (%rcx,%r11,2),%r8 shrq $63,%r11 orq %r10,%r8 mulq %rax negq %r15 adcq %rax,%rbx adcq %rdx,%r8 movq %rbx,-16(%rdi) movq %r8,-8(%rdi) .byte 102,72,15,126,213 __bn_sqr8x_reduction: xorq %rax,%rax leaq (%r9,%rbp,1),%rcx leaq 48+8(%rsp,%r9,2),%rdx movq %rcx,0+8(%rsp) leaq 48+8(%rsp,%r9,1),%rdi movq %rdx,8+8(%rsp) negq %r9 jmp .L8x_reduction_loop .align 32 .L8x_reduction_loop: leaq (%rdi,%r9,1),%rdi .byte 0x66 movq 0(%rdi),%rbx movq 8(%rdi),%r9 movq 16(%rdi),%r10 movq 24(%rdi),%r11 movq 32(%rdi),%r12 movq 40(%rdi),%r13 movq 48(%rdi),%r14 movq 56(%rdi),%r15 movq %rax,(%rdx) leaq 64(%rdi),%rdi .byte 0x67 movq %rbx,%r8 imulq 32+8(%rsp),%rbx movq 0(%rbp),%rax movl $8,%ecx jmp .L8x_reduce .align 32 .L8x_reduce: mulq %rbx movq 8(%rbp),%rax negq %r8 movq %rdx,%r8 adcq $0,%r8 mulq %rbx addq %rax,%r9 movq 16(%rbp),%rax adcq $0,%rdx addq %r9,%r8 movq %rbx,48-8+8(%rsp,%rcx,8) movq %rdx,%r9 adcq $0,%r9 mulq %rbx addq %rax,%r10 movq 24(%rbp),%rax adcq $0,%rdx addq %r10,%r9 movq 32+8(%rsp),%rsi movq %rdx,%r10 adcq $0,%r10 mulq %rbx addq %rax,%r11 movq 32(%rbp),%rax adcq $0,%rdx imulq %r8,%rsi addq %r11,%r10 movq %rdx,%r11 adcq $0,%r11 mulq %rbx addq %rax,%r12 movq 40(%rbp),%rax adcq $0,%rdx addq %r12,%r11 movq %rdx,%r12 adcq $0,%r12 mulq %rbx addq %rax,%r13 movq 48(%rbp),%rax adcq $0,%rdx addq %r13,%r12 movq %rdx,%r13 adcq $0,%r13 mulq %rbx addq %rax,%r14 movq 56(%rbp),%rax adcq $0,%rdx addq %r14,%r13 movq %rdx,%r14 adcq $0,%r14 mulq %rbx movq %rsi,%rbx addq %rax,%r15 movq 0(%rbp),%rax adcq $0,%rdx addq %r15,%r14 movq %rdx,%r15 adcq $0,%r15 decl %ecx jnz .L8x_reduce leaq 64(%rbp),%rbp xorq %rax,%rax movq 8+8(%rsp),%rdx cmpq 0+8(%rsp),%rbp jae .L8x_no_tail .byte 0x66 addq 0(%rdi),%r8 adcq 8(%rdi),%r9 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq 40(%rdi),%r13 adcq 48(%rdi),%r14 adcq 56(%rdi),%r15 sbbq %rsi,%rsi movq 48+56+8(%rsp),%rbx movl $8,%ecx movq 0(%rbp),%rax jmp .L8x_tail .align 32 .L8x_tail: mulq %rbx addq %rax,%r8 movq 8(%rbp),%rax movq %r8,(%rdi) movq %rdx,%r8 adcq $0,%r8 mulq %rbx addq %rax,%r9 movq 16(%rbp),%rax adcq $0,%rdx addq %r9,%r8 leaq 8(%rdi),%rdi movq %rdx,%r9 adcq $0,%r9 mulq %rbx addq %rax,%r10 movq 24(%rbp),%rax adcq $0,%rdx addq %r10,%r9 movq %rdx,%r10 adcq $0,%r10 mulq %rbx addq %rax,%r11 movq 32(%rbp),%rax adcq $0,%rdx addq %r11,%r10 movq %rdx,%r11 adcq $0,%r11 mulq %rbx addq %rax,%r12 movq 40(%rbp),%rax adcq $0,%rdx addq %r12,%r11 movq %rdx,%r12 adcq $0,%r12 mulq %rbx addq %rax,%r13 movq 48(%rbp),%rax adcq $0,%rdx addq %r13,%r12 movq %rdx,%r13 adcq $0,%r13 mulq %rbx addq %rax,%r14 movq 56(%rbp),%rax adcq $0,%rdx addq %r14,%r13 movq %rdx,%r14 adcq $0,%r14 mulq %rbx movq 48-16+8(%rsp,%rcx,8),%rbx addq %rax,%r15 adcq $0,%rdx addq %r15,%r14 movq 0(%rbp),%rax movq %rdx,%r15 adcq $0,%r15 decl %ecx jnz .L8x_tail leaq 64(%rbp),%rbp movq 8+8(%rsp),%rdx cmpq 0+8(%rsp),%rbp jae .L8x_tail_done movq 48+56+8(%rsp),%rbx negq %rsi movq 0(%rbp),%rax adcq 0(%rdi),%r8 adcq 8(%rdi),%r9 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq 40(%rdi),%r13 adcq 48(%rdi),%r14 adcq 56(%rdi),%r15 sbbq %rsi,%rsi movl $8,%ecx jmp .L8x_tail .align 32 .L8x_tail_done: xorq %rax,%rax addq (%rdx),%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq 
$0,%r13 adcq $0,%r14 adcq $0,%r15 adcq $0,%rax negq %rsi .L8x_no_tail: adcq 0(%rdi),%r8 adcq 8(%rdi),%r9 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq 40(%rdi),%r13 adcq 48(%rdi),%r14 adcq 56(%rdi),%r15 adcq $0,%rax movq -8(%rbp),%rcx xorq %rsi,%rsi .byte 102,72,15,126,213 movq %r8,0(%rdi) movq %r9,8(%rdi) .byte 102,73,15,126,217 movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq %r14,48(%rdi) movq %r15,56(%rdi) leaq 64(%rdi),%rdi cmpq %rdx,%rdi jb .L8x_reduction_loop ret .cfi_endproc .size bn_sqr8x_internal,.-bn_sqr8x_internal .type __bn_post4x_internal,@function .align 32 __bn_post4x_internal: .cfi_startproc movq 0(%rbp),%r12 leaq (%rdi,%r9,1),%rbx movq %r9,%rcx .byte 102,72,15,126,207 negq %rax .byte 102,72,15,126,206 sarq $3+2,%rcx decq %r12 xorq %r10,%r10 movq 8(%rbp),%r13 movq 16(%rbp),%r14 movq 24(%rbp),%r15 jmp .Lsqr4x_sub_entry .align 16 .Lsqr4x_sub: movq 0(%rbp),%r12 movq 8(%rbp),%r13 movq 16(%rbp),%r14 movq 24(%rbp),%r15 .Lsqr4x_sub_entry: leaq 32(%rbp),%rbp notq %r12 notq %r13 notq %r14 notq %r15 andq %rax,%r12 andq %rax,%r13 andq %rax,%r14 andq %rax,%r15 negq %r10 adcq 0(%rbx),%r12 adcq 8(%rbx),%r13 adcq 16(%rbx),%r14 adcq 24(%rbx),%r15 movq %r12,0(%rdi) leaq 32(%rbx),%rbx movq %r13,8(%rdi) sbbq %r10,%r10 movq %r14,16(%rdi) movq %r15,24(%rdi) leaq 32(%rdi),%rdi incq %rcx jnz .Lsqr4x_sub movq %r9,%r10 negq %r9 ret .cfi_endproc .size __bn_post4x_internal,.-__bn_post4x_internal .globl bn_mulx4x_mont_gather5 .hidden bn_mulx4x_mont_gather5 .type bn_mulx4x_mont_gather5,@function .align 32 bn_mulx4x_mont_gather5: .cfi_startproc _CET_ENDBR movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 .Lmulx4x_prologue: shll $3,%r9d leaq (%r9,%r9,2),%r10 negq %r9 movq (%r8),%r8 leaq -320(%rsp,%r9,2),%r11 movq %rsp,%rbp subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb .Lmulx4xsp_alt subq %r11,%rbp leaq -320(%rbp,%r9,2),%rbp jmp .Lmulx4xsp_done .Lmulx4xsp_alt: leaq 4096-320(,%r9,2),%r10 leaq -320(%rbp,%r9,2),%rbp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 subq %r11,%rbp .Lmulx4xsp_done: andq $-64,%rbp movq %rsp,%r11 subq %rbp,%r11 andq $-4096,%r11 leaq (%r11,%rbp,1),%rsp movq (%rsp),%r10 cmpq %rbp,%rsp ja .Lmulx4x_page_walk jmp .Lmulx4x_page_walk_done .Lmulx4x_page_walk: leaq -4096(%rsp),%rsp movq (%rsp),%r10 cmpq %rbp,%rsp ja .Lmulx4x_page_walk .Lmulx4x_page_walk_done: movq %r8,32(%rsp) movq %rax,40(%rsp) .cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 .Lmulx4x_body: call mulx4x_internal movq 40(%rsp),%rsi .cfi_def_cfa %rsi,8 movq $1,%rax movq -48(%rsi),%r15 .cfi_restore %r15 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbp .cfi_restore %rbp movq -8(%rsi),%rbx .cfi_restore %rbx leaq (%rsi),%rsp .cfi_def_cfa_register %rsp .Lmulx4x_epilogue: ret .cfi_endproc .size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 .type mulx4x_internal,@function .align 32 mulx4x_internal: .cfi_startproc movq %r9,8(%rsp) movq %r9,%r10 negq %r9 shlq $5,%r9 negq %r10 leaq 128(%rdx,%r9,1),%r13 shrq $5+5,%r9 movd 8(%rax),%xmm5 subq $1,%r9 leaq .Linc(%rip),%rax movq %r13,16+8(%rsp) movq %r9,24+8(%rsp) movq %rdi,56+8(%rsp) movdqa 0(%rax),%xmm0 movdqa 16(%rax),%xmm1 leaq 88-112(%rsp,%r10,1),%r10 leaq 128(%rdx),%rdi pshufd $0,%xmm5,%xmm5 movdqa %xmm1,%xmm4 .byte 0x67 movdqa %xmm1,%xmm2 .byte 0x67 paddd %xmm0,%xmm1 
pcmpeqd %xmm5,%xmm0 movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,112(%r10) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,128(%r10) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 movdqa %xmm2,144(%r10) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,160(%r10) movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,176(%r10) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,192(%r10) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 movdqa %xmm2,208(%r10) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,224(%r10) movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,240(%r10) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,256(%r10) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 movdqa %xmm2,272(%r10) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,288(%r10) movdqa %xmm4,%xmm3 .byte 0x67 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,304(%r10) paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,320(%r10) pcmpeqd %xmm5,%xmm3 movdqa %xmm2,336(%r10) pand 64(%rdi),%xmm0 pand 80(%rdi),%xmm1 pand 96(%rdi),%xmm2 movdqa %xmm3,352(%r10) pand 112(%rdi),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 movdqa -128(%rdi),%xmm4 movdqa -112(%rdi),%xmm5 movdqa -96(%rdi),%xmm2 pand 112(%r10),%xmm4 movdqa -80(%rdi),%xmm3 pand 128(%r10),%xmm5 por %xmm4,%xmm0 pand 144(%r10),%xmm2 por %xmm5,%xmm1 pand 160(%r10),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 movdqa -64(%rdi),%xmm4 movdqa -48(%rdi),%xmm5 movdqa -32(%rdi),%xmm2 pand 176(%r10),%xmm4 movdqa -16(%rdi),%xmm3 pand 192(%r10),%xmm5 por %xmm4,%xmm0 pand 208(%r10),%xmm2 por %xmm5,%xmm1 pand 224(%r10),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 movdqa 0(%rdi),%xmm4 movdqa 16(%rdi),%xmm5 movdqa 32(%rdi),%xmm2 pand 240(%r10),%xmm4 movdqa 48(%rdi),%xmm3 pand 256(%r10),%xmm5 por %xmm4,%xmm0 pand 272(%r10),%xmm2 por %xmm5,%xmm1 pand 288(%r10),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 pxor %xmm1,%xmm0 pshufd $0x4e,%xmm0,%xmm1 por %xmm1,%xmm0 leaq 256(%rdi),%rdi .byte 102,72,15,126,194 leaq 64+32+8(%rsp),%rbx movq %rdx,%r9 mulxq 0(%rsi),%r8,%rax mulxq 8(%rsi),%r11,%r12 addq %rax,%r11 mulxq 16(%rsi),%rax,%r13 adcq %rax,%r12 adcq $0,%r13 mulxq 24(%rsi),%rax,%r14 movq %r8,%r15 imulq 32+8(%rsp),%r8 xorq %rbp,%rbp movq %r8,%rdx movq %rdi,8+8(%rsp) leaq 32(%rsi),%rsi adcxq %rax,%r13 adcxq %rbp,%r14 mulxq 0(%rcx),%rax,%r10 adcxq %rax,%r15 adoxq %r11,%r10 mulxq 8(%rcx),%rax,%r11 adcxq %rax,%r10 adoxq %r12,%r11 mulxq 16(%rcx),%rax,%r12 movq 24+8(%rsp),%rdi movq %r10,-32(%rbx) adcxq %rax,%r11 adoxq %r13,%r12 mulxq 24(%rcx),%rax,%r15 movq %r9,%rdx movq %r11,-24(%rbx) adcxq %rax,%r12 adoxq %rbp,%r15 leaq 32(%rcx),%rcx movq %r12,-16(%rbx) jmp .Lmulx4x_1st .align 32 .Lmulx4x_1st: adcxq %rbp,%r15 mulxq 0(%rsi),%r10,%rax adcxq %r14,%r10 mulxq 8(%rsi),%r11,%r14 adcxq %rax,%r11 mulxq 16(%rsi),%r12,%rax adcxq %r14,%r12 mulxq 24(%rsi),%r13,%r14 .byte 0x67,0x67 movq %r8,%rdx adcxq %rax,%r13 adcxq %rbp,%r14 leaq 32(%rsi),%rsi leaq 32(%rbx),%rbx adoxq %r15,%r10 mulxq 0(%rcx),%rax,%r15 adcxq %rax,%r10 adoxq %r15,%r11 mulxq 8(%rcx),%rax,%r15 adcxq %rax,%r11 adoxq %r15,%r12 mulxq 16(%rcx),%rax,%r15 movq %r10,-40(%rbx) adcxq %rax,%r12 movq %r11,-32(%rbx) adoxq %r15,%r13 mulxq 24(%rcx),%rax,%r15 movq %r9,%rdx movq %r12,-24(%rbx) adcxq %rax,%r13 adoxq %rbp,%r15 leaq 32(%rcx),%rcx movq %r13,-16(%rbx) decq %rdi jnz .Lmulx4x_1st movq 8(%rsp),%rax adcq %rbp,%r15 leaq (%rsi,%rax,1),%rsi addq %r15,%r14 movq 
8+8(%rsp),%rdi adcq %rbp,%rbp movq %r14,-8(%rbx) jmp .Lmulx4x_outer .align 32 .Lmulx4x_outer: leaq 16-256(%rbx),%r10 pxor %xmm4,%xmm4 .byte 0x67,0x67 pxor %xmm5,%xmm5 movdqa -128(%rdi),%xmm0 movdqa -112(%rdi),%xmm1 movdqa -96(%rdi),%xmm2 pand 256(%r10),%xmm0 movdqa -80(%rdi),%xmm3 pand 272(%r10),%xmm1 por %xmm0,%xmm4 pand 288(%r10),%xmm2 por %xmm1,%xmm5 pand 304(%r10),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqa -64(%rdi),%xmm0 movdqa -48(%rdi),%xmm1 movdqa -32(%rdi),%xmm2 pand 320(%r10),%xmm0 movdqa -16(%rdi),%xmm3 pand 336(%r10),%xmm1 por %xmm0,%xmm4 pand 352(%r10),%xmm2 por %xmm1,%xmm5 pand 368(%r10),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqa 0(%rdi),%xmm0 movdqa 16(%rdi),%xmm1 movdqa 32(%rdi),%xmm2 pand 384(%r10),%xmm0 movdqa 48(%rdi),%xmm3 pand 400(%r10),%xmm1 por %xmm0,%xmm4 pand 416(%r10),%xmm2 por %xmm1,%xmm5 pand 432(%r10),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqa 64(%rdi),%xmm0 movdqa 80(%rdi),%xmm1 movdqa 96(%rdi),%xmm2 pand 448(%r10),%xmm0 movdqa 112(%rdi),%xmm3 pand 464(%r10),%xmm1 por %xmm0,%xmm4 pand 480(%r10),%xmm2 por %xmm1,%xmm5 pand 496(%r10),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 por %xmm5,%xmm4 pshufd $0x4e,%xmm4,%xmm0 por %xmm4,%xmm0 leaq 256(%rdi),%rdi .byte 102,72,15,126,194 movq %rbp,(%rbx) leaq 32(%rbx,%rax,1),%rbx mulxq 0(%rsi),%r8,%r11 xorq %rbp,%rbp movq %rdx,%r9 mulxq 8(%rsi),%r14,%r12 adoxq -32(%rbx),%r8 adcxq %r14,%r11 mulxq 16(%rsi),%r15,%r13 adoxq -24(%rbx),%r11 adcxq %r15,%r12 mulxq 24(%rsi),%rdx,%r14 adoxq -16(%rbx),%r12 adcxq %rdx,%r13 leaq (%rcx,%rax,1),%rcx leaq 32(%rsi),%rsi adoxq -8(%rbx),%r13 adcxq %rbp,%r14 adoxq %rbp,%r14 movq %r8,%r15 imulq 32+8(%rsp),%r8 movq %r8,%rdx xorq %rbp,%rbp movq %rdi,8+8(%rsp) mulxq 0(%rcx),%rax,%r10 adcxq %rax,%r15 adoxq %r11,%r10 mulxq 8(%rcx),%rax,%r11 adcxq %rax,%r10 adoxq %r12,%r11 mulxq 16(%rcx),%rax,%r12 adcxq %rax,%r11 adoxq %r13,%r12 mulxq 24(%rcx),%rax,%r15 movq %r9,%rdx movq 24+8(%rsp),%rdi movq %r10,-32(%rbx) adcxq %rax,%r12 movq %r11,-24(%rbx) adoxq %rbp,%r15 movq %r12,-16(%rbx) leaq 32(%rcx),%rcx jmp .Lmulx4x_inner .align 32 .Lmulx4x_inner: mulxq 0(%rsi),%r10,%rax adcxq %rbp,%r15 adoxq %r14,%r10 mulxq 8(%rsi),%r11,%r14 adcxq 0(%rbx),%r10 adoxq %rax,%r11 mulxq 16(%rsi),%r12,%rax adcxq 8(%rbx),%r11 adoxq %r14,%r12 mulxq 24(%rsi),%r13,%r14 movq %r8,%rdx adcxq 16(%rbx),%r12 adoxq %rax,%r13 adcxq 24(%rbx),%r13 adoxq %rbp,%r14 leaq 32(%rsi),%rsi leaq 32(%rbx),%rbx adcxq %rbp,%r14 adoxq %r15,%r10 mulxq 0(%rcx),%rax,%r15 adcxq %rax,%r10 adoxq %r15,%r11 mulxq 8(%rcx),%rax,%r15 adcxq %rax,%r11 adoxq %r15,%r12 mulxq 16(%rcx),%rax,%r15 movq %r10,-40(%rbx) adcxq %rax,%r12 adoxq %r15,%r13 movq %r11,-32(%rbx) mulxq 24(%rcx),%rax,%r15 movq %r9,%rdx leaq 32(%rcx),%rcx movq %r12,-24(%rbx) adcxq %rax,%r13 adoxq %rbp,%r15 movq %r13,-16(%rbx) decq %rdi jnz .Lmulx4x_inner movq 0+8(%rsp),%rax adcq %rbp,%r15 subq 0(%rbx),%rdi movq 8+8(%rsp),%rdi movq 16+8(%rsp),%r10 adcq %r15,%r14 leaq (%rsi,%rax,1),%rsi adcq %rbp,%rbp movq %r14,-8(%rbx) cmpq %r10,%rdi jb .Lmulx4x_outer movq -8(%rcx),%r10 movq %rbp,%r8 movq (%rcx,%rax,1),%r12 leaq (%rcx,%rax,1),%rbp movq %rax,%rcx leaq (%rbx,%rax,1),%rdi xorl %eax,%eax xorq %r15,%r15 subq %r14,%r10 adcq %r15,%r15 orq %r15,%r8 sarq $3+2,%rcx subq %r8,%rax movq 56+8(%rsp),%rdx decq %r12 movq 8(%rbp),%r13 xorq %r8,%r8 movq 16(%rbp),%r14 movq 24(%rbp),%r15 jmp .Lsqrx4x_sub_entry .cfi_endproc .size mulx4x_internal,.-mulx4x_internal .globl bn_powerx5 .hidden bn_powerx5 .type bn_powerx5,@function .align 32 bn_powerx5: .cfi_startproc _CET_ENDBR movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx 
.cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 .Lpowerx5_prologue: shll $3,%r9d leaq (%r9,%r9,2),%r10 negq %r9 movq (%r8),%r8 leaq -320(%rsp,%r9,2),%r11 movq %rsp,%rbp subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb .Lpwrx_sp_alt subq %r11,%rbp leaq -320(%rbp,%r9,2),%rbp jmp .Lpwrx_sp_done .align 32 .Lpwrx_sp_alt: leaq 4096-320(,%r9,2),%r10 leaq -320(%rbp,%r9,2),%rbp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 subq %r11,%rbp .Lpwrx_sp_done: andq $-64,%rbp movq %rsp,%r11 subq %rbp,%r11 andq $-4096,%r11 leaq (%r11,%rbp,1),%rsp movq (%rsp),%r10 cmpq %rbp,%rsp ja .Lpwrx_page_walk jmp .Lpwrx_page_walk_done .Lpwrx_page_walk: leaq -4096(%rsp),%rsp movq (%rsp),%r10 cmpq %rbp,%rsp ja .Lpwrx_page_walk .Lpwrx_page_walk_done: movq %r9,%r10 negq %r9 pxor %xmm0,%xmm0 .byte 102,72,15,110,207 .byte 102,72,15,110,209 .byte 102,73,15,110,218 .byte 102,72,15,110,226 movq %r8,32(%rsp) movq %rax,40(%rsp) .cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 .Lpowerx5_body: call __bn_sqrx8x_internal call __bn_postx4x_internal call __bn_sqrx8x_internal call __bn_postx4x_internal call __bn_sqrx8x_internal call __bn_postx4x_internal call __bn_sqrx8x_internal call __bn_postx4x_internal call __bn_sqrx8x_internal call __bn_postx4x_internal movq %r10,%r9 movq %rsi,%rdi .byte 102,72,15,126,209 .byte 102,72,15,126,226 movq 40(%rsp),%rax call mulx4x_internal movq 40(%rsp),%rsi .cfi_def_cfa %rsi,8 movq $1,%rax movq -48(%rsi),%r15 .cfi_restore %r15 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbp .cfi_restore %rbp movq -8(%rsi),%rbx .cfi_restore %rbx leaq (%rsi),%rsp .cfi_def_cfa_register %rsp .Lpowerx5_epilogue: ret .cfi_endproc .size bn_powerx5,.-bn_powerx5 .globl bn_sqrx8x_internal .hidden bn_sqrx8x_internal .hidden bn_sqrx8x_internal .type bn_sqrx8x_internal,@function .align 32 bn_sqrx8x_internal: __bn_sqrx8x_internal: .cfi_startproc _CET_ENDBR leaq 48+8(%rsp),%rdi leaq (%rsi,%r9,1),%rbp movq %r9,0+8(%rsp) movq %rbp,8+8(%rsp) jmp .Lsqr8x_zero_start .align 32 .byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 .Lsqrx8x_zero: .byte 0x3e movdqa %xmm0,0(%rdi) movdqa %xmm0,16(%rdi) movdqa %xmm0,32(%rdi) movdqa %xmm0,48(%rdi) .Lsqr8x_zero_start: movdqa %xmm0,64(%rdi) movdqa %xmm0,80(%rdi) movdqa %xmm0,96(%rdi) movdqa %xmm0,112(%rdi) leaq 128(%rdi),%rdi subq $64,%r9 jnz .Lsqrx8x_zero movq 0(%rsi),%rdx xorq %r10,%r10 xorq %r11,%r11 xorq %r12,%r12 xorq %r13,%r13 xorq %r14,%r14 xorq %r15,%r15 leaq 48+8(%rsp),%rdi xorq %rbp,%rbp jmp .Lsqrx8x_outer_loop .align 32 .Lsqrx8x_outer_loop: mulxq 8(%rsi),%r8,%rax adcxq %r9,%r8 adoxq %rax,%r10 mulxq 16(%rsi),%r9,%rax adcxq %r10,%r9 adoxq %rax,%r11 .byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 adcxq %r11,%r10 adoxq %rax,%r12 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 adcxq %r12,%r11 adoxq %rax,%r13 mulxq 40(%rsi),%r12,%rax adcxq %r13,%r12 adoxq %rax,%r14 mulxq 48(%rsi),%r13,%rax adcxq %r14,%r13 adoxq %r15,%rax mulxq 56(%rsi),%r14,%r15 movq 8(%rsi),%rdx adcxq %rax,%r14 adoxq %rbp,%r15 adcq 64(%rdi),%r15 movq %r8,8(%rdi) movq %r9,16(%rdi) sbbq %rcx,%rcx xorq %rbp,%rbp mulxq 16(%rsi),%r8,%rbx mulxq 24(%rsi),%r9,%rax adcxq %r10,%r8 adoxq %rbx,%r9 mulxq 32(%rsi),%r10,%rbx adcxq %r11,%r9 adoxq %rax,%r10 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 adcxq %r12,%r10 adoxq %rbx,%r11 .byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 adcxq %r13,%r11 adoxq 
%r14,%r12 .byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 movq 16(%rsi),%rdx adcxq %rax,%r12 adoxq %rbx,%r13 adcxq %r15,%r13 adoxq %rbp,%r14 adcxq %rbp,%r14 movq %r8,24(%rdi) movq %r9,32(%rdi) mulxq 24(%rsi),%r8,%rbx mulxq 32(%rsi),%r9,%rax adcxq %r10,%r8 adoxq %rbx,%r9 mulxq 40(%rsi),%r10,%rbx adcxq %r11,%r9 adoxq %rax,%r10 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 adcxq %r12,%r10 adoxq %r13,%r11 .byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 .byte 0x3e movq 24(%rsi),%rdx adcxq %rbx,%r11 adoxq %rax,%r12 adcxq %r14,%r12 movq %r8,40(%rdi) movq %r9,48(%rdi) mulxq 32(%rsi),%r8,%rax adoxq %rbp,%r13 adcxq %rbp,%r13 mulxq 40(%rsi),%r9,%rbx adcxq %r10,%r8 adoxq %rax,%r9 mulxq 48(%rsi),%r10,%rax adcxq %r11,%r9 adoxq %r12,%r10 mulxq 56(%rsi),%r11,%r12 movq 32(%rsi),%rdx movq 40(%rsi),%r14 adcxq %rbx,%r10 adoxq %rax,%r11 movq 48(%rsi),%r15 adcxq %r13,%r11 adoxq %rbp,%r12 adcxq %rbp,%r12 movq %r8,56(%rdi) movq %r9,64(%rdi) mulxq %r14,%r9,%rax movq 56(%rsi),%r8 adcxq %r10,%r9 mulxq %r15,%r10,%rbx adoxq %rax,%r10 adcxq %r11,%r10 mulxq %r8,%r11,%rax movq %r14,%rdx adoxq %rbx,%r11 adcxq %r12,%r11 adcxq %rbp,%rax mulxq %r15,%r14,%rbx mulxq %r8,%r12,%r13 movq %r15,%rdx leaq 64(%rsi),%rsi adcxq %r14,%r11 adoxq %rbx,%r12 adcxq %rax,%r12 adoxq %rbp,%r13 .byte 0x67,0x67 mulxq %r8,%r8,%r14 adcxq %r8,%r13 adcxq %rbp,%r14 cmpq 8+8(%rsp),%rsi je .Lsqrx8x_outer_break negq %rcx movq $-8,%rcx movq %rbp,%r15 movq 64(%rdi),%r8 adcxq 72(%rdi),%r9 adcxq 80(%rdi),%r10 adcxq 88(%rdi),%r11 adcq 96(%rdi),%r12 adcq 104(%rdi),%r13 adcq 112(%rdi),%r14 adcq 120(%rdi),%r15 leaq (%rsi),%rbp leaq 128(%rdi),%rdi sbbq %rax,%rax movq -64(%rsi),%rdx movq %rax,16+8(%rsp) movq %rdi,24+8(%rsp) xorl %eax,%eax jmp .Lsqrx8x_loop .align 32 .Lsqrx8x_loop: movq %r8,%rbx mulxq 0(%rbp),%rax,%r8 adcxq %rax,%rbx adoxq %r9,%r8 mulxq 8(%rbp),%rax,%r9 adcxq %rax,%r8 adoxq %r10,%r9 mulxq 16(%rbp),%rax,%r10 adcxq %rax,%r9 adoxq %r11,%r10 mulxq 24(%rbp),%rax,%r11 adcxq %rax,%r10 adoxq %r12,%r11 .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 adcxq %rax,%r11 adoxq %r13,%r12 mulxq 40(%rbp),%rax,%r13 adcxq %rax,%r12 adoxq %r14,%r13 mulxq 48(%rbp),%rax,%r14 movq %rbx,(%rdi,%rcx,8) movl $0,%ebx adcxq %rax,%r13 adoxq %r15,%r14 .byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 movq 8(%rsi,%rcx,8),%rdx adcxq %rax,%r14 adoxq %rbx,%r15 adcxq %rbx,%r15 .byte 0x67 incq %rcx jnz .Lsqrx8x_loop leaq 64(%rbp),%rbp movq $-8,%rcx cmpq 8+8(%rsp),%rbp je .Lsqrx8x_break subq 16+8(%rsp),%rbx .byte 0x66 movq -64(%rsi),%rdx adcxq 0(%rdi),%r8 adcxq 8(%rdi),%r9 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq 40(%rdi),%r13 adcq 48(%rdi),%r14 adcq 56(%rdi),%r15 leaq 64(%rdi),%rdi .byte 0x67 sbbq %rax,%rax xorl %ebx,%ebx movq %rax,16+8(%rsp) jmp .Lsqrx8x_loop .align 32 .Lsqrx8x_break: xorq %rbp,%rbp subq 16+8(%rsp),%rbx adcxq %rbp,%r8 movq 24+8(%rsp),%rcx adcxq %rbp,%r9 movq 0(%rsi),%rdx adcq $0,%r10 movq %r8,0(%rdi) adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 adcq $0,%r14 adcq $0,%r15 cmpq %rcx,%rdi je .Lsqrx8x_outer_loop movq %r9,8(%rdi) movq 8(%rcx),%r9 movq %r10,16(%rdi) movq 16(%rcx),%r10 movq %r11,24(%rdi) movq 24(%rcx),%r11 movq %r12,32(%rdi) movq 32(%rcx),%r12 movq %r13,40(%rdi) movq 40(%rcx),%r13 movq %r14,48(%rdi) movq 48(%rcx),%r14 movq %r15,56(%rdi) movq 56(%rcx),%r15 movq %rcx,%rdi jmp .Lsqrx8x_outer_loop .align 32 .Lsqrx8x_outer_break: movq %r9,72(%rdi) .byte 102,72,15,126,217 movq %r10,80(%rdi) movq %r11,88(%rdi) movq %r12,96(%rdi) movq %r13,104(%rdi) movq %r14,112(%rdi) leaq 48+8(%rsp),%rdi movq (%rsi,%rcx,1),%rdx movq 8(%rdi),%r11 xorq 
%r10,%r10 movq 0+8(%rsp),%r9 adoxq %r11,%r11 movq 16(%rdi),%r12 movq 24(%rdi),%r13 .align 32 .Lsqrx4x_shift_n_add: mulxq %rdx,%rax,%rbx adoxq %r12,%r12 adcxq %r10,%rax .byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 .byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 adoxq %r13,%r13 adcxq %r11,%rbx movq 40(%rdi),%r11 movq %rax,0(%rdi) movq %rbx,8(%rdi) mulxq %rdx,%rax,%rbx adoxq %r10,%r10 adcxq %r12,%rax movq 16(%rsi,%rcx,1),%rdx movq 48(%rdi),%r12 adoxq %r11,%r11 adcxq %r13,%rbx movq 56(%rdi),%r13 movq %rax,16(%rdi) movq %rbx,24(%rdi) mulxq %rdx,%rax,%rbx adoxq %r12,%r12 adcxq %r10,%rax movq 24(%rsi,%rcx,1),%rdx leaq 32(%rcx),%rcx movq 64(%rdi),%r10 adoxq %r13,%r13 adcxq %r11,%rbx movq 72(%rdi),%r11 movq %rax,32(%rdi) movq %rbx,40(%rdi) mulxq %rdx,%rax,%rbx adoxq %r10,%r10 adcxq %r12,%rax jrcxz .Lsqrx4x_shift_n_add_break .byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 adoxq %r11,%r11 adcxq %r13,%rbx movq 80(%rdi),%r12 movq 88(%rdi),%r13 movq %rax,48(%rdi) movq %rbx,56(%rdi) leaq 64(%rdi),%rdi nop jmp .Lsqrx4x_shift_n_add .align 32 .Lsqrx4x_shift_n_add_break: adcxq %r13,%rbx movq %rax,48(%rdi) movq %rbx,56(%rdi) leaq 64(%rdi),%rdi .byte 102,72,15,126,213 __bn_sqrx8x_reduction: xorl %eax,%eax movq 32+8(%rsp),%rbx movq 48+8(%rsp),%rdx leaq -64(%rbp,%r9,1),%rcx movq %rcx,0+8(%rsp) movq %rdi,8+8(%rsp) leaq 48+8(%rsp),%rdi jmp .Lsqrx8x_reduction_loop .align 32 .Lsqrx8x_reduction_loop: movq 8(%rdi),%r9 movq 16(%rdi),%r10 movq 24(%rdi),%r11 movq 32(%rdi),%r12 movq %rdx,%r8 imulq %rbx,%rdx movq 40(%rdi),%r13 movq 48(%rdi),%r14 movq 56(%rdi),%r15 movq %rax,24+8(%rsp) leaq 64(%rdi),%rdi xorq %rsi,%rsi movq $-8,%rcx jmp .Lsqrx8x_reduce .align 32 .Lsqrx8x_reduce: movq %r8,%rbx mulxq 0(%rbp),%rax,%r8 adcxq %rbx,%rax adoxq %r9,%r8 mulxq 8(%rbp),%rbx,%r9 adcxq %rbx,%r8 adoxq %r10,%r9 mulxq 16(%rbp),%rbx,%r10 adcxq %rbx,%r9 adoxq %r11,%r10 mulxq 24(%rbp),%rbx,%r11 adcxq %rbx,%r10 adoxq %r12,%r11 .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 movq %rdx,%rax movq %r8,%rdx adcxq %rbx,%r11 adoxq %r13,%r12 mulxq 32+8(%rsp),%rbx,%rdx movq %rax,%rdx movq %rax,64+48+8(%rsp,%rcx,8) mulxq 40(%rbp),%rax,%r13 adcxq %rax,%r12 adoxq %r14,%r13 mulxq 48(%rbp),%rax,%r14 adcxq %rax,%r13 adoxq %r15,%r14 mulxq 56(%rbp),%rax,%r15 movq %rbx,%rdx adcxq %rax,%r14 adoxq %rsi,%r15 adcxq %rsi,%r15 .byte 0x67,0x67,0x67 incq %rcx jnz .Lsqrx8x_reduce movq %rsi,%rax cmpq 0+8(%rsp),%rbp jae .Lsqrx8x_no_tail movq 48+8(%rsp),%rdx addq 0(%rdi),%r8 leaq 64(%rbp),%rbp movq $-8,%rcx adcxq 8(%rdi),%r9 adcxq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq 40(%rdi),%r13 adcq 48(%rdi),%r14 adcq 56(%rdi),%r15 leaq 64(%rdi),%rdi sbbq %rax,%rax xorq %rsi,%rsi movq %rax,16+8(%rsp) jmp .Lsqrx8x_tail .align 32 .Lsqrx8x_tail: movq %r8,%rbx mulxq 0(%rbp),%rax,%r8 adcxq %rax,%rbx adoxq %r9,%r8 mulxq 8(%rbp),%rax,%r9 adcxq %rax,%r8 adoxq %r10,%r9 mulxq 16(%rbp),%rax,%r10 adcxq %rax,%r9 adoxq %r11,%r10 mulxq 24(%rbp),%rax,%r11 adcxq %rax,%r10 adoxq %r12,%r11 .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 adcxq %rax,%r11 adoxq %r13,%r12 mulxq 40(%rbp),%rax,%r13 adcxq %rax,%r12 adoxq %r14,%r13 mulxq 48(%rbp),%rax,%r14 adcxq %rax,%r13 adoxq %r15,%r14 mulxq 56(%rbp),%rax,%r15 movq 72+48+8(%rsp,%rcx,8),%rdx adcxq %rax,%r14 adoxq %rsi,%r15 movq %rbx,(%rdi,%rcx,8) movq %r8,%rbx adcxq %rsi,%r15 incq %rcx jnz .Lsqrx8x_tail cmpq 0+8(%rsp),%rbp jae .Lsqrx8x_tail_done subq 16+8(%rsp),%rsi movq 48+8(%rsp),%rdx leaq 64(%rbp),%rbp adcq 0(%rdi),%r8 adcq 8(%rdi),%r9 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq 40(%rdi),%r13 adcq 48(%rdi),%r14 adcq 
56(%rdi),%r15 leaq 64(%rdi),%rdi sbbq %rax,%rax subq $8,%rcx xorq %rsi,%rsi movq %rax,16+8(%rsp) jmp .Lsqrx8x_tail .align 32 .Lsqrx8x_tail_done: xorq %rax,%rax addq 24+8(%rsp),%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 adcq $0,%r14 adcq $0,%r15 adcq $0,%rax subq 16+8(%rsp),%rsi .Lsqrx8x_no_tail: adcq 0(%rdi),%r8 .byte 102,72,15,126,217 adcq 8(%rdi),%r9 movq 56(%rbp),%rsi .byte 102,72,15,126,213 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq 40(%rdi),%r13 adcq 48(%rdi),%r14 adcq 56(%rdi),%r15 adcq $0,%rax movq 32+8(%rsp),%rbx movq 64(%rdi,%rcx,1),%rdx movq %r8,0(%rdi) leaq 64(%rdi),%r8 movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq %r14,48(%rdi) movq %r15,56(%rdi) leaq 64(%rdi,%rcx,1),%rdi cmpq 8+8(%rsp),%r8 jb .Lsqrx8x_reduction_loop ret .cfi_endproc .size bn_sqrx8x_internal,.-bn_sqrx8x_internal .align 32 .type __bn_postx4x_internal,@function __bn_postx4x_internal: .cfi_startproc movq 0(%rbp),%r12 movq %rcx,%r10 movq %rcx,%r9 negq %rax sarq $3+2,%rcx .byte 102,72,15,126,202 .byte 102,72,15,126,206 decq %r12 movq 8(%rbp),%r13 xorq %r8,%r8 movq 16(%rbp),%r14 movq 24(%rbp),%r15 jmp .Lsqrx4x_sub_entry .align 16 .Lsqrx4x_sub: movq 0(%rbp),%r12 movq 8(%rbp),%r13 movq 16(%rbp),%r14 movq 24(%rbp),%r15 .Lsqrx4x_sub_entry: andnq %rax,%r12,%r12 leaq 32(%rbp),%rbp andnq %rax,%r13,%r13 andnq %rax,%r14,%r14 andnq %rax,%r15,%r15 negq %r8 adcq 0(%rdi),%r12 adcq 8(%rdi),%r13 adcq 16(%rdi),%r14 adcq 24(%rdi),%r15 movq %r12,0(%rdx) leaq 32(%rdi),%rdi movq %r13,8(%rdx) sbbq %r8,%r8 movq %r14,16(%rdx) movq %r15,24(%rdx) leaq 32(%rdx),%rdx incq %rcx jnz .Lsqrx4x_sub negq %r9 ret .cfi_endproc .size __bn_postx4x_internal,.-__bn_postx4x_internal .globl bn_scatter5 .hidden bn_scatter5 .type bn_scatter5,@function .align 16 bn_scatter5: .cfi_startproc _CET_ENDBR cmpl $0,%esi jz .Lscatter_epilogue leaq (%rdx,%rcx,8),%rdx .Lscatter: movq (%rdi),%rax leaq 8(%rdi),%rdi movq %rax,(%rdx) leaq 256(%rdx),%rdx subl $1,%esi jnz .Lscatter .Lscatter_epilogue: ret .cfi_endproc .size bn_scatter5,.-bn_scatter5 .globl bn_gather5 .hidden bn_gather5 .type bn_gather5,@function .align 32 bn_gather5: .cfi_startproc .LSEH_begin_bn_gather5: _CET_ENDBR .byte 0x4c,0x8d,0x14,0x24 .cfi_def_cfa_register %r10 .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 leaq .Linc(%rip),%rax andq $-16,%rsp movd %ecx,%xmm5 movdqa 0(%rax),%xmm0 movdqa 16(%rax),%xmm1 leaq 128(%rdx),%r11 leaq 128(%rsp),%rax pshufd $0,%xmm5,%xmm5 movdqa %xmm1,%xmm4 movdqa %xmm1,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,-128(%rax) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,-112(%rax) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 movdqa %xmm2,-96(%rax) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,-80(%rax) movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,-64(%rax) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,-48(%rax) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 movdqa %xmm2,-32(%rax) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,-16(%rax) movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,0(%rax) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,16(%rax) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 movdqa %xmm2,32(%rax) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,48(%rax) movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 
pcmpeqd %xmm5,%xmm1 movdqa %xmm0,64(%rax) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,80(%rax) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 movdqa %xmm2,96(%rax) movdqa %xmm4,%xmm2 movdqa %xmm3,112(%rax) jmp .Lgather .align 32 .Lgather: pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 movdqa -128(%r11),%xmm0 movdqa -112(%r11),%xmm1 movdqa -96(%r11),%xmm2 pand -128(%rax),%xmm0 movdqa -80(%r11),%xmm3 pand -112(%rax),%xmm1 por %xmm0,%xmm4 pand -96(%rax),%xmm2 por %xmm1,%xmm5 pand -80(%rax),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqa -64(%r11),%xmm0 movdqa -48(%r11),%xmm1 movdqa -32(%r11),%xmm2 pand -64(%rax),%xmm0 movdqa -16(%r11),%xmm3 pand -48(%rax),%xmm1 por %xmm0,%xmm4 pand -32(%rax),%xmm2 por %xmm1,%xmm5 pand -16(%rax),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqa 0(%r11),%xmm0 movdqa 16(%r11),%xmm1 movdqa 32(%r11),%xmm2 pand 0(%rax),%xmm0 movdqa 48(%r11),%xmm3 pand 16(%rax),%xmm1 por %xmm0,%xmm4 pand 32(%rax),%xmm2 por %xmm1,%xmm5 pand 48(%rax),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqa 64(%r11),%xmm0 movdqa 80(%r11),%xmm1 movdqa 96(%r11),%xmm2 pand 64(%rax),%xmm0 movdqa 112(%r11),%xmm3 pand 80(%rax),%xmm1 por %xmm0,%xmm4 pand 96(%rax),%xmm2 por %xmm1,%xmm5 pand 112(%rax),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 por %xmm5,%xmm4 leaq 256(%r11),%r11 pshufd $0x4e,%xmm4,%xmm0 por %xmm4,%xmm0 movq %xmm0,(%rdi) leaq 8(%rdi),%rdi subl $1,%esi jnz .Lgather leaq (%r10),%rsp .cfi_def_cfa_register %rsp ret .LSEH_end_bn_gather5: .cfi_endproc .size bn_gather5,.-bn_gather5 .section .rodata .align 64 .Linc: .long 0,0, 1,1 .long 2,2, 2,2 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .text #endif ring-0.17.14/pregenerated/x86_64-mont5-macosx.S000064400000000000000000001367541046102023000170340ustar 00000000000000// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
#include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) .text .globl _bn_mul4x_mont_gather5 .private_extern _bn_mul4x_mont_gather5 .p2align 5 _bn_mul4x_mont_gather5: _CET_ENDBR .byte 0x67 movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 L$mul4x_prologue: .byte 0x67 shll $3,%r9d leaq (%r9,%r9,2),%r10 negq %r9 leaq -320(%rsp,%r9,2),%r11 movq %rsp,%rbp subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb L$mul4xsp_alt subq %r11,%rbp leaq -320(%rbp,%r9,2),%rbp jmp L$mul4xsp_done .p2align 5 L$mul4xsp_alt: leaq 4096-320(,%r9,2),%r10 leaq -320(%rbp,%r9,2),%rbp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 subq %r11,%rbp L$mul4xsp_done: andq $-64,%rbp movq %rsp,%r11 subq %rbp,%r11 andq $-4096,%r11 leaq (%r11,%rbp,1),%rsp movq (%rsp),%r10 cmpq %rbp,%rsp ja L$mul4x_page_walk jmp L$mul4x_page_walk_done L$mul4x_page_walk: leaq -4096(%rsp),%rsp movq (%rsp),%r10 cmpq %rbp,%rsp ja L$mul4x_page_walk L$mul4x_page_walk_done: negq %r9 movq %rax,40(%rsp) L$mul4x_body: call mul4x_internal movq 40(%rsp),%rsi movq $1,%rax movq -48(%rsi),%r15 movq -40(%rsi),%r14 movq -32(%rsi),%r13 movq -24(%rsi),%r12 movq -16(%rsi),%rbp movq -8(%rsi),%rbx leaq (%rsi),%rsp L$mul4x_epilogue: ret .p2align 5 mul4x_internal: shlq $5,%r9 movd 8(%rax),%xmm5 leaq L$inc(%rip),%rax leaq 128(%rdx,%r9,1),%r13 shrq $5,%r9 movdqa 0(%rax),%xmm0 movdqa 16(%rax),%xmm1 leaq 88-112(%rsp,%r9,1),%r10 leaq 128(%rdx),%r12 pshufd $0,%xmm5,%xmm5 movdqa %xmm1,%xmm4 .byte 0x67,0x67 movdqa %xmm1,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 .byte 0x67 movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,112(%r10) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,128(%r10) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 movdqa %xmm2,144(%r10) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,160(%r10) movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,176(%r10) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,192(%r10) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 movdqa %xmm2,208(%r10) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,224(%r10) movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,240(%r10) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,256(%r10) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 movdqa %xmm2,272(%r10) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,288(%r10) movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,304(%r10) paddd %xmm2,%xmm3 .byte 0x67 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,320(%r10) pcmpeqd %xmm5,%xmm3 movdqa %xmm2,336(%r10) pand 64(%r12),%xmm0 pand 80(%r12),%xmm1 pand 96(%r12),%xmm2 movdqa %xmm3,352(%r10) pand 112(%r12),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 movdqa -128(%r12),%xmm4 movdqa -112(%r12),%xmm5 movdqa -96(%r12),%xmm2 pand 112(%r10),%xmm4 movdqa -80(%r12),%xmm3 pand 128(%r10),%xmm5 por %xmm4,%xmm0 pand 144(%r10),%xmm2 por %xmm5,%xmm1 pand 160(%r10),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 movdqa -64(%r12),%xmm4 movdqa -48(%r12),%xmm5 movdqa -32(%r12),%xmm2 pand 176(%r10),%xmm4 movdqa -16(%r12),%xmm3 pand 192(%r10),%xmm5 por %xmm4,%xmm0 pand 208(%r10),%xmm2 por %xmm5,%xmm1 pand 224(%r10),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 movdqa 0(%r12),%xmm4 movdqa 16(%r12),%xmm5 movdqa 32(%r12),%xmm2 pand 240(%r10),%xmm4 movdqa 48(%r12),%xmm3 pand 256(%r10),%xmm5 por %xmm4,%xmm0 pand 272(%r10),%xmm2 por %xmm5,%xmm1 pand 288(%r10),%xmm3 por 
%xmm2,%xmm0 por %xmm3,%xmm1 por %xmm1,%xmm0 pshufd $0x4e,%xmm0,%xmm1 por %xmm1,%xmm0 leaq 256(%r12),%r12 .byte 102,72,15,126,195 movq %r13,16+8(%rsp) movq %rdi,56+8(%rsp) movq (%r8),%r8 movq (%rsi),%rax leaq (%rsi,%r9,1),%rsi negq %r9 movq %r8,%rbp mulq %rbx movq %rax,%r10 movq (%rcx),%rax imulq %r10,%rbp leaq 64+8(%rsp),%r14 movq %rdx,%r11 mulq %rbp addq %rax,%r10 movq 8(%rsi,%r9,1),%rax adcq $0,%rdx movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq 8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq 16(%rsi,%r9,1),%rax adcq $0,%rdx addq %r11,%rdi leaq 32(%r9),%r15 leaq 32(%rcx),%rcx adcq $0,%rdx movq %rdi,(%r14) movq %rdx,%r13 jmp L$1st4x .p2align 5 L$1st4x: mulq %rbx addq %rax,%r10 movq -16(%rcx),%rax leaq 32(%r14),%r14 adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq -8(%rsi,%r15,1),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %r13,-24(%r14) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq -8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq (%rsi,%r15,1),%rax adcq $0,%rdx addq %r11,%rdi adcq $0,%rdx movq %rdi,-16(%r14) movq %rdx,%r13 mulq %rbx addq %rax,%r10 movq 0(%rcx),%rax adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq 8(%rsi,%r15,1),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %r13,-8(%r14) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq 8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq 16(%rsi,%r15,1),%rax adcq $0,%rdx addq %r11,%rdi leaq 32(%rcx),%rcx adcq $0,%rdx movq %rdi,(%r14) movq %rdx,%r13 addq $32,%r15 jnz L$1st4x mulq %rbx addq %rax,%r10 movq -16(%rcx),%rax leaq 32(%r14),%r14 adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq -8(%rsi),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %r13,-24(%r14) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq -8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq (%rsi,%r9,1),%rax adcq $0,%rdx addq %r11,%rdi adcq $0,%rdx movq %rdi,-16(%r14) movq %rdx,%r13 leaq (%rcx,%r9,1),%rcx xorq %rdi,%rdi addq %r10,%r13 adcq $0,%rdi movq %r13,-8(%r14) jmp L$outer4x .p2align 5 L$outer4x: leaq 16+128(%r14),%rdx pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 movdqa -128(%r12),%xmm0 movdqa -112(%r12),%xmm1 movdqa -96(%r12),%xmm2 movdqa -80(%r12),%xmm3 pand -128(%rdx),%xmm0 pand -112(%rdx),%xmm1 por %xmm0,%xmm4 pand -96(%rdx),%xmm2 por %xmm1,%xmm5 pand -80(%rdx),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqa -64(%r12),%xmm0 movdqa -48(%r12),%xmm1 movdqa -32(%r12),%xmm2 movdqa -16(%r12),%xmm3 pand -64(%rdx),%xmm0 pand -48(%rdx),%xmm1 por %xmm0,%xmm4 pand -32(%rdx),%xmm2 por %xmm1,%xmm5 pand -16(%rdx),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqa 0(%r12),%xmm0 movdqa 16(%r12),%xmm1 movdqa 32(%r12),%xmm2 movdqa 48(%r12),%xmm3 pand 0(%rdx),%xmm0 pand 16(%rdx),%xmm1 por %xmm0,%xmm4 pand 32(%rdx),%xmm2 por %xmm1,%xmm5 pand 48(%rdx),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqa 64(%r12),%xmm0 movdqa 80(%r12),%xmm1 movdqa 96(%r12),%xmm2 movdqa 112(%r12),%xmm3 pand 64(%rdx),%xmm0 pand 80(%rdx),%xmm1 por %xmm0,%xmm4 pand 96(%rdx),%xmm2 por %xmm1,%xmm5 pand 112(%rdx),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 por %xmm5,%xmm4 pshufd $0x4e,%xmm4,%xmm0 por %xmm4,%xmm0 leaq 256(%r12),%r12 .byte 102,72,15,126,195 movq (%r14,%r9,1),%r10 movq %r8,%rbp mulq %rbx addq %rax,%r10 movq (%rcx),%rax adcq $0,%rdx imulq %r10,%rbp movq %rdx,%r11 movq %rdi,(%r14) leaq (%r14,%r9,1),%r14 mulq %rbp addq %rax,%r10 movq 8(%rsi,%r9,1),%rax adcq $0,%rdx movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq 8(%rcx),%rax adcq $0,%rdx addq 8(%r14),%r11 adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq 
16(%rsi,%r9,1),%rax adcq $0,%rdx addq %r11,%rdi leaq 32(%r9),%r15 leaq 32(%rcx),%rcx adcq $0,%rdx movq %rdx,%r13 jmp L$inner4x .p2align 5 L$inner4x: mulq %rbx addq %rax,%r10 movq -16(%rcx),%rax adcq $0,%rdx addq 16(%r14),%r10 leaq 32(%r14),%r14 adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq -8(%rsi,%r15,1),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %rdi,-32(%r14) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq -8(%rcx),%rax adcq $0,%rdx addq -8(%r14),%r11 adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq (%rsi,%r15,1),%rax adcq $0,%rdx addq %r11,%rdi adcq $0,%rdx movq %r13,-24(%r14) movq %rdx,%r13 mulq %rbx addq %rax,%r10 movq 0(%rcx),%rax adcq $0,%rdx addq (%r14),%r10 adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq 8(%rsi,%r15,1),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %rdi,-16(%r14) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq 8(%rcx),%rax adcq $0,%rdx addq 8(%r14),%r11 adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq 16(%rsi,%r15,1),%rax adcq $0,%rdx addq %r11,%rdi leaq 32(%rcx),%rcx adcq $0,%rdx movq %r13,-8(%r14) movq %rdx,%r13 addq $32,%r15 jnz L$inner4x mulq %rbx addq %rax,%r10 movq -16(%rcx),%rax adcq $0,%rdx addq 16(%r14),%r10 leaq 32(%r14),%r14 adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq -8(%rsi),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %rdi,-32(%r14) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq %rbp,%rax movq -8(%rcx),%rbp adcq $0,%rdx addq -8(%r14),%r11 adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq (%rsi,%r9,1),%rax adcq $0,%rdx addq %r11,%rdi adcq $0,%rdx movq %r13,-24(%r14) movq %rdx,%r13 movq %rdi,-16(%r14) leaq (%rcx,%r9,1),%rcx xorq %rdi,%rdi addq %r10,%r13 adcq $0,%rdi addq (%r14),%r13 adcq $0,%rdi movq %r13,-8(%r14) cmpq 16+8(%rsp),%r12 jb L$outer4x xorq %rax,%rax subq %r13,%rbp adcq %r15,%r15 orq %r15,%rdi subq %rdi,%rax leaq (%r14,%r9,1),%rbx movq (%rcx),%r12 leaq (%rcx),%rbp movq %r9,%rcx sarq $3+2,%rcx movq 56+8(%rsp),%rdi decq %r12 xorq %r10,%r10 movq 8(%rbp),%r13 movq 16(%rbp),%r14 movq 24(%rbp),%r15 jmp L$sqr4x_sub_entry .globl _bn_power5_nohw .private_extern _bn_power5_nohw .p2align 5 _bn_power5_nohw: _CET_ENDBR movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 L$power5_prologue: shll $3,%r9d leal (%r9,%r9,2),%r10d negq %r9 movq (%r8),%r8 leaq -320(%rsp,%r9,2),%r11 movq %rsp,%rbp subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb L$pwr_sp_alt subq %r11,%rbp leaq -320(%rbp,%r9,2),%rbp jmp L$pwr_sp_done .p2align 5 L$pwr_sp_alt: leaq 4096-320(,%r9,2),%r10 leaq -320(%rbp,%r9,2),%rbp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 subq %r11,%rbp L$pwr_sp_done: andq $-64,%rbp movq %rsp,%r11 subq %rbp,%r11 andq $-4096,%r11 leaq (%r11,%rbp,1),%rsp movq (%rsp),%r10 cmpq %rbp,%rsp ja L$pwr_page_walk jmp L$pwr_page_walk_done L$pwr_page_walk: leaq -4096(%rsp),%rsp movq (%rsp),%r10 cmpq %rbp,%rsp ja L$pwr_page_walk L$pwr_page_walk_done: movq %r9,%r10 negq %r9 movq %r8,32(%rsp) movq %rax,40(%rsp) L$power5_body: .byte 102,72,15,110,207 .byte 102,72,15,110,209 .byte 102,73,15,110,218 .byte 102,72,15,110,226 call __bn_sqr8x_internal call __bn_post4x_internal call __bn_sqr8x_internal call __bn_post4x_internal call __bn_sqr8x_internal call __bn_post4x_internal call __bn_sqr8x_internal call __bn_post4x_internal call __bn_sqr8x_internal call __bn_post4x_internal .byte 102,72,15,126,209 .byte 102,72,15,126,226 movq %rsi,%rdi movq 40(%rsp),%rax leaq 32(%rsp),%r8 call mul4x_internal movq 40(%rsp),%rsi movq $1,%rax movq -48(%rsi),%r15 movq -40(%rsi),%r14 movq -32(%rsi),%r13 
movq -24(%rsi),%r12 movq -16(%rsi),%rbp movq -8(%rsi),%rbx leaq (%rsi),%rsp L$power5_epilogue: ret .globl _bn_sqr8x_internal .private_extern _bn_sqr8x_internal .private_extern _bn_sqr8x_internal .p2align 5 _bn_sqr8x_internal: __bn_sqr8x_internal: _CET_ENDBR leaq 32(%r10),%rbp leaq (%rsi,%r9,1),%rsi movq %r9,%rcx movq -32(%rsi,%rbp,1),%r14 leaq 48+8(%rsp,%r9,2),%rdi movq -24(%rsi,%rbp,1),%rax leaq -32(%rdi,%rbp,1),%rdi movq -16(%rsi,%rbp,1),%rbx movq %rax,%r15 mulq %r14 movq %rax,%r10 movq %rbx,%rax movq %rdx,%r11 movq %r10,-24(%rdi,%rbp,1) mulq %r14 addq %rax,%r11 movq %rbx,%rax adcq $0,%rdx movq %r11,-16(%rdi,%rbp,1) movq %rdx,%r10 movq -8(%rsi,%rbp,1),%rbx mulq %r15 movq %rax,%r12 movq %rbx,%rax movq %rdx,%r13 leaq (%rbp),%rcx mulq %r14 addq %rax,%r10 movq %rbx,%rax movq %rdx,%r11 adcq $0,%r11 addq %r12,%r10 adcq $0,%r11 movq %r10,-8(%rdi,%rcx,1) jmp L$sqr4x_1st .p2align 5 L$sqr4x_1st: movq (%rsi,%rcx,1),%rbx mulq %r15 addq %rax,%r13 movq %rbx,%rax movq %rdx,%r12 adcq $0,%r12 mulq %r14 addq %rax,%r11 movq %rbx,%rax movq 8(%rsi,%rcx,1),%rbx movq %rdx,%r10 adcq $0,%r10 addq %r13,%r11 adcq $0,%r10 mulq %r15 addq %rax,%r12 movq %rbx,%rax movq %r11,(%rdi,%rcx,1) movq %rdx,%r13 adcq $0,%r13 mulq %r14 addq %rax,%r10 movq %rbx,%rax movq 16(%rsi,%rcx,1),%rbx movq %rdx,%r11 adcq $0,%r11 addq %r12,%r10 adcq $0,%r11 mulq %r15 addq %rax,%r13 movq %rbx,%rax movq %r10,8(%rdi,%rcx,1) movq %rdx,%r12 adcq $0,%r12 mulq %r14 addq %rax,%r11 movq %rbx,%rax movq 24(%rsi,%rcx,1),%rbx movq %rdx,%r10 adcq $0,%r10 addq %r13,%r11 adcq $0,%r10 mulq %r15 addq %rax,%r12 movq %rbx,%rax movq %r11,16(%rdi,%rcx,1) movq %rdx,%r13 adcq $0,%r13 leaq 32(%rcx),%rcx mulq %r14 addq %rax,%r10 movq %rbx,%rax movq %rdx,%r11 adcq $0,%r11 addq %r12,%r10 adcq $0,%r11 movq %r10,-8(%rdi,%rcx,1) cmpq $0,%rcx jne L$sqr4x_1st mulq %r15 addq %rax,%r13 leaq 16(%rbp),%rbp adcq $0,%rdx addq %r11,%r13 adcq $0,%rdx movq %r13,(%rdi) movq %rdx,%r12 movq %rdx,8(%rdi) jmp L$sqr4x_outer .p2align 5 L$sqr4x_outer: movq -32(%rsi,%rbp,1),%r14 leaq 48+8(%rsp,%r9,2),%rdi movq -24(%rsi,%rbp,1),%rax leaq -32(%rdi,%rbp,1),%rdi movq -16(%rsi,%rbp,1),%rbx movq %rax,%r15 mulq %r14 movq -24(%rdi,%rbp,1),%r10 addq %rax,%r10 movq %rbx,%rax adcq $0,%rdx movq %r10,-24(%rdi,%rbp,1) movq %rdx,%r11 mulq %r14 addq %rax,%r11 movq %rbx,%rax adcq $0,%rdx addq -16(%rdi,%rbp,1),%r11 movq %rdx,%r10 adcq $0,%r10 movq %r11,-16(%rdi,%rbp,1) xorq %r12,%r12 movq -8(%rsi,%rbp,1),%rbx mulq %r15 addq %rax,%r12 movq %rbx,%rax adcq $0,%rdx addq -8(%rdi,%rbp,1),%r12 movq %rdx,%r13 adcq $0,%r13 mulq %r14 addq %rax,%r10 movq %rbx,%rax adcq $0,%rdx addq %r12,%r10 movq %rdx,%r11 adcq $0,%r11 movq %r10,-8(%rdi,%rbp,1) leaq (%rbp),%rcx jmp L$sqr4x_inner .p2align 5 L$sqr4x_inner: movq (%rsi,%rcx,1),%rbx mulq %r15 addq %rax,%r13 movq %rbx,%rax movq %rdx,%r12 adcq $0,%r12 addq (%rdi,%rcx,1),%r13 adcq $0,%r12 .byte 0x67 mulq %r14 addq %rax,%r11 movq %rbx,%rax movq 8(%rsi,%rcx,1),%rbx movq %rdx,%r10 adcq $0,%r10 addq %r13,%r11 adcq $0,%r10 mulq %r15 addq %rax,%r12 movq %r11,(%rdi,%rcx,1) movq %rbx,%rax movq %rdx,%r13 adcq $0,%r13 addq 8(%rdi,%rcx,1),%r12 leaq 16(%rcx),%rcx adcq $0,%r13 mulq %r14 addq %rax,%r10 movq %rbx,%rax adcq $0,%rdx addq %r12,%r10 movq %rdx,%r11 adcq $0,%r11 movq %r10,-8(%rdi,%rcx,1) cmpq $0,%rcx jne L$sqr4x_inner .byte 0x67 mulq %r15 addq %rax,%r13 adcq $0,%rdx addq %r11,%r13 adcq $0,%rdx movq %r13,(%rdi) movq %rdx,%r12 movq %rdx,8(%rdi) addq $16,%rbp jnz L$sqr4x_outer movq -32(%rsi),%r14 leaq 48+8(%rsp,%r9,2),%rdi movq -24(%rsi),%rax leaq -32(%rdi,%rbp,1),%rdi movq 
-16(%rsi),%rbx movq %rax,%r15 mulq %r14 addq %rax,%r10 movq %rbx,%rax movq %rdx,%r11 adcq $0,%r11 mulq %r14 addq %rax,%r11 movq %rbx,%rax movq %r10,-24(%rdi) movq %rdx,%r10 adcq $0,%r10 addq %r13,%r11 movq -8(%rsi),%rbx adcq $0,%r10 mulq %r15 addq %rax,%r12 movq %rbx,%rax movq %r11,-16(%rdi) movq %rdx,%r13 adcq $0,%r13 mulq %r14 addq %rax,%r10 movq %rbx,%rax movq %rdx,%r11 adcq $0,%r11 addq %r12,%r10 adcq $0,%r11 movq %r10,-8(%rdi) mulq %r15 addq %rax,%r13 movq -16(%rsi),%rax adcq $0,%rdx addq %r11,%r13 adcq $0,%rdx movq %r13,(%rdi) movq %rdx,%r12 movq %rdx,8(%rdi) mulq %rbx addq $16,%rbp xorq %r14,%r14 subq %r9,%rbp xorq %r15,%r15 addq %r12,%rax adcq $0,%rdx movq %rax,8(%rdi) movq %rdx,16(%rdi) movq %r15,24(%rdi) movq -16(%rsi,%rbp,1),%rax leaq 48+8(%rsp),%rdi xorq %r10,%r10 movq 8(%rdi),%r11 leaq (%r14,%r10,2),%r12 shrq $63,%r10 leaq (%rcx,%r11,2),%r13 shrq $63,%r11 orq %r10,%r13 movq 16(%rdi),%r10 movq %r11,%r14 mulq %rax negq %r15 movq 24(%rdi),%r11 adcq %rax,%r12 movq -8(%rsi,%rbp,1),%rax movq %r12,(%rdi) adcq %rdx,%r13 leaq (%r14,%r10,2),%rbx movq %r13,8(%rdi) sbbq %r15,%r15 shrq $63,%r10 leaq (%rcx,%r11,2),%r8 shrq $63,%r11 orq %r10,%r8 movq 32(%rdi),%r10 movq %r11,%r14 mulq %rax negq %r15 movq 40(%rdi),%r11 adcq %rax,%rbx movq 0(%rsi,%rbp,1),%rax movq %rbx,16(%rdi) adcq %rdx,%r8 leaq 16(%rbp),%rbp movq %r8,24(%rdi) sbbq %r15,%r15 leaq 64(%rdi),%rdi jmp L$sqr4x_shift_n_add .p2align 5 L$sqr4x_shift_n_add: leaq (%r14,%r10,2),%r12 shrq $63,%r10 leaq (%rcx,%r11,2),%r13 shrq $63,%r11 orq %r10,%r13 movq -16(%rdi),%r10 movq %r11,%r14 mulq %rax negq %r15 movq -8(%rdi),%r11 adcq %rax,%r12 movq -8(%rsi,%rbp,1),%rax movq %r12,-32(%rdi) adcq %rdx,%r13 leaq (%r14,%r10,2),%rbx movq %r13,-24(%rdi) sbbq %r15,%r15 shrq $63,%r10 leaq (%rcx,%r11,2),%r8 shrq $63,%r11 orq %r10,%r8 movq 0(%rdi),%r10 movq %r11,%r14 mulq %rax negq %r15 movq 8(%rdi),%r11 adcq %rax,%rbx movq 0(%rsi,%rbp,1),%rax movq %rbx,-16(%rdi) adcq %rdx,%r8 leaq (%r14,%r10,2),%r12 movq %r8,-8(%rdi) sbbq %r15,%r15 shrq $63,%r10 leaq (%rcx,%r11,2),%r13 shrq $63,%r11 orq %r10,%r13 movq 16(%rdi),%r10 movq %r11,%r14 mulq %rax negq %r15 movq 24(%rdi),%r11 adcq %rax,%r12 movq 8(%rsi,%rbp,1),%rax movq %r12,0(%rdi) adcq %rdx,%r13 leaq (%r14,%r10,2),%rbx movq %r13,8(%rdi) sbbq %r15,%r15 shrq $63,%r10 leaq (%rcx,%r11,2),%r8 shrq $63,%r11 orq %r10,%r8 movq 32(%rdi),%r10 movq %r11,%r14 mulq %rax negq %r15 movq 40(%rdi),%r11 adcq %rax,%rbx movq 16(%rsi,%rbp,1),%rax movq %rbx,16(%rdi) adcq %rdx,%r8 movq %r8,24(%rdi) sbbq %r15,%r15 leaq 64(%rdi),%rdi addq $32,%rbp jnz L$sqr4x_shift_n_add leaq (%r14,%r10,2),%r12 .byte 0x67 shrq $63,%r10 leaq (%rcx,%r11,2),%r13 shrq $63,%r11 orq %r10,%r13 movq -16(%rdi),%r10 movq %r11,%r14 mulq %rax negq %r15 movq -8(%rdi),%r11 adcq %rax,%r12 movq -8(%rsi),%rax movq %r12,-32(%rdi) adcq %rdx,%r13 leaq (%r14,%r10,2),%rbx movq %r13,-24(%rdi) sbbq %r15,%r15 shrq $63,%r10 leaq (%rcx,%r11,2),%r8 shrq $63,%r11 orq %r10,%r8 mulq %rax negq %r15 adcq %rax,%rbx adcq %rdx,%r8 movq %rbx,-16(%rdi) movq %r8,-8(%rdi) .byte 102,72,15,126,213 __bn_sqr8x_reduction: xorq %rax,%rax leaq (%r9,%rbp,1),%rcx leaq 48+8(%rsp,%r9,2),%rdx movq %rcx,0+8(%rsp) leaq 48+8(%rsp,%r9,1),%rdi movq %rdx,8+8(%rsp) negq %r9 jmp L$8x_reduction_loop .p2align 5 L$8x_reduction_loop: leaq (%rdi,%r9,1),%rdi .byte 0x66 movq 0(%rdi),%rbx movq 8(%rdi),%r9 movq 16(%rdi),%r10 movq 24(%rdi),%r11 movq 32(%rdi),%r12 movq 40(%rdi),%r13 movq 48(%rdi),%r14 movq 56(%rdi),%r15 movq %rax,(%rdx) leaq 64(%rdi),%rdi .byte 0x67 movq %rbx,%r8 imulq 32+8(%rsp),%rbx movq 0(%rbp),%rax 
movl $8,%ecx jmp L$8x_reduce .p2align 5 L$8x_reduce: mulq %rbx movq 8(%rbp),%rax negq %r8 movq %rdx,%r8 adcq $0,%r8 mulq %rbx addq %rax,%r9 movq 16(%rbp),%rax adcq $0,%rdx addq %r9,%r8 movq %rbx,48-8+8(%rsp,%rcx,8) movq %rdx,%r9 adcq $0,%r9 mulq %rbx addq %rax,%r10 movq 24(%rbp),%rax adcq $0,%rdx addq %r10,%r9 movq 32+8(%rsp),%rsi movq %rdx,%r10 adcq $0,%r10 mulq %rbx addq %rax,%r11 movq 32(%rbp),%rax adcq $0,%rdx imulq %r8,%rsi addq %r11,%r10 movq %rdx,%r11 adcq $0,%r11 mulq %rbx addq %rax,%r12 movq 40(%rbp),%rax adcq $0,%rdx addq %r12,%r11 movq %rdx,%r12 adcq $0,%r12 mulq %rbx addq %rax,%r13 movq 48(%rbp),%rax adcq $0,%rdx addq %r13,%r12 movq %rdx,%r13 adcq $0,%r13 mulq %rbx addq %rax,%r14 movq 56(%rbp),%rax adcq $0,%rdx addq %r14,%r13 movq %rdx,%r14 adcq $0,%r14 mulq %rbx movq %rsi,%rbx addq %rax,%r15 movq 0(%rbp),%rax adcq $0,%rdx addq %r15,%r14 movq %rdx,%r15 adcq $0,%r15 decl %ecx jnz L$8x_reduce leaq 64(%rbp),%rbp xorq %rax,%rax movq 8+8(%rsp),%rdx cmpq 0+8(%rsp),%rbp jae L$8x_no_tail .byte 0x66 addq 0(%rdi),%r8 adcq 8(%rdi),%r9 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq 40(%rdi),%r13 adcq 48(%rdi),%r14 adcq 56(%rdi),%r15 sbbq %rsi,%rsi movq 48+56+8(%rsp),%rbx movl $8,%ecx movq 0(%rbp),%rax jmp L$8x_tail .p2align 5 L$8x_tail: mulq %rbx addq %rax,%r8 movq 8(%rbp),%rax movq %r8,(%rdi) movq %rdx,%r8 adcq $0,%r8 mulq %rbx addq %rax,%r9 movq 16(%rbp),%rax adcq $0,%rdx addq %r9,%r8 leaq 8(%rdi),%rdi movq %rdx,%r9 adcq $0,%r9 mulq %rbx addq %rax,%r10 movq 24(%rbp),%rax adcq $0,%rdx addq %r10,%r9 movq %rdx,%r10 adcq $0,%r10 mulq %rbx addq %rax,%r11 movq 32(%rbp),%rax adcq $0,%rdx addq %r11,%r10 movq %rdx,%r11 adcq $0,%r11 mulq %rbx addq %rax,%r12 movq 40(%rbp),%rax adcq $0,%rdx addq %r12,%r11 movq %rdx,%r12 adcq $0,%r12 mulq %rbx addq %rax,%r13 movq 48(%rbp),%rax adcq $0,%rdx addq %r13,%r12 movq %rdx,%r13 adcq $0,%r13 mulq %rbx addq %rax,%r14 movq 56(%rbp),%rax adcq $0,%rdx addq %r14,%r13 movq %rdx,%r14 adcq $0,%r14 mulq %rbx movq 48-16+8(%rsp,%rcx,8),%rbx addq %rax,%r15 adcq $0,%rdx addq %r15,%r14 movq 0(%rbp),%rax movq %rdx,%r15 adcq $0,%r15 decl %ecx jnz L$8x_tail leaq 64(%rbp),%rbp movq 8+8(%rsp),%rdx cmpq 0+8(%rsp),%rbp jae L$8x_tail_done movq 48+56+8(%rsp),%rbx negq %rsi movq 0(%rbp),%rax adcq 0(%rdi),%r8 adcq 8(%rdi),%r9 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq 40(%rdi),%r13 adcq 48(%rdi),%r14 adcq 56(%rdi),%r15 sbbq %rsi,%rsi movl $8,%ecx jmp L$8x_tail .p2align 5 L$8x_tail_done: xorq %rax,%rax addq (%rdx),%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 adcq $0,%r14 adcq $0,%r15 adcq $0,%rax negq %rsi L$8x_no_tail: adcq 0(%rdi),%r8 adcq 8(%rdi),%r9 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq 40(%rdi),%r13 adcq 48(%rdi),%r14 adcq 56(%rdi),%r15 adcq $0,%rax movq -8(%rbp),%rcx xorq %rsi,%rsi .byte 102,72,15,126,213 movq %r8,0(%rdi) movq %r9,8(%rdi) .byte 102,73,15,126,217 movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq %r14,48(%rdi) movq %r15,56(%rdi) leaq 64(%rdi),%rdi cmpq %rdx,%rdi jb L$8x_reduction_loop ret .p2align 5 __bn_post4x_internal: movq 0(%rbp),%r12 leaq (%rdi,%r9,1),%rbx movq %r9,%rcx .byte 102,72,15,126,207 negq %rax .byte 102,72,15,126,206 sarq $3+2,%rcx decq %r12 xorq %r10,%r10 movq 8(%rbp),%r13 movq 16(%rbp),%r14 movq 24(%rbp),%r15 jmp L$sqr4x_sub_entry .p2align 4 L$sqr4x_sub: movq 0(%rbp),%r12 movq 8(%rbp),%r13 movq 16(%rbp),%r14 movq 24(%rbp),%r15 L$sqr4x_sub_entry: leaq 32(%rbp),%rbp notq %r12 notq %r13 notq %r14 notq %r15 andq %rax,%r12 andq %rax,%r13 andq 
%rax,%r14 andq %rax,%r15 negq %r10 adcq 0(%rbx),%r12 adcq 8(%rbx),%r13 adcq 16(%rbx),%r14 adcq 24(%rbx),%r15 movq %r12,0(%rdi) leaq 32(%rbx),%rbx movq %r13,8(%rdi) sbbq %r10,%r10 movq %r14,16(%rdi) movq %r15,24(%rdi) leaq 32(%rdi),%rdi incq %rcx jnz L$sqr4x_sub movq %r9,%r10 negq %r9 ret .globl _bn_mulx4x_mont_gather5 .private_extern _bn_mulx4x_mont_gather5 .p2align 5 _bn_mulx4x_mont_gather5: _CET_ENDBR movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 L$mulx4x_prologue: shll $3,%r9d leaq (%r9,%r9,2),%r10 negq %r9 movq (%r8),%r8 leaq -320(%rsp,%r9,2),%r11 movq %rsp,%rbp subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb L$mulx4xsp_alt subq %r11,%rbp leaq -320(%rbp,%r9,2),%rbp jmp L$mulx4xsp_done L$mulx4xsp_alt: leaq 4096-320(,%r9,2),%r10 leaq -320(%rbp,%r9,2),%rbp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 subq %r11,%rbp L$mulx4xsp_done: andq $-64,%rbp movq %rsp,%r11 subq %rbp,%r11 andq $-4096,%r11 leaq (%r11,%rbp,1),%rsp movq (%rsp),%r10 cmpq %rbp,%rsp ja L$mulx4x_page_walk jmp L$mulx4x_page_walk_done L$mulx4x_page_walk: leaq -4096(%rsp),%rsp movq (%rsp),%r10 cmpq %rbp,%rsp ja L$mulx4x_page_walk L$mulx4x_page_walk_done: movq %r8,32(%rsp) movq %rax,40(%rsp) L$mulx4x_body: call mulx4x_internal movq 40(%rsp),%rsi movq $1,%rax movq -48(%rsi),%r15 movq -40(%rsi),%r14 movq -32(%rsi),%r13 movq -24(%rsi),%r12 movq -16(%rsi),%rbp movq -8(%rsi),%rbx leaq (%rsi),%rsp L$mulx4x_epilogue: ret .p2align 5 mulx4x_internal: movq %r9,8(%rsp) movq %r9,%r10 negq %r9 shlq $5,%r9 negq %r10 leaq 128(%rdx,%r9,1),%r13 shrq $5+5,%r9 movd 8(%rax),%xmm5 subq $1,%r9 leaq L$inc(%rip),%rax movq %r13,16+8(%rsp) movq %r9,24+8(%rsp) movq %rdi,56+8(%rsp) movdqa 0(%rax),%xmm0 movdqa 16(%rax),%xmm1 leaq 88-112(%rsp,%r10,1),%r10 leaq 128(%rdx),%rdi pshufd $0,%xmm5,%xmm5 movdqa %xmm1,%xmm4 .byte 0x67 movdqa %xmm1,%xmm2 .byte 0x67 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,112(%r10) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,128(%r10) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 movdqa %xmm2,144(%r10) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,160(%r10) movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,176(%r10) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,192(%r10) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 movdqa %xmm2,208(%r10) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,224(%r10) movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,240(%r10) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,256(%r10) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 movdqa %xmm2,272(%r10) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,288(%r10) movdqa %xmm4,%xmm3 .byte 0x67 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,304(%r10) paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,320(%r10) pcmpeqd %xmm5,%xmm3 movdqa %xmm2,336(%r10) pand 64(%rdi),%xmm0 pand 80(%rdi),%xmm1 pand 96(%rdi),%xmm2 movdqa %xmm3,352(%r10) pand 112(%rdi),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 movdqa -128(%rdi),%xmm4 movdqa -112(%rdi),%xmm5 movdqa -96(%rdi),%xmm2 pand 112(%r10),%xmm4 movdqa -80(%rdi),%xmm3 pand 128(%r10),%xmm5 por %xmm4,%xmm0 pand 144(%r10),%xmm2 por %xmm5,%xmm1 pand 160(%r10),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 movdqa -64(%rdi),%xmm4 movdqa -48(%rdi),%xmm5 movdqa -32(%rdi),%xmm2 pand 176(%r10),%xmm4 movdqa -16(%rdi),%xmm3 pand 
192(%r10),%xmm5 por %xmm4,%xmm0 pand 208(%r10),%xmm2 por %xmm5,%xmm1 pand 224(%r10),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 movdqa 0(%rdi),%xmm4 movdqa 16(%rdi),%xmm5 movdqa 32(%rdi),%xmm2 pand 240(%r10),%xmm4 movdqa 48(%rdi),%xmm3 pand 256(%r10),%xmm5 por %xmm4,%xmm0 pand 272(%r10),%xmm2 por %xmm5,%xmm1 pand 288(%r10),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 pxor %xmm1,%xmm0 pshufd $0x4e,%xmm0,%xmm1 por %xmm1,%xmm0 leaq 256(%rdi),%rdi .byte 102,72,15,126,194 leaq 64+32+8(%rsp),%rbx movq %rdx,%r9 mulxq 0(%rsi),%r8,%rax mulxq 8(%rsi),%r11,%r12 addq %rax,%r11 mulxq 16(%rsi),%rax,%r13 adcq %rax,%r12 adcq $0,%r13 mulxq 24(%rsi),%rax,%r14 movq %r8,%r15 imulq 32+8(%rsp),%r8 xorq %rbp,%rbp movq %r8,%rdx movq %rdi,8+8(%rsp) leaq 32(%rsi),%rsi adcxq %rax,%r13 adcxq %rbp,%r14 mulxq 0(%rcx),%rax,%r10 adcxq %rax,%r15 adoxq %r11,%r10 mulxq 8(%rcx),%rax,%r11 adcxq %rax,%r10 adoxq %r12,%r11 mulxq 16(%rcx),%rax,%r12 movq 24+8(%rsp),%rdi movq %r10,-32(%rbx) adcxq %rax,%r11 adoxq %r13,%r12 mulxq 24(%rcx),%rax,%r15 movq %r9,%rdx movq %r11,-24(%rbx) adcxq %rax,%r12 adoxq %rbp,%r15 leaq 32(%rcx),%rcx movq %r12,-16(%rbx) jmp L$mulx4x_1st .p2align 5 L$mulx4x_1st: adcxq %rbp,%r15 mulxq 0(%rsi),%r10,%rax adcxq %r14,%r10 mulxq 8(%rsi),%r11,%r14 adcxq %rax,%r11 mulxq 16(%rsi),%r12,%rax adcxq %r14,%r12 mulxq 24(%rsi),%r13,%r14 .byte 0x67,0x67 movq %r8,%rdx adcxq %rax,%r13 adcxq %rbp,%r14 leaq 32(%rsi),%rsi leaq 32(%rbx),%rbx adoxq %r15,%r10 mulxq 0(%rcx),%rax,%r15 adcxq %rax,%r10 adoxq %r15,%r11 mulxq 8(%rcx),%rax,%r15 adcxq %rax,%r11 adoxq %r15,%r12 mulxq 16(%rcx),%rax,%r15 movq %r10,-40(%rbx) adcxq %rax,%r12 movq %r11,-32(%rbx) adoxq %r15,%r13 mulxq 24(%rcx),%rax,%r15 movq %r9,%rdx movq %r12,-24(%rbx) adcxq %rax,%r13 adoxq %rbp,%r15 leaq 32(%rcx),%rcx movq %r13,-16(%rbx) decq %rdi jnz L$mulx4x_1st movq 8(%rsp),%rax adcq %rbp,%r15 leaq (%rsi,%rax,1),%rsi addq %r15,%r14 movq 8+8(%rsp),%rdi adcq %rbp,%rbp movq %r14,-8(%rbx) jmp L$mulx4x_outer .p2align 5 L$mulx4x_outer: leaq 16-256(%rbx),%r10 pxor %xmm4,%xmm4 .byte 0x67,0x67 pxor %xmm5,%xmm5 movdqa -128(%rdi),%xmm0 movdqa -112(%rdi),%xmm1 movdqa -96(%rdi),%xmm2 pand 256(%r10),%xmm0 movdqa -80(%rdi),%xmm3 pand 272(%r10),%xmm1 por %xmm0,%xmm4 pand 288(%r10),%xmm2 por %xmm1,%xmm5 pand 304(%r10),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqa -64(%rdi),%xmm0 movdqa -48(%rdi),%xmm1 movdqa -32(%rdi),%xmm2 pand 320(%r10),%xmm0 movdqa -16(%rdi),%xmm3 pand 336(%r10),%xmm1 por %xmm0,%xmm4 pand 352(%r10),%xmm2 por %xmm1,%xmm5 pand 368(%r10),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqa 0(%rdi),%xmm0 movdqa 16(%rdi),%xmm1 movdqa 32(%rdi),%xmm2 pand 384(%r10),%xmm0 movdqa 48(%rdi),%xmm3 pand 400(%r10),%xmm1 por %xmm0,%xmm4 pand 416(%r10),%xmm2 por %xmm1,%xmm5 pand 432(%r10),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqa 64(%rdi),%xmm0 movdqa 80(%rdi),%xmm1 movdqa 96(%rdi),%xmm2 pand 448(%r10),%xmm0 movdqa 112(%rdi),%xmm3 pand 464(%r10),%xmm1 por %xmm0,%xmm4 pand 480(%r10),%xmm2 por %xmm1,%xmm5 pand 496(%r10),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 por %xmm5,%xmm4 pshufd $0x4e,%xmm4,%xmm0 por %xmm4,%xmm0 leaq 256(%rdi),%rdi .byte 102,72,15,126,194 movq %rbp,(%rbx) leaq 32(%rbx,%rax,1),%rbx mulxq 0(%rsi),%r8,%r11 xorq %rbp,%rbp movq %rdx,%r9 mulxq 8(%rsi),%r14,%r12 adoxq -32(%rbx),%r8 adcxq %r14,%r11 mulxq 16(%rsi),%r15,%r13 adoxq -24(%rbx),%r11 adcxq %r15,%r12 mulxq 24(%rsi),%rdx,%r14 adoxq -16(%rbx),%r12 adcxq %rdx,%r13 leaq (%rcx,%rax,1),%rcx leaq 32(%rsi),%rsi adoxq -8(%rbx),%r13 adcxq %rbp,%r14 adoxq %rbp,%r14 movq %r8,%r15 imulq 32+8(%rsp),%r8 movq %r8,%rdx xorq %rbp,%rbp movq 
%rdi,8+8(%rsp) mulxq 0(%rcx),%rax,%r10 adcxq %rax,%r15 adoxq %r11,%r10 mulxq 8(%rcx),%rax,%r11 adcxq %rax,%r10 adoxq %r12,%r11 mulxq 16(%rcx),%rax,%r12 adcxq %rax,%r11 adoxq %r13,%r12 mulxq 24(%rcx),%rax,%r15 movq %r9,%rdx movq 24+8(%rsp),%rdi movq %r10,-32(%rbx) adcxq %rax,%r12 movq %r11,-24(%rbx) adoxq %rbp,%r15 movq %r12,-16(%rbx) leaq 32(%rcx),%rcx jmp L$mulx4x_inner .p2align 5 L$mulx4x_inner: mulxq 0(%rsi),%r10,%rax adcxq %rbp,%r15 adoxq %r14,%r10 mulxq 8(%rsi),%r11,%r14 adcxq 0(%rbx),%r10 adoxq %rax,%r11 mulxq 16(%rsi),%r12,%rax adcxq 8(%rbx),%r11 adoxq %r14,%r12 mulxq 24(%rsi),%r13,%r14 movq %r8,%rdx adcxq 16(%rbx),%r12 adoxq %rax,%r13 adcxq 24(%rbx),%r13 adoxq %rbp,%r14 leaq 32(%rsi),%rsi leaq 32(%rbx),%rbx adcxq %rbp,%r14 adoxq %r15,%r10 mulxq 0(%rcx),%rax,%r15 adcxq %rax,%r10 adoxq %r15,%r11 mulxq 8(%rcx),%rax,%r15 adcxq %rax,%r11 adoxq %r15,%r12 mulxq 16(%rcx),%rax,%r15 movq %r10,-40(%rbx) adcxq %rax,%r12 adoxq %r15,%r13 movq %r11,-32(%rbx) mulxq 24(%rcx),%rax,%r15 movq %r9,%rdx leaq 32(%rcx),%rcx movq %r12,-24(%rbx) adcxq %rax,%r13 adoxq %rbp,%r15 movq %r13,-16(%rbx) decq %rdi jnz L$mulx4x_inner movq 0+8(%rsp),%rax adcq %rbp,%r15 subq 0(%rbx),%rdi movq 8+8(%rsp),%rdi movq 16+8(%rsp),%r10 adcq %r15,%r14 leaq (%rsi,%rax,1),%rsi adcq %rbp,%rbp movq %r14,-8(%rbx) cmpq %r10,%rdi jb L$mulx4x_outer movq -8(%rcx),%r10 movq %rbp,%r8 movq (%rcx,%rax,1),%r12 leaq (%rcx,%rax,1),%rbp movq %rax,%rcx leaq (%rbx,%rax,1),%rdi xorl %eax,%eax xorq %r15,%r15 subq %r14,%r10 adcq %r15,%r15 orq %r15,%r8 sarq $3+2,%rcx subq %r8,%rax movq 56+8(%rsp),%rdx decq %r12 movq 8(%rbp),%r13 xorq %r8,%r8 movq 16(%rbp),%r14 movq 24(%rbp),%r15 jmp L$sqrx4x_sub_entry .globl _bn_powerx5 .private_extern _bn_powerx5 .p2align 5 _bn_powerx5: _CET_ENDBR movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 L$powerx5_prologue: shll $3,%r9d leaq (%r9,%r9,2),%r10 negq %r9 movq (%r8),%r8 leaq -320(%rsp,%r9,2),%r11 movq %rsp,%rbp subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb L$pwrx_sp_alt subq %r11,%rbp leaq -320(%rbp,%r9,2),%rbp jmp L$pwrx_sp_done .p2align 5 L$pwrx_sp_alt: leaq 4096-320(,%r9,2),%r10 leaq -320(%rbp,%r9,2),%rbp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 subq %r11,%rbp L$pwrx_sp_done: andq $-64,%rbp movq %rsp,%r11 subq %rbp,%r11 andq $-4096,%r11 leaq (%r11,%rbp,1),%rsp movq (%rsp),%r10 cmpq %rbp,%rsp ja L$pwrx_page_walk jmp L$pwrx_page_walk_done L$pwrx_page_walk: leaq -4096(%rsp),%rsp movq (%rsp),%r10 cmpq %rbp,%rsp ja L$pwrx_page_walk L$pwrx_page_walk_done: movq %r9,%r10 negq %r9 pxor %xmm0,%xmm0 .byte 102,72,15,110,207 .byte 102,72,15,110,209 .byte 102,73,15,110,218 .byte 102,72,15,110,226 movq %r8,32(%rsp) movq %rax,40(%rsp) L$powerx5_body: call __bn_sqrx8x_internal call __bn_postx4x_internal call __bn_sqrx8x_internal call __bn_postx4x_internal call __bn_sqrx8x_internal call __bn_postx4x_internal call __bn_sqrx8x_internal call __bn_postx4x_internal call __bn_sqrx8x_internal call __bn_postx4x_internal movq %r10,%r9 movq %rsi,%rdi .byte 102,72,15,126,209 .byte 102,72,15,126,226 movq 40(%rsp),%rax call mulx4x_internal movq 40(%rsp),%rsi movq $1,%rax movq -48(%rsi),%r15 movq -40(%rsi),%r14 movq -32(%rsi),%r13 movq -24(%rsi),%r12 movq -16(%rsi),%rbp movq -8(%rsi),%rbx leaq (%rsi),%rsp L$powerx5_epilogue: ret .globl _bn_sqrx8x_internal .private_extern _bn_sqrx8x_internal .private_extern _bn_sqrx8x_internal .p2align 5 _bn_sqrx8x_internal: __bn_sqrx8x_internal: _CET_ENDBR leaq 48+8(%rsp),%rdi leaq (%rsi,%r9,1),%rbp movq %r9,0+8(%rsp) movq %rbp,8+8(%rsp) jmp L$sqr8x_zero_start .p2align 
5 .byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 L$sqrx8x_zero: .byte 0x3e movdqa %xmm0,0(%rdi) movdqa %xmm0,16(%rdi) movdqa %xmm0,32(%rdi) movdqa %xmm0,48(%rdi) L$sqr8x_zero_start: movdqa %xmm0,64(%rdi) movdqa %xmm0,80(%rdi) movdqa %xmm0,96(%rdi) movdqa %xmm0,112(%rdi) leaq 128(%rdi),%rdi subq $64,%r9 jnz L$sqrx8x_zero movq 0(%rsi),%rdx xorq %r10,%r10 xorq %r11,%r11 xorq %r12,%r12 xorq %r13,%r13 xorq %r14,%r14 xorq %r15,%r15 leaq 48+8(%rsp),%rdi xorq %rbp,%rbp jmp L$sqrx8x_outer_loop .p2align 5 L$sqrx8x_outer_loop: mulxq 8(%rsi),%r8,%rax adcxq %r9,%r8 adoxq %rax,%r10 mulxq 16(%rsi),%r9,%rax adcxq %r10,%r9 adoxq %rax,%r11 .byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 adcxq %r11,%r10 adoxq %rax,%r12 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 adcxq %r12,%r11 adoxq %rax,%r13 mulxq 40(%rsi),%r12,%rax adcxq %r13,%r12 adoxq %rax,%r14 mulxq 48(%rsi),%r13,%rax adcxq %r14,%r13 adoxq %r15,%rax mulxq 56(%rsi),%r14,%r15 movq 8(%rsi),%rdx adcxq %rax,%r14 adoxq %rbp,%r15 adcq 64(%rdi),%r15 movq %r8,8(%rdi) movq %r9,16(%rdi) sbbq %rcx,%rcx xorq %rbp,%rbp mulxq 16(%rsi),%r8,%rbx mulxq 24(%rsi),%r9,%rax adcxq %r10,%r8 adoxq %rbx,%r9 mulxq 32(%rsi),%r10,%rbx adcxq %r11,%r9 adoxq %rax,%r10 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 adcxq %r12,%r10 adoxq %rbx,%r11 .byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 adcxq %r13,%r11 adoxq %r14,%r12 .byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 movq 16(%rsi),%rdx adcxq %rax,%r12 adoxq %rbx,%r13 adcxq %r15,%r13 adoxq %rbp,%r14 adcxq %rbp,%r14 movq %r8,24(%rdi) movq %r9,32(%rdi) mulxq 24(%rsi),%r8,%rbx mulxq 32(%rsi),%r9,%rax adcxq %r10,%r8 adoxq %rbx,%r9 mulxq 40(%rsi),%r10,%rbx adcxq %r11,%r9 adoxq %rax,%r10 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 adcxq %r12,%r10 adoxq %r13,%r11 .byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 .byte 0x3e movq 24(%rsi),%rdx adcxq %rbx,%r11 adoxq %rax,%r12 adcxq %r14,%r12 movq %r8,40(%rdi) movq %r9,48(%rdi) mulxq 32(%rsi),%r8,%rax adoxq %rbp,%r13 adcxq %rbp,%r13 mulxq 40(%rsi),%r9,%rbx adcxq %r10,%r8 adoxq %rax,%r9 mulxq 48(%rsi),%r10,%rax adcxq %r11,%r9 adoxq %r12,%r10 mulxq 56(%rsi),%r11,%r12 movq 32(%rsi),%rdx movq 40(%rsi),%r14 adcxq %rbx,%r10 adoxq %rax,%r11 movq 48(%rsi),%r15 adcxq %r13,%r11 adoxq %rbp,%r12 adcxq %rbp,%r12 movq %r8,56(%rdi) movq %r9,64(%rdi) mulxq %r14,%r9,%rax movq 56(%rsi),%r8 adcxq %r10,%r9 mulxq %r15,%r10,%rbx adoxq %rax,%r10 adcxq %r11,%r10 mulxq %r8,%r11,%rax movq %r14,%rdx adoxq %rbx,%r11 adcxq %r12,%r11 adcxq %rbp,%rax mulxq %r15,%r14,%rbx mulxq %r8,%r12,%r13 movq %r15,%rdx leaq 64(%rsi),%rsi adcxq %r14,%r11 adoxq %rbx,%r12 adcxq %rax,%r12 adoxq %rbp,%r13 .byte 0x67,0x67 mulxq %r8,%r8,%r14 adcxq %r8,%r13 adcxq %rbp,%r14 cmpq 8+8(%rsp),%rsi je L$sqrx8x_outer_break negq %rcx movq $-8,%rcx movq %rbp,%r15 movq 64(%rdi),%r8 adcxq 72(%rdi),%r9 adcxq 80(%rdi),%r10 adcxq 88(%rdi),%r11 adcq 96(%rdi),%r12 adcq 104(%rdi),%r13 adcq 112(%rdi),%r14 adcq 120(%rdi),%r15 leaq (%rsi),%rbp leaq 128(%rdi),%rdi sbbq %rax,%rax movq -64(%rsi),%rdx movq %rax,16+8(%rsp) movq %rdi,24+8(%rsp) xorl %eax,%eax jmp L$sqrx8x_loop .p2align 5 L$sqrx8x_loop: movq %r8,%rbx mulxq 0(%rbp),%rax,%r8 adcxq %rax,%rbx adoxq %r9,%r8 mulxq 8(%rbp),%rax,%r9 adcxq %rax,%r8 adoxq %r10,%r9 mulxq 16(%rbp),%rax,%r10 adcxq %rax,%r9 adoxq %r11,%r10 mulxq 24(%rbp),%rax,%r11 adcxq %rax,%r10 adoxq %r12,%r11 .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 adcxq %rax,%r11 adoxq %r13,%r12 mulxq 40(%rbp),%rax,%r13 adcxq %rax,%r12 adoxq %r14,%r13 mulxq 48(%rbp),%rax,%r14 movq %rbx,(%rdi,%rcx,8) movl 
$0,%ebx adcxq %rax,%r13 adoxq %r15,%r14 .byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 movq 8(%rsi,%rcx,8),%rdx adcxq %rax,%r14 adoxq %rbx,%r15 adcxq %rbx,%r15 .byte 0x67 incq %rcx jnz L$sqrx8x_loop leaq 64(%rbp),%rbp movq $-8,%rcx cmpq 8+8(%rsp),%rbp je L$sqrx8x_break subq 16+8(%rsp),%rbx .byte 0x66 movq -64(%rsi),%rdx adcxq 0(%rdi),%r8 adcxq 8(%rdi),%r9 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq 40(%rdi),%r13 adcq 48(%rdi),%r14 adcq 56(%rdi),%r15 leaq 64(%rdi),%rdi .byte 0x67 sbbq %rax,%rax xorl %ebx,%ebx movq %rax,16+8(%rsp) jmp L$sqrx8x_loop .p2align 5 L$sqrx8x_break: xorq %rbp,%rbp subq 16+8(%rsp),%rbx adcxq %rbp,%r8 movq 24+8(%rsp),%rcx adcxq %rbp,%r9 movq 0(%rsi),%rdx adcq $0,%r10 movq %r8,0(%rdi) adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 adcq $0,%r14 adcq $0,%r15 cmpq %rcx,%rdi je L$sqrx8x_outer_loop movq %r9,8(%rdi) movq 8(%rcx),%r9 movq %r10,16(%rdi) movq 16(%rcx),%r10 movq %r11,24(%rdi) movq 24(%rcx),%r11 movq %r12,32(%rdi) movq 32(%rcx),%r12 movq %r13,40(%rdi) movq 40(%rcx),%r13 movq %r14,48(%rdi) movq 48(%rcx),%r14 movq %r15,56(%rdi) movq 56(%rcx),%r15 movq %rcx,%rdi jmp L$sqrx8x_outer_loop .p2align 5 L$sqrx8x_outer_break: movq %r9,72(%rdi) .byte 102,72,15,126,217 movq %r10,80(%rdi) movq %r11,88(%rdi) movq %r12,96(%rdi) movq %r13,104(%rdi) movq %r14,112(%rdi) leaq 48+8(%rsp),%rdi movq (%rsi,%rcx,1),%rdx movq 8(%rdi),%r11 xorq %r10,%r10 movq 0+8(%rsp),%r9 adoxq %r11,%r11 movq 16(%rdi),%r12 movq 24(%rdi),%r13 .p2align 5 L$sqrx4x_shift_n_add: mulxq %rdx,%rax,%rbx adoxq %r12,%r12 adcxq %r10,%rax .byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 .byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 adoxq %r13,%r13 adcxq %r11,%rbx movq 40(%rdi),%r11 movq %rax,0(%rdi) movq %rbx,8(%rdi) mulxq %rdx,%rax,%rbx adoxq %r10,%r10 adcxq %r12,%rax movq 16(%rsi,%rcx,1),%rdx movq 48(%rdi),%r12 adoxq %r11,%r11 adcxq %r13,%rbx movq 56(%rdi),%r13 movq %rax,16(%rdi) movq %rbx,24(%rdi) mulxq %rdx,%rax,%rbx adoxq %r12,%r12 adcxq %r10,%rax movq 24(%rsi,%rcx,1),%rdx leaq 32(%rcx),%rcx movq 64(%rdi),%r10 adoxq %r13,%r13 adcxq %r11,%rbx movq 72(%rdi),%r11 movq %rax,32(%rdi) movq %rbx,40(%rdi) mulxq %rdx,%rax,%rbx adoxq %r10,%r10 adcxq %r12,%rax jrcxz L$sqrx4x_shift_n_add_break .byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 adoxq %r11,%r11 adcxq %r13,%rbx movq 80(%rdi),%r12 movq 88(%rdi),%r13 movq %rax,48(%rdi) movq %rbx,56(%rdi) leaq 64(%rdi),%rdi nop jmp L$sqrx4x_shift_n_add .p2align 5 L$sqrx4x_shift_n_add_break: adcxq %r13,%rbx movq %rax,48(%rdi) movq %rbx,56(%rdi) leaq 64(%rdi),%rdi .byte 102,72,15,126,213 __bn_sqrx8x_reduction: xorl %eax,%eax movq 32+8(%rsp),%rbx movq 48+8(%rsp),%rdx leaq -64(%rbp,%r9,1),%rcx movq %rcx,0+8(%rsp) movq %rdi,8+8(%rsp) leaq 48+8(%rsp),%rdi jmp L$sqrx8x_reduction_loop .p2align 5 L$sqrx8x_reduction_loop: movq 8(%rdi),%r9 movq 16(%rdi),%r10 movq 24(%rdi),%r11 movq 32(%rdi),%r12 movq %rdx,%r8 imulq %rbx,%rdx movq 40(%rdi),%r13 movq 48(%rdi),%r14 movq 56(%rdi),%r15 movq %rax,24+8(%rsp) leaq 64(%rdi),%rdi xorq %rsi,%rsi movq $-8,%rcx jmp L$sqrx8x_reduce .p2align 5 L$sqrx8x_reduce: movq %r8,%rbx mulxq 0(%rbp),%rax,%r8 adcxq %rbx,%rax adoxq %r9,%r8 mulxq 8(%rbp),%rbx,%r9 adcxq %rbx,%r8 adoxq %r10,%r9 mulxq 16(%rbp),%rbx,%r10 adcxq %rbx,%r9 adoxq %r11,%r10 mulxq 24(%rbp),%rbx,%r11 adcxq %rbx,%r10 adoxq %r12,%r11 .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 movq %rdx,%rax movq %r8,%rdx adcxq %rbx,%r11 adoxq %r13,%r12 mulxq 32+8(%rsp),%rbx,%rdx movq %rax,%rdx movq %rax,64+48+8(%rsp,%rcx,8) mulxq 40(%rbp),%rax,%r13 adcxq %rax,%r12 adoxq %r14,%r13 mulxq 48(%rbp),%rax,%r14 
adcxq %rax,%r13 adoxq %r15,%r14 mulxq 56(%rbp),%rax,%r15 movq %rbx,%rdx adcxq %rax,%r14 adoxq %rsi,%r15 adcxq %rsi,%r15 .byte 0x67,0x67,0x67 incq %rcx jnz L$sqrx8x_reduce movq %rsi,%rax cmpq 0+8(%rsp),%rbp jae L$sqrx8x_no_tail movq 48+8(%rsp),%rdx addq 0(%rdi),%r8 leaq 64(%rbp),%rbp movq $-8,%rcx adcxq 8(%rdi),%r9 adcxq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq 40(%rdi),%r13 adcq 48(%rdi),%r14 adcq 56(%rdi),%r15 leaq 64(%rdi),%rdi sbbq %rax,%rax xorq %rsi,%rsi movq %rax,16+8(%rsp) jmp L$sqrx8x_tail .p2align 5 L$sqrx8x_tail: movq %r8,%rbx mulxq 0(%rbp),%rax,%r8 adcxq %rax,%rbx adoxq %r9,%r8 mulxq 8(%rbp),%rax,%r9 adcxq %rax,%r8 adoxq %r10,%r9 mulxq 16(%rbp),%rax,%r10 adcxq %rax,%r9 adoxq %r11,%r10 mulxq 24(%rbp),%rax,%r11 adcxq %rax,%r10 adoxq %r12,%r11 .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 adcxq %rax,%r11 adoxq %r13,%r12 mulxq 40(%rbp),%rax,%r13 adcxq %rax,%r12 adoxq %r14,%r13 mulxq 48(%rbp),%rax,%r14 adcxq %rax,%r13 adoxq %r15,%r14 mulxq 56(%rbp),%rax,%r15 movq 72+48+8(%rsp,%rcx,8),%rdx adcxq %rax,%r14 adoxq %rsi,%r15 movq %rbx,(%rdi,%rcx,8) movq %r8,%rbx adcxq %rsi,%r15 incq %rcx jnz L$sqrx8x_tail cmpq 0+8(%rsp),%rbp jae L$sqrx8x_tail_done subq 16+8(%rsp),%rsi movq 48+8(%rsp),%rdx leaq 64(%rbp),%rbp adcq 0(%rdi),%r8 adcq 8(%rdi),%r9 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq 40(%rdi),%r13 adcq 48(%rdi),%r14 adcq 56(%rdi),%r15 leaq 64(%rdi),%rdi sbbq %rax,%rax subq $8,%rcx xorq %rsi,%rsi movq %rax,16+8(%rsp) jmp L$sqrx8x_tail .p2align 5 L$sqrx8x_tail_done: xorq %rax,%rax addq 24+8(%rsp),%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 adcq $0,%r14 adcq $0,%r15 adcq $0,%rax subq 16+8(%rsp),%rsi L$sqrx8x_no_tail: adcq 0(%rdi),%r8 .byte 102,72,15,126,217 adcq 8(%rdi),%r9 movq 56(%rbp),%rsi .byte 102,72,15,126,213 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq 40(%rdi),%r13 adcq 48(%rdi),%r14 adcq 56(%rdi),%r15 adcq $0,%rax movq 32+8(%rsp),%rbx movq 64(%rdi,%rcx,1),%rdx movq %r8,0(%rdi) leaq 64(%rdi),%r8 movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq %r14,48(%rdi) movq %r15,56(%rdi) leaq 64(%rdi,%rcx,1),%rdi cmpq 8+8(%rsp),%r8 jb L$sqrx8x_reduction_loop ret .p2align 5 __bn_postx4x_internal: movq 0(%rbp),%r12 movq %rcx,%r10 movq %rcx,%r9 negq %rax sarq $3+2,%rcx .byte 102,72,15,126,202 .byte 102,72,15,126,206 decq %r12 movq 8(%rbp),%r13 xorq %r8,%r8 movq 16(%rbp),%r14 movq 24(%rbp),%r15 jmp L$sqrx4x_sub_entry .p2align 4 L$sqrx4x_sub: movq 0(%rbp),%r12 movq 8(%rbp),%r13 movq 16(%rbp),%r14 movq 24(%rbp),%r15 L$sqrx4x_sub_entry: andnq %rax,%r12,%r12 leaq 32(%rbp),%rbp andnq %rax,%r13,%r13 andnq %rax,%r14,%r14 andnq %rax,%r15,%r15 negq %r8 adcq 0(%rdi),%r12 adcq 8(%rdi),%r13 adcq 16(%rdi),%r14 adcq 24(%rdi),%r15 movq %r12,0(%rdx) leaq 32(%rdi),%rdi movq %r13,8(%rdx) sbbq %r8,%r8 movq %r14,16(%rdx) movq %r15,24(%rdx) leaq 32(%rdx),%rdx incq %rcx jnz L$sqrx4x_sub negq %r9 ret .globl _bn_scatter5 .private_extern _bn_scatter5 .p2align 4 _bn_scatter5: _CET_ENDBR cmpl $0,%esi jz L$scatter_epilogue leaq (%rdx,%rcx,8),%rdx L$scatter: movq (%rdi),%rax leaq 8(%rdi),%rdi movq %rax,(%rdx) leaq 256(%rdx),%rdx subl $1,%esi jnz L$scatter L$scatter_epilogue: ret .globl _bn_gather5 .private_extern _bn_gather5 .p2align 5 _bn_gather5: L$SEH_begin_bn_gather5: _CET_ENDBR .byte 0x4c,0x8d,0x14,0x24 .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 leaq L$inc(%rip),%rax andq $-16,%rsp movd %ecx,%xmm5 movdqa 0(%rax),%xmm0 movdqa 16(%rax),%xmm1 leaq 128(%rdx),%r11 leaq 128(%rsp),%rax pshufd 
$0,%xmm5,%xmm5 movdqa %xmm1,%xmm4 movdqa %xmm1,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,-128(%rax) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,-112(%rax) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 movdqa %xmm2,-96(%rax) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,-80(%rax) movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,-64(%rax) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,-48(%rax) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 movdqa %xmm2,-32(%rax) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,-16(%rax) movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,0(%rax) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,16(%rax) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 movdqa %xmm2,32(%rax) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,48(%rax) movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,64(%rax) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,80(%rax) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 movdqa %xmm2,96(%rax) movdqa %xmm4,%xmm2 movdqa %xmm3,112(%rax) jmp L$gather .p2align 5 L$gather: pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 movdqa -128(%r11),%xmm0 movdqa -112(%r11),%xmm1 movdqa -96(%r11),%xmm2 pand -128(%rax),%xmm0 movdqa -80(%r11),%xmm3 pand -112(%rax),%xmm1 por %xmm0,%xmm4 pand -96(%rax),%xmm2 por %xmm1,%xmm5 pand -80(%rax),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqa -64(%r11),%xmm0 movdqa -48(%r11),%xmm1 movdqa -32(%r11),%xmm2 pand -64(%rax),%xmm0 movdqa -16(%r11),%xmm3 pand -48(%rax),%xmm1 por %xmm0,%xmm4 pand -32(%rax),%xmm2 por %xmm1,%xmm5 pand -16(%rax),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqa 0(%r11),%xmm0 movdqa 16(%r11),%xmm1 movdqa 32(%r11),%xmm2 pand 0(%rax),%xmm0 movdqa 48(%r11),%xmm3 pand 16(%rax),%xmm1 por %xmm0,%xmm4 pand 32(%rax),%xmm2 por %xmm1,%xmm5 pand 48(%rax),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqa 64(%r11),%xmm0 movdqa 80(%r11),%xmm1 movdqa 96(%r11),%xmm2 pand 64(%rax),%xmm0 movdqa 112(%r11),%xmm3 pand 80(%rax),%xmm1 por %xmm0,%xmm4 pand 96(%rax),%xmm2 por %xmm1,%xmm5 pand 112(%rax),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 por %xmm5,%xmm4 leaq 256(%r11),%r11 pshufd $0x4e,%xmm4,%xmm0 por %xmm4,%xmm0 movq %xmm0,(%rdi) leaq 8(%rdi),%rdi subl $1,%esi jnz L$gather leaq (%r10),%rsp ret L$SEH_end_bn_gather5: .section __DATA,__const .p2align 6 L$inc: .long 0,0, 1,1 .long 2,2, 2,2 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .text #endif ring-0.17.14/pregenerated/x86_64-mont5-nasm.asm000064400000000000000000001477421046102023000170550ustar 00000000000000; This file is generated from a similarly-named Perl script in the BoringSSL ; source tree. Do not edit by hand. 
%ifidn __OUTPUT_FORMAT__, win64 default rel %define XMMWORD %define YMMWORD %define ZMMWORD %define _CET_ENDBR %include "ring_core_generated/prefix_symbols_nasm.inc" section .text code align=64 global bn_mul4x_mont_gather5 ALIGN 32 bn_mul4x_mont_gather5: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_bn_mul4x_mont_gather5: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD[40+rsp] mov r9,QWORD[48+rsp] _CET_ENDBR DB 0x67 mov rax,rsp push rbx push rbp push r12 push r13 push r14 push r15 $L$mul4x_prologue: DB 0x67 shl r9d,3 lea r10,[r9*2+r9] neg r9 lea r11,[((-320))+r9*2+rsp] mov rbp,rsp sub r11,rdi and r11,4095 cmp r10,r11 jb NEAR $L$mul4xsp_alt sub rbp,r11 lea rbp,[((-320))+r9*2+rbp] jmp NEAR $L$mul4xsp_done ALIGN 32 $L$mul4xsp_alt: lea r10,[((4096-320))+r9*2] lea rbp,[((-320))+r9*2+rbp] sub r11,r10 mov r10,0 cmovc r11,r10 sub rbp,r11 $L$mul4xsp_done: and rbp,-64 mov r11,rsp sub r11,rbp and r11,-4096 lea rsp,[rbp*1+r11] mov r10,QWORD[rsp] cmp rsp,rbp ja NEAR $L$mul4x_page_walk jmp NEAR $L$mul4x_page_walk_done $L$mul4x_page_walk: lea rsp,[((-4096))+rsp] mov r10,QWORD[rsp] cmp rsp,rbp ja NEAR $L$mul4x_page_walk $L$mul4x_page_walk_done: neg r9 mov QWORD[40+rsp],rax $L$mul4x_body: call mul4x_internal mov rsi,QWORD[40+rsp] mov rax,1 mov r15,QWORD[((-48))+rsi] mov r14,QWORD[((-40))+rsi] mov r13,QWORD[((-32))+rsi] mov r12,QWORD[((-24))+rsi] mov rbp,QWORD[((-16))+rsi] mov rbx,QWORD[((-8))+rsi] lea rsp,[rsi] $L$mul4x_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$SEH_end_bn_mul4x_mont_gather5: ALIGN 32 mul4x_internal: shl r9,5 movd xmm5,DWORD[56+rax] lea rax,[$L$inc] lea r13,[128+r9*1+rdx] shr r9,5 movdqa xmm0,XMMWORD[rax] movdqa xmm1,XMMWORD[16+rax] lea r10,[((88-112))+r9*1+rsp] lea r12,[128+rdx] pshufd xmm5,xmm5,0 movdqa xmm4,xmm1 DB 0x67,0x67 movdqa xmm2,xmm1 paddd xmm1,xmm0 pcmpeqd xmm0,xmm5 DB 0x67 movdqa xmm3,xmm4 paddd xmm2,xmm1 pcmpeqd xmm1,xmm5 movdqa XMMWORD[112+r10],xmm0 movdqa xmm0,xmm4 paddd xmm3,xmm2 pcmpeqd xmm2,xmm5 movdqa XMMWORD[128+r10],xmm1 movdqa xmm1,xmm4 paddd xmm0,xmm3 pcmpeqd xmm3,xmm5 movdqa XMMWORD[144+r10],xmm2 movdqa xmm2,xmm4 paddd xmm1,xmm0 pcmpeqd xmm0,xmm5 movdqa XMMWORD[160+r10],xmm3 movdqa xmm3,xmm4 paddd xmm2,xmm1 pcmpeqd xmm1,xmm5 movdqa XMMWORD[176+r10],xmm0 movdqa xmm0,xmm4 paddd xmm3,xmm2 pcmpeqd xmm2,xmm5 movdqa XMMWORD[192+r10],xmm1 movdqa xmm1,xmm4 paddd xmm0,xmm3 pcmpeqd xmm3,xmm5 movdqa XMMWORD[208+r10],xmm2 movdqa xmm2,xmm4 paddd xmm1,xmm0 pcmpeqd xmm0,xmm5 movdqa XMMWORD[224+r10],xmm3 movdqa xmm3,xmm4 paddd xmm2,xmm1 pcmpeqd xmm1,xmm5 movdqa XMMWORD[240+r10],xmm0 movdqa xmm0,xmm4 paddd xmm3,xmm2 pcmpeqd xmm2,xmm5 movdqa XMMWORD[256+r10],xmm1 movdqa xmm1,xmm4 paddd xmm0,xmm3 pcmpeqd xmm3,xmm5 movdqa XMMWORD[272+r10],xmm2 movdqa xmm2,xmm4 paddd xmm1,xmm0 pcmpeqd xmm0,xmm5 movdqa XMMWORD[288+r10],xmm3 movdqa xmm3,xmm4 paddd xmm2,xmm1 pcmpeqd xmm1,xmm5 movdqa XMMWORD[304+r10],xmm0 paddd xmm3,xmm2 DB 0x67 pcmpeqd xmm2,xmm5 movdqa XMMWORD[320+r10],xmm1 pcmpeqd xmm3,xmm5 movdqa XMMWORD[336+r10],xmm2 pand xmm0,XMMWORD[64+r12] pand xmm1,XMMWORD[80+r12] pand xmm2,XMMWORD[96+r12] movdqa XMMWORD[352+r10],xmm3 pand xmm3,XMMWORD[112+r12] por xmm0,xmm2 por xmm1,xmm3 movdqa xmm4,XMMWORD[((-128))+r12] movdqa xmm5,XMMWORD[((-112))+r12] movdqa xmm2,XMMWORD[((-96))+r12] pand xmm4,XMMWORD[112+r10] movdqa xmm3,XMMWORD[((-80))+r12] pand xmm5,XMMWORD[128+r10] por xmm0,xmm4 pand xmm2,XMMWORD[144+r10] por xmm1,xmm5 pand xmm3,XMMWORD[160+r10] por xmm0,xmm2 por xmm1,xmm3 movdqa xmm4,XMMWORD[((-64))+r12] 
movdqa xmm5,XMMWORD[((-48))+r12] movdqa xmm2,XMMWORD[((-32))+r12] pand xmm4,XMMWORD[176+r10] movdqa xmm3,XMMWORD[((-16))+r12] pand xmm5,XMMWORD[192+r10] por xmm0,xmm4 pand xmm2,XMMWORD[208+r10] por xmm1,xmm5 pand xmm3,XMMWORD[224+r10] por xmm0,xmm2 por xmm1,xmm3 movdqa xmm4,XMMWORD[r12] movdqa xmm5,XMMWORD[16+r12] movdqa xmm2,XMMWORD[32+r12] pand xmm4,XMMWORD[240+r10] movdqa xmm3,XMMWORD[48+r12] pand xmm5,XMMWORD[256+r10] por xmm0,xmm4 pand xmm2,XMMWORD[272+r10] por xmm1,xmm5 pand xmm3,XMMWORD[288+r10] por xmm0,xmm2 por xmm1,xmm3 por xmm0,xmm1 pshufd xmm1,xmm0,0x4e por xmm0,xmm1 lea r12,[256+r12] DB 102,72,15,126,195 mov QWORD[((16+8))+rsp],r13 mov QWORD[((56+8))+rsp],rdi mov r8,QWORD[r8] mov rax,QWORD[rsi] lea rsi,[r9*1+rsi] neg r9 mov rbp,r8 mul rbx mov r10,rax mov rax,QWORD[rcx] imul rbp,r10 lea r14,[((64+8))+rsp] mov r11,rdx mul rbp add r10,rax mov rax,QWORD[8+r9*1+rsi] adc rdx,0 mov rdi,rdx mul rbx add r11,rax mov rax,QWORD[8+rcx] adc rdx,0 mov r10,rdx mul rbp add rdi,rax mov rax,QWORD[16+r9*1+rsi] adc rdx,0 add rdi,r11 lea r15,[32+r9] lea rcx,[32+rcx] adc rdx,0 mov QWORD[r14],rdi mov r13,rdx jmp NEAR $L$1st4x ALIGN 32 $L$1st4x: mul rbx add r10,rax mov rax,QWORD[((-16))+rcx] lea r14,[32+r14] adc rdx,0 mov r11,rdx mul rbp add r13,rax mov rax,QWORD[((-8))+r15*1+rsi] adc rdx,0 add r13,r10 adc rdx,0 mov QWORD[((-24))+r14],r13 mov rdi,rdx mul rbx add r11,rax mov rax,QWORD[((-8))+rcx] adc rdx,0 mov r10,rdx mul rbp add rdi,rax mov rax,QWORD[r15*1+rsi] adc rdx,0 add rdi,r11 adc rdx,0 mov QWORD[((-16))+r14],rdi mov r13,rdx mul rbx add r10,rax mov rax,QWORD[rcx] adc rdx,0 mov r11,rdx mul rbp add r13,rax mov rax,QWORD[8+r15*1+rsi] adc rdx,0 add r13,r10 adc rdx,0 mov QWORD[((-8))+r14],r13 mov rdi,rdx mul rbx add r11,rax mov rax,QWORD[8+rcx] adc rdx,0 mov r10,rdx mul rbp add rdi,rax mov rax,QWORD[16+r15*1+rsi] adc rdx,0 add rdi,r11 lea rcx,[32+rcx] adc rdx,0 mov QWORD[r14],rdi mov r13,rdx add r15,32 jnz NEAR $L$1st4x mul rbx add r10,rax mov rax,QWORD[((-16))+rcx] lea r14,[32+r14] adc rdx,0 mov r11,rdx mul rbp add r13,rax mov rax,QWORD[((-8))+rsi] adc rdx,0 add r13,r10 adc rdx,0 mov QWORD[((-24))+r14],r13 mov rdi,rdx mul rbx add r11,rax mov rax,QWORD[((-8))+rcx] adc rdx,0 mov r10,rdx mul rbp add rdi,rax mov rax,QWORD[r9*1+rsi] adc rdx,0 add rdi,r11 adc rdx,0 mov QWORD[((-16))+r14],rdi mov r13,rdx lea rcx,[r9*1+rcx] xor rdi,rdi add r13,r10 adc rdi,0 mov QWORD[((-8))+r14],r13 jmp NEAR $L$outer4x ALIGN 32 $L$outer4x: lea rdx,[((16+128))+r14] pxor xmm4,xmm4 pxor xmm5,xmm5 movdqa xmm0,XMMWORD[((-128))+r12] movdqa xmm1,XMMWORD[((-112))+r12] movdqa xmm2,XMMWORD[((-96))+r12] movdqa xmm3,XMMWORD[((-80))+r12] pand xmm0,XMMWORD[((-128))+rdx] pand xmm1,XMMWORD[((-112))+rdx] por xmm4,xmm0 pand xmm2,XMMWORD[((-96))+rdx] por xmm5,xmm1 pand xmm3,XMMWORD[((-80))+rdx] por xmm4,xmm2 por xmm5,xmm3 movdqa xmm0,XMMWORD[((-64))+r12] movdqa xmm1,XMMWORD[((-48))+r12] movdqa xmm2,XMMWORD[((-32))+r12] movdqa xmm3,XMMWORD[((-16))+r12] pand xmm0,XMMWORD[((-64))+rdx] pand xmm1,XMMWORD[((-48))+rdx] por xmm4,xmm0 pand xmm2,XMMWORD[((-32))+rdx] por xmm5,xmm1 pand xmm3,XMMWORD[((-16))+rdx] por xmm4,xmm2 por xmm5,xmm3 movdqa xmm0,XMMWORD[r12] movdqa xmm1,XMMWORD[16+r12] movdqa xmm2,XMMWORD[32+r12] movdqa xmm3,XMMWORD[48+r12] pand xmm0,XMMWORD[rdx] pand xmm1,XMMWORD[16+rdx] por xmm4,xmm0 pand xmm2,XMMWORD[32+rdx] por xmm5,xmm1 pand xmm3,XMMWORD[48+rdx] por xmm4,xmm2 por xmm5,xmm3 movdqa xmm0,XMMWORD[64+r12] movdqa xmm1,XMMWORD[80+r12] movdqa xmm2,XMMWORD[96+r12] movdqa xmm3,XMMWORD[112+r12] pand xmm0,XMMWORD[64+rdx] pand 
xmm1,XMMWORD[80+rdx] por xmm4,xmm0 pand xmm2,XMMWORD[96+rdx] por xmm5,xmm1 pand xmm3,XMMWORD[112+rdx] por xmm4,xmm2 por xmm5,xmm3 por xmm4,xmm5 pshufd xmm0,xmm4,0x4e por xmm0,xmm4 lea r12,[256+r12] DB 102,72,15,126,195 mov r10,QWORD[r9*1+r14] mov rbp,r8 mul rbx add r10,rax mov rax,QWORD[rcx] adc rdx,0 imul rbp,r10 mov r11,rdx mov QWORD[r14],rdi lea r14,[r9*1+r14] mul rbp add r10,rax mov rax,QWORD[8+r9*1+rsi] adc rdx,0 mov rdi,rdx mul rbx add r11,rax mov rax,QWORD[8+rcx] adc rdx,0 add r11,QWORD[8+r14] adc rdx,0 mov r10,rdx mul rbp add rdi,rax mov rax,QWORD[16+r9*1+rsi] adc rdx,0 add rdi,r11 lea r15,[32+r9] lea rcx,[32+rcx] adc rdx,0 mov r13,rdx jmp NEAR $L$inner4x ALIGN 32 $L$inner4x: mul rbx add r10,rax mov rax,QWORD[((-16))+rcx] adc rdx,0 add r10,QWORD[16+r14] lea r14,[32+r14] adc rdx,0 mov r11,rdx mul rbp add r13,rax mov rax,QWORD[((-8))+r15*1+rsi] adc rdx,0 add r13,r10 adc rdx,0 mov QWORD[((-32))+r14],rdi mov rdi,rdx mul rbx add r11,rax mov rax,QWORD[((-8))+rcx] adc rdx,0 add r11,QWORD[((-8))+r14] adc rdx,0 mov r10,rdx mul rbp add rdi,rax mov rax,QWORD[r15*1+rsi] adc rdx,0 add rdi,r11 adc rdx,0 mov QWORD[((-24))+r14],r13 mov r13,rdx mul rbx add r10,rax mov rax,QWORD[rcx] adc rdx,0 add r10,QWORD[r14] adc rdx,0 mov r11,rdx mul rbp add r13,rax mov rax,QWORD[8+r15*1+rsi] adc rdx,0 add r13,r10 adc rdx,0 mov QWORD[((-16))+r14],rdi mov rdi,rdx mul rbx add r11,rax mov rax,QWORD[8+rcx] adc rdx,0 add r11,QWORD[8+r14] adc rdx,0 mov r10,rdx mul rbp add rdi,rax mov rax,QWORD[16+r15*1+rsi] adc rdx,0 add rdi,r11 lea rcx,[32+rcx] adc rdx,0 mov QWORD[((-8))+r14],r13 mov r13,rdx add r15,32 jnz NEAR $L$inner4x mul rbx add r10,rax mov rax,QWORD[((-16))+rcx] adc rdx,0 add r10,QWORD[16+r14] lea r14,[32+r14] adc rdx,0 mov r11,rdx mul rbp add r13,rax mov rax,QWORD[((-8))+rsi] adc rdx,0 add r13,r10 adc rdx,0 mov QWORD[((-32))+r14],rdi mov rdi,rdx mul rbx add r11,rax mov rax,rbp mov rbp,QWORD[((-8))+rcx] adc rdx,0 add r11,QWORD[((-8))+r14] adc rdx,0 mov r10,rdx mul rbp add rdi,rax mov rax,QWORD[r9*1+rsi] adc rdx,0 add rdi,r11 adc rdx,0 mov QWORD[((-24))+r14],r13 mov r13,rdx mov QWORD[((-16))+r14],rdi lea rcx,[r9*1+rcx] xor rdi,rdi add r13,r10 adc rdi,0 add r13,QWORD[r14] adc rdi,0 mov QWORD[((-8))+r14],r13 cmp r12,QWORD[((16+8))+rsp] jb NEAR $L$outer4x xor rax,rax sub rbp,r13 adc r15,r15 or rdi,r15 sub rax,rdi lea rbx,[r9*1+r14] mov r12,QWORD[rcx] lea rbp,[rcx] mov rcx,r9 sar rcx,3+2 mov rdi,QWORD[((56+8))+rsp] dec r12 xor r10,r10 mov r13,QWORD[8+rbp] mov r14,QWORD[16+rbp] mov r15,QWORD[24+rbp] jmp NEAR $L$sqr4x_sub_entry global bn_power5_nohw ALIGN 32 bn_power5_nohw: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_bn_power5_nohw: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD[40+rsp] mov r9,QWORD[48+rsp] _CET_ENDBR mov rax,rsp push rbx push rbp push r12 push r13 push r14 push r15 $L$power5_prologue: shl r9d,3 lea r10d,[r9*2+r9] neg r9 mov r8,QWORD[r8] lea r11,[((-320))+r9*2+rsp] mov rbp,rsp sub r11,rdi and r11,4095 cmp r10,r11 jb NEAR $L$pwr_sp_alt sub rbp,r11 lea rbp,[((-320))+r9*2+rbp] jmp NEAR $L$pwr_sp_done ALIGN 32 $L$pwr_sp_alt: lea r10,[((4096-320))+r9*2] lea rbp,[((-320))+r9*2+rbp] sub r11,r10 mov r10,0 cmovc r11,r10 sub rbp,r11 $L$pwr_sp_done: and rbp,-64 mov r11,rsp sub r11,rbp and r11,-4096 lea rsp,[rbp*1+r11] mov r10,QWORD[rsp] cmp rsp,rbp ja NEAR $L$pwr_page_walk jmp NEAR $L$pwr_page_walk_done $L$pwr_page_walk: lea rsp,[((-4096))+rsp] mov r10,QWORD[rsp] cmp rsp,rbp ja NEAR $L$pwr_page_walk $L$pwr_page_walk_done: mov r10,r9 neg r9 mov QWORD[32+rsp],r8 
mov QWORD[40+rsp],rax $L$power5_body: DB 102,72,15,110,207 DB 102,72,15,110,209 DB 102,73,15,110,218 DB 102,72,15,110,226 call __bn_sqr8x_internal call __bn_post4x_internal call __bn_sqr8x_internal call __bn_post4x_internal call __bn_sqr8x_internal call __bn_post4x_internal call __bn_sqr8x_internal call __bn_post4x_internal call __bn_sqr8x_internal call __bn_post4x_internal DB 102,72,15,126,209 DB 102,72,15,126,226 mov rdi,rsi mov rax,QWORD[40+rsp] lea r8,[32+rsp] call mul4x_internal mov rsi,QWORD[40+rsp] mov rax,1 mov r15,QWORD[((-48))+rsi] mov r14,QWORD[((-40))+rsi] mov r13,QWORD[((-32))+rsi] mov r12,QWORD[((-24))+rsi] mov rbp,QWORD[((-16))+rsi] mov rbx,QWORD[((-8))+rsi] lea rsp,[rsi] $L$power5_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$SEH_end_bn_power5_nohw: global bn_sqr8x_internal ALIGN 32 bn_sqr8x_internal: __bn_sqr8x_internal: _CET_ENDBR lea rbp,[32+r10] lea rsi,[r9*1+rsi] mov rcx,r9 mov r14,QWORD[((-32))+rbp*1+rsi] lea rdi,[((48+8))+r9*2+rsp] mov rax,QWORD[((-24))+rbp*1+rsi] lea rdi,[((-32))+rbp*1+rdi] mov rbx,QWORD[((-16))+rbp*1+rsi] mov r15,rax mul r14 mov r10,rax mov rax,rbx mov r11,rdx mov QWORD[((-24))+rbp*1+rdi],r10 mul r14 add r11,rax mov rax,rbx adc rdx,0 mov QWORD[((-16))+rbp*1+rdi],r11 mov r10,rdx mov rbx,QWORD[((-8))+rbp*1+rsi] mul r15 mov r12,rax mov rax,rbx mov r13,rdx lea rcx,[rbp] mul r14 add r10,rax mov rax,rbx mov r11,rdx adc r11,0 add r10,r12 adc r11,0 mov QWORD[((-8))+rcx*1+rdi],r10 jmp NEAR $L$sqr4x_1st ALIGN 32 $L$sqr4x_1st: mov rbx,QWORD[rcx*1+rsi] mul r15 add r13,rax mov rax,rbx mov r12,rdx adc r12,0 mul r14 add r11,rax mov rax,rbx mov rbx,QWORD[8+rcx*1+rsi] mov r10,rdx adc r10,0 add r11,r13 adc r10,0 mul r15 add r12,rax mov rax,rbx mov QWORD[rcx*1+rdi],r11 mov r13,rdx adc r13,0 mul r14 add r10,rax mov rax,rbx mov rbx,QWORD[16+rcx*1+rsi] mov r11,rdx adc r11,0 add r10,r12 adc r11,0 mul r15 add r13,rax mov rax,rbx mov QWORD[8+rcx*1+rdi],r10 mov r12,rdx adc r12,0 mul r14 add r11,rax mov rax,rbx mov rbx,QWORD[24+rcx*1+rsi] mov r10,rdx adc r10,0 add r11,r13 adc r10,0 mul r15 add r12,rax mov rax,rbx mov QWORD[16+rcx*1+rdi],r11 mov r13,rdx adc r13,0 lea rcx,[32+rcx] mul r14 add r10,rax mov rax,rbx mov r11,rdx adc r11,0 add r10,r12 adc r11,0 mov QWORD[((-8))+rcx*1+rdi],r10 cmp rcx,0 jne NEAR $L$sqr4x_1st mul r15 add r13,rax lea rbp,[16+rbp] adc rdx,0 add r13,r11 adc rdx,0 mov QWORD[rdi],r13 mov r12,rdx mov QWORD[8+rdi],rdx jmp NEAR $L$sqr4x_outer ALIGN 32 $L$sqr4x_outer: mov r14,QWORD[((-32))+rbp*1+rsi] lea rdi,[((48+8))+r9*2+rsp] mov rax,QWORD[((-24))+rbp*1+rsi] lea rdi,[((-32))+rbp*1+rdi] mov rbx,QWORD[((-16))+rbp*1+rsi] mov r15,rax mul r14 mov r10,QWORD[((-24))+rbp*1+rdi] add r10,rax mov rax,rbx adc rdx,0 mov QWORD[((-24))+rbp*1+rdi],r10 mov r11,rdx mul r14 add r11,rax mov rax,rbx adc rdx,0 add r11,QWORD[((-16))+rbp*1+rdi] mov r10,rdx adc r10,0 mov QWORD[((-16))+rbp*1+rdi],r11 xor r12,r12 mov rbx,QWORD[((-8))+rbp*1+rsi] mul r15 add r12,rax mov rax,rbx adc rdx,0 add r12,QWORD[((-8))+rbp*1+rdi] mov r13,rdx adc r13,0 mul r14 add r10,rax mov rax,rbx adc rdx,0 add r10,r12 mov r11,rdx adc r11,0 mov QWORD[((-8))+rbp*1+rdi],r10 lea rcx,[rbp] jmp NEAR $L$sqr4x_inner ALIGN 32 $L$sqr4x_inner: mov rbx,QWORD[rcx*1+rsi] mul r15 add r13,rax mov rax,rbx mov r12,rdx adc r12,0 add r13,QWORD[rcx*1+rdi] adc r12,0 DB 0x67 mul r14 add r11,rax mov rax,rbx mov rbx,QWORD[8+rcx*1+rsi] mov r10,rdx adc r10,0 add r11,r13 adc r10,0 mul r15 add r12,rax mov QWORD[rcx*1+rdi],r11 mov rax,rbx mov r13,rdx adc r13,0 add r12,QWORD[8+rcx*1+rdi] lea rcx,[16+rcx] adc 
r13,0 mul r14 add r10,rax mov rax,rbx adc rdx,0 add r10,r12 mov r11,rdx adc r11,0 mov QWORD[((-8))+rcx*1+rdi],r10 cmp rcx,0 jne NEAR $L$sqr4x_inner DB 0x67 mul r15 add r13,rax adc rdx,0 add r13,r11 adc rdx,0 mov QWORD[rdi],r13 mov r12,rdx mov QWORD[8+rdi],rdx add rbp,16 jnz NEAR $L$sqr4x_outer mov r14,QWORD[((-32))+rsi] lea rdi,[((48+8))+r9*2+rsp] mov rax,QWORD[((-24))+rsi] lea rdi,[((-32))+rbp*1+rdi] mov rbx,QWORD[((-16))+rsi] mov r15,rax mul r14 add r10,rax mov rax,rbx mov r11,rdx adc r11,0 mul r14 add r11,rax mov rax,rbx mov QWORD[((-24))+rdi],r10 mov r10,rdx adc r10,0 add r11,r13 mov rbx,QWORD[((-8))+rsi] adc r10,0 mul r15 add r12,rax mov rax,rbx mov QWORD[((-16))+rdi],r11 mov r13,rdx adc r13,0 mul r14 add r10,rax mov rax,rbx mov r11,rdx adc r11,0 add r10,r12 adc r11,0 mov QWORD[((-8))+rdi],r10 mul r15 add r13,rax mov rax,QWORD[((-16))+rsi] adc rdx,0 add r13,r11 adc rdx,0 mov QWORD[rdi],r13 mov r12,rdx mov QWORD[8+rdi],rdx mul rbx add rbp,16 xor r14,r14 sub rbp,r9 xor r15,r15 add rax,r12 adc rdx,0 mov QWORD[8+rdi],rax mov QWORD[16+rdi],rdx mov QWORD[24+rdi],r15 mov rax,QWORD[((-16))+rbp*1+rsi] lea rdi,[((48+8))+rsp] xor r10,r10 mov r11,QWORD[8+rdi] lea r12,[r10*2+r14] shr r10,63 lea r13,[r11*2+rcx] shr r11,63 or r13,r10 mov r10,QWORD[16+rdi] mov r14,r11 mul rax neg r15 mov r11,QWORD[24+rdi] adc r12,rax mov rax,QWORD[((-8))+rbp*1+rsi] mov QWORD[rdi],r12 adc r13,rdx lea rbx,[r10*2+r14] mov QWORD[8+rdi],r13 sbb r15,r15 shr r10,63 lea r8,[r11*2+rcx] shr r11,63 or r8,r10 mov r10,QWORD[32+rdi] mov r14,r11 mul rax neg r15 mov r11,QWORD[40+rdi] adc rbx,rax mov rax,QWORD[rbp*1+rsi] mov QWORD[16+rdi],rbx adc r8,rdx lea rbp,[16+rbp] mov QWORD[24+rdi],r8 sbb r15,r15 lea rdi,[64+rdi] jmp NEAR $L$sqr4x_shift_n_add ALIGN 32 $L$sqr4x_shift_n_add: lea r12,[r10*2+r14] shr r10,63 lea r13,[r11*2+rcx] shr r11,63 or r13,r10 mov r10,QWORD[((-16))+rdi] mov r14,r11 mul rax neg r15 mov r11,QWORD[((-8))+rdi] adc r12,rax mov rax,QWORD[((-8))+rbp*1+rsi] mov QWORD[((-32))+rdi],r12 adc r13,rdx lea rbx,[r10*2+r14] mov QWORD[((-24))+rdi],r13 sbb r15,r15 shr r10,63 lea r8,[r11*2+rcx] shr r11,63 or r8,r10 mov r10,QWORD[rdi] mov r14,r11 mul rax neg r15 mov r11,QWORD[8+rdi] adc rbx,rax mov rax,QWORD[rbp*1+rsi] mov QWORD[((-16))+rdi],rbx adc r8,rdx lea r12,[r10*2+r14] mov QWORD[((-8))+rdi],r8 sbb r15,r15 shr r10,63 lea r13,[r11*2+rcx] shr r11,63 or r13,r10 mov r10,QWORD[16+rdi] mov r14,r11 mul rax neg r15 mov r11,QWORD[24+rdi] adc r12,rax mov rax,QWORD[8+rbp*1+rsi] mov QWORD[rdi],r12 adc r13,rdx lea rbx,[r10*2+r14] mov QWORD[8+rdi],r13 sbb r15,r15 shr r10,63 lea r8,[r11*2+rcx] shr r11,63 or r8,r10 mov r10,QWORD[32+rdi] mov r14,r11 mul rax neg r15 mov r11,QWORD[40+rdi] adc rbx,rax mov rax,QWORD[16+rbp*1+rsi] mov QWORD[16+rdi],rbx adc r8,rdx mov QWORD[24+rdi],r8 sbb r15,r15 lea rdi,[64+rdi] add rbp,32 jnz NEAR $L$sqr4x_shift_n_add lea r12,[r10*2+r14] DB 0x67 shr r10,63 lea r13,[r11*2+rcx] shr r11,63 or r13,r10 mov r10,QWORD[((-16))+rdi] mov r14,r11 mul rax neg r15 mov r11,QWORD[((-8))+rdi] adc r12,rax mov rax,QWORD[((-8))+rsi] mov QWORD[((-32))+rdi],r12 adc r13,rdx lea rbx,[r10*2+r14] mov QWORD[((-24))+rdi],r13 sbb r15,r15 shr r10,63 lea r8,[r11*2+rcx] shr r11,63 or r8,r10 mul rax neg r15 adc rbx,rax adc r8,rdx mov QWORD[((-16))+rdi],rbx mov QWORD[((-8))+rdi],r8 DB 102,72,15,126,213 __bn_sqr8x_reduction: xor rax,rax lea rcx,[rbp*1+r9] lea rdx,[((48+8))+r9*2+rsp] mov QWORD[((0+8))+rsp],rcx lea rdi,[((48+8))+r9*1+rsp] mov QWORD[((8+8))+rsp],rdx neg r9 jmp NEAR $L$8x_reduction_loop ALIGN 32 $L$8x_reduction_loop: lea 
rdi,[r9*1+rdi] DB 0x66 mov rbx,QWORD[rdi] mov r9,QWORD[8+rdi] mov r10,QWORD[16+rdi] mov r11,QWORD[24+rdi] mov r12,QWORD[32+rdi] mov r13,QWORD[40+rdi] mov r14,QWORD[48+rdi] mov r15,QWORD[56+rdi] mov QWORD[rdx],rax lea rdi,[64+rdi] DB 0x67 mov r8,rbx imul rbx,QWORD[((32+8))+rsp] mov rax,QWORD[rbp] mov ecx,8 jmp NEAR $L$8x_reduce ALIGN 32 $L$8x_reduce: mul rbx mov rax,QWORD[8+rbp] neg r8 mov r8,rdx adc r8,0 mul rbx add r9,rax mov rax,QWORD[16+rbp] adc rdx,0 add r8,r9 mov QWORD[((48-8+8))+rcx*8+rsp],rbx mov r9,rdx adc r9,0 mul rbx add r10,rax mov rax,QWORD[24+rbp] adc rdx,0 add r9,r10 mov rsi,QWORD[((32+8))+rsp] mov r10,rdx adc r10,0 mul rbx add r11,rax mov rax,QWORD[32+rbp] adc rdx,0 imul rsi,r8 add r10,r11 mov r11,rdx adc r11,0 mul rbx add r12,rax mov rax,QWORD[40+rbp] adc rdx,0 add r11,r12 mov r12,rdx adc r12,0 mul rbx add r13,rax mov rax,QWORD[48+rbp] adc rdx,0 add r12,r13 mov r13,rdx adc r13,0 mul rbx add r14,rax mov rax,QWORD[56+rbp] adc rdx,0 add r13,r14 mov r14,rdx adc r14,0 mul rbx mov rbx,rsi add r15,rax mov rax,QWORD[rbp] adc rdx,0 add r14,r15 mov r15,rdx adc r15,0 dec ecx jnz NEAR $L$8x_reduce lea rbp,[64+rbp] xor rax,rax mov rdx,QWORD[((8+8))+rsp] cmp rbp,QWORD[((0+8))+rsp] jae NEAR $L$8x_no_tail DB 0x66 add r8,QWORD[rdi] adc r9,QWORD[8+rdi] adc r10,QWORD[16+rdi] adc r11,QWORD[24+rdi] adc r12,QWORD[32+rdi] adc r13,QWORD[40+rdi] adc r14,QWORD[48+rdi] adc r15,QWORD[56+rdi] sbb rsi,rsi mov rbx,QWORD[((48+56+8))+rsp] mov ecx,8 mov rax,QWORD[rbp] jmp NEAR $L$8x_tail ALIGN 32 $L$8x_tail: mul rbx add r8,rax mov rax,QWORD[8+rbp] mov QWORD[rdi],r8 mov r8,rdx adc r8,0 mul rbx add r9,rax mov rax,QWORD[16+rbp] adc rdx,0 add r8,r9 lea rdi,[8+rdi] mov r9,rdx adc r9,0 mul rbx add r10,rax mov rax,QWORD[24+rbp] adc rdx,0 add r9,r10 mov r10,rdx adc r10,0 mul rbx add r11,rax mov rax,QWORD[32+rbp] adc rdx,0 add r10,r11 mov r11,rdx adc r11,0 mul rbx add r12,rax mov rax,QWORD[40+rbp] adc rdx,0 add r11,r12 mov r12,rdx adc r12,0 mul rbx add r13,rax mov rax,QWORD[48+rbp] adc rdx,0 add r12,r13 mov r13,rdx adc r13,0 mul rbx add r14,rax mov rax,QWORD[56+rbp] adc rdx,0 add r13,r14 mov r14,rdx adc r14,0 mul rbx mov rbx,QWORD[((48-16+8))+rcx*8+rsp] add r15,rax adc rdx,0 add r14,r15 mov rax,QWORD[rbp] mov r15,rdx adc r15,0 dec ecx jnz NEAR $L$8x_tail lea rbp,[64+rbp] mov rdx,QWORD[((8+8))+rsp] cmp rbp,QWORD[((0+8))+rsp] jae NEAR $L$8x_tail_done mov rbx,QWORD[((48+56+8))+rsp] neg rsi mov rax,QWORD[rbp] adc r8,QWORD[rdi] adc r9,QWORD[8+rdi] adc r10,QWORD[16+rdi] adc r11,QWORD[24+rdi] adc r12,QWORD[32+rdi] adc r13,QWORD[40+rdi] adc r14,QWORD[48+rdi] adc r15,QWORD[56+rdi] sbb rsi,rsi mov ecx,8 jmp NEAR $L$8x_tail ALIGN 32 $L$8x_tail_done: xor rax,rax add r8,QWORD[rdx] adc r9,0 adc r10,0 adc r11,0 adc r12,0 adc r13,0 adc r14,0 adc r15,0 adc rax,0 neg rsi $L$8x_no_tail: adc r8,QWORD[rdi] adc r9,QWORD[8+rdi] adc r10,QWORD[16+rdi] adc r11,QWORD[24+rdi] adc r12,QWORD[32+rdi] adc r13,QWORD[40+rdi] adc r14,QWORD[48+rdi] adc r15,QWORD[56+rdi] adc rax,0 mov rcx,QWORD[((-8))+rbp] xor rsi,rsi DB 102,72,15,126,213 mov QWORD[rdi],r8 mov QWORD[8+rdi],r9 DB 102,73,15,126,217 mov QWORD[16+rdi],r10 mov QWORD[24+rdi],r11 mov QWORD[32+rdi],r12 mov QWORD[40+rdi],r13 mov QWORD[48+rdi],r14 mov QWORD[56+rdi],r15 lea rdi,[64+rdi] cmp rdi,rdx jb NEAR $L$8x_reduction_loop ret ALIGN 32 __bn_post4x_internal: mov r12,QWORD[rbp] lea rbx,[r9*1+rdi] mov rcx,r9 DB 102,72,15,126,207 neg rax DB 102,72,15,126,206 sar rcx,3+2 dec r12 xor r10,r10 mov r13,QWORD[8+rbp] mov r14,QWORD[16+rbp] mov r15,QWORD[24+rbp] jmp NEAR $L$sqr4x_sub_entry ALIGN 16 
$L$sqr4x_sub: mov r12,QWORD[rbp] mov r13,QWORD[8+rbp] mov r14,QWORD[16+rbp] mov r15,QWORD[24+rbp] $L$sqr4x_sub_entry: lea rbp,[32+rbp] not r12 not r13 not r14 not r15 and r12,rax and r13,rax and r14,rax and r15,rax neg r10 adc r12,QWORD[rbx] adc r13,QWORD[8+rbx] adc r14,QWORD[16+rbx] adc r15,QWORD[24+rbx] mov QWORD[rdi],r12 lea rbx,[32+rbx] mov QWORD[8+rdi],r13 sbb r10,r10 mov QWORD[16+rdi],r14 mov QWORD[24+rdi],r15 lea rdi,[32+rdi] inc rcx jnz NEAR $L$sqr4x_sub mov r10,r9 neg r9 ret global bn_mulx4x_mont_gather5 ALIGN 32 bn_mulx4x_mont_gather5: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_bn_mulx4x_mont_gather5: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD[40+rsp] mov r9,QWORD[48+rsp] _CET_ENDBR mov rax,rsp push rbx push rbp push r12 push r13 push r14 push r15 $L$mulx4x_prologue: shl r9d,3 lea r10,[r9*2+r9] neg r9 mov r8,QWORD[r8] lea r11,[((-320))+r9*2+rsp] mov rbp,rsp sub r11,rdi and r11,4095 cmp r10,r11 jb NEAR $L$mulx4xsp_alt sub rbp,r11 lea rbp,[((-320))+r9*2+rbp] jmp NEAR $L$mulx4xsp_done $L$mulx4xsp_alt: lea r10,[((4096-320))+r9*2] lea rbp,[((-320))+r9*2+rbp] sub r11,r10 mov r10,0 cmovc r11,r10 sub rbp,r11 $L$mulx4xsp_done: and rbp,-64 mov r11,rsp sub r11,rbp and r11,-4096 lea rsp,[rbp*1+r11] mov r10,QWORD[rsp] cmp rsp,rbp ja NEAR $L$mulx4x_page_walk jmp NEAR $L$mulx4x_page_walk_done $L$mulx4x_page_walk: lea rsp,[((-4096))+rsp] mov r10,QWORD[rsp] cmp rsp,rbp ja NEAR $L$mulx4x_page_walk $L$mulx4x_page_walk_done: mov QWORD[32+rsp],r8 mov QWORD[40+rsp],rax $L$mulx4x_body: call mulx4x_internal mov rsi,QWORD[40+rsp] mov rax,1 mov r15,QWORD[((-48))+rsi] mov r14,QWORD[((-40))+rsi] mov r13,QWORD[((-32))+rsi] mov r12,QWORD[((-24))+rsi] mov rbp,QWORD[((-16))+rsi] mov rbx,QWORD[((-8))+rsi] lea rsp,[rsi] $L$mulx4x_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$SEH_end_bn_mulx4x_mont_gather5: ALIGN 32 mulx4x_internal: mov QWORD[8+rsp],r9 mov r10,r9 neg r9 shl r9,5 neg r10 lea r13,[128+r9*1+rdx] shr r9,5+5 movd xmm5,DWORD[56+rax] sub r9,1 lea rax,[$L$inc] mov QWORD[((16+8))+rsp],r13 mov QWORD[((24+8))+rsp],r9 mov QWORD[((56+8))+rsp],rdi movdqa xmm0,XMMWORD[rax] movdqa xmm1,XMMWORD[16+rax] lea r10,[((88-112))+r10*1+rsp] lea rdi,[128+rdx] pshufd xmm5,xmm5,0 movdqa xmm4,xmm1 DB 0x67 movdqa xmm2,xmm1 DB 0x67 paddd xmm1,xmm0 pcmpeqd xmm0,xmm5 movdqa xmm3,xmm4 paddd xmm2,xmm1 pcmpeqd xmm1,xmm5 movdqa XMMWORD[112+r10],xmm0 movdqa xmm0,xmm4 paddd xmm3,xmm2 pcmpeqd xmm2,xmm5 movdqa XMMWORD[128+r10],xmm1 movdqa xmm1,xmm4 paddd xmm0,xmm3 pcmpeqd xmm3,xmm5 movdqa XMMWORD[144+r10],xmm2 movdqa xmm2,xmm4 paddd xmm1,xmm0 pcmpeqd xmm0,xmm5 movdqa XMMWORD[160+r10],xmm3 movdqa xmm3,xmm4 paddd xmm2,xmm1 pcmpeqd xmm1,xmm5 movdqa XMMWORD[176+r10],xmm0 movdqa xmm0,xmm4 paddd xmm3,xmm2 pcmpeqd xmm2,xmm5 movdqa XMMWORD[192+r10],xmm1 movdqa xmm1,xmm4 paddd xmm0,xmm3 pcmpeqd xmm3,xmm5 movdqa XMMWORD[208+r10],xmm2 movdqa xmm2,xmm4 paddd xmm1,xmm0 pcmpeqd xmm0,xmm5 movdqa XMMWORD[224+r10],xmm3 movdqa xmm3,xmm4 paddd xmm2,xmm1 pcmpeqd xmm1,xmm5 movdqa XMMWORD[240+r10],xmm0 movdqa xmm0,xmm4 paddd xmm3,xmm2 pcmpeqd xmm2,xmm5 movdqa XMMWORD[256+r10],xmm1 movdqa xmm1,xmm4 paddd xmm0,xmm3 pcmpeqd xmm3,xmm5 movdqa XMMWORD[272+r10],xmm2 movdqa xmm2,xmm4 paddd xmm1,xmm0 pcmpeqd xmm0,xmm5 movdqa XMMWORD[288+r10],xmm3 movdqa xmm3,xmm4 DB 0x67 paddd xmm2,xmm1 pcmpeqd xmm1,xmm5 movdqa XMMWORD[304+r10],xmm0 paddd xmm3,xmm2 pcmpeqd xmm2,xmm5 movdqa XMMWORD[320+r10],xmm1 pcmpeqd xmm3,xmm5 movdqa XMMWORD[336+r10],xmm2 pand xmm0,XMMWORD[64+rdi] pand 
xmm1,XMMWORD[80+rdi] pand xmm2,XMMWORD[96+rdi] movdqa XMMWORD[352+r10],xmm3 pand xmm3,XMMWORD[112+rdi] por xmm0,xmm2 por xmm1,xmm3 movdqa xmm4,XMMWORD[((-128))+rdi] movdqa xmm5,XMMWORD[((-112))+rdi] movdqa xmm2,XMMWORD[((-96))+rdi] pand xmm4,XMMWORD[112+r10] movdqa xmm3,XMMWORD[((-80))+rdi] pand xmm5,XMMWORD[128+r10] por xmm0,xmm4 pand xmm2,XMMWORD[144+r10] por xmm1,xmm5 pand xmm3,XMMWORD[160+r10] por xmm0,xmm2 por xmm1,xmm3 movdqa xmm4,XMMWORD[((-64))+rdi] movdqa xmm5,XMMWORD[((-48))+rdi] movdqa xmm2,XMMWORD[((-32))+rdi] pand xmm4,XMMWORD[176+r10] movdqa xmm3,XMMWORD[((-16))+rdi] pand xmm5,XMMWORD[192+r10] por xmm0,xmm4 pand xmm2,XMMWORD[208+r10] por xmm1,xmm5 pand xmm3,XMMWORD[224+r10] por xmm0,xmm2 por xmm1,xmm3 movdqa xmm4,XMMWORD[rdi] movdqa xmm5,XMMWORD[16+rdi] movdqa xmm2,XMMWORD[32+rdi] pand xmm4,XMMWORD[240+r10] movdqa xmm3,XMMWORD[48+rdi] pand xmm5,XMMWORD[256+r10] por xmm0,xmm4 pand xmm2,XMMWORD[272+r10] por xmm1,xmm5 pand xmm3,XMMWORD[288+r10] por xmm0,xmm2 por xmm1,xmm3 pxor xmm0,xmm1 pshufd xmm1,xmm0,0x4e por xmm0,xmm1 lea rdi,[256+rdi] DB 102,72,15,126,194 lea rbx,[((64+32+8))+rsp] mov r9,rdx mulx rax,r8,QWORD[rsi] mulx r12,r11,QWORD[8+rsi] add r11,rax mulx r13,rax,QWORD[16+rsi] adc r12,rax adc r13,0 mulx r14,rax,QWORD[24+rsi] mov r15,r8 imul r8,QWORD[((32+8))+rsp] xor rbp,rbp mov rdx,r8 mov QWORD[((8+8))+rsp],rdi lea rsi,[32+rsi] adcx r13,rax adcx r14,rbp mulx r10,rax,QWORD[rcx] adcx r15,rax adox r10,r11 mulx r11,rax,QWORD[8+rcx] adcx r10,rax adox r11,r12 mulx r12,rax,QWORD[16+rcx] mov rdi,QWORD[((24+8))+rsp] mov QWORD[((-32))+rbx],r10 adcx r11,rax adox r12,r13 mulx r15,rax,QWORD[24+rcx] mov rdx,r9 mov QWORD[((-24))+rbx],r11 adcx r12,rax adox r15,rbp lea rcx,[32+rcx] mov QWORD[((-16))+rbx],r12 jmp NEAR $L$mulx4x_1st ALIGN 32 $L$mulx4x_1st: adcx r15,rbp mulx rax,r10,QWORD[rsi] adcx r10,r14 mulx r14,r11,QWORD[8+rsi] adcx r11,rax mulx rax,r12,QWORD[16+rsi] adcx r12,r14 mulx r14,r13,QWORD[24+rsi] DB 0x67,0x67 mov rdx,r8 adcx r13,rax adcx r14,rbp lea rsi,[32+rsi] lea rbx,[32+rbx] adox r10,r15 mulx r15,rax,QWORD[rcx] adcx r10,rax adox r11,r15 mulx r15,rax,QWORD[8+rcx] adcx r11,rax adox r12,r15 mulx r15,rax,QWORD[16+rcx] mov QWORD[((-40))+rbx],r10 adcx r12,rax mov QWORD[((-32))+rbx],r11 adox r13,r15 mulx r15,rax,QWORD[24+rcx] mov rdx,r9 mov QWORD[((-24))+rbx],r12 adcx r13,rax adox r15,rbp lea rcx,[32+rcx] mov QWORD[((-16))+rbx],r13 dec rdi jnz NEAR $L$mulx4x_1st mov rax,QWORD[8+rsp] adc r15,rbp lea rsi,[rax*1+rsi] add r14,r15 mov rdi,QWORD[((8+8))+rsp] adc rbp,rbp mov QWORD[((-8))+rbx],r14 jmp NEAR $L$mulx4x_outer ALIGN 32 $L$mulx4x_outer: lea r10,[((16-256))+rbx] pxor xmm4,xmm4 DB 0x67,0x67 pxor xmm5,xmm5 movdqa xmm0,XMMWORD[((-128))+rdi] movdqa xmm1,XMMWORD[((-112))+rdi] movdqa xmm2,XMMWORD[((-96))+rdi] pand xmm0,XMMWORD[256+r10] movdqa xmm3,XMMWORD[((-80))+rdi] pand xmm1,XMMWORD[272+r10] por xmm4,xmm0 pand xmm2,XMMWORD[288+r10] por xmm5,xmm1 pand xmm3,XMMWORD[304+r10] por xmm4,xmm2 por xmm5,xmm3 movdqa xmm0,XMMWORD[((-64))+rdi] movdqa xmm1,XMMWORD[((-48))+rdi] movdqa xmm2,XMMWORD[((-32))+rdi] pand xmm0,XMMWORD[320+r10] movdqa xmm3,XMMWORD[((-16))+rdi] pand xmm1,XMMWORD[336+r10] por xmm4,xmm0 pand xmm2,XMMWORD[352+r10] por xmm5,xmm1 pand xmm3,XMMWORD[368+r10] por xmm4,xmm2 por xmm5,xmm3 movdqa xmm0,XMMWORD[rdi] movdqa xmm1,XMMWORD[16+rdi] movdqa xmm2,XMMWORD[32+rdi] pand xmm0,XMMWORD[384+r10] movdqa xmm3,XMMWORD[48+rdi] pand xmm1,XMMWORD[400+r10] por xmm4,xmm0 pand xmm2,XMMWORD[416+r10] por xmm5,xmm1 pand xmm3,XMMWORD[432+r10] por xmm4,xmm2 por xmm5,xmm3 movdqa 
xmm0,XMMWORD[64+rdi] movdqa xmm1,XMMWORD[80+rdi] movdqa xmm2,XMMWORD[96+rdi] pand xmm0,XMMWORD[448+r10] movdqa xmm3,XMMWORD[112+rdi] pand xmm1,XMMWORD[464+r10] por xmm4,xmm0 pand xmm2,XMMWORD[480+r10] por xmm5,xmm1 pand xmm3,XMMWORD[496+r10] por xmm4,xmm2 por xmm5,xmm3 por xmm4,xmm5 pshufd xmm0,xmm4,0x4e por xmm0,xmm4 lea rdi,[256+rdi] DB 102,72,15,126,194 mov QWORD[rbx],rbp lea rbx,[32+rax*1+rbx] mulx r11,r8,QWORD[rsi] xor rbp,rbp mov r9,rdx mulx r12,r14,QWORD[8+rsi] adox r8,QWORD[((-32))+rbx] adcx r11,r14 mulx r13,r15,QWORD[16+rsi] adox r11,QWORD[((-24))+rbx] adcx r12,r15 mulx r14,rdx,QWORD[24+rsi] adox r12,QWORD[((-16))+rbx] adcx r13,rdx lea rcx,[rax*1+rcx] lea rsi,[32+rsi] adox r13,QWORD[((-8))+rbx] adcx r14,rbp adox r14,rbp mov r15,r8 imul r8,QWORD[((32+8))+rsp] mov rdx,r8 xor rbp,rbp mov QWORD[((8+8))+rsp],rdi mulx r10,rax,QWORD[rcx] adcx r15,rax adox r10,r11 mulx r11,rax,QWORD[8+rcx] adcx r10,rax adox r11,r12 mulx r12,rax,QWORD[16+rcx] adcx r11,rax adox r12,r13 mulx r15,rax,QWORD[24+rcx] mov rdx,r9 mov rdi,QWORD[((24+8))+rsp] mov QWORD[((-32))+rbx],r10 adcx r12,rax mov QWORD[((-24))+rbx],r11 adox r15,rbp mov QWORD[((-16))+rbx],r12 lea rcx,[32+rcx] jmp NEAR $L$mulx4x_inner ALIGN 32 $L$mulx4x_inner: mulx rax,r10,QWORD[rsi] adcx r15,rbp adox r10,r14 mulx r14,r11,QWORD[8+rsi] adcx r10,QWORD[rbx] adox r11,rax mulx rax,r12,QWORD[16+rsi] adcx r11,QWORD[8+rbx] adox r12,r14 mulx r14,r13,QWORD[24+rsi] mov rdx,r8 adcx r12,QWORD[16+rbx] adox r13,rax adcx r13,QWORD[24+rbx] adox r14,rbp lea rsi,[32+rsi] lea rbx,[32+rbx] adcx r14,rbp adox r10,r15 mulx r15,rax,QWORD[rcx] adcx r10,rax adox r11,r15 mulx r15,rax,QWORD[8+rcx] adcx r11,rax adox r12,r15 mulx r15,rax,QWORD[16+rcx] mov QWORD[((-40))+rbx],r10 adcx r12,rax adox r13,r15 mov QWORD[((-32))+rbx],r11 mulx r15,rax,QWORD[24+rcx] mov rdx,r9 lea rcx,[32+rcx] mov QWORD[((-24))+rbx],r12 adcx r13,rax adox r15,rbp mov QWORD[((-16))+rbx],r13 dec rdi jnz NEAR $L$mulx4x_inner mov rax,QWORD[((0+8))+rsp] adc r15,rbp sub rdi,QWORD[rbx] mov rdi,QWORD[((8+8))+rsp] mov r10,QWORD[((16+8))+rsp] adc r14,r15 lea rsi,[rax*1+rsi] adc rbp,rbp mov QWORD[((-8))+rbx],r14 cmp rdi,r10 jb NEAR $L$mulx4x_outer mov r10,QWORD[((-8))+rcx] mov r8,rbp mov r12,QWORD[rax*1+rcx] lea rbp,[rax*1+rcx] mov rcx,rax lea rdi,[rax*1+rbx] xor eax,eax xor r15,r15 sub r10,r14 adc r15,r15 or r8,r15 sar rcx,3+2 sub rax,r8 mov rdx,QWORD[((56+8))+rsp] dec r12 mov r13,QWORD[8+rbp] xor r8,r8 mov r14,QWORD[16+rbp] mov r15,QWORD[24+rbp] jmp NEAR $L$sqrx4x_sub_entry global bn_powerx5 ALIGN 32 bn_powerx5: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_bn_powerx5: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD[40+rsp] mov r9,QWORD[48+rsp] _CET_ENDBR mov rax,rsp push rbx push rbp push r12 push r13 push r14 push r15 $L$powerx5_prologue: shl r9d,3 lea r10,[r9*2+r9] neg r9 mov r8,QWORD[r8] lea r11,[((-320))+r9*2+rsp] mov rbp,rsp sub r11,rdi and r11,4095 cmp r10,r11 jb NEAR $L$pwrx_sp_alt sub rbp,r11 lea rbp,[((-320))+r9*2+rbp] jmp NEAR $L$pwrx_sp_done ALIGN 32 $L$pwrx_sp_alt: lea r10,[((4096-320))+r9*2] lea rbp,[((-320))+r9*2+rbp] sub r11,r10 mov r10,0 cmovc r11,r10 sub rbp,r11 $L$pwrx_sp_done: and rbp,-64 mov r11,rsp sub r11,rbp and r11,-4096 lea rsp,[rbp*1+r11] mov r10,QWORD[rsp] cmp rsp,rbp ja NEAR $L$pwrx_page_walk jmp NEAR $L$pwrx_page_walk_done $L$pwrx_page_walk: lea rsp,[((-4096))+rsp] mov r10,QWORD[rsp] cmp rsp,rbp ja NEAR $L$pwrx_page_walk $L$pwrx_page_walk_done: mov r10,r9 neg r9 pxor xmm0,xmm0 DB 102,72,15,110,207 DB 102,72,15,110,209 DB 
102,73,15,110,218 DB 102,72,15,110,226 mov QWORD[32+rsp],r8 mov QWORD[40+rsp],rax $L$powerx5_body: call __bn_sqrx8x_internal call __bn_postx4x_internal call __bn_sqrx8x_internal call __bn_postx4x_internal call __bn_sqrx8x_internal call __bn_postx4x_internal call __bn_sqrx8x_internal call __bn_postx4x_internal call __bn_sqrx8x_internal call __bn_postx4x_internal mov r9,r10 mov rdi,rsi DB 102,72,15,126,209 DB 102,72,15,126,226 mov rax,QWORD[40+rsp] call mulx4x_internal mov rsi,QWORD[40+rsp] mov rax,1 mov r15,QWORD[((-48))+rsi] mov r14,QWORD[((-40))+rsi] mov r13,QWORD[((-32))+rsi] mov r12,QWORD[((-24))+rsi] mov rbp,QWORD[((-16))+rsi] mov rbx,QWORD[((-8))+rsi] lea rsp,[rsi] $L$powerx5_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] ret $L$SEH_end_bn_powerx5: global bn_sqrx8x_internal ALIGN 32 bn_sqrx8x_internal: __bn_sqrx8x_internal: _CET_ENDBR lea rdi,[((48+8))+rsp] lea rbp,[r9*1+rsi] mov QWORD[((0+8))+rsp],r9 mov QWORD[((8+8))+rsp],rbp jmp NEAR $L$sqr8x_zero_start ALIGN 32 DB 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 $L$sqrx8x_zero: DB 0x3e movdqa XMMWORD[rdi],xmm0 movdqa XMMWORD[16+rdi],xmm0 movdqa XMMWORD[32+rdi],xmm0 movdqa XMMWORD[48+rdi],xmm0 $L$sqr8x_zero_start: movdqa XMMWORD[64+rdi],xmm0 movdqa XMMWORD[80+rdi],xmm0 movdqa XMMWORD[96+rdi],xmm0 movdqa XMMWORD[112+rdi],xmm0 lea rdi,[128+rdi] sub r9,64 jnz NEAR $L$sqrx8x_zero mov rdx,QWORD[rsi] xor r10,r10 xor r11,r11 xor r12,r12 xor r13,r13 xor r14,r14 xor r15,r15 lea rdi,[((48+8))+rsp] xor rbp,rbp jmp NEAR $L$sqrx8x_outer_loop ALIGN 32 $L$sqrx8x_outer_loop: mulx rax,r8,QWORD[8+rsi] adcx r8,r9 adox r10,rax mulx rax,r9,QWORD[16+rsi] adcx r9,r10 adox r11,rax DB 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 adcx r10,r11 adox r12,rax DB 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 adcx r11,r12 adox r13,rax mulx rax,r12,QWORD[40+rsi] adcx r12,r13 adox r14,rax mulx rax,r13,QWORD[48+rsi] adcx r13,r14 adox rax,r15 mulx r15,r14,QWORD[56+rsi] mov rdx,QWORD[8+rsi] adcx r14,rax adox r15,rbp adc r15,QWORD[64+rdi] mov QWORD[8+rdi],r8 mov QWORD[16+rdi],r9 sbb rcx,rcx xor rbp,rbp mulx rbx,r8,QWORD[16+rsi] mulx rax,r9,QWORD[24+rsi] adcx r8,r10 adox r9,rbx mulx rbx,r10,QWORD[32+rsi] adcx r9,r11 adox r10,rax DB 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 adcx r10,r12 adox r11,rbx DB 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 adcx r11,r13 adox r12,r14 DB 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 mov rdx,QWORD[16+rsi] adcx r12,rax adox r13,rbx adcx r13,r15 adox r14,rbp adcx r14,rbp mov QWORD[24+rdi],r8 mov QWORD[32+rdi],r9 mulx rbx,r8,QWORD[24+rsi] mulx rax,r9,QWORD[32+rsi] adcx r8,r10 adox r9,rbx mulx rbx,r10,QWORD[40+rsi] adcx r9,r11 adox r10,rax DB 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 adcx r10,r12 adox r11,r13 DB 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 DB 0x3e mov rdx,QWORD[24+rsi] adcx r11,rbx adox r12,rax adcx r12,r14 mov QWORD[40+rdi],r8 mov QWORD[48+rdi],r9 mulx rax,r8,QWORD[32+rsi] adox r13,rbp adcx r13,rbp mulx rbx,r9,QWORD[40+rsi] adcx r8,r10 adox r9,rax mulx rax,r10,QWORD[48+rsi] adcx r9,r11 adox r10,r12 mulx r12,r11,QWORD[56+rsi] mov rdx,QWORD[32+rsi] mov r14,QWORD[40+rsi] adcx r10,rbx adox r11,rax mov r15,QWORD[48+rsi] adcx r11,r13 adox r12,rbp adcx r12,rbp mov QWORD[56+rdi],r8 mov QWORD[64+rdi],r9 mulx rax,r9,r14 mov r8,QWORD[56+rsi] adcx r9,r10 mulx rbx,r10,r15 adox r10,rax adcx r10,r11 mulx rax,r11,r8 mov rdx,r14 adox r11,rbx adcx r11,r12 adcx rax,rbp mulx rbx,r14,r15 mulx r13,r12,r8 mov rdx,r15 lea rsi,[64+rsi] adcx r11,r14 adox r12,rbx adcx r12,rax adox r13,rbp DB 0x67,0x67 mulx 
r14,r8,r8 adcx r13,r8 adcx r14,rbp cmp rsi,QWORD[((8+8))+rsp] je NEAR $L$sqrx8x_outer_break neg rcx mov rcx,-8 mov r15,rbp mov r8,QWORD[64+rdi] adcx r9,QWORD[72+rdi] adcx r10,QWORD[80+rdi] adcx r11,QWORD[88+rdi] adc r12,QWORD[96+rdi] adc r13,QWORD[104+rdi] adc r14,QWORD[112+rdi] adc r15,QWORD[120+rdi] lea rbp,[rsi] lea rdi,[128+rdi] sbb rax,rax mov rdx,QWORD[((-64))+rsi] mov QWORD[((16+8))+rsp],rax mov QWORD[((24+8))+rsp],rdi xor eax,eax jmp NEAR $L$sqrx8x_loop ALIGN 32 $L$sqrx8x_loop: mov rbx,r8 mulx r8,rax,QWORD[rbp] adcx rbx,rax adox r8,r9 mulx r9,rax,QWORD[8+rbp] adcx r8,rax adox r9,r10 mulx r10,rax,QWORD[16+rbp] adcx r9,rax adox r10,r11 mulx r11,rax,QWORD[24+rbp] adcx r10,rax adox r11,r12 DB 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 adcx r11,rax adox r12,r13 mulx r13,rax,QWORD[40+rbp] adcx r12,rax adox r13,r14 mulx r14,rax,QWORD[48+rbp] mov QWORD[rcx*8+rdi],rbx mov ebx,0 adcx r13,rax adox r14,r15 DB 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 mov rdx,QWORD[8+rcx*8+rsi] adcx r14,rax adox r15,rbx adcx r15,rbx DB 0x67 inc rcx jnz NEAR $L$sqrx8x_loop lea rbp,[64+rbp] mov rcx,-8 cmp rbp,QWORD[((8+8))+rsp] je NEAR $L$sqrx8x_break sub rbx,QWORD[((16+8))+rsp] DB 0x66 mov rdx,QWORD[((-64))+rsi] adcx r8,QWORD[rdi] adcx r9,QWORD[8+rdi] adc r10,QWORD[16+rdi] adc r11,QWORD[24+rdi] adc r12,QWORD[32+rdi] adc r13,QWORD[40+rdi] adc r14,QWORD[48+rdi] adc r15,QWORD[56+rdi] lea rdi,[64+rdi] DB 0x67 sbb rax,rax xor ebx,ebx mov QWORD[((16+8))+rsp],rax jmp NEAR $L$sqrx8x_loop ALIGN 32 $L$sqrx8x_break: xor rbp,rbp sub rbx,QWORD[((16+8))+rsp] adcx r8,rbp mov rcx,QWORD[((24+8))+rsp] adcx r9,rbp mov rdx,QWORD[rsi] adc r10,0 mov QWORD[rdi],r8 adc r11,0 adc r12,0 adc r13,0 adc r14,0 adc r15,0 cmp rdi,rcx je NEAR $L$sqrx8x_outer_loop mov QWORD[8+rdi],r9 mov r9,QWORD[8+rcx] mov QWORD[16+rdi],r10 mov r10,QWORD[16+rcx] mov QWORD[24+rdi],r11 mov r11,QWORD[24+rcx] mov QWORD[32+rdi],r12 mov r12,QWORD[32+rcx] mov QWORD[40+rdi],r13 mov r13,QWORD[40+rcx] mov QWORD[48+rdi],r14 mov r14,QWORD[48+rcx] mov QWORD[56+rdi],r15 mov r15,QWORD[56+rcx] mov rdi,rcx jmp NEAR $L$sqrx8x_outer_loop ALIGN 32 $L$sqrx8x_outer_break: mov QWORD[72+rdi],r9 DB 102,72,15,126,217 mov QWORD[80+rdi],r10 mov QWORD[88+rdi],r11 mov QWORD[96+rdi],r12 mov QWORD[104+rdi],r13 mov QWORD[112+rdi],r14 lea rdi,[((48+8))+rsp] mov rdx,QWORD[rcx*1+rsi] mov r11,QWORD[8+rdi] xor r10,r10 mov r9,QWORD[((0+8))+rsp] adox r11,r11 mov r12,QWORD[16+rdi] mov r13,QWORD[24+rdi] ALIGN 32 $L$sqrx4x_shift_n_add: mulx rbx,rax,rdx adox r12,r12 adcx rax,r10 DB 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 DB 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 adox r13,r13 adcx rbx,r11 mov r11,QWORD[40+rdi] mov QWORD[rdi],rax mov QWORD[8+rdi],rbx mulx rbx,rax,rdx adox r10,r10 adcx rax,r12 mov rdx,QWORD[16+rcx*1+rsi] mov r12,QWORD[48+rdi] adox r11,r11 adcx rbx,r13 mov r13,QWORD[56+rdi] mov QWORD[16+rdi],rax mov QWORD[24+rdi],rbx mulx rbx,rax,rdx adox r12,r12 adcx rax,r10 mov rdx,QWORD[24+rcx*1+rsi] lea rcx,[32+rcx] mov r10,QWORD[64+rdi] adox r13,r13 adcx rbx,r11 mov r11,QWORD[72+rdi] mov QWORD[32+rdi],rax mov QWORD[40+rdi],rbx mulx rbx,rax,rdx adox r10,r10 adcx rax,r12 jrcxz $L$sqrx4x_shift_n_add_break DB 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 adox r11,r11 adcx rbx,r13 mov r12,QWORD[80+rdi] mov r13,QWORD[88+rdi] mov QWORD[48+rdi],rax mov QWORD[56+rdi],rbx lea rdi,[64+rdi] nop jmp NEAR $L$sqrx4x_shift_n_add ALIGN 32 $L$sqrx4x_shift_n_add_break: adcx rbx,r13 mov QWORD[48+rdi],rax mov QWORD[56+rdi],rbx lea rdi,[64+rdi] DB 102,72,15,126,213 __bn_sqrx8x_reduction: xor eax,eax mov rbx,QWORD[((32+8))+rsp] 
mov rdx,QWORD[((48+8))+rsp] lea rcx,[((-64))+r9*1+rbp] mov QWORD[((0+8))+rsp],rcx mov QWORD[((8+8))+rsp],rdi lea rdi,[((48+8))+rsp] jmp NEAR $L$sqrx8x_reduction_loop ALIGN 32 $L$sqrx8x_reduction_loop: mov r9,QWORD[8+rdi] mov r10,QWORD[16+rdi] mov r11,QWORD[24+rdi] mov r12,QWORD[32+rdi] mov r8,rdx imul rdx,rbx mov r13,QWORD[40+rdi] mov r14,QWORD[48+rdi] mov r15,QWORD[56+rdi] mov QWORD[((24+8))+rsp],rax lea rdi,[64+rdi] xor rsi,rsi mov rcx,-8 jmp NEAR $L$sqrx8x_reduce ALIGN 32 $L$sqrx8x_reduce: mov rbx,r8 mulx r8,rax,QWORD[rbp] adcx rax,rbx adox r8,r9 mulx r9,rbx,QWORD[8+rbp] adcx r8,rbx adox r9,r10 mulx r10,rbx,QWORD[16+rbp] adcx r9,rbx adox r10,r11 mulx r11,rbx,QWORD[24+rbp] adcx r10,rbx adox r11,r12 DB 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 mov rax,rdx mov rdx,r8 adcx r11,rbx adox r12,r13 mulx rdx,rbx,QWORD[((32+8))+rsp] mov rdx,rax mov QWORD[((64+48+8))+rcx*8+rsp],rax mulx r13,rax,QWORD[40+rbp] adcx r12,rax adox r13,r14 mulx r14,rax,QWORD[48+rbp] adcx r13,rax adox r14,r15 mulx r15,rax,QWORD[56+rbp] mov rdx,rbx adcx r14,rax adox r15,rsi adcx r15,rsi DB 0x67,0x67,0x67 inc rcx jnz NEAR $L$sqrx8x_reduce mov rax,rsi cmp rbp,QWORD[((0+8))+rsp] jae NEAR $L$sqrx8x_no_tail mov rdx,QWORD[((48+8))+rsp] add r8,QWORD[rdi] lea rbp,[64+rbp] mov rcx,-8 adcx r9,QWORD[8+rdi] adcx r10,QWORD[16+rdi] adc r11,QWORD[24+rdi] adc r12,QWORD[32+rdi] adc r13,QWORD[40+rdi] adc r14,QWORD[48+rdi] adc r15,QWORD[56+rdi] lea rdi,[64+rdi] sbb rax,rax xor rsi,rsi mov QWORD[((16+8))+rsp],rax jmp NEAR $L$sqrx8x_tail ALIGN 32 $L$sqrx8x_tail: mov rbx,r8 mulx r8,rax,QWORD[rbp] adcx rbx,rax adox r8,r9 mulx r9,rax,QWORD[8+rbp] adcx r8,rax adox r9,r10 mulx r10,rax,QWORD[16+rbp] adcx r9,rax adox r10,r11 mulx r11,rax,QWORD[24+rbp] adcx r10,rax adox r11,r12 DB 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 adcx r11,rax adox r12,r13 mulx r13,rax,QWORD[40+rbp] adcx r12,rax adox r13,r14 mulx r14,rax,QWORD[48+rbp] adcx r13,rax adox r14,r15 mulx r15,rax,QWORD[56+rbp] mov rdx,QWORD[((72+48+8))+rcx*8+rsp] adcx r14,rax adox r15,rsi mov QWORD[rcx*8+rdi],rbx mov rbx,r8 adcx r15,rsi inc rcx jnz NEAR $L$sqrx8x_tail cmp rbp,QWORD[((0+8))+rsp] jae NEAR $L$sqrx8x_tail_done sub rsi,QWORD[((16+8))+rsp] mov rdx,QWORD[((48+8))+rsp] lea rbp,[64+rbp] adc r8,QWORD[rdi] adc r9,QWORD[8+rdi] adc r10,QWORD[16+rdi] adc r11,QWORD[24+rdi] adc r12,QWORD[32+rdi] adc r13,QWORD[40+rdi] adc r14,QWORD[48+rdi] adc r15,QWORD[56+rdi] lea rdi,[64+rdi] sbb rax,rax sub rcx,8 xor rsi,rsi mov QWORD[((16+8))+rsp],rax jmp NEAR $L$sqrx8x_tail ALIGN 32 $L$sqrx8x_tail_done: xor rax,rax add r8,QWORD[((24+8))+rsp] adc r9,0 adc r10,0 adc r11,0 adc r12,0 adc r13,0 adc r14,0 adc r15,0 adc rax,0 sub rsi,QWORD[((16+8))+rsp] $L$sqrx8x_no_tail: adc r8,QWORD[rdi] DB 102,72,15,126,217 adc r9,QWORD[8+rdi] mov rsi,QWORD[56+rbp] DB 102,72,15,126,213 adc r10,QWORD[16+rdi] adc r11,QWORD[24+rdi] adc r12,QWORD[32+rdi] adc r13,QWORD[40+rdi] adc r14,QWORD[48+rdi] adc r15,QWORD[56+rdi] adc rax,0 mov rbx,QWORD[((32+8))+rsp] mov rdx,QWORD[64+rcx*1+rdi] mov QWORD[rdi],r8 lea r8,[64+rdi] mov QWORD[8+rdi],r9 mov QWORD[16+rdi],r10 mov QWORD[24+rdi],r11 mov QWORD[32+rdi],r12 mov QWORD[40+rdi],r13 mov QWORD[48+rdi],r14 mov QWORD[56+rdi],r15 lea rdi,[64+rcx*1+rdi] cmp r8,QWORD[((8+8))+rsp] jb NEAR $L$sqrx8x_reduction_loop ret ALIGN 32 __bn_postx4x_internal: mov r12,QWORD[rbp] mov r10,rcx mov r9,rcx neg rax sar rcx,3+2 DB 102,72,15,126,202 DB 102,72,15,126,206 dec r12 mov r13,QWORD[8+rbp] xor r8,r8 mov r14,QWORD[16+rbp] mov r15,QWORD[24+rbp] jmp NEAR $L$sqrx4x_sub_entry ALIGN 16 $L$sqrx4x_sub: mov 
r12,QWORD[rbp] mov r13,QWORD[8+rbp] mov r14,QWORD[16+rbp] mov r15,QWORD[24+rbp] $L$sqrx4x_sub_entry: andn r12,r12,rax lea rbp,[32+rbp] andn r13,r13,rax andn r14,r14,rax andn r15,r15,rax neg r8 adc r12,QWORD[rdi] adc r13,QWORD[8+rdi] adc r14,QWORD[16+rdi] adc r15,QWORD[24+rdi] mov QWORD[rdx],r12 lea rdi,[32+rdi] mov QWORD[8+rdx],r13 sbb r8,r8 mov QWORD[16+rdx],r14 mov QWORD[24+rdx],r15 lea rdx,[32+rdx] inc rcx jnz NEAR $L$sqrx4x_sub neg r9 ret global bn_scatter5 ALIGN 16 bn_scatter5: _CET_ENDBR cmp edx,0 jz NEAR $L$scatter_epilogue lea r8,[r9*8+r8] $L$scatter: mov rax,QWORD[rcx] lea rcx,[8+rcx] mov QWORD[r8],rax lea r8,[256+r8] sub edx,1 jnz NEAR $L$scatter $L$scatter_epilogue: ret global bn_gather5 ALIGN 32 bn_gather5: $L$SEH_begin_bn_gather5: _CET_ENDBR DB 0x4c,0x8d,0x14,0x24 DB 0x48,0x81,0xec,0x08,0x01,0x00,0x00 lea rax,[$L$inc] and rsp,-16 movd xmm5,r9d movdqa xmm0,XMMWORD[rax] movdqa xmm1,XMMWORD[16+rax] lea r11,[128+r8] lea rax,[128+rsp] pshufd xmm5,xmm5,0 movdqa xmm4,xmm1 movdqa xmm2,xmm1 paddd xmm1,xmm0 pcmpeqd xmm0,xmm5 movdqa xmm3,xmm4 paddd xmm2,xmm1 pcmpeqd xmm1,xmm5 movdqa XMMWORD[(-128)+rax],xmm0 movdqa xmm0,xmm4 paddd xmm3,xmm2 pcmpeqd xmm2,xmm5 movdqa XMMWORD[(-112)+rax],xmm1 movdqa xmm1,xmm4 paddd xmm0,xmm3 pcmpeqd xmm3,xmm5 movdqa XMMWORD[(-96)+rax],xmm2 movdqa xmm2,xmm4 paddd xmm1,xmm0 pcmpeqd xmm0,xmm5 movdqa XMMWORD[(-80)+rax],xmm3 movdqa xmm3,xmm4 paddd xmm2,xmm1 pcmpeqd xmm1,xmm5 movdqa XMMWORD[(-64)+rax],xmm0 movdqa xmm0,xmm4 paddd xmm3,xmm2 pcmpeqd xmm2,xmm5 movdqa XMMWORD[(-48)+rax],xmm1 movdqa xmm1,xmm4 paddd xmm0,xmm3 pcmpeqd xmm3,xmm5 movdqa XMMWORD[(-32)+rax],xmm2 movdqa xmm2,xmm4 paddd xmm1,xmm0 pcmpeqd xmm0,xmm5 movdqa XMMWORD[(-16)+rax],xmm3 movdqa xmm3,xmm4 paddd xmm2,xmm1 pcmpeqd xmm1,xmm5 movdqa XMMWORD[rax],xmm0 movdqa xmm0,xmm4 paddd xmm3,xmm2 pcmpeqd xmm2,xmm5 movdqa XMMWORD[16+rax],xmm1 movdqa xmm1,xmm4 paddd xmm0,xmm3 pcmpeqd xmm3,xmm5 movdqa XMMWORD[32+rax],xmm2 movdqa xmm2,xmm4 paddd xmm1,xmm0 pcmpeqd xmm0,xmm5 movdqa XMMWORD[48+rax],xmm3 movdqa xmm3,xmm4 paddd xmm2,xmm1 pcmpeqd xmm1,xmm5 movdqa XMMWORD[64+rax],xmm0 movdqa xmm0,xmm4 paddd xmm3,xmm2 pcmpeqd xmm2,xmm5 movdqa XMMWORD[80+rax],xmm1 movdqa xmm1,xmm4 paddd xmm0,xmm3 pcmpeqd xmm3,xmm5 movdqa XMMWORD[96+rax],xmm2 movdqa xmm2,xmm4 movdqa XMMWORD[112+rax],xmm3 jmp NEAR $L$gather ALIGN 32 $L$gather: pxor xmm4,xmm4 pxor xmm5,xmm5 movdqa xmm0,XMMWORD[((-128))+r11] movdqa xmm1,XMMWORD[((-112))+r11] movdqa xmm2,XMMWORD[((-96))+r11] pand xmm0,XMMWORD[((-128))+rax] movdqa xmm3,XMMWORD[((-80))+r11] pand xmm1,XMMWORD[((-112))+rax] por xmm4,xmm0 pand xmm2,XMMWORD[((-96))+rax] por xmm5,xmm1 pand xmm3,XMMWORD[((-80))+rax] por xmm4,xmm2 por xmm5,xmm3 movdqa xmm0,XMMWORD[((-64))+r11] movdqa xmm1,XMMWORD[((-48))+r11] movdqa xmm2,XMMWORD[((-32))+r11] pand xmm0,XMMWORD[((-64))+rax] movdqa xmm3,XMMWORD[((-16))+r11] pand xmm1,XMMWORD[((-48))+rax] por xmm4,xmm0 pand xmm2,XMMWORD[((-32))+rax] por xmm5,xmm1 pand xmm3,XMMWORD[((-16))+rax] por xmm4,xmm2 por xmm5,xmm3 movdqa xmm0,XMMWORD[r11] movdqa xmm1,XMMWORD[16+r11] movdqa xmm2,XMMWORD[32+r11] pand xmm0,XMMWORD[rax] movdqa xmm3,XMMWORD[48+r11] pand xmm1,XMMWORD[16+rax] por xmm4,xmm0 pand xmm2,XMMWORD[32+rax] por xmm5,xmm1 pand xmm3,XMMWORD[48+rax] por xmm4,xmm2 por xmm5,xmm3 movdqa xmm0,XMMWORD[64+r11] movdqa xmm1,XMMWORD[80+r11] movdqa xmm2,XMMWORD[96+r11] pand xmm0,XMMWORD[64+rax] movdqa xmm3,XMMWORD[112+r11] pand xmm1,XMMWORD[80+rax] por xmm4,xmm0 pand xmm2,XMMWORD[96+rax] por xmm5,xmm1 pand xmm3,XMMWORD[112+rax] por xmm4,xmm2 por xmm5,xmm3 por xmm4,xmm5 lea 
r11,[256+r11] pshufd xmm0,xmm4,0x4e por xmm0,xmm4 movq QWORD[rcx],xmm0 lea rcx,[8+rcx] sub edx,1 jnz NEAR $L$gather lea rsp,[r10] ret $L$SEH_end_bn_gather5: section .rdata rdata align=8 ALIGN 64 $L$inc: DD 0,0,1,1 DD 2,2,2,2 DB 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 DB 112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115 DB 99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111 DB 114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79 DB 71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111 DB 112,101,110,115,115,108,46,111,114,103,62,0 section .text EXTERN __imp_RtlVirtualUnwind ALIGN 16 mul_handler: push rsi push rdi push rbx push rbp push r12 push r13 push r14 push r15 pushfq sub rsp,64 mov rax,QWORD[120+r8] mov rbx,QWORD[248+r8] mov rsi,QWORD[8+r9] mov r11,QWORD[56+r9] mov r10d,DWORD[r11] lea r10,[r10*1+rsi] cmp rbx,r10 jb NEAR $L$common_seh_tail mov r10d,DWORD[4+r11] lea r10,[r10*1+rsi] cmp rbx,r10 jb NEAR $L$common_pop_regs mov rax,QWORD[152+r8] mov r10d,DWORD[8+r11] lea r10,[r10*1+rsi] cmp rbx,r10 jae NEAR $L$common_seh_tail lea r10,[$L$mul4x_epilogue] cmp rbx,r10 ja NEAR $L$body_40 mov r10,QWORD[192+r8] mov rax,QWORD[8+r10*8+rax] jmp NEAR $L$common_pop_regs $L$body_40: mov rax,QWORD[40+rax] $L$common_pop_regs: mov rbx,QWORD[((-8))+rax] mov rbp,QWORD[((-16))+rax] mov r12,QWORD[((-24))+rax] mov r13,QWORD[((-32))+rax] mov r14,QWORD[((-40))+rax] mov r15,QWORD[((-48))+rax] mov QWORD[144+r8],rbx mov QWORD[160+r8],rbp mov QWORD[216+r8],r12 mov QWORD[224+r8],r13 mov QWORD[232+r8],r14 mov QWORD[240+r8],r15 $L$common_seh_tail: mov rdi,QWORD[8+rax] mov rsi,QWORD[16+rax] mov QWORD[152+r8],rax mov QWORD[168+r8],rsi mov QWORD[176+r8],rdi mov rdi,QWORD[40+r9] mov rsi,r8 mov ecx,154 DD 0xa548f3fc mov rsi,r9 xor rcx,rcx mov rdx,QWORD[8+rsi] mov r8,QWORD[rsi] mov r9,QWORD[16+rsi] mov r10,QWORD[40+rsi] lea r11,[56+rsi] lea r12,[24+rsi] mov QWORD[32+rsp],r10 mov QWORD[40+rsp],r11 mov QWORD[48+rsp],r12 mov QWORD[56+rsp],rcx call QWORD[__imp_RtlVirtualUnwind] mov eax,1 add rsp,64 popfq pop r15 pop r14 pop r13 pop r12 pop rbp pop rbx pop rdi pop rsi ret section .pdata rdata align=4 ALIGN 4 DD $L$SEH_begin_bn_mul4x_mont_gather5 wrt ..imagebase DD $L$SEH_end_bn_mul4x_mont_gather5 wrt ..imagebase DD $L$SEH_info_bn_mul4x_mont_gather5 wrt ..imagebase DD $L$SEH_begin_bn_power5_nohw wrt ..imagebase DD $L$SEH_end_bn_power5_nohw wrt ..imagebase DD $L$SEH_info_bn_power5_nohw wrt ..imagebase DD $L$SEH_begin_bn_mulx4x_mont_gather5 wrt ..imagebase DD $L$SEH_end_bn_mulx4x_mont_gather5 wrt ..imagebase DD $L$SEH_info_bn_mulx4x_mont_gather5 wrt ..imagebase DD $L$SEH_begin_bn_powerx5 wrt ..imagebase DD $L$SEH_end_bn_powerx5 wrt ..imagebase DD $L$SEH_info_bn_powerx5 wrt ..imagebase DD $L$SEH_begin_bn_gather5 wrt ..imagebase DD $L$SEH_end_bn_gather5 wrt ..imagebase DD $L$SEH_info_bn_gather5 wrt ..imagebase section .xdata rdata align=8 ALIGN 8 $L$SEH_info_bn_mul4x_mont_gather5: DB 9,0,0,0 DD mul_handler wrt ..imagebase DD $L$mul4x_prologue wrt ..imagebase,$L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase ALIGN 8 $L$SEH_info_bn_power5_nohw: DB 9,0,0,0 DD mul_handler wrt ..imagebase DD $L$power5_prologue wrt ..imagebase,$L$power5_body wrt ..imagebase,$L$power5_epilogue wrt ..imagebase ALIGN 8 $L$SEH_info_bn_mulx4x_mont_gather5: DB 9,0,0,0 DD mul_handler wrt ..imagebase DD $L$mulx4x_prologue wrt ..imagebase,$L$mulx4x_body wrt ..imagebase,$L$mulx4x_epilogue wrt ..imagebase ALIGN 8 $L$SEH_info_bn_powerx5: DB 9,0,0,0 DD mul_handler wrt ..imagebase DD $L$powerx5_prologue wrt 
..imagebase,$L$powerx5_body wrt ..imagebase,$L$powerx5_epilogue wrt ..imagebase ALIGN 8 $L$SEH_info_bn_gather5: DB 0x01,0x0b,0x03,0x0a DB 0x0b,0x01,0x21,0x00 DB 0x04,0xa3,0x00,0x00 ALIGN 8 %else ; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 ret %endif
ring-0.17.14/pregenerated/x86_64-mont5-nasm.o000064400000000000000000001225151046102023000165220ustar 00000000000000
ring-0.17.14/src/aead/aes/bs.rs000064400000000000000000000052141046102023000141420ustar 00000000000000
// Copyright 2018-2024 Brian Smith.
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

#![cfg(all(target_arch = "arm", target_endian = "little"))]

use super::{Counter, Overlapping, AES_KEY};

/// SAFETY:
/// * The caller must ensure that if blocks > 0 then either `input` and
///   `output` do not overlap at all, or input == output.add(n) for some
///   (nonnegative) n.
/// * If blocks > 0, the caller must ensure `input` points to `blocks` blocks
///   and that `output` points to writable space for `blocks` blocks.
/// * The caller must ensure that `vpaes_key` was initialized with
///   `vpaes_set_encrypt_key`.
/// * Upon returning, `blocks` blocks will have been read from `input` and
///   written to `output`.
pub(super) unsafe fn ctr32_encrypt_blocks_with_vpaes_key(
    in_out: Overlapping<'_>,
    vpaes_key: &AES_KEY,
    ctr: &mut Counter,
) {
    prefixed_extern! {
        // bsaes_ctr32_encrypt_blocks requires transformation of an existing
        // VPAES key; there is no `bsaes_set_encrypt_key`.
        fn vpaes_encrypt_key_to_bsaes(bsaes_key: *mut AES_KEY, vpaes_key: &AES_KEY);
    }

    // SAFETY:
    // * The caller ensures `vpaes_key` was initialized by
    //   `vpaes_set_encrypt_key`.
    // * `bsaes_key` was zeroed above, and `vpaes_encrypt_key_to_bsaes`
    //   is assumed to initialize `bsaes_key`.
    let bsaes_key = unsafe { AES_KEY::derive(vpaes_encrypt_key_to_bsaes, vpaes_key) };

    // The code for `vpaes_encrypt_key_to_bsaes` notes "vpaes stores one
    // fewer round count than bsaes, but the number of keys is the same,"
    // so use this as a sanity check.
    debug_assert_eq!(bsaes_key.rounds(), vpaes_key.rounds() + 1);

    // SAFETY:
    // * `bsaes_key` is in bsaes format after calling
    //   `vpaes_encrypt_key_to_bsaes`.
    // * `bsaes_ctr32_encrypt_blocks` satisfies the contract for
    //   `ctr32_encrypt_blocks`.
    unsafe {
        ctr32_encrypt_blocks!(bsaes_ctr32_encrypt_blocks, in_out, &bsaes_key, ctr);
    }
}
ring-0.17.14/src/aead/aes/fallback.rs000064400000000000000000000032241046102023000152740ustar 00000000000000
// Copyright 2018-2024 Brian Smith.
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

use super::{Block, Counter, EncryptBlock, EncryptCtr32, Iv, KeyBytes, Overlapping, AES_KEY};
use crate::error;

#[derive(Clone)]
pub struct Key {
    inner: AES_KEY,
}

impl Key {
    pub(in super::super) fn new(bytes: KeyBytes<'_>) -> Result<Self, error::Unspecified> {
        let inner = unsafe { set_encrypt_key!(aes_nohw_set_encrypt_key, bytes) }?;
        Ok(Self { inner })
    }
}

impl EncryptBlock for Key {
    fn encrypt_block(&self, block: Block) -> Block {
        unsafe { encrypt_block!(aes_nohw_encrypt, block, &self.inner) }
    }

    fn encrypt_iv_xor_block(&self, iv: Iv, block: Block) -> Block {
        super::encrypt_iv_xor_block_using_encrypt_block(self, iv, block)
    }
}

impl EncryptCtr32 for Key {
    fn ctr32_encrypt_within(&self, in_out: Overlapping<'_>, ctr: &mut Counter) {
        unsafe { ctr32_encrypt_blocks!(aes_nohw_ctr32_encrypt_blocks, in_out, &self.inner, ctr) }
    }
}
ring-0.17.14/src/aead/aes/ffi.rs000064400000000000000000000160411046102023000143020ustar 00000000000000
// Copyright 2018-2024 Brian Smith.
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

use super::{Block, KeyBytes, Overlapping, BLOCK_LEN};
use crate::{bits::BitLength, c, error};
use core::{
    ffi::{c_int, c_uint},
    num::{NonZeroU32, NonZeroUsize},
};

/// nonce || big-endian counter.
#[repr(transparent)]
pub(in super::super) struct Counter(pub(super) [u8; BLOCK_LEN]);

// Keep this in sync with AES_KEY in aes.h.
#[repr(C)]
#[derive(Clone)]
pub(in super::super) struct AES_KEY {
    pub rd_key: [u32; 4 * (MAX_ROUNDS + 1)],
    pub rounds: c_uint,
}

// Keep this in sync with `AES_MAXNR` in aes.h.
const MAX_ROUNDS: usize = 14;

impl AES_KEY {
    #[inline]
    pub(super) unsafe fn new(
        f: unsafe extern "C" fn(*const u8, BitLength, *mut AES_KEY) -> c_int,
        bytes: KeyBytes<'_>,
    ) -> Result<Self, error::Unspecified> {
        let mut key = Self {
            rd_key: [0; 4 * (MAX_ROUNDS + 1)],
            rounds: 0,
        };

        let (bytes, key_bits) = match bytes {
            KeyBytes::AES_128(bytes) => (&bytes[..], BitLength::from_bits(128)),
            KeyBytes::AES_256(bytes) => (&bytes[..], BitLength::from_bits(256)),
        };

        // Unusually, in this case zero means success and non-zero means failure.
        if 0 == unsafe { f(bytes.as_ptr(), key_bits, &mut key) } {
            debug_assert_ne!(key.rounds, 0); // Sanity check initialization.
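            // For reference (descriptive note, not load-bearing): for the key
            // sizes accepted above, the underlying C `set_encrypt_key`
            // implementations are expected to set `rounds` to 10 for AES-128
            // and 14 for AES-256, which is why `MAX_ROUNDS` is 14. The
            // assertion only checks that the field was initialized at all.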
Ok(key) } else { Err(error::Unspecified) } } } #[cfg(all(target_arch = "arm", target_endian = "little"))] impl AES_KEY { pub(super) unsafe fn derive( f: for<'a> unsafe extern "C" fn(*mut AES_KEY, &'a AES_KEY), src: &Self, ) -> Self { let mut r = AES_KEY { rd_key: [0u32; 4 * (MAX_ROUNDS + 1)], rounds: 0, }; unsafe { f(&mut r, src) }; r } pub(super) fn rounds(&self) -> u32 { self.rounds } } // SAFETY: // * The function `$name` must read `bits` bits from `user_key`; `bits` will // always be a valid AES key length, i.e. a whole number of bytes. // * `$name` must set `key.rounds` to the value expected by the corresponding // encryption/decryption functions and return 0, or otherwise must return // non-zero to indicate failure. // * `$name` may inspect CPU features. // // In BoringSSL, the C prototypes for these are in // crypto/fipsmodule/aes/internal.h. macro_rules! set_encrypt_key { ( $name:ident, $key_bytes:expr $(,)? ) => {{ use crate::bits::BitLength; use core::ffi::c_int; prefixed_extern! { fn $name(user_key: *const u8, bits: BitLength, key: *mut AES_KEY) -> c_int; } $crate::aead::aes::ffi::AES_KEY::new($name, $key_bytes) }}; } macro_rules! encrypt_block { ($name:ident, $block:expr, $key:expr) => {{ use crate::aead::aes::{ffi::AES_KEY, Block}; prefixed_extern! { fn $name(a: &Block, r: *mut Block, key: &AES_KEY); } $key.encrypt_block($name, $block) }}; } impl AES_KEY { #[inline] pub(super) unsafe fn encrypt_block( &self, f: unsafe extern "C" fn(&Block, *mut Block, &AES_KEY), a: Block, ) -> Block { let mut result = core::mem::MaybeUninit::uninit(); unsafe { f(&a, result.as_mut_ptr(), self); result.assume_init() } } } /// SAFETY: /// * The caller must ensure that `$key` was initialized with the /// `set_encrypt_key!` invocation that `$name` requires. /// * The caller must ensure that fhe function `$name` satisfies the conditions /// for the `f` parameter to `ctr32_encrypt_blocks`. macro_rules! ctr32_encrypt_blocks { ($name:ident, $in_out:expr, $key:expr, $ctr:expr $(,)? ) => {{ use crate::{ aead::aes::{ffi::AES_KEY, Counter, BLOCK_LEN}, c, }; prefixed_extern! { fn $name( input: *const [u8; BLOCK_LEN], output: *mut [u8; BLOCK_LEN], blocks: c::NonZero_size_t, key: &AES_KEY, ivec: &Counter, ); } $key.ctr32_encrypt_blocks($name, $in_out, $ctr) }}; } impl AES_KEY { /// SAFETY: /// * `f` must not read more than `blocks` blocks from `input`. /// * `f` must write exactly `block` blocks to `output`. /// * In particular, `f` must handle blocks == 0 without reading from `input` /// or writing to `output`. /// * `f` must support the input overlapping with the output exactly or /// with any nonnegative offset `n` (i.e. `input == output.add(n)`); /// `f` does NOT need to support the cases where input < output. /// * `key` must have been initialized with the `set_encrypt_key!` invocation /// that corresponds to `f`. /// * `f` may inspect CPU features. #[inline] pub(super) unsafe fn ctr32_encrypt_blocks( &self, f: unsafe extern "C" fn( input: *const [u8; BLOCK_LEN], output: *mut [u8; BLOCK_LEN], blocks: c::NonZero_size_t, key: &AES_KEY, ivec: &Counter, ), in_out: Overlapping<'_>, ctr: &mut Counter, ) { in_out.with_input_output_len(|input, output, len| { debug_assert_eq!(len % BLOCK_LEN, 0); let blocks = match NonZeroUsize::new(len / BLOCK_LEN) { Some(blocks) => blocks, None => { return; } }; let input: *const [u8; BLOCK_LEN] = input.cast(); let output: *mut [u8; BLOCK_LEN] = output.cast(); let blocks_u32: NonZeroU32 = blocks.try_into().unwrap(); // SAFETY: // * `input` points to `blocks` blocks. 
// * `output` points to space for `blocks` blocks to be written. // * input == output.add(n), where n == src.start, and the caller is // responsible for ensuing this sufficient for `f` to work correctly. // * `blocks` is non-zero so `f` doesn't have to work for empty slices. // * The caller is responsible for ensuring `key` was initialized by the // `set_encrypt_key!` invocation required by `f`. unsafe { f(input, output, blocks, self, ctr); } ctr.increment_by_less_safe(blocks_u32); }); } } ring-0.17.14/src/aead/aes/hw.rs000064400000000000000000000070551046102023000141610ustar 00000000000000// Copyright 2018-2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #![cfg(any( all(target_arch = "aarch64", target_endian = "little"), target_arch = "x86", target_arch = "x86_64" ))] use super::{Block, Counter, EncryptBlock, EncryptCtr32, Iv, KeyBytes, Overlapping, AES_KEY}; use crate::{cpu, error}; use cfg_if::cfg_if; cfg_if! { if #[cfg(all(target_arch = "aarch64", target_endian = "little"))] { pub(in super::super) type RequiredCpuFeatures = cpu::arm::Aes; pub(in super::super) type OptionalCpuFeatures = (); } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { use cpu::intel::{Aes, Avx, Ssse3}; // Some functions seem to have been written to require only SSE/SSE2 // but there seem to be no SSSE3-less CPUs with AES-NI, and we don't // have feature detection for SSE2. pub(in super::super) type RequiredCpuFeatures = (Aes, Ssse3); pub(in super::super) type OptionalCpuFeatures = Avx; } } #[derive(Clone)] pub struct Key { inner: AES_KEY, } impl Key { #[cfg(all(target_arch = "aarch64", target_endian = "little"))] pub(in super::super) fn new( bytes: KeyBytes<'_>, _required_cpu_features: RequiredCpuFeatures, _optional_cpu_features: Option, ) -> Result { let inner = unsafe { set_encrypt_key!(aes_hw_set_encrypt_key, bytes) }?; Ok(Self { inner }) } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub(in super::super) fn new( bytes: KeyBytes<'_>, (Aes { .. }, Ssse3 { .. }): RequiredCpuFeatures, optional_cpu_features: Option, ) -> Result { // Ssse3 is required, but upstream only uses this if there is also Avx; // presumably the base version is faster on pre-AVX CPUs. let inner = if let Some(Avx { .. }) = optional_cpu_features { unsafe { set_encrypt_key!(aes_hw_set_encrypt_key_alt, bytes) }? } else { unsafe { set_encrypt_key!(aes_hw_set_encrypt_key_base, bytes) }? 
}; Ok(Self { inner }) } #[cfg(any( all(target_arch = "aarch64", target_endian = "little"), target_arch = "x86_64" ))] #[must_use] pub(in super::super) fn inner_less_safe(&self) -> &AES_KEY { &self.inner } } impl EncryptBlock for Key { fn encrypt_block(&self, block: Block) -> Block { super::encrypt_block_using_encrypt_iv_xor_block(self, block) } fn encrypt_iv_xor_block(&self, iv: Iv, block: Block) -> Block { super::encrypt_iv_xor_block_using_ctr32(self, iv, block) } } impl EncryptCtr32 for Key { fn ctr32_encrypt_within(&self, in_out: Overlapping<'_>, ctr: &mut Counter) { unsafe { ctr32_encrypt_blocks!(aes_hw_ctr32_encrypt_blocks, in_out, &self.inner, ctr) } } } ring-0.17.14/src/aead/aes/vp.rs000064400000000000000000000120571046102023000141660ustar 00000000000000// Copyright 2018-2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #![cfg(any( all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little"), target_arch = "x86", target_arch = "x86_64" ))] use super::{Block, Counter, EncryptBlock, EncryptCtr32, Iv, KeyBytes, Overlapping, AES_KEY}; use crate::{cpu, error}; #[cfg(any( all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little") ))] type RequiredCpuFeatures = cpu::arm::Neon; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub(in super::super) type RequiredCpuFeatures = cpu::intel::Ssse3; #[derive(Clone)] pub(in super::super) struct Key { inner: AES_KEY, } impl Key { pub(in super::super) fn new( bytes: KeyBytes<'_>, _cpu: RequiredCpuFeatures, ) -> Result { let inner = unsafe { set_encrypt_key!(vpaes_set_encrypt_key, bytes) }?; Ok(Self { inner }) } } #[cfg(any( all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little"), target_arch = "x86_64" ))] impl EncryptBlock for Key { fn encrypt_block(&self, block: Block) -> Block { super::encrypt_block_using_encrypt_iv_xor_block(self, block) } fn encrypt_iv_xor_block(&self, iv: Iv, block: Block) -> Block { super::encrypt_iv_xor_block_using_ctr32(self, iv, block) } } #[cfg(any( all(target_arch = "aarch64", target_endian = "little"), target_arch = "x86_64" ))] impl EncryptCtr32 for Key { fn ctr32_encrypt_within(&self, in_out: Overlapping<'_>, ctr: &mut Counter) { unsafe { ctr32_encrypt_blocks!(vpaes_ctr32_encrypt_blocks, in_out, &self.inner, ctr) } } } #[cfg(all(target_arch = "arm", target_endian = "little"))] impl EncryptCtr32 for Key { fn ctr32_encrypt_within(&self, in_out: Overlapping<'_>, ctr: &mut Counter) { use super::{super::overlapping::IndexError, bs, BLOCK_LEN}; let in_out = { let (in_out, src) = in_out.into_slice_src_mut(); let blocks = in_out[src.clone()].len() / BLOCK_LEN; // bsaes operates in batches of 8 blocks. 
let bsaes_blocks = if blocks >= 8 && (blocks % 8) < 6 { // It's faster to use bsaes for all the full batches and then // switch to vpaes for the last partial batch (if any). blocks - (blocks % 8) } else if blocks >= 8 { // It's faster to let bsaes handle everything including // the last partial batch. blocks } else { // It's faster to let vpaes handle everything. 0 }; let bsaes_in_out_len = bsaes_blocks * BLOCK_LEN; let bs_in_out = Overlapping::new(&mut in_out[..(src.start + bsaes_in_out_len)], src.clone()) .unwrap_or_else(|IndexError { .. }| unreachable!()); // SAFETY: // * self.inner was initialized with `vpaes_set_encrypt_key` above, // as required by `bsaes_ctr32_encrypt_blocks_with_vpaes_key`. unsafe { bs::ctr32_encrypt_blocks_with_vpaes_key(bs_in_out, &self.inner, ctr); } Overlapping::new(&mut in_out[bsaes_in_out_len..], src) .unwrap_or_else(|IndexError { .. }| unreachable!()) }; // SAFETY: // * self.inner was initialized with `vpaes_set_encrypt_key` above, // as required by `vpaes_ctr32_encrypt_blocks`. // * `vpaes_ctr32_encrypt_blocks` satisfies the contract for // `ctr32_encrypt_blocks`. unsafe { ctr32_encrypt_blocks!(vpaes_ctr32_encrypt_blocks, in_out, &self.inner, ctr) } } } #[cfg(target_arch = "x86")] impl EncryptBlock for Key { fn encrypt_block(&self, block: Block) -> Block { unsafe { encrypt_block!(vpaes_encrypt, block, &self.inner) } } fn encrypt_iv_xor_block(&self, iv: Iv, block: Block) -> Block { super::encrypt_iv_xor_block_using_encrypt_block(self, iv, block) } } #[cfg(target_arch = "x86")] impl EncryptCtr32 for Key { fn ctr32_encrypt_within(&self, in_out: Overlapping<'_>, ctr: &mut Counter) { super::super::shift::shift_full_blocks(in_out, |input| { self.encrypt_iv_xor_block(ctr.increment(), *input) }); } } ring-0.17.14/src/aead/aes.rs000064400000000000000000000211051046102023000135330ustar 00000000000000// Copyright 2018-2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::{nonce::Nonce, overlapping, quic::Sample, NONCE_LEN}; use crate::{ bb, cpu::{self, GetFeature as _}, error, polyfill::unwrap_const, }; use cfg_if::cfg_if; use core::num::NonZeroU32; pub(super) use ffi::Counter; #[macro_use] mod ffi; mod bs; pub(super) mod fallback; pub(super) mod hw; pub(super) mod vp; pub type Overlapping<'o> = overlapping::Overlapping<'o, u8>; pub type OverlappingPartialBlock<'o> = overlapping::PartialBlock<'o, u8, BLOCK_LEN>; cfg_if! 
{ if #[cfg(any(all(target_arch = "aarch64", target_endian = "little"), target_arch = "x86_64"))] { pub(super) use ffi::AES_KEY; } else { use ffi::AES_KEY; } } #[derive(Clone)] pub(super) enum Key { #[cfg(any( all(target_arch = "aarch64", target_endian = "little"), target_arch = "x86_64", target_arch = "x86" ))] Hw(hw::Key), #[cfg(any( all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little"), target_arch = "x86", target_arch = "x86_64" ))] Vp(vp::Key), Fallback(fallback::Key), } impl Key { #[inline] pub fn new( bytes: KeyBytes<'_>, cpu_features: cpu::Features, ) -> Result { #[cfg(any( all(target_arch = "aarch64", target_endian = "little"), target_arch = "x86", target_arch = "x86_64" ))] if let Some(hw_features) = cpu_features.get_feature() { return Ok(Self::Hw(hw::Key::new( bytes, hw_features, cpu_features.get_feature(), )?)); } #[cfg(any( all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little"), target_arch = "x86_64", target_arch = "x86" ))] if let Some(vp_features) = cpu_features.get_feature() { return Ok(Self::Vp(vp::Key::new(bytes, vp_features)?)); } let _ = cpu_features; Ok(Self::Fallback(fallback::Key::new(bytes)?)) } #[inline] fn encrypt_block(&self, a: Block) -> Block { match self { #[cfg(any( all(target_arch = "aarch64", target_endian = "little"), target_arch = "x86_64", target_arch = "x86" ))] Key::Hw(inner) => inner.encrypt_block(a), #[cfg(any( all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little"), target_arch = "x86", target_arch = "x86_64" ))] Key::Vp(inner) => inner.encrypt_block(a), Key::Fallback(inner) => inner.encrypt_block(a), } } pub fn new_mask(&self, sample: Sample) -> [u8; 5] { let [b0, b1, b2, b3, b4, ..] = self.encrypt_block(sample); [b0, b1, b2, b3, b4] } } pub const AES_128_KEY_LEN: usize = 128 / 8; pub const AES_256_KEY_LEN: usize = 256 / 8; pub enum KeyBytes<'a> { AES_128(&'a [u8; AES_128_KEY_LEN]), AES_256(&'a [u8; AES_256_KEY_LEN]), } // `Counter` is `ffi::Counter` as its representation is dictated by its use in // the FFI. impl Counter { pub fn one(nonce: Nonce) -> Self { let mut value = [0u8; BLOCK_LEN]; value[..NONCE_LEN].copy_from_slice(nonce.as_ref()); value[BLOCK_LEN - 1] = 1; Self(value) } pub fn increment(&mut self) -> Iv { const ONE: NonZeroU32 = unwrap_const(NonZeroU32::new(1)); let iv = Iv(self.0); self.increment_by_less_safe(ONE); iv } pub(super) fn increment_by_less_safe(&mut self, increment_by: NonZeroU32) { let [.., c0, c1, c2, c3] = &mut self.0; let old_value: u32 = u32::from_be_bytes([*c0, *c1, *c2, *c3]); let new_value = old_value.wrapping_add(increment_by.get()); [*c0, *c1, *c2, *c3] = u32::to_be_bytes(new_value); } } /// The IV for a single block encryption. /// /// Intentionally not `Clone` to ensure each is used only once. 
pub struct Iv(Block); impl From for Iv { fn from(counter: Counter) -> Self { Self(counter.0) } } pub(super) type Block = [u8; BLOCK_LEN]; pub(super) const BLOCK_LEN: usize = 16; pub(super) const ZERO_BLOCK: Block = [0u8; BLOCK_LEN]; pub(super) trait EncryptBlock { fn encrypt_block(&self, block: Block) -> Block; fn encrypt_iv_xor_block(&self, iv: Iv, block: Block) -> Block; } pub(super) trait EncryptCtr32 { fn ctr32_encrypt_within(&self, in_out: Overlapping<'_>, ctr: &mut Counter); } #[allow(dead_code)] fn encrypt_block_using_encrypt_iv_xor_block(key: &impl EncryptBlock, block: Block) -> Block { key.encrypt_iv_xor_block(Iv(block), ZERO_BLOCK) } fn encrypt_iv_xor_block_using_encrypt_block( key: &impl EncryptBlock, iv: Iv, block: Block, ) -> Block { let encrypted_iv = key.encrypt_block(iv.0); bb::xor_16(encrypted_iv, block) } #[allow(dead_code)] fn encrypt_iv_xor_block_using_ctr32(key: &impl EncryptCtr32, iv: Iv, mut block: Block) -> Block { let mut ctr = Counter(iv.0); // This is OK because we're only encrypting one block. key.ctr32_encrypt_within(block.as_mut().into(), &mut ctr); block } #[cfg(test)] mod tests { use super::*; use crate::testutil as test; #[test] pub fn test_aes() { test::run(test_vector_file!("aes_tests.txt"), |section, test_case| { assert_eq!(section, ""); let key = consume_key(test_case, "Key"); let input = test_case.consume_bytes("Input"); let block: Block = input.as_slice().try_into()?; let expected_output = test_case.consume_bytes("Output"); let output = key.encrypt_block(block); assert_eq!(output.as_ref(), &expected_output[..]); Ok(()) }) } fn consume_key(test_case: &mut test::TestCase, name: &str) -> Key { let key = test_case.consume_bytes(name); let key = &key[..]; let key = match key.len() { 16 => KeyBytes::AES_128(key.try_into().unwrap()), 32 => KeyBytes::AES_256(key.try_into().unwrap()), _ => unreachable!(), }; Key::new(key, cpu::features()).unwrap() } } // These AES-GCM-specific tests are here instead of in `aes_gcm` because // `Counter`'s API isn't visible (enough) to aes_gcm. #[cfg(test)] mod aes_gcm_tests { use super::{super::aes_gcm::MAX_IN_OUT_LEN, *}; use core::num::NonZeroU32; #[test] fn test_aes_gcm_counter_blocks_max() { test_aes_gcm_counter_blocks(MAX_IN_OUT_LEN, &[0, 0, 0, 0]); } #[test] fn test_aes_gcm_counter_blocks_max_minus_one() { test_aes_gcm_counter_blocks(MAX_IN_OUT_LEN - BLOCK_LEN, &[0xff, 0xff, 0xff, 0xff]); } fn test_aes_gcm_counter_blocks(in_out_len: usize, expected_final_counter: &[u8; 4]) { fn ctr32(ctr: &Counter) -> &[u8; 4] { (&ctr.0[12..]).try_into().unwrap() } let rounded_down = in_out_len / BLOCK_LEN; let blocks = rounded_down + (if in_out_len % BLOCK_LEN == 0 { 0 } else { 1 }); let blocks = u32::try_from(blocks) .ok() .and_then(NonZeroU32::new) .unwrap(); let nonce = Nonce::assume_unique_for_key([1; 12]); let mut ctr = Counter::one(nonce); assert_eq!(ctr32(&ctr), &[0, 0, 0, 1]); let _tag_iv = ctr.increment(); assert_eq!(ctr32(&ctr), &[0, 0, 0, 2]); ctr.increment_by_less_safe(blocks); // `MAX_IN_OUT_LEN` is less on 32-bit targets, so we don't even get // close to wrapping, but run the tests on them anyway. #[cfg(target_pointer_width = "64")] assert_eq!(ctr32(&ctr), expected_final_counter); #[cfg(target_pointer_width = "32")] let _ = expected_final_counter; } } ring-0.17.14/src/aead/aes_gcm/aarch64.rs000064400000000000000000000064041046102023000156160ustar 00000000000000// Copyright 2015-2025 Brian Smith. 
// // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #![cfg(all(target_arch = "aarch64", target_endian = "little"))] use super::{aes, gcm, Counter, BLOCK_LEN}; use crate::{aead::aes::Overlapping, bits::BitLength, polyfill::slice::AsChunksMut}; use core::num::NonZeroU64; pub(super) fn seal_whole( aes_key: &aes::hw::Key, auth: &mut gcm::Context, ctr: &mut Counter, mut in_out: AsChunksMut, ) { let whole_block_bits = auth.in_out_whole_block_bits(); let whole_block_bits_u64: BitLength = whole_block_bits.into(); if let Ok(whole_block_bits) = whole_block_bits_u64.try_into() { let (htable, xi) = auth.inner(); prefixed_extern! { fn aes_gcm_enc_kernel( input: *const [u8; BLOCK_LEN], in_bits: BitLength, output: *mut [u8; BLOCK_LEN], Xi: &mut gcm::Xi, ivec: &mut Counter, key: &aes::AES_KEY, Htable: &gcm::HTable); } unsafe { aes_gcm_enc_kernel( in_out.as_ptr(), whole_block_bits, in_out.as_mut_ptr(), xi, ctr, aes_key.inner_less_safe(), htable, ) } } } pub(super) fn open_whole( aes_key: &aes::hw::Key, auth: &mut gcm::Context, in_out: Overlapping, ctr: &mut Counter, ) { // Precondition. TODO: Create an overlapping::AsChunks for this. assert_eq!(in_out.len() % BLOCK_LEN, 0); in_out.with_input_output_len(|input, output, _len| { let whole_block_bits = auth.in_out_whole_block_bits(); let whole_block_bits_u64: BitLength = whole_block_bits.into(); if let Ok(whole_block_bits) = whole_block_bits_u64.try_into() { let (htable, xi) = auth.inner(); prefixed_extern! { fn aes_gcm_dec_kernel( input: *const u8, in_bits: BitLength, output: *mut u8, Xi: &mut gcm::Xi, ivec: &mut Counter, key: &aes::AES_KEY, Htable: &gcm::HTable); } unsafe { aes_gcm_dec_kernel( input, whole_block_bits, output, xi, ctr, aes_key.inner_less_safe(), htable, ) } } }) } ring-0.17.14/src/aead/aes_gcm/aeshwclmulmovbe.rs000064400000000000000000000123261046102023000175630ustar 00000000000000// Copyright 2015-2025 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
#![cfg(target_arch = "x86_64")] use super::{ super::overlapping::IndexError, aes::{self, Counter, EncryptCtr32, Overlapping, OverlappingPartialBlock}, gcm, Aad, Tag, }; use crate::{ c, error::{self, InputTooLongError}, polyfill::slice, }; use core::ops::RangeFrom; #[inline(never)] pub(super) fn seal( aes_key: &aes::hw::Key, gcm_key: &gcm::clmulavxmovbe::Key, mut ctr: Counter, tag_iv: aes::Iv, aad: Aad<&[u8]>, in_out: &mut [u8], ) -> Result { prefixed_extern! { // `HTable` and `Xi` should be 128-bit aligned. TODO: Can we shrink `HTable`? The // assembly says it needs just nine values in that array. fn aesni_gcm_encrypt( input: *const u8, output: *mut u8, len: c::size_t, key: &aes::AES_KEY, ivec: &mut Counter, Htable: &gcm::HTable, Xi: &mut gcm::Xi) -> c::size_t; } let mut auth = gcm::Context::new(gcm_key, aad, in_out.len())?; let (htable, xi) = auth.inner(); let processed = unsafe { aesni_gcm_encrypt( in_out.as_ptr(), in_out.as_mut_ptr(), in_out.len(), aes_key.inner_less_safe(), &mut ctr, htable, xi, ) }; let ramaining = match in_out.get_mut(processed..) { Some(remaining) => remaining, None => { // This can't happen. If it did, then the assembly already // caused a buffer overflow. unreachable!() } }; let (mut whole, remainder) = slice::as_chunks_mut(ramaining); aes_key.ctr32_encrypt_within(whole.as_flattened_mut().into(), &mut ctr); auth.update_blocks(whole.as_ref()); let remainder = OverlappingPartialBlock::new(remainder.into()) .unwrap_or_else(|InputTooLongError { .. }| unreachable!()); super::seal_finish(aes_key, auth, remainder, ctr, tag_iv) } #[inline(never)] pub(super) fn open( aes_key: &aes::hw::Key, gcm_key: &gcm::clmulavxmovbe::Key, mut ctr: Counter, tag_iv: aes::Iv, aad: Aad<&[u8]>, in_out_slice: &mut [u8], src: RangeFrom, ) -> Result { prefixed_extern! { // `HTable` and `Xi` should be 128-bit aligned. TODO: Can we shrink `HTable`? The // assembly says it needs just nine values in that array. fn aesni_gcm_decrypt( input: *const u8, output: *mut u8, len: c::size_t, key: &aes::AES_KEY, ivec: &mut Counter, Htable: &gcm::HTable, Xi: &mut gcm::Xi) -> c::size_t; } let in_out = Overlapping::new(in_out_slice, src.clone()).map_err(error::erase::)?; let mut auth = gcm::Context::new(gcm_key, aad, in_out.len())?; let processed = in_out.with_input_output_len(|input, output, len| { let (htable, xi) = auth.inner(); unsafe { aesni_gcm_decrypt( input, output, len, aes_key.inner_less_safe(), &mut ctr, htable, xi, ) } }); let in_out_slice = in_out_slice.get_mut(processed..).unwrap_or_else(|| { // This can't happen. If it did, then the assembly already // caused a buffer overflow. unreachable!() }); // Authenticate any remaining whole blocks. let in_out = Overlapping::new(in_out_slice, src.clone()).unwrap_or_else(|IndexError { .. }| { // This can't happen. If it did, then the assembly already // overwrote part of the remaining input. unreachable!() }); let (whole, _) = slice::as_chunks(in_out.input()); auth.update_blocks(whole); let whole_len = whole.as_flattened().len(); // Decrypt any remaining whole blocks. let whole = Overlapping::new(&mut in_out_slice[..(src.start + whole_len)], src.clone()) .map_err(error::erase::)?; aes_key.ctr32_encrypt_within(whole, &mut ctr); let in_out_slice = match in_out_slice.get_mut(whole_len..) { Some(partial) => partial, None => unreachable!(), }; let in_out = Overlapping::new(in_out_slice, src).unwrap_or_else(|IndexError { .. }| unreachable!()); let in_out = OverlappingPartialBlock::new(in_out) .unwrap_or_else(|InputTooLongError { .. 
}| unreachable!()); super::open_finish(aes_key, auth, in_out, ctr, tag_iv) } ring-0.17.14/src/aead/aes_gcm/vaesclmulavx2.rs000064400000000000000000000061061046102023000171610ustar 00000000000000// Copyright 2015-2025 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #![cfg(target_arch = "x86_64")] use super::{aes, gcm, Counter, BLOCK_LEN}; use crate::{aead::aes::Overlapping, c, polyfill::slice::AsChunksMut}; use core::num::NonZeroU32; pub(super) fn seal_whole_vaes_clmul_avx2( aes_key: &aes::hw::Key, auth: &mut gcm::Context, ctr: &mut Counter, mut in_out: AsChunksMut, ) { prefixed_extern! { fn aes_gcm_enc_update_vaes_avx2( input: *const u8, output: *mut u8, len: c::size_t, key: &aes::AES_KEY, ivec: &Counter, Htable: &gcm::HTable, Xi: &mut gcm::Xi); } let in_out = in_out.as_flattened_mut(); // Precondition: Since we have a `gcm::Context` then the number of blocks // must fit in `u32`. let blocks = u32::try_from(in_out.len() / BLOCK_LEN).unwrap(); if let Some(blocks) = NonZeroU32::new(blocks) { let aes_key = aes_key.inner_less_safe(); let (htable, xi) = auth.inner(); let input = in_out.as_ptr(); let output = in_out.as_mut_ptr(); let len = in_out.len(); unsafe { aes_gcm_enc_update_vaes_avx2(input, output, len, aes_key, ctr, htable, xi) }; ctr.increment_by_less_safe(blocks); } } pub(super) fn open_whole_vaes_clmul_avx2( aes_key: &aes::hw::Key, auth: &mut gcm::Context, in_out: Overlapping, ctr: &mut Counter, ) { prefixed_extern! { fn aes_gcm_dec_update_vaes_avx2( input: *const u8, output: *mut u8, len: c::size_t, key: &aes::AES_KEY, ivec: &mut Counter, Htable: &gcm::HTable, Xi: &mut gcm::Xi); } // Precondition. TODO: Create an overlapping::AsChunks for this. assert_eq!(in_out.len() % BLOCK_LEN, 0); // Precondition: Since we have a `gcm::Context` then the number of blocks // must fit in `u32`. let blocks = u32::try_from(in_out.len() / BLOCK_LEN).unwrap(); if let Some(blocks) = NonZeroU32::new(blocks) { let aes_key = aes_key.inner_less_safe(); let (htable, xi) = auth.inner(); in_out.with_input_output_len(|input, output, len| unsafe { aes_gcm_dec_update_vaes_avx2(input, output, len, aes_key, ctr, htable, xi) }); ctr.increment_by_less_safe(blocks); } } ring-0.17.14/src/aead/aes_gcm.rs000064400000000000000000000413531046102023000143700ustar 00000000000000// Copyright 2015-2025 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::{ aes::{self, Counter, Overlapping, OverlappingPartialBlock, BLOCK_LEN, ZERO_BLOCK}, gcm, overlapping::IndexError, Aad, Nonce, Tag, }; use crate::{ cpu, error::{self, InputTooLongError}, polyfill::{slice, sliceutil::overwrite_at_start, usize_from_u64_saturated}, }; use core::ops::RangeFrom; #[cfg(any( all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little"), target_arch = "x86", target_arch = "x86_64" ))] use cpu::GetFeature as _; mod aarch64; mod aeshwclmulmovbe; mod vaesclmulavx2; #[derive(Clone)] pub(super) struct Key(DynKey); impl Key { pub(super) fn new( key: aes::KeyBytes, cpu_features: cpu::Features, ) -> Result { Ok(Self(DynKey::new(key, cpu_features)?)) } } #[derive(Clone)] enum DynKey { #[cfg(target_arch = "x86_64")] VAesClMulAvx2(Combo), #[cfg(target_arch = "x86_64")] AesHwClMulAvxMovbe(Combo), #[cfg(any( all(target_arch = "aarch64", target_endian = "little"), target_arch = "x86", target_arch = "x86_64" ))] AesHwClMul(Combo), #[cfg(any( all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little") ))] Simd(Combo), #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Simd(Combo), Fallback(Combo), } impl DynKey { fn new(key: aes::KeyBytes, cpu: cpu::Features) -> Result { let cpu = cpu.values(); #[cfg(target_arch = "x86_64")] if let Some((aes, gcm)) = cpu.get_feature() { // 14.3.1 Detection of VEX-Encoded AES and VPCLMULQDQ let aes_key = aes::hw::Key::new(key, aes, cpu.get_feature())?; let gcm_key_value = derive_gcm_key_value(&aes_key); let combo = if let Some(cpu) = cpu.get_feature() { let gcm_key = gcm::vclmulavx2::Key::new(gcm_key_value, cpu); Self::VAesClMulAvx2(Combo { aes_key, gcm_key }) } else if let Some(cpu) = cpu.get_feature() { let gcm_key = gcm::clmulavxmovbe::Key::new(gcm_key_value, cpu); Self::AesHwClMulAvxMovbe(Combo { aes_key, gcm_key }) } else { let gcm_key = gcm::clmul::Key::new(gcm_key_value, gcm); Self::AesHwClMul(Combo { aes_key, gcm_key }) }; return Ok(combo); } // x86_64 is handled above. 
#[cfg(any( all(target_arch = "aarch64", target_endian = "little"), target_arch = "x86" ))] if let (Some(aes), Some(gcm)) = (cpu.get_feature(), cpu.get_feature()) { let aes_key = aes::hw::Key::new(key, aes, cpu.get_feature())?; let gcm_key_value = derive_gcm_key_value(&aes_key); let gcm_key = gcm::clmul::Key::new(gcm_key_value, gcm); return Ok(Self::AesHwClMul(Combo { aes_key, gcm_key })); } #[cfg(any( all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little") ))] if let Some(cpu) = cpu.get_feature() { return Self::new_neon(key, cpu); } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] if let Some(cpu) = cpu.get_feature() { return Self::new_ssse3(key, cpu); } let _ = cpu; Self::new_fallback(key) } #[cfg(any( all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little") ))] #[cfg_attr(target_arch = "aarch64", inline(never))] fn new_neon(key: aes::KeyBytes, cpu: cpu::arm::Neon) -> Result { let aes_key = aes::vp::Key::new(key, cpu)?; let gcm_key_value = derive_gcm_key_value(&aes_key); let gcm_key = gcm::neon::Key::new(gcm_key_value, cpu); Ok(Self::Simd(Combo { aes_key, gcm_key })) } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[inline(never)] fn new_ssse3( key: aes::KeyBytes, cpu: aes::vp::RequiredCpuFeatures, ) -> Result { let aes_key = aes::vp::Key::new(key, cpu)?; let gcm_key_value = derive_gcm_key_value(&aes_key); let gcm_key = gcm::fallback::Key::new(gcm_key_value); Ok(Self::Simd(Combo { aes_key, gcm_key })) } #[cfg_attr( any( all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little"), target_arch = "x86", target_arch = "x86_64", ), inline(never) )] fn new_fallback(key: aes::KeyBytes) -> Result { let aes_key = aes::fallback::Key::new(key)?; let gcm_key_value = derive_gcm_key_value(&aes_key); let gcm_key = gcm::fallback::Key::new(gcm_key_value); Ok(Self::Fallback(Combo { aes_key, gcm_key })) } } fn derive_gcm_key_value(aes_key: &impl aes::EncryptBlock) -> gcm::KeyValue { gcm::KeyValue::new(aes_key.encrypt_block(ZERO_BLOCK)) } const CHUNK_BLOCKS: usize = 3 * 1024 / 16; #[inline(never)] pub(super) fn seal( Key(key): &Key, nonce: Nonce, aad: Aad<&[u8]>, in_out: &mut [u8], ) -> Result { let mut ctr = Counter::one(nonce); let tag_iv = ctr.increment(); match key { #[cfg(all(target_arch = "aarch64", target_endian = "little"))] DynKey::AesHwClMul(c) => { seal_whole_partial(c, aad, in_out, ctr, tag_iv, aarch64::seal_whole) } #[cfg(target_arch = "x86_64")] DynKey::VAesClMulAvx2(c) => seal_whole_partial( c, aad, in_out, ctr, tag_iv, vaesclmulavx2::seal_whole_vaes_clmul_avx2, ), #[cfg(target_arch = "x86_64")] DynKey::AesHwClMulAvxMovbe(Combo { aes_key, gcm_key }) => { aeshwclmulmovbe::seal(aes_key, gcm_key, ctr, tag_iv, aad, in_out) } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] DynKey::AesHwClMul(c) => seal_strided(c, aad, in_out, ctr, tag_iv), #[cfg(any( all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little"), target_arch = "x86_64", target_arch = "x86" ))] DynKey::Simd(c) => seal_strided(c, aad, in_out, ctr, tag_iv), DynKey::Fallback(c) => seal_strided(c, aad, in_out, ctr, tag_iv), } } #[cfg(any( all(target_arch = "aarch64", target_endian = "little"), target_arch = "x86_64" ))] fn seal_whole_partial( Combo { aes_key, gcm_key }: &Combo, aad: Aad<&[u8]>, in_out: &mut [u8], mut ctr: Counter, tag_iv: aes::Iv, seal_whole: impl FnOnce(&A, &mut gcm::Context, &mut Counter, 
slice::AsChunksMut), ) -> Result { let mut auth = gcm::Context::new(gcm_key, aad, in_out.len())?; let (whole, remainder) = slice::as_chunks_mut(in_out); seal_whole(aes_key, &mut auth, &mut ctr, whole); let remainder = OverlappingPartialBlock::new(remainder.into()) .unwrap_or_else(|InputTooLongError { .. }| unreachable!()); seal_finish(aes_key, auth, remainder, ctr, tag_iv) } #[cfg_attr( any( all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little"), target_arch = "x86", target_arch = "x86_64" ), inline(never) )] #[cfg_attr( any( all(target_arch = "aarch64", target_endian = "little"), target_arch = "x86_64" ), cold )] fn seal_strided< A: aes::EncryptBlock + aes::EncryptCtr32, G: gcm::UpdateBlock + gcm::UpdateBlocks, >( Combo { aes_key, gcm_key }: &Combo, aad: Aad<&[u8]>, in_out: &mut [u8], mut ctr: Counter, tag_iv: aes::Iv, ) -> Result { let mut auth = gcm::Context::new(gcm_key, aad, in_out.len())?; let (mut whole, remainder) = slice::as_chunks_mut(in_out); for mut chunk in whole.chunks_mut::() { aes_key.ctr32_encrypt_within(chunk.as_flattened_mut().into(), &mut ctr); auth.update_blocks(chunk.as_ref()); } let remainder = OverlappingPartialBlock::new(remainder.into()) .unwrap_or_else(|InputTooLongError { .. }| unreachable!()); seal_finish(aes_key, auth, remainder, ctr, tag_iv) } fn seal_finish( aes_key: &A, mut auth: gcm::Context, remainder: OverlappingPartialBlock<'_>, ctr: Counter, tag_iv: aes::Iv, ) -> Result { let remainder_len = remainder.len(); if remainder_len > 0 { let mut input = ZERO_BLOCK; overwrite_at_start(&mut input, remainder.input()); let mut output = aes_key.encrypt_iv_xor_block(ctr.into(), input); output[remainder_len..].fill(0); auth.update_block(output); remainder.overwrite_at_start(output); } Ok(finish(aes_key, auth, tag_iv)) } #[inline(never)] pub(super) fn open( Key(key): &Key, nonce: Nonce, aad: Aad<&[u8]>, in_out_slice: &mut [u8], src: RangeFrom, ) -> Result { let mut ctr = Counter::one(nonce); let tag_iv = ctr.increment(); match key { #[cfg(all(target_arch = "aarch64", target_endian = "little"))] DynKey::AesHwClMul(c) => { open_whole_partial(c, aad, in_out_slice, src, ctr, tag_iv, aarch64::open_whole) } #[cfg(target_arch = "x86_64")] DynKey::VAesClMulAvx2(c) => open_whole_partial( c, aad, in_out_slice, src, ctr, tag_iv, vaesclmulavx2::open_whole_vaes_clmul_avx2, ), #[cfg(target_arch = "x86_64")] DynKey::AesHwClMulAvxMovbe(Combo { aes_key, gcm_key }) => { aeshwclmulmovbe::open(aes_key, gcm_key, ctr, tag_iv, aad, in_out_slice, src) } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] DynKey::AesHwClMul(c) => open_strided(c, aad, in_out_slice, src, ctr, tag_iv), #[cfg(any( all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little"), target_arch = "x86_64", target_arch = "x86" ))] DynKey::Simd(c) => open_strided(c, aad, in_out_slice, src, ctr, tag_iv), DynKey::Fallback(c) => open_strided(c, aad, in_out_slice, src, ctr, tag_iv), } } #[cfg(any( all(target_arch = "aarch64", target_endian = "little"), target_arch = "x86_64" ))] fn open_whole_partial( Combo { aes_key, gcm_key }: &Combo, aad: Aad<&[u8]>, in_out_slice: &mut [u8], src: RangeFrom, mut ctr: Counter, tag_iv: aes::Iv, open_whole: impl FnOnce(&A, &mut gcm::Context, Overlapping, &mut Counter), ) -> Result { let in_out = Overlapping::new(in_out_slice, src.clone()).map_err(error::erase::)?; let mut auth = gcm::Context::new(gcm_key, aad, in_out.len())?; let remainder_len = in_out.len() % BLOCK_LEN; let in_out_slice_len 
= in_out_slice.len(); let whole_in_out_slice = &mut in_out_slice[..(in_out_slice_len - remainder_len)]; let whole = Overlapping::new(whole_in_out_slice, src.clone()) .unwrap_or_else(|IndexError { .. }| unreachable!()); let whole_len = whole.len(); open_whole(aes_key, &mut auth, whole, &mut ctr); let remainder = &mut in_out_slice[whole_len..]; let remainder = Overlapping::new(remainder, src).unwrap_or_else(|IndexError { .. }| unreachable!()); let remainder = OverlappingPartialBlock::new(remainder) .unwrap_or_else(|InputTooLongError { .. }| unreachable!()); open_finish(aes_key, auth, remainder, ctr, tag_iv) } #[cfg_attr( any( all( any( all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little") ), target_feature = "neon" ), all( any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse" ) ), inline(never) )] #[cfg_attr( any( all(target_arch = "aarch64", target_endian = "little"), target_arch = "x86_64" ), cold )] fn open_strided< A: aes::EncryptBlock + aes::EncryptCtr32, G: gcm::UpdateBlock + gcm::UpdateBlocks, >( Combo { aes_key, gcm_key }: &Combo, aad: Aad<&[u8]>, in_out_slice: &mut [u8], src: RangeFrom, mut ctr: Counter, tag_iv: aes::Iv, ) -> Result { let in_out = Overlapping::new(in_out_slice, src.clone()).map_err(error::erase::)?; let input = in_out.input(); let input_len = input.len(); let mut auth = gcm::Context::new(gcm_key, aad, input_len)?; let remainder_len = input_len % BLOCK_LEN; let whole_len = input_len - remainder_len; let in_prefix_len = src.start; { let mut chunk_len = CHUNK_BLOCKS * BLOCK_LEN; let mut output = 0; let mut input = in_prefix_len; loop { if whole_len - output < chunk_len { chunk_len = whole_len - output; } let ciphertext = &in_out_slice[input..][..chunk_len]; let (ciphertext, leftover) = slice::as_chunks(ciphertext); debug_assert_eq!(leftover.len(), 0); if ciphertext.is_empty() { break; } auth.update_blocks(ciphertext); let chunk = Overlapping::new( &mut in_out_slice[output..][..(chunk_len + in_prefix_len)], in_prefix_len.., ) .map_err(error::erase::)?; aes_key.ctr32_encrypt_within(chunk, &mut ctr); output += chunk_len; input += chunk_len; } } let in_out = Overlapping::new(&mut in_out_slice[whole_len..], src) .unwrap_or_else(|IndexError { .. }| unreachable!()); let in_out = OverlappingPartialBlock::new(in_out) .unwrap_or_else(|InputTooLongError { .. }| unreachable!()); open_finish(aes_key, auth, in_out, ctr, tag_iv) } fn open_finish( aes_key: &A, mut auth: gcm::Context, remainder: OverlappingPartialBlock<'_>, ctr: Counter, tag_iv: aes::Iv, ) -> Result { if remainder.len() > 0 { let mut input = ZERO_BLOCK; overwrite_at_start(&mut input, remainder.input()); auth.update_block(input); remainder.overwrite_at_start(aes_key.encrypt_iv_xor_block(ctr.into(), input)); } Ok(finish(aes_key, auth, tag_iv)) } fn finish( aes_key: &A, gcm_ctx: gcm::Context, tag_iv: aes::Iv, ) -> Tag { // Finalize the tag and return it. gcm_ctx.pre_finish(|pre_tag| Tag(aes_key.encrypt_iv_xor_block(tag_iv, pre_tag))) } pub(super) const MAX_IN_OUT_LEN: usize = super::max_input_len(BLOCK_LEN, 2); // [NIST SP800-38D] Section 5.2.1.1. Note that [RFC 5116 Section 5.1] and // [RFC 5116 Section 5.2] have an off-by-one error in `P_MAX`. 
// // [NIST SP800-38D]: // http://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38d.pdf // [RFC 5116 Section 5.1]: https://tools.ietf.org/html/rfc5116#section-5.1 // [RFC 5116 Section 5.2]: https://tools.ietf.org/html/rfc5116#section-5.2 const _MAX_INPUT_LEN_BOUNDED_BY_NIST: () = assert!(MAX_IN_OUT_LEN == usize_from_u64_saturated(((1u64 << 39) - 256) / 8)); #[derive(Copy, Clone)] pub(super) struct Combo { pub(super) aes_key: Aes, pub(super) gcm_key: Gcm, } ring-0.17.14/src/aead/algorithm.rs000064400000000000000000000165351046102023000147640ustar 00000000000000// Copyright 2015-2021 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use crate::{ bb, cpu, error::{self, InputTooLongError}, hkdf, }; use core::ops::RangeFrom; use super::{ aes, aes_gcm, chacha20_poly1305, nonce::{Nonce, NONCE_LEN}, overlapping::{IndexError, Overlapping}, Aad, KeyInner, Tag, TAG_LEN, }; impl hkdf::KeyType for &'static Algorithm { #[inline] fn len(&self) -> usize { self.key_len() } } /// An AEAD Algorithm. pub struct Algorithm { init: fn(key: &[u8], cpu_features: cpu::Features) -> Result, seal: fn( key: &KeyInner, nonce: Nonce, aad: Aad<&[u8]>, in_out: &mut [u8], cpu_features: cpu::Features, ) -> Result, open: fn( key: &KeyInner, nonce: Nonce, aad: Aad<&[u8]>, in_out: &mut [u8], src: RangeFrom, cpu_features: cpu::Features, ) -> Result, key_len: usize, id: AlgorithmID, } impl Algorithm { /// The length of the key. #[inline(always)] pub fn key_len(&self) -> usize { self.key_len } /// The length of a tag. /// /// See also `MAX_TAG_LEN`. #[inline(always)] pub fn tag_len(&self) -> usize { TAG_LEN } /// The length of the nonces. #[inline(always)] pub fn nonce_len(&self) -> usize { NONCE_LEN } pub(super) fn new_key( &self, key_bytes: &[u8], cpu_features: cpu::Features, ) -> Result { (self.init)(key_bytes, cpu_features) } pub(super) fn open_within<'io>( &self, key: &KeyInner, nonce: Nonce, aad: Aad<&[u8]>, received_tag: Tag, in_out: &'io mut [u8], src: RangeFrom, cpu_features: cpu::Features, ) -> Result<&'io mut [u8], error::Unspecified> { let ciphertext_len = in_out.get(src.clone()).ok_or(error::Unspecified)?.len(); let Tag(calculated_tag) = (self.open)(key, nonce, aad, in_out, src, cpu_features)?; if bb::verify_slices_are_equal(calculated_tag.as_ref(), received_tag.as_ref()).is_err() { // Zero out the plaintext so that it isn't accidentally leaked or used // after verification fails. It would be safest if we could check the // tag before decrypting, but some `open` implementations interleave // authentication with decryption for performance. for b in &mut in_out[..ciphertext_len] { *b = 0; } return Err(error::Unspecified); } // `ciphertext_len` is also the plaintext length. 
Ok(&mut in_out[..ciphertext_len]) } #[inline] pub(super) fn seal( &self, key: &KeyInner, nonce: Nonce, aad: Aad<&[u8]>, in_out: &mut [u8], cpu_features: cpu::Features, ) -> Result { (self.seal)(key, nonce, aad, in_out, cpu_features) } } derive_debug_via_id!(Algorithm); #[derive(Debug, Eq, PartialEq)] pub(super) enum AlgorithmID { AES_128_GCM, AES_256_GCM, CHACHA20_POLY1305, } impl PartialEq for Algorithm { fn eq(&self, other: &Self) -> bool { self.id == other.id } } impl Eq for Algorithm {} /// AES-128 in GCM mode with 128-bit tags and 96 bit nonces. pub static AES_128_GCM: Algorithm = Algorithm { key_len: aes::AES_128_KEY_LEN, init: aes_gcm_init_128, seal: aes_gcm_seal, open: aes_gcm_open, id: AlgorithmID::AES_128_GCM, }; /// AES-256 in GCM mode with 128-bit tags and 96 bit nonces. pub static AES_256_GCM: Algorithm = Algorithm { key_len: aes::AES_256_KEY_LEN, init: aes_gcm_init_256, seal: aes_gcm_seal, open: aes_gcm_open, id: AlgorithmID::AES_256_GCM, }; fn aes_gcm_init_128( key: &[u8], cpu_features: cpu::Features, ) -> Result { let key = key.try_into().map_err(|_| error::Unspecified)?; Ok(KeyInner::AesGcm(aes_gcm::Key::new( aes::KeyBytes::AES_128(key), cpu_features, )?)) } fn aes_gcm_init_256( key: &[u8], cpu_features: cpu::Features, ) -> Result { let key = key.try_into().map_err(|_| error::Unspecified)?; Ok(KeyInner::AesGcm(aes_gcm::Key::new( aes::KeyBytes::AES_256(key), cpu_features, )?)) } fn aes_gcm_seal( key: &KeyInner, nonce: Nonce, aad: Aad<&[u8]>, in_out: &mut [u8], _cpu_features: cpu::Features, ) -> Result { let key = match key { KeyInner::AesGcm(key) => key, _ => unreachable!(), }; aes_gcm::seal(key, nonce, aad, in_out) } pub(super) fn aes_gcm_open( key: &KeyInner, nonce: Nonce, aad: Aad<&[u8]>, in_out: &mut [u8], src: RangeFrom, _cpu_features: cpu::Features, ) -> Result { let key = match key { KeyInner::AesGcm(key) => key, _ => unreachable!(), }; aes_gcm::open(key, nonce, aad, in_out, src) } /// ChaCha20-Poly1305 as described in [RFC 8439]. /// /// The keys are 256 bits long and the nonces are 96 bits long. /// /// [RFC 8439]: https://tools.ietf.org/html/rfc8439 pub static CHACHA20_POLY1305: Algorithm = Algorithm { key_len: chacha20_poly1305::KEY_LEN, init: chacha20_poly1305_init, seal: chacha20_poly1305_seal, open: chacha20_poly1305_open, id: AlgorithmID::CHACHA20_POLY1305, }; /// Copies |key| into |ctx_buf|. fn chacha20_poly1305_init( key: &[u8], _cpu_features: cpu::Features, ) -> Result { let key: [u8; chacha20_poly1305::KEY_LEN] = key.try_into()?; Ok(KeyInner::ChaCha20Poly1305(chacha20_poly1305::Key::new(key))) } fn chacha20_poly1305_seal( key: &KeyInner, nonce: Nonce, aad: Aad<&[u8]>, in_out: &mut [u8], cpu_features: cpu::Features, ) -> Result { let key = match key { KeyInner::ChaCha20Poly1305(key) => key, _ => unreachable!(), }; chacha20_poly1305::seal(key, nonce, aad, in_out, cpu_features) .map_err(error::erase::) } fn chacha20_poly1305_open( key: &KeyInner, nonce: Nonce, aad: Aad<&[u8]>, in_out: &mut [u8], src: RangeFrom, cpu_features: cpu::Features, ) -> Result { let key = match key { KeyInner::ChaCha20Poly1305(key) => key, _ => unreachable!(), }; let in_out = Overlapping::new(in_out, src).map_err(error::erase::)?; chacha20_poly1305::open(key, nonce, aad, in_out, cpu_features) .map_err(error::erase::) } ring-0.17.14/src/aead/chacha/fallback.rs000064400000000000000000000076141046102023000157420ustar 00000000000000// Copyright 2021 Brian Smith. // Portions Copyright (c) 2014, Google Inc. 
// // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ // Adapted from the public domain, estream code by D. Bernstein. // Adapted from the BoringSSL crypto/chacha/chacha.c. use super::{super::overlapping::IndexError, Counter, Key, Overlapping, BLOCK_LEN}; use crate::{bb, polyfill::sliceutil}; use core::mem::size_of; pub(super) fn ChaCha20_ctr32(key: &Key, counter: Counter, mut in_out: Overlapping<'_>) { const SIGMA: [u32; 4] = [ u32::from_le_bytes(*b"expa"), u32::from_le_bytes(*b"nd 3"), u32::from_le_bytes(*b"2-by"), u32::from_le_bytes(*b"te k"), ]; let key = key.words_less_safe(); let counter = counter.into_words_less_safe(); let mut state = [ SIGMA[0], SIGMA[1], SIGMA[2], SIGMA[3], key[0], key[1], key[2], key[3], key[4], key[5], key[6], key[7], counter[0], counter[1], counter[2], counter[3], ]; let mut in_out_len = in_out.len(); let mut buf = [0u8; BLOCK_LEN]; while in_out_len > 0 { chacha_core(&mut buf, &state); state[12] += 1; debug_assert_eq!(in_out_len, in_out.len()); // Both branches do the same thing, but the duplication helps the // compiler optimize (vectorize) the `BLOCK_LEN` case. if in_out_len >= BLOCK_LEN { in_out = in_out .split_first_chunk::(|in_out| { bb::xor_assign_at_start(&mut buf, in_out.input()); sliceutil::overwrite_at_start(in_out.into_unwritten_output(), &buf); }) .unwrap_or_else(|IndexError { .. }| { // Since `in_out_len == in_out.len() && in_out_len >= BLOCK_LEN`. unreachable!() }); } else { bb::xor_assign_at_start(&mut buf, in_out.input()); sliceutil::overwrite_at_start(in_out.into_unwritten_output(), &buf); break; } in_out_len -= BLOCK_LEN; } } // Performs 20 rounds of ChaCha on `input`, storing the result in `output`. #[inline(always)] fn chacha_core(output: &mut [u8; BLOCK_LEN], input: &State) { let mut x = *input; for _ in (0..20).step_by(2) { quarterround(&mut x, 0, 4, 8, 12); quarterround(&mut x, 1, 5, 9, 13); quarterround(&mut x, 2, 6, 10, 14); quarterround(&mut x, 3, 7, 11, 15); quarterround(&mut x, 0, 5, 10, 15); quarterround(&mut x, 1, 6, 11, 12); quarterround(&mut x, 2, 7, 8, 13); quarterround(&mut x, 3, 4, 9, 14); } for (x, input) in x.iter_mut().zip(input.iter()) { *x = x.wrapping_add(*input); } output .chunks_exact_mut(size_of::()) .zip(x.iter()) .for_each(|(output, &x)| output.copy_from_slice(&x.to_le_bytes())); } #[inline(always)] fn quarterround(x: &mut State, a: usize, b: usize, c: usize, d: usize) { #[inline(always)] fn step(x: &mut State, a: usize, b: usize, c: usize, rotation: u32) { x[a] = x[a].wrapping_add(x[b]); x[c] = (x[c] ^ x[a]).rotate_left(rotation); } step(x, a, b, d, 16); step(x, c, d, b, 12); step(x, a, b, d, 8); step(x, c, d, b, 7); } type State = [u32; BLOCK_LEN / 4]; ring-0.17.14/src/aead/chacha/ffi.rs000064400000000000000000000052101046102023000147350ustar 00000000000000// Copyright 2016-2025 Brian Smith. 
// // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::{super::overlapping::Overlapping, Counter, Key}; // `unsafe { (N, C, InOut) => f }` means that the function `f` is safe to call // iff the in/out length is at least `N`, the CPU features `C` are available, // and the input type is `InOut`. If `f` supports overlapping input/output then // `InOut` should be `Overlapping<'_, u8>`; otherwise it should be `&mut [u8]`. macro_rules! chacha20_ctr32_ffi { ( unsafe { ($MIN_LEN:expr, $Cpu:ty, $InOut:ty) => $f:ident }, $key:expr, $counter:expr, $in_out:expr, $cpu:expr ) => {{ prefixed_extern! { fn $f( out: *mut u8, in_: *const u8, in_len: crate::c::size_t, key: &[u32; 8], counter: &crate::aead::chacha::Counter, ); } // SAFETY: The user asserts that $f has the signature above and is safe // to call if additionally we have a value of type `$Cpu` and an in/out // value of the indicated type, which we do. unsafe { crate::aead::chacha::ffi::chacha20_ctr32_ffi::<$InOut, $Cpu, $MIN_LEN>( $key, $counter, $in_out, $cpu, $f, ) } }}; } // Panics if `in_out.len() < MIN_LEN`. The caller should have guarded against // that so that the assertion gets optimized away. pub(super) unsafe fn chacha20_ctr32_ffi< 'o, InOut: 'o + Into>, Cpu, const MIN_LEN: usize, >( key: &Key, counter: Counter, in_out: InOut, cpu: Cpu, f: unsafe extern "C" fn(*mut u8, *const u8, crate::c::size_t, &[u32; 8], &Counter), ) { assert!(MIN_LEN > 0); let in_out: Overlapping<'_, u8> = in_out.into(); in_out.with_input_output_len(|input, output, len| { assert!(len >= MIN_LEN); let key = key.words_less_safe(); let _: Cpu = cpu; unsafe { f(output, input, len, key, &counter) } }); } ring-0.17.14/src/aead/chacha.rs000064400000000000000000000272401046102023000142000ustar 00000000000000// Copyright 2016 Brian Smith. // Portions Copyright (c) 2016, Google Inc. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::{overlapping, quic::Sample, Nonce}; use crate::cpu; use cfg_if::cfg_if; cfg_if! 
{ if #[cfg(any( all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little"), target_arch = "x86", target_arch = "x86_64" ))] { #[macro_use] mod ffi; #[cfg(any(target_arch = "x86", test))] mod fallback; } else { mod fallback; } } use crate::polyfill::ArraySplitMap; pub type Overlapping<'o> = overlapping::Overlapping<'o, u8>; #[derive(Clone)] pub struct Key { words: [u32; KEY_LEN / 4], } impl Key { pub(super) fn new(value: [u8; KEY_LEN]) -> Self { Self { words: value.array_split_map(u32::from_le_bytes), } } } impl Key { // Encrypts `in_out` with the counter 0 and returns counter 1, // where the counter is derived from the nonce `nonce`. #[inline] pub(super) fn encrypt_single_block_with_ctr_0( &self, nonce: Nonce, in_out: &mut [u8; N], cpu: cpu::Features, ) -> Counter { assert!(N <= BLOCK_LEN); let (zero, one) = Counter::zero_one_less_safe(nonce); self.encrypt(zero, in_out.as_mut().into(), cpu); one } #[inline] pub fn new_mask(&self, sample: Sample) -> [u8; 5] { let cpu = cpu::features(); // TODO: Remove this. let (ctr, nonce) = sample.split_at(4); let ctr = u32::from_le_bytes(ctr.try_into().unwrap()); let nonce = Nonce::assume_unique_for_key(nonce.try_into().unwrap()); let ctr = Counter::from_nonce_and_ctr(nonce, ctr); let mut out: [u8; 5] = [0; 5]; self.encrypt(ctr, out.as_mut().into(), cpu); out } #[inline(always)] pub(super) fn encrypt(&self, counter: Counter, in_out: Overlapping<'_>, cpu: cpu::Features) { cfg_if! { if #[cfg(all(target_arch = "aarch64", target_endian = "little"))] { use cpu::{GetFeature as _, arm::Neon}; const NEON_MIN_LEN: usize = 192 + 1; if in_out.len() >= NEON_MIN_LEN { if let Some(cpu) = cpu.get_feature() { return chacha20_ctr32_ffi!( unsafe { (NEON_MIN_LEN, Neon, Overlapping<'_>) => ChaCha20_ctr32_neon }, self, counter, in_out, cpu); } } if in_out.len() >= 1 { chacha20_ctr32_ffi!( unsafe { (1, (), Overlapping<'_>) => ChaCha20_ctr32_nohw }, self, counter, in_out, ()) } } else if #[cfg(all(target_arch = "arm", target_endian = "little"))] { use cpu::{GetFeature as _, arm::Neon}; const NEON_MIN_LEN: usize = 192 + 1; if in_out.len() >= NEON_MIN_LEN { if let Some(cpu) = cpu.get_feature() { return chacha20_ctr32_ffi!( unsafe { (NEON_MIN_LEN, Neon, &mut [u8]) => ChaCha20_ctr32_neon }, self, counter, in_out.copy_within(), cpu); } } if in_out.len() >= 1 { chacha20_ctr32_ffi!( unsafe { (1, (), &mut [u8]) => ChaCha20_ctr32_nohw }, self, counter, in_out.copy_within(), ()) } } else if #[cfg(target_arch = "x86")] { use cpu::{GetFeature as _, intel::Ssse3}; if in_out.len() >= 1 { if let Some(cpu) = cpu.get_feature() { chacha20_ctr32_ffi!( unsafe { (1, Ssse3, &mut [u8]) => ChaCha20_ctr32_ssse3 }, self, counter, in_out.copy_within(), cpu) } else { let _: cpu::Features = cpu; fallback::ChaCha20_ctr32(self, counter, in_out) } } } else if #[cfg(target_arch = "x86_64")] { use cpu::{GetFeature, intel::{Avx2, Ssse3}}; const SSE_MIN_LEN: usize = 128 + 1; // Also AVX2, SSSE3_4X, SSSE3 if in_out.len() >= SSE_MIN_LEN { let values = cpu.values(); if let Some(cpu) = values.get_feature() { return chacha20_ctr32_ffi!( unsafe { (SSE_MIN_LEN, Avx2, Overlapping<'_>) => ChaCha20_ctr32_avx2 }, self, counter, in_out, cpu); } if let Some(cpu) = values.get_feature() { return chacha20_ctr32_ffi!( unsafe { (SSE_MIN_LEN, Ssse3, Overlapping<'_>) => ChaCha20_ctr32_ssse3_4x }, self, counter, in_out, cpu); } } if in_out.len() >= 1 { chacha20_ctr32_ffi!( unsafe { (1, (), Overlapping<'_>) => ChaCha20_ctr32_nohw }, self, counter, in_out, ()) } } else { let _: 
cpu::Features = cpu; fallback::ChaCha20_ctr32(self, counter, in_out) } } } #[inline] pub(super) fn words_less_safe(&self) -> &[u32; KEY_LEN / 4] { &self.words } } /// Counter || Nonce, all native endian. #[repr(transparent)] pub struct Counter([u32; 4]); impl Counter { // Nonce-reuse: the caller must only use the first counter (0) for at most // a single block. fn zero_one_less_safe(nonce: Nonce) -> (Self, Self) { let ctr0 @ Self([_, n0, n1, n2]) = Self::from_nonce_and_ctr(nonce, 0); let ctr1 = Self([1, n0, n1, n2]); (ctr0, ctr1) } fn from_nonce_and_ctr(nonce: Nonce, ctr: u32) -> Self { let [n0, n1, n2] = nonce.as_ref().array_split_map(u32::from_le_bytes); Self([ctr, n0, n1, n2]) } /// This is "less safe" because it hands off management of the counter to /// the caller. #[cfg(any( test, not(any( all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little"), target_arch = "x86_64" )) ))] fn into_words_less_safe(self) -> [u32; 4] { self.0 } } pub const KEY_LEN: usize = 32; const BLOCK_LEN: usize = 64; #[cfg(test)] mod tests { extern crate alloc; use super::{super::overlapping::IndexError, *}; use crate::error; use crate::testutil as test; use alloc::vec; const MAX_ALIGNMENT_AND_OFFSET: (usize, usize) = (15, 259); const MAX_ALIGNMENT_AND_OFFSET_SUBSET: (usize, usize) = if cfg!(any(not(debug_assertions), feature = "slow_tests")) { MAX_ALIGNMENT_AND_OFFSET } else { (0, 0) }; #[test] fn chacha20_test_default() { // Always use `MAX_OFFSET` if we hav assembly code. let max_offset = if cfg!(any( all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little"), target_arch = "x86", target_arch = "x86_64" )) { MAX_ALIGNMENT_AND_OFFSET } else { MAX_ALIGNMENT_AND_OFFSET_SUBSET }; chacha20_test(max_offset, Key::encrypt); } // Smoketest the fallback implementation. #[test] fn chacha20_test_fallback() { chacha20_test(MAX_ALIGNMENT_AND_OFFSET_SUBSET, |key, ctr, in_out, _cpu| { fallback::ChaCha20_ctr32(key, ctr, in_out) }); } // Verifies the encryption is successful when done on overlapping buffers. // // On some branches of the 32-bit x86 and ARM assembly code the in-place // operation fails in some situations where the input/output buffers are // not exactly overlapping. Such failures are dependent not only on the // degree of overlapping but also the length of the data. `encrypt_within` // works around that. fn chacha20_test( max_alignment_and_offset: (usize, usize), f: impl for<'k, 'o> Fn(&'k Key, Counter, Overlapping<'o>, cpu::Features), ) { let cpu = cpu::features(); // Reuse a buffer to avoid slowing down the tests with allocations. let mut buf = vec![0u8; 1300]; test::run( test_vector_file!("chacha_tests.txt"), move |section, test_case| { assert_eq!(section, ""); let key = test_case.consume_bytes("Key"); let key: &[u8; KEY_LEN] = key.as_slice().try_into()?; let key = Key::new(*key); let ctr = test_case.consume_usize("Ctr"); let nonce = test_case.consume_bytes("Nonce"); let input = test_case.consume_bytes("Input"); let output = test_case.consume_bytes("Output"); // Run the test case over all prefixes of the input because the // behavior of ChaCha20 implementation changes dependent on the // length of the input. 
for len in 0..=input.len() { #[allow(clippy::cast_possible_truncation)] chacha20_test_case_inner( &key, &nonce, ctr as u32, &input[..len], &output[..len], &mut buf, max_alignment_and_offset, cpu, &f, ); } Ok(()) }, ); } fn chacha20_test_case_inner( key: &Key, nonce: &[u8], ctr: u32, input: &[u8], expected: &[u8], buf: &mut [u8], (max_alignment, max_offset): (usize, usize), cpu: cpu::Features, f: &impl for<'k, 'o> Fn(&'k Key, Counter, Overlapping<'o>, cpu::Features), ) { const ARBITRARY: u8 = 123; for alignment in 0..=max_alignment { buf[..alignment].fill(ARBITRARY); let buf = &mut buf[alignment..]; for offset in 0..=max_offset { let buf = &mut buf[..(offset + input.len())]; buf[..offset].fill(ARBITRARY); let src = offset..; buf[src.clone()].copy_from_slice(input); let ctr = Counter::from_nonce_and_ctr( Nonce::try_assume_unique_for_key(nonce).unwrap(), ctr, ); let in_out = Overlapping::new(buf, src) .map_err(error::erase::) .unwrap(); f(key, ctr, in_out, cpu); assert_eq!(&buf[..input.len()], expected) } } } } ring-0.17.14/src/aead/chacha20_poly1305/integrated.rs000064400000000000000000000164011046102023000200410ustar 00000000000000// Copyright 2015-2025 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::{ super::{NONCE_LEN, TAG_LEN}, chacha::Overlapping, check_input_lengths, Aad, InputTooLongError, Key, Nonce, Tag, KEY_LEN, }; use cfg_if::cfg_if; macro_rules! declare_open { ( $name:ident ) => { prefixed_extern! { fn $name( out_plaintext: *mut u8, ciphertext: *const u8, plaintext_len: usize, ad: *const u8, ad_len: usize, data: &mut InOut, ); } }; } macro_rules! declare_seal { ( $name:ident ) => { prefixed_extern! { fn $name( out_ciphertext: *mut u8, plaintext: *const u8, plaintext_len: usize, ad: *const u8, ad_len: usize, data: &mut InOut, ); } }; } cfg_if! { if #[cfg(all(target_arch = "aarch64", target_endian = "little"))] { use crate::cpu::arm::Neon; type RequiredCpuFeatures = Neon; type OptionalCpuFeatures = (); } else { use crate::cpu::intel::{Avx2, Bmi2, Sse41}; type RequiredCpuFeatures = Sse41; type OptionalCpuFeatures = (Avx2, Bmi2); } } pub(super) fn seal( Key(key): &Key, nonce: Nonce, aad: Aad<&[u8]>, in_out: &mut [u8], required_cpu_features: RequiredCpuFeatures, optional_cpu_features: Option, ) -> Result { check_input_lengths(aad, in_out)?; // XXX: BoringSSL uses `alignas(16)` on `key` instead of on the // structure, but Rust can't do that yet; see // https://github.com/rust-lang/rust/issues/73557. // // Keep in sync with the anonymous struct of BoringSSL's // `chacha20_poly1305_seal_data`. 
#[repr(align(16), C)] #[derive(Clone, Copy)] struct seal_data_in { key: [u32; KEY_LEN / 4], counter: u32, nonce: [u8; NONCE_LEN], extra_ciphertext: *const u8, extra_ciphertext_len: usize, } let mut data = InOut { input: seal_data_in { key: *key.words_less_safe(), counter: 0, nonce: *nonce.as_ref(), extra_ciphertext: core::ptr::null(), extra_ciphertext_len: 0, }, }; // Encrypts `plaintext_len` bytes from `plaintext` and writes them to `out_ciphertext`. let output = in_out.as_mut_ptr(); let input = in_out.as_ptr(); let len = in_out.len(); let ad = aad.as_ref().as_ptr(); let ad_len = aad.as_ref().len(); #[allow(clippy::needless_late_init)] let tag; cfg_if! { if #[cfg(all(target_arch = "aarch64", target_endian = "little"))] { declare_seal! { chacha20_poly1305_seal } let _: Neon = required_cpu_features; let _: Option<()> = optional_cpu_features; tag = unsafe { chacha20_poly1305_seal(output, input, len, ad, ad_len, &mut data); &data.out.tag }; } else { let _: Sse41 = required_cpu_features; if matches!(optional_cpu_features, Some((Avx2 { .. }, Bmi2 { .. }))) { declare_seal! { chacha20_poly1305_seal_avx2 } tag = unsafe { chacha20_poly1305_seal_avx2(output, input, len, ad, ad_len, &mut data); &data.out.tag }; } else { declare_seal! { chacha20_poly1305_seal_sse41 } tag = unsafe { chacha20_poly1305_seal_sse41(output, input, len, ad, ad_len, &mut data); &data.out.tag }; } } } Ok(Tag(*tag)) } pub(super) fn open( Key(key): &Key, nonce: Nonce, aad: Aad<&[u8]>, in_out: Overlapping<'_>, required_cpu_features: RequiredCpuFeatures, optional_cpu_features: Option, ) -> Result { check_input_lengths(aad, in_out.input())?; // XXX: BoringSSL uses `alignas(16)` on `key` instead of on the // structure, but Rust can't do that yet; see // https://github.com/rust-lang/rust/issues/73557. // // Keep in sync with the anonymous struct of BoringSSL's // `chacha20_poly1305_open_data`. #[derive(Copy, Clone)] #[repr(align(16), C)] struct open_data_in { key: [u32; KEY_LEN / 4], counter: u32, nonce: [u8; NONCE_LEN], } let mut data = InOut { input: open_data_in { key: *key.words_less_safe(), counter: 0, nonce: *nonce.as_ref(), }, }; in_out.with_input_output_len(|input, output, len| { let ad = aad.as_ref().as_ptr(); let ad_len = aad.as_ref().len(); #[allow(clippy::needless_late_init)] let tag; cfg_if! { if #[cfg(all(target_arch = "aarch64", target_endian = "little"))] { declare_open! { chacha20_poly1305_open } let _: Neon = required_cpu_features; let _: Option<()> = optional_cpu_features; tag = unsafe { chacha20_poly1305_open(output, input, len, ad, ad_len, &mut data); &data.out.tag }; } else { let _: Sse41 = required_cpu_features; if matches!(optional_cpu_features, Some((Avx2 { .. }, Bmi2 { .. }))) { declare_open! { chacha20_poly1305_open_avx2 } tag = unsafe { chacha20_poly1305_open_avx2(output, input, len, ad, ad_len, &mut data); &data.out.tag }; } else { declare_open! { chacha20_poly1305_open_sse41 } tag = unsafe { chacha20_poly1305_open_sse41(output, input, len, ad, ad_len, &mut data); &data.out.tag }; } } } Ok(Tag(*tag)) }) } // Keep in sync with BoringSSL's `chacha20_poly1305_open_data` and // `chacha20_poly1305_seal_data`. #[repr(C)] pub(super) union InOut where T: Copy, { pub(super) input: T, pub(super) out: Out, } // It isn't obvious whether the assembly code works for tags that aren't // 16-byte aligned. In practice it will always be 16-byte aligned because it // is embedded in a union where the other member of the union is 16-byte // aligned. 
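// [Editor's sketch, not part of the crate] The comment above relies on the
// rule that a `#[repr(C)]` union is at least as aligned as its most-aligned
// field, so the plain `[u8; TAG_LEN]` tag member inherits 16-byte alignment
// from its 16-byte-aligned sibling. A minimal standalone illustration
// (hypothetical names; the literal `16` stands in for `TAG_LEN`):
#[allow(dead_code)]
mod editor_union_alignment_sketch {
    #[derive(Clone, Copy)]
    #[repr(C, align(16))]
    struct Aligned16([u8; 16]);

    #[repr(C)]
    union TagUnion {
        aligned: Aligned16, // forces 16-byte alignment for the whole union
        tag: [u8; 16],      // 1-byte aligned on its own, 16-byte aligned here
    }

    // Compile-time check that the union (and hence the embedded tag) is
    // 16-byte aligned.
    const _: () = assert!(core::mem::align_of::<TagUnion>() == 16);
}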
#[derive(Clone, Copy)] #[repr(align(16), C)] pub(super) struct Out { pub(super) tag: [u8; TAG_LEN], } ring-0.17.14/src/aead/chacha20_poly1305/mod.rs000064400000000000000000000124441046102023000164750ustar 00000000000000// Copyright 2015-2025 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::{ chacha::{self, Counter, Overlapping}, poly1305, Aad, Nonce, Tag, }; use crate::{ cpu, error::InputTooLongError, polyfill::{slice, sliceutil, u64_from_usize, usize_from_u64_saturated}, }; use cfg_if::cfg_if; cfg_if! { if #[cfg(any( all(target_arch = "aarch64", target_endian = "little"), target_arch = "x86_64"))] { use cpu::GetFeature as _; mod integrated; } } pub(super) const KEY_LEN: usize = chacha::KEY_LEN; const MAX_IN_OUT_LEN: usize = super::max_input_len(64, 1); // https://tools.ietf.org/html/rfc8439#section-2.8 const _MAX_IN_OUT_LEN_BOUNDED_BY_RFC: () = assert!(MAX_IN_OUT_LEN == usize_from_u64_saturated(274_877_906_880u64)); #[derive(Clone)] pub(super) struct Key(chacha::Key); impl Key { pub(super) fn new(value: [u8; KEY_LEN]) -> Self { Self(chacha::Key::new(value)) } } pub(super) fn seal( key: &Key, nonce: Nonce, aad: Aad<&[u8]>, in_out: &mut [u8], cpu: cpu::Features, ) -> Result { #[cfg(any( all(target_arch = "aarch64", target_endian = "little"), target_arch = "x86_64" ))] if let Some(required) = cpu.get_feature() { return integrated::seal(key, nonce, aad, in_out, required, cpu.get_feature()); } seal_fallback(key, nonce, aad, in_out, cpu) } pub(super) fn seal_fallback( Key(chacha20_key): &Key, nonce: Nonce, aad: Aad<&[u8]>, in_out: &mut [u8], cpu: cpu::Features, ) -> Result { let (counter, poly1305_key) = begin(chacha20_key, nonce, aad, in_out, cpu)?; let mut auth = poly1305::Context::from_key(poly1305_key, cpu); poly1305_update_padded_16(&mut auth, aad.as_ref()); chacha20_key.encrypt(counter, in_out.into(), cpu); poly1305_update_padded_16(&mut auth, in_out); Ok(finish(auth, aad.as_ref().len(), in_out.len())) } pub(super) fn open( key: &Key, nonce: Nonce, aad: Aad<&[u8]>, in_out: Overlapping<'_>, cpu: cpu::Features, ) -> Result { #[cfg(any( all(target_arch = "aarch64", target_endian = "little"), target_arch = "x86_64" ))] if let Some(required) = cpu.get_feature() { return integrated::open(key, nonce, aad, in_out, required, cpu.get_feature()); } open_fallback(key, nonce, aad, in_out, cpu) } pub(super) fn open_fallback( Key(chacha20_key): &Key, nonce: Nonce, aad: Aad<&[u8]>, in_out: Overlapping<'_>, cpu: cpu::Features, ) -> Result { let (counter, poly1305_key) = begin(chacha20_key, nonce, aad, in_out.input(), cpu)?; let mut auth = poly1305::Context::from_key(poly1305_key, cpu); poly1305_update_padded_16(&mut auth, aad.as_ref()); poly1305_update_padded_16(&mut auth, in_out.input()); let in_out_len = in_out.len(); chacha20_key.encrypt(counter, in_out, cpu); Ok(finish(auth, aad.as_ref().len(), in_out_len)) 
} fn check_input_lengths(aad: Aad<&[u8]>, input: &[u8]) -> Result<(), InputTooLongError> { if input.len() > MAX_IN_OUT_LEN { return Err(InputTooLongError::new(input.len())); } // RFC 8439 Section 2.8 says the maximum AAD length is 2**64 - 1, which is // never larger than usize::MAX, so we don't need an explicit length // check. const _USIZE_BOUNDED_BY_U64: u64 = u64_from_usize(usize::MAX); let _ = aad; Ok(()) } // Also used by chacha20_poly1305_openssh. pub(super) fn begin( key: &chacha::Key, nonce: Nonce, aad: Aad<&[u8]>, input: &[u8], cpu: cpu::Features, ) -> Result<(Counter, poly1305::Key), InputTooLongError> { check_input_lengths(aad, input)?; let mut key_bytes = [0u8; poly1305::KEY_LEN]; let counter = key.encrypt_single_block_with_ctr_0(nonce, &mut key_bytes, cpu); let poly1305_key = poly1305::Key::new(key_bytes); Ok((counter, poly1305_key)) } fn finish(auth: poly1305::Context, aad_len: usize, in_out_len: usize) -> Tag { let mut block = [0u8; poly1305::BLOCK_LEN]; let (alen, clen) = block.split_at_mut(poly1305::BLOCK_LEN / 2); alen.copy_from_slice(&u64::to_le_bytes(u64_from_usize(aad_len))); clen.copy_from_slice(&u64::to_le_bytes(u64_from_usize(in_out_len))); auth.finish(&block) } #[inline] fn poly1305_update_padded_16(ctx: &mut poly1305::Context, input: &[u8]) { let (whole, remainder) = slice::as_chunks(input); ctx.update(whole); if !remainder.is_empty() { let mut block = [0u8; poly1305::BLOCK_LEN]; sliceutil::overwrite_at_start(&mut block, remainder); ctx.update_block(block); } } ring-0.17.14/src/aead/chacha20_poly1305_openssh.rs000064400000000000000000000167051046102023000174610ustar 00000000000000// Copyright 2016 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! The [chacha20-poly1305@openssh.com] AEAD-ish construct. //! //! This should only be used by SSH implementations. It has a similar, but //! different API from `ring::aead` because the construct cannot use the same //! API as `ring::aead` due to the way the construct handles the encrypted //! packet length. //! //! The concatenation of a and b is denoted `a||b`. `K_1` and `K_2` are defined //! in the [chacha20-poly1305@openssh.com] specification. `packet_length`, //! `padding_length`, `payload`, and `random padding` are defined in //! [RFC 4253]. The term `plaintext` is used as a shorthand for //! `padding_length||payload||random padding`. //! //! [chacha20-poly1305@openssh.com]: //! http://cvsweb.openbsd.org/cgi-bin/cvsweb/src/usr.bin/ssh/PROTOCOL.chacha20poly1305?annotate=HEAD //! [RFC 4253]: https://tools.ietf.org/html/rfc4253 use super::{ chacha::{self, *}, chacha20_poly1305, cpu, poly1305, Aad, Nonce, Tag, }; use crate::{ bb, error::{self, InputTooLongError}, polyfill::slice, }; /// A key for sealing packets. pub struct SealingKey { key: Key, } impl SealingKey { /// Constructs a new `SealingKey`. 
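// [Editor's sketch, not part of the crate] End-to-end usage of this module as
// a downstream consumer of the published crate might write it, based on the
// public API documented in this file. The key bytes and packet contents are
// placeholders; in real SSH they come from the key exchange and the packet
// encoder.
#[allow(dead_code)]
fn openssh_round_trip_sketch() -> Result<(), ring::error::Unspecified> {
    use ring::aead::chacha20_poly1305_openssh::{
        OpeningKey, SealingKey, KEY_LEN, PACKET_LENGTH_LEN, TAG_LEN,
    };

    // Placeholder K_2||K_1 key material (KEY_LEN bytes).
    let key_material = [0x42u8; KEY_LEN];
    let sequence_number = 7;

    // packet_length || padding_length || payload || random padding
    let body = b"\x04ping####"; // padding_length = 4, payload "ping", 4 padding bytes
    let mut packet = Vec::new();
    packet.extend_from_slice(&(body.len() as u32).to_be_bytes()); // PACKET_LENGTH_LEN bytes
    packet.extend_from_slice(body);

    // Seal in place; the tag is produced separately.
    let mut tag = [0u8; TAG_LEN];
    SealingKey::new(&key_material).seal_in_place(sequence_number, &mut packet, &mut tag);

    // The receiver first decrypts the length field, then authenticates and
    // decrypts the whole packet.
    let opening_key = OpeningKey::new(&key_material);
    let encrypted_len: [u8; PACKET_LENGTH_LEN] =
        packet[..PACKET_LENGTH_LEN].try_into().unwrap();
    let _decrypted_len = opening_key.decrypt_packet_length(sequence_number, encrypted_len);
    let plaintext = opening_key.open_in_place(sequence_number, &mut packet, &tag)?;
    assert_eq!(&plaintext[1..5], b"ping");
    Ok(())
}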
pub fn new(key_material: &[u8; KEY_LEN]) -> Self { Self { key: Key::new(key_material), } } /// Seals (encrypts and signs) a packet. /// /// On input, `plaintext_in_ciphertext_out` must contain the unencrypted /// `packet_length||plaintext` where `plaintext` is the /// `padding_length||payload||random padding`. It will be overwritten by /// `encrypted_packet_length||ciphertext`, where `encrypted_packet_length` /// is encrypted with `K_1` and `ciphertext` is encrypted by `K_2`. /// /// # Panics /// /// Panics if `plaintext_in_ciphertext_out.len() < PACKET_LENGTH_LEN`. /// /// Panics if `plaintext_in_ciphertext_out` is longer than the maximum /// input size for ChaCha20-Poly1305. Note that this limit is much, /// much larger than SSH's 256KB maximum record size. pub fn seal_in_place( &self, sequence_number: u32, plaintext_in_ciphertext_out: &mut [u8], tag_out: &mut [u8; TAG_LEN], ) { // XXX/TODO(SemVer): Refactor API to return an error. let (len_in_out, data_and_padding_in_out): (&mut [u8; PACKET_LENGTH_LEN], _) = slice::split_first_chunk_mut(plaintext_in_ciphertext_out).unwrap(); let cpu = cpu::features(); // XXX/TODO(SemVer): Refactor API to return an error. let (counter, poly_key) = chacha20_poly1305::begin( &self.key.k_2, make_nonce(sequence_number), Aad::from(len_in_out), data_and_padding_in_out, cpu, ) .map_err(error::erase::) .unwrap(); let _: Counter = self.key.k_1.encrypt_single_block_with_ctr_0( make_nonce(sequence_number), len_in_out, cpu, ); self.key .k_2 .encrypt(counter, data_and_padding_in_out.into(), cpu); let Tag(tag) = poly1305::sign(poly_key, plaintext_in_ciphertext_out, cpu); *tag_out = tag; } } /// A key for opening packets. pub struct OpeningKey { key: Key, } impl OpeningKey { /// Constructs a new `OpeningKey`. pub fn new(key_material: &[u8; KEY_LEN]) -> Self { Self { key: Key::new(key_material), } } /// Returns the decrypted, but unauthenticated, packet length. /// /// Importantly, the result won't be authenticated until `open_in_place` is /// called. pub fn decrypt_packet_length( &self, sequence_number: u32, encrypted_packet_length: [u8; PACKET_LENGTH_LEN], ) -> [u8; PACKET_LENGTH_LEN] { let cpu = cpu::features(); let mut packet_length = encrypted_packet_length; let _: Counter = self.key.k_1.encrypt_single_block_with_ctr_0( make_nonce(sequence_number), &mut packet_length, cpu, ); packet_length } /// Opens (authenticates and decrypts) a packet. /// /// `ciphertext_in_plaintext_out` must be of the form /// `encrypted_packet_length||ciphertext` where `ciphertext` is the /// encrypted `plaintext`. When the function succeeds the ciphertext is /// replaced by the plaintext and the result is `Ok(plaintext)`, where /// `plaintext` is `&ciphertext_in_plaintext_out[PACKET_LENGTH_LEN..]`; /// otherwise the contents of `ciphertext_in_plaintext_out` are unspecified /// and must not be used. pub fn open_in_place<'a>( &self, sequence_number: u32, ciphertext_in_plaintext_out: &'a mut [u8], tag: &[u8; TAG_LEN], ) -> Result<&'a [u8], error::Unspecified> { let (packet_length, after_packet_length): (&mut [u8; PACKET_LENGTH_LEN], _) = slice::split_first_chunk_mut(ciphertext_in_plaintext_out).ok_or(error::Unspecified)?; let cpu = cpu::features(); let (counter, poly_key) = chacha20_poly1305::begin( &self.key.k_2, make_nonce(sequence_number), Aad::from(packet_length), after_packet_length, cpu, ) .map_err(error::erase::)?; // We must verify the tag before decrypting so that // `ciphertext_in_plaintext_out` is unmodified if verification fails. // This is beyond what we guarantee. 
let calculated_tag = poly1305::sign(poly_key, ciphertext_in_plaintext_out, cpu); bb::verify_slices_are_equal(calculated_tag.as_ref(), tag)?; // Won't panic because the length was checked above. let after_packet_length = &mut ciphertext_in_plaintext_out[PACKET_LENGTH_LEN..]; self.key .k_2 .encrypt(counter, after_packet_length.into(), cpu); Ok(after_packet_length) } } struct Key { k_1: chacha::Key, k_2: chacha::Key, } impl Key { fn new(key_material: &[u8; KEY_LEN]) -> Self { // The first half becomes K_2 and the second half becomes K_1. let (k_2, k_1) = key_material.split_at(chacha::KEY_LEN); Self { k_1: chacha::Key::new(k_1.try_into().unwrap()), k_2: chacha::Key::new(k_2.try_into().unwrap()), } } } fn make_nonce(sequence_number: u32) -> Nonce { let [s0, s1, s2, s3] = sequence_number.to_be_bytes(); let nonce = [0, 0, 0, 0, 0, 0, 0, 0, s0, s1, s2, s3]; Nonce::assume_unique_for_key(nonce) } /// The length of key. pub const KEY_LEN: usize = chacha::KEY_LEN * 2; /// The length in bytes of the `packet_length` field in a SSH packet. pub const PACKET_LENGTH_LEN: usize = 4; // 32 bits /// The length in bytes of an authentication tag. pub const TAG_LEN: usize = super::TAG_LEN; ring-0.17.14/src/aead/gcm/clmul.rs000064400000000000000000000047751046102023000146630ustar 00000000000000// Copyright 2018-2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #![cfg(any( all(target_arch = "aarch64", target_endian = "little"), target_arch = "x86", target_arch = "x86_64" ))] use super::{ffi::KeyValue, HTable, UpdateBlock, Xi}; use crate::aead::gcm::ffi::BLOCK_LEN; use crate::cpu; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] use {super::UpdateBlocks, crate::polyfill::slice::AsChunks}; #[cfg(all(target_arch = "aarch64", target_endian = "little"))] pub(in super::super) type RequiredCpuFeatures = cpu::arm::PMull; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub(in super::super) type RequiredCpuFeatures = (cpu::intel::ClMul, cpu::intel::Ssse3); #[derive(Clone)] pub struct Key { h_table: HTable, } impl Key { #[cfg_attr(target_arch = "x86_64", inline(never))] pub(in super::super) fn new(value: KeyValue, _cpu: RequiredCpuFeatures) -> Self { Self { h_table: unsafe { htable_new!(gcm_init_clmul, value) }, } } #[cfg(target_arch = "aarch64")] pub(super) fn inner(&self) -> &HTable { &self.h_table } } impl UpdateBlock for Key { #[cfg(target_arch = "aarch64")] fn update_block(&self, xi: &mut Xi, a: [u8; BLOCK_LEN]) { prefixed_extern! 
{ fn gcm_gmult_clmul(xi: &mut Xi, Htable: &HTable); } xi.bitxor_assign(a); unsafe { self.h_table.gmult(gcm_gmult_clmul, xi) }; } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn update_block(&self, xi: &mut Xi, a: [u8; BLOCK_LEN]) { self.update_blocks(xi, (&a).into()) } } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] impl UpdateBlocks for Key { fn update_blocks(&self, xi: &mut Xi, input: AsChunks) { unsafe { ghash!(gcm_ghash_clmul, xi, &self.h_table, input) } } } ring-0.17.14/src/aead/gcm/clmulavxmovbe.rs000064400000000000000000000031721046102023000164210ustar 00000000000000// Copyright 2018-2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #![cfg(target_arch = "x86_64")] use super::{HTable, KeyValue, UpdateBlock, UpdateBlocks, Xi, BLOCK_LEN}; use crate::{cpu::intel, polyfill::slice::AsChunks}; #[derive(Clone)] pub struct Key { h_table: HTable, } impl Key { #[inline(never)] pub(in super::super) fn new( value: KeyValue, _required_cpu_features: (intel::ClMul, intel::Avx, intel::Movbe), ) -> Self { Self { h_table: unsafe { htable_new!(gcm_init_avx, value) }, } } pub(super) fn inner(&self) -> &HTable { &self.h_table } } impl UpdateBlock for Key { fn update_block(&self, xi: &mut Xi, a: [u8; BLOCK_LEN]) { self.update_blocks(xi, (&a).into()) } } impl UpdateBlocks for Key { fn update_blocks(&self, xi: &mut Xi, input: AsChunks) { unsafe { ghash!(gcm_ghash_avx, xi, self.inner(), input) } } } ring-0.17.14/src/aead/gcm/fallback.rs000064400000000000000000000210041046102023000152660ustar 00000000000000// Copyright (c) 2019, Google Inc. // Portions Copyright 2020-2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. // This file is based on BoringSSL's gcm_nohw.c. // This file contains a implementation of GHASH based on the notes // in https://bearssl.org/constanttime.html#ghash-for-gcm and the reduction // algorithm described in // https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf. // // Unlike the BearSSL notes, we use u128 in the 64-bit implementation. 
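// [Editor's sketch, not part of the crate] The 32-bit path below performs a
// carry-less ("polynomial") multiplication using ordinary integer multiplies
// by spacing the operand bits four apart, so at most 8 partial products land
// on any bit position and their carries never reach another position of the
// same residue class. A standalone copy of that trick, checked against a
// naive shift-and-xor carry-less multiply (hypothetical names, std-only):
#[allow(dead_code)]
mod editor_clmul_sketch {
    // Naive GF(2)[X] multiplication: XOR `a << i` for every set bit of `b`.
    fn clmul32_naive(a: u32, b: u32) -> u64 {
        let mut acc = 0u64;
        for i in 0..32 {
            if (b >> i) & 1 == 1 {
                acc ^= u64::from(a) << i;
            }
        }
        acc
    }

    // Same masking scheme as `gcm_mul32_nohw` below.
    fn clmul32_masked(a: u32, b: u32) -> u64 {
        fn mul(a: u32, b: u32) -> u64 {
            u64::from(a) * u64::from(b)
        }
        let (a0, a1, a2, a3) = (a & 0x11111111, a & 0x22222222, a & 0x44444444, a & 0x88888888);
        let (b0, b1, b2, b3) = (b & 0x11111111, b & 0x22222222, b & 0x44444444, b & 0x88888888);
        let c0 = mul(a0, b0) ^ mul(a1, b3) ^ mul(a2, b2) ^ mul(a3, b1);
        let c1 = mul(a0, b1) ^ mul(a1, b0) ^ mul(a2, b3) ^ mul(a3, b2);
        let c2 = mul(a0, b2) ^ mul(a1, b1) ^ mul(a2, b0) ^ mul(a3, b3);
        let c3 = mul(a0, b3) ^ mul(a1, b2) ^ mul(a2, b1) ^ mul(a3, b0);
        (c0 & 0x1111111111111111)
            | (c1 & 0x2222222222222222)
            | (c2 & 0x4444444444444444)
            | (c3 & 0x8888888888888888)
    }

    #[test]
    fn agrees_with_naive() {
        let cases = [(0u32, 0u32), (1, 1), (0xffff_ffff, 0xffff_ffff), (0x8000_0001, 0xdead_beef)];
        for &(a, b) in &cases {
            assert_eq!(clmul32_naive(a, b), clmul32_masked(a, b));
        }
    }
}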
use super::{ffi::U128, KeyValue, UpdateBlock, UpdateBlocks, Xi, BLOCK_LEN}; use crate::polyfill::{slice::AsChunks, ArraySplitMap as _}; #[derive(Clone)] pub struct Key { h: U128, } impl Key { pub(in super::super) fn new(value: KeyValue) -> Self { Self { h: init(value) } } } impl UpdateBlock for Key { fn update_block(&self, xi: &mut Xi, a: [u8; BLOCK_LEN]) { xi.bitxor_assign(a); gmult(xi, self.h); } } impl UpdateBlocks for Key { fn update_blocks(&self, xi: &mut Xi, input: AsChunks) { ghash(xi, self.h, input); } } #[cfg(target_pointer_width = "64")] fn gcm_mul64_nohw(a: u64, b: u64) -> (u64, u64) { #[allow(clippy::cast_possible_truncation)] #[inline(always)] fn lo(a: u128) -> u64 { a as u64 } #[inline(always)] fn hi(a: u128) -> u64 { lo(a >> 64) } #[inline(always)] fn mul(a: u64, b: u64) -> u128 { u128::from(a) * u128::from(b) } // One term every four bits means the largest term is 64/4 = 16, which barely // overflows into the next term. Using one term every five bits would cost 25 // multiplications instead of 16. It is faster to mask off the bottom four // bits of |a|, giving a largest term of 60/4 = 15, and apply the bottom bits // separately. let a0 = a & 0x1111111111111110; let a1 = a & 0x2222222222222220; let a2 = a & 0x4444444444444440; let a3 = a & 0x8888888888888880; let b0 = b & 0x1111111111111111; let b1 = b & 0x2222222222222222; let b2 = b & 0x4444444444444444; let b3 = b & 0x8888888888888888; let c0 = mul(a0, b0) ^ mul(a1, b3) ^ mul(a2, b2) ^ mul(a3, b1); let c1 = mul(a0, b1) ^ mul(a1, b0) ^ mul(a2, b3) ^ mul(a3, b2); let c2 = mul(a0, b2) ^ mul(a1, b1) ^ mul(a2, b0) ^ mul(a3, b3); let c3 = mul(a0, b3) ^ mul(a1, b2) ^ mul(a2, b1) ^ mul(a3, b0); // Multiply the bottom four bits of |a| with |b|. let a0_mask = 0u64.wrapping_sub(a & 1); let a1_mask = 0u64.wrapping_sub((a >> 1) & 1); let a2_mask = 0u64.wrapping_sub((a >> 2) & 1); let a3_mask = 0u64.wrapping_sub((a >> 3) & 1); let extra = u128::from(a0_mask & b) ^ (u128::from(a1_mask & b) << 1) ^ (u128::from(a2_mask & b) << 2) ^ (u128::from(a3_mask & b) << 3); let lo = (lo(c0) & 0x1111111111111111) ^ (lo(c1) & 0x2222222222222222) ^ (lo(c2) & 0x4444444444444444) ^ (lo(c3) & 0x8888888888888888) ^ lo(extra); let hi = (hi(c0) & 0x1111111111111111) ^ (hi(c1) & 0x2222222222222222) ^ (hi(c2) & 0x4444444444444444) ^ (hi(c3) & 0x8888888888888888) ^ hi(extra); (lo, hi) } #[cfg(not(target_pointer_width = "64"))] fn gcm_mul32_nohw(a: u32, b: u32) -> u64 { #[inline(always)] fn mul(a: u32, b: u32) -> u64 { u64::from(a) * u64::from(b) } // One term every four bits means the largest term is 32/4 = 8, which does not // overflow into the next term. let a0 = a & 0x11111111; let a1 = a & 0x22222222; let a2 = a & 0x44444444; let a3 = a & 0x88888888; let b0 = b & 0x11111111; let b1 = b & 0x22222222; let b2 = b & 0x44444444; let b3 = b & 0x88888888; let c0 = mul(a0, b0) ^ mul(a1, b3) ^ mul(a2, b2) ^ mul(a3, b1); let c1 = mul(a0, b1) ^ mul(a1, b0) ^ mul(a2, b3) ^ mul(a3, b2); let c2 = mul(a0, b2) ^ mul(a1, b1) ^ mul(a2, b0) ^ mul(a3, b3); let c3 = mul(a0, b3) ^ mul(a1, b2) ^ mul(a2, b1) ^ mul(a3, b0); (c0 & 0x1111111111111111) | (c1 & 0x2222222222222222) | (c2 & 0x4444444444444444) | (c3 & 0x8888888888888888) } #[cfg(not(target_pointer_width = "64"))] fn gcm_mul64_nohw(a: u64, b: u64) -> (u64, u64) { #[inline(always)] fn lo(a: u64) -> u32 { a as u32 } #[inline(always)] fn hi(a: u64) -> u32 { lo(a >> 32) } let a0 = lo(a); let a1 = hi(a); let b0 = lo(b); let b1 = hi(b); // Karatsuba multiplication. 
let lo = gcm_mul32_nohw(a0, b0); let hi = gcm_mul32_nohw(a1, b1); let mid = gcm_mul32_nohw(a0 ^ a1, b0 ^ b1) ^ lo ^ hi; (lo ^ (mid << 32), hi ^ (mid >> 32)) } fn init(value: KeyValue) -> U128 { let xi = value.into_inner(); // We implement GHASH in terms of POLYVAL, as described in RFC 8452. This // avoids a shift by 1 in the multiplication, needed to account for bit // reversal losing a bit after multiplication, that is, // rev128(X) * rev128(Y) = rev255(X*Y). // // Per Appendix A, we run mulX_POLYVAL. Note this is the same transformation // applied by |gcm_init_clmul|, etc. Note |Xi| has already been byteswapped. // // See also slide 16 of // https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf let mut lo = xi[1]; let mut hi = xi[0]; let mut carry = hi >> 63; carry = 0u64.wrapping_sub(carry); hi <<= 1; hi |= lo >> 63; lo <<= 1; // The irreducible polynomial is 1 + x^121 + x^126 + x^127 + x^128, so we // conditionally add 0xc200...0001. lo ^= carry & 1; hi ^= carry & 0xc200000000000000; // This implementation does not use the rest of |Htable|. U128 { hi, lo } } fn gcm_polyval_nohw(xi: &mut [u64; 2], h: U128) { // Karatsuba multiplication. The product of |Xi| and |H| is stored in |r0| // through |r3|. Note there is no byte or bit reversal because we are // evaluating POLYVAL. let (r0, mut r1) = gcm_mul64_nohw(xi[0], h.lo); let (mut r2, mut r3) = gcm_mul64_nohw(xi[1], h.hi); let (mut mid0, mut mid1) = gcm_mul64_nohw(xi[0] ^ xi[1], h.hi ^ h.lo); mid0 ^= r0 ^ r2; mid1 ^= r1 ^ r3; r2 ^= mid1; r1 ^= mid0; // Now we multiply our 256-bit result by x^-128 and reduce. |r2| and // |r3| shifts into position and we must multiply |r0| and |r1| by x^-128. We // have: // // 1 = x^121 + x^126 + x^127 + x^128 // x^-128 = x^-7 + x^-2 + x^-1 + 1 // // This is the GHASH reduction step, but with bits flowing in reverse. // The x^-7, x^-2, and x^-1 terms shift bits past x^0, which would require // another reduction steps. Instead, we gather the excess bits, incorporate // them into |r0| and |r1| and reduce once. See slides 17-19 // of https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf. r1 ^= (r0 << 63) ^ (r0 << 62) ^ (r0 << 57); // 1 r2 ^= r0; r3 ^= r1; // x^-1 r2 ^= r0 >> 1; r2 ^= r1 << 63; r3 ^= r1 >> 1; // x^-2 r2 ^= r0 >> 2; r2 ^= r1 << 62; r3 ^= r1 >> 2; // x^-7 r2 ^= r0 >> 7; r2 ^= r1 << 57; r3 ^= r1 >> 7; *xi = [r2, r3]; } fn gmult(xi: &mut Xi, h: U128) { with_swapped_xi(xi, |swapped| { gcm_polyval_nohw(swapped, h); }) } fn ghash(xi: &mut Xi, h: U128, input: AsChunks) { with_swapped_xi(xi, |swapped| { input.into_iter().for_each(|&input| { let input = input.array_split_map(u64::from_be_bytes); swapped[0] ^= input[1]; swapped[1] ^= input[0]; gcm_polyval_nohw(swapped, h); }); }); } #[inline] fn with_swapped_xi(Xi(xi): &mut Xi, f: impl FnOnce(&mut [u64; 2])) { let unswapped: [u64; 2] = xi.array_split_map(u64::from_be_bytes); let mut swapped: [u64; 2] = [unswapped[1], unswapped[0]]; f(&mut swapped); let (xi_0, xi_1) = xi.split_at_mut(BLOCK_LEN / 2); xi_0.copy_from_slice(&u64::to_be_bytes(swapped[1])); xi_1.copy_from_slice(&u64::to_be_bytes(swapped[0])); } ring-0.17.14/src/aead/gcm/ffi.rs000064400000000000000000000110441046102023000142760ustar 00000000000000// Copyright 2018 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. 
// // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use crate::{ bb, polyfill::{slice::AsChunks, ArraySplitMap}, }; pub(in super::super) const BLOCK_LEN: usize = 16; pub(in super::super) type Block = [u8; BLOCK_LEN]; pub(super) const ZERO_BLOCK: Block = [0u8; BLOCK_LEN]; #[cfg(any( all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little"), target_arch = "x86", target_arch = "x86_64" ))] macro_rules! htable_new { ( $name:ident, $value:expr $(,)? ) => {{ use crate::aead::gcm::ffi::HTable; prefixed_extern! { fn $name(HTable: &mut HTable, h: &[u64; 2]); } HTable::new($name, $value) }}; } /// SAFETY: /// * The function `$name` must meet the contract of the `f` paramweter of /// `ghash()`. #[cfg(any( all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little"), target_arch = "x86", target_arch = "x86_64" ))] macro_rules! ghash { ( $name:ident, $xi:expr, $h_table:expr, $input:expr $(,)? ) => {{ use crate::aead::gcm::ffi::{HTable, Xi}; prefixed_extern! { fn $name( xi: &mut Xi, Htable: &HTable, inp: *const u8, len: crate::c::NonZero_size_t, ); } $h_table.ghash($name, $xi, $input) }}; } pub(in super::super) struct KeyValue([u64; 2]); impl KeyValue { pub(in super::super) fn new(value: Block) -> Self { Self(value.array_split_map(u64::from_be_bytes)) } pub(super) fn into_inner(self) -> [u64; 2] { self.0 } } /// SAFETY: /// * `f` must read `len` bytes from `inp`; it may assume /// that `len` is a (non-zero) multiple of `BLOCK_LEN`. /// * `f` may inspect CPU features. #[cfg(any( all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little"), target_arch = "x86", target_arch = "x86_64" ))] impl HTable { pub(super) unsafe fn new( init: unsafe extern "C" fn(HTable: &mut HTable, &[u64; 2]), value: KeyValue, ) -> Self { let mut r = Self { Htable: [U128 { hi: 0, lo: 0 }; HTABLE_LEN], }; unsafe { init(&mut r, &value.0) }; r } #[cfg(any( all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little") ))] pub(super) unsafe fn gmult( &self, f: unsafe extern "C" fn(xi: &mut Xi, h_table: &HTable), xi: &mut Xi, ) { unsafe { f(xi, self) } } pub(super) unsafe fn ghash( &self, f: unsafe extern "C" fn( xi: &mut Xi, Htable: &HTable, inp: *const u8, len: crate::c::NonZero_size_t, ), xi: &mut Xi, input: AsChunks, ) { use core::num::NonZeroUsize; let input = input.as_flattened(); let input_len = match NonZeroUsize::new(input.len()) { Some(len) => len, None => { return; } }; // SAFETY: // * There are `input_len: NonZeroUsize` bytes available at `input` for // `f` to read. unsafe { f(xi, self, input.as_ptr(), input_len); } } } // The alignment is required by some assembly code, such as `ghash-ssse3-*`. 
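// [Editor's sketch, not part of the crate] The `ghash` wrapper above converts
// the input length to `NonZeroUsize` and returns early on empty input, so the
// zero-length case never crosses the FFI boundary. A standalone sketch of the
// same guard pattern with a stand-in routine (hypothetical names; the real
// callee here is assembly):
#[allow(dead_code)]
mod editor_nonzero_len_sketch {
    use core::num::NonZeroUsize;

    const BLOCK_LEN: usize = 16;

    /// Stand-in for an FFI routine that may assume `len` is a non-zero
    /// multiple of `BLOCK_LEN` bytes readable at `inp`.
    unsafe fn process_blocks(state: &mut [u8; BLOCK_LEN], inp: *const u8, len: NonZeroUsize) {
        let input = unsafe { core::slice::from_raw_parts(inp, len.get()) };
        for block in input.chunks_exact(BLOCK_LEN) {
            state.iter_mut().zip(block).for_each(|(s, b)| *s ^= *b);
        }
    }

    /// Safe wrapper: reject non-multiples and skip the call entirely for
    /// empty input instead of passing `len == 0` to the unsafe routine.
    fn process(state: &mut [u8; BLOCK_LEN], input: &[u8]) {
        assert_eq!(input.len() % BLOCK_LEN, 0);
        let len = match NonZeroUsize::new(input.len()) {
            Some(len) => len,
            None => return,
        };
        // SAFETY: `input` provides exactly `len` readable bytes.
        unsafe { process_blocks(state, input.as_ptr(), len) }
    }

    #[test]
    fn empty_input_is_a_no_op() {
        let mut state = [0u8; BLOCK_LEN];
        process(&mut state, &[]);
        assert_eq!(state, [0u8; BLOCK_LEN]);
    }
}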
#[derive(Clone)] #[repr(C, align(16))] pub(in super::super) struct HTable { Htable: [U128; HTABLE_LEN], } #[derive(Clone, Copy)] #[repr(C)] pub(super) struct U128 { pub(super) hi: u64, pub(super) lo: u64, } const HTABLE_LEN: usize = 16; #[repr(transparent)] pub(in super::super) struct Xi(pub(super) Block); impl Xi { #[inline] pub(super) fn bitxor_assign(&mut self, a: Block) { self.0 = bb::xor_16(self.0, a) } } ring-0.17.14/src/aead/gcm/neon.rs000064400000000000000000000034141046102023000144730ustar 00000000000000// Copyright 2018-2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #![cfg(any( all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little") ))] use super::{HTable, KeyValue, UpdateBlock, UpdateBlocks, Xi, BLOCK_LEN}; use crate::{cpu, polyfill::slice::AsChunks}; pub(in super::super) type RequiredCpuFeatures = cpu::arm::Neon; #[derive(Clone)] pub struct Key { h_table: HTable, } impl Key { pub(in super::super) fn new(value: KeyValue, _cpu: RequiredCpuFeatures) -> Self { Self { h_table: unsafe { htable_new!(gcm_init_neon, value) }, } } } impl UpdateBlock for Key { fn update_block(&self, xi: &mut Xi, a: [u8; BLOCK_LEN]) { prefixed_extern! { fn gcm_gmult_neon(xi: &mut Xi, Htable: &HTable); } xi.bitxor_assign(a); unsafe { self.h_table.gmult(gcm_gmult_neon, xi) }; } } impl UpdateBlocks for Key { fn update_blocks(&self, xi: &mut Xi, input: AsChunks) { unsafe { ghash!(gcm_ghash_neon, xi, &self.h_table, input) } } } ring-0.17.14/src/aead/gcm/vclmulavx2.rs000064400000000000000000000030071046102023000156350ustar 00000000000000// Copyright 2018-2025 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
#![cfg(target_arch = "x86_64")] use super::{ffi::KeyValue, HTable, UpdateBlock, Xi}; use crate::{ aead::gcm::ffi::BLOCK_LEN, cpu::intel::{Avx2, VAesClmul}, polyfill::slice::AsChunks, }; #[derive(Clone)] pub struct Key { h_table: HTable, } impl Key { pub(in super::super) fn new(value: KeyValue, _cpu: (Avx2, VAesClmul)) -> Self { Self { h_table: unsafe { htable_new!(gcm_init_vpclmulqdq_avx2, value) }, } } pub(super) fn inner(&self) -> &HTable { &self.h_table } } impl UpdateBlock for Key { fn update_block(&self, xi: &mut Xi, a: [u8; BLOCK_LEN]) { let input: AsChunks = (&a).into(); unsafe { ghash!(gcm_ghash_vpclmulqdq_avx2_1, xi, &self.h_table, input) } } } ring-0.17.14/src/aead/gcm.rs000064400000000000000000000117601046102023000135370ustar 00000000000000// Copyright 2018-2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use self::ffi::{Block, BLOCK_LEN, ZERO_BLOCK}; use super::{aes_gcm, Aad}; use crate::{ bits::{BitLength, FromByteLen as _}, error::{self, InputTooLongError}, polyfill::{slice::AsChunks, sliceutil::overwrite_at_start, NotSend}, }; use cfg_if::cfg_if; pub(super) use ffi::KeyValue; cfg_if! { if #[cfg(any(all(target_arch = "aarch64", target_endian = "little"), target_arch = "x86_64"))] { pub(super) use self::ffi::{HTable, Xi}; } else { use self::ffi::{HTable, Xi}; } } #[macro_use] mod ffi; pub(super) mod clmul; pub(super) mod clmulavxmovbe; pub(super) mod fallback; pub(super) mod neon; pub(super) mod vclmulavx2; pub(super) struct Context<'key, K> { Xi: Xi, key: &'key K, aad_len: BitLength, in_out_len: BitLength, _not_send: NotSend, } impl<'key, K: UpdateBlock> Context<'key, K> { #[inline(always)] pub(crate) fn new( key: &'key K, aad: Aad<&[u8]>, in_out_len: usize, ) -> Result { if in_out_len > aes_gcm::MAX_IN_OUT_LEN { return Err(error::Unspecified); } let in_out_len = BitLength::from_byte_len(in_out_len).map_err(error::erase::)?; let aad_len = BitLength::from_byte_len(aad.as_ref().len()) .map_err(error::erase::)?; // NIST SP800-38D Section 5.2.1.1 says that the maximum AAD length is // 2**64 - 1 bits, i.e. BitLength::MAX, so we don't need to do an // explicit check here. 
let mut ctx = Self { Xi: Xi(ZERO_BLOCK), key, aad_len, in_out_len, _not_send: NotSend::VALUE, }; for ad in aad.0.chunks(BLOCK_LEN) { let mut block = ZERO_BLOCK; overwrite_at_start(&mut block, ad); ctx.update_block(block); } Ok(ctx) } } #[cfg(all( target_arch = "aarch64", target_endian = "little", target_pointer_width = "64" ))] impl Context<'_, K> { pub(super) fn in_out_whole_block_bits(&self) -> BitLength { use crate::polyfill::usize_from_u64; const WHOLE_BLOCK_BITS_MASK: usize = !0b111_1111; #[allow(clippy::assertions_on_constants)] const _WHOLE_BLOCK_BITS_MASK_CORRECT: () = assert!(WHOLE_BLOCK_BITS_MASK == !((BLOCK_LEN * 8) - 1)); BitLength::from_bits(usize_from_u64(self.in_out_len.as_bits()) & WHOLE_BLOCK_BITS_MASK) } } #[cfg(all(target_arch = "aarch64", target_endian = "little"))] /// Access to `inner` for the integrated AES-GCM implementations only. impl Context<'_, clmul::Key> { #[inline] pub(super) fn inner(&mut self) -> (&HTable, &mut Xi) { (&self.key.inner(), &mut self.Xi) } } #[cfg(target_arch = "x86_64")] impl Context<'_, clmulavxmovbe::Key> { /// Access to `inner` for the integrated AES-GCM implementations only. #[inline] pub(super) fn inner(&mut self) -> (&HTable, &mut Xi) { (self.key.inner(), &mut self.Xi) } } #[cfg(target_arch = "x86_64")] impl Context<'_, vclmulavx2::Key> { /// Access to `inner` for the integrated AES-GCM implementations only. #[inline] pub(super) fn inner(&mut self) -> (&HTable, &mut Xi) { (self.key.inner(), &mut self.Xi) } } impl Context<'_, K> { #[inline(always)] pub fn update_blocks(&mut self, input: AsChunks) { self.key.update_blocks(&mut self.Xi, input); } } impl Context<'_, K> { pub fn update_block(&mut self, a: Block) { self.key.update_block(&mut self.Xi, a); } #[inline(always)] pub(super) fn pre_finish(mut self, f: F) -> super::Tag where F: FnOnce(Block) -> super::Tag, { let mut block = [0u8; BLOCK_LEN]; let (alen, clen) = block.split_at_mut(BLOCK_LEN / 2); alen.copy_from_slice(&BitLength::::to_be_bytes(self.aad_len)); clen.copy_from_slice(&BitLength::::to_be_bytes(self.in_out_len)); self.update_block(block); f(self.Xi.0) } } pub(super) trait UpdateBlock { fn update_block(&self, xi: &mut Xi, a: Block); } pub(super) trait UpdateBlocks { fn update_blocks(&self, xi: &mut Xi, input: AsChunks); } ring-0.17.14/src/aead/less_safe_key.rs000064400000000000000000000125041046102023000156020ustar 00000000000000// Copyright 2015-2021 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::{Aad, Algorithm, KeyInner, Nonce, Tag, UnboundKey, TAG_LEN}; use crate::{cpu, error}; use core::ops::RangeFrom; /// Immutable keys for use in situations where `OpeningKey`/`SealingKey` and /// `NonceSequence` cannot reasonably be used. /// /// Prefer to use `OpeningKey`/`SealingKey` and `NonceSequence` when practical. 
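// [Editor's sketch, not part of the crate] Round-trip usage as a downstream
// consumer of the published crate might write it, based on the public API
// documented in this file. Key and nonce values are placeholders; a real
// caller must derive the key securely and never reuse a nonce with it.
#[allow(dead_code)]
fn less_safe_key_round_trip_sketch() -> Result<(), ring::error::Unspecified> {
    use ring::aead::{Aad, LessSafeKey, Nonce, UnboundKey, CHACHA20_POLY1305};

    let key_bytes = [0x24u8; 32]; // CHACHA20_POLY1305 key length
    let key = LessSafeKey::new(UnboundKey::new(&CHACHA20_POLY1305, &key_bytes)?);

    let mut in_out = b"hello, sealed world".to_vec();
    // The caller manages nonces directly; each one must be unique per key.
    key.seal_in_place_append_tag(
        Nonce::assume_unique_for_key([0u8; 12]),
        Aad::from(b"header"),
        &mut in_out,
    )?;

    let plaintext = key.open_in_place(
        Nonce::assume_unique_for_key([0u8; 12]),
        Aad::from(b"header"),
        &mut in_out,
    )?;
    assert_eq!(&plaintext[..], b"hello, sealed world");
    Ok(())
}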
#[derive(Clone)] pub struct LessSafeKey { inner: KeyInner, algorithm: &'static Algorithm, } impl LessSafeKey { /// Constructs a `LessSafeKey`. #[inline] pub fn new(key: UnboundKey) -> Self { key.into_inner() } pub(super) fn new_( algorithm: &'static Algorithm, key_bytes: &[u8], cpu_features: cpu::Features, ) -> Result { Ok(Self { inner: algorithm.new_key(key_bytes, cpu_features)?, algorithm, }) } /// Like [open_in_place](Self::open_in_place), except the authentication tag is /// passed separately. #[inline] pub fn open_in_place_separate_tag<'in_out, A>( &self, nonce: Nonce, aad: Aad, tag: Tag, in_out: &'in_out mut [u8], ciphertext: RangeFrom, ) -> Result<&'in_out mut [u8], error::Unspecified> where A: AsRef<[u8]>, { let aad = Aad::from(aad.as_ref()); self.algorithm.open_within( &self.inner, nonce, aad, tag, in_out, ciphertext, cpu::features(), ) } /// Like [`super::OpeningKey::open_in_place()`], except it accepts an /// arbitrary nonce. /// /// `nonce` must be unique for every use of the key to open data. #[inline] pub fn open_in_place<'in_out, A>( &self, nonce: Nonce, aad: Aad, in_out: &'in_out mut [u8], ) -> Result<&'in_out mut [u8], error::Unspecified> where A: AsRef<[u8]>, { self.open_within(nonce, aad, in_out, 0..) } /// Like [`super::OpeningKey::open_within()`], except it accepts an /// arbitrary nonce. /// /// `nonce` must be unique for every use of the key to open data. #[inline] pub fn open_within<'in_out, A>( &self, nonce: Nonce, aad: Aad, in_out: &'in_out mut [u8], ciphertext_and_tag: RangeFrom, ) -> Result<&'in_out mut [u8], error::Unspecified> where A: AsRef<[u8]>, { let tag_offset = in_out .len() .checked_sub(TAG_LEN) .ok_or(error::Unspecified)?; // Split the tag off the end of `in_out`. let (in_out, received_tag) = in_out.split_at_mut(tag_offset); let received_tag = (*received_tag).try_into()?; let ciphertext = ciphertext_and_tag; self.open_in_place_separate_tag(nonce, aad, received_tag, in_out, ciphertext) } /// Like [`super::SealingKey::seal_in_place_append_tag()`], except it /// accepts an arbitrary nonce. /// /// `nonce` must be unique for every use of the key to seal data. #[inline] pub fn seal_in_place_append_tag( &self, nonce: Nonce, aad: Aad, in_out: &mut InOut, ) -> Result<(), error::Unspecified> where A: AsRef<[u8]>, InOut: AsMut<[u8]> + for<'in_out> Extend<&'in_out u8>, { self.seal_in_place_separate_tag(nonce, aad, in_out.as_mut()) .map(|tag| in_out.extend(tag.as_ref())) } /// Like `super::SealingKey::seal_in_place_separate_tag()`, except it /// accepts an arbitrary nonce. /// /// `nonce` must be unique for every use of the key to seal data. #[inline] pub fn seal_in_place_separate_tag( &self, nonce: Nonce, aad: Aad, in_out: &mut [u8], ) -> Result where A: AsRef<[u8]>, { self.algorithm.seal( &self.inner, nonce, Aad::from(aad.as_ref()), in_out, cpu::features(), ) } /// The key's AEAD algorithm. #[inline] pub fn algorithm(&self) -> &'static Algorithm { self.algorithm } pub(super) fn fmt_debug( &self, type_name: &'static str, f: &mut core::fmt::Formatter, ) -> Result<(), core::fmt::Error> { f.debug_struct(type_name) .field("algorithm", &self.algorithm()) .finish() } } impl core::fmt::Debug for LessSafeKey { fn fmt(&self, f: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> { self.fmt_debug("LessSafeKey", f) } } ring-0.17.14/src/aead/nonce.rs000064400000000000000000000036361046102023000140760ustar 00000000000000// Copyright 2018 Brian Smith. 
// // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use crate::error; /// A nonce for a single AEAD opening or sealing operation. /// /// The user must ensure, for a particular key, that each nonce is unique. /// /// `Nonce` intentionally doesn't implement `Clone` to ensure that each one is /// consumed at most once. pub struct Nonce([u8; NONCE_LEN]); impl Nonce { /// Constructs a `Nonce` with the given value, assuming that the value is /// unique for the lifetime of the key it is being used with. /// /// Fails if `value` isn't `NONCE_LEN` bytes long. #[inline] pub fn try_assume_unique_for_key(value: &[u8]) -> Result { let value: &[u8; NONCE_LEN] = value.try_into()?; Ok(Self::assume_unique_for_key(*value)) } /// Constructs a `Nonce` with the given value, assuming that the value is /// unique for the lifetime of the key it is being used with. #[inline] pub fn assume_unique_for_key(value: [u8; NONCE_LEN]) -> Self { Self(value) } } impl AsRef<[u8; NONCE_LEN]> for Nonce { fn as_ref(&self) -> &[u8; NONCE_LEN] { &self.0 } } /// All the AEADs we support use 96-bit nonces. pub const NONCE_LEN: usize = 96 / 8; ring-0.17.14/src/aead/opening_key.rs000064400000000000000000000124251046102023000152770ustar 00000000000000// Copyright 2015-2021 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! Authenticated Encryption with Associated Data (AEAD). //! //! See [Authenticated encryption: relations among notions and analysis of the //! generic composition paradigm][AEAD] for an introduction to the concept of //! AEADs. //! //! [AEAD]: https://eprint.iacr.org/2000/025.pdf //! [`crypto.cipher.AEAD`]: https://golang.org/pkg/crypto/cipher/#AEAD use super::{Aad, Algorithm, BoundKey, LessSafeKey, NonceSequence, UnboundKey}; use crate::error; use core::ops::RangeFrom; /// An AEAD key for authenticating and decrypting ("opening"), bound to a nonce /// sequence. /// /// Intentionally not `Clone` or `Copy` since cloning would allow duplication /// of the nonce sequence. 
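// [Editor's sketch, not part of the crate] How a downstream user might bind
// keys to a nonce sequence, using a simple counter-based `NonceSequence`.
// Names and key values are placeholders; a real nonce sequence must never
// repeat a value for a given key.
#[allow(dead_code)]
mod editor_bound_key_sketch {
    use ring::aead::{
        Aad, BoundKey, Nonce, NonceSequence, OpeningKey, SealingKey, UnboundKey,
        CHACHA20_POLY1305, NONCE_LEN,
    };
    use ring::error::Unspecified;

    /// Yields 0, 1, 2, ... encoded into the low bytes of the 96-bit nonce.
    struct CounterNonces(u64);

    impl NonceSequence for CounterNonces {
        fn advance(&mut self) -> Result<Nonce, Unspecified> {
            let mut nonce = [0u8; NONCE_LEN];
            nonce[NONCE_LEN - 8..].copy_from_slice(&self.0.to_be_bytes());
            self.0 = self.0.checked_add(1).ok_or(Unspecified)?;
            Ok(Nonce::assume_unique_for_key(nonce))
        }
    }

    fn round_trip() -> Result<(), Unspecified> {
        let key_bytes = [0x24u8; 32]; // CHACHA20_POLY1305 key length

        let mut sealing_key: SealingKey<CounterNonces> = SealingKey::new(
            UnboundKey::new(&CHACHA20_POLY1305, &key_bytes)?,
            CounterNonces(0),
        );
        let mut opening_key: OpeningKey<CounterNonces> = OpeningKey::new(
            UnboundKey::new(&CHACHA20_POLY1305, &key_bytes)?,
            CounterNonces(0),
        );

        let mut in_out = b"bound-key example".to_vec();
        sealing_key.seal_in_place_append_tag(Aad::empty(), &mut in_out)?;
        let plaintext = opening_key.open_in_place(Aad::empty(), &mut in_out)?;
        assert_eq!(&plaintext[..], b"bound-key example");
        Ok(())
    }

    #[test]
    fn it_round_trips() {
        round_trip().unwrap()
    }
}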
pub struct OpeningKey { key: LessSafeKey, nonce_sequence: N, } impl BoundKey for OpeningKey { fn new(key: UnboundKey, nonce_sequence: N) -> Self { Self { key: key.into_inner(), nonce_sequence, } } #[inline] fn algorithm(&self) -> &'static Algorithm { self.key.algorithm() } } impl core::fmt::Debug for OpeningKey { fn fmt(&self, f: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> { self.key.fmt_debug("OpeningKey", f) } } impl OpeningKey { /// Authenticates and decrypts (“opens”) data in place. /// /// `aad` is the additional authenticated data (AAD), if any. /// /// On input, `in_out` must be the ciphertext followed by the tag. When /// `open_in_place()` returns `Ok(plaintext)`, the input ciphertext /// has been overwritten by the plaintext; `plaintext` will refer to the /// plaintext without the tag. /// /// When `open_in_place()` returns `Err(..)`, `in_out` may have been /// overwritten in an unspecified way. #[inline] pub fn open_in_place<'in_out, A>( &mut self, aad: Aad, in_out: &'in_out mut [u8], ) -> Result<&'in_out mut [u8], error::Unspecified> where A: AsRef<[u8]>, { self.key .open_in_place(self.nonce_sequence.advance()?, aad, in_out) } /// Authenticates and decrypts (“opens”) data in place, with a shift. /// /// `aad` is the additional authenticated data (AAD), if any. /// /// On input, `in_out[ciphertext_and_tag]` must be the ciphertext followed /// by the tag. When `open_within()` returns `Ok(plaintext)`, the plaintext /// will be at `in_out[0..plaintext.len()]`. In other words, the following /// two code fragments are equivalent for valid values of /// `ciphertext_and_tag`, except `open_within` will often be more efficient: /// /// /// ```skip /// let plaintext = key.open_within(aad, in_out, cipertext_and_tag)?; /// ``` /// /// ```skip /// let ciphertext_and_tag_len = in_out[ciphertext_and_tag].len(); /// in_out.copy_within(ciphertext_and_tag, 0); /// let plaintext = key.open_in_place(aad, &mut in_out[..ciphertext_and_tag_len])?; /// ``` /// /// Similarly, `key.open_within(aad, in_out, 0..)` is equivalent to /// `key.open_in_place(aad, in_out)`. /// /// When `open_in_place()` returns `Err(..)`, `in_out` may have been /// overwritten in an unspecified way. /// /// The shifting feature is useful in the case where multiple packets are /// being reassembled in place. Consider this example where the peer has /// sent the message “Split stream reassembled in place” split into /// three sealed packets: /// /// ```ascii-art /// Packet 1 Packet 2 Packet 3 /// Input: [Header][Ciphertext][Tag][Header][Ciphertext][Tag][Header][Ciphertext][Tag] /// | +--------------+ | /// +------+ +-----+ +----------------------------------+ /// v v v /// Output: [Plaintext][Plaintext][Plaintext] /// “Split stream reassembled in place” /// ``` /// /// This reassembly can be accomplished with three calls to `open_within()`. #[inline] pub fn open_within<'in_out, A>( &mut self, aad: Aad, in_out: &'in_out mut [u8], ciphertext_and_tag: RangeFrom, ) -> Result<&'in_out mut [u8], error::Unspecified> where A: AsRef<[u8]>, { self.key.open_within( self.nonce_sequence.advance()?, aad, in_out, ciphertext_and_tag, ) } } ring-0.17.14/src/aead/overlapping/array.rs000064400000000000000000000037721046102023000164410ustar 00000000000000// Copyright 2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. 
// // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #![cfg_attr(not(test), allow(dead_code))] use super::Overlapping; use crate::error::LenMismatchError; use core::array::TryFromSliceError; pub struct Array<'o, T, const N: usize> { // Invariant: N != 0. // Invariant: `self.in_out.len() == N`. in_out: Overlapping<'o, T>, } impl<'o, T, const N: usize> Array<'o, T, N> { pub(super) fn new(in_out: Overlapping<'o, T>) -> Result { if N == 0 || in_out.len() != N { return Err(LenMismatchError::new(N)); } Ok(Self { in_out }) } pub fn into_unwritten_output(self) -> &'o mut [T; N] where &'o mut [T]: TryInto<&'o mut [T; N], Error = TryFromSliceError>, { self.in_out .into_unwritten_output() .try_into() .unwrap_or_else(|TryFromSliceError { .. }| { unreachable!() // Due to invariant }) } } impl Array<'_, T, N> { pub fn input<'s>(&'s self) -> &'s [T; N] where &'s [T]: TryInto<&'s [T; N], Error = TryFromSliceError>, { self.in_out .input() .try_into() .unwrap_or_else(|TryFromSliceError { .. }| { unreachable!() // Due to invariant }) } } ring-0.17.14/src/aead/overlapping/base.rs000064400000000000000000000121771046102023000162340ustar 00000000000000// Copyright 2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. pub use self::index_error::IndexError; use super::Array; use crate::error::LenMismatchError; use core::{mem, ops::RangeFrom}; pub struct Overlapping<'o, T> { // Invariant: self.src.start <= in_out.len(). in_out: &'o mut [T], src: RangeFrom, } impl<'o, T> From<&'o mut [T]> for Overlapping<'o, T> { fn from(in_out: &'o mut [T]) -> Self { Self { in_out, src: 0.. } } } impl<'o, T> Overlapping<'o, T> { pub fn new(in_out: &'o mut [T], src: RangeFrom) -> Result { match in_out.get(src.clone()) { Some(_) => Ok(Self { in_out, src }), None => Err(IndexError::new(src.start)), } } #[cfg(any( all(target_arch = "arm", target_endian = "little"), target_arch = "x86" ))] pub fn copy_within(self) -> &'o mut [T] where T: Copy, { if self.src.start == 0 { self.in_out } else { let len = self.len(); self.in_out.copy_within(self.src, 0); &mut self.in_out[..len] } } #[cfg(any( all(target_arch = "arm", target_endian = "little"), target_arch = "x86" ))] pub fn into_slice_src_mut(self) -> (&'o mut [T], RangeFrom) { (self.in_out, self.src) } pub fn into_unwritten_output(self) -> &'o mut [T] { let len = self.len(); self.in_out.get_mut(..len).unwrap_or_else(|| { // The invariant ensures this succeeds. 
unreachable!() }) } } impl Overlapping<'_, T> { pub fn len(&self) -> usize { self.input().len() } pub fn input(&self) -> &[T] { self.in_out.get(self.src.clone()).unwrap_or_else(|| { // Ensured by invariant. unreachable!() }) } pub fn with_input_output_len(self, f: impl FnOnce(*const T, *mut T, usize) -> R) -> R { let len = self.len(); let output = self.in_out.as_mut_ptr(); // TODO: MSRV(1.65): use `output.cast_const()` let output_const: *const T = output; // SAFETY: The constructor ensures that `src` is a valid range. // Equivalent to `self.in_out[src.clone()].as_ptr()` but without // worries about compatibility with the stacked borrows model. // TODO(MSRV-1.80, probably): Avoid special casing 0; see // https://github.com/rust-lang/rust/pull/117329 // https://github.com/rust-lang/rustc_codegen_gcc/issues/516 let input = if self.src.start == 0 { output_const } else { unsafe { output_const.add(self.src.start) } }; f(input, output, len) } // Perhaps unlike `slice::split_first_chunk_mut`, this is biased, // performance-wise, against the case where `N > self.len()`, so callers // should be structured to avoid that. // // If the result is `Err` then nothing was written to `self`; if anything // was written then the result will not be `Err`. #[cfg_attr(not(test), allow(dead_code))] pub fn split_first_chunk( mut self, f: impl for<'a> FnOnce(Array<'a, T, N>), ) -> Result { let src = self.src.clone(); let end = self .src .start .checked_add(N) .ok_or_else(|| IndexError::new(N))?; let first = self .in_out .get_mut(..end) .ok_or_else(|| IndexError::new(N))?; let first = Overlapping::new(first, src).unwrap_or_else(|IndexError { .. }| { // Since `end == src.start + N`. unreachable!() }); let first = Array::new(first).unwrap_or_else(|LenMismatchError { .. }| { // Since `end == src.start + N`. unreachable!() }); // Once we call `f`, we must return `Ok` because `f` may have written // over (part of) the input. Ok({ f(first); let tail = mem::take(&mut self.in_out).get_mut(N..).unwrap_or_else(|| { // There are at least `N` elements since `end == src.start + N`. unreachable!() }); Self::new(tail, self.src).unwrap_or_else(|IndexError { .. }| { // Follows from `end == src.start + N`. unreachable!() }) }) } } cold_exhaustive_error! { struct index_error::IndexError { index: usize } } ring-0.17.14/src/aead/overlapping/mod.rs000064400000000000000000000016201046102023000160700ustar 00000000000000// Copyright 2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. pub use self::{ array::Array, base::{IndexError, Overlapping}, partial_block::PartialBlock, }; mod array; mod base; mod partial_block; ring-0.17.14/src/aead/overlapping/partial_block.rs000064400000000000000000000037341046102023000201270ustar 00000000000000// Copyright 2024 Brian Smith. 
// // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::Overlapping; use crate::error::InputTooLongError; pub struct PartialBlock<'i, T, const BLOCK_LEN: usize> { // invariant: `self.in_out.len() < BLOCK_LEN`. in_out: Overlapping<'i, T>, } impl<'i, T, const BLOCK_LEN: usize> PartialBlock<'i, T, BLOCK_LEN> { pub fn new(in_out: Overlapping<'i, T>) -> Result { let len = in_out.len(); if len >= BLOCK_LEN { return Err(InputTooLongError::new(len)); } Ok(Self { in_out }) } pub fn overwrite_at_start(self, padded: [T; BLOCK_LEN]) where T: Copy, { let len = self.len(); let output = self.in_out.into_unwritten_output(); assert!(output.len() <= padded.len()); output.copy_from_slice(&padded[..len]); } } impl PartialBlock<'_, T, BLOCK_LEN> { #[inline(always)] pub fn input(&self) -> &[T] { let r = self.in_out.input(); // Help the optimizer optimize the caller using the invariant. // TODO: Does this actually help? if r.len() >= BLOCK_LEN { unreachable!() } r } #[inline(always)] pub fn len(&self) -> usize { self.input().len() } } ring-0.17.14/src/aead/poly1305/ffi_arm_neon.rs000064400000000000000000000060501046102023000167030ustar 00000000000000// Copyright 2015-2025 Brian Smith. // Portions Copyright (c) 2014, 2015, Google Inc. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #![cfg(all(target_arch = "arm", target_endian = "little"))] use super::{Key, Tag, KEY_LEN, TAG_LEN}; use crate::{c, cpu::arm::Neon}; use core::num::NonZeroUsize; // XXX/TODO(MSRV): change to `pub(super)`. pub(in super::super) struct State { state: poly1305_state_st, neon: Neon, } // TODO: Is 16 enough? #[repr(C, align(16))] struct poly1305_state_st { r: fe1305x2, h: fe1305x2, c: fe1305x2, precomp: [fe1305x2; 2], data: [u8; data_len()], buf: [u8; 32], buf_used: c::size_t, key: [u8; 16], } const fn data_len() -> usize { 128 } #[derive(Clone, Copy)] #[repr(C)] struct fe1305x2 { v: [u32; 12], // for alignment; only using 10 } impl State { pub(super) fn new_context(Key { key_and_nonce }: Key, neon: Neon) -> super::Context { prefixed_extern! 
{ fn CRYPTO_poly1305_init_neon(state: &mut poly1305_state_st, key: &[u8; KEY_LEN]); } let mut r = Self { state: poly1305_state_st { r: fe1305x2 { v: [0; 12] }, h: fe1305x2 { v: [0; 12] }, c: fe1305x2 { v: [0; 12] }, precomp: [fe1305x2 { v: [0; 12] }; 2], data: [0u8; data_len()], buf: Default::default(), buf_used: 0, key: [0u8; 16], }, neon, }; unsafe { CRYPTO_poly1305_init_neon(&mut r.state, &key_and_nonce) } super::Context::ArmNeon(r) } pub(super) fn update_internal(&mut self, input: &[u8]) { prefixed_extern! { fn CRYPTO_poly1305_update_neon( st: &mut poly1305_state_st, input: *const u8, in_len: c::NonZero_size_t); } if let Some(len) = NonZeroUsize::new(input.len()) { let _: Neon = self.neon; let input = input.as_ptr(); unsafe { CRYPTO_poly1305_update_neon(&mut self.state, input, len) } } } pub(super) fn finish(mut self) -> Tag { prefixed_extern! { fn CRYPTO_poly1305_finish_neon(st: &mut poly1305_state_st, mac: &mut [u8; TAG_LEN]); } let mut tag = Tag([0u8; TAG_LEN]); unsafe { CRYPTO_poly1305_finish_neon(&mut self.state, &mut tag.0) } tag } } ring-0.17.14/src/aead/poly1305/ffi_fallback.rs000064400000000000000000000056161046102023000166530ustar 00000000000000// Copyright 2015-2025 Brian Smith. // Portions Copyright (c) 2014, 2015, Google Inc. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::{Key, Tag, KEY_LEN, TAG_LEN}; use crate::c; use core::num::NonZeroUsize; // XXX/TODO(MSRV): change to `pub(super)`. pub(in super::super) struct State { state: poly1305_state_st, } // Keep in sync with `poly1305_state_st` in poly1305.c #[repr(C, align(64))] struct poly1305_state_st { r0: u32, r1: u32, r2: u32, r3: u32, r4: u32, s1: u32, s2: u32, s3: u32, s4: u32, h0: u32, h1: u32, h2: u32, h3: u32, h4: u32, key: [u8; 16], } impl State { pub(super) fn new_context(Key { key_and_nonce }: Key) -> super::Context { prefixed_extern! { fn CRYPTO_poly1305_init(state: &mut poly1305_state_st, key: &[u8; KEY_LEN]); } let mut r = Self { state: poly1305_state_st { r0: 0, r1: 0, r2: 0, r3: 0, r4: 0, s1: 0, s2: 0, s3: 0, s4: 0, h0: 0, h1: 0, h2: 0, h3: 0, h4: 0, key: [0u8; 16], }, }; unsafe { CRYPTO_poly1305_init(&mut r.state, &key_and_nonce) } super::Context::Fallback(r) } // `input.len % BLOCK_LEN == 0` must be true for every call except the // final one. pub(super) fn update_internal(&mut self, input: &[u8]) { prefixed_extern! { fn CRYPTO_poly1305_update( state: &mut poly1305_state_st, input: *const u8, in_len: c::NonZero_size_t); } if let Some(len) = NonZeroUsize::new(input.len()) { let input = input.as_ptr(); unsafe { CRYPTO_poly1305_update(&mut self.state, input, len) } } } pub(super) fn finish(mut self) -> Tag { prefixed_extern! 
{ fn CRYPTO_poly1305_finish(statep: &mut poly1305_state_st, mac: &mut [u8; TAG_LEN]); } let mut tag = Tag([0u8; TAG_LEN]); unsafe { CRYPTO_poly1305_finish(&mut self.state, &mut tag.0) } tag } } ring-0.17.14/src/aead/poly1305.rs000064400000000000000000000074331046102023000142670ustar 00000000000000// Copyright 2015-2025 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. // TODO: enforce maximum input length. use super::{Tag, TAG_LEN}; #[cfg(all(target_arch = "arm", target_endian = "little"))] use crate::cpu::GetFeature as _; use crate::{cpu, polyfill::slice::AsChunks}; mod ffi_arm_neon; mod ffi_fallback; /// A Poly1305 key. pub(super) struct Key { key_and_nonce: [u8; KEY_LEN], } pub(super) const BLOCK_LEN: usize = 16; pub(super) const KEY_LEN: usize = 2 * BLOCK_LEN; impl Key { #[inline] pub(super) fn new(key_and_nonce: [u8; KEY_LEN]) -> Self { Self { key_and_nonce } } } pub(super) enum Context { #[cfg(all(target_arch = "arm", target_endian = "little"))] ArmNeon(ffi_arm_neon::State), Fallback(ffi_fallback::State), } impl Context { #[inline] pub(super) fn from_key(key: Key, cpu: cpu::Features) -> Self { #[cfg(all(target_arch = "arm", target_endian = "little"))] if let Some(cpu) = cpu.get_feature() { return ffi_arm_neon::State::new_context(key, cpu); } let _: cpu::Features = cpu; ffi_fallback::State::new_context(key) } pub fn update_block(&mut self, input: [u8; BLOCK_LEN]) { self.update(AsChunks::from_ref(&input)) } pub fn update(&mut self, input: AsChunks) { self.update_internal(input.as_flattened()); } fn update_internal(&mut self, input: &[u8]) { match self { #[cfg(all(target_arch = "arm", target_endian = "little"))] Self::ArmNeon(state) => state.update_internal(input), Self::Fallback(state) => state.update_internal(input), } } pub(super) fn finish(mut self, input: &[u8]) -> Tag { self.update_internal(input); match self { #[cfg(all(target_arch = "arm", target_endian = "little"))] Self::ArmNeon(state) => state.finish(), Self::Fallback(state) => state.finish(), } } } /// Implements the original, non-IETF padding semantics. /// /// This is used by chacha20_poly1305_openssh and the standalone /// poly1305 test vectors. pub(super) fn sign(key: Key, input: &[u8], cpu_features: cpu::Features) -> Tag { let ctx = Context::from_key(key, cpu_features); ctx.finish(input) } #[cfg(test)] mod tests { use super::*; use crate::testutil as test; // Adapted from BoringSSL's crypto/poly1305/poly1305_test.cc. 
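    // Editor's note: the following test is an illustrative sketch added by the
    // editor, not part of upstream ring. It exercises the one-shot `sign()`
    // helper directly with the RFC 8439, section 2.5.2 vector that also
    // appears in poly1305_test.txt, showing how `Key`, `sign`, and `Tag` fit
    // together without going through the test-vector file parser.
    #[test]
    fn test_poly1305_rfc8439_one_shot_sketch() {
        let cpu_features = cpu::features();
        // 32-byte (r || s) key from RFC 8439, section 2.5.2.
        let key_and_nonce: [u8; KEY_LEN] = [
            0x85, 0xd6, 0xbe, 0x78, 0x57, 0x55, 0x6d, 0x33, 0x7f, 0x44, 0x52, 0xfe, 0x42, 0xd5,
            0x06, 0xa8, 0x01, 0x03, 0x80, 0x8a, 0xfb, 0x0d, 0xb2, 0xfd, 0x4a, 0xbf, 0xf6, 0xaf,
            0x41, 0x49, 0xf5, 0x1b,
        ];
        // Expected tag from the same RFC 8439 example.
        let expected_mac: [u8; TAG_LEN] = [
            0xa8, 0x06, 0x1d, 0xc1, 0x30, 0x51, 0x36, 0xc6, 0xc2, 0x2b, 0x8b, 0xaf, 0x0c, 0x01,
            0x27, 0xa9,
        ];
        let Tag(actual_mac) = sign(
            Key::new(key_and_nonce),
            b"Cryptographic Forum Research Group",
            cpu_features,
        );
        assert_eq!(expected_mac, actual_mac);
    }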
#[test] pub fn test_poly1305() { let cpu_features = cpu::features(); test::run( test_vector_file!("poly1305_test.txt"), |section, test_case| { assert_eq!(section, ""); let key = test_case.consume_bytes("Key"); let key: &[u8; KEY_LEN] = key.as_slice().try_into().unwrap(); let input = test_case.consume_bytes("Input"); let expected_mac = test_case.consume_bytes("MAC"); let key = Key::new(*key); let Tag(actual_mac) = sign(key, &input, cpu_features); assert_eq!(expected_mac, actual_mac.as_ref()); Ok(()) }, ) } } ring-0.17.14/src/aead/poly1305_test.txt000064400000000000000000002243671046102023000155300ustar 00000000000000# Test Vectors from OpenSSL commit bbe9769ba66ab2512678a87b0d9b266ba970db05. Key = 2d773be37adb1e4d683bf0075e79c4ee037918535a7f99ccb7040fb5f5f43aea Input = 89dab80b7717c1db5db437860a3f70218e93e1b8f461fb677f16f35f6f87e2a91c99bc3a47ace47640cc95c345be5ecca5a3523c35cc01893af0b64a620334270372ec12482d1b1e363561698a578b359803495bb4e2ef1930b17a5190b580f141300df30adbeca28f6427a8bc1a999fd51c554a017d095d8c3e3127daf9f595 MAC = c85d15ed44c378d6b00e23064c7bcd51 Key = 99e5822dd4173c995e3dae0ddefb97743fde3b080134b39f76e9bf8d0e88d546 Input = 000000000000000b170303020000000006db1f1f368d696a810a349c0c714c9a5e7850c2407d721acded95e018d7a85266a6e1289cdb4aeb18da5ac8a2b0026d24a59ad485227f3eaedbb2e7e35e1c66cd60f9abf716dcc9ac42682dd7dab287a7024c4eefc321cc0574e16793e37cec03c5bda42b54c114a80b57af26416c7be742005e20855c73e21dc8e2edc9d435cb6f6059280011c270b71570051c1c9b3052126620bc1e2730fa066c7a509d53c60e5ae1b40aa6e39e49669228c90eecb4a50db32a50bc49e90b4f4b359a1dfd11749cd3867fcf2fb7bb6cd4738f6a4ad6f7ca5058f7618845af9f020f6c3b967b8f4cd4a91e2813b507ae66f2d35c18284f7292186062e10fd5510d18775351ef334e7634ab4743f5b68f49adcab384d3fd75f7390f4006ef2a295c8c7a076ad54546cd25d2107fbe1436c840924aaebe5b370893cd63d1325b8616fc4810886bc152c53221b6df373119393255ee72bcaa880174f1717f9184fa91646f17a24ac55d16bfddca9581a92eda479201f0edbf633600d6066d1ab36d5d2415d71351bbcd608a25108d25641992c1f26c531cf9f90203bc4cc19f5927d834b0a47116d3884bbb164b8ec883d1ac832e56b3918a98601a08d171881541d594db399c6ae6151221745aec814c45b0b05b565436fd6f137aa10a0c0b643761dbd6f9a9dcb99b1a6e690854ce0769cde39761d82fcdec15f0d92d7d8e94ade8eb83fbe0 MAC = 2637408fe13086ea73f971e3425e2820 # RFC 8439, section 2.5.2. Key = 85d6be7857556d337f4452fe42d506a80103808afb0db2fd4abff6af4149f51b Input = "Cryptographic Forum Research Group" MAC = a8061dc1305136c6c22b8baf0c0127a9 # RFC 8439, section A.3. 
Key = 0000000000000000000000000000000000000000000000000000000000000000 Input = 00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 MAC = 00000000000000000000000000000000 Key = 0000000000000000000000000000000036e5f6b5c5e06070f0efca96227a863e Input = 416e79207375626d697373696f6e20746f20746865204945544620696e74656e6465642062792074686520436f6e7472696275746f7220666f72207075626c69636174696f6e20617320616c6c206f722070617274206f6620616e204945544620496e7465726e65742d4472616674206f722052464320616e6420616e792073746174656d656e74206d6164652077697468696e2074686520636f6e74657874206f6620616e204945544620616374697669747920697320636f6e7369646572656420616e20224945544620436f6e747269627574696f6e222e20537563682073746174656d656e747320696e636c756465206f72616c2073746174656d656e747320696e20494554462073657373696f6e732c2061732077656c6c206173207772697474656e20616e6420656c656374726f6e696320636f6d6d756e69636174696f6e73206d61646520617420616e792074696d65206f7220706c6163652c207768696368206172652061646472657373656420746f MAC = 36e5f6b5c5e06070f0efca96227a863e Key = 36e5f6b5c5e06070f0efca96227a863e00000000000000000000000000000000 Input = 416e79207375626d697373696f6e20746f20746865204945544620696e74656e6465642062792074686520436f6e7472696275746f7220666f72207075626c69636174696f6e20617320616c6c206f722070617274206f6620616e204945544620496e7465726e65742d4472616674206f722052464320616e6420616e792073746174656d656e74206d6164652077697468696e2074686520636f6e74657874206f6620616e204945544620616374697669747920697320636f6e7369646572656420616e20224945544620436f6e747269627574696f6e222e20537563682073746174656d656e747320696e636c756465206f72616c2073746174656d656e747320696e20494554462073657373696f6e732c2061732077656c6c206173207772697474656e20616e6420656c656374726f6e696320636f6d6d756e69636174696f6e73206d61646520617420616e792074696d65206f7220706c6163652c207768696368206172652061646472657373656420746f MAC = f3477e7cd95417af89a6b8794c310cf0 Key = 1c9240a5eb55d38af333888604f6b5f0473917c1402b80099dca5cbc207075c0 Input = 2754776173206272696c6c69672c20616e642074686520736c6974687920746f7665730a446964206779726520616e642067696d626c6520696e2074686520776162653a0a416c6c206d696d737920776572652074686520626f726f676f7665732c0a416e6420746865206d6f6d65207261746873206f757467726162652e MAC = 4541669a7eaaee61e708dc7cbcc5eb62 Key = 0200000000000000000000000000000000000000000000000000000000000000 Input = ffffffffffffffffffffffffffffffff MAC = 03000000000000000000000000000000 Key = 02000000000000000000000000000000ffffffffffffffffffffffffffffffff Input = 02000000000000000000000000000000 MAC = 03000000000000000000000000000000 Key = 0100000000000000000000000000000000000000000000000000000000000000 Input = fffffffffffffffffffffffffffffffff0ffffffffffffffffffffffffffffff11000000000000000000000000000000 MAC = 05000000000000000000000000000000 Key = 0100000000000000000000000000000000000000000000000000000000000000 Input = fffffffffffffffffffffffffffffffffbfefefefefefefefefefefefefefefe01010101010101010101010101010101 MAC = 00000000000000000000000000000000 Key = 0200000000000000000000000000000000000000000000000000000000000000 Input = fdffffffffffffffffffffffffffffff MAC = faffffffffffffffffffffffffffffff Key = 0100000000000000040000000000000000000000000000000000000000000000 Input = e33594d7505e43b900000000000000003394d7505e4379cd01000000000000000000000000000000000000000000000001000000000000000000000000000000 MAC = 14000000000000005500000000000000 Key = 
0100000000000000040000000000000000000000000000000000000000000000 Input = e33594d7505e43b900000000000000003394d7505e4379cd010000000000000000000000000000000000000000000000 MAC = 13000000000000000000000000000000 # Additional test vectors that are long enough to ensure OpenSSL's SIMD # assembly is fully tested. # Length 2048. Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf21
4b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfed MAC = 69d28f73dd09d39a92aa179da354b7ea # Length 2049. Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafb
e0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc8 MAC = d6a26654b88572e875d9661c83471c1b # Length 2050. Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c6
8257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852 MAC = 9fbbb7f7adcd0cd5b46a4a520b22499a # Length 2051. 
Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7c
c8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f5 MAC = eb7cdceb97ade2a07622f8f5a4b1ce15 # Length 2052. Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d52
88b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f590 MAC = d41c310927cd92e14784ea78b85503db # Length 2053. Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc
5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f59073 MAC = 16af133c423f783a14c49d9f526384cf # Length 2054. 
Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7c
c8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f59073f4 MAC = 00c75db8f0636b22f195645b03091f5f # Length 2055. Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a
411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f59073f434 MAC = 4a532bc740f581555831345f3b75bf33 # Length 2056. Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c4362
6b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f59073f4347a MAC = 698c7d32c5923871d124a2479e521706 # Length 2057. 
Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7c
c8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f59073f4347a8c MAC = a677187dbf3c927aeeafb9ebce0f61dc # Length 2058. Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c
27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f59073f4347a8c8a MAC = 201fed7eee981b31d2cc42ff6c38141a # Length 2059. Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f5
7628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f59073f4347a8c8a28 MAC = 0c3d3d01a37f347c4f7c5826bcafb3e1 # Length 2060. 
Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7c
c8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f59073f4347a8c8a28c9 MAC = 33a4e0e0bed7c84c5cc5dd4784410f07 # Length 2061. Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c
20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f59073f4347a8c8a28c99e MAC = 8e41c40a2f8ec58fe594f3a3a2de4ae1 # Length 2062. Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b
520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f59073f4347a8c8a28c99e21 MAC = c6e5d1810fd878ac6b844c66cef36a22 # Length 2063. 
Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7c
c8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f59073f4347a8c8a28c99e21df MAC = f6eaae369c3cb5c05748e8d919178e00 # Regression test for https://rt.openssl.org/Ticket/Display.html?id=4439 Key = 2d773be37adb1e4d683bf0075e79c4ee037918535a7f99ccb7040fb5f5f43aea Input = 89dab80b7717c1db5db437860a3f70218e93e1b8f461fb677f16f35f6f87e2a91c99bc3a47ace47640cc95c345be5ecca5a3523c35cc01893af0b64a620334270372ec12482d1b1e363561698a578b359803495bb4e2ef1930b17a5190b580f141300df30adbeca28f6427a8bc1a999fd51c554a017d095d8c3e3127daf9f595 MAC = c85d15ed44c378d6b00e23064c7bcd51 # Regression tests for https://rt.openssl.org/Ticket/Display.html?id=4483 Key = 7f1b02640000000000000000000000000000000000000000cccccccccccccccc Input = cccccccccccccccccccccccccccccccccccccccccccccccccc80ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccceccccccccccccccccccccccccccccccccccccc5cccccccccccccccccccccccccccccccccccccccccce3ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccaccccccccccccccccccccce6cccccccccc000000afccccccccccccccccccfffffff5000000000000000000000000000000000000000000000000000000ffffffe70000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000719205a8521dfc MAC = 8559b876eceed66eb37798c0457baff9 Key = e00016000000000000000000000000000000aaaaaaaaaaaaaaaaaaaaaaaaaaaa Input = aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa000000000000000000800264 MAC = 00bd1258978e205444c9aaaa82006fed Key = 0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c Input = 02fc MAC = 06120c0c0c0c0c0c0c0c0c0c0c0c0c0c Key = 00ff000000000000000000000000000000000000001e00000000000000007b7b Input = 7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7a7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b5c7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b6e7b007b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7a7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b5c7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b6e7b001300000000b300000000000000000000000000000000000000000000f20000000000000000000000000000000000002000efff0009000000000000000000000000100000000009000000640000000000000000000000001300000000b300000000000000000000000000000000000000000000f20000000000000000000000000000000000002000efff00090000000000000000007a000010000000000900000064000000000000000000000000000000000000000000000000fc MAC = 33205bbf9e9f8f7212ab9e2ab9b7e4a5 ring-0.17.14/src/aead/quic.rs000064400000000000000000000120371046102023000137300ustar 00000000000000// Copyright 2018 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. 
// // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! QUIC Header Protection. //! //! See draft-ietf-quic-tls. use crate::{ aead::{aes, chacha}, cpu, error, hkdf, }; /// A key for generating QUIC Header Protection masks. pub struct HeaderProtectionKey { inner: KeyInner, algorithm: &'static Algorithm, } #[allow(clippy::large_enum_variant, variant_size_differences)] enum KeyInner { Aes(aes::Key), ChaCha20(chacha::Key), } impl From> for HeaderProtectionKey { fn from(okm: hkdf::Okm<&'static Algorithm>) -> Self { let mut key_bytes = [0; super::MAX_KEY_LEN]; let algorithm = *okm.len(); let key_bytes = &mut key_bytes[..algorithm.key_len()]; okm.fill(key_bytes).unwrap(); Self::new(algorithm, key_bytes).unwrap() } } impl HeaderProtectionKey { /// Create a new header protection key. /// /// `key_bytes` must be exactly `algorithm.key_len` bytes long. pub fn new( algorithm: &'static Algorithm, key_bytes: &[u8], ) -> Result { Ok(Self { inner: (algorithm.init)(key_bytes, cpu::features())?, algorithm, }) } /// Generate a new QUIC Header Protection mask. /// /// `sample` must be exactly `self.algorithm().sample_len()` bytes long. pub fn new_mask(&self, sample: &[u8]) -> Result<[u8; 5], error::Unspecified> { let sample = <&[u8; SAMPLE_LEN]>::try_from(sample)?; let out = (self.algorithm.new_mask)(&self.inner, *sample); Ok(out) } /// The key's algorithm. #[inline(always)] pub fn algorithm(&self) -> &'static Algorithm { self.algorithm } } const SAMPLE_LEN: usize = super::TAG_LEN; /// QUIC sample for new key masks pub type Sample = [u8; SAMPLE_LEN]; /// A QUIC Header Protection Algorithm. pub struct Algorithm { init: fn(key: &[u8], cpu_features: cpu::Features) -> Result, new_mask: fn(key: &KeyInner, sample: Sample) -> [u8; 5], key_len: usize, id: AlgorithmID, } impl hkdf::KeyType for &'static Algorithm { #[inline] fn len(&self) -> usize { self.key_len() } } impl Algorithm { /// The length of the key. #[inline(always)] pub fn key_len(&self) -> usize { self.key_len } /// The required sample length. #[inline(always)] pub fn sample_len(&self) -> usize { SAMPLE_LEN } } derive_debug_via_id!(Algorithm); #[derive(Debug, Eq, PartialEq)] enum AlgorithmID { AES_128, AES_256, CHACHA20, } impl PartialEq for Algorithm { fn eq(&self, other: &Self) -> bool { self.id == other.id } } impl Eq for Algorithm {} /// AES-128. pub static AES_128: Algorithm = Algorithm { key_len: 16, init: aes_init_128, new_mask: aes_new_mask, id: AlgorithmID::AES_128, }; /// AES-256. 
pub static AES_256: Algorithm = Algorithm { key_len: 32, init: aes_init_256, new_mask: aes_new_mask, id: AlgorithmID::AES_256, }; fn aes_init_128(key: &[u8], cpu_features: cpu::Features) -> Result { let key = key.try_into().map_err(|_| error::Unspecified)?; let aes_key = aes::Key::new(aes::KeyBytes::AES_128(key), cpu_features)?; Ok(KeyInner::Aes(aes_key)) } fn aes_init_256(key: &[u8], cpu_features: cpu::Features) -> Result { let key = key.try_into().map_err(|_| error::Unspecified)?; let aes_key = aes::Key::new(aes::KeyBytes::AES_256(key), cpu_features)?; Ok(KeyInner::Aes(aes_key)) } fn aes_new_mask(key: &KeyInner, sample: Sample) -> [u8; 5] { let aes_key = match key { KeyInner::Aes(key) => key, _ => unreachable!(), }; aes_key.new_mask(sample) } /// ChaCha20. pub static CHACHA20: Algorithm = Algorithm { key_len: chacha::KEY_LEN, init: chacha20_init, new_mask: chacha20_new_mask, id: AlgorithmID::CHACHA20, }; fn chacha20_init(key: &[u8], _cpu_features: cpu::Features) -> Result { let chacha20_key: [u8; chacha::KEY_LEN] = key.try_into()?; Ok(KeyInner::ChaCha20(chacha::Key::new(chacha20_key))) } fn chacha20_new_mask(key: &KeyInner, sample: Sample) -> [u8; 5] { let chacha20_key = match key { KeyInner::ChaCha20(key) => key, _ => unreachable!(), }; chacha20_key.new_mask(sample) } ring-0.17.14/src/aead/sealing_key.rs000064400000000000000000000072311046102023000152610ustar 00000000000000// Copyright 2015-2021 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! Authenticated Encryption with Associated Data (AEAD). //! //! See [Authenticated encryption: relations among notions and analysis of the //! generic composition paradigm][AEAD] for an introduction to the concept of //! AEADs. //! //! [AEAD]: https://eprint.iacr.org/2000/025.pdf //! [`crypto.cipher.AEAD`]: https://golang.org/pkg/crypto/cipher/#AEAD use super::{Aad, Algorithm, BoundKey, LessSafeKey, NonceSequence, Tag, UnboundKey}; use crate::error; /// An AEAD key for encrypting and signing ("sealing"), bound to a nonce /// sequence. /// /// Intentionally not `Clone` or `Copy` since cloning would allow duplication /// of the nonce sequence. pub struct SealingKey { key: LessSafeKey, nonce_sequence: N, } impl BoundKey for SealingKey { fn new(key: UnboundKey, nonce_sequence: N) -> Self { Self { key: key.into_inner(), nonce_sequence, } } #[inline] fn algorithm(&self) -> &'static Algorithm { self.key.algorithm() } } impl core::fmt::Debug for SealingKey { fn fmt(&self, f: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> { self.key.fmt_debug("SealingKey", f) } } impl SealingKey { /// Encrypts and signs (“seals”) data in place, appending the tag to the /// resulting ciphertext. 
/// /// `key.seal_in_place_append_tag(aad, in_out)` is equivalent to: /// /// ```skip /// key.seal_in_place_separate_tag(aad, in_out.as_mut()) /// .map(|tag| in_out.extend(tag.as_ref())) /// ``` #[inline] pub fn seal_in_place_append_tag( &mut self, aad: Aad, in_out: &mut InOut, ) -> Result<(), error::Unspecified> where A: AsRef<[u8]>, InOut: AsMut<[u8]> + for<'in_out> Extend<&'in_out u8>, { self.key .seal_in_place_append_tag(self.nonce_sequence.advance()?, aad, in_out) } /// Encrypts and signs (“seals”) data in place. /// /// `aad` is the additional authenticated data (AAD), if any. This is /// authenticated but not encrypted. The type `A` could be a byte slice /// `&[u8]`, a byte array `[u8; N]` for some constant `N`, `Vec`, etc. /// If there is no AAD then use `Aad::empty()`. /// /// The plaintext is given as the input value of `in_out`. `seal_in_place()` /// will overwrite the plaintext with the ciphertext and return the tag. /// For most protocols, the caller must append the tag to the ciphertext. /// The tag will be `self.algorithm.tag_len()` bytes long. #[inline] pub fn seal_in_place_separate_tag( &mut self, aad: Aad, in_out: &mut [u8], ) -> Result where A: AsRef<[u8]>, { self.key .seal_in_place_separate_tag(self.nonce_sequence.advance()?, aad, in_out) } } ring-0.17.14/src/aead/shift.rs000064400000000000000000000026251046102023000141060ustar 00000000000000// Copyright 2018 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #[cfg(target_arch = "x86")] pub fn shift_full_blocks( in_out: super::overlapping::Overlapping<'_, u8>, mut transform: impl FnMut(&[u8; BLOCK_LEN]) -> [u8; BLOCK_LEN], ) { let (in_out, src) = in_out.into_slice_src_mut(); let in_out_len = in_out[src.clone()].len(); for i in (0..in_out_len).step_by(BLOCK_LEN) { let block = { let input = <&[u8; BLOCK_LEN]>::try_from(&in_out[(src.start + i)..][..BLOCK_LEN]).unwrap(); transform(input) }; let output = <&mut [u8; BLOCK_LEN]>::try_from(&mut in_out[i..][..BLOCK_LEN]).unwrap(); *output = block; } } ring-0.17.14/src/aead/unbound_key.rs000064400000000000000000000047311046102023000153130ustar 00000000000000// Copyright 2015-2021 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! Authenticated Encryption with Associated Data (AEAD). //! //! See [Authenticated encryption: relations among notions and analysis of the //! generic composition paradigm][AEAD] for an introduction to the concept of //! AEADs. //! //! [AEAD]: https://eprint.iacr.org/2000/025.pdf //! [`crypto.cipher.AEAD`]: https://golang.org/pkg/crypto/cipher/#AEAD use super::{Algorithm, LessSafeKey, MAX_KEY_LEN}; use crate::{cpu, error, hkdf}; /// An AEAD key without a designated role or nonce sequence. pub struct UnboundKey { inner: LessSafeKey, } impl UnboundKey { /// Constructs a `UnboundKey`. /// /// Fails if `key_bytes.len() != algorithm.key_len()`. #[inline] pub fn new( algorithm: &'static Algorithm, key_bytes: &[u8], ) -> Result { Ok(Self { inner: LessSafeKey::new_(algorithm, key_bytes, cpu::features())?, }) } /// The key's AEAD algorithm. #[inline] pub fn algorithm(&self) -> &'static Algorithm { self.inner.algorithm() } #[inline] pub(super) fn into_inner(self) -> LessSafeKey { self.inner } } impl core::fmt::Debug for UnboundKey { fn fmt(&self, f: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> { self.inner.fmt_debug("UnboundKey", f) } } impl From> for UnboundKey { fn from(okm: hkdf::Okm<&'static Algorithm>) -> Self { let mut key_bytes = [0; MAX_KEY_LEN]; let key_bytes = &mut key_bytes[..okm.len().key_len()]; let algorithm = *okm.len(); okm.fill(key_bytes).unwrap(); Self { inner: LessSafeKey::new_(algorithm, key_bytes, cpu::features()).unwrap(), } } } ring-0.17.14/src/aead.rs000064400000000000000000000122001046102023000127570ustar 00000000000000// Copyright 2015-2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! Authenticated Encryption with Associated Data (AEAD). //! //! See [Authenticated encryption: relations among notions and analysis of the //! generic composition paradigm][AEAD] for an introduction to the concept of //! AEADs. //! //! [AEAD]: https://eprint.iacr.org/2000/025.pdf //! [`crypto.cipher.AEAD`]: https://golang.org/pkg/crypto/cipher/#AEAD use crate::{ cpu, error, polyfill::{u64_from_usize, usize_from_u64_saturated}, }; pub use self::{ algorithm::{Algorithm, AES_128_GCM, AES_256_GCM, CHACHA20_POLY1305}, less_safe_key::LessSafeKey, nonce::{Nonce, NONCE_LEN}, opening_key::OpeningKey, sealing_key::SealingKey, unbound_key::UnboundKey, }; /// A sequences of unique nonces. /// /// A given `NonceSequence` must never return the same `Nonce` twice from /// `advance()`. /// /// A simple counter is a reasonable (but probably not ideal) `NonceSequence`. 
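///
/// For illustration, a minimal counter-based sketch (the big-endian counter
/// layout and the refuse-to-wrap overflow policy below are assumptions made
/// for this example, not something the trait itself prescribes):
///
/// ```
/// use ring::aead::{Nonce, NonceSequence, NONCE_LEN};
/// use ring::error;
///
/// struct CounterNonceSequence(u64);
///
/// impl NonceSequence for CounterNonceSequence {
///     fn advance(&mut self) -> Result<Nonce, error::Unspecified> {
///         // Place the 64-bit counter in the low 8 bytes of the 96-bit nonce.
///         let mut bytes = [0u8; NONCE_LEN];
///         bytes[NONCE_LEN - 8..].copy_from_slice(&self.0.to_be_bytes());
///         // Never wrap around, so a nonce value is never produced twice.
///         self.0 = self.0.checked_add(1).ok_or(error::Unspecified)?;
///         Ok(Nonce::assume_unique_for_key(bytes))
///     }
/// }
/// ```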
/// /// Intentionally not `Clone` or `Copy` since cloning would allow duplication /// of the sequence. pub trait NonceSequence { /// Returns the next nonce in the sequence. /// /// This may fail if "too many" nonces have been requested, where how many /// is too many is up to the implementation of `NonceSequence`. An /// implementation may that enforce a maximum number of records are /// sent/received under a key this way. Once `advance()` fails, it must /// fail for all subsequent calls. fn advance(&mut self) -> Result; } /// An AEAD key bound to a nonce sequence. pub trait BoundKey: core::fmt::Debug { /// Constructs a new key from the given `UnboundKey` and `NonceSequence`. fn new(key: UnboundKey, nonce_sequence: N) -> Self; /// The key's AEAD algorithm. fn algorithm(&self) -> &'static Algorithm; } /// The additionally authenticated data (AAD) for an opening or sealing /// operation. This data is authenticated but is **not** encrypted. /// /// The type `A` could be a byte slice `&[u8]`, a byte array `[u8; N]` /// for some constant `N`, `Vec`, etc. #[derive(Clone, Copy)] pub struct Aad(A); impl> Aad { /// Construct the `Aad` from the given bytes. #[inline] pub fn from(aad: A) -> Self { Self(aad) } } impl AsRef<[u8]> for Aad where A: AsRef<[u8]>, { fn as_ref(&self) -> &[u8] { self.0.as_ref() } } impl Aad<[u8; 0]> { /// Construct an empty `Aad`. pub fn empty() -> Self { Self::from([]) } } impl core::fmt::Debug for Aad where A: core::fmt::Debug, { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { f.debug_tuple("Aad").field(&self.0).finish() } } impl PartialEq for Aad where A: PartialEq, { #[inline] fn eq(&self, other: &Self) -> bool { self.0.eq(&other.0) } } impl Eq for Aad where A: Eq {} #[allow(clippy::large_enum_variant, variant_size_differences)] #[derive(Clone)] enum KeyInner { AesGcm(aes_gcm::Key), ChaCha20Poly1305(chacha20_poly1305::Key), } const fn max_input_len(block_len: usize, overhead_blocks_per_nonce: usize) -> usize { // Each of our AEADs use a 32-bit block counter so the maximum is the // largest input that will not overflow the counter. usize_from_u64_saturated( ((1u64 << 32) - u64_from_usize(overhead_blocks_per_nonce)) * u64_from_usize(block_len), ) } /// A possibly valid authentication tag. #[must_use] #[repr(C)] #[derive(Clone, Copy)] pub struct Tag([u8; TAG_LEN]); impl AsRef<[u8]> for Tag { fn as_ref(&self) -> &[u8] { self.0.as_ref() } } impl TryFrom<&[u8]> for Tag { type Error = error::Unspecified; fn try_from(value: &[u8]) -> Result { let raw_tag: [u8; TAG_LEN] = value.try_into().map_err(|_| error::Unspecified)?; Ok(Self::from(raw_tag)) } } impl From<[u8; TAG_LEN]> for Tag { #[inline] fn from(value: [u8; TAG_LEN]) -> Self { Self(value) } } const MAX_KEY_LEN: usize = 32; // All the AEADs we support use 128-bit tags. const TAG_LEN: usize = 16; /// The maximum length of a tag for the algorithms in this module. pub const MAX_TAG_LEN: usize = TAG_LEN; mod aes; mod aes_gcm; mod algorithm; mod chacha; mod chacha20_poly1305; pub mod chacha20_poly1305_openssh; mod gcm; mod less_safe_key; mod nonce; mod opening_key; mod overlapping; mod poly1305; pub mod quic; mod sealing_key; mod shift; mod unbound_key; ring-0.17.14/src/agreement.rs000064400000000000000000000232261046102023000140460ustar 00000000000000// Copyright 2015-2017 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. 
// // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! Key Agreement: ECDH, including X25519. //! //! # Example //! //! Note that this example uses X25519, but ECDH using NIST P-256/P-384 is done //! exactly the same way, just substituting //! `agreement::ECDH_P256`/`agreement::ECDH_P384` for `agreement::X25519`. //! //! ``` //! use ring::{agreement, rand}; //! //! let rng = rand::SystemRandom::new(); //! //! let my_private_key = agreement::EphemeralPrivateKey::generate(&agreement::X25519, &rng)?; //! //! // Make `my_public_key` a byte slice containing my public key. In a real //! // application, this would be sent to the peer in an encoded protocol //! // message. //! let my_public_key = my_private_key.compute_public_key()?; //! //! let peer_public_key_bytes = { //! // In a real application, the peer public key would be parsed out of a //! // protocol message. Here we just generate one. //! let peer_private_key = //! agreement::EphemeralPrivateKey::generate(&agreement::X25519, &rng)?; //! peer_private_key.compute_public_key()? //! }; //! //! let peer_public_key = agreement::UnparsedPublicKey::new( //! &agreement::X25519, //! peer_public_key_bytes); //! //! agreement::agree_ephemeral( //! my_private_key, //! &peer_public_key, //! |_key_material| { //! // In a real application, we'd apply a KDF to the key material and the //! // public keys (as recommended in RFC 7748) and then derive session //! // keys from the result. We omit all that here. //! }, //! )?; //! //! # Ok::<(), ring::error::Unspecified>(()) //! ``` // The "NSA Guide" steps here are from from section 3.1, "Ephemeral Unified // Model." use crate::{cpu, debug, ec, error, rand}; pub use crate::ec::{ curve25519::x25519::X25519, suite_b::ecdh::{ECDH_P256, ECDH_P384}, }; /// A key agreement algorithm. pub struct Algorithm { pub(crate) curve: &'static ec::Curve, pub(crate) ecdh: fn( out: &mut [u8], private_key: &ec::Seed, peer_public_key: untrusted::Input, cpu: cpu::Features, ) -> Result<(), error::Unspecified>, } derive_debug_via_field!(Algorithm, curve); impl Eq for Algorithm {} impl PartialEq for Algorithm { fn eq(&self, other: &Self) -> bool { self.curve.id == other.curve.id } } /// An ephemeral private key for use (only) with `agree_ephemeral`. The /// signature of `agree_ephemeral` ensures that an `EphemeralPrivateKey` can be /// used for at most one key agreement. pub struct EphemeralPrivateKey { private_key: ec::Seed, algorithm: &'static Algorithm, } derive_debug_via_field!( EphemeralPrivateKey, stringify!(EphemeralPrivateKey), algorithm ); impl EphemeralPrivateKey { /// Generate a new ephemeral private key for the given algorithm. pub fn generate( alg: &'static Algorithm, rng: &dyn rand::SecureRandom, ) -> Result { let cpu_features = cpu::features(); // NSA Guide Step 1. // // This only handles the key generation part of step 1. The rest of // step one is done by `compute_public_key()`. let private_key = ec::Seed::generate(alg.curve, rng, cpu_features)?; Ok(Self { private_key, algorithm: alg, }) } /// Computes the public key from the private key. 
#[inline(always)] pub fn compute_public_key(&self) -> Result { // NSA Guide Step 1. // // Obviously, this only handles the part of Step 1 between the private // key generation and the sending of the public key to the peer. `out` // is what should be sent to the peer. self.private_key .compute_public_key(cpu::features()) .map(|public_key| PublicKey { algorithm: self.algorithm, bytes: public_key, }) } /// The algorithm for the private key. #[inline] pub fn algorithm(&self) -> &'static Algorithm { self.algorithm } /// Do not use. #[deprecated] #[cfg(test)] pub fn bytes(&self) -> &[u8] { self.bytes_for_test() } #[cfg(test)] pub(super) fn bytes_for_test(&self) -> &[u8] { self.private_key.bytes_less_safe() } } /// A public key for key agreement. #[derive(Clone)] pub struct PublicKey { algorithm: &'static Algorithm, bytes: ec::PublicKey, } impl AsRef<[u8]> for PublicKey { fn as_ref(&self) -> &[u8] { self.bytes.as_ref() } } impl core::fmt::Debug for PublicKey { fn fmt(&self, f: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> { f.debug_struct("PublicKey") .field("algorithm", &self.algorithm) .field("bytes", &debug::HexStr(self.as_ref())) .finish() } } impl PublicKey { /// The algorithm for the public key. #[inline] pub fn algorithm(&self) -> &'static Algorithm { self.algorithm } } /// An unparsed, possibly malformed, public key for key agreement. #[derive(Clone, Copy)] pub struct UnparsedPublicKey { algorithm: &'static Algorithm, bytes: B, } impl AsRef<[u8]> for UnparsedPublicKey where B: AsRef<[u8]>, { fn as_ref(&self) -> &[u8] { self.bytes.as_ref() } } impl core::fmt::Debug for UnparsedPublicKey where B: AsRef<[u8]>, { fn fmt(&self, f: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> { f.debug_struct("UnparsedPublicKey") .field("algorithm", &self.algorithm) .field("bytes", &debug::HexStr(self.bytes.as_ref())) .finish() } } impl UnparsedPublicKey { /// Constructs a new `UnparsedPublicKey`. pub fn new(algorithm: &'static Algorithm, bytes: B) -> Self { Self { algorithm, bytes } } /// The algorithm for the public key. #[inline] pub fn algorithm(&self) -> &'static Algorithm { self.algorithm } /// TODO: doc #[inline] pub fn bytes(&self) -> &B { &self.bytes } } /// Performs a key agreement with an ephemeral private key and the given public /// key. /// /// `my_private_key` is the ephemeral private key to use. Since it is moved, it /// will not be usable after calling `agree_ephemeral`, thus guaranteeing that /// the key is used for only one key agreement. /// /// `peer_public_key` is the peer's public key. `agree_ephemeral` will return /// `Err(error_value)` if it does not match `my_private_key's` algorithm/curve. /// `agree_ephemeral` verifies that it is encoded in the standard form for the /// algorithm and that the key is *valid*; see the algorithm's documentation for /// details on how keys are to be encoded and what constitutes a valid key for /// that algorithm. /// /// After the key agreement is done, `agree_ephemeral` calls `kdf` with the raw /// key material from the key agreement operation and then returns what `kdf` /// returns. 
#[inline] pub fn agree_ephemeral, R>( my_private_key: EphemeralPrivateKey, peer_public_key: &UnparsedPublicKey, kdf: impl FnOnce(&[u8]) -> R, ) -> Result { let peer_public_key = UnparsedPublicKey { algorithm: peer_public_key.algorithm, bytes: peer_public_key.bytes.as_ref(), }; agree_ephemeral_(my_private_key, peer_public_key, kdf, cpu::features()) } fn agree_ephemeral_( my_private_key: EphemeralPrivateKey, peer_public_key: UnparsedPublicKey<&[u8]>, kdf: impl FnOnce(&[u8]) -> R, cpu: cpu::Features, ) -> Result { // NSA Guide Prerequisite 1. // // The domain parameters are hard-coded. This check verifies that the // peer's public key's domain parameters match the domain parameters of // this private key. if peer_public_key.algorithm != my_private_key.algorithm { return Err(error::Unspecified); } let alg = &my_private_key.algorithm; // NSA Guide Prerequisite 2, regarding which KDFs are allowed, is delegated // to the caller. // NSA Guide Prerequisite 3, "Prior to or during the key-agreement process, // each party shall obtain the identifier associated with the other party // during the key-agreement scheme," is delegated to the caller. // NSA Guide Step 1 is handled by `EphemeralPrivateKey::generate()` and // `EphemeralPrivateKey::compute_public_key()`. let mut shared_key = [0u8; ec::ELEM_MAX_BYTES]; let shared_key = &mut shared_key[..alg.curve.elem_scalar_seed_len]; // NSA Guide Steps 2, 3, and 4. // // We have a pretty liberal interpretation of the NIST's spec's "Destroy" // that doesn't meet the NSA requirement to "zeroize." (alg.ecdh)( shared_key, &my_private_key.private_key, untrusted::Input::from(peer_public_key.bytes), cpu, )?; // NSA Guide Steps 5 and 6. // // Again, we have a pretty liberal interpretation of the NIST's spec's // "Destroy" that doesn't meet the NSA requirement to "zeroize." Ok(kdf(shared_key)) } ring-0.17.14/src/arithmetic/bigint/boxed_limbs.rs000064400000000000000000000044171046102023000177740ustar 00000000000000// Copyright 2015-2023 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::Modulus; use crate::{ error, limb::{self, Limb}, }; use alloc::{boxed::Box, vec}; use core::{ marker::PhantomData, ops::{Deref, DerefMut}, }; /// All `BoxedLimbs` are stored in the same number of limbs. pub(super) struct BoxedLimbs { limbs: Box<[Limb]>, /// The modulus *m* that determines the size of `limbx`. m: PhantomData, } impl Deref for BoxedLimbs { type Target = [Limb]; #[inline] fn deref(&self) -> &Self::Target { &self.limbs } } impl DerefMut for BoxedLimbs { #[inline] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.limbs } } // TODO: `derive(Clone)` after https://github.com/rust-lang/rust/issues/26925 // is resolved or restrict `M: Clone`. 
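// (For illustration: a `#[derive(Clone)]` would expand to roughly
// `impl<M: Clone> Clone for BoxedLimbs<M>`, i.e. it would require the marker
// type `M` to be `Clone` even though only a `PhantomData<M>` is stored; the
// hand-written impl below avoids that extra bound.)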
impl Clone for BoxedLimbs { fn clone(&self) -> Self { Self { limbs: self.limbs.clone(), m: self.m, } } } impl BoxedLimbs { pub(super) fn from_be_bytes_padded_less_than( input: untrusted::Input, m: &Modulus, ) -> Result { let mut r = Self::zero(m.limbs().len()); limb::parse_big_endian_and_pad_consttime(input, &mut r)?; limb::verify_limbs_less_than_limbs_leak_bit(&r, m.limbs())?; Ok(r) } pub(super) fn zero(len: usize) -> Self { Self { limbs: vec![0; len].into_boxed_slice(), m: PhantomData, } } pub(super) fn into_limbs(self) -> Box<[Limb]> { self.limbs } } ring-0.17.14/src/arithmetic/bigint/modulus.rs000064400000000000000000000160401046102023000171700ustar 00000000000000// Copyright 2015-2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::{ super::montgomery::Unencoded, unwrap_impossible_len_mismatch_error, BoxedLimbs, Elem, OwnedModulusValue, PublicModulus, Storage, N0, }; use crate::{ bits::BitLength, cpu, error, limb::{self, Limb, LIMB_BITS}, polyfill::LeadingZerosStripped, }; use core::marker::PhantomData; /// The modulus *m* for a ring ℤ/mℤ, along with the precomputed values needed /// for efficient Montgomery multiplication modulo *m*. The value must be odd /// and larger than 2. The larger-than-1 requirement is imposed, at least, by /// the modular inversion code. pub struct OwnedModulus { inner: OwnedModulusValue, // n0 * N == -1 (mod r). // // r == 2**(N0::LIMBS_USED * LIMB_BITS) and LG_LITTLE_R == lg(r). This // ensures that we can do integer division by |r| by simply ignoring // `N0::LIMBS_USED` limbs. Similarly, we can calculate values modulo `r` by // just looking at the lowest `N0::LIMBS_USED` limbs. This is what makes // Montgomery multiplication efficient. // // As shown in Algorithm 1 of "Fast Prime Field Elliptic Curve Cryptography // with 256 Bit Primes" by Shay Gueron and Vlad Krasnov, in the loop of a // multi-limb Montgomery multiplication of a * b (mod n), given the // unreduced product t == a * b, we repeatedly calculate: // // t1 := t % r |t1| is |t|'s lowest limb (see previous paragraph). // t2 := t1*n0*n // t3 := t + t2 // t := t3 / r copy all limbs of |t3| except the lowest to |t|. // // In the last step, it would only make sense to ignore the lowest limb of // |t3| if it were zero. The middle steps ensure that this is the case: // // t3 == 0 (mod r) // t + t2 == 0 (mod r) // t + t1*n0*n == 0 (mod r) // t1*n0*n == -t (mod r) // t*n0*n == -t (mod r) // n0*n == -1 (mod r) // n0 == -1/n (mod r) // // Thus, in each iteration of the loop, we multiply by the constant factor // n0, the negative inverse of n (mod r). // // TODO(perf): Not all 32-bit platforms actually make use of n0[1]. For the // ones that don't, we could use a shorter `R` value and use faster `Limb` // calculations instead of double-precision `u64` calculations. 
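    // Toy example (illustrative only; the real r is 2**(N0::LIMBS_USED * LIMB_BITS)):
    // take r = 16 and n = 7. Then n**-1 == 7 (mod 16) since 7*7 == 49 == 1 (mod 16),
    // so n0 == -7 == 9 (mod 16); check: 9*7 == 63 == -1 (mod 16).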
n0: N0, } impl Clone for OwnedModulus { fn clone(&self) -> Self { Self { inner: self.inner.clone(), n0: self.n0, } } } impl OwnedModulus { pub(crate) fn from(n: OwnedModulusValue) -> Self { // n_mod_r = n % r. As explained in the documentation for `n0`, this is // done by taking the lowest `N0::LIMBS_USED` limbs of `n`. #[allow(clippy::useless_conversion)] let n0 = { prefixed_extern! { fn bn_neg_inv_mod_r_u64(n: u64) -> u64; } // XXX: u64::from isn't guaranteed to be constant time. let mut n_mod_r: u64 = u64::from(n.limbs()[0]); if N0::LIMBS_USED == 2 { // XXX: If we use `<< LIMB_BITS` here then 64-bit builds // fail to compile because of `deny(exceeding_bitshifts)`. debug_assert_eq!(LIMB_BITS, 32); n_mod_r |= u64::from(n.limbs()[1]) << 32; } N0::precalculated(unsafe { bn_neg_inv_mod_r_u64(n_mod_r) }) }; Self { inner: n, n0 } } pub fn to_elem(&self, l: &Modulus) -> Result, error::Unspecified> { self.inner.verify_less_than(l)?; let mut limbs = BoxedLimbs::zero(l.limbs().len()); limbs[..self.inner.limbs().len()].copy_from_slice(self.inner.limbs()); Ok(Elem { limbs, encoding: PhantomData, }) } pub(crate) fn modulus(&self, cpu_features: cpu::Features) -> Modulus { Modulus { limbs: self.inner.limbs(), n0: self.n0, len_bits: self.len_bits(), m: PhantomData, cpu_features, } } pub fn len_bits(&self) -> BitLength { self.inner.len_bits() } } impl OwnedModulus { pub fn be_bytes(&self) -> LeadingZerosStripped + Clone + '_> { LeadingZerosStripped::new(limb::unstripped_be_bytes(self.inner.limbs())) } } pub struct Modulus<'a, M> { limbs: &'a [Limb], n0: N0, len_bits: BitLength, m: PhantomData, cpu_features: cpu::Features, } impl Modulus<'_, M> { pub(super) fn oneR(&self, out: &mut [Limb]) { assert_eq!(self.limbs.len(), out.len()); let r = self.limbs.len() * LIMB_BITS; // out = 2**r - m where m = self. limb::limbs_negative_odd(out, self.limbs); let lg_m = self.len_bits().as_bits(); let leading_zero_bits_in_m = r - lg_m; // When m's length is a multiple of LIMB_BITS, which is the case we // most want to optimize for, then we already have // out == 2**r - m == 2**r (mod m). if leading_zero_bits_in_m != 0 { debug_assert!(leading_zero_bits_in_m < LIMB_BITS); // Correct out to 2**(lg m) (mod m). `limbs_negative_odd` flipped // all the leading zero bits to ones. Flip them back. *out.last_mut().unwrap() &= (!0) >> leading_zero_bits_in_m; // Now we have out == 2**(lg m) (mod m). Keep doubling until we get // to 2**r (mod m). for _ in 0..leading_zero_bits_in_m { limb::limbs_double_mod(out, self.limbs) .unwrap_or_else(unwrap_impossible_len_mismatch_error); } } // Now out == 2**r (mod m) == 1*R. } // TODO: XXX Avoid duplication with `Modulus`. pub fn alloc_zero(&self) -> Storage { Storage { limbs: BoxedLimbs::zero(self.limbs.len()), } } #[inline] pub(super) fn limbs(&self) -> &[Limb] { self.limbs } #[inline] pub(super) fn n0(&self) -> &N0 { &self.n0 } pub fn len_bits(&self) -> BitLength { self.len_bits } #[inline] pub(crate) fn cpu_features(&self) -> cpu::Features { self.cpu_features } } ring-0.17.14/src/arithmetic/bigint/modulusvalue.rs000064400000000000000000000060021046102023000202220ustar 00000000000000// Copyright 2015-2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. 
// // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::{ super::{MAX_LIMBS, MIN_LIMBS}, BoxedLimbs, Modulus, PublicModulus, }; use crate::{ bits::BitLength, error, limb::{self, Limb, LIMB_BYTES}, }; /// `OwnedModulus`, without the overhead of Montgomery multiplication support. pub(crate) struct OwnedModulusValue { limbs: BoxedLimbs, // Also `value >= 3`. len_bits: BitLength, } impl Clone for OwnedModulusValue { fn clone(&self) -> Self { Self { limbs: self.limbs.clone(), len_bits: self.len_bits, } } } impl OwnedModulusValue { pub(crate) fn from_be_bytes(input: untrusted::Input) -> Result { let num_limbs = (input.len() + LIMB_BYTES - 1) / LIMB_BYTES; const _MODULUS_MIN_LIMBS_AT_LEAST_2: () = assert!(MIN_LIMBS >= 2); if num_limbs < MIN_LIMBS { return Err(error::KeyRejected::unexpected_error()); } if num_limbs > MAX_LIMBS { return Err(error::KeyRejected::too_large()); } // The above implies n >= 3, so we don't need to check that. // Reject leading zeros. Also reject the value zero ([0]) because zero // isn't positive. if untrusted::Reader::new(input).peek(0) { return Err(error::KeyRejected::invalid_encoding()); } let mut limbs = BoxedLimbs::zero(num_limbs); limb::parse_big_endian_and_pad_consttime(input, &mut limbs) .map_err(|error::Unspecified| error::KeyRejected::unexpected_error())?; limb::limbs_reject_even_leak_bit(&limbs) .map_err(|_: error::Unspecified| error::KeyRejected::invalid_component())?; let len_bits = limb::limbs_minimal_bits(&limbs); Ok(Self { limbs, len_bits }) } pub fn verify_less_than(&self, l: &Modulus) -> Result<(), error::Unspecified> { if self.len_bits() > l.len_bits() { return Err(error::Unspecified); } if self.limbs.len() == l.limbs().len() { limb::verify_limbs_less_than_limbs_leak_bit(&self.limbs, l.limbs())?; } Ok(()) } pub fn len_bits(&self) -> BitLength { self.len_bits } #[inline] pub(super) fn limbs(&self) -> &[Limb] { &self.limbs } } ring-0.17.14/src/arithmetic/bigint/private_exponent.rs000064400000000000000000000054541046102023000211010ustar 00000000000000// Copyright 2015-2023 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::{limb, BoxedLimbs, Limb, Modulus}; use crate::error; use alloc::boxed::Box; pub struct PrivateExponent { // Unlike most `[Limb]` we deal with, these are stored most significant // word first. 
limbs: Box<[Limb]>, } impl PrivateExponent { // `p` is the modulus for which the exponent is in the interval [1, `p` - 1). pub fn from_be_bytes_padded( input: untrusted::Input, p: &Modulus, ) -> Result { let mut dP = BoxedLimbs::from_be_bytes_padded_less_than(input, p)?; // Proof that `dP < p - 1`: // // If `dP < p` then either `dP == p - 1` or `dP < p - 1`. Since `p` is // odd, `p - 1` is even. `d` is odd, and an odd number modulo an even // number is odd. Therefore `dP` must be odd. But then it cannot be // `p - 1` and so we know `dP < p - 1`. // // Further we know `dP != 0` because `dP` is not even. limb::limbs_reject_even_leak_bit(&dP)?; dP.reverse(); Ok(Self { limbs: dP.into_limbs(), }) } // Create a `PrivateExponent` with a value that we do not support in // production use, to allow testing with additional test vectors. #[cfg(test)] pub fn from_be_bytes_for_test_only( input: untrusted::Input, p: &Modulus, ) -> Result { use crate::limb::LIMB_BYTES; // Do exactly what `from_be_bytes_padded` does for any inputs it accepts. if let r @ Ok(_) = Self::from_be_bytes_padded(input, p) { return r; } let num_limbs = (input.len() + LIMB_BYTES - 1) / LIMB_BYTES; let mut limbs = BoxedLimbs::::zero(num_limbs); limb::parse_big_endian_and_pad_consttime(input, &mut limbs) .map_err(|error::Unspecified| error::KeyRejected::unexpected_error())?; limbs.reverse(); Ok(Self { limbs: limbs.into_limbs(), }) } #[inline] pub(super) fn limbs(&self) -> &[Limb] { &self.limbs } } ring-0.17.14/src/arithmetic/bigint.rs000064400000000000000000001073011046102023000155010ustar 00000000000000// Copyright 2015-2023 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! Multi-precision integers. //! //! # Modular Arithmetic. //! //! Modular arithmetic is done in finite commutative rings ℤ/mℤ for some //! modulus *m*. We work in finite commutative rings instead of finite fields //! because the RSA public modulus *n* is not prime, which means ℤ/nℤ contains //! nonzero elements that have no multiplicative inverse, so ℤ/nℤ is not a //! finite field. //! //! In some calculations we need to deal with multiple rings at once. For //! example, RSA private key operations operate in the rings ℤ/nℤ, ℤ/pℤ, and //! ℤ/qℤ. Types and functions dealing with such rings are all parameterized //! over a type `M` to ensure that we don't wrongly mix up the math, e.g. by //! multiplying an element of ℤ/pℤ by an element of ℤ/qℤ modulo q. This follows //! the "unit" pattern described in [Static checking of units in Servo]. //! //! `Elem` also uses the static unit checking pattern to statically track the //! Montgomery factors that need to be canceled out in each value using it's //! `E` parameter. //! //! [Static checking of units in Servo]: //! 
https://blog.mozilla.org/research/2014/06/23/static-checking-of-units-in-servo/ use self::boxed_limbs::BoxedLimbs; pub(crate) use self::{ modulus::{Modulus, OwnedModulus}, modulusvalue::OwnedModulusValue, private_exponent::PrivateExponent, }; use super::{inout::AliasingSlices3, limbs512, montgomery::*, LimbSliceError, MAX_LIMBS}; use crate::{ bits::BitLength, c, error::{self, LenMismatchError}, limb::{self, Limb, LIMB_BITS}, polyfill::slice::{self, AsChunks}, }; use core::{ marker::PhantomData, num::{NonZeroU64, NonZeroUsize}, }; mod boxed_limbs; mod modulus; mod modulusvalue; mod private_exponent; pub trait PublicModulus {} // When we need to create a new `Elem`, first we create a `Storage` and then // move its `limbs` into the new element. When we want to recylce an `Elem`'s // memory allocation, we convert it back into a `Storage`. pub struct Storage { limbs: BoxedLimbs, } impl From> for Storage { fn from(elem: Elem) -> Self { Self { limbs: elem.limbs } } } /// Elements of ℤ/mℤ for some modulus *m*. // // Defaulting `E` to `Unencoded` is a convenience for callers from outside this // submodule. However, for maximum clarity, we always explicitly use // `Unencoded` within the `bigint` submodule. pub struct Elem { limbs: BoxedLimbs, /// The number of Montgomery factors that need to be canceled out from /// `value` to get the actual value. encoding: PhantomData, } impl Elem { pub fn clone_into(&self, mut out: Storage) -> Self { out.limbs.copy_from_slice(&self.limbs); Self { limbs: out.limbs, encoding: self.encoding, } } } impl Elem { #[inline] pub fn is_zero(&self) -> bool { limb::limbs_are_zero(&self.limbs).leak() } } /// Does a Montgomery reduction on `limbs` assuming they are Montgomery-encoded ('R') and assuming /// they are the same size as `m`, but perhaps not reduced mod `m`. The result will be /// fully reduced mod `m`. /// /// WARNING: Takes a `Storage` as an in/out value. fn from_montgomery_amm(mut in_out: Storage, m: &Modulus) -> Elem { let mut one = [0; MAX_LIMBS]; one[0] = 1; let one = &one[..m.limbs().len()]; limbs_mul_mont( (&mut in_out.limbs[..], one), m.limbs(), m.n0(), m.cpu_features(), ) .unwrap_or_else(unwrap_impossible_limb_slice_error); Elem { limbs: in_out.limbs, encoding: PhantomData, } } #[cfg(any(test, not(target_arch = "x86_64")))] impl Elem { #[inline] pub fn into_unencoded(self, m: &Modulus) -> Elem { from_montgomery_amm(Storage::from(self), m) } } impl Elem { pub fn from_be_bytes_padded( input: untrusted::Input, m: &Modulus, ) -> Result { Ok(Self { limbs: BoxedLimbs::from_be_bytes_padded_less_than(input, m)?, encoding: PhantomData, }) } #[inline] pub fn fill_be_bytes(&self, out: &mut [u8]) { // See Falko Strenzke, "Manger's Attack revisited", ICICS 2010. limb::big_endian_from_limbs(&self.limbs, out) } } pub fn elem_mul_into( mut out: Storage, a: &Elem, b: &Elem, m: &Modulus, ) -> Elem::Output> where (AF, BF): ProductEncoding, { limbs_mul_mont( (out.limbs.as_mut(), b.limbs.as_ref(), a.limbs.as_ref()), m.limbs(), m.n0(), m.cpu_features(), ) .unwrap_or_else(unwrap_impossible_limb_slice_error); Elem { limbs: out.limbs, encoding: PhantomData, } } pub fn elem_mul( a: &Elem, mut b: Elem, m: &Modulus, ) -> Elem::Output> where (AF, BF): ProductEncoding, { limbs_mul_mont( (&mut b.limbs[..], &a.limbs[..]), m.limbs(), m.n0(), m.cpu_features(), ) .unwrap_or_else(unwrap_impossible_limb_slice_error); Elem { limbs: b.limbs, encoding: PhantomData, } } // r *= 2. 
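// Note on encodings: Montgomery multiplication computes a*b*R**-1 (mod m), so
// every product cancels exactly one Montgomery factor R. `ProductEncoding`
// tracks this at the type level, e.g. (aR)*(bR)*R**-1 == (a*b)R (R x R -> R)
// and (aR)*b*R**-1 == a*b (R x Unencoded -> Unencoded). Doubling, below, is
// plain modular addition, so it leaves the encoding of its argument unchanged.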
fn elem_double(r: &mut Elem, m: &Modulus) { limb::limbs_double_mod(&mut r.limbs, m.limbs()) .unwrap_or_else(unwrap_impossible_len_mismatch_error) } // TODO: This is currently unused, but we intend to eventually use this to // reduce elements (x mod q) mod p in the RSA CRT. If/when we do so, we // should update the testing so it is reflective of that usage, instead of // the old usage. pub fn elem_reduced_once( mut r: Storage, a: &Elem, m: &Modulus, other_modulus_len_bits: BitLength, ) -> Elem { assert_eq!(m.len_bits(), other_modulus_len_bits); r.limbs.copy_from_slice(&a.limbs); limb::limbs_reduce_once(&mut r.limbs, m.limbs()) .unwrap_or_else(unwrap_impossible_len_mismatch_error); Elem { limbs: r.limbs, encoding: PhantomData, } } #[inline] pub fn elem_reduced( mut r: Storage, a: &Elem, m: &Modulus, other_prime_len_bits: BitLength, ) -> Elem { // This is stricter than required mathematically but this is what we // guarantee and this is easier to check. The real requirement is that // that `a < m*R` where `R` is the Montgomery `R` for `m`. assert_eq!(other_prime_len_bits, m.len_bits()); // `limbs_from_mont_in_place` requires this. assert_eq!(a.limbs.len(), m.limbs().len() * 2); let mut tmp = [0; MAX_LIMBS]; let tmp = &mut tmp[..a.limbs.len()]; tmp.copy_from_slice(&a.limbs); limbs_from_mont_in_place(&mut r.limbs, tmp, m.limbs(), m.n0()); Elem { limbs: r.limbs, encoding: PhantomData, } } #[inline] fn elem_squared( mut a: Elem, m: &Modulus, ) -> Elem::Output> where (E, E): ProductEncoding, { limbs_square_mont(&mut a.limbs, m.limbs(), m.n0(), m.cpu_features()) .unwrap_or_else(unwrap_impossible_limb_slice_error); Elem { limbs: a.limbs, encoding: PhantomData, } } pub fn elem_widen( mut r: Storage, a: Elem, m: &Modulus, smaller_modulus_bits: BitLength, ) -> Result, error::Unspecified> { if smaller_modulus_bits >= m.len_bits() { return Err(error::Unspecified); } let (to_copy, to_zero) = r.limbs.split_at_mut(a.limbs.len()); to_copy.copy_from_slice(&a.limbs); to_zero.fill(0); Ok(Elem { limbs: r.limbs, encoding: PhantomData, }) } // TODO: Document why this works for all Montgomery factors. pub fn elem_add(mut a: Elem, b: Elem, m: &Modulus) -> Elem { limb::limbs_add_assign_mod(&mut a.limbs, &b.limbs, m.limbs()) .unwrap_or_else(unwrap_impossible_len_mismatch_error); a } // TODO: Document why this works for all Montgomery factors. pub fn elem_sub(mut a: Elem, b: &Elem, m: &Modulus) -> Elem { prefixed_extern! { // `r` and `a` may alias. fn LIMBS_sub_mod( r: *mut Limb, a: *const Limb, b: *const Limb, m: *const Limb, num_limbs: c::NonZero_size_t, ); } let num_limbs = NonZeroUsize::new(m.limbs().len()).unwrap(); (a.limbs.as_mut(), b.limbs.as_ref()) .with_non_dangling_non_null_pointers_rab(num_limbs, |r, a, b| { let m = m.limbs().as_ptr(); // Also non-dangling because num_limbs is non-zero. unsafe { LIMBS_sub_mod(r, a, b, m, num_limbs) } }) .unwrap_or_else(unwrap_impossible_len_mismatch_error); a } // The value 1, Montgomery-encoded some number of times. pub struct One(Elem); impl One { // Returns RR = = R**2 (mod n) where R = 2**r is the smallest power of // 2**LIMB_BITS such that R > m. // // Even though the assembly on some 32-bit platforms works with 64-bit // values, using `LIMB_BITS` here, rather than `N0::LIMBS_USED * LIMB_BITS`, // is correct because R**2 will still be a multiple of the latter as // `N0::LIMBS_USED` is either one or two. pub(crate) fn newRR(mut out: Storage, m: &Modulus) -> Self { // The number of limbs in the numbers involved. 
let w = m.limbs().len(); // The length of the numbers involved, in bits. R = 2**r. let r = w * LIMB_BITS; m.oneR(&mut out.limbs); let mut acc: Elem = Elem { limbs: out.limbs, encoding: PhantomData, }; // 2**t * R can be calculated by t doublings starting with R. // // Choose a t that divides r and where t doublings are cheaper than 1 squaring. // // We could choose other values of t than w. But if t < d then the exponentiation that // follows would require multiplications. Normally d is 1 (i.e. the modulus length is a // power of two: RSA 1024, 2048, 4097, 8192) or 3 (RSA 1536, 3072). // // XXX(perf): Currently t = w / 2 is slightly faster. TODO(perf): Optimize `elem_double` // and re-run benchmarks to rebalance this. let t = w; let z = w.trailing_zeros(); let d = w >> z; debug_assert_eq!(w, d * (1 << z)); debug_assert!(d <= t); debug_assert!(t < r); for _ in 0..t { elem_double(&mut acc, m); } // Because t | r: // // MontExp(2**t * R, r / t) // = (2**t)**(r / t) * R (mod m) by definition of MontExp. // = (2**t)**(1/t * r) * R (mod m) // = (2**(t * 1/t))**r * R (mod m) // = (2**1)**r * R (mod m) // = 2**r * R (mod m) // = R * R (mod m) // = RR // // Like BoringSSL, use t = w (`m.limbs.len()`) which ensures that the exponent is a power // of two. Consequently, there will be no multiplications in the Montgomery exponentiation; // there will only be lg(r / t) squarings. // // lg(r / t) // = lg((w * 2**b) / t) // = lg((t * 2**b) / t) // = lg(2**b) // = b // TODO(MSRV:1.67): const B: u32 = LIMB_BITS.ilog2(); const B: u32 = if cfg!(target_pointer_width = "64") { 6 } else if cfg!(target_pointer_width = "32") { 5 } else { panic!("unsupported target_pointer_width") }; #[allow(clippy::assertions_on_constants)] const _LIMB_BITS_IS_2_POW_B: () = assert!(LIMB_BITS == 1 << B); debug_assert_eq!(r, t * (1 << B)); for _ in 0..B { acc = elem_squared(acc, m); } Self(Elem { limbs: acc.limbs, encoding: PhantomData, // PhantomData }) } } impl One { pub(crate) fn newRRR(One(oneRR): One, m: &Modulus) -> Self { Self(elem_squared(oneRR, m)) } } impl AsRef> for One { fn as_ref(&self) -> &Elem { &self.0 } } impl One { pub fn clone_into(&self, out: Storage) -> Self { Self(self.0.clone_into(out)) } } /// Calculates base**exponent (mod m). /// /// The run time is a function of the number of limbs in `m` and the bit /// length and Hamming Weight of `exponent`. The bounds on `m` are pretty /// obvious but the bounds on `exponent` are less obvious. Callers should /// document the bounds they place on the maximum value and maximum Hamming /// weight of `exponent`. // TODO: The test coverage needs to be expanded, e.g. test with the largest // accepted exponent and with the most common values of 65537 and 3. pub(crate) fn elem_exp_vartime( out: Storage, base: Elem, exponent: NonZeroU64, m: &Modulus, ) -> Elem { // Use what [Knuth] calls the "S-and-X binary method", i.e. variable-time // square-and-multiply that scans the exponent from the most significant // bit to the least significant bit (left-to-right). Left-to-right requires // less storage compared to right-to-left scanning, at the cost of needing // to compute `exponent.leading_zeros()`, which we assume to be cheap. // // As explained in [Knuth], exponentiation by squaring is the most // efficient algorithm when the Hamming weight is 2 or less. It isn't the // most efficient for all other, uncommon, exponent values but any // suboptimality is bounded at least by the small bit length of `exponent` // as enforced by its type. 
// // This implementation is slightly simplified by taking advantage of the // fact that we require the exponent to be a positive integer. // // [Knuth]: The Art of Computer Programming, Volume 2: Seminumerical // Algorithms (3rd Edition), Section 4.6.3. let exponent = exponent.get(); let mut acc = base.clone_into(out); let mut bit = 1 << (64 - 1 - exponent.leading_zeros()); debug_assert!((exponent & bit) != 0); while bit > 1 { bit >>= 1; acc = elem_squared(acc, m); if (exponent & bit) != 0 { acc = elem_mul(&base, acc, m); } } acc } pub fn elem_exp_consttime( out: Storage

, base: &Elem, oneRRR: &One, exponent: &PrivateExponent, p: &Modulus

, q: PrivateCrtPrime, qInv: bigint::Elem, public: PublicKey, } derive_debug_via_field!(KeyPair, stringify!(RsaKeyPair), public); impl KeyPair { /// Parses an unencrypted PKCS#8-encoded RSA private key. /// /// This will generate a 2048-bit RSA private key of the correct form using /// OpenSSL's command line tool: /// /// ```sh /// openssl genpkey -algorithm RSA \ /// -pkeyopt rsa_keygen_bits:2048 \ /// -pkeyopt rsa_keygen_pubexp:65537 | \ /// openssl pkcs8 -topk8 -nocrypt -outform der > rsa-2048-private-key.pk8 /// ``` /// /// This will generate a 3072-bit RSA private key of the correct form: /// /// ```sh /// openssl genpkey -algorithm RSA \ /// -pkeyopt rsa_keygen_bits:3072 \ /// -pkeyopt rsa_keygen_pubexp:65537 | \ /// openssl pkcs8 -topk8 -nocrypt -outform der > rsa-3072-private-key.pk8 /// ``` /// /// Often, keys generated for use in OpenSSL-based software are stored in /// the Base64 “PEM” format without the PKCS#8 wrapper. Such keys can be /// converted to binary PKCS#8 form using the OpenSSL command line tool like /// this: /// /// ```sh /// openssl pkcs8 -topk8 -nocrypt -outform der \ /// -in rsa-2048-private-key.pem > rsa-2048-private-key.pk8 /// ``` /// /// Base64 (“PEM”) PKCS#8-encoded keys can be converted to the binary PKCS#8 /// form like this: /// /// ```sh /// openssl pkcs8 -nocrypt -outform der \ /// -in rsa-2048-private-key.pem > rsa-2048-private-key.pk8 /// ``` /// /// See [`Self::from_components`] for more details on how the input is /// validated. /// /// See [RFC 5958] and [RFC 3447 Appendix A.1.2] for more details of the /// encoding of the key. /// /// [NIST SP-800-56B rev. 1]: /// http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-56Br1.pdf /// /// [RFC 3447 Appendix A.1.2]: /// https://tools.ietf.org/html/rfc3447#appendix-A.1.2 /// /// [RFC 5958]: /// https://tools.ietf.org/html/rfc5958 pub fn from_pkcs8(pkcs8: &[u8]) -> Result { const RSA_ENCRYPTION: &[u8] = include_bytes!("../data/alg-rsa-encryption.der"); let (der, _) = pkcs8::unwrap_key_( untrusted::Input::from(RSA_ENCRYPTION), pkcs8::Version::V1Only, untrusted::Input::from(pkcs8), )?; Self::from_der(der.as_slice_less_safe()) } /// Parses an RSA private key that is not inside a PKCS#8 wrapper. /// /// The private key must be encoded as a binary DER-encoded ASN.1 /// `RSAPrivateKey` as described in [RFC 3447 Appendix A.1.2]). In all other /// respects, this is just like `from_pkcs8()`. See the documentation for /// `from_pkcs8()` for more details. /// /// It is recommended to use `from_pkcs8()` (with a PKCS#8-encoded key) /// instead. /// /// See [`Self::from_components()`] for more details on how the input is /// validated. /// /// [RFC 3447 Appendix A.1.2]: /// https://tools.ietf.org/html/rfc3447#appendix-A.1.2 /// /// [NIST SP-800-56B rev. 
1]: /// http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-56Br1.pdf pub fn from_der(input: &[u8]) -> Result { untrusted::Input::from(input).read_all(KeyRejected::invalid_encoding(), |input| { der::nested( input, der::Tag::Sequence, KeyRejected::invalid_encoding(), Self::from_der_reader, ) }) } fn from_der_reader(input: &mut untrusted::Reader) -> Result { let version = der::small_nonnegative_integer(input) .map_err(|error::Unspecified| KeyRejected::invalid_encoding())?; if version != 0 { return Err(KeyRejected::version_not_supported()); } fn nonnegative_integer<'a>( input: &mut untrusted::Reader<'a>, ) -> Result<&'a [u8], KeyRejected> { der::nonnegative_integer(input) .map(|input| input.as_slice_less_safe()) .map_err(|error::Unspecified| KeyRejected::invalid_encoding()) } let n = nonnegative_integer(input)?; let e = nonnegative_integer(input)?; let d = nonnegative_integer(input)?; let p = nonnegative_integer(input)?; let q = nonnegative_integer(input)?; let dP = nonnegative_integer(input)?; let dQ = nonnegative_integer(input)?; let qInv = nonnegative_integer(input)?; let components = KeyPairComponents { public_key: PublicKeyComponents { n, e }, d, p, q, dP, dQ, qInv, }; Self::from_components(&components) } /// Constructs an RSA private key from its big-endian-encoded components. /// /// Only two-prime (not multi-prime) keys are supported. The public modulus /// (n) must be at least 2047 bits. The public modulus must be no larger /// than 4096 bits. It is recommended that the public modulus be exactly /// 2048 or 3072 bits. The public exponent must be at least 65537 and must /// be no more than 33 bits long. /// /// The private key is validated according to [NIST SP-800-56B rev. 1] /// section 6.4.1.4.3, crt_pkv (Intended Exponent-Creation Method Unknown), /// with the following exceptions: /// /// * Section 6.4.1.2.1, Step 1: Neither a target security level nor an /// expected modulus length is provided as a parameter, so checks /// regarding these expectations are not done. /// * Section 6.4.1.2.1, Step 3: Since neither the public key nor the /// expected modulus length is provided as a parameter, the consistency /// check between these values and the private key's value of n isn't /// done. /// * Section 6.4.1.2.1, Step 5: No primality tests are done, both for /// performance reasons and to avoid any side channels that such tests /// would provide. /// * Section 6.4.1.2.1, Step 6, and 6.4.1.4.3, Step 7: /// * *ring* has a slightly looser lower bound for the values of `p` /// and `q` than what the NIST document specifies. This looser lower /// bound matches what most other crypto libraries do. The check might /// be tightened to meet NIST's requirements in the future. Similarly, /// the check that `p` and `q` are not too close together is skipped /// currently, but may be added in the future. /// * The validity of the mathematical relationship of `dP`, `dQ`, `e` /// and `n` is verified only during signing. Some size checks of `d`, /// `dP` and `dQ` are performed at construction, but some NIST checks /// are skipped because they would be expensive and/or they would leak /// information through side channels. If a preemptive check of the /// consistency of `dP`, `dQ`, `e` and `n` with each other is /// necessary, that can be done by signing any message with the key /// pair. /// /// * `d` is not fully validated, neither at construction nor during /// signing. 
This is OK as far as *ring*'s usage of the key is /// concerned because *ring* never uses the value of `d` (*ring* always /// uses `p`, `q`, `dP` and `dQ` via the Chinese Remainder Theorem, /// instead). However, *ring*'s checks would not be sufficient for /// validating a key pair for use by some other system; that other /// system must check the value of `d` itself if `d` is to be used. pub fn from_components( components: &KeyPairComponents, ) -> Result where Public: AsRef<[u8]>, Private: AsRef<[u8]>, { let components = KeyPairComponents { public_key: PublicKeyComponents { n: components.public_key.n.as_ref(), e: components.public_key.e.as_ref(), }, d: components.d.as_ref(), p: components.p.as_ref(), q: components.q.as_ref(), dP: components.dP.as_ref(), dQ: components.dQ.as_ref(), qInv: components.qInv.as_ref(), }; Self::from_components_(&components, cpu::features()) } fn from_components_( &KeyPairComponents { public_key, d, p, q, dP, dQ, qInv, }: &KeyPairComponents<&[u8]>, cpu_features: cpu::Features, ) -> Result { let d = untrusted::Input::from(d); let p = untrusted::Input::from(p); let q = untrusted::Input::from(q); let dP = untrusted::Input::from(dP); let dQ = untrusted::Input::from(dQ); let qInv = untrusted::Input::from(qInv); // XXX: Some steps are done out of order, but the NIST steps are worded // in such a way that it is clear that NIST intends for them to be done // in order. TODO: Does this matter at all? // 6.4.1.4.3/6.4.1.2.1 - Step 1. // Step 1.a is omitted, as explained above. // Step 1.b is omitted per above. Instead, we check that the public // modulus is 2048 to `PRIVATE_KEY_PUBLIC_MODULUS_MAX_BITS` bits. // XXX: The maximum limit of 4096 bits is primarily due to lack of // testing of larger key sizes; see, in particular, // https://www.mail-archive.com/openssl-dev@openssl.org/msg44586.html // and // https://www.mail-archive.com/openssl-dev@openssl.org/msg44759.html. // Also, this limit might help with memory management decisions later. // Step 1.c. We validate e >= 65537. let n = untrusted::Input::from(public_key.n); let e = untrusted::Input::from(public_key.e); let public_key = PublicKey::from_modulus_and_exponent( n, e, BitLength::from_bits(2048), super::PRIVATE_KEY_PUBLIC_MODULUS_MAX_BITS, PublicExponent::_65537, cpu_features, )?; let n_one = public_key.inner().n().oneRR(); let n = &public_key.inner().n().value(cpu_features); // 6.4.1.4.3 says to skip 6.4.1.2.1 Step 2. // 6.4.1.4.3 Step 3. // Step 3.a is done below, out of order. // Step 3.b is unneeded since `n_bits` is derived here from `n`. // 6.4.1.4.3 says to skip 6.4.1.2.1 Step 4. (We don't need to recover // the prime factors since they are already given.) // 6.4.1.4.3 - Step 5. // Steps 5.a and 5.b are omitted, as explained above. let n_bits = public_key.inner().n().len_bits(); let p = PrivatePrime::new(p, n_bits, cpu_features)?; let q = PrivatePrime::new(q, n_bits, cpu_features)?; // TODO: Step 5.i // // 3.b is unneeded since `n_bits` is derived here from `n`. // 6.4.1.4.3 - Step 3.a (out of order). // // Verify that p * q == n. We restrict ourselves to modular // multiplication. We rely on the fact that we've verified // 0 < q < p < n. We check that q and p are close to sqrt(n) and then // assume that these preconditions are enough to let us assume that // checking p * q == 0 (mod n) is equivalent to checking p * q == n. 
let q_mod_n = q .modulus .to_elem(n) .map_err(|error::Unspecified| KeyRejected::inconsistent_components())?; let p_mod_n = p .modulus .to_elem(n) .map_err(|error::Unspecified| KeyRejected::inconsistent_components())?; let p_mod_n = bigint::elem_mul(n_one, p_mod_n, n); let pq_mod_n = bigint::elem_mul(&q_mod_n, p_mod_n, n); if !pq_mod_n.is_zero() { return Err(KeyRejected::inconsistent_components()); } // 6.4.1.4.3/6.4.1.2.1 - Step 6. // Step 6.a, partial. // // First, validate `2**half_n_bits < d`. Since 2**half_n_bits has a bit // length of half_n_bits + 1, this check gives us 2**half_n_bits <= d, // and knowing d is odd makes the inequality strict. let d = bigint::OwnedModulusValue::::from_be_bytes(d) .map_err(|_| KeyRejected::invalid_component())?; if !(n_bits.half_rounded_up() < d.len_bits()) { return Err(KeyRejected::inconsistent_components()); } // XXX: This check should be `d < LCM(p - 1, q - 1)`, but we don't have // a good way of calculating LCM, so it is omitted, as explained above. d.verify_less_than(n) .map_err(|error::Unspecified| KeyRejected::inconsistent_components())?; // Step 6.b is omitted as explained above. let pm = &p.modulus.modulus(cpu_features); // 6.4.1.4.3 - Step 7. // Step 7.c. let qInv = bigint::Elem::from_be_bytes_padded(qInv, pm) .map_err(|error::Unspecified| KeyRejected::invalid_component())?; // Steps 7.d and 7.e are omitted per the documentation above, and // because we don't (in the long term) have a good way to do modulo // with an even modulus. // Step 7.f. let qInv = bigint::elem_mul(p.oneRR.as_ref(), qInv, pm); let q_mod_p = bigint::elem_reduced(pm.alloc_zero(), &q_mod_n, pm, q.modulus.len_bits()); let q_mod_p = bigint::elem_mul(p.oneRR.as_ref(), q_mod_p, pm); bigint::verify_inverses_consttime(&qInv, q_mod_p, pm) .map_err(|error::Unspecified| KeyRejected::inconsistent_components())?; // This should never fail since `n` and `e` were validated above. let p = PrivateCrtPrime::new(p, dP, cpu_features)?; let q = PrivateCrtPrime::new(q, dQ, cpu_features)?; Ok(Self { p, q, qInv, public: public_key, }) } /// Returns a reference to the public key. pub fn public(&self) -> &PublicKey { &self.public } /// Returns the length in bytes of the key pair's public modulus. /// /// A signature has the same length as the public modulus. #[deprecated = "Use `public().modulus_len()`"] #[inline] pub fn public_modulus_len(&self) -> usize { self.public().modulus_len() } } impl signature::KeyPair for KeyPair { type PublicKey = PublicKey; fn public_key(&self) -> &Self::PublicKey { self.public() } } struct PrivatePrime { modulus: bigint::OwnedModulus, oneRR: bigint::One, } impl PrivatePrime { fn new( p: untrusted::Input, n_bits: BitLength, cpu_features: cpu::Features, ) -> Result { let p = bigint::OwnedModulusValue::from_be_bytes(p)?; // 5.c / 5.g: // // TODO: First, stop if `p < (√2) * 2**((nBits/2) - 1)`. // TODO: First, stop if `q < (√2) * 2**((nBits/2) - 1)`. // // Second, stop if `p > 2**(nBits/2) - 1`. // Second, stop if `q > 2**(nBits/2) - 1`. if p.len_bits() != n_bits.half_rounded_up() { return Err(KeyRejected::inconsistent_components()); } if p.len_bits().as_bits() % 512 != 0 { return Err(KeyRejected::private_modulus_len_not_multiple_of_512_bits()); } // TODO: Step 5.d: Verify GCD(p - 1, e) == 1. // TODO: Step 5.h: Verify GCD(q - 1, e) == 1. // Steps 5.e and 5.f are omitted as explained above. 
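        // Precompute oneRR = RR = R**2 (mod p) while the modulus is at hand.
        // Multiplying an element by RR (one Montgomery multiplication)
        // converts it into Montgomery form; `oneRR` is used that way during
        // key validation above and, via `newRRR`, during the CRT
        // exponentiations.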
let p = bigint::OwnedModulus::from(p); let pm = p.modulus(cpu_features); let oneRR = bigint::One::newRR(pm.alloc_zero(), &pm); Ok(Self { modulus: p, oneRR }) } } struct PrivateCrtPrime { modulus: bigint::OwnedModulus, oneRRR: bigint::One, exponent: bigint::PrivateExponent, } impl PrivateCrtPrime { /// Constructs a `PrivateCrtPrime` from the private prime `p` and `dP` where /// dP == d % (p - 1). fn new( p: PrivatePrime, dP: untrusted::Input, cpu_features: cpu::Features, ) -> Result { let m = &p.modulus.modulus(cpu_features); // [NIST SP-800-56B rev. 1] 6.4.1.4.3 - Steps 7.a & 7.b. let dP = bigint::PrivateExponent::from_be_bytes_padded(dP, m) .map_err(|error::Unspecified| KeyRejected::inconsistent_components())?; // XXX: Steps 7.d and 7.e are omitted. We don't check that // `dP == d % (p - 1)` because we don't (in the long term) have a good // way to do modulo with an even modulus. Instead we just check that // `1 <= dP < p - 1`. We'll check it, to some unknown extent, when we // do the private key operation, since we verify that the result of the // private key operation using the CRT parameters is consistent with `n` // and `e`. TODO: Either prove that what we do is sufficient, or make // it so. let oneRRR = bigint::One::newRRR(p.oneRR, m); Ok(Self { modulus: p.modulus, oneRRR, exponent: dP, }) } } fn elem_exp_consttime( c: &bigint::Elem, p: &PrivateCrtPrime, other_prime_len_bits: BitLength, cpu_features: cpu::Features, ) -> Result, error::Unspecified> { let m = &p.modulus.modulus(cpu_features); bigint::elem_exp_consttime( m.alloc_zero(), c, &p.oneRRR, &p.exponent, m, other_prime_len_bits, ) .map_err(error::erase::) } // Type-level representations of the different moduli used in RSA signing, in // addition to `super::N`. See `super::bigint`'s modulue-level documentation. enum P {} enum Q {} enum D {} impl KeyPair { /// Computes the signature of `msg` and writes it into `signature`. /// /// `msg` is digested using the digest algorithm from `padding_alg` and the /// digest is then padded using the padding algorithm from `padding_alg`. /// /// The signature it written into `signature`; `signature`'s length must be /// exactly the length returned by `self::public().modulus_len()` or else /// an error will be returned. On failure, `signature` may contain /// intermediate results, but won't contain anything that would endanger the /// private key. /// /// `rng` may be used to randomize the padding (e.g. for PSS). /// /// Many other crypto libraries have signing functions that takes a /// precomputed digest as input, instead of the message to digest. This /// function does *not* take a precomputed digest; instead, `sign` /// calculates the digest itself. pub fn sign( &self, padding_alg: &'static dyn RsaEncoding, rng: &dyn rand::SecureRandom, msg: &[u8], signature: &mut [u8], ) -> Result<(), error::Unspecified> { let cpu_features = cpu::features(); if signature.len() != self.public().modulus_len() { return Err(error::Unspecified); } let m_hash = digest::digest(padding_alg.digest_alg(), msg); // Use the output buffer as the scratch space for the signature to // reduce the required stack space. padding::encode( padding_alg, m_hash, signature, self.public().inner().n().len_bits(), rng, )?; // RFC 8017 Section 5.1.2: RSADP, using the Chinese Remainder Theorem // with Garner's algorithm. // Steps 1 and 2. let m = self.private_exponentiate(signature, cpu_features)?; // Step 3. m.fill_be_bytes(signature); Ok(()) } /// Returns base**d (mod n). 
/// /// This does not return or write any intermediate results into any buffers /// that are provided by the caller so that no intermediate state will be /// leaked that would endanger the private key. /// /// Panics if `in_out` is not `self.public().modulus_len()`. fn private_exponentiate( &self, base: &[u8], cpu_features: cpu::Features, ) -> Result, error::Unspecified> { assert_eq!(base.len(), self.public().modulus_len()); // RFC 8017 Section 5.1.2: RSADP, using the Chinese Remainder Theorem // with Garner's algorithm. let n = &self.public.inner().n().value(cpu_features); let n_one = self.public.inner().n().oneRR(); // Step 1. The value zero is also rejected. let base = bigint::Elem::from_be_bytes_padded(untrusted::Input::from(base), n)?; // Step 2 let c = base; // Step 2.b.i. let q_bits = self.q.modulus.len_bits(); let m_1 = elem_exp_consttime(&c, &self.p, q_bits, cpu_features)?; let m_2 = elem_exp_consttime(&c, &self.q, self.p.modulus.len_bits(), cpu_features)?; // Step 2.b.ii isn't needed since there are only two primes. // Step 2.b.iii. let h = { let p = &self.p.modulus.modulus(cpu_features); let m_2 = bigint::elem_reduced_once(p.alloc_zero(), &m_2, p, q_bits); let m_1_minus_m_2 = bigint::elem_sub(m_1, &m_2, p); bigint::elem_mul(&self.qInv, m_1_minus_m_2, p) }; // Step 2.b.iv. The reduction in the modular multiplication isn't // necessary because `h < p` and `p * q == n` implies `h * q < n`. // Modular arithmetic is used simply to avoid implementing // non-modular arithmetic. let p_bits = self.p.modulus.len_bits(); let h = bigint::elem_widen(n.alloc_zero(), h, n, p_bits)?; let q_mod_n = self.q.modulus.to_elem(n)?; let q_mod_n = bigint::elem_mul(n_one, q_mod_n, n); let q_times_h = bigint::elem_mul(&q_mod_n, h, n); let m_2 = bigint::elem_widen(n.alloc_zero(), m_2, n, q_bits)?; let m = bigint::elem_add(m_2, q_times_h, n); // Step 2.b.v isn't needed since there are only two primes. // Verify the result to protect against fault attacks as described // in "On the Importance of Checking Cryptographic Protocols for // Faults" by Dan Boneh, Richard A. DeMillo, and Richard J. Lipton. // This check is cheap assuming `e` is small, which is ensured during // `KeyPair` construction. Note that this is the only validation of `e` // that is done other than basic checks on its size, oddness, and // minimum value, since the relationship of `e` to `d`, `p`, and `q` is // not verified during `KeyPair` construction. { let verify = n.alloc_zero(); let verify = self .public .inner() .exponentiate_elem(verify, &m, cpu_features); bigint::elem_verify_equal_consttime(&verify, &c)?; } // Step 3 will be done by the caller. Ok(m) } } #[cfg(test)] mod tests { use super::*; use crate::testutil as test; use alloc::vec; #[test] fn test_rsakeypair_private_exponentiate() { let cpu = cpu::features(); test::run( test_vector_file!("keypair_private_exponentiate_tests.txt"), |section, test_case| { assert_eq!(section, ""); let key = test_case.consume_bytes("Key"); let key = KeyPair::from_pkcs8(&key).unwrap(); let test_cases = &[ test_case.consume_bytes("p"), test_case.consume_bytes("p_plus_1"), test_case.consume_bytes("p_minus_1"), test_case.consume_bytes("q"), test_case.consume_bytes("q_plus_1"), test_case.consume_bytes("q_minus_1"), ]; for test_case in test_cases { // THe call to `elem_verify_equal_consttime` will cause // `private_exponentiate` to fail if the computation is // incorrect. 
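                    // Left-pad each test value with zeros up to the full
                    // modulus length, since `private_exponentiate` asserts
                    // that its input is exactly `public().modulus_len()`
                    // bytes long.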
let mut padded = vec![0; key.public.modulus_len()]; let zeroes = padded.len() - test_case.len(); padded[zeroes..].copy_from_slice(test_case); let _: bigint::Elem<_> = key.private_exponentiate(&padded, cpu).unwrap(); } Ok(()) }, ); } } ring-0.17.14/src/rsa/keypair_components.rs000064400000000000000000000017651046102023000166010ustar 00000000000000use super::PublicKeyComponents; /// RSA key pair components. #[derive(Clone, Copy)] pub struct KeyPairComponents { /// The public key components. pub public_key: PublicKeyComponents, /// The private exponent. pub d: Private, /// The first prime factor of `d`. pub p: Private, /// The second prime factor of `d`. pub q: Private, /// `p`'s public Chinese Remainder Theorem exponent. pub dP: Private, /// `q`'s public Chinese Remainder Theorem exponent. pub dQ: Private, /// `q**-1 mod p`. pub qInv: Private, } impl core::fmt::Debug for KeyPairComponents where PublicKeyComponents: core::fmt::Debug, { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> { // Non-public components are intentionally skipped f.debug_struct("KeyPairComponents") .field("public_key", &self.public_key) .finish() } } ring-0.17.14/src/rsa/padding/pkcs1.rs000064400000000000000000000132551046102023000153140ustar 00000000000000// Copyright 2015-2016 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::{super::PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN, Padding, RsaEncoding, Verification}; use crate::{bits, digest, error, io::der, rand}; /// PKCS#1 1.5 padding as described in [RFC 3447 Section 8.2]. /// /// See "`RSA_PSS_*` Details\" in `ring::signature`'s module-level /// documentation for more details. /// /// [RFC 3447 Section 8.2]: https://tools.ietf.org/html/rfc3447#section-8.2 #[derive(Debug)] pub struct PKCS1 { digest_alg: &'static digest::Algorithm, digestinfo_prefix: &'static [u8], } impl crate::sealed::Sealed for PKCS1 {} impl Padding for PKCS1 { fn digest_alg(&self) -> &'static digest::Algorithm { self.digest_alg } } impl RsaEncoding for PKCS1 { fn encode( &self, m_hash: digest::Digest, m_out: &mut [u8], _mod_bits: bits::BitLength, _rng: &dyn rand::SecureRandom, ) -> Result<(), error::Unspecified> { pkcs1_encode(self, m_hash, m_out); Ok(()) } } impl Verification for PKCS1 { fn verify( &self, m_hash: digest::Digest, m: &mut untrusted::Reader, mod_bits: bits::BitLength, ) -> Result<(), error::Unspecified> { // `mod_bits.as_usize_bytes_rounded_up() <= // PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN` is ensured by `verify_rsa_()`. 
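        // `calculated` holds the expected EMSA-PKCS1-v1_5 encoding of `m_hash`,
        // i.e. 0x00 || 0x01 || PS (0xff bytes) || 0x00 || DigestInfo || H(m),
        // produced by `pkcs1_encode` below and compared against the message
        // representative recovered from the signature.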
let mut calculated = [0u8; PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN]; let calculated = &mut calculated[..mod_bits.as_usize_bytes_rounded_up()]; pkcs1_encode(self, m_hash, calculated); if m.read_bytes_to_end().as_slice_less_safe() != calculated { return Err(error::Unspecified); } Ok(()) } } // Implement padding procedure per EMSA-PKCS1-v1_5, // https://tools.ietf.org/html/rfc3447#section-9.2. This is used by both // verification and signing so it needs to be able to handle moduli of the // minimum and maximum sizes for both operations. fn pkcs1_encode(pkcs1: &PKCS1, m_hash: digest::Digest, m_out: &mut [u8]) { let em = m_out; let digest_len = pkcs1.digestinfo_prefix.len() + pkcs1.digest_alg.output_len(); // The specification requires at least 8 bytes of padding. Since we // disallow keys smaller than 1024 bits, this should always be true. assert!(em.len() >= digest_len + 11); let pad_len = em.len() - digest_len - 3; em[0] = 0; em[1] = 1; for i in 0..pad_len { em[2 + i] = 0xff; } em[2 + pad_len] = 0; let (digest_prefix, digest_dst) = em[3 + pad_len..].split_at_mut(pkcs1.digestinfo_prefix.len()); digest_prefix.copy_from_slice(pkcs1.digestinfo_prefix); digest_dst.copy_from_slice(m_hash.as_ref()); } macro_rules! rsa_pkcs1_padding { ( $vis:vis $PADDING_ALGORITHM:ident, $digest_alg:expr, $digestinfo_prefix:expr, $doc_str:expr ) => { #[doc=$doc_str] $vis static $PADDING_ALGORITHM: PKCS1 = PKCS1 { digest_alg: $digest_alg, digestinfo_prefix: $digestinfo_prefix, }; }; } // Intentionally not exposed except internally for signature verification. At a // minimum, we'd need to create test vectors for signing with it, which we // don't currently have. But, it's a bad idea to use SHA-1 anyway, so perhaps // we just won't ever expose it. rsa_pkcs1_padding!( pub(in super::super) RSA_PKCS1_SHA1_FOR_LEGACY_USE_ONLY, &digest::SHA1_FOR_LEGACY_USE_ONLY, &SHA1_PKCS1_DIGESTINFO_PREFIX, "PKCS#1 1.5 padding using SHA-1 for RSA signatures." ); rsa_pkcs1_padding!( pub RSA_PKCS1_SHA256, &digest::SHA256, &SHA256_PKCS1_DIGESTINFO_PREFIX, "PKCS#1 1.5 padding using SHA-256 for RSA signatures." ); rsa_pkcs1_padding!( pub RSA_PKCS1_SHA384, &digest::SHA384, &SHA384_PKCS1_DIGESTINFO_PREFIX, "PKCS#1 1.5 padding using SHA-384 for RSA signatures." ); rsa_pkcs1_padding!( pub RSA_PKCS1_SHA512, &digest::SHA512, &SHA512_PKCS1_DIGESTINFO_PREFIX, "PKCS#1 1.5 padding using SHA-512 for RSA signatures." ); macro_rules! pkcs1_digestinfo_prefix { ( $name:ident, $digest_len:expr, $digest_oid_len:expr, [ $( $digest_oid:expr ),* ] ) => { static $name: [u8; 2 + 8 + $digest_oid_len] = [ der::Tag::Sequence.into(), 8 + $digest_oid_len + $digest_len, der::Tag::Sequence.into(), 2 + $digest_oid_len + 2, der::Tag::OID.into(), $digest_oid_len, $( $digest_oid ),*, der::Tag::Null.into(), 0, der::Tag::OctetString.into(), $digest_len, ]; } } pkcs1_digestinfo_prefix!( SHA1_PKCS1_DIGESTINFO_PREFIX, 20, 5, [0x2b, 0x0e, 0x03, 0x02, 0x1a] ); pkcs1_digestinfo_prefix!( SHA256_PKCS1_DIGESTINFO_PREFIX, 32, 9, [0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x01] ); pkcs1_digestinfo_prefix!( SHA384_PKCS1_DIGESTINFO_PREFIX, 48, 9, [0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x02] ); pkcs1_digestinfo_prefix!( SHA512_PKCS1_DIGESTINFO_PREFIX, 64, 9, [0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x03] ); ring-0.17.14/src/rsa/padding/pss.rs000064400000000000000000000227271046102023000151040ustar 00000000000000// Copyright 2015-2016 Brian Smith. 
// // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::{super::PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN, mgf1, Padding, RsaEncoding, Verification}; use crate::{bb, bits, digest, error, rand}; /// RSA PSS padding as described in [RFC 3447 Section 8.1]. /// /// See "`RSA_PSS_*` Details\" in `ring::signature`'s module-level /// documentation for more details. /// /// [RFC 3447 Section 8.1]: https://tools.ietf.org/html/rfc3447#section-8.1 #[allow(clippy::upper_case_acronyms)] // TODO: Until we implement cargo-semver-checks #[derive(Debug)] pub struct PSS { digest_alg: &'static digest::Algorithm, } impl crate::sealed::Sealed for PSS {} impl Padding for PSS { fn digest_alg(&self) -> &'static digest::Algorithm { self.digest_alg } } impl RsaEncoding for PSS { // Implement padding procedure per EMSA-PSS, // https://tools.ietf.org/html/rfc3447#section-9.1. fn encode( &self, m_hash: digest::Digest, m_out: &mut [u8], mod_bits: bits::BitLength, rng: &dyn rand::SecureRandom, ) -> Result<(), error::Unspecified> { let metrics = PSSMetrics::new(self.digest_alg, mod_bits)?; // The `m_out` this function fills is the big-endian-encoded value of `m` // from the specification, padded to `k` bytes, where `k` is the length // in bytes of the public modulus. The spec says "Note that emLen will // be one less than k if modBits - 1 is divisible by 8 and equal to k // otherwise." In other words we might need to prefix `em` with a // leading zero byte to form a correct value of `m`. let em = if metrics.top_byte_mask == 0xff { m_out[0] = 0; &mut m_out[1..] } else { m_out }; assert_eq!(em.len(), metrics.em_len); // Steps 1 and 2 are done by the caller to produce `m_hash`. // Step 3 is done by `PSSMetrics::new()` above. let (db, digest_terminator) = em.split_at_mut(metrics.db_len); let separator_pos = db.len() - 1 - metrics.s_len; // Step 4. let salt: &[u8] = { let salt = &mut db[(separator_pos + 1)..]; rng.fill(salt)?; // salt salt }; // Steps 5 and 6. let h = pss_digest(self.digest_alg, m_hash, salt); // Step 7. db[..separator_pos].fill(0); // ps // Step 8. db[separator_pos] = 0x01; // Steps 9 and 10. mgf1(self.digest_alg, h.as_ref(), db); // Step 11. db[0] &= metrics.top_byte_mask; // Step 12. digest_terminator[..metrics.h_len].copy_from_slice(h.as_ref()); digest_terminator[metrics.h_len] = 0xbc; Ok(()) } } impl Verification for PSS { // RSASSA-PSS-VERIFY from https://tools.ietf.org/html/rfc3447#section-8.1.2 // where steps 1, 2(a), and 2(b) have been done for us. fn verify( &self, m_hash: digest::Digest, m: &mut untrusted::Reader, mod_bits: bits::BitLength, ) -> Result<(), error::Unspecified> { let metrics = PSSMetrics::new(self.digest_alg, mod_bits)?; // RSASSA-PSS-VERIFY Step 2(c). 
The `m` this function is given is the // big-endian-encoded value of `m` from the specification, padded to // `k` bytes, where `k` is the length in bytes of the public modulus. // The spec. says "Note that emLen will be one less than k if // modBits - 1 is divisible by 8 and equal to k otherwise," where `k` // is the length in octets of the RSA public modulus `n`. In other // words, `em` might have an extra leading zero byte that we need to // strip before we start the PSS decoding steps which is an artifact of // the `Verification` interface. if metrics.top_byte_mask == 0xff { if m.read_byte()? != 0 { return Err(error::Unspecified); } }; let em = m; // The rest of this function is EMSA-PSS-VERIFY from // https://tools.ietf.org/html/rfc3447#section-9.1.2. // Steps 1 and 2 are done by the caller to produce `m_hash`. // Step 3 is done by `PSSMetrics::new()` above. // Step 5, out of order. let masked_db = em.read_bytes(metrics.db_len)?; let h_hash = em.read_bytes(metrics.h_len)?; // Step 4. if em.read_byte()? != 0xbc { return Err(error::Unspecified); } // Step 7. let mut db = [0u8; PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN]; let db = &mut db[..metrics.db_len]; mgf1(self.digest_alg, h_hash.as_slice_less_safe(), db); masked_db.read_all(error::Unspecified, |masked_bytes| { // Step 6. Check the top bits of first byte are zero. let b = masked_bytes.read_byte()?; if b & !metrics.top_byte_mask != 0 { return Err(error::Unspecified); } db[0] ^= b; // Step 8. let db_rest = &mut db[1..]; let masked_bytes = masked_bytes.read_bytes(db_rest.len())?; bb::xor_assign_at_start(db_rest, masked_bytes.as_slice_less_safe()); Ok(()) })?; // Step 9. db[0] &= metrics.top_byte_mask; // Step 10. let ps_len = metrics.ps_len; if db[0..ps_len].iter().any(|&db| db != 0) { return Err(error::Unspecified); } if db[metrics.ps_len] != 1 { return Err(error::Unspecified); } // Step 11. let salt = &db[(db.len() - metrics.s_len)..]; // Step 12 and 13. let h_prime = pss_digest(self.digest_alg, m_hash, salt); // Step 14. if h_hash.as_slice_less_safe() != h_prime.as_ref() { return Err(error::Unspecified); } Ok(()) } } struct PSSMetrics { #[cfg_attr(not(feature = "alloc"), allow(dead_code))] em_len: usize, db_len: usize, ps_len: usize, s_len: usize, h_len: usize, top_byte_mask: u8, } impl PSSMetrics { fn new( digest_alg: &'static digest::Algorithm, mod_bits: bits::BitLength, ) -> Result { let em_bits = mod_bits.try_sub_1()?; let em_len = em_bits.as_usize_bytes_rounded_up(); let leading_zero_bits = (8 * em_len) - em_bits.as_bits(); debug_assert!(leading_zero_bits < 8); let top_byte_mask = 0xffu8 >> leading_zero_bits; let h_len = digest_alg.output_len(); // We require the salt length to be equal to the digest length. let s_len = h_len; // Step 3 of both `EMSA-PSS-ENCODE` is `EMSA-PSS-VERIFY` requires that // we reject inputs where "emLen < hLen + sLen + 2". The definition of // `emBits` in RFC 3447 Sections 9.1.1 and 9.1.2 says `emBits` must be // "at least 8hLen + 8sLen + 9". Since 9 bits requires two bytes, these // two conditions are equivalent. 9 bits are required as the 0x01 // before the salt requires 1 bit and the 0xbc after the digest // requires 8 bits. 
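// For example (illustrative numbers, editorial addition, not from the original
// source): with a 2048-bit modulus and SHA-256, emBits = 2047, so em_len = 256,
// top_byte_mask = 0x7f, h_len = s_len = 32, db_len = 256 - 33 = 223 and
// ps_len = 223 - 33 = 190; the encoded message is then laid out as
// maskedDB (223 bytes) || H (32 bytes) || 0xbc.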
let db_len = em_len.checked_sub(1 + s_len).ok_or(error::Unspecified)?; let ps_len = db_len.checked_sub(h_len + 1).ok_or(error::Unspecified)?; debug_assert!(em_bits.as_bits() >= (8 * h_len) + (8 * s_len) + 9); Ok(Self { em_len, db_len, ps_len, s_len, h_len, top_byte_mask, }) } } fn pss_digest( digest_alg: &'static digest::Algorithm, m_hash: digest::Digest, salt: &[u8], ) -> digest::Digest { // Fixed prefix. const PREFIX_ZEROS: [u8; 8] = [0u8; 8]; // Encoding step 5 and 6, Verification step 12 and 13. let mut ctx = digest::Context::new(digest_alg); ctx.update(&PREFIX_ZEROS); ctx.update(m_hash.as_ref()); ctx.update(salt); ctx.finish() } macro_rules! rsa_pss_padding { ( $vis:vis $PADDING_ALGORITHM:ident, $digest_alg:expr, $doc_str:expr ) => { #[doc=$doc_str] $vis static $PADDING_ALGORITHM: PSS = PSS { digest_alg: $digest_alg, }; }; } rsa_pss_padding!( pub RSA_PSS_SHA256, &digest::SHA256, "RSA PSS padding using SHA-256 for RSA signatures.\n\nSee \"`RSA_PSS_*` Details\" in `ring::signature`'s module-level documentation for more details." ); rsa_pss_padding!( pub RSA_PSS_SHA384, &digest::SHA384, "RSA PSS padding using SHA-384 for RSA signatures.\n\nSee \"`RSA_PSS_*` Details\" in `ring::signature`'s module-level documentation for more details." ); rsa_pss_padding!( pub RSA_PSS_SHA512, &digest::SHA512, "RSA PSS padding using SHA-512 for RSA signatures.\n\nSee \"`RSA_PSS_*` Details\" in `ring::signature`'s module-level documentation for more details." ); ring-0.17.14/src/rsa/padding.rs000064400000000000000000000145271046102023000142760ustar 00000000000000// Copyright 2015-2016 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use crate::{bb, bits, digest, error, rand}; mod pkcs1; mod pss; pub use self::{ pkcs1::{RSA_PKCS1_SHA256, RSA_PKCS1_SHA384, RSA_PKCS1_SHA512}, pss::{RSA_PSS_SHA256, RSA_PSS_SHA384, RSA_PSS_SHA512}, }; pub(super) use pkcs1::RSA_PKCS1_SHA1_FOR_LEGACY_USE_ONLY; /// Common features of both RSA padding encoding and RSA padding verification. pub trait Padding: 'static + Sync + crate::sealed::Sealed + core::fmt::Debug { // The digest algorithm used for digesting the message (and maybe for // other things). fn digest_alg(&self) -> &'static digest::Algorithm; } pub(super) fn encode( encoding: &dyn RsaEncoding, m_hash: digest::Digest, m_out: &mut [u8], mod_bits: bits::BitLength, rng: &dyn rand::SecureRandom, ) -> Result<(), error::Unspecified> { #[allow(deprecated)] encoding.encode(m_hash, m_out, mod_bits, rng) } /// An RSA signature encoding as described in [RFC 3447 Section 8]. 
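///
/// (Editorial note: within this crate, the `RSA_PKCS1_SHA*` and `RSA_PSS_SHA*`
/// statics re-exported above implement this trait, so any of them can be passed
/// to `RsaKeyPair::sign()` as the padding/encoding algorithm.)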
/// /// [RFC 3447 Section 8]: https://tools.ietf.org/html/rfc3447#section-8 #[cfg(feature = "alloc")] pub trait RsaEncoding: Padding { #[deprecated(note = "internal API that will be removed")] #[doc(hidden)] fn encode( &self, m_hash: digest::Digest, m_out: &mut [u8], mod_bits: bits::BitLength, rng: &dyn rand::SecureRandom, ) -> Result<(), error::Unspecified>; } /// Verification of an RSA signature encoding as described in /// [RFC 3447 Section 8]. /// /// [RFC 3447 Section 8]: https://tools.ietf.org/html/rfc3447#section-8 pub trait Verification: Padding { fn verify( &self, m_hash: digest::Digest, m: &mut untrusted::Reader, mod_bits: bits::BitLength, ) -> Result<(), error::Unspecified>; } // Masks `out` with the output of the mask-generating function MGF1 as // described in https://tools.ietf.org/html/rfc3447#appendix-B.2.1. fn mgf1(digest_alg: &'static digest::Algorithm, seed: &[u8], out: &mut [u8]) { let digest_len = digest_alg.output_len(); // Maximum counter value is the value of (mask_len / digest_len) rounded up. for (i, out) in out.chunks_mut(digest_len).enumerate() { let mut ctx = digest::Context::new(digest_alg); ctx.update(seed); // The counter will always fit in a `u32` because we reject absurdly // long inputs very early. ctx.update(&u32::to_be_bytes(i.try_into().unwrap())); let digest = ctx.finish(); // The last chunk may legitimately be shorter than `digest`, but // `digest` will never be shorter than `out`. bb::xor_assign_at_start(out, digest.as_ref()); } } #[cfg(test)] mod test { use super::*; use crate::testutil as test; use crate::{digest, error}; use alloc::vec; #[test] fn test_pss_padding_verify() { test::run( test_vector_file!("rsa_pss_padding_tests.txt"), |section, test_case| { assert_eq!(section, ""); let digest_name = test_case.consume_string("Digest"); let alg = match digest_name.as_ref() { "SHA256" => &RSA_PSS_SHA256, "SHA384" => &RSA_PSS_SHA384, "SHA512" => &RSA_PSS_SHA512, _ => panic!("Unsupported digest: {}", digest_name), }; let msg = test_case.consume_bytes("Msg"); let msg = untrusted::Input::from(&msg); let m_hash = digest::digest(alg.digest_alg(), msg.as_slice_less_safe()); let encoded = test_case.consume_bytes("EM"); let encoded = untrusted::Input::from(&encoded); // Salt is recomputed in verification algorithm. let _ = test_case.consume_bytes("Salt"); let bit_len = test_case.consume_usize_bits("Len"); let is_valid = test_case.consume_string("Result") == "P"; let actual_result = encoded.read_all(error::Unspecified, |m| alg.verify(m_hash, m, bit_len)); assert_eq!(actual_result.is_ok(), is_valid); Ok(()) }, ); } // Tests PSS encoding for various public modulus lengths. 
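// (Editorial note: the test below substitutes a `FixedSliceRandom` that replays
// the salt recorded in `rsa_pss_padding_tests.txt`, making the PSS encoding
// deterministic so `m_out` can be compared byte-for-byte against `EM`.)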
#[cfg(feature = "alloc")] #[test] fn test_pss_padding_encode() { test::run( test_vector_file!("rsa_pss_padding_tests.txt"), |section, test_case| { assert_eq!(section, ""); let digest_name = test_case.consume_string("Digest"); let alg = match digest_name.as_ref() { "SHA256" => &RSA_PSS_SHA256, "SHA384" => &RSA_PSS_SHA384, "SHA512" => &RSA_PSS_SHA512, _ => panic!("Unsupported digest: {}", digest_name), }; let msg = test_case.consume_bytes("Msg"); let salt = test_case.consume_bytes("Salt"); let encoded = test_case.consume_bytes("EM"); let bit_len = test_case.consume_usize_bits("Len"); let expected_result = test_case.consume_string("Result"); // Only test the valid outputs if expected_result != "P" { return Ok(()); } let rng = test::rand::FixedSliceRandom { bytes: &salt }; let mut m_out = vec![0u8; bit_len.as_usize_bytes_rounded_up()]; let digest = digest::digest(alg.digest_alg(), &msg); #[allow(deprecated)] alg.encode(digest, &mut m_out, bit_len, &rng).unwrap(); assert_eq!(m_out, encoded); Ok(()) }, ); } } ring-0.17.14/src/rsa/public_exponent.rs000064400000000000000000000075051046102023000160640ustar 00000000000000use crate::error; use crate::polyfill::{unwrap_const, ArrayFlatMap, LeadingZerosStripped}; use core::num::NonZeroU64; /// The exponent `e` of an RSA public key. #[derive(Clone, Copy)] pub struct PublicExponent(NonZeroU64); impl PublicExponent { #[cfg(test)] const ALL_CONSTANTS: [Self; 3] = [Self::_3, Self::_65537, Self::MAX]; pub(super) const _3: Self = Self(unwrap_const(NonZeroU64::new(3))); pub(super) const _65537: Self = Self(unwrap_const(NonZeroU64::new(65537))); // This limit was chosen to bound the performance of the simple // exponentiation-by-squaring implementation in `elem_exp_vartime`. In // particular, it helps mitigate theoretical resource exhaustion attacks. 33 // bits was chosen as the limit based on the recommendations in [1] and // [2]. Windows CryptoAPI (at least older versions) doesn't support values // larger than 32 bits [3], so it is unlikely that exponents larger than 32 // bits are being used for anything Windows commonly does. // // [1] https://www.imperialviolet.org/2012/03/16/rsae.html // [2] https://www.imperialviolet.org/2012/03/17/rsados.html // [3] https://msdn.microsoft.com/en-us/library/aa387685(VS.85).aspx const MAX: Self = Self(unwrap_const(NonZeroU64::new((1u64 << 33) - 1))); pub(super) fn from_be_bytes( input: untrusted::Input, min_value: Self, ) -> Result { // See `PublicKey::from_modulus_and_exponent` for background on the step // numbering. if input.len() > 5 { return Err(error::KeyRejected::too_large()); } let value = input.read_all(error::KeyRejected::invalid_encoding(), |input| { // The exponent can't be zero and it can't be prefixed with // zero-valued bytes. if input.peek(0) { return Err(error::KeyRejected::invalid_encoding()); } let mut value = 0u64; loop { let byte = input .read_byte() .map_err(|untrusted::EndOfInput| error::KeyRejected::invalid_encoding())?; value = (value << 8) | u64::from(byte); if input.at_end() { return Ok(value); } } })?; // Step 2 / Step b. NIST SP800-89 defers to FIPS 186-3, which requires // `e >= 65537`. We enforce this when signing, but are more flexible in // verification, for compatibility. Only small public exponents are // supported. let value = NonZeroU64::new(value).ok_or_else(error::KeyRejected::too_small)?; if value < min_value.0 { return Err(error::KeyRejected::too_small()); } if value > Self::MAX.0 { return Err(error::KeyRejected::too_large()); } // Step 3 / Step c. 
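// (Illustrative, editorial: this is the parity check. An encoding of 65537,
// i.e. the bytes 0x01 0x00 0x01, is accepted, while any even value such as
// 65536 is rejected as an invalid component.)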
if value.get() & 1 != 1 { return Err(error::KeyRejected::invalid_component()); } Ok(Self(value)) } /// The big-endian encoding of the exponent. /// /// There are no leading zeros. pub fn be_bytes(&self) -> impl ExactSizeIterator + Clone + '_ { // The `unwrap()` won't fail as `self.0` is only a few bytes long. let bytes = ArrayFlatMap::new(core::iter::once(self.0.get()), u64::to_be_bytes).unwrap(); LeadingZerosStripped::new(bytes) } pub(super) fn value(self) -> NonZeroU64 { self.0 } } #[cfg(test)] mod tests { use super::*; #[test] fn test_public_exponent_constants() { for value in PublicExponent::ALL_CONSTANTS.iter() { let value: u64 = value.0.into(); assert_eq!(value & 1, 1); assert!(value >= PublicExponent::_3.0.into()); // The absolute minimum. assert!(value <= PublicExponent::MAX.0.into()); } } } ring-0.17.14/src/rsa/public_key.rs000064400000000000000000000200371046102023000150070ustar 00000000000000// Copyright 2015-2021 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::{PublicExponent, PublicModulus, N, PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN}; use crate::{ arithmetic::bigint, bits, cpu, error, io::{self, der, der_writer}, limb::LIMB_BYTES, }; use alloc::boxed::Box; use core::num::NonZeroU64; /// An RSA Public Key. #[derive(Clone)] pub struct PublicKey { inner: Inner, serialized: Box<[u8]>, } derive_debug_self_as_ref_hex_bytes!(PublicKey); impl PublicKey { pub(super) fn from_modulus_and_exponent( n: untrusted::Input, e: untrusted::Input, n_min_bits: bits::BitLength, n_max_bits: bits::BitLength, e_min_value: PublicExponent, cpu_features: cpu::Features, ) -> Result { let inner = Inner::from_modulus_and_exponent( n, e, n_min_bits, n_max_bits, e_min_value, cpu_features, )?; let n_bytes = n; let e_bytes = e; // TODO: Remove this re-parsing, and stop allocating this here. // Instead we should serialize on demand without allocation, from // `Modulus::be_bytes()` and `Exponent::be_bytes()`. Once this is // fixed, merge `Inner` back into `PublicKey`. let n_bytes = io::Positive::from_be_bytes(n_bytes) .map_err(|_: error::Unspecified| error::KeyRejected::unexpected_error())?; let e_bytes = io::Positive::from_be_bytes(e_bytes) .map_err(|_: error::Unspecified| error::KeyRejected::unexpected_error())?; let serialized = der_writer::write_all(der::Tag::Sequence, &|output| { der_writer::write_positive_integer(output, &n_bytes)?; der_writer::write_positive_integer(output, &e_bytes) }) .map_err(|_: io::TooLongError| error::KeyRejected::unexpected_error())?; Ok(Self { inner, serialized }) } /// The length, in bytes, of the public modulus. /// /// The modulus length is rounded up to a whole number of bytes if its /// bit length isn't a multiple of 8. 
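    ///
    /// For example, a 2048-bit modulus yields `modulus_len() == 256`, while a
    /// 2049-bit modulus would round up to 257 bytes.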
pub fn modulus_len(&self) -> usize { self.inner.n().len_bits().as_usize_bytes_rounded_up() } pub(super) fn inner(&self) -> &Inner { &self.inner } } /// `PublicKey` but without any superfluous allocations, optimized for one-shot /// RSA signature verification. #[derive(Clone)] pub(crate) struct Inner { n: PublicModulus, e: PublicExponent, } impl Inner { pub(super) fn from_modulus_and_exponent( n: untrusted::Input, e: untrusted::Input, n_min_bits: bits::BitLength, n_max_bits: bits::BitLength, e_min_value: PublicExponent, cpu_features: cpu::Features, ) -> Result { // This is an incomplete implementation of NIST SP800-56Br1 Section // 6.4.2.2, "Partial Public-Key Validation for RSA." That spec defers // to NIST SP800-89 Section 5.3.3, "(Explicit) Partial Public Key // Validation for RSA," "with the caveat that the length of the modulus // shall be a length that is specified in this Recommendation." In // SP800-89, two different sets of steps are given, one set numbered, // and one set lettered. TODO: Document this in the end-user // documentation for RSA keys. let n = PublicModulus::from_be_bytes(n, n_min_bits..=n_max_bits, cpu_features)?; let e = PublicExponent::from_be_bytes(e, e_min_value)?; // If `n` is less than `e` then somebody has probably accidentally swapped // them. The largest acceptable `e` is smaller than the smallest acceptable // `n`, so no additional checks need to be done. // XXX: Steps 4 & 5 / Steps d, e, & f are not implemented. This is also the // case in most other commonly-used crypto libraries. Ok(Self { n, e }) } /// The public modulus. #[inline] pub(super) fn n(&self) -> &PublicModulus { &self.n } /// The public exponent. #[inline] pub(super) fn e(&self) -> PublicExponent { self.e } /// Calculates base**e (mod n), filling the first part of `out_buffer` with /// the result. /// /// This is constant-time with respect to the value in `base` (only). /// /// The result will be a slice of the encoded bytes of the result within /// `out_buffer`, if successful. pub(super) fn exponentiate<'out>( &self, base: untrusted::Input, out_buffer: &'out mut [u8; PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN], cpu_features: cpu::Features, ) -> Result<&'out [u8], error::Unspecified> { let n = &self.n.value(cpu_features); // The encoded value of the base must be the same length as the modulus, // in bytes. if base.len() != self.n.len_bits().as_usize_bytes_rounded_up() { return Err(error::Unspecified); } // RFC 8017 Section 5.2.2: RSAVP1. // Step 1. let s = bigint::Elem::from_be_bytes_padded(base, n)?; if s.is_zero() { return Err(error::Unspecified); } // Step 2. let m = n.alloc_zero(); let m = self.exponentiate_elem(m, &s, cpu_features); // Step 3. Ok(fill_be_bytes_n(m, self.n.len_bits(), out_buffer)) } /// Calculates base**e (mod n). /// /// This is constant-time with respect to `base` only. pub(super) fn exponentiate_elem( &self, out: bigint::Storage, base: &bigint::Elem, cpu_features: cpu::Features, ) -> bigint::Elem { // The exponent was already checked to be at least 3. let exponent_without_low_bit = NonZeroU64::try_from(self.e.value().get() & !1).unwrap(); // The exponent was already checked to be odd. debug_assert_ne!(exponent_without_low_bit, self.e.value()); let n = &self.n.value(cpu_features); let tmp = n.alloc_zero(); let base_r = bigint::elem_mul_into(tmp, self.n.oneRR(), base, n); // During RSA public key operations the exponent is almost always either // 65537 (0b10000000000000001) or 3 (0b11), both of which have a Hamming // weight of 2. 
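// (Illustrative note, editorial: for e = 65537, the low bit is stripped below,
// `elem_exp_vartime` computes base**65536 using roughly 16 squarings, and the
// final `elem_mul` multiplies the remaining factor of `base` back in while
// converting out of the Montgomery domain.)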
The maximum bit length and maximum Hamming weight of the // exponent is bounded by the value of `PublicExponent::MAX`. let acc = bigint::elem_exp_vartime(out, base_r, exponent_without_low_bit, n); // Now do the multiplication for the low bit and convert out of the Montgomery domain. bigint::elem_mul(base, acc, n) } } // XXX: Refactor `signature::KeyPair` to get rid of this. impl AsRef<[u8]> for PublicKey { fn as_ref(&self) -> &[u8] { &self.serialized } } /// Returns the big-endian representation of `elem` that is /// the same length as the minimal-length big-endian representation of /// the modulus `n`. /// /// `n_bits` must be the bit length of the public modulus `n`. fn fill_be_bytes_n( elem: bigint::Elem, n_bits: bits::BitLength, out: &mut [u8; PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN], ) -> &[u8] { let n_bytes = n_bits.as_usize_bytes_rounded_up(); let n_bytes_padded = ((n_bytes + (LIMB_BYTES - 1)) / LIMB_BYTES) * LIMB_BYTES; let out = &mut out[..n_bytes_padded]; elem.fill_be_bytes(out); let (padding, out) = out.split_at(n_bytes_padded - n_bytes); assert!(padding.iter().all(|&b| b == 0)); out } ring-0.17.14/src/rsa/public_key_components.rs000064400000000000000000000033511046102023000172540ustar 00000000000000// Copyright 2015-2021 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::PublicKey; use core::iter::FromIterator; /// RSA public key components. /// /// `B` must implement `AsRef<[u8]>` like `&[u8]` or `Vec`. #[derive(Clone, Copy)] pub struct PublicKeyComponents { /// The public modulus, encoded in big-endian bytes without leading zeros. pub n: B, /// The public exponent, encoded in big-endian bytes without leading zeros. pub e: B, } impl core::fmt::Debug for PublicKeyComponents where B: core::fmt::Debug, { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> { f.debug_struct("PublicKeyComponents") .field("n", &self.n) .field("e", &self.e) .finish() } } impl From<&PublicKey> for PublicKeyComponents where B: FromIterator, { fn from(public_key: &PublicKey) -> Self { Self { n: public_key.inner().n().be_bytes().collect(), e: public_key.inner().e().be_bytes().collect(), } } } ring-0.17.14/src/rsa/public_modulus.rs000064400000000000000000000062131046102023000157070ustar 00000000000000use crate::{ arithmetic::{bigint, montgomery::RR}, bits::{self, FromByteLen as _}, cpu, error::{self, InputTooLongError}, rsa::N, }; use core::ops::RangeInclusive; /// The modulus (n) of an RSA public key. pub struct PublicModulus { value: bigint::OwnedModulus, oneRR: bigint::One, } impl Clone for PublicModulus { fn clone(&self) -> Self { let PublicModulus { value, oneRR } = self; let value = value.clone(); // XXX: Shouldn't really be needed just to call `alloc_zero()`, // but not worth optimizing away. 
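// (Editorial note, an interpretation of the code below: cloning allocates fresh
// storage tied to the cloned modulus and copies the cached Montgomery constant
// RR into it, rather than sharing storage with the original.)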
let cpu = cpu::features(); let n = value.modulus(cpu); let oneRR = oneRR.clone_into(n.alloc_zero()); Self { value, oneRR } } } /* impl core::fmt::Debug for PublicModulus { fn fmt(&self, fmt: &mut ::core::fmt::Formatter) -> Result<(), ::core::fmt::Error> { self.value.fmt(fmt) } }*/ impl PublicModulus { pub(super) fn from_be_bytes( n: untrusted::Input, allowed_bit_lengths: RangeInclusive<bits::BitLength>, cpu_features: cpu::Features, ) -> Result<Self, error::KeyRejected> { // See `PublicKey::from_modulus_and_exponent` for background on the step // numbering. let min_bits = *allowed_bit_lengths.start(); let max_bits = *allowed_bit_lengths.end(); // `pkcs1_encode` depends on this not being small. Otherwise, // `pkcs1_encode` would generate padding that is invalid (too few 0xFF // bytes) for very small keys. const MIN_BITS: bits::BitLength = bits::BitLength::from_bits(1024); // Step 3 / Step c for `n` (out of order). let value = bigint::OwnedModulusValue::from_be_bytes(n)?; let bits = value.len_bits(); // Step 1 / Step a. XXX: SP800-56Br1 and SP800-89 require the length of // the public modulus to be exactly 2048 or 3072 bits, but we are more // flexible to be compatible with other commonly-used crypto libraries. assert!(min_bits >= MIN_BITS); let bits_rounded_up = bits::BitLength::from_byte_len(bits.as_usize_bytes_rounded_up()) .map_err(error::erase::<InputTooLongError>) .unwrap(); // TODO: safe? if bits_rounded_up < min_bits { return Err(error::KeyRejected::too_small()); } if bits > max_bits { return Err(error::KeyRejected::too_large()); } let value = bigint::OwnedModulus::from(value); let m = value.modulus(cpu_features); let oneRR = bigint::One::newRR(m.alloc_zero(), &m); Ok(Self { value, oneRR }) } /// The big-endian encoding of the modulus. /// /// There are no leading zeros. pub fn be_bytes(&self) -> impl ExactSizeIterator<Item = u8> + Clone + '_ { self.value.be_bytes() } /// The length of the modulus in bits. pub fn len_bits(&self) -> bits::BitLength { self.value.len_bits() } pub(super) fn value(&self, cpu_features: cpu::Features) -> bigint::Modulus<N> { self.value.modulus(cpu_features) } pub(super) fn oneRR(&self) -> &bigint::Elem<N, RR> { self.oneRR.as_ref() } } ring-0.17.14/src/rsa/signature_rsa_example_private_key.der000064400000000000000000000022501046102023000217670ustar 00000000000000[binary DER-encoded RSA private key omitted] ring-0.17.14/src/rsa/signature_rsa_example_public_key.der000064400000000000000000000004161046102023000215750ustar 00000000000000[binary DER-encoded RSA public key omitted] ring-0.17.14/src/rsa/verification.rs000064400000000000000000000213101046102023000153420ustar 00000000000000// Copyright 2015-2016 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! Verification of RSA signatures. use super::{ parse_public_key, public_key, PublicExponent, RsaParameters, PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN, }; use crate::{ bits::{self, FromByteLen as _}, cpu, digest, error::{self, InputTooLongError}, sealed, signature, }; impl signature::VerificationAlgorithm for RsaParameters { fn verify( &self, public_key: untrusted::Input, msg: untrusted::Input, signature: untrusted::Input, ) -> Result<(), error::Unspecified> { let (n, e) = parse_public_key(public_key)?; verify_rsa_( self, ( n.big_endian_without_leading_zero_as_input(), e.big_endian_without_leading_zero_as_input(), ), msg, signature, cpu::features(), ) } } impl sealed::Sealed for RsaParameters {} macro_rules! rsa_params { ( $VERIFY_ALGORITHM:ident, $min_bits:expr, $PADDING_ALGORITHM:expr, $doc_str:expr ) => { #[doc=$doc_str] /// /// Only available in `alloc` mode. pub static $VERIFY_ALGORITHM: RsaParameters = RsaParameters { padding_alg: $PADDING_ALGORITHM, min_bits: bits::BitLength::from_bits($min_bits), }; }; } rsa_params!( RSA_PKCS1_1024_8192_SHA1_FOR_LEGACY_USE_ONLY, 1024, &super::padding::RSA_PKCS1_SHA1_FOR_LEGACY_USE_ONLY, "Verification of signatures using RSA keys of 1024-8192 bits, PKCS#1.5 padding, and SHA-1.\n\nSee \"`RSA_PKCS1_*` Details\" in `ring::signature`'s module-level documentation for more details." ); rsa_params!( RSA_PKCS1_2048_8192_SHA1_FOR_LEGACY_USE_ONLY, 2048, &super::padding::RSA_PKCS1_SHA1_FOR_LEGACY_USE_ONLY, "Verification of signatures using RSA keys of 2048-8192 bits, PKCS#1.5 padding, and SHA-1.\n\nSee \"`RSA_PKCS1_*` Details\" in `ring::signature`'s module-level documentation for more details." ); rsa_params!( RSA_PKCS1_1024_8192_SHA256_FOR_LEGACY_USE_ONLY, 1024, &super::padding::RSA_PKCS1_SHA256, "Verification of signatures using RSA keys of 1024-8192 bits, PKCS#1.5 padding, and SHA-256.\n\nSee \"`RSA_PKCS1_*` Details\" in `ring::signature`'s module-level documentation for more details." ); rsa_params!( RSA_PKCS1_2048_8192_SHA256, 2048, &super::padding::RSA_PKCS1_SHA256, "Verification of signatures using RSA keys of 2048-8192 bits, PKCS#1.5 padding, and SHA-256.\n\nSee \"`RSA_PKCS1_*` Details\" in `ring::signature`'s module-level documentation for more details." ); rsa_params!( RSA_PKCS1_2048_8192_SHA384, 2048, &super::padding::RSA_PKCS1_SHA384, "Verification of signatures using RSA keys of 2048-8192 bits, PKCS#1.5 padding, and SHA-384.\n\nSee \"`RSA_PKCS1_*` Details\" in `ring::signature`'s module-level documentation for more details." ); rsa_params!( RSA_PKCS1_2048_8192_SHA512, 2048, &super::padding::RSA_PKCS1_SHA512, "Verification of signatures using RSA keys of 2048-8192 bits, PKCS#1.5 padding, and SHA-512.\n\nSee \"`RSA_PKCS1_*` Details\" in `ring::signature`'s module-level documentation for more details." ); rsa_params!( RSA_PKCS1_1024_8192_SHA512_FOR_LEGACY_USE_ONLY, 1024, &super::padding::RSA_PKCS1_SHA512, "Verification of signatures using RSA keys of 1024-8192 bits, PKCS#1.5 padding, and SHA-512.\n\nSee \"`RSA_PKCS1_*` Details\" in `ring::signature`'s module-level documentation for more details." 
); rsa_params!( RSA_PKCS1_3072_8192_SHA384, 3072, &super::padding::RSA_PKCS1_SHA384, "Verification of signatures using RSA keys of 3072-8192 bits, PKCS#1.5 padding, and SHA-384.\n\nSee \"`RSA_PKCS1_*` Details\" in `ring::signature`'s module-level documentation for more details." ); rsa_params!( RSA_PSS_2048_8192_SHA256, 2048, &super::padding::RSA_PSS_SHA256, "Verification of signatures using RSA keys of 2048-8192 bits, PSS padding, and SHA-256.\n\nSee \"`RSA_PSS_*` Details\" in `ring::signature`'s module-level documentation for more details." ); rsa_params!( RSA_PSS_2048_8192_SHA384, 2048, &super::padding::RSA_PSS_SHA384, "Verification of signatures using RSA keys of 2048-8192 bits, PSS padding, and SHA-384.\n\nSee \"`RSA_PSS_*` Details\" in `ring::signature`'s module-level documentation for more details." ); rsa_params!( RSA_PSS_2048_8192_SHA512, 2048, &super::padding::RSA_PSS_SHA512, "Verification of signatures using RSA keys of 2048-8192 bits, PSS padding, and SHA-512.\n\nSee \"`RSA_PSS_*` Details\" in `ring::signature`'s module-level documentation for more details." ); pub use super::PublicKeyComponents as RsaPublicKeyComponents; impl super::PublicKeyComponents where B: AsRef<[u8]>, { /// Verifies that `signature` is a valid signature of `message` using `self` /// as the public key. `params` determine what algorithm parameters /// (padding, digest algorithm, key length range, etc.) are used in the /// verification. /// /// When the public key is in DER-encoded PKCS#1 ASN.1 format, it is /// recommended to use `ring::signature::verify()` with /// `ring::signature::RSA_PKCS1_*`, because `ring::signature::verify()` /// will handle the parsing in that case. Otherwise, this function can be used /// to pass in the raw bytes for the public key components as /// `untrusted::Input` arguments. // // There are a small number of tests that test this directly, but the // test coverage for this function mostly depends on the test coverage for the // `signature::VerificationAlgorithm` implementation for `RsaParameters`. If we // change that, test coverage for `verify_rsa()` will need to be reconsidered. // (The NIST test vectors were originally in a form that was optimized for // testing `verify_rsa` directly, but the testing work for RSA PKCS#1 // verification was done during the implementation of // `signature::VerificationAlgorithm`, before `verify_rsa` was factored out). pub fn verify( &self, params: &RsaParameters, message: &[u8], signature: &[u8], ) -> Result<(), error::Unspecified> { verify_rsa_( params, ( untrusted::Input::from(self.n.as_ref()), untrusted::Input::from(self.e.as_ref()), ), untrusted::Input::from(message), untrusted::Input::from(signature), cpu::features(), ) } } pub(crate) fn verify_rsa_( params: &RsaParameters, (n, e): (untrusted::Input, untrusted::Input), msg: untrusted::Input, signature: untrusted::Input, cpu_features: cpu::Features, ) -> Result<(), error::Unspecified> { let max_bits: bits::BitLength = bits::BitLength::from_byte_len(PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN) .map_err(error::erase::)?; // XXX: FIPS 186-4 seems to indicate that the minimum // exponent value is 2**16 + 1, but it isn't clear if this is just for // signing or also for verification. We support exponents of 3 and larger // for compatibility with other commonly-used crypto libraries. let key = public_key::Inner::from_modulus_and_exponent( n, e, params.min_bits, max_bits, PublicExponent::_3, cpu_features, )?; // RFC 8017 Section 5.2.2: RSAVP1. 
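// (Editorial summary of the steps below: `exponentiate` rejects signatures
// outside [1, n-1] and computes s**e (mod n); the resulting encoded message
// `decoded` is then checked by the padding algorithm against the digest of
// `msg`.)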
let mut decoded = [0u8; PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN]; let decoded = key.exponentiate(signature, &mut decoded, cpu_features)?; // Verify the padded message is correct. let m_hash = digest::digest(params.padding_alg.digest_alg(), msg.as_slice_less_safe()); untrusted::Input::from(decoded).read_all(error::Unspecified, |m| { params.padding_alg.verify(m_hash, m, key.n().len_bits()) }) } ring-0.17.14/src/rsa.rs000064400000000000000000000050261046102023000126620ustar 00000000000000// Copyright 2015-2016 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. // *R* and *r* in Montgomery math refer to different things, so we always use // `R` to refer to *R* to avoid confusion, even when that's against the normal // naming conventions. Also the standard camelCase names are used for `KeyPair` // components. //! RSA. use crate::{ arithmetic::bigint, bits, error, io::{self, der}, }; pub(crate) mod padding; // Maximum RSA modulus size supported for signature verification (in bytes). const PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN: usize = bits::BitLength::from_bits(8192).as_usize_bytes_rounded_up(); // Keep in sync with the documentation comment for `KeyPair`. const PRIVATE_KEY_PUBLIC_MODULUS_MAX_BITS: bits::BitLength = bits::BitLength::from_bits(4096); /// Parameters for RSA verification. #[derive(Debug)] pub struct RsaParameters { padding_alg: &'static dyn padding::Verification, min_bits: bits::BitLength, } fn parse_public_key( input: untrusted::Input, ) -> Result<(io::Positive, io::Positive), error::Unspecified> { input.read_all(error::Unspecified, |input| { der::nested(input, der::Tag::Sequence, error::Unspecified, |input| { let n = der::positive_integer(input)?; let e = der::positive_integer(input)?; Ok((n, e)) }) }) } // Type-level representation of an RSA public modulus *n*. See // `super::bigint`'s modulue-level documentation. enum N {} impl bigint::PublicModulus for N {} mod keypair; mod keypair_components; mod public_exponent; mod public_key; mod public_key_components; mod public_modulus; pub(crate) mod verification; use self::{public_exponent::PublicExponent, public_modulus::PublicModulus}; pub use self::{ keypair::KeyPair, keypair_components::KeyPairComponents, public_key::PublicKey, public_key_components::PublicKeyComponents, }; ring-0.17.14/src/signature.rs000064400000000000000000000366161046102023000141070ustar 00000000000000// Copyright 2015-2017 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! Public key signatures: signing and verification. //! //! Use the `verify` function to verify signatures, passing a reference to the //! algorithm that identifies the algorithm. See the documentation for `verify` //! for examples. //! //! For signature verification, this API treats each combination of parameters //! as a separate algorithm. For example, instead of having a single "RSA" //! algorithm with a verification function that takes a bunch of parameters, //! there are `RSA_PKCS1_2048_8192_SHA256`, `RSA_PKCS1_2048_8192_SHA384`, etc., //! which encode sets of parameter choices into objects. This is designed to //! reduce the risks of algorithm agility and to provide consistency with ECDSA //! and EdDSA. //! //! Currently this module does not support digesting the message to be signed //! separately from the public key operation, as it is currently being //! optimized for Ed25519 and for the implementation of protocols that do not //! requiring signing large messages. An interface for efficiently supporting //! larger messages may be added later. //! //! //! # Algorithm Details //! //! ## `ECDSA_*_ASN1` Details: ASN.1-encoded ECDSA Signatures //! //! The signature is a ASN.1 DER-encoded `Ecdsa-Sig-Value` as described in //! [RFC 3279 Section 2.2.3]. This is the form of ECDSA signature used in //! X.509-related structures and in TLS's `ServerKeyExchange` messages. //! //! The public key is encoding in uncompressed form using the //! Octet-String-to-Elliptic-Curve-Point algorithm in //! [SEC 1: Elliptic Curve Cryptography, Version 2.0]. //! //! During verification, the public key is validated using the ECC Partial //! Public-Key Validation Routine from Section 5.6.2.3.3 of //! [NIST Special Publication 800-56A, revision 2] and Appendix A.3 of the //! NSA's [Suite B implementer's guide to FIPS 186-3]. Note that, as explained //! in the NSA guide, ECC Partial Public-Key Validation is equivalent to ECC //! Full Public-Key Validation for prime-order curves like this one. //! //! ## `ECDSA_*_FIXED` Details: Fixed-length (PKCS#11-style) ECDSA Signatures //! //! The signature is *r*||*s*, where || denotes concatenation, and where both //! *r* and *s* are both big-endian-encoded values that are left-padded to the //! maximum length. A P-256 signature will be 64 bytes long (two 32-byte //! components) and a P-384 signature will be 96 bytes long (two 48-byte //! components). This is the form of ECDSA signature used PKCS#11 and DNSSEC. //! //! The public key is encoding in uncompressed form using the //! Octet-String-to-Elliptic-Curve-Point algorithm in //! [SEC 1: Elliptic Curve Cryptography, Version 2.0]. //! //! During verification, the public key is validated using the ECC Partial //! Public-Key Validation Routine from Section 5.6.2.3.3 of //! [NIST Special Publication 800-56A, revision 2] and Appendix A.3 of the //! NSA's [Suite B implementer's guide to FIPS 186-3]. Note that, as explained //! in the NSA guide, ECC Partial Public-Key Validation is equivalent to ECC //! Full Public-Key Validation for prime-order curves like this one. //! //! ## `RSA_PKCS1_*` Details: RSA PKCS#1 1.5 Signatures //! //! The signature is an RSASSA-PKCS1-v1_5 signature as described in //! 
[RFC 3447 Section 8.2]. //! //! The public key is encoded as an ASN.1 `RSAPublicKey` as described in //! [RFC 3447 Appendix-A.1.1]. The public key modulus length, rounded *up* to //! the nearest (larger) multiple of 8 bits, must be in the range given in the //! name of the algorithm. The public exponent must be an odd integer of 2-33 //! bits, inclusive. //! //! //! ## `RSA_PSS_*` Details: RSA PSS Signatures //! //! The signature is an RSASSA-PSS signature as described in //! [RFC 3447 Section 8.1]. //! //! The public key is encoded as an ASN.1 `RSAPublicKey` as described in //! [RFC 3447 Appendix-A.1.1]. The public key modulus length, rounded *up* to //! the nearest (larger) multiple of 8 bits, must be in the range given in the //! name of the algorithm. The public exponent must be an odd integer of 2-33 //! bits, inclusive. //! //! During verification, signatures will only be accepted if the MGF1 digest //! algorithm is the same as the message digest algorithm and if the salt //! length is the same length as the message digest. This matches the //! requirements in TLS 1.3 and other recent specifications. //! //! During signing, the message digest algorithm will be used as the MGF1 //! digest algorithm. The salt will be the same length as the message digest. //! This matches the requirements in TLS 1.3 and other recent specifications. //! Additionally, the entire salt is randomly generated separately for each //! signature using the secure random number generator passed to `sign()`. //! //! //! [SEC 1: Elliptic Curve Cryptography, Version 2.0]: //! http://www.secg.org/sec1-v2.pdf //! [NIST Special Publication 800-56A, revision 2]: //! http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-56Ar2.pdf //! [Suite B implementer's guide to FIPS 186-3]: //! https://github.com/briansmith/ring/blob/main/doc/ecdsa.pdf //! [RFC 3279 Section 2.2.3]: //! https://tools.ietf.org/html/rfc3279#section-2.2.3 //! [RFC 3447 Section 8.2]: //! https://tools.ietf.org/html/rfc3447#section-7.2 //! [RFC 3447 Section 8.1]: //! https://tools.ietf.org/html/rfc3447#section-8.1 //! [RFC 3447 Appendix-A.1.1]: //! https://tools.ietf.org/html/rfc3447#appendix-A.1.1 //! //! //! # Examples //! //! ## Signing and verifying with Ed25519 //! //! ``` //! use ring::{ //! rand, //! signature::{self, KeyPair}, //! }; //! //! # fn main() -> Result<(), ring::error::Unspecified> { //! // Generate a key pair in PKCS#8 (v2) format. //! let rng = rand::SystemRandom::new(); //! let pkcs8_bytes = signature::Ed25519KeyPair::generate_pkcs8(&rng)?; //! //! // Normally the application would store the PKCS#8 file persistently. Later //! // it would read the PKCS#8 file from persistent storage to use it. //! //! let key_pair = signature::Ed25519KeyPair::from_pkcs8(pkcs8_bytes.as_ref())?; //! //! // Sign the message "hello, world". //! const MESSAGE: &[u8] = b"hello, world"; //! let sig = key_pair.sign(MESSAGE); //! //! // Normally an application would extract the bytes of the signature and //! // send them in a protocol message to the peer(s). Here we just get the //! // public key key directly from the key pair. //! let peer_public_key_bytes = key_pair.public_key().as_ref(); //! //! // Verify the signature of the message using the public key. Normally the //! // verifier of the message would parse the inputs to this code out of the //! // protocol message(s) sent by the signer. //! let peer_public_key = //! signature::UnparsedPublicKey::new(&signature::ED25519, peer_public_key_bytes); //! 
peer_public_key.verify(MESSAGE, sig.as_ref())?; //! //! # Ok(()) //! # } //! ``` //! //! ## Signing and verifying with RSA (PKCS#1 1.5 padding) //! //! By default OpenSSL writes RSA public keys in SubjectPublicKeyInfo format, //! not RSAPublicKey format, and Base64-encodes them (“PEM” format). //! //! To convert the PEM SubjectPublicKeyInfo format (“BEGIN PUBLIC KEY”) to the //! binary RSAPublicKey format needed by `verify()`, use: //! //! ```sh //! openssl rsa -pubin \ //! -in public_key.pem \ //! -inform PEM \ //! -RSAPublicKey_out \ //! -outform DER \ //! -out public_key.der //! ``` //! //! To extract the RSAPublicKey-formatted public key from an ASN.1 (binary) //! DER-encoded RSAPrivateKey format private key file, use: //! //! ```sh //! openssl rsa -in private_key.der \ //! -inform DER \ //! -RSAPublicKey_out \ //! -outform DER \ //! -out public_key.der //! ``` //! //! ``` //! # #[cfg(feature = "std")] //! use ring::{rand, rsa, signature}; //! //! # #[cfg(feature = "std")] //! fn sign_and_verify_rsa(private_key_path: &std::path::Path, //! public_key_path: &std::path::Path) //! -> Result<(), MyError> { //! // Create an RSA keypair from the DER-encoded bytes. This example uses //! // a 2048-bit key, but larger keys are also supported. //! let private_key_der = read_file(private_key_path)?; //! let key_pair = rsa::KeyPair::from_der(&private_key_der) //! .map_err(|_| MyError::BadPrivateKey)?; //! //! // Sign the message "hello, world", using PKCS#1 v1.5 padding and the //! // SHA256 digest algorithm. //! const MESSAGE: &'static [u8] = b"hello, world"; //! let rng = rand::SystemRandom::new(); //! let mut signature = vec![0; key_pair.public().modulus_len()]; //! key_pair.sign(&signature::RSA_PKCS1_SHA256, &rng, MESSAGE, &mut signature) //! .map_err(|_| MyError::OOM)?; //! //! // Verify the signature. //! let public_key = //! signature::UnparsedPublicKey::new(&signature::RSA_PKCS1_2048_8192_SHA256, //! read_file(public_key_path)?); //! public_key.verify(MESSAGE, &signature) //! .map_err(|_| MyError::BadSignature) //! } //! //! #[derive(Debug)] //! enum MyError { //! # #[cfg(feature = "std")] //! IO(std::io::Error), //! BadPrivateKey, //! OOM, //! BadSignature, //! } //! //! # #[cfg(feature = "std")] //! fn read_file(path: &std::path::Path) -> Result, MyError> { //! use std::io::Read; //! //! let mut file = std::fs::File::open(path).map_err(|e| MyError::IO(e))?; //! let mut contents: Vec = Vec::new(); //! file.read_to_end(&mut contents).map_err(|e| MyError::IO(e))?; //! Ok(contents) //! } //! # //! # #[cfg(not(feature = "std"))] //! # fn sign_and_verify_rsa(_private_key_path: &std::path::Path, //! # _public_key_path: &std::path::Path) //! # -> Result<(), ()> { //! # Ok(()) //! # } //! # //! # fn main() { //! # let private_key_path = //! # std::path::Path::new("src/rsa/signature_rsa_example_private_key.der"); //! # let public_key_path = //! # std::path::Path::new("src/rsa/signature_rsa_example_public_key.der"); //! # sign_and_verify_rsa(&private_key_path, &public_key_path).unwrap() //! # } //! 
``` use crate::{cpu, debug, ec, error, sealed}; pub use crate::ec::{ curve25519::ed25519::{ signing::Ed25519KeyPair, verification::{EdDSAParameters, ED25519}, ED25519_PUBLIC_KEY_LEN, }, suite_b::ecdsa::{ signing::{ EcdsaKeyPair, EcdsaSigningAlgorithm, ECDSA_P256_SHA256_ASN1_SIGNING, ECDSA_P256_SHA256_FIXED_SIGNING, ECDSA_P384_SHA384_ASN1_SIGNING, ECDSA_P384_SHA384_FIXED_SIGNING, }, verification::{ EcdsaVerificationAlgorithm, ECDSA_P256_SHA256_ASN1, ECDSA_P256_SHA256_FIXED, ECDSA_P256_SHA384_ASN1, ECDSA_P384_SHA256_ASN1, ECDSA_P384_SHA384_ASN1, ECDSA_P384_SHA384_FIXED, }, }, }; #[cfg(feature = "alloc")] pub use crate::rsa::{ padding::{ RsaEncoding, RSA_PKCS1_SHA256, RSA_PKCS1_SHA384, RSA_PKCS1_SHA512, RSA_PSS_SHA256, RSA_PSS_SHA384, RSA_PSS_SHA512, }, verification::{ RsaPublicKeyComponents, RSA_PKCS1_1024_8192_SHA1_FOR_LEGACY_USE_ONLY, RSA_PKCS1_1024_8192_SHA256_FOR_LEGACY_USE_ONLY, RSA_PKCS1_1024_8192_SHA512_FOR_LEGACY_USE_ONLY, RSA_PKCS1_2048_8192_SHA1_FOR_LEGACY_USE_ONLY, RSA_PKCS1_2048_8192_SHA256, RSA_PKCS1_2048_8192_SHA384, RSA_PKCS1_2048_8192_SHA512, RSA_PKCS1_3072_8192_SHA384, RSA_PSS_2048_8192_SHA256, RSA_PSS_2048_8192_SHA384, RSA_PSS_2048_8192_SHA512, }, RsaParameters, }; /// An RSA key pair, used for signing. #[cfg(feature = "alloc")] pub type RsaKeyPair = crate::rsa::KeyPair; /// A public key signature returned from a signing operation. #[derive(Clone, Copy)] pub struct Signature { value: [u8; MAX_LEN], len: usize, } impl Signature { // Panics if `value` is too long. pub(crate) fn new(fill: F) -> Self where F: FnOnce(&mut [u8; MAX_LEN]) -> usize, { let mut r = Self { value: [0; MAX_LEN], len: 0, }; r.len = fill(&mut r.value); r } } impl AsRef<[u8]> for Signature { fn as_ref(&self) -> &[u8] { &self.value[..self.len] } } /// Key pairs for signing messages (private key and public key). pub trait KeyPair: core::fmt::Debug + Send + Sized + Sync { /// The type of the public key. type PublicKey: AsRef<[u8]> + core::fmt::Debug + Clone + Send + Sized + Sync; /// The public key for the key pair. fn public_key(&self) -> &Self::PublicKey; } /// The longest signature is an ASN.1 P-384 signature where *r* and *s* are of /// maximum length with the leading high bit set on each. Then each component /// will have a tag, a one-byte length, and a one-byte “I'm not negative” /// prefix, and the outer sequence will have a two-byte length. pub(crate) const MAX_LEN: usize = 1/*tag:SEQUENCE*/ + 2/*len*/ + (2 * (1/*tag:INTEGER*/ + 1/*len*/ + 1/*zero*/ + ec::SCALAR_MAX_BYTES)); /// A signature verification algorithm. pub trait VerificationAlgorithm: core::fmt::Debug + Sync + sealed::Sealed { /// Verify the signature `signature` of message `msg` with the public key /// `public_key`. fn verify( &self, public_key: untrusted::Input, msg: untrusted::Input, signature: untrusted::Input, ) -> Result<(), error::Unspecified>; } /// An unparsed, possibly malformed, public key for signature verification. #[derive(Clone, Copy)] pub struct UnparsedPublicKey { algorithm: &'static dyn VerificationAlgorithm, bytes: B, } impl AsRef<[u8]> for UnparsedPublicKey where B: AsRef<[u8]>, { fn as_ref(&self) -> &[u8] { self.bytes.as_ref() } } impl core::fmt::Debug for UnparsedPublicKey where B: AsRef<[u8]>, { fn fmt(&self, f: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> { f.debug_struct("UnparsedPublicKey") .field("algorithm", &self.algorithm) .field("bytes", &debug::HexStr(self.bytes.as_ref())) .finish() } } impl UnparsedPublicKey { /// Construct a new `UnparsedPublicKey`. 
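    ///
    /// (Editorial note: `bytes` may be any type implementing `AsRef<[u8]>`,
    /// such as `&[u8]` or `Vec<u8>`.)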
/// /// No validation of `bytes` is done until `verify()` is called. #[inline] pub fn new(algorithm: &'static dyn VerificationAlgorithm, bytes: B) -> Self { Self { algorithm, bytes } } /// Parses the public key and verifies `signature` is a valid signature of /// `message` using it. /// /// See the [crate::signature] module-level documentation for examples. pub fn verify(&self, message: &[u8], signature: &[u8]) -> Result<(), error::Unspecified> where B: AsRef<[u8]>, { let _ = cpu::features(); self.algorithm.verify( untrusted::Input::from(self.bytes.as_ref()), untrusted::Input::from(message), untrusted::Input::from(signature), ) } } ring-0.17.14/src/tests/bits_tests.rs000064400000000000000000000045661046102023000154320ustar 00000000000000// Copyright 2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use crate::{ bits::{BitLength, FromByteLen as _}, polyfill::u64_from_usize, }; #[test] fn test_from_byte_len_overflow() { const USIZE_MAX_VALID_BYTES: usize = usize::MAX / 8; // Maximum valid input for BitLength. match BitLength::::from_byte_len(USIZE_MAX_VALID_BYTES) { Ok(bits) => { assert_eq!(bits.as_usize_bytes_rounded_up(), USIZE_MAX_VALID_BYTES); assert_eq!(bits.as_bits(), usize::MAX & !0b111); } Err(_) => unreachable!(), } // Minimum invalid usize input for BitLength. assert!(BitLength::::from_byte_len(USIZE_MAX_VALID_BYTES + 1).is_err()); // Minimum invalid usize input for BitLength on 64-bit targets. { let r = BitLength::::from_byte_len(USIZE_MAX_VALID_BYTES + 1); if cfg!(target_pointer_width = "64") { assert!(r.is_err()); } else { match r { Ok(bits) => { assert_eq!( bits.as_bits(), (u64_from_usize(USIZE_MAX_VALID_BYTES) + 1) * 8 ); } Err(_) => unreachable!(), } } } const U64_MAX_VALID_BYTES: u64 = u64::MAX / 8; // Maximum valid u64 input for BitLength. match BitLength::::from_byte_len(U64_MAX_VALID_BYTES) { Ok(bits) => assert_eq!(bits.as_bits(), u64::MAX & !0b111), Err(_) => unreachable!(), }; // Minimum invalid usize input for BitLength on 64-bit targets. assert!(BitLength::::from_byte_len(U64_MAX_VALID_BYTES + 1).is_err()); } ring-0.17.14/src/tests/mod.rs000064400000000000000000000014701046102023000140150ustar 00000000000000// Copyright 2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! Integration tests for non-public APIs. mod bits_tests; ring-0.17.14/src/testutil.rs000064400000000000000000000542641046102023000137620ustar 00000000000000// Copyright 2015-2016 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! Testing framework. //! //! Unlike the rest of *ring*, this testing framework uses panics pretty //! liberally. It was originally designed for internal use--it drives most of //! *ring*'s internal tests, and so it is optimized for getting *ring*'s tests //! written quickly at the expense of some usability. The documentation is //! lacking. The best way to learn it is to look at some examples. The digest //! tests are the most complicated because they use named sections. Other tests //! avoid named sections and so are easier to understand. //! //! # Examples //! //! ## Writing Tests //! //! Input files look like this: //! //! ```text //! # This is a comment. //! //! HMAC = SHA1 //! Input = "My test data" //! Key = "" //! Output = 61afdecb95429ef494d61fdee15990cabf0826fc //! //! HMAC = SHA256 //! Input = "Sample message for keylen //! at C:\Users\Example\example\:4 //! 9: 0x7ff65496d49c - example_test //! at C:\Users\Example\example\src\example.rs:652 //! 10: 0x7ff6549d192a - test::stats::Summary::new::ha139494ed2e4e01f //! 11: 0x7ff6549d51a2 - test::stats::Summary::new::ha139494ed2e4e01f //! 12: 0x7ff654a0a911 - _rust_maybe_catch_panic //! 13: 0x7ff6549d56dd - test::stats::Summary::new::ha139494ed2e4e01f //! 14: 0x7ff654a03783 - std::sys::thread::Thread::new::h2b08da6cd2517f79 //! 15: 0x7ff968518101 - BaseThreadInitThunk //! ``` //! //! Notice that the output shows the name of the data file //! (`src/example_tests.txt`), the test inputs that led to the failure, and the //! stack trace to the line in the test code that panicked: entry 9 in the //! stack trace pointing to line 652 of the file `example.rs`. extern crate alloc; use alloc::{format, string::String, vec::Vec}; use crate::{bits, digest, error}; #[cfg(any(feature = "std", feature = "test_logging"))] extern crate std; /// `compile_time_assert_clone::();` fails to compile if `T` doesn't /// implement `Clone`. pub const fn compile_time_assert_clone() {} /// `compile_time_assert_copy::();` fails to compile if `T` doesn't /// implement `Copy`. pub const fn compile_time_assert_copy() {} /// `compile_time_assert_eq::();` fails to compile if `T` doesn't /// implement `Eq`. pub const fn compile_time_assert_eq() {} /// `compile_time_assert_send::();` fails to compile if `T` doesn't /// implement `Send`. 
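///
/// (Illustrative usage, editorial; `MyKeyType` is a hypothetical type expected
/// to be `Send`: `const _: () = compile_time_assert_send::<MyKeyType>();`.)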
pub const fn compile_time_assert_send() {} /// `compile_time_assert_sync::();` fails to compile if `T` doesn't /// implement `Sync`. pub const fn compile_time_assert_sync() {} /// `compile_time_assert_std_error_error::();` fails to compile if `T` /// doesn't implement `std::error::Error`. #[cfg(feature = "std")] pub const fn compile_time_assert_std_error_error() {} /// A test case. A test case consists of a set of named attributes. Every /// attribute in the test case must be consumed exactly once; this helps catch /// typos and omissions. /// /// Requires the `alloc` default feature to be enabled. #[derive(Debug)] pub struct TestCase { attributes: Vec<(String, String, bool)>, } impl TestCase { /// Maps the string "true" to true and the string "false" to false. pub fn consume_bool(&mut self, key: &str) -> bool { match self.consume_string(key).as_ref() { "true" => true, "false" => false, s => panic!("Invalid bool value: {}", s), } } /// Maps the strings "SHA1", "SHA256", "SHA384", and "SHA512" to digest /// algorithms, maps "SHA224" to `None`, and panics on other (erroneous) /// inputs. "SHA224" is mapped to None because *ring* intentionally does /// not support SHA224, but we need to consume test vectors from NIST that /// have SHA224 vectors in them. pub fn consume_digest_alg(&mut self, key: &str) -> Option<&'static digest::Algorithm> { let name = self.consume_string(key); match name.as_ref() { "SHA1" => Some(&digest::SHA1_FOR_LEGACY_USE_ONLY), "SHA224" => None, // We actively skip SHA-224 support. "SHA256" => Some(&digest::SHA256), "SHA384" => Some(&digest::SHA384), "SHA512" => Some(&digest::SHA512), "SHA512_256" => Some(&digest::SHA512_256), _ => panic!("Unsupported digest algorithm: {}", name), } } /// Returns the value of an attribute that is encoded as a sequence of an /// even number of hex digits, or as a double-quoted UTF-8 string. The /// empty (zero-length) value is represented as "". pub fn consume_bytes(&mut self, key: &str) -> Vec { self.consume_optional_bytes(key) .unwrap_or_else(|| panic!("No attribute named \"{}\"", key)) } /// Like `consume_bytes()` except it returns `None` if the test case /// doesn't have the attribute. pub fn consume_optional_bytes(&mut self, key: &str) -> Option> { let s = self.consume_optional_string(key)?; let result = if let [b'\"', s @ ..] = s.as_bytes() { // The value is a quoted UTF-8 string. let mut s = s.iter(); let mut bytes = Vec::with_capacity(s.len() - 1); loop { let b = match s.next() { Some(b'\\') => { match s.next() { // We don't allow all octal escape sequences, only "\0" for null. Some(b'0') => 0u8, Some(b't') => b'\t', Some(b'n') => b'\n', // "\xHH" Some(b'x') => { let hi = s.next().expect("Invalid hex escape sequence in string."); let lo = s.next().expect("Invalid hex escape sequence in string."); if let (Ok(hi), Ok(lo)) = (from_hex_digit(*hi), from_hex_digit(*lo)) { (hi << 4) | lo } else { panic!("Invalid hex escape sequence in string."); } } _ => { panic!("Invalid hex escape sequence in string."); } } } Some(b'"') => { if s.next().is_some() { panic!("characters after the closing quote of a quoted string."); } break; } Some(b) => *b, None => panic!("Missing terminating '\"' in string literal."), }; bytes.push(b); } bytes } else { // The value is hex encoded. match from_hex(&s) { Ok(s) => s, Err(err_str) => { panic!("{} in {}", err_str, s); } } }; Some(result) } /// Returns the value of an attribute that is an integer, in decimal /// notation. 
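///
/// A minimal sketch (the attribute name `Len` is hypothetical): given a
/// test-vector line `Len = 32`,
///
/// ```ignore
/// assert_eq!(test_case.consume_usize("Len"), 32);
/// ```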
pub fn consume_usize(&mut self, key: &str) -> usize { let s = self.consume_string(key); s.parse::().unwrap() } /// Returns the value of an attribute that is an integer, in decimal /// notation, as a bit length. pub fn consume_usize_bits(&mut self, key: &str) -> bits::BitLength { let s = self.consume_string(key); let bits = s.parse::().unwrap(); bits::BitLength::from_bits(bits) } /// Returns the raw value of an attribute, without any unquoting or /// other interpretation. pub fn consume_string(&mut self, key: &str) -> String { self.consume_optional_string(key) .unwrap_or_else(|| panic!("No attribute named \"{}\"", key)) } /// Like `consume_string()` except it returns `None` if the test case /// doesn't have the attribute. pub fn consume_optional_string(&mut self, key: &str) -> Option { for (name, value, consumed) in &mut self.attributes { if key == name { if *consumed { panic!("Attribute {} was already consumed", key); } *consumed = true; return Some(value.clone()); } } None } } /// References a test input file. #[cfg(test)] macro_rules! test_vector_file { ($file_name:expr) => { $crate::testutil::File { file_name: $file_name, contents: include_str!($file_name), } }; } /// A test input file. pub struct File<'a> { /// The name (path) of the file. pub file_name: &'a str, /// The contents of the file. pub contents: &'a str, } /// Parses test cases out of the given file, calling `f` on each vector until /// `f` fails or until all the test vectors have been read. `f` can indicate /// failure either by returning `Err()` or by panicking. pub fn run(test_file: File, mut f: F) where F: FnMut(&str, &mut TestCase) -> Result<(), error::Unspecified>, { let lines = &mut test_file.contents.lines(); let mut current_section = String::from(""); let mut failed = false; while let Some(mut test_case) = parse_test_case(&mut current_section, lines) { let result = match f(¤t_section, &mut test_case) { Ok(()) => { if !test_case .attributes .iter() .any(|&(_, _, consumed)| !consumed) { Ok(()) } else { failed = true; Err("Test didn't consume all attributes.") } } Err(error::Unspecified) => Err("Test returned Err(error::Unspecified)."), }; if result.is_err() { failed = true; } #[cfg(feature = "test_logging")] if let Err(msg) = result { std::println!("{}: {}", test_file.file_name, msg); for (name, value, consumed) in test_case.attributes { let consumed_str = if consumed { "" } else { " (unconsumed)" }; std::println!("{}{} = {}", name, consumed_str, value); } }; } if failed { panic!("Test failed.") } } /// Decode an string of hex digits into a sequence of bytes. The input must /// have an even number of digits. 
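///
/// A minimal sketch of the expected behavior:
///
/// ```ignore
/// assert_eq!(from_hex("0a0b"), Ok(vec![0x0a, 0x0b]));
/// assert!(from_hex("abc").is_err()); // odd number of digits
/// ```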
pub fn from_hex(hex_str: &str) -> Result, String> { if hex_str.len() % 2 != 0 { return Err(String::from( "Hex string does not have an even number of digits", )); } let mut result = Vec::with_capacity(hex_str.len() / 2); for digits in hex_str.as_bytes().chunks(2) { let hi = from_hex_digit(digits[0])?; let lo = from_hex_digit(digits[1])?; result.push((hi * 0x10) | lo); } Ok(result) } fn from_hex_digit(d: u8) -> Result { use core::ops::RangeInclusive; const DECIMAL: (u8, RangeInclusive) = (0, b'0'..=b'9'); const HEX_LOWER: (u8, RangeInclusive) = (10, b'a'..=b'f'); const HEX_UPPER: (u8, RangeInclusive) = (10, b'A'..=b'F'); for (offset, range) in &[DECIMAL, HEX_LOWER, HEX_UPPER] { if range.contains(&d) { return Ok(d - range.start() + offset); } } Err(format!("Invalid hex digit '{}'", d as char)) } fn parse_test_case( current_section: &mut String, lines: &mut dyn Iterator, ) -> Option { let mut attributes = Vec::new(); let mut is_first_line = true; loop { let line = lines.next(); #[cfg(feature = "test_logging")] if let Some(text) = &line { std::println!("Line: {}", text); } match line { // If we get to EOF when we're not in the middle of a test case, // then we're done. None if is_first_line => { return None; } // End of the file on a non-empty test cases ends the test case. None => { return Some(TestCase { attributes }); } // A blank line ends a test case if the test case isn't empty. Some("") => { if !is_first_line { return Some(TestCase { attributes }); } // Ignore leading blank lines. } // Comments start with '#'; ignore them. Some(line) if line.starts_with('#') => (), Some(line) if line.starts_with('[') => { assert!(is_first_line); assert!(line.ends_with(']')); current_section.truncate(0); current_section.push_str(line); let _ = current_section.pop(); let _ = current_section.remove(0); } Some(line) => { is_first_line = false; let parts: Vec<&str> = line.splitn(2, " = ").collect(); if parts.len() != 2 { panic!("Syntax error: Expected Key = Value."); }; let key = parts[0].trim(); let value = parts[1].trim(); // Don't allow the value to be omitted. An empty value can be // represented as an empty quoted string. assert_ne!(value.len(), 0); // Checking is_none() ensures we don't accept duplicate keys. attributes.push((String::from(key), String::from(value), false)); } } } } /// Deterministic implementations of `ring::rand::SecureRandom`. /// /// These implementations are particularly useful for testing implementations /// of randomized algorithms & protocols using known-answer-tests where the /// test vectors contain the random seed to use. They are also especially /// useful for some types of fuzzing. #[doc(hidden)] pub mod rand { use crate::{error, rand}; /// An implementation of `SecureRandom` that always fills the output slice /// with the given byte. #[derive(Debug)] pub struct FixedByteRandom { pub byte: u8, } impl rand::sealed::SecureRandom for FixedByteRandom { fn fill_impl(&self, dest: &mut [u8]) -> Result<(), error::Unspecified> { dest.fill(self.byte); Ok(()) } } /// An implementation of `SecureRandom` that always fills the output slice /// with the slice in `bytes`. The length of the slice given to `slice` /// must match exactly. 
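///
/// A usage sketch: the struct is constructed directly, and the output length
/// requested by the code under test must match `bytes.len()` exactly
/// (otherwise the internal `copy_from_slice` panics).
///
/// ```ignore
/// let rng = FixedSliceRandom { bytes: &[0x01, 0x02, 0x03] };
/// // Pass `rng` wherever a `SecureRandom` is expected, as long as exactly
/// // three bytes of randomness will be requested.
/// ```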
#[derive(Debug)] pub struct FixedSliceRandom<'a> { pub bytes: &'a [u8], } impl rand::sealed::SecureRandom for FixedSliceRandom<'_> { fn fill_impl(&self, dest: &mut [u8]) -> Result<(), error::Unspecified> { dest.copy_from_slice(self.bytes); Ok(()) } } /// An implementation of `SecureRandom` where each slice in `bytes` is a /// test vector for one call to `fill()`. *Not thread-safe.* /// /// The first slice in `bytes` is the output for the first call to /// `fill()`, the second slice is the output for the second call to /// `fill()`, etc. The output slice passed to `fill()` must have exactly /// the length of the corresponding entry in `bytes`. `current` must be /// initialized to zero. `fill()` must be called exactly once for each /// entry in `bytes`. #[derive(Debug)] pub struct FixedSliceSequenceRandom<'a> { /// The value. pub bytes: &'a [&'a [u8]], pub current: core::cell::UnsafeCell, } impl rand::sealed::SecureRandom for FixedSliceSequenceRandom<'_> { fn fill_impl(&self, dest: &mut [u8]) -> Result<(), error::Unspecified> { let current = unsafe { *self.current.get() }; let bytes = self.bytes[current]; dest.copy_from_slice(bytes); // Remember that we returned this slice and prepare to return // the next one, if any. unsafe { *self.current.get() += 1 }; Ok(()) } } impl Drop for FixedSliceSequenceRandom<'_> { fn drop(&mut self) { // Ensure that `fill()` was called exactly the right number of // times. assert_eq!(unsafe { *self.current.get() }, self.bytes.len()); } } } #[cfg(test)] mod tests { use crate::error; use crate::testutil as test; #[test] fn one_ok() { test::run(test_vector_file!("test_1_tests.txt"), |_, test_case| { let _ = test_case.consume_string("Key"); Ok(()) }); } #[test] #[should_panic(expected = "Test failed.")] fn one_err() { test::run(test_vector_file!("test_1_tests.txt"), |_, test_case| { let _ = test_case.consume_string("Key"); Err(error::Unspecified) }); } #[test] #[should_panic(expected = "Oh noes!")] fn one_panics() { test::run(test_vector_file!("test_1_tests.txt"), |_, test_case| { let _ = test_case.consume_string("Key"); panic!("Oh noes!"); }); } #[test] #[should_panic(expected = "Test failed.")] fn first_err() { err_one(0) } #[test] #[should_panic(expected = "Test failed.")] fn middle_err() { err_one(1) } #[test] #[should_panic(expected = "Test failed.")] fn last_err() { err_one(2) } fn err_one(test_to_fail: usize) { let mut n = 0; test::run(test_vector_file!("test_3_tests.txt"), |_, test_case| { let _ = test_case.consume_string("Key"); let result = if n != test_to_fail { Ok(()) } else { Err(error::Unspecified) }; n += 1; result }); } #[test] #[should_panic(expected = "Oh Noes!")] fn first_panic() { panic_one(0) } #[test] #[should_panic(expected = "Oh Noes!")] fn middle_panic() { panic_one(1) } #[test] #[should_panic(expected = "Oh Noes!")] fn last_panic() { panic_one(2) } fn panic_one(test_to_fail: usize) { let mut n = 0; test::run(test_vector_file!("test_3_tests.txt"), |_, test_case| { let _ = test_case.consume_string("Key"); if n == test_to_fail { panic!("Oh Noes!"); }; n += 1; Ok(()) }); } #[test] #[should_panic(expected = "Syntax error: Expected Key = Value.")] fn syntax_error() { test::run( test_vector_file!("test_1_syntax_error_tests.txt"), |_, _| Ok(()), ); } } ring-0.17.14/tests/aead_tests.rs000064400000000000000000000533521046102023000145710ustar 00000000000000// Copyright 2015-2021 Brian Smith. 
// // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #![allow(missing_docs)] #[cfg(all(target_arch = "wasm32", target_os = "unknown"))] use wasm_bindgen_test::{wasm_bindgen_test as test, wasm_bindgen_test_configure}; #[cfg(all(target_arch = "wasm32", target_os = "unknown"))] wasm_bindgen_test_configure!(run_in_browser); use core::ops::RangeFrom; use ring::{aead, error}; #[allow(deprecated)] use ring::{test, test_file}; /// Generate the known answer test functions for the given algorithm and test /// case input file, where each test is implemented by a test in `$test`. /// /// All of these tests can be run in parallel. macro_rules! test_known_answer { ( $alg:ident, $test_file:expr, [ $( $test:ident ),+, ] ) => { $( #[test] fn $test() { test_aead( &aead::$alg, super::super::$test, test_file!($test_file)); } )+ } } /// Generate the tests for a given algorithm. /// /// All of these tests can be run in parallel. macro_rules! test_aead { { $( { $alg:ident, $test_file:expr } ),+, } => { mod aead_test { // Make `cargo test aead` include these files. $( #[allow(non_snake_case)] mod $alg { // Provide a separate namespace for each algorithm's test. use super::super::*; #[cfg(all(target_arch = "wasm32", target_os = "unknown"))] use wasm_bindgen_test::wasm_bindgen_test as test; test_known_answer!( $alg, $test_file, [ less_safe_key_open_in_place, less_safe_key_open_within, less_safe_key_seal_in_place_append_tag, less_safe_key_seal_in_place_separate_tag, opening_key_open_in_place, opening_key_open_within, sealing_key_seal_in_place_append_tag, sealing_key_seal_in_place_separate_tag, test_open_in_place_seperate_tag, ]); #[test] fn key_sizes() { super::super::key_sizes(&aead::$alg); } } )+ } } } test_aead! 
{ { AES_128_GCM, "aead_aes_128_gcm_tests.txt" }, { AES_256_GCM, "aead_aes_256_gcm_tests.txt" }, { CHACHA20_POLY1305, "aead_chacha20_poly1305_tests.txt" }, } struct KnownAnswerTestCase<'a> { key: &'a [u8], nonce: [u8; aead::NONCE_LEN], plaintext: &'a [u8], aad: aead::Aad<&'a [u8]>, ciphertext: &'a [u8], tag: &'a [u8], } fn test_aead( aead_alg: &'static aead::Algorithm, f: impl Fn(&'static aead::Algorithm, KnownAnswerTestCase) -> Result<(), error::Unspecified>, test_file: test::File, ) { test::run(test_file, |section, test_case| { assert_eq!(section, ""); let key = test_case.consume_bytes("KEY"); let nonce = test_case.consume_bytes("NONCE"); let plaintext = test_case.consume_bytes("IN"); let aad = test_case.consume_bytes("AD"); let ct = test_case.consume_bytes("CT"); let tag = test_case.consume_bytes("TAG"); let error = test_case.consume_optional_string("FAILS"); match error.as_deref() { Some("WRONG_NONCE_LENGTH") => { assert!(matches!( aead::Nonce::try_assume_unique_for_key(&nonce), Err(error::Unspecified) )); return Ok(()); } Some(unexpected) => { unreachable!("unexpected error in test data: {}", unexpected); } None => {} }; let test_case = KnownAnswerTestCase { key: &key, nonce: nonce.as_slice().try_into().unwrap(), plaintext: &plaintext, aad: aead::Aad::from(&aad), ciphertext: &ct, tag: &tag, }; f(aead_alg, test_case) }) } fn test_seal_append_tag( tc: &KnownAnswerTestCase, seal: Seal, ) -> Result<(), error::Unspecified> where Seal: FnOnce(aead::Nonce, &mut Vec) -> Result<(), error::Unspecified>, { let mut in_out = Vec::from(tc.plaintext); seal(aead::Nonce::assume_unique_for_key(tc.nonce), &mut in_out)?; let mut expected_ciphertext_and_tag = Vec::from(tc.ciphertext); expected_ciphertext_and_tag.extend_from_slice(tc.tag); assert_eq!(in_out, expected_ciphertext_and_tag); Ok(()) } fn test_seal_separate_tag( tc: &KnownAnswerTestCase, seal: Seal, ) -> Result<(), error::Unspecified> where Seal: Fn(aead::Nonce, &mut [u8]) -> Result, { let mut in_out = Vec::from(tc.plaintext); let actual_tag = seal(aead::Nonce::assume_unique_for_key(tc.nonce), &mut in_out)?; assert_eq!(actual_tag.as_ref(), tc.tag); assert_eq!(in_out, tc.ciphertext); Ok(()) } fn test_open_in_place( tc: &KnownAnswerTestCase<'_>, open_in_place: OpenInPlace, ) -> Result<(), error::Unspecified> where OpenInPlace: for<'a> FnOnce(aead::Nonce, &'a mut [u8]) -> Result<&'a mut [u8], error::Unspecified>, { let nonce = aead::Nonce::assume_unique_for_key(tc.nonce); let mut in_out = Vec::from(tc.ciphertext); in_out.extend_from_slice(tc.tag); let actual_plaintext = open_in_place(nonce, &mut in_out)?; assert_eq!(actual_plaintext, tc.plaintext); assert_eq!(&in_out[..tc.plaintext.len()], tc.plaintext); Ok(()) } fn test_open_in_place_seperate_tag( alg: &'static aead::Algorithm, tc: KnownAnswerTestCase, ) -> Result<(), error::Unspecified> { let key = make_less_safe_key(alg, tc.key); let mut in_out = Vec::from(tc.ciphertext); let tag = tc.tag.try_into().unwrap(); // Test the simplest behavior. { let nonce = aead::Nonce::assume_unique_for_key(tc.nonce); let actual_plaintext = key.open_in_place_separate_tag(nonce, tc.aad, tag, &mut in_out, 0..)?; assert_eq!(actual_plaintext, tc.plaintext); assert_eq!(&in_out[..tc.plaintext.len()], tc.plaintext); } // Test that ciphertext range shifting works as expected. 
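// A second copy of the ciphertext is appended at the end of the buffer, and the
// `RangeFrom` passed to `open_in_place_separate_tag` points at that appended
// copy; the decrypted plaintext must still end up at the front of `in_out`.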
{ let range = in_out.len()..; in_out.extend_from_slice(tc.ciphertext); let nonce = aead::Nonce::assume_unique_for_key(tc.nonce); let actual_plaintext = key.open_in_place_separate_tag(nonce, tc.aad, tag, &mut in_out, range)?; assert_eq!(actual_plaintext, tc.plaintext); assert_eq!(&in_out[..tc.plaintext.len()], tc.plaintext); } Ok(()) } fn test_open_within( tc: &KnownAnswerTestCase<'_>, open_within: OpenWithin, ) -> Result<(), error::Unspecified> where OpenWithin: for<'a> Fn( aead::Nonce, &'a mut [u8], RangeFrom, ) -> Result<&'a mut [u8], error::Unspecified>, { // In release builds, test all prefix lengths from 0 to 4096 bytes. // Debug builds are too slow for this, so for those builds, only // test a smaller subset. // TLS record headers are 5 bytes long. // TLS explicit nonces for AES-GCM are 8 bytes long. static MINIMAL_IN_PREFIX_LENS: [usize; 36] = [ // No input prefix to overwrite; i.e. the opening is exactly // "in place." 0, 1, 2, // Proposed TLS 1.3 header (no explicit nonce). 5, 8, // Probably the most common use of a non-zero `in_prefix_len` // would be to write a decrypted TLS record over the top of the // TLS header and nonce. 5 /* record header */ + 8, /* explicit nonce */ // The stitched AES-GCM x86-64 code works on 6-block (96 byte) // units. Some of the ChaCha20 code is even weirder. 15, // The maximum partial AES block. 16, // One AES block. 17, // One byte more than a full AES block. 31, // 2 AES blocks or 1 ChaCha20 block, minus 1. 32, // Two AES blocks, one ChaCha20 block. 33, // 2 AES blocks or 1 ChaCha20 block, plus 1. 47, // Three AES blocks - 1. 48, // Three AES blocks. 49, // Three AES blocks + 1. 63, // Four AES blocks or two ChaCha20 blocks, minus 1. 64, // Four AES blocks or two ChaCha20 blocks. 65, // Four AES blocks or two ChaCha20 blocks, plus 1. 79, // Five AES blocks, minus 1. 80, // Five AES blocks. 81, // Five AES blocks, plus 1. 95, // Six AES blocks or three ChaCha20 blocks, minus 1. 96, // Six AES blocks or three ChaCha20 blocks. 97, // Six AES blocks or three ChaCha20 blocks, plus 1. 111, // Seven AES blocks, minus 1. 112, // Seven AES blocks. 113, // Seven AES blocks, plus 1. 127, // Eight AES blocks or four ChaCha20 blocks, minus 1. 128, // Eight AES blocks or four ChaCha20 blocks. 129, // Eight AES blocks or four ChaCha20 blocks, plus 1. 143, // Nine AES blocks, minus 1. 144, // Nine AES blocks. 145, // Nine AES blocks, plus 1. 255, // 16 AES blocks or 8 ChaCha20 blocks, minus 1. 256, // 16 AES blocks or 8 ChaCha20 blocks. 257, // 16 AES blocks or 8 ChaCha20 blocks, plus 1. ]; let mut more_comprehensive_in_prefix_lengths = [0; 4096]; let in_prefix_lengths = if cfg!(debug_assertions) { &MINIMAL_IN_PREFIX_LENS[..] } else { #[allow(clippy::needless_range_loop)] for b in 0..more_comprehensive_in_prefix_lengths.len() { more_comprehensive_in_prefix_lengths[b] = b; } &more_comprehensive_in_prefix_lengths[..] 
}; let mut in_out = vec![123u8; 4096]; for &in_prefix_len in in_prefix_lengths.iter() { in_out.truncate(0); in_out.resize(in_prefix_len, 123); in_out.extend_from_slice(tc.ciphertext); in_out.extend_from_slice(tc.tag); let actual_plaintext = open_within( aead::Nonce::assume_unique_for_key(tc.nonce), &mut in_out, in_prefix_len.., )?; assert_eq!(actual_plaintext, tc.plaintext); assert_eq!(&in_out[..tc.plaintext.len()], tc.plaintext); } Ok(()) } fn sealing_key_seal_in_place_append_tag( alg: &'static aead::Algorithm, tc: KnownAnswerTestCase, ) -> Result<(), error::Unspecified> { test_seal_append_tag(&tc, |nonce, in_out| { let mut key: aead::SealingKey = make_key(alg, tc.key, nonce); key.seal_in_place_append_tag(tc.aad, in_out) }) } fn sealing_key_seal_in_place_separate_tag( alg: &'static aead::Algorithm, tc: KnownAnswerTestCase, ) -> Result<(), error::Unspecified> { test_seal_separate_tag(&tc, |nonce, in_out| { let mut key: aead::SealingKey<_> = make_key(alg, tc.key, nonce); key.seal_in_place_separate_tag(tc.aad, in_out) }) } fn opening_key_open_in_place( alg: &'static aead::Algorithm, tc: KnownAnswerTestCase, ) -> Result<(), error::Unspecified> { test_open_in_place(&tc, |nonce, in_out| { let mut key: aead::OpeningKey<_> = make_key(alg, tc.key, nonce); key.open_in_place(tc.aad, in_out) }) } fn opening_key_open_within( alg: &'static aead::Algorithm, tc: KnownAnswerTestCase, ) -> Result<(), error::Unspecified> { test_open_within(&tc, |nonce, in_out, ciphertext_and_tag| { let mut key: aead::OpeningKey = make_key(alg, tc.key, nonce); key.open_within(tc.aad, in_out, ciphertext_and_tag) }) } fn less_safe_key_seal_in_place_append_tag( alg: &'static aead::Algorithm, tc: KnownAnswerTestCase, ) -> Result<(), error::Unspecified> { test_seal_append_tag(&tc, |nonce, in_out| { let key = make_less_safe_key(alg, tc.key); key.seal_in_place_append_tag(nonce, tc.aad, in_out) }) } fn less_safe_key_open_in_place( alg: &'static aead::Algorithm, tc: KnownAnswerTestCase, ) -> Result<(), error::Unspecified> { test_open_in_place(&tc, |nonce, in_out| { let key = make_less_safe_key(alg, tc.key); key.open_in_place(nonce, tc.aad, in_out) }) } fn less_safe_key_seal_in_place_separate_tag( alg: &'static aead::Algorithm, tc: KnownAnswerTestCase, ) -> Result<(), error::Unspecified> { test_seal_separate_tag(&tc, |nonce, in_out| { let key = make_less_safe_key(alg, tc.key); key.seal_in_place_separate_tag(nonce, tc.aad, in_out) }) } fn less_safe_key_open_within( alg: &'static aead::Algorithm, tc: KnownAnswerTestCase, ) -> Result<(), error::Unspecified> { test_open_within(&tc, |nonce, in_out, ciphertext_and_tag| { let key = make_less_safe_key(alg, tc.key); key.open_within(nonce, tc.aad, in_out, ciphertext_and_tag) }) } #[allow(clippy::range_plus_one)] fn key_sizes(aead_alg: &'static aead::Algorithm) { let key_len = aead_alg.key_len(); let key_data = vec![0u8; key_len * 2]; // Key is the right size. assert!(aead::UnboundKey::new(aead_alg, &key_data[..key_len]).is_ok()); // Key is one byte too small. assert!(aead::UnboundKey::new(aead_alg, &key_data[..(key_len - 1)]).is_err()); // Key is one byte too large. assert!(aead::UnboundKey::new(aead_alg, &key_data[..(key_len + 1)]).is_err()); // Key is half the required size. assert!(aead::UnboundKey::new(aead_alg, &key_data[..(key_len / 2)]).is_err()); // Key is twice the required size. assert!(aead::UnboundKey::new(aead_alg, &key_data[..(key_len * 2)]).is_err()); // Key is empty. assert!(aead::UnboundKey::new(aead_alg, &[]).is_err()); // Key is one byte. 
assert!(aead::UnboundKey::new(aead_alg, &[0]).is_err()); } // Test that we reject non-standard nonce sizes. #[allow(clippy::range_plus_one)] #[test] fn test_aead_nonce_sizes() { let nonce_len = aead::NONCE_LEN; let nonce = vec![0u8; nonce_len * 2]; assert!(aead::Nonce::try_assume_unique_for_key(&nonce[..nonce_len]).is_ok()); assert!(aead::Nonce::try_assume_unique_for_key(&nonce[..(nonce_len - 1)]).is_err()); assert!(aead::Nonce::try_assume_unique_for_key(&nonce[..(nonce_len + 1)]).is_err()); assert!(aead::Nonce::try_assume_unique_for_key(&nonce[..(nonce_len / 2)]).is_err()); assert!(aead::Nonce::try_assume_unique_for_key(&nonce[..(nonce_len * 2)]).is_err()); assert!(aead::Nonce::try_assume_unique_for_key(&[]).is_err()); assert!(aead::Nonce::try_assume_unique_for_key(&nonce[..1]).is_err()); assert!(aead::Nonce::try_assume_unique_for_key(&nonce[..16]).is_err()); // 128 bits. } #[allow(clippy::range_plus_one)] #[test] fn aead_chacha20_poly1305_openssh() { // TODO: test_aead_key_sizes(...); test::run( test_file!("aead_chacha20_poly1305_openssh_tests.txt"), |section, test_case| { assert_eq!(section, ""); // XXX: `polyfill::convert` isn't available here. let key_bytes = { let as_vec = test_case.consume_bytes("KEY"); let mut as_array = [0u8; aead::chacha20_poly1305_openssh::KEY_LEN]; as_array.copy_from_slice(&as_vec); as_array }; let sequence_num: u32 = test_case .consume_usize("SEQUENCE_NUMBER") .try_into() .unwrap(); let plaintext = test_case.consume_bytes("IN"); let ct = test_case.consume_bytes("CT"); let expected_tag = test_case.consume_bytes("TAG"); // TODO: Add some tests for when things fail. //let error = test_case.consume_optional_string("FAILS"); let mut tag = [0u8; aead::chacha20_poly1305_openssh::TAG_LEN]; let mut s_in_out = plaintext.clone(); let s_key = aead::chacha20_poly1305_openssh::SealingKey::new(&key_bytes); s_key.seal_in_place(sequence_num, &mut s_in_out[..], &mut tag); assert_eq!(&ct, &s_in_out); assert_eq!(&expected_tag, &tag); let o_key = aead::chacha20_poly1305_openssh::OpeningKey::new(&key_bytes); { let o_result = o_key.open_in_place(sequence_num, &mut s_in_out[..], &tag); assert_eq!(o_result, Ok(&plaintext[4..])); } assert_eq!(&s_in_out[..4], &ct[..4]); assert_eq!(&s_in_out[4..], &plaintext[4..]); Ok(()) }, ); } #[test] fn aead_test_aad_traits() { test::compile_time_assert_send::>(); test::compile_time_assert_sync::>(); test::compile_time_assert_copy::>(); test::compile_time_assert_eq::>>(); // `!Copy` let aad_123 = aead::Aad::from(vec![1, 2, 3]); // `!Copy` assert_eq!(aad_123, aad_123.clone()); // Cover `Clone` and `PartialEq` assert_eq!( format!("{:?}", aead::Aad::from(&[1, 2, 3])), "Aad([1, 2, 3])" ); } #[test] fn test_nonce_traits() { test::compile_time_assert_send::(); test::compile_time_assert_sync::(); } #[test] fn test_tag_traits() { test::compile_time_assert_send::(); test::compile_time_assert_sync::(); test::compile_time_assert_copy::(); test::compile_time_assert_clone::(); let tag = aead::Tag::from([4u8; 16]); let _tag_2 = tag; // Cover `Copy` assert_eq!(tag.as_ref(), tag.clone().as_ref()); // Cover `Clone` } fn test_aead_key_traits() {} #[test] fn test_aead_key_traits_all() { test_aead_key_traits::>(); test_aead_key_traits::>(); test_aead_key_traits::(); } #[test] fn test_aead_key_debug() { let key_bytes = [0; 32]; let nonce = [0; aead::NONCE_LEN]; let key = aead::UnboundKey::new(&aead::AES_256_GCM, &key_bytes).unwrap(); assert_eq!( "UnboundKey { algorithm: AES_256_GCM }", format!("{:?}", key) ); let sealing_key: aead::SealingKey = make_key( 
&aead::AES_256_GCM, &key_bytes, aead::Nonce::try_assume_unique_for_key(&nonce).unwrap(), ); assert_eq!( "SealingKey { algorithm: AES_256_GCM }", format!("{:?}", sealing_key) ); let opening_key: aead::OpeningKey = make_key( &aead::AES_256_GCM, &key_bytes, aead::Nonce::try_assume_unique_for_key(&nonce).unwrap(), ); assert_eq!( "OpeningKey { algorithm: AES_256_GCM }", format!("{:?}", opening_key) ); let key: aead::LessSafeKey = make_less_safe_key(&aead::AES_256_GCM, &key_bytes); assert_eq!( "LessSafeKey { algorithm: AES_256_GCM }", format!("{:?}", key) ); } fn test_aead_lesssafekey_clone_for_algorithm(algorithm: &'static aead::Algorithm) { let test_bytes: Vec = (0..32).collect(); let key_bytes = &test_bytes[..algorithm.key_len()]; let nonce_bytes = &test_bytes[..algorithm.nonce_len()]; let key1: aead::LessSafeKey = aead::LessSafeKey::new(aead::UnboundKey::new(algorithm, key_bytes).unwrap()); let key2 = key1.clone(); // LessSafeKey doesn't support AsRef or PartialEq, so instead just check that both keys produce // the same encrypted output. let mut buf1: Vec = (0..100).collect(); let mut buf2 = buf1.clone(); let tag1 = key1 .seal_in_place_separate_tag( aead::Nonce::try_assume_unique_for_key(nonce_bytes).unwrap(), aead::Aad::empty(), &mut buf1, ) .unwrap(); let tag2 = key2 .seal_in_place_separate_tag( aead::Nonce::try_assume_unique_for_key(nonce_bytes).unwrap(), aead::Aad::empty(), &mut buf2, ) .unwrap(); assert_eq!(tag1.as_ref(), tag2.as_ref()); assert_eq!(buf1, buf2); } #[test] fn test_aead_lesssafekey_clone_aes_128_gcm() { test_aead_lesssafekey_clone_for_algorithm(&aead::AES_128_GCM); } #[test] fn test_aead_lesssafekey_clone_aes_256_gcm() { test_aead_lesssafekey_clone_for_algorithm(&aead::AES_256_GCM); } #[test] fn test_aead_lesssafekey_clone_chacha20_poly1305() { test_aead_lesssafekey_clone_for_algorithm(&aead::CHACHA20_POLY1305); } fn make_key>( algorithm: &'static aead::Algorithm, key: &[u8], nonce: aead::Nonce, ) -> K { let key = aead::UnboundKey::new(algorithm, key).unwrap(); let nonce_sequence = OneNonceSequence::new(nonce); K::new(key, nonce_sequence) } fn make_less_safe_key(algorithm: &'static aead::Algorithm, key: &[u8]) -> aead::LessSafeKey { let key = aead::UnboundKey::new(algorithm, key).unwrap(); aead::LessSafeKey::new(key) } struct OneNonceSequence(Option); impl OneNonceSequence { /// Constructs the sequence allowing `advance()` to be called /// `allowed_invocations` times. fn new(nonce: aead::Nonce) -> Self { Self(Some(nonce)) } } impl aead::NonceSequence for OneNonceSequence { fn advance(&mut self) -> Result { self.0.take().ok_or(error::Unspecified) } } ring-0.17.14/tests/agreement_tests.rs000064400000000000000000000165411046102023000156450ustar 00000000000000// Copyright 2015-2017 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
#![allow(missing_docs)] #[cfg(all(target_arch = "wasm32", target_os = "unknown"))] use wasm_bindgen_test::{wasm_bindgen_test as test, wasm_bindgen_test_configure}; #[cfg(all(target_arch = "wasm32", target_os = "unknown"))] wasm_bindgen_test_configure!(run_in_browser); extern crate alloc; use ring::{agreement, error, rand}; #[allow(deprecated)] use ring::{test, test_file}; #[test] fn agreement_traits() { use alloc::vec::Vec; let rng = rand::SystemRandom::new(); let private_key = agreement::EphemeralPrivateKey::generate(&agreement::ECDH_P256, &rng).unwrap(); test::compile_time_assert_send::(); test::compile_time_assert_sync::(); assert_eq!( format!("{:?}", &private_key), "EphemeralPrivateKey { algorithm: Algorithm { curve: P256 } }" ); let public_key = private_key.compute_public_key().unwrap(); test::compile_time_assert_clone::(); test::compile_time_assert_send::(); test::compile_time_assert_sync::(); // Verify `PublicKey` implements `Debug`. // // TODO: Test the actual output. let _: &dyn core::fmt::Debug = &public_key; test::compile_time_assert_clone::>(); test::compile_time_assert_copy::>(); test::compile_time_assert_sync::>(); test::compile_time_assert_clone::>>(); test::compile_time_assert_sync::>>(); let unparsed_public_key = agreement::UnparsedPublicKey::new(&agreement::X25519, &[0x01, 0x02, 0x03]); assert_eq!( format!("{:?}", unparsed_public_key), r#"UnparsedPublicKey { algorithm: Algorithm { curve: Curve25519 }, bytes: "010203" }"# ); // Test `AsRef<[u8]>` assert_eq!(unparsed_public_key.as_ref(), &[0x01, 0x02, 0x03]); } #[test] fn agreement_agree_ephemeral() { let rng = rand::SystemRandom::new(); test::run(test_file!("agreement_tests.txt"), |section, test_case| { assert_eq!(section, ""); let curve_name = test_case.consume_string("Curve"); let alg = alg_from_curve_name(&curve_name); let peer_public = agreement::UnparsedPublicKey::new(alg, test_case.consume_bytes("PeerQ")); match test_case.consume_optional_string("Error") { None => { let my_private = test_case.consume_bytes("D"); let my_private = { #[allow(deprecated)] let rng = test::rand::FixedSliceRandom { bytes: &my_private }; agreement::EphemeralPrivateKey::generate(alg, &rng)? }; let my_public = test_case.consume_bytes("MyQ"); let output = test_case.consume_bytes("Output"); assert_eq!(my_private.algorithm(), alg); let computed_public = my_private.compute_public_key().unwrap(); assert_eq!(computed_public.as_ref(), &my_public[..]); assert_eq!(my_private.algorithm(), alg); let result = agreement::agree_ephemeral(my_private, &peer_public, |key_material| { assert_eq!(key_material, &output[..]); }); assert_eq!(result, Ok(())); } Some(_) => { // In the no-heap mode, some algorithms aren't supported so // we have to skip those algorithms' test cases. let dummy_private_key = agreement::EphemeralPrivateKey::generate(alg, &rng)?; fn kdf_not_called(_: &[u8]) -> Result<(), ()> { panic!( "The KDF was called during ECDH when the peer's \ public key is invalid." 
); } assert!(agreement::agree_ephemeral( dummy_private_key, &peer_public, kdf_not_called ) .is_err()); } } Ok(()) }); } #[test] fn test_agreement_ecdh_x25519_rfc_iterated() { let mut k = h("0900000000000000000000000000000000000000000000000000000000000000"); let mut u = k.clone(); fn expect_iterated_x25519( expected_result: &str, range: core::ops::Range, k: &mut Vec, u: &mut Vec, ) { for _ in range { let new_k = x25519(k, u); u.clone_from(k); *k = new_k; } assert_eq!(&h(expected_result), k); } expect_iterated_x25519( "422c8e7a6227d7bca1350b3e2bb7279f7897b87bb6854b783c60e80311ae3079", 0..1, &mut k, &mut u, ); expect_iterated_x25519( "684cf59ba83309552800ef566f2f4d3c1c3887c49360e3875f2eb94d99532c51", 1..1_000, &mut k, &mut u, ); // The spec gives a test vector for 1,000,000 iterations but it takes // too long to do 1,000,000 iterations by default right now. This // 10,000 iteration vector is self-computed. expect_iterated_x25519( "2c125a20f639d504a7703d2e223c79a79de48c4ee8c23379aa19a62ecd211815", 1_000..10_000, &mut k, &mut u, ); if cfg!(feature = "slow_tests") { expect_iterated_x25519( "7c3911e0ab2586fd864497297e575e6f3bc601c0883c30df5f4dd2d24f665424", 10_000..1_000_000, &mut k, &mut u, ); } } fn x25519(private_key: &[u8], public_key: &[u8]) -> Vec { x25519_(private_key, public_key).unwrap() } fn x25519_(private_key: &[u8], public_key: &[u8]) -> Result, error::Unspecified> { #[allow(deprecated)] let rng = test::rand::FixedSliceRandom { bytes: private_key }; let private_key = agreement::EphemeralPrivateKey::generate(&agreement::X25519, &rng)?; let public_key = agreement::UnparsedPublicKey::new(&agreement::X25519, public_key); agreement::agree_ephemeral(private_key, &public_key, |agreed_value| { Vec::from(agreed_value) }) } fn h(s: &str) -> Vec { match test::from_hex(s) { Ok(v) => v, Err(msg) => { panic!("{} in {}", msg, s); } } } fn alg_from_curve_name(curve_name: &str) -> &'static agreement::Algorithm { if curve_name == "P-256" { &agreement::ECDH_P256 } else if curve_name == "P-384" { &agreement::ECDH_P384 } else if curve_name == "X25519" { &agreement::X25519 } else { panic!("Unsupported curve: {}", curve_name); } } ring-0.17.14/tests/constant_time_tests.rs000064400000000000000000000056141046102023000165440ustar 00000000000000// Copyright 2020 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #![allow(missing_docs)] #[allow(deprecated)] use constant_time::verify_slices_are_equal; #[allow(deprecated)] use ring::constant_time; use ring::{error, rand}; #[cfg(all(target_arch = "wasm32", target_os = "unknown"))] use wasm_bindgen_test::{wasm_bindgen_test as test, wasm_bindgen_test_configure}; #[cfg(all(target_arch = "wasm32", target_os = "unknown"))] wasm_bindgen_test_configure!(run_in_browser); // This logic is loosely based on BoringSSL's `TEST(ConstantTimeTest, MemCmp)`. 
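// In short: slices of differing lengths never compare equal, identical slices
// always do, and flipping any single bit inside the compared range must yield
// `Err(error::Unspecified)` regardless of which bit or byte was changed.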
#[allow(deprecated)] #[test] fn test_verify_slices_are_equal() { let initial: [u8; 256] = rand::generate(&rand::SystemRandom::new()).unwrap().expose(); { let copy = initial; for len in 0..copy.len() { // Not equal because the lengths do not match. assert_eq!( verify_slices_are_equal(&initial, ©[..len]), Err(error::Unspecified) ); // Equal lengths and equal contents. assert_eq!( verify_slices_are_equal(&initial[..len], ©[..len]), Ok(()) ); } // Equal lengths and equal contents. assert_eq!(verify_slices_are_equal(&initial, ©), Ok(())); } for i in 0..initial.len() { for bit in 0..8 { let mut copy = initial; copy[i] ^= 1u8 << bit; for len in 0..=initial.len() { // We flipped at least one bit in `copy`. assert_ne!(&initial[..], ©[..]); let a = &initial[..len]; let b = ©[..len]; let expected_result = if i < len { // The flipped bit is within `b` so `a` and `b` are not equal. Err(error::Unspecified) } else { // The flipped bit is outside of `b` so `a` and `b` are equal. Ok(()) }; assert_eq!(a == b, expected_result.is_ok()); // Sanity check. assert_eq!(verify_slices_are_equal(a, b), expected_result); assert_eq!(verify_slices_are_equal(b, a), expected_result); } } } } ring-0.17.14/tests/digest_tests.rs000064400000000000000000000120071046102023000151460ustar 00000000000000// Copyright 2015-2017 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #![allow(missing_docs)] use ring::digest; #[allow(deprecated)] use ring::{test, test_file}; #[cfg(all(target_arch = "wasm32", target_os = "unknown"))] use wasm_bindgen_test::{wasm_bindgen_test as test, wasm_bindgen_test_configure}; #[cfg(all(target_arch = "wasm32", target_os = "unknown"))] wasm_bindgen_test_configure!(run_in_browser); /// Test vectors from BoringSSL, Go, and other sources. #[test] fn digest_misc() { test::run(test_file!("digest_tests.txt"), |section, test_case| { assert_eq!(section, ""); let digest_alg = test_case.consume_digest_alg("Hash").unwrap(); let input = test_case.consume_bytes("Input"); let repeat = test_case.consume_usize("Repeat"); let expected = test_case.consume_bytes("Output"); let mut ctx = digest::Context::new(digest_alg); let mut data = Vec::new(); for _ in 0..repeat { ctx.update(&input); data.extend(&input); } let actual_from_chunks = ctx.finish(); assert_eq!(&expected, &actual_from_chunks.as_ref()); let actual_from_one_shot = digest::digest(digest_alg, &data); assert_eq!(&expected, &actual_from_one_shot.as_ref()); Ok(()) }); } /// Test some ways in which `Context::update` and/or `Context::finish` /// could go wrong by testing every combination of updating three inputs /// that vary from zero bytes to one byte larger than the block length. /// /// These are not run in dev (debug) builds because they are too slow. macro_rules! 
test_i_u_f { ( $test_name:ident, $alg:expr) => { #[cfg(not(debug_assertions))] #[test] fn $test_name() { let mut input = [0; (digest::MAX_BLOCK_LEN + 1) * 3]; let max = $alg.block_len() + 1; for i in 0..(max * 3) { input[i] = (i & 0xff) as u8; } for i in 0..max { for j in 0..max { for k in 0..max { let part1 = &input[..i]; let part2 = &input[i..(i + j)]; let part3 = &input[(i + j)..(i + j + k)]; let mut ctx = digest::Context::new(&$alg); ctx.update(part1); ctx.update(part2); ctx.update(part3); let i_u_f = ctx.finish(); let one_shot = digest::digest(&$alg, &input[..(i + j + k)]); assert_eq!(i_u_f.as_ref(), one_shot.as_ref()); } } } } }; } test_i_u_f!(digest_test_i_u_f_sha1, digest::SHA1_FOR_LEGACY_USE_ONLY); test_i_u_f!(digest_test_i_u_f_sha256, digest::SHA256); test_i_u_f!(digest_test_i_u_f_sha384, digest::SHA384); test_i_u_f!(digest_test_i_u_f_sha512, digest::SHA512); #[test] fn test_fmt_algorithm() { assert_eq!("SHA1", &format!("{:?}", digest::SHA1_FOR_LEGACY_USE_ONLY)); assert_eq!("SHA256", &format!("{:?}", digest::SHA256)); assert_eq!("SHA384", &format!("{:?}", digest::SHA384)); assert_eq!("SHA512", &format!("{:?}", digest::SHA512)); assert_eq!("SHA512_256", &format!("{:?}", digest::SHA512_256)); } #[test] fn digest_test_fmt() { assert_eq!( "SHA1:b7e23ec29af22b0b4e41da31e868d57226121c84", &format!( "{:?}", digest::digest(&digest::SHA1_FOR_LEGACY_USE_ONLY, b"hello, world") ) ); assert_eq!( "SHA256:09ca7e4eaa6e8ae9c7d261167129184883644d\ 07dfba7cbfbc4c8a2e08360d5b", &format!("{:?}", digest::digest(&digest::SHA256, b"hello, world")) ); assert_eq!( "SHA384:1fcdb6059ce05172a26bbe2a3ccc88ed5a8cd5\ fc53edfd9053304d429296a6da23b1cd9e5c9ed3bb34f0\ 0418a70cdb7e", &format!("{:?}", digest::digest(&digest::SHA384, b"hello, world")) ); assert_eq!( "SHA512:8710339dcb6814d0d9d2290ef422285c9322b7\ 163951f9a0ca8f883d3305286f44139aa374848e4174f5\ aada663027e4548637b6d19894aec4fb6c46a139fbf9", &format!("{:?}", digest::digest(&digest::SHA512, b"hello, world")) ); assert_eq!( "SHA512_256:11f2c88c04f0a9c3d0970894ad2472505e\ 0bc6e8c7ec46b5211cd1fa3e253e62", &format!("{:?}", digest::digest(&digest::SHA512_256, b"hello, world")) ); } ring-0.17.14/tests/ecdsa_test_private_key_p256.p8000064400000000000000000000002121046102023000176370ustar 0000000000000000*H=*H=m0k W)WHKb WPwC"bpT⃡DBf#eP墳M`*eғP3M?5Y$^[Zzܧk3Y5 .ring-0.17.14/tests/ecdsa_test_public_key_p256.der000064400000000000000000000001011046102023000176630ustar 00000000000000f#eP墳M`*eғP3M?5Y$^[Zzܧk3Y5 .ring-0.17.14/tests/ecdsa_test_public_key_p256_debug.txt000064400000000000000000000002171046102023000211060ustar 00000000000000PublicKey("04fc116698a3e3236550c4c9efa9bd4d0619602a65d2930e9150ab33e84dbc83f8a6a6b9933f35ab59245e5b5a7af5dca76b33cbe7aeee5981b3ca350bebf52ecd")ring-0.17.14/tests/ecdsa_tests.rs000064400000000000000000000274501046102023000147560ustar 00000000000000// Copyright 2015-2016 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #![allow(missing_docs)] use ring::{ rand, signature::{self, KeyPair}, }; #[allow(deprecated)] use ring::{test, test_file}; // ECDSA *signing* tests are in src/ec/ecdsa/signing.rs. #[test] fn ecdsa_from_pkcs8_test() { let rng = rand::SystemRandom::new(); test::run( test_file!("ecdsa_from_pkcs8_tests.txt"), |section, test_case| { assert_eq!(section, ""); let curve_name = test_case.consume_string("Curve"); let ((this_fixed, this_asn1), (other_fixed, other_asn1)) = match curve_name.as_str() { "P-256" => ( ( &signature::ECDSA_P256_SHA256_FIXED_SIGNING, &signature::ECDSA_P256_SHA256_ASN1_SIGNING, ), ( &signature::ECDSA_P384_SHA384_FIXED_SIGNING, &signature::ECDSA_P384_SHA384_ASN1_SIGNING, ), ), "P-384" => ( ( &signature::ECDSA_P384_SHA384_FIXED_SIGNING, &signature::ECDSA_P384_SHA384_ASN1_SIGNING, ), ( &signature::ECDSA_P256_SHA256_FIXED_SIGNING, &signature::ECDSA_P256_SHA256_ASN1_SIGNING, ), ), _ => unreachable!(), }; let input = test_case.consume_bytes("Input"); let error = test_case.consume_optional_string("Error"); match ( signature::EcdsaKeyPair::from_pkcs8(this_fixed, &input, &rng), error.clone(), ) { (Ok(_), None) => (), (Err(e), None) => panic!("Failed with error \"{}\", but expected to succeed", e), (Ok(_), Some(e)) => panic!("Succeeded, but expected error \"{}\"", e), (Err(actual), Some(expected)) => assert_eq!(format!("{}", actual), expected), }; match ( signature::EcdsaKeyPair::from_pkcs8(this_asn1, &input, &rng), error, ) { (Ok(_), None) => (), (Err(e), None) => panic!("Failed with error \"{}\", but expected to succeed", e), (Ok(_), Some(e)) => panic!("Succeeded, but expected error \"{}\"", e), (Err(actual), Some(expected)) => assert_eq!(format!("{}", actual), expected), }; assert!(signature::EcdsaKeyPair::from_pkcs8(other_fixed, &input, &rng).is_err()); assert!(signature::EcdsaKeyPair::from_pkcs8(other_asn1, &input, &rng).is_err()); Ok(()) }, ); } // Verify that, at least, we generate PKCS#8 documents that we can read. 
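// (The round trip exercised below: `EcdsaKeyPair::generate_pkcs8(alg, &rng)`
// yields a PKCS#8 document, and `EcdsaKeyPair::from_pkcs8(alg, pkcs8.as_ref(), &rng)`
// must accept exactly that encoding.)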
#[test] fn ecdsa_generate_pkcs8_test() { let rng = rand::SystemRandom::new(); for alg in &[ &signature::ECDSA_P256_SHA256_ASN1_SIGNING, &signature::ECDSA_P256_SHA256_FIXED_SIGNING, &signature::ECDSA_P384_SHA384_ASN1_SIGNING, &signature::ECDSA_P384_SHA384_FIXED_SIGNING, ] { let pkcs8 = signature::EcdsaKeyPair::generate_pkcs8(alg, &rng).unwrap(); println!(); for b in pkcs8.as_ref() { print!("{:02x}", *b); } println!(); println!(); #[cfg(feature = "alloc")] let _ = signature::EcdsaKeyPair::from_pkcs8(alg, pkcs8.as_ref(), &rng).unwrap(); } } #[test] fn signature_ecdsa_verify_asn1_test() { test::run( test_file!("ecdsa_verify_asn1_tests.txt"), |section, test_case| { assert_eq!(section, ""); let curve_name = test_case.consume_string("Curve"); let digest_name = test_case.consume_string("Digest"); let msg = test_case.consume_bytes("Msg"); let public_key = test_case.consume_bytes("Q"); let sig = test_case.consume_bytes("Sig"); let is_valid = test_case.consume_string("Result") == "P (0 )"; let alg = match (curve_name.as_str(), digest_name.as_str()) { ("P-256", "SHA256") => &signature::ECDSA_P256_SHA256_ASN1, ("P-256", "SHA384") => &signature::ECDSA_P256_SHA384_ASN1, ("P-384", "SHA256") => &signature::ECDSA_P384_SHA256_ASN1, ("P-384", "SHA384") => &signature::ECDSA_P384_SHA384_ASN1, _ => { panic!("Unsupported curve+digest: {}+{}", curve_name, digest_name); } }; let actual_result = signature::UnparsedPublicKey::new(alg, &public_key).verify(&msg, &sig); assert_eq!(actual_result.is_ok(), is_valid); Ok(()) }, ); } #[test] fn signature_ecdsa_verify_fixed_test() { test::run( test_file!("ecdsa_verify_fixed_tests.txt"), |section, test_case| { assert_eq!(section, ""); let curve_name = test_case.consume_string("Curve"); let digest_name = test_case.consume_string("Digest"); let msg = test_case.consume_bytes("Msg"); let public_key = test_case.consume_bytes("Q"); let sig = test_case.consume_bytes("Sig"); let expected_result = test_case.consume_string("Result"); let alg = match (curve_name.as_str(), digest_name.as_str()) { ("P-256", "SHA256") => &signature::ECDSA_P256_SHA256_FIXED, ("P-384", "SHA384") => &signature::ECDSA_P384_SHA384_FIXED, _ => { panic!("Unsupported curve+digest: {}+{}", curve_name, digest_name); } }; let is_valid = expected_result == "P (0 )"; let actual_result = signature::UnparsedPublicKey::new(alg, &public_key).verify(&msg, &sig); assert_eq!(actual_result.is_ok(), is_valid); Ok(()) }, ); } #[test] fn ecdsa_test_public_key_coverage() { const PRIVATE_KEY: &[u8] = include_bytes!("ecdsa_test_private_key_p256.p8"); const PUBLIC_KEY: &[u8] = include_bytes!("ecdsa_test_public_key_p256.der"); const PUBLIC_KEY_DEBUG: &str = include_str!("ecdsa_test_public_key_p256_debug.txt"); let rng = rand::SystemRandom::new(); let key_pair = signature::EcdsaKeyPair::from_pkcs8( &signature::ECDSA_P256_SHA256_FIXED_SIGNING, PRIVATE_KEY, &rng, ) .unwrap(); // Test `AsRef<[u8]>` assert_eq!(key_pair.public_key().as_ref(), PUBLIC_KEY); // Test `Clone`. #[allow(clippy::clone_on_copy, clippy::redundant_clone)] let _: ::PublicKey = key_pair.public_key().clone(); // Test `Copy`. let _: ::PublicKey = *key_pair.public_key(); // Test `Debug`. assert_eq!(PUBLIC_KEY_DEBUG, format!("{:?}", key_pair.public_key())); assert_eq!( format!("EcdsaKeyPair {{ public_key: {:?} }}", key_pair.public_key()), format!("{:?}", key_pair) ); } // This test is not a known-answer test, though it re-uses the known-answer // test vectors. Because the nonce is randomized, the signature will be // different each time. 
Because of that, here we simply verify that the // signature verifies correctly. The known-answer tests themselves are in // ecsda/signing.rs. #[test] fn signature_ecdsa_sign_fixed_sign_and_verify_test() { let rng = rand::SystemRandom::new(); test::run( test_file!("../src/ec/suite_b/ecdsa/ecdsa_sign_fixed_tests.txt"), |section, test_case| { assert_eq!(section, ""); let curve_name = test_case.consume_string("Curve"); let digest_name = test_case.consume_string("Digest"); let msg = test_case.consume_bytes("Msg"); let d = test_case.consume_bytes("d"); let q = test_case.consume_bytes("Q"); // Ignored since the actual signature will use a randomized nonce. let _k = test_case.consume_bytes("k"); let _expected_result = test_case.consume_bytes("Sig"); let (signing_alg, verification_alg) = match (curve_name.as_str(), digest_name.as_str()) { ("P-256", "SHA256") => ( &signature::ECDSA_P256_SHA256_FIXED_SIGNING, &signature::ECDSA_P256_SHA256_FIXED, ), ("P-384", "SHA384") => ( &signature::ECDSA_P384_SHA384_FIXED_SIGNING, &signature::ECDSA_P384_SHA384_FIXED, ), _ => { panic!("Unsupported curve+digest: {}+{}", curve_name, digest_name); } }; let private_key = signature::EcdsaKeyPair::from_private_key_and_public_key(signing_alg, &d, &q, &rng) .unwrap(); let signature = private_key.sign(&rng, &msg).unwrap(); let public_key = signature::UnparsedPublicKey::new(verification_alg, q); assert_eq!(public_key.verify(&msg, signature.as_ref()), Ok(())); Ok(()) }, ); } // This test is not a known-answer test, though it re-uses the known-answer // test vectors. Because the nonce is randomized, the signature will be // different each time. Because of that, here we simply verify that the // signature verifies correctly. The known-answer tests themselves are in // ecsda/signing.rs. #[test] fn signature_ecdsa_sign_asn1_test() { let rng = rand::SystemRandom::new(); test::run( test_file!("../src/ec/suite_b/ecdsa/ecdsa_sign_asn1_tests.txt"), |section, test_case| { assert_eq!(section, ""); let curve_name = test_case.consume_string("Curve"); let digest_name = test_case.consume_string("Digest"); let msg = test_case.consume_bytes("Msg"); let d = test_case.consume_bytes("d"); let q = test_case.consume_bytes("Q"); // Ignored since the actual signature will use a randomized nonce. let _k = test_case.consume_bytes("k"); let _expected_result = test_case.consume_bytes("Sig"); let (signing_alg, verification_alg) = match (curve_name.as_str(), digest_name.as_str()) { ("P-256", "SHA256") => ( &signature::ECDSA_P256_SHA256_ASN1_SIGNING, &signature::ECDSA_P256_SHA256_ASN1, ), ("P-384", "SHA384") => ( &signature::ECDSA_P384_SHA384_ASN1_SIGNING, &signature::ECDSA_P384_SHA384_ASN1, ), _ => { panic!("Unsupported curve+digest: {}+{}", curve_name, digest_name); } }; let private_key = signature::EcdsaKeyPair::from_private_key_and_public_key(signing_alg, &d, &q, &rng) .unwrap(); let signature = private_key.sign(&rng, &msg).unwrap(); let public_key = signature::UnparsedPublicKey::new(verification_alg, q); assert_eq!(public_key.verify(&msg, signature.as_ref()), Ok(())); Ok(()) }, ); } ring-0.17.14/tests/ed25519_test_private_key.bin000064400000000000000000000000401046102023000172220ustar 00000000000000aZ`J,DIi{2ip;`ring-0.17.14/tests/ed25519_test_private_key.p8000064400000000000000000000001231046102023000170030ustar 000000000000000Q0+ep" ]>! 
XMEw5|cI!X X~WX5ök}ring-0.17.14/tests/ed25519_test_public_key.bin000064400000000000000000000000401046102023000170260ustar 00000000000000Z Kd:rڦ#%hQring-0.17.14/tests/ed25519_test_public_key.der000064400000000000000000000000401046102023000170300ustar 00000000000000X X~WX5ök}ring-0.17.14/tests/ed25519_tests.rs000064400000000000000000000177471046102023000147050ustar 00000000000000// Copyright 2015-2017 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #![allow(missing_docs)] use ring::{ error, rand, signature::{self, Ed25519KeyPair, KeyPair}, }; #[allow(deprecated)] use ring::{test, test_file}; #[cfg(all(target_arch = "wasm32", target_os = "unknown"))] use wasm_bindgen_test::{wasm_bindgen_test as test, wasm_bindgen_test_configure}; #[cfg(all(target_arch = "wasm32", target_os = "unknown"))] wasm_bindgen_test_configure!(run_in_browser); /// Test vectors from BoringSSL. #[test] fn test_signature_ed25519() { test::run(test_file!("ed25519_tests.txt"), |section, test_case| { assert_eq!(section, ""); let seed = test_case.consume_bytes("SEED"); assert_eq!(32, seed.len()); let public_key = test_case.consume_bytes("PUB"); assert_eq!(32, public_key.len()); let msg = test_case.consume_bytes("MESSAGE"); let expected_sig = test_case.consume_bytes("SIG"); { let key_pair = Ed25519KeyPair::from_seed_and_public_key(&seed, &public_key).unwrap(); let actual_sig = key_pair.sign(&msg); assert_eq!(&expected_sig[..], actual_sig.as_ref()); } // Test PKCS#8 generation, parsing, and private-to-public calculations. #[allow(deprecated)] let rng = test::rand::FixedSliceRandom { bytes: &seed }; let pkcs8 = Ed25519KeyPair::generate_pkcs8(&rng).unwrap(); let key_pair = Ed25519KeyPair::from_pkcs8(pkcs8.as_ref()).unwrap(); assert_eq!(public_key, key_pair.public_key().as_ref()); // Test Signature generation. let actual_sig = key_pair.sign(&msg); assert_eq!(&expected_sig[..], actual_sig.as_ref()); // Test Signature verification. test_signature_verification(&public_key, &msg, &expected_sig, Ok(())); let mut tampered_sig = expected_sig; tampered_sig[0] ^= 1; test_signature_verification(&public_key, &msg, &tampered_sig, Err(error::Unspecified)); Ok(()) }); } /// Test vectors from BoringSSL. 
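/// Each vector's `Result` attribute is "P" for a signature that must verify
/// and "F" for one that must be rejected.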
#[test] fn test_signature_ed25519_verify() { test::run( test_file!("ed25519_verify_tests.txt"), |section, test_case| { assert_eq!(section, ""); let public_key = test_case.consume_bytes("PUB"); let msg = test_case.consume_bytes("MESSAGE"); let sig = test_case.consume_bytes("SIG"); let expected_result = match test_case.consume_string("Result").as_str() { "P" => Ok(()), "F" => Err(error::Unspecified), s => panic!("{:?} is not a valid result", s), }; test_signature_verification(&public_key, &msg, &sig, expected_result); Ok(()) }, ); } fn test_signature_verification( public_key: &[u8], msg: &[u8], sig: &[u8], expected_result: Result<(), error::Unspecified>, ) { assert_eq!( expected_result, signature::UnparsedPublicKey::new(&signature::ED25519, public_key).verify(msg, sig) ); } #[test] fn test_ed25519_from_seed_and_public_key_misuse() { const PRIVATE_KEY: &[u8] = include_bytes!("ed25519_test_private_key.bin"); const PUBLIC_KEY: &[u8] = include_bytes!("ed25519_test_public_key.bin"); assert!(Ed25519KeyPair::from_seed_and_public_key(PRIVATE_KEY, PUBLIC_KEY).is_ok()); // Truncated private key. assert!(Ed25519KeyPair::from_seed_and_public_key(&PRIVATE_KEY[..31], PUBLIC_KEY).is_err()); // Truncated public key. assert!(Ed25519KeyPair::from_seed_and_public_key(PRIVATE_KEY, &PUBLIC_KEY[..31]).is_err()); // Swapped public and private key. assert!(Ed25519KeyPair::from_seed_and_public_key(PUBLIC_KEY, PRIVATE_KEY).is_err()); } enum FromPkcs8Variant { Checked, MaybeUnchecked, } #[test] fn test_ed25519_from_pkcs8_unchecked() { test_ed25519_from_pkcs8_( FromPkcs8Variant::MaybeUnchecked, Ed25519KeyPair::from_pkcs8_maybe_unchecked, ) } #[test] fn test_ed25519_from_pkcs8() { test_ed25519_from_pkcs8_(FromPkcs8Variant::Checked, Ed25519KeyPair::from_pkcs8) } fn test_ed25519_from_pkcs8_( variant: FromPkcs8Variant, f: impl Fn(&[u8]) -> Result, ) { // Just test that we can parse the input. test::run( test_file!("ed25519_from_pkcs8_tests.txt"), |section, test_case| { assert_eq!(section, ""); let input = test_case.consume_bytes("Input"); let expected_error = { let expected_checked = test_case.consume_string("Result-Checked"); let expected_maybe_unchecked = test_case.consume_string("Result-Maybe-Unchecked"); let expected_result = match variant { FromPkcs8Variant::Checked => expected_checked, FromPkcs8Variant::MaybeUnchecked => expected_maybe_unchecked, }; if expected_result == "OK" { None } else { Some(expected_result) } }; let expected_public = { let expected_if_no_error = test_case.consume_optional_bytes("Public"); if expected_error.is_none() { Some(expected_if_no_error.unwrap()) } else { None } }; match f(&input) { Ok(keypair) => { assert_eq!(expected_error, None); assert_eq!( expected_public.as_deref(), Some(keypair.public_key().as_ref()) ); } Err(actual_error) => { assert_eq!(expected_error, Some(format!("{}", actual_error))); assert_eq!(expected_public, None); } } Ok(()) }, ); } #[test] fn ed25519_test_generate_pkcs8() { let rng = rand::SystemRandom::new(); let generated = Ed25519KeyPair::generate_pkcs8(&rng).unwrap(); let generated = generated.as_ref(); let _ronudtripped = Ed25519KeyPair::from_pkcs8(generated).unwrap(); // Regression test: Verify we're generating the correct encoding, as // `Ed25519KeyPair::from_pkcs8` also accepts our old wrong encoding. 
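// The expected length is 19 bytes of PKCS#8/DER framing plus the 32-byte seed
// and the 32-byte public key. The first two bytes checked below are the outer
// DER SEQUENCE header: tag 0x30 followed by length 0x51 (81), i.e. the
// remaining 83 - 2 bytes.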
assert_eq!(generated.len(), 19 + 32 + 32); assert_eq!(&generated[..2], &[0x30, 0x51]); } #[test] fn ed25519_test_public_key_coverage() { const PRIVATE_KEY: &[u8] = include_bytes!("ed25519_test_private_key.p8"); const PUBLIC_KEY: &[u8] = include_bytes!("ed25519_test_public_key.der"); const PUBLIC_KEY_DEBUG: &str = "PublicKey(\"5809e9fef6dcec58f0f2e3b0d67e9880a11957e083ace85835c3b6c8fbaf6b7d\")"; let key_pair = Ed25519KeyPair::from_pkcs8(PRIVATE_KEY).unwrap(); // Test `AsRef<[u8]>` assert_eq!(key_pair.public_key().as_ref(), PUBLIC_KEY); // Test `Clone`. #[allow(clippy::clone_on_copy)] let _: ::PublicKey = key_pair.public_key().clone(); // Test `Copy`. let _: ::PublicKey = *key_pair.public_key(); // Test `Debug`. assert_eq!(PUBLIC_KEY_DEBUG, format!("{:?}", key_pair.public_key())); assert_eq!( format!( "Ed25519KeyPair {{ public_key: {:?} }}", key_pair.public_key() ), format!("{:?}", key_pair) ); } ring-0.17.14/tests/error_tests.rs000064400000000000000000000004631046102023000150230ustar 00000000000000#![allow(missing_docs)] #[cfg(feature = "std")] #[test] fn error_impl_std_error_error_test() { use ring::error; #[allow(deprecated)] use ring::test; test::compile_time_assert_std_error_error::(); test::compile_time_assert_std_error_error::(); } ring-0.17.14/tests/hkdf_tests.rs000064400000000000000000000104161046102023000146050ustar 00000000000000// Copyright 2015 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #![allow(missing_docs)] use ring::{digest, error, hkdf}; #[allow(deprecated)] use ring::{test, test_file}; #[cfg(all(target_arch = "wasm32", target_os = "unknown"))] use wasm_bindgen_test::{wasm_bindgen_test as test, wasm_bindgen_test_configure}; #[cfg(all(target_arch = "wasm32", target_os = "unknown"))] wasm_bindgen_test_configure!(run_in_browser); #[test] fn hkdf_tests() { test::run(test_file!("hkdf_tests.txt"), |section, test_case| { assert_eq!(section, ""); let alg = { let digest_alg = test_case .consume_digest_alg("Hash") .ok_or(error::Unspecified)?; if digest_alg == &digest::SHA256 { hkdf::HKDF_SHA256 } else { // TODO: add test vectors for other algorithms panic!("unsupported algorithm: {:?}", digest_alg); } }; let secret = test_case.consume_bytes("IKM"); let salt = test_case.consume_bytes("salt"); let info = test_case.consume_bytes("info"); let _ = test_case.consume_bytes("PRK"); let expected_out = test_case.consume_bytes("OKM"); let salt = hkdf::Salt::new(alg, &salt); // TODO: test multi-part info, especially with empty parts. 
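// `extract` turns (salt, IKM) into a PRK and `expand` derives OKM of the
// requested length. The `My` newtype defined at the bottom of this file
// supplies the `hkdf::KeyType` length and the `From` impl that copies the OKM
// into a Vec so it can be compared against the expected output.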
let My(out) = salt .extract(&secret) .expand(&[&info], My(expected_out.len())) .unwrap() .into(); assert_eq!(out, expected_out); Ok(()) }); } #[test] fn hkdf_output_len_tests() { for &alg in &[hkdf::HKDF_SHA256, hkdf::HKDF_SHA384, hkdf::HKDF_SHA512] { const MAX_BLOCKS: usize = 255; let salt = hkdf::Salt::new(alg, &[]); let prk = salt.extract(&[]); // TODO: enforce minimum length. { // Test zero length. let okm = prk.expand(&[b"info"], My(0)).unwrap(); let result: My> = okm.into(); assert_eq!(&result.0, &[]); } let max_out_len = MAX_BLOCKS * alg.hmac_algorithm().digest_algorithm().output_len(); { // Test maximum length output succeeds. let okm = prk.expand(&[b"info"], My(max_out_len)).unwrap(); let result: My> = okm.into(); assert_eq!(result.0.len(), max_out_len); } { // Test too-large output fails. assert!(prk.expand(&[b"info"], My(max_out_len + 1)).is_err()); } { // Test length mismatch (smaller). let okm = prk.expand(&[b"info"], My(2)).unwrap(); let mut buf = [0u8; 1]; assert_eq!(okm.fill(&mut buf), Err(error::Unspecified)); } { // Test length mismatch (larger). let okm = prk.expand(&[b"info"], My(2)).unwrap(); let mut buf = [0u8; 3]; assert_eq!(okm.fill(&mut buf), Err(error::Unspecified)); } { // Control for above two tests. let okm = prk.expand(&[b"info"], My(2)).unwrap(); let mut buf = [0u8; 2]; assert_eq!(okm.fill(&mut buf), Ok(())); } } } /// Generic newtype wrapper that lets us implement traits for externally-defined /// types. #[derive(Debug, PartialEq)] struct My(T); impl hkdf::KeyType for My { fn len(&self) -> usize { self.0 } } impl From>> for My> { fn from(okm: hkdf::Okm>) -> Self { let mut r = vec![0u8; okm.len().0]; okm.fill(&mut r).unwrap(); Self(r) } } ring-0.17.14/tests/hmac_tests.rs000064400000000000000000000072331046102023000146040ustar 00000000000000// Copyright 2015-2016 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
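// A minimal sketch of the one-shot API exercised by these tests (illustrative
// only; the key and message bytes are placeholders):
//
//     let key = hmac::Key::new(hmac::HMAC_SHA256, b"key bytes");
//     let tag = hmac::sign(&key, b"message");
//     hmac::verify(&key, b"message", tag.as_ref()).unwrap();
//
// The multi-part `hmac::Context` path used below must produce the same tag for
// the same input, whether the input is fed in one piece or byte by byte.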
#![allow(missing_docs)] use ring::{digest, hmac}; #[allow(deprecated)] use ring::{test, test_file}; #[cfg(all(target_arch = "wasm32", target_os = "unknown"))] use wasm_bindgen_test::{wasm_bindgen_test as test, wasm_bindgen_test_configure}; #[cfg(all(target_arch = "wasm32", target_os = "unknown"))] wasm_bindgen_test_configure!(run_in_browser); #[test] fn hmac_tests() { test::run(test_file!("hmac_tests.txt"), |section, test_case| { assert_eq!(section, ""); let digest_alg = test_case.consume_digest_alg("HMAC"); let key_value = test_case.consume_bytes("Key"); let mut input = test_case.consume_bytes("Input"); let output = test_case.consume_bytes("Output"); let algorithm = { let digest_alg = match digest_alg { Some(digest_alg) => digest_alg, None => { return Ok(()); } // Unsupported digest algorithm }; if digest_alg == &digest::SHA1_FOR_LEGACY_USE_ONLY { hmac::HMAC_SHA1_FOR_LEGACY_USE_ONLY } else if digest_alg == &digest::SHA256 { hmac::HMAC_SHA256 } else if digest_alg == &digest::SHA384 { hmac::HMAC_SHA384 } else if digest_alg == &digest::SHA512 { hmac::HMAC_SHA512 } else { unreachable!() } }; hmac_test_case_inner(algorithm, &key_value[..], &input[..], &output[..], true); // Tamper with the input and check that verification fails. if input.is_empty() { input.push(0); } else { input[0] ^= 1; } hmac_test_case_inner(algorithm, &key_value[..], &input[..], &output[..], false); Ok(()) }); } fn hmac_test_case_inner( algorithm: hmac::Algorithm, key_value: &[u8], input: &[u8], output: &[u8], is_ok: bool, ) { let key = hmac::Key::new(algorithm, key_value); // One-shot API. { let signature = hmac::sign(&key, input); assert_eq!(is_ok, signature.as_ref() == output); assert_eq!(is_ok, hmac::verify(&key, input, output).is_ok()); } // Multi-part API, one single part. { let mut s_ctx = hmac::Context::with_key(&key); s_ctx.update(input); let signature = s_ctx.sign(); assert_eq!(is_ok, signature.as_ref() == output); } // Multi-part API, byte by byte. { let mut ctx = hmac::Context::with_key(&key); for b in input { ctx.update(&[*b]); } let signature = ctx.sign(); assert_eq!(is_ok, signature.as_ref() == output); } } #[test] fn hmac_debug() { let key = hmac::Key::new(hmac::HMAC_SHA256, &[0; 32]); assert_eq!("Key { algorithm: SHA256 }", format!("{:?}", &key)); let ctx = hmac::Context::with_key(&key); assert_eq!("Context { algorithm: SHA256 }", format!("{:?}", &ctx)); } ring-0.17.14/tests/pbkdf2_tests.rs000064400000000000000000000055321046102023000150440ustar 00000000000000// Copyright 2015-2017 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
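// A minimal sketch of the API exercised by these tests (illustrative only; the
// iteration count, salt, and password are placeholders):
//
//     let iterations = NonZeroU32::new(100_000).unwrap();
//     let mut derived = [0u8; 32];
//     pbkdf2::derive(pbkdf2::PBKDF2_HMAC_SHA256, iterations, b"salt", b"password", &mut derived);
//     assert!(pbkdf2::verify(pbkdf2::PBKDF2_HMAC_SHA256, iterations, b"salt", b"password", &derived).is_ok());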
#![allow(missing_docs)] use core::num::NonZeroU32; use ring::{digest, error, pbkdf2}; #[allow(deprecated)] use ring::{test, test_file}; #[cfg(all(target_arch = "wasm32", target_os = "unknown"))] use wasm_bindgen_test::{wasm_bindgen_test as test, wasm_bindgen_test_configure}; #[cfg(all(target_arch = "wasm32", target_os = "unknown"))] wasm_bindgen_test_configure!(run_in_browser); /// Test vectors from BoringSSL, Go, and other sources. #[test] pub fn pbkdf2_tests() { test::run(test_file!("pbkdf2_tests.txt"), |section, test_case| { assert_eq!(section, ""); let algorithm = { let digest_alg = test_case.consume_digest_alg("Hash").unwrap(); if digest_alg == &digest::SHA1_FOR_LEGACY_USE_ONLY { pbkdf2::PBKDF2_HMAC_SHA1 } else if digest_alg == &digest::SHA256 { pbkdf2::PBKDF2_HMAC_SHA256 } else if digest_alg == &digest::SHA384 { pbkdf2::PBKDF2_HMAC_SHA384 } else if digest_alg == &digest::SHA512 { pbkdf2::PBKDF2_HMAC_SHA512 } else { unreachable!() } }; let iterations: u32 = test_case.consume_usize("c").try_into().unwrap(); let iterations: NonZeroU32 = iterations.try_into().unwrap(); let secret = test_case.consume_bytes("P"); let salt = test_case.consume_bytes("S"); let dk = test_case.consume_bytes("DK"); let verify_expected_result = test_case.consume_string("Verify"); let verify_expected_result = match verify_expected_result.as_str() { "OK" => Ok(()), "Err" => Err(error::Unspecified), _ => panic!("Unsupported value of \"Verify\""), }; { let mut out = vec![0u8; dk.len()]; pbkdf2::derive(algorithm, iterations, &salt, &secret, &mut out); assert_eq!(dk == out, verify_expected_result.is_ok() || dk.is_empty()); } assert_eq!( pbkdf2::verify(algorithm, iterations, &salt, &secret, &dk), verify_expected_result ); Ok(()) }); } ring-0.17.14/tests/quic_tests.rs000064400000000000000000000055721046102023000146410ustar 00000000000000// Copyright 2018 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
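// A minimal sketch of the header-protection API exercised by these tests
// (illustrative only; the key and sample contents are placeholders):
//
//     let key = quic::HeaderProtectionKey::new(&quic::AES_128, &[0u8; 16]).unwrap();
//     let mask = key.new_mask(&[0u8; 16]).unwrap();
//
// `new` rejects key material that is not exactly `alg.key_len()` bytes and
// `new_mask` rejects samples that are not exactly `alg.sample_len()` (16)
// bytes, as the length tests below check.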
#![allow(missing_docs)] use ring::aead::quic; #[allow(deprecated)] use ring::{test, test_file}; #[test] fn quic_aes_128() { test_quic(&quic::AES_128, test_file!("quic_aes_128_tests.txt")); } #[test] fn quic_aes_256() { test_quic(&quic::AES_256, test_file!("quic_aes_256_tests.txt")); } #[test] fn quic_chacha20() { test_quic(&quic::CHACHA20, test_file!("quic_chacha20_tests.txt")); } fn test_quic(alg: &'static quic::Algorithm, test_file: test::File) { test_key_len(alg); test_sample_len(alg); test::run(test_file, |section, test_case| { assert_eq!(section, ""); let key_bytes = test_case.consume_bytes("KEY"); let sample = test_case.consume_bytes("SAMPLE"); let mask = test_case.consume_bytes("MASK"); let key = quic::HeaderProtectionKey::new(alg, &key_bytes)?; assert_eq!(mask.as_ref(), key.new_mask(&sample)?); Ok(()) }); } #[allow(clippy::range_plus_one)] fn test_key_len(alg: &'static quic::Algorithm) { let key_len = alg.key_len(); let key_data = vec![0u8; key_len + 1]; assert!(quic::HeaderProtectionKey::new(alg, &[]).is_err()); assert!(quic::HeaderProtectionKey::new(alg, &key_data[..key_len]).is_ok()); assert!(quic::HeaderProtectionKey::new(alg, &key_data[..(key_len + 1)]).is_err()); assert!(quic::HeaderProtectionKey::new(alg, &key_data[..(key_len - 1)]).is_err()); } #[allow(clippy::range_plus_one)] fn test_sample_len(alg: &'static quic::Algorithm) { let key_len = alg.key_len(); let key_data = vec![0u8; key_len]; let key = quic::HeaderProtectionKey::new(alg, &key_data).unwrap(); let sample_len = alg.sample_len(); assert_eq!(sample_len, 16); // For all currently-implemented algorithms let sample_data = vec![0u8; sample_len + 2]; // Sample is the right size. assert!(key.new_mask(&sample_data[..sample_len]).is_ok()); // Sample is one byte too small. assert!(key.new_mask(&sample_data[..(sample_len - 1)]).is_err()); // Sample is one byte too big. assert!(key.new_mask(&sample_data[..(sample_len + 1)]).is_err()); // Sample is empty. assert!(key.new_mask(&[]).is_err()); } ring-0.17.14/tests/rand_tests.rs000064400000000000000000000045751046102023000146260ustar 00000000000000// Copyright 2015-2019 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #![allow(missing_docs)] use ring::rand::{self, SecureRandom as _}; #[allow(deprecated)] use ring::test; #[cfg(all(target_arch = "wasm32", target_os = "unknown"))] use wasm_bindgen_test::{wasm_bindgen_test as test, wasm_bindgen_test_configure}; #[cfg(all(target_arch = "wasm32", target_os = "unknown"))] wasm_bindgen_test_configure!(run_in_browser); #[test] fn test_system_random_lengths() { const LINUX_LIMIT: usize = 256; const WEB_LIMIT: usize = 65536; // Test that `fill` succeeds for various interesting lengths. `256` and // multiples thereof are interesting because that's an edge case for // `getrandom` on Linux. 
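// `WEB_LIMIT` appears to mirror the 65536-byte per-call limit browsers impose
// on `crypto.getRandomValues`, so lengths around both limits are exercised
// below (this rationale is inferred from the constant's name).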
let lengths = [ 0, 1, 2, 3, 96, LINUX_LIMIT - 1, LINUX_LIMIT, LINUX_LIMIT + 1, LINUX_LIMIT * 2, 511, 512, 513, 4096, WEB_LIMIT - 1, WEB_LIMIT, WEB_LIMIT + 1, WEB_LIMIT * 2, ]; for len in lengths.iter() { let mut buf = vec![0; *len]; let rng = rand::SystemRandom::new(); assert!(rng.fill(&mut buf).is_ok()); // If `len` < 96 then there's a big chance of false positives, but // otherwise the likelihood of a false positive is so too low to // worry about. if *len >= 96 { assert!(buf.iter().any(|x| *x != 0)); } } } #[test] fn test_system_random_traits() { test::compile_time_assert_clone::(); test::compile_time_assert_send::(); assert_eq!( "SystemRandom(())", format!("{:?}", rand::SystemRandom::new()) ); } ring-0.17.14/tests/rsa_test_private_key_2048.p8000064400000000000000000000023021046102023000172500ustar 0000000000000000  *H 0ȧPێlvuSe1tDu輒&D5~] rJOwv&?ya.F~k@X 19ssr.5UIaeȁMX$\RlI ZcLkگۨ8;mm"Q 132Qz^nBI#:8j-.vPu/jshdλ~]{lsOH8_݅x{ jdoӲ >7Gp0>Od9܈$;!KAxr_k)x&[36%0Mgāߒu,9i*! ?{q$&bf0z\HW38,! Ez[)\4XaU1Wll%i|M_ejyfR6ݍy} [)P#*UEسȸW|VosaNkvܲdp=q-Lœ!疞5_[-z\@W4_N_6ݠ/Bh}2xE84H8'4Ca1%w0N9h\`B܏nR&KcJ)BDՎO_'U \#zGQR2\=f2@.]-׻BLTݖ Aj7 C\^P.mZ t=ja-qnLm3"Rse:ring-0.17.14/tests/rsa_test_public_key_2048.der000064400000000000000000000004161046102023000173030ustar 000000000000000 ȧPێlvuSe1tDu輒&D5~] rJOwv&?ya.F~k@X 19ssr.5UIaeȁMX$\RlI ZcLkگۨ8;mm"Q 132Qz^nBIring-0.17.14/tests/rsa_test_public_key_2048_debug.txt000064400000000000000000000010511046102023000205120ustar 00000000000000PublicKey("3082010a0282010100c8a78500a5a250db8ed36c85b8dcf83c4be1953114faaac7616e0ea24922fa6b7ab01f85582c815cc3bdeb5ed46762bc536accaa8b72705b00cef316b2ec508fb9697241b9e34238419cccf7339eeb8b062147af4f5932f613d9bc0ae70bf6d56d4432e83e13767587531bfa9dd56531741244be75e8bc9226b9fa44b4b8a101358d7e8bb75d0c724a4f11ece77776263faefe79612eb1d71646e77e8982866be1400eafc3580d3139b41aaa7380187372f22e35bd55b288496165c881ed154d5811245c52d56cc09d4916d4f2a50bcf5ae0a2637f4cfa6bf9daafc113dba8383b6dd7da6dd8db22d8510a8d3115983308909a1a0332517aa55e896e154249b30203010001")ring-0.17.14/tests/rsa_test_public_modulus.bin000064400000000000000000000004001046102023000175150ustar 00000000000000ȧPێlvuSe1tDu輒&D5~] rJOwv&?ya.F~k@X 19ssr.5UIaeȁMX$\RlI ZcLkگۨ8;mm"Q 132Qz^nBIring-0.17.14/tests/rsa_tests.rs000064400000000000000000000316361046102023000144650ustar 00000000000000// Copyright 2017 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
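// A minimal sketch of the signing flow exercised by these tests (illustrative
// only; `PKCS8_BYTES` and `MSG` are placeholders):
//
//     let key_pair = rsa::KeyPair::from_pkcs8(PKCS8_BYTES).unwrap();
//     let rng = rand::SystemRandom::new();
//     let mut sig = vec![0u8; key_pair.public().modulus_len()];
//     key_pair.sign(&signature::RSA_PKCS1_SHA256, &rng, MSG, &mut sig).unwrap();
//
// The output buffer must be exactly `modulus_len()` bytes, which
// `test_signature_rsa_pkcs1_sign_output_buffer_len` below checks explicitly.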
#![allow(missing_docs)] #![cfg(feature = "alloc")] use ring::{ error, io::der, rand, rsa, signature::{self, KeyPair}, }; #[allow(deprecated)] use ring::{test, test_file}; #[cfg(all(target_arch = "wasm32", target_os = "unknown"))] use wasm_bindgen_test::{wasm_bindgen_test as test, wasm_bindgen_test_configure}; #[cfg(all(target_arch = "wasm32", target_os = "unknown"))] wasm_bindgen_test_configure!(run_in_browser); #[test] fn rsa_from_pkcs8_test() { test::run( test_file!("rsa_from_pkcs8_tests.txt"), |section, test_case| { assert_eq!(section, ""); let input = test_case.consume_bytes("Input"); let error = test_case.consume_optional_string("Error"); match (rsa::KeyPair::from_pkcs8(&input), error) { (Ok(_), None) => {} (Err(e), None) => panic!("Failed with error \"{}\", but expected to succeed", e), (Ok(_), Some(e)) => panic!("Succeeded, but expected error \"{}\"", e), (Err(actual), Some(expected)) => assert_eq!(format!("{}", actual), expected), }; Ok(()) }, ); } #[cfg(feature = "alloc")] #[test] fn test_signature_rsa_pkcs1_sign() { let rng = rand::SystemRandom::new(); test::run( test_file!("rsa_pkcs1_sign_tests.txt"), |section, test_case| { assert_eq!(section, ""); let digest_name = test_case.consume_string("Digest"); let alg = match digest_name.as_ref() { "SHA256" => &signature::RSA_PKCS1_SHA256, "SHA384" => &signature::RSA_PKCS1_SHA384, "SHA512" => &signature::RSA_PKCS1_SHA512, _ => panic!("Unsupported digest: {}", digest_name), }; let private_key = test_case.consume_bytes("Key"); let msg = test_case.consume_bytes("Msg"); let expected = test_case.consume_bytes("Sig"); let result = test_case.consume_string("Result"); let key_pair = rsa::KeyPair::from_der(&private_key); if result == "Fail-Invalid-Key" { assert!(key_pair.is_err()); return Ok(()); } let key_pair = key_pair.unwrap(); // XXX: This test is too slow on Android ARM Travis CI builds. // TODO: re-enable these tests on Android ARM. let mut actual = vec![0u8; key_pair.public().modulus_len()]; key_pair .sign(alg, &rng, &msg, actual.as_mut_slice()) .unwrap(); assert_eq!(actual.as_slice() == &expected[..], result == "Pass"); Ok(()) }, ); } #[cfg(feature = "alloc")] #[test] fn test_signature_rsa_pss_sign() { test::run( test_file!("rsa_pss_sign_tests.txt"), |section, test_case| { assert_eq!(section, ""); let digest_name = test_case.consume_string("Digest"); let alg = match digest_name.as_ref() { "SHA256" => &signature::RSA_PSS_SHA256, "SHA384" => &signature::RSA_PSS_SHA384, "SHA512" => &signature::RSA_PSS_SHA512, _ => panic!("Unsupported digest: {}", digest_name), }; let result = test_case.consume_string("Result"); let private_key = test_case.consume_bytes("Key"); let key_pair = rsa::KeyPair::from_der(&private_key); if key_pair.is_err() && result == "Fail-Invalid-Key" { return Ok(()); } let key_pair = key_pair.unwrap(); let msg = test_case.consume_bytes("Msg"); let salt = test_case.consume_bytes("Salt"); let expected = test_case.consume_bytes("Sig"); #[allow(deprecated)] let rng = test::rand::FixedSliceRandom { bytes: &salt }; let mut actual = vec![0u8; key_pair.public().modulus_len()]; key_pair.sign(alg, &rng, &msg, actual.as_mut_slice())?; assert_eq!(actual.as_slice() == &expected[..], result == "Pass"); Ok(()) }, ); } // `KeyPair::sign` requires that the output buffer is the same length as // the public key modulus. Test what happens when it isn't the same length. #[test] fn test_signature_rsa_pkcs1_sign_output_buffer_len() { // Sign the message "hello, world", using PKCS#1 v1.5 padding and the // SHA256 digest algorithm. 
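// A PKCS#1 v1.5 signature is always exactly as long as the public modulus, so
// the loop below expects `sign` to succeed only when the output buffer length
// equals `modulus_len()` and to fail cleanly for every other length.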
const MESSAGE: &[u8] = b"hello, world"; let rng = rand::SystemRandom::new(); const PRIVATE_KEY_DER: &[u8] = include_bytes!("../src/rsa/signature_rsa_example_private_key.der"); let key_pair = rsa::KeyPair::from_der(PRIVATE_KEY_DER).unwrap(); // When the output buffer is not exactly the right length, `sign()` returns // an error (and does not panic or invoke UB). if `sign` doesn't check that // the length is correct at the beginning then there are various possible // failure points when the output buffer is too small. for len in 0..key_pair.public().modulus_len() + 1 { let mut signature = vec![0; len]; assert_eq!( len == key_pair.public().modulus_len(), key_pair .sign(&signature::RSA_PKCS1_SHA256, &rng, MESSAGE, &mut signature) .is_ok() ); } } #[cfg(feature = "alloc")] #[test] fn test_signature_rsa_pkcs1_verify() { let sha1_params = &[ ( &signature::RSA_PKCS1_1024_8192_SHA1_FOR_LEGACY_USE_ONLY, 1024, ), ( &signature::RSA_PKCS1_2048_8192_SHA1_FOR_LEGACY_USE_ONLY, 2048, ), ]; let sha256_params = &[ ( &signature::RSA_PKCS1_1024_8192_SHA256_FOR_LEGACY_USE_ONLY, 1024, ), (&signature::RSA_PKCS1_2048_8192_SHA256, 2048), ]; let sha384_params = &[ (&signature::RSA_PKCS1_2048_8192_SHA384, 2048), (&signature::RSA_PKCS1_3072_8192_SHA384, 3072), ]; let sha512_params = &[ ( &signature::RSA_PKCS1_1024_8192_SHA512_FOR_LEGACY_USE_ONLY, 1024, ), (&signature::RSA_PKCS1_2048_8192_SHA512, 2048), ]; test::run( test_file!("rsa_pkcs1_verify_tests.txt"), |section, test_case| { assert_eq!(section, ""); let digest_name = test_case.consume_string("Digest"); let params: &[_] = match digest_name.as_ref() { "SHA1" => sha1_params, "SHA256" => sha256_params, "SHA384" => sha384_params, "SHA512" => sha512_params, _ => panic!("Unsupported digest: {}", digest_name), }; let public_key = test_case.consume_bytes("Key"); // Sanity check that we correctly DER-encoded the originally- // provided separate (n, e) components. When we add test vectors // for improperly-encoded signatures, we'll have to revisit this. let key_bits = untrusted::Input::from(&public_key) .read_all(error::Unspecified, |input| { der::nested(input, der::Tag::Sequence, error::Unspecified, |input| { let n_bytes = der::positive_integer(input)?.big_endian_without_leading_zero(); let _e = der::positive_integer(input)?; // Because `n_bytes` has the leading zeros stripped and is big-endian, there // must be less than 8 leading zero bits. 
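// Example: a 2048-bit modulus is encoded as 256 bytes whose most significant
// byte has its top bit set, so `leading_zeros` is 0 and the computed bit width
// is 256 * 8 - 0 = 2048.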
let n_leading_zeros = usize::try_from(n_bytes[0].leading_zeros()).unwrap(); assert!(n_leading_zeros < 8); Ok((n_bytes.len() * 8) - n_leading_zeros) }) }) .expect("invalid DER"); let msg = test_case.consume_bytes("Msg"); let sig = test_case.consume_bytes("Sig"); let is_valid = test_case.consume_string("Result") == "P"; for &(alg, min_bits) in params { let width_ok = key_bits >= min_bits; let actual_result = signature::UnparsedPublicKey::new(alg, &public_key).verify(&msg, &sig); assert_eq!(actual_result.is_ok(), is_valid && width_ok); } Ok(()) }, ); } #[cfg(feature = "alloc")] #[test] fn test_signature_rsa_pss_verify() { test::run( test_file!("rsa_pss_verify_tests.txt"), |section, test_case| { assert_eq!(section, ""); let digest_name = test_case.consume_string("Digest"); let alg = match digest_name.as_ref() { "SHA256" => &signature::RSA_PSS_2048_8192_SHA256, "SHA384" => &signature::RSA_PSS_2048_8192_SHA384, "SHA512" => &signature::RSA_PSS_2048_8192_SHA512, _ => panic!("Unsupported digest: {}", digest_name), }; let public_key = test_case.consume_bytes("Key"); // Sanity check that we correctly DER-encoded the originally- // provided separate (n, e) components. When we add test vectors // for improperly-encoded signatures, we'll have to revisit this. assert!(untrusted::Input::from(&public_key) .read_all(error::Unspecified, |input| der::nested( input, der::Tag::Sequence, error::Unspecified, |input| { let _ = der::positive_integer(input)?; let _ = der::positive_integer(input)?; Ok(()) } )) .is_ok()); let msg = test_case.consume_bytes("Msg"); let sig = test_case.consume_bytes("Sig"); let is_valid = test_case.consume_string("Result") == "P"; let actual_result = signature::UnparsedPublicKey::new(alg, &public_key).verify(&msg, &sig); assert_eq!(actual_result.is_ok(), is_valid); Ok(()) }, ); } // Test for `primitive::verify()`. Read public key parts from a file // and use them to verify a signature. #[cfg(feature = "alloc")] #[test] fn test_signature_rsa_primitive_verification() { test::run( test_file!("rsa_primitive_verify_tests.txt"), |section, test_case| { assert_eq!(section, ""); let n = test_case.consume_bytes("n"); let e = test_case.consume_bytes("e"); let msg = test_case.consume_bytes("Msg"); let sig = test_case.consume_bytes("Sig"); let expected = test_case.consume_string("Result"); let public_key = signature::RsaPublicKeyComponents { n: &n, e: &e }; let result = public_key.verify(&signature::RSA_PKCS1_2048_8192_SHA256, &msg, &sig); assert_eq!(result.is_ok(), expected == "Pass"); Ok(()) }, ) } #[cfg(feature = "alloc")] #[test] fn rsa_test_keypair_coverage() { const PRIVATE_KEY: &[u8] = include_bytes!("rsa_test_private_key_2048.p8"); let key_pair = rsa::KeyPair::from_pkcs8(PRIVATE_KEY).unwrap(); // Test that `signature::KeyPair::PublicKey` is `rsa::PublicKey`; if it // were a separate type then it would need to be tested separately. let _: &rsa::PublicKey = key_pair.public_key(); test_public_key_coverage(key_pair.public()); // Test clones. test_public_key_coverage(&key_pair.public().clone()); // Test `Debug` assert_eq!( format!("RsaKeyPair {{ public: {:?} }}", key_pair.public_key()), format!("{:?}", key_pair) ); } fn test_public_key_coverage(key: &rsa::PublicKey) { // Test `AsRef<[u8]>` const PUBLIC_KEY: &[u8] = include_bytes!("rsa_test_public_key_2048.der"); assert_eq!(key.as_ref(), PUBLIC_KEY); // Test `Debug`. 
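// The expected Debug string (rsa_test_public_key_2048_debug.txt) appears to be
// the same DER-encoded public key compared against `as_ref()` above, rendered
// as hex inside `PublicKey("...")`.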
const PUBLIC_KEY_DEBUG: &str = include_str!("rsa_test_public_key_2048_debug.txt"); assert_eq!(PUBLIC_KEY_DEBUG, format!("{:?}", key)); let components = rsa::PublicKeyComponents::>::from(key); const PUBLIC_KEY_MODULUS_BE_BYTES: &[u8] = include_bytes!("rsa_test_public_modulus.bin"); assert_eq!(PUBLIC_KEY_MODULUS_BE_BYTES, &components.n); const _65537: &[u8] = &[0x01, 0x00, 0x01]; assert_eq!(_65537, &components.e); } ring-0.17.14/tests/signature_tests.rs000064400000000000000000000017311046102023000156720ustar 00000000000000#![allow(missing_docs)] use ring::signature; #[allow(deprecated)] use ring::test; #[cfg(all(target_arch = "wasm32", target_os = "unknown"))] use wasm_bindgen_test::{wasm_bindgen_test as test, wasm_bindgen_test_configure}; #[cfg(all(target_arch = "wasm32", target_os = "unknown"))] wasm_bindgen_test_configure!(run_in_browser); #[test] fn signature_impl_test() { test::compile_time_assert_clone::(); test::compile_time_assert_copy::(); test::compile_time_assert_send::(); test::compile_time_assert_sync::(); let unparsed_public_key = signature::UnparsedPublicKey::new(&signature::ED25519, &[0x01, 0x02, 0x03]); assert_eq!( format!("{:?}", unparsed_public_key), r#"UnparsedPublicKey { algorithm: ring::signature::ED25519, bytes: "010203" }"# ); // Test `AsRef<[u8]>` assert_eq!(unparsed_public_key.as_ref(), &[0x01, 0x02, 0x03]); } ring-0.17.14/third_party/fiat/LICENSE000064400000000000000000000011761046102023000152230ustar 00000000000000The Apache License, Version 2.0 (Apache-2.0) Copyright 2015-2020 the fiat-crypto authors (see the AUTHORS file) Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
ring-0.17.14/third_party/fiat/asm/fiat_curve25519_adx_mul.S000064400000000000000000000066101046102023000213660ustar 00000000000000#include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \ (defined(__APPLE__) || defined(__ELF__)) .intel_syntax noprefix .text #if defined(__APPLE__) .private_extern _fiat_curve25519_adx_mul .global _fiat_curve25519_adx_mul _fiat_curve25519_adx_mul: #else .type fiat_curve25519_adx_mul, @function .hidden fiat_curve25519_adx_mul .global fiat_curve25519_adx_mul fiat_curve25519_adx_mul: #endif .cfi_startproc _CET_ENDBR push rbp .cfi_adjust_cfa_offset 8 .cfi_offset rbp, -16 mov rbp, rsp mov rax, rdx mov rdx, [ rsi + 0x18 ] mulx r11, r10, [ rax + 0x8 ] mov rdx, [ rax + 0x0 ] mov [ rsp - 0x58 ], r15 .cfi_offset r15, -16-0x58 mulx r8, rcx, [ rsi + 0x18 ] mov rdx, [ rsi + 0x8 ] mov [ rsp - 0x80 ], rbx .cfi_offset rbx, -16-0x80 mulx rbx, r9, [ rax + 0x18 ] mov rdx, [ rsi + 0x8 ] mov [ rsp - 0x70 ], r12 .cfi_offset r12, -16-0x70 mulx r15, r12, [ rax + 0x8 ] mov rdx, [ rsi + 0x0 ] mov [ rsp - 0x68 ], r13 .cfi_offset r13, -16-0x68 mov [ rsp - 0x60 ], r14 .cfi_offset r14, -16-0x60 mulx r14, r13, [ rax + 0x0 ] mov rdx, [ rax + 0x10 ] mov [ rsp - 0x18 ], r15 mov [ rsp - 0x50 ], rdi mulx rdi, r15, [ rsi + 0x0 ] mov rdx, [ rax + 0x18 ] mov [ rsp - 0x48 ], r13 mov [ rsp - 0x40 ], r9 mulx r9, r13, [ rsi + 0x0 ] test al, al adox rcx, rdi mov rdx, [ rsi + 0x10 ] mov [ rsp - 0x38 ], r13 mulx r13, rdi, [ rax + 0x8 ] adox r10, r9 mov rdx, 0x0 adox rbx, rdx adcx rdi, rcx adcx r8, r10 mov r9, rdx adcx r9, rbx mov rdx, [ rsi + 0x10 ] mulx r10, rcx, [ rax + 0x0 ] mov rdx, [ rsi + 0x0 ] mov [ rsp - 0x30 ], r15 mulx r15, rbx, [ rax + 0x8 ] mov rdx, -0x2 inc rdx adox rcx, r15 setc r15b clc adcx rcx, r12 adox r10, rdi mov rdx, [ rax + 0x10 ] mov [ rsp - 0x78 ], rcx mulx rcx, rdi, [ rsi + 0x10 ] adox rdi, r8 mov rdx, [ rax + 0x18 ] mov [ rsp - 0x28 ], rcx mulx rcx, r8, [ rsi + 0x10 ] mov rdx, [ rax + 0x10 ] mov [ rsp - 0x20 ], r8 mulx r12, r8, [ rsi + 0x18 ] adox r8, r9 mov rdx, [ rsi + 0x8 ] mov [ rsp - 0x10 ], r12 mulx r12, r9, [ rax + 0x10 ] movzx rdx, r15b lea rdx, [ rdx + rcx ] adcx r9, r10 adcx r13, rdi mov r15, 0x0 mov r10, r15 adox r10, rdx mov rdx, [ rax + 0x18 ] mulx rcx, rdi, [ rsi + 0x18 ] adox rcx, r15 adcx r11, r8 mov rdx, r15 adcx rdx, r10 adcx rcx, r15 mov r8, rdx mov rdx, [ rax + 0x0 ] mulx r15, r10, [ rsi + 0x8 ] test al, al adox r10, r14 adcx rbx, r10 adox r15, [ rsp - 0x78 ] adcx r15, [ rsp - 0x30 ] adox r9, [ rsp - 0x18 ] adcx r9, [ rsp - 0x38 ] adox r13, [ rsp - 0x40 ] adcx r12, r13 adox r11, [ rsp - 0x20 ] adcx r11, [ rsp - 0x28 ] mov rdx, 0x26 mulx rsi, r14, r12 adox rdi, r8 adcx rdi, [ rsp - 0x10 ] mulx r10, r8, r11 mov r13, 0x0 adox rcx, r13 adcx rcx, r13 mulx r11, r12, rdi xor rdi, rdi adox r8, rbx adox r12, r15 mulx rbx, r13, rcx adcx r14, [ rsp - 0x48 ] adox r13, r9 adox rbx, rdi adcx rsi, r8 adcx r10, r12 adcx r11, r13 adc rbx, 0x0 mulx r9, r15, rbx xor r9, r9 adox r15, r14 mov rdi, r9 adox rdi, rsi mov rcx, r9 adox rcx, r10 mov r8, [ rsp - 0x50 ] mov [ r8 + 0x8 ], rdi mov r12, r9 adox r12, r11 mov r14, r9 cmovo r14, rdx mov [ r8 + 0x18 ], r12 adcx r15, r14 mov [ r8 + 0x0 ], r15 mov [ r8 + 0x10 ], rcx mov rbx, [ rsp - 0x80 ] .cfi_restore rbx mov r12, [ rsp - 0x70 ] .cfi_restore r12 mov r13, [ rsp - 0x68 ] .cfi_restore r13 mov r14, [ rsp - 0x60 ] .cfi_restore r14 mov r15, [ rsp - 0x58 ] .cfi_restore r15 pop rbp .cfi_restore rbp .cfi_adjust_cfa_offset -8 ret .cfi_endproc #if defined(__ELF__) .size fiat_curve25519_adx_mul, .-fiat_curve25519_adx_mul #endif 
#endif ring-0.17.14/third_party/fiat/asm/fiat_curve25519_adx_square.S000064400000000000000000000051431046102023000220710ustar 00000000000000#include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \ (defined(__APPLE__) || defined(__ELF__)) .intel_syntax noprefix .text #if defined(__APPLE__) .private_extern _fiat_curve25519_adx_square .global _fiat_curve25519_adx_square _fiat_curve25519_adx_square: #else .type fiat_curve25519_adx_square, @function .hidden fiat_curve25519_adx_square .global fiat_curve25519_adx_square fiat_curve25519_adx_square: #endif .cfi_startproc _CET_ENDBR push rbp .cfi_adjust_cfa_offset 8 .cfi_offset rbp, -16 mov rbp, rsp mov rdx, [ rsi + 0x0 ] mulx r10, rax, [ rsi + 0x8 ] mov rdx, [ rsi + 0x0 ] mulx rcx, r11, [ rsi + 0x10 ] xor rdx, rdx adox r11, r10 mov rdx, [ rsi + 0x0 ] mulx r9, r8, [ rsi + 0x18 ] mov rdx, [ rsi + 0x8 ] mov [ rsp - 0x80 ], rbx .cfi_offset rbx, -16-0x80 mulx rbx, r10, [ rsi + 0x18 ] adox r8, rcx mov [rsp - 0x48 ], rdi adox r10, r9 adcx rax, rax mov rdx, [ rsi + 0x10 ] mulx r9, rcx, [ rsi + 0x18 ] adox rcx, rbx mov rdx, [ rsi + 0x10 ] mulx rdi, rbx, [ rsi + 0x8 ] mov rdx, 0x0 adox r9, rdx mov [ rsp - 0x70 ], r12 .cfi_offset r12, -16-0x70 mov r12, -0x3 inc r12 adox rbx, r8 adox rdi, r10 adcx r11, r11 mov r8, rdx adox r8, rcx mov r10, rdx adox r10, r9 adcx rbx, rbx mov rdx, [ rsi + 0x0 ] mulx r9, rcx, rdx mov rdx, [ rsi + 0x8 ] mov [ rsp - 0x68 ], r13 .cfi_offset r13, -16-0x68 mov [ rsp - 0x60 ], r14 .cfi_offset r14, -16-0x60 mulx r14, r13, rdx seto dl inc r12 adox r9, rax adox r13, r11 adox r14, rbx adcx rdi, rdi mov al, dl mov rdx, [ rsi + 0x10 ] mulx rbx, r11, rdx adox r11, rdi adcx r8, r8 adox rbx, r8 adcx r10, r10 movzx rdx, al mov rdi, 0x0 adcx rdx, rdi movzx r8, al lea r8, [ r8 + rdx ] mov rdx, [ rsi + 0x18 ] mulx rdi, rax, rdx adox rax, r10 mov rdx, 0x26 mov [ rsp - 0x58 ], r15 .cfi_offset r15, -16-0x58 mulx r15, r10, r11 clc adcx r10, rcx mulx r11, rcx, rbx adox r8, rdi mulx rdi, rbx, r8 inc r12 adox rcx, r9 mulx r8, r9, rax adcx r15, rcx adox r9, r13 adcx r11, r9 adox rbx, r14 adox rdi, r12 adcx r8, rbx adc rdi, 0x0 mulx r14, r13, rdi test al, al mov rdi, [ rsp - 0x48 ] adox r13, r10 mov r14, r12 adox r14, r15 mov [ rdi + 0x8 ], r14 mov rax, r12 adox rax, r11 mov r10, r12 adox r10, r8 mov [ rdi + 0x10 ], rax mov rcx, r12 cmovo rcx, rdx adcx r13, rcx mov [ rdi + 0x0 ], r13 mov [ rdi + 0x18 ], r10 mov rbx, [ rsp - 0x80 ] .cfi_restore rbx mov r12, [ rsp - 0x70 ] .cfi_restore r12 mov r13, [ rsp - 0x68 ] .cfi_restore r13 mov r14, [ rsp - 0x60 ] .cfi_restore r14 mov r15, [ rsp - 0x58 ] .cfi_restore r15 pop rbp .cfi_restore rbp .cfi_adjust_cfa_offset -8 ret .cfi_endproc #if defined(__ELF__) .size fiat_curve25519_adx_square, .-fiat_curve25519_adx_square #endif #endif ring-0.17.14/third_party/fiat/curve25519_32.h000064400000000000000000001237441046102023000164330ustar 00000000000000/* Autogenerated: 'src/ExtractionOCaml/unsaturated_solinas' --inline --static --use-value-barrier 25519 32 '(auto)' '2^255 - 19' carry_mul carry_square carry add sub opp selectznz to_bytes from_bytes relax carry_scmul121666 */ /* curve description: 25519 */ /* machine_wordsize = 32 (from "32") */ /* requested operations: carry_mul, carry_square, carry, add, sub, opp, selectznz, to_bytes, from_bytes, relax, carry_scmul121666 */ /* n = 10 (from "(auto)") */ /* s-c = 2^255 - [(1, 19)] (from "2^255 - 19") */ /* tight_bounds_multiplier = 1 (from "") */ /* */ /* Computed values: */ /* carry_chain = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1] */ /* eval z = z[0] + (z[1] << 26) 
+ (z[2] << 51) + (z[3] << 77) + (z[4] << 102) + (z[5] << 128) + (z[6] << 153) + (z[7] << 179) + (z[8] << 204) + (z[9] << 230) */ /* bytes_eval z = z[0] + (z[1] << 8) + (z[2] << 16) + (z[3] << 24) + (z[4] << 32) + (z[5] << 40) + (z[6] << 48) + (z[7] << 56) + (z[8] << 64) + (z[9] << 72) + (z[10] << 80) + (z[11] << 88) + (z[12] << 96) + (z[13] << 104) + (z[14] << 112) + (z[15] << 120) + (z[16] << 128) + (z[17] << 136) + (z[18] << 144) + (z[19] << 152) + (z[20] << 160) + (z[21] << 168) + (z[22] << 176) + (z[23] << 184) + (z[24] << 192) + (z[25] << 200) + (z[26] << 208) + (z[27] << 216) + (z[28] << 224) + (z[29] << 232) + (z[30] << 240) + (z[31] << 248) */ /* balance = [0x7ffffda, 0x3fffffe, 0x7fffffe, 0x3fffffe, 0x7fffffe, 0x3fffffe, 0x7fffffe, 0x3fffffe, 0x7fffffe, 0x3fffffe] */ #include typedef unsigned char fiat_25519_uint1; typedef signed char fiat_25519_int1; #if defined(__GNUC__) || defined(__clang__) # define FIAT_25519_FIAT_INLINE __inline__ #else # define FIAT_25519_FIAT_INLINE #endif /* The type fiat_25519_loose_field_element is a field element with loose bounds. */ /* Bounds: [[0x0 ~> 0xc000000], [0x0 ~> 0x6000000], [0x0 ~> 0xc000000], [0x0 ~> 0x6000000], [0x0 ~> 0xc000000], [0x0 ~> 0x6000000], [0x0 ~> 0xc000000], [0x0 ~> 0x6000000], [0x0 ~> 0xc000000], [0x0 ~> 0x6000000]] */ typedef uint32_t fiat_25519_loose_field_element[10]; /* The type fiat_25519_tight_field_element is a field element with tight bounds. */ /* Bounds: [[0x0 ~> 0x4000000], [0x0 ~> 0x2000000], [0x0 ~> 0x4000000], [0x0 ~> 0x2000000], [0x0 ~> 0x4000000], [0x0 ~> 0x2000000], [0x0 ~> 0x4000000], [0x0 ~> 0x2000000], [0x0 ~> 0x4000000], [0x0 ~> 0x2000000]] */ typedef uint32_t fiat_25519_tight_field_element[10]; #if (-1 & 3) != 3 #error "This code only works on a two's complement system" #endif #if !defined(FIAT_25519_NO_ASM) && (defined(__GNUC__) || defined(__clang__)) static __inline__ uint32_t fiat_25519_value_barrier_u32(uint32_t a) { __asm__("" : "+r"(a) : /* no inputs */); return a; } #else # define fiat_25519_value_barrier_u32(x) (x) #endif /* * The function fiat_25519_addcarryx_u26 is an addition with carry. * * Postconditions: * out1 = (arg1 + arg2 + arg3) mod 2^26 * out2 = ⌊(arg1 + arg2 + arg3) / 2^26⌋ * * Input Bounds: * arg1: [0x0 ~> 0x1] * arg2: [0x0 ~> 0x3ffffff] * arg3: [0x0 ~> 0x3ffffff] * Output Bounds: * out1: [0x0 ~> 0x3ffffff] * out2: [0x0 ~> 0x1] */ static FIAT_25519_FIAT_INLINE void fiat_25519_addcarryx_u26(uint32_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint32_t arg2, uint32_t arg3) { uint32_t x1; uint32_t x2; fiat_25519_uint1 x3; x1 = ((arg1 + arg2) + arg3); x2 = (x1 & UINT32_C(0x3ffffff)); x3 = (fiat_25519_uint1)(x1 >> 26); *out1 = x2; *out2 = x3; } /* * The function fiat_25519_subborrowx_u26 is a subtraction with borrow. * * Postconditions: * out1 = (-arg1 + arg2 + -arg3) mod 2^26 * out2 = -⌊(-arg1 + arg2 + -arg3) / 2^26⌋ * * Input Bounds: * arg1: [0x0 ~> 0x1] * arg2: [0x0 ~> 0x3ffffff] * arg3: [0x0 ~> 0x3ffffff] * Output Bounds: * out1: [0x0 ~> 0x3ffffff] * out2: [0x0 ~> 0x1] */ static FIAT_25519_FIAT_INLINE void fiat_25519_subborrowx_u26(uint32_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint32_t arg2, uint32_t arg3) { int32_t x1; fiat_25519_int1 x2; uint32_t x3; x1 = ((int32_t)(arg2 - arg1) - (int32_t)arg3); x2 = (fiat_25519_int1)(x1 >> 26); x3 = (x1 & UINT32_C(0x3ffffff)); *out1 = x3; *out2 = (fiat_25519_uint1)(0x0 - x2); } /* * The function fiat_25519_addcarryx_u25 is an addition with carry. 
* * Postconditions: * out1 = (arg1 + arg2 + arg3) mod 2^25 * out2 = ⌊(arg1 + arg2 + arg3) / 2^25⌋ * * Input Bounds: * arg1: [0x0 ~> 0x1] * arg2: [0x0 ~> 0x1ffffff] * arg3: [0x0 ~> 0x1ffffff] * Output Bounds: * out1: [0x0 ~> 0x1ffffff] * out2: [0x0 ~> 0x1] */ static FIAT_25519_FIAT_INLINE void fiat_25519_addcarryx_u25(uint32_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint32_t arg2, uint32_t arg3) { uint32_t x1; uint32_t x2; fiat_25519_uint1 x3; x1 = ((arg1 + arg2) + arg3); x2 = (x1 & UINT32_C(0x1ffffff)); x3 = (fiat_25519_uint1)(x1 >> 25); *out1 = x2; *out2 = x3; } /* * The function fiat_25519_subborrowx_u25 is a subtraction with borrow. * * Postconditions: * out1 = (-arg1 + arg2 + -arg3) mod 2^25 * out2 = -⌊(-arg1 + arg2 + -arg3) / 2^25⌋ * * Input Bounds: * arg1: [0x0 ~> 0x1] * arg2: [0x0 ~> 0x1ffffff] * arg3: [0x0 ~> 0x1ffffff] * Output Bounds: * out1: [0x0 ~> 0x1ffffff] * out2: [0x0 ~> 0x1] */ static FIAT_25519_FIAT_INLINE void fiat_25519_subborrowx_u25(uint32_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint32_t arg2, uint32_t arg3) { int32_t x1; fiat_25519_int1 x2; uint32_t x3; x1 = ((int32_t)(arg2 - arg1) - (int32_t)arg3); x2 = (fiat_25519_int1)(x1 >> 25); x3 = (x1 & UINT32_C(0x1ffffff)); *out1 = x3; *out2 = (fiat_25519_uint1)(0x0 - x2); } /* * The function fiat_25519_cmovznz_u32 is a single-word conditional move. * * Postconditions: * out1 = (if arg1 = 0 then arg2 else arg3) * * Input Bounds: * arg1: [0x0 ~> 0x1] * arg2: [0x0 ~> 0xffffffff] * arg3: [0x0 ~> 0xffffffff] * Output Bounds: * out1: [0x0 ~> 0xffffffff] */ static FIAT_25519_FIAT_INLINE void fiat_25519_cmovznz_u32(uint32_t* out1, fiat_25519_uint1 arg1, uint32_t arg2, uint32_t arg3) { fiat_25519_uint1 x1; uint32_t x2; uint32_t x3; x1 = (!(!arg1)); x2 = ((fiat_25519_int1)(0x0 - x1) & UINT32_C(0xffffffff)); x3 = ((fiat_25519_value_barrier_u32(x2) & arg3) | (fiat_25519_value_barrier_u32((~x2)) & arg2)); *out1 = x3; } /* * The function fiat_25519_carry_mul multiplies two field elements and reduces the result. 
* * Postconditions: * eval out1 mod m = (eval arg1 * eval arg2) mod m * */ static FIAT_25519_FIAT_INLINE void fiat_25519_carry_mul(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1, const fiat_25519_loose_field_element arg2) { uint64_t x1; uint64_t x2; uint64_t x3; uint64_t x4; uint64_t x5; uint64_t x6; uint64_t x7; uint64_t x8; uint64_t x9; uint64_t x10; uint64_t x11; uint64_t x12; uint64_t x13; uint64_t x14; uint64_t x15; uint64_t x16; uint64_t x17; uint64_t x18; uint64_t x19; uint64_t x20; uint64_t x21; uint64_t x22; uint64_t x23; uint64_t x24; uint64_t x25; uint64_t x26; uint64_t x27; uint64_t x28; uint64_t x29; uint64_t x30; uint64_t x31; uint64_t x32; uint64_t x33; uint64_t x34; uint64_t x35; uint64_t x36; uint64_t x37; uint64_t x38; uint64_t x39; uint64_t x40; uint64_t x41; uint64_t x42; uint64_t x43; uint64_t x44; uint64_t x45; uint64_t x46; uint64_t x47; uint64_t x48; uint64_t x49; uint64_t x50; uint64_t x51; uint64_t x52; uint64_t x53; uint64_t x54; uint64_t x55; uint64_t x56; uint64_t x57; uint64_t x58; uint64_t x59; uint64_t x60; uint64_t x61; uint64_t x62; uint64_t x63; uint64_t x64; uint64_t x65; uint64_t x66; uint64_t x67; uint64_t x68; uint64_t x69; uint64_t x70; uint64_t x71; uint64_t x72; uint64_t x73; uint64_t x74; uint64_t x75; uint64_t x76; uint64_t x77; uint64_t x78; uint64_t x79; uint64_t x80; uint64_t x81; uint64_t x82; uint64_t x83; uint64_t x84; uint64_t x85; uint64_t x86; uint64_t x87; uint64_t x88; uint64_t x89; uint64_t x90; uint64_t x91; uint64_t x92; uint64_t x93; uint64_t x94; uint64_t x95; uint64_t x96; uint64_t x97; uint64_t x98; uint64_t x99; uint64_t x100; uint64_t x101; uint64_t x102; uint32_t x103; uint64_t x104; uint64_t x105; uint64_t x106; uint64_t x107; uint64_t x108; uint64_t x109; uint64_t x110; uint64_t x111; uint64_t x112; uint64_t x113; uint64_t x114; uint32_t x115; uint64_t x116; uint64_t x117; uint32_t x118; uint64_t x119; uint64_t x120; uint32_t x121; uint64_t x122; uint64_t x123; uint32_t x124; uint64_t x125; uint64_t x126; uint32_t x127; uint64_t x128; uint64_t x129; uint32_t x130; uint64_t x131; uint64_t x132; uint32_t x133; uint64_t x134; uint64_t x135; uint32_t x136; uint64_t x137; uint64_t x138; uint32_t x139; uint64_t x140; uint64_t x141; uint32_t x142; uint32_t x143; uint32_t x144; fiat_25519_uint1 x145; uint32_t x146; uint32_t x147; x1 = ((uint64_t)(arg1[9]) * ((arg2[9]) * UINT8_C(0x26))); x2 = ((uint64_t)(arg1[9]) * ((arg2[8]) * UINT8_C(0x13))); x3 = ((uint64_t)(arg1[9]) * ((arg2[7]) * UINT8_C(0x26))); x4 = ((uint64_t)(arg1[9]) * ((arg2[6]) * UINT8_C(0x13))); x5 = ((uint64_t)(arg1[9]) * ((arg2[5]) * UINT8_C(0x26))); x6 = ((uint64_t)(arg1[9]) * ((arg2[4]) * UINT8_C(0x13))); x7 = ((uint64_t)(arg1[9]) * ((arg2[3]) * UINT8_C(0x26))); x8 = ((uint64_t)(arg1[9]) * ((arg2[2]) * UINT8_C(0x13))); x9 = ((uint64_t)(arg1[9]) * ((arg2[1]) * UINT8_C(0x26))); x10 = ((uint64_t)(arg1[8]) * ((arg2[9]) * UINT8_C(0x13))); x11 = ((uint64_t)(arg1[8]) * ((arg2[8]) * UINT8_C(0x13))); x12 = ((uint64_t)(arg1[8]) * ((arg2[7]) * UINT8_C(0x13))); x13 = ((uint64_t)(arg1[8]) * ((arg2[6]) * UINT8_C(0x13))); x14 = ((uint64_t)(arg1[8]) * ((arg2[5]) * UINT8_C(0x13))); x15 = ((uint64_t)(arg1[8]) * ((arg2[4]) * UINT8_C(0x13))); x16 = ((uint64_t)(arg1[8]) * ((arg2[3]) * UINT8_C(0x13))); x17 = ((uint64_t)(arg1[8]) * ((arg2[2]) * UINT8_C(0x13))); x18 = ((uint64_t)(arg1[7]) * ((arg2[9]) * UINT8_C(0x26))); x19 = ((uint64_t)(arg1[7]) * ((arg2[8]) * UINT8_C(0x13))); x20 = ((uint64_t)(arg1[7]) * ((arg2[7]) * UINT8_C(0x26))); x21 = 
((uint64_t)(arg1[7]) * ((arg2[6]) * UINT8_C(0x13))); x22 = ((uint64_t)(arg1[7]) * ((arg2[5]) * UINT8_C(0x26))); x23 = ((uint64_t)(arg1[7]) * ((arg2[4]) * UINT8_C(0x13))); x24 = ((uint64_t)(arg1[7]) * ((arg2[3]) * UINT8_C(0x26))); x25 = ((uint64_t)(arg1[6]) * ((arg2[9]) * UINT8_C(0x13))); x26 = ((uint64_t)(arg1[6]) * ((arg2[8]) * UINT8_C(0x13))); x27 = ((uint64_t)(arg1[6]) * ((arg2[7]) * UINT8_C(0x13))); x28 = ((uint64_t)(arg1[6]) * ((arg2[6]) * UINT8_C(0x13))); x29 = ((uint64_t)(arg1[6]) * ((arg2[5]) * UINT8_C(0x13))); x30 = ((uint64_t)(arg1[6]) * ((arg2[4]) * UINT8_C(0x13))); x31 = ((uint64_t)(arg1[5]) * ((arg2[9]) * UINT8_C(0x26))); x32 = ((uint64_t)(arg1[5]) * ((arg2[8]) * UINT8_C(0x13))); x33 = ((uint64_t)(arg1[5]) * ((arg2[7]) * UINT8_C(0x26))); x34 = ((uint64_t)(arg1[5]) * ((arg2[6]) * UINT8_C(0x13))); x35 = ((uint64_t)(arg1[5]) * ((arg2[5]) * UINT8_C(0x26))); x36 = ((uint64_t)(arg1[4]) * ((arg2[9]) * UINT8_C(0x13))); x37 = ((uint64_t)(arg1[4]) * ((arg2[8]) * UINT8_C(0x13))); x38 = ((uint64_t)(arg1[4]) * ((arg2[7]) * UINT8_C(0x13))); x39 = ((uint64_t)(arg1[4]) * ((arg2[6]) * UINT8_C(0x13))); x40 = ((uint64_t)(arg1[3]) * ((arg2[9]) * UINT8_C(0x26))); x41 = ((uint64_t)(arg1[3]) * ((arg2[8]) * UINT8_C(0x13))); x42 = ((uint64_t)(arg1[3]) * ((arg2[7]) * UINT8_C(0x26))); x43 = ((uint64_t)(arg1[2]) * ((arg2[9]) * UINT8_C(0x13))); x44 = ((uint64_t)(arg1[2]) * ((arg2[8]) * UINT8_C(0x13))); x45 = ((uint64_t)(arg1[1]) * ((arg2[9]) * UINT8_C(0x26))); x46 = ((uint64_t)(arg1[9]) * (arg2[0])); x47 = ((uint64_t)(arg1[8]) * (arg2[1])); x48 = ((uint64_t)(arg1[8]) * (arg2[0])); x49 = ((uint64_t)(arg1[7]) * (arg2[2])); x50 = ((uint64_t)(arg1[7]) * ((arg2[1]) * 0x2)); x51 = ((uint64_t)(arg1[7]) * (arg2[0])); x52 = ((uint64_t)(arg1[6]) * (arg2[3])); x53 = ((uint64_t)(arg1[6]) * (arg2[2])); x54 = ((uint64_t)(arg1[6]) * (arg2[1])); x55 = ((uint64_t)(arg1[6]) * (arg2[0])); x56 = ((uint64_t)(arg1[5]) * (arg2[4])); x57 = ((uint64_t)(arg1[5]) * ((arg2[3]) * 0x2)); x58 = ((uint64_t)(arg1[5]) * (arg2[2])); x59 = ((uint64_t)(arg1[5]) * ((arg2[1]) * 0x2)); x60 = ((uint64_t)(arg1[5]) * (arg2[0])); x61 = ((uint64_t)(arg1[4]) * (arg2[5])); x62 = ((uint64_t)(arg1[4]) * (arg2[4])); x63 = ((uint64_t)(arg1[4]) * (arg2[3])); x64 = ((uint64_t)(arg1[4]) * (arg2[2])); x65 = ((uint64_t)(arg1[4]) * (arg2[1])); x66 = ((uint64_t)(arg1[4]) * (arg2[0])); x67 = ((uint64_t)(arg1[3]) * (arg2[6])); x68 = ((uint64_t)(arg1[3]) * ((arg2[5]) * 0x2)); x69 = ((uint64_t)(arg1[3]) * (arg2[4])); x70 = ((uint64_t)(arg1[3]) * ((arg2[3]) * 0x2)); x71 = ((uint64_t)(arg1[3]) * (arg2[2])); x72 = ((uint64_t)(arg1[3]) * ((arg2[1]) * 0x2)); x73 = ((uint64_t)(arg1[3]) * (arg2[0])); x74 = ((uint64_t)(arg1[2]) * (arg2[7])); x75 = ((uint64_t)(arg1[2]) * (arg2[6])); x76 = ((uint64_t)(arg1[2]) * (arg2[5])); x77 = ((uint64_t)(arg1[2]) * (arg2[4])); x78 = ((uint64_t)(arg1[2]) * (arg2[3])); x79 = ((uint64_t)(arg1[2]) * (arg2[2])); x80 = ((uint64_t)(arg1[2]) * (arg2[1])); x81 = ((uint64_t)(arg1[2]) * (arg2[0])); x82 = ((uint64_t)(arg1[1]) * (arg2[8])); x83 = ((uint64_t)(arg1[1]) * ((arg2[7]) * 0x2)); x84 = ((uint64_t)(arg1[1]) * (arg2[6])); x85 = ((uint64_t)(arg1[1]) * ((arg2[5]) * 0x2)); x86 = ((uint64_t)(arg1[1]) * (arg2[4])); x87 = ((uint64_t)(arg1[1]) * ((arg2[3]) * 0x2)); x88 = ((uint64_t)(arg1[1]) * (arg2[2])); x89 = ((uint64_t)(arg1[1]) * ((arg2[1]) * 0x2)); x90 = ((uint64_t)(arg1[1]) * (arg2[0])); x91 = ((uint64_t)(arg1[0]) * (arg2[9])); x92 = ((uint64_t)(arg1[0]) * (arg2[8])); x93 = ((uint64_t)(arg1[0]) * (arg2[7])); x94 = ((uint64_t)(arg1[0]) * 
(arg2[6])); x95 = ((uint64_t)(arg1[0]) * (arg2[5])); x96 = ((uint64_t)(arg1[0]) * (arg2[4])); x97 = ((uint64_t)(arg1[0]) * (arg2[3])); x98 = ((uint64_t)(arg1[0]) * (arg2[2])); x99 = ((uint64_t)(arg1[0]) * (arg2[1])); x100 = ((uint64_t)(arg1[0]) * (arg2[0])); x101 = (x100 + (x45 + (x44 + (x42 + (x39 + (x35 + (x30 + (x24 + (x17 + x9))))))))); x102 = (x101 >> 26); x103 = (uint32_t)(x101 & UINT32_C(0x3ffffff)); x104 = (x91 + (x82 + (x74 + (x67 + (x61 + (x56 + (x52 + (x49 + (x47 + x46))))))))); x105 = (x92 + (x83 + (x75 + (x68 + (x62 + (x57 + (x53 + (x50 + (x48 + x1))))))))); x106 = (x93 + (x84 + (x76 + (x69 + (x63 + (x58 + (x54 + (x51 + (x10 + x2))))))))); x107 = (x94 + (x85 + (x77 + (x70 + (x64 + (x59 + (x55 + (x18 + (x11 + x3))))))))); x108 = (x95 + (x86 + (x78 + (x71 + (x65 + (x60 + (x25 + (x19 + (x12 + x4))))))))); x109 = (x96 + (x87 + (x79 + (x72 + (x66 + (x31 + (x26 + (x20 + (x13 + x5))))))))); x110 = (x97 + (x88 + (x80 + (x73 + (x36 + (x32 + (x27 + (x21 + (x14 + x6))))))))); x111 = (x98 + (x89 + (x81 + (x40 + (x37 + (x33 + (x28 + (x22 + (x15 + x7))))))))); x112 = (x99 + (x90 + (x43 + (x41 + (x38 + (x34 + (x29 + (x23 + (x16 + x8))))))))); x113 = (x102 + x112); x114 = (x113 >> 25); x115 = (uint32_t)(x113 & UINT32_C(0x1ffffff)); x116 = (x114 + x111); x117 = (x116 >> 26); x118 = (uint32_t)(x116 & UINT32_C(0x3ffffff)); x119 = (x117 + x110); x120 = (x119 >> 25); x121 = (uint32_t)(x119 & UINT32_C(0x1ffffff)); x122 = (x120 + x109); x123 = (x122 >> 26); x124 = (uint32_t)(x122 & UINT32_C(0x3ffffff)); x125 = (x123 + x108); x126 = (x125 >> 25); x127 = (uint32_t)(x125 & UINT32_C(0x1ffffff)); x128 = (x126 + x107); x129 = (x128 >> 26); x130 = (uint32_t)(x128 & UINT32_C(0x3ffffff)); x131 = (x129 + x106); x132 = (x131 >> 25); x133 = (uint32_t)(x131 & UINT32_C(0x1ffffff)); x134 = (x132 + x105); x135 = (x134 >> 26); x136 = (uint32_t)(x134 & UINT32_C(0x3ffffff)); x137 = (x135 + x104); x138 = (x137 >> 25); x139 = (uint32_t)(x137 & UINT32_C(0x1ffffff)); x140 = (x138 * UINT8_C(0x13)); x141 = (x103 + x140); x142 = (uint32_t)(x141 >> 26); x143 = (uint32_t)(x141 & UINT32_C(0x3ffffff)); x144 = (x142 + x115); x145 = (fiat_25519_uint1)(x144 >> 25); x146 = (x144 & UINT32_C(0x1ffffff)); x147 = (x145 + x118); out1[0] = x143; out1[1] = x146; out1[2] = x147; out1[3] = x121; out1[4] = x124; out1[5] = x127; out1[6] = x130; out1[7] = x133; out1[8] = x136; out1[9] = x139; } /* * The function fiat_25519_carry_square squares a field element and reduces the result. 
* * Postconditions: * eval out1 mod m = (eval arg1 * eval arg1) mod m * */ static FIAT_25519_FIAT_INLINE void fiat_25519_carry_square(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1) { uint32_t x1; uint32_t x2; uint32_t x3; uint32_t x4; uint64_t x5; uint32_t x6; uint32_t x7; uint32_t x8; uint32_t x9; uint32_t x10; uint64_t x11; uint32_t x12; uint32_t x13; uint32_t x14; uint32_t x15; uint32_t x16; uint32_t x17; uint32_t x18; uint64_t x19; uint64_t x20; uint64_t x21; uint64_t x22; uint64_t x23; uint64_t x24; uint64_t x25; uint64_t x26; uint64_t x27; uint64_t x28; uint64_t x29; uint64_t x30; uint64_t x31; uint64_t x32; uint64_t x33; uint64_t x34; uint64_t x35; uint64_t x36; uint64_t x37; uint64_t x38; uint64_t x39; uint64_t x40; uint64_t x41; uint64_t x42; uint64_t x43; uint64_t x44; uint64_t x45; uint64_t x46; uint64_t x47; uint64_t x48; uint64_t x49; uint64_t x50; uint64_t x51; uint64_t x52; uint64_t x53; uint64_t x54; uint64_t x55; uint64_t x56; uint64_t x57; uint64_t x58; uint64_t x59; uint64_t x60; uint64_t x61; uint64_t x62; uint64_t x63; uint64_t x64; uint64_t x65; uint64_t x66; uint64_t x67; uint64_t x68; uint64_t x69; uint64_t x70; uint64_t x71; uint64_t x72; uint64_t x73; uint64_t x74; uint64_t x75; uint32_t x76; uint64_t x77; uint64_t x78; uint64_t x79; uint64_t x80; uint64_t x81; uint64_t x82; uint64_t x83; uint64_t x84; uint64_t x85; uint64_t x86; uint64_t x87; uint32_t x88; uint64_t x89; uint64_t x90; uint32_t x91; uint64_t x92; uint64_t x93; uint32_t x94; uint64_t x95; uint64_t x96; uint32_t x97; uint64_t x98; uint64_t x99; uint32_t x100; uint64_t x101; uint64_t x102; uint32_t x103; uint64_t x104; uint64_t x105; uint32_t x106; uint64_t x107; uint64_t x108; uint32_t x109; uint64_t x110; uint64_t x111; uint32_t x112; uint64_t x113; uint64_t x114; uint32_t x115; uint32_t x116; uint32_t x117; fiat_25519_uint1 x118; uint32_t x119; uint32_t x120; x1 = ((arg1[9]) * UINT8_C(0x13)); x2 = (x1 * 0x2); x3 = ((arg1[9]) * 0x2); x4 = ((arg1[8]) * UINT8_C(0x13)); x5 = ((uint64_t)x4 * 0x2); x6 = ((arg1[8]) * 0x2); x7 = ((arg1[7]) * UINT8_C(0x13)); x8 = (x7 * 0x2); x9 = ((arg1[7]) * 0x2); x10 = ((arg1[6]) * UINT8_C(0x13)); x11 = ((uint64_t)x10 * 0x2); x12 = ((arg1[6]) * 0x2); x13 = ((arg1[5]) * UINT8_C(0x13)); x14 = ((arg1[5]) * 0x2); x15 = ((arg1[4]) * 0x2); x16 = ((arg1[3]) * 0x2); x17 = ((arg1[2]) * 0x2); x18 = ((arg1[1]) * 0x2); x19 = ((uint64_t)(arg1[9]) * (x1 * 0x2)); x20 = ((uint64_t)(arg1[8]) * x2); x21 = ((uint64_t)(arg1[8]) * x4); x22 = ((arg1[7]) * ((uint64_t)x2 * 0x2)); x23 = ((arg1[7]) * x5); x24 = ((uint64_t)(arg1[7]) * (x7 * 0x2)); x25 = ((uint64_t)(arg1[6]) * x2); x26 = ((arg1[6]) * x5); x27 = ((uint64_t)(arg1[6]) * x8); x28 = ((uint64_t)(arg1[6]) * x10); x29 = ((arg1[5]) * ((uint64_t)x2 * 0x2)); x30 = ((arg1[5]) * x5); x31 = ((arg1[5]) * ((uint64_t)x8 * 0x2)); x32 = ((arg1[5]) * x11); x33 = ((uint64_t)(arg1[5]) * (x13 * 0x2)); x34 = ((uint64_t)(arg1[4]) * x2); x35 = ((arg1[4]) * x5); x36 = ((uint64_t)(arg1[4]) * x8); x37 = ((arg1[4]) * x11); x38 = ((uint64_t)(arg1[4]) * x14); x39 = ((uint64_t)(arg1[4]) * (arg1[4])); x40 = ((arg1[3]) * ((uint64_t)x2 * 0x2)); x41 = ((arg1[3]) * x5); x42 = ((arg1[3]) * ((uint64_t)x8 * 0x2)); x43 = ((uint64_t)(arg1[3]) * x12); x44 = ((uint64_t)(arg1[3]) * (x14 * 0x2)); x45 = ((uint64_t)(arg1[3]) * x15); x46 = ((uint64_t)(arg1[3]) * ((arg1[3]) * 0x2)); x47 = ((uint64_t)(arg1[2]) * x2); x48 = ((arg1[2]) * x5); x49 = ((uint64_t)(arg1[2]) * x9); x50 = ((uint64_t)(arg1[2]) * x12); x51 = ((uint64_t)(arg1[2]) * x14); x52 = 
((uint64_t)(arg1[2]) * x15); x53 = ((uint64_t)(arg1[2]) * x16); x54 = ((uint64_t)(arg1[2]) * (arg1[2])); x55 = ((arg1[1]) * ((uint64_t)x2 * 0x2)); x56 = ((uint64_t)(arg1[1]) * x6); x57 = ((uint64_t)(arg1[1]) * (x9 * 0x2)); x58 = ((uint64_t)(arg1[1]) * x12); x59 = ((uint64_t)(arg1[1]) * (x14 * 0x2)); x60 = ((uint64_t)(arg1[1]) * x15); x61 = ((uint64_t)(arg1[1]) * (x16 * 0x2)); x62 = ((uint64_t)(arg1[1]) * x17); x63 = ((uint64_t)(arg1[1]) * ((arg1[1]) * 0x2)); x64 = ((uint64_t)(arg1[0]) * x3); x65 = ((uint64_t)(arg1[0]) * x6); x66 = ((uint64_t)(arg1[0]) * x9); x67 = ((uint64_t)(arg1[0]) * x12); x68 = ((uint64_t)(arg1[0]) * x14); x69 = ((uint64_t)(arg1[0]) * x15); x70 = ((uint64_t)(arg1[0]) * x16); x71 = ((uint64_t)(arg1[0]) * x17); x72 = ((uint64_t)(arg1[0]) * x18); x73 = ((uint64_t)(arg1[0]) * (arg1[0])); x74 = (x73 + (x55 + (x48 + (x42 + (x37 + x33))))); x75 = (x74 >> 26); x76 = (uint32_t)(x74 & UINT32_C(0x3ffffff)); x77 = (x64 + (x56 + (x49 + (x43 + x38)))); x78 = (x65 + (x57 + (x50 + (x44 + (x39 + x19))))); x79 = (x66 + (x58 + (x51 + (x45 + x20)))); x80 = (x67 + (x59 + (x52 + (x46 + (x22 + x21))))); x81 = (x68 + (x60 + (x53 + (x25 + x23)))); x82 = (x69 + (x61 + (x54 + (x29 + (x26 + x24))))); x83 = (x70 + (x62 + (x34 + (x30 + x27)))); x84 = (x71 + (x63 + (x40 + (x35 + (x31 + x28))))); x85 = (x72 + (x47 + (x41 + (x36 + x32)))); x86 = (x75 + x85); x87 = (x86 >> 25); x88 = (uint32_t)(x86 & UINT32_C(0x1ffffff)); x89 = (x87 + x84); x90 = (x89 >> 26); x91 = (uint32_t)(x89 & UINT32_C(0x3ffffff)); x92 = (x90 + x83); x93 = (x92 >> 25); x94 = (uint32_t)(x92 & UINT32_C(0x1ffffff)); x95 = (x93 + x82); x96 = (x95 >> 26); x97 = (uint32_t)(x95 & UINT32_C(0x3ffffff)); x98 = (x96 + x81); x99 = (x98 >> 25); x100 = (uint32_t)(x98 & UINT32_C(0x1ffffff)); x101 = (x99 + x80); x102 = (x101 >> 26); x103 = (uint32_t)(x101 & UINT32_C(0x3ffffff)); x104 = (x102 + x79); x105 = (x104 >> 25); x106 = (uint32_t)(x104 & UINT32_C(0x1ffffff)); x107 = (x105 + x78); x108 = (x107 >> 26); x109 = (uint32_t)(x107 & UINT32_C(0x3ffffff)); x110 = (x108 + x77); x111 = (x110 >> 25); x112 = (uint32_t)(x110 & UINT32_C(0x1ffffff)); x113 = (x111 * UINT8_C(0x13)); x114 = (x76 + x113); x115 = (uint32_t)(x114 >> 26); x116 = (uint32_t)(x114 & UINT32_C(0x3ffffff)); x117 = (x115 + x88); x118 = (fiat_25519_uint1)(x117 >> 25); x119 = (x117 & UINT32_C(0x1ffffff)); x120 = (x118 + x91); out1[0] = x116; out1[1] = x119; out1[2] = x120; out1[3] = x94; out1[4] = x97; out1[5] = x100; out1[6] = x103; out1[7] = x106; out1[8] = x109; out1[9] = x112; } /* * The function fiat_25519_carry reduces a field element. 
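 * Editorial note, not part of the generated text: a single left-to-right pass
 * moves each limb's overflow into the next limb; the carry out of the top
 * 25-bit limb is folded back into limb 0 as carry * 19, since 2^255 = 19
 * (mod m), and one short extra pass re-propagates into limbs 1 and 2.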
* * Postconditions: * eval out1 mod m = eval arg1 mod m * */ static FIAT_25519_FIAT_INLINE void fiat_25519_carry(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1) { uint32_t x1; uint32_t x2; uint32_t x3; uint32_t x4; uint32_t x5; uint32_t x6; uint32_t x7; uint32_t x8; uint32_t x9; uint32_t x10; uint32_t x11; uint32_t x12; uint32_t x13; uint32_t x14; uint32_t x15; uint32_t x16; uint32_t x17; uint32_t x18; uint32_t x19; uint32_t x20; uint32_t x21; uint32_t x22; x1 = (arg1[0]); x2 = ((x1 >> 26) + (arg1[1])); x3 = ((x2 >> 25) + (arg1[2])); x4 = ((x3 >> 26) + (arg1[3])); x5 = ((x4 >> 25) + (arg1[4])); x6 = ((x5 >> 26) + (arg1[5])); x7 = ((x6 >> 25) + (arg1[6])); x8 = ((x7 >> 26) + (arg1[7])); x9 = ((x8 >> 25) + (arg1[8])); x10 = ((x9 >> 26) + (arg1[9])); x11 = ((x1 & UINT32_C(0x3ffffff)) + ((x10 >> 25) * UINT8_C(0x13))); x12 = ((fiat_25519_uint1)(x11 >> 26) + (x2 & UINT32_C(0x1ffffff))); x13 = (x11 & UINT32_C(0x3ffffff)); x14 = (x12 & UINT32_C(0x1ffffff)); x15 = ((fiat_25519_uint1)(x12 >> 25) + (x3 & UINT32_C(0x3ffffff))); x16 = (x4 & UINT32_C(0x1ffffff)); x17 = (x5 & UINT32_C(0x3ffffff)); x18 = (x6 & UINT32_C(0x1ffffff)); x19 = (x7 & UINT32_C(0x3ffffff)); x20 = (x8 & UINT32_C(0x1ffffff)); x21 = (x9 & UINT32_C(0x3ffffff)); x22 = (x10 & UINT32_C(0x1ffffff)); out1[0] = x13; out1[1] = x14; out1[2] = x15; out1[3] = x16; out1[4] = x17; out1[5] = x18; out1[6] = x19; out1[7] = x20; out1[8] = x21; out1[9] = x22; } /* * The function fiat_25519_add adds two field elements. * * Postconditions: * eval out1 mod m = (eval arg1 + eval arg2) mod m * */ static FIAT_25519_FIAT_INLINE void fiat_25519_add(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1, const fiat_25519_tight_field_element arg2) { uint32_t x1; uint32_t x2; uint32_t x3; uint32_t x4; uint32_t x5; uint32_t x6; uint32_t x7; uint32_t x8; uint32_t x9; uint32_t x10; x1 = ((arg1[0]) + (arg2[0])); x2 = ((arg1[1]) + (arg2[1])); x3 = ((arg1[2]) + (arg2[2])); x4 = ((arg1[3]) + (arg2[3])); x5 = ((arg1[4]) + (arg2[4])); x6 = ((arg1[5]) + (arg2[5])); x7 = ((arg1[6]) + (arg2[6])); x8 = ((arg1[7]) + (arg2[7])); x9 = ((arg1[8]) + (arg2[8])); x10 = ((arg1[9]) + (arg2[9])); out1[0] = x1; out1[1] = x2; out1[2] = x3; out1[3] = x4; out1[4] = x5; out1[5] = x6; out1[6] = x7; out1[7] = x8; out1[8] = x9; out1[9] = x10; } /* * The function fiat_25519_sub subtracts two field elements. * * Postconditions: * eval out1 mod m = (eval arg1 - eval arg2) mod m * */ static FIAT_25519_FIAT_INLINE void fiat_25519_sub(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1, const fiat_25519_tight_field_element arg2) { uint32_t x1; uint32_t x2; uint32_t x3; uint32_t x4; uint32_t x5; uint32_t x6; uint32_t x7; uint32_t x8; uint32_t x9; uint32_t x10; x1 = ((UINT32_C(0x7ffffda) + (arg1[0])) - (arg2[0])); x2 = ((UINT32_C(0x3fffffe) + (arg1[1])) - (arg2[1])); x3 = ((UINT32_C(0x7fffffe) + (arg1[2])) - (arg2[2])); x4 = ((UINT32_C(0x3fffffe) + (arg1[3])) - (arg2[3])); x5 = ((UINT32_C(0x7fffffe) + (arg1[4])) - (arg2[4])); x6 = ((UINT32_C(0x3fffffe) + (arg1[5])) - (arg2[5])); x7 = ((UINT32_C(0x7fffffe) + (arg1[6])) - (arg2[6])); x8 = ((UINT32_C(0x3fffffe) + (arg1[7])) - (arg2[7])); x9 = ((UINT32_C(0x7fffffe) + (arg1[8])) - (arg2[8])); x10 = ((UINT32_C(0x3fffffe) + (arg1[9])) - (arg2[9])); out1[0] = x1; out1[1] = x2; out1[2] = x3; out1[3] = x4; out1[4] = x5; out1[5] = x6; out1[6] = x7; out1[7] = x8; out1[8] = x9; out1[9] = x10; } /* * The function fiat_25519_opp negates a field element. 
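 * Editorial note, not part of the generated text: as in fiat_25519_sub above,
 * the constants 0x7ffffda, 0x3fffffe, 0x7fffffe, ... are the "balance" limbs
 * (limb-wise 2*m), added so the per-limb subtractions cannot underflow; the
 * result therefore only satisfies the loose bounds and needs a later carry.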
* * Postconditions: * eval out1 mod m = -eval arg1 mod m * */ static FIAT_25519_FIAT_INLINE void fiat_25519_opp(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1) { uint32_t x1; uint32_t x2; uint32_t x3; uint32_t x4; uint32_t x5; uint32_t x6; uint32_t x7; uint32_t x8; uint32_t x9; uint32_t x10; x1 = (UINT32_C(0x7ffffda) - (arg1[0])); x2 = (UINT32_C(0x3fffffe) - (arg1[1])); x3 = (UINT32_C(0x7fffffe) - (arg1[2])); x4 = (UINT32_C(0x3fffffe) - (arg1[3])); x5 = (UINT32_C(0x7fffffe) - (arg1[4])); x6 = (UINT32_C(0x3fffffe) - (arg1[5])); x7 = (UINT32_C(0x7fffffe) - (arg1[6])); x8 = (UINT32_C(0x3fffffe) - (arg1[7])); x9 = (UINT32_C(0x7fffffe) - (arg1[8])); x10 = (UINT32_C(0x3fffffe) - (arg1[9])); out1[0] = x1; out1[1] = x2; out1[2] = x3; out1[3] = x4; out1[4] = x5; out1[5] = x6; out1[6] = x7; out1[7] = x8; out1[8] = x9; out1[9] = x10; } /* * The function fiat_25519_to_bytes serializes a field element to bytes in little-endian order. * * Postconditions: * out1 = map (λ x, ⌊((eval arg1 mod m) mod 2^(8 * (x + 1))) / 2^(8 * x)⌋) [0..31] * * Output Bounds: * out1: [[0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0x7f]] */ static FIAT_25519_FIAT_INLINE void fiat_25519_to_bytes(uint8_t out1[32], const fiat_25519_tight_field_element arg1) { uint32_t x1; fiat_25519_uint1 x2; uint32_t x3; fiat_25519_uint1 x4; uint32_t x5; fiat_25519_uint1 x6; uint32_t x7; fiat_25519_uint1 x8; uint32_t x9; fiat_25519_uint1 x10; uint32_t x11; fiat_25519_uint1 x12; uint32_t x13; fiat_25519_uint1 x14; uint32_t x15; fiat_25519_uint1 x16; uint32_t x17; fiat_25519_uint1 x18; uint32_t x19; fiat_25519_uint1 x20; uint32_t x21; uint32_t x22; fiat_25519_uint1 x23; uint32_t x24; fiat_25519_uint1 x25; uint32_t x26; fiat_25519_uint1 x27; uint32_t x28; fiat_25519_uint1 x29; uint32_t x30; fiat_25519_uint1 x31; uint32_t x32; fiat_25519_uint1 x33; uint32_t x34; fiat_25519_uint1 x35; uint32_t x36; fiat_25519_uint1 x37; uint32_t x38; fiat_25519_uint1 x39; uint32_t x40; fiat_25519_uint1 x41; uint32_t x42; uint32_t x43; uint32_t x44; uint32_t x45; uint32_t x46; uint32_t x47; uint32_t x48; uint32_t x49; uint8_t x50; uint32_t x51; uint8_t x52; uint32_t x53; uint8_t x54; uint8_t x55; uint32_t x56; uint8_t x57; uint32_t x58; uint8_t x59; uint32_t x60; uint8_t x61; uint8_t x62; uint32_t x63; uint8_t x64; uint32_t x65; uint8_t x66; uint32_t x67; uint8_t x68; uint8_t x69; uint32_t x70; uint8_t x71; uint32_t x72; uint8_t x73; uint32_t x74; uint8_t x75; uint8_t x76; uint32_t x77; uint8_t x78; uint32_t x79; uint8_t x80; uint32_t x81; uint8_t x82; uint8_t x83; uint8_t x84; uint32_t x85; uint8_t x86; uint32_t x87; uint8_t x88; fiat_25519_uint1 x89; uint32_t x90; uint8_t x91; uint32_t x92; uint8_t x93; uint32_t x94; uint8_t x95; uint8_t x96; uint32_t x97; uint8_t x98; uint32_t x99; uint8_t x100; uint32_t x101; uint8_t x102; uint8_t x103; uint32_t x104; uint8_t x105; uint32_t x106; uint8_t x107; uint32_t x108; uint8_t x109; uint8_t x110; uint32_t x111; uint8_t x112; uint32_t x113; uint8_t x114; uint32_t x115; uint8_t x116; uint8_t x117; fiat_25519_subborrowx_u26(&x1, &x2, 0x0, (arg1[0]), UINT32_C(0x3ffffed)); 
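  /* NOTE (editorial): this borrow chain subtracts m = 2^255 - 19, expressed
     limb-wise as 0x3ffffed, 0x1ffffff, 0x3ffffff, ...; the final borrow x20
     drives the cmovznz mask below, which conditionally adds m back so the
     value that gets serialized is the canonical representative in [0, m). */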
fiat_25519_subborrowx_u25(&x3, &x4, x2, (arg1[1]), UINT32_C(0x1ffffff)); fiat_25519_subborrowx_u26(&x5, &x6, x4, (arg1[2]), UINT32_C(0x3ffffff)); fiat_25519_subborrowx_u25(&x7, &x8, x6, (arg1[3]), UINT32_C(0x1ffffff)); fiat_25519_subborrowx_u26(&x9, &x10, x8, (arg1[4]), UINT32_C(0x3ffffff)); fiat_25519_subborrowx_u25(&x11, &x12, x10, (arg1[5]), UINT32_C(0x1ffffff)); fiat_25519_subborrowx_u26(&x13, &x14, x12, (arg1[6]), UINT32_C(0x3ffffff)); fiat_25519_subborrowx_u25(&x15, &x16, x14, (arg1[7]), UINT32_C(0x1ffffff)); fiat_25519_subborrowx_u26(&x17, &x18, x16, (arg1[8]), UINT32_C(0x3ffffff)); fiat_25519_subborrowx_u25(&x19, &x20, x18, (arg1[9]), UINT32_C(0x1ffffff)); fiat_25519_cmovznz_u32(&x21, x20, 0x0, UINT32_C(0xffffffff)); fiat_25519_addcarryx_u26(&x22, &x23, 0x0, x1, (x21 & UINT32_C(0x3ffffed))); fiat_25519_addcarryx_u25(&x24, &x25, x23, x3, (x21 & UINT32_C(0x1ffffff))); fiat_25519_addcarryx_u26(&x26, &x27, x25, x5, (x21 & UINT32_C(0x3ffffff))); fiat_25519_addcarryx_u25(&x28, &x29, x27, x7, (x21 & UINT32_C(0x1ffffff))); fiat_25519_addcarryx_u26(&x30, &x31, x29, x9, (x21 & UINT32_C(0x3ffffff))); fiat_25519_addcarryx_u25(&x32, &x33, x31, x11, (x21 & UINT32_C(0x1ffffff))); fiat_25519_addcarryx_u26(&x34, &x35, x33, x13, (x21 & UINT32_C(0x3ffffff))); fiat_25519_addcarryx_u25(&x36, &x37, x35, x15, (x21 & UINT32_C(0x1ffffff))); fiat_25519_addcarryx_u26(&x38, &x39, x37, x17, (x21 & UINT32_C(0x3ffffff))); fiat_25519_addcarryx_u25(&x40, &x41, x39, x19, (x21 & UINT32_C(0x1ffffff))); x42 = (x40 << 6); x43 = (x38 << 4); x44 = (x36 << 3); x45 = (x34 * (uint32_t)0x2); x46 = (x30 << 6); x47 = (x28 << 5); x48 = (x26 << 3); x49 = (x24 << 2); x50 = (uint8_t)(x22 & UINT8_C(0xff)); x51 = (x22 >> 8); x52 = (uint8_t)(x51 & UINT8_C(0xff)); x53 = (x51 >> 8); x54 = (uint8_t)(x53 & UINT8_C(0xff)); x55 = (uint8_t)(x53 >> 8); x56 = (x49 + (uint32_t)x55); x57 = (uint8_t)(x56 & UINT8_C(0xff)); x58 = (x56 >> 8); x59 = (uint8_t)(x58 & UINT8_C(0xff)); x60 = (x58 >> 8); x61 = (uint8_t)(x60 & UINT8_C(0xff)); x62 = (uint8_t)(x60 >> 8); x63 = (x48 + (uint32_t)x62); x64 = (uint8_t)(x63 & UINT8_C(0xff)); x65 = (x63 >> 8); x66 = (uint8_t)(x65 & UINT8_C(0xff)); x67 = (x65 >> 8); x68 = (uint8_t)(x67 & UINT8_C(0xff)); x69 = (uint8_t)(x67 >> 8); x70 = (x47 + (uint32_t)x69); x71 = (uint8_t)(x70 & UINT8_C(0xff)); x72 = (x70 >> 8); x73 = (uint8_t)(x72 & UINT8_C(0xff)); x74 = (x72 >> 8); x75 = (uint8_t)(x74 & UINT8_C(0xff)); x76 = (uint8_t)(x74 >> 8); x77 = (x46 + (uint32_t)x76); x78 = (uint8_t)(x77 & UINT8_C(0xff)); x79 = (x77 >> 8); x80 = (uint8_t)(x79 & UINT8_C(0xff)); x81 = (x79 >> 8); x82 = (uint8_t)(x81 & UINT8_C(0xff)); x83 = (uint8_t)(x81 >> 8); x84 = (uint8_t)(x32 & UINT8_C(0xff)); x85 = (x32 >> 8); x86 = (uint8_t)(x85 & UINT8_C(0xff)); x87 = (x85 >> 8); x88 = (uint8_t)(x87 & UINT8_C(0xff)); x89 = (fiat_25519_uint1)(x87 >> 8); x90 = (x45 + (uint32_t)x89); x91 = (uint8_t)(x90 & UINT8_C(0xff)); x92 = (x90 >> 8); x93 = (uint8_t)(x92 & UINT8_C(0xff)); x94 = (x92 >> 8); x95 = (uint8_t)(x94 & UINT8_C(0xff)); x96 = (uint8_t)(x94 >> 8); x97 = (x44 + (uint32_t)x96); x98 = (uint8_t)(x97 & UINT8_C(0xff)); x99 = (x97 >> 8); x100 = (uint8_t)(x99 & UINT8_C(0xff)); x101 = (x99 >> 8); x102 = (uint8_t)(x101 & UINT8_C(0xff)); x103 = (uint8_t)(x101 >> 8); x104 = (x43 + (uint32_t)x103); x105 = (uint8_t)(x104 & UINT8_C(0xff)); x106 = (x104 >> 8); x107 = (uint8_t)(x106 & UINT8_C(0xff)); x108 = (x106 >> 8); x109 = (uint8_t)(x108 & UINT8_C(0xff)); x110 = (uint8_t)(x108 >> 8); x111 = (x42 + (uint32_t)x110); x112 = (uint8_t)(x111 & UINT8_C(0xff)); 
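  /* NOTE (editorial): x42..x49 shift each reduced limb to its bit offset
     within the little-endian byte string, and the steps above and below peel
     off one byte at a time; the last byte (out1[31]) stays below 0x80 because
     the canonical value is less than 2^255. */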
x113 = (x111 >> 8); x114 = (uint8_t)(x113 & UINT8_C(0xff)); x115 = (x113 >> 8); x116 = (uint8_t)(x115 & UINT8_C(0xff)); x117 = (uint8_t)(x115 >> 8); out1[0] = x50; out1[1] = x52; out1[2] = x54; out1[3] = x57; out1[4] = x59; out1[5] = x61; out1[6] = x64; out1[7] = x66; out1[8] = x68; out1[9] = x71; out1[10] = x73; out1[11] = x75; out1[12] = x78; out1[13] = x80; out1[14] = x82; out1[15] = x83; out1[16] = x84; out1[17] = x86; out1[18] = x88; out1[19] = x91; out1[20] = x93; out1[21] = x95; out1[22] = x98; out1[23] = x100; out1[24] = x102; out1[25] = x105; out1[26] = x107; out1[27] = x109; out1[28] = x112; out1[29] = x114; out1[30] = x116; out1[31] = x117; } /* * The function fiat_25519_from_bytes deserializes a field element from bytes in little-endian order. * * Postconditions: * eval out1 mod m = bytes_eval arg1 mod m * * Input Bounds: * arg1: [[0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0x7f]] */ static FIAT_25519_FIAT_INLINE void fiat_25519_from_bytes(fiat_25519_tight_field_element out1, const uint8_t arg1[32]) { uint32_t x1; uint32_t x2; uint32_t x3; uint32_t x4; uint32_t x5; uint32_t x6; uint32_t x7; uint32_t x8; uint32_t x9; uint32_t x10; uint32_t x11; uint32_t x12; uint32_t x13; uint32_t x14; uint32_t x15; uint8_t x16; uint32_t x17; uint32_t x18; uint32_t x19; uint32_t x20; uint32_t x21; uint32_t x22; uint32_t x23; uint32_t x24; uint32_t x25; uint32_t x26; uint32_t x27; uint32_t x28; uint32_t x29; uint32_t x30; uint32_t x31; uint8_t x32; uint32_t x33; uint32_t x34; uint32_t x35; uint32_t x36; uint8_t x37; uint32_t x38; uint32_t x39; uint32_t x40; uint32_t x41; uint8_t x42; uint32_t x43; uint32_t x44; uint32_t x45; uint32_t x46; uint8_t x47; uint32_t x48; uint32_t x49; uint32_t x50; uint32_t x51; uint8_t x52; uint32_t x53; uint32_t x54; uint32_t x55; uint32_t x56; uint32_t x57; uint32_t x58; uint32_t x59; uint8_t x60; uint32_t x61; uint32_t x62; uint32_t x63; uint32_t x64; uint8_t x65; uint32_t x66; uint32_t x67; uint32_t x68; uint32_t x69; uint8_t x70; uint32_t x71; uint32_t x72; uint32_t x73; uint32_t x74; uint8_t x75; uint32_t x76; uint32_t x77; uint32_t x78; x1 = ((uint32_t)(arg1[31]) << 18); x2 = ((uint32_t)(arg1[30]) << 10); x3 = ((uint32_t)(arg1[29]) << 2); x4 = ((uint32_t)(arg1[28]) << 20); x5 = ((uint32_t)(arg1[27]) << 12); x6 = ((uint32_t)(arg1[26]) << 4); x7 = ((uint32_t)(arg1[25]) << 21); x8 = ((uint32_t)(arg1[24]) << 13); x9 = ((uint32_t)(arg1[23]) << 5); x10 = ((uint32_t)(arg1[22]) << 23); x11 = ((uint32_t)(arg1[21]) << 15); x12 = ((uint32_t)(arg1[20]) << 7); x13 = ((uint32_t)(arg1[19]) << 24); x14 = ((uint32_t)(arg1[18]) << 16); x15 = ((uint32_t)(arg1[17]) << 8); x16 = (arg1[16]); x17 = ((uint32_t)(arg1[15]) << 18); x18 = ((uint32_t)(arg1[14]) << 10); x19 = ((uint32_t)(arg1[13]) << 2); x20 = ((uint32_t)(arg1[12]) << 19); x21 = ((uint32_t)(arg1[11]) << 11); x22 = ((uint32_t)(arg1[10]) << 3); x23 = ((uint32_t)(arg1[9]) << 21); x24 = ((uint32_t)(arg1[8]) << 13); x25 = ((uint32_t)(arg1[7]) << 5); x26 = ((uint32_t)(arg1[6]) << 22); x27 = ((uint32_t)(arg1[5]) << 14); x28 = ((uint32_t)(arg1[4]) << 6); x29 = ((uint32_t)(arg1[3]) << 24); x30 = 
((uint32_t)(arg1[2]) << 16); x31 = ((uint32_t)(arg1[1]) << 8); x32 = (arg1[0]); x33 = (x31 + (uint32_t)x32); x34 = (x30 + x33); x35 = (x29 + x34); x36 = (x35 & UINT32_C(0x3ffffff)); x37 = (uint8_t)(x35 >> 26); x38 = (x28 + (uint32_t)x37); x39 = (x27 + x38); x40 = (x26 + x39); x41 = (x40 & UINT32_C(0x1ffffff)); x42 = (uint8_t)(x40 >> 25); x43 = (x25 + (uint32_t)x42); x44 = (x24 + x43); x45 = (x23 + x44); x46 = (x45 & UINT32_C(0x3ffffff)); x47 = (uint8_t)(x45 >> 26); x48 = (x22 + (uint32_t)x47); x49 = (x21 + x48); x50 = (x20 + x49); x51 = (x50 & UINT32_C(0x1ffffff)); x52 = (uint8_t)(x50 >> 25); x53 = (x19 + (uint32_t)x52); x54 = (x18 + x53); x55 = (x17 + x54); x56 = (x15 + (uint32_t)x16); x57 = (x14 + x56); x58 = (x13 + x57); x59 = (x58 & UINT32_C(0x1ffffff)); x60 = (uint8_t)(x58 >> 25); x61 = (x12 + (uint32_t)x60); x62 = (x11 + x61); x63 = (x10 + x62); x64 = (x63 & UINT32_C(0x3ffffff)); x65 = (uint8_t)(x63 >> 26); x66 = (x9 + (uint32_t)x65); x67 = (x8 + x66); x68 = (x7 + x67); x69 = (x68 & UINT32_C(0x1ffffff)); x70 = (uint8_t)(x68 >> 25); x71 = (x6 + (uint32_t)x70); x72 = (x5 + x71); x73 = (x4 + x72); x74 = (x73 & UINT32_C(0x3ffffff)); x75 = (uint8_t)(x73 >> 26); x76 = (x3 + (uint32_t)x75); x77 = (x2 + x76); x78 = (x1 + x77); out1[0] = x36; out1[1] = x41; out1[2] = x46; out1[3] = x51; out1[4] = x55; out1[5] = x59; out1[6] = x64; out1[7] = x69; out1[8] = x74; out1[9] = x78; } /* * The function fiat_25519_carry_scmul_121666 multiplies a field element by 121666 and reduces the result. * * Postconditions: * eval out1 mod m = (121666 * eval arg1) mod m * */ static FIAT_25519_FIAT_INLINE void fiat_25519_carry_scmul_121666(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1) { uint64_t x1; uint64_t x2; uint64_t x3; uint64_t x4; uint64_t x5; uint64_t x6; uint64_t x7; uint64_t x8; uint64_t x9; uint64_t x10; uint32_t x11; uint32_t x12; uint64_t x13; uint32_t x14; uint32_t x15; uint64_t x16; uint32_t x17; uint32_t x18; uint64_t x19; uint32_t x20; uint32_t x21; uint64_t x22; uint32_t x23; uint32_t x24; uint64_t x25; uint32_t x26; uint32_t x27; uint64_t x28; uint32_t x29; uint32_t x30; uint64_t x31; uint32_t x32; uint32_t x33; uint64_t x34; uint32_t x35; uint32_t x36; uint64_t x37; uint32_t x38; uint32_t x39; uint32_t x40; uint32_t x41; fiat_25519_uint1 x42; uint32_t x43; uint32_t x44; fiat_25519_uint1 x45; uint32_t x46; uint32_t x47; x1 = ((uint64_t)UINT32_C(0x1db42) * (arg1[9])); x2 = ((uint64_t)UINT32_C(0x1db42) * (arg1[8])); x3 = ((uint64_t)UINT32_C(0x1db42) * (arg1[7])); x4 = ((uint64_t)UINT32_C(0x1db42) * (arg1[6])); x5 = ((uint64_t)UINT32_C(0x1db42) * (arg1[5])); x6 = ((uint64_t)UINT32_C(0x1db42) * (arg1[4])); x7 = ((uint64_t)UINT32_C(0x1db42) * (arg1[3])); x8 = ((uint64_t)UINT32_C(0x1db42) * (arg1[2])); x9 = ((uint64_t)UINT32_C(0x1db42) * (arg1[1])); x10 = ((uint64_t)UINT32_C(0x1db42) * (arg1[0])); x11 = (uint32_t)(x10 >> 26); x12 = (uint32_t)(x10 & UINT32_C(0x3ffffff)); x13 = (x11 + x9); x14 = (uint32_t)(x13 >> 25); x15 = (uint32_t)(x13 & UINT32_C(0x1ffffff)); x16 = (x14 + x8); x17 = (uint32_t)(x16 >> 26); x18 = (uint32_t)(x16 & UINT32_C(0x3ffffff)); x19 = (x17 + x7); x20 = (uint32_t)(x19 >> 25); x21 = (uint32_t)(x19 & UINT32_C(0x1ffffff)); x22 = (x20 + x6); x23 = (uint32_t)(x22 >> 26); x24 = (uint32_t)(x22 & UINT32_C(0x3ffffff)); x25 = (x23 + x5); x26 = (uint32_t)(x25 >> 25); x27 = (uint32_t)(x25 & UINT32_C(0x1ffffff)); x28 = (x26 + x4); x29 = (uint32_t)(x28 >> 26); x30 = (uint32_t)(x28 & UINT32_C(0x3ffffff)); x31 = (x29 + x3); x32 = (uint32_t)(x31 >> 25); x33 = 
(uint32_t)(x31 & UINT32_C(0x1ffffff)); x34 = (x32 + x2); x35 = (uint32_t)(x34 >> 26); x36 = (uint32_t)(x34 & UINT32_C(0x3ffffff)); x37 = (x35 + x1); x38 = (uint32_t)(x37 >> 25); x39 = (uint32_t)(x37 & UINT32_C(0x1ffffff)); x40 = (x38 * UINT8_C(0x13)); x41 = (x12 + x40); x42 = (fiat_25519_uint1)(x41 >> 26); x43 = (x41 & UINT32_C(0x3ffffff)); x44 = (x42 + x15); x45 = (fiat_25519_uint1)(x44 >> 25); x46 = (x44 & UINT32_C(0x1ffffff)); x47 = (x45 + x18); out1[0] = x43; out1[1] = x46; out1[2] = x47; out1[3] = x21; out1[4] = x24; out1[5] = x27; out1[6] = x30; out1[7] = x33; out1[8] = x36; out1[9] = x39; } ring-0.17.14/third_party/fiat/curve25519_64.h000064400000000000000000000656401046102023000164400ustar 00000000000000/* Autogenerated: 'src/ExtractionOCaml/unsaturated_solinas' --inline --static --use-value-barrier 25519 64 '(auto)' '2^255 - 19' carry_mul carry_square carry add sub opp selectznz to_bytes from_bytes relax carry_scmul121666 */ /* curve description: 25519 */ /* machine_wordsize = 64 (from "64") */ /* requested operations: carry_mul, carry_square, carry, add, sub, opp, selectznz, to_bytes, from_bytes, relax, carry_scmul121666 */ /* n = 5 (from "(auto)") */ /* s-c = 2^255 - [(1, 19)] (from "2^255 - 19") */ /* tight_bounds_multiplier = 1 (from "") */ /* */ /* Computed values: */ /* carry_chain = [0, 1, 2, 3, 4, 0, 1] */ /* eval z = z[0] + (z[1] << 51) + (z[2] << 102) + (z[3] << 153) + (z[4] << 204) */ /* bytes_eval z = z[0] + (z[1] << 8) + (z[2] << 16) + (z[3] << 24) + (z[4] << 32) + (z[5] << 40) + (z[6] << 48) + (z[7] << 56) + (z[8] << 64) + (z[9] << 72) + (z[10] << 80) + (z[11] << 88) + (z[12] << 96) + (z[13] << 104) + (z[14] << 112) + (z[15] << 120) + (z[16] << 128) + (z[17] << 136) + (z[18] << 144) + (z[19] << 152) + (z[20] << 160) + (z[21] << 168) + (z[22] << 176) + (z[23] << 184) + (z[24] << 192) + (z[25] << 200) + (z[26] << 208) + (z[27] << 216) + (z[28] << 224) + (z[29] << 232) + (z[30] << 240) + (z[31] << 248) */ /* balance = [0xfffffffffffda, 0xffffffffffffe, 0xffffffffffffe, 0xffffffffffffe, 0xffffffffffffe] */ #include typedef unsigned char fiat_25519_uint1; typedef signed char fiat_25519_int1; #if defined(__GNUC__) || defined(__clang__) # define FIAT_25519_FIAT_EXTENSION __extension__ # define FIAT_25519_FIAT_INLINE __inline__ #else # define FIAT_25519_FIAT_EXTENSION # define FIAT_25519_FIAT_INLINE #endif FIAT_25519_FIAT_EXTENSION typedef signed __int128 fiat_25519_int128; FIAT_25519_FIAT_EXTENSION typedef unsigned __int128 fiat_25519_uint128; /* The type fiat_25519_loose_field_element is a field element with loose bounds. */ /* Bounds: [[0x0 ~> 0x18000000000000], [0x0 ~> 0x18000000000000], [0x0 ~> 0x18000000000000], [0x0 ~> 0x18000000000000], [0x0 ~> 0x18000000000000]] */ typedef uint64_t fiat_25519_loose_field_element[5]; /* The type fiat_25519_tight_field_element is a field element with tight bounds. */ /* Bounds: [[0x0 ~> 0x8000000000000], [0x0 ~> 0x8000000000000], [0x0 ~> 0x8000000000000], [0x0 ~> 0x8000000000000], [0x0 ~> 0x8000000000000]] */ typedef uint64_t fiat_25519_tight_field_element[5]; #if (-1 & 3) != 3 #error "This code only works on a two's complement system" #endif #if !defined(FIAT_25519_NO_ASM) && (defined(__GNUC__) || defined(__clang__)) static __inline__ uint64_t fiat_25519_value_barrier_u64(uint64_t a) { __asm__("" : "+r"(a) : /* no inputs */); return a; } #else # define fiat_25519_value_barrier_u64(x) (x) #endif /* * The function fiat_25519_addcarryx_u51 is an addition with carry. 
* * Postconditions: * out1 = (arg1 + arg2 + arg3) mod 2^51 * out2 = ⌊(arg1 + arg2 + arg3) / 2^51⌋ * * Input Bounds: * arg1: [0x0 ~> 0x1] * arg2: [0x0 ~> 0x7ffffffffffff] * arg3: [0x0 ~> 0x7ffffffffffff] * Output Bounds: * out1: [0x0 ~> 0x7ffffffffffff] * out2: [0x0 ~> 0x1] */ static FIAT_25519_FIAT_INLINE void fiat_25519_addcarryx_u51(uint64_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint64_t arg2, uint64_t arg3) { uint64_t x1; uint64_t x2; fiat_25519_uint1 x3; x1 = ((arg1 + arg2) + arg3); x2 = (x1 & UINT64_C(0x7ffffffffffff)); x3 = (fiat_25519_uint1)(x1 >> 51); *out1 = x2; *out2 = x3; } /* * The function fiat_25519_subborrowx_u51 is a subtraction with borrow. * * Postconditions: * out1 = (-arg1 + arg2 + -arg3) mod 2^51 * out2 = -⌊(-arg1 + arg2 + -arg3) / 2^51⌋ * * Input Bounds: * arg1: [0x0 ~> 0x1] * arg2: [0x0 ~> 0x7ffffffffffff] * arg3: [0x0 ~> 0x7ffffffffffff] * Output Bounds: * out1: [0x0 ~> 0x7ffffffffffff] * out2: [0x0 ~> 0x1] */ static FIAT_25519_FIAT_INLINE void fiat_25519_subborrowx_u51(uint64_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint64_t arg2, uint64_t arg3) { int64_t x1; fiat_25519_int1 x2; uint64_t x3; x1 = ((int64_t)(arg2 - (int64_t)arg1) - (int64_t)arg3); x2 = (fiat_25519_int1)(x1 >> 51); x3 = (x1 & UINT64_C(0x7ffffffffffff)); *out1 = x3; *out2 = (fiat_25519_uint1)(0x0 - x2); } /* * The function fiat_25519_cmovznz_u64 is a single-word conditional move. * * Postconditions: * out1 = (if arg1 = 0 then arg2 else arg3) * * Input Bounds: * arg1: [0x0 ~> 0x1] * arg2: [0x0 ~> 0xffffffffffffffff] * arg3: [0x0 ~> 0xffffffffffffffff] * Output Bounds: * out1: [0x0 ~> 0xffffffffffffffff] */ static FIAT_25519_FIAT_INLINE void fiat_25519_cmovznz_u64(uint64_t* out1, fiat_25519_uint1 arg1, uint64_t arg2, uint64_t arg3) { fiat_25519_uint1 x1; uint64_t x2; uint64_t x3; x1 = (!(!arg1)); x2 = ((fiat_25519_int1)(0x0 - x1) & UINT64_C(0xffffffffffffffff)); x3 = ((fiat_25519_value_barrier_u64(x2) & arg3) | (fiat_25519_value_barrier_u64((~x2)) & arg2)); *out1 = x3; } /* * The function fiat_25519_carry_mul multiplies two field elements and reduces the result. 
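 * Editorial note, not part of the generated text: this is the 5-limb,
 * radix-2^51 variant; cross products whose combined weight reaches 2^255 are
 * pre-scaled by 19, since 2^255 = 19 (mod m). A minimal caller-side sketch,
 * assuming a and b already meet the loose bounds and are filled in elsewhere:
 *
 *   fiat_25519_loose_field_element a, b;   // inputs with loose bounds
 *   fiat_25519_tight_field_element r;
 *   fiat_25519_carry_mul(r, a, b);         // r == (a * b) mod (2^255 - 19)
 *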
* * Postconditions: * eval out1 mod m = (eval arg1 * eval arg2) mod m * */ static FIAT_25519_FIAT_INLINE void fiat_25519_carry_mul(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1, const fiat_25519_loose_field_element arg2) { fiat_25519_uint128 x1; fiat_25519_uint128 x2; fiat_25519_uint128 x3; fiat_25519_uint128 x4; fiat_25519_uint128 x5; fiat_25519_uint128 x6; fiat_25519_uint128 x7; fiat_25519_uint128 x8; fiat_25519_uint128 x9; fiat_25519_uint128 x10; fiat_25519_uint128 x11; fiat_25519_uint128 x12; fiat_25519_uint128 x13; fiat_25519_uint128 x14; fiat_25519_uint128 x15; fiat_25519_uint128 x16; fiat_25519_uint128 x17; fiat_25519_uint128 x18; fiat_25519_uint128 x19; fiat_25519_uint128 x20; fiat_25519_uint128 x21; fiat_25519_uint128 x22; fiat_25519_uint128 x23; fiat_25519_uint128 x24; fiat_25519_uint128 x25; fiat_25519_uint128 x26; uint64_t x27; uint64_t x28; fiat_25519_uint128 x29; fiat_25519_uint128 x30; fiat_25519_uint128 x31; fiat_25519_uint128 x32; fiat_25519_uint128 x33; uint64_t x34; uint64_t x35; fiat_25519_uint128 x36; uint64_t x37; uint64_t x38; fiat_25519_uint128 x39; uint64_t x40; uint64_t x41; fiat_25519_uint128 x42; uint64_t x43; uint64_t x44; uint64_t x45; uint64_t x46; uint64_t x47; uint64_t x48; uint64_t x49; fiat_25519_uint1 x50; uint64_t x51; uint64_t x52; x1 = ((fiat_25519_uint128)(arg1[4]) * ((arg2[4]) * UINT8_C(0x13))); x2 = ((fiat_25519_uint128)(arg1[4]) * ((arg2[3]) * UINT8_C(0x13))); x3 = ((fiat_25519_uint128)(arg1[4]) * ((arg2[2]) * UINT8_C(0x13))); x4 = ((fiat_25519_uint128)(arg1[4]) * ((arg2[1]) * UINT8_C(0x13))); x5 = ((fiat_25519_uint128)(arg1[3]) * ((arg2[4]) * UINT8_C(0x13))); x6 = ((fiat_25519_uint128)(arg1[3]) * ((arg2[3]) * UINT8_C(0x13))); x7 = ((fiat_25519_uint128)(arg1[3]) * ((arg2[2]) * UINT8_C(0x13))); x8 = ((fiat_25519_uint128)(arg1[2]) * ((arg2[4]) * UINT8_C(0x13))); x9 = ((fiat_25519_uint128)(arg1[2]) * ((arg2[3]) * UINT8_C(0x13))); x10 = ((fiat_25519_uint128)(arg1[1]) * ((arg2[4]) * UINT8_C(0x13))); x11 = ((fiat_25519_uint128)(arg1[4]) * (arg2[0])); x12 = ((fiat_25519_uint128)(arg1[3]) * (arg2[1])); x13 = ((fiat_25519_uint128)(arg1[3]) * (arg2[0])); x14 = ((fiat_25519_uint128)(arg1[2]) * (arg2[2])); x15 = ((fiat_25519_uint128)(arg1[2]) * (arg2[1])); x16 = ((fiat_25519_uint128)(arg1[2]) * (arg2[0])); x17 = ((fiat_25519_uint128)(arg1[1]) * (arg2[3])); x18 = ((fiat_25519_uint128)(arg1[1]) * (arg2[2])); x19 = ((fiat_25519_uint128)(arg1[1]) * (arg2[1])); x20 = ((fiat_25519_uint128)(arg1[1]) * (arg2[0])); x21 = ((fiat_25519_uint128)(arg1[0]) * (arg2[4])); x22 = ((fiat_25519_uint128)(arg1[0]) * (arg2[3])); x23 = ((fiat_25519_uint128)(arg1[0]) * (arg2[2])); x24 = ((fiat_25519_uint128)(arg1[0]) * (arg2[1])); x25 = ((fiat_25519_uint128)(arg1[0]) * (arg2[0])); x26 = (x25 + (x10 + (x9 + (x7 + x4)))); x27 = (uint64_t)(x26 >> 51); x28 = (uint64_t)(x26 & UINT64_C(0x7ffffffffffff)); x29 = (x21 + (x17 + (x14 + (x12 + x11)))); x30 = (x22 + (x18 + (x15 + (x13 + x1)))); x31 = (x23 + (x19 + (x16 + (x5 + x2)))); x32 = (x24 + (x20 + (x8 + (x6 + x3)))); x33 = (x27 + x32); x34 = (uint64_t)(x33 >> 51); x35 = (uint64_t)(x33 & UINT64_C(0x7ffffffffffff)); x36 = (x34 + x31); x37 = (uint64_t)(x36 >> 51); x38 = (uint64_t)(x36 & UINT64_C(0x7ffffffffffff)); x39 = (x37 + x30); x40 = (uint64_t)(x39 >> 51); x41 = (uint64_t)(x39 & UINT64_C(0x7ffffffffffff)); x42 = (x40 + x29); x43 = (uint64_t)(x42 >> 51); x44 = (uint64_t)(x42 & UINT64_C(0x7ffffffffffff)); x45 = (x43 * UINT8_C(0x13)); x46 = (x28 + x45); x47 = (x46 >> 51); x48 = (x46 & 
UINT64_C(0x7ffffffffffff)); x49 = (x47 + x35); x50 = (fiat_25519_uint1)(x49 >> 51); x51 = (x49 & UINT64_C(0x7ffffffffffff)); x52 = (x50 + x38); out1[0] = x48; out1[1] = x51; out1[2] = x52; out1[3] = x41; out1[4] = x44; } /* * The function fiat_25519_carry_square squares a field element and reduces the result. * * Postconditions: * eval out1 mod m = (eval arg1 * eval arg1) mod m * */ static FIAT_25519_FIAT_INLINE void fiat_25519_carry_square(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1) { uint64_t x1; uint64_t x2; uint64_t x3; uint64_t x4; uint64_t x5; uint64_t x6; uint64_t x7; uint64_t x8; fiat_25519_uint128 x9; fiat_25519_uint128 x10; fiat_25519_uint128 x11; fiat_25519_uint128 x12; fiat_25519_uint128 x13; fiat_25519_uint128 x14; fiat_25519_uint128 x15; fiat_25519_uint128 x16; fiat_25519_uint128 x17; fiat_25519_uint128 x18; fiat_25519_uint128 x19; fiat_25519_uint128 x20; fiat_25519_uint128 x21; fiat_25519_uint128 x22; fiat_25519_uint128 x23; fiat_25519_uint128 x24; uint64_t x25; uint64_t x26; fiat_25519_uint128 x27; fiat_25519_uint128 x28; fiat_25519_uint128 x29; fiat_25519_uint128 x30; fiat_25519_uint128 x31; uint64_t x32; uint64_t x33; fiat_25519_uint128 x34; uint64_t x35; uint64_t x36; fiat_25519_uint128 x37; uint64_t x38; uint64_t x39; fiat_25519_uint128 x40; uint64_t x41; uint64_t x42; uint64_t x43; uint64_t x44; uint64_t x45; uint64_t x46; uint64_t x47; fiat_25519_uint1 x48; uint64_t x49; uint64_t x50; x1 = ((arg1[4]) * UINT8_C(0x13)); x2 = (x1 * 0x2); x3 = ((arg1[4]) * 0x2); x4 = ((arg1[3]) * UINT8_C(0x13)); x5 = (x4 * 0x2); x6 = ((arg1[3]) * 0x2); x7 = ((arg1[2]) * 0x2); x8 = ((arg1[1]) * 0x2); x9 = ((fiat_25519_uint128)(arg1[4]) * x1); x10 = ((fiat_25519_uint128)(arg1[3]) * x2); x11 = ((fiat_25519_uint128)(arg1[3]) * x4); x12 = ((fiat_25519_uint128)(arg1[2]) * x2); x13 = ((fiat_25519_uint128)(arg1[2]) * x5); x14 = ((fiat_25519_uint128)(arg1[2]) * (arg1[2])); x15 = ((fiat_25519_uint128)(arg1[1]) * x2); x16 = ((fiat_25519_uint128)(arg1[1]) * x6); x17 = ((fiat_25519_uint128)(arg1[1]) * x7); x18 = ((fiat_25519_uint128)(arg1[1]) * (arg1[1])); x19 = ((fiat_25519_uint128)(arg1[0]) * x3); x20 = ((fiat_25519_uint128)(arg1[0]) * x6); x21 = ((fiat_25519_uint128)(arg1[0]) * x7); x22 = ((fiat_25519_uint128)(arg1[0]) * x8); x23 = ((fiat_25519_uint128)(arg1[0]) * (arg1[0])); x24 = (x23 + (x15 + x13)); x25 = (uint64_t)(x24 >> 51); x26 = (uint64_t)(x24 & UINT64_C(0x7ffffffffffff)); x27 = (x19 + (x16 + x14)); x28 = (x20 + (x17 + x9)); x29 = (x21 + (x18 + x10)); x30 = (x22 + (x12 + x11)); x31 = (x25 + x30); x32 = (uint64_t)(x31 >> 51); x33 = (uint64_t)(x31 & UINT64_C(0x7ffffffffffff)); x34 = (x32 + x29); x35 = (uint64_t)(x34 >> 51); x36 = (uint64_t)(x34 & UINT64_C(0x7ffffffffffff)); x37 = (x35 + x28); x38 = (uint64_t)(x37 >> 51); x39 = (uint64_t)(x37 & UINT64_C(0x7ffffffffffff)); x40 = (x38 + x27); x41 = (uint64_t)(x40 >> 51); x42 = (uint64_t)(x40 & UINT64_C(0x7ffffffffffff)); x43 = (x41 * UINT8_C(0x13)); x44 = (x26 + x43); x45 = (x44 >> 51); x46 = (x44 & UINT64_C(0x7ffffffffffff)); x47 = (x45 + x33); x48 = (fiat_25519_uint1)(x47 >> 51); x49 = (x47 & UINT64_C(0x7ffffffffffff)); x50 = (x48 + x36); out1[0] = x46; out1[1] = x49; out1[2] = x50; out1[3] = x39; out1[4] = x42; } /* * The function fiat_25519_carry reduces a field element. 
* * Postconditions: * eval out1 mod m = eval arg1 mod m * */ static FIAT_25519_FIAT_INLINE void fiat_25519_carry(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1) { uint64_t x1; uint64_t x2; uint64_t x3; uint64_t x4; uint64_t x5; uint64_t x6; uint64_t x7; uint64_t x8; uint64_t x9; uint64_t x10; uint64_t x11; uint64_t x12; x1 = (arg1[0]); x2 = ((x1 >> 51) + (arg1[1])); x3 = ((x2 >> 51) + (arg1[2])); x4 = ((x3 >> 51) + (arg1[3])); x5 = ((x4 >> 51) + (arg1[4])); x6 = ((x1 & UINT64_C(0x7ffffffffffff)) + ((x5 >> 51) * UINT8_C(0x13))); x7 = ((fiat_25519_uint1)(x6 >> 51) + (x2 & UINT64_C(0x7ffffffffffff))); x8 = (x6 & UINT64_C(0x7ffffffffffff)); x9 = (x7 & UINT64_C(0x7ffffffffffff)); x10 = ((fiat_25519_uint1)(x7 >> 51) + (x3 & UINT64_C(0x7ffffffffffff))); x11 = (x4 & UINT64_C(0x7ffffffffffff)); x12 = (x5 & UINT64_C(0x7ffffffffffff)); out1[0] = x8; out1[1] = x9; out1[2] = x10; out1[3] = x11; out1[4] = x12; } /* * The function fiat_25519_add adds two field elements. * * Postconditions: * eval out1 mod m = (eval arg1 + eval arg2) mod m * */ static FIAT_25519_FIAT_INLINE void fiat_25519_add(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1, const fiat_25519_tight_field_element arg2) { uint64_t x1; uint64_t x2; uint64_t x3; uint64_t x4; uint64_t x5; x1 = ((arg1[0]) + (arg2[0])); x2 = ((arg1[1]) + (arg2[1])); x3 = ((arg1[2]) + (arg2[2])); x4 = ((arg1[3]) + (arg2[3])); x5 = ((arg1[4]) + (arg2[4])); out1[0] = x1; out1[1] = x2; out1[2] = x3; out1[3] = x4; out1[4] = x5; } /* * The function fiat_25519_sub subtracts two field elements. * * Postconditions: * eval out1 mod m = (eval arg1 - eval arg2) mod m * */ static FIAT_25519_FIAT_INLINE void fiat_25519_sub(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1, const fiat_25519_tight_field_element arg2) { uint64_t x1; uint64_t x2; uint64_t x3; uint64_t x4; uint64_t x5; x1 = ((UINT64_C(0xfffffffffffda) + (arg1[0])) - (arg2[0])); x2 = ((UINT64_C(0xffffffffffffe) + (arg1[1])) - (arg2[1])); x3 = ((UINT64_C(0xffffffffffffe) + (arg1[2])) - (arg2[2])); x4 = ((UINT64_C(0xffffffffffffe) + (arg1[3])) - (arg2[3])); x5 = ((UINT64_C(0xffffffffffffe) + (arg1[4])) - (arg2[4])); out1[0] = x1; out1[1] = x2; out1[2] = x3; out1[3] = x4; out1[4] = x5; } /* * The function fiat_25519_opp negates a field element. * * Postconditions: * eval out1 mod m = -eval arg1 mod m * */ static FIAT_25519_FIAT_INLINE void fiat_25519_opp(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1) { uint64_t x1; uint64_t x2; uint64_t x3; uint64_t x4; uint64_t x5; x1 = (UINT64_C(0xfffffffffffda) - (arg1[0])); x2 = (UINT64_C(0xffffffffffffe) - (arg1[1])); x3 = (UINT64_C(0xffffffffffffe) - (arg1[2])); x4 = (UINT64_C(0xffffffffffffe) - (arg1[3])); x5 = (UINT64_C(0xffffffffffffe) - (arg1[4])); out1[0] = x1; out1[1] = x2; out1[2] = x3; out1[3] = x4; out1[4] = x5; } /* * The function fiat_25519_to_bytes serializes a field element to bytes in little-endian order. 
* * Postconditions: * out1 = map (λ x, ⌊((eval arg1 mod m) mod 2^(8 * (x + 1))) / 2^(8 * x)⌋) [0..31] * * Output Bounds: * out1: [[0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0x7f]] */ static FIAT_25519_FIAT_INLINE void fiat_25519_to_bytes(uint8_t out1[32], const fiat_25519_tight_field_element arg1) { uint64_t x1; fiat_25519_uint1 x2; uint64_t x3; fiat_25519_uint1 x4; uint64_t x5; fiat_25519_uint1 x6; uint64_t x7; fiat_25519_uint1 x8; uint64_t x9; fiat_25519_uint1 x10; uint64_t x11; uint64_t x12; fiat_25519_uint1 x13; uint64_t x14; fiat_25519_uint1 x15; uint64_t x16; fiat_25519_uint1 x17; uint64_t x18; fiat_25519_uint1 x19; uint64_t x20; fiat_25519_uint1 x21; uint64_t x22; uint64_t x23; uint64_t x24; uint64_t x25; uint8_t x26; uint64_t x27; uint8_t x28; uint64_t x29; uint8_t x30; uint64_t x31; uint8_t x32; uint64_t x33; uint8_t x34; uint64_t x35; uint8_t x36; uint8_t x37; uint64_t x38; uint8_t x39; uint64_t x40; uint8_t x41; uint64_t x42; uint8_t x43; uint64_t x44; uint8_t x45; uint64_t x46; uint8_t x47; uint64_t x48; uint8_t x49; uint8_t x50; uint64_t x51; uint8_t x52; uint64_t x53; uint8_t x54; uint64_t x55; uint8_t x56; uint64_t x57; uint8_t x58; uint64_t x59; uint8_t x60; uint64_t x61; uint8_t x62; uint64_t x63; uint8_t x64; fiat_25519_uint1 x65; uint64_t x66; uint8_t x67; uint64_t x68; uint8_t x69; uint64_t x70; uint8_t x71; uint64_t x72; uint8_t x73; uint64_t x74; uint8_t x75; uint64_t x76; uint8_t x77; uint8_t x78; uint64_t x79; uint8_t x80; uint64_t x81; uint8_t x82; uint64_t x83; uint8_t x84; uint64_t x85; uint8_t x86; uint64_t x87; uint8_t x88; uint64_t x89; uint8_t x90; uint8_t x91; fiat_25519_subborrowx_u51(&x1, &x2, 0x0, (arg1[0]), UINT64_C(0x7ffffffffffed)); fiat_25519_subborrowx_u51(&x3, &x4, x2, (arg1[1]), UINT64_C(0x7ffffffffffff)); fiat_25519_subborrowx_u51(&x5, &x6, x4, (arg1[2]), UINT64_C(0x7ffffffffffff)); fiat_25519_subborrowx_u51(&x7, &x8, x6, (arg1[3]), UINT64_C(0x7ffffffffffff)); fiat_25519_subborrowx_u51(&x9, &x10, x8, (arg1[4]), UINT64_C(0x7ffffffffffff)); fiat_25519_cmovznz_u64(&x11, x10, 0x0, UINT64_C(0xffffffffffffffff)); fiat_25519_addcarryx_u51(&x12, &x13, 0x0, x1, (x11 & UINT64_C(0x7ffffffffffed))); fiat_25519_addcarryx_u51(&x14, &x15, x13, x3, (x11 & UINT64_C(0x7ffffffffffff))); fiat_25519_addcarryx_u51(&x16, &x17, x15, x5, (x11 & UINT64_C(0x7ffffffffffff))); fiat_25519_addcarryx_u51(&x18, &x19, x17, x7, (x11 & UINT64_C(0x7ffffffffffff))); fiat_25519_addcarryx_u51(&x20, &x21, x19, x9, (x11 & UINT64_C(0x7ffffffffffff))); x22 = (x20 << 4); x23 = (x18 * (uint64_t)0x2); x24 = (x16 << 6); x25 = (x14 << 3); x26 = (uint8_t)(x12 & UINT8_C(0xff)); x27 = (x12 >> 8); x28 = (uint8_t)(x27 & UINT8_C(0xff)); x29 = (x27 >> 8); x30 = (uint8_t)(x29 & UINT8_C(0xff)); x31 = (x29 >> 8); x32 = (uint8_t)(x31 & UINT8_C(0xff)); x33 = (x31 >> 8); x34 = (uint8_t)(x33 & UINT8_C(0xff)); x35 = (x33 >> 8); x36 = (uint8_t)(x35 & UINT8_C(0xff)); x37 = (uint8_t)(x35 >> 8); x38 = (x25 + (uint64_t)x37); x39 = (uint8_t)(x38 & UINT8_C(0xff)); x40 = (x38 >> 8); x41 = (uint8_t)(x40 & UINT8_C(0xff)); x42 = (x40 >> 8); x43 = (uint8_t)(x42 & UINT8_C(0xff)); x44 = (x42 >> 8); x45 
= (uint8_t)(x44 & UINT8_C(0xff)); x46 = (x44 >> 8); x47 = (uint8_t)(x46 & UINT8_C(0xff)); x48 = (x46 >> 8); x49 = (uint8_t)(x48 & UINT8_C(0xff)); x50 = (uint8_t)(x48 >> 8); x51 = (x24 + (uint64_t)x50); x52 = (uint8_t)(x51 & UINT8_C(0xff)); x53 = (x51 >> 8); x54 = (uint8_t)(x53 & UINT8_C(0xff)); x55 = (x53 >> 8); x56 = (uint8_t)(x55 & UINT8_C(0xff)); x57 = (x55 >> 8); x58 = (uint8_t)(x57 & UINT8_C(0xff)); x59 = (x57 >> 8); x60 = (uint8_t)(x59 & UINT8_C(0xff)); x61 = (x59 >> 8); x62 = (uint8_t)(x61 & UINT8_C(0xff)); x63 = (x61 >> 8); x64 = (uint8_t)(x63 & UINT8_C(0xff)); x65 = (fiat_25519_uint1)(x63 >> 8); x66 = (x23 + (uint64_t)x65); x67 = (uint8_t)(x66 & UINT8_C(0xff)); x68 = (x66 >> 8); x69 = (uint8_t)(x68 & UINT8_C(0xff)); x70 = (x68 >> 8); x71 = (uint8_t)(x70 & UINT8_C(0xff)); x72 = (x70 >> 8); x73 = (uint8_t)(x72 & UINT8_C(0xff)); x74 = (x72 >> 8); x75 = (uint8_t)(x74 & UINT8_C(0xff)); x76 = (x74 >> 8); x77 = (uint8_t)(x76 & UINT8_C(0xff)); x78 = (uint8_t)(x76 >> 8); x79 = (x22 + (uint64_t)x78); x80 = (uint8_t)(x79 & UINT8_C(0xff)); x81 = (x79 >> 8); x82 = (uint8_t)(x81 & UINT8_C(0xff)); x83 = (x81 >> 8); x84 = (uint8_t)(x83 & UINT8_C(0xff)); x85 = (x83 >> 8); x86 = (uint8_t)(x85 & UINT8_C(0xff)); x87 = (x85 >> 8); x88 = (uint8_t)(x87 & UINT8_C(0xff)); x89 = (x87 >> 8); x90 = (uint8_t)(x89 & UINT8_C(0xff)); x91 = (uint8_t)(x89 >> 8); out1[0] = x26; out1[1] = x28; out1[2] = x30; out1[3] = x32; out1[4] = x34; out1[5] = x36; out1[6] = x39; out1[7] = x41; out1[8] = x43; out1[9] = x45; out1[10] = x47; out1[11] = x49; out1[12] = x52; out1[13] = x54; out1[14] = x56; out1[15] = x58; out1[16] = x60; out1[17] = x62; out1[18] = x64; out1[19] = x67; out1[20] = x69; out1[21] = x71; out1[22] = x73; out1[23] = x75; out1[24] = x77; out1[25] = x80; out1[26] = x82; out1[27] = x84; out1[28] = x86; out1[29] = x88; out1[30] = x90; out1[31] = x91; } /* * The function fiat_25519_from_bytes deserializes a field element from bytes in little-endian order. 
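 * Editorial note, not part of the generated text: each input byte is shifted
 * to its bit position and the 255-bit value is split into five 51-bit limbs;
 * bytes that straddle a limb boundary are carried into the next limb by the
 * explicit >> 51 extractions, so no separate carry pass is needed afterwards.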
* * Postconditions: * eval out1 mod m = bytes_eval arg1 mod m * * Input Bounds: * arg1: [[0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0x7f]] */ static FIAT_25519_FIAT_INLINE void fiat_25519_from_bytes(fiat_25519_tight_field_element out1, const uint8_t arg1[32]) { uint64_t x1; uint64_t x2; uint64_t x3; uint64_t x4; uint64_t x5; uint64_t x6; uint64_t x7; uint64_t x8; uint64_t x9; uint64_t x10; uint64_t x11; uint64_t x12; uint64_t x13; uint64_t x14; uint64_t x15; uint64_t x16; uint64_t x17; uint64_t x18; uint64_t x19; uint64_t x20; uint64_t x21; uint64_t x22; uint64_t x23; uint64_t x24; uint64_t x25; uint64_t x26; uint64_t x27; uint64_t x28; uint64_t x29; uint64_t x30; uint64_t x31; uint8_t x32; uint64_t x33; uint64_t x34; uint64_t x35; uint64_t x36; uint64_t x37; uint64_t x38; uint64_t x39; uint8_t x40; uint64_t x41; uint64_t x42; uint64_t x43; uint64_t x44; uint64_t x45; uint64_t x46; uint64_t x47; uint8_t x48; uint64_t x49; uint64_t x50; uint64_t x51; uint64_t x52; uint64_t x53; uint64_t x54; uint64_t x55; uint64_t x56; uint8_t x57; uint64_t x58; uint64_t x59; uint64_t x60; uint64_t x61; uint64_t x62; uint64_t x63; uint64_t x64; uint8_t x65; uint64_t x66; uint64_t x67; uint64_t x68; uint64_t x69; uint64_t x70; uint64_t x71; x1 = ((uint64_t)(arg1[31]) << 44); x2 = ((uint64_t)(arg1[30]) << 36); x3 = ((uint64_t)(arg1[29]) << 28); x4 = ((uint64_t)(arg1[28]) << 20); x5 = ((uint64_t)(arg1[27]) << 12); x6 = ((uint64_t)(arg1[26]) << 4); x7 = ((uint64_t)(arg1[25]) << 47); x8 = ((uint64_t)(arg1[24]) << 39); x9 = ((uint64_t)(arg1[23]) << 31); x10 = ((uint64_t)(arg1[22]) << 23); x11 = ((uint64_t)(arg1[21]) << 15); x12 = ((uint64_t)(arg1[20]) << 7); x13 = ((uint64_t)(arg1[19]) << 50); x14 = ((uint64_t)(arg1[18]) << 42); x15 = ((uint64_t)(arg1[17]) << 34); x16 = ((uint64_t)(arg1[16]) << 26); x17 = ((uint64_t)(arg1[15]) << 18); x18 = ((uint64_t)(arg1[14]) << 10); x19 = ((uint64_t)(arg1[13]) << 2); x20 = ((uint64_t)(arg1[12]) << 45); x21 = ((uint64_t)(arg1[11]) << 37); x22 = ((uint64_t)(arg1[10]) << 29); x23 = ((uint64_t)(arg1[9]) << 21); x24 = ((uint64_t)(arg1[8]) << 13); x25 = ((uint64_t)(arg1[7]) << 5); x26 = ((uint64_t)(arg1[6]) << 48); x27 = ((uint64_t)(arg1[5]) << 40); x28 = ((uint64_t)(arg1[4]) << 32); x29 = ((uint64_t)(arg1[3]) << 24); x30 = ((uint64_t)(arg1[2]) << 16); x31 = ((uint64_t)(arg1[1]) << 8); x32 = (arg1[0]); x33 = (x31 + (uint64_t)x32); x34 = (x30 + x33); x35 = (x29 + x34); x36 = (x28 + x35); x37 = (x27 + x36); x38 = (x26 + x37); x39 = (x38 & UINT64_C(0x7ffffffffffff)); x40 = (uint8_t)(x38 >> 51); x41 = (x25 + (uint64_t)x40); x42 = (x24 + x41); x43 = (x23 + x42); x44 = (x22 + x43); x45 = (x21 + x44); x46 = (x20 + x45); x47 = (x46 & UINT64_C(0x7ffffffffffff)); x48 = (uint8_t)(x46 >> 51); x49 = (x19 + (uint64_t)x48); x50 = (x18 + x49); x51 = (x17 + x50); x52 = (x16 + x51); x53 = (x15 + x52); x54 = (x14 + x53); x55 = (x13 + x54); x56 = (x55 & UINT64_C(0x7ffffffffffff)); x57 = (uint8_t)(x55 >> 51); x58 = (x12 + (uint64_t)x57); x59 = (x11 + x58); x60 = (x10 + x59); x61 = (x9 + x60); x62 = (x8 + x61); x63 = (x7 + x62); x64 = (x63 & UINT64_C(0x7ffffffffffff)); x65 = 
(uint8_t)(x63 >> 51); x66 = (x6 + (uint64_t)x65); x67 = (x5 + x66); x68 = (x4 + x67); x69 = (x3 + x68); x70 = (x2 + x69); x71 = (x1 + x70); out1[0] = x39; out1[1] = x47; out1[2] = x56; out1[3] = x64; out1[4] = x71; } /* * The function fiat_25519_carry_scmul_121666 multiplies a field element by 121666 and reduces the result. * * Postconditions: * eval out1 mod m = (121666 * eval arg1) mod m * */ static FIAT_25519_FIAT_INLINE void fiat_25519_carry_scmul_121666(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1) { fiat_25519_uint128 x1; fiat_25519_uint128 x2; fiat_25519_uint128 x3; fiat_25519_uint128 x4; fiat_25519_uint128 x5; uint64_t x6; uint64_t x7; fiat_25519_uint128 x8; uint64_t x9; uint64_t x10; fiat_25519_uint128 x11; uint64_t x12; uint64_t x13; fiat_25519_uint128 x14; uint64_t x15; uint64_t x16; fiat_25519_uint128 x17; uint64_t x18; uint64_t x19; uint64_t x20; uint64_t x21; fiat_25519_uint1 x22; uint64_t x23; uint64_t x24; fiat_25519_uint1 x25; uint64_t x26; uint64_t x27; x1 = ((fiat_25519_uint128)UINT32_C(0x1db42) * (arg1[4])); x2 = ((fiat_25519_uint128)UINT32_C(0x1db42) * (arg1[3])); x3 = ((fiat_25519_uint128)UINT32_C(0x1db42) * (arg1[2])); x4 = ((fiat_25519_uint128)UINT32_C(0x1db42) * (arg1[1])); x5 = ((fiat_25519_uint128)UINT32_C(0x1db42) * (arg1[0])); x6 = (uint64_t)(x5 >> 51); x7 = (uint64_t)(x5 & UINT64_C(0x7ffffffffffff)); x8 = (x6 + x4); x9 = (uint64_t)(x8 >> 51); x10 = (uint64_t)(x8 & UINT64_C(0x7ffffffffffff)); x11 = (x9 + x3); x12 = (uint64_t)(x11 >> 51); x13 = (uint64_t)(x11 & UINT64_C(0x7ffffffffffff)); x14 = (x12 + x2); x15 = (uint64_t)(x14 >> 51); x16 = (uint64_t)(x14 & UINT64_C(0x7ffffffffffff)); x17 = (x15 + x1); x18 = (uint64_t)(x17 >> 51); x19 = (uint64_t)(x17 & UINT64_C(0x7ffffffffffff)); x20 = (x18 * UINT8_C(0x13)); x21 = (x7 + x20); x22 = (fiat_25519_uint1)(x21 >> 51); x23 = (x21 & UINT64_C(0x7ffffffffffff)); x24 = (x22 + x10); x25 = (fiat_25519_uint1)(x24 >> 51); x26 = (x24 & UINT64_C(0x7ffffffffffff)); x27 = (x25 + x13); out1[0] = x23; out1[1] = x26; out1[2] = x27; out1[3] = x16; out1[4] = x19; } ring-0.17.14/third_party/fiat/curve25519_64_adx.h000064400000000000000000000542141046102023000172670ustar 00000000000000#include #include "../../crypto/internal.h" #include #include #include typedef uint64_t fe4[4]; typedef uint8_t fiat_uint1; typedef int8_t fiat_int1; static __inline__ uint64_t fiat_value_barrier_u64(uint64_t a) { __asm__("" : "+r"(a) : /* no inputs */); return a; } __attribute__((target("adx,bmi2"))) static inline void fe4_mul(fe4 out, const fe4 x, const fe4 y) { fiat_curve25519_adx_mul(out, x, y); } __attribute__((target("adx,bmi2"))) static inline void fe4_sq(fe4 out, const fe4 x) { fiat_curve25519_adx_square(out, x); } /* * The function fiat_mulx_u64 is a multiplication, returning the full double-width result. 
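 * Editorial note, not part of the generated text: this helper returns the
 * full 128-bit product in two 64-bit halves; on x86-64 builds using the
 * "adx,bmi2" target attribute it is expected to lower to a MULX-style
 * widening multiply, though that is a compiler detail rather than a
 * guarantee made by this header.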
* * Postconditions: * out1 = (arg1 * arg2) mod 2^64 * out2 = ⌊arg1 * arg2 / 2^64⌋ * * Input Bounds: * arg1: [0x0 ~> 0xffffffffffffffff] * arg2: [0x0 ~> 0xffffffffffffffff] * Output Bounds: * out1: [0x0 ~> 0xffffffffffffffff] * out2: [0x0 ~> 0xffffffffffffffff] */ __attribute__((target("adx,bmi2"))) static inline void fiat_mulx_u64(uint64_t* out1, uint64_t* out2, uint64_t arg1, uint64_t arg2) { // NOTE: edited after generation #if defined(_M_X64) unsigned long long t; *out1 = _umul128(arg1, arg2, &t); *out2 = t; #elif defined(_M_ARM64) *out1 = arg1 * arg2; *out2 = __umulh(arg1, arg2); #else unsigned __int128 t = (unsigned __int128)arg1 * arg2; *out1 = t; *out2 = (t >> 64); #endif } /* * The function fiat_addcarryx_u64 is an addition with carry. * * Postconditions: * out1 = (arg1 + arg2 + arg3) mod 2^64 * out2 = ⌊(arg1 + arg2 + arg3) / 2^64⌋ * * Input Bounds: * arg1: [0x0 ~> 0x1] * arg2: [0x0 ~> 0xffffffffffffffff] * arg3: [0x0 ~> 0xffffffffffffffff] * Output Bounds: * out1: [0x0 ~> 0xffffffffffffffff] * out2: [0x0 ~> 0x1] */ __attribute__((target("adx,bmi2"))) static inline void fiat_addcarryx_u64(uint64_t* out1, fiat_uint1* out2, fiat_uint1 arg1, uint64_t arg2, uint64_t arg3) { // NOTE: edited after generation #if defined(__has_builtin) # if __has_builtin(__builtin_ia32_addcarryx_u64) # define addcarry64 __builtin_ia32_addcarryx_u64 # endif #endif #if defined(addcarry64) long long unsigned int t; *out2 = addcarry64(arg1, arg2, arg3, &t); *out1 = t; #elif defined(_M_X64) long long unsigned int t; *out2 = _addcarry_u64(arg1, arg2, arg3, out1); *out1 = t; #else arg2 += arg1; arg1 = arg2 < arg1; uint64_t ret = arg2 + arg3; arg1 += ret < arg2; *out1 = ret; *out2 = arg1; #endif #undef addcarry64 } /* * The function fiat_subborrowx_u64 is a subtraction with borrow. * * Postconditions: * out1 = (-arg1 + arg2 + -arg3) mod 2^64 * out2 = -⌊(-arg1 + arg2 + -arg3) / 2^64⌋ * * Input Bounds: * arg1: [0x0 ~> 0x1] * arg2: [0x0 ~> 0xffffffffffffffff] * arg3: [0x0 ~> 0xffffffffffffffff] * Output Bounds: * out1: [0x0 ~> 0xffffffffffffffff] * out2: [0x0 ~> 0x1] */ __attribute__((target("adx,bmi2"))) static inline void fiat_subborrowx_u64(uint64_t* out1, fiat_uint1* out2, fiat_uint1 arg1, uint64_t arg2, uint64_t arg3) { #if defined(__has_builtin) # if __has_builtin(__builtin_ia32_subborrow_u64) # define subborrow64 __builtin_ia32_subborrow_u64 # endif #endif #if defined(subborrow64) long long unsigned int t; *out2 = subborrow64(arg1, arg2, arg3, &t); *out1 = t; #elif defined(_M_X64) long long unsigned int t; *out2 = _subborrow_u64(arg1, arg2, arg3, &t); // NOTE: edited after generation *out1 = t; #else *out1 = arg2 - arg3 - arg1; *out2 = (arg2 < arg3) | ((arg2 == arg3) & arg1); #endif #undef subborrow64 } /* * The function fiat_cmovznz_u64 is a single-word conditional move. 
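 * Editorial note, not part of the generated text: the selection is done with
 * an all-ones/all-zeros mask routed through fiat_value_barrier_u64; the empty
 * asm barrier discourages the compiler from collapsing the masked arithmetic
 * into a data-dependent branch, keeping the move constant-time in arg1.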
* * Postconditions: * out1 = (if arg1 = 0 then arg2 else arg3) * * Input Bounds: * arg1: [0x0 ~> 0x1] * arg2: [0x0 ~> 0xffffffffffffffff] * arg3: [0x0 ~> 0xffffffffffffffff] * Output Bounds: * out1: [0x0 ~> 0xffffffffffffffff] */ __attribute__((target("adx,bmi2"))) static inline void fiat_cmovznz_u64(uint64_t* out1, fiat_uint1 arg1, uint64_t arg2, uint64_t arg3) { fiat_uint1 x1; uint64_t x2; uint64_t x3; x1 = (!(!arg1)); x2 = ((fiat_int1)(0x0 - x1) & UINT64_C(0xffffffffffffffff)); x3 = ((fiat_value_barrier_u64(x2) & arg3) | (fiat_value_barrier_u64((~x2)) & arg2)); *out1 = x3; } /* * Input Bounds: * arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] * arg2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] * Output Bounds: * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] */ __attribute__((target("adx,bmi2"))) static void fe4_add(uint64_t out1[4], const uint64_t arg1[4], const uint64_t arg2[4]) { uint64_t x1; fiat_uint1 x2; uint64_t x3; fiat_uint1 x4; uint64_t x5; fiat_uint1 x6; uint64_t x7; fiat_uint1 x8; uint64_t x9; uint64_t x10; fiat_uint1 x11; uint64_t x12; fiat_uint1 x13; uint64_t x14; fiat_uint1 x15; uint64_t x16; fiat_uint1 x17; uint64_t x18; uint64_t x19; fiat_uint1 x20; fiat_addcarryx_u64(&x1, &x2, 0x0, (arg1[0]), (arg2[0])); fiat_addcarryx_u64(&x3, &x4, x2, (arg1[1]), (arg2[1])); fiat_addcarryx_u64(&x5, &x6, x4, (arg1[2]), (arg2[2])); fiat_addcarryx_u64(&x7, &x8, x6, (arg1[3]), (arg2[3])); fiat_cmovznz_u64(&x9, x8, 0x0, UINT8_C(0x26)); // NOTE: clang 14 for Zen 2 uses sbb, and fiat_addcarryx_u64(&x10, &x11, 0x0, x1, x9); fiat_addcarryx_u64(&x12, &x13, x11, x3, 0x0); fiat_addcarryx_u64(&x14, &x15, x13, x5, 0x0); fiat_addcarryx_u64(&x16, &x17, x15, x7, 0x0); fiat_cmovznz_u64(&x18, x17, 0x0, UINT8_C(0x26)); // NOTE: clang 14 for Zen 2 uses sbb, and fiat_addcarryx_u64(&x19, &x20, 0x0, x10, x18); out1[0] = x19; out1[1] = x12; out1[2] = x14; out1[3] = x16; } /* * Input Bounds: * arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] * arg2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] * Output Bounds: * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] */ __attribute__((target("adx,bmi2"))) static void fe4_sub(uint64_t out1[4], const uint64_t arg1[4], const uint64_t arg2[4]) { uint64_t x1; uint64_t x2; fiat_uint1 x3; uint64_t x4; uint64_t x5; fiat_uint1 x6; uint64_t x7; uint64_t x8; fiat_uint1 x9; uint64_t x10; uint64_t x11; fiat_uint1 x12; uint64_t x13; uint64_t x14; fiat_uint1 x15; uint64_t x16; fiat_uint1 x17; uint64_t x18; fiat_uint1 x19; uint64_t x20; fiat_uint1 x21; uint64_t x22; uint64_t x23; fiat_uint1 x24; x1 = (arg2[0]); fiat_subborrowx_u64(&x2, &x3, 0x0, (arg1[0]), x1); x4 = (arg2[1]); fiat_subborrowx_u64(&x5, &x6, x3, (arg1[1]), x4); x7 = (arg2[2]); fiat_subborrowx_u64(&x8, &x9, x6, (arg1[2]), x7); x10 = (arg2[3]); fiat_subborrowx_u64(&x11, &x12, x9, (arg1[3]), x10); fiat_cmovznz_u64(&x13, x12, 0x0, UINT8_C(0x26)); // NOTE: clang 14 for Zen 2 uses sbb, and fiat_subborrowx_u64(&x14, &x15, 0x0, x2, x13); fiat_subborrowx_u64(&x16, &x17, x15, x5, 0x0); fiat_subborrowx_u64(&x18, &x19, x17, x8, 0x0); fiat_subborrowx_u64(&x20, &x21, x19, x11, 0x0); 
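  // NOTE (editorial): 2^256 = 38 (mod 2^255 - 19), so a borrow out of the
  // 4-limb subtraction is compensated by subtracting 0x26 = 38 from the low
  // limb; the pass below repeats that correction once more in case the
  // compensation itself borrows.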
fiat_cmovznz_u64(&x22, x21, 0x0, UINT8_C(0x26)); // NOTE: clang 14 for Zen 2 uses sbb, and fiat_subborrowx_u64(&x23, &x24, 0x0, x14, x22); out1[0] = x23; out1[1] = x16; out1[2] = x18; out1[3] = x20; } /* * Input Bounds: * arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] * arg2: [0x0 ~> 0x3ffffffffffffff] // NOTE: this is not any uint64! * Output Bounds: * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] */ __attribute__((target("adx,bmi2"))) static void fe4_scmul(uint64_t out1[4], const uint64_t arg1[4], uint64_t arg2) { uint64_t x1; uint64_t x2; uint64_t x3; uint64_t x4; uint64_t x5; fiat_uint1 x6; uint64_t x7; uint64_t x8; uint64_t x9; fiat_uint1 x10; uint64_t x11; uint64_t x12; uint64_t x13; fiat_uint1 x14; uint64_t x15; uint64_t x16; uint64_t x17; fiat_uint1 x18; uint64_t x19; fiat_uint1 x20; uint64_t x21; fiat_uint1 x22; uint64_t x23; fiat_uint1 x24; uint64_t x25; uint64_t x26; fiat_uint1 x27; fiat_mulx_u64(&x1, &x2, (arg1[0]), arg2); fiat_mulx_u64(&x3, &x4, (arg1[1]), arg2); fiat_addcarryx_u64(&x5, &x6, 0x0, x2, x3); fiat_mulx_u64(&x7, &x8, (arg1[2]), arg2); fiat_addcarryx_u64(&x9, &x10, x6, x4, x7); fiat_mulx_u64(&x11, &x12, (arg1[3]), arg2); fiat_addcarryx_u64(&x13, &x14, x10, x8, x11); fiat_mulx_u64(&x15, &x16, (x12 + (uint64_t)x14), UINT8_C(0x26)); fiat_addcarryx_u64(&x17, &x18, 0x0, x1, x15); fiat_addcarryx_u64(&x19, &x20, x18, x5, 0x0); fiat_addcarryx_u64(&x21, &x22, x20, x9, 0x0); fiat_addcarryx_u64(&x23, &x24, x22, x13, 0x0); fiat_cmovznz_u64(&x25, x24, 0x0, UINT8_C(0x26)); // NOTE: clang 14 for Zen 2 uses sbb, and fiat_addcarryx_u64(&x26, &x27, 0x0, x17, x25); out1[0] = x26; out1[1] = x19; out1[2] = x21; out1[3] = x23; } /* * Input Bounds: * arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] * Output Bounds: * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] */ __attribute__((target("adx,bmi2"))) static void fe4_canon(uint64_t out1[4], const uint64_t arg1[4]) { uint64_t x1; fiat_uint1 x2; uint64_t x3; fiat_uint1 x4; uint64_t x5; fiat_uint1 x6; uint64_t x7; fiat_uint1 x8; uint64_t x9; uint64_t x10; uint64_t x11; uint64_t x12; uint64_t x13; fiat_uint1 x14; uint64_t x15; fiat_uint1 x16; uint64_t x17; fiat_uint1 x18; uint64_t x19; fiat_uint1 x20; uint64_t x21; uint64_t x22; uint64_t x23; uint64_t x24; fiat_subborrowx_u64(&x1, &x2, 0x0, (arg1[0]), UINT64_C(0xffffffffffffffed)); fiat_subborrowx_u64(&x3, &x4, x2, (arg1[1]), UINT64_C(0xffffffffffffffff)); fiat_subborrowx_u64(&x5, &x6, x4, (arg1[2]), UINT64_C(0xffffffffffffffff)); fiat_subborrowx_u64(&x7, &x8, x6, (arg1[3]), UINT64_C(0x7fffffffffffffff)); fiat_cmovznz_u64(&x9, x8, x1, (arg1[0])); fiat_cmovznz_u64(&x10, x8, x3, (arg1[1])); fiat_cmovznz_u64(&x11, x8, x5, (arg1[2])); fiat_cmovznz_u64(&x12, x8, x7, (arg1[3])); fiat_subborrowx_u64(&x13, &x14, 0x0, x9, UINT64_C(0xffffffffffffffed)); fiat_subborrowx_u64(&x15, &x16, x14, x10, UINT64_C(0xffffffffffffffff)); fiat_subborrowx_u64(&x17, &x18, x16, x11, UINT64_C(0xffffffffffffffff)); fiat_subborrowx_u64(&x19, &x20, x18, x12, UINT64_C(0x7fffffffffffffff)); fiat_cmovznz_u64(&x21, x20, x13, x9); fiat_cmovznz_u64(&x22, x20, x15, x10); fiat_cmovznz_u64(&x23, x20, x17, x11); fiat_cmovznz_u64(&x24, x20, x19, x12); out1[0] = x21; out1[1] = x22; out1[2] = x23; out1[3] = x24; } /* * Input Bounds: * 
arg1: [0x0 ~> 0x1] * arg2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] * arg3: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] * Output Bounds: * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] * out2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] */ __attribute__((target("adx,bmi2"))) static void fe4_cswap(uint64_t out1[4], uint64_t out2[4], fiat_uint1 arg1, const uint64_t arg2[4], const uint64_t arg3[4]) { uint64_t x1; uint64_t x2; uint64_t x3; uint64_t x4; uint64_t x5; uint64_t x6; uint64_t x7; uint64_t x8; // NOTE: clang 14 for Zen 2 uses YMM registers fiat_cmovznz_u64(&x1, arg1, (arg2[0]), (arg3[0])); fiat_cmovznz_u64(&x2, arg1, (arg2[1]), (arg3[1])); fiat_cmovznz_u64(&x3, arg1, (arg2[2]), (arg3[2])); fiat_cmovznz_u64(&x4, arg1, (arg2[3]), (arg3[3])); fiat_cmovznz_u64(&x5, arg1, (arg3[0]), (arg2[0])); fiat_cmovznz_u64(&x6, arg1, (arg3[1]), (arg2[1])); fiat_cmovznz_u64(&x7, arg1, (arg3[2]), (arg2[2])); fiat_cmovznz_u64(&x8, arg1, (arg3[3]), (arg2[3])); out1[0] = x1; out1[1] = x2; out1[2] = x3; out1[3] = x4; out2[0] = x5; out2[1] = x6; out2[2] = x7; out2[3] = x8; } // The following functions are adaped from crypto/curve25519/curve25519.c // It would be desirable to share the code, but with the current field // implementations both 4-limb and 5-limb versions of the curve-level code need // to be included in builds targetting an unknown variant of x86_64. __attribute__((target("adx,bmi2"))) static void fe4_invert(fe4 out, const fe4 z) { fe4 t0; fe4 t1; fe4 t2; fe4 t3; int i; fe4_sq(t0, z); fe4_sq(t1, t0); for (i = 1; i < 2; ++i) { fe4_sq(t1, t1); } fe4_mul(t1, z, t1); fe4_mul(t0, t0, t1); fe4_sq(t2, t0); fe4_mul(t1, t1, t2); fe4_sq(t2, t1); for (i = 1; i < 5; ++i) { fe4_sq(t2, t2); } fe4_mul(t1, t2, t1); fe4_sq(t2, t1); for (i = 1; i < 10; ++i) { fe4_sq(t2, t2); } fe4_mul(t2, t2, t1); fe4_sq(t3, t2); for (i = 1; i < 20; ++i) { fe4_sq(t3, t3); } fe4_mul(t2, t3, t2); fe4_sq(t2, t2); for (i = 1; i < 10; ++i) { fe4_sq(t2, t2); } fe4_mul(t1, t2, t1); fe4_sq(t2, t1); for (i = 1; i < 50; ++i) { fe4_sq(t2, t2); } fe4_mul(t2, t2, t1); fe4_sq(t3, t2); for (i = 1; i < 100; ++i) { fe4_sq(t3, t3); } fe4_mul(t2, t3, t2); fe4_sq(t2, t2); for (i = 1; i < 50; ++i) { fe4_sq(t2, t2); } fe4_mul(t1, t2, t1); fe4_sq(t1, t1); for (i = 1; i < 5; ++i) { fe4_sq(t1, t1); } fe4_mul(out, t1, t0); } RING_NOINLINE // https://github.com/rust-lang/rust/issues/116573 __attribute__((target("adx,bmi2"))) void x25519_scalar_mult_adx(uint8_t out[32], const uint8_t scalar[32], const uint8_t point[32]) { uint8_t e[32]; OPENSSL_memcpy(e, scalar, 32); e[0] &= 248; e[31] &= 127; e[31] |= 64; // The following implementation was transcribed to Coq and proven to // correspond to unary scalar multiplication in affine coordinates given that // x1 != 0 is the x coordinate of some point on the curve. It was also checked // in Coq that doing a ladderstep with x1 = x3 = 0 gives z2' = z3' = 0, and z2 // = z3 = 0 gives z2' = z3' = 0. The statement was quantified over the // underlying field, so it applies to Curve25519 itself and the quadratic // twist of Curve25519. It was not proven in Coq that prime-field arithmetic // correctly simulates extension-field arithmetic on prime-field values. 
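/*
 * Editorial note (not part of the generated/transcribed code): the masking of
 * e at the top of this function is the standard X25519 "clamping" from
 * RFC 7748.  A compiled-out restatement with a hypothetical helper name:
 */
#if 0
static void clamp_scalar_sketch(uint8_t e[32]) {
  e[0] &= 248;   /* clear the low 3 bits: the scalar becomes a multiple of the
                    cofactor 8, suppressing small-subgroup contributions */
  e[31] &= 127;  /* clear bit 255 */
  e[31] |= 64;   /* set bit 254, the conventional fixed top bit */
}
#endif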
// The decoding of the byte array representation of e was not considered. // Specification of Montgomery curves in affine coordinates: // // Proof that these form a group that is isomorphic to a Weierstrass curve: // // Coq transcription and correctness proof of the loop (where scalarbits=255): // // // preconditions: 0 <= e < 2^255 (not necessarily e < order), fe_invert(0) = 0 fe4 x1, x2 = {1}, z2 = {0}, x3, z3 = {1}, tmp0, tmp1; OPENSSL_memcpy(x1, point, sizeof(fe4)); x1[3] &= (uint64_t)(-1)>>1; OPENSSL_memcpy(x3, x1, sizeof(fe4)); unsigned swap = 0; int pos; for (pos = 254; pos >= 0; --pos) { // loop invariant as of right before the test, for the case where x1 != 0: // pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3 is nonzero // let r := e >> (pos+1) in the following equalities of projective points: // to_xz (r*P) === if swap then (x3, z3) else (x2, z2) // to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3) // x1 is the nonzero x coordinate of the nonzero point (r*P-(r+1)*P) unsigned b = 1 & (e[pos / 8] >> (pos & 7)); swap ^= b; fe4_cswap(x2, x3, swap, x2, x3); fe4_cswap(z2, z3, swap, z2, z3); swap = b; // Coq transcription of ladderstep formula (called from transcribed loop): // // // x1 != 0 // x1 = 0 fe4_sub(tmp0, x3, z3); fe4_sub(tmp1, x2, z2); fe4_add(x2, x2, z2); fe4_add(z2, x3, z3); fe4_mul(z3, tmp0, x2); fe4_mul(z2, z2, tmp1); fe4_sq(tmp0, tmp1); fe4_sq(tmp1, x2); fe4_add(x3, z3, z2); fe4_sub(z2, z3, z2); fe4_mul(x2, tmp1, tmp0); fe4_sub(tmp1, tmp1, tmp0); fe4_sq(z2, z2); fe4_scmul(z3, tmp1, 121666); fe4_sq(x3, x3); fe4_add(tmp0, tmp0, z3); fe4_mul(z3, x1, z2); fe4_mul(z2, tmp1, tmp0); } // here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3) else (x2, z2) fe4_cswap(x2, x3, swap, x2, x3); fe4_cswap(z2, z3, swap, z2, z3); fe4_invert(z2, z2); fe4_mul(x2, x2, z2); fe4_canon(x2, x2); OPENSSL_memcpy(out, x2, sizeof(fe4)); } typedef struct { fe4 X; fe4 Y; fe4 Z; fe4 T; } ge_p3_4; typedef struct { fe4 yplusx; fe4 yminusx; fe4 xy2d; } ge_precomp_4; __attribute__((target("adx,bmi2"))) static void inline_x25519_ge_dbl_4(ge_p3_4 *r, const ge_p3_4 *p, bool skip_t) { // Transcribed from a Coq function proven against affine coordinates. // https://github.com/mit-plv/fiat-crypto/blob/9943ba9e7d8f3e1c0054b2c94a5edca46ea73ef8/src/Curves/Edwards/XYZT/Basic.v#L136-L165 fe4 trX, trZ, trT, t0, cX, cY, cZ, cT; fe4_sq(trX, p->X); fe4_sq(trZ, p->Y); fe4_sq(trT, p->Z); fe4_add(trT, trT, trT); fe4_add(cY, p->X, p->Y); fe4_sq(t0, cY); fe4_add(cY, trZ, trX); fe4_sub(cZ, trZ, trX); fe4_sub(cX, t0, cY); fe4_sub(cT, trT, cZ); fe4_mul(r->X, cX, cT); fe4_mul(r->Y, cY, cZ); fe4_mul(r->Z, cZ, cT); if (!skip_t) { fe4_mul(r->T, cX, cY); } } __attribute__((target("adx,bmi2"))) __attribute__((always_inline)) // 4% speedup with clang14 and zen2 static inline void ge_p3_add_p3_precomp_4(ge_p3_4 *r, const ge_p3_4 *p, const ge_precomp_4 *q) { fe4 A, B, C, YplusX, YminusX, D, X3, Y3, Z3, T3; // Transcribed from a Coq function proven against affine coordinates. 
// https://github.com/mit-plv/fiat-crypto/blob/a36568d1d73aff5d7accc79fd28be672882f9c17/src/Curves/Edwards/XYZT/Precomputed.v#L38-L56 fe4_add(YplusX, p->Y, p->X); fe4_sub(YminusX, p->Y, p->X); fe4_mul(A, YplusX, q->yplusx); fe4_mul(B, YminusX, q->yminusx); fe4_mul(C, q->xy2d, p->T); fe4_add(D, p->Z, p->Z); fe4_sub(X3, A, B); fe4_add(Y3, A, B); fe4_add(Z3, D, C); fe4_sub(T3, D, C); fe4_mul(r->X, X3, T3); fe4_mul(r->Y, Y3, Z3); fe4_mul(r->Z, Z3, T3); fe4_mul(r->T, X3, Y3); } __attribute__((always_inline)) // 25% speedup with clang14 and zen2 static inline void table_select_4(ge_precomp_4 *t, const int pos, const signed char b) { uint8_t bnegative = constant_time_msb_w(b); uint8_t babs = b - ((bnegative & b) << 1); uint8_t t_bytes[3][32] = { {constant_time_is_zero_w(b) & 1}, {constant_time_is_zero_w(b) & 1}, {0}}; #if defined(__clang__) __asm__("" : "+m" (t_bytes) : /*no inputs*/); #endif OPENSSL_STATIC_ASSERT(sizeof(t_bytes) == sizeof(k25519Precomp[pos][0]), ""); for (int i = 0; i < 8; i++) { constant_time_conditional_memxor(t_bytes, k25519Precomp[pos][i], sizeof(t_bytes), constant_time_eq_w(babs, 1 + i)); } OPENSSL_STATIC_ASSERT(sizeof(t_bytes) == sizeof(ge_precomp_4), ""); // fe4 uses saturated 64-bit limbs, so converting from bytes is just a copy. OPENSSL_memcpy(t, t_bytes, sizeof(ge_precomp_4)); fe4 xy2d_neg = {0}; fe4_sub(xy2d_neg, xy2d_neg, t->xy2d); constant_time_conditional_memcpy(t->yplusx, t_bytes[1], sizeof(fe4), bnegative); constant_time_conditional_memcpy(t->yminusx, t_bytes[0], sizeof(fe4), bnegative); constant_time_conditional_memcpy(t->xy2d, xy2d_neg, sizeof(fe4), bnegative); } // h = a * B // where a = a[0]+256*a[1]+...+256^31 a[31] // B is the Ed25519 base point (x,4/5) with x positive. // // Preconditions: // a[31] <= 127 RING_NOINLINE // https://github.com/rust-lang/rust/issues/116573 __attribute__((target("adx,bmi2"))) void x25519_ge_scalarmult_base_adx(uint8_t h[4][32], const uint8_t a[32]) { signed char e[64]; signed char carry; for (unsigned i = 0; i < 32; ++i) { e[2 * i + 0] = (a[i] >> 0) & 15; e[2 * i + 1] = (a[i] >> 4) & 15; } // each e[i] is between 0 and 15 // e[63] is between 0 and 7 carry = 0; for (unsigned i = 0; i < 63; ++i) { e[i] += carry; carry = e[i] + 8; carry >>= 4; e[i] -= carry << 4; } e[63] += carry; // each e[i] is between -8 and 8 ge_p3_4 r = {{0}, {1}, {1}, {0}}; for (unsigned i = 1; i < 64; i += 2) { ge_precomp_4 t; table_select_4(&t, i / 2, e[i]); ge_p3_add_p3_precomp_4(&r, &r, &t); } inline_x25519_ge_dbl_4(&r, &r, /*skip_t=*/true); inline_x25519_ge_dbl_4(&r, &r, /*skip_t=*/true); inline_x25519_ge_dbl_4(&r, &r, /*skip_t=*/true); inline_x25519_ge_dbl_4(&r, &r, /*skip_t=*/false); for (unsigned i = 0; i < 64; i += 2) { ge_precomp_4 t; table_select_4(&t, i / 2, e[i]); ge_p3_add_p3_precomp_4(&r, &r, &t); } // fe4 uses saturated 64-bit limbs, so converting to bytes is just a copy. 
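/*
 * Editorial note (not part of the generated/transcribed code): with the scalar
 * recoded above as a = sum(e[i] * 16^i), each e[i] in [-8, 8], the two passes
 * compute
 *     h = 16*(sum over odd i of e[i]*256^⌊i/2⌋*B) + sum over even i of e[i]*256^⌊i/2⌋*B,
 * where the factor 16 comes from the four doublings between the passes and
 * 256^⌊i/2⌋*B comes from the k25519Precomp table (entry j of row pos holds
 * (j+1)*256^pos*B, and table_select_4 negates the selected point for negative
 * digits).  The compiled-out sketch below shows only the schedule; point,
 * precomp, group_identity, table_lookup, group_add and group_dbl are
 * hypothetical stand-ins for ge_p3_4, ge_precomp_4, table_select_4,
 * ge_p3_add_p3_precomp_4 and inline_x25519_ge_dbl_4, not real ring APIs.
 */
#if 0
static void scalarmult_base_schedule(point *h, const signed char e[64]) {
  point r = group_identity();
  precomp t;
  for (int i = 1; i < 64; i += 2) {  /* odd digits: still short a factor of 16 */
    table_lookup(&t, i / 2, e[i]);
    group_add(&r, &r, &t);
  }
  for (int k = 0; k < 4; k++) {      /* four doublings supply that factor 16 */
    group_dbl(&r, &r);
  }
  for (int i = 0; i < 64; i += 2) {  /* even digits */
    table_lookup(&t, i / 2, e[i]);
    group_add(&r, &r, &t);
  }
  *h = r;
}
#endif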
// Satisfy stated precondition of fiat_25519_from_bytes; tests pass either way fe4_canon(r.X, r.X); fe4_canon(r.Y, r.Y); fe4_canon(r.Z, r.Z); fe4_canon(r.T, r.T); OPENSSL_STATIC_ASSERT(sizeof(ge_p3_4) == sizeof(uint8_t[4][32]), ""); OPENSSL_memcpy(h, &r, sizeof(ge_p3_4)); } ring-0.17.14/third_party/fiat/curve25519_64_msvc.h000064400000000000000000001073121046102023000174610ustar 00000000000000/* Autogenerated: 'src/ExtractionOCaml/unsaturated_solinas' --inline --static --use-value-barrier --no-wide-int 25519 64 '(auto)' '2^255 - 19' carry_mul carry_square carry add sub opp selectznz to_bytes from_bytes relax carry_scmul121666 */ /* curve description: 25519 */ /* machine_wordsize = 64 (from "64") */ /* requested operations: carry_mul, carry_square, carry, add, sub, opp, selectznz, to_bytes, from_bytes, relax, carry_scmul121666 */ /* n = 5 (from "(auto)") */ /* s-c = 2^255 - [(1, 19)] (from "2^255 - 19") */ /* tight_bounds_multiplier = 1 (from "") */ /* */ /* Computed values: */ /* carry_chain = [0, 1, 2, 3, 4, 0, 1] */ /* eval z = z[0] + (z[1] << 51) + (z[2] << 102) + (z[3] << 153) + (z[4] << 204) */ /* bytes_eval z = z[0] + (z[1] << 8) + (z[2] << 16) + (z[3] << 24) + (z[4] << 32) + (z[5] << 40) + (z[6] << 48) + (z[7] << 56) + (z[8] << 64) + (z[9] << 72) + (z[10] << 80) + (z[11] << 88) + (z[12] << 96) + (z[13] << 104) + (z[14] << 112) + (z[15] << 120) + (z[16] << 128) + (z[17] << 136) + (z[18] << 144) + (z[19] << 152) + (z[20] << 160) + (z[21] << 168) + (z[22] << 176) + (z[23] << 184) + (z[24] << 192) + (z[25] << 200) + (z[26] << 208) + (z[27] << 216) + (z[28] << 224) + (z[29] << 232) + (z[30] << 240) + (z[31] << 248) */ /* balance = [0xfffffffffffda, 0xffffffffffffe, 0xffffffffffffe, 0xffffffffffffe, 0xffffffffffffe] */ #include #include #if defined(_M_X64) #include #endif typedef unsigned char fiat_25519_uint1; typedef signed char fiat_25519_int1; #define FIAT_25519_FIAT_INLINE inline /* The type fiat_25519_loose_field_element is a field element with loose bounds. */ /* Bounds: [[0x0 ~> 0x18000000000000], [0x0 ~> 0x18000000000000], [0x0 ~> 0x18000000000000], [0x0 ~> 0x18000000000000], [0x0 ~> 0x18000000000000]] */ typedef uint64_t fiat_25519_loose_field_element[5]; /* The type fiat_25519_tight_field_element is a field element with tight bounds. */ /* Bounds: [[0x0 ~> 0x8000000000000], [0x0 ~> 0x8000000000000], [0x0 ~> 0x8000000000000], [0x0 ~> 0x8000000000000], [0x0 ~> 0x8000000000000]] */ typedef uint64_t fiat_25519_tight_field_element[5]; #if (-1 & 3) != 3 #error "This code only works on a two's complement system" #endif #define fiat_25519_value_barrier_u64(x) (x) /* * The function fiat_25519_addcarryx_u64 is an addition with carry. * * Postconditions: * out1 = (arg1 + arg2 + arg3) mod 2^64 * out2 = ⌊(arg1 + arg2 + arg3) / 2^64⌋ * * Input Bounds: * arg1: [0x0 ~> 0x1] * arg2: [0x0 ~> 0xffffffffffffffff] * arg3: [0x0 ~> 0xffffffffffffffff] * Output Bounds: * out1: [0x0 ~> 0xffffffffffffffff] * out2: [0x0 ~> 0x1] */ static FIAT_25519_FIAT_INLINE void fiat_25519_addcarryx_u64(uint64_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint64_t arg2, uint64_t arg3) { // NOTE: edited after generation #if defined(_M_X64) *out2 = _addcarry_u64(arg1, arg2, arg3, out1); #else arg2 += arg1; arg1 = arg2 < arg1; arg3 += arg2; arg1 += arg3 < arg2; *out1 = arg3; *out2 = arg1; #endif } /* * The function fiat_25519_subborrowx_u64 is a subtraction with borrow. 
* * Postconditions: * out1 = (-arg1 + arg2 + -arg3) mod 2^64 * out2 = -⌊(-arg1 + arg2 + -arg3) / 2^64⌋ * * Input Bounds: * arg1: [0x0 ~> 0x1] * arg2: [0x0 ~> 0xffffffffffffffff] * arg3: [0x0 ~> 0xffffffffffffffff] * Output Bounds: * out1: [0x0 ~> 0xffffffffffffffff] * out2: [0x0 ~> 0x1] */ static FIAT_25519_FIAT_INLINE void fiat_25519_subborrowx_u64(uint64_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint64_t arg2, uint64_t arg3) { #if defined(_M_X64) *out2 = _subborrow_u64(arg1, arg2, arg3, out1); // NOTE: edited after generation #else *out1 = arg2 - arg3 - arg1; *out2 = (arg2 < arg3) | ((arg2 == arg3) & arg1); #endif } /* * The function fiat_25519_addcarryx_u51 is an addition with carry. * * Postconditions: * out1 = (arg1 + arg2 + arg3) mod 2^51 * out2 = ⌊(arg1 + arg2 + arg3) / 2^51⌋ * * Input Bounds: * arg1: [0x0 ~> 0x1] * arg2: [0x0 ~> 0x7ffffffffffff] * arg3: [0x0 ~> 0x7ffffffffffff] * Output Bounds: * out1: [0x0 ~> 0x7ffffffffffff] * out2: [0x0 ~> 0x1] */ static FIAT_25519_FIAT_INLINE void fiat_25519_addcarryx_u51(uint64_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint64_t arg2, uint64_t arg3) { uint64_t x1; uint64_t x2; fiat_25519_uint1 x3; x1 = ((arg1 + arg2) + arg3); x2 = (x1 & UINT64_C(0x7ffffffffffff)); x3 = (fiat_25519_uint1)(x1 >> 51); *out1 = x2; *out2 = x3; } /* * The function fiat_25519_subborrowx_u51 is a subtraction with borrow. * * Postconditions: * out1 = (-arg1 + arg2 + -arg3) mod 2^51 * out2 = -⌊(-arg1 + arg2 + -arg3) / 2^51⌋ * * Input Bounds: * arg1: [0x0 ~> 0x1] * arg2: [0x0 ~> 0x7ffffffffffff] * arg3: [0x0 ~> 0x7ffffffffffff] * Output Bounds: * out1: [0x0 ~> 0x7ffffffffffff] * out2: [0x0 ~> 0x1] */ static FIAT_25519_FIAT_INLINE void fiat_25519_subborrowx_u51(uint64_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint64_t arg2, uint64_t arg3) { int64_t x1; fiat_25519_int1 x2; uint64_t x3; x1 = ((int64_t)(arg2 - (int64_t)arg1) - (int64_t)arg3); x2 = (fiat_25519_int1)(x1 >> 51); x3 = (x1 & UINT64_C(0x7ffffffffffff)); *out1 = x3; *out2 = (fiat_25519_uint1)(0x0 - x2); } /* * The function fiat_25519_mulx_u64 is a multiplication, returning the full double-width result. * * Postconditions: * out1 = (arg1 * arg2) mod 2^64 * out2 = ⌊arg1 * arg2 / 2^64⌋ * * Input Bounds: * arg1: [0x0 ~> 0xffffffffffffffff] * arg2: [0x0 ~> 0xffffffffffffffff] * Output Bounds: * out1: [0x0 ~> 0xffffffffffffffff] * out2: [0x0 ~> 0xffffffffffffffff] */ static FIAT_25519_FIAT_INLINE void fiat_25519_mulx_u64(uint64_t* out1, uint64_t* out2, uint64_t arg1, uint64_t arg2) { // NOTE: edited after generation #if defined(_M_X64) *out1 = _umul128(arg1, arg2, out2); #elif defined(_M_ARM64) *out1 = arg1 * arg2; *out2 = __umulh(arg1, arg2); #else #error "This file is intended for MSVC on X64 or ARM64" #endif } /* * The function fiat_25519_cmovznz_u64 is a single-word conditional move. * * Postconditions: * out1 = (if arg1 = 0 then arg2 else arg3) * * Input Bounds: * arg1: [0x0 ~> 0x1] * arg2: [0x0 ~> 0xffffffffffffffff] * arg3: [0x0 ~> 0xffffffffffffffff] * Output Bounds: * out1: [0x0 ~> 0xffffffffffffffff] */ static FIAT_25519_FIAT_INLINE void fiat_25519_cmovznz_u64(uint64_t* out1, fiat_25519_uint1 arg1, uint64_t arg2, uint64_t arg3) { fiat_25519_uint1 x1; uint64_t x2; uint64_t x3; x1 = (!(!arg1)); x2 = ((fiat_25519_int1)(0x0 - x1) & UINT64_C(0xffffffffffffffff)); x3 = ((fiat_25519_value_barrier_u64(x2) & arg3) | (fiat_25519_value_barrier_u64((~x2)) & arg2)); *out1 = x3; } /* * The function fiat_25519_carry_mul multiplies two field elements and reduces the result. 
* * Postconditions: * eval out1 mod m = (eval arg1 * eval arg2) mod m * */ static FIAT_25519_FIAT_INLINE void fiat_25519_carry_mul(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1, const fiat_25519_loose_field_element arg2) { uint64_t x1; uint64_t x2; uint64_t x3; uint64_t x4; uint64_t x5; uint64_t x6; uint64_t x7; uint64_t x8; uint64_t x9; uint64_t x10; uint64_t x11; uint64_t x12; uint64_t x13; uint64_t x14; uint64_t x15; uint64_t x16; uint64_t x17; uint64_t x18; uint64_t x19; uint64_t x20; uint64_t x21; uint64_t x22; uint64_t x23; uint64_t x24; uint64_t x25; uint64_t x26; uint64_t x27; uint64_t x28; uint64_t x29; uint64_t x30; uint64_t x31; uint64_t x32; uint64_t x33; uint64_t x34; uint64_t x35; uint64_t x36; uint64_t x37; uint64_t x38; uint64_t x39; uint64_t x40; uint64_t x41; uint64_t x42; uint64_t x43; uint64_t x44; uint64_t x45; uint64_t x46; uint64_t x47; uint64_t x48; uint64_t x49; uint64_t x50; uint64_t x51; fiat_25519_uint1 x52; uint64_t x53; fiat_25519_uint1 x54; uint64_t x55; fiat_25519_uint1 x56; uint64_t x57; fiat_25519_uint1 x58; uint64_t x59; fiat_25519_uint1 x60; uint64_t x61; fiat_25519_uint1 x62; uint64_t x63; fiat_25519_uint1 x64; uint64_t x65; fiat_25519_uint1 x66; uint64_t x67; uint64_t x68; uint64_t x69; fiat_25519_uint1 x70; uint64_t x71; fiat_25519_uint1 x72; uint64_t x73; fiat_25519_uint1 x74; uint64_t x75; fiat_25519_uint1 x76; uint64_t x77; fiat_25519_uint1 x78; uint64_t x79; fiat_25519_uint1 x80; uint64_t x81; fiat_25519_uint1 x82; uint64_t x83; fiat_25519_uint1 x84; uint64_t x85; fiat_25519_uint1 x86; uint64_t x87; fiat_25519_uint1 x88; uint64_t x89; fiat_25519_uint1 x90; uint64_t x91; fiat_25519_uint1 x92; uint64_t x93; fiat_25519_uint1 x94; uint64_t x95; fiat_25519_uint1 x96; uint64_t x97; fiat_25519_uint1 x98; uint64_t x99; fiat_25519_uint1 x100; uint64_t x101; fiat_25519_uint1 x102; uint64_t x103; fiat_25519_uint1 x104; uint64_t x105; fiat_25519_uint1 x106; uint64_t x107; fiat_25519_uint1 x108; uint64_t x109; fiat_25519_uint1 x110; uint64_t x111; fiat_25519_uint1 x112; uint64_t x113; fiat_25519_uint1 x114; uint64_t x115; fiat_25519_uint1 x116; uint64_t x117; fiat_25519_uint1 x118; uint64_t x119; fiat_25519_uint1 x120; uint64_t x121; fiat_25519_uint1 x122; uint64_t x123; fiat_25519_uint1 x124; uint64_t x125; fiat_25519_uint1 x126; uint64_t x127; fiat_25519_uint1 x128; uint64_t x129; fiat_25519_uint1 x130; uint64_t x131; fiat_25519_uint1 x132; uint64_t x133; fiat_25519_uint1 x134; uint64_t x135; uint64_t x136; uint64_t x137; uint64_t x138; fiat_25519_uint1 x139; uint64_t x140; uint64_t x141; uint64_t x142; uint64_t x143; fiat_25519_uint1 x144; uint64_t x145; uint64_t x146; uint64_t x147; uint64_t x148; fiat_25519_uint1 x149; uint64_t x150; uint64_t x151; uint64_t x152; uint64_t x153; uint64_t x154; uint64_t x155; uint64_t x156; uint64_t x157; fiat_25519_uint1 x158; uint64_t x159; uint64_t x160; fiat_25519_mulx_u64(&x1, &x2, (arg1[4]), ((arg2[4]) * UINT8_C(0x13))); fiat_25519_mulx_u64(&x3, &x4, (arg1[4]), ((arg2[3]) * UINT8_C(0x13))); fiat_25519_mulx_u64(&x5, &x6, (arg1[4]), ((arg2[2]) * UINT8_C(0x13))); fiat_25519_mulx_u64(&x7, &x8, (arg1[4]), ((arg2[1]) * UINT8_C(0x13))); fiat_25519_mulx_u64(&x9, &x10, (arg1[3]), ((arg2[4]) * UINT8_C(0x13))); fiat_25519_mulx_u64(&x11, &x12, (arg1[3]), ((arg2[3]) * UINT8_C(0x13))); fiat_25519_mulx_u64(&x13, &x14, (arg1[3]), ((arg2[2]) * UINT8_C(0x13))); fiat_25519_mulx_u64(&x15, &x16, (arg1[2]), ((arg2[4]) * UINT8_C(0x13))); fiat_25519_mulx_u64(&x17, &x18, (arg1[2]), ((arg2[3]) * 
UINT8_C(0x13))); fiat_25519_mulx_u64(&x19, &x20, (arg1[1]), ((arg2[4]) * UINT8_C(0x13))); fiat_25519_mulx_u64(&x21, &x22, (arg1[4]), (arg2[0])); fiat_25519_mulx_u64(&x23, &x24, (arg1[3]), (arg2[1])); fiat_25519_mulx_u64(&x25, &x26, (arg1[3]), (arg2[0])); fiat_25519_mulx_u64(&x27, &x28, (arg1[2]), (arg2[2])); fiat_25519_mulx_u64(&x29, &x30, (arg1[2]), (arg2[1])); fiat_25519_mulx_u64(&x31, &x32, (arg1[2]), (arg2[0])); fiat_25519_mulx_u64(&x33, &x34, (arg1[1]), (arg2[3])); fiat_25519_mulx_u64(&x35, &x36, (arg1[1]), (arg2[2])); fiat_25519_mulx_u64(&x37, &x38, (arg1[1]), (arg2[1])); fiat_25519_mulx_u64(&x39, &x40, (arg1[1]), (arg2[0])); fiat_25519_mulx_u64(&x41, &x42, (arg1[0]), (arg2[4])); fiat_25519_mulx_u64(&x43, &x44, (arg1[0]), (arg2[3])); fiat_25519_mulx_u64(&x45, &x46, (arg1[0]), (arg2[2])); fiat_25519_mulx_u64(&x47, &x48, (arg1[0]), (arg2[1])); fiat_25519_mulx_u64(&x49, &x50, (arg1[0]), (arg2[0])); fiat_25519_addcarryx_u64(&x51, &x52, 0x0, x13, x7); fiat_25519_addcarryx_u64(&x53, &x54, x52, x14, x8); fiat_25519_addcarryx_u64(&x55, &x56, 0x0, x17, x51); fiat_25519_addcarryx_u64(&x57, &x58, x56, x18, x53); fiat_25519_addcarryx_u64(&x59, &x60, 0x0, x19, x55); fiat_25519_addcarryx_u64(&x61, &x62, x60, x20, x57); fiat_25519_addcarryx_u64(&x63, &x64, 0x0, x49, x59); fiat_25519_addcarryx_u64(&x65, &x66, x64, x50, x61); x67 = ((x63 >> 51) | ((x65 << 13) & UINT64_C(0xffffffffffffffff))); x68 = (x63 & UINT64_C(0x7ffffffffffff)); fiat_25519_addcarryx_u64(&x69, &x70, 0x0, x23, x21); fiat_25519_addcarryx_u64(&x71, &x72, x70, x24, x22); fiat_25519_addcarryx_u64(&x73, &x74, 0x0, x27, x69); fiat_25519_addcarryx_u64(&x75, &x76, x74, x28, x71); fiat_25519_addcarryx_u64(&x77, &x78, 0x0, x33, x73); fiat_25519_addcarryx_u64(&x79, &x80, x78, x34, x75); fiat_25519_addcarryx_u64(&x81, &x82, 0x0, x41, x77); fiat_25519_addcarryx_u64(&x83, &x84, x82, x42, x79); fiat_25519_addcarryx_u64(&x85, &x86, 0x0, x25, x1); fiat_25519_addcarryx_u64(&x87, &x88, x86, x26, x2); fiat_25519_addcarryx_u64(&x89, &x90, 0x0, x29, x85); fiat_25519_addcarryx_u64(&x91, &x92, x90, x30, x87); fiat_25519_addcarryx_u64(&x93, &x94, 0x0, x35, x89); fiat_25519_addcarryx_u64(&x95, &x96, x94, x36, x91); fiat_25519_addcarryx_u64(&x97, &x98, 0x0, x43, x93); fiat_25519_addcarryx_u64(&x99, &x100, x98, x44, x95); fiat_25519_addcarryx_u64(&x101, &x102, 0x0, x9, x3); fiat_25519_addcarryx_u64(&x103, &x104, x102, x10, x4); fiat_25519_addcarryx_u64(&x105, &x106, 0x0, x31, x101); fiat_25519_addcarryx_u64(&x107, &x108, x106, x32, x103); fiat_25519_addcarryx_u64(&x109, &x110, 0x0, x37, x105); fiat_25519_addcarryx_u64(&x111, &x112, x110, x38, x107); fiat_25519_addcarryx_u64(&x113, &x114, 0x0, x45, x109); fiat_25519_addcarryx_u64(&x115, &x116, x114, x46, x111); fiat_25519_addcarryx_u64(&x117, &x118, 0x0, x11, x5); fiat_25519_addcarryx_u64(&x119, &x120, x118, x12, x6); fiat_25519_addcarryx_u64(&x121, &x122, 0x0, x15, x117); fiat_25519_addcarryx_u64(&x123, &x124, x122, x16, x119); fiat_25519_addcarryx_u64(&x125, &x126, 0x0, x39, x121); fiat_25519_addcarryx_u64(&x127, &x128, x126, x40, x123); fiat_25519_addcarryx_u64(&x129, &x130, 0x0, x47, x125); fiat_25519_addcarryx_u64(&x131, &x132, x130, x48, x127); fiat_25519_addcarryx_u64(&x133, &x134, 0x0, x67, x129); x135 = (x134 + x131); x136 = ((x133 >> 51) | ((x135 << 13) & UINT64_C(0xffffffffffffffff))); x137 = (x133 & UINT64_C(0x7ffffffffffff)); fiat_25519_addcarryx_u64(&x138, &x139, 0x0, x136, x113); x140 = (x139 + x115); x141 = ((x138 >> 51) | ((x140 << 13) & UINT64_C(0xffffffffffffffff))); x142 = (x138 & 
UINT64_C(0x7ffffffffffff)); fiat_25519_addcarryx_u64(&x143, &x144, 0x0, x141, x97); x145 = (x144 + x99); x146 = ((x143 >> 51) | ((x145 << 13) & UINT64_C(0xffffffffffffffff))); x147 = (x143 & UINT64_C(0x7ffffffffffff)); fiat_25519_addcarryx_u64(&x148, &x149, 0x0, x146, x81); x150 = (x149 + x83); x151 = ((x148 >> 51) | ((x150 << 13) & UINT64_C(0xffffffffffffffff))); x152 = (x148 & UINT64_C(0x7ffffffffffff)); x153 = (x151 * UINT8_C(0x13)); x154 = (x68 + x153); x155 = (x154 >> 51); x156 = (x154 & UINT64_C(0x7ffffffffffff)); x157 = (x155 + x137); x158 = (fiat_25519_uint1)(x157 >> 51); x159 = (x157 & UINT64_C(0x7ffffffffffff)); x160 = (x158 + x142); out1[0] = x156; out1[1] = x159; out1[2] = x160; out1[3] = x147; out1[4] = x152; } /* * The function fiat_25519_carry_square squares a field element and reduces the result. * * Postconditions: * eval out1 mod m = (eval arg1 * eval arg1) mod m * */ static FIAT_25519_FIAT_INLINE void fiat_25519_carry_square(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1) { uint64_t x1; uint64_t x2; uint64_t x3; uint64_t x4; uint64_t x5; uint64_t x6; uint64_t x7; uint64_t x8; uint64_t x9; uint64_t x10; uint64_t x11; uint64_t x12; uint64_t x13; uint64_t x14; uint64_t x15; uint64_t x16; uint64_t x17; uint64_t x18; uint64_t x19; uint64_t x20; uint64_t x21; uint64_t x22; uint64_t x23; uint64_t x24; uint64_t x25; uint64_t x26; uint64_t x27; uint64_t x28; uint64_t x29; uint64_t x30; uint64_t x31; uint64_t x32; uint64_t x33; uint64_t x34; uint64_t x35; uint64_t x36; uint64_t x37; uint64_t x38; uint64_t x39; fiat_25519_uint1 x40; uint64_t x41; fiat_25519_uint1 x42; uint64_t x43; fiat_25519_uint1 x44; uint64_t x45; fiat_25519_uint1 x46; uint64_t x47; uint64_t x48; uint64_t x49; fiat_25519_uint1 x50; uint64_t x51; fiat_25519_uint1 x52; uint64_t x53; fiat_25519_uint1 x54; uint64_t x55; fiat_25519_uint1 x56; uint64_t x57; fiat_25519_uint1 x58; uint64_t x59; fiat_25519_uint1 x60; uint64_t x61; fiat_25519_uint1 x62; uint64_t x63; fiat_25519_uint1 x64; uint64_t x65; fiat_25519_uint1 x66; uint64_t x67; fiat_25519_uint1 x68; uint64_t x69; fiat_25519_uint1 x70; uint64_t x71; fiat_25519_uint1 x72; uint64_t x73; fiat_25519_uint1 x74; uint64_t x75; fiat_25519_uint1 x76; uint64_t x77; fiat_25519_uint1 x78; uint64_t x79; fiat_25519_uint1 x80; uint64_t x81; fiat_25519_uint1 x82; uint64_t x83; uint64_t x84; uint64_t x85; uint64_t x86; fiat_25519_uint1 x87; uint64_t x88; uint64_t x89; uint64_t x90; uint64_t x91; fiat_25519_uint1 x92; uint64_t x93; uint64_t x94; uint64_t x95; uint64_t x96; fiat_25519_uint1 x97; uint64_t x98; uint64_t x99; uint64_t x100; uint64_t x101; uint64_t x102; uint64_t x103; uint64_t x104; uint64_t x105; fiat_25519_uint1 x106; uint64_t x107; uint64_t x108; x1 = ((arg1[4]) * UINT8_C(0x13)); x2 = (x1 * 0x2); x3 = ((arg1[4]) * 0x2); x4 = ((arg1[3]) * UINT8_C(0x13)); x5 = (x4 * 0x2); x6 = ((arg1[3]) * 0x2); x7 = ((arg1[2]) * 0x2); x8 = ((arg1[1]) * 0x2); fiat_25519_mulx_u64(&x9, &x10, (arg1[4]), x1); fiat_25519_mulx_u64(&x11, &x12, (arg1[3]), x2); fiat_25519_mulx_u64(&x13, &x14, (arg1[3]), x4); fiat_25519_mulx_u64(&x15, &x16, (arg1[2]), x2); fiat_25519_mulx_u64(&x17, &x18, (arg1[2]), x5); fiat_25519_mulx_u64(&x19, &x20, (arg1[2]), (arg1[2])); fiat_25519_mulx_u64(&x21, &x22, (arg1[1]), x2); fiat_25519_mulx_u64(&x23, &x24, (arg1[1]), x6); fiat_25519_mulx_u64(&x25, &x26, (arg1[1]), x7); fiat_25519_mulx_u64(&x27, &x28, (arg1[1]), (arg1[1])); fiat_25519_mulx_u64(&x29, &x30, (arg1[0]), x3); fiat_25519_mulx_u64(&x31, &x32, (arg1[0]), x6); 
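/*
 * Editorial note (not part of the generated fiat-crypto output): this header
 * uses the reduced-radix ("5 x 51-bit") representation described at the top of
 * the file, eval(z) = z[0] + 2^51*z[1] + 2^102*z[2] + 2^153*z[3] + 2^204*z[4].
 * Because p = 2^255 - 19, anything of weight 2^255 or more can be folded back
 * to weight 1 scaled by 19, which is why high partial products are pre-scaled
 * by UINT8_C(0x13) and why the final overflow is multiplied by 0x13 before
 * being added to limb 0.  Below is a compiled-out, plain-C carry pass
 * illustrating the fold: a hypothetical helper, not the ring code path, and it
 * assumes limbs within this file's "loose" bounds (only a couple of bits above
 * 2^51).
 */
#if 0
static void carry_sketch(uint64_t z[5]) {
  const uint64_t MASK51 = (UINT64_C(1) << 51) - 1;
  uint64_t c = 0;
  for (int i = 0; i < 5; i++) {
    uint64_t t = z[i] + c;   /* cannot overflow for loosely-bounded limbs */
    z[i] = t & MASK51;
    c = t >> 51;
  }
  z[0] += 19 * c;  /* 2^255 ≡ 19 (mod p); limb 0 may end up slightly above 2^51 */
}
#endif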
fiat_25519_mulx_u64(&x33, &x34, (arg1[0]), x7); fiat_25519_mulx_u64(&x35, &x36, (arg1[0]), x8); fiat_25519_mulx_u64(&x37, &x38, (arg1[0]), (arg1[0])); fiat_25519_addcarryx_u64(&x39, &x40, 0x0, x21, x17); fiat_25519_addcarryx_u64(&x41, &x42, x40, x22, x18); fiat_25519_addcarryx_u64(&x43, &x44, 0x0, x37, x39); fiat_25519_addcarryx_u64(&x45, &x46, x44, x38, x41); x47 = ((x43 >> 51) | ((x45 << 13) & UINT64_C(0xffffffffffffffff))); x48 = (x43 & UINT64_C(0x7ffffffffffff)); fiat_25519_addcarryx_u64(&x49, &x50, 0x0, x23, x19); fiat_25519_addcarryx_u64(&x51, &x52, x50, x24, x20); fiat_25519_addcarryx_u64(&x53, &x54, 0x0, x29, x49); fiat_25519_addcarryx_u64(&x55, &x56, x54, x30, x51); fiat_25519_addcarryx_u64(&x57, &x58, 0x0, x25, x9); fiat_25519_addcarryx_u64(&x59, &x60, x58, x26, x10); fiat_25519_addcarryx_u64(&x61, &x62, 0x0, x31, x57); fiat_25519_addcarryx_u64(&x63, &x64, x62, x32, x59); fiat_25519_addcarryx_u64(&x65, &x66, 0x0, x27, x11); fiat_25519_addcarryx_u64(&x67, &x68, x66, x28, x12); fiat_25519_addcarryx_u64(&x69, &x70, 0x0, x33, x65); fiat_25519_addcarryx_u64(&x71, &x72, x70, x34, x67); fiat_25519_addcarryx_u64(&x73, &x74, 0x0, x15, x13); fiat_25519_addcarryx_u64(&x75, &x76, x74, x16, x14); fiat_25519_addcarryx_u64(&x77, &x78, 0x0, x35, x73); fiat_25519_addcarryx_u64(&x79, &x80, x78, x36, x75); fiat_25519_addcarryx_u64(&x81, &x82, 0x0, x47, x77); x83 = (x82 + x79); x84 = ((x81 >> 51) | ((x83 << 13) & UINT64_C(0xffffffffffffffff))); x85 = (x81 & UINT64_C(0x7ffffffffffff)); fiat_25519_addcarryx_u64(&x86, &x87, 0x0, x84, x69); x88 = (x87 + x71); x89 = ((x86 >> 51) | ((x88 << 13) & UINT64_C(0xffffffffffffffff))); x90 = (x86 & UINT64_C(0x7ffffffffffff)); fiat_25519_addcarryx_u64(&x91, &x92, 0x0, x89, x61); x93 = (x92 + x63); x94 = ((x91 >> 51) | ((x93 << 13) & UINT64_C(0xffffffffffffffff))); x95 = (x91 & UINT64_C(0x7ffffffffffff)); fiat_25519_addcarryx_u64(&x96, &x97, 0x0, x94, x53); x98 = (x97 + x55); x99 = ((x96 >> 51) | ((x98 << 13) & UINT64_C(0xffffffffffffffff))); x100 = (x96 & UINT64_C(0x7ffffffffffff)); x101 = (x99 * UINT8_C(0x13)); x102 = (x48 + x101); x103 = (x102 >> 51); x104 = (x102 & UINT64_C(0x7ffffffffffff)); x105 = (x103 + x85); x106 = (fiat_25519_uint1)(x105 >> 51); x107 = (x105 & UINT64_C(0x7ffffffffffff)); x108 = (x106 + x90); out1[0] = x104; out1[1] = x107; out1[2] = x108; out1[3] = x95; out1[4] = x100; } /* * The function fiat_25519_carry reduces a field element. * * Postconditions: * eval out1 mod m = eval arg1 mod m * */ static FIAT_25519_FIAT_INLINE void fiat_25519_carry(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1) { uint64_t x1; uint64_t x2; uint64_t x3; uint64_t x4; uint64_t x5; uint64_t x6; uint64_t x7; uint64_t x8; uint64_t x9; uint64_t x10; uint64_t x11; uint64_t x12; x1 = (arg1[0]); x2 = ((x1 >> 51) + (arg1[1])); x3 = ((x2 >> 51) + (arg1[2])); x4 = ((x3 >> 51) + (arg1[3])); x5 = ((x4 >> 51) + (arg1[4])); x6 = ((x1 & UINT64_C(0x7ffffffffffff)) + ((x5 >> 51) * UINT8_C(0x13))); x7 = ((fiat_25519_uint1)(x6 >> 51) + (x2 & UINT64_C(0x7ffffffffffff))); x8 = (x6 & UINT64_C(0x7ffffffffffff)); x9 = (x7 & UINT64_C(0x7ffffffffffff)); x10 = ((fiat_25519_uint1)(x7 >> 51) + (x3 & UINT64_C(0x7ffffffffffff))); x11 = (x4 & UINT64_C(0x7ffffffffffff)); x12 = (x5 & UINT64_C(0x7ffffffffffff)); out1[0] = x8; out1[1] = x9; out1[2] = x10; out1[3] = x11; out1[4] = x12; } /* * The function fiat_25519_add adds two field elements. 
* * Postconditions: * eval out1 mod m = (eval arg1 + eval arg2) mod m * */ static FIAT_25519_FIAT_INLINE void fiat_25519_add(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1, const fiat_25519_tight_field_element arg2) { uint64_t x1; uint64_t x2; uint64_t x3; uint64_t x4; uint64_t x5; x1 = ((arg1[0]) + (arg2[0])); x2 = ((arg1[1]) + (arg2[1])); x3 = ((arg1[2]) + (arg2[2])); x4 = ((arg1[3]) + (arg2[3])); x5 = ((arg1[4]) + (arg2[4])); out1[0] = x1; out1[1] = x2; out1[2] = x3; out1[3] = x4; out1[4] = x5; } /* * The function fiat_25519_sub subtracts two field elements. * * Postconditions: * eval out1 mod m = (eval arg1 - eval arg2) mod m * */ static FIAT_25519_FIAT_INLINE void fiat_25519_sub(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1, const fiat_25519_tight_field_element arg2) { uint64_t x1; uint64_t x2; uint64_t x3; uint64_t x4; uint64_t x5; x1 = ((UINT64_C(0xfffffffffffda) + (arg1[0])) - (arg2[0])); x2 = ((UINT64_C(0xffffffffffffe) + (arg1[1])) - (arg2[1])); x3 = ((UINT64_C(0xffffffffffffe) + (arg1[2])) - (arg2[2])); x4 = ((UINT64_C(0xffffffffffffe) + (arg1[3])) - (arg2[3])); x5 = ((UINT64_C(0xffffffffffffe) + (arg1[4])) - (arg2[4])); out1[0] = x1; out1[1] = x2; out1[2] = x3; out1[3] = x4; out1[4] = x5; } /* * The function fiat_25519_opp negates a field element. * * Postconditions: * eval out1 mod m = -eval arg1 mod m * */ static FIAT_25519_FIAT_INLINE void fiat_25519_opp(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1) { uint64_t x1; uint64_t x2; uint64_t x3; uint64_t x4; uint64_t x5; x1 = (UINT64_C(0xfffffffffffda) - (arg1[0])); x2 = (UINT64_C(0xffffffffffffe) - (arg1[1])); x3 = (UINT64_C(0xffffffffffffe) - (arg1[2])); x4 = (UINT64_C(0xffffffffffffe) - (arg1[3])); x5 = (UINT64_C(0xffffffffffffe) - (arg1[4])); out1[0] = x1; out1[1] = x2; out1[2] = x3; out1[3] = x4; out1[4] = x5; } /* * The function fiat_25519_to_bytes serializes a field element to bytes in little-endian order. 
* * Postconditions: * out1 = map (λ x, ⌊((eval arg1 mod m) mod 2^(8 * (x + 1))) / 2^(8 * x)⌋) [0..31] * * Output Bounds: * out1: [[0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0x7f]] */ static FIAT_25519_FIAT_INLINE void fiat_25519_to_bytes(uint8_t out1[32], const fiat_25519_tight_field_element arg1) { uint64_t x1; fiat_25519_uint1 x2; uint64_t x3; fiat_25519_uint1 x4; uint64_t x5; fiat_25519_uint1 x6; uint64_t x7; fiat_25519_uint1 x8; uint64_t x9; fiat_25519_uint1 x10; uint64_t x11; uint64_t x12; fiat_25519_uint1 x13; uint64_t x14; fiat_25519_uint1 x15; uint64_t x16; fiat_25519_uint1 x17; uint64_t x18; fiat_25519_uint1 x19; uint64_t x20; fiat_25519_uint1 x21; uint64_t x22; uint64_t x23; uint64_t x24; uint64_t x25; uint8_t x26; uint64_t x27; uint8_t x28; uint64_t x29; uint8_t x30; uint64_t x31; uint8_t x32; uint64_t x33; uint8_t x34; uint64_t x35; uint8_t x36; uint8_t x37; uint64_t x38; uint8_t x39; uint64_t x40; uint8_t x41; uint64_t x42; uint8_t x43; uint64_t x44; uint8_t x45; uint64_t x46; uint8_t x47; uint64_t x48; uint8_t x49; uint8_t x50; uint64_t x51; uint8_t x52; uint64_t x53; uint8_t x54; uint64_t x55; uint8_t x56; uint64_t x57; uint8_t x58; uint64_t x59; uint8_t x60; uint64_t x61; uint8_t x62; uint64_t x63; uint8_t x64; fiat_25519_uint1 x65; uint64_t x66; uint8_t x67; uint64_t x68; uint8_t x69; uint64_t x70; uint8_t x71; uint64_t x72; uint8_t x73; uint64_t x74; uint8_t x75; uint64_t x76; uint8_t x77; uint8_t x78; uint64_t x79; uint8_t x80; uint64_t x81; uint8_t x82; uint64_t x83; uint8_t x84; uint64_t x85; uint8_t x86; uint64_t x87; uint8_t x88; uint64_t x89; uint8_t x90; uint8_t x91; fiat_25519_subborrowx_u51(&x1, &x2, 0x0, (arg1[0]), UINT64_C(0x7ffffffffffed)); fiat_25519_subborrowx_u51(&x3, &x4, x2, (arg1[1]), UINT64_C(0x7ffffffffffff)); fiat_25519_subborrowx_u51(&x5, &x6, x4, (arg1[2]), UINT64_C(0x7ffffffffffff)); fiat_25519_subborrowx_u51(&x7, &x8, x6, (arg1[3]), UINT64_C(0x7ffffffffffff)); fiat_25519_subborrowx_u51(&x9, &x10, x8, (arg1[4]), UINT64_C(0x7ffffffffffff)); fiat_25519_cmovznz_u64(&x11, x10, 0x0, UINT64_C(0xffffffffffffffff)); fiat_25519_addcarryx_u51(&x12, &x13, 0x0, x1, (x11 & UINT64_C(0x7ffffffffffed))); fiat_25519_addcarryx_u51(&x14, &x15, x13, x3, (x11 & UINT64_C(0x7ffffffffffff))); fiat_25519_addcarryx_u51(&x16, &x17, x15, x5, (x11 & UINT64_C(0x7ffffffffffff))); fiat_25519_addcarryx_u51(&x18, &x19, x17, x7, (x11 & UINT64_C(0x7ffffffffffff))); fiat_25519_addcarryx_u51(&x20, &x21, x19, x9, (x11 & UINT64_C(0x7ffffffffffff))); x22 = (x20 << 4); x23 = (x18 * (uint64_t)0x2); x24 = (x16 << 6); x25 = (x14 << 3); x26 = (uint8_t)(x12 & UINT8_C(0xff)); x27 = (x12 >> 8); x28 = (uint8_t)(x27 & UINT8_C(0xff)); x29 = (x27 >> 8); x30 = (uint8_t)(x29 & UINT8_C(0xff)); x31 = (x29 >> 8); x32 = (uint8_t)(x31 & UINT8_C(0xff)); x33 = (x31 >> 8); x34 = (uint8_t)(x33 & UINT8_C(0xff)); x35 = (x33 >> 8); x36 = (uint8_t)(x35 & UINT8_C(0xff)); x37 = (uint8_t)(x35 >> 8); x38 = (x25 + (uint64_t)x37); x39 = (uint8_t)(x38 & UINT8_C(0xff)); x40 = (x38 >> 8); x41 = (uint8_t)(x40 & UINT8_C(0xff)); x42 = (x40 >> 8); x43 = (uint8_t)(x42 & UINT8_C(0xff)); x44 = (x42 >> 8); x45 
= (uint8_t)(x44 & UINT8_C(0xff)); x46 = (x44 >> 8); x47 = (uint8_t)(x46 & UINT8_C(0xff)); x48 = (x46 >> 8); x49 = (uint8_t)(x48 & UINT8_C(0xff)); x50 = (uint8_t)(x48 >> 8); x51 = (x24 + (uint64_t)x50); x52 = (uint8_t)(x51 & UINT8_C(0xff)); x53 = (x51 >> 8); x54 = (uint8_t)(x53 & UINT8_C(0xff)); x55 = (x53 >> 8); x56 = (uint8_t)(x55 & UINT8_C(0xff)); x57 = (x55 >> 8); x58 = (uint8_t)(x57 & UINT8_C(0xff)); x59 = (x57 >> 8); x60 = (uint8_t)(x59 & UINT8_C(0xff)); x61 = (x59 >> 8); x62 = (uint8_t)(x61 & UINT8_C(0xff)); x63 = (x61 >> 8); x64 = (uint8_t)(x63 & UINT8_C(0xff)); x65 = (fiat_25519_uint1)(x63 >> 8); x66 = (x23 + (uint64_t)x65); x67 = (uint8_t)(x66 & UINT8_C(0xff)); x68 = (x66 >> 8); x69 = (uint8_t)(x68 & UINT8_C(0xff)); x70 = (x68 >> 8); x71 = (uint8_t)(x70 & UINT8_C(0xff)); x72 = (x70 >> 8); x73 = (uint8_t)(x72 & UINT8_C(0xff)); x74 = (x72 >> 8); x75 = (uint8_t)(x74 & UINT8_C(0xff)); x76 = (x74 >> 8); x77 = (uint8_t)(x76 & UINT8_C(0xff)); x78 = (uint8_t)(x76 >> 8); x79 = (x22 + (uint64_t)x78); x80 = (uint8_t)(x79 & UINT8_C(0xff)); x81 = (x79 >> 8); x82 = (uint8_t)(x81 & UINT8_C(0xff)); x83 = (x81 >> 8); x84 = (uint8_t)(x83 & UINT8_C(0xff)); x85 = (x83 >> 8); x86 = (uint8_t)(x85 & UINT8_C(0xff)); x87 = (x85 >> 8); x88 = (uint8_t)(x87 & UINT8_C(0xff)); x89 = (x87 >> 8); x90 = (uint8_t)(x89 & UINT8_C(0xff)); x91 = (uint8_t)(x89 >> 8); out1[0] = x26; out1[1] = x28; out1[2] = x30; out1[3] = x32; out1[4] = x34; out1[5] = x36; out1[6] = x39; out1[7] = x41; out1[8] = x43; out1[9] = x45; out1[10] = x47; out1[11] = x49; out1[12] = x52; out1[13] = x54; out1[14] = x56; out1[15] = x58; out1[16] = x60; out1[17] = x62; out1[18] = x64; out1[19] = x67; out1[20] = x69; out1[21] = x71; out1[22] = x73; out1[23] = x75; out1[24] = x77; out1[25] = x80; out1[26] = x82; out1[27] = x84; out1[28] = x86; out1[29] = x88; out1[30] = x90; out1[31] = x91; } /* * The function fiat_25519_from_bytes deserializes a field element from bytes in little-endian order. 
* * Postconditions: * eval out1 mod m = bytes_eval arg1 mod m * * Input Bounds: * arg1: [[0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0x7f]] */ static FIAT_25519_FIAT_INLINE void fiat_25519_from_bytes(fiat_25519_tight_field_element out1, const uint8_t arg1[32]) { uint64_t x1; uint64_t x2; uint64_t x3; uint64_t x4; uint64_t x5; uint64_t x6; uint64_t x7; uint64_t x8; uint64_t x9; uint64_t x10; uint64_t x11; uint64_t x12; uint64_t x13; uint64_t x14; uint64_t x15; uint64_t x16; uint64_t x17; uint64_t x18; uint64_t x19; uint64_t x20; uint64_t x21; uint64_t x22; uint64_t x23; uint64_t x24; uint64_t x25; uint64_t x26; uint64_t x27; uint64_t x28; uint64_t x29; uint64_t x30; uint64_t x31; uint8_t x32; uint64_t x33; uint64_t x34; uint64_t x35; uint64_t x36; uint64_t x37; uint64_t x38; uint64_t x39; uint8_t x40; uint64_t x41; uint64_t x42; uint64_t x43; uint64_t x44; uint64_t x45; uint64_t x46; uint64_t x47; uint8_t x48; uint64_t x49; uint64_t x50; uint64_t x51; uint64_t x52; uint64_t x53; uint64_t x54; uint64_t x55; uint64_t x56; uint8_t x57; uint64_t x58; uint64_t x59; uint64_t x60; uint64_t x61; uint64_t x62; uint64_t x63; uint64_t x64; uint8_t x65; uint64_t x66; uint64_t x67; uint64_t x68; uint64_t x69; uint64_t x70; uint64_t x71; x1 = ((uint64_t)(arg1[31]) << 44); x2 = ((uint64_t)(arg1[30]) << 36); x3 = ((uint64_t)(arg1[29]) << 28); x4 = ((uint64_t)(arg1[28]) << 20); x5 = ((uint64_t)(arg1[27]) << 12); x6 = ((uint64_t)(arg1[26]) << 4); x7 = ((uint64_t)(arg1[25]) << 47); x8 = ((uint64_t)(arg1[24]) << 39); x9 = ((uint64_t)(arg1[23]) << 31); x10 = ((uint64_t)(arg1[22]) << 23); x11 = ((uint64_t)(arg1[21]) << 15); x12 = ((uint64_t)(arg1[20]) << 7); x13 = ((uint64_t)(arg1[19]) << 50); x14 = ((uint64_t)(arg1[18]) << 42); x15 = ((uint64_t)(arg1[17]) << 34); x16 = ((uint64_t)(arg1[16]) << 26); x17 = ((uint64_t)(arg1[15]) << 18); x18 = ((uint64_t)(arg1[14]) << 10); x19 = ((uint64_t)(arg1[13]) << 2); x20 = ((uint64_t)(arg1[12]) << 45); x21 = ((uint64_t)(arg1[11]) << 37); x22 = ((uint64_t)(arg1[10]) << 29); x23 = ((uint64_t)(arg1[9]) << 21); x24 = ((uint64_t)(arg1[8]) << 13); x25 = ((uint64_t)(arg1[7]) << 5); x26 = ((uint64_t)(arg1[6]) << 48); x27 = ((uint64_t)(arg1[5]) << 40); x28 = ((uint64_t)(arg1[4]) << 32); x29 = ((uint64_t)(arg1[3]) << 24); x30 = ((uint64_t)(arg1[2]) << 16); x31 = ((uint64_t)(arg1[1]) << 8); x32 = (arg1[0]); x33 = (x31 + (uint64_t)x32); x34 = (x30 + x33); x35 = (x29 + x34); x36 = (x28 + x35); x37 = (x27 + x36); x38 = (x26 + x37); x39 = (x38 & UINT64_C(0x7ffffffffffff)); x40 = (uint8_t)(x38 >> 51); x41 = (x25 + (uint64_t)x40); x42 = (x24 + x41); x43 = (x23 + x42); x44 = (x22 + x43); x45 = (x21 + x44); x46 = (x20 + x45); x47 = (x46 & UINT64_C(0x7ffffffffffff)); x48 = (uint8_t)(x46 >> 51); x49 = (x19 + (uint64_t)x48); x50 = (x18 + x49); x51 = (x17 + x50); x52 = (x16 + x51); x53 = (x15 + x52); x54 = (x14 + x53); x55 = (x13 + x54); x56 = (x55 & UINT64_C(0x7ffffffffffff)); x57 = (uint8_t)(x55 >> 51); x58 = (x12 + (uint64_t)x57); x59 = (x11 + x58); x60 = (x10 + x59); x61 = (x9 + x60); x62 = (x8 + x61); x63 = (x7 + x62); x64 = (x63 & UINT64_C(0x7ffffffffffff)); x65 = 
(uint8_t)(x63 >> 51); x66 = (x6 + (uint64_t)x65); x67 = (x5 + x66); x68 = (x4 + x67); x69 = (x3 + x68); x70 = (x2 + x69); x71 = (x1 + x70); out1[0] = x39; out1[1] = x47; out1[2] = x56; out1[3] = x64; out1[4] = x71; } /* * The function fiat_25519_carry_scmul_121666 multiplies a field element by 121666 and reduces the result. * * Postconditions: * eval out1 mod m = (121666 * eval arg1) mod m * */ static FIAT_25519_FIAT_INLINE void fiat_25519_carry_scmul_121666(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1) { uint64_t x1; uint64_t x2; uint64_t x3; uint64_t x4; uint64_t x5; uint64_t x6; uint64_t x7; uint64_t x8; uint64_t x9; uint64_t x10; uint64_t x11; uint64_t x12; uint64_t x13; fiat_25519_uint1 x14; uint64_t x15; uint64_t x16; uint64_t x17; uint64_t x18; fiat_25519_uint1 x19; uint64_t x20; uint64_t x21; uint64_t x22; uint64_t x23; fiat_25519_uint1 x24; uint64_t x25; uint64_t x26; uint64_t x27; uint64_t x28; fiat_25519_uint1 x29; uint64_t x30; uint64_t x31; uint64_t x32; uint64_t x33; uint64_t x34; fiat_25519_uint1 x35; uint64_t x36; uint64_t x37; fiat_25519_uint1 x38; uint64_t x39; uint64_t x40; fiat_25519_mulx_u64(&x1, &x2, UINT32_C(0x1db42), (arg1[4])); fiat_25519_mulx_u64(&x3, &x4, UINT32_C(0x1db42), (arg1[3])); fiat_25519_mulx_u64(&x5, &x6, UINT32_C(0x1db42), (arg1[2])); fiat_25519_mulx_u64(&x7, &x8, UINT32_C(0x1db42), (arg1[1])); fiat_25519_mulx_u64(&x9, &x10, UINT32_C(0x1db42), (arg1[0])); x11 = ((x9 >> 51) | ((x10 << 13) & UINT64_C(0xffffffffffffffff))); x12 = (x9 & UINT64_C(0x7ffffffffffff)); fiat_25519_addcarryx_u64(&x13, &x14, 0x0, x11, x7); x15 = (x14 + x8); x16 = ((x13 >> 51) | ((x15 << 13) & UINT64_C(0xffffffffffffffff))); x17 = (x13 & UINT64_C(0x7ffffffffffff)); fiat_25519_addcarryx_u64(&x18, &x19, 0x0, x16, x5); x20 = (x19 + x6); x21 = ((x18 >> 51) | ((x20 << 13) & UINT64_C(0xffffffffffffffff))); x22 = (x18 & UINT64_C(0x7ffffffffffff)); fiat_25519_addcarryx_u64(&x23, &x24, 0x0, x21, x3); x25 = (x24 + x4); x26 = ((x23 >> 51) | ((x25 << 13) & UINT64_C(0xffffffffffffffff))); x27 = (x23 & UINT64_C(0x7ffffffffffff)); fiat_25519_addcarryx_u64(&x28, &x29, 0x0, x26, x1); x30 = (x29 + x2); x31 = ((x28 >> 51) | ((x30 << 13) & UINT64_C(0xffffffffffffffff))); x32 = (x28 & UINT64_C(0x7ffffffffffff)); x33 = (x31 * UINT8_C(0x13)); x34 = (x12 + x33); x35 = (fiat_25519_uint1)(x34 >> 51); x36 = (x34 & UINT64_C(0x7ffffffffffff)); x37 = (x35 + x17); x38 = (fiat_25519_uint1)(x37 >> 51); x39 = (x37 & UINT64_C(0x7ffffffffffff)); x40 = (x38 + x22); out1[0] = x36; out1[1] = x39; out1[2] = x40; out1[3] = x27; out1[4] = x32; } ring-0.17.14/third_party/fiat/p256_32.h000064400000000000000000002353751046102023000154010ustar 00000000000000/* Autogenerated: 'src/ExtractionOCaml/word_by_word_montgomery' --inline --static --use-value-barrier p256 32 '2^256 - 2^224 + 2^192 + 2^96 - 1' mul square add sub opp from_montgomery to_montgomery nonzero selectznz to_bytes from_bytes one msat divstep divstep_precomp */ /* curve description: p256 */ /* machine_wordsize = 32 (from "32") */ /* requested operations: mul, square, add, sub, opp, from_montgomery, to_montgomery, nonzero, selectznz, to_bytes, from_bytes, one, msat, divstep, divstep_precomp */ /* m = 0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff (from "2^256 - 2^224 + 2^192 + 2^96 - 1") */ /* */ /* NOTE: In addition to the bounds specified above each function, all */ /* functions synthesized for this Montgomery arithmetic require the */ /* input to be strictly less than the prime modulus (m), and also */ /* 
require the input to be in the unique saturated representation. */ /* All functions also ensure that these two properties are true of */ /* return values. */ /* */ /* Computed values: */ /* eval z = z[0] + (z[1] << 32) + (z[2] << 64) + (z[3] << 96) + (z[4] << 128) + (z[5] << 160) + (z[6] << 192) + (z[7] << 224) */ /* bytes_eval z = z[0] + (z[1] << 8) + (z[2] << 16) + (z[3] << 24) + (z[4] << 32) + (z[5] << 40) + (z[6] << 48) + (z[7] << 56) + (z[8] << 64) + (z[9] << 72) + (z[10] << 80) + (z[11] << 88) + (z[12] << 96) + (z[13] << 104) + (z[14] << 112) + (z[15] << 120) + (z[16] << 128) + (z[17] << 136) + (z[18] << 144) + (z[19] << 152) + (z[20] << 160) + (z[21] << 168) + (z[22] << 176) + (z[23] << 184) + (z[24] << 192) + (z[25] << 200) + (z[26] << 208) + (z[27] << 216) + (z[28] << 224) + (z[29] << 232) + (z[30] << 240) + (z[31] << 248) */ /* twos_complement_eval z = let x1 := z[0] + (z[1] << 32) + (z[2] << 64) + (z[3] << 96) + (z[4] << 128) + (z[5] << 160) + (z[6] << 192) + (z[7] << 224) in */ /* if x1 & (2^256-1) < 2^255 then x1 & (2^256-1) else (x1 & (2^256-1)) - 2^256 */ #include typedef unsigned char fiat_p256_uint1; typedef signed char fiat_p256_int1; #if defined(__GNUC__) || defined(__clang__) # define FIAT_P256_FIAT_INLINE __inline__ #else # define FIAT_P256_FIAT_INLINE #endif /* The type fiat_p256_montgomery_domain_field_element is a field element in the Montgomery domain. */ /* Bounds: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]] */ typedef uint32_t fiat_p256_montgomery_domain_field_element[8]; /* The type fiat_p256_non_montgomery_domain_field_element is a field element NOT in the Montgomery domain. */ /* Bounds: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]] */ typedef uint32_t fiat_p256_non_montgomery_domain_field_element[8]; #if (-1 & 3) != 3 #error "This code only works on a two's complement system" #endif #if !defined(FIAT_P256_NO_ASM) && (defined(__GNUC__) || defined(__clang__)) static __inline__ uint32_t fiat_p256_value_barrier_u32(uint32_t a) { __asm__("" : "+r"(a) : /* no inputs */); return a; } #else # define fiat_p256_value_barrier_u32(x) (x) #endif /* * The function fiat_p256_addcarryx_u32 is an addition with carry. * * Postconditions: * out1 = (arg1 + arg2 + arg3) mod 2^32 * out2 = ⌊(arg1 + arg2 + arg3) / 2^32⌋ * * Input Bounds: * arg1: [0x0 ~> 0x1] * arg2: [0x0 ~> 0xffffffff] * arg3: [0x0 ~> 0xffffffff] * Output Bounds: * out1: [0x0 ~> 0xffffffff] * out2: [0x0 ~> 0x1] */ static FIAT_P256_FIAT_INLINE void fiat_p256_addcarryx_u32(uint32_t* out1, fiat_p256_uint1* out2, fiat_p256_uint1 arg1, uint32_t arg2, uint32_t arg3) { uint64_t x1; uint32_t x2; fiat_p256_uint1 x3; x1 = ((arg1 + (uint64_t)arg2) + arg3); x2 = (uint32_t)(x1 & UINT32_C(0xffffffff)); x3 = (fiat_p256_uint1)(x1 >> 32); *out1 = x2; *out2 = x3; } /* * The function fiat_p256_subborrowx_u32 is a subtraction with borrow. 
* * Postconditions: * out1 = (-arg1 + arg2 + -arg3) mod 2^32 * out2 = -⌊(-arg1 + arg2 + -arg3) / 2^32⌋ * * Input Bounds: * arg1: [0x0 ~> 0x1] * arg2: [0x0 ~> 0xffffffff] * arg3: [0x0 ~> 0xffffffff] * Output Bounds: * out1: [0x0 ~> 0xffffffff] * out2: [0x0 ~> 0x1] */ static FIAT_P256_FIAT_INLINE void fiat_p256_subborrowx_u32(uint32_t* out1, fiat_p256_uint1* out2, fiat_p256_uint1 arg1, uint32_t arg2, uint32_t arg3) { int64_t x1; fiat_p256_int1 x2; uint32_t x3; x1 = ((arg2 - (int64_t)arg1) - arg3); x2 = (fiat_p256_int1)(x1 >> 32); x3 = (uint32_t)(x1 & UINT32_C(0xffffffff)); *out1 = x3; *out2 = (fiat_p256_uint1)(0x0 - x2); } /* * The function fiat_p256_mulx_u32 is a multiplication, returning the full double-width result. * * Postconditions: * out1 = (arg1 * arg2) mod 2^32 * out2 = ⌊arg1 * arg2 / 2^32⌋ * * Input Bounds: * arg1: [0x0 ~> 0xffffffff] * arg2: [0x0 ~> 0xffffffff] * Output Bounds: * out1: [0x0 ~> 0xffffffff] * out2: [0x0 ~> 0xffffffff] */ static FIAT_P256_FIAT_INLINE void fiat_p256_mulx_u32(uint32_t* out1, uint32_t* out2, uint32_t arg1, uint32_t arg2) { uint64_t x1; uint32_t x2; uint32_t x3; x1 = ((uint64_t)arg1 * arg2); x2 = (uint32_t)(x1 & UINT32_C(0xffffffff)); x3 = (uint32_t)(x1 >> 32); *out1 = x2; *out2 = x3; } /* * The function fiat_p256_cmovznz_u32 is a single-word conditional move. * * Postconditions: * out1 = (if arg1 = 0 then arg2 else arg3) * * Input Bounds: * arg1: [0x0 ~> 0x1] * arg2: [0x0 ~> 0xffffffff] * arg3: [0x0 ~> 0xffffffff] * Output Bounds: * out1: [0x0 ~> 0xffffffff] */ static FIAT_P256_FIAT_INLINE void fiat_p256_cmovznz_u32(uint32_t* out1, fiat_p256_uint1 arg1, uint32_t arg2, uint32_t arg3) { fiat_p256_uint1 x1; uint32_t x2; uint32_t x3; x1 = (!(!arg1)); x2 = ((fiat_p256_int1)(0x0 - x1) & UINT32_C(0xffffffff)); x3 = ((fiat_p256_value_barrier_u32(x2) & arg3) | (fiat_p256_value_barrier_u32((~x2)) & arg2)); *out1 = x3; } /* * The function fiat_p256_mul multiplies two field elements in the Montgomery domain. 
* * Preconditions: * 0 ≤ eval arg1 < m * 0 ≤ eval arg2 < m * Postconditions: * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) * eval (from_montgomery arg2)) mod m * 0 ≤ eval out1 < m * */ static FIAT_P256_FIAT_INLINE void fiat_p256_mul(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) { uint32_t x1; uint32_t x2; uint32_t x3; uint32_t x4; uint32_t x5; uint32_t x6; uint32_t x7; uint32_t x8; uint32_t x9; uint32_t x10; uint32_t x11; uint32_t x12; uint32_t x13; uint32_t x14; uint32_t x15; uint32_t x16; uint32_t x17; uint32_t x18; uint32_t x19; uint32_t x20; uint32_t x21; uint32_t x22; uint32_t x23; uint32_t x24; uint32_t x25; fiat_p256_uint1 x26; uint32_t x27; fiat_p256_uint1 x28; uint32_t x29; fiat_p256_uint1 x30; uint32_t x31; fiat_p256_uint1 x32; uint32_t x33; fiat_p256_uint1 x34; uint32_t x35; fiat_p256_uint1 x36; uint32_t x37; fiat_p256_uint1 x38; uint32_t x39; uint32_t x40; uint32_t x41; uint32_t x42; uint32_t x43; uint32_t x44; uint32_t x45; uint32_t x46; uint32_t x47; uint32_t x48; fiat_p256_uint1 x49; uint32_t x50; fiat_p256_uint1 x51; uint32_t x52; uint32_t x53; fiat_p256_uint1 x54; uint32_t x55; fiat_p256_uint1 x56; uint32_t x57; fiat_p256_uint1 x58; uint32_t x59; fiat_p256_uint1 x60; uint32_t x61; fiat_p256_uint1 x62; uint32_t x63; fiat_p256_uint1 x64; uint32_t x65; fiat_p256_uint1 x66; uint32_t x67; fiat_p256_uint1 x68; uint32_t x69; fiat_p256_uint1 x70; uint32_t x71; uint32_t x72; uint32_t x73; uint32_t x74; uint32_t x75; uint32_t x76; uint32_t x77; uint32_t x78; uint32_t x79; uint32_t x80; uint32_t x81; uint32_t x82; uint32_t x83; uint32_t x84; uint32_t x85; uint32_t x86; uint32_t x87; fiat_p256_uint1 x88; uint32_t x89; fiat_p256_uint1 x90; uint32_t x91; fiat_p256_uint1 x92; uint32_t x93; fiat_p256_uint1 x94; uint32_t x95; fiat_p256_uint1 x96; uint32_t x97; fiat_p256_uint1 x98; uint32_t x99; fiat_p256_uint1 x100; uint32_t x101; uint32_t x102; fiat_p256_uint1 x103; uint32_t x104; fiat_p256_uint1 x105; uint32_t x106; fiat_p256_uint1 x107; uint32_t x108; fiat_p256_uint1 x109; uint32_t x110; fiat_p256_uint1 x111; uint32_t x112; fiat_p256_uint1 x113; uint32_t x114; fiat_p256_uint1 x115; uint32_t x116; fiat_p256_uint1 x117; uint32_t x118; fiat_p256_uint1 x119; uint32_t x120; uint32_t x121; uint32_t x122; uint32_t x123; uint32_t x124; uint32_t x125; uint32_t x126; uint32_t x127; uint32_t x128; fiat_p256_uint1 x129; uint32_t x130; fiat_p256_uint1 x131; uint32_t x132; uint32_t x133; fiat_p256_uint1 x134; uint32_t x135; fiat_p256_uint1 x136; uint32_t x137; fiat_p256_uint1 x138; uint32_t x139; fiat_p256_uint1 x140; uint32_t x141; fiat_p256_uint1 x142; uint32_t x143; fiat_p256_uint1 x144; uint32_t x145; fiat_p256_uint1 x146; uint32_t x147; fiat_p256_uint1 x148; uint32_t x149; fiat_p256_uint1 x150; uint32_t x151; uint32_t x152; uint32_t x153; uint32_t x154; uint32_t x155; uint32_t x156; uint32_t x157; uint32_t x158; uint32_t x159; uint32_t x160; uint32_t x161; uint32_t x162; uint32_t x163; uint32_t x164; uint32_t x165; uint32_t x166; uint32_t x167; uint32_t x168; fiat_p256_uint1 x169; uint32_t x170; fiat_p256_uint1 x171; uint32_t x172; fiat_p256_uint1 x173; uint32_t x174; fiat_p256_uint1 x175; uint32_t x176; fiat_p256_uint1 x177; uint32_t x178; fiat_p256_uint1 x179; uint32_t x180; fiat_p256_uint1 x181; uint32_t x182; uint32_t x183; fiat_p256_uint1 x184; uint32_t x185; fiat_p256_uint1 x186; uint32_t x187; fiat_p256_uint1 x188; uint32_t x189; fiat_p256_uint1 x190; uint32_t 
x191; fiat_p256_uint1 x192; uint32_t x193; fiat_p256_uint1 x194; uint32_t x195; fiat_p256_uint1 x196; uint32_t x197; fiat_p256_uint1 x198; uint32_t x199; fiat_p256_uint1 x200; uint32_t x201; uint32_t x202; uint32_t x203; uint32_t x204; uint32_t x205; uint32_t x206; uint32_t x207; uint32_t x208; uint32_t x209; fiat_p256_uint1 x210; uint32_t x211; fiat_p256_uint1 x212; uint32_t x213; uint32_t x214; fiat_p256_uint1 x215; uint32_t x216; fiat_p256_uint1 x217; uint32_t x218; fiat_p256_uint1 x219; uint32_t x220; fiat_p256_uint1 x221; uint32_t x222; fiat_p256_uint1 x223; uint32_t x224; fiat_p256_uint1 x225; uint32_t x226; fiat_p256_uint1 x227; uint32_t x228; fiat_p256_uint1 x229; uint32_t x230; fiat_p256_uint1 x231; uint32_t x232; uint32_t x233; uint32_t x234; uint32_t x235; uint32_t x236; uint32_t x237; uint32_t x238; uint32_t x239; uint32_t x240; uint32_t x241; uint32_t x242; uint32_t x243; uint32_t x244; uint32_t x245; uint32_t x246; uint32_t x247; uint32_t x248; uint32_t x249; fiat_p256_uint1 x250; uint32_t x251; fiat_p256_uint1 x252; uint32_t x253; fiat_p256_uint1 x254; uint32_t x255; fiat_p256_uint1 x256; uint32_t x257; fiat_p256_uint1 x258; uint32_t x259; fiat_p256_uint1 x260; uint32_t x261; fiat_p256_uint1 x262; uint32_t x263; uint32_t x264; fiat_p256_uint1 x265; uint32_t x266; fiat_p256_uint1 x267; uint32_t x268; fiat_p256_uint1 x269; uint32_t x270; fiat_p256_uint1 x271; uint32_t x272; fiat_p256_uint1 x273; uint32_t x274; fiat_p256_uint1 x275; uint32_t x276; fiat_p256_uint1 x277; uint32_t x278; fiat_p256_uint1 x279; uint32_t x280; fiat_p256_uint1 x281; uint32_t x282; uint32_t x283; uint32_t x284; uint32_t x285; uint32_t x286; uint32_t x287; uint32_t x288; uint32_t x289; uint32_t x290; fiat_p256_uint1 x291; uint32_t x292; fiat_p256_uint1 x293; uint32_t x294; uint32_t x295; fiat_p256_uint1 x296; uint32_t x297; fiat_p256_uint1 x298; uint32_t x299; fiat_p256_uint1 x300; uint32_t x301; fiat_p256_uint1 x302; uint32_t x303; fiat_p256_uint1 x304; uint32_t x305; fiat_p256_uint1 x306; uint32_t x307; fiat_p256_uint1 x308; uint32_t x309; fiat_p256_uint1 x310; uint32_t x311; fiat_p256_uint1 x312; uint32_t x313; uint32_t x314; uint32_t x315; uint32_t x316; uint32_t x317; uint32_t x318; uint32_t x319; uint32_t x320; uint32_t x321; uint32_t x322; uint32_t x323; uint32_t x324; uint32_t x325; uint32_t x326; uint32_t x327; uint32_t x328; uint32_t x329; uint32_t x330; fiat_p256_uint1 x331; uint32_t x332; fiat_p256_uint1 x333; uint32_t x334; fiat_p256_uint1 x335; uint32_t x336; fiat_p256_uint1 x337; uint32_t x338; fiat_p256_uint1 x339; uint32_t x340; fiat_p256_uint1 x341; uint32_t x342; fiat_p256_uint1 x343; uint32_t x344; uint32_t x345; fiat_p256_uint1 x346; uint32_t x347; fiat_p256_uint1 x348; uint32_t x349; fiat_p256_uint1 x350; uint32_t x351; fiat_p256_uint1 x352; uint32_t x353; fiat_p256_uint1 x354; uint32_t x355; fiat_p256_uint1 x356; uint32_t x357; fiat_p256_uint1 x358; uint32_t x359; fiat_p256_uint1 x360; uint32_t x361; fiat_p256_uint1 x362; uint32_t x363; uint32_t x364; uint32_t x365; uint32_t x366; uint32_t x367; uint32_t x368; uint32_t x369; uint32_t x370; uint32_t x371; fiat_p256_uint1 x372; uint32_t x373; fiat_p256_uint1 x374; uint32_t x375; uint32_t x376; fiat_p256_uint1 x377; uint32_t x378; fiat_p256_uint1 x379; uint32_t x380; fiat_p256_uint1 x381; uint32_t x382; fiat_p256_uint1 x383; uint32_t x384; fiat_p256_uint1 x385; uint32_t x386; fiat_p256_uint1 x387; uint32_t x388; fiat_p256_uint1 x389; uint32_t x390; fiat_p256_uint1 x391; uint32_t x392; fiat_p256_uint1 x393; uint32_t x394; uint32_t 
x395; uint32_t x396; uint32_t x397; uint32_t x398; uint32_t x399; uint32_t x400; uint32_t x401; uint32_t x402; uint32_t x403; uint32_t x404; uint32_t x405; uint32_t x406; uint32_t x407; uint32_t x408; uint32_t x409; uint32_t x410; uint32_t x411; fiat_p256_uint1 x412; uint32_t x413; fiat_p256_uint1 x414; uint32_t x415; fiat_p256_uint1 x416; uint32_t x417; fiat_p256_uint1 x418; uint32_t x419; fiat_p256_uint1 x420; uint32_t x421; fiat_p256_uint1 x422; uint32_t x423; fiat_p256_uint1 x424; uint32_t x425; uint32_t x426; fiat_p256_uint1 x427; uint32_t x428; fiat_p256_uint1 x429; uint32_t x430; fiat_p256_uint1 x431; uint32_t x432; fiat_p256_uint1 x433; uint32_t x434; fiat_p256_uint1 x435; uint32_t x436; fiat_p256_uint1 x437; uint32_t x438; fiat_p256_uint1 x439; uint32_t x440; fiat_p256_uint1 x441; uint32_t x442; fiat_p256_uint1 x443; uint32_t x444; uint32_t x445; uint32_t x446; uint32_t x447; uint32_t x448; uint32_t x449; uint32_t x450; uint32_t x451; uint32_t x452; fiat_p256_uint1 x453; uint32_t x454; fiat_p256_uint1 x455; uint32_t x456; uint32_t x457; fiat_p256_uint1 x458; uint32_t x459; fiat_p256_uint1 x460; uint32_t x461; fiat_p256_uint1 x462; uint32_t x463; fiat_p256_uint1 x464; uint32_t x465; fiat_p256_uint1 x466; uint32_t x467; fiat_p256_uint1 x468; uint32_t x469; fiat_p256_uint1 x470; uint32_t x471; fiat_p256_uint1 x472; uint32_t x473; fiat_p256_uint1 x474; uint32_t x475; uint32_t x476; uint32_t x477; uint32_t x478; uint32_t x479; uint32_t x480; uint32_t x481; uint32_t x482; uint32_t x483; uint32_t x484; uint32_t x485; uint32_t x486; uint32_t x487; uint32_t x488; uint32_t x489; uint32_t x490; uint32_t x491; uint32_t x492; fiat_p256_uint1 x493; uint32_t x494; fiat_p256_uint1 x495; uint32_t x496; fiat_p256_uint1 x497; uint32_t x498; fiat_p256_uint1 x499; uint32_t x500; fiat_p256_uint1 x501; uint32_t x502; fiat_p256_uint1 x503; uint32_t x504; fiat_p256_uint1 x505; uint32_t x506; uint32_t x507; fiat_p256_uint1 x508; uint32_t x509; fiat_p256_uint1 x510; uint32_t x511; fiat_p256_uint1 x512; uint32_t x513; fiat_p256_uint1 x514; uint32_t x515; fiat_p256_uint1 x516; uint32_t x517; fiat_p256_uint1 x518; uint32_t x519; fiat_p256_uint1 x520; uint32_t x521; fiat_p256_uint1 x522; uint32_t x523; fiat_p256_uint1 x524; uint32_t x525; uint32_t x526; uint32_t x527; uint32_t x528; uint32_t x529; uint32_t x530; uint32_t x531; uint32_t x532; uint32_t x533; fiat_p256_uint1 x534; uint32_t x535; fiat_p256_uint1 x536; uint32_t x537; uint32_t x538; fiat_p256_uint1 x539; uint32_t x540; fiat_p256_uint1 x541; uint32_t x542; fiat_p256_uint1 x543; uint32_t x544; fiat_p256_uint1 x545; uint32_t x546; fiat_p256_uint1 x547; uint32_t x548; fiat_p256_uint1 x549; uint32_t x550; fiat_p256_uint1 x551; uint32_t x552; fiat_p256_uint1 x553; uint32_t x554; fiat_p256_uint1 x555; uint32_t x556; uint32_t x557; uint32_t x558; uint32_t x559; uint32_t x560; uint32_t x561; uint32_t x562; uint32_t x563; uint32_t x564; uint32_t x565; uint32_t x566; uint32_t x567; uint32_t x568; uint32_t x569; uint32_t x570; uint32_t x571; uint32_t x572; uint32_t x573; fiat_p256_uint1 x574; uint32_t x575; fiat_p256_uint1 x576; uint32_t x577; fiat_p256_uint1 x578; uint32_t x579; fiat_p256_uint1 x580; uint32_t x581; fiat_p256_uint1 x582; uint32_t x583; fiat_p256_uint1 x584; uint32_t x585; fiat_p256_uint1 x586; uint32_t x587; uint32_t x588; fiat_p256_uint1 x589; uint32_t x590; fiat_p256_uint1 x591; uint32_t x592; fiat_p256_uint1 x593; uint32_t x594; fiat_p256_uint1 x595; uint32_t x596; fiat_p256_uint1 x597; uint32_t x598; fiat_p256_uint1 x599; uint32_t x600; 
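/* Layout of the body below: the word_by_word_montgomery synthesis that generated this
   file emits flattened single-assignment code. For each of the eight 32-bit limbs of
   arg1 it accumulates arg1[i]*arg2 into a running total and then performs one
   Montgomery reduction step on the new low word; a final conditional subtraction of m
   brings the result back below m. */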
fiat_p256_uint1 x601; uint32_t x602; fiat_p256_uint1 x603; uint32_t x604; fiat_p256_uint1 x605; uint32_t x606; uint32_t x607; uint32_t x608; uint32_t x609; uint32_t x610; uint32_t x611; uint32_t x612; uint32_t x613; uint32_t x614; fiat_p256_uint1 x615; uint32_t x616; fiat_p256_uint1 x617; uint32_t x618; uint32_t x619; fiat_p256_uint1 x620; uint32_t x621; fiat_p256_uint1 x622; uint32_t x623; fiat_p256_uint1 x624; uint32_t x625; fiat_p256_uint1 x626; uint32_t x627; fiat_p256_uint1 x628; uint32_t x629; fiat_p256_uint1 x630; uint32_t x631; fiat_p256_uint1 x632; uint32_t x633; fiat_p256_uint1 x634; uint32_t x635; fiat_p256_uint1 x636; uint32_t x637; uint32_t x638; fiat_p256_uint1 x639; uint32_t x640; fiat_p256_uint1 x641; uint32_t x642; fiat_p256_uint1 x643; uint32_t x644; fiat_p256_uint1 x645; uint32_t x646; fiat_p256_uint1 x647; uint32_t x648; fiat_p256_uint1 x649; uint32_t x650; fiat_p256_uint1 x651; uint32_t x652; fiat_p256_uint1 x653; uint32_t x654; fiat_p256_uint1 x655; uint32_t x656; uint32_t x657; uint32_t x658; uint32_t x659; uint32_t x660; uint32_t x661; uint32_t x662; uint32_t x663; x1 = (arg1[1]); x2 = (arg1[2]); x3 = (arg1[3]); x4 = (arg1[4]); x5 = (arg1[5]); x6 = (arg1[6]); x7 = (arg1[7]); x8 = (arg1[0]); fiat_p256_mulx_u32(&x9, &x10, x8, (arg2[7])); fiat_p256_mulx_u32(&x11, &x12, x8, (arg2[6])); fiat_p256_mulx_u32(&x13, &x14, x8, (arg2[5])); fiat_p256_mulx_u32(&x15, &x16, x8, (arg2[4])); fiat_p256_mulx_u32(&x17, &x18, x8, (arg2[3])); fiat_p256_mulx_u32(&x19, &x20, x8, (arg2[2])); fiat_p256_mulx_u32(&x21, &x22, x8, (arg2[1])); fiat_p256_mulx_u32(&x23, &x24, x8, (arg2[0])); fiat_p256_addcarryx_u32(&x25, &x26, 0x0, x24, x21); fiat_p256_addcarryx_u32(&x27, &x28, x26, x22, x19); fiat_p256_addcarryx_u32(&x29, &x30, x28, x20, x17); fiat_p256_addcarryx_u32(&x31, &x32, x30, x18, x15); fiat_p256_addcarryx_u32(&x33, &x34, x32, x16, x13); fiat_p256_addcarryx_u32(&x35, &x36, x34, x14, x11); fiat_p256_addcarryx_u32(&x37, &x38, x36, x12, x9); x39 = (x38 + x10); fiat_p256_mulx_u32(&x40, &x41, x23, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x42, &x43, x23, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x44, &x45, x23, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x46, &x47, x23, UINT32_C(0xffffffff)); fiat_p256_addcarryx_u32(&x48, &x49, 0x0, x47, x44); fiat_p256_addcarryx_u32(&x50, &x51, x49, x45, x42); x52 = (x51 + x43); fiat_p256_addcarryx_u32(&x53, &x54, 0x0, x23, x46); fiat_p256_addcarryx_u32(&x55, &x56, x54, x25, x48); fiat_p256_addcarryx_u32(&x57, &x58, x56, x27, x50); fiat_p256_addcarryx_u32(&x59, &x60, x58, x29, x52); fiat_p256_addcarryx_u32(&x61, &x62, x60, x31, 0x0); fiat_p256_addcarryx_u32(&x63, &x64, x62, x33, 0x0); fiat_p256_addcarryx_u32(&x65, &x66, x64, x35, x23); fiat_p256_addcarryx_u32(&x67, &x68, x66, x37, x40); fiat_p256_addcarryx_u32(&x69, &x70, x68, x39, x41); fiat_p256_mulx_u32(&x71, &x72, x1, (arg2[7])); fiat_p256_mulx_u32(&x73, &x74, x1, (arg2[6])); fiat_p256_mulx_u32(&x75, &x76, x1, (arg2[5])); fiat_p256_mulx_u32(&x77, &x78, x1, (arg2[4])); fiat_p256_mulx_u32(&x79, &x80, x1, (arg2[3])); fiat_p256_mulx_u32(&x81, &x82, x1, (arg2[2])); fiat_p256_mulx_u32(&x83, &x84, x1, (arg2[1])); fiat_p256_mulx_u32(&x85, &x86, x1, (arg2[0])); fiat_p256_addcarryx_u32(&x87, &x88, 0x0, x86, x83); fiat_p256_addcarryx_u32(&x89, &x90, x88, x84, x81); fiat_p256_addcarryx_u32(&x91, &x92, x90, x82, x79); fiat_p256_addcarryx_u32(&x93, &x94, x92, x80, x77); fiat_p256_addcarryx_u32(&x95, &x96, x94, x78, x75); fiat_p256_addcarryx_u32(&x97, &x98, x96, x76, x73); fiat_p256_addcarryx_u32(&x99, &x100, x98, x74, 
x71); x101 = (x100 + x72); fiat_p256_addcarryx_u32(&x102, &x103, 0x0, x55, x85); fiat_p256_addcarryx_u32(&x104, &x105, x103, x57, x87); fiat_p256_addcarryx_u32(&x106, &x107, x105, x59, x89); fiat_p256_addcarryx_u32(&x108, &x109, x107, x61, x91); fiat_p256_addcarryx_u32(&x110, &x111, x109, x63, x93); fiat_p256_addcarryx_u32(&x112, &x113, x111, x65, x95); fiat_p256_addcarryx_u32(&x114, &x115, x113, x67, x97); fiat_p256_addcarryx_u32(&x116, &x117, x115, x69, x99); fiat_p256_addcarryx_u32(&x118, &x119, x117, x70, x101); fiat_p256_mulx_u32(&x120, &x121, x102, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x122, &x123, x102, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x124, &x125, x102, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x126, &x127, x102, UINT32_C(0xffffffff)); fiat_p256_addcarryx_u32(&x128, &x129, 0x0, x127, x124); fiat_p256_addcarryx_u32(&x130, &x131, x129, x125, x122); x132 = (x131 + x123); fiat_p256_addcarryx_u32(&x133, &x134, 0x0, x102, x126); fiat_p256_addcarryx_u32(&x135, &x136, x134, x104, x128); fiat_p256_addcarryx_u32(&x137, &x138, x136, x106, x130); fiat_p256_addcarryx_u32(&x139, &x140, x138, x108, x132); fiat_p256_addcarryx_u32(&x141, &x142, x140, x110, 0x0); fiat_p256_addcarryx_u32(&x143, &x144, x142, x112, 0x0); fiat_p256_addcarryx_u32(&x145, &x146, x144, x114, x102); fiat_p256_addcarryx_u32(&x147, &x148, x146, x116, x120); fiat_p256_addcarryx_u32(&x149, &x150, x148, x118, x121); x151 = ((uint32_t)x150 + x119); fiat_p256_mulx_u32(&x152, &x153, x2, (arg2[7])); fiat_p256_mulx_u32(&x154, &x155, x2, (arg2[6])); fiat_p256_mulx_u32(&x156, &x157, x2, (arg2[5])); fiat_p256_mulx_u32(&x158, &x159, x2, (arg2[4])); fiat_p256_mulx_u32(&x160, &x161, x2, (arg2[3])); fiat_p256_mulx_u32(&x162, &x163, x2, (arg2[2])); fiat_p256_mulx_u32(&x164, &x165, x2, (arg2[1])); fiat_p256_mulx_u32(&x166, &x167, x2, (arg2[0])); fiat_p256_addcarryx_u32(&x168, &x169, 0x0, x167, x164); fiat_p256_addcarryx_u32(&x170, &x171, x169, x165, x162); fiat_p256_addcarryx_u32(&x172, &x173, x171, x163, x160); fiat_p256_addcarryx_u32(&x174, &x175, x173, x161, x158); fiat_p256_addcarryx_u32(&x176, &x177, x175, x159, x156); fiat_p256_addcarryx_u32(&x178, &x179, x177, x157, x154); fiat_p256_addcarryx_u32(&x180, &x181, x179, x155, x152); x182 = (x181 + x153); fiat_p256_addcarryx_u32(&x183, &x184, 0x0, x135, x166); fiat_p256_addcarryx_u32(&x185, &x186, x184, x137, x168); fiat_p256_addcarryx_u32(&x187, &x188, x186, x139, x170); fiat_p256_addcarryx_u32(&x189, &x190, x188, x141, x172); fiat_p256_addcarryx_u32(&x191, &x192, x190, x143, x174); fiat_p256_addcarryx_u32(&x193, &x194, x192, x145, x176); fiat_p256_addcarryx_u32(&x195, &x196, x194, x147, x178); fiat_p256_addcarryx_u32(&x197, &x198, x196, x149, x180); fiat_p256_addcarryx_u32(&x199, &x200, x198, x151, x182); fiat_p256_mulx_u32(&x201, &x202, x183, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x203, &x204, x183, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x205, &x206, x183, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x207, &x208, x183, UINT32_C(0xffffffff)); fiat_p256_addcarryx_u32(&x209, &x210, 0x0, x208, x205); fiat_p256_addcarryx_u32(&x211, &x212, x210, x206, x203); x213 = (x212 + x204); fiat_p256_addcarryx_u32(&x214, &x215, 0x0, x183, x207); fiat_p256_addcarryx_u32(&x216, &x217, x215, x185, x209); fiat_p256_addcarryx_u32(&x218, &x219, x217, x187, x211); fiat_p256_addcarryx_u32(&x220, &x221, x219, x189, x213); fiat_p256_addcarryx_u32(&x222, &x223, x221, x191, 0x0); fiat_p256_addcarryx_u32(&x224, &x225, x223, x193, 0x0); fiat_p256_addcarryx_u32(&x226, &x227, x225, x195, x183); 
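/* Montgomery reduction step for this round (multiplier t = x183, the current low
   accumulator word): m mod 2^32 = 0xffffffff, so -m^-1 mod 2^32 = 1 and t needs no
   separate scaling. m's little-endian 32-bit limbs are {0xffffffff, 0xffffffff,
   0xffffffff, 0, 0, 0, 1, 0xffffffff}, which is why each reduction multiplies t only
   by 0xffffffff (four fiat_p256_mulx_u32 calls), adds t itself once for the limb equal
   to 1, and otherwise just propagates carries. */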
fiat_p256_addcarryx_u32(&x228, &x229, x227, x197, x201); fiat_p256_addcarryx_u32(&x230, &x231, x229, x199, x202); x232 = ((uint32_t)x231 + x200); fiat_p256_mulx_u32(&x233, &x234, x3, (arg2[7])); fiat_p256_mulx_u32(&x235, &x236, x3, (arg2[6])); fiat_p256_mulx_u32(&x237, &x238, x3, (arg2[5])); fiat_p256_mulx_u32(&x239, &x240, x3, (arg2[4])); fiat_p256_mulx_u32(&x241, &x242, x3, (arg2[3])); fiat_p256_mulx_u32(&x243, &x244, x3, (arg2[2])); fiat_p256_mulx_u32(&x245, &x246, x3, (arg2[1])); fiat_p256_mulx_u32(&x247, &x248, x3, (arg2[0])); fiat_p256_addcarryx_u32(&x249, &x250, 0x0, x248, x245); fiat_p256_addcarryx_u32(&x251, &x252, x250, x246, x243); fiat_p256_addcarryx_u32(&x253, &x254, x252, x244, x241); fiat_p256_addcarryx_u32(&x255, &x256, x254, x242, x239); fiat_p256_addcarryx_u32(&x257, &x258, x256, x240, x237); fiat_p256_addcarryx_u32(&x259, &x260, x258, x238, x235); fiat_p256_addcarryx_u32(&x261, &x262, x260, x236, x233); x263 = (x262 + x234); fiat_p256_addcarryx_u32(&x264, &x265, 0x0, x216, x247); fiat_p256_addcarryx_u32(&x266, &x267, x265, x218, x249); fiat_p256_addcarryx_u32(&x268, &x269, x267, x220, x251); fiat_p256_addcarryx_u32(&x270, &x271, x269, x222, x253); fiat_p256_addcarryx_u32(&x272, &x273, x271, x224, x255); fiat_p256_addcarryx_u32(&x274, &x275, x273, x226, x257); fiat_p256_addcarryx_u32(&x276, &x277, x275, x228, x259); fiat_p256_addcarryx_u32(&x278, &x279, x277, x230, x261); fiat_p256_addcarryx_u32(&x280, &x281, x279, x232, x263); fiat_p256_mulx_u32(&x282, &x283, x264, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x284, &x285, x264, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x286, &x287, x264, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x288, &x289, x264, UINT32_C(0xffffffff)); fiat_p256_addcarryx_u32(&x290, &x291, 0x0, x289, x286); fiat_p256_addcarryx_u32(&x292, &x293, x291, x287, x284); x294 = (x293 + x285); fiat_p256_addcarryx_u32(&x295, &x296, 0x0, x264, x288); fiat_p256_addcarryx_u32(&x297, &x298, x296, x266, x290); fiat_p256_addcarryx_u32(&x299, &x300, x298, x268, x292); fiat_p256_addcarryx_u32(&x301, &x302, x300, x270, x294); fiat_p256_addcarryx_u32(&x303, &x304, x302, x272, 0x0); fiat_p256_addcarryx_u32(&x305, &x306, x304, x274, 0x0); fiat_p256_addcarryx_u32(&x307, &x308, x306, x276, x264); fiat_p256_addcarryx_u32(&x309, &x310, x308, x278, x282); fiat_p256_addcarryx_u32(&x311, &x312, x310, x280, x283); x313 = ((uint32_t)x312 + x281); fiat_p256_mulx_u32(&x314, &x315, x4, (arg2[7])); fiat_p256_mulx_u32(&x316, &x317, x4, (arg2[6])); fiat_p256_mulx_u32(&x318, &x319, x4, (arg2[5])); fiat_p256_mulx_u32(&x320, &x321, x4, (arg2[4])); fiat_p256_mulx_u32(&x322, &x323, x4, (arg2[3])); fiat_p256_mulx_u32(&x324, &x325, x4, (arg2[2])); fiat_p256_mulx_u32(&x326, &x327, x4, (arg2[1])); fiat_p256_mulx_u32(&x328, &x329, x4, (arg2[0])); fiat_p256_addcarryx_u32(&x330, &x331, 0x0, x329, x326); fiat_p256_addcarryx_u32(&x332, &x333, x331, x327, x324); fiat_p256_addcarryx_u32(&x334, &x335, x333, x325, x322); fiat_p256_addcarryx_u32(&x336, &x337, x335, x323, x320); fiat_p256_addcarryx_u32(&x338, &x339, x337, x321, x318); fiat_p256_addcarryx_u32(&x340, &x341, x339, x319, x316); fiat_p256_addcarryx_u32(&x342, &x343, x341, x317, x314); x344 = (x343 + x315); fiat_p256_addcarryx_u32(&x345, &x346, 0x0, x297, x328); fiat_p256_addcarryx_u32(&x347, &x348, x346, x299, x330); fiat_p256_addcarryx_u32(&x349, &x350, x348, x301, x332); fiat_p256_addcarryx_u32(&x351, &x352, x350, x303, x334); fiat_p256_addcarryx_u32(&x353, &x354, x352, x305, x336); fiat_p256_addcarryx_u32(&x355, &x356, x354, x307, x338); 
fiat_p256_addcarryx_u32(&x357, &x358, x356, x309, x340); fiat_p256_addcarryx_u32(&x359, &x360, x358, x311, x342); fiat_p256_addcarryx_u32(&x361, &x362, x360, x313, x344); fiat_p256_mulx_u32(&x363, &x364, x345, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x365, &x366, x345, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x367, &x368, x345, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x369, &x370, x345, UINT32_C(0xffffffff)); fiat_p256_addcarryx_u32(&x371, &x372, 0x0, x370, x367); fiat_p256_addcarryx_u32(&x373, &x374, x372, x368, x365); x375 = (x374 + x366); fiat_p256_addcarryx_u32(&x376, &x377, 0x0, x345, x369); fiat_p256_addcarryx_u32(&x378, &x379, x377, x347, x371); fiat_p256_addcarryx_u32(&x380, &x381, x379, x349, x373); fiat_p256_addcarryx_u32(&x382, &x383, x381, x351, x375); fiat_p256_addcarryx_u32(&x384, &x385, x383, x353, 0x0); fiat_p256_addcarryx_u32(&x386, &x387, x385, x355, 0x0); fiat_p256_addcarryx_u32(&x388, &x389, x387, x357, x345); fiat_p256_addcarryx_u32(&x390, &x391, x389, x359, x363); fiat_p256_addcarryx_u32(&x392, &x393, x391, x361, x364); x394 = ((uint32_t)x393 + x362); fiat_p256_mulx_u32(&x395, &x396, x5, (arg2[7])); fiat_p256_mulx_u32(&x397, &x398, x5, (arg2[6])); fiat_p256_mulx_u32(&x399, &x400, x5, (arg2[5])); fiat_p256_mulx_u32(&x401, &x402, x5, (arg2[4])); fiat_p256_mulx_u32(&x403, &x404, x5, (arg2[3])); fiat_p256_mulx_u32(&x405, &x406, x5, (arg2[2])); fiat_p256_mulx_u32(&x407, &x408, x5, (arg2[1])); fiat_p256_mulx_u32(&x409, &x410, x5, (arg2[0])); fiat_p256_addcarryx_u32(&x411, &x412, 0x0, x410, x407); fiat_p256_addcarryx_u32(&x413, &x414, x412, x408, x405); fiat_p256_addcarryx_u32(&x415, &x416, x414, x406, x403); fiat_p256_addcarryx_u32(&x417, &x418, x416, x404, x401); fiat_p256_addcarryx_u32(&x419, &x420, x418, x402, x399); fiat_p256_addcarryx_u32(&x421, &x422, x420, x400, x397); fiat_p256_addcarryx_u32(&x423, &x424, x422, x398, x395); x425 = (x424 + x396); fiat_p256_addcarryx_u32(&x426, &x427, 0x0, x378, x409); fiat_p256_addcarryx_u32(&x428, &x429, x427, x380, x411); fiat_p256_addcarryx_u32(&x430, &x431, x429, x382, x413); fiat_p256_addcarryx_u32(&x432, &x433, x431, x384, x415); fiat_p256_addcarryx_u32(&x434, &x435, x433, x386, x417); fiat_p256_addcarryx_u32(&x436, &x437, x435, x388, x419); fiat_p256_addcarryx_u32(&x438, &x439, x437, x390, x421); fiat_p256_addcarryx_u32(&x440, &x441, x439, x392, x423); fiat_p256_addcarryx_u32(&x442, &x443, x441, x394, x425); fiat_p256_mulx_u32(&x444, &x445, x426, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x446, &x447, x426, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x448, &x449, x426, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x450, &x451, x426, UINT32_C(0xffffffff)); fiat_p256_addcarryx_u32(&x452, &x453, 0x0, x451, x448); fiat_p256_addcarryx_u32(&x454, &x455, x453, x449, x446); x456 = (x455 + x447); fiat_p256_addcarryx_u32(&x457, &x458, 0x0, x426, x450); fiat_p256_addcarryx_u32(&x459, &x460, x458, x428, x452); fiat_p256_addcarryx_u32(&x461, &x462, x460, x430, x454); fiat_p256_addcarryx_u32(&x463, &x464, x462, x432, x456); fiat_p256_addcarryx_u32(&x465, &x466, x464, x434, 0x0); fiat_p256_addcarryx_u32(&x467, &x468, x466, x436, 0x0); fiat_p256_addcarryx_u32(&x469, &x470, x468, x438, x426); fiat_p256_addcarryx_u32(&x471, &x472, x470, x440, x444); fiat_p256_addcarryx_u32(&x473, &x474, x472, x442, x445); x475 = ((uint32_t)x474 + x443); fiat_p256_mulx_u32(&x476, &x477, x6, (arg2[7])); fiat_p256_mulx_u32(&x478, &x479, x6, (arg2[6])); fiat_p256_mulx_u32(&x480, &x481, x6, (arg2[5])); fiat_p256_mulx_u32(&x482, &x483, x6, (arg2[4])); 
fiat_p256_mulx_u32(&x484, &x485, x6, (arg2[3])); fiat_p256_mulx_u32(&x486, &x487, x6, (arg2[2])); fiat_p256_mulx_u32(&x488, &x489, x6, (arg2[1])); fiat_p256_mulx_u32(&x490, &x491, x6, (arg2[0])); fiat_p256_addcarryx_u32(&x492, &x493, 0x0, x491, x488); fiat_p256_addcarryx_u32(&x494, &x495, x493, x489, x486); fiat_p256_addcarryx_u32(&x496, &x497, x495, x487, x484); fiat_p256_addcarryx_u32(&x498, &x499, x497, x485, x482); fiat_p256_addcarryx_u32(&x500, &x501, x499, x483, x480); fiat_p256_addcarryx_u32(&x502, &x503, x501, x481, x478); fiat_p256_addcarryx_u32(&x504, &x505, x503, x479, x476); x506 = (x505 + x477); fiat_p256_addcarryx_u32(&x507, &x508, 0x0, x459, x490); fiat_p256_addcarryx_u32(&x509, &x510, x508, x461, x492); fiat_p256_addcarryx_u32(&x511, &x512, x510, x463, x494); fiat_p256_addcarryx_u32(&x513, &x514, x512, x465, x496); fiat_p256_addcarryx_u32(&x515, &x516, x514, x467, x498); fiat_p256_addcarryx_u32(&x517, &x518, x516, x469, x500); fiat_p256_addcarryx_u32(&x519, &x520, x518, x471, x502); fiat_p256_addcarryx_u32(&x521, &x522, x520, x473, x504); fiat_p256_addcarryx_u32(&x523, &x524, x522, x475, x506); fiat_p256_mulx_u32(&x525, &x526, x507, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x527, &x528, x507, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x529, &x530, x507, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x531, &x532, x507, UINT32_C(0xffffffff)); fiat_p256_addcarryx_u32(&x533, &x534, 0x0, x532, x529); fiat_p256_addcarryx_u32(&x535, &x536, x534, x530, x527); x537 = (x536 + x528); fiat_p256_addcarryx_u32(&x538, &x539, 0x0, x507, x531); fiat_p256_addcarryx_u32(&x540, &x541, x539, x509, x533); fiat_p256_addcarryx_u32(&x542, &x543, x541, x511, x535); fiat_p256_addcarryx_u32(&x544, &x545, x543, x513, x537); fiat_p256_addcarryx_u32(&x546, &x547, x545, x515, 0x0); fiat_p256_addcarryx_u32(&x548, &x549, x547, x517, 0x0); fiat_p256_addcarryx_u32(&x550, &x551, x549, x519, x507); fiat_p256_addcarryx_u32(&x552, &x553, x551, x521, x525); fiat_p256_addcarryx_u32(&x554, &x555, x553, x523, x526); x556 = ((uint32_t)x555 + x524); fiat_p256_mulx_u32(&x557, &x558, x7, (arg2[7])); fiat_p256_mulx_u32(&x559, &x560, x7, (arg2[6])); fiat_p256_mulx_u32(&x561, &x562, x7, (arg2[5])); fiat_p256_mulx_u32(&x563, &x564, x7, (arg2[4])); fiat_p256_mulx_u32(&x565, &x566, x7, (arg2[3])); fiat_p256_mulx_u32(&x567, &x568, x7, (arg2[2])); fiat_p256_mulx_u32(&x569, &x570, x7, (arg2[1])); fiat_p256_mulx_u32(&x571, &x572, x7, (arg2[0])); fiat_p256_addcarryx_u32(&x573, &x574, 0x0, x572, x569); fiat_p256_addcarryx_u32(&x575, &x576, x574, x570, x567); fiat_p256_addcarryx_u32(&x577, &x578, x576, x568, x565); fiat_p256_addcarryx_u32(&x579, &x580, x578, x566, x563); fiat_p256_addcarryx_u32(&x581, &x582, x580, x564, x561); fiat_p256_addcarryx_u32(&x583, &x584, x582, x562, x559); fiat_p256_addcarryx_u32(&x585, &x586, x584, x560, x557); x587 = (x586 + x558); fiat_p256_addcarryx_u32(&x588, &x589, 0x0, x540, x571); fiat_p256_addcarryx_u32(&x590, &x591, x589, x542, x573); fiat_p256_addcarryx_u32(&x592, &x593, x591, x544, x575); fiat_p256_addcarryx_u32(&x594, &x595, x593, x546, x577); fiat_p256_addcarryx_u32(&x596, &x597, x595, x548, x579); fiat_p256_addcarryx_u32(&x598, &x599, x597, x550, x581); fiat_p256_addcarryx_u32(&x600, &x601, x599, x552, x583); fiat_p256_addcarryx_u32(&x602, &x603, x601, x554, x585); fiat_p256_addcarryx_u32(&x604, &x605, x603, x556, x587); fiat_p256_mulx_u32(&x606, &x607, x588, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x608, &x609, x588, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x610, &x611, x588, 
UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x612, &x613, x588, UINT32_C(0xffffffff)); fiat_p256_addcarryx_u32(&x614, &x615, 0x0, x613, x610); fiat_p256_addcarryx_u32(&x616, &x617, x615, x611, x608); x618 = (x617 + x609); fiat_p256_addcarryx_u32(&x619, &x620, 0x0, x588, x612); fiat_p256_addcarryx_u32(&x621, &x622, x620, x590, x614); fiat_p256_addcarryx_u32(&x623, &x624, x622, x592, x616); fiat_p256_addcarryx_u32(&x625, &x626, x624, x594, x618); fiat_p256_addcarryx_u32(&x627, &x628, x626, x596, 0x0); fiat_p256_addcarryx_u32(&x629, &x630, x628, x598, 0x0); fiat_p256_addcarryx_u32(&x631, &x632, x630, x600, x588); fiat_p256_addcarryx_u32(&x633, &x634, x632, x602, x606); fiat_p256_addcarryx_u32(&x635, &x636, x634, x604, x607); x637 = ((uint32_t)x636 + x605); fiat_p256_subborrowx_u32(&x638, &x639, 0x0, x621, UINT32_C(0xffffffff)); fiat_p256_subborrowx_u32(&x640, &x641, x639, x623, UINT32_C(0xffffffff)); fiat_p256_subborrowx_u32(&x642, &x643, x641, x625, UINT32_C(0xffffffff)); fiat_p256_subborrowx_u32(&x644, &x645, x643, x627, 0x0); fiat_p256_subborrowx_u32(&x646, &x647, x645, x629, 0x0); fiat_p256_subborrowx_u32(&x648, &x649, x647, x631, 0x0); fiat_p256_subborrowx_u32(&x650, &x651, x649, x633, 0x1); fiat_p256_subborrowx_u32(&x652, &x653, x651, x635, UINT32_C(0xffffffff)); fiat_p256_subborrowx_u32(&x654, &x655, x653, x637, 0x0); fiat_p256_cmovznz_u32(&x656, x655, x638, x621); fiat_p256_cmovznz_u32(&x657, x655, x640, x623); fiat_p256_cmovznz_u32(&x658, x655, x642, x625); fiat_p256_cmovznz_u32(&x659, x655, x644, x627); fiat_p256_cmovznz_u32(&x660, x655, x646, x629); fiat_p256_cmovznz_u32(&x661, x655, x648, x631); fiat_p256_cmovznz_u32(&x662, x655, x650, x633); fiat_p256_cmovznz_u32(&x663, x655, x652, x635); out1[0] = x656; out1[1] = x657; out1[2] = x658; out1[3] = x659; out1[4] = x660; out1[5] = x661; out1[6] = x662; out1[7] = x663; } /* * The function fiat_p256_square squares a field element in the Montgomery domain. 
* * Preconditions: * 0 ≤ eval arg1 < m * Postconditions: * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) * eval (from_montgomery arg1)) mod m * 0 ≤ eval out1 < m * */ static FIAT_P256_FIAT_INLINE void fiat_p256_square(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1) { uint32_t x1; uint32_t x2; uint32_t x3; uint32_t x4; uint32_t x5; uint32_t x6; uint32_t x7; uint32_t x8; uint32_t x9; uint32_t x10; uint32_t x11; uint32_t x12; uint32_t x13; uint32_t x14; uint32_t x15; uint32_t x16; uint32_t x17; uint32_t x18; uint32_t x19; uint32_t x20; uint32_t x21; uint32_t x22; uint32_t x23; uint32_t x24; uint32_t x25; fiat_p256_uint1 x26; uint32_t x27; fiat_p256_uint1 x28; uint32_t x29; fiat_p256_uint1 x30; uint32_t x31; fiat_p256_uint1 x32; uint32_t x33; fiat_p256_uint1 x34; uint32_t x35; fiat_p256_uint1 x36; uint32_t x37; fiat_p256_uint1 x38; uint32_t x39; uint32_t x40; uint32_t x41; uint32_t x42; uint32_t x43; uint32_t x44; uint32_t x45; uint32_t x46; uint32_t x47; uint32_t x48; fiat_p256_uint1 x49; uint32_t x50; fiat_p256_uint1 x51; uint32_t x52; uint32_t x53; fiat_p256_uint1 x54; uint32_t x55; fiat_p256_uint1 x56; uint32_t x57; fiat_p256_uint1 x58; uint32_t x59; fiat_p256_uint1 x60; uint32_t x61; fiat_p256_uint1 x62; uint32_t x63; fiat_p256_uint1 x64; uint32_t x65; fiat_p256_uint1 x66; uint32_t x67; fiat_p256_uint1 x68; uint32_t x69; fiat_p256_uint1 x70; uint32_t x71; uint32_t x72; uint32_t x73; uint32_t x74; uint32_t x75; uint32_t x76; uint32_t x77; uint32_t x78; uint32_t x79; uint32_t x80; uint32_t x81; uint32_t x82; uint32_t x83; uint32_t x84; uint32_t x85; uint32_t x86; uint32_t x87; fiat_p256_uint1 x88; uint32_t x89; fiat_p256_uint1 x90; uint32_t x91; fiat_p256_uint1 x92; uint32_t x93; fiat_p256_uint1 x94; uint32_t x95; fiat_p256_uint1 x96; uint32_t x97; fiat_p256_uint1 x98; uint32_t x99; fiat_p256_uint1 x100; uint32_t x101; uint32_t x102; fiat_p256_uint1 x103; uint32_t x104; fiat_p256_uint1 x105; uint32_t x106; fiat_p256_uint1 x107; uint32_t x108; fiat_p256_uint1 x109; uint32_t x110; fiat_p256_uint1 x111; uint32_t x112; fiat_p256_uint1 x113; uint32_t x114; fiat_p256_uint1 x115; uint32_t x116; fiat_p256_uint1 x117; uint32_t x118; fiat_p256_uint1 x119; uint32_t x120; uint32_t x121; uint32_t x122; uint32_t x123; uint32_t x124; uint32_t x125; uint32_t x126; uint32_t x127; uint32_t x128; fiat_p256_uint1 x129; uint32_t x130; fiat_p256_uint1 x131; uint32_t x132; uint32_t x133; fiat_p256_uint1 x134; uint32_t x135; fiat_p256_uint1 x136; uint32_t x137; fiat_p256_uint1 x138; uint32_t x139; fiat_p256_uint1 x140; uint32_t x141; fiat_p256_uint1 x142; uint32_t x143; fiat_p256_uint1 x144; uint32_t x145; fiat_p256_uint1 x146; uint32_t x147; fiat_p256_uint1 x148; uint32_t x149; fiat_p256_uint1 x150; uint32_t x151; uint32_t x152; uint32_t x153; uint32_t x154; uint32_t x155; uint32_t x156; uint32_t x157; uint32_t x158; uint32_t x159; uint32_t x160; uint32_t x161; uint32_t x162; uint32_t x163; uint32_t x164; uint32_t x165; uint32_t x166; uint32_t x167; uint32_t x168; fiat_p256_uint1 x169; uint32_t x170; fiat_p256_uint1 x171; uint32_t x172; fiat_p256_uint1 x173; uint32_t x174; fiat_p256_uint1 x175; uint32_t x176; fiat_p256_uint1 x177; uint32_t x178; fiat_p256_uint1 x179; uint32_t x180; fiat_p256_uint1 x181; uint32_t x182; uint32_t x183; fiat_p256_uint1 x184; uint32_t x185; fiat_p256_uint1 x186; uint32_t x187; fiat_p256_uint1 x188; uint32_t x189; fiat_p256_uint1 x190; uint32_t x191; fiat_p256_uint1 x192; uint32_t x193; fiat_p256_uint1 x194; 
uint32_t x195; fiat_p256_uint1 x196; uint32_t x197; fiat_p256_uint1 x198; uint32_t x199; fiat_p256_uint1 x200; uint32_t x201; uint32_t x202; uint32_t x203; uint32_t x204; uint32_t x205; uint32_t x206; uint32_t x207; uint32_t x208; uint32_t x209; fiat_p256_uint1 x210; uint32_t x211; fiat_p256_uint1 x212; uint32_t x213; uint32_t x214; fiat_p256_uint1 x215; uint32_t x216; fiat_p256_uint1 x217; uint32_t x218; fiat_p256_uint1 x219; uint32_t x220; fiat_p256_uint1 x221; uint32_t x222; fiat_p256_uint1 x223; uint32_t x224; fiat_p256_uint1 x225; uint32_t x226; fiat_p256_uint1 x227; uint32_t x228; fiat_p256_uint1 x229; uint32_t x230; fiat_p256_uint1 x231; uint32_t x232; uint32_t x233; uint32_t x234; uint32_t x235; uint32_t x236; uint32_t x237; uint32_t x238; uint32_t x239; uint32_t x240; uint32_t x241; uint32_t x242; uint32_t x243; uint32_t x244; uint32_t x245; uint32_t x246; uint32_t x247; uint32_t x248; uint32_t x249; fiat_p256_uint1 x250; uint32_t x251; fiat_p256_uint1 x252; uint32_t x253; fiat_p256_uint1 x254; uint32_t x255; fiat_p256_uint1 x256; uint32_t x257; fiat_p256_uint1 x258; uint32_t x259; fiat_p256_uint1 x260; uint32_t x261; fiat_p256_uint1 x262; uint32_t x263; uint32_t x264; fiat_p256_uint1 x265; uint32_t x266; fiat_p256_uint1 x267; uint32_t x268; fiat_p256_uint1 x269; uint32_t x270; fiat_p256_uint1 x271; uint32_t x272; fiat_p256_uint1 x273; uint32_t x274; fiat_p256_uint1 x275; uint32_t x276; fiat_p256_uint1 x277; uint32_t x278; fiat_p256_uint1 x279; uint32_t x280; fiat_p256_uint1 x281; uint32_t x282; uint32_t x283; uint32_t x284; uint32_t x285; uint32_t x286; uint32_t x287; uint32_t x288; uint32_t x289; uint32_t x290; fiat_p256_uint1 x291; uint32_t x292; fiat_p256_uint1 x293; uint32_t x294; uint32_t x295; fiat_p256_uint1 x296; uint32_t x297; fiat_p256_uint1 x298; uint32_t x299; fiat_p256_uint1 x300; uint32_t x301; fiat_p256_uint1 x302; uint32_t x303; fiat_p256_uint1 x304; uint32_t x305; fiat_p256_uint1 x306; uint32_t x307; fiat_p256_uint1 x308; uint32_t x309; fiat_p256_uint1 x310; uint32_t x311; fiat_p256_uint1 x312; uint32_t x313; uint32_t x314; uint32_t x315; uint32_t x316; uint32_t x317; uint32_t x318; uint32_t x319; uint32_t x320; uint32_t x321; uint32_t x322; uint32_t x323; uint32_t x324; uint32_t x325; uint32_t x326; uint32_t x327; uint32_t x328; uint32_t x329; uint32_t x330; fiat_p256_uint1 x331; uint32_t x332; fiat_p256_uint1 x333; uint32_t x334; fiat_p256_uint1 x335; uint32_t x336; fiat_p256_uint1 x337; uint32_t x338; fiat_p256_uint1 x339; uint32_t x340; fiat_p256_uint1 x341; uint32_t x342; fiat_p256_uint1 x343; uint32_t x344; uint32_t x345; fiat_p256_uint1 x346; uint32_t x347; fiat_p256_uint1 x348; uint32_t x349; fiat_p256_uint1 x350; uint32_t x351; fiat_p256_uint1 x352; uint32_t x353; fiat_p256_uint1 x354; uint32_t x355; fiat_p256_uint1 x356; uint32_t x357; fiat_p256_uint1 x358; uint32_t x359; fiat_p256_uint1 x360; uint32_t x361; fiat_p256_uint1 x362; uint32_t x363; uint32_t x364; uint32_t x365; uint32_t x366; uint32_t x367; uint32_t x368; uint32_t x369; uint32_t x370; uint32_t x371; fiat_p256_uint1 x372; uint32_t x373; fiat_p256_uint1 x374; uint32_t x375; uint32_t x376; fiat_p256_uint1 x377; uint32_t x378; fiat_p256_uint1 x379; uint32_t x380; fiat_p256_uint1 x381; uint32_t x382; fiat_p256_uint1 x383; uint32_t x384; fiat_p256_uint1 x385; uint32_t x386; fiat_p256_uint1 x387; uint32_t x388; fiat_p256_uint1 x389; uint32_t x390; fiat_p256_uint1 x391; uint32_t x392; fiat_p256_uint1 x393; uint32_t x394; uint32_t x395; uint32_t x396; uint32_t x397; uint32_t x398; uint32_t x399; 
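/* This squaring routine is the same word-by-word Montgomery multiplication as
   fiat_p256_mul above, with both operands taken from arg1; the generated straight-line
   code recomputes every partial product rather than exploiting the symmetry of
   squaring. */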
uint32_t x400; uint32_t x401; uint32_t x402; uint32_t x403; uint32_t x404; uint32_t x405; uint32_t x406; uint32_t x407; uint32_t x408; uint32_t x409; uint32_t x410; uint32_t x411; fiat_p256_uint1 x412; uint32_t x413; fiat_p256_uint1 x414; uint32_t x415; fiat_p256_uint1 x416; uint32_t x417; fiat_p256_uint1 x418; uint32_t x419; fiat_p256_uint1 x420; uint32_t x421; fiat_p256_uint1 x422; uint32_t x423; fiat_p256_uint1 x424; uint32_t x425; uint32_t x426; fiat_p256_uint1 x427; uint32_t x428; fiat_p256_uint1 x429; uint32_t x430; fiat_p256_uint1 x431; uint32_t x432; fiat_p256_uint1 x433; uint32_t x434; fiat_p256_uint1 x435; uint32_t x436; fiat_p256_uint1 x437; uint32_t x438; fiat_p256_uint1 x439; uint32_t x440; fiat_p256_uint1 x441; uint32_t x442; fiat_p256_uint1 x443; uint32_t x444; uint32_t x445; uint32_t x446; uint32_t x447; uint32_t x448; uint32_t x449; uint32_t x450; uint32_t x451; uint32_t x452; fiat_p256_uint1 x453; uint32_t x454; fiat_p256_uint1 x455; uint32_t x456; uint32_t x457; fiat_p256_uint1 x458; uint32_t x459; fiat_p256_uint1 x460; uint32_t x461; fiat_p256_uint1 x462; uint32_t x463; fiat_p256_uint1 x464; uint32_t x465; fiat_p256_uint1 x466; uint32_t x467; fiat_p256_uint1 x468; uint32_t x469; fiat_p256_uint1 x470; uint32_t x471; fiat_p256_uint1 x472; uint32_t x473; fiat_p256_uint1 x474; uint32_t x475; uint32_t x476; uint32_t x477; uint32_t x478; uint32_t x479; uint32_t x480; uint32_t x481; uint32_t x482; uint32_t x483; uint32_t x484; uint32_t x485; uint32_t x486; uint32_t x487; uint32_t x488; uint32_t x489; uint32_t x490; uint32_t x491; uint32_t x492; fiat_p256_uint1 x493; uint32_t x494; fiat_p256_uint1 x495; uint32_t x496; fiat_p256_uint1 x497; uint32_t x498; fiat_p256_uint1 x499; uint32_t x500; fiat_p256_uint1 x501; uint32_t x502; fiat_p256_uint1 x503; uint32_t x504; fiat_p256_uint1 x505; uint32_t x506; uint32_t x507; fiat_p256_uint1 x508; uint32_t x509; fiat_p256_uint1 x510; uint32_t x511; fiat_p256_uint1 x512; uint32_t x513; fiat_p256_uint1 x514; uint32_t x515; fiat_p256_uint1 x516; uint32_t x517; fiat_p256_uint1 x518; uint32_t x519; fiat_p256_uint1 x520; uint32_t x521; fiat_p256_uint1 x522; uint32_t x523; fiat_p256_uint1 x524; uint32_t x525; uint32_t x526; uint32_t x527; uint32_t x528; uint32_t x529; uint32_t x530; uint32_t x531; uint32_t x532; uint32_t x533; fiat_p256_uint1 x534; uint32_t x535; fiat_p256_uint1 x536; uint32_t x537; uint32_t x538; fiat_p256_uint1 x539; uint32_t x540; fiat_p256_uint1 x541; uint32_t x542; fiat_p256_uint1 x543; uint32_t x544; fiat_p256_uint1 x545; uint32_t x546; fiat_p256_uint1 x547; uint32_t x548; fiat_p256_uint1 x549; uint32_t x550; fiat_p256_uint1 x551; uint32_t x552; fiat_p256_uint1 x553; uint32_t x554; fiat_p256_uint1 x555; uint32_t x556; uint32_t x557; uint32_t x558; uint32_t x559; uint32_t x560; uint32_t x561; uint32_t x562; uint32_t x563; uint32_t x564; uint32_t x565; uint32_t x566; uint32_t x567; uint32_t x568; uint32_t x569; uint32_t x570; uint32_t x571; uint32_t x572; uint32_t x573; fiat_p256_uint1 x574; uint32_t x575; fiat_p256_uint1 x576; uint32_t x577; fiat_p256_uint1 x578; uint32_t x579; fiat_p256_uint1 x580; uint32_t x581; fiat_p256_uint1 x582; uint32_t x583; fiat_p256_uint1 x584; uint32_t x585; fiat_p256_uint1 x586; uint32_t x587; uint32_t x588; fiat_p256_uint1 x589; uint32_t x590; fiat_p256_uint1 x591; uint32_t x592; fiat_p256_uint1 x593; uint32_t x594; fiat_p256_uint1 x595; uint32_t x596; fiat_p256_uint1 x597; uint32_t x598; fiat_p256_uint1 x599; uint32_t x600; fiat_p256_uint1 x601; uint32_t x602; fiat_p256_uint1 x603; uint32_t 
x604; fiat_p256_uint1 x605; uint32_t x606; uint32_t x607; uint32_t x608; uint32_t x609; uint32_t x610; uint32_t x611; uint32_t x612; uint32_t x613; uint32_t x614; fiat_p256_uint1 x615; uint32_t x616; fiat_p256_uint1 x617; uint32_t x618; uint32_t x619; fiat_p256_uint1 x620; uint32_t x621; fiat_p256_uint1 x622; uint32_t x623; fiat_p256_uint1 x624; uint32_t x625; fiat_p256_uint1 x626; uint32_t x627; fiat_p256_uint1 x628; uint32_t x629; fiat_p256_uint1 x630; uint32_t x631; fiat_p256_uint1 x632; uint32_t x633; fiat_p256_uint1 x634; uint32_t x635; fiat_p256_uint1 x636; uint32_t x637; uint32_t x638; fiat_p256_uint1 x639; uint32_t x640; fiat_p256_uint1 x641; uint32_t x642; fiat_p256_uint1 x643; uint32_t x644; fiat_p256_uint1 x645; uint32_t x646; fiat_p256_uint1 x647; uint32_t x648; fiat_p256_uint1 x649; uint32_t x650; fiat_p256_uint1 x651; uint32_t x652; fiat_p256_uint1 x653; uint32_t x654; fiat_p256_uint1 x655; uint32_t x656; uint32_t x657; uint32_t x658; uint32_t x659; uint32_t x660; uint32_t x661; uint32_t x662; uint32_t x663; x1 = (arg1[1]); x2 = (arg1[2]); x3 = (arg1[3]); x4 = (arg1[4]); x5 = (arg1[5]); x6 = (arg1[6]); x7 = (arg1[7]); x8 = (arg1[0]); fiat_p256_mulx_u32(&x9, &x10, x8, (arg1[7])); fiat_p256_mulx_u32(&x11, &x12, x8, (arg1[6])); fiat_p256_mulx_u32(&x13, &x14, x8, (arg1[5])); fiat_p256_mulx_u32(&x15, &x16, x8, (arg1[4])); fiat_p256_mulx_u32(&x17, &x18, x8, (arg1[3])); fiat_p256_mulx_u32(&x19, &x20, x8, (arg1[2])); fiat_p256_mulx_u32(&x21, &x22, x8, (arg1[1])); fiat_p256_mulx_u32(&x23, &x24, x8, (arg1[0])); fiat_p256_addcarryx_u32(&x25, &x26, 0x0, x24, x21); fiat_p256_addcarryx_u32(&x27, &x28, x26, x22, x19); fiat_p256_addcarryx_u32(&x29, &x30, x28, x20, x17); fiat_p256_addcarryx_u32(&x31, &x32, x30, x18, x15); fiat_p256_addcarryx_u32(&x33, &x34, x32, x16, x13); fiat_p256_addcarryx_u32(&x35, &x36, x34, x14, x11); fiat_p256_addcarryx_u32(&x37, &x38, x36, x12, x9); x39 = (x38 + x10); fiat_p256_mulx_u32(&x40, &x41, x23, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x42, &x43, x23, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x44, &x45, x23, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x46, &x47, x23, UINT32_C(0xffffffff)); fiat_p256_addcarryx_u32(&x48, &x49, 0x0, x47, x44); fiat_p256_addcarryx_u32(&x50, &x51, x49, x45, x42); x52 = (x51 + x43); fiat_p256_addcarryx_u32(&x53, &x54, 0x0, x23, x46); fiat_p256_addcarryx_u32(&x55, &x56, x54, x25, x48); fiat_p256_addcarryx_u32(&x57, &x58, x56, x27, x50); fiat_p256_addcarryx_u32(&x59, &x60, x58, x29, x52); fiat_p256_addcarryx_u32(&x61, &x62, x60, x31, 0x0); fiat_p256_addcarryx_u32(&x63, &x64, x62, x33, 0x0); fiat_p256_addcarryx_u32(&x65, &x66, x64, x35, x23); fiat_p256_addcarryx_u32(&x67, &x68, x66, x37, x40); fiat_p256_addcarryx_u32(&x69, &x70, x68, x39, x41); fiat_p256_mulx_u32(&x71, &x72, x1, (arg1[7])); fiat_p256_mulx_u32(&x73, &x74, x1, (arg1[6])); fiat_p256_mulx_u32(&x75, &x76, x1, (arg1[5])); fiat_p256_mulx_u32(&x77, &x78, x1, (arg1[4])); fiat_p256_mulx_u32(&x79, &x80, x1, (arg1[3])); fiat_p256_mulx_u32(&x81, &x82, x1, (arg1[2])); fiat_p256_mulx_u32(&x83, &x84, x1, (arg1[1])); fiat_p256_mulx_u32(&x85, &x86, x1, (arg1[0])); fiat_p256_addcarryx_u32(&x87, &x88, 0x0, x86, x83); fiat_p256_addcarryx_u32(&x89, &x90, x88, x84, x81); fiat_p256_addcarryx_u32(&x91, &x92, x90, x82, x79); fiat_p256_addcarryx_u32(&x93, &x94, x92, x80, x77); fiat_p256_addcarryx_u32(&x95, &x96, x94, x78, x75); fiat_p256_addcarryx_u32(&x97, &x98, x96, x76, x73); fiat_p256_addcarryx_u32(&x99, &x100, x98, x74, x71); x101 = (x100 + x72); fiat_p256_addcarryx_u32(&x102, &x103, 0x0, 
x55, x85); fiat_p256_addcarryx_u32(&x104, &x105, x103, x57, x87); fiat_p256_addcarryx_u32(&x106, &x107, x105, x59, x89); fiat_p256_addcarryx_u32(&x108, &x109, x107, x61, x91); fiat_p256_addcarryx_u32(&x110, &x111, x109, x63, x93); fiat_p256_addcarryx_u32(&x112, &x113, x111, x65, x95); fiat_p256_addcarryx_u32(&x114, &x115, x113, x67, x97); fiat_p256_addcarryx_u32(&x116, &x117, x115, x69, x99); fiat_p256_addcarryx_u32(&x118, &x119, x117, x70, x101); fiat_p256_mulx_u32(&x120, &x121, x102, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x122, &x123, x102, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x124, &x125, x102, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x126, &x127, x102, UINT32_C(0xffffffff)); fiat_p256_addcarryx_u32(&x128, &x129, 0x0, x127, x124); fiat_p256_addcarryx_u32(&x130, &x131, x129, x125, x122); x132 = (x131 + x123); fiat_p256_addcarryx_u32(&x133, &x134, 0x0, x102, x126); fiat_p256_addcarryx_u32(&x135, &x136, x134, x104, x128); fiat_p256_addcarryx_u32(&x137, &x138, x136, x106, x130); fiat_p256_addcarryx_u32(&x139, &x140, x138, x108, x132); fiat_p256_addcarryx_u32(&x141, &x142, x140, x110, 0x0); fiat_p256_addcarryx_u32(&x143, &x144, x142, x112, 0x0); fiat_p256_addcarryx_u32(&x145, &x146, x144, x114, x102); fiat_p256_addcarryx_u32(&x147, &x148, x146, x116, x120); fiat_p256_addcarryx_u32(&x149, &x150, x148, x118, x121); x151 = ((uint32_t)x150 + x119); fiat_p256_mulx_u32(&x152, &x153, x2, (arg1[7])); fiat_p256_mulx_u32(&x154, &x155, x2, (arg1[6])); fiat_p256_mulx_u32(&x156, &x157, x2, (arg1[5])); fiat_p256_mulx_u32(&x158, &x159, x2, (arg1[4])); fiat_p256_mulx_u32(&x160, &x161, x2, (arg1[3])); fiat_p256_mulx_u32(&x162, &x163, x2, (arg1[2])); fiat_p256_mulx_u32(&x164, &x165, x2, (arg1[1])); fiat_p256_mulx_u32(&x166, &x167, x2, (arg1[0])); fiat_p256_addcarryx_u32(&x168, &x169, 0x0, x167, x164); fiat_p256_addcarryx_u32(&x170, &x171, x169, x165, x162); fiat_p256_addcarryx_u32(&x172, &x173, x171, x163, x160); fiat_p256_addcarryx_u32(&x174, &x175, x173, x161, x158); fiat_p256_addcarryx_u32(&x176, &x177, x175, x159, x156); fiat_p256_addcarryx_u32(&x178, &x179, x177, x157, x154); fiat_p256_addcarryx_u32(&x180, &x181, x179, x155, x152); x182 = (x181 + x153); fiat_p256_addcarryx_u32(&x183, &x184, 0x0, x135, x166); fiat_p256_addcarryx_u32(&x185, &x186, x184, x137, x168); fiat_p256_addcarryx_u32(&x187, &x188, x186, x139, x170); fiat_p256_addcarryx_u32(&x189, &x190, x188, x141, x172); fiat_p256_addcarryx_u32(&x191, &x192, x190, x143, x174); fiat_p256_addcarryx_u32(&x193, &x194, x192, x145, x176); fiat_p256_addcarryx_u32(&x195, &x196, x194, x147, x178); fiat_p256_addcarryx_u32(&x197, &x198, x196, x149, x180); fiat_p256_addcarryx_u32(&x199, &x200, x198, x151, x182); fiat_p256_mulx_u32(&x201, &x202, x183, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x203, &x204, x183, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x205, &x206, x183, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x207, &x208, x183, UINT32_C(0xffffffff)); fiat_p256_addcarryx_u32(&x209, &x210, 0x0, x208, x205); fiat_p256_addcarryx_u32(&x211, &x212, x210, x206, x203); x213 = (x212 + x204); fiat_p256_addcarryx_u32(&x214, &x215, 0x0, x183, x207); fiat_p256_addcarryx_u32(&x216, &x217, x215, x185, x209); fiat_p256_addcarryx_u32(&x218, &x219, x217, x187, x211); fiat_p256_addcarryx_u32(&x220, &x221, x219, x189, x213); fiat_p256_addcarryx_u32(&x222, &x223, x221, x191, 0x0); fiat_p256_addcarryx_u32(&x224, &x225, x223, x193, 0x0); fiat_p256_addcarryx_u32(&x226, &x227, x225, x195, x183); fiat_p256_addcarryx_u32(&x228, &x229, x227, x197, x201); 
fiat_p256_addcarryx_u32(&x230, &x231, x229, x199, x202); x232 = ((uint32_t)x231 + x200); fiat_p256_mulx_u32(&x233, &x234, x3, (arg1[7])); fiat_p256_mulx_u32(&x235, &x236, x3, (arg1[6])); fiat_p256_mulx_u32(&x237, &x238, x3, (arg1[5])); fiat_p256_mulx_u32(&x239, &x240, x3, (arg1[4])); fiat_p256_mulx_u32(&x241, &x242, x3, (arg1[3])); fiat_p256_mulx_u32(&x243, &x244, x3, (arg1[2])); fiat_p256_mulx_u32(&x245, &x246, x3, (arg1[1])); fiat_p256_mulx_u32(&x247, &x248, x3, (arg1[0])); fiat_p256_addcarryx_u32(&x249, &x250, 0x0, x248, x245); fiat_p256_addcarryx_u32(&x251, &x252, x250, x246, x243); fiat_p256_addcarryx_u32(&x253, &x254, x252, x244, x241); fiat_p256_addcarryx_u32(&x255, &x256, x254, x242, x239); fiat_p256_addcarryx_u32(&x257, &x258, x256, x240, x237); fiat_p256_addcarryx_u32(&x259, &x260, x258, x238, x235); fiat_p256_addcarryx_u32(&x261, &x262, x260, x236, x233); x263 = (x262 + x234); fiat_p256_addcarryx_u32(&x264, &x265, 0x0, x216, x247); fiat_p256_addcarryx_u32(&x266, &x267, x265, x218, x249); fiat_p256_addcarryx_u32(&x268, &x269, x267, x220, x251); fiat_p256_addcarryx_u32(&x270, &x271, x269, x222, x253); fiat_p256_addcarryx_u32(&x272, &x273, x271, x224, x255); fiat_p256_addcarryx_u32(&x274, &x275, x273, x226, x257); fiat_p256_addcarryx_u32(&x276, &x277, x275, x228, x259); fiat_p256_addcarryx_u32(&x278, &x279, x277, x230, x261); fiat_p256_addcarryx_u32(&x280, &x281, x279, x232, x263); fiat_p256_mulx_u32(&x282, &x283, x264, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x284, &x285, x264, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x286, &x287, x264, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x288, &x289, x264, UINT32_C(0xffffffff)); fiat_p256_addcarryx_u32(&x290, &x291, 0x0, x289, x286); fiat_p256_addcarryx_u32(&x292, &x293, x291, x287, x284); x294 = (x293 + x285); fiat_p256_addcarryx_u32(&x295, &x296, 0x0, x264, x288); fiat_p256_addcarryx_u32(&x297, &x298, x296, x266, x290); fiat_p256_addcarryx_u32(&x299, &x300, x298, x268, x292); fiat_p256_addcarryx_u32(&x301, &x302, x300, x270, x294); fiat_p256_addcarryx_u32(&x303, &x304, x302, x272, 0x0); fiat_p256_addcarryx_u32(&x305, &x306, x304, x274, 0x0); fiat_p256_addcarryx_u32(&x307, &x308, x306, x276, x264); fiat_p256_addcarryx_u32(&x309, &x310, x308, x278, x282); fiat_p256_addcarryx_u32(&x311, &x312, x310, x280, x283); x313 = ((uint32_t)x312 + x281); fiat_p256_mulx_u32(&x314, &x315, x4, (arg1[7])); fiat_p256_mulx_u32(&x316, &x317, x4, (arg1[6])); fiat_p256_mulx_u32(&x318, &x319, x4, (arg1[5])); fiat_p256_mulx_u32(&x320, &x321, x4, (arg1[4])); fiat_p256_mulx_u32(&x322, &x323, x4, (arg1[3])); fiat_p256_mulx_u32(&x324, &x325, x4, (arg1[2])); fiat_p256_mulx_u32(&x326, &x327, x4, (arg1[1])); fiat_p256_mulx_u32(&x328, &x329, x4, (arg1[0])); fiat_p256_addcarryx_u32(&x330, &x331, 0x0, x329, x326); fiat_p256_addcarryx_u32(&x332, &x333, x331, x327, x324); fiat_p256_addcarryx_u32(&x334, &x335, x333, x325, x322); fiat_p256_addcarryx_u32(&x336, &x337, x335, x323, x320); fiat_p256_addcarryx_u32(&x338, &x339, x337, x321, x318); fiat_p256_addcarryx_u32(&x340, &x341, x339, x319, x316); fiat_p256_addcarryx_u32(&x342, &x343, x341, x317, x314); x344 = (x343 + x315); fiat_p256_addcarryx_u32(&x345, &x346, 0x0, x297, x328); fiat_p256_addcarryx_u32(&x347, &x348, x346, x299, x330); fiat_p256_addcarryx_u32(&x349, &x350, x348, x301, x332); fiat_p256_addcarryx_u32(&x351, &x352, x350, x303, x334); fiat_p256_addcarryx_u32(&x353, &x354, x352, x305, x336); fiat_p256_addcarryx_u32(&x355, &x356, x354, x307, x338); fiat_p256_addcarryx_u32(&x357, &x358, x356, x309, x340); 
fiat_p256_addcarryx_u32(&x359, &x360, x358, x311, x342); fiat_p256_addcarryx_u32(&x361, &x362, x360, x313, x344); fiat_p256_mulx_u32(&x363, &x364, x345, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x365, &x366, x345, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x367, &x368, x345, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x369, &x370, x345, UINT32_C(0xffffffff)); fiat_p256_addcarryx_u32(&x371, &x372, 0x0, x370, x367); fiat_p256_addcarryx_u32(&x373, &x374, x372, x368, x365); x375 = (x374 + x366); fiat_p256_addcarryx_u32(&x376, &x377, 0x0, x345, x369); fiat_p256_addcarryx_u32(&x378, &x379, x377, x347, x371); fiat_p256_addcarryx_u32(&x380, &x381, x379, x349, x373); fiat_p256_addcarryx_u32(&x382, &x383, x381, x351, x375); fiat_p256_addcarryx_u32(&x384, &x385, x383, x353, 0x0); fiat_p256_addcarryx_u32(&x386, &x387, x385, x355, 0x0); fiat_p256_addcarryx_u32(&x388, &x389, x387, x357, x345); fiat_p256_addcarryx_u32(&x390, &x391, x389, x359, x363); fiat_p256_addcarryx_u32(&x392, &x393, x391, x361, x364); x394 = ((uint32_t)x393 + x362); fiat_p256_mulx_u32(&x395, &x396, x5, (arg1[7])); fiat_p256_mulx_u32(&x397, &x398, x5, (arg1[6])); fiat_p256_mulx_u32(&x399, &x400, x5, (arg1[5])); fiat_p256_mulx_u32(&x401, &x402, x5, (arg1[4])); fiat_p256_mulx_u32(&x403, &x404, x5, (arg1[3])); fiat_p256_mulx_u32(&x405, &x406, x5, (arg1[2])); fiat_p256_mulx_u32(&x407, &x408, x5, (arg1[1])); fiat_p256_mulx_u32(&x409, &x410, x5, (arg1[0])); fiat_p256_addcarryx_u32(&x411, &x412, 0x0, x410, x407); fiat_p256_addcarryx_u32(&x413, &x414, x412, x408, x405); fiat_p256_addcarryx_u32(&x415, &x416, x414, x406, x403); fiat_p256_addcarryx_u32(&x417, &x418, x416, x404, x401); fiat_p256_addcarryx_u32(&x419, &x420, x418, x402, x399); fiat_p256_addcarryx_u32(&x421, &x422, x420, x400, x397); fiat_p256_addcarryx_u32(&x423, &x424, x422, x398, x395); x425 = (x424 + x396); fiat_p256_addcarryx_u32(&x426, &x427, 0x0, x378, x409); fiat_p256_addcarryx_u32(&x428, &x429, x427, x380, x411); fiat_p256_addcarryx_u32(&x430, &x431, x429, x382, x413); fiat_p256_addcarryx_u32(&x432, &x433, x431, x384, x415); fiat_p256_addcarryx_u32(&x434, &x435, x433, x386, x417); fiat_p256_addcarryx_u32(&x436, &x437, x435, x388, x419); fiat_p256_addcarryx_u32(&x438, &x439, x437, x390, x421); fiat_p256_addcarryx_u32(&x440, &x441, x439, x392, x423); fiat_p256_addcarryx_u32(&x442, &x443, x441, x394, x425); fiat_p256_mulx_u32(&x444, &x445, x426, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x446, &x447, x426, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x448, &x449, x426, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x450, &x451, x426, UINT32_C(0xffffffff)); fiat_p256_addcarryx_u32(&x452, &x453, 0x0, x451, x448); fiat_p256_addcarryx_u32(&x454, &x455, x453, x449, x446); x456 = (x455 + x447); fiat_p256_addcarryx_u32(&x457, &x458, 0x0, x426, x450); fiat_p256_addcarryx_u32(&x459, &x460, x458, x428, x452); fiat_p256_addcarryx_u32(&x461, &x462, x460, x430, x454); fiat_p256_addcarryx_u32(&x463, &x464, x462, x432, x456); fiat_p256_addcarryx_u32(&x465, &x466, x464, x434, 0x0); fiat_p256_addcarryx_u32(&x467, &x468, x466, x436, 0x0); fiat_p256_addcarryx_u32(&x469, &x470, x468, x438, x426); fiat_p256_addcarryx_u32(&x471, &x472, x470, x440, x444); fiat_p256_addcarryx_u32(&x473, &x474, x472, x442, x445); x475 = ((uint32_t)x474 + x443); fiat_p256_mulx_u32(&x476, &x477, x6, (arg1[7])); fiat_p256_mulx_u32(&x478, &x479, x6, (arg1[6])); fiat_p256_mulx_u32(&x480, &x481, x6, (arg1[5])); fiat_p256_mulx_u32(&x482, &x483, x6, (arg1[4])); fiat_p256_mulx_u32(&x484, &x485, x6, (arg1[3])); 
fiat_p256_mulx_u32(&x486, &x487, x6, (arg1[2])); fiat_p256_mulx_u32(&x488, &x489, x6, (arg1[1])); fiat_p256_mulx_u32(&x490, &x491, x6, (arg1[0])); fiat_p256_addcarryx_u32(&x492, &x493, 0x0, x491, x488); fiat_p256_addcarryx_u32(&x494, &x495, x493, x489, x486); fiat_p256_addcarryx_u32(&x496, &x497, x495, x487, x484); fiat_p256_addcarryx_u32(&x498, &x499, x497, x485, x482); fiat_p256_addcarryx_u32(&x500, &x501, x499, x483, x480); fiat_p256_addcarryx_u32(&x502, &x503, x501, x481, x478); fiat_p256_addcarryx_u32(&x504, &x505, x503, x479, x476); x506 = (x505 + x477); fiat_p256_addcarryx_u32(&x507, &x508, 0x0, x459, x490); fiat_p256_addcarryx_u32(&x509, &x510, x508, x461, x492); fiat_p256_addcarryx_u32(&x511, &x512, x510, x463, x494); fiat_p256_addcarryx_u32(&x513, &x514, x512, x465, x496); fiat_p256_addcarryx_u32(&x515, &x516, x514, x467, x498); fiat_p256_addcarryx_u32(&x517, &x518, x516, x469, x500); fiat_p256_addcarryx_u32(&x519, &x520, x518, x471, x502); fiat_p256_addcarryx_u32(&x521, &x522, x520, x473, x504); fiat_p256_addcarryx_u32(&x523, &x524, x522, x475, x506); fiat_p256_mulx_u32(&x525, &x526, x507, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x527, &x528, x507, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x529, &x530, x507, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x531, &x532, x507, UINT32_C(0xffffffff)); fiat_p256_addcarryx_u32(&x533, &x534, 0x0, x532, x529); fiat_p256_addcarryx_u32(&x535, &x536, x534, x530, x527); x537 = (x536 + x528); fiat_p256_addcarryx_u32(&x538, &x539, 0x0, x507, x531); fiat_p256_addcarryx_u32(&x540, &x541, x539, x509, x533); fiat_p256_addcarryx_u32(&x542, &x543, x541, x511, x535); fiat_p256_addcarryx_u32(&x544, &x545, x543, x513, x537); fiat_p256_addcarryx_u32(&x546, &x547, x545, x515, 0x0); fiat_p256_addcarryx_u32(&x548, &x549, x547, x517, 0x0); fiat_p256_addcarryx_u32(&x550, &x551, x549, x519, x507); fiat_p256_addcarryx_u32(&x552, &x553, x551, x521, x525); fiat_p256_addcarryx_u32(&x554, &x555, x553, x523, x526); x556 = ((uint32_t)x555 + x524); fiat_p256_mulx_u32(&x557, &x558, x7, (arg1[7])); fiat_p256_mulx_u32(&x559, &x560, x7, (arg1[6])); fiat_p256_mulx_u32(&x561, &x562, x7, (arg1[5])); fiat_p256_mulx_u32(&x563, &x564, x7, (arg1[4])); fiat_p256_mulx_u32(&x565, &x566, x7, (arg1[3])); fiat_p256_mulx_u32(&x567, &x568, x7, (arg1[2])); fiat_p256_mulx_u32(&x569, &x570, x7, (arg1[1])); fiat_p256_mulx_u32(&x571, &x572, x7, (arg1[0])); fiat_p256_addcarryx_u32(&x573, &x574, 0x0, x572, x569); fiat_p256_addcarryx_u32(&x575, &x576, x574, x570, x567); fiat_p256_addcarryx_u32(&x577, &x578, x576, x568, x565); fiat_p256_addcarryx_u32(&x579, &x580, x578, x566, x563); fiat_p256_addcarryx_u32(&x581, &x582, x580, x564, x561); fiat_p256_addcarryx_u32(&x583, &x584, x582, x562, x559); fiat_p256_addcarryx_u32(&x585, &x586, x584, x560, x557); x587 = (x586 + x558); fiat_p256_addcarryx_u32(&x588, &x589, 0x0, x540, x571); fiat_p256_addcarryx_u32(&x590, &x591, x589, x542, x573); fiat_p256_addcarryx_u32(&x592, &x593, x591, x544, x575); fiat_p256_addcarryx_u32(&x594, &x595, x593, x546, x577); fiat_p256_addcarryx_u32(&x596, &x597, x595, x548, x579); fiat_p256_addcarryx_u32(&x598, &x599, x597, x550, x581); fiat_p256_addcarryx_u32(&x600, &x601, x599, x552, x583); fiat_p256_addcarryx_u32(&x602, &x603, x601, x554, x585); fiat_p256_addcarryx_u32(&x604, &x605, x603, x556, x587); fiat_p256_mulx_u32(&x606, &x607, x588, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x608, &x609, x588, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x610, &x611, x588, UINT32_C(0xffffffff)); fiat_p256_mulx_u32(&x612, &x613, x588, 
UINT32_C(0xffffffff)); fiat_p256_addcarryx_u32(&x614, &x615, 0x0, x613, x610); fiat_p256_addcarryx_u32(&x616, &x617, x615, x611, x608); x618 = (x617 + x609); fiat_p256_addcarryx_u32(&x619, &x620, 0x0, x588, x612); fiat_p256_addcarryx_u32(&x621, &x622, x620, x590, x614); fiat_p256_addcarryx_u32(&x623, &x624, x622, x592, x616); fiat_p256_addcarryx_u32(&x625, &x626, x624, x594, x618); fiat_p256_addcarryx_u32(&x627, &x628, x626, x596, 0x0); fiat_p256_addcarryx_u32(&x629, &x630, x628, x598, 0x0); fiat_p256_addcarryx_u32(&x631, &x632, x630, x600, x588); fiat_p256_addcarryx_u32(&x633, &x634, x632, x602, x606); fiat_p256_addcarryx_u32(&x635, &x636, x634, x604, x607); x637 = ((uint32_t)x636 + x605); fiat_p256_subborrowx_u32(&x638, &x639, 0x0, x621, UINT32_C(0xffffffff)); fiat_p256_subborrowx_u32(&x640, &x641, x639, x623, UINT32_C(0xffffffff)); fiat_p256_subborrowx_u32(&x642, &x643, x641, x625, UINT32_C(0xffffffff)); fiat_p256_subborrowx_u32(&x644, &x645, x643, x627, 0x0); fiat_p256_subborrowx_u32(&x646, &x647, x645, x629, 0x0); fiat_p256_subborrowx_u32(&x648, &x649, x647, x631, 0x0); fiat_p256_subborrowx_u32(&x650, &x651, x649, x633, 0x1); fiat_p256_subborrowx_u32(&x652, &x653, x651, x635, UINT32_C(0xffffffff)); fiat_p256_subborrowx_u32(&x654, &x655, x653, x637, 0x0); fiat_p256_cmovznz_u32(&x656, x655, x638, x621); fiat_p256_cmovznz_u32(&x657, x655, x640, x623); fiat_p256_cmovznz_u32(&x658, x655, x642, x625); fiat_p256_cmovznz_u32(&x659, x655, x644, x627); fiat_p256_cmovznz_u32(&x660, x655, x646, x629); fiat_p256_cmovznz_u32(&x661, x655, x648, x631); fiat_p256_cmovznz_u32(&x662, x655, x650, x633); fiat_p256_cmovznz_u32(&x663, x655, x652, x635); out1[0] = x656; out1[1] = x657; out1[2] = x658; out1[3] = x659; out1[4] = x660; out1[5] = x661; out1[6] = x662; out1[7] = x663; } /* * The function fiat_p256_add adds two field elements in the Montgomery domain. 
* * Preconditions: * 0 ≤ eval arg1 < m * 0 ≤ eval arg2 < m * Postconditions: * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) + eval (from_montgomery arg2)) mod m * 0 ≤ eval out1 < m * */ static FIAT_P256_FIAT_INLINE void fiat_p256_add(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) { uint32_t x1; fiat_p256_uint1 x2; uint32_t x3; fiat_p256_uint1 x4; uint32_t x5; fiat_p256_uint1 x6; uint32_t x7; fiat_p256_uint1 x8; uint32_t x9; fiat_p256_uint1 x10; uint32_t x11; fiat_p256_uint1 x12; uint32_t x13; fiat_p256_uint1 x14; uint32_t x15; fiat_p256_uint1 x16; uint32_t x17; fiat_p256_uint1 x18; uint32_t x19; fiat_p256_uint1 x20; uint32_t x21; fiat_p256_uint1 x22; uint32_t x23; fiat_p256_uint1 x24; uint32_t x25; fiat_p256_uint1 x26; uint32_t x27; fiat_p256_uint1 x28; uint32_t x29; fiat_p256_uint1 x30; uint32_t x31; fiat_p256_uint1 x32; uint32_t x33; fiat_p256_uint1 x34; uint32_t x35; uint32_t x36; uint32_t x37; uint32_t x38; uint32_t x39; uint32_t x40; uint32_t x41; uint32_t x42; fiat_p256_addcarryx_u32(&x1, &x2, 0x0, (arg1[0]), (arg2[0])); fiat_p256_addcarryx_u32(&x3, &x4, x2, (arg1[1]), (arg2[1])); fiat_p256_addcarryx_u32(&x5, &x6, x4, (arg1[2]), (arg2[2])); fiat_p256_addcarryx_u32(&x7, &x8, x6, (arg1[3]), (arg2[3])); fiat_p256_addcarryx_u32(&x9, &x10, x8, (arg1[4]), (arg2[4])); fiat_p256_addcarryx_u32(&x11, &x12, x10, (arg1[5]), (arg2[5])); fiat_p256_addcarryx_u32(&x13, &x14, x12, (arg1[6]), (arg2[6])); fiat_p256_addcarryx_u32(&x15, &x16, x14, (arg1[7]), (arg2[7])); fiat_p256_subborrowx_u32(&x17, &x18, 0x0, x1, UINT32_C(0xffffffff)); fiat_p256_subborrowx_u32(&x19, &x20, x18, x3, UINT32_C(0xffffffff)); fiat_p256_subborrowx_u32(&x21, &x22, x20, x5, UINT32_C(0xffffffff)); fiat_p256_subborrowx_u32(&x23, &x24, x22, x7, 0x0); fiat_p256_subborrowx_u32(&x25, &x26, x24, x9, 0x0); fiat_p256_subborrowx_u32(&x27, &x28, x26, x11, 0x0); fiat_p256_subborrowx_u32(&x29, &x30, x28, x13, 0x1); fiat_p256_subborrowx_u32(&x31, &x32, x30, x15, UINT32_C(0xffffffff)); fiat_p256_subborrowx_u32(&x33, &x34, x32, x16, 0x0); fiat_p256_cmovznz_u32(&x35, x34, x17, x1); fiat_p256_cmovznz_u32(&x36, x34, x19, x3); fiat_p256_cmovznz_u32(&x37, x34, x21, x5); fiat_p256_cmovznz_u32(&x38, x34, x23, x7); fiat_p256_cmovznz_u32(&x39, x34, x25, x9); fiat_p256_cmovznz_u32(&x40, x34, x27, x11); fiat_p256_cmovznz_u32(&x41, x34, x29, x13); fiat_p256_cmovznz_u32(&x42, x34, x31, x15); out1[0] = x35; out1[1] = x36; out1[2] = x37; out1[3] = x38; out1[4] = x39; out1[5] = x40; out1[6] = x41; out1[7] = x42; } /* * The function fiat_p256_sub subtracts two field elements in the Montgomery domain. 
* * Preconditions: * 0 ≤ eval arg1 < m * 0 ≤ eval arg2 < m * Postconditions: * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) - eval (from_montgomery arg2)) mod m * 0 ≤ eval out1 < m * */ static FIAT_P256_FIAT_INLINE void fiat_p256_sub(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) { uint32_t x1; fiat_p256_uint1 x2; uint32_t x3; fiat_p256_uint1 x4; uint32_t x5; fiat_p256_uint1 x6; uint32_t x7; fiat_p256_uint1 x8; uint32_t x9; fiat_p256_uint1 x10; uint32_t x11; fiat_p256_uint1 x12; uint32_t x13; fiat_p256_uint1 x14; uint32_t x15; fiat_p256_uint1 x16; uint32_t x17; uint32_t x18; fiat_p256_uint1 x19; uint32_t x20; fiat_p256_uint1 x21; uint32_t x22; fiat_p256_uint1 x23; uint32_t x24; fiat_p256_uint1 x25; uint32_t x26; fiat_p256_uint1 x27; uint32_t x28; fiat_p256_uint1 x29; uint32_t x30; fiat_p256_uint1 x31; uint32_t x32; fiat_p256_uint1 x33; fiat_p256_subborrowx_u32(&x1, &x2, 0x0, (arg1[0]), (arg2[0])); fiat_p256_subborrowx_u32(&x3, &x4, x2, (arg1[1]), (arg2[1])); fiat_p256_subborrowx_u32(&x5, &x6, x4, (arg1[2]), (arg2[2])); fiat_p256_subborrowx_u32(&x7, &x8, x6, (arg1[3]), (arg2[3])); fiat_p256_subborrowx_u32(&x9, &x10, x8, (arg1[4]), (arg2[4])); fiat_p256_subborrowx_u32(&x11, &x12, x10, (arg1[5]), (arg2[5])); fiat_p256_subborrowx_u32(&x13, &x14, x12, (arg1[6]), (arg2[6])); fiat_p256_subborrowx_u32(&x15, &x16, x14, (arg1[7]), (arg2[7])); fiat_p256_cmovznz_u32(&x17, x16, 0x0, UINT32_C(0xffffffff)); fiat_p256_addcarryx_u32(&x18, &x19, 0x0, x1, x17); fiat_p256_addcarryx_u32(&x20, &x21, x19, x3, x17); fiat_p256_addcarryx_u32(&x22, &x23, x21, x5, x17); fiat_p256_addcarryx_u32(&x24, &x25, x23, x7, 0x0); fiat_p256_addcarryx_u32(&x26, &x27, x25, x9, 0x0); fiat_p256_addcarryx_u32(&x28, &x29, x27, x11, 0x0); fiat_p256_addcarryx_u32(&x30, &x31, x29, x13, (fiat_p256_uint1)(x17 & 0x1)); fiat_p256_addcarryx_u32(&x32, &x33, x31, x15, x17); out1[0] = x18; out1[1] = x20; out1[2] = x22; out1[3] = x24; out1[4] = x26; out1[5] = x28; out1[6] = x30; out1[7] = x32; } /* * The function fiat_p256_opp negates a field element in the Montgomery domain. 
* * Preconditions: * 0 ≤ eval arg1 < m * Postconditions: * eval (from_montgomery out1) mod m = -eval (from_montgomery arg1) mod m * 0 ≤ eval out1 < m * */ static FIAT_P256_FIAT_INLINE void fiat_p256_opp(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1) { uint32_t x1; fiat_p256_uint1 x2; uint32_t x3; fiat_p256_uint1 x4; uint32_t x5; fiat_p256_uint1 x6; uint32_t x7; fiat_p256_uint1 x8; uint32_t x9; fiat_p256_uint1 x10; uint32_t x11; fiat_p256_uint1 x12; uint32_t x13; fiat_p256_uint1 x14; uint32_t x15; fiat_p256_uint1 x16; uint32_t x17; uint32_t x18; fiat_p256_uint1 x19; uint32_t x20; fiat_p256_uint1 x21; uint32_t x22; fiat_p256_uint1 x23; uint32_t x24; fiat_p256_uint1 x25; uint32_t x26; fiat_p256_uint1 x27; uint32_t x28; fiat_p256_uint1 x29; uint32_t x30; fiat_p256_uint1 x31; uint32_t x32; fiat_p256_uint1 x33; fiat_p256_subborrowx_u32(&x1, &x2, 0x0, 0x0, (arg1[0])); fiat_p256_subborrowx_u32(&x3, &x4, x2, 0x0, (arg1[1])); fiat_p256_subborrowx_u32(&x5, &x6, x4, 0x0, (arg1[2])); fiat_p256_subborrowx_u32(&x7, &x8, x6, 0x0, (arg1[3])); fiat_p256_subborrowx_u32(&x9, &x10, x8, 0x0, (arg1[4])); fiat_p256_subborrowx_u32(&x11, &x12, x10, 0x0, (arg1[5])); fiat_p256_subborrowx_u32(&x13, &x14, x12, 0x0, (arg1[6])); fiat_p256_subborrowx_u32(&x15, &x16, x14, 0x0, (arg1[7])); fiat_p256_cmovznz_u32(&x17, x16, 0x0, UINT32_C(0xffffffff)); fiat_p256_addcarryx_u32(&x18, &x19, 0x0, x1, x17); fiat_p256_addcarryx_u32(&x20, &x21, x19, x3, x17); fiat_p256_addcarryx_u32(&x22, &x23, x21, x5, x17); fiat_p256_addcarryx_u32(&x24, &x25, x23, x7, 0x0); fiat_p256_addcarryx_u32(&x26, &x27, x25, x9, 0x0); fiat_p256_addcarryx_u32(&x28, &x29, x27, x11, 0x0); fiat_p256_addcarryx_u32(&x30, &x31, x29, x13, (fiat_p256_uint1)(x17 & 0x1)); fiat_p256_addcarryx_u32(&x32, &x33, x31, x15, x17); out1[0] = x18; out1[1] = x20; out1[2] = x22; out1[3] = x24; out1[4] = x26; out1[5] = x28; out1[6] = x30; out1[7] = x32; } /* * The function fiat_p256_nonzero outputs a single non-zero word if the input is non-zero and zero otherwise. * * Preconditions: * 0 ≤ eval arg1 < m * Postconditions: * out1 = 0 ↔ eval (from_montgomery arg1) mod m = 0 * * Input Bounds: * arg1: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]] * Output Bounds: * out1: [0x0 ~> 0xffffffff] */ static FIAT_P256_FIAT_INLINE void fiat_p256_nonzero(uint32_t* out1, const uint32_t arg1[8]) { uint32_t x1; x1 = ((arg1[0]) | ((arg1[1]) | ((arg1[2]) | ((arg1[3]) | ((arg1[4]) | ((arg1[5]) | ((arg1[6]) | (arg1[7])))))))); *out1 = x1; } /* * The function fiat_p256_selectznz is a multi-limb conditional select. 
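 * The select is built from one fiat_p256_cmovznz_u32 call per limb, so it runs
 * in constant time regardless of arg1. A usage sketch (flag, a and b are
 * hypothetical caller names):
 *     uint32_t r[8];
 *     fiat_p256_selectznz(r, flag, a, b);   r gets a when flag is 0, else b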
* * Postconditions: * eval out1 = (if arg1 = 0 then eval arg2 else eval arg3) * * Input Bounds: * arg1: [0x0 ~> 0x1] * arg2: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]] * arg3: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]] * Output Bounds: * out1: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]] */ static FIAT_P256_FIAT_INLINE void fiat_p256_selectznz(uint32_t out1[8], fiat_p256_uint1 arg1, const uint32_t arg2[8], const uint32_t arg3[8]) { uint32_t x1; uint32_t x2; uint32_t x3; uint32_t x4; uint32_t x5; uint32_t x6; uint32_t x7; uint32_t x8; fiat_p256_cmovznz_u32(&x1, arg1, (arg2[0]), (arg3[0])); fiat_p256_cmovznz_u32(&x2, arg1, (arg2[1]), (arg3[1])); fiat_p256_cmovznz_u32(&x3, arg1, (arg2[2]), (arg3[2])); fiat_p256_cmovznz_u32(&x4, arg1, (arg2[3]), (arg3[3])); fiat_p256_cmovznz_u32(&x5, arg1, (arg2[4]), (arg3[4])); fiat_p256_cmovznz_u32(&x6, arg1, (arg2[5]), (arg3[5])); fiat_p256_cmovznz_u32(&x7, arg1, (arg2[6]), (arg3[6])); fiat_p256_cmovznz_u32(&x8, arg1, (arg2[7]), (arg3[7])); out1[0] = x1; out1[1] = x2; out1[2] = x3; out1[3] = x4; out1[4] = x5; out1[5] = x6; out1[6] = x7; out1[7] = x8; } ring-0.17.14/third_party/fiat/p256_64.h000064400000000000000000000754401046102023000154010ustar 00000000000000/* Autogenerated: 'src/ExtractionOCaml/word_by_word_montgomery' --inline --static --use-value-barrier p256 64 '2^256 - 2^224 + 2^192 + 2^96 - 1' mul square add sub opp from_montgomery to_montgomery nonzero selectznz to_bytes from_bytes one msat divstep divstep_precomp */ /* curve description: p256 */ /* machine_wordsize = 64 (from "64") */ /* requested operations: mul, square, add, sub, opp, from_montgomery, to_montgomery, nonzero, selectznz, to_bytes, from_bytes, one, msat, divstep, divstep_precomp */ /* m = 0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff (from "2^256 - 2^224 + 2^192 + 2^96 - 1") */ /* */ /* NOTE: In addition to the bounds specified above each function, all */ /* functions synthesized for this Montgomery arithmetic require the */ /* input to be strictly less than the prime modulus (m), and also */ /* require the input to be in the unique saturated representation. */ /* All functions also ensure that these two properties are true of */ /* return values. 
*/ /* */ /* Computed values: */ /* eval z = z[0] + (z[1] << 64) + (z[2] << 128) + (z[3] << 192) */ /* bytes_eval z = z[0] + (z[1] << 8) + (z[2] << 16) + (z[3] << 24) + (z[4] << 32) + (z[5] << 40) + (z[6] << 48) + (z[7] << 56) + (z[8] << 64) + (z[9] << 72) + (z[10] << 80) + (z[11] << 88) + (z[12] << 96) + (z[13] << 104) + (z[14] << 112) + (z[15] << 120) + (z[16] << 128) + (z[17] << 136) + (z[18] << 144) + (z[19] << 152) + (z[20] << 160) + (z[21] << 168) + (z[22] << 176) + (z[23] << 184) + (z[24] << 192) + (z[25] << 200) + (z[26] << 208) + (z[27] << 216) + (z[28] << 224) + (z[29] << 232) + (z[30] << 240) + (z[31] << 248) */ /* twos_complement_eval z = let x1 := z[0] + (z[1] << 64) + (z[2] << 128) + (z[3] << 192) in */ /* if x1 & (2^256-1) < 2^255 then x1 & (2^256-1) else (x1 & (2^256-1)) - 2^256 */ #include <stdint.h> typedef unsigned char fiat_p256_uint1; typedef signed char fiat_p256_int1; #if defined(__GNUC__) || defined(__clang__) # define FIAT_P256_FIAT_EXTENSION __extension__ # define FIAT_P256_FIAT_INLINE __inline__ #else # define FIAT_P256_FIAT_EXTENSION # define FIAT_P256_FIAT_INLINE #endif FIAT_P256_FIAT_EXTENSION typedef signed __int128 fiat_p256_int128; FIAT_P256_FIAT_EXTENSION typedef unsigned __int128 fiat_p256_uint128; /* The type fiat_p256_montgomery_domain_field_element is a field element in the Montgomery domain. */ /* Bounds: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] */ typedef uint64_t fiat_p256_montgomery_domain_field_element[4]; /* The type fiat_p256_non_montgomery_domain_field_element is a field element NOT in the Montgomery domain. */ /* Bounds: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] */ typedef uint64_t fiat_p256_non_montgomery_domain_field_element[4]; #if (-1 & 3) != 3 #error "This code only works on a two's complement system" #endif #if !defined(FIAT_P256_NO_ASM) && (defined(__GNUC__) || defined(__clang__)) static __inline__ uint64_t fiat_p256_value_barrier_u64(uint64_t a) { __asm__("" : "+r"(a) : /* no inputs */); return a; } #else # define fiat_p256_value_barrier_u64(x) (x) #endif /* * The function fiat_p256_addcarryx_u64 is an addition with carry. * * Postconditions: * out1 = (arg1 + arg2 + arg3) mod 2^64 * out2 = ⌊(arg1 + arg2 + arg3) / 2^64⌋ * * Input Bounds: * arg1: [0x0 ~> 0x1] * arg2: [0x0 ~> 0xffffffffffffffff] * arg3: [0x0 ~> 0xffffffffffffffff] * Output Bounds: * out1: [0x0 ~> 0xffffffffffffffff] * out2: [0x0 ~> 0x1] */ static FIAT_P256_FIAT_INLINE void fiat_p256_addcarryx_u64(uint64_t* out1, fiat_p256_uint1* out2, fiat_p256_uint1 arg1, uint64_t arg2, uint64_t arg3) { fiat_p256_uint128 x1; uint64_t x2; fiat_p256_uint1 x3; x1 = ((arg1 + (fiat_p256_uint128)arg2) + arg3); x2 = (uint64_t)(x1 & UINT64_C(0xffffffffffffffff)); x3 = (fiat_p256_uint1)(x1 >> 64); *out1 = x2; *out2 = x3; } /* * The function fiat_p256_subborrowx_u64 is a subtraction with borrow. 
* * Postconditions: * out1 = (-arg1 + arg2 + -arg3) mod 2^64 * out2 = -⌊(-arg1 + arg2 + -arg3) / 2^64⌋ * * Input Bounds: * arg1: [0x0 ~> 0x1] * arg2: [0x0 ~> 0xffffffffffffffff] * arg3: [0x0 ~> 0xffffffffffffffff] * Output Bounds: * out1: [0x0 ~> 0xffffffffffffffff] * out2: [0x0 ~> 0x1] */ static FIAT_P256_FIAT_INLINE void fiat_p256_subborrowx_u64(uint64_t* out1, fiat_p256_uint1* out2, fiat_p256_uint1 arg1, uint64_t arg2, uint64_t arg3) { fiat_p256_int128 x1; fiat_p256_int1 x2; uint64_t x3; x1 = ((arg2 - (fiat_p256_int128)arg1) - arg3); x2 = (fiat_p256_int1)(x1 >> 64); x3 = (uint64_t)(x1 & UINT64_C(0xffffffffffffffff)); *out1 = x3; *out2 = (fiat_p256_uint1)(0x0 - x2); } /* * The function fiat_p256_mulx_u64 is a multiplication, returning the full double-width result. * * Postconditions: * out1 = (arg1 * arg2) mod 2^64 * out2 = ⌊arg1 * arg2 / 2^64⌋ * * Input Bounds: * arg1: [0x0 ~> 0xffffffffffffffff] * arg2: [0x0 ~> 0xffffffffffffffff] * Output Bounds: * out1: [0x0 ~> 0xffffffffffffffff] * out2: [0x0 ~> 0xffffffffffffffff] */ static FIAT_P256_FIAT_INLINE void fiat_p256_mulx_u64(uint64_t* out1, uint64_t* out2, uint64_t arg1, uint64_t arg2) { fiat_p256_uint128 x1; uint64_t x2; uint64_t x3; x1 = ((fiat_p256_uint128)arg1 * arg2); x2 = (uint64_t)(x1 & UINT64_C(0xffffffffffffffff)); x3 = (uint64_t)(x1 >> 64); *out1 = x2; *out2 = x3; } /* * The function fiat_p256_cmovznz_u64 is a single-word conditional move. * * Postconditions: * out1 = (if arg1 = 0 then arg2 else arg3) * * Input Bounds: * arg1: [0x0 ~> 0x1] * arg2: [0x0 ~> 0xffffffffffffffff] * arg3: [0x0 ~> 0xffffffffffffffff] * Output Bounds: * out1: [0x0 ~> 0xffffffffffffffff] */ static FIAT_P256_FIAT_INLINE void fiat_p256_cmovznz_u64(uint64_t* out1, fiat_p256_uint1 arg1, uint64_t arg2, uint64_t arg3) { fiat_p256_uint1 x1; uint64_t x2; uint64_t x3; x1 = (!(!arg1)); x2 = ((fiat_p256_int1)(0x0 - x1) & UINT64_C(0xffffffffffffffff)); x3 = ((fiat_p256_value_barrier_u64(x2) & arg3) | (fiat_p256_value_barrier_u64((~x2)) & arg2)); *out1 = x3; } /* * The function fiat_p256_mul multiplies two field elements in the Montgomery domain. 
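 * With four 64-bit limbs the Montgomery radix is R = 2^256 and a value a is
 * stored as a*R mod m, so this routine produces a*b*R mod m and the product
 * stays in the Montgomery domain. A usage sketch (hypothetical names; inputs
 * assumed already converted, e.g. with fiat_p256_to_montgomery):
 *     fiat_p256_montgomery_domain_field_element am, bm, prod;
 *     fiat_p256_mul(prod, am, bm);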
* * Preconditions: * 0 ≤ eval arg1 < m * 0 ≤ eval arg2 < m * Postconditions: * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) * eval (from_montgomery arg2)) mod m * 0 ≤ eval out1 < m * */ static FIAT_P256_FIAT_INLINE void fiat_p256_mul(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) { uint64_t x1; uint64_t x2; uint64_t x3; uint64_t x4; uint64_t x5; uint64_t x6; uint64_t x7; uint64_t x8; uint64_t x9; uint64_t x10; uint64_t x11; uint64_t x12; uint64_t x13; fiat_p256_uint1 x14; uint64_t x15; fiat_p256_uint1 x16; uint64_t x17; fiat_p256_uint1 x18; uint64_t x19; uint64_t x20; uint64_t x21; uint64_t x22; uint64_t x23; uint64_t x24; uint64_t x25; uint64_t x26; fiat_p256_uint1 x27; uint64_t x28; uint64_t x29; fiat_p256_uint1 x30; uint64_t x31; fiat_p256_uint1 x32; uint64_t x33; fiat_p256_uint1 x34; uint64_t x35; fiat_p256_uint1 x36; uint64_t x37; fiat_p256_uint1 x38; uint64_t x39; uint64_t x40; uint64_t x41; uint64_t x42; uint64_t x43; uint64_t x44; uint64_t x45; uint64_t x46; uint64_t x47; fiat_p256_uint1 x48; uint64_t x49; fiat_p256_uint1 x50; uint64_t x51; fiat_p256_uint1 x52; uint64_t x53; uint64_t x54; fiat_p256_uint1 x55; uint64_t x56; fiat_p256_uint1 x57; uint64_t x58; fiat_p256_uint1 x59; uint64_t x60; fiat_p256_uint1 x61; uint64_t x62; fiat_p256_uint1 x63; uint64_t x64; uint64_t x65; uint64_t x66; uint64_t x67; uint64_t x68; uint64_t x69; uint64_t x70; fiat_p256_uint1 x71; uint64_t x72; uint64_t x73; fiat_p256_uint1 x74; uint64_t x75; fiat_p256_uint1 x76; uint64_t x77; fiat_p256_uint1 x78; uint64_t x79; fiat_p256_uint1 x80; uint64_t x81; fiat_p256_uint1 x82; uint64_t x83; uint64_t x84; uint64_t x85; uint64_t x86; uint64_t x87; uint64_t x88; uint64_t x89; uint64_t x90; uint64_t x91; uint64_t x92; fiat_p256_uint1 x93; uint64_t x94; fiat_p256_uint1 x95; uint64_t x96; fiat_p256_uint1 x97; uint64_t x98; uint64_t x99; fiat_p256_uint1 x100; uint64_t x101; fiat_p256_uint1 x102; uint64_t x103; fiat_p256_uint1 x104; uint64_t x105; fiat_p256_uint1 x106; uint64_t x107; fiat_p256_uint1 x108; uint64_t x109; uint64_t x110; uint64_t x111; uint64_t x112; uint64_t x113; uint64_t x114; uint64_t x115; fiat_p256_uint1 x116; uint64_t x117; uint64_t x118; fiat_p256_uint1 x119; uint64_t x120; fiat_p256_uint1 x121; uint64_t x122; fiat_p256_uint1 x123; uint64_t x124; fiat_p256_uint1 x125; uint64_t x126; fiat_p256_uint1 x127; uint64_t x128; uint64_t x129; uint64_t x130; uint64_t x131; uint64_t x132; uint64_t x133; uint64_t x134; uint64_t x135; uint64_t x136; uint64_t x137; fiat_p256_uint1 x138; uint64_t x139; fiat_p256_uint1 x140; uint64_t x141; fiat_p256_uint1 x142; uint64_t x143; uint64_t x144; fiat_p256_uint1 x145; uint64_t x146; fiat_p256_uint1 x147; uint64_t x148; fiat_p256_uint1 x149; uint64_t x150; fiat_p256_uint1 x151; uint64_t x152; fiat_p256_uint1 x153; uint64_t x154; uint64_t x155; uint64_t x156; uint64_t x157; uint64_t x158; uint64_t x159; uint64_t x160; fiat_p256_uint1 x161; uint64_t x162; uint64_t x163; fiat_p256_uint1 x164; uint64_t x165; fiat_p256_uint1 x166; uint64_t x167; fiat_p256_uint1 x168; uint64_t x169; fiat_p256_uint1 x170; uint64_t x171; fiat_p256_uint1 x172; uint64_t x173; uint64_t x174; fiat_p256_uint1 x175; uint64_t x176; fiat_p256_uint1 x177; uint64_t x178; fiat_p256_uint1 x179; uint64_t x180; fiat_p256_uint1 x181; uint64_t x182; fiat_p256_uint1 x183; uint64_t x184; uint64_t x185; uint64_t x186; uint64_t x187; x1 = (arg1[1]); x2 = (arg1[2]); x3 = (arg1[3]); x4 = 
(arg1[0]); fiat_p256_mulx_u64(&x5, &x6, x4, (arg2[3])); fiat_p256_mulx_u64(&x7, &x8, x4, (arg2[2])); fiat_p256_mulx_u64(&x9, &x10, x4, (arg2[1])); fiat_p256_mulx_u64(&x11, &x12, x4, (arg2[0])); fiat_p256_addcarryx_u64(&x13, &x14, 0x0, x12, x9); fiat_p256_addcarryx_u64(&x15, &x16, x14, x10, x7); fiat_p256_addcarryx_u64(&x17, &x18, x16, x8, x5); x19 = (x18 + x6); fiat_p256_mulx_u64(&x20, &x21, x11, UINT64_C(0xffffffff00000001)); fiat_p256_mulx_u64(&x22, &x23, x11, UINT32_C(0xffffffff)); fiat_p256_mulx_u64(&x24, &x25, x11, UINT64_C(0xffffffffffffffff)); fiat_p256_addcarryx_u64(&x26, &x27, 0x0, x25, x22); x28 = (x27 + x23); fiat_p256_addcarryx_u64(&x29, &x30, 0x0, x11, x24); fiat_p256_addcarryx_u64(&x31, &x32, x30, x13, x26); fiat_p256_addcarryx_u64(&x33, &x34, x32, x15, x28); fiat_p256_addcarryx_u64(&x35, &x36, x34, x17, x20); fiat_p256_addcarryx_u64(&x37, &x38, x36, x19, x21); fiat_p256_mulx_u64(&x39, &x40, x1, (arg2[3])); fiat_p256_mulx_u64(&x41, &x42, x1, (arg2[2])); fiat_p256_mulx_u64(&x43, &x44, x1, (arg2[1])); fiat_p256_mulx_u64(&x45, &x46, x1, (arg2[0])); fiat_p256_addcarryx_u64(&x47, &x48, 0x0, x46, x43); fiat_p256_addcarryx_u64(&x49, &x50, x48, x44, x41); fiat_p256_addcarryx_u64(&x51, &x52, x50, x42, x39); x53 = (x52 + x40); fiat_p256_addcarryx_u64(&x54, &x55, 0x0, x31, x45); fiat_p256_addcarryx_u64(&x56, &x57, x55, x33, x47); fiat_p256_addcarryx_u64(&x58, &x59, x57, x35, x49); fiat_p256_addcarryx_u64(&x60, &x61, x59, x37, x51); fiat_p256_addcarryx_u64(&x62, &x63, x61, x38, x53); fiat_p256_mulx_u64(&x64, &x65, x54, UINT64_C(0xffffffff00000001)); fiat_p256_mulx_u64(&x66, &x67, x54, UINT32_C(0xffffffff)); fiat_p256_mulx_u64(&x68, &x69, x54, UINT64_C(0xffffffffffffffff)); fiat_p256_addcarryx_u64(&x70, &x71, 0x0, x69, x66); x72 = (x71 + x67); fiat_p256_addcarryx_u64(&x73, &x74, 0x0, x54, x68); fiat_p256_addcarryx_u64(&x75, &x76, x74, x56, x70); fiat_p256_addcarryx_u64(&x77, &x78, x76, x58, x72); fiat_p256_addcarryx_u64(&x79, &x80, x78, x60, x64); fiat_p256_addcarryx_u64(&x81, &x82, x80, x62, x65); x83 = ((uint64_t)x82 + x63); fiat_p256_mulx_u64(&x84, &x85, x2, (arg2[3])); fiat_p256_mulx_u64(&x86, &x87, x2, (arg2[2])); fiat_p256_mulx_u64(&x88, &x89, x2, (arg2[1])); fiat_p256_mulx_u64(&x90, &x91, x2, (arg2[0])); fiat_p256_addcarryx_u64(&x92, &x93, 0x0, x91, x88); fiat_p256_addcarryx_u64(&x94, &x95, x93, x89, x86); fiat_p256_addcarryx_u64(&x96, &x97, x95, x87, x84); x98 = (x97 + x85); fiat_p256_addcarryx_u64(&x99, &x100, 0x0, x75, x90); fiat_p256_addcarryx_u64(&x101, &x102, x100, x77, x92); fiat_p256_addcarryx_u64(&x103, &x104, x102, x79, x94); fiat_p256_addcarryx_u64(&x105, &x106, x104, x81, x96); fiat_p256_addcarryx_u64(&x107, &x108, x106, x83, x98); fiat_p256_mulx_u64(&x109, &x110, x99, UINT64_C(0xffffffff00000001)); fiat_p256_mulx_u64(&x111, &x112, x99, UINT32_C(0xffffffff)); fiat_p256_mulx_u64(&x113, &x114, x99, UINT64_C(0xffffffffffffffff)); fiat_p256_addcarryx_u64(&x115, &x116, 0x0, x114, x111); x117 = (x116 + x112); fiat_p256_addcarryx_u64(&x118, &x119, 0x0, x99, x113); fiat_p256_addcarryx_u64(&x120, &x121, x119, x101, x115); fiat_p256_addcarryx_u64(&x122, &x123, x121, x103, x117); fiat_p256_addcarryx_u64(&x124, &x125, x123, x105, x109); fiat_p256_addcarryx_u64(&x126, &x127, x125, x107, x110); x128 = ((uint64_t)x127 + x108); fiat_p256_mulx_u64(&x129, &x130, x3, (arg2[3])); fiat_p256_mulx_u64(&x131, &x132, x3, (arg2[2])); fiat_p256_mulx_u64(&x133, &x134, x3, (arg2[1])); fiat_p256_mulx_u64(&x135, &x136, x3, (arg2[0])); fiat_p256_addcarryx_u64(&x137, &x138, 0x0, x136, x133); 
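/* The carry chain below folds in the remaining arg1[3]*arg2 partial products,
   runs the last word-by-word Montgomery reduction round (the multiplications by
   the limbs of m), and ends with a conditional subtraction of m. */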
fiat_p256_addcarryx_u64(&x139, &x140, x138, x134, x131); fiat_p256_addcarryx_u64(&x141, &x142, x140, x132, x129); x143 = (x142 + x130); fiat_p256_addcarryx_u64(&x144, &x145, 0x0, x120, x135); fiat_p256_addcarryx_u64(&x146, &x147, x145, x122, x137); fiat_p256_addcarryx_u64(&x148, &x149, x147, x124, x139); fiat_p256_addcarryx_u64(&x150, &x151, x149, x126, x141); fiat_p256_addcarryx_u64(&x152, &x153, x151, x128, x143); fiat_p256_mulx_u64(&x154, &x155, x144, UINT64_C(0xffffffff00000001)); fiat_p256_mulx_u64(&x156, &x157, x144, UINT32_C(0xffffffff)); fiat_p256_mulx_u64(&x158, &x159, x144, UINT64_C(0xffffffffffffffff)); fiat_p256_addcarryx_u64(&x160, &x161, 0x0, x159, x156); x162 = (x161 + x157); fiat_p256_addcarryx_u64(&x163, &x164, 0x0, x144, x158); fiat_p256_addcarryx_u64(&x165, &x166, x164, x146, x160); fiat_p256_addcarryx_u64(&x167, &x168, x166, x148, x162); fiat_p256_addcarryx_u64(&x169, &x170, x168, x150, x154); fiat_p256_addcarryx_u64(&x171, &x172, x170, x152, x155); x173 = ((uint64_t)x172 + x153); fiat_p256_subborrowx_u64(&x174, &x175, 0x0, x165, UINT64_C(0xffffffffffffffff)); fiat_p256_subborrowx_u64(&x176, &x177, x175, x167, UINT32_C(0xffffffff)); fiat_p256_subborrowx_u64(&x178, &x179, x177, x169, 0x0); fiat_p256_subborrowx_u64(&x180, &x181, x179, x171, UINT64_C(0xffffffff00000001)); fiat_p256_subborrowx_u64(&x182, &x183, x181, x173, 0x0); fiat_p256_cmovznz_u64(&x184, x183, x174, x165); fiat_p256_cmovznz_u64(&x185, x183, x176, x167); fiat_p256_cmovznz_u64(&x186, x183, x178, x169); fiat_p256_cmovznz_u64(&x187, x183, x180, x171); out1[0] = x184; out1[1] = x185; out1[2] = x186; out1[3] = x187; } /* * The function fiat_p256_square squares a field element in the Montgomery domain. * * Preconditions: * 0 ≤ eval arg1 < m * Postconditions: * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) * eval (from_montgomery arg1)) mod m * 0 ≤ eval out1 < m * */ static FIAT_P256_FIAT_INLINE void fiat_p256_square(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1) { uint64_t x1; uint64_t x2; uint64_t x3; uint64_t x4; uint64_t x5; uint64_t x6; uint64_t x7; uint64_t x8; uint64_t x9; uint64_t x10; uint64_t x11; uint64_t x12; uint64_t x13; fiat_p256_uint1 x14; uint64_t x15; fiat_p256_uint1 x16; uint64_t x17; fiat_p256_uint1 x18; uint64_t x19; uint64_t x20; uint64_t x21; uint64_t x22; uint64_t x23; uint64_t x24; uint64_t x25; uint64_t x26; fiat_p256_uint1 x27; uint64_t x28; uint64_t x29; fiat_p256_uint1 x30; uint64_t x31; fiat_p256_uint1 x32; uint64_t x33; fiat_p256_uint1 x34; uint64_t x35; fiat_p256_uint1 x36; uint64_t x37; fiat_p256_uint1 x38; uint64_t x39; uint64_t x40; uint64_t x41; uint64_t x42; uint64_t x43; uint64_t x44; uint64_t x45; uint64_t x46; uint64_t x47; fiat_p256_uint1 x48; uint64_t x49; fiat_p256_uint1 x50; uint64_t x51; fiat_p256_uint1 x52; uint64_t x53; uint64_t x54; fiat_p256_uint1 x55; uint64_t x56; fiat_p256_uint1 x57; uint64_t x58; fiat_p256_uint1 x59; uint64_t x60; fiat_p256_uint1 x61; uint64_t x62; fiat_p256_uint1 x63; uint64_t x64; uint64_t x65; uint64_t x66; uint64_t x67; uint64_t x68; uint64_t x69; uint64_t x70; fiat_p256_uint1 x71; uint64_t x72; uint64_t x73; fiat_p256_uint1 x74; uint64_t x75; fiat_p256_uint1 x76; uint64_t x77; fiat_p256_uint1 x78; uint64_t x79; fiat_p256_uint1 x80; uint64_t x81; fiat_p256_uint1 x82; uint64_t x83; uint64_t x84; uint64_t x85; uint64_t x86; uint64_t x87; uint64_t x88; uint64_t x89; uint64_t x90; uint64_t x91; uint64_t x92; fiat_p256_uint1 x93; uint64_t x94; fiat_p256_uint1 x95; 
uint64_t x96; fiat_p256_uint1 x97; uint64_t x98; uint64_t x99; fiat_p256_uint1 x100; uint64_t x101; fiat_p256_uint1 x102; uint64_t x103; fiat_p256_uint1 x104; uint64_t x105; fiat_p256_uint1 x106; uint64_t x107; fiat_p256_uint1 x108; uint64_t x109; uint64_t x110; uint64_t x111; uint64_t x112; uint64_t x113; uint64_t x114; uint64_t x115; fiat_p256_uint1 x116; uint64_t x117; uint64_t x118; fiat_p256_uint1 x119; uint64_t x120; fiat_p256_uint1 x121; uint64_t x122; fiat_p256_uint1 x123; uint64_t x124; fiat_p256_uint1 x125; uint64_t x126; fiat_p256_uint1 x127; uint64_t x128; uint64_t x129; uint64_t x130; uint64_t x131; uint64_t x132; uint64_t x133; uint64_t x134; uint64_t x135; uint64_t x136; uint64_t x137; fiat_p256_uint1 x138; uint64_t x139; fiat_p256_uint1 x140; uint64_t x141; fiat_p256_uint1 x142; uint64_t x143; uint64_t x144; fiat_p256_uint1 x145; uint64_t x146; fiat_p256_uint1 x147; uint64_t x148; fiat_p256_uint1 x149; uint64_t x150; fiat_p256_uint1 x151; uint64_t x152; fiat_p256_uint1 x153; uint64_t x154; uint64_t x155; uint64_t x156; uint64_t x157; uint64_t x158; uint64_t x159; uint64_t x160; fiat_p256_uint1 x161; uint64_t x162; uint64_t x163; fiat_p256_uint1 x164; uint64_t x165; fiat_p256_uint1 x166; uint64_t x167; fiat_p256_uint1 x168; uint64_t x169; fiat_p256_uint1 x170; uint64_t x171; fiat_p256_uint1 x172; uint64_t x173; uint64_t x174; fiat_p256_uint1 x175; uint64_t x176; fiat_p256_uint1 x177; uint64_t x178; fiat_p256_uint1 x179; uint64_t x180; fiat_p256_uint1 x181; uint64_t x182; fiat_p256_uint1 x183; uint64_t x184; uint64_t x185; uint64_t x186; uint64_t x187; x1 = (arg1[1]); x2 = (arg1[2]); x3 = (arg1[3]); x4 = (arg1[0]); fiat_p256_mulx_u64(&x5, &x6, x4, (arg1[3])); fiat_p256_mulx_u64(&x7, &x8, x4, (arg1[2])); fiat_p256_mulx_u64(&x9, &x10, x4, (arg1[1])); fiat_p256_mulx_u64(&x11, &x12, x4, (arg1[0])); fiat_p256_addcarryx_u64(&x13, &x14, 0x0, x12, x9); fiat_p256_addcarryx_u64(&x15, &x16, x14, x10, x7); fiat_p256_addcarryx_u64(&x17, &x18, x16, x8, x5); x19 = (x18 + x6); fiat_p256_mulx_u64(&x20, &x21, x11, UINT64_C(0xffffffff00000001)); fiat_p256_mulx_u64(&x22, &x23, x11, UINT32_C(0xffffffff)); fiat_p256_mulx_u64(&x24, &x25, x11, UINT64_C(0xffffffffffffffff)); fiat_p256_addcarryx_u64(&x26, &x27, 0x0, x25, x22); x28 = (x27 + x23); fiat_p256_addcarryx_u64(&x29, &x30, 0x0, x11, x24); fiat_p256_addcarryx_u64(&x31, &x32, x30, x13, x26); fiat_p256_addcarryx_u64(&x33, &x34, x32, x15, x28); fiat_p256_addcarryx_u64(&x35, &x36, x34, x17, x20); fiat_p256_addcarryx_u64(&x37, &x38, x36, x19, x21); fiat_p256_mulx_u64(&x39, &x40, x1, (arg1[3])); fiat_p256_mulx_u64(&x41, &x42, x1, (arg1[2])); fiat_p256_mulx_u64(&x43, &x44, x1, (arg1[1])); fiat_p256_mulx_u64(&x45, &x46, x1, (arg1[0])); fiat_p256_addcarryx_u64(&x47, &x48, 0x0, x46, x43); fiat_p256_addcarryx_u64(&x49, &x50, x48, x44, x41); fiat_p256_addcarryx_u64(&x51, &x52, x50, x42, x39); x53 = (x52 + x40); fiat_p256_addcarryx_u64(&x54, &x55, 0x0, x31, x45); fiat_p256_addcarryx_u64(&x56, &x57, x55, x33, x47); fiat_p256_addcarryx_u64(&x58, &x59, x57, x35, x49); fiat_p256_addcarryx_u64(&x60, &x61, x59, x37, x51); fiat_p256_addcarryx_u64(&x62, &x63, x61, x38, x53); fiat_p256_mulx_u64(&x64, &x65, x54, UINT64_C(0xffffffff00000001)); fiat_p256_mulx_u64(&x66, &x67, x54, UINT32_C(0xffffffff)); fiat_p256_mulx_u64(&x68, &x69, x54, UINT64_C(0xffffffffffffffff)); fiat_p256_addcarryx_u64(&x70, &x71, 0x0, x69, x66); x72 = (x71 + x67); fiat_p256_addcarryx_u64(&x73, &x74, 0x0, x54, x68); fiat_p256_addcarryx_u64(&x75, &x76, x74, x56, x70); fiat_p256_addcarryx_u64(&x77, 
&x78, x76, x58, x72); fiat_p256_addcarryx_u64(&x79, &x80, x78, x60, x64); fiat_p256_addcarryx_u64(&x81, &x82, x80, x62, x65); x83 = ((uint64_t)x82 + x63); fiat_p256_mulx_u64(&x84, &x85, x2, (arg1[3])); fiat_p256_mulx_u64(&x86, &x87, x2, (arg1[2])); fiat_p256_mulx_u64(&x88, &x89, x2, (arg1[1])); fiat_p256_mulx_u64(&x90, &x91, x2, (arg1[0])); fiat_p256_addcarryx_u64(&x92, &x93, 0x0, x91, x88); fiat_p256_addcarryx_u64(&x94, &x95, x93, x89, x86); fiat_p256_addcarryx_u64(&x96, &x97, x95, x87, x84); x98 = (x97 + x85); fiat_p256_addcarryx_u64(&x99, &x100, 0x0, x75, x90); fiat_p256_addcarryx_u64(&x101, &x102, x100, x77, x92); fiat_p256_addcarryx_u64(&x103, &x104, x102, x79, x94); fiat_p256_addcarryx_u64(&x105, &x106, x104, x81, x96); fiat_p256_addcarryx_u64(&x107, &x108, x106, x83, x98); fiat_p256_mulx_u64(&x109, &x110, x99, UINT64_C(0xffffffff00000001)); fiat_p256_mulx_u64(&x111, &x112, x99, UINT32_C(0xffffffff)); fiat_p256_mulx_u64(&x113, &x114, x99, UINT64_C(0xffffffffffffffff)); fiat_p256_addcarryx_u64(&x115, &x116, 0x0, x114, x111); x117 = (x116 + x112); fiat_p256_addcarryx_u64(&x118, &x119, 0x0, x99, x113); fiat_p256_addcarryx_u64(&x120, &x121, x119, x101, x115); fiat_p256_addcarryx_u64(&x122, &x123, x121, x103, x117); fiat_p256_addcarryx_u64(&x124, &x125, x123, x105, x109); fiat_p256_addcarryx_u64(&x126, &x127, x125, x107, x110); x128 = ((uint64_t)x127 + x108); fiat_p256_mulx_u64(&x129, &x130, x3, (arg1[3])); fiat_p256_mulx_u64(&x131, &x132, x3, (arg1[2])); fiat_p256_mulx_u64(&x133, &x134, x3, (arg1[1])); fiat_p256_mulx_u64(&x135, &x136, x3, (arg1[0])); fiat_p256_addcarryx_u64(&x137, &x138, 0x0, x136, x133); fiat_p256_addcarryx_u64(&x139, &x140, x138, x134, x131); fiat_p256_addcarryx_u64(&x141, &x142, x140, x132, x129); x143 = (x142 + x130); fiat_p256_addcarryx_u64(&x144, &x145, 0x0, x120, x135); fiat_p256_addcarryx_u64(&x146, &x147, x145, x122, x137); fiat_p256_addcarryx_u64(&x148, &x149, x147, x124, x139); fiat_p256_addcarryx_u64(&x150, &x151, x149, x126, x141); fiat_p256_addcarryx_u64(&x152, &x153, x151, x128, x143); fiat_p256_mulx_u64(&x154, &x155, x144, UINT64_C(0xffffffff00000001)); fiat_p256_mulx_u64(&x156, &x157, x144, UINT32_C(0xffffffff)); fiat_p256_mulx_u64(&x158, &x159, x144, UINT64_C(0xffffffffffffffff)); fiat_p256_addcarryx_u64(&x160, &x161, 0x0, x159, x156); x162 = (x161 + x157); fiat_p256_addcarryx_u64(&x163, &x164, 0x0, x144, x158); fiat_p256_addcarryx_u64(&x165, &x166, x164, x146, x160); fiat_p256_addcarryx_u64(&x167, &x168, x166, x148, x162); fiat_p256_addcarryx_u64(&x169, &x170, x168, x150, x154); fiat_p256_addcarryx_u64(&x171, &x172, x170, x152, x155); x173 = ((uint64_t)x172 + x153); fiat_p256_subborrowx_u64(&x174, &x175, 0x0, x165, UINT64_C(0xffffffffffffffff)); fiat_p256_subborrowx_u64(&x176, &x177, x175, x167, UINT32_C(0xffffffff)); fiat_p256_subborrowx_u64(&x178, &x179, x177, x169, 0x0); fiat_p256_subborrowx_u64(&x180, &x181, x179, x171, UINT64_C(0xffffffff00000001)); fiat_p256_subborrowx_u64(&x182, &x183, x181, x173, 0x0); fiat_p256_cmovznz_u64(&x184, x183, x174, x165); fiat_p256_cmovznz_u64(&x185, x183, x176, x167); fiat_p256_cmovznz_u64(&x186, x183, x178, x169); fiat_p256_cmovznz_u64(&x187, x183, x180, x171); out1[0] = x184; out1[1] = x185; out1[2] = x186; out1[3] = x187; } /* * The function fiat_p256_add adds two field elements in the Montgomery domain. 
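 * The sum is formed limb by limb with carry and then reduced by subtracting m
 * once; the final borrow drives fiat_p256_cmovznz_u64 so that either the raw or
 * the reduced sum is selected without branching. Caller-side sketch
 * (hypothetical names; operands already in the Montgomery domain):
 *     fiat_p256_montgomery_domain_field_element am, bm, sum;
 *     fiat_p256_add(sum, am, bm);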
* * Preconditions: * 0 ≤ eval arg1 < m * 0 ≤ eval arg2 < m * Postconditions: * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) + eval (from_montgomery arg2)) mod m * 0 ≤ eval out1 < m * */ static FIAT_P256_FIAT_INLINE void fiat_p256_add(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) { uint64_t x1; fiat_p256_uint1 x2; uint64_t x3; fiat_p256_uint1 x4; uint64_t x5; fiat_p256_uint1 x6; uint64_t x7; fiat_p256_uint1 x8; uint64_t x9; fiat_p256_uint1 x10; uint64_t x11; fiat_p256_uint1 x12; uint64_t x13; fiat_p256_uint1 x14; uint64_t x15; fiat_p256_uint1 x16; uint64_t x17; fiat_p256_uint1 x18; uint64_t x19; uint64_t x20; uint64_t x21; uint64_t x22; fiat_p256_addcarryx_u64(&x1, &x2, 0x0, (arg1[0]), (arg2[0])); fiat_p256_addcarryx_u64(&x3, &x4, x2, (arg1[1]), (arg2[1])); fiat_p256_addcarryx_u64(&x5, &x6, x4, (arg1[2]), (arg2[2])); fiat_p256_addcarryx_u64(&x7, &x8, x6, (arg1[3]), (arg2[3])); fiat_p256_subborrowx_u64(&x9, &x10, 0x0, x1, UINT64_C(0xffffffffffffffff)); fiat_p256_subborrowx_u64(&x11, &x12, x10, x3, UINT32_C(0xffffffff)); fiat_p256_subborrowx_u64(&x13, &x14, x12, x5, 0x0); fiat_p256_subborrowx_u64(&x15, &x16, x14, x7, UINT64_C(0xffffffff00000001)); fiat_p256_subborrowx_u64(&x17, &x18, x16, x8, 0x0); fiat_p256_cmovznz_u64(&x19, x18, x9, x1); fiat_p256_cmovznz_u64(&x20, x18, x11, x3); fiat_p256_cmovznz_u64(&x21, x18, x13, x5); fiat_p256_cmovznz_u64(&x22, x18, x15, x7); out1[0] = x19; out1[1] = x20; out1[2] = x21; out1[3] = x22; } /* * The function fiat_p256_sub subtracts two field elements in the Montgomery domain. * * Preconditions: * 0 ≤ eval arg1 < m * 0 ≤ eval arg2 < m * Postconditions: * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) - eval (from_montgomery arg2)) mod m * 0 ≤ eval out1 < m * */ static FIAT_P256_FIAT_INLINE void fiat_p256_sub(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) { uint64_t x1; fiat_p256_uint1 x2; uint64_t x3; fiat_p256_uint1 x4; uint64_t x5; fiat_p256_uint1 x6; uint64_t x7; fiat_p256_uint1 x8; uint64_t x9; uint64_t x10; fiat_p256_uint1 x11; uint64_t x12; fiat_p256_uint1 x13; uint64_t x14; fiat_p256_uint1 x15; uint64_t x16; fiat_p256_uint1 x17; fiat_p256_subborrowx_u64(&x1, &x2, 0x0, (arg1[0]), (arg2[0])); fiat_p256_subborrowx_u64(&x3, &x4, x2, (arg1[1]), (arg2[1])); fiat_p256_subborrowx_u64(&x5, &x6, x4, (arg1[2]), (arg2[2])); fiat_p256_subborrowx_u64(&x7, &x8, x6, (arg1[3]), (arg2[3])); fiat_p256_cmovznz_u64(&x9, x8, 0x0, UINT64_C(0xffffffffffffffff)); fiat_p256_addcarryx_u64(&x10, &x11, 0x0, x1, x9); fiat_p256_addcarryx_u64(&x12, &x13, x11, x3, (x9 & UINT32_C(0xffffffff))); fiat_p256_addcarryx_u64(&x14, &x15, x13, x5, 0x0); fiat_p256_addcarryx_u64(&x16, &x17, x15, x7, (x9 & UINT64_C(0xffffffff00000001))); out1[0] = x10; out1[1] = x12; out1[2] = x14; out1[3] = x16; } /* * The function fiat_p256_opp negates a field element in the Montgomery domain. 
* * Preconditions: * 0 ≤ eval arg1 < m * Postconditions: * eval (from_montgomery out1) mod m = -eval (from_montgomery arg1) mod m * 0 ≤ eval out1 < m * */ static FIAT_P256_FIAT_INLINE void fiat_p256_opp(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1) { uint64_t x1; fiat_p256_uint1 x2; uint64_t x3; fiat_p256_uint1 x4; uint64_t x5; fiat_p256_uint1 x6; uint64_t x7; fiat_p256_uint1 x8; uint64_t x9; uint64_t x10; fiat_p256_uint1 x11; uint64_t x12; fiat_p256_uint1 x13; uint64_t x14; fiat_p256_uint1 x15; uint64_t x16; fiat_p256_uint1 x17; fiat_p256_subborrowx_u64(&x1, &x2, 0x0, 0x0, (arg1[0])); fiat_p256_subborrowx_u64(&x3, &x4, x2, 0x0, (arg1[1])); fiat_p256_subborrowx_u64(&x5, &x6, x4, 0x0, (arg1[2])); fiat_p256_subborrowx_u64(&x7, &x8, x6, 0x0, (arg1[3])); fiat_p256_cmovznz_u64(&x9, x8, 0x0, UINT64_C(0xffffffffffffffff)); fiat_p256_addcarryx_u64(&x10, &x11, 0x0, x1, x9); fiat_p256_addcarryx_u64(&x12, &x13, x11, x3, (x9 & UINT32_C(0xffffffff))); fiat_p256_addcarryx_u64(&x14, &x15, x13, x5, 0x0); fiat_p256_addcarryx_u64(&x16, &x17, x15, x7, (x9 & UINT64_C(0xffffffff00000001))); out1[0] = x10; out1[1] = x12; out1[2] = x14; out1[3] = x16; } /* * The function fiat_p256_nonzero outputs a single non-zero word if the input is non-zero and zero otherwise. * * Preconditions: * 0 ≤ eval arg1 < m * Postconditions: * out1 = 0 ↔ eval (from_montgomery arg1) mod m = 0 * * Input Bounds: * arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] * Output Bounds: * out1: [0x0 ~> 0xffffffffffffffff] */ static FIAT_P256_FIAT_INLINE void fiat_p256_nonzero(uint64_t* out1, const uint64_t arg1[4]) { uint64_t x1; x1 = ((arg1[0]) | ((arg1[1]) | ((arg1[2]) | (arg1[3])))); *out1 = x1; } /* * The function fiat_p256_selectznz is a multi-limb conditional select. 
* * Postconditions: * eval out1 = (if arg1 = 0 then eval arg2 else eval arg3) * * Input Bounds: * arg1: [0x0 ~> 0x1] * arg2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] * arg3: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] * Output Bounds: * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] */ static FIAT_P256_FIAT_INLINE void fiat_p256_selectznz(uint64_t out1[4], fiat_p256_uint1 arg1, const uint64_t arg2[4], const uint64_t arg3[4]) { uint64_t x1; uint64_t x2; uint64_t x3; uint64_t x4; fiat_p256_cmovznz_u64(&x1, arg1, (arg2[0]), (arg3[0])); fiat_p256_cmovznz_u64(&x2, arg1, (arg2[1]), (arg3[1])); fiat_p256_cmovznz_u64(&x3, arg1, (arg2[2]), (arg3[2])); fiat_p256_cmovznz_u64(&x4, arg1, (arg2[3]), (arg3[3])); out1[0] = x1; out1[1] = x2; out1[2] = x3; out1[3] = x4; } ring-0.17.14/third_party/fiat/p256_64_msvc.h000064400000000000000000001765121046102023000164330ustar 00000000000000/* Autogenerated: 'src/ExtractionOCaml/word_by_word_montgomery' --inline --static --use-value-barrier --no-wide-int p256 64 '2^256 - 2^224 + 2^192 + 2^96 - 1' mul square add sub opp from_montgomery to_montgomery nonzero selectznz to_bytes from_bytes one msat divstep divstep_precomp */ /* curve description: p256 */ /* machine_wordsize = 64 (from "64") */ /* requested operations: mul, square, add, sub, opp, from_montgomery, to_montgomery, nonzero, selectznz, to_bytes, from_bytes, one, msat, divstep, divstep_precomp */ /* m = 0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff (from "2^256 - 2^224 + 2^192 + 2^96 - 1") */ /* */ /* NOTE: In addition to the bounds specified above each function, all */ /* functions synthesized for this Montgomery arithmetic require the */ /* input to be strictly less than the prime modulus (m), and also */ /* require the input to be in the unique saturated representation. */ /* All functions also ensure that these two properties are true of */ /* return values. */ /* */ /* Computed values: */ /* eval z = z[0] + (z[1] << 64) + (z[2] << 128) + (z[3] << 192) */ /* bytes_eval z = z[0] + (z[1] << 8) + (z[2] << 16) + (z[3] << 24) + (z[4] << 32) + (z[5] << 40) + (z[6] << 48) + (z[7] << 56) + (z[8] << 64) + (z[9] << 72) + (z[10] << 80) + (z[11] << 88) + (z[12] << 96) + (z[13] << 104) + (z[14] << 112) + (z[15] << 120) + (z[16] << 128) + (z[17] << 136) + (z[18] << 144) + (z[19] << 152) + (z[20] << 160) + (z[21] << 168) + (z[22] << 176) + (z[23] << 184) + (z[24] << 192) + (z[25] << 200) + (z[26] << 208) + (z[27] << 216) + (z[28] << 224) + (z[29] << 232) + (z[30] << 240) + (z[31] << 248) */ /* twos_complement_eval z = let x1 := z[0] + (z[1] << 64) + (z[2] << 128) + (z[3] << 192) in */ /* if x1 & (2^256-1) < 2^255 then x1 & (2^256-1) else (x1 & (2^256-1)) - 2^256 */ #include #include #if defined(_M_X64) #include #endif typedef unsigned char fiat_p256_uint1; typedef signed char fiat_p256_int1; #define FIAT_P256_FIAT_INLINE inline /* The type fiat_p256_montgomery_domain_field_element is a field element in the Montgomery domain. */ /* Bounds: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] */ typedef uint64_t fiat_p256_montgomery_domain_field_element[4]; /* The type fiat_p256_non_montgomery_domain_field_element is a field element NOT in the Montgomery domain. 
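   Values move between the two representations with fiat_p256_to_montgomery and
   fiat_p256_from_montgomery (both listed among the requested operations above).
   A round-trip sketch with hypothetical names:
       fiat_p256_non_montgomery_domain_field_element a, back;
       fiat_p256_montgomery_domain_field_element am;
       fiat_p256_to_montgomery(am, a);
       fiat_p256_from_montgomery(back, am);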
*/ /* Bounds: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] */ typedef uint64_t fiat_p256_non_montgomery_domain_field_element[4]; #if (-1 & 3) != 3 #error "This code only works on a two's complement system" #endif #define fiat_p256_value_barrier_u64(x) (x) /* * The function fiat_p256_addcarryx_u64 is an addition with carry. * * Postconditions: * out1 = (arg1 + arg2 + arg3) mod 2^64 * out2 = ⌊(arg1 + arg2 + arg3) / 2^64⌋ * * Input Bounds: * arg1: [0x0 ~> 0x1] * arg2: [0x0 ~> 0xffffffffffffffff] * arg3: [0x0 ~> 0xffffffffffffffff] * Output Bounds: * out1: [0x0 ~> 0xffffffffffffffff] * out2: [0x0 ~> 0x1] */ static FIAT_P256_FIAT_INLINE void fiat_p256_addcarryx_u64(uint64_t* out1, fiat_p256_uint1* out2, fiat_p256_uint1 arg1, uint64_t arg2, uint64_t arg3) { #if defined(_M_X64) *out2 = _addcarry_u64(arg1, arg2, arg3, out1); #else arg2 += arg1; arg1 = arg2 < arg1; arg3 += arg2; arg1 += arg3 < arg2; *out1 = arg3; *out2 = arg1; #endif } /* * The function fiat_p256_subborrowx_u64 is a subtraction with borrow. * * Postconditions: * out1 = (-arg1 + arg2 + -arg3) mod 2^64 * out2 = -⌊(-arg1 + arg2 + -arg3) / 2^64⌋ * * Input Bounds: * arg1: [0x0 ~> 0x1] * arg2: [0x0 ~> 0xffffffffffffffff] * arg3: [0x0 ~> 0xffffffffffffffff] * Output Bounds: * out1: [0x0 ~> 0xffffffffffffffff] * out2: [0x0 ~> 0x1] */ static FIAT_P256_FIAT_INLINE void fiat_p256_subborrowx_u64(uint64_t* out1, fiat_p256_uint1* out2, fiat_p256_uint1 arg1, uint64_t arg2, uint64_t arg3) { #if defined(_M_X64) *out2 = _subborrow_u64(arg1, arg2, arg3, out1); // NOTE: edited after generation #else *out1 = arg2 - arg3 - arg1; *out2 = (arg2 < arg3) | ((arg2 == arg3) & arg1); #endif } /* * The function fiat_p256_mulx_u64 is a multiplication, returning the full double-width result. * * Postconditions: * out1 = (arg1 * arg2) mod 2^64 * out2 = ⌊arg1 * arg2 / 2^64⌋ * * Input Bounds: * arg1: [0x0 ~> 0xffffffffffffffff] * arg2: [0x0 ~> 0xffffffffffffffff] * Output Bounds: * out1: [0x0 ~> 0xffffffffffffffff] * out2: [0x0 ~> 0xffffffffffffffff] */ static FIAT_P256_FIAT_INLINE void fiat_p256_mulx_u64(uint64_t* out1, uint64_t* out2, uint64_t arg1, uint64_t arg2) { // NOTE: edited after generation #if defined(_M_X64) *out1 = _umul128(arg1, arg2, out2); #elif defined(_M_ARM64) *out1 = arg1 * arg2; *out2 = __umulh(arg1, arg2); #else #error "This file is intended for MSVC on X64 or ARM64" #endif } /* * The function fiat_p256_cmovznz_u64 is a single-word conditional move. * * Postconditions: * out1 = (if arg1 = 0 then arg2 else arg3) * * Input Bounds: * arg1: [0x0 ~> 0x1] * arg2: [0x0 ~> 0xffffffffffffffff] * arg3: [0x0 ~> 0xffffffffffffffff] * Output Bounds: * out1: [0x0 ~> 0xffffffffffffffff] */ static FIAT_P256_FIAT_INLINE void fiat_p256_cmovznz_u64(uint64_t* out1, fiat_p256_uint1 arg1, uint64_t arg2, uint64_t arg3) { fiat_p256_uint1 x1; uint64_t x2; uint64_t x3; x1 = (!(!arg1)); x2 = ((fiat_p256_int1)(0x0 - x1) & UINT64_C(0xffffffffffffffff)); x3 = ((fiat_p256_value_barrier_u64(x2) & arg3) | (fiat_p256_value_barrier_u64((~x2)) & arg2)); *out1 = x3; } /* * The function fiat_p256_mul multiplies two field elements in the Montgomery domain. 
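 * This MSVC variant computes the same values as the __int128 version in
 * p256_64.h, but every 64x64-to-128-bit multiply goes through fiat_p256_mulx_u64
 * (built on _umul128 or __umulh above) and every carry or borrow goes through
 * fiat_p256_addcarryx_u64 and fiat_p256_subborrowx_u64, so no wide integer type
 * is needed.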
* * Preconditions: * 0 ≤ eval arg1 < m * 0 ≤ eval arg2 < m * Postconditions: * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) * eval (from_montgomery arg2)) mod m * 0 ≤ eval out1 < m * */ static FIAT_P256_FIAT_INLINE void fiat_p256_mul(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) { uint64_t x1; uint64_t x2; uint64_t x3; uint64_t x4; uint64_t x5; uint64_t x6; uint64_t x7; uint64_t x8; uint64_t x9; uint64_t x10; uint64_t x11; uint64_t x12; uint64_t x13; fiat_p256_uint1 x14; uint64_t x15; fiat_p256_uint1 x16; uint64_t x17; fiat_p256_uint1 x18; uint64_t x19; uint64_t x20; uint64_t x21; uint64_t x22; uint64_t x23; uint64_t x24; uint64_t x25; uint64_t x26; fiat_p256_uint1 x27; uint64_t x28; uint64_t x29; fiat_p256_uint1 x30; uint64_t x31; fiat_p256_uint1 x32; uint64_t x33; fiat_p256_uint1 x34; uint64_t x35; fiat_p256_uint1 x36; uint64_t x37; fiat_p256_uint1 x38; uint64_t x39; uint64_t x40; uint64_t x41; uint64_t x42; uint64_t x43; uint64_t x44; uint64_t x45; uint64_t x46; uint64_t x47; fiat_p256_uint1 x48; uint64_t x49; fiat_p256_uint1 x50; uint64_t x51; fiat_p256_uint1 x52; uint64_t x53; uint64_t x54; fiat_p256_uint1 x55; uint64_t x56; fiat_p256_uint1 x57; uint64_t x58; fiat_p256_uint1 x59; uint64_t x60; fiat_p256_uint1 x61; uint64_t x62; fiat_p256_uint1 x63; uint64_t x64; uint64_t x65; uint64_t x66; uint64_t x67; uint64_t x68; uint64_t x69; uint64_t x70; fiat_p256_uint1 x71; uint64_t x72; uint64_t x73; fiat_p256_uint1 x74; uint64_t x75; fiat_p256_uint1 x76; uint64_t x77; fiat_p256_uint1 x78; uint64_t x79; fiat_p256_uint1 x80; uint64_t x81; fiat_p256_uint1 x82; uint64_t x83; uint64_t x84; uint64_t x85; uint64_t x86; uint64_t x87; uint64_t x88; uint64_t x89; uint64_t x90; uint64_t x91; uint64_t x92; fiat_p256_uint1 x93; uint64_t x94; fiat_p256_uint1 x95; uint64_t x96; fiat_p256_uint1 x97; uint64_t x98; uint64_t x99; fiat_p256_uint1 x100; uint64_t x101; fiat_p256_uint1 x102; uint64_t x103; fiat_p256_uint1 x104; uint64_t x105; fiat_p256_uint1 x106; uint64_t x107; fiat_p256_uint1 x108; uint64_t x109; uint64_t x110; uint64_t x111; uint64_t x112; uint64_t x113; uint64_t x114; uint64_t x115; fiat_p256_uint1 x116; uint64_t x117; uint64_t x118; fiat_p256_uint1 x119; uint64_t x120; fiat_p256_uint1 x121; uint64_t x122; fiat_p256_uint1 x123; uint64_t x124; fiat_p256_uint1 x125; uint64_t x126; fiat_p256_uint1 x127; uint64_t x128; uint64_t x129; uint64_t x130; uint64_t x131; uint64_t x132; uint64_t x133; uint64_t x134; uint64_t x135; uint64_t x136; uint64_t x137; fiat_p256_uint1 x138; uint64_t x139; fiat_p256_uint1 x140; uint64_t x141; fiat_p256_uint1 x142; uint64_t x143; uint64_t x144; fiat_p256_uint1 x145; uint64_t x146; fiat_p256_uint1 x147; uint64_t x148; fiat_p256_uint1 x149; uint64_t x150; fiat_p256_uint1 x151; uint64_t x152; fiat_p256_uint1 x153; uint64_t x154; uint64_t x155; uint64_t x156; uint64_t x157; uint64_t x158; uint64_t x159; uint64_t x160; fiat_p256_uint1 x161; uint64_t x162; uint64_t x163; fiat_p256_uint1 x164; uint64_t x165; fiat_p256_uint1 x166; uint64_t x167; fiat_p256_uint1 x168; uint64_t x169; fiat_p256_uint1 x170; uint64_t x171; fiat_p256_uint1 x172; uint64_t x173; uint64_t x174; fiat_p256_uint1 x175; uint64_t x176; fiat_p256_uint1 x177; uint64_t x178; fiat_p256_uint1 x179; uint64_t x180; fiat_p256_uint1 x181; uint64_t x182; fiat_p256_uint1 x183; uint64_t x184; uint64_t x185; uint64_t x186; uint64_t x187; x1 = (arg1[1]); x2 = (arg1[2]); x3 = (arg1[3]); x4 = 
(arg1[0]); fiat_p256_mulx_u64(&x5, &x6, x4, (arg2[3])); fiat_p256_mulx_u64(&x7, &x8, x4, (arg2[2])); fiat_p256_mulx_u64(&x9, &x10, x4, (arg2[1])); fiat_p256_mulx_u64(&x11, &x12, x4, (arg2[0])); fiat_p256_addcarryx_u64(&x13, &x14, 0x0, x12, x9); fiat_p256_addcarryx_u64(&x15, &x16, x14, x10, x7); fiat_p256_addcarryx_u64(&x17, &x18, x16, x8, x5); x19 = (x18 + x6); fiat_p256_mulx_u64(&x20, &x21, x11, UINT64_C(0xffffffff00000001)); fiat_p256_mulx_u64(&x22, &x23, x11, UINT32_C(0xffffffff)); fiat_p256_mulx_u64(&x24, &x25, x11, UINT64_C(0xffffffffffffffff)); fiat_p256_addcarryx_u64(&x26, &x27, 0x0, x25, x22); x28 = (x27 + x23); fiat_p256_addcarryx_u64(&x29, &x30, 0x0, x11, x24); fiat_p256_addcarryx_u64(&x31, &x32, x30, x13, x26); fiat_p256_addcarryx_u64(&x33, &x34, x32, x15, x28); fiat_p256_addcarryx_u64(&x35, &x36, x34, x17, x20); fiat_p256_addcarryx_u64(&x37, &x38, x36, x19, x21); fiat_p256_mulx_u64(&x39, &x40, x1, (arg2[3])); fiat_p256_mulx_u64(&x41, &x42, x1, (arg2[2])); fiat_p256_mulx_u64(&x43, &x44, x1, (arg2[1])); fiat_p256_mulx_u64(&x45, &x46, x1, (arg2[0])); fiat_p256_addcarryx_u64(&x47, &x48, 0x0, x46, x43); fiat_p256_addcarryx_u64(&x49, &x50, x48, x44, x41); fiat_p256_addcarryx_u64(&x51, &x52, x50, x42, x39); x53 = (x52 + x40); fiat_p256_addcarryx_u64(&x54, &x55, 0x0, x31, x45); fiat_p256_addcarryx_u64(&x56, &x57, x55, x33, x47); fiat_p256_addcarryx_u64(&x58, &x59, x57, x35, x49); fiat_p256_addcarryx_u64(&x60, &x61, x59, x37, x51); fiat_p256_addcarryx_u64(&x62, &x63, x61, x38, x53); fiat_p256_mulx_u64(&x64, &x65, x54, UINT64_C(0xffffffff00000001)); fiat_p256_mulx_u64(&x66, &x67, x54, UINT32_C(0xffffffff)); fiat_p256_mulx_u64(&x68, &x69, x54, UINT64_C(0xffffffffffffffff)); fiat_p256_addcarryx_u64(&x70, &x71, 0x0, x69, x66); x72 = (x71 + x67); fiat_p256_addcarryx_u64(&x73, &x74, 0x0, x54, x68); fiat_p256_addcarryx_u64(&x75, &x76, x74, x56, x70); fiat_p256_addcarryx_u64(&x77, &x78, x76, x58, x72); fiat_p256_addcarryx_u64(&x79, &x80, x78, x60, x64); fiat_p256_addcarryx_u64(&x81, &x82, x80, x62, x65); x83 = ((uint64_t)x82 + x63); fiat_p256_mulx_u64(&x84, &x85, x2, (arg2[3])); fiat_p256_mulx_u64(&x86, &x87, x2, (arg2[2])); fiat_p256_mulx_u64(&x88, &x89, x2, (arg2[1])); fiat_p256_mulx_u64(&x90, &x91, x2, (arg2[0])); fiat_p256_addcarryx_u64(&x92, &x93, 0x0, x91, x88); fiat_p256_addcarryx_u64(&x94, &x95, x93, x89, x86); fiat_p256_addcarryx_u64(&x96, &x97, x95, x87, x84); x98 = (x97 + x85); fiat_p256_addcarryx_u64(&x99, &x100, 0x0, x75, x90); fiat_p256_addcarryx_u64(&x101, &x102, x100, x77, x92); fiat_p256_addcarryx_u64(&x103, &x104, x102, x79, x94); fiat_p256_addcarryx_u64(&x105, &x106, x104, x81, x96); fiat_p256_addcarryx_u64(&x107, &x108, x106, x83, x98); fiat_p256_mulx_u64(&x109, &x110, x99, UINT64_C(0xffffffff00000001)); fiat_p256_mulx_u64(&x111, &x112, x99, UINT32_C(0xffffffff)); fiat_p256_mulx_u64(&x113, &x114, x99, UINT64_C(0xffffffffffffffff)); fiat_p256_addcarryx_u64(&x115, &x116, 0x0, x114, x111); x117 = (x116 + x112); fiat_p256_addcarryx_u64(&x118, &x119, 0x0, x99, x113); fiat_p256_addcarryx_u64(&x120, &x121, x119, x101, x115); fiat_p256_addcarryx_u64(&x122, &x123, x121, x103, x117); fiat_p256_addcarryx_u64(&x124, &x125, x123, x105, x109); fiat_p256_addcarryx_u64(&x126, &x127, x125, x107, x110); x128 = ((uint64_t)x127 + x108); fiat_p256_mulx_u64(&x129, &x130, x3, (arg2[3])); fiat_p256_mulx_u64(&x131, &x132, x3, (arg2[2])); fiat_p256_mulx_u64(&x133, &x134, x3, (arg2[1])); fiat_p256_mulx_u64(&x135, &x136, x3, (arg2[0])); fiat_p256_addcarryx_u64(&x137, &x138, 0x0, x136, x133); 
fiat_p256_addcarryx_u64(&x139, &x140, x138, x134, x131); fiat_p256_addcarryx_u64(&x141, &x142, x140, x132, x129); x143 = (x142 + x130); fiat_p256_addcarryx_u64(&x144, &x145, 0x0, x120, x135); fiat_p256_addcarryx_u64(&x146, &x147, x145, x122, x137); fiat_p256_addcarryx_u64(&x148, &x149, x147, x124, x139); fiat_p256_addcarryx_u64(&x150, &x151, x149, x126, x141); fiat_p256_addcarryx_u64(&x152, &x153, x151, x128, x143); fiat_p256_mulx_u64(&x154, &x155, x144, UINT64_C(0xffffffff00000001)); fiat_p256_mulx_u64(&x156, &x157, x144, UINT32_C(0xffffffff)); fiat_p256_mulx_u64(&x158, &x159, x144, UINT64_C(0xffffffffffffffff)); fiat_p256_addcarryx_u64(&x160, &x161, 0x0, x159, x156); x162 = (x161 + x157); fiat_p256_addcarryx_u64(&x163, &x164, 0x0, x144, x158); fiat_p256_addcarryx_u64(&x165, &x166, x164, x146, x160); fiat_p256_addcarryx_u64(&x167, &x168, x166, x148, x162); fiat_p256_addcarryx_u64(&x169, &x170, x168, x150, x154); fiat_p256_addcarryx_u64(&x171, &x172, x170, x152, x155); x173 = ((uint64_t)x172 + x153); fiat_p256_subborrowx_u64(&x174, &x175, 0x0, x165, UINT64_C(0xffffffffffffffff)); fiat_p256_subborrowx_u64(&x176, &x177, x175, x167, UINT32_C(0xffffffff)); fiat_p256_subborrowx_u64(&x178, &x179, x177, x169, 0x0); fiat_p256_subborrowx_u64(&x180, &x181, x179, x171, UINT64_C(0xffffffff00000001)); fiat_p256_subborrowx_u64(&x182, &x183, x181, x173, 0x0); fiat_p256_cmovznz_u64(&x184, x183, x174, x165); fiat_p256_cmovznz_u64(&x185, x183, x176, x167); fiat_p256_cmovznz_u64(&x186, x183, x178, x169); fiat_p256_cmovznz_u64(&x187, x183, x180, x171); out1[0] = x184; out1[1] = x185; out1[2] = x186; out1[3] = x187; } /* * The function fiat_p256_square squares a field element in the Montgomery domain. * * Preconditions: * 0 ≤ eval arg1 < m * Postconditions: * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) * eval (from_montgomery arg1)) mod m * 0 ≤ eval out1 < m * */ static FIAT_P256_FIAT_INLINE void fiat_p256_square(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1) { uint64_t x1; uint64_t x2; uint64_t x3; uint64_t x4; uint64_t x5; uint64_t x6; uint64_t x7; uint64_t x8; uint64_t x9; uint64_t x10; uint64_t x11; uint64_t x12; uint64_t x13; fiat_p256_uint1 x14; uint64_t x15; fiat_p256_uint1 x16; uint64_t x17; fiat_p256_uint1 x18; uint64_t x19; uint64_t x20; uint64_t x21; uint64_t x22; uint64_t x23; uint64_t x24; uint64_t x25; uint64_t x26; fiat_p256_uint1 x27; uint64_t x28; uint64_t x29; fiat_p256_uint1 x30; uint64_t x31; fiat_p256_uint1 x32; uint64_t x33; fiat_p256_uint1 x34; uint64_t x35; fiat_p256_uint1 x36; uint64_t x37; fiat_p256_uint1 x38; uint64_t x39; uint64_t x40; uint64_t x41; uint64_t x42; uint64_t x43; uint64_t x44; uint64_t x45; uint64_t x46; uint64_t x47; fiat_p256_uint1 x48; uint64_t x49; fiat_p256_uint1 x50; uint64_t x51; fiat_p256_uint1 x52; uint64_t x53; uint64_t x54; fiat_p256_uint1 x55; uint64_t x56; fiat_p256_uint1 x57; uint64_t x58; fiat_p256_uint1 x59; uint64_t x60; fiat_p256_uint1 x61; uint64_t x62; fiat_p256_uint1 x63; uint64_t x64; uint64_t x65; uint64_t x66; uint64_t x67; uint64_t x68; uint64_t x69; uint64_t x70; fiat_p256_uint1 x71; uint64_t x72; uint64_t x73; fiat_p256_uint1 x74; uint64_t x75; fiat_p256_uint1 x76; uint64_t x77; fiat_p256_uint1 x78; uint64_t x79; fiat_p256_uint1 x80; uint64_t x81; fiat_p256_uint1 x82; uint64_t x83; uint64_t x84; uint64_t x85; uint64_t x86; uint64_t x87; uint64_t x88; uint64_t x89; uint64_t x90; uint64_t x91; uint64_t x92; fiat_p256_uint1 x93; uint64_t x94; fiat_p256_uint1 x95; 
uint64_t x96; fiat_p256_uint1 x97; uint64_t x98; uint64_t x99; fiat_p256_uint1 x100; uint64_t x101; fiat_p256_uint1 x102; uint64_t x103; fiat_p256_uint1 x104; uint64_t x105; fiat_p256_uint1 x106; uint64_t x107; fiat_p256_uint1 x108; uint64_t x109; uint64_t x110; uint64_t x111; uint64_t x112; uint64_t x113; uint64_t x114; uint64_t x115; fiat_p256_uint1 x116; uint64_t x117; uint64_t x118; fiat_p256_uint1 x119; uint64_t x120; fiat_p256_uint1 x121; uint64_t x122; fiat_p256_uint1 x123; uint64_t x124; fiat_p256_uint1 x125; uint64_t x126; fiat_p256_uint1 x127; uint64_t x128; uint64_t x129; uint64_t x130; uint64_t x131; uint64_t x132; uint64_t x133; uint64_t x134; uint64_t x135; uint64_t x136; uint64_t x137; fiat_p256_uint1 x138; uint64_t x139; fiat_p256_uint1 x140; uint64_t x141; fiat_p256_uint1 x142; uint64_t x143; uint64_t x144; fiat_p256_uint1 x145; uint64_t x146; fiat_p256_uint1 x147; uint64_t x148; fiat_p256_uint1 x149; uint64_t x150; fiat_p256_uint1 x151; uint64_t x152; fiat_p256_uint1 x153; uint64_t x154; uint64_t x155; uint64_t x156; uint64_t x157; uint64_t x158; uint64_t x159; uint64_t x160; fiat_p256_uint1 x161; uint64_t x162; uint64_t x163; fiat_p256_uint1 x164; uint64_t x165; fiat_p256_uint1 x166; uint64_t x167; fiat_p256_uint1 x168; uint64_t x169; fiat_p256_uint1 x170; uint64_t x171; fiat_p256_uint1 x172; uint64_t x173; uint64_t x174; fiat_p256_uint1 x175; uint64_t x176; fiat_p256_uint1 x177; uint64_t x178; fiat_p256_uint1 x179; uint64_t x180; fiat_p256_uint1 x181; uint64_t x182; fiat_p256_uint1 x183; uint64_t x184; uint64_t x185; uint64_t x186; uint64_t x187; x1 = (arg1[1]); x2 = (arg1[2]); x3 = (arg1[3]); x4 = (arg1[0]); fiat_p256_mulx_u64(&x5, &x6, x4, (arg1[3])); fiat_p256_mulx_u64(&x7, &x8, x4, (arg1[2])); fiat_p256_mulx_u64(&x9, &x10, x4, (arg1[1])); fiat_p256_mulx_u64(&x11, &x12, x4, (arg1[0])); fiat_p256_addcarryx_u64(&x13, &x14, 0x0, x12, x9); fiat_p256_addcarryx_u64(&x15, &x16, x14, x10, x7); fiat_p256_addcarryx_u64(&x17, &x18, x16, x8, x5); x19 = (x18 + x6); fiat_p256_mulx_u64(&x20, &x21, x11, UINT64_C(0xffffffff00000001)); fiat_p256_mulx_u64(&x22, &x23, x11, UINT32_C(0xffffffff)); fiat_p256_mulx_u64(&x24, &x25, x11, UINT64_C(0xffffffffffffffff)); fiat_p256_addcarryx_u64(&x26, &x27, 0x0, x25, x22); x28 = (x27 + x23); fiat_p256_addcarryx_u64(&x29, &x30, 0x0, x11, x24); fiat_p256_addcarryx_u64(&x31, &x32, x30, x13, x26); fiat_p256_addcarryx_u64(&x33, &x34, x32, x15, x28); fiat_p256_addcarryx_u64(&x35, &x36, x34, x17, x20); fiat_p256_addcarryx_u64(&x37, &x38, x36, x19, x21); fiat_p256_mulx_u64(&x39, &x40, x1, (arg1[3])); fiat_p256_mulx_u64(&x41, &x42, x1, (arg1[2])); fiat_p256_mulx_u64(&x43, &x44, x1, (arg1[1])); fiat_p256_mulx_u64(&x45, &x46, x1, (arg1[0])); fiat_p256_addcarryx_u64(&x47, &x48, 0x0, x46, x43); fiat_p256_addcarryx_u64(&x49, &x50, x48, x44, x41); fiat_p256_addcarryx_u64(&x51, &x52, x50, x42, x39); x53 = (x52 + x40); fiat_p256_addcarryx_u64(&x54, &x55, 0x0, x31, x45); fiat_p256_addcarryx_u64(&x56, &x57, x55, x33, x47); fiat_p256_addcarryx_u64(&x58, &x59, x57, x35, x49); fiat_p256_addcarryx_u64(&x60, &x61, x59, x37, x51); fiat_p256_addcarryx_u64(&x62, &x63, x61, x38, x53); fiat_p256_mulx_u64(&x64, &x65, x54, UINT64_C(0xffffffff00000001)); fiat_p256_mulx_u64(&x66, &x67, x54, UINT32_C(0xffffffff)); fiat_p256_mulx_u64(&x68, &x69, x54, UINT64_C(0xffffffffffffffff)); fiat_p256_addcarryx_u64(&x70, &x71, 0x0, x69, x66); x72 = (x71 + x67); fiat_p256_addcarryx_u64(&x73, &x74, 0x0, x54, x68); fiat_p256_addcarryx_u64(&x75, &x76, x74, x56, x70); fiat_p256_addcarryx_u64(&x77, 
&x78, x76, x58, x72); fiat_p256_addcarryx_u64(&x79, &x80, x78, x60, x64); fiat_p256_addcarryx_u64(&x81, &x82, x80, x62, x65); x83 = ((uint64_t)x82 + x63); fiat_p256_mulx_u64(&x84, &x85, x2, (arg1[3])); fiat_p256_mulx_u64(&x86, &x87, x2, (arg1[2])); fiat_p256_mulx_u64(&x88, &x89, x2, (arg1[1])); fiat_p256_mulx_u64(&x90, &x91, x2, (arg1[0])); fiat_p256_addcarryx_u64(&x92, &x93, 0x0, x91, x88); fiat_p256_addcarryx_u64(&x94, &x95, x93, x89, x86); fiat_p256_addcarryx_u64(&x96, &x97, x95, x87, x84); x98 = (x97 + x85); fiat_p256_addcarryx_u64(&x99, &x100, 0x0, x75, x90); fiat_p256_addcarryx_u64(&x101, &x102, x100, x77, x92); fiat_p256_addcarryx_u64(&x103, &x104, x102, x79, x94); fiat_p256_addcarryx_u64(&x105, &x106, x104, x81, x96); fiat_p256_addcarryx_u64(&x107, &x108, x106, x83, x98); fiat_p256_mulx_u64(&x109, &x110, x99, UINT64_C(0xffffffff00000001)); fiat_p256_mulx_u64(&x111, &x112, x99, UINT32_C(0xffffffff)); fiat_p256_mulx_u64(&x113, &x114, x99, UINT64_C(0xffffffffffffffff)); fiat_p256_addcarryx_u64(&x115, &x116, 0x0, x114, x111); x117 = (x116 + x112); fiat_p256_addcarryx_u64(&x118, &x119, 0x0, x99, x113); fiat_p256_addcarryx_u64(&x120, &x121, x119, x101, x115); fiat_p256_addcarryx_u64(&x122, &x123, x121, x103, x117); fiat_p256_addcarryx_u64(&x124, &x125, x123, x105, x109); fiat_p256_addcarryx_u64(&x126, &x127, x125, x107, x110); x128 = ((uint64_t)x127 + x108); fiat_p256_mulx_u64(&x129, &x130, x3, (arg1[3])); fiat_p256_mulx_u64(&x131, &x132, x3, (arg1[2])); fiat_p256_mulx_u64(&x133, &x134, x3, (arg1[1])); fiat_p256_mulx_u64(&x135, &x136, x3, (arg1[0])); fiat_p256_addcarryx_u64(&x137, &x138, 0x0, x136, x133); fiat_p256_addcarryx_u64(&x139, &x140, x138, x134, x131); fiat_p256_addcarryx_u64(&x141, &x142, x140, x132, x129); x143 = (x142 + x130); fiat_p256_addcarryx_u64(&x144, &x145, 0x0, x120, x135); fiat_p256_addcarryx_u64(&x146, &x147, x145, x122, x137); fiat_p256_addcarryx_u64(&x148, &x149, x147, x124, x139); fiat_p256_addcarryx_u64(&x150, &x151, x149, x126, x141); fiat_p256_addcarryx_u64(&x152, &x153, x151, x128, x143); fiat_p256_mulx_u64(&x154, &x155, x144, UINT64_C(0xffffffff00000001)); fiat_p256_mulx_u64(&x156, &x157, x144, UINT32_C(0xffffffff)); fiat_p256_mulx_u64(&x158, &x159, x144, UINT64_C(0xffffffffffffffff)); fiat_p256_addcarryx_u64(&x160, &x161, 0x0, x159, x156); x162 = (x161 + x157); fiat_p256_addcarryx_u64(&x163, &x164, 0x0, x144, x158); fiat_p256_addcarryx_u64(&x165, &x166, x164, x146, x160); fiat_p256_addcarryx_u64(&x167, &x168, x166, x148, x162); fiat_p256_addcarryx_u64(&x169, &x170, x168, x150, x154); fiat_p256_addcarryx_u64(&x171, &x172, x170, x152, x155); x173 = ((uint64_t)x172 + x153); fiat_p256_subborrowx_u64(&x174, &x175, 0x0, x165, UINT64_C(0xffffffffffffffff)); fiat_p256_subborrowx_u64(&x176, &x177, x175, x167, UINT32_C(0xffffffff)); fiat_p256_subborrowx_u64(&x178, &x179, x177, x169, 0x0); fiat_p256_subborrowx_u64(&x180, &x181, x179, x171, UINT64_C(0xffffffff00000001)); fiat_p256_subborrowx_u64(&x182, &x183, x181, x173, 0x0); fiat_p256_cmovznz_u64(&x184, x183, x174, x165); fiat_p256_cmovznz_u64(&x185, x183, x176, x167); fiat_p256_cmovznz_u64(&x186, x183, x178, x169); fiat_p256_cmovznz_u64(&x187, x183, x180, x171); out1[0] = x184; out1[1] = x185; out1[2] = x186; out1[3] = x187; } /* * The function fiat_p256_add adds two field elements in the Montgomery domain. 
* * Preconditions: * 0 ≤ eval arg1 < m * 0 ≤ eval arg2 < m * Postconditions: * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) + eval (from_montgomery arg2)) mod m * 0 ≤ eval out1 < m * */ static FIAT_P256_FIAT_INLINE void fiat_p256_add(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) { uint64_t x1; fiat_p256_uint1 x2; uint64_t x3; fiat_p256_uint1 x4; uint64_t x5; fiat_p256_uint1 x6; uint64_t x7; fiat_p256_uint1 x8; uint64_t x9; fiat_p256_uint1 x10; uint64_t x11; fiat_p256_uint1 x12; uint64_t x13; fiat_p256_uint1 x14; uint64_t x15; fiat_p256_uint1 x16; uint64_t x17; fiat_p256_uint1 x18; uint64_t x19; uint64_t x20; uint64_t x21; uint64_t x22; fiat_p256_addcarryx_u64(&x1, &x2, 0x0, (arg1[0]), (arg2[0])); fiat_p256_addcarryx_u64(&x3, &x4, x2, (arg1[1]), (arg2[1])); fiat_p256_addcarryx_u64(&x5, &x6, x4, (arg1[2]), (arg2[2])); fiat_p256_addcarryx_u64(&x7, &x8, x6, (arg1[3]), (arg2[3])); fiat_p256_subborrowx_u64(&x9, &x10, 0x0, x1, UINT64_C(0xffffffffffffffff)); fiat_p256_subborrowx_u64(&x11, &x12, x10, x3, UINT32_C(0xffffffff)); fiat_p256_subborrowx_u64(&x13, &x14, x12, x5, 0x0); fiat_p256_subborrowx_u64(&x15, &x16, x14, x7, UINT64_C(0xffffffff00000001)); fiat_p256_subborrowx_u64(&x17, &x18, x16, x8, 0x0); fiat_p256_cmovznz_u64(&x19, x18, x9, x1); fiat_p256_cmovznz_u64(&x20, x18, x11, x3); fiat_p256_cmovznz_u64(&x21, x18, x13, x5); fiat_p256_cmovznz_u64(&x22, x18, x15, x7); out1[0] = x19; out1[1] = x20; out1[2] = x21; out1[3] = x22; } /* * The function fiat_p256_sub subtracts two field elements in the Montgomery domain. * * Preconditions: * 0 ≤ eval arg1 < m * 0 ≤ eval arg2 < m * Postconditions: * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) - eval (from_montgomery arg2)) mod m * 0 ≤ eval out1 < m * */ static FIAT_P256_FIAT_INLINE void fiat_p256_sub(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) { uint64_t x1; fiat_p256_uint1 x2; uint64_t x3; fiat_p256_uint1 x4; uint64_t x5; fiat_p256_uint1 x6; uint64_t x7; fiat_p256_uint1 x8; uint64_t x9; uint64_t x10; fiat_p256_uint1 x11; uint64_t x12; fiat_p256_uint1 x13; uint64_t x14; fiat_p256_uint1 x15; uint64_t x16; fiat_p256_uint1 x17; fiat_p256_subborrowx_u64(&x1, &x2, 0x0, (arg1[0]), (arg2[0])); fiat_p256_subborrowx_u64(&x3, &x4, x2, (arg1[1]), (arg2[1])); fiat_p256_subborrowx_u64(&x5, &x6, x4, (arg1[2]), (arg2[2])); fiat_p256_subborrowx_u64(&x7, &x8, x6, (arg1[3]), (arg2[3])); fiat_p256_cmovznz_u64(&x9, x8, 0x0, UINT64_C(0xffffffffffffffff)); fiat_p256_addcarryx_u64(&x10, &x11, 0x0, x1, x9); fiat_p256_addcarryx_u64(&x12, &x13, x11, x3, (x9 & UINT32_C(0xffffffff))); fiat_p256_addcarryx_u64(&x14, &x15, x13, x5, 0x0); fiat_p256_addcarryx_u64(&x16, &x17, x15, x7, (x9 & UINT64_C(0xffffffff00000001))); out1[0] = x10; out1[1] = x12; out1[2] = x14; out1[3] = x16; } /* * The function fiat_p256_opp negates a field element in the Montgomery domain. 
* * Preconditions: * 0 ≤ eval arg1 < m * Postconditions: * eval (from_montgomery out1) mod m = -eval (from_montgomery arg1) mod m * 0 ≤ eval out1 < m * */ static FIAT_P256_FIAT_INLINE void fiat_p256_opp(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1) { uint64_t x1; fiat_p256_uint1 x2; uint64_t x3; fiat_p256_uint1 x4; uint64_t x5; fiat_p256_uint1 x6; uint64_t x7; fiat_p256_uint1 x8; uint64_t x9; uint64_t x10; fiat_p256_uint1 x11; uint64_t x12; fiat_p256_uint1 x13; uint64_t x14; fiat_p256_uint1 x15; uint64_t x16; fiat_p256_uint1 x17; fiat_p256_subborrowx_u64(&x1, &x2, 0x0, 0x0, (arg1[0])); fiat_p256_subborrowx_u64(&x3, &x4, x2, 0x0, (arg1[1])); fiat_p256_subborrowx_u64(&x5, &x6, x4, 0x0, (arg1[2])); fiat_p256_subborrowx_u64(&x7, &x8, x6, 0x0, (arg1[3])); fiat_p256_cmovznz_u64(&x9, x8, 0x0, UINT64_C(0xffffffffffffffff)); fiat_p256_addcarryx_u64(&x10, &x11, 0x0, x1, x9); fiat_p256_addcarryx_u64(&x12, &x13, x11, x3, (x9 & UINT32_C(0xffffffff))); fiat_p256_addcarryx_u64(&x14, &x15, x13, x5, 0x0); fiat_p256_addcarryx_u64(&x16, &x17, x15, x7, (x9 & UINT64_C(0xffffffff00000001))); out1[0] = x10; out1[1] = x12; out1[2] = x14; out1[3] = x16; } /* * The function fiat_p256_from_montgomery translates a field element out of the Montgomery domain. * * Preconditions: * 0 ≤ eval arg1 < m * Postconditions: * eval out1 mod m = (eval arg1 * ((2^64)⁻¹ mod m)^4) mod m * 0 ≤ eval out1 < m * */ static FIAT_P256_FIAT_INLINE void fiat_p256_from_montgomery(fiat_p256_non_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1) { uint64_t x1; uint64_t x2; uint64_t x3; uint64_t x4; uint64_t x5; uint64_t x6; uint64_t x7; uint64_t x8; fiat_p256_uint1 x9; uint64_t x10; fiat_p256_uint1 x11; uint64_t x12; fiat_p256_uint1 x13; uint64_t x14; fiat_p256_uint1 x15; uint64_t x16; uint64_t x17; uint64_t x18; uint64_t x19; uint64_t x20; uint64_t x21; uint64_t x22; fiat_p256_uint1 x23; uint64_t x24; fiat_p256_uint1 x25; uint64_t x26; fiat_p256_uint1 x27; uint64_t x28; fiat_p256_uint1 x29; uint64_t x30; fiat_p256_uint1 x31; uint64_t x32; fiat_p256_uint1 x33; uint64_t x34; fiat_p256_uint1 x35; uint64_t x36; fiat_p256_uint1 x37; uint64_t x38; uint64_t x39; uint64_t x40; uint64_t x41; uint64_t x42; uint64_t x43; uint64_t x44; fiat_p256_uint1 x45; uint64_t x46; fiat_p256_uint1 x47; uint64_t x48; fiat_p256_uint1 x49; uint64_t x50; fiat_p256_uint1 x51; uint64_t x52; fiat_p256_uint1 x53; uint64_t x54; fiat_p256_uint1 x55; uint64_t x56; fiat_p256_uint1 x57; uint64_t x58; fiat_p256_uint1 x59; uint64_t x60; uint64_t x61; uint64_t x62; uint64_t x63; uint64_t x64; uint64_t x65; uint64_t x66; fiat_p256_uint1 x67; uint64_t x68; fiat_p256_uint1 x69; uint64_t x70; fiat_p256_uint1 x71; uint64_t x72; fiat_p256_uint1 x73; uint64_t x74; fiat_p256_uint1 x75; uint64_t x76; uint64_t x77; fiat_p256_uint1 x78; uint64_t x79; fiat_p256_uint1 x80; uint64_t x81; fiat_p256_uint1 x82; uint64_t x83; fiat_p256_uint1 x84; uint64_t x85; fiat_p256_uint1 x86; uint64_t x87; uint64_t x88; uint64_t x89; uint64_t x90; x1 = (arg1[0]); fiat_p256_mulx_u64(&x2, &x3, x1, UINT64_C(0xffffffff00000001)); fiat_p256_mulx_u64(&x4, &x5, x1, UINT32_C(0xffffffff)); fiat_p256_mulx_u64(&x6, &x7, x1, UINT64_C(0xffffffffffffffff)); fiat_p256_addcarryx_u64(&x8, &x9, 0x0, x7, x4); fiat_p256_addcarryx_u64(&x10, &x11, 0x0, x1, x6); fiat_p256_addcarryx_u64(&x12, &x13, x11, 0x0, x8); fiat_p256_addcarryx_u64(&x14, &x15, 0x0, x12, (arg1[1])); fiat_p256_mulx_u64(&x16, &x17, x14, UINT64_C(0xffffffff00000001)); 
fiat_p256_mulx_u64(&x18, &x19, x14, UINT32_C(0xffffffff)); fiat_p256_mulx_u64(&x20, &x21, x14, UINT64_C(0xffffffffffffffff)); fiat_p256_addcarryx_u64(&x22, &x23, 0x0, x21, x18); fiat_p256_addcarryx_u64(&x24, &x25, 0x0, x14, x20); fiat_p256_addcarryx_u64(&x26, &x27, x25, (x15 + (x13 + (x9 + x5))), x22); fiat_p256_addcarryx_u64(&x28, &x29, x27, x2, (x23 + x19)); fiat_p256_addcarryx_u64(&x30, &x31, x29, x3, x16); fiat_p256_addcarryx_u64(&x32, &x33, 0x0, x26, (arg1[2])); fiat_p256_addcarryx_u64(&x34, &x35, x33, x28, 0x0); fiat_p256_addcarryx_u64(&x36, &x37, x35, x30, 0x0); fiat_p256_mulx_u64(&x38, &x39, x32, UINT64_C(0xffffffff00000001)); fiat_p256_mulx_u64(&x40, &x41, x32, UINT32_C(0xffffffff)); fiat_p256_mulx_u64(&x42, &x43, x32, UINT64_C(0xffffffffffffffff)); fiat_p256_addcarryx_u64(&x44, &x45, 0x0, x43, x40); fiat_p256_addcarryx_u64(&x46, &x47, 0x0, x32, x42); fiat_p256_addcarryx_u64(&x48, &x49, x47, x34, x44); fiat_p256_addcarryx_u64(&x50, &x51, x49, x36, (x45 + x41)); fiat_p256_addcarryx_u64(&x52, &x53, x51, (x37 + (x31 + x17)), x38); fiat_p256_addcarryx_u64(&x54, &x55, 0x0, x48, (arg1[3])); fiat_p256_addcarryx_u64(&x56, &x57, x55, x50, 0x0); fiat_p256_addcarryx_u64(&x58, &x59, x57, x52, 0x0); fiat_p256_mulx_u64(&x60, &x61, x54, UINT64_C(0xffffffff00000001)); fiat_p256_mulx_u64(&x62, &x63, x54, UINT32_C(0xffffffff)); fiat_p256_mulx_u64(&x64, &x65, x54, UINT64_C(0xffffffffffffffff)); fiat_p256_addcarryx_u64(&x66, &x67, 0x0, x65, x62); fiat_p256_addcarryx_u64(&x68, &x69, 0x0, x54, x64); fiat_p256_addcarryx_u64(&x70, &x71, x69, x56, x66); fiat_p256_addcarryx_u64(&x72, &x73, x71, x58, (x67 + x63)); fiat_p256_addcarryx_u64(&x74, &x75, x73, (x59 + (x53 + x39)), x60); x76 = (x75 + x61); fiat_p256_subborrowx_u64(&x77, &x78, 0x0, x70, UINT64_C(0xffffffffffffffff)); fiat_p256_subborrowx_u64(&x79, &x80, x78, x72, UINT32_C(0xffffffff)); fiat_p256_subborrowx_u64(&x81, &x82, x80, x74, 0x0); fiat_p256_subborrowx_u64(&x83, &x84, x82, x76, UINT64_C(0xffffffff00000001)); fiat_p256_subborrowx_u64(&x85, &x86, x84, 0x0, 0x0); fiat_p256_cmovznz_u64(&x87, x86, x77, x70); fiat_p256_cmovznz_u64(&x88, x86, x79, x72); fiat_p256_cmovznz_u64(&x89, x86, x81, x74); fiat_p256_cmovznz_u64(&x90, x86, x83, x76); out1[0] = x87; out1[1] = x88; out1[2] = x89; out1[3] = x90; } /* * The function fiat_p256_to_montgomery translates a field element into the Montgomery domain. 
* * Preconditions: * 0 ≤ eval arg1 < m * Postconditions: * eval (from_montgomery out1) mod m = eval arg1 mod m * 0 ≤ eval out1 < m * */ static FIAT_P256_FIAT_INLINE void fiat_p256_to_montgomery(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_non_montgomery_domain_field_element arg1) { uint64_t x1; uint64_t x2; uint64_t x3; uint64_t x4; uint64_t x5; uint64_t x6; uint64_t x7; uint64_t x8; uint64_t x9; uint64_t x10; uint64_t x11; uint64_t x12; uint64_t x13; fiat_p256_uint1 x14; uint64_t x15; fiat_p256_uint1 x16; uint64_t x17; fiat_p256_uint1 x18; uint64_t x19; uint64_t x20; uint64_t x21; uint64_t x22; uint64_t x23; uint64_t x24; uint64_t x25; fiat_p256_uint1 x26; uint64_t x27; fiat_p256_uint1 x28; uint64_t x29; fiat_p256_uint1 x30; uint64_t x31; fiat_p256_uint1 x32; uint64_t x33; fiat_p256_uint1 x34; uint64_t x35; fiat_p256_uint1 x36; uint64_t x37; uint64_t x38; uint64_t x39; uint64_t x40; uint64_t x41; uint64_t x42; uint64_t x43; uint64_t x44; uint64_t x45; fiat_p256_uint1 x46; uint64_t x47; fiat_p256_uint1 x48; uint64_t x49; fiat_p256_uint1 x50; uint64_t x51; fiat_p256_uint1 x52; uint64_t x53; fiat_p256_uint1 x54; uint64_t x55; fiat_p256_uint1 x56; uint64_t x57; fiat_p256_uint1 x58; uint64_t x59; uint64_t x60; uint64_t x61; uint64_t x62; uint64_t x63; uint64_t x64; uint64_t x65; fiat_p256_uint1 x66; uint64_t x67; fiat_p256_uint1 x68; uint64_t x69; fiat_p256_uint1 x70; uint64_t x71; fiat_p256_uint1 x72; uint64_t x73; fiat_p256_uint1 x74; uint64_t x75; fiat_p256_uint1 x76; uint64_t x77; uint64_t x78; uint64_t x79; uint64_t x80; uint64_t x81; uint64_t x82; uint64_t x83; uint64_t x84; uint64_t x85; fiat_p256_uint1 x86; uint64_t x87; fiat_p256_uint1 x88; uint64_t x89; fiat_p256_uint1 x90; uint64_t x91; fiat_p256_uint1 x92; uint64_t x93; fiat_p256_uint1 x94; uint64_t x95; fiat_p256_uint1 x96; uint64_t x97; fiat_p256_uint1 x98; uint64_t x99; uint64_t x100; uint64_t x101; uint64_t x102; uint64_t x103; uint64_t x104; uint64_t x105; fiat_p256_uint1 x106; uint64_t x107; fiat_p256_uint1 x108; uint64_t x109; fiat_p256_uint1 x110; uint64_t x111; fiat_p256_uint1 x112; uint64_t x113; fiat_p256_uint1 x114; uint64_t x115; fiat_p256_uint1 x116; uint64_t x117; uint64_t x118; uint64_t x119; uint64_t x120; uint64_t x121; uint64_t x122; uint64_t x123; uint64_t x124; uint64_t x125; fiat_p256_uint1 x126; uint64_t x127; fiat_p256_uint1 x128; uint64_t x129; fiat_p256_uint1 x130; uint64_t x131; fiat_p256_uint1 x132; uint64_t x133; fiat_p256_uint1 x134; uint64_t x135; fiat_p256_uint1 x136; uint64_t x137; fiat_p256_uint1 x138; uint64_t x139; uint64_t x140; uint64_t x141; uint64_t x142; uint64_t x143; uint64_t x144; uint64_t x145; fiat_p256_uint1 x146; uint64_t x147; fiat_p256_uint1 x148; uint64_t x149; fiat_p256_uint1 x150; uint64_t x151; fiat_p256_uint1 x152; uint64_t x153; fiat_p256_uint1 x154; uint64_t x155; fiat_p256_uint1 x156; uint64_t x157; fiat_p256_uint1 x158; uint64_t x159; fiat_p256_uint1 x160; uint64_t x161; fiat_p256_uint1 x162; uint64_t x163; fiat_p256_uint1 x164; uint64_t x165; fiat_p256_uint1 x166; uint64_t x167; uint64_t x168; uint64_t x169; uint64_t x170; x1 = (arg1[1]); x2 = (arg1[2]); x3 = (arg1[3]); x4 = (arg1[0]); fiat_p256_mulx_u64(&x5, &x6, x4, UINT64_C(0x4fffffffd)); fiat_p256_mulx_u64(&x7, &x8, x4, UINT64_C(0xfffffffffffffffe)); fiat_p256_mulx_u64(&x9, &x10, x4, UINT64_C(0xfffffffbffffffff)); fiat_p256_mulx_u64(&x11, &x12, x4, 0x3); fiat_p256_addcarryx_u64(&x13, &x14, 0x0, x12, x9); fiat_p256_addcarryx_u64(&x15, &x16, x14, x10, x7); fiat_p256_addcarryx_u64(&x17, &x18, x16, x8, 
x5); fiat_p256_mulx_u64(&x19, &x20, x11, UINT64_C(0xffffffff00000001)); fiat_p256_mulx_u64(&x21, &x22, x11, UINT32_C(0xffffffff)); fiat_p256_mulx_u64(&x23, &x24, x11, UINT64_C(0xffffffffffffffff)); fiat_p256_addcarryx_u64(&x25, &x26, 0x0, x24, x21); fiat_p256_addcarryx_u64(&x27, &x28, 0x0, x11, x23); fiat_p256_addcarryx_u64(&x29, &x30, x28, x13, x25); fiat_p256_addcarryx_u64(&x31, &x32, x30, x15, (x26 + x22)); fiat_p256_addcarryx_u64(&x33, &x34, x32, x17, x19); fiat_p256_addcarryx_u64(&x35, &x36, x34, (x18 + x6), x20); fiat_p256_mulx_u64(&x37, &x38, x1, UINT64_C(0x4fffffffd)); fiat_p256_mulx_u64(&x39, &x40, x1, UINT64_C(0xfffffffffffffffe)); fiat_p256_mulx_u64(&x41, &x42, x1, UINT64_C(0xfffffffbffffffff)); fiat_p256_mulx_u64(&x43, &x44, x1, 0x3); fiat_p256_addcarryx_u64(&x45, &x46, 0x0, x44, x41); fiat_p256_addcarryx_u64(&x47, &x48, x46, x42, x39); fiat_p256_addcarryx_u64(&x49, &x50, x48, x40, x37); fiat_p256_addcarryx_u64(&x51, &x52, 0x0, x29, x43); fiat_p256_addcarryx_u64(&x53, &x54, x52, x31, x45); fiat_p256_addcarryx_u64(&x55, &x56, x54, x33, x47); fiat_p256_addcarryx_u64(&x57, &x58, x56, x35, x49); fiat_p256_mulx_u64(&x59, &x60, x51, UINT64_C(0xffffffff00000001)); fiat_p256_mulx_u64(&x61, &x62, x51, UINT32_C(0xffffffff)); fiat_p256_mulx_u64(&x63, &x64, x51, UINT64_C(0xffffffffffffffff)); fiat_p256_addcarryx_u64(&x65, &x66, 0x0, x64, x61); fiat_p256_addcarryx_u64(&x67, &x68, 0x0, x51, x63); fiat_p256_addcarryx_u64(&x69, &x70, x68, x53, x65); fiat_p256_addcarryx_u64(&x71, &x72, x70, x55, (x66 + x62)); fiat_p256_addcarryx_u64(&x73, &x74, x72, x57, x59); fiat_p256_addcarryx_u64(&x75, &x76, x74, (((uint64_t)x58 + x36) + (x50 + x38)), x60); fiat_p256_mulx_u64(&x77, &x78, x2, UINT64_C(0x4fffffffd)); fiat_p256_mulx_u64(&x79, &x80, x2, UINT64_C(0xfffffffffffffffe)); fiat_p256_mulx_u64(&x81, &x82, x2, UINT64_C(0xfffffffbffffffff)); fiat_p256_mulx_u64(&x83, &x84, x2, 0x3); fiat_p256_addcarryx_u64(&x85, &x86, 0x0, x84, x81); fiat_p256_addcarryx_u64(&x87, &x88, x86, x82, x79); fiat_p256_addcarryx_u64(&x89, &x90, x88, x80, x77); fiat_p256_addcarryx_u64(&x91, &x92, 0x0, x69, x83); fiat_p256_addcarryx_u64(&x93, &x94, x92, x71, x85); fiat_p256_addcarryx_u64(&x95, &x96, x94, x73, x87); fiat_p256_addcarryx_u64(&x97, &x98, x96, x75, x89); fiat_p256_mulx_u64(&x99, &x100, x91, UINT64_C(0xffffffff00000001)); fiat_p256_mulx_u64(&x101, &x102, x91, UINT32_C(0xffffffff)); fiat_p256_mulx_u64(&x103, &x104, x91, UINT64_C(0xffffffffffffffff)); fiat_p256_addcarryx_u64(&x105, &x106, 0x0, x104, x101); fiat_p256_addcarryx_u64(&x107, &x108, 0x0, x91, x103); fiat_p256_addcarryx_u64(&x109, &x110, x108, x93, x105); fiat_p256_addcarryx_u64(&x111, &x112, x110, x95, (x106 + x102)); fiat_p256_addcarryx_u64(&x113, &x114, x112, x97, x99); fiat_p256_addcarryx_u64(&x115, &x116, x114, (((uint64_t)x98 + x76) + (x90 + x78)), x100); fiat_p256_mulx_u64(&x117, &x118, x3, UINT64_C(0x4fffffffd)); fiat_p256_mulx_u64(&x119, &x120, x3, UINT64_C(0xfffffffffffffffe)); fiat_p256_mulx_u64(&x121, &x122, x3, UINT64_C(0xfffffffbffffffff)); fiat_p256_mulx_u64(&x123, &x124, x3, 0x3); fiat_p256_addcarryx_u64(&x125, &x126, 0x0, x124, x121); fiat_p256_addcarryx_u64(&x127, &x128, x126, x122, x119); fiat_p256_addcarryx_u64(&x129, &x130, x128, x120, x117); fiat_p256_addcarryx_u64(&x131, &x132, 0x0, x109, x123); fiat_p256_addcarryx_u64(&x133, &x134, x132, x111, x125); fiat_p256_addcarryx_u64(&x135, &x136, x134, x113, x127); fiat_p256_addcarryx_u64(&x137, &x138, x136, x115, x129); fiat_p256_mulx_u64(&x139, &x140, x131, UINT64_C(0xffffffff00000001)); 
fiat_p256_mulx_u64(&x141, &x142, x131, UINT32_C(0xffffffff)); fiat_p256_mulx_u64(&x143, &x144, x131, UINT64_C(0xffffffffffffffff)); fiat_p256_addcarryx_u64(&x145, &x146, 0x0, x144, x141); fiat_p256_addcarryx_u64(&x147, &x148, 0x0, x131, x143); fiat_p256_addcarryx_u64(&x149, &x150, x148, x133, x145); fiat_p256_addcarryx_u64(&x151, &x152, x150, x135, (x146 + x142)); fiat_p256_addcarryx_u64(&x153, &x154, x152, x137, x139); fiat_p256_addcarryx_u64(&x155, &x156, x154, (((uint64_t)x138 + x116) + (x130 + x118)), x140); fiat_p256_subborrowx_u64(&x157, &x158, 0x0, x149, UINT64_C(0xffffffffffffffff)); fiat_p256_subborrowx_u64(&x159, &x160, x158, x151, UINT32_C(0xffffffff)); fiat_p256_subborrowx_u64(&x161, &x162, x160, x153, 0x0); fiat_p256_subborrowx_u64(&x163, &x164, x162, x155, UINT64_C(0xffffffff00000001)); fiat_p256_subborrowx_u64(&x165, &x166, x164, x156, 0x0); fiat_p256_cmovznz_u64(&x167, x166, x157, x149); fiat_p256_cmovznz_u64(&x168, x166, x159, x151); fiat_p256_cmovznz_u64(&x169, x166, x161, x153); fiat_p256_cmovznz_u64(&x170, x166, x163, x155); out1[0] = x167; out1[1] = x168; out1[2] = x169; out1[3] = x170; } /* * The function fiat_p256_nonzero outputs a single non-zero word if the input is non-zero and zero otherwise. * * Preconditions: * 0 ≤ eval arg1 < m * Postconditions: * out1 = 0 ↔ eval (from_montgomery arg1) mod m = 0 * * Input Bounds: * arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] * Output Bounds: * out1: [0x0 ~> 0xffffffffffffffff] */ static FIAT_P256_FIAT_INLINE void fiat_p256_nonzero(uint64_t* out1, const uint64_t arg1[4]) { uint64_t x1; x1 = ((arg1[0]) | ((arg1[1]) | ((arg1[2]) | (arg1[3])))); *out1 = x1; } /* * The function fiat_p256_selectznz is a multi-limb conditional select. * * Postconditions: * out1 = (if arg1 = 0 then arg2 else arg3) * * Input Bounds: * arg1: [0x0 ~> 0x1] * arg2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] * arg3: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] * Output Bounds: * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] */ static FIAT_P256_FIAT_INLINE void fiat_p256_selectznz(uint64_t out1[4], fiat_p256_uint1 arg1, const uint64_t arg2[4], const uint64_t arg3[4]) { uint64_t x1; uint64_t x2; uint64_t x3; uint64_t x4; fiat_p256_cmovznz_u64(&x1, arg1, (arg2[0]), (arg3[0])); fiat_p256_cmovznz_u64(&x2, arg1, (arg2[1]), (arg3[1])); fiat_p256_cmovznz_u64(&x3, arg1, (arg2[2]), (arg3[2])); fiat_p256_cmovznz_u64(&x4, arg1, (arg2[3]), (arg3[3])); out1[0] = x1; out1[1] = x2; out1[2] = x3; out1[3] = x4; } /* * The function fiat_p256_to_bytes serializes a field element NOT in the Montgomery domain to bytes in little-endian order. 
* * Preconditions: * 0 ≤ eval arg1 < m * Postconditions: * out1 = map (λ x, ⌊((eval arg1 mod m) mod 2^(8 * (x + 1))) / 2^(8 * x)⌋) [0..31] * * Input Bounds: * arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] * Output Bounds: * out1: [[0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff]] */ static FIAT_P256_FIAT_INLINE void fiat_p256_to_bytes(uint8_t out1[32], const uint64_t arg1[4]) { uint64_t x1; uint64_t x2; uint64_t x3; uint64_t x4; uint8_t x5; uint64_t x6; uint8_t x7; uint64_t x8; uint8_t x9; uint64_t x10; uint8_t x11; uint64_t x12; uint8_t x13; uint64_t x14; uint8_t x15; uint64_t x16; uint8_t x17; uint8_t x18; uint8_t x19; uint64_t x20; uint8_t x21; uint64_t x22; uint8_t x23; uint64_t x24; uint8_t x25; uint64_t x26; uint8_t x27; uint64_t x28; uint8_t x29; uint64_t x30; uint8_t x31; uint8_t x32; uint8_t x33; uint64_t x34; uint8_t x35; uint64_t x36; uint8_t x37; uint64_t x38; uint8_t x39; uint64_t x40; uint8_t x41; uint64_t x42; uint8_t x43; uint64_t x44; uint8_t x45; uint8_t x46; uint8_t x47; uint64_t x48; uint8_t x49; uint64_t x50; uint8_t x51; uint64_t x52; uint8_t x53; uint64_t x54; uint8_t x55; uint64_t x56; uint8_t x57; uint64_t x58; uint8_t x59; uint8_t x60; x1 = (arg1[3]); x2 = (arg1[2]); x3 = (arg1[1]); x4 = (arg1[0]); x5 = (uint8_t)(x4 & UINT8_C(0xff)); x6 = (x4 >> 8); x7 = (uint8_t)(x6 & UINT8_C(0xff)); x8 = (x6 >> 8); x9 = (uint8_t)(x8 & UINT8_C(0xff)); x10 = (x8 >> 8); x11 = (uint8_t)(x10 & UINT8_C(0xff)); x12 = (x10 >> 8); x13 = (uint8_t)(x12 & UINT8_C(0xff)); x14 = (x12 >> 8); x15 = (uint8_t)(x14 & UINT8_C(0xff)); x16 = (x14 >> 8); x17 = (uint8_t)(x16 & UINT8_C(0xff)); x18 = (uint8_t)(x16 >> 8); x19 = (uint8_t)(x3 & UINT8_C(0xff)); x20 = (x3 >> 8); x21 = (uint8_t)(x20 & UINT8_C(0xff)); x22 = (x20 >> 8); x23 = (uint8_t)(x22 & UINT8_C(0xff)); x24 = (x22 >> 8); x25 = (uint8_t)(x24 & UINT8_C(0xff)); x26 = (x24 >> 8); x27 = (uint8_t)(x26 & UINT8_C(0xff)); x28 = (x26 >> 8); x29 = (uint8_t)(x28 & UINT8_C(0xff)); x30 = (x28 >> 8); x31 = (uint8_t)(x30 & UINT8_C(0xff)); x32 = (uint8_t)(x30 >> 8); x33 = (uint8_t)(x2 & UINT8_C(0xff)); x34 = (x2 >> 8); x35 = (uint8_t)(x34 & UINT8_C(0xff)); x36 = (x34 >> 8); x37 = (uint8_t)(x36 & UINT8_C(0xff)); x38 = (x36 >> 8); x39 = (uint8_t)(x38 & UINT8_C(0xff)); x40 = (x38 >> 8); x41 = (uint8_t)(x40 & UINT8_C(0xff)); x42 = (x40 >> 8); x43 = (uint8_t)(x42 & UINT8_C(0xff)); x44 = (x42 >> 8); x45 = (uint8_t)(x44 & UINT8_C(0xff)); x46 = (uint8_t)(x44 >> 8); x47 = (uint8_t)(x1 & UINT8_C(0xff)); x48 = (x1 >> 8); x49 = (uint8_t)(x48 & UINT8_C(0xff)); x50 = (x48 >> 8); x51 = (uint8_t)(x50 & UINT8_C(0xff)); x52 = (x50 >> 8); x53 = (uint8_t)(x52 & UINT8_C(0xff)); x54 = (x52 >> 8); x55 = (uint8_t)(x54 & UINT8_C(0xff)); x56 = (x54 >> 8); x57 = (uint8_t)(x56 & UINT8_C(0xff)); x58 = (x56 >> 8); x59 = (uint8_t)(x58 & UINT8_C(0xff)); x60 = (uint8_t)(x58 >> 8); out1[0] = x5; out1[1] = x7; out1[2] = x9; out1[3] = x11; out1[4] = x13; out1[5] = x15; out1[6] = x17; out1[7] = x18; out1[8] = x19; out1[9] = x21; out1[10] = x23; out1[11] = x25; out1[12] = x27; out1[13] = x29; 
out1[14] = x31; out1[15] = x32; out1[16] = x33; out1[17] = x35; out1[18] = x37; out1[19] = x39; out1[20] = x41; out1[21] = x43; out1[22] = x45; out1[23] = x46; out1[24] = x47; out1[25] = x49; out1[26] = x51; out1[27] = x53; out1[28] = x55; out1[29] = x57; out1[30] = x59; out1[31] = x60; } /* * The function fiat_p256_from_bytes deserializes a field element NOT in the Montgomery domain from bytes in little-endian order. * * Preconditions: * 0 ≤ bytes_eval arg1 < m * Postconditions: * eval out1 mod m = bytes_eval arg1 mod m * 0 ≤ eval out1 < m * * Input Bounds: * arg1: [[0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff]] * Output Bounds: * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] */ static FIAT_P256_FIAT_INLINE void fiat_p256_from_bytes(uint64_t out1[4], const uint8_t arg1[32]) { uint64_t x1; uint64_t x2; uint64_t x3; uint64_t x4; uint64_t x5; uint64_t x6; uint64_t x7; uint8_t x8; uint64_t x9; uint64_t x10; uint64_t x11; uint64_t x12; uint64_t x13; uint64_t x14; uint64_t x15; uint8_t x16; uint64_t x17; uint64_t x18; uint64_t x19; uint64_t x20; uint64_t x21; uint64_t x22; uint64_t x23; uint8_t x24; uint64_t x25; uint64_t x26; uint64_t x27; uint64_t x28; uint64_t x29; uint64_t x30; uint64_t x31; uint8_t x32; uint64_t x33; uint64_t x34; uint64_t x35; uint64_t x36; uint64_t x37; uint64_t x38; uint64_t x39; uint64_t x40; uint64_t x41; uint64_t x42; uint64_t x43; uint64_t x44; uint64_t x45; uint64_t x46; uint64_t x47; uint64_t x48; uint64_t x49; uint64_t x50; uint64_t x51; uint64_t x52; uint64_t x53; uint64_t x54; uint64_t x55; uint64_t x56; uint64_t x57; uint64_t x58; uint64_t x59; uint64_t x60; x1 = ((uint64_t)(arg1[31]) << 56); x2 = ((uint64_t)(arg1[30]) << 48); x3 = ((uint64_t)(arg1[29]) << 40); x4 = ((uint64_t)(arg1[28]) << 32); x5 = ((uint64_t)(arg1[27]) << 24); x6 = ((uint64_t)(arg1[26]) << 16); x7 = ((uint64_t)(arg1[25]) << 8); x8 = (arg1[24]); x9 = ((uint64_t)(arg1[23]) << 56); x10 = ((uint64_t)(arg1[22]) << 48); x11 = ((uint64_t)(arg1[21]) << 40); x12 = ((uint64_t)(arg1[20]) << 32); x13 = ((uint64_t)(arg1[19]) << 24); x14 = ((uint64_t)(arg1[18]) << 16); x15 = ((uint64_t)(arg1[17]) << 8); x16 = (arg1[16]); x17 = ((uint64_t)(arg1[15]) << 56); x18 = ((uint64_t)(arg1[14]) << 48); x19 = ((uint64_t)(arg1[13]) << 40); x20 = ((uint64_t)(arg1[12]) << 32); x21 = ((uint64_t)(arg1[11]) << 24); x22 = ((uint64_t)(arg1[10]) << 16); x23 = ((uint64_t)(arg1[9]) << 8); x24 = (arg1[8]); x25 = ((uint64_t)(arg1[7]) << 56); x26 = ((uint64_t)(arg1[6]) << 48); x27 = ((uint64_t)(arg1[5]) << 40); x28 = ((uint64_t)(arg1[4]) << 32); x29 = ((uint64_t)(arg1[3]) << 24); x30 = ((uint64_t)(arg1[2]) << 16); x31 = ((uint64_t)(arg1[1]) << 8); x32 = (arg1[0]); x33 = (x31 + (uint64_t)x32); x34 = (x30 + x33); x35 = (x29 + x34); x36 = (x28 + x35); x37 = (x27 + x36); x38 = (x26 + x37); x39 = (x25 + x38); x40 = (x23 + (uint64_t)x24); x41 = (x22 + x40); x42 = (x21 + x41); x43 = (x20 + x42); x44 = (x19 + x43); x45 = (x18 + x44); x46 = (x17 + x45); x47 = (x15 + (uint64_t)x16); x48 = (x14 + x47); x49 = (x13 + x48); x50 = (x12 + x49); 
x51 = (x11 + x50); x52 = (x10 + x51); x53 = (x9 + x52); x54 = (x7 + (uint64_t)x8); x55 = (x6 + x54); x56 = (x5 + x55); x57 = (x4 + x56); x58 = (x3 + x57); x59 = (x2 + x58); x60 = (x1 + x59); out1[0] = x39; out1[1] = x46; out1[2] = x53; out1[3] = x60; } /* * The function fiat_p256_set_one returns the field element one in the Montgomery domain. * * Postconditions: * eval (from_montgomery out1) mod m = 1 mod m * 0 ≤ eval out1 < m * */ static FIAT_P256_FIAT_INLINE void fiat_p256_set_one(fiat_p256_montgomery_domain_field_element out1) { out1[0] = 0x1; out1[1] = UINT64_C(0xffffffff00000000); out1[2] = UINT64_C(0xffffffffffffffff); out1[3] = UINT32_C(0xfffffffe); } /* * The function fiat_p256_msat returns the saturated representation of the prime modulus. * * Postconditions: * twos_complement_eval out1 = m * 0 ≤ eval out1 < m * * Output Bounds: * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] */ static FIAT_P256_FIAT_INLINE void fiat_p256_msat(uint64_t out1[5]) { out1[0] = UINT64_C(0xffffffffffffffff); out1[1] = UINT32_C(0xffffffff); out1[2] = 0x0; out1[3] = UINT64_C(0xffffffff00000001); out1[4] = 0x0; } /* * The function fiat_p256_divstep computes a divstep. * * Preconditions: * 0 ≤ eval arg4 < m * 0 ≤ eval arg5 < m * Postconditions: * out1 = (if 0 < arg1 ∧ (twos_complement_eval arg3) is odd then 1 - arg1 else 1 + arg1) * twos_complement_eval out2 = (if 0 < arg1 ∧ (twos_complement_eval arg3) is odd then twos_complement_eval arg3 else twos_complement_eval arg2) * twos_complement_eval out3 = (if 0 < arg1 ∧ (twos_complement_eval arg3) is odd then ⌊(twos_complement_eval arg3 - twos_complement_eval arg2) / 2⌋ else ⌊(twos_complement_eval arg3 + (twos_complement_eval arg3 mod 2) * twos_complement_eval arg2) / 2⌋) * eval (from_montgomery out4) mod m = (if 0 < arg1 ∧ (twos_complement_eval arg3) is odd then (2 * eval (from_montgomery arg5)) mod m else (2 * eval (from_montgomery arg4)) mod m) * eval (from_montgomery out5) mod m = (if 0 < arg1 ∧ (twos_complement_eval arg3) is odd then (eval (from_montgomery arg4) - eval (from_montgomery arg4)) mod m else (eval (from_montgomery arg5) + (twos_complement_eval arg3 mod 2) * eval (from_montgomery arg4)) mod m) * 0 ≤ eval out5 < m * 0 ≤ eval out5 < m * 0 ≤ eval out2 < m * 0 ≤ eval out3 < m * * Input Bounds: * arg1: [0x0 ~> 0xffffffffffffffff] * arg2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] * arg3: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] * arg4: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] * arg5: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] * Output Bounds: * out1: [0x0 ~> 0xffffffffffffffff] * out2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] * out3: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] * out4: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] * out5: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], 
[0x0 ~> 0xffffffffffffffff]] */ static FIAT_P256_FIAT_INLINE void fiat_p256_divstep(uint64_t* out1, uint64_t out2[5], uint64_t out3[5], uint64_t out4[4], uint64_t out5[4], uint64_t arg1, const uint64_t arg2[5], const uint64_t arg3[5], const uint64_t arg4[4], const uint64_t arg5[4]) { uint64_t x1; fiat_p256_uint1 x2; fiat_p256_uint1 x3; uint64_t x4; fiat_p256_uint1 x5; uint64_t x6; uint64_t x7; uint64_t x8; uint64_t x9; uint64_t x10; uint64_t x11; uint64_t x12; fiat_p256_uint1 x13; uint64_t x14; fiat_p256_uint1 x15; uint64_t x16; fiat_p256_uint1 x17; uint64_t x18; fiat_p256_uint1 x19; uint64_t x20; fiat_p256_uint1 x21; uint64_t x22; uint64_t x23; uint64_t x24; uint64_t x25; uint64_t x26; uint64_t x27; uint64_t x28; uint64_t x29; uint64_t x30; uint64_t x31; fiat_p256_uint1 x32; uint64_t x33; fiat_p256_uint1 x34; uint64_t x35; fiat_p256_uint1 x36; uint64_t x37; fiat_p256_uint1 x38; uint64_t x39; fiat_p256_uint1 x40; uint64_t x41; fiat_p256_uint1 x42; uint64_t x43; fiat_p256_uint1 x44; uint64_t x45; fiat_p256_uint1 x46; uint64_t x47; fiat_p256_uint1 x48; uint64_t x49; uint64_t x50; uint64_t x51; uint64_t x52; uint64_t x53; fiat_p256_uint1 x54; uint64_t x55; fiat_p256_uint1 x56; uint64_t x57; fiat_p256_uint1 x58; uint64_t x59; fiat_p256_uint1 x60; uint64_t x61; uint64_t x62; fiat_p256_uint1 x63; uint64_t x64; fiat_p256_uint1 x65; uint64_t x66; fiat_p256_uint1 x67; uint64_t x68; fiat_p256_uint1 x69; uint64_t x70; uint64_t x71; uint64_t x72; uint64_t x73; fiat_p256_uint1 x74; uint64_t x75; uint64_t x76; uint64_t x77; uint64_t x78; uint64_t x79; uint64_t x80; fiat_p256_uint1 x81; uint64_t x82; fiat_p256_uint1 x83; uint64_t x84; fiat_p256_uint1 x85; uint64_t x86; fiat_p256_uint1 x87; uint64_t x88; fiat_p256_uint1 x89; uint64_t x90; uint64_t x91; uint64_t x92; uint64_t x93; uint64_t x94; fiat_p256_uint1 x95; uint64_t x96; fiat_p256_uint1 x97; uint64_t x98; fiat_p256_uint1 x99; uint64_t x100; fiat_p256_uint1 x101; uint64_t x102; fiat_p256_uint1 x103; uint64_t x104; fiat_p256_uint1 x105; uint64_t x106; fiat_p256_uint1 x107; uint64_t x108; fiat_p256_uint1 x109; uint64_t x110; fiat_p256_uint1 x111; uint64_t x112; fiat_p256_uint1 x113; uint64_t x114; uint64_t x115; uint64_t x116; uint64_t x117; uint64_t x118; uint64_t x119; uint64_t x120; uint64_t x121; uint64_t x122; uint64_t x123; uint64_t x124; uint64_t x125; uint64_t x126; fiat_p256_addcarryx_u64(&x1, &x2, 0x0, (~arg1), 0x1); x3 = (fiat_p256_uint1)((fiat_p256_uint1)(x1 >> 63) & (fiat_p256_uint1)((arg3[0]) & 0x1)); fiat_p256_addcarryx_u64(&x4, &x5, 0x0, (~arg1), 0x1); fiat_p256_cmovznz_u64(&x6, x3, arg1, x4); fiat_p256_cmovznz_u64(&x7, x3, (arg2[0]), (arg3[0])); fiat_p256_cmovznz_u64(&x8, x3, (arg2[1]), (arg3[1])); fiat_p256_cmovznz_u64(&x9, x3, (arg2[2]), (arg3[2])); fiat_p256_cmovznz_u64(&x10, x3, (arg2[3]), (arg3[3])); fiat_p256_cmovznz_u64(&x11, x3, (arg2[4]), (arg3[4])); fiat_p256_addcarryx_u64(&x12, &x13, 0x0, 0x1, (~(arg2[0]))); fiat_p256_addcarryx_u64(&x14, &x15, x13, 0x0, (~(arg2[1]))); fiat_p256_addcarryx_u64(&x16, &x17, x15, 0x0, (~(arg2[2]))); fiat_p256_addcarryx_u64(&x18, &x19, x17, 0x0, (~(arg2[3]))); fiat_p256_addcarryx_u64(&x20, &x21, x19, 0x0, (~(arg2[4]))); fiat_p256_cmovznz_u64(&x22, x3, (arg3[0]), x12); fiat_p256_cmovznz_u64(&x23, x3, (arg3[1]), x14); fiat_p256_cmovznz_u64(&x24, x3, (arg3[2]), x16); fiat_p256_cmovznz_u64(&x25, x3, (arg3[3]), x18); fiat_p256_cmovznz_u64(&x26, x3, (arg3[4]), x20); fiat_p256_cmovznz_u64(&x27, x3, (arg4[0]), (arg5[0])); fiat_p256_cmovznz_u64(&x28, x3, (arg4[1]), (arg5[1])); fiat_p256_cmovznz_u64(&x29, 
x3, (arg4[2]), (arg5[2])); fiat_p256_cmovznz_u64(&x30, x3, (arg4[3]), (arg5[3])); fiat_p256_addcarryx_u64(&x31, &x32, 0x0, x27, x27); fiat_p256_addcarryx_u64(&x33, &x34, x32, x28, x28); fiat_p256_addcarryx_u64(&x35, &x36, x34, x29, x29); fiat_p256_addcarryx_u64(&x37, &x38, x36, x30, x30); fiat_p256_subborrowx_u64(&x39, &x40, 0x0, x31, UINT64_C(0xffffffffffffffff)); fiat_p256_subborrowx_u64(&x41, &x42, x40, x33, UINT32_C(0xffffffff)); fiat_p256_subborrowx_u64(&x43, &x44, x42, x35, 0x0); fiat_p256_subborrowx_u64(&x45, &x46, x44, x37, UINT64_C(0xffffffff00000001)); fiat_p256_subborrowx_u64(&x47, &x48, x46, x38, 0x0); x49 = (arg4[3]); x50 = (arg4[2]); x51 = (arg4[1]); x52 = (arg4[0]); fiat_p256_subborrowx_u64(&x53, &x54, 0x0, 0x0, x52); fiat_p256_subborrowx_u64(&x55, &x56, x54, 0x0, x51); fiat_p256_subborrowx_u64(&x57, &x58, x56, 0x0, x50); fiat_p256_subborrowx_u64(&x59, &x60, x58, 0x0, x49); fiat_p256_cmovznz_u64(&x61, x60, 0x0, UINT64_C(0xffffffffffffffff)); fiat_p256_addcarryx_u64(&x62, &x63, 0x0, x53, x61); fiat_p256_addcarryx_u64(&x64, &x65, x63, x55, (x61 & UINT32_C(0xffffffff))); fiat_p256_addcarryx_u64(&x66, &x67, x65, x57, 0x0); fiat_p256_addcarryx_u64(&x68, &x69, x67, x59, (x61 & UINT64_C(0xffffffff00000001))); fiat_p256_cmovznz_u64(&x70, x3, (arg5[0]), x62); fiat_p256_cmovznz_u64(&x71, x3, (arg5[1]), x64); fiat_p256_cmovznz_u64(&x72, x3, (arg5[2]), x66); fiat_p256_cmovznz_u64(&x73, x3, (arg5[3]), x68); x74 = (fiat_p256_uint1)(x22 & 0x1); fiat_p256_cmovznz_u64(&x75, x74, 0x0, x7); fiat_p256_cmovznz_u64(&x76, x74, 0x0, x8); fiat_p256_cmovznz_u64(&x77, x74, 0x0, x9); fiat_p256_cmovznz_u64(&x78, x74, 0x0, x10); fiat_p256_cmovznz_u64(&x79, x74, 0x0, x11); fiat_p256_addcarryx_u64(&x80, &x81, 0x0, x22, x75); fiat_p256_addcarryx_u64(&x82, &x83, x81, x23, x76); fiat_p256_addcarryx_u64(&x84, &x85, x83, x24, x77); fiat_p256_addcarryx_u64(&x86, &x87, x85, x25, x78); fiat_p256_addcarryx_u64(&x88, &x89, x87, x26, x79); fiat_p256_cmovznz_u64(&x90, x74, 0x0, x27); fiat_p256_cmovznz_u64(&x91, x74, 0x0, x28); fiat_p256_cmovznz_u64(&x92, x74, 0x0, x29); fiat_p256_cmovznz_u64(&x93, x74, 0x0, x30); fiat_p256_addcarryx_u64(&x94, &x95, 0x0, x70, x90); fiat_p256_addcarryx_u64(&x96, &x97, x95, x71, x91); fiat_p256_addcarryx_u64(&x98, &x99, x97, x72, x92); fiat_p256_addcarryx_u64(&x100, &x101, x99, x73, x93); fiat_p256_subborrowx_u64(&x102, &x103, 0x0, x94, UINT64_C(0xffffffffffffffff)); fiat_p256_subborrowx_u64(&x104, &x105, x103, x96, UINT32_C(0xffffffff)); fiat_p256_subborrowx_u64(&x106, &x107, x105, x98, 0x0); fiat_p256_subborrowx_u64(&x108, &x109, x107, x100, UINT64_C(0xffffffff00000001)); fiat_p256_subborrowx_u64(&x110, &x111, x109, x101, 0x0); fiat_p256_addcarryx_u64(&x112, &x113, 0x0, x6, 0x1); x114 = ((x80 >> 1) | ((x82 << 63) & UINT64_C(0xffffffffffffffff))); x115 = ((x82 >> 1) | ((x84 << 63) & UINT64_C(0xffffffffffffffff))); x116 = ((x84 >> 1) | ((x86 << 63) & UINT64_C(0xffffffffffffffff))); x117 = ((x86 >> 1) | ((x88 << 63) & UINT64_C(0xffffffffffffffff))); x118 = ((x88 & UINT64_C(0x8000000000000000)) | (x88 >> 1)); fiat_p256_cmovznz_u64(&x119, x48, x39, x31); fiat_p256_cmovznz_u64(&x120, x48, x41, x33); fiat_p256_cmovznz_u64(&x121, x48, x43, x35); fiat_p256_cmovznz_u64(&x122, x48, x45, x37); fiat_p256_cmovznz_u64(&x123, x111, x102, x94); fiat_p256_cmovznz_u64(&x124, x111, x104, x96); fiat_p256_cmovznz_u64(&x125, x111, x106, x98); fiat_p256_cmovznz_u64(&x126, x111, x108, x100); *out1 = x112; out2[0] = x7; out2[1] = x8; out2[2] = x9; out2[3] = x10; out2[4] = x11; out3[0] = x114; out3[1] = x115; 
out3[2] = x116; out3[3] = x117; out3[4] = x118; out4[0] = x119; out4[1] = x120; out4[2] = x121; out4[3] = x122; out5[0] = x123; out5[1] = x124; out5[2] = x125; out5[3] = x126; } /* * The function fiat_p256_divstep_precomp returns the precomputed value for Bernstein-Yang-inversion (in montgomery form). * * Postconditions: * eval (from_montgomery out1) = ⌊(m - 1) / 2⌋^(if ⌊log2 m⌋ + 1 < 46 then ⌊(49 * (⌊log2 m⌋ + 1) + 80) / 17⌋ else ⌊(49 * (⌊log2 m⌋ + 1) + 57) / 17⌋) * 0 ≤ eval out1 < m * * Output Bounds: * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] */ static FIAT_P256_FIAT_INLINE void fiat_p256_divstep_precomp(uint64_t out1[4]) { out1[0] = UINT64_C(0x67ffffffb8000000); out1[1] = UINT64_C(0xc000000038000000); out1[2] = UINT64_C(0xd80000007fffffff); out1[3] = UINT64_C(0x2fffffffffffffff); }
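// Editor's note: the sketch below is an illustrative addition written in Rust;
// it is not part of the generated fiat-crypto C above, and the names it
// introduces (P256_LIMBS, conditional_reduce) are the editor's own.
//
// Every routine above whose intermediate result may exceed the modulus by at
// most one multiple (e.g. the tails of fiat_p256_add, fiat_p256_from_montgomery,
// and fiat_p256_to_montgomery) ends with the same constant-time pattern:
// subtract the modulus with a borrow chain (fiat_p256_subborrowx_u64), then use
// the final borrow with fiat_p256_cmovznz_u64 to select either the reduced or
// the original value. A minimal sketch of that idea over four u64 limbs:

/// P-256 modulus, least-significant limb first (the same constants used by the
/// subborrow calls above).
const P256_LIMBS: [u64; 4] = [
    0xffff_ffff_ffff_ffff,
    0x0000_0000_ffff_ffff,
    0x0000_0000_0000_0000,
    0xffff_ffff_0000_0001,
];

/// Returns `a - p` if `a >= p`, else `a`, selecting with a mask rather than a
/// data-dependent branch.
fn conditional_reduce(a: [u64; 4]) -> [u64; 4] {
    let mut diff = [0u64; 4];
    let mut borrow = 0u64; // 0 or 1, like the borrow flags threaded through the C code
    for i in 0..4 {
        let (d1, b1) = a[i].overflowing_sub(P256_LIMBS[i]);
        let (d2, b2) = d1.overflowing_sub(borrow);
        diff[i] = d2;
        borrow = (b1 | b2) as u64;
    }
    // A final borrow means a < p, so keep `a`; otherwise keep the difference.
    // This mirrors fiat_p256_cmovznz_u64(&out[i], borrow, diff[i], a[i]).
    let keep_a = borrow.wrapping_neg(); // all-ones when borrow == 1, else 0
    let mut out = [0u64; 4];
    for i in 0..4 {
        out[i] = (a[i] & keep_a) | (diff[i] & !keep_a);
    }
    out
}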

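// Editor's note: another illustrative Rust sketch, not part of the generated C
// above; the function names are the editor's own. fiat_p256_to_bytes and
// fiat_p256_from_bytes unroll a plain little-endian (de)serialization of four
// 64-bit limbs into 32 bytes, with limb i occupying bytes 8*i .. 8*i + 8.
// The same byte layout, written compactly:

fn p256_limbs_to_le_bytes(limbs: [u64; 4]) -> [u8; 32] {
    let mut out = [0u8; 32];
    for (i, limb) in limbs.iter().enumerate() {
        out[i * 8..(i + 1) * 8].copy_from_slice(&limb.to_le_bytes());
    }
    out
}

fn p256_limbs_from_le_bytes(bytes: [u8; 32]) -> [u64; 4] {
    let mut limbs = [0u64; 4];
    for (i, chunk) in bytes.chunks_exact(8).enumerate() {
        let mut buf = [0u8; 8];
        buf.copy_from_slice(chunk);
        limbs[i] = u64::from_le_bytes(buf);
    }
    limbs
}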
, other_prime_len_bits: BitLength, ) -> Result, LimbSliceError> { // `elem_exp_consttime_inner` is parameterized on `STORAGE_LIMBS` only so // we can run tests with larger-than-supported-in-operation test vectors. elem_exp_consttime_inner::( out, base, oneRRR, exponent, p, other_prime_len_bits, ) } // The maximum modulus size supported for `elem_exp_consttime` in normal // operation. const ELEM_EXP_CONSTTIME_MAX_MODULUS_LIMBS: usize = 2048 / LIMB_BITS; const _LIMBS_PER_CHUNK_DIVIDES_ELEM_EXP_CONSTTIME_MAX_MODULUS_LIMBS: () = assert!(ELEM_EXP_CONSTTIME_MAX_MODULUS_LIMBS % limbs512::LIMBS_PER_CHUNK == 0); const WINDOW_BITS: u32 = 5; const TABLE_ENTRIES: usize = 1 << WINDOW_BITS; const STORAGE_ENTRIES: usize = TABLE_ENTRIES + if cfg!(target_arch = "x86_64") { 3 } else { 0 }; #[cfg(not(target_arch = "x86_64"))] fn elem_exp_consttime_inner( out: Storage, base_mod_n: &Elem, oneRRR: &One, exponent: &PrivateExponent, m: &Modulus, other_prime_len_bits: BitLength, ) -> Result, LimbSliceError> { use crate::{bssl, limb::Window}; let base_rinverse: Elem = elem_reduced(out, base_mod_n, m, other_prime_len_bits); let num_limbs = m.limbs().len(); let m_chunked: AsChunks = match slice::as_chunks(m.limbs()) { (m, []) => m, _ => { return Err(LimbSliceError::len_mismatch(LenMismatchError::new( num_limbs, ))) } }; let cpe = m_chunked.len(); // 512-bit chunks per entry. // This code doesn't have the strict alignment requirements that the x86_64 // version does, but uses the same aligned storage for convenience. assert!(STORAGE_LIMBS % (STORAGE_ENTRIES * limbs512::LIMBS_PER_CHUNK) == 0); // TODO: `const` let mut table = limbs512::AlignedStorage::::zeroed(); let mut table = table .aligned_chunks_mut(TABLE_ENTRIES, cpe) .map_err(LimbSliceError::len_mismatch)?; // TODO: Rewrite the below in terms of `AsChunks`. let table = table.as_flattened_mut(); fn gather(table: &[Limb], acc: &mut Elem, i: Window) { prefixed_extern! { fn LIMBS_select_512_32( r: *mut Limb, table: *const Limb, num_limbs: c::size_t, i: Window, ) -> bssl::Result; } Result::from(unsafe { LIMBS_select_512_32(acc.limbs.as_mut_ptr(), table.as_ptr(), acc.limbs.len(), i) }) .unwrap(); } fn power( table: &[Limb], mut acc: Elem, m: &Modulus, i: Window, mut tmp: Elem, ) -> (Elem, Elem) { for _ in 0..WINDOW_BITS { acc = elem_squared(acc, m); } gather(table, &mut tmp, i); let acc = elem_mul(&tmp, acc, m); (acc, tmp) } fn entry(table: &[Limb], i: usize, num_limbs: usize) -> &[Limb] { &table[(i * num_limbs)..][..num_limbs] } fn entry_mut(table: &mut [Limb], i: usize, num_limbs: usize) -> &mut [Limb] { &mut table[(i * num_limbs)..][..num_limbs] } // table[0] = base**0 (i.e. 1). 
m.oneR(entry_mut(table, 0, num_limbs)); // table[1] = base*R == (base/R * RRR)/R limbs_mul_mont( ( entry_mut(table, 1, num_limbs), base_rinverse.limbs.as_ref(), oneRRR.as_ref().limbs.as_ref(), ), m.limbs(), m.n0(), m.cpu_features(), )?; for i in 2..TABLE_ENTRIES { let (src1, src2) = if i % 2 == 0 { (i / 2, i / 2) } else { (i - 1, 1) }; let (previous, rest) = table.split_at_mut(num_limbs * i); let src1 = entry(previous, src1, num_limbs); let src2 = entry(previous, src2, num_limbs); let dst = entry_mut(rest, 0, num_limbs); limbs_mul_mont((dst, src1, src2), m.limbs(), m.n0(), m.cpu_features())?; } let mut acc = Elem { limbs: base_rinverse.limbs, encoding: PhantomData, }; let tmp = m.alloc_zero(); let tmp = Elem { limbs: tmp.limbs, encoding: PhantomData, }; let (acc, _) = limb::fold_5_bit_windows( exponent.limbs(), |initial_window| { gather(&table, &mut acc, initial_window); (acc, tmp) }, |(acc, tmp), window| power(&table, acc, m, window, tmp), ); Ok(acc.into_unencoded(m)) } #[cfg(target_arch = "x86_64")] fn elem_exp_consttime_inner( out: Storage, base_mod_n: &Elem, oneRRR: &One, exponent: &PrivateExponent, m: &Modulus, other_prime_len_bits: BitLength, ) -> Result, LimbSliceError> { use super::limbs::x86_64::mont::{ gather5, mul_mont5, mul_mont_gather5_amm, power5_amm, scatter5, sqr_mont5, }; use crate::{ cpu::{ intel::{Adx, Bmi2}, GetFeature as _, }, limb::{LeakyWindow, Window}, polyfill::slice::AsChunksMut, }; let n0 = m.n0(); let cpu2 = m.cpu_features().get_feature(); let cpu3 = m.cpu_features().get_feature(); if base_mod_n.limbs.len() != m.limbs().len() * 2 { return Err(LimbSliceError::len_mismatch(LenMismatchError::new( base_mod_n.limbs.len(), ))); } let m_original: AsChunks = match slice::as_chunks(m.limbs()) { (m, []) => m, _ => return Err(LimbSliceError::len_mismatch(LenMismatchError::new(8))), }; let cpe = m_original.len(); // 512-bit chunks per entry let oneRRR = &oneRRR.as_ref().limbs; let oneRRR = match slice::as_chunks(oneRRR) { (c, []) => c, _ => { return Err(LimbSliceError::len_mismatch(LenMismatchError::new( oneRRR.len(), ))) } }; // The x86_64 assembly was written under the assumption that the input data // is aligned to `MOD_EXP_CTIME_ALIGN` bytes, which was/is 64 in OpenSSL. // Subsequently, it was changed such that, according to BoringSSL, they // only require 16 byte alignment. We enforce the old, stronger, alignment // unless/until we can see a benefit to reducing it. // // Similarly, OpenSSL uses the x86_64 assembly functions by giving it only // inputs `tmp`, `am`, and `np` that immediately follow the table. // According to BoringSSL, in older versions of the OpenSSL code, this // extra space was required for memory safety because the assembly code // would over-read the table; according to BoringSSL, this is no longer the // case. Regardless, the upstream code also contained comments implying // that this was also important for performance. For now, we do as OpenSSL // did/does. 
const MOD_EXP_CTIME_ALIGN: usize = 64; // Required by const _TABLE_ENTRIES_IS_32: () = assert!(TABLE_ENTRIES == 32); const _STORAGE_ENTRIES_HAS_3_EXTRA: () = assert!(STORAGE_ENTRIES == TABLE_ENTRIES + 3); assert!(STORAGE_LIMBS % (STORAGE_ENTRIES * limbs512::LIMBS_PER_CHUNK) == 0); // TODO: `const` let mut table = limbs512::AlignedStorage::::zeroed(); let mut table = table .aligned_chunks_mut(STORAGE_ENTRIES, cpe) .map_err(LimbSliceError::len_mismatch)?; let (mut table, mut state) = table.split_at_mut(TABLE_ENTRIES * cpe); assert_eq!((table.as_ptr() as usize) % MOD_EXP_CTIME_ALIGN, 0); // These are named `(tmp, am, np)` in BoringSSL. let (mut acc, mut rest) = state.split_at_mut(cpe); let (mut base_cached, mut m_cached) = rest.split_at_mut(cpe); // "To improve cache locality" according to upstream. m_cached .as_flattened_mut() .copy_from_slice(m_original.as_flattened()); let m_cached = m_cached.as_ref(); let out: Elem = elem_reduced(out, base_mod_n, m, other_prime_len_bits); let base_rinverse = match slice::as_chunks(&out.limbs) { (c, []) => c, _ => { return Err(LimbSliceError::len_mismatch(LenMismatchError::new( out.limbs.len(), ))) } }; // base_cached = base*R == (base/R * RRR)/R mul_mont5( base_cached.as_mut(), base_rinverse, oneRRR, m_cached, n0, cpu2, )?; let base_cached = base_cached.as_ref(); let mut out = Storage::from(out); // recycle. // Fill in all the powers of 2 of `acc` into the table using only squaring and without any // gathering, storing the last calculated power into `acc`. fn scatter_powers_of_2( mut table: AsChunksMut, mut acc: AsChunksMut, m_cached: AsChunks, n0: &N0, mut i: LeakyWindow, cpu: Option<(Adx, Bmi2)>, ) -> Result<(), LimbSliceError> { loop { scatter5(acc.as_ref(), table.as_mut(), i)?; i *= 2; if i >= TABLE_ENTRIES as LeakyWindow { break; } sqr_mont5(acc.as_mut(), m_cached, n0, cpu)?; } Ok(()) } // All entries in `table` will be Montgomery encoded. // acc = table[0] = base**0 (i.e. 1). m.oneR(acc.as_flattened_mut()); scatter5(acc.as_ref(), table.as_mut(), 0)?; // acc = base**1 (i.e. base). acc.as_flattened_mut() .copy_from_slice(base_cached.as_flattened()); // Fill in entries 1, 2, 4, 8, 16. scatter_powers_of_2(table.as_mut(), acc.as_mut(), m_cached, n0, 1, cpu2)?; // Fill in entries 3, 6, 12, 24; 5, 10, 20, 30; 7, 14, 28; 9, 18; 11, 22; 13, 26; 15, 30; // 17; 19; 21; 23; 25; 27; 29; 31. for i in (3..(TABLE_ENTRIES as LeakyWindow)).step_by(2) { let power = Window::from(i - 1); assert!(power < 32); // Not secret, unsafe { mul_mont_gather5_amm( acc.as_mut(), base_cached, table.as_ref(), m_cached, n0, power, cpu3, ) }?; scatter_powers_of_2(table.as_mut(), acc.as_mut(), m_cached, n0, i, cpu2)?; } let table = table.as_ref(); let acc = limb::fold_5_bit_windows( exponent.limbs(), |initial_window| { unsafe { gather5(acc.as_mut(), table, initial_window) } .unwrap_or_else(unwrap_impossible_limb_slice_error); acc }, |mut acc, window| { unsafe { power5_amm(acc.as_mut(), table, m_cached, n0, window, cpu3) } .unwrap_or_else(unwrap_impossible_limb_slice_error); acc }, ); // Reuse `base_rinverse`'s limbs to save an allocation. out.limbs.copy_from_slice(acc.as_flattened()); Ok(from_montgomery_amm(out, m)) } /// Verified a == b**-1 (mod m), i.e. a**-1 == b (mod m). 
pub fn verify_inverses_consttime( a: &Elem, b: Elem, m: &Modulus, ) -> Result<(), error::Unspecified> { let r = elem_mul(a, b, m); limb::verify_limbs_equal_1_leak_bit(&r.limbs) } #[inline] pub fn elem_verify_equal_consttime( a: &Elem, b: &Elem, ) -> Result<(), error::Unspecified> { let equal = limb::limbs_equal_limbs_consttime(&a.limbs, &b.limbs) .unwrap_or_else(unwrap_impossible_len_mismatch_error); if !equal.leak() { return Err(error::Unspecified); } Ok(()) } #[cold] #[inline(never)] fn unwrap_impossible_len_mismatch_error(LenMismatchError { .. }: LenMismatchError) -> T { unreachable!() } #[cold] #[inline(never)] fn unwrap_impossible_limb_slice_error(err: LimbSliceError) { match err { LimbSliceError::LenMismatch(_) => unreachable!(), LimbSliceError::TooShort(_) => unreachable!(), LimbSliceError::TooLong(_) => unreachable!(), } } #[cfg(test)] mod tests { use super::*; use crate::cpu; use crate::testutil as test; // Type-level representation of an arbitrary modulus. struct M {} impl PublicModulus for M {} #[test] fn test_elem_exp_consttime() { let cpu_features = cpu::features(); test::run( test_vector_file!("../../crypto/fipsmodule/bn/test/mod_exp_tests.txt"), |section, test_case| { assert_eq!(section, ""); let m = consume_modulus::(test_case, "M"); let m = m.modulus(cpu_features); let expected_result = consume_elem(test_case, "ModExp", &m); let base = consume_elem(test_case, "A", &m); let e = { let bytes = test_case.consume_bytes("E"); PrivateExponent::from_be_bytes_for_test_only(untrusted::Input::from(&bytes), &m) .expect("valid exponent") }; let oneRR = One::newRR(m.alloc_zero(), &m); let oneRRR = One::newRRR(oneRR, &m); // `base` in the test vectors is reduced (mod M) already but // the API expects the bsae to be (mod N) where N = M * P for // some other prime of the same length. Fake that here. // Pretend there's another prime of equal length. struct N {} let other_modulus_len_bits = m.len_bits(); let base: Elem = { let mut limbs = BoxedLimbs::zero(base.limbs.len() * 2); limbs[..base.limbs.len()].copy_from_slice(&base.limbs); Elem { limbs, encoding: PhantomData, } }; let too_big = m.limbs().len() > ELEM_EXP_CONSTTIME_MAX_MODULUS_LIMBS; let actual_result = if !too_big { elem_exp_consttime( m.alloc_zero(), &base, &oneRRR, &e, &m, other_modulus_len_bits, ) } else { let actual_result = elem_exp_consttime( m.alloc_zero(), &base, &oneRRR, &e, &m, other_modulus_len_bits, ); // TODO: Be more specific with which error we expect? assert!(actual_result.is_err()); // Try again with a larger-than-normally-supported limit elem_exp_consttime_inner::<_, _, { (4096 / LIMB_BITS) * STORAGE_ENTRIES }>( m.alloc_zero(), &base, &oneRRR, &e, &m, other_modulus_len_bits, ) }; match actual_result { Ok(r) => assert_elem_eq(&r, &expected_result), Err(LimbSliceError::LenMismatch { .. }) => panic!(), Err(LimbSliceError::TooLong { .. }) => panic!(), Err(LimbSliceError::TooShort { .. }) => panic!(), }; Ok(()) }, ) } // TODO: fn test_elem_exp_vartime() using // "src/rsa/bigint_elem_exp_vartime_tests.txt". See that file for details. // In the meantime, the function is tested indirectly via the RSA // verification and signing tests. 
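    // Editor's note: the sketch and test below are illustrative additions, not
    // part of ring's test suite; `toy_fixed_window_pow` and its test name are
    // the editor's own. The `elem_exp_consttime` implementations above use a
    // fixed 5-bit window (WINDOW_BITS = 5, TABLE_ENTRIES = 32): precompute
    // base**0 .. base**31, then for each successive 5-bit window of the
    // exponent perform five squarings followed by one multiplication with the
    // table entry selected by that window. The toy below shows the same
    // control flow with plain u128 arithmetic and a small modulus; unlike the
    // real code, its table lookup is not constant-time and it skips all of
    // the Montgomery-domain bookkeeping.
    fn toy_fixed_window_pow(base: u128, exponent_be: &[u8], m: u128) -> u128 {
        assert!(m > 1 && m <= u64::MAX as u128, "toy modulus must fit in 64 bits");
        let w = 5usize; // same window width as WINDOW_BITS above
        let entries = 1usize << w; // 32, matching TABLE_ENTRIES

        // table[i] = base**i (mod m)
        let mut table = vec![1u128 % m; entries];
        for i in 1..entries {
            table[i] = (table[i - 1] * (base % m)) % m;
        }

        // Walk the exponent bits MSB-first, up to five at a time.
        let nbits = exponent_be.len() * 8;
        let bit = |i: usize| (exponent_be[i / 8] >> (7 - (i % 8))) & 1;
        let mut acc = 1u128 % m;
        let mut i = 0;
        while i < nbits {
            let take = w.min(nbits - i);
            let mut window = 0usize;
            for _ in 0..take {
                acc = (acc * acc) % m; // one squaring per exponent bit consumed
                window = (window << 1) | usize::from(bit(i));
                i += 1;
            }
            acc = (acc * table[window]) % m; // multiply by the gathered table entry
        }
        acc
    }

    #[test]
    fn toy_fixed_window_pow_matches_direct_exponentiation() {
        // 7**11 mod 101 == 51
        assert_eq!(
            toy_fixed_window_pow(7, &[0b0000_1011], 101),
            7u128.pow(11) % 101
        );
    }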
#[test] fn test_elem_mul() { let cpu_features = cpu::features(); test::run( test_vector_file!("../../crypto/fipsmodule/bn/test/mod_mul_tests.txt"), |section, test_case| { assert_eq!(section, ""); let m = consume_modulus::(test_case, "M"); let m = m.modulus(cpu_features); let expected_result = consume_elem(test_case, "ModMul", &m); let a = consume_elem(test_case, "A", &m); let b = consume_elem(test_case, "B", &m); let b = into_encoded(m.alloc_zero(), b, &m); let a = into_encoded(m.alloc_zero(), a, &m); let actual_result = elem_mul(&a, b, &m); let actual_result = actual_result.into_unencoded(&m); assert_elem_eq(&actual_result, &expected_result); Ok(()) }, ) } #[test] fn test_elem_squared() { let cpu_features = cpu::features(); test::run( test_vector_file!("bigint_elem_squared_tests.txt"), |section, test_case| { assert_eq!(section, ""); let m = consume_modulus::(test_case, "M"); let m = m.modulus(cpu_features); let expected_result = consume_elem(test_case, "ModSquare", &m); let a = consume_elem(test_case, "A", &m); let a = into_encoded(m.alloc_zero(), a, &m); let actual_result = elem_squared(a, &m); let actual_result = actual_result.into_unencoded(&m); assert_elem_eq(&actual_result, &expected_result); Ok(()) }, ) } #[test] fn test_elem_reduced() { let cpu_features = cpu::features(); test::run( test_vector_file!("bigint_elem_reduced_tests.txt"), |section, test_case| { assert_eq!(section, ""); struct M {} let m_ = consume_modulus::(test_case, "M"); let m = m_.modulus(cpu_features); let expected_result = consume_elem(test_case, "R", &m); let a = consume_elem_unchecked::(test_case, "A", expected_result.limbs.len() * 2); let other_modulus_len_bits = m_.len_bits(); let actual_result = elem_reduced(m.alloc_zero(), &a, &m, other_modulus_len_bits); let oneRR = One::newRR(m.alloc_zero(), &m); let actual_result = elem_mul(oneRR.as_ref(), actual_result, &m); assert_elem_eq(&actual_result, &expected_result); Ok(()) }, ) } #[test] fn test_elem_reduced_once() { let cpu_features = cpu::features(); test::run( test_vector_file!("bigint_elem_reduced_once_tests.txt"), |section, test_case| { assert_eq!(section, ""); struct M {} struct O {} let m = consume_modulus::(test_case, "m"); let m = m.modulus(cpu_features); let a = consume_elem_unchecked::(test_case, "a", m.limbs().len()); let expected_result = consume_elem::(test_case, "r", &m); let other_modulus_len_bits = m.len_bits(); let actual_result = elem_reduced_once(m.alloc_zero(), &a, &m, other_modulus_len_bits); assert_elem_eq(&actual_result, &expected_result); Ok(()) }, ) } fn consume_elem( test_case: &mut test::TestCase, name: &str, m: &Modulus, ) -> Elem { let value = test_case.consume_bytes(name); Elem::from_be_bytes_padded(untrusted::Input::from(&value), m).unwrap() } fn consume_elem_unchecked( test_case: &mut test::TestCase, name: &str, num_limbs: usize, ) -> Elem { let bytes = test_case.consume_bytes(name); let mut limbs = BoxedLimbs::zero(num_limbs); limb::parse_big_endian_and_pad_consttime(untrusted::Input::from(&bytes), &mut limbs) .unwrap(); Elem { limbs, encoding: PhantomData, } } fn consume_modulus(test_case: &mut test::TestCase, name: &str) -> OwnedModulus { let value = test_case.consume_bytes(name); OwnedModulus::from( OwnedModulusValue::from_be_bytes(untrusted::Input::from(&value)).unwrap(), ) } fn assert_elem_eq(a: &Elem, b: &Elem) { if elem_verify_equal_consttime(a, b).is_err() { panic!("{:x?} != {:x?}", &*a.limbs, &*b.limbs); } } fn into_encoded(out: Storage, a: Elem, m: &Modulus) -> Elem { let oneRR = One::newRR(out, m); 
elem_mul(oneRR.as_ref(), a, m) } } ring-0.17.14/src/arithmetic/constant.rs000064400000000000000000000013671046102023000160630ustar 00000000000000use crate::limb::LeakyLimb; use core::mem::size_of; const fn parse_digit(d: u8) -> u8 { match d.to_ascii_lowercase() { b'0'..=b'9' => d - b'0', b'a'..=b'f' => d - b'a' + 10, _ => panic!(), } } // TODO: this would be nicer as a trait, but currently traits don't support const functions pub const fn limbs_from_hex(hex: &str) -> [LeakyLimb; LIMBS] { let hex = hex.as_bytes(); let mut limbs = [0; LIMBS]; let limb_nibbles = size_of::() * 2; let mut i = 0; while i < hex.len() { let char = hex[hex.len() - 1 - i]; let val = parse_digit(char); limbs[i / limb_nibbles] |= (val as LeakyLimb) << ((i % limb_nibbles) * 4); i += 1; } limbs } ring-0.17.14/src/arithmetic/ffi.rs000064400000000000000000000071331046102023000147730ustar 00000000000000// Copyright 2024-2025 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::{inout::AliasingSlices3, n0::N0, LimbSliceError, MAX_LIMBS, MIN_LIMBS}; use crate::{c, limb::Limb, polyfill::usize_from_u32}; use core::{mem::size_of, num::NonZeroUsize}; const _MAX_LIMBS_ADDRESSES_MEMORY_SAFETY_ISSUES: () = { // BoringSSL's limit: 8 kiloBYTES. const BN_MONTGOMERY_MAX_WORDS: usize = (8 * 1092) / size_of::(); assert!(MAX_LIMBS <= BN_MONTGOMERY_MAX_WORDS); // Some 64-bit assembly implementations were written to take `len` as a // `c_int`, so they zero out the undefined top half of `len` to convert it // to a `usize`. But, others don't. assert!(MAX_LIMBS <= usize_from_u32(u32::MAX)); }; macro_rules! bn_mul_mont_ffi { ( $in_out:expr, $n:expr, $n0:expr, $cpu:expr, unsafe { ($MIN_LEN:expr, $MOD_LEN:expr, $Cpu:ty) => $f:ident }) => {{ use crate::{c, limb::Limb}; prefixed_extern! { // `r` and/or 'a' and/or 'b' may alias. // XXX: BoringSSL declares these functions to return `int`. fn $f( r: *mut Limb, a: *const Limb, b: *const Limb, n: *const Limb, n0: &N0, len: c::NonZero_size_t, ); } unsafe { crate::arithmetic::ffi::bn_mul_mont_ffi::<$Cpu, { $MIN_LEN }, { $MOD_LEN }>( $in_out, $n, $n0, $cpu, $f, ) } }}; } #[inline] pub(super) unsafe fn bn_mul_mont_ffi( in_out: impl AliasingSlices3, n: &[Limb], n0: &N0, cpu: Cpu, f: unsafe extern "C" fn( r: *mut Limb, a: *const Limb, b: *const Limb, n: *const Limb, n0: &N0, len: c::NonZero_size_t, ), ) -> Result<(), LimbSliceError> { assert_eq!(n.len() % LEN_MOD, 0); // The caller should guard against this. /// The x86 implementation of `bn_mul_mont`, at least, requires at least 4 /// limbs. For a long time we have required 4 limbs for all targets, though /// this may be unnecessary. const _MIN_LIMBS_AT_LEAST_4: () = assert!(MIN_LIMBS >= 4); // We haven't tested shorter lengths. 
assert!(LEN_MIN >= MIN_LIMBS); if n.len() < LEN_MIN { return Err(LimbSliceError::too_short(n.len())); } let len = NonZeroUsize::new(n.len()).unwrap_or_else(|| { // Unreachable because we checked against `LEN_MIN`, and we checked // `LEN_MIN` is nonzero. unreachable!() }); // Avoid stack overflow from the alloca inside. if len.get() > MAX_LIMBS { return Err(LimbSliceError::too_long(n.len())); } in_out .with_non_dangling_non_null_pointers_rab(len, |r, a, b| { let n = n.as_ptr(); let _: Cpu = cpu; unsafe { f(r, a, b, n, n0, len) }; }) .map_err(LimbSliceError::len_mismatch) } ring-0.17.14/src/arithmetic/inout.rs000064400000000000000000000150461046102023000153670ustar 00000000000000// Copyright 2025 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. pub(crate) use crate::error::LenMismatchError; use core::num::NonZeroUsize; pub(crate) trait AliasingSlices2 { /// The pointers passed to `f` will be valid and non-null, and will not /// be dangling, so they can be passed to C functions. /// /// The first pointer, `r`, may be pointing to uninitialized memory for /// `expected_len` elements of type `T`, properly aligned and writable. /// `f` must not read from `r` before writing to it. /// /// The second & third pointers, `a` and `b`, point to `expected_len` /// values of type `T`, properly aligned. /// /// `r`, `a`, and/or `b` may alias each other only in the following ways: /// `ptr::eq(r, a)`, `ptr::eq(r, b)`, and/or `ptr::eq(a, b)`; i.e. they /// will not be "overlapping." /// /// Implementations of this trait shouldn't override this default /// implementation. #[inline(always)] fn with_non_dangling_non_null_pointers_ra( self, expected_len: NonZeroUsize, f: impl FnOnce(*mut T, *const T) -> R, ) -> Result where Self: Sized, { self.with_potentially_dangling_non_null_pointers_ra(expected_len.get(), f) } /// If `expected_len == 0` then the pointers passed to `f` may be /// dangling pointers, which should not be passed to C functions. In all /// other respects, this works like /// `Self::with_non_dangling_non_null_pointers_rab`. /// /// Implementations of this trait should implement this method and not /// `with_non_dangling_non_null_pointers_rab`. Users of this trait should /// use `with_non_dangling_non_null_pointers_rab` and not this. 
fn with_potentially_dangling_non_null_pointers_ra( self, expected_len: usize, f: impl FnOnce(*mut T, *const T) -> R, ) -> Result; } impl AliasingSlices2 for &mut [T] { fn with_potentially_dangling_non_null_pointers_ra( self, expected_len: usize, f: impl FnOnce(*mut T, *const T) -> R, ) -> Result { let r = self; if r.len() != expected_len { return Err(LenMismatchError::new(r.len())); } Ok(f(r.as_mut_ptr(), r.as_ptr())) } } impl AliasingSlices2 for (&mut [T], &[T]) { fn with_potentially_dangling_non_null_pointers_ra( self, expected_len: usize, f: impl FnOnce(*mut T, *const T) -> R, ) -> Result { let (r, a) = self; if r.len() != expected_len { return Err(LenMismatchError::new(r.len())); } if a.len() != expected_len { return Err(LenMismatchError::new(a.len())); } Ok(f(r.as_mut_ptr(), a.as_ptr())) } } pub(crate) trait AliasingSlices3 { /// The pointers passed to `f` will all be non-null and properly aligned, /// and will not be dangling. /// /// The first pointer, `r` points to potentially-uninitialized writable /// space for `expected_len` elements of type `T`. Accordingly, `f` must /// not read from `r` before writing to it. /// /// The second & third pointers, `a` and `b`, point to `expected_len` /// initialized values of type `T`. /// /// `r`, `a`, and/or `b` may alias each other, but only in the following /// ways: `ptr::eq(r, a)`, `ptr::eq(r, b)`, and/or `ptr::eq(a, b)`; they /// will not be "overlapping." /// /// Implementations of this trait shouldn't override this default /// implementation. #[inline(always)] fn with_non_dangling_non_null_pointers_rab( self, expected_len: NonZeroUsize, f: impl FnOnce(*mut T, *const T, *const T) -> R, ) -> Result where Self: Sized, { self.with_potentially_dangling_non_null_pointers_rab(expected_len.get(), f) } /// If `expected_len == 0` then the pointers passed to `f` may be /// dangling pointers, which should not be passed to C functions. In all /// other respects, this works like /// `Self::with_non_dangling_non_null_pointers_rab`. /// /// Implementations of this trait should implement this method and not /// `with_non_dangling_non_null_pointers_rab`. Users of this trait should /// use `with_non_dangling_non_null_pointers_rab` and not this. fn with_potentially_dangling_non_null_pointers_rab( self, expected_len: usize, f: impl FnOnce(*mut T, *const T, *const T) -> R, ) -> Result; } impl AliasingSlices3 for &mut [T] { fn with_potentially_dangling_non_null_pointers_rab( self, expected_len: usize, f: impl FnOnce(*mut T, *const T, *const T) -> R, ) -> Result { >::with_potentially_dangling_non_null_pointers_ra( self, expected_len, |r, a| f(r, r, a), ) } } impl AliasingSlices3 for (&mut [T], &[T], &[T]) { fn with_potentially_dangling_non_null_pointers_rab( self, expected_len: usize, f: impl FnOnce(*mut T, *const T, *const T) -> R, ) -> Result { let (r, a, b) = self; ((r, a), b).with_potentially_dangling_non_null_pointers_rab(expected_len, f) } } impl AliasingSlices3 for (RA, &[T]) where RA: AliasingSlices2, { fn with_potentially_dangling_non_null_pointers_rab( self, expected_len: usize, f: impl FnOnce(*mut T, *const T, *const T) -> R, ) -> Result { let (ra, b) = self; if b.len() != expected_len { return Err(LenMismatchError::new(b.len())); } ra.with_potentially_dangling_non_null_pointers_ra(expected_len, |r, a| f(r, a, b.as_ptr())) } } ring-0.17.14/src/arithmetic/limbs/aarch64/mod.rs000064400000000000000000000015341046102023000173430ustar 00000000000000// Copyright 2025 Brian Smith. 
// // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #![cfg(all(target_arch = "aarch64", target_endian = "little"))] pub(in super::super) mod mont; ring-0.17.14/src/arithmetic/limbs/aarch64/mont.rs000064400000000000000000000042551046102023000175440ustar 00000000000000// Copyright 2025 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #![cfg(all(target_arch = "aarch64", target_endian = "little"))] use super::super::super::{inout::AliasingSlices3 as _, n0::N0, LimbSliceError, MAX_LIMBS}; use crate::{ c, limb::Limb, polyfill::slice::{AsChunks, AsChunksMut}, }; use core::num::NonZeroUsize; #[inline] pub(in super::super::super) fn sqr_mont5( mut in_out: AsChunksMut, n: AsChunks, n0: &N0, ) -> Result<(), LimbSliceError> { prefixed_extern! { // `r` and/or 'a' may alias. // XXX: BoringSSL (kinda, implicitly) declares this to return `int`. // `num` must be a non-zero multiple of 8. fn bn_sqr8x_mont( rp: *mut Limb, ap: *const Limb, ap_again: *const Limb, np: *const Limb, n0: &N0, num: c::NonZero_size_t); } let in_out = in_out.as_flattened_mut(); let n = n.as_flattened(); let num_limbs = NonZeroUsize::new(n.len()).ok_or_else(|| LimbSliceError::too_short(n.len()))?; // Avoid stack overflow from the alloca inside. if num_limbs.get() > MAX_LIMBS { return Err(LimbSliceError::too_long(num_limbs.get())); } in_out .with_non_dangling_non_null_pointers_rab(num_limbs, |r, a, a_again| { let n = n.as_ptr(); // Non-dangling because num_limbs > 0. unsafe { bn_sqr8x_mont(r, a, a_again, n, n0, num_limbs) }; }) .map_err(LimbSliceError::len_mismatch) } ring-0.17.14/src/arithmetic/limbs/mod.rs000064400000000000000000000014531046102023000161130ustar 00000000000000// Copyright 2025 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. pub(super) mod aarch64; pub(super) mod x86_64; ring-0.17.14/src/arithmetic/limbs/x86_64/mod.rs000064400000000000000000000015031046102023000170450ustar 00000000000000// Copyright 2025 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #![cfg(target_arch = "x86_64")] pub(in super::super::super) mod mont; ring-0.17.14/src/arithmetic/limbs/x86_64/mont.rs000064400000000000000000000241551046102023000172530ustar 00000000000000// Copyright 2015-2025 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
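// Montgomery multiplication computes a * b * R^-1 (mod n), where
// R = 2^(Limb::BITS * num_limbs). As a reference sketch only (standard
// word-by-word Montgomery reduction; the assembly below interleaves the
// multiplication and reduction steps rather than doing them separately):
//
// ```
// m = (t mod 2^Limb::BITS) * n0  (mod 2^Limb::BITS)  // n0 = -n^-1 mod 2^Limb::BITS
// t = (t + m * n) / 2^Limb::BITS
// // ...repeated once per limb, followed by a final conditional subtraction of n.
// ```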
#![cfg(target_arch = "x86_64")] use super::super::super::{ inout::{AliasingSlices2, AliasingSlices3}, n0::N0, LimbSliceError, MAX_LIMBS, }; use crate::{ c, cpu::intel::{Adx, Bmi1, Bmi2}, error::LenMismatchError, limb::{LeakyWindow, Limb, Window}, polyfill::slice::{AsChunks, AsChunksMut}, }; use core::num::NonZeroUsize; const _512_IS_LIMB_BITS_TIMES_8: () = assert!(8 * Limb::BITS == 512); #[inline] pub(in super::super::super) fn mul_mont5( mut r: AsChunksMut, a: AsChunks, b: AsChunks, m: AsChunks, n0: &N0, maybe_adx_bmi2: Option<(Adx, Bmi2)>, ) -> Result<(), LimbSliceError> { mul_mont5_4x( (r.as_flattened_mut(), a.as_flattened(), b.as_flattened()), m.into(), n0, maybe_adx_bmi2, ) } pub const MIN_4X: usize = 8; #[inline] pub(in super::super::super) fn mul_mont5_4x( in_out: impl AliasingSlices3, n: AsChunks, n0: &N0, maybe_adx_bmi2: Option<(Adx, Bmi2)>, ) -> Result<(), LimbSliceError> { const MOD_4X: usize = 4; let n = n.as_flattened(); if let Some(cpu) = maybe_adx_bmi2 { bn_mul_mont_ffi!(in_out, n, n0, cpu, unsafe { (MIN_4X, MOD_4X, (Adx, Bmi2)) => bn_mulx4x_mont }) } else { bn_mul_mont_ffi!(in_out, n, n0, (), unsafe { (MIN_4X, MOD_4X, ()) => bn_mul4x_mont }) } } #[inline] pub(in super::super::super) fn sqr_mont5( mut in_out: AsChunksMut, n: AsChunks, n0: &N0, maybe_adx_bmi2: Option<(Adx, Bmi2)>, ) -> Result<(), LimbSliceError> { prefixed_extern! { // `r` and/or 'a' may alias. // XXX: BoringSSL declares this to return `int`. // `num` must be a non-zero multiple of 8. fn bn_sqr8x_mont( rp: *mut Limb, ap: *const Limb, mulx_adx_capable: Limb, np: *const Limb, n0: &N0, num: c::NonZero_size_t); } let in_out = in_out.as_flattened_mut(); let n = n.as_flattened(); let num_limbs = NonZeroUsize::new(n.len()).ok_or_else(|| LimbSliceError::too_short(n.len()))?; // Avoid stack overflow from the alloca inside. if num_limbs.get() > MAX_LIMBS { return Err(LimbSliceError::too_long(num_limbs.get())); } // `Limb::from(mulx_adx.is_some())`, but intentionally branchy. let mulx_adx_capable = match maybe_adx_bmi2 { Some(_) => Limb::from(true), None => Limb::from(false), }; in_out .with_non_dangling_non_null_pointers_ra(num_limbs, |r, a| { let n = n.as_ptr(); // Non-dangling because num_limbs > 0. unsafe { bn_sqr8x_mont(r, a, mulx_adx_capable, n, n0, num_limbs) }; }) .map_err(LimbSliceError::len_mismatch) } #[inline(always)] pub(in super::super::super) fn scatter5( a: AsChunks, mut table: AsChunksMut, power: LeakyWindow, ) -> Result<(), LimbSliceError> { prefixed_extern! { // Upstream uses `num: c::size_t` too, and `power: c::size_t`; see // `_MAX_LIMBS_ADDRESSES_MEMORY_SAFETY_ISSUES`. fn bn_scatter5( inp: *const Limb, num: c::NonZero_size_t, table: *mut Limb, power: LeakyWindow, ); } let num_limbs = check_common(a, table.as_ref())?; let a = a.as_flattened(); let table = table.as_flattened_mut(); assert!(power < 32); unsafe { bn_scatter5(a.as_ptr(), num_limbs, table.as_mut_ptr(), power) }; Ok(()) } // SAFETY: `power` must be less than 32. #[inline(always)] pub(in super::super::super) unsafe fn gather5( mut r: AsChunksMut, table: AsChunks, power: Window, ) -> Result<(), LimbSliceError> { prefixed_extern! { // Upstream uses `num: c::size_t` too, and `power: c::size_t`; see // `_MAX_LIMBS_ADDRESSES_MEMORY_SAFETY_ISSUES`. fn bn_gather5( out: *mut Limb, num: c::NonZero_size_t, table: *const Limb, power: Window); } let num_limbs = check_common(r.as_ref(), table)?; let r = r.as_flattened_mut(); let table = table.as_flattened(); // SAFETY: We cannot assert that `power` is in range because it is secret. 
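    // `check_common` below requires `table.len() == 32 * num_limbs`, i.e. one
    // table entry per possible 5-bit window value. Illustrative (hedged)
    // pairing with `scatter5`, whose `power` argument is public; the names
    // `entry`, `table_mut`, `table_ref`, `out`, `i`, and `w` are placeholders:
    //
    // ```
    // scatter5(entry, table_mut, i)?;          // store entry i (i is public)
    // unsafe { gather5(out, table_ref, w)? };  // select entry w (w may be secret)
    // ```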
// TODO: Create a `Window5` type that is guaranteed to be in range. unsafe { bn_gather5(r.as_mut_ptr(), num_limbs, table.as_ptr(), power) }; Ok(()) } // SAFETY: `power` must be less than 32. #[inline(always)] pub(in super::super::super) unsafe fn mul_mont_gather5_amm( mut r: AsChunksMut, a: AsChunks, table: AsChunks, n: AsChunks, n0: &N0, power: Window, maybe_adx_bmi1_bmi2: Option<(Adx, Bmi1, Bmi2)>, ) -> Result<(), LimbSliceError> { prefixed_extern! { // Upstream has `num: c_int` and `power: c_int`; see // `_MAX_LIMBS_ADDRESSES_MEMORY_SAFETY_ISSUES`. fn bn_mul4x_mont_gather5( rp: *mut Limb, ap: *const Limb, table: *const Limb, np: *const Limb, n0: &N0, num: c::NonZero_size_t, power: Window, ); // Upstream has `num: c_int` and `power: c_int`; see // `_MAX_LIMBS_ADDRESSES_MEMORY_SAFETY_ISSUES`. fn bn_mulx4x_mont_gather5( rp: *mut Limb, ap: *const Limb, table: *const Limb, np: *const Limb, n0: &N0, num: c::NonZero_size_t, power: Window, ); } let num_limbs = check_common_with_n(r.as_ref(), table, n)?; let a = a.as_flattened(); if a.len() != num_limbs.get() { return Err(LimbSliceError::len_mismatch(LenMismatchError::new(a.len()))); } let r = r.as_flattened_mut(); let r = r.as_mut_ptr(); let a = a.as_ptr(); let table = table.as_flattened(); let table = table.as_ptr(); let n = n.as_flattened(); let n = n.as_ptr(); // SAFETY: We cannot assert that `power` is in range because it is secret. // TODO: Create a `Window5` type that is guaranteed to be in range. if maybe_adx_bmi1_bmi2.is_some() { unsafe { bn_mulx4x_mont_gather5(r, a, table, n, n0, num_limbs, power) } } else { unsafe { bn_mul4x_mont_gather5(r, a, table, n, n0, num_limbs, power) } }; Ok(()) } // SAFETY: `power` must be less than 32. #[inline(always)] pub(in super::super::super) unsafe fn power5_amm( mut in_out: AsChunksMut, table: AsChunks, n: AsChunks, n0: &N0, power: Window, maybe_adx_bmi1_bmi2: Option<(Adx, Bmi1, Bmi2)>, ) -> Result<(), LimbSliceError> { prefixed_extern! { // Upstream has `num: c_int` and `power: c_int`; see // `_MAX_LIMBS_ADDRESSES_MEMORY_SAFETY_ISSUES`. fn bn_power5_nohw( rp: *mut Limb, ap: *const Limb, table: *const Limb, np: *const Limb, n0: &N0, num: c::NonZero_size_t, power: Window, ); // Upstream has `num: c_int` and `power: c_int`; see // `_MAX_LIMBS_ADDRESSES_MEMORY_SAFETY_ISSUES`. fn bn_powerx5( rp: *mut Limb, ap: *const Limb, table: *const Limb, np: *const Limb, n0: &N0, num: c::NonZero_size_t, power: Window, ); } let num_limbs = check_common_with_n(in_out.as_ref(), table, n)?; let in_out = in_out.as_flattened_mut(); let r = in_out.as_mut_ptr(); let a = in_out.as_ptr(); let table = table.as_flattened(); let table = table.as_ptr(); let n = n.as_flattened(); let n = n.as_ptr(); // SAFETY: We cannot assert that `power` is in range because it is secret. // TODO: Create a `Window5` type that is guaranteed to be in range. if maybe_adx_bmi1_bmi2.is_some() { unsafe { bn_powerx5(r, a, table, n, n0, num_limbs, power) } } else { unsafe { bn_power5_nohw(r, a, table, n, n0, num_limbs, power) } }; Ok(()) } // Helps the compiler will be able to hoist all of these checks out of the // loops in the caller. Try to help the compiler by doing the checks // consistently in each function and also by inlining this function and all the // callers. #[inline(always)] fn check_common( a: AsChunks, table: AsChunks, ) -> Result { assert_eq!((table.as_ptr() as usize) % 16, 0); // According to BoringSSL. 
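    // Beyond the alignment requirement above, the checks below are: `a` is
    // non-empty, `a` has at most `MAX_LIMBS` limbs, and the table holds
    // exactly 32 entries of `a.len()` limbs each, i.e.
    // `table.len() == 32 * a.len()`.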
let a = a.as_flattened(); let table = table.as_flattened(); let num_limbs = NonZeroUsize::new(a.len()).ok_or_else(|| LimbSliceError::too_short(a.len()))?; if num_limbs.get() > MAX_LIMBS { return Err(LimbSliceError::too_long(a.len())); } if num_limbs.get() * 32 != table.len() { return Err(LimbSliceError::len_mismatch(LenMismatchError::new( table.len(), ))); }; Ok(num_limbs) } #[inline(always)] fn check_common_with_n( a: AsChunks, table: AsChunks, n: AsChunks, ) -> Result { // Choose `a` instead of `n` so that every function starts with // `check_common` passing the exact same arguments, so that the compiler // can easily de-dupe the checks. let num_limbs = check_common(a, table)?; let n = n.as_flattened(); if n.len() != num_limbs.get() { return Err(LimbSliceError::len_mismatch(LenMismatchError::new(n.len()))); } Ok(num_limbs) } ring-0.17.14/src/arithmetic/limbs512/mod.rs000064400000000000000000000015131046102023000163400ustar 00000000000000// Copyright 2025 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. mod storage; pub(super) use self::storage::{AlignedStorage, LIMBS_PER_CHUNK}; ring-0.17.14/src/arithmetic/limbs512/storage.rs000064400000000000000000000044101046102023000172240ustar 00000000000000// Copyright 2025 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use crate::{ error::LenMismatchError, limb::{Limb, LIMB_BITS}, polyfill::slice::{self, AsChunksMut}, }; use core::mem::{align_of, size_of}; // Some x86_64 assembly is written under the assumption that some of its // input data and/or temporary storage is aligned to `MOD_EXP_CTIME_ALIGN` // bytes, which was/is 64 in OpenSSL. // // We use this in the non-X86-64 implementation of exponentiation as well, // with the hope of converging th two implementations into one. #[repr(C, align(64))] pub struct AlignedStorage([Limb; N]); const _LIMB_SIZE_DIVIDES_ALIGNMENT: () = assert!(align_of::>() % size_of::() == 0); pub const LIMBS_PER_CHUNK: usize = 512 / LIMB_BITS; impl AlignedStorage { pub fn zeroed() -> Self { assert_eq!(N % LIMBS_PER_CHUNK, 0); // TODO: const. Self([0; N]) } // The result will have every chunk aligned on a 64 byte boundary. 
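    // Hedged usage sketch; the entry and chunk counts here are illustrative
    // only, not taken from any particular caller:
    //
    // ```
    // let mut table = AlignedStorage::<{ 32 * LIMBS_PER_CHUNK }>::zeroed();
    // let chunks = table.aligned_chunks_mut(32, 1)?; // 32 entries, 1 chunk each
    // ```
    //
    // Each 512-bit chunk then starts on a 64-byte boundary because the storage
    // itself is 64-byte aligned and each chunk is exactly 64 bytes long.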
pub fn aligned_chunks_mut( &mut self, num_entries: usize, chunks_per_entry: usize, ) -> Result, LenMismatchError> { let total_limbs = num_entries * chunks_per_entry * LIMBS_PER_CHUNK; let len = self.0.len(); let flattened = self .0 .get_mut(..total_limbs) .ok_or_else(|| LenMismatchError::new(len))?; match slice::as_chunks_mut(flattened) { (chunks, []) => Ok(chunks), (_, r) => Err(LenMismatchError::new(r.len())), } } } ring-0.17.14/src/arithmetic/montgomery.rs000064400000000000000000000307541046102023000164340ustar 00000000000000// Copyright 2017-2025 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. pub use super::n0::N0; use super::{inout::AliasingSlices3, LimbSliceError, MIN_LIMBS}; use crate::cpu; use cfg_if::cfg_if; // Indicates that the element is not encoded; there is no *R* factor // that needs to be canceled out. #[derive(Copy, Clone)] pub enum Unencoded {} // Indicates that the element is encoded; the value has one *R* // factor that needs to be canceled out. #[derive(Copy, Clone)] pub enum R {} // Indicates the element is encoded three times; the value has three // *R* factors that need to be canceled out. #[allow(clippy::upper_case_acronyms)] #[derive(Copy, Clone)] pub enum RRR {} // Indicates the element is encoded twice; the value has two *R* // factors that need to be canceled out. #[derive(Copy, Clone)] pub enum RR {} // Indicates the element is inversely encoded; the value has one // 1/*R* factor that needs to be canceled out. #[derive(Copy, Clone)] pub enum RInverse {} pub trait Encoding {} impl Encoding for RRR {} impl Encoding for RR {} impl Encoding for R {} impl Encoding for Unencoded {} impl Encoding for RInverse {} /// The encoding of the result of a reduction. pub trait ReductionEncoding { type Output: Encoding; } impl ReductionEncoding for RRR { type Output = RR; } impl ReductionEncoding for RR { type Output = R; } impl ReductionEncoding for R { type Output = Unencoded; } impl ReductionEncoding for Unencoded { type Output = RInverse; } /// The encoding of the result of a multiplication. 
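///
/// Montgomery multiplication contributes one 1/*R* factor, so multiplying a
/// value carrying *i* *R* factors by one carrying *j* factors yields
/// *i* + *j* - 1 factors. For example, (`RR`, `Unencoded`) produces `R`, and
/// (`RR`, `RR`) produces `RRR`, matching the impls below.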
pub trait ProductEncoding { type Output: Encoding; } impl ProductEncoding for (Unencoded, E) { type Output = E::Output; } impl ProductEncoding for (R, E) { type Output = E; } impl ProductEncoding for (RR, RR) { type Output = RRR; } impl ProductEncoding for (RInverse, E) where E::Output: ReductionEncoding, { type Output = <::Output as ReductionEncoding>::Output; } // XXX: Rust doesn't allow overlapping impls, // TODO (if/when Rust allows it): // impl ProductEncoding for // (E1, E2) { // type Output = <(E2, E1) as ProductEncoding>::Output; // } impl ProductEncoding for (RR, Unencoded) { type Output = <(Unencoded, RR) as ProductEncoding>::Output; } impl ProductEncoding for (RR, RInverse) { type Output = <(RInverse, RR) as ProductEncoding>::Output; } impl ProductEncoding for (RRR, RInverse) { type Output = <(RInverse, RRR) as ProductEncoding>::Output; } #[allow(unused_imports)] use crate::{bssl, c, limb::Limb}; #[inline(always)] pub(super) fn limbs_mul_mont( in_out: impl AliasingSlices3, n: &[Limb], n0: &N0, cpu: cpu::Features, ) -> Result<(), LimbSliceError> { const MOD_FALLBACK: usize = 1; // No restriction. cfg_if! { if #[cfg(all(target_arch = "aarch64", target_endian = "little"))] { let _: cpu::Features = cpu; const MIN_4X: usize = 4; const MOD_4X: usize = 4; if n.len() >= MIN_4X && n.len() % MOD_4X == 0 { bn_mul_mont_ffi!(in_out, n, n0, (), unsafe { (MIN_4X, MOD_4X, ()) => bn_mul4x_mont }) } else { bn_mul_mont_ffi!(in_out, n, n0, (), unsafe { (MIN_LIMBS, MOD_FALLBACK, ()) => bn_mul_mont_nohw }) } } else if #[cfg(all(target_arch = "arm", target_endian = "little"))] { const MIN_8X: usize = 8; const MOD_8X: usize = 8; if n.len() >= MIN_8X && n.len() % MOD_8X == 0 { use crate::cpu::{GetFeature as _, arm::Neon}; if let Some(cpu) = cpu.get_feature() { return bn_mul_mont_ffi!(in_out, n, n0, cpu, unsafe { (MIN_8X, MOD_8X, Neon) => bn_mul8x_mont_neon }); } } // The ARM version of `bn_mul_mont_nohw` has a minimum of 2. const _MIN_LIMBS_AT_LEAST_2: () = assert!(MIN_LIMBS >= 2); bn_mul_mont_ffi!(in_out, n, n0, (), unsafe { (MIN_LIMBS, MOD_FALLBACK, ()) => bn_mul_mont_nohw }) } else if #[cfg(target_arch = "x86")] { use crate::{cpu::GetFeature as _, cpu::intel::Sse2}; // The X86 implementation of `bn_mul_mont` has a minimum of 4. const _MIN_LIMBS_AT_LEAST_4: () = assert!(MIN_LIMBS >= 4); if let Some(cpu) = cpu.get_feature() { bn_mul_mont_ffi!(in_out, n, n0, cpu, unsafe { (MIN_LIMBS, MOD_FALLBACK, Sse2) => bn_mul_mont }) } else { // This isn't really an FFI call; it's defined below. unsafe { super::ffi::bn_mul_mont_ffi::<(), {MIN_LIMBS}, 1>(in_out, n, n0, (), bn_mul_mont_fallback) } } } else if #[cfg(target_arch = "x86_64")] { use crate::{cpu::GetFeature as _, polyfill::slice}; use super::limbs::x86_64; if n.len() >= x86_64::mont::MIN_4X { if let (n, []) = slice::as_chunks(n) { return x86_64::mont::mul_mont5_4x(in_out, n, n0, cpu.get_feature()); } } bn_mul_mont_ffi!(in_out, n, n0, (), unsafe { (MIN_LIMBS, MOD_FALLBACK, ()) => bn_mul_mont_nohw }) } else { // Use the fallback implementation implemented below through the // FFI wrapper defined below, so that Rust and C code both go // through `bn_mul_mont`. bn_mul_mont_ffi!(in_out, n, n0, cpu, unsafe { (MIN_LIMBS, MOD_FALLBACK, cpu::Features) => bn_mul_mont }) } } } cfg_if! { if #[cfg(not(any( all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little"), target_arch = "x86_64")))] { // TODO: Stop calling this from C and un-export it. #[cfg(not(target_arch = "x86"))] prefixed_export! 
{ unsafe extern "C" fn bn_mul_mont( r: *mut Limb, a: *const Limb, b: *const Limb, n: *const Limb, n0: &N0, num_limbs: c::NonZero_size_t, ) { unsafe { bn_mul_mont_fallback(r, a, b, n, n0, num_limbs) } } } #[cfg_attr(target_arch = "x86", cold)] #[cfg_attr(target_arch = "x86", inline(never))] unsafe extern "C" fn bn_mul_mont_fallback( r: *mut Limb, a: *const Limb, b: *const Limb, n: *const Limb, n0: &N0, num_limbs: c::NonZero_size_t, ) { use super::MAX_LIMBS; let num_limbs = num_limbs.get(); // The mutable pointer `r` may alias `a` and/or `b`, so the lifetimes of // any slices for `a` or `b` must not overlap with the lifetime of any // mutable for `r`. // Nothing aliases `n` let n = unsafe { core::slice::from_raw_parts(n, num_limbs) }; let mut tmp = [0; 2 * MAX_LIMBS]; let tmp = &mut tmp[..(2 * num_limbs)]; { let a: &[Limb] = unsafe { core::slice::from_raw_parts(a, num_limbs) }; let b: &[Limb] = unsafe { core::slice::from_raw_parts(b, num_limbs) }; limbs_mul(tmp, a, b); } let r: &mut [Limb] = unsafe { core::slice::from_raw_parts_mut(r, num_limbs) }; limbs_from_mont_in_place(r, tmp, n, n0); } } } // `bigint` needs then when the `alloc` feature is enabled. `bn_mul_mont` above needs this when // we are using the platforms for which we don't have `bn_mul_mont` in assembly. #[cfg(any( feature = "alloc", not(any( all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little"), target_arch = "x86_64" )) ))] pub(super) fn limbs_from_mont_in_place(r: &mut [Limb], tmp: &mut [Limb], m: &[Limb], n0: &N0) { prefixed_extern! { fn bn_from_montgomery_in_place( r: *mut Limb, num_r: c::size_t, a: *mut Limb, num_a: c::size_t, n: *const Limb, num_n: c::size_t, n0: &N0, ) -> bssl::Result; } Result::from(unsafe { bn_from_montgomery_in_place( r.as_mut_ptr(), r.len(), tmp.as_mut_ptr(), tmp.len(), m.as_ptr(), m.len(), n0, ) }) .unwrap() } #[cfg(not(any( all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little"), target_arch = "x86_64" )))] fn limbs_mul(r: &mut [Limb], a: &[Limb], b: &[Limb]) { debug_assert_eq!(r.len(), 2 * a.len()); debug_assert_eq!(a.len(), b.len()); let ab_len = a.len(); r[..ab_len].fill(0); for (i, &b_limb) in b.iter().enumerate() { r[ab_len + i] = unsafe { limbs_mul_add_limb(r[i..][..ab_len].as_mut_ptr(), a.as_ptr(), b_limb, ab_len) }; } } #[cfg(any( test, not(any( all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little"), target_arch = "x86_64", )) ))] prefixed_extern! 
{ // `r` must not alias `a` #[must_use] fn limbs_mul_add_limb(r: *mut Limb, a: *const Limb, b: Limb, num_limbs: c::size_t) -> Limb; } /// r = r**2 pub(super) fn limbs_square_mont( r: &mut [Limb], n: &[Limb], n0: &N0, cpu: cpu::Features, ) -> Result<(), LimbSliceError> { #[cfg(all(target_arch = "aarch64", target_endian = "little"))] { use super::limbs::aarch64; use crate::polyfill::slice; if let ((r, []), (n, [])) = (slice::as_chunks_mut(r), slice::as_chunks(n)) { return aarch64::mont::sqr_mont5(r, n, n0); } } #[cfg(target_arch = "x86_64")] { use super::limbs::x86_64; use crate::{cpu::GetFeature as _, polyfill::slice}; if let ((r, []), (n, [])) = (slice::as_chunks_mut(r), slice::as_chunks(n)) { return x86_64::mont::sqr_mont5(r, n, n0, cpu.get_feature()); } } limbs_mul_mont(r, n, n0, cpu) } #[cfg(test)] mod tests { use super::super::MAX_LIMBS; use super::*; use crate::limb::Limb; #[test] // TODO: wasm fn test_mul_add_words() { const ZERO: Limb = 0; const MAX: Limb = ZERO.wrapping_sub(1); static TEST_CASES: &[(&[Limb], &[Limb], Limb, Limb, &[Limb])] = &[ (&[0], &[0], 0, 0, &[0]), (&[MAX], &[0], MAX, 0, &[MAX]), (&[0], &[MAX], MAX, MAX - 1, &[1]), (&[MAX], &[MAX], MAX, MAX, &[0]), (&[0, 0], &[MAX, MAX], MAX, MAX - 1, &[1, MAX]), (&[1, 0], &[MAX, MAX], MAX, MAX - 1, &[2, MAX]), (&[MAX, 0], &[MAX, MAX], MAX, MAX, &[0, 0]), (&[0, 1], &[MAX, MAX], MAX, MAX, &[1, 0]), (&[MAX, MAX], &[MAX, MAX], MAX, MAX, &[0, MAX]), ]; for (i, (r_input, a, w, expected_retval, expected_r)) in TEST_CASES.iter().enumerate() { let mut r = [0; MAX_LIMBS]; let r = { let r = &mut r[..r_input.len()]; r.copy_from_slice(r_input); r }; assert_eq!(r.len(), a.len()); // Sanity check let actual_retval = unsafe { limbs_mul_add_limb(r.as_mut_ptr(), a.as_ptr(), *w, a.len()) }; assert_eq!(&r, expected_r, "{}: {:x?} != {:x?}", i, r, expected_r); assert_eq!( actual_retval, *expected_retval, "{}: {:x?} != {:x?}", i, actual_retval, *expected_retval ); } } } ring-0.17.14/src/arithmetic/n0.rs000064400000000000000000000023711046102023000145430ustar 00000000000000// Copyright 2015-2022 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use crate::limb::Limb; #[derive(Clone, Copy)] #[repr(transparent)] pub struct N0([Limb; 2]); impl N0 { #[cfg(feature = "alloc")] pub(super) const LIMBS_USED: usize = 64 / crate::limb::LIMB_BITS; #[inline] pub const fn precalculated(n0: u64) -> Self { #[cfg(target_pointer_width = "64")] { Self([n0, 0]) } #[cfg(target_pointer_width = "32")] { Self([n0 as Limb, (n0 >> crate::limb::LIMB_BITS) as Limb]) } } } ring-0.17.14/src/arithmetic.rs000064400000000000000000000030421046102023000142220ustar 00000000000000// Copyright 2017-2023 Brian Smith. 
// // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. pub(crate) use self::{constant::limbs_from_hex, limb_slice_error::LimbSliceError}; use crate::{error::LenMismatchError, limb::LIMB_BITS}; #[macro_use] mod ffi; mod constant; #[cfg(feature = "alloc")] pub mod bigint; pub(crate) mod inout; mod limbs; mod limbs512; pub mod montgomery; mod n0; // The minimum number of limbs allowed for any `&[Limb]` operation. // // TODO: Use `256 / LIMB_BITS` so that the limit is independent of limb size. pub const MIN_LIMBS: usize = 4; // The maximum number of limbs allowed for any `&[Limb]` operation. pub const MAX_LIMBS: usize = 8192 / LIMB_BITS; cold_exhaustive_error! { enum limb_slice_error::LimbSliceError { len_mismatch => LenMismatch(LenMismatchError), too_short => TooShort(usize), too_long => TooLong(usize), } } ring-0.17.14/src/bb/boolmask.rs000064400000000000000000000025341046102023000142700ustar 00000000000000// Copyright 2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::Word; use core::ops; // BoolMask is either `BoolMask::TRUE` or `BoolMask::FALSE`. #[repr(transparent)] pub struct BoolMask(Word); impl BoolMask { #[cfg(test)] pub(super) const TRUE: Self = Self(Word::MAX); #[cfg(test)] pub(super) const FALSE: Self = Self(0); /// Returns true if `self` is `BoolMask::TRUE`; otherwise, returns false /// (`self` is `BoolMask::FALSE`). pub(crate) fn leak(self) -> bool { self.0 != 0 } } impl ops::BitAnd for BoolMask { type Output = Self; fn bitand(self, rhs: Self) -> Self { Self(self.0 & rhs.0) } } ring-0.17.14/src/bb/leaky.rs000064400000000000000000000022221046102023000135600ustar 00000000000000// Copyright 2015-2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #[cfg(target_pointer_width = "64")] type CompilerWord = u64; #[cfg(target_pointer_width = "32")] type CompilerWord = u32; /// A native word that isn't secret. /// /// `LeakyWord` supports `as` conversions to/from native types. /// /// XXX: This isn't the native word size on targets where a pointer isn't the /// same size as a native word. TODO: Fix this. pub(crate) type LeakyWord = CompilerWord; ring-0.17.14/src/bb/mod.rs000064400000000000000000000123701046102023000132370ustar 00000000000000// Copyright 2015-2025 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! Building blocks. use crate::{c, error}; use core::{ffi::c_int, num::NonZeroUsize}; mod boolmask; mod leaky; mod word; pub(crate) use self::{boolmask::BoolMask, leaky::LeakyWord, word::Word}; /// Returns `Ok(())` if `a == b` and `Err(error::Unspecified)` otherwise. pub fn verify_slices_are_equal(a: &[u8], b: &[u8]) -> Result<(), error::Unspecified> { let len = a.len(); // Arbitrary choice. if b.len() != len { return Err(error::Unspecified); } match NonZeroUsize::new(len) { Some(len) => { let a = a.as_ptr(); let b = b.as_ptr(); // SAFETY: `a` and `b` are valid non-null non-dangling pointers to `len` // bytes. let result = unsafe { CRYPTO_memcmp(a, b, len) }; match result { 0 => Ok(()), _ => Err(error::Unspecified), } } None => Ok(()), // Empty slices are equal. } } prefixed_extern! { fn CRYPTO_memcmp(a: *const u8, b: *const u8, len: c::NonZero_size_t) -> c_int; } pub(crate) fn xor_16(a: [u8; 16], b: [u8; 16]) -> [u8; 16] { let a = u128::from_ne_bytes(a); let b = u128::from_ne_bytes(b); let r = a ^ b; r.to_ne_bytes() } #[inline(always)] pub(crate) fn xor_assign<'a>(a: impl IntoIterator, b: u8) { a.into_iter().for_each(|a| *a ^= b); } /// XORs the first N bytes of `b` into `a`, where N is /// `core::cmp::min(a.len(), b.len())`. #[inline(always)] pub(crate) fn xor_assign_at_start<'a>( a: impl IntoIterator, b: impl IntoIterator, ) { a.into_iter().zip(b).for_each(|(a, b)| *a ^= *b); } #[cfg(test)] mod tests { use super::*; use crate::{bssl, rand}; fn leak_in_test(a: BoolMask) -> bool { a.leak() } #[test] fn test_constant_time() -> Result<(), error::Unspecified> { prefixed_extern! 
{ fn bssl_constant_time_test_main() -> bssl::Result; } Result::from(unsafe { bssl_constant_time_test_main() }) } #[test] fn constant_time_conditional_memcpy() -> Result<(), error::Unspecified> { let rng = rand::SystemRandom::new(); for _ in 0..100 { let mut out = rand::generate::<[u8; 256]>(&rng)?.expose(); let input = rand::generate::<[u8; 256]>(&rng)?.expose(); // Mask to 16 bits to make zero more likely than it would otherwise be. let b = (rand::generate::<[u8; 1]>(&rng)?.expose()[0] & 0x0f) == 0; let ref_in = input; let ref_out = if b { input } else { out }; prefixed_extern! { fn bssl_constant_time_test_conditional_memcpy(dst: &mut [u8; 256], src: &[u8; 256], b: BoolMask); } unsafe { bssl_constant_time_test_conditional_memcpy( &mut out, &input, if b { BoolMask::TRUE } else { BoolMask::FALSE }, ) } assert_eq!(ref_in, input); assert_eq!(ref_out, out); } Ok(()) } #[test] fn constant_time_conditional_memxor() -> Result<(), error::Unspecified> { let rng = rand::SystemRandom::new(); for _ in 0..256 { let mut out = rand::generate::<[u8; 256]>(&rng)?.expose(); let input = rand::generate::<[u8; 256]>(&rng)?.expose(); // Mask to 16 bits to make zero more likely than it would otherwise be. let b = (rand::generate::<[u8; 1]>(&rng)?.expose()[0] & 0x0f) != 0; let ref_in = input; let mut ref_out = out; if b { xor_assign_at_start(&mut ref_out, &ref_in) }; prefixed_extern! { fn bssl_constant_time_test_conditional_memxor(dst: &mut [u8; 256], src: &[u8; 256], b: BoolMask); } unsafe { bssl_constant_time_test_conditional_memxor( &mut out, &input, if b { BoolMask::TRUE } else { BoolMask::FALSE }, ); } assert_eq!(ref_in, input); assert_eq!(ref_out, out); } Ok(()) } #[test] fn test_bool_mask_bitwise_and_is_logical_and() { assert!(leak_in_test(BoolMask::TRUE & BoolMask::TRUE)); assert!(!leak_in_test(BoolMask::TRUE & BoolMask::FALSE)); assert!(!leak_in_test(BoolMask::FALSE & BoolMask::TRUE)); assert!(!leak_in_test(BoolMask::FALSE & BoolMask::FALSE)); } } ring-0.17.14/src/bb/word.rs000064400000000000000000000031341046102023000134310ustar 00000000000000// Copyright 2015-2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::LeakyWord; /// A native word that may hold a secret. /// /// XXX: Currently this is a type alias of `LeakyWord` so it doesn't enforce, /// except by convention, the prevention of leaks. This is a temporary state to /// support the refactorings that will /// /// XXX: This isn't the native word size on targets where a pointer isn't the /// same size as a native word. TODO: Fix this. 
/// /// XXX: Over time, we'll evolve Word into a newtype with an API that minimizes /// leaks and makes all leaks explicit, like so: pub(crate) type Word = LeakyWord; /* TODO: #[repr(transparent)] pub(crate) struct Word(LeakyWord); impl Word { pub fn leak_word(self) -> LeakyWord { self.0 } } impl From for Word { fn from(w: LeakyWord) -> Self { // TODO: Use a stronger `black_box`. Self(core::hint::black_box(w)) } } */ ring-0.17.14/src/bits.rs000064400000000000000000000106551046102023000130420ustar 00000000000000// Copyright 2016 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! Bit lengths. use crate::{error::InputTooLongError, polyfill}; /// The length of something, in bits. /// /// This can represent a bit length that isn't a whole number of bytes. #[derive(Clone, Copy, Debug, Eq, PartialEq, PartialOrd)] #[repr(transparent)] pub struct BitLength(T); pub(crate) trait FromByteLen: Sized { /// Constructs a `BitLength` from the given length in bytes. /// /// Fails if `bytes * 8` is too large for a `T`. fn from_byte_len(bytes: T) -> Result>; } impl FromByteLen for BitLength { #[inline] fn from_byte_len(bytes: usize) -> Result { match bytes.checked_mul(8) { Some(bits) => Ok(Self(bits)), None => Err(InputTooLongError::new(bytes)), } } } impl FromByteLen for BitLength { #[inline] fn from_byte_len(bytes: u64) -> Result> { match bytes.checked_mul(8) { Some(bits) => Ok(Self(bits)), None => Err(InputTooLongError::new(bytes)), } } } impl FromByteLen for BitLength { #[inline] fn from_byte_len(bytes: usize) -> Result> { match polyfill::u64_from_usize(bytes).checked_mul(8) { Some(bits) => Ok(Self(bits)), None => Err(InputTooLongError::new(bytes)), } } } impl BitLength { /// Constructs a `BitLength` from the given length in bits. #[inline] pub const fn from_bits(bits: T) -> Self { Self(bits) } } impl BitLength { /// The number of bits this bit length represents, as the underlying type. #[inline] pub fn as_bits(self) -> T { self.0 } } // Lengths measured in bits, where all arithmetic is guaranteed not to // overflow. impl BitLength { #[cfg(feature = "alloc")] #[inline] pub(crate) fn half_rounded_up(&self) -> Self { let round_up = self.0 & 1; Self((self.0 / 2) + round_up) } /// The bit length, rounded up to a whole number of bytes. #[inline] pub const fn as_usize_bytes_rounded_up(&self) -> usize { // Equivalent to (self.0 + 7) / 8, except with no potential for // overflow and without branches. 
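        // The shifts below OR bits 0..=2 of `self.0` into bit 0, so `round_up`
        // is 1 exactly when `self.0` is not a multiple of 8. Worked example:
        // self.0 = 12 (0b1100) gives (3 | 6 | 12) & 1 = 1, so the result is
        // 12 / 8 + 1 = 2 bytes, as expected for a 12-bit value.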
// Branchless round_up = if self.0 & 0b111 != 0 { 1 } else { 0 }; let round_up = ((self.0 >> 2) | (self.0 >> 1) | self.0) & 1; (self.0 / 8) + round_up } #[cfg(feature = "alloc")] #[inline] pub(crate) fn try_sub_1(self) -> Result { let sum = self.0.checked_sub(1).ok_or(crate::error::Unspecified)?; Ok(Self(sum)) } } impl BitLength { pub fn to_be_bytes(self) -> [u8; 8] { self.0.to_be_bytes() } } #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] impl From> for BitLength { fn from(BitLength(value): BitLength) -> Self { BitLength(polyfill::u64_from_usize(value)) } } impl TryFrom> for BitLength { type Error = >::Error; fn try_from(BitLength(value): BitLength) -> Result { value.try_into().map(BitLength) } } const _TEST_AS_USIZE_BYTES_ROUNDED_UP_EVEN: () = assert!(BitLength::from_bits(8192).as_usize_bytes_rounded_up() == 8192 / 8); const _TEST_AS_USIZE_BYTES_ROUNDED_UP_ONE_BIT_HIGH: () = assert!(BitLength::from_bits(8192 + 1).as_usize_bytes_rounded_up() == (8192 / 8) + 1); const _TEST_AS_USIZE_BYTES_ROUNDED_UP_SEVEN_BITS_HIGH: () = assert!(BitLength::from_bits(8192 + 7).as_usize_bytes_rounded_up() == (8192 / 8) + 1); ring-0.17.14/src/bssl.rs000064400000000000000000000036521046102023000130430ustar 00000000000000// Copyright 2015 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use crate::error; use core::ffi::c_int; /// An `int` returned from a foreign function containing **1** if the function /// was successful or **0** if an error occurred. This is the convention used by /// C code in `ring`. #[must_use] #[repr(transparent)] pub struct Result(c_int); impl From for core::result::Result<(), error::Unspecified> { fn from(ret: Result) -> Self { match ret.0 { 1 => Ok(()), c => { debug_assert_eq!(c, 0, "`bssl::Result` value must be 0 or 1"); Err(error::Unspecified) } } } } #[cfg(test)] mod tests { mod result { use crate::bssl; use core::{ ffi::c_int, mem::{align_of, size_of}, }; #[test] fn size_and_alignment() { type Underlying = c_int; assert_eq!(size_of::(), size_of::()); assert_eq!(align_of::(), align_of::()); } #[test] fn semantics() { assert!(Result::from(bssl::Result(0)).is_err()); assert!(Result::from(bssl::Result(1)).is_ok()); } } } ring-0.17.14/src/c.rs000064400000000000000000000026631046102023000123230ustar 00000000000000// Copyright 2016-2019 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! C types. //! //! Avoid using the `libc` crate to get C types since `libc` doesn't support //! all the targets we need to support. It turns out that the few types we need //! are all uniformly defined on the platforms we care about. This will //! probably change if/when we support 16-bit platforms or platforms where //! `usize` and `uintptr_t` are different sizes. //! //! TODO(MSRV, feature(c_size_t)): Use `core::{ffi::c_size_t}`. //! TODO(MSRV-1.79): Use `NonZero`. // Keep in sync with the checks in base.h that verify these assumptions. #![allow(dead_code)] use core::num::NonZeroUsize; pub(crate) type size_t = usize; pub(crate) type NonZero_size_t = NonZeroUsize; ring-0.17.14/src/cpu/arm/darwin.rs000064400000000000000000000110001046102023000147140ustar 00000000000000// Copyright 2016-2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::{Aes, Neon, PMull, Sha256, Sha512, CAPS_STATIC}; use crate::polyfill::cstr; // ``` // $ rustc +1.61.0 --print cfg --target=aarch64-apple-ios | grep -E "neon|aes|sha|pmull" // target_feature="aes" // target_feature="neon" // target_feature="sha2" // $ rustc +1.61.0 --print cfg --target=aarch64-apple-darwin | grep -E "neon|aes|sha|pmull" // target_feature="aes" // target_feature="neon" // target_feature="sha2" // target_feature="sha3" // ``` // // XXX/TODO(coverage)/TODO(size): aarch64-apple-darwin is statically guaranteed to have "sha3" but // other aarch64-apple-* targets require dynamic detection. Since we don't have test coverage for // the other targets yet, we wouldn't have a way of testing the dynamic detection if we statically // enabled `Sha512` for -darwin. So instead, temporarily, we statically ignore the static // availability of the feature on -darwin so that it runs the dynamic detection. pub const MIN_STATIC_FEATURES: u32 = Neon::mask() | Aes::mask() | Sha256::mask() | PMull::mask(); pub const FORCE_DYNAMIC_DETECTION: u32 = !MIN_STATIC_FEATURES; // MSRV: Enforce 1.61.0 onaarch64-apple-*, in particular) prior to. Earlier // versions of Rust before did not report the AAarch64 CPU features correctly // for these targets. Cargo.toml specifies `rust-version` but versions before // Rust 1.56 don't know about it. 
#[allow(clippy::assertions_on_constants)] const _AARCH64_APPLE_TARGETS_EXPECTED_FEATURES: () = assert!((CAPS_STATIC & MIN_STATIC_FEATURES) == MIN_STATIC_FEATURES); // Ensure we don't accidentally allow features statically beyond // `MIN_STATIC_FEATURES` so that dynamic detection is done uniformly for // all of these targets. #[allow(clippy::assertions_on_constants)] const _AARCH64_APPLE_DARWIN_TARGETS_EXPECTED_FEATURES: () = assert!(CAPS_STATIC == MIN_STATIC_FEATURES); pub fn detect_features() -> u32 { fn detect_feature(name: cstr::Ref) -> bool { use crate::polyfill; use core::mem; use libc::{c_int, c_void}; let mut value: c_int = 0; let mut len = mem::size_of_val(&value); let value_ptr = polyfill::ptr::from_mut(&mut value).cast::(); // SAFETY: `value_ptr` is a valid pointer to `value` and `len` is the size of `value`. let rc = unsafe { libc::sysctlbyname(name.as_ptr(), value_ptr, &mut len, core::ptr::null_mut(), 0) }; // All the conditions are separated so we can observe them in code coverage. if rc != 0 { return false; } debug_assert_eq!(len, mem::size_of_val(&value)); if len != mem::size_of_val(&value) { return false; } value != 0 } // We do not need to check for the presence of NEON, as Armv8-A always has it const _ASSERT_NEON_DETECTED: () = assert!((CAPS_STATIC & Neon::mask()) == Neon::mask()); let mut features = 0; // TODO(MSRV 1.77): Use c"..." literal. const SHA512_NAME: cstr::Ref = cstr::unwrap_const_from_bytes_with_nul(b"hw.optional.armv8_2_sha512\0"); if detect_feature(SHA512_NAME) { features |= Sha512::mask(); } features } #[cfg(test)] mod tests { use super::*; use crate::cpu; #[test] fn sha512_detection() { // We intentionally disable static feature detection for SHA-512. const _SHA512_NOT_STATICALLY_DETECTED: () = assert!((CAPS_STATIC & Sha512::mask()) == 0); if cfg!(target_os = "macos") { use crate::cpu::{arm::Sha512, GetFeature as _}; // All aarch64-apple-darwin targets have SHA3 enabled statically... assert!(cfg!(target_feature = "sha3")); // ...so we should detect it. let cpu = cpu::features(); assert!(matches!(cpu.get_feature(), Some(Sha512 { .. }))); } } } ring-0.17.14/src/cpu/arm/fuchsia.rs000064400000000000000000000042311046102023000150620ustar 00000000000000// Copyright 2016-2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
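// Fuchsia reports CPU features through the `zx_system_get_features` syscall
// with `ZX_FEATURE_KIND_CPU`; the `ZX_ARM64_FEATURE_ISA_*` bits it returns are
// mapped onto this crate's feature masks below. NEON is not queried because
// Armv8-A always provides it.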
use super::{Aes, Neon, PMull, Sha256, Sha512, CAPS_STATIC}; pub const FORCE_DYNAMIC_DETECTION: u32 = 0; pub fn detect_features() -> u32 { type zx_status_t = i32; #[link(name = "zircon")] extern "C" { fn zx_system_get_features(kind: u32, features: *mut u32) -> zx_status_t; } const ZX_OK: i32 = 0; const ZX_FEATURE_KIND_CPU: u32 = 0; const ZX_ARM64_FEATURE_ISA_AES: u32 = 1 << 3; const ZX_ARM64_FEATURE_ISA_PMULL: u32 = 1 << 4; const ZX_ARM64_FEATURE_ISA_SHA256: u32 = 1 << 6; const ZX_ARM64_FEATURE_ISA_SHA512: u32 = 1 << 18; let mut caps = 0; let rc = unsafe { zx_system_get_features(ZX_FEATURE_KIND_CPU, &mut caps) }; let mut features = 0; // We do not need to check for the presence of NEON, as Armv8-A always has it const _ASSERT_NEON_DETECTED: () = assert!((CAPS_STATIC & Neon::mask()) == Neon::mask()); if rc == ZX_OK { if caps & ZX_ARM64_FEATURE_ISA_AES == ZX_ARM64_FEATURE_ISA_AES { features |= Aes::mask(); } if caps & ZX_ARM64_FEATURE_ISA_PMULL == ZX_ARM64_FEATURE_ISA_PMULL { features |= PMull::mask(); } if caps & ZX_ARM64_FEATURE_ISA_SHA256 == ZX_ARM64_FEATURE_ISA_SHA256 { features |= Sha256::mask(); } if caps & ZX_ARM64_FEATURE_ISA_SHA512 == ZX_ARM64_FEATURE_ISA_SHA512 { features |= Sha512::mask(); } } features } ring-0.17.14/src/cpu/arm/linux.rs000064400000000000000000000071421046102023000146030ustar 00000000000000// Copyright 2016-2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::Neon; // Work around a bug in LLVM/rustc where `-C target_cpu=cortex-a72`-- // and `-C target_cpu=native` on Cortex-A72 Raspberry PI devices in // particular--enables crypto features even though not all Cortex-A72 // CPUs have crypto features: // // ``` // $ rustc --print cfg --target=aarch64-unknown-linux-gnu | grep feature // target_feature="neon" // $ rustc --print cfg --target=aarch64-unknown-linux-gnu -C target_cpu=cortex-a72 | grep feature // target_feature="aes" // target_feature="crc" // target_feature="neon" // target_feature="pmuv3" // target_feature="sha2" // ``` // // XXX/TODO(MSRV https://github.com/llvm/llvm-project/issues/90365): This // workaround is heavy-handed since it forces extra branches for devices that // have correctly-modeled feature sets, so it should be removed. pub const FORCE_DYNAMIC_DETECTION: u32 = !Neon::mask(); // `uclibc` does not provide `getauxval` so just use static feature detection // for it. 
#[cfg(target_env = "uclibc")] pub fn detect_features() -> u32 { 0 } #[cfg(all( not(target_env = "uclibc"), all(target_arch = "aarch64", target_endian = "little") ))] pub fn detect_features() -> u32 { use super::{Aes, PMull, Sha256, Sha512, CAPS_STATIC}; use libc::{getauxval, AT_HWCAP, HWCAP_AES, HWCAP_PMULL, HWCAP_SHA2, HWCAP_SHA512}; let mut features = 0; // We do not need to check for the presence of NEON, as Armv8-A always has it const _ASSERT_NEON_DETECTED: () = assert!((CAPS_STATIC & Neon::mask()) == Neon::mask()); let caps = unsafe { getauxval(AT_HWCAP) }; if caps & HWCAP_AES == HWCAP_AES { features |= Aes::mask(); } if caps & HWCAP_PMULL == HWCAP_PMULL { features |= PMull::mask(); } if caps & HWCAP_SHA2 == HWCAP_SHA2 { features |= Sha256::mask(); } if caps & HWCAP_SHA512 == HWCAP_SHA512 { features |= Sha512::mask(); } features } #[cfg(all( not(target_env = "uclibc"), all(target_arch = "arm", target_endian = "little") ))] pub fn detect_features() -> u32 { use super::CAPS_STATIC; // The `libc` crate doesn't provide this functionality on all // 32-bit Linux targets, like Android or -musl. Use this polyfill // for all 32-bit ARM targets so that testing on one of them will // be more meaningful to the others. use libc::c_ulong; extern "C" { pub fn getauxval(type_: c_ulong) -> c_ulong; } const AT_HWCAP: c_ulong = 16; const HWCAP_NEON: c_ulong = 1 << 12; let mut features = 0; if CAPS_STATIC & Neon::mask() != Neon::mask() { let caps = unsafe { getauxval(AT_HWCAP) }; // OpenSSL and BoringSSL don't enable any other features if NEON isn't // available. We don't enable any hardware implementations for 32-bit ARM. if caps & HWCAP_NEON == HWCAP_NEON { features |= Neon::mask(); } } features } ring-0.17.14/src/cpu/arm/windows.rs000064400000000000000000000027641046102023000151430ustar 00000000000000// Copyright 2016-2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::{Aes, Neon, PMull, Sha256, CAPS_STATIC}; use windows_sys::Win32::System::Threading::{ IsProcessorFeaturePresent, PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE, }; pub const FORCE_DYNAMIC_DETECTION: u32 = 0; pub fn detect_features() -> u32 { // We do not need to check for the presence of NEON, as Armv8-A always has it const _ASSERT_NEON_DETECTED: () = assert!((CAPS_STATIC & Neon::mask()) == Neon::mask()); let mut features = 0; let result = unsafe { IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) }; if result != 0 { // These are all covered by one call in Windows features |= Aes::mask(); features |= PMull::mask(); features |= Sha256::mask(); } features } ring-0.17.14/src/cpu/arm.rs000064400000000000000000000201311046102023000134350ustar 00000000000000// Copyright 2016-2024 Brian Smith. 
// // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::CAPS_STATIC; mod abi_assumptions { use core::mem::size_of; // TODO: Support ARM64_32; see // https://github.com/briansmith/ring/issues/1832#issuecomment-1892928147. This also requires // replacing all `cfg(target_pointer_width)` logic for non-pointer/reference things // (`N0`, `Limb`, `LimbMask`, `crypto_word_t` etc.). #[cfg(target_arch = "aarch64")] const _ASSUMED_POINTER_SIZE: usize = 8; #[cfg(target_arch = "arm")] const _ASSUMED_POINTER_SIZE: usize = 4; const _ASSUMED_USIZE_SIZE: () = assert!(size_of::() == _ASSUMED_POINTER_SIZE); const _ASSUMED_REF_SIZE: () = assert!(size_of::<&'static u8>() == _ASSUMED_POINTER_SIZE); // To support big-endian, we'd need to make several changes as described in // https://github.com/briansmith/ring/issues/1832. const _ASSUMED_ENDIANNESS: () = assert!(cfg!(target_endian = "little")); } // uclibc: When linked statically, uclibc doesn't provide getauxval. // When linked dynamically, recent versions do provide it, but we // want to support older versions too. Assume that if uclibc is being // used, this is an embedded target where the user cares a lot about // minimizing code size and also that they know in advance exactly // what target features are supported, so rely only on static feature // detection. cfg_if::cfg_if! { if #[cfg(all(all(target_arch = "aarch64", target_endian = "little"), any(target_os = "ios", target_os = "macos", target_os = "tvos", target_os = "visionos", target_os = "watchos")))] { mod darwin; use darwin as detect; } else if #[cfg(all(all(target_arch = "aarch64", target_endian = "little"), target_os = "fuchsia"))] { mod fuchsia; use fuchsia as detect; } else if #[cfg(any(target_os = "android", target_os = "linux"))] { mod linux; use linux as detect; } else if #[cfg(all(all(target_arch = "aarch64", target_endian = "little"), target_os = "windows"))] { mod windows; use windows as detect; } else { mod detect { pub const FORCE_DYNAMIC_DETECTION: u32 = 0; pub fn detect_features() -> u32 { 0 } } } } impl_get_feature! { features: [ // TODO(MSRV): 32-bit ARM doesn't have `target_feature = "neon"` yet. { ("aarch64", "arm") => Neon }, // TODO(MSRV): There is no "pmull" feature listed from // `rustc --print cfg --target=aarch64-apple-darwin`. Originally ARMv8 tied // PMULL detection into AES detection, but later versions split it; see // https://developer.arm.com/downloads/-/exploration-tools/feature-names-for-a-profile // "Features introduced prior to 2020." Change this to use "pmull" when // that is supported. { ("aarch64") => PMull }, { ("aarch64") => Aes }, { ("aarch64") => Sha256 }, // Keep in sync with `ARMV8_SHA512`. // "sha3" is overloaded for both SHA-3 and SHA-512. 
{ ("aarch64") => Sha512 }, ], } pub(super) mod featureflags { pub(in super::super) use super::detect::FORCE_DYNAMIC_DETECTION; use super::*; use crate::{ cpu, polyfill::{once_cell::race, usize_from_u32}, }; use core::num::NonZeroUsize; #[cfg(all(target_arch = "arm", target_endian = "little"))] use core::sync::atomic::{AtomicU32, Ordering}; pub(in super::super) fn get_or_init() -> cpu::Features { fn init() -> NonZeroUsize { let detected = detect::detect_features(); let filtered = (if cfg!(feature = "unstable-testing-arm-no-hw") { !Neon::mask() } else { 0 }) | (if cfg!(feature = "unstable-testing-arm-no-neon") { Neon::mask() } else { 0 }); let detected = detected & !filtered; let merged = CAPS_STATIC | detected; #[cfg(all( target_arch = "arm", target_endian = "little", target_has_atomic = "32" ))] if (merged & Neon::mask()) == Neon::mask() { // `neon_available` is declared as `alignas(4) uint32_t` in the C code. // AtomicU32 is `#[repr(C, align(4))]`. prefixed_extern! { static neon_available: AtomicU32; } // SAFETY: The C code only reads `neon_available`, and its // reads are synchronized through the `OnceNonZeroUsize` // Acquire/Release semantics as we ensure we have a // `cpu::Features` instance before calling into the C code. let p = unsafe { &neon_available }; p.store(1, Ordering::Relaxed); } let merged = usize_from_u32(merged) | (1 << (Shift::Initialized as u32)); NonZeroUsize::new(merged).unwrap() // Can't fail because we just set a bit. } // SAFETY: This is the only caller. Any concurrent reading doesn't // affect the safety of the writing. let _: NonZeroUsize = FEATURES.get_or_init(init); // SAFETY: We initialized the CPU features as required. unsafe { cpu::Features::new_after_feature_flags_written_and_synced_unchecked() } } pub(in super::super) fn get(_cpu_features: cpu::Features) -> u32 { // SAFETY: Since only `get_or_init()` could have created // `_cpu_features`, and it only does so after `FEATURES.get_or_init()`, // we know we are reading from `FEATURES` after initializing it. // // Also, 0 means "no features detected" to users, which is designed to // be a safe configuration. let features = FEATURES.get().map(NonZeroUsize::get).unwrap_or(0); // The truncation is lossless, as we set the value with a u32. #[allow(clippy::cast_possible_truncation)] let features = features as u32; features } static FEATURES: race::OnceNonZeroUsize = race::OnceNonZeroUsize::new(); // TODO(MSRV): There is no "pmull" feature listed from // `rustc --print cfg --target=aarch64-apple-darwin`. Originally ARMv8 tied // PMULL detection into AES detection, but later versions split it; see // https://developer.arm.com/downloads/-/exploration-tools/feature-names-for-a-profile // "Features introduced prior to 2020." Change this to use "pmull" when // that is supported. // // "sha3" is overloaded for both SHA-3 and SHA-512. #[cfg(all(target_arch = "aarch64", target_endian = "little"))] #[rustfmt::skip] pub(in super::super) const STATIC_DETECTED: u32 = 0 | (if cfg!(target_feature = "neon") { Neon::mask() } else { 0 }) | (if cfg!(target_feature = "aes") { Aes::mask() } else { 0 }) | (if cfg!(target_feature = "aes") { PMull::mask() } else { 0 }) | (if cfg!(target_feature = "sha2") { Sha256::mask() } else { 0 }) | (if cfg!(target_feature = "sha3") { Sha512::mask() } else { 0 }) ; // TODO(MSRV): 32-bit ARM doesn't support any static feature detection yet. 
#[cfg(all(target_arch = "arm", target_endian = "little"))] pub(in super::super) const STATIC_DETECTED: u32 = 0; } #[allow(clippy::assertions_on_constants)] const _AARCH64_HAS_NEON: () = assert!( ((CAPS_STATIC & Neon::mask()) == Neon::mask()) || !cfg!(all(target_arch = "aarch64", target_endian = "little")) ); ring-0.17.14/src/cpu/intel.rs000064400000000000000000000360741046102023000140060ustar 00000000000000// Copyright 2016-2021 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use cfg_if::cfg_if; mod abi_assumptions { use core::mem::size_of; // TOOD: Support targets that do not have SSE and SSE2 enabled, such as // x86_64-unknown-linux-none. See // https://github.com/briansmith/ring/issues/1793#issuecomment-1793243725, // https://github.com/briansmith/ring/issues/1832, // https://github.com/briansmith/ring/issues/1833. const _ASSUMES_SSE2: () = assert!(cfg!(target_feature = "sse") && cfg!(target_feature = "sse2")); #[cfg(target_arch = "x86_64")] const _ASSUMED_POINTER_SIZE: usize = 8; #[cfg(target_arch = "x86")] const _ASSUMED_POINTER_SIZE: usize = 4; const _ASSUMED_USIZE_SIZE: () = assert!(size_of::() == _ASSUMED_POINTER_SIZE); const _ASSUMED_REF_SIZE: () = assert!(size_of::<&'static u8>() == _ASSUMED_POINTER_SIZE); const _ASSUMED_ENDIANNESS: () = assert!(cfg!(target_endian = "little")); } pub(super) mod featureflags { use super::super::CAPS_STATIC; use crate::{ cpu, polyfill::{once_cell::race, usize_from_u32}, }; use core::num::NonZeroUsize; pub(in super::super) fn get_or_init() -> cpu::Features { // SAFETY: `OPENSSL_cpuid_setup` must be called only in // `INIT.call_once()` below. prefixed_extern! { fn OPENSSL_cpuid_setup(out: &mut [u32; 4]); } let _: NonZeroUsize = FEATURES.get_or_init(|| { let mut cpuid = [0; 4]; // SAFETY: We assume that it is safe to execute CPUID and XGETBV. unsafe { OPENSSL_cpuid_setup(&mut cpuid); } let detected = super::cpuid_to_caps_and_set_c_flags(&cpuid); let merged = CAPS_STATIC | detected; let merged = usize_from_u32(merged) | (1 << (super::Shift::Initialized as u32)); NonZeroUsize::new(merged).unwrap() // Can't fail because we just set a bit. }); // SAFETY: We initialized the CPU features as required. // `INIT.call_once` has `happens-before` semantics. unsafe { cpu::Features::new_after_feature_flags_written_and_synced_unchecked() } } pub(in super::super) fn get(_cpu_features: cpu::Features) -> u32 { // SAFETY: Since only `get_or_init()` could have created // `_cpu_features`, and it only does so after `FEATURES.get_or_init()`, // we know we are reading from `FEATURES` after initializing it. // // Also, 0 means "no features detected" to users, which is designed to // be a safe configuration. let features = FEATURES.get().map(NonZeroUsize::get).unwrap_or(0); // The truncation is lossless, as we set the value with a u32. 
#[allow(clippy::cast_possible_truncation)] let features = features as u32; features } static FEATURES: race::OnceNonZeroUsize = race::OnceNonZeroUsize::new(); #[cfg(target_arch = "x86")] #[rustfmt::skip] pub const STATIC_DETECTED: u32 = 0 | (if cfg!(target_feature = "sse2") { super::Sse2::mask() } else { 0 }) ; // Limited to x86_64-v2 features. // TODO: Add missing x86-64-v3 features if we find real-world use of x86-64-v3. // TODO: Add all features we use. #[cfg(target_arch = "x86_64")] #[rustfmt::skip] pub const STATIC_DETECTED: u32 = 0 | if cfg!(target_feature = "sse4.1") { super::Sse41::mask() } else { 0 } | if cfg!(target_feature = "ssse3") { super::Ssse3::mask() } else { 0 } ; pub const FORCE_DYNAMIC_DETECTION: u32 = 0; } fn cpuid_to_caps_and_set_c_flags(cpuid: &[u32; 4]) -> u32 { // "Intel" citations are for "Intel 64 and IA-32 Architectures Software // Developer’s Manual", Combined Volumes, December 2024. // "AMD" citations are for "AMD64 Technology AMD64 Architecture // Programmer’s Manual, Volumes 1-5" Revision 4.08 April 2024. // The `prefixed_extern!` uses below assume this #[cfg(target_arch = "x86_64")] use core::{mem::align_of, sync::atomic::AtomicU32}; #[cfg(target_arch = "x86_64")] const _ATOMIC32_ALIGNMENT_EQUALS_U32_ALIGNMENT: () = assert!(align_of::() == align_of::()); fn check(leaf: u32, bit: u32) -> bool { let shifted = 1 << bit; (leaf & shifted) == shifted } fn set(out: &mut u32, shift: Shift) { let shifted = 1 << (shift as u32); debug_assert_eq!(*out & shifted, 0); *out |= shifted; debug_assert_eq!(*out & shifted, shifted); } #[cfg(target_arch = "x86_64")] let is_intel = check(cpuid[0], 30); // Synthesized by `OPENSSL_cpuid_setup` // CPUID leaf 1. let leaf1_ecx = cpuid[1]; // Intel: "Structured Extended Feature Flags Enumeration Leaf" #[cfg(target_arch = "x86_64")] let (extended_features_ebx, extended_features_ecx) = (cpuid[2], cpuid[3]); let mut caps = 0; // AMD: "Collectively the SSE1, [...] are referred to as the legacy SSE // instructions. All legacy SSE instructions support 128-bit vector // operands." // Intel: "11.6.2 Checking for Intel SSE and SSE2 Support" // We have to assume the prerequisites for SSE/SSE2 are met since we're // already almost definitely using SSE registers if these target features // are enabled. // // These also seem to help ensure CMOV support; There doesn't seem to be // a `cfg!(target_feature = "cmov")`. It is likely that removing these // assertions will remove the requirement for CMOV. With our without // CMOV, it is likely that some of our timing side channel prevention does // not work. Presumably the people who delete these are verifying that it // all works fine. const _SSE_REQUIRED: () = assert!(cfg!(target_feature = "sse")); const _SSE2_REQUIRED: () = assert!(cfg!(target_feature = "sse2")); #[cfg(all(target_arch = "x86", not(target_feature = "sse2")))] { // If somebody is trying to compile for an x86 target without SSE2 // and they deleted the `_SSE2_REQUIRED` const assertion above then // they're probably trying to support a Linux/BSD/etc. distro that // tries to support ancient x86 systems without SSE/SSE2. Try to // reduce the harm caused, by implementing dynamic feature detection // for them so that most systems will work like normal. // // Note that usually an x86-64 target with SSE2 disabled by default, // usually `-none-` targets, will not support dynamically-detected use // of SIMD registers via CPUID. A whole different mechanism is needed // to support them. Same for i*86-*-none targets. 
let leaf1_edx = cpuid[0]; let sse1_available = check(leaf1_edx, 25); let sse2_available = check(leaf1_edx, 26); if sse1_available && sse2_available { set(&mut caps, Shift::Sse2); } } // Sometimes people delete the `_SSE_REQUIRED`/`_SSE2_REQUIRED` const // assertions in an attempt to support pre-SSE2 32-bit x86 systems. If they // do, hopefully they won't delete these redundant assertions, so that // x86_64 isn't affected. #[cfg(target_arch = "x86_64")] const _SSE2_REQUIRED_X86_64: () = assert!(cfg!(target_feature = "sse2")); #[cfg(target_arch = "x86_64")] const _SSE_REQUIRED_X86_64: () = assert!(cfg!(target_feature = "sse2")); // Intel: "12.7.2 Checking for SSSE3 Support" // If/when we support dynamic detection of SSE/SSE2, make this conditional // on SSE/SSE2. if check(leaf1_ecx, 9) { set(&mut caps, Shift::Ssse3); } // Intel: "12.12.2 Checking for Intel SSE4.1 Support" // If/when we support dynamic detection of SSE/SSE2, make this conditional // on SSE/SSE2. // XXX: We don't check for SSE3 and we're not sure if it is compatible for // us to do so; does AMD advertise SSE3? TODO: address this. // XXX: We don't condition this on SSSE3 being available. TODO: address // this. #[cfg(target_arch = "x86_64")] if check(leaf1_ecx, 19) { set(&mut caps, Shift::Sse41); } // AMD: "The extended SSE instructions include [...]." // Intel: "14.3 DETECTION OF INTEL AVX INSTRUCTIONS" // `OPENSSL_cpuid_setup` clears this bit when it detects the OS doesn't // support AVX state. let avx_available = check(leaf1_ecx, 28); if avx_available { set(&mut caps, Shift::Avx); } #[cfg(target_arch = "x86_64")] if avx_available { // The Intel docs don't seem to document the detection. The instruction // definitions of the VEX.256 instructions reference the // VAES/VPCLMULQDQ features and the documentation for the extended // features gives the values. We combine these into one feature because // we never use them independently. let vaes_available = check(extended_features_ecx, 9); let vclmul_available = check(extended_features_ecx, 10); if vaes_available && vclmul_available { set(&mut caps, Shift::VAesClmul); } } // "14.7.1 Detection of Intel AVX2 Hardware support" // XXX: We don't condition AVX2 on AVX. TODO: Address this. // `OPENSSL_cpuid_setup` clears this bit when it detects the OS doesn't // support AVX state. #[cfg(target_arch = "x86_64")] if check(extended_features_ebx, 5) { set(&mut caps, Shift::Avx2); // Declared as `uint32_t` in the C code. prefixed_extern! { static avx2_available: AtomicU32; } // SAFETY: The C code only reads `avx2_available`, and its reads are // synchronized through the `OnceNonZeroUsize` Acquire/Release // semantics as we ensure we have a `cpu::Features` instance before // calling into the C code. let flag = unsafe { &avx2_available }; flag.store(1, core::sync::atomic::Ordering::Relaxed); } // Intel: "12.13.4 Checking for Intel AES-NI Support" // If/when we support dynamic detection of SSE/SSE2, revisit this. // TODO: Clarify "interesting" states like (!SSE && AVX && AES-NI) // and AES-NI & !AVX. // Each check of `ClMul`, `Aes`, and `Sha` must be paired with a check for // an AVX feature (e.g. `Avx`) or an SSE feature (e.g. `Ssse3`), as every // use will either be supported by SSE* or AVX* instructions. We then // assume that those supporting instructions' prerequisites (e.g. OS // support for AVX or SSE state, respectively) are the only prerequisites // for these features. 
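// As a concrete illustration of the `check()`/`set()` pattern used below:
// in CPUID leaf 1 ECX, bit 1 is PCLMULQDQ and bit 25 is AES-NI, so
//
// ```
// check(leaf1_ecx, 1)  // true iff (leaf1_ecx & (1 << 1)) == (1 << 1)
// check(leaf1_ecx, 25) // true iff (leaf1_ecx & (1 << 25)) == (1 << 25)
// ```
//
// and each successful check ORs the corresponding `Shift` bit into `caps`
// via `set()`.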
if check(leaf1_ecx, 1) { set(&mut caps, Shift::ClMul); } if check(leaf1_ecx, 25) { set(&mut caps, Shift::Aes); } // See BoringSSL 69c26de93c82ad98daecaec6e0c8644cdf74b03f before enabling // static feature detection for this. #[cfg(target_arch = "x86_64")] if check(extended_features_ebx, 29) { set(&mut caps, Shift::Sha); } #[cfg(target_arch = "x86_64")] { if is_intel { set(&mut caps, Shift::IntelCpu); } if check(leaf1_ecx, 22) { set(&mut caps, Shift::Movbe); } let adx_available = check(extended_features_ebx, 19); if adx_available { set(&mut caps, Shift::Adx); } // Some 6th Generation (Skylake) CPUs claim to support BMI1 and BMI2 // when they don't; see erratum "SKD052". The Intel document at // https://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/6th-gen-core-u-y-spec-update.pdf // contains the footnote "Affects 6th Generation Intel Pentium processor // family and Intel Celeron processor family". Further research indicates // that Skylake Pentium/Celeron do not implement AVX or ADX. It turns // out that we only use BMI1 and BMI2 in combination with ADX and/or // AVX. // // rust `std::arch::is_x86_feature_detected` does a very similar thing // but only looks at AVX, not ADX. Note that they reference an older // version of the erratum labeled SKL052. let believe_bmi_bits = !is_intel || (adx_available || avx_available); if check(extended_features_ebx, 3) && believe_bmi_bits { set(&mut caps, Shift::Bmi1); } let bmi2_available = check(extended_features_ebx, 8) && believe_bmi_bits; if bmi2_available { set(&mut caps, Shift::Bmi2); } if adx_available && bmi2_available { // Declared as `uint32_t` in the C code. prefixed_extern! { static adx_bmi2_available: AtomicU32; } // SAFETY: The C code only reads `adx_bmi2_available`, and its // reads are synchronized through the `OnceNonZeroUsize` // Acquire/Release semantics as we ensure we have a // `cpu::Features` instance before calling into the C code. let flag = unsafe { &adx_bmi2_available }; flag.store(1, core::sync::atomic::Ordering::Relaxed); } } caps } impl_get_feature! { features: [ { ("x86_64") => VAesClmul }, { ("x86", "x86_64") => ClMul }, { ("x86", "x86_64") => Ssse3 }, { ("x86_64") => Sse41 }, { ("x86_64") => Movbe }, { ("x86", "x86_64") => Aes }, { ("x86", "x86_64") => Avx }, { ("x86_64") => Bmi1 }, { ("x86_64") => Avx2 }, { ("x86_64") => Bmi2 }, { ("x86_64") => Adx }, // See BoringSSL 69c26de93c82ad98daecaec6e0c8644cdf74b03f before enabling // static feature detection for this. { ("x86_64") => Sha }, // x86_64 can just assume SSE2 is available. { ("x86") => Sse2 }, ], } cfg_if! { if #[cfg(target_arch = "x86_64")] { #[derive(Clone, Copy)] pub(crate) struct IntelCpu(super::Features); impl super::GetFeature for super::features::Values { fn get_feature(&self) -> Option { const MASK: u32 = 1 << (Shift::IntelCpu as u32); if (self.values() & MASK) == MASK { Some(IntelCpu(self.cpu())) } else { None } } } } } #[cfg(test)] mod tests { // This should always pass on any x86 system except very, very, old ones. #[cfg(target_arch = "x86")] #[test] fn x86_has_sse2() { use super::*; use crate::cpu::{self, GetFeature as _}; assert!(matches!(cpu::features().get_feature(), Some(Sse2 { .. }))) } } ring-0.17.14/src/cpu.rs000064400000000000000000000154271046102023000126720ustar 00000000000000// Copyright 2016 Brian Smith. 
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

pub(crate) use self::features::Features;
use core::mem::size_of;

macro_rules! impl_get_feature {
    { features: [ $( { ( $( $arch:expr ),+ ) => $Name:ident }, )+ ], } => {
        $(
            #[cfg(any( $( target_arch = $arch ),+ ))]
            #[derive(Clone, Copy)]
            pub(crate) struct $Name(crate::cpu::Features);

            #[cfg(any( $( target_arch = $arch ),+ ))]
            impl $Name {
                const fn mask() -> u32 {
                    1 << (Shift::$Name as u32)
                }
            }

            #[cfg(any( $( target_arch = $arch ),+ ))]
            impl crate::cpu::GetFeature<$Name> for super::features::Values {
                #[inline(always)]
                fn get_feature(&self) -> Option<$Name> {
                    const MASK: u32 = $Name::mask();
                    const STATICALLY_DETECTED: bool = (crate::cpu::CAPS_STATIC & MASK) == MASK;
                    if STATICALLY_DETECTED {
                        // TODO: `const`
                        return Some($Name(self.cpu()));
                    }

                    if (self.values() & MASK) == MASK {
                        Some($Name(self.cpu()))
                    } else {
                        None
                    }
                }
            }
        )+

        #[repr(u32)]
        enum Shift {
            $(
                #[cfg(any( $( target_arch = $arch ),+ ))]
                $Name,
            )+

            #[cfg(target_arch = "x86_64")]
            IntelCpu,

            #[cfg(any(all(target_arch = "aarch64", target_endian = "little"),
                      all(target_arch = "arm", target_endian = "little"),
                      target_arch = "x86", target_arch = "x86_64"))]
            // Synthesized to ensure the dynamic flag set is always non-zero.
            //
            // Keep this at the end as it is never checked except during init.
            Initialized,
        }
    }
}

pub(crate) trait GetFeature<T> {
    fn get_feature(&self) -> Option<T>;
}

impl GetFeature<()> for features::Values {
    #[inline(always)]
    fn get_feature(&self) -> Option<()> {
        Some(())
    }
}

impl<A, B> GetFeature<(A, B)> for features::Values
where
    features::Values: GetFeature<A>,
    features::Values: GetFeature<B>,
{
    #[inline(always)]
    fn get_feature(&self) -> Option<(A, B)> {
        match (self.get_feature(), self.get_feature()) {
            (Some(a), Some(b)) => Some((a, b)),
            _ => None,
        }
    }
}

impl<A, B, C> GetFeature<(A, B, C)> for features::Values
where
    features::Values: GetFeature<A>,
    features::Values: GetFeature<B>,
    features::Values: GetFeature<C>,
{
    #[inline(always)]
    fn get_feature(&self) -> Option<(A, B, C)> {
        match (self.get_feature(), self.get_feature(), self.get_feature()) {
            (Some(a), Some(b), Some(c)) => Some((a, b, c)),
            _ => None,
        }
    }
}

impl<T> GetFeature<T> for Features
where
    features::Values: GetFeature<T>,
{
    #[inline(always)]
    fn get_feature(&self) -> Option<T> {
        self.values().get_feature()
    }
}

#[inline(always)]
pub(crate) fn features() -> Features {
    featureflags::get_or_init()
}

mod features {
    use crate::polyfill::NotSend;

    /// A witness indicating that CPU features have been detected and cached.
    ///
    /// This is a zero-sized type so that it can be "stored" wherever convenient.
    #[derive(Copy, Clone)]
    pub(crate) struct Features(NotSend);

    impl Features {
        pub fn values(self) -> Values {
            Values {
                values: super::featureflags::get(self),
                cpu: self,
            }
        }
    }

    cfg_if::cfg_if!
{ if #[cfg(any(all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little"), target_arch = "x86", target_arch = "x86_64"))] { impl Features { // SAFETY: This must only be called after CPU features have been written // and synchronized. pub(super) unsafe fn new_after_feature_flags_written_and_synced_unchecked() -> Self { Self(NotSend::VALUE) } } } else { impl Features { pub(super) fn new_no_features_to_detect() -> Self { Self(NotSend::VALUE) } } } } pub struct Values { values: u32, cpu: Features, } impl Values { #[inline(always)] pub(super) fn values(&self) -> u32 { self.values } #[inline(always)] pub(super) fn cpu(&self) -> Features { self.cpu } } } const _: () = assert!(size_of::() == 0); cfg_if::cfg_if! { if #[cfg(any(all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little")))] { pub mod arm; use arm::featureflags; } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { pub mod intel; use intel::featureflags; } else { mod featureflags { use super::Features; #[inline(always)] pub(super) fn get_or_init() -> Features { Features::new_no_features_to_detect() } #[inline(always)] pub(super) fn get(_cpu_features: Features) -> u32 { STATIC_DETECTED } pub(super) const STATIC_DETECTED: u32 = 0; pub(super) const FORCE_DYNAMIC_DETECTION: u32 = 0; } } } const CAPS_STATIC: u32 = featureflags::STATIC_DETECTED & !featureflags::FORCE_DYNAMIC_DETECTION; #[allow(clippy::assertions_on_constants, clippy::bad_bit_mask)] const _FORCE_DYNAMIC_DETECTION_HONORED: () = assert!((CAPS_STATIC & featureflags::FORCE_DYNAMIC_DETECTION) == 0); #[cfg(test)] mod tests { use super::*; #[test] fn test_static_is_subset_of_dynamic() { let cpu = features(); let dynamic = featureflags::get(cpu); assert_eq!(dynamic & CAPS_STATIC, CAPS_STATIC); } } ring-0.17.14/src/data/alg-rsa-encryption.der000064400000000000000000000000151046102023000166430ustar 00000000000000 *H ring-0.17.14/src/debug.rs000064400000000000000000000055101046102023000131610ustar 00000000000000// Copyright 2018 Trent Clarke. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. // Generates an implementation of the Debug trait for a type that defers to the // Debug implementation for a given field. macro_rules! derive_debug_via_id { ($typename:ident) => { impl ::core::fmt::Debug for $typename { fn fmt(&self, f: &mut ::core::fmt::Formatter) -> Result<(), ::core::fmt::Error> { ::core::fmt::Debug::fmt(&self.id, f) } } }; } macro_rules! 
derive_debug_via_field { ($type:ty, $field:ident) => { derive_debug_via_field!($type, stringify!($type), $field); }; ($type:ty, $typename:expr, $field:ident) => { impl ::core::fmt::Debug for $type { fn fmt(&self, f: &mut ::core::fmt::Formatter) -> Result<(), ::core::fmt::Error> { f.debug_struct($typename) .field(stringify!($field), &self.$field) .finish() } } }; } // Generates an implementation of the Debug trait for a type that outputs the // hex encoding of the byte slice representation of the value. macro_rules! derive_debug_self_as_ref_hex_bytes { ($typename:ident) => { impl ::core::fmt::Debug for $typename { fn fmt(&self, f: &mut ::core::fmt::Formatter) -> Result<(), ::core::fmt::Error> { crate::debug::write_hex_tuple(f, stringify!($typename), self) } } }; } pub(crate) fn write_hex_tuple( fmt: &mut core::fmt::Formatter, type_name: &str, value: &dyn AsRef<[u8]>, ) -> Result<(), ::core::fmt::Error> { fmt.debug_tuple(type_name) .field(&HexStr(value.as_ref())) .finish() } pub struct HexStr<'a>(pub &'a [u8]); impl core::fmt::Debug for HexStr<'_> { fn fmt(&self, fmt: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> { fmt.write_str("\"")?; write_hex_bytes(fmt, self.0)?; fmt.write_str("\"")?; Ok(()) } } pub(crate) fn write_hex_bytes( fmt: &mut core::fmt::Formatter, bytes: &[u8], ) -> Result<(), ::core::fmt::Error> { for byte in bytes { write!(fmt, "{:02x}", byte)?; } Ok(()) } ring-0.17.14/src/deprecated_constant_time.rs000064400000000000000000000020401046102023000171150ustar 00000000000000// Copyright 2015-2025 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use crate::{bb, error}; #[deprecated( note = "To be removed. Internal function not intended for external use with no promises regarding side channels." )] pub fn verify_slices_are_equal(a: &[u8], b: &[u8]) -> Result<(), error::Unspecified> { bb::verify_slices_are_equal(a, b) } ring-0.17.14/src/deprecated_test.rs000064400000000000000000000034421046102023000152340ustar 00000000000000// Copyright 2025 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #![doc(hidden)] /// References a test input file. #[macro_export] macro_rules! 
test_file { ($file_name:expr) => { $crate::test::File { file_name: $file_name, contents: include_str!($file_name), } }; } pub use crate::testutil::{ compile_time_assert_clone, compile_time_assert_copy, compile_time_assert_eq, compile_time_assert_send, compile_time_assert_sync, from_hex, run, File, TestCase, }; #[cfg(feature = "std")] pub use crate::testutil::compile_time_assert_std_error_error; #[deprecated(note = "internal API that will be removed")] #[doc(hidden)] pub mod rand { #[deprecated(note = "internal API that will be removed")] pub type FixedByteRandom = crate::testutil::rand::FixedByteRandom; #[deprecated(note = "internal API that will be removed")] pub type FixedSliceRandom<'a> = crate::testutil::rand::FixedSliceRandom<'a>; #[deprecated(note = "internal API that will be removed")] pub type FixedSliceSequenceRandom<'a> = crate::testutil::rand::FixedSliceSequenceRandom<'a>; } ring-0.17.14/src/digest/dynstate.rs000064400000000000000000000060421046102023000152060ustar 00000000000000// Copyright 2015-2019 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::{format_output, sha1, sha2, Output}; use crate::{cpu, polyfill::slice}; use core::mem::size_of; // Invariant: When constructed with `new32` (resp. `new64`), `As32` (resp. // `As64`) is the active variant. // Invariant: The active variant never changes after initialization. 
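//
// A sketch of how these invariants are relied upon (illustrative only;
// `initial` stands for an initial `sha2::State32`): a SHA-256 context is
// constructed with `new32`, so its variant is always `As32`, which lets
// `sha256_block_data_order` below treat the `As64` arm as unreachable:
//
// ```
// let mut state = DynState::new32(initial);        // always `As32`
// sha256_block_data_order(&mut state, data, cpu);  // matches only `As32`
// ```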
#[derive(Clone)]
pub(super) enum DynState {
    As64(sha2::State64),
    As32(sha2::State32),
}

impl DynState {
    pub const fn new32(initial_state: sha2::State32) -> Self {
        Self::As32(initial_state)
    }

    pub const fn new64(initial_state: sha2::State64) -> Self {
        Self::As64(initial_state)
    }

    pub fn format_output(self) -> Output {
        match self {
            Self::As64(state) => {
                format_output::<_, _, { size_of::<u64>() }>(state, u64::to_be_bytes)
            }
            Self::As32(state) => {
                format_output::<_, _, { size_of::<u32>() }>(state, u32::to_be_bytes)
            }
        }
    }
}

pub(super) fn sha1_block_data_order<'d>(
    state: &mut DynState,
    data: &'d [u8],
    _cpu_features: cpu::Features,
) -> (usize, &'d [u8]) {
    let state = match state {
        DynState::As32(state) => state,
        _ => {
            unreachable!();
        }
    };
    let (full_blocks, leftover) = slice::as_chunks(data);
    sha1::sha1_block_data_order(state, full_blocks);
    (full_blocks.as_flattened().len(), leftover)
}

pub(super) fn sha256_block_data_order<'d>(
    state: &mut DynState,
    data: &'d [u8],
    cpu_features: cpu::Features,
) -> (usize, &'d [u8]) {
    let state = match state {
        DynState::As32(state) => state,
        _ => {
            unreachable!();
        }
    };
    let (full_blocks, leftover) = slice::as_chunks(data);
    sha2::block_data_order_32(state, full_blocks, cpu_features);
    (full_blocks.len() * sha2::SHA256_BLOCK_LEN.into(), leftover)
}

pub(super) fn sha512_block_data_order<'d>(
    state: &mut DynState,
    data: &'d [u8],
    cpu_features: cpu::Features,
) -> (usize, &'d [u8]) {
    let state = match state {
        DynState::As64(state) => state,
        _ => {
            unreachable!();
        }
    };
    let (full_blocks, leftover) = slice::as_chunks(data);
    sha2::block_data_order_64(state, full_blocks, cpu_features);
    (full_blocks.len() * sha2::SHA512_BLOCK_LEN.into(), leftover)
}
ring-0.17.14/src/digest/sha1.rs000064400000000000000000000071041046102023000142070ustar 00000000000000// Copyright 2015-2025 Brian Smith.
// Copyright 2016 Simon Sapin.
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

use super::{
    sha2::{
        fallback::{ch, maj, Word},
        State32,
    },
    BlockLen, OutputLen,
};
use crate::polyfill::slice::{self, AsChunks};
use core::{mem::size_of, num::Wrapping};

pub(super) const BLOCK_LEN: BlockLen = BlockLen::_512;
pub const CHAINING_LEN: usize = 160 / 8;
pub(super) const OUTPUT_LEN: OutputLen = OutputLen::_160;
const CHAINING_WORDS: usize = CHAINING_LEN / 4;

type W32 = Wrapping<u32>;

// FIPS 180-4 4.1.1
#[inline]
fn parity(x: W32, y: W32, z: W32) -> W32 {
    x ^ y ^ z
}

type State = [W32; CHAINING_WORDS];
const ROUNDS: usize = 80;

pub fn sha1_block_data_order(state: &mut State32, data: AsChunks<u8, { BLOCK_LEN.into() }>) {
    // The unwrap won't fail because `CHAINING_WORDS` is smaller than the
    // length.
    let state: &mut State = (&mut state[..CHAINING_WORDS]).try_into().unwrap();

    // SAFETY: The caller guarantees that this is called with data pointing to `num`
    // `BLOCK_LEN`-long blocks.
*state = block_data_order(*state, data) } #[inline] #[rustfmt::skip] fn block_data_order( mut H: [W32; CHAINING_WORDS], M: AsChunks, ) -> [W32; CHAINING_WORDS] { for M in M { let (M, remainder): (AsChunks()}>, &[u8]) = slice::as_chunks(M); debug_assert!(remainder.is_empty()); // FIPS 180-4 6.1.2 Step 1 let mut W: [W32; ROUNDS] = [W32::ZERO; ROUNDS]; W.iter_mut().zip(M).for_each(|(Wt, Mt)| { *Wt = W32::from_be_bytes(*Mt); }); for t in 16..ROUNDS { let wt = W[t - 3] ^ W[t - 8] ^ W[t - 14] ^ W[t - 16]; W[t] = rotl(wt, 1); } // FIPS 180-4 6.1.2 Step 2 let [a, b, c, d, e] = H; // FIPS 180-4 6.1.2 Step 3 with constants and functions from FIPS 180-4 {4.1.1, 4.2.1} let (a, b, c, d, e) = step3(a, b, c, d, e, &W, 0, Wrapping(0x5a827999), ch); let (a, b, c, d, e) = step3(a, b, c, d, e, &W, 20, Wrapping(0x6ed9eba1), parity); let (a, b, c, d, e) = step3(a, b, c, d, e, &W, 40, Wrapping(0x8f1bbcdc), maj); let (a, b, c, d, e) = step3(a, b, c, d, e, &W, 60, Wrapping(0xca62c1d6), parity); // FIPS 180-4 6.1.2 Step 4 H[0] += a; H[1] += b; H[2] += c; H[3] += d; H[4] += e; } H } #[inline(always)] fn step3( mut a: W32, mut b: W32, mut c: W32, mut d: W32, mut e: W32, W: &[W32; 80], t: usize, k: W32, f: impl Fn(W32, W32, W32) -> W32, ) -> (W32, W32, W32, W32, W32) { let W = &W[t..(t + 20)]; for W_t in W.iter() { let T = rotl(a, 5) + f(b, c, d) + e + k + W_t; e = d; d = c; c = rotl(b, 30); b = a; a = T; } (a, b, c, d, e) } #[inline(always)] fn rotl(x: W32, n: u32) -> W32 { Wrapping(x.0.rotate_left(n)) } ring-0.17.14/src/digest/sha2/fallback.rs000064400000000000000000000246571046102023000157630ustar 00000000000000// Copyright 2019-2025 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::CHAINING_WORDS; use crate::polyfill::slice::{self, AsChunks}; use core::{ num::Wrapping, ops::{Add, AddAssign, BitAnd, BitOr, BitXor, Not, Shr}, }; #[cfg_attr( any( all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little"), target_arch = "x86_64" ), allow(dead_code) )] #[inline] pub(super) fn block_data_order( mut H: [S; CHAINING_WORDS], M: AsChunks, ) -> [S; CHAINING_WORDS] where for<'a> &'a S::InputBytes: From<&'a [u8; BYTES_LEN]>, { for M in M { let (M, remainder): (AsChunks, &[u8]) = slice::as_chunks(M); debug_assert!(remainder.is_empty()); // FIPS 180-4 {6.2.2, 6.4.2} Step 1 // // TODO(MSRV): Use `let W: [S::from(0); S::ROUNDS]` instead; depends on // https://github.com/rust-lang/rust/issues/43408. 
let mut W = S::zero_w(); let W = W.as_mut(); W.iter_mut().zip(M).for_each(|(Wt, Mt)| { let Mt: &S::InputBytes = Mt.into(); *Wt = S::from_be_bytes(*Mt); }); for t in 16..S::ROUNDS { W[t] = sigma_1(W[t - 2]) + W[t - 7] + sigma_0(W[t - 15]) + W[t - 16] } // FIPS 180-4 {6.2.2, 6.4.2} Step 2 let [mut a, mut b, mut c, mut d, mut e, mut f, mut g, mut h] = H; // FIPS 180-4 {6.2.2, 6.4.2} Step 3 for (Kt, Wt) in S::K.as_ref().iter().zip(W.iter()) { let T1 = h + SIGMA_1(e) + ch(e, f, g) + *Kt + *Wt; let T2 = SIGMA_0(a) + maj(a, b, c); h = g; g = f; f = e; e = d + T1; d = c; c = b; b = a; a = T1 + T2; } // FIPS 180-4 {6.2.2, 6.4.2} Step 4 H[0] += a; H[1] += b; H[2] += c; H[3] += d; H[4] += e; H[5] += f; H[6] += g; H[7] += h; } H } // FIPS 180-4 {4.1.1, 4.1.2, 4.1.3} #[inline(always)] pub(in super::super) fn ch(x: W, y: W, z: W) -> W { (x & y) | (!x & z) } // FIPS 180-4 {4.1.1, 4.1.2, 4.1.3} #[inline(always)] pub(in super::super) fn maj(x: W, y: W, z: W) -> W { (x & y) | (x & z) | (y & z) } // FIPS 180-4 {4.1.2, 4.1.3} #[inline(always)] fn SIGMA_0(x: S) -> S { x.rotr(S::BIG_SIGMA_0.0) ^ x.rotr(S::BIG_SIGMA_0.1) ^ x.rotr(S::BIG_SIGMA_0.2) } // FIPS 180-4 {4.1.2, 4.1.3} #[inline(always)] fn SIGMA_1(x: S) -> S { x.rotr(S::BIG_SIGMA_1.0) ^ x.rotr(S::BIG_SIGMA_1.1) ^ x.rotr(S::BIG_SIGMA_1.2) } // FIPS 180-4 {4.1.2, 4.1.3} #[inline(always)] fn sigma_0(x: S) -> S { x.rotr(S::SMALL_SIGMA_0.0) ^ x.rotr(S::SMALL_SIGMA_0.1) ^ (x >> S::SMALL_SIGMA_0.2) } // FIPS 180-4 {4.1.2, 4.1.3} #[inline(always)] fn sigma_1(x: S) -> S { x.rotr(S::SMALL_SIGMA_1.0) ^ x.rotr(S::SMALL_SIGMA_1.1) ^ (x >> S::SMALL_SIGMA_1.2) } // Commonality between SHA-1 and SHA-2 words. pub(in super::super) trait Word: 'static + Sized + Copy + Add + AddAssign + BitAnd + BitOr + Not { const ZERO: Self; type InputBytes: Copy; fn from_be_bytes(input: Self::InputBytes) -> Self; fn rotr(self, count: u32) -> Self; } /// A SHA-2 input word. 
pub(super) trait Sha2: Word + BitXor + Shr { const BIG_SIGMA_0: (u32, u32, u32); const BIG_SIGMA_1: (u32, u32, u32); const SMALL_SIGMA_0: (u32, u32, usize); const SMALL_SIGMA_1: (u32, u32, usize); const ROUNDS: usize; type W: AsRef<[Self]> + AsMut<[Self]>; fn zero_w() -> Self::W; const K: &'static Self::W; } impl Word for Wrapping { const ZERO: Self = Self(0); type InputBytes = [u8; 4]; #[inline(always)] fn from_be_bytes(input: Self::InputBytes) -> Self { Self(u32::from_be_bytes(input)) } #[inline(always)] fn rotr(self, count: u32) -> Self { Self(self.0.rotate_right(count)) } } // SHA-256 impl Sha2 for Wrapping { // FIPS 180-4 4.1.2 const BIG_SIGMA_0: (u32, u32, u32) = (2, 13, 22); const BIG_SIGMA_1: (u32, u32, u32) = (6, 11, 25); const SMALL_SIGMA_0: (u32, u32, usize) = (7, 18, 3); const SMALL_SIGMA_1: (u32, u32, usize) = (17, 19, 10); // FIPS 180-4 {6.2.2} Step 1 const ROUNDS: usize = 64; type W = [Self; Self::ROUNDS]; fn zero_w() -> Self::W { [Self::ZERO; Self::ROUNDS] } // FIPS 180-4 4.2.2 const K: &'static Self::W = &[ Self(0x428a2f98), Self(0x71374491), Self(0xb5c0fbcf), Self(0xe9b5dba5), Self(0x3956c25b), Self(0x59f111f1), Self(0x923f82a4), Self(0xab1c5ed5), Self(0xd807aa98), Self(0x12835b01), Self(0x243185be), Self(0x550c7dc3), Self(0x72be5d74), Self(0x80deb1fe), Self(0x9bdc06a7), Self(0xc19bf174), Self(0xe49b69c1), Self(0xefbe4786), Self(0x0fc19dc6), Self(0x240ca1cc), Self(0x2de92c6f), Self(0x4a7484aa), Self(0x5cb0a9dc), Self(0x76f988da), Self(0x983e5152), Self(0xa831c66d), Self(0xb00327c8), Self(0xbf597fc7), Self(0xc6e00bf3), Self(0xd5a79147), Self(0x06ca6351), Self(0x14292967), Self(0x27b70a85), Self(0x2e1b2138), Self(0x4d2c6dfc), Self(0x53380d13), Self(0x650a7354), Self(0x766a0abb), Self(0x81c2c92e), Self(0x92722c85), Self(0xa2bfe8a1), Self(0xa81a664b), Self(0xc24b8b70), Self(0xc76c51a3), Self(0xd192e819), Self(0xd6990624), Self(0xf40e3585), Self(0x106aa070), Self(0x19a4c116), Self(0x1e376c08), Self(0x2748774c), Self(0x34b0bcb5), Self(0x391c0cb3), Self(0x4ed8aa4a), Self(0x5b9cca4f), Self(0x682e6ff3), Self(0x748f82ee), Self(0x78a5636f), Self(0x84c87814), Self(0x8cc70208), Self(0x90befffa), Self(0xa4506ceb), Self(0xbef9a3f7), Self(0xc67178f2), ]; } impl Word for Wrapping { const ZERO: Self = Self(0); type InputBytes = [u8; 8]; #[inline(always)] fn from_be_bytes(input: Self::InputBytes) -> Self { Self(u64::from_be_bytes(input)) } #[inline(always)] fn rotr(self, count: u32) -> Self { Self(self.0.rotate_right(count)) } } // SHA-384 and SHA-512 impl Sha2 for Wrapping { // FIPS 180-4 4.1.3 const BIG_SIGMA_0: (u32, u32, u32) = (28, 34, 39); const BIG_SIGMA_1: (u32, u32, u32) = (14, 18, 41); const SMALL_SIGMA_0: (u32, u32, usize) = (1, 8, 7); const SMALL_SIGMA_1: (u32, u32, usize) = (19, 61, 6); // FIPS 180-4 {6.4.2} Step 1 const ROUNDS: usize = 80; type W = [Self; Self::ROUNDS]; fn zero_w() -> Self::W { [Self::ZERO; Self::ROUNDS] } // FIPS 180-4 4.2.3 const K: &'static Self::W = &[ Self(0x428a2f98d728ae22), Self(0x7137449123ef65cd), Self(0xb5c0fbcfec4d3b2f), Self(0xe9b5dba58189dbbc), Self(0x3956c25bf348b538), Self(0x59f111f1b605d019), Self(0x923f82a4af194f9b), Self(0xab1c5ed5da6d8118), Self(0xd807aa98a3030242), Self(0x12835b0145706fbe), Self(0x243185be4ee4b28c), Self(0x550c7dc3d5ffb4e2), Self(0x72be5d74f27b896f), Self(0x80deb1fe3b1696b1), Self(0x9bdc06a725c71235), Self(0xc19bf174cf692694), Self(0xe49b69c19ef14ad2), Self(0xefbe4786384f25e3), Self(0x0fc19dc68b8cd5b5), Self(0x240ca1cc77ac9c65), Self(0x2de92c6f592b0275), Self(0x4a7484aa6ea6e483), Self(0x5cb0a9dcbd41fbd4), 
Self(0x76f988da831153b5), Self(0x983e5152ee66dfab), Self(0xa831c66d2db43210), Self(0xb00327c898fb213f), Self(0xbf597fc7beef0ee4), Self(0xc6e00bf33da88fc2), Self(0xd5a79147930aa725), Self(0x06ca6351e003826f), Self(0x142929670a0e6e70), Self(0x27b70a8546d22ffc), Self(0x2e1b21385c26c926), Self(0x4d2c6dfc5ac42aed), Self(0x53380d139d95b3df), Self(0x650a73548baf63de), Self(0x766a0abb3c77b2a8), Self(0x81c2c92e47edaee6), Self(0x92722c851482353b), Self(0xa2bfe8a14cf10364), Self(0xa81a664bbc423001), Self(0xc24b8b70d0f89791), Self(0xc76c51a30654be30), Self(0xd192e819d6ef5218), Self(0xd69906245565a910), Self(0xf40e35855771202a), Self(0x106aa07032bbd1b8), Self(0x19a4c116b8d2d0c8), Self(0x1e376c085141ab53), Self(0x2748774cdf8eeb99), Self(0x34b0bcb5e19b48a8), Self(0x391c0cb3c5c95a63), Self(0x4ed8aa4ae3418acb), Self(0x5b9cca4f7763e373), Self(0x682e6ff3d6b2b8a3), Self(0x748f82ee5defb2fc), Self(0x78a5636f43172f60), Self(0x84c87814a1f0ab72), Self(0x8cc702081a6439ec), Self(0x90befffa23631e28), Self(0xa4506cebde82bde9), Self(0xbef9a3f7b2c67915), Self(0xc67178f2e372532b), Self(0xca273eceea26619c), Self(0xd186b8c721c0c207), Self(0xeada7dd6cde0eb1e), Self(0xf57d4f7fee6ed178), Self(0x06f067aa72176fba), Self(0x0a637dc5a2c898a6), Self(0x113f9804bef90dae), Self(0x1b710b35131c471b), Self(0x28db77f523047d84), Self(0x32caab7b40c72493), Self(0x3c9ebe0a15c9bebc), Self(0x431d67c49c100d4c), Self(0x4cc5d4becb3e42b6), Self(0x597f299cfc657e2a), Self(0x5fcb6fab3ad6faec), Self(0x6c44198c4a475817), ]; } ring-0.17.14/src/digest/sha2/ffi.rs000064400000000000000000000054151046102023000147570ustar 00000000000000// Copyright 2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::CHAINING_WORDS; use crate::polyfill::slice::AsChunks; use core::num::{NonZeroUsize, Wrapping}; /// `unsafe { T => f }` means it is safe to call `f` iff we can construct /// a value of type `T`. macro_rules! sha2_ffi { ( $U:ty, $BLOCK_LEN:expr, unsafe { $Cpu:ty => $f:ident }, $state:expr, $data:expr, $cpu:expr $(,)? ) => {{ prefixed_extern! { fn $f( state: *mut [core::num::Wrapping<$U>; crate::digest::sha2::CHAINING_WORDS], data: *const [u8; $BLOCK_LEN], num: crate::c::NonZero_size_t); } // SAFETY: The user asserts that $f has the signature above and is safe // to call if additionally we have a value of type `$Cpu`, which we do. unsafe { crate::digest::sha2::ffi::sha2_ffi::<$U, $Cpu, { $BLOCK_LEN }>($state, $data, $cpu, $f) } }}; } macro_rules! sha2_32_ffi { ( unsafe { $Cpu:ty => $f:ident }, $state:expr, $data:expr, $cpu:expr $(,)? ) => { sha2_ffi!(u32, crate::digest::sha2::SHA256_BLOCK_LEN.into(), unsafe { $Cpu => $f }, $state, $data, $cpu) } } macro_rules! sha2_64_ffi { ( unsafe { $Cpu:ty => $f:ident }, $state:expr, $data:expr, $cpu:expr $(,)? 
) => { sha2_ffi!(u64, SHA512_BLOCK_LEN.into(), unsafe { $Cpu => $f }, $state, $data, $cpu) } } pub(super) unsafe fn sha2_ffi( state: &mut [Wrapping; CHAINING_WORDS], data: AsChunks, cpu: Cpu, f: unsafe extern "C" fn( *mut [Wrapping; CHAINING_WORDS], *const [u8; BLOCK_LEN], crate::c::NonZero_size_t, ), ) { if let Some(blocks) = NonZeroUsize::new(data.len()) { let data = data.as_ptr(); let _: Cpu = cpu; // SAFETY: // * `blocks` is non-zero. // * `data` is non-NULL and points to `blocks` blocks. // * The caller asserted that `f` meets this contract if we have // an instance of `Cpu`. unsafe { f(state, data, blocks) } } } ring-0.17.14/src/digest/sha2/mod.rs000064400000000000000000000023051046102023000147650ustar 00000000000000// Copyright 2019-2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::BlockLen; pub(super) use self::{ sha2_32::{block_data_order_32, State32, SHA256_BLOCK_LEN}, sha2_64::{block_data_order_64, State64, SHA512_BLOCK_LEN}, }; pub(super) const CHAINING_WORDS: usize = 8; #[cfg(any( all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little"), target_arch = "x86_64" ))] #[macro_use] mod ffi; pub(super) mod fallback; mod sha2_32; mod sha2_64; ring-0.17.14/src/digest/sha2/sha2_32.rs000064400000000000000000000057411046102023000153560ustar 00000000000000// Copyright 2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::{BlockLen, CHAINING_WORDS}; use crate::{cpu, polyfill::slice::AsChunks}; use cfg_if::cfg_if; use core::num::Wrapping; pub(in super::super) const SHA256_BLOCK_LEN: BlockLen = BlockLen::_512; pub type State32 = [Wrapping; CHAINING_WORDS]; pub(crate) fn block_data_order_32( state: &mut State32, data: AsChunks, cpu: cpu::Features, ) { cfg_if! 
{ if #[cfg(all(target_arch = "aarch64", target_endian = "little"))] { use cpu::{GetFeature as _, arm::Sha256}; if let Some(cpu) = cpu.get_feature() { sha2_32_ffi!(unsafe { Sha256 => sha256_block_data_order_hw }, state, data, cpu) } else { sha2_32_ffi!(unsafe { () => sha256_block_data_order_nohw }, state, data, ()) } } else if #[cfg(all(target_arch = "arm", target_endian = "little"))] { use cpu::{GetFeature as _, arm::Neon}; if let Some(cpu) = cpu.get_feature() { sha2_32_ffi!(unsafe { Neon => sha256_block_data_order_neon }, state, data, cpu) } else { sha2_32_ffi!(unsafe { () => sha256_block_data_order_nohw }, state, data, ()) } } else if #[cfg(target_arch = "x86_64")] { use cpu::{GetFeature as _, intel::{Avx, IntelCpu, Sha, Ssse3 }}; let cpu = cpu.values(); if let Some(cpu) = cpu.get_feature() { sha2_32_ffi!(unsafe { (Sha, Ssse3) => sha256_block_data_order_hw }, state, data, cpu) } else if let Some(cpu) = cpu.get_feature() { // Pre-Zen AMD CPUs had slow SHLD/SHRD; Zen added the SHA // extension; see the discussion in upstream's sha1-586.pl. sha2_32_ffi!(unsafe { (Avx, IntelCpu) => sha256_block_data_order_avx }, state, data, cpu) } else if let Some(cpu) = cpu.get_feature() { sha2_32_ffi!(unsafe { Ssse3 => sha256_block_data_order_ssse3 }, state, data, cpu) } else { sha2_32_ffi!(unsafe { () => sha256_block_data_order_nohw }, state, data, ()) } } else { let _ = cpu; // Unneeded. *state = super::fallback::block_data_order(*state, data) } } } ring-0.17.14/src/digest/sha2/sha2_64.rs000064400000000000000000000053171046102023000153620ustar 00000000000000// Copyright 2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::{BlockLen, CHAINING_WORDS}; use crate::{cpu, polyfill::slice::AsChunks}; use cfg_if::cfg_if; use core::num::Wrapping; pub(in super::super) const SHA512_BLOCK_LEN: BlockLen = BlockLen::_1024; pub type State64 = [Wrapping; CHAINING_WORDS]; pub(crate) fn block_data_order_64( state: &mut State64, data: AsChunks, cpu: cpu::Features, ) { cfg_if! { if #[cfg(all(target_arch = "aarch64", target_endian = "little"))] { use cpu::{GetFeature as _, arm::Sha512}; if let Some(cpu) = cpu.get_feature() { sha2_64_ffi!(unsafe { Sha512 => sha512_block_data_order_hw }, state, data, cpu) } else { sha2_64_ffi!(unsafe { () => sha512_block_data_order_nohw }, state, data, ()) } } else if #[cfg(all(target_arch = "arm", target_endian = "little"))] { use cpu::{GetFeature as _, arm::Neon}; if let Some(cpu) = cpu.get_feature() { sha2_64_ffi!(unsafe { Neon => sha512_block_data_order_neon }, state, data, cpu) } else { sha2_64_ffi!(unsafe { () => sha512_block_data_order_nohw }, state, data, ()) } } else if #[cfg(target_arch = "x86_64")] { use cpu::{GetFeature as _, intel::{Avx, IntelCpu}}; if let Some(cpu) = cpu.get_feature() { // Pre-Zen AMD CPUs had microcoded SHLD/SHRD which makes the // AVX version slow. 
We're also unsure of the side channel // ramifications of those microcoded instructions. sha2_64_ffi!(unsafe { (Avx, IntelCpu) => sha512_block_data_order_avx }, state, data, cpu); } else { sha2_64_ffi!(unsafe { () => sha512_block_data_order_nohw }, state, data, ()) } } else { let _ = cpu; // Unneeded. *state = super::fallback::block_data_order(*state, data) } } } ring-0.17.14/src/digest.rs000064400000000000000000000520521046102023000133550ustar 00000000000000// Copyright 2015-2019 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! SHA-2 and the legacy SHA-1 digest algorithm. //! //! If all the data is available in a single contiguous slice then the `digest` //! function should be used. Otherwise, the digest can be calculated in //! multiple steps using `Context`. use self::{ dynstate::DynState, sha2::{SHA256_BLOCK_LEN, SHA512_BLOCK_LEN}, }; use crate::{ bits::{BitLength, FromByteLen as _}, cpu, debug, error, polyfill::{self, slice, sliceutil}, }; use core::num::Wrapping; pub(crate) use self::finish_error::FinishError; mod dynstate; mod sha1; mod sha2; #[derive(Clone)] pub(crate) struct BlockContext { state: DynState, // Note that SHA-512 has a 128-bit input bit counter, but this // implementation only supports up to 2^64-1 input bits for all algorithms, // so a 64-bit counter is more than sufficient. completed_bytes: u64, /// The context's algorithm. pub algorithm: &'static Algorithm, } impl BlockContext { pub(crate) fn new(algorithm: &'static Algorithm) -> Self { Self { state: algorithm.initial_state.clone(), completed_bytes: 0, algorithm, } } /// Processes all the full blocks in `input`, returning the partial block /// at the end, which may be empty. pub(crate) fn update<'i>(&mut self, input: &'i [u8], cpu_features: cpu::Features) -> &'i [u8] { let (completed_bytes, leftover) = self.block_data_order(input, cpu_features); // Using saturated addition here allows `update` to be infallible and // panic-free. If we were to reach the maximum value here then `finish` // will detect that we processed too much data when it converts this to // a bit length. self.completed_bytes = self .completed_bytes .saturating_add(polyfill::u64_from_usize(completed_bytes)); leftover } // On input, `block[..num_pending]` is the (possibly-empty) last *partial* // chunk of input. It *must* be partial; that is, it is required that // `num_pending < self.algorithm.block_len`. // // `block` may be arbitrarily overwritten. pub(crate) fn try_finish( mut self, block: &mut [u8; MAX_BLOCK_LEN], num_pending: usize, cpu_features: cpu::Features, ) -> Result { let completed_bits = self .completed_bytes .checked_add(polyfill::u64_from_usize(num_pending)) .ok_or_else(|| { // Choosing self.completed_bytes here is lossy & somewhat arbitrary. 
InputTooLongError::new(self.completed_bytes) }) .and_then(BitLength::from_byte_len) .map_err(FinishError::input_too_long)?; let block_len = self.algorithm.block_len(); let block = &mut block[..block_len]; let padding = match block.get_mut(num_pending..) { Some([separator, padding @ ..]) => { *separator = 0x80; padding } // Precondition violated. unreachable => { return Err(FinishError::pending_not_a_partial_block( unreachable.as_deref(), )); } }; let padding = match padding .len() .checked_sub(self.algorithm.block_len.len_len()) { Some(_) => padding, None => { padding.fill(0); let (completed_bytes, leftover) = self.block_data_order(block, cpu_features); debug_assert_eq!((completed_bytes, leftover.len()), (block_len, 0)); // We don't increase |self.completed_bytes| because the padding // isn't data, and so it isn't included in the data length. &mut block[..] } }; let (to_zero, len) = padding.split_at_mut(padding.len() - 8); to_zero.fill(0); len.copy_from_slice(&completed_bits.to_be_bytes()); let (completed_bytes, leftover) = self.block_data_order(block, cpu_features); debug_assert_eq!((completed_bytes, leftover.len()), (block_len, 0)); Ok(Digest { algorithm: self.algorithm, value: self.state.format_output(), }) } #[must_use] fn block_data_order<'d>( &mut self, data: &'d [u8], cpu_features: cpu::Features, ) -> (usize, &'d [u8]) { (self.algorithm.block_data_order)(&mut self.state, data, cpu_features) } } pub(crate) type InputTooLongError = error::InputTooLongError; cold_exhaustive_error! { enum finish_error::FinishError { input_too_long => InputTooLong(InputTooLongError), pending_not_a_partial_block_inner => PendingNotAPartialBlock(usize), } } impl FinishError { #[cold] #[inline(never)] fn pending_not_a_partial_block(padding: Option<&[u8]>) -> Self { match padding { None => Self::pending_not_a_partial_block_inner(0), Some(padding) => Self::pending_not_a_partial_block_inner(padding.len()), } } } /// A context for multi-step (Init-Update-Finish) digest calculations. /// /// # Examples /// /// ``` /// use ring::digest; /// /// let one_shot = digest::digest(&digest::SHA384, b"hello, world"); /// /// let mut ctx = digest::Context::new(&digest::SHA384); /// ctx.update(b"hello"); /// ctx.update(b", "); /// ctx.update(b"world"); /// let multi_part = ctx.finish(); /// /// assert_eq!(&one_shot.as_ref(), &multi_part.as_ref()); /// ``` #[derive(Clone)] pub struct Context { block: BlockContext, // TODO: More explicitly force 64-bit alignment for |pending|. pending: [u8; MAX_BLOCK_LEN], // Invariant: `self.num_pending < self.block.algorithm.block_len`. num_pending: usize, } impl Context { /// Constructs a new context. pub fn new(algorithm: &'static Algorithm) -> Self { Self { block: BlockContext::new(algorithm), pending: [0u8; MAX_BLOCK_LEN], num_pending: 0, } } pub(crate) fn clone_from(block: &BlockContext) -> Self { Self { block: block.clone(), pending: [0u8; MAX_BLOCK_LEN], num_pending: 0, } } /// Updates the digest with all the data in `data`. pub fn update(&mut self, data: &[u8]) { let cpu_features = cpu::features(); let block_len = self.block.algorithm.block_len(); let buffer = &mut self.pending[..block_len]; let to_digest = if self.num_pending == 0 { data } else { let buffer_to_fill = match buffer.get_mut(self.num_pending..) { Some(buffer_to_fill) => buffer_to_fill, None => { // Impossible because of the invariant. 
unreachable!(); } }; sliceutil::overwrite_at_start(buffer_to_fill, data); match slice::split_at_checked(data, buffer_to_fill.len()) { Some((just_copied, to_digest)) => { debug_assert_eq!(buffer_to_fill.len(), just_copied.len()); debug_assert_eq!(self.num_pending + just_copied.len(), block_len); let leftover = self.block.update(buffer, cpu_features); debug_assert_eq!(leftover.len(), 0); self.num_pending = 0; to_digest } None => { self.num_pending += data.len(); // If `data` isn't enough to complete a block, buffer it and stop. debug_assert!(self.num_pending < block_len); return; } } }; let leftover = self.block.update(to_digest, cpu_features); sliceutil::overwrite_at_start(buffer, leftover); self.num_pending = leftover.len(); debug_assert!(self.num_pending < block_len); } /// Finalizes the digest calculation and returns the digest value. /// /// `finish` consumes the context so it cannot be (mis-)used after `finish` /// has been called. pub fn finish(self) -> Digest { let cpu = cpu::features(); self.try_finish(cpu) .map_err(error::erase::) .unwrap() } pub(crate) fn try_finish( mut self, cpu_features: cpu::Features, ) -> Result { self.block .try_finish(&mut self.pending, self.num_pending, cpu_features) .map_err(|err| match err { FinishError::InputTooLong(i) => i, FinishError::PendingNotAPartialBlock(_) => { // Due to invariant. unreachable!() } }) } /// The algorithm that this context is using. #[inline(always)] pub fn algorithm(&self) -> &'static Algorithm { self.block.algorithm } } /// Returns the digest of `data` using the given digest algorithm. pub fn digest(algorithm: &'static Algorithm, data: &[u8]) -> Digest { let cpu = cpu::features(); Digest::compute_from(algorithm, data, cpu) .map_err(error::erase::) .unwrap() } /// A calculated digest value. /// /// Use [`Self::as_ref`] to get the value as a `&[u8]`. #[derive(Clone, Copy)] pub struct Digest { value: Output, algorithm: &'static Algorithm, } impl Digest { pub(crate) fn compute_from( algorithm: &'static Algorithm, data: &[u8], cpu: cpu::Features, ) -> Result { let mut ctx = Context::new(algorithm); ctx.update(data); ctx.try_finish(cpu) } /// The algorithm that was used to calculate the digest value. #[inline(always)] pub fn algorithm(&self) -> &'static Algorithm { self.algorithm } } impl AsRef<[u8]> for Digest { #[inline(always)] fn as_ref(&self) -> &[u8] { &self.value.0[..self.algorithm.output_len()] } } impl core::fmt::Debug for Digest { fn fmt(&self, fmt: &mut core::fmt::Formatter) -> core::fmt::Result { write!(fmt, "{:?}:", self.algorithm)?; debug::write_hex_bytes(fmt, self.as_ref()) } } /// A digest algorithm. pub struct Algorithm { output_len: OutputLen, chaining_len: usize, block_len: BlockLen, /// `block_data_order` processes all the full blocks of data in `data`. It /// returns the number of bytes processed and the unprocessed data, which /// is guaranteed to be less than `block_len` bytes long. block_data_order: for<'d> fn( state: &mut DynState, data: &'d [u8], cpu_features: cpu::Features, ) -> (usize, &'d [u8]), initial_state: DynState, id: AlgorithmID, } #[derive(Debug, Eq, PartialEq)] enum AlgorithmID { SHA1, SHA256, SHA384, SHA512, SHA512_256, } impl PartialEq for Algorithm { fn eq(&self, other: &Self) -> bool { self.id == other.id } } impl Eq for Algorithm {} derive_debug_via_id!(Algorithm); impl Algorithm { /// The internal block length. pub fn block_len(&self) -> usize { self.block_len.into() } /// The size of the chaining value of the digest function, in bytes. 
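///
/// Illustrative example (an added sketch, exercising only the public API):
///
/// ```
/// use ring::digest;
///
/// // SHA-384 truncates a 64-byte (SHA-512-sized) chaining value to 48 bytes.
/// assert_eq!(digest::SHA384.chaining_len(), 64);
/// assert_eq!(digest::SHA384.output_len(), 48);
/// // SHA-256 is not truncated, so the two lengths agree.
/// assert_eq!(digest::SHA256.chaining_len(), digest::SHA256.output_len());
/// ```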
/// /// For non-truncated algorithms (SHA-1, SHA-256, SHA-512), this is equal /// to [`Self::output_len()`]. For truncated algorithms (e.g. SHA-384, /// SHA-512/256), this is equal to the length before truncation. This is /// mostly helpful for determining the size of an HMAC key that is /// appropriate for the digest algorithm. pub fn chaining_len(&self) -> usize { self.chaining_len } /// The length of a finalized digest. pub fn output_len(&self) -> usize { self.output_len.into() } } /// SHA-1 as specified in [FIPS 180-4]. Deprecated. /// /// [FIPS 180-4]: http://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf pub static SHA1_FOR_LEGACY_USE_ONLY: Algorithm = Algorithm { output_len: sha1::OUTPUT_LEN, chaining_len: sha1::CHAINING_LEN, block_len: sha1::BLOCK_LEN, block_data_order: dynstate::sha1_block_data_order, initial_state: DynState::new32([ Wrapping(0x67452301u32), Wrapping(0xefcdab89u32), Wrapping(0x98badcfeu32), Wrapping(0x10325476u32), Wrapping(0xc3d2e1f0u32), Wrapping(0), Wrapping(0), Wrapping(0), ]), id: AlgorithmID::SHA1, }; /// SHA-256 as specified in [FIPS 180-4]. /// /// [FIPS 180-4]: http://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf pub static SHA256: Algorithm = Algorithm { output_len: OutputLen::_256, chaining_len: SHA256_OUTPUT_LEN, block_len: SHA256_BLOCK_LEN, block_data_order: dynstate::sha256_block_data_order, initial_state: DynState::new32([ Wrapping(0x6a09e667u32), Wrapping(0xbb67ae85u32), Wrapping(0x3c6ef372u32), Wrapping(0xa54ff53au32), Wrapping(0x510e527fu32), Wrapping(0x9b05688cu32), Wrapping(0x1f83d9abu32), Wrapping(0x5be0cd19u32), ]), id: AlgorithmID::SHA256, }; /// SHA-384 as specified in [FIPS 180-4]. /// /// [FIPS 180-4]: http://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf pub static SHA384: Algorithm = Algorithm { output_len: OutputLen::_384, chaining_len: SHA512_OUTPUT_LEN, block_len: SHA512_BLOCK_LEN, block_data_order: dynstate::sha512_block_data_order, initial_state: DynState::new64([ Wrapping(0xcbbb9d5dc1059ed8), Wrapping(0x629a292a367cd507), Wrapping(0x9159015a3070dd17), Wrapping(0x152fecd8f70e5939), Wrapping(0x67332667ffc00b31), Wrapping(0x8eb44a8768581511), Wrapping(0xdb0c2e0d64f98fa7), Wrapping(0x47b5481dbefa4fa4), ]), id: AlgorithmID::SHA384, }; /// SHA-512 as specified in [FIPS 180-4]. /// /// [FIPS 180-4]: http://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf pub static SHA512: Algorithm = Algorithm { output_len: OutputLen::_512, chaining_len: SHA512_OUTPUT_LEN, block_len: SHA512_BLOCK_LEN, block_data_order: dynstate::sha512_block_data_order, initial_state: DynState::new64([ Wrapping(0x6a09e667f3bcc908), Wrapping(0xbb67ae8584caa73b), Wrapping(0x3c6ef372fe94f82b), Wrapping(0xa54ff53a5f1d36f1), Wrapping(0x510e527fade682d1), Wrapping(0x9b05688c2b3e6c1f), Wrapping(0x1f83d9abfb41bd6b), Wrapping(0x5be0cd19137e2179), ]), id: AlgorithmID::SHA512, }; /// SHA-512/256 as specified in [FIPS 180-4]. /// /// This is *not* the same as just truncating the output of SHA-512, as /// SHA-512/256 has its own initial state distinct from SHA-512's initial /// state. 
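///
/// Illustrative example (an added sketch, exercising only the public API):
///
/// ```
/// use ring::digest;
///
/// let msg = b"hello, world";
/// let sha512_256 = digest::digest(&digest::SHA512_256, msg);
/// let sha512 = digest::digest(&digest::SHA512, msg);
/// // Truncating SHA-512's output does not produce the SHA-512/256 digest.
/// assert_ne!(
///     sha512_256.as_ref(),
///     &sha512.as_ref()[..digest::SHA512_256_OUTPUT_LEN],
/// );
/// ```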
/// /// [FIPS 180-4]: http://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf pub static SHA512_256: Algorithm = Algorithm { output_len: OutputLen::_256, chaining_len: SHA512_OUTPUT_LEN, block_len: SHA512_BLOCK_LEN, block_data_order: dynstate::sha512_block_data_order, initial_state: DynState::new64([ Wrapping(0x22312194fc2bf72c), Wrapping(0x9f555fa3c84c64c2), Wrapping(0x2393b86b6f53b151), Wrapping(0x963877195940eabd), Wrapping(0x96283ee2a88effe3), Wrapping(0xbe5e1e2553863992), Wrapping(0x2b0199fc2c85b8aa), Wrapping(0x0eb72ddc81c52ca2), ]), id: AlgorithmID::SHA512_256, }; #[derive(Clone, Copy)] struct Output([u8; MAX_OUTPUT_LEN]); /// The maximum block length ([`Algorithm::block_len()`]) of all the algorithms /// in this module. pub const MAX_BLOCK_LEN: usize = BlockLen::MAX.into(); /// The maximum output length ([`Algorithm::output_len()`]) of all the /// algorithms in this module. pub const MAX_OUTPUT_LEN: usize = OutputLen::MAX.into(); /// The maximum chaining length ([`Algorithm::chaining_len()`]) of all the /// algorithms in this module. pub const MAX_CHAINING_LEN: usize = MAX_OUTPUT_LEN; #[inline] fn format_output(input: [Wrapping; sha2::CHAINING_WORDS], f: F) -> Output where F: Fn(T) -> [u8; N], T: Copy, { let mut output = Output([0; MAX_OUTPUT_LEN]); output .0 .chunks_mut(N) .zip(input.iter().copied().map(|Wrapping(w)| f(w))) .for_each(|(o, i)| { o.copy_from_slice(&i); }); output } /// The length of the output of SHA-1, in bytes. pub const SHA1_OUTPUT_LEN: usize = sha1::OUTPUT_LEN.into(); /// The length of the output of SHA-256, in bytes. pub const SHA256_OUTPUT_LEN: usize = OutputLen::_256.into(); /// The length of the output of SHA-384, in bytes. pub const SHA384_OUTPUT_LEN: usize = OutputLen::_384.into(); /// The length of the output of SHA-512, in bytes. pub const SHA512_OUTPUT_LEN: usize = OutputLen::_512.into(); /// The length of the output of SHA-512/256, in bytes. pub const SHA512_256_OUTPUT_LEN: usize = OutputLen::_256.into(); #[derive(Clone, Copy)] enum BlockLen { _512 = 512 / 8, _1024 = 1024 / 8, // MAX } impl BlockLen { const MAX: Self = Self::_1024; #[inline(always)] const fn into(self) -> usize { self as usize } #[inline(always)] const fn len_len(self) -> usize { let len_len = match self { BlockLen::_512 => LenLen::_64, BlockLen::_1024 => LenLen::_128, }; len_len as usize } } #[derive(Clone, Copy)] enum LenLen { _64 = 64 / 8, _128 = 128 / 8, } #[derive(Clone, Copy)] enum OutputLen { _160 = 160 / 8, _256 = 256 / 8, _384 = 384 / 8, _512 = 512 / 8, // MAX } impl OutputLen { const MAX: Self = Self::_512; #[inline(always)] const fn into(self) -> usize { self as usize } } #[cfg(test)] mod tests { mod max_input { extern crate alloc; use super::super::super::digest; use crate::polyfill::u64_from_usize; use alloc::vec; macro_rules! 
max_input_tests { ( $algorithm_name:ident ) => { mod $algorithm_name { use super::super::super::super::digest; #[test] fn max_input_test() { super::max_input_test(&digest::$algorithm_name); } #[test] #[should_panic] fn too_long_input_test_block() { super::too_long_input_test_block(&digest::$algorithm_name); } #[test] #[should_panic] fn too_long_input_test_byte() { super::too_long_input_test_byte(&digest::$algorithm_name); } } }; } fn max_input_test(alg: &'static digest::Algorithm) { let mut context = nearly_full_context(alg); let next_input = vec![0u8; alg.block_len() - 1]; context.update(&next_input); let _ = context.finish(); // no panic } fn too_long_input_test_block(alg: &'static digest::Algorithm) { let mut context = nearly_full_context(alg); let next_input = vec![0u8; alg.block_len()]; context.update(&next_input); let _ = context.finish(); // should panic } fn too_long_input_test_byte(alg: &'static digest::Algorithm) { let mut context = nearly_full_context(alg); let next_input = vec![0u8; alg.block_len() - 1]; context.update(&next_input); context.update(&[0]); let _ = context.finish(); // should panic } fn nearly_full_context(alg: &'static digest::Algorithm) -> digest::Context { // All implementations currently support up to 2^64-1 bits // of input; according to the spec, SHA-384 and SHA-512 // support up to 2^128-1, but that's not implemented yet. let max_bytes = 1u64 << (64 - 3); let max_blocks = max_bytes / u64_from_usize(alg.block_len()); let completed_bytes = (max_blocks - 1) * u64_from_usize(alg.block_len()); digest::Context { block: digest::BlockContext { state: alg.initial_state.clone(), completed_bytes, algorithm: alg, }, pending: [0u8; digest::MAX_BLOCK_LEN], num_pending: 0, } } max_input_tests!(SHA1_FOR_LEGACY_USE_ONLY); max_input_tests!(SHA256); max_input_tests!(SHA384); max_input_tests!(SHA512); } } ring-0.17.14/src/ec/curve25519/ed25519/ed25519_pkcs8_v2_template.der000064400000000000000000000000231046102023000217420ustar 000000000000000Q0+ep" !ring-0.17.14/src/ec/curve25519/ed25519/signing.rs000064400000000000000000000251501046102023000166520ustar 00000000000000// Copyright 2015-2016 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! EdDSA Signatures. use super::{super::ops::*, eddsa_digest, ED25519_PUBLIC_KEY_LEN}; use crate::{ cpu, digest, error, io::der, pkcs8, rand, signature::{self, KeyPair as SigningKeyPair}, }; /// An Ed25519 key pair, for signing. pub struct Ed25519KeyPair { // RFC 8032 Section 5.1.6 calls this *s*. private_scalar: Scalar, // RFC 8032 Section 5.1.6 calls this *prefix*. private_prefix: Prefix, // RFC 8032 Section 5.1.5 calls this *A*. public_key: PublicKey, } derive_debug_via_field!(Ed25519KeyPair, stringify!(Ed25519KeyPair), public_key); impl Ed25519KeyPair { /// Generates a new key pair and returns the key pair serialized as a /// PKCS#8 document. 
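///
/// Illustrative usage sketch (added commentary; errors are unwrapped for
/// brevity and the key pair is driven through the public `signature` API):
///
/// ```
/// use ring::{rand, signature::{self, KeyPair}};
///
/// let rng = rand::SystemRandom::new();
/// let pkcs8 = signature::Ed25519KeyPair::generate_pkcs8(&rng).unwrap();
/// let key_pair = signature::Ed25519KeyPair::from_pkcs8(pkcs8.as_ref()).unwrap();
///
/// const MSG: &[u8] = b"hello, world";
/// let sig = key_pair.sign(MSG);
///
/// // A peer verifies using only the public key bytes.
/// let public_key = signature::UnparsedPublicKey::new(
///     &signature::ED25519,
///     key_pair.public_key().as_ref(),
/// );
/// assert!(public_key.verify(MSG, sig.as_ref()).is_ok());
/// ```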
/// /// The PKCS#8 document will be a v2 `OneAsymmetricKey` with the public key, /// as described in [RFC 5958 Section 2]; see [RFC 8410 Section 10.3] for an /// example. /// /// [RFC 5958 Section 2]: https://tools.ietf.org/html/rfc5958#section-2 /// [RFC 8410 Section 10.3]: https://tools.ietf.org/html/rfc8410#section-10.3 pub fn generate_pkcs8( rng: &dyn rand::SecureRandom, ) -> Result { let cpu_features = cpu::features(); let seed: [u8; SEED_LEN] = rand::generate(rng)?.expose(); let key_pair = Self::from_seed_(&seed, cpu_features); Ok(pkcs8::wrap_key( &PKCS8_TEMPLATE, &seed[..], key_pair.public_key().as_ref(), )) } /// Constructs an Ed25519 key pair by parsing an unencrypted PKCS#8 v2 /// Ed25519 private key. /// /// `openssl genpkey -algorithm ED25519` generates PKCS# v1 keys, which /// require the use of `Ed25519KeyPair::from_pkcs8_maybe_unchecked()` /// instead of `Ed25519KeyPair::from_pkcs8()`. /// /// The input must be in PKCS#8 v2 format, and in particular it must contain /// the public key in addition to the private key. `from_pkcs8()` will /// verify that the public key and the private key are consistent with each /// other. /// /// Some early implementations of PKCS#8 v2, including earlier versions of /// *ring* and other implementations, wrapped the public key in the wrong /// ASN.1 tags. Both that incorrect form and the standardized form are /// accepted. /// /// If you need to parse PKCS#8 v1 files (without the public key) then use /// `Ed25519KeyPair::from_pkcs8_maybe_unchecked()` instead. pub fn from_pkcs8(pkcs8: &[u8]) -> Result { let version = pkcs8::Version::V2Only(pkcs8::PublicKeyOptions { accept_legacy_ed25519_public_key_tag: true, }); let (seed, public_key) = unwrap_pkcs8(version, untrusted::Input::from(pkcs8))?; Self::from_seed_and_public_key( seed.as_slice_less_safe(), public_key.unwrap().as_slice_less_safe(), ) } /// Constructs an Ed25519 key pair by parsing an unencrypted PKCS#8 v1 or v2 /// Ed25519 private key. /// /// `openssl genpkey -algorithm ED25519` generates PKCS# v1 keys. /// /// It is recommended to use `Ed25519KeyPair::from_pkcs8()`, which accepts /// only PKCS#8 v2 files that contain the public key. /// `from_pkcs8_maybe_unchecked()` parses PKCS#2 files exactly like /// `from_pkcs8()`. It also accepts v1 files. PKCS#8 v1 files do not contain /// the public key, so when a v1 file is parsed the public key will be /// computed from the private key, and there will be no consistency check /// between the public key and the private key. /// /// Some early implementations of PKCS#8 v2, including earlier versions of /// *ring* and other implementations, wrapped the public key in the wrong /// ASN.1 tags. Both that incorrect form and the standardized form are /// accepted. /// /// PKCS#8 v2 files are parsed exactly like `Ed25519KeyPair::from_pkcs8()`. pub fn from_pkcs8_maybe_unchecked(pkcs8: &[u8]) -> Result { let version = pkcs8::Version::V1OrV2(pkcs8::PublicKeyOptions { accept_legacy_ed25519_public_key_tag: true, }); let (seed, public_key) = unwrap_pkcs8(version, untrusted::Input::from(pkcs8))?; if let Some(public_key) = public_key { Self::from_seed_and_public_key( seed.as_slice_less_safe(), public_key.as_slice_less_safe(), ) } else { Self::from_seed_unchecked(seed.as_slice_less_safe()) } } /// Constructs an Ed25519 key pair from the private key seed `seed` and its /// public key `public_key`. /// /// It is recommended to use `Ed25519KeyPair::from_pkcs8()` instead. /// /// The private and public keys will be verified to be consistent with each /// other. 
This helps avoid misuse of the key (e.g. accidentally swapping /// the private key and public key, or using the wrong private key for the /// public key). This also detects any corruption of the public or private /// key. pub fn from_seed_and_public_key( seed: &[u8], public_key: &[u8], ) -> Result { let pair = Self::from_seed_unchecked(seed)?; // This implicitly verifies that `public_key` is the right length. // XXX: This rejects ~18 keys when they are partially reduced, though // those keys are virtually impossible to find. if public_key != pair.public_key.as_ref() { let err = if public_key.len() != pair.public_key.as_ref().len() { error::KeyRejected::invalid_encoding() } else { error::KeyRejected::inconsistent_components() }; return Err(err); } Ok(pair) } /// Constructs a Ed25519 key pair from the private key seed `seed`. /// /// It is recommended to use `Ed25519KeyPair::from_pkcs8()` instead. When /// that is not practical, it is recommended to use /// `Ed25519KeyPair::from_seed_and_public_key()` instead. /// /// Since the public key is not given, the public key will be computed from /// the private key. It is not possible to detect misuse or corruption of /// the private key since the public key isn't given as input. pub fn from_seed_unchecked(seed: &[u8]) -> Result { let seed = seed .try_into() .map_err(|_| error::KeyRejected::invalid_encoding())?; Ok(Self::from_seed_(seed, cpu::features())) } fn from_seed_(seed: &Seed, cpu_features: cpu::Features) -> Self { let h = digest::digest(&digest::SHA512, seed); let (private_scalar, private_prefix) = h.as_ref().split_at(SCALAR_LEN); let private_scalar = MaskedScalar::from_bytes_masked(private_scalar.try_into().unwrap()).into(); let a = ExtPoint::from_scalarmult_base(&private_scalar, cpu_features); Self { private_scalar, private_prefix: private_prefix.try_into().unwrap(), public_key: PublicKey(a.into_encoded_point(cpu_features)), } } /// Returns the signature of the message `msg`. pub fn sign(&self, msg: &[u8]) -> signature::Signature { let cpu_features = cpu::features(); signature::Signature::new(|signature_bytes| { prefixed_extern! 
{ fn x25519_sc_muladd( s: &mut [u8; SCALAR_LEN], a: &Scalar, b: &Scalar, c: &Scalar, ); } let (signature_bytes, _unused) = signature_bytes.split_at_mut(ELEM_LEN + SCALAR_LEN); let (signature_r, signature_s) = signature_bytes.split_at_mut(ELEM_LEN); let nonce = { let mut ctx = digest::Context::new(&digest::SHA512); ctx.update(&self.private_prefix); ctx.update(msg); ctx.finish() }; let nonce = Scalar::from_sha512_digest_reduced(nonce); let r = ExtPoint::from_scalarmult_base(&nonce, cpu_features); signature_r.copy_from_slice(&r.into_encoded_point(cpu_features)); let hram_digest = eddsa_digest(signature_r, self.public_key.as_ref(), msg); let hram = Scalar::from_sha512_digest_reduced(hram_digest); unsafe { x25519_sc_muladd( signature_s.try_into().unwrap(), &hram, &self.private_scalar, &nonce, ); } SIGNATURE_LEN }) } } impl signature::KeyPair for Ed25519KeyPair { type PublicKey = PublicKey; fn public_key(&self) -> &Self::PublicKey { &self.public_key } } #[derive(Clone, Copy)] pub struct PublicKey([u8; ED25519_PUBLIC_KEY_LEN]); impl AsRef<[u8]> for PublicKey { fn as_ref(&self) -> &[u8] { self.0.as_ref() } } derive_debug_self_as_ref_hex_bytes!(PublicKey); fn unwrap_pkcs8( version: pkcs8::Version, input: untrusted::Input, ) -> Result<(untrusted::Input, Option), error::KeyRejected> { let (private_key, public_key) = pkcs8::unwrap_key(&PKCS8_TEMPLATE, version, input)?; let private_key = private_key .read_all(error::Unspecified, |input| { der::expect_tag_and_get_value(input, der::Tag::OctetString) }) .map_err(|error::Unspecified| error::KeyRejected::invalid_encoding())?; Ok((private_key, public_key)) } type Prefix = [u8; PREFIX_LEN]; const PREFIX_LEN: usize = digest::SHA512_OUTPUT_LEN - SCALAR_LEN; const SIGNATURE_LEN: usize = ELEM_LEN + SCALAR_LEN; type Seed = [u8; SEED_LEN]; const SEED_LEN: usize = 32; static PKCS8_TEMPLATE: pkcs8::Template = pkcs8::Template { bytes: include_bytes!("ed25519_pkcs8_v2_template.der"), alg_id_range: core::ops::Range { start: 7, end: 12 }, curve_id_index: 0, private_key_index: 0x10, }; ring-0.17.14/src/ec/curve25519/ed25519/verification.rs000064400000000000000000000057141046102023000177020ustar 00000000000000// Copyright 2015-2016 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! EdDSA Signatures. use super::{super::ops::*, eddsa_digest}; use crate::{cpu, error, sealed, signature}; /// Parameters for EdDSA signing and verification. pub struct EdDSAParameters; impl core::fmt::Debug for EdDSAParameters { fn fmt(&self, f: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> { write!(f, "ring::signature::ED25519") } } /// Verification of [Ed25519] signatures. /// /// Ed25519 uses SHA-512 as the digest algorithm. 
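///
/// Illustrative usage sketch (added commentary; verification is normally
/// driven through the public `signature::UnparsedPublicKey` wrapper):
///
/// ```
/// use ring::signature;
///
/// fn verify_ed25519(public_key: &[u8], msg: &[u8], sig: &[u8]) -> bool {
///     signature::UnparsedPublicKey::new(&signature::ED25519, public_key)
///         .verify(msg, sig)
///         .is_ok()
/// }
/// ```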
/// /// [Ed25519]: https://ed25519.cr.yp.to/ pub static ED25519: EdDSAParameters = EdDSAParameters {}; impl signature::VerificationAlgorithm for EdDSAParameters { fn verify( &self, public_key: untrusted::Input, msg: untrusted::Input, signature: untrusted::Input, ) -> Result<(), error::Unspecified> { let cpu_features = cpu::features(); let public_key: &[u8; ELEM_LEN] = public_key.as_slice_less_safe().try_into()?; let (signature_r, signature_s) = signature.read_all(error::Unspecified, |input| { let signature_r: &[u8; ELEM_LEN] = input .read_bytes(ELEM_LEN)? .as_slice_less_safe() .try_into()?; let signature_s: &[u8; SCALAR_LEN] = input .read_bytes(SCALAR_LEN)? .as_slice_less_safe() .try_into()?; Ok((signature_r, signature_s)) })?; let signature_s = Scalar::from_bytes_checked(*signature_s)?; let mut a = ExtPoint::from_encoded_point_vartime(public_key)?; a.invert_vartime(); let h_digest = eddsa_digest(signature_r, public_key, msg.as_slice_less_safe()); let h = Scalar::from_sha512_digest_reduced(h_digest); let mut r = Point::new_at_infinity(); unsafe { x25519_ge_double_scalarmult_vartime(&mut r, &h, &a, &signature_s) }; let r_check = r.into_encoded_point(cpu_features); if *signature_r != r_check { return Err(error::Unspecified); } Ok(()) } } impl sealed::Sealed for EdDSAParameters {} prefixed_extern! { fn x25519_ge_double_scalarmult_vartime( r: &mut Point, a_coeff: &Scalar, a: &ExtPoint, b_coeff: &Scalar, ); } ring-0.17.14/src/ec/curve25519/ed25519.rs000064400000000000000000000023011046102023000152050ustar 00000000000000// Copyright 2015-2016 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! EdDSA Signatures. use super::ops::ELEM_LEN; use crate::digest; pub mod signing; pub mod verification; /// The length of an Ed25519 public key. pub const ED25519_PUBLIC_KEY_LEN: usize = ELEM_LEN; pub fn eddsa_digest(signature_r: &[u8], public_key: &[u8], msg: &[u8]) -> digest::Digest { let mut ctx = digest::Context::new(&digest::SHA512); ctx.update(signature_r); ctx.update(public_key); ctx.update(msg); ctx.finish() } ring-0.17.14/src/ec/curve25519/ops.rs000064400000000000000000000125351046102023000150220ustar 00000000000000// Copyright 2015-2017 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! Elliptic curve operations on the birationally equivalent curves Curve25519 //! and Edwards25519. pub use super::scalar::{MaskedScalar, Scalar, SCALAR_LEN}; use crate::{ bssl, cpu, error, limb::{Limb, LIMB_BITS}, }; use core::{ffi::c_int, marker::PhantomData}; // Elem` is `fe` in curve25519/internal.h. // Elem is `fe_loose` in curve25519/internal.h. // Keep this in sync with curve25519/internal.h. #[repr(C)] pub struct Elem { limbs: [Limb; ELEM_LIMBS], // This is called `v` in the C code. encoding: PhantomData, } pub trait Encoding {} pub struct T; impl Encoding for T {} const ELEM_LIMBS: usize = 5 * 64 / LIMB_BITS; impl Elem { fn zero() -> Self { Self { limbs: Default::default(), encoding: PhantomData, } } } impl Elem { fn negate(&mut self) { unsafe { x25519_fe_neg(self); } } } // An encoding of a curve point. If on Curve25519, it should be encoded as // described in Section 5 of [RFC 7748]. If on Edwards25519, it should be // encoded as described in section 5.1.2 of [RFC 8032]. // // [RFC 7748] https://tools.ietf.org/html/rfc7748#section-5 // [RFC 8032] https://tools.ietf.org/html/rfc8032#section-5.1.2 pub type EncodedPoint = [u8; ELEM_LEN]; pub const ELEM_LEN: usize = 32; // Keep this in sync with `ge_p3` in curve25519/internal.h. #[repr(C)] pub struct ExtPoint { x: Elem, y: Elem, z: Elem, t: Elem, } impl ExtPoint { // Returns the result of multiplying the base point by the scalar in constant time. pub(super) fn from_scalarmult_base(scalar: &Scalar, cpu: cpu::Features) -> Self { let mut r = Self { x: Elem::zero(), y: Elem::zero(), z: Elem::zero(), t: Elem::zero(), }; prefixed_extern! { fn x25519_ge_scalarmult_base(h: &mut ExtPoint, a: &Scalar, has_fe25519_adx: c_int); } unsafe { x25519_ge_scalarmult_base(&mut r, scalar, has_fe25519_adx(cpu).into()); } r } pub fn from_encoded_point_vartime(encoded: &EncodedPoint) -> Result { let mut point = Self { x: Elem::zero(), y: Elem::zero(), z: Elem::zero(), t: Elem::zero(), }; Result::from(unsafe { x25519_ge_frombytes_vartime(&mut point, encoded) }).map(|()| point) } pub(super) fn into_encoded_point(self, cpu_features: cpu::Features) -> EncodedPoint { encode_point(self.x, self.y, self.z, cpu_features) } pub(super) fn invert_vartime(&mut self) { self.x.negate(); self.t.negate(); } } // Keep this in sync with `ge_p2` in curve25519/internal.h. #[repr(C)] pub struct Point { x: Elem, y: Elem, z: Elem, } impl Point { pub fn new_at_infinity() -> Self { Self { x: Elem::zero(), y: Elem::zero(), z: Elem::zero(), } } pub(super) fn into_encoded_point(self, cpu_features: cpu::Features) -> EncodedPoint { encode_point(self.x, self.y, self.z, cpu_features) } } fn encode_point(x: Elem, y: Elem, z: Elem, _cpu_features: cpu::Features) -> EncodedPoint { let mut bytes = [0; ELEM_LEN]; let sign_bit: u8 = unsafe { let mut recip = Elem::zero(); x25519_fe_invert(&mut recip, &z); let mut x_over_z = Elem::zero(); x25519_fe_mul_ttt(&mut x_over_z, &x, &recip); let mut y_over_z = Elem::zero(); x25519_fe_mul_ttt(&mut y_over_z, &y, &recip); x25519_fe_tobytes(&mut bytes, &y_over_z); x25519_fe_isnegative(&x_over_z) }; // The preceding computations must execute in constant time, but this // doesn't need to. 
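// Per RFC 8032 Section 5.1.2, the encoding is the 32-byte little-endian
// encoding of y with the sign bit of x stored in the most significant bit of
// the final byte; the XOR below sets that bit.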
bytes[ELEM_LEN - 1] ^= sign_bit << 7; bytes } #[inline(always)] pub(super) fn has_fe25519_adx(cpu: cpu::Features) -> bool { cfg_if::cfg_if! { if #[cfg(all(target_arch = "x86_64", not(target_os = "windows")))] { use cpu::{intel::{Adx, Bmi1, Bmi2}, GetFeature as _}; matches!(cpu.get_feature(), Some((Adx { .. }, Bmi1 { .. }, Bmi2 { .. }))) } else { let _ = cpu; false } } } prefixed_extern! { fn x25519_fe_invert(out: &mut Elem, z: &Elem); fn x25519_fe_isnegative(elem: &Elem) -> u8; fn x25519_fe_mul_ttt(h: &mut Elem, f: &Elem, g: &Elem); fn x25519_fe_neg(f: &mut Elem); fn x25519_fe_tobytes(bytes: &mut EncodedPoint, elem: &Elem); fn x25519_ge_frombytes_vartime(h: &mut ExtPoint, s: &EncodedPoint) -> bssl::Result; } ring-0.17.14/src/ec/curve25519/scalar.rs000064400000000000000000000054661046102023000154730ustar 00000000000000// Copyright 2015-2019 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use crate::{ arithmetic::limbs_from_hex, digest, error, limb, polyfill::slice::{self, AsChunks}, }; use core::array; #[repr(transparent)] pub struct Scalar([u8; SCALAR_LEN]); pub const SCALAR_LEN: usize = 32; impl Scalar { // Constructs a `Scalar` from `bytes`, failing if `bytes` encodes a scalar // that is not in the range [0, n). pub fn from_bytes_checked(bytes: [u8; SCALAR_LEN]) -> Result { const ORDER: [limb::Limb; SCALAR_LEN / limb::LIMB_BYTES] = limbs_from_hex("1000000000000000000000000000000014def9dea2f79cd65812631a5cf5d3ed"); let order = ORDER.map(limb::Limb::from); let (limbs_as_bytes, _empty): (AsChunks, _) = slice::as_chunks(&bytes); debug_assert!(_empty.is_empty()); let limbs: [limb::Limb; SCALAR_LEN / limb::LIMB_BYTES] = array::from_fn(|i| limb::Limb::from_le_bytes(limbs_as_bytes[i])); limb::verify_limbs_less_than_limbs_leak_bit(&limbs, &order)?; Ok(Self(bytes)) } // Constructs a `Scalar` from `digest` reduced modulo n. pub fn from_sha512_digest_reduced(digest: digest::Digest) -> Self { prefixed_extern! { fn x25519_sc_reduce(s: &mut UnreducedScalar); } let mut unreduced = [0u8; digest::SHA512_OUTPUT_LEN]; unreduced.copy_from_slice(digest.as_ref()); unsafe { x25519_sc_reduce(&mut unreduced) }; Self((&unreduced[..SCALAR_LEN]).try_into().unwrap()) } } #[repr(transparent)] pub struct MaskedScalar([u8; SCALAR_LEN]); impl MaskedScalar { pub fn from_bytes_masked(bytes: [u8; SCALAR_LEN]) -> Self { prefixed_extern! { fn x25519_sc_mask(a: &mut [u8; SCALAR_LEN]); } let mut r = Self(bytes); unsafe { x25519_sc_mask(&mut r.0) }; r } } impl From for Scalar { fn from(MaskedScalar(scalar): MaskedScalar) -> Self { Self(scalar) } } type UnreducedScalar = [u8; UNREDUCED_SCALAR_LEN]; const UNREDUCED_SCALAR_LEN: usize = SCALAR_LEN * 2; ring-0.17.14/src/ec/curve25519/x25519.rs000064400000000000000000000206061046102023000150740ustar 00000000000000// Copyright 2015-2016 Brian Smith. 
// // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! X25519 Key agreement. use super::{ops, scalar::SCALAR_LEN}; use crate::{agreement, bb, cpu, ec, error, rand}; use core::ffi::c_int; static CURVE25519: ec::Curve = ec::Curve { public_key_len: PUBLIC_KEY_LEN, elem_scalar_seed_len: ELEM_AND_SCALAR_LEN, id: ec::CurveID::Curve25519, check_private_key_bytes: x25519_check_private_key_bytes, generate_private_key: x25519_generate_private_key, public_from_private: x25519_public_from_private, }; /// X25519 (ECDH using Curve25519) as described in [RFC 7748]. /// /// Everything is as described in RFC 7748. Key agreement will fail if the /// result of the X25519 operation is zero; see the notes on the /// "all-zero value" in [RFC 7748 section 6.1]. /// /// [RFC 7748]: https://tools.ietf.org/html/rfc7748 /// [RFC 7748 section 6.1]: https://tools.ietf.org/html/rfc7748#section-6.1 pub static X25519: agreement::Algorithm = agreement::Algorithm { curve: &CURVE25519, ecdh: x25519_ecdh, }; #[allow(clippy::unnecessary_wraps)] fn x25519_check_private_key_bytes( bytes: &[u8], _: cpu::Features, ) -> Result<(), error::Unspecified> { debug_assert_eq!(bytes.len(), PRIVATE_KEY_LEN); Ok(()) } fn x25519_generate_private_key( rng: &dyn rand::SecureRandom, out: &mut [u8], _: cpu::Features, ) -> Result<(), error::Unspecified> { rng.fill(out) } fn x25519_public_from_private( public_out: &mut [u8], private_key: &ec::Seed, cpu_features: cpu::Features, ) -> Result<(), error::Unspecified> { let public_out = public_out.try_into()?; let private_key: &[u8; SCALAR_LEN] = private_key.bytes_less_safe().try_into()?; let private_key = ops::MaskedScalar::from_bytes_masked(*private_key); #[cfg(all( all(target_arch = "arm", target_endian = "little"), any(target_os = "android", target_os = "linux") ))] if let Some(cpu) = >::get_feature(&cpu_features) { static MONTGOMERY_BASE_POINT: [u8; 32] = [ 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]; x25519_neon(public_out, &private_key, &MONTGOMERY_BASE_POINT, cpu); return Ok(()); } prefixed_extern! 
{ fn x25519_public_from_private_generic_masked( public_key_out: &mut PublicKey, private_key: &PrivateKey, use_adx: c_int, ); } unsafe { x25519_public_from_private_generic_masked( public_out, &private_key, ops::has_fe25519_adx(cpu_features).into(), ); } Ok(()) } fn x25519_ecdh( out: &mut [u8], my_private_key: &ec::Seed, peer_public_key: untrusted::Input, cpu_features: cpu::Features, ) -> Result<(), error::Unspecified> { let my_private_key: &[u8; SCALAR_LEN] = my_private_key.bytes_less_safe().try_into()?; let my_private_key = ops::MaskedScalar::from_bytes_masked(*my_private_key); let peer_public_key: &[u8; PUBLIC_KEY_LEN] = peer_public_key.as_slice_less_safe().try_into()?; fn scalar_mult( out: &mut ops::EncodedPoint, scalar: &ops::MaskedScalar, point: &ops::EncodedPoint, #[allow(unused_variables)] cpu_features: cpu::Features, ) { #[cfg(all( all(target_arch = "arm", target_endian = "little"), any(target_os = "android", target_os = "linux") ))] if let Some(cpu) = >::get_feature(&cpu_features) { return x25519_neon(out, scalar, point, cpu); } #[cfg(all(target_arch = "x86_64", not(target_os = "windows")))] { if ops::has_fe25519_adx(cpu_features) { prefixed_extern! { fn x25519_scalar_mult_adx( out: &mut ops::EncodedPoint, scalar: &ops::MaskedScalar, point: &ops::EncodedPoint, ); } return unsafe { x25519_scalar_mult_adx(out, scalar, point) }; } } prefixed_extern! { fn x25519_scalar_mult_generic_masked( out: &mut ops::EncodedPoint, scalar: &ops::MaskedScalar, point: &ops::EncodedPoint, ); } unsafe { x25519_scalar_mult_generic_masked(out, scalar, point); } } scalar_mult( out.try_into()?, &my_private_key, peer_public_key, cpu_features, ); let zeros: SharedSecret = [0; SHARED_SECRET_LEN]; if bb::verify_slices_are_equal(out, &zeros).is_ok() { // All-zero output results when the input is a point of small order. return Err(error::Unspecified); } Ok(()) } // BoringSSL uses `!defined(OPENSSL_APPLE)`. #[cfg(all( all(target_arch = "arm", target_endian = "little"), any(target_os = "android", target_os = "linux") ))] fn x25519_neon( out: &mut ops::EncodedPoint, scalar: &ops::MaskedScalar, point: &ops::EncodedPoint, _cpu: cpu::arm::Neon, ) { prefixed_extern! { fn x25519_NEON( out: &mut ops::EncodedPoint, scalar: &ops::MaskedScalar, point: &ops::EncodedPoint, ); } unsafe { x25519_NEON(out, scalar, point) } } const ELEM_AND_SCALAR_LEN: usize = ops::ELEM_LEN; type PrivateKey = ops::MaskedScalar; const PRIVATE_KEY_LEN: usize = ELEM_AND_SCALAR_LEN; // An X25519 public key as an encoded Curve25519 point. type PublicKey = [u8; PUBLIC_KEY_LEN]; const PUBLIC_KEY_LEN: usize = ELEM_AND_SCALAR_LEN; // An X25519 shared secret as an encoded Curve25519 point. 
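// It is the 32-byte u-coordinate output of the X25519 function; an all-zero
// result (produced by a small-order peer point) is rejected in `x25519_ecdh`
// above, per RFC 7748 Section 6.1.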
type SharedSecret = [u8; SHARED_SECRET_LEN]; const SHARED_SECRET_LEN: usize = ELEM_AND_SCALAR_LEN; #[cfg(test)] mod tests { use super::*; use crate::ec; use untrusted::Input; #[test] fn test_x25519_public_from_private() { struct TestVector { private: [u8; 32], public: [u8; 32], } static TEST_CASES: &[TestVector] = &[ TestVector { private: [ 0x77, 0x07, 0x6d, 0x0a, 0x73, 0x18, 0xa5, 0x7d, 0x3c, 0x16, 0xc1, 0x72, 0x51, 0xb2, 0x66, 0x45, 0xdf, 0x4c, 0x2f, 0x87, 0xeb, 0xc0, 0x99, 0x2a, 0xb1, 0x77, 0xfb, 0xa5, 0x1d, 0xb9, 0x2c, 0x2a, ], public: [ 0x85, 0x20, 0xf0, 0x09, 0x89, 0x30, 0xa7, 0x54, 0x74, 0x8b, 0x7d, 0xdc, 0xb4, 0x3e, 0xf7, 0x5a, 0x0d, 0xbf, 0x3a, 0x0d, 0x26, 0x38, 0x1a, 0xf4, 0xeb, 0xa4, 0xa9, 0x8e, 0xaa, 0x9b, 0x4e, 0x6a, ], }, TestVector { private: [ 0x5d, 0xab, 0x08, 0x7e, 0x62, 0x4a, 0x8a, 0x4b, 0x79, 0xe1, 0x7f, 0x8b, 0x83, 0x80, 0x0e, 0xe6, 0x6f, 0x3b, 0xb1, 0x29, 0x26, 0x18, 0xb6, 0xfd, 0x1c, 0x2f, 0x8b, 0x27, 0xff, 0x88, 0xe0, 0xeb, ], public: [ 0xde, 0x9e, 0xdb, 0x7d, 0x7b, 0x7d, 0xc1, 0xb4, 0xd3, 0x5b, 0x61, 0xc2, 0xec, 0xe4, 0x35, 0x37, 0x3f, 0x83, 0x43, 0xc8, 0x5b, 0x78, 0x67, 0x4d, 0xad, 0xfc, 0x7e, 0x14, 0x6f, 0x88, 0x2b, 0x4f, ], }, ]; let cpu_features = cpu::features(); for test_case in TEST_CASES { let seed = ec::Seed::from_bytes(&CURVE25519, Input::from(&test_case.private), cpu_features) .unwrap(); let mut output = [0u8; 32]; x25519_public_from_private(&mut output, &seed, cpu_features).unwrap(); assert_eq!(output, test_case.public); } } } ring-0.17.14/src/ec/curve25519.rs000064400000000000000000000015611046102023000142160ustar 00000000000000// Copyright 2016 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! Elliptic curve operations and schemes using Curve25519. 
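// Illustrative usage sketch (added commentary, not part of the upstream
// source): how the X25519 implementation in this module is typically exercised
// through the crate's public `agreement` API. This assumes the 0.17-series
// `agreement::agree_ephemeral(my_private_key, &peer_public_key, kdf)`
// signature; the module and test names are made up for the example.
#[cfg(test)]
mod agreement_example {
    use crate::{agreement, error, rand};

    #[test]
    fn x25519_round_trip() -> Result<(), error::Unspecified> {
        let rng = rand::SystemRandom::new();

        // Each side generates an ephemeral private key and exchanges only the
        // corresponding public key.
        let a_private = agreement::EphemeralPrivateKey::generate(&agreement::X25519, &rng)?;
        let a_public = a_private.compute_public_key()?;
        let b_private = agreement::EphemeralPrivateKey::generate(&agreement::X25519, &rng)?;
        let b_public = b_private.compute_public_key()?;

        // Each side combines its private key with the peer's public key; both
        // derive the same 32-byte shared secret.
        let kdf = |secret: &[u8]| {
            let mut out = [0u8; 32];
            out.copy_from_slice(secret);
            out
        };
        let a_secret = agreement::agree_ephemeral(
            a_private,
            &agreement::UnparsedPublicKey::new(&agreement::X25519, b_public.as_ref()),
            kdf,
        )?;
        let b_secret = agreement::agree_ephemeral(
            b_private,
            &agreement::UnparsedPublicKey::new(&agreement::X25519, a_public.as_ref()),
            kdf,
        )?;
        assert_eq!(a_secret, b_secret);
        Ok(())
    }
}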
pub mod ed25519; pub mod x25519; mod ops; mod scalar; ring-0.17.14/src/ec/keys.rs000064400000000000000000000047221046102023000134410ustar 00000000000000use super::{Curve, ELEM_MAX_BYTES, SEED_MAX_BYTES}; use crate::{cpu, error, rand}; pub struct KeyPair { seed: Seed, public_key: PublicKey, } impl KeyPair { pub(super) fn derive( seed: Seed, cpu_features: cpu::Features, ) -> Result { let public_key = seed.compute_public_key(cpu_features)?; Ok(Self { seed, public_key }) } pub fn public_key(&self) -> &PublicKey { &self.public_key } pub fn split(self) -> (Seed, PublicKey) { (self.seed, self.public_key) } } pub struct Seed { bytes: [u8; SEED_MAX_BYTES], curve: &'static Curve, } impl Seed { pub(crate) fn generate( curve: &'static Curve, rng: &dyn rand::SecureRandom, cpu: cpu::Features, ) -> Result { let mut r = Self { bytes: [0u8; SEED_MAX_BYTES], curve, }; (curve.generate_private_key)(rng, &mut r.bytes[..curve.elem_scalar_seed_len], cpu)?; Ok(r) } pub(crate) fn from_bytes( curve: &'static Curve, bytes: untrusted::Input, cpu: cpu::Features, ) -> Result { let bytes = bytes.as_slice_less_safe(); if curve.elem_scalar_seed_len != bytes.len() { return Err(error::Unspecified); } (curve.check_private_key_bytes)(bytes, cpu)?; let mut r = Self { bytes: [0; SEED_MAX_BYTES], curve, }; r.bytes[..curve.elem_scalar_seed_len].copy_from_slice(bytes); Ok(r) } pub fn bytes_less_safe(&self) -> &[u8] { &self.bytes[..self.curve.elem_scalar_seed_len] } pub(crate) fn compute_public_key( &self, cpu_features: cpu::Features, ) -> Result { let mut public_key = PublicKey { bytes: [0u8; PUBLIC_KEY_MAX_LEN], len: self.curve.public_key_len, }; (self.curve.public_from_private)( &mut public_key.bytes[..public_key.len], self, cpu_features, )?; Ok(public_key) } } #[derive(Copy, Clone)] pub struct PublicKey { bytes: [u8; PUBLIC_KEY_MAX_LEN], len: usize, } impl AsRef<[u8]> for PublicKey { fn as_ref(&self) -> &[u8] { &self.bytes[..self.len] } } /// The maximum length, in bytes, of an encoded public key. pub const PUBLIC_KEY_MAX_LEN: usize = 1 + (2 * ELEM_MAX_BYTES); ring-0.17.14/src/ec/suite_b/curve.rs000064400000000000000000000070371046102023000152460ustar 00000000000000// Copyright 2015-2017 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use crate::{cpu, ec, error, rand}; /// A key agreement algorithm. macro_rules! suite_b_curve { ( $NAME:ident, $bits:expr, $private_key_ops:expr, $id:expr, $check_private_key_bytes:ident, $generate_private_key:ident, $public_from_private:ident) => { /// Public keys are encoding in uncompressed form using the /// Octet-String-to-Elliptic-Curve-Point algorithm in /// [SEC 1: Elliptic Curve Cryptography, Version 2.0]. Public keys are /// validated during key agreement according to /// [NIST Special Publication 800-56A, revision 2] and Appendix B.3 of /// the NSA's [Suite B Implementer's Guide to NIST SP 800-56A]. 
/// /// [SEC 1: Elliptic Curve Cryptography, Version 2.0]: /// http://www.secg.org/sec1-v2.pdf /// [NIST Special Publication 800-56A, revision 2]: /// http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-56Ar2.pdf /// [Suite B Implementer's Guide to NIST SP 800-56A]: /// https://github.com/briansmith/ring/blob/main/doc/ecdh.pdf pub static $NAME: ec::Curve = ec::Curve { public_key_len: 1 + (2 * (($bits + 7) / 8)), elem_scalar_seed_len: ($bits + 7) / 8, id: $id, check_private_key_bytes: $check_private_key_bytes, generate_private_key: $generate_private_key, public_from_private: $public_from_private, }; fn $check_private_key_bytes( bytes: &[u8], cpu: cpu::Features, ) -> Result<(), error::Unspecified> { debug_assert_eq!(bytes.len(), $bits / 8); ec::suite_b::private_key::check_scalar_big_endian_bytes($private_key_ops, bytes, cpu) } fn $generate_private_key( rng: &dyn rand::SecureRandom, out: &mut [u8], cpu: cpu::Features, ) -> Result<(), error::Unspecified> { ec::suite_b::private_key::generate_private_scalar_bytes($private_key_ops, rng, out, cpu) } fn $public_from_private( public_out: &mut [u8], private_key: &ec::Seed, cpu: cpu::Features, ) -> Result<(), error::Unspecified> { ec::suite_b::private_key::public_from_private( $private_key_ops, public_out, private_key, cpu, ) } }; } suite_b_curve!( P256, 256, &ec::suite_b::ops::p256::PRIVATE_KEY_OPS, ec::CurveID::P256, p256_check_private_key_bytes, p256_generate_private_key, p256_public_from_private ); suite_b_curve!( P384, 384, &ec::suite_b::ops::p384::PRIVATE_KEY_OPS, ec::CurveID::P384, p384_check_private_key_bytes, p384_generate_private_key, p384_public_from_private ); ring-0.17.14/src/ec/suite_b/ecdh.rs000064400000000000000000000234341046102023000150240ustar 00000000000000// Copyright 2015-2017 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! ECDH key agreement using the P-256 and P-384 curves. use super::{ops::*, private_key::*, public_key::*}; use crate::{agreement, cpu, ec, error}; /// A key agreement algorithm. macro_rules! ecdh { ( $NAME:ident, $curve:expr, $name_str:expr, $private_key_ops:expr, $public_key_ops:expr, $ecdh:ident ) => { #[doc = "ECDH using the NSA Suite B"] #[doc=$name_str] #[doc = "curve."] /// /// Public keys are encoding in uncompressed form using the /// Octet-String-to-Elliptic-Curve-Point algorithm in /// [SEC 1: Elliptic Curve Cryptography, Version 2.0]. Public keys are /// validated during key agreement according to /// [NIST Special Publication 800-56A, revision 2] and Appendix B.3 of /// the NSA's [Suite B Implementer's Guide to NIST SP 800-56A]. 
/// /// [SEC 1: Elliptic Curve Cryptography, Version 2.0]: /// http://www.secg.org/sec1-v2.pdf /// [NIST Special Publication 800-56A, revision 2]: /// http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-56Ar2.pdf /// [Suite B Implementer's Guide to NIST SP 800-56A]: /// https://github.com/briansmith/ring/blob/main/doc/ecdh.pdf pub static $NAME: agreement::Algorithm = agreement::Algorithm { curve: $curve, ecdh: $ecdh, }; fn $ecdh( out: &mut [u8], my_private_key: &ec::Seed, peer_public_key: untrusted::Input, cpu: cpu::Features, ) -> Result<(), error::Unspecified> { ecdh( $private_key_ops, $public_key_ops, out, my_private_key, peer_public_key, cpu, ) } }; } ecdh!( ECDH_P256, &ec::suite_b::curve::P256, "P-256 (secp256r1)", &p256::PRIVATE_KEY_OPS, &p256::PUBLIC_KEY_OPS, p256_ecdh ); ecdh!( ECDH_P384, &ec::suite_b::curve::P384, "P-384 (secp384r1)", &p384::PRIVATE_KEY_OPS, &p384::PUBLIC_KEY_OPS, p384_ecdh ); fn ecdh( private_key_ops: &PrivateKeyOps, public_key_ops: &PublicKeyOps, out: &mut [u8], my_private_key: &ec::Seed, peer_public_key: untrusted::Input, cpu: cpu::Features, ) -> Result<(), error::Unspecified> { // The NIST SP 800-56Ar2 steps are from section 5.7.1.2 Elliptic Curve // Cryptography Cofactor Diffie-Hellman (ECC CDH) Primitive. // // The "NSA Guide" steps are from section 3.1 of the NSA guide, "Ephemeral // Unified Model." let q = &public_key_ops.common.elem_modulus(cpu); // NSA Guide Step 1 is handled separately. // NIST SP 800-56Ar2 5.6.2.2.2. // NSA Guide Step 2. // // `parse_uncompressed_point` verifies that the point is not at infinity // and that it is on the curve, using the Partial Public-Key Validation // Routine. let peer_public_key = parse_uncompressed_point(public_key_ops, q, peer_public_key)?; // NIST SP 800-56Ar2 Step 1. // NSA Guide Step 3 (except point at infinity check). // // Note that the cofactor (h) is one since we only support prime-order // curves, so we can safely ignore the cofactor. // // It is impossible for the result to be the point at infinity because our // private key is in the range [1, n) and the curve has prime order and // `parse_uncompressed_point` verified that the peer public key is on the // curve and not at infinity. However, since the standards require the // check, we do it using `assert!`. // // NIST SP 800-56Ar2 defines "Destroy" thusly: "In this Recommendation, to // destroy is an action applied to a key or a piece of secret data. After // a key or a piece of secret data is destroyed, no information about its // value can be recovered." We interpret "destroy" somewhat liberally: we // assume that since we throw away the values to be destroyed, no // information about their values can be recovered. This doesn't meet the // NSA guide's explicit requirement to "zeroize" them though. // TODO: this only needs common scalar ops let n = &private_key_ops.common.scalar_modulus(cpu); let my_private_key = private_key_as_scalar(n, my_private_key); let product = private_key_ops.point_mul(&my_private_key, &peer_public_key, cpu); // NIST SP 800-56Ar2 Steps 2, 3, 4, and 5. // NSA Guide Steps 3 (point at infinity check) and 4. // // Again, we have a pretty liberal interpretation of the NIST's spec's // "Destroy" that doesn't meet the NSA requirement to "zeroize." // `big_endian_affine_from_jacobian` verifies that the result is not at // infinity and also does an extra check to verify that the point is on // the curve. big_endian_affine_from_jacobian(private_key_ops, q, out, None, &product) // NSA Guide Step 5 & 6 are deferred to the caller. 
Again, we have a // pretty liberal interpretation of the NIST's spec's "Destroy" that // doesn't meet the NSA requirement to "zeroize." } #[cfg(test)] mod tests { use super::super::ops; use crate::testutil as test; use crate::{agreement, ec, limb}; static SUPPORTED_SUITE_B_ALGS: [(&str, &agreement::Algorithm, &ec::Curve, &ops::CommonOps); 2] = [ ( "P-256", &agreement::ECDH_P256, &super::super::curve::P256, &ops::p256::COMMON_OPS, ), ( "P-384", &agreement::ECDH_P384, &super::super::curve::P384, &ops::p384::COMMON_OPS, ), ]; #[test] fn test_agreement_suite_b_ecdh_generate() { // Generates a string of bytes 0x00...00, which will always result in // a scalar value of zero. let random_00 = test::rand::FixedByteRandom { byte: 0x00 }; // Generates a string of bytes 0xFF...FF, which will be larger than the // group order of any curve that is supported. let random_ff = test::rand::FixedByteRandom { byte: 0xff }; for &(_, alg, curve, ops) in SUPPORTED_SUITE_B_ALGS.iter() { // Test that the private key value zero is rejected and that // `generate` gives up after a while of only getting zeros. assert!(agreement::EphemeralPrivateKey::generate(alg, &random_00).is_err()); // Test that the private key value larger than the group order is // rejected and that `generate` gives up after a while of only // getting values larger than the group order. assert!(agreement::EphemeralPrivateKey::generate(alg, &random_ff).is_err()); // Test that a private key value exactly equal to the group order // is rejected and that `generate` gives up after a while of only // getting that value from the PRNG. let mut n_bytes = [0u8; ec::SCALAR_MAX_BYTES]; let num_bytes = curve.elem_scalar_seed_len; limb::big_endian_from_limbs(ops.n_limbs(), &mut n_bytes[..num_bytes]); { let n_bytes = &mut n_bytes[..num_bytes]; let rng = test::rand::FixedSliceRandom { bytes: n_bytes }; assert!(agreement::EphemeralPrivateKey::generate(alg, &rng).is_err()); } // Test that a private key value exactly equal to the group order // minus 1 is accepted. let mut n_minus_1_bytes = n_bytes; { let n_minus_1_bytes = &mut n_minus_1_bytes[..num_bytes]; n_minus_1_bytes[num_bytes - 1] -= 1; let rng = test::rand::FixedSliceRandom { bytes: n_minus_1_bytes, }; let key = agreement::EphemeralPrivateKey::generate(alg, &rng).unwrap(); assert_eq!(n_minus_1_bytes, key.bytes_for_test()); } // Test that n + 1 also fails. let mut n_plus_1_bytes = n_bytes; { let n_plus_1_bytes = &mut n_plus_1_bytes[..num_bytes]; n_plus_1_bytes[num_bytes - 1] += 1; let rng = test::rand::FixedSliceRandom { bytes: n_plus_1_bytes, }; assert!(agreement::EphemeralPrivateKey::generate(alg, &rng).is_err()); } // Test recovery from initial RNG failure. The first value will be // n, then n + 1, then zero, the next value will be n - 1, which // will be accepted. { let bytes = [ &n_bytes[..num_bytes], &n_plus_1_bytes[..num_bytes], &[0u8; ec::SCALAR_MAX_BYTES][..num_bytes], &n_minus_1_bytes[..num_bytes], ]; let rng = test::rand::FixedSliceSequenceRandom { bytes: &bytes, current: core::cell::UnsafeCell::new(0), }; let key = agreement::EphemeralPrivateKey::generate(alg, &rng).unwrap(); assert_eq!(&n_minus_1_bytes[..num_bytes], key.bytes_for_test()); } } } } ring-0.17.14/src/ec/suite_b/ecdsa/digest_scalar.rs000064400000000000000000000120031046102023000177720ustar 00000000000000// Copyright 2015-2016 Brian Smith. 
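// Illustrative sketch (an editorial addition, not part of the upstream sources): the ECDH
// primitive implemented above is normally reached through the crate's public `agreement` API.
// The test-style sketch below runs an ephemeral P-256 agreement end to end. The closure-KDF
// form of `agree_ephemeral` (no separate error-value parameter) is assumed from the ring 0.17
// public API and should be checked against the crate documentation.
#[cfg(test)]
mod ecdh_agreement_usage_sketch {
    use crate::{agreement, rand};

    #[test]
    fn p256_ephemeral_agreement_sketch() {
        let rng = rand::SystemRandom::new();

        // Each side generates an ephemeral key pair and sends its peer the
        // uncompressed public point.
        let my_private =
            agreement::EphemeralPrivateKey::generate(&agreement::ECDH_P256, &rng).unwrap();
        let peer_private =
            agreement::EphemeralPrivateKey::generate(&agreement::ECDH_P256, &rng).unwrap();
        let my_public = my_private.compute_public_key().unwrap();
        let peer_public = agreement::UnparsedPublicKey::new(
            &agreement::ECDH_P256,
            peer_private.compute_public_key().unwrap(),
        );

        // `agree_ephemeral` consumes the private key (one use only) and hands the
        // affine-X shared secret to the caller's KDF closure; deriving keying
        // material from it (NSA Guide Steps 5 & 6) is the caller's job.
        let my_secret: [u8; 32] = agreement::agree_ephemeral(my_private, &peer_public, |km| {
            let mut out = [0u8; 32];
            out.copy_from_slice(km);
            out
        })
        .unwrap();

        // The peer derives the same secret from its private key and our public point.
        let my_public = agreement::UnparsedPublicKey::new(&agreement::ECDH_P256, my_public);
        let peer_secret: [u8; 32] = agreement::agree_ephemeral(peer_private, &my_public, |km| {
            let mut out = [0u8; 32];
            out.copy_from_slice(km);
            out
        })
        .unwrap();

        assert_eq!(my_secret, peer_secret);
    }
}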
// // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! ECDSA Signatures using the P-256 and P-384 curves. use crate::{digest, ec::suite_b::ops::*}; /// Calculate the digest of `msg` using the digest algorithm `digest_alg`. Then /// convert the digest to a scalar in the range [0, n) as described in /// NIST's FIPS 186-4 Section 4.2. Note that this is one of the few cases where /// a `Scalar` is allowed to have the value zero. /// /// NIST's FIPS 186-4 4.2 says "When the length of the output of the hash /// function is greater than N (i.e., the bit length of q), then the leftmost N /// bits of the hash function output block shall be used in any calculation /// using the hash function output during the generation or verification of a /// digital signature." /// /// "Leftmost N bits" means "N most significant bits" because we interpret the /// digest as a bit-endian encoded integer. /// /// The NSA guide instead vaguely suggests that we should convert the digest /// value to an integer and then reduce it mod `n`. However, real-world /// implementations (e.g. `digest_to_bn` in OpenSSL and `hashToInt` in Go) do /// what FIPS 186-4 says to do, not what the NSA guide suggests. /// /// Why shifting the value right by at most one bit is sufficient: P-256's `n` /// has its 256th bit set; i.e. 2**255 < n < 2**256. Once we've truncated the /// digest to 256 bits and converted it to an integer, it will have a value /// less than 2**256. If the value is larger than `n` then shifting it one bit /// right will give a value less than 2**255, which is less than `n`. The /// analogous argument applies for P-384. However, it does *not* apply in /// general; for example, it doesn't apply to P-521. pub(super) fn digest_scalar(n: &Modulus, msg: digest::Digest) -> Scalar { digest_scalar_(n, msg.as_ref()) } #[cfg(test)] pub(super) fn digest_bytes_scalar(n: &Modulus, digest: &[u8]) -> Scalar { digest_scalar_(n, digest) } // This is a separate function solely so that we can test specific digest // values like all-zero values and values larger than `n`. 
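// Illustrative sketch (an editorial addition): a toy, word-sized picture of the
// truncate-then-reduce rule that `digest_scalar_` below applies with limb arithmetic via the
// partially-reduced scalar parse. Once the digest is truncated to the bit length of a group
// order whose top bit is set (as P-256's and P-384's are), the result is below 2n, so a single
// conditional subtraction finishes the reduction. The constants here are hypothetical stand-ins
// for the real curve parameters.
#[cfg(test)]
mod digest_truncation_sketch {
    // Toy group order with its top bit set: 2**15 < N_TOY < 2**16.
    const N_TOY: u32 = 0xFF37;

    /// FIPS 186-4 4.2 style: keep the leftmost 16 bits of the digest, then
    /// reduce with at most one subtraction.
    fn digest_to_scalar_toy(digest: u32, digest_bits: u32) -> u32 {
        let n_bits = 16;
        let truncated = if digest_bits > n_bits {
            digest >> (digest_bits - n_bits)
        } else {
            digest
        };
        // truncated < 2**16 < 2 * N_TOY, so one conditional subtraction is
        // enough to land in [0, N_TOY).
        if truncated >= N_TOY {
            truncated - N_TOY
        } else {
            truncated
        }
    }

    #[test]
    fn truncates_then_reduces() {
        // A 24-bit "digest" whose leftmost 16 bits are 0xFFFF.
        let scalar = digest_to_scalar_toy(0x00FF_FFFF, 24);
        assert!(scalar < N_TOY);
        assert_eq!(scalar, 0xFFFF - N_TOY);
    }
}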
fn digest_scalar_(n: &Modulus, digest: &[u8]) -> Scalar { let len = n.bytes_len(); let digest = if digest.len() > len { &digest[..len] } else { digest }; scalar_parse_big_endian_partially_reduced_variable_consttime(n, untrusted::Input::from(digest)) .unwrap() } #[cfg(test)] mod tests { use super::digest_bytes_scalar; use crate::testutil as test; use crate::{cpu, digest, ec::suite_b::ops::*, limb}; #[test] fn test() { let cpu = cpu::features(); test::run( test_vector_file!("ecdsa_digest_scalar_tests.txt"), |section, test_case| { assert_eq!(section, ""); let curve_name = test_case.consume_string("Curve"); let digest_name = test_case.consume_string("Digest"); let input = test_case.consume_bytes("Input"); let output = test_case.consume_bytes("Output"); let (ops, digest_alg) = match (curve_name.as_str(), digest_name.as_str()) { ("P-256", "SHA256") => (&p256::PUBLIC_SCALAR_OPS, &digest::SHA256), ("P-256", "SHA384") => (&p256::PUBLIC_SCALAR_OPS, &digest::SHA384), ("P-384", "SHA256") => (&p384::PUBLIC_SCALAR_OPS, &digest::SHA256), ("P-384", "SHA384") => (&p384::PUBLIC_SCALAR_OPS, &digest::SHA384), _ => { panic!("Unsupported curve+digest: {}+{}", curve_name, digest_name); } }; let n = &ops.scalar_ops.scalar_modulus(cpu); assert_eq!(input.len(), digest_alg.output_len()); assert_eq!(output.len(), ops.scalar_ops.scalar_bytes_len()); assert_eq!(output.len(), n.bytes_len()); let expected = scalar_parse_big_endian_variable( n, limb::AllowZero::Yes, untrusted::Input::from(&output), ) .unwrap(); let actual = digest_bytes_scalar(n, &input); assert_eq!( ops.scalar_ops.leak_limbs(&actual), ops.scalar_ops.leak_limbs(&expected) ); Ok(()) }, ); } } ring-0.17.14/src/ec/suite_b/ecdsa/ecPublicKey_p256_pkcs8_v1_template.der000064400000000000000000000000511046102023000237000ustar 0000000000000000*H=*H=m0k DBring-0.17.14/src/ec/suite_b/ecdsa/ecPublicKey_p384_pkcs8_v1_template.der000064400000000000000000000000501046102023000237010ustar 0000000000000000*H=+"00dbring-0.17.14/src/ec/suite_b/ecdsa/signing.rs000064400000000000000000000527041046102023000166400ustar 00000000000000// Copyright 2015-2016 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! ECDSA Signatures using the P-256 and P-384 curves. use super::digest_scalar::digest_scalar; use crate::{ arithmetic::montgomery::*, cpu, digest, ec::{ self, suite_b::{ops::*, private_key}, }, error, io::der, limb, pkcs8, rand, sealed, signature, }; /// An ECDSA signing algorithm. 
pub struct EcdsaSigningAlgorithm { curve: &'static ec::Curve, private_scalar_ops: &'static PrivateScalarOps, private_key_ops: &'static PrivateKeyOps, digest_alg: &'static digest::Algorithm, pkcs8_template: &'static pkcs8::Template, format_rs: fn(ops: &'static ScalarOps, r: &Scalar, s: &Scalar, out: &mut [u8]) -> usize, id: AlgorithmID, } #[derive(Debug, Eq, PartialEq)] enum AlgorithmID { ECDSA_P256_SHA256_FIXED_SIGNING, ECDSA_P384_SHA384_FIXED_SIGNING, ECDSA_P256_SHA256_ASN1_SIGNING, ECDSA_P384_SHA384_ASN1_SIGNING, } derive_debug_via_id!(EcdsaSigningAlgorithm); impl PartialEq for EcdsaSigningAlgorithm { fn eq(&self, other: &Self) -> bool { self.id == other.id } } impl Eq for EcdsaSigningAlgorithm {} impl sealed::Sealed for EcdsaSigningAlgorithm {} /// An ECDSA key pair, used for signing. pub struct EcdsaKeyPair { d: Scalar, nonce_key: NonceRandomKey, alg: &'static EcdsaSigningAlgorithm, public_key: PublicKey, } derive_debug_via_field!(EcdsaKeyPair, stringify!(EcdsaKeyPair), public_key); impl EcdsaKeyPair { /// Generates a new key pair and returns the key pair serialized as a /// PKCS#8 document. /// /// The PKCS#8 document will be a v1 `OneAsymmetricKey` with the public key /// included in the `ECPrivateKey` structure, as described in /// [RFC 5958 Section 2] and [RFC 5915]. The `ECPrivateKey` structure will /// not have a `parameters` field so the generated key is compatible with /// PKCS#11. /// /// [RFC 5915]: https://tools.ietf.org/html/rfc5915 /// [RFC 5958 Section 2]: https://tools.ietf.org/html/rfc5958#section-2 pub fn generate_pkcs8( alg: &'static EcdsaSigningAlgorithm, rng: &dyn rand::SecureRandom, ) -> Result { let cpu = cpu::features(); let private_key = ec::Seed::generate(alg.curve, rng, cpu)?; let public_key = private_key.compute_public_key(cpu)?; Ok(pkcs8::wrap_key( alg.pkcs8_template, private_key.bytes_less_safe(), public_key.as_ref(), )) } /// Constructs an ECDSA key pair by parsing an unencrypted PKCS#8 v1 /// id-ecPublicKey `ECPrivateKey` key. /// /// The input must be in PKCS#8 v1 format. It must contain the public key in /// the `ECPrivateKey` structure; `from_pkcs8()` will verify that the public /// key and the private key are consistent with each other. The algorithm /// identifier must identify the curve by name; it must not use an /// "explicit" encoding of the curve. The `parameters` field of the /// `ECPrivateKey`, if present, must be the same named curve that is in the /// algorithm identifier in the PKCS#8 header. pub fn from_pkcs8( alg: &'static EcdsaSigningAlgorithm, pkcs8: &[u8], rng: &dyn rand::SecureRandom, ) -> Result { let key_pair = ec::suite_b::key_pair_from_pkcs8( alg.curve, alg.pkcs8_template, untrusted::Input::from(pkcs8), cpu::features(), )?; Self::new(alg, key_pair, rng) } /// Constructs an ECDSA key pair from the private key and public key bytes /// /// The private key must encoded as a big-endian fixed-length integer. For /// example, a P-256 private key must be 32 bytes prefixed with leading /// zeros as needed. /// /// The public key is encoding in uncompressed form using the /// Octet-String-to-Elliptic-Curve-Point algorithm in /// [SEC 1: Elliptic Curve Cryptography, Version 2.0]. /// /// This is intended for use by code that deserializes key pairs. It is /// recommended to use `EcdsaKeyPair::from_pkcs8()` (with a PKCS#8-encoded /// key) instead. 
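// Illustrative sketch (an editorial addition): the constructors documented above and `sign`
// below, exercised through the crate's public `signature` API: generate a PKCS#8 v1 document,
// re-load it, sign, and verify with the matching fixed-length verification algorithm. Error
// handling is collapsed to `unwrap()` for brevity.
#[cfg(test)]
mod ecdsa_key_pair_usage_sketch {
    use crate::rand;
    use crate::signature::{self, KeyPair};

    #[test]
    fn generate_load_sign_verify_sketch() {
        let rng = rand::SystemRandom::new();
        let alg = &signature::ECDSA_P256_SHA256_FIXED_SIGNING;

        // Generate a fresh key pair serialized as a PKCS#8 v1 document.
        let pkcs8 = signature::EcdsaKeyPair::generate_pkcs8(alg, &rng).unwrap();

        // Re-load it; `from_pkcs8` checks that the embedded public key is
        // consistent with the private key.
        let key_pair =
            signature::EcdsaKeyPair::from_pkcs8(alg, pkcs8.as_ref(), &rng).unwrap();

        // Sign with a per-signature random nonce, hedged with the message
        // digest as described in `sign_digest` below.
        const MESSAGE: &[u8] = b"hello, world";
        let sig = key_pair.sign(&rng, MESSAGE).unwrap();

        // Verify with the fixed-length (r || s) verification algorithm that
        // pairs with the chosen signing algorithm.
        let public_key = signature::UnparsedPublicKey::new(
            &signature::ECDSA_P256_SHA256_FIXED,
            key_pair.public_key().as_ref(),
        );
        public_key.verify(MESSAGE, sig.as_ref()).unwrap();
    }
}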
/// /// [SEC 1: Elliptic Curve Cryptography, Version 2.0]: /// http://www.secg.org/sec1-v2.pdf pub fn from_private_key_and_public_key( alg: &'static EcdsaSigningAlgorithm, private_key: &[u8], public_key: &[u8], rng: &dyn rand::SecureRandom, ) -> Result { let key_pair = ec::suite_b::key_pair_from_bytes( alg.curve, untrusted::Input::from(private_key), untrusted::Input::from(public_key), cpu::features(), )?; Self::new(alg, key_pair, rng) } fn new( alg: &'static EcdsaSigningAlgorithm, key_pair: ec::KeyPair, rng: &dyn rand::SecureRandom, ) -> Result { let cpu = cpu::features(); let (seed, public_key) = key_pair.split(); let n = &alg.private_scalar_ops.scalar_ops.scalar_modulus(cpu); let d = private_key::private_key_as_scalar(n, &seed); let d = alg.private_scalar_ops.to_mont(&d, cpu); let nonce_key = NonceRandomKey::new(alg, &seed, rng)?; Ok(Self { d, nonce_key, alg, public_key: PublicKey(public_key), }) } /// Returns the signature of the `message` using a random nonce generated by `rng`. pub fn sign( &self, rng: &dyn rand::SecureRandom, message: &[u8], ) -> Result { let cpu = cpu::features(); // Step 4 (out of order). let h = digest::digest(self.alg.digest_alg, message); // Incorporate `h` into the nonce to hedge against faulty RNGs. (This // is not an approved random number generator that is mandated in // the spec.) let nonce_rng = NonceRandom { key: &self.nonce_key, message_digest: &h, rng, }; self.sign_digest(h, &nonce_rng, cpu) } #[cfg(test)] fn sign_with_fixed_nonce_during_test( &self, rng: &dyn rand::SecureRandom, message: &[u8], ) -> Result { // Step 4 (out of order). let h = digest::digest(self.alg.digest_alg, message); self.sign_digest(h, rng, cpu::features()) } /// Returns the signature of message digest `h` using a "random" nonce /// generated by `rng`. fn sign_digest( &self, h: digest::Digest, rng: &dyn rand::SecureRandom, cpu: cpu::Features, ) -> Result { // NSA Suite B Implementer's Guide to ECDSA Section 3.4.1: ECDSA // Signature Generation. // NSA Guide Prerequisites: // // Prior to generating an ECDSA signature, the signatory shall // obtain: // // 1. an authentic copy of the domain parameters, // 2. a digital signature key pair (d,Q), either generated by a // method from Appendix A.1, or obtained from a trusted third // party, // 3. assurance of the validity of the public key Q (see Appendix // A.3), and // 4. assurance that he/she/it actually possesses the associated // private key d (see [SP800-89] Section 6). // // The domain parameters are hard-coded into the source code. // `EcdsaKeyPair::generate_pkcs8()` can be used to meet the second // requirement; otherwise, it is up to the user to ensure the key pair // was obtained from a trusted private key. The constructors for // `EcdsaKeyPair` ensure that #3 and #4 are met subject to the caveats // in SP800-89 Section 6. let ops = self.alg.private_scalar_ops; let scalar_ops = ops.scalar_ops; let cops = scalar_ops.common; let private_key_ops = self.alg.private_key_ops; let q = &cops.elem_modulus(cpu); let n = &scalar_ops.scalar_modulus(cpu); for _ in 0..100 { // XXX: iteration conut? // Step 1. let k = private_key::random_scalar(self.alg.private_key_ops, n, rng)?; let k_inv = ops.scalar_inv_to_mont(&k, cpu); // Step 2. let r = private_key_ops.point_mul_base(&k, cpu); // Step 3. let r = { let (x, _) = private_key::affine_from_jacobian(private_key_ops, q, &r)?; let x = q.elem_unencoded(&x); n.elem_reduced_to_scalar(&x) }; if n.is_zero(&r) { continue; } // Step 4 is done by the caller. // Step 5. 
let e = digest_scalar(n, h); // Step 6. let s = { let mut e_plus_dr = scalar_ops.scalar_product(&self.d, &r, cpu); n.add_assign(&mut e_plus_dr, &e); scalar_ops.scalar_product(&k_inv, &e_plus_dr, cpu) }; if n.is_zero(&s) { continue; } // Step 7 with encoding. return Ok(signature::Signature::new(|sig_bytes| { (self.alg.format_rs)(scalar_ops, &r, &s, sig_bytes) })); } Err(error::Unspecified) } } /// Generates an ECDSA nonce in a way that attempts to protect against a faulty /// `SecureRandom`. struct NonceRandom<'a> { key: &'a NonceRandomKey, message_digest: &'a digest::Digest, rng: &'a dyn rand::SecureRandom, } impl core::fmt::Debug for NonceRandom<'_> { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { f.debug_struct("NonceRandom").finish() } } impl rand::sealed::SecureRandom for NonceRandom<'_> { fn fill_impl(&self, dest: &mut [u8]) -> Result<(), error::Unspecified> { // Use the same digest algorithm that will be used to digest the // message. The digest algorithm's output is exactly the right size; // this is checked below. // // XXX(perf): The single iteration will require two digest block // operations because the amount of data digested is larger than one // block. let digest_alg = self.key.0.algorithm(); let mut ctx = digest::Context::new(digest_alg); // Digest the randomized digest of the private key. let key = self.key.0.as_ref(); ctx.update(key); // The random value is digested between the key and the message so that // the key and the message are not directly digested in the same digest // block. assert!(key.len() <= digest_alg.block_len() / 2); { let mut rand = [0u8; digest::MAX_BLOCK_LEN]; let rand = &mut rand[..digest_alg.block_len() - key.len()]; assert!(rand.len() >= dest.len()); self.rng.fill(rand)?; ctx.update(rand); } ctx.update(self.message_digest.as_ref()); let nonce = ctx.finish(); // `copy_from_slice()` panics if the lengths differ, so we don't have // to separately assert that the lengths are the same. dest.copy_from_slice(nonce.as_ref()); Ok(()) } } impl sealed::Sealed for NonceRandom<'_> {} struct NonceRandomKey(digest::Digest); impl NonceRandomKey { fn new( alg: &EcdsaSigningAlgorithm, seed: &ec::Seed, rng: &dyn rand::SecureRandom, ) -> Result { let mut rand = [0; digest::MAX_OUTPUT_LEN]; let rand = &mut rand[0..alg.curve.elem_scalar_seed_len]; // XXX: `KeyRejected` isn't the right way to model failure of the RNG, // but to fix that we'd need to break the API by changing the result type. // TODO: Fix the API in the next breaking release. 
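// Illustrative sketch (an editorial addition): a simplified, standalone model of the nonce
// derivation that `NonceRandom::fill_impl` above performs. The per-key secret, fresh RNG
// output, and the message digest all feed one hash, so a faulty RNG alone does not determine
// the nonce. This sketch fixes SHA-256 and ignores the block-alignment details of the real code.
#[allow(dead_code)]
fn nonce_derivation_sketch(
    nonce_key: &[u8],      // the NonceRandomKey digest
    rng_output: &[u8],     // fresh output from the SecureRandom
    message_digest: &[u8], // H(message), already computed by `sign`
) -> crate::digest::Digest {
    let mut ctx = crate::digest::Context::new(&crate::digest::SHA256);
    ctx.update(nonce_key);
    ctx.update(rng_output); // the randomness sits between key and message
    ctx.update(message_digest);
    ctx.finish()
}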
rng.fill(rand) .map_err(|error::Unspecified| error::KeyRejected::rng_failed())?; let mut ctx = digest::Context::new(alg.digest_alg); ctx.update(rand); ctx.update(seed.bytes_less_safe()); Ok(Self(ctx.finish())) } } impl signature::KeyPair for EcdsaKeyPair { type PublicKey = PublicKey; fn public_key(&self) -> &Self::PublicKey { &self.public_key } } #[derive(Clone, Copy)] pub struct PublicKey(ec::PublicKey); derive_debug_self_as_ref_hex_bytes!(PublicKey); impl AsRef<[u8]> for PublicKey { fn as_ref(&self) -> &[u8] { self.0.as_ref() } } fn format_rs_fixed(ops: &'static ScalarOps, r: &Scalar, s: &Scalar, out: &mut [u8]) -> usize { let scalar_len = ops.scalar_bytes_len(); let (r_out, rest) = out.split_at_mut(scalar_len); limb::big_endian_from_limbs(ops.leak_limbs(r), r_out); let (s_out, _) = rest.split_at_mut(scalar_len); limb::big_endian_from_limbs(ops.leak_limbs(s), s_out); 2 * scalar_len } fn format_rs_asn1(ops: &'static ScalarOps, r: &Scalar, s: &Scalar, out: &mut [u8]) -> usize { // This assumes `a` is not zero since neither `r` or `s` is allowed to be // zero. fn format_integer_tlv(ops: &ScalarOps, a: &Scalar, out: &mut [u8]) -> usize { let mut fixed = [0u8; ec::SCALAR_MAX_BYTES + 1]; let fixed = &mut fixed[..(ops.scalar_bytes_len() + 1)]; limb::big_endian_from_limbs(ops.leak_limbs(a), &mut fixed[1..]); // Since `a_fixed_out` is an extra byte long, it is guaranteed to start // with a zero. debug_assert_eq!(fixed[0], 0); // There must be at least one non-zero byte since `a` isn't zero. let first_index = fixed.iter().position(|b| *b != 0).unwrap(); // If the first byte has its high bit set, it needs to be prefixed with 0x00. let first_index = if fixed[first_index] & 0x80 != 0 { first_index - 1 } else { first_index }; let value = &fixed[first_index..]; out[0] = der::Tag::Integer.into(); // Lengths less than 128 are encoded in one byte. assert!(value.len() < 128); #[allow(clippy::cast_possible_truncation)] { out[1] = value.len() as u8; } out[2..][..value.len()].copy_from_slice(value); 2 + value.len() } out[0] = der::Tag::Sequence.into(); let r_tlv_len = format_integer_tlv(ops, r, &mut out[2..]); let s_tlv_len = format_integer_tlv(ops, s, &mut out[2..][r_tlv_len..]); // Lengths less than 128 are encoded in one byte. let value_len = r_tlv_len + s_tlv_len; assert!(value_len < 128); #[allow(clippy::cast_possible_truncation)] { out[1] = value_len as u8; } 2 + value_len } /// Signing of fixed-length (PKCS#11 style) ECDSA signatures using the /// P-256 curve and SHA-256. /// /// See "`ECDSA_*_FIXED` Details" in `ring::signature`'s module-level /// documentation for more details. pub static ECDSA_P256_SHA256_FIXED_SIGNING: EcdsaSigningAlgorithm = EcdsaSigningAlgorithm { curve: &ec::suite_b::curve::P256, private_scalar_ops: &p256::PRIVATE_SCALAR_OPS, private_key_ops: &p256::PRIVATE_KEY_OPS, digest_alg: &digest::SHA256, pkcs8_template: &EC_PUBLIC_KEY_P256_PKCS8_V1_TEMPLATE, format_rs: format_rs_fixed, id: AlgorithmID::ECDSA_P256_SHA256_FIXED_SIGNING, }; /// Signing of fixed-length (PKCS#11 style) ECDSA signatures using the /// P-384 curve and SHA-384. /// /// See "`ECDSA_*_FIXED` Details" in `ring::signature`'s module-level /// documentation for more details. 
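// Illustrative sketch (an editorial addition): the ASN.1 DER layout emitted by
// `format_rs_asn1` above, rebuilt against plain byte slices. Like the real code it assumes r
// and s are non-zero and that the encoding fits in single-byte DER lengths (always true for
// P-256 and P-384 scalars).
#[cfg(test)]
mod der_signature_sketch {
    extern crate alloc;
    use alloc::{vec, vec::Vec};

    /// Encode one fixed-width big-endian integer as a DER INTEGER: strip
    /// leading zero bytes, then prepend 0x00 if the top bit would otherwise
    /// make the value look negative.
    fn push_der_integer(scalar_be: &[u8], out: &mut Vec<u8>) {
        let first = scalar_be.iter().position(|&b| b != 0).unwrap();
        let needs_pad = scalar_be[first] & 0x80 != 0;
        out.push(0x02); // INTEGER
        out.push((scalar_be.len() - first + usize::from(needs_pad)) as u8);
        if needs_pad {
            out.push(0x00);
        }
        out.extend_from_slice(&scalar_be[first..]);
    }

    /// SEQUENCE { INTEGER r, INTEGER s }: the `ECDSA_*_ASN1` wire format.
    fn der_signature(r_be: &[u8], s_be: &[u8]) -> Vec<u8> {
        let mut body = Vec::new();
        push_der_integer(r_be, &mut body);
        push_der_integer(s_be, &mut body);
        assert!(body.len() < 128); // short-form length is enough
        let mut out = vec![0x30, body.len() as u8]; // SEQUENCE
        out.extend_from_slice(&body);
        out
    }

    #[test]
    fn high_bit_gets_a_zero_prefix() {
        // 4-byte toy scalars standing in for 32- or 48-byte ones.
        let r = [0x00, 0x00, 0x81, 0x02]; // top bit of first non-zero byte set
        let s = [0x00, 0x00, 0x01, 0x03];
        assert_eq!(
            der_signature(&r, &s),
            [0x30, 0x09, 0x02, 0x03, 0x00, 0x81, 0x02, 0x02, 0x02, 0x01, 0x03]
        );
    }
}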
pub static ECDSA_P384_SHA384_FIXED_SIGNING: EcdsaSigningAlgorithm = EcdsaSigningAlgorithm { curve: &ec::suite_b::curve::P384, private_scalar_ops: &p384::PRIVATE_SCALAR_OPS, private_key_ops: &p384::PRIVATE_KEY_OPS, digest_alg: &digest::SHA384, pkcs8_template: &EC_PUBLIC_KEY_P384_PKCS8_V1_TEMPLATE, format_rs: format_rs_fixed, id: AlgorithmID::ECDSA_P384_SHA384_FIXED_SIGNING, }; /// Signing of ASN.1 DER-encoded ECDSA signatures using the P-256 curve and /// SHA-256. /// /// See "`ECDSA_*_ASN1` Details" in `ring::signature`'s module-level /// documentation for more details. pub static ECDSA_P256_SHA256_ASN1_SIGNING: EcdsaSigningAlgorithm = EcdsaSigningAlgorithm { curve: &ec::suite_b::curve::P256, private_scalar_ops: &p256::PRIVATE_SCALAR_OPS, private_key_ops: &p256::PRIVATE_KEY_OPS, digest_alg: &digest::SHA256, pkcs8_template: &EC_PUBLIC_KEY_P256_PKCS8_V1_TEMPLATE, format_rs: format_rs_asn1, id: AlgorithmID::ECDSA_P256_SHA256_ASN1_SIGNING, }; /// Signing of ASN.1 DER-encoded ECDSA signatures using the P-384 curve and /// SHA-384. /// /// See "`ECDSA_*_ASN1` Details" in `ring::signature`'s module-level /// documentation for more details. pub static ECDSA_P384_SHA384_ASN1_SIGNING: EcdsaSigningAlgorithm = EcdsaSigningAlgorithm { curve: &ec::suite_b::curve::P384, private_scalar_ops: &p384::PRIVATE_SCALAR_OPS, private_key_ops: &p384::PRIVATE_KEY_OPS, digest_alg: &digest::SHA384, pkcs8_template: &EC_PUBLIC_KEY_P384_PKCS8_V1_TEMPLATE, format_rs: format_rs_asn1, id: AlgorithmID::ECDSA_P384_SHA384_ASN1_SIGNING, }; static EC_PUBLIC_KEY_P256_PKCS8_V1_TEMPLATE: pkcs8::Template = pkcs8::Template { bytes: include_bytes!("ecPublicKey_p256_pkcs8_v1_template.der"), alg_id_range: core::ops::Range { start: 8, end: 27 }, curve_id_index: 9, private_key_index: 0x24, }; static EC_PUBLIC_KEY_P384_PKCS8_V1_TEMPLATE: pkcs8::Template = pkcs8::Template { bytes: include_bytes!("ecPublicKey_p384_pkcs8_v1_template.der"), alg_id_range: core::ops::Range { start: 8, end: 24 }, curve_id_index: 9, private_key_index: 0x23, }; #[cfg(test)] mod tests { use crate::testutil as test; use crate::{rand, signature}; #[test] fn signature_ecdsa_sign_fixed_test() { let rng = rand::SystemRandom::new(); test::run( test_vector_file!("ecdsa_sign_fixed_tests.txt"), |section, test_case| { assert_eq!(section, ""); let curve_name = test_case.consume_string("Curve"); let digest_name = test_case.consume_string("Digest"); let msg = test_case.consume_bytes("Msg"); let d = test_case.consume_bytes("d"); let q = test_case.consume_bytes("Q"); let k = test_case.consume_bytes("k"); let expected_result = test_case.consume_bytes("Sig"); let alg = match (curve_name.as_str(), digest_name.as_str()) { ("P-256", "SHA256") => &signature::ECDSA_P256_SHA256_FIXED_SIGNING, ("P-384", "SHA384") => &signature::ECDSA_P384_SHA384_FIXED_SIGNING, _ => { panic!("Unsupported curve+digest: {}+{}", curve_name, digest_name); } }; let private_key = signature::EcdsaKeyPair::from_private_key_and_public_key(alg, &d, &q, &rng) .unwrap(); let rng = test::rand::FixedSliceRandom { bytes: &k }; let actual_result = private_key .sign_with_fixed_nonce_during_test(&rng, &msg) .unwrap(); assert_eq!(actual_result.as_ref(), &expected_result[..]); Ok(()) }, ); } #[test] fn signature_ecdsa_sign_asn1_test() { let rng = rand::SystemRandom::new(); test::run( test_vector_file!("ecdsa_sign_asn1_tests.txt"), |section, test_case| { assert_eq!(section, ""); let curve_name = test_case.consume_string("Curve"); let digest_name = test_case.consume_string("Digest"); let msg = test_case.consume_bytes("Msg"); let 
d = test_case.consume_bytes("d"); let q = test_case.consume_bytes("Q"); let k = test_case.consume_bytes("k"); let expected_result = test_case.consume_bytes("Sig"); let alg = match (curve_name.as_str(), digest_name.as_str()) { ("P-256", "SHA256") => &signature::ECDSA_P256_SHA256_ASN1_SIGNING, ("P-384", "SHA384") => &signature::ECDSA_P384_SHA384_ASN1_SIGNING, _ => { panic!("Unsupported curve+digest: {}+{}", curve_name, digest_name); } }; let private_key = signature::EcdsaKeyPair::from_private_key_and_public_key(alg, &d, &q, &rng) .unwrap(); let rng = test::rand::FixedSliceRandom { bytes: &k }; let actual_result = private_key .sign_with_fixed_nonce_during_test(&rng, &msg) .unwrap(); assert_eq!(actual_result.as_ref(), &expected_result[..]); Ok(()) }, ); } } ring-0.17.14/src/ec/suite_b/ecdsa/verification.rs000064400000000000000000000311421046102023000176550ustar 00000000000000// Copyright 2015-2016 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! ECDSA Signatures using the P-256 and P-384 curves. use super::digest_scalar::digest_scalar; use crate::{ arithmetic::montgomery::*, cpu, digest, ec::suite_b::{ops::*, public_key::*, verify_jacobian_point_is_on_the_curve}, error, io::der, limb, sealed, signature, }; /// An ECDSA verification algorithm. pub struct EcdsaVerificationAlgorithm { ops: &'static PublicScalarOps, digest_alg: &'static digest::Algorithm, split_rs: for<'a> fn( ops: &'static ScalarOps, input: &mut untrusted::Reader<'a>, ) -> Result<(untrusted::Input<'a>, untrusted::Input<'a>), error::Unspecified>, id: AlgorithmID, } #[derive(Debug)] enum AlgorithmID { ECDSA_P256_SHA256_ASN1, ECDSA_P256_SHA256_FIXED, ECDSA_P256_SHA384_ASN1, ECDSA_P384_SHA256_ASN1, ECDSA_P384_SHA384_ASN1, ECDSA_P384_SHA384_FIXED, } derive_debug_via_id!(EcdsaVerificationAlgorithm); impl signature::VerificationAlgorithm for EcdsaVerificationAlgorithm { fn verify( &self, public_key: untrusted::Input, msg: untrusted::Input, signature: untrusted::Input, ) -> Result<(), error::Unspecified> { let cpu = cpu::features(); let e = { // NSA Guide Step 2: "Use the selected hash function to compute H = // Hash(M)." let h = digest::digest(self.digest_alg, msg.as_slice_less_safe()); // NSA Guide Step 3: "Convert the bit string H to an integer e as // described in Appendix B.2." let n = &self.ops.scalar_ops.scalar_modulus(cpu); digest_scalar(n, h) }; self.verify_digest(public_key, e, signature) } } impl EcdsaVerificationAlgorithm { /// This is intentionally not public. fn verify_digest( &self, public_key: untrusted::Input, e: Scalar, signature: untrusted::Input, ) -> Result<(), error::Unspecified> { let cpu = cpu::features(); // NSA Suite B Implementer's Guide to ECDSA Section 3.4.2. 
let public_key_ops = self.ops.public_key_ops; let scalar_ops = self.ops.scalar_ops; let q = &public_key_ops.common.elem_modulus(cpu); let n = &scalar_ops.scalar_modulus(cpu); // NSA Guide Prerequisites: // // Prior to accepting a verified digital signature as valid the // verifier shall have: // // 1. assurance of the signatory’s claimed identity, // 2. an authentic copy of the domain parameters, (q, FR, a, b, SEED, // G, n, h), // 3. assurance of the validity of the public key, and // 4. assurance that the claimed signatory actually possessed the // private key that was used to generate the digital signature at // the time that the signature was generated. // // Prerequisites #1 and #4 are outside the scope of what this function // can do. Prerequisite #2 is handled implicitly as the domain // parameters are hard-coded into the source. Prerequisite #3 is // handled by `parse_uncompressed_point`. let peer_pub_key = parse_uncompressed_point(public_key_ops, q, public_key)?; let (r, s) = signature.read_all(error::Unspecified, |input| { (self.split_rs)(scalar_ops, input) })?; // NSA Guide Step 1: "If r and s are not both integers in the interval // [1, n − 1], output INVALID." let r = scalar_parse_big_endian_variable(n, limb::AllowZero::No, r)?; let s = scalar_parse_big_endian_variable(n, limb::AllowZero::No, s)?; // NSA Guide Step 4: "Compute w = s**−1 mod n, using the routine in // Appendix B.1." let w = self.ops.scalar_inv_to_mont_vartime(&s, cpu); // NSA Guide Step 5: "Compute u1 = (e * w) mod n, and compute // u2 = (r * w) mod n." let u1 = scalar_ops.scalar_product(&e, &w, cpu); let u2 = scalar_ops.scalar_product(&r, &w, cpu); // NSA Guide Step 6: "Compute the elliptic curve point // R = (xR, yR) = u1*G + u2*Q, using EC scalar multiplication and EC // addition. If R is equal to the point at infinity, output INVALID." let product = (self.ops.twin_mul)(&u1, &u2, &peer_pub_key, cpu); // Verify that the point we computed is on the curve; see // `verify_affine_point_is_on_the_curve_scaled` for details on why. It // would be more secure to do the check on the affine coordinates if we // were going to convert to affine form (again, see // `verify_affine_point_is_on_the_curve_scaled` for details on why). // But, we're going to avoid converting to affine for performance // reasons, so we do the verification using the Jacobian coordinates. let z2 = verify_jacobian_point_is_on_the_curve(q, &product)?; // NSA Guide Step 7: "Compute v = xR mod n." // NSA Guide Step 8: "Compare v and r0. If v = r0, output VALID; // otherwise, output INVALID." // // Instead, we use Greg Maxwell's trick to avoid the inversion mod `q` // that would be necessary to compute the affine X coordinate. 
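// Illustrative sketch (an editorial addition): a toy, single-word model of the comparison
// performed below. The affine x-coordinate of R is X·(Z²)⁻¹ mod q; rather than paying for that
// inversion, the code checks the equivalent congruence r·Z² ≡ X (mod q). The toy modulus and
// coordinates are hypothetical, chosen only so the numbers work out.
#[allow(dead_code)]
fn maxwell_trick_toy() {
    const Q: u64 = 101; // toy "field" modulus
    let (x_jacobian, z) = (71u64, 7u64); // Jacobian X and Z of the computed R
    let r = 20u64; // candidate signature component (the true affine x here)
    let z2 = (z * z) % Q;
    // 71 · 49⁻¹ ≡ 20 (mod 101), so the affine check and the Jacobian-domain
    // check agree, but the latter needs only one extra multiplication.
    assert_eq!((r * z2) % Q, x_jacobian % Q);
    // The real code must additionally retry with r + n when r < q − n,
    // because x_R is reduced mod q while r is reduced mod n.
}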
let x = q.point_x(&product); fn sig_r_equals_x(q: &Modulus, r: &Elem, x: &Elem, z2: &Elem) -> bool { let r_jacobian = q.elem_product(z2, r); let x = q.elem_unencoded(x); q.elems_are_equal(&r_jacobian, &x).leak() } let mut r = self.ops.scalar_as_elem(&r); if sig_r_equals_x(q, &r, &x, &z2) { return Ok(()); } if q.elem_less_than_vartime(&r, &self.ops.q_minus_n) { let n = Elem::from(self.ops.n()); q.add_assign(&mut r, &n); if sig_r_equals_x(q, &r, &x, &z2) { return Ok(()); } } Err(error::Unspecified) } } impl sealed::Sealed for EcdsaVerificationAlgorithm {} fn split_rs_fixed<'a>( ops: &'static ScalarOps, input: &mut untrusted::Reader<'a>, ) -> Result<(untrusted::Input<'a>, untrusted::Input<'a>), error::Unspecified> { let scalar_len = ops.scalar_bytes_len(); let r = input.read_bytes(scalar_len)?; let s = input.read_bytes(scalar_len)?; Ok((r, s)) } fn split_rs_asn1<'a>( _ops: &'static ScalarOps, input: &mut untrusted::Reader<'a>, ) -> Result<(untrusted::Input<'a>, untrusted::Input<'a>), error::Unspecified> { der::nested(input, der::Tag::Sequence, error::Unspecified, |input| { let r = der::positive_integer(input)?.big_endian_without_leading_zero_as_input(); let s = der::positive_integer(input)?.big_endian_without_leading_zero_as_input(); Ok((r, s)) }) } /// Verification of fixed-length (PKCS#11 style) ECDSA signatures using the /// P-256 curve and SHA-256. /// /// See "`ECDSA_*_FIXED` Details" in `ring::signature`'s module-level /// documentation for more details. pub static ECDSA_P256_SHA256_FIXED: EcdsaVerificationAlgorithm = EcdsaVerificationAlgorithm { ops: &p256::PUBLIC_SCALAR_OPS, digest_alg: &digest::SHA256, split_rs: split_rs_fixed, id: AlgorithmID::ECDSA_P256_SHA256_FIXED, }; /// Verification of fixed-length (PKCS#11 style) ECDSA signatures using the /// P-384 curve and SHA-384. /// /// See "`ECDSA_*_FIXED` Details" in `ring::signature`'s module-level /// documentation for more details. pub static ECDSA_P384_SHA384_FIXED: EcdsaVerificationAlgorithm = EcdsaVerificationAlgorithm { ops: &p384::PUBLIC_SCALAR_OPS, digest_alg: &digest::SHA384, split_rs: split_rs_fixed, id: AlgorithmID::ECDSA_P384_SHA384_FIXED, }; /// Verification of ASN.1 DER-encoded ECDSA signatures using the P-256 curve /// and SHA-256. /// /// See "`ECDSA_*_ASN1` Details" in `ring::signature`'s module-level /// documentation for more details. pub static ECDSA_P256_SHA256_ASN1: EcdsaVerificationAlgorithm = EcdsaVerificationAlgorithm { ops: &p256::PUBLIC_SCALAR_OPS, digest_alg: &digest::SHA256, split_rs: split_rs_asn1, id: AlgorithmID::ECDSA_P256_SHA256_ASN1, }; /// *Not recommended*. Verification of ASN.1 DER-encoded ECDSA signatures using /// the P-256 curve and SHA-384. /// /// In most situations, P-256 should be used only with SHA-256 and P-384 /// should be used only with SHA-384. However, in some cases, particularly TLS /// on the web, it is necessary to support P-256 with SHA-384 for compatibility /// with widely-deployed implementations that do not follow these guidelines. /// /// See "`ECDSA_*_ASN1` Details" in `ring::signature`'s module-level /// documentation for more details. pub static ECDSA_P256_SHA384_ASN1: EcdsaVerificationAlgorithm = EcdsaVerificationAlgorithm { ops: &p256::PUBLIC_SCALAR_OPS, digest_alg: &digest::SHA384, split_rs: split_rs_asn1, id: AlgorithmID::ECDSA_P256_SHA384_ASN1, }; /// *Not recommended*. Verification of ASN.1 DER-encoded ECDSA signatures using /// the P-384 curve and SHA-256. 
/// /// In most situations, P-256 should be used only with SHA-256 and P-384 /// should be used only with SHA-384. However, in some cases, particularly TLS /// on the web, it is necessary to support P-256 with SHA-384 for compatibility /// with widely-deployed implementations that do not follow these guidelines. /// /// See "`ECDSA_*_ASN1` Details" in `ring::signature`'s module-level /// documentation for more details. pub static ECDSA_P384_SHA256_ASN1: EcdsaVerificationAlgorithm = EcdsaVerificationAlgorithm { ops: &p384::PUBLIC_SCALAR_OPS, digest_alg: &digest::SHA256, split_rs: split_rs_asn1, id: AlgorithmID::ECDSA_P384_SHA256_ASN1, }; /// Verification of ASN.1 DER-encoded ECDSA signatures using the P-384 curve /// and SHA-384. /// /// See "`ECDSA_*_ASN1` Details" in `ring::signature`'s module-level /// documentation for more details. pub static ECDSA_P384_SHA384_ASN1: EcdsaVerificationAlgorithm = EcdsaVerificationAlgorithm { ops: &p384::PUBLIC_SCALAR_OPS, digest_alg: &digest::SHA384, split_rs: split_rs_asn1, id: AlgorithmID::ECDSA_P384_SHA384_ASN1, }; #[cfg(test)] mod tests { extern crate alloc; use super::*; use crate::testutil as test; use alloc::{vec, vec::Vec}; #[test] fn test_digest_based_test_vectors() { let cpu = cpu::features(); test::run( test_vector_file!("../../../../crypto/fipsmodule/ecdsa/ecdsa_verify_tests.txt"), |section, test_case| { assert_eq!(section, ""); let curve_name = test_case.consume_string("Curve"); let public_key = { let mut public_key = vec![0x04]; public_key.extend(&test_case.consume_bytes("X")); public_key.extend(&test_case.consume_bytes("Y")); public_key }; let digest = test_case.consume_bytes("Digest"); let sig = { let mut sig = Vec::new(); sig.extend(&test_case.consume_bytes("R")); sig.extend(&test_case.consume_bytes("S")); sig }; let invalid = test_case.consume_optional_string("Invalid"); let alg = match curve_name.as_str() { "P-256" => &ECDSA_P256_SHA256_FIXED, "P-384" => &ECDSA_P384_SHA384_FIXED, _ => { panic!("Unsupported curve: {}", curve_name); } }; let n = &alg.ops.scalar_ops.scalar_modulus(cpu); let digest = super::super::digest_scalar::digest_bytes_scalar(n, &digest[..]); let actual_result = alg.verify_digest( untrusted::Input::from(&public_key[..]), digest, untrusted::Input::from(&sig[..]), ); assert_eq!(actual_result.is_ok(), invalid.is_none()); Ok(()) }, ); } } ring-0.17.14/src/ec/suite_b/ecdsa.rs000064400000000000000000000000721046102023000151710ustar 00000000000000mod digest_scalar; pub mod signing; pub mod verification; ring-0.17.14/src/ec/suite_b/ops/elem.rs000064400000000000000000000112771046102023000156460ustar 00000000000000// Copyright 2017 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
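// Illustrative sketch (an editorial addition): the verification algorithms defined in
// verification.rs above are consumed through the crate's public `signature` API. The sketch
// below stresses the wire-format pairing: a signature produced by an `*_ASN1_SIGNING`
// algorithm verifies under the matching `*_ASN1` algorithm, while the `*_FIXED` verifier
// rejects it (a DER SEQUENCE is, except with negligible probability, not even the right
// length for the fixed r || s encoding).
#[cfg(test)]
mod ecdsa_format_pairing_sketch {
    use crate::rand;
    use crate::signature::{self, KeyPair};

    #[test]
    fn asn1_and_fixed_are_different_wire_formats() {
        let rng = rand::SystemRandom::new();
        let alg = &signature::ECDSA_P256_SHA256_ASN1_SIGNING;
        let pkcs8 = signature::EcdsaKeyPair::generate_pkcs8(alg, &rng).unwrap();
        let key_pair =
            signature::EcdsaKeyPair::from_pkcs8(alg, pkcs8.as_ref(), &rng).unwrap();

        const MSG: &[u8] = b"format pairing";
        let sig = key_pair.sign(&rng, MSG).unwrap(); // DER: SEQUENCE { r, s }

        let pk = key_pair.public_key().as_ref();
        let asn1 =
            signature::UnparsedPublicKey::new(&signature::ECDSA_P256_SHA256_ASN1, pk);
        let fixed =
            signature::UnparsedPublicKey::new(&signature::ECDSA_P256_SHA256_FIXED, pk);

        assert!(asn1.verify(MSG, sig.as_ref()).is_ok());
        assert!(fixed.verify(MSG, sig.as_ref()).is_err());
    }
}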
use crate::ec::suite_b::ops::{ p256::NUM_LIMBS as P256_NUM_LIMBS, p384::NUM_LIMBS as P384_NUM_LIMBS, }; use crate::{ arithmetic::{ limbs_from_hex, montgomery::{Encoding, ProductEncoding, Unencoded}, }, limb::{LeakyLimb, Limb}, }; use core::marker::PhantomData; #[derive(Clone, Copy)] pub(super) enum NumLimbs { P256, P384, } impl NumLimbs { pub(super) const MAX: usize = Self::P384.into(); pub(super) const fn into(self) -> usize { match self { NumLimbs::P256 => P256_NUM_LIMBS, NumLimbs::P384 => P384_NUM_LIMBS, } } } /// Elements of ℤ/mℤ for some modulus *m*. Elements are always fully reduced /// with respect to *m*; i.e. the 0 <= x < m for every value x. #[derive(Clone, Copy)] pub struct Elem { // XXX: pub pub(super) limbs: [Limb; NumLimbs::MAX], /// The modulus *m* for the ring ℤ/mℤ for which this element is a value. pub(super) m: PhantomData, /// The number of Montgomery factors that need to be canceled out from /// `value` to get the actual value. pub(super) encoding: PhantomData, } pub struct PublicElem { pub(super) limbs: [LeakyLimb; NumLimbs::MAX], pub(super) m: PhantomData, pub(super) encoding: PhantomData, } impl From<&PublicElem> for Elem { fn from(value: &PublicElem) -> Self { Self { limbs: core::array::from_fn(|i| Limb::from(value.limbs[i])), m: value.m, encoding: value.encoding, } } } impl Elem { // There's no need to convert `value` to the Montgomery domain since // 0 * R**2 (mod m) == 0, so neither the modulus nor the encoding are needed // as inputs for constructing a zero-valued element. pub fn zero() -> Self { Self { limbs: [0; NumLimbs::MAX], m: PhantomData, encoding: PhantomData, } } } impl Elem { pub fn one() -> Self { let mut r = Self::zero(); r.limbs[0] = 1; r } } impl PublicElem { pub const fn from_hex(hex: &str) -> Self { Self { limbs: limbs_from_hex(hex), m: PhantomData, encoding: PhantomData, } } } #[inline] pub fn mul_mont( f: unsafe extern "C" fn(r: *mut Limb, a: *const Limb, b: *const Limb), a: &Elem, b: &Elem, ) -> Elem::Output> where (EA, EB): ProductEncoding, { binary_op(f, a, b) } // let r = f(a, b); return r; #[inline] pub fn binary_op( f: unsafe extern "C" fn(r: *mut Limb, a: *const Limb, b: *const Limb), a: &Elem, b: &Elem, ) -> Elem { let mut r = Elem::zero(); unsafe { f(r.limbs.as_mut_ptr(), a.limbs.as_ptr(), b.limbs.as_ptr()) } r } // a := f(a, b); #[inline] pub fn binary_op_assign( f: unsafe extern "C" fn(r: *mut Limb, a: *const Limb, b: *const Limb), a: &mut Elem, b: &Elem, ) { unsafe { f(a.limbs.as_mut_ptr(), a.limbs.as_ptr(), b.limbs.as_ptr()) } } // let r = f(a); return r; #[inline] pub fn unary_op( f: unsafe extern "C" fn(r: *mut Limb, a: *const Limb), a: &Elem, ) -> Elem { let mut r = Elem::zero(); unsafe { f(r.limbs.as_mut_ptr(), a.limbs.as_ptr()) } r } // a := f(a); #[inline] pub fn unary_op_assign( f: unsafe extern "C" fn(r: *mut Limb, a: *const Limb), a: &mut Elem, ) { unsafe { f(a.limbs.as_mut_ptr(), a.limbs.as_ptr()) } } // a := f(a, a); #[inline] pub fn unary_op_from_binary_op_assign( f: unsafe extern "C" fn(r: *mut Limb, a: *const Limb, b: *const Limb), a: &mut Elem, ) { unsafe { f(a.limbs.as_mut_ptr(), a.limbs.as_ptr(), a.limbs.as_ptr()) } } ring-0.17.14/src/ec/suite_b/ops/p256.rs000064400000000000000000000263031046102023000154140ustar 00000000000000// Copyright 2016-2023 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. 
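// Illustrative sketch (an editorial addition): a miniature model of how the phantom `Encoding`
// parameter on `Elem` above, together with `ProductEncoding`, tracks Montgomery factors in the
// type system. Multiplying two R-encoded values yields an R-encoded product (aR·bR·R⁻¹ = abR),
// while an R-encoded value times an unencoded one yields an unencoded product. The toy types
// below are hypothetical stand-ins; the body only multiplies small integers, because the point
// here is the type-level bookkeeping, not the reduction itself.
#[cfg(test)]
mod montgomery_encoding_sketch {
    use core::marker::PhantomData;

    struct ToyUnencoded; // plain value a
    struct ToyR; // Montgomery form a·R mod m

    struct ToyElem<E>(u64, PhantomData<E>);

    trait ToyProductEncoding {
        type Output;
    }
    impl ToyProductEncoding for (ToyR, ToyR) {
        type Output = ToyR; // (aR · bR) · R⁻¹ = abR
    }
    impl ToyProductEncoding for (ToyR, ToyUnencoded) {
        type Output = ToyUnencoded; // (aR · b) · R⁻¹ = ab
    }

    // Like `elem::mul_mont`, the result's encoding is chosen by the pair of
    // input encodings, so accidentally mixing encodings fails to type-check.
    fn toy_mul_mont<EA, EB>(
        a: &ToyElem<EA>,
        b: &ToyElem<EB>,
    ) -> ToyElem<<(EA, EB) as ToyProductEncoding>::Output>
    where
        (EA, EB): ToyProductEncoding,
    {
        ToyElem(a.0 * b.0, PhantomData) // stand-in for the Montgomery multiply
    }

    #[test]
    fn encodings_are_tracked_by_the_types() {
        let a_mont: ToyElem<ToyR> = ToyElem(3, PhantomData);
        let b_plain: ToyElem<ToyUnencoded> = ToyElem(5, PhantomData);
        // The (R, Unencoded) product is statically known to be unencoded.
        let ab: ToyElem<ToyUnencoded> = toy_mul_mont(&a_mont, &b_plain);
        assert_eq!(ab.0, 15);
        // `let bad: ToyElem<ToyR> = toy_mul_mont(&a_mont, &b_plain);`
        // would not compile.
    }
}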
// // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::{ elem::{binary_op, binary_op_assign}, elem_sqr_mul, elem_sqr_mul_acc, PublicModulus, *, }; pub(super) const NUM_LIMBS: usize = 256 / LIMB_BITS; pub static COMMON_OPS: CommonOps = CommonOps { num_limbs: elem::NumLimbs::P256, q: PublicModulus { p: limbs_from_hex("ffffffff00000001000000000000000000000000ffffffffffffffffffffffff"), rr: PublicElem::from_hex("4fffffffdfffffffffffffffefffffffbffffffff0000000000000003"), }, n: PublicElem::from_hex("ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551"), a: PublicElem::from_hex("fffffffc00000004000000000000000000000003fffffffffffffffffffffffc"), b: PublicElem::from_hex("dc30061d04874834e5a220abf7212ed6acf005cd78843090d89cdf6229c4bddf"), elem_mul_mont: p256_mul_mont, elem_sqr_mont: p256_sqr_mont, }; #[cfg(test)] pub(super) static GENERATOR: (PublicElem, PublicElem) = ( PublicElem::from_hex("18905f76a53755c679fb732b7762251075ba95fc5fedb60179e730d418a9143c"), PublicElem::from_hex("8571ff1825885d85d2e88688dd21f3258b4ab8e4ba19e45cddf25357ce95560a"), ); pub static PRIVATE_KEY_OPS: PrivateKeyOps = PrivateKeyOps { common: &COMMON_OPS, elem_inv_squared: p256_elem_inv_squared, point_mul_base_impl: p256_point_mul_base_impl, point_mul_impl: p256_point_mul, point_add_jacobian_impl: p256_point_add, }; fn p256_elem_inv_squared(q: &Modulus, a: &Elem) -> Elem { // Calculate a**-2 (mod q) == a**(q - 3) (mod q) // // The exponent (q - 3) is: // // 0xffffffff00000001000000000000000000000000fffffffffffffffffffffffc #[inline] fn sqr_mul(q: &Modulus, a: &Elem, squarings: LeakyWord, b: &Elem) -> Elem { elem_sqr_mul(&COMMON_OPS, a, squarings, b, q.cpu()) } #[inline] fn sqr_mul_acc(q: &Modulus, a: &mut Elem, squarings: LeakyWord, b: &Elem) { elem_sqr_mul_acc(&COMMON_OPS, a, squarings, b, q.cpu()) } let b_1 = &a; let b_11 = sqr_mul(q, b_1, 1, b_1); let b_111 = sqr_mul(q, &b_11, 1, b_1); let f_11 = sqr_mul(q, &b_111, 3, &b_111); let fff = sqr_mul(q, &f_11, 6, &f_11); let fff_111 = sqr_mul(q, &fff, 3, &b_111); let fffffff_11 = sqr_mul(q, &fff_111, 15, &fff_111); let ffffffff = sqr_mul(q, &fffffff_11, 2, &b_11); // ffffffff00000001 let mut acc = sqr_mul(q, &ffffffff, 31 + 1, b_1); // ffffffff00000001000000000000000000000000ffffffff sqr_mul_acc(q, &mut acc, 96 + 32, &ffffffff); // ffffffff00000001000000000000000000000000ffffffffffffffff sqr_mul_acc(q, &mut acc, 32, &ffffffff); // ffffffff00000001000000000000000000000000fffffffffffffffffffffff_11 sqr_mul_acc(q, &mut acc, 30, &fffffff_11); // ffffffff00000001000000000000000000000000fffffffffffffffffffffffc q.elem_square(&mut acc); q.elem_square(&mut acc); acc } fn p256_point_mul_base_impl(g_scalar: &Scalar, _cpu: cpu::Features) -> Point { prefixed_extern! 
{ fn p256_point_mul_base( r: *mut Limb, // [3][COMMON_OPS.num_limbs] g_scalar: *const Limb, // [COMMON_OPS.num_limbs] ); } let mut r = Point::new_at_infinity(); unsafe { p256_point_mul_base(r.xyz.as_mut_ptr(), g_scalar.limbs.as_ptr()); } r } pub static PUBLIC_KEY_OPS: PublicKeyOps = PublicKeyOps { common: &COMMON_OPS, }; pub static SCALAR_OPS: ScalarOps = ScalarOps { common: &COMMON_OPS, scalar_mul_mont: p256_scalar_mul_mont, }; pub static PUBLIC_SCALAR_OPS: PublicScalarOps = PublicScalarOps { scalar_ops: &SCALAR_OPS, public_key_ops: &PUBLIC_KEY_OPS, #[cfg(any( all(target_arch = "aarch64", target_endian = "little"), target_arch = "x86_64" ))] twin_mul: twin_mul_nistz256, #[cfg(not(any( all(target_arch = "aarch64", target_endian = "little"), target_arch = "x86_64" )))] twin_mul: |g_scalar, p_scalar, p_xy, cpu| { twin_mul_inefficient(&PRIVATE_KEY_OPS, g_scalar, p_scalar, p_xy, cpu) }, q_minus_n: PublicElem::from_hex("4319055358e8617b0c46353d039cdaae"), // TODO: Use an optimized variable-time implementation. scalar_inv_to_mont_vartime: |s, cpu| PRIVATE_SCALAR_OPS.scalar_inv_to_mont(s, cpu), }; #[cfg(any( all(target_arch = "aarch64", target_endian = "little"), target_arch = "x86_64" ))] fn twin_mul_nistz256( g_scalar: &Scalar, p_scalar: &Scalar, p_xy: &(Elem, Elem), cpu: cpu::Features, ) -> Point { let scaled_g = point_mul_base_vartime(g_scalar, cpu); let scaled_p = PRIVATE_KEY_OPS.point_mul(p_scalar, p_xy, cpu::features()); PRIVATE_KEY_OPS.point_sum(&scaled_g, &scaled_p, cpu) } #[cfg(any( all(target_arch = "aarch64", target_endian = "little"), target_arch = "x86_64" ))] fn point_mul_base_vartime(g_scalar: &Scalar, _cpu: cpu::Features) -> Point { prefixed_extern! { fn p256_point_mul_base_vartime(r: *mut Limb, // [3][COMMON_OPS.num_limbs] g_scalar: *const Limb, // [COMMON_OPS.num_limbs] ); } let mut scaled_g = Point::new_at_infinity(); unsafe { p256_point_mul_base_vartime(scaled_g.xyz.as_mut_ptr(), g_scalar.limbs.as_ptr()); } scaled_g } pub static PRIVATE_SCALAR_OPS: PrivateScalarOps = PrivateScalarOps { scalar_ops: &SCALAR_OPS, oneRR_mod_n: PublicScalar::from_hex( "66e12d94f3d956202845b2392b6bec594699799c49bd6fa683244c95be79eea2", ), scalar_inv_to_mont: p256_scalar_inv_to_mont, }; #[allow(clippy::just_underscores_and_digits)] fn p256_scalar_inv_to_mont(a: Scalar, _cpu: cpu::Features) -> Scalar { // Calculate the modular inverse of scalar |a| using Fermat's Little // Theorem: // // a**-1 (mod n) == a**(n - 2) (mod n) // // The exponent (n - 2) is: // // 0xffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254f #[inline] fn mul(a: &Scalar, b: &Scalar) -> Scalar { binary_op(p256_scalar_mul_mont, a, b) } #[inline] fn sqr(a: &Scalar) -> Scalar { let mut tmp = Scalar::zero(); unsafe { p256_scalar_sqr_rep_mont(tmp.limbs.as_mut_ptr(), a.limbs.as_ptr(), 1) } tmp } // Returns (`a` squared `squarings` times) * `b`. fn sqr_mul(a: &Scalar, squarings: LeakyWord, b: &Scalar) -> Scalar { debug_assert!(squarings >= 1); let mut tmp = Scalar::zero(); unsafe { p256_scalar_sqr_rep_mont(tmp.limbs.as_mut_ptr(), a.limbs.as_ptr(), squarings) } mul(&tmp, b) } // Sets `acc` = (`acc` squared `squarings` times) * `b`. 
fn sqr_mul_acc(acc: &mut Scalar, squarings: LeakyWord, b: &Scalar) { debug_assert!(squarings >= 1); unsafe { p256_scalar_sqr_rep_mont(acc.limbs.as_mut_ptr(), acc.limbs.as_ptr(), squarings) } binary_op_assign(p256_scalar_mul_mont, acc, b); } let _1 = &a; let _10 = sqr(_1); // 2 let _100 = sqr(&_10); // 4 let _101 = mul(&_100, _1); // 5 let _111 = mul(&_101, &_10); // 7 let _1000 = sqr(&_100); // 8 let _10000 = sqr(&_1000); // 16 let _100000 = sqr(&_10000); // 32 let _100111 = mul(&_111, &_100000); // 39 = 7 + 32 let _101011 = mul(&_100, &_100111); // 43 = 4 + 39 let _101111 = mul(&_100, &_101011); // 47 = 4 + 39 let _1001111 = mul(&_100000, &_101111); // 79 = 32 + 47 let _86 = sqr(&_101011); // 86 = 43 * 2 let _1011011 = mul(&_101, &_86); // 91 = 5 + 86 let _92 = mul(_1, &_1011011); // 92 = 1 + 91 let _1100011 = mul(&_111, &_92); // 99 = 7 + 92 let _10111111 = mul(&_92, &_1100011); // 191 = 92 + 99 let _11011111 = mul(&_100000, &_10111111); // 223 = 32 + 191 let ff = mul(&_100000, &_11011111); // 255 = 32 + 223 let ffff = sqr_mul(&ff, 0 + 8, &ff); let ffffffff = sqr_mul(&ffff, 0 + 16, &ffff); // ffffffff00000000ffffffff let mut acc = sqr_mul(&ffffffff, 32 + 32, &ffffffff); // ffffffff00000000ffffffffffffffff sqr_mul_acc(&mut acc, 0 + 32, &ffffffff); // The rest of the exponent, in binary, is: // // 1011110011100110111110101010110110100111000101111001111010000100 // 1111001110111001110010101100001011111100011000110010010101001111 sqr_mul_acc(&mut acc, 6, &_101111); sqr_mul_acc(&mut acc, 2 + 3, &_111); sqr_mul_acc(&mut acc, 2 + 8, &_11011111); sqr_mul_acc(&mut acc, 1 + 3, &_101); sqr_mul_acc(&mut acc, 1 + 7, &_1011011); sqr_mul_acc(&mut acc, 1 + 6, &_100111); sqr_mul_acc(&mut acc, 3 + 6, &_101111); sqr_mul_acc(&mut acc, 2 + 3, &_111); sqr_mul_acc(&mut acc, 3, &_101); sqr_mul_acc(&mut acc, 4 + 7, &_1001111); sqr_mul_acc(&mut acc, 2 + 3, &_111); sqr_mul_acc(&mut acc, 1 + 3, &_111); sqr_mul_acc(&mut acc, 2 + 3, &_111); sqr_mul_acc(&mut acc, 2 + 6, &_101011); sqr_mul_acc(&mut acc, 4 + 8, &_10111111); sqr_mul_acc(&mut acc, 3 + 7, &_1100011); sqr_mul_acc(&mut acc, 2 + 1, _1); sqr_mul_acc(&mut acc, 2 + 3, &_101); sqr_mul_acc(&mut acc, 1 + 7, &_1001111); acc } prefixed_extern! 
{ pub(super) fn p256_mul_mont( r: *mut Limb, // [COMMON_OPS.num_limbs] a: *const Limb, // [COMMON_OPS.num_limbs] b: *const Limb, // [COMMON_OPS.num_limbs] ); pub(super) fn p256_sqr_mont( r: *mut Limb, // [COMMON_OPS.num_limbs] a: *const Limb, // [COMMON_OPS.num_limbs] ); fn p256_point_add( r: *mut Limb, // [3][COMMON_OPS.num_limbs] a: *const Limb, // [3][COMMON_OPS.num_limbs] b: *const Limb, // [3][COMMON_OPS.num_limbs] ); fn p256_point_mul( r: *mut Limb, // [3][COMMON_OPS.num_limbs] p_scalar: *const Limb, // [COMMON_OPS.num_limbs] p_x: *const Limb, // [COMMON_OPS.num_limbs] p_y: *const Limb, // [COMMON_OPS.num_limbs] ); fn p256_scalar_mul_mont( r: *mut Limb, // [COMMON_OPS.num_limbs] a: *const Limb, // [COMMON_OPS.num_limbs] b: *const Limb, // [COMMON_OPS.num_limbs] ); fn p256_scalar_sqr_rep_mont( r: *mut Limb, // [COMMON_OPS.num_limbs] a: *const Limb, // [COMMON_OPS.num_limbs] rep: LeakyWord, ); } #[cfg(test)] mod tests { #[cfg(any( all(target_arch = "aarch64", target_endian = "little"), target_arch = "x86_64" ))] #[test] fn p256_point_mul_base_vartime_test() { use super::{super::tests::point_mul_base_tests, *}; point_mul_base_tests( &PRIVATE_KEY_OPS, point_mul_base_vartime, test_vector_file!("p256_point_mul_base_tests.txt"), ); } } ring-0.17.14/src/ec/suite_b/ops/p384.rs000064400000000000000000000244641046102023000154240ustar 00000000000000// Copyright 2016-2023 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
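// Illustrative sketch (an editorial addition): both the P-256 chain above and the P-384 chain
// below invert scalars via Fermat's little theorem, a⁻¹ = a^(n−2) mod n, evaluated with a
// fixed addition chain of squarings and multiplications. The toy below demonstrates the
// underlying identity with a small (64-bit) prime and a plain left-to-right square-and-multiply
// loop; the constants are hypothetical stand-ins for the real group order.
#[cfg(test)]
mod fermat_inversion_sketch {
    /// Left-to-right binary exponentiation: base^exp mod m (m < 2^64).
    fn mod_pow(base: u64, exp: u64, m: u64) -> u64 {
        let (base, m) = (u128::from(base), u128::from(m));
        let mut acc: u128 = 1;
        for i in (0..64).rev() {
            acc = acc * acc % m; // square on every exponent bit
            if (exp >> i) & 1 == 1 {
                acc = acc * base % m; // multiply where the bit is set
            }
        }
        acc as u64
    }

    #[test]
    fn a_times_a_inverse_is_one() {
        const N: u64 = 0xffff_ffff_ffff_ffc5; // 2^64 − 59, a prime
        let a = 0x1234_5678_9abc_def0_u64;
        let a_inv = mod_pow(a, N - 2, N); // Fermat: a^(n−2) ≡ a⁻¹ (mod n)
        let product = (u128::from(a) * u128::from(a_inv) % u128::from(N)) as u64;
        assert_eq!(product, 1);
    }
}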
use super::{ elem::{binary_op, binary_op_assign}, elem_sqr_mul, elem_sqr_mul_acc, PublicModulus, *, }; pub(super) const NUM_LIMBS: usize = 384 / LIMB_BITS; pub static COMMON_OPS: CommonOps = CommonOps { num_limbs: elem::NumLimbs::P384, q: PublicModulus { p: limbs_from_hex("fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffeffffffff0000000000000000ffffffff"), rr: PublicElem::from_hex("10000000200000000fffffffe000000000000000200000000fffffffe00000001"), }, n: PublicElem::from_hex("ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52973"), a: PublicElem::from_hex("fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffbfffffffc0000000000000003fffffffc"), b: PublicElem::from_hex("cd08114b604fbff9b62b21f41f022094e3374bee94938ae277f2209b1920022ef729add87a4c32ec081188719d412dcc"), elem_mul_mont: p384_elem_mul_mont, elem_sqr_mont: p384_elem_sqr_mont, }; pub(super) static GENERATOR: (PublicElem, PublicElem) = ( PublicElem::from_hex("4d3aadc2299e1513812ff723614ede2b6454868459a30eff879c3afc541b4d6e20e378e2a0d6ce383dd0756649c0b528"), PublicElem::from_hex("2b78abc25a15c5e9dd8002263969a840c6c3521968f4ffd98bade7562e83b050a1bfa8bf7bb4a9ac23043dad4b03a4fe"), ); pub static PRIVATE_KEY_OPS: PrivateKeyOps = PrivateKeyOps { common: &COMMON_OPS, elem_inv_squared: p384_elem_inv_squared, point_mul_base_impl: p384_point_mul_base_impl, point_mul_impl: p384_point_mul, point_add_jacobian_impl: p384_point_add, }; fn p384_elem_inv_squared(q: &Modulus, a: &Elem) -> Elem { // Calculate a**-2 (mod q) == a**(q - 3) (mod q) // // The exponent (q - 3) is: // // 0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffe\ // ffffffff0000000000000000fffffffc #[inline] fn sqr_mul(q: &Modulus, a: &Elem, squarings: LeakyWord, b: &Elem) -> Elem { elem_sqr_mul(&COMMON_OPS, a, squarings, b, q.cpu()) } #[inline] fn sqr_mul_acc(q: &Modulus, a: &mut Elem, squarings: LeakyWord, b: &Elem) { elem_sqr_mul_acc(&COMMON_OPS, a, squarings, b, q.cpu()) } let b_1 = &a; let b_11 = sqr_mul(q, b_1, 1, b_1); let b_111 = sqr_mul(q, &b_11, 1, b_1); let f_11 = sqr_mul(q, &b_111, 3, &b_111); let fff = sqr_mul(q, &f_11, 6, &f_11); let fff_111 = sqr_mul(q, &fff, 3, &b_111); let fffffff_11 = sqr_mul(q, &fff_111, 15, &fff_111); let fffffffffffffff = sqr_mul(q, &fffffff_11, 30, &fffffff_11); let ffffffffffffffffffffffffffffff = sqr_mul(q, &fffffffffffffff, 60, &fffffffffffffff); // ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff let mut acc = sqr_mul( q, &ffffffffffffffffffffffffffffff, 120, &ffffffffffffffffffffffffffffff, ); // fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff_111 sqr_mul_acc(q, &mut acc, 15, &fff_111); // fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffeffffffff sqr_mul_acc(q, &mut acc, 1 + 30, &fffffff_11); sqr_mul_acc(q, &mut acc, 2, &b_11); // fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffeffffffff // 0000000000000000fffffff_11 sqr_mul_acc(q, &mut acc, 64 + 30, &fffffff_11); // fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffeffffffff // 0000000000000000fffffffc q.elem_square(&mut acc); q.elem_square(&mut acc); acc } fn p384_point_mul_base_impl(a: &Scalar, cpu: cpu::Features) -> Point { // XXX: Not efficient. TODO: Precompute multiples of the generator. 
let generator = (Elem::from(&GENERATOR.0), Elem::from(&GENERATOR.1)); PRIVATE_KEY_OPS.point_mul(a, &generator, cpu) } pub static PUBLIC_KEY_OPS: PublicKeyOps = PublicKeyOps { common: &COMMON_OPS, }; pub static SCALAR_OPS: ScalarOps = ScalarOps { common: &COMMON_OPS, scalar_mul_mont: p384_scalar_mul_mont, }; pub static PUBLIC_SCALAR_OPS: PublicScalarOps = PublicScalarOps { scalar_ops: &SCALAR_OPS, public_key_ops: &PUBLIC_KEY_OPS, twin_mul: |g_scalar, p_scalar, p_xy, cpu| { twin_mul_inefficient(&PRIVATE_KEY_OPS, g_scalar, p_scalar, p_xy, cpu) }, q_minus_n: PublicElem::from_hex("389cb27e0bc8d21fa7e5f24cb74f58851313e696333ad68c"), // TODO: Use an optimized variable-time implementation. scalar_inv_to_mont_vartime: |s, cpu| PRIVATE_SCALAR_OPS.scalar_inv_to_mont(s, cpu), }; pub static PRIVATE_SCALAR_OPS: PrivateScalarOps = PrivateScalarOps { scalar_ops: &SCALAR_OPS, oneRR_mod_n: PublicScalar::from_hex("c84ee012b39bf213fb05b7a28266895d40d49174aab1cc5bc3e483afcb82947ff3d81e5df1aa4192d319b2419b409a9"), scalar_inv_to_mont: p384_scalar_inv_to_mont, }; fn p384_scalar_inv_to_mont(a: Scalar, _cpu: cpu::Features) -> Scalar { // Calculate the modular inverse of scalar |a| using Fermat's Little // Theorem: // // a**-1 (mod n) == a**(n - 2) (mod n) // // The exponent (n - 2) is: // // 0xffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf\ // 581a0db248b0a77aecec196accc52971 fn mul(a: &Scalar, b: &Scalar) -> Scalar { binary_op(p384_scalar_mul_mont, a, b) } fn sqr(a: &Scalar) -> Scalar { binary_op(p384_scalar_mul_mont, a, a) } fn sqr_mut(a: &mut Scalar) { unary_op_from_binary_op_assign(p384_scalar_mul_mont, a); } // Returns (`a` squared `squarings` times) * `b`. fn sqr_mul(a: &Scalar, squarings: LeakyWord, b: &Scalar) -> Scalar { debug_assert!(squarings >= 1); let mut tmp = sqr(a); for _ in 1..squarings { sqr_mut(&mut tmp); } mul(&tmp, b) } // Sets `acc` = (`acc` squared `squarings` times) * `b`. fn sqr_mul_acc(acc: &mut Scalar, squarings: LeakyWord, b: &Scalar) { debug_assert!(squarings >= 1); for _ in 0..squarings { sqr_mut(acc); } binary_op_assign(p384_scalar_mul_mont, acc, b) } // Indexes into `d`. 
const B_1: usize = 0; const B_11: usize = 1; const B_101: usize = 2; const B_111: usize = 3; const B_1001: usize = 4; const B_1011: usize = 5; const B_1101: usize = 6; const B_1111: usize = 7; const DIGIT_COUNT: usize = 8; let mut d = [Scalar::zero(); DIGIT_COUNT]; d[B_1] = a; let b_10 = sqr(&d[B_1]); for i in B_11..DIGIT_COUNT { d[i] = mul(&d[i - 1], &b_10); } let ff = sqr_mul(&d[B_1111], 0 + 4, &d[B_1111]); let ffff = sqr_mul(&ff, 0 + 8, &ff); let ffffffff = sqr_mul(&ffff, 0 + 16, &ffff); let ffffffffffffffff = sqr_mul(&ffffffff, 0 + 32, &ffffffff); let ffffffffffffffffffffffff = sqr_mul(&ffffffffffffffff, 0 + 32, &ffffffff); // ffffffffffffffffffffffffffffffffffffffffffffffff let mut acc = sqr_mul(&ffffffffffffffffffffffff, 0 + 96, &ffffffffffffffffffffffff); // The rest of the exponent, in binary, is: // // 1100011101100011010011011000000111110100001101110010110111011111 // 0101100000011010000011011011001001001000101100001010011101111010 // 1110110011101100000110010110101011001100110001010010100101110001 #[allow(clippy::cast_possible_truncation)] static REMAINING_WINDOWS: [(u8, u8); 39] = [ (2, B_11 as u8), (3 + 3, B_111 as u8), (1 + 2, B_11 as u8), (3 + 2, B_11 as u8), (1 + 4, B_1001 as u8), (4, B_1011 as u8), (6 + 4, B_1111 as u8), (3, B_101 as u8), (4 + 1, B_1 as u8), (4, B_1011 as u8), (4, B_1001 as u8), (1 + 4, B_1101 as u8), (4, B_1101 as u8), (4, B_1111 as u8), (1 + 4, B_1011 as u8), (6 + 4, B_1101 as u8), (5 + 4, B_1101 as u8), (4, B_1011 as u8), (2 + 4, B_1001 as u8), (2 + 1, B_1 as u8), (3 + 4, B_1011 as u8), (4 + 3, B_101 as u8), (2 + 3, B_111 as u8), (1 + 4, B_1111 as u8), (1 + 4, B_1011 as u8), (4, B_1011 as u8), (2 + 3, B_111 as u8), (1 + 2, B_11 as u8), (5 + 2, B_11 as u8), (2 + 4, B_1011 as u8), (1 + 3, B_101 as u8), (1 + 2, B_11 as u8), (2 + 2, B_11 as u8), (2 + 2, B_11 as u8), (3 + 3, B_101 as u8), (2 + 3, B_101 as u8), (2 + 3, B_101 as u8), (2, B_11 as u8), (3 + 1, B_1 as u8), ]; for &(squarings, digit) in &REMAINING_WINDOWS[..] { sqr_mul_acc(&mut acc, LeakyWord::from(squarings), &d[usize::from(digit)]); } acc } unsafe extern "C" fn p384_elem_sqr_mont( r: *mut Limb, // [COMMON_OPS.num_limbs] a: *const Limb, // [COMMON_OPS.num_limbs] ) { // XXX: Inefficient. TODO: Make a dedicated squaring routine. unsafe { p384_elem_mul_mont(r, a, a); } } prefixed_extern! { fn p384_elem_mul_mont( r: *mut Limb, // [COMMON_OPS.num_limbs] a: *const Limb, // [COMMON_OPS.num_limbs] b: *const Limb, // [COMMON_OPS.num_limbs] ); fn p384_point_add( r: *mut Limb, // [3][COMMON_OPS.num_limbs] a: *const Limb, // [3][COMMON_OPS.num_limbs] b: *const Limb, // [3][COMMON_OPS.num_limbs] ); fn p384_point_mul( r: *mut Limb, // [3][COMMON_OPS.num_limbs] p_scalar: *const Limb, // [COMMON_OPS.num_limbs] p_x: *const Limb, // [COMMON_OPS.num_limbs] p_y: *const Limb, // [COMMON_OPS.num_limbs] ); fn p384_scalar_mul_mont( r: *mut Limb, // [COMMON_OPS.num_limbs] a: *const Limb, // [COMMON_OPS.num_limbs] b: *const Limb, // [COMMON_OPS.num_limbs] ); } ring-0.17.14/src/ec/suite_b/ops.rs000064400000000000000000001246371046102023000147310ustar 00000000000000// Copyright 2016 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use crate::{ arithmetic::limbs_from_hex, arithmetic::montgomery::*, bb::LeakyWord, cpu, error::{self, LenMismatchError}, limb::*, }; use core::marker::PhantomData; use elem::{mul_mont, unary_op, unary_op_assign, unary_op_from_binary_op_assign}; /// A field element, i.e. an element of ℤ/qℤ for the curve's field modulus /// *q*. pub type Elem = elem::Elem; type PublicElem = elem::PublicElem; /// Represents the (prime) order *q* of the curve's prime field. #[derive(Clone, Copy)] pub enum Q {} /// A scalar. Its value is in [0, n). Zero-valued scalars are forbidden in most /// contexts. pub type Scalar = elem::Elem; type PublicScalar = elem::PublicElem; /// Represents the prime order *n* of the curve's group. #[derive(Clone, Copy)] pub enum N {} pub(super) struct Modulus { // TODO: [Limb; elem::NumLimbs::MAX] limbs: &'static [Limb; elem::NumLimbs::MAX], num_limbs: elem::NumLimbs, cops: &'static CommonOps, m: PhantomData, cpu: cpu::Features, } pub struct Point { // The coordinates are stored in a contiguous array, where the first // `ops.num_limbs` elements are the X coordinate, the next // `ops.num_limbs` elements are the Y coordinate, and the next // `ops.num_limbs` elements are the Z coordinate. This layout is dictated // by the requirements of the nistz256 code. xyz: [Limb; 3 * elem::NumLimbs::MAX], } impl Point { pub fn new_at_infinity() -> Self { Self { xyz: [0; 3 * elem::NumLimbs::MAX], } } } /// Operations and values needed by all curve operations. pub struct CommonOps { num_limbs: elem::NumLimbs, q: PublicModulus, n: PublicElem, pub a: PublicElem, // Must be -3 mod q pub b: PublicElem, // In all cases, `r`, `a`, and `b` may all alias each other. elem_mul_mont: unsafe extern "C" fn(r: *mut Limb, a: *const Limb, b: *const Limb), elem_sqr_mont: unsafe extern "C" fn(r: *mut Limb, a: *const Limb), } impl CommonOps { pub(super) fn elem_modulus(&'static self, cpu_features: cpu::Features) -> Modulus { Modulus { // TODO: limbs: self.q.p.map(Limb::from), limbs: &self.q.p, num_limbs: self.num_limbs, cops: self, m: PhantomData, cpu: cpu_features, } } pub(super) fn scalar_modulus(&'static self, cpu_features: cpu::Features) -> Modulus { Modulus { // TODO: limbs: self.n.limbs.map(Limb::from), limbs: &self.n.limbs, num_limbs: self.num_limbs, cops: self, m: PhantomData, cpu: cpu_features, } } // The length of a field element, which is the same as the length of a // scalar, in bytes. pub fn len(&self) -> usize { // Keep in sync with `Modulus::len()` self.num_limbs.into() * LIMB_BYTES } #[cfg(test)] pub(super) fn n_limbs(&self) -> &[Limb] { &self.n.limbs[..self.num_limbs.into()] } } impl Modulus { pub fn cpu(&self) -> cpu::Features { self.cpu } // Keep in sync with `CommonOps::len()`. 
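    // I.e. the byte length of a field element (equivalently, of a scalar):
    // `num_limbs * LIMB_BYTES`, which works out to 32 bytes for P-256 and
    // 48 bytes for P-384, independent of `LIMB_BITS`.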
pub fn bytes_len(&self) -> usize { self.num_limbs.into() * LIMB_BYTES } } impl Modulus { #[inline] pub fn add_assign(&self, a: &mut elem::Elem, b: &elem::Elem) { let num_limbs = self.num_limbs.into(); limbs_add_assign_mod( &mut a.limbs[..num_limbs], &b.limbs[..num_limbs], &self.limbs[..num_limbs], ) .unwrap_or_else(unwrap_impossible_len_mismatch_error) } } impl Modulus { #[inline] pub fn elems_are_equal(&self, a: &Elem, b: &Elem) -> LimbMask { let num_limbs = self.num_limbs.into(); limbs_equal_limbs_consttime(&a.limbs[..num_limbs], &b.limbs[..num_limbs]) .unwrap_or_else(unwrap_impossible_len_mismatch_error) } #[inline] pub fn elem_unencoded(&self, a: &Elem) -> Elem { self.elem_product(a, &Elem::one()) } } impl CommonOps { #[inline] fn is_zero(&self, a: &elem::Elem) -> bool { let num_limbs = self.num_limbs.into(); limbs_are_zero(&a.limbs[..num_limbs]).leak() } #[inline] fn elem_mul(&self, a: &mut Elem, b: &Elem, _cpu: cpu::Features) { elem::binary_op_assign(self.elem_mul_mont, a, b) } #[inline] fn elem_product( &self, a: &Elem, b: &Elem, _cpu: cpu::Features, ) -> Elem<<(EA, EB) as ProductEncoding>::Output> where (EA, EB): ProductEncoding, { mul_mont(self.elem_mul_mont, a, b) } #[inline] fn elem_square(&self, a: &mut Elem, _cpu: cpu::Features) { unary_op_assign(self.elem_sqr_mont, a); } #[inline] fn elem_squared(&self, a: &Elem, _cpu: cpu::Features) -> Elem { unary_op(self.elem_sqr_mont, a) } } impl Modulus { #[inline] pub fn elem_mul(&self, a: &mut Elem, b: &Elem) { self.cops.elem_mul(a, b, self.cpu) } #[inline] pub fn elem_product( &self, a: &Elem, b: &Elem, ) -> Elem<<(EA, EB) as ProductEncoding>::Output> where (EA, EB): ProductEncoding, { self.cops.elem_product(a, b, self.cpu) } #[inline] pub fn elem_square(&self, a: &mut Elem) { self.cops.elem_square(a, self.cpu) } #[inline] pub fn elem_squared(&self, a: &Elem) -> Elem { self.cops.elem_squared(a, self.cpu) } } impl Modulus { #[inline] pub fn is_zero(&self, a: &elem::Elem) -> bool { self.cops.is_zero(a) } } impl Modulus { pub fn elem_verify_is_not_zero(&self, a: &Elem) -> Result<(), error::Unspecified> { if self.is_zero(a) { Err(error::Unspecified) } else { Ok(()) } } pub(super) fn a(&self) -> &'static PublicElem { &self.cops.a } pub(super) fn b(&self) -> &'static PublicElem { &self.cops.b } } impl PrivateKeyOps { pub(super) fn point_sum(&self, a: &Point, b: &Point, _cpu: cpu::Features) -> Point { let mut r = Point::new_at_infinity(); unsafe { (self.point_add_jacobian_impl)(r.xyz.as_mut_ptr(), a.xyz.as_ptr(), b.xyz.as_ptr()) } r } } impl Modulus { pub fn point_x(&self, p: &Point) -> Elem { let num_limbs = self.num_limbs.into(); let mut r = Elem::zero(); r.limbs[..num_limbs].copy_from_slice(&p.xyz[0..num_limbs]); r } pub fn point_y(&self, p: &Point) -> Elem { let num_limbs = self.num_limbs.into(); let mut r = Elem::zero(); r.limbs[..num_limbs].copy_from_slice(&p.xyz[num_limbs..(2 * num_limbs)]); r } pub fn point_z(&self, p: &Point) -> Elem { let num_limbs = self.num_limbs.into(); let mut r = Elem::zero(); r.limbs[..num_limbs].copy_from_slice(&p.xyz[(2 * num_limbs)..(3 * num_limbs)]); r } } struct PublicModulus { p: [LeakyLimb; elem::NumLimbs::MAX], rr: PublicElem, } /// Operations on private keys, for ECDH and ECDSA signing. 
pub struct PrivateKeyOps { pub common: &'static CommonOps, elem_inv_squared: fn(q: &Modulus, a: &Elem) -> Elem, point_mul_base_impl: fn(a: &Scalar, cpu: cpu::Features) -> Point, point_mul_impl: unsafe extern "C" fn( r: *mut Limb, // [3][num_limbs] p_scalar: *const Limb, // [num_limbs] p_x: *const Limb, // [num_limbs] p_y: *const Limb, // [num_limbs] ), point_add_jacobian_impl: unsafe extern "C" fn(r: *mut Limb, a: *const Limb, b: *const Limb), } impl PrivateKeyOps { pub fn leak_limbs<'a>(&self, a: &'a Elem) -> &'a [Limb] { &a.limbs[..self.common.num_limbs.into()] } #[inline(always)] pub(super) fn point_mul_base(&self, a: &Scalar, cpu: cpu::Features) -> Point { (self.point_mul_base_impl)(a, cpu) } #[inline(always)] pub(super) fn point_mul( &self, p_scalar: &Scalar, (p_x, p_y): &(Elem, Elem), _cpu: cpu::Features, ) -> Point { let mut r = Point::new_at_infinity(); unsafe { (self.point_mul_impl)( r.xyz.as_mut_ptr(), p_scalar.limbs.as_ptr(), p_x.limbs.as_ptr(), p_y.limbs.as_ptr(), ); } r } #[inline] pub(super) fn elem_inverse_squared(&self, q: &Modulus, a: &Elem) -> Elem { (self.elem_inv_squared)(q, a) } } /// Operations and values needed by all operations on public keys (ECDH /// agreement and ECDSA verification). pub struct PublicKeyOps { pub common: &'static CommonOps, } impl PublicKeyOps { // The serialized bytes are in big-endian order, zero-padded. The limbs // of `Elem` are in the native endianness, least significant limb to // most significant limb. Besides the parsing, conversion, this also // implements NIST SP 800-56A Step 2: "Verify that xQ and yQ are integers // in the interval [0, p-1] in the case that q is an odd prime p[.]" pub(super) fn elem_parse( &self, q: &Modulus, input: &mut untrusted::Reader, ) -> Result, error::Unspecified> { let _cpu = cpu::features(); let encoded_value = input.read_bytes(self.common.len())?; let parsed = elem_parse_big_endian_fixed_consttime(q, encoded_value)?; let mut r = Elem::zero(); let rr = Elem::from(&self.common.q.rr); // Montgomery encode (elem_to_mont). // TODO: do something about this. unsafe { (self.common.elem_mul_mont)( r.limbs.as_mut_ptr(), parsed.limbs.as_ptr(), rr.limbs.as_ptr(), ) } Ok(r) } } // Operations used by both ECDSA signing and ECDSA verification. In general // these must be side-channel resistant. pub struct ScalarOps { pub common: &'static CommonOps, scalar_mul_mont: unsafe extern "C" fn(r: *mut Limb, a: *const Limb, b: *const Limb), } impl ScalarOps { pub(super) fn scalar_modulus(&'static self, cpu_features: cpu::Features) -> Modulus { self.common.scalar_modulus(cpu_features) } // The (maximum) length of a scalar, not including any padding. pub fn scalar_bytes_len(&self) -> usize { self.common.len() } } impl ScalarOps { pub fn leak_limbs<'s>(&self, s: &'s Scalar) -> &'s [Limb] { &s.limbs[..self.common.num_limbs.into()] } #[inline] pub(super) fn scalar_product( &self, a: &Scalar, b: &Scalar, _cpu: cpu::Features, ) -> Scalar<<(EA, EB) as ProductEncoding>::Output> where (EA, EB): ProductEncoding, { mul_mont(self.scalar_mul_mont, a, b) } } /// Operations on public scalars needed by ECDSA signature verification. 
pub struct PublicScalarOps { pub scalar_ops: &'static ScalarOps, pub public_key_ops: &'static PublicKeyOps, pub(super) twin_mul: fn( g_scalar: &Scalar, p_scalar: &Scalar, p_xy: &(Elem, Elem), cpu: cpu::Features, ) -> Point, scalar_inv_to_mont_vartime: fn(s: &Scalar, cpu: cpu::Features) -> Scalar, pub(super) q_minus_n: PublicElem, } impl PublicScalarOps { pub fn n(&self) -> &PublicElem { &self.scalar_ops.common.n } #[inline] pub fn scalar_as_elem(&self, a: &Scalar) -> Elem { Elem { limbs: a.limbs, m: PhantomData, encoding: PhantomData, } } } impl Modulus { pub fn elem_less_than_vartime(&self, a: &Elem, b: &PublicElem) -> bool { let num_limbs = self.num_limbs.into(); limbs_less_than_limbs_vartime(&a.limbs[..num_limbs], &b.limbs[..num_limbs]) .unwrap_or_else(|LenMismatchError { .. }| unreachable!()) } } impl PublicScalarOps { pub(super) fn scalar_inv_to_mont_vartime( &self, s: &Scalar, cpu: cpu::Features, ) -> Scalar { (self.scalar_inv_to_mont_vartime)(s, cpu) } } #[allow(non_snake_case)] pub struct PrivateScalarOps { pub scalar_ops: &'static ScalarOps, oneRR_mod_n: PublicScalar, // 1 * R**2 (mod n). TOOD: Use One. scalar_inv_to_mont: fn(a: Scalar, cpu: cpu::Features) -> Scalar, } impl PrivateScalarOps { pub(super) fn to_mont(&self, s: &Scalar, cpu: cpu::Features) -> Scalar { self.scalar_ops .scalar_product(s, &Scalar::from(&self.oneRR_mod_n), cpu) } /// Returns the modular inverse of `a` (mod `n`). Panics if `a` is zero. pub(super) fn scalar_inv_to_mont(&self, a: &Scalar, cpu: cpu::Features) -> Scalar { assert!(!self.scalar_ops.common.is_zero(a)); let a = self.to_mont(a, cpu); (self.scalar_inv_to_mont)(a, cpu) } } // XXX: Inefficient and unnecessarily depends on `PrivateKeyOps`. TODO: implement interleaved wNAF // multiplication. fn twin_mul_inefficient( ops: &PrivateKeyOps, g_scalar: &Scalar, p_scalar: &Scalar, p_xy: &(Elem, Elem), cpu: cpu::Features, ) -> Point { let scaled_g = ops.point_mul_base(g_scalar, cpu); let scaled_p = ops.point_mul(p_scalar, p_xy, cpu); ops.point_sum(&scaled_g, &scaled_p, cpu) } // This assumes n < q < 2*n. impl Modulus { pub fn elem_reduced_to_scalar(&self, elem: &Elem) -> Scalar { let num_limbs = self.num_limbs.into(); let mut r_limbs = elem.limbs; limbs_reduce_once(&mut r_limbs[..num_limbs], &self.limbs[..num_limbs]) .unwrap_or_else(unwrap_impossible_len_mismatch_error); Scalar { limbs: r_limbs, m: PhantomData, encoding: PhantomData, } } } // Returns (`a` squared `squarings` times) * `b`. fn elem_sqr_mul( ops: &CommonOps, a: &Elem, squarings: LeakyWord, b: &Elem, cpu: cpu::Features, ) -> Elem { debug_assert!(squarings >= 1); let mut tmp = ops.elem_squared(a, cpu); for _ in 1..squarings { ops.elem_square(&mut tmp, cpu); } ops.elem_product(&tmp, b, cpu) } // Sets `acc` = (`acc` squared `squarings` times) * `b`. 
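// I.e. acc <- acc**(2**squarings) * b, with all values Montgomery-encoded.
// Together with `elem_sqr_mul` above, this is the building block for the fixed
// square-and-multiply chains such as `p384_elem_inv_squared`.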
fn elem_sqr_mul_acc( ops: &CommonOps, acc: &mut Elem, squarings: LeakyWord, b: &Elem, cpu: cpu::Features, ) { debug_assert!(squarings >= 1); for _ in 0..squarings { ops.elem_square(acc, cpu); } ops.elem_mul(acc, b, cpu) } #[inline] pub(super) fn elem_parse_big_endian_fixed_consttime( q: &Modulus, bytes: untrusted::Input, ) -> Result, error::Unspecified> { parse_big_endian_fixed_consttime(q, bytes, AllowZero::Yes) } #[inline] pub(super) fn scalar_parse_big_endian_fixed_consttime( n: &Modulus, bytes: untrusted::Input, ) -> Result { parse_big_endian_fixed_consttime(n, bytes, AllowZero::No) } #[inline] pub(super) fn scalar_parse_big_endian_variable( n: &Modulus, allow_zero: AllowZero, bytes: untrusted::Input, ) -> Result { let num_limbs = n.num_limbs.into(); let mut r = Scalar::zero(); parse_big_endian_in_range_and_pad_consttime( bytes, allow_zero, &n.limbs[..num_limbs], &mut r.limbs[..num_limbs], )?; Ok(r) } pub(super) fn scalar_parse_big_endian_partially_reduced_variable_consttime( n: &Modulus, bytes: untrusted::Input, ) -> Result { let num_limbs = n.num_limbs.into(); let mut r = Scalar::zero(); { let r = &mut r.limbs[..num_limbs]; parse_big_endian_and_pad_consttime(bytes, r)?; limbs_reduce_once(r, &n.limbs[..num_limbs]) .unwrap_or_else(unwrap_impossible_len_mismatch_error); } Ok(r) } fn parse_big_endian_fixed_consttime( m: &Modulus, bytes: untrusted::Input, allow_zero: AllowZero, ) -> Result, error::Unspecified> { let num_limbs = m.num_limbs.into(); if bytes.len() != m.bytes_len() { return Err(error::Unspecified); } let mut r = elem::Elem::zero(); parse_big_endian_in_range_and_pad_consttime( bytes, allow_zero, &m.limbs[..num_limbs], &mut r.limbs[..num_limbs], )?; Ok(r) } #[cold] #[inline(never)] fn unwrap_impossible_len_mismatch_error(LenMismatchError { .. 
}: LenMismatchError) -> T { unreachable!() } #[cfg(test)] mod tests { extern crate alloc; use super::*; use crate::testutil as test; use alloc::{format, vec, vec::Vec}; const ZERO_SCALAR: Scalar = Scalar { limbs: [0; elem::NumLimbs::MAX], m: PhantomData, encoding: PhantomData, }; trait Convert { fn convert(self, q: &Modulus) -> Elem; } impl Convert for Elem { fn convert(self, _q: &Modulus) -> Elem { self } } impl Convert for Elem { fn convert(self, q: &Modulus) -> Elem { q.elem_unencoded(&self) } } fn q_minus_n_plus_n_equals_0_test(ops: &PublicScalarOps) { let cops = ops.scalar_ops.common; let q = &cops.elem_modulus(cpu::features()); let mut x = Elem::from(&ops.q_minus_n); q.add_assign(&mut x, &Elem::from(&cops.n)); assert!(q.is_zero(&x)); } #[test] fn p256_q_minus_n_plus_n_equals_0_test() { q_minus_n_plus_n_equals_0_test(&p256::PUBLIC_SCALAR_OPS); } #[test] fn p384_q_minus_n_plus_n_equals_0_test() { q_minus_n_plus_n_equals_0_test(&p384::PUBLIC_SCALAR_OPS); } #[test] fn p256_elem_add_test() { elem_add_test( &p256::PUBLIC_SCALAR_OPS, test_vector_file!("ops/p256_elem_sum_tests.txt"), ); } #[test] fn p384_elem_add_test() { elem_add_test( &p384::PUBLIC_SCALAR_OPS, test_vector_file!("ops/p384_elem_sum_tests.txt"), ); } fn elem_add_test(ops: &PublicScalarOps, test_file: test::File) { let cops = ops.public_key_ops.common; let q = &cops.elem_modulus(cpu::features()); test::run(test_file, |section, test_case| { assert_eq!(section, ""); let a = consume_elem(q, test_case, "a"); let b = consume_elem(q, test_case, "b"); let expected_sum = consume_elem(q, test_case, "r"); let mut actual_sum = a; q.add_assign(&mut actual_sum, &b); assert_limbs_are_equal(cops, &actual_sum.limbs, &expected_sum.limbs); let mut actual_sum = b; q.add_assign(&mut actual_sum, &a); assert_limbs_are_equal(cops, &actual_sum.limbs, &expected_sum.limbs); Ok(()) }) } // XXX: There's no `p256_sub` in *ring*; it's logic is inlined into // the point arithmetic functions. Thus, we can't test it. #[test] fn p384_elem_sub_test() { prefixed_extern! { fn p384_elem_sub(r: *mut Limb, a: *const Limb, b: *const Limb); } elem_sub_test( &p384::COMMON_OPS, p384_elem_sub, test_vector_file!("ops/p384_elem_sum_tests.txt"), ); } fn elem_sub_test( ops: &'static CommonOps, elem_sub: unsafe extern "C" fn(r: *mut Limb, a: *const Limb, b: *const Limb), test_file: test::File, ) { let q = &ops.elem_modulus(cpu::features()); test::run(test_file, |section, test_case| { assert_eq!(section, ""); let a = consume_elem(q, test_case, "a"); let b = consume_elem(q, test_case, "b"); let r = consume_elem(q, test_case, "r"); let mut actual_difference = Elem::::zero(); unsafe { elem_sub( actual_difference.limbs.as_mut_ptr(), r.limbs.as_ptr(), b.limbs.as_ptr(), ); } assert_limbs_are_equal(ops, &actual_difference.limbs, &a.limbs); let mut actual_difference = Elem::::zero(); unsafe { elem_sub( actual_difference.limbs.as_mut_ptr(), r.limbs.as_ptr(), a.limbs.as_ptr(), ); } assert_limbs_are_equal(ops, &actual_difference.limbs, &b.limbs); Ok(()) }) } // XXX: There's no `p256_div_by_2` in *ring*; it's logic is inlined // into the point arithmetic functions. Thus, we can't test it. #[test] fn p384_elem_div_by_2_test() { prefixed_extern! 
{ fn p384_elem_div_by_2(r: *mut Limb, a: *const Limb); } elem_div_by_2_test( &p384::COMMON_OPS, p384_elem_div_by_2, test_vector_file!("ops/p384_elem_div_by_2_tests.txt"), ); } fn elem_div_by_2_test( ops: &'static CommonOps, elem_div_by_2: unsafe extern "C" fn(r: *mut Limb, a: *const Limb), test_file: test::File, ) { let q = &ops.elem_modulus(cpu::features()); test::run(test_file, |section, test_case| { assert_eq!(section, ""); let a = consume_elem(q, test_case, "a"); let r = consume_elem(q, test_case, "r"); let mut actual_result = Elem::::zero(); unsafe { elem_div_by_2(actual_result.limbs.as_mut_ptr(), a.limbs.as_ptr()); } assert_limbs_are_equal(ops, &actual_result.limbs, &r.limbs); Ok(()) }) } // There is no `ecp_nistz256_neg` on other targets. #[cfg(target_arch = "x86_64")] #[test] fn p256_elem_neg_test() { prefixed_extern! { fn ecp_nistz256_neg(r: *mut Limb, a: *const Limb); } elem_neg_test( &p256::COMMON_OPS, ecp_nistz256_neg, test_vector_file!("ops/p256_elem_neg_tests.txt"), ); } #[test] fn p384_elem_neg_test() { prefixed_extern! { fn p384_elem_neg(r: *mut Limb, a: *const Limb); } elem_neg_test( &p384::COMMON_OPS, p384_elem_neg, test_vector_file!("ops/p384_elem_neg_tests.txt"), ); } fn elem_neg_test( ops: &'static CommonOps, elem_neg: unsafe extern "C" fn(r: *mut Limb, a: *const Limb), test_file: test::File, ) { let q = &ops.elem_modulus(cpu::features()); test::run(test_file, |section, test_case| { assert_eq!(section, ""); let a = consume_elem(q, test_case, "a"); let b = consume_elem(q, test_case, "b"); // Verify -a == b. { let mut actual_result = Elem::::zero(); unsafe { elem_neg(actual_result.limbs.as_mut_ptr(), a.limbs.as_ptr()); } assert_limbs_are_equal(ops, &actual_result.limbs, &b.limbs); } // Verify -b == a. { let mut actual_result = Elem::::zero(); unsafe { elem_neg(actual_result.limbs.as_mut_ptr(), b.limbs.as_ptr()); } assert_limbs_are_equal(ops, &actual_result.limbs, &a.limbs); } Ok(()) }) } #[test] fn p256_elem_mul_test() { elem_mul_test( &p256::COMMON_OPS, test_vector_file!("ops/p256_elem_mul_tests.txt"), ); } #[test] fn p384_elem_mul_test() { elem_mul_test( &p384::COMMON_OPS, test_vector_file!("ops/p384_elem_mul_tests.txt"), ); } fn elem_mul_test(ops: &'static CommonOps, test_file: test::File) { let q = &ops.elem_modulus(cpu::features()); test::run(test_file, |section, test_case| { assert_eq!(section, ""); let mut a = consume_elem(q, test_case, "a"); let b = consume_elem(q, test_case, "b"); let r = consume_elem(q, test_case, "r"); q.elem_mul(&mut a, &b); assert_limbs_are_equal(ops, &a.limbs, &r.limbs); Ok(()) }) } #[test] fn p256_scalar_mul_test() { scalar_mul_test( &p256::SCALAR_OPS, test_vector_file!("ops/p256_scalar_mul_tests.txt"), ); } #[test] fn p384_scalar_mul_test() { scalar_mul_test( &p384::SCALAR_OPS, test_vector_file!("ops/p384_scalar_mul_tests.txt"), ); } fn scalar_mul_test(ops: &ScalarOps, test_file: test::File) { let cpu = cpu::features(); let cops = ops.common; let n = &cops.scalar_modulus(cpu); test::run(test_file, |section, test_case| { assert_eq!(section, ""); let a = consume_scalar(n, test_case, "a"); let b = consume_scalar_mont(n, test_case, "b"); let expected_result = consume_scalar(n, test_case, "r"); let actual_result = ops.scalar_product(&a, &b, cpu); assert_limbs_are_equal(cops, &actual_result.limbs, &expected_result.limbs); Ok(()) }) } #[test] fn p256_scalar_square_test() { prefixed_extern! 
{ fn p256_scalar_sqr_rep_mont(r: *mut Limb, a: *const Limb, rep: LeakyWord); } scalar_square_test( &p256::SCALAR_OPS, p256_scalar_sqr_rep_mont, test_vector_file!("ops/p256_scalar_square_tests.txt"), ); } // XXX: There's no `p384_scalar_square_test()` because there's no dedicated // `p384_scalar_sqr_rep_mont()`. fn scalar_square_test( ops: &ScalarOps, sqr_rep: unsafe extern "C" fn(r: *mut Limb, a: *const Limb, rep: LeakyWord), test_file: test::File, ) { let cpu = cpu::features(); let cops = ops.common; let n = &cops.scalar_modulus(cpu); test::run(test_file, |section, test_case| { assert_eq!(section, ""); let cpu = cpu::features(); let a = consume_scalar(n, test_case, "a"); let expected_result = consume_scalar(n, test_case, "r"); { let mut actual_result: Scalar = Scalar { limbs: [0; elem::NumLimbs::MAX], m: PhantomData, encoding: PhantomData, }; unsafe { sqr_rep(actual_result.limbs.as_mut_ptr(), a.limbs.as_ptr(), 1); } assert_limbs_are_equal(cops, &actual_result.limbs, &expected_result.limbs); } { let actual_result = ops.scalar_product(&a, &a, cpu); assert_limbs_are_equal(cops, &actual_result.limbs, &expected_result.limbs); } Ok(()) }) } #[test] #[should_panic(expected = "!self.scalar_ops.common.is_zero(a)")] fn p256_scalar_inv_to_mont_zero_panic_test() { let _ = p256::PRIVATE_SCALAR_OPS.scalar_inv_to_mont(&ZERO_SCALAR, cpu::features()); } #[test] #[should_panic(expected = "!self.scalar_ops.common.is_zero(a)")] fn p384_scalar_inv_to_mont_zero_panic_test() { let _ = p384::PRIVATE_SCALAR_OPS.scalar_inv_to_mont(&ZERO_SCALAR, cpu::features()); } #[test] fn p256_point_sum_test() { point_sum_test( &p256::PRIVATE_KEY_OPS, test_vector_file!("ops/p256_point_sum_tests.txt"), ); } #[test] fn p384_point_sum_test() { point_sum_test( &p384::PRIVATE_KEY_OPS, test_vector_file!("ops/p384_point_sum_tests.txt"), ); } fn point_sum_test(ops: &PrivateKeyOps, test_file: test::File) { let cpu = cpu::features(); test::run(test_file, |section, test_case| { assert_eq!(section, ""); let a = consume_jacobian_point(ops, test_case, "a"); let b = consume_jacobian_point(ops, test_case, "b"); let r_expected: TestPoint = consume_point(ops, test_case, "r"); let r_actual = ops.point_sum(&a, &b, cpu); assert_point_actual_equals_expected(ops, &r_actual, &r_expected); Ok(()) }); } #[test] fn p256_point_sum_mixed_test() { prefixed_extern! { fn p256_point_add_affine( r: *mut Limb, // [p256::COMMON_OPS.num_limbs*3] a: *const Limb, // [p256::COMMON_OPS.num_limbs*3] b: *const Limb, // [p256::COMMON_OPS.num_limbs*2] ); } point_sum_mixed_test( &p256::PRIVATE_KEY_OPS, p256_point_add_affine, test_vector_file!("ops/p256_point_sum_mixed_tests.txt"), ); } // XXX: There is no `nistz384_point_add_affine()`. fn point_sum_mixed_test( ops: &PrivateKeyOps, point_add_affine: unsafe extern "C" fn( r: *mut Limb, // [ops.num_limbs*3] a: *const Limb, // [ops.num_limbs*3] b: *const Limb, // [ops.num_limbs*2] ), test_file: test::File, ) { test::run(test_file, |section, test_case| { assert_eq!(section, ""); let a = consume_jacobian_point(ops, test_case, "a"); let b = consume_affine_point(ops, test_case, "b"); let r_expected: TestPoint = consume_point(ops, test_case, "r"); let mut r_actual = Point::new_at_infinity(); unsafe { point_add_affine(r_actual.xyz.as_mut_ptr(), a.xyz.as_ptr(), b.xy.as_ptr()); } assert_point_actual_equals_expected(ops, &r_actual, &r_expected); Ok(()) }); } #[test] fn p256_point_double_test() { prefixed_extern! 
{ fn p256_point_double( r: *mut Limb, // [p256::COMMON_OPS.num_limbs*3] a: *const Limb, // [p256::COMMON_OPS.num_limbs*3] ); } point_double_test( &p256::PRIVATE_KEY_OPS, p256_point_double, test_vector_file!("ops/p256_point_double_tests.txt"), ); } #[test] fn p384_point_double_test() { prefixed_extern! { fn p384_point_double( r: *mut Limb, // [p384::COMMON_OPS.num_limbs*3] a: *const Limb, // [p384::COMMON_OPS.num_limbs*3] ); } point_double_test( &p384::PRIVATE_KEY_OPS, p384_point_double, test_vector_file!("ops/p384_point_double_tests.txt"), ); } fn point_double_test( ops: &PrivateKeyOps, point_double: unsafe extern "C" fn( r: *mut Limb, // [ops.num_limbs*3] a: *const Limb, // [ops.num_limbs*3] ), test_file: test::File, ) { test::run(test_file, |section, test_case| { assert_eq!(section, ""); let a = consume_jacobian_point(ops, test_case, "a"); let r_expected: TestPoint = consume_point(ops, test_case, "r"); let mut r_actual = Point::new_at_infinity(); unsafe { point_double(r_actual.xyz.as_mut_ptr(), a.xyz.as_ptr()); } assert_point_actual_equals_expected(ops, &r_actual, &r_expected); Ok(()) }); } /// TODO: We should be testing `point_mul` with points other than the generator. #[test] fn p256_point_mul_test() { let generator = ( Elem::from(&p256::GENERATOR.0), Elem::from(&p256::GENERATOR.1), ); point_mul_base_tests( &p256::PRIVATE_KEY_OPS, |s, cpu| p256::PRIVATE_KEY_OPS.point_mul(s, &generator, cpu), test_vector_file!("ops/p256_point_mul_base_tests.txt"), ); } /// TODO: We should be testing `point_mul` with points other than the generator. #[test] fn p384_point_mul_test() { let generator = ( Elem::from(&p384::GENERATOR.0), Elem::from(&p384::GENERATOR.1), ); point_mul_base_tests( &p384::PRIVATE_KEY_OPS, |s, cpu| p384::PRIVATE_KEY_OPS.point_mul(s, &generator, cpu), test_vector_file!("ops/p384_point_mul_base_tests.txt"), ); } #[test] fn p256_point_mul_serialized_test() { point_mul_serialized_test( &p256::PRIVATE_KEY_OPS, &p256::PUBLIC_KEY_OPS, test_vector_file!("ops/p256_point_mul_serialized_tests.txt"), ); } fn point_mul_serialized_test( priv_ops: &PrivateKeyOps, pub_ops: &PublicKeyOps, test_file: test::File, ) { let cpu = cpu::features(); let cops = pub_ops.common; let q = &cops.elem_modulus(cpu); let n = &cops.scalar_modulus(cpu); test::run(test_file, |section, test_case| { assert_eq!(section, ""); let p_scalar = consume_scalar(n, test_case, "p_scalar"); let p = test_case.consume_bytes("p"); let p = super::super::public_key::parse_uncompressed_point( pub_ops, q, untrusted::Input::from(&p), ) .expect("valid point"); let expected_result = test_case.consume_bytes("r"); let product = priv_ops.point_mul(&p_scalar, &p, cpu::features()); let mut actual_result = vec![4u8; 1 + (2 * cops.len())]; { let (x, y) = actual_result[1..].split_at_mut(cops.len()); super::super::private_key::big_endian_affine_from_jacobian( priv_ops, q, x, Some(y), &product, ) .expect("successful encoding"); } assert_eq!(expected_result, actual_result); Ok(()) }) } #[test] fn p256_point_mul_base_test() { point_mul_base_tests( &p256::PRIVATE_KEY_OPS, |s, cpu| p256::PRIVATE_KEY_OPS.point_mul_base(s, cpu), test_vector_file!("ops/p256_point_mul_base_tests.txt"), ); } #[test] fn p384_point_mul_base_test() { point_mul_base_tests( &p384::PRIVATE_KEY_OPS, |s, cpu| p384::PRIVATE_KEY_OPS.point_mul_base(s, cpu), test_vector_file!("ops/p384_point_mul_base_tests.txt"), ); } pub(super) fn point_mul_base_tests( ops: &PrivateKeyOps, f: impl Fn(&Scalar, cpu::Features) -> Point, test_file: test::File, ) { let cpu = cpu::features(); let n = 
&ops.common.scalar_modulus(cpu); test::run(test_file, |section, test_case| { assert_eq!(section, ""); let g_scalar = consume_scalar(n, test_case, "g_scalar"); let expected_result: TestPoint = consume_point(ops, test_case, "r"); let actual_result = f(&g_scalar, cpu); assert_point_actual_equals_expected(ops, &actual_result, &expected_result); Ok(()) }) } fn assert_point_actual_equals_expected( ops: &PrivateKeyOps, actual_point: &Point, expected_point: &TestPoint, ) where Elem: Convert, { let cpu = cpu::features(); let cops = ops.common; let q = &cops.elem_modulus(cpu); let actual_x = &q.point_x(actual_point); let actual_y = &q.point_y(actual_point); let actual_z = &q.point_z(actual_point); match expected_point { TestPoint::Infinity => { let zero = Elem::zero(); assert_elems_are_equal(q, actual_z, &zero); } TestPoint::Affine(expected_x, expected_y) => { let zz_inv = ops.elem_inverse_squared(q, actual_z); let x_aff = q.elem_product(actual_x, &zz_inv); let y_aff = { let zzzz_inv = q.elem_squared(&zz_inv); let zzz_inv = q.elem_product(actual_z, &zzzz_inv); q.elem_product(actual_y, &zzz_inv) }; let x_aff = x_aff.convert(q); let y_aff = y_aff.convert(q); assert_elems_are_equal(q, &x_aff, expected_x); assert_elems_are_equal(q, &y_aff, expected_y); } } } fn consume_jacobian_point( ops: &PrivateKeyOps, test_case: &mut test::TestCase, name: &str, ) -> Point { let q = &ops.common.elem_modulus(cpu::features()); let input = test_case.consume_string(name); let elems = input.split(", ").collect::>(); assert_eq!(elems.len(), 3); let mut p = Point::new_at_infinity(); consume_point_elem(q, &mut p.xyz, &elems, 0); consume_point_elem(q, &mut p.xyz, &elems, 1); consume_point_elem(q, &mut p.xyz, &elems, 2); p } struct AffinePoint { xy: [Limb; 2 * elem::NumLimbs::MAX], } fn consume_affine_point( ops: &PrivateKeyOps, test_case: &mut test::TestCase, name: &str, ) -> AffinePoint { let q = &ops.common.elem_modulus(cpu::features()); let input = test_case.consume_string(name); let elems = input.split(", ").collect::>(); assert_eq!(elems.len(), 2); let mut p = AffinePoint { xy: [0; 2 * elem::NumLimbs::MAX], }; consume_point_elem(q, &mut p.xy, &elems, 0); consume_point_elem(q, &mut p.xy, &elems, 1); p } fn consume_point_elem(q: &Modulus, limbs_out: &mut [Limb], elems: &[&str], i: usize) { let num_limbs = q.num_limbs.into(); let bytes = test::from_hex(elems[i]).unwrap(); let bytes = untrusted::Input::from(&bytes); let r: Elem = elem_parse_big_endian_fixed_consttime(q, bytes).unwrap(); // XXX: “Transmute” this to `Elem` limbs. limbs_out[(i * num_limbs)..((i + 1) * num_limbs)].copy_from_slice(&r.limbs[..num_limbs]); } enum TestPoint { Infinity, Affine(Elem, Elem), } fn consume_point( ops: &PrivateKeyOps, test_case: &mut test::TestCase, name: &str, ) -> TestPoint { let q = &ops.common.elem_modulus(cpu::features()); fn consume_point_elem(q: &Modulus, elems: &[&str], i: usize) -> Elem { let bytes = test::from_hex(elems[i]).unwrap(); let bytes = untrusted::Input::from(&bytes); let unencoded: Elem = elem_parse_big_endian_fixed_consttime(q, bytes).unwrap(); // XXX: “Transmute” this to `Elem` limbs. 
Elem { limbs: unencoded.limbs, m: PhantomData, encoding: PhantomData, } } let input = test_case.consume_string(name); if input == "inf" { return TestPoint::Infinity; } let elems = input.split(", ").collect::>(); assert_eq!(elems.len(), 2); let x = consume_point_elem(q, &elems, 0); let y = consume_point_elem(q, &elems, 1); TestPoint::Affine(x, y) } fn assert_elems_are_equal(q: &Modulus, a: &Elem, b: &Elem) { assert_limbs_are_equal(q.cops, &a.limbs, &b.limbs) } fn assert_limbs_are_equal( ops: &CommonOps, actual: &[Limb; elem::NumLimbs::MAX], expected: &[Limb; elem::NumLimbs::MAX], ) { let num_limbs = ops.num_limbs.into(); if actual[..num_limbs] != expected[..num_limbs] { let mut actual_s = alloc::string::String::new(); let mut expected_s = alloc::string::String::new(); for j in 0..num_limbs { let width = LIMB_BITS / 4; let formatted = format!("{:0width$x}", actual[num_limbs - j - 1]); actual_s.push_str(&formatted); let formatted = format!("{:0width$x}", expected[num_limbs - j - 1]); expected_s.push_str(&formatted); } panic!( "Actual != Expected,\nActual = {}, Expected = {}", actual_s, expected_s ); } } fn consume_elem(q: &Modulus, test_case: &mut test::TestCase, name: &str) -> Elem { let unpadded_bytes = test_case.consume_bytes(name); let mut bytes = vec![0; q.bytes_len() - unpadded_bytes.len()]; bytes.extend(&unpadded_bytes); let bytes = untrusted::Input::from(&bytes); let r: Elem = elem_parse_big_endian_fixed_consttime(q, bytes).unwrap(); // XXX: “Transmute” this to an `Elem`. Elem { limbs: r.limbs, m: PhantomData, encoding: PhantomData, } } fn consume_scalar(n: &Modulus, test_case: &mut test::TestCase, name: &str) -> Scalar { let bytes = test_case.consume_bytes(name); let bytes = untrusted::Input::from(&bytes); scalar_parse_big_endian_variable(n, AllowZero::Yes, bytes).unwrap() } fn consume_scalar_mont( n: &Modulus, test_case: &mut test::TestCase, name: &str, ) -> Scalar { let bytes = test_case.consume_bytes(name); let bytes = untrusted::Input::from(&bytes); let s = scalar_parse_big_endian_variable(n, AllowZero::Yes, bytes).unwrap(); // “Transmute” it to a `Scalar`. Scalar { limbs: s.limbs, m: PhantomData, encoding: PhantomData, } } } mod elem; pub mod p256; pub mod p384; ring-0.17.14/src/ec/suite_b/private_key.rs000064400000000000000000000177221046102023000164460ustar 00000000000000// Copyright 2016 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! Functionality shared by operations on private keys (ECC keygen and //! ECDSA signing). use super::{ops::*, verify_affine_point_is_on_the_curve}; use crate::{arithmetic::montgomery::R, cpu, ec, error, limb, rand}; /// Generates a random scalar in the range [1, n). 
pub(super) fn random_scalar( ops: &PrivateKeyOps, n: &Modulus, rng: &dyn rand::SecureRandom, ) -> Result { let mut bytes = [0; ec::SCALAR_MAX_BYTES]; let bytes = &mut bytes[..ops.common.len()]; generate_private_scalar_bytes(ops, rng, bytes, n.cpu())?; scalar_from_big_endian_bytes(n, bytes) } pub(super) fn generate_private_scalar_bytes( ops: &PrivateKeyOps, rng: &dyn rand::SecureRandom, out: &mut [u8], cpu: cpu::Features, ) -> Result<(), error::Unspecified> { // [NSA Suite B Implementer's Guide to ECDSA] Appendix A.1.2, and // [NSA Suite B Implementer's Guide to NIST SP 800-56A] Appendix B.2, // "Key Pair Generation by Testing Candidates". // // [NSA Suite B Implementer's Guide to ECDSA]: doc/ecdsa.pdf // [NSA Suite B Implementer's Guide to NIST SP 800-56A]: doc/ecdh.pdf // TODO: The NSA guide also suggests, in appendix B.1, another mechanism // that would avoid the need to use `rng.fill()` more than once. It works // by generating an extra 64 bits of random bytes and then reducing the // output (mod n). Supposedly, this removes enough of the bias towards // small values from the modular reduction, but it isn't obvious that it is // sufficient. TODO: Figure out what we can do to mitigate the bias issue // and switch to the other mechanism. let candidate = out; // XXX: The value 100 was chosen to match OpenSSL due to uncertainty of // what specific value would be better, but it seems bad to try 100 times. for _ in 0..100 { // NSA Guide Steps 1, 2, and 3. // // Since we calculate the length ourselves, it is pointless to check // it, since we can only check it by doing the same calculation. // NSA Guide Step 4. // // The requirement that the random number generator has the // requested security strength is delegated to `rng`. rng.fill(candidate)?; // NSA Guide Steps 5, 6, and 7. if check_scalar_big_endian_bytes(ops, candidate, cpu).is_err() { continue; } // NSA Guide Step 8 is done in `public_from_private()`. // NSA Guide Step 9. return Ok(()); } Err(error::Unspecified) } // The underlying X25519 and Ed25519 code uses an [u8; 32] to store the private // key. To make the ECDH and ECDSA code similar to that, we also store the // private key that way, which means we have to convert it to a Scalar whenever // we need to use it. #[inline] pub(super) fn private_key_as_scalar(n: &Modulus, private_key: &ec::Seed) -> Scalar { // This cannot fail because we know the private key is valid. scalar_from_big_endian_bytes(n, private_key.bytes_less_safe()).unwrap() } pub(super) fn check_scalar_big_endian_bytes( ops: &PrivateKeyOps, bytes: &[u8], cpu: cpu::Features, ) -> Result<(), error::Unspecified> { debug_assert_eq!(bytes.len(), ops.common.len()); let n = &ops.common.scalar_modulus(cpu); scalar_from_big_endian_bytes(n, bytes).map(|_| ()) } // Parses a fixed-length (zero-padded) big-endian-encoded scalar in the range // [1, n). This is intended to be constant-time with respect to the actual // value *only if* the value is actually in range. In other words, this won't // leak anything about a valid value, but it might leak small amounts of // information about an invalid value (which constraint it failed). pub(super) fn scalar_from_big_endian_bytes( n: &Modulus, bytes: &[u8], ) -> Result { // [NSA Suite B Implementer's Guide to ECDSA] Appendix A.1.2, and // [NSA Suite B Implementer's Guide to NIST SP 800-56A] Appendix B.2, // "Key Pair Generation by Testing Candidates". 
// // [NSA Suite B Implementer's Guide to ECDSA]: doc/ecdsa.pdf // [NSA Suite B Implementer's Guide to NIST SP 800-56A]: doc/ecdh.pdf // // Steps 5, 6, and 7. // // XXX: The NSA guide says that we should verify that the random scalar is // in the range [0, n - 1) and then add one to it so that it is in the range // [1, n). Instead, we verify that the scalar is in the range [1, n). This // way, we avoid needing to compute or store the value (n - 1), we avoid the // need to implement a function to add one to a scalar, and we avoid needing // to convert the scalar back into an array of bytes. scalar_parse_big_endian_fixed_consttime(n, untrusted::Input::from(bytes)) } pub(super) fn public_from_private( ops: &PrivateKeyOps, public_out: &mut [u8], my_private_key: &ec::Seed, cpu: cpu::Features, ) -> Result<(), error::Unspecified> { let q = &ops.common.elem_modulus(cpu); let elem_and_scalar_bytes = ops.common.len(); debug_assert_eq!(public_out.len(), 1 + (2 * elem_and_scalar_bytes)); let n = &ops.common.scalar_modulus(cpu); let my_private_key = private_key_as_scalar(n, my_private_key); let my_public_key = ops.point_mul_base(&my_private_key, cpu); public_out[0] = 4; // Uncompressed encoding. let (x_out, y_out) = public_out[1..].split_at_mut(elem_and_scalar_bytes); // `big_endian_affine_from_jacobian` verifies that the point is not at // infinity and is on the curve. big_endian_affine_from_jacobian(ops, q, x_out, Some(y_out), &my_public_key) } pub(super) fn affine_from_jacobian( ops: &PrivateKeyOps, q: &Modulus, p: &Point, ) -> Result<(Elem, Elem), error::Unspecified> { let z = q.point_z(p); // Since we restrict our private key to the range [1, n), the curve has // prime order, and we verify that the peer's point is on the curve, // there's no way that the result can be at infinity. But, use `assert!` // instead of `debug_assert!` anyway assert!(q.elem_verify_is_not_zero(&z).is_ok()); let x = q.point_x(p); let y = q.point_y(p); let zz_inv = ops.elem_inverse_squared(q, &z); let x_aff = q.elem_product(&x, &zz_inv); // `y_aff` is needed to validate the point is on the curve. It is also // needed in the non-ECDH case where we need to output it. let y_aff = { let zzzz_inv = q.elem_squared(&zz_inv); let zzz_inv = q.elem_product(&z, &zzzz_inv); q.elem_product(&y, &zzz_inv) }; // If we validated our inputs correctly and then computed (x, y, z), then // (x, y, z) will be on the curve. See // `verify_affine_point_is_on_the_curve_scaled` for the motivation. verify_affine_point_is_on_the_curve(q, (&x_aff, &y_aff))?; Ok((x_aff, y_aff)) } pub(super) fn big_endian_affine_from_jacobian( ops: &PrivateKeyOps, q: &Modulus, x_out: &mut [u8], y_out: Option<&mut [u8]>, p: &Point, ) -> Result<(), error::Unspecified> { let (x_aff, y_aff) = affine_from_jacobian(ops, q, p)?; let x = q.elem_unencoded(&x_aff); limb::big_endian_from_limbs(ops.leak_limbs(&x), x_out); if let Some(y_out) = y_out { let y = q.elem_unencoded(&y_aff); limb::big_endian_from_limbs(ops.leak_limbs(&y), y_out); } Ok(()) } ring-0.17.14/src/ec/suite_b/public_key.rs000064400000000000000000000105711046102023000162450ustar 00000000000000// Copyright 2016 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! Functionality shared by operations on public keys (ECDSA verification and //! ECDH agreement). use super::{ops::*, verify_affine_point_is_on_the_curve}; use crate::{arithmetic::montgomery::*, error}; /// Parses a public key encoded in uncompressed form. The key is validated /// using the ECC Partial Public-Key Validation Routine from /// [NIST SP 800-56A, revision 2] Section 5.6.2.3.3, the NSA's /// "Suite B Implementer's Guide to NIST SP 800-56A," Appendix B.3, and the /// NSA's "Suite B Implementer's Guide to FIPS 186-3 (ECDSA)," Appendix A.3. /// /// [NIST SP 800-56A, revision 2]: /// http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-56Ar2.pdf pub(super) fn parse_uncompressed_point( ops: &PublicKeyOps, q: &Modulus, input: untrusted::Input, ) -> Result<(Elem, Elem), error::Unspecified> { // NIST SP 800-56A Step 1: "Verify that Q is not the point at infinity. // This can be done by inspection if the point is entered in the standard // affine representation." (We do it by inspection since we only accept // the affine representation.) let (x, y) = input.read_all(error::Unspecified, |input| { // The encoding must be 4, which is the encoding for "uncompressed". let encoding = input.read_byte()?; if encoding != 4 { return Err(error::Unspecified); } // NIST SP 800-56A Step 2: "Verify that xQ and yQ are integers in the // interval [0, p-1] in the case that q is an odd prime p[.]" let x = ops.elem_parse(q, input)?; let y = ops.elem_parse(q, input)?; Ok((x, y)) })?; // NIST SP 800-56A Step 3: "If q is an odd prime p, verify that // yQ**2 = xQ**3 + axQ + b in GF(p), where the arithmetic is performed // modulo p." verify_affine_point_is_on_the_curve(q, (&x, &y))?; // NIST SP 800-56A Note: "Since its order is not verified, there is no // check that the public key is in the correct EC subgroup." // // NSA Suite B Implementer's Guide Note: "ECC Full Public-Key Validation // includes an additional check to ensure that the point has the correct // order. This check is not necessary for curves having prime order (and // cofactor h = 1), such as P-256 and P-384." Ok((x, y)) } #[cfg(test)] mod tests { use super::*; use crate::cpu; use crate::testutil as test; #[test] fn parse_uncompressed_point_test() { let cpu = cpu::features(); test::run( test_vector_file!("suite_b_public_key_tests.txt"), |section, test_case| { assert_eq!(section, ""); let curve_name = test_case.consume_string("Curve"); let public_key = test_case.consume_bytes("Q"); let public_key = untrusted::Input::from(&public_key); let is_valid = test_case.consume_string("Result") == "P"; let curve_ops = public_key_ops_from_curve_name(&curve_name); let q = &curve_ops.common.elem_modulus(cpu); let result = parse_uncompressed_point(curve_ops, q, public_key); assert_eq!(is_valid, result.is_ok()); // TODO: Verify that we when we re-serialize the parsed (x, y), the // output is equal to the input. 
Ok(()) }, ); } fn public_key_ops_from_curve_name(curve_name: &str) -> &'static PublicKeyOps { if curve_name == "P-256" { &p256::PUBLIC_KEY_OPS } else if curve_name == "P-384" { &p384::PUBLIC_KEY_OPS } else { panic!("Unsupported curve: {}", curve_name); } } } ring-0.17.14/src/ec/suite_b.rs000064400000000000000000000213641046102023000141210ustar 00000000000000// Copyright 2016 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! Elliptic curve operations on P-256 & P-384. use self::ops::*; use crate::{arithmetic::montgomery::*, cpu, ec, error, io::der, pkcs8}; // NIST SP 800-56A Step 3: "If q is an odd prime p, verify that // yQ**2 = xQ**3 + axQ + b in GF(p), where the arithmetic is performed modulo // p." // // That is, verify that (x, y) is on the curve, which is true iif: // // y**2 == x**3 + a*x + b (mod q) // // Or, equivalently, but more efficiently: // // y**2 == (x**2 + a)*x + b (mod q) // fn verify_affine_point_is_on_the_curve( q: &Modulus, (x, y): (&Elem, &Elem), ) -> Result<(), error::Unspecified> { verify_affine_point_is_on_the_curve_scaled(q, (x, y), &Elem::from(q.a()), &Elem::from(q.b())) } // Use `verify_affine_point_is_on_the_curve` instead of this function whenever // the affine coordinates are available or will become available. This function // should only be used then the affine coordinates are never calculated. See // the notes for `verify_affine_point_is_on_the_curve_scaled`. // // The value `z**2` is returned on success because it is useful for ECDSA // verification. // // This function also verifies that the point is not at infinity. fn verify_jacobian_point_is_on_the_curve( q: &Modulus, p: &Point, ) -> Result, error::Unspecified> { let z = q.point_z(p); // Verify that the point is not at infinity. q.elem_verify_is_not_zero(&z)?; let x = q.point_x(p); let y = q.point_y(p); // We are given Jacobian coordinates (x, y, z). So, we have: // // (x/z**2, y/z**3) == (x', y'), // // where (x', y') are the affine coordinates. 
The curve equation is: // // y'**2 == x'**3 + a*x' + b == (x'**2 + a)*x' + b // // Substituting our Jacobian coordinates, we get: // // / y \**2 / / x \**2 \ / x \ // | ---- | == | | ---- | + a | * | ---- | + b // \ z**3 / \ \ z**2 / / \ z**2 / // // Simplify: // // y**2 / x**2 \ x // ---- == | ---- + a | * ---- + b // z**6 \ z**4 / z**2 // // Multiply both sides by z**6: // // z**6 / x**2 \ z**6 // ---- * y**2 == | ---- + a | * ---- * x + (z**6) * b // z**6 \ z**4 / z**2 // // Simplify: // // / x**2 \ // y**2 == | ---- + a | * z**4 * x + (z**6) * b // \ z**4 / // // Distribute z**4: // // / z**4 \ // y**2 == | ---- * x**2 + z**4 * a | * x + (z**6) * b // \ z**4 / // // Simplify: // // y**2 == (x**2 + z**4 * a) * x + (z**6) * b // let z2 = q.elem_squared(&z); let z4 = q.elem_squared(&z2); let z4_a = q.elem_product(&z4, &Elem::from(q.a())); let z6 = q.elem_product(&z4, &z2); let z6_b = q.elem_product(&z6, &Elem::from(q.b())); verify_affine_point_is_on_the_curve_scaled(q, (&x, &y), &z4_a, &z6_b)?; Ok(z2) } // Handles the common logic of point-is-on-the-curve checks for both affine and // Jacobian cases. // // When doing the check that the point is on the curve after a computation, // to avoid fault attacks or mitigate potential bugs, it is better for security // to use `verify_affine_point_is_on_the_curve` on the affine coordinates, // because it provides some protection against faults that occur in the // computation of the inverse of `z`. See the paper and presentation "Fault // Attacks on Projective-to-Affine Coordinates Conversion" by Diana Maimuţ, // Cédric Murdica, David Naccache, Mehdi Tibouchi. That presentation concluded // simply "Check the validity of the result after conversion to affine // coordinates." (It seems like a good idea to verify that // z_inv * z == 1 mod q too). // // In the case of affine coordinates (x, y), `a_scaled` and `b_scaled` are // `a` and `b`, respectively. In the case of Jacobian coordinates (x, y, z), // the computation and comparison is the same, except `a_scaled` and `b_scaled` // are (z**4 * a) and (z**6 * b), respectively. Thus, performance is another // reason to prefer doing the check on the affine coordinates, as Jacobian // computation requires 3 extra multiplications and 2 extra squarings. // // An example of a fault attack that isn't mitigated by a point-on-the-curve // check after multiplication is given in "Sign Change Fault Attacks On // Elliptic Curve Cryptosystems" by Johannes Blömer, Martin Otto, and // Jean-Pierre Seifert. 
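// Concretely, the function below computes lhs = y**2 and
// rhs = (x**2 + a_scaled) * x + b_scaled, and rejects the point unless
// lhs == rhs (using a constant-time limb comparison).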
fn verify_affine_point_is_on_the_curve_scaled( q: &Modulus, (x, y): (&Elem, &Elem), a_scaled: &Elem, b_scaled: &Elem, ) -> Result<(), error::Unspecified> { let lhs = q.elem_squared(y); let mut rhs = q.elem_squared(x); q.add_assign(&mut rhs, a_scaled); q.elem_mul(&mut rhs, x); q.add_assign(&mut rhs, b_scaled); if !q.elems_are_equal(&lhs, &rhs).leak() { return Err(error::Unspecified); } Ok(()) } pub(crate) fn key_pair_from_pkcs8( curve: &'static ec::Curve, template: &pkcs8::Template, input: untrusted::Input, cpu_features: cpu::Features, ) -> Result { let (ec_private_key, _) = pkcs8::unwrap_key(template, pkcs8::Version::V1Only, input)?; let (private_key, public_key) = ec_private_key.read_all(error::KeyRejected::invalid_encoding(), |input| { // https://tools.ietf.org/html/rfc5915#section-3 der::nested( input, der::Tag::Sequence, error::KeyRejected::invalid_encoding(), |input| key_pair_from_pkcs8_(template, input), ) })?; key_pair_from_bytes(curve, private_key, public_key, cpu_features) } fn key_pair_from_pkcs8_<'a>( template: &pkcs8::Template, input: &mut untrusted::Reader<'a>, ) -> Result<(untrusted::Input<'a>, untrusted::Input<'a>), error::KeyRejected> { let version = der::small_nonnegative_integer(input) .map_err(|error::Unspecified| error::KeyRejected::invalid_encoding())?; if version != 1 { return Err(error::KeyRejected::version_not_supported()); } let private_key = der::expect_tag_and_get_value(input, der::Tag::OctetString) .map_err(|error::Unspecified| error::KeyRejected::invalid_encoding())?; // [0] parameters (optional). if input.peek(u8::from(der::Tag::ContextSpecificConstructed0)) { let actual_alg_id = der::expect_tag_and_get_value(input, der::Tag::ContextSpecificConstructed0) .map_err(|error::Unspecified| error::KeyRejected::invalid_encoding())?; if actual_alg_id.as_slice_less_safe() != template.curve_oid().as_slice_less_safe() { return Err(error::KeyRejected::wrong_algorithm()); } } // [1] publicKey. The RFC says it is optional, but we require it // to be present. let public_key = der::nested( input, der::Tag::ContextSpecificConstructed1, error::Unspecified, der::bit_string_with_no_unused_bits, ) .map_err(|error::Unspecified| error::KeyRejected::invalid_encoding())?; Ok((private_key, public_key)) } pub(crate) fn key_pair_from_bytes( curve: &'static ec::Curve, private_key_bytes: untrusted::Input, public_key_bytes: untrusted::Input, cpu_features: cpu::Features, ) -> Result { let seed = ec::Seed::from_bytes(curve, private_key_bytes, cpu_features) .map_err(|error::Unspecified| error::KeyRejected::invalid_component())?; let r = ec::KeyPair::derive(seed, cpu_features) .map_err(|error::Unspecified| error::KeyRejected::unexpected_error())?; if public_key_bytes.as_slice_less_safe() != r.public_key().as_ref() { return Err(error::KeyRejected::inconsistent_components()); } Ok(r) } pub mod curve; pub mod ecdh; pub mod ecdsa; mod ops; mod private_key; mod public_key; ring-0.17.14/src/ec.rs000064400000000000000000000043161046102023000124650ustar 00000000000000// Copyright 2015-2017 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use crate::{cpu, error, rand}; pub use self::keys::{KeyPair, PublicKey, Seed}; pub struct Curve { pub public_key_len: usize, pub elem_scalar_seed_len: usize, pub id: CurveID, // Precondition: `bytes` is the correct length. check_private_key_bytes: fn(bytes: &[u8], cpu: cpu::Features) -> Result<(), error::Unspecified>, generate_private_key: fn( rng: &dyn rand::SecureRandom, &mut [u8], cpu: cpu::Features, ) -> Result<(), error::Unspecified>, public_from_private: fn( public_out: &mut [u8], private_key: &Seed, cpu: cpu::Features, ) -> Result<(), error::Unspecified>, } derive_debug_via_id!(Curve); #[derive(Clone, Copy, Debug, PartialEq)] pub enum CurveID { Curve25519, P256, P384, } const ELEM_MAX_BITS: usize = 384; pub const ELEM_MAX_BYTES: usize = (ELEM_MAX_BITS + 7) / 8; pub const SCALAR_MAX_BYTES: usize = ELEM_MAX_BYTES; const SEED_MAX_BYTES: usize = ELEM_MAX_BYTES; /// The maximum length of a PKCS#8 documents generated by *ring* for ECC keys. /// /// This is NOT the maximum length of a PKCS#8 document that can be consumed by /// `pkcs8::unwrap_key()`. /// /// `40` is the length of the P-384 template. It is actually one byte shorter /// than the P-256 template, but the private key and the public key are much /// longer. pub const PKCS8_DOCUMENT_MAX_LEN: usize = 40 + SCALAR_MAX_BYTES + keys::PUBLIC_KEY_MAX_LEN; pub mod curve25519; mod keys; pub mod suite_b; ring-0.17.14/src/error/input_too_long.rs000064400000000000000000000030651046102023000162660ustar 00000000000000// Copyright 2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. pub struct InputTooLongError { /// Note that this might not actually be the (exact) length of the input, /// and its units might be lost. For example, it could be any of the /// following: /// /// * The length in bytes of the entire input. /// * The length in bytes of some *part* of the input. /// * A bit length. /// * A length in terms of "blocks" or other grouping of input values. /// * Some intermediate quantity that was used when checking the input /// length. /// * Some arbitrary value. #[allow(dead_code)] imprecise_input_length: T, } impl InputTooLongError { #[cold] #[inline(never)] pub(crate) fn new(imprecise_input_length: T) -> Self { Self { imprecise_input_length, } } } ring-0.17.14/src/error/into_unspecified.rs000064400000000000000000000023271046102023000165560ustar 00000000000000// Copyright 2016-2024 Brian Smith. 
// // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use crate::error::{KeyRejected, Unspecified}; impl From for Unspecified { fn from(source: untrusted::EndOfInput) -> Self { super::erase(source) } } impl From for Unspecified { fn from(source: core::array::TryFromSliceError) -> Self { super::erase(source) } } impl From for Unspecified { fn from(source: KeyRejected) -> Self { super::erase(source) } } ring-0.17.14/src/error/key_rejected.rs000064400000000000000000000072501046102023000156640ustar 00000000000000// Copyright 2016-2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! Error reporting. #[cfg(feature = "std")] extern crate std; /// An error parsing or validating a key. /// /// The `Display` implementation will return a string that will help you better /// understand why a key was rejected change which errors are reported in which /// situations while minimizing the likelihood that any applications will be /// broken. /// /// Here is an incomplete list of reasons a key may be unsupported: /// /// * Invalid or Inconsistent Components: A component of the key has an invalid /// value, or the mathematical relationship between two (or more) components /// required for a valid key does not hold. /// /// * The encoding of the key is invalid. Perhaps the key isn't in the correct /// format; e.g. it may be Base64 ("PEM") encoded, in which case the Base64 /// encoding needs to be undone first. /// /// * The encoding includes a versioning mechanism and that mechanism indicates /// that the key is encoded in a version of the encoding that isn't supported. /// This might happen for multi-prime RSA keys (keys with more than two /// private prime factors), which aren't supported, for example. /// /// * Too small or too Large: One of the primary components of the key is too /// small or two large. Too-small keys are rejected for security reasons. Some /// unnecessarily large keys are rejected for performance reasons. /// /// * Wrong algorithm: The key is not valid for the algorithm in which it was /// being used. /// /// * Unexpected errors: Report this as a bug. 
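// Illustrative usage sketch (added for exposition; not part of the upstream
// sources): callers typically encounter `KeyRejected` through key-parsing APIs
// such as `signature::Ed25519KeyPair::from_pkcs8`, and can log the `Display`
// string to see which of the reasons listed above applied. Here `pkcs8_bytes:
// &[u8]` is an assumed, caller-provided PKCS#8 document.
//
//     use ring::signature::Ed25519KeyPair;
//
//     match Ed25519KeyPair::from_pkcs8(pkcs8_bytes) {
//         Ok(key_pair) => { /* use `key_pair` */ }
//         Err(rejected) => {
//             // Prints e.g. "InvalidEncoding" or "VersionNotSupported".
//             eprintln!("key rejected: {}", rejected);
//         }
//     }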
#[derive(Copy, Clone, Debug)] pub struct KeyRejected(&'static str); impl KeyRejected { pub(crate) fn inconsistent_components() -> Self { Self("InconsistentComponents") } pub(crate) fn invalid_component() -> Self { Self("InvalidComponent") } #[inline] pub(crate) fn invalid_encoding() -> Self { Self("InvalidEncoding") } // XXX: See the comment at the call site. pub(crate) fn rng_failed() -> Self { Self("RNG failed") } pub(crate) fn public_key_is_missing() -> Self { Self("PublicKeyIsMissing") } #[cfg(feature = "alloc")] pub(crate) fn too_small() -> Self { Self("TooSmall") } #[cfg(feature = "alloc")] pub(crate) fn too_large() -> Self { Self("TooLarge") } pub(crate) fn version_not_supported() -> Self { Self("VersionNotSupported") } pub(crate) fn wrong_algorithm() -> Self { Self("WrongAlgorithm") } #[cfg(feature = "alloc")] pub(crate) fn private_modulus_len_not_multiple_of_512_bits() -> Self { Self("PrivateModulusLenNotMultipleOf512Bits") } pub(crate) fn unexpected_error() -> Self { Self("UnexpectedError") } } #[cfg(feature = "std")] impl std::error::Error for KeyRejected {} impl core::fmt::Display for KeyRejected { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { f.write_str(self.0) } } ring-0.17.14/src/error/mod.rs000064400000000000000000000040421046102023000140020ustar 00000000000000// Copyright 2016-2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! Error reporting. pub use self::{key_rejected::KeyRejected, unspecified::Unspecified}; pub(crate) use self::{ input_too_long::InputTooLongError, len_mismatch_error::LenMismatchError, too_much_output_requested::TooMuchOutputRequestedError, }; mod input_too_long; mod into_unspecified; mod key_rejected; mod unspecified; #[cold] #[inline(never)] pub(crate) fn erase(_: T) -> Unspecified { Unspecified } cold_exhaustive_error! { struct too_much_output_requested::TooMuchOutputRequestedError with pub(crate) constructor { // Note that this might not actually be the (exact) output length // requested, and its units might be lost. For example, it could be any of // the following: // // * The length in bytes of the entire output. // * The length in bytes of some *part* of the output. // * A bit length. // * A length in terms of "blocks" or other grouping of output values. // * Some intermediate quantity that was used when checking the output // length. // * Some arbitrary value. imprecise_output_length: usize } } cold_exhaustive_error! { struct len_mismatch_error::LenMismatchError with pub(crate) constructor { len: usize } } ring-0.17.14/src/error/unspecified.rs000064400000000000000000000066041046102023000155270ustar 00000000000000// Copyright 2016-2024 Brian Smith. 
// // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #[cfg(feature = "std")] extern crate std; /// An error with absolutely no details. /// /// *ring* uses this unit type as the error type in most of its results /// because (a) usually the specific reasons for a failure are obvious or are /// not useful to know, and/or (b) providing more details about a failure might /// provide a dangerous side channel, and/or (c) it greatly simplifies the /// error handling logic. /// /// `Result` is mostly equivalent to /// `Result`. However, `ring::error::Unspecified` implements /// [`std::error::Error`] and users of *ring* can implement /// `From` to map this to their own error types, as /// described in [“Error Handling” in the Rust Book]: /// /// ``` /// use ring::rand::{self, SecureRandom}; /// /// enum Error { /// CryptoError, /// /// # #[cfg(feature = "alloc")] /// IOError(std::io::Error), /// // [...] /// } /// /// impl From for Error { /// fn from(_: ring::error::Unspecified) -> Self { Error::CryptoError } /// } /// /// fn eight_random_bytes() -> Result<[u8; 8], Error> { /// let rng = rand::SystemRandom::new(); /// let mut bytes = [0; 8]; /// /// // The `From` implementation above makes this /// // equivalent to /// // `rng.fill(&mut bytes).map_err(|_| Error::CryptoError)?`. /// rng.fill(&mut bytes)?; /// /// Ok(bytes) /// } /// /// assert!(eight_random_bytes().is_ok()); /// ``` /// /// Experience with using and implementing other crypto libraries like has /// shown that sophisticated error reporting facilities often cause significant /// bugs themselves, both within the crypto library and within users of the /// crypto library. This approach attempts to minimize complexity in the hopes /// of avoiding such problems. In some cases, this approach may be too extreme, /// and it may be important for an operation to provide some details about the /// cause of a failure. Users of *ring* are encouraged to report such cases so /// that they can be addressed individually. /// /// [`std::error::Error`]: https://doc.rust-lang.org/std/error/trait.Error.html /// [“Error Handling” in the Rust Book]: /// https://doc.rust-lang.org/book/first-edition/error-handling.html#the-from-trait #[derive(Clone, Copy, Debug, PartialEq)] pub struct Unspecified; // This is required for the implementation of `std::error::Error`. impl core::fmt::Display for Unspecified { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { f.write_str("ring::error::Unspecified") } } #[cfg(feature = "std")] impl std::error::Error for Unspecified {} ring-0.17.14/src/hkdf.rs000064400000000000000000000155211046102023000130120ustar 00000000000000// Copyright 2015 Brian Smith. 
// // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! HMAC-based Extract-and-Expand Key Derivation Function. //! //! HKDF is specified in [RFC 5869]. //! //! [RFC 5869]: https://tools.ietf.org/html/rfc5869 use crate::{error, hmac}; /// An HKDF algorithm. #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub struct Algorithm(hmac::Algorithm); impl Algorithm { /// The underlying HMAC algorithm. #[inline] pub fn hmac_algorithm(&self) -> hmac::Algorithm { self.0 } } /// HKDF using HMAC-SHA-1. Obsolete. pub static HKDF_SHA1_FOR_LEGACY_USE_ONLY: Algorithm = Algorithm(hmac::HMAC_SHA1_FOR_LEGACY_USE_ONLY); /// HKDF using HMAC-SHA-256. pub static HKDF_SHA256: Algorithm = Algorithm(hmac::HMAC_SHA256); /// HKDF using HMAC-SHA-384. pub static HKDF_SHA384: Algorithm = Algorithm(hmac::HMAC_SHA384); /// HKDF using HMAC-SHA-512. pub static HKDF_SHA512: Algorithm = Algorithm(hmac::HMAC_SHA512); impl KeyType for Algorithm { fn len(&self) -> usize { self.0.digest_algorithm().output_len() } } /// A salt for HKDF operations. #[derive(Debug)] pub struct Salt(hmac::Key); impl Salt { /// Constructs a new `Salt` with the given value based on the given digest /// algorithm. /// /// Constructing a `Salt` is relatively expensive so it is good to reuse a /// `Salt` object instead of re-constructing `Salt`s with the same value. pub fn new(algorithm: Algorithm, value: &[u8]) -> Self { Self(hmac::Key::new(algorithm.0, value)) } /// The [HKDF-Extract] operation. /// /// [HKDF-Extract]: https://tools.ietf.org/html/rfc5869#section-2.2 pub fn extract(&self, secret: &[u8]) -> Prk { // The spec says that if no salt is provided then a key of // `digest_alg.output_len` bytes of zeros is used. But, HMAC keys are // already zero-padded to the block length, which is larger than the output // length of the extract step (the length of the digest). Consequently the // `Key` constructor will automatically do the right thing for a // zero-length string. let salt = &self.0; let prk = hmac::sign(salt, secret); Prk(hmac::Key::new(salt.algorithm(), prk.as_ref())) } /// The algorithm used to derive this salt. #[inline] pub fn algorithm(&self) -> Algorithm { Algorithm(self.0.algorithm()) } } impl From> for Salt { fn from(okm: Okm<'_, Algorithm>) -> Self { Self(hmac::Key::from(Okm { prk: okm.prk, info: okm.info, len: okm.len().0, len_cached: okm.len_cached, })) } } /// The length of the OKM (Output Keying Material) for a `Prk::expand()` call. pub trait KeyType { /// The length that `Prk::expand()` should expand its input to. fn len(&self) -> usize; } /// A HKDF PRK (pseudorandom key). #[derive(Clone, Debug)] pub struct Prk(hmac::Key); impl Prk { /// Construct a new `Prk` directly with the given value. /// /// Usually one can avoid using this. It is useful when the application /// intentionally wants to leak the PRK secret, e.g. to implement /// `SSLKEYLOGFILE` functionality. 
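// Illustrative usage sketch (added for exposition; not part of the upstream
// sources): the usual extract-then-expand flow with the types in this module,
// deriving `HKDF_SHA256.len()` == 32 bytes of output keying material. The
// salt, secret, and info values are placeholders, and the calling function is
// assumed to return `Result<_, ring::error::Unspecified>` so that `?` applies.
//
//     use ring::hkdf;
//
//     let salt = hkdf::Salt::new(hkdf::HKDF_SHA256, b"optional salt value");
//     let prk = salt.extract(b"input keying material");
//     let info = [&b"application context"[..]];
//     let okm = prk.expand(&info, hkdf::HKDF_SHA256)?;
//     let mut out = [0u8; 32];
//     okm.fill(&mut out)?;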
pub fn new_less_safe(algorithm: Algorithm, value: &[u8]) -> Self { Self(hmac::Key::new(algorithm.hmac_algorithm(), value)) } /// The [HKDF-Expand] operation. /// /// [HKDF-Expand]: https://tools.ietf.org/html/rfc5869#section-2.3 /// /// Fails if (and only if) `len` is too large. #[inline] pub fn expand<'a, L: KeyType>( &'a self, info: &'a [&'a [u8]], len: L, ) -> Result, error::Unspecified> { let len_cached = len.len(); if len_cached > 255 * self.0.algorithm().digest_algorithm().output_len() { return Err(error::Unspecified); } Ok(Okm { prk: self, info, len, len_cached, }) } } impl From> for Prk { fn from(okm: Okm) -> Self { Self(hmac::Key::from(Okm { prk: okm.prk, info: okm.info, len: okm.len().0, len_cached: okm.len_cached, })) } } /// An HKDF OKM (Output Keying Material) /// /// Intentionally not `Clone` or `Copy` as an OKM is generally only safe to /// use once. #[derive(Debug)] pub struct Okm<'a, L: KeyType> { prk: &'a Prk, info: &'a [&'a [u8]], len: L, len_cached: usize, } impl Okm<'_, L> { /// The `OkmLength` given to `Prk::expand()`. #[inline] pub fn len(&self) -> &L { &self.len } /// Fills `out` with the output of the HKDF-Expand operation for the given /// inputs. /// /// Fails if (and only if) the requested output length is larger than 255 /// times the size of the digest algorithm's output. (This is the limit /// imposed by the HKDF specification due to the way HKDF's counter is /// constructed.) #[inline] pub fn fill(self, out: &mut [u8]) -> Result<(), error::Unspecified> { fill_okm(self.prk, self.info, out, self.len_cached) } } fn fill_okm( prk: &Prk, info: &[&[u8]], out: &mut [u8], len: usize, ) -> Result<(), error::Unspecified> { if out.len() != len { return Err(error::Unspecified); } let digest_alg = prk.0.algorithm().digest_algorithm(); assert!(digest_alg.block_len() >= digest_alg.output_len()); let mut ctx = hmac::Context::with_key(&prk.0); let mut n = 1u8; let mut out = out; loop { for info in info { ctx.update(info); } ctx.update(&[n]); let t = ctx.sign(); let t = t.as_ref(); // Append `t` to the output. out = if out.len() < digest_alg.output_len() { let len = out.len(); out.copy_from_slice(&t[..len]); &mut [] } else { let (this_chunk, rest) = out.split_at_mut(digest_alg.output_len()); this_chunk.copy_from_slice(t); rest }; if out.is_empty() { return Ok(()); } ctx = hmac::Context::with_key(&prk.0); ctx.update(t); n = n.checked_add(1).unwrap(); } } ring-0.17.14/src/hmac.rs000064400000000000000000000364531046102023000130150ustar 00000000000000// Copyright 2015-2016 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! HMAC is specified in [RFC 2104]. //! //! After a `Key` is constructed, it can be used for multiple signing or //! verification operations. Separating the construction of the key from the //! rest of the HMAC operation allows the per-key precomputation to be done //! 
only once, instead of it being done in every HMAC operation. //! //! Frequently all the data to be signed in a message is available in a single //! contiguous piece. In that case, the module-level `sign` function can be //! used. Otherwise, if the input is in multiple parts, `Context` should be //! used. //! //! # Examples: //! //! ## Signing a value and verifying it wasn't tampered with //! //! ``` //! use ring::{hmac, rand}; //! //! let rng = rand::SystemRandom::new(); //! let key = hmac::Key::generate(hmac::HMAC_SHA256, &rng)?; //! //! let msg = "hello, world"; //! //! let tag = hmac::sign(&key, msg.as_bytes()); //! //! // [We give access to the message to an untrusted party, and they give it //! // back to us. We need to verify they didn't tamper with it.] //! //! hmac::verify(&key, msg.as_bytes(), tag.as_ref())?; //! //! # Ok::<(), ring::error::Unspecified>(()) //! ``` //! //! ## Using the one-shot API: //! //! ``` //! use ring::{digest, hmac, rand}; //! use ring::rand::SecureRandom; //! //! let msg = "hello, world"; //! //! // The sender generates a secure key value and signs the message with it. //! // Note that in a real protocol, a key agreement protocol would be used to //! // derive `key_value`. //! let rng = rand::SystemRandom::new(); //! let key_value: [u8; digest::SHA256_OUTPUT_LEN] = rand::generate(&rng)?.expose(); //! //! let s_key = hmac::Key::new(hmac::HMAC_SHA256, key_value.as_ref()); //! let tag = hmac::sign(&s_key, msg.as_bytes()); //! //! // The receiver (somehow!) knows the key value, and uses it to verify the //! // integrity of the message. //! let v_key = hmac::Key::new(hmac::HMAC_SHA256, key_value.as_ref()); //! hmac::verify(&v_key, msg.as_bytes(), tag.as_ref())?; //! //! # Ok::<(), ring::error::Unspecified>(()) //! ``` //! //! ## Using the multi-part API: //! ``` //! use ring::{digest, hmac, rand}; //! use ring::rand::SecureRandom; //! //! let parts = ["hello", ", ", "world"]; //! //! // The sender generates a secure key value and signs the message with it. //! // Note that in a real protocol, a key agreement protocol would be used to //! // derive `key_value`. //! let rng = rand::SystemRandom::new(); //! let mut key_value: [u8; digest::SHA384_OUTPUT_LEN] = rand::generate(&rng)?.expose(); //! //! let s_key = hmac::Key::new(hmac::HMAC_SHA384, key_value.as_ref()); //! let mut s_ctx = hmac::Context::with_key(&s_key); //! for part in &parts { //! s_ctx.update(part.as_bytes()); //! } //! let tag = s_ctx.sign(); //! //! // The receiver (somehow!) knows the key value, and uses it to verify the //! // integrity of the message. //! let v_key = hmac::Key::new(hmac::HMAC_SHA384, key_value.as_ref()); //! let mut msg = Vec::::new(); //! for part in &parts { //! msg.extend(part.as_bytes()); //! } //! hmac::verify(&v_key, &msg.as_ref(), tag.as_ref())?; //! //! # Ok::<(), ring::error::Unspecified>(()) //! ``` //! //! [RFC 2104]: https://tools.ietf.org/html/rfc2104 use crate::{ bb, cpu, digest::{self, Digest, FinishError}, error, hkdf, rand, }; pub(crate) use crate::digest::InputTooLongError; /// An HMAC algorithm. #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub struct Algorithm(&'static digest::Algorithm); impl Algorithm { /// The digest algorithm this HMAC algorithm is based on. #[inline] pub fn digest_algorithm(&self) -> &'static digest::Algorithm { self.0 } } /// HMAC using SHA-1. Obsolete. pub static HMAC_SHA1_FOR_LEGACY_USE_ONLY: Algorithm = Algorithm(&digest::SHA1_FOR_LEGACY_USE_ONLY); /// HMAC using SHA-256. 
pub static HMAC_SHA256: Algorithm = Algorithm(&digest::SHA256); /// HMAC using SHA-384. pub static HMAC_SHA384: Algorithm = Algorithm(&digest::SHA384); /// HMAC using SHA-512. pub static HMAC_SHA512: Algorithm = Algorithm(&digest::SHA512); /// An HMAC tag. /// /// For a given tag `t`, use `t.as_ref()` to get the tag value as a byte slice. #[derive(Clone, Copy, Debug)] pub struct Tag(Digest); impl AsRef<[u8]> for Tag { #[inline] fn as_ref(&self) -> &[u8] { self.0.as_ref() } } /// A key to use for HMAC signing. #[derive(Clone)] pub struct Key { inner: digest::BlockContext, outer: digest::BlockContext, } impl core::fmt::Debug for Key { fn fmt(&self, f: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> { f.debug_struct("Key") .field("algorithm", self.algorithm().digest_algorithm()) .finish() } } impl Key { /// Generate an HMAC signing key using the given digest algorithm with a /// random value generated from `rng`. /// /// The key will be `digest_alg.output_len` bytes long, based on the /// recommendation in [RFC 2104 Section 3]. /// /// [RFC 2104 Section 3]: https://tools.ietf.org/html/rfc2104#section-3 pub fn generate( algorithm: Algorithm, rng: &dyn rand::SecureRandom, ) -> Result { Self::construct(algorithm, |buf| rng.fill(buf), cpu::features()) } fn construct( algorithm: Algorithm, fill: F, cpu: cpu::Features, ) -> Result where F: FnOnce(&mut [u8]) -> Result<(), error::Unspecified>, { let mut key_bytes = [0; digest::MAX_OUTPUT_LEN]; let key_bytes = &mut key_bytes[..algorithm.0.output_len()]; fill(key_bytes)?; Self::try_new(algorithm, key_bytes, cpu).map_err(error::erase::) } /// Construct an HMAC signing key using the given digest algorithm and key /// value. /// /// `key_value` should be a value generated using a secure random number /// generator (e.g. the `key_value` output by /// `SealingKey::generate_serializable()`) or derived from a random key by /// a key derivation function (e.g. `ring::hkdf`). In particular, /// `key_value` shouldn't be a password. /// /// As specified in RFC 2104, if `key_value` is shorter than the digest /// algorithm's block length (as returned by `digest::Algorithm::block_len()`, /// not the digest length returned by `digest::Algorithm::output_len()`) then /// it will be padded with zeros. Similarly, if it is longer than the block /// length then it will be compressed using the digest algorithm. /// /// You should not use keys larger than the `digest_alg.block_len` because /// the truncation described above reduces their strength to only /// `digest_alg.output_len * 8` bits. Support for such keys is likely to be /// removed in a future version of *ring*. pub fn new(algorithm: Algorithm, key_value: &[u8]) -> Self { Self::try_new(algorithm, key_value, cpu::features()) .map_err(error::erase::) .unwrap() } pub(crate) fn try_new( algorithm: Algorithm, key_value: &[u8], cpu_features: cpu::Features, ) -> Result { let digest_alg = algorithm.0; let mut key = Self { inner: digest::BlockContext::new(digest_alg), outer: digest::BlockContext::new(digest_alg), }; let block_len = digest_alg.block_len(); let key_hash; let key_value = if key_value.len() <= block_len { key_value } else { key_hash = Digest::compute_from(digest_alg, key_value, cpu_features)?; key_hash.as_ref() }; const IPAD: u8 = 0x36; let mut padded_key = [IPAD; digest::MAX_BLOCK_LEN]; let padded_key = &mut padded_key[..block_len]; // If the key is shorter than one block then we're supposed to act like // it is padded with zero bytes up to the block length. 
`x ^ 0 == x` so // we can just leave the trailing bytes of `padded_key` untouched. bb::xor_assign_at_start(&mut padded_key[..], key_value); let leftover = key.inner.update(padded_key, cpu_features); debug_assert_eq!(leftover.len(), 0); const OPAD: u8 = 0x5C; // Remove the `IPAD` masking, leaving the unmasked padded key, then // mask with `OPAD`, all in one step. bb::xor_assign(&mut padded_key[..], IPAD ^ OPAD); let leftover = key.outer.update(padded_key, cpu_features); debug_assert_eq!(leftover.len(), 0); Ok(key) } /// The digest algorithm for the key. #[inline] pub fn algorithm(&self) -> Algorithm { Algorithm(self.inner.algorithm) } pub(crate) fn sign(&self, data: &[u8], cpu: cpu::Features) -> Result { let mut ctx = Context::with_key(self); ctx.update(data); ctx.try_sign(cpu) } fn verify(&self, data: &[u8], tag: &[u8], cpu: cpu::Features) -> Result<(), VerifyError> { let computed = self .sign(data, cpu) .map_err(VerifyError::InputTooLongError)?; bb::verify_slices_are_equal(computed.as_ref(), tag) .map_err(|_: error::Unspecified| VerifyError::Mismatch) } } impl hkdf::KeyType for Algorithm { fn len(&self) -> usize { self.digest_algorithm().output_len() } } impl From> for Key { fn from(okm: hkdf::Okm) -> Self { Self::construct(*okm.len(), |buf| okm.fill(buf), cpu::features()).unwrap() } } /// A context for multi-step (Init-Update-Finish) HMAC signing. /// /// Use `sign` for single-step HMAC signing. #[derive(Clone)] pub struct Context { inner: digest::Context, outer: digest::BlockContext, } impl core::fmt::Debug for Context { fn fmt(&self, f: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> { f.debug_struct("Context") .field("algorithm", self.inner.algorithm()) .finish() } } impl Context { /// Constructs a new HMAC signing context using the given digest algorithm /// and key. pub fn with_key(signing_key: &Key) -> Self { Self { inner: digest::Context::clone_from(&signing_key.inner), outer: signing_key.outer.clone(), } } /// Updates the HMAC with all the data in `data`. `update` may be called /// zero or more times until `finish` is called. pub fn update(&mut self, data: &[u8]) { self.inner.update(data); } /// Finalizes the HMAC calculation and returns the HMAC value. `sign` /// consumes the context so it cannot be (mis-)used after `sign` has been /// called. /// /// It is generally not safe to implement HMAC verification by comparing /// the return value of `sign` to a tag. Use `verify` for verification /// instead. pub fn sign(self) -> Tag { self.try_sign(cpu::features()) .map_err(error::erase::) .unwrap() } pub(crate) fn try_sign(self, cpu_features: cpu::Features) -> Result { // Consequently, `num_pending` is valid. debug_assert_eq!(self.inner.algorithm(), self.outer.algorithm); debug_assert!(self.inner.algorithm().output_len() < self.outer.algorithm.block_len()); let inner = self.inner.try_finish(cpu_features)?; let inner = inner.as_ref(); let num_pending = inner.len(); let buffer = &mut [0u8; digest::MAX_BLOCK_LEN]; const _BUFFER_IS_LARGE_ENOUGH_TO_HOLD_INNER: () = assert!(digest::MAX_OUTPUT_LEN < digest::MAX_BLOCK_LEN); buffer[..num_pending].copy_from_slice(inner); self.outer .try_finish(buffer, num_pending, cpu_features) .map(Tag) .map_err(|err| match err { FinishError::InputTooLong(i) => { // Unreachable, as we gave the inner context exactly the // same input we gave the outer context, and // `inner.try_finish` already succeeded. However, it is // quite difficult to prove this, and we already return // `InputTooLongError`, so just forward it along. 
i } FinishError::PendingNotAPartialBlock(_) => { // Follows from the assertions above. unreachable!() } }) } } /// Calculates the HMAC of `data` using the key `key` in one step. /// /// Use `Context` to calculate HMACs where the input is in multiple parts. /// /// It is generally not safe to implement HMAC verification by comparing the /// return value of `sign` to a tag. Use `verify` for verification instead. pub fn sign(key: &Key, data: &[u8]) -> Tag { key.sign(data, cpu::features()) .map_err(error::erase::) .unwrap() } /// Calculates the HMAC of `data` using the signing key `key`, and verifies /// whether the resultant value equals `tag`, in one step. /// /// This is logically equivalent to, but more efficient than, constructing a /// `Key` with the same value as `key` and then using `verify`. /// /// The verification will be done in constant time to prevent timing attacks. pub fn verify(key: &Key, data: &[u8], tag: &[u8]) -> Result<(), error::Unspecified> { key.verify(data, tag, cpu::features()) .map_err(|_: VerifyError| error::Unspecified) } enum VerifyError { // Theoretically somebody could have calculated a valid tag with a gigantic // input that we do not support. If we were to support every theoretically // valid input length, for *every* digest algorithm, then we could argue // that hitting the input length limit implies a mismatch since nobody // could have calculated such a tag with the given input. #[allow(dead_code)] InputTooLongError(InputTooLongError), Mismatch, } #[cfg(test)] mod tests { use crate::{hmac, rand}; // Make sure that `Key::generate` and `verify_with_own_key` aren't // completely wacky. #[test] pub fn hmac_signing_key_coverage() { let rng = rand::SystemRandom::new(); const HELLO_WORLD_GOOD: &[u8] = b"hello, world"; const HELLO_WORLD_BAD: &[u8] = b"hello, worle"; for algorithm in &[ hmac::HMAC_SHA1_FOR_LEGACY_USE_ONLY, hmac::HMAC_SHA256, hmac::HMAC_SHA384, hmac::HMAC_SHA512, ] { let key = hmac::Key::generate(*algorithm, &rng).unwrap(); let tag = hmac::sign(&key, HELLO_WORLD_GOOD); assert!(hmac::verify(&key, HELLO_WORLD_GOOD, tag.as_ref()).is_ok()); assert!(hmac::verify(&key, HELLO_WORLD_BAD, tag.as_ref()).is_err()) } } } ring-0.17.14/src/io/der.rs000064400000000000000000000237131046102023000132610ustar 00000000000000// Copyright 2015 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! Building blocks for parsing DER-encoded ASN.1 structures. //! //! This module contains the foundational parts of an ASN.1 DER parser. 
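// Illustrative sketch (added for exposition; not part of the upstream sources):
// a standalone helper that mirrors the definite-length rules enforced by
// `read_tag_and_get_value` below: lengths below 128 use the short form, an
// 0x81 prefix introduces one length byte that must be >= 128, and an 0x82
// prefix introduces two length bytes encoding a value >= 256; any other
// length-of-length form is rejected. The function name is hypothetical; it
// returns the parsed length together with the bytes following the length
// octets, and exists only to make the encoding rules concrete.
#[allow(dead_code)]
fn example_parse_der_length(encoded: &[u8]) -> Option<(usize, &[u8])> {
    let (&first, rest) = encoded.split_first()?;
    match first {
        n if n & 0x80 == 0 => Some((usize::from(n), rest)),
        0x81 => {
            let (&len, rest) = rest.split_first()?;
            if len < 128 {
                return None; // Not the canonical (shortest) encoding.
            }
            Some((usize::from(len), rest))
        }
        0x82 => {
            let (hi, rest) = rest.split_first()?;
            let (lo, rest) = rest.split_first()?;
            let len = (usize::from(*hi) << 8) | usize::from(*lo);
            if len < 256 {
                return None; // Not the canonical (shortest) encoding.
            }
            Some((len, rest))
        }
        _ => None, // Longer length-of-length forms are not supported.
    }
}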
use super::Positive; use crate::error; pub const CONSTRUCTED: u8 = 1 << 5; pub const CONTEXT_SPECIFIC: u8 = 2 << 6; #[derive(Clone, Copy, PartialEq)] #[repr(u8)] pub enum Tag { Boolean = 0x01, Integer = 0x02, BitString = 0x03, OctetString = 0x04, Null = 0x05, OID = 0x06, Sequence = CONSTRUCTED | 0x10, // 0x30 UTCTime = 0x17, GeneralizedTime = 0x18, ContextSpecific1 = CONTEXT_SPECIFIC | 1, ContextSpecificConstructed0 = CONTEXT_SPECIFIC | CONSTRUCTED | 0, ContextSpecificConstructed1 = CONTEXT_SPECIFIC | CONSTRUCTED | 1, ContextSpecificConstructed3 = CONTEXT_SPECIFIC | CONSTRUCTED | 3, } impl From for usize { fn from(tag: Tag) -> Self { Self::from(Tag::into(tag)) } } impl From for u8 { fn from(tag: Tag) -> Self { Tag::into(tag) } } // `impl From for u8` but as a `const fn`. impl Tag { pub const fn into(self) -> u8 { self as u8 } } pub fn expect_tag_and_get_value<'a>( input: &mut untrusted::Reader<'a>, tag: Tag, ) -> Result, error::Unspecified> { let (actual_tag, inner) = read_tag_and_get_value(input)?; if usize::from(tag) != usize::from(actual_tag) { return Err(error::Unspecified); } Ok(inner) } pub fn read_tag_and_get_value<'a>( input: &mut untrusted::Reader<'a>, ) -> Result<(u8, untrusted::Input<'a>), error::Unspecified> { let tag = input.read_byte()?; if (tag & 0x1F) == 0x1F { return Err(error::Unspecified); // High tag number form is not allowed. } // If the high order bit of the first byte is set to zero then the length // is encoded in the seven remaining bits of that byte. Otherwise, those // seven bits represent the number of bytes used to encode the length. let length = match input.read_byte()? { n if (n & 0x80) == 0 => usize::from(n), 0x81 => { let second_byte = input.read_byte()?; if second_byte < 128 { return Err(error::Unspecified); // Not the canonical encoding. } usize::from(second_byte) } 0x82 => { let second_byte = usize::from(input.read_byte()?); let third_byte = usize::from(input.read_byte()?); let combined = (second_byte << 8) | third_byte; if combined < 256 { return Err(error::Unspecified); // Not the canonical encoding. } combined } _ => { return Err(error::Unspecified); // We don't support longer lengths. } }; let inner = input.read_bytes(length)?; Ok((tag, inner)) } #[inline] pub fn bit_string_with_no_unused_bits<'a>( input: &mut untrusted::Reader<'a>, ) -> Result, error::Unspecified> { bit_string_tagged_with_no_unused_bits(Tag::BitString, input) } pub(crate) fn bit_string_tagged_with_no_unused_bits<'a>( tag: Tag, input: &mut untrusted::Reader<'a>, ) -> Result, error::Unspecified> { nested(input, tag, error::Unspecified, |value| { let unused_bits_at_end = value.read_byte().map_err(|_| error::Unspecified)?; if unused_bits_at_end != 0 { return Err(error::Unspecified); } Ok(value.read_bytes_to_end()) }) } // TODO: investigate taking decoder as a reference to reduce generated code // size. pub fn nested<'a, F, R, E: Copy>( input: &mut untrusted::Reader<'a>, tag: Tag, error: E, decoder: F, ) -> Result where F: FnOnce(&mut untrusted::Reader<'a>) -> Result, { let inner = expect_tag_and_get_value(input, tag).map_err(|_| error)?; inner.read_all(error, decoder) } pub(crate) fn nonnegative_integer<'a>( input: &mut untrusted::Reader<'a>, ) -> Result, error::Unspecified> { let value = expect_tag_and_get_value(input, Tag::Integer)?; match value .as_slice_less_safe() .split_first() .ok_or(error::Unspecified)? { // Zero or leading zero. (0, rest) => { match rest.first() { // Zero. None => Ok(value), // Necessary leading zero. 
Some(&second) if second & 0x80 == 0x80 => Ok(untrusted::Input::from(rest)), // Unnecessary leading zero. _ => Err(error::Unspecified), } } // Positive value with no leading zero. (first, _) if first & 0x80 == 0 => Ok(value), // Negative value. (_, _) => Err(error::Unspecified), } } /// Parse as integer with a value in the in the range [0, 255], returning its /// numeric value. This is typically used for parsing version numbers. #[inline] pub fn small_nonnegative_integer(input: &mut untrusted::Reader) -> Result { let value = nonnegative_integer(input)?; match *value.as_slice_less_safe() { [b] => Ok(b), _ => Err(error::Unspecified), } } /// Parses a positive DER integer, returning the big-endian-encoded value, /// sans any leading zero byte. pub fn positive_integer<'a>( input: &mut untrusted::Reader<'a>, ) -> Result, error::Unspecified> { let value = nonnegative_integer(input)?; Positive::from_be_bytes(value) } #[cfg(test)] mod tests { use super::*; use crate::error; fn with_i<'a, F, R>(value: &'a [u8], f: F) -> Result where F: FnOnce(&mut untrusted::Reader<'a>) -> Result, { untrusted::Input::from(value).read_all(error::Unspecified, f) } static ZERO_INTEGER: &[u8] = &[0x02, 0x01, 0x00]; static GOOD_POSITIVE_INTEGERS_SMALL: &[(&[u8], u8)] = &[ (&[0x02, 0x01, 0x01], 0x01), (&[0x02, 0x01, 0x02], 0x02), (&[0x02, 0x01, 0x7e], 0x7e), (&[0x02, 0x01, 0x7f], 0x7f), // Values that need to have an 0x00 prefix to disambiguate them from // them from negative values. (&[0x02, 0x02, 0x00, 0x80], 0x80), (&[0x02, 0x02, 0x00, 0x81], 0x81), (&[0x02, 0x02, 0x00, 0xfe], 0xfe), (&[0x02, 0x02, 0x00, 0xff], 0xff), ]; static GOOD_POSITIVE_INTEGERS_LARGE: &[(&[u8], &[u8])] = &[ (&[0x02, 0x02, 0x01, 0x00], &[0x01, 0x00]), (&[0x02, 0x02, 0x02, 0x01], &[0x02, 0x01]), (&[0x02, 0x02, 0x7e, 0xfe], &[0x7e, 0xfe]), (&[0x02, 0x02, 0x7f, 0xff], &[0x7f, 0xff]), // Values that need to have an 0x00 prefix to disambiguate them from // them from negative values. (&[0x02, 0x03, 0x00, 0x80, 0x00], &[0x80, 0x00]), (&[0x02, 0x03, 0x00, 0x81, 0x01], &[0x81, 0x01]), (&[0x02, 0x03, 0x00, 0xfe, 0xfe], &[0xfe, 0xfe]), (&[0x02, 0x03, 0x00, 0xff, 0xff], &[0xff, 0xff]), ]; static BAD_NONNEGATIVE_INTEGERS: &[&[u8]] = &[ &[], // At end of input &[0x02], // Tag only &[0x02, 0x00], // Empty value // Length mismatch &[0x02, 0x00, 0x01], &[0x02, 0x01], // Would be valid if leading zero is ignored when comparing length. &[0x02, 0x01, 0x00, 0x01], &[0x02, 0x01, 0x01, 0x00], // Would be valid if last byte is ignored. 
&[0x02, 0x02, 0x01], // Values that are missing a necessary leading 0x00 &[0x02, 0x01, 0x80], &[0x02, 0x01, 0x81], &[0x02, 0x01, 0xfe], &[0x02, 0x01, 0xff], // Values that have an unnecessary leading 0x00 &[0x02, 0x02, 0x00, 0x00], &[0x02, 0x02, 0x00, 0x01], &[0x02, 0x02, 0x00, 0x02], &[0x02, 0x02, 0x00, 0x7e], &[0x02, 0x02, 0x00, 0x7f], ]; #[test] fn test_small_nonnegative_integer() { let zero = (ZERO_INTEGER, 0x00); for &(test_in, test_out) in core::iter::once(&zero).chain(GOOD_POSITIVE_INTEGERS_SMALL.iter()) { let result = with_i(test_in, |input| { assert_eq!(small_nonnegative_integer(input)?, test_out); Ok(()) }); assert_eq!(result, Ok(())); } for &test_in in BAD_NONNEGATIVE_INTEGERS .iter() .chain(GOOD_POSITIVE_INTEGERS_LARGE.iter().map(|(input, _)| input)) { let result = with_i(test_in, small_nonnegative_integer); assert_eq!(result, Err(error::Unspecified)); } } #[test] fn test_positive_integer() { for (test_in, test_out) in GOOD_POSITIVE_INTEGERS_SMALL .iter() .map(|(test_in, test_out)| (*test_in, core::slice::from_ref(test_out))) .chain(GOOD_POSITIVE_INTEGERS_LARGE.iter().copied()) { let result = with_i(test_in, |input| { assert_eq!( positive_integer(input)?.big_endian_without_leading_zero(), test_out ); Ok(()) }); assert_eq!(result, Ok(())) } for &test_in in core::iter::once(&ZERO_INTEGER).chain(BAD_NONNEGATIVE_INTEGERS.iter()) { let result = with_i(test_in, positive_integer); assert!(matches!(result, Err(error::Unspecified))); } } } ring-0.17.14/src/io/der_writer.rs000064400000000000000000000045021046102023000146500ustar 00000000000000// Copyright 2018 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::{der::*, writer::*, *}; use alloc::boxed::Box; pub(crate) fn write_positive_integer( output: &mut dyn Accumulator, value: &Positive, ) -> Result<(), TooLongError> { let first_byte = value.first_byte(); let value = value.big_endian_without_leading_zero_as_input(); write_tlv(output, Tag::Integer, |output| { if (first_byte & 0x80) != 0 { output.write_byte(0)?; // Disambiguate negative number. 
} write_copy(output, value) }) } pub(crate) fn write_all( tag: Tag, write_value: &dyn Fn(&mut dyn Accumulator) -> Result<(), TooLongError>, ) -> Result, TooLongError> { let length = { let mut length = LengthMeasurement::zero(); write_tlv(&mut length, tag, write_value)?; length }; let mut output = Writer::with_capacity(length); write_tlv(&mut output, tag, write_value)?; Ok(output.into()) } fn write_tlv(output: &mut dyn Accumulator, tag: Tag, write_value: F) -> Result<(), TooLongError> where F: Fn(&mut dyn Accumulator) -> Result<(), TooLongError>, { let length: usize = { let mut length = LengthMeasurement::zero(); write_value(&mut length)?; length.into() }; let length: u16 = length.try_into().map_err(|_| TooLongError::new())?; output.write_byte(tag.into())?; let [lo, hi] = length.to_le_bytes(); if length >= 0x1_00 { output.write_byte(0x82)?; output.write_byte(hi)?; } else if length >= 0x80 { output.write_byte(0x81)?; } output.write_byte(lo)?; write_value(output) } ring-0.17.14/src/io/positive.rs000064400000000000000000000066721046102023000143560ustar 00000000000000// Copyright 2018 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! Serialization and deserialization. use crate::error; /// A serialized positive integer. #[derive(Copy, Clone)] pub struct Positive<'a>(untrusted::Input<'a>); impl<'a> Positive<'a> { #[inline] pub(crate) fn from_be_bytes(input: untrusted::Input<'a>) -> Result { // Empty inputs are not allowed. let &first_byte = input .as_slice_less_safe() .first() .ok_or(error::Unspecified)?; // Zero isn't allowed and leading zeros aren't allowed. if first_byte == 0 { return Err(error::Unspecified); } Ok(Self(input)) } /// Returns the value, ordered from significant byte to least significant /// byte, without any leading zeros. The result is guaranteed to be /// non-empty. #[inline] pub fn big_endian_without_leading_zero(&self) -> &'a [u8] { self.big_endian_without_leading_zero_as_input() .as_slice_less_safe() } #[inline] pub(crate) fn big_endian_without_leading_zero_as_input(&self) -> untrusted::Input<'a> { self.0 } } impl Positive<'_> { /// Returns the first byte. /// /// Will not panic because the value is guaranteed to have at least one /// byte. pub fn first_byte(&self) -> u8 { // This won't panic because self.0.as_slice_less_safe()[0] } } #[cfg(test)] mod tests { use super::*; #[test] fn test_from_be_bytes() { static TEST_CASES: &[(&[u8], Result<&[u8], error::Unspecified>)] = &[ // An empty input isn't a number. (&[], Err(error::Unspecified)), // Zero is not positive. (&[0x00], Err(error::Unspecified)), // Minimum value. No leading zero required or allowed. (&[0x00, 0x01], Err(error::Unspecified)), (&[0x01], Ok(&[0x01])), // Maximum first byte. No leading zero required or allowed. (&[0xff], Ok(&[0xff])), (&[0x00, 0xff], Err(error::Unspecified)), // The last byte can be zero. 
(&[0x01, 0x00], Ok(&[0x01, 0x00])), (&[0x01, 0x00, 0x00], Ok(&[0x01, 0x00, 0x00])), // Having no zero bytes are also allowed. (&[0x01, 0x01], Ok(&[0x01, 0x01])), // A middle byte can be zero. (&[0x01, 0x00, 0x01], Ok(&[0x01, 0x00, 0x01])), (&[0x01, 0x01, 0x01], Ok(&[0x01, 0x01, 0x01])), ]; for &(input, result) in TEST_CASES { let input = untrusted::Input::from(input); assert_eq!( Positive::from_be_bytes(input).map(|p| p.big_endian_without_leading_zero()), result ); } } } ring-0.17.14/src/io/writer.rs000064400000000000000000000052271046102023000140230ustar 00000000000000// Copyright 2018 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use alloc::{boxed::Box, vec::Vec}; pub trait Accumulator { fn write_byte(&mut self, value: u8) -> Result<(), TooLongError>; fn write_bytes(&mut self, value: &[u8]) -> Result<(), TooLongError>; } pub(super) struct LengthMeasurement { len: usize, } impl From for usize { fn from(len: LengthMeasurement) -> usize { len.len } } impl LengthMeasurement { pub fn zero() -> Self { Self { len: 0 } } } impl Accumulator for LengthMeasurement { fn write_byte(&mut self, _value: u8) -> Result<(), TooLongError> { self.len = self.len.checked_add(1).ok_or_else(TooLongError::new)?; Ok(()) } fn write_bytes(&mut self, value: &[u8]) -> Result<(), TooLongError> { self.len = self .len .checked_add(value.len()) .ok_or_else(TooLongError::new)?; Ok(()) } } pub(super) struct Writer { bytes: Vec, requested_capacity: usize, } impl Writer { pub(super) fn with_capacity(capacity: LengthMeasurement) -> Self { Self { bytes: Vec::with_capacity(capacity.len), requested_capacity: capacity.len, } } } impl From for Box<[u8]> { fn from(writer: Writer) -> Self { assert_eq!(writer.requested_capacity, writer.bytes.len()); writer.bytes.into_boxed_slice() } } impl Accumulator for Writer { fn write_byte(&mut self, value: u8) -> Result<(), TooLongError> { self.bytes.push(value); Ok(()) } fn write_bytes(&mut self, value: &[u8]) -> Result<(), TooLongError> { self.bytes.extend(value); Ok(()) } } pub fn write_copy( accumulator: &mut dyn Accumulator, to_copy: untrusted::Input, ) -> Result<(), TooLongError> { accumulator.write_bytes(to_copy.as_slice_less_safe()) } pub struct TooLongError(()); impl TooLongError { pub fn new() -> Self { Self(()) } } ring-0.17.14/src/io.rs000064400000000000000000000020401046102023000124750ustar 00000000000000// Copyright 2018 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! Serialization and deserialization. #[doc(hidden)] pub mod der; #[cfg(feature = "alloc")] mod writer; #[cfg(feature = "alloc")] pub(crate) mod der_writer; pub(crate) mod positive; pub use self::positive::Positive; #[cfg(feature = "alloc")] pub(crate) use self::writer::TooLongError; ring-0.17.14/src/lib.rs000064400000000000000000000123611046102023000126430ustar 00000000000000// Copyright 2015-2016 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! # Feature Flags //! //! //!
<table> //! <tr><th>Feature //! <th>Description //!
<tr><td><code>alloc (default)</code> //! <td>Enable features that require use of the heap, RSA in particular. //!
<tr><td><code>less-safe-getrandom-custom-or-rdrand</code> //! <td>Treat user-provided ("custom") and RDRAND-based getrandom //! implementations as secure random number generators (see //! SecureRandom). This feature only works with //! os = "none" targets. See //! //! register_custom_getrandom //! and //! RDRAND on x86 //! for additional details. //!
<tr><td><code>less-safe-getrandom-espidf</code> //! <td>Treat getrandom as a secure random number generator (see //! SecureRandom) on the esp-idf target. While the esp-idf //! target does have hardware RNG, it is beyond the scope of ring to //! ensure its configuration. This feature allows ring to build //! on esp-idf despite the likelihood that RNG is not secure. //! This feature only works with os = espidf targets. //! See //!
<tr><td><code>std</code> //! <td>Enable features that use libstd, in particular //! std::error::Error integration. Implies `alloc`. //!
<tr><td><code>wasm32_unknown_unknown_js</code> //! <td>When this feature is enabled, for the wasm32-unknown-unknown target, //! Web APIs will be used to implement features like `ring::rand` that //! require an operating environment of some kind. This has no effect //! for any other target. This enables the `getrandom` crate's `js` //! feature. //! </table>
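//!
//! As a brief illustration of the `std` integration mentioned above (an added
//! example that mirrors the crate's documented `Display` and `Error`
//! behavior):
//!
//! ```
//! # #[cfg(feature = "std")]
//! # {
//! use ring::error::Unspecified;
//!
//! let err: Box<dyn std::error::Error> = Box::new(Unspecified);
//! assert_eq!(err.to_string(), "ring::error::Unspecified");
//! # }
//! ```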
// When running mk/package.sh, don't actually build any code. #![allow( clippy::collapsible_if, clippy::identity_op, clippy::len_without_is_empty, clippy::let_unit_value, clippy::new_without_default, clippy::neg_cmp_op_on_partial_ord, clippy::too_many_arguments, clippy::type_complexity, non_camel_case_types, non_snake_case, unsafe_code )] #![deny(variant_size_differences)] #![forbid( unused_results, unsafe_op_in_unsafe_fn, clippy::char_lit_as_u8, clippy::fn_to_numeric_cast, clippy::fn_to_numeric_cast_with_truncation, clippy::ptr_as_ptr )] #![warn( clippy::unnecessary_cast, clippy::cast_lossless, clippy::cast_possible_truncation, clippy::cast_possible_wrap, clippy::cast_precision_loss, clippy::cast_sign_loss )] #![cfg_attr( not(any( all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little"), target_arch = "x86", target_arch = "x86_64", feature = "alloc" )), allow(dead_code, unused_imports, unused_macros) )] #![no_std] #[cfg(feature = "alloc")] extern crate alloc; #[macro_use] mod debug; #[macro_use] mod prefixed; #[doc(hidden)] #[macro_use] mod testutil; #[macro_use] mod bssl; #[macro_use] mod polyfill; pub mod aead; pub mod agreement; mod arithmetic; mod bits; pub(crate) mod bb; pub(crate) mod c; #[doc(hidden)] #[deprecated( note = "Will be removed. Internal module not intended for external use, with no promises regarding side channels." )] pub mod deprecated_constant_time; #[doc(hidden)] #[allow(deprecated)] #[deprecated( note = "Will be removed. Internal module not intended for external use, with no promises regarding side channels." )] pub use deprecated_constant_time as constant_time; pub mod io; mod cpu; pub mod digest; mod ec; pub mod error; pub mod hkdf; pub mod hmac; mod limb; pub mod pbkdf2; pub mod pkcs8; pub mod rand; #[cfg(feature = "alloc")] pub mod rsa; pub mod signature; #[cfg(test)] mod tests; mod sealed { /// Traits that are designed to only be implemented internally in *ring*. // // Usage: // ``` // use crate::sealed; // // pub trait MyType: sealed::Sealed { // // [...] // } // // impl sealed::Sealed for MyType {} // ``` pub trait Sealed {} } #[deprecated(note = "internal API that will be removed")] pub mod deprecated_test; #[allow(deprecated)] #[deprecated(note = "internal API that will be removed")] pub use deprecated_test as test; ring-0.17.14/src/limb.rs000064400000000000000000000546451046102023000130330ustar 00000000000000// Copyright 2016 David Judd. // Copyright 2016 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! Unsigned multi-precision integer arithmetic. //! //! Limbs ordered least-significant-limb to most-significant-limb. The bits //! limbs use the native endianness. 
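// Illustrative sketch (added for exposition; not part of the upstream sources):
// a standalone helper showing the limb ordering described above. The least
// significant limb is stored first, mirroring the `rchunks(LIMB_BYTES)`
// conversion performed by `parse_big_endian_and_pad_consttime` below. `u64`
// limbs are used here for concreteness; unlike the real function, this sketch
// silently truncates input that does not fit in `limbs_out`.
#[allow(dead_code)]
fn example_be_bytes_to_limbs(big_endian: &[u8], limbs_out: &mut [u64]) {
    limbs_out.iter_mut().for_each(|limb| *limb = 0); // Zero-pad the high limbs.
    for (limb, chunk) in limbs_out.iter_mut().zip(big_endian.rchunks(8)) {
        let mut padded = [0u8; 8];
        padded[8 - chunk.len()..].copy_from_slice(chunk);
        *limb = u64::from_be_bytes(padded); // limbs_out[0] is least significant.
    }
}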
use crate::{ arithmetic::inout::{AliasingSlices2, AliasingSlices3}, bb, c, error::{self, LenMismatchError}, polyfill::{sliceutil, usize_from_u32, ArrayFlatMap}, }; use core::{iter, num::NonZeroUsize}; #[cfg(any(test, feature = "alloc"))] use crate::bits; #[cfg(feature = "alloc")] use core::num::Wrapping; // XXX: Not correct for x32 ABIs. pub type Limb = bb::Word; pub type LeakyLimb = bb::LeakyWord; pub const LIMB_BITS: usize = usize_from_u32(Limb::BITS); pub const LIMB_BYTES: usize = (LIMB_BITS + 7) / 8; pub type LimbMask = bb::BoolMask; #[inline] pub fn limbs_equal_limbs_consttime(a: &[Limb], b: &[Limb]) -> Result { if a.len() != b.len() { return Err(LenMismatchError::new(a.len())); } let all = a.iter().zip(b).fold(0, |running, (a, b)| running | (a ^ b)); Ok(limb_is_zero(all)) } #[inline] fn limbs_less_than_limbs(a: &[Limb], b: &[Limb]) -> Result { prefixed_extern! { fn LIMBS_less_than(a: *const Limb, b: *const Limb, num_limbs: c::NonZero_size_t) -> LimbMask; } // Use `b.len` because usually `b` will be the modulus which is likely to // have had its length checked already so this can be elided by the // optimizer. // XXX: Questionable whether `LenMismatchError` is appropriate. let len = NonZeroUsize::new(b.len()).ok_or_else(|| LenMismatchError::new(a.len()))?; if a.len() != len.get() { return Err(LenMismatchError::new(a.len())); } Ok(unsafe { LIMBS_less_than(a.as_ptr(), b.as_ptr(), len) }) } #[inline] pub(crate) fn verify_limbs_less_than_limbs_leak_bit( a: &[Limb], b: &[Limb], ) -> Result<(), error::Unspecified> { let r = limbs_less_than_limbs(a, b).map_err(error::erase::)?; if r.leak() { Ok(()) } else { Err(error::Unspecified) } } #[inline] pub fn limbs_less_than_limbs_vartime(a: &[Limb], b: &[Limb]) -> Result { let r = limbs_less_than_limbs(a, b)?; Ok(r.leak()) } #[inline] fn limb_is_zero(limb: Limb) -> LimbMask { prefixed_extern! { fn LIMB_is_zero(limb: Limb) -> LimbMask; } unsafe { LIMB_is_zero(limb) } } #[inline] pub fn limbs_are_zero(limbs: &[Limb]) -> LimbMask { limb_is_zero(limbs.iter().fold(0, |a, b| a | b)) } /// Leaks one bit of information (other than the lengths of the inputs): /// Whether the given limbs are even. #[cfg(any(test, feature = "alloc"))] #[inline] pub fn limbs_reject_even_leak_bit(limbs: &[Limb]) -> Result<(), error::Unspecified> { let bottom = *limbs.first().ok_or(error::Unspecified)?; if limb_is_zero(bottom & 1).leak() { return Err(error::Unspecified); } Ok(()) } #[cfg(any(test, feature = "alloc"))] #[inline] pub fn verify_limbs_equal_1_leak_bit(a: &[Limb]) -> Result<(), error::Unspecified> { if let [bottom, ref rest @ ..] = *a { let equal = limb_is_zero(bottom ^ 1) & limbs_are_zero(rest); if equal.leak() { return Ok(()); } } Err(error::Unspecified) } /// Returns the number of bits in `a`. // // This strives to be constant-time with respect to the values of all bits // except the most significant bit. This does not attempt to be constant-time // with respect to `a.len()` or the value of the result or the value of the // most significant bit (It's 1, unless the input is zero, in which case it's // zero.) #[cfg(any(test, feature = "alloc"))] pub fn limbs_minimal_bits(a: &[Limb]) -> bits::BitLength { for num_limbs in (1..=a.len()).rev() { let high_limb = a[num_limbs - 1]; // Find the number of set bits in |high_limb| by a linear scan from the // most significant bit to the least significant bit. This works great // for the most common inputs because usually the most significant bit // it set. 
for high_limb_num_bits in (1..=LIMB_BITS).rev() { let shifted = unsafe { LIMB_shr(high_limb, high_limb_num_bits - 1) }; if shifted != 0 { return bits::BitLength::from_bits( ((num_limbs - 1) * LIMB_BITS) + high_limb_num_bits, ); } } } // No bits were set. bits::BitLength::from_bits(0) } /// Equivalent to `if (r >= m) { r -= m; }` #[inline] pub fn limbs_reduce_once(r: &mut [Limb], m: &[Limb]) -> Result<(), LenMismatchError> { prefixed_extern! { fn LIMBS_reduce_once(r: *mut Limb, m: *const Limb, num_limbs: c::NonZero_size_t); } let num_limbs = NonZeroUsize::new(r.len()).ok_or_else(|| LenMismatchError::new(m.len()))?; let r = r.as_mut_ptr(); // Non-dangling because num_limbs is non-zero. let m = m.as_ptr(); // Non-dangling because num_limbs is non-zero. unsafe { LIMBS_reduce_once(r, m, num_limbs) }; Ok(()) } #[derive(Clone, Copy, PartialEq)] pub enum AllowZero { No, Yes, } /// Parses `input` into `result`, verifies that the value is less than /// `max_exclusive`, and pads `result` with zeros to its length. If `allow_zero` /// is not `AllowZero::Yes`, zero values are rejected. /// /// This attempts to be constant-time with respect to the actual value *only if* /// the value is actually in range. In other words, this won't leak anything /// about a valid value, but it might leak small amounts of information about an /// invalid value (which constraint it failed). pub fn parse_big_endian_in_range_and_pad_consttime( input: untrusted::Input, allow_zero: AllowZero, max_exclusive: &[Limb], result: &mut [Limb], ) -> Result<(), error::Unspecified> { parse_big_endian_and_pad_consttime(input, result)?; verify_limbs_less_than_limbs_leak_bit(result, max_exclusive)?; if allow_zero != AllowZero::Yes { if limbs_are_zero(result).leak() { return Err(error::Unspecified); } } Ok(()) } /// Parses `input` into `result`, padding `result` with zeros to its length. /// This attempts to be constant-time with respect to the value but not with /// respect to the length; it is assumed that the length is public knowledge. pub fn parse_big_endian_and_pad_consttime( input: untrusted::Input, result: &mut [Limb], ) -> Result<(), error::Unspecified> { if input.is_empty() { return Err(error::Unspecified); } let input_limbs = input.as_slice_less_safe().rchunks(LIMB_BYTES).map(|chunk| { let mut padded = [0; LIMB_BYTES]; sliceutil::overwrite_at_start(&mut padded[(LIMB_BYTES - chunk.len())..], chunk); Limb::from_be_bytes(padded) }); if input_limbs.len() > result.len() { return Err(error::Unspecified); } result .iter_mut() .zip(input_limbs.chain(iter::repeat(0))) .for_each(|(r, i)| *r = i); Ok(()) } pub fn big_endian_from_limbs(limbs: &[Limb], out: &mut [u8]) { let be_bytes = unstripped_be_bytes(limbs); assert_eq!(out.len(), be_bytes.len()); out.iter_mut().zip(be_bytes).for_each(|(o, i)| { *o = i; }); } /// Returns an iterator of the big-endian encoding of `limbs`. /// /// The number of bytes returned will be a multiple of `LIMB_BYTES` /// and thus may be padded with leading zeros. pub fn unstripped_be_bytes(limbs: &[Limb]) -> impl ExactSizeIterator + Clone + '_ { // The unwrap is safe because a slice can never be larger than `usize` bytes. ArrayFlatMap::new(limbs.iter().rev().copied(), Limb::to_be_bytes).unwrap() } // Used in FFI pub type Window = bb::Word; // Used in FFI pub type LeakyWindow = bb::LeakyWord; /// Processes `limbs` as a sequence of 5-bit windows, folding the windows from /// most significant to least significant and returning the accumulated result. 
/// The first window will be mapped by `init` to produce the initial value for /// the accumulator. Then `f` will be called to fold the accumulator and the /// next window until all windows are processed. When the input's bit length /// isn't divisible by 5, the window passed to `init` will be partial; all /// windows passed to `fold` will be full. /// /// This is designed to avoid leaking the contents of `limbs` through side /// channels as long as `init` and `fold` are side-channel free. /// /// Panics if `limbs` is empty. #[cfg(feature = "alloc")] pub fn fold_5_bit_windows R, F: Fn(R, Window) -> R>( limbs: &[Limb], init: I, fold: F, ) -> R { #[derive(Clone, Copy)] #[repr(transparent)] struct BitIndex(Wrapping); const WINDOW_BITS: Wrapping = Wrapping(5); prefixed_extern! { fn LIMBS_window5_split_window( lower_limb: Limb, higher_limb: Limb, index_within_word: BitIndex, ) -> Window; fn LIMBS_window5_unsplit_window(limb: Limb, index_within_word: BitIndex) -> Window; } let num_limbs = limbs.len(); let mut window_low_bit = { let num_whole_windows = (num_limbs * LIMB_BITS) / 5; let mut leading_bits = (num_limbs * LIMB_BITS) - (num_whole_windows * 5); if leading_bits == 0 { leading_bits = WINDOW_BITS.0; } BitIndex(Wrapping(LIMB_BITS - leading_bits)) }; let initial_value = { let leading_partial_window = unsafe { LIMBS_window5_split_window(*limbs.first().unwrap(), 0, window_low_bit) }; window_low_bit.0 -= WINDOW_BITS; init(leading_partial_window) }; let mut low_limb = Limb::from(0 as LeakyWindow); limbs.iter().fold(initial_value, |mut acc, current_limb| { let higher_limb = low_limb; low_limb = *current_limb; if window_low_bit.0 > Wrapping(LIMB_BITS) - WINDOW_BITS { let window = unsafe { LIMBS_window5_split_window(low_limb, higher_limb, window_low_bit) }; window_low_bit.0 -= WINDOW_BITS; acc = fold(acc, window); }; while window_low_bit.0 < Wrapping(LIMB_BITS) { let window = unsafe { LIMBS_window5_unsplit_window(low_limb, window_low_bit) }; // The loop exits when this subtraction underflows, causing `window_low_bit` to // wrap around to a very large value. window_low_bit.0 -= WINDOW_BITS; acc = fold(acc, window); } window_low_bit.0 += Wrapping(LIMB_BITS); // "Fix" the underflow. acc }) } #[inline] pub(crate) fn limbs_add_assign_mod( a: &mut [Limb], b: &[Limb], m: &[Limb], ) -> Result<(), LenMismatchError> { prefixed_extern! { // `r` and `a` may alias. fn LIMBS_add_mod( r: *mut Limb, a: *const Limb, b: *const Limb, m: *const Limb, num_limbs: c::NonZero_size_t, ); } let num_limbs = NonZeroUsize::new(m.len()).ok_or_else(|| LenMismatchError::new(m.len()))?; (a, b).with_non_dangling_non_null_pointers_rab(num_limbs, |r, a, b| { let m = m.as_ptr(); // Also non-dangling because `num_limbs` is non-zero. unsafe { LIMBS_add_mod(r, a, b, m, num_limbs) } }) } // r *= 2 (mod m). pub(crate) fn limbs_double_mod(r: &mut [Limb], m: &[Limb]) -> Result<(), LenMismatchError> { prefixed_extern! { // `r` and `a` may alias. fn LIMBS_shl_mod( r: *mut Limb, a: *const Limb, m: *const Limb, num_limbs: c::NonZero_size_t); } let num_limbs = NonZeroUsize::new(m.len()).ok_or_else(|| LenMismatchError::new(m.len()))?; r.with_non_dangling_non_null_pointers_ra(num_limbs, |r, a| { let m = m.as_ptr(); // Also non-dangling because num_limbs > 0. unsafe { LIMBS_shl_mod(r, a, m, num_limbs); } }) } // *r = -a, assuming a is odd. pub(crate) fn limbs_negative_odd(r: &mut [Limb], a: &[Limb]) { debug_assert_eq!(r.len(), a.len()); // Two's complement step 1: flip all the bits. // The compiler should optimize this to vectorized (a ^ !0). 
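// Illustrative 4-bit example (comment not in the upstream source): for
// a = 0b0101 (5, odd), flipping gives 0b1010 and adding one gives 0b1011,
// which is -5 modulo 16. Because `a` is odd, the flipped value is even, so
// the "+ 1" in step 2 below can be done with a bitwise OR of the lowest limb.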
r.iter_mut().zip(a.iter()).for_each(|(r, &a)| { *r = !a; }); // Two's complement step 2: Add one. Since `a` is odd, `r` is even. Thus we // can use a bitwise or for addition. r[0] |= 1; } #[cfg(any(test, feature = "alloc"))] prefixed_extern! { fn LIMB_shr(a: Limb, shift: c::size_t) -> Limb; } #[allow(clippy::useless_conversion)] #[cfg(test)] mod tests { use super::*; use alloc::vec::Vec; use cfg_if::cfg_if; const MAX: LeakyLimb = LeakyLimb::MAX; fn leak_in_test(a: LimbMask) -> bool { a.leak() } #[test] fn test_limbs_are_even() { static EVENS: &[&[LeakyLimb]] = &[ &[], &[0], &[2], &[0, 0], &[2, 0], &[0, 1], &[0, 2], &[0, 3], &[0, 0, 0, 0, MAX], ]; for even in EVENS { let even = &Vec::from_iter(even.iter().copied().map(Limb::from)); assert!(matches!( limbs_reject_even_leak_bit(even), Err(error::Unspecified) )); } static ODDS: &[&[LeakyLimb]] = &[ &[1], &[3], &[1, 0], &[3, 0], &[1, 1], &[1, 2], &[1, 3], &[1, 0, 0, 0, MAX], ]; for odd in ODDS { let odd = &Vec::from_iter(odd.iter().copied().map(Limb::from)); assert!(matches!(limbs_reject_even_leak_bit(odd), Ok(()))); } } static ZEROES: &[&[LeakyLimb]] = &[ &[], &[0], &[0, 0], &[0, 0, 0], &[0, 0, 0, 0], &[0, 0, 0, 0, 0], &[0, 0, 0, 0, 0, 0, 0], &[0, 0, 0, 0, 0, 0, 0, 0], &[0, 0, 0, 0, 0, 0, 0, 0, 0], ]; static NONZEROES: &[&[LeakyLimb]] = &[ &[1], &[0, 1], &[1, 1], &[1, 0, 0, 0], &[0, 1, 0, 0], &[0, 0, 1, 0], &[0, 0, 0, 1], ]; #[test] fn test_limbs_are_zero() { for zero in ZEROES { let zero = &Vec::from_iter(zero.iter().copied().map(Limb::from)); assert!(leak_in_test(limbs_are_zero(zero))); } for nonzero in NONZEROES { let nonzero = &Vec::from_iter(nonzero.iter().copied().map(Limb::from)); assert!(!leak_in_test(limbs_are_zero(nonzero))); } } #[test] fn test_limbs_equal_limb() { // Equal static EQUAL: &[&[LeakyLimb]] = &[&[1], &[1, 0], &[1, 0, 0], &[1, 0, 0, 0, 0, 0, 0]]; for a in EQUAL { let a = &Vec::from_iter(a.iter().copied().map(Limb::from)); assert!(matches!(verify_limbs_equal_1_leak_bit(a), Ok(()))); } // Unequal static UNEQUAL: &[&[LeakyLimb]] = &[ &[0], &[2], &[3], &[MAX], &[0, 1], &[1, 1], &[0, 0, 0, 0, 0, 0, 0, 1], &[0, 0, 0, 0, 1, 0, 0, 0], &[0, 0, 0, 0, 1, 0, 0, 1], &[MAX, 1], ]; for a in UNEQUAL { let a = &Vec::from_iter(a.iter().copied().map(Limb::from)); assert!(matches!( verify_limbs_equal_1_leak_bit(a), Err(error::Unspecified) )); } } #[test] fn test_parse_big_endian_and_pad_consttime() { const LIMBS: usize = 4; { // Empty input. let inp = untrusted::Input::from(&[]); let mut result = [0; LIMBS].map(From::::from); assert!(parse_big_endian_and_pad_consttime(inp, &mut result).is_err()); } // The input is longer than will fit in the given number of limbs. { let inp = [1, 2, 3, 4, 5, 6, 7, 8, 9]; let inp = untrusted::Input::from(&inp); let mut result = [0; 8 / LIMB_BYTES].map(From::::from); assert!(parse_big_endian_and_pad_consttime(inp, &mut result[..]).is_err()); } // Less than a full limb. { let inp = [0xfe]; let inp = untrusted::Input::from(&inp); let mut result = [0; LIMBS].map(From::::from); assert_eq!( Ok(()), parse_big_endian_and_pad_consttime(inp, &mut result[..]) ); assert_eq!(&[0xfe, 0, 0, 0], &result); } // A whole limb for 32-bit, half a limb for 64-bit. { let inp = [0xbe, 0xef, 0xf0, 0x0d]; let inp = untrusted::Input::from(&inp); let mut result = [0; LIMBS].map(From::::from); assert_eq!(Ok(()), parse_big_endian_and_pad_consttime(inp, &mut result)); assert_eq!(&[0xbeeff00d, 0, 0, 0], &result); } cfg_if! 
{ if #[cfg(target_pointer_width = "64")] { static TEST_CASES: &[(&[u8], &[Limb])] = &[ (&[1], &[1, 0]), (&[1, 2], &[0x102, 0]), (&[1, 2, 3], &[0x10203, 0]), (&[1, 2, 3, 4], &[0x102_0304, 0]), (&[1, 2, 3, 4, 5], &[0x1_0203_0405, 0]), (&[1, 2, 3, 4, 5, 6], &[0x102_0304_0506, 0]), (&[1, 2, 3, 4, 5, 6, 7], &[0x1_0203_0405_0607, 0]), (&[1, 2, 3, 4, 5, 6, 7, 8], &[0x102_0304_0506_0708, 0]), (&[1, 2, 3, 4, 5, 6, 7, 8, 9], &[0x0203_0405_0607_0809, 0x1]), (&[1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa], &[0x0304_0506_0708_090a, 0x102]), (&[1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb], &[0x0405_0607_0809_0a0b, 0x1_0203]), ]; for (be_bytes, limbs) in TEST_CASES { let mut buf = [0; 2]; parse_big_endian_and_pad_consttime(untrusted::Input::from(be_bytes), &mut buf) .unwrap(); assert_eq!(limbs, &buf, "({be_bytes:x?}, {limbs:x?}"); } } else if #[cfg(target_pointer_width = "32")] { static TEST_CASES: &[(&[u8], &[Limb])] = &[ (&[1], &[1, 0, 0]), (&[1, 2], &[0x102, 0, 0]), (&[1, 2, 3], &[0x10203, 0, 0]), (&[1, 2, 3, 4], &[0x102_0304, 0, 0]), (&[1, 2, 3, 4, 5], &[0x0203_0405, 0x1, 0]), (&[1, 2, 3, 4, 5, 6], &[0x0304_0506, 0x102, 0]), (&[1, 2, 3, 4, 5, 6, 7], &[0x0405_0607, 0x1_0203, 0]), (&[1, 2, 3, 4, 5, 6, 7, 8], &[0x0506_0708, 0x102_0304, 0]), (&[1, 2, 3, 4, 5, 6, 7, 8, 9], &[0x0607_0809, 0x0203_0405, 0x1]), (&[1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa], &[0x0708_090a, 0x0304_0506, 0x102]), (&[1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb], &[0x0809_0a0b, 0x0405_0607, 0x1_0203]), ]; for (be_bytes, limbs) in TEST_CASES { let mut buf = [0; 3]; parse_big_endian_and_pad_consttime(untrusted::Input::from(be_bytes), &mut buf) .unwrap(); assert_eq!(limbs, &buf, "({be_bytes:x?}, {limbs:x?}"); } } else { panic!("Unsupported target_pointer_width"); } // XXX: This is a weak set of tests. TODO: expand it. } } #[test] fn test_big_endian_from_limbs_same_length() { #[cfg(target_pointer_width = "32")] let limbs = [ 0xbccddeef, 0x89900aab, 0x45566778, 0x01122334, 0xddeeff00, 0x99aabbcc, 0x55667788, 0x11223344, ]; #[cfg(target_pointer_width = "64")] let limbs = [ 0x8990_0aab_bccd_deef, 0x0112_2334_4556_6778, 0x99aa_bbcc_ddee_ff00, 0x1122_3344_5566_7788, ]; let limbs = limbs.map(From::::from); let expected = [ 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, 0x00, 0x01, 0x12, 0x23, 0x34, 0x45, 0x56, 0x67, 0x78, 0x89, 0x90, 0x0a, 0xab, 0xbc, 0xcd, 0xde, 0xef, ]; let mut out = [0xabu8; 32]; big_endian_from_limbs(&limbs[..], &mut out); assert_eq!(&out[..], &expected[..]); } #[should_panic] #[test] fn test_big_endian_from_limbs_fewer_limbs() { #[cfg(target_pointer_width = "32")] // Two fewer limbs. let limbs = [ 0xbccddeef, 0x89900aab, 0x45566778, 0x01122334, 0xddeeff00, 0x99aabbcc, ]; // One fewer limb. 
#[cfg(target_pointer_width = "64")] let limbs = [ 0x8990_0aab_bccd_deef, 0x0112_2334_4556_6778, 0x99aa_bbcc_ddee_ff00, ]; let limbs = limbs.map(From::::from); let mut out = [0xabu8; 32]; big_endian_from_limbs(&limbs[..], &mut out); } #[test] fn test_limbs_minimal_bits() { const ALL_ONES: LeakyLimb = LeakyLimb::MAX; static CASES: &[(&[LeakyLimb], usize)] = &[ (&[], 0), (&[0], 0), (&[ALL_ONES], LIMB_BITS), (&[ALL_ONES, 0], LIMB_BITS), (&[ALL_ONES, 1], LIMB_BITS + 1), (&[0, 0], 0), (&[1, 0], 1), (&[0, 1], LIMB_BITS + 1), (&[0, ALL_ONES], 2 * LIMB_BITS), (&[ALL_ONES, ALL_ONES], 2 * LIMB_BITS), (&[ALL_ONES, ALL_ONES >> 1], 2 * LIMB_BITS - 1), (&[ALL_ONES, 0b100_0000], LIMB_BITS + 7), (&[ALL_ONES, 0b101_0000], LIMB_BITS + 7), (&[ALL_ONES, ALL_ONES >> 1], LIMB_BITS + (LIMB_BITS) - 1), ]; for (limbs, bits) in CASES { let limbs = &Vec::from_iter(limbs.iter().copied().map(Limb::from)); assert_eq!(limbs_minimal_bits(limbs).as_bits(), *bits); } } } ring-0.17.14/src/pbkdf2.rs000064400000000000000000000276641046102023000132610ustar 00000000000000// Copyright 2015 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! PBKDF2 derivation and verification. //! //! Use `derive` to derive PBKDF2 outputs. Use `verify` to verify secret //! against previously-derived outputs. //! //! PBKDF2 is specified in [RFC 2898 Section 5.2] with test vectors given in //! [RFC 6070]. See also [NIST Special Publication 800-132]. //! //! [RFC 2898 Section 5.2]: https://tools.ietf.org/html/rfc2898#section-5.2 //! [RFC 6070]: https://tools.ietf.org/html/rfc6070 //! [NIST Special Publication 800-132]: //! http://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-132.pdf //! //! # Examples //! //! ## Password Database Example //! //! ``` //! use ring::{digest, pbkdf2}; //! use std::{collections::HashMap, num::NonZeroU32}; //! //! static PBKDF2_ALG: pbkdf2::Algorithm = pbkdf2::PBKDF2_HMAC_SHA256; //! const CREDENTIAL_LEN: usize = digest::SHA256_OUTPUT_LEN; //! pub type Credential = [u8; CREDENTIAL_LEN]; //! //! enum Error { //! WrongUsernameOrPassword //! } //! //! struct PasswordDatabase { //! pbkdf2_iterations: NonZeroU32, //! db_salt_component: [u8; 16], //! //! // Normally this would be a persistent database. //! storage: HashMap, //! } //! //! impl PasswordDatabase { //! pub fn store_password(&mut self, username: &str, password: &str) { //! let salt = self.salt(username); //! let mut to_store: Credential = [0u8; CREDENTIAL_LEN]; //! pbkdf2::derive(PBKDF2_ALG, self.pbkdf2_iterations, &salt, //! password.as_bytes(), &mut to_store); //! self.storage.insert(String::from(username), to_store); //! } //! //! pub fn verify_password(&self, username: &str, attempted_password: &str) //! -> Result<(), Error> { //! match self.storage.get(username) { //! Some(actual_password) => { //! let salt = self.salt(username); //! 
pbkdf2::verify(PBKDF2_ALG, self.pbkdf2_iterations, &salt, //! attempted_password.as_bytes(), //! actual_password) //! .map_err(|_| Error::WrongUsernameOrPassword) //! }, //! //! None => Err(Error::WrongUsernameOrPassword) //! } //! } //! //! // The salt should have a user-specific component so that an attacker //! // cannot crack one password for multiple users in the database. It //! // should have a database-unique component so that an attacker cannot //! // crack the same user's password across databases in the unfortunate //! // but common case that the user has used the same password for //! // multiple systems. //! fn salt(&self, username: &str) -> Vec { //! let mut salt = Vec::with_capacity(self.db_salt_component.len() + //! username.as_bytes().len()); //! salt.extend(self.db_salt_component.as_ref()); //! salt.extend(username.as_bytes()); //! salt //! } //! } //! //! fn main() { //! // Normally these parameters would be loaded from a configuration file. //! let mut db = PasswordDatabase { //! pbkdf2_iterations: NonZeroU32::new(100_000).unwrap(), //! db_salt_component: [ //! // This value was generated from a secure PRNG. //! 0xd6, 0x26, 0x98, 0xda, 0xf4, 0xdc, 0x50, 0x52, //! 0x24, 0xf2, 0x27, 0xd1, 0xfe, 0x39, 0x01, 0x8a //! ], //! storage: HashMap::new(), //! }; //! //! db.store_password("alice", "@74d7]404j|W}6u"); //! //! // An attempt to log in with the wrong password fails. //! assert!(db.verify_password("alice", "wrong password").is_err()); //! //! // Normally there should be an expoentially-increasing delay between //! // attempts to further protect against online attacks. //! //! // An attempt to log in with the right password succeeds. //! assert!(db.verify_password("alice", "@74d7]404j|W}6u").is_ok()); //! } use self::{derive_error::DeriveError, verify_error::VerifyError}; use crate::{ bb, cpu, digest, error::{self, TooMuchOutputRequestedError}, hmac::{self, InputTooLongError}, }; use core::num::NonZeroU32; /// A PBKDF2 algorithm. #[derive(Clone, Copy, PartialEq, Eq)] pub struct Algorithm(hmac::Algorithm); /// PBKDF2 using HMAC-SHA1. pub static PBKDF2_HMAC_SHA1: Algorithm = Algorithm(hmac::HMAC_SHA1_FOR_LEGACY_USE_ONLY); /// PBKDF2 using HMAC-SHA256. pub static PBKDF2_HMAC_SHA256: Algorithm = Algorithm(hmac::HMAC_SHA256); /// PBKDF2 using HMAC-SHA384. pub static PBKDF2_HMAC_SHA384: Algorithm = Algorithm(hmac::HMAC_SHA384); /// PBKDF2 using HMAC-SHA512. pub static PBKDF2_HMAC_SHA512: Algorithm = Algorithm(hmac::HMAC_SHA512); /// Fills `out` with the key derived using PBKDF2 with the given inputs. /// /// Do not use `derive` as part of verifying a secret; use `verify` instead, to /// minimize the effectiveness of timing attacks. /// /// `out.len()` must be no larger than the digest length * (2**32 - 1), per the /// PBKDF2 specification. /// /// | Parameter | RFC 2898 Section 5.2 Term /// |-------------|------------------------------------------- /// | digest_alg | PRF (HMAC with the given digest algorithm) /// | iterations | c (iteration count) /// | salt | S (salt) /// | secret | P (password) /// | out | dk (derived key) /// | out.len() | dkLen (derived key length) /// /// # Panics /// /// Panics if `out.len() > u32::MAX * digest_alg.output_len()`, where /// `digest_alg` is the underlying HMAC/digest algorithm. /// /// Panics if `salt` is so astronomically gigantic that it isn't a valid input /// to the underlying digest function. /// /// Panics if `secret` is so astronomically gigantic that it isn't a valid /// input to the underlying digest function. 
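///
/// # Example
///
/// A minimal sketch (this example is not part of the upstream documentation)
/// of deriving a 32-byte key and then checking a password against it:
///
/// ```
/// use core::num::NonZeroU32;
/// use ring::pbkdf2;
///
/// let iterations = NonZeroU32::new(100_000).unwrap();
/// let salt = [0u8; 16]; // In practice, use a unique, randomly generated salt.
///
/// let mut derived = [0u8; 32];
/// pbkdf2::derive(pbkdf2::PBKDF2_HMAC_SHA256, iterations, &salt, b"password", &mut derived);
///
/// // Use `verify`, not `derive` plus a comparison, when checking a password.
/// assert!(pbkdf2::verify(pbkdf2::PBKDF2_HMAC_SHA256, iterations, &salt, b"password", &derived).is_ok());
/// assert!(pbkdf2::verify(pbkdf2::PBKDF2_HMAC_SHA256, iterations, &salt, b"wrong password", &derived).is_err());
/// ```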
pub fn derive( algorithm: Algorithm, iterations: NonZeroU32, salt: &[u8], secret: &[u8], out: &mut [u8], ) { let cpu = cpu::features(); try_derive(algorithm, iterations, salt, secret, out, cpu) .map_err(error::erase::) .unwrap() } fn try_derive( algorithm: Algorithm, iterations: NonZeroU32, salt: &[u8], secret: &[u8], out: &mut [u8], cpu: cpu::Features, ) -> Result<(), DeriveError> { let digest_alg = algorithm.0.digest_algorithm(); let output_len = digest_alg.output_len(); // This implementation's performance is asymptotically optimal as described // in https://jbp.io/2015/08/11/pbkdf2-performance-matters/. However, it // hasn't been optimized to the same extent as fastpbkdf2. In particular, // this implementation is probably doing a lot of unnecessary copying. let secret = hmac::Key::try_new(algorithm.0, secret, cpu).map_err(DeriveError::secret_too_long)?; // Clear |out|. out.fill(0); let mut idx: u32 = 0; let out_len = out.len(); for chunk in out.chunks_mut(output_len) { idx = idx.checked_add(1).ok_or_else(|| { DeriveError::too_much_output_requested(TooMuchOutputRequestedError::new(out_len)) })?; // If the salt is too long, then we'll detect this on the first // iteration before we've written any output. derive_block(&secret, iterations, salt, idx, chunk, cpu) .map_err(DeriveError::salt_too_long)?; } Ok(()) } fn derive_block( secret: &hmac::Key, iterations: NonZeroU32, salt: &[u8], idx: u32, out: &mut [u8], cpu: cpu::Features, ) -> Result<(), InputTooLongError> { let mut ctx = hmac::Context::with_key(secret); ctx.update(salt); ctx.update(&u32::to_be_bytes(idx)); let mut u = ctx.try_sign(cpu)?; let mut remaining: u32 = iterations.into(); loop { bb::xor_assign_at_start(&mut out[..], u.as_ref()); if remaining == 1 { break; } remaining -= 1; // This will not fail, because the output of HMAC is never too long to // be an input for the same algorithm, but we can't prove that with // only locally-available information. u = secret.sign(u.as_ref(), cpu)? } Ok(()) } cold_exhaustive_error! { enum derive_error::DeriveError { secret_too_long => SecretTooLong(InputTooLongError), salt_too_long => SaltTooLong(InputTooLongError), too_much_output_requested => TooMuchOutputRequested(TooMuchOutputRequestedError), } } cold_exhaustive_error! { enum verify_error::VerifyError { mismatch => Mismatch(()), secret_too_long => SecretTooLong(InputTooLongError), salt_too_long => SaltTooLong(InputTooLongError), previously_derived_empty => PreviouslyDerivedEmpty(usize), } } /// Verifies that a previously-derived (e.g., using `derive`) PBKDF2 value /// matches the PBKDF2 value derived from the other inputs. /// /// The comparison is done in constant time to prevent timing attacks. The /// comparison will fail if `previously_derived` is empty (has a length of /// zero). /// /// | Parameter | RFC 2898 Section 5.2 Term /// |----------------------------|-------------------------------------------- /// | digest_alg | PRF (HMAC with the given digest algorithm). 
/// | `iterations` | c (iteration count) /// | `salt` | S (salt) /// | `secret` | P (password) /// | `previously_derived` | dk (derived key) /// | `previously_derived.len()` | dkLen (derived key length) pub fn verify( algorithm: Algorithm, iterations: NonZeroU32, salt: &[u8], secret: &[u8], previously_derived: &[u8], ) -> Result<(), error::Unspecified> { let cpu = cpu::features(); try_verify(algorithm, iterations, salt, secret, previously_derived, cpu) .map_err(error::erase::) } fn try_verify( algorithm: Algorithm, iterations: NonZeroU32, salt: &[u8], secret: &[u8], previously_derived: &[u8], cpu: cpu::Features, ) -> Result<(), VerifyError> { let digest_alg = algorithm.0.digest_algorithm(); if previously_derived.is_empty() { return Err(VerifyError::previously_derived_empty(0)); } let mut derived_buf = [0u8; digest::MAX_OUTPUT_LEN]; let output_len = digest_alg.output_len(); let secret = hmac::Key::try_new(algorithm.0, secret, cpu).map_err(VerifyError::secret_too_long)?; let mut idx: u32 = 0; let mut matches = 1; for previously_derived_chunk in previously_derived.chunks(output_len) { idx = idx.checked_add(1).ok_or_else(|| { // `previously_derived` is so gigantic that PBKDF2 couldn't // have been used to compute it. VerifyError::mismatch(()) })?; let derived_chunk = &mut derived_buf[..previously_derived_chunk.len()]; derived_chunk.fill(0); derive_block(&secret, iterations, salt, idx, derived_chunk, cpu) .map_err(VerifyError::salt_too_long)?; // XXX: This isn't fully constant-time-safe. TODO: Fix that. #[allow(clippy::bool_to_int_with_if)] let current_block_matches = if bb::verify_slices_are_equal(derived_chunk, previously_derived_chunk).is_ok() { 1 } else { 0 }; matches &= current_block_matches; } if matches == 0 { return Err(VerifyError::mismatch(())); } Ok(()) } ring-0.17.14/src/pkcs8.rs000064400000000000000000000174321046102023000131310ustar 00000000000000// Copyright 2017 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! PKCS#8 is specified in [RFC 5958]. //! //! [RFC 5958]: https://tools.ietf.org/html/rfc5958 use crate::{ec, error, io::der}; pub(crate) struct PublicKeyOptions { /// Should the wrong public key ASN.1 tagging used by early implementations /// of PKCS#8 v2 (including earlier versions of *ring*) be accepted? pub accept_legacy_ed25519_public_key_tag: bool, } pub(crate) enum Version { V1Only, V1OrV2(PublicKeyOptions), V2Only(PublicKeyOptions), } /// A template for constructing PKCS#8 documents. /// /// Note that this only works for ECC. pub(crate) struct Template { pub bytes: &'static [u8], // The range within `bytes` that holds the value (not including the tag and // length) for use in the PKCS#8 document's privateKeyAlgorithm field. pub alg_id_range: core::ops::Range, // `bytes[alg_id_range][curve_id_index..]` contains the OID identifying the, // curve, including the tag and length. 
pub curve_id_index: usize, // `bytes` will be split into two parts at `private_key_index`, where the // first part is written before the private key and the second part is // written after the private key. The public key is written after the second // part. pub private_key_index: usize, } impl Template { #[inline] fn alg_id_value(&self) -> untrusted::Input { untrusted::Input::from(self.alg_id_value_()) } fn alg_id_value_(&self) -> &[u8] { &self.bytes[self.alg_id_range.start..self.alg_id_range.end] } #[inline] pub fn curve_oid(&self) -> untrusted::Input { untrusted::Input::from(&self.alg_id_value_()[self.curve_id_index..]) } } /// Parses an unencrypted PKCS#8 private key, verifies that it is the right type /// of key, and returns the key value. /// /// PKCS#8 is specified in [RFC 5958]. /// /// [RFC 5958]: https://tools.ietf.org/html/rfc5958 pub(crate) fn unwrap_key<'a>( template: &Template, version: Version, input: untrusted::Input<'a>, ) -> Result<(untrusted::Input<'a>, Option>), error::KeyRejected> { unwrap_key_(template.alg_id_value(), version, input) } /// Parses an unencrypted PKCS#8 private key, verifies that it is the right type /// of key, and returns the key value. /// /// `alg_id` must be the encoded value (not including the outermost `SEQUENCE` /// tag and length) of the `AlgorithmIdentifier` that identifies the key type. /// The result will be an encoded `RSAPrivateKey` or `ECPrivateKey` or similar. /// /// PKCS#8 is specified in [RFC 5958]. /// /// [RFC 5958]: https://tools.ietf.org/html/rfc5958 pub(crate) fn unwrap_key_<'a>( alg_id: untrusted::Input, version: Version, input: untrusted::Input<'a>, ) -> Result<(untrusted::Input<'a>, Option>), error::KeyRejected> { input.read_all(error::KeyRejected::invalid_encoding(), |input| { der::nested( input, der::Tag::Sequence, error::KeyRejected::invalid_encoding(), |input| unwrap_key__(alg_id, version, input), ) }) } fn unwrap_key__<'a>( alg_id: untrusted::Input, version: Version, input: &mut untrusted::Reader<'a>, ) -> Result<(untrusted::Input<'a>, Option>), error::KeyRejected> { let actual_version = der::small_nonnegative_integer(input) .map_err(|error::Unspecified| error::KeyRejected::invalid_encoding())?; // Do things in a specific order to return more useful errors: // 1. Check for completely unsupported version. // 2. Check for algorithm mismatch. // 3. Check for algorithm-specific version mismatch. if actual_version > 1 { return Err(error::KeyRejected::version_not_supported()); }; let actual_alg_id = der::expect_tag_and_get_value(input, der::Tag::Sequence) .map_err(|error::Unspecified| error::KeyRejected::invalid_encoding())?; if actual_alg_id.as_slice_less_safe() != alg_id.as_slice_less_safe() { return Err(error::KeyRejected::wrong_algorithm()); } let public_key_options = match (actual_version, version) { (0, Version::V1Only) => None, (0, Version::V1OrV2(_)) => None, (1, Version::V1OrV2(options)) | (1, Version::V2Only(options)) => Some(options), _ => { return Err(error::KeyRejected::version_not_supported()); } }; let private_key = der::expect_tag_and_get_value(input, der::Tag::OctetString) .map_err(|error::Unspecified| error::KeyRejected::invalid_encoding())?; // Ignore any attributes that are present. 
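// For reference (per RFC 5958; this comment is not in the upstream source),
// the structure being parsed is approximately:
//
//     OneAsymmetricKey ::= SEQUENCE {
//         version                   Version,
//         privateKeyAlgorithm       PrivateKeyAlgorithmIdentifier,
//         privateKey                PrivateKey,
//         attributes            [0] Attributes OPTIONAL,
//         ...,
//         [[2: publicKey        [1] PublicKey OPTIONAL ]],
//         ...
//     }
//
// so the optional attributes carry context-specific tag [0] and the optional
// public key (v2 only) carries context-specific tag [1].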
if input.peek(der::Tag::ContextSpecificConstructed0.into()) { let _ = der::expect_tag_and_get_value(input, der::Tag::ContextSpecificConstructed0) .map_err(|error::Unspecified| error::KeyRejected::invalid_encoding())?; } let public_key = if let Some(options) = public_key_options { if input.at_end() { return Err(error::KeyRejected::public_key_is_missing()); } const INCORRECT_LEGACY: der::Tag = der::Tag::ContextSpecificConstructed1; let result = if options.accept_legacy_ed25519_public_key_tag && input.peek(INCORRECT_LEGACY.into()) { der::nested( input, INCORRECT_LEGACY, error::Unspecified, der::bit_string_with_no_unused_bits, ) } else { der::bit_string_tagged_with_no_unused_bits(der::Tag::ContextSpecific1, input) }; let public_key = result.map_err(|error::Unspecified| error::KeyRejected::invalid_encoding())?; Some(public_key) } else { None }; Ok((private_key, public_key)) } /// A generated PKCS#8 document. pub struct Document { bytes: [u8; ec::PKCS8_DOCUMENT_MAX_LEN], len: usize, } impl AsRef<[u8]> for Document { #[inline] fn as_ref(&self) -> &[u8] { &self.bytes[..self.len] } } pub(crate) fn wrap_key(template: &Template, private_key: &[u8], public_key: &[u8]) -> Document { let mut result = Document { bytes: [0; ec::PKCS8_DOCUMENT_MAX_LEN], len: template.bytes.len() + private_key.len() + public_key.len(), }; wrap_key_( template, private_key, public_key, &mut result.bytes[..result.len], ); result } /// Formats a private key "prefix||private_key||middle||public_key" where /// `template` is "prefix||middle" split at position `private_key_index`. fn wrap_key_(template: &Template, private_key: &[u8], public_key: &[u8], bytes: &mut [u8]) { let (before_private_key, after_private_key) = template.bytes.split_at(template.private_key_index); let private_key_end_index = template.private_key_index + private_key.len(); bytes[..template.private_key_index].copy_from_slice(before_private_key); bytes[template.private_key_index..private_key_end_index].copy_from_slice(private_key); bytes[private_key_end_index..(private_key_end_index + after_private_key.len())] .copy_from_slice(after_private_key); bytes[(private_key_end_index + after_private_key.len())..].copy_from_slice(public_key); } ring-0.17.14/src/polyfill/array_flat_map.rs000064400000000000000000000076021046102023000167120ustar 00000000000000use core::iter::FlatMap; /// A specialized version of `core::iter::FlatMap` for mapping over exact-sized /// iterators with a function that returns an array. /// /// `ArrayFlatMap` differs from `FlatMap` in that `ArrayFlatMap` implements /// `ExactSizeIterator`. Since the result of `F` always has `LEN` elements, if /// `I` is an exact-sized iterator of length `inner_len` then we know the /// length of the flat-mapped result is `inner_len * LEN`. (The constructor /// verifies that this multiplication doesn't overflow `usize`.) #[derive(Clone)] pub struct ArrayFlatMap { inner: FlatMap, remaining: usize, } impl ArrayFlatMap where I: ExactSizeIterator, F: FnMut(I::Item) -> [Item; LEN], { /// Constructs an `ArrayFlatMap` wrapping the given iterator, using the /// given function pub fn new(inner: I, f: F) -> Option { let remaining = inner.len().checked_mul(LEN)?; let inner = inner.flat_map(f); Some(Self { inner, remaining }) } } impl Iterator for ArrayFlatMap where I: Iterator, F: FnMut(I::Item) -> [Item; LEN], { type Item = Item; fn next(&mut self) -> Option { let result = self.inner.next(); if result.is_some() { self.remaining -= 1; } result } /// Required for implementing `ExactSizeIterator`. 
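/// Both bounds are returned as exactly `self.remaining` because the default
/// `ExactSizeIterator::len()` asserts that the lower and upper bounds from
/// `size_hint()` are equal. (Clarifying note; not in the upstream source.)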
fn size_hint(&self) -> (usize, Option) { (self.remaining, Some(self.remaining)) } } impl ExactSizeIterator for ArrayFlatMap where I: Iterator, F: FnMut(I::Item) -> [Item; LEN], { } #[cfg(test)] mod tests { use super::*; use core::mem::size_of; #[test] fn test_array_flat_map() { static TEST_CASES: &[(&[u16], fn(u16) -> [u8; 2], &[u8])] = &[ // Empty input (&[], u16::to_be_bytes, &[]), // Non-empty input. ( &[0x0102, 0x0304, 0x0506], u16::to_be_bytes, &[1, 2, 3, 4, 5, 6], ), // Test with a different mapping function. ( &[0x0102, 0x0304, 0x0506], u16::to_le_bytes, &[2, 1, 4, 3, 6, 5], ), ]; TEST_CASES.iter().copied().for_each(|(input, f, expected)| { let mapped = ArrayFlatMap::new(input.iter().copied(), f).unwrap(); super::super::test::assert_iterator(mapped, expected); }); } // Does ArrayFlatMap::new() handle overflow correctly? #[test] fn test_array_flat_map_len_overflow() { struct DownwardCounter { remaining: usize, } impl Iterator for DownwardCounter { type Item = usize; fn next(&mut self) -> Option { if self.remaining > 0 { let result = self.remaining; self.remaining -= 1; Some(result) } else { None } } fn size_hint(&self) -> (usize, Option) { (self.remaining, Some(self.remaining)) } } impl ExactSizeIterator for DownwardCounter {} const MAX: usize = usize::MAX / size_of::(); static TEST_CASES: &[(usize, bool)] = &[(MAX, true), (MAX + 1, false)]; TEST_CASES.iter().copied().for_each(|(input_len, is_some)| { let inner = DownwardCounter { remaining: input_len, }; let mapped = ArrayFlatMap::new(inner, usize::to_be_bytes); assert_eq!(mapped.is_some(), is_some); if let Some(mapped) = mapped { assert_eq!(mapped.len(), input_len * size_of::()); } }); } } ring-0.17.14/src/polyfill/array_split_map.rs000064400000000000000000000047441046102023000171230ustar 00000000000000// Copyright 2023 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
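/// Splits an array into consecutive fixed-size chunks and maps each chunk,
/// e.g. a `[u8; 12]` splits into three `[u8; 4]` chunks that map to a
/// `[u32; 3]`.
///
/// Illustrative sketch (this doc comment is not in the upstream source); the
/// trait is crate-private, so the snippet is not runnable as a doctest:
///
/// ```ignore
/// let words: [u32; 3] =
///     [1u8, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0].array_split_map(u32::from_le_bytes);
/// assert_eq!(words, [1, 2, 3]);
/// ```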
pub trait ArraySplitMap { fn array_split_map(self, f: impl Fn([I; CN]) -> O) -> [O; ON]; } impl ArraySplitMap for [I; 12] { #[inline] fn array_split_map(self, f: impl Fn([I; 4]) -> O) -> [O; 3] { let [a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3] = self; [ f([a0, a1, a2, a3]), f([b0, b1, b2, b3]), f([c0, c1, c2, c3]), ] } } impl ArraySplitMap for [I; 16] { #[inline] fn array_split_map(self, f: impl Fn([I; 4]) -> O) -> [O; 4] { let [a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3] = self; [ f([a0, a1, a2, a3]), f([b0, b1, b2, b3]), f([c0, c1, c2, c3]), f([d0, d1, d2, d3]), ] } } impl ArraySplitMap for [I; 32] { #[inline] fn array_split_map(self, f: impl Fn([I; 4]) -> O) -> [O; 8] { let [a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3, e0, e1, e2, e3, f0, f1, f2, f3, g0, g1, g2, g3, h0, h1, h2, h3] = self; [ f([a0, a1, a2, a3]), f([b0, b1, b2, b3]), f([c0, c1, c2, c3]), f([d0, d1, d2, d3]), f([e0, e1, e2, e3]), f([f0, f1, f2, f3]), f([g0, g1, g2, g3]), f([h0, h1, h2, h3]), ] } } impl ArraySplitMap for [I; 16] { #[inline] fn array_split_map(self, f: impl Fn([I; 8]) -> O) -> [O; 2] { let [a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7] = self; [ f([a0, a1, a2, a3, a4, a5, a6, a7]), f([b0, b1, b2, b3, b4, b5, b6, b7]), ] } } ring-0.17.14/src/polyfill/cold_error.rs000064400000000000000000000071331046102023000160620ustar 00000000000000// Copyright 2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. /// Reduces boilerplate for defining error types where we want the compiler to /// optimize for the non-error path by assuming constructing an error is /// unlikely/cold code. /// /// WARNING: Every struct/variant must contain some *non-constant* value so /// that the "invariant code" pass of the compiler doesn't recognize the /// constructor as being "invariant code" and optimizing it away; /// although such optimization would be nice to take advantage of, it /// seems to lose the `#[cold]` attribute. /// /// Constructor functions ar marked `pub(super)` to ensure that instances can /// only be constructed from within the enclosing module (and its submodules). /// /// XXX: #[inline(never)] is required to avoid the (MIR?) optimizer inlining /// away the function call and losing the `#[cold]` attribute in the process. /// We'd otherwise maybe prefer all constructors to be inline. /// /// The type is defined in its own submodule `#mod_name` to hide the /// variant/struct constructor, ensuring instances are only constructed /// through the generated `$constructor` functions. The constructor methods /// work around the lack of the ability to mark an enum variant `#[cold]` and /// `#[inline(never)]`. macro_rules! 
cold_exhaustive_error { // struct { struct $mod_name:ident::$Error:ident with $vis:vis constructor { $field:ident: $ValueType:ty } } => { mod $mod_name { #[allow(unused_imports)] use super::*; // So `$ValueType` is in scope. pub struct $Error { #[allow(dead_code)] $field: $ValueType } impl $Error { #[cold] #[inline(never)] $vis fn new($field: $ValueType) -> Self { Self { $field } } } } }; // struct with default constructor visibility. { struct $mod_name:ident::$Error:ident { $field:ident: $ValueType:ty } } => { cold_exhaustive_error! { struct $mod_name::$Error with pub(super) constructor { $field: $ValueType } } }; // enum { enum $mod_name:ident::$Error:ident { $( $constructor:ident => $Variant:ident($ValueType:ty), )+ } } => { mod $mod_name { #[allow(unused_imports)] use super::*; // So `$ValueType` is in scope. pub enum $Error { $( $Variant(#[allow(dead_code)] $ValueType) ),+ } impl $Error { $( #[cold] #[inline(never)] pub(super) fn $constructor(value: $ValueType) -> Self { Self::$Variant(value) } )+ } } }; } ring-0.17.14/src/polyfill/cstr.rs000064400000000000000000000075571046102023000147150ustar 00000000000000// Copyright 2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #![cfg(all( target_vendor = "apple", any( target_os = "ios", target_os = "macos", target_os = "tvos", target_os = "visionos", target_os = "watchos" ) ))] //! Work around lack of `core::ffi::CStr` prior to Rust 1.64, and the lack of //! `const fn` support for `CStr` in later versions. #![cfg(all( all(target_arch = "aarch64", target_endian = "little"), target_vendor = "apple" ))] use core::mem::{align_of, size_of}; // TODO(MSRV 1.64): Use `core::ffi::c_char`. use libc::c_char; // TODO(MSRV 1.64): Replace with `&core::ffi::CStr`. pub struct Ref(&'static [u8]); impl Ref { #[inline(always)] pub fn as_ptr(&self) -> *const c_char { const _SAME_ALIGNMENT: () = assert!(align_of::() == align_of::()); const _SAME_SIZE: () = assert!(size_of::() == size_of::()); // It is safe to cast a `*const u8` to a `const c_char` as they are the // same size and alignment. self.0.as_ptr().cast() } // SAFETY: Same as `CStr::from_bytes_with_nul_unchecked`. const unsafe fn from_bytes_with_nul_unchecked(value: &'static [u8]) -> Self { Self(value) } } pub const fn unwrap_const_from_bytes_with_nul(value: &'static [u8]) -> Ref { // XXX: We cannot use `unwrap_const` since `Ref`/`CStr` is not `Copy`. match const_from_bytes_with_nul(value) { Some(r) => r, None => panic!("const_from_bytes_with_nul failed"), } } // TODO(MSRV 1.72): Replace with `CStr::from_bytes_with_nul`. #[inline(always)] const fn const_from_bytes_with_nul(value: &'static [u8]) -> Option { const fn const_contains(mut value: &[u8], needle: &u8) -> bool { while let [head, tail @ ..] 
= value { if *head == *needle { return true; } value = tail; } false } // TODO(MSRV 1.69): Use `core::ffi::CStr::from_bytes_until_nul` match value { [before_nul @ .., 0] if !const_contains(before_nul, &0) => { // SAFETY: // * `value` is nul-terminated according to the slice pattern. // * `value` doesn't contain any interior null, by the guard. // TODO(MSRV 1.64): Use `CStr::from_bytes_with_nul_unchecked` Some(unsafe { Ref::from_bytes_with_nul_unchecked(value) }) } _ => None, } } mod tests { use super::const_from_bytes_with_nul; // Bad. const _EMPTY_UNTERMINATED: () = assert!(const_from_bytes_with_nul(b"").is_none()); const _EMPTY_DOUBLE_TERMINATED: () = assert!(const_from_bytes_with_nul(b"\0\0").is_none()); const _DOUBLE_NUL: () = assert!(const_from_bytes_with_nul(b"\0\0").is_none()); const _LEADINGL_NUL: () = assert!(const_from_bytes_with_nul(b"\0a\0").is_none()); const _INTERNAL_NUL_UNTERMINATED: () = assert!(const_from_bytes_with_nul(b"\0a").is_none()); // Good. const _EMPTY_TERMINATED: () = assert!(const_from_bytes_with_nul(b"\0").is_some()); const _NONEMPTY: () = assert!(const_from_bytes_with_nul(b"asdf\0").is_some()); const _1_CHAR: () = assert!(const_from_bytes_with_nul(b"a\0").is_some()); } ring-0.17.14/src/polyfill/leading_zeros_skipped.rs000064400000000000000000000034121046102023000202700ustar 00000000000000use core::iter::Peekable; /// An iterator that skips all leading zeros. /// /// When the wrapped iterator is all zeros, then the last item is retained. pub struct LeadingZerosStripped where I: Iterator, { inner: Peekable, } impl Clone for LeadingZerosStripped where I: Iterator, Peekable: Clone, { fn clone(&self) -> Self { Self { inner: self.inner.clone(), } } } impl LeadingZerosStripped where I: ExactSizeIterator, { pub fn new(inner: I) -> Self { let mut len = inner.len(); let mut inner = inner.peekable(); // Strip all leading zeroes, but don't strip the last byte if all bytes // were zero. while len > 1 && inner.next_if_eq(&0).is_some() { len -= 1; } Self { inner } } } impl Iterator for LeadingZerosStripped where I: Iterator, { type Item = I::Item; fn next(&mut self) -> Option { self.inner.next() } fn size_hint(&self) -> (usize, Option) { self.inner.size_hint() } } impl ExactSizeIterator for LeadingZerosStripped where I: ExactSizeIterator {} #[cfg(test)] mod tests { use super::*; #[test] fn test_leading_zeroes_stripped() { static TEST_CASES: &[(&[u8], &[u8])] = &[ (&[], &[]), (&[0], &[0]), (&[0, 1], &[1]), (&[0, 0, 1], &[1]), (&[0, 0, 0, 1], &[1]), (&[1, 0], &[1, 0]), (&[0, 1, 0], &[1, 0]), ]; TEST_CASES.iter().copied().for_each(|(input, expected)| { let stripped = LeadingZerosStripped::new(input.iter().copied()); super::super::test::assert_iterator(stripped, expected); }); } } ring-0.17.14/src/polyfill/notsend.rs000064400000000000000000000023411046102023000153760ustar 00000000000000// Copyright 2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use crate::testutil; use core::{marker::PhantomData, mem::size_of}; /// A ZST that can be added to any type to make the type `!Send`. #[derive(Clone, Copy)] pub struct NotSend(PhantomData<*mut ()>); impl NotSend { pub const VALUE: Self = Self(PhantomData); } #[allow(deprecated)] const _: () = testutil::compile_time_assert_clone::(); #[allow(deprecated)] const _: () = testutil::compile_time_assert_copy::(); const _: () = assert!(size_of::() == 0); ring-0.17.14/src/polyfill/once_cell/LICENSE-APACHE000064400000000000000000000251371046102023000172350ustar 00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) 
The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ring-0.17.14/src/polyfill/once_cell/LICENSE-MIT000064400000000000000000000017761046102023000167500ustar 00000000000000Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHOR OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ring-0.17.14/src/polyfill/once_cell/race.rs000064400000000000000000000050501046102023000165610ustar 00000000000000//! Thread-safe, non-blocking, "first one wins" flavor of `OnceCell`. //! //! If two threads race to initialize a type from the `race` module, they //! don't block, execute initialization function together, but only one of //! them stores the result. //! //! This module does not require `std` feature. //! //! # Atomic orderings //! //! All types in this module use `Acquire` and `Release` //! [atomic orderings](Ordering) for all their operations. While this is not //! strictly necessary for types other than `OnceBox`, it is useful for users as //! it allows them to be certain that after `get` or `get_or_init` returns on //! one thread, any side-effects caused by the setter thread prior to them //! calling `set` or `get_or_init` will be made visible to that thread; without //! it, it's possible for it to appear as if they haven't happened yet from the //! getter thread's perspective. This is an acceptable tradeoff to make since //! `Acquire` and `Release` have very little performance overhead on most //! architectures versus `Relaxed`. use core::sync::atomic; use atomic::{AtomicUsize, Ordering}; use core::num::NonZeroUsize; /// A thread-safe cell which can be written to only once. pub struct OnceNonZeroUsize { inner: AtomicUsize, } impl OnceNonZeroUsize { /// Creates a new empty cell. #[inline] pub const fn new() -> OnceNonZeroUsize { OnceNonZeroUsize { inner: AtomicUsize::new(0), } } /// Gets the underlying value. 
#[inline] pub fn get(&self) -> Option { let val = self.inner.load(Ordering::Acquire); NonZeroUsize::new(val) } /// Gets the contents of the cell, initializing it with `f` if the cell was /// empty. /// /// If several threads concurrently run `get_or_init`, more than one `f` can /// be called. However, all threads will return the same value, produced by /// some `f`. pub fn get_or_init(&self, f: F) -> NonZeroUsize where F: FnOnce() -> NonZeroUsize, { let val = self.inner.load(Ordering::Acquire); match NonZeroUsize::new(val) { Some(it) => it, None => self.init(f), } } #[cold] #[inline(never)] fn init(&self, f: impl FnOnce() -> NonZeroUsize) -> NonZeroUsize { let mut val = f().get(); let exchange = self .inner .compare_exchange(0, val, Ordering::AcqRel, Ordering::Acquire); if let Err(old) = exchange { val = old; } unsafe { NonZeroUsize::new_unchecked(val) } } } ring-0.17.14/src/polyfill/ptr.rs000064400000000000000000000020611046102023000145300ustar 00000000000000// Copyright 2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. // TODO(MSRV 1.76): Replace with `core::ptr::from_mut`. #[allow(dead_code)] #[inline(always)] pub fn from_mut(r: &mut T) -> *mut T { r } // TODO(MSRV 1.76): Replace with `core::ptr::from_ref`. #[allow(dead_code)] #[inline(always)] pub const fn from_ref(r: &T) -> *const T { r } ring-0.17.14/src/polyfill/slice/as_chunks.rs000064400000000000000000000064641046102023000170130ustar 00000000000000// Copyright 2025 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
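// Illustrative usage sketch (this comment is not in the upstream source):
// this module polyfills the standard library's `slice::as_chunks` API, e.g.
//
//     let (chunks, rest) = as_chunks::<u8, 4>(&[1, 2, 3, 4, 5, 6, 7, 8, 9]);
//     assert_eq!(chunks.len(), 2);           // two full 4-byte chunks
//     assert_eq!(&chunks[0], &[1, 2, 3, 4]); // indexing yields `&[T; N]`
//     assert_eq!(rest, &[9]);                // leftover elements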
use super::AsChunksMut;
use core::ops;

#[inline(always)]
pub fn as_chunks<T, const N: usize>(slice: &[T]) -> (AsChunks<T, N>, &[T]) {
    assert!(N != 0, "chunk size must be non-zero");
    let len = slice.len() / N;
    let (multiple_of_n, remainder) = slice.split_at(len * N);
    (AsChunks(multiple_of_n), remainder)
}

#[derive(Clone, Copy)]
pub struct AsChunks<'a, T, const N: usize>(&'a [T]);

impl<'a, T, const N: usize> AsChunks<'a, T, N> {
    #[inline(always)]
    pub fn from_ref(value: &'a [T; N]) -> Self {
        Self(value)
    }

    #[inline(always)]
    pub fn as_flattened(&self) -> &[T] {
        self.0
    }

    #[cfg(any(target_arch = "aarch64", target_arch = "arm", target_arch = "x86_64"))]
    #[inline(always)]
    pub fn as_ptr(&self) -> *const [T; N] {
        self.0.as_ptr().cast()
    }

    #[inline(always)]
    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }

    #[inline(always)]
    pub fn len(&self) -> usize {
        self.0.len() / N
    }
}

impl<T, const N: usize> ops::Index<usize> for AsChunks<'_, T, N>
where
    [T]: ops::Index<ops::Range<usize>, Output = [T]>,
{
    type Output = [T; N];

    #[inline(always)]
    fn index(&self, index: usize) -> &Self::Output {
        let start = N * index;
        let slice = &self.0[start..(start + N)];
        slice.try_into().unwrap()
    }
}

impl<'a, T, const N: usize> IntoIterator for AsChunks<'a, T, N> {
    type IntoIter = AsChunksIter<'a, T, N>;
    type Item = &'a [T; N];

    #[inline(always)]
    fn into_iter(self) -> Self::IntoIter {
        AsChunksIter(self.0.chunks_exact(N))
    }
}

pub struct AsChunksIter<'a, T, const N: usize>(core::slice::ChunksExact<'a, T>);

impl<'a, T, const N: usize> Iterator for AsChunksIter<'a, T, N> {
    type Item = &'a [T; N];

    #[inline(always)]
    fn next(&mut self) -> Option<Self::Item> {
        self.0.next().map(|x| x.try_into().unwrap())
    }
}

// `&mut [[T; N]]` is implicitly convertible to `&[[T; N]]` but our types can't
// do that.
impl<'a, T, const N: usize> From<&'a AsChunksMut<'_, T, N>> for AsChunks<'a, T, N> {
    #[inline(always)]
    fn from(as_mut: &'a AsChunksMut<'_, T, N>) -> Self {
        Self(as_mut.as_flattened())
    }
}

impl<'a, T, const N: usize> From<&'a [T; N]> for AsChunks<'a, T, N> {
    #[inline(always)]
    fn from(array: &'a [T; N]) -> Self {
        Self(array)
    }
}

// TODO: `impl From for AsChunks<'a, T, N>`.
impl<'a, T> From<AsChunks<'a, T, 8>> for AsChunks<'a, T, 4> {
    #[inline(always)]
    fn from(as_2x: AsChunks<'a, T, 8>) -> Self {
        Self(as_2x.0)
    }
}
ring-0.17.14/src/polyfill/slice/as_chunks_mut.rs000064400000000000000000000055471046102023000177010ustar 00000000000000// Copyright 2025 Brian Smith.
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
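// A minimal usage sketch of the `as_chunks_mut`/`AsChunksMut` polyfill defined
// below (an illustrative example, not part of the upstream test suite); it
// assumes the same `<T, const N: usize>` generics as the immutable variant. The
// mutable chunks alias the original buffer, so writes through them are visible
// in the source slice.
#[cfg(test)]
mod as_chunks_mut_usage_example {
    use super::*;

    #[test]
    fn writes_through_chunks_update_the_buffer() {
        let mut buf = [0u8; 10];
        {
            let (mut chunks, remainder) = as_chunks_mut::<u8, 4>(&mut buf);

            // Two complete 4-byte chunks are writable; 2 bytes are left over.
            assert_eq!(chunks.as_flattened().len(), 8);
            assert_eq!(remainder.len(), 2);

            // Fill the chunked prefix through the mutable flattened view.
            chunks.as_flattened_mut().copy_from_slice(&[1; 8]);
        }
        assert_eq!(buf, [1, 1, 1, 1, 1, 1, 1, 1, 0, 0]);
    }
}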
use super::AsChunks; #[inline(always)] pub fn as_chunks_mut(slice: &mut [T]) -> (AsChunksMut, &mut [T]) { assert!(N != 0, "chunk size must be non-zero"); let len = slice.len() / N; let (multiple_of_n, remainder) = slice.split_at_mut(len * N); (AsChunksMut(multiple_of_n), remainder) } pub struct AsChunksMut<'a, T, const N: usize>(&'a mut [T]); impl AsChunksMut<'_, T, N> { #[inline(always)] pub fn as_flattened(&self) -> &[T] { self.0 } #[inline(always)] pub fn as_flattened_mut(&mut self) -> &mut [T] { self.0 } #[cfg(target_arch = "aarch64")] pub fn as_ptr(&self) -> *const [T; N] { self.0.as_ptr().cast() } #[cfg(target_arch = "x86_64")] pub fn as_ptr(&self) -> *const [T; N] { self.0.as_ptr().cast() } #[cfg(target_arch = "aarch64")] pub fn as_mut_ptr(&mut self) -> *mut [T; N] { self.0.as_mut_ptr().cast() } #[cfg(target_arch = "x86_64")] #[inline(always)] pub fn as_mut(&mut self) -> AsChunksMut { AsChunksMut(self.0) } #[inline(always)] pub fn as_ref(&self) -> AsChunks { AsChunks::::from(self) } // Argument moved from runtime argument to `const` argument so that // `CHUNK_LEN * N` is checked at compile time for overflow. #[inline(always)] pub fn chunks_mut(&mut self) -> AsChunksMutChunksMutIter { AsChunksMutChunksMutIter(self.0.chunks_mut(CHUNK_LEN * N)) } #[cfg(target_arch = "x86_64")] #[inline(always)] pub fn split_at_mut(&mut self, mid: usize) -> (AsChunksMut, AsChunksMut) { let (before, after) = self.0.split_at_mut(mid * N); (AsChunksMut(before), AsChunksMut(after)) } } pub struct AsChunksMutChunksMutIter<'a, T, const N: usize>(core::slice::ChunksMut<'a, T>); impl<'a, T, const N: usize> Iterator for AsChunksMutChunksMutIter<'a, T, N> { type Item = AsChunksMut<'a, T, N>; #[inline(always)] fn next(&mut self) -> Option { self.0.next().map(AsChunksMut) } } ring-0.17.14/src/polyfill/slice.rs000064400000000000000000000041041046102023000150220ustar 00000000000000// Permission is hereby granted, free of charge, to any // person obtaining a copy of this software and associated // documentation files (the "Software"), to deal in the // Software without restriction, including without // limitation the rights to use, copy, modify, merge, // publish, distribute, sublicense, and/or sell copies of // the Software, and to permit persons to whom the Software // is furnished to do so, subject to the following // conditions: // // The above copyright notice and this permission notice // shall be included in all copies or substantial portions // of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF // ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED // TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A // PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT // SHALL THE AUTHOR OR COPYRIGHT HOLDERS BE LIABLE FOR ANY // CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR // IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER // DEALINGS IN THE SOFTWARE. mod as_chunks; mod as_chunks_mut; pub use as_chunks::{as_chunks, AsChunks}; pub use as_chunks_mut::{as_chunks_mut, AsChunksMut}; // TODO(MSRV feature(split_at_checked)): Use `slice::split_at_checked`. // // Note that the libcore version is implemented in terms of // `slice::split_at_unchecked()`, and `slice::split_at()` was changed to be // implemented in terms of `split_at_checked`. For now, we implement this in // terms of `split_at` and rely on the optimizer to eliminate the panic. 
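//
// An illustrative usage sketch of the polyfill below (a hypothetical call
// site, not taken from this crate):
//
//     let bytes = [1u8, 2, 3, 4];
//     assert_eq!(
//         split_at_checked(&bytes, 2),
//         Some((&bytes[..2], &bytes[2..]))
//     );
//     assert!(split_at_checked(&bytes, 5).is_none()); // out of range -> None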
#[inline(always)]
pub fn split_at_checked<T>(slice: &[T], i: usize) -> Option<(&[T], &[T])> {
    if slice.len() >= i {
        Some(slice.split_at(i))
    } else {
        None
    }
}

// TODO(MSRV-1.77): Use `slice::split_first_chunk_mut`.
#[inline(always)]
pub fn split_first_chunk_mut<T, const N: usize>(
    slice: &mut [T],
) -> Option<(&mut [T; N], &mut [T])> {
    if slice.len() >= N {
        let (head, tail) = slice.split_at_mut(N);
        head.try_into().ok().map(|head| (head, tail))
    } else {
        None
    }
}
ring-0.17.14/src/polyfill/sliceutil.rs000064400000000000000000000021221046102023000157160ustar 00000000000000// Copyright 2024 Brian Smith.
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

//! Utilities to make dealing with slices less tedious.

/// Replaces the first N elements of `a` with the first N elements of `b`, where
/// N is `core::cmp::min(a.len(), b.len())`, leaving the rest unchanged.
pub fn overwrite_at_start<T: Copy>(a: &mut [T], b: &[T]) {
    a.iter_mut().zip(b).for_each(|(a, b)| {
        *a = *b;
    });
}
ring-0.17.14/src/polyfill/test.rs000064400000000000000000000017341046102023000147100ustar 00000000000000pub fn assert_iterator<T>(it: impl ExactSizeIterator<Item = T> + Clone, expected: &[T])
where
    T: Copy + core::fmt::Debug + PartialEq,
{
    // Assert that the cloned iterator is correct.
    assert_exact_size_iterator(it.clone(), expected);
    // Assert that the original iterator is correct.
    assert_exact_size_iterator(it, expected);
}

/// Asserts that `it` adheres to the `ExactSizeIterator` contract.
fn assert_exact_size_iterator<T>(mut it: impl ExactSizeIterator<Item = T>, expected: &[T])
where
    T: Copy + core::fmt::Debug + PartialEq,
{
    assert_eq!(it.len(), expected.len());
    assert_eq!(it.size_hint(), expected.iter().size_hint());

    for i in 0..expected.len() {
        let len = it.len();
        assert_eq!(len, expected.len() - i);
        assert_eq!(it.size_hint(), (len, Some(len)));
        assert_eq!(it.next(), Some(expected[i]));
    }

    assert_eq!(it.len(), 0);
    assert_eq!(it.size_hint(), (0, Some(0)));
    assert_eq!(it.next(), None);
}
ring-0.17.14/src/polyfill/unwrap_const.rs000064400000000000000000000022351046102023000164500ustar 00000000000000// Copyright 2022 Brian Smith.
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
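// A minimal usage sketch of `unwrap_const` (an illustrative example, not part
// of the upstream test suite): because the function is `const`, it can unwrap
// an `Option` while initializing a `const` item, where `Option::unwrap` could
// not be used on older MSRVs.
#[cfg(test)]
mod unwrap_const_usage_example {
    use super::unwrap_const;
    use core::num::NonZeroUsize;

    // Evaluated at compile time; would fail to compile only if `new(16)`
    // returned `None`, which it cannot.
    const BLOCK_LEN: NonZeroUsize = unwrap_const(NonZeroUsize::new(16));

    #[test]
    fn unwraps_some_in_const_context() {
        assert_eq!(BLOCK_LEN.get(), 16);
    }
}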
/// Polyfill for `Option::unwrap()` as a const fn; feature `const_option`. /// https://github.com/rust-lang/rust/issues/67441. /// TODO(MSRV): Replace this with `x.unwrap()`. /// /// `T: Copy` avoids "constant functions cannot evaluate destructors." pub const fn unwrap_const(x: Option) -> T where T: Copy, { if let Some(x) = x { x } else { panic!("unwrap_const on `None`"); } } ring-0.17.14/src/polyfill.rs000064400000000000000000000055061046102023000137320ustar 00000000000000// Copyright 2015-2016 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! Polyfills for functionality that will (hopefully) be added to Rust's //! standard library soon. #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] #[inline(always)] pub const fn u64_from_usize(x: usize) -> u64 { x as u64 } #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] pub const fn usize_from_u32(x: u32) -> usize { x as usize } #[cfg(all( target_arch = "aarch64", target_endian = "little", target_pointer_width = "64" ))] #[allow(clippy::cast_possible_truncation)] pub fn usize_from_u64(x: u64) -> usize { x as usize } /// const-capable `x.try_into().unwrap_or(usize::MAX)` #[allow(clippy::cast_possible_truncation)] #[inline(always)] pub const fn usize_from_u64_saturated(x: u64) -> usize { const USIZE_MAX: u64 = u64_from_usize(usize::MAX); if x < USIZE_MAX { x as usize } else { usize::MAX } } #[macro_use] mod cold_error; mod array_flat_map; mod array_split_map; pub mod cstr; pub mod sliceutil; #[cfg(feature = "alloc")] mod leading_zeros_skipped; #[cfg(any( all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little"), target_arch = "x86", target_arch = "x86_64" ))] pub mod once_cell { pub mod race; } mod notsend; pub mod ptr; pub mod slice; #[cfg(test)] mod test; mod unwrap_const; pub use self::{ array_flat_map::ArrayFlatMap, array_split_map::ArraySplitMap, notsend::NotSend, unwrap_const::unwrap_const, }; #[cfg(feature = "alloc")] pub use leading_zeros_skipped::LeadingZerosStripped; #[cfg(test)] mod tests { use super::*; #[test] fn test_usize_from_u64_saturated() { const USIZE_MAX: u64 = u64_from_usize(usize::MAX); assert_eq!(usize_from_u64_saturated(u64::MIN), usize::MIN); assert_eq!(usize_from_u64_saturated(USIZE_MAX), usize::MAX); assert_eq!(usize_from_u64_saturated(USIZE_MAX - 1), usize::MAX - 1); #[cfg(not(target_pointer_width = "64"))] { assert_eq!(usize_from_u64_saturated(USIZE_MAX + 1), usize::MAX); } } } ring-0.17.14/src/prefixed.rs000064400000000000000000000053771046102023000137140ustar 00000000000000// Keep in sync with `core_name_and_version` in build.rs. macro_rules! 
core_name_and_version { () => { concat!( env!("CARGO_PKG_NAME"), "_core_", env!("CARGO_PKG_VERSION_MAJOR"), "_", env!("CARGO_PKG_VERSION_MINOR"), "_", env!("CARGO_PKG_VERSION_PATCH"), "_", env!("CARGO_PKG_VERSION_PRE"), // Often empty ) }; } // Keep in sync with `prefix` in build.rs. macro_rules! prefix { ( ) => { concat!(core_name_and_version!(), "_") }; } macro_rules! prefixed_extern { // Functions. { $( $( #[$meta:meta] )* $vis:vis fn $name:ident ( $( $arg_pat:ident : $arg_ty:ty ),* $(,)? ) $( -> $ret_ty:ty )?; )+ } => { extern "C" { $( prefixed_item! { link_name $name { $( #[$meta] )* $vis fn $name ( $( $arg_pat : $arg_ty ),* ) $( -> $ret_ty )?; } } )+ } }; // A `static` global variable. { $( #[$meta:meta] )* $vis:vis static $name:ident: $typ:ty; } => { extern "C" { prefixed_item! { link_name $name { $( #[$meta] )* $vis static $name: $typ; } } } }; // A `static mut` global variable. { $( #[$meta:meta] )* $vis:vis static mut $name:ident: $typ:ty; } => { extern "C" { prefixed_item! { link_name $name { $( #[$meta] )* $vis static mut $name: $typ; } } } }; } #[deprecated = "`#[export_name]` creates problems and we will stop doing it."] #[cfg(not(any( all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little"), target_arch = "x86", target_arch = "x86_64" )))] macro_rules! prefixed_export { // A function. { $( #[$meta:meta] )* $vis:vis unsafe extern "C" fn $name:ident ( $( $arg_pat:ident : $arg_ty:ty ),* $(,)? ) $body:block } => { prefixed_item! { export_name $name { $( #[$meta] )* $vis unsafe extern "C" fn $name ( $( $arg_pat : $arg_ty ),* ) $body } } }; } macro_rules! prefixed_item { { $attr:ident $name:ident { $item:item } } => { #[$attr = concat!(prefix!(), stringify!($name))] $item }; } ring-0.17.14/src/rand.rs000064400000000000000000000124461046102023000130250ustar 00000000000000// Copyright 2015-2016 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. //! Cryptographic pseudo-random number generation. //! //! *ring* functions that generate random bytes take a `&dyn SecureRandom` //! parameter to make it clear which functions are non-deterministic. use crate::error; /// A secure random number generator. pub trait SecureRandom: sealed::SecureRandom { /// Fills `dest` with random bytes. fn fill(&self, dest: &mut [u8]) -> Result<(), error::Unspecified>; } impl SecureRandom for T where T: sealed::SecureRandom, { #[inline(always)] fn fill(&self, dest: &mut [u8]) -> Result<(), error::Unspecified> { self.fill_impl(dest) } } /// A random value constructed from a `SecureRandom` that hasn't been exposed /// through any safe Rust interface. /// /// Intentionally does not implement any traits other than `Sized`. pub struct Random(T); impl Random { /// Expose the random value. #[inline] pub fn expose(self) -> T { self.0 } } /// Generate the new random value using `rng`. 
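///
/// # Example
///
/// A minimal illustrative sketch (marked `ignore`; it assumes the caller just
/// needs a fixed-size array of random bytes and can propagate the error):
///
/// ```ignore
/// use ring::rand::{self, SystemRandom};
///
/// let rng = SystemRandom::new();
/// // `[u8; 32]` implements `RandomlyConstructable`, so it can be generated
/// // directly; `expose` returns the filled array.
/// let key_bytes: [u8; 32] = rand::generate(&rng)?.expose();
/// ```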
#[inline] pub fn generate( rng: &dyn SecureRandom, ) -> Result, error::Unspecified> { let mut r = T::zero(); rng.fill(r.as_mut_bytes())?; Ok(Random(r)) } pub(crate) mod sealed { use crate::error; pub trait SecureRandom: core::fmt::Debug { /// Fills `dest` with random bytes. fn fill_impl(&self, dest: &mut [u8]) -> Result<(), error::Unspecified>; } pub trait RandomlyConstructable: Sized { fn zero() -> Self; // `Default::default()` fn as_mut_bytes(&mut self) -> &mut [u8]; // `AsMut<[u8]>::as_mut` } impl RandomlyConstructable for [u8; N] { #[inline] fn zero() -> Self { [0; N] } #[inline] fn as_mut_bytes(&mut self) -> &mut [u8] { &mut self[..] } } } /// A type that can be returned by `ring::rand::generate()`. pub trait RandomlyConstructable: sealed::RandomlyConstructable {} impl RandomlyConstructable for T where T: sealed::RandomlyConstructable {} /// A secure random number generator where the random values come directly /// from the operating system. /// /// "Directly from the operating system" here presently means "whatever the /// `getrandom` crate does" but that may change in the future. That roughly /// means calling libc's `getrandom` function or whatever is analogous to that; /// see the `getrandom` crate's documentation for more info. /// /// A single `SystemRandom` may be shared across multiple threads safely. /// /// `new()` is guaranteed to always succeed and to have low latency; it won't /// try to open or read from a file or do similar things. The first call to /// `fill()` may block a substantial amount of time since any and all /// initialization is deferred to it. Therefore, it may be a good idea to call /// `fill()` once at a non-latency-sensitive time to minimize latency for /// future calls. #[derive(Clone, Debug)] pub struct SystemRandom(()); impl SystemRandom { /// Constructs a new `SystemRandom`. #[inline(always)] pub fn new() -> Self { Self(()) } } impl crate::sealed::Sealed for SystemRandom {} // Use the `getrandom` crate whenever it is using the environment's (operating // system's) CSPRNG. Avoid using it on targets where it uses the `rdrand` // implementation. #[cfg(any( all(feature = "less-safe-getrandom-custom-or-rdrand", target_os = "none"), all(feature = "less-safe-getrandom-espidf", target_os = "espidf"), target_os = "aix", target_os = "android", target_os = "dragonfly", target_os = "freebsd", target_os = "fuchsia", target_os = "haiku", target_os = "hermit", target_os = "hurd", target_os = "horizon", target_os = "illumos", target_os = "linux", target_os = "netbsd", target_os = "openbsd", target_os = "redox", target_os = "solaris", target_os = "vita", target_os = "windows", all( target_vendor = "apple", any( target_os = "ios", target_os = "macos", target_os = "tvos", target_os = "visionos", target_os = "watchos", ) ), all( target_arch = "wasm32", any( target_os = "wasi", all(target_os = "unknown", feature = "wasm32_unknown_unknown_js") ) ), ))] impl sealed::SecureRandom for SystemRandom { #[inline(always)] fn fill_impl(&self, dest: &mut [u8]) -> Result<(), error::Unspecified> { getrandom::getrandom(dest).map_err(|_| error::Unspecified) } } ring-0.17.14/src/rsa/keypair.rs000064400000000000000000000626071046102023000143360ustar 00000000000000// Copyright 2015-2016 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. 
// // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::{ padding::{self, RsaEncoding}, KeyPairComponents, PublicExponent, PublicKey, PublicKeyComponents, N, }; /// RSA PKCS#1 1.5 signatures. use crate::{ arithmetic::{ bigint, montgomery::{R, RR, RRR}, LimbSliceError, }, bits::BitLength, cpu, digest, error::{self, KeyRejected}, io::der, pkcs8, rand, signature, }; /// An RSA key pair, used for signing. pub struct KeyPair { p: PrivateCrtPrime