blake3-1.5.4/.cargo_vcs_info.json0000644000000001360000000000100122040ustar { "git": { "sha1": "95e42b84fc4709974c7b23c7ae885989ab36c31e" }, "path_in_vcs": "" }blake3-1.5.4/.git-blame-ignore-revs000064400000000000000000000001141046102023000150700ustar 00000000000000# CMakeLists.txt whitespace fixups 3e14f865d30271c74fc68d417af488ea91b66d48 blake3-1.5.4/.github/workflows/build_b3sum.py000064400000000000000000000021721046102023000171550ustar 00000000000000#! /usr/bin/env python3 from pathlib import Path import platform import shutil import subprocess import sys ROOT = Path(__file__).parent.parent.parent RUST_TARGET = sys.argv[1] subprocess.run( ["cargo", "build", "--target", sys.argv[1], "--release"], cwd=ROOT / "b3sum" ) if platform.system() == "Windows": original_exe_name = "b3sum.exe" else: original_exe_name = "b3sum" if platform.system() == "Windows": new_exe_name = "b3sum_windows_x64_bin.exe" elif platform.system() == "Darwin": new_exe_name = "b3sum_macos_x64_bin" elif platform.system() == "Linux": new_exe_name = "b3sum_linux_x64_bin" else: raise RuntimeError("Unexpected platform: " + platform.system()) # Copy the built binary so that it has the upload name we want. out_dir = ROOT / "b3sum/target" / RUST_TARGET / "release" original_exe_path = str(out_dir / original_exe_name) new_exe_path = str(out_dir / new_exe_name) print("copying", repr(original_exe_path), "to", repr(new_exe_path)) shutil.copyfile(original_exe_path, new_exe_path) # This lets the subsequent upload step get the filepath. 
print("::set-output name=bin_path::" + new_exe_path) blake3-1.5.4/.github/workflows/ci.yml000064400000000000000000000340331046102023000155120ustar 00000000000000name: tests on: push: branches: - "*" # not on tags pull_request: env: BLAKE3_CI: "1" RUSTFLAGS: "-D warnings" RUST_BACKTRACE: "1" jobs: library_tests: name: ${{ matrix.target.name }} ${{ matrix.channel }} runs-on: ${{ matrix.target.os }} strategy: fail-fast: false matrix: target: [ { "os": "ubuntu-latest", "toolchain": "x86_64-unknown-linux-gnu", "name": "Linux GNU" }, { "os": "macOS-latest", "toolchain": "x86_64-apple-darwin", "name": "macOS" }, { "os": "windows-latest", "toolchain": "x86_64-pc-windows-msvc", "name": "Windows MSVC" }, { "os": "windows-latest", "toolchain": "x86_64-pc-windows-gnu", "name": "Windows GNU" } ] channel: [ "stable", "beta", "nightly", # The current MSRV. This crate doesn't have an official MSRV policy, # but in practice we'll probably do what libc does: # https://github.com/rust-lang/libs-team/issues/72. # This test target is here so that we notice if we accidentally bump # the MSRV, but it's not a promise that we won't bump it. "1.70.0", ] steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@master with: toolchain: ${{ format('{0}-{1}', matrix.channel, matrix.target.toolchain) }} # Print the compiler version, for debugging. - name: print compiler version run: cargo run --quiet working-directory: ./tools/compiler_version # Print out instruction set support, for debugging. - name: print instruction set support run: cargo run --quiet working-directory: ./tools/instruction_set_support # Default tests plus Rayon and trait implementations. - run: cargo test --features=rayon,traits-preview,serde,zeroize # Same but with only one thread in the Rayon pool. This can find deadlocks. - name: "again with RAYON_NUM_THREADS=1" run: cargo test --features=rayon,traits-preview,serde,zeroize env: RAYON_NUM_THREADS: 1 # The mmap feature by itself (update_mmap_rayon is omitted). 
- run: cargo test --features=mmap # All public features put together. - run: cargo test --features=mmap,rayon,traits-preview,serde,zeroize # no_std tests. - run: cargo test --no-default-features # A matrix of different test settings: # - debug vs release # - assembly vs Rust+C intrinsics vs pure Rust intrinsics # - different levels of SIMD support # # Full SIMD support. - run: cargo test --features= - run: cargo test --features=prefer_intrinsics - run: cargo test --features=pure - run: cargo test --features= --release - run: cargo test --features=prefer_intrinsics --release - run: cargo test --features=pure --release # No AVX-512. - run: cargo test --features=no_avx512 - run: cargo test --features=no_avx512,prefer_intrinsics - run: cargo test --features=no_avx512,pure - run: cargo test --features=no_avx512 --release - run: cargo test --features=no_avx512,prefer_intrinsics --release - run: cargo test --features=no_avx512,pure --release # No AVX2. - run: cargo test --features=no_avx512,no_avx2 - run: cargo test --features=no_avx512,no_avx2,prefer_intrinsics - run: cargo test --features=no_avx512,no_avx2,pure - run: cargo test --features=no_avx512,no_avx2 --release - run: cargo test --features=no_avx512,no_avx2,prefer_intrinsics --release - run: cargo test --features=no_avx512,no_avx2,pure --release # No SSE4.1 - run: cargo test --features=no_avx512,no_avx2,no_sse41 - run: cargo test --features=no_avx512,no_avx2,no_sse41,prefer_intrinsics - run: cargo test --features=no_avx512,no_avx2,no_sse41,pure - run: cargo test --features=no_avx512,no_avx2,no_sse41 --release - run: cargo test --features=no_avx512,no_avx2,no_sse41,prefer_intrinsics --release - run: cargo test --features=no_avx512,no_avx2,no_sse41,pure --release # No SSE2 - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2 - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2,prefer_intrinsics - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2,pure - run: cargo test 
--features=no_avx512,no_avx2,no_sse41,no_sse2 --release - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2,prefer_intrinsics --release - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2,pure --release # Test benchmarks. RUSTC_BOOTSTRAP=1 lets this run on non-nightly toolchains. - run: cargo test --benches --features=rayon env: RUSTC_BOOTSTRAP: 1 # Test vectors. - name: test vectors run: cargo test working-directory: ./test_vectors - name: test vectors intrinsics run: cargo test --features=prefer_intrinsics working-directory: ./test_vectors - name: test vectors pure run: cargo test --features=pure working-directory: ./test_vectors # Test C code. - name: cargo test C bindings assembly run: cargo test working-directory: ./c/blake3_c_rust_bindings - name: cargo test C bindings intrinsics run: cargo test --features=prefer_intrinsics working-directory: ./c/blake3_c_rust_bindings - name: cargo test C bindings no AVX-512 run: cargo test working-directory: ./c/blake3_c_rust_bindings env: CFLAGS: -DBLAKE3_NO_AVX512 - name: cargo test C bindings no AVX2 run: cargo test working-directory: ./c/blake3_c_rust_bindings env: CFLAGS: -DBLAKE3_NO_AVX512 -DBLAKE3_NO_AVX2 - name: cargo test C bindings no SSE41 run: cargo test working-directory: ./c/blake3_c_rust_bindings env: CFLAGS: -DBLAKE3_NO_AVX512 -DBLAKE3_NO_AVX2 -DBLAKE3_NO_SSE41 - name: cargo test C bindings no SSE2 run: cargo test working-directory: ./c/blake3_c_rust_bindings env: CFLAGS: -DBLAKE3_NO_AVX512 -DBLAKE3_NO_AVX2 -DBLAKE3_NO_SSE41 -DBLAKE3_NO_SSE2 # Reference impl doc test. 
- name: reference impl doc test run: cargo test working-directory: ./reference_impl b3sum_tests: name: b3sum ${{ matrix.target.name }} ${{ matrix.channel }} runs-on: ${{ matrix.target.os }} strategy: fail-fast: false matrix: target: [ { "os": "ubuntu-latest", "toolchain": "x86_64-unknown-linux-gnu", "name": "Linux GNU" }, { "os": "macOS-latest", "toolchain": "x86_64-apple-darwin", "name": "macOS" }, { "os": "windows-latest", "toolchain": "x86_64-pc-windows-msvc", "name": "Windows MSVC" }, { "os": "windows-latest", "toolchain": "x86_64-pc-windows-gnu", "name": "Windows GNU" } ] channel: [ "stable", "beta", "nightly", # The b3sum MSRV is sometimes higher than the blake3 crate's, because # b3sum depends on Clap. We check in the b3sum Cargo.lock, so Clap # update shouldn't randomly break us here. "1.74.1", ] steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@master with: toolchain: ${{ format('{0}-{1}', matrix.channel, matrix.target.toolchain) }} # Test b3sum. - name: test b3sum run: cargo test working-directory: ./b3sum - name: test b3sum --no-default-features run: cargo test --no-default-features working-directory: ./b3sum cross_tests: name: cross ${{ matrix.arch }} runs-on: ubuntu-latest strategy: fail-fast: false matrix: arch: - i586-unknown-linux-musl - i686-unknown-linux-musl - armv7-unknown-linux-gnueabihf - aarch64-unknown-linux-gnu # Big-endian targets. See https://twitter.com/burntsushi5/status/1695483429997945092. - powerpc64-unknown-linux-gnu - s390x-unknown-linux-gnu steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable - run: cargo install cross # Test the portable implementation on everything. - run: cross test --target ${{ matrix.arch }} # Test building for ancient i386 processors without guaranteed SSE2 support. - run: cross rustc --target ${{ matrix.arch }} -- -C target-cpu=i386 if: startsWith(matrix.arch, 'i586-') || startsWith(matrix.arch, 'i686-') # Test the NEON implementation on ARM targets. 
- run: cross test --target ${{ matrix.arch }} --features=neon if: startsWith(matrix.arch, 'armv7-') || startsWith(matrix.arch, 'aarch64-') # NEON is enabled by default on aarch64, disabling it through the no_neon feature. - run: cross test --target ${{ matrix.arch }} --features=no_neon if: startsWith(matrix.arch, 'aarch64-') # Test vectors. Note that this uses a hacky script due to path dependency limitations. - run: ./test_vectors/cross_test.sh --target ${{ matrix.arch }} # C code. Same issue with the hacky script. - run: ./c/blake3_c_rust_bindings/cross_test.sh --target ${{ matrix.arch }} - run: ./c/blake3_c_rust_bindings/cross_test.sh --target ${{ matrix.arch }} --features=neon if: startsWith(matrix.arch, 'armv7-') || startsWith(matrix.arch, 'aarch64-') # Currently only on x86. c_tests: name: C Makefile tests runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 # Test the intrinsics-based implementations. - run: make -f Makefile.testing test working-directory: ./c - run: make -f Makefile.testing clean && rm blake3_sse2.c working-directory: ./c - run: BLAKE3_NO_SSE2=1 make -f Makefile.testing test working-directory: ./c - run: make -f Makefile.testing clean && rm blake3_sse41.c working-directory: ./c - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 make -f Makefile.testing test working-directory: ./c - run: make -f Makefile.testing clean && rm blake3_avx2.c working-directory: ./c - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 BLAKE3_NO_AVX2=1 make -f Makefile.testing test working-directory: ./c - run: make -f Makefile.testing clean && rm blake3_avx512.c working-directory: ./c - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 BLAKE3_NO_AVX2=1 BLAKE3_NO_AVX512=1 make -f Makefile.testing test working-directory: ./c # Test the assembly implementations. 
- run: make -f Makefile.testing test_asm working-directory: ./c - run: make -f Makefile.testing clean && rm blake3_sse2_x86-64_unix.S working-directory: ./c - run: BLAKE3_NO_SSE2=1 make -f Makefile.testing test_asm working-directory: ./c - run: make -f Makefile.testing clean && rm blake3_sse41_x86-64_unix.S working-directory: ./c - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 make -f Makefile.testing test_asm working-directory: ./c - run: make -f Makefile.testing clean && rm blake3_avx2_x86-64_unix.S working-directory: ./c - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 BLAKE3_NO_AVX2=1 make -f Makefile.testing test_asm working-directory: ./c - run: make -f Makefile.testing clean && rm blake3_avx512_x86-64_unix.S working-directory: ./c - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 BLAKE3_NO_AVX2=1 BLAKE3_NO_AVX512=1 make -f Makefile.testing test_asm working-directory: ./c # Restore the files we deleted above. - run: git checkout . # Build the example. - run: make -f Makefile.testing example working-directory: ./c # Note that this jobs builds AArch64 binaries from an x86_64 host. build_apple_silicon: name: build for Apple Silicon runs-on: macOS-latest strategy: fail-fast: false steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable with: targets: aarch64-apple-darwin - name: build blake3 run: cargo build --target aarch64-apple-darwin - name: build b3sum run: cargo build --target aarch64-apple-darwin working-directory: ./b3sum build_tinycc: name: build with the Tiny C Compiler runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: install TCC run: sudo apt-get install -y tcc - name: compile run: > tcc -shared -O3 -o libblake3.so \ -DBLAKE3_NO_SSE2 -DBLAKE3_NO_SSE41 -DBLAKE3_NO_AVX2 -DBLAKE3_NO_AVX512 \ blake3.c blake3_dispatch.c blake3_portable.c working-directory: ./c # See https://github.com/BLAKE3-team/BLAKE3/issues/271 for why we test this. 
# Note that this isn't guaranteed to execute on an AVX-512-supporting server, # but hopefully at least some of the time it will. gcc54: name: "compile and test with GCC 5.4" runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: addnab/docker-run-action@v3 with: image: gcc:5.4 options: -v ${{ github.workspace }}:/work run: | cat /proc/cpuinfo curl https://sh.rustup.rs -sSf | sh -s -- -y --profile minimal cd /work ~/.cargo/bin/cargo test --features prefer_intrinsics # CMake build test (Library only), current macOS/Linux only. cmake_build: name: CMake ${{ matrix.os }} ${{ matrix.compiler }} runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: os: ["ubuntu-latest", "macOS-latest", "windows-latest"] compiler: [gcc, clang, cl] exclude: - os: windows-latest compiler: gcc - os: ubuntu-latest compiler: msvc - os: macOS-latest compiler: msvc steps: - uses: actions/checkout@v4 - name: CMake generation run: cmake -S c -B c/build -DCMAKE_INSTALL_PREFIX=${{github.workspace}}/target - name: CMake build / install run: cmake --build c/build --target install miri_smoketest: name: Miri smoketest runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@nightly with: components: miri # Currently the test search "miri" only matches "test_miri_smoketest", but # we might add more. If this accidentally picks up anything incompatible or # slow, we can narrow it. 
- run: cargo miri test miri blake3-1.5.4/.github/workflows/tag.yml000064400000000000000000000025031046102023000156670ustar 00000000000000name: publish_b3sum_binaries on: push: tags: - "*" env: BLAKE3_CI: "1" RUSTFLAGS: "-D warnings" jobs: cargo_tests: name: ${{ matrix.target.name }} runs-on: ${{ matrix.target.os }} strategy: fail-fast: false matrix: target: [ { "os": "ubuntu-latest", "rust-target": "x86_64-unknown-linux-musl", "name": "Linux" }, { "os": "macOS-latest", "rust-target": "x86_64-apple-darwin", "name": "macOS" }, { "os": "windows-latest", "rust-target": "x86_64-pc-windows-msvc", "name": "Windows" }, ] steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v4 with: python-version: "3.x" - run: pip install PyGithub - run: sudo apt-get install musl-tools if: matrix.target.os == 'ubuntu-latest' - uses: dtolnay/rust-toolchain@stable with: targets: ${{ matrix.target.rust-target }} - name: build b3sum id: build_b3sum run: python -u .github/workflows/build_b3sum.py ${{ matrix.target.rust-target }} - name: upload release asset env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_TAG: ${{ github.ref }} run: python -u .github/workflows/upload_github_release_asset.py ${{ steps.build_b3sum.outputs.bin_path }} blake3-1.5.4/.github/workflows/upload_github_release_asset.py000075500000000000000000000044561046102023000225040ustar 00000000000000#! 
/usr/bin/env python3 import github import os import sys import time RETRIES = 10 g = github.Github(os.environ["GITHUB_TOKEN"]) tag_name = os.environ["GITHUB_TAG"] tag_prefix = "refs/tags/" if tag_name.startswith(tag_prefix): tag_name = tag_name[len(tag_prefix) :] assert len(sys.argv) == 2 asset_path = sys.argv[1] asset_name = os.path.basename(asset_path) repo = g.get_repo(os.environ["GITHUB_REPOSITORY"]) tags = list(repo.get_tags()) for tag in tags: if tag.name == tag_name: break else: raise RuntimeError("no tag named " + repr(tag_name)) try: print("Creating GitHub release for tag " + repr(tag_name) + "...") repo.create_git_release(tag_name, tag_name, tag.commit.commit.message) except github.GithubException as github_error: if github_error.data["errors"][0]["code"] == "already_exists": print("Release for tag " + repr(tag_name) + " already exists.") else: raise def get_release(): for i in range(RETRIES): releases = list(repo.get_releases()) for release in releases: if release.tag_name == tag_name: return release print(f"Release for tag {repr(tag_name)} not found. Retrying...") time.sleep(1) raise RuntimeError("no release for tag " + repr(tag_name)) release = get_release() print("Uploading " + repr(asset_path) + "...") for i in range(RETRIES): try: print("Upload attempt #{} of {}...".format(i + 1, RETRIES)) release.upload_asset(asset_path) break except github.GithubException as github_error: # Unfortunately the asset upload API is flaky. Even worse, it often # partially succeeds, returning an error to the caller but leaving the # release in a state where subsequent uploads of the same asset will # fail with an "already_exists" error. (Though the asset is not visible # on github.com, so we can't just declare victory and move on.) If we # detect this case, explicitly delete the asset and continue retrying. print(github_error) for asset in release.get_assets(): if asset.name == asset_name: print("Found uploaded asset after failure. 
Deleting...") asset.delete_asset() else: raise RuntimeError("All upload attempts failed.") print("Success!") blake3-1.5.4/.gitignore000064400000000000000000000000221046102023000127560ustar 00000000000000Cargo.lock target blake3-1.5.4/CONTRIBUTING.md000064400000000000000000000022201046102023000132210ustar 00000000000000# Contributing We welcome and encourage third-party contributions to BLAKE3, be it reports of issues encountered while using the software or proposals of patches. ## Bug reports Bugs and other problems should be reported on [GitHub Issues](https://github.com/BLAKE3/BLAKE3/issues). If you report a bug, please: * Check that it's not already reported in the [GitHub Issues](https://github.com/BLAKE3/BLAKE3/issues). * Provide information to help us diagnose and ideally reproduce the bug. ## Patches We encourage you to fix a bug via a [GitHub Pull request](https://github.com/BLAKE3/BLAKE3/pulls), preferably after creating a related issue and referring it in the PR. If you contribute code and submit a patch, please note the following: * We use Rust's stable branch for developing BLAKE3. * Pull requests should target the `master` branch. * Try to follow the established Rust [style guidelines](https://doc.rust-lang.org/1.0.0/style/). Also please make sure to create new unit tests covering your code additions. You can execute the tests by running: ```bash cargo test ``` All third-party contributions will be recognized in the list of contributors. blake3-1.5.4/Cargo.toml0000644000000046030000000000100102050ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). 
# See Cargo.toml.orig for the original contents. [package] edition = "2021" name = "blake3" version = "1.5.4" authors = [ "Jack O'Connor ", "Samuel Neves", ] build = "build.rs" autobins = false autoexamples = false autotests = false autobenches = false description = "the BLAKE3 hash function" documentation = "https://docs.rs/blake3" readme = "README.md" license = "CC0-1.0 OR Apache-2.0 OR Apache-2.0 WITH LLVM-exception" repository = "https://github.com/BLAKE3-team/BLAKE3" [package.metadata.docs.rs] features = [ "mmap", "rayon", "serde", "zeroize", ] [lib] name = "blake3" path = "src/lib.rs" [[bench]] name = "bench" path = "benches/bench.rs" [dependencies.arrayref] version = "0.3.5" [dependencies.arrayvec] version = "0.7.4" default-features = false [dependencies.cfg-if] version = "1.0.0" [dependencies.constant_time_eq] version = "0.3.0" [dependencies.digest] version = "0.10.1" features = ["mac"] optional = true [dependencies.memmap2] version = "0.9" optional = true [dependencies.rayon-core] version = "1.12.1" optional = true [dependencies.serde] version = "1.0" features = ["derive"] optional = true default-features = false [dependencies.zeroize] version = "1" optional = true default-features = false [dev-dependencies.ciborium] version = "0.2.2" [dev-dependencies.hex] version = "0.4.2" [dev-dependencies.hmac] version = "0.12.0" [dev-dependencies.page_size] version = "0.6.0" [dev-dependencies.rand] version = "0.8.0" [dev-dependencies.rand_chacha] version = "0.3.0" [dev-dependencies.serde_json] version = "1.0.107" [dev-dependencies.tempfile] version = "3.8.0" [build-dependencies.cc] version = "1.1.12" [features] default = ["std"] mmap = [ "std", "dep:memmap2", ] neon = [] no_avx2 = [] no_avx512 = [] no_neon = [] no_sse2 = [] no_sse41 = [] prefer_intrinsics = [] pure = [] rayon = [ "dep:rayon-core", "std", ] std = [] traits-preview = ["dep:digest"] zeroize = [ "dep:zeroize", "arrayvec/zeroize", ] 
blake3-1.5.4/Cargo.toml.orig000064400000000000000000000122411046102023000136630ustar 00000000000000[package] name = "blake3" version = "1.5.4" authors = ["Jack O'Connor ", "Samuel Neves"] description = "the BLAKE3 hash function" repository = "https://github.com/BLAKE3-team/BLAKE3" license = "CC0-1.0 OR Apache-2.0 OR Apache-2.0 WITH LLVM-exception" documentation = "https://docs.rs/blake3" readme = "README.md" edition = "2021" [features] default = ["std"] # The NEON implementation does not participate in dynamic feature detection, # which is currently x86-only. If "neon" is on, NEON support is assumed. Note # that AArch64 always supports NEON, but support on ARMv7 varies. The NEON # implementation uses C intrinsics and requires a C compiler. neon = [] # This crate uses libstd for std::io trait implementations, and also for # runtime CPU feature detection. This feature is enabled by default. If you use # --no-default-features, the only way to use the SIMD implementations in this # crate is to enable the corresponding instruction sets statically for the # entire build, with e.g. RUSTFLAGS="-C target-cpu=native". std = [] # The `rayon` feature (disabled by default, but enabled for docs.rs) adds the # `update_rayon` and (in combination with `mmap` below) `update_mmap_rayon` # methods, for multithreaded hashing. However, even if this feature is enabled, # all other APIs remain single-threaded. # # Implementation detail: We take a dependency on rayon-core instead of rayon, # because it builds faster and still includes all the APIs we need. rayon = ["dep:rayon-core", "std"] # The `mmap` feature (disabled by default, but enabled for docs.rs) adds the # `update_mmap` and (in combination with `rayon` above) `update_mmap_rayon` # helper methods for memory-mapped IO. mmap = ["std", "dep:memmap2"] # Implement the zeroize::Zeroize trait for types in this crate. 
zeroize = ["dep:zeroize", "arrayvec/zeroize"] # This crate implements traits from the RustCrypto project, exposed here as the # "traits-preview" feature. However, these traits aren't stable, and they're # expected to change in incompatible ways before they reach 1.0. For that # reason, this crate makes no SemVer guarantees for this feature, and callers # who use it should expect breaking changes between patch versions of this # crate. (The "*-preview" feature name follows the conventions of the RustCrypto # "signature" crate.) traits-preview = ["dep:digest"] # ---------- Features below this line are undocumented and unstable. ---------- # The following features are mainly intended for testing and benchmarking, and # they might change or disappear at any time without a major version bump. # By default on x86_64, this crate uses Samuel Neves' hand-written assembly # implementations for SSE4.1, AVX2, and AVX512. (These provide both the best # runtime performance, and the fastest build times.) And by default on 32-bit # x86, this crate uses Rust intrinsics implementations for SSE4.1 and AVX2, and # a C intrinsics implementation for AVX-512. In both cases, if a C compiler is # not detected, or if AVX-512 support is missing from the detected compiler, # build.rs automatically falls back to a pure Rust build. This feature forces # that fallback, for testing purposes. (Note that in CI testing, we set the # BLAKE3_CI environment variable, which instructs build.rs to error out rather # than doing an automatic fallback.) pure = [] # As described above, on x86_64 this crate use assembly implementations by # default. Enabling the "prefer_intrinsics" feature makes this crate use # intrinsics implementations on both 32-bit and 64-bit x86, again for testing # purposes. prefer_intrinsics = [] # Disable individual instruction sets. CI testing uses these flags to simulate # different levels of hardware SIMD support. 
Note that code for the # corresponding instruction set is still compiled; only detection is disabled. # # As noted above, these flags are *for testing only* and are not stable. It's # possible that some users might find that their particular use case performs # better if e.g. AVX-512 is disabled, because of issues like CPU downclocking. # If that comes up, and if disabling the instruction set here at the feature # level turns out to be the right approach, then we can design a stable # feature. Until then, we reserve the right to break these features in a patch # release. no_sse2 = [] no_sse41 = [] no_avx2 = [] no_avx512 = [] no_neon = [] [package.metadata.docs.rs] # Document the rayon/mmap methods and the Serialize/Deserialize/Zeroize impls on docs.rs. features = ["mmap", "rayon", "serde", "zeroize"] [dependencies] arrayref = "0.3.5" arrayvec = { version = "0.7.4", default-features = false } constant_time_eq = "0.3.0" cfg-if = "1.0.0" digest = { version = "0.10.1", features = [ "mac" ], optional = true } memmap2 = { version = "0.9", optional = true } rayon-core = { version = "1.12.1", optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } zeroize = { version = "1", default-features = false, optional = true } [dev-dependencies] hmac = "0.12.0" hex = "0.4.2" page_size = "0.6.0" rand = "0.8.0" rand_chacha = "0.3.0" reference_impl = { path = "./reference_impl" } tempfile = "3.8.0" serde_json = "1.0.107" ciborium = "0.2.2" [build-dependencies] cc = "1.1.12" blake3-1.5.4/LICENSE_A2000064400000000000000000000261411046102023000123270ustar 00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. 
"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright 2019 Jack O'Connor and Samuel Neves Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. blake3-1.5.4/LICENSE_A2LLVM000064400000000000000000000277531046102023000130340ustar 00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. 
"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. 
If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright 2019 Jack O'Connor and Samuel Neves Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ---- LLVM Exceptions to the Apache 2.0 License ---- As an exception, if, as a result of your compiling your source code, portions of this Software are embedded into an Object form of such source code, you may redistribute such embedded portions in such Object form without complying with the conditions of Sections 4(a), 4(b) and 4(d) of the License. In addition, if you combine or link compiled forms of this Software with software that is licensed under the GPLv2 ("Combined Software") and if a court of competent jurisdiction determines that the patent provision (Section 3), the indemnity provision (Section 9) or other Section of the License conflicts with the conditions of the GPLv2, you may retroactively and prospectively choose to deem waived or otherwise exclude such Section(s) of the License, but only in their entirety and only with respect to the Combined Software. blake3-1.5.4/LICENSE_CC0000064400000000000000000000156101046102023000124310ustar 00000000000000Creative Commons Legal Code CC0 1.0 Universal CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN "AS-IS" BASIS. 
CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED HEREUNDER. Statement of Purpose The laws of most jurisdictions throughout the world automatically confer exclusive Copyright and Related Rights (defined below) upon the creator and subsequent owner(s) (each and all, an "owner") of an original work of authorship and/or a database (each, a "Work"). Certain owners wish to permanently relinquish those rights to a Work for the purpose of contributing to a commons of creative, cultural and scientific works ("Commons") that the public can reliably and without fear of later claims of infringement build upon, modify, incorporate in other works, reuse and redistribute as freely as possible in any form whatsoever and for any purposes, including without limitation commercial purposes. These owners may contribute to the Commons to promote the ideal of a free culture and the further production of creative, cultural and scientific works, or to gain reputation or greater distribution for their Work in part through the use and efforts of others. For these and/or other purposes and motivations, and without any expectation of additional consideration or compensation, the person associating CC0 with a Work (the "Affirmer"), to the extent that he or she is an owner of Copyright and Related Rights in the Work, voluntarily elects to apply CC0 to the Work and publicly distribute the Work under its terms, with knowledge of his or her Copyright and Related Rights in the Work and the meaning and intended legal effect of CC0 on those rights. 1. Copyright and Related Rights. A Work made available under CC0 may be protected by copyright and related or neighboring rights ("Copyright and Related Rights"). Copyright and Related Rights include, but are not limited to, the following: i. 
the right to reproduce, adapt, distribute, perform, display, communicate, and translate a Work; ii. moral rights retained by the original author(s) and/or performer(s); iii. publicity and privacy rights pertaining to a person's image or likeness depicted in a Work; iv. rights protecting against unfair competition in regards to a Work, subject to the limitations in paragraph 4(a), below; v. rights protecting the extraction, dissemination, use and reuse of data in a Work; vi. database rights (such as those arising under Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, and under any national implementation thereof, including any amended or successor version of such directive); and vii. other similar, equivalent or corresponding rights throughout the world based on applicable law or treaty, and any national implementations thereof. 2. Waiver. To the greatest extent permitted by, but not in contravention of, applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and unconditionally waives, abandons, and surrenders all of Affirmer's Copyright and Related Rights and associated claims and causes of action, whether now known or unknown (including existing as well as future claims and causes of action), in the Work (i) in all territories worldwide, (ii) for the maximum duration provided by applicable law or treaty (including future time extensions), (iii) in any current or future medium and for any number of copies, and (iv) for any purpose whatsoever, including without limitation commercial, advertising or promotional purposes (the "Waiver"). 
Affirmer makes the Waiver for the benefit of each member of the public at large and to the detriment of Affirmer's heirs and successors, fully intending that such Waiver shall not be subject to revocation, rescission, cancellation, termination, or any other legal or equitable action to disrupt the quiet enjoyment of the Work by the public as contemplated by Affirmer's express Statement of Purpose. 3. Public License Fallback. Should any part of the Waiver for any reason be judged legally invalid or ineffective under applicable law, then the Waiver shall be preserved to the maximum extent permitted taking into account Affirmer's express Statement of Purpose. In addition, to the extent the Waiver is so judged Affirmer hereby grants to each affected person a royalty-free, non transferable, non sublicensable, non exclusive, irrevocable and unconditional license to exercise Affirmer's Copyright and Related Rights in the Work (i) in all territories worldwide, (ii) for the maximum duration provided by applicable law or treaty (including future time extensions), (iii) in any current or future medium and for any number of copies, and (iv) for any purpose whatsoever, including without limitation commercial, advertising or promotional purposes (the "License"). The License shall be deemed effective as of the date CC0 was applied by Affirmer to the Work. Should any part of the License for any reason be judged legally invalid or ineffective under applicable law, such partial invalidity or ineffectiveness shall not invalidate the remainder of the License, and in such case Affirmer hereby affirms that he or she will not (i) exercise any of his or her remaining Copyright and Related Rights in the Work or (ii) assert any associated claims and causes of action with respect to the Work, in either case contrary to Affirmer's express Statement of Purpose. 4. Limitations and Disclaimers. a. 
No trademark or patent rights held by Affirmer are waived, abandoned, surrendered, licensed or otherwise affected by this document. b. Affirmer offers the Work as-is and makes no representations or warranties of any kind concerning the Work, express, implied, statutory or otherwise, including without limitation warranties of title, merchantability, fitness for a particular purpose, non infringement, or the absence of latent or other defects, accuracy, or the present or absence of errors, whether or not discoverable, all to the greatest extent permissible under applicable law. c. Affirmer disclaims responsibility for clearing rights of other persons that may apply to the Work or any use thereof, including without limitation any person's Copyright and Related Rights in the Work. Further, Affirmer disclaims responsibility for obtaining any necessary consents, permissions or other rights required for any use of the Work. d. Affirmer understands and acknowledges that Creative Commons is not a party to this document and has no duty or obligation with respect to this CC0 or use of the Work. blake3-1.5.4/README.md000064400000000000000000000216151046102023000122600ustar 00000000000000# BLAKE3 BLAKE3 is a cryptographic hash function that is: - **Much faster** than MD5, SHA-1, SHA-2, SHA-3, and BLAKE2. - **Secure**, unlike MD5 and SHA-1. And secure against length extension, unlike SHA-2. - **Highly parallelizable** across any number of threads and SIMD lanes, because it's a Merkle tree on the inside. - Capable of **verified streaming** and **incremental updates**, again because it's a Merkle tree. - A **PRF**, **MAC**, **KDF**, and **XOF**, as well as a regular hash. - **One algorithm with no variants**, which is fast on x86-64 and also on smaller architectures. The [chart below](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/benchmarks/bar_chart.py) is an example benchmark of 16 KiB inputs on a Cascade Lake-SP 8275CL server CPU from 2019. 
For more detailed benchmarks, see the [BLAKE3 paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf).

performance graph

BLAKE3 is based on an optimized instance of the established hash function [BLAKE2](https://blake2.net) and on the [original Bao tree mode](https://github.com/oconnor663/bao/blob/master/docs/spec_0.9.1.md). The specifications and design rationale are available in the [BLAKE3 paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf). The default output size is 256 bits. The current version of [Bao](https://github.com/oconnor663/bao) implements verified streaming with BLAKE3. This repository is the official implementation of BLAKE3. It includes: * The [`blake3`](https://crates.io/crates/blake3) Rust crate, which includes optimized implementations for SSE2, SSE4.1, AVX2, AVX-512, and NEON, with automatic runtime CPU feature detection on x86. The `rayon` feature provides multithreading. * The [`b3sum`](https://crates.io/crates/b3sum) Rust crate, which provides a command line interface. It uses multithreading by default, making it an order of magnitude faster than e.g. `sha256sum` on typical desktop hardware. * The [C implementation](c), which like the Rust implementation includes SIMD code and runtime CPU feature detection on x86. Unlike the Rust implementation, it's [not currently multithreaded](c#multithreading). See [`c/README.md`](c/README.md). * The [Rust reference implementation](reference_impl/reference_impl.rs), which is discussed in Section 5.1 of the [BLAKE3 paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf). This implementation is much smaller and simpler than the optimized ones above. If you want to see how BLAKE3 works, or you're writing a port that doesn't need multithreading or SIMD optimizations, start here. Ports of the reference implementation to other languages are hosted in separate repositories ([C](https://github.com/oconnor663/blake3_reference_impl_c), [Python](https://github.com/oconnor663/pure_python_blake3)). 
* A [set of test vectors](https://github.com/BLAKE3-team/BLAKE3/blob/master/test_vectors/test_vectors.json) that covers extended outputs, all three modes, and a variety of input lengths. * [![Actions Status](https://github.com/BLAKE3-team/BLAKE3/workflows/tests/badge.svg)](https://github.com/BLAKE3-team/BLAKE3/actions) BLAKE3 was designed by: * [@oconnor663](https://github.com/oconnor663) (Jack O'Connor) * [@sneves](https://github.com/sneves) (Samuel Neves) * [@veorq](https://github.com/veorq) (Jean-Philippe Aumasson) * [@zookozcash](https://github.com/zookozcash) (Zooko) The development of BLAKE3 was sponsored by [Electric Coin Company](https://electriccoin.co). *NOTE: BLAKE3 is not a password hashing algorithm, because it's designed to be fast, whereas password hashing should not be fast. If you hash passwords to store the hashes or if you derive keys from passwords, we recommend [Argon2](https://github.com/P-H-C/phc-winner-argon2).* ## Usage ### The `b3sum` utility The `b3sum` command line utility prints the BLAKE3 hashes of files or of standard input. Prebuilt binaries are available for Linux, Windows, and macOS (requiring the [unidentified developer workaround](https://support.apple.com/guide/mac-help/open-a-mac-app-from-an-unidentified-developer-mh40616/mac)) on the [releases page](https://github.com/BLAKE3-team/BLAKE3/releases). If you've [installed Rust and Cargo](https://doc.rust-lang.org/cargo/getting-started/installation.html), you can also build `b3sum` yourself with: ```bash cargo install b3sum ``` If `rustup` didn't configure your `PATH` for you, you might need to go looking for the installed binary in e.g. `~/.cargo/bin`. You can test out how fast BLAKE3 is on your machine by creating a big file and hashing it, for example: ```bash # Create a 1 GB file. head -c 1000000000 /dev/zero > /tmp/bigfile # Hash it with SHA-256. time openssl sha256 /tmp/bigfile # Hash it with BLAKE3. 
time b3sum /tmp/bigfile ``` ### The `blake3` crate [![docs.rs](https://docs.rs/blake3/badge.svg)](https://docs.rs/blake3) To use BLAKE3 from Rust code, add a dependency on the `blake3` crate to your `Cargo.toml`. Here's an example of hashing some input bytes: ```rust // Hash an input all at once. let hash1 = blake3::hash(b"foobarbaz"); // Hash an input incrementally. let mut hasher = blake3::Hasher::new(); hasher.update(b"foo"); hasher.update(b"bar"); hasher.update(b"baz"); let hash2 = hasher.finalize(); assert_eq!(hash1, hash2); // Extended output. OutputReader also implements Read and Seek. let mut output = [0; 1000]; let mut output_reader = hasher.finalize_xof(); output_reader.fill(&mut output); assert_eq!(hash1, output[..32]); // Print a hash as hex. println!("{}", hash1); ``` Besides `hash`, BLAKE3 provides two other modes, `keyed_hash` and `derive_key`. The `keyed_hash` mode takes a 256-bit key: ```rust // MAC an input all at once. let example_key = [42u8; 32]; let mac1 = blake3::keyed_hash(&example_key, b"example input"); // MAC incrementally. let mut hasher = blake3::Hasher::new_keyed(&example_key); hasher.update(b"example input"); let mac2 = hasher.finalize(); assert_eq!(mac1, mac2); ``` The `derive_key` mode takes a context string and some key material (not a password). The context string should be hardcoded, globally unique, and application-specific. A good default format for the context string is `"[application] [commit timestamp] [purpose]"`: ```rust // Derive a couple of subkeys for different purposes. 
const EMAIL_CONTEXT: &str = "BLAKE3 example 2020-01-07 17:10:44 email key"; const API_CONTEXT: &str = "BLAKE3 example 2020-01-07 17:11:21 API key"; let input_key_material = b"usually at least 32 random bytes, not a password"; let email_key = blake3::derive_key(EMAIL_CONTEXT, input_key_material); let api_key = blake3::derive_key(API_CONTEXT, input_key_material); assert_ne!(email_key, api_key); ``` ### The C implementation See [`c/README.md`](c/README.md). ### Other implementations We post links to third-party bindings and implementations on the [@BLAKE3team Twitter account](https://twitter.com/BLAKE3team) whenever we hear about them. Some highlights include [an optimized Go implementation](https://github.com/zeebo/blake3), [Wasm bindings for Node.js and browsers](https://github.com/connor4312/blake3), [binary wheels for Python](https://github.com/oconnor663/blake3-py), [.NET bindings](https://github.com/xoofx/Blake3.NET), and [JNI bindings](https://github.com/sken77/BLAKE3jni). ## Contributing Please see [CONTRIBUTING.md](CONTRIBUTING.md). ## Licenses This work is released into the public domain with [CC0 1.0](./LICENSE_CC0). 
Alternatively, it is licensed under any of the following: * [Apache 2.0](./LICENSE_A2) * [Apache 2.0 with LLVM exceptions](./LICENSE_A2LLVM) ## Adoption & deployment Here's a (non-exhaustive) list of protocols and software that use BLAKE3: * [Alephium](https://github.com/alephium/alephium/blob/master/crypto/src/main/scala/org/alephium/crypto/Blake3.scala) * [Bazel](https://github.com/bazelbuild/bazel/releases/tag/6.4.0) * [Chia](https://github.com/Chia-Network/chia-blockchain/blob/main/CHANGELOG.md#10beta8-aka-beta-18---2020-07-16) * [IPFS](https://github.com/ipfs/go-verifcid/issues/13) * [Farcaster](https://www.farcaster.xyz/) * [LLVM](https://reviews.llvm.org/D121510) * [Nym](https://github.com/nymtech/nym/blob/59056a22c5e6b01a38da2124662bd1fa3c8abef2/common/nymsphinx/params/src/lib.rs#L5) * [OpenZFS](https://github.com/openzfs/zfs/) * [Redox](https://www.redox-os.org/news/pkgar-introduction/) * [Saito](https://saito.tech/) * [Skale](https://github.com/skalenetwork/skale-consensus/pull/284) * [Solana](https://docs.rs/solana-program/1.9.5/solana_program/blake3/index.html) * [Tekken 8](https://en.bandainamcoent.eu/tekken/tekken-8) * [Wasmer](https://github.com/wasmerio/wasmer/blob/4f935a8c162bf604df223003e434e4f7ca253688/lib/cache/src/hash.rs#L21) ## Miscellany - [@veorq](https://github.com/veorq) and [@oconnor663](https://github.com/oconnor663) did [a podcast interview](https://www.cryptography.fm/3) about designing BLAKE3. blake3-1.5.4/benches/bench.rs000064400000000000000000000311401046102023000140270ustar 00000000000000#![feature(test)] extern crate test; use arrayref::array_ref; use arrayvec::ArrayVec; use blake3::guts::{BLOCK_LEN, CHUNK_LEN}; use blake3::platform::{Platform, MAX_SIMD_DEGREE}; use blake3::OUT_LEN; use rand::prelude::*; use test::Bencher; const KIB: usize = 1024; // This struct randomizes two things: // 1. The actual bytes of input. // 2. The page offset the input starts at. 
pub struct RandomInput { buf: Vec, len: usize, offsets: Vec, offset_index: usize, } impl RandomInput { pub fn new(b: &mut Bencher, len: usize) -> Self { b.bytes += len as u64; let page_size: usize = page_size::get(); let mut buf = vec![0u8; len + page_size]; let mut rng = rand::thread_rng(); rng.fill_bytes(&mut buf); let mut offsets: Vec = (0..page_size).collect(); offsets.shuffle(&mut rng); Self { buf, len, offsets, offset_index: 0, } } pub fn get(&mut self) -> &[u8] { let offset = self.offsets[self.offset_index]; self.offset_index += 1; if self.offset_index >= self.offsets.len() { self.offset_index = 0; } &self.buf[offset..][..self.len] } } fn bench_single_compression_fn(b: &mut Bencher, platform: Platform) { let mut state = [1u32; 8]; let mut r = RandomInput::new(b, 64); let input = array_ref!(r.get(), 0, 64); b.iter(|| platform.compress_in_place(&mut state, input, 64 as u8, 0, 0)); } #[bench] fn bench_single_compression_portable(b: &mut Bencher) { bench_single_compression_fn(b, Platform::portable()); } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_single_compression_sse2(b: &mut Bencher) { if let Some(platform) = Platform::sse2() { bench_single_compression_fn(b, platform); } } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_single_compression_sse41(b: &mut Bencher) { if let Some(platform) = Platform::sse41() { bench_single_compression_fn(b, platform); } } #[bench] #[cfg(blake3_avx512_ffi)] fn bench_single_compression_avx512(b: &mut Bencher) { if let Some(platform) = Platform::avx512() { bench_single_compression_fn(b, platform); } } fn bench_many_chunks_fn(b: &mut Bencher, platform: Platform) { let degree = platform.simd_degree(); let mut inputs = Vec::new(); for _ in 0..degree { inputs.push(RandomInput::new(b, CHUNK_LEN)); } b.iter(|| { let input_arrays: ArrayVec<&[u8; CHUNK_LEN], MAX_SIMD_DEGREE> = inputs .iter_mut() .take(degree) .map(|i| array_ref!(i.get(), 0, CHUNK_LEN)) .collect(); let mut out = [0; 
MAX_SIMD_DEGREE * OUT_LEN]; platform.hash_many( &input_arrays[..], &[0; 8], 0, blake3::IncrementCounter::Yes, 0, 0, 0, &mut out, ); }); } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_many_chunks_sse2(b: &mut Bencher) { if let Some(platform) = Platform::sse2() { bench_many_chunks_fn(b, platform); } } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_many_chunks_sse41(b: &mut Bencher) { if let Some(platform) = Platform::sse41() { bench_many_chunks_fn(b, platform); } } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_many_chunks_avx2(b: &mut Bencher) { if let Some(platform) = Platform::avx2() { bench_many_chunks_fn(b, platform); } } #[bench] #[cfg(blake3_avx512_ffi)] fn bench_many_chunks_avx512(b: &mut Bencher) { if let Some(platform) = Platform::avx512() { bench_many_chunks_fn(b, platform); } } #[bench] #[cfg(feature = "neon")] fn bench_many_chunks_neon(b: &mut Bencher) { if let Some(platform) = Platform::neon() { bench_many_chunks_fn(b, platform); } } // TODO: When we get const generics we can unify this with the chunks code. 
fn bench_many_parents_fn(b: &mut Bencher, platform: Platform) { let degree = platform.simd_degree(); let mut inputs = Vec::new(); for _ in 0..degree { inputs.push(RandomInput::new(b, BLOCK_LEN)); } b.iter(|| { let input_arrays: ArrayVec<&[u8; BLOCK_LEN], MAX_SIMD_DEGREE> = inputs .iter_mut() .take(degree) .map(|i| array_ref!(i.get(), 0, BLOCK_LEN)) .collect(); let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN]; platform.hash_many( &input_arrays[..], &[0; 8], 0, blake3::IncrementCounter::No, 0, 0, 0, &mut out, ); }); } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_many_parents_sse2(b: &mut Bencher) { if let Some(platform) = Platform::sse2() { bench_many_parents_fn(b, platform); } } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_many_parents_sse41(b: &mut Bencher) { if let Some(platform) = Platform::sse41() { bench_many_parents_fn(b, platform); } } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_many_parents_avx2(b: &mut Bencher) { if let Some(platform) = Platform::avx2() { bench_many_parents_fn(b, platform); } } #[bench] #[cfg(blake3_avx512_ffi)] fn bench_many_parents_avx512(b: &mut Bencher) { if let Some(platform) = Platform::avx512() { bench_many_parents_fn(b, platform); } } #[bench] #[cfg(feature = "neon")] fn bench_many_parents_neon(b: &mut Bencher) { if let Some(platform) = Platform::neon() { bench_many_parents_fn(b, platform); } } fn bench_atonce(b: &mut Bencher, len: usize) { let mut input = RandomInput::new(b, len); b.iter(|| blake3::hash(input.get())); } #[bench] fn bench_atonce_0001_block(b: &mut Bencher) { bench_atonce(b, BLOCK_LEN); } #[bench] fn bench_atonce_0001_kib(b: &mut Bencher) { bench_atonce(b, 1 * KIB); } #[bench] fn bench_atonce_0002_kib(b: &mut Bencher) { bench_atonce(b, 2 * KIB); } #[bench] fn bench_atonce_0004_kib(b: &mut Bencher) { bench_atonce(b, 4 * KIB); } #[bench] fn bench_atonce_0008_kib(b: &mut Bencher) { bench_atonce(b, 8 * KIB); } #[bench] fn 
bench_atonce_0016_kib(b: &mut Bencher) { bench_atonce(b, 16 * KIB); } #[bench] fn bench_atonce_0032_kib(b: &mut Bencher) { bench_atonce(b, 32 * KIB); } #[bench] fn bench_atonce_0064_kib(b: &mut Bencher) { bench_atonce(b, 64 * KIB); } #[bench] fn bench_atonce_0128_kib(b: &mut Bencher) { bench_atonce(b, 128 * KIB); } #[bench] fn bench_atonce_0256_kib(b: &mut Bencher) { bench_atonce(b, 256 * KIB); } #[bench] fn bench_atonce_0512_kib(b: &mut Bencher) { bench_atonce(b, 512 * KIB); } #[bench] fn bench_atonce_1024_kib(b: &mut Bencher) { bench_atonce(b, 1024 * KIB); } fn bench_incremental(b: &mut Bencher, len: usize) { let mut input = RandomInput::new(b, len); b.iter(|| blake3::Hasher::new().update(input.get()).finalize()); } #[bench] fn bench_incremental_0001_block(b: &mut Bencher) { bench_incremental(b, BLOCK_LEN); } #[bench] fn bench_incremental_0001_kib(b: &mut Bencher) { bench_incremental(b, 1 * KIB); } #[bench] fn bench_incremental_0002_kib(b: &mut Bencher) { bench_incremental(b, 2 * KIB); } #[bench] fn bench_incremental_0004_kib(b: &mut Bencher) { bench_incremental(b, 4 * KIB); } #[bench] fn bench_incremental_0008_kib(b: &mut Bencher) { bench_incremental(b, 8 * KIB); } #[bench] fn bench_incremental_0016_kib(b: &mut Bencher) { bench_incremental(b, 16 * KIB); } #[bench] fn bench_incremental_0032_kib(b: &mut Bencher) { bench_incremental(b, 32 * KIB); } #[bench] fn bench_incremental_0064_kib(b: &mut Bencher) { bench_incremental(b, 64 * KIB); } #[bench] fn bench_incremental_0128_kib(b: &mut Bencher) { bench_incremental(b, 128 * KIB); } #[bench] fn bench_incremental_0256_kib(b: &mut Bencher) { bench_incremental(b, 256 * KIB); } #[bench] fn bench_incremental_0512_kib(b: &mut Bencher) { bench_incremental(b, 512 * KIB); } #[bench] fn bench_incremental_1024_kib(b: &mut Bencher) { bench_incremental(b, 1024 * KIB); } fn bench_reference(b: &mut Bencher, len: usize) { let mut input = RandomInput::new(b, len); b.iter(|| { let mut hasher = reference_impl::Hasher::new(); 
hasher.update(input.get()); let mut out = [0; 32]; hasher.finalize(&mut out); out }); } #[bench] fn bench_reference_0001_block(b: &mut Bencher) { bench_reference(b, BLOCK_LEN); } #[bench] fn bench_reference_0001_kib(b: &mut Bencher) { bench_reference(b, 1 * KIB); } #[bench] fn bench_reference_0002_kib(b: &mut Bencher) { bench_reference(b, 2 * KIB); } #[bench] fn bench_reference_0004_kib(b: &mut Bencher) { bench_reference(b, 4 * KIB); } #[bench] fn bench_reference_0008_kib(b: &mut Bencher) { bench_reference(b, 8 * KIB); } #[bench] fn bench_reference_0016_kib(b: &mut Bencher) { bench_reference(b, 16 * KIB); } #[bench] fn bench_reference_0032_kib(b: &mut Bencher) { bench_reference(b, 32 * KIB); } #[bench] fn bench_reference_0064_kib(b: &mut Bencher) { bench_reference(b, 64 * KIB); } #[bench] fn bench_reference_0128_kib(b: &mut Bencher) { bench_reference(b, 128 * KIB); } #[bench] fn bench_reference_0256_kib(b: &mut Bencher) { bench_reference(b, 256 * KIB); } #[bench] fn bench_reference_0512_kib(b: &mut Bencher) { bench_reference(b, 512 * KIB); } #[bench] fn bench_reference_1024_kib(b: &mut Bencher) { bench_reference(b, 1024 * KIB); } #[cfg(feature = "rayon")] fn bench_rayon(b: &mut Bencher, len: usize) { let mut input = RandomInput::new(b, len); b.iter(|| blake3::Hasher::new().update_rayon(input.get()).finalize()); } #[bench] #[cfg(feature = "rayon")] fn bench_rayon_0001_block(b: &mut Bencher) { bench_rayon(b, BLOCK_LEN); } #[bench] #[cfg(feature = "rayon")] fn bench_rayon_0001_kib(b: &mut Bencher) { bench_rayon(b, 1 * KIB); } #[bench] #[cfg(feature = "rayon")] fn bench_rayon_0002_kib(b: &mut Bencher) { bench_rayon(b, 2 * KIB); } #[bench] #[cfg(feature = "rayon")] fn bench_rayon_0004_kib(b: &mut Bencher) { bench_rayon(b, 4 * KIB); } #[bench] #[cfg(feature = "rayon")] fn bench_rayon_0008_kib(b: &mut Bencher) { bench_rayon(b, 8 * KIB); } #[bench] #[cfg(feature = "rayon")] fn bench_rayon_0016_kib(b: &mut Bencher) { bench_rayon(b, 16 * KIB); } #[bench] #[cfg(feature = 
"rayon")] fn bench_rayon_0032_kib(b: &mut Bencher) { bench_rayon(b, 32 * KIB); } #[bench] #[cfg(feature = "rayon")] fn bench_rayon_0064_kib(b: &mut Bencher) { bench_rayon(b, 64 * KIB); } #[bench] #[cfg(feature = "rayon")] fn bench_rayon_0128_kib(b: &mut Bencher) { bench_rayon(b, 128 * KIB); } #[bench] #[cfg(feature = "rayon")] fn bench_rayon_0256_kib(b: &mut Bencher) { bench_rayon(b, 256 * KIB); } #[bench] #[cfg(feature = "rayon")] fn bench_rayon_0512_kib(b: &mut Bencher) { bench_rayon(b, 512 * KIB); } #[bench] #[cfg(feature = "rayon")] fn bench_rayon_1024_kib(b: &mut Bencher) { bench_rayon(b, 1024 * KIB); } // This checks that update() splits up its input in increasing powers of 2, so // that it can recover a high degree of parallelism when the number of bytes // hashed so far is uneven. The performance of this benchmark should be // reasonably close to bench_incremental_0064_kib, within 80% or so. When we // had a bug in this logic (https://github.com/BLAKE3-team/BLAKE3/issues/69), // performance was less than half. 
#[bench] fn bench_two_updates(b: &mut Bencher) { let len = 65536; let mut input = RandomInput::new(b, len); b.iter(|| { let mut hasher = blake3::Hasher::new(); let input = input.get(); hasher.update(&input[..1]); hasher.update(&input[1..]); hasher.finalize() }); } fn bench_xof(b: &mut Bencher, len: usize) { b.bytes = len as u64; let mut output = [0u8; 64 * BLOCK_LEN]; let output_slice = &mut output[..len]; let mut xof = blake3::Hasher::new().finalize_xof(); b.iter(|| xof.fill(output_slice)); } #[bench] fn bench_xof_01_block(b: &mut Bencher) { bench_xof(b, BLOCK_LEN); } #[bench] fn bench_xof_02_blocks(b: &mut Bencher) { bench_xof(b, 2 * BLOCK_LEN); } #[bench] fn bench_xof_04_blocks(b: &mut Bencher) { bench_xof(b, 4 * BLOCK_LEN); } #[bench] fn bench_xof_08_blocks(b: &mut Bencher) { bench_xof(b, 8 * BLOCK_LEN); } #[bench] fn bench_xof_16_blocks(b: &mut Bencher) { bench_xof(b, 16 * BLOCK_LEN); } #[bench] fn bench_xof_32_blocks(b: &mut Bencher) { bench_xof(b, 32 * BLOCK_LEN); } #[bench] fn bench_xof_64_blocks(b: &mut Bencher) { bench_xof(b, 64 * BLOCK_LEN); } blake3-1.5.4/build.rs000064400000000000000000000247101046102023000124450ustar 00000000000000use std::env; fn defined(var: &str) -> bool { println!("cargo:rerun-if-env-changed={}", var); env::var_os(var).is_some() } fn is_pure() -> bool { defined("CARGO_FEATURE_PURE") } fn should_prefer_intrinsics() -> bool { defined("CARGO_FEATURE_PREFER_INTRINSICS") } fn is_neon() -> bool { defined("CARGO_FEATURE_NEON") } fn is_no_neon() -> bool { defined("CARGO_FEATURE_NO_NEON") } fn is_ci() -> bool { defined("BLAKE3_CI") } fn warn(warning: &str) { assert!(!warning.contains("\n")); println!("cargo:warning={}", warning); if is_ci() { println!("cargo:warning=Warnings in CI are treated as errors. 
Build failed."); std::process::exit(1); } } fn target_components() -> Vec { let target = env::var("TARGET").unwrap(); target.split("-").map(|s| s.to_string()).collect() } fn is_x86_64() -> bool { target_components()[0] == "x86_64" } fn is_x86_32() -> bool { let arch = &target_components()[0]; arch == "i386" || arch == "i586" || arch == "i686" } fn is_arm() -> bool { is_armv7() || is_aarch64() || target_components()[0] == "arm" } fn is_aarch64() -> bool { target_components()[0] == "aarch64" } fn is_armv7() -> bool { target_components()[0] == "armv7" } fn endianness() -> String { let endianness = env::var("CARGO_CFG_TARGET_ENDIAN").unwrap(); assert!(endianness == "little" || endianness == "big"); endianness } fn is_little_endian() -> bool { endianness() == "little" } fn is_big_endian() -> bool { endianness() == "big" } // Windows targets may be using the MSVC toolchain or the MinGW toolchain. The // right compiler flags to use depend on the toolchain. (And we don't want to // use flag_if_supported, because we don't want features to be silently // disabled by old compilers.) fn is_windows_msvc() -> bool { // Some targets are only two components long, so check in steps. target_components()[1] == "pc" && target_components()[2] == "windows" && target_components()[3] == "msvc" } // MinGW toolchain uses 2 different targets depending on the main compiler. // Target for a general MinGW toolchain ends with `-gnu` (GCC is used as C // compiler). Target for a LLVM-MinGW toolchain (Clang is used as C compiler) // ends with `-gnullvm`. fn is_windows_gnu() -> bool { // Some targets are only two components long, so check in steps. target_components()[1] == "pc" && target_components()[2] == "windows" && target_components()[3] != "msvc" } fn new_build() -> cc::Build { let mut build = cc::Build::new(); if !is_windows_msvc() { build.flag("-std=c11"); } // Do NOT trigger a rebuild any time the env changes (e.g. $PATH). 
// This prevents all downstream crates from being rebuilt when `cargo check` // or `cargo build` are run in different environments, like Rust Analyzer // vs. in the terminal vs. in a Git pre-commit hook. build.emit_rerun_if_env_changed(false); build } #[derive(PartialEq)] enum CCompilerSupport { NoCompiler, NoAVX512, YesAVX512, } use CCompilerSupport::*; fn c_compiler_support() -> CCompilerSupport { let build = new_build(); let flags_checked; let support_result: Result = if is_windows_msvc() { flags_checked = "/arch:AVX512"; build.is_flag_supported("/arch:AVX512") } else { // Check for both of the flags we use. If -mavx512f works, then -mavx512vl // will probably always work too, but we might as well be thorough. flags_checked = "-mavx512f and -mavx512vl"; match build.is_flag_supported("-mavx512f") { Ok(true) => build.is_flag_supported("-mavx512vl"), false_or_error => false_or_error, } }; match support_result { Ok(true) => YesAVX512, Ok(false) => { warn(&format!( "The C compiler {:?} does not support {}.", build.get_compiler().path(), flags_checked, )); NoAVX512 } Err(e) => { println!("{:?}", e); warn(&format!( "No C compiler {:?} detected.", build.get_compiler().path() )); NoCompiler } } } fn build_sse2_sse41_avx2_rust_intrinsics() { // No C code to compile here. Set the cfg flags that enable the Rust SSE2, // SSE4.1, and AVX2 intrinsics modules. The regular Cargo build will compile // them. println!("cargo:rustc-cfg=blake3_sse2_rust"); println!("cargo:rustc-cfg=blake3_sse41_rust"); println!("cargo:rustc-cfg=blake3_avx2_rust"); } fn build_sse2_sse41_avx2_assembly() { // Build the assembly implementations for SSE4.1 and AVX2. This is // preferred, but it only supports x86_64. 
assert!(is_x86_64()); println!("cargo:rustc-cfg=blake3_sse2_ffi"); println!("cargo:rustc-cfg=blake3_sse41_ffi"); println!("cargo:rustc-cfg=blake3_avx2_ffi"); let mut build = new_build(); if is_windows_msvc() { build.file("c/blake3_sse2_x86-64_windows_msvc.asm"); build.file("c/blake3_sse41_x86-64_windows_msvc.asm"); build.file("c/blake3_avx2_x86-64_windows_msvc.asm"); } else if is_windows_gnu() { build.file("c/blake3_sse2_x86-64_windows_gnu.S"); build.file("c/blake3_sse41_x86-64_windows_gnu.S"); build.file("c/blake3_avx2_x86-64_windows_gnu.S"); } else { // All non-Windows implementations are assumed to support // Linux-style assembly. These files do contain a small // explicit workaround for macOS also. build.file("c/blake3_sse2_x86-64_unix.S"); build.file("c/blake3_sse41_x86-64_unix.S"); build.file("c/blake3_avx2_x86-64_unix.S"); } build.compile("blake3_sse2_sse41_avx2_assembly"); } fn build_avx512_c_intrinsics() { // This is required on 32-bit x86 targets, since the assembly // implementation doesn't support those. println!("cargo:rustc-cfg=blake3_avx512_ffi"); let mut build = new_build(); build.file("c/blake3_avx512.c"); if is_windows_msvc() { build.flag("/arch:AVX512"); } else { build.flag("-mavx512f"); build.flag("-mavx512vl"); } if is_windows_gnu() { // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65782. build.flag("-fno-asynchronous-unwind-tables"); } build.compile("blake3_avx512_intrinsics"); } fn build_avx512_assembly() { // Build the assembly implementation for AVX-512. This is preferred, but it // only supports x86_64. assert!(is_x86_64()); println!("cargo:rustc-cfg=blake3_avx512_ffi"); let mut build = new_build(); if is_windows_msvc() { build.file("c/blake3_avx512_x86-64_windows_msvc.asm"); } else { if is_windows_gnu() { build.file("c/blake3_avx512_x86-64_windows_gnu.S"); } else { // All non-Windows implementations are assumed to support Linux-style // assembly. These files do contain a small explicit workaround for // macOS also. 
build.file("c/blake3_avx512_x86-64_unix.S"); } // Older versions of Clang require these flags, even for assembly. See // https://github.com/BLAKE3-team/BLAKE3/issues/79. build.flag("-mavx512f"); build.flag("-mavx512vl"); } build.compile("blake3_avx512_assembly"); } fn build_neon_c_intrinsics() { let mut build = new_build(); // Note that blake3_neon.c normally depends on the blake3_portable.c // for the single-instance compression function, but we expose // portable.rs over FFI instead. See ffi_neon.rs. build.file("c/blake3_neon.c"); // ARMv7 platforms that support NEON generally need the following // flags. AArch64 supports NEON by default and does not support -mpfu. if is_armv7() { build.flag("-mfpu=neon-vfpv4"); build.flag("-mfloat-abi=hard"); } build.compile("blake3_neon"); } fn main() -> Result<(), Box> { // As of Rust 1.80, unrecognized config names are warnings. Give Cargo all of our config names. let all_cfgs = [ "blake3_sse2_ffi", "blake3_sse2_rust", "blake3_sse41_ffi", "blake3_sse41_rust", "blake3_avx2_ffi", "blake3_avx2_rust", "blake3_avx512_ffi", "blake3_neon", ]; for cfg_name in all_cfgs { // TODO: Switch this whole file to the new :: syntax when our MSRV reaches 1.77. // https://doc.rust-lang.org/cargo/reference/build-scripts.html#outputs-of-the-build-script println!("cargo:rustc-check-cfg=cfg({cfg_name}, values(none()))"); } if is_pure() && is_neon() { panic!("It doesn't make sense to enable both \"pure\" and \"neon\"."); } if is_no_neon() && is_neon() { panic!("It doesn't make sense to enable both \"no_neon\" and \"neon\"."); } if is_x86_64() || is_x86_32() { let support = c_compiler_support(); if is_x86_32() || should_prefer_intrinsics() || is_pure() || support == NoCompiler { build_sse2_sse41_avx2_rust_intrinsics(); } else { // We assume that all C compilers can assemble SSE4.1 and AVX2. We // don't explicitly check for support. 
build_sse2_sse41_avx2_assembly(); } if is_pure() || support == NoCompiler || support == NoAVX512 { // The binary will not include any AVX-512 code. } else if is_x86_32() || should_prefer_intrinsics() { build_avx512_c_intrinsics(); } else { build_avx512_assembly(); } } if is_neon() && is_big_endian() { panic!("The NEON implementation doesn't support big-endian ARM.") } if (is_arm() && is_neon()) || (!is_no_neon() && !is_pure() && is_aarch64() && is_little_endian()) { println!("cargo:rustc-cfg=blake3_neon"); build_neon_c_intrinsics(); } // The `cc` crate doesn't automatically emit rerun-if directives for the // environment variables it supports, in particular for $CC. We expect to // do a lot of benchmarking across different compilers, so we explicitly // add the variables that we're likely to need. println!("cargo:rerun-if-env-changed=CC"); println!("cargo:rerun-if-env-changed=CFLAGS"); // Ditto for source files, though these shouldn't change as often. for file in std::fs::read_dir("c")? 
{ println!( "cargo:rerun-if-changed={}", file?.path().to_str().expect("utf-8") ); } Ok(()) } blake3-1.5.4/c/.gitignore000064400000000000000000000000611046102023000132030ustar 00000000000000blake3 example build/ *.o CMakeUserPresets.json blake3-1.5.4/c/CMakeLists.txt000064400000000000000000000207301046102023000137600ustar 00000000000000cmake_minimum_required(VERSION 3.9 FATAL_ERROR) # respect C_EXTENSIONS OFF without explicitly setting C_STANDARD if (POLICY CMP0128) cmake_policy(SET CMP0128 NEW) endif() # mark_as_advanced does not implicitly create UNINITIALIZED cache entries if (POLICY CMP0102) cmake_policy(SET CMP0102 NEW) endif() project(libblake3 VERSION 1.5.4 DESCRIPTION "BLAKE3 C implementation" LANGUAGES C ASM ) include(FeatureSummary) include(GNUInstallDirs) # architecture lists for which to enable assembly / SIMD sources set(BLAKE3_AMD64_NAMES amd64 AMD64 x86_64) set(BLAKE3_X86_NAMES i686 x86 X86) set(BLAKE3_ARMv8_NAMES aarch64 AArch64 arm64 ARM64 armv8 armv8a) # default SIMD compiler flag configuration (can be overriden by toolchains or CLI) if(MSVC) set(BLAKE3_CFLAGS_SSE2 "/arch:SSE2" CACHE STRING "the compiler flags to enable SSE2") # MSVC has no dedicated sse4.1 flag (see https://learn.microsoft.com/en-us/cpp/build/reference/arch-x86?view=msvc-170) set(BLAKE3_CFLAGS_SSE4.1 "/arch:AVX" CACHE STRING "the compiler flags to enable SSE4.1") set(BLAKE3_CFLAGS_AVX2 "/arch:AVX2" CACHE STRING "the compiler flags to enable AVX2") set(BLAKE3_CFLAGS_AVX512 "/arch:AVX512" CACHE STRING "the compiler flags to enable AVX512") set(BLAKE3_AMD64_ASM_SOURCES blake3_avx2_x86-64_windows_msvc.asm blake3_avx512_x86-64_windows_msvc.asm blake3_sse2_x86-64_windows_msvc.asm blake3_sse41_x86-64_windows_msvc.asm ) elseif(CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_C_COMPILER_ID STREQUAL "Clang" OR CMAKE_C_COMPILER_ID STREQUAL "AppleClang") set(BLAKE3_CFLAGS_SSE2 "-msse2" CACHE STRING "the compiler flags to enable SSE2") set(BLAKE3_CFLAGS_SSE4.1 "-msse4.1" CACHE STRING "the compiler 
flags to enable SSE4.1")
  set(BLAKE3_CFLAGS_AVX2 "-mavx2" CACHE STRING "the compiler flags to enable AVX2")
  set(BLAKE3_CFLAGS_AVX512 "-mavx512f -mavx512vl" CACHE STRING "the compiler flags to enable AVX512")

  if (WIN32)
    set(BLAKE3_AMD64_ASM_SOURCES
      blake3_avx2_x86-64_windows_gnu.S
      blake3_avx512_x86-64_windows_gnu.S
      blake3_sse2_x86-64_windows_gnu.S
      blake3_sse41_x86-64_windows_gnu.S
    )
  elseif(UNIX)
    set(BLAKE3_AMD64_ASM_SOURCES
      blake3_avx2_x86-64_unix.S
      blake3_avx512_x86-64_unix.S
      blake3_sse2_x86-64_unix.S
      blake3_sse41_x86-64_unix.S
    )
  endif()

  if (CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES
      AND NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
    # 32-bit ARMv8 needs NEON to be enabled explicitly
    set(BLAKE3_CFLAGS_NEON "-mfpu=neon" CACHE STRING "the compiler flags to enable NEON")
  endif()
endif()

mark_as_advanced(BLAKE3_CFLAGS_SSE2 BLAKE3_CFLAGS_SSE4.1 BLAKE3_CFLAGS_AVX2 BLAKE3_CFLAGS_AVX512 BLAKE3_CFLAGS_NEON)
mark_as_advanced(BLAKE3_AMD64_ASM_SOURCES)

message(STATUS "BLAKE3 SIMD configuration: ${CMAKE_C_COMPILER_ARCHITECTURE_ID}")
# Pick the default SIMD backend. Each set() is a cache default, so a value
# already in the cache (e.g. from -DBLAKE3_SIMD_TYPE=...) is left untouched.
if(MSVC AND DEFINED CMAKE_C_COMPILER_ARCHITECTURE_ID)
  if(CMAKE_C_COMPILER_ARCHITECTURE_ID MATCHES "[Xx]86")
    set(BLAKE3_SIMD_TYPE "x86-intrinsics" CACHE STRING "the SIMD acceleration type to use")
  elseif(CMAKE_C_COMPILER_ARCHITECTURE_ID MATCHES "[Xx]64")
    set(BLAKE3_SIMD_TYPE "amd64-asm" CACHE STRING "the SIMD acceleration type to use")
  elseif(CMAKE_C_COMPILER_ARCHITECTURE_ID MATCHES "[Aa][Rr][Mm]64")
    set(BLAKE3_SIMD_TYPE "neon-intrinsics" CACHE STRING "the SIMD acceleration type to use")
  else()
    set(BLAKE3_SIMD_TYPE "none" CACHE STRING "the SIMD acceleration type to use")
  endif()
elseif(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_AMD64_NAMES)
  set(BLAKE3_SIMD_TYPE "amd64-asm" CACHE STRING "the SIMD acceleration type to use")
elseif(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_X86_NAMES
       AND DEFINED BLAKE3_CFLAGS_SSE2
       AND DEFINED BLAKE3_CFLAGS_SSE4.1
       AND DEFINED BLAKE3_CFLAGS_AVX2
       AND DEFINED BLAKE3_CFLAGS_AVX512)
  set(BLAKE3_SIMD_TYPE "x86-intrinsics" CACHE STRING "the SIMD acceleration type to use")
elseif((CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES
          OR ANDROID_ABI STREQUAL "armeabi-v7a"
          OR BLAKE3_USE_NEON_INTRINSICS)
        AND (DEFINED BLAKE3_CFLAGS_NEON
          OR CMAKE_SIZEOF_VOID_P EQUAL 8))
  set(BLAKE3_SIMD_TYPE "neon-intrinsics" CACHE STRING "the SIMD acceleration type to use")
else()
  set(BLAKE3_SIMD_TYPE "none" CACHE STRING "the SIMD acceleration type to use")
endif()

mark_as_advanced(BLAKE3_SIMD_TYPE)

# library target
add_library(blake3
  blake3.c
  blake3_dispatch.c
  blake3_portable.c
)
add_library(BLAKE3::blake3 ALIAS blake3)

# library configuration
set(BLAKE3_PKGCONFIG_CFLAGS)
if (BUILD_SHARED_LIBS)
  target_compile_definitions(blake3
    PUBLIC BLAKE3_DLL
    PRIVATE BLAKE3_DLL_EXPORTS
  )
  list(APPEND BLAKE3_PKGCONFIG_CFLAGS -DBLAKE3_DLL)
endif()
# The build tree uses the source directory for headers; an installed package
# uses the install include directory.
target_include_directories(blake3 PUBLIC
  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
  $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
)
set_target_properties(blake3 PROPERTIES
  VERSION ${PROJECT_VERSION}
  SOVERSION 0
  C_VISIBILITY_PRESET hidden
  C_EXTENSIONS OFF
)
target_compile_features(blake3 PUBLIC c_std_99)
# ensure C_EXTENSIONS OFF is respected without overriding CMAKE_C_STANDARD
# which may be set by the user or toolchain file
if (NOT POLICY CMP0128 AND NOT DEFINED CMAKE_C_STANDARD)
  set_target_properties(blake3 PROPERTIES C_STANDARD 99)
endif()

# optional SIMD sources
if(BLAKE3_SIMD_TYPE STREQUAL "amd64-asm")
  if (NOT DEFINED BLAKE3_AMD64_ASM_SOURCES)
    message(FATAL_ERROR "BLAKE3_SIMD_TYPE is set to 'amd64-asm' but no assembly sources are available for the target architecture.")
  endif()
  set(BLAKE3_SIMD_AMD64_ASM ON)

  if(MSVC)
    enable_language(ASM_MASM)
  endif()

  target_sources(blake3 PRIVATE ${BLAKE3_AMD64_ASM_SOURCES})

elseif(BLAKE3_SIMD_TYPE STREQUAL "x86-intrinsics")
  if (NOT DEFINED BLAKE3_CFLAGS_SSE2
      OR NOT DEFINED BLAKE3_CFLAGS_SSE4.1
      OR NOT DEFINED BLAKE3_CFLAGS_AVX2
      OR NOT DEFINED BLAKE3_CFLAGS_AVX512)
    message(FATAL_ERROR "BLAKE3_SIMD_TYPE is set to 'x86-intrinsics' but no compiler flags are available for the target architecture.")
  endif()
  set(BLAKE3_SIMD_X86_INTRINSICS ON)

  target_sources(blake3 PRIVATE
    blake3_avx2.c
    blake3_avx512.c
    blake3_sse2.c
    blake3_sse41.c
  )
  set_source_files_properties(blake3_avx2.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_AVX2}")
  set_source_files_properties(blake3_avx512.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_AVX512}")
  set_source_files_properties(blake3_sse2.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE2}")
  set_source_files_properties(blake3_sse41.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE4.1}")

elseif(BLAKE3_SIMD_TYPE STREQUAL "neon-intrinsics")
  set(BLAKE3_SIMD_NEON_INTRINSICS ON)

  target_sources(blake3 PRIVATE
    blake3_neon.c
  )
  target_compile_definitions(blake3 PRIVATE
    BLAKE3_USE_NEON=1
  )

  if (DEFINED BLAKE3_CFLAGS_NEON)
    set_source_files_properties(blake3_neon.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_NEON}")
  endif()

elseif(BLAKE3_SIMD_TYPE STREQUAL "none")
  target_compile_definitions(blake3 PRIVATE
    BLAKE3_USE_NEON=0
    BLAKE3_NO_SSE2
    BLAKE3_NO_SSE41
    BLAKE3_NO_AVX2
    BLAKE3_NO_AVX512
  )

else()
  message(FATAL_ERROR "BLAKE3_SIMD_TYPE is set to an unknown value: '${BLAKE3_SIMD_TYPE}'")
endif()

# cmake install support
install(FILES blake3.h DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")
install(TARGETS blake3 EXPORT blake3-targets)
install(EXPORT blake3-targets
  NAMESPACE BLAKE3::
  DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/blake3"
)

include(CMakePackageConfigHelpers)
configure_package_config_file(blake3-config.cmake.in
  "${CMAKE_CURRENT_BINARY_DIR}/blake3-config.cmake"
  INSTALL_DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/blake3"
)
write_basic_package_version_file(
  "${CMAKE_CURRENT_BINARY_DIR}/blake3-config-version.cmake"
  VERSION ${libblake3_VERSION}
  COMPATIBILITY SameMajorVersion
)
install(FILES
  "${CMAKE_CURRENT_BINARY_DIR}/blake3-config.cmake"
  "${CMAKE_CURRENT_BINARY_DIR}/blake3-config-version.cmake"
  DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/blake3"
)

# configure_file() resolves a relative output path against
# CMAKE_CURRENT_BINARY_DIR, so install from there. CMAKE_BINARY_DIR is a
# different directory when this project is included via add_subdirectory().
configure_file(libblake3.pc.in libblake3.pc @ONLY)
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/libblake3.pc"
  DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")

# print feature
summary # add_feature_info cannot directly use the BLAKE3_SIMD_TYPE :( add_feature_info("AMD64 assembly" BLAKE3_SIMD_AMD64_ASM "The library uses hand written amd64 SIMD assembly.") add_feature_info("x86 SIMD intrinsics" BLAKE3_SIMD_X86_INTRINSICS "The library uses x86 SIMD intrinsics.") add_feature_info("NEON SIMD intrinsics" BLAKE3_SIMD_NEON_INTRINSICS "The library uses NEON SIMD intrinsics.") feature_summary(WHAT ENABLED_FEATURES) blake3-1.5.4/c/CMakePresets.json000064400000000000000000000034701046102023000144430ustar 00000000000000{ "version": 3, "cmakeMinimumRequired": { "major": 3, "minor": 22, "patch": 0 }, "configurePresets": [ { "name": "base", "hidden": true, "binaryDir": "${sourceDir}/build/${presetName}" }, { "name": "msvc", "hidden": true, "generator": "Visual Studio 17 2022", "vendor": { "microsoft.com/VisualStudioSettings/CMake/1.0": { "hostOS": [ "Windows" ] } } }, { "name": "x64-windows-msvc", "inherits": [ "msvc", "base" ], "architecture": "x64" }, { "name": "x86-windows-msvc", "inherits": [ "msvc", "base" ], "architecture": "Win32" }, { "name": "arm64-windows-msvc", "inherits": [ "msvc", "base" ], "architecture": "ARM64" } ], "buildPresets": [ { "name": "x64-windows-msvc-debug", "configurePreset": "x64-windows-msvc", "configuration": "Debug" }, { "name": "x64-windows-msvc-release", "configurePreset": "x64-windows-msvc", "configuration": "RelWithDebInfo" }, { "name": "x86-windows-msvc-debug", "configurePreset": "x86-windows-msvc", "configuration": "Debug" }, { "name": "x86-windows-msvc-release", "configurePreset": "x86-windows-msvc", "configuration": "RelWithDebInfo" } ] }blake3-1.5.4/c/Makefile.testing000064400000000000000000000040111046102023000143260ustar 00000000000000# This Makefile is only for testing. C callers should follow the instructions # in ./README.md to incorporate these C files into their existing build. 
NAME=blake3 CC=gcc CFLAGS=-O3 -Wall -Wextra -std=c11 -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2 -fPIE -fvisibility=hidden LDFLAGS=-pie -Wl,-z,relro,-z,now TARGETS= ASM_TARGETS= EXTRAFLAGS=-Wa,--noexecstack ifdef BLAKE3_NO_SSE2 EXTRAFLAGS += -DBLAKE3_NO_SSE2 else TARGETS += blake3_sse2.o ASM_TARGETS += blake3_sse2_x86-64_unix.S endif ifdef BLAKE3_NO_SSE41 EXTRAFLAGS += -DBLAKE3_NO_SSE41 else TARGETS += blake3_sse41.o ASM_TARGETS += blake3_sse41_x86-64_unix.S endif ifdef BLAKE3_NO_AVX2 EXTRAFLAGS += -DBLAKE3_NO_AVX2 else TARGETS += blake3_avx2.o ASM_TARGETS += blake3_avx2_x86-64_unix.S endif ifdef BLAKE3_NO_AVX512 EXTRAFLAGS += -DBLAKE3_NO_AVX512 else TARGETS += blake3_avx512.o ASM_TARGETS += blake3_avx512_x86-64_unix.S endif ifdef BLAKE3_USE_NEON EXTRAFLAGS += -DBLAKE3_USE_NEON=1 TARGETS += blake3_neon.o endif ifdef BLAKE3_NO_NEON EXTRAFLAGS += -DBLAKE3_USE_NEON=0 endif all: blake3.c blake3_dispatch.c blake3_portable.c main.c $(TARGETS) $(CC) $(CFLAGS) $(EXTRAFLAGS) $^ -o $(NAME) $(LDFLAGS) blake3_sse2.o: blake3_sse2.c $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -msse2 blake3_sse41.o: blake3_sse41.c $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -msse4.1 blake3_avx2.o: blake3_avx2.c $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -mavx2 blake3_avx512.o: blake3_avx512.c $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -mavx512f -mavx512vl blake3_neon.o: blake3_neon.c $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ test: CFLAGS += -DBLAKE3_TESTING -fsanitize=address,undefined test: all ./test.py asm: blake3.c blake3_dispatch.c blake3_portable.c main.c $(ASM_TARGETS) $(CC) $(CFLAGS) $(EXTRAFLAGS) $^ -o $(NAME) $(LDFLAGS) test_asm: CFLAGS += -DBLAKE3_TESTING -fsanitize=address,undefined test_asm: asm ./test.py example: example.c blake3.c blake3_dispatch.c blake3_portable.c $(ASM_TARGETS) $(CC) $(CFLAGS) $(EXTRAFLAGS) $^ -o $@ $(LDFLAGS) clean: rm -f $(NAME) *.o blake3-1.5.4/c/README.md000064400000000000000000000253131046102023000125010ustar 00000000000000The official C implementation 
of BLAKE3. # Example An example program that hashes bytes from standard input and prints the result: ```c #include "blake3.h" #include <errno.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <unistd.h> int main(void) { // Initialize the hasher. blake3_hasher hasher; blake3_hasher_init(&hasher); // Read input bytes from stdin. unsigned char buf[65536]; while (1) { ssize_t n = read(STDIN_FILENO, buf, sizeof(buf)); if (n > 0) { blake3_hasher_update(&hasher, buf, n); } else if (n == 0) { break; // end of file } else { fprintf(stderr, "read failed: %s\n", strerror(errno)); exit(1); } } // Finalize the hash. BLAKE3_OUT_LEN is the default output length, 32 bytes. uint8_t output[BLAKE3_OUT_LEN]; blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN); // Print the hash as hexadecimal. for (size_t i = 0; i < BLAKE3_OUT_LEN; i++) { printf("%02x", output[i]); } printf("\n"); return 0; } ``` The code above is included in this directory as `example.c`. If you're on x86\_64 with a Unix-like OS, you can compile a working binary like this: ```bash gcc -O3 -o example example.c blake3.c blake3_dispatch.c blake3_portable.c \ blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S \ blake3_avx512_x86-64_unix.S ``` # API ## The Struct ```c typedef struct { // private fields } blake3_hasher; ``` An incremental BLAKE3 hashing state, which can accept any number of updates. This implementation doesn't allocate any heap memory, but `sizeof(blake3_hasher)` itself is relatively large, currently 1912 bytes on x86-64. This size can be reduced by restricting the maximum input length, as described in Section 5.4 of [the BLAKE3 spec](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf), but this implementation doesn't currently support that strategy. ## Common API Functions ```c void blake3_hasher_init( blake3_hasher *self); ``` Initialize a `blake3_hasher` in the default hashing mode.
--- ```c void blake3_hasher_update( blake3_hasher *self, const void *input, size_t input_len); ``` Add input to the hasher. This can be called any number of times. --- ```c void blake3_hasher_finalize( const blake3_hasher *self, uint8_t *out, size_t out_len); ``` Finalize the hasher and return an output of any length, given in bytes. This doesn't modify the hasher itself, and it's possible to finalize again after adding more input. The constant `BLAKE3_OUT_LEN` provides the default output length, 32 bytes, which is recommended for most callers. See the [Security Notes](#security-notes) below. ## Less Common API Functions ```c void blake3_hasher_init_keyed( blake3_hasher *self, const uint8_t key[BLAKE3_KEY_LEN]); ``` Initialize a `blake3_hasher` in the keyed hashing mode. The key must be exactly 32 bytes. --- ```c void blake3_hasher_init_derive_key( blake3_hasher *self, const char *context); ``` Initialize a `blake3_hasher` in the key derivation mode. The context string is given as an initialization parameter, and afterwards input key material should be given with `blake3_hasher_update`. The context string is a null-terminated C string which should be **hardcoded, globally unique, and application-specific**. The context string should not include any dynamic input like salts, nonces, or identifiers read from a database at runtime. A good default format for the context string is `"[application] [commit timestamp] [purpose]"`, e.g., `"example.com 2019-12-25 16:18:03 session tokens v1"`. This function is intended for application code written in C. For language bindings, see `blake3_hasher_init_derive_key_raw` below. --- ```c void blake3_hasher_init_derive_key_raw( blake3_hasher *self, const void *context, size_t context_len); ``` As `blake3_hasher_init_derive_key` above, except that the context string is given as a pointer to an array of arbitrary bytes with a provided length. 
This is intended for writing language bindings, where C string conversion would add unnecessary overhead and new error cases. Unicode strings should be encoded as UTF-8. Application code in C should prefer `blake3_hasher_init_derive_key`, which takes the context as a C string. If you need to use arbitrary bytes as a context string in application code, consider whether you're violating the requirement that context strings should be hardcoded. --- ```c void blake3_hasher_finalize_seek( const blake3_hasher *self, uint64_t seek, uint8_t *out, size_t out_len); ``` The same as `blake3_hasher_finalize`, but with an additional `seek` parameter for the starting byte position in the output stream. To efficiently stream a large output without allocating memory, call this function in a loop, incrementing `seek` by the output length each time. --- ```c void blake3_hasher_reset( blake3_hasher *self); ``` Reset the hasher to its initial state, prior to any calls to `blake3_hasher_update`. Currently this is no different from calling `blake3_hasher_init` or similar again. However, if this implementation gains multithreading support in the future, and if `blake3_hasher` holds (optional) threading resources, this function will reuse those resources. Until then, this is mainly for feature compatibility with the Rust implementation. # Security Notes Outputs shorter than the default length of 32 bytes (256 bits) provide less security. An N-bit BLAKE3 output is intended to provide N bits of first and second preimage resistance and N/2 bits of collision resistance, for any N up to 256. Longer outputs don't provide any additional security. Avoid relying on the secrecy of the output offset, that is, the `seek` argument of `blake3_hasher_finalize_seek`. [_Block-Cipher-Based Tree Hashing_ by Aldo Gunsing](https://eprint.iacr.org/2022/283) shows that an attacker who knows both the message and the key (if any) can easily determine the offset of an extended output. 
For comparison, AES-CTR has a similar property: if you know the key, you can decrypt a block from an unknown position in the output stream to recover its block index. Callers with strong secret keys aren't affected in practice, but secret offsets are a [design smell](https://en.wikipedia.org/wiki/Design_smell) in any case. # Building This implementation is just C and assembly files. It doesn't include a public-facing build system. (The `Makefile` in this directory is only for testing.) Instead, the intention is that you can include these files in whatever build system you're already using. This section describes the commands your build system should execute, or which you can execute by hand. Note that these steps may change in future versions. ## x86 Dynamic dispatch is enabled by default on x86. The implementation will query the CPU at runtime to detect SIMD support, and it will use the widest instruction set available. By default, `blake3_dispatch.c` expects to be linked with code for five different instruction sets: portable C, SSE2, SSE4.1, AVX2, and AVX-512. For each of the x86 SIMD instruction sets, four versions are available: three flavors of assembly (Unix, Windows MSVC, and Windows GNU) and one version using C intrinsics. The assembly versions are generally preferred. They perform better, they perform more consistently across different compilers, and they build more quickly. On the other hand, the assembly versions are x86\_64-only, and you need to select the right flavor for your target platform. 
Here's an example of building a shared library on x86\_64 Linux using the assembly implementations: ```bash gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c \ blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S \ blake3_avx512_x86-64_unix.S ``` When building the intrinsics-based implementations, you need to build each implementation separately, with the corresponding instruction set explicitly enabled in the compiler. Here's the same shared library using the intrinsics-based implementations: ```bash gcc -c -fPIC -O3 -msse2 blake3_sse2.c -o blake3_sse2.o gcc -c -fPIC -O3 -msse4.1 blake3_sse41.c -o blake3_sse41.o gcc -c -fPIC -O3 -mavx2 blake3_avx2.c -o blake3_avx2.o gcc -c -fPIC -O3 -mavx512f -mavx512vl blake3_avx512.c -o blake3_avx512.o gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c \ blake3_avx2.o blake3_avx512.o blake3_sse41.o blake3_sse2.o ``` Note above that building `blake3_avx512.c` requires both `-mavx512f` and `-mavx512vl` under GCC and Clang. Under MSVC, the single `/arch:AVX512` flag is sufficient. The MSVC equivalent of `-mavx2` is `/arch:AVX2`. MSVC enables SSE2 and SSE4.1 by default, and it doesn't have a corresponding flag. If you want to omit SIMD code entirely, you need to explicitly disable each instruction set. Here's an example of building a shared library on x86 with only portable code: ```bash gcc -shared -O3 -o libblake3.so -DBLAKE3_NO_SSE2 -DBLAKE3_NO_SSE41 -DBLAKE3_NO_AVX2 \ -DBLAKE3_NO_AVX512 blake3.c blake3_dispatch.c blake3_portable.c ``` ## ARM NEON The NEON implementation is enabled by default on AArch64, but not on other ARM targets, since not all of them support it. To enable it, set `BLAKE3_USE_NEON=1`. 
Here's an example of building a shared library on ARM Linux with NEON support: ```bash gcc -shared -O3 -o libblake3.so -DBLAKE3_USE_NEON=1 blake3.c blake3_dispatch.c \ blake3_portable.c blake3_neon.c ``` To explicitly disable using NEON instructions on AArch64, set `BLAKE3_USE_NEON=0`. ```bash gcc -shared -O3 -o libblake3.so -DBLAKE3_USE_NEON=0 blake3.c blake3_dispatch.c \ blake3_portable.c ``` Note that on some targets (ARMv7 in particular), extra flags may be required to activate NEON support in the compiler. If you see an error like... ``` /usr/lib/gcc/armv7l-unknown-linux-gnueabihf/9.2.0/include/arm_neon.h:635:1: error: inlining failed in call to always_inline ‘vaddq_u32’: target specific option mismatch ``` ...then you may need to add something like `-mfpu=neon-vfpv4 -mfloat-abi=hard`. ## Other Platforms The portable implementation should work on most other architectures. For example: ```bash gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c ``` # Multithreading Unlike the Rust implementation, the C implementation doesn't currently support multithreading. A future version of this library could add support by taking an optional dependency on OpenMP or similar. Alternatively, we could expose a lower-level API to allow callers to implement concurrency themselves. The former would be more convenient and less error-prone, but the latter would give callers the maximum possible amount of control. The best choice here depends on the specific use case, so if you have a use case for multithreaded hashing in C, please file a GitHub issue and let us know.
blake3-1.5.4/c/blake3-config.cmake.in000064400000000000000000000001541046102023000152310ustar 00000000000000@PACKAGE_INIT@ include("${CMAKE_CURRENT_LIST_DIR}/blake3-targets.cmake") check_required_components(blake3)blake3-1.5.4/c/blake3.c000064400000000000000000000666041046102023000125370ustar 00000000000000#include #include #include #include "blake3.h" #include "blake3_impl.h" const char *blake3_version(void) { return BLAKE3_VERSION_STRING; } INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8], uint8_t flags) { memcpy(self->cv, key, BLAKE3_KEY_LEN); self->chunk_counter = 0; memset(self->buf, 0, BLAKE3_BLOCK_LEN); self->buf_len = 0; self->blocks_compressed = 0; self->flags = flags; } INLINE void chunk_state_reset(blake3_chunk_state *self, const uint32_t key[8], uint64_t chunk_counter) { memcpy(self->cv, key, BLAKE3_KEY_LEN); self->chunk_counter = chunk_counter; self->blocks_compressed = 0; memset(self->buf, 0, BLAKE3_BLOCK_LEN); self->buf_len = 0; } INLINE size_t chunk_state_len(const blake3_chunk_state *self) { return (BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed) + ((size_t)self->buf_len); } INLINE size_t chunk_state_fill_buf(blake3_chunk_state *self, const uint8_t *input, size_t input_len) { size_t take = BLAKE3_BLOCK_LEN - ((size_t)self->buf_len); if (take > input_len) { take = input_len; } uint8_t *dest = self->buf + ((size_t)self->buf_len); memcpy(dest, input, take); self->buf_len += (uint8_t)take; return take; } INLINE uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state *self) { if (self->blocks_compressed == 0) { return CHUNK_START; } else { return 0; } } typedef struct { uint32_t input_cv[8]; uint64_t counter; uint8_t block[BLAKE3_BLOCK_LEN]; uint8_t block_len; uint8_t flags; } output_t; INLINE output_t make_output(const uint32_t input_cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags) { output_t ret; memcpy(ret.input_cv, input_cv, 32); memcpy(ret.block, block, 
BLAKE3_BLOCK_LEN); ret.block_len = block_len; ret.counter = counter; ret.flags = flags; return ret; } // Chaining values within a given chunk (specifically the compress_in_place // interface) are represented as words. This avoids unnecessary bytes<->words // conversion overhead in the portable implementation. However, the hash_many // interface handles both user input and parent node blocks, so it accepts // bytes. For that reason, chaining values in the CV stack are represented as // bytes. INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) { uint32_t cv_words[8]; memcpy(cv_words, self->input_cv, 32); blake3_compress_in_place(cv_words, self->block, self->block_len, self->counter, self->flags); store_cv_words(cv, cv_words); } INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out, size_t out_len) { if (out_len == 0) { return; } uint64_t output_block_counter = seek / 64; size_t offset_within_block = seek % 64; uint8_t wide_buf[64]; if(offset_within_block) { blake3_compress_xof(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, wide_buf); const size_t available_bytes = 64 - offset_within_block; const size_t bytes = out_len > available_bytes ? 
available_bytes : out_len; memcpy(out, wide_buf + offset_within_block, bytes); out += bytes; out_len -= bytes; output_block_counter += 1; } if(out_len / 64) { blake3_xof_many(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, out, out_len / 64); } output_block_counter += out_len / 64; out += out_len & -64; out_len -= out_len & -64; if(out_len) { blake3_compress_xof(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, wide_buf); memcpy(out, wide_buf, out_len); } } INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input, size_t input_len) { if (self->buf_len > 0) { size_t take = chunk_state_fill_buf(self, input, input_len); input += take; input_len -= take; if (input_len > 0) { blake3_compress_in_place( self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter, self->flags | chunk_state_maybe_start_flag(self)); self->blocks_compressed += 1; self->buf_len = 0; memset(self->buf, 0, BLAKE3_BLOCK_LEN); } } while (input_len > BLAKE3_BLOCK_LEN) { blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN, self->chunk_counter, self->flags | chunk_state_maybe_start_flag(self)); self->blocks_compressed += 1; input += BLAKE3_BLOCK_LEN; input_len -= BLAKE3_BLOCK_LEN; } chunk_state_fill_buf(self, input, input_len); } INLINE output_t chunk_state_output(const blake3_chunk_state *self) { uint8_t block_flags = self->flags | chunk_state_maybe_start_flag(self) | CHUNK_END; return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter, block_flags); } INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN], const uint32_t key[8], uint8_t flags) { return make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT); } // Given some input larger than one chunk, return the number of bytes that // should go in the left subtree. This is the largest power-of-2 number of // chunks that leaves at least 1 byte for the right subtree. 
INLINE size_t left_len(size_t content_len) { // Subtract 1 to reserve at least one byte for the right side. content_len // should always be greater than BLAKE3_CHUNK_LEN. size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN; return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN; } // Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time // on a single thread. Write out the chunk chaining values and return the // number of chunks hashed. These chunks are never the root and never empty; // those cases use a different codepath. INLINE size_t compress_chunks_parallel(const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t *out) { #if defined(BLAKE3_TESTING) assert(0 < input_len); assert(input_len <= MAX_SIMD_DEGREE * BLAKE3_CHUNK_LEN); #endif const uint8_t *chunks_array[MAX_SIMD_DEGREE]; size_t input_position = 0; size_t chunks_array_len = 0; while (input_len - input_position >= BLAKE3_CHUNK_LEN) { chunks_array[chunks_array_len] = &input[input_position]; input_position += BLAKE3_CHUNK_LEN; chunks_array_len += 1; } blake3_hash_many(chunks_array, chunks_array_len, BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter, true, flags, CHUNK_START, CHUNK_END, out); // Hash the remaining partial chunk, if there is one. Note that the empty // chunk (meaning the empty message) is a different codepath. if (input_len > input_position) { uint64_t counter = chunk_counter + (uint64_t)chunks_array_len; blake3_chunk_state chunk_state; chunk_state_init(&chunk_state, key, flags); chunk_state.chunk_counter = counter; chunk_state_update(&chunk_state, &input[input_position], input_len - input_position); output_t output = chunk_state_output(&chunk_state); output_chaining_value(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]); return chunks_array_len + 1; } else { return chunks_array_len; } } // Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time // on a single thread. 
Write out the parent chaining values and return the // number of parents hashed. (If there's an odd input chaining value left over, // return it as an additional output.) These parents are never the root and // never empty; those cases use a different codepath. INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values, size_t num_chaining_values, const uint32_t key[8], uint8_t flags, uint8_t *out) { #if defined(BLAKE3_TESTING) assert(2 <= num_chaining_values); assert(num_chaining_values <= 2 * MAX_SIMD_DEGREE_OR_2); #endif const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2]; size_t parents_array_len = 0; while (num_chaining_values - (2 * parents_array_len) >= 2) { parents_array[parents_array_len] = &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN]; parents_array_len += 1; } blake3_hash_many(parents_array, parents_array_len, 1, key, 0, // Parents always use counter 0. false, flags | PARENT, 0, // Parents have no start flags. 0, // Parents have no end flags. out); // If there's an odd child left over, it becomes an output. if (num_chaining_values > 2 * parents_array_len) { memcpy(&out[parents_array_len * BLAKE3_OUT_LEN], &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN], BLAKE3_OUT_LEN); return parents_array_len + 1; } else { return parents_array_len; } } // The wide helper function returns (writes out) an array of chaining values // and returns the length of that array. The number of chaining values returned // is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, // if the input is shorter than that many chunks. The reason for maintaining a // wide array of chaining values going back up the tree, is to allow the // implementation to hash as many parents in parallel as possible. // // As a special case when the SIMD degree is 1, this function will still return // at least 2 outputs. This guarantees that this function doesn't perform the // root compression. 
(If it did, it would use the wrong flags, and also we // wouldn't be able to implement extendable output.) Note that this function is // not used when the whole input is only 1 chunk long; that's a different // codepath. // // Why not just have the caller split the input on the first update(), instead // of implementing this special rule? Because we don't want to limit SIMD or // multi-threading parallelism for that update(). static size_t blake3_compress_subtree_wide(const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t *out) { // Note that the single chunk case does *not* bump the SIMD degree up to 2 // when it is 1. If this implementation adds multi-threading in the future, // this gives us the option of multi-threading even the 2-chunk case, which // can help performance on smaller platforms. if (input_len <= blake3_simd_degree() * BLAKE3_CHUNK_LEN) { return compress_chunks_parallel(input, input_len, key, chunk_counter, flags, out); } // With more than simd_degree chunks, we need to recurse. Start by dividing // the input into left and right subtrees. (Note that this is only optimal // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree // of 3 or something, we'll need a more complicated strategy.) size_t left_input_len = left_len(input_len); size_t right_input_len = input_len - left_input_len; const uint8_t *right_input = &input[left_input_len]; uint64_t right_chunk_counter = chunk_counter + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN); // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to // account for the special case of returning 2 outputs when the SIMD degree // is 1. uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; size_t degree = blake3_simd_degree(); if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) { // The special case: We always use a degree of at least two, to make // sure there are two outputs. 
Except, as noted above, at the chunk // level, where we allow degree=1. (Note that the 1-chunk-input case is // a different codepath.) degree = 2; } uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN]; // Recurse! If this implementation adds multi-threading support in the // future, this is where it will go. size_t left_n = blake3_compress_subtree_wide(input, left_input_len, key, chunk_counter, flags, cv_array); size_t right_n = blake3_compress_subtree_wide( right_input, right_input_len, key, right_chunk_counter, flags, right_cvs); // The special case again. If simd_degree=1, then we'll have left_n=1 and // right_n=1. Rather than compressing them into a single output, return // them directly, to make sure we always have at least two outputs. if (left_n == 1) { memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); return 2; } // Otherwise, do one layer of parent node compression. size_t num_chaining_values = left_n + right_n; return compress_parents_parallel(cv_array, num_chaining_values, key, flags, out); } // Hash a subtree with compress_subtree_wide(), and then condense the resulting // list of chaining values down to a single parent node. Don't compress that // last parent node, however. Instead, return its message bytes (the // concatenated chaining values of its children). This is necessary when the // first call to update() supplies a complete subtree, because the topmost // parent node of that subtree could end up being the root. It's also necessary // for extended output in the general case. // // As with compress_subtree_wide(), this function is not used on inputs of 1 // chunk or less. That's a different codepath. 
INLINE void compress_subtree_to_parent_node( const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) { #if defined(BLAKE3_TESTING) assert(input_len > BLAKE3_CHUNK_LEN); #endif uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key, chunk_counter, flags, cv_array); assert(num_cvs <= MAX_SIMD_DEGREE_OR_2); // The following loop never executes when MAX_SIMD_DEGREE_OR_2 is 2, because // as we just asserted, num_cvs will always be <=2 in that case. But GCC // (particularly GCC 8.5) can't tell that it never executes, and if NDEBUG is // set then it emits incorrect warnings here. We tried a few different // hacks to silence these, but in the end our hacks just produced different // warnings (see https://github.com/BLAKE3-team/BLAKE3/pull/380). Out of // desperation, we ifdef out this entire loop when we know it's not needed. #if MAX_SIMD_DEGREE_OR_2 > 2 // If MAX_SIMD_DEGREE_OR_2 is greater than 2 and there's enough input, // compress_subtree_wide() returns more than 2 chaining values. Condense // them into 2 by forming parent nodes repeatedly. 
uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2]; while (num_cvs > 2) { num_cvs = compress_parents_parallel(cv_array, num_cvs, key, flags, out_array); memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN); } #endif memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); } INLINE void hasher_init_base(blake3_hasher *self, const uint32_t key[8], uint8_t flags) { memcpy(self->key, key, BLAKE3_KEY_LEN); chunk_state_init(&self->chunk, key, flags); self->cv_stack_len = 0; } void blake3_hasher_init(blake3_hasher *self) { hasher_init_base(self, IV, 0); } void blake3_hasher_init_keyed(blake3_hasher *self, const uint8_t key[BLAKE3_KEY_LEN]) { uint32_t key_words[8]; load_key_words(key, key_words); hasher_init_base(self, key_words, KEYED_HASH); } void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, size_t context_len) { blake3_hasher context_hasher; hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT); blake3_hasher_update(&context_hasher, context, context_len); uint8_t context_key[BLAKE3_KEY_LEN]; blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN); uint32_t context_key_words[8]; load_key_words(context_key, context_key_words); hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL); } void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) { blake3_hasher_init_derive_key_raw(self, context, strlen(context)); } // As described in hasher_push_cv() below, we do "lazy merging", delaying // merges until right before the next CV is about to be added. This is // different from the reference implementation. Another difference is that we // aren't always merging 1 chunk at a time. Instead, each CV might represent // any power-of-two number of chunks, as long as the smaller-above-larger stack // order is maintained. 
Instead of the "count the trailing 0-bits" algorithm // described in the spec, we use a "count the total number of 1-bits" variant // that doesn't require us to retain the subtree size of the CV on top of the // stack. The principle is the same: each CV that should remain in the stack is // represented by a 1-bit in the total number of chunks (or bytes) so far. INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) { size_t post_merge_stack_len = (size_t)popcnt(total_len); while (self->cv_stack_len > post_merge_stack_len) { uint8_t *parent_node = &self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN]; output_t output = parent_output(parent_node, self->key, self->chunk.flags); output_chaining_value(&output, parent_node); self->cv_stack_len -= 1; } } // In reference_impl.rs, we merge the new CV with existing CVs from the stack // before pushing it. We can do that because we know more input is coming, so // we know none of the merges are root. // // This setting is different. We want to feed as much input as possible to // compress_subtree_wide(), without setting aside anything for the chunk_state. // If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once // as a single subtree, if at all possible. // // This leads to two problems: // 1) This 64 KiB input might be the only call that ever gets made to update. // In this case, the root node of the 64 KiB subtree would be the root node // of the whole tree, and it would need to be ROOT finalized. We can't // compress it until we know. // 2) This 64 KiB input might complete a larger tree, whose root node is // similarly going to be the root of the whole tree. For example, maybe // we have 196 KiB (that is, 128 + 64) hashed so far. We can't compress the // node at the root of the 256 KiB subtree until we know how to finalize it. // // The second problem is solved with "lazy merging". 
That is, when we're about // to add a CV to the stack, we don't merge it with anything first, as the // reference impl does. Instead we do merges using the *previous* CV that was // added, which is sitting on top of the stack, and we put the new CV // (unmerged) on top of the stack afterwards. This guarantees that we never // merge the root node until finalize(). // // Solving the first problem requires an additional tool, // compress_subtree_to_parent_node(). That function always returns the top // *two* chaining values of the subtree it's compressing. We then do lazy // merging with each of them separately, so that the second CV will always // remain unmerged. (That also helps us support extendable output when we're // hashing an input all-at-once.) INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN], uint64_t chunk_counter) { hasher_merge_cv_stack(self, chunk_counter); memcpy(&self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN], new_cv, BLAKE3_OUT_LEN); self->cv_stack_len += 1; } void blake3_hasher_update(blake3_hasher *self, const void *input, size_t input_len) { // Explicitly checking for zero avoids causing UB by passing a null pointer // to memcpy. This comes up in practice with things like: // std::vector<uint8_t> v; // blake3_hasher_update(&hasher, v.data(), v.size()); if (input_len == 0) { return; } const uint8_t *input_bytes = (const uint8_t *)input; // If we have some partial chunk bytes in the internal chunk_state, we need // to finish that chunk first. if (chunk_state_len(&self->chunk) > 0) { size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk); if (take > input_len) { take = input_len; } chunk_state_update(&self->chunk, input_bytes, take); input_bytes += take; input_len -= take; // If we've filled the current chunk and there's more coming, finalize this // chunk and proceed. In this case we know it's not the root.
if (input_len > 0) { output_t output = chunk_state_output(&self->chunk); uint8_t chunk_cv[32]; output_chaining_value(&output, chunk_cv); hasher_push_cv(self, chunk_cv, self->chunk.chunk_counter); chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1); } else { return; } } // Now the chunk_state is clear, and we have more input. If there's more than // a single chunk (so, definitely not the root chunk), hash the largest whole // subtree we can, with the full benefits of SIMD (and maybe in the future, // multi-threading) parallelism. Two restrictions: // - The subtree has to be a power-of-2 number of chunks. Only subtrees along // the right edge can be incomplete, and we don't know where the right edge // is going to be until we get to finalize(). // - The subtree must evenly divide the total number of chunks up until this // point (if total is not 0). If the current incomplete subtree is only // waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have // to complete the current subtree first. // Because we might need to break up the input to form powers of 2, or to // evenly divide what we already have, this part runs in a loop. while (input_len > BLAKE3_CHUNK_LEN) { size_t subtree_len = round_down_to_power_of_2(input_len); uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN; // Shrink the subtree_len until it evenly divides the count so far. We know // that subtree_len itself is a power of 2, so we can use a bitmasking // trick instead of an actual remainder operation. (Note that if the caller // consistently passes power-of-2 inputs of the same size, as is hopefully // typical, this loop condition will always fail, and subtree_len will // always be the full length of the input.) // // An aside: We don't have to shrink subtree_len quite this much. For // example, if count_so_far is 1, we could pass 2 chunks to // compress_subtree_to_parent_node. 
Since we'll get 2 CVs back, we'll still // get the right answer in the end, and we might get to use 2-way SIMD // parallelism. The problem with this optimization, is that it gets us // stuck always hashing 2 chunks. The total number of chunks will remain // odd, and we'll never graduate to higher degrees of parallelism. See // https://github.com/BLAKE3-team/BLAKE3/issues/69. while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) { subtree_len /= 2; } // The shrunken subtree_len might now be 1 chunk long. If so, hash that one // chunk by itself. Otherwise, compress the subtree into a pair of CVs. uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN; if (subtree_len <= BLAKE3_CHUNK_LEN) { blake3_chunk_state chunk_state; chunk_state_init(&chunk_state, self->key, self->chunk.flags); chunk_state.chunk_counter = self->chunk.chunk_counter; chunk_state_update(&chunk_state, input_bytes, subtree_len); output_t output = chunk_state_output(&chunk_state); uint8_t cv[BLAKE3_OUT_LEN]; output_chaining_value(&output, cv); hasher_push_cv(self, cv, chunk_state.chunk_counter); } else { // This is the high-performance happy path, though getting here depends // on the caller giving us a long enough input. uint8_t cv_pair[2 * BLAKE3_OUT_LEN]; compress_subtree_to_parent_node(input_bytes, subtree_len, self->key, self->chunk.chunk_counter, self->chunk.flags, cv_pair); hasher_push_cv(self, cv_pair, self->chunk.chunk_counter); hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN], self->chunk.chunk_counter + (subtree_chunks / 2)); } self->chunk.chunk_counter += subtree_chunks; input_bytes += subtree_len; input_len -= subtree_len; } // If there's any remaining input less than a full chunk, add it to the chunk // state. In that case, also do a final merge loop to make sure the subtree // stack doesn't contain any unmerged pairs. The remaining input means we // know these merges are non-root. 
This merge loop isn't strictly necessary // here, because hasher_push_chunk_cv already does its own merge loop, but it // simplifies blake3_hasher_finalize below. if (input_len > 0) { chunk_state_update(&self->chunk, input_bytes, input_len); hasher_merge_cv_stack(self, self->chunk.chunk_counter); } } void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, size_t out_len) { blake3_hasher_finalize_seek(self, 0, out, out_len); } void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, uint8_t *out, size_t out_len) { // Explicitly checking for zero avoids causing UB by passing a null pointer // to memcpy. This comes up in practice with things like: // std::vector v; // blake3_hasher_finalize(&hasher, v.data(), v.size()); if (out_len == 0) { return; } // If the subtree stack is empty, then the current chunk is the root. if (self->cv_stack_len == 0) { output_t output = chunk_state_output(&self->chunk); output_root_bytes(&output, seek, out, out_len); return; } // If there are any bytes in the chunk state, finalize that chunk and do a // roll-up merge between that chunk hash and every subtree in the stack. In // this case, the extra merge loop at the end of blake3_hasher_update // guarantees that none of the subtrees in the stack need to be merged with // each other first. Otherwise, if there are no bytes in the chunk state, // then the top of the stack is a chunk hash, and we start the merge from // that. output_t output; size_t cvs_remaining; if (chunk_state_len(&self->chunk) > 0) { cvs_remaining = self->cv_stack_len; output = chunk_state_output(&self->chunk); } else { // There are always at least 2 CVs in the stack in this case. 
cvs_remaining = self->cv_stack_len - 2; output = parent_output(&self->cv_stack[cvs_remaining * 32], self->key, self->chunk.flags); } while (cvs_remaining > 0) { cvs_remaining -= 1; uint8_t parent_block[BLAKE3_BLOCK_LEN]; memcpy(parent_block, &self->cv_stack[cvs_remaining * 32], 32); output_chaining_value(&output, &parent_block[32]); output = parent_output(parent_block, self->key, self->chunk.flags); } output_root_bytes(&output, seek, out, out_len); } void blake3_hasher_reset(blake3_hasher *self) { chunk_state_reset(&self->chunk, self->key, 0); self->cv_stack_len = 0; } blake3-1.5.4/c/blake3.h000064400000000000000000000052231046102023000125320ustar 00000000000000#ifndef BLAKE3_H #define BLAKE3_H #include #include #if !defined(BLAKE3_API) # if defined(_WIN32) || defined(__CYGWIN__) # if defined(BLAKE3_DLL) # if defined(BLAKE3_DLL_EXPORTS) # define BLAKE3_API __declspec(dllexport) # else # define BLAKE3_API __declspec(dllimport) # endif # define BLAKE3_PRIVATE # else # define BLAKE3_API # define BLAKE3_PRIVATE # endif # elif __GNUC__ >= 4 # define BLAKE3_API __attribute__((visibility("default"))) # define BLAKE3_PRIVATE __attribute__((visibility("hidden"))) # else # define BLAKE3_API # define BLAKE3_PRIVATE # endif #endif #ifdef __cplusplus extern "C" { #endif #define BLAKE3_VERSION_STRING "1.5.4" #define BLAKE3_KEY_LEN 32 #define BLAKE3_OUT_LEN 32 #define BLAKE3_BLOCK_LEN 64 #define BLAKE3_CHUNK_LEN 1024 #define BLAKE3_MAX_DEPTH 54 // This struct is a private implementation detail. It has to be here because // it's part of blake3_hasher below. typedef struct { uint32_t cv[8]; uint64_t chunk_counter; uint8_t buf[BLAKE3_BLOCK_LEN]; uint8_t buf_len; uint8_t blocks_compressed; uint8_t flags; } blake3_chunk_state; typedef struct { uint32_t key[8]; blake3_chunk_state chunk; uint8_t cv_stack_len; // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example, // with 7 chunks, we have 3 entries in the stack. 
Adding an 8th chunk // requires a 4th entry, rather than merging everything down to 1, because we // don't know whether more input is coming. This is different from how the // reference implementation does things. uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN]; } blake3_hasher; BLAKE3_API const char *blake3_version(void); BLAKE3_API void blake3_hasher_init(blake3_hasher *self); BLAKE3_API void blake3_hasher_init_keyed(blake3_hasher *self, const uint8_t key[BLAKE3_KEY_LEN]); BLAKE3_API void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context); BLAKE3_API void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, size_t context_len); BLAKE3_API void blake3_hasher_update(blake3_hasher *self, const void *input, size_t input_len); BLAKE3_API void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, size_t out_len); BLAKE3_API void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, uint8_t *out, size_t out_len); BLAKE3_API void blake3_hasher_reset(blake3_hasher *self); #ifdef __cplusplus } #endif #endif /* BLAKE3_H */ blake3-1.5.4/c/blake3_avx2.c000064400000000000000000000302441046102023000134660ustar 00000000000000#include "blake3_impl.h" #include #define DEGREE 8 INLINE __m256i loadu(const uint8_t src[32]) { return _mm256_loadu_si256((const __m256i *)src); } INLINE void storeu(__m256i src, uint8_t dest[16]) { _mm256_storeu_si256((__m256i *)dest, src); } INLINE __m256i addv(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); } // Note that clang-format doesn't like the name "xor" for some reason. 
INLINE __m256i xorv(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); } INLINE __m256i set1(uint32_t x) { return _mm256_set1_epi32((int32_t)x); } INLINE __m256i rot16(__m256i x) { return _mm256_shuffle_epi8( x, _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); } INLINE __m256i rot12(__m256i x) { return _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12)); } INLINE __m256i rot8(__m256i x) { return _mm256_shuffle_epi8( x, _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1, 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)); } INLINE __m256i rot7(__m256i x) { return _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7)); } INLINE void round_fn(__m256i v[16], __m256i m[16], size_t r) { v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); v[0] = addv(v[0], v[4]); v[1] = addv(v[1], v[5]); v[2] = addv(v[2], v[6]); v[3] = addv(v[3], v[7]); v[12] = xorv(v[12], v[0]); v[13] = xorv(v[13], v[1]); v[14] = xorv(v[14], v[2]); v[15] = xorv(v[15], v[3]); v[12] = rot16(v[12]); v[13] = rot16(v[13]); v[14] = rot16(v[14]); v[15] = rot16(v[15]); v[8] = addv(v[8], v[12]); v[9] = addv(v[9], v[13]); v[10] = addv(v[10], v[14]); v[11] = addv(v[11], v[15]); v[4] = xorv(v[4], v[8]); v[5] = xorv(v[5], v[9]); v[6] = xorv(v[6], v[10]); v[7] = xorv(v[7], v[11]); v[4] = rot12(v[4]); v[5] = rot12(v[5]); v[6] = rot12(v[6]); v[7] = rot12(v[7]); v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); v[0] = addv(v[0], v[4]); v[1] = addv(v[1], v[5]); v[2] = addv(v[2], v[6]); v[3] = addv(v[3], v[7]); v[12] = xorv(v[12], v[0]); v[13] = xorv(v[13], v[1]); v[14] = xorv(v[14], v[2]); 
v[15] = xorv(v[15], v[3]); v[12] = rot8(v[12]); v[13] = rot8(v[13]); v[14] = rot8(v[14]); v[15] = rot8(v[15]); v[8] = addv(v[8], v[12]); v[9] = addv(v[9], v[13]); v[10] = addv(v[10], v[14]); v[11] = addv(v[11], v[15]); v[4] = xorv(v[4], v[8]); v[5] = xorv(v[5], v[9]); v[6] = xorv(v[6], v[10]); v[7] = xorv(v[7], v[11]); v[4] = rot7(v[4]); v[5] = rot7(v[5]); v[6] = rot7(v[6]); v[7] = rot7(v[7]); v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); v[0] = addv(v[0], v[5]); v[1] = addv(v[1], v[6]); v[2] = addv(v[2], v[7]); v[3] = addv(v[3], v[4]); v[15] = xorv(v[15], v[0]); v[12] = xorv(v[12], v[1]); v[13] = xorv(v[13], v[2]); v[14] = xorv(v[14], v[3]); v[15] = rot16(v[15]); v[12] = rot16(v[12]); v[13] = rot16(v[13]); v[14] = rot16(v[14]); v[10] = addv(v[10], v[15]); v[11] = addv(v[11], v[12]); v[8] = addv(v[8], v[13]); v[9] = addv(v[9], v[14]); v[5] = xorv(v[5], v[10]); v[6] = xorv(v[6], v[11]); v[7] = xorv(v[7], v[8]); v[4] = xorv(v[4], v[9]); v[5] = rot12(v[5]); v[6] = rot12(v[6]); v[7] = rot12(v[7]); v[4] = rot12(v[4]); v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); v[0] = addv(v[0], v[5]); v[1] = addv(v[1], v[6]); v[2] = addv(v[2], v[7]); v[3] = addv(v[3], v[4]); v[15] = xorv(v[15], v[0]); v[12] = xorv(v[12], v[1]); v[13] = xorv(v[13], v[2]); v[14] = xorv(v[14], v[3]); v[15] = rot8(v[15]); v[12] = rot8(v[12]); v[13] = rot8(v[13]); v[14] = rot8(v[14]); v[10] = addv(v[10], v[15]); v[11] = addv(v[11], v[12]); v[8] = addv(v[8], v[13]); v[9] = addv(v[9], v[14]); v[5] = xorv(v[5], v[10]); v[6] = xorv(v[6], v[11]); v[7] = xorv(v[7], v[8]); v[4] = xorv(v[4], v[9]); v[5] = rot7(v[5]); v[6] = rot7(v[6]); v[7] = rot7(v[7]); v[4] = rot7(v[4]); } INLINE void 
transpose_vecs(__m256i vecs[DEGREE]) { // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high // is 22/33/66/77. __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is // 11/33. __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); // Interleave 128-bit lanes. 
vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20); vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20); vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20); vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20); vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31); vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31); vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31); vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31); } INLINE void transpose_msg_vecs(const uint8_t *const *inputs, size_t block_offset, __m256i out[16]) { out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m256i)]); out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m256i)]); out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m256i)]); out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m256i)]); out[4] = loadu(&inputs[4][block_offset + 0 * sizeof(__m256i)]); out[5] = loadu(&inputs[5][block_offset + 0 * sizeof(__m256i)]); out[6] = loadu(&inputs[6][block_offset + 0 * sizeof(__m256i)]); out[7] = loadu(&inputs[7][block_offset + 0 * sizeof(__m256i)]); out[8] = loadu(&inputs[0][block_offset + 1 * sizeof(__m256i)]); out[9] = loadu(&inputs[1][block_offset + 1 * sizeof(__m256i)]); out[10] = loadu(&inputs[2][block_offset + 1 * sizeof(__m256i)]); out[11] = loadu(&inputs[3][block_offset + 1 * sizeof(__m256i)]); out[12] = loadu(&inputs[4][block_offset + 1 * sizeof(__m256i)]); out[13] = loadu(&inputs[5][block_offset + 1 * sizeof(__m256i)]); out[14] = loadu(&inputs[6][block_offset + 1 * sizeof(__m256i)]); out[15] = loadu(&inputs[7][block_offset + 1 * sizeof(__m256i)]); for (size_t i = 0; i < 8; ++i) { _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0); } transpose_vecs(&out[0]); transpose_vecs(&out[8]); } INLINE void load_counters(uint64_t counter, bool increment_counter, __m256i *out_lo, __m256i *out_hi) { const __m256i mask = _mm256_set1_epi32(-(int32_t)increment_counter); const __m256i add0 = _mm256_set_epi32(7, 6, 
5, 4, 3, 2, 1, 0); const __m256i add1 = _mm256_and_si256(mask, add0); __m256i l = _mm256_add_epi32(_mm256_set1_epi32((int32_t)counter), add1); __m256i carry = _mm256_cmpgt_epi32(_mm256_xor_si256(add1, _mm256_set1_epi32(0x80000000)), _mm256_xor_si256( l, _mm256_set1_epi32(0x80000000))); __m256i h = _mm256_sub_epi32(_mm256_set1_epi32((int32_t)(counter >> 32)), carry); *out_lo = l; *out_hi = h; } static void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { __m256i h_vecs[8] = { set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), }; __m256i counter_low_vec, counter_high_vec; load_counters(counter, increment_counter, &counter_low_vec, &counter_high_vec); uint8_t block_flags = flags | flags_start; for (size_t block = 0; block < blocks; block++) { if (block + 1 == blocks) { block_flags |= flags_end; } __m256i block_len_vec = set1(BLAKE3_BLOCK_LEN); __m256i block_flags_vec = set1(block_flags); __m256i msg_vecs[16]; transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); __m256i v[16] = { h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, }; round_fn(v, msg_vecs, 0); round_fn(v, msg_vecs, 1); round_fn(v, msg_vecs, 2); round_fn(v, msg_vecs, 3); round_fn(v, msg_vecs, 4); round_fn(v, msg_vecs, 5); round_fn(v, msg_vecs, 6); h_vecs[0] = xorv(v[0], v[8]); h_vecs[1] = xorv(v[1], v[9]); h_vecs[2] = xorv(v[2], v[10]); h_vecs[3] = xorv(v[3], v[11]); h_vecs[4] = xorv(v[4], v[12]); h_vecs[5] = xorv(v[5], v[13]); h_vecs[6] = xorv(v[6], v[14]); h_vecs[7] = xorv(v[7], v[15]); block_flags = flags; } transpose_vecs(h_vecs); storeu(h_vecs[0], &out[0 * sizeof(__m256i)]); storeu(h_vecs[1], &out[1 * sizeof(__m256i)]); 
storeu(h_vecs[2], &out[2 * sizeof(__m256i)]); storeu(h_vecs[3], &out[3 * sizeof(__m256i)]); storeu(h_vecs[4], &out[4 * sizeof(__m256i)]); storeu(h_vecs[5], &out[5 * sizeof(__m256i)]); storeu(h_vecs[6], &out[6 * sizeof(__m256i)]); storeu(h_vecs[7], &out[7 * sizeof(__m256i)]); } #if !defined(BLAKE3_NO_SSE41) void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); #else void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); #endif void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { while (num_inputs >= DEGREE) { blake3_hash8_avx2(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); if (increment_counter) { counter += DEGREE; } inputs += DEGREE; num_inputs -= DEGREE; out = &out[DEGREE * BLAKE3_OUT_LEN]; } #if !defined(BLAKE3_NO_SSE41) blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); #else blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); #endif } blake3-1.5.4/c/blake3_avx2_x86-64_unix.S000064400000000000000000002010021046102023000154350ustar 00000000000000#if defined(__ELF__) && defined(__linux__) .section .note.GNU-stack,"",%progbits #endif #if defined(__ELF__) && defined(__CET__) && defined(__has_include) #if __has_include() #include #endif #endif #if !defined(_CET_ENDBR) #define _CET_ENDBR #endif .intel_syntax noprefix .global _blake3_hash_many_avx2 .global blake3_hash_many_avx2 
#ifdef __APPLE__ .text #else .section .text #endif .p2align 6 _blake3_hash_many_avx2: blake3_hash_many_avx2: _CET_ENDBR push r15 push r14 push r13 push r12 push rbx push rbp mov rbp, rsp sub rsp, 680 and rsp, 0xFFFFFFFFFFFFFFC0 neg r9d vmovd xmm0, r9d vpbroadcastd ymm0, xmm0 vmovdqa ymmword ptr [rsp+0x280], ymm0 vpand ymm1, ymm0, ymmword ptr [ADD0+rip] vpand ymm2, ymm0, ymmword ptr [ADD1+rip] vmovdqa ymmword ptr [rsp+0x220], ymm2 vmovd xmm2, r8d vpbroadcastd ymm2, xmm2 vpaddd ymm2, ymm2, ymm1 vmovdqa ymmword ptr [rsp+0x240], ymm2 vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip] vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip] vpcmpgtd ymm2, ymm1, ymm2 shr r8, 32 vmovd xmm3, r8d vpbroadcastd ymm3, xmm3 vpsubd ymm3, ymm3, ymm2 vmovdqa ymmword ptr [rsp+0x260], ymm3 shl rdx, 6 mov qword ptr [rsp+0x2A0], rdx cmp rsi, 8 jc 3f 2: vpbroadcastd ymm0, dword ptr [rcx] vpbroadcastd ymm1, dword ptr [rcx+0x4] vpbroadcastd ymm2, dword ptr [rcx+0x8] vpbroadcastd ymm3, dword ptr [rcx+0xC] vpbroadcastd ymm4, dword ptr [rcx+0x10] vpbroadcastd ymm5, dword ptr [rcx+0x14] vpbroadcastd ymm6, dword ptr [rcx+0x18] vpbroadcastd ymm7, dword ptr [rcx+0x1C] mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] mov r12, qword ptr [rdi+0x20] mov r13, qword ptr [rdi+0x28] mov r14, qword ptr [rdi+0x30] mov r15, qword ptr [rdi+0x38] movzx eax, byte ptr [rbp+0x38] movzx ebx, byte ptr [rbp+0x40] or eax, ebx xor edx, edx .p2align 5 9: movzx ebx, byte ptr [rbp+0x48] or ebx, eax add rdx, 64 cmp rdx, qword ptr [rsp+0x2A0] cmove eax, ebx mov dword ptr [rsp+0x200], eax vmovups xmm8, xmmword ptr [r8+rdx-0x40] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x40] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x40] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 vmovups xmm11, xmmword ptr 
[r11+rdx-0x40] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm8, ymm12, ymm14, 136 vmovaps ymmword ptr [rsp], ymm8 vshufps ymm9, ymm12, ymm14, 221 vmovaps ymmword ptr [rsp+0x20], ymm9 vshufps ymm10, ymm13, ymm15, 136 vmovaps ymmword ptr [rsp+0x40], ymm10 vshufps ymm11, ymm13, ymm15, 221 vmovaps ymmword ptr [rsp+0x60], ymm11 vmovups xmm8, xmmword ptr [r8+rdx-0x30] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x30] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x30] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x30] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm8, ymm12, ymm14, 136 vmovaps ymmword ptr [rsp+0x80], ymm8 vshufps ymm9, ymm12, ymm14, 221 vmovaps ymmword ptr [rsp+0xA0], ymm9 vshufps ymm10, ymm13, ymm15, 136 vmovaps ymmword ptr [rsp+0xC0], ymm10 vshufps ymm11, ymm13, ymm15, 221 vmovaps ymmword ptr [rsp+0xE0], ymm11 vmovups xmm8, xmmword ptr [r8+rdx-0x20] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x20] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x20] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x20] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm8, ymm12, ymm14, 136 vmovaps ymmword ptr [rsp+0x100], ymm8 vshufps ymm9, ymm12, ymm14, 221 vmovaps ymmword ptr [rsp+0x120], ymm9 vshufps ymm10, ymm13, ymm15, 136 vmovaps ymmword ptr [rsp+0x140], ymm10 vshufps ymm11, ymm13, ymm15, 221 vmovaps ymmword ptr [rsp+0x160], ymm11 
vmovups xmm8, xmmword ptr [r8+rdx-0x10] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x10] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x10] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x10] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm8, ymm12, ymm14, 136 vmovaps ymmword ptr [rsp+0x180], ymm8 vshufps ymm9, ymm12, ymm14, 221 vmovaps ymmword ptr [rsp+0x1A0], ymm9 vshufps ymm10, ymm13, ymm15, 136 vmovaps ymmword ptr [rsp+0x1C0], ymm10 vshufps ymm11, ymm13, ymm15, 221 vmovaps ymmword ptr [rsp+0x1E0], ymm11 vpbroadcastd ymm15, dword ptr [rsp+0x200] prefetcht0 [r8+rdx+0x80] prefetcht0 [r12+rdx+0x80] prefetcht0 [r9+rdx+0x80] prefetcht0 [r13+rdx+0x80] prefetcht0 [r10+rdx+0x80] prefetcht0 [r14+rdx+0x80] prefetcht0 [r11+rdx+0x80] prefetcht0 [r15+rdx+0x80] vpaddd ymm0, ymm0, ymmword ptr [rsp] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm0, ymmword ptr [rsp+0x240] vpxor ymm13, ymm1, ymmword ptr [rsp+0x260] vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip] vpxor ymm15, ymm3, ymm15 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0+rip] vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip] vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip] vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip] vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 
vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x100] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa 
ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] vpaddd ymm2, ymm2, ymmword ptr [rsp+0xE0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor 
ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] vpaddd ymm2, ymm2, ymmword ptr [rsp] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword 
ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd 
ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x160] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, 
ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0xA0] vpaddd ymm1, ymm1, ymmword ptr [rsp] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 
ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, 
ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x180] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 
vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x140] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] vpaddd ymm2, ymm2, ymmword ptr [rsp] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] 
vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] vpaddd ymm2, 
ymm2, ymmword ptr [rsp+0x100] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, 
ymmword ptr [rsp+0x1A0] vpaddd ymm1, ymm1, ymmword ptr [rsp] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] vpaddd ymm2, ymm2, ymmword ptr [rsp+0xC0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 
7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1E0] vpaddd ymm1, ymm1, ymmword ptr [rsp] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, 
ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vpxor ymm0, ymm0, ymm8 vpxor ymm1, ymm1, ymm9 vpxor ymm2, ymm2, ymm10 vpxor ymm3, ymm3, ymm11 vpsrld ymm8, 
ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpxor ymm4, ymm4, ymm12 vpxor ymm5, ymm5, ymm13 vpxor ymm6, ymm6, ymm14 vpxor ymm7, ymm7, ymm15 movzx eax, byte ptr [rbp+0x38] jne 9b mov rbx, qword ptr [rbp+0x50] vunpcklps ymm8, ymm0, ymm1 vunpcklps ymm9, ymm2, ymm3 vunpckhps ymm10, ymm0, ymm1 vunpcklps ymm11, ymm4, ymm5 vunpcklps ymm0, ymm6, ymm7 vshufps ymm12, ymm8, ymm9, 78 vblendps ymm1, ymm8, ymm12, 0xCC vshufps ymm8, ymm11, ymm0, 78 vunpckhps ymm13, ymm2, ymm3 vblendps ymm2, ymm11, ymm8, 0xCC vblendps ymm3, ymm12, ymm9, 0xCC vperm2f128 ymm12, ymm1, ymm2, 0x20 vmovups ymmword ptr [rbx], ymm12 vunpckhps ymm14, ymm4, ymm5 vblendps ymm4, ymm8, ymm0, 0xCC vunpckhps ymm15, ymm6, ymm7 vperm2f128 ymm7, ymm3, ymm4, 0x20 vmovups ymmword ptr [rbx+0x20], ymm7 vshufps ymm5, ymm10, ymm13, 78 vblendps ymm6, ymm5, ymm13, 0xCC vshufps ymm13, ymm14, ymm15, 78 vblendps ymm10, ymm10, ymm5, 0xCC vblendps ymm14, ymm14, ymm13, 0xCC vperm2f128 ymm8, ymm10, ymm14, 0x20 vmovups ymmword ptr [rbx+0x40], ymm8 vblendps ymm15, ymm13, ymm15, 0xCC vperm2f128 ymm13, ymm6, ymm15, 0x20 vmovups ymmword ptr [rbx+0x60], ymm13 vperm2f128 ymm9, ymm1, ymm2, 0x31 vperm2f128 ymm11, ymm3, ymm4, 0x31 vmovups ymmword ptr [rbx+0x80], ymm9 vperm2f128 ymm14, ymm10, ymm14, 0x31 vperm2f128 ymm15, ymm6, ymm15, 0x31 vmovups ymmword ptr [rbx+0xA0], ymm11 vmovups ymmword ptr [rbx+0xC0], ymm14 vmovups ymmword ptr [rbx+0xE0], ymm15 vmovdqa ymm0, ymmword ptr [rsp+0x220] vpaddd ymm1, ymm0, ymmword ptr [rsp+0x240] vmovdqa ymmword ptr [rsp+0x240], ymm1 vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip] vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip] vpcmpgtd ymm2, ymm0, ymm2 vmovdqa ymm0, ymmword ptr [rsp+0x260] vpsubd ymm2, ymm0, ymm2 vmovdqa ymmword ptr [rsp+0x260], ymm2 add rdi, 64 add rbx, 256 mov qword ptr [rbp+0x50], rbx 
sub rsi, 8 cmp rsi, 8 jnc 2b test rsi, rsi jnz 3f 4: vzeroupper mov rsp, rbp pop rbp pop rbx pop r12 pop r13 pop r14 pop r15 ret .p2align 5 3: mov rbx, qword ptr [rbp+0x50] mov r15, qword ptr [rsp+0x2A0] movzx r13d, byte ptr [rbp+0x38] movzx r12d, byte ptr [rbp+0x48] test rsi, 0x4 je 3f vbroadcasti128 ymm0, xmmword ptr [rcx] vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] vmovdqa ymm8, ymm0 vmovdqa ymm9, ymm1 vbroadcasti128 ymm12, xmmword ptr [rsp+0x240] vbroadcasti128 ymm13, xmmword ptr [rsp+0x260] vpunpckldq ymm14, ymm12, ymm13 vpunpckhdq ymm15, ymm12, ymm13 vpermq ymm14, ymm14, 0x50 vpermq ymm15, ymm15, 0x50 vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip] vpblendd ymm14, ymm14, ymm12, 0x44 vpblendd ymm15, ymm15, ymm12, 0x44 vmovdqa ymmword ptr [rsp], ymm14 vmovdqa ymmword ptr [rsp+0x20], ymm15 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx .p2align 5 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d mov dword ptr [rsp+0x200], eax vmovups ymm2, ymmword ptr [r8+rdx-0x40] vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01 vmovups ymm3, ymmword ptr [r8+rdx-0x30] vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x30], 0x01 vshufps ymm4, ymm2, ymm3, 136 vshufps ymm5, ymm2, ymm3, 221 vmovups ymm2, ymmword ptr [r8+rdx-0x20] vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01 vmovups ymm3, ymmword ptr [r8+rdx-0x10] vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01 vshufps ymm6, ymm2, ymm3, 136 vshufps ymm7, ymm2, ymm3, 221 vpshufd ymm6, ymm6, 0x93 vpshufd ymm7, ymm7, 0x93 vmovups ymm10, ymmword ptr [r10+rdx-0x40] vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01 vmovups ymm11, ymmword ptr [r10+rdx-0x30] vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01 vshufps ymm12, ymm10, ymm11, 136 vshufps ymm13, ymm10, ymm11, 221 vmovups ymm10, ymmword ptr [r10+rdx-0x20] vinsertf128 ymm10, ymm10, xmmword ptr 
[r11+rdx-0x20], 0x01 vmovups ymm11, ymmword ptr [r10+rdx-0x10] vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01 vshufps ymm14, ymm10, ymm11, 136 vshufps ymm15, ymm10, ymm11, 221 vpshufd ymm14, ymm14, 0x93 vpshufd ymm15, ymm15, 0x93 prefetcht0 [r8+rdx+0x80] prefetcht0 [r9+rdx+0x80] prefetcht0 [r10+rdx+0x80] prefetcht0 [r11+rdx+0x80] vpbroadcastd ymm2, dword ptr [rsp+0x200] vmovdqa ymm3, ymmword ptr [rsp] vmovdqa ymm11, ymmword ptr [rsp+0x20] vpblendd ymm3, ymm3, ymm2, 0x88 vpblendd ymm11, ymm11, ymm2, 0x88 vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] vmovdqa ymm10, ymm2 mov al, 7 9: vpaddd ymm0, ymm0, ymm4 vpaddd ymm8, ymm8, ymm12 vmovdqa ymmword ptr [rsp+0x40], ymm4 nop vmovdqa ymmword ptr [rsp+0x60], ymm12 nop vpaddd ymm0, ymm0, ymm1 vpaddd ymm8, ymm8, ymm9 vpxor ymm3, ymm3, ymm0 vpxor ymm11, ymm11, ymm8 vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] vpshufb ymm3, ymm3, ymm4 vpshufb ymm11, ymm11, ymm4 vpaddd ymm2, ymm2, ymm3 vpaddd ymm10, ymm10, ymm11 vpxor ymm1, ymm1, ymm2 vpxor ymm9, ymm9, ymm10 vpsrld ymm4, ymm1, 12 vpslld ymm1, ymm1, 20 vpor ymm1, ymm1, ymm4 vpsrld ymm4, ymm9, 12 vpslld ymm9, ymm9, 20 vpor ymm9, ymm9, ymm4 vpaddd ymm0, ymm0, ymm5 vpaddd ymm8, ymm8, ymm13 vpaddd ymm0, ymm0, ymm1 vpaddd ymm8, ymm8, ymm9 vmovdqa ymmword ptr [rsp+0x80], ymm5 vmovdqa ymmword ptr [rsp+0xA0], ymm13 vpxor ymm3, ymm3, ymm0 vpxor ymm11, ymm11, ymm8 vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] vpshufb ymm3, ymm3, ymm4 vpshufb ymm11, ymm11, ymm4 vpaddd ymm2, ymm2, ymm3 vpaddd ymm10, ymm10, ymm11 vpxor ymm1, ymm1, ymm2 vpxor ymm9, ymm9, ymm10 vpsrld ymm4, ymm1, 7 vpslld ymm1, ymm1, 25 vpor ymm1, ymm1, ymm4 vpsrld ymm4, ymm9, 7 vpslld ymm9, ymm9, 25 vpor ymm9, ymm9, ymm4 vpshufd ymm0, ymm0, 0x93 vpshufd ymm8, ymm8, 0x93 vpshufd ymm3, ymm3, 0x4E vpshufd ymm11, ymm11, 0x4E vpshufd ymm2, ymm2, 0x39 vpshufd ymm10, ymm10, 0x39 vpaddd ymm0, ymm0, ymm6 vpaddd ymm8, ymm8, ymm14 vpaddd ymm0, ymm0, ymm1 vpaddd ymm8, ymm8, ymm9 vpxor ymm3, ymm3, ymm0 vpxor ymm11, ymm11, 
ymm8 vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] vpshufb ymm3, ymm3, ymm4 vpshufb ymm11, ymm11, ymm4 vpaddd ymm2, ymm2, ymm3 vpaddd ymm10, ymm10, ymm11 vpxor ymm1, ymm1, ymm2 vpxor ymm9, ymm9, ymm10 vpsrld ymm4, ymm1, 12 vpslld ymm1, ymm1, 20 vpor ymm1, ymm1, ymm4 vpsrld ymm4, ymm9, 12 vpslld ymm9, ymm9, 20 vpor ymm9, ymm9, ymm4 vpaddd ymm0, ymm0, ymm7 vpaddd ymm8, ymm8, ymm15 vpaddd ymm0, ymm0, ymm1 vpaddd ymm8, ymm8, ymm9 vpxor ymm3, ymm3, ymm0 vpxor ymm11, ymm11, ymm8 vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] vpshufb ymm3, ymm3, ymm4 vpshufb ymm11, ymm11, ymm4 vpaddd ymm2, ymm2, ymm3 vpaddd ymm10, ymm10, ymm11 vpxor ymm1, ymm1, ymm2 vpxor ymm9, ymm9, ymm10 vpsrld ymm4, ymm1, 7 vpslld ymm1, ymm1, 25 vpor ymm1, ymm1, ymm4 vpsrld ymm4, ymm9, 7 vpslld ymm9, ymm9, 25 vpor ymm9, ymm9, ymm4 vpshufd ymm0, ymm0, 0x39 vpshufd ymm8, ymm8, 0x39 vpshufd ymm3, ymm3, 0x4E vpshufd ymm11, ymm11, 0x4E vpshufd ymm2, ymm2, 0x93 vpshufd ymm10, ymm10, 0x93 dec al je 9f vmovdqa ymm4, ymmword ptr [rsp+0x40] vmovdqa ymm5, ymmword ptr [rsp+0x80] vshufps ymm12, ymm4, ymm5, 214 vpshufd ymm13, ymm4, 0x0F vpshufd ymm4, ymm12, 0x39 vshufps ymm12, ymm6, ymm7, 250 vpblendd ymm13, ymm13, ymm12, 0xAA vpunpcklqdq ymm12, ymm7, ymm5 vpblendd ymm12, ymm12, ymm6, 0x88 vpshufd ymm12, ymm12, 0x78 vpunpckhdq ymm5, ymm5, ymm7 vpunpckldq ymm6, ymm6, ymm5 vpshufd ymm7, ymm6, 0x1E vmovdqa ymmword ptr [rsp+0x40], ymm13 vmovdqa ymmword ptr [rsp+0x80], ymm12 vmovdqa ymm12, ymmword ptr [rsp+0x60] vmovdqa ymm13, ymmword ptr [rsp+0xA0] vshufps ymm5, ymm12, ymm13, 214 vpshufd ymm6, ymm12, 0x0F vpshufd ymm12, ymm5, 0x39 vshufps ymm5, ymm14, ymm15, 250 vpblendd ymm6, ymm6, ymm5, 0xAA vpunpcklqdq ymm5, ymm15, ymm13 vpblendd ymm5, ymm5, ymm14, 0x88 vpshufd ymm5, ymm5, 0x78 vpunpckhdq ymm13, ymm13, ymm15 vpunpckldq ymm14, ymm14, ymm13 vpshufd ymm15, ymm14, 0x1E vmovdqa ymm13, ymm6 vmovdqa ymm14, ymm5 vmovdqa ymm5, ymmword ptr [rsp+0x40] vmovdqa ymm6, ymmword ptr [rsp+0x80] jmp 9b 9: vpxor ymm0, ymm0, ymm2 vpxor ymm1, 
ymm1, ymm3 vpxor ymm8, ymm8, ymm10 vpxor ymm9, ymm9, ymm11 mov eax, r13d cmp rdx, r15 jne 2b vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+0x10], xmm1 vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 vmovdqu xmmword ptr [rbx+0x40], xmm8 vmovdqu xmmword ptr [rbx+0x50], xmm9 vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01 vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01 vmovaps xmm8, xmmword ptr [rsp+0x280] vmovaps xmm0, xmmword ptr [rsp+0x240] vmovaps xmm1, xmmword ptr [rsp+0x250] vmovaps xmm2, xmmword ptr [rsp+0x260] vmovaps xmm3, xmmword ptr [rsp+0x270] vblendvps xmm0, xmm0, xmm1, xmm8 vblendvps xmm2, xmm2, xmm3, xmm8 vmovaps xmmword ptr [rsp+0x240], xmm0 vmovaps xmmword ptr [rsp+0x260], xmm2 add rbx, 128 add rdi, 32 sub rsi, 4 3: test rsi, 0x2 je 3f vbroadcasti128 ymm0, xmmword ptr [rcx] vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] vmovd xmm13, dword ptr [rsp+0x240] vpinsrd xmm13, xmm13, dword ptr [rsp+0x260], 1 vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 vmovd xmm14, dword ptr [rsp+0x244] vpinsrd xmm14, xmm14, dword ptr [rsp+0x264], 1 vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 vinserti128 ymm13, ymm13, xmm14, 0x01 vbroadcasti128 ymm14, xmmword ptr [ROT16+rip] vbroadcasti128 ymm15, xmmword ptr [ROT8+rip] mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx .p2align 5 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d mov dword ptr [rsp+0x200], eax vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] vpbroadcastd ymm8, dword ptr [rsp+0x200] vpblendd ymm3, ymm13, ymm8, 0x88 vmovups ymm8, ymmword ptr [r8+rdx-0x40] vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 vmovups ymm9, ymmword ptr [r8+rdx-0x30] vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 vshufps ymm4, ymm8, ymm9, 136 vshufps ymm5, ymm8, ymm9, 221 vmovups ymm8, ymmword ptr [r8+rdx-0x20] vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 
0x01 vmovups ymm9, ymmword ptr [r8+rdx-0x10] vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 vshufps ymm6, ymm8, ymm9, 136 vshufps ymm7, ymm8, ymm9, 221 vpshufd ymm6, ymm6, 0x93 vpshufd ymm7, ymm7, 0x93 mov al, 7 9: vpaddd ymm0, ymm0, ymm4 vpaddd ymm0, ymm0, ymm1 vpxor ymm3, ymm3, ymm0 vpshufb ymm3, ymm3, ymm14 vpaddd ymm2, ymm2, ymm3 vpxor ymm1, ymm1, ymm2 vpsrld ymm8, ymm1, 12 vpslld ymm1, ymm1, 20 vpor ymm1, ymm1, ymm8 vpaddd ymm0, ymm0, ymm5 vpaddd ymm0, ymm0, ymm1 vpxor ymm3, ymm3, ymm0 vpshufb ymm3, ymm3, ymm15 vpaddd ymm2, ymm2, ymm3 vpxor ymm1, ymm1, ymm2 vpsrld ymm8, ymm1, 7 vpslld ymm1, ymm1, 25 vpor ymm1, ymm1, ymm8 vpshufd ymm0, ymm0, 0x93 vpshufd ymm3, ymm3, 0x4E vpshufd ymm2, ymm2, 0x39 vpaddd ymm0, ymm0, ymm6 vpaddd ymm0, ymm0, ymm1 vpxor ymm3, ymm3, ymm0 vpshufb ymm3, ymm3, ymm14 vpaddd ymm2, ymm2, ymm3 vpxor ymm1, ymm1, ymm2 vpsrld ymm8, ymm1, 12 vpslld ymm1, ymm1, 20 vpor ymm1, ymm1, ymm8 vpaddd ymm0, ymm0, ymm7 vpaddd ymm0, ymm0, ymm1 vpxor ymm3, ymm3, ymm0 vpshufb ymm3, ymm3, ymm15 vpaddd ymm2, ymm2, ymm3 vpxor ymm1, ymm1, ymm2 vpsrld ymm8, ymm1, 7 vpslld ymm1, ymm1, 25 vpor ymm1, ymm1, ymm8 vpshufd ymm0, ymm0, 0x39 vpshufd ymm3, ymm3, 0x4E vpshufd ymm2, ymm2, 0x93 dec al jz 9f vshufps ymm8, ymm4, ymm5, 214 vpshufd ymm9, ymm4, 0x0F vpshufd ymm4, ymm8, 0x39 vshufps ymm8, ymm6, ymm7, 250 vpblendd ymm9, ymm9, ymm8, 0xAA vpunpcklqdq ymm8, ymm7, ymm5 vpblendd ymm8, ymm8, ymm6, 0x88 vpshufd ymm8, ymm8, 0x78 vpunpckhdq ymm5, ymm5, ymm7 vpunpckldq ymm6, ymm6, ymm5 vpshufd ymm7, ymm6, 0x1E vmovdqa ymm5, ymm9 vmovdqa ymm6, ymm8 jmp 9b 9: vpxor ymm0, ymm0, ymm2 vpxor ymm1, ymm1, ymm3 mov eax, r13d cmp rdx, r15 jne 2b vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+0x10], xmm1 vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 vmovaps ymm8, ymmword ptr [rsp+0x280] vmovaps ymm0, ymmword ptr [rsp+0x240] vmovups ymm1, ymmword ptr [rsp+0x248] vmovaps ymm2, ymmword ptr [rsp+0x260] vmovups ymm3, 
ymmword ptr [rsp+0x268] vblendvps ymm0, ymm0, ymm1, ymm8 vblendvps ymm2, ymm2, ymm3, ymm8 vmovaps ymmword ptr [rsp+0x240], ymm0 vmovaps ymmword ptr [rsp+0x260], ymm2 add rbx, 64 add rdi, 16 sub rsi, 2 3: test rsi, 0x1 je 4b vmovdqu xmm0, xmmword ptr [rcx] vmovdqu xmm1, xmmword ptr [rcx+0x10] vmovd xmm3, dword ptr [rsp+0x240] vpinsrd xmm3, xmm3, dword ptr [rsp+0x260], 1 vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 vmovdqa xmm14, xmmword ptr [ROT16+rip] vmovdqa xmm15, xmmword ptr [ROT8+rip] mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx .p2align 5 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip] vmovdqa xmm3, xmm13 vpinsrd xmm3, xmm3, eax, 3 vmovups xmm8, xmmword ptr [r8+rdx-0x40] vmovups xmm9, xmmword ptr [r8+rdx-0x30] vshufps xmm4, xmm8, xmm9, 136 vshufps xmm5, xmm8, xmm9, 221 vmovups xmm8, xmmword ptr [r8+rdx-0x20] vmovups xmm9, xmmword ptr [r8+rdx-0x10] vshufps xmm6, xmm8, xmm9, 136 vshufps xmm7, xmm8, xmm9, 221 vpshufd xmm6, xmm6, 0x93 vpshufd xmm7, xmm7, 0x93 mov al, 7 9: vpaddd xmm0, xmm0, xmm4 vpaddd xmm0, xmm0, xmm1 vpxor xmm3, xmm3, xmm0 vpshufb xmm3, xmm3, xmm14 vpaddd xmm2, xmm2, xmm3 vpxor xmm1, xmm1, xmm2 vpsrld xmm8, xmm1, 12 vpslld xmm1, xmm1, 20 vpor xmm1, xmm1, xmm8 vpaddd xmm0, xmm0, xmm5 vpaddd xmm0, xmm0, xmm1 vpxor xmm3, xmm3, xmm0 vpshufb xmm3, xmm3, xmm15 vpaddd xmm2, xmm2, xmm3 vpxor xmm1, xmm1, xmm2 vpsrld xmm8, xmm1, 7 vpslld xmm1, xmm1, 25 vpor xmm1, xmm1, xmm8 vpshufd xmm0, xmm0, 0x93 vpshufd xmm3, xmm3, 0x4E vpshufd xmm2, xmm2, 0x39 vpaddd xmm0, xmm0, xmm6 vpaddd xmm0, xmm0, xmm1 vpxor xmm3, xmm3, xmm0 vpshufb xmm3, xmm3, xmm14 vpaddd xmm2, xmm2, xmm3 vpxor xmm1, xmm1, xmm2 vpsrld xmm8, xmm1, 12 vpslld xmm1, xmm1, 20 vpor xmm1, xmm1, xmm8 vpaddd xmm0, xmm0, xmm7 vpaddd xmm0, xmm0, xmm1 vpxor xmm3, xmm3, xmm0 vpshufb xmm3, xmm3, xmm15 vpaddd xmm2, xmm2, xmm3 vpxor xmm1, xmm1, xmm2 vpsrld xmm8, xmm1, 7 vpslld xmm1, xmm1, 25 vpor 
xmm1, xmm1, xmm8 vpshufd xmm0, xmm0, 0x39 vpshufd xmm3, xmm3, 0x4E vpshufd xmm2, xmm2, 0x93 dec al jz 9f vshufps xmm8, xmm4, xmm5, 214 vpshufd xmm9, xmm4, 0x0F vpshufd xmm4, xmm8, 0x39 vshufps xmm8, xmm6, xmm7, 250 vpblendd xmm9, xmm9, xmm8, 0xAA vpunpcklqdq xmm8, xmm7, xmm5 vpblendd xmm8, xmm8, xmm6, 0x88 vpshufd xmm8, xmm8, 0x78 vpunpckhdq xmm5, xmm5, xmm7 vpunpckldq xmm6, xmm6, xmm5 vpshufd xmm7, xmm6, 0x1E vmovdqa xmm5, xmm9 vmovdqa xmm6, xmm8 jmp 9b 9: vpxor xmm0, xmm0, xmm2 vpxor xmm1, xmm1, xmm3 mov eax, r13d cmp rdx, r15 jne 2b vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+0x10], xmm1 jmp 4b #ifdef __APPLE__ .static_data #else .section .rodata #endif .p2align 6 ADD0: .long 0, 1, 2, 3, 4, 5, 6, 7 ADD1: .long 8, 8, 8, 8, 8, 8, 8, 8 BLAKE3_IV_0: .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 BLAKE3_IV_1: .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 BLAKE3_IV_2: .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 BLAKE3_IV_3: .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A BLAKE3_BLOCK_LEN: .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 ROT16: .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 ROT8: .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 CMP_MSB_MASK: .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 BLAKE3_IV: .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A blake3-1.5.4/c/blake3_avx2_x86-64_windows_gnu.S000064400000000000000000002022571046102023000170320ustar 00000000000000.intel_syntax noprefix .global _blake3_hash_many_avx2 .global blake3_hash_many_avx2 .section .text .p2align 6 _blake3_hash_many_avx2: blake3_hash_many_avx2: push r15 push r14 push r13 push r12 push rsi push 
rdi push rbx push rbp mov rbp, rsp sub rsp, 880 and rsp, 0xFFFFFFFFFFFFFFC0 vmovdqa xmmword ptr [rsp+0x2D0], xmm6 vmovdqa xmmword ptr [rsp+0x2E0], xmm7 vmovdqa xmmword ptr [rsp+0x2F0], xmm8 vmovdqa xmmword ptr [rsp+0x300], xmm9 vmovdqa xmmword ptr [rsp+0x310], xmm10 vmovdqa xmmword ptr [rsp+0x320], xmm11 vmovdqa xmmword ptr [rsp+0x330], xmm12 vmovdqa xmmword ptr [rsp+0x340], xmm13 vmovdqa xmmword ptr [rsp+0x350], xmm14 vmovdqa xmmword ptr [rsp+0x360], xmm15 mov rdi, rcx mov rsi, rdx mov rdx, r8 mov rcx, r9 mov r8, qword ptr [rbp+0x68] movzx r9, byte ptr [rbp+0x70] neg r9d vmovd xmm0, r9d vpbroadcastd ymm0, xmm0 vmovdqa ymmword ptr [rsp+0x260], ymm0 vpand ymm1, ymm0, ymmword ptr [ADD0+rip] vpand ymm2, ymm0, ymmword ptr [ADD1+rip] vmovdqa ymmword ptr [rsp+0x2A0], ymm2 vmovd xmm2, r8d vpbroadcastd ymm2, xmm2 vpaddd ymm2, ymm2, ymm1 vmovdqa ymmword ptr [rsp+0x220], ymm2 vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip] vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip] vpcmpgtd ymm2, ymm1, ymm2 shr r8, 32 vmovd xmm3, r8d vpbroadcastd ymm3, xmm3 vpsubd ymm3, ymm3, ymm2 vmovdqa ymmword ptr [rsp+0x240], ymm3 shl rdx, 6 mov qword ptr [rsp+0x2C0], rdx cmp rsi, 8 jc 3f 2: vpbroadcastd ymm0, dword ptr [rcx] vpbroadcastd ymm1, dword ptr [rcx+0x4] vpbroadcastd ymm2, dword ptr [rcx+0x8] vpbroadcastd ymm3, dword ptr [rcx+0xC] vpbroadcastd ymm4, dword ptr [rcx+0x10] vpbroadcastd ymm5, dword ptr [rcx+0x14] vpbroadcastd ymm6, dword ptr [rcx+0x18] vpbroadcastd ymm7, dword ptr [rcx+0x1C] mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] mov r12, qword ptr [rdi+0x20] mov r13, qword ptr [rdi+0x28] mov r14, qword ptr [rdi+0x30] mov r15, qword ptr [rdi+0x38] movzx eax, byte ptr [rbp+0x78] movzx ebx, byte ptr [rbp+0x80] or eax, ebx xor edx, edx .p2align 5 9: movzx ebx, byte ptr [rbp+0x88] or ebx, eax add rdx, 64 cmp rdx, qword ptr [rsp+0x2C0] cmove eax, ebx mov dword ptr [rsp+0x200], eax vmovups xmm8, xmmword ptr [r8+rdx-0x40] 
vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x40] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x40] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x40] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm8, ymm12, ymm14, 136 vmovaps ymmword ptr [rsp], ymm8 vshufps ymm9, ymm12, ymm14, 221 vmovaps ymmword ptr [rsp+0x20], ymm9 vshufps ymm10, ymm13, ymm15, 136 vmovaps ymmword ptr [rsp+0x40], ymm10 vshufps ymm11, ymm13, ymm15, 221 vmovaps ymmword ptr [rsp+0x60], ymm11 vmovups xmm8, xmmword ptr [r8+rdx-0x30] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x30] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x30] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x30] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm8, ymm12, ymm14, 136 vmovaps ymmword ptr [rsp+0x80], ymm8 vshufps ymm9, ymm12, ymm14, 221 vmovaps ymmword ptr [rsp+0xA0], ymm9 vshufps ymm10, ymm13, ymm15, 136 vmovaps ymmword ptr [rsp+0xC0], ymm10 vshufps ymm11, ymm13, ymm15, 221 vmovaps ymmword ptr [rsp+0xE0], ymm11 vmovups xmm8, xmmword ptr [r8+rdx-0x20] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x20] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x20] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x20] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 
vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm8, ymm12, ymm14, 136 vmovaps ymmword ptr [rsp+0x100], ymm8 vshufps ymm9, ymm12, ymm14, 221 vmovaps ymmword ptr [rsp+0x120], ymm9 vshufps ymm10, ymm13, ymm15, 136 vmovaps ymmword ptr [rsp+0x140], ymm10 vshufps ymm11, ymm13, ymm15, 221 vmovaps ymmword ptr [rsp+0x160], ymm11 vmovups xmm8, xmmword ptr [r8+rdx-0x10] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x10] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x10] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x10] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm8, ymm12, ymm14, 136 vmovaps ymmword ptr [rsp+0x180], ymm8 vshufps ymm9, ymm12, ymm14, 221 vmovaps ymmword ptr [rsp+0x1A0], ymm9 vshufps ymm10, ymm13, ymm15, 136 vmovaps ymmword ptr [rsp+0x1C0], ymm10 vshufps ymm11, ymm13, ymm15, 221 vmovaps ymmword ptr [rsp+0x1E0], ymm11 vpbroadcastd ymm15, dword ptr [rsp+0x200] prefetcht0 [r8+rdx+0x80] prefetcht0 [r12+rdx+0x80] prefetcht0 [r9+rdx+0x80] prefetcht0 [r13+rdx+0x80] prefetcht0 [r10+rdx+0x80] prefetcht0 [r14+rdx+0x80] prefetcht0 [r11+rdx+0x80] prefetcht0 [r15+rdx+0x80] vpaddd ymm0, ymm0, ymmword ptr [rsp] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm0, ymmword ptr [rsp+0x220] vpxor ymm13, ymm1, ymmword ptr [rsp+0x240] vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip] vpxor ymm15, ymm3, ymm15 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr 
[BLAKE3_IV_0+rip] vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip] vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip] vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip] vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x100] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr 
[ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] vpaddd ymm2, ymm2, ymmword ptr [rsp+0xE0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor 
ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] vpaddd ymm2, ymm2, ymmword ptr [rsp] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, 
ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] vpaddd ymm0, 
ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] vpaddd ymm2, ymm2, ymmword 
ptr [rsp+0x160] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0xA0] vpaddd ymm1, ymm1, ymmword ptr [rsp] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] 
vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, 
ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, 
ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x180] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x140] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 
vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] vpaddd ymm2, ymm2, ymmword ptr [rsp] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 7 
vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor 
ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] vpaddd ymm1, ymm1, ymmword ptr [rsp] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] vpaddd ymm2, ymm2, ymmword ptr [rsp+0xC0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, 
ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1E0] vpaddd ymm1, ymm1, ymmword ptr [rsp] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 
vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, 
ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vpxor ymm0, ymm0, ymm8 vpxor ymm1, ymm1, ymm9 vpxor ymm2, ymm2, ymm10 vpxor ymm3, ymm3, ymm11 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpxor ymm4, ymm4, ymm12 vpxor ymm5, ymm5, ymm13 vpxor ymm6, ymm6, ymm14 vpxor ymm7, ymm7, ymm15 movzx eax, byte ptr [rbp+0x78] jne 9b mov rbx, qword ptr [rbp+0x90] vunpcklps ymm8, ymm0, ymm1 vunpcklps ymm9, ymm2, ymm3 vunpckhps ymm10, ymm0, ymm1 vunpcklps ymm11, ymm4, ymm5 vunpcklps ymm0, ymm6, ymm7 vshufps ymm12, ymm8, ymm9, 78 vblendps ymm1, ymm8, ymm12, 0xCC vshufps ymm8, ymm11, ymm0, 78 vunpckhps ymm13, ymm2, ymm3 vblendps ymm2, ymm11, ymm8, 0xCC vblendps ymm3, ymm12, ymm9, 0xCC vperm2f128 ymm12, ymm1, ymm2, 0x20 vmovups ymmword ptr [rbx], ymm12 vunpckhps ymm14, ymm4, ymm5 vblendps ymm4, ymm8, ymm0, 0xCC vunpckhps ymm15, ymm6, ymm7 vperm2f128 ymm7, ymm3, ymm4, 0x20 vmovups ymmword ptr [rbx+0x20], ymm7 vshufps ymm5, ymm10, ymm13, 78 vblendps ymm6, ymm5, ymm13, 0xCC vshufps ymm13, ymm14, ymm15, 78 vblendps ymm10, ymm10, ymm5, 0xCC vblendps ymm14, ymm14, ymm13, 0xCC vperm2f128 ymm8, ymm10, ymm14, 0x20 vmovups ymmword ptr [rbx+0x40], ymm8 vblendps ymm15, ymm13, ymm15, 0xCC vperm2f128 ymm13, ymm6, ymm15, 0x20 vmovups ymmword ptr [rbx+0x60], ymm13 vperm2f128 ymm9, ymm1, ymm2, 0x31 vperm2f128 ymm11, ymm3, ymm4, 0x31 vmovups ymmword ptr [rbx+0x80], ymm9 vperm2f128 ymm14, ymm10, ymm14, 0x31 vperm2f128 ymm15, ymm6, ymm15, 0x31 vmovups ymmword ptr [rbx+0xA0], ymm11 vmovups ymmword ptr [rbx+0xC0], ymm14 vmovups ymmword ptr [rbx+0xE0], ymm15 vmovdqa ymm0, ymmword ptr [rsp+0x2A0] vpaddd ymm1, ymm0, ymmword ptr 
[rsp+0x220] vmovdqa ymmword ptr [rsp+0x220], ymm1 vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip] vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip] vpcmpgtd ymm2, ymm0, ymm2 vmovdqa ymm0, ymmword ptr [rsp+0x240] vpsubd ymm2, ymm0, ymm2 vmovdqa ymmword ptr [rsp+0x240], ymm2 add rdi, 64 add rbx, 256 mov qword ptr [rbp+0x90], rbx sub rsi, 8 cmp rsi, 8 jnc 2b test rsi, rsi jnz 3f 4: vzeroupper vmovdqa xmm6, xmmword ptr [rsp+0x2D0] vmovdqa xmm7, xmmword ptr [rsp+0x2E0] vmovdqa xmm8, xmmword ptr [rsp+0x2F0] vmovdqa xmm9, xmmword ptr [rsp+0x300] vmovdqa xmm10, xmmword ptr [rsp+0x310] vmovdqa xmm11, xmmword ptr [rsp+0x320] vmovdqa xmm12, xmmword ptr [rsp+0x330] vmovdqa xmm13, xmmword ptr [rsp+0x340] vmovdqa xmm14, xmmword ptr [rsp+0x350] vmovdqa xmm15, xmmword ptr [rsp+0x360] mov rsp, rbp pop rbp pop rbx pop rdi pop rsi pop r12 pop r13 pop r14 pop r15 ret .p2align 5 3: mov rbx, qword ptr [rbp+0x90] mov r15, qword ptr [rsp+0x2C0] movzx r13d, byte ptr [rbp+0x78] movzx r12d, byte ptr [rbp+0x88] test rsi, 0x4 je 3f vbroadcasti128 ymm0, xmmword ptr [rcx] vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] vmovdqa ymm8, ymm0 vmovdqa ymm9, ymm1 vbroadcasti128 ymm12, xmmword ptr [rsp+0x220] vbroadcasti128 ymm13, xmmword ptr [rsp+0x240] vpunpckldq ymm14, ymm12, ymm13 vpunpckhdq ymm15, ymm12, ymm13 vpermq ymm14, ymm14, 0x50 vpermq ymm15, ymm15, 0x50 vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip] vpblendd ymm14, ymm14, ymm12, 0x44 vpblendd ymm15, ymm15, ymm12, 0x44 vmovdqa ymmword ptr [rsp], ymm14 vmovdqa ymmword ptr [rsp+0x20], ymm15 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] movzx eax, byte ptr [rbp+0x80] or eax, r13d xor edx, edx .p2align 5 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d mov dword ptr [rsp+0x200], eax vmovups ymm2, ymmword ptr [r8+rdx-0x40] vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01 vmovups ymm3, ymmword ptr [r8+rdx-0x30] vinsertf128 ymm3, ymm3, xmmword ptr 
[r9+rdx-0x30], 0x01 vshufps ymm4, ymm2, ymm3, 136 vshufps ymm5, ymm2, ymm3, 221 vmovups ymm2, ymmword ptr [r8+rdx-0x20] vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01 vmovups ymm3, ymmword ptr [r8+rdx-0x10] vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01 vshufps ymm6, ymm2, ymm3, 136 vshufps ymm7, ymm2, ymm3, 221 vpshufd ymm6, ymm6, 0x93 vpshufd ymm7, ymm7, 0x93 vmovups ymm10, ymmword ptr [r10+rdx-0x40] vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01 vmovups ymm11, ymmword ptr [r10+rdx-0x30] vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01 vshufps ymm12, ymm10, ymm11, 136 vshufps ymm13, ymm10, ymm11, 221 vmovups ymm10, ymmword ptr [r10+rdx-0x20] vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x20], 0x01 vmovups ymm11, ymmword ptr [r10+rdx-0x10] vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01 vshufps ymm14, ymm10, ymm11, 136 vshufps ymm15, ymm10, ymm11, 221 vpshufd ymm14, ymm14, 0x93 vpshufd ymm15, ymm15, 0x93 vpbroadcastd ymm2, dword ptr [rsp+0x200] vmovdqa ymm3, ymmword ptr [rsp] vmovdqa ymm11, ymmword ptr [rsp+0x20] vpblendd ymm3, ymm3, ymm2, 0x88 vpblendd ymm11, ymm11, ymm2, 0x88 vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] vmovdqa ymm10, ymm2 mov al, 7 9: vpaddd ymm0, ymm0, ymm4 vpaddd ymm8, ymm8, ymm12 vmovdqa ymmword ptr [rsp+0x40], ymm4 nop vmovdqa ymmword ptr [rsp+0x60], ymm12 nop vpaddd ymm0, ymm0, ymm1 vpaddd ymm8, ymm8, ymm9 vpxor ymm3, ymm3, ymm0 vpxor ymm11, ymm11, ymm8 vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] vpshufb ymm3, ymm3, ymm4 vpshufb ymm11, ymm11, ymm4 vpaddd ymm2, ymm2, ymm3 vpaddd ymm10, ymm10, ymm11 vpxor ymm1, ymm1, ymm2 vpxor ymm9, ymm9, ymm10 vpsrld ymm4, ymm1, 12 vpslld ymm1, ymm1, 20 vpor ymm1, ymm1, ymm4 vpsrld ymm4, ymm9, 12 vpslld ymm9, ymm9, 20 vpor ymm9, ymm9, ymm4 vpaddd ymm0, ymm0, ymm5 vpaddd ymm8, ymm8, ymm13 vpaddd ymm0, ymm0, ymm1 vpaddd ymm8, ymm8, ymm9 vmovdqa ymmword ptr [rsp+0x80], ymm5 vmovdqa ymmword ptr [rsp+0xA0], ymm13 vpxor ymm3, ymm3, ymm0 vpxor ymm11, 
ymm11, ymm8 vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] vpshufb ymm3, ymm3, ymm4 vpshufb ymm11, ymm11, ymm4 vpaddd ymm2, ymm2, ymm3 vpaddd ymm10, ymm10, ymm11 vpxor ymm1, ymm1, ymm2 vpxor ymm9, ymm9, ymm10 vpsrld ymm4, ymm1, 7 vpslld ymm1, ymm1, 25 vpor ymm1, ymm1, ymm4 vpsrld ymm4, ymm9, 7 vpslld ymm9, ymm9, 25 vpor ymm9, ymm9, ymm4 vpshufd ymm0, ymm0, 0x93 vpshufd ymm8, ymm8, 0x93 vpshufd ymm3, ymm3, 0x4E vpshufd ymm11, ymm11, 0x4E vpshufd ymm2, ymm2, 0x39 vpshufd ymm10, ymm10, 0x39 vpaddd ymm0, ymm0, ymm6 vpaddd ymm8, ymm8, ymm14 vpaddd ymm0, ymm0, ymm1 vpaddd ymm8, ymm8, ymm9 vpxor ymm3, ymm3, ymm0 vpxor ymm11, ymm11, ymm8 vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] vpshufb ymm3, ymm3, ymm4 vpshufb ymm11, ymm11, ymm4 vpaddd ymm2, ymm2, ymm3 vpaddd ymm10, ymm10, ymm11 vpxor ymm1, ymm1, ymm2 vpxor ymm9, ymm9, ymm10 vpsrld ymm4, ymm1, 12 vpslld ymm1, ymm1, 20 vpor ymm1, ymm1, ymm4 vpsrld ymm4, ymm9, 12 vpslld ymm9, ymm9, 20 vpor ymm9, ymm9, ymm4 vpaddd ymm0, ymm0, ymm7 vpaddd ymm8, ymm8, ymm15 vpaddd ymm0, ymm0, ymm1 vpaddd ymm8, ymm8, ymm9 vpxor ymm3, ymm3, ymm0 vpxor ymm11, ymm11, ymm8 vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] vpshufb ymm3, ymm3, ymm4 vpshufb ymm11, ymm11, ymm4 vpaddd ymm2, ymm2, ymm3 vpaddd ymm10, ymm10, ymm11 vpxor ymm1, ymm1, ymm2 vpxor ymm9, ymm9, ymm10 vpsrld ymm4, ymm1, 7 vpslld ymm1, ymm1, 25 vpor ymm1, ymm1, ymm4 vpsrld ymm4, ymm9, 7 vpslld ymm9, ymm9, 25 vpor ymm9, ymm9, ymm4 vpshufd ymm0, ymm0, 0x39 vpshufd ymm8, ymm8, 0x39 vpshufd ymm3, ymm3, 0x4E vpshufd ymm11, ymm11, 0x4E vpshufd ymm2, ymm2, 0x93 vpshufd ymm10, ymm10, 0x93 dec al je 9f vmovdqa ymm4, ymmword ptr [rsp+0x40] vmovdqa ymm5, ymmword ptr [rsp+0x80] vshufps ymm12, ymm4, ymm5, 214 vpshufd ymm13, ymm4, 0x0F vpshufd ymm4, ymm12, 0x39 vshufps ymm12, ymm6, ymm7, 250 vpblendd ymm13, ymm13, ymm12, 0xAA vpunpcklqdq ymm12, ymm7, ymm5 vpblendd ymm12, ymm12, ymm6, 0x88 vpshufd ymm12, ymm12, 0x78 vpunpckhdq ymm5, ymm5, ymm7 vpunpckldq ymm6, ymm6, ymm5 vpshufd ymm7, ymm6, 0x1E 
vmovdqa ymmword ptr [rsp+0x40], ymm13 vmovdqa ymmword ptr [rsp+0x80], ymm12 vmovdqa ymm12, ymmword ptr [rsp+0x60] vmovdqa ymm13, ymmword ptr [rsp+0xA0] vshufps ymm5, ymm12, ymm13, 214 vpshufd ymm6, ymm12, 0x0F vpshufd ymm12, ymm5, 0x39 vshufps ymm5, ymm14, ymm15, 250 vpblendd ymm6, ymm6, ymm5, 0xAA vpunpcklqdq ymm5, ymm15, ymm13 vpblendd ymm5, ymm5, ymm14, 0x88 vpshufd ymm5, ymm5, 0x78 vpunpckhdq ymm13, ymm13, ymm15 vpunpckldq ymm14, ymm14, ymm13 vpshufd ymm15, ymm14, 0x1E vmovdqa ymm13, ymm6 vmovdqa ymm14, ymm5 vmovdqa ymm5, ymmword ptr [rsp+0x40] vmovdqa ymm6, ymmword ptr [rsp+0x80] jmp 9b 9: vpxor ymm0, ymm0, ymm2 vpxor ymm1, ymm1, ymm3 vpxor ymm8, ymm8, ymm10 vpxor ymm9, ymm9, ymm11 mov eax, r13d cmp rdx, r15 jne 2b vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+0x10], xmm1 vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 vmovdqu xmmword ptr [rbx+0x40], xmm8 vmovdqu xmmword ptr [rbx+0x50], xmm9 vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01 vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01 vmovaps xmm8, xmmword ptr [rsp+0x260] vmovaps xmm0, xmmword ptr [rsp+0x220] vmovaps xmm1, xmmword ptr [rsp+0x230] vmovaps xmm2, xmmword ptr [rsp+0x240] vmovaps xmm3, xmmword ptr [rsp+0x250] vblendvps xmm0, xmm0, xmm1, xmm8 vblendvps xmm2, xmm2, xmm3, xmm8 vmovaps xmmword ptr [rsp+0x220], xmm0 vmovaps xmmword ptr [rsp+0x240], xmm2 add rbx, 128 add rdi, 32 sub rsi, 4 3: test rsi, 0x2 je 3f vbroadcasti128 ymm0, xmmword ptr [rcx] vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] vmovd xmm13, dword ptr [rsp+0x220] vpinsrd xmm13, xmm13, dword ptr [rsp+0x240], 1 vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 vmovd xmm14, dword ptr [rsp+0x224] vpinsrd xmm14, xmm14, dword ptr [rsp+0x244], 1 vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 vinserti128 ymm13, ymm13, xmm14, 0x01 vbroadcasti128 ymm14, xmmword ptr [ROT16+rip] vbroadcasti128 ymm15, xmmword ptr [ROT8+rip] mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] 
movzx eax, byte ptr [rbp+0x80] or eax, r13d xor edx, edx .p2align 5 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d mov dword ptr [rsp+0x200], eax vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] vpbroadcastd ymm8, dword ptr [rsp+0x200] vpblendd ymm3, ymm13, ymm8, 0x88 vmovups ymm8, ymmword ptr [r8+rdx-0x40] vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 vmovups ymm9, ymmword ptr [r8+rdx-0x30] vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 vshufps ymm4, ymm8, ymm9, 136 vshufps ymm5, ymm8, ymm9, 221 vmovups ymm8, ymmword ptr [r8+rdx-0x20] vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 vmovups ymm9, ymmword ptr [r8+rdx-0x10] vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 vshufps ymm6, ymm8, ymm9, 136 vshufps ymm7, ymm8, ymm9, 221 vpshufd ymm6, ymm6, 0x93 vpshufd ymm7, ymm7, 0x93 mov al, 7 9: vpaddd ymm0, ymm0, ymm4 vpaddd ymm0, ymm0, ymm1 vpxor ymm3, ymm3, ymm0 vpshufb ymm3, ymm3, ymm14 vpaddd ymm2, ymm2, ymm3 vpxor ymm1, ymm1, ymm2 vpsrld ymm8, ymm1, 12 vpslld ymm1, ymm1, 20 vpor ymm1, ymm1, ymm8 vpaddd ymm0, ymm0, ymm5 vpaddd ymm0, ymm0, ymm1 vpxor ymm3, ymm3, ymm0 vpshufb ymm3, ymm3, ymm15 vpaddd ymm2, ymm2, ymm3 vpxor ymm1, ymm1, ymm2 vpsrld ymm8, ymm1, 7 vpslld ymm1, ymm1, 25 vpor ymm1, ymm1, ymm8 vpshufd ymm0, ymm0, 0x93 vpshufd ymm3, ymm3, 0x4E vpshufd ymm2, ymm2, 0x39 vpaddd ymm0, ymm0, ymm6 vpaddd ymm0, ymm0, ymm1 vpxor ymm3, ymm3, ymm0 vpshufb ymm3, ymm3, ymm14 vpaddd ymm2, ymm2, ymm3 vpxor ymm1, ymm1, ymm2 vpsrld ymm8, ymm1, 12 vpslld ymm1, ymm1, 20 vpor ymm1, ymm1, ymm8 vpaddd ymm0, ymm0, ymm7 vpaddd ymm0, ymm0, ymm1 vpxor ymm3, ymm3, ymm0 vpshufb ymm3, ymm3, ymm15 vpaddd ymm2, ymm2, ymm3 vpxor ymm1, ymm1, ymm2 vpsrld ymm8, ymm1, 7 vpslld ymm1, ymm1, 25 vpor ymm1, ymm1, ymm8 vpshufd ymm0, ymm0, 0x39 vpshufd ymm3, ymm3, 0x4E vpshufd ymm2, ymm2, 0x93 dec al jz 9f vshufps ymm8, ymm4, ymm5, 214 vpshufd ymm9, ymm4, 0x0F vpshufd ymm4, ymm8, 0x39 vshufps ymm8, ymm6, ymm7, 250 vpblendd ymm9, ymm9, ymm8, 
0xAA vpunpcklqdq ymm8, ymm7, ymm5 vpblendd ymm8, ymm8, ymm6, 0x88 vpshufd ymm8, ymm8, 0x78 vpunpckhdq ymm5, ymm5, ymm7 vpunpckldq ymm6, ymm6, ymm5 vpshufd ymm7, ymm6, 0x1E vmovdqa ymm5, ymm9 vmovdqa ymm6, ymm8 jmp 9b 9: vpxor ymm0, ymm0, ymm2 vpxor ymm1, ymm1, ymm3 mov eax, r13d cmp rdx, r15 jne 2b vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+0x10], xmm1 vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 vmovaps ymm8, ymmword ptr [rsp+0x260] vmovaps ymm0, ymmword ptr [rsp+0x220] vmovups ymm1, ymmword ptr [rsp+0x228] vmovaps ymm2, ymmword ptr [rsp+0x240] vmovups ymm3, ymmword ptr [rsp+0x248] vblendvps ymm0, ymm0, ymm1, ymm8 vblendvps ymm2, ymm2, ymm3, ymm8 vmovaps ymmword ptr [rsp+0x220], ymm0 vmovaps ymmword ptr [rsp+0x240], ymm2 add rbx, 64 add rdi, 16 sub rsi, 2 3: test rsi, 0x1 je 4b vmovdqu xmm0, xmmword ptr [rcx] vmovdqu xmm1, xmmword ptr [rcx+0x10] vmovd xmm3, dword ptr [rsp+0x220] vpinsrd xmm3, xmm3, dword ptr [rsp+0x240], 1 vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 vmovdqa xmm14, xmmword ptr [ROT16+rip] vmovdqa xmm15, xmmword ptr [ROT8+rip] mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+0x80] or eax, r13d xor edx, edx .p2align 5 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip] vmovdqa xmm3, xmm13 vpinsrd xmm3, xmm3, eax, 3 vmovups xmm8, xmmword ptr [r8+rdx-0x40] vmovups xmm9, xmmword ptr [r8+rdx-0x30] vshufps xmm4, xmm8, xmm9, 136 vshufps xmm5, xmm8, xmm9, 221 vmovups xmm8, xmmword ptr [r8+rdx-0x20] vmovups xmm9, xmmword ptr [r8+rdx-0x10] vshufps xmm6, xmm8, xmm9, 136 vshufps xmm7, xmm8, xmm9, 221 vpshufd xmm6, xmm6, 0x93 vpshufd xmm7, xmm7, 0x93 mov al, 7 9: vpaddd xmm0, xmm0, xmm4 vpaddd xmm0, xmm0, xmm1 vpxor xmm3, xmm3, xmm0 vpshufb xmm3, xmm3, xmm14 vpaddd xmm2, xmm2, xmm3 vpxor xmm1, xmm1, xmm2 vpsrld xmm8, xmm1, 12 vpslld xmm1, xmm1, 20 vpor xmm1, xmm1, xmm8 vpaddd xmm0, xmm0, xmm5 vpaddd xmm0, xmm0, xmm1 vpxor 
xmm3, xmm3, xmm0 vpshufb xmm3, xmm3, xmm15 vpaddd xmm2, xmm2, xmm3 vpxor xmm1, xmm1, xmm2 vpsrld xmm8, xmm1, 7 vpslld xmm1, xmm1, 25 vpor xmm1, xmm1, xmm8 vpshufd xmm0, xmm0, 0x93 vpshufd xmm3, xmm3, 0x4E vpshufd xmm2, xmm2, 0x39 vpaddd xmm0, xmm0, xmm6 vpaddd xmm0, xmm0, xmm1 vpxor xmm3, xmm3, xmm0 vpshufb xmm3, xmm3, xmm14 vpaddd xmm2, xmm2, xmm3 vpxor xmm1, xmm1, xmm2 vpsrld xmm8, xmm1, 12 vpslld xmm1, xmm1, 20 vpor xmm1, xmm1, xmm8 vpaddd xmm0, xmm0, xmm7 vpaddd xmm0, xmm0, xmm1 vpxor xmm3, xmm3, xmm0 vpshufb xmm3, xmm3, xmm15 vpaddd xmm2, xmm2, xmm3 vpxor xmm1, xmm1, xmm2 vpsrld xmm8, xmm1, 7 vpslld xmm1, xmm1, 25 vpor xmm1, xmm1, xmm8 vpshufd xmm0, xmm0, 0x39 vpshufd xmm3, xmm3, 0x4E vpshufd xmm2, xmm2, 0x93 dec al jz 9f vshufps xmm8, xmm4, xmm5, 214 vpshufd xmm9, xmm4, 0x0F vpshufd xmm4, xmm8, 0x39 vshufps xmm8, xmm6, xmm7, 250 vpblendd xmm9, xmm9, xmm8, 0xAA vpunpcklqdq xmm8, xmm7, xmm5 vpblendd xmm8, xmm8, xmm6, 0x88 vpshufd xmm8, xmm8, 0x78 vpunpckhdq xmm5, xmm5, xmm7 vpunpckldq xmm6, xmm6, xmm5 vpshufd xmm7, xmm6, 0x1E vmovdqa xmm5, xmm9 vmovdqa xmm6, xmm8 jmp 9b 9: vpxor xmm0, xmm0, xmm2 vpxor xmm1, xmm1, xmm3 mov eax, r13d cmp rdx, r15 jne 2b vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+0x10], xmm1 jmp 4b .section .rdata .p2align 6 ADD0: .long 0, 1, 2, 3, 4, 5, 6, 7 ADD1: .long 8, 8, 8, 8, 8, 8, 8, 8 BLAKE3_IV_0: .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 BLAKE3_IV_1: .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 BLAKE3_IV_2: .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 BLAKE3_IV_3: .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A BLAKE3_BLOCK_LEN: .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 ROT16: .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 
14, 15, 12, 13 ROT8: .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 CMP_MSB_MASK: .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 BLAKE3_IV: .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A blake3-1.5.4/c/blake3_avx2_x86-64_windows_msvc.asm000064400000000000000000002006341046102023000175640ustar 00000000000000public _blake3_hash_many_avx2 public blake3_hash_many_avx2 _TEXT SEGMENT ALIGN(16) 'CODE' ALIGN 16 blake3_hash_many_avx2 PROC _blake3_hash_many_avx2 PROC push r15 push r14 push r13 push r12 push rsi push rdi push rbx push rbp mov rbp, rsp sub rsp, 880 and rsp, 0FFFFFFFFFFFFFFC0H vmovdqa xmmword ptr [rsp+2D0H], xmm6 vmovdqa xmmword ptr [rsp+2E0H], xmm7 vmovdqa xmmword ptr [rsp+2F0H], xmm8 vmovdqa xmmword ptr [rsp+300H], xmm9 vmovdqa xmmword ptr [rsp+310H], xmm10 vmovdqa xmmword ptr [rsp+320H], xmm11 vmovdqa xmmword ptr [rsp+330H], xmm12 vmovdqa xmmword ptr [rsp+340H], xmm13 vmovdqa xmmword ptr [rsp+350H], xmm14 vmovdqa xmmword ptr [rsp+360H], xmm15 mov rdi, rcx mov rsi, rdx mov rdx, r8 mov rcx, r9 mov r8, qword ptr [rbp+68H] movzx r9, byte ptr [rbp+70H] neg r9d vmovd xmm0, r9d vpbroadcastd ymm0, xmm0 vmovdqa ymmword ptr [rsp+260H], ymm0 vpand ymm1, ymm0, ymmword ptr [ADD0] vpand ymm2, ymm0, ymmword ptr [ADD1] vmovdqa ymmword ptr [rsp+2A0H], ymm2 vmovd xmm2, r8d vpbroadcastd ymm2, xmm2 vpaddd ymm2, ymm2, ymm1 vmovdqa ymmword ptr [rsp+220H], ymm2 vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK] vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK] vpcmpgtd ymm2, ymm1, ymm2 shr r8, 32 vmovd xmm3, r8d vpbroadcastd ymm3, xmm3 vpsubd ymm3, ymm3, ymm2 vmovdqa ymmword ptr [rsp+240H], ymm3 shl rdx, 6 mov qword ptr [rsp+2C0H], rdx cmp rsi, 8 jc final7blocks outerloop8: vpbroadcastd ymm0, dword ptr [rcx] vpbroadcastd ymm1, dword ptr [rcx+4H] vpbroadcastd ymm2, dword ptr [rcx+8H] vpbroadcastd ymm3, dword ptr [rcx+0CH] vpbroadcastd ymm4, dword ptr [rcx+10H] vpbroadcastd ymm5, dword ptr [rcx+14H] vpbroadcastd ymm6, 
dword ptr [rcx+18H] vpbroadcastd ymm7, dword ptr [rcx+1CH] mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+8H] mov r10, qword ptr [rdi+10H] mov r11, qword ptr [rdi+18H] mov r12, qword ptr [rdi+20H] mov r13, qword ptr [rdi+28H] mov r14, qword ptr [rdi+30H] mov r15, qword ptr [rdi+38H] movzx eax, byte ptr [rbp+78H] movzx ebx, byte ptr [rbp+80H] or eax, ebx xor edx, edx ALIGN 16 innerloop8: movzx ebx, byte ptr [rbp+88H] or ebx, eax add rdx, 64 cmp rdx, qword ptr [rsp+2C0H] cmove eax, ebx mov dword ptr [rsp+200H], eax vmovups xmm8, xmmword ptr [r8+rdx-40H] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-40H], 01H vmovups xmm9, xmmword ptr [r9+rdx-40H] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-40H], 01H vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-40H] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-40H], 01H vmovups xmm11, xmmword ptr [r11+rdx-40H] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-40H], 01H vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm8, ymm12, ymm14, 136 vmovaps ymmword ptr [rsp], ymm8 vshufps ymm9, ymm12, ymm14, 221 vmovaps ymmword ptr [rsp+20H], ymm9 vshufps ymm10, ymm13, ymm15, 136 vmovaps ymmword ptr [rsp+40H], ymm10 vshufps ymm11, ymm13, ymm15, 221 vmovaps ymmword ptr [rsp+60H], ymm11 vmovups xmm8, xmmword ptr [r8+rdx-30H] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-30H], 01H vmovups xmm9, xmmword ptr [r9+rdx-30H] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-30H], 01H vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-30H] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-30H], 01H vmovups xmm11, xmmword ptr [r11+rdx-30H] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-30H], 01H vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm8, ymm12, ymm14, 136 vmovaps ymmword ptr [rsp+80H], ymm8 vshufps ymm9, ymm12, ymm14, 221 vmovaps ymmword ptr [rsp+0A0H], ymm9 vshufps ymm10, ymm13, ymm15, 136 vmovaps ymmword ptr [rsp+0C0H], ymm10 
vshufps ymm11, ymm13, ymm15, 221 vmovaps ymmword ptr [rsp+0E0H], ymm11 vmovups xmm8, xmmword ptr [r8+rdx-20H] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-20H], 01H vmovups xmm9, xmmword ptr [r9+rdx-20H] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-20H], 01H vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-20H] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-20H], 01H vmovups xmm11, xmmword ptr [r11+rdx-20H] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-20H], 01H vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm8, ymm12, ymm14, 136 vmovaps ymmword ptr [rsp+100H], ymm8 vshufps ymm9, ymm12, ymm14, 221 vmovaps ymmword ptr [rsp+120H], ymm9 vshufps ymm10, ymm13, ymm15, 136 vmovaps ymmword ptr [rsp+140H], ymm10 vshufps ymm11, ymm13, ymm15, 221 vmovaps ymmword ptr [rsp+160H], ymm11 vmovups xmm8, xmmword ptr [r8+rdx-10H] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-10H], 01H vmovups xmm9, xmmword ptr [r9+rdx-10H] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-10H], 01H vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-10H] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-10H], 01H vmovups xmm11, xmmword ptr [r11+rdx-10H] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-10H], 01H vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm8, ymm12, ymm14, 136 vmovaps ymmword ptr [rsp+180H], ymm8 vshufps ymm9, ymm12, ymm14, 221 vmovaps ymmword ptr [rsp+1A0H], ymm9 vshufps ymm10, ymm13, ymm15, 136 vmovaps ymmword ptr [rsp+1C0H], ymm10 vshufps ymm11, ymm13, ymm15, 221 vmovaps ymmword ptr [rsp+1E0H], ymm11 vpbroadcastd ymm15, dword ptr [rsp+200H] prefetcht0 byte ptr [r8+rdx+80H] prefetcht0 byte ptr [r12+rdx+80H] prefetcht0 byte ptr [r9+rdx+80H] prefetcht0 byte ptr [r13+rdx+80H] prefetcht0 byte ptr [r10+rdx+80H] prefetcht0 byte ptr [r14+rdx+80H] prefetcht0 byte ptr [r11+rdx+80H] prefetcht0 byte ptr [r15+rdx+80H] vpaddd ymm0, ymm0, ymmword ptr [rsp] vpaddd ymm1, ymm1, 
ymmword ptr [rsp+40H] vpaddd ymm2, ymm2, ymmword ptr [rsp+80H] vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm0, ymmword ptr [rsp+220H] vpxor ymm13, ymm1, ymmword ptr [rsp+240H] vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN] vpxor ymm15, ymm3, ymm15 vbroadcasti128 ymm8, xmmword ptr [ROT16] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0] vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1] vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2] vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3] vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+20H] vpaddd ymm1, ymm1, ymmword ptr [rsp+60H] vpaddd ymm2, ymm2, ymmword ptr [rsp+0A0H] vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, 
ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+100H] vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] vpaddd ymm2, ymm2, ymmword ptr [rsp+180H] vpaddd ymm3, ymm3, ymmword ptr [rsp+1C0H] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+120H] vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] vpaddd ymm2, ymm2, ymmword ptr [rsp+1A0H] vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 
vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+40H] vpaddd ymm1, ymm1, ymmword ptr [rsp+60H] vpaddd ymm2, ymm2, ymmword ptr [rsp+0E0H] vpaddd ymm3, ymm3, ymmword ptr [rsp+80H] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0C0H] vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] vpaddd ymm2, ymm2, ymmword ptr [rsp] vpaddd ymm3, ymm3, ymmword ptr [rsp+1A0H] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, 
ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+20H] vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] vpaddd ymm2, ymm2, ymmword ptr [rsp+120H] vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+160H] vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] vpaddd ymm2, ymm2, ymmword ptr [rsp+1C0H] vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword 
ptr [rsp+200H], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+60H] vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] vpaddd ymm2, ymm2, ymmword ptr [rsp+1A0H] vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+80H] vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] vpaddd ymm2, ymm2, ymmword ptr [rsp+40H] vpaddd ymm3, ymm3, ymmword ptr [rsp+1C0H] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, 
ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0C0H] vpaddd ymm1, ymm1, ymmword ptr [rsp+120H] vpaddd ymm2, ymm2, ymmword ptr [rsp+160H] vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0A0H] vpaddd ymm1, ymm1, ymmword ptr [rsp] vpaddd ymm2, ymm2, ymmword ptr [rsp+1E0H] vpaddd ymm3, ymm3, ymmword ptr [rsp+20H] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm14 vpxor 
ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+140H] vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] vpaddd ymm2, ymm2, ymmword ptr [rsp+1C0H] vpaddd ymm3, ymm3, ymmword ptr [rsp+1A0H] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0E0H] vpaddd ymm1, ymm1, ymmword ptr [rsp+120H] vpaddd ymm2, ymm2, ymmword ptr [rsp+60H] vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm13 
vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+80H] vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] vpaddd ymm2, ymm2, ymmword ptr [rsp+0A0H] vpaddd ymm3, ymm3, ymmword ptr [rsp+20H] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp] vpaddd ymm1, ymm1, ymmword ptr [rsp+40H] vpaddd ymm2, ymm2, ymmword ptr [rsp+100H] vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, 
ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+180H] vpaddd ymm1, ymm1, ymmword ptr [rsp+120H] vpaddd ymm2, ymm2, ymmword ptr [rsp+1E0H] vpaddd ymm3, ymm3, ymmword ptr [rsp+1C0H] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+1A0H] vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] vpaddd ymm2, ymm2, ymmword ptr [rsp+140H] vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, 
ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0E0H] vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] vpaddd ymm2, ymm2, ymmword ptr [rsp] vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+40H] vpaddd ymm1, ymm1, ymmword ptr [rsp+60H] vpaddd ymm2, ymm2, ymmword ptr [rsp+20H] vpaddd ymm3, ymm3, ymmword ptr [rsp+80H] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8] vpshufb ymm15, ymm15, 
ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+120H] vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] vpaddd ymm2, ymm2, ymmword ptr [rsp+100H] vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+1C0H] vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] vpaddd ymm2, ymm2, ymmword ptr [rsp+180H] vpaddd ymm3, ymm3, ymmword ptr [rsp+20H] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 
vbroadcasti128 ymm8, xmmword ptr [ROT8] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+1A0H] vpaddd ymm1, ymm1, ymmword ptr [rsp] vpaddd ymm2, ymm2, ymmword ptr [rsp+40H] vpaddd ymm3, ymm3, ymmword ptr [rsp+80H] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+60H] vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] vpaddd ymm2, ymm2, ymmword ptr [rsp+0C0H] vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 
vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+160H] vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] vpaddd ymm2, ymm2, ymmword ptr [rsp+20H] vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+1E0H] vpaddd ymm1, ymm1, ymmword ptr [rsp] vpaddd ymm2, ymm2, ymmword ptr [rsp+120H] vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 
vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+1C0H] vpaddd ymm1, ymm1, ymmword ptr [rsp+40H] vpaddd ymm2, ymm2, ymmword ptr [rsp+60H] vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+140H] vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] vpaddd ymm2, ymm2, ymmword ptr [rsp+80H] vpaddd ymm3, ymm3, ymmword ptr [rsp+1A0H] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, 
ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vpxor ymm0, ymm0, ymm8 vpxor ymm1, ymm1, ymm9 vpxor ymm2, ymm2, ymm10 vpxor ymm3, ymm3, ymm11 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpxor ymm4, ymm4, ymm12 vpxor ymm5, ymm5, ymm13 vpxor ymm6, ymm6, ymm14 vpxor ymm7, ymm7, ymm15 movzx eax, byte ptr [rbp+78H] jne innerloop8 mov rbx, qword ptr [rbp+90H] vunpcklps ymm8, ymm0, ymm1 vunpcklps ymm9, ymm2, ymm3 vunpckhps ymm10, ymm0, ymm1 vunpcklps ymm11, ymm4, ymm5 vunpcklps ymm0, ymm6, ymm7 vshufps ymm12, ymm8, ymm9, 78 vblendps ymm1, ymm8, ymm12, 0CCH vshufps ymm8, ymm11, ymm0, 78 vunpckhps ymm13, ymm2, ymm3 vblendps ymm2, ymm11, ymm8, 0CCH vblendps ymm3, ymm12, ymm9, 0CCH vperm2f128 ymm12, ymm1, ymm2, 20H vmovups ymmword ptr [rbx], ymm12 vunpckhps ymm14, ymm4, ymm5 vblendps ymm4, ymm8, ymm0, 0CCH vunpckhps ymm15, ymm6, ymm7 vperm2f128 ymm7, ymm3, ymm4, 20H vmovups ymmword ptr [rbx+20H], ymm7 vshufps ymm5, ymm10, ymm13, 78 vblendps ymm6, ymm5, ymm13, 0CCH vshufps ymm13, ymm14, ymm15, 78 vblendps ymm10, ymm10, ymm5, 0CCH vblendps ymm14, ymm14, ymm13, 0CCH vperm2f128 ymm8, ymm10, ymm14, 20H vmovups ymmword ptr [rbx+40H], ymm8 vblendps ymm15, ymm13, ymm15, 0CCH vperm2f128 ymm13, ymm6, ymm15, 20H vmovups ymmword ptr [rbx+60H], ymm13 vperm2f128 ymm9, ymm1, ymm2, 31H vperm2f128 ymm11, ymm3, ymm4, 31H 
vmovups ymmword ptr [rbx+80H], ymm9 vperm2f128 ymm14, ymm10, ymm14, 31H vperm2f128 ymm15, ymm6, ymm15, 31H vmovups ymmword ptr [rbx+0A0H], ymm11 vmovups ymmword ptr [rbx+0C0H], ymm14 vmovups ymmword ptr [rbx+0E0H], ymm15 vmovdqa ymm0, ymmword ptr [rsp+2A0H] vpaddd ymm1, ymm0, ymmword ptr [rsp+220H] vmovdqa ymmword ptr [rsp+220H], ymm1 vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK] vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK] vpcmpgtd ymm2, ymm0, ymm2 vmovdqa ymm0, ymmword ptr [rsp+240H] vpsubd ymm2, ymm0, ymm2 vmovdqa ymmword ptr [rsp+240H], ymm2 add rdi, 64 add rbx, 256 mov qword ptr [rbp+90H], rbx sub rsi, 8 cmp rsi, 8 jnc outerloop8 test rsi, rsi jnz final7blocks unwind: vzeroupper vmovdqa xmm6, xmmword ptr [rsp+2D0H] vmovdqa xmm7, xmmword ptr [rsp+2E0H] vmovdqa xmm8, xmmword ptr [rsp+2F0H] vmovdqa xmm9, xmmword ptr [rsp+300H] vmovdqa xmm10, xmmword ptr [rsp+310H] vmovdqa xmm11, xmmword ptr [rsp+320H] vmovdqa xmm12, xmmword ptr [rsp+330H] vmovdqa xmm13, xmmword ptr [rsp+340H] vmovdqa xmm14, xmmword ptr [rsp+350H] vmovdqa xmm15, xmmword ptr [rsp+360H] mov rsp, rbp pop rbp pop rbx pop rdi pop rsi pop r12 pop r13 pop r14 pop r15 ret ALIGN 16 final7blocks: mov rbx, qword ptr [rbp+90H] mov r15, qword ptr [rsp+2C0H] movzx r13d, byte ptr [rbp+78H] movzx r12d, byte ptr [rbp+88H] test rsi, 4H je final3blocks vbroadcasti128 ymm0, xmmword ptr [rcx] vbroadcasti128 ymm1, xmmword ptr [rcx+10H] vmovdqa ymm8, ymm0 vmovdqa ymm9, ymm1 vbroadcasti128 ymm12, xmmword ptr [rsp+220H] vbroadcasti128 ymm13, xmmword ptr [rsp+240H] vpunpckldq ymm14, ymm12, ymm13 vpunpckhdq ymm15, ymm12, ymm13 vpermq ymm14, ymm14, 50H vpermq ymm15, ymm15, 50H vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN] vpblendd ymm14, ymm14, ymm12, 44H vpblendd ymm15, ymm15, ymm12, 44H vmovdqa ymmword ptr [rsp], ymm14 vmovdqa ymmword ptr [rsp+20H], ymm15 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+8H] mov r10, qword ptr [rdi+10H] mov r11, qword ptr [rdi+18H] movzx eax, byte ptr [rbp+80H] or eax, r13d xor edx, edx 
ALIGN 16 innerloop4: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d mov dword ptr [rsp+200H], eax vmovups ymm2, ymmword ptr [r8+rdx-40H] vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-40H], 01H vmovups ymm3, ymmword ptr [r8+rdx-30H] vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-30H], 01H vshufps ymm4, ymm2, ymm3, 136 vshufps ymm5, ymm2, ymm3, 221 vmovups ymm2, ymmword ptr [r8+rdx-20H] vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-20H], 01H vmovups ymm3, ymmword ptr [r8+rdx-10H] vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-10H], 01H vshufps ymm6, ymm2, ymm3, 136 vshufps ymm7, ymm2, ymm3, 221 vpshufd ymm6, ymm6, 93H vpshufd ymm7, ymm7, 93H vmovups ymm10, ymmword ptr [r10+rdx-40H] vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-40H], 01H vmovups ymm11, ymmword ptr [r10+rdx-30H] vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-30H], 01H vshufps ymm12, ymm10, ymm11, 136 vshufps ymm13, ymm10, ymm11, 221 vmovups ymm10, ymmword ptr [r10+rdx-20H] vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-20H], 01H vmovups ymm11, ymmword ptr [r10+rdx-10H] vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-10H], 01H vshufps ymm14, ymm10, ymm11, 136 vshufps ymm15, ymm10, ymm11, 221 vpshufd ymm14, ymm14, 93H vpshufd ymm15, ymm15, 93H vpbroadcastd ymm2, dword ptr [rsp+200H] vmovdqa ymm3, ymmword ptr [rsp] vmovdqa ymm11, ymmword ptr [rsp+20H] vpblendd ymm3, ymm3, ymm2, 88H vpblendd ymm11, ymm11, ymm2, 88H vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV] vmovdqa ymm10, ymm2 mov al, 7 roundloop4: vpaddd ymm0, ymm0, ymm4 vpaddd ymm8, ymm8, ymm12 vmovdqa ymmword ptr [rsp+40H], ymm4 nop vmovdqa ymmword ptr [rsp+60H], ymm12 nop vpaddd ymm0, ymm0, ymm1 vpaddd ymm8, ymm8, ymm9 vpxor ymm3, ymm3, ymm0 vpxor ymm11, ymm11, ymm8 vbroadcasti128 ymm4, xmmword ptr [ROT16] vpshufb ymm3, ymm3, ymm4 vpshufb ymm11, ymm11, ymm4 vpaddd ymm2, ymm2, ymm3 vpaddd ymm10, ymm10, ymm11 vpxor ymm1, ymm1, ymm2 vpxor ymm9, ymm9, ymm10 vpsrld ymm4, ymm1, 12 vpslld ymm1, ymm1, 20 vpor ymm1, ymm1, ymm4 vpsrld ymm4, ymm9, 
12 vpslld ymm9, ymm9, 20 vpor ymm9, ymm9, ymm4 vpaddd ymm0, ymm0, ymm5 vpaddd ymm8, ymm8, ymm13 vpaddd ymm0, ymm0, ymm1 vpaddd ymm8, ymm8, ymm9 vmovdqa ymmword ptr [rsp+80H], ymm5 vmovdqa ymmword ptr [rsp+0A0H], ymm13 vpxor ymm3, ymm3, ymm0 vpxor ymm11, ymm11, ymm8 vbroadcasti128 ymm4, xmmword ptr [ROT8] vpshufb ymm3, ymm3, ymm4 vpshufb ymm11, ymm11, ymm4 vpaddd ymm2, ymm2, ymm3 vpaddd ymm10, ymm10, ymm11 vpxor ymm1, ymm1, ymm2 vpxor ymm9, ymm9, ymm10 vpsrld ymm4, ymm1, 7 vpslld ymm1, ymm1, 25 vpor ymm1, ymm1, ymm4 vpsrld ymm4, ymm9, 7 vpslld ymm9, ymm9, 25 vpor ymm9, ymm9, ymm4 vpshufd ymm0, ymm0, 93H vpshufd ymm8, ymm8, 93H vpshufd ymm3, ymm3, 4EH vpshufd ymm11, ymm11, 4EH vpshufd ymm2, ymm2, 39H vpshufd ymm10, ymm10, 39H vpaddd ymm0, ymm0, ymm6 vpaddd ymm8, ymm8, ymm14 vpaddd ymm0, ymm0, ymm1 vpaddd ymm8, ymm8, ymm9 vpxor ymm3, ymm3, ymm0 vpxor ymm11, ymm11, ymm8 vbroadcasti128 ymm4, xmmword ptr [ROT16] vpshufb ymm3, ymm3, ymm4 vpshufb ymm11, ymm11, ymm4 vpaddd ymm2, ymm2, ymm3 vpaddd ymm10, ymm10, ymm11 vpxor ymm1, ymm1, ymm2 vpxor ymm9, ymm9, ymm10 vpsrld ymm4, ymm1, 12 vpslld ymm1, ymm1, 20 vpor ymm1, ymm1, ymm4 vpsrld ymm4, ymm9, 12 vpslld ymm9, ymm9, 20 vpor ymm9, ymm9, ymm4 vpaddd ymm0, ymm0, ymm7 vpaddd ymm8, ymm8, ymm15 vpaddd ymm0, ymm0, ymm1 vpaddd ymm8, ymm8, ymm9 vpxor ymm3, ymm3, ymm0 vpxor ymm11, ymm11, ymm8 vbroadcasti128 ymm4, xmmword ptr [ROT8] vpshufb ymm3, ymm3, ymm4 vpshufb ymm11, ymm11, ymm4 vpaddd ymm2, ymm2, ymm3 vpaddd ymm10, ymm10, ymm11 vpxor ymm1, ymm1, ymm2 vpxor ymm9, ymm9, ymm10 vpsrld ymm4, ymm1, 7 vpslld ymm1, ymm1, 25 vpor ymm1, ymm1, ymm4 vpsrld ymm4, ymm9, 7 vpslld ymm9, ymm9, 25 vpor ymm9, ymm9, ymm4 vpshufd ymm0, ymm0, 39H vpshufd ymm8, ymm8, 39H vpshufd ymm3, ymm3, 4EH vpshufd ymm11, ymm11, 4EH vpshufd ymm2, ymm2, 93H vpshufd ymm10, ymm10, 93H dec al je endroundloop4 vmovdqa ymm4, ymmword ptr [rsp+40H] vmovdqa ymm5, ymmword ptr [rsp+80H] vshufps ymm12, ymm4, ymm5, 214 vpshufd ymm13, ymm4, 0FH vpshufd ymm4, ymm12, 39H vshufps 
ymm12, ymm6, ymm7, 250 vpblendd ymm13, ymm13, ymm12, 0AAH vpunpcklqdq ymm12, ymm7, ymm5 vpblendd ymm12, ymm12, ymm6, 88H vpshufd ymm12, ymm12, 78H vpunpckhdq ymm5, ymm5, ymm7 vpunpckldq ymm6, ymm6, ymm5 vpshufd ymm7, ymm6, 1EH vmovdqa ymmword ptr [rsp+40H], ymm13 vmovdqa ymmword ptr [rsp+80H], ymm12 vmovdqa ymm12, ymmword ptr [rsp+60H] vmovdqa ymm13, ymmword ptr [rsp+0A0H] vshufps ymm5, ymm12, ymm13, 214 vpshufd ymm6, ymm12, 0FH vpshufd ymm12, ymm5, 39H vshufps ymm5, ymm14, ymm15, 250 vpblendd ymm6, ymm6, ymm5, 0AAH vpunpcklqdq ymm5, ymm15, ymm13 vpblendd ymm5, ymm5, ymm14, 88H vpshufd ymm5, ymm5, 78H vpunpckhdq ymm13, ymm13, ymm15 vpunpckldq ymm14, ymm14, ymm13 vpshufd ymm15, ymm14, 1EH vmovdqa ymm13, ymm6 vmovdqa ymm14, ymm5 vmovdqa ymm5, ymmword ptr [rsp+40H] vmovdqa ymm6, ymmword ptr [rsp+80H] jmp roundloop4 endroundloop4: vpxor ymm0, ymm0, ymm2 vpxor ymm1, ymm1, ymm3 vpxor ymm8, ymm8, ymm10 vpxor ymm9, ymm9, ymm11 mov eax, r13d cmp rdx, r15 jne innerloop4 vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+10H], xmm1 vextracti128 xmmword ptr [rbx+20H], ymm0, 01H vextracti128 xmmword ptr [rbx+30H], ymm1, 01H vmovdqu xmmword ptr [rbx+40H], xmm8 vmovdqu xmmword ptr [rbx+50H], xmm9 vextracti128 xmmword ptr [rbx+60H], ymm8, 01H vextracti128 xmmword ptr [rbx+70H], ymm9, 01H vmovaps xmm8, xmmword ptr [rsp+260H] vmovaps xmm0, xmmword ptr [rsp+220H] vmovaps xmm1, xmmword ptr [rsp+230H] vmovaps xmm2, xmmword ptr [rsp+240H] vmovaps xmm3, xmmword ptr [rsp+250H] vblendvps xmm0, xmm0, xmm1, xmm8 vblendvps xmm2, xmm2, xmm3, xmm8 vmovaps xmmword ptr [rsp+220H], xmm0 vmovaps xmmword ptr [rsp+240H], xmm2 add rbx, 128 add rdi, 32 sub rsi, 4 final3blocks: test rsi, 2H je final1blocks vbroadcasti128 ymm0, xmmword ptr [rcx] vbroadcasti128 ymm1, xmmword ptr [rcx+10H] vmovd xmm13, dword ptr [rsp+220H] vpinsrd xmm13, xmm13, dword ptr [rsp+240H], 1 vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2 vmovd xmm14, dword ptr [rsp+224H] vpinsrd xmm14, xmm14, dword ptr [rsp+244H], 1 
vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2 vinserti128 ymm13, ymm13, xmm14, 01H vbroadcasti128 ymm14, xmmword ptr [ROT16] vbroadcasti128 ymm15, xmmword ptr [ROT8] mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+8H] movzx eax, byte ptr [rbp+80H] or eax, r13d xor edx, edx ALIGN 16 innerloop2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d mov dword ptr [rsp+200H], eax vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV] vpbroadcastd ymm8, dword ptr [rsp+200H] vpblendd ymm3, ymm13, ymm8, 88H vmovups ymm8, ymmword ptr [r8+rdx-40H] vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-40H], 01H vmovups ymm9, ymmword ptr [r8+rdx-30H] vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-30H], 01H vshufps ymm4, ymm8, ymm9, 136 vshufps ymm5, ymm8, ymm9, 221 vmovups ymm8, ymmword ptr [r8+rdx-20H] vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-20H], 01H vmovups ymm9, ymmword ptr [r8+rdx-10H] vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-10H], 01H vshufps ymm6, ymm8, ymm9, 136 vshufps ymm7, ymm8, ymm9, 221 vpshufd ymm6, ymm6, 93H vpshufd ymm7, ymm7, 93H mov al, 7 roundloop2: vpaddd ymm0, ymm0, ymm4 vpaddd ymm0, ymm0, ymm1 vpxor ymm3, ymm3, ymm0 vpshufb ymm3, ymm3, ymm14 vpaddd ymm2, ymm2, ymm3 vpxor ymm1, ymm1, ymm2 vpsrld ymm8, ymm1, 12 vpslld ymm1, ymm1, 20 vpor ymm1, ymm1, ymm8 vpaddd ymm0, ymm0, ymm5 vpaddd ymm0, ymm0, ymm1 vpxor ymm3, ymm3, ymm0 vpshufb ymm3, ymm3, ymm15 vpaddd ymm2, ymm2, ymm3 vpxor ymm1, ymm1, ymm2 vpsrld ymm8, ymm1, 7 vpslld ymm1, ymm1, 25 vpor ymm1, ymm1, ymm8 vpshufd ymm0, ymm0, 93H vpshufd ymm3, ymm3, 4EH vpshufd ymm2, ymm2, 39H vpaddd ymm0, ymm0, ymm6 vpaddd ymm0, ymm0, ymm1 vpxor ymm3, ymm3, ymm0 vpshufb ymm3, ymm3, ymm14 vpaddd ymm2, ymm2, ymm3 vpxor ymm1, ymm1, ymm2 vpsrld ymm8, ymm1, 12 vpslld ymm1, ymm1, 20 vpor ymm1, ymm1, ymm8 vpaddd ymm0, ymm0, ymm7 vpaddd ymm0, ymm0, ymm1 vpxor ymm3, ymm3, ymm0 vpshufb ymm3, ymm3, ymm15 vpaddd ymm2, ymm2, ymm3 vpxor ymm1, ymm1, ymm2 vpsrld ymm8, ymm1, 7 vpslld ymm1, ymm1, 25 vpor ymm1, ymm1, ymm8 vpshufd 
ymm0, ymm0, 39H vpshufd ymm3, ymm3, 4EH vpshufd ymm2, ymm2, 93H dec al jz endroundloop2 vshufps ymm8, ymm4, ymm5, 214 vpshufd ymm9, ymm4, 0FH vpshufd ymm4, ymm8, 39H vshufps ymm8, ymm6, ymm7, 250 vpblendd ymm9, ymm9, ymm8, 0AAH vpunpcklqdq ymm8, ymm7, ymm5 vpblendd ymm8, ymm8, ymm6, 88H vpshufd ymm8, ymm8, 78H vpunpckhdq ymm5, ymm5, ymm7 vpunpckldq ymm6, ymm6, ymm5 vpshufd ymm7, ymm6, 1EH vmovdqa ymm5, ymm9 vmovdqa ymm6, ymm8 jmp roundloop2 endroundloop2: vpxor ymm0, ymm0, ymm2 vpxor ymm1, ymm1, ymm3 mov eax, r13d cmp rdx, r15 jne innerloop2 vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+10H], xmm1 vextracti128 xmmword ptr [rbx+20H], ymm0, 01H vextracti128 xmmword ptr [rbx+30H], ymm1, 01H vmovaps ymm8, ymmword ptr [rsp+260H] vmovaps ymm0, ymmword ptr [rsp+220H] vmovups ymm1, ymmword ptr [rsp+228H] vmovaps ymm2, ymmword ptr [rsp+240H] vmovups ymm3, ymmword ptr [rsp+248H] vblendvps ymm0, ymm0, ymm1, ymm8 vblendvps ymm2, ymm2, ymm3, ymm8 vmovaps ymmword ptr [rsp+220H], ymm0 vmovaps ymmword ptr [rsp+240H], ymm2 add rbx, 64 add rdi, 16 sub rsi, 2 final1blocks: test rsi, 1H je unwind vmovdqu xmm0, xmmword ptr [rcx] vmovdqu xmm1, xmmword ptr [rcx+10H] vmovd xmm3, dword ptr [rsp+220H] vpinsrd xmm3, xmm3, dword ptr [rsp+240H], 1 vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN], 2 vmovdqa xmm14, xmmword ptr [ROT16] vmovdqa xmm15, xmmword ptr [ROT8] mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+80H] or eax, r13d xor edx, edx ALIGN 16 innerloop1: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d vmovdqa xmm2, xmmword ptr [BLAKE3_IV] vmovdqa xmm3, xmm13 vpinsrd xmm3, xmm3, eax, 3 vmovups xmm8, xmmword ptr [r8+rdx-40H] vmovups xmm9, xmmword ptr [r8+rdx-30H] vshufps xmm4, xmm8, xmm9, 136 vshufps xmm5, xmm8, xmm9, 221 vmovups xmm8, xmmword ptr [r8+rdx-20H] vmovups xmm9, xmmword ptr [r8+rdx-10H] vshufps xmm6, xmm8, xmm9, 136 vshufps xmm7, xmm8, xmm9, 221 vpshufd xmm6, xmm6, 93H vpshufd xmm7, xmm7, 93H mov al, 7 roundloop1: vpaddd xmm0, xmm0, xmm4 
vpaddd xmm0, xmm0, xmm1 vpxor xmm3, xmm3, xmm0 vpshufb xmm3, xmm3, xmm14 vpaddd xmm2, xmm2, xmm3 vpxor xmm1, xmm1, xmm2 vpsrld xmm8, xmm1, 12 vpslld xmm1, xmm1, 20 vpor xmm1, xmm1, xmm8 vpaddd xmm0, xmm0, xmm5 vpaddd xmm0, xmm0, xmm1 vpxor xmm3, xmm3, xmm0 vpshufb xmm3, xmm3, xmm15 vpaddd xmm2, xmm2, xmm3 vpxor xmm1, xmm1, xmm2 vpsrld xmm8, xmm1, 7 vpslld xmm1, xmm1, 25 vpor xmm1, xmm1, xmm8 vpshufd xmm0, xmm0, 93H vpshufd xmm3, xmm3, 4EH vpshufd xmm2, xmm2, 39H vpaddd xmm0, xmm0, xmm6 vpaddd xmm0, xmm0, xmm1 vpxor xmm3, xmm3, xmm0 vpshufb xmm3, xmm3, xmm14 vpaddd xmm2, xmm2, xmm3 vpxor xmm1, xmm1, xmm2 vpsrld xmm8, xmm1, 12 vpslld xmm1, xmm1, 20 vpor xmm1, xmm1, xmm8 vpaddd xmm0, xmm0, xmm7 vpaddd xmm0, xmm0, xmm1 vpxor xmm3, xmm3, xmm0 vpshufb xmm3, xmm3, xmm15 vpaddd xmm2, xmm2, xmm3 vpxor xmm1, xmm1, xmm2 vpsrld xmm8, xmm1, 7 vpslld xmm1, xmm1, 25 vpor xmm1, xmm1, xmm8 vpshufd xmm0, xmm0, 39H vpshufd xmm3, xmm3, 4EH vpshufd xmm2, xmm2, 93H dec al jz endroundloop1 vshufps xmm8, xmm4, xmm5, 214 vpshufd xmm9, xmm4, 0FH vpshufd xmm4, xmm8, 39H vshufps xmm8, xmm6, xmm7, 250 vpblendd xmm9, xmm9, xmm8, 0AAH vpunpcklqdq xmm8, xmm7, xmm5 vpblendd xmm8, xmm8, xmm6, 88H vpshufd xmm8, xmm8, 78H vpunpckhdq xmm5, xmm5, xmm7 vpunpckldq xmm6, xmm6, xmm5 vpshufd xmm7, xmm6, 1EH vmovdqa xmm5, xmm9 vmovdqa xmm6, xmm8 jmp roundloop1 endroundloop1: vpxor xmm0, xmm0, xmm2 vpxor xmm1, xmm1, xmm3 mov eax, r13d cmp rdx, r15 jne innerloop1 vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+10H], xmm1 jmp unwind _blake3_hash_many_avx2 ENDP blake3_hash_many_avx2 ENDP _TEXT ENDS _RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' ALIGN 64 ADD0: dd 0, 1, 2, 3, 4, 5, 6, 7 ADD1: dd 8 dup (8) BLAKE3_IV_0: dd 8 dup (6A09E667H) BLAKE3_IV_1: dd 8 dup (0BB67AE85H) BLAKE3_IV_2: dd 8 dup (3C6EF372H) BLAKE3_IV_3: dd 8 dup (0A54FF53AH) BLAKE3_BLOCK_LEN: dd 8 dup (64) ROT16: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 ROT8: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 
CMP_MSB_MASK: dd 8 dup(80000000H) BLAKE3_IV: dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH _RDATA ENDS END blake3-1.5.4/c/blake3_avx512.c000064400000000000000000001525001046102023000136340ustar 00000000000000#include "blake3_impl.h" #include #define _mm_shuffle_ps2(a, b, c) \ (_mm_castps_si128( \ _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) INLINE __m128i loadu_128(const uint8_t src[16]) { return _mm_loadu_si128((void*)src); } INLINE __m256i loadu_256(const uint8_t src[32]) { return _mm256_loadu_si256((void*)src); } INLINE __m512i loadu_512(const uint8_t src[64]) { return _mm512_loadu_si512((void*)src); } INLINE void storeu_128(__m128i src, uint8_t dest[16]) { _mm_storeu_si128((void*)dest, src); } INLINE void storeu_256(__m256i src, uint8_t dest[16]) { _mm256_storeu_si256((void*)dest, src); } INLINE void storeu_512(__m512i src, uint8_t dest[16]) { _mm512_storeu_si512((void*)dest, src); } INLINE __m128i add_128(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } INLINE __m256i add_256(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); } INLINE __m512i add_512(__m512i a, __m512i b) { return _mm512_add_epi32(a, b); } INLINE __m128i xor_128(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } INLINE __m256i xor_256(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); } INLINE __m512i xor_512(__m512i a, __m512i b) { return _mm512_xor_si512(a, b); } INLINE __m128i set1_128(uint32_t x) { return _mm_set1_epi32((int32_t)x); } INLINE __m256i set1_256(uint32_t x) { return _mm256_set1_epi32((int32_t)x); } INLINE __m512i set1_512(uint32_t x) { return _mm512_set1_epi32((int32_t)x); } INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); } INLINE __m128i rot16_128(__m128i x) { return _mm_ror_epi32(x, 16); } INLINE __m256i rot16_256(__m256i x) { return _mm256_ror_epi32(x, 16); } INLINE __m512i rot16_512(__m512i x) { return _mm512_ror_epi32(x, 16); } INLINE __m128i 
rot12_128(__m128i x) { return _mm_ror_epi32(x, 12); } INLINE __m256i rot12_256(__m256i x) { return _mm256_ror_epi32(x, 12); } INLINE __m512i rot12_512(__m512i x) { return _mm512_ror_epi32(x, 12); } INLINE __m128i rot8_128(__m128i x) { return _mm_ror_epi32(x, 8); } INLINE __m256i rot8_256(__m256i x) { return _mm256_ror_epi32(x, 8); } INLINE __m512i rot8_512(__m512i x) { return _mm512_ror_epi32(x, 8); } INLINE __m128i rot7_128(__m128i x) { return _mm_ror_epi32(x, 7); } INLINE __m256i rot7_256(__m256i x) { return _mm256_ror_epi32(x, 7); } INLINE __m512i rot7_512(__m512i x) { return _mm512_ror_epi32(x, 7); } /* * ---------------------------------------------------------------------------- * compress_avx512 * ---------------------------------------------------------------------------- */ INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m) { *row0 = add_128(add_128(*row0, m), *row1); *row3 = xor_128(*row3, *row0); *row3 = rot16_128(*row3); *row2 = add_128(*row2, *row3); *row1 = xor_128(*row1, *row2); *row1 = rot12_128(*row1); } INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m) { *row0 = add_128(add_128(*row0, m), *row1); *row3 = xor_128(*row3, *row0); *row3 = rot8_128(*row3); *row2 = add_128(*row2, *row3); *row1 = xor_128(*row1, *row2); *row1 = rot7_128(*row1); } // Note the optimization here of leaving row1 as the unrotated row, rather than // row0. All the message loads below are adjusted to compensate for this. 
See // discussion at https://github.com/sneves/blake2-avx2/pull/4 INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); } INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); } INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags) { rows[0] = loadu_128((uint8_t *)&cv[0]); rows[1] = loadu_128((uint8_t *)&cv[4]); rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); rows[3] = set4(counter_low(counter), counter_high(counter), (uint32_t)block_len, (uint32_t)flags); __m128i m0 = loadu_128(&block[sizeof(__m128i) * 0]); __m128i m1 = loadu_128(&block[sizeof(__m128i) * 1]); __m128i m2 = loadu_128(&block[sizeof(__m128i) * 2]); __m128i m3 = loadu_128(&block[sizeof(__m128i) * 3]); __m128i t0, t1, t2, t3, tt; // Round 1. The first round permutes the message words from the original // input order, into the groups that get mixed in parallel. 
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); //  6  4  2  0
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); //  7  5  3  1
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10  8
  t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));   // 12 10  8 14
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11  9
  t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));   // 13 11  9 15
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  // The permuted words become the input of the next round.
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 2. This round and all following rounds apply a fixed permutation
  // to the message words from the round before.
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 3 (same permutation and mixing sequence as round 2)
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 4
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 5
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 6
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 7 (last round: the permuted words are not saved afterwards)
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
}

// Compress one block and write the full 64-byte output to `out`:
// bytes 0..31 are rows[0]^rows[2] and rows[1]^rows[3]; bytes 32..63 are
// rows[2] and rows[3] XORed with the two halves of the input chaining value.
void blake3_compress_xof_avx512(const uint32_t cv[8],
                                const uint8_t block[BLAKE3_BLOCK_LEN],
                                uint8_t block_len, uint64_t counter,
                                uint8_t flags, uint8_t out[64]) {
  __m128i rows[4];
  compress_pre(rows, cv, block, block_len, counter, flags);
  storeu_128(xor_128(rows[0], rows[2]), &out[0]);
  storeu_128(xor_128(rows[1], rows[3]), &out[16]);
  storeu_128(xor_128(rows[2], loadu_128((uint8_t *)&cv[0])), &out[32]);
  storeu_128(xor_128(rows[3], loadu_128((uint8_t *)&cv[4])), &out[48]);
}

// Compress one block and overwrite `cv` with the new 8-word chaining value
// (rows[0]^rows[2] followed by rows[1]^rows[3]).
void blake3_compress_in_place_avx512(uint32_t cv[8],
                                     const uint8_t block[BLAKE3_BLOCK_LEN],
                                     uint8_t block_len, uint64_t counter,
                                     uint8_t flags) {
  __m128i rows[4];
  compress_pre(rows, cv, block, block_len, counter, flags);
  storeu_128(xor_128(rows[0], rows[2]), (uint8_t *)&cv[0]);
  storeu_128(xor_128(rows[1], rows[3]), (uint8_t *)&cv[4]);
}

/*
 * ----------------------------------------------------------------------------
 * hash4_avx512
 * ----------------------------------------------------------------------------
 */

// One round over a 16-vector state `v`, where each 128-bit vector holds the
// same state word for four independent lanes. `m` holds the 16 message-word
// vectors and `r` selects the row of MSG_SCHEDULE that orders them.
INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r) {
  // Column step: mix (v[i], v[i+4], v[i+8], v[i+12]) for i = 0..3, using the
  // schedule entries 0..7.
  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
  v[0] = add_128(v[0], v[4]);
  v[1] = add_128(v[1], v[5]);
  v[2] = add_128(v[2], v[6]);
  v[3] = add_128(v[3], v[7]);
  v[12] = xor_128(v[12], v[0]);
  v[13] = xor_128(v[13], v[1]);
  v[14] = xor_128(v[14], v[2]);
  v[15] = xor_128(v[15], v[3]);
  v[12] = rot16_128(v[12]);
  v[13] = rot16_128(v[13]);
  v[14] = rot16_128(v[14]);
  v[15] = rot16_128(v[15]);
  v[8] = add_128(v[8], v[12]);
  v[9] = add_128(v[9], v[13]);
  v[10] = add_128(v[10], v[14]);
  v[11] = add_128(v[11], v[15]);
  v[4] = xor_128(v[4], v[8]);
  v[5] = xor_128(v[5], v[9]);
  v[6] = xor_128(v[6], v[10]);
  v[7] = xor_128(v[7], v[11]);
  v[4] = rot12_128(v[4]);
  v[5] = rot12_128(v[5]);
  v[6] = rot12_128(v[6]);
  v[7] = rot12_128(v[7]);
  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
  v[0] = add_128(v[0], v[4]);
  v[1] = add_128(v[1], v[5]);
  v[2] = add_128(v[2], v[6]);
  v[3] = add_128(v[3], v[7]);
  v[12] = xor_128(v[12], v[0]);
  v[13] = xor_128(v[13], v[1]);
  v[14] = xor_128(v[14], v[2]);
  v[15] = xor_128(v[15], v[3]);
  v[12] = rot8_128(v[12]);
  v[13] = rot8_128(v[13]);
  v[14] = rot8_128(v[14]);
  v[15] = rot8_128(v[15]);
  v[8] = add_128(v[8], v[12]);
  v[9] = add_128(v[9], v[13]);
  v[10] = add_128(v[10], v[14]);
  v[11] = add_128(v[11], v[15]);
  v[4] = xor_128(v[4], v[8]);
  v[5] = xor_128(v[5], v[9]);
  v[6] = xor_128(v[6], v[10]);
  v[7] = xor_128(v[7], v[11]);
  v[4] = rot7_128(v[4]);
  v[5] = rot7_128(v[5]);
  v[6] = rot7_128(v[6]);
  v[7] = rot7_128(v[7]);

  // Diagonal step: mix (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
  // (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]), using the
  // schedule entries 8..15.
  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
  v[0] = add_128(v[0], v[5]);
  v[1] = add_128(v[1], v[6]);
  v[2] = add_128(v[2], v[7]);
  v[3] = add_128(v[3], v[4]);
  v[15] = xor_128(v[15], v[0]);
  v[12] = xor_128(v[12], v[1]);
  v[13] = xor_128(v[13], v[2]);
  v[14] = xor_128(v[14], v[3]);
  v[15] = rot16_128(v[15]);
  v[12] = rot16_128(v[12]);
  v[13] = rot16_128(v[13]);
  v[14] = rot16_128(v[14]);
  v[10] = add_128(v[10], v[15]);
  v[11] = add_128(v[11], v[12]);
  v[8] = add_128(v[8], v[13]);
  v[9] = add_128(v[9], v[14]);
  v[5] = xor_128(v[5], v[10]);
  v[6] = xor_128(v[6], v[11]);
  v[7] = xor_128(v[7], v[8]);
  v[4] = xor_128(v[4], v[9]);
  v[5] = rot12_128(v[5]);
  v[6] = rot12_128(v[6]);
  v[7] = rot12_128(v[7]);
  v[4] = rot12_128(v[4]);
  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
  v[0] = add_128(v[0], v[5]);
  v[1] = add_128(v[1], v[6]);
  v[2] = add_128(v[2], v[7]);
  v[3] = add_128(v[3], v[4]);
  v[15] = xor_128(v[15], v[0]);
  v[12] = xor_128(v[12], v[1]);
  v[13] = xor_128(v[13], v[2]);
  v[14] = xor_128(v[14], v[3]);
  v[15] = rot8_128(v[15]);
  v[12] = rot8_128(v[12]);
  v[13] = rot8_128(v[13]);
  v[14] = rot8_128(v[14]);
  v[10] = add_128(v[10], v[15]);
v[11] = add_128(v[11], v[12]);
  v[8] = add_128(v[8], v[13]);
  v[9] = add_128(v[9], v[14]);
  v[5] = xor_128(v[5], v[10]);
  v[6] = xor_128(v[6], v[11]);
  v[7] = xor_128(v[7], v[8]);
  v[4] = xor_128(v[4], v[9]);
  v[5] = rot7_128(v[5]);
  v[6] = rot7_128(v[6]);
  v[7] = rot7_128(v[7]);
  v[4] = rot7_128(v[4]);
}

// Transpose a 4x4 matrix of 32-bit words held in four 128-bit vectors,
// in place.
INLINE void transpose_vecs_128(__m128i vecs[4]) {
  // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
  // 22/33. Note that this doesn't split the vector into two lanes, as the
  // AVX2 counterparts do.
  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
  __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
  __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
  __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);

  // Interleave 64-bit lanes.
  __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
  __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
  __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
  __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);

  vecs[0] = abcd_0;
  vecs[1] = abcd_1;
  vecs[2] = abcd_2;
  vecs[3] = abcd_3;
}

// Load the 64-byte block at `block_offset` from each of the four inputs and
// transpose it so that out[w] holds message word w of all four inputs.
INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
                                size_t block_offset, __m128i out[16]) {
  // out[4*k + j] = 16-byte quarter k of input j's block.
  out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(__m128i)]);
  out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
  out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
  out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
  out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(__m128i)]);
  out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
  out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
  out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
  out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(__m128i)]);
  out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
  out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
  out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
  out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(__m128i)]);
  out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
  out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
  out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
  // Prefetch each input 256 bytes past the current block offset.
  for (size_t i = 0; i < 4; ++i) {
    _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
  }
  transpose_vecs_128(&out[0]);
  transpose_vecs_128(&out[4]);
  transpose_vecs_128(&out[8]);
  transpose_vecs_128(&out[12]);
}

// Produce the low and high 32-bit halves of four 64-bit counters. With
// `increment_counter` set the counters are counter+0 .. counter+3; otherwise
// all four equal `counter` (the deltas are masked to zero).
INLINE void load_counters4(uint64_t counter, bool increment_counter,
                           __m128i *out_lo, __m128i *out_hi) {
  uint64_t mask = (increment_counter ? ~0 : 0);
  __m256i mask_vec = _mm256_set1_epi64x(mask);
  __m256i deltas = _mm256_setr_epi64x(0, 1, 2, 3);
  deltas = _mm256_and_si256(mask_vec, deltas);
  __m256i counters =
      _mm256_add_epi64(_mm256_set1_epi64x((int64_t)counter), deltas);
  // Narrow each 64-bit counter to 32 bits: once as is (low half) and once
  // after shifting right by 32 (high half).
  *out_lo = _mm256_cvtepi64_epi32(counters);
  *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32));
}

// Hash `blocks` 64-byte blocks from each of four inputs in parallel, starting
// from `key`, and store eight 16-byte vectors (128 bytes) to `out`.
// `flags_start` is OR-ed into the flags of the first block only and
// `flags_end` into those of the last block only.
static
void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks,
                         const uint32_t key[8], uint64_t counter,
                         bool increment_counter, uint8_t flags,
                         uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
  __m128i h_vecs[8] = {
      set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]),
      set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]),
  };
  __m128i counter_low_vec, counter_high_vec;
  load_counters4(counter, increment_counter, &counter_low_vec,
                 &counter_high_vec);
  uint8_t block_flags = flags | flags_start;

  for (size_t block = 0; block < blocks; block++) {
    if (block + 1 == blocks) {
      block_flags |= flags_end;
    }
    __m128i block_len_vec = set1_128(BLAKE3_BLOCK_LEN);
    __m128i block_flags_vec = set1_128(block_flags);
    __m128i msg_vecs[16];
    transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);

    __m128i v[16] = {
        h_vecs[0],       h_vecs[1],        h_vecs[2],       h_vecs[3],
        h_vecs[4],       h_vecs[5],        h_vecs[6],       h_vecs[7],
        set1_128(IV[0]), set1_128(IV[1]),  set1_128(IV[2]), set1_128(IV[3]),
        counter_low_vec, counter_high_vec, block_len_vec,   block_flags_vec,
    };
    round_fn4(v, msg_vecs, 0);
    round_fn4(v, msg_vecs, 1);
    round_fn4(v, msg_vecs, 2);
    round_fn4(v, msg_vecs, 3);
    round_fn4(v, msg_vecs, 4);
    round_fn4(v, msg_vecs, 5);
    round_fn4(v, msg_vecs, 6);
    // New chaining value: upper half of the state folded into the lower half.
    h_vecs[0] = xor_128(v[0], v[8]);
    h_vecs[1] = xor_128(v[1], v[9]);
    h_vecs[2] = xor_128(v[2], v[10]);
    h_vecs[3] = xor_128(v[3], v[11]);
    h_vecs[4] = xor_128(v[4], v[12]);
    h_vecs[5] = xor_128(v[5], v[13]);
    h_vecs[6] = xor_128(v[6], v[14]);
    h_vecs[7] = xor_128(v[7], v[15]);

    // Drop flags_start after the first block.
    block_flags = flags;
  }

  transpose_vecs_128(&h_vecs[0]);
  transpose_vecs_128(&h_vecs[4]);
  // The first four vecs now contain the first half of each output, and the
  // second four vecs contain the second half of each output.
  storeu_128(h_vecs[0], &out[0 * sizeof(__m128i)]);
  storeu_128(h_vecs[4], &out[1 * sizeof(__m128i)]);
  storeu_128(h_vecs[1], &out[2 * sizeof(__m128i)]);
  storeu_128(h_vecs[5], &out[3 * sizeof(__m128i)]);
  storeu_128(h_vecs[2], &out[4 * sizeof(__m128i)]);
  storeu_128(h_vecs[6], &out[5 * sizeof(__m128i)]);
  storeu_128(h_vecs[3], &out[6 * sizeof(__m128i)]);
  storeu_128(h_vecs[7], &out[7 * sizeof(__m128i)]);
}

// Compress the same block under four consecutive counter values
// (counter .. counter+3) and write the four 64-byte outputs to `out`,
// back to back.
static
void blake3_xof4_avx512(const uint32_t cv[8],
                        const uint8_t block[BLAKE3_BLOCK_LEN],
                        uint8_t block_len, uint64_t counter, uint8_t flags,
                        uint8_t out[4 * 64]) {
  __m128i h_vecs[8] = {
      set1_128(cv[0]), set1_128(cv[1]), set1_128(cv[2]), set1_128(cv[3]),
      set1_128(cv[4]), set1_128(cv[5]), set1_128(cv[6]), set1_128(cv[7]),
  };
  uint32_t block_words[16];
  load_block_words(block, block_words);
  // Every lane sees the same message block, so each word is broadcast.
  __m128i msg_vecs[16];
  for (size_t i = 0; i < 16; i++) {
      msg_vecs[i] = set1_128(block_words[i]);
  }
  __m128i counter_low_vec, counter_high_vec;
  load_counters4(counter, true, &counter_low_vec, &counter_high_vec);
  __m128i block_len_vec = set1_128(block_len);
  __m128i block_flags_vec = set1_128(flags);
  __m128i v[16] = {
      h_vecs[0],       h_vecs[1],        h_vecs[2],       h_vecs[3],
      h_vecs[4],       h_vecs[5],        h_vecs[6],       h_vecs[7],
      set1_128(IV[0]), set1_128(IV[1]),  set1_128(IV[2]), set1_128(IV[3]),
      counter_low_vec, counter_high_vec, block_len_vec,   block_flags_vec,
  };
  round_fn4(v, msg_vecs, 0);
  round_fn4(v, msg_vecs, 1);
  round_fn4(v, msg_vecs, 2);
  round_fn4(v, msg_vecs, 3);
  round_fn4(v, msg_vecs, 4);
  round_fn4(v, msg_vecs, 5);
  round_fn4(v, msg_vecs, 6);
  // Output words 0..7 are v[i]^v[i+8]; words 8..15 are v[i+8]^cv[i]. The
  // second line must read v[i+8] before it is overwritten, hence the order.
  for (size_t i = 0; i < 8; i++) {
      v[i] = xor_128(v[i], v[i+8]);
      v[i+8] = xor_128(v[i+8], h_vecs[i]);
  }
  transpose_vecs_128(&v[0]);
  transpose_vecs_128(&v[4]);
  transpose_vecs_128(&v[8]);
  transpose_vecs_128(&v[12]);
  // After the transposes, v[i], v[i+4], v[i+8], v[i+12] are the four 16-byte
  // quarters of lane i's 64-byte output.
  for (size_t i = 0; i < 4; i++) {
      storeu_128(v[i+ 0], &out[(4*i+0) * sizeof(__m128i)]);
      storeu_128(v[i+ 4], &out[(4*i+1) * sizeof(__m128i)]);
      storeu_128(v[i+ 8], &out[(4*i+2) * sizeof(__m128i)]);
      storeu_128(v[i+12], &out[(4*i+3) * sizeof(__m128i)]);
  }
}

/*
 * ----------------------------------------------------------------------------
 * hash8_avx512
 * ----------------------------------------------------------------------------
 */

// One round over a 16-vector state `v`, where each 256-bit vector holds the
// same state word for eight independent lanes. `m` holds the 16 message-word
// vectors and `r` selects the row of MSG_SCHEDULE that orders them.
INLINE void round_fn8(__m256i v[16], __m256i m[16], size_t r) {
  // Column step: mix (v[i], v[i+4], v[i+8], v[i+12]) for i = 0..3, using the
  // schedule entries 0..7.
  v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
  v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
  v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
  v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
  v[0] = add_256(v[0], v[4]);
  v[1] = add_256(v[1], v[5]);
  v[2] = add_256(v[2], v[6]);
  v[3] = add_256(v[3], v[7]);
  v[12] = xor_256(v[12], v[0]);
  v[13] = xor_256(v[13], v[1]);
  v[14] = xor_256(v[14], v[2]);
  v[15] = xor_256(v[15], v[3]);
  v[12] = rot16_256(v[12]);
  v[13] = rot16_256(v[13]);
  v[14] = rot16_256(v[14]);
  v[15] = rot16_256(v[15]);
  v[8] = add_256(v[8], v[12]);
  v[9] = add_256(v[9], v[13]);
  v[10] = add_256(v[10], v[14]);
  v[11] = add_256(v[11], v[15]);
  v[4] = xor_256(v[4], v[8]);
  v[5] = xor_256(v[5], v[9]);
  v[6] = xor_256(v[6], v[10]);
  v[7] = xor_256(v[7], v[11]);
  v[4] = rot12_256(v[4]);
  v[5] = rot12_256(v[5]);
  v[6] = rot12_256(v[6]);
  v[7] = rot12_256(v[7]);
  v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
  v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
  v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
  v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
  v[0] = add_256(v[0], v[4]);
  v[1] = add_256(v[1], v[5]);
  v[2] = add_256(v[2], v[6]);
  v[3] = add_256(v[3], v[7]);
  v[12] = xor_256(v[12], v[0]);
  v[13] = xor_256(v[13], v[1]);
  v[14] = xor_256(v[14], v[2]);
  v[15] = xor_256(v[15], v[3]);
  v[12] = rot8_256(v[12]);
  v[13] = rot8_256(v[13]);
  v[14] = rot8_256(v[14]);
  v[15] = rot8_256(v[15]);
  v[8] = add_256(v[8], v[12]);
  v[9] = add_256(v[9], v[13]);
  v[10] = add_256(v[10], v[14]);
  v[11] = add_256(v[11], v[15]);
  v[4] = xor_256(v[4], v[8]);
  v[5] = xor_256(v[5], v[9]);
  v[6] = xor_256(v[6], v[10]);
  v[7] = xor_256(v[7], v[11]);
  v[4] = rot7_256(v[4]);
  v[5] = rot7_256(v[5]);
  v[6] = rot7_256(v[6]);
  v[7] = rot7_256(v[7]);

  // Diagonal step: mix (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
  // (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]), using the
  // schedule entries 8..15.
  v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
  v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
  v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
  v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
  v[0] = add_256(v[0], v[5]);
  v[1] = add_256(v[1], v[6]);
  v[2] = add_256(v[2], v[7]);
  v[3] = add_256(v[3], v[4]);
  v[15] = xor_256(v[15], v[0]);
  v[12] = xor_256(v[12], v[1]);
  v[13] = xor_256(v[13], v[2]);
  v[14] = xor_256(v[14], v[3]);
  v[15] = rot16_256(v[15]);
  v[12] = rot16_256(v[12]);
  v[13] = rot16_256(v[13]);
  v[14] = rot16_256(v[14]);
  v[10] = add_256(v[10], v[15]);
  v[11] = add_256(v[11], v[12]);
  v[8] = add_256(v[8], v[13]);
  v[9] = add_256(v[9], v[14]);
  v[5] = xor_256(v[5], v[10]);
  v[6] = xor_256(v[6], v[11]);
  v[7] = xor_256(v[7], v[8]);
  v[4] = xor_256(v[4], v[9]);
  v[5] = rot12_256(v[5]);
  v[6] = rot12_256(v[6]);
  v[7] = rot12_256(v[7]);
  v[4] = rot12_256(v[4]);
  v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
  v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
  v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
  v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
  v[0] = add_256(v[0], v[5]);
  v[1] = add_256(v[1], v[6]);
  v[2] = add_256(v[2], v[7]);
  v[3] = add_256(v[3], v[4]);
  v[15] = xor_256(v[15], v[0]);
  v[12] = xor_256(v[12], v[1]);
  v[13] = xor_256(v[13], v[2]);
  v[14] = xor_256(v[14], v[3]);
  v[15] = rot8_256(v[15]);
  v[12] = rot8_256(v[12]);
  v[13] = rot8_256(v[13]);
  v[14] = rot8_256(v[14]);
  v[10] = add_256(v[10], v[15]);
  v[11] = add_256(v[11], v[12]);
  v[8] = add_256(v[8], v[13]);
  v[9] = add_256(v[9], v[14]);
  v[5] = xor_256(v[5], v[10]);
  v[6] = xor_256(v[6], v[11]);
  v[7] = xor_256(v[7], v[8]);
  v[4] = xor_256(v[4], v[9]);
  v[5] = rot7_256(v[5]);
  v[6] = rot7_256(v[6]);
  v[7] = rot7_256(v[7]);
  v[4] = rot7_256(v[4]);
}

// Transpose an 8x8 matrix of 32-bit words held in eight 256-bit vectors,
// in place.
INLINE void transpose_vecs_256(__m256i vecs[8]) {
  // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high
  // is 22/33/66/77.
  __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]);
  __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]);
  __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]);
  __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]);
  __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]);
  __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]);
  __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
  __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);

  // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
  // 11/33.
  __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
  __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
  __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367);
  __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367);
  __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145);
  __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145);
  __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367);
  __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367);

  // Interleave 128-bit lanes.
vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20);
  vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20);
  vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20);
  vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20);
  vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31);
  vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31);
  vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31);
  vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31);
}

// Load the 64-byte block at `block_offset` from each of the eight inputs and
// transpose it so that out[w] holds message word w of all eight inputs.
INLINE void transpose_msg_vecs8(const uint8_t *const *inputs,
                                size_t block_offset, __m256i out[16]) {
  // out[8*k + j] = 32-byte half k of input j's block.
  out[0] = loadu_256(&inputs[0][block_offset + 0 * sizeof(__m256i)]);
  out[1] = loadu_256(&inputs[1][block_offset + 0 * sizeof(__m256i)]);
  out[2] = loadu_256(&inputs[2][block_offset + 0 * sizeof(__m256i)]);
  out[3] = loadu_256(&inputs[3][block_offset + 0 * sizeof(__m256i)]);
  out[4] = loadu_256(&inputs[4][block_offset + 0 * sizeof(__m256i)]);
  out[5] = loadu_256(&inputs[5][block_offset + 0 * sizeof(__m256i)]);
  out[6] = loadu_256(&inputs[6][block_offset + 0 * sizeof(__m256i)]);
  out[7] = loadu_256(&inputs[7][block_offset + 0 * sizeof(__m256i)]);
  out[8] = loadu_256(&inputs[0][block_offset + 1 * sizeof(__m256i)]);
  out[9] = loadu_256(&inputs[1][block_offset + 1 * sizeof(__m256i)]);
  out[10] = loadu_256(&inputs[2][block_offset + 1 * sizeof(__m256i)]);
  out[11] = loadu_256(&inputs[3][block_offset + 1 * sizeof(__m256i)]);
  out[12] = loadu_256(&inputs[4][block_offset + 1 * sizeof(__m256i)]);
  out[13] = loadu_256(&inputs[5][block_offset + 1 * sizeof(__m256i)]);
  out[14] = loadu_256(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
  out[15] = loadu_256(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
  // Prefetch each input 256 bytes past the current block offset.
  for (size_t i = 0; i < 8; ++i) {
    _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
  }
  transpose_vecs_256(&out[0]);
  transpose_vecs_256(&out[8]);
}

// Produce the low and high 32-bit halves of eight 64-bit counters. With
// `increment_counter` set the counters are counter+0 .. counter+7; otherwise
// all eight equal `counter` (the deltas are masked to zero).
INLINE void load_counters8(uint64_t counter, bool increment_counter,
                           __m256i *out_lo, __m256i *out_hi) {
  uint64_t mask = (increment_counter ? ~0 : 0);
  __m512i mask_vec = _mm512_set1_epi64(mask);
  __m512i deltas = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
  deltas = _mm512_and_si512(mask_vec, deltas);
  __m512i counters =
      _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas);
  // Narrow each 64-bit counter to 32 bits: once as is (low half) and once
  // after shifting right by 32 (high half).
  *out_lo = _mm512_cvtepi64_epi32(counters);
  *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32));
}

// Hash `blocks` 64-byte blocks from each of eight inputs in parallel, starting
// from `key`, and store eight 32-byte vectors (256 bytes) to `out`.
// `flags_start` is OR-ed into the flags of the first block only and
// `flags_end` into those of the last block only.
static
void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks,
                         const uint32_t key[8], uint64_t counter,
                         bool increment_counter, uint8_t flags,
                         uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
  __m256i h_vecs[8] = {
      set1_256(key[0]), set1_256(key[1]), set1_256(key[2]), set1_256(key[3]),
      set1_256(key[4]), set1_256(key[5]), set1_256(key[6]), set1_256(key[7]),
  };
  __m256i counter_low_vec, counter_high_vec;
  load_counters8(counter, increment_counter, &counter_low_vec,
                 &counter_high_vec);
  uint8_t block_flags = flags | flags_start;

  for (size_t block = 0; block < blocks; block++) {
    if (block + 1 == blocks) {
      block_flags |= flags_end;
    }
    __m256i block_len_vec = set1_256(BLAKE3_BLOCK_LEN);
    __m256i block_flags_vec = set1_256(block_flags);
    __m256i msg_vecs[16];
    transpose_msg_vecs8(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);

    __m256i v[16] = {
        h_vecs[0],       h_vecs[1],        h_vecs[2],       h_vecs[3],
        h_vecs[4],       h_vecs[5],        h_vecs[6],       h_vecs[7],
        set1_256(IV[0]), set1_256(IV[1]),  set1_256(IV[2]), set1_256(IV[3]),
        counter_low_vec, counter_high_vec, block_len_vec,   block_flags_vec,
    };
    round_fn8(v, msg_vecs, 0);
    round_fn8(v, msg_vecs, 1);
    round_fn8(v, msg_vecs, 2);
    round_fn8(v, msg_vecs, 3);
    round_fn8(v, msg_vecs, 4);
    round_fn8(v, msg_vecs, 5);
    round_fn8(v, msg_vecs, 6);
    // New chaining value: upper half of the state folded into the lower half.
    h_vecs[0] = xor_256(v[0], v[8]);
    h_vecs[1] = xor_256(v[1], v[9]);
    h_vecs[2] = xor_256(v[2], v[10]);
    h_vecs[3] = xor_256(v[3], v[11]);
    h_vecs[4] = xor_256(v[4], v[12]);
    h_vecs[5] = xor_256(v[5], v[13]);
    h_vecs[6] = xor_256(v[6], v[14]);
    h_vecs[7] = xor_256(v[7], v[15]);

    // Drop flags_start after the first block.
    block_flags = flags;
  }

  // After the transpose, h_vecs[i] holds the eight output words of input i.
  transpose_vecs_256(h_vecs);
  storeu_256(h_vecs[0], &out[0 * sizeof(__m256i)]);
storeu_256(h_vecs[1], &out[1 * sizeof(__m256i)]); storeu_256(h_vecs[2], &out[2 * sizeof(__m256i)]); storeu_256(h_vecs[3], &out[3 * sizeof(__m256i)]); storeu_256(h_vecs[4], &out[4 * sizeof(__m256i)]); storeu_256(h_vecs[5], &out[5 * sizeof(__m256i)]); storeu_256(h_vecs[6], &out[6 * sizeof(__m256i)]); storeu_256(h_vecs[7], &out[7 * sizeof(__m256i)]); } static void blake3_xof8_avx512(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[8 * 64]) { __m256i h_vecs[8] = { set1_256(cv[0]), set1_256(cv[1]), set1_256(cv[2]), set1_256(cv[3]), set1_256(cv[4]), set1_256(cv[5]), set1_256(cv[6]), set1_256(cv[7]), }; uint32_t block_words[16]; load_block_words(block, block_words); __m256i msg_vecs[16]; for (size_t i = 0; i < 16; i++) { msg_vecs[i] = set1_256(block_words[i]); } __m256i counter_low_vec, counter_high_vec; load_counters8(counter, true, &counter_low_vec, &counter_high_vec); __m256i block_len_vec = set1_256(block_len); __m256i block_flags_vec = set1_256(flags); __m256i v[16] = { h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], set1_256(IV[0]), set1_256(IV[1]), set1_256(IV[2]), set1_256(IV[3]), counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, }; round_fn8(v, msg_vecs, 0); round_fn8(v, msg_vecs, 1); round_fn8(v, msg_vecs, 2); round_fn8(v, msg_vecs, 3); round_fn8(v, msg_vecs, 4); round_fn8(v, msg_vecs, 5); round_fn8(v, msg_vecs, 6); for (size_t i = 0; i < 8; i++) { v[i] = xor_256(v[i], v[i+8]); v[i+8] = xor_256(v[i+8], h_vecs[i]); } transpose_vecs_256(&v[0]); transpose_vecs_256(&v[8]); for (size_t i = 0; i < 8; i++) { storeu_256(v[i+0], &out[(2*i+0) * sizeof(__m256i)]); storeu_256(v[i+8], &out[(2*i+1) * sizeof(__m256i)]); } } /* * ---------------------------------------------------------------------------- * hash16_avx512 * ---------------------------------------------------------------------------- */ INLINE void round_fn16(__m512i v[16], __m512i 
m[16], size_t r) { v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); v[0] = add_512(v[0], v[4]); v[1] = add_512(v[1], v[5]); v[2] = add_512(v[2], v[6]); v[3] = add_512(v[3], v[7]); v[12] = xor_512(v[12], v[0]); v[13] = xor_512(v[13], v[1]); v[14] = xor_512(v[14], v[2]); v[15] = xor_512(v[15], v[3]); v[12] = rot16_512(v[12]); v[13] = rot16_512(v[13]); v[14] = rot16_512(v[14]); v[15] = rot16_512(v[15]); v[8] = add_512(v[8], v[12]); v[9] = add_512(v[9], v[13]); v[10] = add_512(v[10], v[14]); v[11] = add_512(v[11], v[15]); v[4] = xor_512(v[4], v[8]); v[5] = xor_512(v[5], v[9]); v[6] = xor_512(v[6], v[10]); v[7] = xor_512(v[7], v[11]); v[4] = rot12_512(v[4]); v[5] = rot12_512(v[5]); v[6] = rot12_512(v[6]); v[7] = rot12_512(v[7]); v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); v[0] = add_512(v[0], v[4]); v[1] = add_512(v[1], v[5]); v[2] = add_512(v[2], v[6]); v[3] = add_512(v[3], v[7]); v[12] = xor_512(v[12], v[0]); v[13] = xor_512(v[13], v[1]); v[14] = xor_512(v[14], v[2]); v[15] = xor_512(v[15], v[3]); v[12] = rot8_512(v[12]); v[13] = rot8_512(v[13]); v[14] = rot8_512(v[14]); v[15] = rot8_512(v[15]); v[8] = add_512(v[8], v[12]); v[9] = add_512(v[9], v[13]); v[10] = add_512(v[10], v[14]); v[11] = add_512(v[11], v[15]); v[4] = xor_512(v[4], v[8]); v[5] = xor_512(v[5], v[9]); v[6] = xor_512(v[6], v[10]); v[7] = xor_512(v[7], v[11]); v[4] = rot7_512(v[4]); v[5] = rot7_512(v[5]); v[6] = rot7_512(v[6]); v[7] = rot7_512(v[7]); v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); v[0] = 
add_512(v[0], v[5]); v[1] = add_512(v[1], v[6]); v[2] = add_512(v[2], v[7]); v[3] = add_512(v[3], v[4]); v[15] = xor_512(v[15], v[0]); v[12] = xor_512(v[12], v[1]); v[13] = xor_512(v[13], v[2]); v[14] = xor_512(v[14], v[3]); v[15] = rot16_512(v[15]); v[12] = rot16_512(v[12]); v[13] = rot16_512(v[13]); v[14] = rot16_512(v[14]); v[10] = add_512(v[10], v[15]); v[11] = add_512(v[11], v[12]); v[8] = add_512(v[8], v[13]); v[9] = add_512(v[9], v[14]); v[5] = xor_512(v[5], v[10]); v[6] = xor_512(v[6], v[11]); v[7] = xor_512(v[7], v[8]); v[4] = xor_512(v[4], v[9]); v[5] = rot12_512(v[5]); v[6] = rot12_512(v[6]); v[7] = rot12_512(v[7]); v[4] = rot12_512(v[4]); v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); v[0] = add_512(v[0], v[5]); v[1] = add_512(v[1], v[6]); v[2] = add_512(v[2], v[7]); v[3] = add_512(v[3], v[4]); v[15] = xor_512(v[15], v[0]); v[12] = xor_512(v[12], v[1]); v[13] = xor_512(v[13], v[2]); v[14] = xor_512(v[14], v[3]); v[15] = rot8_512(v[15]); v[12] = rot8_512(v[12]); v[13] = rot8_512(v[13]); v[14] = rot8_512(v[14]); v[10] = add_512(v[10], v[15]); v[11] = add_512(v[11], v[12]); v[8] = add_512(v[8], v[13]); v[9] = add_512(v[9], v[14]); v[5] = xor_512(v[5], v[10]); v[6] = xor_512(v[6], v[11]); v[7] = xor_512(v[7], v[8]); v[4] = xor_512(v[4], v[9]); v[5] = rot7_512(v[5]); v[6] = rot7_512(v[6]); v[7] = rot7_512(v[7]); v[4] = rot7_512(v[4]); } // 0b10001000, or lanes a0/a2/b0/b2 in little-endian order #define LO_IMM8 0x88 INLINE __m512i unpack_lo_128(__m512i a, __m512i b) { return _mm512_shuffle_i32x4(a, b, LO_IMM8); } // 0b11011101, or lanes a1/a3/b1/b3 in little-endian order #define HI_IMM8 0xdd INLINE __m512i unpack_hi_128(__m512i a, __m512i b) { return _mm512_shuffle_i32x4(a, b, HI_IMM8); } INLINE void transpose_vecs_512(__m512i vecs[16]) { // Interleave 32-bit lanes. 
// The _0 unpack is lanes
  // 0/0/1/1/4/4/5/5/8/8/9/9/12/12/13/13, and the _2 unpack is lanes
  // 2/2/3/3/6/6/7/7/10/10/11/11/14/14/15/15.
  __m512i ab_0 = _mm512_unpacklo_epi32(vecs[0], vecs[1]);
  __m512i ab_2 = _mm512_unpackhi_epi32(vecs[0], vecs[1]);
  __m512i cd_0 = _mm512_unpacklo_epi32(vecs[2], vecs[3]);
  __m512i cd_2 = _mm512_unpackhi_epi32(vecs[2], vecs[3]);
  __m512i ef_0 = _mm512_unpacklo_epi32(vecs[4], vecs[5]);
  __m512i ef_2 = _mm512_unpackhi_epi32(vecs[4], vecs[5]);
  __m512i gh_0 = _mm512_unpacklo_epi32(vecs[6], vecs[7]);
  __m512i gh_2 = _mm512_unpackhi_epi32(vecs[6], vecs[7]);
  __m512i ij_0 = _mm512_unpacklo_epi32(vecs[8], vecs[9]);
  __m512i ij_2 = _mm512_unpackhi_epi32(vecs[8], vecs[9]);
  __m512i kl_0 = _mm512_unpacklo_epi32(vecs[10], vecs[11]);
  __m512i kl_2 = _mm512_unpackhi_epi32(vecs[10], vecs[11]);
  __m512i mn_0 = _mm512_unpacklo_epi32(vecs[12], vecs[13]);
  __m512i mn_2 = _mm512_unpackhi_epi32(vecs[12], vecs[13]);
  __m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]);
  __m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]);

  // Interleave 64-bit lanes. The _0 unpack is lanes
  // 0/0/0/0/4/4/4/4/8/8/8/8/12/12/12/12, the _1 unpack is lanes
  // 1/1/1/1/5/5/5/5/9/9/9/9/13/13/13/13, the _2 unpack is lanes
  // 2/2/2/2/6/6/6/6/10/10/10/10/14/14/14/14, and the _3 unpack is lanes
  // 3/3/3/3/7/7/7/7/11/11/11/11/15/15/15/15.
  __m512i abcd_0 = _mm512_unpacklo_epi64(ab_0, cd_0);
  __m512i abcd_1 = _mm512_unpackhi_epi64(ab_0, cd_0);
  __m512i abcd_2 = _mm512_unpacklo_epi64(ab_2, cd_2);
  __m512i abcd_3 = _mm512_unpackhi_epi64(ab_2, cd_2);
  __m512i efgh_0 = _mm512_unpacklo_epi64(ef_0, gh_0);
  __m512i efgh_1 = _mm512_unpackhi_epi64(ef_0, gh_0);
  __m512i efgh_2 = _mm512_unpacklo_epi64(ef_2, gh_2);
  __m512i efgh_3 = _mm512_unpackhi_epi64(ef_2, gh_2);
  __m512i ijkl_0 = _mm512_unpacklo_epi64(ij_0, kl_0);
  __m512i ijkl_1 = _mm512_unpackhi_epi64(ij_0, kl_0);
  __m512i ijkl_2 = _mm512_unpacklo_epi64(ij_2, kl_2);
  __m512i ijkl_3 = _mm512_unpackhi_epi64(ij_2, kl_2);
  __m512i mnop_0 = _mm512_unpacklo_epi64(mn_0, op_0);
  __m512i mnop_1 = _mm512_unpackhi_epi64(mn_0, op_0);
  __m512i mnop_2 = _mm512_unpacklo_epi64(mn_2, op_2);
  __m512i mnop_3 = _mm512_unpackhi_epi64(mn_2, op_2);

  // Interleave 128-bit lanes. The _0 unpack is
  // 0/0/0/0/8/8/8/8/0/0/0/0/8/8/8/8, the _1 unpack is
  // 1/1/1/1/9/9/9/9/1/1/1/1/9/9/9/9, and so on.
  __m512i abcdefgh_0 = unpack_lo_128(abcd_0, efgh_0);
  __m512i abcdefgh_1 = unpack_lo_128(abcd_1, efgh_1);
  __m512i abcdefgh_2 = unpack_lo_128(abcd_2, efgh_2);
  __m512i abcdefgh_3 = unpack_lo_128(abcd_3, efgh_3);
  __m512i abcdefgh_4 = unpack_hi_128(abcd_0, efgh_0);
  __m512i abcdefgh_5 = unpack_hi_128(abcd_1, efgh_1);
  __m512i abcdefgh_6 = unpack_hi_128(abcd_2, efgh_2);
  __m512i abcdefgh_7 = unpack_hi_128(abcd_3, efgh_3);
  __m512i ijklmnop_0 = unpack_lo_128(ijkl_0, mnop_0);
  __m512i ijklmnop_1 = unpack_lo_128(ijkl_1, mnop_1);
  __m512i ijklmnop_2 = unpack_lo_128(ijkl_2, mnop_2);
  __m512i ijklmnop_3 = unpack_lo_128(ijkl_3, mnop_3);
  __m512i ijklmnop_4 = unpack_hi_128(ijkl_0, mnop_0);
  __m512i ijklmnop_5 = unpack_hi_128(ijkl_1, mnop_1);
  __m512i ijklmnop_6 = unpack_hi_128(ijkl_2, mnop_2);
  __m512i ijklmnop_7 = unpack_hi_128(ijkl_3, mnop_3);

  // Interleave 128-bit lanes again for the final outputs.
  vecs[0] = unpack_lo_128(abcdefgh_0, ijklmnop_0);
  vecs[1] = unpack_lo_128(abcdefgh_1, ijklmnop_1);
  vecs[2] = unpack_lo_128(abcdefgh_2, ijklmnop_2);
  vecs[3] = unpack_lo_128(abcdefgh_3, ijklmnop_3);
  vecs[4] = unpack_lo_128(abcdefgh_4, ijklmnop_4);
  vecs[5] = unpack_lo_128(abcdefgh_5, ijklmnop_5);
  vecs[6] = unpack_lo_128(abcdefgh_6, ijklmnop_6);
  vecs[7] = unpack_lo_128(abcdefgh_7, ijklmnop_7);
  vecs[8] = unpack_hi_128(abcdefgh_0, ijklmnop_0);
  vecs[9] = unpack_hi_128(abcdefgh_1, ijklmnop_1);
  vecs[10] = unpack_hi_128(abcdefgh_2, ijklmnop_2);
  vecs[11] = unpack_hi_128(abcdefgh_3, ijklmnop_3);
  vecs[12] = unpack_hi_128(abcdefgh_4, ijklmnop_4);
  vecs[13] = unpack_hi_128(abcdefgh_5, ijklmnop_5);
  vecs[14] = unpack_hi_128(abcdefgh_6, ijklmnop_6);
  vecs[15] = unpack_hi_128(abcdefgh_7, ijklmnop_7);
}

/*
 * Load one 64-byte message block from each of 16 inputs and transpose it.
 *
 * out[i] is loaded with the 64 bytes of inputs[i] at block_offset, a prefetch
 * is issued for each input at block_offset + 256, and transpose_vecs_512 is
 * applied in place, so that afterwards out[k] holds 32-bit word k of every
 * input's block.
 */
INLINE void transpose_msg_vecs16(const uint8_t *const *inputs,
                                 size_t block_offset, __m512i out[16]) {
  out[0] = loadu_512(&inputs[0][block_offset]);
  out[1] = loadu_512(&inputs[1][block_offset]);
  out[2] = loadu_512(&inputs[2][block_offset]);
  out[3] = loadu_512(&inputs[3][block_offset]);
  out[4] = loadu_512(&inputs[4][block_offset]);
  out[5] = loadu_512(&inputs[5][block_offset]);
  out[6] = loadu_512(&inputs[6][block_offset]);
  out[7] = loadu_512(&inputs[7][block_offset]);
  out[8] = loadu_512(&inputs[8][block_offset]);
  out[9] = loadu_512(&inputs[9][block_offset]);
  out[10] = loadu_512(&inputs[10][block_offset]);
  out[11] = loadu_512(&inputs[11][block_offset]);
  out[12] = loadu_512(&inputs[12][block_offset]);
  out[13] = loadu_512(&inputs[13][block_offset]);
  out[14] = loadu_512(&inputs[14][block_offset]);
  out[15] = loadu_512(&inputs[15][block_offset]);
  /* Hint the cache about data 256 bytes ahead in every input. */
  for (size_t i = 0; i < 16; ++i) {
    _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
  }
  transpose_vecs_512(out);
}

/*
 * Build the per-lane 64-bit block counters for a 16-way operation as two
 * vectors of 32-bit words.
 *
 * Lane i represents counter + i when increment_counter is true, and counter
 * otherwise. *out_lo receives the low 32 bits of each lane's counter and
 * *out_hi the high 32 bits, with the carry out of the low word propagated by
 * hand because the addition is done in 32-bit lanes.
 */
INLINE void load_counters16(uint64_t counter, bool increment_counter,
                            __m512i *out_lo, __m512i *out_hi) {
  /* -(int32_t)true is all-ones, -(int32_t)false is zero. */
  const __m512i mask = _mm512_set1_epi32(-(int32_t)increment_counter);
  /* _mm512_set_epi32 lists lanes from highest to lowest: lane i holds i. */
  const __m512i deltas = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
                                          5, 4, 3, 2, 1, 0);
  const __m512i masked_deltas = _mm512_and_si512(deltas, mask);
  const __m512i low_words = _mm512_add_epi32(
      _mm512_set1_epi32((int32_t)counter), masked_deltas);
  // The carry bit is 1 if the high bit of the word was 1 before addition and is
  // 0 after.
  // NOTE: It would be a bit more natural to use _mm512_cmp_epu32_mask to
  // compute the carry bits here, and originally we did, but that intrinsic is
  // broken under GCC 5.4. See https://github.com/BLAKE3-team/BLAKE3/issues/271.
  // (The deltas are at most 15, so the top bit can only go from 1 to 0 when
  // the 32-bit addition wrapped around.)
  const __m512i carries = _mm512_srli_epi32(
      _mm512_andnot_si512(
          low_words, // 0 after (gets inverted by andnot)
          _mm512_set1_epi32((int32_t)counter)), // and 1 before
      31);
  const __m512i high_words = _mm512_add_epi32(
      _mm512_set1_epi32((int32_t)(counter >> 32)), carries);
  *out_lo = low_words;
  *out_hi = high_words;
}

/*
 * Run the compression function over `blocks` consecutive blocks of 16 inputs
 * at once, one input per 32-bit lane of the 512-bit vectors.
 *
 * inputs            array of at least 16 input pointers; block b of each
 *                   input is read at byte offset b * BLAKE3_BLOCK_LEN
 * blocks            number of blocks compressed per input
 * key               8 words broadcast as the initial chaining value
 * counter           base counter; lanes use counter + lane index when
 *                   increment_counter is set, otherwise all use counter
 * flags             flags applied to every block
 * flags_start       additionally OR-ed in for the first block only
 * flags_end         additionally OR-ed in for the last block only
 * out               receives 16 * 32 bytes: after the final transpose, the
 *                   8 chaining-value words of input i are stored at
 *                   out + 32 * i
 */
static void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks,
                                 const uint32_t key[8], uint64_t counter,
                                 bool increment_counter, uint8_t flags,
                                 uint8_t flags_start, uint8_t flags_end,
                                 uint8_t *out) {
  __m512i h_vecs[8] = {
      set1_512(key[0]), set1_512(key[1]), set1_512(key[2]), set1_512(key[3]),
      set1_512(key[4]), set1_512(key[5]), set1_512(key[6]), set1_512(key[7]),
  };
  __m512i counter_low_vec, counter_high_vec;
  load_counters16(counter, increment_counter, &counter_low_vec,
                  &counter_high_vec);
  uint8_t block_flags = flags | flags_start;

  for (size_t block = 0; block < blocks; block++) {
    if (block + 1 == blocks) {
      block_flags |= flags_end;
    }
    __m512i block_len_vec = set1_512(BLAKE3_BLOCK_LEN);
    __m512i block_flags_vec = set1_512(block_flags);
    __m512i msg_vecs[16];
    transpose_msg_vecs16(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);

    /* State: chaining value, first four IV words, counter, length, flags. */
    __m512i v[16] = {
        h_vecs[0],       h_vecs[1],        h_vecs[2],       h_vecs[3],
        h_vecs[4],       h_vecs[5],        h_vecs[6],       h_vecs[7],
        set1_512(IV[0]), set1_512(IV[1]),  set1_512(IV[2]), set1_512(IV[3]),
        counter_low_vec, counter_high_vec, block_len_vec,   block_flags_vec,
    };
    round_fn16(v, msg_vecs, 0);
    round_fn16(v, msg_vecs, 1);
    round_fn16(v, msg_vecs, 2);
    round_fn16(v, msg_vecs, 3);
    round_fn16(v, msg_vecs, 4);
    round_fn16(v, msg_vecs, 5);
    round_fn16(v, msg_vecs, 6);
    /* New chaining value: XOR of the two halves of the state. */
    h_vecs[0] = xor_512(v[0], v[8]);
    h_vecs[1] = xor_512(v[1], v[9]);
    h_vecs[2] = xor_512(v[2], v[10]);
    h_vecs[3] = xor_512(v[3], v[11]);
    h_vecs[4] = xor_512(v[4], v[12]);
    h_vecs[5] = xor_512(v[5], v[13]);
    h_vecs[6] = xor_512(v[6], v[14]);
    h_vecs[7] = xor_512(v[7], v[15]);

    /* Drop flags_start so that only the first block carries it. */
    block_flags = flags;
  }

  // transpose_vecs_512 operates on a 16x16 matrix of words, but we only have 8
  // state vectors. Pad the matrix with zeros. After transposition, store the
  // lower half of each vector.
  __m512i padded[16] = {
      h_vecs[0],   h_vecs[1],   h_vecs[2],   h_vecs[3],
      h_vecs[4],   h_vecs[5],   h_vecs[6],   h_vecs[7],
      set1_512(0), set1_512(0), set1_512(0), set1_512(0),
      set1_512(0), set1_512(0), set1_512(0), set1_512(0),
  };
  transpose_vecs_512(padded);
  /* (__mmask8)-1 enables all eight 32-bit elements of each 256-bit store. */
  _mm256_mask_storeu_epi32(&out[0 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[0]));
  _mm256_mask_storeu_epi32(&out[1 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[1]));
  _mm256_mask_storeu_epi32(&out[2 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[2]));
  _mm256_mask_storeu_epi32(&out[3 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[3]));
  _mm256_mask_storeu_epi32(&out[4 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[4]));
  _mm256_mask_storeu_epi32(&out[5 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[5]));
  _mm256_mask_storeu_epi32(&out[6 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[6]));
  _mm256_mask_storeu_epi32(&out[7 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[7]));
  _mm256_mask_storeu_epi32(&out[8 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[8]));
  _mm256_mask_storeu_epi32(&out[9 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[9]));
  _mm256_mask_storeu_epi32(&out[10 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[10]));
  _mm256_mask_storeu_epi32(&out[11 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[11]));
  _mm256_mask_storeu_epi32(&out[12 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[12]));
  _mm256_mask_storeu_epi32(&out[13 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[13]));
  _mm256_mask_storeu_epi32(&out[14 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[14]));
  _mm256_mask_storeu_epi32(&out[15 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[15]));
}

/*
 * Compress one block with 16 different counter values at once and write the
 * extended (64-byte) result of each compression to `out`.
 *
 * cv, the 16 message words of `block`, block_len and flags are broadcast to
 * every lane; only the counter differs per lane (counter + 0 .. counter + 15,
 * because load_counters16 is called with increment forced on).
 *
 * After the rounds the first half of the state is XOR-ed with the second
 * half, and the second half with the input cv. The full 16-vector state is
 * then transposed, so that v[i] holds the 16 output words for lane i, and is
 * stored as 16 consecutive 64-byte blocks.
 */
static void blake3_xof16_avx512(const uint32_t cv[8],
                                const uint8_t block[BLAKE3_BLOCK_LEN],
                                uint8_t block_len, uint64_t counter,
                                uint8_t flags, uint8_t out[16 * 64]) {
  __m512i h_vecs[8] = {
      set1_512(cv[0]), set1_512(cv[1]), set1_512(cv[2]), set1_512(cv[3]),
      set1_512(cv[4]), set1_512(cv[5]), set1_512(cv[6]), set1_512(cv[7]),
  };
  uint32_t block_words[16];
  load_block_words(block, block_words);
  __m512i msg_vecs[16];
  for (size_t i = 0; i < 16; i++) {
    msg_vecs[i] = set1_512(block_words[i]);
  }
  __m512i counter_low_vec, counter_high_vec;
  load_counters16(counter, true, &counter_low_vec, &counter_high_vec);
  __m512i block_len_vec = set1_512(block_len);
  __m512i block_flags_vec = set1_512(flags);
  __m512i v[16] = {
      h_vecs[0],       h_vecs[1],        h_vecs[2],       h_vecs[3],
      h_vecs[4],       h_vecs[5],        h_vecs[6],       h_vecs[7],
      set1_512(IV[0]), set1_512(IV[1]),  set1_512(IV[2]), set1_512(IV[3]),
      counter_low_vec, counter_high_vec, block_len_vec,   block_flags_vec,
  };
  round_fn16(v, msg_vecs, 0);
  round_fn16(v, msg_vecs, 1);
  round_fn16(v, msg_vecs, 2);
  round_fn16(v, msg_vecs, 3);
  round_fn16(v, msg_vecs, 4);
  round_fn16(v, msg_vecs, 5);
  round_fn16(v, msg_vecs, 6);
  /* v[i] must be updated first: the second XOR reads the old v[i+8] only. */
  for (size_t i = 0; i < 8; i++) {
    v[i] = xor_512(v[i], v[i+8]);
    v[i+8] = xor_512(v[i+8], h_vecs[i]);
  }
  transpose_vecs_512(&v[0]);
  for (size_t i = 0; i < 16; i++) {
    storeu_512(v[i], &out[i * sizeof(__m512i)]);
  }
}

/*
 * ----------------------------------------------------------------------------
 * hash_many_avx512
 *
---------------------------------------------------------------------------- */

/*
 * Compress `blocks` consecutive blocks of a single input, one after another,
 * with blake3_compress_in_place_avx512.
 *
 * The chaining value starts as a copy of `key`. Every block is compressed
 * with the same `counter` and a length of BLAKE3_BLOCK_LEN. flags_start is
 * OR-ed into the flags of the first block only and flags_end into those of
 * the last block only (both apply when blocks == 1). The final chaining value
 * is copied to `out` (BLAKE3_OUT_LEN bytes). With blocks == 0 nothing is
 * compressed and `out` receives the key bytes.
 */
INLINE void hash_one_avx512(const uint8_t *input, size_t blocks,
                            const uint32_t key[8], uint64_t counter,
                            uint8_t flags, uint8_t flags_start,
                            uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
  uint32_t cv[8];
  memcpy(cv, key, BLAKE3_KEY_LEN);
  uint8_t block_flags = flags | flags_start;
  while (blocks > 0) {
    if (blocks == 1) {
      block_flags |= flags_end;
    }
    blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, counter,
                                    block_flags);
    input = &input[BLAKE3_BLOCK_LEN];
    blocks -= 1;
    /* Drop flags_start so that only the first block carries it. */
    block_flags = flags;
  }
  memcpy(out, cv, BLAKE3_OUT_LEN);
}

/*
 * Hash `num_inputs` inputs of `blocks` blocks each, writing one
 * BLAKE3_OUT_LEN-byte result per input to `out`, in input order.
 *
 * Inputs are consumed in groups of 16, then 8, then 4 by the corresponding
 * vectorized kernels, and any remainder one at a time by hash_one_avx512.
 * After each group the input pointer array, the remaining count and `out`
 * advance by the group size; `counter` advances by the group size only when
 * increment_counter is true, so that input i is hashed with counter + i in
 * that case and with the unchanged counter otherwise.
 */
void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
                             size_t blocks, const uint32_t key[8],
                             uint64_t counter, bool increment_counter,
                             uint8_t flags, uint8_t flags_start,
                             uint8_t flags_end, uint8_t *out) {
  while (num_inputs >= 16) {
    blake3_hash16_avx512(inputs, blocks, key, counter, increment_counter, flags,
                         flags_start, flags_end, out);
    if (increment_counter) {
      counter += 16;
    }
    inputs += 16;
    num_inputs -= 16;
    out = &out[16 * BLAKE3_OUT_LEN];
  }
  /* Fewer than 16 inputs remain here, so this loop runs at most once. */
  while (num_inputs >= 8) {
    blake3_hash8_avx512(inputs, blocks, key, counter, increment_counter, flags,
                        flags_start, flags_end, out);
    if (increment_counter) {
      counter += 8;
    }
    inputs += 8;
    num_inputs -= 8;
    out = &out[8 * BLAKE3_OUT_LEN];
  }
  /* Fewer than 8 inputs remain here, so this loop runs at most once. */
  while (num_inputs >= 4) {
    blake3_hash4_avx512(inputs, blocks, key, counter, increment_counter, flags,
                        flags_start, flags_end, out);
    if (increment_counter) {
      counter += 4;
    }
    inputs += 4;
    num_inputs -= 4;
    out = &out[4 * BLAKE3_OUT_LEN];
  }
  /* Up to three leftover inputs are hashed serially. */
  while (num_inputs > 0) {
    hash_one_avx512(inputs[0], blocks, key, counter, flags, flags_start,
                    flags_end, out);
    if (increment_counter) {
      counter += 1;
    }
    inputs += 1;
    num_inputs -= 1;
    out = &out[BLAKE3_OUT_LEN];
  }
}

/*
 * Produce `outblocks` consecutive extended-output blocks for the given
 * cv/block/block_len/flags, starting at `counter`, into `out`.
 *
 * Blocks are generated in groups of 16, then 8, then 4 by the vectorized
 * xof kernels, and any remainder one at a time by blake3_compress_xof_avx512.
 * After each group `counter` advances by the group size and `out` by the
 * group size times BLAKE3_BLOCK_LEN bytes.
 */
void blake3_xof_many_avx512(const uint32_t cv[8],
                            const uint8_t block[BLAKE3_BLOCK_LEN],
                            uint8_t block_len, uint64_t counter, uint8_t flags,
                            uint8_t* out, size_t outblocks) {
  while (outblocks >= 16) {
blake3_xof16_avx512(cv, block, block_len, counter, flags, out); counter += 16; outblocks -= 16; out += 16 * BLAKE3_BLOCK_LEN; } while (outblocks >= 8) { blake3_xof8_avx512(cv, block, block_len, counter, flags, out); counter += 8; outblocks -= 8; out += 8 * BLAKE3_BLOCK_LEN; } while (outblocks >= 4) { blake3_xof4_avx512(cv, block, block_len, counter, flags, out); counter += 4; outblocks -= 4; out += 4 * BLAKE3_BLOCK_LEN; } while (outblocks > 0) { blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out); counter += 1; outblocks -= 1; out += BLAKE3_BLOCK_LEN; } } blake3-1.5.4/c/blake3_avx512_x86-64_unix.S000064400000000000000000004736101046102023000156230ustar 00000000000000#if defined(__ELF__) && defined(__linux__) .section .note.GNU-stack,"",%progbits #endif #if defined(__ELF__) && defined(__CET__) && defined(__has_include) #if __has_include() #include #endif #endif #if !defined(_CET_ENDBR) #define _CET_ENDBR #endif .intel_syntax noprefix .global _blake3_hash_many_avx512 .global blake3_hash_many_avx512 .global blake3_compress_in_place_avx512 .global _blake3_compress_in_place_avx512 .global blake3_compress_xof_avx512 .global _blake3_compress_xof_avx512 .global blake3_xof_many_avx512 .global _blake3_xof_many_avx512 #ifdef __APPLE__ .text #else .section .text #endif .p2align 6 _blake3_hash_many_avx512: blake3_hash_many_avx512: _CET_ENDBR push r15 push r14 push r13 push r12 push rbx push rbp mov rbp, rsp sub rsp, 144 and rsp, 0xFFFFFFFFFFFFFFC0 neg r9 kmovw k1, r9d vmovd xmm0, r8d vpbroadcastd ymm0, xmm0 shr r8, 32 vmovd xmm1, r8d vpbroadcastd ymm1, xmm1 vmovdqa ymm4, ymm1 vmovdqa ymm5, ymm1 vpaddd ymm2, ymm0, ymmword ptr [ADD0+rip] vpaddd ymm3, ymm0, ymmword ptr [ADD0+32+rip] vpcmpltud k2, ymm2, ymm0 vpcmpltud k3, ymm3, ymm0 vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8} vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8} knotw k2, k1 vmovdqa32 ymm2 {k2}, ymm0 vmovdqa32 ymm3 {k2}, ymm0 vmovdqa32 ymm4 {k2}, ymm1 vmovdqa32 ymm5 {k2}, ymm1 vmovdqa 
ymmword ptr [rsp], ymm2 vmovdqa ymmword ptr [rsp+0x1*0x20], ymm3 vmovdqa ymmword ptr [rsp+0x2*0x20], ymm4 vmovdqa ymmword ptr [rsp+0x3*0x20], ymm5 shl rdx, 6 mov qword ptr [rsp+0x80], rdx cmp rsi, 16 jc 3f 2: vpbroadcastd zmm0, dword ptr [rcx] vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4] vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4] vpbroadcastd zmm3, dword ptr [rcx+0x3*0x4] vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4] vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4] vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4] vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4] movzx eax, byte ptr [rbp+0x38] movzx ebx, byte ptr [rbp+0x40] or eax, ebx xor edx, edx .p2align 5 9: movzx ebx, byte ptr [rbp+0x48] or ebx, eax add rdx, 64 cmp rdx, qword ptr [rsp+0x80] cmove eax, ebx mov dword ptr [rsp+0x88], eax mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] mov r12, qword ptr [rdi+0x40] mov r13, qword ptr [rdi+0x48] mov r14, qword ptr [rdi+0x50] mov r15, qword ptr [rdi+0x58] vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 vpunpcklqdq zmm8, zmm16, zmm17 vpunpckhqdq zmm9, zmm16, zmm17 vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 vpunpcklqdq zmm10, zmm18, zmm19 vpunpckhqdq zmm11, zmm18, zmm19 mov r8, qword ptr [rdi+0x20] mov r9, qword ptr [rdi+0x28] mov r10, qword ptr [rdi+0x30] mov r11, qword ptr [rdi+0x38] mov r12, qword ptr [rdi+0x60] mov r13, qword ptr [rdi+0x68] mov r14, qword ptr [rdi+0x70] mov r15, qword ptr [rdi+0x78] vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] vinserti64x4 zmm17, 
zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 vpunpcklqdq zmm12, zmm16, zmm17 vpunpckhqdq zmm13, zmm16, zmm17 vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 vpunpcklqdq zmm14, zmm18, zmm19 vpunpckhqdq zmm15, zmm18, zmm19 vmovdqa32 zmm27, zmmword ptr [INDEX0+rip] vmovdqa32 zmm31, zmmword ptr [INDEX1+rip] vshufps zmm16, zmm8, zmm10, 136 vshufps zmm17, zmm12, zmm14, 136 vmovdqa32 zmm20, zmm16 vpermt2d zmm16, zmm27, zmm17 vpermt2d zmm20, zmm31, zmm17 vshufps zmm17, zmm8, zmm10, 221 vshufps zmm30, zmm12, zmm14, 221 vmovdqa32 zmm21, zmm17 vpermt2d zmm17, zmm27, zmm30 vpermt2d zmm21, zmm31, zmm30 vshufps zmm18, zmm9, zmm11, 136 vshufps zmm8, zmm13, zmm15, 136 vmovdqa32 zmm22, zmm18 vpermt2d zmm18, zmm27, zmm8 vpermt2d zmm22, zmm31, zmm8 vshufps zmm19, zmm9, zmm11, 221 vshufps zmm8, zmm13, zmm15, 221 vmovdqa32 zmm23, zmm19 vpermt2d zmm19, zmm27, zmm8 vpermt2d zmm23, zmm31, zmm8 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] mov r12, qword ptr [rdi+0x40] mov r13, qword ptr [rdi+0x48] mov r14, qword ptr [rdi+0x50] mov r15, qword ptr [rdi+0x58] vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 vpunpcklqdq zmm8, zmm24, zmm25 vpunpckhqdq zmm9, zmm24, zmm25 vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 vpunpcklqdq zmm10, zmm24, zmm25 vpunpckhqdq zmm11, zmm24, zmm25 prefetcht0 [r8+rdx+0x80] prefetcht0 [r12+rdx+0x80] prefetcht0 [r9+rdx+0x80] prefetcht0 [r13+rdx+0x80] prefetcht0 [r10+rdx+0x80] 
prefetcht0 [r14+rdx+0x80] prefetcht0 [r11+rdx+0x80] prefetcht0 [r15+rdx+0x80] mov r8, qword ptr [rdi+0x20] mov r9, qword ptr [rdi+0x28] mov r10, qword ptr [rdi+0x30] mov r11, qword ptr [rdi+0x38] mov r12, qword ptr [rdi+0x60] mov r13, qword ptr [rdi+0x68] mov r14, qword ptr [rdi+0x70] mov r15, qword ptr [rdi+0x78] vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 vpunpcklqdq zmm12, zmm24, zmm25 vpunpckhqdq zmm13, zmm24, zmm25 vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 vpunpcklqdq zmm14, zmm24, zmm25 vpunpckhqdq zmm15, zmm24, zmm25 prefetcht0 [r8+rdx+0x80] prefetcht0 [r12+rdx+0x80] prefetcht0 [r9+rdx+0x80] prefetcht0 [r13+rdx+0x80] prefetcht0 [r10+rdx+0x80] prefetcht0 [r14+rdx+0x80] prefetcht0 [r11+rdx+0x80] prefetcht0 [r15+rdx+0x80] vshufps zmm24, zmm8, zmm10, 136 vshufps zmm30, zmm12, zmm14, 136 vmovdqa32 zmm28, zmm24 vpermt2d zmm24, zmm27, zmm30 vpermt2d zmm28, zmm31, zmm30 vshufps zmm25, zmm8, zmm10, 221 vshufps zmm30, zmm12, zmm14, 221 vmovdqa32 zmm29, zmm25 vpermt2d zmm25, zmm27, zmm30 vpermt2d zmm29, zmm31, zmm30 vshufps zmm26, zmm9, zmm11, 136 vshufps zmm8, zmm13, zmm15, 136 vmovdqa32 zmm30, zmm26 vpermt2d zmm26, zmm27, zmm8 vpermt2d zmm30, zmm31, zmm8 vshufps zmm8, zmm9, zmm11, 221 vshufps zmm10, zmm13, zmm15, 221 vpermi2d zmm27, zmm8, zmm10 vpermi2d zmm31, zmm8, zmm10 vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip] vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1+rip] vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip] vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3+rip] vmovdqa32 zmm12, zmmword ptr [rsp] vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40] vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip] vpbroadcastd zmm15, dword 
ptr [rsp+0x22*0x4] vpaddd zmm0, zmm0, zmm16 vpaddd zmm1, zmm1, zmm18 vpaddd zmm2, zmm2, zmm20 vpaddd zmm3, zmm3, zmm22 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm17 vpaddd zmm1, zmm1, zmm19 vpaddd zmm2, zmm2, zmm21 vpaddd zmm3, zmm3, zmm23 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm24 vpaddd zmm1, zmm1, zmm26 vpaddd zmm2, zmm2, zmm28 vpaddd zmm3, zmm3, zmm30 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 
12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm25 vpaddd zmm1, zmm1, zmm27 vpaddd zmm2, zmm2, zmm29 vpaddd zmm3, zmm3, zmm31 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm18 vpaddd zmm1, zmm1, zmm19 vpaddd zmm2, zmm2, zmm23 vpaddd zmm3, zmm3, zmm20 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm22 vpaddd zmm1, zmm1, zmm26 vpaddd zmm2, zmm2, zmm16 vpaddd zmm3, zmm3, zmm29 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, 
zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm17 vpaddd zmm1, zmm1, zmm28 vpaddd zmm2, zmm2, zmm25 vpaddd zmm3, zmm3, zmm31 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm27 vpaddd zmm1, zmm1, zmm21 vpaddd zmm2, zmm2, zmm30 vpaddd zmm3, zmm3, zmm24 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm19 vpaddd zmm1, zmm1, zmm26 vpaddd zmm2, zmm2, zmm29 vpaddd zmm3, zmm3, zmm23 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, 
zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm20 vpaddd zmm1, zmm1, zmm28 vpaddd zmm2, zmm2, zmm18 vpaddd zmm3, zmm3, zmm30 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm22 vpaddd zmm1, zmm1, zmm25 vpaddd zmm2, zmm2, zmm27 vpaddd zmm3, zmm3, zmm24 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm21 vpaddd zmm1, zmm1, zmm16 vpaddd zmm2, zmm2, zmm31 vpaddd zmm3, zmm3, zmm17 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, 
zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm26 vpaddd zmm1, zmm1, zmm28 vpaddd zmm2, zmm2, zmm30 vpaddd zmm3, zmm3, zmm29 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm23 vpaddd zmm1, zmm1, zmm25 vpaddd zmm2, zmm2, zmm19 vpaddd zmm3, zmm3, zmm31 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm20 vpaddd zmm1, zmm1, zmm27 vpaddd zmm2, zmm2, zmm21 vpaddd zmm3, zmm3, zmm17 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 
vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm16 vpaddd zmm1, zmm1, zmm18 vpaddd zmm2, zmm2, zmm24 vpaddd zmm3, zmm3, zmm22 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm28 vpaddd zmm1, zmm1, zmm25 vpaddd zmm2, zmm2, zmm31 vpaddd zmm3, zmm3, zmm30 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm29 vpaddd zmm1, zmm1, zmm27 vpaddd zmm2, zmm2, zmm26 vpaddd zmm3, zmm3, zmm24 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 
vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm23 vpaddd zmm1, zmm1, zmm21 vpaddd zmm2, zmm2, zmm16 vpaddd zmm3, zmm3, zmm22 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm18 vpaddd zmm1, zmm1, zmm19 vpaddd zmm2, zmm2, zmm17 vpaddd zmm3, zmm3, zmm20 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm25 vpaddd zmm1, zmm1, zmm27 vpaddd zmm2, zmm2, zmm24 vpaddd zmm3, zmm3, zmm31 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 
vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm30 vpaddd zmm1, zmm1, zmm21 vpaddd zmm2, zmm2, zmm28 vpaddd zmm3, zmm3, zmm17 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm29 vpaddd zmm1, zmm1, zmm16 vpaddd zmm2, zmm2, zmm18 vpaddd zmm3, zmm3, zmm20 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm19 vpaddd zmm1, zmm1, zmm26 vpaddd zmm2, zmm2, zmm22 vpaddd zmm3, zmm3, zmm23 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 
vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm27 vpaddd zmm1, zmm1, zmm21 vpaddd zmm2, zmm2, zmm17 vpaddd zmm3, zmm3, zmm24 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm31 vpaddd zmm1, zmm1, zmm16 vpaddd zmm2, zmm2, zmm25 vpaddd zmm3, zmm3, zmm22 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm30 vpaddd zmm1, zmm1, zmm18 vpaddd zmm2, zmm2, zmm19 vpaddd zmm3, zmm3, zmm23 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord 
zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm26 vpaddd zmm1, zmm1, zmm28 vpaddd zmm2, zmm2, zmm20 vpaddd zmm3, zmm3, zmm29 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpxord zmm0, zmm0, zmm8 vpxord zmm1, zmm1, zmm9 vpxord zmm2, zmm2, zmm10 vpxord zmm3, zmm3, zmm11 vpxord zmm4, zmm4, zmm12 vpxord zmm5, zmm5, zmm13 vpxord zmm6, zmm6, zmm14 vpxord zmm7, zmm7, zmm15 movzx eax, byte ptr [rbp+0x38] jne 9b mov rbx, qword ptr [rbp+0x50] vpunpckldq zmm16, zmm0, zmm1 vpunpckhdq zmm17, zmm0, zmm1 vpunpckldq zmm18, zmm2, zmm3 vpunpckhdq zmm19, zmm2, zmm3 vpunpckldq zmm20, zmm4, zmm5 vpunpckhdq zmm21, zmm4, zmm5 vpunpckldq zmm22, zmm6, zmm7 vpunpckhdq zmm23, zmm6, zmm7 vpunpcklqdq zmm0, zmm16, zmm18 vpunpckhqdq zmm1, zmm16, zmm18 vpunpcklqdq zmm2, zmm17, zmm19 vpunpckhqdq zmm3, zmm17, zmm19 vpunpcklqdq zmm4, zmm20, zmm22 vpunpckhqdq zmm5, zmm20, zmm22 vpunpcklqdq zmm6, zmm21, zmm23 vpunpckhqdq zmm7, zmm21, zmm23 vshufi32x4 zmm16, zmm0, zmm4, 0x88 vshufi32x4 zmm17, zmm1, zmm5, 0x88 vshufi32x4 zmm18, zmm2, zmm6, 0x88 vshufi32x4 
zmm19, zmm3, zmm7, 0x88 vshufi32x4 zmm20, zmm0, zmm4, 0xDD vshufi32x4 zmm21, zmm1, zmm5, 0xDD vshufi32x4 zmm22, zmm2, zmm6, 0xDD vshufi32x4 zmm23, zmm3, zmm7, 0xDD vshufi32x4 zmm0, zmm16, zmm17, 0x88 vshufi32x4 zmm1, zmm18, zmm19, 0x88 vshufi32x4 zmm2, zmm20, zmm21, 0x88 vshufi32x4 zmm3, zmm22, zmm23, 0x88 vshufi32x4 zmm4, zmm16, zmm17, 0xDD vshufi32x4 zmm5, zmm18, zmm19, 0xDD vshufi32x4 zmm6, zmm20, zmm21, 0xDD vshufi32x4 zmm7, zmm22, zmm23, 0xDD vmovdqu32 zmmword ptr [rbx], zmm0 vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1 vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2 vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3 vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4 vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5 vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6 vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7 vmovdqa32 zmm0, zmmword ptr [rsp] vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40] vmovdqa32 zmm2, zmm0 vpaddd zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16} vpcmpltud k2, zmm2, zmm0 vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16} vmovdqa32 zmmword ptr [rsp], zmm2 vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1 add rdi, 128 add rbx, 512 mov qword ptr [rbp+0x50], rbx sub rsi, 16 cmp rsi, 16 jnc 2b test rsi, rsi jnz 3f 4: vzeroupper mov rsp, rbp pop rbp pop rbx pop r12 pop r13 pop r14 pop r15 ret .p2align 6 3: test esi, 0x8 je 3f vpbroadcastd ymm0, dword ptr [rcx] vpbroadcastd ymm1, dword ptr [rcx+0x4] vpbroadcastd ymm2, dword ptr [rcx+0x8] vpbroadcastd ymm3, dword ptr [rcx+0xC] vpbroadcastd ymm4, dword ptr [rcx+0x10] vpbroadcastd ymm5, dword ptr [rcx+0x14] vpbroadcastd ymm6, dword ptr [rcx+0x18] vpbroadcastd ymm7, dword ptr [rcx+0x1C] mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] mov r12, qword ptr [rdi+0x20] mov r13, qword ptr [rdi+0x28] mov r14, qword ptr [rdi+0x30] mov r15, qword ptr [rdi+0x38] movzx eax, byte ptr [rbp+0x38] movzx ebx, byte ptr [rbp+0x40] or eax, ebx xor edx, edx 2: movzx ebx, byte ptr [rbp+0x48] or ebx, eax add rdx, 
64 cmp rdx, qword ptr [rsp+0x80] cmove eax, ebx mov dword ptr [rsp+0x88], eax vmovups xmm8, xmmword ptr [r8+rdx-0x40] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x40] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x40] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x40] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm16, ymm12, ymm14, 136 vshufps ymm17, ymm12, ymm14, 221 vshufps ymm18, ymm13, ymm15, 136 vshufps ymm19, ymm13, ymm15, 221 vmovups xmm8, xmmword ptr [r8+rdx-0x30] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x30] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x30] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x30] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm20, ymm12, ymm14, 136 vshufps ymm21, ymm12, ymm14, 221 vshufps ymm22, ymm13, ymm15, 136 vshufps ymm23, ymm13, ymm15, 221 vmovups xmm8, xmmword ptr [r8+rdx-0x20] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x20] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x20] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x20] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm24, ymm12, ymm14, 136 vshufps ymm25, ymm12, ymm14, 221 vshufps ymm26, ymm13, ymm15, 136 vshufps ymm27, ymm13, 
ymm15, 221 vmovups xmm8, xmmword ptr [r8+rdx-0x10] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x10] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x10] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x10] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm28, ymm12, ymm14, 136 vshufps ymm29, ymm12, ymm14, 221 vshufps ymm30, ymm13, ymm15, 136 vshufps ymm31, ymm13, ymm15, 221 vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip] vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip] vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip] vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3+rip] vmovdqa ymm12, ymmword ptr [rsp] vmovdqa ymm13, ymmword ptr [rsp+0x40] vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN+rip] vpbroadcastd ymm15, dword ptr [rsp+0x88] vpaddd ymm0, ymm0, ymm16 vpaddd ymm1, ymm1, ymm18 vpaddd ymm2, ymm2, ymm20 vpaddd ymm3, ymm3, ymm22 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm17 vpaddd ymm1, ymm1, ymm19 vpaddd ymm2, ymm2, ymm21 vpaddd ymm3, ymm3, ymm23 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 
vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm24 vpaddd ymm1, ymm1, ymm26 vpaddd ymm2, ymm2, ymm28 vpaddd ymm3, ymm3, ymm30 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm25 vpaddd ymm1, ymm1, ymm27 vpaddd ymm2, ymm2, ymm29 vpaddd ymm3, ymm3, ymm31 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm18 vpaddd ymm1, ymm1, ymm19 vpaddd ymm2, ymm2, ymm23 vpaddd ymm3, ymm3, ymm20 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord 
ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm22 vpaddd ymm1, ymm1, ymm26 vpaddd ymm2, ymm2, ymm16 vpaddd ymm3, ymm3, ymm29 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm17 vpaddd ymm1, ymm1, ymm28 vpaddd ymm2, ymm2, ymm25 vpaddd ymm3, ymm3, ymm31 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm27 vpaddd ymm1, ymm1, ymm21 vpaddd ymm2, ymm2, ymm30 vpaddd ymm3, ymm3, ymm24 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord 
ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm19 vpaddd ymm1, ymm1, ymm26 vpaddd ymm2, ymm2, ymm29 vpaddd ymm3, ymm3, ymm23 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm20 vpaddd ymm1, ymm1, ymm28 vpaddd ymm2, ymm2, ymm18 vpaddd ymm3, ymm3, ymm30 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm22 vpaddd ymm1, ymm1, ymm25 vpaddd ymm2, ymm2, ymm27 vpaddd ymm3, ymm3, ymm24 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, 
ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm21 vpaddd ymm1, ymm1, ymm16 vpaddd ymm2, ymm2, ymm31 vpaddd ymm3, ymm3, ymm17 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm26 vpaddd ymm1, ymm1, ymm28 vpaddd ymm2, ymm2, ymm30 vpaddd ymm3, ymm3, ymm29 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm23 vpaddd ymm1, ymm1, ymm25 vpaddd ymm2, ymm2, ymm19 vpaddd ymm3, ymm3, ymm31 vpaddd 
ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm20 vpaddd ymm1, ymm1, ymm27 vpaddd ymm2, ymm2, ymm21 vpaddd ymm3, ymm3, ymm17 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm16 vpaddd ymm1, ymm1, ymm18 vpaddd ymm2, ymm2, ymm24 vpaddd ymm3, ymm3, ymm22 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm28 vpaddd ymm1, ymm1, ymm25 vpaddd ymm2, 
ymm2, ymm31 vpaddd ymm3, ymm3, ymm30 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm29 vpaddd ymm1, ymm1, ymm27 vpaddd ymm2, ymm2, ymm26 vpaddd ymm3, ymm3, ymm24 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm23 vpaddd ymm1, ymm1, ymm21 vpaddd ymm2, ymm2, ymm16 vpaddd ymm3, ymm3, ymm22 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, 
ymm0, ymm18 vpaddd ymm1, ymm1, ymm19 vpaddd ymm2, ymm2, ymm17 vpaddd ymm3, ymm3, ymm20 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm25 vpaddd ymm1, ymm1, ymm27 vpaddd ymm2, ymm2, ymm24 vpaddd ymm3, ymm3, ymm31 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm30 vpaddd ymm1, ymm1, ymm21 vpaddd ymm2, ymm2, ymm28 vpaddd ymm3, ymm3, ymm17 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, 
ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm29 vpaddd ymm1, ymm1, ymm16 vpaddd ymm2, ymm2, ymm18 vpaddd ymm3, ymm3, ymm20 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm19 vpaddd ymm1, ymm1, ymm26 vpaddd ymm2, ymm2, ymm22 vpaddd ymm3, ymm3, ymm23 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm27 vpaddd ymm1, ymm1, ymm21 vpaddd ymm2, ymm2, ymm17 vpaddd ymm3, ymm3, ymm24 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord 
ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm31 vpaddd ymm1, ymm1, ymm16 vpaddd ymm2, ymm2, ymm25 vpaddd ymm3, ymm3, ymm22 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm30 vpaddd ymm1, ymm1, ymm18 vpaddd ymm2, ymm2, ymm19 vpaddd ymm3, ymm3, ymm23 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm26 vpaddd ymm1, ymm1, ymm28 vpaddd ymm2, ymm2, ymm20 vpaddd ymm3, ymm3, ymm29 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord 
ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpxor ymm0, ymm0, ymm8 vpxor ymm1, ymm1, ymm9 vpxor ymm2, ymm2, ymm10 vpxor ymm3, ymm3, ymm11 vpxor ymm4, ymm4, ymm12 vpxor ymm5, ymm5, ymm13 vpxor ymm6, ymm6, ymm14 vpxor ymm7, ymm7, ymm15 movzx eax, byte ptr [rbp+0x38] jne 2b mov rbx, qword ptr [rbp+0x50] vunpcklps ymm8, ymm0, ymm1 vunpcklps ymm9, ymm2, ymm3 vunpckhps ymm10, ymm0, ymm1 vunpcklps ymm11, ymm4, ymm5 vunpcklps ymm0, ymm6, ymm7 vshufps ymm12, ymm8, ymm9, 78 vblendps ymm1, ymm8, ymm12, 0xCC vshufps ymm8, ymm11, ymm0, 78 vunpckhps ymm13, ymm2, ymm3 vblendps ymm2, ymm11, ymm8, 0xCC vblendps ymm3, ymm12, ymm9, 0xCC vperm2f128 ymm12, ymm1, ymm2, 0x20 vmovups ymmword ptr [rbx], ymm12 vunpckhps ymm14, ymm4, ymm5 vblendps ymm4, ymm8, ymm0, 0xCC vunpckhps ymm15, ymm6, ymm7 vperm2f128 ymm7, ymm3, ymm4, 0x20 vmovups ymmword ptr [rbx+0x20], ymm7 vshufps ymm5, ymm10, ymm13, 78 vblendps ymm6, ymm5, ymm13, 0xCC vshufps ymm13, ymm14, ymm15, 78 vblendps ymm10, ymm10, ymm5, 0xCC vblendps ymm14, ymm14, ymm13, 0xCC vperm2f128 ymm8, ymm10, ymm14, 0x20 vmovups ymmword ptr [rbx+0x40], ymm8 vblendps ymm15, ymm13, ymm15, 0xCC vperm2f128 ymm13, ymm6, ymm15, 0x20 vmovups ymmword ptr [rbx+0x60], ymm13 vperm2f128 ymm9, ymm1, ymm2, 0x31 vperm2f128 ymm11, ymm3, ymm4, 0x31 vmovups ymmword ptr [rbx+0x80], ymm9 vperm2f128 ymm14, ymm10, ymm14, 0x31 vperm2f128 ymm15, ymm6, ymm15, 0x31 vmovups ymmword ptr [rbx+0xA0], ymm11 vmovups ymmword ptr [rbx+0xC0], ymm14 vmovups ymmword ptr [rbx+0xE0], ymm15 vmovdqa ymm0, ymmword ptr [rsp] vmovdqa ymm2, ymmword ptr [rsp+0x2*0x20] vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+0x1*0x20] vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+0x3*0x20] vmovdqa ymmword ptr [rsp], ymm0 vmovdqa ymmword ptr [rsp+0x2*0x20], ymm2 add rbx, 256 mov qword ptr [rbp+0x50], rbx add rdi, 64 sub rsi, 8 3: mov rbx, qword ptr [rbp+0x50] mov r15, qword ptr [rsp+0x80] movzx r13, byte ptr [rbp+0x38] movzx r12, byte ptr 
[rbp+0x48] test esi, 0x4 je 3f vbroadcasti32x4 zmm0, xmmword ptr [rcx] vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x1*0x10] vmovdqa xmm12, xmmword ptr [rsp] vmovdqa xmm13, xmmword ptr [rsp+0x4*0x10] vpunpckldq xmm14, xmm12, xmm13 vpunpckhdq xmm15, xmm12, xmm13 vpermq ymm14, ymm14, 0xDC vpermq ymm15, ymm15, 0xDC vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip] vinserti64x4 zmm13, zmm14, ymm15, 0x01 mov eax, 17476 kmovw k2, eax vpblendmd zmm13 {k2}, zmm13, zmm12 vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip] mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] mov eax, 43690 kmovw k3, eax mov eax, 34952 kmovw k4, eax movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx .p2align 5 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d mov dword ptr [rsp+0x88], eax vmovdqa32 zmm2, zmm15 vpbroadcastd zmm8, dword ptr [rsp+0x22*0x4] vpblendmd zmm3 {k4}, zmm13, zmm8 vmovups zmm8, zmmword ptr [r8+rdx-0x1*0x40] vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x4*0x10], 0x01 vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x4*0x10], 0x02 vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x4*0x10], 0x03 vmovups zmm9, zmmword ptr [r8+rdx-0x30] vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x3*0x10], 0x01 vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x3*0x10], 0x02 vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x3*0x10], 0x03 vshufps zmm4, zmm8, zmm9, 136 vshufps zmm5, zmm8, zmm9, 221 vmovups zmm8, zmmword ptr [r8+rdx-0x20] vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x2*0x10], 0x01 vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x2*0x10], 0x02 vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x2*0x10], 0x03 vmovups zmm9, zmmword ptr [r8+rdx-0x10] vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x1*0x10], 0x01 vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x1*0x10], 0x02 vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x1*0x10], 0x03 vshufps zmm6, zmm8, zmm9, 136 vshufps zmm7, zmm8, zmm9, 221 vpshufd zmm6, zmm6, 
0x93 vpshufd zmm7, zmm7, 0x93 mov al, 7 9: vpaddd zmm0, zmm0, zmm4 vpaddd zmm0, zmm0, zmm1 vpxord zmm3, zmm3, zmm0 vprord zmm3, zmm3, 16 vpaddd zmm2, zmm2, zmm3 vpxord zmm1, zmm1, zmm2 vprord zmm1, zmm1, 12 vpaddd zmm0, zmm0, zmm5 vpaddd zmm0, zmm0, zmm1 vpxord zmm3, zmm3, zmm0 vprord zmm3, zmm3, 8 vpaddd zmm2, zmm2, zmm3 vpxord zmm1, zmm1, zmm2 vprord zmm1, zmm1, 7 vpshufd zmm0, zmm0, 0x93 vpshufd zmm3, zmm3, 0x4E vpshufd zmm2, zmm2, 0x39 vpaddd zmm0, zmm0, zmm6 vpaddd zmm0, zmm0, zmm1 vpxord zmm3, zmm3, zmm0 vprord zmm3, zmm3, 16 vpaddd zmm2, zmm2, zmm3 vpxord zmm1, zmm1, zmm2 vprord zmm1, zmm1, 12 vpaddd zmm0, zmm0, zmm7 vpaddd zmm0, zmm0, zmm1 vpxord zmm3, zmm3, zmm0 vprord zmm3, zmm3, 8 vpaddd zmm2, zmm2, zmm3 vpxord zmm1, zmm1, zmm2 vprord zmm1, zmm1, 7 vpshufd zmm0, zmm0, 0x39 vpshufd zmm3, zmm3, 0x4E vpshufd zmm2, zmm2, 0x93 dec al jz 9f vshufps zmm8, zmm4, zmm5, 214 vpshufd zmm9, zmm4, 0x0F vpshufd zmm4, zmm8, 0x39 vshufps zmm8, zmm6, zmm7, 250 vpblendmd zmm9 {k3}, zmm9, zmm8 vpunpcklqdq zmm8, zmm7, zmm5 vpblendmd zmm8 {k4}, zmm8, zmm6 vpshufd zmm8, zmm8, 0x78 vpunpckhdq zmm5, zmm5, zmm7 vpunpckldq zmm6, zmm6, zmm5 vpshufd zmm7, zmm6, 0x1E vmovdqa32 zmm5, zmm9 vmovdqa32 zmm6, zmm8 jmp 9b 9: vpxord zmm0, zmm0, zmm2 vpxord zmm1, zmm1, zmm3 mov eax, r13d cmp rdx, r15 jne 2b vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+0x10], xmm1 vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 vextracti32x4 xmmword ptr [rbx+0x4*0x10], zmm0, 0x02 vextracti32x4 xmmword ptr [rbx+0x5*0x10], zmm1, 0x02 vextracti32x4 xmmword ptr [rbx+0x6*0x10], zmm0, 0x03 vextracti32x4 xmmword ptr [rbx+0x7*0x10], zmm1, 0x03 vmovdqa xmm0, xmmword ptr [rsp] vmovdqa xmm2, xmmword ptr [rsp+0x40] vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+0x1*0x10] vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+0x5*0x10] vmovdqa xmmword ptr [rsp], xmm0 vmovdqa xmmword ptr [rsp+0x40], xmm2 add rbx, 128 add rdi, 32 sub rsi, 4 3: test esi, 0x2 je 3f vbroadcasti128 ymm0, xmmword 
ptr [rcx] vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] vmovd xmm13, dword ptr [rsp] vpinsrd xmm13, xmm13, dword ptr [rsp+0x40], 1 vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 vmovd xmm14, dword ptr [rsp+0x4] vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1 vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 vinserti128 ymm13, ymm13, xmm14, 0x01 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx .p2align 5 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d mov dword ptr [rsp+0x88], eax vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] vpbroadcastd ymm8, dword ptr [rsp+0x88] vpblendd ymm3, ymm13, ymm8, 0x88 vmovups ymm8, ymmword ptr [r8+rdx-0x40] vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 vmovups ymm9, ymmword ptr [r8+rdx-0x30] vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 vshufps ymm4, ymm8, ymm9, 136 vshufps ymm5, ymm8, ymm9, 221 vmovups ymm8, ymmword ptr [r8+rdx-0x20] vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 vmovups ymm9, ymmword ptr [r8+rdx-0x10] vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 vshufps ymm6, ymm8, ymm9, 136 vshufps ymm7, ymm8, ymm9, 221 vpshufd ymm6, ymm6, 0x93 vpshufd ymm7, ymm7, 0x93 mov al, 7 9: vpaddd ymm0, ymm0, ymm4 vpaddd ymm0, ymm0, ymm1 vpxord ymm3, ymm3, ymm0 vprord ymm3, ymm3, 16 vpaddd ymm2, ymm2, ymm3 vpxord ymm1, ymm1, ymm2 vprord ymm1, ymm1, 12 vpaddd ymm0, ymm0, ymm5 vpaddd ymm0, ymm0, ymm1 vpxord ymm3, ymm3, ymm0 vprord ymm3, ymm3, 8 vpaddd ymm2, ymm2, ymm3 vpxord ymm1, ymm1, ymm2 vprord ymm1, ymm1, 7 vpshufd ymm0, ymm0, 0x93 vpshufd ymm3, ymm3, 0x4E vpshufd ymm2, ymm2, 0x39 vpaddd ymm0, ymm0, ymm6 vpaddd ymm0, ymm0, ymm1 vpxord ymm3, ymm3, ymm0 vprord ymm3, ymm3, 16 vpaddd ymm2, ymm2, ymm3 vpxord ymm1, ymm1, ymm2 vprord ymm1, ymm1, 12 vpaddd ymm0, ymm0, ymm7 vpaddd ymm0, ymm0, ymm1 vpxord ymm3, ymm3, ymm0 vprord ymm3, ymm3, 8 vpaddd ymm2, ymm2, ymm3 vpxord ymm1, ymm1, ymm2 vprord ymm1, ymm1, 7 
vpshufd ymm0, ymm0, 0x39 vpshufd ymm3, ymm3, 0x4E vpshufd ymm2, ymm2, 0x93 dec al jz 9f vshufps ymm8, ymm4, ymm5, 214 vpshufd ymm9, ymm4, 0x0F vpshufd ymm4, ymm8, 0x39 vshufps ymm8, ymm6, ymm7, 250 vpblendd ymm9, ymm9, ymm8, 0xAA vpunpcklqdq ymm8, ymm7, ymm5 vpblendd ymm8, ymm8, ymm6, 0x88 vpshufd ymm8, ymm8, 0x78 vpunpckhdq ymm5, ymm5, ymm7 vpunpckldq ymm6, ymm6, ymm5 vpshufd ymm7, ymm6, 0x1E vmovdqa ymm5, ymm9 vmovdqa ymm6, ymm8 jmp 9b 9: vpxor ymm0, ymm0, ymm2 vpxor ymm1, ymm1, ymm3 mov eax, r13d cmp rdx, r15 jne 2b vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+0x10], xmm1 vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 vmovdqa xmm0, xmmword ptr [rsp] vmovdqa xmm2, xmmword ptr [rsp+0x4*0x10] vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+0x8] vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+0x48] vmovdqa xmmword ptr [rsp], xmm0 vmovdqa xmmword ptr [rsp+0x4*0x10], xmm2 add rbx, 64 add rdi, 16 sub rsi, 2 3: test esi, 0x1 je 4b vmovdqu xmm0, xmmword ptr [rcx] vmovdqu xmm1, xmmword ptr [rcx+0x10] vmovd xmm14, dword ptr [rsp] vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1 vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip] mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx .p2align 5 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d vpinsrd xmm3, xmm14, eax, 3 vmovdqa xmm2, xmm15 vmovups xmm8, xmmword ptr [r8+rdx-0x40] vmovups xmm9, xmmword ptr [r8+rdx-0x30] vshufps xmm4, xmm8, xmm9, 136 vshufps xmm5, xmm8, xmm9, 221 vmovups xmm8, xmmword ptr [r8+rdx-0x20] vmovups xmm9, xmmword ptr [r8+rdx-0x10] vshufps xmm6, xmm8, xmm9, 136 vshufps xmm7, xmm8, xmm9, 221 vpshufd xmm6, xmm6, 0x93 vpshufd xmm7, xmm7, 0x93 mov al, 7 9: vpaddd xmm0, xmm0, xmm4 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 16 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 12 vpaddd xmm0, xmm0, xmm5 vpaddd xmm0, xmm0, xmm1 vpxord 
xmm3, xmm3, xmm0
        /*
         * Remainder of the xmm-width loop of the routine that precedes
         * blake3_compress_in_place_avx512 (its start is further up the
         * file). Same round structure as the routines below.
         */
        vprord xmm3, xmm3, 8
        vpaddd xmm2, xmm2, xmm3
        vpxord xmm1, xmm1, xmm2
        vprord xmm1, xmm1, 7
        vpshufd xmm0, xmm0, 0x93
        vpshufd xmm3, xmm3, 0x4E
        vpshufd xmm2, xmm2, 0x39
        vpaddd xmm0, xmm0, xmm6
        vpaddd xmm0, xmm0, xmm1
        vpxord xmm3, xmm3, xmm0
        vprord xmm3, xmm3, 16
        vpaddd xmm2, xmm2, xmm3
        vpxord xmm1, xmm1, xmm2
        vprord xmm1, xmm1, 12
        vpaddd xmm0, xmm0, xmm7
        vpaddd xmm0, xmm0, xmm1
        vpxord xmm3, xmm3, xmm0
        vprord xmm3, xmm3, 8
        vpaddd xmm2, xmm2, xmm3
        vpxord xmm1, xmm1, xmm2
        vprord xmm1, xmm1, 7
        vpshufd xmm0, xmm0, 0x39
        vpshufd xmm3, xmm3, 0x4E
        vpshufd xmm2, xmm2, 0x93
        dec al
        jz 9f
        vshufps xmm8, xmm4, xmm5, 214
        vpshufd xmm9, xmm4, 0x0F
        vpshufd xmm4, xmm8, 0x39
        vshufps xmm8, xmm6, xmm7, 250
        vpblendd xmm9, xmm9, xmm8, 0xAA
        vpunpcklqdq xmm8, xmm7, xmm5
        vpblendd xmm8, xmm8, xmm6, 0x88
        vpshufd xmm8, xmm8, 0x78
        vpunpckhdq xmm5, xmm5, xmm7
        vpunpckldq xmm6, xmm6, xmm5
        vpshufd xmm7, xmm6, 0x1E
        vmovdqa xmm5, xmm9
        vmovdqa xmm6, xmm8
        jmp 9b
9:
        vpxor xmm0, xmm0, xmm2
        vpxor xmm1, xmm1, xmm3
        mov eax, r13d
        /* Loop back to label 2 until rdx has reached r15. */
        cmp rdx, r15
        jne 2b
        /* Store the 32-byte result at [rbx]. */
        vmovdqu xmmword ptr [rbx], xmm0
        vmovdqu xmmword ptr [rbx+0x10], xmm1
        jmp 4b
/*
 * blake3_compress_in_place_avx512
 *
 * Runs seven iterations of the round loop over one 64-byte message block
 * using 128-bit (xmm) registers, then overwrites the 32-byte state at [rdi]
 * with the result.
 *
 * Inputs as consumed by the code (the register order matches the System V
 * AMD64 integer-argument convention; the C prototype is not visible here, so
 * the parameter names in parentheses are presumed -- TODO confirm):
 *   rdi  pointer to 32 bytes of state; read, then written back  (cv)
 *   rsi  pointer to 64 bytes of message; unaligned loads        (block)
 *   dl   byte that becomes dword 2 of state row 3               (block_len)
 *   rcx  64-bit value that becomes dwords 0-1 of state row 3    (counter)
 *   r8b  byte that becomes dword 3 of state row 3               (flags)
 *
 * State layout during the rounds: xmm0 = row 0, xmm1 = row 1, xmm2 = row 2
 * (loaded from BLAKE3_IV), xmm3 = row 3. xmm4-xmm7 hold the message words,
 * xmm8/xmm9 are scratch.
 *
 * Clobbers rax, rdx, xmm0-xmm9 and the flags.
 */
.p2align 6
_blake3_compress_in_place_avx512:
blake3_compress_in_place_avx512:
        _CET_ENDBR
        /* Rows 0 and 1: the 32 bytes at [rdi]. */
        vmovdqu xmm0, xmmword ptr [rdi]
        vmovdqu xmm1, xmmword ptr [rdi+0x10]
        /* rdx = zext(dl) + (zext(r8b) << 32) */
        movzx eax, r8b
        movzx edx, dl
        shl rax, 32
        add rdx, rax
        /* Row 3: xmm3 = { rcx[31:0], rcx[63:32], dl, r8b } */
        vmovq xmm3, rcx
        vmovq xmm4, rdx
        vpunpcklqdq xmm3, xmm3, xmm4
        /* Row 2: 16 bytes at BLAKE3_IV (vmovaps: the symbol must be 16-byte aligned). */
        vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip]
        /*
         * Load the 64-byte block and de-interleave it. vshufps with 136
         * (0x88) keeps the even-indexed dwords of the register pair, 221
         * (0xDD) keeps the odd-indexed ones:
         *   xmm4 = m0 m2 m4 m6      xmm5 = m1 m3 m5 m7
         *   xmm6 = m8 m10 m12 m14   xmm7 = m9 m11 m13 m15
         */
        vmovups xmm8, xmmword ptr [rsi]
        vmovups xmm9, xmmword ptr [rsi+0x10]
        vshufps xmm4, xmm8, xmm9, 136
        vshufps xmm5, xmm8, xmm9, 221
        vmovups xmm8, xmmword ptr [rsi+0x20]
        vmovups xmm9, xmmword ptr [rsi+0x30]
        vshufps xmm6, xmm8, xmm9, 136
        vshufps xmm7, xmm8, xmm9, 221
        /* Rotate the lanes of xmm6/xmm7 by one: m14 m8 m10 m12 / m15 m9 m11 m13. */
        vpshufd xmm6, xmm6, 0x93
        vpshufd xmm7, xmm7, 0x93
        /* al = iteration counter; the loop body below runs 7 times. */
        mov al, 7
9:
        /* First half: add xmm4, then xmm5, into row 0; mix with right-rotates 16, 12, 8, 7. */
        vpaddd xmm0, xmm0, xmm4
        vpaddd xmm0, xmm0, xmm1
        vpxord xmm3, xmm3, xmm0
        vprord xmm3, xmm3, 16
        vpaddd xmm2, xmm2, xmm3
        vpxord xmm1, xmm1, xmm2
        vprord xmm1, xmm1, 12
        vpaddd xmm0, xmm0, xmm5
        vpaddd xmm0, xmm0, xmm1
        vpxord xmm3, xmm3, xmm0
        vprord xmm3, xmm3, 8
        vpaddd xmm2, xmm2, xmm3
        vpxord xmm1, xmm1, xmm2
        vprord xmm1, xmm1, 7
        /*
         * Rotate the dword lanes of rows 0, 3 and 2 (row 1 stays put) so the
         * second half mixes a different lane combination -- presumably the
         * diagonal step of the BLAKE3 round.
         */
        vpshufd xmm0, xmm0, 0x93
        vpshufd xmm3, xmm3, 0x4E
        vpshufd xmm2, xmm2, 0x39
        /* Second half: same mixing with xmm6, then xmm7. */
        vpaddd xmm0, xmm0, xmm6
        vpaddd xmm0, xmm0, xmm1
        vpxord xmm3, xmm3, xmm0
        vprord xmm3, xmm3, 16
        vpaddd xmm2, xmm2, xmm3
        vpxord xmm1, xmm1, xmm2
        vprord xmm1, xmm1, 12
        vpaddd xmm0, xmm0, xmm7
        vpaddd xmm0, xmm0, xmm1
        vpxord xmm3, xmm3, xmm0
        vprord xmm3, xmm3, 8
        vpaddd xmm2, xmm2, xmm3
        vpxord xmm1, xmm1, xmm2
        vprord xmm1, xmm1, 7
        /* Inverse lane rotations: rows are back in their original alignment. */
        vpshufd xmm0, xmm0, 0x39
        vpshufd xmm3, xmm3, 0x4E
        vpshufd xmm2, xmm2, 0x93
        /* Leave after the 7th iteration; the reshuffle below runs only between iterations. */
        dec al
        jz 9f
        /*
         * Rearrange the message words in xmm4-xmm7 for the next iteration
         * (presumably the BLAKE3 message permutation expressed on the
         * pre-shuffled layout -- TODO confirm against the reference code).
         */
        vshufps xmm8, xmm4, xmm5, 214
        vpshufd xmm9, xmm4, 0x0F
        vpshufd xmm4, xmm8, 0x39
        vshufps xmm8, xmm6, xmm7, 250
        vpblendd xmm9, xmm9, xmm8, 0xAA
        vpunpcklqdq xmm8, xmm7, xmm5
        vpblendd xmm8, xmm8, xmm6, 0x88
        vpshufd xmm8, xmm8, 0x78
        vpunpckhdq xmm5, xmm5, xmm7
        vpunpckldq xmm6, xmm6, xmm5
        vpshufd xmm7, xmm6, 0x1E
        vmovdqa xmm5, xmm9
        vmovdqa xmm6, xmm8
        jmp 9b
9:
        /* Fold rows 2/3 into rows 0/1 and write the 32-byte result back to [rdi]. */
        vpxor xmm0, xmm0, xmm2
        vpxor xmm1, xmm1, xmm3
        vmovdqu xmmword ptr [rdi], xmm0
        vmovdqu xmmword ptr [rdi+0x10], xmm1
        ret
/*
 * blake3_compress_xof_avx512
 *
 * Same setup and seven-iteration round loop as
 * blake3_compress_in_place_avx512, but the state at [rdi] is only read.
 * The routine writes 64 bytes to [r9]:
 *   [r9+0x00..0x1f]  rows 0/1 XOR rows 2/3
 *   [r9+0x20..0x3f]  rows 2/3 XOR the original 32 bytes at [rdi]
 *
 * Inputs as consumed by the code (parameter names presumed, see the note on
 * the in-place variant): rdi = 32-byte state, rsi = 64-byte message block,
 * dl / rcx / r8b = the values packed into state row 3, r9 = 64-byte output.
 *
 * Clobbers rax, rdx, xmm0-xmm9 and the flags.
 */
.p2align 6
_blake3_compress_xof_avx512:
blake3_compress_xof_avx512:
        _CET_ENDBR
        /* Rows 0 and 1: the 32 bytes at [rdi]. */
        vmovdqu xmm0, xmmword ptr [rdi]
        vmovdqu xmm1, xmmword ptr [rdi+0x10]
        /* rdx = zext(dl) + (zext(r8b) << 32) */
        movzx eax, r8b
        movzx edx, dl
        shl rax, 32
        add rdx, rax
        /* Row 3: xmm3 = { rcx[31:0], rcx[63:32], dl, r8b } */
        vmovq xmm3, rcx
        vmovq xmm4, rdx
        vpunpcklqdq xmm3, xmm3, xmm4
        /* Row 2: 16 bytes at BLAKE3_IV. */
        vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip]
        /* De-interleave the 64-byte block: even dwords -> xmm4/xmm6, odd dwords -> xmm5/xmm7. */
        vmovups xmm8, xmmword ptr [rsi]
        vmovups xmm9, xmmword ptr [rsi+0x10]
        vshufps xmm4, xmm8, xmm9, 136
        vshufps xmm5, xmm8, xmm9, 221
        vmovups xmm8, xmmword ptr [rsi+0x20]
        vmovups xmm9, xmmword ptr [rsi+0x30]
        vshufps xmm6, xmm8, xmm9, 136
        vshufps xmm7, xmm8, xmm9, 221
        vpshufd xmm6, xmm6, 0x93
        vpshufd xmm7, xmm7, 0x93
        /* al = iteration counter; the loop body below runs 7 times. */
        mov al, 7
9:
        /* First half: message words xmm4 / xmm5, right-rotates 16, 12, 8, 7. */
        vpaddd xmm0, xmm0, xmm4
        vpaddd xmm0, xmm0, xmm1
        vpxord xmm3, xmm3, xmm0
        vprord xmm3, xmm3, 16
        vpaddd xmm2, xmm2, xmm3
        vpxord xmm1, xmm1, xmm2
        vprord xmm1, xmm1, 12
        vpaddd xmm0, xmm0, xmm5
        vpaddd xmm0, xmm0, xmm1
        vpxord xmm3, xmm3, xmm0
        vprord xmm3, xmm3, 8
        vpaddd xmm2, xmm2, xmm3
        vpxord xmm1, xmm1, xmm2
        vprord xmm1, xmm1, 7
        /* Rotate lanes of rows 0, 3, 2 before the second half. */
        vpshufd xmm0, xmm0, 0x93
        vpshufd xmm3, xmm3, 0x4E
        vpshufd xmm2, xmm2, 0x39
        /* Second half: message words xmm6 / xmm7. */
        vpaddd xmm0, xmm0, xmm6
        vpaddd xmm0, xmm0, xmm1
        vpxord xmm3, xmm3, xmm0
        vprord xmm3, xmm3, 16
        vpaddd xmm2, xmm2, xmm3
        vpxord xmm1, xmm1, xmm2
        vprord xmm1, xmm1, 12
        vpaddd xmm0, xmm0, xmm7
        vpaddd xmm0, xmm0, xmm1
        vpxord xmm3, xmm3, xmm0
        vprord xmm3, xmm3, 8
        vpaddd xmm2, xmm2, xmm3
        vpxord xmm1, xmm1, xmm2
        vprord xmm1, xmm1, 7
        /* Inverse lane rotations. */
        vpshufd xmm0, xmm0, 0x39
        vpshufd xmm3, xmm3, 0x4E
        vpshufd xmm2, xmm2, 0x93
        /* Leave after the 7th iteration. */
        dec al
        jz 9f
        /* Rearrange the message words in xmm4-xmm7 for the next iteration. */
        vshufps xmm8, xmm4, xmm5, 214
        vpshufd xmm9, xmm4, 0x0F
        vpshufd xmm4, xmm8, 0x39
        vshufps xmm8, xmm6, xmm7, 250
        vpblendd xmm9, xmm9, xmm8, 0xAA
        vpunpcklqdq xmm8, xmm7, xmm5
        vpblendd xmm8, xmm8, xmm6, 0x88
        vpshufd xmm8, xmm8, 0x78
        vpunpckhdq xmm5, xmm5, xmm7
        vpunpckldq xmm6, xmm6, xmm5
        vpshufd xmm7, xmm6, 0x1E
        vmovdqa xmm5, xmm9
        vmovdqa xmm6, xmm8
        jmp 9b
9:
        /* First 32 output bytes: rows 0/1 XOR rows 2/3. */
        vpxor xmm0, xmm0, xmm2
        vpxor xmm1, xmm1, xmm3
        /* Last 32 output bytes: rows 2/3 XOR the untouched input state at [rdi]. */
        vpxor xmm2, xmm2, [rdi]
        vpxor xmm3, xmm3, [rdi+0x10]
        vmovdqu xmmword ptr [r9], xmm0
        vmovdqu xmmword ptr [r9+0x10], xmm1
        vmovdqu xmmword ptr [r9+0x20], xmm2
        vmovdqu xmmword ptr [r9+0x30], xmm3
        ret
.p2align 6
blake3_xof_many_avx512:
_blake3_xof_many_avx512:
        _CET_ENDBR
        /*
         * r10 = qword just above the return address (presumably the first
         * stack-passed argument -- TODO confirm against the C prototype).
         * Values above 1 branch to label 2; otherwise the code falls through
         * into an inline copy of the single-block round loop.
         */
        mov r10,QWORD PTR [rsp+0x8]
        cmp r10,0x1
        ja 2f
        vmovdqu xmm0,XMMWORD PTR [rdi]
        vmovdqu xmm1,XMMWORD PTR [rdi+0x10]
        movzx eax,r8b
        movzx edx,dl
        shl rax,0x20
        add rdx,rax
        vmovq xmm3,rcx
        vmovq xmm4,rdx
        vpunpcklqdq xmm3,xmm3,xmm4
        vmovaps xmm2,XMMWORD PTR [BLAKE3_IV+rip]
        vmovups xmm8,XMMWORD PTR [rsi]
        vmovups xmm9,XMMWORD PTR [rsi+0x10]
        vshufps xmm4,xmm8,xmm9,0x88
        vshufps xmm5,xmm8,xmm9,0xdd
        vmovups xmm8,XMMWORD PTR [rsi+0x20]
        vmovups xmm9,XMMWORD PTR [rsi+0x30]
        vshufps xmm6,xmm8,xmm9,0x88
        vshufps xmm7,xmm8,xmm9,0xdd
        vpshufd xmm6,xmm6,0x93
        vpshufd xmm7,xmm7,0x93
        mov al,0x7
3:
        vpaddd xmm0,xmm0,xmm4
        vpaddd xmm0,xmm0,xmm1
        vpxord xmm3,xmm3,xmm0
        vprord xmm3,xmm3,0x10
        vpaddd xmm2,xmm2,xmm3
        vpxord xmm1,xmm1,xmm2
        vprord xmm1,xmm1,0xc
        vpaddd xmm0,xmm0,xmm5
        vpaddd xmm0,xmm0,xmm1
        vpxord xmm3,xmm3,xmm0
        vprord xmm3,xmm3,0x8
        vpaddd xmm2,xmm2,xmm3
vpxord xmm1,xmm1,xmm2 vprord xmm1,xmm1,0x7 vpshufd xmm0,xmm0,0x93 vpshufd xmm3,xmm3,0x4e vpshufd xmm2,xmm2,0x39 vpaddd xmm0,xmm0,xmm6 vpaddd xmm0,xmm0,xmm1 vpxord xmm3,xmm3,xmm0 vprord xmm3,xmm3,0x10 vpaddd xmm2,xmm2,xmm3 vpxord xmm1,xmm1,xmm2 vprord xmm1,xmm1,0xc vpaddd xmm0,xmm0,xmm7 vpaddd xmm0,xmm0,xmm1 vpxord xmm3,xmm3,xmm0 vprord xmm3,xmm3,0x8 vpaddd xmm2,xmm2,xmm3 vpxord xmm1,xmm1,xmm2 vprord xmm1,xmm1,0x7 vpshufd xmm0,xmm0,0x39 vpshufd xmm3,xmm3,0x4e vpshufd xmm2,xmm2,0x93 dec al je 3f vshufps xmm8,xmm4,xmm5,0xd6 vpshufd xmm9,xmm4,0xf vpshufd xmm4,xmm8,0x39 vshufps xmm8,xmm6,xmm7,0xfa vpblendd xmm9,xmm9,xmm8,0xaa vpunpcklqdq xmm8,xmm7,xmm5 vpblendd xmm8,xmm8,xmm6,0x88 vpshufd xmm8,xmm8,0x78 vpunpckhdq xmm5,xmm5,xmm7 vpunpckldq xmm6,xmm6,xmm5 vpshufd xmm7,xmm6,0x1e vmovdqa xmm5,xmm9 vmovdqa xmm6,xmm8 jmp 3b 3: vpxor xmm0,xmm0,xmm2 vpxor xmm1,xmm1,xmm3 vpxor xmm2,xmm2,XMMWORD PTR [rdi] vpxor xmm3,xmm3,XMMWORD PTR [rdi+0x10] vmovdqu XMMWORD PTR [r9],xmm0 vmovdqu XMMWORD PTR [r9+0x10],xmm1 vmovdqu XMMWORD PTR [r9+0x20],xmm2 vmovdqu XMMWORD PTR [r9+0x30],xmm3 ret .p2align 6 2: push rbp mov rbp,rsp sub rsp,0x90 and rsp,0xffffffffffffffc0 vpbroadcastd zmm0,ecx shr rcx,0x20 vpbroadcastd zmm1,ecx vpaddd zmm2,zmm0,ZMMWORD PTR [ADD0+rip] vpcmpltud k1,zmm2,zmm0 vpaddd zmm1{k1},zmm1,DWORD PTR [ADD1+rip]{1to16} vmovdqa32 ZMMWORD PTR [rsp],zmm2 vmovdqa32 ZMMWORD PTR [rsp+0x40],zmm1 cmp r10,0x10 jb 2f 3: vpbroadcastd zmm16,DWORD PTR [rsi] vpbroadcastd zmm17,DWORD PTR [rsi+0x4] vpbroadcastd zmm18,DWORD PTR [rsi+0x8] vpbroadcastd zmm19,DWORD PTR [rsi+0xc] vpbroadcastd zmm20,DWORD PTR [rsi+0x10] vpbroadcastd zmm21,DWORD PTR [rsi+0x14] vpbroadcastd zmm22,DWORD PTR [rsi+0x18] vpbroadcastd zmm23,DWORD PTR [rsi+0x1c] vpbroadcastd zmm24,DWORD PTR [rsi+0x20] vpbroadcastd zmm25,DWORD PTR [rsi+0x24] vpbroadcastd zmm26,DWORD PTR [rsi+0x28] vpbroadcastd zmm27,DWORD PTR [rsi+0x2c] vpbroadcastd zmm28,DWORD PTR [rsi+0x30] vpbroadcastd zmm29,DWORD PTR [rsi+0x34] vpbroadcastd zmm30,DWORD 
PTR [rsi+0x38] vpbroadcastd zmm31,DWORD PTR [rsi+0x3c] vpbroadcastd zmm0,DWORD PTR [rdi] vpbroadcastd zmm1,DWORD PTR [rdi+0x4] vpbroadcastd zmm2,DWORD PTR [rdi+0x8] vpbroadcastd zmm3,DWORD PTR [rdi+0xc] vpbroadcastd zmm4,DWORD PTR [rdi+0x10] vpbroadcastd zmm5,DWORD PTR [rdi+0x14] vpbroadcastd zmm6,DWORD PTR [rdi+0x18] vpbroadcastd zmm7,DWORD PTR [rdi+0x1c] vpbroadcastd zmm8,DWORD PTR [BLAKE3_IV_0+rip] vpbroadcastd zmm9,DWORD PTR [BLAKE3_IV_1+rip] vpbroadcastd zmm10,DWORD PTR [BLAKE3_IV_2+rip] vpbroadcastd zmm11,DWORD PTR [BLAKE3_IV_3+rip] vmovdqa32 zmm12,ZMMWORD PTR [rsp] vmovdqa32 zmm13,ZMMWORD PTR [rsp+0x40] vpbroadcastd zmm14,edx vpbroadcastd zmm15,r8d vpaddd zmm0,zmm0,zmm16 vpaddd zmm1,zmm1,zmm18 vpaddd zmm2,zmm2,zmm20 vpaddd zmm3,zmm3,zmm22 vpaddd zmm0,zmm0,zmm4 vpaddd zmm1,zmm1,zmm5 vpaddd zmm2,zmm2,zmm6 vpaddd zmm3,zmm3,zmm7 vpxord zmm12,zmm12,zmm0 vpxord zmm13,zmm13,zmm1 vpxord zmm14,zmm14,zmm2 vpxord zmm15,zmm15,zmm3 vprord zmm12,zmm12,0x10 vprord zmm13,zmm13,0x10 vprord zmm14,zmm14,0x10 vprord zmm15,zmm15,0x10 vpaddd zmm8,zmm8,zmm12 vpaddd zmm9,zmm9,zmm13 vpaddd zmm10,zmm10,zmm14 vpaddd zmm11,zmm11,zmm15 vpxord zmm4,zmm4,zmm8 vpxord zmm5,zmm5,zmm9 vpxord zmm6,zmm6,zmm10 vpxord zmm7,zmm7,zmm11 vprord zmm4,zmm4,0xc vprord zmm5,zmm5,0xc vprord zmm6,zmm6,0xc vprord zmm7,zmm7,0xc vpaddd zmm0,zmm0,zmm17 vpaddd zmm1,zmm1,zmm19 vpaddd zmm2,zmm2,zmm21 vpaddd zmm3,zmm3,zmm23 vpaddd zmm0,zmm0,zmm4 vpaddd zmm1,zmm1,zmm5 vpaddd zmm2,zmm2,zmm6 vpaddd zmm3,zmm3,zmm7 vpxord zmm12,zmm12,zmm0 vpxord zmm13,zmm13,zmm1 vpxord zmm14,zmm14,zmm2 vpxord zmm15,zmm15,zmm3 vprord zmm12,zmm12,0x8 vprord zmm13,zmm13,0x8 vprord zmm14,zmm14,0x8 vprord zmm15,zmm15,0x8 vpaddd zmm8,zmm8,zmm12 vpaddd zmm9,zmm9,zmm13 vpaddd zmm10,zmm10,zmm14 vpaddd zmm11,zmm11,zmm15 vpxord zmm4,zmm4,zmm8 vpxord zmm5,zmm5,zmm9 vpxord zmm6,zmm6,zmm10 vpxord zmm7,zmm7,zmm11 vprord zmm4,zmm4,0x7 vprord zmm5,zmm5,0x7 vprord zmm6,zmm6,0x7 vprord zmm7,zmm7,0x7 vpaddd zmm0,zmm0,zmm24 vpaddd zmm1,zmm1,zmm26 vpaddd 
zmm2,zmm2,zmm28 vpaddd zmm3,zmm3,zmm30 vpaddd zmm0,zmm0,zmm5 vpaddd zmm1,zmm1,zmm6 vpaddd zmm2,zmm2,zmm7 vpaddd zmm3,zmm3,zmm4 vpxord zmm15,zmm15,zmm0 vpxord zmm12,zmm12,zmm1 vpxord zmm13,zmm13,zmm2 vpxord zmm14,zmm14,zmm3 vprord zmm15,zmm15,0x10 vprord zmm12,zmm12,0x10 vprord zmm13,zmm13,0x10 vprord zmm14,zmm14,0x10 vpaddd zmm10,zmm10,zmm15 vpaddd zmm11,zmm11,zmm12 vpaddd zmm8,zmm8,zmm13 vpaddd zmm9,zmm9,zmm14 vpxord zmm5,zmm5,zmm10 vpxord zmm6,zmm6,zmm11 vpxord zmm7,zmm7,zmm8 vpxord zmm4,zmm4,zmm9 vprord zmm5,zmm5,0xc vprord zmm6,zmm6,0xc vprord zmm7,zmm7,0xc vprord zmm4,zmm4,0xc vpaddd zmm0,zmm0,zmm25 vpaddd zmm1,zmm1,zmm27 vpaddd zmm2,zmm2,zmm29 vpaddd zmm3,zmm3,zmm31 vpaddd zmm0,zmm0,zmm5 vpaddd zmm1,zmm1,zmm6 vpaddd zmm2,zmm2,zmm7 vpaddd zmm3,zmm3,zmm4 vpxord zmm15,zmm15,zmm0 vpxord zmm12,zmm12,zmm1 vpxord zmm13,zmm13,zmm2 vpxord zmm14,zmm14,zmm3 vprord zmm15,zmm15,0x8 vprord zmm12,zmm12,0x8 vprord zmm13,zmm13,0x8 vprord zmm14,zmm14,0x8 vpaddd zmm10,zmm10,zmm15 vpaddd zmm11,zmm11,zmm12 vpaddd zmm8,zmm8,zmm13 vpaddd zmm9,zmm9,zmm14 vpxord zmm5,zmm5,zmm10 vpxord zmm6,zmm6,zmm11 vpxord zmm7,zmm7,zmm8 vpxord zmm4,zmm4,zmm9 vprord zmm5,zmm5,0x7 vprord zmm6,zmm6,0x7 vprord zmm7,zmm7,0x7 vprord zmm4,zmm4,0x7 vpaddd zmm0,zmm0,zmm18 vpaddd zmm1,zmm1,zmm19 vpaddd zmm2,zmm2,zmm23 vpaddd zmm3,zmm3,zmm20 vpaddd zmm0,zmm0,zmm4 vpaddd zmm1,zmm1,zmm5 vpaddd zmm2,zmm2,zmm6 vpaddd zmm3,zmm3,zmm7 vpxord zmm12,zmm12,zmm0 vpxord zmm13,zmm13,zmm1 vpxord zmm14,zmm14,zmm2 vpxord zmm15,zmm15,zmm3 vprord zmm12,zmm12,0x10 vprord zmm13,zmm13,0x10 vprord zmm14,zmm14,0x10 vprord zmm15,zmm15,0x10 vpaddd zmm8,zmm8,zmm12 vpaddd zmm9,zmm9,zmm13 vpaddd zmm10,zmm10,zmm14 vpaddd zmm11,zmm11,zmm15 vpxord zmm4,zmm4,zmm8 vpxord zmm5,zmm5,zmm9 vpxord zmm6,zmm6,zmm10 vpxord zmm7,zmm7,zmm11 vprord zmm4,zmm4,0xc vprord zmm5,zmm5,0xc vprord zmm6,zmm6,0xc vprord zmm7,zmm7,0xc vpaddd zmm0,zmm0,zmm22 vpaddd zmm1,zmm1,zmm26 vpaddd zmm2,zmm2,zmm16 vpaddd zmm3,zmm3,zmm29 vpaddd zmm0,zmm0,zmm4 vpaddd 
zmm1,zmm1,zmm5 vpaddd zmm2,zmm2,zmm6 vpaddd zmm3,zmm3,zmm7 vpxord zmm12,zmm12,zmm0 vpxord zmm13,zmm13,zmm1 vpxord zmm14,zmm14,zmm2 vpxord zmm15,zmm15,zmm3 vprord zmm12,zmm12,0x8 vprord zmm13,zmm13,0x8 vprord zmm14,zmm14,0x8 vprord zmm15,zmm15,0x8 vpaddd zmm8,zmm8,zmm12 vpaddd zmm9,zmm9,zmm13 vpaddd zmm10,zmm10,zmm14 vpaddd zmm11,zmm11,zmm15 vpxord zmm4,zmm4,zmm8 vpxord zmm5,zmm5,zmm9 vpxord zmm6,zmm6,zmm10 vpxord zmm7,zmm7,zmm11 vprord zmm4,zmm4,0x7 vprord zmm5,zmm5,0x7 vprord zmm6,zmm6,0x7 vprord zmm7,zmm7,0x7 vpaddd zmm0,zmm0,zmm17 vpaddd zmm1,zmm1,zmm28 vpaddd zmm2,zmm2,zmm25 vpaddd zmm3,zmm3,zmm31 vpaddd zmm0,zmm0,zmm5 vpaddd zmm1,zmm1,zmm6 vpaddd zmm2,zmm2,zmm7 vpaddd zmm3,zmm3,zmm4 vpxord zmm15,zmm15,zmm0 vpxord zmm12,zmm12,zmm1 vpxord zmm13,zmm13,zmm2 vpxord zmm14,zmm14,zmm3 vprord zmm15,zmm15,0x10 vprord zmm12,zmm12,0x10 vprord zmm13,zmm13,0x10 vprord zmm14,zmm14,0x10 vpaddd zmm10,zmm10,zmm15 vpaddd zmm11,zmm11,zmm12 vpaddd zmm8,zmm8,zmm13 vpaddd zmm9,zmm9,zmm14 vpxord zmm5,zmm5,zmm10 vpxord zmm6,zmm6,zmm11 vpxord zmm7,zmm7,zmm8 vpxord zmm4,zmm4,zmm9 vprord zmm5,zmm5,0xc vprord zmm6,zmm6,0xc vprord zmm7,zmm7,0xc vprord zmm4,zmm4,0xc vpaddd zmm0,zmm0,zmm27 vpaddd zmm1,zmm1,zmm21 vpaddd zmm2,zmm2,zmm30 vpaddd zmm3,zmm3,zmm24 vpaddd zmm0,zmm0,zmm5 vpaddd zmm1,zmm1,zmm6 vpaddd zmm2,zmm2,zmm7 vpaddd zmm3,zmm3,zmm4 vpxord zmm15,zmm15,zmm0 vpxord zmm12,zmm12,zmm1 vpxord zmm13,zmm13,zmm2 vpxord zmm14,zmm14,zmm3 vprord zmm15,zmm15,0x8 vprord zmm12,zmm12,0x8 vprord zmm13,zmm13,0x8 vprord zmm14,zmm14,0x8 vpaddd zmm10,zmm10,zmm15 vpaddd zmm11,zmm11,zmm12 vpaddd zmm8,zmm8,zmm13 vpaddd zmm9,zmm9,zmm14 vpxord zmm5,zmm5,zmm10 vpxord zmm6,zmm6,zmm11 vpxord zmm7,zmm7,zmm8 vpxord zmm4,zmm4,zmm9 vprord zmm5,zmm5,0x7 vprord zmm6,zmm6,0x7 vprord zmm7,zmm7,0x7 vprord zmm4,zmm4,0x7 vpaddd zmm0,zmm0,zmm19 vpaddd zmm1,zmm1,zmm26 vpaddd zmm2,zmm2,zmm29 vpaddd zmm3,zmm3,zmm23 vpaddd zmm0,zmm0,zmm4 vpaddd zmm1,zmm1,zmm5 vpaddd zmm2,zmm2,zmm6 vpaddd zmm3,zmm3,zmm7 vpxord 
zmm12,zmm12,zmm0 vpxord zmm13,zmm13,zmm1 vpxord zmm14,zmm14,zmm2 vpxord zmm15,zmm15,zmm3 vprord zmm12,zmm12,0x10 vprord zmm13,zmm13,0x10 vprord zmm14,zmm14,0x10 vprord zmm15,zmm15,0x10 vpaddd zmm8,zmm8,zmm12 vpaddd zmm9,zmm9,zmm13 vpaddd zmm10,zmm10,zmm14 vpaddd zmm11,zmm11,zmm15 vpxord zmm4,zmm4,zmm8 vpxord zmm5,zmm5,zmm9 vpxord zmm6,zmm6,zmm10 vpxord zmm7,zmm7,zmm11 vprord zmm4,zmm4,0xc vprord zmm5,zmm5,0xc vprord zmm6,zmm6,0xc vprord zmm7,zmm7,0xc vpaddd zmm0,zmm0,zmm20 vpaddd zmm1,zmm1,zmm28 vpaddd zmm2,zmm2,zmm18 vpaddd zmm3,zmm3,zmm30 vpaddd zmm0,zmm0,zmm4 vpaddd zmm1,zmm1,zmm5 vpaddd zmm2,zmm2,zmm6 vpaddd zmm3,zmm3,zmm7 vpxord zmm12,zmm12,zmm0 vpxord zmm13,zmm13,zmm1 vpxord zmm14,zmm14,zmm2 vpxord zmm15,zmm15,zmm3 vprord zmm12,zmm12,0x8 vprord zmm13,zmm13,0x8 vprord zmm14,zmm14,0x8 vprord zmm15,zmm15,0x8 vpaddd zmm8,zmm8,zmm12 vpaddd zmm9,zmm9,zmm13 vpaddd zmm10,zmm10,zmm14 vpaddd zmm11,zmm11,zmm15 vpxord zmm4,zmm4,zmm8 vpxord zmm5,zmm5,zmm9 vpxord zmm6,zmm6,zmm10 vpxord zmm7,zmm7,zmm11 vprord zmm4,zmm4,0x7 vprord zmm5,zmm5,0x7 vprord zmm6,zmm6,0x7 vprord zmm7,zmm7,0x7 vpaddd zmm0,zmm0,zmm22 vpaddd zmm1,zmm1,zmm25 vpaddd zmm2,zmm2,zmm27 vpaddd zmm3,zmm3,zmm24 vpaddd zmm0,zmm0,zmm5 vpaddd zmm1,zmm1,zmm6 vpaddd zmm2,zmm2,zmm7 vpaddd zmm3,zmm3,zmm4 vpxord zmm15,zmm15,zmm0 vpxord zmm12,zmm12,zmm1 vpxord zmm13,zmm13,zmm2 vpxord zmm14,zmm14,zmm3 vprord zmm15,zmm15,0x10 vprord zmm12,zmm12,0x10 vprord zmm13,zmm13,0x10 vprord zmm14,zmm14,0x10 vpaddd zmm10,zmm10,zmm15 vpaddd zmm11,zmm11,zmm12 vpaddd zmm8,zmm8,zmm13 vpaddd zmm9,zmm9,zmm14 vpxord zmm5,zmm5,zmm10 vpxord zmm6,zmm6,zmm11 vpxord zmm7,zmm7,zmm8 vpxord zmm4,zmm4,zmm9 vprord zmm5,zmm5,0xc vprord zmm6,zmm6,0xc vprord zmm7,zmm7,0xc vprord zmm4,zmm4,0xc vpaddd zmm0,zmm0,zmm21 vpaddd zmm1,zmm1,zmm16 vpaddd zmm2,zmm2,zmm31 vpaddd zmm3,zmm3,zmm17 vpaddd zmm0,zmm0,zmm5 vpaddd zmm1,zmm1,zmm6 vpaddd zmm2,zmm2,zmm7 vpaddd zmm3,zmm3,zmm4 vpxord zmm15,zmm15,zmm0 vpxord zmm12,zmm12,zmm1 vpxord zmm13,zmm13,zmm2 vpxord 
zmm14,zmm14,zmm3 vprord zmm15,zmm15,0x8 vprord zmm12,zmm12,0x8 vprord zmm13,zmm13,0x8 vprord zmm14,zmm14,0x8 vpaddd zmm10,zmm10,zmm15 vpaddd zmm11,zmm11,zmm12 vpaddd zmm8,zmm8,zmm13 vpaddd zmm9,zmm9,zmm14 vpxord zmm5,zmm5,zmm10 vpxord zmm6,zmm6,zmm11 vpxord zmm7,zmm7,zmm8 vpxord zmm4,zmm4,zmm9 vprord zmm5,zmm5,0x7 vprord zmm6,zmm6,0x7 vprord zmm7,zmm7,0x7 vprord zmm4,zmm4,0x7 vpaddd zmm0,zmm0,zmm26 vpaddd zmm1,zmm1,zmm28 vpaddd zmm2,zmm2,zmm30 vpaddd zmm3,zmm3,zmm29 vpaddd zmm0,zmm0,zmm4 vpaddd zmm1,zmm1,zmm5 vpaddd zmm2,zmm2,zmm6 vpaddd zmm3,zmm3,zmm7 vpxord zmm12,zmm12,zmm0 vpxord zmm13,zmm13,zmm1 vpxord zmm14,zmm14,zmm2 vpxord zmm15,zmm15,zmm3 vprord zmm12,zmm12,0x10 vprord zmm13,zmm13,0x10 vprord zmm14,zmm14,0x10 vprord zmm15,zmm15,0x10 vpaddd zmm8,zmm8,zmm12 vpaddd zmm9,zmm9,zmm13 vpaddd zmm10,zmm10,zmm14 vpaddd zmm11,zmm11,zmm15 vpxord zmm4,zmm4,zmm8 vpxord zmm5,zmm5,zmm9 vpxord zmm6,zmm6,zmm10 vpxord zmm7,zmm7,zmm11 vprord zmm4,zmm4,0xc vprord zmm5,zmm5,0xc vprord zmm6,zmm6,0xc vprord zmm7,zmm7,0xc vpaddd zmm0,zmm0,zmm23 vpaddd zmm1,zmm1,zmm25 vpaddd zmm2,zmm2,zmm19 vpaddd zmm3,zmm3,zmm31 vpaddd zmm0,zmm0,zmm4 vpaddd zmm1,zmm1,zmm5 vpaddd zmm2,zmm2,zmm6 vpaddd zmm3,zmm3,zmm7 vpxord zmm12,zmm12,zmm0 vpxord zmm13,zmm13,zmm1 vpxord zmm14,zmm14,zmm2 vpxord zmm15,zmm15,zmm3 vprord zmm12,zmm12,0x8 vprord zmm13,zmm13,0x8 vprord zmm14,zmm14,0x8 vprord zmm15,zmm15,0x8 vpaddd zmm8,zmm8,zmm12 vpaddd zmm9,zmm9,zmm13 vpaddd zmm10,zmm10,zmm14 vpaddd zmm11,zmm11,zmm15 vpxord zmm4,zmm4,zmm8 vpxord zmm5,zmm5,zmm9 vpxord zmm6,zmm6,zmm10 vpxord zmm7,zmm7,zmm11 vprord zmm4,zmm4,0x7 vprord zmm5,zmm5,0x7 vprord zmm6,zmm6,0x7 vprord zmm7,zmm7,0x7 vpaddd zmm0,zmm0,zmm20 vpaddd zmm1,zmm1,zmm27 vpaddd zmm2,zmm2,zmm21 vpaddd zmm3,zmm3,zmm17 vpaddd zmm0,zmm0,zmm5 vpaddd zmm1,zmm1,zmm6 vpaddd zmm2,zmm2,zmm7 vpaddd zmm3,zmm3,zmm4 vpxord zmm15,zmm15,zmm0 vpxord zmm12,zmm12,zmm1 vpxord zmm13,zmm13,zmm2 vpxord zmm14,zmm14,zmm3 vprord zmm15,zmm15,0x10 vprord zmm12,zmm12,0x10 vprord 
zmm13,zmm13,0x10 vprord zmm14,zmm14,0x10 vpaddd zmm10,zmm10,zmm15 vpaddd zmm11,zmm11,zmm12 vpaddd zmm8,zmm8,zmm13 vpaddd zmm9,zmm9,zmm14 vpxord zmm5,zmm5,zmm10 vpxord zmm6,zmm6,zmm11 vpxord zmm7,zmm7,zmm8 vpxord zmm4,zmm4,zmm9 vprord zmm5,zmm5,0xc vprord zmm6,zmm6,0xc vprord zmm7,zmm7,0xc vprord zmm4,zmm4,0xc vpaddd zmm0,zmm0,zmm16 vpaddd zmm1,zmm1,zmm18 vpaddd zmm2,zmm2,zmm24 vpaddd zmm3,zmm3,zmm22 vpaddd zmm0,zmm0,zmm5 vpaddd zmm1,zmm1,zmm6 vpaddd zmm2,zmm2,zmm7 vpaddd zmm3,zmm3,zmm4 vpxord zmm15,zmm15,zmm0 vpxord zmm12,zmm12,zmm1 vpxord zmm13,zmm13,zmm2 vpxord zmm14,zmm14,zmm3 vprord zmm15,zmm15,0x8 vprord zmm12,zmm12,0x8 vprord zmm13,zmm13,0x8 vprord zmm14,zmm14,0x8 vpaddd zmm10,zmm10,zmm15 vpaddd zmm11,zmm11,zmm12 vpaddd zmm8,zmm8,zmm13 vpaddd zmm9,zmm9,zmm14 vpxord zmm5,zmm5,zmm10 vpxord zmm6,zmm6,zmm11 vpxord zmm7,zmm7,zmm8 vpxord zmm4,zmm4,zmm9 vprord zmm5,zmm5,0x7 vprord zmm6,zmm6,0x7 vprord zmm7,zmm7,0x7 vprord zmm4,zmm4,0x7 vpaddd zmm0,zmm0,zmm28 vpaddd zmm1,zmm1,zmm25 vpaddd zmm2,zmm2,zmm31 vpaddd zmm3,zmm3,zmm30 vpaddd zmm0,zmm0,zmm4 vpaddd zmm1,zmm1,zmm5 vpaddd zmm2,zmm2,zmm6 vpaddd zmm3,zmm3,zmm7 vpxord zmm12,zmm12,zmm0 vpxord zmm13,zmm13,zmm1 vpxord zmm14,zmm14,zmm2 vpxord zmm15,zmm15,zmm3 vprord zmm12,zmm12,0x10 vprord zmm13,zmm13,0x10 vprord zmm14,zmm14,0x10 vprord zmm15,zmm15,0x10 vpaddd zmm8,zmm8,zmm12 vpaddd zmm9,zmm9,zmm13 vpaddd zmm10,zmm10,zmm14 vpaddd zmm11,zmm11,zmm15 vpxord zmm4,zmm4,zmm8 vpxord zmm5,zmm5,zmm9 vpxord zmm6,zmm6,zmm10 vpxord zmm7,zmm7,zmm11 vprord zmm4,zmm4,0xc vprord zmm5,zmm5,0xc vprord zmm6,zmm6,0xc vprord zmm7,zmm7,0xc vpaddd zmm0,zmm0,zmm29 vpaddd zmm1,zmm1,zmm27 vpaddd zmm2,zmm2,zmm26 vpaddd zmm3,zmm3,zmm24 vpaddd zmm0,zmm0,zmm4 vpaddd zmm1,zmm1,zmm5 vpaddd zmm2,zmm2,zmm6 vpaddd zmm3,zmm3,zmm7 vpxord zmm12,zmm12,zmm0 vpxord zmm13,zmm13,zmm1 vpxord zmm14,zmm14,zmm2 vpxord zmm15,zmm15,zmm3 vprord zmm12,zmm12,0x8 vprord zmm13,zmm13,0x8 vprord zmm14,zmm14,0x8 vprord zmm15,zmm15,0x8 vpaddd zmm8,zmm8,zmm12 vpaddd 
zmm9,zmm9,zmm13 vpaddd zmm10,zmm10,zmm14 vpaddd zmm11,zmm11,zmm15 vpxord zmm4,zmm4,zmm8 vpxord zmm5,zmm5,zmm9 vpxord zmm6,zmm6,zmm10 vpxord zmm7,zmm7,zmm11 vprord zmm4,zmm4,0x7 vprord zmm5,zmm5,0x7 vprord zmm6,zmm6,0x7 vprord zmm7,zmm7,0x7 vpaddd zmm0,zmm0,zmm23 vpaddd zmm1,zmm1,zmm21 vpaddd zmm2,zmm2,zmm16 vpaddd zmm3,zmm3,zmm22 vpaddd zmm0,zmm0,zmm5 vpaddd zmm1,zmm1,zmm6 vpaddd zmm2,zmm2,zmm7 vpaddd zmm3,zmm3,zmm4 vpxord zmm15,zmm15,zmm0 vpxord zmm12,zmm12,zmm1 vpxord zmm13,zmm13,zmm2 vpxord zmm14,zmm14,zmm3 vprord zmm15,zmm15,0x10 vprord zmm12,zmm12,0x10 vprord zmm13,zmm13,0x10 vprord zmm14,zmm14,0x10 vpaddd zmm10,zmm10,zmm15 vpaddd zmm11,zmm11,zmm12 vpaddd zmm8,zmm8,zmm13 vpaddd zmm9,zmm9,zmm14 vpxord zmm5,zmm5,zmm10 vpxord zmm6,zmm6,zmm11 vpxord zmm7,zmm7,zmm8 vpxord zmm4,zmm4,zmm9 vprord zmm5,zmm5,0xc vprord zmm6,zmm6,0xc vprord zmm7,zmm7,0xc vprord zmm4,zmm4,0xc vpaddd zmm0,zmm0,zmm18 vpaddd zmm1,zmm1,zmm19 vpaddd zmm2,zmm2,zmm17 vpaddd zmm3,zmm3,zmm20 vpaddd zmm0,zmm0,zmm5 vpaddd zmm1,zmm1,zmm6 vpaddd zmm2,zmm2,zmm7 vpaddd zmm3,zmm3,zmm4 vpxord zmm15,zmm15,zmm0 vpxord zmm12,zmm12,zmm1 vpxord zmm13,zmm13,zmm2 vpxord zmm14,zmm14,zmm3 vprord zmm15,zmm15,0x8 vprord zmm12,zmm12,0x8 vprord zmm13,zmm13,0x8 vprord zmm14,zmm14,0x8 vpaddd zmm10,zmm10,zmm15 vpaddd zmm11,zmm11,zmm12 vpaddd zmm8,zmm8,zmm13 vpaddd zmm9,zmm9,zmm14 vpxord zmm5,zmm5,zmm10 vpxord zmm6,zmm6,zmm11 vpxord zmm7,zmm7,zmm8 vpxord zmm4,zmm4,zmm9 vprord zmm5,zmm5,0x7 vprord zmm6,zmm6,0x7 vprord zmm7,zmm7,0x7 vprord zmm4,zmm4,0x7 vpaddd zmm0,zmm0,zmm25 vpaddd zmm1,zmm1,zmm27 vpaddd zmm2,zmm2,zmm24 vpaddd zmm3,zmm3,zmm31 vpaddd zmm0,zmm0,zmm4 vpaddd zmm1,zmm1,zmm5 vpaddd zmm2,zmm2,zmm6 vpaddd zmm3,zmm3,zmm7 vpxord zmm12,zmm12,zmm0 vpxord zmm13,zmm13,zmm1 vpxord zmm14,zmm14,zmm2 vpxord zmm15,zmm15,zmm3 vprord zmm12,zmm12,0x10 vprord zmm13,zmm13,0x10 vprord zmm14,zmm14,0x10 vprord zmm15,zmm15,0x10 vpaddd zmm8,zmm8,zmm12 vpaddd zmm9,zmm9,zmm13 vpaddd zmm10,zmm10,zmm14 vpaddd zmm11,zmm11,zmm15 vpxord 
zmm4,zmm4,zmm8 vpxord zmm5,zmm5,zmm9 vpxord zmm6,zmm6,zmm10 vpxord zmm7,zmm7,zmm11 vprord zmm4,zmm4,0xc vprord zmm5,zmm5,0xc vprord zmm6,zmm6,0xc vprord zmm7,zmm7,0xc vpaddd zmm0,zmm0,zmm30 vpaddd zmm1,zmm1,zmm21 vpaddd zmm2,zmm2,zmm28 vpaddd zmm3,zmm3,zmm17 vpaddd zmm0,zmm0,zmm4 vpaddd zmm1,zmm1,zmm5 vpaddd zmm2,zmm2,zmm6 vpaddd zmm3,zmm3,zmm7 vpxord zmm12,zmm12,zmm0 vpxord zmm13,zmm13,zmm1 vpxord zmm14,zmm14,zmm2 vpxord zmm15,zmm15,zmm3 vprord zmm12,zmm12,0x8 vprord zmm13,zmm13,0x8 vprord zmm14,zmm14,0x8 vprord zmm15,zmm15,0x8 vpaddd zmm8,zmm8,zmm12 vpaddd zmm9,zmm9,zmm13 vpaddd zmm10,zmm10,zmm14 vpaddd zmm11,zmm11,zmm15 vpxord zmm4,zmm4,zmm8 vpxord zmm5,zmm5,zmm9 vpxord zmm6,zmm6,zmm10 vpxord zmm7,zmm7,zmm11 vprord zmm4,zmm4,0x7 vprord zmm5,zmm5,0x7 vprord zmm6,zmm6,0x7 vprord zmm7,zmm7,0x7 vpaddd zmm0,zmm0,zmm29 vpaddd zmm1,zmm1,zmm16 vpaddd zmm2,zmm2,zmm18 vpaddd zmm3,zmm3,zmm20 vpaddd zmm0,zmm0,zmm5 vpaddd zmm1,zmm1,zmm6 vpaddd zmm2,zmm2,zmm7 vpaddd zmm3,zmm3,zmm4 vpxord zmm15,zmm15,zmm0 vpxord zmm12,zmm12,zmm1 vpxord zmm13,zmm13,zmm2 vpxord zmm14,zmm14,zmm3 vprord zmm15,zmm15,0x10 vprord zmm12,zmm12,0x10 vprord zmm13,zmm13,0x10 vprord zmm14,zmm14,0x10 vpaddd zmm10,zmm10,zmm15 vpaddd zmm11,zmm11,zmm12 vpaddd zmm8,zmm8,zmm13 vpaddd zmm9,zmm9,zmm14 vpxord zmm5,zmm5,zmm10 vpxord zmm6,zmm6,zmm11 vpxord zmm7,zmm7,zmm8 vpxord zmm4,zmm4,zmm9 vprord zmm5,zmm5,0xc vprord zmm6,zmm6,0xc vprord zmm7,zmm7,0xc vprord zmm4,zmm4,0xc vpaddd zmm0,zmm0,zmm19 vpaddd zmm1,zmm1,zmm26 vpaddd zmm2,zmm2,zmm22 vpaddd zmm3,zmm3,zmm23 vpaddd zmm0,zmm0,zmm5 vpaddd zmm1,zmm1,zmm6 vpaddd zmm2,zmm2,zmm7 vpaddd zmm3,zmm3,zmm4 vpxord zmm15,zmm15,zmm0 vpxord zmm12,zmm12,zmm1 vpxord zmm13,zmm13,zmm2 vpxord zmm14,zmm14,zmm3 vprord zmm15,zmm15,0x8 vprord zmm12,zmm12,0x8 vprord zmm13,zmm13,0x8 vprord zmm14,zmm14,0x8 vpaddd zmm10,zmm10,zmm15 vpaddd zmm11,zmm11,zmm12 vpaddd zmm8,zmm8,zmm13 vpaddd zmm9,zmm9,zmm14 vpxord zmm5,zmm5,zmm10 vpxord zmm6,zmm6,zmm11 vpxord zmm7,zmm7,zmm8 vpxord 
zmm4,zmm4,zmm9 vprord zmm5,zmm5,0x7 vprord zmm6,zmm6,0x7 vprord zmm7,zmm7,0x7 vprord zmm4,zmm4,0x7 vpaddd zmm0,zmm0,zmm27 vpaddd zmm1,zmm1,zmm21 vpaddd zmm2,zmm2,zmm17 vpaddd zmm3,zmm3,zmm24 vpaddd zmm0,zmm0,zmm4 vpaddd zmm1,zmm1,zmm5 vpaddd zmm2,zmm2,zmm6 vpaddd zmm3,zmm3,zmm7 vpxord zmm12,zmm12,zmm0 vpxord zmm13,zmm13,zmm1 vpxord zmm14,zmm14,zmm2 vpxord zmm15,zmm15,zmm3 vprord zmm12,zmm12,0x10 vprord zmm13,zmm13,0x10 vprord zmm14,zmm14,0x10 vprord zmm15,zmm15,0x10 vpaddd zmm8,zmm8,zmm12 vpaddd zmm9,zmm9,zmm13 vpaddd zmm10,zmm10,zmm14 vpaddd zmm11,zmm11,zmm15 vpxord zmm4,zmm4,zmm8 vpxord zmm5,zmm5,zmm9 vpxord zmm6,zmm6,zmm10 vpxord zmm7,zmm7,zmm11 vprord zmm4,zmm4,0xc vprord zmm5,zmm5,0xc vprord zmm6,zmm6,0xc vprord zmm7,zmm7,0xc vpaddd zmm0,zmm0,zmm31 vpaddd zmm1,zmm1,zmm16 vpaddd zmm2,zmm2,zmm25 vpaddd zmm3,zmm3,zmm22 vpaddd zmm0,zmm0,zmm4 vpaddd zmm1,zmm1,zmm5 vpaddd zmm2,zmm2,zmm6 vpaddd zmm3,zmm3,zmm7 vpxord zmm12,zmm12,zmm0 vpxord zmm13,zmm13,zmm1 vpxord zmm14,zmm14,zmm2 vpxord zmm15,zmm15,zmm3 vprord zmm12,zmm12,0x8 vprord zmm13,zmm13,0x8 vprord zmm14,zmm14,0x8 vprord zmm15,zmm15,0x8 vpaddd zmm8,zmm8,zmm12 vpaddd zmm9,zmm9,zmm13 vpaddd zmm10,zmm10,zmm14 vpaddd zmm11,zmm11,zmm15 vpxord zmm4,zmm4,zmm8 vpxord zmm5,zmm5,zmm9 vpxord zmm6,zmm6,zmm10 vpxord zmm7,zmm7,zmm11 vprord zmm4,zmm4,0x7 vprord zmm5,zmm5,0x7 vprord zmm6,zmm6,0x7 vprord zmm7,zmm7,0x7 vpaddd zmm0,zmm0,zmm30 vpaddd zmm1,zmm1,zmm18 vpaddd zmm2,zmm2,zmm19 vpaddd zmm3,zmm3,zmm23 vpaddd zmm0,zmm0,zmm5 vpaddd zmm1,zmm1,zmm6 vpaddd zmm2,zmm2,zmm7 vpaddd zmm3,zmm3,zmm4 vpxord zmm15,zmm15,zmm0 vpxord zmm12,zmm12,zmm1 vpxord zmm13,zmm13,zmm2 vpxord zmm14,zmm14,zmm3 vprord zmm15,zmm15,0x10 vprord zmm12,zmm12,0x10 vprord zmm13,zmm13,0x10 vprord zmm14,zmm14,0x10 vpaddd zmm10,zmm10,zmm15 vpaddd zmm11,zmm11,zmm12 vpaddd zmm8,zmm8,zmm13 vpaddd zmm9,zmm9,zmm14 vpxord zmm5,zmm5,zmm10 vpxord zmm6,zmm6,zmm11 vpxord zmm7,zmm7,zmm8 vpxord zmm4,zmm4,zmm9 vprord zmm5,zmm5,0xc vprord zmm6,zmm6,0xc vprord zmm7,zmm7,0xc 
vprord zmm4,zmm4,0xc vpaddd zmm0,zmm0,zmm26 vpaddd zmm1,zmm1,zmm28 vpaddd zmm2,zmm2,zmm20 vpaddd zmm3,zmm3,zmm29 vpaddd zmm0,zmm0,zmm5 vpaddd zmm1,zmm1,zmm6 vpaddd zmm2,zmm2,zmm7 vpaddd zmm3,zmm3,zmm4 vpxord zmm15,zmm15,zmm0 vpxord zmm12,zmm12,zmm1 vpxord zmm13,zmm13,zmm2 vpxord zmm14,zmm14,zmm3 vprord zmm15,zmm15,0x8 vprord zmm12,zmm12,0x8 vprord zmm13,zmm13,0x8 vprord zmm14,zmm14,0x8 vpaddd zmm10,zmm10,zmm15 vpaddd zmm11,zmm11,zmm12 vpaddd zmm8,zmm8,zmm13 vpaddd zmm9,zmm9,zmm14 vpxord zmm5,zmm5,zmm10 vpxord zmm6,zmm6,zmm11 vpxord zmm7,zmm7,zmm8 vpxord zmm4,zmm4,zmm9 vprord zmm5,zmm5,0x7 vprord zmm6,zmm6,0x7 vprord zmm7,zmm7,0x7 vprord zmm4,zmm4,0x7 vpxord zmm0,zmm0,zmm8 vpxord zmm1,zmm1,zmm9 vpxord zmm2,zmm2,zmm10 vpxord zmm3,zmm3,zmm11 vpxord zmm4,zmm4,zmm12 vpxord zmm5,zmm5,zmm13 vpxord zmm6,zmm6,zmm14 vpxord zmm7,zmm7,zmm15 vpxord zmm8,zmm8,DWORD PTR [rdi]{1to16} vpxord zmm9,zmm9,DWORD PTR [rdi+0x4]{1to16} vpxord zmm10,zmm10,DWORD PTR [rdi+0x8]{1to16} vpxord zmm11,zmm11,DWORD PTR [rdi+0xc]{1to16} vpxord zmm12,zmm12,DWORD PTR [rdi+0x10]{1to16} vpxord zmm13,zmm13,DWORD PTR [rdi+0x14]{1to16} vpxord zmm14,zmm14,DWORD PTR [rdi+0x18]{1to16} vpxord zmm15,zmm15,DWORD PTR [rdi+0x1c]{1to16} vpunpckldq zmm16,zmm0,zmm1 vpunpckhdq zmm17,zmm0,zmm1 vpunpckldq zmm18,zmm2,zmm3 vpunpckhdq zmm19,zmm2,zmm3 vpunpckldq zmm20,zmm4,zmm5 vpunpckhdq zmm21,zmm4,zmm5 vpunpckldq zmm22,zmm6,zmm7 vpunpckhdq zmm23,zmm6,zmm7 vpunpckldq zmm24,zmm8,zmm9 vpunpckhdq zmm25,zmm8,zmm9 vpunpckldq zmm26,zmm10,zmm11 vpunpckhdq zmm27,zmm10,zmm11 vpunpckldq zmm28,zmm12,zmm13 vpunpckhdq zmm29,zmm12,zmm13 vpunpckldq zmm30,zmm14,zmm15 vpunpckhdq zmm31,zmm14,zmm15 vpunpcklqdq zmm0,zmm16,zmm18 vpunpckhqdq zmm1,zmm16,zmm18 vpunpcklqdq zmm2,zmm17,zmm19 vpunpckhqdq zmm3,zmm17,zmm19 vpunpcklqdq zmm4,zmm20,zmm22 vpunpckhqdq zmm5,zmm20,zmm22 vpunpcklqdq zmm6,zmm21,zmm23 vpunpckhqdq zmm7,zmm21,zmm23 vpunpcklqdq zmm8,zmm24,zmm26 vpunpckhqdq zmm9,zmm24,zmm26 vpunpcklqdq zmm10,zmm25,zmm27 vpunpckhqdq zmm11,zmm25,zmm27 
vpunpcklqdq zmm12,zmm28,zmm30 vpunpckhqdq zmm13,zmm28,zmm30 vpunpcklqdq zmm14,zmm29,zmm31 vpunpckhqdq zmm15,zmm29,zmm31 vshufi32x4 zmm16,zmm0,zmm4,0x88 vshufi32x4 zmm17,zmm1,zmm5,0x88 vshufi32x4 zmm18,zmm2,zmm6,0x88 vshufi32x4 zmm19,zmm3,zmm7,0x88 vshufi32x4 zmm20,zmm0,zmm4,0xdd vshufi32x4 zmm21,zmm1,zmm5,0xdd vshufi32x4 zmm22,zmm2,zmm6,0xdd vshufi32x4 zmm23,zmm3,zmm7,0xdd vshufi32x4 zmm24,zmm8,zmm12,0x88 vshufi32x4 zmm25,zmm9,zmm13,0x88 vshufi32x4 zmm26,zmm10,zmm14,0x88 vshufi32x4 zmm27,zmm11,zmm15,0x88 vshufi32x4 zmm28,zmm8,zmm12,0xdd vshufi32x4 zmm29,zmm9,zmm13,0xdd vshufi32x4 zmm30,zmm10,zmm14,0xdd vshufi32x4 zmm31,zmm11,zmm15,0xdd vshufi32x4 zmm0,zmm16,zmm24,0x88 vshufi32x4 zmm1,zmm17,zmm25,0x88 vshufi32x4 zmm2,zmm18,zmm26,0x88 vshufi32x4 zmm3,zmm19,zmm27,0x88 vshufi32x4 zmm4,zmm20,zmm28,0x88 vshufi32x4 zmm5,zmm21,zmm29,0x88 vshufi32x4 zmm6,zmm22,zmm30,0x88 vshufi32x4 zmm7,zmm23,zmm31,0x88 vshufi32x4 zmm8,zmm16,zmm24,0xdd vshufi32x4 zmm9,zmm17,zmm25,0xdd vshufi32x4 zmm10,zmm18,zmm26,0xdd vshufi32x4 zmm11,zmm19,zmm27,0xdd vshufi32x4 zmm12,zmm20,zmm28,0xdd vshufi32x4 zmm13,zmm21,zmm29,0xdd vshufi32x4 zmm14,zmm22,zmm30,0xdd vshufi32x4 zmm15,zmm23,zmm31,0xdd vmovdqu32 ZMMWORD PTR [r9],zmm0 vmovdqu32 ZMMWORD PTR [r9+0x40],zmm1 vmovdqu32 ZMMWORD PTR [r9+0x80],zmm2 vmovdqu32 ZMMWORD PTR [r9+0xc0],zmm3 vmovdqu32 ZMMWORD PTR [r9+0x100],zmm4 vmovdqu32 ZMMWORD PTR [r9+0x140],zmm5 vmovdqu32 ZMMWORD PTR [r9+0x180],zmm6 vmovdqu32 ZMMWORD PTR [r9+0x1c0],zmm7 vmovdqu32 ZMMWORD PTR [r9+0x200],zmm8 vmovdqu32 ZMMWORD PTR [r9+0x240],zmm9 vmovdqu32 ZMMWORD PTR [r9+0x280],zmm10 vmovdqu32 ZMMWORD PTR [r9+0x2c0],zmm11 vmovdqu32 ZMMWORD PTR [r9+0x300],zmm12 vmovdqu32 ZMMWORD PTR [r9+0x340],zmm13 vmovdqu32 ZMMWORD PTR [r9+0x380],zmm14 vmovdqu32 ZMMWORD PTR [r9+0x3c0],zmm15 vmovdqa32 zmm0,ZMMWORD PTR [rsp] vmovdqa32 zmm1,ZMMWORD PTR [rsp+0x40] vpaddd zmm2,zmm0,DWORD PTR [ADD16+rip]{1to16} vpcmpltud k1,zmm2,zmm0 vpaddd zmm1{k1},zmm1,DWORD PTR [ADD1+rip]{1to16} vmovdqa32 ZMMWORD PTR 
[rsp],zmm2 vmovdqa32 ZMMWORD PTR [rsp+0x40],zmm1 add r9,0x400 sub r10,0x10 cmp r10,0x10 jae 3b test r10,r10 jne 2f 9: vzeroupper mov rsp,rbp pop rbp ret 2: test r10,0x8 je 2f vpbroadcastd ymm16,DWORD PTR [rsi] vpbroadcastd ymm17,DWORD PTR [rsi+0x4] vpbroadcastd ymm18,DWORD PTR [rsi+0x8] vpbroadcastd ymm19,DWORD PTR [rsi+0xc] vpbroadcastd ymm20,DWORD PTR [rsi+0x10] vpbroadcastd ymm21,DWORD PTR [rsi+0x14] vpbroadcastd ymm22,DWORD PTR [rsi+0x18] vpbroadcastd ymm23,DWORD PTR [rsi+0x1c] vpbroadcastd ymm24,DWORD PTR [rsi+0x20] vpbroadcastd ymm25,DWORD PTR [rsi+0x24] vpbroadcastd ymm26,DWORD PTR [rsi+0x28] vpbroadcastd ymm27,DWORD PTR [rsi+0x2c] vpbroadcastd ymm28,DWORD PTR [rsi+0x30] vpbroadcastd ymm29,DWORD PTR [rsi+0x34] vpbroadcastd ymm30,DWORD PTR [rsi+0x38] vpbroadcastd ymm31,DWORD PTR [rsi+0x3c] vpbroadcastd ymm0,DWORD PTR [rdi] vpbroadcastd ymm1,DWORD PTR [rdi+0x4] vpbroadcastd ymm2,DWORD PTR [rdi+0x8] vpbroadcastd ymm3,DWORD PTR [rdi+0xc] vpbroadcastd ymm4,DWORD PTR [rdi+0x10] vpbroadcastd ymm5,DWORD PTR [rdi+0x14] vpbroadcastd ymm6,DWORD PTR [rdi+0x18] vpbroadcastd ymm7,DWORD PTR [rdi+0x1c] vpbroadcastd ymm8,DWORD PTR [BLAKE3_IV_0+rip] vpbroadcastd ymm9,DWORD PTR [BLAKE3_IV_1+rip] vpbroadcastd ymm10,DWORD PTR [BLAKE3_IV_2+rip] vpbroadcastd ymm11,DWORD PTR [BLAKE3_IV_3+rip] vmovdqa ymm12,YMMWORD PTR [rsp] vmovdqa ymm13,YMMWORD PTR [rsp+0x40] vpbroadcastd ymm14,edx vpbroadcastd ymm15,r8d vpaddd ymm0,ymm0,ymm16 vpaddd ymm1,ymm1,ymm18 vpaddd ymm2,ymm2,ymm20 vpaddd ymm3,ymm3,ymm22 vpaddd ymm0,ymm0,ymm4 vpaddd ymm1,ymm1,ymm5 vpaddd ymm2,ymm2,ymm6 vpaddd ymm3,ymm3,ymm7 vpxord ymm12,ymm12,ymm0 vpxord ymm13,ymm13,ymm1 vpxord ymm14,ymm14,ymm2 vpxord ymm15,ymm15,ymm3 vprord ymm12,ymm12,0x10 vprord ymm13,ymm13,0x10 vprord ymm14,ymm14,0x10 vprord ymm15,ymm15,0x10 vpaddd ymm8,ymm8,ymm12 vpaddd ymm9,ymm9,ymm13 vpaddd ymm10,ymm10,ymm14 vpaddd ymm11,ymm11,ymm15 vpxord ymm4,ymm4,ymm8 vpxord ymm5,ymm5,ymm9 vpxord ymm6,ymm6,ymm10 vpxord ymm7,ymm7,ymm11 vprord ymm4,ymm4,0xc vprord 
ymm5,ymm5,0xc vprord ymm6,ymm6,0xc vprord ymm7,ymm7,0xc vpaddd ymm0,ymm0,ymm17 vpaddd ymm1,ymm1,ymm19 vpaddd ymm2,ymm2,ymm21 vpaddd ymm3,ymm3,ymm23 vpaddd ymm0,ymm0,ymm4 vpaddd ymm1,ymm1,ymm5 vpaddd ymm2,ymm2,ymm6 vpaddd ymm3,ymm3,ymm7 vpxord ymm12,ymm12,ymm0 vpxord ymm13,ymm13,ymm1 vpxord ymm14,ymm14,ymm2 vpxord ymm15,ymm15,ymm3 vprord ymm12,ymm12,0x8 vprord ymm13,ymm13,0x8 vprord ymm14,ymm14,0x8 vprord ymm15,ymm15,0x8 vpaddd ymm8,ymm8,ymm12 vpaddd ymm9,ymm9,ymm13 vpaddd ymm10,ymm10,ymm14 vpaddd ymm11,ymm11,ymm15 vpxord ymm4,ymm4,ymm8 vpxord ymm5,ymm5,ymm9 vpxord ymm6,ymm6,ymm10 vpxord ymm7,ymm7,ymm11 vprord ymm4,ymm4,0x7 vprord ymm5,ymm5,0x7 vprord ymm6,ymm6,0x7 vprord ymm7,ymm7,0x7 vpaddd ymm0,ymm0,ymm24 vpaddd ymm1,ymm1,ymm26 vpaddd ymm2,ymm2,ymm28 vpaddd ymm3,ymm3,ymm30 vpaddd ymm0,ymm0,ymm5 vpaddd ymm1,ymm1,ymm6 vpaddd ymm2,ymm2,ymm7 vpaddd ymm3,ymm3,ymm4 vpxord ymm15,ymm15,ymm0 vpxord ymm12,ymm12,ymm1 vpxord ymm13,ymm13,ymm2 vpxord ymm14,ymm14,ymm3 vprord ymm15,ymm15,0x10 vprord ymm12,ymm12,0x10 vprord ymm13,ymm13,0x10 vprord ymm14,ymm14,0x10 vpaddd ymm10,ymm10,ymm15 vpaddd ymm11,ymm11,ymm12 vpaddd ymm8,ymm8,ymm13 vpaddd ymm9,ymm9,ymm14 vpxord ymm5,ymm5,ymm10 vpxord ymm6,ymm6,ymm11 vpxord ymm7,ymm7,ymm8 vpxord ymm4,ymm4,ymm9 vprord ymm5,ymm5,0xc vprord ymm6,ymm6,0xc vprord ymm7,ymm7,0xc vprord ymm4,ymm4,0xc vpaddd ymm0,ymm0,ymm25 vpaddd ymm1,ymm1,ymm27 vpaddd ymm2,ymm2,ymm29 vpaddd ymm3,ymm3,ymm31 vpaddd ymm0,ymm0,ymm5 vpaddd ymm1,ymm1,ymm6 vpaddd ymm2,ymm2,ymm7 vpaddd ymm3,ymm3,ymm4 vpxord ymm15,ymm15,ymm0 vpxord ymm12,ymm12,ymm1 vpxord ymm13,ymm13,ymm2 vpxord ymm14,ymm14,ymm3 vprord ymm15,ymm15,0x8 vprord ymm12,ymm12,0x8 vprord ymm13,ymm13,0x8 vprord ymm14,ymm14,0x8 vpaddd ymm10,ymm10,ymm15 vpaddd ymm11,ymm11,ymm12 vpaddd ymm8,ymm8,ymm13 vpaddd ymm9,ymm9,ymm14 vpxord ymm5,ymm5,ymm10 vpxord ymm6,ymm6,ymm11 vpxord ymm7,ymm7,ymm8 vpxord ymm4,ymm4,ymm9 vprord ymm5,ymm5,0x7 vprord ymm6,ymm6,0x7 vprord ymm7,ymm7,0x7 vprord ymm4,ymm4,0x7 vpaddd ymm0,ymm0,ymm18 
vpaddd ymm1,ymm1,ymm19 vpaddd ymm2,ymm2,ymm23 vpaddd ymm3,ymm3,ymm20 vpaddd ymm0,ymm0,ymm4 vpaddd ymm1,ymm1,ymm5 vpaddd ymm2,ymm2,ymm6 vpaddd ymm3,ymm3,ymm7 vpxord ymm12,ymm12,ymm0 vpxord ymm13,ymm13,ymm1 vpxord ymm14,ymm14,ymm2 vpxord ymm15,ymm15,ymm3 vprord ymm12,ymm12,0x10 vprord ymm13,ymm13,0x10 vprord ymm14,ymm14,0x10 vprord ymm15,ymm15,0x10 vpaddd ymm8,ymm8,ymm12 vpaddd ymm9,ymm9,ymm13 vpaddd ymm10,ymm10,ymm14 vpaddd ymm11,ymm11,ymm15 vpxord ymm4,ymm4,ymm8 vpxord ymm5,ymm5,ymm9 vpxord ymm6,ymm6,ymm10 vpxord ymm7,ymm7,ymm11 vprord ymm4,ymm4,0xc vprord ymm5,ymm5,0xc vprord ymm6,ymm6,0xc vprord ymm7,ymm7,0xc vpaddd ymm0,ymm0,ymm22 vpaddd ymm1,ymm1,ymm26 vpaddd ymm2,ymm2,ymm16 vpaddd ymm3,ymm3,ymm29 vpaddd ymm0,ymm0,ymm4 vpaddd ymm1,ymm1,ymm5 vpaddd ymm2,ymm2,ymm6 vpaddd ymm3,ymm3,ymm7 vpxord ymm12,ymm12,ymm0 vpxord ymm13,ymm13,ymm1 vpxord ymm14,ymm14,ymm2 vpxord ymm15,ymm15,ymm3 vprord ymm12,ymm12,0x8 vprord ymm13,ymm13,0x8 vprord ymm14,ymm14,0x8 vprord ymm15,ymm15,0x8 vpaddd ymm8,ymm8,ymm12 vpaddd ymm9,ymm9,ymm13 vpaddd ymm10,ymm10,ymm14 vpaddd ymm11,ymm11,ymm15 vpxord ymm4,ymm4,ymm8 vpxord ymm5,ymm5,ymm9 vpxord ymm6,ymm6,ymm10 vpxord ymm7,ymm7,ymm11 vprord ymm4,ymm4,0x7 vprord ymm5,ymm5,0x7 vprord ymm6,ymm6,0x7 vprord ymm7,ymm7,0x7 vpaddd ymm0,ymm0,ymm17 vpaddd ymm1,ymm1,ymm28 vpaddd ymm2,ymm2,ymm25 vpaddd ymm3,ymm3,ymm31 vpaddd ymm0,ymm0,ymm5 vpaddd ymm1,ymm1,ymm6 vpaddd ymm2,ymm2,ymm7 vpaddd ymm3,ymm3,ymm4 vpxord ymm15,ymm15,ymm0 vpxord ymm12,ymm12,ymm1 vpxord ymm13,ymm13,ymm2 vpxord ymm14,ymm14,ymm3 vprord ymm15,ymm15,0x10 vprord ymm12,ymm12,0x10 vprord ymm13,ymm13,0x10 vprord ymm14,ymm14,0x10 vpaddd ymm10,ymm10,ymm15 vpaddd ymm11,ymm11,ymm12 vpaddd ymm8,ymm8,ymm13 vpaddd ymm9,ymm9,ymm14 vpxord ymm5,ymm5,ymm10 vpxord ymm6,ymm6,ymm11 vpxord ymm7,ymm7,ymm8 vpxord ymm4,ymm4,ymm9 vprord ymm5,ymm5,0xc vprord ymm6,ymm6,0xc vprord ymm7,ymm7,0xc vprord ymm4,ymm4,0xc vpaddd ymm0,ymm0,ymm27 vpaddd ymm1,ymm1,ymm21 vpaddd ymm2,ymm2,ymm30 vpaddd ymm3,ymm3,ymm24 vpaddd 
ymm0,ymm0,ymm5 vpaddd ymm1,ymm1,ymm6 vpaddd ymm2,ymm2,ymm7 vpaddd ymm3,ymm3,ymm4 vpxord ymm15,ymm15,ymm0 vpxord ymm12,ymm12,ymm1 vpxord ymm13,ymm13,ymm2 vpxord ymm14,ymm14,ymm3 vprord ymm15,ymm15,0x8 vprord ymm12,ymm12,0x8 vprord ymm13,ymm13,0x8 vprord ymm14,ymm14,0x8 vpaddd ymm10,ymm10,ymm15 vpaddd ymm11,ymm11,ymm12 vpaddd ymm8,ymm8,ymm13 vpaddd ymm9,ymm9,ymm14 vpxord ymm5,ymm5,ymm10 vpxord ymm6,ymm6,ymm11 vpxord ymm7,ymm7,ymm8 vpxord ymm4,ymm4,ymm9 vprord ymm5,ymm5,0x7 vprord ymm6,ymm6,0x7 vprord ymm7,ymm7,0x7 vprord ymm4,ymm4,0x7 vpaddd ymm0,ymm0,ymm19 vpaddd ymm1,ymm1,ymm26 vpaddd ymm2,ymm2,ymm29 vpaddd ymm3,ymm3,ymm23 vpaddd ymm0,ymm0,ymm4 vpaddd ymm1,ymm1,ymm5 vpaddd ymm2,ymm2,ymm6 vpaddd ymm3,ymm3,ymm7 vpxord ymm12,ymm12,ymm0 vpxord ymm13,ymm13,ymm1 vpxord ymm14,ymm14,ymm2 vpxord ymm15,ymm15,ymm3 vprord ymm12,ymm12,0x10 vprord ymm13,ymm13,0x10 vprord ymm14,ymm14,0x10 vprord ymm15,ymm15,0x10 vpaddd ymm8,ymm8,ymm12 vpaddd ymm9,ymm9,ymm13 vpaddd ymm10,ymm10,ymm14 vpaddd ymm11,ymm11,ymm15 vpxord ymm4,ymm4,ymm8 vpxord ymm5,ymm5,ymm9 vpxord ymm6,ymm6,ymm10 vpxord ymm7,ymm7,ymm11 vprord ymm4,ymm4,0xc vprord ymm5,ymm5,0xc vprord ymm6,ymm6,0xc vprord ymm7,ymm7,0xc vpaddd ymm0,ymm0,ymm20 vpaddd ymm1,ymm1,ymm28 vpaddd ymm2,ymm2,ymm18 vpaddd ymm3,ymm3,ymm30 vpaddd ymm0,ymm0,ymm4 vpaddd ymm1,ymm1,ymm5 vpaddd ymm2,ymm2,ymm6 vpaddd ymm3,ymm3,ymm7 vpxord ymm12,ymm12,ymm0 vpxord ymm13,ymm13,ymm1 vpxord ymm14,ymm14,ymm2 vpxord ymm15,ymm15,ymm3 vprord ymm12,ymm12,0x8 vprord ymm13,ymm13,0x8 vprord ymm14,ymm14,0x8 vprord ymm15,ymm15,0x8 vpaddd ymm8,ymm8,ymm12 vpaddd ymm9,ymm9,ymm13 vpaddd ymm10,ymm10,ymm14 vpaddd ymm11,ymm11,ymm15 vpxord ymm4,ymm4,ymm8 vpxord ymm5,ymm5,ymm9 vpxord ymm6,ymm6,ymm10 vpxord ymm7,ymm7,ymm11 vprord ymm4,ymm4,0x7 vprord ymm5,ymm5,0x7 vprord ymm6,ymm6,0x7 vprord ymm7,ymm7,0x7 vpaddd ymm0,ymm0,ymm22 vpaddd ymm1,ymm1,ymm25 vpaddd ymm2,ymm2,ymm27 vpaddd ymm3,ymm3,ymm24 vpaddd ymm0,ymm0,ymm5 vpaddd ymm1,ymm1,ymm6 vpaddd ymm2,ymm2,ymm7 vpaddd ymm3,ymm3,ymm4 
vpxord ymm15,ymm15,ymm0 vpxord ymm12,ymm12,ymm1 vpxord ymm13,ymm13,ymm2 vpxord ymm14,ymm14,ymm3 vprord ymm15,ymm15,0x10 vprord ymm12,ymm12,0x10 vprord ymm13,ymm13,0x10 vprord ymm14,ymm14,0x10 vpaddd ymm10,ymm10,ymm15 vpaddd ymm11,ymm11,ymm12 vpaddd ymm8,ymm8,ymm13 vpaddd ymm9,ymm9,ymm14 vpxord ymm5,ymm5,ymm10 vpxord ymm6,ymm6,ymm11 vpxord ymm7,ymm7,ymm8 vpxord ymm4,ymm4,ymm9 vprord ymm5,ymm5,0xc vprord ymm6,ymm6,0xc vprord ymm7,ymm7,0xc vprord ymm4,ymm4,0xc vpaddd ymm0,ymm0,ymm21 vpaddd ymm1,ymm1,ymm16 vpaddd ymm2,ymm2,ymm31 vpaddd ymm3,ymm3,ymm17 vpaddd ymm0,ymm0,ymm5 vpaddd ymm1,ymm1,ymm6 vpaddd ymm2,ymm2,ymm7 vpaddd ymm3,ymm3,ymm4 vpxord ymm15,ymm15,ymm0 vpxord ymm12,ymm12,ymm1 vpxord ymm13,ymm13,ymm2 vpxord ymm14,ymm14,ymm3 vprord ymm15,ymm15,0x8 vprord ymm12,ymm12,0x8 vprord ymm13,ymm13,0x8 vprord ymm14,ymm14,0x8 vpaddd ymm10,ymm10,ymm15 vpaddd ymm11,ymm11,ymm12 vpaddd ymm8,ymm8,ymm13 vpaddd ymm9,ymm9,ymm14 vpxord ymm5,ymm5,ymm10 vpxord ymm6,ymm6,ymm11 vpxord ymm7,ymm7,ymm8 vpxord ymm4,ymm4,ymm9 vprord ymm5,ymm5,0x7 vprord ymm6,ymm6,0x7 vprord ymm7,ymm7,0x7 vprord ymm4,ymm4,0x7 vpaddd ymm0,ymm0,ymm26 vpaddd ymm1,ymm1,ymm28 vpaddd ymm2,ymm2,ymm30 vpaddd ymm3,ymm3,ymm29 vpaddd ymm0,ymm0,ymm4 vpaddd ymm1,ymm1,ymm5 vpaddd ymm2,ymm2,ymm6 vpaddd ymm3,ymm3,ymm7 vpxord ymm12,ymm12,ymm0 vpxord ymm13,ymm13,ymm1 vpxord ymm14,ymm14,ymm2 vpxord ymm15,ymm15,ymm3 vprord ymm12,ymm12,0x10 vprord ymm13,ymm13,0x10 vprord ymm14,ymm14,0x10 vprord ymm15,ymm15,0x10 vpaddd ymm8,ymm8,ymm12 vpaddd ymm9,ymm9,ymm13 vpaddd ymm10,ymm10,ymm14 vpaddd ymm11,ymm11,ymm15 vpxord ymm4,ymm4,ymm8 vpxord ymm5,ymm5,ymm9 vpxord ymm6,ymm6,ymm10 vpxord ymm7,ymm7,ymm11 vprord ymm4,ymm4,0xc vprord ymm5,ymm5,0xc vprord ymm6,ymm6,0xc vprord ymm7,ymm7,0xc vpaddd ymm0,ymm0,ymm23 vpaddd ymm1,ymm1,ymm25 vpaddd ymm2,ymm2,ymm19 vpaddd ymm3,ymm3,ymm31 vpaddd ymm0,ymm0,ymm4 vpaddd ymm1,ymm1,ymm5 vpaddd ymm2,ymm2,ymm6 vpaddd ymm3,ymm3,ymm7 vpxord ymm12,ymm12,ymm0 vpxord ymm13,ymm13,ymm1 vpxord ymm14,ymm14,ymm2 
vpxord ymm15,ymm15,ymm3 vprord ymm12,ymm12,0x8 vprord ymm13,ymm13,0x8 vprord ymm14,ymm14,0x8 vprord ymm15,ymm15,0x8 vpaddd ymm8,ymm8,ymm12 vpaddd ymm9,ymm9,ymm13 vpaddd ymm10,ymm10,ymm14 vpaddd ymm11,ymm11,ymm15 vpxord ymm4,ymm4,ymm8 vpxord ymm5,ymm5,ymm9 vpxord ymm6,ymm6,ymm10 vpxord ymm7,ymm7,ymm11 vprord ymm4,ymm4,0x7 vprord ymm5,ymm5,0x7 vprord ymm6,ymm6,0x7 vprord ymm7,ymm7,0x7 vpaddd ymm0,ymm0,ymm20 vpaddd ymm1,ymm1,ymm27 vpaddd ymm2,ymm2,ymm21 vpaddd ymm3,ymm3,ymm17 vpaddd ymm0,ymm0,ymm5 vpaddd ymm1,ymm1,ymm6 vpaddd ymm2,ymm2,ymm7 vpaddd ymm3,ymm3,ymm4 vpxord ymm15,ymm15,ymm0 vpxord ymm12,ymm12,ymm1 vpxord ymm13,ymm13,ymm2 vpxord ymm14,ymm14,ymm3 vprord ymm15,ymm15,0x10 vprord ymm12,ymm12,0x10 vprord ymm13,ymm13,0x10 vprord ymm14,ymm14,0x10 vpaddd ymm10,ymm10,ymm15 vpaddd ymm11,ymm11,ymm12 vpaddd ymm8,ymm8,ymm13 vpaddd ymm9,ymm9,ymm14 vpxord ymm5,ymm5,ymm10 vpxord ymm6,ymm6,ymm11 vpxord ymm7,ymm7,ymm8 vpxord ymm4,ymm4,ymm9 vprord ymm5,ymm5,0xc vprord ymm6,ymm6,0xc vprord ymm7,ymm7,0xc vprord ymm4,ymm4,0xc vpaddd ymm0,ymm0,ymm16 vpaddd ymm1,ymm1,ymm18 vpaddd ymm2,ymm2,ymm24 vpaddd ymm3,ymm3,ymm22 vpaddd ymm0,ymm0,ymm5 vpaddd ymm1,ymm1,ymm6 vpaddd ymm2,ymm2,ymm7 vpaddd ymm3,ymm3,ymm4 vpxord ymm15,ymm15,ymm0 vpxord ymm12,ymm12,ymm1 vpxord ymm13,ymm13,ymm2 vpxord ymm14,ymm14,ymm3 vprord ymm15,ymm15,0x8 vprord ymm12,ymm12,0x8 vprord ymm13,ymm13,0x8 vprord ymm14,ymm14,0x8 vpaddd ymm10,ymm10,ymm15 vpaddd ymm11,ymm11,ymm12 vpaddd ymm8,ymm8,ymm13 vpaddd ymm9,ymm9,ymm14 vpxord ymm5,ymm5,ymm10 vpxord ymm6,ymm6,ymm11 vpxord ymm7,ymm7,ymm8 vpxord ymm4,ymm4,ymm9 vprord ymm5,ymm5,0x7 vprord ymm6,ymm6,0x7 vprord ymm7,ymm7,0x7 vprord ymm4,ymm4,0x7 vpaddd ymm0,ymm0,ymm28 vpaddd ymm1,ymm1,ymm25 vpaddd ymm2,ymm2,ymm31 vpaddd ymm3,ymm3,ymm30 vpaddd ymm0,ymm0,ymm4 vpaddd ymm1,ymm1,ymm5 vpaddd ymm2,ymm2,ymm6 vpaddd ymm3,ymm3,ymm7 vpxord ymm12,ymm12,ymm0 vpxord ymm13,ymm13,ymm1 vpxord ymm14,ymm14,ymm2 vpxord ymm15,ymm15,ymm3 vprord ymm12,ymm12,0x10 vprord ymm13,ymm13,0x10 vprord 
ymm14,ymm14,0x10 vprord ymm15,ymm15,0x10 vpaddd ymm8,ymm8,ymm12 vpaddd ymm9,ymm9,ymm13 vpaddd ymm10,ymm10,ymm14 vpaddd ymm11,ymm11,ymm15 vpxord ymm4,ymm4,ymm8 vpxord ymm5,ymm5,ymm9 vpxord ymm6,ymm6,ymm10 vpxord ymm7,ymm7,ymm11 vprord ymm4,ymm4,0xc vprord ymm5,ymm5,0xc vprord ymm6,ymm6,0xc vprord ymm7,ymm7,0xc vpaddd ymm0,ymm0,ymm29 vpaddd ymm1,ymm1,ymm27 vpaddd ymm2,ymm2,ymm26 vpaddd ymm3,ymm3,ymm24 vpaddd ymm0,ymm0,ymm4 vpaddd ymm1,ymm1,ymm5 vpaddd ymm2,ymm2,ymm6 vpaddd ymm3,ymm3,ymm7 vpxord ymm12,ymm12,ymm0 vpxord ymm13,ymm13,ymm1 vpxord ymm14,ymm14,ymm2 vpxord ymm15,ymm15,ymm3 vprord ymm12,ymm12,0x8 vprord ymm13,ymm13,0x8 vprord ymm14,ymm14,0x8 vprord ymm15,ymm15,0x8 vpaddd ymm8,ymm8,ymm12 vpaddd ymm9,ymm9,ymm13 vpaddd ymm10,ymm10,ymm14 vpaddd ymm11,ymm11,ymm15 vpxord ymm4,ymm4,ymm8 vpxord ymm5,ymm5,ymm9 vpxord ymm6,ymm6,ymm10 vpxord ymm7,ymm7,ymm11 vprord ymm4,ymm4,0x7 vprord ymm5,ymm5,0x7 vprord ymm6,ymm6,0x7 vprord ymm7,ymm7,0x7 vpaddd ymm0,ymm0,ymm23 vpaddd ymm1,ymm1,ymm21 vpaddd ymm2,ymm2,ymm16 vpaddd ymm3,ymm3,ymm22 vpaddd ymm0,ymm0,ymm5 vpaddd ymm1,ymm1,ymm6 vpaddd ymm2,ymm2,ymm7 vpaddd ymm3,ymm3,ymm4 vpxord ymm15,ymm15,ymm0 vpxord ymm12,ymm12,ymm1 vpxord ymm13,ymm13,ymm2 vpxord ymm14,ymm14,ymm3 vprord ymm15,ymm15,0x10 vprord ymm12,ymm12,0x10 vprord ymm13,ymm13,0x10 vprord ymm14,ymm14,0x10 vpaddd ymm10,ymm10,ymm15 vpaddd ymm11,ymm11,ymm12 vpaddd ymm8,ymm8,ymm13 vpaddd ymm9,ymm9,ymm14 vpxord ymm5,ymm5,ymm10 vpxord ymm6,ymm6,ymm11 vpxord ymm7,ymm7,ymm8 vpxord ymm4,ymm4,ymm9 vprord ymm5,ymm5,0xc vprord ymm6,ymm6,0xc vprord ymm7,ymm7,0xc vprord ymm4,ymm4,0xc vpaddd ymm0,ymm0,ymm18 vpaddd ymm1,ymm1,ymm19 vpaddd ymm2,ymm2,ymm17 vpaddd ymm3,ymm3,ymm20 vpaddd ymm0,ymm0,ymm5 vpaddd ymm1,ymm1,ymm6 vpaddd ymm2,ymm2,ymm7 vpaddd ymm3,ymm3,ymm4 vpxord ymm15,ymm15,ymm0 vpxord ymm12,ymm12,ymm1 vpxord ymm13,ymm13,ymm2 vpxord ymm14,ymm14,ymm3 vprord ymm15,ymm15,0x8 vprord ymm12,ymm12,0x8 vprord ymm13,ymm13,0x8 vprord ymm14,ymm14,0x8 vpaddd ymm10,ymm10,ymm15 vpaddd 
ymm11,ymm11,ymm12 vpaddd ymm8,ymm8,ymm13 vpaddd ymm9,ymm9,ymm14 vpxord ymm5,ymm5,ymm10 vpxord ymm6,ymm6,ymm11 vpxord ymm7,ymm7,ymm8 vpxord ymm4,ymm4,ymm9 vprord ymm5,ymm5,0x7 vprord ymm6,ymm6,0x7 vprord ymm7,ymm7,0x7 vprord ymm4,ymm4,0x7 vpaddd ymm0,ymm0,ymm25 vpaddd ymm1,ymm1,ymm27 vpaddd ymm2,ymm2,ymm24 vpaddd ymm3,ymm3,ymm31 vpaddd ymm0,ymm0,ymm4 vpaddd ymm1,ymm1,ymm5 vpaddd ymm2,ymm2,ymm6 vpaddd ymm3,ymm3,ymm7 vpxord ymm12,ymm12,ymm0 vpxord ymm13,ymm13,ymm1 vpxord ymm14,ymm14,ymm2 vpxord ymm15,ymm15,ymm3 vprord ymm12,ymm12,0x10 vprord ymm13,ymm13,0x10 vprord ymm14,ymm14,0x10 vprord ymm15,ymm15,0x10 vpaddd ymm8,ymm8,ymm12 vpaddd ymm9,ymm9,ymm13 vpaddd ymm10,ymm10,ymm14 vpaddd ymm11,ymm11,ymm15 vpxord ymm4,ymm4,ymm8 vpxord ymm5,ymm5,ymm9 vpxord ymm6,ymm6,ymm10 vpxord ymm7,ymm7,ymm11 vprord ymm4,ymm4,0xc vprord ymm5,ymm5,0xc vprord ymm6,ymm6,0xc vprord ymm7,ymm7,0xc vpaddd ymm0,ymm0,ymm30 vpaddd ymm1,ymm1,ymm21 vpaddd ymm2,ymm2,ymm28 vpaddd ymm3,ymm3,ymm17 vpaddd ymm0,ymm0,ymm4 vpaddd ymm1,ymm1,ymm5 vpaddd ymm2,ymm2,ymm6 vpaddd ymm3,ymm3,ymm7 vpxord ymm12,ymm12,ymm0 vpxord ymm13,ymm13,ymm1 vpxord ymm14,ymm14,ymm2 vpxord ymm15,ymm15,ymm3 vprord ymm12,ymm12,0x8 vprord ymm13,ymm13,0x8 vprord ymm14,ymm14,0x8 vprord ymm15,ymm15,0x8 vpaddd ymm8,ymm8,ymm12 vpaddd ymm9,ymm9,ymm13 vpaddd ymm10,ymm10,ymm14 vpaddd ymm11,ymm11,ymm15 vpxord ymm4,ymm4,ymm8 vpxord ymm5,ymm5,ymm9 vpxord ymm6,ymm6,ymm10 vpxord ymm7,ymm7,ymm11 vprord ymm4,ymm4,0x7 vprord ymm5,ymm5,0x7 vprord ymm6,ymm6,0x7 vprord ymm7,ymm7,0x7 vpaddd ymm0,ymm0,ymm29 vpaddd ymm1,ymm1,ymm16 vpaddd ymm2,ymm2,ymm18 vpaddd ymm3,ymm3,ymm20 vpaddd ymm0,ymm0,ymm5 vpaddd ymm1,ymm1,ymm6 vpaddd ymm2,ymm2,ymm7 vpaddd ymm3,ymm3,ymm4 vpxord ymm15,ymm15,ymm0 vpxord ymm12,ymm12,ymm1 vpxord ymm13,ymm13,ymm2 vpxord ymm14,ymm14,ymm3 vprord ymm15,ymm15,0x10 vprord ymm12,ymm12,0x10 vprord ymm13,ymm13,0x10 vprord ymm14,ymm14,0x10 vpaddd ymm10,ymm10,ymm15 vpaddd ymm11,ymm11,ymm12 vpaddd ymm8,ymm8,ymm13 vpaddd ymm9,ymm9,ymm14 vpxord 
ymm5,ymm5,ymm10 vpxord ymm6,ymm6,ymm11 vpxord ymm7,ymm7,ymm8 vpxord ymm4,ymm4,ymm9 vprord ymm5,ymm5,0xc vprord ymm6,ymm6,0xc vprord ymm7,ymm7,0xc vprord ymm4,ymm4,0xc vpaddd ymm0,ymm0,ymm19 vpaddd ymm1,ymm1,ymm26 vpaddd ymm2,ymm2,ymm22 vpaddd ymm3,ymm3,ymm23 vpaddd ymm0,ymm0,ymm5 vpaddd ymm1,ymm1,ymm6 vpaddd ymm2,ymm2,ymm7 vpaddd ymm3,ymm3,ymm4 vpxord ymm15,ymm15,ymm0 vpxord ymm12,ymm12,ymm1 vpxord ymm13,ymm13,ymm2 vpxord ymm14,ymm14,ymm3 vprord ymm15,ymm15,0x8 vprord ymm12,ymm12,0x8 vprord ymm13,ymm13,0x8 vprord ymm14,ymm14,0x8 vpaddd ymm10,ymm10,ymm15 vpaddd ymm11,ymm11,ymm12 vpaddd ymm8,ymm8,ymm13 vpaddd ymm9,ymm9,ymm14 vpxord ymm5,ymm5,ymm10 vpxord ymm6,ymm6,ymm11 vpxord ymm7,ymm7,ymm8 vpxord ymm4,ymm4,ymm9 vprord ymm5,ymm5,0x7 vprord ymm6,ymm6,0x7 vprord ymm7,ymm7,0x7 vprord ymm4,ymm4,0x7 vpaddd ymm0,ymm0,ymm27 vpaddd ymm1,ymm1,ymm21 vpaddd ymm2,ymm2,ymm17 vpaddd ymm3,ymm3,ymm24 vpaddd ymm0,ymm0,ymm4 vpaddd ymm1,ymm1,ymm5 vpaddd ymm2,ymm2,ymm6 vpaddd ymm3,ymm3,ymm7 vpxord ymm12,ymm12,ymm0 vpxord ymm13,ymm13,ymm1 vpxord ymm14,ymm14,ymm2 vpxord ymm15,ymm15,ymm3 vprord ymm12,ymm12,0x10 vprord ymm13,ymm13,0x10 vprord ymm14,ymm14,0x10 vprord ymm15,ymm15,0x10 vpaddd ymm8,ymm8,ymm12 vpaddd ymm9,ymm9,ymm13 vpaddd ymm10,ymm10,ymm14 vpaddd ymm11,ymm11,ymm15 vpxord ymm4,ymm4,ymm8 vpxord ymm5,ymm5,ymm9 vpxord ymm6,ymm6,ymm10 vpxord ymm7,ymm7,ymm11 vprord ymm4,ymm4,0xc vprord ymm5,ymm5,0xc vprord ymm6,ymm6,0xc vprord ymm7,ymm7,0xc vpaddd ymm0,ymm0,ymm31 vpaddd ymm1,ymm1,ymm16 vpaddd ymm2,ymm2,ymm25 vpaddd ymm3,ymm3,ymm22 vpaddd ymm0,ymm0,ymm4 vpaddd ymm1,ymm1,ymm5 vpaddd ymm2,ymm2,ymm6 vpaddd ymm3,ymm3,ymm7 vpxord ymm12,ymm12,ymm0 vpxord ymm13,ymm13,ymm1 vpxord ymm14,ymm14,ymm2 vpxord ymm15,ymm15,ymm3 vprord ymm12,ymm12,0x8 vprord ymm13,ymm13,0x8 vprord ymm14,ymm14,0x8 vprord ymm15,ymm15,0x8 vpaddd ymm8,ymm8,ymm12 vpaddd ymm9,ymm9,ymm13 vpaddd ymm10,ymm10,ymm14 vpaddd ymm11,ymm11,ymm15 vpxord ymm4,ymm4,ymm8 vpxord ymm5,ymm5,ymm9 vpxord ymm6,ymm6,ymm10 vpxord 
ymm7,ymm7,ymm11 vprord ymm4,ymm4,0x7 vprord ymm5,ymm5,0x7 vprord ymm6,ymm6,0x7 vprord ymm7,ymm7,0x7 vpaddd ymm0,ymm0,ymm30 vpaddd ymm1,ymm1,ymm18 vpaddd ymm2,ymm2,ymm19 vpaddd ymm3,ymm3,ymm23 vpaddd ymm0,ymm0,ymm5 vpaddd ymm1,ymm1,ymm6 vpaddd ymm2,ymm2,ymm7 vpaddd ymm3,ymm3,ymm4 vpxord ymm15,ymm15,ymm0 vpxord ymm12,ymm12,ymm1 vpxord ymm13,ymm13,ymm2 vpxord ymm14,ymm14,ymm3 vprord ymm15,ymm15,0x10 vprord ymm12,ymm12,0x10 vprord ymm13,ymm13,0x10 vprord ymm14,ymm14,0x10 vpaddd ymm10,ymm10,ymm15 vpaddd ymm11,ymm11,ymm12 vpaddd ymm8,ymm8,ymm13 vpaddd ymm9,ymm9,ymm14 vpxord ymm5,ymm5,ymm10 vpxord ymm6,ymm6,ymm11 vpxord ymm7,ymm7,ymm8 vpxord ymm4,ymm4,ymm9 vprord ymm5,ymm5,0xc vprord ymm6,ymm6,0xc vprord ymm7,ymm7,0xc vprord ymm4,ymm4,0xc vpaddd ymm0,ymm0,ymm26 vpaddd ymm1,ymm1,ymm28 vpaddd ymm2,ymm2,ymm20 vpaddd ymm3,ymm3,ymm29 vpaddd ymm0,ymm0,ymm5 vpaddd ymm1,ymm1,ymm6 vpaddd ymm2,ymm2,ymm7 vpaddd ymm3,ymm3,ymm4 vpxord ymm15,ymm15,ymm0 vpxord ymm12,ymm12,ymm1 vpxord ymm13,ymm13,ymm2 vpxord ymm14,ymm14,ymm3 vprord ymm15,ymm15,0x8 vprord ymm12,ymm12,0x8 vprord ymm13,ymm13,0x8 vprord ymm14,ymm14,0x8 vpaddd ymm10,ymm10,ymm15 vpaddd ymm11,ymm11,ymm12 vpaddd ymm8,ymm8,ymm13 vpaddd ymm9,ymm9,ymm14 vpxord ymm5,ymm5,ymm10 vpxord ymm6,ymm6,ymm11 vpxord ymm7,ymm7,ymm8 vpxord ymm4,ymm4,ymm9 vprord ymm5,ymm5,0x7 vprord ymm6,ymm6,0x7 vprord ymm7,ymm7,0x7 vprord ymm4,ymm4,0x7 vpxor ymm0,ymm0,ymm8 vpxor ymm1,ymm1,ymm9 vpxor ymm2,ymm2,ymm10 vpxor ymm3,ymm3,ymm11 vpxor ymm4,ymm4,ymm12 vpxor ymm5,ymm5,ymm13 vpxor ymm6,ymm6,ymm14 vpxor ymm7,ymm7,ymm15 vpxord ymm8,ymm8,DWORD PTR [rdi]{1to8} vpxord ymm9,ymm9,DWORD PTR [rdi+0x4]{1to8} vpxord ymm10,ymm10,DWORD PTR [rdi+0x8]{1to8} vpxord ymm11,ymm11,DWORD PTR [rdi+0xc]{1to8} vpxord ymm12,ymm12,DWORD PTR [rdi+0x10]{1to8} vpxord ymm13,ymm13,DWORD PTR [rdi+0x14]{1to8} vpxord ymm14,ymm14,DWORD PTR [rdi+0x18]{1to8} vpxord ymm15,ymm15,DWORD PTR [rdi+0x1c]{1to8} vpunpckldq ymm16,ymm0,ymm1 vpunpckhdq ymm17,ymm0,ymm1 vpunpckldq ymm18,ymm2,ymm3 
vpunpckhdq ymm19,ymm2,ymm3 vpunpckldq ymm20,ymm4,ymm5 vpunpckhdq ymm21,ymm4,ymm5 vpunpckldq ymm22,ymm6,ymm7 vpunpckhdq ymm23,ymm6,ymm7 vpunpckldq ymm24,ymm8,ymm9 vpunpckhdq ymm25,ymm8,ymm9 vpunpckldq ymm26,ymm10,ymm11 vpunpckhdq ymm27,ymm10,ymm11 vpunpckldq ymm28,ymm12,ymm13 vpunpckhdq ymm29,ymm12,ymm13 vpunpckldq ymm30,ymm14,ymm15 vpunpckhdq ymm31,ymm14,ymm15 vpunpcklqdq ymm0,ymm16,ymm18 vpunpckhqdq ymm1,ymm16,ymm18 vpunpcklqdq ymm2,ymm17,ymm19 vpunpckhqdq ymm3,ymm17,ymm19 vpunpcklqdq ymm4,ymm20,ymm22 vpunpckhqdq ymm5,ymm20,ymm22 vpunpcklqdq ymm6,ymm21,ymm23 vpunpckhqdq ymm7,ymm21,ymm23 vpunpcklqdq ymm8,ymm24,ymm26 vpunpckhqdq ymm9,ymm24,ymm26 vpunpcklqdq ymm10,ymm25,ymm27 vpunpckhqdq ymm11,ymm25,ymm27 vpunpcklqdq ymm12,ymm28,ymm30 vpunpckhqdq ymm13,ymm28,ymm30 vpunpcklqdq ymm14,ymm29,ymm31 vpunpckhqdq ymm15,ymm29,ymm31 vshufi32x4 ymm16,ymm0,ymm4,0x0 vshufi32x4 ymm17,ymm8,ymm12,0x0 vshufi32x4 ymm18,ymm1,ymm5,0x0 vshufi32x4 ymm19,ymm9,ymm13,0x0 vshufi32x4 ymm20,ymm2,ymm6,0x0 vshufi32x4 ymm21,ymm10,ymm14,0x0 vshufi32x4 ymm22,ymm3,ymm7,0x0 vshufi32x4 ymm23,ymm11,ymm15,0x0 vshufi32x4 ymm24,ymm0,ymm4,0x3 vshufi32x4 ymm25,ymm8,ymm12,0x3 vshufi32x4 ymm26,ymm1,ymm5,0x3 vshufi32x4 ymm27,ymm9,ymm13,0x3 vshufi32x4 ymm28,ymm2,ymm6,0x3 vshufi32x4 ymm29,ymm10,ymm14,0x3 vshufi32x4 ymm30,ymm3,ymm7,0x3 vshufi32x4 ymm31,ymm11,ymm15,0x3 vmovdqu32 YMMWORD PTR [r9],ymm16 vmovdqu32 YMMWORD PTR [r9+0x20],ymm17 vmovdqu32 YMMWORD PTR [r9+0x40],ymm18 vmovdqu32 YMMWORD PTR [r9+0x60],ymm19 vmovdqu32 YMMWORD PTR [r9+0x80],ymm20 vmovdqu32 YMMWORD PTR [r9+0xa0],ymm21 vmovdqu32 YMMWORD PTR [r9+0xc0],ymm22 vmovdqu32 YMMWORD PTR [r9+0xe0],ymm23 vmovdqu32 YMMWORD PTR [r9+0x100],ymm24 vmovdqu32 YMMWORD PTR [r9+0x120],ymm25 vmovdqu32 YMMWORD PTR [r9+0x140],ymm26 vmovdqu32 YMMWORD PTR [r9+0x160],ymm27 vmovdqu32 YMMWORD PTR [r9+0x180],ymm28 vmovdqu32 YMMWORD PTR [r9+0x1a0],ymm29 vmovdqu32 YMMWORD PTR [r9+0x1c0],ymm30 vmovdqu32 YMMWORD PTR [r9+0x1e0],ymm31 vmovdqa ymm0,YMMWORD PTR [rsp+0x20] vmovdqa 
ymm1,YMMWORD PTR [rsp+0x60] vmovdqa YMMWORD PTR [rsp],ymm0 vmovdqa YMMWORD PTR [rsp+0x40],ymm1 add r9,0x200 sub r10,0x8 2: test r10,0x4 je 2f vbroadcasti32x4 zmm0,XMMWORD PTR [rdi] vbroadcasti32x4 zmm1,XMMWORD PTR [rdi+0x10] vbroadcasti32x4 zmm2,XMMWORD PTR [BLAKE3_IV+rip] vmovdqa xmm12,XMMWORD PTR [rsp] vmovdqa xmm13,XMMWORD PTR [rsp+0x40] vpunpckldq xmm14,xmm12,xmm13 vpunpckhdq xmm15,xmm12,xmm13 vpermq ymm14,ymm14,0xdc vpermq ymm15,ymm15,0xdc vpbroadcastd zmm12,edx vinserti64x4 zmm13,zmm14,ymm15,0x1 mov eax,0x4444 kmovw k2,eax vpblendmd zmm13{k2},zmm13,zmm12 vpbroadcastd zmm15,r8d mov eax,0x8888 kmovw k4,eax vpblendmd zmm3{k4},zmm13,zmm15 mov eax,0xaaaa kmovw k3,eax vbroadcasti32x4 zmm8,XMMWORD PTR [rsi] vbroadcasti32x4 zmm9,XMMWORD PTR [rsi+0x10] vshufps zmm4,zmm8,zmm9,0x88 vshufps zmm5,zmm8,zmm9,0xdd vbroadcasti32x4 zmm8,XMMWORD PTR [rsi+0x20] vbroadcasti32x4 zmm9,XMMWORD PTR [rsi+0x30] vshufps zmm6,zmm8,zmm9,0x88 vshufps zmm7,zmm8,zmm9,0xdd vpshufd zmm6,zmm6,0x93 vpshufd zmm7,zmm7,0x93 mov al,0x7 3: vpaddd zmm0,zmm0,zmm4 vpaddd zmm0,zmm0,zmm1 vpxord zmm3,zmm3,zmm0 vprord zmm3,zmm3,0x10 vpaddd zmm2,zmm2,zmm3 vpxord zmm1,zmm1,zmm2 vprord zmm1,zmm1,0xc vpaddd zmm0,zmm0,zmm5 vpaddd zmm0,zmm0,zmm1 vpxord zmm3,zmm3,zmm0 vprord zmm3,zmm3,0x8 vpaddd zmm2,zmm2,zmm3 vpxord zmm1,zmm1,zmm2 vprord zmm1,zmm1,0x7 vpshufd zmm0,zmm0,0x93 vpshufd zmm3,zmm3,0x4e vpshufd zmm2,zmm2,0x39 vpaddd zmm0,zmm0,zmm6 vpaddd zmm0,zmm0,zmm1 vpxord zmm3,zmm3,zmm0 vprord zmm3,zmm3,0x10 vpaddd zmm2,zmm2,zmm3 vpxord zmm1,zmm1,zmm2 vprord zmm1,zmm1,0xc vpaddd zmm0,zmm0,zmm7 vpaddd zmm0,zmm0,zmm1 vpxord zmm3,zmm3,zmm0 vprord zmm3,zmm3,0x8 vpaddd zmm2,zmm2,zmm3 vpxord zmm1,zmm1,zmm2 vprord zmm1,zmm1,0x7 vpshufd zmm0,zmm0,0x39 vpshufd zmm3,zmm3,0x4e vpshufd zmm2,zmm2,0x93 dec al je 3f vshufps zmm8,zmm4,zmm5,0xd6 vpshufd zmm9,zmm4,0xf vpshufd zmm4,zmm8,0x39 vshufps zmm8,zmm6,zmm7,0xfa vpblendmd zmm9{k3},zmm9,zmm8 vpunpcklqdq zmm8,zmm7,zmm5 vpblendmd zmm8{k4},zmm8,zmm6 vpshufd zmm8,zmm8,0x78 
vpunpckhdq zmm5,zmm5,zmm7 vpunpckldq zmm6,zmm6,zmm5 vpshufd zmm7,zmm6,0x1e vmovdqa32 zmm5,zmm9 vmovdqa32 zmm6,zmm8 jmp 3b 3: vpxord zmm0,zmm0,zmm2 vpxord zmm1,zmm1,zmm3 vbroadcasti32x4 zmm8,XMMWORD PTR [rdi] vbroadcasti32x4 zmm9,XMMWORD PTR [rdi+0x10] vpxord zmm2,zmm2,zmm8 vpxord zmm3,zmm3,zmm9 vmovdqu XMMWORD PTR [r9],xmm0 vmovdqu XMMWORD PTR [r9+0x10],xmm1 vmovdqu XMMWORD PTR [r9+0x20],xmm2 vmovdqu XMMWORD PTR [r9+0x30],xmm3 vextracti128 XMMWORD PTR [r9+0x40],ymm0,0x1 vextracti128 XMMWORD PTR [r9+0x50],ymm1,0x1 vextracti128 XMMWORD PTR [r9+0x60],ymm2,0x1 vextracti128 XMMWORD PTR [r9+0x70],ymm3,0x1 vextracti32x4 XMMWORD PTR [r9+0x80],zmm0,0x2 vextracti32x4 XMMWORD PTR [r9+0x90],zmm1,0x2 vextracti32x4 XMMWORD PTR [r9+0xa0],zmm2,0x2 vextracti32x4 XMMWORD PTR [r9+0xb0],zmm3,0x2 vextracti32x4 XMMWORD PTR [r9+0xc0],zmm0,0x3 vextracti32x4 XMMWORD PTR [r9+0xd0],zmm1,0x3 vextracti32x4 XMMWORD PTR [r9+0xe0],zmm2,0x3 vextracti32x4 XMMWORD PTR [r9+0xf0],zmm3,0x3 vmovdqa xmm0,XMMWORD PTR [rsp+0x10] vmovdqa xmm1,XMMWORD PTR [rsp+0x50] vmovdqa XMMWORD PTR [rsp],xmm0 vmovdqa XMMWORD PTR [rsp+0x40],xmm1 add r9,0x100 sub r10,0x4 2: test r10,0x2 je 2f vbroadcasti128 ymm0,XMMWORD PTR [rdi] vbroadcasti128 ymm1,XMMWORD PTR [rdi+0x10] vmovd xmm13,DWORD PTR [rsp] vpinsrd xmm13,xmm13,DWORD PTR [rsp+0x40],0x1 vpinsrd xmm13,xmm13,edx,0x2 vmovd xmm14,DWORD PTR [rsp+0x4] vpinsrd xmm14,xmm14,DWORD PTR [rsp+0x44],0x1 vpinsrd xmm14,xmm14,edx,0x2 vinserti128 ymm13,ymm13,xmm14,0x1 vbroadcasti128 ymm2,XMMWORD PTR [BLAKE3_IV+rip] vpbroadcastd ymm8,r8d vpblendd ymm3,ymm13,ymm8,0x88 vbroadcasti128 ymm8,XMMWORD PTR [rsi] vbroadcasti128 ymm9,XMMWORD PTR [rsi+0x10] vshufps ymm4,ymm8,ymm9,0x88 vshufps ymm5,ymm8,ymm9,0xdd vbroadcasti128 ymm8,XMMWORD PTR [rsi+0x20] vbroadcasti128 ymm9,XMMWORD PTR [rsi+0x30] vshufps ymm6,ymm8,ymm9,0x88 vshufps ymm7,ymm8,ymm9,0xdd vpshufd ymm6,ymm6,0x93 vpshufd ymm7,ymm7,0x93 mov al,0x7 3: vpaddd ymm0,ymm0,ymm4 vpaddd ymm0,ymm0,ymm1 vpxord ymm3,ymm3,ymm0 vprord 
ymm3,ymm3,0x10 vpaddd ymm2,ymm2,ymm3 vpxord ymm1,ymm1,ymm2 vprord ymm1,ymm1,0xc vpaddd ymm0,ymm0,ymm5 vpaddd ymm0,ymm0,ymm1 vpxord ymm3,ymm3,ymm0 vprord ymm3,ymm3,0x8 vpaddd ymm2,ymm2,ymm3 vpxord ymm1,ymm1,ymm2 vprord ymm1,ymm1,0x7 vpshufd ymm0,ymm0,0x93 vpshufd ymm3,ymm3,0x4e vpshufd ymm2,ymm2,0x39 vpaddd ymm0,ymm0,ymm6 vpaddd ymm0,ymm0,ymm1 vpxord ymm3,ymm3,ymm0 vprord ymm3,ymm3,0x10 vpaddd ymm2,ymm2,ymm3 vpxord ymm1,ymm1,ymm2 vprord ymm1,ymm1,0xc vpaddd ymm0,ymm0,ymm7 vpaddd ymm0,ymm0,ymm1 vpxord ymm3,ymm3,ymm0 vprord ymm3,ymm3,0x8 vpaddd ymm2,ymm2,ymm3 vpxord ymm1,ymm1,ymm2 vprord ymm1,ymm1,0x7 vpshufd ymm0,ymm0,0x39 vpshufd ymm3,ymm3,0x4e vpshufd ymm2,ymm2,0x93 dec al je 3f vshufps ymm8,ymm4,ymm5,0xd6 vpshufd ymm9,ymm4,0xf vpshufd ymm4,ymm8,0x39 vshufps ymm8,ymm6,ymm7,0xfa vpblendd ymm9,ymm9,ymm8,0xaa vpunpcklqdq ymm8,ymm7,ymm5 vpblendd ymm8,ymm8,ymm6,0x88 vpshufd ymm8,ymm8,0x78 vpunpckhdq ymm5,ymm5,ymm7 vpunpckldq ymm6,ymm6,ymm5 vpshufd ymm7,ymm6,0x1e vmovdqa ymm5,ymm9 vmovdqa ymm6,ymm8 jmp 3b 3: vpxor ymm0,ymm0,ymm2 vpxor ymm1,ymm1,ymm3 vbroadcasti128 ymm8,XMMWORD PTR [rdi] vbroadcasti128 ymm9,XMMWORD PTR [rdi+0x10] vpxor ymm2,ymm2,ymm8 vpxor ymm3,ymm3,ymm9 vmovdqu XMMWORD PTR [r9],xmm0 vmovdqu XMMWORD PTR [r9+0x10],xmm1 vmovdqu XMMWORD PTR [r9+0x20],xmm2 vmovdqu XMMWORD PTR [r9+0x30],xmm3 vextracti128 XMMWORD PTR [r9+0x40],ymm0,0x1 vextracti128 XMMWORD PTR [r9+0x50],ymm1,0x1 vextracti128 XMMWORD PTR [r9+0x60],ymm2,0x1 vextracti128 XMMWORD PTR [r9+0x70],ymm3,0x1 vmovdqu xmm0,XMMWORD PTR [rsp+0x8] vmovdqu xmm1,XMMWORD PTR [rsp+0x48] vmovdqa XMMWORD PTR [rsp],xmm0 vmovdqa XMMWORD PTR [rsp+0x40],xmm1 add r9,0x80 sub r10,0x2 2: test r10,0x1 je 9b vmovdqu xmm0,XMMWORD PTR [rdi] vmovdqu xmm1,XMMWORD PTR [rdi+0x10] vmovd xmm14,DWORD PTR [rsp] vpinsrd xmm14,xmm14,DWORD PTR [rsp+0x40],0x1 vpinsrd xmm14,xmm14,edx,0x2 vmovdqa xmm2,XMMWORD PTR [BLAKE3_IV+rip] vpinsrd xmm3,xmm14,r8d,0x3 vmovups xmm8,XMMWORD PTR [rsi] vmovups xmm9,XMMWORD PTR [rsi+0x10] vshufps 
xmm4,xmm8,xmm9,0x88 vshufps xmm5,xmm8,xmm9,0xdd vmovups xmm8,XMMWORD PTR [rsi+0x20] vmovups xmm9,XMMWORD PTR [rsi+0x30] vshufps xmm6,xmm8,xmm9,0x88 vshufps xmm7,xmm8,xmm9,0xdd vpshufd xmm6,xmm6,0x93 vpshufd xmm7,xmm7,0x93 mov al,0x7 3: vpaddd xmm0,xmm0,xmm4 vpaddd xmm0,xmm0,xmm1 vpxord xmm3,xmm3,xmm0 vprord xmm3,xmm3,0x10 vpaddd xmm2,xmm2,xmm3 vpxord xmm1,xmm1,xmm2 vprord xmm1,xmm1,0xc vpaddd xmm0,xmm0,xmm5 vpaddd xmm0,xmm0,xmm1 vpxord xmm3,xmm3,xmm0 vprord xmm3,xmm3,0x8 vpaddd xmm2,xmm2,xmm3 vpxord xmm1,xmm1,xmm2 vprord xmm1,xmm1,0x7 vpshufd xmm0,xmm0,0x93 vpshufd xmm3,xmm3,0x4e vpshufd xmm2,xmm2,0x39 vpaddd xmm0,xmm0,xmm6 vpaddd xmm0,xmm0,xmm1 vpxord xmm3,xmm3,xmm0 vprord xmm3,xmm3,0x10 vpaddd xmm2,xmm2,xmm3 vpxord xmm1,xmm1,xmm2 vprord xmm1,xmm1,0xc vpaddd xmm0,xmm0,xmm7 vpaddd xmm0,xmm0,xmm1 vpxord xmm3,xmm3,xmm0 vprord xmm3,xmm3,0x8 vpaddd xmm2,xmm2,xmm3 vpxord xmm1,xmm1,xmm2 vprord xmm1,xmm1,0x7 vpshufd xmm0,xmm0,0x39 vpshufd xmm3,xmm3,0x4e vpshufd xmm2,xmm2,0x93 dec al je 3f vshufps xmm8,xmm4,xmm5,0xd6 vpshufd xmm9,xmm4,0xf vpshufd xmm4,xmm8,0x39 vshufps xmm8,xmm6,xmm7,0xfa vpblendd xmm9,xmm9,xmm8,0xaa vpunpcklqdq xmm8,xmm7,xmm5 vpblendd xmm8,xmm8,xmm6,0x88 vpshufd xmm8,xmm8,0x78 vpunpckhdq xmm5,xmm5,xmm7 vpunpckldq xmm6,xmm6,xmm5 vpshufd xmm7,xmm6,0x1e vmovdqa xmm5,xmm9 vmovdqa xmm6,xmm8 jmp 3b 3: vpxor xmm0,xmm0,xmm2 vpxor xmm1,xmm1,xmm3 vpxor xmm2,xmm2,XMMWORD PTR [rdi] vpxor xmm3,xmm3,XMMWORD PTR [rdi+0x10] vmovdqu XMMWORD PTR [r9],xmm0 vmovdqu XMMWORD PTR [r9+0x10],xmm1 vmovdqu XMMWORD PTR [r9+0x20],xmm2 vmovdqu XMMWORD PTR [r9+0x30],xmm3 jmp 9b #ifdef __APPLE__ .static_data #else .section .rodata #endif /* Constant tables: emitted into .static_data on Apple targets and into .rodata everywhere else. */ .p2align 6 /* Align the first table to 2^6 = 64 bytes. */ /* INDEX0 and INDEX1 each hold 16 dword indices in the range 0..31. NOTE(review): presumably two-source permute selectors, with values of 16 and above picking from the second source; confirm at the use sites, which are outside this span. */ INDEX0: .long 0, 1, 2, 3, 16, 17, 18, 19 .long 8, 9, 10, 11, 24, 25, 26, 27 INDEX1: .long 4, 5, 6, 7, 20, 21, 22, 23 .long 12, 13, 14, 15, 28, 29, 30, 31 /* ADD0 is the dword sequence 0..15; ADD1 and ADD16 are the single dwords 1 and 16. NOTE(review): these look like counter offsets and increments; confirm at the use sites. */ ADD0: .long 0, 1, 2, 3, 4, 5, 6, 7 .long 8, 9, 10, 11, 12, 13, 14, 15 ADD1: .long 1 ADD16: .long 16 /* The dword 64; presumably the block length in bytes, TODO confirm. */ BLAKE3_BLOCK_LEN: .long 64 .p2align 6 /* Four 32-bit IV words, addressable as one 16-byte group through BLAKE3_IV or individually through BLAKE3_IV_0..BLAKE3_IV_3. The 64-byte alignment requested just above keeps aligned 16-byte loads of BLAKE3_IV (the vmovdqa from [BLAKE3_IV+rip] earlier in this file) legal. */ BLAKE3_IV: BLAKE3_IV_0: .long 0x6A09E667 
BLAKE3_IV_1: .long 0xBB67AE85 BLAKE3_IV_2: .long 0x3C6EF372 BLAKE3_IV_3: .long 0xA54FF53A blake3-1.5.4/c/blake3_avx512_x86-64_windows_gnu.S000064400000000000000000002615731046102023000172060ustar 00000000000000.intel_syntax noprefix /* Each entry point below is declared global twice, once with and once without a leading underscore; presumably so the symbol resolves whether or not the toolchain prefixes C names with an underscore. */ .global _blake3_hash_many_avx512 .global blake3_hash_many_avx512 .global blake3_compress_in_place_avx512 .global _blake3_compress_in_place_avx512 .global blake3_compress_xof_avx512 .global _blake3_compress_xof_avx512 .section .text .p2align 6 _blake3_hash_many_avx512: blake3_hash_many_avx512: push r15 push r14 push r13 push r12 push rdi push rsi push rbx push rbp mov rbp, rsp sub rsp, 304 and rsp, 0xFFFFFFFFFFFFFFC0 vmovdqa xmmword ptr [rsp+0x90], xmm6 vmovdqa xmmword ptr [rsp+0xA0], xmm7 vmovdqa xmmword ptr [rsp+0xB0], xmm8 vmovdqa xmmword ptr [rsp+0xC0], xmm9 vmovdqa xmmword ptr [rsp+0xD0], xmm10 vmovdqa xmmword ptr [rsp+0xE0], xmm11 vmovdqa xmmword ptr [rsp+0xF0], xmm12 vmovdqa xmmword ptr [rsp+0x100], xmm13 vmovdqa xmmword ptr [rsp+0x110], xmm14 vmovdqa xmmword ptr [rsp+0x120], xmm15 mov rdi, rcx mov rsi, rdx mov rdx, r8 mov rcx, r9 mov r8, qword ptr [rbp+0x68] movzx r9, byte ptr [rbp+0x70] neg r9 kmovw k1, r9d vmovd xmm0, r8d vpbroadcastd ymm0, xmm0 shr r8, 32 vmovd xmm1, r8d vpbroadcastd ymm1, xmm1 vmovdqa ymm4, ymm1 vmovdqa ymm5, ymm1 vpaddd ymm2, ymm0, ymmword ptr [ADD0+rip] vpaddd ymm3, ymm0, ymmword ptr [ADD0+32+rip] vpcmpltud k2, ymm2, ymm0 vpcmpltud k3, ymm3, ymm0 vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8} vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8} knotw k2, k1 vmovdqa32 ymm2 {k2}, ymm0 vmovdqa32 ymm3 {k2}, ymm0 vmovdqa32 ymm4 {k2}, ymm1 vmovdqa32 ymm5 {k2}, ymm1 vmovdqa ymmword ptr [rsp], ymm2 vmovdqa ymmword ptr [rsp+0x20], ymm3 vmovdqa ymmword ptr [rsp+0x40], ymm4 vmovdqa ymmword ptr [rsp+0x60], ymm5 shl rdx, 6 mov qword ptr [rsp+0x80], rdx cmp rsi, 16 jc 3f 2: vpbroadcastd zmm0, dword ptr [rcx] vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4] vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4] vpbroadcastd zmm3, dword 
ptr [rcx+0x3*0x4] vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4] vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4] vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4] vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4] movzx eax, byte ptr [rbp+0x78] movzx ebx, byte ptr [rbp+0x80] or eax, ebx xor edx, edx .p2align 5 9: movzx ebx, byte ptr [rbp+0x88] or ebx, eax add rdx, 64 cmp rdx, qword ptr [rsp+0x80] cmove eax, ebx mov dword ptr [rsp+0x88], eax mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] mov r12, qword ptr [rdi+0x40] mov r13, qword ptr [rdi+0x48] mov r14, qword ptr [rdi+0x50] mov r15, qword ptr [rdi+0x58] vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 vpunpcklqdq zmm8, zmm16, zmm17 vpunpckhqdq zmm9, zmm16, zmm17 vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 vpunpcklqdq zmm10, zmm18, zmm19 vpunpckhqdq zmm11, zmm18, zmm19 mov r8, qword ptr [rdi+0x20] mov r9, qword ptr [rdi+0x28] mov r10, qword ptr [rdi+0x30] mov r11, qword ptr [rdi+0x38] mov r12, qword ptr [rdi+0x60] mov r13, qword ptr [rdi+0x68] mov r14, qword ptr [rdi+0x70] mov r15, qword ptr [rdi+0x78] vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 vpunpcklqdq zmm12, zmm16, zmm17 vpunpckhqdq zmm13, zmm16, zmm17 vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 vpunpcklqdq zmm14, zmm18, 
zmm19 vpunpckhqdq zmm15, zmm18, zmm19 vmovdqa32 zmm27, zmmword ptr [INDEX0+rip] vmovdqa32 zmm31, zmmword ptr [INDEX1+rip] vshufps zmm16, zmm8, zmm10, 136 vshufps zmm17, zmm12, zmm14, 136 vmovdqa32 zmm20, zmm16 vpermt2d zmm16, zmm27, zmm17 vpermt2d zmm20, zmm31, zmm17 vshufps zmm17, zmm8, zmm10, 221 vshufps zmm30, zmm12, zmm14, 221 vmovdqa32 zmm21, zmm17 vpermt2d zmm17, zmm27, zmm30 vpermt2d zmm21, zmm31, zmm30 vshufps zmm18, zmm9, zmm11, 136 vshufps zmm8, zmm13, zmm15, 136 vmovdqa32 zmm22, zmm18 vpermt2d zmm18, zmm27, zmm8 vpermt2d zmm22, zmm31, zmm8 vshufps zmm19, zmm9, zmm11, 221 vshufps zmm8, zmm13, zmm15, 221 vmovdqa32 zmm23, zmm19 vpermt2d zmm19, zmm27, zmm8 vpermt2d zmm23, zmm31, zmm8 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] mov r12, qword ptr [rdi+0x40] mov r13, qword ptr [rdi+0x48] mov r14, qword ptr [rdi+0x50] mov r15, qword ptr [rdi+0x58] vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 vpunpcklqdq zmm8, zmm24, zmm25 vpunpckhqdq zmm9, zmm24, zmm25 vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 vpunpcklqdq zmm10, zmm24, zmm25 vpunpckhqdq zmm11, zmm24, zmm25 prefetcht0 [r8+rdx+0x80] prefetcht0 [r12+rdx+0x80] prefetcht0 [r9+rdx+0x80] prefetcht0 [r13+rdx+0x80] prefetcht0 [r10+rdx+0x80] prefetcht0 [r14+rdx+0x80] prefetcht0 [r11+rdx+0x80] prefetcht0 [r15+rdx+0x80] mov r8, qword ptr [rdi+0x20] mov r9, qword ptr [rdi+0x28] mov r10, qword ptr [rdi+0x30] mov r11, qword ptr [rdi+0x38] mov r12, qword ptr [rdi+0x60] mov r13, qword ptr [rdi+0x68] mov r14, qword ptr [rdi+0x70] mov r15, qword ptr [rdi+0x78] vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] 
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 vpunpcklqdq zmm12, zmm24, zmm25 vpunpckhqdq zmm13, zmm24, zmm25 vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 vpunpcklqdq zmm14, zmm24, zmm25 vpunpckhqdq zmm15, zmm24, zmm25 prefetcht0 [r8+rdx+0x80] prefetcht0 [r12+rdx+0x80] prefetcht0 [r9+rdx+0x80] prefetcht0 [r13+rdx+0x80] prefetcht0 [r10+rdx+0x80] prefetcht0 [r14+rdx+0x80] prefetcht0 [r11+rdx+0x80] prefetcht0 [r15+rdx+0x80] vshufps zmm24, zmm8, zmm10, 136 vshufps zmm30, zmm12, zmm14, 136 vmovdqa32 zmm28, zmm24 vpermt2d zmm24, zmm27, zmm30 vpermt2d zmm28, zmm31, zmm30 vshufps zmm25, zmm8, zmm10, 221 vshufps zmm30, zmm12, zmm14, 221 vmovdqa32 zmm29, zmm25 vpermt2d zmm25, zmm27, zmm30 vpermt2d zmm29, zmm31, zmm30 vshufps zmm26, zmm9, zmm11, 136 vshufps zmm8, zmm13, zmm15, 136 vmovdqa32 zmm30, zmm26 vpermt2d zmm26, zmm27, zmm8 vpermt2d zmm30, zmm31, zmm8 vshufps zmm8, zmm9, zmm11, 221 vshufps zmm10, zmm13, zmm15, 221 vpermi2d zmm27, zmm8, zmm10 vpermi2d zmm31, zmm8, zmm10 vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip] vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1+rip] vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip] vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3+rip] vmovdqa32 zmm12, zmmword ptr [rsp] vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40] vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip] vpbroadcastd zmm15, dword ptr [rsp+0x22*0x4] vpaddd zmm0, zmm0, zmm16 vpaddd zmm1, zmm1, zmm18 vpaddd zmm2, zmm2, zmm20 vpaddd zmm3, zmm3, zmm22 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 
16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm17 vpaddd zmm1, zmm1, zmm19 vpaddd zmm2, zmm2, zmm21 vpaddd zmm3, zmm3, zmm23 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm24 vpaddd zmm1, zmm1, zmm26 vpaddd zmm2, zmm2, zmm28 vpaddd zmm3, zmm3, zmm30 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm25 vpaddd zmm1, zmm1, zmm27 vpaddd zmm2, zmm2, zmm29 vpaddd zmm3, zmm3, zmm31 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, 
zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm18 vpaddd zmm1, zmm1, zmm19 vpaddd zmm2, zmm2, zmm23 vpaddd zmm3, zmm3, zmm20 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm22 vpaddd zmm1, zmm1, zmm26 vpaddd zmm2, zmm2, zmm16 vpaddd zmm3, zmm3, zmm29 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm17 vpaddd zmm1, zmm1, zmm28 vpaddd zmm2, zmm2, zmm25 vpaddd zmm3, zmm3, zmm31 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 
vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm27 vpaddd zmm1, zmm1, zmm21 vpaddd zmm2, zmm2, zmm30 vpaddd zmm3, zmm3, zmm24 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm19 vpaddd zmm1, zmm1, zmm26 vpaddd zmm2, zmm2, zmm29 vpaddd zmm3, zmm3, zmm23 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm20 vpaddd zmm1, zmm1, zmm28 vpaddd zmm2, zmm2, zmm18 vpaddd zmm3, zmm3, zmm30 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 
vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm22 vpaddd zmm1, zmm1, zmm25 vpaddd zmm2, zmm2, zmm27 vpaddd zmm3, zmm3, zmm24 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm21 vpaddd zmm1, zmm1, zmm16 vpaddd zmm2, zmm2, zmm31 vpaddd zmm3, zmm3, zmm17 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm26 vpaddd zmm1, zmm1, zmm28 vpaddd zmm2, zmm2, zmm30 vpaddd zmm3, zmm3, zmm29 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 
vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm23 vpaddd zmm1, zmm1, zmm25 vpaddd zmm2, zmm2, zmm19 vpaddd zmm3, zmm3, zmm31 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm20 vpaddd zmm1, zmm1, zmm27 vpaddd zmm2, zmm2, zmm21 vpaddd zmm3, zmm3, zmm17 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm16 vpaddd zmm1, zmm1, zmm18 vpaddd zmm2, zmm2, zmm24 vpaddd zmm3, zmm3, 
zmm22 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm28 vpaddd zmm1, zmm1, zmm25 vpaddd zmm2, zmm2, zmm31 vpaddd zmm3, zmm3, zmm30 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm29 vpaddd zmm1, zmm1, zmm27 vpaddd zmm2, zmm2, zmm26 vpaddd zmm3, zmm3, zmm24 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm23 vpaddd zmm1, zmm1, zmm21 
vpaddd zmm2, zmm2, zmm16 vpaddd zmm3, zmm3, zmm22 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm18 vpaddd zmm1, zmm1, zmm19 vpaddd zmm2, zmm2, zmm17 vpaddd zmm3, zmm3, zmm20 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm25 vpaddd zmm1, zmm1, zmm27 vpaddd zmm2, zmm2, zmm24 vpaddd zmm3, zmm3, zmm31 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 
vpaddd zmm0, zmm0, zmm30 vpaddd zmm1, zmm1, zmm21 vpaddd zmm2, zmm2, zmm28 vpaddd zmm3, zmm3, zmm17 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm29 vpaddd zmm1, zmm1, zmm16 vpaddd zmm2, zmm2, zmm18 vpaddd zmm3, zmm3, zmm20 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm19 vpaddd zmm1, zmm1, zmm26 vpaddd zmm2, zmm2, zmm22 vpaddd zmm3, zmm3, zmm23 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 
vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm27 vpaddd zmm1, zmm1, zmm21 vpaddd zmm2, zmm2, zmm17 vpaddd zmm3, zmm3, zmm24 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm31 vpaddd zmm1, zmm1, zmm16 vpaddd zmm2, zmm2, zmm25 vpaddd zmm3, zmm3, zmm22 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm30 vpaddd zmm1, zmm1, zmm18 vpaddd zmm2, zmm2, zmm19 vpaddd zmm3, zmm3, zmm23 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, 
zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm26 vpaddd zmm1, zmm1, zmm28 vpaddd zmm2, zmm2, zmm20 vpaddd zmm3, zmm3, zmm29 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpxord zmm0, zmm0, zmm8 vpxord zmm1, zmm1, zmm9 vpxord zmm2, zmm2, zmm10 vpxord zmm3, zmm3, zmm11 vpxord zmm4, zmm4, zmm12 vpxord zmm5, zmm5, zmm13 vpxord zmm6, zmm6, zmm14 vpxord zmm7, zmm7, zmm15 movzx eax, byte ptr [rbp+0x78] jne 9b mov rbx, qword ptr [rbp+0x90] vpunpckldq zmm16, zmm0, zmm1 vpunpckhdq zmm17, zmm0, zmm1 vpunpckldq zmm18, zmm2, zmm3 vpunpckhdq zmm19, zmm2, zmm3 vpunpckldq zmm20, zmm4, zmm5 vpunpckhdq zmm21, zmm4, zmm5 vpunpckldq zmm22, zmm6, zmm7 vpunpckhdq zmm23, zmm6, zmm7 vpunpcklqdq zmm0, zmm16, zmm18 vpunpckhqdq zmm1, zmm16, zmm18 vpunpcklqdq zmm2, zmm17, zmm19 vpunpckhqdq zmm3, zmm17, zmm19 vpunpcklqdq zmm4, zmm20, zmm22 vpunpckhqdq zmm5, zmm20, zmm22 vpunpcklqdq zmm6, zmm21, zmm23 vpunpckhqdq zmm7, zmm21, zmm23 vshufi32x4 zmm16, zmm0, zmm4, 0x88 vshufi32x4 zmm17, zmm1, zmm5, 0x88 vshufi32x4 zmm18, zmm2, zmm6, 0x88 vshufi32x4 zmm19, zmm3, zmm7, 0x88 vshufi32x4 zmm20, zmm0, zmm4, 0xDD vshufi32x4 zmm21, zmm1, zmm5, 0xDD vshufi32x4 zmm22, zmm2, zmm6, 0xDD vshufi32x4 zmm23, zmm3, zmm7, 0xDD vshufi32x4 zmm0, zmm16, zmm17, 0x88 vshufi32x4 zmm1, zmm18, zmm19, 0x88 vshufi32x4 zmm2, zmm20, zmm21, 0x88 vshufi32x4 zmm3, zmm22, zmm23, 0x88 vshufi32x4 zmm4, zmm16, zmm17, 0xDD vshufi32x4 zmm5, 
zmm18, zmm19, 0xDD vshufi32x4 zmm6, zmm20, zmm21, 0xDD vshufi32x4 zmm7, zmm22, zmm23, 0xDD vmovdqu32 zmmword ptr [rbx], zmm0 vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1 vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2 vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3 vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4 vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5 vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6 vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7 vmovdqa32 zmm0, zmmword ptr [rsp] vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40] vmovdqa32 zmm2, zmm0 vpaddd zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16} vpcmpltud k2, zmm2, zmm0 vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16} vmovdqa32 zmmword ptr [rsp], zmm2 vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1 add rdi, 128 add rbx, 512 mov qword ptr [rbp+0x90], rbx sub rsi, 16 cmp rsi, 16 jnc 2b test rsi, rsi jne 3f 4: vzeroupper vmovdqa xmm6, xmmword ptr [rsp+0x90] vmovdqa xmm7, xmmword ptr [rsp+0xA0] vmovdqa xmm8, xmmword ptr [rsp+0xB0] vmovdqa xmm9, xmmword ptr [rsp+0xC0] vmovdqa xmm10, xmmword ptr [rsp+0xD0] vmovdqa xmm11, xmmword ptr [rsp+0xE0] vmovdqa xmm12, xmmword ptr [rsp+0xF0] vmovdqa xmm13, xmmword ptr [rsp+0x100] vmovdqa xmm14, xmmword ptr [rsp+0x110] vmovdqa xmm15, xmmword ptr [rsp+0x120] mov rsp, rbp pop rbp pop rbx pop rsi pop rdi pop r12 pop r13 pop r14 pop r15 ret .p2align 6 3: test esi, 0x8 je 3f vpbroadcastd ymm0, dword ptr [rcx] vpbroadcastd ymm1, dword ptr [rcx+0x4] vpbroadcastd ymm2, dword ptr [rcx+0x8] vpbroadcastd ymm3, dword ptr [rcx+0xC] vpbroadcastd ymm4, dword ptr [rcx+0x10] vpbroadcastd ymm5, dword ptr [rcx+0x14] vpbroadcastd ymm6, dword ptr [rcx+0x18] vpbroadcastd ymm7, dword ptr [rcx+0x1C] mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] mov r12, qword ptr [rdi+0x20] mov r13, qword ptr [rdi+0x28] mov r14, qword ptr [rdi+0x30] mov r15, qword ptr [rdi+0x38] movzx eax, byte ptr [rbp+0x78] movzx ebx, byte ptr [rbp+0x80] or eax, ebx xor edx, edx 2: movzx ebx, byte 
ptr [rbp+0x88] or ebx, eax add rdx, 64 cmp rdx, qword ptr [rsp+0x80] cmove eax, ebx mov dword ptr [rsp+0x88], eax vmovups xmm8, xmmword ptr [r8+rdx-0x40] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x40] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x40] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x40] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm16, ymm12, ymm14, 136 vshufps ymm17, ymm12, ymm14, 221 vshufps ymm18, ymm13, ymm15, 136 vshufps ymm19, ymm13, ymm15, 221 vmovups xmm8, xmmword ptr [r8+rdx-0x30] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x30] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x30] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x30] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm20, ymm12, ymm14, 136 vshufps ymm21, ymm12, ymm14, 221 vshufps ymm22, ymm13, ymm15, 136 vshufps ymm23, ymm13, ymm15, 221 vmovups xmm8, xmmword ptr [r8+rdx-0x20] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x20] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x20] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x20] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm24, ymm12, ymm14, 136 vshufps ymm25, ymm12, ymm14, 221 vshufps ymm26, 
ymm13, ymm15, 136 vshufps ymm27, ymm13, ymm15, 221 vmovups xmm8, xmmword ptr [r8+rdx-0x10] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x10] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x10] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x10] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm28, ymm12, ymm14, 136 vshufps ymm29, ymm12, ymm14, 221 vshufps ymm30, ymm13, ymm15, 136 vshufps ymm31, ymm13, ymm15, 221 vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip] vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip] vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip] vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3+rip] vmovdqa ymm12, ymmword ptr [rsp] vmovdqa ymm13, ymmword ptr [rsp+0x40] vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN+rip] vpbroadcastd ymm15, dword ptr [rsp+0x88] vpaddd ymm0, ymm0, ymm16 vpaddd ymm1, ymm1, ymm18 vpaddd ymm2, ymm2, ymm20 vpaddd ymm3, ymm3, ymm22 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm17 vpaddd ymm1, ymm1, ymm19 vpaddd ymm2, ymm2, ymm21 vpaddd ymm3, ymm3, ymm23 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, 
ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm24 vpaddd ymm1, ymm1, ymm26 vpaddd ymm2, ymm2, ymm28 vpaddd ymm3, ymm3, ymm30 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm25 vpaddd ymm1, ymm1, ymm27 vpaddd ymm2, ymm2, ymm29 vpaddd ymm3, ymm3, ymm31 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm18 vpaddd ymm1, ymm1, ymm19 vpaddd ymm2, ymm2, ymm23 vpaddd ymm3, ymm3, ymm20 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, 
ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm22 vpaddd ymm1, ymm1, ymm26 vpaddd ymm2, ymm2, ymm16 vpaddd ymm3, ymm3, ymm29 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm17 vpaddd ymm1, ymm1, ymm28 vpaddd ymm2, ymm2, ymm25 vpaddd ymm3, ymm3, ymm31 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm27 vpaddd ymm1, ymm1, ymm21 vpaddd ymm2, ymm2, ymm30 vpaddd ymm3, ymm3, ymm24 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, 
ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm19 vpaddd ymm1, ymm1, ymm26 vpaddd ymm2, ymm2, ymm29 vpaddd ymm3, ymm3, ymm23 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm20 vpaddd ymm1, ymm1, ymm28 vpaddd ymm2, ymm2, ymm18 vpaddd ymm3, ymm3, ymm30 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm22 vpaddd ymm1, ymm1, ymm25 vpaddd ymm2, ymm2, ymm27 vpaddd ymm3, ymm3, ymm24 vpaddd ymm0, ymm0, ymm5 
vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm21 vpaddd ymm1, ymm1, ymm16 vpaddd ymm2, ymm2, ymm31 vpaddd ymm3, ymm3, ymm17 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm26 vpaddd ymm1, ymm1, ymm28 vpaddd ymm2, ymm2, ymm30 vpaddd ymm3, ymm3, ymm29 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm23 vpaddd ymm1, ymm1, ymm25 vpaddd ymm2, ymm2, 
ymm19 vpaddd ymm3, ymm3, ymm31 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm20 vpaddd ymm1, ymm1, ymm27 vpaddd ymm2, ymm2, ymm21 vpaddd ymm3, ymm3, ymm17 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm16 vpaddd ymm1, ymm1, ymm18 vpaddd ymm2, ymm2, ymm24 vpaddd ymm3, ymm3, ymm22 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm28 
vpaddd ymm1, ymm1, ymm25 vpaddd ymm2, ymm2, ymm31 vpaddd ymm3, ymm3, ymm30 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm29 vpaddd ymm1, ymm1, ymm27 vpaddd ymm2, ymm2, ymm26 vpaddd ymm3, ymm3, ymm24 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm23 vpaddd ymm1, ymm1, ymm21 vpaddd ymm2, ymm2, ymm16 vpaddd ymm3, ymm3, ymm22 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 
12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm18 vpaddd ymm1, ymm1, ymm19 vpaddd ymm2, ymm2, ymm17 vpaddd ymm3, ymm3, ymm20 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm25 vpaddd ymm1, ymm1, ymm27 vpaddd ymm2, ymm2, ymm24 vpaddd ymm3, ymm3, ymm31 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm30 vpaddd ymm1, ymm1, ymm21 vpaddd ymm2, ymm2, ymm28 vpaddd ymm3, ymm3, ymm17 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, 
ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm29 vpaddd ymm1, ymm1, ymm16 vpaddd ymm2, ymm2, ymm18 vpaddd ymm3, ymm3, ymm20 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm19 vpaddd ymm1, ymm1, ymm26 vpaddd ymm2, ymm2, ymm22 vpaddd ymm3, ymm3, ymm23 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm27 vpaddd ymm1, ymm1, ymm21 vpaddd ymm2, ymm2, ymm17 vpaddd ymm3, ymm3, ymm24 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, 
ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm31 vpaddd ymm1, ymm1, ymm16 vpaddd ymm2, ymm2, ymm25 vpaddd ymm3, ymm3, ymm22 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm30 vpaddd ymm1, ymm1, ymm18 vpaddd ymm2, ymm2, ymm19 vpaddd ymm3, ymm3, ymm23 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm26 vpaddd ymm1, ymm1, ymm28 vpaddd ymm2, ymm2, ymm20 vpaddd ymm3, ymm3, ymm29 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, 
ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpxor ymm0, ymm0, ymm8 vpxor ymm1, ymm1, ymm9 vpxor ymm2, ymm2, ymm10 vpxor ymm3, ymm3, ymm11 vpxor ymm4, ymm4, ymm12 vpxor ymm5, ymm5, ymm13 vpxor ymm6, ymm6, ymm14 vpxor ymm7, ymm7, ymm15 movzx eax, byte ptr [rbp+0x78] jne 2b mov rbx, qword ptr [rbp+0x90] vunpcklps ymm8, ymm0, ymm1 vunpcklps ymm9, ymm2, ymm3 vunpckhps ymm10, ymm0, ymm1 vunpcklps ymm11, ymm4, ymm5 vunpcklps ymm0, ymm6, ymm7 vshufps ymm12, ymm8, ymm9, 78 vblendps ymm1, ymm8, ymm12, 0xCC vshufps ymm8, ymm11, ymm0, 78 vunpckhps ymm13, ymm2, ymm3 vblendps ymm2, ymm11, ymm8, 0xCC vblendps ymm3, ymm12, ymm9, 0xCC vperm2f128 ymm12, ymm1, ymm2, 0x20 vmovups ymmword ptr [rbx], ymm12 vunpckhps ymm14, ymm4, ymm5 vblendps ymm4, ymm8, ymm0, 0xCC vunpckhps ymm15, ymm6, ymm7 vperm2f128 ymm7, ymm3, ymm4, 0x20 vmovups ymmword ptr [rbx+0x20], ymm7 vshufps ymm5, ymm10, ymm13, 78 vblendps ymm6, ymm5, ymm13, 0xCC vshufps ymm13, ymm14, ymm15, 78 vblendps ymm10, ymm10, ymm5, 0xCC vblendps ymm14, ymm14, ymm13, 0xCC vperm2f128 ymm8, ymm10, ymm14, 0x20 vmovups ymmword ptr [rbx+0x40], ymm8 vblendps ymm15, ymm13, ymm15, 0xCC vperm2f128 ymm13, ymm6, ymm15, 0x20 vmovups ymmword ptr [rbx+0x60], ymm13 vperm2f128 ymm9, ymm1, ymm2, 0x31 vperm2f128 ymm11, ymm3, ymm4, 0x31 vmovups ymmword ptr [rbx+0x80], ymm9 vperm2f128 ymm14, ymm10, ymm14, 0x31 vperm2f128 ymm15, ymm6, ymm15, 0x31 vmovups ymmword ptr [rbx+0xA0], ymm11 vmovups ymmword ptr [rbx+0xC0], ymm14 vmovups ymmword ptr [rbx+0xE0], ymm15 vmovdqa ymm0, ymmword ptr [rsp] vmovdqa ymm2, ymmword ptr [rsp+0x40] vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+0x1*0x20] vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+0x3*0x20] vmovdqa ymmword ptr [rsp], ymm0 vmovdqa ymmword ptr [rsp+0x40], ymm2 add rbx, 256 mov qword ptr [rbp+0x90], rbx add rdi, 64 sub rsi, 8 3: mov rbx, qword ptr [rbp+0x90] mov r15, qword ptr [rsp+0x80] movzx r13, byte 
ptr [rbp+0x78] movzx r12, byte ptr [rbp+0x88] test esi, 0x4 je 3f vbroadcasti32x4 zmm0, xmmword ptr [rcx] vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x1*0x10] vmovdqa xmm12, xmmword ptr [rsp] vmovdqa xmm13, xmmword ptr [rsp+0x40] vpunpckldq xmm14, xmm12, xmm13 vpunpckhdq xmm15, xmm12, xmm13 vpermq ymm14, ymm14, 0xDC vpermq ymm15, ymm15, 0xDC vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip] vinserti64x4 zmm13, zmm14, ymm15, 0x01 mov eax, 17476 kmovw k2, eax vpblendmd zmm13 {k2}, zmm13, zmm12 vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip] mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] mov eax, 43690 kmovw k3, eax mov eax, 34952 kmovw k4, eax movzx eax, byte ptr [rbp+0x80] or eax, r13d xor edx, edx .p2align 5 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d mov dword ptr [rsp+0x88], eax vmovdqa32 zmm2, zmm15 vpbroadcastd zmm8, dword ptr [rsp+0x22*0x4] vpblendmd zmm3 {k4}, zmm13, zmm8 vmovups zmm8, zmmword ptr [r8+rdx-0x1*0x40] vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x4*0x10], 0x01 vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x4*0x10], 0x02 vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x4*0x10], 0x03 vmovups zmm9, zmmword ptr [r8+rdx-0x30] vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x3*0x10], 0x01 vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x3*0x10], 0x02 vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x3*0x10], 0x03 vshufps zmm4, zmm8, zmm9, 136 vshufps zmm5, zmm8, zmm9, 221 vmovups zmm8, zmmword ptr [r8+rdx-0x20] vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x2*0x10], 0x01 vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x2*0x10], 0x02 vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x2*0x10], 0x03 vmovups zmm9, zmmword ptr [r8+rdx-0x10] vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x1*0x10], 0x01 vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x1*0x10], 0x02 vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x1*0x10], 0x03 vshufps zmm6, zmm8, zmm9, 136 vshufps zmm7, zmm8, 
zmm9, 221 vpshufd zmm6, zmm6, 0x93 vpshufd zmm7, zmm7, 0x93 mov al, 7 9: vpaddd zmm0, zmm0, zmm4 vpaddd zmm0, zmm0, zmm1 vpxord zmm3, zmm3, zmm0 vprord zmm3, zmm3, 16 vpaddd zmm2, zmm2, zmm3 vpxord zmm1, zmm1, zmm2 vprord zmm1, zmm1, 12 vpaddd zmm0, zmm0, zmm5 vpaddd zmm0, zmm0, zmm1 vpxord zmm3, zmm3, zmm0 vprord zmm3, zmm3, 8 vpaddd zmm2, zmm2, zmm3 vpxord zmm1, zmm1, zmm2 vprord zmm1, zmm1, 7 vpshufd zmm0, zmm0, 0x93 vpshufd zmm3, zmm3, 0x4E vpshufd zmm2, zmm2, 0x39 vpaddd zmm0, zmm0, zmm6 vpaddd zmm0, zmm0, zmm1 vpxord zmm3, zmm3, zmm0 vprord zmm3, zmm3, 16 vpaddd zmm2, zmm2, zmm3 vpxord zmm1, zmm1, zmm2 vprord zmm1, zmm1, 12 vpaddd zmm0, zmm0, zmm7 vpaddd zmm0, zmm0, zmm1 vpxord zmm3, zmm3, zmm0 vprord zmm3, zmm3, 8 vpaddd zmm2, zmm2, zmm3 vpxord zmm1, zmm1, zmm2 vprord zmm1, zmm1, 7 vpshufd zmm0, zmm0, 0x39 vpshufd zmm3, zmm3, 0x4E vpshufd zmm2, zmm2, 0x93 dec al jz 9f vshufps zmm8, zmm4, zmm5, 214 vpshufd zmm9, zmm4, 0x0F vpshufd zmm4, zmm8, 0x39 vshufps zmm8, zmm6, zmm7, 250 vpblendmd zmm9 {k3}, zmm9, zmm8 vpunpcklqdq zmm8, zmm7, zmm5 vpblendmd zmm8 {k4}, zmm8, zmm6 vpshufd zmm8, zmm8, 0x78 vpunpckhdq zmm5, zmm5, zmm7 vpunpckldq zmm6, zmm6, zmm5 vpshufd zmm7, zmm6, 0x1E vmovdqa32 zmm5, zmm9 vmovdqa32 zmm6, zmm8 jmp 9b 9: vpxord zmm0, zmm0, zmm2 vpxord zmm1, zmm1, zmm3 mov eax, r13d cmp rdx, r15 jne 2b vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+0x10], xmm1 vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 vextracti32x4 xmmword ptr [rbx+0x4*0x10], zmm0, 0x02 vextracti32x4 xmmword ptr [rbx+0x5*0x10], zmm1, 0x02 vextracti32x4 xmmword ptr [rbx+0x6*0x10], zmm0, 0x03 vextracti32x4 xmmword ptr [rbx+0x7*0x10], zmm1, 0x03 vmovdqa xmm0, xmmword ptr [rsp] vmovdqa xmm2, xmmword ptr [rsp+0x40] vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+0x1*0x10] vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+0x5*0x10] vmovdqa xmmword ptr [rsp], xmm0 vmovdqa xmmword ptr [rsp+0x40], xmm2 add rbx, 128 add rdi, 32 sub rsi, 4 3: test esi, 0x2 je 
3f vbroadcasti128 ymm0, xmmword ptr [rcx] vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] vmovd xmm13, dword ptr [rsp] vpinsrd xmm13, xmm13, dword ptr [rsp+0x40], 1 vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 vmovd xmm14, dword ptr [rsp+0x4] vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1 vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 vinserti128 ymm13, ymm13, xmm14, 0x01 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] movzx eax, byte ptr [rbp+0x80] or eax, r13d xor edx, edx .p2align 5 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d mov dword ptr [rsp+0x88], eax vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] vpbroadcastd ymm8, dword ptr [rsp+0x88] vpblendd ymm3, ymm13, ymm8, 0x88 vmovups ymm8, ymmword ptr [r8+rdx-0x40] vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 vmovups ymm9, ymmword ptr [r8+rdx-0x30] vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 vshufps ymm4, ymm8, ymm9, 136 vshufps ymm5, ymm8, ymm9, 221 vmovups ymm8, ymmword ptr [r8+rdx-0x20] vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 vmovups ymm9, ymmword ptr [r8+rdx-0x10] vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 vshufps ymm6, ymm8, ymm9, 136 vshufps ymm7, ymm8, ymm9, 221 vpshufd ymm6, ymm6, 0x93 vpshufd ymm7, ymm7, 0x93 mov al, 7 9: vpaddd ymm0, ymm0, ymm4 vpaddd ymm0, ymm0, ymm1 vpxord ymm3, ymm3, ymm0 vprord ymm3, ymm3, 16 vpaddd ymm2, ymm2, ymm3 vpxord ymm1, ymm1, ymm2 vprord ymm1, ymm1, 12 vpaddd ymm0, ymm0, ymm5 vpaddd ymm0, ymm0, ymm1 vpxord ymm3, ymm3, ymm0 vprord ymm3, ymm3, 8 vpaddd ymm2, ymm2, ymm3 vpxord ymm1, ymm1, ymm2 vprord ymm1, ymm1, 7 vpshufd ymm0, ymm0, 0x93 vpshufd ymm3, ymm3, 0x4E vpshufd ymm2, ymm2, 0x39 vpaddd ymm0, ymm0, ymm6 vpaddd ymm0, ymm0, ymm1 vpxord ymm3, ymm3, ymm0 vprord ymm3, ymm3, 16 vpaddd ymm2, ymm2, ymm3 vpxord ymm1, ymm1, ymm2 vprord ymm1, ymm1, 12 vpaddd ymm0, ymm0, ymm7 vpaddd ymm0, ymm0, ymm1 vpxord ymm3, ymm3, ymm0 vprord ymm3, ymm3, 8 vpaddd ymm2, ymm2, ymm3 vpxord ymm1, 
ymm1, ymm2 vprord ymm1, ymm1, 7 vpshufd ymm0, ymm0, 0x39 vpshufd ymm3, ymm3, 0x4E vpshufd ymm2, ymm2, 0x93 dec al jz 9f vshufps ymm8, ymm4, ymm5, 214 vpshufd ymm9, ymm4, 0x0F vpshufd ymm4, ymm8, 0x39 vshufps ymm8, ymm6, ymm7, 250 vpblendd ymm9, ymm9, ymm8, 0xAA vpunpcklqdq ymm8, ymm7, ymm5 vpblendd ymm8, ymm8, ymm6, 0x88 vpshufd ymm8, ymm8, 0x78 vpunpckhdq ymm5, ymm5, ymm7 vpunpckldq ymm6, ymm6, ymm5 vpshufd ymm7, ymm6, 0x1E vmovdqa ymm5, ymm9 vmovdqa ymm6, ymm8 jmp 9b 9: vpxor ymm0, ymm0, ymm2 vpxor ymm1, ymm1, ymm3 mov eax, r13d cmp rdx, r15 jne 2b vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+0x10], xmm1 vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 vmovdqa xmm0, xmmword ptr [rsp] vmovdqa xmm2, xmmword ptr [rsp+0x40] vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+0x8] vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+0x48] vmovdqa xmmword ptr [rsp], xmm0 vmovdqa xmmword ptr [rsp+0x40], xmm2 add rbx, 64 add rdi, 16 sub rsi, 2 3: test esi, 0x1 je 4b vmovdqu xmm0, xmmword ptr [rcx] vmovdqu xmm1, xmmword ptr [rcx+0x10] vmovd xmm14, dword ptr [rsp] vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1 vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip] mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+0x80] or eax, r13d xor edx, edx .p2align 5 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d vpinsrd xmm3, xmm14, eax, 3 vmovdqa xmm2, xmm15 vmovups xmm8, xmmword ptr [r8+rdx-0x40] vmovups xmm9, xmmword ptr [r8+rdx-0x30] vshufps xmm4, xmm8, xmm9, 136 vshufps xmm5, xmm8, xmm9, 221 vmovups xmm8, xmmword ptr [r8+rdx-0x20] vmovups xmm9, xmmword ptr [r8+rdx-0x10] vshufps xmm6, xmm8, xmm9, 136 vshufps xmm7, xmm8, xmm9, 221 vpshufd xmm6, xmm6, 0x93 vpshufd xmm7, xmm7, 0x93 mov al, 7 9: vpaddd xmm0, xmm0, xmm4 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 16 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 12 vpaddd xmm0, xmm0, xmm5 vpaddd 
xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 8 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 7 vpshufd xmm0, xmm0, 0x93 vpshufd xmm3, xmm3, 0x4E vpshufd xmm2, xmm2, 0x39 vpaddd xmm0, xmm0, xmm6 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 16 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 12 vpaddd xmm0, xmm0, xmm7 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 8 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 7 vpshufd xmm0, xmm0, 0x39 vpshufd xmm3, xmm3, 0x4E vpshufd xmm2, xmm2, 0x93 dec al jz 9f vshufps xmm8, xmm4, xmm5, 214 vpshufd xmm9, xmm4, 0x0F vpshufd xmm4, xmm8, 0x39 vshufps xmm8, xmm6, xmm7, 250 vpblendd xmm9, xmm9, xmm8, 0xAA vpunpcklqdq xmm8, xmm7, xmm5 vpblendd xmm8, xmm8, xmm6, 0x88 vpshufd xmm8, xmm8, 0x78 vpunpckhdq xmm5, xmm5, xmm7 vpunpckldq xmm6, xmm6, xmm5 vpshufd xmm7, xmm6, 0x1E vmovdqa xmm5, xmm9 vmovdqa xmm6, xmm8 jmp 9b 9: vpxor xmm0, xmm0, xmm2 vpxor xmm1, xmm1, xmm3 mov eax, r13d cmp rdx, r15 jne 2b vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+0x10], xmm1 jmp 4b
/*
 * blake3_compress_in_place_avx512
 * (the same entry point is also labelled with a leading underscore)
 *
 * Runs seven rounds of mixing over a 16-dword working state built from a
 * 32-byte input state, the 16-byte BLAKE3_IV constant and four parameter
 * dwords, using a 64-byte message block, then writes a 32-byte result back
 * over the input state.
 *
 * Inputs, as read by the code:
 *   rcx              pointer to 32 bytes; loaded as state rows 0-1 and
 *                    overwritten with the result
 *   rdx              pointer to 64 message bytes (unaligned loads)
 *   r8b              one byte, zero-extended; becomes dword 2 of row 3
 *   r9               64-bit value; becomes dwords 0-1 of row 3
 *   [rsp+0x28]       (entry rsp) one byte; becomes dword 3 of row 3.
 *                    It is read at [rsp+0x70] after the 72-byte frame is
 *                    allocated.
 *
 * NOTE(review): rcx/rdx/r8/r9 plus a stack byte at entry rsp+0x28, and the
 * saving of xmm6-xmm9, look like the Microsoft x64 calling convention with a
 * (state, block, block_len, counter, flags) C prototype; confirm against the
 * C header this is linked with.
 *
 * Register use: xmm0-xmm3 hold state rows 0-3, xmm4-xmm7 hold the message
 * words, xmm8/xmm9 are scratch. xmm6-xmm9 are saved on entry and restored
 * before returning. rax, r8 and xmm0-xmm5 are overwritten.
 */
.p2align 6
_blake3_compress_in_place_avx512:
blake3_compress_in_place_avx512:
        /* 72-byte frame; the low 64 bytes hold the saved xmm6-xmm9.
           vmovdqa is the alignment-checking form, so this relies on rsp
           being 16-byte aligned after the adjustment. */
        sub rsp, 72
        vmovdqa xmmword ptr [rsp], xmm6
        vmovdqa xmmword ptr [rsp+0x10], xmm7
        vmovdqa xmmword ptr [rsp+0x20], xmm8
        vmovdqa xmmword ptr [rsp+0x30], xmm9
        /* Rows 0-1: the 32 bytes at [rcx]. */
        vmovdqu xmm0, xmmword ptr [rcx]
        vmovdqu xmm1, xmmword ptr [rcx+0x10]
        /* Row 3 = { r9 (64 bits), zero-extended r8b, stack byte }.
           The stack byte is shifted into the upper dword of r8 so that a
           single 64-bit move carries both small parameters. */
        movzx eax, byte ptr [rsp+0x70]
        movzx r8d, r8b
        shl rax, 32
        add r8, rax
        vmovq xmm3, r9
        vmovq xmm4, r8
        vpunpcklqdq xmm3, xmm3, xmm4
        /* Row 2: the four-dword BLAKE3_IV constant. */
        vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip]
        /* Message bytes 0-31: split into even-indexed dwords (xmm4, selector
           136 = 0x88) and odd-indexed dwords (xmm5, selector 221 = 0xDD). */
        vmovups xmm8, xmmword ptr [rdx]
        vmovups xmm9, xmmword ptr [rdx+0x10]
        vshufps xmm4, xmm8, xmm9, 136
        vshufps xmm5, xmm8, xmm9, 221
        /* Message bytes 32-63: same even/odd split into xmm6/xmm7, then each
           is rotated by one lane (0x93) to line up with the lane-rotated
           state used in the second half of a round. */
        vmovups xmm8, xmmword ptr [rdx+0x20]
        vmovups xmm9, xmmword ptr [rdx+0x30]
        vshufps xmm6, xmm8, xmm9, 136
        vshufps xmm7, xmm8, xmm9, 221
        vpshufd xmm6, xmm6, 0x93
        vpshufd xmm7, xmm7, 0x93
        /* Round counter. This overwrites the low byte of rax, which is no
           longer needed. */
        mov al, 7
9:
        /* First half of the round: add / xor / rotate-right across the four
           rows, consuming xmm4 and then xmm5. Rotations are 16, 12, 8, 7. */
        vpaddd xmm0, xmm0, xmm4
        vpaddd xmm0, xmm0, xmm1
        vpxord xmm3, xmm3, xmm0
        vprord xmm3, xmm3, 16
        vpaddd xmm2, xmm2, xmm3
        vpxord xmm1, xmm1, xmm2
        vprord xmm1, xmm1, 12
        vpaddd xmm0, xmm0, xmm5
        vpaddd xmm0, xmm0, xmm1
        vpxord xmm3, xmm3, xmm0
        vprord xmm3, xmm3, 8
        vpaddd xmm2, xmm2, xmm3
        vpxord xmm1, xmm1, xmm2
        vprord xmm1, xmm1, 7
        /* Rotate the lanes of rows 0, 3 and 2 by one, two and three
           positions (row 1 stays put) so the second half mixes a different
           grouping of words. */
        vpshufd xmm0, xmm0, 0x93
        vpshufd xmm3, xmm3, 0x4E
        vpshufd xmm2, xmm2, 0x39
        /* Second half of the round: same sequence, consuming xmm6 and then
           xmm7. */
        vpaddd xmm0, xmm0, xmm6
        vpaddd xmm0, xmm0, xmm1
        vpxord xmm3, xmm3, xmm0
        vprord xmm3, xmm3, 16
        vpaddd xmm2, xmm2, xmm3
        vpxord xmm1, xmm1, xmm2
        vprord xmm1, xmm1, 12
        vpaddd xmm0, xmm0, xmm7
        vpaddd xmm0, xmm0, xmm1
        vpxord xmm3, xmm3, xmm0
        vprord xmm3, xmm3, 8
        vpaddd xmm2, xmm2, xmm3
        vpxord xmm1, xmm1, xmm2
        vprord xmm1, xmm1, 7
        /* Undo the lane rotation applied before the second half. */
        vpshufd xmm0, xmm0, 0x39
        vpshufd xmm3, xmm3, 0x4E
        vpshufd xmm2, xmm2, 0x93
        /* Leave after the seventh round; otherwise rearrange the message
           words in xmm4-xmm7 for the next round (xmm8/xmm9 are scratch). */
        dec al
        jz 9f
        vshufps xmm8, xmm4, xmm5, 214
        vpshufd xmm9, xmm4, 0x0F
        vpshufd xmm4, xmm8, 0x39
        vshufps xmm8, xmm6, xmm7, 250
        vpblendd xmm9, xmm9, xmm8, 0xAA
        vpunpcklqdq xmm8, xmm7, xmm5
        vpblendd xmm8, xmm8, xmm6, 0x88
        vpshufd xmm8, xmm8, 0x78
        vpunpckhdq xmm5, xmm5, xmm7
        vpunpckldq xmm6, xmm6, xmm5
        vpshufd xmm7, xmm6, 0x1E
        vmovdqa xmm5, xmm9
        vmovdqa xmm6, xmm8
        jmp 9b
9:
        /* Result = rows 0-1 xor rows 2-3, stored back over the input. */
        vpxor xmm0, xmm0, xmm2
        vpxor xmm1, xmm1, xmm3
        vmovdqu xmmword ptr [rcx], xmm0
        vmovdqu xmmword ptr [rcx+0x10], xmm1
        /* Restore the saved vector registers and release the frame. */
        vmovdqa xmm6, xmmword ptr [rsp]
        vmovdqa xmm7, xmmword ptr [rsp+0x10]
        vmovdqa xmm8, xmmword ptr [rsp+0x20]
        vmovdqa xmm9, xmmword ptr [rsp+0x30]
        add rsp, 72
        ret
/*
 * blake3_compress_xof_avx512
 * (the same entry point is also labelled with a leading underscore)
 *
 * Performs the same seven-round computation as the in-place routine, but
 * stores a 64-byte result through a separate output pointer and never
 * stores through rcx.
 *
 * Inputs, as read by the code:
 *   rcx              pointer to 32 bytes; loaded as state rows 0-1 and read
 *                    again at the end for the upper half of the output
 *   rdx              pointer to 64 message bytes (unaligned loads)
 *   r8b              one byte, zero-extended; becomes dword 2 of row 3
 *   r9               64-bit value; becomes dwords 0-1 of row 3
 *   [rsp+0x28]       (entry rsp) one byte; becomes dword 3 of row 3
 *   [rsp+0x30]       (entry rsp) pointer to 64 writable bytes, kept in r10
 *
 * Output layout at [r10]:
 *   bytes  0-31      rows 0-1 xor rows 2-3
 *   bytes 32-63      rows 2-3 xor the 32 bytes at [rcx]
 *
 * NOTE(review): the argument layout looks like the Microsoft x64 calling
 * convention with a trailing output-buffer argument; confirm against the C
 * header this is linked with.
 *
 * xmm6-xmm9 are saved and restored. rax, r8, r10 and xmm0-xmm5 are
 * overwritten.
 */
.p2align 6
_blake3_compress_xof_avx512:
blake3_compress_xof_avx512:
        /* 72-byte frame; the low 64 bytes hold the saved xmm6-xmm9
           (aligned stores, so rsp must be 16-byte aligned here). */
        sub rsp, 72
        vmovdqa xmmword ptr [rsp], xmm6
        vmovdqa xmmword ptr [rsp+0x10], xmm7
        vmovdqa xmmword ptr [rsp+0x20], xmm8
        vmovdqa xmmword ptr [rsp+0x30], xmm9
        /* Rows 0-1: the 32 bytes at [rcx]. */
        vmovdqu xmm0, xmmword ptr [rcx]
        vmovdqu xmm1, xmmword ptr [rcx+0x10]
        /* Fetch the two stack parameters: a byte at [rsp+0x70] and the
           output pointer at [rsp+0x78] (offsets include the 72-byte frame). */
        movzx eax, byte ptr [rsp+0x70]
        movzx r8d, r8b
        mov r10, qword ptr [rsp+0x78]
        /* Row 3 = { r9 (64 bits), zero-extended r8b, stack byte }. */
        shl rax, 32
        add r8, rax
        vmovq xmm3, r9
        vmovq xmm4, r8
        vpunpcklqdq xmm3, xmm3, xmm4
        /* Row 2: the four-dword BLAKE3_IV constant. */
        vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip]
        /* Message bytes 0-31: even-indexed dwords to xmm4, odd-indexed to
           xmm5. */
        vmovups xmm8, xmmword ptr [rdx]
        vmovups xmm9, xmmword ptr [rdx+0x10]
        vshufps xmm4, xmm8, xmm9, 136
        vshufps xmm5, xmm8, xmm9, 221
        /* Message bytes 32-63: even/odd split into xmm6/xmm7, each then
           rotated by one lane (0x93). */
        vmovups xmm8, xmmword ptr [rdx+0x20]
        vmovups xmm9, xmmword ptr [rdx+0x30]
        vshufps xmm6, xmm8, xmm9, 136
        vshufps xmm7, xmm8, xmm9, 221
        vpshufd xmm6, xmm6, 0x93
        vpshufd xmm7, xmm7, 0x93
        /* Round counter in al (rax is free again at this point). */
        mov al, 7
9:
        /* First half of the round: add / xor / rotate-right (16, 12, 8, 7)
           using xmm4 and then xmm5. */
        vpaddd xmm0, xmm0, xmm4
        vpaddd xmm0, xmm0, xmm1
        vpxord xmm3, xmm3, xmm0
        vprord xmm3, xmm3, 16
        vpaddd xmm2, xmm2, xmm3
        vpxord xmm1, xmm1, xmm2
        vprord xmm1, xmm1, 12
        vpaddd xmm0, xmm0, xmm5
        vpaddd xmm0, xmm0, xmm1
        vpxord xmm3, xmm3, xmm0
        vprord xmm3, xmm3, 8
        vpaddd xmm2, xmm2, xmm3
        vpxord xmm1, xmm1, xmm2
        vprord xmm1, xmm1, 7
        /* Lane-rotate rows 0, 3 and 2 before the second half. */
        vpshufd xmm0, xmm0, 0x93
        vpshufd xmm3, xmm3, 0x4E
        vpshufd xmm2, xmm2, 0x39
        /* Second half of the round, using xmm6 and then xmm7. */
        vpaddd xmm0, xmm0, xmm6
        vpaddd xmm0, xmm0, xmm1
        vpxord xmm3, xmm3, xmm0
        vprord xmm3, xmm3, 16
        vpaddd xmm2, xmm2, xmm3
        vpxord xmm1, xmm1, xmm2
        vprord xmm1, xmm1, 12
        vpaddd xmm0, xmm0, xmm7
        vpaddd xmm0, xmm0, xmm1
        vpxord xmm3, xmm3, xmm0
        vprord xmm3, xmm3, 8
        vpaddd xmm2, xmm2, xmm3
        vpxord xmm1, xmm1, xmm2
        vprord xmm1, xmm1, 7
        /* Undo the lane rotation. */
        vpshufd xmm0, xmm0, 0x39
        vpshufd xmm3, xmm3, 0x4E
        vpshufd xmm2, xmm2, 0x93
        /* Exit after the seventh round; otherwise rearrange the message
           words in xmm4-xmm7 for the next round (xmm8/xmm9 are scratch). */
        dec al
        jz 9f
        vshufps xmm8, xmm4, xmm5, 214
        vpshufd xmm9, xmm4, 0x0F
        vpshufd xmm4, xmm8, 0x39
        vshufps xmm8, xmm6, xmm7, 250
        vpblendd xmm9, xmm9, xmm8, 0xAA
        vpunpcklqdq xmm8, xmm7, xmm5
        vpblendd xmm8, xmm8, xmm6, 0x88
        vpshufd xmm8, xmm8, 0x78
        vpunpckhdq xmm5, xmm5, xmm7
        vpunpckldq xmm6, xmm6, xmm5
        vpshufd xmm7, xmm6, 0x1E
        vmovdqa xmm5, xmm9
        vmovdqa xmm6, xmm8
        jmp 9b
9:
        /* Lower output half: rows 0-1 xor rows 2-3. */
        vpxor xmm0, xmm0, xmm2
        vpxor xmm1, xmm1, xmm3
        /* Upper output half: rows 2-3 xor the 32 input bytes at [rcx]. */
        vpxor xmm2, xmm2, xmmword ptr [rcx]
        vpxor xmm3, xmm3, xmmword ptr [rcx+0x10]
        /* Write all 64 bytes through the output pointer. */
        vmovdqu xmmword ptr [r10], xmm0
        vmovdqu xmmword ptr [r10+0x10], xmm1
        vmovdqu xmmword ptr [r10+0x20], xmm2
        vmovdqu xmmword ptr [r10+0x30], xmm3
        /* Restore the saved vector registers and release the frame. */
        vmovdqa xmm6, xmmword ptr [rsp]
        vmovdqa xmm7, xmmword ptr [rsp+0x10]
        vmovdqa xmm8, xmmword ptr [rsp+0x20]
        vmovdqa xmm9, xmmword ptr [rsp+0x30]
        add rsp, 72
        ret
.section .rdata .p2align 6 INDEX0: .long 0, 1, 2, 3, 16, 17, 18, 19 .long 8, 9, 10, 11, 24, 25, 26, 27 INDEX1: .long 4, 5, 6, 7, 20, 21, 22, 23 .long 12, 13, 14, 15, 28, 29, 30, 31 ADD0: .long 0, 1, 2,
3, 4, 5, 6, 7 .long 8, 9, 10, 11, 12, 13, 14, 15 ADD1: .long 1 ADD16: .long 16 BLAKE3_BLOCK_LEN: .long 64 .p2align 6 BLAKE3_IV: BLAKE3_IV_0: .long 0x6A09E667 BLAKE3_IV_1: .long 0xBB67AE85 BLAKE3_IV_2: .long 0x3C6EF372 BLAKE3_IV_3: .long 0xA54FF53A blake3-1.5.4/c/blake3_avx512_x86-64_windows_msvc.asm000064400000000000000000002627251046102023000177430ustar 00000000000000public _blake3_hash_many_avx512 public blake3_hash_many_avx512 public blake3_compress_in_place_avx512 public _blake3_compress_in_place_avx512 public blake3_compress_xof_avx512 public _blake3_compress_xof_avx512 _TEXT SEGMENT ALIGN(16) 'CODE' ALIGN 16 blake3_hash_many_avx512 PROC _blake3_hash_many_avx512 PROC push r15 push r14 push r13 push r12 push rdi push rsi push rbx push rbp mov rbp, rsp sub rsp, 304 and rsp, 0FFFFFFFFFFFFFFC0H vmovdqa xmmword ptr [rsp+90H], xmm6 vmovdqa xmmword ptr [rsp+0A0H], xmm7 vmovdqa xmmword ptr [rsp+0B0H], xmm8 vmovdqa xmmword ptr [rsp+0C0H], xmm9 vmovdqa xmmword ptr [rsp+0D0H], xmm10 vmovdqa xmmword ptr [rsp+0E0H], xmm11 vmovdqa xmmword ptr [rsp+0F0H], xmm12 vmovdqa xmmword ptr [rsp+100H], xmm13 vmovdqa xmmword ptr [rsp+110H], xmm14 vmovdqa xmmword ptr [rsp+120H], xmm15 mov rdi, rcx mov rsi, rdx mov rdx, r8 mov rcx, r9 mov r8, qword ptr [rbp+68H] movzx r9, byte ptr [rbp+70H] neg r9 kmovw k1, r9d vmovd xmm0, r8d vpbroadcastd ymm0, xmm0 shr r8, 32 vmovd xmm1, r8d vpbroadcastd ymm1, xmm1 vmovdqa ymm4, ymm1 vmovdqa ymm5, ymm1 vpaddd ymm2, ymm0, ymmword ptr [ADD0] vpaddd ymm3, ymm0, ymmword ptr [ADD0+32] vpcmpud k2, ymm2, ymm0, 1 vpcmpud k3, ymm3, ymm0, 1 ; XXX: ml64.exe does not currently understand the syntax. We use a workaround. 
vpbroadcastd ymm6, dword ptr [ADD1] vpaddd ymm4 {k2}, ymm4, ymm6 vpaddd ymm5 {k3}, ymm5, ymm6 ; vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1] {1to8} ; vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1] {1to8} knotw k2, k1 vmovdqa32 ymm2 {k2}, ymm0 vmovdqa32 ymm3 {k2}, ymm0 vmovdqa32 ymm4 {k2}, ymm1 vmovdqa32 ymm5 {k2}, ymm1 vmovdqa ymmword ptr [rsp], ymm2 vmovdqa ymmword ptr [rsp+20H], ymm3 vmovdqa ymmword ptr [rsp+40H], ymm4 vmovdqa ymmword ptr [rsp+60H], ymm5 shl rdx, 6 mov qword ptr [rsp+80H], rdx cmp rsi, 16 jc final15blocks outerloop16: vpbroadcastd zmm0, dword ptr [rcx] vpbroadcastd zmm1, dword ptr [rcx+1H*4H] vpbroadcastd zmm2, dword ptr [rcx+2H*4H] vpbroadcastd zmm3, dword ptr [rcx+3H*4H] vpbroadcastd zmm4, dword ptr [rcx+4H*4H] vpbroadcastd zmm5, dword ptr [rcx+5H*4H] vpbroadcastd zmm6, dword ptr [rcx+6H*4H] vpbroadcastd zmm7, dword ptr [rcx+7H*4H] movzx eax, byte ptr [rbp+78H] movzx ebx, byte ptr [rbp+80H] or eax, ebx xor edx, edx ALIGN 16 innerloop16: movzx ebx, byte ptr [rbp+88H] or ebx, eax add rdx, 64 cmp rdx, qword ptr [rsp+80H] cmove eax, ebx mov dword ptr [rsp+88H], eax mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+8H] mov r10, qword ptr [rdi+10H] mov r11, qword ptr [rdi+18H] mov r12, qword ptr [rdi+40H] mov r13, qword ptr [rdi+48H] mov r14, qword ptr [rdi+50H] mov r15, qword ptr [rdi+58H] vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H] vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H] vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H vpunpcklqdq zmm8, zmm16, zmm17 vpunpckhqdq zmm9, zmm16, zmm17 vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H] vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H] vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H vpunpcklqdq zmm10, zmm18, zmm19 vpunpckhqdq zmm11, zmm18, zmm19 mov r8, qword ptr [rdi+20H] mov r9, qword ptr [rdi+28H] mov r10, qword ptr [rdi+30H] mov r11, qword ptr [rdi+38H] mov r12, qword ptr 
[rdi+60H] mov r13, qword ptr [rdi+68H] mov r14, qword ptr [rdi+70H] mov r15, qword ptr [rdi+78H] vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H] vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H] vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H vpunpcklqdq zmm12, zmm16, zmm17 vpunpckhqdq zmm13, zmm16, zmm17 vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H] vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H] vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H vpunpcklqdq zmm14, zmm18, zmm19 vpunpckhqdq zmm15, zmm18, zmm19 vmovdqa32 zmm27, zmmword ptr [INDEX0] vmovdqa32 zmm31, zmmword ptr [INDEX1] vshufps zmm16, zmm8, zmm10, 136 vshufps zmm17, zmm12, zmm14, 136 vmovdqa32 zmm20, zmm16 vpermt2d zmm16, zmm27, zmm17 vpermt2d zmm20, zmm31, zmm17 vshufps zmm17, zmm8, zmm10, 221 vshufps zmm30, zmm12, zmm14, 221 vmovdqa32 zmm21, zmm17 vpermt2d zmm17, zmm27, zmm30 vpermt2d zmm21, zmm31, zmm30 vshufps zmm18, zmm9, zmm11, 136 vshufps zmm8, zmm13, zmm15, 136 vmovdqa32 zmm22, zmm18 vpermt2d zmm18, zmm27, zmm8 vpermt2d zmm22, zmm31, zmm8 vshufps zmm19, zmm9, zmm11, 221 vshufps zmm8, zmm13, zmm15, 221 vmovdqa32 zmm23, zmm19 vpermt2d zmm19, zmm27, zmm8 vpermt2d zmm23, zmm31, zmm8 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+8H] mov r10, qword ptr [rdi+10H] mov r11, qword ptr [rdi+18H] mov r12, qword ptr [rdi+40H] mov r13, qword ptr [rdi+48H] mov r14, qword ptr [rdi+50H] mov r15, qword ptr [rdi+58H] vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H] vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H] vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H vpunpcklqdq zmm8, zmm24, zmm25 vpunpckhqdq zmm9, zmm24, zmm25 vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H] vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H] vinserti64x4 zmm25, zmm25, ymmword ptr 
[r15+rdx-1H*20H], 01H vpunpcklqdq zmm10, zmm24, zmm25 vpunpckhqdq zmm11, zmm24, zmm25 prefetcht0 byte ptr [r8+rdx+80H] prefetcht0 byte ptr [r12+rdx+80H] prefetcht0 byte ptr [r9+rdx+80H] prefetcht0 byte ptr [r13+rdx+80H] prefetcht0 byte ptr [r10+rdx+80H] prefetcht0 byte ptr [r14+rdx+80H] prefetcht0 byte ptr [r11+rdx+80H] prefetcht0 byte ptr [r15+rdx+80H] mov r8, qword ptr [rdi+20H] mov r9, qword ptr [rdi+28H] mov r10, qword ptr [rdi+30H] mov r11, qword ptr [rdi+38H] mov r12, qword ptr [rdi+60H] mov r13, qword ptr [rdi+68H] mov r14, qword ptr [rdi+70H] mov r15, qword ptr [rdi+78H] vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H] vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H] vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H vpunpcklqdq zmm12, zmm24, zmm25 vpunpckhqdq zmm13, zmm24, zmm25 vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H] vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H] vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H vpunpcklqdq zmm14, zmm24, zmm25 vpunpckhqdq zmm15, zmm24, zmm25 prefetcht0 byte ptr [r8+rdx+80H] prefetcht0 byte ptr [r12+rdx+80H] prefetcht0 byte ptr [r9+rdx+80H] prefetcht0 byte ptr [r13+rdx+80H] prefetcht0 byte ptr [r10+rdx+80H] prefetcht0 byte ptr [r14+rdx+80H] prefetcht0 byte ptr [r11+rdx+80H] prefetcht0 byte ptr [r15+rdx+80H] vshufps zmm24, zmm8, zmm10, 136 vshufps zmm30, zmm12, zmm14, 136 vmovdqa32 zmm28, zmm24 vpermt2d zmm24, zmm27, zmm30 vpermt2d zmm28, zmm31, zmm30 vshufps zmm25, zmm8, zmm10, 221 vshufps zmm30, zmm12, zmm14, 221 vmovdqa32 zmm29, zmm25 vpermt2d zmm25, zmm27, zmm30 vpermt2d zmm29, zmm31, zmm30 vshufps zmm26, zmm9, zmm11, 136 vshufps zmm8, zmm13, zmm15, 136 vmovdqa32 zmm30, zmm26 vpermt2d zmm26, zmm27, zmm8 vpermt2d zmm30, zmm31, zmm8 vshufps zmm8, zmm9, zmm11, 221 vshufps zmm10, zmm13, zmm15, 221 vpermi2d zmm27, zmm8, zmm10 vpermi2d zmm31, zmm8, zmm10 vpbroadcastd zmm8, dword ptr 
[BLAKE3_IV_0] vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1] vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2] vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3] vmovdqa32 zmm12, zmmword ptr [rsp] vmovdqa32 zmm13, zmmword ptr [rsp+1H*40H] vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN] vpbroadcastd zmm15, dword ptr [rsp+22H*4H] vpaddd zmm0, zmm0, zmm16 vpaddd zmm1, zmm1, zmm18 vpaddd zmm2, zmm2, zmm20 vpaddd zmm3, zmm3, zmm22 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm17 vpaddd zmm1, zmm1, zmm19 vpaddd zmm2, zmm2, zmm21 vpaddd zmm3, zmm3, zmm23 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm24 vpaddd zmm1, zmm1, zmm26 vpaddd zmm2, zmm2, zmm28 vpaddd zmm3, zmm3, zmm30 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 
16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm25 vpaddd zmm1, zmm1, zmm27 vpaddd zmm2, zmm2, zmm29 vpaddd zmm3, zmm3, zmm31 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm18 vpaddd zmm1, zmm1, zmm19 vpaddd zmm2, zmm2, zmm23 vpaddd zmm3, zmm3, zmm20 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm22 vpaddd zmm1, zmm1, zmm26 vpaddd zmm2, zmm2, zmm16 vpaddd zmm3, zmm3, zmm29 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, 
zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm17 vpaddd zmm1, zmm1, zmm28 vpaddd zmm2, zmm2, zmm25 vpaddd zmm3, zmm3, zmm31 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm27 vpaddd zmm1, zmm1, zmm21 vpaddd zmm2, zmm2, zmm30 vpaddd zmm3, zmm3, zmm24 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm19 vpaddd zmm1, zmm1, zmm26 vpaddd zmm2, zmm2, zmm29 vpaddd zmm3, zmm3, zmm23 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 
vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm20 vpaddd zmm1, zmm1, zmm28 vpaddd zmm2, zmm2, zmm18 vpaddd zmm3, zmm3, zmm30 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm22 vpaddd zmm1, zmm1, zmm25 vpaddd zmm2, zmm2, zmm27 vpaddd zmm3, zmm3, zmm24 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm21 vpaddd zmm1, zmm1, zmm16 vpaddd zmm2, zmm2, zmm31 vpaddd zmm3, zmm3, zmm17 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 
vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm26 vpaddd zmm1, zmm1, zmm28 vpaddd zmm2, zmm2, zmm30 vpaddd zmm3, zmm3, zmm29 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm23 vpaddd zmm1, zmm1, zmm25 vpaddd zmm2, zmm2, zmm19 vpaddd zmm3, zmm3, zmm31 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm20 vpaddd zmm1, zmm1, zmm27 vpaddd zmm2, zmm2, zmm21 vpaddd zmm3, zmm3, zmm17 vpaddd zmm0, zmm0, zmm5 
vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm16 vpaddd zmm1, zmm1, zmm18 vpaddd zmm2, zmm2, zmm24 vpaddd zmm3, zmm3, zmm22 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm28 vpaddd zmm1, zmm1, zmm25 vpaddd zmm2, zmm2, zmm31 vpaddd zmm3, zmm3, zmm30 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm29 vpaddd zmm1, zmm1, zmm27 vpaddd zmm2, zmm2, 
zmm26 vpaddd zmm3, zmm3, zmm24 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm23 vpaddd zmm1, zmm1, zmm21 vpaddd zmm2, zmm2, zmm16 vpaddd zmm3, zmm3, zmm22 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm18 vpaddd zmm1, zmm1, zmm19 vpaddd zmm2, zmm2, zmm17 vpaddd zmm3, zmm3, zmm20 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm25 
vpaddd zmm1, zmm1, zmm27 vpaddd zmm2, zmm2, zmm24 vpaddd zmm3, zmm3, zmm31 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm30 vpaddd zmm1, zmm1, zmm21 vpaddd zmm2, zmm2, zmm28 vpaddd zmm3, zmm3, zmm17 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm29 vpaddd zmm1, zmm1, zmm16 vpaddd zmm2, zmm2, zmm18 vpaddd zmm3, zmm3, zmm20 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 
12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm19 vpaddd zmm1, zmm1, zmm26 vpaddd zmm2, zmm2, zmm22 vpaddd zmm3, zmm3, zmm23 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm27 vpaddd zmm1, zmm1, zmm21 vpaddd zmm2, zmm2, zmm17 vpaddd zmm3, zmm3, zmm24 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm31 vpaddd zmm1, zmm1, zmm16 vpaddd zmm2, zmm2, zmm25 vpaddd zmm3, zmm3, zmm22 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, 
zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm30 vpaddd zmm1, zmm1, zmm18 vpaddd zmm2, zmm2, zmm19 vpaddd zmm3, zmm3, zmm23 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm26 vpaddd zmm1, zmm1, zmm28 vpaddd zmm2, zmm2, zmm20 vpaddd zmm3, zmm3, zmm29 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpxord zmm0, zmm0, zmm8 vpxord zmm1, zmm1, zmm9 vpxord zmm2, zmm2, zmm10 vpxord zmm3, zmm3, zmm11 vpxord zmm4, zmm4, zmm12 vpxord zmm5, zmm5, zmm13 vpxord zmm6, zmm6, zmm14 vpxord zmm7, zmm7, zmm15 movzx eax, byte ptr [rbp+78H] jne innerloop16 mov rbx, qword ptr [rbp+90H] vpunpckldq zmm16, zmm0, zmm1 vpunpckhdq zmm17, zmm0, zmm1 vpunpckldq zmm18, zmm2, zmm3 vpunpckhdq zmm19, zmm2, zmm3 vpunpckldq zmm20, zmm4, zmm5 vpunpckhdq zmm21, zmm4, zmm5 vpunpckldq zmm22, zmm6, zmm7 vpunpckhdq zmm23, zmm6, zmm7 vpunpcklqdq zmm0, zmm16, zmm18 vpunpckhqdq zmm1, zmm16, 
zmm18 vpunpcklqdq zmm2, zmm17, zmm19 vpunpckhqdq zmm3, zmm17, zmm19 vpunpcklqdq zmm4, zmm20, zmm22 vpunpckhqdq zmm5, zmm20, zmm22 vpunpcklqdq zmm6, zmm21, zmm23 vpunpckhqdq zmm7, zmm21, zmm23 vshufi32x4 zmm16, zmm0, zmm4, 88H vshufi32x4 zmm17, zmm1, zmm5, 88H vshufi32x4 zmm18, zmm2, zmm6, 88H vshufi32x4 zmm19, zmm3, zmm7, 88H vshufi32x4 zmm20, zmm0, zmm4, 0DDH vshufi32x4 zmm21, zmm1, zmm5, 0DDH vshufi32x4 zmm22, zmm2, zmm6, 0DDH vshufi32x4 zmm23, zmm3, zmm7, 0DDH vshufi32x4 zmm0, zmm16, zmm17, 88H vshufi32x4 zmm1, zmm18, zmm19, 88H vshufi32x4 zmm2, zmm20, zmm21, 88H vshufi32x4 zmm3, zmm22, zmm23, 88H vshufi32x4 zmm4, zmm16, zmm17, 0DDH vshufi32x4 zmm5, zmm18, zmm19, 0DDH vshufi32x4 zmm6, zmm20, zmm21, 0DDH vshufi32x4 zmm7, zmm22, zmm23, 0DDH vmovdqu32 zmmword ptr [rbx], zmm0 vmovdqu32 zmmword ptr [rbx+1H*40H], zmm1 vmovdqu32 zmmword ptr [rbx+2H*40H], zmm2 vmovdqu32 zmmword ptr [rbx+3H*40H], zmm3 vmovdqu32 zmmword ptr [rbx+4H*40H], zmm4 vmovdqu32 zmmword ptr [rbx+5H*40H], zmm5 vmovdqu32 zmmword ptr [rbx+6H*40H], zmm6 vmovdqu32 zmmword ptr [rbx+7H*40H], zmm7 vmovdqa32 zmm0, zmmword ptr [rsp] vmovdqa32 zmm1, zmmword ptr [rsp+1H*40H] vmovdqa32 zmm2, zmm0 ; XXX: ml64.exe does not currently understand the syntax. We use a workaround. 
vpbroadcastd zmm4, dword ptr [ADD16] vpbroadcastd zmm5, dword ptr [ADD1] vpaddd zmm2{k1}, zmm0, zmm4 ; vpaddd zmm2{k1}, zmm0, dword ptr [ADD16] ; {1to16} vpcmpud k2, zmm2, zmm0, 1 vpaddd zmm1 {k2}, zmm1, zmm5 ; vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1] ; {1to16} vmovdqa32 zmmword ptr [rsp], zmm2 vmovdqa32 zmmword ptr [rsp+1H*40H], zmm1 add rdi, 128 add rbx, 512 mov qword ptr [rbp+90H], rbx sub rsi, 16 cmp rsi, 16 jnc outerloop16 test rsi, rsi jne final15blocks unwind: vzeroupper vmovdqa xmm6, xmmword ptr [rsp+90H] vmovdqa xmm7, xmmword ptr [rsp+0A0H] vmovdqa xmm8, xmmword ptr [rsp+0B0H] vmovdqa xmm9, xmmword ptr [rsp+0C0H] vmovdqa xmm10, xmmword ptr [rsp+0D0H] vmovdqa xmm11, xmmword ptr [rsp+0E0H] vmovdqa xmm12, xmmword ptr [rsp+0F0H] vmovdqa xmm13, xmmword ptr [rsp+100H] vmovdqa xmm14, xmmword ptr [rsp+110H] vmovdqa xmm15, xmmword ptr [rsp+120H] mov rsp, rbp pop rbp pop rbx pop rsi pop rdi pop r12 pop r13 pop r14 pop r15 ret ALIGN 16 final15blocks: test esi, 8H je final7blocks vpbroadcastd ymm0, dword ptr [rcx] vpbroadcastd ymm1, dword ptr [rcx+4H] vpbroadcastd ymm2, dword ptr [rcx+8H] vpbroadcastd ymm3, dword ptr [rcx+0CH] vpbroadcastd ymm4, dword ptr [rcx+10H] vpbroadcastd ymm5, dword ptr [rcx+14H] vpbroadcastd ymm6, dword ptr [rcx+18H] vpbroadcastd ymm7, dword ptr [rcx+1CH] mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+8H] mov r10, qword ptr [rdi+10H] mov r11, qword ptr [rdi+18H] mov r12, qword ptr [rdi+20H] mov r13, qword ptr [rdi+28H] mov r14, qword ptr [rdi+30H] mov r15, qword ptr [rdi+38H] movzx eax, byte ptr [rbp+78H] movzx ebx, byte ptr [rbp+80H] or eax, ebx xor edx, edx innerloop8: movzx ebx, byte ptr [rbp+88H] or ebx, eax add rdx, 64 cmp rdx, qword ptr [rsp+80H] cmove eax, ebx mov dword ptr [rsp+88H], eax vmovups xmm8, xmmword ptr [r8+rdx-40H] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-40H], 01H vmovups xmm9, xmmword ptr [r9+rdx-40H] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-40H], 01H vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups 
xmm10, xmmword ptr [r10+rdx-40H] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-40H], 01H vmovups xmm11, xmmword ptr [r11+rdx-40H] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-40H], 01H vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm16, ymm12, ymm14, 136 vshufps ymm17, ymm12, ymm14, 221 vshufps ymm18, ymm13, ymm15, 136 vshufps ymm19, ymm13, ymm15, 221 vmovups xmm8, xmmword ptr [r8+rdx-30H] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-30H], 01H vmovups xmm9, xmmword ptr [r9+rdx-30H] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-30H], 01H vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-30H] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-30H], 01H vmovups xmm11, xmmword ptr [r11+rdx-30H] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-30H], 01H vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm20, ymm12, ymm14, 136 vshufps ymm21, ymm12, ymm14, 221 vshufps ymm22, ymm13, ymm15, 136 vshufps ymm23, ymm13, ymm15, 221 vmovups xmm8, xmmword ptr [r8+rdx-20H] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-20H], 01H vmovups xmm9, xmmword ptr [r9+rdx-20H] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-20H], 01H vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-20H] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-20H], 01H vmovups xmm11, xmmword ptr [r11+rdx-20H] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-20H], 01H vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm24, ymm12, ymm14, 136 vshufps ymm25, ymm12, ymm14, 221 vshufps ymm26, ymm13, ymm15, 136 vshufps ymm27, ymm13, ymm15, 221 vmovups xmm8, xmmword ptr [r8+rdx-10H] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-10H], 01H vmovups xmm9, xmmword ptr [r9+rdx-10H] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-10H], 01H vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-10H] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-10H], 01H vmovups 
xmm11, xmmword ptr [r11+rdx-10H] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-10H], 01H vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm28, ymm12, ymm14, 136 vshufps ymm29, ymm12, ymm14, 221 vshufps ymm30, ymm13, ymm15, 136 vshufps ymm31, ymm13, ymm15, 221 vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0] vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1] vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2] vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3] vmovdqa ymm12, ymmword ptr [rsp] vmovdqa ymm13, ymmword ptr [rsp+40H] vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN] vpbroadcastd ymm15, dword ptr [rsp+88H] vpaddd ymm0, ymm0, ymm16 vpaddd ymm1, ymm1, ymm18 vpaddd ymm2, ymm2, ymm20 vpaddd ymm3, ymm3, ymm22 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm17 vpaddd ymm1, ymm1, ymm19 vpaddd ymm2, ymm2, ymm21 vpaddd ymm3, ymm3, ymm23 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, 
ymm24 vpaddd ymm1, ymm1, ymm26 vpaddd ymm2, ymm2, ymm28 vpaddd ymm3, ymm3, ymm30 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm25 vpaddd ymm1, ymm1, ymm27 vpaddd ymm2, ymm2, ymm29 vpaddd ymm3, ymm3, ymm31 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm18 vpaddd ymm1, ymm1, ymm19 vpaddd ymm2, ymm2, ymm23 vpaddd ymm3, ymm3, ymm20 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, 
ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm22 vpaddd ymm1, ymm1, ymm26 vpaddd ymm2, ymm2, ymm16 vpaddd ymm3, ymm3, ymm29 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm17 vpaddd ymm1, ymm1, ymm28 vpaddd ymm2, ymm2, ymm25 vpaddd ymm3, ymm3, ymm31 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm27 vpaddd ymm1, ymm1, ymm21 vpaddd ymm2, ymm2, ymm30 vpaddd ymm3, ymm3, ymm24 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord 
ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm19 vpaddd ymm1, ymm1, ymm26 vpaddd ymm2, ymm2, ymm29 vpaddd ymm3, ymm3, ymm23 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm20 vpaddd ymm1, ymm1, ymm28 vpaddd ymm2, ymm2, ymm18 vpaddd ymm3, ymm3, ymm30 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm22 vpaddd ymm1, ymm1, ymm25 vpaddd ymm2, ymm2, ymm27 vpaddd ymm3, ymm3, ymm24 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord 
ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm21 vpaddd ymm1, ymm1, ymm16 vpaddd ymm2, ymm2, ymm31 vpaddd ymm3, ymm3, ymm17 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm26 vpaddd ymm1, ymm1, ymm28 vpaddd ymm2, ymm2, ymm30 vpaddd ymm3, ymm3, ymm29 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm23 vpaddd ymm1, ymm1, ymm25 vpaddd ymm2, ymm2, ymm19 vpaddd ymm3, ymm3, ymm31 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord 
ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm20 vpaddd ymm1, ymm1, ymm27 vpaddd ymm2, ymm2, ymm21 vpaddd ymm3, ymm3, ymm17 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm16 vpaddd ymm1, ymm1, ymm18 vpaddd ymm2, ymm2, ymm24 vpaddd ymm3, ymm3, ymm22 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm28 vpaddd ymm1, ymm1, ymm25 vpaddd ymm2, ymm2, ymm31 vpaddd ymm3, ymm3, ymm30 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, 
ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm29 vpaddd ymm1, ymm1, ymm27 vpaddd ymm2, ymm2, ymm26 vpaddd ymm3, ymm3, ymm24 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm23 vpaddd ymm1, ymm1, ymm21 vpaddd ymm2, ymm2, ymm16 vpaddd ymm3, ymm3, ymm22 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm18 vpaddd ymm1, ymm1, ymm19 vpaddd ymm2, ymm2, ymm17 vpaddd ymm3, ymm3, ymm20 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, 
ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm25 vpaddd ymm1, ymm1, ymm27 vpaddd ymm2, ymm2, ymm24 vpaddd ymm3, ymm3, ymm31 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm30 vpaddd ymm1, ymm1, ymm21 vpaddd ymm2, ymm2, ymm28 vpaddd ymm3, ymm3, ymm17 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm29 vpaddd ymm1, ymm1, ymm16 vpaddd ymm2, ymm2, ymm18 vpaddd ymm3, ymm3, ymm20 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, 
ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm19 vpaddd ymm1, ymm1, ymm26 vpaddd ymm2, ymm2, ymm22 vpaddd ymm3, ymm3, ymm23 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm27 vpaddd ymm1, ymm1, ymm21 vpaddd ymm2, ymm2, ymm17 vpaddd ymm3, ymm3, ymm24 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm31 vpaddd ymm1, ymm1, ymm16 vpaddd ymm2, ymm2, ymm25 vpaddd ymm3, ymm3, ymm22 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord 
ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm30 vpaddd ymm1, ymm1, ymm18 vpaddd ymm2, ymm2, ymm19 vpaddd ymm3, ymm3, ymm23 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm26 vpaddd ymm1, ymm1, ymm28 vpaddd ymm2, ymm2, ymm20 vpaddd ymm3, ymm3, ymm29 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpxor ymm0, ymm0, ymm8 vpxor ymm1, ymm1, ymm9 vpxor ymm2, ymm2, ymm10 vpxor ymm3, ymm3, ymm11 vpxor ymm4, ymm4, ymm12 vpxor ymm5, ymm5, ymm13 vpxor ymm6, ymm6, ymm14 vpxor ymm7, ymm7, ymm15 movzx eax, byte ptr [rbp+78H] jne innerloop8 mov rbx, qword ptr [rbp+90H] 
vunpcklps ymm8, ymm0, ymm1 vunpcklps ymm9, ymm2, ymm3 vunpckhps ymm10, ymm0, ymm1 vunpcklps ymm11, ymm4, ymm5 vunpcklps ymm0, ymm6, ymm7 vshufps ymm12, ymm8, ymm9, 78 vblendps ymm1, ymm8, ymm12, 0CCH vshufps ymm8, ymm11, ymm0, 78 vunpckhps ymm13, ymm2, ymm3 vblendps ymm2, ymm11, ymm8, 0CCH vblendps ymm3, ymm12, ymm9, 0CCH vperm2f128 ymm12, ymm1, ymm2, 20H vmovups ymmword ptr [rbx], ymm12 vunpckhps ymm14, ymm4, ymm5 vblendps ymm4, ymm8, ymm0, 0CCH vunpckhps ymm15, ymm6, ymm7 vperm2f128 ymm7, ymm3, ymm4, 20H vmovups ymmword ptr [rbx+20H], ymm7 vshufps ymm5, ymm10, ymm13, 78 vblendps ymm6, ymm5, ymm13, 0CCH vshufps ymm13, ymm14, ymm15, 78 vblendps ymm10, ymm10, ymm5, 0CCH vblendps ymm14, ymm14, ymm13, 0CCH vperm2f128 ymm8, ymm10, ymm14, 20H vmovups ymmword ptr [rbx+40H], ymm8 vblendps ymm15, ymm13, ymm15, 0CCH vperm2f128 ymm13, ymm6, ymm15, 20H vmovups ymmword ptr [rbx+60H], ymm13 vperm2f128 ymm9, ymm1, ymm2, 31H vperm2f128 ymm11, ymm3, ymm4, 31H vmovups ymmword ptr [rbx+80H], ymm9 vperm2f128 ymm14, ymm10, ymm14, 31H vperm2f128 ymm15, ymm6, ymm15, 31H vmovups ymmword ptr [rbx+0A0H], ymm11 vmovups ymmword ptr [rbx+0C0H], ymm14 vmovups ymmword ptr [rbx+0E0H], ymm15 vmovdqa ymm0, ymmword ptr [rsp] vmovdqa ymm2, ymmword ptr [rsp+40H] vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+1H*20H] vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+3H*20H] vmovdqa ymmword ptr [rsp], ymm0 vmovdqa ymmword ptr [rsp+40H], ymm2 add rbx, 256 mov qword ptr [rbp+90H], rbx add rdi, 64 sub rsi, 8 final7blocks: mov rbx, qword ptr [rbp+90H] mov r15, qword ptr [rsp+80H] movzx r13, byte ptr [rbp+78H] movzx r12, byte ptr [rbp+88H] test esi, 4H je final3blocks vbroadcasti32x4 zmm0, xmmword ptr [rcx] vbroadcasti32x4 zmm1, xmmword ptr [rcx+1H*10H] vmovdqa xmm12, xmmword ptr [rsp] vmovdqa xmm13, xmmword ptr [rsp+40H] vpunpckldq xmm14, xmm12, xmm13 vpunpckhdq xmm15, xmm12, xmm13 vpermq ymm14, ymm14, 0DCH vpermq ymm15, ymm15, 0DCH vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN] vinserti64x4 zmm13, zmm14, ymm15, 01H mov eax, 
17476 kmovw k2, eax vpblendmd zmm13 {k2}, zmm13, zmm12 vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV] mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+8H] mov r10, qword ptr [rdi+10H] mov r11, qword ptr [rdi+18H] mov eax, 43690 kmovw k3, eax mov eax, 34952 kmovw k4, eax movzx eax, byte ptr [rbp+80H] or eax, r13d xor edx, edx ALIGN 16 innerloop4: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d mov dword ptr [rsp+88H], eax vmovdqa32 zmm2, zmm15 vpbroadcastd zmm8, dword ptr [rsp+22H*4H] vpblendmd zmm3 {k4}, zmm13, zmm8 vmovups zmm8, zmmword ptr [r8+rdx-1H*40H] vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-4H*10H], 01H vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-4H*10H], 02H vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-4H*10H], 03H vmovups zmm9, zmmword ptr [r8+rdx-30H] vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-3H*10H], 01H vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-3H*10H], 02H vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-3H*10H], 03H vshufps zmm4, zmm8, zmm9, 136 vshufps zmm5, zmm8, zmm9, 221 vmovups zmm8, zmmword ptr [r8+rdx-20H] vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-2H*10H], 01H vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-2H*10H], 02H vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-2H*10H], 03H vmovups zmm9, zmmword ptr [r8+rdx-10H] vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-1H*10H], 01H vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-1H*10H], 02H vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-1H*10H], 03H vshufps zmm6, zmm8, zmm9, 136 vshufps zmm7, zmm8, zmm9, 221 vpshufd zmm6, zmm6, 93H vpshufd zmm7, zmm7, 93H mov al, 7 roundloop4: vpaddd zmm0, zmm0, zmm4 vpaddd zmm0, zmm0, zmm1 vpxord zmm3, zmm3, zmm0 vprord zmm3, zmm3, 16 vpaddd zmm2, zmm2, zmm3 vpxord zmm1, zmm1, zmm2 vprord zmm1, zmm1, 12 vpaddd zmm0, zmm0, zmm5 vpaddd zmm0, zmm0, zmm1 vpxord zmm3, zmm3, zmm0 vprord zmm3, zmm3, 8 vpaddd zmm2, zmm2, zmm3 vpxord zmm1, zmm1, zmm2 vprord zmm1, zmm1, 7 vpshufd zmm0, zmm0, 93H vpshufd zmm3, zmm3, 4EH vpshufd zmm2, zmm2, 39H vpaddd 
zmm0, zmm0, zmm6 vpaddd zmm0, zmm0, zmm1 vpxord zmm3, zmm3, zmm0 vprord zmm3, zmm3, 16 vpaddd zmm2, zmm2, zmm3 vpxord zmm1, zmm1, zmm2 vprord zmm1, zmm1, 12 vpaddd zmm0, zmm0, zmm7 vpaddd zmm0, zmm0, zmm1 vpxord zmm3, zmm3, zmm0 vprord zmm3, zmm3, 8 vpaddd zmm2, zmm2, zmm3 vpxord zmm1, zmm1, zmm2 vprord zmm1, zmm1, 7 vpshufd zmm0, zmm0, 39H vpshufd zmm3, zmm3, 4EH vpshufd zmm2, zmm2, 93H dec al jz endroundloop4 vshufps zmm8, zmm4, zmm5, 214 vpshufd zmm9, zmm4, 0FH vpshufd zmm4, zmm8, 39H vshufps zmm8, zmm6, zmm7, 250 vpblendmd zmm9 {k3}, zmm9, zmm8 vpunpcklqdq zmm8, zmm7, zmm5 vpblendmd zmm8 {k4}, zmm8, zmm6 vpshufd zmm8, zmm8, 78H vpunpckhdq zmm5, zmm5, zmm7 vpunpckldq zmm6, zmm6, zmm5 vpshufd zmm7, zmm6, 1EH vmovdqa32 zmm5, zmm9 vmovdqa32 zmm6, zmm8 jmp roundloop4 endroundloop4: vpxord zmm0, zmm0, zmm2 vpxord zmm1, zmm1, zmm3 mov eax, r13d cmp rdx, r15 jne innerloop4 vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+10H], xmm1 vextracti128 xmmword ptr [rbx+20H], ymm0, 01H vextracti128 xmmword ptr [rbx+30H], ymm1, 01H vextracti32x4 xmmword ptr [rbx+4H*10H], zmm0, 02H vextracti32x4 xmmword ptr [rbx+5H*10H], zmm1, 02H vextracti32x4 xmmword ptr [rbx+6H*10H], zmm0, 03H vextracti32x4 xmmword ptr [rbx+7H*10H], zmm1, 03H vmovdqa xmm0, xmmword ptr [rsp] vmovdqa xmm2, xmmword ptr [rsp+40H] vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+1H*10H] vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+5H*10H] vmovdqa xmmword ptr [rsp], xmm0 vmovdqa xmmword ptr [rsp+40H], xmm2 add rbx, 128 add rdi, 32 sub rsi, 4 final3blocks: test esi, 2H je final1block vbroadcasti128 ymm0, xmmword ptr [rcx] vbroadcasti128 ymm1, xmmword ptr [rcx+10H] vmovd xmm13, dword ptr [rsp] vpinsrd xmm13, xmm13, dword ptr [rsp+40H], 1 vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2 vmovd xmm14, dword ptr [rsp+4H] vpinsrd xmm14, xmm14, dword ptr [rsp+44H], 1 vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2 vinserti128 ymm13, ymm13, xmm14, 01H mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+8H] movzx eax, byte ptr 
[rbp+80H] or eax, r13d xor edx, edx ALIGN 16 innerloop2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d mov dword ptr [rsp+88H], eax vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV] vpbroadcastd ymm8, dword ptr [rsp+88H] vpblendd ymm3, ymm13, ymm8, 88H vmovups ymm8, ymmword ptr [r8+rdx-40H] vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-40H], 01H vmovups ymm9, ymmword ptr [r8+rdx-30H] vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-30H], 01H vshufps ymm4, ymm8, ymm9, 136 vshufps ymm5, ymm8, ymm9, 221 vmovups ymm8, ymmword ptr [r8+rdx-20H] vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-20H], 01H vmovups ymm9, ymmword ptr [r8+rdx-10H] vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-10H], 01H vshufps ymm6, ymm8, ymm9, 136 vshufps ymm7, ymm8, ymm9, 221 vpshufd ymm6, ymm6, 93H vpshufd ymm7, ymm7, 93H mov al, 7 roundloop2: vpaddd ymm0, ymm0, ymm4 vpaddd ymm0, ymm0, ymm1 vpxord ymm3, ymm3, ymm0 vprord ymm3, ymm3, 16 vpaddd ymm2, ymm2, ymm3 vpxord ymm1, ymm1, ymm2 vprord ymm1, ymm1, 12 vpaddd ymm0, ymm0, ymm5 vpaddd ymm0, ymm0, ymm1 vpxord ymm3, ymm3, ymm0 vprord ymm3, ymm3, 8 vpaddd ymm2, ymm2, ymm3 vpxord ymm1, ymm1, ymm2 vprord ymm1, ymm1, 7 vpshufd ymm0, ymm0, 93H vpshufd ymm3, ymm3, 4EH vpshufd ymm2, ymm2, 39H vpaddd ymm0, ymm0, ymm6 vpaddd ymm0, ymm0, ymm1 vpxord ymm3, ymm3, ymm0 vprord ymm3, ymm3, 16 vpaddd ymm2, ymm2, ymm3 vpxord ymm1, ymm1, ymm2 vprord ymm1, ymm1, 12 vpaddd ymm0, ymm0, ymm7 vpaddd ymm0, ymm0, ymm1 vpxord ymm3, ymm3, ymm0 vprord ymm3, ymm3, 8 vpaddd ymm2, ymm2, ymm3 vpxord ymm1, ymm1, ymm2 vprord ymm1, ymm1, 7 vpshufd ymm0, ymm0, 39H vpshufd ymm3, ymm3, 4EH vpshufd ymm2, ymm2, 93H dec al jz endroundloop2 vshufps ymm8, ymm4, ymm5, 214 vpshufd ymm9, ymm4, 0FH vpshufd ymm4, ymm8, 39H vshufps ymm8, ymm6, ymm7, 250 vpblendd ymm9, ymm9, ymm8, 0AAH vpunpcklqdq ymm8, ymm7, ymm5 vpblendd ymm8, ymm8, ymm6, 88H vpshufd ymm8, ymm8, 78H vpunpckhdq ymm5, ymm5, ymm7 vpunpckldq ymm6, ymm6, ymm5 vpshufd ymm7, ymm6, 1EH vmovdqa ymm5, ymm9 vmovdqa ymm6, ymm8 jmp 
roundloop2 endroundloop2: vpxor ymm0, ymm0, ymm2 vpxor ymm1, ymm1, ymm3 mov eax, r13d cmp rdx, r15 jne innerloop2 vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+10H], xmm1 vextracti128 xmmword ptr [rbx+20H], ymm0, 01H vextracti128 xmmword ptr [rbx+30H], ymm1, 01H vmovdqa xmm0, xmmword ptr [rsp] vmovdqa xmm2, xmmword ptr [rsp+40H] vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+8H] vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+48H] vmovdqa xmmword ptr [rsp], xmm0 vmovdqa xmmword ptr [rsp+40H], xmm2 add rbx, 64 add rdi, 16 sub rsi, 2 final1block: test esi, 1H je unwind vmovdqu xmm0, xmmword ptr [rcx] vmovdqu xmm1, xmmword ptr [rcx+10H] vmovd xmm14, dword ptr [rsp] vpinsrd xmm14, xmm14, dword ptr [rsp+40H], 1 vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2 vmovdqa xmm15, xmmword ptr [BLAKE3_IV] mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+80H] or eax, r13d xor edx, edx ALIGN 16 innerloop1: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d vpinsrd xmm3, xmm14, eax, 3 vmovdqa xmm2, xmm15 vmovups xmm8, xmmword ptr [r8+rdx-40H] vmovups xmm9, xmmword ptr [r8+rdx-30H] vshufps xmm4, xmm8, xmm9, 136 vshufps xmm5, xmm8, xmm9, 221 vmovups xmm8, xmmword ptr [r8+rdx-20H] vmovups xmm9, xmmword ptr [r8+rdx-10H] vshufps xmm6, xmm8, xmm9, 136 vshufps xmm7, xmm8, xmm9, 221 vpshufd xmm6, xmm6, 93H vpshufd xmm7, xmm7, 93H mov al, 7 roundloop1: vpaddd xmm0, xmm0, xmm4 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 16 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 12 vpaddd xmm0, xmm0, xmm5 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 8 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 7 vpshufd xmm0, xmm0, 93H vpshufd xmm3, xmm3, 4EH vpshufd xmm2, xmm2, 39H vpaddd xmm0, xmm0, xmm6 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 16 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 12 vpaddd xmm0, xmm0, xmm7 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 
; ---------------------------------------------------------------------------
; Tail of blake3_hash_many_avx512 (the procedure begins earlier in the file):
; remainder of the single-input round loop ("roundloop1") and its epilogue.
; ---------------------------------------------------------------------------
        vprord  xmm3, xmm3, 8
        vpaddd  xmm2, xmm2, xmm3
        vpxord  xmm1, xmm1, xmm2
        vprord  xmm1, xmm1, 7
        vpshufd xmm0, xmm0, 39H
        vpshufd xmm3, xmm3, 4EH
        vpshufd xmm2, xmm2, 93H
        dec     al
        jz      endroundloop1
        vshufps xmm8, xmm4, xmm5, 214
        vpshufd xmm9, xmm4, 0FH
        vpshufd xmm4, xmm8, 39H
        vshufps xmm8, xmm6, xmm7, 250
        vpblendd xmm9, xmm9, xmm8, 0AAH
        vpunpcklqdq xmm8, xmm7, xmm5
        vpblendd xmm8, xmm8, xmm6, 88H
        vpshufd xmm8, xmm8, 78H
        vpunpckhdq xmm5, xmm5, xmm7
        vpunpckldq xmm6, xmm6, xmm5
        vpshufd xmm7, xmm6, 1EH
        vmovdqa xmm5, xmm9
        vmovdqa xmm6, xmm8
        jmp     roundloop1
endroundloop1:
        vpxor   xmm0, xmm0, xmm2
        vpxor   xmm1, xmm1, xmm3
        mov     eax, r13d
        cmp     rdx, r15
        jne     innerloop1
        vmovdqu xmmword ptr [rbx], xmm0
        vmovdqu xmmword ptr [rbx+10H], xmm1
        jmp     unwind
_blake3_hash_many_avx512 ENDP
blake3_hash_many_avx512 ENDP

; ---------------------------------------------------------------------------
; blake3_compress_in_place_avx512
;
; Compresses one 64-byte block and overwrites the 32-byte chaining value.
; Register use as read by the code below (presumably the Windows x64 calling
; convention applied to the C prototype
;   (cv, block, block_len, counter, flags) -- confirm against the header):
;   rcx        -> chaining value, 32 bytes, read and written
;   rdx        -> message block, 64 bytes, read with unaligned loads
;   r8b        =  block_len
;   r9         =  64-bit counter
;   [rsp+70H]  =  flags byte (stack argument, offset taken after "sub rsp, 72")
; xmm6-xmm9 are saved on entry and restored before returning.
; ---------------------------------------------------------------------------
ALIGN 16
blake3_compress_in_place_avx512 PROC
_blake3_compress_in_place_avx512 PROC
        ; Reserve 72 bytes of stack and spill xmm6-xmm9 into it.
        sub     rsp, 72
        vmovdqa xmmword ptr [rsp], xmm6
        vmovdqa xmmword ptr [rsp+10H], xmm7
        vmovdqa xmmword ptr [rsp+20H], xmm8
        vmovdqa xmmword ptr [rsp+30H], xmm9
        ; State rows 0 and 1 = the eight chaining-value words.
        vmovdqu xmm0, xmmword ptr [rcx]
        vmovdqu xmm1, xmmword ptr [rcx+10H]
        ; Build row 3 = (counter low, counter high, block_len, flags):
        ; r8 = block_len | (flags << 32), then packed next to the counter.
        movzx   eax, byte ptr [rsp+70H]
        movzx   r8d, r8b
        shl     rax, 32
        add     r8, rax
        vmovq   xmm3, r9
        vmovq   xmm4, r8
        vpunpcklqdq xmm3, xmm3, xmm4
        ; Row 2 = the four constants stored at BLAKE3_IV.
        vmovaps xmm2, xmmword ptr [BLAKE3_IV]
        ; Load the 16 message words and split them by parity:
        ; shuffle immediate 136 (88H) keeps dwords 0 and 2 of each source,
        ; 221 (0DDH) keeps dwords 1 and 3.
        vmovups xmm8, xmmword ptr [rdx]
        vmovups xmm9, xmmword ptr [rdx+10H]
        vshufps xmm4, xmm8, xmm9, 136
        vshufps xmm5, xmm8, xmm9, 221
        vmovups xmm8, xmmword ptr [rdx+20H]
        vmovups xmm9, xmmword ptr [rdx+30H]
        vshufps xmm6, xmm8, xmm9, 136
        vshufps xmm7, xmm8, xmm9, 221
        ; Pre-rotate the second half of the message for the diagonal step.
        vpshufd xmm6, xmm6, 93H
        vpshufd xmm7, xmm7, 93H
        ; al counts the rounds: the loop body below runs 7 times.
        mov     al, 7
@@:
        ; Column step: add message + row 1, xor/rotate by 16, 12, 8, 7.
        vpaddd  xmm0, xmm0, xmm4
        vpaddd  xmm0, xmm0, xmm1
        vpxord  xmm3, xmm3, xmm0
        vprord  xmm3, xmm3, 16
        vpaddd  xmm2, xmm2, xmm3
        vpxord  xmm1, xmm1, xmm2
        vprord  xmm1, xmm1, 12
        vpaddd  xmm0, xmm0, xmm5
        vpaddd  xmm0, xmm0, xmm1
        vpxord  xmm3, xmm3, xmm0
        vprord  xmm3, xmm3, 8
        vpaddd  xmm2, xmm2, xmm3
        vpxord  xmm1, xmm1, xmm2
        vprord  xmm1, xmm1, 7
        ; Rotate rows 0, 2 and 3 so the same lane-wise code handles diagonals.
        vpshufd xmm0, xmm0, 93H
        vpshufd xmm3, xmm3, 4EH
        vpshufd xmm2, xmm2, 39H
        ; Diagonal step, same add/xor/rotate sequence with xmm6 / xmm7.
        vpaddd  xmm0, xmm0, xmm6
        vpaddd  xmm0, xmm0, xmm1
        vpxord  xmm3, xmm3, xmm0
        vprord  xmm3, xmm3, 16
        vpaddd  xmm2, xmm2, xmm3
        vpxord  xmm1, xmm1, xmm2
        vprord  xmm1, xmm1, 12
        vpaddd  xmm0, xmm0, xmm7
        vpaddd  xmm0, xmm0, xmm1
        vpxord  xmm3, xmm3, xmm0
        vprord  xmm3, xmm3, 8
        vpaddd  xmm2, xmm2, xmm3
        vpxord  xmm1, xmm1, xmm2
        vprord  xmm1, xmm1, 7
        ; Undo the row rotation.
        vpshufd xmm0, xmm0, 39H
        vpshufd xmm3, xmm3, 4EH
        vpshufd xmm2, xmm2, 93H
        dec     al
        jz      @F
        ; Not the last round: permute the message registers xmm4-xmm7 in
        ; place for the next round (xmm8 / xmm9 are scratch).
        vshufps xmm8, xmm4, xmm5, 214
        vpshufd xmm9, xmm4, 0FH
        vpshufd xmm4, xmm8, 39H
        vshufps xmm8, xmm6, xmm7, 250
        vpblendd xmm9, xmm9, xmm8, 0AAH
        vpunpcklqdq xmm8, xmm7, xmm5
        vpblendd xmm8, xmm8, xmm6, 88H
        vpshufd xmm8, xmm8, 78H
        vpunpckhdq xmm5, xmm5, xmm7
        vpunpckldq xmm6, xmm6, xmm5
        vpshufd xmm7, xmm6, 1EH
        vmovdqa xmm5, xmm9
        vmovdqa xmm6, xmm8
        jmp     @B
@@:
        ; New chaining value = rows 0,1 xor rows 2,3; store it over the input.
        vpxor   xmm0, xmm0, xmm2
        vpxor   xmm1, xmm1, xmm3
        vmovdqu xmmword ptr [rcx], xmm0
        vmovdqu xmmword ptr [rcx+10H], xmm1
        ; Restore the spilled registers and release the stack area.
        vmovdqa xmm6, xmmword ptr [rsp]
        vmovdqa xmm7, xmmword ptr [rsp+10H]
        vmovdqa xmm8, xmmword ptr [rsp+20H]
        vmovdqa xmm9, xmmword ptr [rsp+30H]
        add     rsp, 72
        ret
_blake3_compress_in_place_avx512 ENDP
blake3_compress_in_place_avx512 ENDP

; ---------------------------------------------------------------------------
; blake3_compress_xof_avx512
;
; Same compression as blake3_compress_in_place_avx512, but the chaining value
; at [rcx] is only read, and a full 64-byte result is written through the
; pointer loaded from [rsp+78H] (presumably the sixth C argument, "out" --
; confirm against the header).
; ---------------------------------------------------------------------------
ALIGN 16
blake3_compress_xof_avx512 PROC
_blake3_compress_xof_avx512 PROC
        ; Reserve 72 bytes of stack and spill xmm6-xmm9 into it.
        sub     rsp, 72
        vmovdqa xmmword ptr [rsp], xmm6
        vmovdqa xmmword ptr [rsp+10H], xmm7
        vmovdqa xmmword ptr [rsp+20H], xmm8
        vmovdqa xmmword ptr [rsp+30H], xmm9
        ; State rows 0 and 1 = the eight chaining-value words.
        vmovdqu xmm0, xmmword ptr [rcx]
        vmovdqu xmm1, xmmword ptr [rcx+10H]
        ; flags byte and output pointer come from the stack.
        movzx   eax, byte ptr [rsp+70H]
        movzx   r8d, r8b
        mov     r10, qword ptr [rsp+78H]
        ; Row 3 = (counter low, counter high, block_len, flags).
        shl     rax, 32
        add     r8, rax
        vmovq   xmm3, r9
        vmovq   xmm4, r8
        vpunpcklqdq xmm3, xmm3, xmm4
        ; Row 2 = the four constants stored at BLAKE3_IV.
        vmovaps xmm2, xmmword ptr [BLAKE3_IV]
        ; Load the message and split it by dword parity (see immediates).
        vmovups xmm8, xmmword ptr [rdx]
        vmovups xmm9, xmmword ptr [rdx+10H]
        vshufps xmm4, xmm8, xmm9, 136
        vshufps xmm5, xmm8, xmm9, 221
        vmovups xmm8, xmmword ptr [rdx+20H]
        vmovups xmm9, xmmword ptr [rdx+30H]
        vshufps xmm6, xmm8, xmm9, 136
        vshufps xmm7, xmm8, xmm9, 221
        vpshufd xmm6, xmm6, 93H
        vpshufd xmm7, xmm7, 93H
        ; al counts the rounds: the loop body below runs 7 times.
        mov     al, 7
@@:
        ; Column step.
        vpaddd  xmm0, xmm0, xmm4
        vpaddd  xmm0, xmm0, xmm1
        vpxord  xmm3, xmm3, xmm0
        vprord  xmm3, xmm3, 16
        vpaddd  xmm2, xmm2, xmm3
        vpxord  xmm1, xmm1, xmm2
        vprord  xmm1, xmm1, 12
        vpaddd  xmm0, xmm0, xmm5
        vpaddd  xmm0, xmm0, xmm1
        vpxord  xmm3, xmm3, xmm0
        vprord  xmm3, xmm3, 8
        vpaddd  xmm2, xmm2, xmm3
        vpxord  xmm1, xmm1, xmm2
        vprord  xmm1, xmm1, 7
        ; Rotate rows 0, 2 and 3 so the same lane-wise code handles diagonals.
        vpshufd xmm0, xmm0, 93H
        vpshufd xmm3, xmm3, 4EH
        vpshufd xmm2, xmm2, 39H
        ; Diagonal step.
        vpaddd  xmm0, xmm0, xmm6
        vpaddd  xmm0, xmm0, xmm1
        vpxord  xmm3, xmm3, xmm0
        vprord  xmm3, xmm3, 16
        vpaddd  xmm2, xmm2, xmm3
        vpxord  xmm1, xmm1, xmm2
        vprord  xmm1, xmm1, 12
        vpaddd  xmm0, xmm0, xmm7
        vpaddd  xmm0, xmm0, xmm1
        vpxord  xmm3, xmm3, xmm0
        vprord  xmm3, xmm3, 8
        vpaddd  xmm2, xmm2, xmm3
        vpxord  xmm1, xmm1, xmm2
        vprord  xmm1, xmm1, 7
        ; Undo the row rotation.
        vpshufd xmm0, xmm0, 39H
        vpshufd xmm3, xmm3, 4EH
        vpshufd xmm2, xmm2, 93H
        dec     al
        jz      @F
        ; Permute the message registers for the next round.
        vshufps xmm8, xmm4, xmm5, 214
        vpshufd xmm9, xmm4, 0FH
        vpshufd xmm4, xmm8, 39H
        vshufps xmm8, xmm6, xmm7, 250
        vpblendd xmm9, xmm9, xmm8, 0AAH
        vpunpcklqdq xmm8, xmm7, xmm5
        vpblendd xmm8, xmm8, xmm6, 88H
        vpshufd xmm8, xmm8, 78H
        vpunpckhdq xmm5, xmm5, xmm7
        vpunpckldq xmm6, xmm6, xmm5
        vpshufd xmm7, xmm6, 1EH
        vmovdqa xmm5, xmm9
        vmovdqa xmm6, xmm8
        jmp     @B
@@:
        ; First 32 output bytes: rows 0,1 xor rows 2,3.
        vpxor   xmm0, xmm0, xmm2
        vpxor   xmm1, xmm1, xmm3
        ; Last 32 output bytes: rows 2,3 xor the input chaining value.
        vpxor   xmm2, xmm2, xmmword ptr [rcx]
        vpxor   xmm3, xmm3, xmmword ptr [rcx+10H]
        vmovdqu xmmword ptr [r10], xmm0
        vmovdqu xmmword ptr [r10+10H], xmm1
        vmovdqu xmmword ptr [r10+20H], xmm2
        vmovdqu xmmword ptr [r10+30H], xmm3
        ; Restore the spilled registers and release the stack area.
        vmovdqa xmm6, xmmword ptr [rsp]
        vmovdqa xmm7, xmmword ptr [rsp+10H]
        vmovdqa xmm8, xmmword ptr [rsp+20H]
        vmovdqa xmm9, xmmword ptr [rsp+30H]
        add     rsp, 72
        ret
_blake3_compress_xof_avx512 ENDP
blake3_compress_xof_avx512 ENDP

_TEXT ENDS

; Read-only constants. BLAKE3_IV and BLAKE3_BLOCK_LEN are referenced by the
; code above; INDEX0/INDEX1/ADD0/ADD1/ADD16 are not referenced in this part of
; the file (presumably used by the wider hash_many paths -- confirm there).
_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST'
ALIGN 64
INDEX0:
        dd 0, 1, 2, 3, 16, 17, 18, 19
        dd 8, 9, 10, 11, 24, 25, 26, 27
INDEX1:
        dd 4, 5, 6, 7, 20, 21, 22, 23
        dd 12, 13, 14, 15, 28, 29, 30, 31
ADD0:
        dd 0, 1, 2, 3, 4, 5, 6, 7
        dd 8, 9, 10, 11, 12, 13, 14, 15
ADD1:
        dd 1
ADD16:
        dd 16
BLAKE3_BLOCK_LEN:
        dd 64
ALIGN 64
BLAKE3_IV:
BLAKE3_IV_0:
        dd 06A09E667H
BLAKE3_IV_1:
        dd 0BB67AE85H
BLAKE3_IV_2:
        dd 03C6EF372H
BLAKE3_IV_3:
        dd 0A54FF53AH
_RDATA ENDS END blake3-1.5.4/c/blake3_dispatch.c000064400000000000000000000216631046102023000144120ustar 00000000000000#include #include #include #include "blake3_impl.h" #if defined(_MSC_VER) #include #endif #if defined(IS_X86) #if defined(_MSC_VER) #include #elif defined(__GNUC__) #include #else #undef IS_X86 /* Unimplemented! */ #endif #endif #if !defined(BLAKE3_ATOMICS) #if defined(__has_include) #if __has_include() && !defined(_MSC_VER) #define BLAKE3_ATOMICS 1 #else #define BLAKE3_ATOMICS 0 #endif /* __has_include() && !defined(_MSC_VER) */ #else #define BLAKE3_ATOMICS 0 #endif /* defined(__has_include) */ #endif /* BLAKE3_ATOMICS */ #if BLAKE3_ATOMICS #define ATOMIC_INT _Atomic int #define ATOMIC_LOAD(x) x #define ATOMIC_STORE(x, y) x = y #elif defined(_MSC_VER) #define ATOMIC_INT LONG #define ATOMIC_LOAD(x) InterlockedOr(&x, 0) #define ATOMIC_STORE(x, y) InterlockedExchange(&x, y) #else #define ATOMIC_INT int #define ATOMIC_LOAD(x) x #define ATOMIC_STORE(x, y) x = y #endif #define MAYBE_UNUSED(x) (void)((x)) #if defined(IS_X86) static uint64_t xgetbv(void) { #if defined(_MSC_VER) return _xgetbv(0); #else uint32_t eax = 0, edx = 0; __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0)); return ((uint64_t)edx << 32) | eax; #endif } static void cpuid(uint32_t out[4], uint32_t id) { #if defined(_MSC_VER) __cpuid((int *)out, id); #elif defined(__i386__) || defined(_M_IX86) __asm__ __volatile__("movl %%ebx, %1\n" "cpuid\n" "xchgl %1, %%ebx\n" : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(id)); #else __asm__ __volatile__("cpuid\n" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(id)); #endif } static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) { #if defined(_MSC_VER) __cpuidex((int *)out, id, sid); #elif defined(__i386__) || defined(_M_IX86) __asm__ __volatile__("movl %%ebx, %1\n" "cpuid\n" "xchgl %1, %%ebx\n" : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(id), "c"(sid)); #else __asm__ 
__volatile__("cpuid\n" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(id), "c"(sid)); #endif } #endif enum cpu_feature { SSE2 = 1 << 0, SSSE3 = 1 << 1, SSE41 = 1 << 2, AVX = 1 << 3, AVX2 = 1 << 4, AVX512F = 1 << 5, AVX512VL = 1 << 6, /* ... */ UNDEFINED = 1 << 30 }; #if !defined(BLAKE3_TESTING) static /* Allow the variable to be controlled manually for testing */ #endif ATOMIC_INT g_cpu_features = UNDEFINED; #if !defined(BLAKE3_TESTING) static #endif enum cpu_feature get_cpu_features(void) { /* If TSAN detects a data race here, try compiling with -DBLAKE3_ATOMICS=1 */ enum cpu_feature features = ATOMIC_LOAD(g_cpu_features); if (features != UNDEFINED) { return features; } else { #if defined(IS_X86) uint32_t regs[4] = {0}; uint32_t *eax = ®s[0], *ebx = ®s[1], *ecx = ®s[2], *edx = ®s[3]; (void)edx; features = 0; cpuid(regs, 0); const int max_id = *eax; cpuid(regs, 1); #if defined(__amd64__) || defined(_M_X64) features |= SSE2; #else if (*edx & (1UL << 26)) features |= SSE2; #endif if (*ecx & (1UL << 9)) features |= SSSE3; if (*ecx & (1UL << 19)) features |= SSE41; if (*ecx & (1UL << 27)) { // OSXSAVE const uint64_t mask = xgetbv(); if ((mask & 6) == 6) { // SSE and AVX states if (*ecx & (1UL << 28)) features |= AVX; if (max_id >= 7) { cpuidex(regs, 7, 0); if (*ebx & (1UL << 5)) features |= AVX2; if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm if (*ebx & (1UL << 31)) features |= AVX512VL; if (*ebx & (1UL << 16)) features |= AVX512F; } } } } ATOMIC_STORE(g_cpu_features, features); return features; #else /* How to detect NEON? 
*/ return 0; #endif } } void blake3_compress_in_place(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags) { #if defined(IS_X86) const enum cpu_feature features = get_cpu_features(); MAYBE_UNUSED(features); #if !defined(BLAKE3_NO_AVX512) if (features & AVX512VL) { blake3_compress_in_place_avx512(cv, block, block_len, counter, flags); return; } #endif #if !defined(BLAKE3_NO_SSE41) if (features & SSE41) { blake3_compress_in_place_sse41(cv, block, block_len, counter, flags); return; } #endif #if !defined(BLAKE3_NO_SSE2) if (features & SSE2) { blake3_compress_in_place_sse2(cv, block, block_len, counter, flags); return; } #endif #endif blake3_compress_in_place_portable(cv, block, block_len, counter, flags); } void blake3_compress_xof(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]) { #if defined(IS_X86) const enum cpu_feature features = get_cpu_features(); MAYBE_UNUSED(features); #if !defined(BLAKE3_NO_AVX512) if (features & AVX512VL) { blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out); return; } #endif #if !defined(BLAKE3_NO_SSE41) if (features & SSE41) { blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out); return; } #endif #if !defined(BLAKE3_NO_SSE2) if (features & SSE2) { blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out); return; } #endif #endif blake3_compress_xof_portable(cv, block, block_len, counter, flags, out); } void blake3_xof_many(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64], size_t outblocks) { if (outblocks == 0) { // The current assembly implementation always outputs at least 1 block. 
return; } #if defined(IS_X86) const enum cpu_feature features = get_cpu_features(); #if !defined(_WIN32) && !defined(BLAKE3_NO_AVX512) if (features & AVX512VL) { blake3_xof_many_avx512(cv, block, block_len, counter, flags, out, outblocks); return; } #endif #endif for(size_t i = 0; i < outblocks; ++i) { blake3_compress_xof(cv, block, block_len, counter + i, flags, out + 64*i); } } void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { #if defined(IS_X86) const enum cpu_feature features = get_cpu_features(); MAYBE_UNUSED(features); #if !defined(BLAKE3_NO_AVX512) if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); return; } #endif #if !defined(BLAKE3_NO_AVX2) if (features & AVX2) { blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); return; } #endif #if !defined(BLAKE3_NO_SSE41) if (features & SSE41) { blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); return; } #endif #if !defined(BLAKE3_NO_SSE2) if (features & SSE2) { blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); return; } #endif #endif #if BLAKE3_USE_NEON == 1 blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); return; #endif blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); } // The dynamically detected SIMD degree of the current platform. 
size_t blake3_simd_degree(void) { #if defined(IS_X86) const enum cpu_feature features = get_cpu_features(); MAYBE_UNUSED(features); #if !defined(BLAKE3_NO_AVX512) if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { return 16; } #endif #if !defined(BLAKE3_NO_AVX2) if (features & AVX2) { return 8; } #endif #if !defined(BLAKE3_NO_SSE41) if (features & SSE41) { return 4; } #endif #if !defined(BLAKE3_NO_SSE2) if (features & SSE2) { return 4; } #endif #endif #if BLAKE3_USE_NEON == 1 return 4; #endif return 1; } blake3-1.5.4/c/blake3_impl.h000064400000000000000000000255301046102023000135560ustar 00000000000000#ifndef BLAKE3_IMPL_H #define BLAKE3_IMPL_H #include #include #include #include #include #include "blake3.h" // internal flags enum blake3_flags { CHUNK_START = 1 << 0, CHUNK_END = 1 << 1, PARENT = 1 << 2, ROOT = 1 << 3, KEYED_HASH = 1 << 4, DERIVE_KEY_CONTEXT = 1 << 5, DERIVE_KEY_MATERIAL = 1 << 6, }; // This C implementation tries to support recent versions of GCC, Clang, and // MSVC. 
// INLINE: force-inlined, internal-linkage helper for the current compiler.
#if defined(_MSC_VER)
#define INLINE static __forceinline
#else
#define INLINE static inline __attribute__((always_inline))
#endif

// Architecture detection. ARM64EC builds define _M_X64 but are treated as
// AArch64 here, not x86-64.
#if (defined(__x86_64__) || defined(_M_X64)) && !defined(_M_ARM64EC)
#define IS_X86
#define IS_X86_64
#endif

#if defined(__i386__) || defined(_M_IX86)
#define IS_X86
#define IS_X86_32
#endif

#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
#define IS_AARCH64
#endif

#if defined(IS_X86)
#if defined(_MSC_VER)
#include <intrin.h>
#endif
#endif

#if !defined(BLAKE3_USE_NEON)
  // If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness
  #if defined(IS_AARCH64)
    #if defined(__ARM_BIG_ENDIAN)
      #define BLAKE3_USE_NEON 0
    #else
      #define BLAKE3_USE_NEON 1
    #endif
  #else
    #define BLAKE3_USE_NEON 0
  #endif
#endif

// Upper bound on the value blake3_simd_degree() can report for this build.
#if defined(IS_X86)
#define MAX_SIMD_DEGREE 16
#elif BLAKE3_USE_NEON == 1
#define MAX_SIMD_DEGREE 4
#else
#define MAX_SIMD_DEGREE 1
#endif

// There are some places where we want a static size that's equal to the
// MAX_SIMD_DEGREE, but also at least 2.
#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)

// The eight 32-bit initialization constants.
static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL,
                               0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL,
                               0x1F83D9ABUL, 0x5BE0CD19UL};

// MSG_SCHEDULE[r][i] is the index of the message word used at position i of
// round r (seven rounds, sixteen words each).
static const uint8_t MSG_SCHEDULE[7][16] = {
    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
    {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8},
    {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1},
    {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6},
    {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4},
    {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7},
    {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13},
};

/* Find index of the highest set bit */
/* x is assumed to be nonzero.       */
static unsigned int highest_one(uint64_t x) {
#if defined(__GNUC__) || defined(__clang__)
  // clz counts leading zeros; for a 64-bit value 63 ^ clz == 63 - clz.
  return 63 ^ (unsigned int)__builtin_clzll(x);
#elif defined(_MSC_VER) && defined(IS_X86_64)
  unsigned long index;
  _BitScanReverse64(&index, x);
  return index;
#elif defined(_MSC_VER) && defined(IS_X86_32)
  // No 64-bit scan on 32-bit MSVC: scan the high half first, then the low.
  if(x >> 32) {
    unsigned long index;
    _BitScanReverse(&index, (unsigned long)(x >> 32));
    return 32 + index;
  } else {
    unsigned long index;
    _BitScanReverse(&index, (unsigned long)x);
    return index;
  }
#else
  // Portable fallback: binary search over halves, accumulating the shift.
  unsigned int c = 0;
  if(x & 0xffffffff00000000ULL) { x >>= 32; c += 32; }
  if(x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; }
  if(x & 0x000000000000ff00ULL) { x >>=  8; c +=  8; }
  if(x & 0x00000000000000f0ULL) { x >>=  4; c +=  4; }
  if(x & 0x000000000000000cULL) { x >>=  2; c +=  2; }
  if(x & 0x0000000000000002ULL) {           c +=  1; }
  return c;
#endif
}

// Count the number of 1 bits.
INLINE unsigned int popcnt(uint64_t x) {
#if defined(__GNUC__) || defined(__clang__)
  return (unsigned int)__builtin_popcountll(x);
#else
  // Each iteration clears the lowest set bit, so the loop runs once per 1 bit.
  unsigned int count = 0;
  while (x != 0) {
    count += 1;
    x &= x - 1;
  }
  return count;
#endif
}

// Largest power of two less than or equal to x. As a special case, returns 1
// when x is 0.
INLINE uint64_t round_down_to_power_of_2(uint64_t x) { return 1ULL << highest_one(x | 1); } INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; } INLINE uint32_t counter_high(uint64_t counter) { return (uint32_t)(counter >> 32); } INLINE uint32_t load32(const void *src) { const uint8_t *p = (const uint8_t *)src; return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) | ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24); } INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN], uint32_t key_words[8]) { key_words[0] = load32(&key[0 * 4]); key_words[1] = load32(&key[1 * 4]); key_words[2] = load32(&key[2 * 4]); key_words[3] = load32(&key[3 * 4]); key_words[4] = load32(&key[4 * 4]); key_words[5] = load32(&key[5 * 4]); key_words[6] = load32(&key[6 * 4]); key_words[7] = load32(&key[7 * 4]); } INLINE void load_block_words(const uint8_t block[BLAKE3_BLOCK_LEN], uint32_t block_words[16]) { for (size_t i = 0; i < 16; i++) { block_words[i] = load32(&block[i * 4]); } } INLINE void store32(void *dst, uint32_t w) { uint8_t *p = (uint8_t *)dst; p[0] = (uint8_t)(w >> 0); p[1] = (uint8_t)(w >> 8); p[2] = (uint8_t)(w >> 16); p[3] = (uint8_t)(w >> 24); } INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) { store32(&bytes_out[0 * 4], cv_words[0]); store32(&bytes_out[1 * 4], cv_words[1]); store32(&bytes_out[2 * 4], cv_words[2]); store32(&bytes_out[3 * 4], cv_words[3]); store32(&bytes_out[4 * 4], cv_words[4]); store32(&bytes_out[5 * 4], cv_words[5]); store32(&bytes_out[6 * 4], cv_words[6]); store32(&bytes_out[7 * 4], cv_words[7]); } void blake3_compress_in_place(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags); void blake3_compress_xof(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]); void blake3_xof_many(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, 
uint8_t flags, uint8_t out[64], size_t outblocks); void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); size_t blake3_simd_degree(void); // Declarations for implementation-specific functions. void blake3_compress_in_place_portable(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags); void blake3_compress_xof_portable(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]); void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); #if defined(IS_X86) #if !defined(BLAKE3_NO_SSE2) void blake3_compress_in_place_sse2(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags); void blake3_compress_xof_sse2(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]); void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); #endif #if !defined(BLAKE3_NO_SSE41) void blake3_compress_in_place_sse41(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags); void blake3_compress_xof_sse41(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]); void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, 
uint8_t flags_start, uint8_t flags_end, uint8_t *out); #endif #if !defined(BLAKE3_NO_AVX2) void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); #endif #if !defined(BLAKE3_NO_AVX512) void blake3_compress_in_place_avx512(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags); void blake3_compress_xof_avx512(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]); void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); #if !defined(_WIN32) void blake3_xof_many_avx512(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t* out, size_t outblocks); #endif #endif #endif #if BLAKE3_USE_NEON == 1 void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); #endif #endif /* BLAKE3_IMPL_H */ blake3-1.5.4/c/blake3_neon.c000064400000000000000000000330111046102023000135400ustar 00000000000000#include "blake3_impl.h" #include #ifdef __ARM_BIG_ENDIAN #error "This implementation only supports little-endian ARM." // It might be that all we need for big-endian support here is to get the loads // and stores right, but step zero would be finding a way to test it in CI. #endif INLINE uint32x4_t loadu_128(const uint8_t src[16]) { // vld1q_u32 has alignment requirements. Don't use it. 
return vreinterpretq_u32_u8(vld1q_u8(src)); } INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) { // vst1q_u32 has alignment requirements. Don't use it. vst1q_u8(dest, vreinterpretq_u8_u32(src)); } INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) { return vaddq_u32(a, b); } INLINE uint32x4_t xor_128(uint32x4_t a, uint32x4_t b) { return veorq_u32(a, b); } INLINE uint32x4_t set1_128(uint32_t x) { return vld1q_dup_u32(&x); } INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { uint32_t array[4] = {a, b, c, d}; return vld1q_u32(array); } INLINE uint32x4_t rot16_128(uint32x4_t x) { // The straightforward implementation would be two shifts and an or, but that's // slower on microarchitectures we've tested. See // https://github.com/BLAKE3-team/BLAKE3/pull/319. // return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16)); return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x))); } INLINE uint32x4_t rot12_128(uint32x4_t x) { // See comment in rot16_128. // return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12)); return vsriq_n_u32(vshlq_n_u32(x, 32-12), x, 12); } INLINE uint32x4_t rot8_128(uint32x4_t x) { // See comment in rot16_128. // return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8)); #if defined(__clang__) return vreinterpretq_u32_u8(__builtin_shufflevector(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), 1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12)); #elif __GNUC__ * 10000 + __GNUC_MINOR__ * 100 >=40700 static const uint8x16_t r8 = {1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12}; return vreinterpretq_u32_u8(__builtin_shuffle(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), r8)); #else return vsriq_n_u32(vshlq_n_u32(x, 32-8), x, 8); #endif } INLINE uint32x4_t rot7_128(uint32x4_t x) { // See comment in rot16_128. 
// return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7)); return vsriq_n_u32(vshlq_n_u32(x, 32-7), x, 7); } // TODO: compress_neon // TODO: hash2_neon /* * ---------------------------------------------------------------------------- * hash4_neon * ---------------------------------------------------------------------------- */ INLINE void round_fn4(uint32x4_t v[16], uint32x4_t m[16], size_t r) { v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); v[0] = add_128(v[0], v[4]); v[1] = add_128(v[1], v[5]); v[2] = add_128(v[2], v[6]); v[3] = add_128(v[3], v[7]); v[12] = xor_128(v[12], v[0]); v[13] = xor_128(v[13], v[1]); v[14] = xor_128(v[14], v[2]); v[15] = xor_128(v[15], v[3]); v[12] = rot16_128(v[12]); v[13] = rot16_128(v[13]); v[14] = rot16_128(v[14]); v[15] = rot16_128(v[15]); v[8] = add_128(v[8], v[12]); v[9] = add_128(v[9], v[13]); v[10] = add_128(v[10], v[14]); v[11] = add_128(v[11], v[15]); v[4] = xor_128(v[4], v[8]); v[5] = xor_128(v[5], v[9]); v[6] = xor_128(v[6], v[10]); v[7] = xor_128(v[7], v[11]); v[4] = rot12_128(v[4]); v[5] = rot12_128(v[5]); v[6] = rot12_128(v[6]); v[7] = rot12_128(v[7]); v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); v[0] = add_128(v[0], v[4]); v[1] = add_128(v[1], v[5]); v[2] = add_128(v[2], v[6]); v[3] = add_128(v[3], v[7]); v[12] = xor_128(v[12], v[0]); v[13] = xor_128(v[13], v[1]); v[14] = xor_128(v[14], v[2]); v[15] = xor_128(v[15], v[3]); v[12] = rot8_128(v[12]); v[13] = rot8_128(v[13]); v[14] = rot8_128(v[14]); v[15] = rot8_128(v[15]); v[8] = add_128(v[8], v[12]); v[9] = add_128(v[9], v[13]); v[10] = add_128(v[10], v[14]); v[11] = add_128(v[11], v[15]); v[4] = xor_128(v[4], v[8]); v[5] = xor_128(v[5], 
v[9]); v[6] = xor_128(v[6], v[10]); v[7] = xor_128(v[7], v[11]); v[4] = rot7_128(v[4]); v[5] = rot7_128(v[5]); v[6] = rot7_128(v[6]); v[7] = rot7_128(v[7]); v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); v[0] = add_128(v[0], v[5]); v[1] = add_128(v[1], v[6]); v[2] = add_128(v[2], v[7]); v[3] = add_128(v[3], v[4]); v[15] = xor_128(v[15], v[0]); v[12] = xor_128(v[12], v[1]); v[13] = xor_128(v[13], v[2]); v[14] = xor_128(v[14], v[3]); v[15] = rot16_128(v[15]); v[12] = rot16_128(v[12]); v[13] = rot16_128(v[13]); v[14] = rot16_128(v[14]); v[10] = add_128(v[10], v[15]); v[11] = add_128(v[11], v[12]); v[8] = add_128(v[8], v[13]); v[9] = add_128(v[9], v[14]); v[5] = xor_128(v[5], v[10]); v[6] = xor_128(v[6], v[11]); v[7] = xor_128(v[7], v[8]); v[4] = xor_128(v[4], v[9]); v[5] = rot12_128(v[5]); v[6] = rot12_128(v[6]); v[7] = rot12_128(v[7]); v[4] = rot12_128(v[4]); v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); v[0] = add_128(v[0], v[5]); v[1] = add_128(v[1], v[6]); v[2] = add_128(v[2], v[7]); v[3] = add_128(v[3], v[4]); v[15] = xor_128(v[15], v[0]); v[12] = xor_128(v[12], v[1]); v[13] = xor_128(v[13], v[2]); v[14] = xor_128(v[14], v[3]); v[15] = rot8_128(v[15]); v[12] = rot8_128(v[12]); v[13] = rot8_128(v[13]); v[14] = rot8_128(v[14]); v[10] = add_128(v[10], v[15]); v[11] = add_128(v[11], v[12]); v[8] = add_128(v[8], v[13]); v[9] = add_128(v[9], v[14]); v[5] = xor_128(v[5], v[10]); v[6] = xor_128(v[6], v[11]); v[7] = xor_128(v[7], v[8]); v[4] = xor_128(v[4], v[9]); v[5] = rot7_128(v[5]); v[6] = rot7_128(v[6]); v[7] = rot7_128(v[7]); v[4] = rot7_128(v[4]); } INLINE void transpose_vecs_128(uint32x4_t vecs[4]) { // Individually transpose the 
four 2x2 sub-matrices in each corner. uint32x4x2_t rows01 = vtrnq_u32(vecs[0], vecs[1]); uint32x4x2_t rows23 = vtrnq_u32(vecs[2], vecs[3]); // Swap the top-right and bottom-left 2x2s (which just got transposed). vecs[0] = vcombine_u32(vget_low_u32(rows01.val[0]), vget_low_u32(rows23.val[0])); vecs[1] = vcombine_u32(vget_low_u32(rows01.val[1]), vget_low_u32(rows23.val[1])); vecs[2] = vcombine_u32(vget_high_u32(rows01.val[0]), vget_high_u32(rows23.val[0])); vecs[3] = vcombine_u32(vget_high_u32(rows01.val[1]), vget_high_u32(rows23.val[1])); } INLINE void transpose_msg_vecs4(const uint8_t *const *inputs, size_t block_offset, uint32x4_t out[16]) { out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(uint32x4_t)]); out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(uint32x4_t)]); out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(uint32x4_t)]); out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(uint32x4_t)]); out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(uint32x4_t)]); out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(uint32x4_t)]); out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(uint32x4_t)]); out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(uint32x4_t)]); out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(uint32x4_t)]); out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(uint32x4_t)]); out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(uint32x4_t)]); out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(uint32x4_t)]); out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(uint32x4_t)]); out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(uint32x4_t)]); out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(uint32x4_t)]); out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(uint32x4_t)]); transpose_vecs_128(&out[0]); transpose_vecs_128(&out[4]); transpose_vecs_128(&out[8]); transpose_vecs_128(&out[12]); } INLINE void load_counters4(uint64_t counter, bool increment_counter, uint32x4_t 
*out_low, uint32x4_t *out_high) { uint64_t mask = (increment_counter ? ~0 : 0); *out_low = set4( counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)), counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3))); *out_high = set4( counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)), counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3))); } void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { uint32x4_t h_vecs[8] = { set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]), set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]), }; uint32x4_t counter_low_vec, counter_high_vec; load_counters4(counter, increment_counter, &counter_low_vec, &counter_high_vec); uint8_t block_flags = flags | flags_start; for (size_t block = 0; block < blocks; block++) { if (block + 1 == blocks) { block_flags |= flags_end; } uint32x4_t block_len_vec = set1_128(BLAKE3_BLOCK_LEN); uint32x4_t block_flags_vec = set1_128(block_flags); uint32x4_t msg_vecs[16]; transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); uint32x4_t v[16] = { h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]), counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, }; round_fn4(v, msg_vecs, 0); round_fn4(v, msg_vecs, 1); round_fn4(v, msg_vecs, 2); round_fn4(v, msg_vecs, 3); round_fn4(v, msg_vecs, 4); round_fn4(v, msg_vecs, 5); round_fn4(v, msg_vecs, 6); h_vecs[0] = xor_128(v[0], v[8]); h_vecs[1] = xor_128(v[1], v[9]); h_vecs[2] = xor_128(v[2], v[10]); h_vecs[3] = xor_128(v[3], v[11]); h_vecs[4] = xor_128(v[4], v[12]); h_vecs[5] = xor_128(v[5], v[13]); h_vecs[6] = xor_128(v[6], v[14]); h_vecs[7] = xor_128(v[7], v[15]); block_flags = flags; } 
transpose_vecs_128(&h_vecs[0]); transpose_vecs_128(&h_vecs[4]); // The first four vecs now contain the first half of each output, and the // second four vecs contain the second half of each output. storeu_128(h_vecs[0], &out[0 * sizeof(uint32x4_t)]); storeu_128(h_vecs[4], &out[1 * sizeof(uint32x4_t)]); storeu_128(h_vecs[1], &out[2 * sizeof(uint32x4_t)]); storeu_128(h_vecs[5], &out[3 * sizeof(uint32x4_t)]); storeu_128(h_vecs[2], &out[4 * sizeof(uint32x4_t)]); storeu_128(h_vecs[6], &out[5 * sizeof(uint32x4_t)]); storeu_128(h_vecs[3], &out[6 * sizeof(uint32x4_t)]); storeu_128(h_vecs[7], &out[7 * sizeof(uint32x4_t)]); } /* * ---------------------------------------------------------------------------- * hash_many_neon * ---------------------------------------------------------------------------- */ void blake3_compress_in_place_portable(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags); INLINE void hash_one_neon(const uint8_t *input, size_t blocks, const uint32_t key[8], uint64_t counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { uint32_t cv[8]; memcpy(cv, key, BLAKE3_KEY_LEN); uint8_t block_flags = flags | flags_start; while (blocks > 0) { if (blocks == 1) { block_flags |= flags_end; } // TODO: Implement compress_neon. However note that according to // https://github.com/BLAKE2/BLAKE2/commit/7965d3e6e1b4193438b8d3a656787587d2579227, // compress_neon might not be any faster than compress_portable. 
blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags); input = &input[BLAKE3_BLOCK_LEN]; blocks -= 1; block_flags = flags; } memcpy(out, cv, BLAKE3_OUT_LEN); } void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { while (num_inputs >= 4) { blake3_hash4_neon(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); if (increment_counter) { counter += 4; } inputs += 4; num_inputs -= 4; out = &out[4 * BLAKE3_OUT_LEN]; } while (num_inputs > 0) { hash_one_neon(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out); if (increment_counter) { counter += 1; } inputs += 1; num_inputs -= 1; out = &out[BLAKE3_OUT_LEN]; } } blake3-1.5.4/c/blake3_portable.c000064400000000000000000000134011046102023000144120ustar 00000000000000#include "blake3_impl.h" #include INLINE uint32_t rotr32(uint32_t w, uint32_t c) { return (w >> c) | (w << (32 - c)); } INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d, uint32_t x, uint32_t y) { state[a] = state[a] + state[b] + x; state[d] = rotr32(state[d] ^ state[a], 16); state[c] = state[c] + state[d]; state[b] = rotr32(state[b] ^ state[c], 12); state[a] = state[a] + state[b] + y; state[d] = rotr32(state[d] ^ state[a], 8); state[c] = state[c] + state[d]; state[b] = rotr32(state[b] ^ state[c], 7); } INLINE void round_fn(uint32_t state[16], const uint32_t *msg, size_t round) { // Select the message schedule based on the round. const uint8_t *schedule = MSG_SCHEDULE[round]; // Mix the columns. g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); // Mix the rows. 
g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); } INLINE void compress_pre(uint32_t state[16], const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags) { uint32_t block_words[16]; block_words[0] = load32(block + 4 * 0); block_words[1] = load32(block + 4 * 1); block_words[2] = load32(block + 4 * 2); block_words[3] = load32(block + 4 * 3); block_words[4] = load32(block + 4 * 4); block_words[5] = load32(block + 4 * 5); block_words[6] = load32(block + 4 * 6); block_words[7] = load32(block + 4 * 7); block_words[8] = load32(block + 4 * 8); block_words[9] = load32(block + 4 * 9); block_words[10] = load32(block + 4 * 10); block_words[11] = load32(block + 4 * 11); block_words[12] = load32(block + 4 * 12); block_words[13] = load32(block + 4 * 13); block_words[14] = load32(block + 4 * 14); block_words[15] = load32(block + 4 * 15); state[0] = cv[0]; state[1] = cv[1]; state[2] = cv[2]; state[3] = cv[3]; state[4] = cv[4]; state[5] = cv[5]; state[6] = cv[6]; state[7] = cv[7]; state[8] = IV[0]; state[9] = IV[1]; state[10] = IV[2]; state[11] = IV[3]; state[12] = counter_low(counter); state[13] = counter_high(counter); state[14] = (uint32_t)block_len; state[15] = (uint32_t)flags; round_fn(state, &block_words[0], 0); round_fn(state, &block_words[0], 1); round_fn(state, &block_words[0], 2); round_fn(state, &block_words[0], 3); round_fn(state, &block_words[0], 4); round_fn(state, &block_words[0], 5); round_fn(state, &block_words[0], 6); } void blake3_compress_in_place_portable(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags) { uint32_t state[16]; compress_pre(state, cv, block, block_len, counter, flags); cv[0] = state[0] ^ state[8]; cv[1] = state[1] ^ state[9]; cv[2] = state[2] ^ 
state[10]; cv[3] = state[3] ^ state[11]; cv[4] = state[4] ^ state[12]; cv[5] = state[5] ^ state[13]; cv[6] = state[6] ^ state[14]; cv[7] = state[7] ^ state[15]; } void blake3_compress_xof_portable(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]) { uint32_t state[16]; compress_pre(state, cv, block, block_len, counter, flags); store32(&out[0 * 4], state[0] ^ state[8]); store32(&out[1 * 4], state[1] ^ state[9]); store32(&out[2 * 4], state[2] ^ state[10]); store32(&out[3 * 4], state[3] ^ state[11]); store32(&out[4 * 4], state[4] ^ state[12]); store32(&out[5 * 4], state[5] ^ state[13]); store32(&out[6 * 4], state[6] ^ state[14]); store32(&out[7 * 4], state[7] ^ state[15]); store32(&out[8 * 4], state[8] ^ cv[0]); store32(&out[9 * 4], state[9] ^ cv[1]); store32(&out[10 * 4], state[10] ^ cv[2]); store32(&out[11 * 4], state[11] ^ cv[3]); store32(&out[12 * 4], state[12] ^ cv[4]); store32(&out[13 * 4], state[13] ^ cv[5]); store32(&out[14 * 4], state[14] ^ cv[6]); store32(&out[15 * 4], state[15] ^ cv[7]); } INLINE void hash_one_portable(const uint8_t *input, size_t blocks, const uint32_t key[8], uint64_t counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { uint32_t cv[8]; memcpy(cv, key, BLAKE3_KEY_LEN); uint8_t block_flags = flags | flags_start; while (blocks > 0) { if (blocks == 1) { block_flags |= flags_end; } blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags); input = &input[BLAKE3_BLOCK_LEN]; blocks -= 1; block_flags = flags; } store_cv_words(out, cv); } void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { while (num_inputs > 0) { hash_one_portable(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out); if (increment_counter) { 
counter += 1; } inputs += 1; num_inputs -= 1; out = &out[BLAKE3_OUT_LEN]; } } blake3-1.5.4/c/blake3_sse2.c000064400000000000000000000510151046102023000134610ustar 00000000000000#include "blake3_impl.h" #include #define DEGREE 4 #define _mm_shuffle_ps2(a, b, c) \ (_mm_castps_si128( \ _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) INLINE __m128i loadu(const uint8_t src[16]) { return _mm_loadu_si128((const __m128i *)src); } INLINE void storeu(__m128i src, uint8_t dest[16]) { _mm_storeu_si128((__m128i *)dest, src); } INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } // Note that clang-format doesn't like the name "xor" for some reason. INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); } INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); } INLINE __m128i rot16(__m128i x) { return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0xB1), 0xB1); } INLINE __m128i rot12(__m128i x) { return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12)); } INLINE __m128i rot8(__m128i x) { return xorv(_mm_srli_epi32(x, 8), _mm_slli_epi32(x, 32 - 8)); } INLINE __m128i rot7(__m128i x) { return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7)); } INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m) { *row0 = addv(addv(*row0, m), *row1); *row3 = xorv(*row3, *row0); *row3 = rot16(*row3); *row2 = addv(*row2, *row3); *row1 = xorv(*row1, *row2); *row1 = rot12(*row1); } INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m) { *row0 = addv(addv(*row0, m), *row1); *row3 = xorv(*row3, *row0); *row3 = rot8(*row3); *row2 = addv(*row2, *row3); *row1 = xorv(*row1, *row2); *row1 = rot7(*row1); } // Note the optimization here of leaving row1 as the unrotated row, rather than // row0. 
All the message loads below are adjusted to compensate for this. See // discussion at https://github.com/sneves/blake2-avx2/pull/4 INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); } INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); } INLINE __m128i blend_epi16(__m128i a, __m128i b, const int16_t imm8) { const __m128i bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); __m128i mask = _mm_set1_epi16(imm8); mask = _mm_and_si128(mask, bits); mask = _mm_cmpeq_epi16(mask, bits); return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a)); } INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags) { rows[0] = loadu((uint8_t *)&cv[0]); rows[1] = loadu((uint8_t *)&cv[4]); rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); rows[3] = set4(counter_low(counter), counter_high(counter), (uint32_t)block_len, (uint32_t)flags); __m128i m0 = loadu(&block[sizeof(__m128i) * 0]); __m128i m1 = loadu(&block[sizeof(__m128i) * 1]); __m128i m2 = loadu(&block[sizeof(__m128i) * 2]); __m128i m3 = loadu(&block[sizeof(__m128i) * 3]); __m128i t0, t1, t2, t3, tt; // Round 1. The first round permutes the message words from the original // input order, into the groups that get mixed in parallel. 
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 2. This round and all following rounds apply a fixed permutation // to the message words from the round before. t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); t1 = blend_epi16(tt, t1, 0xCC); g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); t2 = _mm_unpacklo_epi64(m3, m1); tt = blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 3 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); t1 = blend_epi16(tt, t1, 0xCC); g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 
diagonalize(&rows[0], &rows[2], &rows[3]); t2 = _mm_unpacklo_epi64(m3, m1); tt = blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 4 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); t1 = blend_epi16(tt, t1, 0xCC); g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); t2 = _mm_unpacklo_epi64(m3, m1); tt = blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 5 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); t1 = blend_epi16(tt, t1, 0xCC); g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); t2 = _mm_unpacklo_epi64(m3, m1); tt = blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 
undiagonalize(&rows[0], &rows[2], &rows[3]); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 6 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); t1 = blend_epi16(tt, t1, 0xCC); g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); t2 = _mm_unpacklo_epi64(m3, m1); tt = blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 7 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); t1 = blend_epi16(tt, t1, 0xCC); g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); t2 = _mm_unpacklo_epi64(m3, m1); tt = blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); } void blake3_compress_in_place_sse2(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags) { __m128i rows[4]; compress_pre(rows, cv, block, block_len, counter, flags); storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]); storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]); } void 
blake3_compress_xof_sse2(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]) { __m128i rows[4]; compress_pre(rows, cv, block, block_len, counter, flags); storeu(xorv(rows[0], rows[2]), &out[0]); storeu(xorv(rows[1], rows[3]), &out[16]); storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]); storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]); } INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) { v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); v[0] = addv(v[0], v[4]); v[1] = addv(v[1], v[5]); v[2] = addv(v[2], v[6]); v[3] = addv(v[3], v[7]); v[12] = xorv(v[12], v[0]); v[13] = xorv(v[13], v[1]); v[14] = xorv(v[14], v[2]); v[15] = xorv(v[15], v[3]); v[12] = rot16(v[12]); v[13] = rot16(v[13]); v[14] = rot16(v[14]); v[15] = rot16(v[15]); v[8] = addv(v[8], v[12]); v[9] = addv(v[9], v[13]); v[10] = addv(v[10], v[14]); v[11] = addv(v[11], v[15]); v[4] = xorv(v[4], v[8]); v[5] = xorv(v[5], v[9]); v[6] = xorv(v[6], v[10]); v[7] = xorv(v[7], v[11]); v[4] = rot12(v[4]); v[5] = rot12(v[5]); v[6] = rot12(v[6]); v[7] = rot12(v[7]); v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); v[0] = addv(v[0], v[4]); v[1] = addv(v[1], v[5]); v[2] = addv(v[2], v[6]); v[3] = addv(v[3], v[7]); v[12] = xorv(v[12], v[0]); v[13] = xorv(v[13], v[1]); v[14] = xorv(v[14], v[2]); v[15] = xorv(v[15], v[3]); v[12] = rot8(v[12]); v[13] = rot8(v[13]); v[14] = rot8(v[14]); v[15] = rot8(v[15]); v[8] = addv(v[8], v[12]); v[9] = addv(v[9], v[13]); v[10] = addv(v[10], v[14]); v[11] = addv(v[11], v[15]); v[4] = xorv(v[4], v[8]); v[5] = xorv(v[5], v[9]); v[6] = xorv(v[6], v[10]); v[7] = xorv(v[7], v[11]); v[4] = 
rot7(v[4]); v[5] = rot7(v[5]); v[6] = rot7(v[6]); v[7] = rot7(v[7]); v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); v[0] = addv(v[0], v[5]); v[1] = addv(v[1], v[6]); v[2] = addv(v[2], v[7]); v[3] = addv(v[3], v[4]); v[15] = xorv(v[15], v[0]); v[12] = xorv(v[12], v[1]); v[13] = xorv(v[13], v[2]); v[14] = xorv(v[14], v[3]); v[15] = rot16(v[15]); v[12] = rot16(v[12]); v[13] = rot16(v[13]); v[14] = rot16(v[14]); v[10] = addv(v[10], v[15]); v[11] = addv(v[11], v[12]); v[8] = addv(v[8], v[13]); v[9] = addv(v[9], v[14]); v[5] = xorv(v[5], v[10]); v[6] = xorv(v[6], v[11]); v[7] = xorv(v[7], v[8]); v[4] = xorv(v[4], v[9]); v[5] = rot12(v[5]); v[6] = rot12(v[6]); v[7] = rot12(v[7]); v[4] = rot12(v[4]); v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); v[0] = addv(v[0], v[5]); v[1] = addv(v[1], v[6]); v[2] = addv(v[2], v[7]); v[3] = addv(v[3], v[4]); v[15] = xorv(v[15], v[0]); v[12] = xorv(v[12], v[1]); v[13] = xorv(v[13], v[2]); v[14] = xorv(v[14], v[3]); v[15] = rot8(v[15]); v[12] = rot8(v[12]); v[13] = rot8(v[13]); v[14] = rot8(v[14]); v[10] = addv(v[10], v[15]); v[11] = addv(v[11], v[12]); v[8] = addv(v[8], v[13]); v[9] = addv(v[9], v[14]); v[5] = xorv(v[5], v[10]); v[6] = xorv(v[6], v[11]); v[7] = xorv(v[7], v[8]); v[4] = xorv(v[4], v[9]); v[5] = rot7(v[5]); v[6] = rot7(v[6]); v[7] = rot7(v[7]); v[4] = rot7(v[4]); } INLINE void transpose_vecs(__m128i vecs[DEGREE]) { // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is // 22/33. Note that this doesn't split the vector into two lanes, as the // AVX2 counterparts do. 
__m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); // Interleave 64-bit lanes. __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); vecs[0] = abcd_0; vecs[1] = abcd_1; vecs[2] = abcd_2; vecs[3] = abcd_3; } INLINE void transpose_msg_vecs(const uint8_t *const *inputs, size_t block_offset, __m128i out[16]) { out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]); out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]); out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]); out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]); out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]); out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]); out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]); out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]); out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]); out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]); out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]); out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]); out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]); out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]); out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]); out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]); for (size_t i = 0; i < 4; ++i) { _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0); } transpose_vecs(&out[0]); transpose_vecs(&out[4]); transpose_vecs(&out[8]); transpose_vecs(&out[12]); } INLINE void load_counters(uint64_t counter, bool increment_counter, __m128i *out_lo, __m128i *out_hi) { const __m128i mask = 
_mm_set1_epi32(-(int32_t)increment_counter); const __m128i add0 = _mm_set_epi32(3, 2, 1, 0); const __m128i add1 = _mm_and_si128(mask, add0); __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1); __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)), _mm_xor_si128( l, _mm_set1_epi32(0x80000000))); __m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry); *out_lo = l; *out_hi = h; } static void blake3_hash4_sse2(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { __m128i h_vecs[8] = { set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), }; __m128i counter_low_vec, counter_high_vec; load_counters(counter, increment_counter, &counter_low_vec, &counter_high_vec); uint8_t block_flags = flags | flags_start; for (size_t block = 0; block < blocks; block++) { if (block + 1 == blocks) { block_flags |= flags_end; } __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN); __m128i block_flags_vec = set1(block_flags); __m128i msg_vecs[16]; transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); __m128i v[16] = { h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, }; round_fn(v, msg_vecs, 0); round_fn(v, msg_vecs, 1); round_fn(v, msg_vecs, 2); round_fn(v, msg_vecs, 3); round_fn(v, msg_vecs, 4); round_fn(v, msg_vecs, 5); round_fn(v, msg_vecs, 6); h_vecs[0] = xorv(v[0], v[8]); h_vecs[1] = xorv(v[1], v[9]); h_vecs[2] = xorv(v[2], v[10]); h_vecs[3] = xorv(v[3], v[11]); h_vecs[4] = xorv(v[4], v[12]); h_vecs[5] = xorv(v[5], v[13]); h_vecs[6] = xorv(v[6], v[14]); h_vecs[7] = xorv(v[7], v[15]); block_flags = flags; } transpose_vecs(&h_vecs[0]); transpose_vecs(&h_vecs[4]); // The first four vecs 
now contain the first half of each output, and the // second four vecs contain the second half of each output. storeu(h_vecs[0], &out[0 * sizeof(__m128i)]); storeu(h_vecs[4], &out[1 * sizeof(__m128i)]); storeu(h_vecs[1], &out[2 * sizeof(__m128i)]); storeu(h_vecs[5], &out[3 * sizeof(__m128i)]); storeu(h_vecs[2], &out[4 * sizeof(__m128i)]); storeu(h_vecs[6], &out[5 * sizeof(__m128i)]); storeu(h_vecs[3], &out[6 * sizeof(__m128i)]); storeu(h_vecs[7], &out[7 * sizeof(__m128i)]); } INLINE void hash_one_sse2(const uint8_t *input, size_t blocks, const uint32_t key[8], uint64_t counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { uint32_t cv[8]; memcpy(cv, key, BLAKE3_KEY_LEN); uint8_t block_flags = flags | flags_start; while (blocks > 0) { if (blocks == 1) { block_flags |= flags_end; } blake3_compress_in_place_sse2(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags); input = &input[BLAKE3_BLOCK_LEN]; blocks -= 1; block_flags = flags; } memcpy(out, cv, BLAKE3_OUT_LEN); } void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { while (num_inputs >= DEGREE) { blake3_hash4_sse2(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); if (increment_counter) { counter += DEGREE; } inputs += DEGREE; num_inputs -= DEGREE; out = &out[DEGREE * BLAKE3_OUT_LEN]; } while (num_inputs > 0) { hash_one_sse2(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out); if (increment_counter) { counter += 1; } inputs += 1; num_inputs -= 1; out = &out[BLAKE3_OUT_LEN]; } } blake3-1.5.4/c/blake3_sse2_x86-64_unix.S000064400000000000000000002063721046102023000154500ustar 00000000000000#if defined(__ELF__) && defined(__linux__) .section .note.GNU-stack,"",%progbits #endif #if defined(__ELF__) && defined(__CET__) && defined(__has_include) #if 
__has_include(<cet.h>) #include <cet.h> #endif #endif #if !defined(_CET_ENDBR) #define _CET_ENDBR #endif .intel_syntax noprefix .global blake3_hash_many_sse2 .global _blake3_hash_many_sse2 .global blake3_compress_in_place_sse2 .global _blake3_compress_in_place_sse2 .global blake3_compress_xof_sse2 .global _blake3_compress_xof_sse2 #ifdef __APPLE__ .text #else .section .text #endif .p2align 6 _blake3_hash_many_sse2: blake3_hash_many_sse2: _CET_ENDBR push r15 push r14 push r13 push r12 push rbx push rbp mov rbp, rsp sub rsp, 360 and rsp, 0xFFFFFFFFFFFFFFC0 neg r9d movd xmm0, r9d pshufd xmm0, xmm0, 0x00 movdqa xmmword ptr [rsp+0x130], xmm0 movdqa xmm1, xmm0 pand xmm1, xmmword ptr [ADD0+rip] pand xmm0, xmmword ptr [ADD1+rip] movdqa xmmword ptr [rsp+0x150], xmm0 movd xmm0, r8d pshufd xmm0, xmm0, 0x00 paddd xmm0, xmm1 movdqa xmmword ptr [rsp+0x110], xmm0 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] pcmpgtd xmm1, xmm0 shr r8, 32 movd xmm2, r8d pshufd xmm2, xmm2, 0x00 psubd xmm2, xmm1 movdqa xmmword ptr [rsp+0x120], xmm2 mov rbx, qword ptr [rbp+0x50] mov r15, rdx shl r15, 6 movzx r13d, byte ptr [rbp+0x38] movzx r12d, byte ptr [rbp+0x48] cmp rsi, 4 jc 3f 2: movdqu xmm3, xmmword ptr [rcx] pshufd xmm0, xmm3, 0x00 pshufd xmm1, xmm3, 0x55 pshufd xmm2, xmm3, 0xAA pshufd xmm3, xmm3, 0xFF movdqu xmm7, xmmword ptr [rcx+0x10] pshufd xmm4, xmm7, 0x00 pshufd xmm5, xmm7, 0x55 pshufd xmm6, xmm7, 0xAA pshufd xmm7, xmm7, 0xFF mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx 9: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movdqu xmm8, xmmword ptr [r8+rdx-0x40] movdqu xmm9, xmmword ptr [r9+rdx-0x40] movdqu xmm10, xmmword ptr [r10+rdx-0x40] movdqu xmm11, xmmword ptr [r11+rdx-0x40] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8
punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp], xmm8 movdqa xmmword ptr [rsp+0x10], xmm9 movdqa xmmword ptr [rsp+0x20], xmm12 movdqa xmmword ptr [rsp+0x30], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-0x30] movdqu xmm9, xmmword ptr [r9+rdx-0x30] movdqu xmm10, xmmword ptr [r10+rdx-0x30] movdqu xmm11, xmmword ptr [r11+rdx-0x30] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0x40], xmm8 movdqa xmmword ptr [rsp+0x50], xmm9 movdqa xmmword ptr [rsp+0x60], xmm12 movdqa xmmword ptr [rsp+0x70], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-0x20] movdqu xmm9, xmmword ptr [r9+rdx-0x20] movdqu xmm10, xmmword ptr [r10+rdx-0x20] movdqu xmm11, xmmword ptr [r11+rdx-0x20] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0x80], xmm8 movdqa xmmword ptr [rsp+0x90], xmm9 movdqa xmmword ptr [rsp+0xA0], xmm12 movdqa xmmword ptr [rsp+0xB0], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-0x10] movdqu xmm9, xmmword ptr [r9+rdx-0x10] movdqu xmm10, xmmword ptr [r10+rdx-0x10] movdqu xmm11, xmmword ptr [r11+rdx-0x10] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0xC0], xmm8 movdqa xmmword ptr [rsp+0xD0], xmm9 movdqa xmmword ptr [rsp+0xE0], xmm12 movdqa xmmword ptr [rsp+0xF0], xmm13 movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] 
movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] movdqa xmm12, xmmword ptr [rsp+0x110] movdqa xmm13, xmmword ptr [rsp+0x120] movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] movd xmm15, eax pshufd xmm15, xmm15, 0x00 prefetcht0 [r8+rdx+0x80] prefetcht0 [r9+rdx+0x80] prefetcht0 [r10+rdx+0x80] prefetcht0 [r11+rdx+0x80] paddd xmm0, xmmword ptr [rsp] paddd xmm1, xmmword ptr [rsp+0x20] paddd xmm2, xmmword ptr [rsp+0x40] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x10] paddd xmm1, xmmword ptr [rsp+0x30] paddd xmm2, xmmword ptr [rsp+0x50] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 
pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x80] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp+0xC0] paddd xmm3, xmmword ptr [rsp+0xE0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x90] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0xD0] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 
movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x20] paddd xmm1, xmmword ptr [rsp+0x30] paddd xmm2, xmmword ptr [rsp+0x70] paddd xmm3, xmmword ptr [rsp+0x40] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x60] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp] paddd xmm3, xmmword ptr [rsp+0xD0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa 
xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x10] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0x90] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xB0] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp+0xE0] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 
por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x30] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp+0xD0] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x40] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0x20] paddd xmm3, xmmword ptr [rsp+0xE0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld 
xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x60] paddd xmm1, xmmword ptr [rsp+0x90] paddd xmm2, xmmword ptr [rsp+0xB0] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x50] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+0xF0] paddd xmm3, xmmword ptr [rsp+0x10] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa 
xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xA0] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0xE0] paddd xmm3, xmmword ptr [rsp+0xD0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x70] paddd xmm1, xmmword ptr [rsp+0x90] paddd xmm2, xmmword ptr [rsp+0x30] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 
por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x40] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0x50] paddd xmm3, xmmword ptr [rsp+0x10] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp] paddd xmm1, xmmword ptr [rsp+0x20] paddd xmm2, xmmword ptr [rsp+0x80] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 
7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xC0] paddd xmm1, xmmword ptr [rsp+0x90] paddd xmm2, xmmword ptr [rsp+0xF0] paddd xmm3, xmmword ptr [rsp+0xE0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xD0] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0xA0] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd 
xmm0, xmmword ptr [rsp+0x70] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x20] paddd xmm1, xmmword ptr [rsp+0x30] paddd xmm2, xmmword ptr [rsp+0x10] paddd xmm3, xmmword ptr [rsp+0x40] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x90] paddd xmm1, 
xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0x80] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xE0] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp+0xC0] paddd xmm3, xmmword ptr [rsp+0x10] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xD0] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword 
ptr [rsp+0x20] paddd xmm3, xmmword ptr [rsp+0x40] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x30] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp+0x60] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xB0] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp+0x10] paddd xmm3, xmmword ptr 
[rsp+0x80] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xF0] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+0x90] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xE0] paddd xmm1, xmmword ptr [rsp+0x20] paddd xmm2, xmmword ptr [rsp+0x30] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm5 paddd xmm1, xmm6 
paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xA0] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0x40] paddd xmm3, xmmword ptr [rsp+0xD0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 pxor xmm0, xmm8 pxor xmm1, xmm9 pxor xmm2, xmm10 pxor xmm3, xmm11 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 pxor xmm4, xmm12 pxor xmm5, xmm13 pxor xmm6, xmm14 pxor xmm7, xmm15 mov eax, r13d jne 9b movdqa xmm9, xmm0 punpckldq xmm0, xmm1 punpckhdq xmm9, xmm1 movdqa xmm11, xmm2 punpckldq xmm2, 
xmm3 punpckhdq xmm11, xmm3 movdqa xmm1, xmm0 punpcklqdq xmm0, xmm2 punpckhqdq xmm1, xmm2 movdqa xmm3, xmm9 punpcklqdq xmm9, xmm11 punpckhqdq xmm3, xmm11 movdqu xmmword ptr [rbx], xmm0 movdqu xmmword ptr [rbx+0x20], xmm1 movdqu xmmword ptr [rbx+0x40], xmm9 movdqu xmmword ptr [rbx+0x60], xmm3 movdqa xmm9, xmm4 punpckldq xmm4, xmm5 punpckhdq xmm9, xmm5 movdqa xmm11, xmm6 punpckldq xmm6, xmm7 punpckhdq xmm11, xmm7 movdqa xmm5, xmm4 punpcklqdq xmm4, xmm6 punpckhqdq xmm5, xmm6 movdqa xmm7, xmm9 punpcklqdq xmm9, xmm11 punpckhqdq xmm7, xmm11 movdqu xmmword ptr [rbx+0x10], xmm4 movdqu xmmword ptr [rbx+0x30], xmm5 movdqu xmmword ptr [rbx+0x50], xmm9 movdqu xmmword ptr [rbx+0x70], xmm7 movdqa xmm1, xmmword ptr [rsp+0x110] movdqa xmm0, xmm1 paddd xmm1, xmmword ptr [rsp+0x150] movdqa xmmword ptr [rsp+0x110], xmm1 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] pcmpgtd xmm0, xmm1 movdqa xmm1, xmmword ptr [rsp+0x120] psubd xmm1, xmm0 movdqa xmmword ptr [rsp+0x120], xmm1 add rbx, 128 add rdi, 32 sub rsi, 4 cmp rsi, 4 jnc 2b test rsi, rsi jnz 3f 4: mov rsp, rbp pop rbp pop rbx pop r12 pop r13 pop r14 pop r15 ret .p2align 5 3: test esi, 0x2 je 3f movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+0x10] movaps xmm8, xmm0 movaps xmm9, xmm1 movd xmm13, dword ptr [rsp+0x110] movd xmm14, dword ptr [rsp+0x120] punpckldq xmm13, xmm14 movaps xmmword ptr [rsp], xmm13 movd xmm14, dword ptr [rsp+0x114] movd xmm13, dword ptr [rsp+0x124] punpckldq xmm14, xmm13 movaps xmmword ptr [rsp+0x10], xmm14 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movaps xmm2, xmmword ptr [BLAKE3_IV+rip] movaps xmm10, xmm2 movups xmm4, xmmword ptr [r8+rdx-0x40] movups xmm5, xmmword ptr [r8+rdx-0x30] movaps xmm3, xmm4 shufps xmm4, xmm5, 136 shufps xmm3, xmm5, 221 movaps xmm5, xmm3 movups xmm6, xmmword ptr [r8+rdx-0x20] movups xmm7, xmmword 
ptr [r8+rdx-0x10] movaps xmm3, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm3, xmm7, 221 pshufd xmm7, xmm3, 0x93 movups xmm12, xmmword ptr [r9+rdx-0x40] movups xmm13, xmmword ptr [r9+rdx-0x30] movaps xmm11, xmm12 shufps xmm12, xmm13, 136 shufps xmm11, xmm13, 221 movaps xmm13, xmm11 movups xmm14, xmmword ptr [r9+rdx-0x20] movups xmm15, xmmword ptr [r9+rdx-0x10] movaps xmm11, xmm14 shufps xmm14, xmm15, 136 pshufd xmm14, xmm14, 0x93 shufps xmm11, xmm15, 221 pshufd xmm15, xmm11, 0x93 shl rax, 0x20 or rax, 0x40 movq xmm3, rax movdqa xmmword ptr [rsp+0x20], xmm3 movaps xmm3, xmmword ptr [rsp] movaps xmm11, xmmword ptr [rsp+0x10] punpcklqdq xmm3, xmmword ptr [rsp+0x20] punpcklqdq xmm11, xmmword ptr [rsp+0x20] mov al, 7 9: paddd xmm0, xmm4 paddd xmm8, xmm12 movaps xmmword ptr [rsp+0x20], xmm4 movaps xmmword ptr [rsp+0x30], xmm12 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 pshuflw xmm11, xmm11, 0xB1 pshufhw xmm11, xmm11, 0xB1 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 20 psrld xmm4, 12 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 20 psrld xmm4, 12 por xmm9, xmm4 paddd xmm0, xmm5 paddd xmm8, xmm13 movaps xmmword ptr [rsp+0x40], xmm5 movaps xmmword ptr [rsp+0x50], xmm13 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 movdqa xmm13, xmm3 psrld xmm3, 8 pslld xmm13, 24 pxor xmm3, xmm13 movdqa xmm13, xmm11 psrld xmm11, 8 pslld xmm13, 24 pxor xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 25 psrld xmm4, 7 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 25 psrld xmm4, 7 por xmm9, xmm4 pshufd xmm0, xmm0, 0x93 pshufd xmm8, xmm8, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm11, xmm11, 0x4E pshufd xmm2, xmm2, 0x39 pshufd xmm10, xmm10, 0x39 paddd xmm0, xmm6 paddd xmm8, xmm14 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 pshuflw xmm3, xmm3, 0xB1 
pshufhw xmm3, xmm3, 0xB1 pshuflw xmm11, xmm11, 0xB1 pshufhw xmm11, xmm11, 0xB1 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 20 psrld xmm4, 12 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 20 psrld xmm4, 12 por xmm9, xmm4 paddd xmm0, xmm7 paddd xmm8, xmm15 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 movdqa xmm13, xmm3 psrld xmm3, 8 pslld xmm13, 24 pxor xmm3, xmm13 movdqa xmm13, xmm11 psrld xmm11, 8 pslld xmm13, 24 pxor xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 25 psrld xmm4, 7 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 25 psrld xmm4, 7 por xmm9, xmm4 pshufd xmm0, xmm0, 0x39 pshufd xmm8, xmm8, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm11, xmm11, 0x4E pshufd xmm2, xmm2, 0x93 pshufd xmm10, xmm10, 0x93 dec al je 9f movdqa xmm12, xmmword ptr [rsp+0x20] movdqa xmm5, xmmword ptr [rsp+0x40] pshufd xmm13, xmm12, 0x0F shufps xmm12, xmm5, 214 pshufd xmm4, xmm12, 0x39 movdqa xmm12, xmm6 shufps xmm12, xmm7, 250 pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip] pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip] por xmm13, xmm12 movdqa xmmword ptr [rsp+0x20], xmm13 movdqa xmm12, xmm7 punpcklqdq xmm12, xmm5 movdqa xmm13, xmm6 pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip] pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip] por xmm12, xmm13 pshufd xmm12, xmm12, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmmword ptr [rsp+0x40], xmm12 movdqa xmm5, xmmword ptr [rsp+0x30] movdqa xmm13, xmmword ptr [rsp+0x50] pshufd xmm6, xmm5, 0x0F shufps xmm5, xmm13, 214 pshufd xmm12, xmm5, 0x39 movdqa xmm5, xmm14 shufps xmm5, xmm15, 250 pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip] pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip] por xmm6, xmm5 movdqa xmm5, xmm15 punpcklqdq xmm5, xmm13 movdqa xmmword ptr [rsp+0x30], xmm2 movdqa xmm2, xmm14 pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip] pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] 
por xmm5, xmm2 movdqa xmm2, xmmword ptr [rsp+0x30] pshufd xmm5, xmm5, 0x78 punpckhdq xmm13, xmm15 punpckldq xmm14, xmm13 pshufd xmm15, xmm14, 0x1E movdqa xmm13, xmm6 movdqa xmm14, xmm5 movdqa xmm5, xmmword ptr [rsp+0x20] movdqa xmm6, xmmword ptr [rsp+0x40] jmp 9b 9: pxor xmm0, xmm2 pxor xmm1, xmm3 pxor xmm8, xmm10 pxor xmm9, xmm11 mov eax, r13d cmp rdx, r15 jne 2b movups xmmword ptr [rbx], xmm0 movups xmmword ptr [rbx+0x10], xmm1 movups xmmword ptr [rbx+0x20], xmm8 movups xmmword ptr [rbx+0x30], xmm9 mov eax, dword ptr [rsp+0x130] neg eax mov r10d, dword ptr [rsp+0x110+8*rax] mov r11d, dword ptr [rsp+0x120+8*rax] mov dword ptr [rsp+0x110], r10d mov dword ptr [rsp+0x120], r11d add rdi, 16 add rbx, 64 sub rsi, 2 3: test esi, 0x1 je 4b movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+0x10] movd xmm13, dword ptr [rsp+0x110] movd xmm14, dword ptr [rsp+0x120] punpckldq xmm13, xmm14 mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movaps xmm2, xmmword ptr [BLAKE3_IV+rip] shl rax, 32 or rax, 64 movq xmm12, rax movdqa xmm3, xmm13 punpcklqdq xmm3, xmm12 movups xmm4, xmmword ptr [r8+rdx-0x40] movups xmm5, xmmword ptr [r8+rdx-0x30] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [r8+rdx-0x20] movups xmm7, xmmword ptr [r8+rdx-0x10] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 0x93 mov al, 7 9: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x93 pshufd xmm3, xmm3, 0x4E 
pshufd xmm2, xmm2, 0x39 paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x93 dec al jz 9f movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0x0F pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] por xmm9, xmm8 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 movdqa xmm10, xmm6 pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] por xmm8, xmm10 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp 9b 9: pxor xmm0, xmm2 pxor xmm1, xmm3 mov eax, r13d cmp rdx, r15 jne 2b movups xmmword ptr [rbx], xmm0 movups xmmword ptr [rbx+0x10], xmm1 jmp 4b .p2align 6 blake3_compress_in_place_sse2: _blake3_compress_in_place_sse2: _CET_ENDBR movups xmm0, xmmword ptr [rdi] movups xmm1, xmmword ptr [rdi+0x10] movaps xmm2, xmmword ptr [BLAKE3_IV+rip] shl r8, 32 add rdx, r8 movq xmm3, rcx movq xmm4, rdx punpcklqdq xmm3, xmm4 movups xmm4, xmmword ptr [rsi] movups xmm5, xmmword ptr [rsi+0x10] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [rsi+0x20] movups xmm7, xmmword ptr [rsi+0x30] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 0x93 mov al, 7 9: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld 
xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x39 paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x93 dec al jz 9f movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0x0F pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] por xmm9, xmm8 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 movdqa xmm10, xmm6 pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] por xmm8, xmm10 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp 9b 9: pxor xmm0, xmm2 pxor xmm1, xmm3 movups xmmword ptr [rdi], xmm0 movups xmmword ptr [rdi+0x10], xmm1 ret .p2align 6 blake3_compress_xof_sse2: _blake3_compress_xof_sse2: _CET_ENDBR movups xmm0, xmmword ptr [rdi] movups xmm1, xmmword ptr [rdi+0x10] movaps xmm2, xmmword ptr [BLAKE3_IV+rip] movzx eax, r8b movzx edx, dl shl rax, 32 add rdx, rax movq xmm3, rcx movq xmm4, rdx punpcklqdq xmm3, xmm4 movups xmm4, xmmword ptr [rsi] movups xmm5, xmmword ptr [rsi+0x10] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [rsi+0x20] movups xmm7, xmmword ptr [rsi+0x30] movaps xmm8, xmm6 shufps xmm6, 
xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 0x93 mov al, 7 9: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x39 paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x93 dec al jz 9f movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0x0F pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] por xmm9, xmm8 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 movdqa xmm10, xmm6 pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] por xmm8, xmm10 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp 9b 9: movdqu xmm4, xmmword ptr [rdi] movdqu xmm5, xmmword ptr [rdi+0x10] pxor xmm0, xmm2 pxor xmm1, xmm3 pxor xmm2, xmm4 pxor xmm3, xmm5 movups xmmword ptr [r9], xmm0 movups xmmword ptr [r9+0x10], xmm1 movups xmmword ptr [r9+0x20], xmm2 movups xmmword ptr [r9+0x30], xmm3 ret #ifdef __APPLE__ .static_data #else .section .rodata #endif .p2align 6 BLAKE3_IV: .long 0x6A09E667, 
0xBB67AE85 .long 0x3C6EF372, 0xA54FF53A ADD0: .long 0, 1, 2, 3 ADD1: .long 4, 4, 4, 4 BLAKE3_IV_0: .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 BLAKE3_IV_1: .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 BLAKE3_IV_2: .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 BLAKE3_IV_3: .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A BLAKE3_BLOCK_LEN: .long 64, 64, 64, 64 CMP_MSB_MASK: .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 PBLENDW_0x33_MASK: .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 PBLENDW_0xCC_MASK: .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF PBLENDW_0x3F_MASK: .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 PBLENDW_0xC0_MASK: .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF blake3-1.5.4/c/blake3_sse2_x86-64_windows_gnu.S000064400000000000000000002130421046102023000170200ustar 00000000000000.intel_syntax noprefix .global blake3_hash_many_sse2 .global _blake3_hash_many_sse2 .global blake3_compress_in_place_sse2 .global _blake3_compress_in_place_sse2 .global blake3_compress_xof_sse2 .global _blake3_compress_xof_sse2 .section .text .p2align 6 _blake3_hash_many_sse2: blake3_hash_many_sse2: push r15 push r14 push r13 push r12 push rsi push rdi push rbx push rbp mov rbp, rsp sub rsp, 528 and rsp, 0xFFFFFFFFFFFFFFC0 movdqa xmmword ptr [rsp+0x170], xmm6 movdqa xmmword ptr [rsp+0x180], xmm7 movdqa xmmword ptr [rsp+0x190], xmm8 movdqa xmmword ptr [rsp+0x1A0], xmm9 movdqa xmmword ptr [rsp+0x1B0], xmm10 movdqa xmmword ptr [rsp+0x1C0], xmm11 movdqa xmmword ptr [rsp+0x1D0], xmm12 movdqa xmmword ptr [rsp+0x1E0], xmm13 movdqa xmmword ptr [rsp+0x1F0], xmm14 movdqa xmmword ptr [rsp+0x200], xmm15 mov rdi, rcx mov rsi, rdx mov rdx, r8 mov rcx, r9 mov r8, qword ptr [rbp+0x68] movzx r9, byte ptr [rbp+0x70] neg r9d movd xmm0, r9d pshufd xmm0, xmm0, 0x00 movdqa xmmword ptr [rsp+0x130], xmm0 movdqa xmm1, xmm0 pand xmm1, xmmword ptr [ADD0+rip] pand xmm0, xmmword ptr [ADD1+rip] movdqa xmmword ptr [rsp+0x150], xmm0 movd xmm0, r8d 
pshufd xmm0, xmm0, 0x00 paddd xmm0, xmm1 movdqa xmmword ptr [rsp+0x110], xmm0 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] pcmpgtd xmm1, xmm0 shr r8, 32 movd xmm2, r8d pshufd xmm2, xmm2, 0x00 psubd xmm2, xmm1 movdqa xmmword ptr [rsp+0x120], xmm2 mov rbx, qword ptr [rbp+0x90] mov r15, rdx shl r15, 6 movzx r13d, byte ptr [rbp+0x78] movzx r12d, byte ptr [rbp+0x88] cmp rsi, 4 jc 3f 2: movdqu xmm3, xmmword ptr [rcx] pshufd xmm0, xmm3, 0x00 pshufd xmm1, xmm3, 0x55 pshufd xmm2, xmm3, 0xAA pshufd xmm3, xmm3, 0xFF movdqu xmm7, xmmword ptr [rcx+0x10] pshufd xmm4, xmm7, 0x00 pshufd xmm5, xmm7, 0x55 pshufd xmm6, xmm7, 0xAA pshufd xmm7, xmm7, 0xFF mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] movzx eax, byte ptr [rbp+0x80] or eax, r13d xor edx, edx 9: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movdqu xmm8, xmmword ptr [r8+rdx-0x40] movdqu xmm9, xmmword ptr [r9+rdx-0x40] movdqu xmm10, xmmword ptr [r10+rdx-0x40] movdqu xmm11, xmmword ptr [r11+rdx-0x40] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp], xmm8 movdqa xmmword ptr [rsp+0x10], xmm9 movdqa xmmword ptr [rsp+0x20], xmm12 movdqa xmmword ptr [rsp+0x30], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-0x30] movdqu xmm9, xmmword ptr [r9+rdx-0x30] movdqu xmm10, xmmword ptr [r10+rdx-0x30] movdqu xmm11, xmmword ptr [r11+rdx-0x30] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0x40], xmm8 movdqa xmmword ptr [rsp+0x50], xmm9 movdqa xmmword ptr 
[rsp+0x60], xmm12 movdqa xmmword ptr [rsp+0x70], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-0x20] movdqu xmm9, xmmword ptr [r9+rdx-0x20] movdqu xmm10, xmmword ptr [r10+rdx-0x20] movdqu xmm11, xmmword ptr [r11+rdx-0x20] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0x80], xmm8 movdqa xmmword ptr [rsp+0x90], xmm9 movdqa xmmword ptr [rsp+0xA0], xmm12 movdqa xmmword ptr [rsp+0xB0], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-0x10] movdqu xmm9, xmmword ptr [r9+rdx-0x10] movdqu xmm10, xmmword ptr [r10+rdx-0x10] movdqu xmm11, xmmword ptr [r11+rdx-0x10] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0xC0], xmm8 movdqa xmmword ptr [rsp+0xD0], xmm9 movdqa xmmword ptr [rsp+0xE0], xmm12 movdqa xmmword ptr [rsp+0xF0], xmm13 movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] movdqa xmm12, xmmword ptr [rsp+0x110] movdqa xmm13, xmmword ptr [rsp+0x120] movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] movd xmm15, eax pshufd xmm15, xmm15, 0x00 prefetcht0 [r8+rdx+0x80] prefetcht0 [r9+rdx+0x80] prefetcht0 [r10+rdx+0x80] prefetcht0 [r11+rdx+0x80] paddd xmm0, xmmword ptr [rsp] paddd xmm1, xmmword ptr [rsp+0x20] paddd xmm2, xmmword ptr [rsp+0x40] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 
pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x10] paddd xmm1, xmmword ptr [rsp+0x30] paddd xmm2, xmmword ptr [rsp+0x50] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x80] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp+0xC0] paddd xmm3, xmmword ptr [rsp+0xE0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 
pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x90] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0xD0] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x20] paddd xmm1, xmmword ptr [rsp+0x30] paddd xmm2, xmmword ptr [rsp+0x70] paddd xmm3, xmmword ptr [rsp+0x40] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw 
xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x60] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp] paddd xmm3, xmmword ptr [rsp+0xD0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x10] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0x90] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd 
xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xB0] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp+0xE0] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x30] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp+0xD0] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd 
xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x40] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0x20] paddd xmm3, xmmword ptr [rsp+0xE0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x60] paddd xmm1, xmmword ptr [rsp+0x90] paddd xmm2, xmmword ptr [rsp+0xB0] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd 
xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x50] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+0xF0] paddd xmm3, xmmword ptr [rsp+0x10] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xA0] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0xE0] paddd xmm3, xmmword ptr [rsp+0xD0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 
pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x70] paddd xmm1, xmmword ptr [rsp+0x90] paddd xmm2, xmmword ptr [rsp+0x30] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x40] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0x50] paddd xmm3, xmmword ptr [rsp+0x10] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 
movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp] paddd xmm1, xmmword ptr [rsp+0x20] paddd xmm2, xmmword ptr [rsp+0x80] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xC0] paddd xmm1, xmmword ptr [rsp+0x90] paddd xmm2, xmmword ptr [rsp+0xF0] paddd xmm3, xmmword ptr [rsp+0xE0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa 
xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xD0] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0xA0] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x70] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por 
xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x20] paddd xmm1, xmmword ptr [rsp+0x30] paddd xmm2, xmmword ptr [rsp+0x10] paddd xmm3, xmmword ptr [rsp+0x40] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x90] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0x80] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld 
xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xE0] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp+0xC0] paddd xmm3, xmmword ptr [rsp+0x10] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xD0] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+0x20] paddd xmm3, xmmword ptr [rsp+0x40] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 
movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x30] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp+0x60] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xB0] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp+0x10] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld 
xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xF0] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+0x90] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xE0] paddd xmm1, xmmword ptr [rsp+0x20] paddd xmm2, xmmword ptr [rsp+0x30] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 
psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xA0] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0x40] paddd xmm3, xmmword ptr [rsp+0xD0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 pxor xmm0, xmm8 pxor xmm1, xmm9 pxor xmm2, xmm10 pxor xmm3, xmm11 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 pxor xmm4, xmm12 pxor xmm5, xmm13 pxor xmm6, xmm14 pxor xmm7, xmm15 mov eax, r13d jne 9b movdqa xmm9, xmm0 punpckldq xmm0, xmm1 punpckhdq xmm9, xmm1 movdqa xmm11, xmm2 punpckldq xmm2, xmm3 punpckhdq xmm11, xmm3 movdqa xmm1, xmm0 punpcklqdq xmm0, xmm2 punpckhqdq xmm1, xmm2 movdqa xmm3, xmm9 punpcklqdq xmm9, xmm11 punpckhqdq xmm3, xmm11 movdqu xmmword ptr [rbx], xmm0 movdqu xmmword ptr [rbx+0x20], xmm1 movdqu xmmword ptr [rbx+0x40], xmm9 movdqu xmmword ptr [rbx+0x60], xmm3 movdqa xmm9, xmm4 punpckldq xmm4, xmm5 punpckhdq xmm9, xmm5 movdqa xmm11, xmm6 punpckldq xmm6, xmm7 punpckhdq xmm11, xmm7 movdqa xmm5, xmm4 punpcklqdq xmm4, xmm6 punpckhqdq xmm5, xmm6 movdqa xmm7, xmm9 punpcklqdq xmm9, xmm11 punpckhqdq xmm7, xmm11 movdqu xmmword ptr [rbx+0x10], xmm4 movdqu xmmword ptr [rbx+0x30], xmm5 movdqu xmmword ptr [rbx+0x50], xmm9 movdqu xmmword ptr [rbx+0x70], xmm7 movdqa xmm1, xmmword ptr [rsp+0x110] movdqa xmm0, xmm1 
paddd xmm1, xmmword ptr [rsp+0x150] movdqa xmmword ptr [rsp+0x110], xmm1 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] pcmpgtd xmm0, xmm1 movdqa xmm1, xmmword ptr [rsp+0x120] psubd xmm1, xmm0 movdqa xmmword ptr [rsp+0x120], xmm1 add rbx, 128 add rdi, 32 sub rsi, 4 cmp rsi, 4 jnc 2b test rsi, rsi jne 3f 4: movdqa xmm6, xmmword ptr [rsp+0x170] movdqa xmm7, xmmword ptr [rsp+0x180] movdqa xmm8, xmmword ptr [rsp+0x190] movdqa xmm9, xmmword ptr [rsp+0x1A0] movdqa xmm10, xmmword ptr [rsp+0x1B0] movdqa xmm11, xmmword ptr [rsp+0x1C0] movdqa xmm12, xmmword ptr [rsp+0x1D0] movdqa xmm13, xmmword ptr [rsp+0x1E0] movdqa xmm14, xmmword ptr [rsp+0x1F0] movdqa xmm15, xmmword ptr [rsp+0x200] mov rsp, rbp pop rbp pop rbx pop rdi pop rsi pop r12 pop r13 pop r14 pop r15 ret .p2align 5 3: test esi, 0x2 je 3f movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+0x10] movaps xmm8, xmm0 movaps xmm9, xmm1 movd xmm13, dword ptr [rsp+0x110] movd xmm14, dword ptr [rsp+0x120] punpckldq xmm13, xmm14 movaps xmmword ptr [rsp], xmm13 movd xmm14, dword ptr [rsp+0x114] movd xmm13, dword ptr [rsp+0x124] punpckldq xmm14, xmm13 movaps xmmword ptr [rsp+0x10], xmm14 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] movzx eax, byte ptr [rbp+0x80] or eax, r13d xor edx, edx 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movaps xmm2, xmmword ptr [BLAKE3_IV+rip] movaps xmm10, xmm2 movups xmm4, xmmword ptr [r8+rdx-0x40] movups xmm5, xmmword ptr [r8+rdx-0x30] movaps xmm3, xmm4 shufps xmm4, xmm5, 136 shufps xmm3, xmm5, 221 movaps xmm5, xmm3 movups xmm6, xmmword ptr [r8+rdx-0x20] movups xmm7, xmmword ptr [r8+rdx-0x10] movaps xmm3, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm3, xmm7, 221 pshufd xmm7, xmm3, 0x93 movups xmm12, xmmword ptr [r9+rdx-0x40] movups xmm13, xmmword ptr [r9+rdx-0x30] movaps xmm11, xmm12 shufps xmm12, xmm13, 136 shufps xmm11, xmm13, 221 movaps xmm13, xmm11 movups xmm14, xmmword ptr [r9+rdx-0x20] movups 
xmm15, xmmword ptr [r9+rdx-0x10] movaps xmm11, xmm14 shufps xmm14, xmm15, 136 pshufd xmm14, xmm14, 0x93 shufps xmm11, xmm15, 221 pshufd xmm15, xmm11, 0x93 shl rax, 0x20 or rax, 0x40 movq xmm3, rax movdqa xmmword ptr [rsp+0x20], xmm3 movaps xmm3, xmmword ptr [rsp] movaps xmm11, xmmword ptr [rsp+0x10] punpcklqdq xmm3, xmmword ptr [rsp+0x20] punpcklqdq xmm11, xmmword ptr [rsp+0x20] mov al, 7 9: paddd xmm0, xmm4 paddd xmm8, xmm12 movaps xmmword ptr [rsp+0x20], xmm4 movaps xmmword ptr [rsp+0x30], xmm12 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 pshuflw xmm11, xmm11, 0xB1 pshufhw xmm11, xmm11, 0xB1 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 20 psrld xmm4, 12 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 20 psrld xmm4, 12 por xmm9, xmm4 paddd xmm0, xmm5 paddd xmm8, xmm13 movaps xmmword ptr [rsp+0x40], xmm5 movaps xmmword ptr [rsp+0x50], xmm13 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 movdqa xmm13, xmm3 psrld xmm3, 8 pslld xmm13, 24 pxor xmm3, xmm13 movdqa xmm13, xmm11 psrld xmm11, 8 pslld xmm13, 24 pxor xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 25 psrld xmm4, 7 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 25 psrld xmm4, 7 por xmm9, xmm4 pshufd xmm0, xmm0, 0x93 pshufd xmm8, xmm8, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm11, xmm11, 0x4E pshufd xmm2, xmm2, 0x39 pshufd xmm10, xmm10, 0x39 paddd xmm0, xmm6 paddd xmm8, xmm14 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 pshuflw xmm11, xmm11, 0xB1 pshufhw xmm11, xmm11, 0xB1 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 20 psrld xmm4, 12 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 20 psrld xmm4, 12 por xmm9, xmm4 paddd xmm0, xmm7 paddd xmm8, xmm15 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor 
xmm3, xmm0 pxor xmm11, xmm8 movdqa xmm13, xmm3 psrld xmm3, 8 pslld xmm13, 24 pxor xmm3, xmm13 movdqa xmm13, xmm11 psrld xmm11, 8 pslld xmm13, 24 pxor xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 25 psrld xmm4, 7 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 25 psrld xmm4, 7 por xmm9, xmm4 pshufd xmm0, xmm0, 0x39 pshufd xmm8, xmm8, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm11, xmm11, 0x4E pshufd xmm2, xmm2, 0x93 pshufd xmm10, xmm10, 0x93 dec al je 9f movdqa xmm12, xmmword ptr [rsp+0x20] movdqa xmm5, xmmword ptr [rsp+0x40] pshufd xmm13, xmm12, 0x0F shufps xmm12, xmm5, 214 pshufd xmm4, xmm12, 0x39 movdqa xmm12, xmm6 shufps xmm12, xmm7, 250 pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip] pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip] por xmm13, xmm12 movdqa xmmword ptr [rsp+0x20], xmm13 movdqa xmm12, xmm7 punpcklqdq xmm12, xmm5 movdqa xmm13, xmm6 pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip] pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip] por xmm12, xmm13 pshufd xmm12, xmm12, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmmword ptr [rsp+0x40], xmm12 movdqa xmm5, xmmword ptr [rsp+0x30] movdqa xmm13, xmmword ptr [rsp+0x50] pshufd xmm6, xmm5, 0x0F shufps xmm5, xmm13, 214 pshufd xmm12, xmm5, 0x39 movdqa xmm5, xmm14 shufps xmm5, xmm15, 250 pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip] pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip] por xmm6, xmm5 movdqa xmm5, xmm15 punpcklqdq xmm5, xmm13 movdqa xmmword ptr [rsp+0x30], xmm2 movdqa xmm2, xmm14 pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip] pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] por xmm5, xmm2 movdqa xmm2, xmmword ptr [rsp+0x30] pshufd xmm5, xmm5, 0x78 punpckhdq xmm13, xmm15 punpckldq xmm14, xmm13 pshufd xmm15, xmm14, 0x1E movdqa xmm13, xmm6 movdqa xmm14, xmm5 movdqa xmm5, xmmword ptr [rsp+0x20] movdqa xmm6, xmmword ptr [rsp+0x40] jmp 9b 9: pxor xmm0, xmm2 pxor xmm1, xmm3 pxor xmm8, xmm10 pxor xmm9, xmm11 mov eax, r13d 
cmp rdx, r15 jne 2b movups xmmword ptr [rbx], xmm0 movups xmmword ptr [rbx+0x10], xmm1 movups xmmword ptr [rbx+0x20], xmm8 movups xmmword ptr [rbx+0x30], xmm9 mov eax, dword ptr [rsp+0x130] neg eax mov r10d, dword ptr [rsp+0x110+8*rax] mov r11d, dword ptr [rsp+0x120+8*rax] mov dword ptr [rsp+0x110], r10d mov dword ptr [rsp+0x120], r11d add rdi, 16 add rbx, 64 sub rsi, 2 3: test esi, 0x1 je 4b movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+0x10] movd xmm13, dword ptr [rsp+0x110] movd xmm14, dword ptr [rsp+0x120] punpckldq xmm13, xmm14 mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+0x80] or eax, r13d xor edx, edx 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movaps xmm2, xmmword ptr [BLAKE3_IV+rip] shl rax, 32 or rax, 64 movq xmm12, rax movdqa xmm3, xmm13 punpcklqdq xmm3, xmm12 movups xmm4, xmmword ptr [r8+rdx-0x40] movups xmm5, xmmword ptr [r8+rdx-0x30] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [r8+rdx-0x20] movups xmm7, xmmword ptr [r8+rdx-0x10] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 0x93 mov al, 7 9: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x39 paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd 
xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x93 dec al jz 9f movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0x0F pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] por xmm9, xmm8 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 movdqa xmm10, xmm6 pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] por xmm8, xmm10 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp 9b 9: pxor xmm0, xmm2 pxor xmm1, xmm3 mov eax, r13d cmp rdx, r15 jne 2b movups xmmword ptr [rbx], xmm0 movups xmmword ptr [rbx+0x10], xmm1 jmp 4b .p2align 6 blake3_compress_in_place_sse2: _blake3_compress_in_place_sse2: sub rsp, 120 movdqa xmmword ptr [rsp], xmm6 movdqa xmmword ptr [rsp+0x10], xmm7 movdqa xmmword ptr [rsp+0x20], xmm8 movdqa xmmword ptr [rsp+0x30], xmm9 movdqa xmmword ptr [rsp+0x40], xmm11 movdqa xmmword ptr [rsp+0x50], xmm14 movdqa xmmword ptr [rsp+0x60], xmm15 movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+0x10] movaps xmm2, xmmword ptr [BLAKE3_IV+rip] movzx eax, byte ptr [rsp+0xA0] movzx r8d, r8b shl rax, 32 add r8, rax movq xmm3, r9 movq xmm4, r8 punpcklqdq xmm3, xmm4 movups xmm4, xmmword ptr [rdx] movups xmm5, xmmword ptr [rdx+0x10] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [rdx+0x20] movups xmm7, xmmword ptr [rdx+0x30] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 0x93 mov al, 7 9: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd 
xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x39 paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x93 dec al jz 9f movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0x0F pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] por xmm9, xmm8 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 movdqa xmm14, xmm6 pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip] por xmm8, xmm14 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp 9b 9: pxor xmm0, xmm2 pxor xmm1, xmm3 movups xmmword ptr [rcx], xmm0 movups xmmword ptr [rcx+0x10], xmm1 movdqa xmm6, xmmword ptr [rsp] movdqa xmm7, xmmword ptr [rsp+0x10] movdqa xmm8, xmmword ptr [rsp+0x20] movdqa xmm9, xmmword ptr [rsp+0x30] movdqa xmm11, xmmword ptr [rsp+0x40] movdqa xmm14, xmmword ptr [rsp+0x50] movdqa xmm15, xmmword ptr [rsp+0x60] add rsp, 120 ret .p2align 6 _blake3_compress_xof_sse2: blake3_compress_xof_sse2: sub rsp, 120 movdqa xmmword ptr [rsp], xmm6 movdqa xmmword ptr [rsp+0x10], xmm7 movdqa xmmword ptr [rsp+0x20], xmm8 movdqa xmmword ptr [rsp+0x30], xmm9 movdqa xmmword ptr [rsp+0x40], xmm11 movdqa xmmword ptr [rsp+0x50], xmm14 movdqa xmmword ptr [rsp+0x60], xmm15 
movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+0x10] movaps xmm2, xmmword ptr [BLAKE3_IV+rip] movzx eax, byte ptr [rsp+0xA0] movzx r8d, r8b mov r10, qword ptr [rsp+0xA8] shl rax, 32 add r8, rax movq xmm3, r9 movq xmm4, r8 punpcklqdq xmm3, xmm4 movups xmm4, xmmword ptr [rdx] movups xmm5, xmmword ptr [rdx+0x10] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [rdx+0x20] movups xmm7, xmmword ptr [rdx+0x30] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 0x93 mov al, 7 9: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x39 paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x93 dec al jz 9f movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0x0F pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] por xmm9, xmm8 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 movdqa xmm14, xmm6 pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip] por xmm8, xmm14 
pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp 9b 9: movdqu xmm4, xmmword ptr [rcx] movdqu xmm5, xmmword ptr [rcx+0x10] pxor xmm0, xmm2 pxor xmm1, xmm3 pxor xmm2, xmm4 pxor xmm3, xmm5 movups xmmword ptr [r10], xmm0 movups xmmword ptr [r10+0x10], xmm1 movups xmmword ptr [r10+0x20], xmm2 movups xmmword ptr [r10+0x30], xmm3 movdqa xmm6, xmmword ptr [rsp] movdqa xmm7, xmmword ptr [rsp+0x10] movdqa xmm8, xmmword ptr [rsp+0x20] movdqa xmm9, xmmword ptr [rsp+0x30] movdqa xmm11, xmmword ptr [rsp+0x40] movdqa xmm14, xmmword ptr [rsp+0x50] movdqa xmm15, xmmword ptr [rsp+0x60] add rsp, 120 ret .section .rdata .p2align 6 BLAKE3_IV: .long 0x6A09E667, 0xBB67AE85 .long 0x3C6EF372, 0xA54FF53A ADD0: .long 0, 1, 2, 3 ADD1: .long 4, 4, 4, 4 BLAKE3_IV_0: .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 BLAKE3_IV_1: .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 BLAKE3_IV_2: .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 BLAKE3_IV_3: .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A BLAKE3_BLOCK_LEN: .long 64, 64, 64, 64 CMP_MSB_MASK: .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 PBLENDW_0x33_MASK: .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 PBLENDW_0xCC_MASK: .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF PBLENDW_0x3F_MASK: .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 PBLENDW_0xC0_MASK: .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF blake3-1.5.4/c/blake3_sse2_x86-64_windows_msvc.asm000064400000000000000000002125061046102023000175610ustar 00000000000000public _blake3_hash_many_sse2 public blake3_hash_many_sse2 public blake3_compress_in_place_sse2 public _blake3_compress_in_place_sse2 public blake3_compress_xof_sse2 public _blake3_compress_xof_sse2 _TEXT SEGMENT ALIGN(16) 'CODE' ALIGN 16 blake3_hash_many_sse2 PROC _blake3_hash_many_sse2 PROC push r15 push r14 push r13 push r12 push rsi push rdi push rbx push rbp mov rbp, rsp sub rsp, 528 and rsp, 
0FFFFFFFFFFFFFFC0H movdqa xmmword ptr [rsp+170H], xmm6 movdqa xmmword ptr [rsp+180H], xmm7 movdqa xmmword ptr [rsp+190H], xmm8 movdqa xmmword ptr [rsp+1A0H], xmm9 movdqa xmmword ptr [rsp+1B0H], xmm10 movdqa xmmword ptr [rsp+1C0H], xmm11 movdqa xmmword ptr [rsp+1D0H], xmm12 movdqa xmmword ptr [rsp+1E0H], xmm13 movdqa xmmword ptr [rsp+1F0H], xmm14 movdqa xmmword ptr [rsp+200H], xmm15 mov rdi, rcx mov rsi, rdx mov rdx, r8 mov rcx, r9 mov r8, qword ptr [rbp+68H] movzx r9, byte ptr [rbp+70H] neg r9d movd xmm0, r9d pshufd xmm0, xmm0, 00H movdqa xmmword ptr [rsp+130H], xmm0 movdqa xmm1, xmm0 pand xmm1, xmmword ptr [ADD0] pand xmm0, xmmword ptr [ADD1] movdqa xmmword ptr [rsp+150H], xmm0 movd xmm0, r8d pshufd xmm0, xmm0, 00H paddd xmm0, xmm1 movdqa xmmword ptr [rsp+110H], xmm0 pxor xmm0, xmmword ptr [CMP_MSB_MASK] pxor xmm1, xmmword ptr [CMP_MSB_MASK] pcmpgtd xmm1, xmm0 shr r8, 32 movd xmm2, r8d pshufd xmm2, xmm2, 00H psubd xmm2, xmm1 movdqa xmmword ptr [rsp+120H], xmm2 mov rbx, qword ptr [rbp+90H] mov r15, rdx shl r15, 6 movzx r13d, byte ptr [rbp+78H] movzx r12d, byte ptr [rbp+88H] cmp rsi, 4 jc final3blocks outerloop4: movdqu xmm3, xmmword ptr [rcx] pshufd xmm0, xmm3, 00H pshufd xmm1, xmm3, 55H pshufd xmm2, xmm3, 0AAH pshufd xmm3, xmm3, 0FFH movdqu xmm7, xmmword ptr [rcx+10H] pshufd xmm4, xmm7, 00H pshufd xmm5, xmm7, 55H pshufd xmm6, xmm7, 0AAH pshufd xmm7, xmm7, 0FFH mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+8H] mov r10, qword ptr [rdi+10H] mov r11, qword ptr [rdi+18H] movzx eax, byte ptr [rbp+80H] or eax, r13d xor edx, edx innerloop4: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movdqu xmm8, xmmword ptr [r8+rdx-40H] movdqu xmm9, xmmword ptr [r9+rdx-40H] movdqu xmm10, xmmword ptr [r10+rdx-40H] movdqu xmm11, xmmword ptr [r11+rdx-40H] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 
punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp], xmm8 movdqa xmmword ptr [rsp+10H], xmm9 movdqa xmmword ptr [rsp+20H], xmm12 movdqa xmmword ptr [rsp+30H], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-30H] movdqu xmm9, xmmword ptr [r9+rdx-30H] movdqu xmm10, xmmword ptr [r10+rdx-30H] movdqu xmm11, xmmword ptr [r11+rdx-30H] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+40H], xmm8 movdqa xmmword ptr [rsp+50H], xmm9 movdqa xmmword ptr [rsp+60H], xmm12 movdqa xmmword ptr [rsp+70H], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-20H] movdqu xmm9, xmmword ptr [r9+rdx-20H] movdqu xmm10, xmmword ptr [r10+rdx-20H] movdqu xmm11, xmmword ptr [r11+rdx-20H] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+80H], xmm8 movdqa xmmword ptr [rsp+90H], xmm9 movdqa xmmword ptr [rsp+0A0H], xmm12 movdqa xmmword ptr [rsp+0B0H], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-10H] movdqu xmm9, xmmword ptr [r9+rdx-10H] movdqu xmm10, xmmword ptr [r10+rdx-10H] movdqu xmm11, xmmword ptr [r11+rdx-10H] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0C0H], xmm8 movdqa xmmword ptr [rsp+0D0H], xmm9 movdqa xmmword ptr [rsp+0E0H], xmm12 movdqa xmmword ptr [rsp+0F0H], xmm13 movdqa xmm9, xmmword ptr [BLAKE3_IV_1] movdqa xmm10, xmmword ptr [BLAKE3_IV_2] movdqa xmm11, xmmword ptr [BLAKE3_IV_3] movdqa xmm12, 
xmmword ptr [rsp+110H] movdqa xmm13, xmmword ptr [rsp+120H] movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN] movd xmm15, eax pshufd xmm15, xmm15, 00H prefetcht0 byte ptr [r8+rdx+80H] prefetcht0 byte ptr [r9+rdx+80H] prefetcht0 byte ptr [r10+rdx+80H] prefetcht0 byte ptr [r11+rdx+80H] paddd xmm0, xmmword ptr [rsp] paddd xmm1, xmmword ptr [rsp+20H] paddd xmm2, xmmword ptr [rsp+40H] paddd xmm3, xmmword ptr [rsp+60H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H movdqa xmm8, xmmword ptr [BLAKE3_IV_0] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+10H] paddd xmm1, xmmword ptr [rsp+30H] paddd xmm2, xmmword ptr [rsp+50H] paddd xmm3, xmmword ptr [rsp+70H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 
psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+80H] paddd xmm1, xmmword ptr [rsp+0A0H] paddd xmm2, xmmword ptr [rsp+0C0H] paddd xmm3, xmmword ptr [rsp+0E0H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+90H] paddd xmm1, xmmword ptr [rsp+0B0H] paddd xmm2, xmmword ptr [rsp+0D0H] paddd xmm3, xmmword ptr [rsp+0F0H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 
movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+20H] paddd xmm1, xmmword ptr [rsp+30H] paddd xmm2, xmmword ptr [rsp+70H] paddd xmm3, xmmword ptr [rsp+40H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+60H] paddd xmm1, xmmword ptr [rsp+0A0H] paddd xmm2, xmmword ptr [rsp] paddd xmm3, xmmword ptr [rsp+0D0H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por 
xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+10H] paddd xmm1, xmmword ptr [rsp+0C0H] paddd xmm2, xmmword ptr [rsp+90H] paddd xmm3, xmmword ptr [rsp+0F0H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0B0H] paddd xmm1, xmmword ptr [rsp+50H] paddd xmm2, xmmword ptr [rsp+0E0H] paddd xmm3, xmmword ptr [rsp+80H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 
pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+30H] paddd xmm1, xmmword ptr [rsp+0A0H] paddd xmm2, xmmword ptr [rsp+0D0H] paddd xmm3, xmmword ptr [rsp+70H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+40H] paddd xmm1, xmmword ptr [rsp+0C0H] paddd xmm2, xmmword ptr [rsp+20H] paddd xmm3, xmmword ptr [rsp+0E0H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 
psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+60H] paddd xmm1, xmmword ptr [rsp+90H] paddd xmm2, xmmword ptr [rsp+0B0H] paddd xmm3, xmmword ptr [rsp+80H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+50H] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+0F0H] paddd xmm3, xmmword ptr [rsp+10H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd 
xmm0, xmmword ptr [rsp+0A0H] paddd xmm1, xmmword ptr [rsp+0C0H] paddd xmm2, xmmword ptr [rsp+0E0H] paddd xmm3, xmmword ptr [rsp+0D0H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+70H] paddd xmm1, xmmword ptr [rsp+90H] paddd xmm2, xmmword ptr [rsp+30H] paddd xmm3, xmmword ptr [rsp+0F0H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+40H] paddd xmm1, 
xmmword ptr [rsp+0B0H] paddd xmm2, xmmword ptr [rsp+50H] paddd xmm3, xmmword ptr [rsp+10H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp] paddd xmm1, xmmword ptr [rsp+20H] paddd xmm2, xmmword ptr [rsp+80H] paddd xmm3, xmmword ptr [rsp+60H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0C0H] paddd xmm1, xmmword ptr [rsp+90H] paddd xmm2, xmmword ptr 
[rsp+0F0H] paddd xmm3, xmmword ptr [rsp+0E0H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0D0H] paddd xmm1, xmmword ptr [rsp+0B0H] paddd xmm2, xmmword ptr [rsp+0A0H] paddd xmm3, xmmword ptr [rsp+80H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+70H] paddd xmm1, xmmword ptr [rsp+50H] paddd xmm2, xmmword ptr [rsp] paddd xmm3, xmmword ptr [rsp+60H] paddd 
xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+20H] paddd xmm1, xmmword ptr [rsp+30H] paddd xmm2, xmmword ptr [rsp+10H] paddd xmm3, xmmword ptr [rsp+40H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+90H] paddd xmm1, xmmword ptr [rsp+0B0H] paddd xmm2, xmmword ptr [rsp+80H] paddd xmm3, xmmword ptr [rsp+0F0H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 
paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0E0H] paddd xmm1, xmmword ptr [rsp+50H] paddd xmm2, xmmword ptr [rsp+0C0H] paddd xmm3, xmmword ptr [rsp+10H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0D0H] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+20H] paddd xmm3, xmmword ptr [rsp+40H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 
pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+30H] paddd xmm1, xmmword ptr [rsp+0A0H] paddd xmm2, xmmword ptr [rsp+60H] paddd xmm3, xmmword ptr [rsp+70H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0B0H] paddd xmm1, xmmword ptr [rsp+50H] paddd xmm2, xmmword ptr [rsp+10H] paddd xmm3, xmmword ptr [rsp+80H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, 
xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0F0H] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+90H] paddd xmm3, xmmword ptr [rsp+60H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0E0H] paddd xmm1, xmmword ptr [rsp+20H] paddd xmm2, xmmword ptr [rsp+30H] paddd xmm3, xmmword ptr [rsp+70H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H pshuflw xmm12, 
xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0A0H] paddd xmm1, xmmword ptr [rsp+0C0H] paddd xmm2, xmmword ptr [rsp+40H] paddd xmm3, xmmword ptr [rsp+0D0H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 pxor xmm0, xmm8 pxor xmm1, xmm9 pxor xmm2, xmm10 pxor xmm3, xmm11 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 pxor xmm4, xmm12 pxor xmm5, xmm13 pxor xmm6, xmm14 pxor xmm7, xmm15 mov eax, r13d jne innerloop4 movdqa xmm9, xmm0 punpckldq xmm0, xmm1 punpckhdq xmm9, xmm1 movdqa xmm11, xmm2 punpckldq xmm2, xmm3 punpckhdq xmm11, xmm3 movdqa xmm1, xmm0 punpcklqdq xmm0, xmm2 punpckhqdq xmm1, xmm2 movdqa xmm3, xmm9 punpcklqdq xmm9, xmm11 punpckhqdq xmm3, xmm11 movdqu xmmword 
ptr [rbx], xmm0 movdqu xmmword ptr [rbx+20H], xmm1 movdqu xmmword ptr [rbx+40H], xmm9 movdqu xmmword ptr [rbx+60H], xmm3 movdqa xmm9, xmm4 punpckldq xmm4, xmm5 punpckhdq xmm9, xmm5 movdqa xmm11, xmm6 punpckldq xmm6, xmm7 punpckhdq xmm11, xmm7 movdqa xmm5, xmm4 punpcklqdq xmm4, xmm6 punpckhqdq xmm5, xmm6 movdqa xmm7, xmm9 punpcklqdq xmm9, xmm11 punpckhqdq xmm7, xmm11 movdqu xmmword ptr [rbx+10H], xmm4 movdqu xmmword ptr [rbx+30H], xmm5 movdqu xmmword ptr [rbx+50H], xmm9 movdqu xmmword ptr [rbx+70H], xmm7 movdqa xmm1, xmmword ptr [rsp+110H] movdqa xmm0, xmm1 paddd xmm1, xmmword ptr [rsp+150H] movdqa xmmword ptr [rsp+110H], xmm1 pxor xmm0, xmmword ptr [CMP_MSB_MASK] pxor xmm1, xmmword ptr [CMP_MSB_MASK] pcmpgtd xmm0, xmm1 movdqa xmm1, xmmword ptr [rsp+120H] psubd xmm1, xmm0 movdqa xmmword ptr [rsp+120H], xmm1 add rbx, 128 add rdi, 32 sub rsi, 4 cmp rsi, 4 jnc outerloop4 test rsi, rsi jne final3blocks unwind: movdqa xmm6, xmmword ptr [rsp+170H] movdqa xmm7, xmmword ptr [rsp+180H] movdqa xmm8, xmmword ptr [rsp+190H] movdqa xmm9, xmmword ptr [rsp+1A0H] movdqa xmm10, xmmword ptr [rsp+1B0H] movdqa xmm11, xmmword ptr [rsp+1C0H] movdqa xmm12, xmmword ptr [rsp+1D0H] movdqa xmm13, xmmword ptr [rsp+1E0H] movdqa xmm14, xmmword ptr [rsp+1F0H] movdqa xmm15, xmmword ptr [rsp+200H] mov rsp, rbp pop rbp pop rbx pop rdi pop rsi pop r12 pop r13 pop r14 pop r15 ret ALIGN 16 final3blocks: test esi, 2H je final1block movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+10H] movaps xmm8, xmm0 movaps xmm9, xmm1 movd xmm13, dword ptr [rsp+110H] movd xmm14, dword ptr [rsp+120H] punpckldq xmm13, xmm14 movaps xmmword ptr [rsp], xmm13 movd xmm14, dword ptr [rsp+114H] movd xmm13, dword ptr [rsp+124H] punpckldq xmm14, xmm13 movaps xmmword ptr [rsp+10H], xmm14 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+8H] movzx eax, byte ptr [rbp+80H] or eax, r13d xor edx, edx innerloop2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movaps xmm2, xmmword ptr [BLAKE3_IV] movaps 
xmm10, xmm2 movups xmm4, xmmword ptr [r8+rdx-40H] movups xmm5, xmmword ptr [r8+rdx-30H] movaps xmm3, xmm4 shufps xmm4, xmm5, 136 shufps xmm3, xmm5, 221 movaps xmm5, xmm3 movups xmm6, xmmword ptr [r8+rdx-20H] movups xmm7, xmmword ptr [r8+rdx-10H] movaps xmm3, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 93H shufps xmm3, xmm7, 221 pshufd xmm7, xmm3, 93H movups xmm12, xmmword ptr [r9+rdx-40H] movups xmm13, xmmword ptr [r9+rdx-30H] movaps xmm11, xmm12 shufps xmm12, xmm13, 136 shufps xmm11, xmm13, 221 movaps xmm13, xmm11 movups xmm14, xmmword ptr [r9+rdx-20H] movups xmm15, xmmword ptr [r9+rdx-10H] movaps xmm11, xmm14 shufps xmm14, xmm15, 136 pshufd xmm14, xmm14, 93H shufps xmm11, xmm15, 221 pshufd xmm15, xmm11, 93H shl rax, 20H or rax, 40H movd xmm3, rax movdqa xmmword ptr [rsp+20H], xmm3 movaps xmm3, xmmword ptr [rsp] movaps xmm11, xmmword ptr [rsp+10H] punpcklqdq xmm3, xmmword ptr [rsp+20H] punpcklqdq xmm11, xmmword ptr [rsp+20H] mov al, 7 roundloop2: paddd xmm0, xmm4 paddd xmm8, xmm12 movaps xmmword ptr [rsp+20H], xmm4 movaps xmmword ptr [rsp+30H], xmm12 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 pshuflw xmm3, xmm3, 0B1H pshufhw xmm3, xmm3, 0B1H pshuflw xmm11, xmm11, 0B1H pshufhw xmm11, xmm11, 0B1H paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 20 psrld xmm4, 12 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 20 psrld xmm4, 12 por xmm9, xmm4 paddd xmm0, xmm5 paddd xmm8, xmm13 movaps xmmword ptr [rsp+40H], xmm5 movaps xmmword ptr [rsp+50H], xmm13 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 movdqa xmm13, xmm3 psrld xmm3, 8 pslld xmm13, 24 pxor xmm3, xmm13 movdqa xmm13, xmm11 psrld xmm11, 8 pslld xmm13, 24 pxor xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 25 psrld xmm4, 7 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 25 psrld xmm4, 7 por xmm9, xmm4 pshufd xmm0, xmm0, 93H pshufd xmm8, xmm8, 93H pshufd xmm3, 
xmm3, 4EH pshufd xmm11, xmm11, 4EH pshufd xmm2, xmm2, 39H pshufd xmm10, xmm10, 39H paddd xmm0, xmm6 paddd xmm8, xmm14 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 pshuflw xmm3, xmm3, 0B1H pshufhw xmm3, xmm3, 0B1H pshuflw xmm11, xmm11, 0B1H pshufhw xmm11, xmm11, 0B1H paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 20 psrld xmm4, 12 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 20 psrld xmm4, 12 por xmm9, xmm4 paddd xmm0, xmm7 paddd xmm8, xmm15 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 movdqa xmm13, xmm3 psrld xmm3, 8 pslld xmm13, 24 pxor xmm3, xmm13 movdqa xmm13, xmm11 psrld xmm11, 8 pslld xmm13, 24 pxor xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 25 psrld xmm4, 7 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 25 psrld xmm4, 7 por xmm9, xmm4 pshufd xmm0, xmm0, 39H pshufd xmm8, xmm8, 39H pshufd xmm3, xmm3, 4EH pshufd xmm11, xmm11, 4EH pshufd xmm2, xmm2, 93H pshufd xmm10, xmm10, 93H dec al je endroundloop2 movdqa xmm12, xmmword ptr [rsp+20H] movdqa xmm5, xmmword ptr [rsp+40H] pshufd xmm13, xmm12, 0FH shufps xmm12, xmm5, 214 pshufd xmm4, xmm12, 39H movdqa xmm12, xmm6 shufps xmm12, xmm7, 250 pand xmm13, xmmword ptr [PBLENDW_0x33_MASK] pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK] por xmm13, xmm12 movdqa xmmword ptr [rsp+20H], xmm13 movdqa xmm12, xmm7 punpcklqdq xmm12, xmm5 movdqa xmm13, xmm6 pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK] pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK] por xmm12, xmm13 pshufd xmm12, xmm12, 78H punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 1EH movdqa xmmword ptr [rsp+40H], xmm12 movdqa xmm5, xmmword ptr [rsp+30H] movdqa xmm13, xmmword ptr [rsp+50H] pshufd xmm6, xmm5, 0FH shufps xmm5, xmm13, 214 pshufd xmm12, xmm5, 39H movdqa xmm5, xmm14 shufps xmm5, xmm15, 250 pand xmm6, xmmword ptr [PBLENDW_0x33_MASK] pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK] por xmm6, xmm5 movdqa xmm5, 
xmm15 punpcklqdq xmm5, xmm13 movdqa xmmword ptr [rsp+30H], xmm2 movdqa xmm2, xmm14 pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK] pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK] por xmm5, xmm2 movdqa xmm2, xmmword ptr [rsp+30H] pshufd xmm5, xmm5, 78H punpckhdq xmm13, xmm15 punpckldq xmm14, xmm13 pshufd xmm15, xmm14, 1EH movdqa xmm13, xmm6 movdqa xmm14, xmm5 movdqa xmm5, xmmword ptr [rsp+20H] movdqa xmm6, xmmword ptr [rsp+40H] jmp roundloop2 endroundloop2: pxor xmm0, xmm2 pxor xmm1, xmm3 pxor xmm8, xmm10 pxor xmm9, xmm11 mov eax, r13d cmp rdx, r15 jne innerloop2 movups xmmword ptr [rbx], xmm0 movups xmmword ptr [rbx+10H], xmm1 movups xmmword ptr [rbx+20H], xmm8 movups xmmword ptr [rbx+30H], xmm9 mov eax, dword ptr [rsp+130H] neg eax mov r10d, dword ptr [rsp+110H+8*rax] mov r11d, dword ptr [rsp+120H+8*rax] mov dword ptr [rsp+110H], r10d mov dword ptr [rsp+120H], r11d add rdi, 16 add rbx, 64 sub rsi, 2 final1block: test esi, 1H je unwind movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+10H] movd xmm13, dword ptr [rsp+110H] movd xmm14, dword ptr [rsp+120H] punpckldq xmm13, xmm14 mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+80H] or eax, r13d xor edx, edx innerloop1: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movaps xmm2, xmmword ptr [BLAKE3_IV] shl rax, 32 or rax, 64 movd xmm12, rax movdqa xmm3, xmm13 punpcklqdq xmm3, xmm12 movups xmm4, xmmword ptr [r8+rdx-40H] movups xmm5, xmmword ptr [r8+rdx-30H] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [r8+rdx-20H] movups xmm7, xmmword ptr [r8+rdx-10H] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 93H shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 93H mov al, 7 roundloop1: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0B1H pshufhw xmm3, xmm3, 0B1H paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa 
xmm14, xmm3
        ; NOTE(review): the operands above complete the "movdqa" that ends the
        ; previous physical line. Everything down to the two
        ; blake3_hash_many_sse2 ENDP lines is the tail of that procedure's
        ; one-input path (labels roundloop1 / endroundloop1); its beginning is
        ; outside this view, so it is reproduced here without further claims.
        psrld xmm3, 8
        pslld xmm14, 24
        pxor xmm3, xmm14
        paddd xmm2, xmm3
        pxor xmm1, xmm2
        movdqa xmm11, xmm1
        pslld xmm1, 25
        psrld xmm11, 7
        por xmm1, xmm11
        pshufd xmm0, xmm0, 93H
        pshufd xmm3, xmm3, 4EH
        pshufd xmm2, xmm2, 39H
        paddd xmm0, xmm6
        paddd xmm0, xmm1
        pxor xmm3, xmm0
        pshuflw xmm3, xmm3, 0B1H
        pshufhw xmm3, xmm3, 0B1H
        paddd xmm2, xmm3
        pxor xmm1, xmm2
        movdqa xmm11, xmm1
        pslld xmm1, 20
        psrld xmm11, 12
        por xmm1, xmm11
        paddd xmm0, xmm7
        paddd xmm0, xmm1
        pxor xmm3, xmm0
        movdqa xmm14, xmm3
        psrld xmm3, 8
        pslld xmm14, 24
        pxor xmm3, xmm14
        paddd xmm2, xmm3
        pxor xmm1, xmm2
        movdqa xmm11, xmm1
        pslld xmm1, 25
        psrld xmm11, 7
        por xmm1, xmm11
        pshufd xmm0, xmm0, 39H
        pshufd xmm3, xmm3, 4EH
        pshufd xmm2, xmm2, 93H
        dec al
        jz endroundloop1
        movdqa xmm8, xmm4
        shufps xmm8, xmm5, 214
        pshufd xmm9, xmm4, 0FH
        pshufd xmm4, xmm8, 39H
        movdqa xmm8, xmm6
        shufps xmm8, xmm7, 250
        pand xmm9, xmmword ptr [PBLENDW_0x33_MASK]
        pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK]
        por xmm9, xmm8
        movdqa xmm8, xmm7
        punpcklqdq xmm8, xmm5
        movdqa xmm10, xmm6
        pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
        pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
        por xmm8, xmm10
        pshufd xmm8, xmm8, 78H
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd xmm7, xmm6, 1EH
        movdqa xmm5, xmm9
        movdqa xmm6, xmm8
        jmp roundloop1
endroundloop1:
        ; Fold the two halves of the state together, then either loop for the
        ; next 64-byte block (rdx != r15) or store 32 bytes at [rbx].
        pxor xmm0, xmm2
        pxor xmm1, xmm3
        mov eax, r13d
        cmp rdx, r15
        jne innerloop1
        movups xmmword ptr [rbx], xmm0
        movups xmmword ptr [rbx+10H], xmm1
        jmp unwind
_blake3_hash_many_sse2 ENDP
blake3_hash_many_sse2 ENDP

; ---------------------------------------------------------------------------
; blake3_compress_in_place_sse2
;
; Runs the 7-round compression on one 64-byte block and writes the new
; 32-byte chaining value back over the old one at [rcx].
;
; Registers as used below:
;   rcx          -> chaining value, read as two 16-byte halves and rewritten
;   rdx          -> 64-byte message block
;   r8b          block length (zero-extended before use)
;   r9           64-bit value placed in the low half of state row 3
;   [rsp+0A0H]   one byte placed in the top lane of state row 3
; NOTE(review): this matches the C prototype of the sse41 variant
; (cv, block, block_len, counter, flags) under the Windows x64 calling
; convention, where the fifth argument sits at entry-rsp+28H, i.e. rsp+0A0H
; after the 120-byte frame -- confirm against the header that declares it.
;
; xmm6-xmm9, xmm11, xmm14 and xmm15 are spilled on entry and reloaded before
; ret; xmm0-xmm5 are used without being saved.
; ---------------------------------------------------------------------------
blake3_compress_in_place_sse2 PROC
_blake3_compress_in_place_sse2 PROC
        ; Frame: 120 bytes, of which the first 112 hold the seven saved xmm
        ; registers.
        sub rsp, 120
        movdqa xmmword ptr [rsp], xmm6
        movdqa xmmword ptr [rsp+10H], xmm7
        movdqa xmmword ptr [rsp+20H], xmm8
        movdqa xmmword ptr [rsp+30H], xmm9
        movdqa xmmword ptr [rsp+40H], xmm11
        movdqa xmmword ptr [rsp+50H], xmm14
        movdqa xmmword ptr [rsp+60H], xmm15
        ; State rows: xmm0/xmm1 = chaining value, xmm2 = BLAKE3_IV.
        movups xmm0, xmmword ptr [rcx]
        movups xmm1, xmmword ptr [rcx+10H]
        movaps xmm2, xmmword ptr [BLAKE3_IV]
        ; Row 3 (xmm3): low qword = r9, high qword = r8b | (stack byte << 32).
        movzx eax, byte ptr [rsp+0A0H]
        movzx r8d, r8b
        shl rax, 32
        add r8, rax
        movd xmm3, r9
        movd xmm4, r8
        punpcklqdq xmm3, xmm4
        ; Load the block and split it into even/odd 32-bit words:
        ; shufps 136 keeps words 0,2 of each source, 221 keeps words 1,3.
        ; xmm4/xmm5 take the first 32 bytes; xmm6/xmm7 take the last 32 bytes
        ; and are additionally rotated by one lane (pshufd 93H).
        movups xmm4, xmmword ptr [rdx]
        movups xmm5, xmmword ptr [rdx+10H]
        movaps xmm8, xmm4
        shufps xmm4, xmm5, 136
        shufps xmm8, xmm5, 221
        movaps xmm5, xmm8
        movups xmm6, xmmword ptr [rdx+20H]
        movups xmm7, xmmword ptr [rdx+30H]
        movaps xmm8, xmm6
        shufps xmm6, xmm7, 136
        pshufd xmm6, xmm6, 93H
        shufps xmm8, xmm7, 221
        pshufd xmm7, xmm8, 93H
        ; al counts the remaining rounds (7 in total).
        mov al, 7
@@:
        ; --- first half of the round: message words in xmm4 then xmm5 ---
        paddd xmm0, xmm4
        paddd xmm0, xmm1
        pxor xmm3, xmm0
        ; Swapping the 16-bit halves of every dword rotates it by 16.
        pshuflw xmm3, xmm3, 0B1H
        pshufhw xmm3, xmm3, 0B1H
        paddd xmm2, xmm3
        pxor xmm1, xmm2
        ; Rotate right by 12: (x << 20) | (x >> 12).
        movdqa xmm11, xmm1
        pslld xmm1, 20
        psrld xmm11, 12
        por xmm1, xmm11
        paddd xmm0, xmm5
        paddd xmm0, xmm1
        pxor xmm3, xmm0
        ; Rotate right by 8: (x >> 8) ^ (x << 24).
        movdqa xmm14, xmm3
        psrld xmm3, 8
        pslld xmm14, 24
        pxor xmm3, xmm14
        paddd xmm2, xmm3
        pxor xmm1, xmm2
        ; Rotate right by 7: (x << 25) | (x >> 7).
        movdqa xmm11, xmm1
        pslld xmm1, 25
        psrld xmm11, 7
        por xmm1, xmm11
        ; Rotate rows 0, 3 and 2 by one, two and three lanes so the second
        ; half mixes along diagonals.
        pshufd xmm0, xmm0, 93H
        pshufd xmm3, xmm3, 4EH
        pshufd xmm2, xmm2, 39H
        ; --- second half of the round: message words in xmm6 then xmm7 ---
        paddd xmm0, xmm6
        paddd xmm0, xmm1
        pxor xmm3, xmm0
        pshuflw xmm3, xmm3, 0B1H
        pshufhw xmm3, xmm3, 0B1H
        paddd xmm2, xmm3
        pxor xmm1, xmm2
        movdqa xmm11, xmm1
        pslld xmm1, 20
        psrld xmm11, 12
        por xmm1, xmm11
        paddd xmm0, xmm7
        paddd xmm0, xmm1
        pxor xmm3, xmm0
        movdqa xmm14, xmm3
        psrld xmm3, 8
        pslld xmm14, 24
        pxor xmm3, xmm14
        paddd xmm2, xmm3
        pxor xmm1, xmm2
        movdqa xmm11, xmm1
        pslld xmm1, 25
        psrld xmm11, 7
        por xmm1, xmm11
        ; Undo the lane rotation of rows 0, 3 and 2.
        pshufd xmm0, xmm0, 39H
        pshufd xmm3, xmm3, 4EH
        pshufd xmm2, xmm2, 93H
        dec al
        jz @F
        ; Permute the four message vectors for the next round. The
        ; pand/pand/por triples select lanes through the PBLENDW_* masks
        ; (presumably standing in for the SSE4.1 pblendw instruction, going by
        ; the mask names).
        movdqa xmm8, xmm4
        shufps xmm8, xmm5, 214
        pshufd xmm9, xmm4, 0FH
        pshufd xmm4, xmm8, 39H
        movdqa xmm8, xmm6
        shufps xmm8, xmm7, 250
        pand xmm9, xmmword ptr [PBLENDW_0x33_MASK]
        pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK]
        por xmm9, xmm8
        movdqa xmm8, xmm7
        punpcklqdq xmm8, xmm5
        movdqa xmm14, xmm6
        pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
        pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK]
        por xmm8, xmm14
        pshufd xmm8, xmm8, 78H
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd xmm7, xmm6, 1EH
        movdqa xmm5, xmm9
        movdqa xmm6, xmm8
        jmp @B
@@:
        ; New chaining value = row0 ^ row2, row1 ^ row3, stored over the input.
        pxor xmm0, xmm2
        pxor xmm1, xmm3
        movups xmmword ptr [rcx], xmm0
        movups xmmword ptr [rcx+10H], xmm1
        ; Reload the spilled registers and release the frame.
        movdqa xmm6, xmmword ptr [rsp]
        movdqa xmm7, xmmword ptr [rsp+10H]
        movdqa xmm8, xmmword ptr [rsp+20H]
        movdqa xmm9, xmmword ptr [rsp+30H]
        movdqa xmm11, xmmword ptr [rsp+40H]
        movdqa xmm14, xmmword ptr [rsp+50H]
        movdqa xmm15, xmmword ptr [rsp+60H]
        add rsp, 120
        ret
_blake3_compress_in_place_sse2 ENDP
blake3_compress_in_place_sse2 ENDP

; ---------------------------------------------------------------------------
; blake3_compress_xof_sse2
;
; Same compression as blake3_compress_in_place_sse2, but the chaining value
; at [rcx] is only read and the full 64-byte result is written through the
; pointer loaded from [rsp+0A8H] into r10:
;   out[ 0..31] = rows 0,1 XOR rows 2,3
;   out[32..63] = rows 2,3 XOR the input chaining value
; NOTE(review): [rsp+0A8H] is the slot right after the byte read at
; [rsp+0A0H], i.e. presumably the sixth argument (the output buffer) --
; confirm against the declaring header.
; ---------------------------------------------------------------------------
ALIGN 16
blake3_compress_xof_sse2 PROC
_blake3_compress_xof_sse2 PROC
        sub rsp, 120
        movdqa xmmword ptr [rsp], xmm6
        movdqa xmmword ptr [rsp+10H], xmm7
        movdqa xmmword ptr [rsp+20H], xmm8
        movdqa xmmword ptr [rsp+30H], xmm9
        movdqa xmmword ptr [rsp+40H], xmm11
        movdqa xmmword ptr [rsp+50H], xmm14
        movdqa xmmword ptr [rsp+60H], xmm15
        ; State rows 0-2: chaining value and BLAKE3_IV.
        movups xmm0, xmmword ptr [rcx]
        movups xmm1, xmmword ptr [rcx+10H]
        movaps xmm2, xmmword ptr [BLAKE3_IV]
        movzx eax, byte ptr [rsp+0A0H]
        movzx r8d, r8b
        ; r10 = destination pointer, fetched before rsp-relative state changes.
        mov r10, qword ptr [rsp+0A8H]
        ; Row 3: low qword = r9, high qword = r8b | (stack byte << 32).
        shl rax, 32
        add r8, rax
        movd xmm3, r9
        movd xmm4, r8
        punpcklqdq xmm3, xmm4
        ; Load and de-interleave the message block (see the in-place variant).
        movups xmm4, xmmword ptr [rdx]
        movups xmm5, xmmword ptr [rdx+10H]
        movaps xmm8, xmm4
        shufps xmm4, xmm5, 136
        shufps xmm8, xmm5, 221
        movaps xmm5, xmm8
        movups xmm6, xmmword ptr [rdx+20H]
        movups xmm7, xmmword ptr [rdx+30H]
        movaps xmm8, xmm6
        shufps xmm6, xmm7, 136
        pshufd xmm6, xmm6, 93H
        shufps xmm8, xmm7, 221
        pshufd xmm7, xmm8, 93H
        ; Seven rounds, counted down in al.
        mov al, 7
@@:
        ; --- first half of the round: message words in xmm4 then xmm5 ---
        paddd xmm0, xmm4
        paddd xmm0, xmm1
        pxor xmm3, xmm0
        pshuflw xmm3, xmm3, 0B1H
        pshufhw xmm3, xmm3, 0B1H
        paddd xmm2, xmm3
        pxor xmm1, xmm2
        movdqa xmm11, xmm1
        pslld xmm1, 20
        psrld xmm11, 12
        por xmm1, xmm11
        paddd xmm0, xmm5
        paddd xmm0, xmm1
        pxor xmm3, xmm0
        movdqa xmm14, xmm3
        psrld xmm3, 8
        pslld xmm14, 24
        pxor xmm3, xmm14
        paddd xmm2, xmm3
        pxor xmm1, xmm2
        movdqa xmm11, xmm1
        pslld xmm1, 25
        psrld xmm11, 7
        por xmm1, xmm11
        ; Lane-rotate rows 0, 3, 2 for the diagonal half.
        pshufd xmm0, xmm0, 93H
        pshufd xmm3, xmm3, 4EH
        pshufd xmm2, xmm2, 39H
        ; --- second half of the round: message words in xmm6 then xmm7 ---
        paddd xmm0, xmm6
        paddd xmm0, xmm1
        pxor xmm3, xmm0
        pshuflw xmm3, xmm3, 0B1H
        pshufhw xmm3, xmm3, 0B1H
        paddd xmm2, xmm3
        pxor xmm1, xmm2
        movdqa xmm11, xmm1
        pslld xmm1, 20
        psrld xmm11, 12
        por xmm1, xmm11
        paddd xmm0, xmm7
        paddd xmm0, xmm1
        pxor xmm3, xmm0
        movdqa xmm14, xmm3
        psrld xmm3, 8
        pslld xmm14, 24
        pxor xmm3, xmm14
        paddd xmm2, xmm3
        pxor xmm1, xmm2
        movdqa xmm11, xmm1
        pslld xmm1, 25
        psrld xmm11, 7
        por xmm1, xmm11
        ; Undo the lane rotation.
        pshufd xmm0, xmm0, 39H
        pshufd xmm3, xmm3, 4EH
        pshufd xmm2, xmm2, 93H
        dec al
        jz @F
        ; Permute the message vectors for the next round.
        movdqa xmm8, xmm4
        shufps xmm8, xmm5, 214
        pshufd xmm9, xmm4, 0FH
        pshufd xmm4, xmm8, 39H
        movdqa xmm8, xmm6
        shufps xmm8, xmm7, 250
        pand xmm9, xmmword ptr [PBLENDW_0x33_MASK]
        pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK]
        por xmm9, xmm8
        movdqa xmm8, xmm7
        punpcklqdq xmm8, xmm5
        movdqa xmm14, xmm6
        pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
        pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK]
        por xmm8, xmm14
        pshufd xmm8, xmm8, 78H
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd xmm7, xmm6, 1EH
        movdqa xmm5, xmm9
        movdqa xmm6, xmm8
        jmp @B
@@:
        ; Re-read the input chaining value (the message registers are free
        ; now) and produce all four 16-byte output rows.
        movdqu xmm4, xmmword ptr [rcx]
        movdqu xmm5, xmmword ptr [rcx+10H]
        pxor xmm0, xmm2
        pxor xmm1, xmm3
        pxor xmm2, xmm4
        pxor xmm3, xmm5
        movups xmmword ptr [r10], xmm0
        movups xmmword ptr [r10+10H], xmm1
        movups xmmword ptr [r10+20H], xmm2
        movups xmmword ptr [r10+30H], xmm3
        ; Reload the spilled registers and release the frame.
        movdqa xmm6, xmmword ptr [rsp]
        movdqa xmm7, xmmword ptr [rsp+10H]
        movdqa xmm8, xmmword ptr [rsp+20H]
        movdqa xmm9, xmmword ptr [rsp+30H]
        movdqa xmm11, xmmword ptr [rsp+40H]
        movdqa xmm14, xmmword ptr [rsp+50H]
        movdqa xmm15, xmmword ptr [rsp+60H]
        add rsp, 120
        ret
_blake3_compress_xof_sse2 ENDP
blake3_compress_xof_sse2 ENDP

_TEXT ENDS

; Read-only constants referenced by the code above.
_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST'
ALIGN   64
; The four IV words as one vector (state row 2 of the single-block routines).
BLAKE3_IV:
        dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH

; Per-lane values 0,1,2,3 and a broadcast 4.
; NOTE(review): the code that reads ADD0/ADD1 is outside this view; the
; equivalent unix source uses them as per-lane counter offsets.
ADD0:
        dd 0, 1, 2, 3

ADD1:
        dd 4 dup (4)

; Each IV word broadcast to all four lanes.
BLAKE3_IV_0:
        dd 4 dup (6A09E667H)

BLAKE3_IV_1:
        dd 4 dup (0BB67AE85H)

BLAKE3_IV_2:
        dd 4 dup (3C6EF372H)

BLAKE3_IV_3:
        dd 4 dup (0A54FF53AH)

; The value 64 broadcast to all four lanes.
BLAKE3_BLOCK_LEN:
        dd 4 dup (64)

; Sign-bit mask, eight dwords.
CMP_MSB_MASK:
        dd 8 dup(80000000H)

; Dword-granular lane masks used by the pand/por sequences in the message
; permutation. In each complementary pair (0x33/0xCC, 0x3F/0xC0) the first
; mask keeps the lanes the second one clears.
PBLENDW_0x33_MASK:
        dd 0FFFFFFFFH, 000000000H, 0FFFFFFFFH, 000000000H
PBLENDW_0xCC_MASK:
        dd 000000000H, 0FFFFFFFFH, 000000000H, 0FFFFFFFFH
PBLENDW_0x3F_MASK:
        dd 0FFFFFFFFH, 0FFFFFFFFH, 0FFFFFFFFH, 000000000H
PBLENDW_0xC0_MASK:
        dd 000000000H, 000000000H, 000000000H, 0FFFFFFFFH

_RDATA ENDS
END
blake3-1.5.4/c/blake3_sse41.c000064400000000000000000000505151046102023000135500ustar 00000000000000#include "blake3_impl.h" #include #define DEGREE 4 #define _mm_shuffle_ps2(a, b, c) \ (_mm_castps_si128( \ _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) INLINE __m128i loadu(const uint8_t src[16]) { return _mm_loadu_si128((const __m128i *)src); } INLINE void storeu(__m128i src, uint8_t dest[16]) { _mm_storeu_si128((__m128i *)dest, src); } INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } // Note that clang-format doesn't like the name "xor" for some reason. INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); } INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); } INLINE __m128i rot16(__m128i x) { return _mm_shuffle_epi8( x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); } INLINE __m128i rot12(__m128i x) { return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12)); } INLINE __m128i rot8(__m128i x) { return _mm_shuffle_epi8( x, _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)); } INLINE __m128i rot7(__m128i x) { return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7)); } INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m) { *row0 = addv(addv(*row0, m), *row1); *row3 = xorv(*row3, *row0); *row3 = rot16(*row3); *row2 = addv(*row2, *row3); *row1 = xorv(*row1, *row2); *row1 = rot12(*row1); } INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m) { *row0 = addv(addv(*row0, m), *row1); *row3 = xorv(*row3, *row0); *row3 = rot8(*row3); *row2 = addv(*row2, *row3); *row1 = xorv(*row1, *row2); *row1 = rot7(*row1); } // Note the optimization here of leaving row1 as the unrotated row, rather than // row0. 
All the message loads below are adjusted to compensate for this. See // discussion at https://github.com/sneves/blake2-avx2/pull/4 INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); } INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); } INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags) { rows[0] = loadu((uint8_t *)&cv[0]); rows[1] = loadu((uint8_t *)&cv[4]); rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); rows[3] = set4(counter_low(counter), counter_high(counter), (uint32_t)block_len, (uint32_t)flags); __m128i m0 = loadu(&block[sizeof(__m128i) * 0]); __m128i m1 = loadu(&block[sizeof(__m128i) * 1]); __m128i m2 = loadu(&block[sizeof(__m128i) * 2]); __m128i m3 = loadu(&block[sizeof(__m128i) * 3]); __m128i t0, t1, t2, t3, tt; // Round 1. The first round permutes the message words from the original // input order, into the groups that get mixed in parallel. 
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 2. This round and all following rounds apply a fixed permutation // to the message words from the round before. t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); t1 = _mm_blend_epi16(tt, t1, 0xCC); g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); t2 = _mm_unpacklo_epi64(m3, m1); tt = _mm_blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 3 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); t1 = _mm_blend_epi16(tt, t1, 0xCC); g2(&rows[0], &rows[1], &rows[2], 
&rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); t2 = _mm_unpacklo_epi64(m3, m1); tt = _mm_blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 4 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); t1 = _mm_blend_epi16(tt, t1, 0xCC); g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); t2 = _mm_unpacklo_epi64(m3, m1); tt = _mm_blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 5 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); t1 = _mm_blend_epi16(tt, t1, 0xCC); g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); t2 = _mm_unpacklo_epi64(m3, m1); tt = _mm_blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); g2(&rows[0], 
&rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 6 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); t1 = _mm_blend_epi16(tt, t1, 0xCC); g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); t2 = _mm_unpacklo_epi64(m3, m1); tt = _mm_blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 7 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); t1 = _mm_blend_epi16(tt, t1, 0xCC); g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); t2 = _mm_unpacklo_epi64(m3, m1); tt = _mm_blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); } void blake3_compress_in_place_sse41(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags) { __m128i rows[4]; compress_pre(rows, cv, block, block_len, counter, flags); storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]); 
storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]); } void blake3_compress_xof_sse41(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]) { __m128i rows[4]; compress_pre(rows, cv, block, block_len, counter, flags); storeu(xorv(rows[0], rows[2]), &out[0]); storeu(xorv(rows[1], rows[3]), &out[16]); storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]); storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]); } INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) { v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); v[0] = addv(v[0], v[4]); v[1] = addv(v[1], v[5]); v[2] = addv(v[2], v[6]); v[3] = addv(v[3], v[7]); v[12] = xorv(v[12], v[0]); v[13] = xorv(v[13], v[1]); v[14] = xorv(v[14], v[2]); v[15] = xorv(v[15], v[3]); v[12] = rot16(v[12]); v[13] = rot16(v[13]); v[14] = rot16(v[14]); v[15] = rot16(v[15]); v[8] = addv(v[8], v[12]); v[9] = addv(v[9], v[13]); v[10] = addv(v[10], v[14]); v[11] = addv(v[11], v[15]); v[4] = xorv(v[4], v[8]); v[5] = xorv(v[5], v[9]); v[6] = xorv(v[6], v[10]); v[7] = xorv(v[7], v[11]); v[4] = rot12(v[4]); v[5] = rot12(v[5]); v[6] = rot12(v[6]); v[7] = rot12(v[7]); v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); v[0] = addv(v[0], v[4]); v[1] = addv(v[1], v[5]); v[2] = addv(v[2], v[6]); v[3] = addv(v[3], v[7]); v[12] = xorv(v[12], v[0]); v[13] = xorv(v[13], v[1]); v[14] = xorv(v[14], v[2]); v[15] = xorv(v[15], v[3]); v[12] = rot8(v[12]); v[13] = rot8(v[13]); v[14] = rot8(v[14]); v[15] = rot8(v[15]); v[8] = addv(v[8], v[12]); v[9] = addv(v[9], v[13]); v[10] = addv(v[10], v[14]); v[11] = addv(v[11], v[15]); v[4] = xorv(v[4], v[8]); v[5] = xorv(v[5], v[9]); v[6] = 
xorv(v[6], v[10]); v[7] = xorv(v[7], v[11]); v[4] = rot7(v[4]); v[5] = rot7(v[5]); v[6] = rot7(v[6]); v[7] = rot7(v[7]); v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); v[0] = addv(v[0], v[5]); v[1] = addv(v[1], v[6]); v[2] = addv(v[2], v[7]); v[3] = addv(v[3], v[4]); v[15] = xorv(v[15], v[0]); v[12] = xorv(v[12], v[1]); v[13] = xorv(v[13], v[2]); v[14] = xorv(v[14], v[3]); v[15] = rot16(v[15]); v[12] = rot16(v[12]); v[13] = rot16(v[13]); v[14] = rot16(v[14]); v[10] = addv(v[10], v[15]); v[11] = addv(v[11], v[12]); v[8] = addv(v[8], v[13]); v[9] = addv(v[9], v[14]); v[5] = xorv(v[5], v[10]); v[6] = xorv(v[6], v[11]); v[7] = xorv(v[7], v[8]); v[4] = xorv(v[4], v[9]); v[5] = rot12(v[5]); v[6] = rot12(v[6]); v[7] = rot12(v[7]); v[4] = rot12(v[4]); v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); v[0] = addv(v[0], v[5]); v[1] = addv(v[1], v[6]); v[2] = addv(v[2], v[7]); v[3] = addv(v[3], v[4]); v[15] = xorv(v[15], v[0]); v[12] = xorv(v[12], v[1]); v[13] = xorv(v[13], v[2]); v[14] = xorv(v[14], v[3]); v[15] = rot8(v[15]); v[12] = rot8(v[12]); v[13] = rot8(v[13]); v[14] = rot8(v[14]); v[10] = addv(v[10], v[15]); v[11] = addv(v[11], v[12]); v[8] = addv(v[8], v[13]); v[9] = addv(v[9], v[14]); v[5] = xorv(v[5], v[10]); v[6] = xorv(v[6], v[11]); v[7] = xorv(v[7], v[8]); v[4] = xorv(v[4], v[9]); v[5] = rot7(v[5]); v[6] = rot7(v[6]); v[7] = rot7(v[7]); v[4] = rot7(v[4]); } INLINE void transpose_vecs(__m128i vecs[DEGREE]) { // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is // 22/33. Note that this doesn't split the vector into two lanes, as the // AVX2 counterparts do. 
__m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); // Interleave 64-bit lanes. __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); vecs[0] = abcd_0; vecs[1] = abcd_1; vecs[2] = abcd_2; vecs[3] = abcd_3; } INLINE void transpose_msg_vecs(const uint8_t *const *inputs, size_t block_offset, __m128i out[16]) { out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]); out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]); out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]); out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]); out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]); out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]); out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]); out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]); out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]); out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]); out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]); out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]); out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]); out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]); out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]); out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]); for (size_t i = 0; i < 4; ++i) { _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0); } transpose_vecs(&out[0]); transpose_vecs(&out[4]); transpose_vecs(&out[8]); transpose_vecs(&out[12]); } INLINE void load_counters(uint64_t counter, bool increment_counter, __m128i *out_lo, __m128i *out_hi) { const __m128i mask = 
_mm_set1_epi32(-(int32_t)increment_counter); const __m128i add0 = _mm_set_epi32(3, 2, 1, 0); const __m128i add1 = _mm_and_si128(mask, add0); __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1); __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)), _mm_xor_si128( l, _mm_set1_epi32(0x80000000))); __m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry); *out_lo = l; *out_hi = h; } static void blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { __m128i h_vecs[8] = { set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), }; __m128i counter_low_vec, counter_high_vec; load_counters(counter, increment_counter, &counter_low_vec, &counter_high_vec); uint8_t block_flags = flags | flags_start; for (size_t block = 0; block < blocks; block++) { if (block + 1 == blocks) { block_flags |= flags_end; } __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN); __m128i block_flags_vec = set1(block_flags); __m128i msg_vecs[16]; transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); __m128i v[16] = { h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, }; round_fn(v, msg_vecs, 0); round_fn(v, msg_vecs, 1); round_fn(v, msg_vecs, 2); round_fn(v, msg_vecs, 3); round_fn(v, msg_vecs, 4); round_fn(v, msg_vecs, 5); round_fn(v, msg_vecs, 6); h_vecs[0] = xorv(v[0], v[8]); h_vecs[1] = xorv(v[1], v[9]); h_vecs[2] = xorv(v[2], v[10]); h_vecs[3] = xorv(v[3], v[11]); h_vecs[4] = xorv(v[4], v[12]); h_vecs[5] = xorv(v[5], v[13]); h_vecs[6] = xorv(v[6], v[14]); h_vecs[7] = xorv(v[7], v[15]); block_flags = flags; } transpose_vecs(&h_vecs[0]); transpose_vecs(&h_vecs[4]); // The first four vecs 
now contain the first half of each output, and the // second four vecs contain the second half of each output. storeu(h_vecs[0], &out[0 * sizeof(__m128i)]); storeu(h_vecs[4], &out[1 * sizeof(__m128i)]); storeu(h_vecs[1], &out[2 * sizeof(__m128i)]); storeu(h_vecs[5], &out[3 * sizeof(__m128i)]); storeu(h_vecs[2], &out[4 * sizeof(__m128i)]); storeu(h_vecs[6], &out[5 * sizeof(__m128i)]); storeu(h_vecs[3], &out[6 * sizeof(__m128i)]); storeu(h_vecs[7], &out[7 * sizeof(__m128i)]); } INLINE void hash_one_sse41(const uint8_t *input, size_t blocks, const uint32_t key[8], uint64_t counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { uint32_t cv[8]; memcpy(cv, key, BLAKE3_KEY_LEN); uint8_t block_flags = flags | flags_start; while (blocks > 0) { if (blocks == 1) { block_flags |= flags_end; } blake3_compress_in_place_sse41(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags); input = &input[BLAKE3_BLOCK_LEN]; blocks -= 1; block_flags = flags; } memcpy(out, cv, BLAKE3_OUT_LEN); } void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { while (num_inputs >= DEGREE) { blake3_hash4_sse41(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); if (increment_counter) { counter += DEGREE; } inputs += DEGREE; num_inputs -= DEGREE; out = &out[DEGREE * BLAKE3_OUT_LEN]; } while (num_inputs > 0) { hash_one_sse41(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out); if (increment_counter) { counter += 1; } inputs += 1; num_inputs -= 1; out = &out[BLAKE3_OUT_LEN]; } } blake3-1.5.4/c/blake3_sse41_x86-64_unix.S000064400000000000000000001673271046102023000155410ustar 00000000000000#if defined(__ELF__) && defined(__linux__) .section .note.GNU-stack,"",%progbits #endif #if defined(__ELF__) && defined(__CET__) && defined(__has_include) #if 
__has_include() #include #endif #endif #if !defined(_CET_ENDBR) #define _CET_ENDBR #endif .intel_syntax noprefix .global blake3_hash_many_sse41 .global _blake3_hash_many_sse41 .global blake3_compress_in_place_sse41 .global _blake3_compress_in_place_sse41 .global blake3_compress_xof_sse41 .global _blake3_compress_xof_sse41 #ifdef __APPLE__ .text #else .section .text #endif .p2align 6 _blake3_hash_many_sse41: blake3_hash_many_sse41: _CET_ENDBR push r15 push r14 push r13 push r12 push rbx push rbp mov rbp, rsp sub rsp, 360 and rsp, 0xFFFFFFFFFFFFFFC0 neg r9d movd xmm0, r9d pshufd xmm0, xmm0, 0x00 movdqa xmmword ptr [rsp+0x130], xmm0 movdqa xmm1, xmm0 pand xmm1, xmmword ptr [ADD0+rip] pand xmm0, xmmword ptr [ADD1+rip] movdqa xmmword ptr [rsp+0x150], xmm0 movd xmm0, r8d pshufd xmm0, xmm0, 0x00 paddd xmm0, xmm1 movdqa xmmword ptr [rsp+0x110], xmm0 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] pcmpgtd xmm1, xmm0 shr r8, 32 movd xmm2, r8d pshufd xmm2, xmm2, 0x00 psubd xmm2, xmm1 movdqa xmmword ptr [rsp+0x120], xmm2 mov rbx, qword ptr [rbp+0x50] mov r15, rdx shl r15, 6 movzx r13d, byte ptr [rbp+0x38] movzx r12d, byte ptr [rbp+0x48] cmp rsi, 4 jc 3f 2: movdqu xmm3, xmmword ptr [rcx] pshufd xmm0, xmm3, 0x00 pshufd xmm1, xmm3, 0x55 pshufd xmm2, xmm3, 0xAA pshufd xmm3, xmm3, 0xFF movdqu xmm7, xmmword ptr [rcx+0x10] pshufd xmm4, xmm7, 0x00 pshufd xmm5, xmm7, 0x55 pshufd xmm6, xmm7, 0xAA pshufd xmm7, xmm7, 0xFF mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx 9: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movdqu xmm8, xmmword ptr [r8+rdx-0x40] movdqu xmm9, xmmword ptr [r9+rdx-0x40] movdqu xmm10, xmmword ptr [r10+rdx-0x40] movdqu xmm11, xmmword ptr [r11+rdx-0x40] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, 
xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp], xmm8 movdqa xmmword ptr [rsp+0x10], xmm9 movdqa xmmword ptr [rsp+0x20], xmm12 movdqa xmmword ptr [rsp+0x30], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-0x30] movdqu xmm9, xmmword ptr [r9+rdx-0x30] movdqu xmm10, xmmword ptr [r10+rdx-0x30] movdqu xmm11, xmmword ptr [r11+rdx-0x30] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0x40], xmm8 movdqa xmmword ptr [rsp+0x50], xmm9 movdqa xmmword ptr [rsp+0x60], xmm12 movdqa xmmword ptr [rsp+0x70], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-0x20] movdqu xmm9, xmmword ptr [r9+rdx-0x20] movdqu xmm10, xmmword ptr [r10+rdx-0x20] movdqu xmm11, xmmword ptr [r11+rdx-0x20] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0x80], xmm8 movdqa xmmword ptr [rsp+0x90], xmm9 movdqa xmmword ptr [rsp+0xA0], xmm12 movdqa xmmword ptr [rsp+0xB0], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-0x10] movdqu xmm9, xmmword ptr [r9+rdx-0x10] movdqu xmm10, xmmword ptr [r10+rdx-0x10] movdqu xmm11, xmmword ptr [r11+rdx-0x10] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0xC0], xmm8 movdqa xmmword ptr [rsp+0xD0], xmm9 movdqa xmmword ptr [rsp+0xE0], xmm12 movdqa xmmword ptr [rsp+0xF0], xmm13 movdqa xmm9, xmmword ptr 
[BLAKE3_IV_1+rip] movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] movdqa xmm12, xmmword ptr [rsp+0x110] movdqa xmm13, xmmword ptr [rsp+0x120] movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] movd xmm15, eax pshufd xmm15, xmm15, 0x00 prefetcht0 [r8+rdx+0x80] prefetcht0 [r9+rdx+0x80] prefetcht0 [r10+rdx+0x80] prefetcht0 [r11+rdx+0x80] paddd xmm0, xmmword ptr [rsp] paddd xmm1, xmmword ptr [rsp+0x20] paddd xmm2, xmmword ptr [rsp+0x40] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x10] paddd xmm1, xmmword ptr [rsp+0x30] paddd xmm2, xmmword ptr [rsp+0x50] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld 
xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x80] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp+0xC0] paddd xmm3, xmmword ptr [rsp+0xE0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x90] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0xD0] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x20] paddd xmm1, xmmword ptr [rsp+0x30] paddd xmm2, xmmword ptr [rsp+0x70] paddd xmm3, xmmword ptr [rsp+0x40] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 
pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x60] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp] paddd xmm3, xmmword ptr [rsp+0xD0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x10] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0x90] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor 
xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xB0] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp+0xE0] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x30] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp+0xD0] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa 
xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x40] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0x20] paddd xmm3, xmmword ptr [rsp+0xE0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x60] paddd xmm1, xmmword ptr [rsp+0x90] paddd xmm2, xmmword ptr [rsp+0xB0] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x50] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+0xF0] paddd xmm3, xmmword ptr [rsp+0x10] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 
pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xA0] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0xE0] paddd xmm3, xmmword ptr [rsp+0xD0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x70] paddd xmm1, xmmword ptr [rsp+0x90] paddd xmm2, xmmword ptr [rsp+0x30] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor 
xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x40] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0x50] paddd xmm3, xmmword ptr [rsp+0x10] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp] paddd xmm1, xmmword ptr [rsp+0x20] paddd xmm2, xmmword ptr [rsp+0x80] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 
por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xC0] paddd xmm1, xmmword ptr [rsp+0x90] paddd xmm2, xmmword ptr [rsp+0xF0] paddd xmm3, xmmword ptr [rsp+0xE0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xD0] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0xA0] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x70] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa 
xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x20] paddd xmm1, xmmword ptr [rsp+0x30] paddd xmm2, xmmword ptr [rsp+0x10] paddd xmm3, xmmword ptr [rsp+0x40] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x90] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0x80] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa 
xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xE0] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp+0xC0] paddd xmm3, xmmword ptr [rsp+0x10] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xD0] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+0x20] paddd xmm3, xmmword ptr [rsp+0x40] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, 
xmmword ptr [rsp+0x30] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp+0x60] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xB0] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp+0x10] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xF0] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+0x90] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] 
pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xE0] paddd xmm1, xmmword ptr [rsp+0x20] paddd xmm2, xmmword ptr [rsp+0x30] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xA0] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0x40] paddd xmm3, xmmword ptr [rsp+0xD0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 pxor xmm0, xmm8 pxor xmm1, xmm9 pxor 
xmm2, xmm10 pxor xmm3, xmm11 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 pxor xmm4, xmm12 pxor xmm5, xmm13 pxor xmm6, xmm14 pxor xmm7, xmm15 mov eax, r13d jne 9b movdqa xmm9, xmm0 punpckldq xmm0, xmm1 punpckhdq xmm9, xmm1 movdqa xmm11, xmm2 punpckldq xmm2, xmm3 punpckhdq xmm11, xmm3 movdqa xmm1, xmm0 punpcklqdq xmm0, xmm2 punpckhqdq xmm1, xmm2 movdqa xmm3, xmm9 punpcklqdq xmm9, xmm11 punpckhqdq xmm3, xmm11 movdqu xmmword ptr [rbx], xmm0 movdqu xmmword ptr [rbx+0x20], xmm1 movdqu xmmword ptr [rbx+0x40], xmm9 movdqu xmmword ptr [rbx+0x60], xmm3 movdqa xmm9, xmm4 punpckldq xmm4, xmm5 punpckhdq xmm9, xmm5 movdqa xmm11, xmm6 punpckldq xmm6, xmm7 punpckhdq xmm11, xmm7 movdqa xmm5, xmm4 punpcklqdq xmm4, xmm6 punpckhqdq xmm5, xmm6 movdqa xmm7, xmm9 punpcklqdq xmm9, xmm11 punpckhqdq xmm7, xmm11 movdqu xmmword ptr [rbx+0x10], xmm4 movdqu xmmword ptr [rbx+0x30], xmm5 movdqu xmmword ptr [rbx+0x50], xmm9 movdqu xmmword ptr [rbx+0x70], xmm7 movdqa xmm1, xmmword ptr [rsp+0x110] movdqa xmm0, xmm1 paddd xmm1, xmmword ptr [rsp+0x150] movdqa xmmword ptr [rsp+0x110], xmm1 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] pcmpgtd xmm0, xmm1 movdqa xmm1, xmmword ptr [rsp+0x120] psubd xmm1, xmm0 movdqa xmmword ptr [rsp+0x120], xmm1 add rbx, 128 add rdi, 32 sub rsi, 4 cmp rsi, 4 jnc 2b test rsi, rsi jnz 3f 4: mov rsp, rbp pop rbp pop rbx pop r12 pop r13 pop r14 pop r15 ret .p2align 5 3: test esi, 0x2 je 3f movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+0x10] movaps xmm8, xmm0 movaps xmm9, xmm1 movd xmm13, dword ptr [rsp+0x110] pinsrd xmm13, dword ptr [rsp+0x120], 1 pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 movaps xmmword ptr [rsp], xmm13 movd xmm14, dword ptr [rsp+0x114] pinsrd xmm14, dword ptr [rsp+0x124], 1 pinsrd xmm14, dword ptr 
[BLAKE3_BLOCK_LEN+rip], 2 movaps xmmword ptr [rsp+0x10], xmm14 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movaps xmm2, xmmword ptr [BLAKE3_IV+rip] movaps xmm10, xmm2 movups xmm4, xmmword ptr [r8+rdx-0x40] movups xmm5, xmmword ptr [r8+rdx-0x30] movaps xmm3, xmm4 shufps xmm4, xmm5, 136 shufps xmm3, xmm5, 221 movaps xmm5, xmm3 movups xmm6, xmmword ptr [r8+rdx-0x20] movups xmm7, xmmword ptr [r8+rdx-0x10] movaps xmm3, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm3, xmm7, 221 pshufd xmm7, xmm3, 0x93 movups xmm12, xmmword ptr [r9+rdx-0x40] movups xmm13, xmmword ptr [r9+rdx-0x30] movaps xmm11, xmm12 shufps xmm12, xmm13, 136 shufps xmm11, xmm13, 221 movaps xmm13, xmm11 movups xmm14, xmmword ptr [r9+rdx-0x20] movups xmm15, xmmword ptr [r9+rdx-0x10] movaps xmm11, xmm14 shufps xmm14, xmm15, 136 pshufd xmm14, xmm14, 0x93 shufps xmm11, xmm15, 221 pshufd xmm15, xmm11, 0x93 movaps xmm3, xmmword ptr [rsp] movaps xmm11, xmmword ptr [rsp+0x10] pinsrd xmm3, eax, 3 pinsrd xmm11, eax, 3 mov al, 7 9: paddd xmm0, xmm4 paddd xmm8, xmm12 movaps xmmword ptr [rsp+0x20], xmm4 movaps xmmword ptr [rsp+0x30], xmm12 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 movaps xmm12, xmmword ptr [ROT16+rip] pshufb xmm3, xmm12 pshufb xmm11, xmm12 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 20 psrld xmm4, 12 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 20 psrld xmm4, 12 por xmm9, xmm4 paddd xmm0, xmm5 paddd xmm8, xmm13 movaps xmmword ptr [rsp+0x40], xmm5 movaps xmmword ptr [rsp+0x50], xmm13 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 movaps xmm13, xmmword ptr [ROT8+rip] pshufb xmm3, xmm13 pshufb xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 25 psrld xmm4, 7 por xmm1, xmm4 movdqa xmm4, xmm9 
pslld xmm9, 25 psrld xmm4, 7 por xmm9, xmm4 pshufd xmm0, xmm0, 0x93 pshufd xmm8, xmm8, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm11, xmm11, 0x4E pshufd xmm2, xmm2, 0x39 pshufd xmm10, xmm10, 0x39 paddd xmm0, xmm6 paddd xmm8, xmm14 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 pshufb xmm3, xmm12 pshufb xmm11, xmm12 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 20 psrld xmm4, 12 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 20 psrld xmm4, 12 por xmm9, xmm4 paddd xmm0, xmm7 paddd xmm8, xmm15 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 pshufb xmm3, xmm13 pshufb xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 25 psrld xmm4, 7 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 25 psrld xmm4, 7 por xmm9, xmm4 pshufd xmm0, xmm0, 0x39 pshufd xmm8, xmm8, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm11, xmm11, 0x4E pshufd xmm2, xmm2, 0x93 pshufd xmm10, xmm10, 0x93 dec al je 9f movdqa xmm12, xmmword ptr [rsp+0x20] movdqa xmm5, xmmword ptr [rsp+0x40] pshufd xmm13, xmm12, 0x0F shufps xmm12, xmm5, 214 pshufd xmm4, xmm12, 0x39 movdqa xmm12, xmm6 shufps xmm12, xmm7, 250 pblendw xmm13, xmm12, 0xCC movdqa xmm12, xmm7 punpcklqdq xmm12, xmm5 pblendw xmm12, xmm6, 0xC0 pshufd xmm12, xmm12, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmmword ptr [rsp+0x20], xmm13 movdqa xmmword ptr [rsp+0x40], xmm12 movdqa xmm5, xmmword ptr [rsp+0x30] movdqa xmm13, xmmword ptr [rsp+0x50] pshufd xmm6, xmm5, 0x0F shufps xmm5, xmm13, 214 pshufd xmm12, xmm5, 0x39 movdqa xmm5, xmm14 shufps xmm5, xmm15, 250 pblendw xmm6, xmm5, 0xCC movdqa xmm5, xmm15 punpcklqdq xmm5, xmm13 pblendw xmm5, xmm14, 0xC0 pshufd xmm5, xmm5, 0x78 punpckhdq xmm13, xmm15 punpckldq xmm14, xmm13 pshufd xmm15, xmm14, 0x1E movdqa xmm13, xmm6 movdqa xmm14, xmm5 movdqa xmm5, xmmword ptr [rsp+0x20] movdqa xmm6, xmmword ptr [rsp+0x40] jmp 9b 9: pxor xmm0, xmm2 pxor xmm1, 
xmm3 pxor xmm8, xmm10 pxor xmm9, xmm11 mov eax, r13d cmp rdx, r15 jne 2b movups xmmword ptr [rbx], xmm0 movups xmmword ptr [rbx+0x10], xmm1 movups xmmword ptr [rbx+0x20], xmm8 movups xmmword ptr [rbx+0x30], xmm9 movdqa xmm0, xmmword ptr [rsp+0x130] movdqa xmm1, xmmword ptr [rsp+0x110] movdqa xmm2, xmmword ptr [rsp+0x120] movdqu xmm3, xmmword ptr [rsp+0x118] movdqu xmm4, xmmword ptr [rsp+0x128] blendvps xmm1, xmm3, xmm0 blendvps xmm2, xmm4, xmm0 movdqa xmmword ptr [rsp+0x110], xmm1 movdqa xmmword ptr [rsp+0x120], xmm2 add rdi, 16 add rbx, 64 sub rsi, 2 3: test esi, 0x1 je 4b movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+0x10] movd xmm13, dword ptr [rsp+0x110] pinsrd xmm13, dword ptr [rsp+0x120], 1 pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 movaps xmm14, xmmword ptr [ROT8+rip] movaps xmm15, xmmword ptr [ROT16+rip] mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movaps xmm2, xmmword ptr [BLAKE3_IV+rip] movaps xmm3, xmm13 pinsrd xmm3, eax, 3 movups xmm4, xmmword ptr [r8+rdx-0x40] movups xmm5, xmmword ptr [r8+rdx-0x30] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [r8+rdx-0x20] movups xmm7, xmmword ptr [r8+rdx-0x10] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 0x93 mov al, 7 9: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x39 paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 
12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x93 dec al jz 9f movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0x0F pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pblendw xmm9, xmm8, 0xCC movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 pblendw xmm8, xmm6, 0xC0 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp 9b 9: pxor xmm0, xmm2 pxor xmm1, xmm3 mov eax, r13d cmp rdx, r15 jne 2b movups xmmword ptr [rbx], xmm0 movups xmmword ptr [rbx+0x10], xmm1 jmp 4b
.p2align 6
/*
 * blake3_compress_in_place_sse41
 *
 * Runs the 7-round compression over one 64-byte message block and writes the
 * resulting 32 bytes back over the input state.
 *
 * Register arguments (System V AMD64):
 *   rdi  32-byte state, read at entry and overwritten at exit
 *   rsi  64-byte message block (read with unaligned loads)
 *   dl   byte that becomes lane 2 of the fourth state row
 *        (presumably block_len -- confirm against the C prototype)
 *   rcx  64-bit value that becomes lanes 0-1 of the fourth state row
 *        (presumably the block counter -- confirm against the C prototype)
 *   r8b  byte that becomes lane 3 of the fourth state row
 *        (presumably the flags -- confirm against the C prototype)
 *
 * Clobbers rax, rdx, xmm0-xmm9, xmm11, xmm14, xmm15 and the flags.
 */
blake3_compress_in_place_sse41:
_blake3_compress_in_place_sse41:
        _CET_ENDBR
        /* Rows 0-1 come from the caller state, row 2 from the IV table. */
        movups  xmm0, xmmword ptr [rdi]
        movups  xmm1, xmmword ptr [rdi+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
        /*
         * Row 3 = [rcx low, rcx high, dl, r8b].  Only the low byte of rdx and
         * r8 carries the argument, so zero-extend both before packing them
         * instead of trusting the upper register bits.  This matches the
         * sequence used by blake3_compress_xof_sse41.
         */
        movzx   eax, r8b
        movzx   edx, dl
        shl     rax, 32
        add     rdx, rax
        movq    xmm3, rcx
        movq    xmm4, rdx
        punpcklqdq xmm3, xmm4
        /*
         * Load the 16 message words and split them by parity:
         *   xmm4 = words 0,2,4,6      xmm5 = words 1,3,5,7
         *   xmm6 = words 14,8,10,12   xmm7 = words 15,9,11,13
         * (the second pair is pre-rotated by one lane for the diagonal step).
         */
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93
        /* Byte-shuffle masks used for the 8-bit and 16-bit lane rotations. */
        movaps  xmm14, xmmword ptr [ROT8+rip]
        movaps  xmm15, xmmword ptr [ROT16+rip]
        /* al counts the remaining rounds. */
        mov     al, 7
9:
        /* Column step, first half: add message, rotate by 16, then by 12. */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        /* Column step, second half: add message, rotate by 8, then by 7. */
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Rotate rows 0, 3 and 2 by one, two and three lanes to line up the diagonals. */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39
        /* Diagonal step, first half. */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        /* Diagonal step, second half. */
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Undo the lane rotation so the rows are column-aligned again. */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93
        dec     al
        jz      9f
        /* Not the last round: permute the message words in xmm4-xmm7 for the next round. */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pblendw xmm9, xmm8, 0xCC
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        pblendw xmm8, xmm6, 0xC0
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        /* Fold the lower half of the state into the upper half and store it back. */
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        movups  xmmword ptr [rdi], xmm0
        movups  xmmword ptr [rdi+0x10], xmm1
        ret
.p2align 6 blake3_compress_xof_sse41: _blake3_compress_xof_sse41: _CET_ENDBR movups xmm0, xmmword ptr [rdi] movups xmm1, xmmword ptr [rdi+0x10] movaps xmm2, xmmword ptr [BLAKE3_IV+rip] movzx eax, r8b movzx edx, dl shl rax, 32 add rdx, rax movq xmm3, rcx movq xmm4, rdx punpcklqdq xmm3, xmm4 movups xmm4, xmmword ptr [rsi] movups xmm5, xmmword ptr [rsi+0x10] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [rsi+0x20] movups xmm7, xmmword ptr [rsi+0x30] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 0x93 movaps xmm14, xmmword ptr [ROT8+rip] movaps xmm15, xmmword ptr [ROT16+rip] mov al, 7 9: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x39 paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1,
20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x93 dec al jz 9f movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0x0F pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pblendw xmm9, xmm8, 0xCC movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 pblendw xmm8, xmm6, 0xC0 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp 9b 9: movdqu xmm4, xmmword ptr [rdi] movdqu xmm5, xmmword ptr [rdi+0x10] pxor xmm0, xmm2 pxor xmm1, xmm3 pxor xmm2, xmm4 pxor xmm3, xmm5 movups xmmword ptr [r9], xmm0 movups xmmword ptr [r9+0x10], xmm1 movups xmmword ptr [r9+0x20], xmm2 movups xmmword ptr [r9+0x30], xmm3 ret #ifdef __APPLE__ .static_data #else .section .rodata #endif .p2align 6 BLAKE3_IV: .long 0x6A09E667, 0xBB67AE85 .long 0x3C6EF372, 0xA54FF53A ROT16: .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 ROT8: .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 ADD0: .long 0, 1, 2, 3 ADD1: .long 4, 4, 4, 4 BLAKE3_IV_0: .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 BLAKE3_IV_1: .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 BLAKE3_IV_2: .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 BLAKE3_IV_3: .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A BLAKE3_BLOCK_LEN: .long 64, 64, 64, 64 CMP_MSB_MASK: .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 blake3-1.5.4/c/blake3_sse41_x86-64_windows_gnu.S000064400000000000000000001737431046102023000171200ustar 00000000000000.intel_syntax noprefix .global blake3_hash_many_sse41 .global _blake3_hash_many_sse41 .global blake3_compress_in_place_sse41 .global _blake3_compress_in_place_sse41 .global blake3_compress_xof_sse41 .global _blake3_compress_xof_sse41 .section .text .p2align 6 
_blake3_hash_many_sse41: blake3_hash_many_sse41: push r15 push r14 push r13 push r12 push rsi push rdi push rbx push rbp mov rbp, rsp sub rsp, 528 and rsp, 0xFFFFFFFFFFFFFFC0 movdqa xmmword ptr [rsp+0x170], xmm6 movdqa xmmword ptr [rsp+0x180], xmm7 movdqa xmmword ptr [rsp+0x190], xmm8 movdqa xmmword ptr [rsp+0x1A0], xmm9 movdqa xmmword ptr [rsp+0x1B0], xmm10 movdqa xmmword ptr [rsp+0x1C0], xmm11 movdqa xmmword ptr [rsp+0x1D0], xmm12 movdqa xmmword ptr [rsp+0x1E0], xmm13 movdqa xmmword ptr [rsp+0x1F0], xmm14 movdqa xmmword ptr [rsp+0x200], xmm15 mov rdi, rcx mov rsi, rdx mov rdx, r8 mov rcx, r9 mov r8, qword ptr [rbp+0x68] movzx r9, byte ptr [rbp+0x70] neg r9d movd xmm0, r9d pshufd xmm0, xmm0, 0x00 movdqa xmmword ptr [rsp+0x130], xmm0 movdqa xmm1, xmm0 pand xmm1, xmmword ptr [ADD0+rip] pand xmm0, xmmword ptr [ADD1+rip] movdqa xmmword ptr [rsp+0x150], xmm0 movd xmm0, r8d pshufd xmm0, xmm0, 0x00 paddd xmm0, xmm1 movdqa xmmword ptr [rsp+0x110], xmm0 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] pcmpgtd xmm1, xmm0 shr r8, 32 movd xmm2, r8d pshufd xmm2, xmm2, 0x00 psubd xmm2, xmm1 movdqa xmmword ptr [rsp+0x120], xmm2 mov rbx, qword ptr [rbp+0x90] mov r15, rdx shl r15, 6 movzx r13d, byte ptr [rbp+0x78] movzx r12d, byte ptr [rbp+0x88] cmp rsi, 4 jc 3f 2: movdqu xmm3, xmmword ptr [rcx] pshufd xmm0, xmm3, 0x00 pshufd xmm1, xmm3, 0x55 pshufd xmm2, xmm3, 0xAA pshufd xmm3, xmm3, 0xFF movdqu xmm7, xmmword ptr [rcx+0x10] pshufd xmm4, xmm7, 0x00 pshufd xmm5, xmm7, 0x55 pshufd xmm6, xmm7, 0xAA pshufd xmm7, xmm7, 0xFF mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] movzx eax, byte ptr [rbp+0x80] or eax, r13d xor edx, edx 9: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movdqu xmm8, xmmword ptr [r8+rdx-0x40] movdqu xmm9, xmmword ptr [r9+rdx-0x40] movdqu xmm10, xmmword ptr [r10+rdx-0x40] movdqu xmm11, xmmword ptr [r11+rdx-0x40] movdqa xmm12, xmm8 punpckldq xmm8, 
xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp], xmm8 movdqa xmmword ptr [rsp+0x10], xmm9 movdqa xmmword ptr [rsp+0x20], xmm12 movdqa xmmword ptr [rsp+0x30], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-0x30] movdqu xmm9, xmmword ptr [r9+rdx-0x30] movdqu xmm10, xmmword ptr [r10+rdx-0x30] movdqu xmm11, xmmword ptr [r11+rdx-0x30] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0x40], xmm8 movdqa xmmword ptr [rsp+0x50], xmm9 movdqa xmmword ptr [rsp+0x60], xmm12 movdqa xmmword ptr [rsp+0x70], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-0x20] movdqu xmm9, xmmword ptr [r9+rdx-0x20] movdqu xmm10, xmmword ptr [r10+rdx-0x20] movdqu xmm11, xmmword ptr [r11+rdx-0x20] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0x80], xmm8 movdqa xmmword ptr [rsp+0x90], xmm9 movdqa xmmword ptr [rsp+0xA0], xmm12 movdqa xmmword ptr [rsp+0xB0], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-0x10] movdqu xmm9, xmmword ptr [r9+rdx-0x10] movdqu xmm10, xmmword ptr [r10+rdx-0x10] movdqu xmm11, xmmword ptr [r11+rdx-0x10] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0xC0], xmm8 movdqa xmmword ptr [rsp+0xD0], xmm9 movdqa 
xmmword ptr [rsp+0xE0], xmm12 movdqa xmmword ptr [rsp+0xF0], xmm13 movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] movdqa xmm12, xmmword ptr [rsp+0x110] movdqa xmm13, xmmword ptr [rsp+0x120] movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] movd xmm15, eax pshufd xmm15, xmm15, 0x00 prefetcht0 [r8+rdx+0x80] prefetcht0 [r9+rdx+0x80] prefetcht0 [r10+rdx+0x80] prefetcht0 [r11+rdx+0x80] paddd xmm0, xmmword ptr [rsp] paddd xmm1, xmmword ptr [rsp+0x20] paddd xmm2, xmmword ptr [rsp+0x40] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x10] paddd xmm1, xmmword ptr [rsp+0x30] paddd xmm2, xmmword ptr [rsp+0x50] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, 
xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x80] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp+0xC0] paddd xmm3, xmmword ptr [rsp+0xE0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x90] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0xD0] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x20] paddd xmm1, xmmword ptr [rsp+0x30] paddd xmm2, xmmword ptr [rsp+0x70] paddd xmm3, 
xmmword ptr [rsp+0x40] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x60] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp] paddd xmm3, xmmword ptr [rsp+0xD0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x10] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0x90] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd 
xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xB0] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp+0xE0] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x30] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp+0xD0] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld 
xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x40] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0x20] paddd xmm3, xmmword ptr [rsp+0xE0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x60] paddd xmm1, xmmword ptr [rsp+0x90] paddd xmm2, xmmword ptr [rsp+0xB0] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x50] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+0xF0] paddd xmm3, xmmword ptr [rsp+0x10] paddd 
xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xA0] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0xE0] paddd xmm3, xmmword ptr [rsp+0xD0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x70] paddd xmm1, xmmword ptr [rsp+0x90] paddd xmm2, xmmword ptr [rsp+0x30] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, 
xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x40] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0x50] paddd xmm3, xmmword ptr [rsp+0x10] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp] paddd xmm1, xmmword ptr [rsp+0x20] paddd xmm2, xmmword ptr [rsp+0x80] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa 
xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xC0] paddd xmm1, xmmword ptr [rsp+0x90] paddd xmm2, xmmword ptr [rsp+0xF0] paddd xmm3, xmmword ptr [rsp+0xE0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xD0] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0xA0] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x70] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd 
xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x20] paddd xmm1, xmmword ptr [rsp+0x30] paddd xmm2, xmmword ptr [rsp+0x10] paddd xmm3, xmmword ptr [rsp+0x40] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x90] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0x80] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, 
xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xE0] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp+0xC0] paddd xmm3, xmmword ptr [rsp+0x10] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xD0] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+0x20] paddd xmm3, xmmword ptr [rsp+0x40] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 
20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x30] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp+0x60] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xB0] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp+0x10] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xF0] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+0x90] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor 
xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xE0] paddd xmm1, xmmword ptr [rsp+0x20] paddd xmm2, xmmword ptr [rsp+0x30] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xA0] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0x40] paddd xmm3, xmmword ptr [rsp+0xD0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor 
xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 pxor xmm0, xmm8 pxor xmm1, xmm9 pxor xmm2, xmm10 pxor xmm3, xmm11 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 pxor xmm4, xmm12 pxor xmm5, xmm13 pxor xmm6, xmm14 pxor xmm7, xmm15 mov eax, r13d jne 9b movdqa xmm9, xmm0 punpckldq xmm0, xmm1 punpckhdq xmm9, xmm1 movdqa xmm11, xmm2 punpckldq xmm2, xmm3 punpckhdq xmm11, xmm3 movdqa xmm1, xmm0 punpcklqdq xmm0, xmm2 punpckhqdq xmm1, xmm2 movdqa xmm3, xmm9 punpcklqdq xmm9, xmm11 punpckhqdq xmm3, xmm11 movdqu xmmword ptr [rbx], xmm0 movdqu xmmword ptr [rbx+0x20], xmm1 movdqu xmmword ptr [rbx+0x40], xmm9 movdqu xmmword ptr [rbx+0x60], xmm3 movdqa xmm9, xmm4 punpckldq xmm4, xmm5 punpckhdq xmm9, xmm5 movdqa xmm11, xmm6 punpckldq xmm6, xmm7 punpckhdq xmm11, xmm7 movdqa xmm5, xmm4 punpcklqdq xmm4, xmm6 punpckhqdq xmm5, xmm6 movdqa xmm7, xmm9 punpcklqdq xmm9, xmm11 punpckhqdq xmm7, xmm11 movdqu xmmword ptr [rbx+0x10], xmm4 movdqu xmmword ptr [rbx+0x30], xmm5 movdqu xmmword ptr [rbx+0x50], xmm9 movdqu xmmword ptr [rbx+0x70], xmm7 movdqa xmm1, xmmword ptr [rsp+0x110] movdqa xmm0, xmm1 paddd xmm1, xmmword ptr [rsp+0x150] movdqa xmmword ptr [rsp+0x110], xmm1 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] pcmpgtd xmm0, xmm1 movdqa xmm1, xmmword ptr [rsp+0x120] psubd xmm1, xmm0 movdqa xmmword ptr [rsp+0x120], xmm1 add rbx, 128 add rdi, 32 sub rsi, 4 cmp rsi, 4 jnc 2b test rsi, rsi jne 3f 4: movdqa xmm6, xmmword ptr [rsp+0x170] movdqa xmm7, xmmword ptr [rsp+0x180] movdqa xmm8, xmmword ptr [rsp+0x190] movdqa xmm9, xmmword ptr [rsp+0x1A0] movdqa xmm10, xmmword ptr [rsp+0x1B0] movdqa xmm11, xmmword ptr [rsp+0x1C0] movdqa xmm12, xmmword ptr [rsp+0x1D0] movdqa xmm13, xmmword ptr [rsp+0x1E0] movdqa xmm14, xmmword ptr [rsp+0x1F0] movdqa xmm15, 
xmmword ptr [rsp+0x200] mov rsp, rbp pop rbp pop rbx pop rdi pop rsi pop r12 pop r13 pop r14 pop r15 ret .p2align 5 3: test esi, 0x2 je 3f movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+0x10] movaps xmm8, xmm0 movaps xmm9, xmm1 movd xmm13, dword ptr [rsp+0x110] pinsrd xmm13, dword ptr [rsp+0x120], 1 pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 movaps xmmword ptr [rsp], xmm13 movd xmm14, dword ptr [rsp+0x114] pinsrd xmm14, dword ptr [rsp+0x124], 1 pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 movaps xmmword ptr [rsp+0x10], xmm14 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] movzx eax, byte ptr [rbp+0x80] or eax, r13d xor edx, edx 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movaps xmm2, xmmword ptr [BLAKE3_IV+rip] movaps xmm10, xmm2 movups xmm4, xmmword ptr [r8+rdx-0x40] movups xmm5, xmmword ptr [r8+rdx-0x30] movaps xmm3, xmm4 shufps xmm4, xmm5, 136 shufps xmm3, xmm5, 221 movaps xmm5, xmm3 movups xmm6, xmmword ptr [r8+rdx-0x20] movups xmm7, xmmword ptr [r8+rdx-0x10] movaps xmm3, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm3, xmm7, 221 pshufd xmm7, xmm3, 0x93 movups xmm12, xmmword ptr [r9+rdx-0x40] movups xmm13, xmmword ptr [r9+rdx-0x30] movaps xmm11, xmm12 shufps xmm12, xmm13, 136 shufps xmm11, xmm13, 221 movaps xmm13, xmm11 movups xmm14, xmmword ptr [r9+rdx-0x20] movups xmm15, xmmword ptr [r9+rdx-0x10] movaps xmm11, xmm14 shufps xmm14, xmm15, 136 pshufd xmm14, xmm14, 0x93 shufps xmm11, xmm15, 221 pshufd xmm15, xmm11, 0x93 movaps xmm3, xmmword ptr [rsp] movaps xmm11, xmmword ptr [rsp+0x10] pinsrd xmm3, eax, 3 pinsrd xmm11, eax, 3 mov al, 7 9: paddd xmm0, xmm4 paddd xmm8, xmm12 movaps xmmword ptr [rsp+0x20], xmm4 movaps xmmword ptr [rsp+0x30], xmm12 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 movaps xmm12, xmmword ptr [ROT16+rip] pshufb xmm3, xmm12 pshufb xmm11, xmm12 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 20 
psrld xmm4, 12 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 20 psrld xmm4, 12 por xmm9, xmm4 paddd xmm0, xmm5 paddd xmm8, xmm13 movaps xmmword ptr [rsp+0x40], xmm5 movaps xmmword ptr [rsp+0x50], xmm13 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 movaps xmm13, xmmword ptr [ROT8+rip] pshufb xmm3, xmm13 pshufb xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 25 psrld xmm4, 7 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 25 psrld xmm4, 7 por xmm9, xmm4 pshufd xmm0, xmm0, 0x93 pshufd xmm8, xmm8, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm11, xmm11, 0x4E pshufd xmm2, xmm2, 0x39 pshufd xmm10, xmm10, 0x39 paddd xmm0, xmm6 paddd xmm8, xmm14 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 pshufb xmm3, xmm12 pshufb xmm11, xmm12 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 20 psrld xmm4, 12 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 20 psrld xmm4, 12 por xmm9, xmm4 paddd xmm0, xmm7 paddd xmm8, xmm15 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 pshufb xmm3, xmm13 pshufb xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 25 psrld xmm4, 7 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 25 psrld xmm4, 7 por xmm9, xmm4 pshufd xmm0, xmm0, 0x39 pshufd xmm8, xmm8, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm11, xmm11, 0x4E pshufd xmm2, xmm2, 0x93 pshufd xmm10, xmm10, 0x93 dec al je 9f movdqa xmm12, xmmword ptr [rsp+0x20] movdqa xmm5, xmmword ptr [rsp+0x40] pshufd xmm13, xmm12, 0x0F shufps xmm12, xmm5, 214 pshufd xmm4, xmm12, 0x39 movdqa xmm12, xmm6 shufps xmm12, xmm7, 250 pblendw xmm13, xmm12, 0xCC movdqa xmm12, xmm7 punpcklqdq xmm12, xmm5 pblendw xmm12, xmm6, 0xC0 pshufd xmm12, xmm12, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmmword ptr [rsp+0x20], xmm13 movdqa xmmword ptr [rsp+0x40], xmm12 movdqa xmm5, xmmword ptr 
[rsp+0x30] movdqa xmm13, xmmword ptr [rsp+0x50] pshufd xmm6, xmm5, 0x0F shufps xmm5, xmm13, 214 pshufd xmm12, xmm5, 0x39 movdqa xmm5, xmm14 shufps xmm5, xmm15, 250 pblendw xmm6, xmm5, 0xCC movdqa xmm5, xmm15 punpcklqdq xmm5, xmm13 pblendw xmm5, xmm14, 0xC0 pshufd xmm5, xmm5, 0x78 punpckhdq xmm13, xmm15 punpckldq xmm14, xmm13 pshufd xmm15, xmm14, 0x1E movdqa xmm13, xmm6 movdqa xmm14, xmm5 movdqa xmm5, xmmword ptr [rsp+0x20] movdqa xmm6, xmmword ptr [rsp+0x40] jmp 9b 9: pxor xmm0, xmm2 pxor xmm1, xmm3 pxor xmm8, xmm10 pxor xmm9, xmm11 mov eax, r13d cmp rdx, r15 jne 2b movups xmmword ptr [rbx], xmm0 movups xmmword ptr [rbx+0x10], xmm1 movups xmmword ptr [rbx+0x20], xmm8 movups xmmword ptr [rbx+0x30], xmm9 movdqa xmm0, xmmword ptr [rsp+0x130] movdqa xmm1, xmmword ptr [rsp+0x110] movdqa xmm2, xmmword ptr [rsp+0x120] movdqu xmm3, xmmword ptr [rsp+0x118] movdqu xmm4, xmmword ptr [rsp+0x128] blendvps xmm1, xmm3, xmm0 blendvps xmm2, xmm4, xmm0 movdqa xmmword ptr [rsp+0x110], xmm1 movdqa xmmword ptr [rsp+0x120], xmm2 add rdi, 16 add rbx, 64 sub rsi, 2 3: test esi, 0x1 je 4b movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+0x10] movd xmm13, dword ptr [rsp+0x110] pinsrd xmm13, dword ptr [rsp+0x120], 1 pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 movaps xmm14, xmmword ptr [ROT8+rip] movaps xmm15, xmmword ptr [ROT16+rip] mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+0x80] or eax, r13d xor edx, edx 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movaps xmm2, xmmword ptr [BLAKE3_IV+rip] movaps xmm3, xmm13 pinsrd xmm3, eax, 3 movups xmm4, xmmword ptr [r8+rdx-0x40] movups xmm5, xmmword ptr [r8+rdx-0x30] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [r8+rdx-0x20] movups xmm7, xmmword ptr [r8+rdx-0x10] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 0x93 mov al, 7 9: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb 
xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x39 paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x93 dec al jz 9f movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0x0F pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pblendw xmm9, xmm8, 0xCC movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 pblendw xmm8, xmm6, 0xC0 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp 9b 9: pxor xmm0, xmm2 pxor xmm1, xmm3 mov eax, r13d cmp rdx, r15 jne 2b movups xmmword ptr [rbx], xmm0 movups xmmword ptr [rbx+0x10], xmm1 jmp 4b
/*
 * blake3_compress_in_place_sse41 (alias _blake3_compress_in_place_sse41)
 *
 * Runs the seven-round loop below over a 4x4 state of 32-bit words kept in
 * xmm0-xmm3 (one row per register) and writes the 32-byte result back over
 * the buffer addressed by rcx.
 *
 * Inputs, as read by the code:
 *   rcx          pointer to 32 bytes - loaded into state rows 0-1 and
 *                overwritten with the result
 *   rdx          pointer to 64 bytes of message words
 *   r8b          byte that ends up in lane 2 of state row 3
 *   r9           64-bit value that fills lanes 0-1 of state row 3
 *   [rsp+0xA0]   byte that ends up in lane 3 of state row 3. The offset is
 *                taken after the 120-byte frame is reserved, so it is
 *                [rsp+0x28] at entry - the fifth-argument stack slot of the
 *                Microsoft x64 calling convention.
 * NOTE(review): these line up with cv, block, block_len, counter and flags of
 * the BLAKE3 C prototype, which is not visible in this span - confirm.
 *
 * Clobbers rax, r8, flags and xmm0-xmm5. xmm6-xmm9, xmm11, xmm14 and xmm15
 * are spilled on entry and reloaded before ret.
 */
.p2align 6
blake3_compress_in_place_sse41:
_blake3_compress_in_place_sse41:
        /* Reserve 120 bytes and spill the seven xmm6+ registers used below. */
        /* NOTE(review): 120 is 7*16+8. The extra 8 presumably restores 16-byte */
        /* alignment for the movdqa spills given the usual rsp at entry - confirm. */
        sub rsp, 120
        movdqa xmmword ptr [rsp], xmm6
        movdqa xmmword ptr [rsp+0x10], xmm7
        movdqa xmmword ptr [rsp+0x20], xmm8
        movdqa xmmword ptr [rsp+0x30], xmm9
        movdqa xmmword ptr [rsp+0x40], xmm11
        movdqa xmmword ptr [rsp+0x50], xmm14
        movdqa xmmword ptr [rsp+0x60], xmm15
        /* State rows 0-1 come from the caller buffer, row 2 from BLAKE3_IV. */
        movups xmm0, xmmword ptr [rcx]
        movups xmm1, xmmword ptr [rcx+0x10]
        movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
        /* Build row 3: r8 becomes (stack byte << 32) + r8b, then */
        /* xmm3 = { r9 low, r9 high, r8b, stack byte } as four 32-bit lanes. */
        movzx eax, byte ptr [rsp+0xA0]
        movzx r8d, r8b
        shl rax, 32
        add r8, rax
        movq xmm3, r9
        movq xmm4, r8
        punpcklqdq xmm3, xmm4
        /* Split the first 32 message bytes: shufps 136 keeps the even-indexed */
        /* dwords (words 0,2,4,6 in xmm4), shufps 221 the odd ones (1,3,5,7 in xmm5). */
        movups xmm4, xmmword ptr [rdx]
        movups xmm5, xmmword ptr [rdx+0x10]
        movaps xmm8, xmm4
        shufps xmm4, xmm5, 136
        shufps xmm8, xmm5, 221
        movaps xmm5, xmm8
        /* Same even/odd split for the second 32 bytes (words 8-15), with each */
        /* result additionally rotated by one lane through pshufd 0x93. */
        movups xmm6, xmmword ptr [rdx+0x20]
        movups xmm7, xmmword ptr [rdx+0x30]
        movaps xmm8, xmm6
        shufps xmm6, xmm7, 136
        pshufd xmm6, xmm6, 0x93
        shufps xmm8, xmm7, 221
        pshufd xmm7, xmm8, 0x93
        /* Keep the two pshufb rotation masks in registers for the whole loop. */
        movaps xmm14, xmmword ptr [ROT8+rip]
        movaps xmm15, xmmword ptr [ROT16+rip]
        /* al counts the remaining rounds. */
        mov al, 7
9:
        /* First mixing step: row0 += xmm4 + row1, row3 = (row3 ^ row0) rotated */
        /* by 16 bits, row2 += row3, row1 = (row1 ^ row2) rotated right by 12. */
        paddd xmm0, xmm4
        paddd xmm0, xmm1
        pxor xmm3, xmm0
        pshufb xmm3, xmm15
        paddd xmm2, xmm3
        pxor xmm1, xmm2
        movdqa xmm11, xmm1
        pslld xmm1, 20
        psrld xmm11, 12
        por xmm1, xmm11
        /* Second mixing step: same shape with xmm5, rotations by 8 and 7. */
        paddd xmm0, xmm5
        paddd xmm0, xmm1
        pxor xmm3, xmm0
        pshufb xmm3, xmm14
        paddd xmm2, xmm3
        pxor xmm1, xmm2
        movdqa xmm11, xmm1
        pslld xmm1, 25
        psrld xmm11, 7
        por xmm1, xmm11
        /* Rotate the lanes of rows 0, 3 and 2 so the next two steps combine */
        /* different lanes of the four rows (the diagonal half of the round). */
        pshufd xmm0, xmm0, 0x93
        pshufd xmm3, xmm3, 0x4E
        pshufd xmm2, xmm2, 0x39
        /* Third mixing step: xmm6, rotations by 16 and 12. */
        paddd xmm0, xmm6
        paddd xmm0, xmm1
        pxor xmm3, xmm0
        pshufb xmm3, xmm15
        paddd xmm2, xmm3
        pxor xmm1, xmm2
        movdqa xmm11, xmm1
        pslld xmm1, 20
        psrld xmm11, 12
        por xmm1, xmm11
        /* Fourth mixing step: xmm7, rotations by 8 and 7. */
        paddd xmm0, xmm7
        paddd xmm0, xmm1
        pxor xmm3, xmm0
        pshufb xmm3, xmm14
        paddd xmm2, xmm3
        pxor xmm1, xmm2
        movdqa xmm11, xmm1
        pslld xmm1, 25
        psrld xmm11, 7
        por xmm1, xmm11
        /* Undo the lane rotations (0x39 and 0x93 swap roles, 0x4E is its own inverse). */
        pshufd xmm0, xmm0, 0x39
        pshufd xmm3, xmm3, 0x4E
        pshufd xmm2, xmm2, 0x93
        /* Leave after the seventh round. The re-ordering below is skipped then. */
        dec al
        jz 9f
        /* Re-order the sixteen message words held in xmm4-xmm7 for the next */
        /* round, using xmm8 and xmm9 as scratch. */
        movdqa xmm8, xmm4
        shufps xmm8, xmm5, 214
        pshufd xmm9, xmm4, 0x0F
        pshufd xmm4, xmm8, 0x39
        movdqa xmm8, xmm6
        shufps xmm8, xmm7, 250
        pblendw xmm9, xmm8, 0xCC
        movdqa xmm8, xmm7
        punpcklqdq xmm8, xmm5
        pblendw xmm8, xmm6, 0xC0
        pshufd xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd xmm7, xmm6, 0x1E
        movdqa xmm5, xmm9
        movdqa xmm6, xmm8
        jmp 9b
9:
        /* Fold rows 2-3 into rows 0-1 and store the 32 bytes back through rcx. */
        pxor xmm0, xmm2
        pxor xmm1, xmm3
        movups xmmword ptr [rcx], xmm0
        movups xmmword ptr [rcx+0x10], xmm1
        /* Reload the spilled registers and release the frame. */
        movdqa xmm6, xmmword ptr [rsp]
        movdqa xmm7, xmmword ptr [rsp+0x10]
        movdqa xmm8, xmmword ptr [rsp+0x20]
        movdqa xmm9, xmmword ptr [rsp+0x30]
        movdqa xmm11, xmmword ptr [rsp+0x40]
        movdqa xmm14, xmmword ptr [rsp+0x50]
        movdqa xmm15, xmmword ptr [rsp+0x60]
        add rsp, 120
        ret
/*
 * blake3_compress_xof_sse41 (alias _blake3_compress_xof_sse41)
 *
 * Same state setup and seven-round loop as blake3_compress_in_place_sse41,
 * but the buffer at rcx is only read. A 64-byte result is stored through the
 * pointer loaded into r10:
 *   bytes  0-31  rows 0-1 XOR rows 2-3
 *   bytes 32-63  rows 2-3 XOR the original 32 bytes at rcx
 *
 * Inputs, as read by the code:
 *   rcx          pointer to 32 bytes loaded into state rows 0-1 (read only)
 *   rdx          pointer to 64 bytes of message words
 *   r8b          byte that ends up in lane 2 of state row 3
 *   r9           64-bit value that fills lanes 0-1 of state row 3
 *   [rsp+0xA0]   byte that ends up in lane 3 of state row 3
 *                ([rsp+0x28] at entry, the fifth-argument slot)
 *   [rsp+0xA8]   output pointer ([rsp+0x30] at entry, the sixth-argument slot)
 * NOTE(review): these line up with cv, block, block_len, counter, flags and
 * out of the BLAKE3 C prototype, which is not visible in this span - confirm.
 *
 * Clobbers rax, r8, r10, flags and xmm0-xmm5. xmm6-xmm9, xmm11, xmm14 and
 * xmm15 are spilled on entry and reloaded before ret.
 */
.p2align 6
_blake3_compress_xof_sse41:
blake3_compress_xof_sse41:
        /* Reserve 120 bytes and spill the seven xmm6+ registers used below. */
        sub rsp, 120
        movdqa xmmword ptr [rsp], xmm6
        movdqa xmmword ptr [rsp+0x10], xmm7
        movdqa xmmword ptr [rsp+0x20], xmm8
        movdqa xmmword ptr [rsp+0x30], xmm9
        movdqa xmmword ptr [rsp+0x40], xmm11
        movdqa xmmword ptr [rsp+0x50], xmm14
        movdqa xmmword ptr [rsp+0x60], xmm15
        /* State rows 0-1 come from the caller buffer, row 2 from BLAKE3_IV. */
        movups xmm0, xmmword ptr [rcx]
        movups xmm1, xmmword ptr [rcx+0x10]
        movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
        /* Build row 3 = { r9 low, r9 high, r8b, stack byte }. The output */
        /* pointer is fetched into r10 along the way. */
        movzx eax, byte ptr [rsp+0xA0]
        movzx r8d, r8b
        mov r10, qword ptr [rsp+0xA8]
        shl rax, 32
        add r8, rax
        movq xmm3, r9
        movq xmm4, r8
        punpcklqdq xmm3, xmm4
        /* Even-indexed message words 0,2,4,6 into xmm4, odd ones 1,3,5,7 into xmm5. */
        movups xmm4, xmmword ptr [rdx]
        movups xmm5, xmmword ptr [rdx+0x10]
        movaps xmm8, xmm4
        shufps xmm4, xmm5, 136
        shufps xmm8, xmm5, 221
        movaps xmm5, xmm8
        /* Words 8-15 get the same even/odd split plus a one-lane rotation. */
        movups xmm6, xmmword ptr [rdx+0x20]
        movups xmm7, xmmword ptr [rdx+0x30]
        movaps xmm8, xmm6
        shufps xmm6, xmm7, 136
        pshufd xmm6, xmm6, 0x93
        shufps xmm8, xmm7, 221
        pshufd xmm7, xmm8, 0x93
        /* Rotation masks for pshufb stay resident in xmm14 and xmm15. */
        movaps xmm14, xmmword ptr [ROT8+rip]
        movaps xmm15, xmmword ptr [ROT16+rip]
        /* al counts the remaining rounds. */
        mov al, 7
9:
        /* First mixing step: xmm4, rotations by 16 and 12. */
        paddd xmm0, xmm4
        paddd xmm0, xmm1
        pxor xmm3, xmm0
        pshufb xmm3, xmm15
        paddd xmm2, xmm3
        pxor xmm1, xmm2
        movdqa xmm11, xmm1
        pslld xmm1, 20
        psrld xmm11, 12
        por xmm1, xmm11
        /* Second mixing step: xmm5, rotations by 8 and 7. */
        paddd xmm0, xmm5
        paddd xmm0, xmm1
        pxor xmm3, xmm0
        pshufb xmm3, xmm14
        paddd xmm2, xmm3
        pxor xmm1, xmm2
        movdqa xmm11, xmm1
        pslld xmm1, 25
        psrld xmm11, 7
        por xmm1, xmm11
        /* Rotate the lanes of rows 0, 3 and 2 for the diagonal half of the round. */
        pshufd xmm0, xmm0, 0x93
        pshufd xmm3, xmm3, 0x4E
        pshufd xmm2, xmm2, 0x39
        /* Third mixing step: xmm6, rotations by 16 and 12. */
        paddd xmm0, xmm6
        paddd xmm0, xmm1
        pxor xmm3, xmm0
        pshufb xmm3, xmm15
        paddd xmm2, xmm3
        pxor xmm1, xmm2
        movdqa xmm11, xmm1
        pslld xmm1, 20
        psrld xmm11, 12
        por xmm1, xmm11
        /* Fourth mixing step: xmm7, rotations by 8 and 7. */
        paddd xmm0, xmm7
        paddd xmm0, xmm1
        pxor xmm3, xmm0
        pshufb xmm3, xmm14
        paddd xmm2, xmm3
        pxor xmm1, xmm2
        movdqa xmm11, xmm1
        pslld xmm1, 25
        psrld xmm11, 7
        por xmm1, xmm11
        /* Undo the lane rotations. */
        pshufd xmm0, xmm0, 0x39
        pshufd xmm3, xmm3, 0x4E
        pshufd xmm2, xmm2, 0x93
        /* Leave after the seventh round. The re-ordering below is skipped then. */
        dec al
        jz 9f
        /* Re-order the message words in xmm4-xmm7 for the next round, */
        /* using xmm8 and xmm9 as scratch. */
        movdqa xmm8, xmm4
        shufps xmm8, xmm5, 214
        pshufd xmm9, xmm4, 0x0F
        pshufd xmm4, xmm8, 0x39
        movdqa xmm8, xmm6
        shufps xmm8, xmm7, 250
        pblendw xmm9, xmm8, 0xCC
        movdqa xmm8, xmm7
        punpcklqdq xmm8, xmm5
        pblendw xmm8, xmm6, 0xC0
        pshufd xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd xmm7, xmm6, 0x1E
        movdqa xmm5, xmm9
        movdqa xmm6, xmm8
        jmp 9b
9:
        /* The message registers are free now, so xmm4 and xmm5 receive the */
        /* original 32 bytes at rcx. */
        movdqu xmm4, xmmword ptr [rcx]
        movdqu xmm5, xmmword ptr [rcx+0x10]
        /* Rows 0-1 take rows 2-3 first, while xmm2 and xmm3 are still unmodified. */
        pxor xmm0, xmm2
        pxor xmm1, xmm3
        /* Rows 2-3 are then combined with the original input bytes. */
        pxor xmm2, xmm4
        pxor xmm3, xmm5
        /* Store all four rows, 64 bytes, through r10. */
        movups xmmword ptr [r10], xmm0
        movups xmmword ptr [r10+0x10], xmm1
        movups xmmword ptr [r10+0x20], xmm2
        movups xmmword ptr [r10+0x30], xmm3
        /* Reload the spilled registers and release the frame. */
        movdqa xmm6, xmmword ptr [rsp]
        movdqa xmm7, xmmword ptr [rsp+0x10]
        movdqa xmm8, xmmword ptr [rsp+0x20]
        movdqa xmm9, xmmword ptr [rsp+0x30]
        movdqa xmm11, xmmword ptr [rsp+0x40]
        movdqa xmm14, xmmword ptr [rsp+0x50]
        movdqa xmm15, xmmword ptr [rsp+0x60]
        add rsp, 120
        ret
/*
 * Constant tables, placed in .rdata and aligned to 64 bytes. Every table is
 * 16 bytes long, so each label stays 16-byte aligned for the aligned loads
 * (movaps and movdqa) that reference them.
 */
.section .rdata
.p2align 6
/* Four 32-bit IV words, loaded as one row into xmm2 by the compress routines. */
BLAKE3_IV:
        .long 0x6A09E667, 0xBB67AE85
        .long 0x3C6EF372, 0xA54FF53A
/* pshufb mask: rotates every 32-bit lane by 16 bits. */
ROT16:
        .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
/* pshufb mask: rotates every 32-bit lane right by 8 bits. */
ROT8:
        .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
/* NOTE(review): ADD0 and ADD1 are not referenced in this span. Presumably the */
/* per-lane counter offsets and the 4-way counter step used by */
/* blake3_hash_many_sse41 - confirm against its prologue. */
ADD0:
        .long 0, 1, 2, 3
ADD1:
        .long 4, 4, 4, 4
/* Each IV word repeated in all four lanes. */
/* NOTE(review): not referenced in this span, presumably used by the 4-way */
/* path of blake3_hash_many_sse41 - confirm. */
BLAKE3_IV_0:
        .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
BLAKE3_IV_1:
        .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
BLAKE3_IV_2:
        .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
BLAKE3_IV_3:
        .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
/* The value 64 in every lane. The 2-way and 1-way tails of */
/* blake3_hash_many_sse41 insert its first dword into lane 2 of state row 3. */
BLAKE3_BLOCK_LEN:
        .long 64, 64, 64, 64
/* Sign-bit mask. blake3_hash_many_sse41 XORs both operands with it before */
/* pcmpgtd, which turns the signed compare into an unsigned one when it */
/* checks the low counter words for wrap-around. */
CMP_MSB_MASK:
        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
blake3-1.5.4/c/blake3_sse41_x86-64_windows_msvc.asm000064400000000000000000001733321046102023000176470ustar 00000000000000public _blake3_hash_many_sse41 public blake3_hash_many_sse41 public blake3_compress_in_place_sse41 public _blake3_compress_in_place_sse41 public blake3_compress_xof_sse41 public _blake3_compress_xof_sse41 _TEXT SEGMENT ALIGN(16) 'CODE' ALIGN 16 blake3_hash_many_sse41 PROC _blake3_hash_many_sse41 PROC push r15 push r14 push r13 push r12 push rsi push rdi push rbx push rbp mov rbp, rsp sub rsp, 528 and rsp, 0FFFFFFFFFFFFFFC0H movdqa xmmword ptr [rsp+170H], xmm6 movdqa xmmword ptr [rsp+180H], xmm7 movdqa xmmword ptr [rsp+190H], xmm8 movdqa xmmword ptr [rsp+1A0H], xmm9 movdqa xmmword ptr [rsp+1B0H], xmm10 movdqa xmmword ptr [rsp+1C0H], xmm11 movdqa xmmword ptr [rsp+1D0H], xmm12 movdqa
xmmword ptr [rsp+1E0H], xmm13 movdqa xmmword ptr [rsp+1F0H], xmm14 movdqa xmmword ptr [rsp+200H], xmm15 mov rdi, rcx mov rsi, rdx mov rdx, r8 mov rcx, r9 mov r8, qword ptr [rbp+68H] movzx r9, byte ptr [rbp+70H] neg r9d movd xmm0, r9d pshufd xmm0, xmm0, 00H movdqa xmmword ptr [rsp+130H], xmm0 movdqa xmm1, xmm0 pand xmm1, xmmword ptr [ADD0] pand xmm0, xmmword ptr [ADD1] movdqa xmmword ptr [rsp+150H], xmm0 movd xmm0, r8d pshufd xmm0, xmm0, 00H paddd xmm0, xmm1 movdqa xmmword ptr [rsp+110H], xmm0 pxor xmm0, xmmword ptr [CMP_MSB_MASK] pxor xmm1, xmmword ptr [CMP_MSB_MASK] pcmpgtd xmm1, xmm0 shr r8, 32 movd xmm2, r8d pshufd xmm2, xmm2, 00H psubd xmm2, xmm1 movdqa xmmword ptr [rsp+120H], xmm2 mov rbx, qword ptr [rbp+90H] mov r15, rdx shl r15, 6 movzx r13d, byte ptr [rbp+78H] movzx r12d, byte ptr [rbp+88H] cmp rsi, 4 jc final3blocks outerloop4: movdqu xmm3, xmmword ptr [rcx] pshufd xmm0, xmm3, 00H pshufd xmm1, xmm3, 55H pshufd xmm2, xmm3, 0AAH pshufd xmm3, xmm3, 0FFH movdqu xmm7, xmmword ptr [rcx+10H] pshufd xmm4, xmm7, 00H pshufd xmm5, xmm7, 55H pshufd xmm6, xmm7, 0AAH pshufd xmm7, xmm7, 0FFH mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+8H] mov r10, qword ptr [rdi+10H] mov r11, qword ptr [rdi+18H] movzx eax, byte ptr [rbp+80H] or eax, r13d xor edx, edx innerloop4: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movdqu xmm8, xmmword ptr [r8+rdx-40H] movdqu xmm9, xmmword ptr [r9+rdx-40H] movdqu xmm10, xmmword ptr [r10+rdx-40H] movdqu xmm11, xmmword ptr [r11+rdx-40H] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp], xmm8 movdqa xmmword ptr [rsp+10H], xmm9 movdqa xmmword ptr [rsp+20H], xmm12 movdqa xmmword ptr [rsp+30H], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-30H] movdqu xmm9, xmmword ptr [r9+rdx-30H] movdqu xmm10, 
xmmword ptr [r10+rdx-30H] movdqu xmm11, xmmword ptr [r11+rdx-30H] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+40H], xmm8 movdqa xmmword ptr [rsp+50H], xmm9 movdqa xmmword ptr [rsp+60H], xmm12 movdqa xmmword ptr [rsp+70H], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-20H] movdqu xmm9, xmmword ptr [r9+rdx-20H] movdqu xmm10, xmmword ptr [r10+rdx-20H] movdqu xmm11, xmmword ptr [r11+rdx-20H] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+80H], xmm8 movdqa xmmword ptr [rsp+90H], xmm9 movdqa xmmword ptr [rsp+0A0H], xmm12 movdqa xmmword ptr [rsp+0B0H], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-10H] movdqu xmm9, xmmword ptr [r9+rdx-10H] movdqu xmm10, xmmword ptr [r10+rdx-10H] movdqu xmm11, xmmword ptr [r11+rdx-10H] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0C0H], xmm8 movdqa xmmword ptr [rsp+0D0H], xmm9 movdqa xmmword ptr [rsp+0E0H], xmm12 movdqa xmmword ptr [rsp+0F0H], xmm13 movdqa xmm9, xmmword ptr [BLAKE3_IV_1] movdqa xmm10, xmmword ptr [BLAKE3_IV_2] movdqa xmm11, xmmword ptr [BLAKE3_IV_3] movdqa xmm12, xmmword ptr [rsp+110H] movdqa xmm13, xmmword ptr [rsp+120H] movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN] movd xmm15, eax pshufd xmm15, xmm15, 00H prefetcht0 byte ptr [r8+rdx+80H] prefetcht0 byte ptr [r9+rdx+80H] prefetcht0 byte ptr [r10+rdx+80H] prefetcht0 byte ptr 
[r11+rdx+80H] paddd xmm0, xmmword ptr [rsp] paddd xmm1, xmmword ptr [rsp+20H] paddd xmm2, xmmword ptr [rsp+40H] paddd xmm3, xmmword ptr [rsp+60H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [BLAKE3_IV_0] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+10H] paddd xmm1, xmmword ptr [rsp+30H] paddd xmm2, xmmword ptr [rsp+50H] paddd xmm3, xmmword ptr [rsp+70H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+80H] paddd xmm1, xmmword ptr [rsp+0A0H] paddd xmm2, xmmword ptr [rsp+0C0H] paddd xmm3, xmmword ptr [rsp+0E0H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr 
[ROT16] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+90H] paddd xmm1, xmmword ptr [rsp+0B0H] paddd xmm2, xmmword ptr [rsp+0D0H] paddd xmm3, xmmword ptr [rsp+0F0H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+20H] paddd xmm1, xmmword ptr [rsp+30H] paddd xmm2, xmmword ptr [rsp+70H] paddd xmm3, xmmword ptr [rsp+40H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, 
xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+60H] paddd xmm1, xmmword ptr [rsp+0A0H] paddd xmm2, xmmword ptr [rsp] paddd xmm3, xmmword ptr [rsp+0D0H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+10H] paddd xmm1, xmmword ptr [rsp+0C0H] paddd xmm2, xmmword ptr [rsp+90H] paddd xmm3, xmmword ptr [rsp+0F0H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0B0H] paddd xmm1, xmmword ptr [rsp+50H] paddd 
xmm2, xmmword ptr [rsp+0E0H] paddd xmm3, xmmword ptr [rsp+80H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+30H] paddd xmm1, xmmword ptr [rsp+0A0H] paddd xmm2, xmmword ptr [rsp+0D0H] paddd xmm3, xmmword ptr [rsp+70H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+40H] paddd xmm1, xmmword ptr [rsp+0C0H] paddd xmm2, xmmword ptr [rsp+20H] paddd xmm3, xmmword ptr [rsp+0E0H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 
movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+60H] paddd xmm1, xmmword ptr [rsp+90H] paddd xmm2, xmmword ptr [rsp+0B0H] paddd xmm3, xmmword ptr [rsp+80H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+50H] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+0F0H] paddd xmm3, xmmword ptr [rsp+10H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld 
xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0A0H] paddd xmm1, xmmword ptr [rsp+0C0H] paddd xmm2, xmmword ptr [rsp+0E0H] paddd xmm3, xmmword ptr [rsp+0D0H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+70H] paddd xmm1, xmmword ptr [rsp+90H] paddd xmm2, xmmword ptr [rsp+30H] paddd xmm3, xmmword ptr [rsp+0F0H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+40H] paddd xmm1, xmmword ptr [rsp+0B0H] paddd xmm2, xmmword ptr [rsp+50H] paddd xmm3, xmmword ptr [rsp+10H] paddd xmm0, xmm5 paddd 
xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp] paddd xmm1, xmmword ptr [rsp+20H] paddd xmm2, xmmword ptr [rsp+80H] paddd xmm3, xmmword ptr [rsp+60H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0C0H] paddd xmm1, xmmword ptr [rsp+90H] paddd xmm2, xmmword ptr [rsp+0F0H] paddd xmm3, xmmword ptr [rsp+0E0H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 
paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0D0H] paddd xmm1, xmmword ptr [rsp+0B0H] paddd xmm2, xmmword ptr [rsp+0A0H] paddd xmm3, xmmword ptr [rsp+80H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+70H] paddd xmm1, xmmword ptr [rsp+50H] paddd xmm2, xmmword ptr [rsp] paddd xmm3, xmmword ptr [rsp+60H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 
movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+20H] paddd xmm1, xmmword ptr [rsp+30H] paddd xmm2, xmmword ptr [rsp+10H] paddd xmm3, xmmword ptr [rsp+40H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+90H] paddd xmm1, xmmword ptr [rsp+0B0H] paddd xmm2, xmmword ptr [rsp+80H] paddd xmm3, xmmword ptr [rsp+0F0H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0E0H] paddd xmm1, xmmword ptr [rsp+50H] paddd xmm2, xmmword ptr [rsp+0C0H] paddd xmm3, xmmword ptr [rsp+10H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor 
xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0D0H] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+20H] paddd xmm3, xmmword ptr [rsp+40H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+30H] paddd xmm1, xmmword ptr [rsp+0A0H] paddd xmm2, xmmword ptr [rsp+60H] paddd xmm3, xmmword ptr [rsp+70H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 
movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0B0H] paddd xmm1, xmmword ptr [rsp+50H] paddd xmm2, xmmword ptr [rsp+10H] paddd xmm3, xmmword ptr [rsp+80H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0F0H] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+90H] paddd xmm3, xmmword ptr [rsp+60H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr 
[rsp+0E0H] paddd xmm1, xmmword ptr [rsp+20H] paddd xmm2, xmmword ptr [rsp+30H] paddd xmm3, xmmword ptr [rsp+70H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0A0H] paddd xmm1, xmmword ptr [rsp+0C0H] paddd xmm2, xmmword ptr [rsp+40H] paddd xmm3, xmmword ptr [rsp+0D0H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 pxor xmm0, xmm8 pxor xmm1, xmm9 pxor xmm2, xmm10 pxor xmm3, xmm11 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 pxor xmm4, xmm12 pxor xmm5, xmm13 pxor xmm6, xmm14 pxor xmm7, xmm15 mov eax, r13d jne innerloop4 movdqa xmm9, xmm0 punpckldq xmm0, xmm1 punpckhdq xmm9, xmm1 movdqa xmm11, xmm2 punpckldq xmm2, xmm3 punpckhdq xmm11, xmm3 movdqa xmm1, xmm0 punpcklqdq xmm0, xmm2 punpckhqdq xmm1, xmm2 movdqa xmm3, xmm9 
punpcklqdq xmm9, xmm11 punpckhqdq xmm3, xmm11 movdqu xmmword ptr [rbx], xmm0 movdqu xmmword ptr [rbx+20H], xmm1 movdqu xmmword ptr [rbx+40H], xmm9 movdqu xmmword ptr [rbx+60H], xmm3 movdqa xmm9, xmm4 punpckldq xmm4, xmm5 punpckhdq xmm9, xmm5 movdqa xmm11, xmm6 punpckldq xmm6, xmm7 punpckhdq xmm11, xmm7 movdqa xmm5, xmm4 punpcklqdq xmm4, xmm6 punpckhqdq xmm5, xmm6 movdqa xmm7, xmm9 punpcklqdq xmm9, xmm11 punpckhqdq xmm7, xmm11 movdqu xmmword ptr [rbx+10H], xmm4 movdqu xmmword ptr [rbx+30H], xmm5 movdqu xmmword ptr [rbx+50H], xmm9 movdqu xmmword ptr [rbx+70H], xmm7 movdqa xmm1, xmmword ptr [rsp+110H] movdqa xmm0, xmm1 paddd xmm1, xmmword ptr [rsp+150H] movdqa xmmword ptr [rsp+110H], xmm1 pxor xmm0, xmmword ptr [CMP_MSB_MASK] pxor xmm1, xmmword ptr [CMP_MSB_MASK] pcmpgtd xmm0, xmm1 movdqa xmm1, xmmword ptr [rsp+120H] psubd xmm1, xmm0 movdqa xmmword ptr [rsp+120H], xmm1 add rbx, 128 add rdi, 32 sub rsi, 4 cmp rsi, 4 jnc outerloop4 test rsi, rsi jne final3blocks unwind: movdqa xmm6, xmmword ptr [rsp+170H] movdqa xmm7, xmmword ptr [rsp+180H] movdqa xmm8, xmmword ptr [rsp+190H] movdqa xmm9, xmmword ptr [rsp+1A0H] movdqa xmm10, xmmword ptr [rsp+1B0H] movdqa xmm11, xmmword ptr [rsp+1C0H] movdqa xmm12, xmmword ptr [rsp+1D0H] movdqa xmm13, xmmword ptr [rsp+1E0H] movdqa xmm14, xmmword ptr [rsp+1F0H] movdqa xmm15, xmmword ptr [rsp+200H] mov rsp, rbp pop rbp pop rbx pop rdi pop rsi pop r12 pop r13 pop r14 pop r15 ret ALIGN 16 final3blocks: test esi, 2H je final1block movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+10H] movaps xmm8, xmm0 movaps xmm9, xmm1 movd xmm13, dword ptr [rsp+110H] pinsrd xmm13, dword ptr [rsp+120H], 1 pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2 movaps xmmword ptr [rsp], xmm13 movd xmm14, dword ptr [rsp+114H] pinsrd xmm14, dword ptr [rsp+124H], 1 pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2 movaps xmmword ptr [rsp+10H], xmm14 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+8H] movzx eax, byte ptr [rbp+80H] or eax, r13d xor edx, edx 
innerloop2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movaps xmm2, xmmword ptr [BLAKE3_IV] movaps xmm10, xmm2 movups xmm4, xmmword ptr [r8+rdx-40H] movups xmm5, xmmword ptr [r8+rdx-30H] movaps xmm3, xmm4 shufps xmm4, xmm5, 136 shufps xmm3, xmm5, 221 movaps xmm5, xmm3 movups xmm6, xmmword ptr [r8+rdx-20H] movups xmm7, xmmword ptr [r8+rdx-10H] movaps xmm3, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 93H shufps xmm3, xmm7, 221 pshufd xmm7, xmm3, 93H movups xmm12, xmmword ptr [r9+rdx-40H] movups xmm13, xmmword ptr [r9+rdx-30H] movaps xmm11, xmm12 shufps xmm12, xmm13, 136 shufps xmm11, xmm13, 221 movaps xmm13, xmm11 movups xmm14, xmmword ptr [r9+rdx-20H] movups xmm15, xmmword ptr [r9+rdx-10H] movaps xmm11, xmm14 shufps xmm14, xmm15, 136 pshufd xmm14, xmm14, 93H shufps xmm11, xmm15, 221 pshufd xmm15, xmm11, 93H movaps xmm3, xmmword ptr [rsp] movaps xmm11, xmmword ptr [rsp+10H] pinsrd xmm3, eax, 3 pinsrd xmm11, eax, 3 mov al, 7 roundloop2: paddd xmm0, xmm4 paddd xmm8, xmm12 movaps xmmword ptr [rsp+20H], xmm4 movaps xmmword ptr [rsp+30H], xmm12 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 movaps xmm12, xmmword ptr [ROT16] pshufb xmm3, xmm12 pshufb xmm11, xmm12 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 20 psrld xmm4, 12 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 20 psrld xmm4, 12 por xmm9, xmm4 paddd xmm0, xmm5 paddd xmm8, xmm13 movaps xmmword ptr [rsp+40H], xmm5 movaps xmmword ptr [rsp+50H], xmm13 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 movaps xmm13, xmmword ptr [ROT8] pshufb xmm3, xmm13 pshufb xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 25 psrld xmm4, 7 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 25 psrld xmm4, 7 por xmm9, xmm4 pshufd xmm0, xmm0, 93H pshufd xmm8, xmm8, 93H pshufd xmm3, xmm3, 4EH pshufd xmm11, xmm11, 4EH pshufd xmm2, xmm2, 39H pshufd xmm10, xmm10, 39H 
paddd xmm0, xmm6 paddd xmm8, xmm14 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 pshufb xmm3, xmm12 pshufb xmm11, xmm12 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 20 psrld xmm4, 12 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 20 psrld xmm4, 12 por xmm9, xmm4 paddd xmm0, xmm7 paddd xmm8, xmm15 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 pshufb xmm3, xmm13 pshufb xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 25 psrld xmm4, 7 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 25 psrld xmm4, 7 por xmm9, xmm4 pshufd xmm0, xmm0, 39H pshufd xmm8, xmm8, 39H pshufd xmm3, xmm3, 4EH pshufd xmm11, xmm11, 4EH pshufd xmm2, xmm2, 93H pshufd xmm10, xmm10, 93H dec al je endroundloop2 movdqa xmm12, xmmword ptr [rsp+20H] movdqa xmm5, xmmword ptr [rsp+40H] pshufd xmm13, xmm12, 0FH shufps xmm12, xmm5, 214 pshufd xmm4, xmm12, 39H movdqa xmm12, xmm6 shufps xmm12, xmm7, 250 pblendw xmm13, xmm12, 0CCH movdqa xmm12, xmm7 punpcklqdq xmm12, xmm5 pblendw xmm12, xmm6, 0C0H pshufd xmm12, xmm12, 78H punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 1EH movdqa xmmword ptr [rsp+20H], xmm13 movdqa xmmword ptr [rsp+40H], xmm12 movdqa xmm5, xmmword ptr [rsp+30H] movdqa xmm13, xmmword ptr [rsp+50H] pshufd xmm6, xmm5, 0FH shufps xmm5, xmm13, 214 pshufd xmm12, xmm5, 39H movdqa xmm5, xmm14 shufps xmm5, xmm15, 250 pblendw xmm6, xmm5, 0CCH movdqa xmm5, xmm15 punpcklqdq xmm5, xmm13 pblendw xmm5, xmm14, 0C0H pshufd xmm5, xmm5, 78H punpckhdq xmm13, xmm15 punpckldq xmm14, xmm13 pshufd xmm15, xmm14, 1EH movdqa xmm13, xmm6 movdqa xmm14, xmm5 movdqa xmm5, xmmword ptr [rsp+20H] movdqa xmm6, xmmword ptr [rsp+40H] jmp roundloop2 endroundloop2: pxor xmm0, xmm2 pxor xmm1, xmm3 pxor xmm8, xmm10 pxor xmm9, xmm11 mov eax, r13d cmp rdx, r15 jne innerloop2 movups xmmword ptr [rbx], xmm0 movups xmmword ptr [rbx+10H], xmm1 movups xmmword ptr [rbx+20H], xmm8 
movups xmmword ptr [rbx+30H], xmm9 movdqa xmm0, xmmword ptr [rsp+130H] movdqa xmm1, xmmword ptr [rsp+110H] movdqa xmm2, xmmword ptr [rsp+120H] movdqu xmm3, xmmword ptr [rsp+118H] movdqu xmm4, xmmword ptr [rsp+128H] blendvps xmm1, xmm3, xmm0 blendvps xmm2, xmm4, xmm0 movdqa xmmword ptr [rsp+110H], xmm1 movdqa xmmword ptr [rsp+120H], xmm2 add rdi, 16 add rbx, 64 sub rsi, 2 final1block: test esi, 1H je unwind movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+10H] movd xmm13, dword ptr [rsp+110H] pinsrd xmm13, dword ptr [rsp+120H], 1 pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2 movaps xmm14, xmmword ptr [ROT8] movaps xmm15, xmmword ptr [ROT16] mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+80H] or eax, r13d xor edx, edx innerloop1: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movaps xmm2, xmmword ptr [BLAKE3_IV] movaps xmm3, xmm13 pinsrd xmm3, eax, 3 movups xmm4, xmmword ptr [r8+rdx-40H] movups xmm5, xmmword ptr [r8+rdx-30H] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [r8+rdx-20H] movups xmm7, xmmword ptr [r8+rdx-10H] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 93H shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 93H mov al, 7 roundloop1: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 93H pshufd xmm3, xmm3, 4EH pshufd xmm2, xmm2, 39H paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, 
xmm11 pshufd xmm0, xmm0, 39H pshufd xmm3, xmm3, 4EH pshufd xmm2, xmm2, 93H dec al jz endroundloop1 movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0FH pshufd xmm4, xmm8, 39H movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pblendw xmm9, xmm8, 0CCH movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 pblendw xmm8, xmm6, 0C0H pshufd xmm8, xmm8, 78H punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 1EH movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp roundloop1 endroundloop1: pxor xmm0, xmm2 pxor xmm1, xmm3 mov eax, r13d cmp rdx, r15 jne innerloop1 movups xmmword ptr [rbx], xmm0 movups xmmword ptr [rbx+10H], xmm1 jmp unwind _blake3_hash_many_sse41 ENDP blake3_hash_many_sse41 ENDP blake3_compress_in_place_sse41 PROC _blake3_compress_in_place_sse41 PROC sub rsp, 120 movdqa xmmword ptr [rsp], xmm6 movdqa xmmword ptr [rsp+10H], xmm7 movdqa xmmword ptr [rsp+20H], xmm8 movdqa xmmword ptr [rsp+30H], xmm9 movdqa xmmword ptr [rsp+40H], xmm11 movdqa xmmword ptr [rsp+50H], xmm14 movdqa xmmword ptr [rsp+60H], xmm15 movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+10H] movaps xmm2, xmmword ptr [BLAKE3_IV] movzx eax, byte ptr [rsp+0A0H] movzx r8d, r8b shl rax, 32 add r8, rax movd xmm3, r9 movd xmm4, r8 punpcklqdq xmm3, xmm4 movups xmm4, xmmword ptr [rdx] movups xmm5, xmmword ptr [rdx+10H] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [rdx+20H] movups xmm7, xmmword ptr [rdx+30H] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 93H shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 93H movaps xmm14, xmmword ptr [ROT8] movaps xmm15, xmmword ptr [ROT16] mov al, 7 @@: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 93H 
pshufd xmm3, xmm3, 4EH pshufd xmm2, xmm2, 39H paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 39H pshufd xmm3, xmm3, 4EH pshufd xmm2, xmm2, 93H dec al jz @F movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0FH pshufd xmm4, xmm8, 39H movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pblendw xmm9, xmm8, 0CCH movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 pblendw xmm8, xmm6, 0C0H pshufd xmm8, xmm8, 78H punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 1EH movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp @B @@: pxor xmm0, xmm2 pxor xmm1, xmm3 movups xmmword ptr [rcx], xmm0 movups xmmword ptr [rcx+10H], xmm1 movdqa xmm6, xmmword ptr [rsp] movdqa xmm7, xmmword ptr [rsp+10H] movdqa xmm8, xmmword ptr [rsp+20H] movdqa xmm9, xmmword ptr [rsp+30H] movdqa xmm11, xmmword ptr [rsp+40H] movdqa xmm14, xmmword ptr [rsp+50H] movdqa xmm15, xmmword ptr [rsp+60H] add rsp, 120 ret _blake3_compress_in_place_sse41 ENDP blake3_compress_in_place_sse41 ENDP ALIGN 16 blake3_compress_xof_sse41 PROC _blake3_compress_xof_sse41 PROC sub rsp, 120 movdqa xmmword ptr [rsp], xmm6 movdqa xmmword ptr [rsp+10H], xmm7 movdqa xmmword ptr [rsp+20H], xmm8 movdqa xmmword ptr [rsp+30H], xmm9 movdqa xmmword ptr [rsp+40H], xmm11 movdqa xmmword ptr [rsp+50H], xmm14 movdqa xmmword ptr [rsp+60H], xmm15 movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+10H] movaps xmm2, xmmword ptr [BLAKE3_IV] movzx eax, byte ptr [rsp+0A0H] movzx r8d, r8b mov r10, qword ptr [rsp+0A8H] shl rax, 32 add r8, rax movd xmm3, r9 movd xmm4, r8 punpcklqdq xmm3, xmm4 movups xmm4, xmmword ptr [rdx] movups xmm5, xmmword ptr [rdx+10H] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword 
ptr [rdx+20H] movups xmm7, xmmword ptr [rdx+30H] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 93H shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 93H movaps xmm14, xmmword ptr [ROT8] movaps xmm15, xmmword ptr [ROT16] mov al, 7 @@: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 93H pshufd xmm3, xmm3, 4EH pshufd xmm2, xmm2, 39H paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 39H pshufd xmm3, xmm3, 4EH pshufd xmm2, xmm2, 93H dec al jz @F movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0FH pshufd xmm4, xmm8, 39H movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pblendw xmm9, xmm8, 0CCH movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 pblendw xmm8, xmm6, 0C0H pshufd xmm8, xmm8, 78H punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 1EH movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp @B @@: movdqu xmm4, xmmword ptr [rcx] movdqu xmm5, xmmword ptr [rcx+10H] pxor xmm0, xmm2 pxor xmm1, xmm3 pxor xmm2, xmm4 pxor xmm3, xmm5 movups xmmword ptr [r10], xmm0 movups xmmword ptr [r10+10H], xmm1 movups xmmword ptr [r10+20H], xmm2 movups xmmword ptr [r10+30H], xmm3 movdqa xmm6, xmmword ptr [rsp] movdqa xmm7, xmmword ptr [rsp+10H] movdqa xmm8, xmmword ptr [rsp+20H] movdqa xmm9, xmmword ptr [rsp+30H] movdqa xmm11, xmmword ptr [rsp+40H] movdqa xmm14, xmmword ptr [rsp+50H] movdqa xmm15, xmmword ptr [rsp+60H] add rsp, 120 ret _blake3_compress_xof_sse41 ENDP blake3_compress_xof_sse41 
ENDP _TEXT ENDS _RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' ALIGN 64 BLAKE3_IV: dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH ADD0: dd 0, 1, 2, 3 ADD1: dd 4 dup (4) BLAKE3_IV_0: dd 4 dup (6A09E667H) BLAKE3_IV_1: dd 4 dup (0BB67AE85H) BLAKE3_IV_2: dd 4 dup (3C6EF372H) BLAKE3_IV_3: dd 4 dup (0A54FF53AH) BLAKE3_BLOCK_LEN: dd 4 dup (64) ROT16: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 ROT8: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 CMP_MSB_MASK: dd 8 dup(80000000H) _RDATA ENDS END blake3-1.5.4/c/example.c000064400000000000000000000015671046102023000130260ustar 00000000000000#include "blake3.h" #include #include #include #include #include int main(void) { // Initialize the hasher. blake3_hasher hasher; blake3_hasher_init(&hasher); // Read input bytes from stdin. unsigned char buf[65536]; while (1) { ssize_t n = read(STDIN_FILENO, buf, sizeof(buf)); if (n > 0) { blake3_hasher_update(&hasher, buf, n); } else if (n == 0) { break; // end of file } else { fprintf(stderr, "read failed: %s\n", strerror(errno)); exit(1); } } // Finalize the hash. BLAKE3_OUT_LEN is the default output length, 32 bytes. uint8_t output[BLAKE3_OUT_LEN]; blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN); // Print the hash as hexadecimal. for (size_t i = 0; i < BLAKE3_OUT_LEN; i++) { printf("%02x", output[i]); } printf("\n"); return 0; } blake3-1.5.4/c/libblake3.pc.in000064400000000000000000000005041046102023000137760ustar 00000000000000prefix="@CMAKE_INSTALL_PREFIX@" exec_prefix="${prefix}" libdir="${prefix}/@CMAKE_INSTALL_LIBDIR@" includedir="${prefix}/@CMAKE_INSTALL_INCLUDEDIR@" Name: @PROJECT_NAME@ Description: @PROJECT_DESCRIPTION@ Version: @PROJECT_VERSION@ Requires: Libs: -L"${libdir}" -lblake3 Cflags: -I"${includedir}" @BLAKE3_PKGCONFIG_CFLAGS@ blake3-1.5.4/c/main.c000064400000000000000000000077131046102023000123160ustar 00000000000000/* * This main file is intended for testing via `make test`. It does not build in * other settings. 
See README.md in this directory for examples of how to build * C code. */ #include #include #include #include #include #include #include "blake3.h" #include "blake3_impl.h" #define HASH_MODE 0 #define KEYED_HASH_MODE 1 #define DERIVE_KEY_MODE 2 static void hex_char_value(uint8_t c, uint8_t *value, bool *valid) { if ('0' <= c && c <= '9') { *value = c - '0'; *valid = true; } else if ('a' <= c && c <= 'f') { *value = 10 + c - 'a'; *valid = true; } else { *valid = false; } } static int parse_key(char *hex_key, uint8_t out[BLAKE3_KEY_LEN]) { size_t hex_len = strlen(hex_key); if (hex_len != 64) { fprintf(stderr, "Expected a 64-char hexadecimal key, got %zu chars.\n", hex_len); return 1; } for (size_t i = 0; i < 64; i++) { uint8_t value; bool valid; hex_char_value(hex_key[i], &value, &valid); if (!valid) { fprintf(stderr, "Invalid hex char.\n"); return 1; } if (i % 2 == 0) { out[i / 2] = 0; value <<= 4; } out[i / 2] += value; } return 0; } /* A little repetition here */ enum cpu_feature { SSE2 = 1 << 0, SSSE3 = 1 << 1, SSE41 = 1 << 2, AVX = 1 << 3, AVX2 = 1 << 4, AVX512F = 1 << 5, AVX512VL = 1 << 6, /* ... 
*/ UNDEFINED = 1 << 30 }; extern enum cpu_feature g_cpu_features; enum cpu_feature get_cpu_features(void); int main(int argc, char **argv) { size_t out_len = BLAKE3_OUT_LEN; uint8_t key[BLAKE3_KEY_LEN]; char *context = ""; uint8_t mode = HASH_MODE; while (argc > 1) { if (argc <= 2) { fprintf(stderr, "Odd number of arguments.\n"); return 1; } if (strcmp("--length", argv[1]) == 0) { char *endptr = NULL; errno = 0; unsigned long long out_len_ll = strtoull(argv[2], &endptr, 10); if (errno != 0 || out_len_ll > SIZE_MAX || endptr == argv[2] || *endptr != 0) { fprintf(stderr, "Bad length argument.\n"); return 1; } out_len = (size_t)out_len_ll; } else if (strcmp("--keyed", argv[1]) == 0) { mode = KEYED_HASH_MODE; int ret = parse_key(argv[2], key); if (ret != 0) { return ret; } } else if (strcmp("--derive-key", argv[1]) == 0) { mode = DERIVE_KEY_MODE; context = argv[2]; } else { fprintf(stderr, "Unknown flag.\n"); return 1; } argc -= 2; argv += 2; } /* * We're going to hash the input multiple times, so we need to buffer it all. * This is just for test cases, so go ahead and assume that the input is less * than 1 MiB. */ size_t buf_capacity = 1 << 20; uint8_t *buf = malloc(buf_capacity); assert(buf != NULL); size_t buf_len = 0; while (1) { size_t n = fread(&buf[buf_len], 1, buf_capacity - buf_len, stdin); if (n == 0) { break; } buf_len += n; assert(buf_len < buf_capacity); } const int mask = get_cpu_features(); int feature = 0; do { fprintf(stderr, "Testing 0x%08X\n", feature); g_cpu_features = feature; blake3_hasher hasher; switch (mode) { case HASH_MODE: blake3_hasher_init(&hasher); break; case KEYED_HASH_MODE: blake3_hasher_init_keyed(&hasher, key); break; case DERIVE_KEY_MODE: blake3_hasher_init_derive_key(&hasher, context); break; default: abort(); } blake3_hasher_update(&hasher, buf, buf_len); /* TODO: An incremental output reader API to avoid this allocation. 
*/ uint8_t *out = malloc(out_len); if (out_len > 0 && out == NULL) { fprintf(stderr, "malloc() failed.\n"); return 1; } blake3_hasher_finalize(&hasher, out, out_len); for (size_t i = 0; i < out_len; i++) { printf("%02x", out[i]); } printf("\n"); free(out); feature = (feature - mask) & mask; } while (feature != 0); free(buf); return 0; } blake3-1.5.4/c/test.py000075500000000000000000000071061046102023000125560ustar 00000000000000#! /usr/bin/env python3 from binascii import hexlify import json from os import path import subprocess HERE = path.dirname(__file__) TEST_VECTORS_PATH = path.join(HERE, "..", "test_vectors", "test_vectors.json") TEST_VECTORS = json.load(open(TEST_VECTORS_PATH)) def run_blake3(args, input): output = subprocess.run([path.join(HERE, "blake3")] + args, input=input, stdout=subprocess.PIPE, check=True) return output.stdout.decode().strip() # Fill the input with a repeating byte pattern. We use a cycle length of 251, # because that's the largest prime number less than 256. This makes it unlikely # to swapping any two adjacent input blocks or chunks will give the same # answer. def make_test_input(length): i = 0 buf = bytearray() while len(buf) < length: buf.append(i) i = (i + 1) % 251 return buf def main(): for case in TEST_VECTORS["cases"]: input_len = case["input_len"] input = make_test_input(input_len) hex_key = hexlify(TEST_VECTORS["key"].encode()) context_string = TEST_VECTORS["context_string"] expected_hash_xof = case["hash"] expected_hash = expected_hash_xof[:64] expected_keyed_hash_xof = case["keyed_hash"] expected_keyed_hash = expected_keyed_hash_xof[:64] expected_derive_key_xof = case["derive_key"] expected_derive_key = expected_derive_key_xof[:64] # Test the default hash. test_hash = run_blake3([], input) for line in test_hash.splitlines(): assert expected_hash == line, \ "hash({}): {} != {}".format(input_len, expected_hash, line) # Test the extended hash. 
xof_len = len(expected_hash_xof) // 2 test_hash_xof = run_blake3(["--length", str(xof_len)], input) for line in test_hash_xof.splitlines(): assert expected_hash_xof == line, \ "hash_xof({}): {} != {}".format( input_len, expected_hash_xof, line) # Test the default keyed hash. test_keyed_hash = run_blake3(["--keyed", hex_key], input) for line in test_keyed_hash.splitlines(): assert expected_keyed_hash == line, \ "keyed_hash({}): {} != {}".format( input_len, expected_keyed_hash, line) # Test the extended keyed hash. xof_len = len(expected_keyed_hash_xof) // 2 test_keyed_hash_xof = run_blake3( ["--keyed", hex_key, "--length", str(xof_len)], input) for line in test_keyed_hash_xof.splitlines(): assert expected_keyed_hash_xof == line, \ "keyed_hash_xof({}): {} != {}".format( input_len, expected_keyed_hash_xof, line) # Test the default derive key. test_derive_key = run_blake3(["--derive-key", context_string], input) for line in test_derive_key.splitlines(): assert expected_derive_key == line, \ "derive_key({}): {} != {}".format( input_len, expected_derive_key, line) # Test the extended derive key. 
xof_len = len(expected_derive_key_xof) // 2 test_derive_key_xof = run_blake3( ["--derive-key", context_string, "--length", str(xof_len)], input) for line in test_derive_key_xof.splitlines(): assert expected_derive_key_xof == line, \ "derive_key_xof({}): {} != {}".format( input_len, expected_derive_key_xof, line) if __name__ == "__main__": main() blake3-1.5.4/media/B3.svg000064400000000000000000000075161046102023000130510ustar 00000000000000 image/svg+xml blake3-1.5.4/media/BLAKE3.svg000064400000000000000000000152121046102023000134760ustar 00000000000000 image/svg+xml blake3-1.5.4/media/speed.svg000064400000000000000000001334251046102023000137040ustar 00000000000000 blake3-1.5.4/src/ffi_avx2.rs000064400000000000000000000030531046102023000136360ustar 00000000000000use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; // Note that there is no AVX2 implementation of compress_in_place or // compress_xof. // Unsafe because this may only be called on platforms supporting AVX2. pub unsafe fn hash_many( inputs: &[&[u8; N]], key: &CVWords, counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, out: &mut [u8], ) { // The Rust hash_many implementations do bounds checking on the `out` // array, but the C implementations don't. Even though this is an unsafe // function, assert the bounds here. 
assert!(out.len() >= inputs.len() * OUT_LEN); ffi::blake3_hash_many_avx2( inputs.as_ptr() as *const *const u8, inputs.len(), N / BLOCK_LEN, key.as_ptr(), counter, increment_counter.yes(), flags, flags_start, flags_end, out.as_mut_ptr(), ) } pub mod ffi { extern "C" { pub fn blake3_hash_many_avx2( inputs: *const *const u8, num_inputs: usize, blocks: usize, key: *const u32, counter: u64, increment_counter: bool, flags: u8, flags_start: u8, flags_end: u8, out: *mut u8, ); } } #[cfg(test)] mod test { use super::*; #[test] fn test_hash_many() { if !crate::platform::avx2_detected() { return; } crate::test::test_hash_many_fn(hash_many, hash_many); } } blake3-1.5.4/src/ffi_avx512.rs000064400000000000000000000073221046102023000140070ustar 00000000000000use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; // Unsafe because this may only be called on platforms supporting AVX-512. pub unsafe fn compress_in_place( cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) { ffi::blake3_compress_in_place_avx512(cv.as_mut_ptr(), block.as_ptr(), block_len, counter, flags) } // Unsafe because this may only be called on platforms supporting AVX-512. pub unsafe fn compress_xof( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) -> [u8; 64] { let mut out = [0u8; 64]; ffi::blake3_compress_xof_avx512( cv.as_ptr(), block.as_ptr(), block_len, counter, flags, out.as_mut_ptr(), ); out } // Unsafe because this may only be called on platforms supporting AVX-512. pub unsafe fn hash_many( inputs: &[&[u8; N]], key: &CVWords, counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, out: &mut [u8], ) { // The Rust hash_many implementations do bounds checking on the `out` // array, but the C implementations don't. Even though this is an unsafe // function, assert the bounds here. 
assert!(out.len() >= inputs.len() * OUT_LEN); ffi::blake3_hash_many_avx512( inputs.as_ptr() as *const *const u8, inputs.len(), N / BLOCK_LEN, key.as_ptr(), counter, increment_counter.yes(), flags, flags_start, flags_end, out.as_mut_ptr(), ) } // Unsafe because this may only be called on platforms supporting AVX-512. #[cfg(unix)] pub unsafe fn xof_many( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, out: &mut [u8], ) { debug_assert_eq!(0, out.len() % BLOCK_LEN, "whole blocks only"); ffi::blake3_xof_many_avx512( cv.as_ptr(), block.as_ptr(), block_len, counter, flags, out.as_mut_ptr(), out.len() / BLOCK_LEN, ); } pub mod ffi { extern "C" { pub fn blake3_compress_in_place_avx512( cv: *mut u32, block: *const u8, block_len: u8, counter: u64, flags: u8, ); pub fn blake3_compress_xof_avx512( cv: *const u32, block: *const u8, block_len: u8, counter: u64, flags: u8, out: *mut u8, ); pub fn blake3_hash_many_avx512( inputs: *const *const u8, num_inputs: usize, blocks: usize, key: *const u32, counter: u64, increment_counter: bool, flags: u8, flags_start: u8, flags_end: u8, out: *mut u8, ); #[cfg(unix)] pub fn blake3_xof_many_avx512( cv: *const u32, block: *const u8, block_len: u8, counter: u64, flags: u8, out: *mut u8, outblocks: usize, ); } } #[cfg(test)] mod test { use super::*; #[test] fn test_compress() { if !crate::platform::avx512_detected() { return; } crate::test::test_compress_fn(compress_in_place, compress_xof); } #[test] fn test_hash_many() { if !crate::platform::avx512_detected() { return; } crate::test::test_hash_many_fn(hash_many, hash_many); } #[cfg(unix)] #[test] fn test_xof_many() { if !crate::platform::avx512_detected() { return; } crate::test::test_xof_many_fn(xof_many); } } blake3-1.5.4/src/ffi_neon.rs000064400000000000000000000042231046102023000137150ustar 00000000000000use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; // Unsafe because this may only be called on platforms supporting NEON. 
pub unsafe fn hash_many( inputs: &[&[u8; N]], key: &CVWords, counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, out: &mut [u8], ) { // The Rust hash_many implementations do bounds checking on the `out` // array, but the C implementations don't. Even though this is an unsafe // function, assert the bounds here. assert!(out.len() >= inputs.len() * OUT_LEN); ffi::blake3_hash_many_neon( inputs.as_ptr() as *const *const u8, inputs.len(), N / BLOCK_LEN, key.as_ptr(), counter, increment_counter.yes(), flags, flags_start, flags_end, out.as_mut_ptr(), ) } // blake3_neon.c normally depends on blake3_portable.c, because the NEON // implementation only provides 4x compression, and it relies on the portable // implementation for 1x compression. However, we expose the portable Rust // implementation here instead, to avoid linking in unnecessary code. #[no_mangle] pub extern "C" fn blake3_compress_in_place_portable( cv: *mut u32, block: *const u8, block_len: u8, counter: u64, flags: u8, ) { unsafe { crate::portable::compress_in_place( &mut *(cv as *mut [u32; 8]), &*(block as *const [u8; 64]), block_len, counter, flags, ) } } pub mod ffi { extern "C" { pub fn blake3_hash_many_neon( inputs: *const *const u8, num_inputs: usize, blocks: usize, key: *const u32, counter: u64, increment_counter: bool, flags: u8, flags_start: u8, flags_end: u8, out: *mut u8, ); } } #[cfg(test)] mod test { use super::*; #[test] fn test_hash_many() { // This entire file is gated on feature="neon", so NEON support is // assumed here. crate::test::test_hash_many_fn(hash_many, hash_many); } } blake3-1.5.4/src/ffi_sse2.rs000064400000000000000000000054031046102023000136330ustar 00000000000000use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; // Unsafe because this may only be called on platforms supporting SSE2. 
pub unsafe fn compress_in_place( cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) { ffi::blake3_compress_in_place_sse2(cv.as_mut_ptr(), block.as_ptr(), block_len, counter, flags) } // Unsafe because this may only be called on platforms supporting SSE2. pub unsafe fn compress_xof( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) -> [u8; 64] { let mut out = [0u8; 64]; ffi::blake3_compress_xof_sse2( cv.as_ptr(), block.as_ptr(), block_len, counter, flags, out.as_mut_ptr(), ); out } // Unsafe because this may only be called on platforms supporting SSE2. pub unsafe fn hash_many( inputs: &[&[u8; N]], key: &CVWords, counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, out: &mut [u8], ) { // The Rust hash_many implementations do bounds checking on the `out` // array, but the C implementations don't. Even though this is an unsafe // function, assert the bounds here. assert!(out.len() >= inputs.len() * OUT_LEN); ffi::blake3_hash_many_sse2( inputs.as_ptr() as *const *const u8, inputs.len(), N / BLOCK_LEN, key.as_ptr(), counter, increment_counter.yes(), flags, flags_start, flags_end, out.as_mut_ptr(), ) } pub mod ffi { extern "C" { pub fn blake3_compress_in_place_sse2( cv: *mut u32, block: *const u8, block_len: u8, counter: u64, flags: u8, ); pub fn blake3_compress_xof_sse2( cv: *const u32, block: *const u8, block_len: u8, counter: u64, flags: u8, out: *mut u8, ); pub fn blake3_hash_many_sse2( inputs: *const *const u8, num_inputs: usize, blocks: usize, key: *const u32, counter: u64, increment_counter: bool, flags: u8, flags_start: u8, flags_end: u8, out: *mut u8, ); } } #[cfg(test)] mod test { use super::*; #[test] fn test_compress() { if !crate::platform::sse2_detected() { return; } crate::test::test_compress_fn(compress_in_place, compress_xof); } #[test] fn test_hash_many() { if !crate::platform::sse2_detected() { return; } crate::test::test_hash_many_fn(hash_many, 
hash_many); } } blake3-1.5.4/src/ffi_sse41.rs000064400000000000000000000054211046102023000137160ustar 00000000000000use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; // Unsafe because this may only be called on platforms supporting SSE4.1. pub unsafe fn compress_in_place( cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) { ffi::blake3_compress_in_place_sse41(cv.as_mut_ptr(), block.as_ptr(), block_len, counter, flags) } // Unsafe because this may only be called on platforms supporting SSE4.1. pub unsafe fn compress_xof( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) -> [u8; 64] { let mut out = [0u8; 64]; ffi::blake3_compress_xof_sse41( cv.as_ptr(), block.as_ptr(), block_len, counter, flags, out.as_mut_ptr(), ); out } // Unsafe because this may only be called on platforms supporting SSE4.1. pub unsafe fn hash_many( inputs: &[&[u8; N]], key: &CVWords, counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, out: &mut [u8], ) { // The Rust hash_many implementations do bounds checking on the `out` // array, but the C implementations don't. Even though this is an unsafe // function, assert the bounds here. 
assert!(out.len() >= inputs.len() * OUT_LEN); ffi::blake3_hash_many_sse41( inputs.as_ptr() as *const *const u8, inputs.len(), N / BLOCK_LEN, key.as_ptr(), counter, increment_counter.yes(), flags, flags_start, flags_end, out.as_mut_ptr(), ) } pub mod ffi { extern "C" { pub fn blake3_compress_in_place_sse41( cv: *mut u32, block: *const u8, block_len: u8, counter: u64, flags: u8, ); pub fn blake3_compress_xof_sse41( cv: *const u32, block: *const u8, block_len: u8, counter: u64, flags: u8, out: *mut u8, ); pub fn blake3_hash_many_sse41( inputs: *const *const u8, num_inputs: usize, blocks: usize, key: *const u32, counter: u64, increment_counter: bool, flags: u8, flags_start: u8, flags_end: u8, out: *mut u8, ); } } #[cfg(test)] mod test { use super::*; #[test] fn test_compress() { if !crate::platform::sse41_detected() { return; } crate::test::test_compress_fn(compress_in_place, compress_xof); } #[test] fn test_hash_many() { if !crate::platform::sse41_detected() { return; } crate::test::test_hash_many_fn(hash_many, hash_many); } } blake3-1.5.4/src/guts.rs000064400000000000000000000053261046102023000131210ustar 00000000000000//! This undocumented and unstable module is for use cases like the `bao` crate, //! which need to traverse the BLAKE3 Merkle tree and work with chunk and parent //! chaining values directly. There might be breaking changes to this module //! between patch versions. //! //! We could stabilize something like this module in the future. If you have a //! use case for it, please let us know by filing a GitHub issue. pub const BLOCK_LEN: usize = 64; pub const CHUNK_LEN: usize = 1024; #[derive(Clone, Debug)] pub struct ChunkState(crate::ChunkState); impl ChunkState { // Currently this type only supports the regular hash mode. If an // incremental user needs keyed_hash or derive_key, we can add that. 
pub fn new(chunk_counter: u64) -> Self { Self(crate::ChunkState::new( crate::IV, chunk_counter, 0, crate::platform::Platform::detect(), )) } #[inline] pub fn len(&self) -> usize { self.0.len() } #[inline] pub fn update(&mut self, input: &[u8]) -> &mut Self { self.0.update(input); self } pub fn finalize(&self, is_root: bool) -> crate::Hash { let output = self.0.output(); if is_root { output.root_hash() } else { output.chaining_value().into() } } } // As above, this currently assumes the regular hash mode. If an incremental // user needs keyed_hash or derive_key, we can add that. pub fn parent_cv( left_child: &crate::Hash, right_child: &crate::Hash, is_root: bool, ) -> crate::Hash { let output = crate::parent_node_output( left_child.as_bytes(), right_child.as_bytes(), crate::IV, 0, crate::platform::Platform::detect(), ); if is_root { output.root_hash() } else { output.chaining_value().into() } } #[cfg(test)] mod test { use super::*; #[test] fn test_chunk() { assert_eq!( crate::hash(b"foo"), ChunkState::new(0).update(b"foo").finalize(true) ); } #[test] fn test_parents() { let mut hasher = crate::Hasher::new(); let mut buf = [0; crate::CHUNK_LEN]; buf[0] = 'a' as u8; hasher.update(&buf); let chunk0_cv = ChunkState::new(0).update(&buf).finalize(false); buf[0] = 'b' as u8; hasher.update(&buf); let chunk1_cv = ChunkState::new(1).update(&buf).finalize(false); hasher.update(b"c"); let chunk2_cv = ChunkState::new(2).update(b"c").finalize(false); let parent = parent_cv(&chunk0_cv, &chunk1_cv, false); let root = parent_cv(&parent, &chunk2_cv, true); assert_eq!(hasher.finalize(), root); } } blake3-1.5.4/src/io.rs000064400000000000000000000073421046102023000125460ustar 00000000000000//! Helper functions for efficient IO. 
#[cfg(feature = "std")] pub(crate) fn copy_wide( mut reader: impl std::io::Read, hasher: &mut crate::Hasher, ) -> std::io::Result { let mut buffer = [0; 65536]; let mut total = 0; loop { match reader.read(&mut buffer) { Ok(0) => return Ok(total), Ok(n) => { hasher.update(&buffer[..n]); total += n as u64; } // see test_update_reader_interrupted Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue, Err(e) => return Err(e), } } } // Mmap a file, if it looks like a good idea. Return None in cases where we know mmap will fail, or // if the file is short enough that mmapping isn't worth it. However, if we do try to mmap and it // fails, return the error. // // SAFETY: Mmaps are fundamentally unsafe, because you can call invariant-checking functions like // str::from_utf8 on them and then have them change out from under you. Letting a safe caller get // their hands on an mmap, or even a &[u8] that's backed by an mmap, is unsound. However, because // this function is crate-private, we can guarantee that all can ever happen in the event of a race // condition is that we either hash nonsense bytes or crash with SIGBUS or similar, neither of // which should risk memory corruption in a safe caller. // // PARANOIA: But a data race...is a data race...is a data race...right? Even if we know that no // platform in the "real world" is ever going to do anything other than compute the "wrong answer" // if we race on this mmap while we hash it, aren't we still supposed to feel bad about doing this? // Well, maybe. This is IO, and IO gets special carve-outs in the memory model. Consider a // memory-mapped register that returns random 32-bit words. (This is actually realistic if you have // a hardware RNG.) It's probably sound to construct a *const i32 pointing to that register and do // some raw pointer reads from it. 
Those reads should be volatile if you don't want the compiler to // coalesce them, but either way the compiler isn't allowed to just _go nuts_ and insert // should-never-happen branches to wipe your hard drive if two adjacent reads happen to give // different values. As far as I'm aware, there's no such thing as a read that's allowed if it's // volatile but prohibited if it's not (unlike atomics). As mentioned above, it's not ok to // construct a safe &i32 to the register if you're going to leak that reference to unknown callers. // But if you "know what you're doing," I don't think *const i32 and &i32 are fundamentally // different here. Feedback needed. #[cfg(feature = "mmap")] pub(crate) fn maybe_mmap_file(file: &std::fs::File) -> std::io::Result> { let metadata = file.metadata()?; let file_size = metadata.len(); #[allow(clippy::if_same_then_else)] if !metadata.is_file() { // Not a real file. Ok(None) } else if file_size > isize::max_value() as u64 { // Too long to safely map. // https://github.com/danburkert/memmap-rs/issues/69 Ok(None) } else if file_size == 0 { // Mapping an empty file currently fails. // https://github.com/danburkert/memmap-rs/issues/72 // See test_mmap_virtual_file. Ok(None) } else if file_size < 16 * 1024 { // Mapping small files is not worth it. Ok(None) } else { // Explicitly set the length of the memory map, so that filesystem // changes can't race to violate the invariants we just checked. let map = unsafe { memmap2::MmapOptions::new() .len(file_size as usize) .map(file)? }; Ok(Some(map)) } } blake3-1.5.4/src/join.rs000064400000000000000000000055561046102023000131030ustar 00000000000000//! The multi-threading abstractions used by `Hasher::update_with_join`. //! //! Different implementations of the `Join` trait determine whether //! `Hasher::update_with_join` performs multi-threading on sufficiently large //! inputs. The `SerialJoin` implementation is single-threaded, and the //! 
`RayonJoin` implementation (gated by the `rayon` feature) is multi-threaded. //! Interfaces other than `Hasher::update_with_join`, like [`hash`](crate::hash) //! and [`Hasher::update`](crate::Hasher::update), always use `SerialJoin` //! internally. //! //! The `Join` trait is an almost exact copy of the [`rayon::join`] API, and //! `RayonJoin` is the only non-trivial implementation. Previously this trait //! was public, but currently it's been re-privatized, as it's both 1) of no //! value to most callers and 2) a pretty big implementation detail to commit //! to. //! //! [`rayon::join`]: https://docs.rs/rayon/1.3.0/rayon/fn.join.html /// The trait that abstracts over single-threaded and multi-threaded recursion. /// /// See the [`join` module docs](index.html) for more details. pub trait Join { fn join(oper_a: A, oper_b: B) -> (RA, RB) where A: FnOnce() -> RA + Send, B: FnOnce() -> RB + Send, RA: Send, RB: Send; } /// The trivial, serial implementation of `Join`. The left and right sides are /// executed one after the other, on the calling thread. The standalone hashing /// functions and the `Hasher::update` method use this implementation /// internally. /// /// See the [`join` module docs](index.html) for more details. pub enum SerialJoin {} impl Join for SerialJoin { #[inline] fn join(oper_a: A, oper_b: B) -> (RA, RB) where A: FnOnce() -> RA + Send, B: FnOnce() -> RB + Send, RA: Send, RB: Send, { (oper_a(), oper_b()) } } /// The Rayon-based implementation of `Join`. The left and right sides are /// executed on the Rayon thread pool, potentially in parallel. This /// implementation is gated by the `rayon` feature, which is off by default. /// /// See the [`join` module docs](index.html) for more details. 
#[cfg(feature = "rayon")] pub enum RayonJoin {} #[cfg(feature = "rayon")] impl Join for RayonJoin { #[inline] fn join(oper_a: A, oper_b: B) -> (RA, RB) where A: FnOnce() -> RA + Send, B: FnOnce() -> RB + Send, RA: Send, RB: Send, { rayon_core::join(oper_a, oper_b) } } #[cfg(test)] mod test { use super::*; #[test] fn test_serial_join() { let oper_a = || 1 + 1; let oper_b = || 2 + 2; assert_eq!((2, 4), SerialJoin::join(oper_a, oper_b)); } #[test] #[cfg(feature = "rayon")] fn test_rayon_join() { let oper_a = || 1 + 1; let oper_b = || 2 + 2; assert_eq!((2, 4), RayonJoin::join(oper_a, oper_b)); } } blake3-1.5.4/src/lib.rs000064400000000000000000002116221046102023000127030ustar 00000000000000//! The official Rust implementation of the [BLAKE3] cryptographic hash //! function. //! //! # Examples //! //! ``` //! # fn main() -> Result<(), Box> { //! // Hash an input all at once. //! let hash1 = blake3::hash(b"foobarbaz"); //! //! // Hash an input incrementally. //! let mut hasher = blake3::Hasher::new(); //! hasher.update(b"foo"); //! hasher.update(b"bar"); //! hasher.update(b"baz"); //! let hash2 = hasher.finalize(); //! assert_eq!(hash1, hash2); //! //! // Extended output. OutputReader also implements Read and Seek. //! # #[cfg(feature = "std")] { //! let mut output = [0; 1000]; //! let mut output_reader = hasher.finalize_xof(); //! output_reader.fill(&mut output); //! assert_eq!(hash1, output[..32]); //! # } //! //! // Print a hash as hex. //! println!("{}", hash1); //! # Ok(()) //! # } //! ``` //! //! # Cargo Features //! //! The `std` feature (the only feature enabled by default) is required for //! implementations of the [`Write`] and [`Seek`] traits, the //! [`update_reader`](Hasher::update_reader) helper method, and runtime CPU //! feature detection on x86. If this feature is disabled, the only way to use //! the x86 SIMD implementations is to enable the corresponding instruction sets //! globally, with e.g. `RUSTFLAGS="-C target-cpu=native"`. 
The resulting binary //! will not be portable to other machines. //! //! The `rayon` feature (disabled by default, but enabled for [docs.rs]) adds //! the [`update_rayon`](Hasher::update_rayon) and (in combination with `mmap` //! below) [`update_mmap_rayon`](Hasher::update_mmap_rayon) methods, for //! multithreaded hashing. However, even if this feature is enabled, all other //! APIs remain single-threaded. //! //! The `mmap` feature (disabled by default, but enabled for [docs.rs]) adds the //! [`update_mmap`](Hasher::update_mmap) and (in combination with `rayon` above) //! [`update_mmap_rayon`](Hasher::update_mmap_rayon) helper methods for //! memory-mapped IO. //! //! The `zeroize` feature (disabled by default, but enabled for [docs.rs]) //! implements //! [`Zeroize`](https://docs.rs/zeroize/latest/zeroize/trait.Zeroize.html) for //! this crate's types. //! //! The `serde` feature (disabled by default, but enabled for [docs.rs]) implements //! [`serde::Serialize`](https://docs.rs/serde/latest/serde/trait.Serialize.html) and //! [`serde::Deserialize`](https://docs.rs/serde/latest/serde/trait.Deserialize.html) //! for [`Hash`](struct@Hash). //! //! The NEON implementation is enabled by default for AArch64 but requires the //! `neon` feature for other ARM targets. Not all ARMv7 CPUs support NEON, and //! enabling this feature will produce a binary that's not portable to CPUs //! without NEON support. //! //! The `traits-preview` feature enables implementations of traits from the //! RustCrypto [`digest`] crate, and re-exports that crate as `traits::digest`. //! However, the traits aren't stable, and they're expected to change in //! incompatible ways before that crate reaches 1.0. For that reason, this crate //! makes no SemVer guarantees for this feature, and callers who use it should //! expect breaking changes between patch versions. (The "-preview" feature name //! follows the conventions of the RustCrypto [`signature`] crate.) //! //! 
[`Hasher::update_rayon`]: struct.Hasher.html#method.update_rayon //! [BLAKE3]: https://blake3.io //! [Rayon]: https://github.com/rayon-rs/rayon //! [docs.rs]: https://docs.rs/ //! [`Write`]: https://doc.rust-lang.org/std/io/trait.Write.html //! [`Seek`]: https://doc.rust-lang.org/std/io/trait.Seek.html //! [`digest`]: https://crates.io/crates/digest //! [`signature`]: https://crates.io/crates/signature #![cfg_attr(not(feature = "std"), no_std)] #[cfg(test)] mod test; // The guts module is for incremental use cases like the `bao` crate that need // to explicitly compute chunk and parent chaining values. It is semi-stable // and likely to keep working, but largely undocumented and not intended for // widespread use. #[doc(hidden)] pub mod guts; /// Undocumented and unstable, for benchmarks only. #[doc(hidden)] pub mod platform; // Platform-specific implementations of the compression function. These // BLAKE3-specific cfg flags are set in build.rs. #[cfg(blake3_avx2_rust)] #[path = "rust_avx2.rs"] mod avx2; #[cfg(blake3_avx2_ffi)] #[path = "ffi_avx2.rs"] mod avx2; #[cfg(blake3_avx512_ffi)] #[path = "ffi_avx512.rs"] mod avx512; #[cfg(blake3_neon)] #[path = "ffi_neon.rs"] mod neon; mod portable; #[cfg(blake3_sse2_rust)] #[path = "rust_sse2.rs"] mod sse2; #[cfg(blake3_sse2_ffi)] #[path = "ffi_sse2.rs"] mod sse2; #[cfg(blake3_sse41_rust)] #[path = "rust_sse41.rs"] mod sse41; #[cfg(blake3_sse41_ffi)] #[path = "ffi_sse41.rs"] mod sse41; #[cfg(feature = "traits-preview")] pub mod traits; mod io; mod join; use arrayref::{array_mut_ref, array_ref}; use arrayvec::{ArrayString, ArrayVec}; use core::cmp; use core::fmt; use platform::{Platform, MAX_SIMD_DEGREE, MAX_SIMD_DEGREE_OR_2}; #[cfg(feature = "zeroize")] use zeroize::Zeroize; /// The number of bytes in a [`Hash`](struct.Hash.html), 32. pub const OUT_LEN: usize = 32; /// The number of bytes in a key, 32. 
pub const KEY_LEN: usize = 32; const MAX_DEPTH: usize = 54; // 2^54 * CHUNK_LEN = 2^64 use guts::{BLOCK_LEN, CHUNK_LEN}; // While iterating the compression function within a chunk, the CV is // represented as words, to avoid doing two extra endianness conversions for // each compression in the portable implementation. But the hash_many interface // needs to hash both input bytes and parent nodes, so its better for its // output CVs to be represented as bytes. type CVWords = [u32; 8]; type CVBytes = [u8; 32]; // little-endian const IV: &CVWords = &[ 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, ]; const MSG_SCHEDULE: [[usize; 16]; 7] = [ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8], [3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1], [10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6], [12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4], [9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7], [11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13], ]; // These are the internal flags that we use to domain separate root/non-root, // chunk/parent, and chunk beginning/middle/end. These get set at the high end // of the block flags word in the compression function, so their values start // high and go down. const CHUNK_START: u8 = 1 << 0; const CHUNK_END: u8 = 1 << 1; const PARENT: u8 = 1 << 2; const ROOT: u8 = 1 << 3; const KEYED_HASH: u8 = 1 << 4; const DERIVE_KEY_CONTEXT: u8 = 1 << 5; const DERIVE_KEY_MATERIAL: u8 = 1 << 6; #[inline] fn counter_low(counter: u64) -> u32 { counter as u32 } #[inline] fn counter_high(counter: u64) -> u32 { (counter >> 32) as u32 } /// An output of the default size, 32 bytes, which provides constant-time /// equality checking. /// /// `Hash` implements [`From`] and [`Into`] for `[u8; 32]`, and it provides /// [`from_bytes`] and [`as_bytes`] for explicit conversions between itself and /// `[u8; 32]`. 
However, byte arrays and slices don't provide constant-time /// equality checking, which is often a security requirement in software that /// handles private data. `Hash` doesn't implement [`Deref`] or [`AsRef`], to /// avoid situations where a type conversion happens implicitly and the /// constant-time property is accidentally lost. /// /// `Hash` provides the [`to_hex`] and [`from_hex`] methods for converting to /// and from hexadecimal. It also implements [`Display`] and [`FromStr`]. /// /// [`From`]: https://doc.rust-lang.org/std/convert/trait.From.html /// [`Into`]: https://doc.rust-lang.org/std/convert/trait.Into.html /// [`as_bytes`]: #method.as_bytes /// [`from_bytes`]: #method.from_bytes /// [`Deref`]: https://doc.rust-lang.org/stable/std/ops/trait.Deref.html /// [`AsRef`]: https://doc.rust-lang.org/std/convert/trait.AsRef.html /// [`to_hex`]: #method.to_hex /// [`from_hex`]: #method.from_hex /// [`Display`]: https://doc.rust-lang.org/std/fmt/trait.Display.html /// [`FromStr`]: https://doc.rust-lang.org/std/str/trait.FromStr.html #[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))] #[derive(Clone, Copy, Hash)] pub struct Hash([u8; OUT_LEN]); impl Hash { /// The raw bytes of the `Hash`. Note that byte arrays don't provide /// constant-time equality checking, so if you need to compare hashes, /// prefer the `Hash` type. #[inline] pub const fn as_bytes(&self) -> &[u8; OUT_LEN] { &self.0 } /// Create a `Hash` from its raw bytes representation. pub const fn from_bytes(bytes: [u8; OUT_LEN]) -> Self { Self(bytes) } /// Encode a `Hash` in lowercase hexadecimal. /// /// The returned [`ArrayString`] is a fixed size and doesn't allocate memory /// on the heap. Note that [`ArrayString`] doesn't provide constant-time /// equality checking, so if you need to compare hashes, prefer the `Hash` /// type. 
/// /// [`ArrayString`]: https://docs.rs/arrayvec/0.5.1/arrayvec/struct.ArrayString.html pub fn to_hex(&self) -> ArrayString<{ 2 * OUT_LEN }> { let mut s = ArrayString::new(); let table = b"0123456789abcdef"; for &b in self.0.iter() { s.push(table[(b >> 4) as usize] as char); s.push(table[(b & 0xf) as usize] as char); } s } /// Decode a `Hash` from hexadecimal. Both uppercase and lowercase ASCII /// bytes are supported. /// /// Any byte outside the ranges `'0'...'9'`, `'a'...'f'`, and `'A'...'F'` /// results in an error. An input length other than 64 also results in an /// error. /// /// Note that `Hash` also implements `FromStr`, so `Hash::from_hex("...")` /// is equivalent to `"...".parse()`. pub fn from_hex(hex: impl AsRef<[u8]>) -> Result { fn hex_val(byte: u8) -> Result { match byte { b'A'..=b'F' => Ok(byte - b'A' + 10), b'a'..=b'f' => Ok(byte - b'a' + 10), b'0'..=b'9' => Ok(byte - b'0'), _ => Err(HexError(HexErrorInner::InvalidByte(byte))), } } let hex_bytes: &[u8] = hex.as_ref(); if hex_bytes.len() != OUT_LEN * 2 { return Err(HexError(HexErrorInner::InvalidLen(hex_bytes.len()))); } let mut hash_bytes: [u8; OUT_LEN] = [0; OUT_LEN]; for i in 0..OUT_LEN { hash_bytes[i] = 16 * hex_val(hex_bytes[2 * i])? + hex_val(hex_bytes[2 * i + 1])?; } Ok(Hash::from(hash_bytes)) } } impl From<[u8; OUT_LEN]> for Hash { #[inline] fn from(bytes: [u8; OUT_LEN]) -> Self { Self::from_bytes(bytes) } } impl From for [u8; OUT_LEN] { #[inline] fn from(hash: Hash) -> Self { hash.0 } } impl core::str::FromStr for Hash { type Err = HexError; fn from_str(s: &str) -> Result { Hash::from_hex(s) } } #[cfg(feature = "zeroize")] impl Zeroize for Hash { fn zeroize(&mut self) { // Destructuring to trigger compile error as a reminder to update this impl. let Self(bytes) = self; bytes.zeroize(); } } // A proper implementation of constant time equality is tricky, and we get it from the // constant_time_eq crate instead of rolling our own. 
However, that crate isn't compatible with // Miri, so we roll our own just for that. #[cfg(miri)] fn constant_time_eq_miri(a: &[u8], b: &[u8]) -> bool { if a.len() != b.len() { return false; } let mut x = 0; for i in 0..a.len() { x |= a[i] ^ b[i]; } x == 0 } /// This implementation is constant-time. impl PartialEq for Hash { #[inline] fn eq(&self, other: &Hash) -> bool { #[cfg(miri)] return constant_time_eq_miri(&self.0, &other.0); #[cfg(not(miri))] constant_time_eq::constant_time_eq_32(&self.0, &other.0) } } /// This implementation is constant-time. impl PartialEq<[u8; OUT_LEN]> for Hash { #[inline] fn eq(&self, other: &[u8; OUT_LEN]) -> bool { #[cfg(miri)] return constant_time_eq_miri(&self.0, other); #[cfg(not(miri))] constant_time_eq::constant_time_eq_32(&self.0, other) } } /// This implementation is constant-time if the target is 32 bytes long. impl PartialEq<[u8]> for Hash { #[inline] fn eq(&self, other: &[u8]) -> bool { #[cfg(miri)] return constant_time_eq_miri(&self.0, other); #[cfg(not(miri))] constant_time_eq::constant_time_eq(&self.0, other) } } impl Eq for Hash {} impl fmt::Display for Hash { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // Formatting field as `&str` to reduce code size since the `Debug` // dynamic dispatch table for `&str` is likely needed elsewhere already, // but that for `ArrayString<[u8; 64]>` is not. let hex = self.to_hex(); let hex: &str = hex.as_str(); f.write_str(hex) } } impl fmt::Debug for Hash { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // Formatting field as `&str` to reduce code size since the `Debug` // dynamic dispatch table for `&str` is likely needed elsewhere already, // but that for `ArrayString<[u8; 64]>` is not. let hex = self.to_hex(); let hex: &str = hex.as_str(); f.debug_tuple("Hash").field(&hex).finish() } } /// The error type for [`Hash::from_hex`]. /// /// The `.to_string()` representation of this error currently distinguishes between bad length /// errors and bad character errors. 
/// This is to help with logging and debugging, but it isn't a
/// stable API detail, and it may change at any time.
#[derive(Clone, Debug)]
pub struct HexError(HexErrorInner);

// The private payload of `HexError`; it records which kind of failure
// occurred so that the `Display` impl can report it.
#[derive(Clone, Debug)]
enum HexErrorInner {
    // The offending input byte.
    InvalidByte(u8),
    // The length that was received instead of the expected 64.
    InvalidLen(usize),
}

impl fmt::Display for HexError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self.0 {
            HexErrorInner::InvalidByte(byte) => {
                // Bytes below 128 are printed as a quoted `char`; anything
                // else is printed as a raw hex value.
                if byte < 128 {
                    write!(f, "invalid hex character: {:?}", byte as char)
                } else {
                    write!(f, "invalid hex character: 0x{:x}", byte)
                }
            }
            HexErrorInner::InvalidLen(len) => {
                write!(f, "expected 64 hex bytes, received {}", len)
            }
        }
    }
}

#[cfg(feature = "std")]
impl std::error::Error for HexError {}

// Each chunk or parent node can produce either a 32-byte chaining value or, by
// setting the ROOT flag, any number of final output bytes. The Output struct
// captures the state just prior to choosing between those two possibilities.
#[derive(Clone)]
struct Output {
    input_chaining_value: CVWords,
    block: [u8; 64],
    block_len: u8,
    counter: u64,
    flags: u8,
    platform: Platform,
}

impl Output {
    // Compress the captured block as a non-root node, using the stored counter
    // and flags unchanged, and return the resulting chaining value as bytes.
    fn chaining_value(&self) -> CVBytes {
        let mut cv = self.input_chaining_value;
        self.platform.compress_in_place(
            &mut cv,
            &self.block,
            self.block_len,
            self.counter,
            self.flags,
        );
        platform::le_bytes_from_words_32(&cv)
    }

    // Compress the captured block with the ROOT flag added and counter 0, and
    // return the result as a `Hash`. Callers are expected to only do this for
    // an Output whose counter is already 0 (checked in debug builds).
    fn root_hash(&self) -> Hash {
        debug_assert_eq!(self.counter, 0);
        let mut cv = self.input_chaining_value;
        self.platform
            .compress_in_place(&mut cv, &self.block, self.block_len, 0, self.flags | ROOT);
        Hash(platform::le_bytes_from_words_32(&cv))
    }

    // Produce one `2 * OUT_LEN`-byte block of root output via `compress_xof`,
    // with the ROOT flag added and the stored counter passed through.
    fn root_output_block(&self) -> [u8; 2 * OUT_LEN] {
        self.platform.compress_xof(
            &self.input_chaining_value,
            &self.block,
            self.block_len,
            self.counter,
            self.flags | ROOT,
        )
    }
}

#[cfg(feature = "zeroize")]
impl Zeroize for Output {
    fn zeroize(&mut self) {
        // Destructuring to trigger compile error as a reminder to update this impl.
        let Self {
            input_chaining_value,
            block,
            block_len,
            counter,
            flags,
            platform: _,
        } = self;

        input_chaining_value.zeroize();
        block.zeroize();
        block_len.zeroize();
        counter.zeroize();
        flags.zeroize();
    }
}

// Incremental state for hashing a single chunk: the running chaining value,
// a one-block buffer, and a count of the blocks compressed so far.
#[derive(Clone)]
struct ChunkState {
    cv: CVWords,
    chunk_counter: u64,
    buf: [u8; BLOCK_LEN],
    buf_len: u8,
    blocks_compressed: u8,
    flags: u8,
    platform: Platform,
}

impl ChunkState {
    // Start a fresh chunk with `key` as the initial chaining value and an
    // empty block buffer.
    fn new(key: &CVWords, chunk_counter: u64, flags: u8, platform: Platform) -> Self {
        Self {
            cv: *key,
            chunk_counter,
            buf: [0; BLOCK_LEN],
            buf_len: 0,
            blocks_compressed: 0,
            flags,
            platform,
        }
    }

    // The number of input bytes absorbed into this chunk so far: compressed
    // blocks plus whatever is sitting in the buffer.
    fn len(&self) -> usize {
        BLOCK_LEN * self.blocks_compressed as usize + self.buf_len as usize
    }

    // Move as many bytes as fit from the front of `input` into the block
    // buffer, and advance `input` past the bytes that were taken.
    fn fill_buf(&mut self, input: &mut &[u8]) {
        let want = BLOCK_LEN - self.buf_len as usize;
        let take = cmp::min(want, input.len());
        self.buf[self.buf_len as usize..][..take].copy_from_slice(&input[..take]);
        self.buf_len += take as u8;
        *input = &input[take..];
    }

    // CHUNK_START applies only to the first block of the chunk, i.e. while no
    // block has been compressed yet.
    fn start_flag(&self) -> u8 {
        if self.blocks_compressed == 0 {
            CHUNK_START
        } else {
            0
        }
    }

    // Try to avoid buffering as much as possible, by compressing directly from
    // the input slice when full blocks are available.
    fn update(&mut self, mut input: &[u8]) -> &mut Self {
        if self.buf_len > 0 {
            self.fill_buf(&mut input);
            // Only compress the buffered block if more input follows it.
            // Otherwise it stays buffered so that output() can finalize it.
            if !input.is_empty() {
                debug_assert_eq!(self.buf_len as usize, BLOCK_LEN);
                let block_flags = self.flags | self.start_flag(); // borrowck
                self.platform.compress_in_place(
                    &mut self.cv,
                    &self.buf,
                    BLOCK_LEN as u8,
                    self.chunk_counter,
                    block_flags,
                );
                self.buf_len = 0;
                self.buf = [0; BLOCK_LEN];
                self.blocks_compressed += 1;
            }
        }

        // Strictly greater than: when exactly one block remains it is not
        // compressed here but copied into the buffer below.
        while input.len() > BLOCK_LEN {
            debug_assert_eq!(self.buf_len, 0);
            let block_flags = self.flags | self.start_flag(); // borrowck
            self.platform.compress_in_place(
                &mut self.cv,
                array_ref!(input, 0, BLOCK_LEN),
                BLOCK_LEN as u8,
                self.chunk_counter,
                block_flags,
            );
            self.blocks_compressed += 1;
            input = &input[BLOCK_LEN..];
        }

        self.fill_buf(&mut input);
        debug_assert!(input.is_empty());
        debug_assert!(self.len() <= CHUNK_LEN);
        self
    }

    // Capture the buffered final block, with CHUNK_END (and CHUNK_START if no
    // block has been compressed yet) added to the flags, as an Output.
    fn output(&self) -> Output {
        let block_flags = self.flags | self.start_flag() | CHUNK_END;
        Output {
            input_chaining_value: self.cv,
            block: self.buf,
            block_len: self.buf_len,
            counter: self.chunk_counter,
            flags: block_flags,
            platform: self.platform,
        }
    }
}

// Don't derive(Debug), because the state may be secret.
impl fmt::Debug for ChunkState {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Only the length, counter, flags, and platform are printed; the
        // chaining value and buffer contents are left out.
        f.debug_struct("ChunkState")
            .field("len", &self.len())
            .field("chunk_counter", &self.chunk_counter)
            .field("flags", &self.flags)
            .field("platform", &self.platform)
            .finish()
    }
}

#[cfg(feature = "zeroize")]
impl Zeroize for ChunkState {
    fn zeroize(&mut self) {
        // Destructuring to trigger compile error as a reminder to update this impl.
        let Self {
            cv,
            chunk_counter,
            buf,
            buf_len,
            blocks_compressed,
            flags,
            platform: _,
        } = self;

        cv.zeroize();
        chunk_counter.zeroize();
        buf.zeroize();
        buf_len.zeroize();
        blocks_compressed.zeroize();
        flags.zeroize();
    }
}

// IMPLEMENTATION NOTE
// ===================
// The recursive function compress_subtree_wide(), implemented below, is the
// basis of high-performance BLAKE3.
We use it both for all-at-once hashing, // and for the incremental input with Hasher (though we have to be careful with // subtree boundaries in the incremental case). compress_subtree_wide() applies // several optimizations at the same time: // - Multithreading with Rayon. // - Parallel chunk hashing with SIMD. // - Parallel parent hashing with SIMD. Note that while SIMD chunk hashing // maxes out at MAX_SIMD_DEGREE*CHUNK_LEN, parallel parent hashing continues // to benefit from larger inputs, because more levels of the tree benefit can // use full-width SIMD vectors for parent hashing. Without parallel parent // hashing, we lose about 10% of overall throughput on AVX2 and AVX-512. /// Undocumented and unstable, for benchmarks only. #[doc(hidden)] #[derive(Clone, Copy)] pub enum IncrementCounter { Yes, No, } impl IncrementCounter { #[inline] fn yes(&self) -> bool { match self { IncrementCounter::Yes => true, IncrementCounter::No => false, } } } // The largest power of two less than or equal to `n`, used for left_len() // immediately below, and also directly in Hasher::update(). fn largest_power_of_two_leq(n: usize) -> usize { ((n / 2) + 1).next_power_of_two() } // Given some input larger than one chunk, return the number of bytes that // should go in the left subtree. This is the largest power-of-2 number of // chunks that leaves at least 1 byte for the right subtree. fn left_len(content_len: usize) -> usize { debug_assert!(content_len > CHUNK_LEN); // Subtract 1 to reserve at least one byte for the right side. let full_chunks = (content_len - 1) / CHUNK_LEN; largest_power_of_two_leq(full_chunks) * CHUNK_LEN } // Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time // on a single thread. Write out the chunk chaining values and return the // number of chunks hashed. These chunks are never the root and never empty; // those cases use a different codepath. 
fn compress_chunks_parallel( input: &[u8], key: &CVWords, chunk_counter: u64, flags: u8, platform: Platform, out: &mut [u8], ) -> usize { debug_assert!(!input.is_empty(), "empty chunks below the root"); debug_assert!(input.len() <= MAX_SIMD_DEGREE * CHUNK_LEN); let mut chunks_exact = input.chunks_exact(CHUNK_LEN); let mut chunks_array = ArrayVec::<&[u8; CHUNK_LEN], MAX_SIMD_DEGREE>::new(); for chunk in &mut chunks_exact { chunks_array.push(array_ref!(chunk, 0, CHUNK_LEN)); } platform.hash_many( &chunks_array, key, chunk_counter, IncrementCounter::Yes, flags, CHUNK_START, CHUNK_END, out, ); // Hash the remaining partial chunk, if there is one. Note that the empty // chunk (meaning the empty message) is a different codepath. let chunks_so_far = chunks_array.len(); if !chunks_exact.remainder().is_empty() { let counter = chunk_counter + chunks_so_far as u64; let mut chunk_state = ChunkState::new(key, counter, flags, platform); chunk_state.update(chunks_exact.remainder()); *array_mut_ref!(out, chunks_so_far * OUT_LEN, OUT_LEN) = chunk_state.output().chaining_value(); chunks_so_far + 1 } else { chunks_so_far } } // Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time // on a single thread. Write out the parent chaining values and return the // number of parents hashed. (If there's an odd input chaining value left over, // return it as an additional output.) These parents are never the root and // never empty; those cases use a different codepath. 
fn compress_parents_parallel( child_chaining_values: &[u8], key: &CVWords, flags: u8, platform: Platform, out: &mut [u8], ) -> usize { debug_assert_eq!(child_chaining_values.len() % OUT_LEN, 0, "wacky hash bytes"); let num_children = child_chaining_values.len() / OUT_LEN; debug_assert!(num_children >= 2, "not enough children"); debug_assert!(num_children <= 2 * MAX_SIMD_DEGREE_OR_2, "too many"); let mut parents_exact = child_chaining_values.chunks_exact(BLOCK_LEN); // Use MAX_SIMD_DEGREE_OR_2 rather than MAX_SIMD_DEGREE here, because of // the requirements of compress_subtree_wide(). let mut parents_array = ArrayVec::<&[u8; BLOCK_LEN], MAX_SIMD_DEGREE_OR_2>::new(); for parent in &mut parents_exact { parents_array.push(array_ref!(parent, 0, BLOCK_LEN)); } platform.hash_many( &parents_array, key, 0, // Parents always use counter 0. IncrementCounter::No, flags | PARENT, 0, // Parents have no start flags. 0, // Parents have no end flags. out, ); // If there's an odd child left over, it becomes an output. let parents_so_far = parents_array.len(); if !parents_exact.remainder().is_empty() { out[parents_so_far * OUT_LEN..][..OUT_LEN].copy_from_slice(parents_exact.remainder()); parents_so_far + 1 } else { parents_so_far } } // The wide helper function returns (writes out) an array of chaining values // and returns the length of that array. The number of chaining values returned // is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, // if the input is shorter than that many chunks. The reason for maintaining a // wide array of chaining values going back up the tree, is to allow the // implementation to hash as many parents in parallel as possible. // // As a special case when the SIMD degree is 1, this function will still return // at least 2 outputs. This guarantees that this function doesn't perform the // root compression. (If it did, it would use the wrong flags, and also we // wouldn't be able to implement extendable output.) 
Note that this function is // not used when the whole input is only 1 chunk long; that's a different // codepath. // // Why not just have the caller split the input on the first update(), instead // of implementing this special rule? Because we don't want to limit SIMD or // multithreading parallelism for that update(). fn compress_subtree_wide( input: &[u8], key: &CVWords, chunk_counter: u64, flags: u8, platform: Platform, out: &mut [u8], ) -> usize { // Note that the single chunk case does *not* bump the SIMD degree up to 2 // when it is 1. This allows Rayon the option of multithreading even the // 2-chunk case, which can help performance on smaller platforms. if input.len() <= platform.simd_degree() * CHUNK_LEN { return compress_chunks_parallel(input, key, chunk_counter, flags, platform, out); } // With more than simd_degree chunks, we need to recurse. Start by dividing // the input into left and right subtrees. (Note that this is only optimal // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree // of 3 or something, we'll need a more complicated strategy.) debug_assert_eq!(platform.simd_degree().count_ones(), 1, "power of 2"); let (left, right) = input.split_at(left_len(input.len())); let right_chunk_counter = chunk_counter + (left.len() / CHUNK_LEN) as u64; // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to // account for the special case of returning 2 outputs when the SIMD degree // is 1. let mut cv_array = [0; 2 * MAX_SIMD_DEGREE_OR_2 * OUT_LEN]; let degree = if left.len() == CHUNK_LEN { // The "simd_degree=1 and we're at the leaf nodes" case. debug_assert_eq!(platform.simd_degree(), 1); 1 } else { cmp::max(platform.simd_degree(), 2) }; let (left_out, right_out) = cv_array.split_at_mut(degree * OUT_LEN); // Recurse! For update_rayon(), this is where we take advantage of RayonJoin and use multiple // threads. 
let (left_n, right_n) = J::join( || compress_subtree_wide::(left, key, chunk_counter, flags, platform, left_out), || compress_subtree_wide::(right, key, right_chunk_counter, flags, platform, right_out), ); // The special case again. If simd_degree=1, then we'll have left_n=1 and // right_n=1. Rather than compressing them into a single output, return // them directly, to make sure we always have at least two outputs. debug_assert_eq!(left_n, degree); debug_assert!(right_n >= 1 && right_n <= left_n); if left_n == 1 { out[..2 * OUT_LEN].copy_from_slice(&cv_array[..2 * OUT_LEN]); return 2; } // Otherwise, do one layer of parent node compression. let num_children = left_n + right_n; compress_parents_parallel( &cv_array[..num_children * OUT_LEN], key, flags, platform, out, ) } // Hash a subtree with compress_subtree_wide(), and then condense the resulting // list of chaining values down to a single parent node. Don't compress that // last parent node, however. Instead, return its message bytes (the // concatenated chaining values of its children). This is necessary when the // first call to update() supplies a complete subtree, because the topmost // parent node of that subtree could end up being the root. It's also necessary // for extended output in the general case. // // As with compress_subtree_wide(), this function is not used on inputs of 1 // chunk or less. That's a different codepath. fn compress_subtree_to_parent_node( input: &[u8], key: &CVWords, chunk_counter: u64, flags: u8, platform: Platform, ) -> [u8; BLOCK_LEN] { debug_assert!(input.len() > CHUNK_LEN); let mut cv_array = [0; MAX_SIMD_DEGREE_OR_2 * OUT_LEN]; let mut num_cvs = compress_subtree_wide::(input, &key, chunk_counter, flags, platform, &mut cv_array); debug_assert!(num_cvs >= 2); // If MAX_SIMD_DEGREE is greater than 2 and there's enough input, // compress_subtree_wide() returns more than 2 chaining values. Condense // them into 2 by forming parent nodes repeatedly. 
let mut out_array = [0; MAX_SIMD_DEGREE_OR_2 * OUT_LEN / 2]; while num_cvs > 2 { let cv_slice = &cv_array[..num_cvs * OUT_LEN]; num_cvs = compress_parents_parallel(cv_slice, key, flags, platform, &mut out_array); cv_array[..num_cvs * OUT_LEN].copy_from_slice(&out_array[..num_cvs * OUT_LEN]); } *array_ref!(cv_array, 0, 2 * OUT_LEN) } // Hash a complete input all at once. Unlike compress_subtree_wide() and // compress_subtree_to_parent_node(), this function handles the 1 chunk case. fn hash_all_at_once(input: &[u8], key: &CVWords, flags: u8) -> Output { let platform = Platform::detect(); // If the whole subtree is one chunk, hash it directly with a ChunkState. if input.len() <= CHUNK_LEN { return ChunkState::new(key, 0, flags, platform) .update(input) .output(); } // Otherwise construct an Output object from the parent node returned by // compress_subtree_to_parent_node(). Output { input_chaining_value: *key, block: compress_subtree_to_parent_node::(input, key, 0, flags, platform), block_len: BLOCK_LEN as u8, counter: 0, flags: flags | PARENT, platform, } } /// The default hash function. /// /// For an incremental version that accepts multiple writes, see /// [`Hasher::update`]. /// /// For output sizes other than 32 bytes, see [`Hasher::finalize_xof`] and /// [`OutputReader`]. /// /// This function is always single-threaded. For multithreading support, see /// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon). pub fn hash(input: &[u8]) -> Hash { hash_all_at_once::(input, IV, 0).root_hash() } /// The keyed hash function. /// /// This is suitable for use as a message authentication code, for example to /// replace an HMAC instance. In that use case, the constant-time equality /// checking provided by [`Hash`](struct.Hash.html) is almost always a security /// requirement, and callers need to be careful not to compare MACs as raw /// bytes. 
///
/// For output sizes other than 32 bytes, see [`Hasher::new_keyed`],
/// [`Hasher::finalize_xof`], and [`OutputReader`].
///
/// This function is always single-threaded. For multithreading support, see
/// [`Hasher::new_keyed`] and
/// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon).
pub fn keyed_hash(key: &[u8; KEY_LEN], input: &[u8]) -> Hash {
    // The key bytes are read as little-endian words and take the place of the
    // IV as the initial chaining value.
    let key_words = platform::words_from_le_bytes_32(key);
    hash_all_at_once::<join::SerialJoin>(input, &key_words, KEYED_HASH).root_hash()
}

/// The key derivation function.
///
/// Given cryptographic key material of any length and a context string of any
/// length, this function outputs a 32-byte derived subkey. **The context string
/// should be hardcoded, globally unique, and application-specific.** A good
/// default format for such strings is `"[application] [commit timestamp]
/// [purpose]"`, e.g., `"example.com 2019-12-25 16:18:03 session tokens v1"`.
///
/// Key derivation is important when you want to use the same key in multiple
/// algorithms or use cases. Using the same key with different cryptographic
/// algorithms is generally forbidden, and deriving a separate subkey for each
/// use case protects you from bad interactions. Derived keys also mitigate the
/// damage from one part of your application accidentally leaking its key.
///
/// As a rare exception to that general rule, however, it is possible to use
/// `derive_key` itself with key material that you are already using with
/// another algorithm. You might need to do this if you're adding features to
/// an existing application, which does not yet use key derivation internally.
/// However, you still must not share key material with algorithms that forbid
/// key reuse entirely, like a one-time pad. For more on this, see sections 6.2
/// and 7.8 of the [BLAKE3 paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf).
///
/// Note that BLAKE3 is not a password hash, and **`derive_key` should never be
/// used with passwords.** Instead, use a dedicated password hash like
/// [Argon2]. Password hashes are entirely different from generic hash
/// functions, with opposite design requirements.
///
/// For output sizes other than 32 bytes, see [`Hasher::new_derive_key`],
/// [`Hasher::finalize_xof`], and [`OutputReader`].
///
/// This function is always single-threaded. For multithreading support, see
/// [`Hasher::new_derive_key`] and
/// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon).
///
/// [Argon2]: https://en.wikipedia.org/wiki/Argon2
pub fn derive_key(context: &str, key_material: &[u8]) -> [u8; OUT_LEN] {
    // First hash the context string on its own, then use that hash as the key
    // for hashing the key material.
    let context_key =
        hash_all_at_once::<join::SerialJoin>(context.as_bytes(), IV, DERIVE_KEY_CONTEXT)
            .root_hash();
    let context_key_words = platform::words_from_le_bytes_32(context_key.as_bytes());
    hash_all_at_once::<join::SerialJoin>(key_material, &context_key_words, DERIVE_KEY_MATERIAL)
        .root_hash()
        .0
}

// Build the (not yet compressed) Output for a parent node whose block is the
// left child's chaining value followed by the right child's.
fn parent_node_output(
    left_child: &CVBytes,
    right_child: &CVBytes,
    key: &CVWords,
    flags: u8,
    platform: Platform,
) -> Output {
    let mut block = [0; BLOCK_LEN];
    block[..32].copy_from_slice(left_child);
    block[32..].copy_from_slice(right_child);
    Output {
        input_chaining_value: *key,
        block,
        block_len: BLOCK_LEN as u8,
        counter: 0,
        flags: flags | PARENT,
        platform,
    }
}

/// An incremental hash state that can accept any number of writes.
///
/// The `rayon` and `mmap` Cargo features enable additional methods on this
/// type related to multithreading and memory-mapped IO.
///
/// When the `traits-preview` Cargo feature is enabled, this type implements
/// several commonly used traits from the
/// [`digest`](https://crates.io/crates/digest) crate. However, those
/// traits aren't stable, and they're expected to change in incompatible ways
/// before that crate reaches 1.0.
For that reason, this crate makes no SemVer /// guarantees for this feature, and callers who use it should expect breaking /// changes between patch versions. /// /// # Examples /// /// ``` /// # fn main() -> Result<(), Box> { /// // Hash an input incrementally. /// let mut hasher = blake3::Hasher::new(); /// hasher.update(b"foo"); /// hasher.update(b"bar"); /// hasher.update(b"baz"); /// assert_eq!(hasher.finalize(), blake3::hash(b"foobarbaz")); /// /// // Extended output. OutputReader also implements Read and Seek. /// # #[cfg(feature = "std")] { /// let mut output = [0; 1000]; /// let mut output_reader = hasher.finalize_xof(); /// output_reader.fill(&mut output); /// assert_eq!(&output[..32], blake3::hash(b"foobarbaz").as_bytes()); /// # } /// # Ok(()) /// # } /// ``` #[derive(Clone)] pub struct Hasher { key: CVWords, chunk_state: ChunkState, // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example, // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk // requires a 4th entry, rather than merging everything down to 1, because // we don't know whether more input is coming. This is different from how // the reference implementation does things. cv_stack: ArrayVec, } impl Hasher { fn new_internal(key: &CVWords, flags: u8) -> Self { Self { key: *key, chunk_state: ChunkState::new(key, 0, flags, Platform::detect()), cv_stack: ArrayVec::new(), } } /// Construct a new `Hasher` for the regular hash function. pub fn new() -> Self { Self::new_internal(IV, 0) } /// Construct a new `Hasher` for the keyed hash function. See /// [`keyed_hash`]. /// /// [`keyed_hash`]: fn.keyed_hash.html pub fn new_keyed(key: &[u8; KEY_LEN]) -> Self { let key_words = platform::words_from_le_bytes_32(key); Self::new_internal(&key_words, KEYED_HASH) } /// Construct a new `Hasher` for the key derivation function. See /// [`derive_key`]. The context string should be hardcoded, globally /// unique, and application-specific. 
/// /// [`derive_key`]: fn.derive_key.html pub fn new_derive_key(context: &str) -> Self { let context_key = hash_all_at_once::(context.as_bytes(), IV, DERIVE_KEY_CONTEXT) .root_hash(); let context_key_words = platform::words_from_le_bytes_32(context_key.as_bytes()); Self::new_internal(&context_key_words, DERIVE_KEY_MATERIAL) } /// Reset the `Hasher` to its initial state. /// /// This is functionally the same as overwriting the `Hasher` with a new /// one, using the same key or context string if any. pub fn reset(&mut self) -> &mut Self { self.chunk_state = ChunkState::new( &self.key, 0, self.chunk_state.flags, self.chunk_state.platform, ); self.cv_stack.clear(); self } // As described in push_cv() below, we do "lazy merging", delaying merges // until right before the next CV is about to be added. This is different // from the reference implementation. Another difference is that we aren't // always merging 1 chunk at a time. Instead, each CV might represent any // power-of-two number of chunks, as long as the smaller-above-larger stack // order is maintained. Instead of the "count the trailing 0-bits" // algorithm described in the spec, we use a "count the total number of // 1-bits" variant that doesn't require us to retain the subtree size of // the CV on top of the stack. The principle is the same: each CV that // should remain in the stack is represented by a 1-bit in the total number // of chunks (or bytes) so far. fn merge_cv_stack(&mut self, total_len: u64) { let post_merge_stack_len = total_len.count_ones() as usize; while self.cv_stack.len() > post_merge_stack_len { let right_child = self.cv_stack.pop().unwrap(); let left_child = self.cv_stack.pop().unwrap(); let parent_output = parent_node_output( &left_child, &right_child, &self.key, self.chunk_state.flags, self.chunk_state.platform, ); self.cv_stack.push(parent_output.chaining_value()); } } // In reference_impl.rs, we merge the new CV with existing CVs from the // stack before pushing it. 
// We can do that because we know more input is
    // coming, so we know none of the merges are root.
    //
    // This setting is different. We want to feed as much input as possible to
    // compress_subtree_wide(), without setting aside anything for the
    // chunk_state. If the user gives us 64 KiB, we want to parallelize over
    // all 64 KiB at once as a single subtree, if at all possible.
    //
    // This leads to two problems:
    // 1) This 64 KiB input might be the only call that ever gets made to
    //    update. In this case, the root node of the 64 KiB subtree would be
    //    the root node of the whole tree, and it would need to be ROOT
    //    finalized. We can't compress it until we know.
    // 2) This 64 KiB input might complete a larger tree, whose root node is
    //    similarly going to be the root of the whole tree. For example,
    //    maybe we have 192 KiB (that is, 128 + 64) hashed so far. We can't
    //    compress the node at the root of the 256 KiB subtree until we know
    //    how to finalize it.
    //
    // The second problem is solved with "lazy merging". That is, when we're
    // about to add a CV to the stack, we don't merge it with anything first,
    // as the reference impl does. Instead we do merges using the *previous* CV
    // that was added, which is sitting on top of the stack, and we put the new
    // CV (unmerged) on top of the stack afterwards. This guarantees that we
    // never merge the root node until finalize().
    //
    // Solving the first problem requires an additional tool,
    // compress_subtree_to_parent_node(). That function always returns the top
    // *two* chaining values of the subtree it's compressing. We then do lazy
    // merging with each of them separately, so that the second CV will always
    // remain unmerged. (That also helps us support extendable output when
    // we're hashing an input all-at-once.)
    //
    // `chunk_counter` is the count passed to merge_cv_stack() before `new_cv`
    // is pushed, so it drives the merging of the CVs already on the stack.
    fn push_cv(&mut self, new_cv: &CVBytes, chunk_counter: u64) {
        self.merge_cv_stack(chunk_counter);
        self.cv_stack.push(*new_cv);
    }

    /// Add input bytes to the hash state. You can call this any number of times.
/// /// This method is always single-threaded. For multithreading support, see /// [`update_rayon`](#method.update_rayon) (enabled with the `rayon` Cargo feature). /// /// Note that the degree of SIMD parallelism that `update` can use is limited by the size of /// this input buffer. See [`update_reader`](#method.update_reader). pub fn update(&mut self, input: &[u8]) -> &mut Self { self.update_with_join::(input) } fn update_with_join(&mut self, mut input: &[u8]) -> &mut Self { // If we have some partial chunk bytes in the internal chunk_state, we // need to finish that chunk first. if self.chunk_state.len() > 0 { let want = CHUNK_LEN - self.chunk_state.len(); let take = cmp::min(want, input.len()); self.chunk_state.update(&input[..take]); input = &input[take..]; if !input.is_empty() { // We've filled the current chunk, and there's more input // coming, so we know it's not the root and we can finalize it. // Then we'll proceed to hashing whole chunks below. debug_assert_eq!(self.chunk_state.len(), CHUNK_LEN); let chunk_cv = self.chunk_state.output().chaining_value(); self.push_cv(&chunk_cv, self.chunk_state.chunk_counter); self.chunk_state = ChunkState::new( &self.key, self.chunk_state.chunk_counter + 1, self.chunk_state.flags, self.chunk_state.platform, ); } else { return self; } } // Now the chunk_state is clear, and we have more input. If there's // more than a single chunk (so, definitely not the root chunk), hash // the largest whole subtree we can, with the full benefits of SIMD and // multithreading parallelism. Two restrictions: // - The subtree has to be a power-of-2 number of chunks. Only subtrees // along the right edge can be incomplete, and we don't know where // the right edge is going to be until we get to finalize(). // - The subtree must evenly divide the total number of chunks up until // this point (if total is not 0). If the current incomplete subtree // is only waiting for 1 more chunk, we can't hash a subtree of 4 // chunks. 
We have to complete the current subtree first. // Because we might need to break up the input to form powers of 2, or // to evenly divide what we already have, this part runs in a loop. while input.len() > CHUNK_LEN { debug_assert_eq!(self.chunk_state.len(), 0, "no partial chunk data"); debug_assert_eq!(CHUNK_LEN.count_ones(), 1, "power of 2 chunk len"); let mut subtree_len = largest_power_of_two_leq(input.len()); let count_so_far = self.chunk_state.chunk_counter * CHUNK_LEN as u64; // Shrink the subtree_len until it evenly divides the count so far. // We know that subtree_len itself is a power of 2, so we can use a // bitmasking trick instead of an actual remainder operation. (Note // that if the caller consistently passes power-of-2 inputs of the // same size, as is hopefully typical, this loop condition will // always fail, and subtree_len will always be the full length of // the input.) // // An aside: We don't have to shrink subtree_len quite this much. // For example, if count_so_far is 1, we could pass 2 chunks to // compress_subtree_to_parent_node. Since we'll get 2 CVs back, // we'll still get the right answer in the end, and we might get to // use 2-way SIMD parallelism. The problem with this optimization, // is that it gets us stuck always hashing 2 chunks. The total // number of chunks will remain odd, and we'll never graduate to // higher degrees of parallelism. See // https://github.com/BLAKE3-team/BLAKE3/issues/69. while (subtree_len - 1) as u64 & count_so_far != 0 { subtree_len /= 2; } // The shrunken subtree_len might now be 1 chunk long. If so, hash // that one chunk by itself. Otherwise, compress the subtree into a // pair of CVs. 
let subtree_chunks = (subtree_len / CHUNK_LEN) as u64; if subtree_len <= CHUNK_LEN { debug_assert_eq!(subtree_len, CHUNK_LEN); self.push_cv( &ChunkState::new( &self.key, self.chunk_state.chunk_counter, self.chunk_state.flags, self.chunk_state.platform, ) .update(&input[..subtree_len]) .output() .chaining_value(), self.chunk_state.chunk_counter, ); } else { // This is the high-performance happy path, though getting here // depends on the caller giving us a long enough input. let cv_pair = compress_subtree_to_parent_node::( &input[..subtree_len], &self.key, self.chunk_state.chunk_counter, self.chunk_state.flags, self.chunk_state.platform, ); let left_cv = array_ref!(cv_pair, 0, 32); let right_cv = array_ref!(cv_pair, 32, 32); // Push the two CVs we received into the CV stack in order. Because // the stack merges lazily, this guarantees we aren't merging the // root. self.push_cv(left_cv, self.chunk_state.chunk_counter); self.push_cv( right_cv, self.chunk_state.chunk_counter + (subtree_chunks / 2), ); } self.chunk_state.chunk_counter += subtree_chunks; input = &input[subtree_len..]; } // What remains is 1 chunk or less. Add it to the chunk state. debug_assert!(input.len() <= CHUNK_LEN); if !input.is_empty() { self.chunk_state.update(input); // Having added some input to the chunk_state, we know what's in // the CV stack won't become the root node, and we can do an extra // merge. This simplifies finalize(). self.merge_cv_stack(self.chunk_state.chunk_counter); } self } fn final_output(&self) -> Output { // If the current chunk is the only chunk, that makes it the root node // also. Convert it directly into an Output. Otherwise, we need to // merge subtrees below. if self.cv_stack.is_empty() { debug_assert_eq!(self.chunk_state.chunk_counter, 0); return self.chunk_state.output(); } // If there are any bytes in the ChunkState, finalize that chunk and // merge its CV with everything in the CV stack. 
In that case, the work // we did at the end of update() above guarantees that the stack // doesn't contain any unmerged subtrees that need to be merged first. // (This is important, because if there were two chunk hashes sitting // on top of the stack, they would need to merge with each other, and // merging a new chunk hash into them would be incorrect.) // // If there are no bytes in the ChunkState, we'll merge what's already // in the stack. In this case it's fine if there are unmerged chunks on // top, because we'll merge them with each other. Note that the case of // the empty chunk is taken care of above. let mut output: Output; let mut num_cvs_remaining = self.cv_stack.len(); if self.chunk_state.len() > 0 { debug_assert_eq!( self.cv_stack.len(), self.chunk_state.chunk_counter.count_ones() as usize, "cv stack does not need a merge" ); output = self.chunk_state.output(); } else { debug_assert!(self.cv_stack.len() >= 2); output = parent_node_output( &self.cv_stack[num_cvs_remaining - 2], &self.cv_stack[num_cvs_remaining - 1], &self.key, self.chunk_state.flags, self.chunk_state.platform, ); num_cvs_remaining -= 2; } while num_cvs_remaining > 0 { output = parent_node_output( &self.cv_stack[num_cvs_remaining - 1], &output.chaining_value(), &self.key, self.chunk_state.flags, self.chunk_state.platform, ); num_cvs_remaining -= 1; } output } /// Finalize the hash state and return the [`Hash`](struct.Hash.html) of /// the input. /// /// This method is idempotent. Calling it twice will give the same result. /// You can also add more input and finalize again. pub fn finalize(&self) -> Hash { self.final_output().root_hash() } /// Finalize the hash state and return an [`OutputReader`], which can /// supply any number of output bytes. /// /// This method is idempotent. Calling it twice will give the same result. /// You can also add more input and finalize again. 
///
    /// [`OutputReader`]: struct.OutputReader.html
    pub fn finalize_xof(&self) -> OutputReader {
        let root_output = self.final_output();
        OutputReader::new(root_output)
    }

    /// Return the total number of bytes hashed so far.
    pub fn count(&self) -> u64 {
        // Bytes in the chunks counted so far, plus the bytes held in the
        // current chunk state.
        let counted_chunk_bytes = self.chunk_state.chunk_counter * CHUNK_LEN as u64;
        let current_chunk_bytes = self.chunk_state.len() as u64;
        counted_chunk_bytes + current_chunk_bytes
    }

    /// As [`update`](Hasher::update), but reading from a
    /// [`std::io::Read`](https://doc.rust-lang.org/std/io/trait.Read.html) implementation.
    ///
    /// [`Hasher`] implements
    /// [`std::io::Write`](https://doc.rust-lang.org/std/io/trait.Write.html), so it's possible to
    /// use [`std::io::copy`](https://doc.rust-lang.org/std/io/fn.copy.html) to update a [`Hasher`]
    /// from any reader. Unfortunately, this standard approach can limit performance, because
    /// `copy` currently uses an internal 8 KiB buffer that isn't big enough to take advantage of
    /// all SIMD instruction sets. (In particular, [AVX-512](https://en.wikipedia.org/wiki/AVX-512)
    /// needs a 16 KiB buffer.) `update_reader` avoids this performance problem and is slightly
    /// more convenient.
    ///
    /// The internal buffer size this method uses may change at any time, and it may be different
    /// for different targets. The only guarantee is that it will be large enough for all of this
    /// crate's SIMD implementations on the current platform.
    ///
    /// The most common implementer of
    /// [`std::io::Read`](https://doc.rust-lang.org/std/io/trait.Read.html) might be
    /// [`std::fs::File`](https://doc.rust-lang.org/std/fs/struct.File.html), but note that memory
    /// mapping can be faster than this method for hashing large files. See
    /// [`update_mmap`](Hasher::update_mmap) and [`update_mmap_rayon`](Hasher::update_mmap_rayon),
    /// which require the `mmap` and (for the latter) `rayon` Cargo features.
    ///
    /// This method requires the `std` Cargo feature, which is enabled by default.
    ///
    /// # Example
    ///
    /// ```no_run
    /// # use std::fs::File;
    /// # use std::io;
    /// # fn main() -> io::Result<()> {
    /// // Hash standard input.
/// let mut hasher = blake3::Hasher::new();
    /// hasher.update_reader(std::io::stdin().lock())?;
    /// println!("{}", hasher.finalize());
    /// # Ok(())
    /// # }
    /// ```
    #[cfg(feature = "std")]
    pub fn update_reader(&mut self, reader: impl std::io::Read) -> std::io::Result<&mut Self> {
        // Any error from the copy is propagated; on success `self` is returned for chaining.
        io::copy_wide(reader, self)?;
        Ok(self)
    }

    /// As [`update`](Hasher::update), but using Rayon-based multithreading
    /// internally.
    ///
    /// This method is gated by the `rayon` Cargo feature, which is disabled by
    /// default but enabled on [docs.rs](https://docs.rs).
    ///
    /// To get any performance benefit from multithreading, the input buffer
    /// needs to be large. As a rule of thumb on x86_64, `update_rayon` is
    /// _slower_ than `update` for inputs under 128 KiB. That threshold varies
    /// quite a lot across different processors, and it's important to benchmark
    /// your specific use case. See also the performance warning associated with
    /// [`update_mmap_rayon`](Hasher::update_mmap_rayon).
    ///
    /// If you already have a large buffer in memory, and you want to hash it
    /// with multiple threads, this method is a good option. However, reading a
    /// file into memory just to call this method can be a performance mistake,
    /// both because it requires lots of memory and because single-threaded
    /// reads can be slow. For hashing whole files, see
    /// [`update_mmap_rayon`](Hasher::update_mmap_rayon), which is gated by both
    /// the `rayon` and `mmap` Cargo features.
    #[cfg(feature = "rayon")]
    pub fn update_rayon(&mut self, input: &[u8]) -> &mut Self {
        // NOTE(review): the turbofish type argument was lost in extraction (`::(input)` is not
        // valid Rust). Restored as the Rayon join strategy; confirm the path against the
        // crate's `join` module.
        self.update_with_join::<join::RayonJoin>(input)
    }

    /// As [`update`](Hasher::update), but reading the contents of a file using memory mapping.
    ///
    /// Not all files can be memory mapped, and memory mapping small files can be slower than
    /// reading them the usual way. In those cases, this method will fall back to standard file IO.
    /// The heuristic for whether to use memory mapping is currently very simple (file size >=
    /// 16 KiB), and it might change at any time.
/// /// Like [`update`](Hasher::update), this method is single-threaded. In this author's /// experience, memory mapping improves single-threaded performance by ~10% for large files /// that are already in cache. This probably varies between platforms, and as always it's a /// good idea to benchmark your own use case. In comparison, the multithreaded /// [`update_mmap_rayon`](Hasher::update_mmap_rayon) method can have a much larger impact on /// performance. /// /// There's a correctness reason that this method takes /// [`Path`](https://doc.rust-lang.org/stable/std/path/struct.Path.html) instead of /// [`File`](https://doc.rust-lang.org/std/fs/struct.File.html): reading from a memory-mapped /// file ignores the seek position of the original file handle (it neither respects the current /// position nor updates the position). This difference in behavior would've caused /// `update_mmap` and [`update_reader`](Hasher::update_reader) to give different answers and /// have different side effects in some cases. Taking a /// [`Path`](https://doc.rust-lang.org/stable/std/path/struct.Path.html) avoids this problem by /// making it clear that a new [`File`](https://doc.rust-lang.org/std/fs/struct.File.html) is /// opened internally. /// /// This method requires the `mmap` Cargo feature, which is disabled by default but enabled on /// [docs.rs](https://docs.rs). /// /// # Example /// /// ```no_run /// # use std::io; /// # use std::path::Path; /// # fn main() -> io::Result<()> { /// let path = Path::new("file.dat"); /// let mut hasher = blake3::Hasher::new(); /// hasher.update_mmap(path)?; /// println!("{}", hasher.finalize()); /// # Ok(()) /// # } /// ``` #[cfg(feature = "mmap")] pub fn update_mmap(&mut self, path: impl AsRef) -> std::io::Result<&mut Self> { let file = std::fs::File::open(path.as_ref())?; if let Some(mmap) = io::maybe_mmap_file(&file)? 
{ self.update(&mmap); } else { io::copy_wide(&file, self)?; } Ok(self) } /// As [`update_rayon`](Hasher::update_rayon), but reading the contents of a file using /// memory mapping. This is the default behavior of `b3sum`. /// /// For large files that are likely to be in cache, this can be much faster than /// single-threaded hashing. When benchmarks report that BLAKE3 is 10x or 20x faster than other /// cryptographic hashes, this is usually what they're measuring. However... /// /// **Performance Warning:** There are cases where multithreading hurts performance. The worst /// case is [a large file on a spinning disk](https://github.com/BLAKE3-team/BLAKE3/issues/31), /// where simultaneous reads from multiple threads can cause "thrashing" (i.e. the disk spends /// more time seeking around than reading data). Windows tends to be somewhat worse about this, /// in part because it's less likely than Linux to keep very large files in cache. More /// generally, if your CPU cores are already busy, then multithreading will add overhead /// without improving performance. If your code runs in different environments that you don't /// control and can't measure, then unfortunately there's no one-size-fits-all answer for /// whether multithreading is a good idea. /// /// The memory mapping behavior of this function is the same as /// [`update_mmap`](Hasher::update_mmap), and the heuristic for when to fall back to standard /// file IO might change at any time. /// /// This method requires both the `mmap` and `rayon` Cargo features, which are disabled by /// default but enabled on [docs.rs](https://docs.rs). 
/// /// # Example /// /// ```no_run /// # use std::io; /// # use std::path::Path; /// # fn main() -> io::Result<()> { /// # #[cfg(feature = "rayon")] /// # { /// let path = Path::new("big_file.dat"); /// let mut hasher = blake3::Hasher::new(); /// hasher.update_mmap_rayon(path)?; /// println!("{}", hasher.finalize()); /// # } /// # Ok(()) /// # } /// ``` #[cfg(feature = "mmap")] #[cfg(feature = "rayon")] pub fn update_mmap_rayon( &mut self, path: impl AsRef, ) -> std::io::Result<&mut Self> { let file = std::fs::File::open(path.as_ref())?; if let Some(mmap) = io::maybe_mmap_file(&file)? { self.update_rayon(&mmap); } else { io::copy_wide(&file, self)?; } Ok(self) } } // Don't derive(Debug), because the state may be secret. impl fmt::Debug for Hasher { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { f.debug_struct("Hasher") .field("flags", &self.chunk_state.flags) .field("platform", &self.chunk_state.platform) .finish() } } impl Default for Hasher { #[inline] fn default() -> Self { Self::new() } } #[cfg(feature = "std")] impl std::io::Write for Hasher { /// This is equivalent to [`update`](#method.update). #[inline] fn write(&mut self, input: &[u8]) -> std::io::Result { self.update(input); Ok(input.len()) } #[inline] fn flush(&mut self) -> std::io::Result<()> { Ok(()) } } #[cfg(feature = "zeroize")] impl Zeroize for Hasher { fn zeroize(&mut self) { // Destructuring to trigger compile error as a reminder to update this impl. let Self { key, chunk_state, cv_stack, } = self; key.zeroize(); chunk_state.zeroize(); cv_stack.zeroize(); } } /// An incremental reader for extended output, returned by /// [`Hasher::finalize_xof`](struct.Hasher.html#method.finalize_xof). /// /// Shorter BLAKE3 outputs are prefixes of longer ones, and explicitly requesting a short output is /// equivalent to truncating the default-length output. Note that this is a difference between /// BLAKE2 and BLAKE3. 
/// /// # Security notes /// /// Outputs shorter than the default length of 32 bytes (256 bits) provide less security. An N-bit /// BLAKE3 output is intended to provide N bits of first and second preimage resistance and N/2 /// bits of collision resistance, for any N up to 256. Longer outputs don't provide any additional /// security. /// /// Avoid relying on the secrecy of the output offset, that is, the number of output bytes read or /// the arguments to [`seek`](struct.OutputReader.html#method.seek) or /// [`set_position`](struct.OutputReader.html#method.set_position). [_Block-Cipher-Based Tree /// Hashing_ by Aldo Gunsing](https://eprint.iacr.org/2022/283) shows that an attacker who knows /// both the message and the key (if any) can easily determine the offset of an extended output. /// For comparison, AES-CTR has a similar property: if you know the key, you can decrypt a block /// from an unknown position in the output stream to recover its block index. Callers with strong /// secret keys aren't affected in practice, but secret offsets are a [design /// smell](https://en.wikipedia.org/wiki/Design_smell) in any case. #[derive(Clone)] pub struct OutputReader { inner: Output, position_within_block: u8, } impl OutputReader { fn new(inner: Output) -> Self { Self { inner, position_within_block: 0, } } // This helper function handles both the case where the output buffer is // shorter than one block, and the case where our position_within_block is // non-zero. fn fill_one_block(&mut self, buf: &mut &mut [u8]) { let output_block: [u8; BLOCK_LEN] = self.inner.root_output_block(); let output_bytes = &output_block[self.position_within_block as usize..]; let take = cmp::min(buf.len(), output_bytes.len()); buf[..take].copy_from_slice(&output_bytes[..take]); self.position_within_block += take as u8; if self.position_within_block == BLOCK_LEN as u8 { self.inner.counter += 1; self.position_within_block = 0; } // Advance the dest buffer. mem::take() is a borrowck workaround. 
*buf = &mut core::mem::take(buf)[take..];
    }

    /// Fill a buffer with output bytes and advance the position of the
    /// `OutputReader`. This is equivalent to [`Read::read`], except that it
    /// doesn't return a `Result`. Both methods always fill the entire buffer.
    ///
    /// Note that `OutputReader` doesn't buffer output bytes internally, so
    /// calling `fill` repeatedly with a short-length or odd-length slice will
    /// end up performing the same compression multiple times. If you're
    /// reading output in a loop, prefer a slice length that's a multiple of
    /// 64.
    ///
    /// The maximum output size of BLAKE3 is 2<sup>64</sup>-1 bytes. If you try
    /// to extract more than that, for example by seeking near the end and
    /// reading further, the behavior is unspecified.
    ///
    /// [`Read::read`]: #method.read
    pub fn fill(&mut self, mut buf: &mut [u8]) {
        if buf.is_empty() {
            return;
        }

        // If we're partway through a block, try to get to a block boundary.
        if self.position_within_block != 0 {
            self.fill_one_block(&mut buf);
        }

        // Everything that is a whole number of blocks goes through `xof_many` in one call.
        let full_blocks = buf.len() / BLOCK_LEN;
        let full_blocks_len = full_blocks * BLOCK_LEN;
        if full_blocks > 0 {
            debug_assert_eq!(0, self.position_within_block);
            self.inner.platform.xof_many(
                &self.inner.input_chaining_value,
                &self.inner.block,
                self.inner.block_len,
                self.inner.counter,
                self.inner.flags | ROOT,
                &mut buf[..full_blocks_len],
            );
            self.inner.counter += full_blocks as u64;
            buf = &mut buf[full_blocks * BLOCK_LEN..];
        }

        // Whatever is left is shorter than one block and is served by the single-block helper.
        if !buf.is_empty() {
            debug_assert!(buf.len() < BLOCK_LEN);
            self.fill_one_block(&mut buf);
            debug_assert!(buf.is_empty());
        }
    }

    /// Return the current read position in the output stream. This is
    /// equivalent to [`Seek::stream_position`], except that it doesn't return
    /// a `Result`. The position of a new `OutputReader` starts at 0, and each
    /// call to [`fill`] or [`Read::read`] moves the position forward by the
    /// number of bytes read.
/// /// [`Seek::stream_position`]: #method.stream_position /// [`fill`]: #method.fill /// [`Read::read`]: #method.read pub fn position(&self) -> u64 { self.inner.counter * BLOCK_LEN as u64 + self.position_within_block as u64 } /// Seek to a new read position in the output stream. This is equivalent to /// calling [`Seek::seek`] with [`SeekFrom::Start`], except that it doesn't /// return a `Result`. /// /// [`Seek::seek`]: #method.seek /// [`SeekFrom::Start`]: https://doc.rust-lang.org/std/io/enum.SeekFrom.html pub fn set_position(&mut self, position: u64) { self.position_within_block = (position % BLOCK_LEN as u64) as u8; self.inner.counter = position / BLOCK_LEN as u64; } } // Don't derive(Debug), because the state may be secret. impl fmt::Debug for OutputReader { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { f.debug_struct("OutputReader") .field("position", &self.position()) .finish() } } #[cfg(feature = "std")] impl std::io::Read for OutputReader { #[inline] fn read(&mut self, buf: &mut [u8]) -> std::io::Result { self.fill(buf); Ok(buf.len()) } } #[cfg(feature = "std")] impl std::io::Seek for OutputReader { fn seek(&mut self, pos: std::io::SeekFrom) -> std::io::Result { let max_position = u64::max_value() as i128; let target_position: i128 = match pos { std::io::SeekFrom::Start(x) => x as i128, std::io::SeekFrom::Current(x) => self.position() as i128 + x as i128, std::io::SeekFrom::End(_) => { return Err(std::io::Error::new( std::io::ErrorKind::InvalidInput, "seek from end not supported", )); } }; if target_position < 0 { return Err(std::io::Error::new( std::io::ErrorKind::InvalidInput, "seek before start", )); } self.set_position(cmp::min(target_position, max_position) as u64); Ok(self.position()) } } #[cfg(feature = "zeroize")] impl Zeroize for OutputReader { fn zeroize(&mut self) { // Destructuring to trigger compile error as a reminder to update this impl. 
let Self { inner, position_within_block, } = self; inner.zeroize(); position_within_block.zeroize(); } } blake3-1.5.4/src/platform.rs000064400000000000000000000434741046102023000137710ustar 00000000000000use crate::{portable, CVWords, IncrementCounter, BLOCK_LEN}; use arrayref::{array_mut_ref, array_ref}; cfg_if::cfg_if! { if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { cfg_if::cfg_if! { if #[cfg(blake3_avx512_ffi)] { pub const MAX_SIMD_DEGREE: usize = 16; } else { pub const MAX_SIMD_DEGREE: usize = 8; } } } else if #[cfg(blake3_neon)] { pub const MAX_SIMD_DEGREE: usize = 4; } else { pub const MAX_SIMD_DEGREE: usize = 1; } } // There are some places where we want a static size that's equal to the // MAX_SIMD_DEGREE, but also at least 2. Constant contexts aren't currently // allowed to use cmp::max, so we have to hardcode this additional constant // value. Get rid of this once cmp::max is a const fn. cfg_if::cfg_if! { if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { cfg_if::cfg_if! 
{ if #[cfg(blake3_avx512_ffi)] { pub const MAX_SIMD_DEGREE_OR_2: usize = 16; } else { pub const MAX_SIMD_DEGREE_OR_2: usize = 8; } } } else if #[cfg(blake3_neon)] { pub const MAX_SIMD_DEGREE_OR_2: usize = 4; } else { pub const MAX_SIMD_DEGREE_OR_2: usize = 2; } } #[derive(Clone, Copy, Debug)] pub enum Platform { Portable, #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] SSE2, #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] SSE41, #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] AVX2, #[cfg(blake3_avx512_ffi)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] AVX512, #[cfg(blake3_neon)] NEON, } impl Platform { #[allow(unreachable_code)] pub fn detect() -> Self { #[cfg(miri)] { return Platform::Portable; } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { #[cfg(blake3_avx512_ffi)] { if avx512_detected() { return Platform::AVX512; } } if avx2_detected() { return Platform::AVX2; } if sse41_detected() { return Platform::SSE41; } if sse2_detected() { return Platform::SSE2; } } // We don't use dynamic feature detection for NEON. If the "neon" // feature is on, NEON is assumed to be supported. 
#[cfg(blake3_neon)] { return Platform::NEON; } Platform::Portable } pub fn simd_degree(&self) -> usize { let degree = match self { Platform::Portable => 1, #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE2 => 4, #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE41 => 4, #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX2 => 8, #[cfg(blake3_avx512_ffi)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX512 => 16, #[cfg(blake3_neon)] Platform::NEON => 4, }; debug_assert!(degree <= MAX_SIMD_DEGREE); degree } pub fn compress_in_place( &self, cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) { match self { Platform::Portable => portable::compress_in_place(cv, block, block_len, counter, flags), // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE2 => unsafe { crate::sse2::compress_in_place(cv, block, block_len, counter, flags) }, // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE41 | Platform::AVX2 => unsafe { crate::sse41::compress_in_place(cv, block, block_len, counter, flags) }, // Safe because detect() checked for platform support. #[cfg(blake3_avx512_ffi)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX512 => unsafe { crate::avx512::compress_in_place(cv, block, block_len, counter, flags) }, // No NEON compress_in_place() implementation yet. #[cfg(blake3_neon)] Platform::NEON => portable::compress_in_place(cv, block, block_len, counter, flags), } } pub fn compress_xof( &self, cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) -> [u8; 64] { match self { Platform::Portable => portable::compress_xof(cv, block, block_len, counter, flags), // Safe because detect() checked for platform support. 
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE2 => unsafe { crate::sse2::compress_xof(cv, block, block_len, counter, flags) }, // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE41 | Platform::AVX2 => unsafe { crate::sse41::compress_xof(cv, block, block_len, counter, flags) }, // Safe because detect() checked for platform support. #[cfg(blake3_avx512_ffi)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX512 => unsafe { crate::avx512::compress_xof(cv, block, block_len, counter, flags) }, // No NEON compress_xof() implementation yet. #[cfg(blake3_neon)] Platform::NEON => portable::compress_xof(cv, block, block_len, counter, flags), } } // IMPLEMENTATION NOTE // =================== // hash_many() applies two optimizations. The critically important // optimization is the high-performance parallel SIMD hashing mode, // described in detail in the spec. This more than doubles throughput per // thread. Another optimization is keeping the state vectors transposed // from block to block within a chunk. When state vectors are transposed // after every block, there's a small but measurable performance loss. // Compressing chunks with a dedicated loop avoids this. pub fn hash_many( &self, inputs: &[&[u8; N]], key: &CVWords, counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, out: &mut [u8], ) { match self { Platform::Portable => portable::hash_many( inputs, key, counter, increment_counter, flags, flags_start, flags_end, out, ), // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE2 => unsafe { crate::sse2::hash_many( inputs, key, counter, increment_counter, flags, flags_start, flags_end, out, ) }, // Safe because detect() checked for platform support. 
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE41 => unsafe { crate::sse41::hash_many( inputs, key, counter, increment_counter, flags, flags_start, flags_end, out, ) }, // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX2 => unsafe { crate::avx2::hash_many( inputs, key, counter, increment_counter, flags, flags_start, flags_end, out, ) }, // Safe because detect() checked for platform support. #[cfg(blake3_avx512_ffi)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX512 => unsafe { crate::avx512::hash_many( inputs, key, counter, increment_counter, flags, flags_start, flags_end, out, ) }, // Assumed to be safe if the "neon" feature is on. #[cfg(blake3_neon)] Platform::NEON => unsafe { crate::neon::hash_many( inputs, key, counter, increment_counter, flags, flags_start, flags_end, out, ) }, } } pub fn xof_many( &self, cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, mut counter: u64, flags: u8, out: &mut [u8], ) { debug_assert_eq!(0, out.len() % BLOCK_LEN, "whole blocks only"); if out.is_empty() { // The current assembly implementation always outputs at least 1 block. return; } match self { // Safe because detect() checked for platform support. #[cfg(blake3_avx512_ffi)] #[cfg(unix)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX512 => unsafe { crate::avx512::xof_many(cv, block, block_len, counter, flags, out) }, _ => { // For platforms without an optimized xof_many, fall back to a loop over // compress_xof. This is still faster than portable code. for out_block in out.chunks_exact_mut(BLOCK_LEN) { // TODO: Use array_chunks_mut here once that's stable. let out_array: &mut [u8; BLOCK_LEN] = out_block.try_into().unwrap(); *out_array = self.compress_xof(cv, block, block_len, counter, flags); counter += 1; } } } } // Explicit platform constructors, for benchmarks. 
pub fn portable() -> Self { Self::Portable } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub fn sse2() -> Option { if sse2_detected() { Some(Self::SSE2) } else { None } } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub fn sse41() -> Option { if sse41_detected() { Some(Self::SSE41) } else { None } } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub fn avx2() -> Option { if avx2_detected() { Some(Self::AVX2) } else { None } } #[cfg(blake3_avx512_ffi)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub fn avx512() -> Option { if avx512_detected() { Some(Self::AVX512) } else { None } } #[cfg(blake3_neon)] pub fn neon() -> Option { // Assumed to be safe if the "neon" feature is on. Some(Self::NEON) } } // Note that AVX-512 is divided into multiple featuresets, and we use two of // them, F and VL. #[cfg(blake3_avx512_ffi)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[inline(always)] #[allow(unreachable_code)] pub fn avx512_detected() -> bool { if cfg!(miri) { return false; } // A testing-only short-circuit. if cfg!(feature = "no_avx512") { return false; } // Static check, e.g. for building with target-cpu=native. #[cfg(all(target_feature = "avx512f", target_feature = "avx512vl"))] { return true; } // Dynamic check, if std is enabled. #[cfg(feature = "std")] { if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") { return true; } } false } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[inline(always)] #[allow(unreachable_code)] pub fn avx2_detected() -> bool { if cfg!(miri) { return false; } // A testing-only short-circuit. if cfg!(feature = "no_avx2") { return false; } // Static check, e.g. for building with target-cpu=native. #[cfg(target_feature = "avx2")] { return true; } // Dynamic check, if std is enabled. 
#[cfg(feature = "std")] { if is_x86_feature_detected!("avx2") { return true; } } false } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[inline(always)] #[allow(unreachable_code)] pub fn sse41_detected() -> bool { if cfg!(miri) { return false; } // A testing-only short-circuit. if cfg!(feature = "no_sse41") { return false; } // Static check, e.g. for building with target-cpu=native. #[cfg(target_feature = "sse4.1")] { return true; } // Dynamic check, if std is enabled. #[cfg(feature = "std")] { if is_x86_feature_detected!("sse4.1") { return true; } } false } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[inline(always)] #[allow(unreachable_code)] pub fn sse2_detected() -> bool { if cfg!(miri) { return false; } // A testing-only short-circuit. if cfg!(feature = "no_sse2") { return false; } // Static check, e.g. for building with target-cpu=native. #[cfg(target_feature = "sse2")] { return true; } // Dynamic check, if std is enabled. #[cfg(feature = "std")] { if is_x86_feature_detected!("sse2") { return true; } } false } #[inline(always)] pub fn words_from_le_bytes_32(bytes: &[u8; 32]) -> [u32; 8] { let mut out = [0; 8]; out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4)); out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4)); out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4)); out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4)); out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4)); out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4)); out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4)); out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4)); out } #[inline(always)] pub fn words_from_le_bytes_64(bytes: &[u8; 64]) -> [u32; 16] { let mut out = [0; 16]; out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4)); out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4)); out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4)); out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4)); out[4] = 
u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4)); out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4)); out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4)); out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4)); out[8] = u32::from_le_bytes(*array_ref!(bytes, 8 * 4, 4)); out[9] = u32::from_le_bytes(*array_ref!(bytes, 9 * 4, 4)); out[10] = u32::from_le_bytes(*array_ref!(bytes, 10 * 4, 4)); out[11] = u32::from_le_bytes(*array_ref!(bytes, 11 * 4, 4)); out[12] = u32::from_le_bytes(*array_ref!(bytes, 12 * 4, 4)); out[13] = u32::from_le_bytes(*array_ref!(bytes, 13 * 4, 4)); out[14] = u32::from_le_bytes(*array_ref!(bytes, 14 * 4, 4)); out[15] = u32::from_le_bytes(*array_ref!(bytes, 15 * 4, 4)); out } #[inline(always)] pub fn le_bytes_from_words_32(words: &[u32; 8]) -> [u8; 32] { let mut out = [0; 32]; *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes(); *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes(); *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes(); *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes(); *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes(); *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes(); *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes(); *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes(); out } #[inline(always)] pub fn le_bytes_from_words_64(words: &[u32; 16]) -> [u8; 64] { let mut out = [0; 64]; *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes(); *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes(); *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes(); *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes(); *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes(); *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes(); *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes(); *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes(); *array_mut_ref!(out, 8 * 4, 4) = words[8].to_le_bytes(); *array_mut_ref!(out, 9 * 4, 4) = words[9].to_le_bytes(); *array_mut_ref!(out, 10 * 4, 4) = 
words[10].to_le_bytes(); *array_mut_ref!(out, 11 * 4, 4) = words[11].to_le_bytes(); *array_mut_ref!(out, 12 * 4, 4) = words[12].to_le_bytes(); *array_mut_ref!(out, 13 * 4, 4) = words[13].to_le_bytes(); *array_mut_ref!(out, 14 * 4, 4) = words[14].to_le_bytes(); *array_mut_ref!(out, 15 * 4, 4) = words[15].to_le_bytes(); out } blake3-1.5.4/src/portable.rs000064400000000000000000000124511046102023000137440ustar 00000000000000use crate::{ counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN, }; use arrayref::{array_mut_ref, array_ref}; #[inline(always)] fn g(state: &mut [u32; 16], a: usize, b: usize, c: usize, d: usize, x: u32, y: u32) { state[a] = state[a].wrapping_add(state[b]).wrapping_add(x); state[d] = (state[d] ^ state[a]).rotate_right(16); state[c] = state[c].wrapping_add(state[d]); state[b] = (state[b] ^ state[c]).rotate_right(12); state[a] = state[a].wrapping_add(state[b]).wrapping_add(y); state[d] = (state[d] ^ state[a]).rotate_right(8); state[c] = state[c].wrapping_add(state[d]); state[b] = (state[b] ^ state[c]).rotate_right(7); } #[inline(always)] fn round(state: &mut [u32; 16], msg: &[u32; 16], round: usize) { // Select the message schedule based on the round. let schedule = MSG_SCHEDULE[round]; // Mix the columns. g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); // Mix the diagonals. 
g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]);
    g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
    g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
    g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
}

/// Builds the 16-word compression state and runs rounds 0 through 6 over it.
///
/// The returned state is not yet finalized: callers apply the feed-forward XORs
/// themselves (see `compress_in_place` and `compress_xof`).
#[inline(always)]
fn compress_pre(
    cv: &CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) -> [u32; 16] {
    let block_words = crate::platform::words_from_le_bytes_64(block);

    // Layout: 8 chaining-value words, 4 IV words, counter (low, high), block length, flags.
    let mut state = [
        cv[0],
        cv[1],
        cv[2],
        cv[3],
        cv[4],
        cv[5],
        cv[6],
        cv[7],
        IV[0],
        IV[1],
        IV[2],
        IV[3],
        counter_low(counter),
        counter_high(counter),
        block_len as u32,
        flags as u32,
    ];

    for r in 0..7 {
        round(&mut state, &block_words, r);
    }

    state
}

/// Compresses one block and overwrites `cv` with the new 8-word chaining value.
pub fn compress_in_place(
    cv: &mut CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) {
    let state = compress_pre(cv, block, block_len, counter, flags);

    // New chaining value: low half of the state XOR high half.
    for i in 0..8 {
        cv[i] = state[i] ^ state[i + 8];
    }
}

/// Compresses one block and returns the full 64-byte output, serialized from the 16 state
/// words by `le_bytes_from_words_64`.
pub fn compress_xof(
    cv: &CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) -> [u8; 64] {
    let mut state = compress_pre(cv, block, block_len, counter, flags);
    for i in 0..8 {
        // The low word reads the *unmodified* high word, so it must be updated first.
        state[i] ^= state[i + 8];
        state[i + 8] ^= cv[i];
    }
    crate::platform::le_bytes_from_words_64(&state)
}

/// Hashes one `N`-byte input (`N` must be a multiple of `BLOCK_LEN`) into `out`.
///
/// `flags_start` is OR-ed into the first block's flags and `flags_end` into the last
/// block's; `flags` applies to every block.
pub fn hash1<const N: usize>(
    input: &[u8; N],
    key: &CVWords,
    counter: u64,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    out: &mut CVBytes,
)
{
    debug_assert_eq!(N % BLOCK_LEN, 0, "uneven blocks");
    let mut cv = *key;
    // Only the first block carries `flags_start`; it is dropped after the first compression.
    let mut block_flags = flags | flags_start;
    let mut slice = &input[..];
    while slice.len() >= BLOCK_LEN {
        // Exactly one block left means this is the final block of the input.
        if slice.len() == BLOCK_LEN {
            block_flags |= flags_end;
        }
        compress_in_place(
            &mut cv,
            array_ref!(slice, 0, BLOCK_LEN),
            BLOCK_LEN as u8,
            counter,
            block_flags,
        );
        block_flags = flags;
        slice = &slice[BLOCK_LEN..];
    }
    *out = crate::platform::le_bytes_from_words_32(&cv);
}

/// Hashes each `N`-byte input with `hash1`, writing one `OUT_LEN`-byte result per input
/// into consecutive chunks of `out`.
///
/// `counter` is passed to the first input and incremented after each input only when
/// `increment_counter.yes()` is true.
pub fn hash_many<const N: usize>(
    inputs: &[&[u8; N]],
    key: &CVWords,
    mut counter: u64,
    increment_counter: IncrementCounter,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    out: &mut [u8],
) {
    debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short");
    for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) {
        hash1(
            input,
            key,
            counter,
            flags,
            flags_start,
            flags_end,
            array_mut_ref!(output, 0, OUT_LEN),
        );
        if increment_counter.yes() {
            counter += 1;
        }
    }
}

#[cfg(test)]
pub mod test {
    use super::*;

    // This is basically testing the portable implementation against itself,
    // but it also checks that compress_in_place and compress_xof are
    // consistent. And there are tests against the reference implementation and
    // against hardcoded test vectors elsewhere.
    #[test]
    fn test_compress() {
        crate::test::test_compress_fn(compress_in_place, compress_xof);
    }

    // Ditto.
    #[test]
    fn test_hash_many() {
        crate::test::test_hash_many_fn(hash_many, hash_many);
    }
}

// ---- archive entry header (not Rust source; start of the next file in the tarball) ----
// blake3-1.5.4/src/rust_avx2.rs000064400000000000000000000366631046102023000141040ustar 00000000000000

#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

use crate::{
    counter_high, counter_low, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN,
};
use arrayref::{array_mut_ref, mut_array_refs};

pub const DEGREE: usize = 8;

#[inline(always)]
unsafe fn loadu(src: *const u8) -> __m256i {
    // This is an unaligned load, so the pointer cast is allowed.
_mm256_loadu_si256(src as *const __m256i) } #[inline(always)] unsafe fn storeu(src: __m256i, dest: *mut u8) { // This is an unaligned store, so the pointer cast is allowed. _mm256_storeu_si256(dest as *mut __m256i, src) } #[inline(always)] unsafe fn add(a: __m256i, b: __m256i) -> __m256i { _mm256_add_epi32(a, b) } #[inline(always)] unsafe fn xor(a: __m256i, b: __m256i) -> __m256i { _mm256_xor_si256(a, b) } #[inline(always)] unsafe fn set1(x: u32) -> __m256i { _mm256_set1_epi32(x as i32) } #[inline(always)] unsafe fn set8(a: u32, b: u32, c: u32, d: u32, e: u32, f: u32, g: u32, h: u32) -> __m256i { _mm256_setr_epi32( a as i32, b as i32, c as i32, d as i32, e as i32, f as i32, g as i32, h as i32, ) } // These rotations are the "simple/shifts version". For the // "complicated/shuffles version", see // https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. // For a discussion of the tradeoffs, see // https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug // (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better // on recent x86 chips. 
// Each rotN rotates every 32-bit lane right by N bits: the right shift
// supplies the low bits and the left shift by (32 - N) wraps the rest around.
#[inline(always)]
unsafe fn rot16(x: __m256i) -> __m256i {
    _mm256_or_si256(_mm256_srli_epi32(x, 16), _mm256_slli_epi32(x, 32 - 16))
}

#[inline(always)]
unsafe fn rot12(x: __m256i) -> __m256i {
    _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12))
}

#[inline(always)]
unsafe fn rot8(x: __m256i) -> __m256i {
    _mm256_or_si256(_mm256_srli_epi32(x, 8), _mm256_slli_epi32(x, 32 - 8))
}

#[inline(always)]
unsafe fn rot7(x: __m256i) -> __m256i {
    _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7))
}

// One full round over the transposed state. `v[i]` holds state word `i` for
// all DEGREE inputs at once, and `m[j]` holds message word `j` for all inputs;
// `MSG_SCHEDULE[r]` picks the message-word order for round `r`. The first half
// mixes the four columns (0,4,8,12) .. (3,7,11,15); the second half mixes the
// four diagonals (0,5,10,15), (1,6,11,12), (2,7,8,13), (3,4,9,14). The four
// mixes of each half are interleaved statement by statement, so the order of
// the lines below matters.
#[inline(always)]
unsafe fn round(v: &mut [__m256i; 16], m: &[__m256i; 16], r: usize) {
    // Column step, first half: message words 0/2/4/6 of the schedule.
    v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]);
    v[0] = add(v[0], v[4]);
    v[1] = add(v[1], v[5]);
    v[2] = add(v[2], v[6]);
    v[3] = add(v[3], v[7]);
    v[12] = xor(v[12], v[0]);
    v[13] = xor(v[13], v[1]);
    v[14] = xor(v[14], v[2]);
    v[15] = xor(v[15], v[3]);
    v[12] = rot16(v[12]);
    v[13] = rot16(v[13]);
    v[14] = rot16(v[14]);
    v[15] = rot16(v[15]);
    v[8] = add(v[8], v[12]);
    v[9] = add(v[9], v[13]);
    v[10] = add(v[10], v[14]);
    v[11] = add(v[11], v[15]);
    v[4] = xor(v[4], v[8]);
    v[5] = xor(v[5], v[9]);
    v[6] = xor(v[6], v[10]);
    v[7] = xor(v[7], v[11]);
    v[4] = rot12(v[4]);
    v[5] = rot12(v[5]);
    v[6] = rot12(v[6]);
    v[7] = rot12(v[7]);
    // Column step, second half: message words 1/3/5/7 of the schedule.
    v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]);
    v[0] = add(v[0], v[4]);
    v[1] = add(v[1], v[5]);
    v[2] = add(v[2], v[6]);
    v[3] = add(v[3], v[7]);
    v[12] = xor(v[12], v[0]);
    v[13] = xor(v[13], v[1]);
    v[14] = xor(v[14], v[2]);
    v[15] = xor(v[15], v[3]);
    v[12] = rot8(v[12]);
    v[13] = rot8(v[13]);
    v[14] = rot8(v[14]);
    v[15] = rot8(v[15]);
    v[8] = add(v[8], v[12]);
    v[9] = add(v[9], v[13]);
    v[10] = add(v[10], v[14]);
    v[11] = add(v[11], v[15]);
    v[4] = xor(v[4], v[8]);
    v[5] = xor(v[5], v[9]);
    v[6] = xor(v[6], v[10]);
    v[7] = xor(v[7], v[11]);
    v[4] = rot7(v[4]);
    v[5] = rot7(v[5]);
    v[6] = rot7(v[6]);
    v[7] = rot7(v[7]);

    // Diagonal step, first half: message words 8/10/12/14 of the schedule.
    v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]);
    v[0] = add(v[0], v[5]);
    v[1] = add(v[1], v[6]);
    v[2] = add(v[2], v[7]);
    v[3] = add(v[3], v[4]);
    v[15] = xor(v[15], v[0]);
    v[12] = xor(v[12], v[1]);
    v[13] = xor(v[13], v[2]);
    v[14] = xor(v[14], v[3]);
    v[15] = rot16(v[15]);
    v[12] = rot16(v[12]);
    v[13] = rot16(v[13]);
    v[14] = rot16(v[14]);
    v[10] = add(v[10], v[15]);
    v[11] = add(v[11], v[12]);
    v[8] = add(v[8], v[13]);
    v[9] = add(v[9], v[14]);
    v[5] = xor(v[5], v[10]);
    v[6] = xor(v[6], v[11]);
    v[7] = xor(v[7], v[8]);
    v[4] = xor(v[4], v[9]);
    v[5] = rot12(v[5]);
    v[6] = rot12(v[6]);
    v[7] = rot12(v[7]);
    v[4] = rot12(v[4]);
    // Diagonal step, second half: message words 9/11/13/15 of the schedule.
    v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]);
    v[0] = add(v[0], v[5]);
    v[1] = add(v[1], v[6]);
    v[2] = add(v[2], v[7]);
    v[3] = add(v[3], v[4]);
    v[15] = xor(v[15], v[0]);
    v[12] = xor(v[12], v[1]);
    v[13] = xor(v[13], v[2]);
    v[14] = xor(v[14], v[3]);
    v[15] = rot8(v[15]);
    v[12] = rot8(v[12]);
    v[13] = rot8(v[13]);
    v[14] = rot8(v[14]);
    v[10] = add(v[10], v[15]);
    v[11] = add(v[11], v[12]);
    v[8] = add(v[8], v[13]);
    v[9] = add(v[9], v[14]);
    v[5] = xor(v[5], v[10]);
    v[6] = xor(v[6], v[11]);
    v[7] = xor(v[7], v[8]);
    v[4] = xor(v[4], v[9]);
    v[5] = rot7(v[5]);
    v[6] = rot7(v[6]);
    v[7] = rot7(v[7]);
    v[4] = rot7(v[4]);
}

// Recombines the 128-bit halves of `a` and `b`: the first result pairs the two
// low halves (selector 0x20), the second pairs the two high halves (0x31).
#[inline(always)]
unsafe fn interleave128(a: __m256i, b: __m256i) -> (__m256i, __m256i) {
    (
        _mm256_permute2x128_si256(a, b, 0x20),
        _mm256_permute2x128_si256(a, b, 0x31),
    )
}

// There are several ways to do a transposition.
We could do it naively, with 8 separate // _mm256_set_epi32 instructions, referencing each of the 32 words explicitly. Or we could copy // the vecs into contiguous storage and then use gather instructions. This third approach is to use // a series of unpack instructions to interleave the vectors. In my benchmarks, interleaving is the // fastest approach. To test this, run `cargo +nightly bench --bench libtest load_8` in the // https://github.com/oconnor663/bao_experiments repo. #[inline(always)] unsafe fn transpose_vecs(vecs: &mut [__m256i; DEGREE]) { // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high is 22/33/66/77. let ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); let ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); let cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); let cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); let ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); let ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); let gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); let gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is 11/33. let abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); let abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); let abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); let abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); let efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); let efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); let efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); let efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); // Interleave 128-bit lanes. 
let (abcdefgh_0, abcdefgh_4) = interleave128(abcd_04, efgh_04); let (abcdefgh_1, abcdefgh_5) = interleave128(abcd_15, efgh_15); let (abcdefgh_2, abcdefgh_6) = interleave128(abcd_26, efgh_26); let (abcdefgh_3, abcdefgh_7) = interleave128(abcd_37, efgh_37); vecs[0] = abcdefgh_0; vecs[1] = abcdefgh_1; vecs[2] = abcdefgh_2; vecs[3] = abcdefgh_3; vecs[4] = abcdefgh_4; vecs[5] = abcdefgh_5; vecs[6] = abcdefgh_6; vecs[7] = abcdefgh_7; } #[inline(always)] unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m256i; 16] { let mut vecs = [ loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[4].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[5].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[6].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[7].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[4].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[5].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[6].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[7].add(block_offset + 1 * 4 * DEGREE)), ]; for i in 0..DEGREE { _mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0); } let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE); transpose_vecs(squares.0); transpose_vecs(squares.1); vecs } #[inline(always)] unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m256i, __m256i) { let mask = if increment_counter.yes() { !0 } else { 0 }; ( set8( counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)), counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3)), counter_low(counter + (mask & 4)), 
counter_low(counter + (mask & 5)), counter_low(counter + (mask & 6)), counter_low(counter + (mask & 7)), ), set8( counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)), counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3)), counter_high(counter + (mask & 4)), counter_high(counter + (mask & 5)), counter_high(counter + (mask & 6)), counter_high(counter + (mask & 7)), ), ) } #[target_feature(enable = "avx2")] pub unsafe fn hash8( inputs: &[*const u8; DEGREE], blocks: usize, key: &CVWords, counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, out: &mut [u8; DEGREE * OUT_LEN], ) { let mut h_vecs = [ set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), ]; let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); let mut block_flags = flags | flags_start; for block in 0..blocks { if block + 1 == blocks { block_flags |= flags_end; } let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only let block_flags_vec = set1(block_flags as u32); let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); // The transposed compression function. Note that inlining this // manually here improves compile times by a lot, compared to factoring // it out into its own function and making it #[inline(always)]. Just // guessing, it might have something to do with loop unrolling. 
let mut v = [ h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, ]; round(&mut v, &msg_vecs, 0); round(&mut v, &msg_vecs, 1); round(&mut v, &msg_vecs, 2); round(&mut v, &msg_vecs, 3); round(&mut v, &msg_vecs, 4); round(&mut v, &msg_vecs, 5); round(&mut v, &msg_vecs, 6); h_vecs[0] = xor(v[0], v[8]); h_vecs[1] = xor(v[1], v[9]); h_vecs[2] = xor(v[2], v[10]); h_vecs[3] = xor(v[3], v[11]); h_vecs[4] = xor(v[4], v[12]); h_vecs[5] = xor(v[5], v[13]); h_vecs[6] = xor(v[6], v[14]); h_vecs[7] = xor(v[7], v[15]); block_flags = flags; } transpose_vecs(&mut h_vecs); storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); storeu(h_vecs[1], out.as_mut_ptr().add(1 * 4 * DEGREE)); storeu(h_vecs[2], out.as_mut_ptr().add(2 * 4 * DEGREE)); storeu(h_vecs[3], out.as_mut_ptr().add(3 * 4 * DEGREE)); storeu(h_vecs[4], out.as_mut_ptr().add(4 * 4 * DEGREE)); storeu(h_vecs[5], out.as_mut_ptr().add(5 * 4 * DEGREE)); storeu(h_vecs[6], out.as_mut_ptr().add(6 * 4 * DEGREE)); storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); } #[target_feature(enable = "avx2")] pub unsafe fn hash_many( mut inputs: &[&[u8; N]], key: &CVWords, mut counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, mut out: &mut [u8], ) { debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN { // Safe because the layout of arrays is guaranteed, and because the // `blocks` count is determined statically from the argument type. 
let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); let blocks = N / BLOCK_LEN; hash8( input_ptrs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, array_mut_ref!(out, 0, DEGREE * OUT_LEN), ); if increment_counter.yes() { counter += DEGREE as u64; } inputs = &inputs[DEGREE..]; out = &mut out[DEGREE * OUT_LEN..]; } crate::sse41::hash_many( inputs, key, counter, increment_counter, flags, flags_start, flags_end, out, ); } #[cfg(test)] mod test { use super::*; #[test] fn test_transpose() { if !crate::platform::avx2_detected() { return; } #[target_feature(enable = "avx2")] unsafe fn transpose_wrapper(vecs: &mut [__m256i; DEGREE]) { transpose_vecs(vecs); } let mut matrix = [[0 as u32; DEGREE]; DEGREE]; for i in 0..DEGREE { for j in 0..DEGREE { matrix[i][j] = (i * DEGREE + j) as u32; } } unsafe { let mut vecs: [__m256i; DEGREE] = core::mem::transmute(matrix); transpose_wrapper(&mut vecs); matrix = core::mem::transmute(vecs); } for i in 0..DEGREE { for j in 0..DEGREE { // Reversed indexes from above. assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); } } } #[test] fn test_hash_many() { if !crate::platform::avx2_detected() { return; } crate::test::test_hash_many_fn(hash_many, hash_many); } } blake3-1.5.4/src/rust_sse2.rs000064400000000000000000000604631046102023000140730ustar 00000000000000#[cfg(target_arch = "x86")] use core::arch::x86::*; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::*; use crate::{ counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN, }; use arrayref::{array_mut_ref, array_ref, mut_array_refs}; pub const DEGREE: usize = 4; #[inline(always)] unsafe fn loadu(src: *const u8) -> __m128i { // This is an unaligned load, so the pointer cast is allowed. _mm_loadu_si128(src as *const __m128i) } #[inline(always)] unsafe fn storeu(src: __m128i, dest: *mut u8) { // This is an unaligned store, so the pointer cast is allowed. 
_mm_storeu_si128(dest as *mut __m128i, src)
}

// Lane-wise 32-bit addition.
#[inline(always)]
unsafe fn add(a: __m128i, b: __m128i) -> __m128i {
    _mm_add_epi32(a, b)
}

// Bitwise XOR of the two vectors.
#[inline(always)]
unsafe fn xor(a: __m128i, b: __m128i) -> __m128i {
    _mm_xor_si128(a, b)
}

// Broadcasts `x` into all four 32-bit lanes.
#[inline(always)]
unsafe fn set1(x: u32) -> __m128i {
    _mm_set1_epi32(x as i32)
}

// Builds a vector from four words, `a` in the lowest lane (the `setr`
// intrinsic takes its arguments in memory order).
#[inline(always)]
unsafe fn set4(a: u32, b: u32, c: u32, d: u32) -> __m128i {
    _mm_setr_epi32(a as i32, b as i32, c as i32, d as i32)
}

// These rotations are the "simple/shifts version". For the
// "complicated/shuffles version", see
// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66.
// For a discussion of the tradeoffs, see
// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug
// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better
// on recent x86 chips.
// Each rotN rotates every 32-bit lane right by N bits.
#[inline(always)]
unsafe fn rot16(a: __m128i) -> __m128i {
    _mm_or_si128(_mm_srli_epi32(a, 16), _mm_slli_epi32(a, 32 - 16))
}

#[inline(always)]
unsafe fn rot12(a: __m128i) -> __m128i {
    _mm_or_si128(_mm_srli_epi32(a, 12), _mm_slli_epi32(a, 32 - 12))
}

#[inline(always)]
unsafe fn rot8(a: __m128i) -> __m128i {
    _mm_or_si128(_mm_srli_epi32(a, 8), _mm_slli_epi32(a, 32 - 8))
}

#[inline(always)]
unsafe fn rot7(a: __m128i) -> __m128i {
    _mm_or_si128(_mm_srli_epi32(a, 7), _mm_slli_epi32(a, 32 - 7))
}

// First half of the mixing step, applied to all four lanes of the rows at
// once: add the message vector `m` and row1 into row0, then update row3
// (rotate by 16), row2, and row1 (rotate by 12) in that order.
#[inline(always)]
unsafe fn g1(
    row0: &mut __m128i,
    row1: &mut __m128i,
    row2: &mut __m128i,
    row3: &mut __m128i,
    m: __m128i,
) {
    *row0 = add(add(*row0, m), *row1);
    *row3 = xor(*row3, *row0);
    *row3 = rot16(*row3);
    *row2 = add(*row2, *row3);
    *row1 = xor(*row1, *row2);
    *row1 = rot12(*row1);
}

// Second half of the mixing step: same shape as `g1`, but with rotations by
// 8 and 7.
#[inline(always)]
unsafe fn g2(
    row0: &mut __m128i,
    row1: &mut __m128i,
    row2: &mut __m128i,
    row3: &mut __m128i,
    m: __m128i,
) {
    *row0 = add(add(*row0, m), *row1);
    *row3 = xor(*row3, *row0);
    *row3 = rot8(*row3);
    *row2 = add(*row2, *row3);
    *row1 = xor(*row1, *row2);
    *row1 = rot7(*row1);
}

// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479.
// Packs four 2-bit lane selectors into one shuffle immediate, `$z` in the
// highest two bits and `$w` in the lowest.
macro_rules! _MM_SHUFFLE {
    ($z:expr, $y:expr, $x:expr, $w:expr) => {
        ($z << 6) | ($y << 4) | ($x << 2) | $w
    };
}

// Two-source 32-bit shuffle on integer vectors, done by casting to float
// vectors, using `_mm_shuffle_ps`, and casting back.
macro_rules! shuffle2 {
    ($a:expr, $b:expr, $c:expr) => {
        _mm_castps_si128(_mm_shuffle_ps(
            _mm_castsi128_ps($a),
            _mm_castsi128_ps($b),
            $c,
        ))
    };
}

// Note the optimization here of leaving row1 as the unrotated row, rather than
// row0. All the message loads below are adjusted to compensate for this. See
// discussion at https://github.com/sneves/blake2-avx2/pull/4
#[inline(always)]
unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) {
    *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3));
    *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2));
    *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1));
}

// Inverse of `diagonalize`: the row0 and row2 shuffles are swapped, and the
// row3 shuffle is its own inverse.
#[inline(always)]
unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) {
    *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1));
    *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2));
    *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3));
}

// Blend of 16-bit lanes built from plain AND/compare/OR operations: lane `i`
// of the result comes from `b` when bit `i` of `imm8` is set, otherwise from
// `a`.
#[inline(always)]
unsafe fn blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
    // Lane i holds the single bit 1 << i.
    let bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
    let mut mask = _mm_set1_epi16(imm8 as i16);
    mask = _mm_and_si128(mask, bits);
    // All-ones in the lanes whose bit was set in imm8, zero elsewhere.
    mask = _mm_cmpeq_epi16(mask, bits);
    _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a))
}

// Single-block compression on four row vectors: row0/row1 are the two halves
// of `cv`, row2 is IV[0..4], and row3 is (counter low, counter high,
// block_len, flags). Returns the four rows after seven rounds; the callers
// apply the final XORs.
#[inline(always)]
unsafe fn compress_pre(
    cv: &CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) -> [__m128i; 4] {
    let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8);
    let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8);
    let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]);
    let row3 = &mut set4(
        counter_low(counter),
        counter_high(counter),
        block_len as u32,
        flags as u32,
    );

    // The four 16-byte quarters of the message block.
    let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE));
    let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE));
    let mut m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE));
    let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE));

    let mut t0;
    let mut t1;
    let mut t2;
    let mut t3;
    let mut tt;

    // Round 1. The first round permutes the message words from the original
    // input order, into the groups that get mixed in parallel.
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); // 6 4 2 0
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); // 7 5 3 1
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10 8
    t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10 8 14
    g1(row0, row1, row2, row3, t2);
    t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11 9
    t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11 9 15
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    // The permuted vectors become the message input of the next round.
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 2. This round and all following rounds apply a fixed permutation
    // to the message words from the round before.
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 3
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 4
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 5
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 6
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 7
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);

    [*row0, *row1, *row2, *row3]
}

/// Compresses one block and overwrites `cv` with the new chaining value
/// (`row0 ^ row2` followed by `row1 ^ row3`).
///
/// # Safety
///
/// The CPU must support SSE2.
#[target_feature(enable = "sse2")]
pub unsafe fn compress_in_place(
    cv: &mut CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) {
    let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags);
    storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8);
    storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8);
}

/// Compresses one block and returns the full 64-byte output: `row0 ^ row2`,
/// `row1 ^ row3`, then `row2` and `row3` each XORed with the matching half of
/// the input `cv`.
///
/// # Safety
///
/// The CPU must support SSE2.
#[target_feature(enable = "sse2")]
pub unsafe fn compress_xof(
    cv: &CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) -> [u8; 64] {
    let [mut row0, mut row1, mut row2, mut row3] =
        compress_pre(cv, block, block_len, counter, flags);
    row0 = xor(row0, row2);
    row1 = xor(row1, row3);
    row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8));
    row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8));
    core::mem::transmute([row0, row1, row2, row3])
}

/// One full round over the transposed state: `v[i]` holds state word `i` for
/// all `DEGREE` inputs and `m[j]` holds message word `j` for all inputs.
///
/// The first half mixes the four columns, the second half the four diagonals.
/// The four mixes of each half are interleaved statement by statement, so the
/// order of the lines matters.
#[inline(always)]
unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) {
    v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]);
    v[0] = add(v[0], v[4]);
    v[1] = add(v[1], v[5]);
    v[2] = add(v[2], v[6]);
    v[3] = add(v[3], v[7]);
    v[12] = xor(v[12], v[0]);
    v[13] = xor(v[13], v[1]);
    v[14] = xor(v[14], v[2]);
    v[15] = xor(v[15], v[3]);
    v[12] = rot16(v[12]);
    v[13] = rot16(v[13]);
    v[14] = rot16(v[14]);
    v[15] = rot16(v[15]);
    v[8] = add(v[8], v[12]);
    v[9] = add(v[9], v[13]);
    v[10] = add(v[10], v[14]);
    v[11] = add(v[11], v[15]);
    v[4] = xor(v[4], v[8]);
    v[5] = xor(v[5], v[9]);
    v[6] = xor(v[6], v[10]);
    v[7] = xor(v[7], v[11]);
    v[4] = rot12(v[4]);
    v[5] = rot12(v[5]);
    v[6] = rot12(v[6]);
    v[7] = rot12(v[7]);
    v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]);
    v[0] = add(v[0], v[4]);
    v[1] = add(v[1], v[5]);
    v[2] = add(v[2], v[6]);
    v[3] = add(v[3], v[7]);
    v[12] = xor(v[12], v[0]);
    v[13] = xor(v[13], v[1]);
    v[14] = xor(v[14], v[2]);
    v[15] = xor(v[15], v[3]);
    v[12] = rot8(v[12]);
    v[13] = rot8(v[13]);
    v[14] = rot8(v[14]);
    v[15] = rot8(v[15]);
    v[8] = add(v[8], v[12]);
    v[9] = add(v[9], v[13]);
    v[10] = add(v[10], v[14]);
    v[11] = add(v[11], v[15]);
    v[4] = xor(v[4], v[8]);
    v[5] = xor(v[5], v[9]);
    v[6] = xor(v[6], v[10]);
    v[7] = xor(v[7], v[11]);
    v[4] = rot7(v[4]);
    v[5] = rot7(v[5]);
    v[6] = rot7(v[6]);
    v[7] = rot7(v[7]);

    v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]);
    v[0] = add(v[0], v[5]);
    v[1] = add(v[1], v[6]);
    v[2] = add(v[2], v[7]);
    v[3] = add(v[3], v[4]);
    v[15] = xor(v[15], v[0]);
    v[12] = xor(v[12], v[1]);
    v[13] = xor(v[13], v[2]);
    v[14] = xor(v[14], v[3]);
    v[15] = rot16(v[15]);
    v[12] = rot16(v[12]);
    v[13] = rot16(v[13]);
    v[14] = rot16(v[14]);
    v[10] = add(v[10], v[15]);
    v[11] = add(v[11], v[12]);
    v[8] = add(v[8], v[13]);
    v[9] = add(v[9], v[14]);
    v[5] = xor(v[5], v[10]);
    v[6] = xor(v[6], v[11]);
    v[7] = xor(v[7], v[8]);
    v[4] = xor(v[4], v[9]);
    v[5] = rot12(v[5]);
    v[6] = rot12(v[6]);
    v[7] = rot12(v[7]);
    v[4] = rot12(v[4]);
    v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]);
    v[0] = add(v[0], v[5]);
    v[1] = add(v[1], v[6]);
    v[2] = add(v[2], v[7]);
    v[3] = add(v[3], v[4]);
    v[15] = xor(v[15], v[0]);
    v[12] = xor(v[12], v[1]);
    v[13] = xor(v[13], v[2]);
    v[14] = xor(v[14], v[3]);
    v[15] = rot8(v[15]);
    v[12] = rot8(v[12]);
    v[13] = rot8(v[13]);
    v[14] = rot8(v[14]);
    v[10] = add(v[10], v[15]);
    v[11] = add(v[11], v[12]);
    v[8] = add(v[8], v[13]);
    v[9] = add(v[9], v[14]);
    v[5] = xor(v[5], v[10]);
    v[6] = xor(v[6], v[11]);
    v[7] = xor(v[7], v[8]);
    v[4] = xor(v[4], v[9]);
    v[5] = rot7(v[5]);
    v[6] = rot7(v[6]);
    v[7] = rot7(v[7]);
    v[4] = rot7(v[4]);
}

/// Transposes a 4x4 matrix of 32-bit words in place, where `vecs[i]` is row `i`.
#[inline(always)]
unsafe fn transpose_vecs(vecs: &mut [__m128i; DEGREE]) {
    // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
    // 22/33. Note that this doesn't split the vector into two lanes, as the
    // AVX2 counterparts do.
    let ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
    let ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
    let cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
    let cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);

    // Interleave 64-bit lanes.
    let abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
    let abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
    let abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
    let abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);

    vecs[0] = abcd_0;
    vecs[1] = abcd_1;
    vecs[2] = abcd_2;
    vecs[3] = abcd_3;
}

/// Loads one block from each of the four inputs, starting `block_offset`
/// bytes in, and transposes the data so that element `j` of the result holds
/// message word `j` of all four inputs.
///
/// Every pointer in `inputs` must be readable for 64 bytes starting at
/// `block_offset` (four 16-byte loads per input).
#[inline(always)]
unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m128i; 16] {
    let mut vecs = [
        loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)),
        loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)),
        loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)),
        loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)),
        loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)),
        loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)),
        loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)),
        loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)),
    ];
    for i in 0..DEGREE {
        // The prefetch address is 256 bytes ahead and is routinely past the
        // end of the input. Prefetching is only a hint and never dereferences,
        // but `pointer::add` itself requires the result to stay inside the
        // allocation, so the address is computed with `wrapping_add` instead.
        _mm_prefetch(
            inputs[i].wrapping_add(block_offset + 256) as *const i8,
            _MM_HINT_T0,
        );
    }
    // The 16 vectors form four 4x4 squares, one per quarter of the block.
    let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE);
    transpose_vecs(squares.0);
    transpose_vecs(squares.1);
    transpose_vecs(squares.2);
    transpose_vecs(squares.3);
    vecs
}

/// Returns the per-lane counters as (low words, high words).
///
/// Lane `i` uses `counter + i` when `increment_counter.yes()`, otherwise every
/// lane uses `counter` (the all-zero mask cancels the per-lane offset).
#[inline(always)]
unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) {
    let mask = if increment_counter.yes() { !0 } else { 0 };
    (
        set4(
            counter_low(counter + (mask & 0)),
            counter_low(counter + (mask & 1)),
            counter_low(counter + (mask & 2)),
            counter_low(counter + (mask & 3)),
        ),
        set4(
            counter_high(counter + (mask & 0)),
            counter_high(counter + (mask & 1)),
            counter_high(counter + (mask & 2)),
            counter_high(counter + (mask & 3)),
        ),
    )
}

/// Hashes four equally sized inputs in parallel and writes the four
/// `OUT_LEN`-byte results to `out`, in input order.
///
/// # Safety
///
/// The CPU must support SSE2, and every pointer in `inputs` must be valid for
/// reads of `blocks * BLOCK_LEN` bytes.
#[target_feature(enable = "sse2")]
pub unsafe fn hash4(
    inputs: &[*const u8; DEGREE],
    blocks: usize,
    key: &CVWords,
    counter: u64,
    increment_counter: IncrementCounter,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    out: &mut [u8; DEGREE * OUT_LEN],
) {
    let mut h_vecs = [
        set1(key[0]),
        set1(key[1]),
        set1(key[2]),
        set1(key[3]),
        set1(key[4]),
        set1(key[5]),
        set1(key[6]),
        set1(key[7]),
    ];
    let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter);
    let mut block_flags = flags | flags_start;

    for block in 0..blocks {
        if block + 1 == blocks {
            block_flags |= flags_end;
        }
        let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only
        let block_flags_vec = set1(block_flags as u32);
        let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN);

        // The transposed compression function. Note that inlining this
        // manually here improves compile times by a lot, compared to factoring
        // it out into its own function and making it #[inline(always)]. Just
        // guessing, it might have something to do with loop unrolling.
let mut v = [ h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, ]; round(&mut v, &msg_vecs, 0); round(&mut v, &msg_vecs, 1); round(&mut v, &msg_vecs, 2); round(&mut v, &msg_vecs, 3); round(&mut v, &msg_vecs, 4); round(&mut v, &msg_vecs, 5); round(&mut v, &msg_vecs, 6); h_vecs[0] = xor(v[0], v[8]); h_vecs[1] = xor(v[1], v[9]); h_vecs[2] = xor(v[2], v[10]); h_vecs[3] = xor(v[3], v[11]); h_vecs[4] = xor(v[4], v[12]); h_vecs[5] = xor(v[5], v[13]); h_vecs[6] = xor(v[6], v[14]); h_vecs[7] = xor(v[7], v[15]); block_flags = flags; } let squares = mut_array_refs!(&mut h_vecs, DEGREE, DEGREE); transpose_vecs(squares.0); transpose_vecs(squares.1); // The first four vecs now contain the first half of each output, and the // second four vecs contain the second half of each output. storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE)); storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE)); storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE)); storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE)); storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE)); storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE)); storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); } #[target_feature(enable = "sse2")] unsafe fn hash1( input: &[u8; N], key: &CVWords, counter: u64, flags: u8, flags_start: u8, flags_end: u8, out: &mut CVBytes, ) { debug_assert_eq!(N % BLOCK_LEN, 0, "uneven blocks"); let mut cv = *key; let mut block_flags = flags | flags_start; let mut slice = &input[..]; while slice.len() >= BLOCK_LEN { if slice.len() == BLOCK_LEN { block_flags |= flags_end; } compress_in_place( &mut cv, array_ref!(slice, 0, BLOCK_LEN), BLOCK_LEN as u8, counter, block_flags, ); block_flags = flags; slice = &slice[BLOCK_LEN..]; } *out = core::mem::transmute(cv); // x86 is 
little-endian } #[target_feature(enable = "sse2")] pub unsafe fn hash_many( mut inputs: &[&[u8; N]], key: &CVWords, mut counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, mut out: &mut [u8], ) { debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN { // Safe because the layout of arrays is guaranteed, and because the // `blocks` count is determined statically from the argument type. let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); let blocks = N / BLOCK_LEN; hash4( input_ptrs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, array_mut_ref!(out, 0, DEGREE * OUT_LEN), ); if increment_counter.yes() { counter += DEGREE as u64; } inputs = &inputs[DEGREE..]; out = &mut out[DEGREE * OUT_LEN..]; } for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) { hash1( input, key, counter, flags, flags_start, flags_end, array_mut_ref!(output, 0, OUT_LEN), ); if increment_counter.yes() { counter += 1; } } } #[cfg(test)] mod test { use super::*; #[test] fn test_transpose() { if !crate::platform::sse2_detected() { return; } #[target_feature(enable = "sse2")] unsafe fn transpose_wrapper(vecs: &mut [__m128i; DEGREE]) { transpose_vecs(vecs); } let mut matrix = [[0 as u32; DEGREE]; DEGREE]; for i in 0..DEGREE { for j in 0..DEGREE { matrix[i][j] = (i * DEGREE + j) as u32; } } unsafe { let mut vecs: [__m128i; DEGREE] = core::mem::transmute(matrix); transpose_wrapper(&mut vecs); matrix = core::mem::transmute(vecs); } for i in 0..DEGREE { for j in 0..DEGREE { // Reversed indexes from above. 
assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); } } } #[test] fn test_compress() { if !crate::platform::sse2_detected() { return; } crate::test::test_compress_fn(compress_in_place, compress_xof); } #[test] fn test_hash_many() { if !crate::platform::sse2_detected() { return; } crate::test::test_hash_many_fn(hash_many, hash_many); } } blake3-1.5.4/src/rust_sse41.rs000064400000000000000000000600071046102023000141500ustar 00000000000000#[cfg(target_arch = "x86")] use core::arch::x86::*; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::*; use crate::{ counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN, }; use arrayref::{array_mut_ref, array_ref, mut_array_refs}; pub const DEGREE: usize = 4; #[inline(always)] unsafe fn loadu(src: *const u8) -> __m128i { // This is an unaligned load, so the pointer cast is allowed. _mm_loadu_si128(src as *const __m128i) } #[inline(always)] unsafe fn storeu(src: __m128i, dest: *mut u8) { // This is an unaligned store, so the pointer cast is allowed. _mm_storeu_si128(dest as *mut __m128i, src) } #[inline(always)] unsafe fn add(a: __m128i, b: __m128i) -> __m128i { _mm_add_epi32(a, b) } #[inline(always)] unsafe fn xor(a: __m128i, b: __m128i) -> __m128i { _mm_xor_si128(a, b) } #[inline(always)] unsafe fn set1(x: u32) -> __m128i { _mm_set1_epi32(x as i32) } #[inline(always)] unsafe fn set4(a: u32, b: u32, c: u32, d: u32) -> __m128i { _mm_setr_epi32(a as i32, b as i32, c as i32, d as i32) } // These rotations are the "simple/shifts version". For the // "complicated/shuffles version", see // https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. // For a discussion of the tradeoffs, see // https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug // (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better // on recent x86 chips. 
#[inline(always)] unsafe fn rot16(a: __m128i) -> __m128i { _mm_or_si128(_mm_srli_epi32(a, 16), _mm_slli_epi32(a, 32 - 16)) } #[inline(always)] unsafe fn rot12(a: __m128i) -> __m128i { _mm_or_si128(_mm_srli_epi32(a, 12), _mm_slli_epi32(a, 32 - 12)) } #[inline(always)] unsafe fn rot8(a: __m128i) -> __m128i { _mm_or_si128(_mm_srli_epi32(a, 8), _mm_slli_epi32(a, 32 - 8)) } #[inline(always)] unsafe fn rot7(a: __m128i) -> __m128i { _mm_or_si128(_mm_srli_epi32(a, 7), _mm_slli_epi32(a, 32 - 7)) } #[inline(always)] unsafe fn g1( row0: &mut __m128i, row1: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i, m: __m128i, ) { *row0 = add(add(*row0, m), *row1); *row3 = xor(*row3, *row0); *row3 = rot16(*row3); *row2 = add(*row2, *row3); *row1 = xor(*row1, *row2); *row1 = rot12(*row1); } #[inline(always)] unsafe fn g2( row0: &mut __m128i, row1: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i, m: __m128i, ) { *row0 = add(add(*row0, m), *row1); *row3 = xor(*row3, *row0); *row3 = rot8(*row3); *row2 = add(*row2, *row3); *row1 = xor(*row1, *row2); *row1 = rot7(*row1); } // Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479. macro_rules! _MM_SHUFFLE { ($z:expr, $y:expr, $x:expr, $w:expr) => { ($z << 6) | ($y << 4) | ($x << 2) | $w }; } macro_rules! shuffle2 { ($a:expr, $b:expr, $c:expr) => { _mm_castps_si128(_mm_shuffle_ps( _mm_castsi128_ps($a), _mm_castsi128_ps($b), $c, )) }; } // Note the optimization here of leaving row1 as the unrotated row, rather than // row0. All the message loads below are adjusted to compensate for this. 
See // discussion at https://github.com/sneves/blake2-avx2/pull/4 #[inline(always)] unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3)); *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1)); } #[inline(always)] unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1)); *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3)); } #[inline(always)] unsafe fn compress_pre( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) -> [__m128i; 4] { let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8); let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8); let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]); let row3 = &mut set4( counter_low(counter), counter_high(counter), block_len as u32, flags as u32, ); let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE)); let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE)); let mut m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE)); let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE)); let mut t0; let mut t1; let mut t2; let mut t3; let mut tt; // Round 1. The first round permutes the message words from the original // input order, into the groups that get mixed in parallel. 
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); // 6 4 2 0 g1(row0, row1, row2, row3, t0); t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); // 7 5 3 1 g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10 8 t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10 8 14 g1(row0, row1, row2, row3, t2); t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11 9 t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11 9 15 g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 2. This round and all following rounds apply a fixed permutation // to the message words from the round before. t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); g1(row0, row1, row2, row3, t0); t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); t1 = _mm_blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = _mm_unpacklo_epi64(m3, m1); tt = _mm_blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); g1(row0, row1, row2, row3, t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 3 t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); g1(row0, row1, row2, row3, t0); t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); t1 = _mm_blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = _mm_unpacklo_epi64(m3, m1); tt = _mm_blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); g1(row0, row1, row2, row3, t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = 
_mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 4 t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); g1(row0, row1, row2, row3, t0); t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); t1 = _mm_blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = _mm_unpacklo_epi64(m3, m1); tt = _mm_blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); g1(row0, row1, row2, row3, t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 5 t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); g1(row0, row1, row2, row3, t0); t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); t1 = _mm_blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = _mm_unpacklo_epi64(m3, m1); tt = _mm_blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); g1(row0, row1, row2, row3, t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 6 t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); g1(row0, row1, row2, row3, t0); t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); t1 = _mm_blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = 
_mm_unpacklo_epi64(m3, m1); tt = _mm_blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); g1(row0, row1, row2, row3, t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 7 t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); g1(row0, row1, row2, row3, t0); t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); t1 = _mm_blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = _mm_unpacklo_epi64(m3, m1); tt = _mm_blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); g1(row0, row1, row2, row3, t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); [*row0, *row1, *row2, *row3] } #[target_feature(enable = "sse4.1")] pub unsafe fn compress_in_place( cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) { let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags); storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8); storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8); } #[target_feature(enable = "sse4.1")] pub unsafe fn compress_xof( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) -> [u8; 64] { let [mut row0, mut row1, mut row2, mut row3] = compress_pre(cv, block, block_len, counter, flags); row0 = xor(row0, row2); row1 = xor(row1, row3); row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8)); row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8)); core::mem::transmute([row0, row1, row2, row3]) } #[inline(always)] unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 
16], r: usize) { v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]); v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]); v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]); v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]); v[0] = add(v[0], v[4]); v[1] = add(v[1], v[5]); v[2] = add(v[2], v[6]); v[3] = add(v[3], v[7]); v[12] = xor(v[12], v[0]); v[13] = xor(v[13], v[1]); v[14] = xor(v[14], v[2]); v[15] = xor(v[15], v[3]); v[12] = rot16(v[12]); v[13] = rot16(v[13]); v[14] = rot16(v[14]); v[15] = rot16(v[15]); v[8] = add(v[8], v[12]); v[9] = add(v[9], v[13]); v[10] = add(v[10], v[14]); v[11] = add(v[11], v[15]); v[4] = xor(v[4], v[8]); v[5] = xor(v[5], v[9]); v[6] = xor(v[6], v[10]); v[7] = xor(v[7], v[11]); v[4] = rot12(v[4]); v[5] = rot12(v[5]); v[6] = rot12(v[6]); v[7] = rot12(v[7]); v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]); v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]); v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]); v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]); v[0] = add(v[0], v[4]); v[1] = add(v[1], v[5]); v[2] = add(v[2], v[6]); v[3] = add(v[3], v[7]); v[12] = xor(v[12], v[0]); v[13] = xor(v[13], v[1]); v[14] = xor(v[14], v[2]); v[15] = xor(v[15], v[3]); v[12] = rot8(v[12]); v[13] = rot8(v[13]); v[14] = rot8(v[14]); v[15] = rot8(v[15]); v[8] = add(v[8], v[12]); v[9] = add(v[9], v[13]); v[10] = add(v[10], v[14]); v[11] = add(v[11], v[15]); v[4] = xor(v[4], v[8]); v[5] = xor(v[5], v[9]); v[6] = xor(v[6], v[10]); v[7] = xor(v[7], v[11]); v[4] = rot7(v[4]); v[5] = rot7(v[5]); v[6] = rot7(v[6]); v[7] = rot7(v[7]); v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]); v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]); v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]); v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]); v[0] = add(v[0], v[5]); v[1] = add(v[1], v[6]); v[2] = add(v[2], v[7]); v[3] = add(v[3], v[4]); v[15] = xor(v[15], v[0]); v[12] = xor(v[12], v[1]); v[13] = xor(v[13], v[2]); v[14] = xor(v[14], v[3]); v[15] = rot16(v[15]); v[12] = rot16(v[12]); 
v[13] = rot16(v[13]); v[14] = rot16(v[14]); v[10] = add(v[10], v[15]); v[11] = add(v[11], v[12]); v[8] = add(v[8], v[13]); v[9] = add(v[9], v[14]); v[5] = xor(v[5], v[10]); v[6] = xor(v[6], v[11]); v[7] = xor(v[7], v[8]); v[4] = xor(v[4], v[9]); v[5] = rot12(v[5]); v[6] = rot12(v[6]); v[7] = rot12(v[7]); v[4] = rot12(v[4]); v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]); v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]); v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]); v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]); v[0] = add(v[0], v[5]); v[1] = add(v[1], v[6]); v[2] = add(v[2], v[7]); v[3] = add(v[3], v[4]); v[15] = xor(v[15], v[0]); v[12] = xor(v[12], v[1]); v[13] = xor(v[13], v[2]); v[14] = xor(v[14], v[3]); v[15] = rot8(v[15]); v[12] = rot8(v[12]); v[13] = rot8(v[13]); v[14] = rot8(v[14]); v[10] = add(v[10], v[15]); v[11] = add(v[11], v[12]); v[8] = add(v[8], v[13]); v[9] = add(v[9], v[14]); v[5] = xor(v[5], v[10]); v[6] = xor(v[6], v[11]); v[7] = xor(v[7], v[8]); v[4] = xor(v[4], v[9]); v[5] = rot7(v[5]); v[6] = rot7(v[6]); v[7] = rot7(v[7]); v[4] = rot7(v[4]); } #[inline(always)] unsafe fn transpose_vecs(vecs: &mut [__m128i; DEGREE]) { // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is // 22/33. Note that this doesn't split the vector into two lanes, as the // AVX2 counterparts do. let ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); let ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); let cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); let cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); // Interleave 64-bit lanes. 
let abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); let abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); let abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); let abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); vecs[0] = abcd_0; vecs[1] = abcd_1; vecs[2] = abcd_2; vecs[3] = abcd_3; } #[inline(always)] unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m128i; 16] { let mut vecs = [ loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)), loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)), loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)), loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)), loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)), loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)), loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)), loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)), ]; for i in 0..DEGREE { _mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0); } let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE); transpose_vecs(squares.0); transpose_vecs(squares.1); transpose_vecs(squares.2); transpose_vecs(squares.3); vecs } #[inline(always)] unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) { let mask = if increment_counter.yes() { !0 } else { 0 }; ( set4( counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)), counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3)), ), set4( counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)), counter_high(counter + (mask & 2)), counter_high(counter + 
(mask & 3)), ), ) } #[target_feature(enable = "sse4.1")] pub unsafe fn hash4( inputs: &[*const u8; DEGREE], blocks: usize, key: &CVWords, counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, out: &mut [u8; DEGREE * OUT_LEN], ) { let mut h_vecs = [ set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), ]; let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); let mut block_flags = flags | flags_start; for block in 0..blocks { if block + 1 == blocks { block_flags |= flags_end; } let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only let block_flags_vec = set1(block_flags as u32); let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); // The transposed compression function. Note that inlining this // manually here improves compile times by a lot, compared to factoring // it out into its own function and making it #[inline(always)]. Just // guessing, it might have something to do with loop unrolling. let mut v = [ h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, ]; round(&mut v, &msg_vecs, 0); round(&mut v, &msg_vecs, 1); round(&mut v, &msg_vecs, 2); round(&mut v, &msg_vecs, 3); round(&mut v, &msg_vecs, 4); round(&mut v, &msg_vecs, 5); round(&mut v, &msg_vecs, 6); h_vecs[0] = xor(v[0], v[8]); h_vecs[1] = xor(v[1], v[9]); h_vecs[2] = xor(v[2], v[10]); h_vecs[3] = xor(v[3], v[11]); h_vecs[4] = xor(v[4], v[12]); h_vecs[5] = xor(v[5], v[13]); h_vecs[6] = xor(v[6], v[14]); h_vecs[7] = xor(v[7], v[15]); block_flags = flags; } let squares = mut_array_refs!(&mut h_vecs, DEGREE, DEGREE); transpose_vecs(squares.0); transpose_vecs(squares.1); // The first four vecs now contain the first half of each output, and the // second four vecs contain the second half of each output. 
storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE)); storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE)); storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE)); storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE)); storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE)); storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE)); storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); } #[target_feature(enable = "sse4.1")] unsafe fn hash1( input: &[u8; N], key: &CVWords, counter: u64, flags: u8, flags_start: u8, flags_end: u8, out: &mut CVBytes, ) { debug_assert_eq!(N % BLOCK_LEN, 0, "uneven blocks"); let mut cv = *key; let mut block_flags = flags | flags_start; let mut slice = &input[..]; while slice.len() >= BLOCK_LEN { if slice.len() == BLOCK_LEN { block_flags |= flags_end; } compress_in_place( &mut cv, array_ref!(slice, 0, BLOCK_LEN), BLOCK_LEN as u8, counter, block_flags, ); block_flags = flags; slice = &slice[BLOCK_LEN..]; } *out = core::mem::transmute(cv); // x86 is little-endian } #[target_feature(enable = "sse4.1")] pub unsafe fn hash_many( mut inputs: &[&[u8; N]], key: &CVWords, mut counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, mut out: &mut [u8], ) { debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN { // Safe because the layout of arrays is guaranteed, and because the // `blocks` count is determined statically from the argument type. 
let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); let blocks = N / BLOCK_LEN; hash4( input_ptrs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, array_mut_ref!(out, 0, DEGREE * OUT_LEN), ); if increment_counter.yes() { counter += DEGREE as u64; } inputs = &inputs[DEGREE..]; out = &mut out[DEGREE * OUT_LEN..]; } for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) { hash1( input, key, counter, flags, flags_start, flags_end, array_mut_ref!(output, 0, OUT_LEN), ); if increment_counter.yes() { counter += 1; } } } #[cfg(test)] mod test { use super::*; #[test] fn test_transpose() { if !crate::platform::sse41_detected() { return; } #[target_feature(enable = "sse4.1")] unsafe fn transpose_wrapper(vecs: &mut [__m128i; DEGREE]) { transpose_vecs(vecs); } let mut matrix = [[0 as u32; DEGREE]; DEGREE]; for i in 0..DEGREE { for j in 0..DEGREE { matrix[i][j] = (i * DEGREE + j) as u32; } } unsafe { let mut vecs: [__m128i; DEGREE] = core::mem::transmute(matrix); transpose_wrapper(&mut vecs); matrix = core::mem::transmute(vecs); } for i in 0..DEGREE { for j in 0..DEGREE { // Reversed indexes from above. assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); } } } #[test] fn test_compress() { if !crate::platform::sse41_detected() { return; } crate::test::test_compress_fn(compress_in_place, compress_xof); } #[test] fn test_hash_many() { if !crate::platform::sse41_detected() { return; } crate::test::test_hash_many_fn(hash_many, hash_many); } } blake3-1.5.4/src/test.rs000064400000000000000000001036771046102023000131260ustar 00000000000000use crate::{CVBytes, CVWords, IncrementCounter, BLOCK_LEN, CHUNK_LEN, OUT_LEN}; use arrayref::array_ref; use arrayvec::ArrayVec; use core::usize; use rand::prelude::*; // Interesting input lengths to run tests on. 
pub const TEST_CASES: &[usize] = &[ 0, 1, 2, 3, 4, 5, 6, 7, 8, BLOCK_LEN - 1, BLOCK_LEN, BLOCK_LEN + 1, 2 * BLOCK_LEN - 1, 2 * BLOCK_LEN, 2 * BLOCK_LEN + 1, CHUNK_LEN - 1, CHUNK_LEN, CHUNK_LEN + 1, 2 * CHUNK_LEN, 2 * CHUNK_LEN + 1, 3 * CHUNK_LEN, 3 * CHUNK_LEN + 1, 4 * CHUNK_LEN, 4 * CHUNK_LEN + 1, 5 * CHUNK_LEN, 5 * CHUNK_LEN + 1, 6 * CHUNK_LEN, 6 * CHUNK_LEN + 1, 7 * CHUNK_LEN, 7 * CHUNK_LEN + 1, 8 * CHUNK_LEN, 8 * CHUNK_LEN + 1, 16 * CHUNK_LEN, // AVX512's bandwidth 31 * CHUNK_LEN, // 16 + 8 + 4 + 2 + 1 100 * CHUNK_LEN, // subtrees larger than MAX_SIMD_DEGREE chunks ]; pub const TEST_CASES_MAX: usize = 100 * CHUNK_LEN; // There's a test to make sure these two are equal below. pub const TEST_KEY: CVBytes = *b"whats the Elvish word for friend"; pub const TEST_KEY_WORDS: CVWords = [ 1952540791, 1752440947, 1816469605, 1752394102, 1919907616, 1868963940, 1919295602, 1684956521, ]; // Paint the input with a repeating byte pattern. We use a cycle length of 251, // because that's the largest prime number less than 256. This makes it // unlikely to swapping any two adjacent input blocks or chunks will give the // same answer. pub fn paint_test_input(buf: &mut [u8]) { for (i, b) in buf.iter_mut().enumerate() { *b = (i % 251) as u8; } } type CompressInPlaceFn = unsafe fn(cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8); type CompressXofFn = unsafe fn( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) -> [u8; 64]; // A shared helper function for platform-specific tests. pub fn test_compress_fn(compress_in_place_fn: CompressInPlaceFn, compress_xof_fn: CompressXofFn) { let initial_state = TEST_KEY_WORDS; let block_len: u8 = 61; let mut block = [0; BLOCK_LEN]; paint_test_input(&mut block[..block_len as usize]); // Use a counter with set bits in both 32-bit words. 
let counter = (5u64 << 32) + 6; let flags = crate::CHUNK_END | crate::ROOT | crate::KEYED_HASH; let portable_out = crate::portable::compress_xof(&initial_state, &block, block_len, counter as u64, flags); let mut test_state = initial_state; unsafe { compress_in_place_fn(&mut test_state, &block, block_len, counter as u64, flags) }; let test_state_bytes = crate::platform::le_bytes_from_words_32(&test_state); let test_xof = unsafe { compress_xof_fn(&initial_state, &block, block_len, counter as u64, flags) }; assert_eq!(&portable_out[..32], &test_state_bytes[..]); assert_eq!(&portable_out[..], &test_xof[..]); } type HashManyFn = unsafe fn( inputs: &[&A], key: &CVWords, counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, out: &mut [u8], ); // A shared helper function for platform-specific tests. pub fn test_hash_many_fn( hash_many_chunks_fn: HashManyFn<[u8; CHUNK_LEN]>, hash_many_parents_fn: HashManyFn<[u8; 2 * OUT_LEN]>, ) { // Test a few different initial counter values. // - 0: The base case. // - u32::MAX: The low word of the counter overflows for all inputs except the first. // - i32::MAX: *No* overflow. But carry bugs in tricky SIMD code can screw this up, if you XOR // when you're supposed to ANDNOT... let initial_counters = [0, u32::MAX as u64, i32::MAX as u64]; for counter in initial_counters { #[cfg(feature = "std")] dbg!(counter); // 31 (16 + 8 + 4 + 2 + 1) inputs const NUM_INPUTS: usize = 31; let mut input_buf = [0; CHUNK_LEN * NUM_INPUTS]; crate::test::paint_test_input(&mut input_buf); // First hash chunks. 
let mut chunks = ArrayVec::<&[u8; CHUNK_LEN], NUM_INPUTS>::new(); for i in 0..NUM_INPUTS { chunks.push(array_ref!(input_buf, i * CHUNK_LEN, CHUNK_LEN)); } let mut portable_chunks_out = [0; NUM_INPUTS * OUT_LEN]; crate::portable::hash_many( &chunks, &TEST_KEY_WORDS, counter, IncrementCounter::Yes, crate::KEYED_HASH, crate::CHUNK_START, crate::CHUNK_END, &mut portable_chunks_out, ); let mut test_chunks_out = [0; NUM_INPUTS * OUT_LEN]; unsafe { hash_many_chunks_fn( &chunks[..], &TEST_KEY_WORDS, counter, IncrementCounter::Yes, crate::KEYED_HASH, crate::CHUNK_START, crate::CHUNK_END, &mut test_chunks_out, ); } for n in 0..NUM_INPUTS { #[cfg(feature = "std")] dbg!(n); assert_eq!( &portable_chunks_out[n * OUT_LEN..][..OUT_LEN], &test_chunks_out[n * OUT_LEN..][..OUT_LEN] ); } // Then hash parents. let mut parents = ArrayVec::<&[u8; 2 * OUT_LEN], NUM_INPUTS>::new(); for i in 0..NUM_INPUTS { parents.push(array_ref!(input_buf, i * 2 * OUT_LEN, 2 * OUT_LEN)); } let mut portable_parents_out = [0; NUM_INPUTS * OUT_LEN]; crate::portable::hash_many( &parents, &TEST_KEY_WORDS, counter, IncrementCounter::No, crate::KEYED_HASH | crate::PARENT, 0, 0, &mut portable_parents_out, ); let mut test_parents_out = [0; NUM_INPUTS * OUT_LEN]; unsafe { hash_many_parents_fn( &parents[..], &TEST_KEY_WORDS, counter, IncrementCounter::No, crate::KEYED_HASH | crate::PARENT, 0, 0, &mut test_parents_out, ); } for n in 0..NUM_INPUTS { #[cfg(feature = "std")] dbg!(n); assert_eq!( &portable_parents_out[n * OUT_LEN..][..OUT_LEN], &test_parents_out[n * OUT_LEN..][..OUT_LEN] ); } } } #[allow(unused)] type XofManyFunction = unsafe fn( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, out: &mut [u8], ); // A shared helper function for platform-specific tests. 
#[allow(unused)] pub fn test_xof_many_fn(xof_many_function: XofManyFunction) { let mut block = [0; BLOCK_LEN]; let block_len = 42; crate::test::paint_test_input(&mut block[..block_len]); let cv = [40, 41, 42, 43, 44, 45, 46, 47]; let flags = crate::KEYED_HASH; // Test a few different initial counter values. // - 0: The base case. // - u32::MAX: The low word of the counter overflows for all inputs except the first. // - i32::MAX: *No* overflow. But carry bugs in tricky SIMD code can screw this up, if you XOR // when you're supposed to ANDNOT... let initial_counters = [0, u32::MAX as u64, i32::MAX as u64]; for counter in initial_counters { #[cfg(feature = "std")] dbg!(counter); // 31 (16 + 8 + 4 + 2 + 1) outputs const OUTPUT_SIZE: usize = 31 * BLOCK_LEN; let mut portable_out = [0u8; OUTPUT_SIZE]; for (i, out_block) in portable_out.chunks_exact_mut(64).enumerate() { out_block.copy_from_slice(&crate::portable::compress_xof( &cv, &block, block_len as u8, counter + i as u64, flags, )); } let mut test_out = [0u8; OUTPUT_SIZE]; unsafe { xof_many_function(&cv, &block, block_len as u8, counter, flags, &mut test_out); } assert_eq!(portable_out, test_out); } // Test that xof_many doesn't write more blocks than requested. Note that the current assembly // implementation always outputs at least one block, so we don't test the zero case. 
for block_count in 1..=32 { let mut array = [0; BLOCK_LEN * 33]; let output_start = 17; let output_len = block_count * BLOCK_LEN; let output_end = output_start + output_len; let output = &mut array[output_start..output_end]; unsafe { xof_many_function(&cv, &block, block_len as u8, 0, flags, output); } for i in 0..array.len() { if i < output_start || output_end <= i { assert_eq!(0, array[i], "index {i}"); } } } } #[test] fn test_key_bytes_equal_key_words() { assert_eq!( TEST_KEY_WORDS, crate::platform::words_from_le_bytes_32(&TEST_KEY), ); } #[test] fn test_reference_impl_size() { // Because the Rust compiler optimizes struct layout, it's possible that // some future version of the compiler will produce a different size. If // that happens, we can either disable this test, or test for multiple // expected values. For now, the purpose of this test is to make sure we // notice if that happens. assert_eq!(1880, core::mem::size_of::()); } #[test] fn test_counter_words() { let counter: u64 = (1 << 32) + 2; assert_eq!(crate::counter_low(counter), 2); assert_eq!(crate::counter_high(counter), 1); } #[test] fn test_largest_power_of_two_leq() { let input_output = &[ // The zero case is nonsensical, but it does work. 
(0, 1), (1, 1), (2, 2), (3, 2), (4, 4), (5, 4), (6, 4), (7, 4), (8, 8), // the largest possible usize (usize::MAX, (usize::MAX >> 1) + 1), ]; for &(input, output) in input_output { assert_eq!( output, crate::largest_power_of_two_leq(input), "wrong output for n={}", input ); } } #[test] fn test_left_len() { let input_output = &[ (CHUNK_LEN + 1, CHUNK_LEN), (2 * CHUNK_LEN - 1, CHUNK_LEN), (2 * CHUNK_LEN, CHUNK_LEN), (2 * CHUNK_LEN + 1, 2 * CHUNK_LEN), (4 * CHUNK_LEN - 1, 2 * CHUNK_LEN), (4 * CHUNK_LEN, 2 * CHUNK_LEN), (4 * CHUNK_LEN + 1, 4 * CHUNK_LEN), ]; for &(input, output) in input_output { assert_eq!(crate::left_len(input), output); } } #[test] fn test_compare_reference_impl() { const OUT: usize = 303; // more than 64, not a multiple of 4 let mut input_buf = [0; TEST_CASES_MAX]; paint_test_input(&mut input_buf); for &case in TEST_CASES { let input = &input_buf[..case]; #[cfg(feature = "std")] dbg!(case); // regular { let mut reference_hasher = reference_impl::Hasher::new(); reference_hasher.update(input); let mut expected_out = [0; OUT]; reference_hasher.finalize(&mut expected_out); // all at once let test_out = crate::hash(input); assert_eq!(test_out, *array_ref!(expected_out, 0, 32)); // incremental let mut hasher = crate::Hasher::new(); hasher.update(input); assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); assert_eq!(hasher.finalize(), test_out); // incremental (rayon) #[cfg(feature = "rayon")] { let mut hasher = crate::Hasher::new(); hasher.update_rayon(input); assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); assert_eq!(hasher.finalize(), test_out); } // xof let mut extended = [0; OUT]; hasher.finalize_xof().fill(&mut extended); assert_eq!(extended, expected_out); } // keyed { let mut reference_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY); reference_hasher.update(input); let mut expected_out = [0; OUT]; reference_hasher.finalize(&mut expected_out); // all at once let test_out = crate::keyed_hash(&TEST_KEY, input); 
assert_eq!(test_out, *array_ref!(expected_out, 0, 32)); // incremental let mut hasher = crate::Hasher::new_keyed(&TEST_KEY); hasher.update(input); assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); assert_eq!(hasher.finalize(), test_out); // incremental (rayon) #[cfg(feature = "rayon")] { let mut hasher = crate::Hasher::new_keyed(&TEST_KEY); hasher.update_rayon(input); assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); assert_eq!(hasher.finalize(), test_out); } // xof let mut extended = [0; OUT]; hasher.finalize_xof().fill(&mut extended); assert_eq!(extended, expected_out); } // derive_key { let context = "BLAKE3 2019-12-27 16:13:59 example context (not the test vector one)"; let mut reference_hasher = reference_impl::Hasher::new_derive_key(context); reference_hasher.update(input); let mut expected_out = [0; OUT]; reference_hasher.finalize(&mut expected_out); // all at once let test_out = crate::derive_key(context, input); assert_eq!(test_out, expected_out[..32]); // incremental let mut hasher = crate::Hasher::new_derive_key(context); hasher.update(input); assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); assert_eq!(hasher.finalize(), *array_ref!(test_out, 0, 32)); // incremental (rayon) #[cfg(feature = "rayon")] { let mut hasher = crate::Hasher::new_derive_key(context); hasher.update_rayon(input); assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); assert_eq!(hasher.finalize(), *array_ref!(test_out, 0, 32)); } // xof let mut extended = [0; OUT]; hasher.finalize_xof().fill(&mut extended); assert_eq!(extended, expected_out); } } } #[test] fn test_xof_partial_blocks() { const OUT_LEN: usize = 6 * BLOCK_LEN; let mut reference_out = [0u8; OUT_LEN]; reference_impl::Hasher::new().finalize(&mut reference_out); let mut all_at_once_out = [0u8; OUT_LEN]; crate::Hasher::new() .finalize_xof() .fill(&mut all_at_once_out); assert_eq!(reference_out, all_at_once_out); let mut partial_out = [0u8; OUT_LEN]; let partial_start = 
32; let partial_end = OUT_LEN - 32; let mut xof = crate::Hasher::new().finalize_xof(); xof.fill(&mut partial_out[..partial_start]); xof.fill(&mut partial_out[partial_start..partial_end]); xof.fill(&mut partial_out[partial_end..]); assert_eq!(reference_out, partial_out); } fn reference_hash(input: &[u8]) -> crate::Hash { let mut hasher = reference_impl::Hasher::new(); hasher.update(input); let mut bytes = [0; 32]; hasher.finalize(&mut bytes); bytes.into() } #[test] fn test_compare_update_multiple() { // Don't use all the long test cases here, since that's unnecessarily slow // in debug mode. let mut short_test_cases = TEST_CASES; while *short_test_cases.last().unwrap() > 4 * CHUNK_LEN { short_test_cases = &short_test_cases[..short_test_cases.len() - 1]; } assert_eq!(*short_test_cases.last().unwrap(), 4 * CHUNK_LEN); let mut input_buf = [0; 2 * TEST_CASES_MAX]; paint_test_input(&mut input_buf); for &first_update in short_test_cases { #[cfg(feature = "std")] dbg!(first_update); let first_input = &input_buf[..first_update]; let mut test_hasher = crate::Hasher::new(); test_hasher.update(first_input); for &second_update in short_test_cases { #[cfg(feature = "std")] dbg!(second_update); let second_input = &input_buf[first_update..][..second_update]; let total_input = &input_buf[..first_update + second_update]; // Clone the hasher with first_update bytes already written, so // that the next iteration can reuse it. let mut test_hasher = test_hasher.clone(); test_hasher.update(second_input); let expected = reference_hash(total_input); assert_eq!(expected, test_hasher.finalize()); } } } #[test] fn test_fuzz_hasher() { const INPUT_MAX: usize = 4 * CHUNK_LEN; let mut input_buf = [0; 3 * INPUT_MAX]; paint_test_input(&mut input_buf); // Don't do too many iterations in debug mode, to keep the tests under a // second or so. CI should run tests in release mode also. Provide an // environment variable for specifying a larger number of fuzz iterations. 
let num_tests = if cfg!(debug_assertions) { 100 } else { 10_000 }; // Use a fixed RNG seed for reproducibility. let mut rng = rand_chacha::ChaCha8Rng::from_seed([1; 32]); for _num_test in 0..num_tests { #[cfg(feature = "std")] dbg!(_num_test); let mut hasher = crate::Hasher::new(); let mut total_input = 0; // For each test, write 3 inputs of random length. for _ in 0..3 { let input_len = rng.gen_range(0..(INPUT_MAX + 1)); #[cfg(feature = "std")] dbg!(input_len); let input = &input_buf[total_input..][..input_len]; hasher.update(input); total_input += input_len; } let expected = reference_hash(&input_buf[..total_input]); assert_eq!(expected, hasher.finalize()); } } #[test] fn test_fuzz_xof() { let mut input_buf = [0u8; 3 * BLOCK_LEN]; paint_test_input(&mut input_buf); // Don't do too many iterations in debug mode, to keep the tests under a // second or so. CI should run tests in release mode also. Provide an // environment variable for specifying a larger number of fuzz iterations. let num_tests = if cfg!(debug_assertions) { 100 } else { 2500 }; // Use a fixed RNG seed for reproducibility. 
let mut rng = rand_chacha::ChaCha8Rng::from_seed([1; 32]); for _num_test in 0..num_tests { #[cfg(feature = "std")] dbg!(_num_test); // 31 (16 + 8 + 4 + 2 + 1) outputs let mut output_buf = [0; 31 * CHUNK_LEN]; let input_len = rng.gen_range(0..input_buf.len()); let mut xof = crate::Hasher::new() .update(&input_buf[..input_len]) .finalize_xof(); let partial_start = rng.gen_range(0..output_buf.len()); let partial_end = rng.gen_range(partial_start..output_buf.len()); xof.fill(&mut output_buf[..partial_start]); xof.fill(&mut output_buf[partial_start..partial_end]); xof.fill(&mut output_buf[partial_end..]); let mut reference_buf = [0; 31 * CHUNK_LEN]; let mut reference_hasher = reference_impl::Hasher::new(); reference_hasher.update(&input_buf[..input_len]); reference_hasher.finalize(&mut reference_buf); assert_eq!(reference_buf, output_buf); } } #[test] fn test_xof_seek() { let mut out = [0; 533]; let mut hasher = crate::Hasher::new(); hasher.update(b"foo"); hasher.finalize_xof().fill(&mut out); assert_eq!(hasher.finalize().as_bytes(), &out[0..32]); let mut reader = hasher.finalize_xof(); reader.set_position(303); let mut out2 = [0; 102]; reader.fill(&mut out2); assert_eq!(&out[303..][..102], &out2[..]); #[cfg(feature = "std")] { use std::io::prelude::*; let mut reader = hasher.finalize_xof(); reader.seek(std::io::SeekFrom::Start(303)).unwrap(); let mut out3 = Vec::new(); reader.by_ref().take(102).read_to_end(&mut out3).unwrap(); assert_eq!(&out[303..][..102], &out3[..]); assert_eq!( reader.seek(std::io::SeekFrom::Current(0)).unwrap(), 303 + 102 ); reader.seek(std::io::SeekFrom::Current(-5)).unwrap(); assert_eq!( reader.seek(std::io::SeekFrom::Current(0)).unwrap(), 303 + 102 - 5 ); let mut out4 = [0; 17]; assert_eq!(reader.read(&mut out4).unwrap(), 17); assert_eq!(&out[303 + 102 - 5..][..17], &out4[..]); assert_eq!( reader.seek(std::io::SeekFrom::Current(0)).unwrap(), 303 + 102 - 5 + 17 ); assert!(reader.seek(std::io::SeekFrom::End(0)).is_err()); 
assert!(reader.seek(std::io::SeekFrom::Current(-1000)).is_err()); } } #[test] fn test_msg_schedule_permutation() { let permutation = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8]; let mut generated = [[0; 16]; 7]; generated[0] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; for round in 1..7 { for i in 0..16 { generated[round][i] = generated[round - 1][permutation[i]]; } } assert_eq!(generated, crate::MSG_SCHEDULE); } #[test] fn test_reset() { let mut hasher = crate::Hasher::new(); hasher.update(&[42; 3 * CHUNK_LEN + 7]); hasher.reset(); hasher.update(&[42; CHUNK_LEN + 3]); assert_eq!(hasher.finalize(), crate::hash(&[42; CHUNK_LEN + 3])); let key = &[99; crate::KEY_LEN]; let mut keyed_hasher = crate::Hasher::new_keyed(key); keyed_hasher.update(&[42; 3 * CHUNK_LEN + 7]); keyed_hasher.reset(); keyed_hasher.update(&[42; CHUNK_LEN + 3]); assert_eq!( keyed_hasher.finalize(), crate::keyed_hash(key, &[42; CHUNK_LEN + 3]), ); let context = "BLAKE3 2020-02-12 10:20:58 reset test"; let mut kdf = crate::Hasher::new_derive_key(context); kdf.update(&[42; 3 * CHUNK_LEN + 7]); kdf.reset(); kdf.update(&[42; CHUNK_LEN + 3]); let expected = crate::derive_key(context, &[42; CHUNK_LEN + 3]); assert_eq!(kdf.finalize(), expected); } #[test] fn test_hex_encoding_decoding() { let digest_str = "04e0bb39f30b1a3feb89f536c93be15055482df748674b00d26e5a75777702e9"; let mut hasher = crate::Hasher::new(); hasher.update(b"foo"); let digest = hasher.finalize(); assert_eq!(digest.to_hex().as_str(), digest_str); #[cfg(feature = "std")] assert_eq!(digest.to_string(), digest_str); // Test round trip let digest = crate::Hash::from_hex(digest_str).unwrap(); assert_eq!(digest.to_hex().as_str(), digest_str); // Test uppercase let digest = crate::Hash::from_hex(digest_str.to_uppercase()).unwrap(); assert_eq!(digest.to_hex().as_str(), digest_str); // Test string parsing via FromStr let digest: crate::Hash = digest_str.parse().unwrap(); assert_eq!(digest.to_hex().as_str(), digest_str); // 
Test errors let bad_len = "04e0bb39f30b1"; let _result = crate::Hash::from_hex(bad_len).unwrap_err(); #[cfg(feature = "std")] assert_eq!(_result.to_string(), "expected 64 hex bytes, received 13"); let bad_char = "Z4e0bb39f30b1a3feb89f536c93be15055482df748674b00d26e5a75777702e9"; let _result = crate::Hash::from_hex(bad_char).unwrap_err(); #[cfg(feature = "std")] assert_eq!(_result.to_string(), "invalid hex character: 'Z'"); let _result = crate::Hash::from_hex([128; 64]).unwrap_err(); #[cfg(feature = "std")] assert_eq!(_result.to_string(), "invalid hex character: 0x80"); } // This test is a mimized failure case for the Windows SSE2 bug described in // https://github.com/BLAKE3-team/BLAKE3/issues/206. // // Before that issue was fixed, this test would fail on Windows in the following configuration: // // cargo test --features=no_avx512,no_avx2,no_sse41 --release // // Bugs like this one (stomping on a caller's register) are very sensitive to the details of // surrounding code, so it's not especially likely that this test will catch another bug (or even // the same bug) in the future. Still, there's no harm in keeping it. #[test] fn test_issue_206_windows_sse2() { // This stupid loop has to be here to trigger the bug. I don't know why. for _ in &[0] { // The length 65 (two blocks) is significant. It doesn't repro with 64 (one block). It also // doesn't repro with an all-zero input. let input = &[0xff; 65]; let expected_hash = [ 183, 235, 50, 217, 156, 24, 190, 219, 2, 216, 176, 255, 224, 53, 28, 95, 57, 148, 179, 245, 162, 90, 37, 121, 0, 142, 219, 62, 234, 204, 225, 161, ]; // This throwaway call has to be here to trigger the bug. crate::Hasher::new().update(input); // This assert fails when the bug is triggered. 
assert_eq!(crate::Hasher::new().update(input).finalize(), expected_hash); } } #[test] fn test_hash_conversions() { let bytes1 = [42; 32]; let hash1: crate::Hash = bytes1.into(); let bytes2: [u8; 32] = hash1.into(); assert_eq!(bytes1, bytes2); let bytes3 = *hash1.as_bytes(); assert_eq!(bytes1, bytes3); let hash2 = crate::Hash::from_bytes(bytes1); assert_eq!(hash1, hash2); let hex = hash1.to_hex(); let hash3 = crate::Hash::from_hex(hex.as_bytes()).unwrap(); assert_eq!(hash1, hash3); } #[test] const fn test_hash_const_conversions() { let bytes = [42; 32]; let hash = crate::Hash::from_bytes(bytes); _ = hash.as_bytes(); } #[cfg(feature = "zeroize")] #[test] fn test_zeroize() { use zeroize::Zeroize; let mut hash = crate::Hash([42; 32]); hash.zeroize(); assert_eq!(hash.0, [0u8; 32]); let mut hasher = crate::Hasher { chunk_state: crate::ChunkState { cv: [42; 8], chunk_counter: 42, buf: [42; 64], buf_len: 42, blocks_compressed: 42, flags: 42, platform: crate::Platform::Portable, }, key: [42; 8], cv_stack: [[42; 32]; { crate::MAX_DEPTH + 1 }].into(), }; hasher.zeroize(); assert_eq!(hasher.chunk_state.cv, [0; 8]); assert_eq!(hasher.chunk_state.chunk_counter, 0); assert_eq!(hasher.chunk_state.buf, [0; 64]); assert_eq!(hasher.chunk_state.buf_len, 0); assert_eq!(hasher.chunk_state.blocks_compressed, 0); assert_eq!(hasher.chunk_state.flags, 0); assert!(matches!( hasher.chunk_state.platform, crate::Platform::Portable )); assert_eq!(hasher.key, [0; 8]); assert_eq!(&*hasher.cv_stack, &[[0u8; 32]; 0]); let mut output_reader = crate::OutputReader { inner: crate::Output { input_chaining_value: [42; 8], block: [42; 64], counter: 42, block_len: 42, flags: 42, platform: crate::Platform::Portable, }, position_within_block: 42, }; output_reader.zeroize(); assert_eq!(output_reader.inner.input_chaining_value, [0; 8]); assert_eq!(output_reader.inner.block, [0; 64]); assert_eq!(output_reader.inner.counter, 0); assert_eq!(output_reader.inner.block_len, 0); assert_eq!(output_reader.inner.flags, 
0); assert!(matches!( output_reader.inner.platform, crate::Platform::Portable )); assert_eq!(output_reader.position_within_block, 0); } #[test] #[cfg(feature = "std")] fn test_update_reader() -> Result<(), std::io::Error> { // This is a brief test, since update_reader() is mostly a wrapper around update(), which already // has substantial testing. let mut input = vec![0; 1_000_000]; paint_test_input(&mut input); assert_eq!( crate::Hasher::new().update_reader(&input[..])?.finalize(), crate::hash(&input), ); Ok(()) } #[test] #[cfg(feature = "std")] fn test_update_reader_interrupted() -> std::io::Result<()> { use std::io; struct InterruptingReader<'a> { already_interrupted: bool, slice: &'a [u8], } impl<'a> InterruptingReader<'a> { fn new(slice: &'a [u8]) -> Self { Self { already_interrupted: false, slice, } } } impl<'a> io::Read for InterruptingReader<'a> { fn read(&mut self, buf: &mut [u8]) -> io::Result { if !self.already_interrupted { self.already_interrupted = true; return Err(io::Error::from(io::ErrorKind::Interrupted)); } let take = std::cmp::min(self.slice.len(), buf.len()); buf[..take].copy_from_slice(&self.slice[..take]); self.slice = &self.slice[take..]; Ok(take) } } let input = b"hello world"; let mut reader = InterruptingReader::new(input); let mut hasher = crate::Hasher::new(); hasher.update_reader(&mut reader)?; assert_eq!(hasher.finalize(), crate::hash(input)); Ok(()) } #[test] #[cfg(feature = "mmap")] // NamedTempFile isn't Miri-compatible #[cfg(not(miri))] fn test_mmap() -> Result<(), std::io::Error> { // This is a brief test, since update_mmap() is mostly a wrapper around update(), which already // has substantial testing. use std::io::prelude::*; let mut input = vec![0; 1_000_000]; paint_test_input(&mut input); let mut tempfile = tempfile::NamedTempFile::new()?; tempfile.write_all(&input)?; tempfile.flush()?; assert_eq!( crate::Hasher::new() .update_mmap(tempfile.path())? 
.finalize(), crate::hash(&input), ); Ok(()) } #[test] #[cfg(feature = "mmap")] #[cfg(target_os = "linux")] fn test_mmap_virtual_file() -> Result<(), std::io::Error> { // Virtual files like /proc/version can't be mmapped, because their contents don't actually // exist anywhere in memory. Make sure we fall back to regular file IO in these cases. // Currently this is handled with a length check, where the assumption is that virtual files // will always report length 0. If that assumption ever breaks, hopefully this test will catch // it. let virtual_filepath = "/proc/version"; let mut mmap_hasher = crate::Hasher::new(); // We'll fail right here if the fallback doesn't work. mmap_hasher.update_mmap(virtual_filepath)?; let mut read_hasher = crate::Hasher::new(); read_hasher.update_reader(std::fs::File::open(virtual_filepath)?)?; assert_eq!(mmap_hasher.finalize(), read_hasher.finalize()); Ok(()) } #[test] #[cfg(feature = "mmap")] #[cfg(feature = "rayon")] // NamedTempFile isn't Miri-compatible #[cfg(not(miri))] fn test_mmap_rayon() -> Result<(), std::io::Error> { // This is a brief test, since update_mmap_rayon() is mostly a wrapper around update_rayon(), // which already has substantial testing. use std::io::prelude::*; let mut input = vec![0; 1_000_000]; paint_test_input(&mut input); let mut tempfile = tempfile::NamedTempFile::new()?; tempfile.write_all(&input)?; tempfile.flush()?; assert_eq!( crate::Hasher::new() .update_mmap_rayon(tempfile.path())? .finalize(), crate::hash(&input), ); Ok(()) } #[test] #[cfg(feature = "std")] #[cfg(feature = "serde")] fn test_serde() { // Henrik suggested that we use 0xfe / 254 for byte test data instead of 0xff / 255, due to the // fact that 0xfe is not a well formed CBOR item. 
let hash: crate::Hash = [0xfe; 32].into(); let json = serde_json::to_string(&hash).unwrap(); assert_eq!( json, "[254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254]", ); let hash2: crate::Hash = serde_json::from_str(&json).unwrap(); assert_eq!(hash, hash2); let mut cbor = Vec::::new(); ciborium::into_writer(&hash, &mut cbor).unwrap(); assert_eq!( cbor, [ 0x98, 0x20, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, ] ); let hash_from_cbor: crate::Hash = ciborium::from_reader(&cbor[..]).unwrap(); assert_eq!(hash_from_cbor, hash); // Version 1.5.2 of this crate changed the default serialization format to a bytestring // (instead of an array/list) to save bytes on the wire. That was a backwards compatibility // mistake for non-self-describing formats, and it's been reverted. Since some small number of // serialized bytestrings will probably exist forever in the wild, we shold test that we can // still deserialize these from self-describing formats. let bytestring_cbor: &[u8] = &[ 0x58, 0x20, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, ]; let hash_from_bytestring_cbor: crate::Hash = ciborium::from_reader(bytestring_cbor).unwrap(); assert_eq!(hash_from_bytestring_cbor, hash); } // `cargo +nightly miri test` currently works, but it takes forever, because some of our test // inputs are quite large. 
Most of our unsafe code is platform specific and incompatible with Miri // anyway, but we'd like it to be possible for callers to run their own tests under Miri, assuming // they don't use incompatible features like Rayon or mmap. This test should get reasonable // coverage of our public API without using any large inputs, so we can run it in CI and catch // obvious breaks. (For example, constant_time_eq is not compatible with Miri.) #[test] fn test_miri_smoketest() { let mut hasher = crate::Hasher::new_derive_key("Miri smoketest"); hasher.update(b"foo"); #[cfg(feature = "std")] hasher.update_reader(&b"bar"[..]).unwrap(); assert_eq!(hasher.finalize(), hasher.finalize()); let mut reader = hasher.finalize_xof(); reader.set_position(999999); reader.fill(&mut [0]); } blake3-1.5.4/src/traits.rs000064400000000000000000000157751046102023000134560ustar 00000000000000//! Implementations of commonly used traits like `Digest` and `Mac` from the //! [`digest`](https://crates.io/crates/digest) crate. 
pub use digest; use crate::{Hasher, OutputReader}; use digest::crypto_common; use digest::generic_array::{typenum::U32, typenum::U64, GenericArray}; impl digest::HashMarker for Hasher {} impl digest::Update for Hasher { #[inline] fn update(&mut self, data: &[u8]) { self.update(data); } } impl digest::Reset for Hasher { #[inline] fn reset(&mut self) { self.reset(); // the inherent method } } impl digest::OutputSizeUser for Hasher { type OutputSize = U32; } impl digest::FixedOutput for Hasher { #[inline] fn finalize_into(self, out: &mut GenericArray) { out.copy_from_slice(self.finalize().as_bytes()); } } impl digest::FixedOutputReset for Hasher { #[inline] fn finalize_into_reset(&mut self, out: &mut GenericArray) { out.copy_from_slice(self.finalize().as_bytes()); self.reset(); } } impl digest::ExtendableOutput for Hasher { type Reader = OutputReader; #[inline] fn finalize_xof(self) -> Self::Reader { Hasher::finalize_xof(&self) } } impl digest::ExtendableOutputReset for Hasher { #[inline] fn finalize_xof_reset(&mut self) -> Self::Reader { let reader = Hasher::finalize_xof(self); self.reset(); reader } } impl digest::XofReader for OutputReader { #[inline] fn read(&mut self, buffer: &mut [u8]) { self.fill(buffer); } } impl crypto_common::KeySizeUser for Hasher { type KeySize = U32; } impl crypto_common::BlockSizeUser for Hasher { type BlockSize = U64; } impl digest::MacMarker for Hasher {} impl digest::KeyInit for Hasher { #[inline] fn new(key: &digest::Key) -> Self { let key_bytes: [u8; 32] = (*key).into(); Hasher::new_keyed(&key_bytes) } } #[cfg(test)] mod test { use super::*; #[test] fn test_digest_traits() { // Inherent methods. let mut hasher1 = crate::Hasher::new(); hasher1.update(b"foo"); hasher1.update(b"bar"); hasher1.update(b"baz"); let out1 = hasher1.finalize(); let mut xof1 = [0; 301]; hasher1.finalize_xof().fill(&mut xof1); assert_eq!(out1.as_bytes(), &xof1[..32]); // Trait implementations. 
let mut hasher2: crate::Hasher = digest::Digest::new(); digest::Digest::update(&mut hasher2, b"xxx"); digest::Digest::reset(&mut hasher2); digest::Digest::update(&mut hasher2, b"foo"); digest::Digest::update(&mut hasher2, b"bar"); digest::Digest::update(&mut hasher2, b"baz"); let out2 = digest::Digest::finalize(hasher2.clone()); let mut xof2 = [0; 301]; digest::XofReader::read( &mut digest::ExtendableOutput::finalize_xof(hasher2.clone()), &mut xof2, ); assert_eq!(out1.as_bytes(), &out2[..]); assert_eq!(xof1[..], xof2[..]); // Again with the resetting variants. let mut hasher3: crate::Hasher = digest::Digest::new(); digest::Digest::update(&mut hasher3, b"foobarbaz"); let mut out3 = [0; 32]; digest::FixedOutputReset::finalize_into_reset( &mut hasher3, GenericArray::from_mut_slice(&mut out3), ); digest::Digest::update(&mut hasher3, b"foobarbaz"); let mut out4 = [0; 32]; digest::FixedOutputReset::finalize_into_reset( &mut hasher3, GenericArray::from_mut_slice(&mut out4), ); digest::Digest::update(&mut hasher3, b"foobarbaz"); let mut xof3 = [0; 301]; digest::XofReader::read( &mut digest::ExtendableOutputReset::finalize_xof_reset(&mut hasher3), &mut xof3, ); digest::Digest::update(&mut hasher3, b"foobarbaz"); let mut xof4 = [0; 301]; digest::XofReader::read( &mut digest::ExtendableOutputReset::finalize_xof_reset(&mut hasher3), &mut xof4, ); assert_eq!(out1.as_bytes(), &out3[..]); assert_eq!(out1.as_bytes(), &out4[..]); assert_eq!(xof1[..], xof3[..]); assert_eq!(xof1[..], xof4[..]); } #[test] fn test_mac_trait() { // Inherent methods. let key = b"some super secret key bytes fooo"; let mut hasher1 = crate::Hasher::new_keyed(key); hasher1.update(b"foo"); hasher1.update(b"bar"); hasher1.update(b"baz"); let out1 = hasher1.finalize(); // Trait implementation. 
let generic_key = (*key).into(); let mut hasher2: crate::Hasher = digest::Mac::new(&generic_key); digest::Mac::update(&mut hasher2, b"xxx"); digest::Mac::reset(&mut hasher2); digest::Mac::update(&mut hasher2, b"foo"); digest::Mac::update(&mut hasher2, b"bar"); digest::Mac::update(&mut hasher2, b"baz"); let out2 = digest::Mac::finalize(hasher2); assert_eq!(out1.as_bytes(), out2.into_bytes().as_slice()); } fn expected_hmac_blake3(key: &[u8], input: &[u8]) -> [u8; 32] { // See https://en.wikipedia.org/wiki/HMAC. let key_hash; let key_prime = if key.len() <= 64 { key } else { key_hash = *crate::hash(key).as_bytes(); &key_hash }; let mut ipad = [0x36; 64]; let mut opad = [0x5c; 64]; for i in 0..key_prime.len() { ipad[i] ^= key_prime[i]; opad[i] ^= key_prime[i]; } let mut inner_state = crate::Hasher::new(); inner_state.update(&ipad); inner_state.update(input); let mut outer_state = crate::Hasher::new(); outer_state.update(&opad); outer_state.update(inner_state.finalize().as_bytes()); outer_state.finalize().into() } #[test] fn test_hmac_compatibility() { use hmac::{Mac, SimpleHmac}; // Test a short key. let mut x = SimpleHmac::::new_from_slice(b"key").unwrap(); hmac::digest::Update::update(&mut x, b"data"); let output = x.finalize().into_bytes(); assert_ne!(output.len(), 0); let expected = expected_hmac_blake3(b"key", b"data"); assert_eq!(expected, output.as_ref()); // Test a range of key and data lengths, particularly to exercise the long-key logic. 
let mut input_bytes = [0; crate::test::TEST_CASES_MAX]; crate::test::paint_test_input(&mut input_bytes); for &input_len in crate::test::TEST_CASES { #[cfg(feature = "std")] dbg!(input_len); let input = &input_bytes[..input_len]; let mut x = SimpleHmac::::new_from_slice(input).unwrap(); hmac::digest::Update::update(&mut x, input); let output = x.finalize().into_bytes(); assert_ne!(output.len(), 0); let expected = expected_hmac_blake3(input, input); assert_eq!(expected, output.as_ref()); } } } blake3-1.5.4/tools/release.md000064400000000000000000000011571046102023000141020ustar 00000000000000# Release checklist - Make sure `cargo outdated -R` is clean in the root and in b3sum/. - Bump the version in the root Cargo.toml. - Bump the version in b3sum/Cargo.toml. - Delete b3sum/Cargo.lock and recreate it with `cargo build` or similar. - Update the `-h` output in b3sum/README.md if it's changed. - Bump `BLAKE3_VERSION_STRING` in c/blake3.h. - Bump `VERSION` in c/CMakeLists.txt. - Make a version bump commit with change notes. - `git push` and make sure CI is green. - `git tag` the version bump commit with the new version number. - `git push --tags` - `cargo publish` in the root. - `cargo publish` in b3sum/.